CCTools: A suite of tools for analysing CC-seq data.

Documented in CCPipeline

#' @title CCPipeline
#' @description Inputs the "Molecule" files in generated in the "Analysis" folder by termMapper.
#' Outputs whole genome "FullMap" files for each sample, with and without de-duplication (based
#' identical R1&R2 coordinates between independent molecules).
#' Also outputs histograms plots which detail the degree of PCR duplication in the each sample.
#' Also outputs a "DSBList" (list of all FUllMaps), in RDS format.
#' @param work.d location of the desired working directory.
#' @param exp.name name of the rds file generated.
#' @param readMidpoint Do you want to call the midpoint of the molecule
#' @param subsamplereads Do you want to subsample. For example enter 0.5 to sub sample by 50 percent
#' @param R2Map Do you want to generate a R2 map
#' @author Will Gittens George Brown
#' @export
CCPipeline <- function(work.d,exp.name, combine = T,readMidpoint=F,subsamplereads=F,R2Map=F){
  setwd(work.d)
  library("doParallel")
  library("data.table")
  library("plyr")
  library("stringr")
  
  # exp.name <- "All" #Choose a name for the experiment. This will be used in the file name of any combined files.
  
  # setwd("/Users/georgebrown/Desktop/molecules2") #set working directory to the "Analysis" directory generated by termMapper.
  parent.directory <- getwd()
  Mreads.before.duplicate.filter <- c()
  {
    folder.list <- list.dirs(path = parent.directory, full.names = TRUE, recursive = F); abbrev.list <- list.dirs(path = ".", full.names = TRUE, recursive = F); abbrev.list <- substr(abbrev.list, 3, nchar(abbrev.list))
    
    if (R2Map==F){
    ifelse(!dir.exists(file.path(parent.directory, "Output")), dir.create(file.path(parent.directory, "Output")), FALSE); setwd(file.path(parent.directory, "Output")); directory2 <- getwd()}else{
      ifelse(!dir.exists(file.path(parent.directory, "OutputR2")), dir.create(file.path(parent.directory, "OutputR2")), FALSE); setwd(file.path(parent.directory, "OutputR2")); directory2 <- getwd()
      
    }
    ifelse(!dir.exists(file.path(directory2, "dupfree")), dir.create(file.path(directory2, "dupfree")), FALSE); setwd(file.path(directory2, "dupfree")); directory3 <- getwd()
    ifelse(!dir.exists(file.path(directory2, "raw")), dir.create(file.path(directory2, "raw")), FALSE); setwd(file.path(directory2, "raw")); directory4 <- getwd()
    ifelse(!dir.exists(file.path(directory3, "FullMaps")), dir.create(file.path(directory3, "FullMaps")), FALSE); setwd(file.path(directory3, "FullMaps")); directory5 <- getwd()
    ifelse(!dir.exists(file.path(directory3, "RDS")), dir.create(file.path(directory3, "RDS")), FALSE); setwd(file.path(directory3, "RDS")); directory6 <- getwd()
    ifelse(!dir.exists(file.path(directory4, "FullMaps")), dir.create(file.path(directory4, "FullMaps")), FALSE); setwd(file.path(directory4, "FullMaps")); directory7 <- getwd()
    ifelse(!dir.exists(file.path(directory4, "RDS")), dir.create(file.path(directory4, "RDS")), FALSE); setwd(file.path(directory4, "RDS")); directory8 <- getwd()
    ifelse(!dir.exists(file.path(directory2, "DuplicateHistograms")), dir.create(file.path(directory2,  "DuplicateHistograms")), FALSE); setwd(file.path(directory2,  "DuplicateHistograms")); directory9 <- getwd()
    for (g in 1:length(folder.list)) {
      rm(list=setdiff(ls(), c("subsamplereads","readMidpoint","g","files","R2Map", "combine","folder.list", "abbrev.list", "genome.flag", "parent.directory", "exp.name", "Mreads.before.duplicate.filter", "directory2", "directory3", "directory4", "directory5", "directory6", "directory7", "directory8", "directory9")))
      setwd(folder.list[g])
      DSBList <- list()
      files = list.files(pattern="Molecules.")
      isW303 <- grepl("W303", files[1], fixed = TRUE)
      genome_flag <- str_match(files[1], "_(.*?)_")[2]
      
      DSBListNames = substr(files, 11, nchar(files)-12) # Shorten filename by 8 characters from beginning and 6 characters form end (i.e. remove "Molecules." and "_c.txt")
      if (R2Map==T){DSBListNames=paste0(DSBListNames,"_R2")}
      nfiles = length(files) # Count number of files
      cl <- makeCluster(detectCores()-1)
      registerDoParallel(cl)
      
      DSBList=foreach (k = 1:nfiles,.packages="data.table") %dopar% {fread(files[k],colClasses = c("numeric", "numeric", "numeric", "numeric", "numeric")) } #Import datatable
      stopCluster(cl)
      nfiles = length(files); Mreads = NULL
      for (i in 1:nfiles){
        
        if (subsamplereads!=F){
          DSBList[[i]]=  DSBList[[i]] %>% sample_frac(subsamplereads)}
        names(DSBList[[i]])[names(DSBList[[i]]) == 'R1W Freq'] <- 'R1W.Freq'
        names(DSBList[[i]])[names(DSBList[[i]]) == 'R1C Freq'] <- 'R1C.Freq'
        if (R2Map==T){
        names(DSBList[[i]])[names(DSBList[[i]]) == 'Coord-B'] <- 'Coord.A'
        names(DSBList[[i]])[names(DSBList[[i]]) == 'Coord-A'] <- 'Coord.B'
        DSBList[[i]] <- DSBList[[i]][, c(1, 3, 2, 4, 5)]
        }else{
          names(DSBList[[i]])[names(DSBList[[i]]) == 'Coord-A'] <- 'Coord.A'
          names(DSBList[[i]])[names(DSBList[[i]]) == 'Coord-B'] <- 'Coord.B'
        }
        # return(DSBList[[i]])
        if (readMidpoint==T){midpoint=(DSBList[[i]]$Coord.B-DSBList[[i]]$Coord.A)/2+DSBList[[i]]$Coord.A
        DSBList[[i]]$Coord.B=round(midpoint)
        DSBList[[i]]$Coord.A=round(midpoint)
        }
        Mreads[i]=sum(DSBList[[i]]$R1W.Freq+DSBList[[i]]$R1C.Freq)/1000000
      } # Calculate Million reads per sample for conveting to HpM
      sum(Mreads)
      mrgd.coord <- DSBList
      
      #this histogram shows the distribution of singlets and multiplicates etc.
      mrgd.coord.backup<- do.call("rbind", mrgd.coord)
      Mreads.before.duplicate.filter <- c(Mreads.before.duplicate.filter, sum(mrgd.coord.backup$R1W.Freq + mrgd.coord.backup$R1C.Freq)/1000000)
      histogram <- hist(mrgd.coord.backup$R1W.Freq, breaks = seq(-0.5,(max(mrgd.coord.backup$R1W.Freq)+0.5), by = 1), plot = F)
      histogram$counts <- histogram$counts*(0:(length(histogram$counts)-1))
      sum(histogram$counts)
      setwd(directory9)
      if(R2Map==T){abbrev.list[g]=paste0(abbrev.list[g],"_R2")}
      png(file = paste0("DuplicateHistogram_", genome_flag, "-",abbrev.list[g],".png"))
      plot(0:(length(histogram$counts)-1), histogram$counts/sum(histogram$counts), xlim = c(-1,20), type = 'h', main = abbrev.list[g], ylab = "Fraction of Reads", xlab = "No of identical molecules detected")
      dev.off()
      
      # removes duplicates
      mrgd.coord2 <- mrgd.coord
      mrgd.coord <- lapply(mrgd.coord, function(x){
        x[(x$R1W.Freq>1),4] <- 1
        x[(x$R1C.Freq>1),5] <- 1
        return(x)
      })
      undup.DSBList <- lapply(mrgd.coord, function(x){
        x[(x$R1W.Freq==1),c(1,2)]
      })
      undup.DSBList.c <- lapply(mrgd.coord, function(x){
        x[(x$R1C.Freq==1),c(1,3)]
      })
      undup.DSBList <- lapply(undup.DSBList, function(x){
        count(x, c("Chr", "Coord.A"))
      })
      undup.DSBList.c <- lapply(undup.DSBList.c, function(x){
        count(x, c("Chr", "Coord.B"))
      })
      undup.DSBList <- do.call("rbind", undup.DSBList)
      undup.DSBList.c <- do.call("rbind", undup.DSBList.c)
      colnames(undup.DSBList) <- c("Chr", "Pos", "Watson"); colnames(undup.DSBList.c) <- c("Chr", "Pos", "Crick")
      undup.DSBList <- data.table(undup.DSBList); setkey(undup.DSBList, Chr, Pos); undup.DSBList.c <- data.table(undup.DSBList.c); setkey(undup.DSBList.c, Chr, Pos)
      undup.DSBList  <- merge(undup.DSBList, undup.DSBList.c, all=TRUE)
      undup.DSBList[is.na(undup.DSBList)] <- 0L
      undup.DSBList <- undup.DSBList[order(undup.DSBList$Chr, undup.DSBList$Pos),]
      setwd(directory6)
      if(isW303){
        undup.DSBList$Chr <- as.character(undup.DSBList$Chr)
        undup.DSBList[undup.DSBList$Chr == "2-micron","Chr"] <- 18
        undup.DSBList[undup.DSBList$Chr == "chrMito","Chr"] <- 17
        undup.DSBList$Chr <- sub("chr", "", undup.DSBList$Chr)
        undup.DSBList$Chr<-as.roman(undup.DSBList$Chr)
        undup.DSBList$Chr<-as.numeric(undup.DSBList$Chr)
      }
      save(undup.DSBList, file = paste0(genome_flag, "-", abbrev.list[g], "_dupfree.rds"))
      setwd(directory5)
      write.table(undup.DSBList, file = paste0("FullMap.", genome_flag, "-", abbrev.list[g],"_dupfree.txt"), sep = "\t", row.names = F, quote = F)
      
      ###### keeps duplicates #####
      dup.DSBList <- lapply(mrgd.coord2, function(x){
        x[(x$R1W.Freq>=1),c(1,2,4)]
      })
      dup.DSBList.c <- lapply(mrgd.coord2, function(x){
        x[(x$R1C.Freq>=1),c(1,3,5)]
      })
      dup.DSBList <- lapply(dup.DSBList, function(x){
        count(x, c("Chr", "Coord.A"), wt_var = "R1W.Freq")
      })
      dup.DSBList.c <- lapply(dup.DSBList.c, function(x){
        count(x, c("Chr", "Coord.B"), wt_var = "R1C.Freq")
      })
      dup.DSBList <- do.call("rbind", dup.DSBList)
      dup.DSBList.c <- do.call("rbind", dup.DSBList.c)
      colnames(dup.DSBList) <- c("Chr", "Pos", "Watson"); colnames(dup.DSBList.c) <- c("Chr", "Pos", "Crick")
      dup.DSBList <- data.table(dup.DSBList); setkey(dup.DSBList, Chr, Pos); dup.DSBList.c <- data.table(dup.DSBList.c); setkey(dup.DSBList.c, Chr, Pos)
      dup.DSBList  <- merge(dup.DSBList, dup.DSBList.c, all=TRUE)
      dup.DSBList[is.na(dup.DSBList)] <- 0L
      dup.DSBList <- dup.DSBList[order(dup.DSBList$Chr, dup.DSBList$Pos),]
      setwd(directory8)
      if(isW303){
        dup.DSBList$Chr <- as.character(dup.DSBList$Chr)
        dup.DSBList[dup.DSBList$Chr == "2-micron","Chr"] <- 18
        dup.DSBList[dup.DSBList$Chr == "chrMito","Chr"] <- 17
        dup.DSBList$Chr <- sub("chr", "", dup.DSBList$Chr)
        dup.DSBList$Chr<-as.roman(dup.DSBList$Chr)
        dup.DSBList$Chr<-as.numeric(dup.DSBList$Chr)
      }
      save(dup.DSBList, file = paste0(genome_flag, "-", abbrev.list[g], ".rds"))
      setwd(directory7)
      write.table(dup.DSBList, file = paste0("FullMap.", genome_flag, "-", abbrev.list[g],".txt"), sep = "\t", row.names = F, quote = F)
    }
    rm(list=setdiff(ls(), c("g","isW303", "combine", "folder.list", "files", "abbrev.list", "genome.flag", "parent.directory", "exp.name", "Mreads.before.duplicate.filter", "directory2", "directory3", "directory4", "directory5", "directory6", "directory7", "directory8")))
    
    if(combine == T){
      # reads in .rds files and sticks them together
      setwd(directory6)
      DSBList<- NULL
      file.list <- list.files(pattern ="_dupfree.rds")
      for (g in 1:length(file.list)) {
        load(file.list[g])
        DSBList[[g]] <- undup.DSBList
      }
      DSBListNames <- paste0(abbrev.list, "_dupfree"); nfiles <- length(DSBList); MMreads <- Mreads <- sapply(DSBList, function(x) {sum(x$Watson + x$Crick)/ 1000000})
      Mreads.after.duplicate.filter <- Mreads
      Msites.before.duplicate.filter <- Msites.after.duplicate.filter <- sapply(DSBList, nrow)
      perc.Mreads.removed.by.duplicate.filter <- 100*((Mreads.before.duplicate.filter-Mreads.after.duplicate.filter)/Mreads.before.duplicate.filter)
      perc.Msites.removed.by.duplicate.filter <- 100*((Msites.before.duplicate.filter-Msites.after.duplicate.filter)/Msites.before.duplicate.filter)
      filter.metrics <- data.frame(cbind(DSBListNames, Mreads.before.duplicate.filter, Mreads.after.duplicate.filter, perc.Mreads.removed.by.duplicate.filter, Msites.before.duplicate.filter, Msites.after.duplicate.filter, perc.Msites.removed.by.duplicate.filter))
      setwd(directory2)
      save(DSBList, DSBListNames, nfiles, MMreads, filter.metrics, Mreads, file = paste0(exp.name, "_dupfree.rds"))
      
      # reads in NOTdupfree .rds files and sticks them together
      setwd(directory8)
      DSBList<- NULL
      file.list <- list.files(pattern =".rds")
      for (g in 1:length(file.list)) {
        load(file.list[g])
        DSBList[[g]] <- dup.DSBList
      }
      DSBListNames <- paste0(abbrev.list, ""); nfiles <- length(DSBList); MMreads <- Mreads <- sapply(DSBList, function(x) {sum(x$Watson + x$Crick)/ 1000000})
      Mreads.after.duplicate.filter <- Mreads
      Msites.before.duplicate.filter <- Msites.after.duplicate.filter <- sapply(DSBList, nrow)
      perc.Mreads.removed.by.duplicate.filter <- 100*((Mreads.before.duplicate.filter-Mreads.after.duplicate.filter)/Mreads.before.duplicate.filter)
      perc.Msites.removed.by.duplicate.filter <- 100*((Msites.before.duplicate.filter-Msites.after.duplicate.filter)/Msites.before.duplicate.filter)
      filter.metrics <- data.frame(cbind(DSBListNames, Mreads.before.duplicate.filter, Mreads.after.duplicate.filter, perc.Mreads.removed.by.duplicate.filter, Msites.before.duplicate.filter, Msites.after.duplicate.filter, perc.Msites.removed.by.duplicate.filter))
      setwd(directory2)
      save(DSBList, DSBListNames, nfiles, MMreads, filter.metrics, Mreads, file = paste0(exp.name, ".rds"))
    }
  }
}