#' @title CCPipeline
#' @description Inputs the "Molecule" files in generated in the "Analysis" folder by termMapper.
#' Outputs whole genome "FullMap" files for each sample, with and without de-duplication (based
#' identical R1&R2 coordinates between independent molecules).
#' Also outputs histograms plots which detail the degree of PCR duplication in the each sample.
#' Also outputs a "DSBList" (list of all FUllMaps), in RDS format.
#' @param work.d location of the desired working directory.
#' @param exp.name name of the rds file generated.
#' @param readMidpoint Do you want to call the midpoint of the molecule
#' @param subsamplereads Do you want to subsample. For example enter 0.5 to sub sample by 50 percent
#' @param R2Map Do you want to generate a R2 map
#' @author Will Gittens George Brown
#' @export
CCPipeline <- function(work.d,exp.name, combine = T,readMidpoint=F,subsamplereads=F,R2Map=F){
setwd(work.d)
library("doParallel")
library("data.table")
library("plyr")
library("stringr")
# exp.name <- "All" #Choose a name for the experiment. This will be used in the file name of any combined files.
# setwd("/Users/georgebrown/Desktop/molecules2") #set working directory to the "Analysis" directory generated by termMapper.
parent.directory <- getwd()
Mreads.before.duplicate.filter <- c()
{
folder.list <- list.dirs(path = parent.directory, full.names = TRUE, recursive = F); abbrev.list <- list.dirs(path = ".", full.names = TRUE, recursive = F); abbrev.list <- substr(abbrev.list, 3, nchar(abbrev.list))
if (R2Map==F){
ifelse(!dir.exists(file.path(parent.directory, "Output")), dir.create(file.path(parent.directory, "Output")), FALSE); setwd(file.path(parent.directory, "Output")); directory2 <- getwd()}else{
ifelse(!dir.exists(file.path(parent.directory, "OutputR2")), dir.create(file.path(parent.directory, "OutputR2")), FALSE); setwd(file.path(parent.directory, "OutputR2")); directory2 <- getwd()
}
ifelse(!dir.exists(file.path(directory2, "dupfree")), dir.create(file.path(directory2, "dupfree")), FALSE); setwd(file.path(directory2, "dupfree")); directory3 <- getwd()
ifelse(!dir.exists(file.path(directory2, "raw")), dir.create(file.path(directory2, "raw")), FALSE); setwd(file.path(directory2, "raw")); directory4 <- getwd()
ifelse(!dir.exists(file.path(directory3, "FullMaps")), dir.create(file.path(directory3, "FullMaps")), FALSE); setwd(file.path(directory3, "FullMaps")); directory5 <- getwd()
ifelse(!dir.exists(file.path(directory3, "RDS")), dir.create(file.path(directory3, "RDS")), FALSE); setwd(file.path(directory3, "RDS")); directory6 <- getwd()
ifelse(!dir.exists(file.path(directory4, "FullMaps")), dir.create(file.path(directory4, "FullMaps")), FALSE); setwd(file.path(directory4, "FullMaps")); directory7 <- getwd()
ifelse(!dir.exists(file.path(directory4, "RDS")), dir.create(file.path(directory4, "RDS")), FALSE); setwd(file.path(directory4, "RDS")); directory8 <- getwd()
ifelse(!dir.exists(file.path(directory2, "DuplicateHistograms")), dir.create(file.path(directory2, "DuplicateHistograms")), FALSE); setwd(file.path(directory2, "DuplicateHistograms")); directory9 <- getwd()
for (g in 1:length(folder.list)) {
rm(list=setdiff(ls(), c("subsamplereads","readMidpoint","g","files","R2Map", "combine","folder.list", "abbrev.list", "genome.flag", "parent.directory", "exp.name", "Mreads.before.duplicate.filter", "directory2", "directory3", "directory4", "directory5", "directory6", "directory7", "directory8", "directory9")))
setwd(folder.list[g])
DSBList <- list()
files = list.files(pattern="Molecules.")
isW303 <- grepl("W303", files[1], fixed = TRUE)
genome_flag <- str_match(files[1], "_(.*?)_")[2]
DSBListNames = substr(files, 11, nchar(files)-12) # Shorten filename by 8 characters from beginning and 6 characters form end (i.e. remove "Molecules." and "_c.txt")
if (R2Map==T){DSBListNames=paste0(DSBListNames,"_R2")}
nfiles = length(files) # Count number of files
cl <- makeCluster(detectCores()-1)
registerDoParallel(cl)
DSBList=foreach (k = 1:nfiles,.packages="data.table") %dopar% {fread(files[k],colClasses = c("numeric", "numeric", "numeric", "numeric", "numeric")) } #Import datatable
stopCluster(cl)
nfiles = length(files); Mreads = NULL
for (i in 1:nfiles){
if (subsamplereads!=F){
DSBList[[i]]= DSBList[[i]] %>% sample_frac(subsamplereads)}
names(DSBList[[i]])[names(DSBList[[i]]) == 'R1W Freq'] <- 'R1W.Freq'
names(DSBList[[i]])[names(DSBList[[i]]) == 'R1C Freq'] <- 'R1C.Freq'
if (R2Map==T){
names(DSBList[[i]])[names(DSBList[[i]]) == 'Coord-B'] <- 'Coord.A'
names(DSBList[[i]])[names(DSBList[[i]]) == 'Coord-A'] <- 'Coord.B'
DSBList[[i]] <- DSBList[[i]][, c(1, 3, 2, 4, 5)]
}else{
names(DSBList[[i]])[names(DSBList[[i]]) == 'Coord-A'] <- 'Coord.A'
names(DSBList[[i]])[names(DSBList[[i]]) == 'Coord-B'] <- 'Coord.B'
}
# return(DSBList[[i]])
if (readMidpoint==T){midpoint=(DSBList[[i]]$Coord.B-DSBList[[i]]$Coord.A)/2+DSBList[[i]]$Coord.A
DSBList[[i]]$Coord.B=round(midpoint)
DSBList[[i]]$Coord.A=round(midpoint)
}
Mreads[i]=sum(DSBList[[i]]$R1W.Freq+DSBList[[i]]$R1C.Freq)/1000000
} # Calculate Million reads per sample for conveting to HpM
sum(Mreads)
mrgd.coord <- DSBList
#this histogram shows the distribution of singlets and multiplicates etc.
mrgd.coord.backup<- do.call("rbind", mrgd.coord)
Mreads.before.duplicate.filter <- c(Mreads.before.duplicate.filter, sum(mrgd.coord.backup$R1W.Freq + mrgd.coord.backup$R1C.Freq)/1000000)
histogram <- hist(mrgd.coord.backup$R1W.Freq, breaks = seq(-0.5,(max(mrgd.coord.backup$R1W.Freq)+0.5), by = 1), plot = F)
histogram$counts <- histogram$counts*(0:(length(histogram$counts)-1))
sum(histogram$counts)
setwd(directory9)
if(R2Map==T){abbrev.list[g]=paste0(abbrev.list[g],"_R2")}
png(file = paste0("DuplicateHistogram_", genome_flag, "-",abbrev.list[g],".png"))
plot(0:(length(histogram$counts)-1), histogram$counts/sum(histogram$counts), xlim = c(-1,20), type = 'h', main = abbrev.list[g], ylab = "Fraction of Reads", xlab = "No of identical molecules detected")
dev.off()
# removes duplicates
mrgd.coord2 <- mrgd.coord
mrgd.coord <- lapply(mrgd.coord, function(x){
x[(x$R1W.Freq>1),4] <- 1
x[(x$R1C.Freq>1),5] <- 1
return(x)
})
undup.DSBList <- lapply(mrgd.coord, function(x){
x[(x$R1W.Freq==1),c(1,2)]
})
undup.DSBList.c <- lapply(mrgd.coord, function(x){
x[(x$R1C.Freq==1),c(1,3)]
})
undup.DSBList <- lapply(undup.DSBList, function(x){
count(x, c("Chr", "Coord.A"))
})
undup.DSBList.c <- lapply(undup.DSBList.c, function(x){
count(x, c("Chr", "Coord.B"))
})
undup.DSBList <- do.call("rbind", undup.DSBList)
undup.DSBList.c <- do.call("rbind", undup.DSBList.c)
colnames(undup.DSBList) <- c("Chr", "Pos", "Watson"); colnames(undup.DSBList.c) <- c("Chr", "Pos", "Crick")
undup.DSBList <- data.table(undup.DSBList); setkey(undup.DSBList, Chr, Pos); undup.DSBList.c <- data.table(undup.DSBList.c); setkey(undup.DSBList.c, Chr, Pos)
undup.DSBList <- merge(undup.DSBList, undup.DSBList.c, all=TRUE)
undup.DSBList[is.na(undup.DSBList)] <- 0L
undup.DSBList <- undup.DSBList[order(undup.DSBList$Chr, undup.DSBList$Pos),]
setwd(directory6)
if(isW303){
undup.DSBList$Chr <- as.character(undup.DSBList$Chr)
undup.DSBList[undup.DSBList$Chr == "2-micron","Chr"] <- 18
undup.DSBList[undup.DSBList$Chr == "chrMito","Chr"] <- 17
undup.DSBList$Chr <- sub("chr", "", undup.DSBList$Chr)
undup.DSBList$Chr<-as.roman(undup.DSBList$Chr)
undup.DSBList$Chr<-as.numeric(undup.DSBList$Chr)
}
save(undup.DSBList, file = paste0(genome_flag, "-", abbrev.list[g], "_dupfree.rds"))
setwd(directory5)
write.table(undup.DSBList, file = paste0("FullMap.", genome_flag, "-", abbrev.list[g],"_dupfree.txt"), sep = "\t", row.names = F, quote = F)
###### keeps duplicates #####
dup.DSBList <- lapply(mrgd.coord2, function(x){
x[(x$R1W.Freq>=1),c(1,2,4)]
})
dup.DSBList.c <- lapply(mrgd.coord2, function(x){
x[(x$R1C.Freq>=1),c(1,3,5)]
})
dup.DSBList <- lapply(dup.DSBList, function(x){
count(x, c("Chr", "Coord.A"), wt_var = "R1W.Freq")
})
dup.DSBList.c <- lapply(dup.DSBList.c, function(x){
count(x, c("Chr", "Coord.B"), wt_var = "R1C.Freq")
})
dup.DSBList <- do.call("rbind", dup.DSBList)
dup.DSBList.c <- do.call("rbind", dup.DSBList.c)
colnames(dup.DSBList) <- c("Chr", "Pos", "Watson"); colnames(dup.DSBList.c) <- c("Chr", "Pos", "Crick")
dup.DSBList <- data.table(dup.DSBList); setkey(dup.DSBList, Chr, Pos); dup.DSBList.c <- data.table(dup.DSBList.c); setkey(dup.DSBList.c, Chr, Pos)
dup.DSBList <- merge(dup.DSBList, dup.DSBList.c, all=TRUE)
dup.DSBList[is.na(dup.DSBList)] <- 0L
dup.DSBList <- dup.DSBList[order(dup.DSBList$Chr, dup.DSBList$Pos),]
setwd(directory8)
if(isW303){
dup.DSBList$Chr <- as.character(dup.DSBList$Chr)
dup.DSBList[dup.DSBList$Chr == "2-micron","Chr"] <- 18
dup.DSBList[dup.DSBList$Chr == "chrMito","Chr"] <- 17
dup.DSBList$Chr <- sub("chr", "", dup.DSBList$Chr)
dup.DSBList$Chr<-as.roman(dup.DSBList$Chr)
dup.DSBList$Chr<-as.numeric(dup.DSBList$Chr)
}
save(dup.DSBList, file = paste0(genome_flag, "-", abbrev.list[g], ".rds"))
setwd(directory7)
write.table(dup.DSBList, file = paste0("FullMap.", genome_flag, "-", abbrev.list[g],".txt"), sep = "\t", row.names = F, quote = F)
}
rm(list=setdiff(ls(), c("g","isW303", "combine", "folder.list", "files", "abbrev.list", "genome.flag", "parent.directory", "exp.name", "Mreads.before.duplicate.filter", "directory2", "directory3", "directory4", "directory5", "directory6", "directory7", "directory8")))
if(combine == T){
# reads in .rds files and sticks them together
setwd(directory6)
DSBList<- NULL
file.list <- list.files(pattern ="_dupfree.rds")
for (g in 1:length(file.list)) {
load(file.list[g])
DSBList[[g]] <- undup.DSBList
}
DSBListNames <- paste0(abbrev.list, "_dupfree"); nfiles <- length(DSBList); MMreads <- Mreads <- sapply(DSBList, function(x) {sum(x$Watson + x$Crick)/ 1000000})
Mreads.after.duplicate.filter <- Mreads
Msites.before.duplicate.filter <- Msites.after.duplicate.filter <- sapply(DSBList, nrow)
perc.Mreads.removed.by.duplicate.filter <- 100*((Mreads.before.duplicate.filter-Mreads.after.duplicate.filter)/Mreads.before.duplicate.filter)
perc.Msites.removed.by.duplicate.filter <- 100*((Msites.before.duplicate.filter-Msites.after.duplicate.filter)/Msites.before.duplicate.filter)
filter.metrics <- data.frame(cbind(DSBListNames, Mreads.before.duplicate.filter, Mreads.after.duplicate.filter, perc.Mreads.removed.by.duplicate.filter, Msites.before.duplicate.filter, Msites.after.duplicate.filter, perc.Msites.removed.by.duplicate.filter))
setwd(directory2)
save(DSBList, DSBListNames, nfiles, MMreads, filter.metrics, Mreads, file = paste0(exp.name, "_dupfree.rds"))
# reads in NOTdupfree .rds files and sticks them together
setwd(directory8)
DSBList<- NULL
file.list <- list.files(pattern =".rds")
for (g in 1:length(file.list)) {
load(file.list[g])
DSBList[[g]] <- dup.DSBList
}
DSBListNames <- paste0(abbrev.list, ""); nfiles <- length(DSBList); MMreads <- Mreads <- sapply(DSBList, function(x) {sum(x$Watson + x$Crick)/ 1000000})
Mreads.after.duplicate.filter <- Mreads
Msites.before.duplicate.filter <- Msites.after.duplicate.filter <- sapply(DSBList, nrow)
perc.Mreads.removed.by.duplicate.filter <- 100*((Mreads.before.duplicate.filter-Mreads.after.duplicate.filter)/Mreads.before.duplicate.filter)
perc.Msites.removed.by.duplicate.filter <- 100*((Msites.before.duplicate.filter-Msites.after.duplicate.filter)/Msites.before.duplicate.filter)
filter.metrics <- data.frame(cbind(DSBListNames, Mreads.before.duplicate.filter, Mreads.after.duplicate.filter, perc.Mreads.removed.by.duplicate.filter, Msites.before.duplicate.filter, Msites.after.duplicate.filter, perc.Msites.removed.by.duplicate.filter))
setwd(directory2)
save(DSBList, DSBListNames, nfiles, MMreads, filter.metrics, Mreads, file = paste0(exp.name, ".rds"))
}
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.