R/read_length_distribution.R

Defines functions read_length_distribution

Documented in read_length_distribution

#' @title Read length distribution plots corresponding to mutationFileFormat files
#'
#' @description For each file (of mutationFileFormat type) in a directory, we plot
#' the read length distribution under three settings - all reads, reads containing
#' C to T mutations and reads containing C to T towards the ends of the fragments.
#'
#' @param dir The directory housing all the mutationFileFormat files.
#' @param end_break The position on the read from the end of the fragment such that if
#' C->T change occurs beyond that end break, it is considered in the third class of
#' reads (potentially damaged reads)
#' @param plot_layout The layout of the plot with multiple images (deafult is a 3x3 layout)
#' @param cols The color profiles of the three plots. Defaults to "red", "green" and "blue".
#' @param cex_legend The cex of the legend.
#' @param cex_main The cex of the title of the figure.
#'
#' @return The read length distribution plots for each of the mutationFileFormat files
#' in the folder
#'
#' @keywords read_length
#'
#' @export

read_length_distribution <- function(dir,
                                     pattern = NULL,
                                     end_break = 5,
                                     plot_layout = c(3,3),
                                     cols = c("red", "green", "blue"),
                                     cex_legend = 0.5,
                                     cex.main = 0.5){

  ###  read in all the files in the directory

  if(is.null(pattern)){
    files <- setdiff(list.files(dir, pattern = ".csv"), list.files(dir, pattern = ".csv#"))
  }else{
    files <- list.files(dir, pattern = pattern)
  }
  
  tab <- list()
  for(l in files){
    tab[[l]] <- read.csv(paste0(dir, l), header=FALSE)
    cat("We are at sample", l, "\n")
  }

  ancient_names <- as.character(sapply(files, function(x) return(strsplit(x, "[_]")[[1]][1])))

  par(mfrow=plot_layout)
  for(l in 1:length(tab)){
    tab1 <- tab[[l]]
    read_length <- tab1$V2 + tab1$V3
    indices <- grep("C->T", tab1$V1)
    indices <- c(indices, grep("G->A", tab1$V1))
    read_length_CtoT <- tab1[indices, ]$V2 + tab1[indices, ]$V3 ## read lengths for all C to T
    indices2 <- which(tab1$V2 < end_break | tab1$V3 < end_break)
    indices_matched <- intersect(indices, indices2)
    read_length_3 <- (tab1[indices_matched, ]$V2 + tab1[indices_matched, ]$V3)
    plot(table(read_length)/sum(table(read_length)), type="o", col=cols[1],
         main=paste0(ancient_names[l]), cex.main = cex.main, xlab="read pos", ylab="prop of occur")
    points(table(read_length_CtoT)/sum(table(read_length_CtoT)), type="o", col=cols[2])
    points(table(read_length_3)/ sum(table(read_length_3)), type="o", col=cols[3])
    legend("topright", fill=c("red", "green", "blue"),
           legend = c("all", "CtoT", paste0("CtoT < ", end_break)), cex=cex_legend)
    cat("we are at sample", l, "\n")
  }
}
kkdey/aRchaic.site documentation built on May 20, 2019, 10:31 a.m.