R/msc.quality.R

Defines functions msc.quality

Documented in msc.quality

#' Check the quality of the assembly
#'
#' The msc.quality function allows you to summarise mapping statistics generated by KOMCIS.
#' To check whether minicircles have been assembled successfully, read, mapped read and high quality mapped read frequencies are inspected.
#' The proportion of (near-)perfect alignments of CSB3-containing reads are herefor an important measure.
#'
#' @usage msc.quality(mapstats, groups)
#' @param mapstats a character vector containing the file names of mapping statistics (output of KOMICS).
#' @param groups a vector specifying to which group (e.g. species) the samples belong to.
#' @return 
#'   \item{all}{a table merging mapping statistics of all samples. Mapping statistics include the number of mapped reads (MR), mapped reads with high quality (MR_HQ), CSB3-containing mapped reads (MR_CSB3) and CSB3-containing mapped read with high quality (MR_CSB3_HQ).}
#'   \item{proportions}{a list of tables containing the proportion of previous mentioned items.}
#'   \item{plots}{barplots visualizing previous results.}
#' @examples
#' data(exData)
#' 
#' ### run function
#' map <- msc.quality(mapstats = system.file("extdata", exData$mapstats, package = "rKOMICS"),
#'                    exData$species)
#' 
#' lapply(map$proportions, mean)$MR_HQ 
#' lapply(map$proportions, mean)$MR_CSB3_HQ
#' 
#' ### visualize results
#' barplot(map$proportions$MR)
#'
#' @importFrom utils read.table read.csv
#' @export

msc.quality <- function(mapstats, groups) {

  ############# tests
  
  
  if (sum(file.exists(mapstats))!=length(mapstats)) stop("One or more files doesn't exist")
  if (!is.factor(groups)) stop("ERROR: groups should be a factor")
  if (length(mapstats) != length(groups)) stop("ERROR: mapstats and groups are not of equal length")
  
  
  ############# mapping stats

  
  map <- utils::read.table(mapstats[1], sep=':', row.names = 1)
  for (i in 2:length(mapstats)) {
    map[,i] <- utils::read.table(mapstats[i], sep=':')[,2]
  }
  colnames(map) <- gsub(".map.*", "", gsub(".*/", "", mapstats))

  props <- plots <- list()
  props$MR <- apply(map, 2, function(x) x["Number of mapped reads"]/x["Number of reads"]*100)
  props$MR_HQ <- apply(map, 2, function(x) x["Number of reads with mapping quality >= 20"]/x["Number of reads"]*100)
  props$MR_CSB3 <- apply(map, 2, function(x) x["Number of mapped CSB3-containing reads"]/x["Number of CSB3-containing reads"]*100)
  props$MR_CSB3_HQ <- apply(map, 2, function(x) x["Number of CSB3-containing reads with mapping quality >= 20"]/x["Number of CSB3-containing reads"]*100)

  for (i in 1:length(props)) {
    melted <- melt(props[[i]])
    melted[,2] <- groups
    plots[[i]] <- ggplot(melted, aes(x=rownames(melted), y=melted$value, fill=melted$V2, col=NULL)) + 
      geom_bar(stat="identity", alpha=0.8, show.legend = F) + 
      facet_grid(. ~ V2, space='free', scales='free') + ylim(0,100) + 
      xlab('') + theme_minimal() + ylab('proportions of reads (%)') + 
      theme(axis.text.x = element_text(angle=90, hjust=1),
            axis.title = element_text(face="bold"))
  }
  
  names(plots) <- names(props)
  
  
  ############# return

  
  return(list("all" = map,"proportions" = props, "plots" = plots))
                
}
GeertsManon/rKOMICS documentation built on Dec. 17, 2021, 9:28 p.m.