ngsLCA: Process and Visualize Lca Files and Taxonomic Profiles

Documented in ngsLCA_deContam

#'
#' @title Remove Contamination Taxa
#'
#' @description Generate a contamination list from blank control lca files, and subtract listed taxa from the combined taxa profile. Running this function will update the existing "complete_profile.txt".
#'
#' @param path working directory, same to \code{\link{ngsLCA_profile}}.
#' @param run name of the run, default is "run01".
#' @param control_path path to the directory that contains all blank control lca files.
#' @param threshold.1_control for generating contamination list, minimum reads number required for keeping a taxon in each blank control; default is 2.
#' @param threshold.2_control for generating contamination list, minimum read percentage (to the total reads number of a blank control) required for keeping a taxon in each blank control, ranging from 0 to 1; default is 0.
#' @param threshold.3_control for generating contamination list, minimum sum of reads across all blank controls required for keeping a taxon; default is 5.
#'
#' @return An updated complete_profile.txt (after removing contaminations).
#' @importFrom utils read.csv read.delim write.table
#' @export
#'
#' @examples
#' ngsLCA_deContam(path=system.file("extdata","lca_files",package="ngsLCA"),
#'                 run="run01",
#'                 control_path=system.file("extdata","blank_controls",package="ngsLCA"),
#'                 threshold.1_control=0,
#'                 threshold.2_control=0,
#'                 threshold.3_control=0)
#'
#' ## This will combine the blank control lca files within folder
#' ## "extdata/blank_controls/" to generate a contamination list,
#' ## and then filter this list using the 3 control thresholds
#' ## supplied. The listed taxa will be subtracted from the combined
#' ## taxa profile.
#'
#'
ngsLCA_deContam = function(path,
                           run="run01",
                           control_path,
                           threshold.1_control = 2,
                           threshold.2_control = 0,
                           threshold.3_control = 5){


  #modify input path
  if (!grepl('/$', path)) {
    path = paste(path,"/",sep="")
  }


  #local variable
  taxa = NULL


  #data read-in
  if (length(dir(paste(path, run, "/intermediate/", sep=""), pattern ="taxa_profile_v1.txt")) == 0) {
    cat(paste("\n\n\t-> Error: required input file not found under '", run, "' folder, please run 'ngsLCA_profile' first.\n\n",
              sep = ""))
    stop()
  }else{
    cat("\n\n\t-> De-contaminate.\n\n")
  }

  if (length(dir(paste(path, run, "/intermediate/", sep=""), pattern ="taxa_profile_v2.3.txt")) == 1) {
    cat("\n\n\t-> Taxa profile filtered by 'ngsLCA_filter' is found and will be used as input.\n\n")
    DF1 = read.delim(paste(path, run, "/intermediate/", "taxa_profile_v2.3.txt", sep=""),
                     stringsAsFactors=FALSE,header = T,check.names = F)
  }else{
    cat("\n\n\t-> Original profile generated by 'ngsLCA_profile' will be used as input.\n\n")
    DF1 = read.delim(paste(path, run, "/intermediate/", "taxa_profile_v1.txt", sep=""),
                     stringsAsFactors=FALSE,header = T,check.names = F)
  }


  #blank control read-in
  if (!grepl('/$', control_path)) {
    control_path = paste(control_path,"/",sep="")
  }

  FileList = dir(control_path, pattern = ".lca$")

  if (length(FileList)==0) {
    cat("\n\n\t-> Error: no lca files found in the appointed 'control_path' directory.\n\n")
    stop()}else{

      CON1 = data.frame(taxa=character(), stringsAsFactors=F)

      for (i in 1:length(FileList)) {

        CON2.1 =  read.csv(paste(control_path, FileList[i], sep=""), header=F, sep="\t", stringsAsFactors=F, fill=T,
                          col.names = paste0("V",seq_len(60)), skip = 1)

        if(dim(CON2.1)[1]>0){

          CON2.2 = data.frame(taxa=CON2.1[,2],
                             count=rep(1, dim(CON2.1)[1]),
                             stringsAsFactors = F)

          CON2.3 = aggregate(CON2.2[,2]~CON2.2[,1], data=CON2.2, FUN=sum)
          colnames(CON2.3) = c("taxa",sub(".lca","",FileList[i]))
          CON1 = merge(CON1, CON2.3, by="taxa", all=T)

        }else{
          paste("\n\n\t-> ", FileList[i], " contains no taxa, skipped.\n\n", sep = "")
        }
      }

      CON1[is.na(CON1)] = 0


      if (dim(CON1)[1]>0) {
        write.table(CON1, file = paste(path, run, "/intermediate/", "contamination_list_v1.txt", sep=""), quote=F,
                    row.names=F, col.names=T, sep="\t")
      }else{
        cat("\n\n\t-> No alignments appear in the supplied blacnk control lca files.\n\n")
        stop()
      }
    }


  #filter blank controls
  thr1b=as.numeric(threshold.1_control)
  thr2b=as.numeric(threshold.2_control)
  thr3b=as.numeric(threshold.3_control)

  DF2 = read.delim(paste(path, run, "/intermediate/", "contamination_list_v1.txt", sep=""),
                   stringsAsFactors=FALSE, header = T, check.names = F)

  DF2.1 = as.data.frame(DF2[,-1])
  for (i in 1:dim(DF2.1)[2]) {
    DF2.1[,i] = as.numeric(DF2.1[,i])
  }

  colnames(DF2.1) = colnames(DF2)[-1]
  DF2.1[DF2.1 < thr1b] = 0
  DF2.1$taxa  = DF2$taxa
  DF2.1 = DF2.1[,c(dim(DF2.1)[2],2:dim(DF2.1)[2]-1)]

  if (dim(DF2.1)[2]>2) {
    DF2.1 = DF2.1[which(rowSums(DF2.1[,-1]) !=0),]
  }else{
    DF2.1 = DF2.1[which(DF2.1[,2] !=0),]
  }

  if (dim(DF2.1)[1]>0) {
    write.table(DF2.1, file = paste(path, run, "/intermediate/", "contamination_list_v2.1.txt", sep=""), quote=F,
                row.names=F, col.names=T, sep="\t")
  }else{
    cat("\n\n\t-> All taxa in the contamination list removed by threshold.1_control.\n\n")
    stop()
  }


  DF2.2 = DF2.1
  for (i in 2:dim(DF2.2)[2]) {
    if (sum(DF2.2[,i])>0) {
      DF2.2[(DF2.2[,i] < sum(DF2.2[,i])*thr2b),i] = 0
    }
  }

  if (dim(DF2.2)[1]>0) {
    write.table(DF2.2, file = paste(path, run, "/intermediate/", "contamination_list_v2.2.txt", sep=""), quote=F,
                row.names=F, col.names=T, sep="\t")
  }else{
    cat("\n\n\t-> All taxa in the contamination list removed by threshold.2_control.\n\n")
    stop()
  }


  if (dim(DF2.2)[2]>2) {
    DF2.3 = DF2.2[which(rowSums(DF2.2[,-1]) >= thr3b),]
  }else{
    DF2.3 = DF2.2[which(DF2.2[,2] >= thr3b),]
  }

  if (dim(DF2.3)[1]>0) {
    write.table(DF2.3, file = paste(path, run, "/intermediate/", "contamination_list_v2.3.txt", sep=""), quote=F,
                row.names=F, col.names=T, sep="\t")

    DF2.4 = DF2.3
    taxa_split = strsplit(DF2.3$taxa, ":")
    DF2.4$taxa_ID = sapply(taxa_split, function(x) x[1])
    DF2.4$taxa_name = sapply(taxa_split, function(x) paste(x[2:(length(x)-1)], collapse = ":"))
    DF2.4$taxonomic_rank = sapply(taxa_split, function(x) x[length(x)])
    DF2.4 = DF2.4[,c((dim(DF2.4)[2] - 2):dim(DF2.4)[2],2:(dim(DF2.4)[2] - 3))]
    write.table(DF2.4, file = paste(path, run, "/taxonomic_profiles/", "contamination_list.txt", sep=""), quote=F,
                row.names=F, col.names=T, sep="\t")

  }else{
    cat("\n\n\t-> All taxa in the contamination list removed by threshold.3_control.\n\n")
    stop()
  }


  #de-contamination
  CON = read.delim(paste(path, run, "/intermediate/", "contamination_list_v2.3.txt", sep=""),
                   stringsAsFactors=FALSE, header = T)

  N = length(which(DF1$taxa %in% CON$taxa))

  if (N>0) {
    paste("\n\n\t-> A total of", N, " taxa detected in the contamination list and will be subtracted.\n\n")
    DF2 = DF1[-which(DF1$taxa %in% CON$taxa),]
  }else{
    cat("\n\n\t-> No taxa in the contamination list detected in the combined taxa profile.\n\n")
    DF2 = DF1
  }


  #write files
  if (dim(DF2)[1] > 0) {

    write.table(DF2, file = paste(path, run, "/intermediate/", "taxa_profile_v3.txt", sep=""), quote=F,
                row.names=F, col.names=T, sep="\t")
    write.table(DF2, file = paste(path, run, "/intermediate/", "taxa_profile_final.txt", sep=""), quote=F,
                row.names=F, col.names=T, sep="\t")

    DF3 = DF2
    taxa_split = strsplit(DF2$taxa, ":")
    DF3$taxa_ID = sapply(taxa_split, function(x) x[1])
    DF3$taxa_name = sapply(taxa_split, function(x) paste(x[2:(length(x)-1)], collapse = ":"))
    DF3$taxonomic_rank = sapply(taxa_split, function(x) x[length(x)])
    DF3 = DF3[,c((dim(DF3)[2] - 2):dim(DF3)[2],2:(dim(DF3)[2] - 3))]
    write.table(DF3, file = paste(path, run, "/taxonomic_profiles/", "complete_profile.txt", sep=""), quote=F,
                row.names=F, col.names=T, sep="\t")

  }else{
    cat("\n\n\t-> All taxa removed by de-contamination.\n\n")
    stop()
  }

}