R/ngsLCA_filter.R

Defines functions ngsLCA_filter

Documented in ngsLCA_filter

#'
#' @title Filter the Combined Taxa Profile
#'
#' @description Filter the combined taxa profile. Running this function will update the existing "complete_profile.txt".
#'
#' @param path working directory, same to \code{\link{ngsLCA_profile}}.
#' @param run name of the run, default is "run01".
#' @param remove.taxa a list of NCBI taxaID indicating taxa that will be removed. This can be a comma separated vector or the path to a text file listing one taxaID in each line.
#' @param threshold.1 minimum reads number required for confirming a taxon in each sample; default is 2.
#' @param threshold.2 minimum read percentage (to the total reads number of a given sample) required for confirming a taxon in each sample, ranging from 0 to 1; default is 0.
#' @param threshold.3 minimum sum of reads across all samples required for confirming a taxon in the combined taxa profile; default is 5.
#'
#' @return An updated complete_profile.txt (after taxa filters).
#' @importFrom utils read.csv read.delim write.table
#' @export
#'
#' @examples
#' ngsLCA_filter(path=system.file("extdata","lca_files",package="ngsLCA"),
#'               run="run01",
#'               remove.taxa="1115501,10114",
#'               threshold.1=2,
#'               threshold.2=0,
#'               threshold.3=2)
#'
#'
#' ## This will filter and update the combined taxa profile:
#' ## "path/run01/taxonomic_profiles/complete_profile.txt"
#' ## by the 3 thresholds supplied, and by removing taxa
#' ## with taxaID 1115501 and 10114.
#'
#'
ngsLCA_filter = function(path,
                         run="run01",
                         remove.taxa=NULL,
                         threshold.1 = 2,
                         threshold.2 = 0,
                         threshold.3 = 5){


  #modify input path
  if (!grepl('/$', path)) {
    path = paste(path,"/",sep="")
  }


  #local variable
  taxa = NULL


  #read-in file
  if (length(dir(paste(path, run, "/intermediate/", sep=""),pattern =  "taxa_profile_v1.txt")) == 0) {
    cat(paste("\n\n\t-> Error: required input file not found under '", run, "' folder, please run 'ngsLCA_profile' first.\n\n",
              sep = ""))
    stop()
  }else{
    cat("\n\n\t-> Filter taxa profile.\n\n")
  }

  if (length(dir(paste(path, run, "/intermediate/", sep=""), pattern ="taxa_profile_v3.txt")) == 1) {
    cat("\n\n\t-> De-contaminated taxa profile generated by 'ngsLCA_deContam' is found and will be used as input.\n\n")
    DF1 = read.delim(paste(path, run, "/intermediate/", "taxa_profile_v3.txt", sep=""),
                     stringsAsFactors=FALSE,header = T,check.names = F)
  }else{
    cat("\n\n\t-> Original taxa profile generated by 'ngsLCA_profile' will be used as input.\n\n")
    DF1 = read.delim(paste(path, run, "/intermediate/", "taxa_profile_v1.txt", sep=""),
                     stringsAsFactors=FALSE,header = T,check.names = F)
  }


  #removes taxa in the remove.taxa list
  if (!is.null(remove.taxa)) {
    remove.taxa = strsplit(remove.taxa,",")[[1]]
    remove.taxa1 = as.numeric(remove.taxa)

    if (any(is.na(remove.taxa1))) {
      if (file.exists(remove.taxa)) {
        remove.taxa1 = read.csv(remove.taxa,stringsAsFactors = F,header = F)
        remove.taxa1 = as.numeric(remove.taxa1[,1])
      }
    }

    if (is.numeric(remove.taxa1)) {
      remove.taxa = remove.taxa1
    }else{
      cat("\n\n\t-> Error: 'remove.taxa' contains no-numeric values, please input NCBI taxaID only.\n\n")
      stop()
    }

    TAXAID = as.numeric(sapply(strsplit(DF1$taxa, split = ":"),function(x) x[1]))
    if (length(which(TAXAID %in% remove.taxa)) >0 ) {
      DF1 = DF1[-which(TAXAID %in% remove.taxa),]
    }
  }


  #threshold.1 filtering
  DF2.1 = as.data.frame(sapply(DF1[,-1], as.numeric))
  colnames(DF2.1) = colnames(DF1)[-1]
  DF2.1[DF2.1 < threshold.1] = 0
  DF2.1$taxa = DF1$taxa
  DF2.1 = DF2.1[,c(dim(DF2.1)[2],2:dim(DF2.1)[2]-1)]

  if (dim(DF2.1)[2]>2) {
    DF2.1 = DF2.1[which(rowSums(DF2.1[,-1]) !=0),]
  }else{
    DF2.1 = DF2.1[which(DF2.1[,2] !=0),]
  }

  if (dim(DF2.1)[1]>0) {
    write.table(DF2.1, file = paste(path, run, "/intermediate/", "taxa_profile_v2.1.txt", sep=""), quote=F,
                row.names=F, col.names=T, sep="\t")
  }else{
    cat("\n\n\t-> Error: all taxa removed by threshold.1, try assign a lower value.\n\n")
    stop()
  }


  #threshold.2 filtering
  DF2.2 = DF2.1
  for (i in 2:dim(DF2.2)[2]) {
    if (sum(DF2.2[,i])>0) {
      DF2.2[(DF2.2[,i] < sum(DF2.2[,i])*threshold.2),i] = 0
    }
  }

  if (dim(DF2.2)[2]>2) {
    DF2.2 = DF2.2[which(rowSums(DF2.2[,-1]) !=0),]
  }else{
    DF2.2 = DF2.2[which(DF2.2[,2] !=0),]
  }

  if (dim(DF2.2)[1]>0) {
    write.table(DF2.2, file = paste(path, run, "/intermediate/", "taxa_profile_v2.2.txt", sep=""), quote=F,
                row.names=F, col.names=T, sep="\t")
  }else{
    cat("\n\n\t-> Error: all taxa removed by threshold.2, try assign a lower value.\n\n")
    stop()
  }


  #threshold.3 filtering and write files
  if (dim(DF2.2)[2]>2) {
    DF2.3 = DF2.2[which(rowSums(DF2.2[,-1]) >= threshold.3),]
  }else{
    DF2.3 = DF2.2[which(DF2.2[,2] >= threshold.3),]
  }

  if (dim(DF2.3)[1]>0) {
    write.table(DF2.3, file = paste(path, run, "/intermediate/", "taxa_profile_v2.3.txt", sep=""), quote=F,
                row.names=F, col.names=T, sep="\t")

    write.table(DF2.3, file = paste(path, run, "/intermediate/", "taxa_profile_final.txt", sep=""), quote=F,
                row.names=F, col.names=T, sep="\t")

    DF3 = DF2.3
    taxa_split = strsplit(DF2.3$taxa, ":")
    DF3$taxa_ID = sapply(taxa_split, function(x) x[1])
    DF3$taxa_name = sapply(taxa_split, function(x) paste(x[2:(length(x)-1)], collapse = ":"))
    DF3$taxonomic_rank = sapply(taxa_split, function(x) x[length(x)])
    DF3 = DF3[,c((dim(DF3)[2] - 2):dim(DF3)[2],2:(dim(DF3)[2] - 3))]
    write.table(DF3, file = paste(path, run, "/taxonomic_profiles/", "complete_profile.txt", sep=""), quote=F,
                row.names=F, col.names=T, sep="\t")

  }else{
    cat("\n\n\t-> Error: all taxa removed by threshold.3, try assign a lower value.\n\n")
    stop()
  }
}
wyc661217/ngsLCA documentation built on June 2, 2025, 12:02 a.m.