R/ngsLCA_rank.R

Defines functions ngsLCA_rank

Documented in ngsLCA_rank

#'
#' @title Classify Taxa to Taxonomic Ranks
#'
#' @description Classify the combined taxa profile (and grouped taxa profiles generated by \code{\link{ngsLCA_group}} if available) into user-defined taxonomic ranks. Results will be in "path/run/taxonomic_profiles/taxa_ranks/".
#'
#' @param path working directory, same to \code{\link{ngsLCA_profile}}.
#' @param run name of the run, default is "run01".
#' @param rank.name a comma separated vector listing the taxonomic ranks that will be used for classifying taxa profiles; default is "species,genus,family"
#'
#' @return Taxa profiles clustered into taxa ranks.
#' @importFrom utils read.csv write.table
#' @export
#'
#' @examples
#' ngsLCA_rank(path=system.file("extdata","lca_files",package="ngsLCA"),
#'             run="run01",
#'             rank.name="species,genus")
#'
#'
#' ## This will classify the combined taxa profile (and
#' ## grouped taxa profiles if available) of "run01" into
#' ## species and genus, by merging all taxa below a species
#' ## into that species, and all taxa below a genus into
#' ## that genus. Generated files will be in
#' ## "path/run01/taxonomic_profiles/taxa_ranks/".
#'
#'
ngsLCA_rank=function(path,
                     run="run01",
                     rank.name="species,genus,family"){


  #modify input path
  if (!grepl('/$', path)) {
    path = paste(path,"/",sep="")
  }


  #local variable
  taxa = NULL


  #whether ngsLCA_profile performed
  if (length(dir(paste(path, run, "/intermediate/", sep=""),pattern =  "taxa_profile_v1.txt")) == 0) {
    cat(paste("\n\n\t-> Error: required input file not found under '", run, "' folder, please run 'ngsLCA_profile' first.\n\n",
              sep = ""))
    stop()
  }else{
    cat("\n\n\t-> Cluster taxa into taxonomic ranks.\n\n")
  }


  #generate taxa branch
  if (length(dir(paste(path, run, "/intermediate/", sep=""),pattern = "taxa_branch.txt")) > 0) {
    tax.branch = read.csv(paste(path, run, "/intermediate/", "taxa_branch.txt", sep=""),
                          sep="\t", quote="", stringsAsFactors=F)
  }else{

    FileList = dir(path, pattern = ".lca$")
    if(length(FileList)==0){
      cat("\n\n\t-> Error: no lca files found in the appointed working directory.\n\n")
      stop()
    }

    PATH = read.csv(paste(path, FileList[1], sep=""), header=F, sep="\t", stringsAsFactors=F,
                    fill=T, col.names = paste0("V",seq_len(60)), skip = 1)
    PATH = PATH[,-1]

    if (length(FileList)>1) {
      for (i in 2:length(FileList)) {
        PATH.1 = read.csv(paste(path, FileList[i], sep=""), header=F, sep="\t", stringsAsFactors=F,
                          fill=T, col.names = paste0("V",seq_len(60)), skip = 1)
        PATH.1 = PATH.1[,-1]
        PATH = rbind(PATH,PATH.1)
        PATH = PATH[!duplicated(PATH[,1]),]
      }
    }else{
      PATH = PATH[!duplicated(PATH[,1]),]
    }

    tax.branch = PATH
    write.table(PATH, file = paste(path, run, "/intermediate/", "taxa_branch.txt", sep=""), quote=F,
                row.names=F, col.names=T, sep="\t")
  }


  #create folders
  if (length(dir(paste(path, run, "/taxonomic_profiles/", sep = ""), pattern = "taxa_ranks")) > 0) {
    cat(paste("\n\n\t-> '", path, run, "/taxonomic_profiles/taxa_ranks' already exists; files inside will be overwritten.\n\n",sep = ""))
  }else{
    dir.create(paste(path, run, "/taxonomic_profiles/taxa_ranks", sep=""))
  }


  #Function for clustering taxa into user-defined ranks
  Taxa.cluster = function(DF, OutName){

    DF1 = data.frame(matrix(vector(), 0, dim(DF)[2]+1, dimnames=list(c(), c("taxa",colnames(DF)))),stringsAsFactors=F)
    colnames(DF1)[2:dim(DF1)[2]] = colnames(DF)

    j=1
    for (i in 1:dim(DF)[1]) {
      V1 = grep(Taxa.L, tax.branch[which(tax.branch[,1] %in% rownames(DF)[i]),])
      if (length(V1)>1) {
        V1=V1[1]
      }
      if (length(V1) !=0) {
        DF1[j,] = c(tax.branch[which(tax.branch[,1] %in% rownames(DF)[i]),V1], as.numeric(DF[i,1:dim(DF)[2]]))
        j = j+1
      }
    }

    for (i in 2:dim(DF1)[2]) {
      DF1[,i] = as.numeric(DF1[,i])
    }

    if (dim(DF1)[1] != 0) {
      DF2 = aggregate(. ~ taxa, data = DF1, sum)
      taxa_split = strsplit(DF2$taxa, ":") 
      DF2$taxa_ID = sapply(taxa_split, function(x) x[1])
      DF2$taxa_name = sapply(taxa_split, function(x) paste(x[2:(length(x)-1)], collapse = ":"))
      DF2$taxonomic_rank = sapply(taxa_split, function(x) x[length(x)])
      DF2 = DF2[,c((dim(DF2)[2] - 2):dim(DF2)[2],2:(dim(DF2)[2] - 3))]
      
      write.table(DF2, quote=F, row.names=F, col.names=T, sep="\t",
                  file = paste(path, run, "/taxonomic_profiles/taxa_ranks/", OutName, "_",taxa.level, ".txt", sep=""))
    }else{
      cat(paste("\n\n\t-> For ", OutName, ", no taxa clustered into ", taxa.level, ".\n\n",sep = ""))
    }
  }


  #cluster complete taxa profile
  DF = read.csv(paste(path, run, "/intermediate/", "taxa_profile_final.txt", sep=""),sep="\t",
                quote="", check.names=F, stringsAsFactors=F)
  rownames(DF) = DF$taxa
  DF1 = as.data.frame(DF[,-1])
  colnames(DF1) = colnames(DF)[-1]
  rownames(DF1) = rownames(DF)

  rank.name = strsplit(rank.name,",")[[1]]

  for (taxa.level in rank.name) {
    Taxa.L = paste(":",taxa.level,sep="")
    X2 = Taxa.cluster(DF = DF1, OutName = "all_taxa")
  }


  #cluster on groups
  if (length(dir(paste(path, run, "/intermediate/taxa_groups/", sep=""), pattern = ".txt"))>0) {

    cat("\n\n\t-> Taxa groups found and will also be clustered into ranks.\n\n")

    file.list = dir(paste(path, run, "/intermediate/taxa_groups/", sep=""), pattern = ".txt") #list files

    for (i in 1:length(file.list)) {

      DF = read.csv(paste(path, run, "/intermediate/taxa_groups/", file.list[i], sep=""),sep="\t",
                    quote="", check.names=F, stringsAsFactors=F)
      rownames(DF) = DF$taxa
      DF1 = as.data.frame(DF[,-1])
      colnames(DF1) = colnames(DF)[-1]
      rownames(DF1) = rownames(DF)

      OutName = sub(".txt", "", file.list[i])
      for (taxa.level in rank.name) {
        Taxa.L = paste(":",taxa.level,sep="")
        X2 = Taxa.cluster(DF = DF1, OutName = OutName)
      }
    }
  }
}
wyc661217/ngsLCA documentation built on June 2, 2025, 12:02 a.m.