#'
#' @title Filter the Combined Taxa Profile
#'
#' @description Filter the combined taxa profile. Running this function will update the existing "complete_profile.txt".
#'
#' @param path working directory, same to \code{\link{ngsLCA_profile}}.
#' @param run name of the run, default is "run01".
#' @param remove.taxa a list of NCBI taxaID indicating taxa that will be removed. This can be a comma separated vector or the path to a text file listing one taxaID in each line.
#' @param threshold.1 minimum reads number required for confirming a taxon in each sample; default is 2.
#' @param threshold.2 minimum read percentage (to the total reads number of a given sample) required for confirming a taxon in each sample, ranging from 0 to 1; default is 0.
#' @param threshold.3 minimum sum of reads across all samples required for confirming a taxon in the combined taxa profile; default is 5.
#'
#' @return An updated complete_profile.txt (after taxa filters).
#' @importFrom utils read.csv read.delim write.table
#' @export
#'
#' @examples
#' ngsLCA_filter(path=system.file("extdata","lca_files",package="ngsLCA"),
#' run="run01",
#' remove.taxa="1115501,10114",
#' threshold.1=2,
#' threshold.2=0,
#' threshold.3=2)
#'
#'
#' ## This will filter and update the combined taxa profile:
#' ## "path/run01/taxonomic_profiles/complete_profile.txt"
#' ## by the 3 thresholds supplied, and by removing taxa
#' ## with taxaID 1115501 and 10114.
#'
#'
ngsLCA_filter = function(path,
run="run01",
remove.taxa=NULL,
threshold.1 = 2,
threshold.2 = 0,
threshold.3 = 5){
#modify input path
if (!grepl('/$', path)) {
path = paste(path,"/",sep="")
}
#local variable
taxa = NULL
#read-in file
if (length(dir(paste(path, run, "/intermediate/", sep=""),pattern = "taxa_profile_v1.txt")) == 0) {
cat(paste("\n\n\t-> Error: required input file not found under '", run, "' folder, please run 'ngsLCA_profile' first.\n\n",
sep = ""))
stop()
}else{
cat("\n\n\t-> Filter taxa profile.\n\n")
}
if (length(dir(paste(path, run, "/intermediate/", sep=""), pattern ="taxa_profile_v3.txt")) == 1) {
cat("\n\n\t-> De-contaminated taxa profile generated by 'ngsLCA_deContam' is found and will be used as input.\n\n")
DF1 = read.delim(paste(path, run, "/intermediate/", "taxa_profile_v3.txt", sep=""),
stringsAsFactors=FALSE,header = T,check.names = F)
}else{
cat("\n\n\t-> Original taxa profile generated by 'ngsLCA_profile' will be used as input.\n\n")
DF1 = read.delim(paste(path, run, "/intermediate/", "taxa_profile_v1.txt", sep=""),
stringsAsFactors=FALSE,header = T,check.names = F)
}
#removes taxa in the remove.taxa list
if (!is.null(remove.taxa)) {
remove.taxa = strsplit(remove.taxa,",")[[1]]
remove.taxa1 = as.numeric(remove.taxa)
if (any(is.na(remove.taxa1))) {
if (file.exists(remove.taxa)) {
remove.taxa1 = read.csv(remove.taxa,stringsAsFactors = F,header = F)
remove.taxa1 = as.numeric(remove.taxa1[,1])
}
}
if (is.numeric(remove.taxa1)) {
remove.taxa = remove.taxa1
}else{
cat("\n\n\t-> Error: 'remove.taxa' contains no-numeric values, please input NCBI taxaID only.\n\n")
stop()
}
TAXAID = as.numeric(sapply(strsplit(DF1$taxa, split = ":"),function(x) x[1]))
if (length(which(TAXAID %in% remove.taxa)) >0 ) {
DF1 = DF1[-which(TAXAID %in% remove.taxa),]
}
}
#threshold.1 filtering
DF2.1 = as.data.frame(sapply(DF1[,-1], as.numeric))
colnames(DF2.1) = colnames(DF1)[-1]
DF2.1[DF2.1 < threshold.1] = 0
DF2.1$taxa = DF1$taxa
DF2.1 = DF2.1[,c(dim(DF2.1)[2],2:dim(DF2.1)[2]-1)]
if (dim(DF2.1)[2]>2) {
DF2.1 = DF2.1[which(rowSums(DF2.1[,-1]) !=0),]
}else{
DF2.1 = DF2.1[which(DF2.1[,2] !=0),]
}
if (dim(DF2.1)[1]>0) {
write.table(DF2.1, file = paste(path, run, "/intermediate/", "taxa_profile_v2.1.txt", sep=""), quote=F,
row.names=F, col.names=T, sep="\t")
}else{
cat("\n\n\t-> Error: all taxa removed by threshold.1, try assign a lower value.\n\n")
stop()
}
#threshold.2 filtering
DF2.2 = DF2.1
for (i in 2:dim(DF2.2)[2]) {
if (sum(DF2.2[,i])>0) {
DF2.2[(DF2.2[,i] < sum(DF2.2[,i])*threshold.2),i] = 0
}
}
if (dim(DF2.2)[2]>2) {
DF2.2 = DF2.2[which(rowSums(DF2.2[,-1]) !=0),]
}else{
DF2.2 = DF2.2[which(DF2.2[,2] !=0),]
}
if (dim(DF2.2)[1]>0) {
write.table(DF2.2, file = paste(path, run, "/intermediate/", "taxa_profile_v2.2.txt", sep=""), quote=F,
row.names=F, col.names=T, sep="\t")
}else{
cat("\n\n\t-> Error: all taxa removed by threshold.2, try assign a lower value.\n\n")
stop()
}
#threshold.3 filtering and write files
if (dim(DF2.2)[2]>2) {
DF2.3 = DF2.2[which(rowSums(DF2.2[,-1]) >= threshold.3),]
}else{
DF2.3 = DF2.2[which(DF2.2[,2] >= threshold.3),]
}
if (dim(DF2.3)[1]>0) {
write.table(DF2.3, file = paste(path, run, "/intermediate/", "taxa_profile_v2.3.txt", sep=""), quote=F,
row.names=F, col.names=T, sep="\t")
write.table(DF2.3, file = paste(path, run, "/intermediate/", "taxa_profile_final.txt", sep=""), quote=F,
row.names=F, col.names=T, sep="\t")
DF3 = DF2.3
taxa_split = strsplit(DF2.3$taxa, ":")
DF3$taxa_ID = sapply(taxa_split, function(x) x[1])
DF3$taxa_name = sapply(taxa_split, function(x) paste(x[2:(length(x)-1)], collapse = ":"))
DF3$taxonomic_rank = sapply(taxa_split, function(x) x[length(x)])
DF3 = DF3[,c((dim(DF3)[2] - 2):dim(DF3)[2],2:(dim(DF3)[2] - 3))]
write.table(DF3, file = paste(path, run, "/taxonomic_profiles/", "complete_profile.txt", sep=""), quote=F,
row.names=F, col.names=T, sep="\t")
}else{
cat("\n\n\t-> Error: all taxa removed by threshold.3, try assign a lower value.\n\n")
stop()
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.