#'
#' @title Remove Contamination Taxa
#'
#' @description Generate a contamination list from blank control lca files, and subtract listed taxa from the combined taxa profile. Running this function will update the existing "complete_profile.txt".
#'
#' @param path working directory, same to \code{\link{ngsLCA_profile}}.
#' @param run name of the run, default is "run01".
#' @param control_path path to the directory that contains all blank control lca files.
#' @param threshold.1_control for generating contamination list, minimum reads number required for keeping a taxon in each blank control; default is 2.
#' @param threshold.2_control for generating contamination list, minimum read percentage (to the total reads number of a blank control) required for keeping a taxon in each blank control, ranging from 0 to 1; default is 0.
#' @param threshold.3_control for generating contamination list, minimum sum of reads across all blank controls required for keeping a taxon; default is 5.
#'
#' @return An updated complete_profile.txt (after removing contaminations).
#' @importFrom utils read.csv read.delim write.table
#' @export
#'
#' @examples
#' ngsLCA_deContam(path=system.file("extdata","lca_files",package="ngsLCA"),
#' run="run01",
#' control_path=system.file("extdata","blank_controls",package="ngsLCA"),
#' threshold.1_control=0,
#' threshold.2_control=0,
#' threshold.3_control=0)
#'
#' ## This will combine the blank control lca files within folder
#' ## "extdata/blank_controls/" to generate a contamination list,
#' ## and then filter this list using the 3 control thresholds
#' ## supplied. The listed taxa will be subtracted from the combined
#' ## taxa profile.
#'
#'
ngsLCA_deContam = function(path,
run="run01",
control_path,
threshold.1_control = 2,
threshold.2_control = 0,
threshold.3_control = 5){
#modify input path
if (!grepl('/$', path)) {
path = paste(path,"/",sep="")
}
#local variable
taxa = NULL
#data read-in
if (length(dir(paste(path, run, "/intermediate/", sep=""), pattern ="taxa_profile_v1.txt")) == 0) {
cat(paste("\n\n\t-> Error: required input file not found under '", run, "' folder, please run 'ngsLCA_profile' first.\n\n",
sep = ""))
stop()
}else{
cat("\n\n\t-> De-contaminate.\n\n")
}
if (length(dir(paste(path, run, "/intermediate/", sep=""), pattern ="taxa_profile_v2.3.txt")) == 1) {
cat("\n\n\t-> Taxa profile filtered by 'ngsLCA_filter' is found and will be used as input.\n\n")
DF1 = read.delim(paste(path, run, "/intermediate/", "taxa_profile_v2.3.txt", sep=""),
stringsAsFactors=FALSE,header = T,check.names = F)
}else{
cat("\n\n\t-> Original profile generated by 'ngsLCA_profile' will be used as input.\n\n")
DF1 = read.delim(paste(path, run, "/intermediate/", "taxa_profile_v1.txt", sep=""),
stringsAsFactors=FALSE,header = T,check.names = F)
}
#blank control read-in
if (!grepl('/$', control_path)) {
control_path = paste(control_path,"/",sep="")
}
FileList = dir(control_path, pattern = ".lca$")
if (length(FileList)==0) {
cat("\n\n\t-> Error: no lca files found in the appointed 'control_path' directory.\n\n")
stop()}else{
CON1 = data.frame(taxa=character(), stringsAsFactors=F)
for (i in 1:length(FileList)) {
CON2.1 = read.csv(paste(control_path, FileList[i], sep=""), header=F, sep="\t", stringsAsFactors=F, fill=T,
col.names = paste0("V",seq_len(60)), skip = 1)
if(dim(CON2.1)[1]>0){
CON2.2 = data.frame(taxa=CON2.1[,2],
count=rep(1, dim(CON2.1)[1]),
stringsAsFactors = F)
CON2.3 = aggregate(CON2.2[,2]~CON2.2[,1], data=CON2.2, FUN=sum)
colnames(CON2.3) = c("taxa",sub(".lca","",FileList[i]))
CON1 = merge(CON1, CON2.3, by="taxa", all=T)
}else{
paste("\n\n\t-> ", FileList[i], " contains no taxa, skipped.\n\n", sep = "")
}
}
CON1[is.na(CON1)] = 0
if (dim(CON1)[1]>0) {
write.table(CON1, file = paste(path, run, "/intermediate/", "contamination_list_v1.txt", sep=""), quote=F,
row.names=F, col.names=T, sep="\t")
}else{
cat("\n\n\t-> No alignments appear in the supplied blacnk control lca files.\n\n")
stop()
}
}
#filter blank controls
thr1b=as.numeric(threshold.1_control)
thr2b=as.numeric(threshold.2_control)
thr3b=as.numeric(threshold.3_control)
DF2 = read.delim(paste(path, run, "/intermediate/", "contamination_list_v1.txt", sep=""),
stringsAsFactors=FALSE, header = T, check.names = F)
DF2.1 = as.data.frame(DF2[,-1])
for (i in 1:dim(DF2.1)[2]) {
DF2.1[,i] = as.numeric(DF2.1[,i])
}
colnames(DF2.1) = colnames(DF2)[-1]
DF2.1[DF2.1 < thr1b] = 0
DF2.1$taxa = DF2$taxa
DF2.1 = DF2.1[,c(dim(DF2.1)[2],2:dim(DF2.1)[2]-1)]
if (dim(DF2.1)[2]>2) {
DF2.1 = DF2.1[which(rowSums(DF2.1[,-1]) !=0),]
}else{
DF2.1 = DF2.1[which(DF2.1[,2] !=0),]
}
if (dim(DF2.1)[1]>0) {
write.table(DF2.1, file = paste(path, run, "/intermediate/", "contamination_list_v2.1.txt", sep=""), quote=F,
row.names=F, col.names=T, sep="\t")
}else{
cat("\n\n\t-> All taxa in the contamination list removed by threshold.1_control.\n\n")
stop()
}
DF2.2 = DF2.1
for (i in 2:dim(DF2.2)[2]) {
if (sum(DF2.2[,i])>0) {
DF2.2[(DF2.2[,i] < sum(DF2.2[,i])*thr2b),i] = 0
}
}
if (dim(DF2.2)[1]>0) {
write.table(DF2.2, file = paste(path, run, "/intermediate/", "contamination_list_v2.2.txt", sep=""), quote=F,
row.names=F, col.names=T, sep="\t")
}else{
cat("\n\n\t-> All taxa in the contamination list removed by threshold.2_control.\n\n")
stop()
}
if (dim(DF2.2)[2]>2) {
DF2.3 = DF2.2[which(rowSums(DF2.2[,-1]) >= thr3b),]
}else{
DF2.3 = DF2.2[which(DF2.2[,2] >= thr3b),]
}
if (dim(DF2.3)[1]>0) {
write.table(DF2.3, file = paste(path, run, "/intermediate/", "contamination_list_v2.3.txt", sep=""), quote=F,
row.names=F, col.names=T, sep="\t")
DF2.4 = DF2.3
taxa_split = strsplit(DF2.3$taxa, ":")
DF2.4$taxa_ID = sapply(taxa_split, function(x) x[1])
DF2.4$taxa_name = sapply(taxa_split, function(x) paste(x[2:(length(x)-1)], collapse = ":"))
DF2.4$taxonomic_rank = sapply(taxa_split, function(x) x[length(x)])
DF2.4 = DF2.4[,c((dim(DF2.4)[2] - 2):dim(DF2.4)[2],2:(dim(DF2.4)[2] - 3))]
write.table(DF2.4, file = paste(path, run, "/taxonomic_profiles/", "contamination_list.txt", sep=""), quote=F,
row.names=F, col.names=T, sep="\t")
}else{
cat("\n\n\t-> All taxa in the contamination list removed by threshold.3_control.\n\n")
stop()
}
#de-contamination
CON = read.delim(paste(path, run, "/intermediate/", "contamination_list_v2.3.txt", sep=""),
stringsAsFactors=FALSE, header = T)
N = length(which(DF1$taxa %in% CON$taxa))
if (N>0) {
paste("\n\n\t-> A total of", N, " taxa detected in the contamination list and will be subtracted.\n\n")
DF2 = DF1[-which(DF1$taxa %in% CON$taxa),]
}else{
cat("\n\n\t-> No taxa in the contamination list detected in the combined taxa profile.\n\n")
DF2 = DF1
}
#write files
if (dim(DF2)[1] > 0) {
write.table(DF2, file = paste(path, run, "/intermediate/", "taxa_profile_v3.txt", sep=""), quote=F,
row.names=F, col.names=T, sep="\t")
write.table(DF2, file = paste(path, run, "/intermediate/", "taxa_profile_final.txt", sep=""), quote=F,
row.names=F, col.names=T, sep="\t")
DF3 = DF2
taxa_split = strsplit(DF2$taxa, ":")
DF3$taxa_ID = sapply(taxa_split, function(x) x[1])
DF3$taxa_name = sapply(taxa_split, function(x) paste(x[2:(length(x)-1)], collapse = ":"))
DF3$taxonomic_rank = sapply(taxa_split, function(x) x[length(x)])
DF3 = DF3[,c((dim(DF3)[2] - 2):dim(DF3)[2],2:(dim(DF3)[2] - 3))]
write.table(DF3, file = paste(path, run, "/taxonomic_profiles/", "complete_profile.txt", sep=""), quote=F,
row.names=F, col.names=T, sep="\t")
}else{
cat("\n\n\t-> All taxa removed by de-contamination.\n\n")
stop()
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.