# R/Rbec.R

#' Reference-based error correction of amplicon sequencing data
#'
#' @description
#' This function corrects amplicon sequencing data from synthetic communities in which the reference sequences are known a priori.
#'
#' @details Ruben Garrido-Oter's group, Plant-Microbe interaction, Max Planck Institute for Plant Breeding Research
#' @author  Pengfan Zhang
#'
#' @param fastq the path of the fastq file containing merged amplicon sequencing reads (Ns are not allowed in the reads)
#' @param reference the path of the unique reference sequences; each sequence must be on a single line (Ns are not allowed in the sequences)
#' @param outdir the output directory; it will be created if it does not already exist
#' @param threads the number of threads used, default 1
#' @param sampling_size the sampling size for calculating the error matrix, default 5000
#' @param ascii the ASCII offset used to encode Phred quality scores (33 or 64), default 33
#' @param min_cont_obs_abd the minimum observed abundance of unique tags for detecting contaminant sequences, default 200
#' @param min_cont_abd the minimum relative abundance of unique tags for detecting contaminant sequences that cannot be corrected by any of the references, default 0.03
#' @param min_E the minimum expectation of the Poisson distribution for the identification of paralogues, default 0.05
#' @param min_P the minimum P-value threshold of the Poisson distribution to correct a read, default 1e-40
#' @param ref_seeker the method for finding the candidate error-producing reference sequence for a tag showing the same lowest k-mer distance to multiple references: 1 for the abundance-based method; 2 for the transition probability-based method, default 1
#' @param cn the path to the copy-number table documenting the copy number of the marker gene in each strain (header included); if NULL, Rbec normalizes the abundance based on internally inferred copy numbers, which tend to underestimate the true copy numbers, default NULL
#'
#' @import readr
#' @import dada2
#' @import doParallel
#' @import foreach
#'
#' @usage Rbec(fastq, reference, outdir, threads=1, sampling_size=5000, ascii=33, min_cont_obs_abd=200, min_cont_abd=0.03, min_E=0.05, min_P=1e-40, ref_seeker=1, cn=NULL)
#' @examples
#' fastq <- system.file("extdata", "test_raw_merged_reads.fastq.gz", package = "Rbec")
#'
#' ref <- system.file("extdata", "test_ref.fasta", package = "Rbec")
#'
#' Rbec(fastq=fastq, reference=ref, outdir=tempdir(), threads=1, sampling_size=500, ascii=33)
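#'
#' # a hypothetical run with a copy-number table (a tab-delimited strain/copy-number file
#' # with a header row; the path below is a placeholder, not shipped with the package):
#' # Rbec(fastq=fastq, reference=ref, outdir=tempdir(), threads=1, sampling_size=500,
#' #      ascii=33, cn="copy_number.tsv")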
#'
#' @return lambda_final.out the lambda value and p-value of the Poisson distribution for each read
#' @return error_matrix_final.out the error matrix in the final iteration
#' @return strain_table.txt the strain composition of the sample
#' @return strain_table_normalized.txt the copy-number-normalized strain composition of the sample if the copy number table is provided
#' @return contamination_seq.fna the potential sequences generated by contaminants
#' @return rbec.log percentage of corrected reads, which can be used to predict contaminated samples
#' @return paralogue_seq.fna paralogous sequences detected in each strain in addition to the reference provided
#'
#' @export
#'

Rbec <- function(fastq, reference, outdir, threads=1, sampling_size=5000, ascii=33, min_cont_obs_abd=200, min_cont_abd=0.03, min_E=0.05, min_P=1e-40, ref_seeker=1, cn=NULL){


    t1 <- Sys.time()

    # parse input parameters and read reference sequences

    tryCatch(
        para_check <- as.numeric(threads),
        error = function(e) {stop("parameters 'threads' should be intergers!")},
        warning =  function(w) {stop("parameters 'threads' should be intergers!")}
    )

    tryCatch(
        para_check <- as.numeric(sampling_size),
        error = function(e) {stop("parameters 'sampling_size' should be intergers!")},
        warning =  function(w) {stop("parameters 'sampling_size' should be intergers!")}
    )

    tryCatch(
        para_check <- as.numeric(ascii),
        error = function(e) {stop("parameters 'ascii' should be intergers!")},
        warning =  function(w) {stop("parameters 'ascii' should be intergers!")}
    )


    ref <- read_lines(reference)
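    # the reference FASTA is parsed as two-line records: odd lines are treated as headers,
    # even lines as the corresponding sequences (each sequence on a single line)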

    ref_name <- data.frame(name=ref[which(seq_along(ref)%%2==1)], seq=toupper(ref[which(seq_along(ref)%%2==0)]), stringsAsFactors=FALSE)
    ref_name$name <- sub(">", "", ref_name$name)
    ref <- data.frame(ref_seq=toupper(ref[which(seq_along(ref)%%2==0)]), stringsAsFactors=FALSE)

    # calculate the error matrix
    error_ref_matrix <- error_m(fastq, ref, sampling_size, threads, ascii, ref_seeker)
    error_matrix <- error_ref_matrix[["err"]]
    ref <- error_ref_matrix[["ref"]]
    derep <- error_ref_matrix[["derep"]]
    message("Finished calculating the transition and error matrices.")

    # calculate abundance probabilities
    lambda_out <- abd_prob(derep, ref, error_matrix)

    final_list <- consis_err(fastq, derep, ref, lambda_out, sampling_size, ascii, min_E, min_P)
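    # final_list holds the corrected strain table ("table"), the per-read lambda/p-value
    # table ("lambda_out"), the final error matrix ("err") and the number of corrected
    # reads ("total_corrected"), which are used below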

    straintab <- final_list[["table"]]
    straintab[, 1] <- ref_name$name[match(straintab[, 1], ref_name[, 2])]
    # normalize the abundance if copy number is available
    if (!is.null(cn)){
      cp_n <- as.data.frame(read_delim(cn, delim="\t", col_names=FALSE), stringsAsFactors = FALSE)
      straintab_norm <- straintab
      straintab_norm$Corrected_Abundance <- round(straintab_norm$Corrected_Abundance/cp_n[match(straintab_norm$Ref_seq, cp_n[, 1]), 2])
    }

    lambda_out <- final_list[["lambda_out"]]
    lambda_out[, 2] <- ref_name$name[match(lambda_out[, 2], ref_name[, 2])]

    # identify potential contaminants: reads that cannot be corrected and are abundant
    # (observed abundance >= min_cont_obs_abd or relative abundance >= min_cont_abd)
    contamination <- which(final_list$lambda_out$Corrected=="No" &
                         (final_list$lambda_out$Obs_abd >= min_cont_obs_abd | final_list$lambda_out$Obs_abd/error_ref_matrix$total_reads>=min_cont_abd))
    # output sequences of potential contaminants
    conta_out <- data.frame(stringsAsFactors=FALSE)
    c <- 1
    for (i in contamination){
        conta_align <- nwalign(names(derep$uniques[i]), final_list$lambda_out$Ref_ID[i])
        conta_align1 <- unlist(strsplit(conta_align[1], ""))
        conta_align2 <- unlist(strsplit(conta_align[2], ""))
        iden <- paste(round(length(which(conta_align1==conta_align2))/length(conta_align1)*100, 1), "%", sep="")
        conta_iden <- paste(lambda_out$Ref_ID[i], iden, sep=":")
        conta_rec <- paste(">Contamination_seq", c, sep="")
        conta_abd <- paste("Observed Abundance:", derep$uniques[i], sep="")
        conta_rate <- paste("Relative Abundance:", round(derep$uniques[i]/error_ref_matrix$total_reads*100, 1), "%", sep="")
        conta_rec <- paste(conta_rec, conta_abd, conta_rate, conta_iden, sep="; ")
        conta_out <- rbind(conta_out, conta_rec, stringsAsFactors = FALSE)
        conta_rec <- names(derep$uniques[i])
        conta_out <- rbind(conta_out, conta_rec, stringsAsFactors = FALSE)
        c <- c+1
    }
    
    # output paralogue sequences for each strain
    paralog_out <- data.frame(stringsAsFactors=FALSE)
    c <- 1
    cn_internal_inference <- c()
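    # a unique tag is reported as a putative paralogue when it differs from its reference
    # (Lambda != 1), has an expected abundance of at least min_E, was corrected, and reaches
    # at least 1/15 of the abundance of the most abundant tag assigned to the same reference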
    for (i in seq(nrow(lambda_out))){
      if (lambda_out$Lambda[i]!=1 & lambda_out$E[i]>=min_E & lambda_out$Corrected[i]=="Yes" & lambda_out$Obs_abd[i]/max(lambda_out$Obs_abd[which(lambda_out$Ref_ID==lambda_out$Ref_ID[i])])>=1/15){
        paralog_rec <- paste(paste(">", "Seq_", c, sep=""), paste("Paralogue", lambda_out$Ref_ID[i], sep = "="), sep = ";")
        paralog_out <- rbind(paralog_out, paralog_rec, stringsAsFactors=FALSE)
        paralog_out <- rbind(paralog_out, names(derep$uniques[i]), stringsAsFactors=FALSE)
        cn_internal_inference <- c(cn_internal_inference, lambda_out$Ref_ID[i])
        c <- c+1
      }
    }
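    # if no copy-number table was supplied, infer copy numbers internally as the number of
    # paralogues detected for each reference plus one (the reference sequence itself)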
    if (is.null(cn)){
        cp_n <- as.data.frame(table(cn_internal_inference))
        if (nrow(cp_n)>0){
            cp_n$Freq <- cp_n$Freq + 1
            straintab_norm <- straintab
            names(cp_n) <- c("Var1", "Freq")
            cp_n <- rbind(as.data.frame(table(straintab_norm$Ref_seq[!straintab_norm$Ref_seq %in% cp_n[,1]])), cp_n)
            straintab_norm$Corrected_Abundance <- round(straintab_norm$Corrected_Abundance/cp_n[match(straintab_norm$Ref_seq, cp_n[, 1]), 2])
        } else {
            straintab_norm <- straintab
        }
    }

    # output final tables and logs

    dir.create(outdir, showWarnings = FALSE)
    lambdaout <- paste(outdir, "lambda_final.out", sep="/")
    emout <- paste(outdir, "error_matrix_final.out", sep="/")
    straintabout <- paste(outdir, "strain_table.txt", sep="/")
    straintabout_norm <- paste(outdir, "strain_table_normalized.txt", sep="/")
    contaout <- paste(outdir, "contamination_seq.fna", sep="/")
    paralogout <- paste(outdir, "paralogue_seq.fna", sep="/")
    logout <- paste(outdir, "rbec.log", sep="/")

    write.table(lambda_out, file= lambdaout, quote=FALSE, sep="\t", row.names = FALSE)
    write.table(final_list[["err"]], file= emout, quote=FALSE, sep="\t")
    write.table(straintab, file= straintabout, quote=FALSE, sep="\t", row.names=FALSE)
    write.table(straintab_norm, file= straintabout_norm, quote=FALSE, sep="\t", row.names=FALSE)
    write.table(conta_out, file = contaout, quote=FALSE, sep="", col.names = FALSE, row.names = FALSE)
    write.table(paralog_out, file=paralogout, quote=FALSE, sep="", col.names = FALSE, row.names = FALSE)

    t2 <- Sys.time()
    time.used <- difftime(t2, t1, units = "mins")
    time.used <- sub("Time difference of ","",time.used)
    time.used <- paste("Total time used:", time.used, "mins", sep=" ")
    summ <- paste(final_list$total_corrected/error_ref_matrix$total_reads*100, "% of reads were corrected!", sep="")

    log_out <- rbind(time.used, summ)
    write.table(log_out, file = logout, quote=FALSE, sep="", col.names = FALSE, row.names = FALSE)

}