R/consistent_error_matrix.R

Defines functions consis_err

#' Reference-based error correction of amplicon sequencing data
#'
#' @description
#' This function iterates the error matrix till reaching the stable stage
#'
#' @details Ruben Garrido-Oter's group, Plant-Microbe interaction, Max Planck Institute for Plant Breeding Research
#' @author  Pengfan Zhang
#'
#' @param fq the path of the fastq file (Ns are not allowed in the reads)
#' @param derep the derepliacted reads by dada2 function
#' @param ref the reference sequences, each sequence should take up one line (Ns are not allowed in the reads)
#' @param lambda_out the matrix containg lambda value and pvalue from the former iteration
#' @param sampling_size the subsampling size of the reads
#' @param ascii ascii characters used to encode phred scores
#' @param min_E the minimum expectation value of the Possion distribution for detecting paralogs within the same strain
#' @param  min_P the P value cutoff for identifying errouneous reads
#' @param max_diff_abs the maximum absolute difference in number of corrected reads between two iterations, together with max_diff_ratio, before jumping out of the iiteration
#' @param max_diff_ratio the maximum difference in the percentages of corrected reads between two iterations
#'
#' @usage consis_err(fq, derep, ref, lambda_out, sampling_size, ascii, min_E=0.05, min_P=1e-40, max_diff_abs=100, max_diff_ratio=0.01)
#'
#' @return Returns the final files
#'
#' @noRd
#'

consis_err <- function(fq, derep, ref, lambda_out, sampling_size, ascii, min_E=0.05, min_P=1e-40, max_diff_abs=100, max_diff_ratio=0.01){
    raw_data <- read_lines(gzfile(fq))
    n <- 1
    repeat {
        message("Start ", n ," iterations")

        # calculate total number of corrected reads and total abundances from the previous iteration
        previous_round <- which(lambda_out$E>=min_E | lambda_out$pval>=min_P)
        previous_sum <- sum(lambda_out[previous_round, "Obs_abd"])

        # update abundances of reference sequences from erroneous reads asigned to them
        est_abd <- numeric(0)
        for (i in seq_along(unique(lambda_out$Ref_ID))) {

            # find all sequences with high expectation and P-val which could have been generated by a given reference sequence
            idx <-which((lambda_out$E>=min_E | lambda_out$pval>=min_P) & lambda_out$Ref_ID==unique(lambda_out$Ref_ID)[i])

            # calculate estimated abundances of a given reference sequence by adding up all the abundances of assigned erroneous sequences
            ref_i_abd <- sum(lambda_out$Obs_abd[idx])
            est_abd <- c(est_abd, ref_i_abd)

        }

        # update estimated abundances for each reference sequence
        ref$est_abd[unique(lambda_out$Ref_ID)] <- est_abd
        message(ref$est_abd)

        # extract raw sequences that belong to the error-corrected unique sequences found in the previous round
        current_round <- which(derep[["map"]] %in% previous_round)

        # sample the extracted sequences for transition matrix generation
        current_sample <- sample(current_round, sampling_size)
        seqs <- raw_data[current_sample*4-2]
        seqs <- toupper(seqs)
        quals <- raw_data[current_sample*4]
        bestref <- derep[["bestref"]][derep[["map"]][current_sample]]

        # calculate transition matrix and update error matrix
        query <- data.frame(seqs=seqs, qual=quals, ref=bestref)
        trans_matrix <- trans_m(query, ascii)
        error_matrix <- loessErr(trans_matrix)
        lambda_out <- abd_prob(derep, ref, error_matrix)

        # calculate total number of corrected reads and total abundances from the current iteration
        current_round <- which(lambda_out$E>=min_E | lambda_out$pval>=min_P)
        current_sum <- sum(lambda_out[current_round, "Obs_abd"])

        # check for convergence
        diff_abs <- abs(previous_sum - current_sum)
        diff_ratio <- diff_abs/previous_sum

        if (diff_abs < max_diff_abs & diff_ratio <= max_diff_ratio) {

            message(diff_abs)
            message(diff_ratio)
            message("The model reached consistency after ", n ," iteration(s)")

            final_abd <- numeric(0)
            for (i in seq_along(unique(lambda_out$Ref_ID))){

                # find all sequences with high expectation and P-val which could have been generated by a given reference sequence
                idx <- which((lambda_out$E>=min_E | lambda_out$pval>=min_P) & lambda_out$Ref_ID==unique(lambda_out$Ref_ID)[i])

                # calculate final abundances of a given reference sequence by adding up all the abundances of assigned erroneous sequences
                ref_i_abd <- sum(lambda_out$Obs_abd[idx])
                final_abd <- c(final_abd, ref_i_abd)

            }  

            out_table <- data.frame(Ref_seq=ref[unique(lambda_out$Ref_ID), 1], Corrected_Abundance=final_abd)
            lambda_out$Total_corrected <- sum(lambda_out[which(lambda_out$E>=min_E | lambda_out$pval>=min_P), "Obs_abd"])
            lambda_out$Corrected <- "No"
            lambda_out$Corrected[which(lambda_out$E>=min_E | lambda_out$pval>=min_P)] <- "Yes"
            lambda_out$Ref_ID <- ref[lambda_out$Ref_ID, 1]
            out_file <- list(lambda_out=lambda_out, err=error_matrix, table=out_table, total_corrected=sum(final_abd))
            return(out_file)

          break
        }
        else {

            message(diff_abs)
            message(diff_ratio)
            n <- n+1

        }
    }
}
PengfanZhang/Rbec documentation built on Sept. 13, 2023, 1 a.m.