#' Fit SINGLE's logistic regression
#'
#' This is an auxiliary function in single package. It takes counts_pnq and for each position and nucleotide it fits SINGLE's logistic regression.
#' @param counts_pnq Data frame with columns position nucleoide quality counts, as returned by pileup_by_QUAL
#' @param ref_seq DNAStringSet containing the true reference sequence.
#' @param p_prior_errors Data frame with columns position nucleotide prior.error, as the one returned by p_prior_errors().
#' @param p_prior_mutations Data frame with columns wt.base, nucleotide and p_mutation (probaility of mutation), as the one returned by p_prior_mutations().
#' @param save Logical. Should data be saved in a output_file?
#' @param output_file_fits File into which save the single fits if save=TRUE
#' @param output_file_data File into which save the fitted data if save=TRUE
#' @param verbose Logical.
#' @param keep_fit_quality Logical. Should parameters related to the quality of the fit be returned in extra columns of the output?
#' @return data.frame with columns position, nucleotide, slope and intercept (of the sigmoidal regression).
#' @importFrom tidyr replace_na
#' @importFrom stats glm coefficients
#' @importFrom utils write.table
#' @importFrom rlang .data
#' @import dplyr
#' @importFrom utils txtProgressBar setTxtProgressBar
#' @export fit_logregr
#' @examples
#' refseq_fasta <- system.file("extdata", "ref_seq.fasta", package = "single")
#' ref_seq = Biostrings::readDNAStringSet(refseq_fasta)
#' train_reads_example <- system.file("extdata", "train_seqs_500.sorted.bam",
#' package = "single")
#' counts_pnq <- pileup_by_QUAL(bam_file=train_reads_example,
#' pos_start=1,pos_end=10)
#' p_prior_mutations <- p_prior_mutations(rates.matrix = mutation_rate,
#' mean.n.mut = 5,ref_seq = ref_seq)
#' p_prior_errors <- p_prior_errors(counts_pnq=counts_pnq)
#' fits <- fit_logregr(counts_pnq = counts_pnq,ref_seq=ref_seq,
#' p_prior_errors = p_prior_errors,p_prior_mutations = p_prior_mutations)
fit_logregr <- function(counts_pnq,ref_seq,p_prior_errors,p_prior_mutations,
save=FALSE, output_file_fits,output_file_data,verbose=FALSE,
keep_fit_quality=FALSE){
#Pre-editing data
ref_seq_char = strsplit(as.character(ref_seq),"")[[1]]
data <- dplyr::as_tibble(counts_pnq)%>%
dplyr::mutate(wt.base = ref_seq_char[.data$pos])# Add wildtype base
## Wildtype matrix: keep rows with wildtype reads
data_wt <- data %>%
dplyr::filter(.data$nucleotide==.data$wt.base)%>%
dplyr::select(-.data$nucleotide) %>%
dplyr::rename(counts.wt=count)
## Data with mutations (errors)
data_mut <- data %>% #start from data
dplyr::filter(.data$nucleotide!=.data$wt.base)#keep only mutations
missing_position <- setdiff(seq_len(max(data$pos)) ,data_mut$pos)
if(length(missing_position)>0){
data.aux <- data.frame(pos=missing_position,
nucleotide=NA, QUAL=20,count=NA,wt.base=NA) %>%
mutate(wt.base=ref_seq_char[.data$pos]) %>%
mutate(nucleotide=if_else(.data$wt.base=="A","C","A")) %>%
mutate(strand="+")
data_mut <- data_mut %>% rbind(data.aux)
rm(data.aux)
}
data_mut_expansion <- data_mut %>%
tidyr::expand(.data$pos,.data$nucleotide,.data$QUAL,.data$strand)
data_mut <- data_mut %>%
dplyr::full_join(data_mut_expansion,
by = c("pos", "nucleotide", "QUAL","strand"))%>%
dplyr::mutate(wt.base = ref_seq_char[.data$pos]) %>%
dplyr::full_join(data_wt, by=c("pos", "QUAL","wt.base","strand"))%>%
dplyr::left_join(p_prior_mutations, by=c("wt.base", "nucleotide"))%>%
dplyr::left_join(p_prior_errors,by=c("pos", "nucleotide","strand"))%>%
dplyr::filter(.data$nucleotide!=.data$wt.base)%>%
dplyr::mutate(count=tidyr::replace_na(.data$count,0))%>%
dplyr::mutate(counts.wt=tidyr::replace_na(.data$counts.wt,0))%>%
dplyr::mutate(p_prior_error=tidyr::replace_na(.data$p_prior_error,0))
rm(data_mut_expansion)
## Count number of errors/good reads by position and nucleotide
total_counts <- data_mut %>%
dplyr::group_by(.data$pos, .data$nucleotide,.data$strand)%>%
dplyr::summarise(total.counts.mut=sum(.data$count,na.rm=TRUE),
total.counts.wt=sum(.data$counts.wt,na.rm=TRUE))%>%
dplyr::mutate(total.counts=.data$total.counts.mut+.data$total.counts.wt)
data_mut <- dplyr::full_join(data_mut, total_counts,
by=c("pos", "nucleotide","strand"))
## reweight counts by prior probabilities
data_mut <- data_mut %>%
dplyr::mutate(pc=.data$p_mutation/(.data$p_mutation+.data$p_prior_error),
pi=.data$p_prior_error/(.data$p_mutation+.data$p_prior_error))%>%
dplyr::mutate(counts.scaled =(.data$count/.data$total.counts.mut*.data$pi*.data$total.counts),
counts.wt.scaled =(.data$counts.wt/.data$total.counts.wt*.data$pc*.data$total.counts))
## Fit data:
data_fits <- data_mut %>%
dplyr::select(.data$strand,.data$pos, .data$nucleotide)%>%
dplyr::distinct(.data$strand,.data$pos, .data$nucleotide)%>%
dplyr::arrange(.data$strand,.data$pos, .data$nucleotide)%>%
dplyr::mutate(prior_slope=NA, prior_intercept=NA)
if(keep_fit_quality){
data_fits <- data_fits %>%
dplyr::mutate(slope_pval=NA, intercept_pval=NA,
deviance=NA, null_deviance=NA)
}
n_i <- nrow(data_fits)
if(verbose){message("\n Fitting \n")}
if(verbose){pb=utils::txtProgressBar(min=0,max=nrow(data_fits),style = 3)}
for (i in seq_len(nrow(data_fits))){
if(verbose){utils::setTxtProgressBar(pb,i)}
#Keep data from this position & nucleotide
aux_df <- data_mut %>%
dplyr::filter(.data$pos==data_fits$pos[i] & .data$nucleotide ==data_fits$nucleotide[i] & .data$strand==data_fits$strand[i])%>%
dplyr::select(.data$strand,.data$QUAL,.data$count, .data$counts.wt,.data$counts.scaled,.data$counts.wt.scaled) %>%
dplyr::mutate(tot.scaled=.data$counts.scaled+.data$counts.wt.scaled) %>%
dplyr::mutate(proportion.wt.scaled=.data$counts.wt.scaled/.data$tot.scaled)
if(sum(aux_df$count)==0){
data_fits$prior_slope[i] <- NA
data_fits$prior_intercept[i] <- NA
if(verbose){
warning('Position ', data_fits$pos[i], data_fits$nucleotide[i], " has no data to be fitted.\n")
}
next()
}
qval <- aux_df$QUAL
yvals <- aux_df$proportion.wt.scaled
#Corrected fit by prior data
if(!(all(is.na(aux_df$counts.wt.scaled))|all(is.na(aux_df$counts.scaled)))){
aux_prior_fit <- stats::glm(yvals[!is.na(yvals)]~ qval[!is.na(yvals)],family = "quasibinomial")
aux_prior_coefficients <- stats::coefficients(aux_prior_fit)
data_fits$prior_slope[i] <- aux_prior_coefficients[2]
data_fits$prior_intercept[i] <- aux_prior_coefficients[1]
if(keep_fit_quality){
fit_summ <- summary(aux_prior_fit)$coefficients
data_fits$slope_pval[i] <- fit_summ[2,4]
data_fits$intercept_pval[i] <- fit_summ[1,4]
data_fits$deviance[i] <- summary(aux_prior_fit)$deviance
data_fits$null_deviance[i] <- summary(aux_prior_fit)$null.deviance
}
}
}
# SAVE RESULTS
if(save){
utils::write.table(data_fits, file=output_file_fits,
quote = FALSE, row.names = FALSE)
utils::write.table(data_mut,file=output_file_data,
quote = FALSE, row.names = FALSE)
}
return(data_fits)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.