R/merging_final_pretest_dataframe.R

Defines functions .count_trinuc_muts .merge_sites_and_muts .maf_to_dfr2

Documented in .count_trinuc_muts .maf_to_dfr2 .merge_sites_and_muts

#' Combines sites and mutations for sites and flanks from each bin in prepared_sites
#'
#' @param this_maf Data frame of mutations prepared by get_mut_trinuc_strand. Contains all if not cofactors are included
#' @param prepared_sites List of coordinates and quadnucleotide contexts generated by prepare_bins_of_sites
#'
#' @return Data frame containing mutation counts for each quadnucleotide context + indel from each bin and sites + up/down flanks
.maf_to_dfr2 = function(this_maf, prepared_sites) {

	if (is.null(prepared_sites[[1]])) {
		return(NULL)
	}

	gr_this_maf = GenomicRanges::GRanges(this_maf$chr, 
	                                     IRanges::IRanges(this_maf$start, this_maf$end))
	GenomicRanges::mcols(gr_this_maf)[,"trinuc"] = this_maf$mut_trinuc
	GenomicRanges::mcols(gr_this_maf)[,"ID"] = paste(this_maf$chr, this_maf$start, this_maf$end, 
	                                                 this_maf$ref, this_maf$alt, this_maf$patient_id, sep=":")

	full_dfr = NULL

	for (i in 1:length(prepared_sites)) {

    gr_sites = prepared_sites[[i]][['gr_sites']]
		gr_flank = prepared_sites[[i]][['gr_flank']]
		trinuc_sites = prepared_sites[[i]][['trinuc_sites']]
		trinuc_flank = prepared_sites[[i]][['trinuc_flank']]
	
    sites_dfr = .merge_sites_and_muts(gr_sites, gr_this_maf, trinuc_sites, 
				is_site = TRUE)
		flank_dfr = .merge_sites_and_muts(gr_flank, gr_this_maf, trinuc_flank, 
				is_site = FALSE)

		dfr = rbind(sites_dfr, flank_dfr)
		dfr$mut_rate = as.numeric(gsub("mutrate__", "", names(prepared_sites)[i]))
		dfr$mut_bin = i
			
		full_dfr = rbind(full_dfr, dfr)
	}
	full_dfr
}



#' Combines quadnucleotide counts and site information
#'
#' @param gr_seq GenomicRanges object of site/flanks chromosomes and positions
#' @param gr_maf GenomicRanges object with mcols containing quadnucleotide context or indel
#' @param trinuc Data frame of counts for each trinucleotide counts extended to each quadnucleotide
#' @param is_site Boolean indicating whether mutation is within site or flanking region
#'
#' @return Data frame containing mutation and trinucleotide counts for each quadnucleotide context + indel. Includes boolean indicator of is_site
.merge_sites_and_muts = function(gr_seq, gr_maf, trinuc, is_site) {
	muts_sites = .count_trinuc_muts(gr_seq, gr_maf)
	dfr = merge(trinuc, muts_sites, by.x = "quadnuc", by.y = "tag", all = TRUE)
	dfr = cbind(is_site = is_site, dfr)
	dfr[is.na(dfr$muts_count), "muts_count"] = 0
	dfr
}



#' Counts number of mutations per quadnucleotide context
#'
#' @param gr_sites GenomicRanges object of site/flank chromosomes and positions
#' @param gr_maf GenomicRanges object with mcols containing quadnucleotide context or indel
#'
#' @return Data frame containing mutation count and each quadnucleotide context + indel
.count_trinuc_muts = function(gr_sites, gr_maf) {

	# remove duplicate mutation calls
	gr_maf_in_sites = gr_maf[S4Vectors::subjectHits(GenomicRanges::findOverlaps(gr_sites, gr_maf))]
	gr_maf_in_sites = gr_maf_in_sites[!duplicated(GenomicRanges::mcols(gr_maf_in_sites)[,"ID"])]

	muts_sites = GenomicRanges::mcols(gr_maf_in_sites)[,"trinuc"]

	muts_sites = data.frame(muts_count = c(table(muts_sites)), stringsAsFactors=F)
	muts_sites$tag = rownames(muts_sites)
	muts_sites
}
reimandlab/RM2 documentation built on Aug. 13, 2022, 12:22 p.m.