vcfR: Manipulate and Visualize VCF Data

Documented in AD_frequency freq_peak is_het masplit

# Generated by using Rcpp::compileAttributes() -> do not edit by hand
# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393

.NM2winNM <- function(x, pos, maxbp, winsize = 100L, depr = 1L) {
    .Call(`_vcfR_NM2winNM`, x, pos, maxbp, winsize, depr)
}

.windowize_NM <- function(x, pos, starts, ends, summary = "mean", depr = 1L) {
    .Call(`_vcfR_windowize_NM`, x, pos, starts, ends, summary, depr)
}

#' @title AD_frequency
#' @name AD_frequency
#' @rdname AD_frequency
#'
#' @description
#' Create allele frequencies from matrices of allelic depths (AD)
#'
#' @param ad a matrix of allele depths (e.g., "7,2")
#' @param allele which (1-based) allele to report frequency for
#' @param sum_type type of sum to calculate, see details
#' @param delim character that delimits values
#' @param decreasing should the values be sorted decreasing (1) or increasing (0)?
#'
#' @details
#' Files containing VCF data frequently include data on allelic depth (e.g., AD).
#' This is the number of times each allele has been sequenced.
#' Our naive assumption for diploids is that these alleles should be observed at a frequency of 1 or zero for homozygous positions and near 0.5 for heterozygous positions.
#' Deviations from this expectation may indicate allelic imbalance or ploidy differences.
#' This function is intended to facilitate the exploration of allele frequencies for all positions in a sample.
#'
#' The alleles are sorted by their frequency within the function.
#' The user can then specify is the would like to calculate the frequency of the most frequent allele (allele = 1), the second most frequent allele (allele = 2) and so one.
#' If an allele is requested that does not exist it should result in NA for that position and sample.
#'
#' There are two methods to calculate a sum for the denominator of the frequency.
#' When sum_type = 0 the alleles are sorted decendingly and the first two allele counts are used for the sum.
#' This may be useful when a state of diploidy may be known to be appropriate and other alleles may be interpreted as erroneous.
#' When sum_type = 1 a sum is taken over all the observed alleles for a variant.
#'
#' @return A numeric matrix of frequencies
#' 
#' @examples
#' set.seed(999)
#' x1 <- round(rnorm(n=9, mean=10, sd=2))
#' x2 <- round(rnorm(n=9, mean=20, sd=2))
#' ad <- matrix(paste(x1, x2, sep=","), nrow=3, ncol=3)
#' colnames(ad) <- paste('Sample', 1:3, sep="_")
#' rownames(ad) <- paste('Variant', 1:3, sep="_")
#' ad[1,1] <- "9,23,12"
#' AD_frequency(ad=ad)
#' 
#' 
#' @export
AD_frequency <- function(ad, delim = ",", allele = 1L, sum_type = 0L, decreasing = 1L) {
    .Call(`_vcfR_AD_frequency`, ad, delim, allele, sum_type, decreasing)
}

.write_fasta <- function(seq, seqname, filename, rowlength = 80L, verbose = 1L, depr = 1L) {
    invisible(.Call(`_vcfR_write_fasta`, seq, seqname, filename, rowlength, verbose, depr))
}

.elementNumber <- function(x, element = "GT") {
    .Call(`_vcfR_elementNumber`, x, element)
}

.extract_GT_to_CM <- function(fix, gt, element = "DP", alleles = 0L, extract = 1L, convertNA = 1L) {
    .Call(`_vcfR_extract_GT_to_CM`, fix, gt, element, alleles, extract, convertNA)
}

.CM_to_NM <- function(x) {
    .Call(`_vcfR_CM_to_NM`, x)
}

.extract_haps <- function(ref, alt, gt, unphased_as_NA, verbose) {
    .Call(`_vcfR_extract_haps`, ref, alt, gt, unphased_as_NA, verbose)
}

.grepa <- function() {
    invisible(.Call(`_vcfR_grepa`))
}

.shankaR <- function() {
    invisible(.Call(`_vcfR_shankaR`))
}

#' 
#' @rdname freq_peak
#' 
#' @title freq_peak
#' @description Find density peaks in frequency data.
#' 
#' @param myMat a matrix of frequencies [0-1].
#' @param pos a numeric vector describing the position of variants in myMat.
#' @param winsize sliding window size.
#' @param bin_width Width of bins to summarize ferequencies in (0-1].
#' @param lhs logical specifying whether the search for the bin of greatest density should favor values from the left hand side.
#' 
#' @details
#' Noisy data, such as genomic data, lack a clear consensus.
#' Summaries may be made in an attempt to 'clean it up.'
#' Common summaries, such as the mean, rely on an assumption of normalicy.
#' An assumption that frequently can be violated.
#' This leaves a conundrum as to how to effectively summarize these data.
#' 
#' 
#' Here we implement an attempt to summarize noisy data through binning the data and selecting the bin containing the greatest density of data.
#' The data are first divided into parameter sized windows.
#' Next the data are categorized by parameterizable bin widths.
#' Finally, the bin with the greatest density, the greatest count of data, is used as a summary.
#' Because this method is based on binning the data it does not rely on a distributional assumption.
#' 
#' 
#' The parameter \code{lhs} specifyies whether the search for the bin of greatest density should be performed from the left hand side.
#' The default value of TRUE starts at the left hand side, or zero, and selects a new bin as having the greatest density only if a new bin has a greater density.
#' If the new bin has an equal density then no update is made.
#' This causees the analysis to select lower frequencies.
#' When this parameter is set to FALSE ties result in an update of the bin of greatest density.
#' This causes the analysis to select higher frequencies.
#' It is recommended that when testing the most abundant allele (typically [0.5-1]) to use the default of TRUE so that a low value is preferred.
#' Similarly, when testing the less abundant alleles it is recommended to set this value at FALSE to preferentially select high values.
#' 
#' 
#' @return 
#' A freq_peak object (a list) containing:
#' \itemize{
#'   \item The window size
#'   \item The binwidth used for peak binning
#'   \item a matrix containing window coordinates
#'   \item a matrix containing peak locations
#'   \item a matrix containing the counts of variants for each sample in each window
#' }
#' 
#' The window matrix contains start and end coordinates for each window, the rows of the original matrix that demarcate each window and the position of the variants that begin and end each window.
#' 
#' The matrix of peak locations contains the midpoint for the bin of greatest density for each sample and each window.
#' Alternatively, if `count = TRUE` the number of non-missing values in each window is reported.
#' The number of non-mising values in each window may be used to censor windows containing low quantities of data.
#' 
#' @seealso
#' peak_to_ploid,
#' freq_peak_plot
#' 
#' @examples
#' data(vcfR_example)
#' gt <- extract.gt(vcf)
#' hets <- is_het(gt)
#' # Censor non-heterozygous positions.
#' is.na(vcf@gt[,-1][!hets]) <- TRUE
#' # Extract allele depths.
#' ad <- extract.gt(vcf, element = "AD")
#' ad1 <- masplit(ad, record = 1)
#' ad2 <- masplit(ad, record = 2)
#' freq1 <- ad1/(ad1+ad2)
#' freq2 <- ad2/(ad1+ad2)
#' myPeaks1 <- freq_peak(freq1, getPOS(vcf))
#' is.na(myPeaks1$peaks[myPeaks1$counts < 20]) <- TRUE
#' myPeaks2 <- freq_peak(freq2, getPOS(vcf), lhs = FALSE)
#' is.na(myPeaks2$peaks[myPeaks2$counts < 20]) <- TRUE
#' myPeaks1
#' 
#' # Visualize
#' mySample <- "P17777us22"
#' myWin <- 2
#' hist(freq1[myPeaks1$wins[myWin,'START_row']:myPeaks1$wins[myWin,'END_row'], mySample], 
#'      breaks=seq(0,1,by=0.02), col="#A6CEE3", main="", xlab="", xaxt="n")
#' hist(freq2[myPeaks2$wins[myWin,'START_row']:myPeaks2$wins[myWin,'END_row'], mySample], 
#'      breaks=seq(0,1,by=0.02), col="#1F78B4", main="", xlab="", xaxt="n", add = TRUE)
#' axis(side=1, at=c(0,0.25,0.333,0.5,0.666,0.75,1), 
#'      labels=c(0,'1/4','1/3','1/2','2/3','3/4',1), las=3)
#' abline(v=myPeaks1$peaks[myWin,mySample], col=2, lwd=2)
#' abline(v=myPeaks2$peaks[myWin,mySample], col=2, lwd=2)
#' 
#' # Visualize #2
#' mySample <- "P17777us22"
#' plot(getPOS(vcf), freq1[,mySample], ylim=c(0,1), type="n", yaxt='n', 
#'      main = mySample, xlab = "POS", ylab = "Allele balance")
#' axis(side=2, at=c(0,0.25,0.333,0.5,0.666,0.75,1), 
#'      labels=c(0,'1/4','1/3','1/2','2/3','3/4',1), las=1)
#' abline(h=c(0.25,0.333,0.5,0.666,0.75), col=8)
#' points(getPOS(vcf), freq1[,mySample], pch = 20, col= "#A6CEE3")
#' points(getPOS(vcf), freq2[,mySample], pch = 20, col= "#1F78B4")
#' segments(x0=myPeaks1$wins[,'START_pos'], y0=myPeaks1$peaks[,mySample],
#'          x1=myPeaks1$wins[,'END_pos'], lwd=3)
#' segments(x0=myPeaks1$wins[,'START_pos'], y0=myPeaks2$peaks[,mySample],
#'          x1=myPeaks1$wins[,'END_pos'], lwd=3)
#' 
#' 
#' 
#' @export
freq_peak <- function(myMat, pos, winsize = 10000L, bin_width = 0.02, lhs = TRUE) {
    .Call(`_vcfR_freq_peak`, myMat, pos, winsize, bin_width, lhs)
}

.gt_to_popsum <- function(var_info, gt) {
    .Call(`_vcfR_gt_to_popsum`, var_info, gt)
}

#' @rdname is_het
#' @name is_het
#' 
#' 
#' 
#' @export
is_het <- function(x, na_is_false = TRUE) {
    .Call(`_vcfR_is_het`, x, na_is_false)
}

#' 
#' @rdname masplit
#' 
#' @title masplit
#' @description Split a matrix of delimited strings.
#' 
#' @param myMat a matrix of delimited strings (e.g., "7,2").
#' @param delim character that delimits values.
#' @param count return the count of delimited records.
#' @param record which (1-based) record to return.
#' @param sort should the records be sorted prior to selecting the element (0,1)?
#' @param decreasing should the values be sorted decreasing (1) or increasing (0)?
#' 
#' 
#' @details 
#' Split a matrix of delimited strings that represent numerics into numerics.
#' The parameter \strong{count} returns a matrix of integers indicating how many delimited records exist in each element.
#' This is intended to help if you do not know how many records are in each element particularly if there is a mixture of numbers of records.
#' The parameter \strong{record} indicates which record to return (first, second, third, ...).
#' The parameter \strong{sort} indicates whether the records in each element should be sorted (1) or not (0) prior to selection.
#' When sorting has been selected \strong{decreasing} indicates if the sorting should be performed in a decreasing (1) or increasing (0) manner prior to selection.
#' 
#' 
#' 
#' 
#' @return A numeric matrix
#' 
#' 
#' @examples
#' set.seed(999)
#' x1 <- round(rnorm(n=9, mean=10, sd=2))
#' x2 <- round(rnorm(n=9, mean=20, sd=2))
#' ad <- matrix(paste(x1, x2, sep=","), nrow=3, ncol=3)
#' colnames(ad) <- paste('Sample', 1:3, sep="_")
#' rownames(ad) <- paste('Variant', 1:3, sep="_")
#' ad[1,1] <- "9,23,12"
#' is.na(ad[3,1]) <- TRUE
#' 
#' ad
#' masplit(ad, count = 1)
#' masplit(ad, sort = 0)
#' masplit(ad, sort = 0, record = 2)
#' masplit(ad, sort = 0, record = 3)
#' masplit(ad, sort = 1, decreasing = 0)
#' 
#' 
#' @export
masplit <- function(myMat, delim = ",", count = 0L, record = 1L, sort = 1L, decreasing = 1L) {
    .Call(`_vcfR_masplit`, myMat, delim, count, record, sort, decreasing)
}

pair_sort <- function() {
    .Call(`_vcfR_pair_sort`)
}

.rank_variants <- function(variants, ends, score) {
    .Call(`_vcfR_rank_variants`, variants, ends, score)
}

.vcf_stats_gz <- function(x, nrows = -1L, skip = 0L, verbose = 1L) {
    .Call(`_vcfR_vcf_stats_gz`, x, nrows, skip, verbose)
}

.read_meta_gz <- function(x, stats, verbose) {
    .Call(`_vcfR_read_meta_gz`, x, stats, verbose)
}

.read_body_gz <- function(x, stats, nrows = -1L, skip = 0L, cols = 0L, convertNA = 1L, verbose = 1L) {
    .Call(`_vcfR_read_body_gz`, x, stats, nrows, skip, cols, convertNA, verbose)
}

.seq_to_rects <- function(seq, targets) {
    .Call(`_vcfR_seq_to_rects`, seq, targets)
}

.window_init <- function(window_size, max_bp) {
    .Call(`_vcfR_window_init`, window_size, max_bp)
}

.windowize_fasta <- function(wins, seq) {
    .Call(`_vcfR_windowize_fasta`, wins, seq)
}

.windowize_variants <- function(windows, variants) {
    .Call(`_vcfR_windowize_variants`, windows, variants)
}

.windowize_annotations <- function(wins, ann_starts, ann_ends, chrom_length) {
    .Call(`_vcfR_windowize_annotations`, wins, ann_starts, ann_ends, chrom_length)
}

.write_vcf_body <- function(fix, gt, filename = "myFile.vcf.gz", mask = 0L) {
    invisible(.Call(`_vcfR_write_vcf_body`, fix, gt, filename, mask))
}