#' @title tryNA
#' @description silently get what you can out of an expression
#' @rdname internals
#'
#' @param expr the expression to try to evaluate
#'
#' @return If the expression evaluates without error, the evaluated expression.
#' If there is an error in evaluating the expression, NA.
#' Warnings are suppresssed.
#' @export
#'
tryNA <- function(expr) {
suppressWarnings(tryCatch(expr = expr,
error = function(e) NA,
finally = NA))
}
#' @title tryNULL
#' @description silently get what you can out of an expression
#' @rdname internals
#'
#'
#' @return If the expression evaluates without error, the evaluated expression.
#' If there is an error in evaluating the expression, NULL.
#' Warnings are suppresssed.
#' @export
#'
tryNULL <- function(expr) {
suppressWarnings(tryCatch(expr = expr,
error = function(e) NULL,
finally = NULL))
}
#' @title pull_GT
#' @description Pull the GT (maximum likelihood genotype) from a vcf file snp row.
#' @rdname internals
#'
#' @param snp_row the snp row from the VCF file (first cell is FORMAT)
#' @param min_gt_count minimum number of observations required to keep a genotype group in the study.
#' Five is the default, but I advocate a slightly higher value for research (10 or 20 maybe).
#'
#' @return a data.frame where the first column is the ID and the second column is the genotype
#' @export
#'
pull_GT <- function(snp_row, min_gt_count) {
if (names(snp_row)[1] != 'FORMAT') {
stop('First column in genotype section of VCF file must be "FORMAT".')
}
snp_formats <- stringr::str_split(string = snp_row[1], pattern = ':')[[1]]
gt_idx <- which(x = 'GT' == snp_formats)
if (length(gt_idx) == 0) { stop('GT not found in VCF')}
snp_row <- snp_row[-1]
num_indiv <- length(snp_row)
gts_raw <- stringr::str_split(string = snp_row, pattern = ':', simplify = TRUE)[,gt_idx]
# if hets in both 'directions' are present, collapse them to one
# todo: may need to deal with '0\1' or '0/1' at some point
if (all('0|1' %in% gts_raw, '1|0' %in% gts_raw)) {
gts <- replace(x = gts_raw, list = gts_raw == '1|0', values = '0|1')
} else {
gts <- gts_raw
}
# if there's any too-rare GT, drop it
if (any(table(gts) < min_gt_count)) {
bad_gts <- names(table(gts))[table(gts) < min_gt_count]
gts <- replace(x = gts, list = which(gts %in% bad_gts), values = NA)
}
# if all the same GT, just return all 0's...
# todo: check that this is kosher
if (length(unique(gts)) == 1) {
return(data.frame(ID = names(snp_row),
GT = rep(0, num_indiv),
stringsAsFactors = FALSE))
} else {
return(data.frame(ID = names(snp_row),
GT = factor(gts),
stringsAsFactors = FALSE))
}
}
#' @title pull_DS
#' @description Pull the DS (alternate allele dosage) from a vcf file snp row.
#' @rdname internals
#'
#' @return a data.frame where the first column is the ID and the second column is the genotype
#' @export
#'
pull_DS <- function(snp_row) {
if (names(snp_row)[1] != 'FORMAT') {
stop('First column in genotype section of VCF file must be "FORMAT".')
}
snp_formats <- stringr::str_split(string = snp_row[1], pattern = ':')[[1]]
ds_idx <- which(x = 'DS' == snp_formats)
if (length(ds_idx) == 0) { stop('DS not found in VCF')}
snp_row <- snp_row[-1]
num_indiv <- length(snp_row)
dosage_raw <- stringr::str_split(string = snp_row, pattern = ':', simplify = TRUE)[,ds_idx]
dosage_num <- as.numeric(dosage_raw)
return(data.frame(ID = names(snp_row),
DS = dosage_num,
stringsAsFactors = FALSE))
}
#' @title pull_GP
#' @description Pull the GP (genotype probabilities) from a vcf file snp row.
#' @rdname internals
#'
#' @return a data.frame where the first column is the ID and the second and third columns are genotype probabilities
#' @export
#'
pull_GP <- function(snp_row) {
if (names(snp_row)[1] != 'FORMAT') {
stop('First column in genotype section of VCF file must be "FORMAT".')
}
snp_formats <- stringr::str_split(string = snp_row[1], pattern = ':')[[1]]
gp_idx <- which(x = 'GP' == snp_formats)
if (length(gp_idx) == 0) { stop('GP not found in VCF')}
snp_row <- snp_row[-1]
num_indiv <- length(snp_row)
genoprob_string_vec <- stringr::str_split(string = snp_row, pattern = ':', simplify = TRUE)[,gp_idx]
genoprobs_raw <- stringr::str_split(string = genoprob_string_vec, pattern = ',', simplify = TRUE)
genoprobs_num <- apply(X = genoprobs_raw, MARGIN = 2, FUN = as.numeric)
if (any(abs(apply(X = genoprobs_num, MARGIN = 1, FUN = sum) - 1) > 0.01)) {
stop('Genoprobs dont add to 1.')
}
gp_add <- genoprobs_num[,2] + 2*genoprobs_num[,3]
gp_dom <- genoprobs_num[,1] + genoprobs_num[,3]
return(data.frame(ID = names(snp_row),
GP_add = gp_add,
GP_dom = gp_dom,
stringsAsFactors = FALSE))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.