# Copyright 2018 GRAIL, Inc. All rights reserved.
# Use of this source code is governed by the Apache 2.0
# license that can be found in the LICENSE file.
#' Read a data.table from a given path
#'
#' Checks whether the file name starts with "s3://" and uses the appropriate
#' function to read it. If the file does not exist, either stops with an
#' error or warns and returns NULL, depending on \code{stop_if_missing}.
#'
#' @param file file name to read
#' @param header logical whether to read the header
#' @param sep the separator
#' @param stop_if_missing logical whether to stop execution if file is missing
#' @param ... additional arguments passed to \code{data.table::fread}
#' @return data.table, or NULL if the file is missing and
#'   \code{stop_if_missing} is FALSE
#'
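#' @examples
#' # A minimal sketch: round-trip a small table through a temporary
#' # local file (the path is generated, not a package fixture)
#' tmp <- tempfile(fileext = ".tsv")
#' write_data_table(data.table::data.table(x = 1:3, y = c("a", "b", "c")), tmp)
#' read_data_table(tmp)
#'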
#' @export
read_data_table <- function(file, header = TRUE, sep = "\t",
                            stop_if_missing = FALSE, ...) {
  if (startsWith(file, "s3://")) {
    if (requireNamespace("grails3r", quietly = TRUE)) {
      if (grails3r::s3_file_exists(file)) {
        if (endsWith(file, ".gz")) {
          stop(paste("Reading zipped file from s3 not supported.", file))
        } else {
          return(grails3r::read_from_s3(file, fread, header = header,
                                        stringsAsFactors = FALSE, sep = sep,
                                        ...))
        }
      }
    } else {
      # Fall back to the aws.s3 package if grails3r is not installed
      creds <- aws.signature::locate_credentials()
      if (as.logical(do.call(aws.s3::head_object, c(file, creds)))) {
        if (endsWith(file, ".gz")) {
          stop(paste("Reading zipped file from s3 not supported.", file))
        } else {
          return(aws.s3::s3read_using(fread, object = file, header = header,
                                      stringsAsFactors = FALSE, sep = sep,
                                      ...))
        }
      }
    }
  } else {
    if (file.exists(file)) {
      if (endsWith(file, ".gz")) {
        return(fread(cmd = sprintf("zcat %s", file), header = header,
                     stringsAsFactors = FALSE, sep = sep, ...))
      } else {
        return(fread(file, header = header, stringsAsFactors = FALSE,
                     sep = sep, ...))
      }
    }
  }
  # Reaching this point means the file was not found (locally or on s3)
  if (stop_if_missing) {
    stop(paste("Input file not found:", file))
  } else {
    warning(paste("Input file not found:", file))
    return(NULL)
  }
}
#' Write a data.table to a given path
#'
#' Checks whether the file name starts with "s3://" and uses the appropriate
#' function to write it.
#'
#' @param out data to write
#' @param out_file output file to write to
#' @param header logical whether to write the header
#' @param sep the separator
#' @param ... additional arguments passed to \code{write.table}
#' @return None; called for its side effect of writing \code{out_file}
#'
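#' @examples
#' # A minimal sketch: write a small table to a temporary local path
#' tmp <- tempfile(fileext = ".tsv")
#' write_data_table(data.table::data.table(x = 1:3), tmp)
#'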
#' @export
write_data_table <- function(out, out_file, header = TRUE, sep = "\t", ...) {
  if (startsWith(out_file, "s3://")) {
    if (requireNamespace("grails3r", quietly = TRUE)) {
      grails3r::write_to_s3(out, out_file, write.table, sep = sep,
                            row.names = FALSE, col.names = header,
                            quote = FALSE, ...)
    } else {
      aws.s3::s3write_using(out, FUN = write.table, object = out_file,
                            sep = sep, col.names = header, row.names = FALSE,
                            quote = FALSE, ...)
    }
  } else {
    write.table(out, out_file, sep = sep, row.names = FALSE,
                col.names = header, quote = FALSE, ...)
  }
}
#' Read a counts tsv file and prep it
#'
#' @param file tsv file containing SNP info for the sample
#' @param file_rev tsv file containing SNP counts for reverse strand
#' @param baseline data.table with blacklist and noise model columns, or NA
#' @param default_het_mean default heterozygote allele frequency mean
#' @param default_het_sum default heterozygote allele frequency alpha + beta
#' from beta binomial alternative formulation
#' @return data.table containing counts and metrics per SNP
#'
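#' @examples
#' # Sketch of typical usage; the file name is hypothetical and the
#' # file is expected to contain per-SNP allele counts
#' \dontrun{
#' dat <- read_and_prep("sample_counts.tsv")
#' }
#'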
#' @export
read_and_prep <- function(file, file_rev = NA, baseline = NA,
                          default_het_mean = 0.43,
                          default_het_sum = 200) {
  dat <- read_data_table(file, stop_if_missing = TRUE)
  # If reverse strand counts are provided, merge them with the first
  # set of counts, which is assumed to come from the positive strand
  if (!is.na(file_rev)) {
    dat2 <- read_data_table(file_rev, stop_if_missing = TRUE)
    dat <- combine_counts(dat, dat2)
  }
  # Set chromosomes as factors
  dat <- factorize_chroms(dat)
  # Remove blacklisted SNPs if a baseline table is provided.
  # is.data.frame() is used for the check because the default is a
  # length-1 NA, while is.na() on a full table would return a matrix.
  required_columns <- c("blacklist", "rsid")
  has_baseline <- is.data.frame(baseline)
  if (has_baseline && all(required_columns %in% colnames(baseline))) {
    blacklist <- baseline %>%
      filter(blacklist == TRUE)
    dat <- dat %>%
      filter(!rsid %in% blacklist$rsid)
    dat <- as.data.table(dat)
  } else if (has_baseline) {
    warning(paste("One of the required columns in baseline file is missing.",
                  "Required columns:",
                  paste(required_columns, collapse = ", ")))
  }
  # Add mean and sum for heterozygote SNPs if provided
  optional_beta_binomial_columns <- c("alpha", "beta")
  if (has_baseline &&
      all(optional_beta_binomial_columns %in% colnames(baseline))) {
    dat <- dat %>%
      left_join(baseline %>% dplyr::select(rsid, alpha, beta), by = "rsid")
    dat <- dat %>%
      dplyr::mutate(het_mean = ifelse(is.na(alpha) | is.na(beta),
                                      default_het_mean,
                                      alpha / (alpha + beta)),
                    het_sum = ifelse(is.na(alpha) | is.na(beta),
                                     default_het_sum, alpha + beta))
  } else {
    dat$het_mean <- default_het_mean
    dat$het_sum <- default_het_sum
  }
  # Return if the data.table is empty
  if (nrow(dat) == 0)
    return(dat)
  # Find the minor allele
  dat <- dat %>%
    tidyr::separate(alleles, c("a1", "a2"), sep = "/") %>%
    dplyr::mutate(minor = ifelse(a1 == major, a2, a1)) %>%
    dplyr::select(-c("a1", "a2"))
  # Add depth, per-allele counts and ratios
  dat <- ratio_and_counts(dat)
  # Final sort
  dat <- dat %>%
    arrange(chrom, pos)
  return(dat)
}
#' Combine two count data tables
#'
#' @param dat data.table counts from first file
#' @param dat2 data.table counts from second file
#' @return data.table containing merged counts
#'
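#' @examples
#' # Sketch: merge forward- and reverse-strand count tables that were
#' # read separately (file names are hypothetical)
#' \dontrun{
#' fwd <- read_data_table("counts_fwd.tsv", stop_if_missing = TRUE)
#' rvs <- read_data_table("counts_rev.tsv", stop_if_missing = TRUE)
#' both <- combine_counts(fwd, rvs)
#' }
#'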
#' @export
combine_counts <- function(dat, dat2) {
  dat <- suppressWarnings(set_numeric_chrs(dat))
  dat2 <- suppressWarnings(set_numeric_chrs(dat2))
  data.table::setkey(dat, chrom_int, pos, ref, major, alleles, rsid, maf,
                     chrom)
  data.table::setkey(dat2, chrom_int, pos, ref, major, alleles, rsid, maf,
                     chrom)
  datm <- merge(dat, dat2, sort = TRUE, all = TRUE)
  # Sum per-base counts across the two tables, treating NA as zero
  datm$A <- as.integer(rowSums(datm[, c("A.x", "A.y")], na.rm = TRUE))
  datm$T <- as.integer(rowSums(datm[, c("T.x", "T.y")], na.rm = TRUE))
  datm$G <- as.integer(rowSums(datm[, c("G.x", "G.y")], na.rm = TRUE))
  datm$C <- as.integer(rowSums(datm[, c("C.x", "C.y")], na.rm = TRUE))
  datm$N <- as.integer(rowSums(datm[, c("N.x", "N.y")], na.rm = TRUE))
  # Keep a single context column if one was present in the inputs
  if (sum(colnames(datm) == "context.x") == 1) {
    datm$context <- ifelse(!is.na(datm$context.x),
                           datm$context.x, datm$context.y)
    datm <- datm[, -c("context.x", "context.y")]
  }
  return(datm[, -c("A.x", "A.y", "T.x", "T.y", "G.x", "G.y",
                   "C.x", "C.y", "N.x", "N.y", "chrom_int")])
}
#' Add in basic counts and ratios
#'
#' @param dat data.frame containing counts and metrics per SNP
#' @return data.table containing counts and metrics per SNP
#'
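#' @examples
#' # A toy two-SNP table; counts are illustrative only
#' dat <- data.table::data.table(A = c(10L, 0L), T = c(0L, 8L),
#'                               G = c(2L, 0L), C = c(0L, 0L),
#'                               major = c("A", "T"), minor = c("G", "C"))
#' ratio_and_counts(dat)
#'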
#' @export
ratio_and_counts <- function(dat) {
  # Recalculate depth and major / minor ratio.
  # The minor ratio is defined only with respect to the major and minor
  # alleles; the "other" count and ratio are used to filter noisy positions.
  dat <- dat %>%
    dplyr::mutate(depth = A + T + G + C,
                  major_count = dplyr::case_when(major == "A" ~ A,
                                                 major == "T" ~ T,
                                                 major == "G" ~ G,
                                                 major == "C" ~ C,
                                                 TRUE ~ 0L),
                  minor_count = dplyr::case_when(minor == "A" ~ A,
                                                 minor == "T" ~ T,
                                                 minor == "G" ~ G,
                                                 minor == "C" ~ C,
                                                 TRUE ~ 0L),
                  major_ratio = major_count / (major_count + minor_count),
                  minor_ratio = minor_count / (major_count + minor_count),
                  other_count = depth - major_count - minor_count,
                  other_ratio = other_count / depth)
  return(dat)
}
#' Add a Bayesian genotype estimate (bayes_gt) based on the per-SNP error
#' rate and the major and minor allele counts/frequencies.
#'
#' @param dat data.table containing counts and metrics per SNP
#' @return data.table containing counts and metrics per SNP
#'
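#' @examples
#' # A toy table with an assumed per-SNP error rate `er`; the first SNP
#' # looks homozygous reference, the second heterozygous
#' dat <- data.table::data.table(major_count = c(40L, 18L),
#'                               minor_count = c(1L, 20L),
#'                               minor_ratio = c(1 / 41, 20 / 38),
#'                               er = 0.01, gt = c("0/0", "0/1"))
#' bayesian_genotype(dat)[, c("gt", "bayes_gt")]
#'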
#' @export
bayesian_genotype <- function(dat) {
  dat <- dat %>%
    dplyr::rowwise() %>%
    # Binomial likelihood of the observed minor counts under each genotype,
    # weighted by a prior derived from the minor allele ratio
    dplyr::mutate(hom_ref_prob = dbinom(minor_count,
                                        size = major_count + minor_count,
                                        prob = er) *
                    (1 - minor_ratio) ^ 2) %>%
    dplyr::mutate(hom_alt_prob = dbinom(minor_count,
                                        size = major_count + minor_count,
                                        prob = 1 - er) * minor_ratio ^ 2) %>%
    dplyr::mutate(het_prob = dbinom(minor_count,
                                    size = major_count + minor_count,
                                    prob = .5) *
                    2 * (1 - minor_ratio) * minor_ratio) %>%
    dplyr::mutate(total_prob = het_prob + hom_ref_prob + hom_alt_prob) %>%
    # Normalize to posterior probabilities
    dplyr::mutate(het_prob = ifelse(total_prob == 0, 0,
                                    het_prob / total_prob),
                  hom_alt_prob = ifelse(total_prob == 0, 0,
                                        hom_alt_prob / total_prob),
                  hom_ref_prob = ifelse(total_prob == 0, 0,
                                        hom_ref_prob / total_prob)) %>%
    # Pick the maximum-posterior genotype; fall back to the hard call `gt`
    # when all posteriors underflow to zero
    dplyr::mutate(bayes_gt = ifelse(total_prob == 0, gt,
                                    c("0/0", "0/1", "1/1")[
                                      which.max(c(hom_ref_prob, het_prob,
                                                  hom_alt_prob))])) %>%
    dplyr::select(-total_prob) %>%
    dplyr::ungroup()
  return(data.table(dat))
}
#' Add more useful columns and annotations, and filter sites.
#'
#' @param dat data.table containing counts and metrics per SNP
#' @param het_limit limit fraction to call a heterozygote site.
#'   SNPs with variant frequency above this value and below 1 minus this
#'   value will be called heterozygotes; the rest of the SNPs will be
#'   called homozygotes.
#' @param min_other_ratio if the ratio of the highest-depth non-SNP allele
#'   is as high as this number, the SNP will be filtered. These SNPs either
#'   have an unusually high error rate (or mutation), or have a hidden
#'   (multi-allelic) SNP. In either case, the position should be excluded
#'   from contamination analysis.
#' @param min_depth if a SNP has depth less than this value, it will be
#' filtered. This metric is mainly for WGS where overall coverage is low.
#' @param max_depth if a SNP has depth more than this value, it will be
#' filtered. This metric is mainly for RNA where some genes have extreme
#' coverage.
#' @param out_frac remove SNPs that are outliers to depth distribution.
#' This filter is applied post min_depth and max_depth filters to further
#' remove any outliers that also tend to generate unexpected noise.
#' @return data.frame containing counts and metrics per SNP
#'
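#' @examples
#' # Sketch of the usual pipeline step (file name hypothetical)
#' \dontrun{
#' dat <- read_and_prep("sample_counts.tsv")
#' dat <- annotate_and_filter(dat, min_depth = 10)
#' }
#'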
#' @importFrom stats quantile
#' @export
annotate_and_filter <- function(dat, het_limit = 0.25, min_other_ratio = 0.15,
                                min_depth = 5, max_depth = 10000,
                                out_frac = 0) {
  # Return if the data.table is empty
  if (nrow(dat) == 0)
    return(dat)
  # Remove non-ATGC alleles.
  # Calculate genotypes, noise levels (vfn) and noise reads (vr)
  # and contamination probability (cp).
  dat <- dat %>%
    dplyr::filter(major %in% c(get_bases(), "N"),
                  minor %in% c(get_bases(), "N"))
  dat <- dat %>%
    dplyr::mutate(
      gt = dplyr::case_when(
        depth == 0 ~ "NA",
        minor_ratio < het_limit ~ "0/0",
        major_ratio < het_limit ~ "1/1",
        TRUE ~ "0/1"),
      vfn = dplyr::case_when(
        gt == "1/1" ~ major_ratio,
        gt == "0/0" ~ minor_ratio,
        gt == "0/1" ~ pmin(major_ratio, minor_ratio),
        TRUE ~ NaN),
      vr = as.integer(floor(vfn * depth)),
      cp = dplyr::case_when(
        gt == "1/1" ~ 1 - maf ^ 2,
        gt == "0/0" ~ 1 - (1 - maf) ^ 2,
        gt == "0/1" ~ 1 - 2 * maf * (1 - maf),
        TRUE ~ NaN))
  # Apply filters
  dat <- dat %>%
    dplyr::filter(other_ratio < min_other_ratio,
                  !is.na(depth) & depth >= min_depth & depth <= max_depth)
  # Add chunks
  dat <- add_chunks(dat)
  # Update context for SNPs with alternative homozygote genotypes
  dat <- update_context(dat)
  # Final sort
  dat <- dat %>%
    arrange(chrom, pos)
  return(dat)
}
#' Get the stats for a given chromosome from a long-format biometrics file,
#' including:
#' - Mapq60 read count
#' - Normalized mapq60 read count (normalized by total reads for this sample)
#' - Fraction of bases covered by at least 1 read
#'
#' @param biometrics_file input tsv file name
#' @param chr_name name of the chromosome to be counted
#'
#' @return list of chromosome metrics
#'
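#' @examples
#' # Sketch; the biometrics file name and chromosome are hypothetical
#' \dontrun{
#' y_stats <- chr_stats("sample.biometrics.tsv", "chrY")
#' y_stats$normalized_count
#' }
#'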
#' @export
chr_stats <- function(biometrics_file, chr_name) {
  # Read file
  if (file.exists(biometrics_file) && (file.info(biometrics_file)$size > 0)) {
    k1 <- read_data_table(biometrics_file, sep = "\t")
  } else {
    return(list(count = NA, normalized_count = NA, fraction_covered = NA))
  }
  # Calculate total reads
  mapq60_reads <- k1[(key == "mapq60_count") &
                       (metric_key == "fragment_counts")]$value
  # Cast metrics of interest to wide tables
  m11 <- k1[metric_key == "fraction_chr_covered", ]
  d11 <- dcast(m11, group_id ~ key, value.var = "value")
  m12 <- k1[metric_key == "mapq60_fragment_counts_per_chr"]
  d12 <- dcast(m12, group_id ~ key, value.var = "value")
  # Normalize counts
  d12$normalized_count <- as.numeric(d12$count) / as.numeric(mapq60_reads)
  # Return a list with NA elements if the specified chr does not exist
  if (nrow(d12[chr == chr_name]) == 0) {
    return(list(count = NA, normalized_count = NA, fraction_covered = NA))
  }
  # Return a list of values
  return(list(count = as.numeric(d12[chr == chr_name, count]),
              normalized_count = d12[chr == chr_name, normalized_count],
              fraction_covered = as.numeric(d11[chr == chr_name,
                                                fraction_covered])))
}
#' Read a vcf file as a data table
#'
#' @param vcf_file name of the input file
#' @param n max number of header lines to skip
#'
#' @return data.table containing vcf data
#'
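#' @examples
#' # Sketch; assumes a dbSNP-style vcf with CAF entries in INFO
#' # (file name hypothetical)
#' \dontrun{
#' vcf_dt <- read_vcf_dt("snps.vcf")
#' summary(vcf_dt$maf)
#' }
#'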
#' @export
read_vcf_dt <- function(vcf_file, n = 100000) {
  if (startsWith(vcf_file, "s3://")) {
    if (requireNamespace("grails3r", quietly = TRUE))
      lines <- grails3r::read_from_s3(vcf_file, readLines, n = n)
    else
      lines <- aws.s3::s3read_using(readLines, object = vcf_file, n = n)
  } else {
    lines <- readLines(vcf_file, n)
  }
  # Count the "##" meta-information lines so fread can skip past them
  # to the "#CHROM" column header
  skip_lines <- 0
  for (i in seq_along(lines)) {
    if (startsWith(lines[i], "##")) {
      next
    } else {
      skip_lines <- i - 1
      break
    }
  }
  vcf_dt <- read_data_table(vcf_file, header = TRUE, sep = "\t",
                            stop_if_missing = FALSE, skip = skip_lines)
  # maf is taken as the second CAF value; the last two characters of the
  # parsed field (e.g. "];") are dropped
  caf <- parse_field(vcf_dt$INFO, "CAF")
  vcf_dt$maf <- as.numeric(substring(caf, regexpr(",", caf) + 1,
                                     nchar(caf) - 2))
  return(vcf_dt)
}
#' Retrieve a specified field from a vcf INFO string
#'
#' @param info vcf INFO string
#' @param field name of the field to extract from INFO
#' @return the matching field substring, including the trailing ";"
#'
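#' @examples
#' # Extract the CAF field (with its trailing separator) from an INFO
#' # string; the string below is illustrative
#' parse_field("RS=123;CAF=[0.4985,0.5015];COMMON=1", "CAF")
#'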
#' @export
parse_field <- function(info, field) {
  start_loc <- regexpr(paste0(field, "="), info)
  end_loc <- start_loc +
    regexpr(";", substring(info, start_loc, nchar(info))) - 1
  caf_string <- substring(info, start_loc, end_loc)
  return(caf_string)
}
#' Partition each chromosome to bins and tag each SNP with its bin
#'
#' @param dat data.table containing SNPs
#' @param max_portions number of portions to split each chromosome into
#' @return data.table containing SNPs with tagged chunks
#'
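#' @examples
#' # Tag three SNPs on one chromosome into 5 bins by position
#' dat <- data.table::data.table(chrom = "1", pos = c(10L, 50L, 99L))
#' add_chunks(dat)$chunk  # 1, 3, 5
#'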
#' @export
add_chunks <- function(dat, max_portions = 5) {
  if (nrow(dat) == 0) return(data.table())
  # Calculate chromosome boundaries and chunk size
  max_chrom_pos <- dat %>%
    dplyr::group_by(chrom) %>%
    dplyr::summarize(max_snp = max(pos)) %>%
    dplyr::mutate(chunk_size = max_snp / max_portions) %>%
    dplyr::select(-c("max_snp"))
  # Add sizes to chromosomes
  dat <- dat %>%
    dplyr::left_join(max_chrom_pos, by = "chrom")
  dat$chunk <- 0
  # Calculate which chunk on its chromosome each SNP belongs to
  dat <- dat %>%
    dplyr::mutate(chunk = ceiling(pos / chunk_size))
  return(dat)
}
#' Update 3-base context for SNPs with alternative homozygote genotypes
#'
#' @param dat data.table containing SNPs
#' @return data.table with updated context and added context_snp column
#'
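#' @examples
#' # Toy contexts: the middle base flips to the minor allele for the
#' # 1/1 genotype, and context_snp carries the opposite allele
#' dat <- data.table::data.table(context = c("ACT", "ACT"),
#'                               gt = c("0/0", "1/1"),
#'                               major = "C", minor = "G")
#' update_context(dat)[, c("context", "context_snp")]
#'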
#' @export
update_context <- function(dat) {
  # Return as is if context is not provided
  if (is.null(dat$context))
    return(dat)
  # Take the middle base from the minor allele if the genotype is 1/1
  dat$context <- paste(substr(dat$context, 1, 1),
                       ifelse(dat$gt == "1/1", dat$minor, dat$major),
                       substr(dat$context, 3, 3), sep = "")
  # context_snp takes the opposite allele: the major allele if the
  # genotype is 1/1, the minor allele otherwise
  dat$context_snp <- paste(substr(dat$context, 1, 1),
                           ifelse(dat$gt == "1/1", dat$major, dat$minor),
                           substr(dat$context, 3, 3), sep = "")
  # Keep only positions without an N base in the context
  dat <- dat[which(!grepl(pattern = "N", dat$context)), ]
  return(dat)
}
#' Read simulation results from provided path
#'
#' Reads conta simulation results from a local or S3 path. Checks
#' that the required columns are present and properly formatted,
#' and returns the simulation data as a data frame.
#'
#' @param file file name to read
#' @return data.table
#'
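#' @examples
#' # Sketch; the file is hypothetical and its `sample` column must be
#' # of the form {sample_id}_{conta_level}, e.g. "sampleA_0.005"
#' \dontrun{
#' sims <- read_sim_results("sim_results.tsv")
#' }
#'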
#' @export
read_sim_results <- function(file) {
  raw_data <- read_data_table(file, stop_if_missing = TRUE)
  req_cols <- c("sample", "avg_log_lr")
  missing_cols <- setdiff(req_cols, colnames(raw_data))
  if (length(missing_cols) != 0) {
    stop(paste0("ERROR: Simulation results file is missing required ",
                "columns.\nfile: ", file, "\nmissing cols: ",
                paste(missing_cols, collapse = ", ")))
  }
  # Sample names are expected to be of the form {sample_id}_{conta_level};
  # split on the final underscore
  sample_splits <- strsplit(raw_data$sample, "_")
  sample_names <- sapply(sample_splits,
                         function(x) paste0(x[1:(length(x) - 1)],
                                            collapse = "_"))
  conta_level <- sapply(sample_splits,
                        function(x) suppressWarnings(
                          as.numeric(x[[length(x)]])))
  if (any(is.na(conta_level))) {
    stop(paste0("ERROR: Expect simulation `sample` column to be of the form ",
                "{sample_id}_{conta_level}"))
  }
  raw_data$sample <- sample_names
  raw_data$conta_level <- conta_level
  return(raw_data)
}
#' Write genotype file
#'
#' Genotypes and contamination metrics for each SNP are useful for different
#' applications such as contamination source and swap detection.
#'
#' @param dat data.table containing counts and metrics per SNP
#' @param max_result conta result with maximum likelihood
#' @param blackswan blackswan parameter for likelihood calculation
#' @param outlier_frac outlier fraction used to flag extreme likelihood ratios
#' @param out_file_gt name and location of the output file
#'
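#' @examples
#' # Sketch; `dat` as produced by annotate_and_filter() and `max_result`
#' # as returned by the conta likelihood maximization; parameter values
#' # are illustrative
#' \dontrun{
#' write_gt_file(dat, max_result, blackswan = 0.05, outlier_frac = 0.01,
#'               out_file_gt = "sample.gt.tsv")
#' }
#'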
#' @export
write_gt_file <- function(dat, max_result, blackswan, outlier_frac,
                          out_file_gt) {
  # Likelihood ratio per SNP (homozygous sites only)
  dat$lr <- ifelse(dat$gt == "0/1", NA,
                   log_lr(dat$cp, dat$depth,
                          get_exp_cf(as.numeric(max_result$cf)),
                          dat$er, dat$vr, blackswan))
  # Mark SNPs whose likelihood ratio falls outside the central quantile range
  dat$outlier <- ifelse(dat$lr < quantile(dat$lr, 1 - outlier_frac,
                                          na.rm = TRUE) &
                          dat$lr > quantile(dat$lr, outlier_frac,
                                            na.rm = TRUE), 0, 1)
  # Write genotypes and possible contaminant reads
  if ("bayes_gt" %in% colnames(dat)) {
    gt_sum <- dat[, .(rsid, chr = chrom, pos, et, maf, cp,
                      dp = depth, er, gt, bayes_gt, vr, minor_ratio,
                      lr, outlier)]
  } else {
    gt_sum <- dat[, .(rsid, chr = chrom, pos, et, maf, cp,
                      dp = depth, er, gt, vr, minor_ratio, lr, outlier)]
  }
  write.table(format(gt_sum, digits = 6, trim = TRUE), file = out_file_gt,
              sep = "\t", col.names = TRUE, row.names = FALSE, quote = FALSE)
}
#' Return a character vector at the provided precision
#'
#' This function takes a numeric data.frame column (or numeric vector),
#' rounds it to the provided precision, and returns the values as a
#' character vector.
#'
#' @param x numeric vector or numeric data.frame column
#' @param precision integer number of decimal places to round to
#'
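#' @examples
#' # Round to two decimals and return as character
#' specify_precision(c(0.1234, 5.6789), 2)  # "0.12" "5.68"
#'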
#' @export
specify_precision <- function(x, precision) {
  format(round(as.numeric(x), precision), nsmall = precision,
         trim = TRUE, scientific = FALSE)
}
#' Read in and parse 1000G vcf
#'
#' Reads in 1000G vcf file. Parses 'INFO' column into their own columns.
#' Filters out MNVs (multiple nucleotide variants), Indels, SNPs with no RSID,
#' and INFO columns of unexpected order.
#'
#' @param vcf 1000 genomes vcf
#' @param remove_sex_chr Defaults to FALSE, meaning sex chromosomes will not be
#' filtered out. If 'TRUE' sex chromosomes will be removed
#' @return Data frame of parsed vcf
#' @importFrom dplyr filter %>%
#'
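#' @examples
#' # Sketch; assumes a 1000 Genomes vcf on a local or s3 path
#' # (file name hypothetical)
#' \dontrun{
#' parsed <- parse_1000g_vcf("1000g_phase3.vcf", remove_sex_chr = TRUE)
#' }
#'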
#' @export
parse_1000g_vcf <- function(vcf, remove_sex_chr = FALSE) {
  single_bases <- conta::get_bases()
  # Read in 1000g vcf, skipping the meta-information header lines
  parsed_vcf <- conta::read_data_table(vcf, skip = "CHROM")
  # Parse INFO field, filter, and rename CHROM column
  parsed_vcf <- tidyr::separate(data = parsed_vcf, col = INFO,
                                into = c("AC", "AF", "AN", "NS", "DP",
                                         "EAS_AF", "AMR_AF", "AFR_AF",
                                         "EUR_AF", "SAS_AF"),
                                sep = ";",
                                extra = "drop") %>%
    dplyr::rename("chr" = 1,
                  "rsid" = 3) %>%
    dplyr::filter(REF %in% single_bases,
                  ALT %in% single_bases,
                  !rsid == ".")
  if (remove_sex_chr) {
    parsed_vcf <- parsed_vcf %>%
      dplyr::filter(!chr %in% c("X", "Y"))
  }
  # Remove all non-numeric characters from the new columns and convert
  # them to numeric
  parsed_vcf[, 8:17] <- lapply(parsed_vcf[, 8:17],
                               function(x) as.numeric(gsub("[^0-9.]", "", x)))
  # Drop SNPs with no allele counts
  parsed_vcf <- parsed_vcf %>% tidyr::drop_na(AC)
  return(parsed_vcf)
}