# Copyright 2018 GRAIL, Inc. All rights reserved.
# Use of this source code is governed by the Apache 2.0
# license that can be found in the LICENSE file.
#' Return CCGA1 ethnicity probabilities
#' @return Named list of CCGA1 ethnicity prior probabilities
#'
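#' @examples
#' # Illustrative usage only: the priors are returned as a fixed named list
#' priors <- get_ethnicity_priors()
#' priors$prob_white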
#' @export
get_ethnicity_priors <- function() {
ethnicity_probs <- list("prob_asian" = 0.009823183,
"prob_african" = 0.078585462,
"prob_hispanic" = 0.064833006,
"prob_white" = 0.827111984)
return(ethnicity_probs)
}
#' Get sample name from gt.loh.tsv file
#'
#' @param gt_loh_file_loc sample gt file location
#' @return sample id
#'
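#' @examples
#' # Hypothetical file path, shown for illustration only
#' get_sample_name("/path/to/sample_A.gt.loh.tsv")  # returns "sample_A"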
#' @export
get_sample_name <- function(gt_loh_file_loc) {
file_splits <- strsplit(gt_loh_file_loc, "/")
sample_file <- sapply(file_splits,
function(x) suppressWarnings(as.character(x[[length(x)]])))
  sample_id <- gsub(".gt.loh.tsv", "", sample_file, fixed = TRUE)
return(sample_id)
}
#' Bounds allele frequencies based on a specified amount
#'
#' Adds the specified pseudocount frequency to allele frequencies below
#' 1 - pseudocount_freq, and caps the remaining frequencies at
#' 1 - pseudocount_freq, keeping frequencies away from 0 and 1.
#'
#' @param afs vector of allele frequencies
#' @param pseudocount_freq pseudocount frequency used to bound the allele frequencies
#' @return bounded allele frequencies
#'
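#' @examples
#' # Frequencies of exactly 0 or 1 are pulled away from the boundaries
#' bounded_freqs(c(0, 0.5, 1), pseudocount_freq = 0.001)  # 0.001 0.501 0.999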
#' @export
bounded_freqs <- function(afs, pseudocount_freq) {
  pseudocount_afs <- ifelse(afs < 1 - pseudocount_freq,
                            afs + pseudocount_freq,
                            1 - pseudocount_freq)
return(pseudocount_afs)
}
#' Spike in or remove an allele into a parsed 1000G vcf based on population allele frequency
#'
#' Adds a single allele count to SNPs whose population allele frequency is 0
#' and removes one from SNPs whose frequency is 1, so that genotype
#' likelihoods never become exactly 0 and log likelihoods are not driven to -Inf.
#'
#' @param parsed_vcf parsed 1000G vcf generated from 'parse_1000g_vcf()'
#' @return Data frame of the parsed vcf with bounded population allele frequencies
#' @importFrom dplyr mutate
#'
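#' @examples
#' # Minimal sketch: a toy data frame standing in for the output of
#' # parse_1000g_vcf(); column values are hypothetical
#' toy_vcf <- data.frame(AN = c(5008, 5008),
#'                       EAS_AF = c(0, 0.2), AMR_AF = c(0.1, 0.3),
#'                       AFR_AF = c(0.2, 1), EUR_AF = c(0.3, 0.5),
#'                       SAS_AF = c(0.4, 0.6))
#' prep_1000g_vcf_af(toy_vcf)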
#' @export
prep_1000g_vcf_af <- function(parsed_vcf) {
  single_allele_count_freq <- 1 / (1 + max(parsed_vcf$AN))
prepped_vcf <- parsed_vcf %>%
dplyr::mutate(EAS_AF = conta::bounded_freqs(EAS_AF, single_allele_count_freq),
AMR_AF = conta::bounded_freqs(AMR_AF, single_allele_count_freq),
AFR_AF = conta::bounded_freqs(AFR_AF, single_allele_count_freq),
EUR_AF = conta::bounded_freqs(EUR_AF, single_allele_count_freq),
SAS_AF = conta::bounded_freqs(SAS_AF, single_allele_count_freq))
return(prepped_vcf)
}
#' Create data frame with the intersection of SNPs between sample and vcf
#'
#' @param gt_loh_file_loc sample gt.loh.tsv file
#' @param prepped_vcf 1000G vcf read in and prepped by 'prep_1000g_vcf_af()'
#' @return Data frame with the intersection of gt.loh.tsv file and prepped 1000G vcf
#' @importFrom readr read_tsv
#' @importFrom dplyr select inner_join arrange
#' @importFrom tidyr drop_na
#'
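#' @examples
#' \dontrun{
#' # Sketch only: file paths are hypothetical and require real inputs
#' prepped_vcf <- prep_1000g_vcf_af(parse_1000g_vcf("/path/to/1000g.vcf.gz"))
#' get_snp_intersection("/path/to/sample_A.gt.loh.tsv", prepped_vcf)
#' }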
#' @export
get_snp_intersection <- function(gt_loh_file_loc, prepped_vcf) {
# Read in gt.loh.tsv file and keep only rsid and bayesian genotype calls
gt_loh_file <- conta::read_data_table(gt_loh_file_loc) %>%
dplyr::select(rsid, bayes_gt)
  # Take the intersection between the gt file and the vcf
snp_intersect <- data.frame(dplyr::inner_join(gt_loh_file, prepped_vcf,
by = c("rsid"))) %>%
dplyr::arrange(chr) %>%
tidyr::drop_na()
return(snp_intersect)
}
#' Calculate probabilities per genotype and allele frequency,
#' based on Hardy–Weinberg equilibrium.
#'
#' @param genotype genotype, either "0/0", "0/1", or "1/1"
#' @param af allele frequency, should be 0 to 1
#' @return genotype likelihood
#'
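#' @examples
#' # Hardy-Weinberg likelihoods at an allele frequency of 0.3
#' get_gt_likelihoods("0/1", 0.3)            # 2 * 0.7 * 0.3 = 0.42
#' get_gt_likelihoods(c("0/0", "1/1"), 0.3)  # 0.49, 0.09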
#' @export
get_gt_likelihoods <- function(genotype, af) {
gt_likelihoods <- ifelse(genotype == "0/0", (1-af)^2,
ifelse(genotype == "0/1", 2 * ((1-af)*(af)),
ifelse(genotype == "1/1", af^2, NA)))
return(gt_likelihoods)
}
#' Calculate genotype likelihoods per SNP per 1000G population
#'
#' @param snp_intersect Data frame of the intersection of snps between sample
#' and 1000G vcf by 'get_snp_intersection()'
#' @return Data frame with the initial genotype likelihoods per SNP per 1000G population
#' @importFrom dplyr mutate
#'
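#' @examples
#' # Minimal sketch: toy rows standing in for the output of
#' # get_snp_intersection(); values are hypothetical
#' toy_snps <- data.frame(rsid = c("rs1", "rs2"),
#'                        bayes_gt = c("0/1", "1/1"),
#'                        EAS_AF = c(0.2, 0.8), AMR_AF = c(0.3, 0.7),
#'                        AFR_AF = c(0.4, 0.6), EUR_AF = c(0.5, 0.5),
#'                        SAS_AF = c(0.6, 0.4))
#' get_snp_likelihoods(toy_snps)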
#' @export
get_snp_likelihoods <- function(snp_intersect) {
# Calculate P(Data|Ethnicity)
initial_snp_likelihoods <- snp_intersect %>%
dplyr::mutate(EAS_likelihoods = conta::get_gt_likelihoods(bayes_gt, EAS_AF),
AMR_likelihoods = conta::get_gt_likelihoods(bayes_gt, AMR_AF),
AFR_likelihoods = conta::get_gt_likelihoods(bayes_gt, AFR_AF),
EUR_likelihoods = conta::get_gt_likelihoods(bayes_gt, EUR_AF),
SAS_likelihoods = conta::get_gt_likelihoods(bayes_gt, SAS_AF))
return(initial_snp_likelihoods)
}
#' Predict ethnicity per chromosome
#'
#' This function produces ranked ethnicity predictions per chromosome. First,
#' it takes the per-SNP population likelihoods output by 'get_snp_likelihoods()'.
#' Second, it log-transforms the likelihoods and sums them per chromosome.
#' Third, it applies the ethnicity priors to the summed likelihoods to generate
#' posterior likelihoods. Lastly, it calculates P(data) and normalizes the
#' posterior likelihoods by it to generate posterior probabilities per chromosome.
#'
#' @param initial_snp_likelihoods Data frame of initial likelihoods per SNP
#'                                per 1000G population, as produced by
#'                                'get_snp_likelihoods()'
#' @return Data frame of ranked ethnicity predictions and probabilities per chromosome
#' @importFrom dplyr mutate group_by summarise n select transmute arrange rename left_join
#' @importFrom tidyr gather
#'
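#' @examples
#' \dontrun{
#' # Sketch of the per-chromosome ranking; file paths are hypothetical and the
#' # upstream steps must have been run on real inputs
#' prepped_vcf <- prep_1000g_vcf_af(parse_1000g_vcf("/path/to/1000g.vcf.gz"))
#' snp_likelihoods <- get_snp_likelihoods(
#'   get_snp_intersection("/path/to/sample_A.gt.loh.tsv", prepped_vcf))
#' calculate_ranked_predictions(snp_likelihoods)
#' }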
#' @export
calculate_ranked_predictions <- function(initial_snp_likelihoods) {
  # Calculate log(P(Data|Ethnicity))
  # Takes the initial likelihoods and applies a natural log transformation,
  # so that the later exp() calls recover likelihoods on the original scale
  ll <- initial_snp_likelihoods %>%
    dplyr::mutate(EAS_ll = log(EAS_likelihoods),
                  AMR_ll = log(AMR_likelihoods),
                  AFR_ll = log(AFR_likelihoods),
                  EUR_ll = log(EUR_likelihoods),
                  SAS_ll = log(SAS_likelihoods))
# Calculate sum(log(P(Data|Ethnicity))) per chromosome.
# Takes the sum of the log likelihoods per ethnicity per chromosome
ll_chr <- ll %>%
dplyr::group_by(chr) %>%
    dplyr::summarise(n_snps = dplyr::n(),
                     EAS = sum(EAS_ll),
                     AMR = sum(AMR_ll),
                     AFR = sum(AFR_ll),
                     EUR = sum(EUR_ll),
                     SAS = sum(SAS_ll))
# Capture the number of snps per chromosome so we know how much data was used
# to generate these probabilities
snps_per_chr <- ll_chr %>%
dplyr::select(chr, n_snps)
  # Calculate exp(sum(log(P(Data|Ethnicity)))) * P(Ethnicity) per chromosome.
  # Read in the CCGA1 ethnicity priors, exponentiate the log likelihoods and
  # multiply by the ethnicity priors to calculate posterior likelihoods. Note
  # that the EAS and SAS populations share the single CCGA1 Asian prior.
eth_priors <- conta::get_ethnicity_priors()
posterior_ll <- ll_chr %>%
dplyr::transmute(EAS = exp(EAS) * eth_priors$prob_asian,
AMR = exp(AMR) * eth_priors$prob_hispanic,
AFR = exp(AFR) * eth_priors$prob_african,
EUR = exp(EUR) * eth_priors$prob_white,
SAS = exp(SAS) * eth_priors$prob_asian)
# Calculate exp(sum(log(P(Data|Ethnicity)))) * P(Ethnicity) / P(Data) per chromosome
# Calculate the probability of data by taking the sum of the posterior likelihoods
# across all possible ethnicities.
posterior_ll_num <- posterior_ll %>%
dplyr::mutate(prob_data = rowSums(posterior_ll))
  # Each ethnicity's posterior likelihood is normalized by the probability of
  # the data, generating posterior probabilities per ethnicity
probs_per_chr <- posterior_ll_num %>%
dplyr::transmute(East_Asian = (EAS/prob_data),
American = (AMR/prob_data),
African = (AFR/prob_data),
European = (EUR/prob_data),
South_Asian = (SAS/prob_data))
probs_per_chr_final <- cbind(snps_per_chr, probs_per_chr)
probs_per_chr_final$chr <- factor(probs_per_chr_final$chr, levels = c(1:22))
# Rank ethnicity predictions per chromosome
pre_ranked_predictions <- probs_per_chr_final %>%
dplyr::select(-n_snps) %>%
tidyr::gather(column, value, -chr) %>%
dplyr::group_by(chr) %>%
dplyr::mutate(pred_rank = rank(-value)) %>%
dplyr::arrange(pred_rank, as.numeric(chr)) %>%
dplyr::rename("ethnicity" = 2,
"probability" = 3)
  # Join SNPs per chromosome to the ranked predictions data frame. This allows
  # for better diagnostics by showing how much power (SNPs) went into
  # calculating the ethnicity probabilities for each chromosome.
pre_ranked_predictions$chr <- as.character(pre_ranked_predictions$chr)
ranked_predictions <- dplyr::left_join(snps_per_chr,
pre_ranked_predictions,
by = "chr") %>%
dplyr::arrange(pred_rank)
return(ranked_predictions)
}
#' Make tile plot of ranked predictions per chromosome
#'
#' Makes tile plot of ranked predictions per chromosome using ggplot
#'
#' @param ranked_predictions Data frame of ranked predictions produced by `calculate_ranked_predictions()`
#' @return ggplot object of the plot
#' @importFrom ggplot2 ggplot aes geom_tile scale_y_reverse scale_x_discrete labs theme element_text guides guide_legend
#'
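#' @examples
#' \dontrun{
#' # Assumes `ranked_predictions` was produced by calculate_ranked_predictions()
#' g <- plot_ranked_predictions_per_chromosome(ranked_predictions)
#' print(g)
#' }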
#' @export
plot_ranked_predictions_per_chromosome <- function(ranked_predictions) {
g <- ranked_predictions %>%
ggplot(aes(x = chr, y = pred_rank, fill = ethnicity)) +
geom_tile(color = "black", size = 0.5) +
scale_y_reverse() +
    scale_x_discrete(limits = c(1:22, "X")) +
    labs(x = "Chromosome",
         y = "Prediction Rank") +
    theme(axis.text = element_text(size = 12),
          axis.title = element_text(size = 14, face = "bold")) +
    guides(fill = guide_legend("Ethnicity:"))
return(g)
}
#' Run the conta ethnicity prediction workflow
#'
#' Runs the full conta ethnicity prediction workflow using a sample genotype
#' file with LOH regions removed and a 1000G vcf file. The workflow reads in,
#' parses, and prepares the 1000G vcf file, then takes the SNP intersection
#' with the sample's gt.loh.tsv file. Ethnicity probabilities are calculated
#' per SNP per chromosome, and tables and plots are written to the specified
#' output directory.
#'
#' @param gt_loh_file_loc sample gt.loh.tsv file
#' @param vcf 1000G vcf
#' @param outdir Path to directory to write outputs to.
#' @return None, writes out TSVs and plots to `outdir`.
#' @importFrom readr write_tsv
#' @importFrom dplyr filter group_by count arrange top_n select bind_cols bind_rows ungroup
#' @importFrom grDevices png dev.off
#'
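#' @examples
#' \dontrun{
#' # Hypothetical inputs; writes TSVs and a PNG tile plot into `outdir`
#' conta_ethnicity("/path/to/sample_A.gt.loh.tsv",
#'                 "/path/to/1000g_snps.vcf.gz",
#'                 outdir = "/path/to/output")
#' }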
#' @export
conta_ethnicity <- function(gt_loh_file_loc,
vcf,
                            outdir) {
parsed_vcf <- conta::parse_1000g_vcf(vcf, remove_sex_chr = TRUE)
prepped_vcf <- conta::prep_1000g_vcf_af(parsed_vcf)
snp_intersect <- conta::get_snp_intersection(gt_loh_file_loc, prepped_vcf)
initial_snp_likelihoods <- conta::get_snp_likelihoods(snp_intersect)
ranked_predictions <- conta::calculate_ranked_predictions(initial_snp_likelihoods)
# Write out table of each ranked prediction per chromosome with probabilities
sample_id <- conta::get_sample_name(gt_loh_file_loc)
readr::write_tsv(ranked_predictions,
file.path(outdir, paste(sample_id,
"ranked_ethnicity_predictions_per_chromosome.tsv",
sep = "_")))
# Write out table of top 2 predictions and number of chromosomes which
# support that prediction
top_2_predictions <- ranked_predictions %>%
dplyr::filter(pred_rank == 1) %>%
dplyr::group_by(pred_rank) %>%
dplyr::count(ethnicity) %>%
dplyr::arrange(-n) %>%
dplyr::top_n(2) %>%
dplyr::ungroup() %>%
dplyr::select(-pred_rank)
# Set blank data frame for output template
output_template <- setNames(data.frame(matrix(ncol = 4, nrow = 0)),
c("best_ethnicity_pred", "best_n_chr",
"second_ethnicity_pred", "second_n_chr"))
# Bind predictions from sample to output template
# "top_2_predictions" dataframe is either a 2x2 (two ethnicities with the
# highest number of chromosome probabilities) or 1x2 table (when a single
# ethnicity is predicted for all chromosomes).
  if (nrow(top_2_predictions) == 2) {
top_2_predictions_transformed <- dplyr::bind_cols(top_2_predictions[1, ],
top_2_predictions[2, ])
colnames(top_2_predictions_transformed) <- c("best_ethnicity_pred", "best_n_chr",
"second_ethnicity_pred", "second_n_chr")
pre_top_2_predictions_table <- dplyr::bind_rows(output_template, top_2_predictions_transformed)
} else {
colnames(top_2_predictions) <- c("best_ethnicity_pred", "best_n_chr")
pre_top_2_predictions_table <- dplyr::bind_rows(output_template, top_2_predictions)
}
# Bind sample id to top 2 prediction table
top_2_predictions_table <- cbind(sample_id, pre_top_2_predictions_table)
# Write out table of top 2 predictions and number of chromosomes that support the call
readr::write_tsv(top_2_predictions_table,
file.path(outdir, paste(sample_id,
"top_2_predictions.tsv",
sep = "_")))
# Plot ranked predictions per chromosome
ranked_prediction_plot <- plot_ranked_predictions_per_chromosome(ranked_predictions)
png(file.path(outdir, paste(sample_id, "ethnicity_predictions_tile_plot.png", sep = "_")),
height=720,
width=1280,
res = 150)
print(ranked_prediction_plot)
dev.off()
}