Nothing
#' Create Non-coding Impact Score Data Frame
#'
#' @inheritParams predict_coding_impact
#'
#' @return data frame of meta-prediction scores containing 2 columns: \describe{
#' \item{gene_symbol}{HGNC gene symbol}
#' \item{CADD_phred}{PHRED-scaled CADD score}
#' }
create_noncoding_impact_score_df <- function(annovar_csv_path, na.string = ".") {
annovar_df <- utils::read.csv(annovar_csv_path)
noncoding_df <- annovar_df[annovar_df$Func.refGene != "exonic", ]
noncoding_df <- noncoding_df[noncoding_df$CADD_phred != na.string, ]
noncoding_df$CADD_phred <- as.numeric(noncoding_df$CADD_phred)
noncoding_scores_df <- noncoding_df[, c("Gene.refGene", "CADD_phred")]
colnames(noncoding_scores_df) <- c("gene_symbol", "CADD_score")
# keep first symbol if multiple symbols exist
noncoding_scores_df$gene_symbol[grepl(";", noncoding_scores_df$gene_symbol)] <- vapply(noncoding_scores_df$gene_symbol[grepl(";",
noncoding_scores_df$gene_symbol)], function(x) unlist(strsplit(x, ";"))[1],
"char")
# keep highest score
noncoding_scores_df <- noncoding_scores_df[order(noncoding_scores_df$CADD_score,
decreasing = TRUE), ]
noncoding_scores_df <- noncoding_scores_df[!duplicated(noncoding_scores_df$gene_symbol),
]
return(noncoding_scores_df)
}
#' Create Gene-level SCNA Data Frame
#'
#' @param scna_segs_df the SCNA segments data frame. Must contain: \describe{
#' \item{chr}{chromosome the segment is located in}
#' \item{start}{start position of the segment}
#' \item{end}{end position of the segment}
#' \item{log2ratio}{\ifelse{html}{\out{log<sub>2</sub>}}{\eqn{log_2}} ratio of the segment}
#' }
#' @param build genome build for the SCNA segments data frame (default = 'GRCh37')
#' @param gene_overlap_threshold the percentage threshold for the overlap between
#' a segment and a transcript (default = 25). This means that if only a segment
#' overlaps a transcript more than this threshold, the transcript is assigned
#' the segment's SCNA event.
#'
#' @return data frame of gene-level SCNA events, i.e. table of genes overlapped
#' by SCNA segments.
#'
#' @importFrom rlang .data
create_gene_level_scna_df <- function(scna_segs_df, build = "GRCh37", gene_overlap_threshold = 25) {
if (!requireNamespace("org.Hs.eg.db", quietly = TRUE)) {
stop(
"Package 'org.Hs.eg.db' is required but not installed.\n",
"Install it with:\n",
" if (!requireNamespace('BiocManager', quietly = TRUE)) install.packages('BiocManager')\n",
" BiocManager::install('org.Hs.eg.db')",
call. = FALSE
)
}
if (build == "GRCh37" && !requireNamespace("TxDb.Hsapiens.UCSC.hg19.knownGene", quietly = TRUE)) {
stop(
"Package 'TxDb.Hsapiens.UCSC.hg19.knownGene' is required but not installed.\n",
"Install it with:\n",
" if (!requireNamespace('BiocManager', quietly = TRUE)) install.packages('BiocManager')\n",
" BiocManager::install('TxDb.Hsapiens.UCSC.hg19.knownGene')",
call. = FALSE
)
}
if (build == "GRCh38" && !requireNamespace("TxDb.Hsapiens.UCSC.hg38.knownGene", quietly = TRUE)) {
stop(
"Package 'TxDb.Hsapiens.UCSC.hg38.knownGene' is required but not installed.\n",
"Install it with:\n",
" if (!requireNamespace('BiocManager', quietly = TRUE)) install.packages('BiocManager')\n",
" BiocManager::install('TxDb.Hsapiens.UCSC.hg38.knownGene')",
call. = FALSE
)
}
### argument checks
nec_cols <- c("chr", "start", "end", "log2ratio")
if (!all(nec_cols %in% colnames(scna_segs_df))) {
stop("`scna_segs_df` should contain all of: ", paste(dQuote(nec_cols), collapse = ", "))
}
if (!build %in% c("GRCh37", "GRCh38"))
stop("`build should be one of ", paste(dQuote(c("GRCh37", "GRCh38")), collapse = ", "))
if (!is.numeric(gene_overlap_threshold))
stop("`gene_overlap_threshold` should be numberic")
if (gene_overlap_threshold < 0 | gene_overlap_threshold > 100)
stop("`gene_overlap_threshold` should be between 0-100")
#### determine gene-level log2-ratios GRanges objects
scna_gr <- GenomicRanges::makeGRangesFromDataFrame(scna_segs_df, keep.extra.columns = TRUE)
GenomeInfoDb::seqlevelsStyle(scna_gr) <- "UCSC"
if (build == "GRCh37") {
genes_gr <- suppressMessages(GenomicFeatures::genes(TxDb.Hsapiens.UCSC.hg19.knownGene::TxDb.Hsapiens.UCSC.hg19.knownGene))
} else {
genes_gr <- suppressMessages(GenomicFeatures::genes(TxDb.Hsapiens.UCSC.hg38.knownGene::TxDb.Hsapiens.UCSC.hg38.knownGene))
}
# overlaps
hits <- GenomicRanges::findOverlaps(scna_gr, genes_gr, type = "any", select = "all")
# obtain overlap data frame between SCNA segments and gene transcripts
ranges <- genes_gr[S4Vectors::subjectHits(hits)]
q_hits <- S4Vectors::queryHits(hits)
S4Vectors::mcols(ranges) <- c(S4Vectors::mcols(ranges), S4Vectors::mcols(scna_gr[q_hits]))
genes_df <- data.frame(chr = as.vector(GenomeInfoDb::seqnames(ranges)), as.data.frame(S4Vectors::mcols(ranges)))
genes_df$segment_start <- GenomicRanges::start(GenomicRanges::ranges(scna_gr[q_hits]))
genes_df$segment_end <- GenomicRanges::end(GenomicRanges::ranges(scna_gr[q_hits]))
genes_df$transcript_start <- GenomicRanges::start(ranges)
genes_df$transcript_end <- GenomicRanges::end(ranges)
tmp1 <- apply(genes_df[, c("transcript_end", "segment_end")], 1, min)
tmp2 <- apply(genes_df[, c("segment_start", "transcript_start")], 1, max)
genes_df$transcript_overlap_percent <- (tmp1 - tmp2)/(genes_df$transcript_end -
genes_df$transcript_start) * 100
# filter for `gene_overlap_threshold`
genes_df <- genes_df[genes_df$transcript_overlap_percent >= gene_overlap_threshold,
]
# add gene symbols
all_syms_tbl <- as.data.frame(org.Hs.eg.db::org.Hs.egSYMBOL)
genes_df$symbol <- all_syms_tbl$symbol[match(genes_df$gene_id, all_syms_tbl$gene_id)]
genes_df <- genes_df[!is.na(genes_df$symbol), ]
return(genes_df)
}
#' Create SCNA Score Data Frame
#'
#' @param scna_genes_df data frame of gene-level SCNAs (can be output of \code{\link{create_gene_level_scna_df}})
#' @inheritParams create_gene_level_scna_df
#' @param log2_ratio_threshold the \ifelse{html}{\out{log<sub>2</sub>}}{\eqn{log_2}}
#' ratio threshold for keeping high-confidence SCNA events (default = 0.25)
#' @param MCR_overlap_threshold the percentage threshold for the overlap between
#' a gene and an MCR region (default = 25). This means that if only a gene
#' overlaps an MCR region more than this threshold, the gene is assigned the
#' SCNA density of the MCR
#'
#' @return data frame of SCNA proxy scores containing 2 columns: \describe{
#' \item{gene_symbol}{HGNC gene symbol}
#' \item{SCNA_density}{SCNA proxy score. SCNA density (SCNA/Mb) of the minimal common region (MCR) in which the gene is located.}
#' }
#'
#' @details The function first aggregates SCNA \ifelse{html}{\out{log<sub>2</sub>}}{\eqn{log_2}} ratio
#' on gene-level (by keeping the ratio with the maximal \ifelse{html}{\out{|log<sub>2</sub>|}}{\eqn{|log_2|}}
#' ratio over all the SCNA segments overlapping a gene). Next, it identifies the
#' minimal common regions (MCRs) that the genes overlap and finally assigns the
#' SCNA density (SCNA/Mb) values as proxy SCNA scores.
create_SCNA_score_df <- function(scna_genes_df, build = "GRCh37", log2_ratio_threshold = 0.25,
MCR_overlap_threshold = 25) {
if (!requireNamespace("org.Hs.eg.db", quietly = TRUE)) {
stop(
"Package 'org.Hs.eg.db' is required but not installed.\n",
"Install it with:\n",
" if (!requireNamespace('BiocManager', quietly = TRUE)) install.packages('BiocManager')\n",
" BiocManager::install('org.Hs.eg.db')",
call. = FALSE
)
}
if (build == "GRCh37" && !requireNamespace("TxDb.Hsapiens.UCSC.hg19.knownGene", quietly = TRUE)) {
stop(
"Package 'TxDb.Hsapiens.UCSC.hg19.knownGene' is required but not installed.\n",
"Install it with:\n",
" if (!requireNamespace('BiocManager', quietly = TRUE)) install.packages('BiocManager')\n",
" BiocManager::install('TxDb.Hsapiens.UCSC.hg19.knownGene')",
call. = FALSE
)
}
if (build == "GRCh38" && !requireNamespace("TxDb.Hsapiens.UCSC.hg38.knownGene", quietly = TRUE)) {
stop(
"Package 'TxDb.Hsapiens.UCSC.hg38.knownGene' is required but not installed.\n",
"Install it with:\n",
" if (!requireNamespace('BiocManager', quietly = TRUE)) install.packages('BiocManager')\n",
" BiocManager::install('TxDb.Hsapiens.UCSC.hg38.knownGene')",
call. = FALSE
)
}
### argument checks
if (!build %in% c("GRCh37", "GRCh38"))
stop("`build should be one of ", paste(dQuote(c("GRCh37", "GRCh38")), collapse = ", "))
if (!is.numeric(log2_ratio_threshold))
stop("`log2_ratio_threshold` should be numberic")
if (!is.numeric(MCR_overlap_threshold))
stop("`MCR_overlap_threshold` should be numberic")
if (MCR_overlap_threshold < 0 | MCR_overlap_threshold > 100)
stop("`MCR_overlap_threshold` should be between 0-100")
### add genomic coordinates if missing
if (!all(c("chr", "transcript_start", "transcript_end") %in% colnames(scna_genes_df))) {
all_syms_tbl <- as.data.frame(org.Hs.eg.db::org.Hs.egSYMBOL)
if (build == "GRCh37") {
genes_gr <- suppressMessages(GenomicFeatures::genes(TxDb.Hsapiens.UCSC.hg19.knownGene::TxDb.Hsapiens.UCSC.hg19.knownGene))
} else {
genes_gr <- suppressMessages(GenomicFeatures::genes(TxDb.Hsapiens.UCSC.hg38.knownGene::TxDb.Hsapiens.UCSC.hg38.knownGene))
}
all_coords_df <- as.data.frame(genes_gr)
tmp_ids <- all_syms_tbl$gene_id[match(scna_genes_df$symbol, all_syms_tbl$symbol)]
idx <- match(tmp_ids, all_coords_df$gene_id)
scna_genes_df$chr <- all_coords_df$seqnames[idx]
scna_genes_df$transcript_start <- all_coords_df$start[idx]
scna_genes_df$transcript_end <- all_coords_df$end[idx]
scna_genes_df <- scna_genes_df[!is.na(scna_genes_df$chr), ]
}
#### determine gene-level log2-ratios discard sex chromosomes
scna_genes_df <- scna_genes_df[!scna_genes_df$chr %in% c("chrX", "chrY"), ]
# aggregate as max |log2-ratio| over all values per gene
agg_ratios <- tapply(scna_genes_df$log2ratio, scna_genes_df$symbol, function(x) {
max_val <- max(abs(x))
if (max_val %in% x)
return(max_val)
return(-max_val)
})
gene_agg_df <- scna_genes_df[, c("symbol", "chr", "transcript_start", "transcript_end")]
gene_agg_df <- gene_agg_df[!duplicated(gene_agg_df$symbol), ]
gene_agg_df$agg_log2_ratio <- agg_ratios[match(gene_agg_df$symbol, names(agg_ratios))]
gene_agg_df <- gene_agg_df[!is.na(gene_agg_df$agg_log2_ratio), ]
# filter for high confidence
gene_agg_df <- gene_agg_df[abs(gene_agg_df$agg_log2_ratio) > log2_ratio_threshold,
]
# return empty data frame if no gene passing log2 threshold
if (nrow(gene_agg_df) == 0) {
df <- data.frame(symbol = character(), SCNA_density = numeric())
warning("No gene-level SCNA event passed the log2_ratio_threshold")
return(df)
}
if (build == "GRCh37")
MCR_table <- MCR_table_hg19 else MCR_table <- MCR_table_hg38
### assign MCR score to overlapping genes
MCR_gr <- GenomicRanges::makeGRangesFromDataFrame(MCR_table, keep.extra.columns = TRUE)
agg_gr <- GenomicRanges::makeGRangesFromDataFrame(gene_agg_df, seqnames.field = "chr",
start.field = "transcript_start", end.field = "transcript_end", keep.extra.columns = TRUE)
# overlaps
hits <- GenomicRanges::findOverlaps(agg_gr, MCR_gr, type = "any", select = "all")
# return empty data frame if no overlap
if (length(hits) == 0) {
df <- data.frame(symbol = character(), SCNA_density = numeric())
warning("No gene-level SCNA event overlapped any MCR region")
return(df)
}
# obtain overlap data frame between MCR regions and gene-level SCNA
ranges <- MCR_gr[S4Vectors::subjectHits(hits)]
q_hits <- S4Vectors::queryHits(hits)
S4Vectors::mcols(ranges) <- c(S4Vectors::mcols(ranges), S4Vectors::mcols(agg_gr[q_hits]))
final_scna_df <- data.frame(chr = as.vector(GenomeInfoDb::seqnames(ranges)),
as.data.frame(S4Vectors::mcols(ranges)))
final_scna_df$transcript_start <- GenomicRanges::start(GenomicRanges::ranges(agg_gr[q_hits]))
final_scna_df$transcript_end <- GenomicRanges::end(GenomicRanges::ranges(agg_gr[q_hits]))
final_scna_df$MCR_start <- GenomicRanges::start(ranges)
final_scna_df$MCR_end <- GenomicRanges::end(ranges)
tmp1 <- apply(final_scna_df[, c("MCR_end", "transcript_end")], 1, min)
tmp2 <- apply(final_scna_df[, c("transcript_start", "MCR_start")], 1, max)
final_scna_df$MCR_overlap_percent <- (tmp1 - tmp2)/(final_scna_df$MCR_end - final_scna_df$MCR_start) *
100
# filter for `MCR_overlap_threshold`
final_scna_df <- final_scna_df[final_scna_df$MCR_overlap_percent >= MCR_overlap_threshold,
]
# keep only MCR-concordant SCNA events
final_scna_df$scna_type <- ifelse(final_scna_df$agg_log2_ratio > 0, "Amp", "Del")
final_scna_df$MCR_concordant <- final_scna_df$MCR_type == final_scna_df$scna_type
final_scna_df <- final_scna_df[final_scna_df$MCR_concordant, ]
final_scna_df <- final_scna_df[, c("symbol", "SCNA_density")]
colnames(final_scna_df)[1] <- "gene_symbol"
return(final_scna_df)
}
#' Determine Hotspot Containing Genes
#'
#' @inheritParams predict_coding_impact
#' @param hotspot_threshold to determine hotspot genes, the (integer) threshold
#' for the minimum number of cases with certain mutation in COSMIC (default = 5)
#'
#' @return vector of gene symbols of genes containing hotspot mutation(s)
determine_hotspot_genes <- function(annovar_csv_path, hotspot_threshold = 5L) {
# argument check
if (!is.numeric(hotspot_threshold))
stop("`hotspot_threshold` should be numeric")
annovar_df <- utils::read.csv(annovar_csv_path)
# parse COSMIC occurences
coding_column <- colnames(annovar_df)[grepl("cosmic\\d+_coding", colnames(annovar_df))]
non_coding_column <- colnames(annovar_df)[grepl("cosmic\\d+_noncoding", colnames(annovar_df))]
annovar_df$coding_occurence <- vapply(annovar_df[, coding_column], function(x) {
tmp <- unlist(strsplit(x, ";"))[2]
tmp <- sub("OCCURENCE=", "", tmp)
tmp <- unlist(strsplit(tmp, ","))
tmp <- as.numeric(sub("\\(.*\\)", "", tmp))
return(sum(tmp))
}, 1)
annovar_df$noncoding_occurence <- vapply(annovar_df[, non_coding_column], function(x) {
tmp <- unlist(strsplit(x, ";"))[2]
tmp <- sub("OCCURENCE=", "", tmp)
tmp <- unlist(strsplit(tmp, ","))
tmp <- as.numeric(sub("\\(.*\\)", "", tmp))
return(sum(tmp))
}, 1)
annovar_df$coding_occurence <- ifelse(is.na(annovar_df$coding_occurence), 0,
annovar_df$coding_occurence)
annovar_df$noncoding_occurence <- ifelse(is.na(annovar_df$noncoding_occurence),
0, annovar_df$noncoding_occurence)
# keep first symbol if multiple symbols exist
annovar_df$Gene.refGene[grepl(";", annovar_df$Gene.refGene)] <- vapply(annovar_df$Gene.refGene[grepl(";",
annovar_df$Gene.refGene)], function(x) unlist(strsplit(x, ";"))[1], "char")
# determine genes containing hotspot mutation
cond <- (annovar_df$coding_occurence > hotspot_threshold) | (annovar_df$noncoding_occurence >
hotspot_threshold)
hotspot_genes <- unique(annovar_df$Gene.refGene[cond])
return(hotspot_genes)
}
#' Determine Double-Hit Genes
#'
#' @inheritParams predict_coding_impact
#' @inheritParams create_SCNA_score_df
#' @param log2_hom_loss_threshold to determine double-hit events, the
#' \ifelse{html}{\out{log<sub>2</sub>}}{\eqn{log_2}} threshold for identifying
#' homozygous loss events (default = -1).
#' @param batch_analysis boolean to indicate whether to perform batch analysis
#' (\code{TRUE}, default) or personalized analysis (\code{FALSE}). If \code{TRUE},
#' a column named 'tumor_id' should be present in both the ANNOVAR csv and the SCNA
#' table.
#'
#' @return vector of gene symbols that are subject to double-hit event(s), i.e.
#' non-synonymous mutation + homozygous copy-number loss
determine_double_hit_genes <- function(annovar_csv_path, scna_genes_df, log2_hom_loss_threshold = -1,
batch_analysis = FALSE) {
### argument checks
if (!is.numeric(log2_hom_loss_threshold))
stop("`log2_hom_loss_threshold` should be numberic")
if (!is.logical(batch_analysis))
stop("`batch_analysis` should be `TRUE` or `FALSE`")
### gene-level hom. loss df keep only hom. loss
loss_genes_df <- scna_genes_df[scna_genes_df$log2ratio < log2_hom_loss_threshold,
]
# discard sex chromosomes
loss_genes_df <- loss_genes_df[!loss_genes_df$chr %in% c("chrX", "chrY"), ]
### non-synonymous mutations df
annovar_df <- utils::read.csv(annovar_csv_path)
# exclude synonymous mutations
non_syn_df <- annovar_df[annovar_df$ExonicFunc.refGene != "synonymous SNV", ]
# keep first symbol if multiple symbols exist
non_syn_df$Gene.refGene[grepl(";", non_syn_df$Gene.refGene)] <- vapply(non_syn_df$Gene.refGene[grepl(";",
non_syn_df$Gene.refGene)], function(x) unlist(strsplit(x, ";"))[1], "char")
### determine double-hit genes
if (batch_analysis) {
if (!("tumor_id" %in% colnames(non_syn_df) & "tumor_id" %in% colnames(loss_genes_df)))
stop("'tumor id' should be present in both ANNOVAR output and SCNA table if `batch_analysis == TRUE`")
all_donors <- unique(non_syn_df$tumor_id)
all_donors <- all_donors[all_donors %in% loss_genes_df$tumor_id]
dhit_genes <- c()
for (donor in all_donors) {
tmp_loss <- loss_genes_df[loss_genes_df$tumor_id == donor, ]
tmp_loss <- unique(tmp_loss$symbol)
tmp_nonsyn <- non_syn_df$Gene.refGene[non_syn_df$tumor_id == donor]
dhit <- intersect(tmp_loss, tmp_nonsyn)
if (length(dhit) != 0)
dhit_genes <- c(dhit_genes, dhit)
}
dhit_genes <- unique(dhit_genes)
} else {
tmp_loss <- unique(loss_genes_df$symbol)
tmp_nonsyn <- unique(non_syn_df$Gene.refGene)
dhit_genes <- intersect(tmp_loss, tmp_nonsyn)
}
return(dhit_genes)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.