knitr::opts_chunk$set(
    collapse = TRUE,
    comment = "#>",
    eval = TRUE
)

Set up

Load libraries

library(dplyr)
library(readr)
library(tibble)
library(stringr)
library(magrittr)
library(KrasAlleleCna)

Establish path variables:

pkg_dir <- system.file(package = "KrasAlleleCna")
extdata_dir <- system.file("extdata", package = "KrasAlleleCna")
data_dir <- system.file("data", package = "KrasAlleleCna")

Below are the KRAS gene locations (it is on the negative strand).

KRAS_LOWER <- 25204789  # end of gene
KRAS_UPPER <- 25251003  # start of gene

Load and prepare data

The tumor purity was calculated by TCGA and downloaded from https://gdc.cancer.gov/about-data/publications/pancanatlas where it was labeled as "ABSOLUTE purity/ploidy file - TCGA_mastercalls.abs_tables_JSedit.fixed.txt". The only data required for downstream analysis is the ID (below, extracted from the sample column) and the purity.

purity_tib <- read_tsv(file.path(extdata_dir, "tcga_purity_ploidy.tsv")) %>%
    mutate(common_id = str_sub(sample, 1, 16)) %>%
    select(common_id, purity)
purity_tib

Tumor CNV data were downloaded from https://gdc.cancer.gov/about-data/publications/pancanatlas and labeled "Copy Number - broad.mit.edu_PANCAN_Genome_Wide_SNP_6_whitelisted.seg" (Due to file size restrictions, "tcga_copynumber.tsv" is available in a compressed for in "inst/extdata/", and was loaded into "data" for use in the package using usethis::use_data().)

KRAS is on chromosome 12 in humans, so only data for that chromosome was retained. The common_id was extracted as above and the start and stop columns for the CNV calls were renamed to be more specific to remove ambiguity after merging data sets, later. Finally, the copy number was calculated from the Segment_Mean; I found out how to do this here: "The GDC further transforms these copy number values into segment mean values, which are equal to log2(copy-number/ 2). Diploid regions will have a segment mean of zero, amplified regions will have positive values, and deletions will have negative values."

cnv_tib %<>%
    filter(Chromosome == 12) %>%
    mutate(common_id = str_sub(Sample, 1, 16)) %>%
    dplyr::rename(CNV_start = Start,
                  CNV_end = End) %>%
    mutate(copy_number = 2^(Segment_Mean + 1)) %>%
    select(common_id, CNV_start, CNV_end, copy_number)
cnv_tib

Both the purity and copy number data was joined to the ANNOVAR and mpileup results.

allele_data <- allele_depth_tib %>%
    dplyr::mutate(common_id = sample_id) %>%
    select(common_id, Chr, Start, End, aa_mod, total_num_reads, num_mut_reads,
           Ref, Alt, Func.refGene, Gene.refGene, ExonicFunc.refGene, file_name,
           file_id, project_id, case_id, sample_id, sample_type) %>%
    left_join(cnv_tib, by = "common_id") %>%
    left_join(purity_tib, by = "common_id") %>%
    unique()

Gather CNV values

Some samples had multiple measures of copy number along KRAS. The following section selects a single value for each sample. It isn't pretty, but gets the job done.

An empty tibble was created with the correct column names for which the data will be add to row-wise

allele_data_filt <- allele_data %>% slice(0)
for (fi in unique(allele_data$file_id)) {
    # temp tib with this loops data
    tt <- allele_data %>%
        filter(file_id == fi)
    # if WT:
    if (tt$aa_mod[[1]] == "WT") {
        # bool for if CNV probe straddles or cuts KRAS
        tt %<>%
            mutate(straddle_kras = (CNV_start < KRAS_LOWER &
                                    KRAS_UPPER < CNV_end),
                   cuts_kras = ((CNV_start < KRAS_LOWER &
                                CNV_end < KRAS_UPPER) |
                               (KRAS_LOWER < CNV_start &
                                KRAS_UPPER < CNV_end)),
                   dist_to_kras = min(abs(CNV_start - KRAS_LOWER),
                                      abs(CNV_start - KRAS_UPPER),
                                      abs(CNV_end - KRAS_LOWER),
                                      abs(CNV_end - KRAS_UPPER)))
        # prefer straddle
        if (any(tt$straddle_kras, na.rm = TRUE)) {
            tt %<>%
                filter(straddle_kras)
        # else use probes that cut KRAS
        } else if (any(tt$cuts_kras, na.rm = TRUE)) {
            tt %<>%
                filter(cuts_kras)
        } else {
            message("insufficient filters")
        }
        # standardize tt to be rbind
        tt  %<>%
            mutate(copy_number = mean(copy_number)) %>%
            slice(1) %>%
            select(-c("straddle_kras", "cuts_kras", "dist_to_kras"))
    } else {  # if mutant:
        # keep CNV probe that contains the mutation
        tt %<>%
            filter(CNV_start < Start & Start < CNV_end) %>%
            mutate(copy_number = mean(copy_number)) %>%
            slice(1)
        if (nrow(tt) == 0) {
            # if no probe contained it, just return the original data and warn
            tt <- allele_data %>%
                filter(file_id == fi) %>%
                slice(1)
            warning(paste("Mutant with no rows:", fi))
        }
    }
    # compile tt into the final tibble
    allele_data_filt <- dplyr::bind_rows(allele_data_filt, tt)
}

Check if any mutants have missing data.

allele_data_filt %>%
    filter(is.na(purity) & aa_mod != "WT") %>%
    select(aa_mod, project_id)

Calculate allele specific copy number

Finally, the allele specific copy number was calculated. The equation was taken from Eq. (2) in Stephens et al. 2012, Nature Letter:

$$ n_{mut} = \frac{r}{R} \times \frac{1}{p} \times (p \times n_{locus} + 2(1 - p)) $$

where

allele_data_filt %<>%
    mutate(cn_mut = (num_mut_reads / (total_num_reads * purity)) *
           (purity * copy_number + (2 * (1 - purity))),
           copy_number_adj = (1 / purity) * (copy_number - (2 * (1 - purity))),
           cn_wt = ifelse(aa_mod == "WT",
                          copy_number_adj,
                          copy_number_adj - cn_mut))
usethis::use_data(allele_data_filt, overwrite = TRUE)


jhrcook/KrasAlleleCna documentation built on May 28, 2019, 1:22 p.m.