knitr::opts_chunk$set(echo = TRUE)

Load packages:

library(MultiAssayExperiment)
library(curatedTCGAData)
library(TCGAutils)

From Masseroli et al. 2018

"In TCGA data of BRCA patients, find the DNA somatic mutations within the first 2000 bp outside of the genes that are both expressed with FPKM > 3 and have at least a methylation in the same patient biospecimen, and extract these mutations of the top 5% patients with the highest number of such mutations."

First we load the 'BRCA' dataset with 'Mutation', 'RNASeq2GeneNorm' and 'Methylation' assays / experiments:

system.time(
    brca <- curatedTCGAData(
        diseaseCode = "BRCA",
        assays = c("Mutation", "RNASeq2GeneNorm", "Methylation"),
        dry.run = FALSE
    )
)

We then add gene symbol annotations with genomic ranges and drop any unmapped symbols. We also do the same for CpG probe identifiers:

brca0 <- symbolsToRanges(brca, unmapped = FALSE)

if (!require("IlluminaHumanMethylation450kanno.ilmn12.hg19", quietly = TRUE))
    BiocManager::install("IlluminaHumanMethylation450kanno.ilmn12.hg19")

library(IlluminaHumanMethylation450kanno.ilmn12.hg19)

brca1 <- CpGtoRanges(brca0, unmapped = FALSE)

Select only tumors in the data

sampleTables(brca1)
head(sampleTypes)

sampleselector <- lapply(colnames(brca1), function(x)
    TCGAsampleSelect(x, c("01", "06"))
)
brca1 <- brca1[, sampleselector]

We check what probes "have at least a methylation..." across all samples:

meth.has.genes <-
    !is.na(rowData(
        brca1[["BRCA_Methylation_methyl450-20160128_ranged"]])$Gene_Symbol)
brca1 <-
    subsetByRow(brca1,
        List("BRCA_Methylation_methyl450-20160128_ranged"=meth.has.genes))
genes <-
    rowData(brca1[["BRCA_Methylation_methyl450-20160128_ranged"]])$Gene_Symbol
methylated <- !is.na(as.matrix(
    assay(brca1[["BRCA_Methylation_methyl450-20160128_ranged"]])
))

Number of non-missing per gene in each column

# convert to integer first
meth <- methylated * 1L
system.time(meth1 <- rowsum(meth, group = genes))
# where genes have at least one methylation
meth2 <- meth1 > 0L
brca1 <- c(brca1, has.meth = SummarizedExperiment(meth2))

Get ranges and add "within the first 2000 bp outside of the genes"

rnaranges <- rowRanges(brca1[["BRCA_RNASeq2GeneNorm-20160128_ranged"]])
allROIs <- flank(rnaranges, 2000, both = TRUE)

Find any somatic mutations within regions of interest:

somatic <- function(scores, ranges, qranges) any(!is.na(scores))

genome(brca1[["BRCA_Mutation-20160128"]]) <-
    TCGAutils::translateBuild(genome(brca1[["BRCA_Mutation-20160128"]]))

mutations <- RaggedExperiment::qreduceAssay(
    x = brca1[["BRCA_Mutation-20160128"]],
    query = allROIs,
    simplifyReduce = somatic,
    i = "Variant_Classification"
)
rownames(mutations) <- names(rnaranges)
mutations[is.na(mutations)] <- 0

brca1 <- c(brca1, mutations = SummarizedExperiment(mutations))

Handle technical replicates and take the sum of number of mutations per sample

reps <- replicated(brca1)[["mutations"]]
compreps <- mergeReplicates(assay(brca1[["mutations"]]), reps, sum)
brca1[["mutations"]] <- SummarizedExperiment(compreps)

Resolving replicates in other assays

brca2 <- brca1[, ,
    c("BRCA_RNASeq2GeneNorm-20160128_ranged", "has.meth", "mutations")]

rnamerged <- mergeReplicates(brca2[["BRCA_RNASeq2GeneNorm-20160128_ranged"]],
    replicates = replicated(brca2[, , "BRCA_RNASeq2GeneNorm-20160128_ranged"])[[1L]],
    simplify = function (x) {
        gt3 <- x > 3
        if (any(gt3))
            BiocGenerics::mean(x[gt3])
        else
            x[[1]]
    }
)

brca2[["BRCA_RNASeq2GeneNorm-20160128_ranged"]] <- rnamerged

methmerge <- mergeReplicates(brca2[["has.meth"]],
    replicates = replicated(brca2[, , "has.meth"])[[1L]],
    simplify = any
)

mutmerge <- mergeReplicates(brca2[["mutations"]],
    replicates = replicated(brca2[, , "mutations"])[[1L]],
    simplify = sum
)

brca2[["has.meth"]] <- methmerge
brca2[["mutations"]] <- mutmerge

brca3 <- as(brca2, "MatchedAssayExperiment")
brca3 <- intersectRows(brca3)

An expression threshold is used where "FPKM > 3". In our case, we use the better alternative: TPM > 3

https://haroldpimentel.wordpress.com/2014/05/08/what-the-fpkm-a-review-rna-seq-expression-units/ http://diytranscriptomics.com/Reading/files/wagnerTPM.pdf

keep <- assay(brca3, "BRCA_RNASeq2GeneNorm-20160128_ranged") > 3 &
  assay(brca3, "has.meth") &
  assay(brca3, "mutations")

summary(colSums(keep))
sort(colSums(keep), decreasing = TRUE)[seq_len(round(ncol(keep)*0.05))]


LiNk-NY/curatedTCGAManu documentation built on July 22, 2020, 4:06 p.m.