In nicolaroberts/hdp: Hierarchical Dirichlet Process for categorical count data

BiocStyle::markdown()

knitr::opts_chunk$set(fig.width=6, fig.height=5)

For the background to this document, see the other package vignette "General introduction to the hdp package."

One specific application of the HDP is the analysis of somatic mutation data from cancer genome sequencing projects. In this setting, the input data consists of counts of mutation categories across a number of cancer samples. The HDP model returns a set of components representing the underlying mutational processes with their characteristic distributions or 'signatures' over the set of possible mutation classes.

The ability to define hierarchies of sample-relatedness (via the tree of parent DP nodes in a customised HDP structure) allows mutational signatures to be inferred both across and within groups of patients, and groups of mutations within patients. Patient groups could be defined by cancer type or driver mutation status, while mutation groups within a patient could be defined by temporal or regional information, subclone status, phylogenetic branch, etc.

Example: de novo signature extraction in lung, ovarian, and skin cancers

Tally somatic mutation categories

The r Biocexptpkg("SomaticCancerAlterations") package contains several somatic alteration datasets for different cancer types (exome only). In this example, the number of somatic base substitution mutations in each of the 96 categories defined by local trinucleotide context are tallied across 100 lung cancers, 100 ovarian cancers, and 100 melanomas.

library(hdp)
# library(GenomicRanges)
# 
# # Lung adenocarcinoma
# data(luad_tcga, package="SomaticCancerAlterations")
# 
# # Ovarian serous cystadenocarcinoma
# data(ov_tcga, package="SomaticCancerAlterations")
# 
# # Skin cutaneous melanoma
# data(skcm_tcga, package="SomaticCancerAlterations")
# 
# # only keep SNP type, add cancer type to sample name, and only keep
# # necessary metadata. Only keep 100 samples. Then concatenate and sort.
# for (cancer_name in c("luad", "ov", "skcm")){
#   raw <- get(paste0(cancer_name, "_tcga"))
#   snv <- raw[which(raw$Variant_Type == "SNP")]
#   snv <- snv[which(snv$Patient_ID %in% levels(snv$Patient_ID)[1:100])]
#   mcols(snv) <- data.frame(sampleID=paste(cancer_name, snv$Patient_ID, sep='_'),
#                            ref=snv$Reference_Allele,
#                            alt=snv$Tumor_Seq_Allele2)
#   assign(cancer_name, snv)
# }
# variants <- sort(c(luad, ov, skcm))
# remove(cancer_name, luad, luad_tcga, ov, ov_tcga, skcm, skcm_tcga, raw, snv)

# tally mutation counts in 96 base substitution classes defined by trinucleotide context
# Could use the tally_mutations_96() function from https://github.com/nicolaroberts/nrmisc (hg19 specific)
# bit slow - don't run. Load pre-made copy. 
# mut_count <- nrmisc::tally_mutations_96(variants)
dim(mut_count)
head(mut_count[,1:5])
tail(mut_count[,1:5])
table(sapply(strsplit(row.names(mut_count), '_'), `[`, 1))

Initialise HDP structure

In this example, the HDP is structured to have

one top grandparent DP drawing from the base distribution (ppindex 0) with its own concentration parameter (cpindex 1),
three parent DPs (one per cancer type) drawing from the grandparent distribution (ppindex 1) and sharing a new concentration parameter (cpindex 2),
100 child DPs (one per lung cancer sample) drawing from the first parent (ppindex 2) and sharing a new concentration parameter (cpindex 3),
100 child DPs (one per ovarian cancer sample) drawing from the second parent (ppindex 3) and sharing a new concentration parameter (cpindex 4), and
100 child DPs (one per melanoma sample) drawing from the third parent (index 4) and sharing a new concentration parameter (cpindex 5).

The base distribution is a uniform Dirichlet with pseudocount 1 in each of the 96 possible mutation categories. Each concentration parameter is drawn from a gamma prior with hyperparameters rate=1, shape=1.

# initialise HDP
hdp_mut <- hdp_init(ppindex = c(0, rep(1, 3), rep(2:4, each=100)), # index of parental nodes
                    cpindex = c(1, rep(2, 3), rep(3:5, each=100)), # index of concentration param
                    hh=rep(1, 96), # prior is uniform over the 96 mutation categories
                    alphaa=rep(1, 5), # shape hyperparams for five different CPs
                    alphab=rep(1, 5)) # rate hyperparams for five different CPs

# add data to leaf nodes (one per cancer sample, in row order of mut_count)
hdp_mut <- hdp_setdata(hdp_mut, 
                       dpindex = 5:numdp(hdp_mut), # index of nodes to add data to
                       mut_count) # input data (mutation counts, sample rows match up with specified dpindex)

hdp_mut

Run multiple posterior sampling chains

For this example, I ran four independent posterior sampling chains, each separately initialised with 10 random starting clusters, followed by 5000 burn-in iterations, and the collection of 50 posterior samples off each chain with 200 iterations between each. For real-world analyses on larger datasets, be sure to check the diagnostic plots and adjust the burn-in and space parameters accordingly, usually collecting at least 1000 posterior samples (overall) from at least 4 independent sampling chains.

# Run four independent posterior sampling chains
# Takes ~15 minutes - don't run. Load pre-made copy.

# chlist <- vector("list", 4)
# for (i in 1:4){
# 
#   # activate DPs, 10 initial components
#   hdp_activated <- dp_activate(hdp_mut, 1:numdp(hdp_mut), initcc=10, seed=i*200)
# 
#   chlist[[i]] <- hdp_posterior(hdp_activated,
#                                burnin=5000,
#                                n=50,
#                                space=200,
#                                cpiter=3,
#                                seed=i*1e3)
# }
# 
# # example multi object
# mut_example_multi <- hdp_multi_chain(chlist)
mut_example_multi

par(mfrow=c(2,2), mar=c(4, 4, 2, 1))
p1 <- lapply(chains(mut_example_multi), plot_lik, bty="L", start=1000)
p2 <- lapply(chains(mut_example_multi), plot_numcluster, bty="L")
p3 <- lapply(chains(mut_example_multi), plot_data_assigned, bty="L")

Always remember to check that the diagnostic plots show no strong trends over the posterior samples collected!

Extract components (mutational signatures)

The extracted components represent the underlying mutational processes giving rise to the observed catalogues of somatic mutation. Several processes are recognisable from COSMIC, for example (1) is the UV radiation signature dominant in melanomas, (2) is the tobacco signature dominant in lung cancers, etc.

The plot_dp_comp_exposure() function plots the estimated proportion of mutations within each sample derived from each signature. Note that by setting incl_nonsig=FALSE, only those signatures with non-zero 95% credibility intervals for exposure in a sample are included. As a result, a fraction of the sample's signature exposure is left unexplained, as we can't have confidence that the other signatures truly contribute to that sample.

mut_example_multi <- hdp_extract_components(mut_example_multi)
mut_example_multi

par(mfrow=c(1,1), mar=c(5, 4, 4, 2))
plot_comp_size(mut_example_multi, bty="L")

trinuc_context <- sapply(strsplit(colnames(mut_count), '\\.'), `[`, 4)
group_factor <- as.factor(rep(c("C>A", "C>G", "C>T", "T>A", "T>C", "T>G"),
                           each=16))

# pick your colours
mut_colours <- c(RColorBrewer::brewer.pal(10, 'Paired')[seq(1,10,2)], 'grey70')

plot_comp_distn(mut_example_multi, cat_names=trinuc_context,
                  grouping=group_factor, col=mut_colours,
                  col_nonsig="grey80", show_group_labels=TRUE)

plot_dp_comp_exposure(mut_example_multi, main_text="Lung adenocarcinoma",
                      dpindices=4+(1:100), 
                      col=RColorBrewer::brewer.pal(12, "Set3"),
                      incl_nonsig=FALSE, 
                      ylab_numdata = 'SNV count', ylab_exp = 'Signature exposure',
                      leg.title = 'Signature')

plot_dp_comp_exposure(mut_example_multi, main_text="Ovarian cancer",
                      dpindices=104+(1:100), 
                      col=RColorBrewer::brewer.pal(12, "Set3"),
                      incl_nonsig=FALSE,
                      ylab_numdata = 'SNV count', ylab_exp = 'Signature exposure',
                      leg.title = 'Signature')

plot_dp_comp_exposure(mut_example_multi, main_text="Melanoma",
                      dpindices=204+(1:100), 
                      col=RColorBrewer::brewer.pal(12, "Set3"),
                      incl_nonsig=FALSE,
                      ylab_numdata = 'SNV count', ylab_exp = 'Signature exposure',
                      leg.title = 'Signature')

plot_dp_comp_exposure(mut_example_multi,
                      dpindices=2:4, incl_numdata_plot=FALSE,
                      col=RColorBrewer::brewer.pal(12, "Set3"),
                      incl_nonsig=TRUE, cex.names=0.8,
                      dpnames=c("Lung Adeno", "Ovarian", "Melanoma"),
                      ylab_exp = 'Signature exposure', leg.title = 'Signature')

Example: matching lung cancers to a prior signature library, while simultaneously discovering new signatures

Initialise HDP structure

Use hdp_prior_init to condition on previously identified signatures (e.g. from COSMIC database), as previously introducted in the other package vignette.

Here, I use 30 previously defined mutational signatures (available from COSMIC website) to define the known set of priors, with a weighting of 1000 pseudocounts on each one. In practice, it may be advisable to put lower weights on prior signatures that you do not expect to be present in your dataset, or even exclude some priors entirely (for example, there may be no need to include the UV radiation signature for cancers of internal organs).

After hdp_prior_init, I use hdp_addconparam to provide additional concentration parameters for the new dataset, and hdp_adddp to set up the node structure for the dataset of 100 lung cancer samples, connected to their group-specific parent node (itself connected to the overall grandparent node). The data is added via hdp_setdata, here using the first 100 rows of mut_count to select the lung samples from Example 1.

cosmic.sigs <- read.table('http://cancer.sanger.ac.uk/cancergenome/assets/signatures_probabilities.txt', header=TRUE, sep='\t')
#  sort by Substitution Type and Trinucleotide
cosmic.sigs <- cosmic.sigs[order(cosmic.sigs$Substitution.Type, cosmic.sigs$Trinucleotide),]
prior_sigs <- as.matrix(cosmic.sigs[,grep('Signature', colnames(cosmic.sigs))])
# number of prior signatures to condition on (30)
nps <- ncol(prior_sigs)
nps

luad_prior <- hdp_prior_init(prior_distn = prior_sigs,
                             prior_pseudoc = rep(1000, nps),
                             hh=rep(1, 96),
                             alphaa=c(1, 1),
                             alphab=c(1, 1))
luad_prior

luad_prior <- hdp_addconparam(luad_prior,
                              alphaa = c(1,1),
                              alphab = c(1,1))

luad_prior <- hdp_adddp(luad_prior,
                        numdp = 101,
                        ppindex = c(1, rep(1+nps+1, 100)),
                        cpindex = c(3, rep(4, 100)))

luad_prior <- hdp_setdata(luad_prior,
                          dpindex = (1+nps+1)+1:100,
                          mut_count[1:100,])

luad_prior

Run multiple posterior sampling chains

Activate the DP nodes added via hdp_adddp (don't activate the frozen pseudo-count nodes for the prior signatures), and then run posterior sampling as usual. I recommend using initcc (number of initialising clusters) slightly higher than the number of prior signatures passed in. In this way, the model initialises with the data spread over all provided priors plus over a few extra clusters solely comprised of new data observations (un-connected to the priors).

# takes about 15 minutes to run - load pre-made version
# chlist <- vector("list", 4)
# for (i in 1:4){
#   luad_activated <- dp_activate(luad_prior,
#                                 dpindex = (1+nps+1)+0:100,
#                                 initcc = nps+5,
#                                 seed = i*1000)
# 
#   chlist[[i]] <- hdp_posterior(luad_activated,
#                                burnin = 4000,
#                                n = 50,
#                                space = 100,
#                                cpiter = 3,
#                                seed = i*1e6)
# }
# 
# luad_multi <- hdp_multi_chain(chlist)
luad_multi

par(mfrow=c(2,2))
p1 <- lapply(chains(luad_multi), plot_lik, bty='L', start=1000)
p2 <- lapply(chains(luad_multi), plot_numcluster, bty='L')
p3 <- lapply(chains(luad_multi), plot_data_assigned, bty='L')

Extract components (mutational signatures)

When extracting components on a HDP model with prior information, the components matching the provided priors (down to a cosine similarity of 0.9) are labelled P, with the numbers reflecting the order in which the priors were provided, and any novel components are labelled N.

In this example, the lung cancer dataset is matched to prior signatures 4, 5, 1, 2, 13, 15, and 17, with one additional signature returned that was not provided in the prior set (N1).

luad_multi <- hdp_extract_components(luad_multi)
luad_multi

par(mfrow=c(1,1), mar=c(5, 4, 4, 2))
plot_comp_size(luad_multi, bty="L")

trinuc_context <- sapply(strsplit(colnames(mut_count), '\\.'), `[`, 4)
group_factor <- as.factor(rep(c("C>A", "C>G", "C>T", "T>A", "T>C", "T>G"),
                              each=16))

plot_comp_distn(luad_multi, cat_names=trinuc_context,
                grouping=group_factor, col=mut_colours,
                col_nonsig="grey80", show_group_labels=TRUE)

plot_dp_comp_exposure(luad_multi, 1+nps+1+(1:100), incl_nonsig = F,
                      col=c('black', RColorBrewer::brewer.pal(8, "Set1")))