#' Genomic risk scores
#'
#' Prolaris, Oncotype DX, Decipher transcriptome panels for prostate cancer risk
#'
#' @param mae MultiAssayExperiment object with gene expression available
#' @param slot Name of the Gene Expression slot, by default grepping for 'gex' prefix and picking the hit
#' @param test Type of test; available: "Prolaris", "Oncotype DX", and "Decipher" (case insensitive)
#' @param log Should data be log(x+1)-transformed prior to calculating the risk score
#'
#' @details https://bjui-journals.onlinelibrary.wiley.com/doi/10.1111/bju.14452 https://bmcgenomics.biomedcentral.com/articles/10.1186/1471-2164-14-690 https://www.nature.com/articles/s41391-019-0167-9
#'
#' @noRd
#' @keywords internal
genomic_risk <- function(mae,
slot = grep("gex", names(mae), value = TRUE),
test = c("Prolaris", "Oncotype DX", "Decipher"),
log = FALSE # Should log-transformation be applied to the data for genomic risk score calculations; should not be utilized if data is transformed already
) {
# Internal function for automatically extracting the latest curatedPCaData::curatedPCaData_genes[,"Aliases"] for a specific hugo symbol
expandAliases <- function(gene) {
if (length(which(curatedPCaData_genes$hgnc_symbol == gene)) > 0) {
unique(c(
gene,
unlist(
strsplit(curatedPCaData_genes[which(curatedPCaData_genes$hgnc_symbol == gene), "Aliases"], ";")[[1]]
)
))
} else {
gene
}
}
# Internal function that extract expression value if it is present in matrix; otherwise imputes defined value, and gives a warning
# List of aliases, and data matrix x with columns as genes
extractGene <- function(genelist, x, impute = NA) {
if (any(genelist %in% colnames(x))) {
# Take the most recent alias hit
x[, intersect(colnames(x), genelist)[1]]
} else {
warning(paste("Gene list", paste(genelist, collapse = ";"), "not found from data matrix"))
rep(impute, times = nrow(x))
}
}
# Prolaris
prolaris_genes <- list(
"FOXM1" = c("FOXM1"),
"CDC20" = c("CDC20"),
"CDKN3" = c("CDKN3"),
"CDC2" = c("CDC2"),
"KIF11" = c("KIF11"),
"KIAA0101" = c("KIAA0101"),
"NUSAP1" = c("NUSAP1"),
"CENPF" = c("CENPF"),
"ASPM" = c("ASPM"),
"BUB1B" = c("BUB1B"),
"RRM2" = c("RRM2"),
"DLGAP5" = c("DLGAP5"),
"BIRC5" = c("BIRC5"),
"KIF20A" = c("KIF20A"),
"PLK1" = c("PLK1"),
"TOP2A" = c("TOP2A"),
"TK1" = c("TK1"),
"PBK" = c("PBK"),
"ASF1B" = c("ASF1B"),
"C18orf24" = c("C18orf24"),
"RAD54L" = c("RAD54L"),
"PTTG1" = c("PTTG1"),
"CDCA3" = c("CDCA3"),
"MCM10" = c("MCM10"),
"PRC1" = c("PRC1"),
"DTL" = c("DTL"),
"CEP55" = c("CEP55"),
"RAD51" = c("RAD51"),
"CENPM" = c("CENPM"),
"CDCA8" = c("CDCA8"),
"ORC6L" = c("ORC6L"),
"SKA1" = c("SKA1"),
"ORC6" = c("ORC6"),
"CDK1" = c("CDK1")
)
prolaris_genes <- lapply(prolaris_genes, FUN = expandAliases)
# Oncotype DX
oncotype_genes <- list(
"AZGP1" = c("AZGP1"),
"KLK2" = c("KLK2"),
"SRD5A2" = c("SRD5A2"),
"FAM13C" = c("FAM13C"),
"FLNC" = c("FLNC"),
"GSN" = c("GSN"),
"TPM2" = c("TPM2"),
"GSTM2" = c("GSTM2"),
"TPX2" = c("TPX2"),
"BGN" = c("BGN"),
"COL1A1" = c("COL1A1"),
"SFRP4" = c("SFRP4")
)
oncotype_genes <- lapply(oncotype_genes, FUN = expandAliases)
# https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3691249/
# Table 2
decipher_genes_over <- list(
"CAMK2N1" = c("CAMK2N1"),
"EPPK1" = c("EPPK1"),
"IQGAP3" = c("IQGAP3"),
"LASP1" = c("LASP1"),
"NFIB" = c("NFIB"),
"NUSAP1" = c("NUSAP1"),
"PBX1" = c("PBX1"),
"S1PR4" = c("S1PR4"),
"THBS2" = c("THBS2"),
"UBE2C" = c("UBE2C"),
"ZWILCH" = c("ZWILCH")
)
decipher_genes_over <- lapply(decipher_genes_over, FUN = expandAliases)
decipher_genes_under <- list(
"ANO7" = c("ANO7"),
"C6orf10" = c("C6orf10", "TSBP1"),
"PCDH7" = c("PCDH7"),
"MYBPC1" = c("MYBPC1"),
"TSBP" = c("TSBP"),
"RABGAP1" = c("RABGAP1"),
"PCAT-32" = c("PCAT-32", "PCAT1"),
"PCAT-80" = c("PCAT-80", "GLYATL1P4"),
"TNFRSF19" = c("TNFRSF19")
)
decipher_genes_under <- lapply(decipher_genes_under, FUN = expandAliases)
# Gene name annotations / changes from original publication
# C6orf10 -> TSBP1
# PCAT-32 -> PCAT1
dat <- mae@ExperimentList[[slot]]
dat <- t(dat)
dat <- as.data.frame(dat)
# Log-transformation
if (log) {
dat <- log2(dat + 1)
}
# Prolaris
if (base::tolower(test) == "prolaris") {
if (length(intersect(colnames(dat), unlist(prolaris_genes))) != 31) {
warning(
"Prolaris risk score based off of ",
length(intersect(colnames(dat), unlist(prolaris_genes))),
" out of 31 genes"
)
}
risk <- dat[, colnames(dat) %in% unlist(prolaris_genes)]
gene_med <- apply(risk, MARGIN = 2, stats::median)
# risk_centered <- risk - gene_med
# Genes should be shifted in a row-wise manner
risk_centered <- t(apply(risk, MARGIN = 1, FUN = function(x) {
x - gene_med
}))
# Squaring the median centered expression values
risk_centered <- risk_centered^2
risk_score <- apply(risk_centered, 1, mean)
risk_score <- log2(risk_score)
return(risk_score)
# Oncotype DX
} else if (base::tolower(test) %in% c("oncotype dx", "oncotypedx", "oncotype")) {
if (!length(intersect(colnames(dat), unlist(oncotype_genes))) == 12) {
warning(
"The following required Oncotype DX genes: ",
paste(setdiff(names(oncotype_genes), colnames(dat)), collapse = ", "),
" are not present in colnames of the data"
)
}
# dat$TPX2_bounded <- ifelse(dat$TPX2 < 5, 5, dat$TPX2)
# dat$SRD5A2_bounded <- ifelse(dat$SRD5A2 < 5.5, 5.5, dat$SRD5A2)
# cellular_organization_module = dat$FLNC + dat$GSN + dat$TPM2 + dat$GSTM2
# stromal_module = dat$BGN + dat$COL1A1 + dat$SFRP4
# androgen_module = dat$FAM13C + dat$KLK2 + dat$SRD5A2_bounded + dat$AZGP1
cellular_organization_module <-
# (0.163*dat$FLNC) +
(0.163 * extractGene(genelist = oncotype_genes[["FLNC"]], x = dat, impute = 0)) +
# (0.504*dat$GSN) +
(0.504 * extractGene(genelist = oncotype_genes[["GSN"]], x = dat, impute = 0)) +
# (0.421*dat$TPM2) +
(0.421 * extractGene(genelist = oncotype_genes[["TPM2"]], x = dat, impute = 0)) +
# (0.394*dat$GSTM2)
(0.394 * extractGene(genelist = oncotype_genes[["GSTM2"]], x = dat, impute = 0))
stromal_module <-
# (0.527*dat$BGN) +
(0.527 * extractGene(genelist = oncotype_genes[["BGN"]], x = dat, impute = 0)) +
# (0.457*dat$COL1A1) +
(0.457 * extractGene(genelist = oncotype_genes[["COL1A1"]], x = dat, impute = 0)) +
# (0.156*dat$SFRP4)
(0.156 * extractGene(genelist = oncotype_genes[["SFRP4"]], x = dat, impute = 0))
androgen_module <-
# (0.634*dat$FAM13C) +
(0.634 * extractGene(genelist = oncotype_genes[["FAM13C"]], x = dat, impute = 0)) +
# (1.079*dat$KLK2) +
(1.079 * extractGene(genelist = oncotype_genes[["KLK2"]], x = dat, impute = 0)) +
# (0.997*dat$SRD5A2_bounded) +
# Lower bound 5.5
(0.997 * unlist(lapply(extractGene(genelist = oncotype_genes[["SRD5A2"]], x = dat, impute = 0), FUN = function(x) {
max(5.5, x)
}))) +
# (0.642*dat$AZGP1)
(0.642 * extractGene(genelist = oncotype_genes[["AZGP1"]], x = dat, impute = 0))
proliferation_module <- # dat$TPX2_bounded
# Lower bound 5
unlist(lapply(extractGene(genelist = oncotype_genes[["TPX2"]], x = dat, impute = 0), FUN = function(x) {
max(5, x)
}))
risk_score <- 0.735 * stromal_module - 0.368 * cellular_organization_module - 0.352 * androgen_module + 0.095 * proliferation_module
names(risk_score) <- rownames(dat)
return(risk_score)
# Decipher
} else if (base::tolower(test) %in% c("decipher", "decypher")) {
if (length(intersect(colnames(dat), c(unlist(decipher_genes_over), unlist(decipher_genes_under)))) != 19) {
warning(
"The following required Decipher genes: ",
paste(setdiff(c(unlist(decipher_genes_over), unlist(decipher_genes_under)), colnames(dat)), collapse = ", "),
" are not present in colnames of the GEX data"
)
}
# Take intersection of available gene names and over and under expressed gene symbols and their aliases
over <- intersect(colnames(dat), unlist(decipher_genes_over))
under <- intersect(colnames(dat), unlist(decipher_genes_under))
# Intersect between Decipher risk score genes ideally available and current data matrix
risk <- dat[, intersect(colnames(dat), c(over, under))]
# Median centering
gene_med <- apply(risk, MARGIN = 2, stats::median)
# risk_centered <- risk - gene_med
# Genes should be shifted in a row-wise manner
risk_centered <- t(apply(risk, MARGIN = 1, FUN = function(x) {
x - gene_med
}))
# average of the log2 normalized values for the 9 over-expressed targets
c1 <- apply(risk_centered[, over], MARGIN = 1, FUN = function(x) {
mean(x, na.rm = TRUE)
})
# average of the log2 normalized values for the 9 under-expressed targets
c2 <- apply(risk_centered[, under], MARGIN = 1, FUN = function(x) {
mean(x, na.rm = TRUE)
})
# Risk score as the difference between over- and underrepresented genes
risk_score <- c1 - c2
return(risk_score)
} else {
stop(paste("Invalid genomic risk score name:", test))
}
}
#' Various genomic scores
#'
#' AR score by Hieronymus et al 2006 as used by TCGA 2015
#' Quote: "To address these questions, we sought to infer the AR output of tumors
#' by calculating an AR activity score from the expression pattern of 20 genes that
#' are experimentally validated AR transcriptional targets (Hieronymus et al., 2006)."
#'
#' @noRd
#' @keywords internal
genomic_score <- function(mae, # MultiAssayExperiment object
slot = "gex", # Slot inside MAE object to use as the gene expression
test = "AR", # Test/score to calculate; by default Androgen Receptor (AR) score is calculated
verbose = TRUE # Warnings for not found symbols etc
) {
# TCGA methodology for AR output score analysis: (Section 6 in supplementary of https://www.cell.com/cms/10.1016/j.cell.2015.10.025/attachment/70a60372-cdaf-4c72-aa6d-ded4b33ce5a0/mmc1.pdf )
# "The AR output score is derived from the mRNA expression of genes that are experimentally
# validated AR transcriptional targets (Hieronymus et al., 2006). Precisely, a list of 20 genes
# upregulated in LNCaP cells stimulated with the synthetic androgen R1881 was used as a gene
# signature of androgen-induced genes. An AR output score was defined by the quantification of
# the composite expression of this 20-gene signature in each sample. Here, we measured
# differential AR activity between genomic subtypes (ERG, ETV1/4/FLI1, SPOP, FOXA1, other,
# normal prostate). To this aim, we computed a Z-score for the expression of each gene in each
# sample by subtracting the pooled mean from the RNA-seq expression values and dividing by
# the pooled standard deviation."
# https://www.sciencedirect.com/science/article/pii/S1535610806002820
# Fig 1B
# " A gene expression signature of androgen stimulation was defined from gene expression profiles of LNCaP cells stimulated with the synthetic androgen R1881 for 12 hr and 24 hr,
# as compared to androgen-deprived LNCaP cells. The 27 gene signature contains both androgen-induced and androgen-repressed genes, shown here by row-normalized heat map."
#
## Genes as they were in Hieronymus et al 2006
if (FALSE) {
hieronymus_genes_up <- c(
"KLK3", # "PSA", # PSA -> KLK3 gene
"TMPRSS2", "NKX3.1", # "NKX3-1", # Aliases
"KLK2", "GNMT", "PMEPA1", # "TMEPAI", # Updated annotation; TMEPAI -> PMEPA1
"MPHOSHP9", # "MPHOS9", # MPHOS9 -> MPHOSPH9
"ZBTB10", "EAF2",
"CENPN", # "BM039", # BM039 -> CENPN
# "SARG", # SARG not found; could be C1orf116
"ACSL3", "PTGER4", "ABCC4",
"NNMT", "ADAM7", "FKBP5", "ELL2", "MED28", "HERC3", "MAF"
)
# Based on Fib 1C ELL2 might fit better into down than up
hieronymus_genes_dn <- c("TNK1", "GLRA2", "MAPRE2", "PIP5K2B", "MAN1A1", "CD200")
}
# TCGA version of Hieronymus AR-genes (panel A rows): https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4695400/figure/F4/
if (FALSE) {
hieronymus_genes_up <- c(
"KLK3", "KLK2", "PMEPA1",
"ABCC4",
"NKX3-1", "NKX3.1", # Two naming conventions, '-' replaced with '.'
"C1orf116", # Not conventionally found from GEX
"FKBP5", "ACSL3", "ZBTB10", "HERC3",
"PTGER4", "MPHOSPH9", "EAF2", "MED28", "NNMT", "MAF",
"GNMT", "CENPN", "ELL2", "TMPRSS2"
)
}
# TCGA version of AR score supporting gene aliases and different naming conventions
if (base::tolower(test) %in% c("ar", "ar score", "ar-score", "ar_score")) {
# Aliases queried using https://www.genecards.org/
ar_genes <- list(
"KLK3" = c("KLK3", "PSA", "APS", "KLK2A1"), # Possibly HK3; ambiguous
"KLK2" = c("KLK2", "HGK-1", "HGK.1", "KLK2A2"), # Possibly HK2; ambiguous
"PMEPA1" = c("PMEPA1", "STAG1", "TMEPAI"),
"ABCC4" = c("ABCC4", "MRP4", "MOATB", "MOAT-B", "MOAT.B"),
"NKX3-1" = c("NKX3-1", "NKX3.1", "BAPX2", "NKX3A"),
"C1orf116" = c("C1orf116", "SARG", "FLJ36507", "MGC2742", "MGC4309"),
"FKBP5" = c("FKBP5", "FKBP51", "FKBP54", "FKBP-51", "FKBP.51", "AIG6", "FKBP-5", "FKBP.5"),
"ACSL3" = c("ACSL3", "FACL3", "LACS3"),
"ZBTB10" = c("ZBTB10", "RINZFC", "RINZF"),
"HERC3" = c("HERC3"),
"PTGER4" = c("PTGER4", "EP4", "P4R"), # Possibly PTGER2; ambiguous
"MPHOSPH9" = c("MPHOSPH9", "MPHOS9", "MPP9", "MPP-9", "MPP.9"),
"EAF2" = c("EAF2", "TRAITS", "BM040", "U19"),
"MED28" = c("MED28", "EG1"),
"NNMT" = c("NNMT"),
"MAF" = c("MAF", "CTRCT21", "AYGRP", "CCA4", "C-MAF", "C.MAF"),
"GNMT" = c("GNMT"),
"CENPN" = c("CENPN", "ICEN32", "BM039", "FLJ13607", "FLJ22660"),
"ELL2" = c("ELL2", "MRCCAT1"),
"TMPRSS2" = c("TMPRSS2", "PRSS10")
)
missing <- 0
# Pooled normalization - pooling within sample over all genes as done in TCGA
gex <- mae[[slot]]
gez <- t(apply(gex, MARGIN = 1, FUN = function(z) {
scale(z, center = TRUE, scale = TRUE)
}))
dimnames(gez) <- dimnames(gex)
# TCGA computed AR scores based on just up regulated genes; sum of z-scores from pooled normalization
# Check for gene name overlaps and warn of missing ones
if (verbose) {
lapply(1:length(ar_genes), FUN = function(z) {
if (length(intersect(rownames(gez), ar_genes[[z]])) > 1) {
warning(paste("Warning! More than one gene name alias found for:", names(ar_genes)[z]))
}
if (length(intersect(rownames(gez), ar_genes[[z]])) == 0) {
warning(paste("Warning! No gene name from alias list found for:", names(ar_genes)[z]))
}
})
}
# lapply over patients
res <- unlist(lapply(colnames(gez), FUN = function(patient) {
# lapply over genes for AR score and their aliases
sum(unlist(lapply(ar_genes, FUN = function(z) {
gez[intersect(rownames(gez), z), patient]
})), na.rm = TRUE)
}))
names(res) <- colnames(gex)
res
} else {
stop(paste("Unknown genomic score parameter:", test))
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.