In vjcitn/ivygapSE: A SummarizedExperiment for Ivy-GAP data

The ivygapSE package includes molecular, imaging, and limited clinical data on glioblastoma (GBM) patients.

Expression data (RNA-seq) was developed in a complex design involving tissue blocks and subblocks and a selection process. See documents at https://vjcitn.github.io/ivygapSE for more details.

To work with the available survival data in an elementary way, the following steps can be taken.

ii = rownames(installed.packages())
if (!("ivygapSE" %in% ii)) BiocManager::install("ivygapSE")

suppressPackageStartupMessages({
library(ivygapSE)
library(survival)
})
data(ivySE)
ivySE

This will show a SummarizedExperiment. There are 270 RNA-seq samples, derived from 37 patients. The variable tumor_name is used to identify different individual donors to the project.

table(table(ivySE$tumor_name))

Molecular subtypes are recorded.

moltype = tumorDetails(ivySE)$molecular_subtype
names(moltype) = tumorDetails(ivySE)$tumor_name
moltype[nchar(moltype)==0] = "missing"
ivySE$moltype = factor(moltype[ivySE$tumor_name])
table(ivySE$moltype)

This shows that there are 2 tumors for which 18 RNA-seq studies were produced. The RNA-seq samples are likely from different subblocks and may represent anatomically distinct components of the tumor. Our job is to help organize data and analyses to help understand relationships among many aspects of data collected in this study.

The first basic exercise is to assess the prognostic information in a variable called mgmt_methylation, collected on most tumors. There is literature indicating that methylation of the promoter of the MGMT gene is associated with a more favorable survival profile.

Unfortunately the survival data is incomplete. It can be obtained as follows.

metadata(ivySE)$tumorDetails$survival_days

We'll make a subset of the tumor data for those with non-missing survival times.

td = metadata(ivySE)$tumorDetails
tdok = td[-which(is.na(td$survival_days)),]

Now we'll produce K-M plots.

Overall survival:

library(survival)
gbmsurv = Surv( tdok$survival_days, rep(1, nrow(tdok)) )
plot(survfit(gbmsurv~1))

Split by MGMT methylation

plot(survfit(gbmsurv~I(tdok$mgmt_methylation=="Yes")), lty=1:2)
legend(x=800,y=1,c("mgmt meth -","mgmt meth +"),lty = 1:2)

table(tdok$mgmt_methylation)

struc = as.character(colData(ivySE)$structure_acronym)
spls = strsplit(struc, "-")
basis = vapply(spls, function(x) x[1], character(1))

specific = which(basis!= "CT")

iseSP = ivySE[ , specific]

spbasis = basis[specific]

useful = c("EIF4E3", "RNASE4", "PBK", "RYR2", "TPD52L1", 
    "GIMAP6", "PRKCB", "ATP6V1A", "RUNDC3B", "CCPG1")

splim = iseSP[useful,]

bb = prcomp(log(t(assay(splim))+1))
rownames(bb$x) = spbasis

biplot(bb)

bb2 = bb
rownames(bb2$x) = splim$short_moltype
biplot(bb2, cex=c(.7,.4))

bb3 = bb
rownames(bb3$x) = splim$short_study
biplot(bb3, cex=c(.7,.4))

mixed effect models for gene expression

library(ivygapSE)
data(ivySE)
library(nlme)
eexp = as.numeric(log(assay(ivySE["EIF4E3",])+1))
library(nlme)
newdf = data.frame(eexp=eexp, id=ivySE$tumor_id, bl=ivySE$block_name, specimen_id=as.character(ivySE$specimen_id))
newdf$rnase4 = as.numeric(log(assay(ivySE["RNASE4",])+1))
mod1 = lme(eexp~rnase4, random=~id|bl, data=newdf)
summary(mod1)

different genes

library(ivygapSE)
library(MASS)
library(nlme)

data(ivySE)

# In this code we drop out the "CT"-only samples
struc = as.character(colData(ivySE)$structure_acronym)
spls = strsplit(struc, "-")
basis = vapply(spls, function(x) x[1], character(1))
specific = which(basis!= "CT")
iseSP = ivySE[ , specific]
#spbasis = basis[specific]
useful = c("EIF4E3", "RNASE4", "PBK", "RYR2", "TPD52L1",
    "GIMAP6", "PRKCB", "ATP6V1A", "RUNDC3B", "CCPG1")
splim = iseSP[useful,]
#bb = prcomp(log(t(assay(splim))+1))
#rownames(bb$x) = spbasis

cd = colData(splim)
stac = cd$structure_acronym
stac1 = sapply(strsplit(stac, "-"), "[[", 1)
table(stac1)

eexp = as.numeric(log(assay(splim["EIF4E3",])+1))
newdf = data.frame(eexp=eexp, id=splim$tumor_id, bl=splim$block_name, specimen_id=as.character(splim$specimen_id))
newdf$rnase4 = as.numeric(log(assay(splim["RNASE4",])+1))

newdf$isCThbv = ifelse(stac1=="CThbv", 1, 0)
table(newdf$isCThbv)
newdf$isLE = ifelse(stac1=="LE", 1, 0)
table(newdf$isLE)

g1 = glmmPQL(isLE~rnase4, data=newdf, fam=binomial, random=~1|id/bl)
summary(g1)
g2 = glmmPQL(isCThbv~rnase4, data=newdf, fam=binomial, random=~1|id/bl)
summary(g2)

# new gene

newdf$ryr2 = as.numeric(log(assay(splim["RYR2",])+1))
g3 = glmmPQL(isLE~ryr2, data=newdf, fam=binomial, random=~1|id/bl)
summary(g3)
g4 = glmmPQL(isLE~ryr2+rnase4, data=newdf, fam=binomial, random=~1|id/bl)
summary(g4)

newdf$eif4e3 = as.numeric(log(assay(splim["EIF4E3",])+1))
g5 = glmmPQL(isLE~eif4e3, data=newdf, fam=binomial, random=~1|id/bl)
summary(g5)
g6 = glmmPQL(isLE~eif4e3+rnase4, data=newdf, fam=binomial, random=~1|id/bl)
summary(g6)

newdf$pbk = as.numeric(log(assay(splim["PBK",])+1))
g7 = glmmPQL(isLE~pbk, data=newdf, fam=binomial, random=~1|id/bl)
summary(g7)
g8 = glmmPQL(isLE~pbk+rnase4, data=newdf, fam=binomial, random=~1|id/bl)
summary(g8)

newdf$TPD52L1 = as.numeric(log(assay(splim["TPD52L1",])+1))
g9 = glmmPQL(isLE~TPD52L1, data=newdf, fam=binomial, random=~1|id/bl)
summary(g9)
g10 = glmmPQL(isLE~TPD52L1+rnase4, data=newdf, fam=binomial, random=~1|id/bl)
summary(g10)

g11 = glmmPQL(isLE~rnase4+eif4e3, data=newdf, fam=binomial, random=~1|id/bl)
summary(g11)

g12 = glmmPQL(isLE~rnase4+eif4e3+TPD52L1, data=newdf, fam=binomial, random=~1|id/bl)
summary(g12)

g13 = glmmPQL(isLE~rnase4+eif4e3+TPD52L1+GIMAP6, data=newdf, fam=binomial, random=~1|id/bl)
summary(g13)

g14 = glm(isLE~rnase4+eif4e3+TPD52L1+GIMAP6+PRKCB, data=newdf, fam=binomial)