In RTCGA/RTCGA: The Cancer Genome Atlas Data Integration

Goals

Here we are presenting some non-standard analyses of RTCGA data.

Starting with BRCA cohort, we check how the number of cases is increasing through consecutive releases.
We are checking how p-values for simple log-rank model are changing through consecutive releases
We are showing that (due to number of genes) some of them have expression confounded with significant clinical outcomes.

1. Download clinical datasets

library(RTCGA)
library(survival)
dates <- checkTCGA("Dates")

library(archivist)
setLocalRepo("~/GitHub/RTCGA/UseCases/")

for (i in seq_along(dates)) {
  try({
    downloadTCGA("BRCA", dataSet = "Clinical", date = daty[i], destDir = "clinical") 
    cat(i,"\n")
  }, silent = TRUE)
}

2. Prepare data

2.1 Prepare expression dataset with RNAseq

library(RTCGA.rnaseq)
expression <- BRCA.rnaseq
rownames(expression) <- expression[,1]
expression <- expression[,-1]
expression <- expression[-1,]
expression <- t(expression)
expression <- as.data.frame(expression)
expression$names <- rownames(expression)
expression <- expression[substr(expression$names, 14, 15) == "01",]
expression$names <- substr(expression$names, 1, 12)

2.2 Load all clinical data

files <- list.files(path = "~/_BiGTCGA_/clinical/", pattern="clin.mer", recursive = TRUE)
files <- paste0("~/_BiGTCGA_/clinical/",files)

# Here gather some useful statistics
n <- c()
names <- c()

# Collect p-values for these genes
selected <- c(23L, 228L, 259L, 309L, 593L, 664L, 665L, 675L, 676L, 717L, 
             847L, 904L, 1148L, 1287L, 1306L, 1369L, 1429L, 1602L, 1718L, 
             1818L, 1856L, 1985L, 2004L, 2034L, 2169L, 2176L, 2248L, 2389L, 
             2478L, 2514L, 2550L, 2551L, 2555L, 2682L, 2944L, 3008L, 3153L, 
             3189L, 3411L, 3640L, 3803L, 3817L, 3857L, 3960L, 4139L, 4157L, 
             4192L, 4338L, 4588L, 4814L, 5179L, 5270L, 5694L, 5744L, 5764L, 
             6028L, 6033L, 6544L, 6593L, 6680L, 6797L, 6798L, 6831L, 6844L, 
             6847L, 6855L, 6878L, 7009L, 7067L, 7082L, 7261L, 7299L, 7430L, 
             7529L, 7857L, 7971L, 7982L, 8015L, 8265L, 8284L, 8316L, 8694L, 
             8706L, 8832L, 9400L, 9585L, 9593L, 9706L, 9734L, 9778L, 9858L, 
             9872L, 9879L, 10206L, 10235L, 10295L, 10511L, 10634L, 10938L, 
             10963L, 11162L, 11174L, 11197L, 11244L, 11257L, 11262L, 11346L, 
             11554L, 11600L, 11713L, 11793L, 11876L, 11879L, 11890L, 11893L, 
             11915L, 11916L, 11917L, 11947L, 11968L, 11971L, 11979L, 11994L, 
             12007L, 12190L, 12257L, 12391L, 12403L, 12575L, 12912L, 13032L, 
             13105L, 13451L, 13486L, 13531L, 13598L, 13617L, 13815L, 14053L, 
             14129L, 14211L, 14289L, 14291L, 14313L, 14389L, 14423L, 14544L, 
             14703L, 14725L, 14760L, 14910L, 14963L, 15101L, 15315L, 15363L, 
             15392L, 15507L, 15696L, 15762L)

## read clinical data 
readClin <- function(fn) {
    clin <- read.table(fn, sep = "\t", header = TRUE, quote="", comment.char = "", row.names = 1)
    clin <- t(clin)
    colnames(clin) <- gsub(colnames(clin), pattern = "_", replacement = "")
    clin <- clin[clin[,"patient.stageevent.tnmcategories.pathologiccategories.pathologict"] != "tx",]
    rr <- substr(clin[,"patient.stageevent.tnmcategories.pathologiccategories.pathologict"],1,2)
    bc <- as.character(clin[,"patient.bcrpatientbarcode"])
    dd <- as.numeric(clin[,"patient.daystodeath"])
    df <- as.numeric(clin[,"patient.daystolastfollowup"])
    dv <- factor(clin[,"patient.vitalstatus"])
    d <- data.frame(barcode = toupper(bc), 
                    dv, time=ifelse(is.na(df), dd, df), patient.stageevent.tnmcategories.pathologiccategories.pathologict=rr)
    d <- unique(d)
    all <- merge(expression, d, by.x = "names", by.y = "barcode")
    all
}

2.3 Calculate p-values for selected genes

pvalues <- matrix(0, length(files), length(selected)) - 1
for (i in seq_along(files)) {
  try({
    all <- readClin(files[i])
    tt <- Surv(all$time, event=all$dv == "death")

    for (j in seq_along(selected)) {
      # convert to numerical, cut into two parts and check the size of groups
      all[,selected[j]] <- as.numeric(as.character(all[,selected[j]]))
      selectedCat <- cut(all[,selected[j]], c(-100,median(all[,selected[j]]),10^9))

      if (min(table(selectedCat)) >= 20) {
        ndf <- data.frame(time = all$time, event = all$dv %in% c("dead", "deceased"), var = selectedCat)
        pvalues[i,j] <- survdiff(Surv(time, event)~var, data=ndf)$chisq
      }
    }

    n[i] <- nrow(all)
    names[i] <- files[i]
  }, silent = TRUE)
}

saveToRepo(names)
saveToRepo(files)
saveToRepo(pvalues)
saveToRepo(n)

3. Number of observations in BRCA / next releases

library(lubridate)
library(ggplot2)

dates <- substr(names, 74, 81)
drd <- na.omit(data.frame(data=ymd(dates), v = n))

saveToRepo(drd)

ggplot(drd[-1,], aes(data,v)) + 
  geom_point(size=3) +
  theme_bw() + xlab("Date of the release") + 
  ylab("# of patients") +
  ggtitle("BRCA")

saveToRepo(.Last.value)

4. p-values for selected genes

pvalues <- pvalues[1:length(names),]
plotPValues <- function(i) {
  drd <- data.frame(data = ymd(substr(names, 74, 81)), 
                    v = pvalues[,i], 
                    p = 1-pchisq(pvalues[,i],1))
  drd <- drd[drd$v > 0,]
  drd <- drd[-1,]

  ggplot(drd, aes(data, p, label=signif(p,2))) + 
    geom_point(size=3) +
    theme_bw() + xlab("Date of the release") + 
    ylab("p-value (survival model)\n for data from this release") +
    geom_text(data=drd[c(which.max(drd$p), which.min(drd$p)),], color="red", nudge_y = .025) + 
    ggtitle(paste0("Gene: ", colnames(all)[selected[i]]))
}

plotPValues( 135 )
plotPValues( 47 )

5. Low p-value, small group

library(survminer)

all <- readClin(files[20])
j <- 15789
all[,j] <- as.numeric(as.character(all[,j]))

vv <- cut(all[,j], c(-100,0,100), labels = c("low expression", "high expression"))
ndf <- data.frame(time = pmax(all$time, 1), event = all$dv %in% c("dead", "deceased"), 
                  var = vv)
ndf <- na.omit(ndf)
fit <- survfit(Surv(time, event) ~ var, data = ndf)
ggsurvplot(fit, pval=F, main = paste("Gene:", colnames(all)[j]))

survdiff(Surv(time, event) ~ var, data = ndf)