Here we are presenting some non-standard analyses of RTCGA data.
Starting with BRCA cohort, we check how the number of cases is increasing through consecutive releases.
We are checking how p-values for simple log-rank model are changing through consecutive releases
We are showing that (due to number of genes) some of them have expression confounded with significant clinical outcomes.
library(RTCGA) library(survival) dates <- checkTCGA("Dates") library(archivist) setLocalRepo("~/GitHub/RTCGA/UseCases/")
for (i in seq_along(dates)) { try({ downloadTCGA("BRCA", dataSet = "Clinical", date = daty[i], destDir = "clinical") cat(i,"\n") }, silent = TRUE) }
library(RTCGA.rnaseq) expression <- BRCA.rnaseq rownames(expression) <- expression[,1] expression <- expression[,-1] expression <- expression[-1,] expression <- t(expression) expression <- as.data.frame(expression) expression$names <- rownames(expression) expression <- expression[substr(expression$names, 14, 15) == "01",] expression$names <- substr(expression$names, 1, 12)
files <- list.files(path = "~/_BiGTCGA_/clinical/", pattern="clin.mer", recursive = TRUE) files <- paste0("~/_BiGTCGA_/clinical/",files) # Here gather some useful statistics n <- c() names <- c() # Collect p-values for these genes selected <- c(23L, 228L, 259L, 309L, 593L, 664L, 665L, 675L, 676L, 717L, 847L, 904L, 1148L, 1287L, 1306L, 1369L, 1429L, 1602L, 1718L, 1818L, 1856L, 1985L, 2004L, 2034L, 2169L, 2176L, 2248L, 2389L, 2478L, 2514L, 2550L, 2551L, 2555L, 2682L, 2944L, 3008L, 3153L, 3189L, 3411L, 3640L, 3803L, 3817L, 3857L, 3960L, 4139L, 4157L, 4192L, 4338L, 4588L, 4814L, 5179L, 5270L, 5694L, 5744L, 5764L, 6028L, 6033L, 6544L, 6593L, 6680L, 6797L, 6798L, 6831L, 6844L, 6847L, 6855L, 6878L, 7009L, 7067L, 7082L, 7261L, 7299L, 7430L, 7529L, 7857L, 7971L, 7982L, 8015L, 8265L, 8284L, 8316L, 8694L, 8706L, 8832L, 9400L, 9585L, 9593L, 9706L, 9734L, 9778L, 9858L, 9872L, 9879L, 10206L, 10235L, 10295L, 10511L, 10634L, 10938L, 10963L, 11162L, 11174L, 11197L, 11244L, 11257L, 11262L, 11346L, 11554L, 11600L, 11713L, 11793L, 11876L, 11879L, 11890L, 11893L, 11915L, 11916L, 11917L, 11947L, 11968L, 11971L, 11979L, 11994L, 12007L, 12190L, 12257L, 12391L, 12403L, 12575L, 12912L, 13032L, 13105L, 13451L, 13486L, 13531L, 13598L, 13617L, 13815L, 14053L, 14129L, 14211L, 14289L, 14291L, 14313L, 14389L, 14423L, 14544L, 14703L, 14725L, 14760L, 14910L, 14963L, 15101L, 15315L, 15363L, 15392L, 15507L, 15696L, 15762L) ## read clinical data readClin <- function(fn) { clin <- read.table(fn, sep = "\t", header = TRUE, quote="", comment.char = "", row.names = 1) clin <- t(clin) colnames(clin) <- gsub(colnames(clin), pattern = "_", replacement = "") clin <- clin[clin[,"patient.stageevent.tnmcategories.pathologiccategories.pathologict"] != "tx",] rr <- substr(clin[,"patient.stageevent.tnmcategories.pathologiccategories.pathologict"],1,2) bc <- as.character(clin[,"patient.bcrpatientbarcode"]) dd <- as.numeric(clin[,"patient.daystodeath"]) df <- as.numeric(clin[,"patient.daystolastfollowup"]) dv <- factor(clin[,"patient.vitalstatus"]) d <- data.frame(barcode = toupper(bc), dv, time=ifelse(is.na(df), dd, df), patient.stageevent.tnmcategories.pathologiccategories.pathologict=rr) d <- unique(d) all <- merge(expression, d, by.x = "names", by.y = "barcode") all }
pvalues <- matrix(0, length(files), length(selected)) - 1 for (i in seq_along(files)) { try({ all <- readClin(files[i]) tt <- Surv(all$time, event=all$dv == "death") for (j in seq_along(selected)) { # convert to numerical, cut into two parts and check the size of groups all[,selected[j]] <- as.numeric(as.character(all[,selected[j]])) selectedCat <- cut(all[,selected[j]], c(-100,median(all[,selected[j]]),10^9)) if (min(table(selectedCat)) >= 20) { ndf <- data.frame(time = all$time, event = all$dv %in% c("dead", "deceased"), var = selectedCat) pvalues[i,j] <- survdiff(Surv(time, event)~var, data=ndf)$chisq } } n[i] <- nrow(all) names[i] <- files[i] }, silent = TRUE) } saveToRepo(names) saveToRepo(files) saveToRepo(pvalues) saveToRepo(n)
library(lubridate) library(ggplot2) dates <- substr(names, 74, 81) drd <- na.omit(data.frame(data=ymd(dates), v = n)) saveToRepo(drd) ggplot(drd[-1,], aes(data,v)) + geom_point(size=3) + theme_bw() + xlab("Date of the release") + ylab("# of patients") + ggtitle("BRCA") saveToRepo(.Last.value)
pvalues <- pvalues[1:length(names),] plotPValues <- function(i) { drd <- data.frame(data = ymd(substr(names, 74, 81)), v = pvalues[,i], p = 1-pchisq(pvalues[,i],1)) drd <- drd[drd$v > 0,] drd <- drd[-1,] ggplot(drd, aes(data, p, label=signif(p,2))) + geom_point(size=3) + theme_bw() + xlab("Date of the release") + ylab("p-value (survival model)\n for data from this release") + geom_text(data=drd[c(which.max(drd$p), which.min(drd$p)),], color="red", nudge_y = .025) + ggtitle(paste0("Gene: ", colnames(all)[selected[i]])) } plotPValues( 135 ) plotPValues( 47 )
library(survminer) all <- readClin(files[20]) j <- 15789 all[,j] <- as.numeric(as.character(all[,j])) vv <- cut(all[,j], c(-100,0,100), labels = c("low expression", "high expression")) ndf <- data.frame(time = pmax(all$time, 1), event = all$dv %in% c("dead", "deceased"), var = vv) ndf <- na.omit(ndf) fit <- survfit(Surv(time, event) ~ var, data = ndf) ggsurvplot(fit, pval=F, main = paste("Gene:", colnames(all)[j])) survdiff(Surv(time, event) ~ var, data = ndf)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.