####
#
# Teemu Daniel Laajala
# Processing data by Barwick et al.
# GEO: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE18655
# Available data types; total 139 samples, of which TMPRSS2:ERG fusion yes (n=69) / no (n=70)
# Platform: a custom 502 cancer-related gene subpanel; a lot smaller GEX than other datasets
# Raw tarball download address: https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE18655&format=file&file=GSE18655%5FHCP%5FToronto%5Fraw%2Etxt%2Egz
# Platform ID gene annotations: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?view=data&acc=GPL5858&id=7883&db=GeoDb_blob19
#
####
setwd("D:\\curatedProstateData_RAWs\\Barwick\\")
Barwick <- read.table("GSE18655_HCP_Toronto_raw.txt", sep="\t", header=T, skip=4, quote="", stringsAsFactors=F, row.names=1)
Barwick <- as.matrix(Barwick)
Anno <- read.table("GPL5858.txt", sep="\t", header=T, stringsAsFactors=F)
# Invalid RefSeq IDs with trailing ".1" and ".2"
Anno[,"REFSEQ"] <- unlist(lapply(Anno[,"GB_ACC"], FUN=function(z) { substr(z, start=1, nchar(z)-2) }))
# Human genome annotation databases
library("AnnotationDbi")
library("org.Hs.eg.db")
#> length(keys(org.Hs.eg.db, "REFSEQ"))
#[1] 280529
# Annotate gene symbols
Anno[,"GENESYMBOL"] <- unlist(mapIds(org.Hs.eg.db, keys=Anno[,"REFSEQ"], column="SYMBOL", keytype="REFSEQ", multiVals="list"))
# 'select()' returned 1:1 mapping between keys and columns
#> c("BRCA1", "BRCA2", "HOXB13") %in% Anno[,"GENESYMBOL"]
#[1] TRUE TRUE FALSE
## Frequent genes PCa, 2017
# Travis & co: https://www.karger.com/Article/FullText/472146
# Diagnosis
c("BCL2", "HPN", "PCA3", "TP53") %in% Anno[,"GENESYMBOL"]
# Diagnosis & Metas/Recur
c("MYC", "FGFR3") %in% Anno[,"GENESYMBOL"]
# Diagnosis & Metas/Recur & Survival
c("AGR2", "IGFBP3", "AMACR", "HOXC6") %in% Anno[,"GENESYMBOL"]
# Diagnosis & Survival
c("ERG", "PTEN") %in% Anno[,"GENESYMBOL"]
# Metas/Recur
c("MUC1", "VEGFR1", "ABCC") %in% Anno[,"GENESYMBOL"]
# Metas/Recur & Survival
c("RAC3") %in% Anno[,"GENESYMBOL"]
# Survival
c("CD24", "NOB1") %in% Anno[,"GENESYMBOL"]
#> # Diagnosis
#> c("BCL2", "HPN", "PCA3", "TP53") %in% Anno[,"GENESYMBOL"]
#[1] TRUE FALSE FALSE TRUE
#> # Diagnosis & Metas/Recur
#> c("MYC", "FGFR3") %in% Anno[,"GENESYMBOL"]
#[1] TRUE TRUE
#> # Diagnosis & Metas/Recur & Survival
#> c("AGR2", "IGFBP3", "AMACR", "HOXC6") %in% Anno[,"GENESYMBOL"]
#[1] FALSE TRUE FALSE FALSE
#> # Diagnosis & Survival
#> c("ERG", "PTEN") %in% Anno[,"GENESYMBOL"]
#[1] TRUE TRUE
#> # Metas/Recur
#> c("MUC1", "VEGFR1", "ABCC") %in% Anno[,"GENESYMBOL"]
#[1] TRUE FALSE FALSE
#> # Metas/Recur & Survival
#> c("RAC3") %in% Anno[,"GENESYMBOL"]
#[1] FALSE
#> # Survival
#> c("CD24", "NOB1") %in% Anno[,"GENESYMBOL"]
#[1] FALSE FALSE
## -> About half of these "hotspots" were part of Barwick panel
# NAs will be a severe issue for the Barwick panel
# Some gene symbols are multiples
#> all(table(Anno[,"GENESYMBOL"])==1)
#[1] FALSE
#> which(table(Anno[,"GENESYMBOL"])>1)
#MECOM
# 282
# MECOM appears twice
#which(Anno[,"GENESYMBOL"]=="MECOM")
#> cor(t(Barwick[which(Anno[,"GENESYMBOL"]=="MECOM"),]))
# GI_15834616-S-4 GI_20127459-S-4
#GI_15834616-S-4 1.0000000 0.5709279
#GI_20127459-S-4 0.5709279 1.0000000
#
# -> 0.57 Pearson correlation; not great but significantly better than random
#> cor.test(t(Barwick[which(Anno[,"GENESYMBOL"]=="MECOM"),])[,1],t(Barwick[which(Anno[,"GENESYMBOL"]=="MECOM"),])[,2])
#
# Pearson's product-moment correlation
#
#data: t(Barwick[which(Anno[, "GENESYMBOL"] == "MECOM"), ])[, 1] and t(Barwick[which(Anno[, "GENESYMBOL"] == "MECOM"), ])[, 2]
#t = 9.2779, df = 178, p-value < 2.2e-16
#alternative hypothesis: true correlation is not equal to 0
#95 percent confidence interval:
# 0.4633577 0.6619173
#sample estimates:
# cor
#0.5709279
# Data contains technical replicate; collapse these using averages
#> length(gsub("rep1|rep2", "rep", colnames(Barwick)))
#[1] 180
#> length(unique(gsub("rep1|rep2", "rep", colnames(Barwick))))
#[1] 172
# From Barwick description:
# "Data was generated using GenomeStudio software where multiple sample replicates were combined into an averaged signal as were multiple probes averaged into a gene signal"
# Reference value from https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?view=data&acc=GSM463391&id=14485&db=GeoDb_blob38
# (First patient ID)
#ID_REF VALUE
#GI_10092618-S 27387.98
#> Barwick[1:4,1:3]
# X1_rep1 X1_rep2 X10
#GI_10092618-S-3 30329.5600 28241.0300 28005.6900
#GI_10092618-S-1 25222.3700 22141.4000 25871.1700
#GI_10092618-S-2 30202.6800 30705.6000 32448.4700
#GI_10337586-S-1 583.1155 538.5595 520.9351
# -> Average first over columns (technical replicates), then over probes (probes)
mean(c(mean(Barwick[1,1:2]), mean(Barwick[2, 1:2]), mean(Barwick[3, 1:2])))
mean(c(mean(Barwick[1:3,1]), mean(Barwick[1:3,2])))
#> mean(c(mean(Barwick[1,1:2]), mean(Barwick[2, 1:2]), mean(Barwick[3, 1:2])))
#[1] 27807.11
#> mean(c(mean(Barwick[1:3,1]), mean(Barwick[1:3,2])))
#[1] 27807.11
# Some discrepancy, i.e. 27807.11 vs 27387.98
# Possibly because: "Data was quantile normalized with plate scaling for samples run on multiple 96-well Universal Array Matrices"
library(preprocessCore)
Barwick2 <- normalize.quantiles(Barwick)
#> Barwick2[1:4,1:4]
# [,1] [,2] [,3] [,4]
#[1,] 30320.3778 28870.0369 28991.3321 30122.1388
#[2,] 23640.9157 20811.2061 27041.6429 26115.0587
#[3,] 29958.1396 31269.8812 31661.5302 30513.5529
#[4,] 561.0375 555.6551 554.3066 609.2774
#
# Still same issue, not replicating exactly same intensities albeit very close
# Collapse probes
#> length(unique(unlist(lapply(rownames(Barwick), FUN=function(z) substr(z, start=1, stop=nchar(z)-4)))))
#[1] 502
Barwick3 <- do.call("rbind", by(Barwick, INDICES=unlist(lapply(rownames(Barwick), FUN=function(z) substr(z, start=1, stop=nchar(z)-2))), FUN=function(z){
unlist(by(t(z), INDICES=gsub("rep1|rep2", "rep", colnames(Barwick)), FUN=function(z) mean(unlist(z))))
}))
#> all(rownames(Barwick3) == Anno[,"ID"])
#[1] TRUE
# Remove redundant technical replicate suffix
colnames(Barwick3) <- gsub("_rep", "", colnames(Barwick3))
# Sort samples according to increasing ID instead of string based sorting
# For some reason the sample labeled 56 is missing
#
#> setdiff(paste("X",1:173,sep=""),colnames(Barwick3))
#[1] "X56"
# It is also missing from the IDs in https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE18655
Barwick3 <- Barwick3[,c(paste("X",1:55,sep=""),paste("X",57:173,sep=""))]
par(mfrow=c(1,2))
library(oligo)
oligo::boxplot(log2(Barwick3))
Barwick4 <- normalize.quantiles(log2(Barwick3))
oligo::boxplot(Barwick4)
colnames(Barwick4) <- colnames(Barwick3)
# Based on visual inspection, the samples seem to match the GEO IDs, but there are multiple more; fetching from GEO
# GSE18655
library("GEOquery")
BarwickGEO <- getGEO('GSE18655',GSEMatrix=TRUE)[[1]]
BarwickGEONames <- getGEO('GSE18655',GSEMatrix=FALSE)@header$sample_id
# Data looks good
#show(BarwickGEO)
#show(phenoData(BarwickGEO))
#show(pData(phenoData(BarwickGEO)))
# head(pData(phenoData(BarwickGEO)))
BarwickGEOSamples <- paste("X",as.character(pData(phenoData(BarwickGEO))$title), sep="")
# Comparing this to the raw extracted data
head(Barwick3[,BarwickGEOSamples])
head(exprs(BarwickGEO))
plot(unlist(Barwick3[,BarwickGEOSamples]), unlist(exprs(BarwickGEO)), pch=16)
# Almost perfect correlation
plot(unlist(Barwick3[,BarwickGEOSamples]), unlist(exprs(BarwickGEO)), pch=16)
# Averaged samples
cor(unlist(Barwick3[,BarwickGEOSamples]), unlist(exprs(BarwickGEO)))
# Testing Spearman with quantile normalized samples
cor(unlist(Barwick4[,BarwickGEOSamples]), unlist(exprs(BarwickGEO)), method="spearman")
diag(cor(unlist(Barwick4[,BarwickGEOSamples]), unlist(exprs(BarwickGEO)), method="spearman"))
#> > all(diag(cor(unlist(Barwick4[,BarwickGEOSamples]), unlist(exprs(BarwickGEO)), method="spearman")) > 0.995)
#[1] TRUE
# All Barwick samples had over 0.995 Spearman correlation between the newly log2 quantile normalized data from raw files in comparison with the old expressionSet retrieved using GEOquery
Barwick4 <- Barwick4[,BarwickGEOSamples]
rownames(Barwick4) <- Anno[,"GENESYMBOL"]
colnames(Barwick4) <- BarwickGEONames
Barwick4[1:5,1:5]
#> Barwick4[1:5,1:5]
# GSM463391 GSM463392 GSM463393 GSM463394 GSM463395
#NFKBIA 14.725457 14.725457 14.826593 14.826593 14.775766
#FGF6 9.051381 9.086622 9.122977 9.171304 9.220402
#IGFBP5 12.146077 12.750626 12.597781 12.849812 13.118524
#IL6 11.754253 12.740577 12.079711 11.048649 11.376463
#ARHGDIB 14.685890 14.510174 14.775766 14.598148 14.537433
GEX_Barwick <- Barwick4
dim(GEX_Barwick)
#> dim(GEX_Barwick)
#[1] 502 139
# Double checked that RefSeq and gene names match between GPL in GEO and newly created data matrix
# 4 gene annotations are no longer present
#> sum(is.na(rownames(GEX_Barwick)))
#[1] 4
# MECOM is present twice:
#> table(table(rownames(GEX_Barwick)))
#
# 1 2
#496 1
GEX_Barwick <- GEX_Barwick[which(!is.na(rownames(GEX_Barwick))),]
#> cor(t(GEX_Barwick[which(rownames(GEX_Barwick)=="MECOM"),]))
# MECOM MECOM
#MECOM 1.0000000 0.5262722
#MECOM 0.5262722 1.0000000
# Only 0.52 Pearson correlation between the two probes for the same gene?
# Including only the first instance
GEX_Barwick <- GEX_Barwick[-which(rownames(GEX_Barwick)=="MECOM")[2],]
GEX_Barwick <- ExpressionSet(GEX_Barwick)
save(GEX_Barwick, file="GEX_Barwick.RData")
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.