R/pipeline_Barwick.R

####
#
# Teemu Daniel Laajala
# Processing data by Barwick et al.
# GEO: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE18655
# Available data types; total 139 samples, of which TMPRSS2:ERG fusion yes (n=69) / no (n=70)
# Platform: a custom 502 cancer-related gene subpanel; a lot smaller GEX than other datasets
# Raw tarball download address: https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE18655&format=file&file=GSE18655%5FHCP%5FToronto%5Fraw%2Etxt%2Egz
# Platform ID gene annotations: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?view=data&acc=GPL5858&id=7883&db=GeoDb_blob19
#
####

setwd("D:\\curatedProstateData_RAWs\\Barwick\\")
Barwick <- read.table("GSE18655_HCP_Toronto_raw.txt", sep="\t", header=T, skip=4, quote="", stringsAsFactors=F, row.names=1)
Barwick <- as.matrix(Barwick)
Anno <- read.table("GPL5858.txt", sep="\t", header=T, stringsAsFactors=F)
# Invalid RefSeq IDs with trailing ".1" and ".2"
Anno[,"REFSEQ"] <- unlist(lapply(Anno[,"GB_ACC"], FUN=function(z) { substr(z, start=1, nchar(z)-2) }))
# Human genome annotation databases
library("AnnotationDbi")
library("org.Hs.eg.db")
#> length(keys(org.Hs.eg.db, "REFSEQ"))
#[1] 280529
# Annotate gene symbols
Anno[,"GENESYMBOL"] <- unlist(mapIds(org.Hs.eg.db, keys=Anno[,"REFSEQ"], column="SYMBOL", keytype="REFSEQ", multiVals="list"))
# 'select()' returned 1:1 mapping between keys and columns

#> c("BRCA1", "BRCA2", "HOXB13") %in% Anno[,"GENESYMBOL"]
#[1]  TRUE  TRUE FALSE

## Frequent genes PCa, 2017
# Travis & co: https://www.karger.com/Article/FullText/472146

# Diagnosis
c("BCL2", "HPN", "PCA3", "TP53") %in% Anno[,"GENESYMBOL"]
# Diagnosis & Metas/Recur
c("MYC", "FGFR3") %in% Anno[,"GENESYMBOL"]
# Diagnosis & Metas/Recur & Survival
c("AGR2", "IGFBP3", "AMACR", "HOXC6") %in% Anno[,"GENESYMBOL"]
# Diagnosis & Survival
c("ERG", "PTEN") %in% Anno[,"GENESYMBOL"]
# Metas/Recur
c("MUC1", "VEGFR1", "ABCC") %in% Anno[,"GENESYMBOL"]
# Metas/Recur & Survival
c("RAC3") %in% Anno[,"GENESYMBOL"]
# Survival
c("CD24", "NOB1") %in% Anno[,"GENESYMBOL"]


#> # Diagnosis
#> c("BCL2", "HPN", "PCA3", "TP53") %in% Anno[,"GENESYMBOL"]
#[1]  TRUE FALSE FALSE  TRUE
#> # Diagnosis & Metas/Recur
#> c("MYC", "FGFR3") %in% Anno[,"GENESYMBOL"]
#[1] TRUE TRUE
#> # Diagnosis & Metas/Recur & Survival
#> c("AGR2", "IGFBP3", "AMACR", "HOXC6") %in% Anno[,"GENESYMBOL"]
#[1] FALSE  TRUE FALSE FALSE
#> # Diagnosis & Survival
#> c("ERG", "PTEN") %in% Anno[,"GENESYMBOL"]
#[1] TRUE TRUE
#> # Metas/Recur
#> c("MUC1", "VEGFR1", "ABCC") %in% Anno[,"GENESYMBOL"]
#[1]  TRUE FALSE FALSE
#> # Metas/Recur & Survival
#> c("RAC3") %in% Anno[,"GENESYMBOL"]
#[1] FALSE
#> # Survival
#> c("CD24", "NOB1") %in% Anno[,"GENESYMBOL"]
#[1] FALSE FALSE

## -> About half of these "hotspots" were part of Barwick panel
# NAs will be a severe issue for the Barwick panel

# Some gene symbols are multiples
#> all(table(Anno[,"GENESYMBOL"])==1)
#[1] FALSE
#> which(table(Anno[,"GENESYMBOL"])>1)
#MECOM 
#  282

# MECOM appears twice
#which(Anno[,"GENESYMBOL"]=="MECOM")
#> cor(t(Barwick[which(Anno[,"GENESYMBOL"]=="MECOM"),]))
#                GI_15834616-S-4 GI_20127459-S-4
#GI_15834616-S-4       1.0000000       0.5709279
#GI_20127459-S-4       0.5709279       1.0000000                
#
# -> 0.57 Pearson correlation; not great but significantly better than random

#> cor.test(t(Barwick[which(Anno[,"GENESYMBOL"]=="MECOM"),])[,1],t(Barwick[which(Anno[,"GENESYMBOL"]=="MECOM"),])[,2])
#
#        Pearson's product-moment correlation
#
#data:  t(Barwick[which(Anno[, "GENESYMBOL"] == "MECOM"), ])[, 1] and t(Barwick[which(Anno[, "GENESYMBOL"] == "MECOM"), ])[, 2]
#t = 9.2779, df = 178, p-value < 2.2e-16
#alternative hypothesis: true correlation is not equal to 0
#95 percent confidence interval:
# 0.4633577 0.6619173
#sample estimates:
#      cor 
#0.5709279

# Data contains technical replicate; collapse these using averages

#> length(gsub("rep1|rep2", "rep", colnames(Barwick)))
#[1] 180
#> length(unique(gsub("rep1|rep2", "rep", colnames(Barwick))))
#[1] 172

# From Barwick description: 
# "Data was generated using GenomeStudio software where multiple sample replicates were combined into an averaged signal as were multiple probes averaged into a gene signal"

# Reference value from https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?view=data&acc=GSM463391&id=14485&db=GeoDb_blob38
# (First patient ID)
#ID_REF	VALUE
#GI_10092618-S	27387.98

#> Barwick[1:4,1:3]
#                   X1_rep1    X1_rep2        X10
#GI_10092618-S-3 30329.5600 28241.0300 28005.6900
#GI_10092618-S-1 25222.3700 22141.4000 25871.1700
#GI_10092618-S-2 30202.6800 30705.6000 32448.4700
#GI_10337586-S-1   583.1155   538.5595   520.9351

# -> Average first over columns (technical replicates), then over probes (probes)

mean(c(mean(Barwick[1,1:2]), mean(Barwick[2, 1:2]), mean(Barwick[3, 1:2])))
mean(c(mean(Barwick[1:3,1]), mean(Barwick[1:3,2])))
#> mean(c(mean(Barwick[1,1:2]), mean(Barwick[2, 1:2]), mean(Barwick[3, 1:2])))
#[1] 27807.11
#> mean(c(mean(Barwick[1:3,1]), mean(Barwick[1:3,2])))
#[1] 27807.11

# Some discrepancy, i.e. 27807.11 vs 27387.98
# Possibly because: "Data was quantile normalized with plate scaling for samples run on multiple 96-well Universal Array Matrices"

library(preprocessCore)
Barwick2 <- normalize.quantiles(Barwick)

#> Barwick2[1:4,1:4]
#           [,1]       [,2]       [,3]       [,4]
#[1,] 30320.3778 28870.0369 28991.3321 30122.1388
#[2,] 23640.9157 20811.2061 27041.6429 26115.0587
#[3,] 29958.1396 31269.8812 31661.5302 30513.5529
#[4,]   561.0375   555.6551   554.3066   609.2774
#
# Still same issue, not replicating exactly same intensities albeit very close

# Collapse probes
#> length(unique(unlist(lapply(rownames(Barwick), FUN=function(z) substr(z, start=1, stop=nchar(z)-4)))))
#[1] 502
Barwick3 <- do.call("rbind", by(Barwick, INDICES=unlist(lapply(rownames(Barwick), FUN=function(z) substr(z, start=1, stop=nchar(z)-2))), FUN=function(z){
	unlist(by(t(z), INDICES=gsub("rep1|rep2", "rep", colnames(Barwick)), FUN=function(z) mean(unlist(z))))
}))
#> all(rownames(Barwick3) == Anno[,"ID"])
#[1] TRUE

# Remove redundant technical replicate suffix
colnames(Barwick3) <- gsub("_rep", "", colnames(Barwick3))
# Sort samples according to increasing ID instead of string based sorting

# For some reason the sample labeled 56 is missing
#
#> setdiff(paste("X",1:173,sep=""),colnames(Barwick3))
#[1] "X56"
# It is also missing from the IDs in https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE18655

Barwick3 <- Barwick3[,c(paste("X",1:55,sep=""),paste("X",57:173,sep=""))]

par(mfrow=c(1,2))

library(oligo)
oligo::boxplot(log2(Barwick3))

Barwick4 <- normalize.quantiles(log2(Barwick3))
oligo::boxplot(Barwick4)
colnames(Barwick4) <- colnames(Barwick3)

# Based on visual inspection, the samples seem to match the GEO IDs, but there are multiple more; fetching from GEO
# GSE18655

library("GEOquery")
BarwickGEO <- getGEO('GSE18655',GSEMatrix=TRUE)[[1]]

BarwickGEONames <- getGEO('GSE18655',GSEMatrix=FALSE)@header$sample_id

# Data looks good
#show(BarwickGEO)
#show(phenoData(BarwickGEO))
#show(pData(phenoData(BarwickGEO)))

# head(pData(phenoData(BarwickGEO)))

BarwickGEOSamples <- paste("X",as.character(pData(phenoData(BarwickGEO))$title), sep="")

# Comparing this to the raw extracted data
head(Barwick3[,BarwickGEOSamples])
head(exprs(BarwickGEO))

plot(unlist(Barwick3[,BarwickGEOSamples]), unlist(exprs(BarwickGEO)), pch=16)

# Almost perfect correlation
plot(unlist(Barwick3[,BarwickGEOSamples]), unlist(exprs(BarwickGEO)), pch=16)

# Averaged samples
cor(unlist(Barwick3[,BarwickGEOSamples]), unlist(exprs(BarwickGEO)))
# Testing Spearman with quantile normalized samples
cor(unlist(Barwick4[,BarwickGEOSamples]), unlist(exprs(BarwickGEO)), method="spearman")

diag(cor(unlist(Barwick4[,BarwickGEOSamples]), unlist(exprs(BarwickGEO)), method="spearman"))
#> > all(diag(cor(unlist(Barwick4[,BarwickGEOSamples]), unlist(exprs(BarwickGEO)), method="spearman")) > 0.995)
#[1] TRUE

# All Barwick samples had over 0.995 Spearman correlation between the newly log2 quantile normalized data from raw files in comparison with the old expressionSet retrieved using GEOquery

Barwick4 <- Barwick4[,BarwickGEOSamples]
rownames(Barwick4) <- Anno[,"GENESYMBOL"]
colnames(Barwick4) <- BarwickGEONames
Barwick4[1:5,1:5]
#> Barwick4[1:5,1:5]
#        GSM463391 GSM463392 GSM463393 GSM463394 GSM463395
#NFKBIA  14.725457 14.725457 14.826593 14.826593 14.775766
#FGF6     9.051381  9.086622  9.122977  9.171304  9.220402
#IGFBP5  12.146077 12.750626 12.597781 12.849812 13.118524
#IL6     11.754253 12.740577 12.079711 11.048649 11.376463
#ARHGDIB 14.685890 14.510174 14.775766 14.598148 14.537433

GEX_Barwick <- Barwick4
dim(GEX_Barwick)
#> dim(GEX_Barwick)
#[1] 502 139

# Double checked that RefSeq and gene names match between GPL in GEO and newly created data matrix


# 4 gene annotations are no longer present
#> sum(is.na(rownames(GEX_Barwick)))
#[1] 4

# MECOM is present twice:
#> table(table(rownames(GEX_Barwick)))
#
#  1   2 
#496   1

GEX_Barwick <- GEX_Barwick[which(!is.na(rownames(GEX_Barwick))),]

#> cor(t(GEX_Barwick[which(rownames(GEX_Barwick)=="MECOM"),]))
#          MECOM     MECOM
#MECOM 1.0000000 0.5262722
#MECOM 0.5262722 1.0000000

# Only 0.52 Pearson correlation between the two probes for the same gene?
# Including only the first instance
GEX_Barwick <- GEX_Barwick[-which(rownames(GEX_Barwick)=="MECOM")[2],]


GEX_Barwick <- ExpressionSet(GEX_Barwick)

save(GEX_Barwick, file="GEX_Barwick.RData")
Syksy/curatedTools documentation built on May 27, 2019, 9:55 a.m.