R/pipeline_Sboner.R

###
#
# Teemu Daniel Laajala
# Pipeline from processing the data from
# https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE16560
# Platform: GPL5474	Human 6k Transcriptionally Informative Gene Panel for DASL
# N = 281
#
# PLATFORM
# Status	Public on Jan 07, 2008
# Title	Human 6k Transcriptionally Informative Gene Panel for DASL
# Technology type	oligonucleotide beads
# Distribution	custom-commercial
# Organism	Homo sapiens
# Manufacturer	Illumina Inc.
# Manufacture protocol	http://www.illumina.com/pages.ilmn?ID=10
#
# ORIGINAL DATA PROCESSING
#
# For each probe, Red and Green intensities are computed as average of single bead values. The final signal is the sum of the Red and Green intensities. Data were median-centered by subtraction of median (log2 scale) within sample and within DAP, and median set to a target value computed as the overall within DAP median. Data were then normalized using the qspline algorithm (on the log2 intensity scale) with reference distribution computed as the marginal (always within DAP) mean of every gene
# 
###

if (!requireNamespace("BiocManager", quietly = TRUE))
    install.packages("BiocManager")
BiocManager::install("GEOquery", version = "3.8")
# Required illumina annotations (v3 human)
BiocManager::install("illuminaHumanv3.db", version = "3.8")

# Optionally, relative path to the readily downloaded supplementary files

library(GEOquery)

# Annotation for the probes
#tmp <- read.table(url("https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?view=data&acc=GPL5474&id=8021&db=GeoDb_blob92"), comment="#", sep="\t")
tmp <- read.table("GPL5474.txt", sep="\t", comment="#", header=T, quote="", row.names=1)


# Processed original 2-channel illumina array
Sboner_Original <- getGEO("GSE16560")

Sboner_platform <- getGEO("GPL5474")
Sboner_platform2 <- Sboner_platform@dataTable@table
rownames(Sboner_platform2) <- Sboner_platform2[,1]
Sboner_platform2 <- Sboner_platform2[,-1]

f <- list.files()
f <- f[grep("GSM", f)]

raw <- lapply(f, FUN=function(z) { read.table(f, header=T, sep="\t") } )

tmp <- f[grep("GSM416241", f)]

tmp_fours <- lapply(tmp, FUN=function(z) { read.table(z, sep="\t", header=T) })


illumina_raw <- readIllumina(illuminaAnnotation = "Humanv3")

# Bead array analysis pipeline example: 
# https://www.bioconductor.org/packages/devel/data/experiment/vignettes/BeadArrayUseCases/inst/doc/BeadArrayUseCases.pdf


# Visualizing raw green channel intensities
boxplot(test, transFun=logGreenChannelTransform, col="green")
# Red
boxplot(test, transFun=logRedChannelTransform, col="red")

# 2d-plot example
imageplot(test, array=1, high="darkgreen", low="lightgreen")

# Outliers appear rather randomly scattered spatially
#> invisible(lapply(1:9, FUN=function(z) { outlierplot(test, array=z) }))
#689  outliers found on the section
#697  outliers found on the section
#752  outliers found on the section
#576  outliers found on the section
#1161  outliers found on the section
#706  outliers found on the section
#1107  outliers found on the section
#563  outliers found on the section
#872  outliers found on the section



# Test plotting of control/housekeeping 

# blima-package for bead level normalization: https://www.bioconductor.org/packages/release/bioc/vignettes/blima/inst/doc/blima.pdf

# MBCB (Model-based Background Correction for Beadarray)
# https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2654805/
# https://www.bioconductor.org/packages/devel/bioc/html/MBCB.html


#> length(unique(getBeadData(test, what="ProbeID")))
#[1] 1624

# Box plotting the raw bead intensities
# Raw data without normalization
par(mfrow=c(2,1))
boxplot(test,  transFun = logGreenChannelTransform, color="green")
boxplot(test,  transFun = logRedChannelTransform, color="red")
# Systematic differences between the two color channels


## Re-do; fetch raw data for Sboner et al. directly from GEO
# Approx 1 GB in size
library("GEOquery")
Sboner_raw <- getGEOSuppFiles("GSE16560")
# Untar the package
untar(tarfile=rownames(Sboner_raw))
# Gunzip the .gz
lapply(list.files()[grep(".gz", list.files())], FUN=gunzip)
# Use 'beadarray' to read the raw two color intensities
# i.e. following http://www.bioconductor.org/packages//data/experiment/vignettes/BeadArrayUseCases/inst/doc/BeadArrayUseCases.pdf

if (!requireNamespace("BiocManager", quietly = TRUE))
    install.packages("BiocManager")
if(!require("beadarray")){
	BiocManager::install("beadarray", version = "3.8")
	library("beadarray")
}

#Sboner_bead <- readIllumina()

# Each GSM-patient has 4 DAC-files, and have to be parsed separately
fileNames <- list.files()[grep(".txt", list.files())]
# Extract GSM-portion of the file name
GSMs <- unlist(lapply(fileNames, FUN=function(z) { strsplit(z, "_")[[1]][1] }))
#> all(table(GSMs)==4)
#[1] TRUE
#
## All GSM-patients have four DACs

Sboner_raw <- readIllumina(illuminaAnnotation = "Humanv3")

#> head(Sboner_raw[[1]])
#     ProbeID      GrnX      GrnY Grn      RedX      RedY Red
#[1,]       2 1514.0130 1096.2180 207 1515.1320 1110.7690  99
#[2,]       2  831.3730  455.8590 150  833.0906  470.2326  47
#[3,]       2  768.7205  969.8980 139  770.4794  984.2288  54
#[4,]       2 1327.2620  579.4638 178 1328.5030  594.0328 129
#[5,]       2 1395.5850  737.8775 190 1396.6320  752.3868  69
#[6,]       2 1143.9980 1375.1310 194 1145.3350 1389.4260  88
#
## ProbeID == 2 appears to be control probe, not listed in the platform
Syksy/curatedTools documentation built on May 27, 2019, 9:55 a.m.