R/pipeline_Taylor_GEX.R

####
#
# Teemu Daniel Laajala
# Processing data by Taylor et al.
# GEO: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE21032
# Available data types; Affymetrix Exon arrays, 2 separate types
#
####

# Set to relative path

if (!requireNamespace("BiocManager", quietly = TRUE))
    install.packages("BiocManager")
BiocManager::install("frma", version = "3.8")
BiocManager::install("affy", version = "3.8")
BiocManager::install("oligo", version = "3.8")
BiocManager::install("GEOquery", version = "3.8")

library("frma")
library("affy")
library("oligo")
library("GEOquery")

#> tmp <- ReadAffy("GSM526134_YX_Exon1_PCA0001.CEL")
#Error: 
#
#The affy package is not designed for this array type.
#Please use either the oligo or xps package.

raw <- read.celfiles("GSM526134_YX_Exon1_PCA0001.CEL")
#Loading required package: pd.huex.1.0.st.v2
#Attempting to obtain 'pd.huex.1.0.st.v2' from BioConductor website.
#Checking to see if your internet connection works...
#installing the source package 'pd.huex.1.0.st.v2'
#
#trying URL #'https://bioconductor.org/packages/3.8/data/annotation/src/contrib/pd.huex.1.0.st.v2_3.14.1.tar.gz'
#Content type 'application/x-gzip' length 314073786 bytes (299.5 MB)
#downloaded 299.5 MB
#
#During startup - Warning message:
#Setting LC_CTYPE= failed 
#* installing *source* package 'pd.huex.1.0.st.v2' ...
#** R
#** data
#** inst
#** byte-compile and prepare package for lazy loading
#** help
#*** installing help indices
#  converting help for package 'pd.huex.1.0.st.v2'
#    finding HTML links ... done
#    pkg                                     html  
#    probeSequences                          html  
#** building package indices
#** testing if installed package can be loaded
#During startup - Warning message:
#Setting LC_CTYPE= failed 
#* DONE (pd.huex.1.0.st.v2)
#In R CMD INSTALL
#
#The downloaded source packages are in
#        'C:\Users\teemu\AppData\Local\Temp\Rtmp6B2MFP\downloaded_packages'
#Loading required package: pd.huex.1.0.st.v2
#Loading required package: RSQLite
#Loading required package: DBI
#Platform design info loaded.
#Reading in : GSM526134_YX_Exon1_PCA0001.CEL

# Testing the fRMA example
#library(frmaExampleData)
#data(AffyBatchExample)
#object <- frma(AffyBatchExample)
#e <- exprs(object)
#GNUSE(object,type="stats")
#tmp2 <- rma(raw, target="full")
#tmp3 <- rma(raw, target="core")

# https://www.bioconductor.org/packages//2.7/bioc/vignettes/oligo/inst/doc/V5ExonGene.pdf
#featureData(tmp2) <- getNetAffx(tmp, "probeset")
#featureData(tmp2) <- getNetAffx(tmp2, "transcript")
#featureData(tmp3) <- getNetAffx(tmp3, "transcript")

#> str(tmp2@featureData)
#Formal class 'AnnotatedDataFrame' [package "Biobase"] with 4 slots
#  ..@ varMetadata      :'data.frame':   17 obs. of  1 variable:
#  .. ..$ labelDescription: chr [1:17] NA NA NA NA ...
#  ..@ data             :'data.frame':   22011 obs. of  17 variables:
#  .. ..$ transcriptclusterid: int [1:22011] 2315554 2315633 2315674 2315739 2315894 2315918 2315951 2316218 2316245 2316379 ...
#  .. ..$ probesetid         : int [1:22011] 2315554 2315633 2315674 2315739 2315894 2315918 2315951 2316218 2316245 2316379 ...
#  .. ..$ seqname            : chr [1:22011] "chr1" "chr1" "chr1" "chr1" ...
#  .. ..$ strand             : chr [1:22011] "+" "+" "+" "+" ...
#  .. ..$ start              : int [1:22011] 1095869 1165740 1206658 1243987 1370261 1381162 1407062 1840160 2036175 2156254 ...
#  .. ..$ stop               : int [1:22011] 1135910 1170393 1227386 1247168 1378258 1405538 1470060 1848733 2120773 2241652 ...
#  .. ..$ totalprobes        : int [1:22011] 259 48 225 77 89 78 405 57 327 341 ...
#  .. ..$ geneassignment     : chr [1:22011] "NM_001130045 // TTLL10 // tubulin tyrosine ligase-like family, member 10 // 1p36.33 // 254173 /// NM_153254 // "| __truncated__ "NM_080605 // B3GALT6 // UDP-Gal:betaGal beta 1,3-galactosyltransferase polypeptide 6 // 1p36.33 // 126792 /// E"| __truncated__ "NM_001130413 // SCNN1D // sodium channel, non-voltage-gated 1, delta subunit // 1p36.3-p36.2 // 6339 /// NR_037"| __truncated__ "NM_153339 // PUSL1 // pseudouridylate synthase-like 1 // 1p36.33 // 126789 /// ENST00000379031 // PUSL1 // pseu"| __truncated__ ...
#  .. ..$ mrnaassignment     : chr [1:22011] "NM_001130045 // RefSeq // Homo sapiens tubulin tyrosine ligase-like family, member 10 (TTLL10), transcript vari"| __truncated__ "NM_080605 // RefSeq // Homo sapiens UDP-Gal:betaGal beta 1,3-galactosyltransferase polypeptide 6 (B3GALT6), mRN"| __truncated__ "NM_001130413 // RefSeq // Homo sapiens sodium channel, non-voltage-gated 1, delta subunit (SCNN1D), transcript "| __truncated__ "NM_153339 // RefSeq // Homo sapiens pseudouridylate synthase-like 1 (PUSL1), mRNA. // chr1 // 96 // 66 // 49 //"| __truncated__ ...
#  .. ..$ swissprot          : chr [1:22011] "NM_001130045 // Q6ZVT0 /// NM_001130045 // J3QRK8 /// NM_153254 // Q6ZVT0 /// NM_153254 // J3QRK8 /// XM_005244"| __truncated__ "NM_080605 // Q96L58 /// NM_080605 // Q499Z2 /// ENST00000379198 // Q96L58 /// ENST00000379198 // Q499Z2 /// BC0"| __truncated__ "NM_001130413 // P51172 /// NM_001130413 // A6NNF7 /// NM_001130413 // C9JDY8 /// NM_001130413 // F8VWH5 /// NM_"| __truncated__ "NM_153339 // Q8N0Z8 /// NM_153339 // J3KTG4 /// ENST00000379031 // Q8N0Z8 /// ENST00000379031 // J3KTG4 /// ENS"| __truncated__ ...
#  .. ..$ unigene            : chr [1:22011] "NM_001130045 // Hs.454835 // brain| intestine| kidney| lung| muscle| pharynx| testis| colorectal tumor| germ ce"| __truncated__ "NM_080605 // Hs.284284 // adipose tissue| bladder| blood| bone| bone marrow| brain| connective tissue| embryoni"| __truncated__ "NM_001130413 // Hs.512681 // bone| brain| embryonic tissue| eye| heart| kidney| lung| mammary gland| ovary| pan"| __truncated__ "NM_153339 // Hs.400659 // blood| bone| brain| cervix| embryonic tissue| esophagus| eye| heart| intestine| kidne"| __truncated__ ...
#  .. ..$ gobiologicalprocess: chr [1:22011] "NM_001130045 // GO:0018094 // protein polyglycylation // inferred from direct assay  /// NM_001130045 // GO:000"| __truncated__ "NM_080605 // GO:0005975 // carbohydrate metabolic process // traceable author statement  /// NM_080605 // GO:00"| __truncated__ "NM_001130413 // GO:0006814 // sodium ion transport // traceable author statement  /// NM_001130413 // GO:003422"| __truncated__ "NM_153339 // GO:0001522 // pseudouridine synthesis // inferred from electronic annotation  /// NM_153339 // GO:"| __truncated__ ...
#  .. ..$ gocellularcomponent: chr [1:22011] "ENST00000379288 // GO:0005886 // plasma membrane // inferred from direct assay  /// ENST00000379288 // GO:00057"| __truncated__ "NM_080605 // GO:0000139 // Golgi membrane // traceable author statement  /// NM_080605 // GO:0005797 // Golgi m"| __truncated__ "NM_001130413 // GO:0005886 // plasma membrane // inferred from direct assay  /// NM_001130413 // GO:0005886 // "| __truncated__ NA ...
#  .. ..$ gomolecularfunction: chr [1:22011] "NM_001130045 // GO:0005515 // protein binding // inferred from physical interaction  /// NM_001130045 // GO:007"| __truncated__ "NM_080605 // GO:0008499 // UDP-galactose:beta-N-acetylglucosamine beta-1,3-galactosyltransferase activity // in"| __truncated__ "NM_001130413 // GO:0005515 // protein binding // inferred from physical interaction  /// NM_001130413 // GO:001"| __truncated__ "NM_153339 // GO:0003723 // RNA binding // inferred from electronic annotation  /// NM_153339 // GO:0009982 // p"| __truncated__ ...
#  .. ..$ pathway            : chr [1:22011] NA "NM_080605 // MetaCyc // PWY-6557 /// AY050570 // MetaCyc // PWY-6557" NA NA ...
#  .. ..$ proteindomains     : chr [1:22011] "ENST00000379288 // Pfam // IPR004344 // Tubulin-tyrosine ligase/Tubulin polyglutamylase /// ENST00000379289 // "| __truncated__ "ENST00000379198 // Pfam // IPR002659 // Glycosyl transferase, family 31" "ENST00000325425 // Pfam // IPR001873 // Na+ channel, amiloride-sensitive /// ENST00000338555 // Pfam // IPR0018"| __truncated__ "ENST00000379031 // Pfam // IPR020097 // Pseudouridine synthase I, TruA, alpha/beta domain /// ENST00000467712 /"| __truncated__ ...
#  .. ..$ category           : chr [1:22011] "main" "main" "main" "main" ...
#  ..@ dimLabels        : chr [1:2] "rowNames" "columnNames"
#  ..@ .__classVersion__:Formal class 'Versions' [package "Biobase"] with 1 slot
#  .. .. ..@ .Data:List of 1
#  .. .. .. ..$ : int [1:3] 1 1 0


#> featureData(tmp) <- getNetAffx(tmp, "transcript")
#Error in getNetAffx(tmp, "transcript") : 
#  NetAffx Annotation not available in 'huex.1.0.st.v2'. Consider using 'biomaRt'.


CELs <- read.celfiles(list.celfiles())

#> CELs
#ExonFeatureSet (storageMode: lockedEnvironment)
#assayData: 6553600 features, 370 samples 
#  element names: exprs 
#protocolData
#  rowNames: GSM526134_YX_Exon1_PCA0001.CEL GSM526135_YX_Exon1_PCA0002.CEL ... GSM528049_YX_Exon1_PAN0174.CEL (370
#    total)
#  varLabels: exprs dates
#  varMetadata: labelDescription channel
#phenoData
#  rowNames: GSM526134_YX_Exon1_PCA0001.CEL GSM526135_YX_Exon1_PCA0002.CEL ... GSM528049_YX_Exon1_PAN0174.CEL (370
#    total)
#  varLabels: index
#  varMetadata: labelDescription channel
#featureData: none
#experimentData: use 'experimentData(object)'
#Annotation: pd.huex.1.0.st.v2

RMAs <- rma(CELs)
# Running out of RAM due to large frma reference file
#fRMAs <- frma(CELs)
#fRMAs2 <- frma(CELs, target="core")


# affy vs oligo, in respect to oligo: https://support.bioconductor.org/p/91565/

# oligo preprocessing for Taylor's Exon arrays: https://www.bioconductor.org/packages//2.7/bioc/vignettes/oligo/inst/doc/V5ExonGene.pdf

# frma manual: https://bioconductor.org/packages/release/bioc/manuals/frma/man/frma.pdf
# frma vignette: https://bioconductor.org/packages/release/bioc/vignettes/frma/inst/doc/frma.pdf
# frma publication: https://www.ncbi.nlm.nih.gov/pubmed/20097884

### Not enough local machine RAM to allocate fRMA reference (~15 GB)
### RMA running fine

featureData(RMAs) <- getNetAffx(RMAs, "transcript")
sampleNames(RMAs) <- unlist(lapply(strsplit(list.celfiles(), "_"), FUN=function(z) z[[1]])) # GSM######-type names from GEO
nameMapTaylor <- data.frame(
	GSM = unlist(lapply(strsplit(list.celfiles(), "_"), FUN=function(z) z[[1]])), # GEO, GSM-names
	ID = gsub(".CEL", "", unlist(lapply(strsplit(list.celfiles(), "_"), FUN=function(z) z[[4]]))) # mapping to PCA0001, PCA0002, ...
)

# Diagnostics for QC
oligo::boxplot(RMAs)
# Quantiles appear to behave quite nicely


# RMA has improved the quality of the measurements
MAplot(CELs[,1:3], pairs=T)
# vs
MAplot(RMAs[,1:3], pairs=T)


# To inspect the probe sequences in raw CELs
head(pmSequence(CELs))


# Intensity plots over processed samples

#expressionrange <- extendrange(exprs(RMAs))
xlim <- c(2, 15)
ylim <- c(0, 4e-01)

# Constructing an overlapping density plot of expression values
plot.new()
plot.window(xlim=xlim, ylim=ylim)
apply(exprs(RMAs), MARGIN=2, FUN=function(z){
	points(density(z), type="l")
})
axis(1); axis(2); box(); title(main="Expression densities over RMA normalized samples")


# Fetch original data by Taylor et al. and compare processed data to their data matrices

GEX_Taylor <- RMAs


nam <- fData(GEX_Taylor)[,8]
nam2 <- unlist(lapply(nam, FUN=function(z) { strsplit(z, " // ")[[1]][2] }))
nam2[is.na(nam2)] <- "NA"
rownames(GEX_Taylor) <- make.unique(nam2)

# Save the GEX file
save(GEX_Taylor, file="GEX_Taylor.RData")
Syksy/curatedTools documentation built on May 27, 2019, 9:55 a.m.