####
#
# Teemu Daniel Laajala
# Processing data by Taylor et al.
# GEO: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE21032
# Available data types; Affymetrix Exon arrays, 2 separate types
#
####
# Set to relative path
if (!requireNamespace("BiocManager", quietly = TRUE))
install.packages("BiocManager")
BiocManager::install("frma", version = "3.8")
BiocManager::install("affy", version = "3.8")
BiocManager::install("oligo", version = "3.8")
BiocManager::install("GEOquery", version = "3.8")
library("frma")
library("affy")
library("oligo")
library("GEOquery")
#> tmp <- ReadAffy("GSM526134_YX_Exon1_PCA0001.CEL")
#Error:
#
#The affy package is not designed for this array type.
#Please use either the oligo or xps package.
raw <- read.celfiles("GSM526134_YX_Exon1_PCA0001.CEL")
#Loading required package: pd.huex.1.0.st.v2
#Attempting to obtain 'pd.huex.1.0.st.v2' from BioConductor website.
#Checking to see if your internet connection works...
#installing the source package 'pd.huex.1.0.st.v2'
#
#trying URL #'https://bioconductor.org/packages/3.8/data/annotation/src/contrib/pd.huex.1.0.st.v2_3.14.1.tar.gz'
#Content type 'application/x-gzip' length 314073786 bytes (299.5 MB)
#downloaded 299.5 MB
#
#During startup - Warning message:
#Setting LC_CTYPE= failed
#* installing *source* package 'pd.huex.1.0.st.v2' ...
#** R
#** data
#** inst
#** byte-compile and prepare package for lazy loading
#** help
#*** installing help indices
# converting help for package 'pd.huex.1.0.st.v2'
# finding HTML links ... done
# pkg html
# probeSequences html
#** building package indices
#** testing if installed package can be loaded
#During startup - Warning message:
#Setting LC_CTYPE= failed
#* DONE (pd.huex.1.0.st.v2)
#In R CMD INSTALL
#
#The downloaded source packages are in
# 'C:\Users\teemu\AppData\Local\Temp\Rtmp6B2MFP\downloaded_packages'
#Loading required package: pd.huex.1.0.st.v2
#Loading required package: RSQLite
#Loading required package: DBI
#Platform design info loaded.
#Reading in : GSM526134_YX_Exon1_PCA0001.CEL
# Testing the fRMA example
#library(frmaExampleData)
#data(AffyBatchExample)
#object <- frma(AffyBatchExample)
#e <- exprs(object)
#GNUSE(object,type="stats")
#tmp2 <- rma(raw, target="full")
#tmp3 <- rma(raw, target="core")
# https://www.bioconductor.org/packages//2.7/bioc/vignettes/oligo/inst/doc/V5ExonGene.pdf
#featureData(tmp2) <- getNetAffx(tmp, "probeset")
#featureData(tmp2) <- getNetAffx(tmp2, "transcript")
#featureData(tmp3) <- getNetAffx(tmp3, "transcript")
#> str(tmp2@featureData)
#Formal class 'AnnotatedDataFrame' [package "Biobase"] with 4 slots
# ..@ varMetadata :'data.frame': 17 obs. of 1 variable:
# .. ..$ labelDescription: chr [1:17] NA NA NA NA ...
# ..@ data :'data.frame': 22011 obs. of 17 variables:
# .. ..$ transcriptclusterid: int [1:22011] 2315554 2315633 2315674 2315739 2315894 2315918 2315951 2316218 2316245 2316379 ...
# .. ..$ probesetid : int [1:22011] 2315554 2315633 2315674 2315739 2315894 2315918 2315951 2316218 2316245 2316379 ...
# .. ..$ seqname : chr [1:22011] "chr1" "chr1" "chr1" "chr1" ...
# .. ..$ strand : chr [1:22011] "+" "+" "+" "+" ...
# .. ..$ start : int [1:22011] 1095869 1165740 1206658 1243987 1370261 1381162 1407062 1840160 2036175 2156254 ...
# .. ..$ stop : int [1:22011] 1135910 1170393 1227386 1247168 1378258 1405538 1470060 1848733 2120773 2241652 ...
# .. ..$ totalprobes : int [1:22011] 259 48 225 77 89 78 405 57 327 341 ...
# .. ..$ geneassignment : chr [1:22011] "NM_001130045 // TTLL10 // tubulin tyrosine ligase-like family, member 10 // 1p36.33 // 254173 /// NM_153254 // "| __truncated__ "NM_080605 // B3GALT6 // UDP-Gal:betaGal beta 1,3-galactosyltransferase polypeptide 6 // 1p36.33 // 126792 /// E"| __truncated__ "NM_001130413 // SCNN1D // sodium channel, non-voltage-gated 1, delta subunit // 1p36.3-p36.2 // 6339 /// NR_037"| __truncated__ "NM_153339 // PUSL1 // pseudouridylate synthase-like 1 // 1p36.33 // 126789 /// ENST00000379031 // PUSL1 // pseu"| __truncated__ ...
# .. ..$ mrnaassignment : chr [1:22011] "NM_001130045 // RefSeq // Homo sapiens tubulin tyrosine ligase-like family, member 10 (TTLL10), transcript vari"| __truncated__ "NM_080605 // RefSeq // Homo sapiens UDP-Gal:betaGal beta 1,3-galactosyltransferase polypeptide 6 (B3GALT6), mRN"| __truncated__ "NM_001130413 // RefSeq // Homo sapiens sodium channel, non-voltage-gated 1, delta subunit (SCNN1D), transcript "| __truncated__ "NM_153339 // RefSeq // Homo sapiens pseudouridylate synthase-like 1 (PUSL1), mRNA. // chr1 // 96 // 66 // 49 //"| __truncated__ ...
# .. ..$ swissprot : chr [1:22011] "NM_001130045 // Q6ZVT0 /// NM_001130045 // J3QRK8 /// NM_153254 // Q6ZVT0 /// NM_153254 // J3QRK8 /// XM_005244"| __truncated__ "NM_080605 // Q96L58 /// NM_080605 // Q499Z2 /// ENST00000379198 // Q96L58 /// ENST00000379198 // Q499Z2 /// BC0"| __truncated__ "NM_001130413 // P51172 /// NM_001130413 // A6NNF7 /// NM_001130413 // C9JDY8 /// NM_001130413 // F8VWH5 /// NM_"| __truncated__ "NM_153339 // Q8N0Z8 /// NM_153339 // J3KTG4 /// ENST00000379031 // Q8N0Z8 /// ENST00000379031 // J3KTG4 /// ENS"| __truncated__ ...
# .. ..$ unigene : chr [1:22011] "NM_001130045 // Hs.454835 // brain| intestine| kidney| lung| muscle| pharynx| testis| colorectal tumor| germ ce"| __truncated__ "NM_080605 // Hs.284284 // adipose tissue| bladder| blood| bone| bone marrow| brain| connective tissue| embryoni"| __truncated__ "NM_001130413 // Hs.512681 // bone| brain| embryonic tissue| eye| heart| kidney| lung| mammary gland| ovary| pan"| __truncated__ "NM_153339 // Hs.400659 // blood| bone| brain| cervix| embryonic tissue| esophagus| eye| heart| intestine| kidne"| __truncated__ ...
# .. ..$ gobiologicalprocess: chr [1:22011] "NM_001130045 // GO:0018094 // protein polyglycylation // inferred from direct assay /// NM_001130045 // GO:000"| __truncated__ "NM_080605 // GO:0005975 // carbohydrate metabolic process // traceable author statement /// NM_080605 // GO:00"| __truncated__ "NM_001130413 // GO:0006814 // sodium ion transport // traceable author statement /// NM_001130413 // GO:003422"| __truncated__ "NM_153339 // GO:0001522 // pseudouridine synthesis // inferred from electronic annotation /// NM_153339 // GO:"| __truncated__ ...
# .. ..$ gocellularcomponent: chr [1:22011] "ENST00000379288 // GO:0005886 // plasma membrane // inferred from direct assay /// ENST00000379288 // GO:00057"| __truncated__ "NM_080605 // GO:0000139 // Golgi membrane // traceable author statement /// NM_080605 // GO:0005797 // Golgi m"| __truncated__ "NM_001130413 // GO:0005886 // plasma membrane // inferred from direct assay /// NM_001130413 // GO:0005886 // "| __truncated__ NA ...
# .. ..$ gomolecularfunction: chr [1:22011] "NM_001130045 // GO:0005515 // protein binding // inferred from physical interaction /// NM_001130045 // GO:007"| __truncated__ "NM_080605 // GO:0008499 // UDP-galactose:beta-N-acetylglucosamine beta-1,3-galactosyltransferase activity // in"| __truncated__ "NM_001130413 // GO:0005515 // protein binding // inferred from physical interaction /// NM_001130413 // GO:001"| __truncated__ "NM_153339 // GO:0003723 // RNA binding // inferred from electronic annotation /// NM_153339 // GO:0009982 // p"| __truncated__ ...
# .. ..$ pathway : chr [1:22011] NA "NM_080605 // MetaCyc // PWY-6557 /// AY050570 // MetaCyc // PWY-6557" NA NA ...
# .. ..$ proteindomains : chr [1:22011] "ENST00000379288 // Pfam // IPR004344 // Tubulin-tyrosine ligase/Tubulin polyglutamylase /// ENST00000379289 // "| __truncated__ "ENST00000379198 // Pfam // IPR002659 // Glycosyl transferase, family 31" "ENST00000325425 // Pfam // IPR001873 // Na+ channel, amiloride-sensitive /// ENST00000338555 // Pfam // IPR0018"| __truncated__ "ENST00000379031 // Pfam // IPR020097 // Pseudouridine synthase I, TruA, alpha/beta domain /// ENST00000467712 /"| __truncated__ ...
# .. ..$ category : chr [1:22011] "main" "main" "main" "main" ...
# ..@ dimLabels : chr [1:2] "rowNames" "columnNames"
# ..@ .__classVersion__:Formal class 'Versions' [package "Biobase"] with 1 slot
# .. .. ..@ .Data:List of 1
# .. .. .. ..$ : int [1:3] 1 1 0
#> featureData(tmp) <- getNetAffx(tmp, "transcript")
#Error in getNetAffx(tmp, "transcript") :
# NetAffx Annotation not available in 'huex.1.0.st.v2'. Consider using 'biomaRt'.
CELs <- read.celfiles(list.celfiles())
#> CELs
#ExonFeatureSet (storageMode: lockedEnvironment)
#assayData: 6553600 features, 370 samples
# element names: exprs
#protocolData
# rowNames: GSM526134_YX_Exon1_PCA0001.CEL GSM526135_YX_Exon1_PCA0002.CEL ... GSM528049_YX_Exon1_PAN0174.CEL (370
# total)
# varLabels: exprs dates
# varMetadata: labelDescription channel
#phenoData
# rowNames: GSM526134_YX_Exon1_PCA0001.CEL GSM526135_YX_Exon1_PCA0002.CEL ... GSM528049_YX_Exon1_PAN0174.CEL (370
# total)
# varLabels: index
# varMetadata: labelDescription channel
#featureData: none
#experimentData: use 'experimentData(object)'
#Annotation: pd.huex.1.0.st.v2
RMAs <- rma(CELs)
# Running out of RAM due to large frma reference file
#fRMAs <- frma(CELs)
#fRMAs2 <- frma(CELs, target="core")
# affy vs oligo, in respect to oligo: https://support.bioconductor.org/p/91565/
# oligo preprocessing for Taylor's Exon arrays: https://www.bioconductor.org/packages//2.7/bioc/vignettes/oligo/inst/doc/V5ExonGene.pdf
# frma manual: https://bioconductor.org/packages/release/bioc/manuals/frma/man/frma.pdf
# frma vignette: https://bioconductor.org/packages/release/bioc/vignettes/frma/inst/doc/frma.pdf
# frma publication: https://www.ncbi.nlm.nih.gov/pubmed/20097884
### Not enough local machine RAM to allocate fRMA reference (~15 GB)
### RMA running fine
featureData(RMAs) <- getNetAffx(RMAs, "transcript")
sampleNames(RMAs) <- unlist(lapply(strsplit(list.celfiles(), "_"), FUN=function(z) z[[1]])) # GSM######-type names from GEO
nameMapTaylor <- data.frame(
GSM = unlist(lapply(strsplit(list.celfiles(), "_"), FUN=function(z) z[[1]])), # GEO, GSM-names
ID = gsub(".CEL", "", unlist(lapply(strsplit(list.celfiles(), "_"), FUN=function(z) z[[4]]))) # mapping to PCA0001, PCA0002, ...
)
# Diagnostics for QC
oligo::boxplot(RMAs)
# Quantiles appear to behave quite nicely
# RMA has improved the quality of the measurements
MAplot(CELs[,1:3], pairs=T)
# vs
MAplot(RMAs[,1:3], pairs=T)
# To inspect the probe sequences in raw CELs
head(pmSequence(CELs))
# Intensity plots over processed samples
#expressionrange <- extendrange(exprs(RMAs))
xlim <- c(2, 15)
ylim <- c(0, 4e-01)
# Constructing an overlapping density plot of expression values
plot.new()
plot.window(xlim=xlim, ylim=ylim)
apply(exprs(RMAs), MARGIN=2, FUN=function(z){
points(density(z), type="l")
})
axis(1); axis(2); box(); title(main="Expression densities over RMA normalized samples")
# Fetch original data by Taylor et al. and compare processed data to their data matrices
GEX_Taylor <- RMAs
nam <- fData(GEX_Taylor)[,8]
nam2 <- unlist(lapply(nam, FUN=function(z) { strsplit(z, " // ")[[1]][2] }))
nam2[is.na(nam2)] <- "NA"
rownames(GEX_Taylor) <- make.unique(nam2)
# Save the GEX file
save(GEX_Taylor, file="GEX_Taylor.RData")
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.