####
#
# Teemu Daniel Laajala
# Processing CNA data by Taylor et al.
# GEO: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE21032
# Agilent aCGH
# Platform in GEO: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GPL4091
# Agilent-014693 Human Genome CGH Microarray 244A
# Ref: Taylor BS, Schultz N, Hieronymus H, Gopalan A et al. Integrative genomic profiling of human prostate cancer. Cancer Cell 2010 Jul 13;18(1):11-22.
#
####
if (!requireNamespace("BiocManager", quietly = TRUE))
install.packages("BiocManager")
#BiocManager::install("aCGH", version = "3.8")
BiocManager::install("rCGH", version = "3.8")
# Ref: Commo F, Guinney J, Ferte C, Bot B, Lefebvre C, Soria JC, and Andre F.
# rcgh : a comprehensive array-based genomic profile platform for precision
# medicine. Bioinformatics, 2015.
# https://bioconductor.org/packages/release/bioc/vignettes/rCGH/inst/doc/rCGH.pdf
# Issues in installing dependencies automatically; may require manual tinkering
BiocManager::install("BiocParallel", version = "3.8") # required by rCGH
BiocManager::install("GenomicFeatures", version = "3.8") # required by rCGH
BiocManager::install("TxDb.Hsapiens.UCSC.hg38.knownGene", version = "3.8") # required by rCGH
# Issues in installing BiocParallel due to nightly builds: https://support.bioconductor.org/p/110059/
# Platform for the Agilent aCGH matches the one in https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GPL4091
# Following rCGH's guide at https://bioconductor.org/packages/release/bioc/vignettes/rCGH/inst/doc/rCGH.pdf
# Process the data using rCGH from BioConductor, using hg38 as a reference
library("rCGH")
# Read the raw Agilent CGH data
# Name samples according to the filename
# Agilent-014693 Human Genome CGH Microarray 244A (Feature number version)
# rCGH says for readAgilent-function: "Agilent from 44 to 400K are supported."
TaylorCGH <- lapply(list.files(), FUN=function(z) {
try({
cat("\n\nProcessing: ",z,"\n\n");
rCGH::readAgilent(z, genome="hg38", sampleName=gsub(".txt", "", z))
})
} )
#> length(TaylorCGH)
#[1] 230
#> which(unlist(lapply(TaylorCGH, FUN=class))=="try-error")
#[1] 181 188
#> list.files()[which(unlist(lapply(TaylorCGH, FUN=class))=="try-error")]
#[1] "GSM525755.txt" "GSM525763.txt"
# Some files appear broken; missing columns?
# Signal adjustments
TaylorCGH <- lapply(TaylorCGH, FUN=function(z){
try({
rCGH::adjustSignal(z)
})
})
# Segmentation
TaylorCGH <- lapply(TaylorCGH, FUN=function(z){
try({
rCGH::segmentCGH(z)
})
})
# EM-algorithm normalization
TaylorCGH <- lapply(TaylorCGH, FUN=function(z){
try({
rCGH::EMnormalize(z)
})
})
## Examples of visualizing the rCGH output:
plotDensity(CNA_Taylor[[1]]) # EM-fitted distributions of log2-CN alteration distributions; multimodal with most at 1 as expected
# Some whole chromomes appear to behave in a consistent manner
multiplot(CNA_Taylor[[1]]) # Amplification/deletion as a function of genomic coordinate
# For the old pipeline with lacking sampleName but included fileName, run:
CNA_Taylor <- lapply(CNA_Taylor, FUN=.renameSampleName)
# Omit samples that have thrown try-errors during processing
#> length(CNA_Taylor[-which(unlist(lapply(CNA_Taylor, FUN=class))=="try-error")])
#[1] 228
#> length(CNA_Taylor)
#[1] 230
CNA_Taylor <- CNA_Taylor[-which(unlist(lapply(CNA_Taylor, FUN=class))=="try-error")]
# Export a compiled GISTIC 2.0 -compatible file from the segmented samples
exportGISTIC(CNA_Taylor, file="inputGISTIC_Taylor.tsv")
###
### !! Need to run GenePattern manually on the exported file and then manually download the results !!
###
# After the results are available, formulate the GISTIC matrix:
# Need to source 'processing.R'
# Requires correct working directory where the output files are located
CNA_GISTIC_Taylor <- .importGISTIC()
save(CNA_GISTIC_Taylor, file="CNA_GISTIC_Taylor.RData")
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.