R/pipeline_Taylor_CNA.R

####
#
# Teemu Daniel Laajala
# Processing CNA data by Taylor et al.
# GEO: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE21032
# Agilent aCGH
# Platform in GEO: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GPL4091
# Agilent-014693 Human Genome CGH Microarray 244A
# Ref: Taylor BS, Schultz N, Hieronymus H, Gopalan A et al. Integrative genomic profiling of human prostate cancer. Cancer Cell 2010 Jul 13;18(1):11-22. 
#
####

if (!requireNamespace("BiocManager", quietly = TRUE))
    install.packages("BiocManager")
#BiocManager::install("aCGH", version = "3.8")
BiocManager::install("rCGH", version = "3.8")
# Ref: Commo F, Guinney J, Ferte C, Bot B, Lefebvre C, Soria JC, and Andre F.
# rcgh : a comprehensive array-based genomic profile platform for precision
# medicine. Bioinformatics, 2015.
# https://bioconductor.org/packages/release/bioc/vignettes/rCGH/inst/doc/rCGH.pdf

# Issues in installing dependencies automatically; may require manual tinkering
BiocManager::install("BiocParallel", version = "3.8") # required by rCGH
BiocManager::install("GenomicFeatures", version = "3.8") # required by rCGH
BiocManager::install("TxDb.Hsapiens.UCSC.hg38.knownGene", version = "3.8") # required by rCGH

# Issues in installing BiocParallel due to nightly builds: https://support.bioconductor.org/p/110059/

# Platform for the Agilent aCGH matches the one in https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GPL4091

# Following rCGH's guide at https://bioconductor.org/packages/release/bioc/vignettes/rCGH/inst/doc/rCGH.pdf
# Process the data using rCGH from BioConductor, using hg38 as a reference
library("rCGH")

# Read the raw Agilent CGH data
# Name samples according to the filename
# Agilent-014693 Human Genome CGH Microarray 244A (Feature number version)
# rCGH says for readAgilent-function: "Agilent from 44 to 400K are supported."
TaylorCGH <- lapply(list.files(), FUN=function(z) { 
	try({
		cat("\n\nProcessing: ",z,"\n\n"); 
		rCGH::readAgilent(z, genome="hg38", sampleName=gsub(".txt", "", z)) 
	})
} )

#> length(TaylorCGH)
#[1] 230
#> which(unlist(lapply(TaylorCGH, FUN=class))=="try-error")
#[1] 181 188
#> list.files()[which(unlist(lapply(TaylorCGH, FUN=class))=="try-error")]
#[1] "GSM525755.txt" "GSM525763.txt"
# Some files appear broken; missing columns?

# Signal adjustments
TaylorCGH <- lapply(TaylorCGH, FUN=function(z){
	try({
		rCGH::adjustSignal(z) 
	})
})
# Segmentation
TaylorCGH <- lapply(TaylorCGH, FUN=function(z){
	try({
		rCGH::segmentCGH(z) 
	})
})
# EM-algorithm normalization
TaylorCGH <- lapply(TaylorCGH, FUN=function(z){
	try({
		rCGH::EMnormalize(z) 
	})
})

## Examples of visualizing the rCGH output:
plotDensity(CNA_Taylor[[1]]) # EM-fitted distributions of log2-CN alteration distributions; multimodal with most at 1 as expected
# Some whole chromomes appear to behave in a consistent manner
multiplot(CNA_Taylor[[1]]) # Amplification/deletion as a function of genomic coordinate

# For the old pipeline with lacking sampleName but included fileName, run:
CNA_Taylor <- lapply(CNA_Taylor, FUN=.renameSampleName)
# Omit samples that have thrown try-errors during processing
#> length(CNA_Taylor[-which(unlist(lapply(CNA_Taylor, FUN=class))=="try-error")])
#[1] 228
#> length(CNA_Taylor)
#[1] 230
CNA_Taylor <- CNA_Taylor[-which(unlist(lapply(CNA_Taylor, FUN=class))=="try-error")]
# Export a compiled GISTIC 2.0 -compatible file from the segmented samples
exportGISTIC(CNA_Taylor, file="inputGISTIC_Taylor.tsv")

###
### !! Need to run GenePattern manually on the exported file and then manually download the results !!
###

# After the results are available, formulate the GISTIC matrix:
# Need to source 'processing.R'
# Requires correct working directory where the output files are located
CNA_GISTIC_Taylor <- .importGISTIC()
save(CNA_GISTIC_Taylor, file="CNA_GISTIC_Taylor.RData")
Syksy/curatedTools documentation built on May 27, 2019, 9:55 a.m.