# R/pipeline_ICGC.R

####
#
# Teemu Daniel Laajala
# Fetching raw data from various International Cancer Genome Consortium datasets in Release 27
#
####

## Relevant data fields (from Release 27):
#File Descriptions
#Open-access analyzed data:
#
#clinical.[ICGC project code].tsv.gz: contains aggregated clinical donor, specimen and sample information
#exp_array.[ICGC project code].tsv.gz: gene expression measured at the transcriptional level (mRNA) using array-based platforms
#exp_seq.[ICGC project code].tsv.gz: gene expression measured at the transcriptional level (mRNA) using sequencing-based platforms
#meth_array.[ICGC project code].tsv.gz: array-based methylation data
#mirna_array.[ICGC project code].tsv.gz: array-based microRNA data
#mirna_seq.[ICGC project code].tsv.gz: sequencing-based microRNA data
#copy_number_somatic_mutation.[ICGC project code].tsv.gz: DNA copy number alterations (ie. gain, losses, LOH) of genes and other loci in tumour tissues relative to normal control samples.
#simple_somatic_mutation.open.[ICGC project code].tsv.gz: open-access simple somatic mutations calls. These include single and multiple base substitutions, and small (<=200bp) insertions and deletions that appear in the tumour tissue, but not in the normal control tissues.
#simple_germline_variations.controlled.[ICGC project code].tsv.gz: contains controlled-access simple germline variations (only available to DACO approved users)
#protein_expression.[ICGC project code].tsv.gz: translational level expression data
#splice_variant.[ICGC project code].tsv.gz: genomic events that affect the splicing of genes.
#
## Not all offered file names are covered

## Key data fields for current 'omics:
#
# GEX: exp_array.* or exp_seq.*
# EPI: meth_array.*
# CNA: copy_number_somatic_mutation.*
# PRT: protein_expression.*
# Clinical information: donor.*
# Clinical family risk: donor_family.*
# Clinical smoking risk etc: donor_exposure.*
# Clinical interventions: donor_therapy.*
#
# ICGC prostate cancer / adenocarcinoma datasets (only 'omics listed here, not e.g. donor information):
# PRAD-CA : copy_number_somatic_mutation, exp_array, meth_array, simple_somatic_mutation.open, structural_somatic_mutation
# PRAD-CN : simple_somatic_mutation.open
# PRAD-FR : copy_number_somatic_mutation, exp_array, exp_seq, simple_somatic_mutation.open, structural_somatic_mutation
# PRAD-UK : copy_number_somatic_mutation, simple_somatic_mutation.open, structural_somatic_mutation
# PRAD-US : copy_number_somatic_mutation, exp_seq, meth_array, mirna_seq, protein_expression, simple_somatic_mutation.open
#
##

## Links to specific datasets:
#
# PRAD-CA: https://icgc.org/node/70542
# PRAD-CN: https://icgc.org/node/1003238 # Appears redundant
# PRAD-FR: https://icgc.org/node/1002116
# PRAD-UK: https://icgc.org/node/71331
# PRAD-US: https://icgc.org/node/70272 # Redundant because TCGA is already fetched through cBioPortal
#
##

## Change working directory; some data (e.g. meth) are quite large

# Save file URLs in vectors nested in a list ----
#
# Every download URL has the same shape:
#   https://dcc.icgc.org/api/v1/download?fn=/<release>/Projects/<project>/<file>.<project>.tsv.gz
# so only the release, the project code and the file stems are spelled out
# below; the full URLs are assembled by `.icgc_urls()`.

# ICGC data release the URLs point to; change here to target another release.
icgc_release <- "release_27"

# Build the download URLs for one ICGC project.
#
# project: ICGC project code (e.g. "PRAD-CA"); used both as the directory name
#          and as the infix of every file name.
# stems:   character vector of file stems (e.g. "donor", "exp_array").
# release: release directory name; defaults to `icgc_release`.
#
# Returns a character vector of URLs, one per stem, in the order given.
.icgc_urls <- function(project, stems, release = icgc_release) {
  # paste0() recycles zero-length input to "", which would yield a bogus URL.
  if (length(stems) == 0L) {
    return(character(0))
  }
  paste0(
    "https://dcc.icgc.org/api/v1/download?fn=/", release, "/Projects/",
    project, "/", stems, ".", project, ".tsv.gz"
  )
}

icgc <- list()

# PRAD-CA (PRAD-CA Prostate Adenocarcinoma - CA)
icgc[["PRAD-CA"]] <- .icgc_urls(
  "PRAD-CA",
  c(
    "copy_number_somatic_mutation",
    "donor",
    "donor_exposure",
    "donor_family",
    "donor_therapy",
    "exp_array",
    "meth_array",
    "sample",
    "simple_somatic_mutation.open",
    "specimen",
    "structural_somatic_mutation"
  )
)

# PRAD-CN (PRAD-CN Prostate Cancer - CN)
icgc[["PRAD-CN"]] <- .icgc_urls(
  "PRAD-CN",
  c(
    "donor",
    "sample",
    "simple_somatic_mutation.open",
    "specimen"
  )
)

# PRAD-FR (PRAD-FR Prostate Cancer - Adenocarcinoma - FR)
icgc[["PRAD-FR"]] <- .icgc_urls(
  "PRAD-FR",
  c(
    "copy_number_somatic_mutation",
    "donor",
    "donor_family",
    "donor_surgery",
    "exp_array",
    "exp_seq",
    "sample",
    "simple_somatic_mutation.open",
    "specimen",
    "structural_somatic_mutation"
  )
)

# PRAD-UK (PRAD-UK Prostate Adenocarcinoma - UK)
icgc[["PRAD-UK"]] <- .icgc_urls(
  "PRAD-UK",
  c(
    "copy_number_somatic_mutation",
    "donor",
    "donor_exposure",
    "donor_family",
    "donor_therapy",
    "sample",
    "simple_somatic_mutation.open",
    "specimen",
    "structural_somatic_mutation"
  )
)

# PRAD-US (PRAD-US Prostate Adenocarcinoma - TCGA, US)
icgc[["PRAD-US"]] <- .icgc_urls(
  "PRAD-US",
  c(
    "copy_number_somatic_mutation",
    "donor",
    "exp_seq",
    "meth_array",
    "mirna_seq",
    "protein_expression",
    "sample",
    "simple_somatic_mutation.open",
    "specimen"
  )
)

# Download the interesting datasets ----
#
# Only PRAD-CA, PRAD-FR and PRAD-UK are fetched; the dataset links near the top
# of this file mark PRAD-CN and PRAD-US as redundant.

# Root directory holding one sub-directory per ICGC project. Defaults to the
# original author's local path; set the ICGC_DATA_DIR environment variable to
# run the pipeline from another location.
icgc_data_dir <- Sys.getenv(
  "ICGC_DATA_DIR",
  unset = "D:\\Postdoc\\curatedProstateData\\curatedProstateData_RDY\\ICGC"
)

# Run curatedTools:::.icgcDownload on every URL of one project from inside
# that project's sub-directory of `base_dir`.
#
# project:  ICGC project code, also the sub-directory name (e.g. "PRAD-CA").
# urls:     character vector of download URLs for that project.
# base_dir: root data directory; defaults to `icgc_data_dir`.
#
# Returns a list with one element per URL (whatever .icgcDownload returns).
# NOTE(review): .icgcDownload presumably writes into the current working
# directory, which is why the directory is switched first -- confirm against
# the curatedTools implementation.
.icgc_fetch_project <- function(project, urls, base_dir = icgc_data_dir) {
  target <- file.path(base_dir, project)
  if (!dir.exists(target)) {
    stop("ICGC download directory does not exist: ", target, call. = FALSE)
  }
  # Restore the caller's working directory even if a download fails.
  previous <- setwd(target)
  on.exit(setwd(previous), add = TRUE)
  lapply(urls, FUN = curatedTools:::.icgcDownload)
}

ICGC.PRAD.CA <- .icgc_fetch_project("PRAD-CA", icgc[["PRAD-CA"]])
ICGC.PRAD.FR <- .icgc_fetch_project("PRAD-FR", icgc[["PRAD-FR"]])
ICGC.PRAD.UK <- .icgc_fetch_project("PRAD-UK", icgc[["PRAD-UK"]])
# Syksy/curatedTools documentation built on May 27, 2019, 9:55 a.m.