Nothing
##########################################################################
# Segmentation of total copy-number ratios of tumors where tumors
# are hybridized on one chip type and the reference samples on another.
#
# Author: Henrik Bengtsson
# Created on: 2012-08-25
# Last updated: 2012-08-26
#
# DATA SET:
# GEO data set 'GSE12702'. Affymetrix CEL files are available from:
#
# https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE12702
#
# Place them in rawData/GSE12702/Mapping250K_{Nsp|Sty}/*.CEL. In total
# there are 20 tumor-normal pairs (40 CEL files) per chip type.
#
# ANNOTATION DATA [1]:
# annotationData/
# chipTypes/
# Mapping250K_Nsp/
# Mapping250K_Nsp,na31,HB20101007.ugp
# Mapping250K_Sty/
# Mapping250K_Sty,na31,HB20101007.ugp
# (GenericHuman/
# GenericHuman,10kb,HB20090503.ugp.gz)
#
# [1] https://aroma-project.org/data/annotationData/chipTypes/
##########################################################################
library("aroma.affymetrix")
library("aroma.cn")
verbose <- Arguments$getVerbose(-8, timestamp=TRUE)
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# AS-CRMAv2
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
dataSet <- "GSE12702"
chipTypes <- c("Mapping250K_Nsp", "Mapping250K_Sty")
dsList <- lapply(chipTypes, FUN=function(chipType) {
dsNList <- doASCRMAv2(dataSet, chipType=chipType, verbose=verbose)
dsNList$total
})
names(dsList) <- chipTypes
print(dsList)
## $Mapping250K_Nsp
## AromaUnitTotalCnBinarySet:
## Name: GSE12702
## Tags: ACC,-XY,BPN,-XY,AVG,FLN,-XY
## Full name: GSE12702,ACC,-XY,BPN,-XY,AVG,FLN,-XY
## Number of files: 40
## Names: GSM318728, GSM318729, GSM318730, ..., GSM318767 [40]
## Path (to the first file): totalAndFracBData/GSE12702,ACC,-XY,BPN,-XY,AVG,FLN,
## -XY/Mapping250K_Nsp
## Total file size: 40.05 MB
## RAM: 0.04MB
##
## $Mapping250K_Sty
## AromaUnitTotalCnBinarySet:
## Name: GSE12702
## Tags: ACC,-XY,BPN,-XY,AVG,FLN,-XY
## Full name: GSE12702,ACC,-XY,BPN,-XY,AVG,FLN,-XY
## Number of files: 40
## Names: GSM318773, GSM318774, GSM318775, ..., GSM318812 [40]
## Path (to the first file): totalAndFracBData/GSE12702,ACC,-XY,BPN,-XY,AVG,FLN,
## -XY/Mapping250K_Sty
## Total file size: 36.39 MB
## RAM: 0.04MB
# Sample annotation data according to GEO
sad <- read.table(sep="\t", header=TRUE, stringsAsFactors=FALSE, text="
sampleType sampleName
GSM318728 T Patient24
GSM318729 N Patient24
GSM318730 T Patient25
GSM318731 N Patient25
GSM318732 T Patient27
GSM318733 N Patient27
GSM318734 T Patient31
GSM318735 N Patient31
GSM318736 T Patient45
GSM318737 N Patient45
GSM318738 T Patient52
GSM318739 N Patient52
GSM318740 T Patient58
GSM318741 N Patient58
GSM318742 T Patient60
GSM318743 N Patient60
GSM318744 T Patient75
GSM318745 N Patient75
GSM318746 T Patient110
GSM318747 N Patient110
GSM318748 T Patient115
GSM318749 N Patient115
GSM318750 T Patient122
GSM318751 N Patient122
GSM318752 T Patient128
GSM318753 N Patient128
GSM318754 T Patient137
GSM318755 N Patient137
GSM318756 T Patient138
GSM318757 N Patient138
GSM318758 T Patient140
GSM318759 N Patient140
GSM318760 T Patient154
GSM318761 N Patient154
GSM318762 T Patient167
GSM318763 N Patient167
GSM318764 T Patient80
GSM318765 N Patient80
GSM318766 T Patient96
GSM318767 N Patient96
GSM318773 T Patient24
GSM318774 N Patient24
GSM318775 T Patient25
GSM318776 N Patient25
GSM318777 T Patient27
GSM318778 N Patient27
GSM318779 T Patient31
GSM318780 N Patient31
GSM318781 T Patient45
GSM318782 N Patient45
GSM318783 T Patient52
GSM318784 N Patient52
GSM318785 T Patient58
GSM318786 N Patient58
GSM318787 T Patient60
GSM318788 N Patient60
GSM318789 T Patient75
GSM318790 N Patient75
GSM318791 T Patient110
GSM318792 N Patient110
GSM318793 T Patient115
GSM318794 N Patient115
GSM318795 T Patient122
GSM318796 N Patient122
GSM318797 T Patient128
GSM318798 N Patient128
GSM318799 T Patient137
GSM318800 N Patient137
GSM318801 T Patient138
GSM318802 N Patient138
GSM318803 T Patient140
GSM318804 N Patient140
GSM318805 T Patient154
GSM318806 N Patient154
GSM318807 T Patient167
GSM318808 N Patient167
GSM318809 T Patient80
GSM318810 N Patient80
GSM318811 T Patient96
GSM318812 N Patient96
")
# Apply fullname translators
# this avoids having to rename the actual files
fnt <- function(names, ...) {
names <- gsub(",total", "", names)
data <- sad[names,]
paste(data$sampleName, data$sampleType, names, "total", sep=",")
}
dsList <- lapply(dsList, FUN=setFullNamesTranslator, fnt)
# Example without translators
# the names do not pair up across the two chip types
print(getFullNames(dsList[[1]], translate=FALSE)[1:2])
print(getFullNames(dsList[[2]], translate=FALSE)[1:2])
# Example with translators
# the names *do* pair up across the two chip types
print(getFullNames(dsList[[1]], translate=TRUE)[1:2])
print(getFullNames(dsList[[2]], translate=TRUE)[1:2])
# Order both datasets by <sampleName>,<sampleType>
names <- gsub(",GSM.*", "", getFullNames(dsList[[1]]))
dsList <- lapply(dsList, FUN=`[`, names)
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Split up in tumor-normal data sets
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Extract tumors and normals
dsTList <- lapply(dsList, FUN=function(ds) {
ds[sapply(ds, FUN=hasTag, "T")]
})
dsNList <- lapply(dsList, FUN=function(ds) {
ds[sapply(ds, FUN=hasTag, "N")]
})
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# (A) Paired tumor-normal segmentation when the tumors and the normals
# are hybridized on the same (Mapping250K_Nsp) chip type
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
dsT <- dsTList$Mapping250K_Nsp
dsN <- dsNList$Mapping250K_Nsp
seg <- CbsModel(dsT, ref=dsN)
print(seg)
ce <- ChromosomeExplorer(seg)
# Segment Chr8 of tumor GSM318736 (cf. FigS3 of the CalMaTe paper).
sampleName <- sad["GSM318736", "sampleName"]
idx <- match(sampleName, getNames(ce))
process(ce, arrays=idx, chromosomes=c(2,8), verbose=verbose)
# Look at the segmentation in ChromosomeExplorer
display(ce)
# Or open manually in the following directory
path <- print(getParent(getPath(ce), 2))
print(path)
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# (B) Paired tumor-normal segmentation when the tumors are hybridized
# on Mapping250K_Nsp and the normals on Mapping250K_Sty
#
# In order to segment, one needs TCN *ratios*. The problem is that the
# TCN ratios are calculated as C=2*T/N, but the tumor signals (T) and
# the normal signal (N) are not observered at the same loci because
# of the different chip types. NB: T and N are non-ratios here.
# A solution to this problem is to instead calculate the TCN ratios as
# C'=2*C'_T/C'_N where C'_T and C'_N in turn are TCN *ratios* for the
# tumors (C'_T) and the normals (C'_N) *estimated* at a common set of
# loci. We use C'_T and C'_N to indicated that the are binned TCN
# *ratios*, where the binning is common to the tumors and the normals.
# The C'_T ratios are binned based on tumor TCN *ratios* calculated
# as C_T = 2*T/R_T where T is the tumor signal and R_T is a reference
# TCN signal for the same chip type. The C'_N and C_N = 2*N/R_N ratios
# are obtained analogously.
# The success of this method relies on finding "good" references
# R_T and R_N for the two chip types. Ideally, R_T is from the same
# batch or lab as T, and likewise for R_N.
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Obtaining the reference R_T and R_N
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# We use the pool of a set of samples believed to behave rather normal.
# This can for instance be HapMap samples. In this example, in order
# avoid making this analysis too big, we simply use the set of normal
# samples in the GSE12702 data.
dsRList <- dsNList
# Calculate the reference as the average across reference samples
dfRList <- lapply(dsRList, FUN=getAverageFile)
print(dfRList)
## $Mapping250K_Nsp
## AromaUnitTotalCnBinaryFile:
## Name: .average-signals-median-mad
## Tags: 2851ec1b0e76f7366fd8b884df353bc9
## Full name: .average-signals-median-mad,2851ec1b0e76f7366fd8b884df353bc9
## Pathname: totalAndFracBData/GSE12702,ACC,-XY,BPN,-XY,AVG,FLN,-XY/Mapping250K_
## Nsp/.average-signals-median-mad,2851ec1b0e76f7366fd8b884df353bc9.asb
## File size: 1.00 MB (1049713 bytes)
## RAM: 0.00 MB
## Number of data rows: 262338
## File format: v1
## Dimensions: 262338x1
## Column classes: double
## Number of bytes per column: 4
## Footer: <createdOn>20120825 17:19:49 PDT</createdOn><platform>Affymetrix</platfo
## rm><chipType>Mapping250K_Nsp</chipType><srcDetails><nbrOfFiles>20</nbrOfFiles><c
## heckSum>39d886127600a8bce196842a61f477c5</checkSum></srcDetails><params><meanNam
## e>median</meanName><sdName>mad</sdName></params>
## Platform: Affymetrix
## Chip type: Mapping250K_Nsp
##
## $Mapping250K_Sty
## AromaUnitTotalCnBinaryFile:
## Name: .average-signals-median-mad
## Tags: 2851ec1b0e76f7366fd8b884df353bc9
## Full name: .average-signals-median-mad,2851ec1b0e76f7366fd8b884df353bc9
## Pathname: totalAndFracBData/GSE12702,ACC,-XY,BPN,-XY,AVG,FLN,-XY/Mapping250K_
## Sty/.average-signals-median-mad,2851ec1b0e76f7366fd8b884df353bc9.asb
## File size: 931.52 kB (953873 bytes)
## RAM: 0.00 MB
## Number of data rows: 238378
## File format: v1
## Dimensions: 238378x1
## Column classes: double
## Number of bytes per column: 4
## Footer: <createdOn>20120825 17:19:58 PDT</createdOn><platform>Affymetrix</platfo
## rm><chipType>Mapping250K_Sty</chipType><srcDetails><nbrOfFiles>20</nbrOfFiles><c
## heckSum>e5860cd91603c1b77757e97b419a3368</checkSum></srcDetails><params><meanNam
## e>median</meanName><sdName>mad</sdName></params>
## Platform: Affymetrix
## Chip type: Mapping250K_Sty
# Here, R_T = dfRList$Mapping250K_Nsp and R_N = dfRList$Mapping250K_Sty
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Calculating TCN *ratios* C_T = 2*T/R_T and C_N = 2*N/R_N
# where C_T is from Mapping250_Nsp and C_N is from Mapping250_Sty
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# C_T = 2*T/R_T
dsT <- dsTList$Mapping250K_Nsp
dfRT <- dfRList$Mapping250K_Nsp
dsCT <- exportTotalCnRatioSet(dsT, ref=dfRT)
# Workaround: Currently exportTotalCnRatioSet() returns everything
# it finds in the output directory and not (as it should) only the
# samples corresponding to the input data set. The following does
# the latter for us, in this particular example (just in case).
names <- gsub(",GSM.*", "", getFullNames(dsT))
dsCT <- dsCT[names]
# C_N = 2*N/R_N
dsN <- dsNList$Mapping250K_Sty
dfRN <- dfRList$Mapping250K_Sty
dsCN <- exportTotalCnRatioSet(dsN, ref=dfRN)
# Same workaround
names <- gsub(",GSM.*", "", getFullNames(dsN))
dsCN <- dsCN[names]
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# TCN binning of C_T and C_N independently but to a common set of loci
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# To bin only one sample, subset dsCT and dsCN here, e.g.
sampleName <- sad["GSM318736", "sampleName"]; # => Patient45
dsCT <- dsCT[sampleName]
dsCN <- dsCN[sampleName]
# Bin one chip type to the other. If one chip type has many more
# loci than another, bin the former to the latter to increase the
# chances for input loci mapping to target bins.
# NB: Despite this, it is likely that there will be target bins
# for which no input loci fall within, particularly if the two
# chip types have approximately the same number of loci. For similar
# reasons, it is likely that for some target bins there will be
# multiple loci mapping. In other words, the number of loci averaged
# will differ between bins. The more loci averaged over, the
# more precise the binned estimate will be. Because of this, these
# bin counts (or standard error estimates) would ideally be passed
# on to the segmentation method to give different bin estimates
# different weights in the segmentation. This is currently not
# done, mainly because none of the bin counts are recorded by
# TotalCnBinnedSmoothing.
ugpCT <- getAromaUgpFile(dsCT)
ugpCN <- getAromaUgpFile(dsCN)
if (nbrOfUnits(ugpCT) > nbrOfUnits(ugpCN)) {
targetUgp <- ugpCN
} else {
targetUgp <- ugpCT
}
# Alternatively, one can bin to uniformly distributed set of loci,
# e.g. 10kb bins. The downside of this is that both chip types
# have to be binned, which means longer processing times. Example:
# targetUgp <- AromaUgpFile$byChipType("GenericHuman", tags="10kb")
# Calculate C'_T by binning C_T ratios
isBinningNeeded <- (!equals(getAromaUgpFile(dsCT), targetUgp))
if (isBinningNeeded) {
tcsCT <- TotalCnBinnedSmoothing(dsCT, targetUgp=targetUgp)
print(tcsCT)
dsCTs <- process(tcsCT, verbose=verbose)
} else {
dsCTs <- dsCT
}
print(dsCTs)
# Calculate C'_N by binning C_N ratios
isBinningNeeded <- (!equals(getAromaUgpFile(dsCN), targetUgp))
if (isBinningNeeded) {
tcsCN <- TotalCnBinnedSmoothing(dsCN, targetUgp=targetUgp)
print(tcsCN)
dsCNs <- process(tcsCN, verbose=verbose)
print(dsCNs)
} else {
dsCNs <- dsCN
}
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Paired tumor-normal segmentation of C' = 2*C'_T / C'_N
# (with tumors orginally from Nsp and normals originally from Sty)
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Just to short then output names in the segmentation, because
# they'll get really long otherwise.
fnt <- function(names, ...) {
# Drop GSM... and ref=... tags
gsub(",GSM[0-9]*,ref=[^,]*,[^,]*", "", names)
}
setFullNamesTranslator(dsCTs, fnt)
setFullNamesTranslator(dsCNs, fnt)
# Add custom tags to segmentation output to indiciate whether
# one or the other chip types was mapped to the other.
segTags <- NULL
if (!equals(dsCTs, dsCT) && !equals(dsCNs, dsCN)) {
} else if (!equals(dsCTs, dsCT)) {
segTags <- "remappedT"
} else if (!equals(dsCNs, dsCN)) {
segTags <- "remappedN"
}
# Setup paired tumor-normal segmentation model on binned data
segS <- CbsModel(dsCTs, ref=dsCNs, tags=c(segTags, "*"), maxNAFraction=1)
print(segS)
ce <- ChromosomeExplorer(segS)
# Segment Chr8 of tumor GSM318736 as in FigS3 of the CalMaTe paper.
sampleName <- sad["GSM318736", "sampleName"]
idx <- match(sampleName, getNames(ce))
process(ce, arrays=idx, chromosomes=c(8), verbose=verbose)
##########################################################################
# HISTORY:
# 2012-08-26
# o Added custom segmentation tags.
# o Verified that it works to use a target UGP that maps to one of the
# two chip types.
# 2012-08-25
# o (Re)created.
##########################################################################
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.