11.doASCRMAv2,TCNs,500K.R
In aroma.affymetrix: Analysis of Large Affymetrix Microarray Data Sets

##########################################################################
# Segmentation of total copy-number ratios of tumors where tumors
# are hybridized on one chip type and the reference samples on another.
#
# Author: Henrik Bengtsson
# Created on: 2012-08-25
# Last updated: 2012-08-26
#
# DATA SET:
# GEO data set 'GSE12702'. Affymetrix CEL files are available from:
#
#   https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE12702
#
# Place them in rawData/GSE12702/Mapping250K_{Nsp|Sty}/*.CEL.  In total
# there are 20 tumor-normal pairs (40 CEL files) per chip type.
#
# ANNOTATION DATA [1]:
# annotationData/
#  chipTypes/
#   Mapping250K_Nsp/
#    Mapping250K_Nsp,na31,HB20101007.ugp
#   Mapping250K_Sty/
#    Mapping250K_Sty,na31,HB20101007.ugp
#  (GenericHuman/
#    GenericHuman,10kb,HB20090503.ugp.gz)
#
# [1] https://aroma-project.org/data/annotationData/chipTypes/
##########################################################################
library("aroma.affymetrix")
library("aroma.cn")
verbose <- Arguments$getVerbose(-8, timestamp=TRUE)


# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# AS-CRMAv2
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
dataSet <- "GSE12702"
chipTypes <- c("Mapping250K_Nsp", "Mapping250K_Sty")
dsList <- lapply(chipTypes, FUN=function(chipType) {
  dsNList <- doASCRMAv2(dataSet, chipType=chipType, verbose=verbose)
  dsNList$total
})
names(dsList) <- chipTypes
print(dsList)
## $Mapping250K_Nsp
## AromaUnitTotalCnBinarySet:
## Name: GSE12702
## Tags: ACC,-XY,BPN,-XY,AVG,FLN,-XY
## Full name: GSE12702,ACC,-XY,BPN,-XY,AVG,FLN,-XY
## Number of files: 40
## Names: GSM318728, GSM318729, GSM318730, ..., GSM318767 [40]
## Path (to the first file): totalAndFracBData/GSE12702,ACC,-XY,BPN,-XY,AVG,FLN,
## -XY/Mapping250K_Nsp
## Total file size: 40.05 MB
## RAM: 0.04MB
##
## $Mapping250K_Sty
## AromaUnitTotalCnBinarySet:
## Name: GSE12702
## Tags: ACC,-XY,BPN,-XY,AVG,FLN,-XY
## Full name: GSE12702,ACC,-XY,BPN,-XY,AVG,FLN,-XY
## Number of files: 40
## Names: GSM318773, GSM318774, GSM318775, ..., GSM318812 [40]
## Path (to the first file): totalAndFracBData/GSE12702,ACC,-XY,BPN,-XY,AVG,FLN,
## -XY/Mapping250K_Sty
## Total file size: 36.39 MB
## RAM: 0.04MB


# Sample annotation data according to GEO
sad <- read.table(sep="\t", header=TRUE, stringsAsFactors=FALSE, text="
sampleType	sampleName
GSM318728	T	Patient24
GSM318729	N	Patient24
GSM318730	T	Patient25
GSM318731	N	Patient25
GSM318732	T	Patient27
GSM318733	N	Patient27
GSM318734	T	Patient31
GSM318735	N	Patient31
GSM318736	T	Patient45
GSM318737	N	Patient45
GSM318738	T	Patient52
GSM318739	N	Patient52
GSM318740	T	Patient58
GSM318741	N	Patient58
GSM318742	T	Patient60
GSM318743	N	Patient60
GSM318744	T	Patient75
GSM318745	N	Patient75
GSM318746	T	Patient110
GSM318747	N	Patient110
GSM318748	T	Patient115
GSM318749	N	Patient115
GSM318750	T	Patient122
GSM318751	N	Patient122
GSM318752	T	Patient128
GSM318753	N	Patient128
GSM318754	T	Patient137
GSM318755	N	Patient137
GSM318756	T	Patient138
GSM318757	N	Patient138
GSM318758	T	Patient140
GSM318759	N	Patient140
GSM318760	T	Patient154
GSM318761	N	Patient154
GSM318762	T	Patient167
GSM318763	N	Patient167
GSM318764	T	Patient80
GSM318765	N	Patient80
GSM318766	T	Patient96
GSM318767	N	Patient96
GSM318773	T	Patient24
GSM318774	N	Patient24
GSM318775	T	Patient25
GSM318776	N	Patient25
GSM318777	T	Patient27
GSM318778	N	Patient27
GSM318779	T	Patient31
GSM318780	N	Patient31
GSM318781	T	Patient45
GSM318782	N	Patient45
GSM318783	T	Patient52
GSM318784	N	Patient52
GSM318785	T	Patient58
GSM318786	N	Patient58
GSM318787	T	Patient60
GSM318788	N	Patient60
GSM318789	T	Patient75
GSM318790	N	Patient75
GSM318791	T	Patient110
GSM318792	N	Patient110
GSM318793	T	Patient115
GSM318794	N	Patient115
GSM318795	T	Patient122
GSM318796	N	Patient122
GSM318797	T	Patient128
GSM318798	N	Patient128
GSM318799	T	Patient137
GSM318800	N	Patient137
GSM318801	T	Patient138
GSM318802	N	Patient138
GSM318803	T	Patient140
GSM318804	N	Patient140
GSM318805	T	Patient154
GSM318806	N	Patient154
GSM318807	T	Patient167
GSM318808	N	Patient167
GSM318809	T	Patient80
GSM318810	N	Patient80
GSM318811	T	Patient96
GSM318812	N	Patient96
")

# Apply fullname translators
# this avoids having to rename the actual files
fnt <- function(names, ...) {
  names <- gsub(",total", "", names)
  data <- sad[names,]
  paste(data$sampleName, data$sampleType, names, "total", sep=",")
}
dsList <- lapply(dsList, FUN=setFullNamesTranslator, fnt)

# Example without translators
# the names do not pair up across the two chip types
print(getFullNames(dsList[[1]], translate=FALSE)[1:2])
print(getFullNames(dsList[[2]], translate=FALSE)[1:2])

# Example with translators
# the names *do* pair up across the two chip types
print(getFullNames(dsList[[1]], translate=TRUE)[1:2])
print(getFullNames(dsList[[2]], translate=TRUE)[1:2])


# Order both datasets by <sampleName>,<sampleType>
names <- gsub(",GSM.*", "", getFullNames(dsList[[1]]))
dsList <- lapply(dsList, FUN=`[`, names)


# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Split up in tumor-normal data sets
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Extract tumors and normals
dsTList <- lapply(dsList, FUN=function(ds) {
  ds[sapply(ds, FUN=hasTag, "T")]
})

dsNList <- lapply(dsList, FUN=function(ds) {
  ds[sapply(ds, FUN=hasTag, "N")]
})


# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# (A) Paired tumor-normal segmentation when the tumors and the normals
#     are hybridized on the same (Mapping250K_Nsp) chip type
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
dsT <- dsTList$Mapping250K_Nsp
dsN <- dsNList$Mapping250K_Nsp
seg <- CbsModel(dsT, ref=dsN)
print(seg)

ce <- ChromosomeExplorer(seg)

# Segment Chr8 of tumor GSM318736 (cf. FigS3 of the CalMaTe paper).
sampleName <- sad["GSM318736", "sampleName"]
idx <- match(sampleName, getNames(ce))
process(ce, arrays=idx, chromosomes=c(2,8), verbose=verbose)

# Look at the segmentation in ChromosomeExplorer
display(ce)

# Or open manually in the following directory
path <- print(getParent(getPath(ce), 2))
print(path)



# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# (B) Paired tumor-normal segmentation when the tumors are hybridized
#     on Mapping250K_Nsp and the normals on Mapping250K_Sty
#
# In order to segment, one needs TCN *ratios*. The problem is that the
# TCN ratios are calculated as C=2*T/N, but the tumor signals (T) and
# the normal signal (N) are not observered at the same loci because
# of the different chip types.  NB: T and N are non-ratios here.
#   A solution to this problem is to instead calculate the TCN ratios as
# C'=2*C'_T/C'_N where C'_T and C'_N in turn are TCN *ratios* for the
# tumors (C'_T) and the normals (C'_N) *estimated* at a common set of
# loci.  We use C'_T and C'_N to indicated that the are binned TCN
# *ratios*, where the binning is common to the tumors and the normals.
#   The C'_T ratios are binned based on tumor TCN *ratios* calculated
# as C_T = 2*T/R_T where T is the tumor signal and R_T is a reference
# TCN signal for the same chip type. The C'_N and C_N = 2*N/R_N ratios
# are obtained analogously.
#   The success of this method relies on finding "good" references
# R_T and R_N for the two chip types.  Ideally, R_T is from the same
# batch or lab as T, and likewise for R_N.
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Obtaining the reference R_T and R_N
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# We use the pool of a set of samples believed to behave rather normal.
# This can for instance be HapMap samples.  In this example, in order
# avoid making this analysis too big, we simply use the set of normal
# samples in the GSE12702 data.
dsRList <- dsNList

# Calculate the reference as the average across reference samples
dfRList <- lapply(dsRList, FUN=getAverageFile)
print(dfRList)
## $Mapping250K_Nsp
## AromaUnitTotalCnBinaryFile:
## Name: .average-signals-median-mad
## Tags: 2851ec1b0e76f7366fd8b884df353bc9
## Full name: .average-signals-median-mad,2851ec1b0e76f7366fd8b884df353bc9
## Pathname: totalAndFracBData/GSE12702,ACC,-XY,BPN,-XY,AVG,FLN,-XY/Mapping250K_
## Nsp/.average-signals-median-mad,2851ec1b0e76f7366fd8b884df353bc9.asb
## File size: 1.00 MB (1049713 bytes)
## RAM: 0.00 MB
## Number of data rows: 262338
## File format: v1
## Dimensions: 262338x1
## Column classes: double
## Number of bytes per column: 4
## Footer: <createdOn>20120825 17:19:49 PDT</createdOn><platform>Affymetrix</platfo
## rm><chipType>Mapping250K_Nsp</chipType><srcDetails><nbrOfFiles>20</nbrOfFiles><c
## heckSum>39d886127600a8bce196842a61f477c5</checkSum></srcDetails><params><meanNam
## e>median</meanName><sdName>mad</sdName></params>
## Platform: Affymetrix
## Chip type: Mapping250K_Nsp
##
## $Mapping250K_Sty
## AromaUnitTotalCnBinaryFile:
## Name: .average-signals-median-mad
## Tags: 2851ec1b0e76f7366fd8b884df353bc9
## Full name: .average-signals-median-mad,2851ec1b0e76f7366fd8b884df353bc9
## Pathname: totalAndFracBData/GSE12702,ACC,-XY,BPN,-XY,AVG,FLN,-XY/Mapping250K_
## Sty/.average-signals-median-mad,2851ec1b0e76f7366fd8b884df353bc9.asb
## File size: 931.52 kB (953873 bytes)
## RAM: 0.00 MB
## Number of data rows: 238378
## File format: v1
## Dimensions: 238378x1
## Column classes: double
## Number of bytes per column: 4
## Footer: <createdOn>20120825 17:19:58 PDT</createdOn><platform>Affymetrix</platfo
## rm><chipType>Mapping250K_Sty</chipType><srcDetails><nbrOfFiles>20</nbrOfFiles><c
## heckSum>e5860cd91603c1b77757e97b419a3368</checkSum></srcDetails><params><meanNam
## e>median</meanName><sdName>mad</sdName></params>
## Platform: Affymetrix
## Chip type: Mapping250K_Sty

# Here, R_T = dfRList$Mapping250K_Nsp and R_N = dfRList$Mapping250K_Sty


# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Calculating TCN *ratios* C_T = 2*T/R_T and C_N = 2*N/R_N
# where C_T is from Mapping250_Nsp and C_N is from Mapping250_Sty
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# C_T = 2*T/R_T
dsT <- dsTList$Mapping250K_Nsp
dfRT <- dfRList$Mapping250K_Nsp
dsCT <- exportTotalCnRatioSet(dsT, ref=dfRT)
# Workaround: Currently exportTotalCnRatioSet() returns everything
# it finds in the output directory and not (as it should) only the
# samples corresponding to the input data set.  The following does
# the latter for us, in this particular example (just in case).
names <- gsub(",GSM.*", "", getFullNames(dsT))
dsCT <- dsCT[names]


# C_N = 2*N/R_N
dsN <- dsNList$Mapping250K_Sty
dfRN <- dfRList$Mapping250K_Sty
dsCN <- exportTotalCnRatioSet(dsN, ref=dfRN)
# Same workaround
names <- gsub(",GSM.*", "", getFullNames(dsN))
dsCN <- dsCN[names]


# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# TCN binning of C_T and C_N independently but to a common set of loci
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# To bin only one sample, subset dsCT and dsCN here, e.g.
  sampleName <- sad["GSM318736", "sampleName"]; # => Patient45
  dsCT <- dsCT[sampleName]
  dsCN <- dsCN[sampleName]

# Bin one chip type to the other. If one chip type has many more
# loci than another, bin the former to the latter to increase the
# chances for input loci mapping to target bins.
# NB: Despite this, it is likely that there will be target bins
# for which no input loci fall within, particularly if the two
# chip types have approximately the same number of loci. For similar
# reasons, it is likely that for some target bins there will be
# multiple loci mapping.  In other words, the number of loci averaged
# will differ between bins.  The more loci averaged over, the
# more precise the binned estimate will be.  Because of this, these
# bin counts (or standard error estimates) would ideally be passed
# on to the segmentation method to give different bin estimates
# different weights in the segmentation.  This is currently not
# done, mainly because none of the bin counts are recorded by
# TotalCnBinnedSmoothing.
ugpCT <- getAromaUgpFile(dsCT)
ugpCN <- getAromaUgpFile(dsCN)
if (nbrOfUnits(ugpCT) > nbrOfUnits(ugpCN)) {
  targetUgp <- ugpCN
} else {
  targetUgp <- ugpCT
}

# Alternatively, one can bin to uniformly distributed set of loci,
# e.g. 10kb bins.  The downside of this is that both chip types
# have to be binned, which means longer processing times.  Example:
#  targetUgp <- AromaUgpFile$byChipType("GenericHuman", tags="10kb")

# Calculate C'_T by binning C_T ratios
isBinningNeeded <- (!equals(getAromaUgpFile(dsCT), targetUgp))
if (isBinningNeeded) {
  tcsCT <- TotalCnBinnedSmoothing(dsCT, targetUgp=targetUgp)
  print(tcsCT)
  dsCTs <- process(tcsCT, verbose=verbose)
} else {
  dsCTs <- dsCT
}
print(dsCTs)

# Calculate C'_N by binning C_N ratios
isBinningNeeded <- (!equals(getAromaUgpFile(dsCN), targetUgp))
if (isBinningNeeded) {
  tcsCN <- TotalCnBinnedSmoothing(dsCN, targetUgp=targetUgp)
  print(tcsCN)
  dsCNs <- process(tcsCN, verbose=verbose)
  print(dsCNs)
} else {
  dsCNs <- dsCN
}


# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Paired tumor-normal segmentation of C' = 2*C'_T / C'_N
# (with tumors orginally from Nsp and normals originally from Sty)
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Just to short then output names in the segmentation, because
# they'll get really long otherwise.
fnt <- function(names, ...) {
  # Drop GSM... and ref=... tags
  gsub(",GSM[0-9]*,ref=[^,]*,[^,]*", "", names)
}
setFullNamesTranslator(dsCTs, fnt)
setFullNamesTranslator(dsCNs, fnt)

# Add custom tags to segmentation output to indiciate whether
# one or the other chip types was mapped to the other.
segTags <- NULL
if (!equals(dsCTs, dsCT) && !equals(dsCNs, dsCN)) {
} else if (!equals(dsCTs, dsCT)) {
  segTags <- "remappedT"
} else if (!equals(dsCNs, dsCN)) {
  segTags <- "remappedN"
}

# Setup paired tumor-normal segmentation model on binned data
segS <- CbsModel(dsCTs, ref=dsCNs, tags=c(segTags, "*"), maxNAFraction=1)
print(segS)

ce <- ChromosomeExplorer(segS)

# Segment Chr8 of tumor GSM318736 as in FigS3 of the CalMaTe paper.
sampleName <- sad["GSM318736", "sampleName"]
idx <- match(sampleName, getNames(ce))
process(ce, arrays=idx, chromosomes=c(8), verbose=verbose)


##########################################################################
# HISTORY:
# 2012-08-26
# o Added custom segmentation tags.
# o Verified that it works to use a target UGP that maps to one of the
#   two chip types.
# 2012-08-25
# o (Re)created.
##########################################################################
Any scripts or data that you put into this service are public.
aroma.affymetrix documentation built on May 29, 2024, 4:32 a.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
aroma.affymetrix
Analysis of Large Affymetrix Microarray Data Sets

inst/testScripts/addons/chipTypes/Mapping250K_Nsp,Sty/11.doASCRMAv2,TCNs,500K.R
In aroma.affymetrix: Analysis of Large Affymetrix Microarray Data Sets

Try the aroma.affymetrix package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

aroma.affymetrix Analysis of Large Affymetrix Microarray Data Sets

inst/testScripts/addons/chipTypes/Mapping250K_Nsp,Sty/11.doASCRMAv2,TCNs,500K.R In aroma.affymetrix: Analysis of Large Affymetrix Microarray Data Sets

Try the aroma.affymetrix package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

aroma.affymetrix
Analysis of Large Affymetrix Microarray Data Sets

inst/testScripts/addons/chipTypes/Mapping250K_Nsp,Sty/11.doASCRMAv2,TCNs,500K.R
In aroma.affymetrix: Analysis of Large Affymetrix Microarray Data Sets