#' Generate a copy number profile by resampling
#'
#' Generate a copy number profile by resampling input data
#'
#' This function generates a random copy number profile of length 'length',
#' with 'nBkp' breakpoints randomly chosen. Between two breakpoints, the
#' profile is constant and taken among the different types of regions in
#' \code{regData}.
#'
#' Elements of \code{regData[["region"]]} must be of the form \code{"(C1,C2)"},
#' where \code{C1} denotes the minor copy number and \code{C2} denotes the
#' major copy number. For example, \describe{ \item{(1,1)}{Normal}
#' \item{(0,1)}{Hemizygous deletion} \item{(0,0)}{Homozygous deletion}
#' \item{(1,2)}{Single copy gain} \item{(0,2)}{Copy-neutral LOH}
#' \item{(2,2)}{Balanced two-copy gain} \item{(1,3)}{Unbalanced two-copy gain}
#' \item{(0,3)}{Single-copy gain with LOH} }
#'
#' If 'connex' is set to TRUE (the default), transitions between copy number
#' regions are constrained in such a way that for any breakpoint, one of the
#' minor and the major copy number does not change. Equivalently, this means
#' that all breakpoints can be seen in both total copy numbers and allelic
#' ratios.
#'
#' @param length length of the profile
#' @param nBkp number of breakpoints. If \code{NULL}, then argument \code{bkp}
#' is expected to be provided.
#' @param bkp a numeric vector of breakpoint positions that may be used to
#' bypass the breakpoint generation step. Defaults to \code{NULL}.
#' @param regData a data.frame containing copy number data for different types
#' of copy number regions. Columns:\describe{ \item{c}{Total copy number}
#' \item{b}{Allele B fraction (a.k.a. BAF)} \item{region}{a character value,
#' annotation label for the region. See Details.} \item{genotype}{the
#' (germline) genotype of SNPs. By definition, rows with missing genotypes are
#' interpreted as non-polymorphic loci (a.k.a. copy number probes).} }
#' @param regions a character vector of region labels that may be used to
#' bypass the region label generation step. Defaults to \code{NULL}.
#' @param regAnnot a data.frame containing annotation data for each copy number
#' region. Columns: \describe{ \item{region}{label of the form (must match
#' \code{regData[["region"]]}).} \item{freq}{frequency (in [0,1]) of this type
#' of region in the genome.} } If \code{NULL} (the default), frequencies of
#' regions (0,1), (0,2), (1,1) and (1,2) (the most common alterations) are set
#' to represent 90\% of the regions. \code{sum(regAnnot[["freq"]])} should be
#' 1.
#' @param minLength minimum length of region between breakpoints. Defaults to
#' 0.
#' @param regionSize If \code{regionSize>0}, breakpoints are included by pairs,
#' where the distance within pair is set to \code{regionSize}. \code{nBkp} is
#' then required to be an even number.
#' @param connex If \code{TRUE}, any two successive regions are constrained to
#' be connex in the (minor CN, major CN) space. See 'Details'.
#' @return A list with elements \describe{\item{profile}{the profile (a \code{length} by \code{2} data.frame
#' containing the same fields as the input data \code{regData}.} \item{bkp}{a
#' vector of bkp positions (the last row index before a breakpoint)}}
#' \item{regions}{a character vector of region labels}
#' @author Morgane Pierre-Jean and Pierre Neuvial
#' @importFrom acnr loadCnRegionData
#' @importFrom acnr listDataSets
#'
#' @references Pierre-Jean, M, Rigaill, G. J. and Neuvial, P. (2015). "Performance
#' Evaluation of DNA Copy Number Segmentation Methods." *Briefings in
#' Bioinformatics*, no. 4: 600-615.
#'
#' @examples
#'
#' affyDat <- acnr::loadCnRegionData(dataSet="GSE29172", tumorFraction=1)
#' sim <- getCopyNumberDataByResampling(len=1e4, nBkp=5, minLength=100, regData=affyDat)
#' plotSeg(sim$profile, sim$bkp)
#'
#' ## another run with identical parameters
#' bkp <- sim$bkp
#' regions <- sim$regions
#' sim2 <- getCopyNumberDataByResampling(len=1e4, bkp=bkp, regData=affyDat, regions=regions)
#' plotSeg(sim2$profile, bkp)
#'
#' ## change tumor fraction but keep same "truth"
#' affyDatC <- acnr::loadCnRegionData(dataSet="GSE29172", tumorFraction=0.5)
#' simC <- getCopyNumberDataByResampling(len=1e4, bkp=bkp, regData=affyDatC, regions=regions)
#' plotSeg(simC$profile, bkp)
#'
#' ## restrict to only normal, single copy gain, and copy-neutral LOH
#' ## with the same bkp
#' affyDatR <- subset(affyDat, region %in% c("(1,1)", "(0,2)", "(1,2)"))
#' simR <- getCopyNumberDataByResampling(len=1e4, bkp=bkp, regData=affyDatR)
#' plotSeg(simR$profile, bkp)
#'
#' ## Same 'truth', on another dataSet
#' regions <- simR$regions
#' illuDat <- acnr::loadCnRegionData(dataSet="GSE11976", tumorFraction=1)
#' sim <- getCopyNumberDataByResampling(len=1e4, bkp=bkp, regData=illuDat, regions=regions)
#' plotSeg(sim$profile, sim$bkp)
#'
#' @export getCopyNumberDataByResampling
getCopyNumberDataByResampling <- function(length, nBkp=NA, bkp=NULL, regData=NULL, regions=NULL, regAnnot=NULL, minLength=0, regionSize=0, connex=TRUE){
## Sanity check: lengths
lb <- length(bkp)
if ((lb==0) & (is.na(nBkp)) & (is.null(regions))) {
stop("Please provide one of 'nBkp', 'bkp', or 'regions'")
}
ls <- length(regions)
if (ls>0) { ## 'regions' is not NULL
if (lb>0) { ## 'bkp' is not NULL
if (lb+1!=ls) {
stop("length(regions) should be equal to length(bkp)+1")
}
} else { ## 'bkp' is NULL
if (is.na(nBkp)) {
nBkp <- ls-1L
}
if (nBkp+1!=ls) {
stop("length(regions) should be equal to length(bkp)+1")
}
}
}
rm(ls, lb) ## not used anymore
## Sanity check: consistent region names
regNames <- unique(regData[["region"]])
if (!is.null(regions)) {
mm <- match(regions, regNames)
if (any(is.na(mm))) {
warning("Regions in argument 'regions' not found in 'regData[[\"region\"]]'\nSetting argument 'regions' to NULL")
regions <- NULL
}
}
if (is.null(regAnnot)) {
commonReg <- intersect(c("(0,1)", "(0,2)", "(1,1)", "(1,2)"), regNames)
nCommonReg <- length(commonReg)
## check if common region are in regDat
if (nCommonReg>0) {
freqC <- rep(0.90/nCommonReg, nCommonReg)
otherReg <- setdiff(regNames, commonReg)
nOtherRef <- length(otherReg)
freqO <- rep(0.10/nOtherRef, nOtherRef)
regAnnot <- data.frame(region=c(commonReg, otherReg),
freq=c(freqC, freqO),
stringsAsFactors=FALSE)
} else {
regAnnot <- data.frame(region=regNames,
freq=1/length(regNames),
stringsAsFactors=FALSE)
}
} else {
mm <- match(regNames, regAnnot[["region"]])
if (any(is.na(mm))) {
stop("Annotation not found for: ", regNames[which(is.na(mm))])
}
freq <- regAnnot[["freq"]]
if (min(freq)<0 || max(freq)>1) {
stop("Elements of regAnnot[[\"freq\"]] should be in [0,1]")
}
if (sum(freq)>1) {
stop("Elements of regAnnot[[\"freq\"]] should sum up to 1")
}
}
## Sanity check: regions
if (!is.null(regions)) {
mm <- match(regNames, regAnnot[["region"]])
}
## Order regAnnot as regNames
o <- order(regAnnot$region)
regAnnot <- regAnnot[o,]
pattern <- "\\(([0-9]),([0-9])\\)"
regAnnot$C1 <- as.numeric(gsub(pattern, "\\1", regNames))
regAnnot$C2 <- as.numeric(gsub(pattern, "\\2", regNames))
ww <- which(is.na(regAnnot$C1) | is.na(regAnnot$C2))
if (length(ww)==length(regNames)) {
stop("Could not retrieve minor and major CNs from regData[[\"region\"]]\nSee ?getCopyNumberDataByResampling")
} else if (length(ww)) {
warning("Could not retrieve region for: ", regNames[ww], "\nDropping these regions'")
regNames <- regNames[-ww]
regAnnot <- regAnnot[-ww, ]
}
names(regNames) <- regNames
regTypes <- lapply(regNames, FUN=function(reg) {
which(regData[["region"]]==reg)
})
if (is.null(bkp)) {
## Choose the breakpoints
interval <- 1:(length-1)
u <- numeric(0)
if (regionSize==0){
while (length(u)<nBkp) {
j <- sample(x=interval, size=1 , replace=FALSE)
jend <- j
u <- c(u, j)
b.inf <- max(1, j-minLength)
b.sup <- min(length, j+minLength)
v <- b.inf:b.sup
interval <- setdiff(interval,v)
}
} else {
interval <-1:(length-regionSize-1)
while (length(u)<nBkp) {
j <- sample(x=interval, size=1, replace=FALSE)
jend <- j+regionSize
u <- c(u, j)
if (length(u)<nBkp) { ## add the other member of the pair
u <- c(u, jend)
b.inf <- max(1, j-minLength-regionSize)
b.sup <- min(length, jend+minLength)
v <- b.inf:b.sup
interval <- setdiff(interval, v)
}
}
}
## u <- sample(length-1, nBkp, replace=FALSE);
bkp <- sort(u);
}
bkpB <- c(0, bkp, length);
candidateRegions <- function(regName) {
if (is.null(regName)) return(regAnnot);
reg <- subset(regAnnot, region==regName)
d1 <- regAnnot[, "C1"]-reg[, "C1"]
d2 <- regAnnot[, "C2"]-reg[, "C2"]
## todo: make sure that all regions are connex...
if (connex){
ww <- which((d1 & !d2) | (!d1 & d2))
if (!length(ww)) {
stop("No candidate region found !")
}
} else {
ww <- which(regAnnot$region!=regName)
}
regAnnot[ww, ]
}
## Add the random piecewise linear profile
idxs <- NULL
regs <- NULL
region <- NULL
for (ii in 1:(length(bkp)+1)) {
n <- bkpB[ii+1]-bkpB[ii]
##meanII <- matrix(rnorm(dim), length(idxsII), dim, byrow=TRUE)
if (is.null(regions)) {
candReg <- candidateRegions(region)
probs <- candReg[["freq"]]/sum(candReg[["freq"]])
region <- sample(x=candReg[["region"]], prob=probs, size=1)
} else {
region <- regions[ii]
}
idxsII <- sample(x=regTypes[[region]], size=n, replace=TRUE)
idxs <- c(idxs, idxsII)
regs <- c(regs, region)
}
profile <- regData[idxs, ]
return(list(bkp=bkp, profile=profile, regions=regs))
}
############################################################################
## HISTORY:
## 2014-07-16
## Now, if 'regAnnot' is NULL (the default), frequencies of regions
## (0,1), (0,2), (1,1) and (1,2) (the most common alterations) are set
## to represent 90% of the regions.
## 2013-02-27
## o Added parameter 'connex' that forces adjacent regions to be connex if connex = TRUE
## 2013-01-23
## o Removed field 'position' from output.
## 2013-01-22
## o Added more sanity checks.
## 2013-01-16
## o Added arguments 'bkp' and 'regions' to allow for bypassing the breakpoint generation step.
## o Added argument 'regAnnot', through which theoretical frequencies
## for each CN regions can be specified.
## o Made the constraints on CN transitions more generic.
## 2013-01-09
## o Replaced all 'jumps' by 'bkp'.
## 2012-12-20
## o Added new argument 'regionSize'.
## 2012-12-01
## o Added example data files and script based on public data set GSE19539.
## 2012-11-25
## o Now adapts to the set of candidate regions provided as input data.
## o Renamed to 'getCopyNumberDataByResampling'.
## o Updated documentation
## o Added an example (requires non-public data for now).
## o Some code cleanups.
## 2012-10-19
## o Added some sanity checks for arguments.
## 2012-10-16
## o Created from randomProfile.R.
############################################################################
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.