####################################################################
## Author: Gro Nilsen, Knut Liestøl and Ole Christian Lingjærde.
## Maintainer: Gro Nilsen <gronilse@ifi.uio.no>
## License: Artistic 2.0
## Part of the copynumber package
## Reference: Nilsen and Liestøl et al. (2012), BMC Genomics
####################################################################
## Required by:
## Requires:
### findNN
### getMad
### pcf(not the main function, but the rest of the help functions found in the same document)
### handleMissing
### pullOutContent
## Main function for pcf-analysis to be called by the user
pcfPlain <- function(pos.data, kmin = 5, gamma = 40, normalize = TRUE, fast = TRUE, digits = 4, return.est = FALSE, verbose = TRUE) {
# Input could come from winsorize and thus be a list; check and possibly retrieve data frame wins.data
pos.data <- pullOutContent(pos.data, what = "wins.data")
# Make sure all data columns are numeric:
if (any(!sapply(pos.data, is.numeric))) {
stop("All input in pos.data must be numeric", call. = FALSE)
}
# Check data input:
stopifnot(ncol(pos.data) >= 2) # something is missing in input data
# Extract information from pos.data:
position <- pos.data[, 1]
cn.data <- as.matrix(pos.data[, -1])
nSample <- ncol(pos.data) - 1
sampleid <- colnames(pos.data)[-1]
nProbe <- length(position)
# save user's gamma
gamma0 <- gamma
sd <- rep(1, nSample) # sd is only used if normalize=TRUE, and then these values are replaced by MAD-sd
# If number of probes in entire data set is less than 100K, the MAD sd-estimate is calculated using all obs for every sample
# Only required if normalize=T
if (nProbe < 100000 && normalize) {
# calculate MAD-sd for each sample:
for (j in 1:nSample) {
sample.data <- pos.data[, j + 1]
sd[j] <- getMad(sample.data[!is.na(sample.data)], k = 25) # Take out missing values before calculating mad
}
} # endif
# Initialize
pcf.names <- c("pos", sampleid)
seg.names <- c("sampleID", "start.pos", "end.pos", "n.probes", "mean")
segments <- data.frame(matrix(nrow = 0, ncol = 5))
colnames(segments) <- seg.names
if (return.est) {
pcf.est <- matrix(nrow = 0, ncol = nSample)
}
# Run PCF separately for each sample:
for (i in 1:nSample) {
if (return.est) {
# Initialize:
yhat <- rep(NA, length(nProbe))
}
sample.data <- cn.data[, i]
# Remove probes with missing obs; Only run pcf on non-missing values
obs <- !is.na(sample.data)
obs.data <- sample.data[obs]
if (length(obs.data) > 0) { ## Make sure there are observations for this sample! If not, estimates are left NA as well
# If number of probes in entire data set is >= 100K, the MAD sd-estimate is calculated using obs in this arm for this sample.
# Only required if normalize=T
if (nProbe >= 100000 && normalize) {
sd[i] <- getMad(obs.data, k = 25)
}
# Scale gamma by variance if normalize is TRUE
use.gamma <- gamma0
if (normalize) {
use.gamma <- gamma0 * (sd[i])^2
}
# Must check that sd!=0 and sd!!=NA -> use.gamma=0/NA. If not, simply calculate mean of observations
if (use.gamma == 0 || is.na(use.gamma)) {
if (return.est) {
res <- list(Lengde = length(obs.data), sta = 1, mean = mean(obs.data), nIntervals = 1, yhat = rep(mean(obs.data)))
} else {
res <- list(Lengde = length(obs.data), sta = 1, mean = mean(obs.data), nIntervals = 1)
}
} else {
# Compute piecewise constant fit
# run fast approximate PCF if fast=TRUE and number of probes>400, or exact PCF otherwise
if (!fast || length(obs.data) < 400) {
# Exact PCF:
res <- exactPcf(y = obs.data, kmin = kmin, gamma = use.gamma, yest = return.est)
} else {
# Run fast PCF:
res <- selectFastPcf(x = obs.data, kmin = kmin, gamma = use.gamma, yest = return.est)
} # endif
} # endif
# Retrieve segment info from results:
seg.start <- res$sta
seg.stop <- c(seg.start[-1] - 1, length(obs.data))
seg.npos <- res$Lengde
seg.mean <- res$mean
nSeg <- res$nIntervals
if (return.est) {
yhat[obs] <- res$yhat
}
# Find genomic positions for start and stop of each segment:
pos.start <- position[seg.start]
pos.stop <- position[seg.stop]
# Handle missing values:
if (any(!obs)) {
# first find nearest non-missing neighbour for missing probes:
nn <- findNN(pos = position, obs = obs)
# Include probes with missing values in segments where their nearest neighbour probes are located
new.res <- handleMissing(nn = nn, pos = position, obs = obs, pos.start = pos.start, pos.stop = pos.stop, seg.npos = seg.npos)
pos.start <- new.res$pos.start
pos.stop <- new.res$pos.stop
seg.npos <- new.res$seg.npos
if (return.est) {
yhat[!obs] <- yhat[nn]
}
}
} else {
warning(paste("pcf is not run for sample ", i, " because all observations are missing. NA is returned.", sep = ""), immediate. = TRUE, call. = FALSE)
seg.start <- 1
seg.stop <- nProbe
pos.start <- position[seg.start]
pos.stop <- position[seg.stop]
nSeg <- 1
seg.mean <- NA
seg.npos <- nProbe
}
# Round:
seg.mean <- round(seg.mean, digits = digits)
# Add results for this sample to results for other samples in data frame:
seg <- data.frame(rep(sampleid[i], nSeg), pos.start, pos.stop, seg.npos, seg.mean, stringsAsFactors = FALSE)
colnames(seg) <- seg.names
segments <- rbind(segments, seg)
if (return.est) {
# Rounding:
yhat <- round(yhat, digits = digits)
pcf.est <- cbind(pcf.est, yhat)
}
} # endfor
if (verbose) {
cat(paste("pcf finished for sample ", i, sep = ""), "\n")
}
if (return.est) {
pcf.est <- data.frame(position, pcf.est, stringsAsFactors = FALSE)
colnames(pcf.est) <- pcf.names
return(list(estimates = pcf.est, segments = segments))
} else {
return(segments)
}
} # endfunction
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.