#' Perform inference to detect differential regions in oligo experiments
#'
#' This is the main inference function that aims to find regions with
#' differential signal between two conditions. The two main steps of the
#' procedure are (1) detect candidate regions from the nucleotide level
#' signal, optionally smoothing to combat loss of power due to low coverage,
#' and (2) evaluate the test statistic of condition difference across each
#' candidate region by comparing to a global null distribution generated
#' by permuting sample labels.
#'
#' @param OligoSignal a data frame in the format returned by the function
#' \code{modelNucCounts}. Contains one row per nucleotide count.
#' The first column contains the basepair positions of the nucleotides and
#' the remaining columns hold the counts themselves (one column per sample).
#' @param conditionLabels character vector of length two which contains the
#' condition labels for the two conditions that are being compared.
#' @param minInSpan positive integer that represents the minimum number of
#' nucleotides in a smoothing span window if \code{smooth} is TRUE.
#' Default value is 10.
#' @param minNumRegion positive integer that represents the minimum number of
#' nucleotides to consider for a candidate region. Default value is 5.
#' @param cutoff scalar value that represents the absolute value (or a vector
#' of two numbers representing a lower and upper bound) for the cutoff of
#' the single nucleotide condition coefficient that is used to discover
#' candidate regions.
#' @param smooth logical value that indicates whether or not to smooth the
#' nucleotide level signal when discovering candidate regions.
#' Defaults to FALSE.
#' @param bpSpan a positive integer that represents the length in basepairs
#' of the smoothing span window if \code{smooth} is TRUE. Default value is
#' 100
#' @param verbose logical value that indicates whether additional progress
#' messages within each iteration should be printed to stdout. Default value
#' is FALSE.
#'@param quiet logical value that indicates whether a message is printed to
#' stdout regarding the completion of each permutation iteration. If FALSE
#' (default value) then messages will be printed. If TRUE, then messages
#' will not be printed (but additional messages will be printed within
#' each iteration if \code{verbose} is set to TRUE).
#' @param workers positive integer that represents the number of cores to
#' use if parallelization is desired of the smoothing step.
#' @param logT logical value that indicates whether to model the log2
#' transformed signal (plus a pseudocount of 1). Default is TRUE. Only
#' set to false if transformation has been done prior to running this
#' function, or if distribution of raw values looks relatively symmetric.
#' @param sampleSize positive integer that represents the number of samples
#' in each condition. Defaults to \code{(ncol(OligoSignal)-1)/2}.
#' @param maxPerms a positive integer that represents the maximum number
#' of permutations that will be used to generate the global null
#' distribution of test statistics.
#' @param onlyUp a logical value indicating whether to only consider differences
#' in the positive direction (signal in condition 1 - condition 2 > 0).
#' Default value is FALSE.
#' @param naive a logical value indicating whether to use naive region-level
#' statistic in step 2 that simply takes average of statistic in step 1
#' across the region, instead of the default, which calculates a new
#' statistic that jointly considers all loci in the region. Also, in step 1
#' the standard deviation among replicates is not considered.
#' @param altStat numeric value indicating whether to use alternate statistic
#' for single loci in constructing candidate regions that incorporates the
#' standard deviation among replicates. If 0 (default), differences in means
#' are used as the statistic. If 1, modified t-statistics (instead of
#' effect size estimates) will be used (t-stat = median difference / sd).
#' Since estimates of standard
#' deviations are noisy for small numbers of replicates, the estimates
#' are smoothed across neighboring loci (though the effect size estimates
#' themselves are not smoothed; that can be accomplished by setting
#' smooth=TRUE). If 2, Wilcoxon rank sum statistics are used. If 3,
#' then the same stat as in 1, but using median absolute deviation (MAD)
#' instead of SD.
#' @return a data.frame that contains the results of the inference. The
#' data.frame contains one row for each candidate region, and
#' 9 columns, in the following order: 1. chr =
#' region level labels such as chromosome, gene, or lncRNA, 2. start =
#' start basepair position of the region, 3. end = end basepair position
#' of the region,
#' 4. indexStart = the index of the region's starting nucleotide,
#' 5. indexEnd = the index of the region's ending nucleotide,
#' 6. length = the number of nucleotides contained in the region,
#' 7. stat = the test statistic for the condition difference,
#' 8. pval = the permutation p-value for the significance of the test
#' statistic, and 9. qval = the q-value for the test statistic (adjustment
#' for multiple comparisons to control false discovery rate).
#' @keywords inference
#' @inheritParams bumphunt
#' @importFrom stats model.matrix
#' @importFrom stats p.adjust
#' @importFrom utils combn
#' @export
#' @examples
#' \dontrun{
#' normalizedCounts <- normCounts(rawCounts = system.file("extdata",
#' "allTranscriptsCounts_Raw.tsv", package = "oligoGames"))
#' metaData <- system.file("extdata", "oligoMeta.tsv", package = "oligoGames")
#' oligoLen <- 110
#' conditionLabels <- c("Nuclei", "Total")
#' modeledNucs <- modelNucCounts(normalizedCounts, metaData,
#' conditionLabels, modelMethod = "median", oligoLen = 110)
#' DRregions <- DRfinder(modeledNucs, conditionLabels,
#' minNumRegion = 3, cutoff = 0.25, smooth = FALSE,
#' workers = 1, sampleSize = 4, maxPerms = 50, altStat=1)
#' }
DRfinder <- function(OligoSignal,
                     conditionLabels = c("condition1", "condition2"),
                     minInSpan = 10,
                     bpSpan = 100,
                     minNumRegion = 5,
                     cutoff = NULL,
                     smooth = FALSE,
                     verbose = FALSE,
                     quiet = FALSE,
                     workers = NULL,
                     sampleSize = (ncol(OligoSignal) - 1) / 2,
                     maxPerms = 50, logT = TRUE, coef = 2,
                     onlyUp = FALSE, altStat = 0,
                     naive = FALSE) {
  # Two-group design matrix (intercept + condition indicator); the first
  # 'sampleSize' count columns are condition 1, the remaining are condition 2.
  cond <- c(rep(conditionLabels[1], sampleSize),
            rep(conditionLabels[2], sampleSize))
  design <- model.matrix(~cond)

  oligo.mat <- OligoSignal[, -1]   # counts: one column per sample
  chr <- rownames(OligoSignal)     # region-level labels (chromosome/gene/...)
  pos <- OligoSignal[, 1]          # basepair positions
  rm(OligoSignal)

  # Use the largest observed gap between adjacent positions (+1) so that
  # clustering never splits a contiguous stretch of the signal.
  maxGap <- maxGapSmooth <- max(diff(pos)) + 1

  # Step 1: candidate regions and test statistics for the observed labels.
  res <- bumphunt(oligo.mat,
                  minInSpan = minInSpan,
                  design = design, chr = chr, pos = pos,
                  coef = coef, minNum = minInSpan,
                  maxGapSmooth = maxGapSmooth,
                  minNumRegion = minNumRegion,
                  cutoff = cutoff, maxGap = maxGap,
                  smooth = smooth,
                  verbose = verbose,
                  workers = workers, logT = logT,
                  sampleSize = sampleSize, altStat = altStat,
                  naive = naive)

  if (onlyUp) {
    # only keep regions with positive directionality
    res <- res[res$stat > 0, ]
  }

  if (nrow(design) %% 2 != 0)
    stop("Error: Currently only balanced designs supported")

  # Enumerate all ways of assigning 'sampleSize' samples to group 1.
  # Column 1 is the observed labeling, and each labeling is equivalent to
  # its complement, so only columns 2:(ncol/2) are retained.
  perms <- combn(seq(1, nrow(design)), sampleSize)
  perms <- perms[, 2:(ncol(perms) / 2)]
  res.flip <- NULL
  if (maxPerms < ncol(perms)) {
    # subset on 'balanced perms' (those mixing samples from both original
    # groups) before randomly sampling down to maxPerms
    if (sampleSize > 3 & sampleSize < 6) {
      sg <- apply(perms, 2, function(x) sum(x > sampleSize))
      perms <- perms[, sg < (sampleSize - 1) & sg >= 2]
      maxPerms <- min(maxPerms, ncol(perms))
    } else if (sampleSize >= 6) {
      sg <- apply(perms, 2, function(x) sum(x > sampleSize))
      perms <- perms[, sg >= floor(sampleSize / 2) &
                       sg <= ceiling(sampleSize / 2)]
    }
    perms <- perms[, sort(sample(seq_len(ncol(perms)), maxPerms,
                                 replace = FALSE))]
  }

  # Step 2: rerun region detection under each permuted design and pool the
  # resulting statistics into a global null distribution.
  for (j in seq_len(ncol(perms))) {
    reorder <- perms[, j]
    designr <- design
    designr[, 2] <- 0
    designr[reorder, 2] <- 1
    res.flip.p <- bumphunt(oligo.mat,
                           minInSpan = minInSpan,
                           design = designr, chr = chr, pos = pos,
                           coef = coef, minNum = minInSpan,
                           maxGapSmooth = maxGapSmooth,
                           minNumRegion = minNumRegion,
                           cutoff = cutoff, maxGap = maxGap,
                           smooth = smooth,
                           verbose = verbose,
                           workers = workers, logT = logT,
                           altStat = altStat, sampleSize = sampleSize,
                           naive = naive)
    # bumphunt returns NA when no candidate regions are found, so only
    # accumulate proper data.frame results
    if (is.data.frame(res.flip.p)) {
      if (nrow(res.flip.p) > 0) {
        res.flip.p$permNum <- j
        res.flip <- rbind(res.flip, res.flip.p)
      }
    }
    if (!quiet)
      message(paste0(j, " out of ", ncol(perms), " permutations completed"))
  }
  rm(res.flip.p)

  # Permutation p-value: fraction of null statistics at least as extreme as
  # each observed statistic (+1 in numerator and denominator so p > 0).
  perm.ordered <- c(sort(abs(res.flip$stat), method = "quick"), Inf)
  pval <- rep(NA, nrow(res))
  pval[!is.na(res$stat)] <- (1 +
    sapply(abs(res$stat[!is.na(res$stat)]),
           function(x) length(perm.ordered) -
             min(which(x <= perm.ordered)))) /
    (1 + sum(!is.na(res.flip$stat)))
  res$pval <- pval
  # Benjamini-Hochberg adjustment to control the false discovery rate.
  res$qval <- p.adjust(pval, method = "BH")
  return(res)
}
#' Detect and score candidate regions
#'
#' This is an internal workhorse function called by \code{DRfinder} that
#' calculates the nucleotide-level signal, and calls the \code{regionFinder}
#' function to determine candidate regions and score them.
#'
#' @param oligo.mat a matrix that contains the nucleotide level counts that
#' has one row per nucleotide and
#' one column per sample.
#' @param design a model matrix with one row per sample and one column per
#' independent covariate.
#' @param chr a character vector of labels for region-level characteristics,
#' with length equal to the number of rows in \code{oligo.mat} (and in the
#' same order). This can indicate the chromosome, gene, lncRNA, etc.
#' @param pos a numeric vector of basepair positions for each nucleotide in
#' \code{oligo.mat} (and in the same order).
#' @param coef positive integer that indicates which column of the design
#' matrix in \code{design} contains the condition covariate of interest
#' @param maxGap positive integer that indicates the maximum number of basepairs
#' that can separate two nucleotides before they will be divided into two
#' separate candidate regions. Defaults to 50.
#' @param maxGapSmooth positive integer that indicates the maximum number of
#' basepairs
#' that can separate two nucleotides before they will be divided into two
#' separate smoothing regions. Defaults to 50.
#' @param minNum positive integer that represents the minimum number of
#' nucleotides overall in a region to be smoothed (if \code{smooth} is TRUE).
#' Default value is 10
#' @return a data.frame that contains the results of region detection. The
#' data.frame contains one row for each candidate region, and
#' 7 columns, in the following order: 1. chr =
#' region level labels such as chromosome, gene, or lncRNA, 2. start =
#' start basepair position of the region, 3. end = end basepair position
#' of the region,
#' 4. indexStart = the index of the region's starting nucleotide,
#' 5. indexEnd = the index of the region's ending nucleotide,
#' 6. length = the number of nucleotides contained in the region,
#' and 7. stat = the test statistic for the condition difference.
#' @inheritParams DRfinder
#' @import bumphunter
#' @import foreach
#' @import doParallel
#' @importFrom S4Vectors runmean
#' @importFrom S4Vectors Rle
#' @importFrom matrixStats rowVars
#' @keywords inference
##The WGBS version of the 'BumphunterEngine' function with 'LM' and 'GLM'
bumphunt <- function(oligo.mat, design,
                     chr = NULL, pos, coef = 2, minInSpan = 10, minNum = 10,
                     minNumRegion = 5,
                     cutoff = NULL, maxGap = 50, maxGapSmooth = 50,
                     smooth = FALSE, bpSpan = 100,
                     verbose = TRUE, workers = NULL,
                     logT = TRUE, altStat = 0, sampleSize,
                     naive = FALSE, ...)
{
  # ---- input validation ----
  if (!is.matrix(oligo.mat))
    stop("'oligo.mat' must be a matrix.")
  if (ncol(oligo.mat) != nrow(design))
    stop("Total number of columns in 'oligo.mat' must match number of rows of 'design'")
  if (!(is.null(cutoff) || length(cutoff) %in% 1:2))
    stop("'cutoff' has to be either NULL or a vector of length 1 or 2")
  if (length(cutoff) == 2)
    cutoff <- sort(cutoff)

  # ---- parallel backend setup ----
  if (!getDoParRegistered())
    registerDoSEQ()
  # NOTE(review): this registers a parallel backend unconditionally, which
  # overrides the sequential backend registered just above — confirm this is
  # intended rather than belonging inside the if() above.
  registerDoParallel()
  if (is.null(workers)) { workers <- 1 }
  backend <- getDoParName()
  version <- getDoParVersion()
  subverbose <- max(as.integer(verbose) - 1L, 0)
  if (verbose) {
    if (workers == 1) {
      mes <- "Using a single core (backend: %s, version: %s)."
      message(sprintf(mes, backend, version))
    } else {
      mes <- "Parallelizing using %s workers/cores (backend: %s, version: %s)."
      message(sprintf(mes, workers, backend, version))
    }
  }

  if (is.null(chr))
    chr <- rep("Unspecified", length(pos))
  if (is.factor(chr))
    chr <- as.character(chr)
  # Group nucleotides into clusters separated by more than maxGap basepairs.
  cluster <- bumphunter:::clusterMaker(factor(chr, levels = unique(chr)), pos,
                                       maxGap = maxGap, assumeSorted = TRUE)
  if (verbose)
    message("Computing coefficients.")

  # Smooth per-locus spread estimates across neighboring loci, flooring them
  # at minSD (adapted from an internal function of the bsseq package).
  smoothSd <- function(Sds, k, minSD) {
    k0 <- floor(k / 2)
    if (all(is.na(Sds))) return(Sds)
    thresSD <- pmax(Sds, minSD, na.rm = TRUE)
    addSD <- rep(minSD, k0)
    as.vector(runmean(Rle(c(addSD, thresSD, addSD)), k = k))
  }

  altStat <- as.numeric(altStat)
  if (altStat %in% c(1, 3)) {
    # Modified t-statistic per locus: median difference divided by a
    # smoothed spread estimate (pooled SD for altStat == 1, pooled MAD
    # for altStat == 3).
    g1 <- which(design[, coef] == 1)
    g2 <- which(design[, coef] == 0)
    logMat <- log2(oligo.mat + 1)
    group1.means <- apply(logMat[, g1, drop = FALSE], 1,
                          function(x) median(x, na.rm = FALSE))
    group2.means <- apply(logMat[, g2, drop = FALSE], 1,
                          function(x) median(x, na.rm = FALSE))
    if (altStat == 1) {
      # pooled standard deviation across the two groups
      rawSds <- sqrt(((length(g1) - 1) * rowVars(logMat, cols = g1) +
                        (length(g2) - 1) * rowVars(logMat, cols = g2)) /
                       (length(g1) + length(g2) - 2))
    } else {
      # pooled median absolute deviation across the two groups
      group1.MAD <- apply(logMat[, g1, drop = FALSE], 1,
                          function(x) median(abs(x - median(x, na.rm = FALSE))))
      group2.MAD <- apply(logMat[, g2, drop = FALSE], 1,
                          function(x) median(abs(x - median(x, na.rm = FALSE))))
      rawSds <- ((length(g1) - 1) * group1.MAD +
                   (length(g2) - 1) * group2.MAD) /
        (length(g1) + length(g2) - 2)
    }
    # Smooth the spread estimates within each cluster, floored at the
    # overall median spread.
    clusterIdx <- split(seq(along = cluster), cluster)
    sdThresh <- quantile(rawSds, 0.50, na.rm = TRUE)
    smoothSds <- unlist(lapply(clusterIdx, function(idx) {
      smoothSd(rawSds[idx], k = 5, minSD = sdThresh)
    }))
    scale <- sqrt(1 / length(g1) + 1 / length(g2))
    tstat.sd <- smoothSds * scale
    if (!naive) {
      tstat <- (group1.means - group2.means) / tstat.sd
    } else {
      # naive mode: raw difference in group medians
      tstat <- (group1.means - group2.means)
    }
    # zero spread makes the statistic undefined
    is.na(tstat)[tstat.sd == 0] <- TRUE
    est <- data.frame(coef = tstat, stdev.unscaled = rawSds, sigma = scale)
  } else if (altStat == 2) {
    # Wilcoxon rank-sum statistic per locus, centered and scaled by
    # sampleSize^2 (the maximum attainable value for a balanced design).
    g1 <- which(design[, coef] == 1)
    g2 <- which(design[, coef] == 0)
    est1 <- apply(log2(oligo.mat + 1), 1, function(x)
      suppressWarnings(wilcox.test(x[g1], x[g2], exact = FALSE,
                                   correct = FALSE)$statistic))
    est <- data.frame(coef = (est1 - sampleSize^2 / 2) / sampleSize^2)
  } else if (altStat == 0) {
    # Default: per-locus regression coefficient of the condition covariate.
    if (logT) {
      est <- bumphunter:::.getEstimate(log2(oligo.mat + 1),
                                       design, coef, full = TRUE)
    } else {
      est <- bumphunter:::.getEstimate(oligo.mat,
                                       design, coef, full = TRUE)
    }
  } else {
    # previously an unrecognized altStat crashed later with "object 'est'
    # not found"; fail fast with a clear message instead
    stop("'altStat' must be 0, 1, 2, or 3")
  }

  rawBeta <- est$coef
  cluster <- as.integer(cluster)
  if (smooth) {
    if (altStat == 2) {
      stop("Error: smoothing for altStat 2 (Wilcoxon) not implemented")
    }
    # Smoothing weights from the per-locus SDs; near-zero SDs are replaced
    # by the mean of the remaining SDs to avoid degenerate weights.
    sd.raw <- est$sigma * est$stdev.unscaled
    weights <- sd.raw
    weights[sd.raw < 10^-5] <- mean(sd.raw[sd.raw > 10^-5])
    rm(sd.raw)
  }
  rm(est)
  gc()

  if (smooth) {
    if (verbose)
      message("Smoothing coefficients.")
    beta <- vector("list", 3)
    beta[[1]] <- beta[[2]] <- rep(NA, length(pos))
    # Smooth one chromosome (region label) at a time to limit memory use.
    for (chromosome in unique(chr)) {
      # Subset weights to the chromosome, matching y and x; previously the
      # full-length weights vector was passed, misaligning the weights for
      # every chromosome after the first.
      beta.tmp <- smoother(y = rawBeta[chr == chromosome],
                           x = pos[chr == chromosome],
                           workers = workers, chr = chr[chr == chromosome],
                           maxGapSmooth = maxGapSmooth,
                           weights = weights[chr == chromosome],
                           minNum = minNum, minInSpan = minInSpan,
                           bpSpan = bpSpan,
                           verbose = verbose)
      beta[[1]][chr == chromosome] <- beta.tmp[[1]]
      beta[[2]][chr == chromosome] <- beta.tmp[[2]]
    }
    beta[[3]] <- beta.tmp[[3]]
    names(beta) <- names(beta.tmp)
    rm(beta.tmp)
    Index <- which(beta$smoothed)
    beta <- beta$fitted
  } else {
    beta <- rawBeta
    Index <- seq(along = beta)
  }
  gc()
  rm(rawBeta)

  if (verbose)
    message("Finding regions.")
  tab <- regionFinder(x = beta, chr = chr, pos = pos, cluster = cluster,
                      cutoff = cutoff, ind = Index,
                      minNumRegion = minNumRegion,
                      oligo.mat = oligo.mat,
                      verbose = verbose,
                      design = design, workers = workers, logT = logT,
                      naive = naive, beta = beta)
  rm(beta)
  rm(chr)
  rm(pos)
  gc()
  if (nrow(tab) == 0) {
    if (verbose)
      message("No candidate regions found!")
    # callers (DRfinder) detect this NA via is.data.frame()
    return(NA)
  } else {
    if (verbose)
      message(sprintf("Found %s candidate regions.", nrow(tab)))
  }
  return(tab)
}
#' Break up nucleotide level signal into candidate regions and score them
#'
#' This is an internal workhorse function for \code{bumphunt} that takes the
#' nucleotide-level signal and parses it into contiguous regions that pass
#' the threshold and form
#' the candidates, and then scores each one based on a test statistic
#' of the difference.
#' @param x a vector of condition coefficients (for the covariate of interest)
#' for each nucleotide
#' @param cluster a vector of cluster membership values for each nucleotide
#' determined by the \code{clusterMaker} function in the \code{bumphunter}
#' package
#' @param ind a vector of indices of \code{x} which are non-NULL. Defaults to
#' all indices of \code{x}.
#' @param order logical that indicates whether or not to order the candidate
#' regions by the test statistic magnitude (largest to smallest).
#' Defaults to TRUE.
#' @param assumeSorted logical that indicates whether the nucleotides are
#' sorted in ascending order. Defaults to FALSE.
#' @param beta vector of loci-specific statistics from step 1 (only needed
#' if naive is TRUE)
#' @inheritParams DRfinder
#' @inheritParams bumphunt
#' @return a data.frame that contains the results of region detection. The
#' data.frame contains one row for each candidate region, and
#' 7 columns, in the following order: 1. chr =
#' region level labels such as chromosome, gene, or lncRNA, 2. start =
#' start basepair position of the region, 3. end = end basepair position
#' of the region,
#' 4. indexStart = the index of the region's starting nucleotide,
#' 5. indexEnd = the index of the region's ending nucleotide,
#' 6. length = the number of nucleotides contained in the region,
#' and 7. stat = the test statistic for the condition difference.
#' @import bumphunter
#' @import reshape2
#' @importFrom stats lm
#' @importFrom splines ns
#' @importFrom stats quantile
#' @importFrom parallel mclapply
#' @keywords inference
regionFinder <- function(x, chr, pos, cluster = NULL,
                         ind = seq(along = x), order = TRUE, minNumRegion = 5,
                         maxGap, cutoff = quantile(abs(x), 0.99),
                         assumeSorted = FALSE, oligo.mat = oligo.mat,
                         verbose = TRUE,
                         design = design, workers = workers, logT = TRUE,
                         naive = FALSE, beta = NULL,
                         sampleSize = ncol(oligo.mat) / 2) {
  # 'sampleSize' (replicates per condition) was previously read from the
  # calling environment, which fails unless a global by that name exists;
  # it is now an explicit argument defaulting to half the sample count
  # (balanced designs).
  if (any(is.na(x[ind]))) {
    warning("NAs found and removed. ind changed.")
    ind <- intersect(which(!is.na(x)), ind)
  }
  if (is.null(beta) & naive) {
    stop("Error: beta needs to be specified if calculating naive region stat")
  }
  if (is.null(cluster))
    cluster <- bumphunter:::clusterMaker(chr, pos, maxGap = maxGap,
                                         assumeSorted = assumeSorted)
  # Segment loci into runs above/below the cutoff within each cluster.
  Indexes <- bumphunter:::getSegments(x = x[ind], f = cluster[ind],
                                      cutoff = cutoff,
                                      assumeSorted = assumeSorted,
                                      verbose = verbose)
  # only keep up and down indices
  Indexes <- Indexes[1:2]
  # only keep candidate regions that contain at least minNumRegion loci
  for (i in 1:2) {
    lns <- sapply(Indexes[[i]], length)
    Indexes[[i]] <- Indexes[[i]][lns >= minNumRegion]
  }

  # Region-level statistic: t-statistic of the condition coefficient from a
  # linear model over all loci in the region, with an adaptively chosen
  # natural-spline adjustment for positional trend.
  stat.OligoSignal.spline <- function(ix) {
    X <- as.vector(sapply(design[, 2],
                          function(x) rep(x, nrow(oligo.mat[ix, ]))))
    L <- as.vector(rep(pos[ix], nrow(design)))
    if (logT) {
      Y <- log2(oligo.mat[ix, ] + 1)
    } else {
      Y <- oligo.mat[ix, ]
    }
    # check that signal isn't all constant within condition
    if (length(unique(as.vector(Y[, 1:sampleSize]))) == 1 &&
        length(unique(as.vector(Y[, (sampleSize + 1):(2 * sampleSize)]))) == 1)
      return(NA)
    # long format: one row per (locus, sample) observation
    Y <- melt(Y)
    Y$X <- X
    Y$L <- L
    # roughly one interior knot per 100 bp of region width
    int.knots <- ceiling((max(pos[ix]) - min(pos[ix])) / 100) + 1
    # linear trend
    trend.linear <- lm(value ~ ns(L, df = int.knots - 1), data = Y)
    # polynomial trend
    trend.poly <- lm(value ~ ns(L, df = int.knots), data = Y)
    # adaptively determine usage of spline adjustment (use none if no
    # evidence of trend in the region to guard against over-fitting)
    if (anova(trend.poly)[1, 5] < 0.01) {
      stat <- summary(lm(value ~ X + ns(L, df = int.knots),
                         data = Y))$coef[2, 3]
    } else if (anova(trend.linear)[1, 5] < 0.01) {
      stat <- summary(lm(value ~ X + ns(L, df = int.knots - 1),
                         data = Y))$coef[2, 3]
    } else {
      stat <- summary(lm(value ~ X, data = Y))$coef[2, 3]
    }
    return(stat)
  }

  # Naive region statistic: mean of the per-locus statistics from step 1.
  stat.naive <- function(ix) {
    mean(beta[ix])
  }

  # Build one result data.frame per direction (up/down), then combine.
  res <- vector("list", 2)
  for (i in 1:2) {
    res[[i]] <-
      data.frame(chr = sapply(Indexes[[i]],
                              function(Index) chr[ind[Index[1]]]),
                 start = sapply(Indexes[[i]],
                                function(Index) min(pos[ind[Index]])),
                 end = sapply(Indexes[[i]],
                              function(Index) max(pos[ind[Index]])),
                 indexStart = sapply(Indexes[[i]],
                                     function(Index) min(ind[Index])),
                 indexEnd = sapply(Indexes[[i]],
                                   function(Index) max(ind[Index])),
                 length = sapply(Indexes[[i]], length),
                 stringsAsFactors = FALSE)
    if (!naive) {
      res[[i]]$stat <- unlist(mclapply(Indexes[[i]], function(Index)
        stat.OligoSignal.spline(ind[Index]),
        mc.cores = workers))
    } else {
      res[[i]]$stat <- unlist(mclapply(Indexes[[i]], function(Index)
        stat.naive(ind[Index]),
        mc.cores = workers))
    }
  }
  names(res) <- c("up", "dn")
  res <- rbind(res$up, res$dn)
  # order candidate regions by statistic magnitude (largest first)
  if (order & nrow(res) > 0) res <- res[order(-abs(res$stat)), ]
  # remove regions with no variation
  res <- res[!is.na(res$stat), ]
  return(res)
}
#' A smoother function for coefficient values
#'
#' This is an internal workhorse function that (optionally) smooths the
#' coefficient values for the covariate of interest.
#' @param weights a numeric vector that contains the smoothing weights.
#' @inheritParams DRfinder
#' @inheritParams bumphunt
#' @inheritParams regionFinder
#' @return a list object of length four that contains the following objects
#' in order: 1. fitted = a numeric matrix containing the coefficient
#' estimates after smoothing those that satisify the smoothing conditions
#' for each nucleotide,
#' 2. smoothed = a logical vector that indicates whether each nucleotide
#' was smoothed, 3. smoother = a character that represents the type
#' of smoothing that was performed, and 4. idx a numeric vector that
#' indicates which nucleotides are non-NULL.
#' @import bumphunter
#' @import locfit
#' @import doRNG
#' @importFrom iterators iter
#' @importFrom stats preplot
#' @keywords inference
smoother <- function(y, x=NULL, weights=NULL, chr=chr,
maxGapSmooth=maxGapSmooth,
verbose=TRUE, workers=1, minNum = minNum,
minInSpan = minInSpan, bpSpan = bpSpan) {
# Smooth one chunk of clusters with locfit. 'ix' indexes into the
# chromosome-local vectors (y, x, weights) and into 'clusterC', which is
# found via lexical scope from the per-chromosome loop below.
# NOTE(review): because ix is chromosome-local, 'weights' must be subset to
# the same chromosome as y and x by the caller — confirm callers do this.
locfitByCluster2 <- function(ix) {
## if y is vector change to matrix
yi = matrix(y[ix], ncol=1)
xi = x[ix]
weightsi = matrix(weights[ix], ncol=1)
clusteri = clusterC[ix]
if(is.null((yi)))
stop("y (rawBeta) is missing")
if(is.null(xi))
stop("x (pos) is missing")
if(is.null(clusteri))
stop("cluster is missing")
if(is.null(weightsi))
weightsi <- matrix(1, nrow = nrow(yi), ncol = ncol(yi))
Indexes <- split(seq(along = clusteri), clusteri)
clusterL <- sapply(Indexes, length)
smoothed <- rep(TRUE, nrow(yi))
# Smooth each cluster independently; clusters too small (or with too few
# complete observations) are marked unsmoothed and their fits set to NA.
for(i in seq(along=Indexes)) {
#if(verbose) if(i %% 1e4 == 0) cat(".") # print the cluster index
Index <- Indexes[[i]]
if(clusterL[i] >= minNum &
sum(rowSums(is.na(yi[Index,,drop=FALSE]))==0) >= minNum) {
# nearest-neighbor fraction so each local fit spans >= minInSpan loci
nn <- minInSpan / length(Index)
for(j in 1:ncol(yi)) {
sdata <- data.frame(posi = xi[Index],
yi = yi[Index, j],
weightsi = weightsi[Index,j])
# local regression over position with span max(nn, h = bpSpan)
fit <- locfit(yi ~ lp(posi, nn = nn, h = bpSpan), data = sdata,
weights = weightsi, family = "gaussian", maxk = 10000)
pp <- preplot(fit, where = "data", band = "local",
newdata = data.frame(posi = xi[Index]))
# replace raw values with fitted (back-transformed) values
yi[Index,j] <- pp$trans(pp$fit)
}
} else {
yi[Index,] <- NA
smoothed[Index] <- FALSE
}
}
return(list(fitted=yi, smoothed=smoothed, smoother="locfit"))
}
# Coerce vector inputs to single-column matrices.
if(is.null(dim(y)))
y <- matrix(y, ncol=1)
if(!is.null(weights) && is.null(dim(weights)))
weights <- matrix(weights, ncol=1)
# Fall back to a sequential foreach backend if none is registered.
if (!getDoParRegistered())
registerDoSEQ()
ret.all <- NULL
cores <- workers
# loop through each chromosome to mitigate memory spikes
for (chromosome in unique(chr)){
clusterC <- bumphunter:::clusterMaker(chr[chr==chromosome],
x[chr==chromosome],
maxGap = maxGapSmooth)
# Split this chromosome's clusters into (roughly) equal-sized chunks,
# one per core; the first 'remain' chunks get one extra cluster.
Indexes <- split(seq(along=clusterC), clusterC)
IndexesChunks <- vector("list", length = cores)
baseSize <- length(Indexes) %/% cores
remain <- length(Indexes) %% cores
done <- 0L
for(ii in 1:cores) {
if(remain > 0) {
IndexesChunks[[ii]] <- done + 1:(baseSize + 1)
remain <- remain - 1L
done <- done + baseSize + 1L
} else {
IndexesChunks[[ii]] <- done + 1:baseSize
done <- done + baseSize
}
}
# Flatten each chunk of clusters into a single vector of locus indices.
IndexesChunks <- lapply(IndexesChunks, function(idxes) {
do.call(c, unname(Indexes[idxes]))
})
idx <- NULL ## for R CMD check
# Smooth chunks in parallel (reproducibly, via %dorng%); attach each
# chunk's global locus positions so results can be reordered later.
ret <- foreach(idx = iter(IndexesChunks)) %dorng% {
sm <- locfitByCluster2(idx)
c(sm, list(idx = seq(1, length(chr))[chr==chromosome][idx]))
}
attributes(ret)[["rng"]] <- NULL
## Paste together results from different workers
ret <- bumphunter:::reduceIt(ret)
# Accumulate per-chromosome results into one list.
if(is.null(ret.all)){
ret.all <- ret
}else{
ret.all$smoother <- c(ret.all$smoother, ret$smoother)
ret.all$fitted <- c(ret.all$fitted, ret$fitted)
ret.all$smoothed <- c(ret.all$smoothed, ret$smoothed)
ret.all$idx <- c(ret.all$idx, ret$idx)
}
rm(ret)
}
## Now fixing order issue
# Restore original locus order: chunks (and chromosomes) may have returned
# out of order, so sort by the recorded global indices.
revOrder <- ret.all$idx
names(revOrder) <- seq_along(ret.all$idx)
revOrder <- sort(revOrder)
revOrder <- as.integer(names(revOrder))
ret.all$smoother <- ret.all$smoother[1]
ret.all$fitted <- ret.all$fitted[revOrder,,drop=FALSE]
ret.all$smoothed <- ret.all$smoothed[revOrder]
ret.all$idx <- NULL
return(ret.all)
}
# (rdrr.io page-scraping residue, commented out so the file parses:)
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.