#' Obtain and clean nucleosome positioning data from tiling array
#'
#' Process and transform the microarray data coming from tiling array
#' nucleosome positioning experiments.
#'
#' The processing of tiling arrays could be complicated as many types exists on
#' the market. This function deals ok with Affymetrix Tiling Arrays in yeast,
#' but hasn't been tested on other species or platforms.
#'
#' The main aim is convert the output of preprocessing steps (supplied by
#' third-parties packages) to a clean genome wide nucleosome occupancy profile.
#'
#' Tiling arrays doesn't use to provide a one-basepair resolution data, so one
#' gets one value per probe in the array, covering X basepairs and shifted
#' (tiled) Y basepairs respect the surrounding ones. So, one gets a piece of
#' information every Y basepairs.
#'
#' This function tries to convert this noisy, low resolution data, to a
#' one-basepair signal, which allows a fast recognition of nucleosomes without
#' using large and artificious statistical machinery as Hidden Markov Models
#' using posterionr noise cleaning process.
#'
#' As example, imagine your array has probes of 20mers and a tiling between
#' probes of 10bp. Starting at position 1 (covering coordinates from 1 to 20),
#' the next probe will be in position 10 (covering the coordinates 10 to 29).
#' This can be represented as two hybridization intensity values on coordinates
#' 1 and 10. This function will try to infer (using a lineal distribution) the
#' values from 2 to 9 using the existing values of probes in coordinate 1 and
#' coordinate 10.
#'
#' The tiling space between adjacent array probes could be not constant, or
#' could be also there are regions not covered in the used microarray. With the
#' function argument `inferLen` you can specify wich amout of space (in
#' basepairs) you allow to infer the non-present values.
#'
#' If at some point the range not covered (gap) between two adjacent probes of
#' the array is greater than `inferLen` value, then the coordinates between
#' these probes will be setted to `NA`.
#'
#' @param data `ExpressionSet` object wich contains the data of the tiling
#' array.
#' @param exprName Name of the `sample` in `ExpressionSet` which contains the
#' ratio between nucleosomal and genomic dna (if using `Starr`, the
#' `description` argument supplied to `getRatio` function). If this name is
#' not provided, it is assumed `data` has only one column.
#' @param chrPattern Only chromosomes that contain `chrPattern` string will be
#' selected from ExpressionSet. Sometimes tilling arrays contain control
#' quality information that is imported as a chromosome. This allows
#' filtering it. If no value is supplied, all chromosomes will be used.
#' @param inferLen Maximum length (in basepairs) for allowing data gaps
#' inference. See `details` for further information.
#' @param mc.cores Number of cores available to parallel data processing.
#' @param quiet Avoid printing on going information (TRUE | FALSE)
#'
#' @return `RleList` with the observed/inferred values for each coordinate.
#'
#' @note This function should be suitable for all `data` objects of kind
#' `ExpressionSet` coding the annotations `"chr"` for chromosome and `"pos"`
#' for position (acccessible by `pData(data@featureData)`) and a expression
#' value (accessible by `exprs(data)`).
#'
#' @section **Warning**: This function could not cover all kind of arrays in
#' the market. This package assumes the data is processed and normalized
#' prior this processing, using standard microarray packages existing for R,
#' like `Starr`.
#'
#' @author Oscar Flores \email{oflores@@mmb.pcb.ub.es}
#' @seealso [Biobase::ExpressionSet], [Starr::getRatio()]
#' @keywords manip
#'
#' @examples
#' \dontrun{
#' # Dataset cannot be provided for size restrictions
#' # This is the code used to get the hybridization ratio with Starr from
#' # CEL files
#' library("Starr")
#' TA_parsed <- readCelFile(
#' BPMap, CELfiles, CELnames, CELtype, featureData=TRUE, log.it=TRUE
#' )
#' TA_loess <- normalize.Probes(TA_parsed, method="loess")
#' TA_ratio <- getRatio(
#' TA_loess, TA_loess$type=="IP", TA_loess$type=="CONTROL", "myRatio"
#' )
#'
#' # From here, we use nucleR:
#'
#' # Preprocess the array, using the calculated ratio feature we named
#' # "myRatio".
#'
#' # This will also select only those chromosomes with the pattern
#' # "Sc:Oct_2003;chr", removing control data present in that tiling
#' # array.
#'
#' # Finally, we allow that loci not covered by a prove being inferred
#' # from adjacent ones, as far as they are separated by 50bp or less
#' arr <- processTilingArray(
#' TA_ratio, "myRatio", chrPattern="Sc:Oct_2003;chr", inferLen=50
#' )
#'
#' # From here we can proceed with the analysis:
#' arr_fft <- filterFFT(arr)
#' arr_pea <- peakDetection(arr_fft)
#' plotPeaks(arr_pea, arr_fft)
#' }
#'
#' @export processTilingArray
#'
#' @importMethodsFrom Biobase pData sampleNames exprs
processTilingArray <- function (data, exprName, chrPattern, inferLen = 50,
mc.cores = 1, quiet = FALSE)
{
# require("Biobase")
# Obtain annotation
if (!quiet) {
message(" * Parsing annotation")
}
df <- pData(data@featureData)[, c("chr", "pos")]
df$chr <- as.character(df$chr)
# Add expression (intensities) values
if (missing(exprName)) {
exprName <- sampleNames(data)[1]
}
if (!quiet) {
message(paste(" * Using feature name:", exprName))
}
df$value <- exprs(data)[, exprName]
# Select names to keep
if (!missing(chrPattern)) {
selection <- unlist(sapply(
unique(df$chr),
function(x)
grep(chrPattern, x, fixed=TRUE, value=TRUE))
)
df <- df[df$chr %in% selection, ]
}
.fillTheGap <- function(chr_name)
{ # Function wich fills the gaps by chromosome
# Select chromosome and sort
dft <- df[df$chr == chr_name, c("pos", "value")]
dft <- dft[order(dft$pos), ]
# Calculate parameters for seq function
v <- c(dft$pos[2:length(dft$pos)], dft$pos[length(dft$value)])
dft$len <- v - dft$pos + 1
dft$from <- dft$value
dft$to <- c(
dft$value[2:length(dft$value)],
dft$value[length(dft$value)]
)
# Those values which are out of the gap range set them to a strange
# value
# Using some non numeric value has a huge impact on performance,
# -999e99 is an odd value...
dft[dft$len > inferLen, c("from", "to")] <- c(-999e99, -999e99)
# Calculate the values sequence, but discard the first one of each
# seq to not account it twice
out <- apply(
data.matrix(dft),
1,
function(x)
seq(
from=x[["from"]],
to=x[["to"]],
length.out=x[["len"]]
)[2:x[["len"]]]
)
out <- unlist(out)
names(out) <- NULL
out <- c(dft$from[1], out) # Now add the first one hat is never counted
out[out == - 999e99] <- NA # Change the "strange" value for NA
# If the starting point it's not 1, add NA's at the beggining
if (dft$pos[[1]] > 1) {
out <- c(rep(NA, dft$pos[[1]] - 1), out)
}
return(out)
}
# Get the names of all unique chromosomes to iterate on it
chrs <- unique(df$chr)
names(chrs) <- chrs
if(!quiet) {
message(" * Selected chromosomes:")
}
if(!quiet) {
message(chrs)
}
message(" * Inferring missed values")
res <- .xlapply(
chrs,
function(x) {
if (!quiet) {
message(paste(" ", x));
}
.fillTheGap(x)
},
mc.cores=mc.cores
)
return(res)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.