Nothing
# invocation:
# use selex dataset from ALDEx2 library
# x <- aldex.clr( reads, conds, mc.samples=128, denom="all", verbose=FALSE, useMC=FALSE )
# this function generates the centre log-ratio transform of Monte-Carlo instances
# drawn from the Dirichlet distribution.
aldex.clr.function <- function( reads, conds, mc.samples=128, denom="all", verbose=FALSE, useMC=FALSE, summarizedExperiment=NULL ) {
# INPUT
# The 'reads' data.frame MUST have row
# and column names that are unique, and
# looks like the following:
#
# T1a T1b T2 T3 N1 N2
# Gene_00001 0 0 2 0 0 1
# Gene_00002 20 8 12 5 19 26
# Gene_00003 3 0 2 0 0 0
# ... many more rows ...
#
# ---------------------------------------------------------------------
# OUTPUT
# The output returned is a list (x) that contains Monte-Carlo instances of
# the centre log-ratio transformed values for each sample
# Access to values
# sample IDs: names(x)
# number of features (genes, OTUs): length(x[[1]][,1])
# number of Monte-Carlo Dirichlet instances: length(x[[1]][1,])
# feature names: rownames(x[[1]])
# Fully validate and coerce the data into required formats
# coerce SummarizedExperiment reads into data.frame
if (summarizedExperiment) {
reads <- data.frame(as.list(assays(reads,withDimnames=TRUE)))
if (verbose) {
message("converted SummarizedExperiment read count object into data frame")
}
}
# make sure the conditions vector or matrix is reasonable
if(missing(conds)){
message("no conditions provided: forcing denom = 'all'")
message("no conditions provided: forcing conds = 'NA'")
denom <- "all"
conds <- rep("NA", ncol(reads))
}
# if a model matrix is supplied, then aldex.effect is not valid
# force the use of either all for the denominator
# or
# the use of a user-supplied denominator
if(is(conds, "matrix")){
message("checking for condition length disabled!")
if(is.vector(denom, mode="integer")){
message("user-defined denominator used")
} else if (denom == "all"){
message("using all features for denominator")
} else {
stop("please supply a vector of indices for the denominator")
}
# if(conds.col == 0){
# message("conditions provided as matrix: selecting first column for aldex.clr")
# conds <- as.character(conds[,1])
# }else{
# message("conditions provided as matrix: user selected column for aldex.clr")
# if(is.numeric(conds.col)){
# print(conds[,conds.col])
# conds <- as.vector(conds[,conds.col])
# prints(conds)
# print(length(conds))
# }
# }
}
if(ncol(reads) != length(conds) & !is(conds, "matrix")){
print(length(conds))
print(ncol(reads))
stop("mismatch between number of samples and condition vector")
}
# make sure that the multicore package is in scope and return if available
has.BiocParallel <- FALSE
if ("BiocParallel" %in% rownames(installed.packages()) & useMC){
message("multicore environment is is OK -- using the BiocParallel package")
#require(BiocParallel)
has.BiocParallel <- TRUE
}
else {
message("operating in serial mode")
}
# make sure that mc.samples is an integer, despite it being a numeric type value
mc.samples <- as.numeric(as.integer(mc.samples))
# remove all rows with reads less than the minimum set by minsum
minsum <- 0
# remove any row in which the sum of the row is 0
z <- as.numeric(apply(reads, 1, sum))
reads <- as.data.frame( reads[(which(z > minsum)),] )
if (verbose) message("removed rows with sums equal to zero")
# SANITY CHECKS ON THE DATA INPUT
if ( any( round(reads) != reads ) ) stop("not all reads are integers")
if ( any( reads < 0 ) ) stop("one or more reads are negative")
for ( col in names(reads) ) {
if ( any( ! is.finite( reads[[col]] ) ) ) stop("one or more reads are not finite")
}
if ( length(rownames(reads)) == 0 ) stop("rownames(reads) cannot be empty")
if ( length(colnames(reads)) == 0 ) stop("colnames(reads) cannot be empty")
if ( length(rownames(reads)) != length(unique(rownames(reads))) ) stop ("row names are not unique")
if ( length(colnames(reads)) != length(unique(colnames(reads))) ) stop ("col names are not unique")
if ( mc.samples < 128 ) warning("values are unreliable when estimated with so few MC smps")
# add a prior expection to all remaining reads that are 0
# this should be by a Count Zero Multiplicative approach, but in practice
# this is not necessary because of the large number of features
prior <- 0.5
# This extracts the set of features to be used in the geometric mean computation
# returns a list of features
feature.subset <- aldex.set.mode(reads, conds, denom)
if ( length(feature.subset[[1]]) == 0 ) stop("No low variance, high abundance features in common between conditions\nPlease choose another denomiator.")
reads <- reads + prior
if (verbose == TRUE) message("data format is OK")
# ---------------------------------------------------------------------
# Generate a Monte Carlo instance of the frequencies of each sample via the Dirichlet distribution,
# returns frequencies for each feature in each sample that are consistent with the
# feature count observed as a proportion of the total counts per sample given
# technical variation (i.e. proportions consistent with error observed when resequencing the same library)
nr <- nrow( reads )
rn <- rownames( reads )
#this returns a list of proportions that are consistent with the number of reads per feature and the
#total number of reads per sample
# environment test, runs in multicore if possible
if (has.BiocParallel){
p <- bplapply( reads ,
function(col) {
q <- t( rdirichlet( mc.samples, col ) ) ;
rownames(q) <- rn ;
q })
names(p) <- names(reads)
}
else{
p <- lapply( reads ,
function(col) {
q <- t( rdirichlet( mc.samples, col ) ) ;
rownames(q) <- rn ; q } )
}
# sanity check on the data, should never fail
for ( i in 1:length(p) ) {
if ( any( ! is.finite( p[[i]] ) ) ) stop("non-finite frequencies estimated")
}
if (verbose == TRUE) message("dirichlet samples complete")
# ---------------------------------------------------------------------
# Take the log2 of the frequency and subtract the geometric mean log2 frequency per sample
# i.e., do a centered logratio transformation as per Aitchison
# apply the function over elements in a list, that contains an array
# DEFAULT
if (is.list(feature.subset)) {
# ZERO only
feat.result <- vector("list", length(unique(conds))) # Feature Gmeans
condition.list <- vector("list", length(unique(conds))) # list to store conditions
for (i in 1:length(unique(conds)))
{
condition.list[[i]] <- which(conds == unique(conds)[i]) # Condition list
feat.result[[i]] <- lapply( p[condition.list[[i]]], function(m) {
apply(log2(m), 2, function(x){mean(x[feature.subset[[i]]])})
})
}
set.rev <- unlist(feat.result, recursive=FALSE) # Unlist once to aggregate samples
p.copy <- p
for (i in 1:length(set.rev))
{
p.copy[[i]] <- as.data.frame(p.copy[[i]])
p[[i]] <- apply(log2(p.copy[[i]]),1, function(x){ x - (set.rev[[i]])})
p[[i]] <- t(p[[i]])
}
l2p <- p # Save the set in order to generate the aldex.clr variable
} else if (is.vector(feature.subset)){
# Default ALDEx2, iqlr, user defined, lvha
# denom[1] is put in explicitly for the user-defined denominator case
if (has.BiocParallel){
if (denom[1] != "median"){
l2p <- bplapply( p, function(m) {
apply( log2(m), 2, function(col) { col - mean(col[feature.subset]) } )
})
} else if (denom[1] == "median"){
l2p <- bplapply( p, function(m) {
apply( log2(m), 2, function(col) { col - median(col[feature.subset]) } )
})
}
names(l2p) <- names(p)
}
else{
if (denom[1] != "median"){
l2p <- lapply( p, function(m) {
apply( log2(m), 2, function(col) { col - mean(col[feature.subset]) } )
})
} else if (denom[1] == "median"){
l2p <- lapply( p, function(m) {
apply( log2(m), 2, function(col) { col - median(col[feature.subset]) } )
})
}
}
} else {
message("the denominator is not recognized, use a different denominator")
}
# sanity check on data
for ( i in 1:length(l2p) ) {
if ( any( ! is.finite( l2p[[i]] ) ) ) stop("non-finite log-frequencies were unexpectedly computed")
}
if (verbose == TRUE) message("transformation complete")
return(new("aldex.clr",reads=reads,mc.samples=mc.samples,conds=conds,denom=feature.subset,verbose=verbose,useMC=useMC,analysisData=l2p))
}
### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### Getters
###
setMethod("getMonteCarloInstances", signature(.object="aldex.clr"), function(.object) .object@analysisData)
setMethod("getSampleIDs", signature(.object="aldex.clr"), function(.object) names(.object@analysisData))
setMethod("getFeatures", signature(.object="aldex.clr"), function(.object) .object@analysisData[[1]][,1])
setMethod("numFeatures", signature(.object="aldex.clr"), function(.object) length(.object@analysisData[[1]][,1]))
setMethod("numMCInstances", signature(.object="aldex.clr"), function(.object) length(.object@analysisData[[1]][1,]))
setMethod("getFeatureNames", signature(.object="aldex.clr"), function(.object) rownames(.object@analysisData[[1]]))
setMethod("getReads", signature(.object="aldex.clr"), function(.object) .object@reads)
setMethod("numConditions", signature(.object="aldex.clr"), function(.object) length(names(.object@analysisData)))
setMethod("getMonteCarloReplicate", signature(.object="aldex.clr",i="numeric"), function(.object,i) .object@analysisData[[i]])
setMethod("getMonteCarloSample", signature(.object="aldex.clr",i="numeric"), function(.object,i) sapply(.object@analysisData, function(x) {x[,i]}))
setMethod("getDenom", signature(.object="aldex.clr"), function(.object) .object@denom)
setMethod("aldex.clr", signature(reads="data.frame"), function(reads, conds, mc.samples=128, denom="all", verbose=FALSE, useMC=FALSE) aldex.clr.function(reads, conds, mc.samples, denom, verbose, useMC, summarizedExperiment=FALSE))
setMethod("aldex.clr", signature(reads="matrix"), function(reads, conds, mc.samples=128, denom="all", verbose=FALSE, useMC=FALSE) aldex.clr.function(as.data.frame(reads), conds, mc.samples, denom, verbose, useMC, summarizedExperiment=FALSE))
setMethod("aldex.clr", signature(reads="RangedSummarizedExperiment"), function(reads, conds, mc.samples=128, denom="all", verbose=FALSE, useMC=FALSE) aldex.clr.function(reads, conds, mc.samples, denom, verbose, useMC, summarizedExperiment=TRUE))
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.