# binary qualitative test (present in 1 but not the other)
.binary_test = function(x, y, alternative){
p = NA
in_x = sum(x > 0) > 0
in_y = sum(y > 0) > 0
# in x but non y
if(alternative == 'greater'){
stat = ifelse((in_x == TRUE & in_y != TRUE), 1, 0)
} else
if(alternative == 'less'){
stat = ifelse((in_y == TRUE & in_x != TRUE), 1, 0)
} else {
stop('Alternative option not supported for "binary"')
}
return(c(stat, p))
}
# t.test wrapper
.t_test = function(x, y, alternative){
z = stats::t.test(x, y, alternative=alternative)
stat = as.numeric(z$statistic)
p = as.numeric(z$p.value)
return(c(stat, p))
}
# wilcox.test wrapper
.wilcox_test = function(x, y, alternative){
z = stats::wilcox.test(x, y, alternative=alternative)
stat = as.numeric(z$statistic)
p = as.numeric(z$p.value)
return(c(stat, p))
}
#' Heavy-SIP analysis
#'
#' Compare taxon abundances in 'heavy' fractions versus specific controls.
#'
#' 'Heavy-SIP' encompasses the analyses often used in SIP studies prior
#' to new HTS-SIP methodologies. These methods all consisted of identifying
#' 'heavy' gradient fractions. This was often done by comparing
#' the distribution of DNA conc. or gene copies across gradient fractions
#' in labeled treatments versus unlabeled controls. Sometimes, the unlabeled
#' control was left out, and "heavy" gradients were identified based on
#' comparisons with theoretic distributions of unlabeled DNA.
#'
#' Although hypothesis testing was often used to assess increased
#' taxon abundances in "heavy" gradients of labled treatments (eg., one-tailed
#' t-tests), the hypothesis testing usually did not account for the compositional
#' nature of sequence data (relative abundances).
#'
#' Here, "heavy-SIP" can define incorporators as either:
#' \itemize{
#' \item{"H" =}{
#' Any taxa IN the "heavy" fractions of the labeled treatment gradients
#' }
#' \item{"H-v-L" =}{
#' Any taxa IN the "heavy" fractions of the labeled treatment and NOT
#' present in the "heavy" fractions of the control
#' }
#' \item{"H-v-H" =}{
#' Any taxa IN the "heavy" fractions of the labeled treatment and NOT
#' present in the "light" fractions of the labeled treatment
#' }
#'}
#' Instead of binary comparisions (presence/absence),
#' one-tailed t-tests or Wilcoxon Rank Sum tests can be used to assess
#' differential abundance between "heavy" and controls. The hypothesis
#' testing methods require multiple replicate controls, will use the mean
#' taxon abundance in the "heavy" (and "light") window.
#'
#' @param physeq A phyloseq object of just treatment vs control.
#' If you have a more complicated experimental design, subset the
#' phyloseq object into a list of treatment vs control comparisions.
#' @param ex Expression for selecting controls based on metadata
#' @param rep Column specifying gradient replicates. If the column
#' does not exiset, then the column will be created, and all will be considered
#' "replicate=1"
#' @param light_window A vector designating the "light" BD window start and stop
#' @param heavy_window A vector designating the "heavy" BD window start and stop
#' @param comparison Which light/heavy BD windows to compare (see the description)?
#' @param hypo_test Which hypothesis test to run on each OTU?
#' Note that "binary" isn't really a hypothesis test, but just qualitative.
#' @param alternative The "alternative" option for the hypothesis test functions.
#' Note that "two.sided" doesn't work for the "binary" test.
#' @param sparsity_threshold All OTUs observed in less than this portion (fraction: 0-1)
#' of gradient fraction samples are pruned. A a form of indepedent filtering,
#' The sparsity cutoff with the most rejected hypotheses is used.
#' @param sparsity_apply Apply sparsity threshold to all gradient fraction samples ('all')
#' or just heavy fraction samples ('heavy')
#' @param padj_method Multiple hypothesis correction method. See `p.adjust()` for more
#' details.
#' @return a data.frame object of hypothesis test results
#'
#' @export
#'
#' @examples
#' data(physeq_S2D2)
#' data(physeq_rep3)
#' \dontrun{
#' # Calculating 'binary' for unreplicated experiment
#' ## Subsetting phyloseq by Substrate and Day
#' params = get_treatment_params(physeq_S2D2, c('Substrate', 'Day'))
#' params = dplyr::filter(params, Substrate!='12C-Con')
#' ex = "(Substrate=='12C-Con' & Day=='${Day}') | (Substrate=='${Substrate}' & Day == '${Day}')"
#' physeq_S2D2_l = phyloseq_subset(physeq_S2D2, params, ex)
#'
#' ## Calculating heavy-SIP on 1 subset (use lapply function to process full list)
#' incorps = heavy_SIP(physeq_S2D2_l[[1]])
#'
#' # Calculating wilcox test on replicated design
#' ## (comparing heavy-treatment versus heavy-control)
#' incorps = heavy_SIP(physeq_rep3, ex="Treatment=='12C-Con'", comparison='H-v-H', hypo_test='wilcox')
#' }
heavy_SIP = function(physeq,
ex="Substrate=='12C-Con'",
rep='Replicate',
light_window=c(1.68, 1.70),
heavy_window=c(1.73, 1.75),
comparison=c('H', 'H-v-L', 'H-v-H'),
hypo_test=c('binary', 't-test', 'wilcox'),
alternative = c('greater', 'two.sided', 'less'),
sparsity_threshold=0.1,
sparsity_apply=c('all', 'heavy'),
padj_method='BH'){
# assertions
stopifnot(length(light_window) >= 2)
stopifnot(length(heavy_window) >= 2)
comparison = comparison[1]
hypo_test = hypo_test[1]
## selecting hypothesis test
### how to aggregate abundances within each replicate?
agg_func = switch(hypo_test,
'binary' = function(x) sum(x > 0) > 0,
't-test' = mean,
'wilcox' = mean,
stop('Hypothesis test not recognized'))
### hypothesis test function
hypo_func = switch(hypo_test,
'binary' = .binary_test,
't-test' = .t_test,
'wilcox' = .wilcox_test,
stop('Hypothesis test not recognized'))
# sparsity cutoff applied to all gradient fractions
prn = function(x) sum(x > 0) > sparsity_threshold * length(x)
if(sparsity_apply[1] == 'all'){
physeq = phyloseq::filter_taxa(physeq, prn, TRUE)
}
# removing 0-abundance taxa
physeq = phyloseq::filter_taxa(physeq, function(x) sum(x > 0) > 0 * length(x), TRUE)
# sparsity cutoff applied to just heavy fractions
if(sparsity_apply[1] == 'heavy'){
physeq = phyloseq::filter_taxa(physeq, prn, TRUE)
}
# formatting phyloseq & metadata
physeq = physeq_format(physeq)
metadata = format_metadata(physeq, ex, rep=rep)
# parsing treatment & control
physeq_control = phyloseq::prune_samples(metadata$IS__CONTROL==TRUE, physeq)
physeq_treat = phyloseq::prune_samples(metadata$IS__CONTROL==FALSE, physeq)
# window selection
## applying 'heavy' window pruning
metadata_control = metadata[metadata$IS__CONTROL == TRUE,]
metadata_treat = metadata[metadata$IS__CONTROL == FALSE,]
physeq_treat_heavy = phyloseq::prune_samples((metadata_treat$BD_min >= heavy_window[1]) &
(metadata_treat$BD_min <= heavy_window[2]),
physeq_treat)
# comparing 'heavy' (versus control)
## output format: OTU, statistic, p, padj
if(comparison == 'H'){
# just binary possible
stopifnot(hypo_test == 'binary')
# converting to binary
res = phyloseq::otu_table(physeq_treat_heavy) %>%
as.matrix %>%
apply(1, agg_func) %>%
sapply(function(x) as.numeric(x > 0)) %>%
as.data.frame()
# applying statistic
colnames(res)[1] = 'statistic'
res$p = NA
res$padj = NA
res$OTU = rownames(res)
res = res[,c('OTU', 'statistic', 'p', 'padj')]
return(res)
}
if(comparison == 'H-v-L'){
# comparing heavy vs light from same gradient (no 'control' gradient)
## applying 'light' window pruning
physeq_treat_light = phyloseq::prune_samples((metadata_treat$BD_min >= light_window[1]) &
(metadata_treat$BD_min <= light_window[2]),
physeq_treat)
# aggregating all fractions in same replicates
## heavy fractions, per replicate
metadata_treat_heavy = metadata[metadata$METADATA_ROWNAMES %in%
phyloseq::sample_names(physeq_treat_heavy),]
Replicate = as.data.frame(metadata_treat_heavy)[,rep]
df_h = phyloseq::otu_table(physeq_treat_heavy) %>%
as.matrix %>%
as.data.frame %>% t
df_h = stats::aggregate(df_h,
list(Replicate = Replicate),
agg_func)
## light fractions, per replicate
metadata_treat_light = metadata[metadata$METADATA_ROWNAMES %in%
phyloseq::sample_names(physeq_treat_light),]
Replicate = as.data.frame(metadata_treat_light)[,rep]
df_l = phyloseq::otu_table(physeq_treat_light) %>%
as.matrix %>%
as.data.frame %>% t
df_l = stats::aggregate(df_l,
list(Replicate = Replicate),
agg_func)
# applying statistical test
otus = colnames(df_h)[2:ncol(df_h)]
res = sapply(otus, function(x) hypo_func(df_h[,x], df_l[,x],
alternative=alternative[1]))
res = res %>% t %>% as.data.frame
colnames(res) = c('statistic', 'p')
res$padj = stats::p.adjust(res$p, padj_method)
return(res)
} else
if(comparison == 'H-v-H'){
# comparing heavy vs light from same gradient (no 'control' gradient)
## applying 'heavy' window pruning on control
physeq_control_heavy = phyloseq::prune_samples((metadata_control$BD_min >= heavy_window[1]) &
(metadata_control$BD_min <= heavy_window[2]),
physeq_control)
# aggregating all fractions in same replicates
## heavy treatment fractions, per replicate
metadata_treat_heavy = metadata[metadata$METADATA_ROWNAMES %in%
phyloseq::sample_names(physeq_treat_heavy),]
Replicate = as.data.frame(metadata_treat_heavy)[,rep]
df_ht = phyloseq::otu_table(physeq_treat_heavy) %>%
as.matrix %>%
as.data.frame %>% t
df_ht = stats::aggregate(df_ht,
list(Replicate = Replicate),
agg_func)
## heavy control fractions, per replicate
metadata_control_heavy = metadata[metadata$METADATA_ROWNAMES %in%
phyloseq::sample_names(physeq_control_heavy),]
Replicate = as.data.frame(metadata_control_heavy)[,rep]
df_hc = phyloseq::otu_table(physeq_control_heavy) %>%
as.matrix %>%
as.data.frame %>% t
df_hc = stats::aggregate(df_hc,
list(Replicate = Replicate),
agg_func)
# applying statistical test
otus = colnames(df_ht)[2:ncol(df_ht)]
res = sapply(otus, function(x) hypo_func(df_ht[,x], df_hc[,x],
alternative=alternative[1]))
res = res %>% t %>% as.data.frame
colnames(res) = c('statistic', 'p')
res$padj = stats::p.adjust(res$p, method=padj_method)
return(res)
} else {
stop('comparison not recognized')
}
}
# unreplicated design
#data(physeq_S2D2)
# Subsetting phyloseq by Substrate and Day
#params = get_treatment_params(physeq_rep3, c('Substrate', 'Day'))
#params = dplyr::filter(params, Substrate!='12C-Con')
#ex = "(Substrate=='12C-Con' & Day=='${Day}') | (Substrate=='${Substrate}' & Day == '${Day}')"
#physeq_S2D2_l = phyloseq_subset(physeq_S2D2, params, ex)
# Calculating heavy-SIP on 1 subset (use lapply function to process full list)
#res = heavy_SIP(physeq_S2D2_l[[1]], ex="Substrate=='12C-Con'", comparison='H-v-L')
#res %>% head
# replicated design
#data(physeq_rep3)
# Calculating heavy-SIP on 1 subset (use lapply function to process full list)
# res = heavy_SIP(physeq_rep3, ex="Treatment=='12C-Con'", comparison='H-v-L')
#res = heavy_SIP(physeq_rep3, ex="Treatment=='12C-Con'", rep='Replicate', comparison='H')
#res = heavy_SIP(physeq_rep3, ex="Treatment=='12C-Con'",
# rep='Replicate', comparison='H-v-H',
# hypo_test='t-test')
#res %>% head
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.