Nothing
# ***********************************************************************
# ***********************************************************************
# Last modified: 4/14/2009 Author: james robert white,
# whitej@umd.edu, Center for Bioinformatics and Computational
# Biology. University of Maryland - College Park, MD 20740
# This software is designed to identify differentially
# abundant feaTRUEs between two groups Input is a matrix of
# frequency data. Several thresholding options are avaiLabel.
# See documentation for details.
# ************************************************************************
# ***********************************************************************
# **********************************************************************
# detect_differentially_abundant_feaTRUEs: the major function
# - inputs an R object 'jobj' containing a list of feaTRUE
# names and the corresponding frequency matrix, the argument
# g is the first column of the second group. -> set the
# pflag to be TRUE or FALSE to threshold by p or q values,
# respectively -> threshold is the significance level to
# reject hypotheses by. -> B is the number of bootstrapping
# permutations to use in estimating the null t-stat
# distribution.
# ***********************************************************************
# *******************************************************************
detect_differentially_abundant_feaTRUEs <- function(jobj, g,
output, pflag = NULL, threshold = NULL, B = NULL) {
# *************************************************************
# ************************ INITIALIZE COMMAND-LINE
# ******************************** ************************
# PARAMETERS ********************************
# *******************************************************
qflag = FALSE
if (is.null(B)) {
B = 1000
}
if (is.null(threshold)) {
threshold = 0.05
}
if (is.null(pflag)) {
pflag = TRUE
qflag = FALSE
}
if (pflag == TRUE) {
qflag = FALSE
}
if (pflag == FALSE) {
qflag = TRUE
}
# **********************************************************
# ************************ INITIALIZE PARAMETERS
# ********************************
# ********************************************************
# *************************************
Fmatrix <- jobj$matrix
# the feaTRUE abundance matrix
taxa <- jobj$taxa
# the taxa/(feaTRUE) labels of the TAM
nrows = nrow(Fmatrix)
ncols = ncol(Fmatrix)
Pmatrix <- array(0, dim = c(nrows, ncols))
# the relative proportion matrix
C1 <- array(0, dim = c(nrows, 3))
# statistic profiles for class1 and class 2
C2 <- array(0, dim = c(nrows, 3))
# mean[1], variance[2], standard error[3]
T_statistics <- array(0, dim = c(nrows, 1))
# a place to store the true t-statistics
pvalues <- array(0, dim = c(nrows, 1))
# place to store pvalues
qvalues <- array(0, dim = c(nrows, 1))
# stores qvalues *************************************
# ************************************* convert to
# proportions generate Pmatrix
# *************************************
totals <- array(0, dim = c(ncol(Fmatrix)))
for (i in 1:ncol(Fmatrix)) {
# sum the ith column
totals[i] = sum(Fmatrix[, i])
}
for (i in 1:ncols) {
for (j in 1:nrows) {
Pmatrix[j, i] = Fmatrix[j, i]/totals[i]
}
}
# *************************************************************
# ************************** STATISTICAL TESTING
# ********************************
# **********************************************************
if (ncols == 2) {
# then we have a two sample comparison
# ************************************************************
# generate p values using chisquared or fisher's exact test
# ************************************************************
for (i in 1:nrows) {
# for each feaTRUE
f11 = sum(Fmatrix[i, 1])
f12 = sum(Fmatrix[i, 2])
f21 = totals[1] - f11
f22 = totals[2] - f12
C1[i, 1] = f11/totals[1]
# proportion estimate
C1[i, 2] = (C1[i, 1] * (1 - C1[i, 1]))/(totals[1] -
1)
# sample variance
C1[i, 3] = sqrt(C1[i, 2])
# sample standard error
C2[i, 1] = f12/totals[2]
C2[i, 2] = (C2[i, 1] * (1 - C2[i, 1]))/(totals[2] -
1)
C2[i, 3] = sqrt(C2[i, 2])
# f11 f12
# f21 f22 <- contigency table format
contingencytable <- array(0, dim = c(2, 2))
contingencytable[1, 1] = f11
contingencytable[1, 2] = f12
contingencytable[2, 1] = f21
contingencytable[2, 2] = f22
if (f11 > 20 && f22 > 20) {
csqt <- chisq.test(contingencytable)
pvalues[i] = csqt$p.value
} else {
ft <- fisher.test(contingencytable, workspace = 8e+06,
alternative = "two.sided", conf.int = FALSE)
pvalues[i] = ft$p.value
}
}
# *************************************
# calculate q values from p values
# *************************************
qvalues <- calc_qvalues(pvalues)
} else {
# we have multiple subjects per population
# *************************************
# generate statistics mean, var, stderr
# *************************************
for (i in 1:nrows) {
# for each taxa
# find the mean of each group
C1[i, 1] = mean(Pmatrix[i, 1:g - 1])
C1[i, 2] = var(Pmatrix[i, 1:g - 1])
# variance
C1[i, 3] = C1[i, 2]/(g - 1)
# std err^2 (will change to std err at end)
C2[i, 1] = mean(Pmatrix[i, g:ncols])
C2[i, 2] = var(Pmatrix[i, g:ncols])
# variance
C2[i, 3] = C2[i, 2]/(ncols - g + 1)
# std err^2 (will change to std err at end)
}
# *************************************
# two sample t-statistics
# *************************************
for (i in 1:nrows) {
# for each taxa
xbar_diff = C1[i, 1] - C2[i, 1]
denom = sqrt(C1[i, 3] + C2[i, 3])
T_statistics[i] = xbar_diff/denom
# calculate two sample t-statistic
}
# *************************************
# generate initial permuted p-values
# *************************************
pvalues <- permuted_pvalues(Pmatrix, T_statistics, B,
g, Fmatrix)
# *************************************
# generate p values for sparse data
# using fisher's exact test
# *************************************
for (i in 1:nrows) {
# for each taxa
if (sum(Fmatrix[i, 1:(g - 1)]) < (g - 1) && sum(Fmatrix[i,
g:ncols]) < (ncols - g + 1)) {
# then this is a candidate for fisher's exact test
f11 = sum(Fmatrix[i, 1:(g - 1)])
f12 = sum(Fmatrix[i, g:ncols])
f21 = sum(totals[1:(g - 1)]) - f11
f22 = sum(totals[g:ncols]) - f12
# f11 f12
# f21 f22 <- contigency table format
contingencytable <- array(0, dim = c(2, 2))
contingencytable[1, 1] = f11
contingencytable[1, 2] = f12
contingencytable[2, 1] = f21
contingencytable[2, 2] = f22
ft <- fisher.test(contingencytable, workspace = 8e+06,
alternative = "two.sided", conf.int = FALSE)
pvalues[i] = ft$p.value
}
}
# *************************************
# calculate q values from p values
# *************************************
qvalues <- calc_qvalues(pvalues)
# *************************************
# convert stderr^2 to std error
# *************************************
for (i in 1:nrows) {
C1[i, 3] = sqrt(C1[i, 3])
C2[i, 3] = sqrt(C2[i, 3])
}
}
# ************************************* threshold sigvalues
# and print *************************************
sigvalues <- array(0, dim = c(nrows, 1))
if (pflag == TRUE) {
# which are you thresholding by?
sigvalues <- pvalues
} else {
sigvalues <- qvalues
}
s = sum(sigvalues <= threshold)
Differential_matrix <- array(0, dim = c(s, 9))
dex = 1
for (i in 1:nrows) {
if (sigvalues[i] <= threshold) {
Differential_matrix[dex, 1] = jobj$taxa[i]
Differential_matrix[dex, 2:4] = C1[i, ]
Differential_matrix[dex, 5:7] = C2[i, ]
Differential_matrix[dex, 8] = pvalues[i]
Differential_matrix[dex, 9] = qvalues[i]
dex = dex + 1
}
}
# show(Differential_matrix);
Total_matrix <- array(0, dim = c(nrows, 9))
for (i in 1:nrows) {
Total_matrix[i, 1] = jobj$taxa[i]
Total_matrix[i, 2:4] = C1[i, ]
Total_matrix[i, 5:7] = C2[i, ]
Total_matrix[i, 8] = pvalues[i]
Total_matrix[i, 9] = qvalues[i]
}
write(t(Total_matrix), output, ncolumns = 9, sep = "\t")
}
# ************************************************************************
# ************************** SUBROUTINES
# ********************************
# ************************************************************************
# ***********************************************************************
# calc two sample two statistics g is the first column in the
# matrix representing the second condition
# ********************************************************************
calc_twosample_ts <- function(Pmatrix, g, nrows, ncols) {
C1 <- array(0, dim = c(nrows, 3))
# statistic profiles
C2 <- array(0, dim = c(nrows, 3))
Ts <- array(0, dim = c(nrows, 1))
if (nrows == 1) {
C1[1, 1] = mean(Pmatrix[1:g - 1])
C1[1, 2] = var(Pmatrix[1:g - 1])
# variance
C1[1, 3] = C1[1, 2]/(g - 1)
# std err^2
C2[1, 1] = mean(Pmatrix[g:ncols])
C2[1, 2] = var(Pmatrix[g:ncols])
# variance
C2[1, 3] = C2[1, 2]/(ncols - g + 1)
# std err^2
} else {
# generate statistic profiles for both groups
# mean, var, stderr
for (i in 1:nrows) {
# for each taxa
# find the mean of each group
C1[i, 1] = mean(Pmatrix[i, 1:g - 1])
C1[i, 2] = var(Pmatrix[i, 1:g - 1])
# variance
C1[i, 3] = C1[i, 2]/(g - 1)
# std err^2
C2[i, 1] = mean(Pmatrix[i, g:ncols])
C2[i, 2] = var(Pmatrix[i, g:ncols])
# variance
C2[i, 3] = C2[i, 2]/(ncols - g + 1)
# std err^2
}
}
# permutation based t-statistics
for (i in 1:nrows) {
# for each taxa
xbar_diff = C1[i, 1] - C2[i, 1]
denom = sqrt(C1[i, 3] + C2[i, 3])
Ts[i] = xbar_diff/denom
# calculate two sample t-statistic
}
return(Ts)
}
# **************************************************************************
# function to calculate qvalues. takes an unordered set of
# pvalues corresponding the rows of the matrix
# **********************************************************************
calc_qvalues <- function(pvalues) {
nrows = length(pvalues)
# create lambda vector
lambdas <- seq(0, 0.95, 0.01)
pi0_hat <- array(0, dim = c(length(lambdas)))
# calculate pi0_hat
for (l in 1:length(lambdas)) {
# for each lambda value
count = 0
for (i in 1:nrows) {
# for each p-value in order
if (pvalues[i] > lambdas[l]) {
count = count + 1
}
pi0_hat[l] = count/(nrows * (1 - lambdas[l]))
}
}
f <- unclass(smooth.spline(lambdas, pi0_hat, df = 3))
f_spline <- f$y
pi0 = f_spline[length(lambdas)]
# this is the essential pi0_hat value order p-values
ordered_ps <- order(pvalues)
pvalues <- pvalues
qvalues <- array(0, dim = c(nrows))
ordered_qs <- array(0, dim = c(nrows))
ordered_qs[nrows] <- min(pvalues[ordered_ps[nrows]] * pi0,
1)
for (i in (nrows - 1):1) {
p = pvalues[ordered_ps[i]]
new = p * nrows * pi0/i
ordered_qs[i] <- min(new, ordered_qs[i + 1], 1)
}
# re-distribute calculated qvalues to appropriate rows
for (i in 1:nrows) {
qvalues[ordered_ps[i]] = ordered_qs[i]
}
# plotting pi_hat vs. lambda
# lines(f);
return(qvalues)
}
# ***************************************************************************
# function to calculate permuted pvalues from Storey and
# Tibshirani(2003) B is the number of permutation cycles g is
# the first column in the matrix of the second condition
# **********************************************************************
permuted_pvalues <- function(Imatrix, tstats, B, g, Fmatrix) {
# B is the number of permutations were going to use! g is
# the first column of the second sample matrix stores tstats
# for each taxa(row) for each permuted trial(column)
M = nrow(Imatrix)
ps <- array(0, dim = c(M))
# to store the pvalues
if (is.null(M) || M == 0) {
return(ps)
}
permuted_ttests <- array(0, dim = c(M, B))
ncols = ncol(Fmatrix)
# calculate null version of tstats using B permutations.
for (j in 1:B) {
trial_ts <- permute_and_calc_ts(Imatrix, sample(1:ncol(Imatrix)),
g)
permuted_ttests[, j] <- abs(trial_ts)
}
# calculate each pvalue using the null ts
if ((g - 1) < 8 || (ncols - g + 1) < 8) {
# then pool the t's together!
# count how many high freq taxa there are
hfc = 0
for (i in 1:M) {
# for each taxa
if (sum(Fmatrix[i, 1:(g - 1)]) >= (g - 1) || sum(Fmatrix[i,
g:ncols]) >= (ncols - g + 1)) {
hfc = hfc + 1
}
}
# the array pooling just the frequently observed ts
cleanedpermuted_ttests <- array(0, dim = c(hfc, B))
hfc = 1
for (i in 1:M) {
if (sum(Fmatrix[i, 1:(g - 1)]) >= (g - 1) || sum(Fmatrix[i,
g:ncols]) >= (ncols - g + 1)) {
cleanedpermuted_ttests[hfc, ] = permuted_ttests[i,
]
hfc = hfc + 1
}
}
# now for each taxa
for (i in 1:M) {
ps[i] = (1/(B * hfc)) * sum(cleanedpermuted_ttests >
abs(tstats[i]))
}
} else {
for (i in 1:M) {
ps[i] = (1/(B + 1)) * (sum(permuted_ttests[i, ] >
abs(tstats[i])) + 1)
}
}
return(ps)
}
# *************************************************************************
# takes a matrix, a permutation vector, and a group division
# g. returns a set of ts based on the permutation.
# **************************************************************************
permute_and_calc_ts <- function(Imatrix, y, g) {
nr = nrow(Imatrix)
nc = ncol(Imatrix)
# first permute the rows in the matrix
Pmatrix <- Imatrix[, y[1:length(y)]]
Ts <- calc_twosample_ts(Pmatrix, g, nr, nc)
return(Ts)
}
# ***********************************************************************
# load up the frequency matrix from a file
# **********************************************************************
load_frequency_matrix <- function(file) {
dat2 <- read.table(file, header = TRUE, sep = "\t")
# load names
subjects <- array(0, dim = c(ncol(dat2)))
for (i in 1:length(subjects)) {
subjects[i] <- as.character(dat2[1, i])
}
# load taxa
taxa <- array(0, dim = c(nrow(dat2)))
for (i in 1:length(taxa)) {
taxa[i] <- as.character(dat2[i, 1])
}
dat2 <- read.table(file, header = TRUE, sep = "\t")
# load remaining counts
matrix <- array(0, dim = c(length(taxa), length(subjects)))
for (i in 1:length(taxa)) {
for (j in 1:length(subjects)) {
matrix[i, j] <- as.numeric(dat2[i, j])
}
}
jobj <- list(matrix = matrix, taxa = taxa)
return(jobj)
}
receive_frequency_matrix <- function(countsMatrix) {
dat2 <- countsMatrix
# load names
subjects <- array(0, dim = c(ncol(dat2)))
for (i in 1:length(subjects)) {
subjects[i] <- as.character(dat2[1, i])
}
# load taxa
taxa <- rownames(countsMatrix)
dat2 <- countsMatrix
# load remaining counts
matrix <- array(0, dim = c(length(taxa), length(subjects)))
for (i in 1:length(taxa)) {
for (j in 1:length(subjects)) {
matrix[i, j] <- as.numeric(dat2[i, j])
}
}
jobj <- list(matrix = matrix, taxa = taxa)
return(jobj)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.