#' \code{testRelations}
#' @title testRelations
#' @export
#' @description This function applies a statistical test either a correlation
#' (spearman, pearson), wilcoxon or t.test as a function of a given phenotype.
#' It will return a matrix of probabilities p and q values along with the
#' correlation coefficient or the enrichment variable when a binary parameter.
#' @author Edi Prifti & Emmanuelle Le Chatelier
#' @param data : frequency matrix with gene_ids in the rownames
#' @param trait : a vector with the trait to test, binary or numerical variable
#' @param type : a character string indicating the type of test to be applied
#' @param restrict : an optional logical vector to select a subset of the
#' samples to perform the test default restrict = rep(TRUE, ncol(data))
#' i.e. all the samples are selected
#' @param multiple.adjust : type of multiple adjustment default is "BH" i.e.
#' Benjamini & Hochberg method
#' @param paired : logical with default FALSE wether the test should be paired or not
#' @param debug : default FALSE, when TRUE the progress is printed each 1000 steps
#' @return a matrix with analytical results (correlation tests) indicating rho,
#' rho2, p and q values for each parameter tested along with the status of the test
#' @note New add on taking into account a trait for correlation, when it is a two
#' class variable with the same number of elements a correlation between
#' both groups is performed
testRelations <- function (data,
trait,
type,
restrict = rep(TRUE, ncol(data)),
multiple.adjust = "BH",
paired = FALSE,
debug = FALSE)
{
# Sanity checks
if (length(trait) != ncol(data))
{
stop(
"testRelations: please make sure that dimensions of the trait correspond to that of the data."
)
}
if (length(trait) != sum(restrict))
{
stop(
"testRelations: please make sure that dimensions of the trait corresponds to the restrict selection."
)
}
if (any(class(data) != "matrix"))
{
print(paste("testRelations: Converting", class(data), "in matrix"))
data <- as.matrix(data)
}
# prepare an empty result table.
res <- as.data.frame(matrix(NA, nrow = nrow(data), ncol = 5))
rownames(res) <- rownames(data)
colnames(res) <- c("rho", "rho2", "p", "q", "status")
trait.val <- names(table(trait))
if (type != "spearman" & type != "pearson")
{
if (length(table(trait)) != 2)
{
stop(
"testRelations: sorry, you can't use this test. The trait should contain only two categories!"
)
}
# Categorical test
if (type == "wilcoxon" | type == "t.test")
{
if (type == "wilcoxon")
{
if (debug)
{
print("Executing Wilcoxon test")
}
for (i in 1:nrow(data))
{
if (i %% 1000 == 0 & !debug)
{
print(i)
}
tmp <- wilcox.test(data[i, restrict] ~ trait[restrict], paired = paired)
res[i, "p"] <- tmp$p.value
if (res[i, "p"] == "NaN")
{
res[i, "status"] <- NA
} else
{
if (mean(data[i, restrict][trait == trait.val[1]]) >
mean(data[i, restrict][trait == trait.val[2]]))
{
res[i, "status"] <- trait.val[1]
} else
{
res[i, "status"] <- trait.val[2]
}
}
} # end for
res[, 4] <- p.adjust(res[, "p"], method = multiple.adjust)
} # end wilcoxon
else
{
if (debug)
{
print("Executing T test")
}
for (i in 1:nrow(data))
{
if (i %% 1000 == 0 & !debug)
{
print(i)
}
tmp <- t.test(data[i, restrict] ~ trait[restrict], paired = paired)
res[i, "p"] <- tmp$p.value
if (res[i, "p"] == "NaN")
{
res[i, "status"] <- NA
} else
{
if (mean(data[i, restrict][trait == trait.val[1]]) >
mean(data[i, restrict][trait == trait.val[2]]))
{
res[i, "status"] <- trait.val[1]
} else
{
res[i, "status"] <- trait.val[2]
}
}
} # end for
res[, 4] <- p.adjust(res[, "p"], method = multiple.adjust)
}
}
else
{
stop(
"testRelations: Sorry, your test does not exist! Available : spearman, pearson, t.test and wilcoxon"
)
}
}
else {
if (type == "spearman")
{
if (paired)
{
if (debug)
{
print("Entering correlation mode between two classes")
}
if (length(table(trait)) != 2 |
table(trait)[1] != table(trait)[2])
{
stop(
"testRelations: Sorry, trait does not seem to be a 2-level categorical variable or with the same prevalence"
)
}
cl1 <- names(table(trait)[1])
cl1.ind <- trait == cl1
cl2 <- names(table(trait)[2])
cl2.ind <- trait == cl2
if (debug)
{
print("Executing Spearman correlation")
}
for (i in 1:nrow(data))
{
if (i %% 1000 == 0 & !debug)
{
print(i)
}
tmp <- Hmisc::rcorr(data[i, cl1.ind], data[i, cl2.ind], type = "spearman")
res[i, 1] <- tmp$r[1, 2]
if (!is.na(res[i, 1]))
{
if (res[i, 1] > 0)
{
res[i, 5] <- "POS"
}
else
{
res[i, 5] <- "NEG"
}
}
res[i, 2] <- res[i, 1] ^ 2
res[i, 3] <- tmp$P[1, 2]
}
res[, 4] <- p.adjust(res[, "p"], method = multiple.adjust)
} # end if paired
else
{
if (debug)
{
print("Executing Spearman correlation")
}
for (i in 1:nrow(data))
{
if (i %% 1000 == 0 & !debug)
{
print(i)
}
tmp <- Hmisc::rcorr(data[i, restrict], trait[restrict], type = "spearman")
res[i, 1] <- tmp$r[1, 2]
if (!is.na(res[i, 1]))
{
if (res[i, 1] > 0)
{
res[i, 5] <- "POS"
}
else
{
res[i, 5] <- "NEG"
}
}
res[i, 2] <- res[i, 1] ^ 2
res[i, 3] <- tmp$P[1, 2]
}
res[, 4] <- p.adjust(res[, "p"], method = multiple.adjust)
} # end else unpaired
} # end spearman
else {
if (paired)
{
if (debug)
{
print("Entering correlation mode between two classes")
}
if (length(table(trait)) != 2 |
table(trait)[1] != table(trait)[2])
{
stop(
"testRelations: Sorry, trait does not seem to be a 2-level categorical variable or with the same prevalence"
)
}
cl1 <- names(table(trait)[1])
cl1.ind <- trait == cl1
cl2 <- names(table(trait)[2])
cl2.ind <- trait == cl2
print("Executing Pearson correlation")
for (i in 1:nrow(data))
{
if (i %% 1000 == 0 & !debug)
{
print(i)
}
tmp <- Hmisc::rcorr(data[i, cl1.ind], data[i, cl2.ind], type = "pearson")
res[i, 1] <- tmp$r[1, 2]
if (!is.na(res[i, 1]))
{
if (res[i, 1] > 0)
{
res[i, 5] <- "POS"
}
else
{
res[i, 5] <- "NEG"
}
}
res[i, 2] <- res[i, 1] ^ 2
res[i, 3] <- tmp$P[1, 2]
}
res[, 4] <- p.adjust(res[, "p"], method = multiple.adjust)
} # end if paired
else
{
if (debug)
{
print("Executing Pearson correlation")
}
for (i in 1:nrow(data))
{
if (i %% 1000 == 0 & !debug)
{
print(i)
}
tmp <- Hmisc::rcorr(data[i, restrict], trait[restrict], type = "pearson")
res[i, 1] <- tmp$r[1, 2]
if (!is.na(res[i, 1]))
{
if (res[i, 1] > 0)
{
res[i, 5] <- "POS"
}
else
{
res[i, 5] <- "NEG"
}
}
res[i, 2] <- res[i, 1] ^ 2
res[i, 3] <- tmp$P[1, 2]
}
res[, 4] <- p.adjust(res[, "p"], method = multiple.adjust)
}
}
}
return(res)
}
#' \code{hierClust}
#' @title hierClust
#' @export
#' @import gplots
#' @export
#' @description This function computes the pairwise distance between samples
#' and computes a hierarchical clustering that is further depicted as a
#' heatmap graphic. The distance is computed as 1-correlation.
#' @author Edi Prifti
#' @param data : frequency matrix with gene_ids in the rownames
#' @param side : the distance can be performed on the columns or on the rows
#' @param dist : the type of distance used. By default this is correlation based similarity
#' @param cor.type : when correlation matrix, the default is Spearman
#' @param hclust.method : the hierarchical clustering method, by default it is the ward.D method
#' @param side.col.c : a vector of colors to be applied in the columns, usually depicting a class
#' @param side.col.r : a vector of colors to be applied in the rows, usually depicting a class
#' @param plot : logical default TRUE. It will plot the heatmap of the similarity with the hierarchical clustering
#' @return it will return a list of three variables, the correlation matrix, the distance matrix and the hclust object
#' @note updated hierClust functions by elechat april 7th 2015 added options SideColors added + spearman == pearson(rank)
hierClust <- function (data,
side = "col",
dist = "correlation",
cor.type = "spearman",
hclust.method = "ward.D",
side.col.c = NULL,
side.col.r = NULL,
plot = TRUE) {
res <- NULL
if (side == "col") {
if (sum(colSums(data, na.rm = TRUE) == 0) > 0) {
warning("Warning there are samples with no signal that need to be discarded")
}
}
else {
if (sum(rowSums(data, na.rm = TRUE) == 0) > 0) {
warning("Warning there are genes with no signal that need to be discarded")
}
}
if (dist == "correlation") {
if (side != "col") {
# if rows we need to transpose
data <- t(data)
}
if (cor.type == "spearman") {
data <- apply(data, 2, rank)
} # compute the rank for spearman
mat.rho <- Hmisc::rcorr(data, type = "pearson")$r
diag(mat.rho) <- NA # don't use the diagonal, correlation with themselves
# compute the distance as 1-correlation
mat.dist <- as.dist(1 - mat.rho)
mat.hclust <- hclust(d = mat.dist, method = hclust.method)
diag(mat.rho) <- 1 # add the diag again
if (plot) {
if (is.null(side.col.c) &
is.null(side.col.r)) {
# if none is provided
heatmap.2(
mat.rho,
scale = "none",
trace = "none",
Rowv = as.dendrogram(mat.hclust),
Colv = as.dendrogram(mat.hclust),
margins = c(6, 6),
cex.axis = 0.7
)
} else {
if (is.null(side.col.c)) {
# if column class is not provided but the row is
if (length(side.col.r) != ncol(mat.rho)) {
warning("side.col.r must be a character vector of entry length")
}
heatmap.2(
mat.rho,
scale = "none",
trace = "none",
Rowv = as.dendrogram(mat.hclust),
Colv = as.dendrogram(mat.hclust),
margins = c(6, 6),
cex.axis = 0.7,
RowSideColors = side.col.r
)
} else{
# if row class is not provided but the culumn is
if (is.null(side.col.r)) {
if (length(side.col.c) != ncol(mat.rho)) {
warning("side.col.c must be a character vector of entry length")
}
heatmap.2(
mat.rho,
scale = "none",
trace = "none",
Rowv = as.dendrogram(mat.hclust),
Colv = as.dendrogram(mat.hclust),
margins = c(6, 6),
cex.axis = 0.7,
ColSideColors = side.col.c
)
} else {
# if both classes are provided
if (length(side.col.r) != ncol(mat.rho)) {
warning("side.col.r must be a character vector of entry length")
}
if (length(side.col.c) != ncol(mat.rho)) {
warning("side.col.c must be a character vector of entry length")
}
heatmap.2(
mat.rho,
scale = "none",
trace = "none",
Rowv = as.dendrogram(mat.hclust),
Colv = as.dendrogram(mat.hclust),
margins = c(6, 6),
cex.axis = 0.7,
ColSideColors = side.col.c,
RowSideColors = side.col.r
)
}
}
}
}
res <- list(mat.rho = mat.rho,
mat.dist = mat.dist,
mat.hclust = mat.hclust)
} else {
stop("Other distances are not yet implemented, only 1-correlation is used here !")
}
return(res)
}
#' \code{filt.hierClust}
#' @title filt.hierClust
#' @export
#' @description This function takes as input a square similarity matrix and searches for clusters of samples with strong associations
#' and extracts the sub matrix with the closely related sampless. Only positive correlations are considered here.
#' @import gplots
#' @author Emmanuelle Le Chatelier & Edi Prifti
#' @param mat.rho : square correlation matrix with ids (can be used for also other than just samples)
#' @param hclust.method : the hierarchical clustering method, by default it is the ward.D method
#' @param margins : change margins of the graph (default = c(6, 6))
#' @param side.col.c : a vector of colors to be applied in the columns, usually depicting a class
#' @param side.col.r : a vector of colors to be applied in the rows, usually depicting a class
#' @param size : the number of samples in the resulting ordered matrix
#' @param plot : logical default TRUE. It will plot the heatmap of the similarity with the hierarchical clustering
#' @param filt : default is 0.5 and is the filtering threshold to be applied
#' @return it will return a matrix with samples in rows and their closely related ones on the columns along with the
#' correlation score.
filt.hierClust <- function (mat.rho,
hclust.method = "ward",
margins = c(6, 6),
side.col.c = NULL,
# margins added as option
side.col.r = NULL,
size = 10,
plot = TRUE,
filt = 0.5)
{
rho <- mat.rho
diag(mat.rho) <- 0
if ((nrow(mat.rho) - 1) < size)
{
warning(paste("There are less than", size, "samples"))
size <- nrow(mat.rho) - 1
}
res <- as.data.frame(matrix(NA, ncol(mat.rho), size * 2))
rownames(res) <- colnames(mat.rho)
colnames(res) <- paste(c("Hit", "Hit_rho"), sort(c(1:size, 1:size)), sep = "_")
for (i in 1:nrow(mat.rho))
{
tmp <- sort(mat.rho[i, ], decreasing = T)
tmp <- tmp[1:size]
res[i, seq(1, size * 2, by = 2)] <- names(tmp)
res[i, seq(2, size * 2, by = 2)] <- round(tmp, 3)
sel <- colnames(res)[res[i, ] == 0]
sel <- gsub("rho_", "", sel)
res[i, sel] <- 0
}
N <- apply(mat.rho, 1, max)
N <- N > filt
mat.rho <- rho[N, N]
if (all(dim(mat.rho) != 0))
{
if (!is.null(side.col.c))
{
side.col.c <- side.col.c[N]
}
if (!is.null(side.col.r))
{
side.col.r <- side.col.r[N]
}
diag(mat.rho) <- NA
mat.dist <- as.dist(1 - mat.rho)
mat.hclust <- hclust(d = mat.dist, method = hclust.method)
diag(mat.rho) <- 1
if (plot)
{
if (is.null(side.col.c) & is.null(side.col.r))
{
heatmap.2(
mat.rho,
scale = "none",
trace = "none",
Rowv = as.dendrogram(mat.hclust),
Colv = as.dendrogram(mat.hclust),
margins = margins,
cex.axis = 0.7
)
}
else {
if (is.null(side.col.c))
{
if (length(side.col.r) != ncol(mat.rho))
{
warning("side.col.r must be a character vector of entry length")
}
heatmap.2(
mat.rho,
scale = "none",
trace = "none",
Rowv = as.dendrogram(mat.hclust),
Colv = as.dendrogram(mat.hclust),
margins = margins,
cex.axis = 0.7,
RowSideColors = side.col.r
)
}
else
{
if (is.null(side.col.r))
{
if (length(side.col.c) != ncol(mat.rho))
{
warning("side.col.c must be a character vector of entry length")
}
heatmap.2(
mat.rho,
scale = "none",
trace = "none",
Rowv = as.dendrogram(mat.hclust),
Colv = as.dendrogram(mat.hclust),
margins = margins,
cex.axis = 0.7,
ColSideColors = side.col.c
)
}
else
{
if (length(side.col.r) != ncol(mat.rho))
{
warning("side.col.r must be a character vector of entry length")
}
if (length(side.col.c) != ncol(mat.rho))
{
warning("side.col.c must be a character vector of entry length")
}
heatmap.2(
mat.rho,
scale = "none",
trace = "none",
Rowv = as.dendrogram(mat.hclust),
Colv = as.dendrogram(mat.hclust),
margins = margins,
cex.axis = 0.7,
ColSideColors = side.col.c,
RowSideColors = side.col.r
)
}
}
}
}
}
else
{
warning(paste("There are no related samples above the threshold", filt))
}
return(res)
}
#' \code{lmp}
#' @title lmp
#' @description This function will extract the p-value from a linear model object. It is used by phenoPairwiseRelations
#' @author Edi Prifti
#' @param modelobject : a linear model object as produced by lm()
#' @return a p-value
lmp <- function (modelobject)
{
if (class(modelobject) != "lm")
stop("Not an object of class 'lm' ")
f <- summary(modelobject)$fstatistic
p <- pf(f[1], f[2], f[3], lower.tail = F)
attributes(p) <- NULL
return(p)
}
#' \code{phenoPairwiseRelations}
#' @title phenoPairwiseRelations
#' @description This function will compute all the relations between different variables adapting different statistical tests as a
#' function of the data type. It will adjust the p-value matrix for multiple testing.
#' @author Edi Prifti
#' @param data : bioclinical data with variables on the rows and samples on the columns
#' @param adjust : method to adjust for multiple testing (default="BH)
#' @param verbose : default=FALSE. If TRUE information will be printed to follow the progression.
#' @return a list of two matrixes containing the p-values and the multiple testing adjustment.
phenoPairwiseRelations <- function(data,
adjust = "BH",
verbose = FALSE)
{
#require(nortest)
#require(Hmisc)
data.relations <- matrix(NA, ncol(data), ncol(data))
colnames(data.relations) <- colnames(data)
rownames(data.relations) <- colnames(data)
for (i in 1:ncol(data)) {
if (verbose)
print(paste(i, colnames(data)[i]))
for (j in 1:ncol(data)) {
if (verbose)
print(paste(j, colnames(data)[j]))
x <- data[, i]
y <- data[, j]
# test weather the variables are factors
# a) both factors
if (is.factor(x)) {
if (is.factor(y)) {
# perfom a chisquare
if (nrow(table(x, y)) == 1 |
ncol(table(x, y)) == 1 | sum(table(x, y)) == 0) {
# do nothing
} else{
data.relations[i, j] <- chisq.test(table(x, y))$p.value
}
} else{
# b) x is a factor but not y
# test for normality
if (nortest::lillie.test(y)$p.value > 0.05) {
#if normal
if (length(table(x)) > 1) {
if (length(table(x)) == 2) {
if (all(colSums(table(y, x)) > 1)) {
data.relations[i, j] <- t.test(y ~ x)$p.value
}
} else{
if (sum(table(y, x)) > 1 &
length(unique(x[!is.na(x)])) != 1 & sum(colSums(table(y, x)) > 0) > 1) {
tmp <- lmp(lm(y ~ x))
if (!is.null(tmp)) {
data.relations[i, j] <- tmp
}
}
}
}
} else{
# not normal
# run anova
if (length(table(x)) > 1) {
if (length(table(x)) == 2) {
# should test here the distribution of y if normal than t.test otherwise wilcox
if (all(colSums(table(y, x)) > 1)) {
data.relations[i, j] <- wilcox.test(y ~ x)$p.value
}
} else{
if (sum(table(y, x)) > 1 &
length(unique(x[!is.na(x)])) != 1 & sum(colSums(table(y, x)) > 0) > 1) {
tmp <- kruskal.test(y ~ x)
if (!is.null(tmp)) {
data.relations[i, j] <- tmp$p.value
}
}
}
}
}
}
} else{
# x is not a factor
if (is.factor(y)) {
# but y is a factor
# test for normality
if (nortest::lillie.test(x)$p.value > 0.05) {
#if normal
if (length(table(y)) > 1) {
if (length(table(y)) == 2) {
if (all(colSums(table(x, y)) > 1)) {
data.relations[i, j] <- t.test(x ~ y)$p.value
}
} else{
if (sum(table(x, y)) > 1 &
length(unique(y[!is.na(y)])) != 1 & sum(colSums(table(x, y)) > 0) > 1) {
tmp <- lmp(lm(x ~ y))
if (!is.null(tmp)) {
data.relations[i, j] <- tmp
}
}
}
}
} else{
# non normal
if (length(table(y)) > 1) {
if (length(table(y)) == 2) {
if (all(colSums(table(x, y)) > 1)) {
data.relations[i, j] <- wilcox.test(x ~ y)$p.value
}
} else{
if (sum(table(x, y)) > 1 &
length(unique(y[!is.na(y)])) != 1 & sum(colSums(table(x, y)) > 0) > 1) {
tmp <- kruskal.test(x ~ y)
if (!is.null(tmp)) {
data.relations[i, j] <- tmp$p.value
}
}
}
}
}
} else{
# if both are numeric, run a correlation by testing for normality
if (length(x[!is.na(x)]) > 4 & length(x[!is.na(y)]) > 4) {
if (nortest::lillie.test(x)$p.value < 0.05 |
nortest::lillie.test(y)$p.value < 0.05) {
# use spearman
data.relations[i, j] <- Hmisc::rcorr(x, y, type = "spearman")$P[1, 2]
} else{
# use pearson
data.relations[i, j] <- Hmisc::rcorr(x, y, type = "pearson")$P[1, 2]
}
}
}
}
}
}
# adjust for multiple testing
data.relations.q <- matrix(p.adjust(data.relations , method = adjust),
nrow = nrow(data.relations))
colnames(data.relations.q) <- colnames(data.relations)
rownames(data.relations.q) <- rownames(data.relations)
return(list(p = data.relations, q = data.relations.q))
}
#' \code{extractSignificant}
#' @export
#' @title extractSignificant
#' @description This function will extract a list of vectors p- or q-values from an object produced by phenoPairwiseRelations.
#' @author Edi Prifti
#' @param relation.matrix : a matrix of p- produced by or q-values by phenoPairwiseRelations()
#' @param interest : a vector of variable names of interest.
#' @param threshold : default 0.05 needed to select significant relations
#' @return a list of vectors containing p-values or q-values along with the names of the variables.
extractSignificant <- function(relation.matrix, interest, threshold = 0.05)
{
res <- list()
for (i in 1:length(interest)) {
tmp.val <- relation.matrix[, interest[i]]
tmp <- (tmp.val < threshold)
tmp[is.na(tmp)] <- FALSE
res[[i]] <- tmp.val[tmp]
}
names(res) <- interest
return(res)
}
#' End of section and file
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.