#' \code{testRelations}
#' @title testRelations
#' @export
#' @description This function applies a statistical test either a correlation (spearman, pearson),
#' wilcoxon or t.test as a function of a given phenotype. It will return a matrix of probabilities
#' p and q values along with the correlation coefficient or the enrichment variable when a binary parameter.
#' @author Edi Prifti & Emmanuelle Le Chatelier
#' @param data : frequency matrix with gene_ids in the rownames
#' @param trait : a vector with the trait to test, binary or numerical variable
#' @param type : a character string indicating the type of test to be applied
#' @param restrict : an optional logical vector to select a subset of the samples to perform the test
#' default restrict = rep(TRUE, ncol(data)) ie all the samples are selected
#' @param multiple.adjust : type of multiple adjustment default is "BH" i.e. Benjamini & Hochberg method
#' @param paired : logical with default FALSE wether the test should be paired or not
#' @param debug : default FALSE, when TRUE the progress is printed each 1000 steps
#' @return a matrix with analytical results (correlation tests) indicating rho, rho2, p and q values for each parameter tested
#' @note New addon taking into account a trait for correlation, when it is a two class variable with the same number of elements
#' a correlation between both groups is performed
testRelations <- function (data, trait, type, restrict = rep(TRUE, ncol(data)),
multiple.adjust = "BH", paired = FALSE, debug = FALSE)
{
# Sanity checks
if(length(trait) != ncol(data))
{
stop("testRelations: please make sure that dimensions of the trait correspond to that of the data.")
}
if(length(trait) != sum(restrict))
{
stop("testRelations: please make sure that dimensions of the trait corresponds to the restrict selection.")
}
if(any(class(data) != "matrix"))
{
print(paste("testRelations: Converting",class(data),"in matrix"))
data <- as.matrix(data)
}
# prepare an empty result table.
res <- as.data.frame(matrix(NA, nrow = nrow(data), ncol = 5))
rownames(res) <- rownames(data)
colnames(res) <- c("rho", "rho2", "p", "q", "status")
trait.val <- names(table(trait))
if (type != "spearman" & type != "pearson")
{
if (length(table(trait)) != 2)
{
stop("testRelations: sorry, you can't use this test. The trait should contain only two categories!")
}
# Categorical test
if (type == "wilcoxon" | type == "t.test")
{
if (type == "wilcoxon")
{
if (debug)
{
print("Executing Wilcoxon test")
}
for (i in 1:nrow(data))
{
if (i%%1000 == 0 & !debug)
{
print(i)
}
tmp <- wilcox.test(data[i, restrict] ~ trait[restrict], paired = paired)
res[i, "p"] <- tmp$p.value
if(res[i, "p"]=="NaN")
{
res[i, "status"] <- NA
} else
{
if (mean(data[i, restrict][trait == trait.val[1]]) >
mean(data[i, restrict][trait == trait.val[2]]))
{
res[i, "status"] <- trait.val[1]
} else
{
res[i, "status"] <- trait.val[2]
}
}
} # end for
res[, 4] <- p.adjust(res[, "p"], method = multiple.adjust)
} # end wilcoxon
else
{
if (debug)
{
print("Executing T test")
}
for (i in 1:nrow(data))
{
if (i%%1000 == 0 & !debug)
{
print(i)
}
tmp <- t.test(data[i, restrict] ~ trait[restrict], paired = paired)
res[i, "p"] <- tmp$p.value
if(res[i, "p"]=="NaN")
{
res[i, "status"] <- NA
} else
{
if (mean(data[i, restrict][trait == trait.val[1]]) >
mean(data[i, restrict][trait == trait.val[2]]))
{
res[i, "status"] <- trait.val[1]
} else
{
res[i, "status"] <- trait.val[2]
}
}
} # end for
res[, 4] <- p.adjust(res[, "p"], method = multiple.adjust)
}
}
else
{
stop("testRelations: Sorry, your test does not exist! Available : spearman, pearson, t.test and wilcoxon")
}
}
else {
if (type == "spearman")
{
if (paired)
{
if (debug)
{
print("Entering correlation mode between two classes")
}
if (length(table(trait)) != 2 | table(trait)[1] != table(trait)[2])
{
stop("testRelations: Sorry, trait does not seem to be a 2-level categorical variable or with the same prevalence")
}
cl1 <- names(table(trait)[1])
cl1.ind <- trait == cl1
cl2 <- names(table(trait)[2])
cl2.ind <- trait == cl2
if (debug)
{
print("Executing Spearman correlation")
}
for (i in 1:nrow(data))
{
if (i%%1000 == 0 & !debug)
{
print(i)
}
tmp <- Hmisc::rcorr(data[i, cl1.ind], data[i, cl2.ind], type = "spearman")
res[i, 1] <- tmp$r[1, 2]
if (!is.na(res[i, 1]))
{
if (res[i, 1] > 0)
{
res[i, 5] <- "POS"
}
else
{
res[i, 5] <- "NEG"
}
}
res[i, 2] <- res[i, 1]^2
res[i, 3] <- tmp$P[1, 2]
}
res[, 4] <- p.adjust(res[, "p"], method = multiple.adjust)
} # end if paired
else
{
if (debug)
{
print("Executing Spearman correlation")
}
for (i in 1:nrow(data))
{
if (i%%1000 == 0 & !debug)
{
print(i)
}
tmp <- Hmisc::rcorr(data[i, restrict], trait[restrict], type = "spearman")
res[i, 1] <- tmp$r[1, 2]
if (!is.na(res[i, 1]))
{
if (res[i, 1] > 0)
{
res[i, 5] <- "POS"
}
else
{
res[i, 5] <- "NEG"
}
}
res[i, 2] <- res[i, 1]^2
res[i, 3] <- tmp$P[1, 2]
}
res[, 4] <- p.adjust(res[, "p"], method = multiple.adjust)
} # end else unpaired
} # end spearman
else {
if (paired)
{
if (debug)
{
print("Entering correlation mode between two classes")
}
if (length(table(trait)) != 2 | table(trait)[1] != table(trait)[2])
{
stop("testRelations: Sorry, trait does not seem to be a 2-level categorical variable or with the same prevalence")
}
cl1 <- names(table(trait)[1])
cl1.ind <- trait == cl1
cl2 <- names(table(trait)[2])
cl2.ind <- trait == cl2
print("Executing Pearson correlation")
for (i in 1:nrow(data))
{
if (i%%1000 == 0 & !debug)
{
print(i)
}
tmp <- Hmisc::rcorr(data[i, cl1.ind], data[i, cl2.ind], type = "pearson")
res[i, 1] <- tmp$r[1, 2]
if (!is.na(res[i, 1]))
{
if (res[i, 1] > 0)
{
res[i, 5] <- "POS"
}
else
{
res[i, 5] <- "NEG"
}
}
res[i, 2] <- res[i, 1]^2
res[i, 3] <- tmp$P[1, 2]
}
res[, 4] <- p.adjust(res[, "p"], method = multiple.adjust)
} # end if paired
else
{
if (debug)
{
print("Executing Pearson correlation")
}
for (i in 1:nrow(data))
{
if (i%%1000 == 0 & !debug)
{
print(i)
}
tmp <- Hmisc::rcorr(data[i, restrict], trait[restrict], type = "pearson")
res[i, 1] <- tmp$r[1, 2]
if (!is.na(res[i, 1]))
{
if (res[i, 1] > 0)
{
res[i, 5] <- "POS"
}
else
{
res[i, 5] <- "NEG"
}
}
res[i, 2] <- res[i, 1]^2
res[i, 3] <- tmp$P[1, 2]
}
res[, 4] <- p.adjust(res[, "p"], method = multiple.adjust)
}
}
}
return(res)
}
#' \code{hierClust}
#' @title hierClust
#' @export
#' @import gplots
#' @export
#' @description This function computes the pairwise distance between samples and computes a hierarchical clustering
#' that is further depicted as a heatmap graphic
#' @author Edi Prifti
#' @param data : frequency matrix with gene_ids in the rownames
#' @param side : the distance can be performed on the columns or on the rows
#' @param dist : the type of distance used. By default this is correlation based similarity
#' @param cor.type : when correlation matrix, the default is spearman
#' @param hclust.method : the hierarchical clustering method, by default it is the ward.D method
#' @param side.col.c : a vector of colors to be applied in the columns, usually depincting a class
#' @param side.col.r : a vector of colors to be applied in the rows, usually depincting a class
#' @param plot : logical default TRUE. It will plot the heatmap of the similarity with the hierchical clustering
#' @return it will return a list of three variables, the correlation matrix, the distance matrix and the hclust object
#' @note updated hierClust functions by elechat april 7th 2015 added options SideColors added + spearman == pearson(rank)
hierClust <- function (data, side = "col", dist = "correlation", cor.type = "spearman",
hclust.method = "ward.D", side.col.c = NULL, side.col.r = NULL, plot = TRUE) {
res <- NULL
if (side == "col") {
if (sum(colSums(data, na.rm=TRUE) == 0) > 0) {
warning("Warning there are samples with no signal that need to be discarded")
}
}
else {
if (sum(rowSums(data, na.rm=TRUE) == 0) > 0) {
warning("Warning there are genes with no signal that need to be discarded")
}
}
if (dist == "correlation") {
if (side != "col") { # if rows we need to transpose
data <- t(data)
}
if (cor.type == "spearman") { data <- apply(data, 2, rank) } # compute the rank for spearman
mat.rho <- Hmisc::rcorr(data, type = "pearson")$r
diag(mat.rho) <- NA # don't use the diagonal, correlation with themselves
# compute the distance as 1-correlation
mat.dist <- as.dist(1 - mat.rho)
mat.hclust <- hclust(d = mat.dist, method = hclust.method)
diag(mat.rho) <- 1 # add the diag again
if (plot) {
if (is.null(side.col.c) & is.null(side.col.r)) { # if none is provided
heatmap.2(mat.rho, scale = "none", trace = "none",
Rowv = as.dendrogram(mat.hclust), Colv = as.dendrogram(mat.hclust),
margins = c(6, 6), cex.axis = 0.7)
} else {
if (is.null(side.col.c)){ # if column class is not provided but the row is
if (length(side.col.r) != ncol(mat.rho)) {warning("side.col.r must be a character vector of entry length")}
heatmap.2(mat.rho, scale = "none", trace = "none",
Rowv = as.dendrogram(mat.hclust), Colv = as.dendrogram(mat.hclust),
margins = c(6, 6), cex.axis = 0.7, RowSideColors=side.col.r)
}else{ # if row class is not provided but the culumn is
if (is.null(side.col.r)){
if (length(side.col.c) != ncol(mat.rho)) {warning("side.col.c must be a character vector of entry length")}
heatmap.2(mat.rho, scale = "none", trace = "none",
Rowv = as.dendrogram(mat.hclust), Colv = as.dendrogram(mat.hclust),
margins = c(6, 6), cex.axis = 0.7, ColSideColors=side.col.c)
} else { # if both classes are provided
if (length(side.col.r) != ncol(mat.rho)) {warning("side.col.r must be a character vector of entry length")}
if (length(side.col.c) != ncol(mat.rho)) {warning("side.col.c must be a character vector of entry length")}
heatmap.2(mat.rho, scale = "none", trace = "none",
Rowv = as.dendrogram(mat.hclust), Colv = as.dendrogram(mat.hclust),
margins = c(6, 6), cex.axis = 0.7, ColSideColors=side.col.c, RowSideColors=side.col.r)
}
}
}
}
res <- list(mat.rho = mat.rho, mat.dist = mat.dist, mat.hclust = mat.hclust)
} else { stop("Other distances are not yet implemented, only 1-correlation is used here !") }
return(res)
}
#' \code{filt.hierClust}
#' @title filt.hierClust
#' @export
#' @description This function takes as input a square similarity matrix and searches for clusters of samples with strong associations
#' and extracts the sub matrix with the closely related sampless. Only positive correlations are considered here.
#' @import gplots
#' @author Emmanuelle Le Chatelier & Edi Prifti
#' @param mat.rho : square correlation matrix with ids (can be used for also other than just samples)
#' @param hclust.method : the hierarchical clustering method, by default it is the ward.D method
#' @param margins : change margins of the graph (default = c(6, 6))
#' @param side.col.c : a vector of colors to be applied in the columns, usually depincting a class
#' @param side.col.r : a vector of colors to be applied in the rows, usually depincting a class
#' @param size : the number of samples in the resulting ordered matrix
#' @param plot : logical default TRUE. It will plot the heatmap of the similarity with the hierchical clustering
#' @param filt : default is 0.5 and is the filtering threshold to be applied
#' @return it will return a matrix with samples in rows and their closely related ones on the columns along with the
#' correlation score.
filt.hierClust <- function (mat.rho, hclust.method = "ward", margins = c(6, 6), side.col.c = NULL, # margins added as option
side.col.r = NULL, size=10, plot = TRUE, filt = 0.5)
{
rho <- mat.rho
diag(mat.rho) <- 0
if ((nrow(mat.rho) -1) < size)
{
warning(paste("There are less than", size, "samples"))
size <- nrow(mat.rho) -1
}
res <- as.data.frame(matrix(NA, ncol(mat.rho), size * 2))
rownames(res) <- colnames(mat.rho)
colnames(res) <- paste(c("Hit", "Hit_rho"), sort(c(1:size, 1:size)), sep = "_")
for (i in 1:nrow(mat.rho))
{
tmp <- sort(mat.rho[i, ], decreasing = T)
tmp <- tmp[1:size]
res[i, seq(1, size * 2, by = 2)] <- names(tmp)
res[i, seq(2, size * 2, by = 2)] <- round(tmp, 3)
sel <- colnames(res)[res[i, ] == 0]
sel <- gsub("rho_", "", sel)
res[i, sel] <- 0
}
N <- apply(mat.rho, 1, max)
N <- N > filt
mat.rho <- rho[N, N]
if (all(dim(mat.rho) != 0))
{
if (!is.null(side.col.c))
{
side.col.c <- side.col.c[N]
}
if (!is.null(side.col.r))
{
side.col.r <- side.col.r[N]
}
diag(mat.rho) <- NA
mat.dist <- as.dist(1 - mat.rho)
mat.hclust <- hclust(d = mat.dist, method = hclust.method)
diag(mat.rho) <- 1
if (plot)
{
if (is.null(side.col.c) & is.null(side.col.r))
{
heatmap.2(mat.rho, scale = "none", trace = "none",
Rowv = as.dendrogram(mat.hclust), Colv = as.dendrogram(mat.hclust),
margins = margins, cex.axis = 0.7)
}
else {
if (is.null(side.col.c))
{
if (length(side.col.r) != ncol(mat.rho))
{
warning("side.col.r must be a character vector of entry length")
}
heatmap.2(mat.rho, scale = "none",
trace = "none", Rowv = as.dendrogram(mat.hclust),
Colv = as.dendrogram(mat.hclust), margins = margins,
cex.axis = 0.7, RowSideColors = side.col.r)
}
else
{
if (is.null(side.col.r))
{
if (length(side.col.c) != ncol(mat.rho))
{
warning("side.col.c must be a character vector of entry length")
}
heatmap.2(mat.rho, scale = "none",
trace = "none", Rowv = as.dendrogram(mat.hclust),
Colv = as.dendrogram(mat.hclust), margins = margins,
cex.axis = 0.7, ColSideColors = side.col.c)
}
else
{
if (length(side.col.r) != ncol(mat.rho))
{
warning("side.col.r must be a character vector of entry length")
}
if (length(side.col.c) != ncol(mat.rho))
{
warning("side.col.c must be a character vector of entry length")
}
heatmap.2(mat.rho, scale = "none",
trace = "none", Rowv = as.dendrogram(mat.hclust),
Colv = as.dendrogram(mat.hclust), margins = margins,
cex.axis = 0.7, ColSideColors = side.col.c,
RowSideColors = side.col.r)
}
}
}
}
}
else
{
warning(paste("There are no related samples above the threshold", filt))
}
return(res)
}
#' \code{lmp}
#' @title lmp
#' @description This function will extract the p-value from a linear model object. It is used by phenoPairwiseRelations
#' @author Edi Prifti
#' @param modelobject : a linear model object as produced by lm()
#' @return a p-value
lmp <- function (modelobject) {
if (class(modelobject) != "lm") stop("Not an object of class 'lm' ")
f <- summary(modelobject)$fstatistic
p <- pf(f[1],f[2],f[3],lower.tail=F)
attributes(p) <- NULL
return(p)
}
#' \code{phenoPairwiseRelations}
#' @title phenoPairwiseRelations
#' @description This function will compute all the relations between different variables adapting different statistical tests as a
#' function of the data type. It will adjust the p-value matrix for multiple testing.
#' @author Edi Prifti
#' @param data : bioclinical data with variables on the rows and samples on the columns
#' @param adjust : method to adjust for multiple testing (default="BH)
#' @param verbose : default=FALSE. If TRUE information will be printed to follow the progression.
#' @return a list of two matrixes containing the p-values and the multiple testing adjustment.
phenoPairwiseRelations <- function(data, adjust="BH", verbose=FALSE){
#require(nortest)
#require(Hmisc)
data.relations <- matrix(NA, ncol(data), ncol(data)); colnames(data.relations) <- colnames(data); rownames(data.relations) <- colnames(data)
for(i in 1:ncol(data)){
if (verbose) print(paste(i,colnames(data)[i]))
for(j in 1:ncol(data)){
if (verbose) print(paste(j,colnames(data)[j]))
x <- data[,i]
y <- data[,j]
# test weather the variables are factors
# a) both factors
if(is.factor(x)){
if(is.factor(y)){
# perfom a chisquare
if(nrow(table(x,y))==1 | ncol(table(x,y))==1 | sum(table(x,y))==0){
# do nothing
}else{
data.relations[i,j] <- chisq.test(table(x,y))$p.value
}
}else{
# b) x is a factor but not y
# test for normality
if(nortest::lillie.test(y)$p.value>0.05){ #if normal
if(length(table(x))>1){
if(length(table(x))==2){
if(all(colSums(table(y,x))>1)){
data.relations[i,j] <- t.test(y~x)$p.value
}
}else{
if(sum(table(y,x))>1 & length(unique(x[!is.na(x)]))!=1 & sum(colSums(table(y,x))>0)>1){
tmp <- lmp(lm(y~x))
if(!is.null(tmp)) {
data.relations[i,j] <- tmp
}
}
}
}
}else{ # not normal
# run anova
if(length(table(x))>1){
if(length(table(x))==2){
# should test here the distribution of y if normal than t.test otherwise wilcox
if(all(colSums(table(y,x))>1)){
data.relations[i,j] <- wilcox.test(y~x)$p.value
}
}else{
if(sum(table(y,x))>1 & length(unique(x[!is.na(x)]))!=1 & sum(colSums(table(y,x))>0)>1){
tmp <- kruskal.test(y~x)
if(!is.null(tmp)) {
data.relations[i,j] <- tmp$p.value
}
}
}
}
}
}
}else{ # x is not a factor
if(is.factor(y)){ # but y is a factor
# test for normality
if(nortest::lillie.test(x)$p.value>0.05){ #if normal
if(length(table(y))>1){
if(length(table(y))==2){
if(all(colSums(table(x,y))>1)){
data.relations[i,j] <- t.test(x~y)$p.value
}
}else{
if(sum(table(x,y))>1 & length(unique(y[!is.na(y)]))!=1 & sum(colSums(table(x,y))>0)>1){
tmp <- lmp(lm(x~y))
if(!is.null(tmp)) {
data.relations[i,j] <- tmp
}
}
}
}
}else{ # non normal
if(length(table(y))>1){
if(length(table(y))==2){
if(all(colSums(table(x,y))>1)){
data.relations[i,j] <- wilcox.test(x~y)$p.value
}
}else{
if(sum(table(x,y))>1 & length(unique(y[!is.na(y)]))!=1 & sum(colSums(table(x,y))>0)>1){
tmp <- kruskal.test(x~y)
if(!is.null(tmp)) {
data.relations[i,j] <- tmp$p.value
}
}
}
}
}
}else{
# if both are numeric, run a correlation by testing for normality
if (length(x[!is.na(x)])>4 & length(x[!is.na(y)])>4){
if(nortest::lillie.test(x)$p.value<0.05 | nortest::lillie.test(y)$p.value<0.05){
# use spearman
data.relations[i,j] <- Hmisc::rcorr(x,y,type="spearman")$P[1,2]
}else{
# use pearson
data.relations[i,j] <- Hmisc::rcorr(x,y,type="pearson")$P[1,2]
}
}
}
}
}
}
# adjust for multiple testing
data.relations.q <-matrix(p.adjust(data.relations ,method=adjust), nrow=nrow(data.relations));
colnames(data.relations.q) <- colnames(data.relations); rownames(data.relations.q) <- rownames(data.relations)
return(list(p=data.relations,q=data.relations.q))
}
#' \code{extractSignificant}
#' @export
#' @title extractSignificant
#' @description This function will extract a list of vectors p- or q-values from an object produced by phenoPairwiseRelations.
#' @author Edi Prifti
#' @param relation.matrix : a matrix of p- produced by or q-values by phenoPairwiseRelations()
#' @param interest : a vector of variable names of interest.
#' @param threshold : default 0.05 needed to select significant relations
#' @return a list of vectors containing p-values or q-values along with the names of the variables.
extractSignificant <- function(relation.matrix, interest, threshold=0.05){
res <- list()
for(i in 1:length(interest)){
tmp.val <- relation.matrix[,interest[i]]
tmp <- (tmp.val < threshold)
tmp[is.na(tmp)] <- FALSE
res[[i]] <- tmp.val[tmp]
}
names(res) <- interest
return(res)
}
#' End of section and file
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.