#' BISEP_cv
#'
#' Input a matrix of continuous data with genes as rows and samples as columns.
#' Algorithm may take some considerable time to run (1 hour for 20,500 genes across 800 samples)
#'
#' Note: this is a slightly modified version of the BISEP algorithm from the
#' \code{\link[BiSEp]{BiSEp}} package: see the original \code{\link[BiSEp]{BISEP}} function documentation.
#' The modification allows the random seed to be set so that p-values generated by sampling stay the same
#' from run to run.
#'
#' @param data This should be a log2 gene expression matrix with genes as rownames and samples as
#' column names. Suitable for gene expression data from any platform - NGS datasets should be
#' RPKM or RSEM values.
#' @param seed_val A numeric. Default is NULL - seed not set.
#'
#' @author Mark Wappett, \email{m.a.wappett at googlemail.com}
#' @references \href{https://bmcgenomics.biomedcentral.com/articles/10.1186/s12864-016-2375-1}{Wappett et al BMC Genomics 2016}
#' @seealso \code{\link[BiSEp]{BiSEp}}
#'
#' @return A list containing three matrices. Matrix 1 contains the output of the BISEP algorithm -
#' including the midpoint of the bimodal distribution and the associated p value. Matrix 2
#' contains the output from the BI algorithm - including the delta, pi and BI values.
#' Matrix 3 contains the input matrix.
#' @export
#'
#' @examples
#' library(BiSEp)
#' data("INPUT_data")
#' bisep_out <- BISEP_cv(INPUT_data, seed_val = 12345)
#' bisep_out$BISEP
#' bisep_out$BI
BISEP_cv <- function(data=data, seed_val=NULL)
{
require(mclust)
# Deal with missing data
if(missing(data)) stop("Need to input expression data matrix")
# Deal with any negative values (< 1 min)
data2 <- apply(data, 1, function(x) {
w1 <- which(x[1:dim(data)[2]] < 0)
len1 <- length(w1)
vals1 <- runif(len1)
x[w1] <- vals1
x
})
data2 <- t(data2)
# Work out bimodality indexes of genes, and then rank based on this index (< 10 mins)
bimodalIndex <- function(dataset, verbose = TRUE)
{
# From Coombes,KR (2012) ClassDiscovery: Classes and methods for "class discovery" with microarrays or proteomics.
bim <- matrix(NA, nrow = nrow(dataset), ncol = 6)
if (verbose)
cat("1 ")
for (i in 1:nrow(dataset)) {
if (verbose && 0 == i%%100)
cat(".")
if (verbose && 0 == i%%1000)
cat(paste("\n", 1 + i/1000, " ", sep = ""))
x <- as.vector(as.matrix(dataset[i, ]))
if (any(is.na(x)))
next
mc <- mclust::Mclust(x, G = 2, modelNames = "E")
sigma <- sqrt(mc$parameters$variance$sigmasq)
delta <- abs(diff(mc$parameters$mean))/sigma
pi <- mc$parameters$pro[1]
bi <- delta * sqrt(pi * (1 - pi))
bim[i, ] <- c(mc$parameters$mean, sigma = sigma, delta = delta,
pi = pi, bim = bi)
}
if (verbose)
cat("\n")
dimnames(bim) <- list(rownames(dataset), c("mu1", "mu2",
"sigma", "delta", "pi", "BI"))
bim <- as.data.frame(bim)
bim
}
print("Calculating bimodal index.....")
biIndex <- bimodalIndex(data2)
# Set up bimodality in gene expression (BIG) function and run
indexing<-function(x.sort,test.bnd,alpha)
{
n<-length(x.sort);
low<-x.sort[1:test.bnd];
hgh<-x.sort[(test.bnd+1):n];
n1<-length(low);
n2<-length(hgh);
qr.low<-quantile(low,seq(0,1,0.01));
qr.hgh<-quantile(hgh,seq(0,1,0.01));
v1<-abs(qr.low[[75]][1]-qr.low[[25]][1])/1.34896;# statistics of low cluster
v2<-abs(qr.hgh[[75]][1]-qr.hgh[[25]][1])/1.34896;# statistics of high cluster
gap.bet.2.cls<-abs(max(low)-min(hgh));
dst.bet.2.cls<-abs(qr.low[[75]][1]-qr.hgh[[25]][1]);
het.std<-sqrt((n1*v1^2+n2*v2^2)*(1/n1+1/n2)/n);
rtv<-(alpha*gap.bet.2.cls+(1-alpha)*dst.bet.2.cls)/het.std;
return(rtv);
}
big<-function(x,alpha,top)
#get index for one gene
#x: expression vector of a gene
#alpha: weight on gap between two clusters
#top: stop rule for scanning
{
x.sort<-sort(x);
x.sort.or<-order(diff(x.sort),decreasing=T);#order of gaps
max.index<-0;#default index
my.boundary<-0;#default boundary
prediction<-c();
for(i in 1:top)
{
new.ind<-indexing(x.sort,x.sort.or[i],alpha);
new.bnd<-mean(c(x.sort[x.sort.or[i]],x.sort[x.sort.or[i]+1]));
prediction<-rbind(prediction,c(new.ind,new.bnd));
}
sel<-which(prediction[,1]==max(prediction[,1]))[1];#find the highest BIG index
return(list(prediction,c(prediction[sel,1],prediction[sel,2])));
}
Besag.Clifford<-function(BI,H,B,seed_val=NULL)#Beasg-Clifford algorithm for p value
#BI: bimodal index list
#H: rejection number (100: reasonable)
#B: random sampling times (1e5: reasonable)
#seed_val: seed value - NULL by default (not set so p-values won't be stable)
{
if (!is.null(seed_val)) {
set.seed(seed_val) ## ADDED BY PJC 20/3/16 TO STABILISE P-VALS
}
p<-c();
for(i in 1:length(BI))
{
bs<-c();
h<-0;
while(h<H & length(bs)<B)
{
bs<-c(bs,BI[sample(1:length(BI),1)]);
h<-sum(1*(bs>=BI[i]));
}
p<-c(p,h/length(bs));
}
return(p);
}
BIG<-function(dat)
#the subroutine to use BIG algorithm
#dat is a list
#X: expression matrix (logarithm or normalisation pre-processed)
#H: rejection number used by Beasg.Clifford algorithm
#B: random sampling times used by Besag.Clifford algorithm
#alpha: weight on gap between two clusters
#top: stop rule when scanning a gap vector
{
if(length(dat$X)==0)
{
print("no expression matrix!");
return();
} else {
X<-dat$X;
H<-100;
if(length(dat$H)>0)
H<-dat$H;
B<-10000;
if(length(dat$B)>0)
B<-dat$B;
alpha<-0.75;
if(length(dat$alpha)>0)
alpha<-dat$alpha;
top<-10;
if(length(dat$top)>0)
top<-dat$top;
BI<-c();
all<-c();
for(i in 1:nrow(X))
{
buf<-big(X[i,],alpha,top);
all<-c(all,buf);
BI<-rbind(BI,buf[[2]]);
}
p<-Besag.Clifford(BI[,1],H,B,seed_val);
model<-list(BI=BI[,1],BND=BI[,2],p=p);
return(model);
}
}
data<-function(fn)
{
X<-as.matrix(read.csv(fn,header=FALSE));
return(X);
}
# Run the big method (will take ~2 mins per 1,000 samples)
print("Calculating BISEP values.....")
big.model<-BIG(list(X=data2))
bmT <- as.data.frame(cbind(big.model[[2]], big.model[[3]]), stringsAsFactors=FALSE)
rownames(bmT) <- rownames(data2)
outList <- list(BISEP=bmT, BI=biIndex, DATA=data2)
return(outList)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.