knitr::opts_chunk$set(echo = TRUE) options(width=98)
Gene expression data is very important in experimental molecular biology [@brazma00], especially for cancer study [@fehrmann15]. The large-scale microarray data and RNA-seq data provide good opportunity to do the gene co-expression analyses and identify co-expressed gene modules; and the effective and efficient algorithms are needed to implement such analysis. Substantial efforts have been made in this field, such as @cheng00, Plaid [@lazzeroni02], Bayesian Biclustering [BCC, @gu08], among them Cheng and Church and Plaid has the R package implementation. It is worth noting that our in-house biclustering algorithm, QUBIC [@li09], is reviewed as one of the best programs in terms of their prediction performance on benchmark datasets. Most importantly, it is reviewed as the best one for large-scale real biological data [@eren12].
Until now, QUBIC has been cited over 200 times (via Google Scholar) and its web server, QServer, was developed in 2012 to facilitate the users without comprehensive computational background [@zhou12]. In the past five years, the cost of RNA-sequencing decreased dramatically, and the amount of gene expression data keeps increasing. Upon requests from users and collaborators, we developed this R package of QUBIC to void submitting large data to a webserver.
The unique features of our R package [@zhang16] include (1) updated and more stable back-end resource code (re-written by C++), which has better memory control and is more efficient than the one published in 2009. For an input dataset in Arabidopsis, with 25,698 genes and 208 samples, we observed more than 40% time saving; and (2) comprehensive functions and examples, including discretize function, heatmap drawing and network analysis.
citation("QUBIC")
If R is not your thing, there is also a C version of QUBIC
.
If you are having trouble with this R package, contact the maintainer, Yu Zhang.
Stable version from BioConductor
if (!requireNamespace("BiocManager", quietly = TRUE)) install.packages("BiocManager") BiocManager::install("QUBIC")
Or development version from GitHub
install.packages("devtools") devtools::install_github("zy26/QUBIC")
Load QUBIC
library("QUBIC")
There are nine functions provided by QUBIC package.
qudiscretize()
creates a discrete matrix for a given gene expression matrix;BCQU()
performs a qualitative biclustering for real matrix;BCQUD()
performs a qualitative biclustering for discretized matrix;quheatmap()
can draw heatmap for singe bicluster or overlapped biclusters;qunetwork()
can automatically create co-expression networks based on the identified biclusters by QUBIC;qunet2xml()
can convert the constructed co-expression networks into XGMML format for further network analysis in Cytoscape, Biomax and JNets; The following examples illustrate how these functions work.
library(QUBIC) set.seed(1) # Create a random matrix test <- matrix(rnorm(10000), 100, 100) colnames(test) <- paste("cond", 1:100, sep = "_") rownames(test) <- paste("gene", 1:100, sep = "_") # Discretization matrix1 <- test[1:7, 1:4] matrix1 matrix2 <- qudiscretize(matrix1) matrix2 # Fill bicluster blocks t1 <- runif(10, 0.8, 1) t2 <- runif(10, 0.8, 1) * (-1) t3 <- runif(10, 0.8, 1) * sample(c(-1, 1), 10, replace = TRUE) test[11:20, 11:20] <- t(rep(t1, 10) * rnorm(100, 3, 0.3)) test[31:40, 31:40] <- t(rep(t2, 10) * rnorm(100, 3, 0.3)) test[51:60, 51:60] <- t(rep(t3, 10) * rnorm(100, 3, 0.3)) # QUBIC res <- biclust::biclust(test, method = BCQU()) summary(res) # Show heatmap hmcols <- colorRampPalette(rev(c("#D73027", "#FC8D59", "#FEE090", "#FFFFBF", "#E0F3F8", "#91BFDB", "#4575B4")))(100) # Specify colors par(mar = c(4, 5, 3, 5) + 0.1) quheatmap(test, res, number = c(1, 3), col = hmcols, showlabel = TRUE)
library(QUBIC) data(BicatYeast) # Discretization matrix1 <- BicatYeast[1:7, 1:4] matrix1 matrix2 <- qudiscretize(matrix1) matrix2 # QUBIC x <- BicatYeast system.time(res <- biclust::biclust(x, method = BCQU())) summary(res)
We can draw heatmap for single bicluster.
# Draw heatmap for the second bicluster identified in Saccharomyces cerevisiae data library(RColorBrewer) paleta <- colorRampPalette(rev(brewer.pal(11, "RdYlBu")))(11) par(mar = c(5, 4, 3, 5) + 0.1, mgp = c(0, 1, 0), cex.lab = 1.1, cex.axis = 0.5, cex.main = 1.1) quheatmap(x, res, number = 2, showlabel = TRUE, col = paleta)
We can draw heatmap for overlapped biclusters.
# Draw for the second and third biclusters identified in Saccharomyces cerevisiae data par(mar = c(5, 5, 5, 5), cex.lab = 1.1, cex.axis = 0.5, cex.main = 1.1) paleta <- colorRampPalette(rev(brewer.pal(11, "RdYlBu")))(11) quheatmap(x, res, number = c(2, 3), showlabel = TRUE, col = paleta)
We can draw network for single bicluster.
# Construct the network for the second identified bicluster in Saccharomyces cerevisiae net <- qunetwork(x, res, number = 2, group = 2, method = "spearman") if (requireNamespace("qgraph", quietly = TRUE)) qgraph::qgraph(net[[1]], groups = net[[2]], layout = "spring", minimum = 0.6, color = cbind(rainbow(length(net[[2]]) - 1), "gray"), edge.label = FALSE)
We can also draw network for overlapped biclusters.
net <- qunetwork(x, res, number = c(2, 3), group = c(2, 3), method = "spearman") if (requireNamespace("qgraph", quietly = TRUE)) qgraph::qgraph(net[[1]], groups = net[[2]], layout = "spring", minimum = 0.6, legend.cex = 0.5, color = c("red", "blue", "gold", "gray"), edge.label = FALSE)
# Output overlapping heatmap XML, could be used in other software such # as Cytoscape, Biomax or JNets sink('tempnetworkresult.gr') qunet2xml(net, minimum = 0.6, color = cbind(rainbow(length(net[[2]]) - 1), "gray")) sink() # We can use Cytoscape, Biomax or JNets open file named 'tempnetworkresult.gr'
The Escherichia coli data consists of 4,297 genes and 466 conditions.
library(QUBIC) library(QUBICdata) data("ecoli", package = "QUBICdata") # Discretization matrix1 <- ecoli[1:7, 1:4] matrix1 matrix2 <- qudiscretize(matrix1) matrix2 # QUBIC res <- biclust::biclust(ecoli, method = BCQU(), r = 1, q = 0.06, c = 0.95, o = 100, f = 0.25, k = max(ncol(ecoli)%/%20, 2)) system.time(res <- biclust::biclust(ecoli, method = BCQU(), r = 1, q = 0.06, c = 0.95, o = 100, f = 0.25, k = max(ncol(ecoli)%/%20, 2))) summary(res)
# Draw heatmap for the 5th bicluster identified in Escherichia coli data library(RColorBrewer) paleta <- colorRampPalette(rev(brewer.pal(11, "RdYlBu")))(11) par(mar = c(5, 4, 3, 5) + 0.1, mgp = c(0, 1, 0), cex.lab = 1.1, cex.axis = 0.5, cex.main = 1.1) quheatmap(ecoli, res, number = 5, showlabel = TRUE, col = paleta)
library(RColorBrewer) paleta <- colorRampPalette(rev(brewer.pal(11, "RdYlBu")))(11) par(mar = c(5, 4, 3, 5), cex.lab = 1.1, cex.axis = 0.5, cex.main = 1.1) quheatmap(ecoli, res, number = c(4, 8), showlabel = TRUE, col = paleta)
# construct the network for the 5th identified bicluster in Escherichia coli data net <- qunetwork(ecoli, res, number = 5, group = 5, method = "spearman") if (requireNamespace("qgraph", quietly = TRUE)) qgraph::qgraph(net[[1]], groups = net[[2]], layout = "spring", minimum = 0.6, color = cbind(rainbow(length(net[[2]]) - 1), "gray"), edge.label = FALSE)
# construct the network for the 4th and 8th identified bicluster in Escherichia coli data net <- qunetwork(ecoli, res, number = c(4, 8), group = c(4, 8), method = "spearman") if (requireNamespace("qgraph", quietly = TRUE)) qgraph::qgraph(net[[1]], groups = net[[2]], legend.cex = 0.5, layout = "spring", minimum = 0.6, color = c("red", "blue", "gold", "gray"), edge.label = FALSE)
We can conduct a query-based biclustering by adding the weight parameter. In this example, the instance file "511145.protein.links.v10.txt" [@szklarczyk14] was downloaded from string (http://string-db.org/download/protein.links.v10/511145.protein.links.v10.txt.gz) and decompressed and saved in working directory.
# Here is an example to download and extract the weight library(igraph) url <- "http://string-db.org/download/protein.links.v10/511145.protein.links.v10.txt.gz" tmp <- tempfile() download.file(url, tmp) graph = read.graph(gzfile(tmp), format = "ncol") unlink(tmp) ecoli.weight <- get.adjacency(graph, attr = "weight")
library(QUBIC) library(QUBICdata) data("ecoli", package = "QUBICdata") data("ecoli.weight", package = "QUBICdata") res0 <- biclust(ecoli, method = BCQU(), verbose = FALSE) res0 res4 <- biclust(ecoli, method = BCQU(), weight = ecoli.weight, verbose = FALSE) res4
we can expand existing biclustering results to recruite more genes according to certain consistency level:
res5 <- biclust(x = ecoli, method = BCQU(), seedbicluster = res, f = 0.25, verbose = FALSE) summary(res5)
We can compare the biclustering results obtained from different algorithms, or from a same algorithm with different combinations of parameter.
test <- ecoli[1:50,] res6 <- biclust(test, method = BCQU(), verbose = FALSE) res7 <- biclust (test, method = BCCC()) res8 <- biclust(test, method = BCBimax()) showinfo (test, c(res6, res7, res8))
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.