#'@import ggplot2 mclust scales tools
NULL
#'EMMIXgene:
#'
#'Selects genes using the EMMIXgene algorithm, following the methodology of
#'G. J. McLachlan, R. W. Bean, D. Peel; A mixture model-based approach to the
#'clustering of microarray expression data , Bioinformatics, Volume 18,
#'Issue 3, 1 March 2002, Pages 413–422,
#'https://doi.org/10.1093/bioinformatics/18.3.413
#'
#'@section Functions:
#'\code{\link{select_genes}}: Selects the most differentially expressed genes.
#'
#'\code{\link{cluster_genes}}: Clusters the genes using a mixture model
#'approach.
#'
#'\code{\link{cluster_tissues}}: Clusters the tissues based on the differences
#'between the tissue samples among the gene groups.
#'
#'See \code{vignette('The-EMMIXgene-Workflow')} for more details.
#' @docType package
#' @name EMMIXgene
NULL
#'@title Selects genes using the EMMIXgene algorithm.
#'
#'@description Follows the gene selection methodology of
#'G. J. McLachlan, R. W. Bean, D. Peel; A mixture model-based approach to the
#'clustering of microarray expression data , Bioinformatics, Volume 18,
#'Issue 3, 1 March 2002, Pages 413–422,
#'https://doi.org/10.1093/bioinformatics/18.3.413
#'
#'@param dat A matrix or dataframe containing gene expression data.
#'Rows are genes and columns are samples. Must supply one of filename and dat.
#'@param filename Name of file containing gene data. Can be either .csv
#'or space separated .dat. Rows are genes and columns are samples.
#'Must supply one of filename and dat.
#'@param random_starts The number of random initializations used per gene when
#'fitting mixtures of t-distributions. Initialization uses k-means by default.
#'@param max_it The maximum number of iterations per mixture fit.
#'Default value is 100.
#'@param ll_thresh The difference in -2 log lambda used as a threshold
#'for selecting between g=1 and g=2 for each gene. Default value is 8,
#'which was chosen arbitrarily in the original paper.
#'@param min_clust_size The minimum number of observations per cluster
#'used when fitting mixtures of t-distributions for each gene.
#'Default value is 8.
#'@param tol Tolerance value used for detecting convergence of EMMIX fits.
#'@param start_method Default value is "both".
#'Can also choose "random" for purely random starts.
#'@param three Also test g=2 vs g=3 where appropriate. Defaults to FALSE.
#'@return An EMMIXgene object containing:
#'\item{stat}{The difference in log-likelihood for g=1
#'and g=2 for each gene (or for g=2 and g=3 where relevant).}
#'\item{g}{The selected number of components for each gene.}
#'\item{it}{The number of iterations for each genes selected fit.}
#'\item{selected}{An indicator for each genes selected status}
#'\item{ranks}{selected gene ids ranked by stat}
#'\item{genes}{A dataframe of selected genes.}
#'\item{all_genes}{Returns dat or contents of filename.}
#'
#'@examples
#'#only run on first 100 genes for speed
#'alon_sel <- select_genes(alon_data[seq_len(100), ])
#'
#'@export
select_genes<-function(dat, filename, random_starts=4, max_it = 100,
ll_thresh = 8, min_clust_size = 8, tol = 0.0001,
start_method ="both", three=FALSE){
#housekeeping
if(missing(filename)&missing(dat)){
stop("Must supply one of filename and dat.")
}
if(!missing(filename)&!missing(dat)){
stop("Must supply ONLY one of filename and dat.")
}
if(!missing(filename)&missing(dat)){
if(tools::file_ext(filename)=="dat"){
data<-as.matrix(utils::read.delim(filename, sep=" ", header=FALSE))}
if(tools::file_ext(filename)=="csv"){
data<-as.matrix(utils::read.csv(filename, header=FALSE))}
}
if(missing(filename)&!missing(dat)){
data<-as.matrix(dat)
}
if(any(!stats::complete.cases((data)))|
any(!stats::complete.cases(t(data)))){
warning("Incomplete cases removed removed.")
}
if(any(apply(data,1,stats::var)==0)){
warning("Some rows are constant across samples and have been removed.")
}
data<-data[!apply(data,1,stats::var)==0,]
#remove missing data
data<-data[stats::complete.cases((data)),]
data<-data[,stats::complete.cases(t(data))]
#actual method
#do emmix_gene C++ routine
a<-emmix_gene(data,random_starts, max_it, ll_thresh,
min_clust_size,tol,start_method, three)
a$selected<-a$g>1
#add selected genes to result
a$genes <- data[a$g>1,]
a$all_genes <- data
#ranks of selected
a$stat2<-a$stat
a$stat2[!a$selected]<-NA
a$ranks<-order(a$stat2, decreasing = TRUE, na.last = TRUE)
a$stat2 <- NULL
result<-structure(a, class="EMMIXgene")
#return selected genes
return(result)
}
#'Clusters genes using mixtures of normal distributions
#'
#'Sorts genes into clusters using mixtures of normal distributions with
#'covariance matrices restricted to be multiples of the identity matrix.
#'
#'@param gen an EMMIXgene object produced by select_genes().
#'@param g The desired number of gene clusters. If not specified will be
#'selected automatically on the basis of BIC.
#'@return An array containing the clustering.
#'@examples
#'
#'#only run on first 100 genes for speed
#'alon_sel <- select_genes(alon_data[seq_len(100), ])
#'alon_clust<- cluster_genes(alon_sel , 2)
#'@export
cluster_genes<-function(gen, g=NULL){
clust_genes_k<-tkmeans(as.matrix(gen$genes), k=g, 0.00001,
rep(1, ncol(gen$genes)), 10, 100, 0.001, FALSE)
clusters<-nearest_cluster(as.matrix(gen$genes), clust_genes_k)
#clust_genes<-mclust::Mclust((gen$genes), G=g, modelNames = "VII")
#g<-clust_genes$G
ll_rank_stat<-array(0,g)
for(i in seq_len(g)){
ll_rank_stat[i]<-
each_gene(colMeans(gen$genes[clusters==i, ,drop=FALSE]))$Ratio
}
#reorder clusters as ranked by mean of ll statistic from EMMIXgene fit
tmp<-factor(clusters)
levels(tmp) <- as.character(order(unlist(ll_rank_stat), decreasing = TRUE))
clust_genes<-as.numeric(as.character(tmp))
return(clust_genes)
}
#'Clusters tissues using all group means
#'
#'@param gen EMMIXgene object
#'@param clusters mclust object
#'@param q number of factors if using mfa
#'@param G number of components if using mfa
#'@return a clustering for each sample (columns) by each group(rows)
#'@examples
#'
#'example <- plot_single_gene(alon_data,1)
#'#only run on first 100 genes for speed
#'alon_sel <- select_genes(alon_data[seq_len(100), ])
#'alon_clust<- cluster_genes(alon_sel , 2)
#'\donttest{alon_tissue_all<-all_cluster_tissues(alon_sel, alon_clust, q=1, G=2)}
#'@export
all_cluster_tissues<-function(gen, clusters, q=6, G=2){
g<- length(table(clusters))
p<-ncol(gen$genes)
group_means<-array(0,c(g,p))
for(i in seq_len(g)){
group_means[i,] <- colMeans(gen$genes[clusters==i,,drop=FALSE])
}
mfa_fit<-mcfa(t(group_means), G, q, itmax=100, nkmeans=5, nrandom=5)
clustering<- as.numeric(predict.mcfa(mfa_fit, t(group_means))-1)
return(clustering)
}
#'Clusters tissues
#'
#'@param gen EMMIXgene object
#'@param clusters mclust object
#'@param method Method for separating tissue classes. Can be either 't' for a
#'univariate mixture of t-distributions on gene cluster means, or 'mfa'
#'for a mixture of factor analyzers.
#'@param q number of factors if using mfa
#'@param G number of components if using mfa
#'@return a clustering for each sample (columns) by each group(rows)
#'@examples
#'#only run on first 100 genes for speed
#'alon_sel <- select_genes(alon_data[seq_len(100), ])
#'alon_clust<- cluster_genes(alon_sel,2)
#'alon_tissue_t<-
#' cluster_tissues(alon_sel,alon_clust,method='t')
#'alon_tissue_mfa<-
#' cluster_tissues(alon_sel, alon_clust,method='mfa',q=2,G=2)
#'@export
cluster_tissues<-function(gen, clusters, method='t', q=6, G=2){
g<-length(table(clusters))
p<-ncol(gen$genes)
clustering<-array(0,c(g,p))
clustering2<-array(0,c(g,p))
if(method=='t'){
for(i in seq_len(g)){
group_means <- colMeans(gen$genes[clusters==i,,drop=FALSE])
t_fit<-emmix_t(group_means, G)
clustering[i,]<-t_fit$Clusters
}
}
if(method=='mfa'){
for(i in seq_len(g)){
group <- as.matrix((gen$genes[clusters==i,,drop=FALSE]))
#actually mixture of common factor analysers. consider fixing.
if(dim(group)[1]>20){
q1<-min(q, sum(clusters==i)-1 )
mfa_fit<-mcfa(t(group), G, q1, itmax=100,
nkmeans=50, nrandom=50)
clustering[i,]<- as.numeric(predict.mcfa(mfa_fit, t(group))-1)
}
}
}
return(clustering)
}
#'Cluster tissues
#'
#'@param gen An EMMIXgene object produced by select_genes().
#'@param n_top number of top genes (as ranked by likelihood) to be selected
#'@param method Method for separating tissue classes. Can be either 't' for a
#'univariate mixture of t-distributions on gene cluster means,
#'or 'mfa' for a mixture of factor analysers.
#'@param q number of factors if using mfa
#'@param g number of components if using mfa
#'@return An EMMIXgene object containing:
#'\item{stat}{A matrix containing clustering (0 or 1)
#'for each sample (columns) by each group(rows).}
#'\item{top_gene}{The row numbers of the top genes.}
#'\item{fit}{The fit object used to determine the clustering.}
#'@examples
#'\donttest{alon_sel <- select_genes(alon_data[seq_len(100), ])}
#'\donttest{alon_top_10<-top_genes_cluster_tissues(alon_sel, 10, method='mfa', q=3, g=2)}
#'@export
top_genes_cluster_tissues<-function(gen, n_top=100, method='mfa', q=2, g=2){
p<-ncol(gen$genes)
clustering<-array(0,p)
top_genes<-gen$ranks[seq_len(n_top)]
if(method=='t'){
group_means <- colMeans(gen$all_genes[top_genes,])
fit<-emmix_t(group_means, g)
clustering<-fit$Clusters
}
if(method=='mfa'){
group <- as.matrix((gen$all_genes[top_genes,]))
fit<-mcfa(t(group), g, q)
clustering<- predict.mcfa(fit, t(group))-1
}
return(list(clustering=clustering, top_genes=top_genes, fit=fit))
}
#'Heat maps
#'
#'Plot heat maps of gene expression data. Optionally sort the x-axis
#'according to a predetermined clustering.
#'
#'@param dat matrix of gene expression data.
#'@param clustering a vector of sample classifications.
#'Must be same length as the number of columns in dat.
#'@param y_lab optional label for y-axis.
#'@return A ggplot2 heat map.
#'@examples
#'
#'example <- heat_maps(alon_data[seq_len(100), ])
#'
#'
#'@export
heat_maps<-function(dat, clustering=NULL, y_lab=NULL){
colnames(dat) <- NULL
if(!is.null(clustering)){
dat<-dat[,order(clustering)]
}
df_heatmap<-reshape::melt(dat)
names(df_heatmap)<-c("genes", "samples", "expression_level")
df_heatmap$genes<-factor(df_heatmap$genes)
df_heatmap$samples<-factor(df_heatmap$samples)
plot<-ggplot(df_heatmap, aes(df_heatmap$samples,df_heatmap$genes ))+
geom_tile(aes(fill = df_heatmap$expression_level), color = "white")+
scale_fill_distiller(palette = "Spectral")+
xlab("Samples") + guides(fill=guide_legend(title="Expression Level"))+
theme(axis.text.y = element_blank(),
axis.text.x = element_text(size = 6),
axis.ticks.y=element_blank()
)
if(!is.null(clustering)){
plot<- plot + ylab(y_lab)
}else{
plot<- plot + ylab("Genes")
}
if(is.null(clustering)){
plot<- plot +
scale_x_discrete(breaks = seq(0, length(levels(df_heatmap$genes)), 10) )
}
if(!is.null(clustering)){
plot<- plot + scale_x_discrete(labels =order(clustering))
#+ theme(axis.text.x = element_blank())
}
return(plot)
}
#'Plot a single gene expression histogram with best
#'fitted mixture of t-distributions.
#'
#'Plot a single gene expression histogram with best
#'fitted mixture of t-distributions according to the EMMIX-gene algorithm.
#'
#'@param dat matrix of gene expression data.
#'@param gene_id row number of gene to be plotted.
#'@param g force number of components, default = NULL
#'@param random_starts The number of random initializations used
#'per gene when fitting mixtures of t-distributions.
#'Initialization uses k-means by default.
#'@param max_it The maximum number of iterations per mixture fit.
#'Default value is 100.
#'@param ll_thresh The difference in -2 log lambda used as a
#'threshold for selecting between g=1 and g=2 for each gene.
#'Default value is 8, which was chosen arbitrarily in the original paper.
#'@param min_clust_size The minimum number of observations per cluster
#'used when fitting mixtures of t-distributions for each gene.
#'Default value is 8.
#'@param tol Tolerance value used for detecting convergence of EMMIX fits.
#'@param start_method Default value is "both".
#'Can also choose "random" for purely random starts.
#'@param three Also test g=2 vs g=3 where appropriate. Defaults to TRUE.
#'@param min,max Minimum and maximum x-axis values for the plot window.
#'@return A ggplot2 histogram with fitted t-distributions overlayed.
#'@examples
#'
#'example <- plot_single_gene(alon_data,1)
#'#plot(example)
#'
#'@export
plot_single_gene<-function(dat, gene_id, g=NULL,
random_starts=8, max_it = 100,
ll_thresh = 8, min_clust_size = 8,
tol = 0.0001, start_method = "both",
three=TRUE, min = -4, max = 2){
df<-data.frame(x=dat[gene_id,])
n<-length(df$x)/4
breaks<-seq(min, max, length.out=n)
p<-ggplot(df, aes(x=df$x)) + geom_histogram(breaks=breaks, alpha=.5)
p2<-ggplot_build(p+theme_bw())
df_dens<-data.frame(x=p2$data[[1]]$x, y=p2$data[[1]]$density)
plot<-ggplot(df_dens)
plot<-plot+geom_bar(aes(x=df_dens$x, y=df_dens$y), alpha=.5,
stat="identity", position="identity")
plot<-plot+theme_bw()+xlab("Gene Expression Value")+ylab("Density")
df2<-data.frame(x=seq(min, max, length.out = 1000))
if(!is.null(g)){
res<-emmix_t(dat[gene_id,], g, random_starts, max_it, tol,start_method)
}else{
res<-each_gene(dat[gene_id,],random_starts, max_it, ll_thresh,
min_clust_size, tol,start_method,three)
}
for(i in seq_len(res$components)){
for(j in seq_len(nrow(df2))){
df2[[paste0('y',i)]][j]<-res$pi[i]*t_dist(df2$x[j], res$mu[i],
res$sigma[i], res$nu[i])
}
}
plot<-plot+geom_line(data=df2, aes(x=df2$x, y=df2$y1))
if(res$components>1){
plot<-plot+geom_line(data=df2, aes(x=df2$x, y=df2$y2))
}
if(res$components>2){
plot<-plot+geom_line(data=df2, aes(x=df2$x, y=df2$y3))
}
return(plot)
}
#'@title Normalized gene expression values from Alon et al. (1999).
#'
#'@description A dataset containing centred and normalized values of the
#'logged expression values of a subset of 2000 genes taken from
#'Alon, Uri, et al. "Broad patterns of gene expression revealed by clustering
#'analysis of tumor and normal colon tissues probed by oligonucleotide arrays."
#'Proceedings of the National Academy of Sciences 96.12 (1999): 6745-6750.
#'The method of subset selection was described in G. J. McLachlan, R. W. Bean,
#'D. Peel; A mixture model-based approach to the clustering of microarray
#'expression data ,
#'Bioinformatics, Volume 18, Issue 3, 1 March 2002, Pages 413–422.
#'
#'@docType data
#'@keywords datasets
#'@name alon_data
#'@usage data(alon_data)
#'@format A data frame with 2000 rows (genes) and 62 variables (samples).
#'@examples
#'dim(alon_data)
NULL
#'@title Normalized gene expression values from Golub et al. (1999).
#'
#'@description A dataset containing the centred and normalized values of the
#'logged expression values of a subset of 3731 genes taken from Golub,
#'Todd R., et al. "Molecular classification of cancer: class discovery
#'and class prediction by gene expression monitoring."
#'Science 286.5439 (1999): 531-537.
#'The method of subset selection was described in G. J. McLachlan, R. W. Bean,
#'D. Peel; A mixture model-based approach to the clustering of microarray
#'expression data ,
#'Bioinformatics, Volume 18, Issue 3, 1 March 2002, Pages 413–422.
#'
#'@docType data
#'@keywords datasets
#'@name golub_data
#'@usage data(golub_data)
#'@format A data frame with 3731 rows (genes) and 72 variables (samples).
#'#'@examples
#'dim(golub_data)
NULL
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.