#' Get available Omics Summarizing Methods
#'
#' Gives a vector of the available methods to summarize omics.
#'
#' @return character vector with the implemented methods.
#'
#' @export
availableOmicMethods <- function() {
return(c("summarizeToBinaryEvents",
"summarizeToNumberOfEvents",
"summarizeWithPca",
"summarizeInCluster",
"summarizeInClusterWithoutDictionary",
"summarizeToBinaryDirectionalEvents",
"summarizeToNumberOfDirectionalEvents"))
}
#' Summarize To Binary Events
#'
#' Given a matrix it summarize to a 0 or 1
#'
#' @param data a data matrix
#' @param features a vector with the features to analyze
#' @param name prefix of the covariates
#' @param binaryClassMin the minimum number of event to include the covariate
#' @param cliques the features organized in cliques. Only use for topology.
#'
#' @return NULL
#'
#' @export
summarizeToBinaryEvents <- function(data, features, name="bin",
binaryClassMin=10, cliques=NULL) {
if (is.null(data))
return(NULL)
genes <- intersect(row.names(data), features)
if (length(genes)==0)
return(NULL)
dataClique <- t(data[genes, , drop=F])
if (ncol(dataClique)==0)
return(NULL)
collapsed <- apply(dataClique>0, 1, any, na.rm=T)
if (sum(collapsed) < binaryClassMin | sum(collapsed) > NROW(dataClique)-binaryClassMin)
return(NULL)
collapsed <- data.frame(collapsed, row.names = names(collapsed), stringsAsFactors = F)
colnames(collapsed) <- name
list(x=collapsed, dataModule=t(dataClique), namesCov=name, method="binary", omicName=name, eventThr=1)
}
#' Summarize To Number of Binary Events
#'
#' Given a matrix it summarize to a 0 or 1
#'
#' @param data a data matrix
#' @param features a vector with the features to analyze
#' @param name prefix of the covariates
#' @param min_prop minimal proportion in classes
#' @param cliques the features organized in cliques. Only use for topology.
#'
#' @return NULL
#'
#' @export
summarizeToNumberOfEvents <- function(data, features, name="event", min_prop=0.1, cliques=NULL) {
if (is.null(data))
return(NULL)
genes <- intersect(row.names(data), features)
if (length(genes)==0)
return(NULL)
dataClique <- t(data[genes, , drop=F])
if (ncol(dataClique)==0)
return(NULL)
collapsed <- apply(dataClique>0, 1, sum, na.rm=T)
# min <- ceiling(NCOL(data)*0.01)
# if (sum(collapsed) < min | (sum(collapsed)) > NROW(dataClique)-min)
# return(NULL)
keep <- check_minimal_proportion(collapsed, min_prop=min_prop)
if (!keep)
return(NULL)
collapsed <- data.frame(collapsed, row.names = names(collapsed), stringsAsFactors = F)
colnames(collapsed) <- name
list(x=collapsed, dataModule=t(dataClique), namesCov=name, method="count", omicName=name,
eventThr = 1, min_prop=min_prop)
}
#' Summarize Using Cluster Analysis
#'
#' Given a matrix it summarize in classes
#'
#' The user can define a maximum of classes. The function
#' guess the optimal number of clusters using NbClust methods.
#'
#' @param data a data matrix
#' @param features a vector with the features to analyze
#' @param name prefix of the covariates
#' @param dictionary translate features (genes) into sets (row.names of the data)
#' @param max_cluster_number the maximum number of cluster to evaluate
#' @param cliques the features organized in cliques. Only use for topology
#'
#' @return NULL
#' @importFrom stats cutree dist hclust
#' @importFrom NbClust NbClust
#' @export
summarizeInCluster <- function(data, features, name="clu", dictionary=NULL, max_cluster_number=3, cliques=NULL) {
if (is.null(data) || (ncol(data)==0) || !(is.matrix(data)))
return(NULL)
if (is.null(dictionary)) {
genes <- intersect(rownames(data), features)
if (length(genes)==0)
return(NULL)
used <- genes
names(used) <- genes
datamatClique <- t(data[genes, ,drop=FALSE])
} else {
genes <- intersect(names(dictionary), features)
if (length(genes)==0)
return(NULL)
used <- dictionary[genes]
clusters <- unlist(dictionary[genes])
clusters <- intersect(clusters, row.names(data))
if (length(clusters)==0)
return(NULL)
datamatClique <- t(data[clusters, , drop=F])
}
if (ncol(datamatClique)==0)
return(NULL)
## CREATE CLUSTERS
if (TRUE){
covs <- createOptiomalClusterClasses(datamatClique, name, max_cluster_number = max_cluster_number)
} else {
covs <- createClusterClassesOld(datamatClique, name)
}
if (any(table(covs[[1]])<2)){
warning("Not meaningful class separation\n")
return(NULL)
}
collapse=covs
list(x=collapse, dataModule=t(datamatClique), namesCov=names(covs), cls=used, method="cluster", omicName=name)
}
createOptiomalClusterClasses <- function(datamatClique, name, max_cluster_number, index_method="silhouette") {
nb <- sinkNbClust(data=datamatClique, min.nc=2, max.nc=max_cluster_number, method="ward.D2", index=index_method)
covs <- data.frame(factor(nb$Best.partition), stringsAsFactors = T)
optimalCLusterNumber <- length(table(nb$Best.partition))
names(covs) <- paste0(name,optimalCLusterNumber, "k")
covs
}
sinkNbClust <- function(data, min.nc=2, max.nc=6, method="ward.D2", index="silhouette"){
if (index=="all")
sink(file=tempfile()); pdf(file=NULL)
nb <- NbClust(data=data, min.nc=min.nc, max.nc=max.nc, method=method, index=index)
if (index=="all")
sink(); dev.off()
return(nb)
}
createClusterClassesOld <- function(datamatClique, name){
warning("you are using an old way to evaluate cluster...\n
this function will be deprecated soon...")
md <- dist(datamatClique, method = "euclidean")
if (any(is.na(md)))
return(NULL)
hc <- hclust(md, method="ward.D2")
# clusters <- kmeans(datamatClique, centers=2) # provioamo a implementare anche il Kmeans?
if (ncol(datamatClique)<4){
covs <- data.frame(factor(cutree(hc, k = 2)), stringsAsFactors = T)
names(covs) <- paste0(name,"2k")
} else {
covs <- data.frame(factor(cutree(hc, k = 3)), stringsAsFactors = T)
names(covs) <- paste0(name,"3k")
}
return(covs)
}
#' Summarize Using Cluster Analysis with no dictionary to translate the matrix ids
#'
#' Given a quantitative matrix it clusters the samples
#'
#' @param data a data matrix
#' @param features a vector with the features to analyze
#' @param name prefix of the covariates
#' @param cliques the features organized in cliques. Only use for topology.
#'
#' @return NULL
#' @importFrom stats cutree dist hclust
#' @export
summarizeInClusterWithoutDictionary <- function(data, features, name="clu", cliques=NULL) {
warning("function summarizeInClusterWithoutDictionary has been deprecated...\n
use summarizeInCluster")
if (is.null(data) | (ncol(data)==0) | !(is.matrix(data)))
return(NULL)
genes <- intersect(rownames(data), features)
if (length(genes)==0)
return(NULL)
datamatClique <- t(data[genes, ,drop=FALSE])
used <- colnames(datamatClique)
names(used) <- colnames(datamatClique)
md <- dist(datamatClique, method = "euclidean")
if (any(is.na(md)))
return(NULL)
hc <- hclust(md, method="ward.D2")
# clusters <- kmeans(datamatClique, centers=2) # TO DO: add kmeans
if (ncol(datamatClique)<4){
covs <- data.frame(factor(cutree(hc, k = 2)), stringsAsFactors = T)
names(covs) <- paste0(name,"2k")
} else {
covs <- data.frame(factor(cutree(hc, k = 3)), stringsAsFactors = T)
names(covs) <- paste0(name,"3k")
}
if (any(table(covs[[1]])<2)){
# warning("Not meaningful class separation\n")
return(NULL)
}
list(x=covs, dataModule=t(datamatClique), namesCov=names(covs), cls=used, method="cluster", omicName=name)
}
#' Summarize Using PCA
#'
#' Given a matrix it summarize to a 0 or 1
#'
#' @param data a data matrix
#' @param features a vector with the features to analyze
#' @param name prefix of the covariates
#' @param shrink shirnk or not the covariance matrix.
#' @param method either "regular", "sparse" or "topological"
#' @param cliques the features organized in cliques. Only use for topology.
#' @param maxPCs maximum number of pcs to consider
#' @param loadThr loading threshold
#'
#' @return NULL
#' @importFrom stats sd
#' @importFrom houseOfClipUtility computePCs
#' @export
summarizeWithPca <- function(data, features, name="pca", shrink=FALSE, method="regular", cliques=NULL, maxPCs=3, loadThr=0.6) {
if (is.null(data))
return(NULL)
genes <- intersect(row.names(data), features)
if (length(genes)==0)
return(NULL)
dataClique <- t(data[genes, , drop=F])
if (ncol(dataClique)==0)
return(NULL)
if (NCOL(dataClique)!=1) {
pcs <- computePCs(dataClique, shrink=shrink, method=method, cliques=cliques, maxPCs=maxPCs)
colnames(pcs$x) <- paste0(name,colnames(pcs$x))
names(pcs$sdev) <- paste0(name,names(pcs$sdev))
colnames(pcs$loading) <- paste0(name,colnames(pcs$loading))
} else {
colnames(dataClique) <- paste0(name,"PC1")
pcs <- list(x=dataClique, sdev=sd(dataClique), loadings=1)
}
pcs$dataModule <- t(dataClique)
pcs$method="pca"
pcs$namesCov=colnames(pcs$x)
pcs$omicName=name
pcs
}
#' Summarize With Directed Sum
#'
#' Given a matrix it summarize the positive and negative in two vectors
#'
#' @param data a data matrix
#' @param features a vector with the features to analyze
#' @param name prefix of the covariates
#' @param eventThr the absolute value to threshold an event
#' @param min_prop minimal proportion in classes
#' @param cliques the features organized in cliques. Only use for topology.
#'
#' @return NULL
#'
#' @export
summarizeToNumberOfDirectionalEvents <- function(data, features, name="dCount", eventThr=2,
min_prop=0.1, cliques=NULL) {
if (is.null(data))
return(NULL)
genes <- intersect(row.names(data), features)
if (length(genes)==0)
return(NULL)
dataClique <- t(data[genes, , drop=F])
if (ncol(dataClique)==0)
return(NULL)
posDataClique <- extractPositivePortion(dataClique)
negDataClique <- extractPositivePortion(dataClique, invert=TRUE)
positive <- apply(posDataClique >= eventThr, 1, sum, na.rm=T)
negative <- apply(negDataClique >= eventThr, 1, sum, na.rm=T)
collapsed <- data.frame(positive=positive, negative=negative,
row.names = names(positive), stringsAsFactors = F)
colnames(collapsed) <- paste0(name, c("POS","NEG"))
# min <- ceiling(NCOL(data)*0.01)
# keep <- colSums(collapsed>0) >= min | colSums(collapsed>0) <= NROW(dataClique)-min
keep <- sapply(collapsed, check_minimal_proportion, min_prop=min_prop)
collapsed = collapsed[, keep, drop=F]
if (NCOL(collapsed) == 0)
return(NULL)
list(x=collapsed, dataModule=t(dataClique), namesCov=names(collapsed),
method="directedCount", omicName=name, eventThr=eventThr, min_prop=min_prop)
}
#' @importFrom stats quantile
check_minimal_proportion <- function(x, min_prop=0.1){
min <- quantile(x, probs=c(min_prop))
max <- quantile(x, probs=c(1-min_prop))
if ((min==min(x)) && (max==min(x)))
return(FALSE)
if ((min==max(x)) && (max==max(x)))
return(FALSE)
TRUE
}
#' Summarize To Binary Directional Events
#'
#' Given a matrix it summarize the positive and negative in two vectors
#'
#' @param data a data matrix
#' @param features a vector with the features to analyze
#' @param name prefix of the covariates
#' @param binaryClassMin the minimum number of event to include the covariate
#' @param eventThr the absolute value to threshold an event
#' @param cliques the features organized in cliques. Only use for topology.
#'
#' @return NULL
#'
#' @export
summarizeToBinaryDirectionalEvents <- function(data, features, name="dirBin", binaryClassMin=10,
eventThr=2, cliques=NULL) {
if (is.null(data))
return(NULL)
genes <- intersect(row.names(data), features)
if (length(genes)==0)
return(NULL)
dataClique <- t(data[genes, , drop=F])
if (ncol(dataClique)==0)
return(NULL)
posDataClique <- extractPositivePortion(dataClique)
negDataClique <- extractPositivePortion(dataClique, invert=TRUE)
positive <- apply(posDataClique >= eventThr, 1, any, na.rm=T)
negative <- apply(negDataClique >= eventThr, 1, any, na.rm=T)
collapsed <- data.frame(positive=positive, negative=negative,
row.names = names(positive), stringsAsFactors = F)
colnames(collapsed) <- paste0(name, c("POS","NEG"))
# if (sum(collapsed) < binaryClassMin | sum(collapsed) > NROW(dataClique)-binaryClassMin)
# return(NULL)
keep <- sapply(collapsed, sum) >= binaryClassMin | sapply(collapsed, sum) <= NROW(dataClique)-binaryClassMin
collapsed = collapsed[, keep, drop=F]
if (NCOL(collapsed) == 0)
return(NULL)
list(x=collapsed, dataModule=t(dataClique), namesCov=names(collapsed),
method="directedBinary", omicName=name, eventThr=eventThr)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.