#' Compute distance between peaks
#'
#' Given a query set of genome intervals, report distance to all others within window
#'
#' @param query GRanges object of intervals to query
#' @param windowSize with of window around each interval in bp
#'
#' @details A smaller window size will create a covariance matrix that is faster to evaluate, but larger windows give a better approximation and are less likely to have negative eigen value
#'
#' @return for all pairs of peaks within windowSize, report distance
#'
#' @examples
#' library(GenomicRanges)
#'
#' query = GRanges(rep('chr1', 5), IRanges(1:5, 1:5))
#'
#' getPeakDistances( query )
#'
#' @export
#' @import GenomicRanges
getPeakDistances = function( query, windowSize=10000 ){
# Drop empty chromsomes
chrNameCount = table(seqnames(query))
query = dropSeqlevels( query, names(chrNameCount[chrNameCount==0]))
# expand windows
windowRanges = query
start(windowRanges) = start(windowRanges) - windowSize
end(windowRanges) = end(windowRanges) + windowSize
# get overlaps between query and expanded windows
ovlaps = findOverlaps( windowRanges, query, ignore.strand=TRUE)
rm(windowRanges);
ovlapsDF = data.frame(ovlaps)
rm(ovlaps);
queryHits = subjectHits = NULL
# get only top triangle because matrix is symmatric
ovlapsDF = ovlapsDF[with(ovlapsDF, which(queryHits >= subjectHits)),]
# compute distances
v1 = start(query)[ovlapsDF$queryHits] - end(query)[ovlapsDF$subjectHits]
v2 = end(query)[ovlapsDF$queryHits] - start(query)[ovlapsDF$subjectHits]
v3 = start(query)[ovlapsDF$queryHits] - start(query)[ovlapsDF$subjectHits]
v4 = end(query)[ovlapsDF$queryHits] - end(query)[ovlapsDF$subjectHits]
# for each pair, save distances
ovlapsDF$distance = pmin(abs(v1), abs(v2), abs(v3), abs(v4))
rm(v1); rm(v2); rm(v3); rm(v4);
# ovlapsDF$name1 = query$names[ovlaps@from]
# ovlapsDF$name2 = query$names[ovlaps@to]
ovlapsDF$score1 = score(query)[ovlapsDF$queryHits]
ovlapsDF$score2 = score(query)[ovlapsDF$subjectHits]
attr(ovlapsDF, "N") = length(query)
ovlapsDF
}
#' Create correlation matrix
#'
#' Create correlation matrix based on correlation between pairs of peaks
#'
#' @param query GRanges object of intervals to query
#' @param regionQuant normalized quantifications of regions in query. Rows are features, like in limma
#' @param adjacentCount number of adjacent entries to compute correlation against
#' @param windowSize width of window in bp around each interval beyond which weight is zero
#' @param method 'adjacent': compute corr on fixed count sliding window define by adjacentCount. "distance": compute corr for entries within windowSize bp
#' @param method.corr specify which correlation method: "pearson" or "spearman"
#' @param quiet suppress messages
#' @param setNANtoZero replace NAN correlation values with a zero
#'
#' @return for peak i and j with distance d_{i,j},
#' M[i,j] = cor( vobj$E[i,], vobj$E[j,] )
#'
#' return sparse symmatric matrix
#'
#' @examples
#'
#' data('decorateData')
#'
#' C = createCorrelationMatrix(simLocation, simData)
#'
#' @export
#' @importFrom Matrix sparseMatrix
createCorrelationMatrix = function( query, regionQuant, adjacentCount=500, windowSize=1e6, method = "adjacent", method.corr = c("pearson", "spearman"), quiet=FALSE, setNANtoZero=FALSE){
method.corr = match.arg(method.corr)
if( nrow(regionQuant) != length(query) ){
stop("Number of rows in regionQuant must equal entries in query")
}
if( method == "adjacent"){
if( ! quiet ) message("adjacent: ", adjacentCount, 'entries\n')
# define pairs based on number of entries
start(query) = seq_len(length(query))
end(query) = seq_len(length(query))
df_peaksMap = getPeakDistances( query, windowSize=adjacentCount )
}else if( method == "distance"){
if( ! quiet ) message("distance: ", windowSize, 'bp\n')
# define pairs based on distance
df_peaksMap = getPeakDistances( query, windowSize=windowSize )
}else{
stop("method must be 'adjacent' or 'distance'")
}
corSubsetPairs( regionQuant, df_peaksMap$queryHits, df_peaksMap$subjectHits, method=method.corr, silent=quiet, setNANtoZero=setNANtoZero)
}
#' Run hierarchical clustering preserving order
#'
#' Run hierarchical clustering preserving sequential order of entries
#'
#' @param X data matrix were *rows* are features in sequential order
#' @param gr GenomicRanges object corresponding to rows in X
#' @param alpha mixture parameter weigting correlations (alpha=0) versus chromosome distances (alpha=1)
#'
#' @return hclust tree
#'
#' @details Use hclustgeo in ClustGeo package to generate hierarchical clustering that preserves sequential order.
#'
#' Chavent, et al. 2017. ClustGeo: an R package for hierarchical clustering with spatial constraints. arXiv:1707.03897v2 doi:10.1007/s00180-018-0791-1
#'
#' @examples
#' library(GenomicRanges)
#' library(EnsDb.Hsapiens.v86)
#'
#' # load data
#' data('decorateData')
#'
#' # load gene locations
#' ensdb = EnsDb.Hsapiens.v86
#'
#' # Evaluate hierarchical clustering
#' treeList = runOrderedClusteringGenome( simData, simLocation )
#'
#' # Choose cutoffs and return clusters
#' treeListClusters = createClusters( treeList )
#'
#' # Plot correlations and clusters in region defined by query
#' query = range(simLocation)
#'
#' plotDecorate( ensdb, treeList, treeListClusters, simLocation, query)
#'
#' @import vegan
#' @importFrom ClustGeo hclustgeo
#' @importFrom stats as.dist dist
#' @export
runOrderedClustering = function( X, gr, alpha=0.5 ){
if( is.null(gr) ){
# compute spatial restriction matrix for ordered clustering
N = nrow(X)
dorder = matrix(0,N,N)
rownames(dorder) = rownames(X)
colnames(dorder) = rownames(X)
for(k in seq(1,N-1)){
idx = seq(k+1, N)
dorder[idx,k] = seq(1, (N-k))
}
D2 = as.dist(dorder)
}else{
if( nrow(X) != length(gr) ){
stop("# rows of X must equal # of entries in gr: ", nrow(X), ' != ', length(gr))
}
# get squared distance between all pairs of intervals
D2 = as.dist( outer(gr,gr, "distance") )
}
# compute distance matrix for data
D1 = dist( X )
treeOriginal <- hclustgeo(D1, D2, alpha=alpha)
rm(D1)
rm(D2)
treeReOrder = vegan:::reorder.hclust(treeOriginal, match(treeOriginal$labels, rownames(X)))
rm(treeOriginal)
treeReOrder
}
#' Class epiclust
#'
#' Class \code{epiclust}
#'
#' @name epiclust-class
#' @rdname epiclust-class
#' @exportClass epiclust
setClass("epiclust", representation(clust="ANY", location="GRanges", adjacentCount="numeric", alpha="numeric", method="character", correlation="ANY"))
#' Class epiclustList
#'
#' Class \code{epiclustList}
#'
#' @name epiclustList-class
#' @rdname epiclustList-class
#' @exportClass epiclustList
setClass("epiclustList", representation('ANY'))
setMethod("print", "epiclustList", function( x ){
message("List of hierarchical clustering for ", length(x), " chromosomes:\n\n")
counts = lapply(x, function(x) length(x@clust$labels))
df = data.frame(chrom = names(counts), nFeatures = unlist(counts), stringsAsFactors=FALSE)
rownames(df) = c()
print(df, row.names=FALSE)
message("\n")
})
setMethod("show", "epiclustList", function( object ){
print( object )
})
setMethod("print", "epiclust", function( x ){
message("Hierarchical clustering for", length(x@clust$labels), "features\n\n")
})
setMethod("show", "epiclust", function( object ){
print( object )
})
#' Run hierarchical clustering preserving order
#'
#' Run hierarchical clustering preserving sequential order of entries
#'
#' @param X data matrix were *rows* are features in sequential order
#' @param gr GenomicRanges object with entries corresponding to the *rows* of X
#' @param method 'adjclust': adjacency constrained clustering. 'hclustgeo': incorporate data correlation and distance in bp
#' @param quiet suppress messages
#' @param alpha use by 'hclustgeo': mixture parameter weighing correlations (alpha=0) versus chromosome distances (alpha=1)
#' @param adjacentCount used by 'adjclust': number of adjacent entries to compute correlation against
#' @param setNANtoZero replace NAN correlation values with a zero
#' @param method.corr Specify type of correlation: "pearson", "kendall", "spearman"
#'
#' @return list hclust tree, one entry for each chromosome
#'
#' @details
#' Use adjacency constrained clustering from adjclust package:
#'
#'Alia Dehman, Christophe Ambroise and Pierre Neuvial. 2015. Performance of a blockwise approach in variable selection using linkage disequilibrium information. BMC Bioinformatics 16:148 doi.org:10.1186/s12859-015-0556-6
#'
#' Or, use hclustgeo in ClustGeo package to generate hierarchical clustering that roughly preserves sequential order.
#'
#' Chavent, et al. 2017. ClustGeo: an R package for hierarchical clustering with spatial constraints. arXiv:1707.03897v2 doi:10.1007/s00180-018-0791-1
#'
#' @examples
#' library(GenomicRanges)
#' library(EnsDb.Hsapiens.v86)
#'
#' # load data
#' data('decorateData')
#'
#' # load gene locations
#' ensdb = EnsDb.Hsapiens.v86
#'
#' # Evaluate hierarchical clustering
#' treeList = runOrderedClusteringGenome( simData, simLocation )
#'
#' # Choose cutoffs and return clusters
#' treeListClusters = createClusters( treeList )
#'
#' # Plot correlations and clusters in region defined by query
#' query = range(simLocation)
#'
#' plotDecorate( ensdb, treeList, treeListClusters, simLocation, query)
#'
#' @importFrom rlist list.reverse
#' @importFrom GenomicRanges seqnames
#' @importFrom adjclust adjClust
#' @importFrom methods new
#' @importFrom stats cor
# @importFrom utils assignInNamespace
#' @export
runOrderedClusteringGenome = function( X, gr, method = c("adjclust", 'hclustgeo'), quiet=FALSE, alpha=0.5, adjacentCount=500, setNANtoZero=FALSE, method.corr = c("pearson", "spearman")){
if( nrow(X) != length(gr) ){
stop("# rows of X must equal # of entries in gr: ", nrow(X), ' != ', length(gr))
}
if( is.null(rownames(X)) ){
stop("Data matrix X must have rownames that match names(gr)")
}
if( !all(rownames(X) %in% names(gr)) ){
stop("All rownames of X must be found in names of gr so that all(rownames(X) %in% names(gr)) is TRUE ")
}
method <- match.arg(method)
method.corr <- match.arg(method.corr)
if( alpha < 0 || alpha > 1 ){
stop("Must specify agrument: alpha must be between 0 and 1")
}
# Drop empty chromsomes
chrNameCount = table(seqnames(gr))
gr = dropSeqlevels( gr, names(chrNameCount[chrNameCount==0]))
seqNameCounts = table(seqnames(gr))
seqNameCounts = seqNameCounts[seqNameCounts>0]
# chromosomes must have >- 2 entries
tooFew = seqNameCounts[seqNameCounts<=2]
if( length(tooFew) > 0){
stop("Must have >=2 entires on a chromosome in order to cluster.\nThe following have too few entries: ", paste(names(tooFew), collapse=', '), "\nRemove these chromosomes before analysis")
}
# run last (i.e. smallest) chrom first
chromArray = as.character(seqnames(gr)@values)
# run analysis
##############
# remove costly sanity check in adjclust
# f = function(mat){
# NULL
# }
# assignInNamespace(x="checkCondition", value = f, ns='adjclust')
# for each chromosome
treePerChrom = lapply( chromArray, function(chrom){
if( !quiet ) message("\rEvaluating:", chrom, ' ')
idx = which(array(GenomicRanges::seqnames(gr) == chrom))
if(method == "adjclust"){
C = createCorrelationMatrix( gr[idx], X[idx,], adjacentCount=adjacentCount, quiet=TRUE, setNANtoZero=setNANtoZero, method.corr=method.corr)
# if a feature has no variance, the correlation with it is NaN
# instead set it to zero
# if( any(is.nan(C@x)) ){
# C@x[is.nan(C@x)] = 0
# diag(C) = 1
# }
h = min( adjacentCount, nrow(C)-1)
fitClust = suppressMessages(adjClust( C^2, "similarity", h=h, strictCheck=FALSE))
# fitClust = as.hclust(fitClust)
res = new("epiclust", clust = fitClust, location=gr[idx], adjacentCount=adjacentCount, alpha=0, method=method, correlation=C)
}else{
if( method.corr != "pearson"){
stop(paste("Correlation ", method.corr, "not currently implemented for", method))
}
# compute tree on this chromosome
fitClust = runOrderedClustering( X[idx,], gr[idx], alpha=alpha)
C = cor(t(X[idx,]))
res = new("epiclust", clust = fitClust, location=gr[idx], adjacentCount=adjacentCount, alpha=alpha, method=method, correlation=C)
}
rm(C)
rm(fitClust)
gc()
res
})
message('\n')
names(treePerChrom) = chromArray
new('epiclustList', treePerChrom )
}
#' Extract subset of data points
#'
#' Extract subset of data points
#'
#' @param fit epiclust or epiclustList object
#' @param query GenomicRanges object of which data points to retain
#'
#' @return epiclust or epiclustList object
#'
#' @examples
#' library(GenomicRanges)
#'
#' data('decorateData')
#'
#' # Evaluate hierarchical clustering
#' treeList = runOrderedClusteringGenome( simData, simLocation )
#'
#' # extract subset of data after clustering
#' res = getSubset( treeList, simLocation[1:10])
#'
#'
#' @importFrom GenomicRanges GRanges
#' @export
#' @docType methods
#' @rdname getSubset-methods
setGeneric("getSubset", function(fit, query) standardGeneric("getSubset"))
#' @importFrom GenomicRanges GRanges findOverlaps
#' @export
#' @rdname getSubset-methods
#' @aliases getSubset,epiclust,GRanges-method
setMethod("getSubset", c("epiclust", "GRanges"),
function(fit, query) {
# idx = NULL
# v = match(seqnames(fit@location), seqnames(query))@values
# if( any(!is.na(v)) > 0 ){
# idx = which((seqnames(fit@location)@values %in% seqnames(query)@values) & (fit@location %within% query))
# }
ovlp = suppressWarnings(findOverlaps(fit@location, query))
idx = ovlp@from
if( length(idx) == 0){
fitNew = NULL
}else{
fitNew = fit
N = length(idx)
if( N == 0 ){
stop("Cannot select empty subset of entries")
}
idx2 = idx[seq_len(length(idx)-1)]
fitNew@clust$merge = fitNew@clust$merge[idx2,]
fitNew@clust$height = fitNew@clust$height[idx2]
fitNew@clust$order = fitNew@clust$order[idx]
fitNew@clust$labels = fitNew@clust$labels[idx]
fitNew@location = fitNew@location[idx]
fitNew@correlation = fitNew@correlation[idx,idx]
}
fitNew
}
)
#' @importFrom GenomicRanges GRanges
#' @export
#' @rdname getSubset-methods
#' @aliases getSubset,epiclustList,GRanges-method
setMethod("getSubset", c("epiclustList", "GRanges"),
function(fit, query){
res = lapply(fit, function(x) getSubset(x, query))
Filter(Negate(is.null), res)
}
)
#' Create cluster from list of hclust objects
#'
#' Create cluster from list of hclust objects
#'
#' @param treeList list of hclust objects
#' @param method 'capushe': slope heuristic. 'bstick': broken stick. 'meanClusterSize': create clusters based on target mean value.
#' @param meanClusterSize select target mean cluster size. Can be an array of values
#' @param pct minimum percentage of points for the plateau selection in capushe selection. Can be an array of values
#'
#' @return Convert hierarchical clustering into discrete clusters based on selection criteria method
#'
#' @examples
#' library(GenomicRanges)
#' library(EnsDb.Hsapiens.v86)
#'
#' # load data
#' data('decorateData')
#'
#' # load gene locations
#' ensdb = EnsDb.Hsapiens.v86
#'
#' # Evaluate hierarchical clustering
#' treeList = runOrderedClusteringGenome( simData, simLocation )
#'
#' # Choose cutoffs and return clusters
#' treeListClusters = createClusters( treeList )
#'
#' # Plot correlations and clusters in region defined by query
#' query = range(simLocation)
#'
#' plotDecorate( ensdb, treeList, treeListClusters, simLocation, query)
#'
#' @import adjclust
#' @importFrom stats cutree
#' @export
createClusters = function(treeList, method = c("capushe", "bstick", "meanClusterSize"), meanClusterSize=50, pct=0.15) {
method <- match.arg(method)
message("Method:", method, '\n')
if( method == "meanClusterSize"){
# loop over values in meanClusterSize
res = lapply( meanClusterSize, function(mcs){
# get counts on each chromosome
n_features = vapply(treeList, function(x){
length(x@clust$order)
}, FUN.VALUE=numeric(1))
# combine counts across chromosomes
n_features_total = sum(n_features)
n_total_clusters = round(n_features_total / mcs)
# cut tree to get average of meanClusterSize
res = lapply(treeList, function(x){
N = length(x@clust$order)
frac = N / n_features_total
n_clust = round( frac*n_total_clusters )
# message(paste("\r", mcs, n_clust))
cutree(x@clust, k=max(n_clust, 1))
})
names(res) = names(treeList)
new('epiclustDiscreteList',res)
})
names(res) = meanClusterSize
}else{
if( treeList[[1]]@method == 'hclustgeo' ){
stop("Method '", method, "' is only compatible with result of\nrunOrderedClusteringGenome(...,method ='adjclust')")
}
res = lapply(pct, function(val){
res = lapply(treeList, function(x){
select( x@clust, type=method, pct=val)
})
names(res) = names(treeList)
new('epiclustDiscreteList',res)
})
names(res) = pct
}
new('epiclustDiscreteListContain',res)
}
# setClass("epiclustDiscrete", representation("array"))
# setMethod("print", "epiclustDiscrete", function( x ){
# nFeatures = length(x)
# nClusters = length(unique(x))
# message(nFeatures, "features in", nClusters, "discrete clusters\n\n" )
# })
# setMethod("show", "epiclustDiscrete", function( object ){
# print( object )
# })
#' Class epiclustDiscreteList
#'
#' Class \code{epiclustDiscreteList}
#'
#' @name epiclustDiscreteList-class
#' @rdname epiclustDiscreteList-class
#' @exportClass epiclustDiscreteList
setClass("epiclustDiscreteList", representation('list'))
setMethod("print", "epiclustDiscreteList", function( x ){
for( chrom in names(x) ){
nFeatures = length(x[[chrom]])
nClusters = length(unique(x[[chrom]]))
message(paste0(chrom, ':'),nFeatures, " features in ", nClusters, " discrete clusters\n\n" )
}
})
setMethod("show", "epiclustDiscreteList", function( object ){
print( object )
})
#' Class epiclustDiscreteListContain
#'
#' Class \code{epiclustDiscreteListContain}: is a list containing epiclustDiscreteList objects
#'
#' @name epiclustDiscreteListContain-class
#' @rdname epiclustDiscreteListContain-class
#' @exportClass epiclustDiscreteListContain
setClass("epiclustDiscreteListContain", representation('list'))
setMethod("print", "epiclustDiscreteListContain", function( x ){
message("Clusters defined using ", length(x), " parameter values: ", paste(names(x), collapse=', '), '\n\n' )
for( id in names(x) ){
message("parameter: ", id, '\n')
print(x[[id]])
}
})
setMethod("show", "epiclustDiscreteListContain", function( object ){
print( object )
})
#' Simulated data to show correlation clustering
#'
#' @docType data
#' @keywords decorateData
#' @name decorateData
#' @usage data(decorateData)
"simData"
#' Simulated data to show correlation clustering. GRanges object indicating genomic position of data in rows of simData
#'
#' @docType data
#' @keywords decorateData
#' @name decorateData
#' @usage data(decorateData)
"simLocation"
#' Simulated disease status for differential analysis
#'
#' @docType data
#' @keywords decorateData
#' @name decorateData
#' @usage data(decorateData)
"metadata"
#' Find which cluster a peak is in
#'
#' Find which cluster a peak is in
#'
#' @param treeListClusters from createClusters()
#' @param feature_id name of query feature, can also be array
#' @param id clustering parameter identifier. After filtering by feature_id, filter by id
#'
#' @return data.frame of chromosome and cluster
#'
#' @examples
#' library(GenomicRanges)
#'
#' data('decorateData')
#'
#' # Evaluate hierarchical clustering
#' treeList = runOrderedClusteringGenome( simData, simLocation )
#'
#' # Choose cutoffs and return clusters
#' treeListClusters = createClusters( treeList, method='meanClusterSize', meanClusterSize = c(50, 100) )
#'
#' # Find chromsome and cluster of peak_20
#' whichCluster( treeListClusters, 'peak_20')
#'
#' # Find chromsome and cluster of peak_20 with clustering parameter 50
#' # corresponding to meanClusterSize
#' whichCluster( treeListClusters, 'peak_20', "50")
#'
#' # Search for multiple clusters
#' whichCluster( treeListClusters, c('peak_20', 'peak_21'), "50")
#'
#' @export
whichCluster = function(treeListClusters, feature_id, id=NULL){
# find cluster based on anem
res = lapply( names(treeListClusters), function(msc){
res = lapply(names(treeListClusters[[msc]]), function(chrom){
x = treeListClusters[[msc]][[chrom]]
idx = match(feature_id, names(x))
clust = x[idx]
# if( is.na(clust)){
# clust = NA
# clust[is.na(clust)] = NA
# }
names(clust) = c()
data.frame( id=msc, chrom, feature_id, cluster=clust, stringsAsFactors=FALSE)
})
do.call('rbind', res)
})
res = do.call('rbind', res)
res = res[!is.na(res$cluster),]
# if specified, filter by ID
if(!is.null(id) ){
id = unique(id)
res = res[res$id%in%as.character(id),]
}
res
}
#' Get feature names in selected cluster
#'
#' Get feature names in selected cluster given chrom and clusterid
#'
#' @param treeListClusters from createClusters()
#' @param chrom chromosome name of cluster
#' @param clustID cluster identifier
#' @param id clustering parameter identifier
#'
#' @return array of feature names
#'
#' @examples
#' library(GenomicRanges)
#'
#' data('decorateData')
#'
#' # Evaluate hierarchical clustering
#' treeList = runOrderedClusteringGenome( simData, simLocation )
#'
#' # Choose cutoffs and return clusters
#' treeListClusters = createClusters( treeList, method='meanClusterSize', meanClusterSize = 50 )
#'
#' # Find chromsome and cluster of peak_204
#' getFeaturesInCluster( treeListClusters, "chr20", 3, "50")
#'
#' @export
getFeaturesInCluster = function( treeListClusters, chrom, clustID, id){
idx = which( treeListClusters[[id]][[chrom]] == clustID )
clsts = treeListClusters[[id]][[chrom]][idx]
names(clsts)
}
#' Get feature names in selected cluster
#'
#' Get feature names in selected cluster given array of chrom and cluster ids
#'
#' @param treeListClusters from createClusters()
#' @param chrom chromosome name of cluster
#' @param clustID cluster identifier
#' @param id clustering parameter identifier
#'
#' @return list of array of feature names. Query with index i as returned in list index i
#'
#' @examples
#' library(GenomicRanges)
#'
#' data('decorateData')
#'
#' # Evaluate hierarchical clustering
#' treeList = runOrderedClusteringGenome( simData, simLocation )
#'
#' # Choose cutoffs and return clusters
#' treeListClusters = createClusters( treeList, method='meanClusterSize', meanClusterSize = 50 )
#'
#' df_cluster = data.frame( chrom = c("chr20", "chr20"),
#' cluster = c(3,5),
#' id = c("50", "50"), stringsAsFactors=FALSE )
#'
#' # Find features for the clusters in df_cluster
#' getFeaturesInClusterList( treeListClusters, chrom=df_cluster$chrom, clustID=df_cluster$cluster, id=df_cluster$id)
#'
#' @export
getFeaturesInClusterList = function( treeListClusters, chrom, clustID, id){
if( length(unique(c(length(chrom), length(clustID), length(id)))) != 1){
stop("Number of entries in chrom, clustID, id must be equal")
}
res = sapply( seq_len(length(chrom)), function(i){
idx = which( treeListClusters[[id[i]]][[chrom[i]]] == clustID[i] )
clsts = treeListClusters[[id[i]]][[chrom[i]]][idx]
names(clsts)
})
res
}
#' Compute scores for each cluster
#'
#' For each cluster compute summary statistics for the cluster to measure how strong the correlation structure is. Clusters with weak correlation structure can be dropped from downstream analysis.
#'
#' @param treeList list of hclust objects
#' @param treeListClusters from createClusters()
#' @param BPPARAM parameters for parallel evaluation
#'
#' @details For each cluster, extract the correlation matrix and return the mean absolute correlation; the 75th, 90th and 95th quantile absolute correlation, and LEF, the leading eigen-value fraction which is the fraction of variance explained by the leading eigen value of the matrix abs(C).
#'
#' @return for all pairs of peaks within windowSize, report distance
#'
#' @examples
#' library(GenomicRanges)
#' library(BiocParallel)
#'
#' data('decorateData')
#'
#' # Evaluate hierarchical clustering
#' treeList = runOrderedClusteringGenome( simData, simLocation )
#'
#' # Choose cutoffs and return clusters
#' treeListClusters = createClusters( treeList )
#'
#' # Evaluate score for each cluster
#' clstScore = scoreClusters(treeList, treeListClusters, BPPARAM = SerialParam() )
#'
#' @export
#' @importFrom stats quantile
#' @importFrom progress progress_bar
#' @import BiocParallel
# @importFrom irlba partial_eigen
scoreClusters = function(treeList, treeListClusters, BPPARAM=SerialParam()){
# get data.frame of cluster names per chromosomes
# clCount = countClusters( treeListClusters )
clCount = lapply(treeListClusters, countClusters)
df_count = lapply(names(clCount), function(id){
lapply( seq_len(length(clCount[[id]])), function(i){
x = clCount[[id]][i]
data.frame(chrom=rep(names(x), x), cluster=seq(x), meanClusterSize=as.numeric(id), stringsAsFactors=FALSE)
})
})
df_count = do.call("rbind", unlist(df_count, recursive=FALSE))
# evaluate for each cluster
.score_clusts = function( obj ) {
batch = clustID = peakID = NA
values = unique(obj$treeListClusters)
df_values = data.frame(peakID = names(obj$treeListClusters), clustID = obj$treeListClusters, stringsAsFactors=FALSE)
df_values = data.table::data.table(df_values)
n_batches = 50
df_values[,batch:=round(clustID / n_batches)]
Clist = lapply( unique(df_values$batch), function(btch){
peakIDs = df_values[batch==btch,peakID]
as.matrix(obj$treeList@correlation[peakIDs,peakIDs])
})
names(Clist) = unique(df_values$batch)
res = lapply( values, function( clustID ){
peakIDs = names(obj$treeListClusters[obj$treeListClusters==clustID])
N = length(peakIDs)
cor_values = 1
LEF = 1
if( N > 1){
# Get correlation matrix
# C = as.matrix(obj$treeList@correlation[peakIDs,peakIDs])
batch = df_values[peakID%in%peakIDs,as.character(unique(batch))]
C = Clist[[batch]][peakIDs,peakIDs]
# Compute lead eigen-value fraction using full eigen spectrum
# eigen analysis
dcmp = eigen(abs(C), symmetric=TRUE, only.values = TRUE)
LEF = dcmp$values[1] / sum(dcmp$values)
cor_values = C[lower.tri(C)]
cor_values = cor_values[is.finite(cor_values)]
}
# compute statistics
data.frame( id=obj$id, chrom = obj$chrom,
cluster = clustID, N=N,
mean_abs_corr = mean(abs(cor_values)),
quantile75 = quantile(abs(cor_values), .75),
quantile90 = quantile(abs(cor_values), .90),
quantile95 = quantile(abs(cor_values), .95),
LEF=LEF, stringsAsFactors=FALSE)
})
do.call("rbind", res)
}
message("Evaluating strength of each cluster...\n\n")
it = corrIterBatch( treeList, treeListClusters, df_count )
message(paste0("Dividing work into ",attr(it, "n_chunks")," chunks...\n"))
res = bpiterate( it, .score_clusts, BPPARAM=BPPARAM)
res = do.call("rbind", res)
rownames(res) = c()
mscArray = unique(df_count$meanClusterSize)
clustScoresList = lapply( mscArray, function(msc){
res[res$id==msc,]
})
names(clustScoresList) = mscArray
clustScoresList
}
#' Extract subset of clusters
#'
#' Extract subset of clusters based on entries in chroms and clusters
#'
#' @param treeListClusters from createClusters()
#' @param clustInclude data.frame from retainClusters() indcating which clusters to include
#'
#' @return epiclustDiscreteList of specified clusters
#'
#' @examples
#' library(GenomicRanges)
#' library(BiocParallel)
#'
#' data('decorateData')
#'
#' # Evaluate hierarchical clustering
#' treeList = runOrderedClusteringGenome( simData, simLocation )
#'
#' # Choose cutoffs and return clusters
#' treeListClusters = createClusters( treeList )
#'
#' # Evaluate score for each cluster
#' clstScore = scoreClusters(treeList, treeListClusters, BPPARAM=SerialParam() )
#'
#' # Retain clusters that pass this criteria
#' clustInclude = retainClusters( clstScore, "LEF", 0.30 )
#'
#' # get retained clusters
#' treeListClusters_filter = filterClusters( treeListClusters, clustInclude)
#'
#' @export
filterClusters = function( treeListClusters, clustInclude){
res = lapply( names(treeListClusters), function(msc){
tlc = treeListClusters[[as.character(msc)]]
resList = lapply( names(tlc), function(chrom){
idx = which(chrom == clustInclude$chrom & msc == clustInclude$id)
idx2 = tlc[[chrom]] %in% clustInclude$cluster[idx]
tlc[[chrom]][idx2]
})
names(resList) = names(tlc)
new('epiclustDiscreteList', resList)
})
names(res) = names(treeListClusters)
new('epiclustDiscreteListContain',res)
}
#' Retain clusters by applying filter
#'
#' Retain clusters by applying filter
#'
#' @param clstScore score each cluster using scoreClusters()
#' @param metric column of clstScore to use in filtering
#' @param cutoff retain cluster than exceed the cutoff for metric. Can be array with one entry per entry in clstScore
#'
#' @return data.frame of chrom, clutser, id (the clustering parameter value), and the specified metric
#'
#' @examples
#' library(GenomicRanges)
#' library(BiocParallel)
#'
#' data('decorateData')
#'
#' # Evaluate hierarchical clustering
#' treeList = runOrderedClusteringGenome( simData, simLocation )
#'
#' # Choose cutoffs and return clusters
#' treeListClusters = createClusters( treeList )
#'
#' # Evaluate score for each cluster
#' clstScore = scoreClusters(treeList, treeListClusters, BPPARAM = SerialParam() )
#'
#' # Retain clusters that pass this criteria
#' clustInclude = retainClusters( clstScore, "LEF", 0.30 )
#'
#' # Or filter by mean absolute correlation
#' # clustInclude = retainClusters( clstScore, "mean_abs_corr", 0.1 )
#'
#' # get retained clusters
#' treeListClusters_filter = filterClusters( treeListClusters, clustInclude )
#'
#' @export
retainClusters = function(clstScore, metric="LEF", cutoff = 0.40){
if( length(cutoff) == 0){
stop("Must specify cutoff")
}
if( (length(cutoff) > 1) & (length(cutoff) != length(clstScore)) ){
stop( paste("Array values specified for cutoff, but number of entries must match\nnumber of entries in clstScore:", length(clstScore)))
}
# if cutoff is a scalar, repeated it once for each entry in clstScore
if( length(cutoff) == 1 ){
cutoff = rep(cutoff, length(clstScore) )
}
names(cutoff) = names(clstScore)
message("Using cutoffs:\n")
message("Cluster set\tcutoff\n")
for(i in seq_len(length(clstScore))){
message(paste0(" ", names(clstScore)[i], "\t\t", format(cutoff[i], digits=3), '\n'))
}
message("\n")
clstScoreDF = do.call("rbind", clstScore)
if( ! metric %in% colnames(clstScoreDF) ){
cols = colnames(clstScoreDF)[!colnames(clstScoreDF) %in% c("id", "chrom", "cluster")]
stop("Valid metric to filter by are:\n", paste(cols, collapse = ', '))
}
# apply different cutoff for each entry in clstScore
res = lapply( names(clstScore), function(id){
cl = clstScore[[id]]
cl[cl[[metric]] >= cutoff[id],c("id", "chrom", "cluster", metric)]
})
do.call("rbind", res)
}
#' Evaluate Jaccard index
#'
#' Evaluate Jaccard index
#'
#' @param a set 1
#' @param b set 2
#'
#' @examples
#' a = 1:10
#' b = 5:15
#' jaccard(a,b)
#'
#' @return Jaccard index
#' @export
jaccard = function(a,b){
n_I = length(intersect(a,b))
n_U = length(union(a,b))
n_I / n_U
}
`:=` <- function(a,b) {stop(":= should not be called directly")}
#' Collapse clusters based on jaccard index
#'
#' Collapse clusters if jaccard index between clusters exceeds a cutoff
#'
#' @param treeListClusters from createClusters()
#' @param featurePositions GRanges object storing location of each feature
#' @param jaccardCutoff cutoff value for jaccard index
#'
#' @return subset of clusters in treeListClusters that passes cutoff
#'
#' @examples
#' library(GenomicRanges)
#' library(BiocParallel)
#'
#' # load data
#' data('decorateData')
#'
#' # Evaluate hierarchical clustering
#' # adjacentCount is the number of adjacent peaks considered in correlation
#' treeList = runOrderedClusteringGenome( simData, simLocation)
#'
#' # Choose cutoffs and return cluster
#' treeListClusters = createClusters( treeList, method = "meanClusterSize", meanClusterSize=c( 10, 20, 30, 40, 50) )
#'
#' # Evaluate strength of correlation for each cluster
#' clstScore = scoreClusters(treeList, treeListClusters, BPPARAM = SerialParam() )
#'
#' # Filter to retain only strong clusters
#' clustInclude = retainClusters( clstScore, "LEF", 0.30 )
#'
#' # get retained clusters
#' treeListClusters_filter = filterClusters( treeListClusters, clustInclude)
#'
#' # collapse similar clusters
#' treeListClusters_collapse = collapseClusters( treeListClusters_filter, simLocation)
#'
#' @importFrom utils combn
#' @importFrom data.table data.table
#' @importFrom GenomicRanges seqnames
#' @export
collapseClusters = function(treeListClusters, featurePositions, jaccardCutoff=0.9){
if( ! is(featurePositions, "GRanges")){
stop("featurePositions must be a GRanges object")
}
if( is.null(names(featurePositions)) ){
stop("featurePositions must have identifiers for each interval accessable using names(featurePositions)")
}
# Drop empty chromsomes
chrNameCount = table(seqnames(featurePositions))
featurePositions = dropSeqlevels( featurePositions, names(chrNameCount[chrNameCount==0]))
# for each cluster in each chrom and cutoff value, get range
flattened = unlist( treeListClusters )
splt = strsplit( names(flattened), '\\.')
res2 = data.table(data.frame( msc = vapply(splt, function(x) x[1], "character"),
chrom = vapply(splt, function(x) x[2], "character"),
feature = vapply(splt, function(x) paste0(x[-c(1:2)],collapse='.'), "character"),
cluster = flattened,
stringsAsFactors=FALSE))
idx = match( res2$feature, names(featurePositions))
if( any(is.na(idx)) ){
stop("There are ", format(sum(is.na(idx)), big.mark=','), " features in treeListClusters that are not found in featurePositions")
}
res2$start = start(featurePositions)[idx]
res2$end = end(featurePositions)[idx]
res = res2[,data.frame( N = length(start),
start=min(start), end=max(end), stringsAsFactors=FALSE), by=c('msc', 'chrom', 'cluster')]
# Get all pairs of clusters, then filter out pairs with same cutoff or different chromsome
###################
message("Identifying redundant clusters...\n")
msc = chrom = cluster = msc.x = msc.y = chrom.x = chrom.y = N.x = N.y = key.x = key.y = .SD = key = NULL
resRetain = lapply( unique(res$chrom), function(CHROM){
# get only 1 chromosome
resChrom = res[chrom == CHROM,]
resChrom[, key:=do.call(paste,.SD)]
# get all combinations
# idx = t(combn(nrow(resChrom), 2))
# get only cluster combos that have overlapping ranges
gr = GRanges(resChrom$chrom, IRanges(resChrom$start, resChrom$end), name=resChrom$cluster)
fnd = findOverlaps( gr, gr)
# remove self matches
# fnd = fnd[which(fnd@from != fnd@to)]
# remove cases where A matches B, and B matches A
fnd = fnd[which(fnd@from < fnd@to)]
res_1 = resChrom[fnd@from,]
colnames(res_1) = paste0(colnames(res_1), '.x')
res_2 = resChrom[fnd@to,]
colnames(res_2) = paste0(colnames(res_2), '.y')
# get all combinations
df_combo = data.table(cbind( res_1, res_2))
# filter combinations
resLoc = df_combo[(msc.x!=msc.y)&(chrom.x==chrom.y),]
# get jaccard similarity between plairs of clusters
resLoc$jaccard = vapply( seq_len(nrow(resLoc)), function(k){
clst.x = treeListClusters[[resLoc$msc.x[k]]][[resLoc$chrom.x[k]]]
clusters.x = names(clst.x)[clst.x == resLoc$cluster.x[k]]
clst.y = treeListClusters[[resLoc$msc.y[k]]][[resLoc$chrom.y[k]]]
clusters.y = names(clst.y)[clst.y == resLoc$cluster.y[k]]
jaccard( clusters.x, clusters.y)
}, numeric(1))
# if jaccard index is larger than cutoff, drop the smaller cluster
resSub = resLoc[resLoc$jaccard > jaccardCutoff,]
drop.x = resSub[N.x < N.y,key.x]
drop.y = resSub[N.x >= N.y,key.y]
resChrom[!(key %in% c(drop.x, drop.y)),]
})
resRetain = do.call("rbind", resRetain)
# Keep only clusters in resRetain from treeListClusters
#######################################################
# count clusters for each cutoff and chromsome
clCount = lapply(treeListClusters, countClusters)
# create now object without these clusters
idx = vapply(clCount, function(x) sum(x) > 0, logical(1))
# get clusters stats for each cutoff and chromsome
res = lapply(names(treeListClusters)[idx], function(id){
res = lapply(names(treeListClusters[[id]]), function(chr){
# get clusters
clsts = treeListClusters[[id]][[chr]]
# of cluster in in the retain set
clsts[clsts %in% resRetain[msc==id & chrom==chr,cluster]]
})
names(res) = names(treeListClusters[[id]])
count = countClusters( new('epiclustDiscreteList',res) )
new('epiclustDiscreteList', res[count > 0])
})
names(res) = names(treeListClusters)[idx]
res = new('epiclustDiscreteListContain',res)
clCount = lapply(res, countClusters)
idx = vapply(clCount, function(x){
length(x) > 0 & any(x > 0)
}, logical(1))
# drop entries in res if does't contain clusters after filtering
new('epiclustDiscreteListContain', res[idx])
}
#' Allow subsetting of epiclustDiscreteListContain
#'
#' Allow subsetting of epiclustDiscreteListContain
#'
#' @param x epiclustDiscreteListContain
#' @param i index 1
#' @param j index 2
#' @param drop TRUE/FALSE
#' @param ... additional arguement
#'
#' @return subset of epiclustDiscreteListContain
#'
#' @examples
#' library(GenomicRanges)
#'
#' data('decorateData')
#'
#' # Evaluate hierarchical clustering
#' treeList = runOrderedClusteringGenome( simData, simLocation )
#'
#' # Choose cutoffs and return clusters
#' treeListClusters = createClusters( treeList )
#'
#' @export
setMethod("[", "epiclustDiscreteListContain",
function(x, i, j, ..., drop) {
# new("epiclustDiscreteListContain", x[i])
keys = names(x)[i]
obj = lapply(keys, function(key){
x[[key]]
})
names(obj) = keys
new('epiclustDiscreteListContain', obj)
})
#' Count clusters on each chromosome
#'
#' Count clusters on each chromosome
#'
#' @param treeListClusters from createClusters()
#'
#' @return count number of clusters on each chromsome
#'
#' @examples
#' library(GenomicRanges)
#'
#' data('decorateData')
#'
#' # Evaluate hierarchical clustering
#' treeList = runOrderedClusteringGenome( simData, simLocation )
#'
#' # Choose cutoffs and return clusters
#' treeListClusters = createClusters( treeList )
#'
#' # Count clusters on each chromsome
#' countClusters( treeListClusters )
#'
#' @export
#' @rdname countClusters-methods
setGeneric("countClusters", function(treeListClusters) standardGeneric("countClusters"))
#' @export
#' @rdname countClusters-methods
#' @aliases countClusters,epiclustDiscreteList-method
setMethod("countClusters", c("epiclustDiscreteList"),
function(treeListClusters){
.countClusters( treeListClusters )
})
#' @export
#' @rdname countClusters-methods
#' @aliases countClusters,epiclustDiscreteListContain-method
setMethod("countClusters", c("epiclustDiscreteListContain"),
function(treeListClusters){
sum(unlist(lapply( treeListClusters, .countClusters )))
})
.countClusters = function(treeListClusters){
vapply(treeListClusters, function(obj) length(unique(obj)), numeric(1))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.