R/local_correlations.R

Defines functions .countClusters collapseClusters `:=` jaccard retainClusters filterClusters scoreClusters getFeaturesInClusterList getFeaturesInCluster whichCluster createClusters runOrderedClusteringGenome runOrderedClustering createCorrelationMatrix getPeakDistances

Documented in collapseClusters createClusters createCorrelationMatrix filterClusters getFeaturesInCluster getFeaturesInClusterList getPeakDistances jaccard retainClusters runOrderedClustering runOrderedClusteringGenome scoreClusters whichCluster

#' Compute distance between peaks
#'
#' Given a query set of genome intervals, report distance to all others within window
#'
#' @param query GRanges object of intervals to query
#' @param windowSize with of window around each interval in bp
#' 
#' @details A smaller window size will create a covariance matrix that is faster to evaluate, but larger windows give a better approximation and are less likely to have negative eigen value
#'
#' @return for all pairs of peaks within windowSize, report distance 
#' 
#' @examples
#' library(GenomicRanges)
#' 
#' query = GRanges(rep('chr1', 5), IRanges(1:5, 1:5))
#' 
#' getPeakDistances( query )
#' 
#' @export
#' @import GenomicRanges
getPeakDistances = function( query, windowSize=10000 ){

  # Drop empty chromsomes
  chrNameCount = table(seqnames(query))
  query = dropSeqlevels( query, names(chrNameCount[chrNameCount==0]))

  # expand windows
  windowRanges = query
  start(windowRanges) = start(windowRanges) - windowSize
  end(windowRanges) = end(windowRanges) + windowSize

  # get overlaps between query and expanded windows
  ovlaps = findOverlaps( windowRanges, query, ignore.strand=TRUE)
  rm(windowRanges);

  ovlapsDF = data.frame(ovlaps)
  rm(ovlaps);

  queryHits = subjectHits = NULL

  # get only top triangle because matrix is symmatric
  ovlapsDF = ovlapsDF[with(ovlapsDF, which(queryHits >= subjectHits)),]

  # compute distances
  v1 = start(query)[ovlapsDF$queryHits] - end(query)[ovlapsDF$subjectHits]
  v2 = end(query)[ovlapsDF$queryHits] - start(query)[ovlapsDF$subjectHits]
  v3 = start(query)[ovlapsDF$queryHits] - start(query)[ovlapsDF$subjectHits]
  v4 = end(query)[ovlapsDF$queryHits] - end(query)[ovlapsDF$subjectHits]

  # for each pair, save distances
  ovlapsDF$distance = pmin(abs(v1), abs(v2), abs(v3), abs(v4))
  rm(v1); rm(v2); rm(v3); rm(v4);

  # ovlapsDF$name1 = query$names[ovlaps@from]
  # ovlapsDF$name2 = query$names[ovlaps@to]
  ovlapsDF$score1 = score(query)[ovlapsDF$queryHits]
  ovlapsDF$score2 = score(query)[ovlapsDF$subjectHits]

  attr(ovlapsDF, "N") = length(query)
  ovlapsDF
}

#' Create correlation matrix
#'
#' Create correlation matrix based on correlation between pairs of peaks
#'
#' @param query GRanges object of intervals to query
#' @param regionQuant normalized quantifications of regions in query.  Rows are features, like in limma
#' @param adjacentCount number of adjacent entries to compute correlation against
#' @param windowSize width of window in bp around each interval beyond which weight is zero
#' @param method 'adjacent': compute corr on fixed count sliding window define by adjacentCount.  "distance": compute corr for entries within windowSize bp
#' @param method.corr specify which correlation method: "pearson" or "spearman"
#' @param quiet suppress messages 
#' @param setNANtoZero replace NAN correlation values with a zero
#'
#' @return for peak i and j with distance d_{i,j},
#' M[i,j] = cor( vobj$E[i,], vobj$E[j,] )
#' 
#' return sparse symmatric matrix 
#'
#' @examples
#'
#' data('decorateData')
#'
#' C = createCorrelationMatrix(simLocation, simData)
#'
#' @export
#' @importFrom Matrix sparseMatrix
createCorrelationMatrix = function( query, regionQuant, adjacentCount=500, windowSize=1e6, method = "adjacent", method.corr = c("pearson", "spearman"), quiet=FALSE, setNANtoZero=FALSE){

  if( nrow(regionQuant) != length(query) ){
    stop("Number of rows in regionQuant must equal entries in query")
  }

  if( method == "adjacent"){
    if( ! quiet ) message("adjacent: ", adjacentCount, 'entries\n')
    # define pairs based on number of entries
    start(query) = seq_len(length(query))
    end(query) = seq_len(length(query))

    df_peaksMap = getPeakDistances( query, windowSize=adjacentCount )

  }else if( method == "distance"){

    if( ! quiet ) message("distance: ", windowSize, 'bp\n')
    # define pairs based on distance
    df_peaksMap = getPeakDistances( query, windowSize=windowSize )

  }else{
    stop("method must be 'adjacent' or 'distance'")
  }

  corSubsetPairs( regionQuant, df_peaksMap$queryHits, df_peaksMap$subjectHits, method=method.corr, silent=quiet, setNANtoZero=setNANtoZero)
}


#' Run hierarchical clustering preserving order
#' 
#' Run hierarchical clustering preserving sequential order of entries
#' 
#' @param X data matrix were *rows* are features in sequential order
#' @param gr GenomicRanges object corresponding to rows in X
#' @param alpha mixture parameter weigting correlations (alpha=0) versus chromosome distances (alpha=1)
#' 
#' @return hclust tree
#'
#' @details Use hclustgeo in ClustGeo package to generate hierarchical clustering that preserves sequential order.
#'
#' Chavent, et al. 2017. ClustGeo: an R package for hierarchical clustering with spatial constraints. arXiv:1707.03897v2 doi:10.1007/s00180-018-0791-1
#'
#' @examples
#' library(GenomicRanges)
#' library(EnsDb.Hsapiens.v86)
#'
#' # load data
#' data('decorateData')
#'
#' # load gene locations
#' ensdb = EnsDb.Hsapiens.v86
#'
#' # Evaluate hierarchical clustering
#' treeList = runOrderedClusteringGenome( simData, simLocation ) 
#' 
#' # Choose cutoffs and return clusters
#' treeListClusters = createClusters( treeList )
#' 
#' # Plot correlations and clusters in region defined by query
#' query = range(simLocation)
#' 
#' plotDecorate( ensdb, treeList, treeListClusters, simLocation, query)
#'
#' @import vegan 
#' @importFrom ClustGeo hclustgeo
#' @importFrom stats as.dist dist
#' @export
runOrderedClustering = function( X, gr, alpha=0.5 ){

  if( is.null(gr) ){
    # compute spatial restriction matrix for ordered clustering
    N = nrow(X)
    dorder = matrix(0,N,N)
    rownames(dorder) = rownames(X)
    colnames(dorder) = rownames(X)
    for(k in seq(1,N-1)){
      idx = seq(k+1, N)
      dorder[idx,k] = seq(1, (N-k))
    }
    D2 = as.dist(dorder)

  }else{
    if( nrow(X) != length(gr) ){
      stop("# rows of X must equal # of entries in gr: ", nrow(X), ' != ', length(gr))
    }

    # get squared distance between all pairs of intervals
    D2 = as.dist( outer(gr,gr, "distance") )
  }

  # compute distance matrix for data
  D1 = dist( X )
  
  treeOriginal <- hclustgeo(D1, D2, alpha=alpha)

  rm(D1)
  rm(D2)

  treeReOrder = vegan:::reorder.hclust(treeOriginal, match(treeOriginal$labels, rownames(X)))

  rm(treeOriginal)

  treeReOrder
}

#' Class epiclust
#'
#' Class \code{epiclust} 
#'
#' @name epiclust-class
#' @rdname epiclust-class
#' @exportClass epiclust
setClass("epiclust", representation(clust="ANY", location="GRanges", adjacentCount="numeric", alpha="numeric", method="character", correlation="ANY"))

#' Class epiclustList
#'
#' Class \code{epiclustList} 
#'
#' @name epiclustList-class
#' @rdname epiclustList-class
#' @exportClass epiclustList
setClass("epiclustList", representation('ANY'))


setMethod("print", "epiclustList", function( x ){

  message("List of hierarchical clustering for", length(x), "chromosomes:\n\n")

  counts = lapply(x, function(x) length(x@clust$labels))

  df = data.frame(chrom = names(counts), nFeatures = unlist(counts), stringsAsFactors=FALSE)
  rownames(df) = c()
  print(df, row.names=FALSE)
  message("\n")
})

setMethod("show", "epiclustList", function( object ){
  print( object )
})

setMethod("print", "epiclust", function( x ){
  message("Hierarchical clustering for", length(x@clust$labels), "features\n\n")
})

setMethod("show", "epiclust", function( object ){
  print( object )
})

#' Run hierarchical clustering preserving order
#' 
#' Run hierarchical clustering preserving sequential order of entries
#' 
#' @param X data matrix were *rows* are features in sequential order
#' @param gr GenomicRanges object with entries corresponding to the *rows* of X
#' @param method 'adjclust': adjacency constrained clustering.  'hclustgeo': incorporate data correlation and distance in bp
#' @param quiet suppress messages
#' @param alpha use by 'hclustgeo': mixture parameter weighing correlations (alpha=0) versus chromosome distances (alpha=1)
#' @param adjacentCount used by 'adjclust': number of adjacent entries to compute correlation against
#' @param setNANtoZero replace NAN correlation values with a zero
#' @param method.corr Specify type of correlation: "pearson", "kendall", "spearman"
#' 
#' @return list hclust tree, one entry for each chromosome
#'
#' @details 
#' Use adjacency constrained clustering from adjclust package:
#'
#'Alia Dehman, Christophe Ambroise and Pierre Neuvial. 2015. Performance of a blockwise approach in variable selection using linkage disequilibrium information. BMC Bioinformatics 16:148 doi.org:10.1186/s12859-015-0556-6
#'
#' Or, use hclustgeo in ClustGeo package to generate hierarchical clustering that roughly preserves sequential order.
#'
#' Chavent, et al. 2017. ClustGeo: an R package for hierarchical clustering with spatial constraints. arXiv:1707.03897v2 doi:10.1007/s00180-018-0791-1
#'
#' @examples
#' library(GenomicRanges)
#' library(EnsDb.Hsapiens.v86)
#'
#' # load data
#' data('decorateData')
#'
#' # load gene locations
#' ensdb = EnsDb.Hsapiens.v86
#' 
#' # Evaluate hierarchical clustering
#' treeList = runOrderedClusteringGenome( simData, simLocation ) 
#' 
#' # Choose cutoffs and return clusters
#' treeListClusters = createClusters( treeList )
#' 
#' # Plot correlations and clusters in region defined by query
#' query = range(simLocation)
#' 
#' plotDecorate( ensdb, treeList, treeListClusters, simLocation, query)
#'
#' @importFrom rlist list.reverse
#' @importFrom GenomicRanges seqnames
#' @importFrom adjclust adjClust
#' @importFrom methods new
#' @importFrom stats cor
# @importFrom utils assignInNamespace
#' @export
runOrderedClusteringGenome = function( X, gr, method = c("adjclust", 'hclustgeo'), quiet=FALSE, alpha=0.5, adjacentCount=500, setNANtoZero=FALSE, method.corr = c("pearson", "spearman")){

  if( nrow(X) != length(gr) ){
    stop("# rows of X must equal # of entries in gr: ", nrow(X), ' != ', length(gr))
  }

  if( is.null(rownames(X)) ){
      stop("Data matrix X must have rownames that match names(gr)")
  }

  if( !all(rownames(X) %in% names(gr)) ){
    stop("All rownames of X must be found in names of gr so that all(rownames(X) %in% names(gr)) is TRUE  ")
  } 

  method <- match.arg(method)
  method.corr  <- match.arg(method.corr)

  if( alpha < 0 || alpha > 1 ){
      stop("Must specify agrument: alpha must be between 0 and 1")
  }

  # Drop empty chromsomes
  chrNameCount = table(seqnames(gr))
  gr = dropSeqlevels( gr, names(chrNameCount[chrNameCount==0]))

  seqNameCounts = table(seqnames(gr))
  seqNameCounts = seqNameCounts[seqNameCounts>0]

  # chromosomes must have >- 2 entries
  tooFew = seqNameCounts[seqNameCounts<=2]
  if( length(tooFew) > 0){
    stop("Must have >=2 entires on a chromosome in order to cluster.\nThe following have too few entries: ", paste(names(tooFew), collapse=', '), "\nRemove these chromosomes before analysis")
  }

  # run last (i.e. smallest) chrom first
  chromArray = as.character(seqnames(gr)@values)

  # run analysis
  ##############

  # remove costly sanity check in adjclust
  # f = function(mat){
  #   NULL
  # }

  # assignInNamespace(x="checkCondition", value = f, ns='adjclust')

  # for each chromosome
  treePerChrom = lapply( chromArray, function(chrom){

    if( !quiet ) message("\rEvaluating:", chrom, '          ')

    idx = which(array(GenomicRanges::seqnames(gr) == chrom))

    if(method == "adjclust"){
      C = createCorrelationMatrix( gr[idx], X[idx,], adjacentCount=adjacentCount, quiet=TRUE, setNANtoZero=setNANtoZero, method.corr=method.corr)

      # if a feature has no variance, the correlation with it is NaN
      # instead set it to zero
      # if( any(is.nan(C@x)) ){
      #   C@x[is.nan(C@x)] = 0
      #   diag(C) = 1
      # }

      h = min( adjacentCount, nrow(C)-1)
      fitClust = suppressMessages(adjClust( C^2, "similarity", h=h, strictCheck=FALSE))
      # fitClust = as.hclust(fitClust)

      res = new("epiclust", clust = fitClust, location=gr[idx], adjacentCount=adjacentCount, alpha=0, method=method, correlation=C)
    }else{

       if( method.corr != "pearson"){
        stop(paste("Correlation ", method.corr, "not currently implemented for", method))
       }

      # compute tree on this chromosome
      fitClust = runOrderedClustering( X[idx,], gr[idx], alpha=alpha)

      C = cor(t(X[idx,]))

      res = new("epiclust", clust = fitClust, location=gr[idx], adjacentCount=adjacentCount, alpha=alpha, method=method, correlation=C)
    }
    rm(C)
    rm(fitClust)
    gc()
    res
  })
  message('\n')

  names(treePerChrom) = chromArray
  
  new('epiclustList', treePerChrom )
}



#' Extract subset of data points
#' 
#' Extract subset of data points
#' 
#' @param fit epiclust or epiclustList object
#' @param query GenomicRanges object of which data points to retain
#' 
#' @return epiclust or epiclustList object
#'
#' @examples
#' library(GenomicRanges)
#' 
#' data('decorateData')
#' 
#' # Evaluate hierarchical clustering
#' treeList = runOrderedClusteringGenome( simData, simLocation ) 
#'
#' # extract subset of data after clustering 
#' res = getSubset( treeList, simLocation[1:10])
#' 
#' 
#' @importFrom GenomicRanges GRanges
#' @export
#' @docType methods
#' @rdname getSubset-methods
setGeneric("getSubset", function(fit, query) standardGeneric("getSubset"))

#' @importFrom GenomicRanges GRanges findOverlaps
#' @export
#' @rdname getSubset-methods
#' @aliases getSubset,epiclust,GRanges-method
setMethod("getSubset", c("epiclust", "GRanges"),
  function(fit, query) {
    # idx = NULL
    # v = match(seqnames(fit@location), seqnames(query))@values
    # if( any(!is.na(v)) > 0 ){

        # idx = which((seqnames(fit@location)@values %in% seqnames(query)@values) & (fit@location %within% query))

    # }

    ovlp = suppressWarnings(findOverlaps(fit@location, query))
    idx = ovlp@from

    if( length(idx) == 0){
      fitNew = NULL
    }else{
      fitNew = fit

      N = length(idx)

      if( N == 0 ){
        stop("Cannot select empty subset of entries")
      }
      idx2 = idx[seq_len(length(idx)-1)]
      fitNew@clust$merge = fitNew@clust$merge[idx2,]
      fitNew@clust$height = fitNew@clust$height[idx2]
      fitNew@clust$order = fitNew@clust$order[idx]
      fitNew@clust$labels = fitNew@clust$labels[idx]

      fitNew@location = fitNew@location[idx]

      fitNew@correlation = fitNew@correlation[idx,idx]
    }
    fitNew
  }
)

#' @importFrom GenomicRanges GRanges
#' @export
#' @rdname getSubset-methods
#' @aliases getSubset,epiclustList,GRanges-method
setMethod("getSubset", c("epiclustList", "GRanges"),
   function(fit, query){
    res = lapply(fit, function(x) getSubset(x, query))
    Filter(Negate(is.null), res)
  }
)



#' Create cluster from list of hclust objects
#'
#' Create cluster from list of hclust objects
#'
#' @param treeList list of hclust objects 
#' @param method 'capushe': slope heuristic. 'bstick': broken stick. 'meanClusterSize': create clusters based on target mean value. 
#' @param meanClusterSize select target mean cluster size.  Can be an array of values
#' @param pct minimum percentage of points for the plateau selection in capushe selection. Can be an array of values
#'
#' @return  Convert hierarchical clustering into discrete clusters based on selection criteria method
#'
#' @examples
#' library(GenomicRanges)
#' library(EnsDb.Hsapiens.v86)
#'
#' # load data
#' data('decorateData')
#'
#' # load gene locations
#' ensdb = EnsDb.Hsapiens.v86
#' 
#' # Evaluate hierarchical clustering
#' treeList = runOrderedClusteringGenome( simData, simLocation ) 
#' 
#' # Choose cutoffs and return clusters
#' treeListClusters = createClusters( treeList )
#' 
#' # Plot correlations and clusters in region defined by query
#' query = range(simLocation)
#' 
#' plotDecorate( ensdb, treeList, treeListClusters, simLocation, query)
#'
#' @import adjclust
#' @importFrom stats cutree
#' @export
createClusters = function(treeList, method = c("capushe", "bstick", "meanClusterSize"), meanClusterSize=50, pct=0.15) {
  
  method <- match.arg(method)

  message("Method:", method, '\n')

  if( method == "meanClusterSize"){

    # loop over values in meanClusterSize
    res = lapply( meanClusterSize, function(mcs){
      # get counts on each chromosome
      n_features = vapply(treeList, function(x){
         length(x@clust$order)
      }, FUN.VALUE=numeric(1))
      # combine counts across chromosomes
      n_features_total = sum(n_features)

      n_total_clusters = round(n_features_total / mcs)

      # cut tree to get average of meanClusterSize
      res = lapply(treeList, function(x){
        N = length(x@clust$order)
        frac = N / n_features_total
        n_clust = round( frac*n_total_clusters )

        # message(paste("\r", mcs, n_clust))
        cutree(x@clust, k=max(n_clust, 1))
      })
      names(res) = names(treeList)
      new('epiclustDiscreteList',res)
    })
    names(res) = meanClusterSize

  }else{

    if( treeList[[1]]@method == 'hclustgeo' ){
      stop("Method '", method, "' is only compatible with result of\nrunOrderedClusteringGenome(...,method ='adjclust')")
    }

    res = lapply(pct, function(val){
      res = lapply(treeList, function(x){
        select( x@clust, type=method, pct=val)
      })
      names(res) = names(treeList)  
      new('epiclustDiscreteList',res)   
    })
    names(res) = pct
  }
  new('epiclustDiscreteListContain',res)
}

# setClass("epiclustDiscrete", representation("array"))

# setMethod("print", "epiclustDiscrete", function( x ){
#   nFeatures = length(x)
#   nClusters = length(unique(x))
#   message(nFeatures, "features in", nClusters, "discrete clusters\n\n" )
# })

# setMethod("show", "epiclustDiscrete", function( object ){
#   print( object )
# })

#' Class epiclustDiscreteList
#'
#' Class \code{epiclustDiscreteList} 
#'
#' @name epiclustDiscreteList-class
#' @rdname epiclustDiscreteList-class
#' @exportClass epiclustDiscreteList
setClass("epiclustDiscreteList", representation('list'))

setMethod("print", "epiclustDiscreteList", function( x ){
  for( chrom in names(x) ){
    nFeatures = length(x[[chrom]])
    nClusters = length(unique(x[[chrom]]))
    message(paste0(chrom, ':'),nFeatures, "features in", nClusters, "discrete clusters\n\n" )
  }
})

setMethod("show", "epiclustDiscreteList", function( object ){
  print( object )
})



#' Class epiclustDiscreteListContain
#'
#' Class \code{epiclustDiscreteListContain}: is a list containing epiclustDiscreteList objects
#'
#' @name epiclustDiscreteListContain-class
#' @rdname epiclustDiscreteListContain-class
#' @exportClass epiclustDiscreteListContain
setClass("epiclustDiscreteListContain", representation('list'))

setMethod("print", "epiclustDiscreteListContain", function( x ){
  message("Clusters defined using", length(x), "parameter values:",  paste(names(x), collapse=', '), '\n\n' ) 

  for( id in names(x) ){
    message("parameter:", id, '\n')
    print(x[[id]])
  }
})

setMethod("show", "epiclustDiscreteListContain", function( object ){
  print( object )
})

#' Simulated data to show correlation clustering
#'
#' @docType data
#' @keywords decorateData
#' @name decorateData
#' @usage data(decorateData)
"simData"

#' Simulated data to show correlation clustering. GRanges object indicating genomic position of data in rows of simData
#'
#' @docType data
#' @keywords decorateData
#' @name decorateData
#' @usage data(decorateData)
"simLocation"


#' Simulated disease status for differential analysis
#'
#' @docType data
#' @keywords decorateData
#' @name decorateData
#' @usage data(decorateData)
"metadata"


#' Find which cluster a peak is in
#'
#' Find which cluster a peak is in
#'
#' @param treeListClusters from createClusters()
#' @param feature_id name of query feature, can also be array
#' @param id clustering parameter identifier.  After filtering by feature_id, filter by id 
#'
#' @return data.frame of chromosome and cluster
#'
#' @examples
#' library(GenomicRanges)
#' 
#' data('decorateData')
#' 
#' # Evaluate hierarchical clustering
#' treeList = runOrderedClusteringGenome( simData, simLocation ) 
#' 
#' # Choose cutoffs and return clusters
#' treeListClusters = createClusters( treeList, method='meanClusterSize', meanClusterSize = c(50, 100) )
#' 
#' # Find chromsome and cluster of peak_20
#' whichCluster( treeListClusters, 'peak_20')
#'
#' # Find chromsome and cluster of peak_20 with clustering parameter 50 
#' #  corresponding to meanClusterSize 
#' whichCluster( treeListClusters, 'peak_20', "50")
#'
#' # Search for multiple clusters
#' whichCluster( treeListClusters, c('peak_20', 'peak_21'), "50")
#'
#' @export
whichCluster = function(treeListClusters, feature_id, id=NULL){
  # find cluster based on anem
  res = lapply( names(treeListClusters), function(msc){
    res = lapply(names(treeListClusters[[msc]]), function(chrom){
      x = treeListClusters[[msc]][[chrom]]
      idx = match(feature_id, names(x))
      clust = x[idx]
      # if( is.na(clust)){
      #   clust = NA
      #   clust[is.na(clust)] = NA
      # }
      names(clust) = c()
      data.frame( id=msc, chrom, feature_id, cluster=clust, stringsAsFactors=FALSE)
    })
    do.call('rbind', res)
  })
  res = do.call('rbind', res)

  res = res[!is.na(res$cluster),]

  # if specified, filter by ID
  if(!is.null(id) ){
    id = unique(id)
    res = res[res$id%in%as.character(id),]
  }
  res
}


#' Get feature names in selected cluster
#'
#' Get feature names in selected cluster given chrom and clusterid 
#'
#' @param treeListClusters from createClusters()
#' @param chrom chromosome name of cluster
#' @param clustID cluster identifier
#' @param id clustering parameter identifier
#'
#' @return array of feature names
#'
#' @examples
#' library(GenomicRanges)
#' 
#' data('decorateData')
#' 
#' # Evaluate hierarchical clustering
#' treeList = runOrderedClusteringGenome( simData, simLocation ) 
#' 
#' # Choose cutoffs and return clusters
#' treeListClusters = createClusters( treeList, method='meanClusterSize', meanClusterSize = 50 )
#' 
#' # Find chromsome and cluster of peak_204
#' getFeaturesInCluster( treeListClusters, "chr20", 3, "50")
#'
#' @export
getFeaturesInCluster = function( treeListClusters, chrom, clustID, id){

  idx = which( treeListClusters[[id]][[chrom]] == clustID )

  clsts = treeListClusters[[id]][[chrom]][idx]

  names(clsts)
}



#' Get feature names in selected cluster
#'
#' Get feature names in selected cluster given array of chrom and cluster ids 
#'
#' @param treeListClusters from createClusters()
#' @param chrom chromosome name of cluster
#' @param clustID cluster identifier
#' @param id clustering parameter identifier
#'
#' @return list of array of feature names.  Query with index i as returned in list index i
#'
#' @examples
#' library(GenomicRanges)
#' 
#' data('decorateData')
#' 
#' # Evaluate hierarchical clustering
#' treeList = runOrderedClusteringGenome( simData, simLocation ) 
#' 
#' # Choose cutoffs and return clusters
#' treeListClusters = createClusters( treeList, method='meanClusterSize', meanClusterSize = 50 )
#' 
#' df_cluster = data.frame( chrom = c("chr20", "chr20"), 
#'   cluster = c(3,5), 
#'   id = c("50", "50"), stringsAsFactors=FALSE )
#' 
#' # Find features for the clusters in df_cluster
#' getFeaturesInClusterList( treeListClusters, chrom=df_cluster$chrom, clustID=df_cluster$cluster, id=df_cluster$id)
#'
#' @export
getFeaturesInClusterList = function( treeListClusters, chrom, clustID, id){

  if( length(unique(c(length(chrom), length(clustID), length(id)))) != 1){
    stop("Number of entries in chrom, clustID, id must be equal")
  }

  res = sapply( seq_len(length(chrom)), function(i){

    idx = which( treeListClusters[[id[i]]][[chrom[i]]] == clustID[i] )

    clsts = treeListClusters[[id[i]]][[chrom[i]]][idx]

    names(clsts)
  })
  res
}




#' Compute scores for each cluster
#'
#' For each cluster compute summary statistics for the cluster to measure how strong the correlation structure is.  Clusters with weak correlation structure can be dropped from downstream analysis.
#'
#' @param treeList list of hclust objects 
#' @param treeListClusters from createClusters()
#' @param BPPARAM parameters for parallel evaluation
#' 
#' @details For each cluster, extract the correlation matrix and return the mean absolute correlation; the 75th, 90th and 95th quantile absolute correlation, and LEF, the leading eigen-value fraction which is the fraction of variance explained by the leading eigen value of the matrix abs(C).
#'
#' @return for all pairs of peaks within windowSize, report distance 
#' 
#' @examples
#' library(GenomicRanges)
#' library(BiocParallel)
#' 
#' data('decorateData')
#' 
#' # Evaluate hierarchical clustering
#' treeList = runOrderedClusteringGenome( simData, simLocation ) 
#' 
#' # Choose cutoffs and return clusters
#' treeListClusters = createClusters( treeList )
#'
#' # Evaluate score for each cluster
#' clstScore = scoreClusters(treeList, treeListClusters, BPPARAM = SerialParam() )
#' 
#' @export
#' @importFrom stats quantile
#' @importFrom progress progress_bar
#' @import BiocParallel 
# @importFrom irlba partial_eigen
scoreClusters = function(treeList, treeListClusters, BPPARAM=bpparam()){

  # get data.frame of cluster names per chromosomes
  # clCount = countClusters( treeListClusters )
  clCount = lapply(treeListClusters, countClusters)

  df_count = lapply(names(clCount), function(id){
    lapply( seq_len(length(clCount[[id]])), function(i){
    x = clCount[[id]][i]
    data.frame(chrom=rep(names(x), x), cluster=seq(x), meanClusterSize=as.numeric(id), stringsAsFactors=FALSE)
    })
  })
  df_count = do.call("rbind", unlist(df_count, recursive=FALSE))


  # evaluate for each cluster
  .score_clusts = function( obj ) {

    batch = clustID = peakID = NA

    values = unique(obj$treeListClusters)

    df_values = data.frame(peakID = names(obj$treeListClusters), clustID = obj$treeListClusters, stringsAsFactors=FALSE)
    df_values = data.table::data.table(df_values)
    n_batches = 50
    df_values[,batch:=round(clustID / n_batches)]

    Clist = lapply( unique(df_values$batch), function(btch){
      peakIDs = df_values[batch==btch,peakID]
      as.matrix(obj$treeList@correlation[peakIDs,peakIDs])
      })
    names(Clist) = unique(df_values$batch)

    res = lapply( values, function( clustID ){
      peakIDs = names(obj$treeListClusters[obj$treeListClusters==clustID])

      N = length(peakIDs)

      cor_values = 1
      LEF = 1

      if( N > 1){
        # Get correlation matrix
        # C = as.matrix(obj$treeList@correlation[peakIDs,peakIDs])
        batch = df_values[peakID%in%peakIDs,as.character(unique(batch))]
        C = Clist[[batch]][peakIDs,peakIDs]

        # Compute lead eigen-value fraction using full eigen spectrum
        # eigen analysis
        dcmp = eigen(abs(C), symmetric=TRUE, only.values = TRUE)
        LEF = dcmp$values[1] / sum(dcmp$values)

        cor_values = C[lower.tri(C)]
        cor_values = cor_values[is.finite(cor_values)]
      }
      # compute statistics
      data.frame( id=obj$id, chrom = obj$chrom, 
        cluster = clustID, N=N,
        mean_abs_corr = mean(abs(cor_values)), 
        quantile75 = quantile(abs(cor_values), .75), 
        quantile90 = quantile(abs(cor_values), .90), 
        quantile95 = quantile(abs(cor_values), .95),
        LEF=LEF, stringsAsFactors=FALSE)
    })
    do.call("rbind", res)
  }
  
  message("Evaluating strength of each cluster...\n\n")

  it = corrIterBatch( treeList, treeListClusters, df_count )
  message(paste0("Dividing work into ",attr(it, "n_chunks")," chunks...\n"))

  res = bpiterate( it, .score_clusts, BPPARAM=BPPARAM)

  res = do.call("rbind", res)
  rownames(res) = c()

  mscArray = unique(df_count$meanClusterSize)
  clustScoresList = lapply( mscArray, function(msc){
     res[res$id==msc,]
    })
  names(clustScoresList) = mscArray
  clustScoresList
}




#' Extract subset of clusters
#'
#' Extract subset of clusters based on entries in chroms and clusters
#'
#' @param treeListClusters from createClusters()
#' @param clustInclude data.frame from retainClusters() indcating which clusters to include
#' 
#' @return epiclustDiscreteList of specified clusters
#' 
#' @examples
#' library(GenomicRanges)
#' library(BiocParallel)
#' 
#' data('decorateData')
#' 
#' # Evaluate hierarchical clustering
#' treeList = runOrderedClusteringGenome( simData, simLocation ) 
#' 
#' # Choose cutoffs and return clusters
#' treeListClusters = createClusters( treeList )
#'
#' # Evaluate score for each cluster
#' clstScore = scoreClusters(treeList, treeListClusters, BPPARAM=SerialParam() )
#' 
#' # Retain clusters that pass this criteria
#' clustInclude = retainClusters( clstScore, "LEF", 0.30 )
#' 
#' # get retained clusters
#' treeListClusters_filter = filterClusters( treeListClusters, clustInclude)
#' 
#' @export
filterClusters = function( treeListClusters, clustInclude){

  res = lapply( names(treeListClusters), function(msc){
    tlc = treeListClusters[[as.character(msc)]]

    resList = lapply( names(tlc), function(chrom){ 
      idx = which(chrom == clustInclude$chrom & msc == clustInclude$id)     
      idx2 = tlc[[chrom]] %in% clustInclude$cluster[idx]
      tlc[[chrom]][idx2]
    })
    names(resList) = names(tlc)
    new('epiclustDiscreteList', resList)
  })
  names(res) = names(treeListClusters)
  
  new('epiclustDiscreteListContain',res)
}



#' Retain clusters by applying filter
#'
#' Retain clusters by applying filter
#' 
#' @param clstScore score each cluster using scoreClusters()
#' @param metric column of clstScore to use in filtering
#' @param cutoff retain cluster than exceed the cutoff for metric.  Can be array with one entry per entry in clstScore
#'
#' @return data.frame of chrom, clutser, id (the clustering parameter value), and the specified metric
#'
#' @examples
#' library(GenomicRanges)
#' library(BiocParallel)
#' 
#' data('decorateData')
#' 
#' # Evaluate hierarchical clustering
#' treeList = runOrderedClusteringGenome( simData, simLocation ) 
#' 
#' # Choose cutoffs and return clusters
#' treeListClusters = createClusters( treeList )
#'
#' # Evaluate score for each cluster
#' clstScore = scoreClusters(treeList, treeListClusters, BPPARAM = SerialParam() )
#' 
#' # Retain clusters that pass this criteria
#' clustInclude = retainClusters( clstScore, "LEF", 0.30 )
#' 
#' # Or filter by mean absolute correlation
#' # clustInclude = retainClusters( clstScore, "mean_abs_corr", 0.1 )
#' 
#' # get retained clusters
#' treeListClusters_filter = filterClusters( treeListClusters, clustInclude )
#'
#' @export
retainClusters = function(clstScore, metric="LEF", cutoff = 0.40){

  if( length(cutoff) == 0){
    stop("Must specify cutoff")
  }

  if( (length(cutoff) > 1) & (length(cutoff) != length(clstScore)) ){
    stop( paste("Array values specified for cutoff, but number of entries must match\nnumber of entries in clstScore:", length(clstScore)))
  }

  # if cutoff is a scalar, repeated it once for each entry in clstScore
  if( length(cutoff) == 1 ){
    cutoff = rep(cutoff, length(clstScore) )
  }

  names(cutoff) = names(clstScore)

  message("Using cutoffs:\n")
  message("Cluster set\tcutoff\n")
  for(i in seq_len(length(clstScore))){
    message(paste0(" ", names(clstScore)[i], "\t\t", format(cutoff[i], digits=3), '\n'))
  }
  message("\n")

  clstScoreDF = do.call("rbind", clstScore)

  if( ! metric %in% colnames(clstScoreDF) ){
     cols = colnames(clstScoreDF)[!colnames(clstScoreDF) %in% c("id", "chrom", "cluster")]

    stop("Valid metric to filter by are:\n", paste(cols, collapse = ', '))
  }

  # apply different cutoff for each entry in clstScore
  res = lapply( names(clstScore), function(id){
    cl = clstScore[[id]]
    cl[cl[[metric]] >= cutoff[id],c("id", "chrom", "cluster", metric)]
    })

  do.call("rbind", res)  
}


#' Evaluate Jaccard index
#' 
#' Evaluate Jaccard index
#' 
#' @param a set 1
#' @param b set 2
#'
#' @examples
#' a = 1:10
#' b = 5:15
#' jaccard(a,b)
#'
#' @return Jaccard index
#' @export
jaccard = function(a,b){
  n_I = length(intersect(a,b))
  n_U = length(union(a,b))
  n_I / n_U
}

`:=` <- function(a,b) {stop(":= should not be called directly")}

#' Collapse clusters based on jaccard index
#' 
#' Collapse clusters if jaccard index between clusters exceeds a cutoff
#' 
#' @param treeListClusters from createClusters()
#' @param featurePositions GRanges object storing location of each feature
#' @param jaccardCutoff cutoff value for jaccard index
#'
#' @return subset of clusters in treeListClusters that passes cutoff
#'
#' @examples
#' library(GenomicRanges)
#' library(BiocParallel)
#' 
#' # load data
#' data('decorateData')
#' 
#' # Evaluate hierarchical clustering
#' # adjacentCount is the number of adjacent peaks considered in correlation
#' treeList = runOrderedClusteringGenome( simData, simLocation)
#' 
#' # Choose cutoffs and return cluster
#' treeListClusters = createClusters( treeList, method = "meanClusterSize", meanClusterSize=c( 10, 20, 30, 40, 50) )
#' 
#' # Evaluate strength of correlation for each cluster
#' clstScore = scoreClusters(treeList, treeListClusters, BPPARAM = SerialParam() )
#' 
#' # Filter to retain only strong clusters
#' clustInclude = retainClusters( clstScore, "LEF", 0.30 )
#' 
#' # get retained clusters
#' treeListClusters_filter = filterClusters( treeListClusters, clustInclude)
#' 
#' # collapse similar clusters
#' treeListClusters_collapse = collapseClusters( treeListClusters_filter, simLocation)
#' 
#' @importFrom utils combn
#' @importFrom data.table data.table
#' @importFrom GenomicRanges seqnames
#' @export
collapseClusters = function(treeListClusters, featurePositions, jaccardCutoff=0.9){

  if( ! is(featurePositions, "GRanges")){
    stop("featurePositions must be a GRanges object")
  }

  if( is.null(names(featurePositions)) ){
    stop("featurePositions must have identifiers for each interval accessable using names(featurePositions)")
  }

  # Drop empty chromsomes
  chrNameCount = table(seqnames(featurePositions))
  featurePositions = dropSeqlevels( featurePositions, names(chrNameCount[chrNameCount==0]))

  # for each cluster in each chrom and cutoff value, get range
  flattened = unlist( treeListClusters )
  splt = strsplit(  names(flattened), '\\.')
  res2 = data.table(data.frame( msc = vapply(splt, function(x) x[1], "character"),
      chrom = vapply(splt, function(x) x[2], "character"),
      feature = vapply(splt, function(x) paste0(x[-c(1:2)],collapse='.'), "character"),
      cluster = flattened,
      stringsAsFactors=FALSE)) 

  idx = match( res2$feature, names(featurePositions))

  if( any(is.na(idx)) ){
    stop("There are ", format(sum(is.na(idx)), big.mark=','), " features in treeListClusters that are not found in featurePositions")
  }

  res2$start = start(featurePositions)[idx]
  res2$end = end(featurePositions)[idx]

  res = res2[,data.frame( N = length(start),
    start=min(start), end=max(end), stringsAsFactors=FALSE), by=c('msc', 'chrom', 'cluster')]

  # Get all pairs of clusters, then filter out pairs with same cutoff or different chromsome
  ###################

  message("Identifying redundant clusters...\n")

  msc = chrom = cluster = msc.x = msc.y = chrom.x = chrom.y = N.x = N.y = key.x = key.y = .SD = key = NULL

  resRetain = lapply( unique(res$chrom), function(CHROM){

    # get only 1 chromosome
    resChrom = res[chrom == CHROM,]
    resChrom[, key:=do.call(paste,.SD)]

    # get all combinations
    # idx = t(combn(nrow(resChrom), 2))

    # get only cluster combos that have overlapping ranges
    gr = GRanges(resChrom$chrom, IRanges(resChrom$start, resChrom$end), name=resChrom$cluster)
    fnd = findOverlaps( gr, gr)

    # remove self matches
    # fnd = fnd[which(fnd@from != fnd@to)]
    # remove cases where A matches B, and B matches A
    fnd = fnd[which(fnd@from < fnd@to)]

    res_1 = resChrom[fnd@from,]
    colnames(res_1) = paste0(colnames(res_1), '.x')
    res_2 = resChrom[fnd@to,]
    colnames(res_2) = paste0(colnames(res_2), '.y')

    # get all combinations
    df_combo = data.table(cbind( res_1, res_2))

    # filter combinations
    resLoc = df_combo[(msc.x!=msc.y)&(chrom.x==chrom.y),]

    # get jaccard similarity between plairs of clusters
    resLoc$jaccard = vapply( seq_len(nrow(resLoc)), function(k){

      clst.x = treeListClusters[[resLoc$msc.x[k]]][[resLoc$chrom.x[k]]]
      clusters.x = names(clst.x)[clst.x == resLoc$cluster.x[k]]

      clst.y = treeListClusters[[resLoc$msc.y[k]]][[resLoc$chrom.y[k]]]
      clusters.y = names(clst.y)[clst.y == resLoc$cluster.y[k]]

      jaccard( clusters.x, clusters.y)
    }, numeric(1))

    # if jaccard index is larger than cutoff, drop the smaller cluster
    resSub = resLoc[resLoc$jaccard > jaccardCutoff,]
    drop.x = resSub[N.x < N.y,key.x] 
    drop.y = resSub[N.x >= N.y,key.y] 

    resChrom[!(key %in% c(drop.x, drop.y)),]
  })

  resRetain = do.call("rbind", resRetain)

  # Keep only clusters in resRetain from treeListClusters
  #######################################################

  # count clusters for each cutoff and chromsome
  clCount = lapply(treeListClusters, countClusters)

  # create now object without these clusters
  idx = vapply(clCount, function(x) sum(x) > 0, logical(1))

  # get clusters stats for each cutoff and chromsome
  res = lapply(names(treeListClusters)[idx], function(id){
    res = lapply(names(treeListClusters[[id]]), function(chr){

      # get clusters
      clsts = treeListClusters[[id]][[chr]]
      
      # of cluster in in the retain set
      clsts[clsts %in% resRetain[msc==id & chrom==chr,cluster]]
    })
    names(res) = names(treeListClusters[[id]])
    count = countClusters( new('epiclustDiscreteList',res) )    
    new('epiclustDiscreteList', res[count > 0])
  })
  names(res) = names(treeListClusters)[idx]

  res = new('epiclustDiscreteListContain',res)

  clCount = lapply(res, countClusters)

  idx = vapply(clCount, function(x){
      length(x) > 0 && x > 0
    }, logical(1))

  # drop entries in res if does't contain clusters after filtering
  new('epiclustDiscreteListContain', res[idx])
}


#' Allow subsetting of epiclustDiscreteListContain
#'
#' Allow subsetting of epiclustDiscreteListContain
#'
#' @param x epiclustDiscreteListContain
#' @param i index 1
#' @param j index 2
#' @param drop TRUE/FALSE
#' @param ... additional arguement
#'
#' @return subset of epiclustDiscreteListContain
#'
#' @examples
#' library(GenomicRanges)
#' 
#' data('decorateData')
#' 
#' # Evaluate hierarchical clustering
#' treeList = runOrderedClusteringGenome( simData, simLocation ) 
#' 
#' # Choose cutoffs and return clusters
#' treeListClusters = createClusters( treeList )
#'
#' @export
setMethod("[", "epiclustDiscreteListContain",
  function(x, i, j, ..., drop) {
    # new("epiclustDiscreteListContain", x[i])
    keys = names(x)[i]

    obj = lapply(keys, function(key){
      x[[key]]
      })
    names(obj) = keys
    new('epiclustDiscreteListContain', obj)
  })


#' Count clusters on each chromosome
#'
#' Count clusters on each chromosome
#'
#' @param treeListClusters from createClusters()
#'
#' @return  count number of clusters on each chromsome
#'
#' @examples
#' library(GenomicRanges)
#' 
#' data('decorateData')
#' 
#' # Evaluate hierarchical clustering
#' treeList = runOrderedClusteringGenome( simData, simLocation ) 
#' 
#' # Choose cutoffs and return clusters
#' treeListClusters = createClusters( treeList )
#' 
#' # Count clusters on each chromsome
#' countClusters( treeListClusters )
#'
#' @export
#' @rdname countClusters-methods
setGeneric("countClusters", function(treeListClusters) standardGeneric("countClusters"))

#' @export
#' @rdname countClusters-methods
#' @aliases countClusters,epiclustDiscreteList-method
setMethod("countClusters", c("epiclustDiscreteList"),
  function(treeListClusters){
    .countClusters( treeListClusters )
})


#' @export
#' @rdname countClusters-methods
#' @aliases countClusters,epiclustDiscreteListContain-method
setMethod("countClusters", c("epiclustDiscreteListContain"),
  function(treeListClusters){
    sum(unlist(lapply( treeListClusters, .countClusters )))
})

.countClusters = function(treeListClusters){
    vapply(treeListClusters, function(obj) length(unique(obj)), numeric(1))
}
GabrielHoffman/decorate documentation built on Sept. 27, 2022, 1:26 a.m.