R/find_clones.R

Defines functions find_clones

Documented in find_clones

#' Identify nodes in a hierarchical tree which qualify as clones.
#' Identify 'hard' clones first, then expand them to 'soft' clones. Expansion may 
#' result in clone mergers.
#' Based on hierarchical clustering, identify the hard/soft clones.
#' @param hc An hclust object with additional items generated by \code{hclust_tree}.
#' @param fdrthresh maximal allowed value for log10(FDR) for any pair of leaves 
#'        in a clone node. Default: -2.
#' @param sharemin A feature is considered 'widely shared' if present in sharemin 
#'        fraction of leaves in a node.Default: 0.90.
#' @param nshare Minimal number of 'widely shared' features in a hard clone. 
#'        Default: 3.
#' @param bymax Logical. If TRUE (default), use maximal, and otherwise mean, 
#'        FDR for the node as a criterion for a hard clone.
#' @param climbfromsize An integer: minimal size of a hard clone allowed to be 
#'        expanded 
#' @param climbtoshare An integer: expand the clone as long as the number of 
#'        widely shared features is at least this value
#' @return An hclust object, with hard/soft clones indicated
#' @export
find_clones <- function(hc, 
    fdrthresh = -2, 
    sharemin = 0.85, 
    nshare = 3, 
    bymax = T,
    climbfromsize = 2, 
    climbtoshare = 3){

  # fdrthresh: FDR criterion for clone nodes
  # share_min: A feature is considered shared if present in share_min fraction 
  #            of leaves in a node
  # nshare: Minimal number of shared features in a clone node
  # bymax: TRUE, Use maximal of mean FDR for the node to find clones?
  # climbfromsize
  # climbtoshare



  # Number of features (approximately) shared across the node
  # leaves of the node together with the shared features form a bicluster
  count_pins_share <- colSums(hc$sharing > sharemin)



  # A node is considered compliant if FDR is below and its sharing across above 
  # threshod for the node and all its descendants
#  if(bymax){
#    node_compliant <- (hc$maxfdr < fdrthresh & 
#          (count_pins_share - count_pins_share[nrow(hc$merge)]) > nshare)
#  } else {
#    node_compliant <- (hc$meanfdr < fdrthresh & 
#          (count_pins_share - count_pins_share[nrow(hc$merge)]) > nshare)
#  }

  if (bymax) { 
    node_compliant <- (hc$mergefdr < fdrthresh & count_pins_share > nshare)
  } else { 
    node_compliant <- (hc$meanfdr < fdrthresh & count_pins_share > nshare)
  }
  
  leftchild <- (hc$merge[,1] < 0)
  leftchild[hc$merge[,1] > 0] <- node_compliant[hc$merge[hc$merge[,1] > 0, 1]]

  rightchild <- (hc$merge[,2] < 0)
  rightchild[hc$merge[,2] > 0] <- node_compliant[hc$merge[hc$merge[,2] > 0, 2]]

  new_node_compliant <- node_compliant & leftchild & rightchild
  
  while(!is.na(new_node_compliant) && !is.na(node_compliant) && 
        !all(new_node_compliant == node_compliant)){
    node_compliant <- new_node_compliant

    leftchild <- (hc$merge[,1] < 0)
    leftchild[hc$merge[,1] > 0] <- node_compliant[hc$merge[hc$merge[,1] > 0, 1]]

    rightchild <- (hc$merge[,2] < 0)
    rightchild[hc$merge[,2] > 0] <- node_compliant[hc$merge[hc$merge[,2] > 0, 2]]

    new_node_compliant <- node_compliant & leftchild & rightchild
  }


  #Clone nodes are maximum compliant nodes
  clone_nodes <- setdiff((1:nrow(hc$merge))[node_compliant],
                         c(hc$merge[node_compliant, 1], hc$merge[node_compliant, 2]))

  hc$fdrthresh <- fdrthresh
  hc$clonenodes <- clone_nodes
  hc$bymax <- bymax
  hc$count_pins_share <- count_pins_share
  hc$sharemin <- sharemin
  hc$nshare <- nshare

  if(!is.null(hc$clonenodes)){
    hc$softclones <- hc_climb(
        hc, minsize = climbfromsize, 
        minshare = climbtoshare + hc$count_pins_share[nrow(hc$merge)])
  }

  return(hc)
}
seqpipe/SCclust documentation built on May 4, 2021, 11:56 a.m.