# R/branchSplit.R

# Defines functions branchSplitFromStabilityLabels.individualFraction branchSplitFromStabilityLabels.prediction branchSplitFromStabilityLabels branchSplit.dissim .sizeDependentQuantile .meanInRange branchSplit .histogramsWithCommonBreaks hierarchicalBranchEigengeneDissim branchEigengeneSimilarity mtd.branchEigengeneDissim branchEigengeneDissim .alignedFirstPC

# ======================================================================================================
#
# Aligned svd: svd plus aligning the result along the average expression of the input data.
#
# ======================================================================================================

# CAUTION: this function assumes normalized x and no missing values.

# First principal component of x ("eigengene"), sign-aligned with a weighted
# mean of the columns of x so that it points in the direction of the majority
# of the highly correlated columns.
# CAUTION: this function assumes normalized x and no missing values.
#
# Args:
#   x:       numeric matrix, samples in rows and genes (variables) in columns.
#   power:   soft-threshold exponent applied to |cor| to form column weights.
#   verbose: if > 0, print a message when svd fails and the fallback is used.
#   indent:  indentation level for printed messages.
#
# Returns: one value per row of x. In the svd branch this is the first left
# singular vector (possibly sign-flipped); in the fallback branch it is the
# output of scale() on a weighted row mean (a 1-column matrix — callers
# appear to treat it as a vector).
.alignedFirstPC <- function(x, power = 6, verbose = 2, indent = 0) {
  x <- as.matrix(x)
  # printFlush(paste(".alignedFirstPC: dim(x) = ", paste(dim(x), collapse = ", ")));
  # First left singular vector; svd can fail to converge for some inputs,
  # hence the try() and the weighted-mean fallback below.
  pc <- try(svd(x, nu = 1, nv = 0)$u[, 1], silent = TRUE)
  if (inherits(pc, "try-error")) {
    # file = "alignedFirstPC-svdError-inputData-x.RData";
    # save(x, file = file);
    # stop(paste("Error in .alignedFirstPC: svd failed with following message: \n    ",
    #                 pc, "\n. Saving the offending data into file", file));
    if (verbose > 0) {
      spaces <- indentSpaces(indent)
      printFlush(paste(
        spaces, ".alignedFirstPC: FYI: svd failed, using a weighted mean instead.\n",
        spaces, "  ...svd reported:", pc
      ))
    }
    # Fallback: start from the plain row mean, then weight each column by
    # |cor(column, row mean)|^power so poorly correlated columns contribute
    # little to the final (scaled) weighted row mean.
    pc <- rowMeans(x, na.rm = TRUE)
    weight <- matrix(abs(cor(x, pc, use = "p"))^power, nrow(x), ncol(x), byrow = TRUE)
    pc <- scale(rowMeans(x * weight, na.rm = TRUE))
  } else {
    # Sign alignment: flip the PC if it is anti-correlated (negative
    # covariance) with the weighted mean of the columns.
    # NOTE(review): rowWeightedMeans is presumably matrixStats — confirm.
    weight <- abs(cor(x, pc))^power
    meanX <- rowWeightedMeans(x, weight, na.rm = TRUE)
    cov1 <- cov(pc, meanX)
    if (!is.finite(cov1)) cov1 <- 0
    if (cov1 < 0) pc <- -pc
  }
  pc
}


# ==========================================================================================================
#
# Branch eigengene split (dissimilarity) calculation
#
# ==========================================================================================================

# Assumes correct input: multiExpr is scaled to mean 0 and variance 1, branch1 and branch2 are numeric
# indices that have no overlap.

# Eigengene dissimilarity between two branches of genes.
# Assumes correct input: expr is scaled to mean 0 and variance 1, branch1 and
# branch2 are numeric column indices with no overlap.
#
# Args:
#   expr:       expression matrix (samples x genes).
#   branch1, branch2: column indices of the two branches.
#   corFnc, corOptions: correlation function and extra arguments for it.
#   signed:     if TRUE, return 1 - cor; otherwise 1 - |cor|.
#
# Returns: a single numeric dissimilarity.
branchEigengeneDissim <- function(expr, branch1, branch2,
                                  corFnc = cor, corOptions = list(use = "p"),
                                  signed = TRUE, ...) {
  # Aligned first principal components ("eigengenes") of each branch.
  pc1 <- .alignedFirstPC(expr[, branch1], verbose = 0)
  pc2 <- .alignedFirstPC(expr[, branch2], verbose = 0)

  corArgs <- corOptions
  corArgs$x <- pc1
  corArgs$y <- pc2

  cor0 <- as.numeric(do.call(match.fun(corFnc), corArgs))
  if (length(cor0) != 1) stop("Internal error in branchEigengeneDissim: cor has length ", length(cor0))

  if (signed) 1 - cor0 else 1 - abs(cor0)
}


# Consensus eigengene dissimilarity of two branches across the sets in
# multiExpr: per-set dissimilarities are combined via a quantile.
#
# Args:
#   multiExpr: multi-set expression data (multiData format).
#   branch1, branch2, corFnc, corOptions, signed: passed to branchEigengeneDissim.
#   consensusQuantile: consensus is the (1 - consensusQuantile) quantile of
#     the per-set dissimilarities.
#   reproduceQuantileError: if TRUE, reproduce the historical behavior of
#     using consensusQuantile directly as the probability.
#
# Returns: a single numeric consensus dissimilarity.
mtd.branchEigengeneDissim <- function(multiExpr, branch1, branch2,
                                      corFnc = cor, corOptions = list(use = "p"),
                                      consensusQuantile = 0, signed = TRUE, reproduceQuantileError = FALSE, ...) {
  # Per-set branch eigengene dissimilarities.
  dissimPerSet <- mtd.apply(multiExpr, branchEigengeneDissim,
    branch1 = branch1, branch2 = branch2,
    corFnc = corFnc, corOptions = corOptions,
    signed = signed,
    returnList = TRUE
  )
  targetProb <- if (reproduceQuantileError) consensusQuantile else 1 - consensusQuantile
  quantile(unlist(dissimPerSet), prob = targetProb, na.rm = TRUE, names = FALSE)
}

# Eigengene similarity (or dissimilarity) of two branches, using the
# correlation function, options and network type stored in networkOptions.
#
# Args:
#   expr:           expression matrix (samples x genes).
#   branch1, branch2: column indices of the two branches.
#   networkOptions: list with components corFnc, corOptions and networkType.
#   returnDissim:   if TRUE, return 1 - similarity instead of the similarity.
#
# Returns: a single numeric similarity (or dissimilarity).
branchEigengeneSimilarity <- function(expr, branch1, branch2,
                                      networkOptions,
                                      returnDissim = TRUE,
                                      ...) {
  corFnc <- match.fun(networkOptions$corFnc)
  # Correlation of the aligned first PCs ("eigengenes") of the two branches.
  cor0 <- as.numeric(do.call(
    corFnc,
    c(
      list(
        x = .alignedFirstPC(expr[, branch1], verbose = 0),
        y = .alignedFirstPC(expr[, branch2], verbose = 0)
      ),
      networkOptions$corOptions
    )
  ))
  if (length(cor0) != 1) stop("Internal error in branchEigengeneSimilarity: cor has length ", length(cor0))
  # NOTE(review): grepl("signed", ...) also matches "unsigned", so abs() is
  # applied for all standard network types; confirm whether the intended test
  # was for unsigned networks only.
  if (grepl("signed", networkOptions$networkType)) cor0 <- abs(cor0)
  if (returnDissim) 1 - cor0 else cor0
}


# Hierarchical-consensus eigengene dissimilarity of two branches: per-set
# similarities are combined via simpleHierarchicalConsensusCalculation and the
# consensus similarity is converted to a dissimilarity.
#
# Args:
#   multiExpr:      multi-set expression data (multiData format).
#   branch1, branch2: column indices of the two branches.
#   networkOptions: per-set network options (multiData format).
#   consensusTree:  consensus specification for the hierarchical calculation.
#
# Returns: 1 - consensus similarity.
hierarchicalBranchEigengeneDissim <- function(
                                              multiExpr,
                                              branch1, branch2,
                                              networkOptions,
                                              consensusTree, ...) {
  # Per-set eigengene similarities between the two branches.
  similarities <- mtd.mapply(branchEigengeneSimilarity,
    expr = multiExpr, networkOptions = networkOptions,
    MoreArgs = list(
      branch1 = branch1, branch2 = branch2,
      returnDissim = FALSE
    ), returnList = TRUE
  )

  consensusSim <- simpleHierarchicalConsensusCalculation(
    individualData = similarities,
    consensusTree = consensusTree
  )
  1 - consensusSim
}






# ==========================================================================================================
#
# Branch split calculation
#
# ==========================================================================================================
# Caution: highly experimental!

# Define a function that's supposed to decide whether two given groups of expression data are part of a
# single module, or truly two independent modules.
# assumes no missing values for now.
# assumes all data is scaled to mean zero and equal variance.

# return: criterion is zero or near zero if it looks like a single module, and is near 1 if looks like two
# modules.

# Careful: in the interest of speedy execution, the function doesn't check arguments for validity. For
# example, it assumes that expr is already scaled to the same mean and variance, branch1 and branch2 are valid
# indices, nConsideredPCs does not exceed any of the dimensions of expr etc.

# Histograms of `data`, one per group, computed on a common set of breaks so
# that the histograms are directly comparable bin-by-bin. Optionally discards
# a proportion of extreme values (outliers) from each group first.
#
# Args:
#   data:        numeric vector with group labels in `groups`, or a list of
#                numeric vectors (one per group; then `groups` is ignored and
#                rebuilt as 1..nGroups over the non-empty list elements).
#   groups:      group labels parallel to data; expected to be 1, 2, ...
#   discardProp: per-group proportion of extreme values to discard on each side.
#
# Returns: a list of histogram objects (output of hist()), indexed by group.
.histogramsWithCommonBreaks <- function(data, groups, discardProp = 0.08) {
  if (is.list(data)) {
    # Drop empty groups, flatten, and relabel groups 1..nGroups.
    lengths <- sapply(data, length)
    data <- data[lengths > 0]
    lengths <- sapply(data, length)
    nGroups <- length(lengths)
    groups <- rep(c(1:nGroups), lengths)
    data <- unlist(data)
  }

  if (discardProp > 0) {
    # get rid of outliers on either side - those are most likely not interesting.
    # The code is somewhat involved because I want to get rid of outliers that are defined with respect to
    # the combined data, but no more than a certain proportion of either of the groups.

    sizes <- table(groups)
    nAll <- length(data)
    order <- order(data)
    ordGrp <- groups[order]
    # cs[i] is the within-group empirical quantile of data point i (in sorted
    # order), so thresholds below are per-group, not global.
    cs <- rep(0, nAll)
    nGroups <- length(sizes)
    for (g in 1:nGroups) {
      cs[ordGrp == g] <- ((1:sizes[g]) - 0.5) / sizes[g]
    }

    # Lowest kept value: smallest data point above the discardProp quantile
    # of its own group.
    firstKeep <- min(which(cs > discardProp))
    first <- data[order[firstKeep]]

    # Analogous upper quantile
    lastKeep <- max(which(cs < 1 - discardProp))
    last <- data[order[lastKeep]]

    keep <- ((data >= first) & (data <= last))
    data <- data[keep]
    groups <- groups[keep]
  } else {
    last <- max(data, na.rm = TRUE)
    first <- min(data, na.rm = TRUE)
  }

  # Find the smaller of the two groups and define histogram bin size from the number of elements in that
  # group; the aim is to prevent the group getting splintered into too many bins of the histogram.

  sizes <- table(groups)
  # NOTE(review): which.min returns a position in `sizes`; the comparison
  # `groups == smallerInd` below assumes group labels are exactly 1..nGroups
  # (true for list input above) — confirm for direct vector input.
  smallerInd <- which.min(sizes)
  smallerSize <- sizes[smallerInd]
  # Bin count grows roughly as sqrt(group size) once the group exceeds 25.
  nBins <- ceiling(5 + ifelse(smallerSize > 25, sqrt(smallerSize) - 4, 1))

  smaller <- data[groups == smallerInd]
  binSize <- (max(smaller) - min(smaller)) / nBins

  # Extend the same bin width over the full (trimmed) data range.
  nAllBins <- ceiling((last - first) / binSize)
  breaks <- first + c(0:nAllBins) * (last - first) / nAllBins

  tapply(data, groups, hist, breaks = breaks, plot = FALSE)
}



# Decide whether two branches (column indices into expr) look like a single
# module or two separate modules. The branch membership vector is regressed on
# the leading principal components of the combined data; histograms of the
# fitted values for the two branches are then compared: two well-separated
# peaks with a deep trough between them indicate two modules.
#
# Caution: highly experimental. For speed, no argument checking is done: expr
# is assumed scaled, branch1/branch2 valid non-overlapping indices, and
# nConsideredPCs not exceeding the dimensions of expr.
#
# Returns: if getDetails, a list(middleCounts, criterion, split, histograms);
# otherwise just `split`, which is -1 when the branch peaks overlap (single
# module) and approaches 1 when the trough between the peaks is deep (two
# modules).
branchSplit <- function(expr, branch1, branch2, discardProp = 0.05, minCentralProp = 0.75,
                        nConsideredPCs = 3, signed = FALSE, getDetails = TRUE, ...) {
  nGenes <- c(length(branch1), length(branch2))
  # combined = cbind(expr[, branch1], expr[, branch2]);
  # Scale each branch by 1/sqrt(size) so both branches contribute comparably
  # to the combined svd regardless of their sizes.
  combinedScaled <- cbind(expr[, branch1] / sqrt(length(branch1)), expr[, branch2] / sqrt(length(branch2)))
  groups <- c(rep(1, nGenes[1]), rep(2, nGenes[2]))

  # get the combination of PCs that best approximates the groups vector
  svd <- svd(combinedScaled, nu = 0, nv = nConsideredPCs)
  # Undo the per-branch scaling on the right singular vectors.
  v2 <- svd$v * c(rep(sqrt(length(branch1)), length(branch1)), rep(sqrt(length(branch2)), length(branch2)))

  # svd = svd(combinedScaled, nu = nConsideredPCs, nv = 0);
  # v2 = cor(combinedScaled, svd$u);

  # In unsigned networks the sign of a gene's loading is not meaningful; fold
  # all loadings to the sign of the first PC loading.
  if (!signed) v2 <- v2 * sign(v2[, 1])

  # Fitted branch membership from a linear model of groups on the PC loadings.
  cor2 <- predict(lm(groups ~ ., data = as.data.frame(v2)))

  # get the histograms of the projections in both groups, but make sure the binning is the same for both.

  # get rid of outliers on either side - those are most likely not interesting.
  # The code is somewhat involved because I want to get rid of outliers that are defined with respect to
  # the combined data, but no more than a certain proportion of either of the groups.

  h <- .histogramsWithCommonBreaks(cor2, groups, discardProp)

  # Normalize both histograms by the overall maximum count.
  maxAll <- max(c(h[[1]]$counts, h[[2]]$counts))
  h[[1]]$counts <- h[[1]]$counts / maxAll
  h[[2]]$counts <- h[[2]]$counts / maxAll
  max1 <- max(h[[1]]$counts)
  max2 <- max(h[[2]]$counts)
  minMax <- min(max1, max2)

  # Diagnostic plots, disabled.
  if (FALSE) {
    plot(h[[1]]$mids, h[[1]]$counts, type = "l")
    lines(h[[2]]$mids, h[[2]]$counts, type = "l", col = "red")
    lines(h[[2]]$mids, h[[1]]$counts + h[[2]]$counts, type = "l", col = "blue")
  }
  # Locate "central" bins: those whose scaled counts exceed a threshold.

  central <- list()
  central[[1]] <- h[[1]]$counts > minCentralProp * minMax
  central[[2]] <- h[[2]]$counts > minCentralProp * minMax

  # Do central bins overlap?

  overlap <- (min(h[[1]]$mids[central[[1]]]) <= max(h[[2]]$mids[central[[2]]])) &
    (min(h[[2]]$mids[central[[2]]]) <= max(h[[1]]$mids[central[[1]]]))

  if (overlap) {
    # Peaks overlap: looks like a single module.
    result <- list(middleCounts = NULL, criterion = minCentralProp, split = -1, histograms = h)
  } else {
    # Locate the region between the two central regions and check whether the gap is deep enough.
    if (min(h[[1]]$mids[central[[1]]]) > max(h[[2]]$mids[central[[2]]])) {
      left <- 2
      right <- 1
    } else {
      left <- 1
      right <- 2
    }
    leftEdge <- max(h[[left]]$mids[central[[left]]])
    rightEdge <- min(h[[right]]$mids[central[[right]]])
    middle <- ((h[[left]]$mids > leftEdge) & (h[[left]]$mids < rightEdge))
    nMiddle <- sum(middle)
    if (nMiddle == 0) {
      # Peaks are adjacent with no gap bins between them.
      result <- list(middleCounts = NULL, criterion = minCentralProp, split = -1, histograms = h)
    } else {
      # Reference level: 75th percentile of the central region of the smaller branch
      # refLevel1 = quantile(h[[1]]$counts [ central[[1]] ], prob = 0.75);
      # refLevel2 = quantile(h[[2]]$counts [ central[[2]] ], prob = 0.75)
      refLevel1 <- mean(h[[1]]$counts [central[[1]]], na.rm = TRUE)
      refLevel2 <- mean(h[[2]]$counts [central[[2]]], na.rm = TRUE)
      peakRefLevel <- min(refLevel1, refLevel2)

      # Trough level: mean of combined counts in the gap, with a small-sample
      # correction factor for very few middle bins.
      middleCounts <- h[[left]]$counts[middle] + h[[right]]$counts[middle]
      # troughRefLevel = quantile(middleCounts, prob = 0.25)
      troughRefLevel <- mean(middleCounts, na.rm = TRUE)
      meanCorrFactor <- sqrt(min(nMiddle + 1, 3) / min(nMiddle, 3))
      # =sqrt(2, 3/2, 1), for nMiddle=1,2,3,..
      result <- list(
        middleCounts = middleCounts,
        criterion = troughRefLevel * meanCorrFactor,
        split = (peakRefLevel - troughRefLevel * meanCorrFactor) / peakRefLevel,
        histograms = h
      )
    }
  }
  if (getDetails) result else result$split
}



# ==========================================================================================================
#
# Dissimilarity-based branch split
#
# ==========================================================================================================

# Column-wise mean of the entries of `mat` that fall inside a per-column range.
#
# Args:
#   mat:      numeric matrix.
#   rangeMat: matrix with one row per column of mat; rangeMat[c, 1:2] give the
#             inclusive lower/upper bounds for column c.
#
# Returns: numeric vector of length ncol(mat); NaN for columns with no entries
# in range. Handles 0-column input (returns numeric(0)); the previous
# `for (c in 1:nc)` loop failed in that case.
.meanInRange <- function(mat, rangeMat) {
  vapply(seq_len(ncol(mat)), function(c) {
    col <- mat[, c]
    mean(col[col >= rangeMat[c, 1] & col <= rangeMat[c, 2]], na.rm = TRUE)
  }, numeric(1))
}

# Size-adjusted quantile: for groups smaller than minNumber/p, inflate the
# probability p so that roughly minNumber elements still fall below the
# quantile; the result is capped at 1.
#
# Args:
#   p:         base quantile probability.
#   sizes:     group sizes.
#   minNumber: minimum desired number of elements below the quantile.
#
# Returns: vector of adjusted probabilities, one per element of sizes.
.sizeDependentQuantile <- function(p, sizes, minNumber = 5) {
  refSize <- minNumber / p
  shrink <- pmin(1, sizes / refSize)
  pmin(1, p / shrink)
}


# Dissimilarity-based branch split. For each branch, select the points that
# lie "nearest" to the other branch (by average cross-branch dissimilarity)
# and measure how well their within-branch distances separate from their
# cross-branch distances. A large separation in either branch indicates two
# distinct modules.
#
# Args:
#   dissimMat:        dissimilarity matrix indexable by branch1/branch2.
#   branch1, branch2: index vectors of the two branches.
#   upperP:           quantile defining the "near the other branch" selection;
#                     inflated for small branches via .sizeDependentQuantile.
#   minNumberInSplit: minimum desired number of selected points per branch.
#   getDetails:       if TRUE, also return the underlying average distances.
#
# Returns: a non-negative separation score (0 = no separation), or a detailed
# list when getDetails = TRUE.
#
# NOTE(review): mtd.mapply is a project function and colQuantiles presumably
# comes from matrixStats — confirm.
branchSplit.dissim <- function(dissimMat, branch1, branch2, upperP,
                               minNumberInSplit = 5, getDetails = FALSE, ...) {
  lowerP <- 0

  sizes <- c(length(branch1), length(branch2))
  upperP <- .sizeDependentQuantile(upperP, sizes, minNumber = minNumberInSplit)

  # Per-branch probability pairs (0, upperP) for the column quantiles below.
  multiP <- as.data.frame(rbind(rep(0, 2), upperP))

  # Cross-branch dissimilarities: [[1]] rows = branch2, cols = branch1;
  # [[2]] rows = branch1, cols = branch2 (multiData format).
  outDissim <- list(
    list(data = dissimMat[branch2, branch1]),
    list(data = dissimMat[branch1, branch2])
  )

  # For each point, average its dissimilarity to the nearest (bottom upperP
  # quantile) points of the other branch.
  quantiles <- mtd.mapply(colQuantiles, outDissim, probs = multiP, MoreArgs = list(drop = FALSE))
  averages <- mtd.mapply(.meanInRange, outDissim, quantiles)

  # Select the points whose average cross-branch distance is in the bottom
  # upperP fraction of their branch — those are "between" the branches.
  averageQuantiles <- mtd.mapply(quantile, averages, prob = multiP, MoreArgs = list(drop = FALSE))

  betweenQuantiles <- mtd.mapply(
    function(x, quantiles) {
      x >= quantiles[1] & x <= quantiles[2]
    },
    averages, averageQuantiles
  )

  # Distances of all points of each branch to the selected points:
  # [[1]]/[[3]] within-branch, [[2]]/[[4]] cross-branch.
  selectedDissim <- list(
    list(data = dissimMat[branch1, branch1[betweenQuantiles[[1]]$data]]),
    list(data = dissimMat[branch2, branch1[betweenQuantiles[[1]]$data]]),
    list(data = dissimMat[branch2, branch2[betweenQuantiles[[2]]$data]]),
    list(data = dissimMat[branch1, branch2[betweenQuantiles[[2]]$data]])
  )

  # n1 = length(branch1);
  # m1 = sum(betweenQuantiles[[1]]$data);
  # indexMat = cbind((1:n1)[betweenQuantiles[[1]]$data], 1:m1);

  # Remove the points nearest to branch 2 from the distances in branch 1
  selectedDissim[[1]]$data[betweenQuantiles[[1]]$data, ] <- NA

  # n2 = length(branch2);
  # m2 = sum(betweenQuantiles[[2]]$data);
  # indexMat = cbind((1:n2)[betweenQuantiles[[2]]$data], 1:m2);

  selectedDissim[[3]]$data[betweenQuantiles[[2]]$data, ] <- NA

  # Probability pairs for the 4 matrices: (0, upperP) within, (upperP, 0)-
  # order swapped for the cross-branch entries.
  multiP.ext <- cbind(multiP, multiP[, c(2, 1)])

  selectedDissimQuantiles <- mtd.mapply(colQuantiles, selectedDissim,
    probs = multiP.ext,
    MoreArgs = list(drop = FALSE, na.rm = TRUE)
  )

  # Average "trimmed" distance of each selected point to its own branch and
  # to the other branch.
  selectedAverages <- mtd.mapply(.meanInRange, selectedDissim, selectedDissimQuantiles)

  # Diagnostic plots, disabled.
  if (FALSE) {
    par(mfrow = c(1, 2))
    verboseBoxplot(c(selectedAverages[[1]]$data, selectedAverages[[2]]$data),
      c(
        rep("in", length(selectedAverages[[1]]$data)),
        rep("out", length(selectedAverages[[2]]$data))
      ),
      main = "branch 1",
      xlab = "", ylab = "mean distance"
    )
    verboseBoxplot(c(selectedAverages[[3]]$data, selectedAverages[[4]]$data),
      c(
        rep("in", length(selectedAverages[[3]]$data)),
        rep("out", length(selectedAverages[[4]]$data))
      ),
      main = "branch 2",
      xlab = "", ylab = "mean distance"
    )
  }

  # Effect-size-like separation of two samples: |mean difference| divided by
  # the sum of standard deviations; 0 when either sample is empty or degenerate,
  # Inf when both variances are 0 but the means differ.
  separation <- function(x, y) {
    nx <- length(x)
    ny <- length(y)
    if (nx * ny == 0) {
      return(0)
    }

    mx <- mean(x, na.rm = TRUE)
    my <- mean(y, na.rm = TRUE)

    if (!is.finite(mx) | !is.finite(my)) {
      return(0)
    }

    # NOTE(review): the conditions are asymmetric (nx > 1 vs ny > 0); for
    # ny == 1, var() yields NA which the is.na check below resets to 0, so the
    # end result matches the nx handling — likely a benign typo.
    if (nx > 1) varx <- var(x, na.rm = TRUE) else varx <- 0
    if (ny > 0) vary <- var(y, na.rm = TRUE) else vary <- 0

    if (is.na(varx)) varx <- 0
    if (is.na(vary)) vary <- 0

    if (varx + vary == 0) {
      if (my == mx) {
        return(0)
      } else {
        return(Inf)
      }
    }
    out <- abs(my - mx) / (sqrt(varx) + sqrt(vary))
    if (is.na(out)) out <- 0
    out
  }

  # Separation of within- vs cross-branch distances, per branch.
  separations <- c(
    separation(selectedAverages[[1]]$data, selectedAverages[[2]]$data),
    separation(selectedAverages[[3]]$data, selectedAverages[[4]]$data)
  )

  out <- max(separations, na.rm = TRUE)
  if (is.na(out)) out <- 0
  if (out < 0) out <- 0
  if (getDetails) {
    return(list(
      split = out,
      distances = list(
        within1 = selectedAverages[[1]]$data,
        from1to2 = selectedAverages[[2]]$data,
        within2 = selectedAverages[[3]]$data,
        from2to1 = selectedAverages[[4]]$data
      )
    ))
  }
  out
}

# ========================================================================================================
#
# Branch dissimilarity based on a series of alternate branch labels
#
# ========================================================================================================

# this function measures the split of branch1 and branch2 based on alternate labels, typically derived from
# resampled or otherwise perturbed data (but could also be derived from an independent data set).

# Basic idea: if two branches are separate, their membership should be predictable from the alternate
# labels.

# This function takes the l-th stability labels, finds ones that overlap with both branches, and for each
# label calculates the contribution to similarity as
#      r1 = sum(lab1==cl)/n1;
#      r2 = sum(lab2==cl)/n2;
#      sim = sim + min(r1, r2)

# This will penalize similarity of a small and large module if the large module is a composite of several
# branches, only few of which overlap with the small module.


# This method is invariant under splitting of alternate module as long as the branch to which the modules are
# assigned does not change. So in this sense the splitting settings in the resampling study shouldn't
# matter too much but to some degree they still do.

# stabilityLabels: a matrix of dimensions (nGenes) x (number of alternate labels)

# Branch dissimilarity from alternate ("stability") labels: for each label
# column, each cluster level present in both branches contributes
# min(fraction of branch1 in level, fraction of branch2 in level) to the
# similarity; the result is 1 - average similarity across label columns.
#
# Args:
#   branch1, branch2: row indices of the two branches into stabilityLabels.
#   stabilityLabels:  matrix of dimensions (nGenes) x (number of alternate labels).
#   ignoreLabels:     label values to ignore (e.g. 0 = unassigned).
#
# Returns: a dissimilarity in [0, 1]; near 1 when the branches rarely share
# stability clusters (i.e. look like two modules).
branchSplitFromStabilityLabels <- function(branch1, branch2,
                                           stabilityLabels, ignoreLabels = 0, ...) {
  nLabels <- ncol(stabilityLabels)
  n1 <- length(branch1)
  n2 <- length(branch2)

  sim <- 0
  # seq_len handles nLabels == 0 cleanly (1:nLabels would iterate c(1, 0)).
  for (l in seq_len(nLabels))
  {
    lab1 <- stabilityLabels[branch1, l]
    lab2 <- stabilityLabels[branch2, l]
    commonLevels <- setdiff(intersect(unique(lab1), unique(lab2)), ignoreLabels)
    for (cl in commonLevels)
    {
      # Contribution is limited by the branch with the smaller overlap
      # fraction; this penalizes a large composite module overlapping a
      # small one through only a few of its clusters.
      r1 <- sum(lab1 == cl) / n1
      r2 <- sum(lab2 == cl) / n2
      sim <- sim + min(r1, r2)
    }
  }

  1 - sim / nLabels
}

# Resurrect the old idea of prediction
# accuracy. For each overlap module, count the genes in the branch with which the module has smaller overlap
# and add it to the score for that branch. The final counts divided by number of genes on branch give a
# "indistinctness" score; take the larger of the two indistinctness scores and call this the similarity.

# This method is still more or less invariant under splitting of the stability modules, as long as the
# splitting is random with respect to the two branches.

## Note that one could in principle run a chisq.test on the table of labels corresponding to branch1 and
## branch2 vs. stabilityLabels restricted to branch1 and branch2,

# The problem here is that small changes in stability labels could make a big difference in final
# (dis)similarity when one module is large and the other small. Assume a few of the stability labels cover
# small and a part of the large module; other stability labels cover the rest of the large module. If the
# common stability labels cover a bit more of large than small module, similarity will be high; if they
# switch more to the smaller module, similarity could be near zero.

# In summary, this function may be used for experiments but should not be used in production setting.

# Prediction-accuracy variant of the stability-label branch split. For each
# stability cluster shared by both branches, the genes of the branch with the
# *smaller* overlap are counted as "indistinct"; the larger of the two
# per-branch indistinctness fractions (doubled, capped at 1) is the
# similarity, and 1 - similarity is returned.
#
# Args:
#   branch1, branch2: row indices of the two branches into stabilityLabels.
#   stabilityLabels:  matrix (nGenes x number of alternate labelings).
#   ignoreLabels:     label values to ignore (e.g. 0 = unassigned).
#
# Returns: a dissimilarity in [0, 1]. Experimental; see notes above — should
# not be used in a production setting.
branchSplitFromStabilityLabels.prediction <- function(branch1, branch2,
                                                      stabilityLabels, ignoreLabels = 0, ...) {
  nLabels <- ncol(stabilityLabels)
  n1 <- length(branch1)
  n2 <- length(branch2)

  score1 <- 0
  score2 <- 0
  for (l in 1:nLabels)
  {
    lab1 <- stabilityLabels[branch1, l]
    lab2 <- stabilityLabels[branch2, l]
    shared <- setdiff(intersect(lab1, lab2), ignoreLabels)
    for (cl in shared)
    {
      overlap1 <- sum(lab1 == cl)
      overlap2 <- sum(lab2 == cl)
      # Credit the branch with the smaller overlap.
      if (overlap1 > overlap2) {
        score2 <- score2 + overlap2
      } else {
        score1 <- score1 + overlap1
      }
    }
  }

  indistinctness1 <- score1 / (n1 * nLabels)
  indistinctness2 <- score2 / (n2 * nLabels)
  similarity <- min(1, 2 * max(indistinctness1, indistinctness2))

  1 - similarity
}


# Third idea: for each branch, for each gene sum the fraction of the stability label (restricted to the two
# branches) that belongs to the branch. If this is relatively low, around 0.5, it means most elements are in
# non-discriminative stability labels.

# Individual-fraction variant of the stability-label branch split. For each
# gene, sum the fraction of its stability cluster (restricted to the two
# branches) that lies in the gene's own branch; genes in non-discriminative
# clusters contribute about 0.5, genes in branch-specific clusters about 1.
# The per-branch sums are rescaled to a "distinctness" in [0, 1] and the
# smaller of the two distinctness values is returned as the dissimilarity.
#
# Args:
#   branch1, branch2: row indices of the two branches into stabilityLabels.
#   stabilityLabels:  matrix (nGenes x number of alternate labelings).
#   ignoreLabels:     label values to ignore (e.g. 0 = unassigned).
#
# Returns: a dissimilarity; near 1 when both branches are distinct.
#
# Fixes vs previous version: misspelled local `distinctenss2` renamed;
# unconditional debug printFlush commented out (consistent with the sibling
# functions above); seq_len used so nLabels == 0 does not error.
branchSplitFromStabilityLabels.individualFraction <- function(branch1, branch2,
                                                              stabilityLabels, ignoreLabels = 0, ...) {
  nLabels <- ncol(stabilityLabels)
  n1 <- length(branch1)
  n2 <- length(branch2)

  s1 <- s2 <- 0
  for (l in seq_len(nLabels))
  {
    lab1 <- stabilityLabels[branch1, l]
    lab2 <- stabilityLabels[branch2, l]
    commonLevels <- setdiff(intersect(lab1, lab2), ignoreLabels)
    # Genes in clusters NOT shared between the branches contribute 1 each;
    # start from the full branch size and subtract shared-cluster members.
    s1.all <- n1
    s2.all <- n2
    for (cl in commonLevels)
    {
      o1 <- sum(lab1 == cl, na.rm = TRUE)
      o2 <- sum(lab2 == cl, na.rm = TRUE)
      o12 <- o1 + o2
      # Each member of a shared cluster contributes its branch's fraction of
      # the cluster, floored at 0.5.
      coef1 <- max(0.5, o1 / o12)
      coef2 <- max(0.5, o2 / o12)
      s2 <- s2 + o2 * coef2
      s1 <- s1 + o1 * coef1
      s1.all <- s1.all - o1
      s2.all <- s2.all - o2
    }
    s1 <- s1 + s1.all
    s2 <- s2 + s2.all
  }
  # Rescale from [0.5, 1] (mean per-gene contribution) to [0, 1].
  distinctness1 <- 2 * s1 / (n1 * nLabels) - 1
  distinctness2 <- 2 * s2 / (n2 * nLabels) - 1
  # printFlush(spaste("branchSplitFromStabilityLabels.individualFraction: returning ", dissim))

  min(distinctness1, distinctness2)
}
# milescsmith/WGCNA documentation built on April 11, 2024, 1:26 a.m.