R/correlationStatistics.R


#' Helper function to get p-value from value and distribution mean and standard deviation
#'
#' @param obs Observed value
#' @param ei Mean (Expected value)
#' @param sdi Standard deviation
#' @param alternative "two.sided", "less", or "greater"
#'
#' @return P-value
#'
#' @examples
#' # Probability of observing 5 while expecting 0 with a standard deviation of 1
#' getPv(5, 0, 1, 'greater')
#'
#' @export
#'
getPv <- function(obs, ei, sdi, alternative) {
  alternative <- match.arg(alternative, c("two.sided", "less", "greater"))
  pv <- pnorm(obs, mean = ei, sd = sdi)
  if (alternative == "two.sided") {
    if (obs <= ei) {
      pv <- 2 * pv
    } else {
      pv <- 2 * (1 - pv)
    }
  }
  if (alternative == "greater") {
    pv <- 1 - pv
  }
  # alternative == "less": keep the lower-tail probability from pnorm() unchanged
  return(pv)
}
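
## Illustrative sketch (not executed at load time): for alternative = 'greater'
## the helper simply returns the upper normal tail probability, so it should
## match pnorm() with lower.tail = FALSE.
if (FALSE) {
  all.equal(getPv(5, 0, 1, 'greater'),
            pnorm(5, mean = 0, sd = 1, lower.tail = FALSE))
}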


#' Spatial autocorrelation by Moran's I in C++
#'
#' @param x Feature value
#' @param weight Adjacency weight matrix
#' @param alternative "two.sided", "less", or "greater"
#'
#' @return Observed Moran's I statistic, the expected statistic under the null hypothesis of no spatial autocorrelation, the standard deviation under the null hypothesis, and p-value
#'
#' @examples
#' data(mOB)
#' pos <- mOB$pos
#' weight <- getSpatialNeighbors(pos)
#' gexp <- normalizeCounts(mOB$counts, log=FALSE, verbose=FALSE)['Camk4',]
#' moranTest(gexp, weight)
#'
#' @export
#'
moranTest <- function (x, weight, alternative = "greater") {

  if (nrow(weight) != ncol(weight)) {
    stop("'weight' must be a square matrix")
  }

  N <- length(x)
  if (nrow(weight) != N) {
    stop("'weight' must have as many rows as observations in 'x'")
  }

  if(sum(rownames(weight) %in% names(x)) != nrow(weight)) {
    stop('Names in feature vector and adjacency weight matrix do not agree.')
  }
  x <- x[rownames(weight)]

  output <- moranTest_C(x, weight)
  output <- output[,1]
  names(output) <- c('observed', 'expected', 'sd')
  output['p.value'] <- getPv(output[1], output[2], output[3], alternative)
  return(output)
}
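
## Usage sketch (assuming the mOB example data used in the roxygen example above;
## not executed at load time): scan all genes for spatial autocorrelation and
## collect their p-values.
if (FALSE) {
  data(mOB)
  weight <- getSpatialNeighbors(mOB$pos)
  gexp <- normalizeCounts(mOB$counts, log = FALSE, verbose = FALSE)
  pvals <- apply(gexp, 1, function(g) moranTest(g, weight)['p.value'])
  head(sort(pvals))
}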


#' Spatial autocorrelation by Moran's I
#' DEPRECATED! Use moranTest()
#'
#' @param x Feature value
#' @param weight Adjacency weight matrix
#' @param alternative "two.sided", "less", or "greater"
#'
#' @return Observed Moran's I statistic, the expected statistic under the null hypothesis of no spatial autocorrelation, the standard deviation under the null hypothesis, and p-value
#'
moranTest_DEPRECATED <- function (x, weight, alternative = "greater") {

  warning('DEPRECATED! Use the faster moranTest()')

  if (nrow(weight) != ncol(weight)) {
    stop("'weight' must be a square matrix")
  }

  N <- length(x)
  if (nrow(weight) != N) {
    stop("'weight' must have as many rows as observations in 'x'")
  }

  if(sum(rownames(weight) %in% names(x)) != nrow(weight)) {
    stop('Names in feature vector and adjacency weight matrix do not agree.')
  }
  x <- x[rownames(weight)]

  # first moment
  ei <- -1/(N - 1)

  # unitization
  rs <- rowSums(weight)
  rs[rs == 0] <- 1
  weight <- weight/rs

  # Moran's I
  W <- sum(weight)
  z <- x - mean(x)
  cv <- sum(weight * z %o% z)
  v <- sum(z^2)
  obs <- (N/W) * (cv/v)

  # second moment
  W.sq <- W^2
  N.sq <- N^2
  S1 <- 0.5 * sum((weight + t(weight))^2)
  S2 <- sum((apply(weight, 1, sum) + apply(weight, 2, sum))^2)
  S3 <- (sum(z^4)/N)/(v/N)^2
  S4 <- (N.sq - 3*N + 3)*S1 - N*S2 + 3*W.sq
  S5 <- (N.sq - N)*S1 - 2*N*S2 + 6*W.sq
  ei2 <- (N*S4 - S3*S5)/((N - 1)*(N - 2)*(N - 3) * W.sq)

  # standard deviation
  sdi <- sqrt(ei2 - (ei)^2)

  # p-value
  pv <- getPv(obs, ei, sdi, alternative)

  return(c(observed = obs, expected = ei, sd = sdi, p.value=pv))
}
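
## Consistency sketch (assuming the mOB example data as in the examples above;
## not executed at load time): the deprecated pure-R implementation and the
## C++-backed moranTest() should agree up to numerical precision.
if (FALSE) {
  data(mOB)
  weight <- getSpatialNeighbors(mOB$pos)
  gexp <- normalizeCounts(mOB$counts, log = FALSE, verbose = FALSE)['Camk4',]
  all.equal(moranTest(gexp, weight),
            suppressWarnings(moranTest_DEPRECATED(gexp, weight)))
}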


#' Derive Moran's I p-value using permutation testing
#'
#' @param z Feature value
#' @param w Adjacency weight matrix
#' @param alternative "two.sided", "less", or "greater"
#' @param N Number of permutations
#' @param seed Random seed
#' @param ncores Number of cores for parallel processing
#' @param plot Plot permuted distribution
#' @param ... Additional parameters to pass to histogram plotting
#'
#' @return Observed Moran's I statistic, the expected statistic under the null hypothesis of no spatial autocorrelation, the standard deviation under the null hypothesis, permutation p-value, and number of permutations
#'
#' @examples
#' data(mOB)
#' pos <- mOB$pos
#' weight <- getSpatialNeighbors(pos)
#' gexp <- normalizeCounts(mOB$counts, log=FALSE, verbose=FALSE)['Camk4',]
#' moranPermutationTest(gexp, weight)
#'
#' @export
#'
moranPermutationTest <- function(z, w, alternative = "greater", N=1e4, seed=0, ncores=1, plot=FALSE, ...) {

  # Set seed for reproducibility
  set.seed(seed)

  # Compute Moran's I
  stat <- moranSimple(z, w)

  # Simulate null distribution
  sim <- unlist(parallel::mclapply(seq_len(N), function(i) {
    # resample feature values (with replacement) across locations to break any spatial structure
    foo <- sample(z, length(z), replace=TRUE)
    names(foo) <- names(z)
    moranSimple(foo, w)
  }, mc.cores=ncores))
  sim[is.nan(sim)] <- NA
  sim <- na.omit(sim)
  all <- c(stat, sim)

  # Compute permutation p-value
  alternative <- match.arg(alternative, c("two.sided", "less", "greater"))
  if(alternative == 'greater') {
    p.value <- mean(all >= stat)
  }
  if(alternative == 'less') {
    p.value <- mean(all <= stat)
  }
  if(alternative == 'two.sided') {
    p.value <- mean(abs(all) >= abs(stat))
  }

  # Plot null distribution
  if(plot) {
    hist(sim, sub=paste("p =", round(p.value, 4)), xlim=range(all), ...)
    abline(v = stat, col="red", lty=3, lwd=2)
  }

  results <- c('observed'=stat, 'expected'=mean(sim), 'sd'=sd(sim), 'p.value'=p.value, 'N'=N)
  return(results)
}
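
## Sketch (assuming the mOB example data as above; not executed at load time):
## the permutation p-value should be broadly consistent with the analytic
## p-value from moranTest(), though its resolution is limited to about 1/(N+1).
if (FALSE) {
  data(mOB)
  weight <- getSpatialNeighbors(mOB$pos)
  gexp <- normalizeCounts(mOB$counts, log = FALSE, verbose = FALSE)['Camk4',]
  moranTest(gexp, weight)['p.value']
  moranPermutationTest(gexp, weight, N = 1e3)['p.value']
}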

# Internal helper: compute only the observed Moran's I statistic (no moments or p-value)
moranSimple <- function(x, weight) {
  if (nrow(weight) != ncol(weight)) {
    stop("'weight' must be a square matrix")
  }

  N <- length(x)
  if (nrow(weight) != N) {
    stop("'weight' must have as many rows as observations in 'x'")
  }

  if(sum(rownames(weight) %in% names(x)) != nrow(weight)) {
    stop('Names in feature vector and adjacency weight matrix do not agree.')
  }
  x <- x[rownames(weight)]

  # unitization
  rs <- rowSums(weight)
  rs[rs == 0] <- 1
  weight <- weight/rs

  # Moran's I
  W <- sum(weight)
  z <- x - mean(x)
  cv <- sum(weight * z %o% z)
  v <- sum(z^2)
  obs <- (N/W) * (cv/v)

  return(obs)
}
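
## Consistency sketch (not executed at load time): moranSimple() computes only
## the observed statistic, so on the same input it should match the 'observed'
## entry returned by moranTest() (assuming the mOB example data as above).
if (FALSE) {
  data(mOB)
  weight <- getSpatialNeighbors(mOB$pos)
  gexp <- normalizeCounts(mOB$counts, log = FALSE, verbose = FALSE)['Camk4',]
  all.equal(unname(moranTest(gexp, weight)['observed']),
            moranSimple(gexp, weight))
}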


#' Calculates a spatial cross correlation for all pairs
#'
#' @param mat Matrix of feature values
#' @param weight Adjacency weight matrix
#'
#' @return Matrix of spatial cross correlations for all pairs
#'
#' @examples
#' data(mOB)
#' pos <- mOB$pos
#' weight <- getSpatialNeighbors(pos)
#' gexp <- normalizeCounts(mOB$counts, log=FALSE, verbose=FALSE)
#' sccMat <- spatialCrossCorMatrix(gexp[1:5,], weight)
#'
#' @export
#'
spatialCrossCorMatrix <- function(mat, weight) {

  if (nrow(weight) != ncol(weight)) {
    stop("'weight' must be a square matrix")
  }

  N <- ncol(mat)
  if (nrow(weight) != N) {
    stop("'weight' must have as many rows as observations in 'x'")
  }

  if(sum(rownames(weight) %in% colnames(mat)) != nrow(weight)) {
    stop('Names in feature vector and adjacency weight matrix do not agree.')
  }
  mat <- mat[, rownames(weight)]

  scor <- spatialCrossCorMatrix_C(as.matrix(mat), weight)
  colnames(scor) <- rownames(scor) <- rownames(mat)
  return(scor)
}
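
## Consistency sketch (assuming the C++ backend mirrors spatialCrossCor(), and
## using the mOB example data as above; not executed at load time): each entry
## of the returned matrix should match the corresponding pairwise statistic.
if (FALSE) {
  data(mOB)
  weight <- getSpatialNeighbors(mOB$pos)
  gexp <- normalizeCounts(mOB$counts, log = FALSE, verbose = FALSE)
  sccMat <- spatialCrossCorMatrix(gexp[1:5, ], weight)
  all.equal(unname(sccMat[1, 2]),
            spatialCrossCor(gexp[1, ], gexp[2, ], weight))
}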


#' Calculates spatial cross correlation for two features
#'
#' @param x Feature 1 value
#' @param y Feature 2 value
#' @param weight Adjacency weight matrix
#'
#' @return Spatial cross correlation statistic
#'
#' @examples
#' data(mOB)
#' pos <- mOB$pos
#' weight <- getSpatialNeighbors(pos)
#' gexp <- normalizeCounts(mOB$counts, log=FALSE, verbose=FALSE)
#' scc <- spatialCrossCor(gexp[1,], gexp[2,], weight)
#'
#' @export
#'
spatialCrossCor <- function(x, y, weight) {
  if(length(x) != length(y)) {
    stop("'x', and 'y' must be equal in length")
  }
  if (nrow(weight) != ncol(weight)) {
    stop("'weight' must be a square matrix")
  }
  N <- length(x)
  if (nrow(weight) != N) {
    stop("'weight' must have as many rows as observations in 'x' and 'y'")
  }

  if(sum(rownames(weight) %in% names(x)) != nrow(weight)) {
    stop('Names in feature vector and adjacency weight matrix do not agree.')
  }
  x <- x[rownames(weight)]
  y <- y[rownames(weight)]

  # scale weights
  rs <- rowSums(weight)
  rs[rs == 0] <- 1
  weight <- weight/rs

  # compute spatial cross correlation
  N <- length(x)
  W <- sum(weight)
  dx <- x - mean(x, na.rm=TRUE)
  dy <- y - mean(y, na.rm=TRUE)

  cv1 <- dx %o% dy
  cv2 <- dy %o% dx
  cv1[is.na(cv1)] <- 0
  cv2[is.na(cv2)] <- 0

  cv <- sum(weight * ( cv1 + cv2 ), na.rm=TRUE)
  v <- sqrt(sum(dx^2, na.rm=TRUE) * sum(dy^2, na.rm=TRUE))
  SCC <- (N/W) * (cv/v) / 2.0

  return(SCC)
}
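
## Sanity-check sketch (not executed at load time): the spatial cross-correlation
## of a feature with itself reduces algebraically to Moran's I of that feature,
## since both use the same row-normalized weights, so it should match
## moranSimple() on the mOB example data used above.
if (FALSE) {
  data(mOB)
  weight <- getSpatialNeighbors(mOB$pos)
  gexp <- normalizeCounts(mOB$counts, log = FALSE, verbose = FALSE)['Camk4',]
  all.equal(spatialCrossCor(gexp, gexp, weight), moranSimple(gexp, weight))
}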


#' Tests for inter-cell-type spatial cross-correlation between gene A in group A and gene B in group B
#'
#' @param gexpA Expression for gene A
#' @param gexpB Expression for gene B
#' @param groupA Cells in group A
#' @param groupB Cells in group B
#' @param weight Adjacency weight matrix
#'
#' @return Statistic describing how well expression of gene A in cells of group A correlates in space with expression of gene B in cells of group B
#'
#' @examples
#' # Simulate data
#' set.seed(0)
#' N <- 100
#' pos <- cbind(rnorm(N), rnorm(N))
#' rownames(pos) <- paste0('cell', 1:N)
#' colnames(pos) <- c('x', 'y')
#' weight <- getSpatialNeighbors(pos)
#' ctA <- sample(rownames(pos), N/2)
#' ctB <- setdiff(rownames(pos), ctA)
#' gexpA <- pos[,2]
#' gexpA[ctB] <- 0
#' gexpB <- pos[,2]
#' gexpB[ctA] <- 0
#' #plotEmbedding(pos, col=gexpA)
#' #plotEmbedding(pos, col=gexpB)
#' interCellTypeSpatialCrossCor(gexpA, gexpB, ctA, ctB, weight)
#' cor(gexpA, gexpB) # compare
#'
#' @export
#'
interCellTypeSpatialCrossCor <- function(gexpA, gexpB, groupA, groupB, weight) {
    # restrict to expression of gene A in group A
    # and expression of gene B in group B
    x <- gexpA
    y <- gexpB
    x[groupB] <- NA
    y[groupA] <- NA

    # scale weights
    rs <- rowSums(weight)
    rs[rs == 0] <- 1
    weight <- weight/rs

    # compute spatial cross correlation
    N <- length(x)
    W <- sum(weight)
    dx <- x - mean(x, na.rm=TRUE)
    dy <- y - mean(y, na.rm=TRUE)

    cv1 <- dx %o% dy
    cv2 <- dy %o% dx
    cv1[is.na(cv1)] <- 0
    cv2[is.na(cv2)] <- 0

    cv <- sum(weight * ( cv1 + cv2 ), na.rm=TRUE)
    v <- sqrt(sum(dx^2, na.rm=TRUE) * sum(dy^2, na.rm=TRUE))
    SCI <- (N/W) * (cv/v) / 2.0

    return(SCI)
}
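
## Relationship sketch (not executed at load time): the inter-cell-type statistic
## is the same computation as spatialCrossCor() with each gene masked (set to NA)
## outside its own group, assuming the inputs are named and ordered to match
## 'weight'. The simulation below mirrors the roxygen example above.
if (FALSE) {
  set.seed(0)
  N <- 100
  pos <- cbind(x = rnorm(N), y = rnorm(N))
  rownames(pos) <- paste0('cell', 1:N)
  weight <- getSpatialNeighbors(pos)
  ctA <- sample(rownames(pos), N/2)
  ctB <- setdiff(rownames(pos), ctA)
  gexpA <- pos[, 2]
  gexpA[ctB] <- 0
  gexpB <- pos[, 2]
  gexpB[ctA] <- 0
  xm <- gexpA
  xm[ctB] <- NA
  ym <- gexpB
  ym[ctA] <- NA
  all.equal(interCellTypeSpatialCrossCor(gexpA, gexpB, ctA, ctB, weight),
            spatialCrossCor(xm, ym, weight))
}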


#' Local indicators of spatial association
#'
#' @param x Feature value
#' @param weight Adjacency weight matrix
#' @param alternative "two.sided", "less", or "greater"
#'
#' @return Data frame of observed, expected, standard deviation, and p-value for each point
#'
#' @examples
#' data(mOB)
#' pos <- mOB$pos
#' weight <- getSpatialNeighbors(pos)
#' gexp <- normalizeCounts(mOB$counts, log=FALSE, verbose=FALSE)['Camk4',]
#' lisa <- lisaTest(gexp, weight)
#'
#' @export
#'
lisaTest <- function(x, weight, alternative = "greater") {

  if (nrow(weight) != ncol(weight)) {
    stop("'weight' must be a square matrix")
  }

  N <- length(x)
  if (nrow(weight) != N) {
    stop("'weight' must have as many rows as observations in 'x'")
  }

  if(sum(rownames(weight) %in% names(x)) != nrow(weight)) {
    stop('Names in feature vector and adjacency weight matrix do not agree.')
  }
  x <- x[rownames(weight)]

  # unitization
  rs <- rowSums(weight)
  rs[rs == 0] <- 1
  weight <- weight/rs

  # calculate Ii
  n <- length(x)
  xx <- mean(x, na.rm = TRUE)
  z <- x - xx
  s2 <- sum(z^2, na.rm = TRUE)/n
  lz <- apply(weight, 1, function(w) sum(w*z))
  Ii <- (z/s2) * lz

  Wi <- rowSums(weight)
  E.Ii <- -Wi/(n - 1)

  b2 <- (sum(z^4, na.rm = TRUE)/n)/(s2^2)
  Wi2 <- apply(weight, 1, function(w) sum(w^2))
  A <- (n - b2)/(n - 1)
  B <- (2 * b2 - n)/((n - 1) * (n - 2))
  Sd.Ii <- sqrt(A * Wi2 + B * (Wi^2 - Wi2) - E.Ii^2)

  # p-value
  pv <- getPv(Ii, E.Ii, Sd.Ii, alternative)

  return(data.frame(observed = Ii, expected = E.Ii, sd = Sd.Ii, p.value = pv))
}
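
## Relationship sketch (not executed at load time): with row-normalized weights
## and no isolated points (every row of 'weight' has at least one neighbor), the
## mean of the local statistics equals the global Moran's I, so points with large
## positive Ii are the ones driving the global autocorrelation.
if (FALSE) {
  data(mOB)
  weight <- getSpatialNeighbors(mOB$pos)
  gexp <- normalizeCounts(mOB$counts, log = FALSE, verbose = FALSE)['Camk4',]
  lisa <- lisaTest(gexp, weight)
  all.equal(mean(lisa$observed), moranSimple(gexp, weight))
}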


#' Signed local indicators of spatial association to identify regions driving global autocorrelation
#'
#' @param x Feature value
#' @param weight Adjacency weight matrix
#' @param alternative "two.sided", "less", or "greater"
#'
#' @return Signed LISA statistic for each point
#'
#' @examples
#' data(mOB)
#' pos <- mOB$pos
#' weight <- getSpatialNeighbors(pos)
#' gexp <- normalizeCounts(mOB$counts, log=FALSE, verbose=FALSE)['Camk4',]
#' plotEmbedding(pos, colors=gexp, cex=3)
#' slisa <- signedLisa(gexp, weight)
#' plotEmbedding(pos, colors=slisa, cex=3,
#'    gradientPalette=colorRampPalette(c('darkgreen', 'white', 'darkorange'))(100))
#'
#' @export
#'
signedLisa <- function(x, weight, alternative = "greater") {
  lisa <- lisaTest(x, weight, alternative)
  slisa <- sign(scale(x[rownames(lisa)], center=TRUE)[,1])*lisa$observed
  slisa <- slisa[rownames(weight)]
  return(slisa)
}


#' Tests for significance of spatial cross correlation for two features using toroidal shift null model
#'
#' @param x Feature 1 value
#' @param y Feature 2 value
#' @param pos Position
#' @param k Toroidal shift boxes
#' @param n Permutation iterations
#' @param ncores Number of cores for parallel processing
#' @param plot Plot permuted distribution
#' @param ... Additional parameters to pass to histogram plotting
#'
#' @return P-value
#'
#' @examples
#' \dontrun{
#' data(mOB)
#' pos <- mOB$pos
#' gexp <- normalizeCounts(mOB$counts, log=FALSE, verbose=FALSE)
#' pv1 <- spatialCrossCorTorTest(gexp['Gpsm1',], gexp['Nrgn',], pos)
#' pv2 <- spatialCrossCorTorTest(gexp['Gpsm1',], gexp['Glul',], pos)
#' pv3 <- spatialCrossCorTorTest(gexp[1,], gexp[2,], pos)
#' }
#'
#' @export
#'
spatialCrossCorTorTest <- function(x, y, pos, k=4, n=1000, ncores=1, plot=FALSE, ...) {
  # compute statistic
  w <- suppressMessages(suppressWarnings(getSpatialNeighbors(pos)))
  I <- spatialCrossCor(x,y,w)

  # compute background
  bg <- unlist(parallel::mclapply(seq_len(n), function(i) {
    # shift pos
    randpos <- rtorShift(pos,k=k,seed=i)
    w <- suppressMessages(suppressWarnings(getSpatialNeighbors(randpos)))
    spatialCrossCor(x,y,w)
  }, mc.cores=ncores))
  bg <- c(bg, I)

  if(plot) {
    hist(bg, breaks=n/10, ...)
    abline(v=I, col='red')
  }

  # P-value: fraction of permuted statistics at least as extreme as the observed statistic
  pvalue <- sum(abs(bg) >= abs(I)) / (n+1)
  return(pvalue)
}


#' Tests for significance of spatial cross correlation for two features using random label null model
#'
#' @param x Feature 1 value
#' @param y Feature 2 value
#' @param w Binary weight matrix
#' @param n Permutation iterations
#' @param ncores Number of cores for parallel processing
#' @param plot Plot permuted distribution
#' @param ... Additional parameters to pass to histogram plotting
#'
#' @return Two-sided test p-value
#'
#' @examples
#' \dontrun{
#' data(mOB)
#' pos <- mOB$pos
#' w <- getSpatialNeighbors(pos)
#' gexp <- normalizeCounts(mOB$counts, log=FALSE, verbose=FALSE)
#' pv1 <- spatialCrossCorTest(gexp['Gpsm1',], gexp['Nrgn',], w)
#' pv2 <- spatialCrossCorTest(gexp['Gpsm1',], gexp['Glul',], w)
#' pv3 <- spatialCrossCorTest(gexp[1,], gexp[2,], w)
#' }
#' @export
#'
spatialCrossCorTest <- function(x, y, w, n=1000, ncores=1, plot=FALSE, ...) {
  I <- spatialCrossCor(x,y,w)

  # compute background
  bg <- unlist(parallel::mclapply(seq_len(n), function(i) {
    set.seed(i)
    xbg <- sample(x)
    names(xbg) <- names(x)
    spatialCrossCor(xbg,y,w)
  }, mc.cores=ncores))
  bg <- c(bg, I)

  if(plot) {
    hist(bg, breaks=n/10, ...)
    abline(v=I, col='red')
  }

  # P-value: fraction of permuted statistics at least as extreme as the observed statistic (two-sided test)
  pvalue <- sum(abs(bg) >= abs(I)) / (n+1)
  return(pvalue)
}