R/EvoWeaver-PPPreds.R

Defines functions PAPV.EvoWeaver GLDistance.EvoWeaver Behdenna.EvoWeaver ProfileDCA.EvoWeaver GLMI.EvoWeaver MutualInformationPA.EvoWeaver CorrGL.EvoWeaver PAOverlap.EvoWeaver PAJaccard.EvoWeaver Hamming.EvoWeaver ExtantJaccard.EvoWeaver PAPV PAOverlap PAJaccard GLDistance Behdenna ProfileDCA GLMI CorrGL Hamming ExtantJaccard

Documented in Behdenna.EvoWeaver CorrGL.EvoWeaver ExtantJaccard.EvoWeaver GLDistance.EvoWeaver GLMI.EvoWeaver Hamming.EvoWeaver PAJaccard.EvoWeaver PAOverlap.EvoWeaver

###### Presence/Absence Methods for EvoWeaver #####
# author: Aidan Lakshman
# contact: ahl27@pitt.edu
# TODO: clean up the naming here

#### Implemented Methods: ####
#  - Jaccard Distance at tips
#  - Hamming Distance
#  - Mutual Information at tips
#  - Mutual Information of gain/loss events
#  - Jaccard Distance, centered w/ conserved clades collapsed
#  - Inverse Potts Model
#  - Behdenna 2016 gain/loss
#  - Pearson correlation of gain/loss (w/ p-value)
#  - Distance between gain/loss events
#  - Overlap of ancestral state coverage on tree
##########################

#### S3 Generic Definitions ####
ExtantJaccard <- function(ew, ...) UseMethod('ExtantJaccard')
Hamming <- function(ew, ...) UseMethod('Hamming')
CorrGL <- function(ew, ...) UseMethod('CorrGL')
GLMI <- function(ew, ...) UseMethod('GLMI')
ProfileDCA <- function(ew, ...) UseMethod('ProfileDCA')
Behdenna <- function(ew, ...) UseMethod('Behdenna')
GLDistance <- function(ew, ...) UseMethod('GLDistance')
PAJaccard <- function(ew, ...) UseMethod("PAJaccard")
PAOverlap <- function(ew, ...) UseMethod("PAOverlap")
PAPV <- function(ew, ...) UseMethod('PAPV')
################################

ExtantJaccard.EvoWeaver <- function(ew, Subset=NULL, Verbose=TRUE,
                               precalcProfs=NULL, precalcSubset=NULL, ...){

  if (!is.null(precalcSubset))
    subs <- precalcSubset
  else
    subs <- ProcessSubset(ew, Subset)
  uvals <- subs$uvals
  evalmap <- subs$evalmap

  if ( is.null(precalcProfs) ){
    if (Verbose) cat('  Calculating PA Profiles...\n')
    pap <- PAProfiles(ew, uvals, Verbose=Verbose)
  } else {
    pap <- precalcProfs
  }
  l <- length(uvals)
  n <- names(ew)
  if ( l == 1 ){
    mat <- matrix(1, nrow=1, ncol=1)
    rownames(mat) <- colnames(mat) <- n
    return(mat)
  }
  nr <- nrow(pap)
  pap[] <- as.integer(pap)
  ARGS <- list(nr=nr)
  FXN <- function(v1, v2, ARGS, ii, jj) {
    return(.Call("calcScoreJaccard", v1, v2, ARGS$nr, PACKAGE="SynExtend"))
  }
  pairscores <- BuildSimMatInternal(pap, uvals, evalmap, l, n, FXN, ARGS, Verbose)

  return(pairscores)
}

Hamming.EvoWeaver <- function(ew, Subset=NULL, Verbose=TRUE,
                               precalcProfs=NULL, precalcSubset=NULL, ...){
  if (!is.null(precalcSubset))
    subs <- precalcSubset
  else
    subs <- ProcessSubset(ew, Subset)
  uvals <- subs$uvals
  evalmap <- subs$evalmap
  if ( is.null(precalcProfs) ){
    if (Verbose) cat('  Calculating PA Profiles...\n')
    pap <- PAProfiles(ew, Verbose=Verbose)
  } else {
    pap <- precalcProfs
  }
  l <- length(uvals)
  n <- names(ew)
  if ( l == 1 ){
    mat <- simMat(1, nelem=1)
    rownames(mat) <- colnames(mat) <- n
    return(mat)
  }
  nr <- nrow(pap)
  pairscores <- rep(NA_real_, l*(l-1)/2)
  #nc <- ncol(pap)
  pap[] <- as.integer(pap)
  ARGS <- list(nr=nr)
  FXN <- function(v1, v2, ARGS, ii, jj) {
    return(.Call("calcScoreHamming", v1, v2, ARGS$nr, 1, PACKAGE="SynExtend"))
  }
  pairscores <- BuildSimMatInternal(pap, uvals, evalmap, l, n, FXN, ARGS, Verbose)

  return(pairscores)
}

PAJaccard.EvoWeaver <- function(ew, Subset=NULL, Verbose=TRUE,
                                    MySpeciesTree=NULL,
                                    precalcProfs=NULL, precalcSubset=NULL,
                                    CombinePVal=TRUE, ...){
  if (!is.null(precalcSubset))
    subs <- precalcSubset
  else
    subs <- ProcessSubset(ew, Subset)
  uvals <- subs$uvals
  evalmap <- subs$evalmap
  if (is.null(MySpeciesTree)){
    stopifnot("Method 'GLDistance' requires a species tree"=attr(ew, 'useMT'))
    if (Verbose) cat('Calculating Species Tree...\n')
    MySpeciesTree <- findSpeciesTree(ew, Verbose)
  }

  if ( is.null(precalcProfs) ){
    if (Verbose) cat('  Calculating PA Profiles...\n')
    pap <- PAProfiles(ew, uvals, Verbose=Verbose, speciesList=labels(MySpeciesTree), ...)
  } else {
    pap <- precalcProfs
  }
  l <- length(uvals)
  n <- names(ew)
  if ( l == 1 ){
    v <- ifelse(CombinePVal, 1, 1+1i)
    mat <- simMat(v, nelem=1)
    rownames(mat) <- colnames(mat) <- n
    return(mat)
  }

  # Initialize Dendrogram in C
  y <- .Call("initCDend", MySpeciesTree, PACKAGE="SynExtend")
  on.exit(rm(y))
  numnodes <- .Call("getTreeNodesCount", y, PACKAGE="SynExtend")
  glvs <- matrix(NA_integer_, nrow=numnodes, ncol=l)
  rn <- rownames(pap)

  # Calculate Gain/Loss Vectors
  if (Verbose) cat('  Calculating ancestral states:\n')
  if (Verbose) pb <- txtProgressBar(max=ncol(pap), style=3)
  for (i in seq_len(l)){
    v <- rn[pap[,i]]
    if (length(v) == 0){
      glv <- rep(0L, numnodes)
    } else {
      glv <- .Call("calcGainLoss", y, v, FALSE, PACKAGE="SynExtend")
    }
    glvs[,i] <- glv
    if (Verbose) setTxtProgressBar(pb, i)
  }
  if(Verbose) cat("\n")

  ARGS <- list(numnodes=numnodes, y=y)
  FXN <- function(v1, v2, ARGS, ii, jj) {
    if(all(v1[1]==v1) || all(v2[1]==v2)){
      # catch case where standard deviation is 0
      # (all elements of vector the same)
      return(ifelse(CombinePVal, 0, 0+0i))
    }
    y <- ARGS$y
    # 00 = 1, 01 = 2, 10 = 3, 11 = 4
    # 0 reserved for "ignore me"
    v1 <- 2L*v1 + v2 + 1L
    v1 <- .Call("cladeCollapsePA", y, v1)
    jpd <- tabulate(v1, nbins=4L)
    ## now jpd is: 00 01 10 11
    #pval <- 1-fisher.test(jpd, alternative='two.sided')$p.value
    #jpd <- jpd / sum(jpd)

    # now do we use Jaccard, or MI?
    # Jaccard does have a nice p-value we could use
    # see Chung et al. 2019
    # note indices are reversed relative to paper
    m <- sum(jpd)
    jaccard_index <- ifelse(sum(jpd[2:4]) == 0, 0, jpd[4] / sum(jpd[2:4]))
    p_i <- sum(jpd[3:4]) / m
    p_j <- sum(jpd[c(2,4)]) / m
    q1 <- p_i * p_j
    q2 <- p_i + p_j - 2*p_i*p_j
    expected_jaccard <- ifelse(q2+q1 == 0, 0, q1 / (q2+q1))
    centered_jaccard <- jaccard_index - expected_jaccard

    # Bootstrap p-value calculation
    # note that this is better than asymptotics,
    # which are anti-conservative estimators
    NUM_BOOTSTRAP <- min(m*5L, 1000L)
    NUM_BOOTSTRAP <- max(NUM_BOOTSTRAP, 5L)
    bootstrap_replicates <- numeric(NUM_BOOTSTRAP)
    for(iter in seq_along(bootstrap_replicates)){
      s <- (rbinom(m, 1, p_i) * 2) + (rbinom(m, 1, p_j) + 1L)
      s <- tabulate(s, nbins=4)
      jv <- s[4] / sum(s[2:4])
      if(is.nan(jv) || is.na(jv)) jv <- 0
      bootstrap_replicates[iter] <- jv - expected_jaccard
    }

    p_value_left <- sum(bootstrap_replicates >= centered_jaccard) / NUM_BOOTSTRAP
    p_value_right <- sum(bootstrap_replicates <= centered_jaccard) / NUM_BOOTSTRAP
    p_value <- 2*min(p_value_left, p_value_right)

    weighting <- 1-sqrt(p_value)
    # the centered jaccard is bounded on [-1/3,2/3] due to how it's calculated
    if(centered_jaccard < 0) centered_jaccard <- centered_jaccard * 3
    else centered_jaccard <- centered_jaccard * 3/2

    # clip to [-1,1] just in case
    if(centered_jaccard > 1) centered_jaccard <- 1
    if(centered_jaccard < -1) centered_jaccard <- -1

    return(ifelse(CombinePVal, weighting * centered_jaccard,
      complex(real=centered_jaccard, imaginary=weighting)))
  }
  pairscores <- BuildSimMatInternal(glvs, uvals, evalmap, l, n, FXN, ARGS, Verbose, CombinePVal)
  Diag(pairscores) <- ifelse(CombinePVal, 1, 1+1i)
  return(pairscores)
}


PAOverlap.EvoWeaver <- function(ew, Subset=NULL, Verbose=TRUE,
                                MySpeciesTree=NULL,
                               precalcProfs=NULL, precalcSubset=NULL,
                               CombinePVal=TRUE, ...){
  N_PERM <- 200L
  if (!is.null(precalcSubset))
    subs <- precalcSubset
  else
    subs <- ProcessSubset(ew, Subset)
  uvals <- subs$uvals
  evalmap <- subs$evalmap
  if (is.null(MySpeciesTree)){
    stopifnot("Method 'PAOverlap' requires a species tree"=attr(ew, 'useMT'))
    if (Verbose) cat('Calculating Species Tree...\n')
    MySpeciesTree <- findSpeciesTree(ew, Verbose)
  }

  if ( is.null(precalcProfs) ){
    if (Verbose) cat('  Calculating PA Profiles...\n')
    pap <- PAProfiles(ew, uvals, Verbose=Verbose, speciesList=labels(MySpeciesTree), ...)
  } else {
    pap <- precalcProfs
  }
  l <- length(uvals)
  n <- names(ew)

  if ( l == 1 ){
    v <- ifelse(CombinePVal, 1, 1+1i)
    mat <- simMat(v, nelem=1)
    rownames(mat) <- colnames(mat) <- n
    return(mat)
  }
  # Initialize Dendrogram in C
  y <- .Call("initCDend", MySpeciesTree, PACKAGE="SynExtend")
  on.exit(rm(y))
  numnodes <- .Call("getTreeNodesCount", y, PACKAGE="SynExtend")
  rn <- rownames(pap)
  glvs <- a_states <- matrix(NA_integer_, nrow=numnodes, ncol=l)

  # Calculate Gain/Loss Vectors
  if (Verbose) cat('  Calculating gain/loss vectors:\n')
  if (Verbose) pb <- txtProgressBar(max=ncol(pap), style=3)
  for (i in seq_len(l)){
    v <- rn[pap[,i]]
    if (length(v) == 0){
      glv <- a_state <- rep(0L, numnodes)
    } else {
      glv <- .Call("calcGainLoss", y, v, TRUE, PACKAGE="SynExtend")
      a_state <- .Call("calcGainLoss", y, v, FALSE, PACKAGE='SynExtend')
    }
    glvs[,i] <- glv
    a_states[,i] <- a_state
    if (Verbose) setTxtProgressBar(pb, i)
  }
  glvs <- rbind(glvs, a_states)
  if(Verbose) cat("\n")

  all_lengths <- .Call("calcAllTreeLengths", y, PACKAGE='SynExtend')

  ARGS <- list(numnodes=numnodes, all_lens=all_lengths,
               total_len=sum(all_lengths), n_perm=N_PERM)
  FXN <- function(v1, v2, ARGS, ii, jj) {
    nn <- ARGS$numnodes
    pa1 <- v1[seq_len(nn)+nn]
    pa2 <- v2[seq_len(nn)+nn]
    if(sum(pa1) * sum(pa2) == 0) return(ifelse(CombinePVal, 0, 0+0i))
    glv1 <- v1[seq_len(nn)]
    glv2 <- v2[seq_len(nn)]

    total_len <- ARGS$total_len
    lens <- ARGS$all_lens
    n_perm <- ARGS$n_perm

    # ignore length 0 edges unless there's an event on one
    pos_real <- which(lens > 0 | (lens==0 & abs(glv1)+abs(glv2) != 0))
    pa1 <- pa1[pos_real]
    pa2 <- pa2[pos_real]
    glv1 <- glv1[pos_real]
    glv2 <- glv2[pos_real]
    lens <- lens[pos_real]
    numnodes <- length(pa1)

    # scoring
    time_len1 <- sum((pa1 - (0.5*abs(glv1))) * lens)
    time_len2 <- sum((pa2 - (0.5*abs(glv2))) * lens)
    denom <- time_len1 + time_len2
    if(denom == 0) denom <- Inf

    #weighting <- c(time_len1 / total_len, time_len2 / total_len)
    #weighting <- ifelse(any(weighting==0), 0, 2 / sum(1/sqrt(weighting)))
    pos_inter <- which(pa1*pa2==1 & glv1*glv2 != -1)
    # subsetting here means we won't have simultaneous G/L or L/G, only G/G and L/L
    inter_lens <- lens[pos_inter]
    p <- abs(glv1[pos_inter]) + abs(glv2[pos_inter])!=0
    inter_lens[p] <- inter_lens[p] * 0.5
    inter_lens <- 2*sum(inter_lens) / denom
    score <- inter_lens - (time_len1 * time_len2) / (total_len*total_len)
    ## commented lines change it from permutation testing to bootstrapping
    #probs_pa <- c(sum(pa1) / nn, sum(pa2)/nn)
    #probs_glv1 <- tabulate(glv1+2L, nbins=3) / nn
    #probs_glv2 <- tabulate(glv2+2L, nbins=3) / nn
    replicates <- numeric(n_perm)
    for(iter in seq_len(n_perm)){
      pa1 <- .C("shuffleRInt", pa1, numnodes, PACKAGE="SynExtend")[[1]]
      pa2 <- .C("shuffleRInt", pa2, numnodes, PACKAGE="SynExtend")[[1]]
      glv1 <- .C("shuffleRInt", glv1, numnodes, PACKAGE="SynExtend")[[1]]
      glv2 <- .C("shuffleRInt", glv2, numnodes, PACKAGE="SynExtend")[[1]]
      # pa1 <- rbinom(nn, 1, probs_pa[1])
      # pa2 <- rbinom(nn, 1, probs_pa[2])
      # glv1 <- sample(c(-1:1), nn, replace=TRUE, probs_glv1)
      # glv2 <- sample(c(-1:1), nn, replace=TRUE, probs_glv2)
      time_len1 <- sum((pa1 - (0.5*abs(glv1))) * lens)
      time_len2 <- sum((pa2 - (0.5*abs(glv2))) * lens)
      denom <- time_len1 + time_len2
      if(denom == 0) denom <- Inf
      pos_inter <- which((pa1-0.5*glv1) == (pa2-0.5*glv2) & pa1*pa2==1)
      inter_lens <- lens[pos_inter]
      p <- abs(glv1[pos_inter]) + abs(glv2[pos_inter])!=0
      inter_lens[p] <- inter_lens[p] * 0.5
      inter_lens <- 2*sum(inter_lens) / denom
      replicates[iter] <- abs(inter_lens - (time_len1 * time_len2) / (total_len*total_len))
    }
    p_lv <- sum(replicates <= score) / n_perm
    p_rv <- sum(replicates >= score) / n_perm
    p_v <- 2*min(c(p_lv, p_rv))
    if(p_v > 1) p_v <- 1
    p_v <- sqrt(1-p_v)
    return(ifelse(CombinePVal, p_v*score, complex(real=score, imaginary=p_v)))
    #return(sqrt(1-p_v)*score*weighting)
  }
  pairscores <- BuildSimMatInternal(glvs, uvals, evalmap, l, n, FXN, ARGS, Verbose, CombinePVal)
  Diag(pairscores) <- ifelse(CombinePVal, 1, 1+1i)
  return(pairscores)
}

CorrGL.EvoWeaver <- function(ew, Subset=NULL, Verbose=TRUE,
                                MySpeciesTree=NULL,
                               precalcProfs=NULL, precalcSubset=NULL,
                               CombinePVal=TRUE, ...){
  if (!is.null(precalcSubset))
    subs <- precalcSubset
  else
    subs <- ProcessSubset(ew, Subset)
  uvals <- subs$uvals
  evalmap <- subs$evalmap
  if (is.null(MySpeciesTree)){
    stopifnot("Method 'GLDistance' requires a species tree"=attr(ew, 'useMT'))
    if (Verbose) cat('Calculating Species Tree...\n')
    MySpeciesTree <- findSpeciesTree(ew, Verbose)
  }

  if ( is.null(precalcProfs) ){
    if (Verbose) cat('  Calculating PA Profiles...\n')
    pap <- PAProfiles(ew, uvals, Verbose=Verbose, speciesList=labels(MySpeciesTree), ...)
  } else {
    pap <- precalcProfs
  }
  l <- length(uvals)
  n <- names(ew)

  if ( l == 1 ){
    v <- ifelse(CombinePVal, 1, 1+1i)
    mat <- simMat(v, nelem=1)
    rownames(mat) <- colnames(mat) <- n
    return(mat)
  }
  # Initialize Dendrogram in C
  y <- .Call("initCDend", MySpeciesTree, PACKAGE="SynExtend")
  on.exit(rm(y))
  numnodes <- .Call("getTreeNodesCount", y, PACKAGE="SynExtend")
  rn <- rownames(pap)
  glvs <- matrix(NA_integer_, nrow=numnodes, ncol=l)

  # Calculate Gain/Loss Vectors
  if (Verbose) cat('  Calculating gain/loss vectors:\n')
  if (Verbose) pb <- txtProgressBar(max=ncol(pap), style=3)
  for (i in seq_len(l)){
    v <- rn[pap[,i]]
    if (length(v) == 0){
      glv <- rep(0L, numnodes)
    } else {
      glv <- .Call("calcGainLoss", y, v, TRUE, PACKAGE="SynExtend")
    }
    glvs[,i] <- glv
    if (Verbose) setTxtProgressBar(pb, i)
  }
  if(Verbose) cat("\n")

  ARGS <- list(numnodes=numnodes)
  FXN <- function(v1, v2, ARGS, ii, jj) {
    if(all(v1[1]==v1) || all(v2[1]==v2)){
      # catch case where standard deviation is 0
      # (all elements of vector the same)
      return(ifelse(CombinePVal, 0, 0+0i))
    }
    val <- cor(v1, v2)
    #pval <- 1 - pt(val, ARGS$numnodes - 2, lower.tail=FALSE)
    pval <- 1-fisher.test(v1, v2, simulate.p.value=TRUE)$p.value
    return(ifelse(CombinePVal, pval*val, complex(real=val, imaginary=pval)))
  }
  pairscores <- BuildSimMatInternal(glvs, uvals, evalmap, l, n, FXN, ARGS, Verbose, CombinePVal)
  Diag(pairscores) <- ifelse(CombinePVal, 1, 1+1i)
  return(pairscores)
}


MutualInformationPA.EvoWeaver <- function(ew, Subset=NULL, Verbose=TRUE,
                                         precalcProfs=NULL, precalcSubset=NULL,
                                        switchval=0L, ...){
  if (!is.null(precalcSubset))
    subs <- precalcSubset
  else
    subs <- ProcessSubset(ew, Subset)
  uvals <- subs$uvals
  evalmap <- subs$evalmap
  if ( is.null(precalcProfs) ){
    if (Verbose) cat('  Calculating PA Profiles...\n')
    pap <- PAProfiles(ew, uvals, Verbose=Verbose)
  } else {
    pap <- precalcProfs
  }
  l <- length(uvals)
  n <- names(ew)
  if ( l == 1 ){
    mat <- matrix(1, nrow=1, ncol=1)
    rownames(mat) <- colnames(mat) <- n
    return(mat)
  }

  FXN <- function(v1, v2, ARGS, ii, jj){
    score <- 0
    v1l <- length(v1)
    if(v1l == 0) return(0)
    tt <- sum(v1 & v2) / v1l
    tf <- sum(v1 & !v2) / v1l
    ft <- sum(!v1 & v2) / v1l
    ff <- sum(!v1 & !v2) / v1l
    jpd <- c(tt, tf, ft, ff)

    tv1 <- sum(v1) / v1l
    tv2 <- sum(v2) / v1l
    fv1 <- 1 - tv1
    fv2 <- 1 - tv2
    mpdv1 <- c(tv1, tv1, fv1, fv1)
    mpdv2 <- c(tv2, fv2, tv2, fv2)

    mult <- c(1,-1,-1,1)

    for ( k in seq_along(jpd) ){
      val <- jpd[k] * log(jpd[k] / (mpdv1[k] * mpdv2[k]), base=2) * mult[k]
      score <- score + ifelse(is.nan(val), 0, val)
    }
    pv <- ifelse(all(v1==v1[1]) || all(v2==v2[1]), 1, fisher.test(v1,v2)$p.value)
    return(score*(1-pv))
  }

  # APC Correction, removing for now
  # CORRECTION <- function(ps){
  #   apccorr <- mean(ps, na.rm=TRUE)
  #   ps <- ps - apccorr
  #   ps <- abs(ps)
  #   # Normalize
  #   denom <- max(ps, na.rm=TRUE)
  #   ps <- ps / ifelse(denom==0, 1, denom)
  # }
  pairscores <- BuildSimMatInternal(pap, uvals, evalmap, l, n,
                                    FXN, NULL, Verbose)
                                    #CORRECTION=CORRECTION)

  Diag(pairscores) <- 1
  #pairscores <- pairscores #because distance
  return(pairscores)
}

GLMI.EvoWeaver <- function(ew, Subset=NULL, Verbose=TRUE,
                                         precalcProfs=NULL, precalcSubset=NULL,
                                        MySpeciesTree=NULL,
                                        CombinePVal=TRUE, ...){
  if (!is.null(precalcSubset))
    subs <- precalcSubset
  else
    subs <- ProcessSubset(ew, Subset)
  uvals <- subs$uvals
  evalmap <- subs$evalmap
  if (is.null(MySpeciesTree)){
    stopifnot("Method 'GLMI' requires a species tree"=attr(ew, 'useMT'))
    if (Verbose) cat('Calculating Species Tree...\n')
    MySpeciesTree <- findSpeciesTree(ew, Verbose)
  }

  if ( is.null(precalcProfs) ){
    if (Verbose) cat('  Calculating PA Profiles...\n')
    pap <- PAProfiles(ew, uvals, Verbose=Verbose)
  } else {
    pap <- precalcProfs
  }
  l <- length(uvals)
  n <- names(ew)
  if ( l == 1 ){
    v <- ifelse(CombinePVal, 1, 1+1i)
    mat <- simMat(v, nelem=1)
    rownames(mat) <- colnames(mat) <- n
    return(mat)
  }

  y <- .Call("initCDend", MySpeciesTree, PACKAGE="SynExtend")
  on.exit(rm(y))
  numnodes <- .Call("getTreeNodesCount", y, PACKAGE="SynExtend")
  rn <- rownames(pap)
  glvs <- matrix(NA_integer_, nrow=numnodes, ncol=l)

  # Calculate Gain/Loss Vectors
  if (Verbose) cat('  Calculating gain/loss vectors:\n')
  if (Verbose) pb <- txtProgressBar(max=ncol(pap), style=3)
  for (i in seq_len(l)){
    v <- rn[pap[,i]]
    if (length(v) == 0){
      glv <- rep(0L, numnodes)
    } else {
      glv <- .Call("calcGainLoss", y, v, TRUE, PACKAGE="SynExtend")
    }
    glvs[,i] <- glv
    if (Verbose) setTxtProgressBar(pb, i)
  }
  if(Verbose) cat("\n")

  FXN <- function(v1, v2, ARGS, ii, jj){
    # subset just to where events actually occur
    p <- abs(v1) + abs(v2) != 0
    v1 <- v1[p]
    v2 <- v2[p]
    v1p <- v2p <- rep(0, length(v1))
    v1p[v1==v2] <- 1
    v2p[v1==v2] <- 1
    v1p[v1==1 & v2==-1] <- 1
    v2p[v2==1 & v1==-1] <- 1
    v1 <- v1p
    v2 <- v2p
    if(all(v1==v1[1]) || all(v2==v2[1])) return(0)

    score <- 0
    v1l <- length(v1)
    if(v1l == 0) return(ifelse(CombinePVal, 0, 0+0i))
    tt <- sum(v1 & v2) / v1l
    tf <- sum(v1 & !v2) / v1l
    ft <- sum(!v1 & v2) / v1l
    ff <- sum(!v1 & !v2) / v1l
    jpd <- c(tt, tf, ft, ff)

    tv1 <- sum(v1) / v1l
    tv2 <- sum(v2) / v1l
    fv1 <- 1 - tv1
    fv2 <- 1 - tv2
    mpdv1 <- c(tv1, tv1, fv1, fv1)
    mpdv2 <- c(tv2, fv2, tv2, fv2)

    # +1 for same event, -1 for opposite, 0 for event + no change
    mult <- c(1,-1,-1,0)

    for ( k in seq_along(jpd) ){
      val <- jpd[k] * log(jpd[k] / (mpdv1[k] * mpdv2[k]), base=2) * mult[k]
      score <- score + ifelse(is.nan(val), 0, val)
    }
    pv <- 1-fisher.test(v1,v2)$p.value
    return(ifelse(CombinePVal, score*pv, complex(real=score, imaginary=pv)))
  }

  # APC Correction, removing for now
  # CORRECTION <- function(ps){
  #   apccorr <- mean(ps, na.rm=TRUE)
  #   ps <- ps - apccorr
  #   ps <- abs(ps)
  #   # Normalize
  #   denom <- max(ps, na.rm=TRUE)
  #   ps <- ps / ifelse(denom==0, 1, denom)
  # }
  pairscores <- BuildSimMatInternal(glvs, uvals, evalmap, l, n,
                                    FXN, NULL, Verbose, CombinePVal)
                                    #CORRECTION=CORRECTION)

  Diag(pairscores) <- ifelse(CombinePVal, 1, 1+1i)
  #pairscores <- pairscores #because distance
  return(pairscores)
}

ProfileDCA.EvoWeaver <- function(ew, Subset=NULL, Verbose=TRUE, Processors=1L,
                                  precalcProfs=NULL, precalcSubset=NULL, useAbs=TRUE, ...){
  if (!is.null(precalcSubset))
    subs <- precalcSubset
  else
    subs <- ProcessSubset(ew, Subset)
  uvals <- subs$uvals

  if ( is.null(precalcProfs) ){
    if (Verbose) cat('  Calculating PA Profiles...\n')
    pap <- PAProfiles(ew, uvals, Verbose=Verbose)
  } else {
    pap <- precalcProfs
  }
  l <- ncol(pap)
  n <- colnames(pap)
  if ( l == 1 ){
    mat <- matrix(1, nrow=1, ncol=1)
    rownames(mat) <- colnames(mat) <- n
    return(mat)
  }

  pairscores <- DCA_logRISE(pap, Verbose=Verbose, Processors=Processors, ...)
  rownames(pairscores) <- colnames(pairscores) <- n
  pairscores <- as.simMat(pairscores)
  if (useAbs) pairscores <- abs(pairscores)
  if (max(pairscores) != 0)
    pairscores <- pairscores / max(abs(pairscores))

  return(pairscores)
}


Behdenna.EvoWeaver <- function(ew, Subset=NULL, Verbose=TRUE,
                                MySpeciesTree=NULL, useSubtree=FALSE,
                                useACCTRAN=TRUE, rawZScores=FALSE,
                                precalcProfs=NULL, precalcSubset=NULL, ...){
  if (!is.null(precalcSubset))
    subs <- precalcSubset
  else
    subs <- ProcessSubset(ew, Subset)
  uvals <- subs$uvals
  evalmap <- subs$evalmap

  if (is.null(MySpeciesTree)){
      stopifnot("Method 'Behdenna' requires a species tree"=attr(ew, 'useMT'))
      if (Verbose) cat('Calculating Species Tree...\n')
      MySpeciesTree <- findSpeciesTree(ew, Verbose)
  }

  if ( is.null(precalcProfs) ){
    if (Verbose) cat('  Calculating PA Profiles...\n')
    pap <- PAProfiles(ew, uvals, Verbose=Verbose, speciesList=labels(MySpeciesTree))
  } else {
    pap <- precalcProfs
  }
  l <- length(uvals)
  stopifnot(nrow(pap) > 1)
  fd <- FastDend(MySpeciesTree)
  v1 <- abs(generateGainLossVec(fd, pap[,1], moveEventsUpward=useACCTRAN))
  glmat <- matrix(0, nrow=length(v1), ncol=ncol(pap))
  glmat[,1] <- v1
  if (Verbose) cat('  Calculating gain/loss vectors:\n')
  if (Verbose) pb <- txtProgressBar(max=ncol(pap), style=3)
  for ( i in 2:ncol(pap) ){
    glmat[,i] <- abs(generateGainLossVec(fd, pap[,i], moveEventsUpward=useACCTRAN))
    if (Verbose) setTxtProgressBar(pb, i)
  }
  if (Verbose) cat('\n')
  vals <- calc_SId_mat(fd, IdOnly=!useSubtree)
  if (useSubtree)
    M <- vals$S
  else
    M <- vals$Id
  Cmat <- vals$Cmat
  bl <- vals$blengths

  glmat <- abs(glmat)

  ARGS <- list(M=M, Cmat=Cmat, bl=bl)
  FXN <- function(v1, v2, ARGS, ii, jj){
    n1 <- sum(v1)
    n2 <- sum(v2)
    score <- 0
    if ( n1*n2 != 0 ){
      score <- t(v1) %*% ARGS$M %*% v2
      exp_mean <- n2 * (t(v1) %*% ARGS$M %*% ARGS$bl)
      exp_var <- n2*t(v1) %*% ARGS$M %*% ARGS$Cmat %*% t(M) %*% v1
      score <- (score - exp_mean) / sqrt(abs(exp_var))
    }

    return(score)
  }
  CORRECTION <- NULL
  if (!rawZScores){
    CORRECTION <- function(ps){
      ps <- abs(ps)
      ps <- ps / ifelse(max(ps,na.rm=TRUE) != 0,
                        max(ps, na.rm=TRUE), 1)
      return(ps)
    }
  }
  pairscores <- BuildSimMatInternal(glmat, uvals, evalmap, l, names(ew),
                                    FXN, ARGS, Verbose,
                                    CORRECTION=CORRECTION)

  Diag(pairscores) <- ifelse(rawZScores, 1, 0)

  return(pairscores)
}

GLDistance.EvoWeaver <- function(ew, Subset=NULL,
                     Verbose=TRUE, MySpeciesTree=NULL,
                     precalcProfs=NULL, precalcSubset=NULL,
                     CombinePVal=TRUE, ...){
  if (!is.null(precalcSubset))
    subs <- precalcSubset
  else
    subs <- ProcessSubset(ew, Subset)
  uvals <- subs$uvals
  evalmap <- subs$evalmap
  BOOTSTRAP_NUM <- 50L

  CombinePVal <- CombinePVal && BOOTSTRAP_NUM > 0

  if (is.null(MySpeciesTree)){
    stopifnot("Method 'GLDistance' requires a species tree"=attr(ew, 'useMT'))
    if (Verbose) cat('Calculating Species Tree...\n')
    MySpeciesTree <- findSpeciesTree(ew, Verbose)
  }

  if ( is.null(precalcProfs) ){
    if (Verbose) cat('  Calculating PA Profiles...\n')
    pap <- PAProfiles(ew, uvals, Verbose=Verbose, speciesList=labels(MySpeciesTree), ...)
  } else {
    pap <- precalcProfs
  }
  l <- length(uvals)
  stopifnot(nrow(pap) > 1)
  pa_counts <- colSums(pap)

  # Initialize Dendrogram in C
  y <- .Call("initCDend", MySpeciesTree, PACKAGE="SynExtend")
  on.exit(rm(y))
  numnodes <- .Call("getTreeNodesCount", y, PACKAGE="SynExtend")
  rn <- rownames(pap)
  glvs <- matrix(NA_integer_, nrow=numnodes, ncol=l)
  allnonzero <- logical(l)
  # Calculate Gain/Loss Vectors
  if (Verbose) cat('  Calculating gain/loss vectors:\n')
  if (Verbose) pb <- txtProgressBar(max=ncol(pap), style=3)
  for (i in seq_len(l)){
    v <- rn[pap[,i]]
    if (length(v) == 0){
      glv <- rep(0L, numnodes)
      allnonzero[i] <- TRUE
    } else {
      glv <- .Call("calcGainLoss", y, v, TRUE, PACKAGE="SynExtend")
    }
    glvs[,i] <- glv
    if (Verbose) setTxtProgressBar(pb, i)
  }
  if(Verbose) cat("\n")
  lenglv <- nrow(glvs)
  ARGS <- list(allnonzero=allnonzero, y=y, bsn=BOOTSTRAP_NUM, l=lenglv)
  FXN <- function(v1, v2, ARGS, ii, jj){
    allnonzero <- ARGS$allnonzero
    #labs <- ARGS$labs
    #n1 <- ARGS$counts[ii]
    #n2 <- ARGS$counts[jj]
    if (allnonzero[ii] || allnonzero[jj]){
      return(ifelse(CombinePVal, 0, 0+0i))
    } else {
      #res1 <- .Call("calcScoreGL", ARGS$y, v1, v2, PACKAGE="SynExtend")
      #res2 <- .Call("calcScoreGL", ARGS$y, v2, v1, PACKAGE="SynExtend")
      res1 <- .Call("calcScoreGL", ARGS$y, v1, v2, PACKAGE="SynExtend") / sum(abs(v1))
      res2 <- .Call("calcScoreGL", ARGS$y, v2, v1, PACKAGE="SynExtend") / sum(abs(v2))
      res <- res1 + res2
      if (is.na(res) || is.infinite(res) || res==0) return(ifelse(CombinePVal, 0, 0+0i))
      num_bs <- ARGS$bsn
      if(num_bs > 0){
        v1 <- as.integer(v1)
        v2 <- as.integer(v2)
        l <- as.integer(ARGS$l)
        replicates <- rep(NA_real_, num_bs)
        #ss <- seq_along(labs)
        #ssl <- length(ss)
        for(i in seq_len(num_bs)){
          v1tmp <- .C("shuffleRInt", v1, l, PACKAGE="SynExtend")[[1]]
          #pa1 <- labs[v1tmp]
          v2tmp <- .C("shuffleRInt", v2, l, PACKAGE="SynExtend")[[1]]
          #pa2 <- labs[v2tmp]
          #v1tmp <- .Call("calcGainLoss", y, pa1, TRUE, PACKAGE="SynExtend")
          #v2tmp <- .Call("calcGainLoss", y, pa2, TRUE, PACKAGE="SynExtend")
          replicates[i] <- .Call("calcScoreGL", ARGS$y, v1tmp, v2tmp, PACKAGE="SynExtend") / sum(abs(v1tmp)) +
                            .Call("calcScoreGL", ARGS$y, v2tmp, v1tmp, PACKAGE="SynExtend") / sum(abs(v2tmp))
        }

        # pair up gain/loss and loss/gain
        #replicates <- replicates / normer
        #res <- res / normer

        # doing abs instead so we get a 1-sided pv
        # also fixes cases where all values are equal
        pv <- sum(abs(res) > abs(replicates)) / num_bs
        # left.pv <- sum(res <= replicates) / num_bs
        # pv <- 2 * min(right.pv, left.pv)
        # pv <- sum(replicates < res) / num_bs
        # if(pv > 0.5){
        #   pv <- 2*(1-pv)
        # } else {
        #   pv <- 2*pv
        # }
        res <- res / 2
        res <- ifelse(CombinePVal, pv*res, complex(real=res, imaginary=pv))
      } else {
        normer <- mean(sum(abs(v1)), sum(abs(v2)))
        normer <- sum(abs(v1), abs(v2))
        normer <- ifelse(normer==0, 1, normer)
        res <- 2*res / normer
      }
      return(res)
    }
  }
  pairscores <- BuildSimMatInternal(glvs, uvals, evalmap, l, names(ew),
                                    FXN, ARGS, Verbose, CombinePVal)
  Diag(pairscores) <- ifelse(CombinePVal, 1, 1+1i)
  return(pairscores)
}

PAPV.EvoWeaver <- function(ew, Subset=NULL, Verbose=TRUE,
                               precalcProfs=NULL, precalcSubset=NULL, ...){

  if (!is.null(precalcSubset))
    subs <- precalcSubset
  else
    subs <- ProcessSubset(ew, Subset)
  uvals <- subs$uvals
  evalmap <- subs$evalmap

  if ( is.null(precalcProfs) ){
    if (Verbose) cat('  Calculating PA Profiles...\n')
    pap <- PAProfiles(ew, uvals, Verbose=Verbose)
  } else {
    pap <- precalcProfs
  }
  l <- length(uvals)
  n <- names(ew)
  if ( l == 1 ){
    mat <- matrix(1, nrow=1, ncol=1)
    rownames(mat) <- colnames(mat) <- n
    return(mat)
  }
  nr <- nrow(pap)
  pap[] <- as.integer(pap)
  ARGS <- list(nr=nr)
  FXN <- function(v1, v2, ARGS, ii, jj) {
    if(all(v1==v1[1]) || all(v2==v2[1])) return(0)
    return(1-fisher.test(v1, v2, simulate.p.value=TRUE)$p.value)
  }
  pairscores <- BuildSimMatInternal(pap, uvals, evalmap, l, n, FXN, ARGS, Verbose)

  return(pairscores)
}
npcooley/SynExtend documentation built on Sept. 20, 2024, 11:58 a.m.