R/01-bankStat.R

# BANKED

roam <- function(M) {
  as.vector(matrixMean(M))
}

stand <- function(M, mu) {
  sqrt(as.vector(matrixVar(M, mu)))
}

### INTERNAL
### Get the background distribution of "unusual" values in the bank
### KRC: why are we going to all the extra trouble of recomputing the loess
###      by removing each individual sample. Does that even matter?
nuValsIterBank <- function(matrix) {
  n <- nrow(matrix)
  s <- ncol(matrix)
  temp.means <- temp.sd <- matsdEst <- matrix(NA, n, s)

  ## Leave one sample out at a time, recompute the row-wise mean and SD,
  ## then smooth the SD as a function of the mean with loess.
  for (j in 1:s){
    temp.means[ ,j] <- roam(matrix[ ,-j])
    temp.sd[ ,j] <- stand(matrix[ ,-j], temp.means[ ,j])
    l.mod <- loess(temp.sd[ ,j] ~ temp.means[ ,j])
    matsdEst[ ,j] <- predict(l.mod)
  }
  ## Standardize each value against its own leave-one-out estimates
  matNu <- (matrix - temp.means)/matsdEst
  return(matNu)
}
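
## A minimal, commented-out sanity check (my own illustration with simulated
## data, not package code): assuming the package and its matrixMean/matrixVar
## helpers are loaded, the leave-one-out nu values on a simulated log-scale
## bank should be roughly standard normal, which is what pValuesBanked assumes.
# set.seed(1)
# fakeBank <- matrix(rnorm(1000 * 10, mean = 7, sd = 1), nrow = 1000, ncol = 10)
# nu <- nuValsIterBank(fakeBank)
# c(mean(nu), sd(nu))   # expect values near 0 and 1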

### EXTERNAL
### prepare a "bank" of normal samples against which to test an individual
createBank <- function(bankMatrix) {
  ## warn if data appears to be on the raw scale
  if (diff(range(bankMatrix)) > 100) {
    warning("createBank: Data appears to be on the raw scale and not on a log scale.")
  }
  ## Compute row-wise mean and standard deviation
  bankMeans  <- roam(bankMatrix)
  bankSD <- stand(bankMatrix, bankMeans)
  ## Fit a loess model, since we assume SD is a smooth function of the mean
  loess.model <- loess(bankSD ~ bankMeans)
  sdEst <- predict(loess.model)
  ## bundle the results
  rowStats <- data.frame(mean=bankMeans, SD=bankSD, sdEst=sdEst)
  ##
  background <- nuValsIterBank(bankMatrix)
  ## return a list
  list(rowStats = rowStats, background = background)
}
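
## Commented-out usage sketch with simulated data ('fakeBank' is an invented
## example matrix, not part of the package):
# set.seed(2)
# fakeBank <- matrix(rnorm(1000 * 12, mean = 7, sd = 1), nrow = 1000, ncol = 12)
# bank <- createBank(fakeBank)
# str(bank$rowStats)     # per-row mean, SD, and loess-smoothed SD estimate
# dim(bank$background)   # background nu values, same dimensions as fakeBank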

### EXTERNAL
### Primary interface; compute the actual Newman banked statistic
bankStat <- function(bankObj, testSet, bankMatrix){
  n <- dim(testSet)[1]
  s <- dim(testSet)[2]

  if(missing(bankObj)){
    bankObj <- createBank(bankMatrix)
  } else if (!missing(bankMatrix)) {
    stop("You should only supply one of 'bankObj' or 'bankMatrix'.")
  }
  temp.means <- bankObj$rowStats$mean
  sdEst <- bankObj$rowStats$sdEst
  Back_dist_iter <- bankObj$background

  matNuBank <- nuValsBank(matrix = testSet, mean = temp.means, sd = sdEst)

  matPBanked <- pValuesBanked(matrix = matNuBank, Background = Back_dist_iter, n, s)

  return(list(nu.statistics = matNuBank, p.values = matPBanked))

}
### If 'bankObj' is not supplied, leave that argument empty and pass the raw bank:
###       pvals_uncorr <- bankStat(, testSet, bankMatrix)
### or, equivalently, use named arguments: bankStat(testSet = testSet, bankMatrix = bankMatrix)
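
## Commented-out sketch of both calling patterns, reusing 'bank' and 'fakeBank'
## from the createBank example above ('testSet' is likewise invented data):
# testSet <- matrix(rnorm(1000 * 3, mean = 7, sd = 1), nrow = 1000, ncol = 3)
# res1 <- bankStat(bankObj = bank, testSet = testSet)          # precomputed bank
# res2 <- bankStat(testSet = testSet, bankMatrix = fakeBank)   # bank built on the fly
# dim(res1$nu.statistics); dim(res1$p.values)                  # both 1000 x 3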

## Center each row of the test set by the bank mean and scale by the smoothed bank SD
nuValsBank <- function(matrix, mean, sd){
  centered <- sweep(matrix, 1, mean, "-")
  scaled <- sweep(centered, 1, sd, '/')
  return(scaled)
}

## This double loop looks painfully slow. Can we speed it up?
pValuesBankedOrig <- function(matrix, Background, n, s){
  meanDist = mean(Background)
  sdDist = sd(Background)

  matP = matrix(NA,n,s)
  for (i in 1:n){
    for (j in 1:s){
      matP[i,j] <- pnorm(matrix[i,j],meanDist,sdDist)
    }
  }
  return(matP)
}
## Vectorized version of the loop above; same background-based normal model.
pValuesBankedSecond <- function(matrix, Background, n, s){
  meanDist = mean(Background)
  sdDist = sd(Background)
  pnorm(matrix, meanDist, sdDist)
}
## Current default used by bankStat: treat the nu statistics as standard normal.
pValuesBanked <- function(matrix, Background, n, s){
  pnorm(matrix, 0, 1)
}
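
## Commented-out check (my own, with made-up inputs) that the vectorized
## version reproduces the original double loop; note that pValuesBanked
## instead assumes the nu statistics are already standard normal.
# set.seed(3)
# bg <- matrix(rnorm(200), 20, 10)
# nu <- matrix(rnorm(50), 10, 5)
# all.equal(pValuesBankedOrig(nu, bg, 10, 5),
#           pValuesBankedSecond(nu, bg, 10, 5))   # expect TRUE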

## To FDR-correct the output: if 'pvals_uncorr' is the list returned by bankStat,
## the following yields a matrix of Benjamini-Hochberg adjusted p values.

# v <- as.vector(pvals_uncorr$p.values)
#
# BH_vec <- p.adjust(v, method = 'fdr')
# fdr <- as.matrix(BH_vec)
# dim(fdr) <- c(n, s)


### The following four functions were written in an effort to compute p values and
### their FDR correction directly. They work on smaller data sets, but the built-in
### FDR machinery (p.adjust) runs significantly faster, so it remains the default. ###
## Empirical p value: fraction of unique background values at or below each statistic
altPValues <- function(matrix,Background,n,s){
  matP <- matrix(NA,n,s)
  v = as.vector(Background)
  vec = v[!duplicated(v)]

  for (i in 1:n){
    for (j in 1:s){
      matP[i,j] <- length(vec[vec <= matrix[i,j]])/length(vec)
    }
  }
  return(matP)
}

## Empirical p values from a kernel density estimate of the background:
## the estimated density is integrated from its minimum up to each statistic.
empiricalP <- function(matrix,background,n,s){
  pdf = approxfun(density(background))
  min = min(background)

  pf = function(q){integrate(pdf,min,q,subdivisions = 2000)$value}


  empirMatP = matrix(NA,n,s)

  for (i in 1:n){
    for (j in 1:s){
      if (matrix[i,j] >= min){empirMatP[i,j] = pf(matrix[i,j])}
      else {empirMatP[i,j] = 0.04}   # fixed placeholder for statistics below the background minimum
    }
  }
  return(empirMatP)
}
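
## Commented-out rough check (my own illustration): integrating the kernel
## density estimate should track the empirical CDF of the background.
# set.seed(4)
# bg <- rnorm(500)
# nu <- matrix(runif(20, -1, 1), 4, 5)
# p.kde  <- empiricalP(nu, bg, 4, 5)
# p.ecdf <- matrix(ecdf(bg)(nu), 4, 5)
# summary(abs(as.vector(p.kde - p.ecdf)))   # differences should be small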

fdrCorrection <- function(matrix, n, s, verbose = FALSE){

  ## Benjamini-Hochberg method employed. Could also use the built-in p.adjust(p, "BH").
  ## The FDR level itself only enters at the analysis stage, because
  ## (rank/total)*alpha merely sets the adjusted cutoff.


  v <- as.vector(matrix)
  vUnique <- v[!duplicated(v)]
  vs <- sort(vUnique,decreasing = T)
  vRank <- rank(vs)
  num <- max(vRank)

  adjustedP <- numeric(num)    # adjusted p value for each entry of the sorted vector vs
  adjustedP[1] <- vs[1]        # the largest p value is its own adjusted value

  ##### if statement

  # breakPt = floor(0.9*num)
  # for (i in 1:breakPt){adjustedP[i] = 0.499}
  #
  # for (i in breakPt+1:num){
  #   adjustedP[i] <- min(adjustedP[i-1],(vs[i]*num)/vRank[i])
  #   if(i%%((num-breakPt)/10) == 0)print((i/(num-breakPt))*100)
  # }
  #

  ## Running minimum of (p * total)/rank, stepping down from the largest p value
  for (i in seq_len(num)[-1]){
    adjustedP[i] <- min(adjustedP[i-1],(vs[i]*num)/vRank[i])
    if(verbose && i %% ceiling(num/10) == 0) print((i/num)*100)
  }

  ## Map each original p value back to its adjusted value; adjustedP is indexed
  ## by position in the sorted vector vs, so the lookup must match against vs.
  fdrP <- matrix(NA,n,s)
  for (i in 1:n){
    for (j in 1:s){
      fdrP[i,j] <- adjustedP[match(matrix[i,j],vs)]
    }
  }
  return(fdrP)
}
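
## Commented-out sanity check (my own): with no tied p values, the adjusted
## values should agree with p.adjust(..., method = "BH"); with ties the two
## can differ because fdrCorrection works on the de-duplicated values.
# set.seed(5)
# pm <- matrix(runif(40), 8, 5)
# all.equal(fdrCorrection(pm, 8, 5),
#           matrix(p.adjust(as.vector(pm), method = "BH"), 8, 5))   # expect TRUE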

fdrCutoff <- function(matrix,alpha){

  v <- as.vector(matrix)
  w <- sort(v)                                   # p values in increasing order
  threshold <- alpha*(1:length(v))/length(v)     # Benjamini-Hochberg step-up thresholds
  c <- cbind(w,threshold,w<=threshold)           # compare the *sorted* p values to the thresholds

  sig_value <- c[,1][c[,3] == 1]
  cutoff <- max(sig_value)    # largest p value that clears its threshold
                              # (returns -Inf with a warning if nothing is significant)

  return(cutoff)


  ## If we want to input a cutoff then use this method, else use FDR correction on p-values
  # sigP <- fdrCutoff(matPBanked,alpha = 0.05)
  # binarySigBank <- matrix(0,n,s)
  #
  # for (i in 1:n){
  #   for (j in 1:s){
  #     if (matPBanked[i,j] <= sigP){binarySigBank[i,j] = 1}
  #   }
  #   if(i%%(n/10) == 0)print((i/n)*100)
  # }
}
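
## Commented-out usage sketch: a vectorized version of the loop in the comment
## above; 'matPBanked' is assumed to be the p-value matrix returned by bankStat.
# sigP <- fdrCutoff(matPBanked, alpha = 0.05)
# binarySigBank <- (matPBanked <= sigP) + 0   # 1 = significant, 0 = not
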
#####################
