R/haystack.R

Defines functions show_result_haystack.haystack show_result_haystack get_density haystack_2D get_reference get_log_p_D_KL get_D_KL get_parameters_haystack default_bandwidth.nrd

Documented in default_bandwidth.nrd get_density get_D_KL get_log_p_D_KL get_parameters_haystack get_reference haystack_2D show_result_haystack show_result_haystack.haystack

#' Based on the MASS kde2d() function, but heavily simplified; it's just tcrossprod() now.
#'
#' @param dens.x Contribution of all cells to densities of the x-axis grid points.
#' @param dens.y Contribution of all cells to densities of the y-axis grid points.
#'
kde2d_faster = function (dens.x, dens.y){
  tcrossprod(dens.x, dens.y)
}

#' Default function given by function bandwidth.nrd in MASS. No changes were made to this function.
#'
#' @param x A numeric vector
#'
#' @return A suitable bandwith.
default_bandwidth.nrd = function(x){
  # NOTE: Remove when we remove binary version.

  r <- quantile(x, c(0.25, 0.75))
  h <- (r[2] - r[1]) / 1.34
  1.06 * min(sqrt(var(x)), h) * length(x) ^ (-1 / 5)
}


#' Function that decides most of the parameters that will be used during the "Haystack" analysis.
#'
#' @param x x-axis coordinates of cells in a 2D representation (e.g. resulting from PCA or t-SNE)
#' @param y y-axis coordinates of cells in a 2D representation
#' @param high.resolution Logical: should high resolution be used? Default is FALSE.
#'
#' @return A list containing various parameters to use in the analysis.
get_parameters_haystack = function(x, y, high.resolution = FALSE){
  # NOTE: Remove when we remove binary version.

  # the bandwidths used for the kernel function
  bandwidth.x <- default_bandwidth.nrd(x)
  bandwidth.y <- default_bandwidth.nrd(y)
  bandwidths <- c(bandwidth.x,bandwidth.y)
  bandwidths <- bandwidths

  # I want to have a fixed number of bins between the 10th and 90th percentile in each dimension
  # this is to avoid bins getting squished because of a few outliers
  # the default number of bins between the 10% and 90% points in each dimension
  if(high.resolution) {
    grid.points.10.90 <- 125
  } else {
    grid.points.10.90 <- 25
  }

  # get the 10% and 90% points in each dimension
  x10 <- as.numeric(quantile(x,0.10))
  x90 <- as.numeric(quantile(x,0.90))
  y10 <- as.numeric(quantile(y,0.10))
  y90 <- as.numeric(quantile(y,0.90))

  # get the minimum and maximum of each dimension
  # with an added padding the size of a bandwidth
  x.min <- min(x) - bandwidths[1]
  x.max <- max(x) + bandwidths[1]
  y.min <- min(y) - bandwidths[2]
  y.max <- max(y) + bandwidths[2]

  # set the bin size and total number of bins in each dimension
  x.bin.size <- (x90-x10)/grid.points.10.90
  y.bin.size <- (y90-y10)/grid.points.10.90
  x.bins.10.to.min <- ceiling(abs(x.min-x10)/x.bin.size)
  x.bins.90.to.max <- ceiling(abs(x.max-x90)/x.bin.size)
  y.bins.10.to.min <- ceiling(abs(y.min-y10)/y.bin.size)
  y.bins.90.to.max <- ceiling(abs(y.max-y90)/y.bin.size)

  # get limits, grid points, bandwidths
  x.lim.min <- x10 - (x.bin.size*x.bins.10.to.min)
  x.lim.max <- x90 + (x.bin.size*x.bins.90.to.max)
  y.lim.min <- y10 - (y.bin.size*y.bins.10.to.min)
  y.lim.max <- y90 + (y.bin.size*y.bins.90.to.max)

  # the total number of grid points for both dimensions
  x.total.grid.points <- round((x.lim.max-x.lim.min)/x.bin.size)
  y.total.grid.points <- round((y.lim.max-y.lim.min)/y.bin.size)
  grid.points <- c(x.total.grid.points,y.total.grid.points)

  # the limits of the grid in both dimensions
  limits <- c(x.lim.min,x.lim.max,y.lim.min,y.lim.max)

  # getting the distances (in units of bandwidths) between all cells and all grid points
  gx <- seq.int(x.lim.min, x.lim.max, length.out = x.total.grid.points)
  gy <- seq.int(y.lim.min, y.lim.max, length.out = y.total.grid.points)
  ax <- outer(gx, x, "-")/bandwidths[1L]
  ay <- outer(gy, y, "-")/bandwidths[2L]

  # densities based on a simplified Gaussian
  dens.x <- matrix(exp(-0.5 * ax * ax), ncol = length(x))
  dens.y <- matrix(exp(-0.5 * ay * ay), ncol = length(x)) # length(x) should be the same as length(y), of course

  # return results
  list(
    grid.points = grid.points,
    limits      = limits,
    bandwidths  = bandwidths,
    dens.x      = dens.x,
    dens.y      = dens.y
  )
}

#' Calculates the Kullback-Leibler divergence between distributions.
#'
#' @param classes A logical vector. Values are T is the gene is expressed in a cell, F is not.
#' @param parameters Parameters of the analysis, as set by function 'get_parameters_haystack'
#' @param reference.prob A reference distribution to calculate the divergence against.
#' @param pseudo A pseudocount, used to avoid log(0) problems.
#'
#' @return A numerical value, the Kullback-Leibler divergence
get_D_KL = function(classes, parameters, reference.prob, pseudo){
  # NOTE: Remove when we remove binary version.

  class.types = c(FALSE, TRUE)

  # the reference distribution Q of cells in the 2D plot
  Q <- reference.prob

  # calculating the Kullback-Leibler divergence of the distribution
  # of cells expressin and not expressing gene X vs reference distribution Q
  D_KLs <- rep(NA,length(class.types))
  for(c in 1:length(class.types)){
    cl <- class.types[c]
    cl.subset <- classes==cl
    if(sum(cl.subset)==0){ # this means a gene is expressed or not expressed in all cells; return 0
      #D_KLs[c] <- 0
      return(0)
    } else {
      density <- kde2d_faster(dens.x=parameters$dens.x[,cl.subset],dens.y=parameters$dens.y[,cl.subset])

      # only decide pseudo if no value was given as input
      # (in total this takes some time)
      if(missing(pseudo))
        pseudo <- quantile(density[density>0],0.01)

      density <- density + pseudo
      P <- density / sum(density)

      D_KL <- sum(P * log(P/Q))
      D_KLs[c] <- D_KL
    }
  }
  sum(D_KLs)
}

#' Estimates the significance of the observed Kullback-Leibler divergence by comparing to randomizations.
#'
#' @param T.counts The number of cells in which a gene is detected.
#' @param D_KL.observed A vector of observed Kullback-Leibler divergences.
#' @param D_KL.randomized A matrix of Kullback-Leibler divergences of randomized datasets.
#' @param output.dir Optional parameter. Default is NULL. If not NULL, some files will be written to this directory.
#'
#' @return A vector of log10 p values, not corrected for multiple testing using the Bonferroni correction.
get_log_p_D_KL = function(T.counts, D_KL.observed, D_KL.randomized, output.dir = NULL){
  # NOTE: Remove when we remove binary version.

  t.points <- as.numeric(row.names(D_KL.randomized))
  dat.mean.log2 <- apply(log2(D_KL.randomized),1,mean)
  dat.sd.log2 <- apply(log2(D_KL.randomized),1,sd)

  # fitting a spline for the mean D_KL

  # set df to 10, but lower if there are few points
  # however, df should be at least 5
  df.mean <- min(10, round(length(t.points)/20))
  df.mean <- max(df.mean,5)

  # set boundary knots to be slightly outside range of predictor values
  boundary.knots <- c(min(T.counts)-5, max(T.counts)+5)
  model.mean.log2 <- lm(dat.mean.log2 ~ bs(t.points, df=df.mean,Boundary.knots=boundary.knots))
  #summary(model.mean.log2)

  t.range <- range(t.points)
  t.seq <- seq(t.range[1],t.range[2],length.out=1000)
  fitted.mean.log2 <- predict(model.mean.log2, data.frame(t.points=t.seq), type="response")

  info <- list()
  info$mean <- list()
  info$mean$observed <- data.frame(feature=t.points, x=t.points, y=dat.mean.log2)
  info$mean$fitted <- data.frame(x=t.seq, y=fitted.mean.log2)

  if(!is.null(output.dir)){
    outfile <- paste0(output.dir,"/fit.mean.log.D_KL_df",df.mean,".pdf")
    message("### writing plot to ", outfile)
    pdf(outfile)
    plot(t.points, dat.mean.log2)
    points(t.seq,fitted.mean.log2,col="red", type="l")
    dev.off()
  }


  # fitting a spline for the standard deviation of D_KL

  # set df to 10, but lower if there are few points
  # however, df should be at least 5
  df.sd <- min(10, round(length(t.points)/20))
  df.sd <- max(df.sd,5)

  model.sd.log2 <- lm(dat.sd.log2 ~ bs(t.points, df=df.sd,Boundary.knots=boundary.knots))
  #summary(model.sd.log2)

  fitted.sd.log2 <- predict(model.sd.log2, data.frame(t.points=t.seq), type="response")

  info$sd <- list()
  info$sd$observed <- data.frame(feature=t.points, x=t.points, y=dat.sd.log2)
  info$sd$fitted <- data.frame(x=t.seq, y=fitted.sd.log2)

  if(!is.null(output.dir)){
    outfile <- paste0(output.dir,"/fit.sd.log.D_KL_df",df.sd,".pdf")
    message("### writing plot to ", outfile)
    pdf(outfile)
    plot(t.points, dat.sd.log2)
    points(t.seq,fitted.sd.log2,col="red", type="l")
    dev.off()
  }

  # apply on actual observed values

  fitted.mean.log2 <- predict(model.mean.log2, data.frame(t.points=T.counts), type="response")
  fitted.sd.log2 <- predict(model.sd.log2, data.frame(t.points=T.counts), type="response")

  fitted.log.p.vals <- pnorm(log2(D_KL.observed), mean = fitted.mean.log2, sd = fitted.sd.log2, lower.tail = FALSE, log.p = T)/log(10)

  list(fitted=fitted.log.p.vals, info=info)
}


#' Get reference distribution
#'
#' @param param Parameters of the analysis, as set by function 'get_parameters_haystack'
#' @param use.advanced.sampling If NULL naive sampling is used. If a vector is given (of length = no. of cells) sampling is done according to the values in the vector.
#'
#' @return A list with two components, Q for the reference distribution and pseudo.
#'
get_reference <- function(param, use.advanced.sampling = NULL) {
  # NOTE: Remove when we remove binary version.

  if (is.null(use.advanced.sampling)) {
    density <- kde2d_faster(param$dens.x, param$dens.y)
    pseudo <- quantile(density[density > 0], 0.01)
  } else {
    density <- kde2d_faster(t(t(param$dens.x) * use.advanced.sampling),
                            t(t(param$dens.y)))
    density <- density / sum(density)
    pseudo <- quantile(density[density>0],0.01)
  }
  density <- density + pseudo

  Q = density / sum(density)
  list(Q = Q, pseudo = pseudo)
}

#' The main Haystack function, for 2-dimensional spaces.
#'
#' @param x x-axis coordinates of cells in a 2D representation (e.g. resulting from PCA or t-SNE)
#' @param y y-axis coordinates of cells in a 2D representation
#' @param detection A logical matrix showing which genes (rows) are detected in which cells (columns)
#' @param use.advanced.sampling If NULL naive sampling is used. If a vector is given (of length = no. of cells) sampling is done according to the values in the vector.
#' @param dir.randomization If NULL, no output is made about the random sampling step. If not NULL, files related to the randomizations are printed to this directory.
#'
#' @return An object of class "haystack"
#' @export
haystack_2D = function(x, y, detection, use.advanced.sampling=NULL, dir.randomization = NULL){
  .Deprecated(msg = "This function has been deprecated and will be removed in the future.")
  message("### calling haystack_2D()...")

  # check input
  if(!is.numeric(x))
    stop("'x' must be a numeric vector")
  if(!is.numeric(y))
    stop("'y' must be a numeric vector")
  if(length(x) != length(y))
    stop("'x' and 'y' must have the same length")
  if(!is.matrix(detection) && ! inherits(detection, "lgCMatrix") && ! inherits(detection, "lgRMatrix"))
    stop("'detection' must be a matrix, lgCMatrix, or lgRMatrix")
  if(ncol(detection) != length(x))
    stop("The number of columns in 'detection' must be the same as the length of 'x'")
  if(!is.null(use.advanced.sampling)){
    if(!is.numeric(use.advanced.sampling))
      stop("'use.advanced.sampling' must either be NULL or a numeric vector")
    if(length(use.advanced.sampling) != length(x))
      stop("The length of 'use.advanced.sampling' must be the same as that of 'x'")
  }

  count.cells <- ncol(detection)
  count.genes <- nrow(detection)

  # warn about unusual input sizes
  if(count.cells < 50)
    warning("The number of cells seems very low (",count.cells,"). Check your input.")
  if(count.cells > 10000)
    message("You are running haystack_2D on a large number of cells (",count.cells,"). Use method 'highD' for shorter runtimes.")
  if(count.genes < 100)
    warning("The number of genes seems very low (",count.genes,"). Check your input.")
  # if detection is a lgCMatrix, convert it to a lgRMatrix
  if(inherits(detection, "lgCMatrix")){
    message("### converting detection data from lgCMatrix to lgRMatrix")
    detection <- as(detection, "RsparseMatrix")
  }

  # advanced sampling is slow on large datasets. Recommend using wrswoR
  if(!is.null(use.advanced.sampling)){
    if(requireNamespace("wrswoR", quietly = TRUE)){
      use.wrswoR <- TRUE
      message("### Using package wrswoR to speed up random sampling")
    } else {
      use.wrswoR <- FALSE
      message("### You are running advanced sampling. Installing the package \"wrswoR\" might result in much shorter runtimes.")
    }
  }

  # make dir if needed
  if(!is.null(dir.randomization)){
    if(!dir.exists(dir.randomization))
      dir.create(dir.randomization)
  }



  ### get reference probabilities "Q"


  # using all points, get number of grid points, limits, and bandwidths
  # get 2D densities using all points (using above grid points, limits, bandwidths)
  # add pseudocount to densities to avoid Inf problems
  # normalize to sum to 1
  message("### setting parameters...")
  parameters <- get_parameters_haystack(x,y)

  # if(is.null(use.advanced.sampling)){
  #   density <- kde2d_faster(dens.x=parameters$dens.x,dens.y=parameters$dens.y)
  #   pseudo <- quantile(density[density>0],0.01)
  # } else {
  #   density <- kde2d_faster(dens.x=t(t(parameters$dens.x)*use.advanced.sampling),
  #                           dens.y=t(t(parameters$dens.y)))
  #   density <- density / sum(density)
  #   pseudo <- quantile(density[density>0],0.01)
  # }
  # density <- density + pseudo
  # # heatmap(density, Rowv=NA,Colv=NA,scale="none")
  # Q <- density / sum(density)
  ref <- get_reference(parameters, use.advanced.sampling)

  ### get probabilities "P" for each group ("F" and "T")
  ### this has to be one for every gene X


  # for class "F" points, and for "T" points, separately
  # get 2D densities (using above grid points, limits, bandwidths)
  # add pseudocount to densities to avoid Inf problems
  # normalize to sum to 1
  # get D_KL (or relative entropy) of this P vs reference Q
  message("### calculating Kullback-Leibler divergences...")
  D_KL.observed <- rep(0,count.genes)
  pb <- txtProgressBar(min = 0, max = count.genes, style = 3, file = stderr()) # progress bar
  if(is.matrix(detection)){
    for(i in 1:count.genes){
      D_KL.observed[i] <- get_D_KL(classes=detection[i,], parameters=parameters, reference.prob=ref$Q, pseudo=ref$pseudo)
      setTxtProgressBar(pb, i) # progress bar
    }
  } else if(inherits(detection, "lgRMatrix")){
    for(i in 1:count.genes){
      D_KL.observed[i] <- get_D_KL(classes=extract_row_lgRMatrix(detection,i), parameters=parameters, reference.prob=ref$Q, pseudo=ref$pseudo)
      setTxtProgressBar(pb, i) # progress bar
    }
  }
  close(pb) # progress bar

  # return the sum of D_KL for "F" and "T"
  # store this value for each gene X



  ### use randomization to estimate expected values
  ### and p values for D_KL


  # Randomized D_KL values depend on the number of "F" and "T" cases
  # Therefore, for all observed numbers of "T" cases (from 0 to 100%):
  # - do x randomizations and get their D_KL
  # - get mean and SD per fraction,
  # - use those to estimate p values

  message("### performing randomizations...")
  T.counts <- Matrix::rowSums(detection)
  T.counts.unique <- sort(unique(T.counts))
  T.counts.unique.no <- length(T.counts.unique)
  p.vals <- rep(NA,nrow(detection))
  p.types <- rep(NA,nrow(detection))
  randomization.count <- 50

  # select T counts to include in randomizations
  T.counts.to.select <- 200
  if(T.counts.unique.no > T.counts.to.select){
    # random selection:
    # T.counts.selected <- sample(T.counts.unique, size=T.counts.to.select, replace = F)

    # more evenly spread selection:
    #tmp.T <- T.counts.unique[!T.counts.unique %in% c(0,count.cells)] # remove 0 or all, if present
    #tmp.T.no <- length(tmp.T)
    #T.counts.selected <- tmp.T[as.integer(seq(1,tmp.T.no,length.out = T.counts.to.select))]

    # include 10 lowest and highest values, otherwise evenly spread selection:
    tmp.T <- T.counts.unique[!T.counts.unique %in% c(0,count.cells)] # remove 0 or all, if present
    tmp.T.no <- length(tmp.T)
    T.counts.selected <- tmp.T[c(1:9,
                               as.integer(seq(10,tmp.T.no-9,length.out = T.counts.to.select-18)),
                               (tmp.T.no-8):tmp.T.no)
                               ]


  } else {
    # include all
    tmp.T <- T.counts.unique[!T.counts.unique %in% c(0,count.cells)] # remove 0 or all, if present
    tmp.T.no <- length(tmp.T)
    T.counts.selected <- tmp.T
    T.counts.to.select <- tmp.T.no
  }

  # to store randomization results
  all.D_KL.randomized <- matrix(NA,nrow=T.counts.to.select,ncol=randomization.count,
                                dimnames=list(T.counts.selected,1:randomization.count))

  pb <- txtProgressBar(min = 0, max = T.counts.to.select, style = 3, file = stderr()) # progress bar

  # taking if - else statements outside of the for loop
  if(!is.null(use.advanced.sampling)){
    sampling.probs <- use.advanced.sampling

    for(i in 1:T.counts.to.select){
      setTxtProgressBar(pb, i) # progress bar

      T.count <- T.counts.selected[i]

      D_KL.randomized <- rep(NA,randomization.count)
      for(r in 1:randomization.count){
        # using sampling according to the number of genes expressed in each cell
        # pick cells according to the number of genes they express
        if(use.wrswoR){
          samp <- wrswoR::sample_int_expj(count.cells, size=T.count, prob=sampling.probs)
        } else {
          samp <- sample(x=count.cells, prob=sampling.probs, size=T.count, replace = FALSE)
        }
        # turn into T or F
        classes <- is.element(1:count.cells,samp)
        D_KL.randomized[r] <- get_D_KL(classes=classes,
                                       parameters=parameters, reference.prob=ref$Q, pseudo=ref$pseudo)
      }
      all.D_KL.randomized[i,] <- D_KL.randomized
    }# end for all T counts to select
  } else {

    for(i in 1:T.counts.to.select){
      setTxtProgressBar(pb, i) # progress bar

      T.count <- T.counts.selected[i]

      D_KL.randomized <- rep(NA,randomization.count)
      vector.to.randomize <- c(rep(TRUE, T.count), rep(FALSE, ncol(detection) - T.count))
      for(r in 1:randomization.count){
        # using default sampling
        D_KL.randomized[r] <- get_D_KL(classes=sample(x = vector.to.randomize),
                                       parameters=parameters, reference.prob=ref$Q, pseudo=ref$pseudo)
      }
      all.D_KL.randomized[i,] <- D_KL.randomized
    }# end for all T counts to select
  }# end if else
  close(pb) # progress bar

  message("### estimating p-values...")
  p.vals <- get_log_p_D_KL(T.counts = T.counts, D_KL.observed = D_KL.observed, D_KL.randomized = all.D_KL.randomized, output.dir = dir.randomization)
  info <- p.vals$info
  info$mean$observed$feature <- rownames(detection)[info$mean$observed$x]
  info$sd$observed$feature <- rownames(detection)[info$sd$observed$x]
  p.vals <- p.vals$fitted

  # bonferroni correction for multiple testing
  p.adjs <- p.vals + log10(length(p.vals))
  p.adjs[p.adjs>0] <- 0 # p values should be at most 1; so log10 should be <= 0

  if(!is.null(dir.randomization)){
    message("### writing randomized Kullback-Leibler divergences to file...")
    outputfile.randomized.D_KL <- paste0(dir.randomization,"/random.D_KL.csv")
    write.csv(file=outputfile.randomized.D_KL,all.D_KL.randomized)
  }

  message("### returning result...")
  # prepare the 'haystack' object to return
  res <- list(
    results = data.frame(
      D_KL = D_KL.observed,
      log.p.vals = p.vals,
      log.p.adj = p.adjs,
      T.counts = T.counts,
      row.names = row.names(detection)
    ),
    info = list(
      method="binary_2D",
      randomization = info
    )
  )
  class(res) <- "haystack"
  res
}

#' Function to get the density of points with value TRUE in the (x,y) plot
#'
#' @param x x-axis coordinates of cells in a 2D representation (e.g. resulting from PCA or t-SNE)
#' @param y y-axis coordinates of cells in a 2D representation
#' @param detection A logical matrix or dgRMatrix showing which gens (rows) are detected in which cells (columns)
#' @param rows.subset Indices of the rows of 'detection' for which to get the densities. Default: all.
#' @param high.resolution Logical: should high resolution be used? Default is FALSE.
#'
#' @return A 3-dimensional array (dim 1: genes/rows of expression, dim 2 and 3: x and y grid points) with density data
get_density = function(x, y, detection, rows.subset=1:nrow(detection), high.resolution = FALSE){
  # NOTE: Remove when we remove binary version.

  # set the parameters for getting the densities
  parameters <- get_parameters_haystack(x,y,high.resolution)

  densities <- array(data=NA, dim=c(length(rows.subset),parameters$grid.points))

  cl <- T # we are only looking at the T points here

  # detection could be a matrix class object now,
  # or a lgRMatrix object
  # if detection is a lgCMatrix object at this point, something is wrong
  if(is.matrix(detection)){
    for(i in 1:length(rows.subset)){
      r <- rows.subset[i]
      x.subset <- detection[r,]==cl
      y.subset <- detection[r,]==cl
      density <- kde2d_faster(dens.x=parameters$dens.x[,x.subset],
                              dens.y=parameters$dens.y[,y.subset])
      densities[i,,] <- density / sum(density)
    }
  } else if(inherits(detection, "lgRMatrix")){
    for(i in 1:length(rows.subset)){
      r <- rows.subset[i]
      x.subset <- extract_row_lgRMatrix(detection,r)==cl
      y.subset <- extract_row_lgRMatrix(detection,r)==cl
      density <- kde2d_faster(dens.x=parameters$dens.x[,x.subset],
                              dens.y=parameters$dens.y[,y.subset])
      densities[i,,] <- density / sum(density)
    }
  } else {
    stop("'detection' must be a matrix or lgRMatrix")
  }
  # set dimension names to genes, and grid points of x and y axes
  dimnames(densities) <- list(rownames(detection)[rows.subset],
                         seq(parameters$limits[1],parameters$limits[2],length.out = parameters$grid.points[1]), # x grid
                         seq(parameters$limits[3],parameters$limits[4],length.out = parameters$grid.points[2])  # y grid
                         )
  densities
}

#' show_result_haystack
#'
#' Shows the results of the 'haystack' analysis in various ways, sorted by significance. Priority of params is genes > p.value.threshold > n.
#'
#' @param res.haystack A 'haystack' result object.
#' @param n If defined, the top "n" significant genes will be returned. Default: NA, which shows all results.
#' @param p.value.threshold If defined, genes passing this p-value threshold will be returned.
#' @param gene If defined, the results of this (these) gene(s) will be returned.
#' @details The output is a data.frame with the following columns:
#' * D_KL the calculated KL divergence.
#' * log.p.vals log10 p.values calculated from randomization.
#' * log.p.adj log10 p.values adjusted by Bonferroni correction.
#' @return A data.frame with 'haystack' results sorted by log.p.vals.
#' @export
#'
#' @examples
#' # using the toy example of the singleCellHaystack package
#'
#' # running haystack
#' res <- haystack(dat.tsne, dat.expression)
#'
#' # below are variations for showing the results in a table
#' # 1. list top 10 biased genes
#' show_result_haystack(res.haystack = res, n =10)
#' # 2. list genes with p value below a certain threshold
#' show_result_haystack(res.haystack = res, p.value.threshold=1e-10)
#' # 3. list a set of specified genes
#' set <- c("gene_497","gene_386", "gene_275")
#' show_result_haystack(res.haystack = res, gene = set)
show_result_haystack <- function(res.haystack, n=NULL, p.value.threshold=NULL, gene=NULL) {
  UseMethod("show_result_haystack")
}

#' @rdname show_result_haystack
#' @export
show_result_haystack.haystack <- function(res.haystack, n=NULL, p.value.threshold=NULL, gene=NULL){

  # check input
  #if(missing(res.haystack))
  #  stop("Parameter 'res.haystack' ('haystack' result) is missing")
  #if(class(res.haystack)!="haystack")
  #  stop("'res.haystack' must be of class 'haystack'")

  result <- res.haystack$results
  if(is.null(result))
    stop("Results seem to be missing from 'haystack' object. Is 'res.haystack' a valid 'haystack' result?")
  if(!is.null(n) & !is.numeric(n))
    stop("The value of 'n' should be an integer")
  if(!is.null(n))
     if(n > nrow(result))
        warning("Integer value of 'n' is larger than the number of rows in the 'haystack' results. Will return all results sorted.")
  if(!is.null(p.value.threshold))
     if (p.value.threshold<0 | p.value.threshold>1)
        stop("If 'p.value.threshold' is given as input, it should be between 0 and 1")

  result <- result[order(result[["log.p.vals"]]), ]

  # run through filters, one by one, if they are not NA
  # priority is genes > p.value.threshold > n
  if(!is.null(gene)){
    result <- result[is.element(rownames(result), gene), ]
  }

  if(!is.null(p.value.threshold)){
    result <- result[result[["log.p.vals"]] <= log10(p.value.threshold), ]
  }

  if (!is.null(n))
    result <- head(result, n)

  result
  # at this point: 1) decide no. to return, and 2) sort by significance
  #n.to.select <- ifelse(is.null(n), nrow(result), min(n, nrow(result)))
  #result[1:n.to.select, ]
  #o <- order(result$log.p.vals)
  #result[o[1:n.to.select],]
}
alexisvdb/singleCellHaystack documentation built on Jan. 17, 2024, 10:45 a.m.