R/randomization.R

Defines functions getRandomPairs weightsByBin weightsByFactorFreq samplePairsByWeight

Documented in getRandomPairs samplePairsByWeight weightsByBin weightsByFactorFreq

#' Sample randomly pairs from set of genes with equal probabilities
#' @param genesGR a \code{\link[GenomicRanges]{GRanges}} with genes.
#' @param n number of pairs to sample.
#' @return A \code{\link{data.frame}} with indexes of ranges in \code{genesGR}
#' in the frist two columns.
#' @export
getRandomPairs <- function(genesGR, n){

    gp = data.frame(
      t(replicate(
          n,
          sample(1:length(genesGR), size=2, replace=FALSE)
        )
      )
    )

  # sort ids that the first <= second
  gp <- t(apply(gp, 1, sort))

  # sort pairs according to first column
  gp <- gp[order(gp[,1], gp[,2]),]

  gp <- as.data.frame(gp)
  names(gp) = c("g1", "g2")

  return(gp)
}



#' Compute sampling weights for continues values by binning.
#'
#' Compute sampling weights to sample from a population with osberved
#' probabilities by binning.
#'
#' @param obs vector of observed values according to which the sampling should
#'   be done (e.g distances observed for paralog gene pairs).
#' @param population vector of all values in the total population from which one
#'   wants to sample (e.g distances of all gene pairs)
#' @param breaks breaks used for sampling resolution (see \code{breaks} argument
#'   in \code{\link[graphics]{hist}} function).
#'
#' @return numeric vector with sampling weights to sample from \code{population} with
#'   probabilities observed in \code{obs}.
#'
#' @seealso \link{weightsByFactorFreq}
#' @export
weightsByBin <- function(obs, population, breaks=50){

  breaksAll <- hist(c(obs, population), breaks=breaks, plot=FALSE)$breaks

  # calculate the number of observation for nBin equal sized bins
  hObs <- hist(obs, breaks=breaksAll, plot=FALSE)

  # get for each individual in the population the bin index
  binPop <- .bincode(population, breaksAll, include.lowest=TRUE)

  # get counts per bin in the population
  hPop <- hist(population, breaks=breaksAll, plot=FALSE)

  # get the number of observed counts normalized to one as weight
  # normalize the number of observed counts by the bias observed in the population
  weight <- hObs$counts[binPop] / hPop$counts[binPop]

  # remove NA's, e,g, bis not observed in obs but in population. Set their probability to zero
  weight[is.na(weight)] = 0

  # normalize the weights to sum up to 1
  weightNormed <- weight / sum(weight)

  return(weightNormed)
}



#' Compute sampling weights for descret values.
#'
#' Computes weights for sampling from a population according to the frequencies
#' of a variable (factor) observed in an observed set.
#'
#' @param obs vector with values (e.g. factor) as observed set.
#' @param population vector with values (e.g. factor) in the population.
#'
#' @return numeric vector with sampling weights to sample from \code{population}
#'   with probabilities observed in \code{obs}.
#'
#' @seealso \link{weightsByBin}
#' @export
weightsByFactorFreq <- function(obs, population){

  # annotate all pairs with sameStrand information
  freqObsDF <- count(as.vector(obs))
  freqPopDF <- count(as.vector(population))


  # take ratio of frequencies in opserved set and weight for randomly sampling
  # from population set
  idxObs <- match(as.vector(population), freqObsDF$x)
  idxPop <- match(as.vector(population), freqPopDF$x)
  weight <- freqObsDF[idxObs, "freq"] / freqPopDF[idxPop, "freq"]

  # remove NA's, e,g, number of enhancers not observed in paralogs but in set of
  # all genes. Set their probability to zero
  weight[is.na(weight)] <- 0

  # normalize weights to 1
  propability <- weight / sum(weight)

  return(propability)
}


#' Sample pairs from the input gene pairs according to given sampling weights.
#'
#' @param gp a data.frame with gene pairs. Holds names or indices of ranges in
#'   an associated GRangs object in the first two columns.
#' @param weights numeric vector of same length as rows in \code{gp} with
#'   sampling weights.
#' @param n the number of pairs to sample.
#' @return a data.frame object with sampled gene pairs.
#'
#' @export
samplePairsByWeight <- function(gp, weights, n){

  # sample pairs from input pairs according to weights
  pairIDX <- sample.int(nrow(gp), n, prob=weights, replace=TRUE)

  # construct gene pair DF from sampled pairs
  rp <- gp[pairIDX,]

  return(rP)

}
ibn-salem/genepair documentation built on May 18, 2019, 1:30 a.m.