R/sfs_functions.R

Defines functions .sanity_check_sfs calc_directionality make_SFS calc_sfs

Documented in calc_directionality calc_sfs make_SFS

#' Generate a 1-2d site frequency spectrum from a snpRdata object.
#'
#' Generates a 1 or 2 dimensional site frequency spectrum from a dadi input file
#' using the projection methods and folding methods of Marth et al (2004) and
#' Gutenkunst et al (2009). This code is essentially an R re-implementation of
#' the SFS construction methods implemented in the program \emph{dadi} (see
#' Gutenkunst et al (2009)).
#'
#' Site frequency spectrums are constructed using the projection methods
#' detailed in Marth et al (2004) and the 2 dimensional expansion in Gutenkunst
#' et al (2009). Folding methods are also taken from Gutenkunst et al (2009).
#' Either 1 or 2d SFSs can be constructed by providing a vector of population
#' names and projection sizes.
#'
#' Note that ref and anc columns are suggested in the SNP metadata, containing
#' the derived and ancestral character states, respectively. These should
#' contain three characters each: two flanking bases and the SNP. For example,
#' for an A/C SNP flanked by a G and a T, "GCT" and "GAT" would be expected.
#' Note that if these character states are not known, the minor and major
#' alleles will be substituted. Unfolded spectra will be misleading in this
#' case.
#'
#' @param x snpRdata object. The SNP metadata must contain "ref" and "anc" data.
#' @param facet character, default NULL. Name of the sample metadata column
#'   which specifies the source population of individuals. For now, allows only
#'   a single simple facet (one column).If NULL, runs the entire dataset.
#' @param pops character, default NULL. A vector of population names of up to
#'   length 2 containing the names of populations for which the an SFS is to be
#'   created. If NULL, runs the entire dataset.
#' @param projection numeric. A vector of sample sizes to project the SFS to, in
#'   \emph{number of gene copies}. Sizes too large will result in a SFS
#'   containing few or no SNPs. Must match the length of the provided pops
#'   vector.
#' @param fold logical, default FALSE. Determines if the SFS should be folded or
#'   left polarized. If FALSE, snp metadata columns named "ref" and "anc"
#'   containing the identity of the derived and ancestral alleles, respectively,
#'   should be present for polarization to be meaningful.
#' @param update_bib character or FALSE, default FALSE. If a file path to an
#'   existing .bib library or to a valid path for a new one, will update or
#'   create a .bib file including any new citations for methods used. Useful
#'   given that this function does not return a snpRdata object, so a
#'   \code{\link{citations}} cannot be used to fetch references.
#'
#' @references Gutenkunst et al (2009). Inferring the joint demographic history
#'   of multiple populations from multidimensional SNP frequency data.
#'   \emph{PLoS genetics}, 5(10), e1000695.
#' @references Marth et al (2004). The allele frequency spectrum in genome-wide
#'   human variation data reveals signals of differential demographic history in
#'   three large world populations. \emph{Genetics}, 166(1), 351-372.
#'
#' @export
#' @return A matrix or vector containing the site frequency spectrum with a
#'   "pops" attribute containing population IDs, such as c("POP1", "POP2"). For
#'   a 2d SFS, the first pop is the matrix columns and the second is the matrix
#'   rows.
#'
#' @author William Hemstrom
#'
#' @examples
#' 
#' \dontrun{
#' # add the needed ref and anc columns, using the major and minor alleles (will fold later)
#' dat <- calc_maf(stickSNPs)
#' # note, setting ref and anc is done by default if these columns don't exist!
#' snp.meta(dat)$ref <- paste0("A", get.snpR.stats(dat)$minor, "A") 
#' snp.meta(dat)$anc <- paste0("A", get.snpR.stats(dat)$major, "A")
#' 
#' # run for two populations
#' ## call calc_sfs()
#' sfs <- calc_sfs(dat, "pop", c("ASP", "CLF"), c(10,10))
#' ## plot
#' plot_sfs(x = sfs)
#' 
#' 
#' # run for the overall dataset
#' sfs <- calc_sfs(dat, projection = 30)
#' ## plot
#' plot_sfs(x = sfs)
#' 
#' # note that plot_sfs() will take a snpRdata object, calling calc_sfs()
#' plot_sfs(dat, projection = 30)
#' }
#' 
calc_sfs <- function(x, facet = NULL, pops = NULL, projection, fold = TRUE, 
                     update_bib = FALSE){
  #=============sanity checks=================
  if(!is.snpRdata(x)){
    stop("x is not a snpRdata object.\n")
  }
  msg <- character(0)
  
  if(is.null(pops) & !is.null(facet)){
    msg <- c(msg, "Pops must be provided if a facet is designated.\n")
  }
  if(is.null(facet) & !is.null(pops)){
    msg <- c(msg, "A facet must be provided if pops are designated.\n")
  }
  if(!is.null(facet)){
    if(length(facet) > 1){
      msg <- c(msg, "For now, only a single facet is allowed at a time.\n")
    }
    check_facet <- .check.snpR.facet.request(x, facet, "none", T)
    if(any(check_facet[[2]] != "sample")){
      msg <- c(msg, "For now, only sample level facets are allowed.\n")
    }
    if(length(.split.facet(facet)[[1]]) > 1){
      msg <- c(msg, "For now, only facets referring to only one column of metadata are allowed.\n")
    }
  }
  
  if(!is.null(pops)){
    if(length(pops) != length(projection)){
      msg <- c(msg, "A projection size must be provided for every requested population of samples.\n")
    }
  }
  
  
  if(any(!c("ref", "anc") %in% colnames(x@snp.meta))){
    om <- snp.meta(x)
    om$ref <- paste0("A", .get.snpR.stats(x)$minor, "A")
    om$anc <- paste0("A", .get.snpR.stats(x)$major, "A")
    .suppress_specific_warning(snp.meta(x) <- om, "duplicated")

    if(fold == FALSE){
      warning("Without ancestral and derived character states, unfolded spectra will be misleading.\n")
    }
  }
  else{
    if(!all(sapply(c(x@snp.meta$ref, x@snp.meta$anc), nchar) == 3)){
      msg <- c(msg, "All ref and anc entries must be exactly three characters long. See documentation for details.\n")
    }
  }

  if(length(msg) > 0){
    stop(msg)
  }
  #==============run=========================
  # subset
  if(!is.null(pops)){
    suppressWarnings(x <- .subset_snpR_data(x, facets = facet, subfacets = pops))
  }

  # get dadi formatted data
  y <- format_snps(x, output = "dadi", facets = facet)

  # get sfs
  if(is.null(pops)){pops <- ".base"}
  sfs <- make_SFS(y, pops, projection, fold, update_bib)
  
  return(sfs)
}





#' Generate a 1-2d site frequency spectrum from a dadi input file.
#'
#' Generates a 1 or 2 dimensional site frequency spectrum from a dadi input file
#' using the projection methods and folding methods of Marth et al (2004) and
#' Gutenkunst et al (2009). This code is essentially an R re-implementation of
#' the SFS construction methods implemented in the program \emph{dadi} (see
#' Gutenkunst et al (2009)).
#'
#' Site frequency spectrums are constructed using the projection methods
#' detailed in Marth et al (2004) and the 2 dimensional expansion in Gutenkunst
#' et al (2009). Folding methods are also taken from Gutenkunst et al (2009).
#' Either 1 or 2d SFSs can be constructed by providing a vector of population
#' names and projection sizes.
#'
#' @param x character or data.frame. Either a path to a dadi formatted input
#'   file or a data.frame containing previously imported dadi formatted data.
#' @param pops character. A vector of population names of up to length 2
#'   containing the names of populations for which the an SFS is to be created.
#' @param projection numeric. A vector of sample sizes to project the SFS to, in
#'   \emph{number of gene copies}. Sizes too large will result in a SFS
#'   containing few or no SNPs.
#' @param fold logical, default FALSE. Determines if the SFS should be folded or
#'   left polarized.
#' @param update_bib character or FALSE, default FALSE. If a file path to an
#'   existing .bib library or to a valid path for a new one, will update or
#'   create a .bib file including any new citations for methods used. Useful
#'   given that this function does not return a snpRdata object, so a
#'   \code{\link{citations}} cannot be used to fetch references.
#'
#' @references Gutenkunst et al (2009). Inferring the joint demographic history
#'   of multiple populations from multidimensional SNP frequency data.
#'   \emph{PLoS genetics}, 5(10), e1000695.
#' @references Marth et al (2004). The allele frequency spectrum in genome-wide
#'   human variation data reveals signals of differential demographic history in
#'   three large world populations. \emph{Genetics}, 166(1), 351-372.
#'
#' @export
#' @return A matrix or vector containing the site frequency spectrum with a
#'   "pops" attribute containing population IDs, such as c("POP1", "POP2"). For
#'   a 2d SFS, the first pop is the matrix columns and the second is the matrix
#'   rows.
#'   
make_SFS <- function(x, pops, projection, fold = FALSE, update_bib = FALSE){
  #================sanity checks=========
  msg <- character()
  if(!is.data.frame(x)){
    if(!is.character(x)){
      msg <- c(msg, "x must either be either a dadi formatted data file or a data.frame derived from such.\n")
    }
    else if(!file.exists(x)){
      msg <- c(msg, "x must either be either a dadi formatted data file or a data.frame derived from such.\n")
    }
    else{
      x <- utils::read.table(x, header = T, stringsAsFactors = F)
    }
  }
  if(length(msg) > 0){
    stop(msg)
  }


  if(any(sapply(pops, function(y) length(grep(y, colnames(x)))) != 2)){
    msg <- c(msg, "Each pop must match two column names in x.\n")
  }
  if(length(pops) > 2){
    msg <- c(msg, "Only one or two dimensional SFSs can be created.\n")
  }

  if(length(msg) > 0){
    stop(msg)
  }
  #================subfunctions==========
  # function to fold an sfs
  fold_sfs <- function(sfs){

    # fold a 2 dimensional sfs
    if(length(dim(sfs)) == 2){
      rev.matrix <- function(x){
        return(x[nrow(x):1, ncol(x):1])
      }

      # figure out the part that gets folded in
      sample_sizes <- matrix(0:(nrow(sfs) - 1), nrow(sfs), ncol(sfs))
      sample_sizes <- sample_sizes +
        t(matrix(0:(ncol(sfs) - 1), ncol(sfs), nrow(sfs)))

      total.samples <- (ncol(sfs) + nrow(sfs) - 2)
      folded.in <- ifelse(sample_sizes > floor(total.samples/2), T, F)

      # reverse the sfs
      reversed <- sfs
      reversed[folded.in == F] <- 0
      reversed <- rev.matrix(reversed)

      # get the folded sfs
      folded <- sfs + reversed

      # deal with ambiguous areas
      ambig <- ifelse(sample_sizes == total.samples/2, T, F)
      ambig[ambig == T] <- sfs[ambig == T]
      rev.ambig <- rev.matrix(ambig)
      folded <- folded + (-.5*ambig + .5*rev.ambig)

      folded[folded.in == T] <- NA
    }

    # fold a 1 dimensional sfs--life is easy
    else{
      rev <- rev(sfs)[1:floor(length(sfs)/2)]
      rev <- c(rev, rep(0, length(sfs) - length(rev)))
      folded <- sfs + rev
      folded[ceiling(length(sfs)/2):length(sfs)] <- NA
    }
    return(folded)
  }

  # make a projected sfs from a counts array
  make_proj_sfs <- function(counts, projection, fold = F){

    # initialize
    if(length(projection) == 2){
      sfs <- matrix(0, projection[1] + 1, projection[2] + 1)
    }
    else{
      sfs <- numeric(projection + 1)
      counts[,,2] <- 1
    }

    # project each snp
    for(i in 1:nrow(counts[,,1])){

      # if the snp isn't sequenced in any pop, skip
      if(any(counts[i,1,] == 0)){
        next
      }

      # if we aren't folding and this snp isn't polarizable (usually meaning we had NNN for the anc condition),
      # skip it
      if(!fold & !counts[i,3,1]){
        next
      }

      if(length(projection) == 2){
        pop_counts <- list(counts[i,,1], counts[i,,2])
      }
      else{
        pop_counts <- list(counts[i,,1])
      }
      contrib <- find_contribs(pop_counts, projection)
      sfs <- sfs + contrib
    }

    return(sfs)
  }

  # function to get a pop_counts array for projections. Result has three columns,
  # containing the three bits of info needed for a pop_counts list. The third
  # index lists the pop
  # input is a dadi formatted data.frame
  get_counts <- function(x, pops){
    # figure out which allele is the outgroup/derived in each row and figure out polarization status
    anc.als <- substr(x$anc, 2, 2)
    ref.als <- substr(x$ref, 2, 2)
    polarized <- rep(T, nrow(x))
    polarized[anc.als == "N"] <- F
    polarized[anc.als != x$Allele1 & anc.als != x$Allele2] <- F
    polarized[ref.als != x$Allele1 & ref.als != x$Allele2] <- F
    anc.als[anc.als == "N"] <- x$Allele1[anc.als == "N"]
    which.derived <- rep(2, nrow(x))
    which.derived[anc.als == x$Allele2] <- 1

    # initialize output
    out <- array(0, dim = c(nrow(x), 3, 2))

    # write to output
    for(i in 1:length(pops)){
      x <- data.table::as.data.table(x)
      dat.cols <- grep(pops[i], colnames(x))
      tdat <- x[,dat.cols, with = FALSE]
      tdat <- as.matrix(tdat)
      out[,1,i] <- rowSums(tdat) # total count
      t.index <- 2 * (1:nrow(tdat))
      t.index[which.derived == 1] <- t.index[which.derived == 1] - 1
      out[,2,i] <- t(tdat)[t.index] # derived allele count
      out[,3,i] <- polarized
    }

    return(out)
  }


  # function to get an sfs contribution for a single snp across 2 populations
  # takes a list of length 2, where the each element is a numeric vector containing:
  # 1) the number of sequenced alleles at the site
  # 2) the number of derived alleles at the site
  # 3) the polarization status
  #
  # also needs a vector of projection sizes for the populations
  find_contribs <- function(pop_counts, projection){
    pcontribs <- vector("list", length(pop_counts))

    for(i in 1:length(pop_counts)){
      # pull info
      hits <- pop_counts[[i]][2]
      n <- pop_counts[[i]][1]
      m <- projection[i]


      # if there are less snps sequenced here than our projection, it gets zero
      # for all contributions
      if(n < m){
        contrib <- rep(0, m + 1)
      }

      else{
        # make the hits vector
        proj_hits <- 0:m

        # do the binomials
        contrib <- choose(m, proj_hits)
        contrib <- contrib*choose(n - m , hits - proj_hits)
        contrib <- contrib/choose(n, hits)
      }
      pcontribs[[i]] <- contrib
    }

    return(Reduce(outer, pcontribs))
  }

  #=========================run the functions================
  counts <- get_counts(x, pops)
  sfs <- make_proj_sfs(counts, projection, fold)
  if(fold){
    sfs <- fold_sfs(sfs)
    if(length(projection) == 1){
      sfs <- sfs[1:floor(projection/2)]
    }
    attr(sfs, which = "folded") <- TRUE
  }
  else{
    attr(sfs, which = "folded") <- FALSE
  }

  # add a pops attribute
  attr(sfs, which = "pop") <- pops

  # return
  masked <- sfs
  if(is.matrix(masked)){
    masked[1,1] <- 0
    if(!fold){
      masked[nrow(masked), ncol(masked)] <- 0
    }
  }
  else{
    masked[1] <- 0
    if(!fold){
      masked[length(masked)] <- 0
    }
  }
  if(sum(masked, na.rm = T) == 0){
    stop("No segrgating sites remain after projection. Try decreasing projection sizes!\n")
  }
  
  cat("SFS completed with", sum(masked, na.rm = T), "segrgating sites.\n")
  
  .yell_citation("Gutenkunst2009", "SFS", "Used to project and possibly fold SFS data. Code is a direct R re-implementation.", outbib = update_bib)
  
  return(sfs)
}

#' Calculate the directionality index from a 2d site frequency spectrum.
#'
#' Calculates the directionality index based on a 2d SFS according to Peter and
#' Slatkin (2013). Input spectra can be created using the \code{\link{calc_sfs}}
#' function, using a provided snpRdata object, or passed from other programs.
#' \emph{Spectra must be not be folded}.
#'
#' Essentially, the directionality index measures the difference in derived
#' allele frequency between two populations to determine the directionality of
#' population spread between the two. Since the "destination" population is
#' sourced from but experienced more genetic drift than the "source" population,
#' it should have relatively more high-frequency derived alleles \emph{after the
#' removal of fixed ancestral alleles}. See Peter and Slatkin (2013) for
#' details.
#'
#' @param x snpRdata object or matrix, default NULL. A snpRdata from which to
#'   calculate a SFS. Alternatively, a 2d site frequency spectra stored in a
#'   matrix, with an additional "pop" or "pops" attribute containing population
#'   IDs, such as c("POP1", "POP2"), where the first pop corresponds to matrix
#'   columns and the second to matrix rows. These objects can be produced from a
#'   dadi input file using \code{\link{make_SFS}}. Note that if x is a snpRdata
#'   object, snp metadata columns named "ref" and "anc" containing the identity
#'   of the derived and ancestral alleles, respectively, must be present.
#' @param facet character, default NULL. Passed to \code{\link{calc_sfs}} -- see
#'   documentation there for details. Ignored if a sfs is provided.
#' @param pops character, default NULL. Passed to \code{\link{calc_sfs}} -- see
#'   documentation there for details. Ignored if a sfs is provided.
#' @param projection numeric, default NULL. Passed to \code{\link{calc_sfs}} --
#'   see documentation there for details. Ignored if a sfs is provided.
#' @param update_bib character or FALSE, default FALSE. If a file path to an
#'   existing .bib library or to a valid path for a new one, will update or
#'   create a .bib file including any new citations for methods used. Useful
#'   given that this function does not return a snpRdata object, so a
#'   \code{\link{citations}} cannot be used to fetch references.
#'
#' @export
#' @references Peter, B. M., & Slatkin, M. (2013). Detecting range expansions
#'   from genetic data. \emph{Evolution}, 67(11), 3274-3289.
#'
#' @return A numeric value giving the directionality with a "direction"
#'   attribute designating the direction between the two populations.
#'   
#' @examples
#' \dontrun{
#' # directionality can be calculated without first calculating a SFS
#' calc_directionality(stickSNPs, facet ="pop", pops = c("ASP", "PAL"), projection = c(10, 10))
#'
#' # an existing SFS can also be fed in. This may be handy if you get a SFS from elsewhere.
#' sfs <- calc_sfs(stickSNPs, "pop", c("ASP", "PAL"), c(10, 10), fold = FALSE)
#' calc_directionality(sfs)
#' }
calc_directionality <- function(x, facet = NULL, pops = NULL, projection = NULL, update_bib = FALSE){
  #==========sanity checks=============
  msg <- character(0)
  
  if(is.snpRdata(x)){
    
    if(any(c(is.null(facet), is.null(pops), is.null(projection)))){
        msg <- c(msg, "facet, pops, and projection arguments must all be provided if an sfs is not.")
    }
    
    if(!is.null(pops)){
      if(length(pops) != 2){
        msg <- c(msg, "Exactly two pops must be listed.\n")
      }
    }
    
    x <- calc_sfs(x, facet, pops, projection, fold = F)
  }
  
  else{
    msg <- c(msg, .sanity_check_sfs(x, 2))
    if(any(is.na(x))){
      msg <- c(msg, "NAs found in provided sfs. This most likely means that the SFS is folded, which is not permitted for directionality calculation.\n")
    }
  }
  
  if(length(msg) > 0){
    stop(msg)
  }
  #===========calc direcitonality===========
  
  # flip everthing, since i should be pop 1 and j should be pop 2, and the
  # matrix is set up so that rows are pop 1 and columns are pop 2. Transposing fixes it.
  pops <- attr(x, "pop")
  x <- t(x)

  # normalize, exluding fixed sites
  x <- x[-1,-1] # remove fixed sites
  x <- x/sum(x)

  # get all of the allele frequencies in each cell
  freqs.i <- (1:(nrow(x)))/(nrow(x))
  freqs.j <- (1:(ncol(x)))/(ncol(x))

  # do the math, this is equ 1b (i - j times sfs)
  directionality <- sum(outer(freqs.i, freqs.j, "-") * x)
  if(directionality > 0){
    attr(directionality, "direction") <- paste0(pops[1], "->", pops[2])
  }
  else{
    attr(directionality, "direction") <- paste0(pops[1], "<-", pops[2])

  }
  
  .yell_citation("Peter2013", "Direcitonality", "Directionality index (psi)", update_bib)

  return(directionality)
}


.sanity_check_sfs <- function(x, allowed_dimensions = c(1, 2)){
  msg <- character(0)
  if("pops" %in% names(attributes(x))){
    attr(x, "pop") <- attr(x, "pops")
    attr(x, "pops") <- NULL
  }
  
  dims <- length(dim(x))
  if(dims == 0){dims <- 1}
  if(!dims %in% allowed_dimensions){
    msg <- c(msg, paste0("SFS dimensionality ", dims, " not allowed for this function. Allowed dimensionalities: ", paste0(allowed_dimensions, collapse = ", ")))
    return(msg)
  }
  
  if(is.matrix(x)){
    if("pop" %in% names(attributes(x))){
      if(length(attr(x, "pop")) == 2){
        sfs <- x
      }
      else{
        msg <- c(msg, "2D SFS pop attribute must be length 2.\n")
      }
    }
    else{
      msg <- c(msg, "2D SFS must have a pop attribute with length 2.\n")
    }
  }
  else if(is.numeric(x)){
    if("pop" %in% names(attributes(x))){
      if(length(attr(x, "pop")) == 1){
        sfs <- x
      }
      else{
        msg <- c(msg, "1D SFS pop attribute must be length 1.\n")
      }
    }
    else{
      msg <- c(msg, "1D SFS must have a pop attribute with length 1.\n")
    }
  }
  else{
    msg <- c(msg, "x must be either a snpRdata object, a matrix containing a 2D SFS, or a numeric vector containing a 1D sfs.\n")
  }
  
  return(msg)
}
hemstrow/snpR documentation built on March 20, 2024, 7:03 a.m.