R/smoothing_functions.R

# function for calculating the gaussian weight of a point

#' Calculate gaussian weights.
#'
#' Get the gaussian weight of a statistic given its position relative to the
#' window center and the scaling factor sigma.
#'
#' @param p numeric. Position of value to smooth.
#' @param c numeric. Position of center of window.
#' @param s numeric. Sigma, scaling factor.
#'
#' @return A numeric value, the relative weight of the point.
gaussian_weight <- function(p, c, s) {
  exp(-(p-c)^2/(2*s^2))
}
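
# Quick illustration (kept as a comment so that nothing runs at package load):
#   gaussian_weight(p = c(0, 100, 250), c = 100, s = 50)
#   returns weights of ~0.135, 1, and ~0.011, respectively: the weight decays
#   smoothly with distance from the window center c.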

#'Gaussian smooth or average statistics across sliding windows.
#'
#'Calculates (optionally gaussian-smoothed) average values for statistics
#'across sliding windows, along with the genomic position of each window,
#'splitting by any requested facet variables.
#'
#'Averages for multiple statistics can be calculated at once. If the
#'\code{stats.type} argument is set to c("pairwise", "single"), all calculated
#'statistics will be smoothed. If it is set to "single", all non-pairwise
#'statistics (pi, ho, maf, etc.) will be smoothed; if it is set to "pairwise",
#'all pairwise statistics (fst) will be smoothed. Individual statistics
#'currently cannot be requested by name, since the computational cost of
#'smoothing additional statistics is minimal.
#'
#'The data can be broken up categorically by snp or sample metadata, as
#'described in \code{\link{Facets_in_snpR}}. Windows will be calculated using
#'only SNPs on the same level of any provided facets. NULL and "all" facets
#'work as normally described in \code{\link{Facets_in_snpR}}.
#'
#'As described in Hohenlohe et al. (2010), the contribution of individual SNPs
#'to window averages can be weighted by the number of observations per SNP by
#'setting the nk argument to TRUE, as is the default. For bootstraps, nk values
#'are randomly drawn for each SNP in each window. SNPs can also optionally be
#'weighted by their proximity to the window centroid using a gaussian kernel.
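#'
#'Concretely, when both gaussian and nk weighting are enabled, each window
#'value is the weighted mean computed by the internal \code{.average_windows}
#'helper:
#'\deqn{\bar{s} = \frac{\sum_i w_i (n_i - 1) s_i}{\sum_i w_i (n_i - 1)}, \qquad
#'  w_i = e^{-(p_i - c)^2 / (2\sigma^2)}}
#'where \eqn{s_i} is the statistic at SNP \eqn{i}, \eqn{p_i} its position,
#'\eqn{n_i} its nk value, and \eqn{c} the window center.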
#'
#'Centers for windows can either be placed at every SNP (if no step size is
#'provided) or every \code{step} kilobases from the 0 position of each
#'snp-level facet category (chromosome, etc.).
#'
#'The size of sliding windows is defined by the "sigma" argument. Note that
#'this value, as well as that provided to the "step" argument, is given in
#'kilobases. Each window will include SNPs within 3*sigma kilobases of the
#'window center (if the \code{triple_sigma} option is selected) or within sigma
#'kilobases otherwise. Past this point, the effect of each additional SNP on
#'the window average would be very small, so more distant SNPs are dropped for
#'computational efficiency (see Hohenlohe et al. (2010)).
#'
#'@param x snpRdata object.
#'@param facets character or NULL, default NULL. Categories by which to break up
#'  windows.
#'@param sigma numeric. Designates the width of windows in kilobases. Full
#'  window size is 6*sigma or 2*sigma kilobases depending on the
#'  \code{triple_sigma} argument.
#'@param step numeric or NULL, default \code{2*sigma} (non-overlapping windows
#'  when \code{triple_sigma = FALSE}). Designates the number of kilobases
#'  between each window centroid. If NULL, windows are centered on each SNP.
#'@param nk logical, default TRUE. If TRUE, weights SNP contribution to window
#'  averages by the number of observations at those SNPs.
#'@param stats.type character, default c("single", "pairwise"). Designates the
#'  statistic(s) to smooth, either "single", "pairwise", or c("single",
#'  "pairwise"). See details.
#'@param par numeric or FALSE, default FALSE. If numeric, the number of cores to
#'  use for parallel processing.
#'@param triple_sigma Logical, default TRUE. If TRUE, sigma will be tripled to
#'  create windows of 6*sigma total.
#'@param gaussian Logical, default TRUE. If TRUE, windows will be gaussian
#'  smoothed. If not, windows will be raw averages.
#'@param verbose Logical, default FALSE. If TRUE, some progress updates will be
#'  printed to the console.
#'  
#'@export
#'@author William Hemstrom
#'
#'@references Hohenlohe et al. (2010). \emph{PLOS Genetics}
#'
#'@return snpRdata object with smoothed averages for any requested statistics
#'  merged into the window.stats or pairwise.window.stats slots.
#'
#'@examples
#'\dontrun{
#'# add a few statistics
#'x <- calc_pi(stickSNPs, "chr.pop")
#'x <- calc_ho(x, "chr.pop")
#'x <- calc_pairwise_fst(x, "chr.pop")
#'# smooth with a fixed slide between window centers.
#'x <- calc_smoothed_averages(x, "chr.pop", sigma = 200, step = 50)
#'get.snpR.stats(x, "chr.pop", "single.window") # pi, ho
#'get.snpR.stats(x, "chr.pop", "pairwise.window") # fst
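#'# windows centered on every SNP (step = NULL) with plain, non-gaussian
#'# averages, shown as an illustration of the alternative centering/weighting options
#'x <- calc_smoothed_averages(x, "chr.pop", sigma = 200, step = NULL,
#'                            gaussian = FALSE)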
#'}
calc_smoothed_averages <- function(x, facets = NULL, sigma, step = 2*sigma, nk = TRUE, 
                                   stats.type = c("single", "pairwise"), 
                                   par = FALSE, triple_sigma = TRUE, gaussian = TRUE,
                                   verbose = FALSE) {
  .snp.id <- chr <- position <- start <- end <- ..col_ord <- snp.facet <-  NULL
  #==============sanity checks============
  if(!is.snpRdata(x)){
    stop("x is not a snpRdata object.\n")
  }
  
  msg <- character(0)
  if(any(!stats.type %in% c("single", "pairwise"))){
    msg <- c(msg, "Unaccepted stats.type, only 'single' or 'pairwise' accepted.\n")
  }
  if("pairwise" %in% stats.type){
    if(nrow(x@pairwise.stats) == 0){
      if("single" %in% stats.type & nrow(x@stats) > 0){
        stats.type <- "single"
      }
      else{
        msg <- c(msg, "No pairwise stats calculated.\n")
      }
    }
  }
  
  if("single" %in% stats.type){
    if(nrow(x@stats) == 0){
      if("pairwise" %in% stats.type & nrow(x@pairwise.stats) > 0){
        stats.type <- "pairwise"
      }
      else{
        msg <- c(msg, "No single-population stats calculated.\n")
      }
    }
  }
  
  if(length(msg) > 0){
    stop(msg)
  }
  

  step <- eval(step) # force evaluation of the default (2*sigma) before sigma is rescaled to bp below
  sigma <- 1000*sigma # convert sigma from kb to bp
  if(!is.null(step)){
    step <- step *1000
    .sanity_check_window(x, sigma/1000, step/1000, stats.type = stats.type, nk, facets = facets)
  }
  else{
    .sanity_check_window(x, sigma/1000, NULL, stats.type = stats.type, nk, facets = facets)
    
  }
  if(verbose){cat("Smoothing Parameters:\n\twindow size = ", ifelse(triple_sigma, 6*sigma, 2*sigma), "\n\tWindow slide = ", ifelse(is.null(step), "per-snp", step), "\n\tOverlap? = ", ifelse(step >= sigma*2, FALSE, TRUE), "\n")}
  
  
  
  par <- .par_checker(par, TRUE)
  
  facets <- .check.snpR.facet.request(x, facets, "none")
  snp.facets <- .check.snpR.facet.request(x, facets, "sample")
  sample.facets <- .check.snpR.facet.request(x, facets, "snp")
  x <- .add.facets.snpR.data(x, facets)

  #============run smoothing==============

  if("single" %in% stats.type){
    stats <- get.snpR.stats(x, facets = sample.facets, "single")
    stats$nk <- stats$maj.count + stats$min.count
    stats$maj.count <- NULL
    stats$min.count <- NULL
    numeric.cols <- which(sapply(stats, class) %in% c("numeric", "integer") & colnames(stats) != "nk")
    numeric.cols <- intersect(numeric.cols, (which(colnames(stats) == ".snp.id") + 1):ncol(stats))
    
    
    task_list <- .get.task.list(x, facets)
    
    if(verbose){cat("Beginning run: single stats.\n")}
    
    if(par > 1){
      cl <- parallel::makePSOCKcluster(min(c(par, nrow(task_list))))
      doParallel::registerDoParallel(cl)
      
      out <- foreach::foreach(q = 1:nrow(task_list), .inorder = FALSE,
                              .export = c("data.table", ".average_windows", ".fetch.snp.meta.matching.task.list", ".paste.by.facet")) %dopar% {
                                
                                .split.facet <- function(facet) strsplit(facet, "(?<!^)\\.", perl = T) # this tends to not pass correctly to workers
                                
                                snp.matches <- .fetch.snp.meta.matching.task.list(x = x, task_list[q,])
                                snp.matches <- snp.meta(x)$.snp.id[snp.matches]
                                
                                sample.matches <- which(.paste.by.facet(stats, colnames(stats)[1:2]) == paste0(task_list[q,1], ".", task_list[q,2]))
                                matches <- intersect(sample.matches, which(stats$.snp.id %in% snp.matches))
                                
                                pos_cols <- c(unlist(.split.facet(task_list[q,3])), "position")
                                if(any(pos_cols == ".base")){
                                  pos_cols <- pos_cols[-which(pos_cols == ".base")]
                                }
                                
                                
                                if(length(matches) != 0){
                                  out <- .average_windows(stats[matches, pos_cols, drop = FALSE], 
                                                          chr =  unlist(.split.facet(task_list[q,3])),
                                                          sigma = sigma,
                                                          step = step, 
                                                          triple_sig = triple_sigma, 
                                                          gaussian = gaussian, 
                                                          stats = data.table::as.data.table(stats[matches,c(numeric.cols, which(colnames(stats) == "nk"))]),
                                                          nk = nk)
                                  out$subfacet <- task_list[q,2]
                                  out$facet <- task_list[q,1]
                                  out$snp.facet <- task_list[q,3]
                                  out$snp.subfacet <- task_list[q,4]
                                }
                                else{
                                  out <- NULL
                                }
                                out
                              }
      
      
      parallel::stopCluster(cl)
    }
    else{
      out <- vector("list", nrow(task_list))
      for(q in 1:nrow(task_list)){

        if(verbose){cat(paste0(task_list[q,], collapse = "\t"), "\n")}
        
        .split.facet <- function(facet) strsplit(facet, "(?<!^)\\.", perl = T) # defined here to mirror the parallel branch
        
        snp.matches <- .fetch.snp.meta.matching.task.list(x = x, task_list[q,])
        snp.matches <- snp.meta(x)$.snp.id[snp.matches]
        
        sample.matches <- which(.paste.by.facet(stats, colnames(stats)[1:2]) == paste0(task_list[q,1], ".", task_list[q,2]))
        matches <- intersect(sample.matches, which(stats$.snp.id %in% snp.matches))
        
        pos_cols <- c(unlist(.split.facet(task_list[q,3])), "position")
        if(any(pos_cols == ".base")){
          pos_cols <- pos_cols[-which(pos_cols == ".base")]
        }
        if(length(matches) != 0){
          out[[q]] <- .average_windows(stats[matches,pos_cols, drop = FALSE], 
                                  chr =  unlist(.split.facet(task_list[q,3])),
                                  sigma = sigma,
                                  step = step, 
                                  triple_sig = triple_sigma, 
                                  gaussian = gaussian, 
                                  stats = data.table::as.data.table(stats[matches,c(numeric.cols, which(colnames(stats) == "nk"))]),
                                  nk = nk)
          out[[q]]$subfacet <- task_list[q,2]
          out[[q]]$facet <- task_list[q,1]
          out[[q]]$snp.facet <- task_list[q,3]
          out[[q]]$snp.subfacet <- task_list[q,4]
        }
        else{
          out[[q]] <- NULL
        }
      }
    }
    
    
    
    
    out <- dplyr::bind_rows(out)
    out <- dplyr::arrange(out, snp.facet, position, start, end)
    out$nk.status <- nk
    out$gaussian <- gaussian
    out$sigma <- sigma/1000
    out$step <- ifelse(is.null(step), NA, step/1000)
    out$triple_sigma <- triple_sigma
    
    header_cols <- c("facet", "subfacet", "snp.facet", "snp.subfacet", "position",
                     "sigma", "step", "nk.status", "gaussian", "n_snps", "triple_sigma")
    col_ord <- c(header_cols, colnames(out)[which(!colnames(out) %in% header_cols)])
    .fix..call(out <- out[,..col_ord])
    x <- .merge.snpR.stats(x, out, "window.stats")
    
  }
  
  
  if("pairwise" %in% stats.type){
    stats <- get.snpR.stats(x, facets = sample.facets, "pairwise")
    numeric.cols <- which(sapply(stats, class) %in% c("numeric", "integer") & colnames(stats) != "nk")
    numeric.cols <- intersect(numeric.cols, (which(colnames(stats) == ".snp.id") + 1):ncol(stats))
    
    
    task_list <- .get.task.list(x, facets, source = "pairwise.stats")
    # task_list <- task_list[,3:4,drop = FALSE]
    task_list <- as.data.frame(task_list)
    
    # sample_task_list <- unique(as.character(stats$comparison))
    # sample_task_list <- cbind(sample.facets, sample_task_list)
    # sample_task_list <- as.data.frame(sample_task_list)
    
    # task_list <- merge(sample_task_list, task_list)
    
    
    
    if(verbose){cat("Beginning run: pairwise stats.\n")}
    
    if(par > 1){
      cl <- parallel::makePSOCKcluster(min(c(par, nrow(task_list))))
      doParallel::registerDoParallel(cl)
      
      out <- foreach::foreach(q = 1:nrow(task_list), .inorder = FALSE,
                              .export = c("data.table", ".average_windows", ".fetch.snp.meta.matching.task.list", ".paste.by.facet")) %dopar% {
                                
                                .split.facet <- function(facet) strsplit(facet, "(?<!^)\\.", perl = T) # this tends to not pass correctly to workers
                                
                                snp.matches <- .fetch.snp.meta.matching.task.list(x = x, task_list[q,])
                                snp.matches <- snp.meta(x)$.snp.id[snp.matches]
                                
                                sample.matches <- which(.paste.by.facet(stats, colnames(stats)[1:2]) == paste0(task_list[q,1], ".", task_list[q,2]))
                                matches <- intersect(sample.matches, which(stats$.snp.id %in% snp.matches))
                                
                                pos_cols <- c(unlist(.split.facet(task_list[q,3])), "position")
                                if(any(pos_cols == ".base")){
                                  pos_cols <- pos_cols[-which(pos_cols == ".base")]
                                }
                                
                                if(length(matches) != 0){
                                  out <- .average_windows(stats[matches, pos_cols, drop = FALSE], 
                                                          chr =  unlist(.split.facet(task_list[q,3])),
                                                          sigma = sigma,
                                                          step = step, 
                                                          triple_sig = triple_sigma, 
                                                          gaussian = gaussian, 
                                                          stats = data.table::as.data.table(stats[matches,c(numeric.cols, which(colnames(stats) == "nk"))]),
                                                          nk = nk)
                                  out$subfacet <- task_list[q,2]
                                  out$facet <- task_list[q,1]
                                  out$snp.facet <- task_list[q,3]
                                  out$snp.subfacet <- task_list[q,4]
                                }
                                else{
                                  out <- NULL
                                }
                                out
                              }
      
      
      parallel::stopCluster(cl)
    }
    else{
      out <- vector("list", nrow(task_list))
      for(q in 1:nrow(task_list)){
        if(verbose){cat(paste0(task_list[q,], collapse = "\t"), "\n")}
        
        .split.facet <- function(facet) strsplit(facet, "(?<!^)\\.", perl = T) # defined here to mirror the parallel branch
        
        snp.matches <- .fetch.snp.meta.matching.task.list(x = x, task_list[q,])
        snp.matches <- snp.meta(x)$.snp.id[snp.matches]
        
        sample.matches <- which(.paste.by.facet(stats, colnames(stats)[1:2]) == paste0(task_list[q,1], ".", task_list[q,2]))
        matches <- intersect(sample.matches, which(stats$.snp.id %in% snp.matches))
        
        
        pos_cols <- c(unlist(.split.facet(task_list[q,3])), "position")
        if(any(pos_cols == ".base")){
          pos_cols <- pos_cols[-which(pos_cols == ".base")]
        }
        
        if(length(matches) != 0){
          out[[q]] <- .average_windows(stats[matches, pos_cols, drop = FALSE], 
                                  chr =  unlist(.split.facet(task_list[q,3])),
                                  sigma = sigma,
                                  step = step, 
                                  triple_sig = triple_sigma, 
                                  gaussian = gaussian, 
                                  stats = data.table::as.data.table(stats[matches,c(numeric.cols, which(colnames(stats) == "nk"))]),
                                  nk = nk)
          out[[q]]$subfacet <- task_list[q,2]
          out[[q]]$facet <- task_list[q,1]
          out[[q]]$snp.facet <- task_list[q,3]
          out[[q]]$snp.subfacet <- task_list[q,4]
        }
        else{
          out[[q]] <- NULL
        }
      }
    }
    
    
    
    
    out <- dplyr::bind_rows(out)
    out <- dplyr::arrange(out, snp.facet, position, start, end)
    out$nk.status <- nk
    out$gaussian <- gaussian
    out$sigma <- sigma/1000
    out$step <- ifelse(is.null(step), NA, step/1000)
    out$triple_sigma <- triple_sigma
    
    header_cols <- c("facet", "subfacet", "snp.facet", "snp.subfacet", "position",
                     "sigma", "step", "nk.status", "gaussian", "n_snps", "triple_sigma")
    col_ord <- c(header_cols, colnames(out)[which(!colnames(out) %in% header_cols)])
    .fix..call(out <- out[,..col_ord])
    x <- .merge.snpR.stats(x, out, "pairwise.window.stats")
  }
  
  
  return(x)
}



# Average stats across sliding windows of SNPs.
#
# More memory efficient but a bit slower when there are many SNPs; parts pulled from GeneArchEst.
#
# @param x data.frame. Must contain a "position" column and may contain chromosome info columns.
# @param sigma numeric. Size of windows on each side of the center, in bp.
# @param step numeric, step size between window centers, in bp.
# @param chr character, default "chr". Name of chromosome info column(s) in x. Should be passed through .check.snpR.facet.request first.
# @param triple_sig logical, default FALSE. If TRUE, triples sigma (original protocol).
# @param stats data.table, stats to smooth. Any columns not matching position or chr will be smoothed and must be numeric.
# @param gaussian logical, default FALSE. If TRUE, does gaussian smoothing. If not, does raw averages.
# @param nk logical, default TRUE. If TRUE, weights by nk - 1. Expects an 'nk' column in stats.
# @param pairwise_snps logical, default FALSE. If TRUE, expects "s1_position" and "s2_position" columns (positions of SNP pairs) rather than a single "position" column.
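#
# Hypothetical toy call (illustrative only; this function and the helpers it
# relies on are internal, so it must be run from within the package namespace):
#   pos_df  <- data.frame(chr = "groupI", position = c(100, 2500, 6000))
#   stat_dt <- data.table::data.table(pi = c(0.10, 0.30, 0.20), nk = c(10, 8, 12))
#   .average_windows(pos_df, sigma = 2000, step = 4000, chr = "chr",
#                    triple_sig = TRUE, stats = stat_dt, gaussian = TRUE, nk = TRUE)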
#' @importFrom foreach %do%
.average_windows <- function(x, sigma, step = NULL, chr = "chr", triple_sig = FALSE,
                          stats, gaussian = FALSE, nk = TRUE, pairwise_snps = FALSE){
  ..scols <- ..chr <- NULL


  if(triple_sig){
    sigma <- sigma * 3
  }

  if(length(chr) > 1){
    nccn <- paste0(chr, collapse = ".")
    
    
    ncc <- .paste.by.facet(x[,chr], chr)
    x[,nccn] <- ncc
    x[,chr] <- NULL
    
    chr <- nccn
  }
  else if(chr == ".base" | is.null(chr)){
    x$chr <- ".base"
    if(!is.null(stats)){stats$chr <- ".base"}
    chr <- "chr"
  }
  
  # window centers, starts, and ends: centers fall every `step` bp from 0 when a
  # step size is given, otherwise at each unique SNP position
  unique.chr <- sort(unique(x[,chr]))
  if(pairwise_snps){
    chr.max <- tapply(c(x$s1_position, x$s2_position), rep(x[,chr], 2), max)
    chr.min <- tapply(c(x$s1_position, x$s2_position), rep(x[,chr], 2), min)
    chr.range <- matrix(c(chr.max, chr.min), ncol = 2)
  }
  else{
    chr.max <- tapply(x$position, x[,chr], max)
    chr.min <- tapply(x$position, x[,chr], min)
    chr.range <- matrix(c(chr.max, chr.min), ncol = 2)
  }
  
  if(!is.null(step)){
    centers <- apply(chr.range, 1, function(y) seq(from = 0, to = y[1] + step, by = step))
  }
  else{
    centers <- lapply(unique.chr, function(y) unique(x[x[,chr] == y,]$position))
  }
  
  if(is.matrix(centers)){
    centers <- as.list(as.data.frame(centers))
  }
  
  ends <- foreach::foreach(q = 1:length(chr.max), .inorder = T) %do% {
    e <- centers[[q]] + sigma
    e[e > chr.max[q]] <- chr.max[q]
    e
  }
  starts <- foreach::foreach(q = 1:length(chr.max), .inorder = T) %do% {
    s <- centers[[q]] - sigma
    s[s < 0] <- 0
    s
  }
  chrs <- foreach::foreach(q = 1:length(chr.max), .inorder = T) %do% {
    rep(unique.chr[q], length(centers[[q]]))
  }
  
  

  
  # for each chr, assign windows to all snps
  ## function per chr
  assign_windows <- function(y, starts, ends, centers, stats, chrs){
    comp_fun <- function(y, starts, ends, centers, stats, scols, nk_dt){
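      # builds lmat, a SNPs (rows) x windows (columns) matrix: 0/1 window
      # membership, optionally scaled by gaussian weights, used to compute
      # weighted window sums via matrix multiplication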

      if(pairwise_snps){
        # find snps in windows
        lmat <- outer(y$s1_position, starts, function(pos, starts) pos > starts)
        lmat <- lmat * outer(y$s1_position, ends, function(pos, ends) pos <= ends)
        lmat <- lmat * outer(y$s2_position, ends, function(pos, ends) pos <= ends)
        lmat <- lmat * outer(y$s2_position, starts, function(pos, starts) pos > starts)
        
        tnsnps <- colSums(lmat)
        
        # weight (gaussian) -- use BOTH positions to determine weighting (the
        # closer both are to the center, the higher the weight)
        if(gaussian){
          lmat <-  matrixStats::colMeans2(rbind(as.numeric(outer(y$s1_position, centers, gaussian_weight, s = sigma)),
            as.numeric(outer(y$s2_position, centers, gaussian_weight, s = sigma)))) *
            lmat
        }
      }
      else{
        # find snps in windows
        lmat <- outer(y$position, starts, function(pos, starts) pos > starts)
        lmat <- lmat * outer(y$position, ends, function(pos, ends) pos <= ends)
        
        tnsnps <- colSums(lmat)
        
        # weight (gaussian)
        if(gaussian){
          lmat <- outer(y$position, centers, gaussian_weight, s = sigma)*lmat
        }
      }
      
      
      lmat[is.na(lmat)] <- 0
      
      # short-circuit if no stats to smooth, just want window info
      if(is.null(stats)){
        rownames(lmat) <- y$.snp.id
        colnames(lmat) <- centers
        
        # detect levels without any valid pairwise comparisons (for small datasets)
        lmat <- lmat[, matrixStats::colSums2(lmat) >= 2, drop = FALSE]
        lmat <- lmat[matrixStats::rowSums2(lmat) > 0,, drop = FALSE]
        if(nrow(lmat) == 0) return(NULL)
        
        # transform into the list structure expected
        n <- names(lmat)
        
        out <- rep(rownames(lmat), ncol(lmat))
        out <- as.numeric(out[lmat == 1])
        out <- split(out, f = rep(1:ncol(lmat), matrixStats::colSums2(lmat)))
        names(out) <- colnames(lmat)
        
        return(out)
      }
      

      # calculate weighted stats
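      # build: windows x stats matrix of weighted sums (weights include nk - 1
      # when nk weighting is on); build_weights holds the matching summed
      # weights, so build / build_weights later gives the weighted mean per window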
      if(nk){
        build <- t(lmat) %*% as.matrix(.fix..call(stats[,..scols])*(nk_dt - 1))
        build_weights <- t(lmat) %*% as.matrix(nk_dt - 1)
      }
      else{
        build <- t(lmat) %*% as.matrix(.fix..call(stats[,..scols]))
        build_weights <- matrix(colSums(lmat), nrow = ncol(lmat), ncol = length(scols))
      }
      
      return(list(build = build, build_weights = build_weights, n_snps = tnsnps))
    }
    
    if(!is.null(stats)){
      scols <- which(!colnames(stats) %in% c(chr, "position", "nk"))
      
      # check for NAs in either stats or weights (don't want them to contribute)
      if(nk){
        nk_dt <- data.table::as.data.table(matrix(stats$nk, nrow = nrow(stats), ncol = length(scols)))
        nk_NAs <- which(is.na(stats$nk))
        if(length(nk_NAs) > 0){
          data.table::set(stats, i = as.integer(nk_NAs), j = scols, value = 0)
          stats$nk[nk_NAs] <- 0
          data.table::set(nk_dt, nk_NAs, 1:ncol(nk_dt), value = 0)
        }
      }
      for(k in 1:length(scols)){
        if(nk){
          data.table::set(nk_dt, which(is.na(stats[[scols[k]]])), k, value = 0)
        }
        data.table::set(stats,which(is.na(stats[[scols[k]]])),scols[k],0)
      }
      
      if(nk){
        for(k in 1:ncol(nk_dt)){
          data.table::set(nk_dt, which(nk_dt[[k]] == 0), k, value = 1) # set missing data to 1 to prevent negative weights (since we do nk - 1)
        }
      }
    }
    # if large (say, 50k snps), will iterate through in chunks, solve, and then combine results to minimize memory usage
    max_snps <- 50000
    if(nrow(y) > max_snps){
      n_iters <- ceiling(nrow(y)/max_snps)
      titer <- 1
      
      # initialize storage: build will hold the building weighted values for each stat for each window, build_weights will hold the total weights for those windows
      if(!is.null(stats)){
        build <- matrix(0, length(starts), length(scols))
        build <- data.table::as.data.table(build)
        colnames(build) <- colnames(stats)[scols]
        build <- list(build = build, build_weights = build, n_snps = numeric(length(starts)))
      }
      else{
        build <- vector("list", length = length(centers))
        names(build) <- centers
      }
      
      # for each set of snps, get the weighted values and weights for all relevant windows.
      for(i in 1:n_iters){
        end <- i*max_snps
        end <- ifelse(end > nrow(y), nrow(y), end)
        trows <- titer:end
        consider_windows <- starts <= max(y$position[trows]) & ends >= min(y$position[trows])
        
        # short-circuit if no stats to smooth, just want window info
        if(is.null(stats)){
          bpart <- comp_fun(y[trows,], 
                            starts[consider_windows], 
                            ends[consider_windows], 
                            centers[consider_windows], 
                            NULL, 
                            NULL,
                            NULL)
          keys <- unique(c(names(build), names(bpart)))
          build <- stats::setNames(mapply(c, build[keys], bpart[keys]), keys)
          titer <- i*max_snps + 1
        }
        else{
          bpart <- comp_fun(y[trows,], 
                            starts[consider_windows], 
                            ends[consider_windows], 
                            centers[consider_windows], 
                            stats[trows,], 
                            scols,
                            nk_dt[trows,])
          titer <- i*max_snps + 1
          
          build$build[consider_windows,] <- build$build[consider_windows,] + bpart$build
          build$build_weights[consider_windows,] <- build$build_weights[consider_windows,] + bpart$build_weights
          build$n_snps[consider_windows] <- build$n_snps[consider_windows] + bpart$n_snps
        }
      }
    }
    else{
      # short-circuit if no stats to smooth, just want window info
      if(is.null(stats)){
        return(comp_fun(y, starts, ends, centers, NULL, NULL, NULL))
      }
      
      build <- comp_fun(y, starts, ends, centers, stats, scols, nk_dt)
      build[1:2] <- lapply(build[1:2], data.table::as.data.table)
    }
    
    if(is.null(stats)){
      return(build)
    }
    
    # calculate weighted values
    out <- build$build / build$build_weights
    
    # remove empty windows
    empties <- which(rowSums(dplyr::mutate_all(out, is.nan)) == ncol(out))
        
    out[,chr] <- chrs
    out$position <- centers
    out$n_snps <- build$n_snps
    
    
    if(length(empties) > 0){
      out <- out[-empties,]
    }
    return(out)
  }
  
  
  ## note: need to make sure that the window ids are unique!
  windows <- vector("list", length(unique.chr))
  names(windows) <- unique.chr
  for(i in 1:length(chr.max)){
    matches <- which(x[,chr] == unique.chr[i])
    if(is.null(stats)){
      twin <- assign_windows(x[matches,,drop = FALSE], starts[[i]], ends[[i]], centers[[i]], 
                             stats = NULL, chrs = chrs[[i]])
      if(!is.null(twin)){
        windows[[i]] <- twin
      }
    }
    else{
       windows[[i]] <- assign_windows(x[matches,,drop = FALSE], starts[[i]], ends[[i]], centers[[i]], 
                                   stats = stats[matches,,drop = FALSE], chrs = chrs[[i]])
    }
  }
  
  if(is.null(stats)){
    return(windows)
  }
  
  windows <- data.table::rbindlist(windows)
  
  if(!pairwise_snps){
    windows[,chr] <- NULL
  }
  

  return(windows)
}