R/generate_sumstats.R

Defines functions CalcSumstats

Documented in CalcSumstats

# Purpose: Given simulated data from `DGP`, calculate summary statistics.
# Updated: 2025-03-26

#' Calculate Summary Statistics
#' 
#' Generate summary statistics from individual-level data. Provide either a
#' list of data as generated by \code{\link{DGP}}, or all of `anno`, `geno`, 
#' and `pheno`. 
#' 
#' @param anno (snps x 1) annotation vector.
#' @param covar (subjects x covars) covariate matrix.
#' @param data List of data containing the annotation vector `anno`, the 
#'   covariate data `covar`, the genotype matrix `geno`, and the phenotype
#'   vector `pheno`, as returned by \code{\link{DGP}}. Overrides the other
#'   arguments if provided.
#' @param geno (subjects x snps) genotype matrix.
#' @param pheno (subjects x 1) phenotype vector.
#' @param add_intercept Add an intercept if not present in the supplied 
#'   covariate matrix covar? Default: TRUE.
#' @param is_binary Is the phenotype binary? Default: FALSE.
#' 
#' @return List containing the following items:
#' * `ld`: A SNP x SNP correlation (LD) matrix.
#' * `sumstats`: A SNP x 5 matrix of summary statistics, including the 
#'.  annotation, MAF, estimated effect size, standard error, and p-value.
#' * `type`: Either "binary" or "quantitative".`
#' 
#' @export
#' @examples
#' data <- DGP()
#' sumstats <- CalcSumstats(data = data)
CalcSumstats <- function(
  anno = NULL,
  covar = NULL,
  data = NULL,
  geno = NULL,
  pheno = NULL,
  add_intercept = TRUE,
  is_binary = FALSE
) {
  
  # Unpack.
  if (!is.null(data)) {
    anno <- data$anno
    covar <- data$covar
    geno <- data$geno
    pheno <- data$pheno
    is_binary <- (data$type == "binary")
  } else if (is.null(anno) || is.null(geno) || is.null(pheno)) {
    
    msg <- paste0(
      "Unless 'data' as produced by 'DGP' is provided, each of ",
      "anno, geno, and pheno must be provided."
    )
    stop(msg)
    
  }
  
  if (is.null(covar)) {
    covar <- matrix(data = 1, nrow = length(pheno))
  }
  
  # Check if covariate matrix contains an intercept.
  if (!ContainsInt(covar)) {
    warning("No intercept was detected in the covariate matrix. One will
             be added unless `add_intercept = FALSE`.")
    if (add_intercept) {
      covar <- cbind(1, covar)
      colnames(covar)[1] <- "intercept"
    }
  }
  
  # Calculate MAF.
  maf <- apply(geno, 2, mean) / 2
  
  # Calculate LD matrix.
  ld <- CorCpp(geno)
  n_snp <- length(anno)
  
  # Calculate beta and SE.
  if (is_binary) {
    
    type <- "binary"
    sumstats <- lapply(seq_len(n_snp), function(i) {
      fit <- stats::glm(
        pheno ~ 0 + geno[, i] + covar,
        family = stats::binomial()
      )
      results <- summary(fit)
      beta <- as.numeric(results$coefficients[, "Estimate"][1])
      se <- as.numeric(results$coefficients[, "Std. Error"][1])
      p <- as.numeric(results$coefficients[, "Pr(>|z|)"][1])
      out <- data.frame(beta = beta, se = se, p = p)
    })
    
  } else {
    
    type <- "quantitative"
    sumstats <- lapply(seq_len(n_snp), function(i) {
      fit <- OLS(y = pheno, X = cbind(geno[, i], covar))
      beta <- fit$beta[1]
      se <- fit$se[1]
      p <- fit$pval[1]
      out <- data.frame(beta = beta, se = se, p = p)
    })
    
  }
  
  sumstats <- do.call(rbind, sumstats)
  sumstats$anno <- anno
  sumstats$maf <- maf
  sumstats <- sumstats[, c("anno", "maf", "beta", "se", "p")]
  
  # Output.
  out <- list(
    ld = ld,
    sumstats = sumstats,
    type = type
  )
}

Try the AllelicSeries package in your browser

Any scripts or data that you put into this service are public.

AllelicSeries documentation built on April 3, 2025, 7:46 p.m.