R/~old/gpc/calcMAFdiff.R

Defines functions doCalc

#' Calculates difference in minor allele frequency between two populations
#' for use in runGSEA.R
#'
#' @param genoF (char) path to file with SNP genotype data (PLINK format)
#' @param realFAM (char) path to PLINK case/control coded fam file
#' @param phenoPerm (logical) runs GSEA using phenotype permutations
#'		(i.e., randomly re-shuffing case/control labels n times).
#' 		if FALSE, will run GSEA using genotype permutations (see setupGSEArun.R)
#' @param permFAM (char) path to files with fam files from re-shuffled
#'    case/control labels. If phenoPerm is FALSE, leave this blank.
#' @param PLINK (char) path to PLINK executable
#' @param absMAF (logical) if TRUE, calculates absolute ΔMAF
#'    if FALSE, omits absMAF() and calculates directional ΔMAF (default)
#' @param outDir (char) directory to store output files (PLINK frequency
#'    calculations plus GSEA input file)
#' @return (char) vector of data frame with two columns: 1) SNP ID (marker),
#'    2) SNP association statistic
#' @export

require(dplyr)
require(tools)
require(data.table)

args    <- commandArgs(TRUE)
genoF   <- args[1]
realFAM <- args[2]
PLINK   <- args[3]
#permFAM 
outDir  <- args[4]

phenoPerm <- FALSE
absMAF    <- FALSE

doCalc <- function(famF, plinkOut, frqOut) {

  # Setup PLINK to calculate MAF between case/control populations
  cat("* Calculating phenotype-stratified minor allele frequencies\n")
  str1 <- sprintf("%s --bed %s.bed --bim %s.bim --fam %s", PLINK, genoF,
                  genoF, famF)
  str2 <- sprintf("--freq case-control --allow-no-sex")
  str3 <- sprintf("--out %s", plinkOut)
  cmd  <- sprintf("%s %s %s", str1, str2, str3)
  system(cmd)
  cat(" done.\n")

  # Read in generated frequency file and calculate difference in minor
  # allele frequency for each SNP between both populations
  # File format reference for .frq.cc file (via PLINK website)
  #  A1:    allele 1 (usually minor)
  #  MAF_A: allele 1 frequency in cases
  #  MAF_U: allele 1 frequency in controls
  frq_cc  <- fread(sprintf("%s.frq.cc", plinkOut), h=T, data.table=F)

  cat("* Calculating difference in minor allele frequencies b/w pops...")
  if (absMAF == TRUE) {
    frq_cc$MAFdiff <- abs(frq_cc$MAF_U - frq_cc$MAF_A)
  } else {
    frq_cc$MAFdiff <- frq_cc$MAF_U - frq_cc$MAF_A
  }
  marker_MAF_input <- subset(frq_cc, select=c(SNP, MAFdiff))
  marker_MAF_input[is.na(marker_MAF_input)] <- 0
  cat(" done.\n")

  # NOTE: header of column 2 is labelled "CHI2" for purposes of reading into
  # calculate_gsea.pl. This script explicitly accepts two header labels,
  # 'P' or 'CHI2'. The column still contains ΔMAF values
  cat(sprintf("Writing out ΔMAF file to %s.\n", frqOut))
  write.table(marker_MAF_input, file=frqOut,
              col.names=c("Marker", "CHI2"), row=F, sep="\t", quote=F)

}

# Real calculation
doCalc(famF=realFAM,
       plinkOut=sprintf("%s/%s", outDir, basename(genoF)),
       frqOut=sprintf("%s/markerMAF.txt", outDir))

if (phenoPerm == TRUE) {
  # Perm calculations (shuffled case/control labels)
  # NOTE: using file_path_sans_ext() to ensure [k].fam corresponds with
  # [k].frq.cc and [k].txt (PLINK numerical order different from R)
  # i.e., with PLINK order is 100, 10, 11, ..., 19, 1, 20, 21, 22... etc
  # while with R length() order is 1, 2, 3, 4, 5, 6... etc
  perm_files <- list.files(path=permFAM, pattern="*.fam$", full.names=T)
  for (k in 1:length(perm_files)) {
    doCalc(famF=perm_files[k],
           plinkOut=file_path_sans_ext(perm_files[k]),
           frqOut=sprintf("%s.txt", file_path_sans_ext(perm_files[k])))
  }
} else {
  return(NULL)
}
BaderLab/POPPATHR documentation built on Dec. 17, 2021, 9:53 a.m.