eggla: Early Growth Genetics Longitudinal Analysis

Documented in run_eggla_gwas

#' Perform GWAS using PLINK2 (and BCFtools)
#'
#' Format VCF file(s) by filtering out all variants
#' not satisfaying "--min-alleles 2 --max-alleles 2 --types snps"
#' and setting IDs (if no annotation file using VEP is provided)
#' with "%CHROM:%POS:%REF:%ALT" (see https://samtools.github.io/bcftools/).
#' GWAS is performed on the formatted VCF file(s) by PLINK2 software
#' (https://www.cog-genomics.org/plink/2.0).
#'
#' @param data Path to the phenotypes stored as a CSV file.
#' @param results Paths to the zip archives or directories generated by `run_eggla_lmm()`
#'   (vector of length two, one male and one female path).
#' @param id_column Name of the column where sample/individual IDs are stored.
#' @param traits One or multiple traits, *i.e.*, columns' names from `data`, to be analysed separately.
#' @param covariates One or several covariates, *i.e.*, columns' names from `data`, to be used.
#'   Binary trait should be coded as '1' and '2',
#'   where sex must be coded: '1' = male, '2' = female, 'NA'/'0' = missing.
#' @param vcfs Path to the "raw" VCF file(s) containing
#'   the genotypes of the individuals to be analysed.
#' @param working_directory Directory in which computation will occur and where output files will be saved.
#' @param vep_file Path to the VEP annotation file to be used to set variants RSIDs and add gene SYMBOL, etc.
#' @param use_info A logical indicating whether to extract all informations stored in the "INFO" field.
#' @param bin_path A named list containing the path to the PLINK2 and BCFtools binaries
#'   For PLINK2, an URL to the binary can be provided (see https://www.cog-genomics.org/plink/2.0).
#' @param bcftools_view_options A string or a vector of strings (which will be pass to `paste()`)
#'   containing BCFtools view parameters, _e.g._, `"--min-af 0.05"`, `"--exclude 'INFO/INFO < 0.8'"`,
#'   and/or `"--min-alleles 2 --max-alleles 2 --types snps"`.
#' @param build Build of the genome on which the SNP is orientated. Default is "38".
#' @param strand Orientation of the site to the human genome strand used. Should be "+" (default).
#' @param info_type Type of information provided in the INFO column, _e.g._, "IMPUTE2 info score via 'bcftools +impute-info'",
#' @param threads Number of threads to be used by some BCFtools and PLINK2 commands.
#' @param quiet A logical indicating whether to suppress the output.
#' @param clean A logical indicating whether to clean intermediary files or not.
#'
#' @import data.table
#' @importFrom stats as.formula p.adjust
#' @importFrom utils download.file unzip head
#' @importFrom rlang hash
#' @importFrom future.apply future_lapply
#'
#' @return Path to results file.
#'
#' @export
#'
#' @examples
#' if (interactive()) {
#'   data("bmigrowth")
#'   bmigrowth_csv <- file.path(tempdir(), "bmigrowth.csv")
#'   fwrite(
#'     x = bmigrowth,
#'     file = bmigrowth_csv
#'   )
#'   results_archives <- run_eggla_lmm(
#'     data = fread(
#'       file = file.path(tempdir(), "bmigrowth.csv"),
#'       colClasses = list(character = "ID")
#'     ),
#'     id_variable = "ID",
#'     age_days_variable = NULL,
#'     age_years_variable = "age",
#'     weight_kilograms_variable = "weight",
#'     height_centimetres_variable = "height",
#'     sex_variable = "sex",
#'     covariates = NULL,
#'     male_coded_zero = FALSE,
#'     random_complexity = 1,
#'     parallel = FALSE,
#'     parallel_n_chunks = 1,
#'     working_directory = tempdir()
#'   )
#'   run_eggla_gwas(
#'     data = fread(
#'       file = file.path(tempdir(), "bmigrowth.csv"),
#'       colClasses = list(character = "ID")
#'     ),
#'     results = results_archives,
#'     id_column = "ID",
#'     traits = c("slope_.*", "auc_.*", "^AP_.*", "^AR_.*"),
#'     covariates = c("sex"),
#'     vcfs = list.files(
#'       path = system.file("vcf", package = "eggla"),
#'       pattern = "\\.vcf$|\\.vcf.gz$",
#'       full.names = TRUE
#'     ),
#'     working_directory = tempdir(),
#'     vep_file = NULL,
#'     bin_path = list(
#'       bcftools = "/usr/bin/bcftools",
#'       plink2 = "/usr/bin/plink2"
#'     ),
#'     threads = 1
#'   )
#' }
run_eggla_gwas <- function(
  data,
  results,
  id_column,
  traits = c("slope_.*", "auc_.*", "^AP_.*", "^AR_.*"),
  covariates,
  vcfs,
  working_directory,
  vep_file = NULL,
  use_info = TRUE,
  bin_path = list(
    bcftools = "/usr/bin/bcftools",
    plink2 = "/usr/bin/plink2"
  ),
  bcftools_view_options = NULL,
  build = "38",
  strand = "+",
  info_type = "IMPUTE2 info score via 'bcftools +impute-info'",
  threads = 1,
  quiet = FALSE,
  clean = TRUE
) {
  # no visible binding for global variable from data.table
  INFO <- TEST <- P <- trait_model <- SNPID <- F_MISS <- ID <- NULL

  working_directory <- normalizePath(working_directory)
  results <- normalizePath(results)
  if (length(results) != 2) {
    stop("'results' must be a vector of length 2, i.e., a path for female and a path for male.")
  }
  dir.create(
    path = working_directory,
    recursive = TRUE,
    mode = "0775",
    showWarnings = FALSE
  )

  if (is.null(bcftools_view_options)) bcftools_view_options <- ""

  if (bin_path[["plink2"]] != sprintf("%s/plink2", working_directory) && file.exists(bin_path[["plink2"]])) {
    file.copy(
      from = bin_path[["plink2"]],
      to = sprintf("%s/plink2", working_directory),
      overwrite = TRUE
    )
    if (clean) on.exit(unlink(sprintf("%s/plink2", working_directory)))
  }

  if (
    grepl("^http.*\\.zip$", bin_path[["plink2"]]) && !file.exists(sprintf("%s/plink2", working_directory))
  ) {
    zip_file <- sprintf("%s/plink2.zip", working_directory)
    is_plink_downloaded <- try(
      expr = {
        utils::download.file(url = bin_path[["plink2"]], destfile = zip_file)
        utils::unzip(
          zipfile = zip_file,
          exdir = working_directory,
          files = "plink2"
        )
        unlink(zip_file)
      },
      silent = TRUE
    )
    bin_path[["plink2"]] <- sprintf("%s/plink2", working_directory)
    if (inherits(is_plink_downloaded, "try-error") && !file.exists(sprintf("%s/plink2", working_directory))) {
      stop(
        "Error downloading PLINK2 binary. ",
        "Please check the download URL at https://www.cog-genomics.org/plink/2.0."
      )
    }
  }

  Sys.chmod(sprintf("%s/plink2", working_directory), "0777")

  plink_version <- try(
    expr = system(
      command = sprintf("%s --version", bin_path[["plink2"]]),
      intern = TRUE,
      ignore.stdout = FALSE,
      ignore.stderr = TRUE
    ),
    silent = TRUE
  )
  if (inherits(plink_version, "try-error") || !file.exists(bin_path[["plink2"]])) {
    stop("Please check PLINK2 binary path!")
  }

  bcftools_version <- try(
    expr = system(
      command = sprintf("%s --version", bin_path[["bcftools"]]),
      intern = TRUE,
      ignore.stdout = FALSE,
      ignore.stderr = TRUE
    ),
    silent = TRUE
  )
  if (inherits(bcftools_version, "try-error") || !file.exists(bin_path[["bcftools"]])) {
    stop("Please check BCFTools binary path!")
  }

  derived_files <- sprintf(
    "derived-%s.csv",
    c("slopes", "aucs", "apar")
  )

  if (sum(grepl("\\.zip$", results)) == 2) {
    derived_parameters_dt <- data.table::setnames(
      x = data.table::rbindlist(lapply(
        X = results,
        path = working_directory,
        derived_files = derived_files,
        FUN = function(izip, path, derived_files) {
          on.exit(unlink(file.path(path, derived_files)))
          Reduce(
            f = function(x, y) {
              data.table::merge.data.table(
                x = x,
                y = y,
                by = "egg_id"
              )
            },
            x = lapply(
              X = derived_files,
              izip = izip,
              path = path,
              FUN = function(dfile, izip, path) {
                utils::unzip(
                  zipfile = izip,
                  files = dfile,
                  exdir = path
                )
                data.table::fread(
                  file = file.path(path, dfile),
                  colClasses = list("character" = 1)
                )
              }
            )
          )
        }
      )),
      old = "egg_id",
      new = id_column
    )
  } else {
    derived_parameters_dt <- data.table::setnames(
      x = data.table::rbindlist(lapply(
        X = results,
        derived_files = derived_files,
        FUN = function(idir, derived_files) {
          Reduce(
            f = function(x, y) {
              data.table::merge.data.table(
                x = x,
                y = y,
                by = "egg_id"
              )
            },
            x = lapply(
              X = list.files(
                path = idir,
                pattern = paste(derived_files, collapse = "|"),
                full.names = TRUE
              ),
              FUN = data.table::fread,
              colClasses = list("character" = 1)
            )
          )
        }
      )),
      old = "egg_id",
      new = id_column
    )
  }

  if (!inherits(data, "data.frame")) data <- data.table::fread(data)
  dt <- data.table::merge.data.table(
    x =  data.table::data.table(data)[
      j = (id_column) := lapply(.SD, as.character),
      .SDcols = c(id_column)
    ][
      j = data.table::first(.SD),
      by = c(id_column)
    ],
    y =  data.table::data.table(derived_parameters_dt)[
      j = (id_column) := lapply(.SD, as.character),
      .SDcols = c(id_column)
    ],
    by = id_column
  )

  dt <- dt[
    j = unique(na.exclude(.SD)),
    .SDcols = grep(paste(
      c(id_column, sprintf("^%s$", unique(c(traits, covariates)))),
      collapse = "|"
    ), names(dt), value = TRUE)
  ]

  data.table::setnames(x = dt, old = id_column, new = "#IID")

  traits <- grep(
    pattern = paste(sprintf("^%s$", unique(traits)), collapse = "|"),
    x = names(dt),
    value = TRUE
  )
  covariates <- grep(
    pattern = paste(sprintf("^%s$", unique(covariates)), collapse = "|"),
    x = names(dt),
    value = TRUE
  )
  formula <- stats::as.formula(sprintf(
    fmt = "`%s` ~ %s",
    paste(traits, collapse = "` + `"),
    paste(covariates, collapse = " + ")
  ))

  tmpdir <- file.path(working_directory, "gwas_plink2")
  dir.create(path = tmpdir, recursive = TRUE, mode = "0777")
  if (clean) on.exit(unlink(tmpdir, recursive = TRUE))

  if (length(sex_covariate <- grep("^sex", covariates, value = TRUE, ignore.case = TRUE)) > 1) {
    stop(sprintf(
      "Only one column containing \"sex\" can exist in the model, not %s: \"%s\"",
      length(sex_covariate),
      paste(sex_covariate, collapse = '", "')
    ))
  }

  binary_wrongly_coded <- dt[
    j = names(which(sapply(
      X = .SD,
      FUN = function(.col) {
        data.table::uniqueN(.col) == 2 && 0 %in% unique(.col)
      }
    ))),
    .SDcols = c(traits)
  ]

  if (length(binary_wrongly_coded) > 0) {
    stop(
      "Binary traits must be coded as 1 or 2!\n",
      sprintf("Please check: \"%s\"", paste(binary_wrongly_coded, collapse = '", "'))
    )
  }

  if (
    length(setdiff(
      as.character(unlist(dt[j = unique(.SD), .SDcols = "#IID"], use.names = FALSE)),
      as.character(unique(unlist(
        x = lapply(
          X = paste(bin_path[["bcftools"]], "query", "--list-samples", vcfs),
          FUN = function(x) data.table::fread(cmd = x, colClasses = "character")
        ),
        use.names = FALSE
      )))
    )) != 0
  ) {
    stop(paste0(
      "None of the IDs provided in 'id_column' matches VCFs IDs.\n",
      "Please check your VCF with 'bcftools query --list-samples file.vcf'.\n",
      "To rename your samples, use 'bcftools reheader --samples oldid_newid_space_separated_columns.txt'.\n",
      "See <https://samtools.github.io/bcftools/bcftools.html#reheader> for more details."
    ))
  }

  basename_file <- file.path(tmpdir, rlang::hash(formula))

  data.table::fwrite(
    x = dt[j = unique(.SD), .SDcols = "#IID"],
    file = sprintf("%s.samples", basename_file),
    sep = " ",
    col.names = FALSE
  )

  data.table::fwrite(
    x = dt[j = unique(.SD), .SDcols = c("#IID", traits)],
    file = sprintf("%s.pheno", basename_file),
    sep = " "
  )

  if (length(covariates_not_sex <- setdiff(covariates, sex_covariate)) > 0) {
    data.table::fwrite(
      x = dt[j = unique(.SD), .SDcols = c("#IID", covariates_not_sex)],
      file = sprintf("%s.cov", basename_file),
      sep = " "
    )
  }

  if (length(sex_covariate) > 0) {
    if (length(sex_levels <- unique(dt[[sex_covariate]])) == 2 && 0 %in% sex_levels) {
      warning(
        "Sex must be coded: '1' = male, '2' = female, 'NA'/'0' = missing! ",
        "'0' have been recoded as '2', i.e., female."
      )
      dt[
        j = c(sex_covariate) := lapply(.SD, function(x) c("0" = 2, "1" = 1)[as.character(x)]),
        .SDcols = sex_covariate
      ]
    }
    coding_frequencies <- sort(table(dt[[sex_covariate]]))
    if (length(coding_frequencies) == 3 && "O" %in% utils::head(names(coding_frequencies), -1)) {
      warning(
        "Too many '0' have been detected!",
        "Sex must be coded: '1' = male, '2' = female, 'NA'/'0' = missing! "
      )
    }

    data.table::fwrite(
      x = data.table::setnames(
        x = data.table::copy(dt)[j = unique(.SD), .SDcols = c("#IID", sex_covariate)],
        old = sex_covariate,
        new = "SEX"
      ),
      file = sprintf("%s.sex", basename_file),
      sep = " "
    )
  }

  if (!quiet) message("Formatting VCFs ...")
  if (nzchar(system.file(package = "future.apply"))) {
    eggla_lapply <- function(X, FUN, ...) {
      future.apply::future_lapply(
        X = X,
        future.globals = FALSE,
        future.packages = "data.table",
        FUN = FUN,
        ...
      )
    }
  } else {
    eggla_lapply <- function(X, FUN, ...) {
      lapply(
        X = X,
        FUN = FUN,
        ...
      )
    }
  }

  list_results <- eggla_lapply(
    X = vcfs,
    basename_file = basename_file,
    vep_file = vep_file,
    bin_path = bin_path,
    bcftools_view_options = bcftools_view_options,
    build = build,
    strand = strand,
    info_type = info_type,
    use_info = use_info,
    FUN = function(vcf, basename_file, vep_file, bin_path, bcftools_view_options, build, strand, info_type, use_info) {
      vcf_file <- sprintf("%s__%s", basename_file, basename(vcf))
      results_file <- sub("\\.vcf.gz", "", vcf_file)
      cmd <- paste(
        bin_path[["bcftools"]],
          "+fill-tags", vcf,
       "|",
        bin_path[["bcftools"]],
          "view",
          bcftools_view_options,
          "--force-samples",
          "--samples-file", sprintf("%s.samples", basename_file)
      )

      if (!is.null(vep_file) && file.exists(vep_file)) {
        cmd <- paste(
          cmd,
          "|",
          bin_path[["bcftools"]],
            "annotate",
            "--annotations", vep_file,
            "--header-lines", sub("_formatted.tsv.gz", ".header", vep_file),
            "--columns CHROM,POS,Gene,Symbol,rsid",
          "|",
          bin_path[["bcftools"]],
            "annotate",
            "--set-id '%INFO/rsid'",
          "|",
          bin_path[["bcftools"]],
            "annotate",
            "--set-id +'%CHROM:%POS:%REF:%ALT'",
            "--output-type z --output", vcf_file
        )
      } else {
        cmd <- paste(
          cmd,
          "|",
          bin_path[["bcftools"]],
            "annotate",
            "--set-id '%CHROM:%POS:%REF:%ALT'",
            "--output-type z --output", vcf_file
        )
      }

      system(cmd)

      if (!quiet) message(sprintf("[%s] Performing PLINK2 regression ...", basename(vcf)))
      system(paste(c(
        bin_path[["plink2"]],
        "--vcf", vcf_file, "dosage=DS",
        "--threads", threads,
        "--glm sex", "cols=+chrom,+pos,+omitted,+a1count,+totallele,+a1freq,+machr2,+nobs,+se,+p,+err",
        if (file.exists(sprintf("%s.cov", basename_file))) c("--covar", sprintf("%s.cov", basename_file)) else "allow-no-covars",
        if (file.exists(sprintf("%s.samples", basename_file))) c("--keep", sprintf("%s.samples", basename_file)),
        if (file.exists(sprintf("%s.sex", basename_file))) c("--update-sex", sprintf("%s.sex", basename_file)),
        if (file.exists(sprintf("%s.pheno", basename_file))) c("--pheno", sprintf("%s.pheno", basename_file)),
        "--covar-variance-standardize",
        "--silent",
        "--out", results_file
      ), collapse = " "))

      if (!quiet) message(sprintf("[%s] Combining results files ...", basename(vcf)))
      results <- data.table::setnames(
        x = data.table::rbindlist(
          l = lapply(
            X = (function(x) `names<-`(x, sub("[^.]*\\.", "", x)))(
              list.files(
                path = dirname(results_file),
                pattern = sprintf("%s\\..*\\.glm\\..*", basename(results_file)),
                full.names = TRUE
              )
            ),
            FUN = data.table::fread
          ),
          idcol = "trait_model"
        ),
        old = function(x) sub("^#", "", x)
      )[
        TEST %in% "ADD", -c("TEST", "T_STAT", "REF", "ALT")
      ]
      data.table::setnames(
        x = results,
        old = c(
          "CHROM", "POS", "ID", "A1", "OMITTED",
          "A1_FREQ", "MACH_R2", "OBS_CT", "BETA", "SE", "P", "ERRCODE"
        ),
        new = c(
          "CHR", "POS", "SNPID", "EFFECT_ALLELE", "NON_EFFECT_ALLELE",
          "EAF", "PLINK2_MACH_R2", "N", "BETA", "SE", "P", "ERRCODE"
        )
      )
      output_results_file <- sprintf("%s.results.gz", results_file)

      if (use_info) {
        if (!quiet) message(sprintf("[%s] Extracting INFO ...", basename(vcf)))
        info_fields_available <- sapply(
          X = unlist(
            x = data.table::fread(
              cmd = paste(
                bin_path[["bcftools"]], "query", vcf_file, "--format",
                "'%INFO\n'"
              ),
              header = FALSE,
              nrows = 1
            ),
            use.names = FALSE
          ),
          FUN = sub,
          pattern = "=.*",
          replacement = "",
          USE.NAMES = FALSE
        )

        info_fields_required <- c("INFO", "R2")
        names(info_fields_required) <- c("INFO", "INFO")

        annot <- data.table::fread(
          cmd = paste(
            bin_path[["bcftools"]], "query", vcf_file, "--format",
            paste0(
              "'%ID\t%CHROM\t%POS\t%ALT\t%REF\t",
              paste(
                paste0("%INFO/", intersect(info_fields_required, info_fields_available)),
                collapse = "\t"
              ),
              "\n'"
            )
          ),
          col.names = c(
            "SNPID", "CHR", "POS", "EFFECT_ALLELE", "NON_EFFECT_ALLELE",
            names(info_fields_required)[which(info_fields_required %in% info_fields_available)]
          )
        )[
          j = list(
            ID = SNPID,
            INFO,
            IMPUTED = as.numeric(INFO != 1),
            INFO_TYPE = data.table::fifelse(INFO != 1, info_type, NA_character_)
          )
        ]
        annotated_results <- merge(
          x = results,
          y = annot,
          by.x = "SNPID",
          by.y = "ID",
          all.x = TRUE
        )
      } else {
        annotated_results <- results
      }

      if (!quiet) message(sprintf("[%s] Computing PLINK2 missing rate ...", basename(vcf)))
      system(paste(c(
        bin_path[["plink2"]],
        "--vcf", vcf_file, "dosage=DS",
        "--threads", threads,
        "--missing", "vcols=+chrom,+nmissdosage,+nobs,+fmiss",
        if (file.exists(sprintf("%s.samples", basename_file))) c("--keep", sprintf("%s.samples", basename_file)),
        "--silent",
        "--out", results_file
      ), collapse = " "))

      if (!quiet) message(sprintf("[%s] Computing PLINK2 Hardy-Weinberg test ...", basename(vcf)))
      system(paste(c(
        bin_path[["plink2"]],
        "--vcf", vcf_file, "dosage=DS",
        "--threads", threads,
        "--hardy",
        if (file.exists(sprintf("%s.samples", basename_file))) c("--keep", sprintf("%s.samples", basename_file)),
        "--silent",
        "--out", results_file
      ), collapse = " "))

      annotated_results <- merge(
        x = annotated_results,
        y = merge(
          x = data.table::fread(sprintf("%s.vmiss", results_file))[j = list(ID, CALL_RATE = 1 - F_MISS)],
          y = data.table::fread(sprintf("%s.hardy", results_file))[j = list(ID, HWE_P = P)],
          by = "ID"
        ),
        by.x = "SNPID",
        by.y = "ID"
      )[
        j = .SD,
        .SDcols = -c("A1_CT", "ALLELE_CT")
      ]

      data.table::fwrite(x = annotated_results, file = output_results_file)
      if (!quiet) message(sprintf("[%s] Results written in \"%s\"", basename(vcf), output_results_file))

      output_results_file
    }
  )

  if (!quiet) message("Writing trait results ...")
  results_files <- data.table::setcolorder(
    x = data.table::rbindlist(
      l = lapply(list_results, data.table::fread),
      use.names = TRUE
    )[
      j = `:=`(
        COVARIATES = paste(covariates, collapse = "+"),
        STRAND = strand,
        BUILD = build
      ),
      by = "trait_model"
    ][order(P)],
    neworder = c(
      "trait_model",
      "SNPID", "STRAND", "BUILD", "CHR", "POS",
      "EFFECT_ALLELE", "NON_EFFECT_ALLELE", "N",
      "EAF", "BETA", "SE", "P",
      "IMPUTED", "INFO_TYPE", "INFO", "PLINK2_MACH_R2",
      "CALL_RATE", "HWE_P",
      "ERRCODE", "COVARIATES"
    )
  )[
    j = list(file = (function(dt, tm) {
      rf <- sprintf(file.path(working_directory, "%s-gwas.txt.gz"), tm)
      data.table::fwrite(x = .SD, file = rf, sep = "\t")
      rf
    })(.SD, trait_model)),
    by = "trait_model"
  ]

  if (!quiet) message("Writing software versions ...")
  writeLines(
    text = c(R.version.string, plink_version, bcftools_version),
    con = file.path(working_directory, "gwas-software.txt")
  )

  c(results_files[["file"]], results, file.path(working_directory, "gwas-software.txt"))
}