#' Perform GWAS using PLINK2 (and BCFtools)
#'
#' Format VCF file(s) by filtering out all variants
#' not satisfaying "--min-alleles 2 --max-alleles 2 --types snps"
#' and setting IDs (if no annotation file using VEP is provided)
#' with "%CHROM:%POS:%REF:%ALT" (see https://samtools.github.io/bcftools/).
#' GWAS is performed on the formatted VCF file(s) by PLINK2 software
#' (https://www.cog-genomics.org/plink/2.0).
#'
#' @param data Path to the phenotypes stored as a CSV file.
#' @param results Paths to the zip archives or directories generated by `run_eggla_lmm()`
#' (vector of length two, one male and one female path).
#' @param id_column Name of the column where sample/individual IDs are stored.
#' @param traits One or multiple traits, *i.e.*, columns' names from `data`, to be analysed separately.
#' @param covariates One or several covariates, *i.e.*, columns' names from `data`, to be used.
#' Binary trait should be coded as '1' and '2',
#' where sex must be coded: '1' = male, '2' = female, 'NA'/'0' = missing.
#' @param vcfs Path to the "raw" VCF file(s) containing
#' the genotypes of the individuals to be analysed.
#' @param working_directory Directory in which computation will occur and where output files will be saved.
#' @param vep_file Path to the VEP annotation file to be used to set variants RSIDs and add gene SYMBOL, etc.
#' @param use_info A logical indicating whether to extract all informations stored in the "INFO" field.
#' @param bin_path A named list containing the path to the PLINK2 and BCFtools binaries
#' For PLINK2, an URL to the binary can be provided (see https://www.cog-genomics.org/plink/2.0).
#' @param bcftools_view_options A string or a vector of strings (which will be pass to `paste()`)
#' containing BCFtools view parameters, _e.g._, `"--min-af 0.05"`, `"--exclude 'INFO/INFO < 0.8'"`,
#' and/or `"--min-alleles 2 --max-alleles 2 --types snps"`.
#' @param build Build of the genome on which the SNP is orientated. Default is "38".
#' @param strand Orientation of the site to the human genome strand used. Should be "+" (default).
#' @param info_type Type of information provided in the INFO column, _e.g._, "IMPUTE2 info score via 'bcftools +impute-info'",
#' @param threads Number of threads to be used by some BCFtools and PLINK2 commands.
#' @param quiet A logical indicating whether to suppress the output.
#' @param clean A logical indicating whether to clean intermediary files or not.
#'
#' @import data.table
#' @importFrom stats as.formula p.adjust
#' @importFrom utils download.file unzip head
#' @importFrom rlang hash
#' @importFrom future.apply future_lapply
#'
#' @return Path to results file.
#'
#' @export
#'
#' @examples
#' if (interactive()) {
#' data("bmigrowth")
#' bmigrowth_csv <- file.path(tempdir(), "bmigrowth.csv")
#' fwrite(
#' x = bmigrowth,
#' file = bmigrowth_csv
#' )
#' results_archives <- run_eggla_lmm(
#' data = fread(
#' file = file.path(tempdir(), "bmigrowth.csv"),
#' colClasses = list(character = "ID")
#' ),
#' id_variable = "ID",
#' age_days_variable = NULL,
#' age_years_variable = "age",
#' weight_kilograms_variable = "weight",
#' height_centimetres_variable = "height",
#' sex_variable = "sex",
#' covariates = NULL,
#' male_coded_zero = FALSE,
#' random_complexity = 1,
#' parallel = FALSE,
#' parallel_n_chunks = 1,
#' working_directory = tempdir()
#' )
#' run_eggla_gwas(
#' data = fread(
#' file = file.path(tempdir(), "bmigrowth.csv"),
#' colClasses = list(character = "ID")
#' ),
#' results = results_archives,
#' id_column = "ID",
#' traits = c("slope_.*", "auc_.*", "^AP_.*", "^AR_.*"),
#' covariates = c("sex"),
#' vcfs = list.files(
#' path = system.file("vcf", package = "eggla"),
#' pattern = "\\.vcf$|\\.vcf.gz$",
#' full.names = TRUE
#' ),
#' working_directory = tempdir(),
#' vep_file = NULL,
#' bin_path = list(
#' bcftools = "/usr/bin/bcftools",
#' plink2 = "/usr/bin/plink2"
#' ),
#' threads = 1
#' )
#' }
run_eggla_gwas <- function(
data,
results,
id_column,
traits = c("slope_.*", "auc_.*", "^AP_.*", "^AR_.*"),
covariates,
vcfs,
working_directory,
vep_file = NULL,
use_info = TRUE,
bin_path = list(
bcftools = "/usr/bin/bcftools",
plink2 = "/usr/bin/plink2"
),
bcftools_view_options = NULL,
build = "38",
strand = "+",
info_type = "IMPUTE2 info score via 'bcftools +impute-info'",
threads = 1,
quiet = FALSE,
clean = TRUE
) {
# no visible binding for global variable from data.table
INFO <- TEST <- P <- trait_model <- SNPID <- F_MISS <- ID <- NULL
working_directory <- normalizePath(working_directory)
results <- normalizePath(results)
if (length(results) != 2) {
stop("'results' must be a vector of length 2, i.e., a path for female and a path for male.")
}
dir.create(
path = working_directory,
recursive = TRUE,
mode = "0775",
showWarnings = FALSE
)
if (is.null(bcftools_view_options)) bcftools_view_options <- ""
if (bin_path[["plink2"]] != sprintf("%s/plink2", working_directory) && file.exists(bin_path[["plink2"]])) {
file.copy(
from = bin_path[["plink2"]],
to = sprintf("%s/plink2", working_directory),
overwrite = TRUE
)
if (clean) on.exit(unlink(sprintf("%s/plink2", working_directory)))
}
if (
grepl("^http.*\\.zip$", bin_path[["plink2"]]) && !file.exists(sprintf("%s/plink2", working_directory))
) {
zip_file <- sprintf("%s/plink2.zip", working_directory)
is_plink_downloaded <- try(
expr = {
utils::download.file(url = bin_path[["plink2"]], destfile = zip_file)
utils::unzip(
zipfile = zip_file,
exdir = working_directory,
files = "plink2"
)
unlink(zip_file)
},
silent = TRUE
)
bin_path[["plink2"]] <- sprintf("%s/plink2", working_directory)
if (inherits(is_plink_downloaded, "try-error") && !file.exists(sprintf("%s/plink2", working_directory))) {
stop(
"Error downloading PLINK2 binary. ",
"Please check the download URL at https://www.cog-genomics.org/plink/2.0."
)
}
}
Sys.chmod(sprintf("%s/plink2", working_directory), "0777")
plink_version <- try(
expr = system(
command = sprintf("%s --version", bin_path[["plink2"]]),
intern = TRUE,
ignore.stdout = FALSE,
ignore.stderr = TRUE
),
silent = TRUE
)
if (inherits(plink_version, "try-error") || !file.exists(bin_path[["plink2"]])) {
stop("Please check PLINK2 binary path!")
}
bcftools_version <- try(
expr = system(
command = sprintf("%s --version", bin_path[["bcftools"]]),
intern = TRUE,
ignore.stdout = FALSE,
ignore.stderr = TRUE
),
silent = TRUE
)
if (inherits(bcftools_version, "try-error") || !file.exists(bin_path[["bcftools"]])) {
stop("Please check BCFTools binary path!")
}
derived_files <- sprintf(
"derived-%s.csv",
c("slopes", "aucs", "apar")
)
if (sum(grepl("\\.zip$", results)) == 2) {
derived_parameters_dt <- data.table::setnames(
x = data.table::rbindlist(lapply(
X = results,
path = working_directory,
derived_files = derived_files,
FUN = function(izip, path, derived_files) {
on.exit(unlink(file.path(path, derived_files)))
Reduce(
f = function(x, y) {
data.table::merge.data.table(
x = x,
y = y,
by = "egg_id"
)
},
x = lapply(
X = derived_files,
izip = izip,
path = path,
FUN = function(dfile, izip, path) {
utils::unzip(
zipfile = izip,
files = dfile,
exdir = path
)
data.table::fread(
file = file.path(path, dfile),
colClasses = list("character" = 1)
)
}
)
)
}
)),
old = "egg_id",
new = id_column
)
} else {
derived_parameters_dt <- data.table::setnames(
x = data.table::rbindlist(lapply(
X = results,
derived_files = derived_files,
FUN = function(idir, derived_files) {
Reduce(
f = function(x, y) {
data.table::merge.data.table(
x = x,
y = y,
by = "egg_id"
)
},
x = lapply(
X = list.files(
path = idir,
pattern = paste(derived_files, collapse = "|"),
full.names = TRUE
),
FUN = data.table::fread,
colClasses = list("character" = 1)
)
)
}
)),
old = "egg_id",
new = id_column
)
}
if (!inherits(data, "data.frame")) data <- data.table::fread(data)
dt <- data.table::merge.data.table(
x = data.table::data.table(data)[
j = (id_column) := lapply(.SD, as.character),
.SDcols = c(id_column)
][
j = data.table::first(.SD),
by = c(id_column)
],
y = data.table::data.table(derived_parameters_dt)[
j = (id_column) := lapply(.SD, as.character),
.SDcols = c(id_column)
],
by = id_column
)
dt <- dt[
j = unique(na.exclude(.SD)),
.SDcols = grep(paste(
c(id_column, sprintf("^%s$", unique(c(traits, covariates)))),
collapse = "|"
), names(dt), value = TRUE)
]
data.table::setnames(x = dt, old = id_column, new = "#IID")
traits <- grep(
pattern = paste(sprintf("^%s$", unique(traits)), collapse = "|"),
x = names(dt),
value = TRUE
)
covariates <- grep(
pattern = paste(sprintf("^%s$", unique(covariates)), collapse = "|"),
x = names(dt),
value = TRUE
)
formula <- stats::as.formula(sprintf(
fmt = "`%s` ~ %s",
paste(traits, collapse = "` + `"),
paste(covariates, collapse = " + ")
))
tmpdir <- file.path(working_directory, "gwas_plink2")
dir.create(path = tmpdir, recursive = TRUE, mode = "0777")
if (clean) on.exit(unlink(tmpdir, recursive = TRUE))
if (length(sex_covariate <- grep("^sex", covariates, value = TRUE, ignore.case = TRUE)) > 1) {
stop(sprintf(
"Only one column containing \"sex\" can exist in the model, not %s: \"%s\"",
length(sex_covariate),
paste(sex_covariate, collapse = '", "')
))
}
binary_wrongly_coded <- dt[
j = names(which(sapply(
X = .SD,
FUN = function(.col) {
data.table::uniqueN(.col) == 2 && 0 %in% unique(.col)
}
))),
.SDcols = c(traits)
]
if (length(binary_wrongly_coded) > 0) {
stop(
"Binary traits must be coded as 1 or 2!\n",
sprintf("Please check: \"%s\"", paste(binary_wrongly_coded, collapse = '", "'))
)
}
if (
length(setdiff(
as.character(unlist(dt[j = unique(.SD), .SDcols = "#IID"], use.names = FALSE)),
as.character(unique(unlist(
x = lapply(
X = paste(bin_path[["bcftools"]], "query", "--list-samples", vcfs),
FUN = function(x) data.table::fread(cmd = x, colClasses = "character")
),
use.names = FALSE
)))
)) != 0
) {
stop(paste0(
"None of the IDs provided in 'id_column' matches VCFs IDs.\n",
"Please check your VCF with 'bcftools query --list-samples file.vcf'.\n",
"To rename your samples, use 'bcftools reheader --samples oldid_newid_space_separated_columns.txt'.\n",
"See <https://samtools.github.io/bcftools/bcftools.html#reheader> for more details."
))
}
basename_file <- file.path(tmpdir, rlang::hash(formula))
data.table::fwrite(
x = dt[j = unique(.SD), .SDcols = "#IID"],
file = sprintf("%s.samples", basename_file),
sep = " ",
col.names = FALSE
)
data.table::fwrite(
x = dt[j = unique(.SD), .SDcols = c("#IID", traits)],
file = sprintf("%s.pheno", basename_file),
sep = " "
)
if (length(covariates_not_sex <- setdiff(covariates, sex_covariate)) > 0) {
data.table::fwrite(
x = dt[j = unique(.SD), .SDcols = c("#IID", covariates_not_sex)],
file = sprintf("%s.cov", basename_file),
sep = " "
)
}
if (length(sex_covariate) > 0) {
if (length(sex_levels <- unique(dt[[sex_covariate]])) == 2 && 0 %in% sex_levels) {
warning(
"Sex must be coded: '1' = male, '2' = female, 'NA'/'0' = missing! ",
"'0' have been recoded as '2', i.e., female."
)
dt[
j = c(sex_covariate) := lapply(.SD, function(x) c("0" = 2, "1" = 1)[as.character(x)]),
.SDcols = sex_covariate
]
}
coding_frequencies <- sort(table(dt[[sex_covariate]]))
if (length(coding_frequencies) == 3 && "O" %in% utils::head(names(coding_frequencies), -1)) {
warning(
"Too many '0' have been detected!",
"Sex must be coded: '1' = male, '2' = female, 'NA'/'0' = missing! "
)
}
data.table::fwrite(
x = data.table::setnames(
x = data.table::copy(dt)[j = unique(.SD), .SDcols = c("#IID", sex_covariate)],
old = sex_covariate,
new = "SEX"
),
file = sprintf("%s.sex", basename_file),
sep = " "
)
}
if (!quiet) message("Formatting VCFs ...")
if (nzchar(system.file(package = "future.apply"))) {
eggla_lapply <- function(X, FUN, ...) {
future.apply::future_lapply(
X = X,
future.globals = FALSE,
future.packages = "data.table",
FUN = FUN,
...
)
}
} else {
eggla_lapply <- function(X, FUN, ...) {
lapply(
X = X,
FUN = FUN,
...
)
}
}
list_results <- eggla_lapply(
X = vcfs,
basename_file = basename_file,
vep_file = vep_file,
bin_path = bin_path,
bcftools_view_options = bcftools_view_options,
build = build,
strand = strand,
info_type = info_type,
use_info = use_info,
FUN = function(vcf, basename_file, vep_file, bin_path, bcftools_view_options, build, strand, info_type, use_info) {
vcf_file <- sprintf("%s__%s", basename_file, basename(vcf))
results_file <- sub("\\.vcf.gz", "", vcf_file)
cmd <- paste(
bin_path[["bcftools"]],
"+fill-tags", vcf,
"|",
bin_path[["bcftools"]],
"view",
bcftools_view_options,
"--force-samples",
"--samples-file", sprintf("%s.samples", basename_file)
)
if (!is.null(vep_file) && file.exists(vep_file)) {
cmd <- paste(
cmd,
"|",
bin_path[["bcftools"]],
"annotate",
"--annotations", vep_file,
"--header-lines", sub("_formatted.tsv.gz", ".header", vep_file),
"--columns CHROM,POS,Gene,Symbol,rsid",
"|",
bin_path[["bcftools"]],
"annotate",
"--set-id '%INFO/rsid'",
"|",
bin_path[["bcftools"]],
"annotate",
"--set-id +'%CHROM:%POS:%REF:%ALT'",
"--output-type z --output", vcf_file
)
} else {
cmd <- paste(
cmd,
"|",
bin_path[["bcftools"]],
"annotate",
"--set-id '%CHROM:%POS:%REF:%ALT'",
"--output-type z --output", vcf_file
)
}
system(cmd)
if (!quiet) message(sprintf("[%s] Performing PLINK2 regression ...", basename(vcf)))
system(paste(c(
bin_path[["plink2"]],
"--vcf", vcf_file, "dosage=DS",
"--threads", threads,
"--glm sex", "cols=+chrom,+pos,+omitted,+a1count,+totallele,+a1freq,+machr2,+nobs,+se,+p,+err",
if (file.exists(sprintf("%s.cov", basename_file))) c("--covar", sprintf("%s.cov", basename_file)) else "allow-no-covars",
if (file.exists(sprintf("%s.samples", basename_file))) c("--keep", sprintf("%s.samples", basename_file)),
if (file.exists(sprintf("%s.sex", basename_file))) c("--update-sex", sprintf("%s.sex", basename_file)),
if (file.exists(sprintf("%s.pheno", basename_file))) c("--pheno", sprintf("%s.pheno", basename_file)),
"--covar-variance-standardize",
"--silent",
"--out", results_file
), collapse = " "))
if (!quiet) message(sprintf("[%s] Combining results files ...", basename(vcf)))
results <- data.table::setnames(
x = data.table::rbindlist(
l = lapply(
X = (function(x) `names<-`(x, sub("[^.]*\\.", "", x)))(
list.files(
path = dirname(results_file),
pattern = sprintf("%s\\..*\\.glm\\..*", basename(results_file)),
full.names = TRUE
)
),
FUN = data.table::fread
),
idcol = "trait_model"
),
old = function(x) sub("^#", "", x)
)[
TEST %in% "ADD", -c("TEST", "T_STAT", "REF", "ALT")
]
data.table::setnames(
x = results,
old = c(
"CHROM", "POS", "ID", "A1", "OMITTED",
"A1_FREQ", "MACH_R2", "OBS_CT", "BETA", "SE", "P", "ERRCODE"
),
new = c(
"CHR", "POS", "SNPID", "EFFECT_ALLELE", "NON_EFFECT_ALLELE",
"EAF", "PLINK2_MACH_R2", "N", "BETA", "SE", "P", "ERRCODE"
)
)
output_results_file <- sprintf("%s.results.gz", results_file)
if (use_info) {
if (!quiet) message(sprintf("[%s] Extracting INFO ...", basename(vcf)))
info_fields_available <- sapply(
X = unlist(
x = data.table::fread(
cmd = paste(
bin_path[["bcftools"]], "query", vcf_file, "--format",
"'%INFO\n'"
),
header = FALSE,
nrows = 1
),
use.names = FALSE
),
FUN = sub,
pattern = "=.*",
replacement = "",
USE.NAMES = FALSE
)
info_fields_required <- c("INFO", "R2")
names(info_fields_required) <- c("INFO", "INFO")
annot <- data.table::fread(
cmd = paste(
bin_path[["bcftools"]], "query", vcf_file, "--format",
paste0(
"'%ID\t%CHROM\t%POS\t%ALT\t%REF\t",
paste(
paste0("%INFO/", intersect(info_fields_required, info_fields_available)),
collapse = "\t"
),
"\n'"
)
),
col.names = c(
"SNPID", "CHR", "POS", "EFFECT_ALLELE", "NON_EFFECT_ALLELE",
names(info_fields_required)[which(info_fields_required %in% info_fields_available)]
)
)[
j = list(
ID = SNPID,
INFO,
IMPUTED = as.numeric(INFO != 1),
INFO_TYPE = data.table::fifelse(INFO != 1, info_type, NA_character_)
)
]
annotated_results <- merge(
x = results,
y = annot,
by.x = "SNPID",
by.y = "ID",
all.x = TRUE
)
} else {
annotated_results <- results
}
if (!quiet) message(sprintf("[%s] Computing PLINK2 missing rate ...", basename(vcf)))
system(paste(c(
bin_path[["plink2"]],
"--vcf", vcf_file, "dosage=DS",
"--threads", threads,
"--missing", "vcols=+chrom,+nmissdosage,+nobs,+fmiss",
if (file.exists(sprintf("%s.samples", basename_file))) c("--keep", sprintf("%s.samples", basename_file)),
"--silent",
"--out", results_file
), collapse = " "))
if (!quiet) message(sprintf("[%s] Computing PLINK2 Hardy-Weinberg test ...", basename(vcf)))
system(paste(c(
bin_path[["plink2"]],
"--vcf", vcf_file, "dosage=DS",
"--threads", threads,
"--hardy",
if (file.exists(sprintf("%s.samples", basename_file))) c("--keep", sprintf("%s.samples", basename_file)),
"--silent",
"--out", results_file
), collapse = " "))
annotated_results <- merge(
x = annotated_results,
y = merge(
x = data.table::fread(sprintf("%s.vmiss", results_file))[j = list(ID, CALL_RATE = 1 - F_MISS)],
y = data.table::fread(sprintf("%s.hardy", results_file))[j = list(ID, HWE_P = P)],
by = "ID"
),
by.x = "SNPID",
by.y = "ID"
)[
j = .SD,
.SDcols = -c("A1_CT", "ALLELE_CT")
]
data.table::fwrite(x = annotated_results, file = output_results_file)
if (!quiet) message(sprintf("[%s] Results written in \"%s\"", basename(vcf), output_results_file))
output_results_file
}
)
if (!quiet) message("Writing trait results ...")
results_files <- data.table::setcolorder(
x = data.table::rbindlist(
l = lapply(list_results, data.table::fread),
use.names = TRUE
)[
j = `:=`(
COVARIATES = paste(covariates, collapse = "+"),
STRAND = strand,
BUILD = build
),
by = "trait_model"
][order(P)],
neworder = c(
"trait_model",
"SNPID", "STRAND", "BUILD", "CHR", "POS",
"EFFECT_ALLELE", "NON_EFFECT_ALLELE", "N",
"EAF", "BETA", "SE", "P",
"IMPUTED", "INFO_TYPE", "INFO", "PLINK2_MACH_R2",
"CALL_RATE", "HWE_P",
"ERRCODE", "COVARIATES"
)
)[
j = list(file = (function(dt, tm) {
rf <- sprintf(file.path(working_directory, "%s-gwas.txt.gz"), tm)
data.table::fwrite(x = .SD, file = rf, sep = "\t")
rf
})(.SD, trait_model)),
by = "trait_model"
]
if (!quiet) message("Writing software versions ...")
writeLines(
text = c(R.version.string, plink_version, bcftools_version),
con = file.path(working_directory, "gwas-software.txt")
)
c(results_files[["file"]], results, file.path(working_directory, "gwas-software.txt"))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.