#!/usr/bin/env Rscript
# (C) Copyright 2019 Sur Herrera Paredes
#
# This file is part of HMVAR.
#
# HMVAR is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# HMVAR is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with HMVAR. If not, see <http://www.gnu.org/licenses/>.
library(HMVAR)
library(tidyverse)
library(argparser)
process_arguments <- function(){
p <- arg_parser(paste("Perform metawas on a genome"))
# Positional arguments
p <- add_argument(p, "midas_dir",
help = paste("Directory with midas merge output. Must have",
"snps_info.txt, snps_freq.txt and snps_depth.txt"),
type = "character")
p <- add_argument(p, "focal_group",
help = paste("Group that is going to be compared against the rest",
"in the LMM. It must correspond to one of the levels",
"in the Group column of the map_file"),
type = "character")
# Optional arguments
p <- add_argument(p, "--map_file",
help = paste("Mapping file. Must be tab-delimited and have ",
"ID and Group columns."),
type = "character",
default = "map.txt")
p <- add_argument(p, "--outdir",
help = paste("Directory for output"),
type = "character",
default = "metawas")
p <- add_argument(p, "--gemma",
help = paste("GEMMA executable."),
type = "character",
default = "gemma")
p <- add_argument(p, "--impute",
help = paste("If passed, imputation will be attempted",
"before LMM."),
flag = TRUE)
p <- add_argument(p, "--pcs",
help = paste("File with community abundance principal",
"components. Must be tab-delimited, the first",
"column must be named 'ID' and correspond to",
"the sample IDs, and there should be one column",
"per principal component."),
type = "character",
default = NULL)
p <- add_argument(p, "--pval_thres",
help = paste("p-value threshold for considering significance"),
type = "numeric",
default = 1e-6)
# Read arguments
cat("Processing arguments...\n")
args <- parse_args(p)
# Process arguments
# Check gemma
o <- HMVAR:::run_command(paste(args$gemma))
if(o != 0){
stop("ERROR: GEMMA executable not found", call. = TRUE)
}
return(args)
}
# Read arguments
# indir <- "./"
# spec <- "midas_output_small/"
# Eventually replace this with argparse
# args <- list(midas_dir = file.path(indir, spec),
# focal_group = "Supragingival.plaque",
# map_file = "map.txt",
# outdir = "script_pcs_noimpute/",
# gemma = "~/bin/gemma-0.98.1-linux-static",
# impute = FALSE,
# pcs = "pcs.txt",
# pval_thres = 1e-6)
# rm(indir, spec)
args <- process_arguments()
print(args)
# The steps will be performed
# Steps
# 1. Convert midas data to BIMBAM
# 2. Impute genotypes with mice (optional)
# 3. Get kinship matrix with gemma
# 4. Run lmm
# 5. Run lmm with PCs (optional)
# Required in fraserv for the bugwas modified gemma
# Trying to move to newer GEMMA
# Sys.setenv(LD_LIBRARY_PATH="/opt/modules/pkgs/eqtlbma/git/lib/")
# Main output directory
cat("Creating output directory...\n")
dir.create(args$outdir)
# Create list for filenames
Files <- list(Dirs = list(),
Files = list())
# Read map
cat("Processing map...\n")
map <- read_tsv(args$map_file, col_types = 'cc')
map <- map %>% select(sample = ID, Group = Group)
# Convert to bimbam
cat("Converting to BIMBAM format...\n")
Files$Dirs$bimbam_dir <- file.path(args$outdir, "bimbam")
midas_bimbam <- midas_to_bimbam(midas_dir = args$midas_dir,
map = map,
outdir = Files$Dirs$bimbam_dir,
focal_group = args$focal_group,
prefix = NULL)
Files$Files$midas_geno_file <- midas_bimbam$filenames$geno_file
Files$Files$pheno_file <- midas_bimbam$filenames$pheno_file
Files$Files$snp_file <- midas_bimbam$filenames$snp_file
rm(map)
# Exit if not enough samples
group_count <- table(midas_bimbam$Dat$pheno$phenotype)
if(length(group_count) != 2 || any(group_count < 3)){
cat("Exiting because ther are not enough samples in two groups: (", group_count, ")...\n")
quit(save = "no")
}
if(args$impute){
# Impute
Files$Dirs$imputed_dir <- file.path(args$outdir, "imputed")
Files$Files$imputed_geno_file <- mice_impute(geno = midas_bimbam$Dat$geno,
snp = midas_bimbam$Dat$snp,
outdir = Files$Dirs$imputed_dir,
m = 5,
verbose = FALSE,
prefix = "imputed",
return_table = FALSE,
seed = 76543)
Files$Files$midas_geno_file <- Files$Files$imputed_geno_file
}
# Get kinship matrix
# Works with both gemma v0.93b & v0.98.1
# I am ingoring patterns since genotypes are not fixed but frequencies instead
Files$Dirs$kinship_dir <- file.path(args$outdir, "kinship")
Files$Files$kinship_file <- gemma_kinship(geno_file = Files$Files$midas_geno_file,
pheno_file = Files$Files$pheno_file,
snp_file = Files$Files$snp_file,
gemma = args$gemma,
outdir = Files$Dirs$kinship_dir,
prefix = 'kinship')
# Run standard lmm
Files$Dirs$lmm_dir <- file.path(args$outdir, "lmm")
res <- gemma_lmm(geno_file = Files$Files$midas_geno_file,
pheno_file = Files$Files$pheno_file,
snp_file = Files$Files$snp_file,
kinship_file = Files$Files$kinship_file,
cov_file = NULL,
gemma = args$gemma,
outdir = Files$Dirs$lmm_dir,
maf = 0,
prefix = "lmm")
Files$Files$lmm_log_file <- res[1]
Files$Files$lmm_assoc_file <- res[2]
rm(res)
if(!is.na(args$pcs)){
# Run lmm with pcs
# Format PCs as covariates
pcs <- read_tsv(args$pcs, col_types = cols(ID = col_character(),
.default = col_double()))
pcs <- pcs %>% slice(match(midas_bimbam$Dat$pheno$id, ID))
pcs$ID <- 1
Files$Files$pc_covariates <- file.path(Files$Dirs$bimbam_dir, "pcs.bimbam")
write_tsv(pcs, Files$Files$pc_covariates, col_names = FALSE)
# Run lmmpcs
Files$Dirs$lmmpcs_dir <- file.path(args$outdir, "lmmpcs")
res <- gemma_lmm(geno_file = Files$Files$midas_geno_file,
pheno_file = Files$Files$pheno_file,
snp_file = Files$Files$snp_file,
kinship_file = Files$Files$kinship_file,
cov_file = Files$Files$pc_covariates,
gemma = args$gemma,
outdir = Files$Dirs$lmmpcs_dir,
maf = 0,
prefix = "lmmpcs")
Files$Files$lmmpcs_log_file <- res[1]
Files$Files$lmmpcs_assoc_file <- res[2]
rm(res)
# Combine results
lmm <- read_tsv(Files$Files$lmm_assoc_file,
col_types = cols(chr = col_character(),
rs = col_character(),
ps = col_integer(),
n_miss = col_integer(),
allele1 = col_character(),
allele0 = col_character(),
af = col_number(),
logl_H1 = col_number(),
l_mle = col_number(),
p_lrt = col_number()))
lmmpcs <- read_tsv(Files$Files$lmmpcs_assoc_file,
col_types = cols(chr = col_character(),
rs = col_character(),
ps = col_integer(),
n_miss = col_integer(),
allele1 = col_character(),
allele0 = col_character(),
af = col_number(),
logl_H1 = col_number(),
l_mle = col_number(),
p_lrt = col_number())) %>%
select(-n_miss, -allele1, -allele0, - af)
lmm <- lmm %>% full_join(lmmpcs, by = c("chr", "rs", "ps"),
suffix = c(".lmm", ".lmmpcs"))
# Select interpretation
res <- rep('none', nrow(lmm))
res[ lmm$p_lrt.lmm < args$pval_thres & lmm$p_lrt.lmmpcs >= args$pval_thres ] <- "int"
res[ lmm$p_lrt.lmm >= args$pval_thres & lmm$p_lrt.lmmpcs < args$pval_thres ] <- "env"
res[ lmm$p_lrt.lmm < args$pval_thres & lmm$p_lrt.lmmpcs < args$pval_thres ] <- "both"
lmm <- lmm %>% bind_cols(type = res)
# lmm$type %>% table
# Write results
Files$Files$results <- file.path(args$outdir, "lmm.results.txt")
write_tsv(lmm, Files$Files$results)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.