#!/usr/bin/env Rscript
#
# Read csv file, prepare for GEMMA, run GEMMA and plot the results
#
library(mousegwas)
library(argparse)
library(yaml)
library(dplyr)
library(readr)
library(R.utils)
library(data.table)
library(ggplot2)
library(GGally)
library(corrgram)
# create parser object
parser <- ArgumentParser()
# specify our desired options
# by default ArgumentParser will add an help option
parser$add_argument("--downsample", '-d', default=100000, type="integer",
help="Downsample strains to have at most this number of representatives. If 0 - average each strain")
parser$add_argument("-i", "--input",
help="Input csv file, assume column names. Definitions for which columns to use and how are in the yaml file")
parser$add_argument("-y", "--yaml",
help="Yaml file which will include: strain- Name of strain column, phenotypes-phenotypes column names to test, covar-columns to be used as covariates, F1- a dictionary mapping F1 names to strain names (female parent first), translate- translate strain names to names in the genotypes file")
parser$add_argument("--gemma",
help="Gemma executable")
parser$add_argument("-g", "--genotypes", nargs = '+',
help="The genotypes csv files as downloaded from phenome.jax.org")
parser$add_argument("--basedir", default=".",
help="output directory. Will overwrite existing files")
parser$add_argument("--pylmm", "-p", default="pylmmGWAS.py",
help="pylmmGWAS.py executable")
parser$add_argument("--pylmmkinship", "-k", default="pylmmKinship.py",
help="pylmmKinship.py executable")
parser$add_argument("--method", "-m", default="GEMMA",
help="Which GWAS software to use, possible values: GEMMA|pyLMM")
parser$add_argument("--noloco", default=FALSE, action="store_true",
help="Do not use LOCO")
parser$add_argument("--missing", default=0.05, type="double",
help="Maximal fraction of missing data for marker")
parser$add_argument("--MAF", default=0.1, type="double",
help="Minimal value for minor allele frequency")
parser$add_argument("--header", default="GWAS results",
help="Manhattan plot header")
parser$add_argument("--shuffle", default=FALSE, action="store_true",
help="Shuffle the phenotypes between the different individuals")
parser$add_argument("--seed", type="integer", default=100,
help="If shuffle is true, set the seed to avoid repeating the same results but have the option to rerun")
parser$add_argument("--qqnorm", default=FALSE, action="store_true",
help="QQNORM each phenotype before analyzing")
parser$add_argument("--genedist", default=1000000, type="integer",
help="gene distance from SNP, for gene reporting")
parser$add_argument("--peakdist", default=1000000, type="integer",
help="minimal distance between peaks")
parser$add_argument("--snpthr", default=7, type="double",
help="P threshold for gene reporting")
parser$add_argument("--namethr", default=10, type="double",
help="Print gene names above this threshold")
parser$add_argument("--coat_covar", default=FALSE, action="store_true",
help="Use coat color as defined in yaml file as a covariate")
parser$add_argument("--coat_phenotype", default=FALSE, action="store_true",
help="Use coat color as defined in yaml file as a phenotype")
parser$add_argument("--coat_fraction", default=0.05,
help="Fraction of strains with a coat color to use as a phenotype, default 0.05")
parser$add_argument("--runmetasoft", default=FALSE, action="store_true",
help="Run metasoft on the given phenotypes. This assumes the phenotypes have the same effect")
parser$add_argument("--lambda_mean", default=1, type="double",
help="lambda_mean for MetaSoft. Should be obtained from an initial run")
parser$add_argument("--lambda_hetero", default=1, type="double",
help="lambda_hetero for MetaSoft. Should be obtained from an initial run")
parser$add_argument("--mvalue_method", default="mcmc",
help="Either mcmc (default) or exact if the number of phenotypes is small (<10)")
parser$add_argument("--MDA", action="store_true", default=FALSE,
help="Use the pre-loaded MDA genotypes files in the package, replaces -g")
parser$add_argument("--nomv",
default = FALSE,
action = "store_true",
help = "Ignore multivariate results and use only single phenotypes")
args <- parser$parse_args()
metasoft_args <- paste("-mvalue_p_thres 1e-5", "-lambda_mean", args$lambda_mean, "-lambda_hetero", args$lambda_hetero, "-mvalue_method", args$mvalue_method, sep=" ")
# Load the yaml
yamin <- yaml.load_file(args$yaml)
# Make the output directory
if (!grepl("^/", args$basedir)) args$basedir <- paste(getwd(), args$basedir, sep = "/")
dir.create(args$basedir, recursive = TRUE)
print(paste0("Working directory: ", args$basedir))
# Read the input table
if (length(args$input)){
complete_table <- read_csv(args$input, col_names = TRUE)
}else{
complete_table <- tibble()
}
print(complete_table)
# Generate a list of phenotypes and list of groups
phegroups <- list()
pheno_names <- c()
i = 1
for (n in names(yamin$phenotypes)){
if (n %in% names(complete_table)){
pheno_names <- c(pheno_names, n)
if ("group" %in% names(yamin$phenotypes[[n]]))
phegroups[[yamin$phenotypes[[n]]$group]] <- c(phegroups[[yamin$phenotypes[[n]]$group]], i)
i <- i+1
}
}
#pheno_names <- intersect(yamin$phenotypes, names(complete_table))
# A list of covariates
covar_names <- setdiff(intersect(yamin$covar, names(complete_table)), pheno_names)
# Read the coat from the yaml file
coat_table <- tibble(strain=character(), coat=character(), eyes=character())
if (args$coat_phenotype |args$coat_covar){
for (ct in yamin$coat){
eyec = "normal"
coat = unlist(ct)[1]
if (grepl(",", coat)){
spl <- strsplit(coat, ",")[[1]]
eyec <- spl[1]
coat <- gsub("^ *", "", spl[2])
}
coat_table <- add_row(coat_table, strain=names(ct), coat=coat, eyes=eyec)
}
if (length(unique(coat_table$eyes)) > 1)
coat_table_mm <- model.matrix(~coat+eyes+0, coat_table)
else
coat_table_mm <- model.matrix(~coat+0, coat_table)
rownames(coat_table_mm) <- coat_table$strain
# Remove columns with one strain
coat_table_mm <- coat_table_mm[,colSums(coat_table_mm)>1, drop=F]
}
# Get the strain names
if (args$coat_phenotype) {
strains <-
unique(tibble(
input_name = rownames(coat_table_mm),
p1 = rownames(coat_table_mm),
p2 =
rownames(coat_table_mm)
))
}else{
strains <-
complete_table %>% select(input_name = one_of(yamin$strain)) %>% unique() %>% mutate(p1 =
input_name, p2 = input_name)
}
# Translate F1
a <- tibble(input_name=character(), p1=character(), p2=character())
for (fl in yamin$F1){
a <- add_row(a, input_name=names(fl), p1=unlist(fl)[1], p2=unlist(fl)[2])
}
# Combine the parents in the strains table
strains <- strains %>% left_join(a, by="input_name", suffix=c(".orig","")) %>%
mutate(p1=ifelse(is.na(p1), p1.orig, p1), p2=ifelse(is.na(p2), p2.orig, p2)) %>% select(input_name, p1, p2)
# Translate names to genotype names
t <- tibble(name=character(), gen_name=character())
for (tr in yamin$translate){
t <- add_row(t, name=names(tr), gen_name=unlist(tr)[1])
}
# Replace relevant names in the strains table
strains <- strains %>% left_join(t, by=c("p1" = "name")) %>% mutate(p1=ifelse(is.na(gen_name), p1, gen_name)) %>% select(input_name, p1, p2)
strains <- strains %>% left_join(t, by=c("p2" = "name")) %>% mutate(p2=ifelse(is.na(gen_name), p2, gen_name)) %>% select(input_name, p1, p2)
valid_strains <- unique(c(strains$p1, strains$p2))
# For each genotype file read it and add to the long file, use only genotypes in the input
# Read the genotype csv file
longfile <- tempfile()
complete.geno <- NULL
if (args$MDA){
args$genotypes <- c(system.file("extdata", "snp_190703353330.csv.gz", package = "mousegwas"),
system.file("extdata", "snp_190703530013.csv.gz", package = "mousegwas"),
system.file("extdata", "snp_190703577680.csv.gz", package = "mousegwas"),
system.file("extdata", "snp_190703687397.csv.gz", package = "mousegwas"),
system.file("extdata", "snp_190703843678.csv.gz", package = "mousegwas"))
}
for (f in args$genotypes){
geno <- fread(f)
geno[, c("major", "minor") := tstrsplit(observed, "/", fixed=TRUE, keep=1:2)]
geno <- geno[rs!="",]
if (is.null(complete.geno)){
if (length(intersect(names(geno), valid_strains))>0){
addnames <- c(intersect(names(geno), valid_strains), "chr", "bp38", "rs", "major", "minor")
complete.geno <- geno[,..addnames]
}
}else{
addnames <- c(intersect(names(geno), valid_strains), "chr", "bp38", "rs", "major", "minor")
geno <- geno[,..addnames]
setkey(geno, rs)
setkey(complete.geno, rs)
complete.geno <- merge(complete.geno, geno, all=TRUE, by=c("chr", "bp38", "rs", "major", "minor"))
}
valid_strains <- setdiff(valid_strains, names(complete.geno))
}
complete.geno[, chr:=as.character(chr)]
srdata <- complete.geno[, .(rs, major, minor)]
numeric.geno <- complete.geno[, .("chr", "bp38", "rs", "major", "minor")]
for (cn in setdiff(names(complete.geno), c("chr", "bp38", "rs", "major", "minor"))){
complete.geno[get(cn)=='H',c(cn) := 1]
complete.geno[get(cn)==major, c(cn) := 0]
complete.geno[get(cn)==minor, c(cn) := 2]
complete.geno[,c(cn) := as.numeric(get(cn))]#as.numeric(ifelse(..cn=='H', 1, ifelse(..cn==major, 0, 2)))]
}
# Compute the specific strains genomes
strains_genomes <- srdata
sorder = c()
phenos = data.table()
for (p in pheno_names) phenos <- phenos[, eval(p) := numeric()]
covars = data.table()
for (c in covar_names) covars <- covars[, eval(c) := numeric()]
covars <- covars[, isWild := numeric()]
sexvec <- c()
# Keep old not found strains to not repeat error messages
notfounds = c()
# Read the input phenotypic data, write the genotypes and phenotypes in the proper tables
for (comrow in 1:dim(complete_table)[1]){
sname <- as.character(complete_table[comrow, yamin$strain])
rnum <- which(strains$input_name==sname)
if (length(rnum) == 0){
print(paste0("Can't find strain data ", sname, " row ",comrow))
next
}
p1n <- strains$p1[rnum]
p2n <- strains$p2[rnum]
if (p1n %in% names(complete.geno) & p2n %in% names(complete.geno)){
sorder <- c(sorder, sname)
strains_genomes[, eval(paste0('X',comrow)):=(complete.geno[,..p1n] + complete.geno[,..p2n])/2]
if (p1n!=p2n & !is.null(yamin$sex) & complete_table[comrow, yamin$sex]=="M"){
# It's an F1 male. Only one X chromosome
# The first parent is the female. Take its X chromosome only
strains_genomes[complete.geno$chr=="X", eval(paste0('X',comrow))] = complete.geno[complete.geno$chr=="X",..p1n]
}
# Add the phenotypes to the table
ct <- if (p1n==p2n) as.character(coat_table %>% filter(strain==p1n) %>% select(coat)) else as.character(coat_table %>% filter(strain==sname) %>% select(coat))
if (is.null(ct) & (args$coat_covar)){
print(paste0("Can't find coat color for ", p1n, " or ", sname))
}
prow <- complete_table[comrow, pheno_names]
# Add the covariates to the table
crow <- cbind(complete_table[comrow, covar_names, drop=F], tibble(isWild=as.numeric(p1n %in% yamin$wild | p2n %in% yamin$wild)))
if (args$coat_covar){
crow <- cbind(crow, coat=ct)
}
if (length(covar_names) == 0 | all(!is.na(crow))) {
phenos <- rbind(phenos, prow, fill = TRUE)
covars <- rbind(covars, crow, fill = TRUE)
sexvec <-
c(sexvec, if (is.null(yamin$sex))
1
else
complete_table[comrow, yamin$sex])
}
}else{
if (p1n==p2n){
if (!p1n %in% notfounds){
print(paste0("Can't find strain ",p1n))
notfounds <- c(notfounds, p1n)
}
}else{
if (!(p1n %in% notfounds & p2n %in% notfounds)){
print(paste0("Can't find ", p1n," or ", p2n))
notfounds <- c(notfounds, p1n, p2n)
}
}
}
}
if (args$coat_phenotype){
for (rnum in 1:dim(strains)[1]){
sname <- strains$input_name[rnum]
p1n <- strains$p1[rnum]
p2n <- strains$p2[rnum]
cname <- if (p1n==p2n) p1n else sname
# Filter to include colors with at least 5% positives
prow <- coat_table_mm[cname,colSums(coat_table_mm) > args$coat_fraction*nrow(coat_table_mm),drop=F]
if (p1n %in% names(complete.geno) & p2n %in% names(complete.geno)){
sorder <- c(sorder, sname)
strains_genomes[, eval(sname):=(complete.geno[,..p1n] + complete.geno[,..p2n])/2]
phenos <- rbind(phenos, prow, fill = TRUE)
}
}
}
print(dim(strains_genomes))
print(dim(phenos))
# Generate a covar table based on the confounding SNPs provided in the yaml file
snpcovar <- NULL
if (!is.null(yamin$confSNPs)){
snpcovar <- base::t(as.matrix(strains_genomes)[as.matrix(strains_genomes[,1]) %in% yamin$confSNPs, 4:ncol(strains_genomes), drop=F])
colnames(snpcovar) <- strains_genomes$rs[strains_genomes$rs %in% yamin$confSNPs]
for (c in 1:ncol(snpcovar)){
snpcovar[,c] <- as.numeric(snpcovar[,c])
}
covars <- cbind(covars, snpcovar)
covar_names <- c(covar_names, colnames(snpcovar))
}
# Compute the covariate matrix
if (length(covar_names) > 0){
if (args$coat_covar){
covar_names <- c(covar_names, "coat")
}
covars <- model.matrix(as.formula(paste0("~", do.call(paste, c(as.list(covar_names), sep="+")))), covars)
}else{
covars = NULL
}
# scale to have mean=0 var=1 if there are more than 2 values (0,1)
raw_phenos <- phenos
if (!all(is.na(phenos) | phenos==0 | phenos==1)){
phenos <- scale(phenos)
}else{
phenos <- as.matrix(phenos)
}
# Remove columns with NaNs
for (c in ncol(phenos):1){ if (all(is.na(phenos[,c]) | phenos[,c]==0)) {phenos <- phenos[,-c, drop=F]; raw_phenos <- raw_phenos[,-c,drop=F]}}
if (args$shuffle){
set.seed(args$seed)
neword <- sample(nrow(phenos))
phenos <- phenos[neword,, drop=F]
raw_phenos <- raw_phenos[neword,,drop=F]
}
# Take the betas of each strain and use it to run GEMMA
if (args$coat_phenotype){
b <- list(phenotypes = phenos, genotypes = strains_genomes, indices = 1:nrow(phenos), covars=NULL)
}else{
b <- average_strain(strains_genomes, phenos, covars, args$downsample, sexvec, sorder)
}
# Print the phenotypes order
raw_phenos <- raw_phenos[b$indices,,drop=F]
write.csv(raw_phenos, file=paste0(args$basedir, "/raw_phenotypes.csv"))
write.csv(colnames(b$phenotypes), file=paste0(args$basedir, "/phenotypes_order.txt"), quote = FALSE, col.names = FALSE, row.names = FALSE)
# Plot correlations between phenotypes
#print(head(b$phenotypes))
if (dim(b$phenotypes)[2]>1){
corpheno <- b$phenotypes
colnames(corpheno) <- gsub("OFA_Groom", "", colnames(corpheno))
ppr <- ggpairs(as.data.frame(corpheno))
ggsave(paste0(args$basedir, "/phenotype_correlations.pdf"), plot=ppr, device="pdf", width=16, height=16, units="in")
pdf(paste0(args$basedir, "/phenotype_correlations_corrgram.pdf"), width = 16, height = 16)
corrgram(as.data.frame(corpheno), order=TRUE, upper.panel=panel.conf, lower.panel=panel.shade, diag.panel=panel.density)
dev.off()
}
# Remove SNPs with more than 5% missing data and 5% MAF
mafc <- rowSums(complete.geno[,-1:-5])/(2*(ncol(complete.geno)-5))
b$genotypes <- b$genotypes[rowSums(is.na(b$genotypes))<=(ncol(b$genotypes)-3)*args$missing &
mafc >= args$MAF & mafc <= 1-args$MAF,]
complete.geno <- complete.geno[rs %in% b$genotypes$rs,]
fwrite(complete.geno, file = paste0(args$basedir, "/strains_genotypes_all.csv"))
# Normalize the phenotypes
if (args$qqnorm){
for (r in 1:ncol(b$phenotypes)){
b$phenotypes[,r] <- qqnorm(as.data.frame(b$phenotypes)[,r], plot=F)$x
}
}
# Export order of strains used
write.csv(sorder[b$indices], paste0(args$basedir, "/export_strains_order.csv"), quote = FALSE, col.names = FALSE)
# Run gemma/pylmm using the helper function
if (args$method == "GEMMA"){
if (args$nomv) phegroups = NULL
results_file <- execute_lmm(data.table(b$genotypes), data.table(b$phenotypes),
as.data.table(complete.geno[,.(rs, bp38, chr)]),
b$covars, args$basedir, loco=!args$noloco,
groups = phegroups, runmetasoft=args$runmetasoft, metasoft_args = metasoft_args)
# Run no LOCO to get the unified heritability for each phenotype
if (!args$noloco){
all_res <- execute_lmm(data.table(b$genotypes), data.table(b$phenotypes),
as.data.table(complete.geno[,.(rs, bp38, chr)]),
b$covars, args$basedir, loco=FALSE, runmetasoft=F)
# Extract the VPE values for each phenotype
}
allVPE = data.table(phenotype=character(), PVE=numeric(), PVESE=numeric(), Vg=numeric(), Ve=numeric())
for (n in 1:dim(b$phenotypes)[2]){
fname <- paste0(args$basedir, "/output/lmm_all_phenotype_", n, ".log.txt")
if (file.exists(fname)){
sigs <- get_sigmas(fname)
sigs$phenotype <- colnames(b$phenotypes)[n]
allVPE <- rbind(allVPE, sigs)
}
}
fwrite(allVPE, file=paste0(args$basedir, "/PVE_GEMMA_estimates.txt"))
}else if (args$method == "pyLMM"){
results_file <- run_pylmm(data.table(b$genotypes), data.table(b$phenotypes),
as.data.table(complete.geno[,.(rs, bp38, chr)]),
b$covars, args$basedir, args$pylmm, args$pylmmkinship, loco=!args$noloco, metasoft_args = metasoft_args)
}
is.metasoft <- FALSE
if (args$method=="GEMMA" && args$runmetasoft)
is.metasoft = TRUE
pval_thr <- args$snpthr
# Manhattan plot
if (args$runmetasoft) {
genotypes = as.data.frame(complete.geno[, -c("chr", "bp38", "minor", "major", "rs")])
rownames(genotypes) <- complete.geno[, rs]
p <-
plot_gemma_lmm(
results_file,
name = args$header,
metasoft = is.metasoft,
pyLMM = args$method == "pyLMM" && ncol(b$phenotypes) == 1,
annotations = paste0(args$basedir, "/annotations.csv"),
namethr = args$namethr,
redthr = pval_thr,
genotypes = genotypes,
maxdist = args$peakdist
)
#genotypes = paste0(args$basedir, "/all_genotypes.csv"))
save(p, file = paste0(args$basedir, "/gwas_object_output.Rdata"))
ggsave(
paste0(args$basedir, "/manhattan_plot_p_lrt.pdf"),
plot = p$plot,
device = "pdf",
width = 16,
height = 8,
units = "in"
)
# Read the significant SNPs and grab their related genes
affgen <-
get_genes(p$gwas[p$gwas$ispeak == T &
p$gwas$P >= args$snpthr, ], dist = args$genedist)
fwrite(
merge(data.table(affgen), data.table(p$gwas), by = "rs"),
paste0(
args$basedir,
"/genes_dist_",
args$genedist,
"_pval_",
args$snpthr,
".csv"
)
)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.