#!/usr/bin/env Rscript
# (C) Copyright 2019 Sur Herrera Paredes
#
# This file is part of HMVAR.
#
# HMVAR is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# HMVAR is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with HMVAR. If not, see <http://www.gnu.org/licenses/>.
library(tidyverse)
library(argparser)
library(HMVAR)
process_arguments <- function(){
p <- arg_parser(paste("Calculate DoS statistics on a single genome or a
set of pre-processed genomes."))
# Positional arguments
p <- add_argument(p, "input",
help = paste("Input can be either a single file or a directory.",
"If a single file is passed, it hould be a tab-delimited",
"file where each row corresponds to a gene, and it must",
"include columns Dn, Ds, Pn, and Ps corresponding to the",
"values in the McDonald-Kreitman contingency table.\n",
"If a directory is passed, it should either contain a",
"set of tab-delimited files as above, or a set of",
"sub-directories, each one of which is the ouptut from",
"<midas_merge.py snps>. The option --dir_type allows",
"you to specify which type of directory it is, and",
"the option --suffix lets you specify the suffix of",
"the tab-delimited files."),
type = "character")
p <- add_argument(p, "outdir",
help = paste("This should be the output directory for the output"),
type = "character")
# Optional arguments
p <- add_argument(p, "--dir_type",
help = paste("Either 'tabs' or 'midas' to indicate whether",
"tab-delimited files or midas_merge.py directories",
"within the input directory",
"are to be processed. If <input> is a file, then",
"this option will be ignored."),
type = "character",
default = "files")
p <- add_argument(p, "--map_file",
help = paste("Mapping file that associates samples",
"with groups. It must be a tab-delimited",
"file with headers 'ID' and 'Group'. Only",
"required if --dir_type=midas"),
type = "character")
p <- add_argument(p, "--genes",
help = paste0("File with list of gene IDs to test. ",
"It must contain one gene id per line, ",
"without header, and the IDs must ",
"correspond to the gene_id column from ",
"the snps_info.txt file if --dir_type=midas",
"or to the same column in the tab-delimited files",
"if --dir_type=tabs or if <input> was a file.",
"If NULL, all ",
"genes will be tested."),
default = NULL,
type = "character")
p <- add_argument(p, "--depth_thres",
help = paste0("Minumum number of reads at a given site ",
"at a given sample, for that site in that ",
"sample to be included in calculations"),
default = 1,
type = "integer")
p <- add_argument(p, "--freq_thres",
help = paste0("This value is the threshold that will be ",
"used to determine which allele is present in ",
"a sample. The value from ",
"min(freq_thres, 1 - freq_thres) represents the ",
" distance from 0 or 1, for a site to be assigned ",
"to the major or minor allele respectively. ",
"It must be a value in [0,1]."),
default = 0.5,
type = "double")
p <- add_argument(p, "--focal_group",
help = paste("Group to use as a focal group to compare against all other.",
"If passed, the script will convert all values in the Group",
"column of map into 'non.<focal_group>', effectiveley",
"ensuring a dichotomous grouping. If not passed, the script",
"will expect only two levels in the Group column of map and it",
"will fail if more than one group is found."),
default = NULL,
type = "character")
p <- add_argument(p, "--suffix",
help = paste("The suffix of the tab-delimited files. It will be used",
"to identify which files to process, and also, to",
"determine the output files. Basically if an input",
"file is of the form /path/<prefix><suffix>",
"the ouptut will be of the form <outdir>/<prefix>*"),
type = "character",
default = ".txt")
# Read arguments
cat("Processing arguments...\n")
args <- parse_args(p)
# Process arguments
if(args$freq_thres < 0 || args$freq_thres > 1){
stop("ERROR: freq_thres must be a value in [0,1].")
}
if(args$depth_thres < 0){
stop("ERROR: depth_thres must be a non-negative integer.")
}
args$suffix <- paste0(args$suffix, "$")
# Read genes
if(is.na(args$genes)){
args$genes <- NULL
}else{
cat("Reading list of genes...\n")
args$genes <- read_tsv(args$genes, col_names = FALSE, col_types = 'c')
args$genes <- genes$X1
}
return(args)
}
write_outputs <- function(dat, infile, suffix, outdir, pval_col){
# infile <- args$input
# outdir <- args$outdir
prefix <- str_replace(string = basename(infile), pattern = suffix, replacement = "")
# Write table
filename <- paste0(c(prefix, "DoS.table.txt"), collapse = ".")
filename <- file.path(outdir, filename)
write_tsv(dat, filename)
# Produce plots
if(sum(!is.na(dat$DoS)) > 1){
p1 <- ggplot(dat, aes(x = DoS)) +
geom_histogram(bins = 15) +
ggtitle(label = prefix) +
AMOR::theme_blackbox()
filename <- paste0(c(prefix, "DoS.histogram.png"), collapse = ".")
filename <- file.path(outdir, filename)
ggsave(filename, p1, width = 6, height = 5, dpi = 200)
p1 <- ggplot(dat, aes_string(x = pval_col)) +
geom_histogram(bins = 20) +
ggtitle(label = prefix) +
AMOR::theme_blackbox()
filename <- paste0(c(prefix, "DoS.pval.histogram.png"), collapse = ".")
filename <- file.path(outdir, filename)
ggsave(filename, p1, width = 6, height = 5, dpi = 200)
filename <- paste0(c(prefix, "DoS.pval.qqplot.png"), collapse = ".")
filename <- file.path(outdir, filename)
png(filename = filename, width = 6, height = 5, units = "in", res = 200)
HMVAR::pval_qqplot(dat[ , pval_col, drop = TRUE ])
dev.off()
}
}
args <- process_arguments()
if(!dir.exists(args$outdir)){
dir.create(args$outdir)
}
if(dir.exists(args$input)){
# Input is dir
cat("Input is a dir\n")
if(args$dir_type == 'tabs'){
cat("Inputs are tab files\n")
inputs <- list.files(args$input)
inputs <- str_subset(string = inputs, pattern = args$suffix)
cat("Found ", length(inputs), " files\n")
}else if(args$dir_type == "midas"){
cat("Inputs are dirs\n")
if(!file.exists(args$map_file)){
stop("ERROR: --map_file must be provided in <input> is dir and --dir_type=midas.")
}
inputs <- list.dirs(args$input, recursive = FALSE)
# inputs <- str_subset(string = inputs, pattern = args$suffix)
cat("Found ", length(inputs), " dirs\n")
pval_col <- 'p.value'
}else{
stop("ERROR: --dir_type must be tabs or midas.")
}
Res <- NULL
for(input in inputs){
cat("Processing ", input, "...\n")
if(args$dir_type == 'tabs'){
# Each input is a file
# Input is file
dat <- read_tsv(file.path(args$input, input),
col_types = cols(.default = col_character(),
Dn = col_integer(),
Ds = col_integer(),
Pn = col_integer(),
Ps = col_integer()))
dat <- dat %>%
select(gene_id, Dn, Ds, Pn, Ps)
if(!is.null(args$genes)){
dat <- dat %>%
filter(gene_id %in% args$genes)
}
if('p.value' %in% colnames(dat)){
pval_col <- 'DoS.p.value'
}else{
pval_col <- 'p.value'
}
if(nrow(dat) > 0){
cat("\tCalculating DoS.\n")
dos <- calculate_dos(dat = dat, test = TRUE, clean = FALSE)
write_outputs(dat = dos, infile = input, suffix = args$suffix, outdir = args$outdir, pval_col = pval_col)
}else{
cat("\t", input, " did not have enough lines.\n")
dos <- tibble()
}
}else if(args$dir_type == 'midas'){
# Each input is a midas merge dir
dos <- midas_dos(midas_dir = input,
map = args$map_file,
genes = args$genes,
depth_thres = args$depth_thres,
freq_thres = args$freq_thres,
focal_group = args$focal_group)
write_outputs(dat = dos, infile = input, suffix = "$", outdir = args$outdir, pval_col = 'p.value')
}else{
stop("ERROR: --dir_type must be tabs or midas.")
}
if(nrow(dos) > 0){
cat("\tbinding...\n")
dos$input <- input
cat("dim dos:", dim(dos), "\n")
Res <- Res %>% bind_rows(dos)
cat("dim Res:", dim(Res), "\n")
}
}
# Write combined
cat("Writing combined...\n")
cat("===", dim(Res), "===\n")
write_outputs(dat = Res, infile = "summary", suffix = '$', outdir = args$outdir, pval_col = pval_col)
}else if(file.exists(args$input)){
# Input is file
dat <- read_tsv(args$input,
col_types = cols(.default = col_character(),
Dn = col_integer(),
Ds = col_integer(),
Pn = col_integer(),
Ps = col_integer()))
dat <- dat %>%
select(gene_id, Dn, Ds, Pn, Ps)
if(!is.null(args$genes)){
dat <- dat %>%
filter(gene_id %in% args$genes)
}
if('p.value' %in% colnames(dat)){
pval_col <- 'DoS.p.value'
}else{
pval_col <- 'p.value'
}
if(nrow(dat) > 0){
cat("\tcaluclating DoS.\n")
dos <- calculate_dos(dat = dat, test = TRUE, clean = FALSE)
write_outputs(dat = dos, infile = args$input, suffix = args$suffix, outdir = args$outdir, pval_col = pval_col)
}else{
cat("\t",input, " did not have enough lines.\n")
}
}else{
stop("ERROR: input doesn't exist")
}
warnings()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.