# write_dadi-----------------------------------------------------------------------
#' @name write_dadi
#' @title Write a \code{dadi} SNP input file from a tidy data frame.
#' @description This function will generate a \code{dadi} SNP input file using a
#' radiator tidy tibble.
#'
#' Note that missing data can potentially bias demographic inference,
#' consequently you might want to check the impact of missing data by running
#' two datset in \code{dadi}: one with the missing data or several with varying thresholds
#' of missingness, and one with imputed genotypes
#' (check my other package \href{https://thierrygosselin.github.io/grur/}{grur} for this).
#' @param data A tidy data frame object in the global environment or
#' a tidy data frame in wide or long format in the working directory.
#' \emph{How to get a tidy data frame ?}
#' Look into \pkg{radiator} \code{\link{tidy_genomic_data}}.
#' @param fasta.outgroup (optional) The fasta file, sequences for the outgroup.
#' Default: \code{fasta.outgroup = NULL}.
#' @param fasta.ingroup (optional) The fasta file, sequences for the ingroup.
#' Leave empty if no outgroup.
#' Default: \code{fasta.ingroup = NULL}.
#' @param sumstats.outgroup (optional) The sumstats output file from STACKS
#' when running STACKS for the outgroup fasta file. This file is
#' required to use an outgroup.
#' Default: \code{sumstats.outgroup = NULL}.
#' @param sumstats.ingroup (optional) The sumstats output file from STACKS
#' when running STACKS for the ingroup fasta file.This file is
#' required to use with an outgroup. Leave empty if no outgroup.
#' Default: \code{sumstats.ingroup = NULL}.
#' @param dadi.input.filename (optional) Name of the \code{dadi} SNP input file
#' written to the working directory. e.g. \code{dadi.file.tsv}.
#' Default use date and time to make the file. If used, the file extension
#' need to finish with \code{.tsv or .txt}.
#' Default: \code{dadi.input.filename = NULL}.
#' @param calibrate.alleles (optional, logical) To re-calibrate REF an ALT alleles.
#' Will be done automatically to the dataset if the required genomic format is
#' not found. Please use if you have removed individuals.
#' Default: \code{calibrate.alleles = FALSE}.
#' @export
#' @rdname write_dadi
#' @return The function returns tibble and the dadi input file in the working
#' directory.
# @seealso `vignette_write_dadi` for more info
#' @examples
#' \dontrun{
#' # without outgroup:
#' dadi.data <- radiator::write_dadi(data = "my_tidy_dataset.rad")
#'
#' # with outgroup and fasta generated by stacks:
#' dadi.data <- radiator::write_dadi(
#' data = "my_tidy_dataset.rad",
#' fasta.ingroup = "batch_1.ingroup.fa",
#' fasta.outgroup = "batch_1.outgroup.fa",
#' sumstats.ingroup = "batch_1.sumstats.ingroup.tsv",
#' sumstats.outgroup = "batch_1.sumstats.outgroup.tsv"
#' )
#' }
#' @references Gutenkunst RN, Hernandez RD, Williamson SH, Bustamante CD (2009)
#' Inferring the Joint Demographic History of Multiple Populations
#' from Multidimensional SNP Frequency Data (G McVean, Ed,).
#' PLoS genetics, 5, e1000695.
#' @author Thierry Gosselin \email{thierrygosselin@@icloud.com}
write_dadi <- function(
data,
fasta.ingroup = NULL,
fasta.outgroup = NULL,
sumstats.ingroup = NULL,
sumstats.outgroup = NULL,
dadi.input.filename = NULL,
calibrate.alleles = FALSE
){
message("Preparing \u2202a\u2202i input SNP data format")
# dadi unicode character: \u2202
# Checking for missing and/or default arguments ------------------------------
if (missing(data)) rlang::abort("Input file missing")
# File type detection----------------------------------------------------------
data.type <- radiator::detect_genomic_format(data)
if (data.type != "tbl_df") rlang::abort("Wrong input file, check doc")
# keep date and time
file.date <- format(Sys.time(), "%Y%m%d@%H%M")
# file.date <- suppressWarnings(
# stringi::stri_replace_all_fixed(
# Sys.time(),
# pattern = " EDT",
# replacement = "")
# )
# create a strata.df
strata.df <- radiator::generate_strata(data = data, pop.id = TRUE)
pop.labels <- pop.levels <- levels(data$POP_ID)
# Compute count and Minor Allele Frequency -----------------------------------
# Check if the genotype format required is there, if not generate
if (!tibble::has_name(data, "GT_VCF") && !tibble::has_name(data, "GT_BIN")) {
calibrate.alleles <- TRUE
}
if (!tibble::has_name(data, "REF")) {
rlang::abort("REF/ALT alleles required")
}
# re-calibrate REF/ALT and/or generate the format...
if (calibrate.alleles) {
data %<>% radiator::calibrate_alleles(data = ., gt.bin = TRUE) %$% input
}
message("Computing the Allele Frequency Spectrum...")
if (tibble::has_name(data, "GT_BIN")) {
input.count <- data %>%
dplyr::group_by(MARKERS, POP_ID, REF, ALT) %>%
dplyr::summarise(
N = as.numeric(n()),
PP = as.numeric(length(GT_BIN[GT_BIN == 0L])),
PQ = as.numeric(length(GT_BIN[GT_BIN == 1L])),
QQ = as.numeric(length(GT_BIN[GT_BIN == 2L])),
.groups = "keep"
) %>%
dplyr::mutate(MAF = ((QQ*2) + PQ)/(2*N)) %>%
dplyr::ungroup(.)
} else {
input.count <- data %>%
dplyr::group_by(MARKERS, POP_ID, REF, ALT) %>%
dplyr::summarise(
N = as.numeric(n()),
PP = as.numeric(length(GT_VCF[GT_VCF == "0/0"])),
PQ = as.numeric(length(GT_VCF[GT_VCF == "1/0" | GT_VCF == "0/1"])),
QQ = as.numeric(length(GT_VCF[GT_VCF == "1/1"])),
.groups = "keep"
) %>%
dplyr::mutate(MAF = ((QQ*2) + PQ)/(2*N)) %>%
dplyr::ungroup(.)
}
input.count %<>%
dplyr::mutate(
REF = dplyr::recode(REF, "A" = "001", "C" = "002", "G" = "003", "T" = "004"),
ALT = dplyr::recode(ALT, "A" = "001", "C" = "002", "G" = "003", "T" = "004")
)
# Function to make dadi input data format ***********************************
# input <- input.count # testing
if (is.null(fasta.ingroup)) {
dadi.input <- suppressWarnings(
input.count %>%
dplyr::group_by(MARKERS, POP_ID, REF, ALT) %>%
dplyr::mutate(
A1 = (2 * PP) + PQ,
A2 = (2 * QQ) + PQ
) %>%
dplyr::ungroup(.) %>%
dplyr::select(POP_ID, Allele1 = REF, A1, Allele2 = ALT, A2, MARKERS) %>%
radiator::rad_long(
x = .,
cols = c("POP_ID", "MARKERS", "Allele1", "Allele2"),
measure_vars = c("A1", "A2"),
names_to = "ALLELE_GROUP",
values_to = "COUNT"
) %>%
tidyr::unite(POP, POP_ID, ALLELE_GROUP, sep = "_") %>%
radiator::rad_wide(x = ., formula = "MARKERS + Allele1 + Allele2 ~ POP", values_from = "COUNT") %>%
dplyr::mutate(
IN_GROUP = rep("---", n()), #version 2
OUT_GROUP = rep("---", n())
) %>%
dplyr::select(IN_GROUP,
OUT_GROUP,
Allele1,
dplyr::contains("A1"),
Allele2, dplyr::contains("A2"), MARKERS
) %>%
dplyr::arrange(MARKERS)
)
} else {# no fasta, no outgroup
message("using outgroup info")
# Get the list of ref. allele in the vcf of the ingroup
ref.allele.vcf.ingroup <- input.count %>%
dplyr::ungroup(.) %>%
dplyr::distinct(MARKERS, REF, .keep_all = TRUE) %>%
dplyr::arrange(MARKERS) %>%
tidyr::separate(MARKERS, c("CHROM", "LOCUS", "POS"), sep = "__") %>%
dplyr::distinct(CHROM, LOCUS, POS, REF, .keep_all = TRUE) %>%
dplyr::mutate(
CHROM = as.character(stringi::stri_replace_all_fixed(CHROM, pattern = "un", replacement = "1", vectorize_all = FALSE)),
LOCUS = as.integer(LOCUS),
POS = as.integer(POS)
) %>%
dplyr::arrange(CHROM, LOCUS, POS)
# View(ref.allele.vcf.ingroup)
# keep the list of markers
markers <- dplyr::ungroup(ref.allele.vcf.ingroup) %>%
dplyr::select(-REF)
# import out- and in- group fasta files ********************************
fasta.outgroup <- data.table::fread(
input = fasta.outgroup,
sep = "\t",
stringsAsFactors = FALSE,
showProgress = FALSE,
verbose = FALSE,
header = FALSE,
col.names = "LOCUS"
) %>%
dplyr::mutate(
ID.FILTER = rep(c("LOCUS", "SEQ"), times = n()/2),
ANCESTRAL = rep("outgroup", times = n())
)
fasta.ingroup <- data.table::fread(
input = fasta.ingroup,
sep = "\t",
stringsAsFactors = FALSE,
showProgress = FALSE,
verbose = FALSE,
header = FALSE,
col.names = "LOCUS"
) %>%
tibble::as_tibble() %>%
dplyr::mutate(
ID.FILTER = rep(c("LOCUS", "SEQ"), times = n()/2),
ANCESTRAL = rep("ingroup", times = n())
)
fasta.data <- dplyr::bind_rows(fasta.outgroup, fasta.ingroup)
fasta.outgroup <- fasta.ingroup <- NULL
message("Preparing fasta sequences")
fasta.prep <- fasta.data %>%
dplyr::filter(ID.FILTER == "LOCUS") %>%
dplyr::select(-ID.FILTER) %>%
dplyr::bind_cols(fasta.data %>%
dplyr::filter(ID.FILTER == "SEQ") %>%
dplyr::select(-ID.FILTER, -ANCESTRAL, SEQUENCES = LOCUS)
) %>%
tidyr::separate(LOCUS, c("LOCUS", "ALLELE"), sep = "_Sample_", extra = "warn" ) %>%
tidyr::separate(ALLELE, c("GARBAGE", "ALLELE"), sep = "_Allele_", extra = "warn" ) %>%
tidyr::separate(ALLELE, c("ALLELE", "INDIVIDUALS"), sep = " ", extra = "warn" ) %>%
dplyr::mutate(LOCUS = as.integer(stringi::stri_replace_all_fixed(LOCUS, pattern = ">CLocus_", replacement = "", vectorize_all = FALSE))) %>%
dplyr::select(-GARBAGE, -INDIVIDUALS) %>%
dplyr::distinct(LOCUS, ALLELE, ANCESTRAL, SEQUENCES, .keep_all = TRUE)
fasta.data <- NULL
# import out- and in- group sumstats files*****************************
# Note: only one sumstats might be necessary to have the info needed, need checking
sumstats.outgroup <- data.table::fread(
input = sumstats.outgroup,
sep = "\t",
stringsAsFactors = FALSE,
skip = "Batch ID",
select = c("Chr", "Locus ID", "BP", "Col"),
showProgress = TRUE,
verbose = FALSE
) %>%
tibble::as_tibble() %>%
dplyr::select(CHROM = Chr, LOCUS = `Locus ID`, POS = BP, SNP_READ_POS = Col) %>%
dplyr::mutate(
CHROM = as.character(stringi::stri_replace_all_fixed(CHROM, pattern = "un", replacement = "1", vectorize_all = FALSE)),
LOCUS = as.integer(LOCUS),
POS = as.integer(POS)
) %>%
dplyr::distinct(CHROM, LOCUS, SNP_READ_POS, .keep_all = TRUE) %>%
dplyr::semi_join(markers, by = c("CHROM", "LOCUS", "POS")) %>%
dplyr::arrange(CHROM, LOCUS, POS, SNP_READ_POS) %>%
dplyr::mutate(ANCESTRAL = rep("outgroup", times = n())) %>%
# the SNP position on the read is +1 for the fasta file
dplyr::mutate(SNP_READ_POS = SNP_READ_POS + 1)
sumstats.ingroup <- data.table::fread(
input = sumstats.ingroup,
sep = "\t",
stringsAsFactors = FALSE,
skip = "Batch ID",
select = c("Chr", "Locus ID", "BP", "Col"),
showProgress = TRUE,
verbose = FALSE
) %>%
tibble::as_tibble() %>%
dplyr::select(CHROM = Chr, LOCUS = `Locus ID`, POS = BP, SNP_READ_POS = Col) %>%
dplyr::mutate(
CHROM = as.character(stringi::stri_replace_all_fixed(CHROM, pattern = "un", replacement = "1", vectorize_all = FALSE)),
LOCUS = as.integer(LOCUS),
POS = as.integer(POS)
) %>%
dplyr::distinct(CHROM, LOCUS, SNP_READ_POS, .keep_all = TRUE) %>%
dplyr::semi_join(markers, by = c("CHROM", "LOCUS", "POS")) %>%
dplyr::arrange(CHROM, LOCUS, POS, SNP_READ_POS) %>%
dplyr::mutate(ANCESTRAL = rep("ingroup", times = n())) %>%
# the SNP position on the read is +1 for the fasta file
dplyr::mutate(SNP_READ_POS = SNP_READ_POS + 1)
markers <- NULL # remove unused object
# When REF and ALT allele are equal in number, this can be problematic between ANCESTRAL group of alleles.
# For those loci, we will set them based on both group (out- and in-group), so that there is no differences
# THINKING: when ingroup REF and ALT equal, set to REF in VCF, and EQUAL in outgroup
# when outgroup REF and ALT equal, set the REF based on the ingroup VCF
max.length.read <- stringi::stri_length(str = fasta.prep[1,4])
message("combining information from the fasta and sumstats file")
ref.allele.vcf.ingroup.fasta <- ref.allele.vcf.ingroup %>%
# The sumstats is used to get the position of the markers along the sequence read
dplyr::left_join(
sumstats.ingroup %>%
dplyr::select(-ANCESTRAL)
, by = c("CHROM", "LOCUS", "POS")
) %>%
dplyr::mutate(SNP_READ_POS = stringi::stri_replace_na(SNP_READ_POS, replacement = "NA")) %>%
dplyr::filter(SNP_READ_POS != "NA") %>%
dplyr::mutate(SNP_READ_POS = as.integer(SNP_READ_POS)) %>%
dplyr::arrange(CHROM, LOCUS, POS) %>%
dplyr::ungroup(.) %>%
# get the fasta sequence for those LOCUS...
dplyr::left_join(
fasta.prep %>%
dplyr::filter(ANCESTRAL == "ingroup") %>%
dplyr::select(-ALLELE, -ANCESTRAL)
, by = "LOCUS"
) %>%
dplyr::mutate(SNP_READ_POS = stringi::stri_replace_na(SNP_READ_POS, replacement = "NA")) %>%
dplyr::filter(SNP_READ_POS != "NA") %>%
# get the flanking bases, left and right, of the SNP
dplyr::mutate(
SNP_READ_POS = as.integer(SNP_READ_POS),
FASTA_REF = stringi::stri_sub(SEQUENCES, from = SNP_READ_POS, to = SNP_READ_POS),
IN_GROUP = stringi::stri_sub(SEQUENCES, from = SNP_READ_POS - 1, to = SNP_READ_POS + 1)
) %>%
# remove lines with no match between all the alleles in the fasta file and the REF in the VCF
dplyr::filter(FASTA_REF == REF) %>%
dplyr::group_by(CHROM, LOCUS, POS, REF) %>%
dplyr::distinct(CHROM, LOCUS, POS, REF, .keep_all = TRUE) %>%
dplyr::mutate(
IN_GROUP = ifelse(SNP_READ_POS == max.length.read, stringi::stri_pad(IN_GROUP, width = 3, side = "right", pad = "-"), IN_GROUP),
IN_GROUP = ifelse(SNP_READ_POS == 1, stringi::stri_pad(IN_GROUP, width = 3, side = "left", pad = "-"), IN_GROUP)
)
ingroup <- dplyr::ungroup(ref.allele.vcf.ingroup.fasta) %>%
dplyr::select(CHROM, LOCUS, POS, IN_GROUP) %>%
dplyr::arrange(CHROM, LOCUS, POS) %>%
tidyr::unite(MARKERS, c(CHROM, LOCUS, POS), sep = "__")
markers.id <- dplyr::ungroup(ref.allele.vcf.ingroup.fasta) %>%
dplyr::select(CHROM, LOCUS, POS, SNP_READ_POS)
markers.id.ingroup.nuc <- dplyr::ungroup(ref.allele.vcf.ingroup.fasta) %>%
dplyr::select(CHROM, LOCUS, POS, IN_GROUP)
# remove unused objects
ref.allele.vcf.ingroup <- ref.allele.vcf.ingroup.fasta <- NULL
sumstats.ingroup <- sumstats.outgroup <- NULL
# Same thing but with outgroup
ref.allele.outgroup.fasta <- markers.id %>%
dplyr::left_join(
fasta.prep %>%
dplyr::filter(ANCESTRAL == "outgroup") %>%
dplyr::select(-ALLELE, -ANCESTRAL)
, by = "LOCUS"
) %>%
dplyr::mutate(SEQUENCES = stringi::stri_replace_na(SEQUENCES, replacement = "NA")) %>%
dplyr::filter(SEQUENCES != "NA") %>%
# get the flanking bases, left and right, of the SNP
dplyr::mutate(OUT_GROUP = stringi::stri_sub(SEQUENCES, from = SNP_READ_POS - 1, to = SNP_READ_POS + 1)) %>%
dplyr::select(CHROM, LOCUS, POS, OUT_GROUP, SNP_READ_POS) %>%
dplyr::group_by(CHROM, LOCUS, POS, OUT_GROUP, SNP_READ_POS) %>%
dplyr::tally(.) %>%
dplyr::group_by(CHROM, LOCUS, POS) %>%
dplyr::filter(n == max(n))
fasta.prep <- NULL # remove unused object
# The problem is that some markers in the outgroup will have number of REF and ALT alleles equals...
# Making the ancestral allele call ambiguous (50/50 chance of differences...)
ambiguous.ancestral.allele <- ref.allele.outgroup.fasta %>%
dplyr::ungroup(.) %>%
dplyr::select(CHROM, LOCUS, POS, SNP_READ_POS) %>%
dplyr::group_by(CHROM, LOCUS, POS, SNP_READ_POS) %>%
dplyr::tally(.) %>%
dplyr::filter(n > 1) %>%
dplyr::select(CHROM, LOCUS, POS, SNP_READ_POS) %>%
dplyr::left_join(markers.id.ingroup.nuc, by = c("CHROM", "LOCUS", "POS")) %>%
dplyr::rename(OUT_GROUP = IN_GROUP) %>%
dplyr::ungroup(.) %>%
dplyr::arrange(CHROM, LOCUS, POS)
markers.id.ingroup.nuc <- NULL # remove unused object
outgroup <- ref.allele.outgroup.fasta %>%
dplyr::select(-n) %>%
dplyr::anti_join(ambiguous.ancestral.allele, by = c("CHROM", "LOCUS", "POS")) %>%
dplyr::ungroup(.) %>%
dplyr::arrange(CHROM, LOCUS, POS) %>%
dplyr::bind_rows(ambiguous.ancestral.allele) %>%
dplyr::ungroup(.) %>%
dplyr::arrange(CHROM, LOCUS, POS) %>%
dplyr::mutate(
OUT_GROUP = ifelse(SNP_READ_POS == max.length.read, stringi::stri_pad(OUT_GROUP, width = 3, side = "right", pad = "-"), OUT_GROUP),
OUT_GROUP = ifelse(SNP_READ_POS == 1, stringi::stri_pad(OUT_GROUP, width = 3, side = "left", pad = "-"), OUT_GROUP)
) %>%
dplyr::select(CHROM, LOCUS, POS, OUT_GROUP) %>%
dplyr::arrange(CHROM, LOCUS, POS) %>%
tidyr::unite(MARKERS, c(CHROM, LOCUS, POS), sep = "__")
ambiguous.ancestral.allele <- NULL # remove unused object
ref.allele.outgroup.fasta <- NULL # remove unused object
# common markers between ingroup and outgroup
markers.ingroup <- ingroup %>% dplyr::select(MARKERS)
markers.outgroup <- outgroup %>% dplyr::select(MARKERS)
markers.ingroup.outgroup.common <- dplyr::bind_rows(
markers.ingroup, markers.outgroup
) %>%
dplyr::group_by(MARKERS) %>%
dplyr::tally(.) %>%
dplyr::filter(n == 2) %>%
dplyr::arrange(MARKERS) %>%
dplyr::distinct(MARKERS)
markers.ingroup <- markers.outgroup <- NULL
message("Number of markers common between in- and out- group = ",
dplyr::n_distinct(markers.ingroup.outgroup.common$MARKERS))
dadi.input <- suppressWarnings(
input.count %>%
dplyr::group_by(MARKERS, POP_ID, REF, ALT) %>%
dplyr::mutate(
A1 = (2 * PP) + PQ,
A2 = (2 * QQ) + PQ
) %>%
dplyr::ungroup(.) %>%
dplyr::select(POP_ID, Allele1 = REF, A1, Allele2 = ALT, A2, MARKERS) %>%
radiator::rad_long(x = .,
cols = c("POP_ID", "MARKERS", "Allele1", "Allele2"),
measure_vars = c("A1", "A2"),
names_to = "ALLELE_GROUP",
values_to = "COUNT"
) %>%
tidyr::unite(POP, POP_ID, ALLELE_GROUP, sep = "_") %>%
radiator::rad_wide(x = ., formula = "MARKERS + Allele1 + Allele2 ~ POP", values_from = "COUNT") %>%
dplyr::select(
Allele1,
dplyr::contains("A1"),
Allele2,
dplyr::contains("A2"),
MARKERS
) %>%
dplyr::arrange(MARKERS) %>%
dplyr::ungroup(.) %>%
dplyr::semi_join(markers.ingroup.outgroup.common, by = "MARKERS") %>%
dplyr::inner_join(ingroup, by = "MARKERS") %>%
dplyr::inner_join(outgroup, by = "MARKERS") %>%
dplyr::select(
IN_GROUP, OUT_GROUP,
Allele1, dplyr::contains("A1"),
Allele2, dplyr::contains("A2"),
MARKERS
) %>%
dplyr::arrange(MARKERS)
)
# remove unused objects
outgroup <- ingroup <- markers.ingroup.outgroup.common <- markers.id <- NULL
} # With outgroup and ingroup fasta file
# We need to modify the header to remove "_A1" and "_A2"
# that were necessary for spreading the info accross columns
colnames(dadi.input) <- stringi::stri_replace_all_fixed(
str = colnames(dadi.input),
pattern = c("_A1", "_A2"),
replacement = "",
vectorize_all = FALSE
)
# dadi.input.filename = NULL
if (is.null(dadi.input.filename)) {
dadi.input.filename <- stringi::stri_join(
"dadi_input_", file.date, ".tsv", sep = "")
}
file.version <- suppressWarnings(
stringi::stri_join(
"#\u2202a\u2202i SNP input file generated with radiator v.",
utils::packageVersion("radiator"),
sep = "")
)
file.header.line <- suppressWarnings(
as.data.frame(stringi::stri_join(file.version, file.date, sep = " "))
)
readr::write_tsv(
x = file.header.line,
file = dadi.input.filename,
append = FALSE,
col_names = FALSE
) # write the header line
readr::write_tsv(
x = dadi.input,
file = dadi.input.filename,
append = TRUE,
col_names = TRUE
) # write the data frame
message("\u2202a\u2202i input file name written: ", dadi.input.filename)
return(dadi.input)
} # End write_dadi
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.