# Linkage map
# Genotype summary
#' @title Summary of \code{batch_x.genotypes.txt} and
#' \code{batch_x.markers.tsv} files.
#' @description Useful summary of \code{batch_x.genotypes.txt} and
#' \code{batch_x.markers.tsv} files produced by STACKS genotypes module use
#' for linkage mapping. Filter segregation distortion and output JoinMap and or
#' OneMap file.
#' @param genotypes The \code{genotypes = batch_x.genotypes.txt} created by STACKS genotypes
#' module.
#' @param markers The \code{markers = batch_x.markers.tsv} created by STACKS genotypes
#' module.
#' @param filter.monomorphic (optional) Should monomorphic loci be filtered out.
#' Default: \code{filter.monomorphic = TRUE}.
#' @param filter.missing.band (optional) Should loci with missing
#' band be filtered out. Default: \code{filter.missing.band = TRUE}.
#' @param filter.mean.log.likelihood (optional, integer) Apply a mean log likelihood
#' filter to the loci. e.g. filter.mean.log.likelihood = -10.
#' Default: \code{filter.mean.log.likelihood = NULL} (no filter)
#' @param B (optional, integer) The segregation distortion p-value
#' will be computed with a Monte Carlo test with \code{B} replicates,
#' if \code{B} is supplied.
#' For more details, see \code{\link[stats]{chisq.test}}.
#' Default: \code{B = NULL}.
#' @param filter.GOF (optional, integer) Filer value of the goodness-of-fit for segregation
#' distortion. Default: \code{filter.GOF = NULL}.
#' @param filter.GOF.p.value (optional, double) Filer the goodness-of-fit p-value of
#' GOF segregation distortion. Default: \code{filter.GOF.p.value = NULL}.
#' @param ind.genotyped (optional, integer) Filter the number of individual
#' progenies required to keep the marker. Default: \code{ind.genotyped = 1}.
#' @param joinmap (optional) Name of the JoinMap file to write
#' in the working directory. e.g. "joinmap.turtle.family3.loc".
#' Default: \code{joinmap = NULL}.
#' @param onemap (optional) Name of the OneMap file to write
#' in the working directory. e.g. "onemap.turtle.family3.txt".
#' Default: \code{onemap = NULL}.
#' @param filename (optional) The name of the summary file written
#' in the directory. No default.
#' Default: \code{filename = NULL}.
#' @return The function returns a list with the
#' genotypes summary \code{$geno.sum}, joinmap markers \code{$joinmap.sum}
#' and onemap markers \code{$onemap.sum} summary (use $ to access each
#' components). A JoinMap and/or OneMap file can also be exported to
#' the working direcvtory.
#' @details SIGNIFICANCE results pvalue (goodness-of-fit pvalue):
#' <= 0.0001 = ****, <= 0.001 & > 0.0001 = ***,
#' <= 0.01 & > 0.001 = **, <= 0.05 & > 0.01 = *.
#' @rdname summary_genotypes
#' @export
#' @details work in progress...
#' @examples
#' \dontrun{
#' linkage.map.crab <- summary_genotypes(
#' genotypes = "batch_10.markers.tsv",
#' markers = "batch_10.genotypes_300.txt",
#' filter.monomorphic = TRUE,
#' filter.missing.band = TRUE,
#' filter.mean.log.likelihood = -10,
#' B = NULL,
#' ind.genotyped = 300,
#' joinmap = "test.loc",
#' onemap = "test.onemap.txt",
#' filename = "genotypes.summary.tsv"
#' )
#' }
#' @references Catchen JM, Amores A, Hohenlohe PA et al. (2011)
#' Stacks: Building and Genotyping Loci De Novo From Short-Read Sequences.
#' G3, 1, 171-182.
#' @references Catchen JM, Hohenlohe PA, Bassham S, Amores A, Cresko WA (2013)
#' Stacks: an analysis tool set for population genomics.
#' Molecular Ecology, 22, 3124-3140.
#' @author Thierry Gosselin \email{thierrygosselin@@icloud.com}
summary_genotypes <- function(
genotypes,
markers,
filter.monomorphic = TRUE,
filter.missing.band = TRUE,
filter.mean.log.likelihood = NULL,
B = NULL,
filter.GOF = NULL,
filter.GOF.p.value = NULL,
ind.genotyped = 1,
joinmap = NULL,
onemap = NULL,
filename = NULL) {
message("\n\n\nThis function is no longer working with stacks >= v.2.0\n\n\n")
opt.change <- getOption("width")
options(width = 70)
timing <- proc.time()
cat("###############################################################################\n")
cat("########################### stackr::summary_genotypes #########################\n")
cat("###############################################################################\n")
if (missing(genotypes)) stop("batch_x.genotypes.txt file created by STACKS genotypes is required")
if (missing(markers)) stop("batch_x.markers.tsv file created by STACKS genotypes is required")
if (filter.monomorphic == FALSE) message("Keeping monomorphic markers for linkage mapping is not recommended")
message("Importing ", genotypes)
genotypes.file <- readr::read_tsv(file = genotypes, col_names = c("SQL_ID", "BATCH_ID", "LOCUS", "INDIVIDUALS", "GENOTYPES"), col_types = "iiiic", skip =1, progress = interactive())
message("Importing ", markers)
markers.file <- readr::read_tsv(file = markers, col_names = c("SQL_ID", "BATCH_ID", "LOCUS", "TYPE", "TOTAL_GENOTYPES", "MAX", "GENOTYPE_FREQS", "SEGREGATION_DISTORTION", "MEAN_LOG_LIKELIHOOD", "GENOTYPE_MAP", "UNCORRECTED_MARKER"), col_types = c("iiicidcddcc"),skip =1, progress = interactive())
geno.sum<- genotypes.file %>%
dplyr::select(LOCUS, INDIVIDUALS, GENOTYPES) %>%
dplyr::group_by(LOCUS, GENOTYPES) %>%
dplyr::summarize(COUNT = n()) %>%
dplyr::group_by(LOCUS) %>%
tidyr::spread(GENOTYPES, COUNT) %>%
dplyr::ungroup(.) %>%
merge(markers.file, by = "LOCUS") %>%
dplyr::mutate(
PATTERN = stringi::stri_replace_all_fixed(TYPE, c("--/ab", "aa/ab", "ab/--", "ab/aa","ab/ab", "ab/ac", "ab/cd"),
c("aaxab", "aaxab", "abxaa", "abxaa", "abxab", "abxac", "abxcd"), vectorize_all = FALSE),
JOINMAP = stringi::stri_replace_all_fixed(TYPE, c("--/ab", "aa/ab", "ab/--", "ab/aa", "ab/ab", "ab/ac", "ab/cd"),
c("nnxnp", "nnxnp", "lmxll", "lmxll", "hkxhk", "efxeg", "abxcd"), vectorize_all = FALSE),
ONEMAP = stringi::stri_replace_all_fixed(TYPE, c("--/ab", "aa/ab", "ab/--", "ab/aa", "ab/ab", "ab/ac", "ab/cd"),
c("D2.15", "D2.15", "D1.10", "D1.10", "B3.7", "A.2", "A.1"), vectorize_all = FALSE),
BANDS_EXP = ifelse(PATTERN=="abxac" | PATTERN=="abxcd", 4, ifelse(PATTERN=="abxab", 3, 2))
) %>%
merge(
genotypes.file %>%
dplyr::select(LOCUS, INDIVIDUALS, GENOTYPES) %>%
dplyr::filter (GENOTYPES != "--") %>%
dplyr::group_by(LOCUS, GENOTYPES) %>%
dplyr::summarize(COUNT = n()) %>%
dplyr::group_by(LOCUS) %>%
dplyr::tally(.) %>%
dplyr::select(LOCUS, BANDS_OBS=n)
, by="LOCUS"
) %>%
dplyr::mutate(
POLYMORPHIC = ifelse(BANDS_OBS > 1, "polymorphic","monomorphic"),
MISSING_BAND = ifelse(BANDS_OBS < BANDS_EXP, "missing_band", "all_bands")
) %>%
dplyr::ungroup(.)
if (filter.monomorphic == TRUE) {
geno.sum <- geno.sum %>% dplyr::filter(POLYMORPHIC == "polymorphic")
}
if (filter.missing.band == TRUE) {
# filter(PATTERN == "abxcd") %>% # use this for testing
geno.sum <- dplyr::filter(geno.sum, MISSING_BAND == "all_bands")
}
if(!is.null(filter.mean.log.likelihood)) {
geno.sum <- geno.sum %>% dplyr::filter(MEAN_LOG_LIKELIHOOD >= filter.mean.log.likelihood)
}
# Segregation Distortion summary ---------------------------------------------
if (is.null(B)) {
sim.p.value <- FALSE
B <- 1
} else {# With monte carlo replicates
sim.p.value <- TRUE
message("p-values computed with Monte Carlo simulations")
}
# abxac and/or abxcd markers
message("\nGenerating abxac and/or abxcd markers...")
pattern.A <- geno.sum %>%
dplyr::filter(PATTERN == "abxac" | PATTERN == "abxcd") %>%
dplyr::group_by(LOCUS) %>%
dplyr::do(
CHISQ=stats::chisq.test(
c(.$aa,.$ab, .$ac, .$bc),
p = c(0.25, 0.25, 0.25, 0.25),
simulate.p.value = sim.p.value, B = B)
) %>%
dplyr::mutate(
GOF = CHISQ$statistic,
GOF_PVALUE = CHISQ$p.value
) %>%
dplyr::select(-CHISQ)
# abxab markers
message("\nGenerating abxab markers...")
pattern.B <- geno.sum %>%
dplyr::filter(PATTERN == "abxab") %>%
dplyr::group_by(LOCUS) %>%
dplyr::do(
CHISQ=stats::chisq.test(
c(.$aa,.$ab, .$bb),
p = c(0.25, 0.5, 0.25),
simulate.p.value = sim.p.value, B = B)
) %>%
dplyr::mutate(
GOF = CHISQ$statistic,
GOF_PVALUE = CHISQ$p.value
) %>%
dplyr::select(-CHISQ)
# aaxab and/or abxaa markers
message("\nGenerating aaxab and/or abxaa markers")
pattern.D <- geno.sum %>%
dplyr::filter(PATTERN=="aaxab" | PATTERN=="abxaa") %>%
dplyr::group_by(LOCUS) %>%
dplyr::do(
CHISQ=stats::chisq.test(
c(.$aa,.$ab),
p = c(0.5,0.5),
simulate.p.value = sim.p.value, B = B)
) %>%
dplyr::mutate(
GOF = CHISQ$statistic,
GOF_PVALUE = CHISQ$p.value
) %>%
dplyr::select(-CHISQ)
chisq.data <- dplyr::bind_rows(pattern.A, pattern.B, pattern.D)
geno.sum <- dplyr::inner_join(geno.sum, chisq.data, by = "LOCUS") %>%
dplyr::mutate(
GOF = round(GOF,2),
GOF_PVALUE = round(GOF_PVALUE, 4),
SIGNIFICANCE = ifelse(GOF_PVALUE <= 0.0001, "****",
ifelse(GOF_PVALUE <= 0.001 & GOF_PVALUE > 0.0001, "***",
ifelse(GOF_PVALUE <= 0.01 & GOF_PVALUE > 0.001, "**",
ifelse(GOF_PVALUE <= 0.05 & GOF_PVALUE > 0.01, "*", ""))))
) %>%
dplyr::arrange(LOCUS)
##filter.GOF = FALSE, filter.GOF.p.value = FALSE, ind.genotyped
# Filter goodness-of-fit
if(!is.null(filter.GOF)) {
message("Segregation distortion: filtering goodness-of-fit")
geno.sum <- dplyr::filter(geno.sum, GOF < filter.GOF)
}
# Filter goodness-of-fit p-value
if(!is.null(filter.GOF.p.value)) {
message("Segregation distortion: filtering goodness-of-fit p-value")
geno.sum <- dplyr::filter(geno.sum, GOF_PVALUE < filter.GOF.p.value)
}
# Filter number of individuals genotyped
message("\nFiltering number of individuals genotyped")
geno.sum <- dplyr::filter(geno.sum, TOTAL_GENOTYPES > ind.genotyped)
# pattern summary
joinmap.sum <- geno.sum %>%
dplyr::group_by(JOINMAP) %>%
dplyr::tally(.) %>%
dplyr::rename(MARKER_NUMBER = n)
onemap.sum <- geno.sum %>%
dplyr::group_by(ONEMAP) %>%
dplyr::tally(.) %>%
dplyr::rename(MARKER_NUMBER = n)
# joinmap output--------------------------------------------------------------
if (!is.null(joinmap)) {
joinmap.data <- genotypes.file %>%
dplyr::select(LOCUS, INDIVIDUALS, GENOTYPES) %>%
dplyr::inner_join(
geno.sum %>%
dplyr::select(LOCUS, JOINMAP)
, by = "LOCUS") %>%
dplyr::mutate(
GENOTYPES = ifelse(JOINMAP == "efxeg", (stringi::stri_replace_all_fixed(GENOTYPES, c("aa", "ab", "ac", "bc"), c("ee", "ef", "eg", "fg"), vectorize_all = FALSE)), GENOTYPES),
GENOTYPES = ifelse(JOINMAP == "hkxhk", (stringi::stri_replace_all_fixed(GENOTYPES, c("aa", "ab", "bb"), c("hh", "hk", "kk"), vectorize_all = FALSE)), GENOTYPES),
GENOTYPES = ifelse(JOINMAP == "lmxll", (stringi::stri_replace_all_fixed(GENOTYPES, c("aa", "ab"), c("ll", "lm"), vectorize_all = FALSE)), GENOTYPES),
GENOTYPES = ifelse(JOINMAP == "nnxnp", (stringi::stri_replace_all_fixed(GENOTYPES, c("aa", "ab"), c("nn", "np"), vectorize_all = FALSE)), GENOTYPES),
JOINMAP = stringi::stri_replace_all_fixed(JOINMAP, c("efxeg", "hkxhk", "lmxll", "nnxnp"), c("<efxeg>", "<hkxhk>", "<lmxll>", "<nnxnp>"), vectorize_all = FALSE)
) %>%
dplyr::group_by(LOCUS, JOINMAP) %>%
tidyr::spread(INDIVIDUALS, GENOTYPES) %>%
dplyr::ungroup(.)
progeny.names <- genotypes.file %>%
dplyr::distinct(INDIVIDUALS) %>%
dplyr::arrange(INDIVIDUALS)
locus.number <- paste("nloc = ", nrow(geno.sum), sep = "")
ind.number <- paste("nind = ", nrow(progeny.names), sep = "")
# Now write the output
joinmap.file <- file(joinmap, "write")
cat(paste("name = fill this", "popt = fill this with cross type", locus.number , ind.number, sep = "\n"), sep = "\n", file = joinmap.file, append = TRUE)
utils::write.table(
joinmap.data, file = joinmap.file, append = TRUE, col.names = FALSE,
row.names = FALSE, sep = "\t", quote = FALSE)
suppressWarnings(
utils::write.table(
progeny.names, file = joinmap.file, append = TRUE, col.names = "individual names:",
row.names = FALSE, sep = "\t", quote = FALSE)
)
close(joinmap.file)
message("JoinMap filename written: ", joinmap)
}
# onemap output--------------------------------------------------------------
if (!is.null(onemap)) {
onemap.data <- genotypes.file %>%
dplyr::select(LOCUS, INDIVIDUALS, GENOTYPES) %>%
dplyr::inner_join(
geno.sum %>%
dplyr::select(LOCUS, ONEMAP)
, by="LOCUS") %>%
dplyr::mutate(
GENOTYPES = ifelse(ONEMAP == "A.2" & GENOTYPES == "aa", "a",
ifelse(ONEMAP == "A.2" & GENOTYPES == "ac", "a",
ifelse(ONEMAP == "A.2" & GENOTYPES == "bc", "ab",
ifelse(ONEMAP == "B3.7" & GENOTYPES == "aa", "a",
ifelse(ONEMAP == "B3.7" & GENOTYPES == "ab", "-",
ifelse(ONEMAP == "B3.7" & GENOTYPES == "bb", "ab",
ifelse(ONEMAP == "D1.10" & GENOTYPES == "aa", "a",
ifelse(GENOTYPES == "--", "-", GENOTYPES))))))))) %>%
dplyr::mutate(
ONEMAP = ifelse(ONEMAP == "A.2", "D1.10", "D1.10")
) %>%
dplyr::group_by(LOCUS, ONEMAP) %>%
tidyr::spread(INDIVIDUALS, GENOTYPES) %>%
dplyr::ungroup(.) %>%
dplyr::mutate(LOCUS = paste("*", LOCUS, sep="")) %>%
tidyr::unite(MARKERS, LOCUS, ONEMAP, sep=" ", remove = T) %>%
dplyr::mutate(MARKERS = paste(.[,1], .[,2], sep = "\t"))
progeny.names <- genotypes.file %>%
dplyr::distinct(INDIVIDUALS) %>%
dplyr::arrange(INDIVIDUALS)
# Now write the output
onemap.file <- file(onemap, "write")
cat(paste(nrow(progeny.names), nrow(geno.sum), sep="\t"), sep = "\n", file = onemap.file, append = TRUE)
utils::write.table(onemap.data, file = onemap.file, append = TRUE, col.names = FALSE,
row.names = FALSE, sep = ",", quote = FALSE)
close(onemap.file)
message("OneMap filename written: ", onemap)
}
# Save/Write the file to the working directory--------------------------------
if (!is.null(filename)) {
readr::write_tsv(geno.sum, filename, append = FALSE, col_names = TRUE)
message("genotypes summary written: ", filename)
}
res <- list()
res$geno.sum <- geno.sum
res$joinmap.sum <- joinmap.sum
res$onemap.sum <- onemap.sum
message("\nComputation time: ", round((proc.time() - timing)[[3]]), " sec")
cat("################################## completed ##################################\n")
options(width = opt.change)
return(res)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.