#' Data Input in fitPoly format
#'
#' Reads an external data file generated as output of \code{\link[fitPoly]{saveMarkerModels}}.
#' This function creates an object of class \code{mappoly.data}.
#'
#' @param file.in a character string with the name of (or full path to) the input file
#'
#' @param ploidy the ploidy level
#'
#' @param parent1 a character string containing the name (or pattern of genotype IDs) of parent 1
#'
#' @param parent2 a character string containing the name (or pattern of genotype IDs) of parent 2
#'
#' @param offspring a character string containing the name (or pattern of genotype IDs) of the offspring
#' individuals. If \code{NULL} (default) it considers all individuals as offsprings, except
#' \code{parent1} and \code{parent2}.
#'
#' @param filter.non.conforming if \code{TRUE} (default) converts data points with unexpected
#' genotypes (i.e. no double reduction) to 'NA'. See function \code{\link[mappoly]{segreg_poly}}
#' for information on expected classes and their respective frequencies.
#'
#' @param elim.redundant logical. If \code{TRUE} (default), removes redundant markers
#' during map construction, keeping them annotated to in order to include them in the final map.
#'
#' @param parent.geno indicates whether to use the joint probability \code{'joint'} (default) or the
#' maximum probability of multiple replicates (if available) to assign dosage to parents.
#' If there is one observation per parent, both options will yield the same results.
#'
#' @param thresh.parent.geno threshold probability to assign a dosage to parents. If the probability
#' is smaller than \code{thresh.parent.geno}, the marker is discarded.
#'
#' @param prob.thres threshold probability to assign a dosage to offspring. If the probability
#' is smaller than \code{prob.thres}, the data point is converted to 'NA'.
#'
#' @param file.type indicates whether the characters in the input file are separated by
#' 'white spaces' ("table") or by commas ("csv").
#'
#' @param verbose if \code{TRUE} (default), the current progress is shown; if
#' \code{FALSE}, no output is produced
#'
#' @return An object of class \code{mappoly.data} which contains a
#' list with the following components:
#' \item{ploidy}{ploidy level}
#' \item{n.ind}{number individuals}
#' \item{n.mrk}{total number of markers}
#' \item{ind.names}{the names of the individuals}
#' \item{mrk.names}{the names of the markers}
#' \item{dosage.p1}{a vector containing the dosage in
#' parent P for all \code{n.mrk} markers}
#' \item{dosage.p2}{a vector containing the dosage in
#' parent Q for all \code{n.mrk} markers}
#' \item{chrom}{a vector indicating which sequence each marker
#' belongs. Zero indicates that the marker was not assigned to any
#' sequence}
#' \item{genome.pos}{Physical position of the markers into the
#' sequence}
#' \item{seq.ref}{NULL (unused in this type of data)}
#' \item{seq.alt}{NULL (unused in this type of data)}
#' \item{all.mrk.depth}{NULL (unused in this type of data)}
#' \item{geno.dose}{a matrix containing the dosage for each markers (rows)
#' for each individual (columns). Missing data are represented by
#' \code{ploidy_level + 1}}
#' \item{n.phen}{number of phenotypic traits}
#' \item{phen}{a matrix containing the phenotypic data. The rows
#' correspond to the traits and the columns correspond
#' to the individuals}
#' \item{kept}{if elim.redundant = TRUE, holds all non-redundant markers}
#' \item{elim.correspondence}{if elim.redundant = TRUE, holds all non-redundant markers and
#' its equivalence to the redundant ones}
#' @examples
#' \donttest{
#' #### Tetraploid Example
#' ft <- "https://raw.githubusercontent.com/mmollina/MAPpoly_vignettes/master/data/fitpoly.dat"
#' tempfl <- tempfile()
#' download.file(ft, destfile = tempfl)
#' fitpoly.dat <- read_fitpoly(file.in = tempfl, ploidy = 4,
#' parent1 = "P1", parent2 = "P2",
#' verbose = TRUE)
#' print(fitpoly.dat, detailed = TRUE)
#' plot(fitpoly.dat)
#' plot_mrk_info(fitpoly.dat, 37)
#'}
#'
#' @author Marcelo Mollinari, \email{mmollin@ncsu.edu}
#'
#' @references
#'
#' Voorrips, R.E., Gort, G. & Vosman, B. (2011) Genotype calling
#' in tetraploid species from bi-allelic marker data using mixture
#' models. _BMC Bioinformatics_.
#' \doi{10.1186/1471-2105-12-172}
#'
#' @export read_fitpoly
#' @importFrom dplyr filter group_by summarise across
#' @importFrom utils read.delim
read_fitpoly <- function(file.in, ploidy, parent1, parent2, offspring = NULL,
filter.non.conforming = TRUE, elim.redundant = TRUE,
parent.geno = c("joint", "max"), thresh.parent.geno = 0.95,
prob.thres = 0.95, file.type = c("table", "csv"), verbose = TRUE)
{
file.type <- match.arg(file.type)
if(file.type == "table")
dat <- read.delim(file = file.in, header = TRUE, stringsAsFactors = FALSE)
else if(file.type == "csv")
dat <- read.csv(file = file.in, header = TRUE, stringsAsFactors = FALSE)
p1 <- unique(sapply(parent1, function(x) unique(grep(pattern = x, dat[,"SampleName"], value = TRUE))))
p2 <- unique(sapply(parent2, function(x) unique(grep(pattern = x, dat[,"SampleName"], value = TRUE))))
if(is.null(offspring)){
offspring <- setdiff(unique(dat[,"SampleName"]), c(p1, p2))
} else {
if(length(offspring) == 1)
offspring <- unique(grep(pattern = offspring, dat[,"SampleName"], value = TRUE))
}
parent.geno <- match.arg(parent.geno)
dat <- dat[,c("MarkerName", "SampleName",paste0("P", 0:ploidy))]
if(parent.geno == "joint"){
dat.p1 <- dat %>%
filter(SampleName %in% p1) %>%
group_by(MarkerName) %>%
summarise(across(2:(2+ploidy), prod), .groups = 'drop')
dat.p1 <- data.frame(dat.p1[,-1], row.names = dat.p1$MarkerName)
dat.p1 <- as.data.frame(t(apply(dat.p1, 1, function(x) round(x/sum(x),3))))
genoP1 <- apply(dat.p1, 1, function(x) {
if(any(is.na(x))) return(NA)
if(max(x) < thresh.parent.geno)
return(NA)
return(which.max(x)-1)
})
dat.p2 <- dat %>%
filter(SampleName %in% p2) %>%
group_by(MarkerName) %>%
summarise(across(2:(2+ploidy), prod), .groups = 'drop')
dat.p2 <- data.frame(dat.p2[,-1], row.names = dat.p2$MarkerName)
dat.p2 <- as.data.frame(t(apply(dat.p2, 1, function(x) round(x/sum(x),3))))
genoP2 <- apply(dat.p2, 1, function(x) {
if(any(is.na(x))) return(NA)
if(max(x) < thresh.parent.geno)
return(NA)
return(which.max(x)-1)
})
}
if(parent.geno == "max"){
dat.p1 <- dat %>%
filter(SampleName %in% p1) %>%
group_by(MarkerName) %>%
summarise(across(2:(2+ploidy), max), .groups = 'drop')
dat.p1 <- data.frame(dat.p1[,-1], row.names = dat.p1$MarkerName)
genoP1 <- apply(dat.p1, 1, function(x) {
if(any(is.na(x))) return(NA)
if(max(x) < thresh.parent.geno)
return(NA)
return(which.max(x)-1)
})
dat.p2 <- dat %>%
filter(SampleName %in% p2) %>%
group_by(MarkerName) %>%
summarise(across(2:(2+ploidy), max), .groups = 'drop')
dat.p2 <- data.frame(dat.p2[,-1], row.names = dat.p2$MarkerName)
genoP2 <- apply(dat.p2, 1, function(x) {
if(any(is.na(x))) return(NA)
if(max(x) < thresh.parent.geno)
return(NA)
return(which.max(x)-1)
})
}
## get marker names ----------------------
mrk.names <- names(which(!is.na(genoP1 + genoP2)))
## get number of individuals -------------
n.ind <- length(offspring)
## get number of markers -----------------
n.mrk <- length(mrk.names)
## get individual names ------------------
ind.names <- offspring
## get dosage in parent P ----------------
dosage.p1 <- as.integer(genoP1[mrk.names])
names(dosage.p1) <- mrk.names
## get dosage in parent Q ----------------
dosage.p2 <- as.integer(genoP2[mrk.names])
names(dosage.p2) <- mrk.names
## monomorphic markers
dp <- abs(abs(dosage.p1-(ploidy/2))-(ploidy/2))
dq <- abs(abs(dosage.p2-(ploidy/2))-(ploidy/2))
mrk.names <- names(which(dp+dq != 0))
dosage.p1 <- dosage.p1[mrk.names]
dosage.p2 <- dosage.p2[mrk.names]
nphen <- 0
phen <- NULL
if (verbose){
cat("Reading the following data:")
cat("\n Ploidy level:", ploidy)
cat("\n No. individuals: ", n.ind)
cat("\n No. markers: ", n.mrk)
cat("\n No. informative markers: ", length(mrk.names), " (", round(100*length(mrk.names)/n.mrk,1), "%)", sep = "")
cat("\n ...")
}
## get genotypic info --------------------
MarkerName <- SampleName <- NULL
geno <- dat %>%
filter(SampleName %in% offspring) %>%
filter(MarkerName %in% mrk.names) %>%
arrange(SampleName, MarkerName)
colnames(geno) <- c("mrk", "ind", as.character(0:ploidy))
ind.names <- unique(geno$ind)
mrk.names <- unique(geno$mrk)
dosage.p1 <- dosage.p1[mrk.names]
dosage.p2 <- dosage.p2[mrk.names]
## transforming na's in expected genotypes using Mendelian segregation
i.na <- which(apply(geno, 1, function(x) any(is.na(x))))
if (length(i.na) > 0) {
m.na <- match(geno[i.na, 1], mrk.names)
dp.na <- dosage.p1[m.na]
dq.na <- dosage.p2[m.na]
for (i in 1:length(m.na)) geno[i.na[i], -c(1, 2)] <- segreg_poly(ploidy, dp.na[i], dq.na[i])
}
## dosage info
if(filter.non.conforming){
geno.dose <- matrix(NA,1,1)
} else {
geno.dose <- dist_prob_to_class(geno = geno, prob.thres = prob.thres)
if(geno.dose$flag)
{
geno <- geno.dose$geno
geno.dose <- geno.dose$geno.dose
n.ind <- ncol(geno.dose)
ind.names <- colnames(geno.dose)
} else {
geno.dose <- geno.dose$geno.dose
}
geno.dose[is.na(geno.dose)] <- ploidy + 1
}
## returning the 'mappoly.data' object
if (verbose) cat("\n Done with reading.\n")
res <- structure(list(ploidy = ploidy,
n.ind = n.ind,
n.mrk = length(mrk.names),
ind.names = ind.names,
mrk.names = mrk.names,
dosage.p1 = dosage.p1,
dosage.p2 = dosage.p2,
chrom = rep(NA, length(mrk.names)),
genome.pos = rep(NA, length(mrk.names)),
seq.ref = NULL,
seq.alt = NULL,
all.mrk.depth = NULL,
prob.thres = prob.thres,
geno = geno,
geno.dose = geno.dose,
nphen = nphen,
phen = phen,
chisq.pval = NULL,
kept = NULL,
elim.correspondence = NULL),
class = "mappoly.data")
if(filter.non.conforming){
if (verbose) cat(" Filtering non-conforming markers.\n ...")
res <- filter_non_conforming_classes(res)
if (verbose) cat("\n Performing chi-square test.\n ...")
##Computing chi-square p.values
Ds <- array(NA, dim = c(ploidy+1, ploidy+1, ploidy+1))
for(i in 0:ploidy)
for(j in 0:ploidy)
Ds[i+1,j+1,] <- segreg_poly(ploidy = ploidy, dP = i, dQ = j)
Dpop <- cbind(res$dosage.p1, res$dosage.p2)
M <- t(apply(Dpop, 1, function(x) Ds[x[1]+1, x[2]+1,]))
dimnames(M) <- list(res$mrk.names, c(0:ploidy))
M <- cbind(M, res$geno.dose)
res$chisq.pval <- apply(M, 1, mrk_chisq_test, ploidy = ploidy)
if (verbose) cat("\n Done.\n")
}
if (elim.redundant){
seqred = make_seq_mappoly(res, arg = 'all', data.name = res)
redun = elim_redundant(seqred, data = res)
if (nrow(redun$elim.correspondence) < 1) return(res)
res$kept = redun$kept
res$elim.correspondence = redun$elim.correspondence
mrks.rem = match(res$elim.correspondence$elim, res$mrk.names)
res$elim.correspondence$chrom = res$chrom[c(mrks.rem)]
res$elim.correspondence$genome.pos = res$genome.pos[c(mrks.rem)]
res$elim.correspondence$seq.ref = NULL
res$elim.correspondence$seq.alt = NULL
res$elim.correspondence$all.mrk.depth = NULL
res$n.mrk = length(res$kept)
res$mrk.names = res$mrk.names[-c(mrks.rem)]
res$geno.dose = res$geno.dose[-c(mrks.rem),]
res$geno = res$geno[which(res$geno$mrk %in% rownames(res$geno.dose)),]
res$dosage.p1 = res$dosage.p1[-c(mrks.rem)]
res$dosage.p2 = res$dosage.p2[-c(mrks.rem)]
res$chrom = res$chrom[-c(mrks.rem)]
res$genome.pos = res$genome.pos[-c(mrks.rem)]
res$chisq.pval = res$chisq.pval[-c(mrks.rem)]
}
return(res)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.