Nothing
#' Extract the LOD Scores in a \code{'mappoly.map'} object
#' @param x an object of class \code{mappoly.map}
#' @param sorted logical. if \code{TRUE}, the LOD Scores are displayed
#' in a decreasing order
#' @return a numeric vector containing the LOD Scores
#' @keywords internal
#' @export
get_LOD <- function(x, sorted = TRUE) {
w <- sapply(x$maps, function(x) x$loglike)
LOD <- w - max(w)
if (sorted)
LOD <- sort(LOD, decreasing = TRUE)
abs(LOD)
}
#' Get recombination fraction from a matrix
#'
#' @param void internal function to be documented
#' @keywords internal
#' @export
get_rf_from_mat <- function(M){
r <- numeric(nrow(M)-1)
for(i in 1:(nrow(M)-1)){
r[i] <- M[i, i+1]
}
r
}
#' Is it a probability dataset?
#'
#' @param void internal function to be documented
#' @keywords internal
#' @export
is.prob.data <- function(x){
exists('geno', where = x)
}
#' Get the number of bivalent configurations
#'
#' @param void internal function to be documented
#' @keywords internal
#' @export
get_w_m <- function(ploidy){
if(ploidy%%2 != 0) stop("ploidy level should be an even number")
if(ploidy <= 0) stop("ploidy level should be greater than zero")
1/factorial((ploidy/2)) * prod(choose(seq(2, ploidy, 2),2))
}
#' Reverse map
#'
#' Provides the reverse of a given map.
#'
#' @param input.map an object of class \code{mappoly.map}
#'
#' @examples
#' plot_genome_vs_map(solcap.mds.map[[1]])
#' plot_genome_vs_map(rev_map(solcap.mds.map[[1]]))
#' @author Marcelo Mollinari, \email{mmollin@ncsu.edu}
#'
#'@export
#'
rev_map <- function(input.map)
{
output.map <- input.map
output.map$info <- lapply(output.map$info, rev)
for(i in 1:length(output.map$maps))
{
output.map$maps[[i]]$seq.num <- rev(input.map$maps[[i]]$seq.num)
output.map$maps[[i]]$seq.rf <- rev(input.map$maps[[i]]$seq.rf)
output.map$maps[[i]]$seq.ph$P <- rev(input.map$maps[[i]]$seq.ph$P)
output.map$maps[[i]]$seq.ph$Q <- rev(input.map$maps[[i]]$seq.ph$Q)
}
return(output.map)
}
#' Returns the class with the highest probability in
#' a genotype probability distribution
#'
#' @param geno the probabilistic genotypes contained in the object
#' \code{'mappoly.data'}
#' @param prob.thres probability threshold to select the genotype.
#' Values below this genotype are assumed as missing data
#' @return a matrix containing the doses of each genotype and
#' marker. Markers are disposed in rows and individuals are
#' disposed in columns. Missing data are represented by NAs
#' @keywords internal
#' @examples
#' \donttest{
#' geno.dose <- dist_prob_to_class(hexafake.geno.dist$geno)
#' geno.dose$geno.dose[1:10,1:10]
#'}
#' @importFrom magrittr "%>%"
#' @importFrom reshape2 melt dcast
#' @importFrom dplyr group_by filter arrange
#' @export
dist_prob_to_class <- function(geno, prob.thres = 0.9) {
a <- reshape2::melt(geno, id.vars = c("mrk", "ind"))
mrk <- ind <- value <- variable <- NULL # Setting the variables to NULL first
a$variable <- as.numeric(levels(a$variable))[a$variable]
b <- a %>%
dplyr::group_by(mrk, ind) %>%
dplyr::filter(value > prob.thres) %>%
dplyr::arrange(mrk, ind, variable)
z <- reshape2::dcast(data = b[,1:3], formula = mrk ~ ind, value.var = "variable")
rownames(z) <- z[,"mrk"]
z <- data.matrix(frame = z[,-1])
n <- setdiff(unique(geno$mrk), rownames(z))
if(length(n) > 0)
{
ploidy <- matrix(NA, nrow = length(n), ncol = ncol(z), dimnames = list(n, colnames(z)))
z <- rbind(z,ploidy)
}
rm.ind <- setdiff(unique(geno$ind), colnames(z))
flag <- FALSE
if(length(rm.ind) > 0){
flag <- TRUE
warning("Inividual(s) ", paste(rm.ind, collapse = " "),
"\n did not meet the 'prob.thres' criteria for any of\n the markers and was (were) removed.")
geno <- geno %>% dplyr::filter(ind %in% colnames(z))
}
z <- z[as.character(unique(geno$mrk)), as.character(unique(geno$ind))]
list(geno.dose = z, geno = geno, flag = flag)
}
#' Export data to \code{polymapR}
#'
#' See examples at \url{https://rpubs.com/mmollin/tetra_mappoly_vignette}.
#'
#' @param data.in an object of class \code{mappoly.data}
#' @return a dosage \code{matrix}
#' @author Marcelo Mollinari, \email{mmollin@ncsu.edu}
#' @export export_data_to_polymapR
export_data_to_polymapR <- function(data.in)
{
data.out <- as.matrix(data.frame(P1 = data.in$dosage.p1,
P2 = data.in$dosage.p2,
data.in$geno.dose))
data.out[data.out == (data.in$ploidy + 1)] <- NA
return(data.out)
}
#' Msg function
#'
#' @param void internal function to be documented
#' @keywords internal
#' @importFrom cli rule
msg <- function(text, line = 1){
cli::rule(line = line, right = text) %>%
text_col() %>%
message()
}
#' @importFrom rstudioapi isAvailable hasFun getThemeInfo
#' @importFrom crayon white black
text_col <- function(x) {
# If RStudio not available, messages already printed in black
if (!rstudioapi::isAvailable()) {
return(x)
}
if (!rstudioapi::hasFun("getThemeInfo")) {
return(x)
}
theme <- rstudioapi::getThemeInfo()
if (isTRUE(theme$dark)) crayon::white(x) else crayon::black(x)
}
#' Map functions
#'
#' @param void internal function to be documented
#' @keywords internal
#' @export
mf_k <- function(d) 0.5 * tanh(d/50)
#'
#' Map functions
#'
#' @param void internal function to be documented
#' @keywords internal
#' @export
mf_h <- function(d) 0.5 * (1 - exp(-d/50))
#' Map functions
#'
#' @param void internal function to be documented
#' @keywords internal
#' @export
mf_m <- function(d) sapply(d, function(a) min(a/100, 0.5))
#' Map functions
#'
#' @param void internal function to be documented
#' @keywords internal
#' @export
imf_k <- function(r) {
r[r >= 0.5] <- 0.5 - 1e-14
50 * atanh(2 * r)
}
#' Map functions
#'
#' @param void internal function to be documented
#' @keywords internal
#' @export
imf_h <- function(r) {
r[r >= 0.5] <- 0.5 - 1e-14
-50 * log(1 - 2 * r)
}
#' Map functions
#'
#' @param void internal function to be documented
#' @keywords internal
#' @export
imf_m <- function(r) sapply(r, function(a) min(a * 100, 50))
#' Compare two polyploid haplotypes stored in list format
#'
#' @param ploidy ploidy level
#' @param h1 homology group 1
#' @param h2 homology group 2
#' @return a numeric vector of size \code{ploidy} indicating which
#' homolog in h2 represents the homolog in h1. If there is
#' no correspondence, i.e. different homolog, it returns NA for
#' that homolog.
#' @keywords internal
#' @export compare_haplotypes
compare_haplotypes <- function(ploidy, h1, h2) {
I1 <- matrix(0, ploidy, length(h1))
I2 <- matrix(0, ploidy, length(h2))
for (i in 1:length(h1)) {
I1[h1[[i]], i] <- 1
I2[h2[[i]], i] <- 1
}
a <- apply(I1, 1, paste, collapse = "")
b <- apply(I2, 1, paste, collapse = "")
haplo.ord <- match(a, b)
list(is.same.haplo = !any(is.na(haplo.ord)), haplo.ord = haplo.ord)
}
#' Genotypic information content
#'
#' This function plots the genotypic information content given
#' an object of class \code{mappoly.homoprob}.
#'
#' @param hprobs an object of class \code{mappoly.homoprob}
#'
#' @param P a string containing the name of parent P
#'
#' @param Q a string containing the name of parent Q
#'
#'
#' @examples
#' \donttest{
#' w <- lapply(solcap.err.map[1:3], calc_genoprob)
#' h.prob <- calc_homologprob(w)
#' plot_GIC(h.prob)
#' }
#'
#' @importFrom dplyr mutate
#' @importFrom ggplot2 facet_wrap element_text ylim scale_color_discrete
#' @export plot_GIC
plot_GIC <- function(hprobs, P = "P1", Q = "P2"){
if(!inherits(hprobs, "mappoly.homoprob"))
stop(" 'hprobs' should be of class 'mappoly.homoprob'")
LG <- map.position <- GIC <- p1 <- m1 <- homolog <- marker <- probability <- NULL
DF <- hprobs$homoprob %>%
dplyr::mutate(p1 = probability * (1-probability)) %>%
group_by(marker, homolog, map.position, LG) %>%
summarise(m1 = sum(p1)) %>%
mutate(GIC = 1-(4/hprobs$info$n.ind)*m1 , parent = ifelse(homolog%in%letters[1:hprobs$info$ploidy], P, Q))
head(as.data.frame(DF))
print(ggplot(DF, aes(map.position, GIC, colour = homolog)) +
geom_smooth(alpha = .8, se = FALSE) + facet_wrap(~LG, nrow = 3, ncol = 5) +
facet_grid(parent~LG, scales = "free_x", space = "free_x") +
geom_hline(yintercept = .8, linetype = "dashed") + ylim(0,1) +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
scale_color_discrete(name = "Homologs") +
ylab("Genotypic Information Content") + xlab("Distance (cM)"))
return(invisible(DF))
}
#' Plot two overlapped haplotypes
#'
#' @param void internal function to be documented
#' @keywords internal
#' @export plot_compare_haplotypes
plot_compare_haplotypes <- function(ploidy, hom.allele.p1, hom.allele.q1, hom.allele.p2 = NULL, hom.allele.q2 = NULL) {
nmmrk <- names(hom.allele.p1)
oldpar <- par(mar = c(5.1, 4.1, 4.1, 2.1))
on.exit(par(oldpar))
o1 <- order(apply(ph_list_to_matrix(hom.allele.p1, ploidy), 2, paste, collapse = ""), decreasing = TRUE)
hom.allele.p1 <- ph_matrix_to_list(ph_list_to_matrix(hom.allele.p1, ploidy)[, o1])
o2 <- order(apply(ph_list_to_matrix(hom.allele.q1, ploidy), 2, paste, collapse = ""), decreasing = TRUE)
hom.allele.q1 <- ph_matrix_to_list(ph_list_to_matrix(hom.allele.q1, ploidy)[, o2])
if (!is.null(hom.allele.p2)) {
o3 <- order(apply(ph_list_to_matrix(hom.allele.p2, ploidy), 2, paste, collapse = ""), decreasing = TRUE)
hom.allele.p2 <- ph_matrix_to_list(ph_list_to_matrix(hom.allele.p2, ploidy)[, o3])
}
if (!is.null(hom.allele.q2)) {
o4 <- order(apply(ph_list_to_matrix(hom.allele.q2, ploidy), 2, paste, collapse = ""), decreasing = TRUE)
hom.allele.q2 <- ph_matrix_to_list(ph_list_to_matrix(hom.allele.q2, ploidy)[, o4])
}
col2 <- 0 #rgb(1,0,.5,0.5)
col1 <- rgb(1, 0, 0, 0.5)
col3 <- rgb(0, 0, 1, 0.5)
n.mrk <- length(hom.allele.p1)
plot(c(0, 22), c(0, -(ploidy + 15)), type = "n", axes = FALSE, xlab = "", main = "", ylab = "")
for (i in -(1:ploidy)) {
lines(c(0, 10), c(i, i), lwd = 1, col = "darkgray", lty = 2)
lines(c(12, 22), c(i, i), lwd = 1, col = "darkgray", lty = 2)
}
pos.p <- cumsum(c(0, rep(1, n.mrk - 1)/sum(rep(1, n.mrk - 1))) * 10)
for (i in 1:n.mrk) {
points(x = rep(pos.p[i], ploidy), y = -c(1:ploidy), pch = 15, col = col2, cex = 2)
if (any(hom.allele.p1[[i]] != 0))
points(x = rep(pos.p[i], length(hom.allele.p1[[i]])), y = -hom.allele.p1[[i]], col = col1, pch = 15, cex = 2)
if (any(hom.allele.p2[[i]] != 0))
if (!is.null(hom.allele.p2))
points(x = rep(pos.p[i], length(hom.allele.p2[[i]])), y = -hom.allele.p2[[i]], col = col3, pch = 15, cex = 2)
}
text(x = pos.p, y = rep(-(ploidy+1), length(pos.p)), labels = nmmrk, cex = .5)
pos.q <- pos.p + 12
for (i in 1:n.mrk) {
points(x = rep(pos.q[i], ploidy), y = -c(1:ploidy), col = col2, pch = 15, cex = 2)
if (any(hom.allele.q1[[i]] != 0))
points(x = rep(pos.q[i], length(hom.allele.q1[[i]])), y = -hom.allele.q1[[i]], col = col1, pch = 15, cex = 2)
if (any(hom.allele.q2[[i]] != 0))
if (!is.null(hom.allele.q2))
points(x = rep(pos.q[i], length(hom.allele.q2[[i]])), y = -hom.allele.q2[[i]], col = col3, pch = 15, cex = 2)
}
text(x = pos.q, y = rep(-(ploidy+1) , length(pos.q)), labels = nmmrk, cex = .5)
text(x = 11, y = -(ploidy + 1)/2, labels = "X", cex = 1)
}
#' Check if it is possible to estimate the recombination
#' fraction between neighbor markers using two-point
#' estimation
#'
#' @param void internal function to be documented
#' @keywords internal
#' @export
check_if_rf_is_possible <- function(input.seq){
dp <- abs(abs(input.seq$seq.dose.p1-(input.seq$ploidy/2))-(input.seq$ploidy/2))
dq <- abs(abs(input.seq$seq.dose.p2-(input.seq$ploidy/2))-(input.seq$ploidy/2))
y <- numeric(length(input.seq$seq.num)-1)
for(i in 1:length(y))
y[i] <- (dp[i] == 0 && dq[i+1] == 0) ||
(dp[i+1] == 0 && dq[i] == 0) ||
(dp[i] == 0 && dq[i] == 0) ||
(dp[i+1] == 0 && dq[i+1] == 0)
y <- as.logical(y)
!y
}
#' N! combination
#'
#' @param void internal function to be documented
#' @keywords internal
#' @export
perm_tot <- function(v) {
n <- length(v)
result <- v
if (n > 1) {
M <- perm_tot(v[2:n])
result <- cbind(v[1], M)
if (n > 2) {
for (i in 2:(n - 1)) {
N <- cbind(M[, 1:(i - 1)], v[1], M[, i:(n - 1)])
result <- rbind(result, N)
}
}
N <- cbind(M, v[1])
result <- rbind(result, N)
}
return(result)
}
#' N!/2 combination
#'
#' @param void internal function to be documented
#' @keywords internal
#' @export
perm_pars <- function(v) {
n <- length(v)
result <- v
if (n > 2) {
Mt <- perm_tot(v[2:n])
result <- cbind(v[1], Mt)
f <- floor(n/2)
c <- ceiling(n/2)
if (n > 3) {
for (i in 2:f) {
N <- cbind(Mt[, 1:(i - 1)], v[1], Mt[, i:(n - 1)])
result <- rbind(result, N)
}
}
if (c > f) {
Ms <- perm_pars(v[2:n])
if (n > 3) {
N <- cbind(Ms[, 1:f], v[1], Ms[, c:(n - 1)])
} else {
N <- cbind(Ms[1:f], v[1], Ms[c:(n - 1)])
}
result <- rbind(result, N)
}
}
return(result)
}
#' Color pallet ggplot-like
#'
#' @param void internal function to be documented
#' @keywords internal
#' @export
#' @importFrom grDevices hcl col2rgb hsv rgb2hsv
gg_color_hue <- function(n) {
x <- rgb2hsv(col2rgb("steelblue"))[, 1]
cols = seq(x[1], x[1] + 1, by = 1/n)
cols = cols[1:n]
cols[cols > 1] <- cols[cols > 1] - 1
return(hsv(cols, x[2], x[3]))
}
#' MAPpoly pallet 1
#'
#' @param void internal function to be documented
#' @keywords internal
#' @export
mp_pallet1 <- colorRampPalette(c("#ffe119", "#f58231","#e6194b","#808000","#9a6324", "#800000"))
#' MAPpoly pallet 2
#'
#' @param void internal function to be documented
#' @keywords internal
#' @export
mp_pallet2 <- colorRampPalette(c("#911eb4", "#000075","#4363d8","#42d4f4","#469990", "#3cb44b"))
#' MAPpoly pallet 3
#'
#' @param void internal function to be documented
#' @keywords internal
#' @export
mp_pallet3 <- colorRampPalette(c("#ffe119", "#f58231","#e6194b","#808000","#9a6324", "#800000","#911eb4", "#000075","#4363d8","#42d4f4","#469990", "#3cb44b"))
#' Update missing information
#'
#' Updates the missing data in the dosage matrix of an object of class
#' \code{mappoly.data} given a new probability threshold
#' @param input.data an object of class \code{mappoly.data}
#' @param prob.thres probability threshold to associate a marker call to a
#' dosage. Markers with maximum genotype probability smaller than 'prob.thres'
#' are considered as missing data for the dosage calling purposes
#' @examples
#' \donttest{
#' data.updated = update_missing(hexafake.geno.dist, prob.thres = 0.5)
#' print(hexafake.geno.dist)
#' print(data.updated)
#' }
#' @author Marcelo Mollinari, \email{mmollin@ncsu.edu}
#' @export
update_missing <- function(input.data, prob.thres = 0.95){
geno.dose <- dist_prob_to_class(geno = input.data$geno, prob.thres = prob.thres)
if(geno.dose$flag)
{
geno <- geno.dose$geno
geno.dose <- geno.dose$geno.dose
} else {
geno.dose <- geno.dose$geno.dose
}
geno.dose[is.na(geno.dose)] <- input.data$ploidy + 1
input.data$geno.dose <- geno.dose
input.data$prob.thres <- prob.thres
return(input.data)
}
#' Chi-square test
#'
#' @param void internal function to be documented
#' @keywords internal
mrk_chisq_test <- function(x, ploidy){
y <- x[-c(1:(ploidy+1))]
y[y == ploidy+1] <- NA
y <- table(y, useNA = "always")
names(y) <- c(names(y)[-length(y)], "NA")
seg.exp <- x[0:(ploidy+1)]
seg.exp <- seg.exp[seg.exp != 0]
seg.obs <- seg.exp
seg.obs[names(y)[-length(y)]] <- y[-length(y)]
pval <- suppressWarnings(stats::chisq.test(x = seg.obs, p = seg.exp[names(seg.obs)])$p.value)
pval
}
#' Get the genomic position of markers in a sequence
#'
#' This functions gets the genomic position of markers in a sequence and
#' return an ordered data frame with the name and position of each marker
#' @param input.seq a sequence object of class \code{mappoly.sequence}
#' @param verbose if \code{TRUE} (default), the current progress is shown; if
#' \code{FALSE}, no output is produced
#' @param x an object of the class mappoly.geno.ord
#' @param ... currently ignored
#'
#' @author Marcelo Mollinari, \email{mmollin@ncsu.edu}
#' @examples
#' s1 <- make_seq_mappoly(tetra.solcap, "all")
#' o1 <- get_genomic_order(s1)
#' plot(o1)
#' s.geno.ord <- make_seq_mappoly(o1)
#' @export get_genomic_order
get_genomic_order <- function(input.seq, verbose = TRUE){
if (!inherits(input.seq, "mappoly.sequence")) {
stop(deparse(substitute(input.seq)),
" is not an object of class 'mappoly.sequence'")
}
if(all(is.na(input.seq$genome.pos))){
if(all(is.na(input.seq$chrom)))
stop("No sequence or sequence position information found.")
else{
if (verbose) message("Ordering markers based on chromosome information")
M <- data.frame(seq = input.seq$chrom, row.names = input.seq$seq.mrk.names)
M.out <- M[order(M[,1]),]
}
} else if(all(is.na(input.seq$chrom))){
if(all(is.na(input.seq$genome.pos)))
stop("No sequence or sequence position information found.")
else{
if (verbose) message("Ordering markers based on sequence position information")
M <- data.frame(seq.pos = input.seq$genome.pos, row.names = input.seq$seq.mrk.names)
M.out <- M[order(as.numeric(M[,1])),]
}
} else{
M <- data.frame(seq = input.seq$chrom,
seq.pos = input.seq$genome.pos,
row.names = input.seq$seq.mrk.names)
M.out <- M[order(as.numeric(M[,1]), as.numeric(M[,2])),]
}
structure(list(data.name = input.seq$data.name, ord = M.out), class = "mappoly.geno.ord")
}
#' @rdname get_genomic_order
#' @export
print.mappoly.geno.ord <- function(x, ...){
print(head(x$ord))
}
#' @rdname get_genomic_order
#' @export
#' @importFrom ggplot2 ggplot aes geom_point xlab ylab theme
plot.mappoly.geno.ord <- function(x, ...){
seq <- seq.pos <- NULL
x$ord$seq<- as.factor(x$ord$seq)
levels(x$ord$seq) <- levels(x$ord$seq)[order(as.numeric(levels(x$ord$seq)))]
ggplot2::ggplot(as.data.frame(x$ord),
ggplot2::aes(x = seq.pos, y = seq, group = as.factor(seq))) +
ggplot2::geom_point(ggplot2::aes(color = as.factor(seq)), shape = 108, size = 5, show.legend = FALSE) +
ggplot2::xlab("Position") +
ggplot2::ylab("Chromosome")
}
#' Remove markers from a map
#'
#' This function creates a new map by removing markers from an existing one.
#'
#' @param input.map an object of class \code{mappoly.map}
#' @param mrk a vector containing markers to be removed from the input map,
#' identified by their names or positions
#' @return an object of class \code{mappoly.map}
#'
#' @param verbose if \code{TRUE} (default), the current progress is shown; if
#' \code{FALSE}, no output is produced
#'
#' @author Marcelo Mollinari, \email{mmollin@ncsu.edu}
#'
#' @examples
#' sub.map <- get_submap(maps.hexafake[[1]], 1:50, reestimate.rf = FALSE)
#' plot(sub.map, mrk.names = TRUE)
#' mrk.to.remove <- c("M_1", "M_23", "M_34")
#' red.map <- drop_marker(sub.map, mrk.to.remove)
#' plot(red.map, mrk.names = TRUE)
#'
#' @export
drop_marker <- function(input.map, mrk, verbose = TRUE)
{
## Checking class of arguments
if (!inherits(input.map, "mappoly.map")) {
stop(deparse(substitute(input.map)), " is not an object of class 'mappoly.map'")
}
## Checking markers to be removed
if(is.numeric(mrk) & all(mrk >= 0)){
if(any(mrk > input.map$info$n.mrk))
stop("At least one marker position is not contained in the map.")
} else if(is.character(mrk)){
if(any(!mrk%in%input.map$info$mrk.names)){
stop("At least one marker position is not contained in the map.")
} else {
mrk <- which(input.map$info$mrk.names%in%mrk)
}
}
## Getting new map
suppressMessages(output.map <- get_submap(input.map = input.map,
mrk.pos = c(1:input.map$info$n.mrk)[-mrk],
phase.config = 1,
reestimate.rf = FALSE))
if(length(input.map$maps) > 1){
for(i in 2:length(input.map$maps)){
suppressMessages(temp.map <- get_submap(input.map = input.map,
mrk.pos = c(1:input.map$info$n.mrk)[-mrk],
phase.config = 1,
reestimate.rf = FALSE))
output.map$maps[[i]] <- temp.map$maps[[1]]
}
}
if (verbose) message("
INFO:
-----------------------------------------
The recombination fractions provided were
obtained using the marker positions in the
input map; For accurate values, plese
reestimate the map using functions 'reest_rf',
'est_full_hmm_with_global_error' or
'est_full_hmm_with_prior_prob'")
return(output.map)
}
#' Add a single marker to a map
#'
#' Creates a new map by adding a marker in a given position in a pre-built map.
#'
#' \code{add_marker} splits the input map into two sub-maps to the left and the
#' right of the given position. Using the genotype probabilities, it computes
#' the log-likelihood of all possible linkage phases under a two-point threshold
#' inherited from function \code{\link[mappoly]{rf_list_to_matrix}}.
#'
#' @param input.map an object of class \code{mappoly.map}
#'
#' @param mrk the name of the marker to be inserted
#'
#' @param pos the name of the marker after which the new marker should be added.
#' One also can inform the numeric position (between markers) were the
#' new marker should be added. To insert a marker at the beginning of a
#' map, use \code{pos = 0}
#'
#' @param rf.matrix an object of class \code{mappoly.rf.matrix} containing the recombination
#' fractions and the number of homologues sharing alleles between pairwise
#' markers on \code{input.map}. It is important that \code{shared.alleles = TRUE}
#' in function \code{\link[mappoly]{rf_list_to_matrix}} when computing \code{rf.matrix}.
#'
#' @param genoprob an object of class \code{mappoly.genoprob} containing the genotype probabilities
#' for all marker positions on \code{input.map}
#'
#' @param phase.config which phase configuration should be used. "best" (default)
#' will choose the maximum likelihood configuration
#'
#' @param tol the desired accuracy (default = 10e-04)
#'
#' @param r.test for internal use only
#'
#' @param verbose if \code{TRUE} (default), the current progress is shown; if
#' \code{FALSE}, no output is produced
#'
#' @return A list of class \code{mappoly.map} with two elements:
#'
#' i) info: a list containing information about the map, regardless of the linkage phase configuration:
#' \item{ploidy}{the ploidy level}
#' \item{n.mrk}{number of markers}
#' \item{seq.num}{a vector containing the (ordered) indices of markers in the map,
#' according to the input file}
#' \item{mrk.names}{the names of markers in the map}
#' \item{seq.dose.p1}{a vector containing the dosage in parent 1 for all markers in the map}
#' \item{seq.dose.p2}{a vector containing the dosage in parent 2 for all markers in the map}
#' \item{chrom}{a vector indicating the sequence (usually chromosome) each marker belongs
#' as informed in the input file. If not available,
#' \code{chrom = NULL}}
#' \item{genome.pos}{physical position (usually in megabase) of the markers into the sequence}
#' \item{seq.ref}{reference base used for each marker (i.e. A, T, C, G). If not available,
#' \code{seq.ref = NULL}}
#' \item{seq.alt}{alternative base used for each marker (i.e. A, T, C, G). If not available,
#' \code{seq.ref = NULL}}
#' \item{chisq.pval}{a vector containing p-values of the chi-squared test of Mendelian
#' segregation for all markers in the map}
#' \item{data.name}{name of the dataset of class \code{mappoly.data}}
#' \item{ph.thres}{the LOD threshold used to define the linkage phase configurations to test}
#'
#' ii) a list of maps with possible linkage phase configuration. Each map in the list is also a
#' list containing
#' \item{seq.num}{a vector containing the (ordered) indices of markers in the map,
#' according to the input file}
#' \item{seq.rf}{a vector of size (\code{n.mrk - 1}) containing a sequence of recombination
#' fraction between the adjacent markers in the map}
#' \item{seq.ph}{linkage phase configuration for all markers in both parents}
#' \item{loglike}{the hmm-based multipoint likelihood}
#'
#' @author Marcelo Mollinari, \email{mmollin@ncsu.edu}
#'
#' @examples
#' \donttest{
#' sub.map <- get_submap(maps.hexafake[[1]], 1:20, reestimate.rf = FALSE)
#' plot(sub.map, mrk.names = TRUE)
#' s <- make_seq_mappoly(hexafake, sub.map$info$mrk.names)
#' tpt <- est_pairwise_rf(s)
#' rf.matrix <- rf_list_to_matrix(input.twopt = tpt,
#' thresh.LOD.ph = 3,
#' thresh.LOD.rf = 3,
#' shared.alleles = TRUE)
#' ###### Removing marker "M_1" (first) #######
#' mrk.to.remove <- "M_1"
#' input.map <- drop_marker(sub.map, mrk.to.remove)
#' plot(input.map, mrk.names = TRUE)
#' ## Computing conditional probabilities using the resulting map
#' genoprob <- calc_genoprob(input.map)
#' res.add.M_1 <- add_marker(input.map = input.map,
#' mrk = "M_1",
#' pos = 0,
#' rf.matrix = rf.matrix,
#' genoprob = genoprob,
#' tol = 10e-4)
#' plot(res.add.M_1, mrk.names = TRUE)
#' best.phase <- res.add.M_1$maps[[1]]$seq.ph
#' names.id <- names(best.phase$P)
#' plot_compare_haplotypes(ploidy = 6,
#' hom.allele.p1 = best.phase$P[names.id],
#' hom.allele.q1 = best.phase$Q[names.id],
#' hom.allele.p2 = sub.map$maps[[1]]$seq.ph$P[names.id],
#' hom.allele.q2 = sub.map$maps[[1]]$seq.ph$Q[names.id])
#'
#' ###### Removing marker "M_10" (middle or last) #######
#' mrk.to.remove <- "M_10"
#' input.map <- drop_marker(sub.map, mrk.to.remove)
#' plot(input.map, mrk.names = TRUE)
#' # Computing conditional probabilities using the resulting map
#' genoprob <- calc_genoprob(input.map)
#' res.add.M_10 <- add_marker(input.map = input.map,
#' mrk = "M_10",
#' pos = "M_9",
#' rf.matrix = rf.matrix,
#' genoprob = genoprob,
#' tol = 10e-4)
#' plot(res.add.M_10, mrk.names = TRUE)
#' best.phase <- res.add.M_10$maps[[1]]$seq.ph
#' names.id <- names(best.phase$P)
#' plot_compare_haplotypes(ploidy = 6,
#' hom.allele.p1 = best.phase$P[names.id],
#' hom.allele.q1 = best.phase$Q[names.id],
#' hom.allele.p2 = sub.map$maps[[1]]$seq.ph$P[names.id],
#' hom.allele.q2 = sub.map$maps[[1]]$seq.ph$Q[names.id])
#' }
#' @export
add_marker <- function(input.map, mrk, pos, rf.matrix, genoprob = NULL,
phase.config = "best", tol = 10e-4, r.test = NULL,
verbose = TRUE){
## Checking class of arguments
if(!inherits(input.map, "mappoly.map")) {
stop(deparse(substitute(input.map)), " is not an object of class 'mappoly.map'")
}
if(!inherits(mrk, "character")) {
stop("Please, provide the marker name")
}
if(!inherits(rf.matrix, "mappoly.rf.matrix")) {
stop(deparse(substitute(rf.matrix)), " is not an object of class 'mappoly.rf.matrix'")
}
if(is.null(rf.matrix$ShP))
stop(deparse(substitute(rf.matrix)), " should contain the number of homologues sharing alleles.
Use 'shared.alleles = TRUE' in function 'rf_list_to_matrix'")
if(!all(c(input.map$info$mrk.names, mrk)%in%colnames(rf.matrix$rec.mat))){
stop(deparse(substitute(rf.matrix)), " does not contain all necessary information about 'input.map' and 'mrk'.")
}
## choosing the linkage phase configuration
LOD.conf <- get_LOD(input.map, sorted = FALSE)
if(phase.config == "best") {
i.lpc <- which.min(LOD.conf)
} else if (phase.config > length(LOD.conf)) {
stop("invalid linkage phase configuration")
} else i.lpc <- phase.config
## Checking genoprob
if(is.null(genoprob)){
if (verbose) message("Calculating genoprob.")
genoprob <- calc_genoprob(input.map, phase.config = i.lpc, verbose = FALSE)
}
if(!inherits(genoprob, "mappoly.genoprob")) {
stop("'", deparse(substitute(genoprob)), "' is not an object of class 'mappoly.genoprob'")
}
if(!identical(names(genoprob$map), input.map$info$mrk.names)){
warning("'", deparse(substitute(genoprob)), "' is inconsistent with 'input.map'.\n Recalculating genoprob.")
genoprob <- calc_genoprob(input.map, phase.config = i.lpc, verbose = FALSE)
}
## ploidy
ploidy <- input.map$info$ploidy
## Number of genotypes in the offspring
n.gen <- dim(genoprob$probs)[1]
## number of markers
n.mrk <- dim(genoprob$probs)[2]
## number of individuals
n.ind <- dim(genoprob$probs)[3]
## number of genotypic states
ngam <- choose(ploidy, ploidy/2)
## the threshold for visiting states: 1/n.gen
thresh.cut.path <- 1/n.gen
## Indexing position were the marker should be inserted
if(is.numeric(pos)){
if(!(pos >= 0 & pos <= (n.mrk+1))){
stop(deparse(substitute(pos)), " out of bounds.")
}
} else if(is.character(pos)){
if(!pos%in%input.map$info$mrk.names){
stop("The reference marker is not contained in the map.")
} else {
pos <- which(input.map$info$mrk.names%in%pos)
}
}
## Hash table: homolog combination --> states to visit in both parents
A <- as.matrix(expand.grid(c(1:ngam) - 1, c(1:ngam) - 1)[,2:1])
rownames(A) <- dimnames(genoprob$probs)[[1]]
## Indexing marker to be inserted
if(!mrk%in%colnames(rf.matrix$rec.mat)){
stop(deparse(substitute(mrk)), " is not in 'rf.matrix'")
}
dat <- get(input.map$info$data.name, pos = 1)
#mrk.id <- match(mrk, dat$mrk.names)
mrk.id <- mrk
## Adding marker: beginning of sequence
if(pos == 0){
## h: states to visit in both parents
## e: probability distribution (ignored in this version)
e.right <- h.right <- vector("list", n.ind)
for(i in 1:n.ind){
a.right <- genoprob$probs[,pos+1,i]
e.right[[i]] <- a.right[a.right > thresh.cut.path]
h.right[[i]] <- A[names(e.right[[i]]), , drop = FALSE]
}
if(is.null(r.test)){
r.test <- generate_all_link_phases_elim_equivalent_haplo(block1 = c(input.map$maps[[i.lpc]], mrk.names = list(input.map$info$mrk.names)),
block2 = mrk.id,
rf.matrix = rf.matrix,
ploidy = ploidy, max.inc = 0)
}
} else if(pos > 0 & pos < n.mrk){ ## Adding marker: middle positions
## h: states to visit in both parents
## e: probability distribution (ignored in this version)
e.left <- h.left <- e.right <- h.right <- vector("list", n.ind)
for(i in 1:n.ind){
a.left <- genoprob$probs[,pos,i]
a.right <- genoprob$probs[,pos+1,i]
e.left[[i]] <- a.left[a.left > thresh.cut.path]
h.left[[i]] <- A[names(e.left[[i]]), , drop = FALSE]
e.right[[i]] <- a.right[a.right > thresh.cut.path]
h.right[[i]] <- A[names(e.right[[i]]), , drop = FALSE]
}
## Info to the left
map.left <- get_submap(input.map = input.map,
mrk.pos = 1:pos,
phase.config = i.lpc,
reestimate.rf = FALSE,
verbose = FALSE)
r.left <- generate_all_link_phases_elim_equivalent_haplo(block1 = c(map.left$maps[[i.lpc]], mrk.names = list(map.left$info$mrk.names)),
block2 = mrk.id,
rf.matrix = rf.matrix,
ploidy = ploidy, max.inc = 0)
## Info to the right
map.right <- get_submap(input.map = input.map,
mrk.pos = (pos+1):input.map$info$n.mrk,
phase.config = i.lpc,
reestimate.rf = FALSE, verbose = FALSE)
r.right <- generate_all_link_phases_elim_equivalent_haplo(block1 = c(map.right$maps[[i.lpc]], mrk.names = list(map.right$info$mrk.names)),
block2 = mrk.id,
rf.matrix = rf.matrix,
ploidy = ploidy, max.inc = 0)
## Using both information sources, left and right
r.test <- unique(r.left, r.right)
} else if(pos == n.mrk){
## h: states to visit in both parents
## e: probability distribution (ignored in this version)
e.left <- h.left <- vector("list", n.ind)
for(i in 1:n.ind){
a.left <- genoprob$probs[,pos,i]
e.left[[i]] <- a.left[a.left > thresh.cut.path]
h.left[[i]] <- A[names(e.left[[i]]), , drop = FALSE]
}
r.test <- generate_all_link_phases_elim_equivalent_haplo(block1 = c(input.map$maps[[i.lpc]], mrk.names = list(input.map$info$mrk.names)),
block2 = mrk.id,
rf.matrix = rf.matrix,
ploidy = ploidy, max.inc = 0)
}
else stop("should not get here!")
## gathering maps to test and conditional probabilities
test.maps <- mrk.genoprobs <- vector("list", length(r.test))
for(i in 1:length(test.maps))
{
## This sub-map is just to create a framework
hap.temp <- get_submap(input.map,
c(1,1),
reestimate.rf = FALSE, verbose = FALSE)
hap.temp <- filter_map_at_hmm_thres(hap.temp, thres.hmm = 10e-10)
hap.temp$maps[[1]]$seq.num <- rep(match(mrk.id, dat$mrk.names), 2)
hap.temp$maps[[1]]$seq.ph <- list(P = c(r.test[[i]]$P, r.test[[i]]$P),
Q = c(r.test[[i]]$Q, r.test[[i]]$Q))
hap.temp$maps[[1]]$seq.rf <- 10e-6
hap.temp$info$mrk.names <- rep(mrk,2)
test.maps[[i]] <- hap.temp
## States to visit for inserted biallelic SNP
mrk.genoprobs[[i]] <- calc_genoprob(test.maps[[i]], verbose = FALSE)
}
## Inserted marker
## h: states to visit in both parents
## e: probability distribution
h <- e <- vector("list", length(r.test))
for(j in 1:length(r.test)){
etemp <- htemp <- vector("list", n.ind)
for(i in 1:n.ind){
a <- mrk.genoprobs[[j]]$probs[,1,i]
etemp[[i]] <- a[a > thresh.cut.path]
htemp[[i]] <- A[names(etemp[[i]]), , drop = FALSE]
}
h[[j]] <- htemp
e[[j]] <- etemp
}
configs <- vector("list", length(test.maps))
names(configs) <- paste0("Phase_config.", 1:length(test.maps))
res <- matrix(NA, nrow = length(h), ncol = 3, dimnames = list(names(configs), c("log_like", "rf1", "rf2")))
## Computing three-point multiallelic map using HMM
for(i in 1:length(h)){
if (verbose) cat(".")
if(pos == 0){
h.test <- c(h[i], list(h.right))
names(h.test) <- c("hap1", "hap2")
e.test <- c(e[i], list(e.right))
restemp <- est_haplo_hmm(ploidy = ploidy,
n.mrk = length(h.test),
n.ind = n.ind,
haplo = h.test,
#emit = e.test,
rf_vec = rep(0.01, length(h.test)-1),
verbose = FALSE,
use_H0 = FALSE,
tol = tol)
temp <- unlist(restemp)
res[i,1:length(temp)] <- temp
P <- c(test.maps[[i]]$maps[[1]]$seq.ph$P[1],
input.map$maps[[1]]$seq.ph$P)
Q <- c(test.maps[[i]]$maps[[1]]$seq.ph$Q[1],
input.map$maps[[1]]$seq.ph$Q)
names(P) <- names(Q) <- c(test.maps[[i]]$maps[[1]]$seq.num[1],
input.map$maps[[1]]$seq.num)
configs[[i]] <- list(P = P, Q = Q)
}
else if(pos > 0 & pos < n.mrk){
h.test <- c(list(h.left), h[i], list(h.right))
names(h.test) <- c("hap1", "hap2", "hap3")
e.test <- c(list(e.left), e[i], list(e.right))
restemp <-est_haplo_hmm(ploidy = ploidy,
n.mrk = length(h.test),
n.ind = n.ind,
haplo = h.test,
#emit = e.test,
rf_vec = rep(0.01, length(h.test)-1),
verbose = FALSE,
use_H0 = FALSE,
tol = tol)
temp <- unlist(restemp)
res[i,1:length(temp)] <- temp
P <- c(map.left$maps[[1]]$seq.ph$P,
test.maps[[i]]$maps[[1]]$seq.ph$P[1],
map.right$maps[[1]]$seq.ph$P)
Q <- c(map.left$maps[[1]]$seq.ph$Q,
test.maps[[i]]$maps[[1]]$seq.ph$Q[1],
map.right$maps[[1]]$seq.ph$Q)
names(P) <- names(Q) <- c(map.left$maps[[1]]$seq.num,
test.maps[[i]]$maps[[1]]$seq.num[1],
map.right$maps[[1]]$seq.num)
configs[[i]] <- list(P = P, Q = Q)
}
else if(pos == n.mrk){
h.test <- c(list(h.left), h[i])
names(h.test) <- c("hap1", "hap2")
e.test <- c(list(e.left), e[i])
restemp <- est_haplo_hmm(ploidy = ploidy,
n.mrk = length(h.test),
n.ind = n.ind,
haplo = h.test,
#emit = e.test,
rf_vec = rep(0.01, length(h.test)-1),
verbose = FALSE,
use_H0 = FALSE,
tol = tol)
temp <- unlist(restemp)
res[i,1:length(temp)] <- temp
P <- c(input.map$maps[[1]]$seq.ph$P,
test.maps[[i]]$maps[[1]]$seq.ph$P[1])
Q <- c(input.map$maps[[1]]$seq.ph$Q,
test.maps[[i]]$maps[[1]]$seq.ph$Q[1])
names(P) <- names(Q) <- c(input.map$maps[[1]]$seq.num,
test.maps[[i]]$maps[[1]]$seq.num[1])
configs[[i]] <- list(P = P, Q = Q)
}
}
if (verbose) cat("\n")
res <- res[order(res[,"log_like"], decreasing = TRUE),,drop = FALSE]
## Updating map
output.map <- input.map
seq.num <- as.numeric(names(configs[[1]]$P))
output.map$info$seq.num <- seq.num
output.map$info$mrk.names <- dat$mrk.names[seq.num]
output.map$info$n.mrk <- length(output.map$info$mrk.names)
output.map$maps <- vector("list", nrow(res))
for(i in 1:nrow(res))
{
## Updating recombination fractions (approximated)
if(pos == 0){
seq.rf <- as.numeric(c(res[i, "rf1"], input.map$maps[[i.lpc]]$seq.rf))
} else if(pos > 0 & pos < n.mrk){
seq.rf <- as.numeric(c(head(input.map$maps[[i.lpc]]$seq.rf, n = pos - 1),
res[i, c("rf1", "rf2")],
tail(input.map$maps[[i.lpc]]$seq.rf, n = input.map$info$n.mrk - pos - 1)))
} else if(pos == n.mrk){
seq.rf <- as.numeric(c(input.map$maps[[i.lpc]]$seq.rf, res[i, "rf1"]))
}
output.map$maps[[i]] <- list(seq.num = seq.num,
seq.rf = seq.rf,
seq.ph = configs[[rownames(res)[i]]],
loglike = res[i, "log_like"])
}
output.map$info$seq.dose.p1 <- dat$dosage.p1[output.map$info$mrk.names]
output.map$info$seq.dose.p2 <- dat$dosage.p2[output.map$info$mrk.names]
output.map$info$chrom <- dat$chrom[output.map$info$mrk.names]
output.map$info$genome.pos <- dat$genome.pos[output.map$info$mrk.names]
output.map$info$seq.ref <- dat$seq.ref[output.map$info$mrk.names]
output.map$info$seq.alt <- dat$seq.alt[output.map$info$mrk.names]
output.map$info$chisq.pval <- dat$chisq.pval[output.map$info$mrk.names]
return(output.map)
}
#' Data sanity check
#'
#' Checks the consistency of a dataset
#'
#' @param x an object of class \code{mappoly.data}
#'
#' @return if consistent, returns 0. If not consistent, returns a
#' vector with a number of tests, where \code{TRUE} indicates
#' a failed test.
#'
#' @examples
#' check_data_sanity(tetra.solcap)
#' @author Marcelo Mollinari, \email{mmollin@ncsu.edu}
#'
#' @references
#' Mollinari, M., and Garcia, A. A. F. (2019) Linkage
#' analysis and haplotype phasing in experimental autopolyploid
#' populations with high ploidy level using hidden Markov
#' models, _G3: Genes, Genomes, Genetics_.
#' \doi{10.1534/g3.119.400378}
#'
#' @export check_data_sanity
check_data_sanity <- function(x){
if(exists('geno', where = x)){
check_data_dist_sanity(x)
} else if (exists('geno.dose', where = x)){
check_data_dose_sanity(x)
} else
stop("Inconsistent genotypic information.")
}
#' Checks the consistency of dataset (dosage)
#'
#' @param void internal function to be documented
#' @keywords internal
check_data_dose_sanity <- function(x){
test <- logical(24L)
names(test) <- 1:24
# ploidy
test[1] <- x$ploidy%%2 != 0 #is ploidy even?
test[2] <- any(sapply(x$dosage.p1, function(y) max(y) > x$ploidy | min(y) < 0)) #are dosages in P higher than ploidy?
test[3] <- any(sapply(x$dosage.p2, function(y) max(y) > x$ploidy | min(y) < 0)) #are dosages in Q higher than ploidy?
test[4] <- max(x$geno.dose) > x$ploidy + 1 #is there any dose in offspring higher than ploidy?
test[5] <- min(x$geno.dose) < 0 #is there any negative dose in offspring?
# number of individuals
test[6] <- x$n.ind < 0 #is the number of individuals greater than zero?
test[7] <- length(x$ind.names) != x$n.ind #is the number of individual names equal to the number of individuals?
test[8] <- ncol(x$geno.dose) != x$n.ind #is the number of columns in the dosage matrix equal to the number of individuals?
# number of markers
test[9] <- x$n.mrk < 0 #is the number of markers greater than zero?
test[10] <- length(x$mrk.names) != x$n.mrk #is the number of marker names equal to the number of markers?
test[11] <- length(x$dosage.p1) != x$n.mrk #is the number of marker dosages in P equal to the number of markers?
test[12] <- length(x$dosage.p2) != x$n.mrk #is the number of marker dosages in Q equal to the number of markers?
if(length(x$chrom) > 0)
test[13] <- length(x$chrom) != x$n.mrk #is the number of sequences equal to the number of markers?
if(length(x$genome.pos) > 0)
test[14] <- length(x$genome.pos) != x$n.mrk #is the number of sequence positions equal to the number of markers?
test[15] <- nrow(x$geno.dose) != x$n.mrk #is the number of rows in the dosage matrix equal to the number of markers?
if(length(x$chisq.pval) > 0)
test[16] <- length(x$chisq.pval) != x$n.mrk #is the number of chi-square tests equal to the number of markers?
# individual names in the dosage dataset
test[17] <- !is.character(x$ind.names) # are individual's names characters
test[18] <- !identical(colnames(x$geno.dose), x$ind.names) # are column names in dosage matrix identical to individual names?
# markers names in the dosage dataset
test[19] <- !is.character(x$mrk.names)# are marker's names characters
test[20] <- !identical(rownames(x$geno.dose), x$mrk.names)# are row names in dosage matrix identical to marker names?
# dosage in both parents
test[21] <- !is.integer(x$dosage.p1) # are dosages in P numeric
test[22] <- !is.integer(x$dosage.p2) # are dosages in Q numeric
test[23] <- !identical(names(x$dosage.p1), x$mrk.names) # are names in P's dosage vector identical to marker names?
test[24] <- !identical(names(x$dosage.p2), x$mrk.names) # are names in Q's dosage vector identical to marker names?
if(any(test))
return(test)
else
return(0)
}
#' Checks the consistency of dataset (probability distribution)
#'
#' @param void internal function to be documented
#' @keywords internal
check_data_dist_sanity <- function(x){
test <- logical(29L)
names(test) <- 1:29
# ploidy
test[1] <- x$ploidy%%2 != 0 #is ploidy even?
test[2] <- any(sapply(x$dosage.p1, function(y) max(y) > x$ploidy | min(y) < 0)) #are dosages in P higher than ploidy?
test[3] <- any(sapply(x$dosage.p2, function(y) max(y) > x$ploidy | min(y) < 0)) #are dosages in Q higher than ploidy?
test[4] <- ncol(x$geno) > x$ploidy + 3 #is the number of columns in the probability data frame correct? (ploidy + 3)
test[5] <- max(x$geno.dose) > x$ploidy + 1 #is there any dose in offspring higher than ploidy?
test[6] <- min(x$geno.dose) < 0 #is there any negative dose in offspring?
# number of individuals
test[7] <- x$n.ind < 0 #is the number of individuals greater than zero?
test[8] <- length(x$ind.names) != x$n.ind #is the number of individual names equal to the number of individuals?
test[9] <- length(unique(x$geno$ind)) != x$n.ind #is the number of individuals in the probability data frame equal to the number of individuals?
test[10] <- ncol(x$geno.dose) != x$n.ind #is the number of columns in the dosage matrix equal to the number of individuals?
# number of markers
test[11] <- x$n.mrk < 0 #is the number of markers greater than zero?
test[12] <- length(x$mrk.names) != x$n.mrk #is the number of marker names equal to the number of markers?
test[13] <- length(x$dosage.p1) != x$n.mrk #is the number of marker dosages in P equal to the number of markers?
test[14] <- length(x$dosage.p2) != x$n.mrk #is the number of marker dosages in Q equal to the number of markers?
if(length(x$chrom) > 0)
test[15] <- length(x$chrom) != x$n.mrk #is the number of sequences equal to the number of markers?
if(length(x$genome.pos) > 0)
test[16] <- length(x$genome.pos) != x$n.mrk #is the number of sequence positions equal to the number of markers?
test[17] <- nrow(x$geno)/x$n.ind != x$n.mrk#is the number of rows in the probability data frame divided by n.ind equal to the number of markers?
test[18] <- nrow(x$geno.dose) != x$n.mrk#is the number of rows in the dosage matrix equal to the number of markers?
if(length(x$chisq.pval) > 0)
test[19] <- length(x$chisq.pval) != x$n.mrk#is the number of chi-square tests equal to the number of markers?
# individual names in the dosage and probability dataset
test[20] <- !is.character(x$ind.names) # are individual's names characters
test[21] <- !identical(x$geno$ind, rep(x$ind.names, each = x$n.mrk)) #are individual's names in the probability data frame properly
#arranged and consistent with the informed individual's names
test[22] <- !identical(colnames(x$geno.dose), x$ind.names)# are column names in dosage matrix identical to individual names?
# marker names in the dosage and probability dataset
test[23] <- !is.character(x$mrk.names)# are marker's names characters
test[24] <- !identical(x$geno$mrk, rep(x$mrk.names, x$n.ind))#are marker names in the probability data frame properly
#arranged and consistent with the informed marker names
test[25] <- !identical(rownames(x$geno.dose), x$mrk.names)# are row names in dosage matrix identical to marker names?
# dosage in both parents
test[26] <- !is.integer(x$dosage.p1)# are dosages in P numeric
test[27] <- !is.integer(x$dosage.p2)# are dosages in Q numeric
test[28] <- !identical(names(x$dosage.p1), x$mrk.names)# are names in P's dosage vector identical to marker names?
test[29] <- !identical(names(x$dosage.p2), x$mrk.names)# are names in Q's dosage vector identical to marker names?
if(any(test))
return(test)
else
return(0)
}
#' Merge datasets
#'
#' This function merges two datasets of class \code{mappoly.data}. This can be useful
#' when individuals of a population were genotyped using two or more techniques
#' and have datasets in different files or formats. Please notice that the datasets
#' should contain the same number of individuals and they must be represented identically
#' in both datasets (e.g. \code{Ind_1} in both datasets, not \code{Ind_1}
#' in one dataset and \code{ind_1} or \code{Ind.1} in the other).
#'
#' @param dat.1 the first dataset of class \code{mappoly.data} to be merged
#'
#' @param dat.2 the second dataset of class \code{mappoly.data} to be merged (default = NULL);
#' if \code{dat.2 = NULL}, the function returns \code{dat.1} only
#'
#' @return An object of class \code{mappoly.data} which contains all markers
#' from both datasets. It will be a list with the following components:
#' \item{ploidy}{ploidy level}
#' \item{n.ind}{number individuals}
#' \item{n.mrk}{total number of markers}
#' \item{ind.names}{the names of the individuals}
#' \item{mrk.names}{the names of the markers}
#' \item{dosage.p1}{a vector containing the dosage in
#' parent P for all \code{n.mrk} markers}
#' \item{dosage.p2}{a vector containing the dosage in
#' parent Q for all \code{n.mrk} markers}
#' \item{chrom}{a vector indicating which sequence each marker
#' belongs. Zero indicates that the marker was not assigned to any
#' sequence}
#' \item{genome.pos}{Physical position of the markers into the
#' sequence}
#' \item{seq.ref}{if one or both datasets originated from read_vcf, it keeps
#' reference alleles from sequencing platform, otherwise is NULL}
#' \item{seq.alt}{if one or both datasets originated from read_vcf, it keeps
#' alternative alleles from sequencing platform, otherwise is NULL}
#' \item{all.mrk.depth}{if one or both datasets originated from read_vcf, it keeps
#' marker read depths from sequencing, otherwise is NULL}
#' \item{prob.thres}{(unused field)}
#' \item{geno.dose}{a matrix containing the dosage for each markers (rows)
#' for each individual (columns). Missing data are represented by
#' \code{ploidy_level + 1}}
#' \item{geno}{if both datasets contain genotype distribution information,
#' the final object will contain 'geno'. This is set to NULL otherwise}
#' \item{nphen}{(0)}
#' \item{phen}{(NULL)}
#' \item{chisq.pval}{a vector containing p-values related to the chi-squared
#' test of Mendelian segregation performed for all markers in both datasets}
#' \item{kept}{if elim.redundant = TRUE when reading any dataset, holds all non-redundant markers}
#' \item{elim.correspondence}{if elim.redundant = TRUE when reading any dataset,
#' holds all non-redundant markers and its equivalence to the redundant ones}
#'
#' @author Gabriel Gesteira, \email{gdesiqu@ncsu.edu}
#' @examples
#' \donttest{
#' ## Loading a subset of SNPs from chromosomes 3 and 12 of sweetpotato dataset
#' ## (SNPs anchored to Ipomoea trifida genome)
#' dat <- NULL
#' for(i in c(3, 12)){
#' cat("Loading chromosome", i, "...\n")
#' tempfl <- tempfile(pattern = paste0("ch", i), fileext = ".vcf.gz")
#' x <- "https://github.com/mmollina/MAPpoly_vignettes/raw/master/data/sweet_sample_ch"
#' address <- paste0(x, i, ".vcf.gz")
#' download.file(url = address, destfile = tempfl)
#' dattemp <- read_vcf(file = tempfl, parent.1 = "PARENT1", parent.2 = "PARENT2",
#' ploidy = 6, verbose = FALSE)
#' dat <- merge_datasets(dat, dattemp)
#' cat("\n")
#' }
#' dat
#' plot(dat)
#'}
#'
#' @references
#' Mollinari, M., and Garcia, A. A. F. (2019) Linkage
#' analysis and haplotype phasing in experimental autopolyploid
#' populations with high ploidy level using hidden Markov
#' models, _G3: Genes, Genomes, Genetics_.
#' \doi{10.1534/g3.119.400378}
#'
#' @export merge_datasets
#' @importFrom dplyr bind_rows arrange
merge_datasets = function(dat.1 = NULL, dat.2 = NULL){
## Check objects class
if (is.null(dat.1)){
if (is.null(dat.2)) return(dat.1)
if (!inherits(dat.2, "mappoly.data")) {
stop(deparse(substitute(dat.2)), " is not an object of class 'mappoly.data'")
} else {
return(dat.2)
}
}
if (!inherits(dat.1, "mappoly.data")) {
stop(deparse(substitute(dat.1)), " is not an object of class 'mappoly.data'")
}
if (is.null(dat.2)) return(dat.1)
if (!inherits(dat.2, "mappoly.data")) {
stop(deparse(substitute(dat.2)), " is not an object of class 'mappoly.data'")
}
## Check ploidy
if (dat.1$ploidy != dat.2$ploidy){
stop("The ploidy levels in the datasets do not match. Please check your files and try again.")
}
## Check individuals
if (dat.1$n.ind != dat.2$n.ind){
stop("The datasets have different number of individuals. Please check your files and try again.")
}
## Check individual names
if (!all(dat.1$ind.names %in% dat.2$ind.names)){
nmi.1 = dat.1$ind.names[!(dat.1$ind.names %in% dat.2$ind.names)]
nmi.2 = dat.2$ind.names[!(dat.2$ind.names %in% dat.1$ind.names)]
cat("Some individuals' names don't match:\n")
cat("Dataset 1\tDataset 2\n")
for (i in 1:length(nmi.1)) cat(nmi.1[i], "\t\t", nmi.2[i], "\n")
stop("Your datasets have the same number of individuals, but some of them are not the same. Please check the list above, fix your files and try again.")
}
## Checking (and fixing) individuals order in geno.dose
if (!all(colnames(dat.1$geno.dose) == colnames(dat.2$geno.dose))){
dat.2$geno.dose = dat.2$geno.dose[,colnames(dat.1$geno.dose)]
}
## Merging all items
dat.1$geno.dose = rbind(dat.1$geno.dose, dat.2$geno.dose)
dat.1$dosage.p1 = c(dat.1$dosage.p1, dat.2$dosage.p1)
dat.1$dosage.p2 = c(dat.1$dosage.p2, dat.2$dosage.p2)
##dat.1$mrk.names = c(dat.1$mrk.names, dat.2$mrk.names) ## Fixing possible name incompatibilities
dat.1$mrk.names = rownames(dat.1$geno.dose) ## Fixing possible name incompatibilities
dat.1$n.mrk = dat.1$n.mrk + dat.2$n.mrk
dat.1$chrom = c(dat.1$chrom, dat.2$chrom)
dat.1$genome.pos = c(dat.1$genome.pos, dat.2$genome.pos)
# VCF info
## seq.ref and seq.alt
if (!is.null(dat.1$seq.ref) && !is.null(dat.2$seq.ref)){
dat.1$seq.ref = c(dat.1$seq.ref,dat.2$seq.ref)
dat.1$seq.alt = c(dat.1$seq.alt,dat.2$seq.alt)
} else if (is.null(dat.1$seq.ref) && is.null(dat.2$seq.ref)){
dat.1$seq.ref = dat.1$seq.alt = NULL
} else if (!is.null(dat.1$seq.ref) && is.null(dat.2$seq.ref)){
dat.1$seq.ref = c(dat.1$seq.ref,rep(NA,length(dat.2$n.mrk)))
dat.1$seq.alt = c(dat.1$seq.alt,rep(NA,length(dat.2$n.mrk)))
} else if (is.null(dat.1$seq.ref) && !is.null(dat.2$seq.ref)){
dat.1$seq.ref = c(rep(NA,length(dat.1$n.mrk)),dat.2$seq.ref)
dat.1$seq.alt = c(rep(NA,length(dat.1$n.mrk)),dat.2$seq.alt)
}
## all.mrk.depth
if (!is.null(dat.1$all.mrk.depth) && !is.null(dat.2$all.mrk.depth)){
dat.1$all.mrk.depth = c(dat.1$all.mrk.depth,dat.2$all.mrk.depth)
} else if (is.null(dat.1$all.mrk.depth) && is.null(dat.2$all.mrk.depth)){
dat.1$all.mrk.depth = NULL
} else if (!is.null(dat.1$all.mrk.depth) && is.null(dat.2$all.mrk.depth)){
dat.1$all.mrk.depth = c(dat.1$all.mrk.depth,rep(NA,length(dat.2$n.mrk)))
} else if (is.null(dat.1$all.mrk.depth) && !is.null(dat.2$all.mrk.depth)){
dat.1$all.mrk.depth = c(rep(NA,length(dat.1$n.mrk)),dat.2$all.mrk.depth)
}
# Chisq info
if (!is.null(dat.1$chisq.pval) && !is.null(dat.2$chisq.pval)){
dat.1$chisq.pval = c(dat.1$chisq.pval,dat.2$chisq.pval)
} else if (is.null(dat.1$chisq.pval) && is.null(dat.2$chisq.pval)){
dat.1$chisq.pval = NULL
} else if (!is.null(dat.1$chisq.pval) && is.null(dat.2$chisq.pval)){
dat.1$chisq.pval = c(dat.1$chisq.pval,rep(NA,length(dat.2$n.mrk)))
} else if (is.null(dat.1$chisq.pval) && !is.null(dat.2$chisq.pval)){
dat.1$chisq.pval = c(rep(NA,length(dat.1$n.mrk)),dat.2$chisq.pval)
}
# Redundant info
## Kept info
if (!is.null(dat.1$kept) && !is.null(dat.2$kept)){
dat.1$kept = c(dat.1$kept,dat.2$kept)
} else if (is.null(dat.1$kept) && is.null(dat.2$kept)){
dat.1$kept = NULL
} else if (!is.null(dat.1$kept) && is.null(dat.2$kept)){
dat.1$kept = c(dat.1$kept, dat.2$mrk.names)
} else if (is.null(dat.1$kept) && !is.null(dat.2$kept)){
dat.1$kept = c(dat.1$mrk.names,dat.2$kept)
}
## elim.correspondence info
if (!is.null(dat.1$elim.correspondence) && !is.null(dat.2$elim.correspondence)){
dat.1$elim.correspondence = rbind(dat.1$elim.correspondence,dat.2$elim.correspondence)
} else if (is.null(dat.1$elim.correspondence) && is.null(dat.2$elim.correspondence)){
dat.1$elim.correspondence = NULL
} else if (!is.null(dat.1$elim.correspondence) && is.null(dat.2$elim.correspondence)){
dat.1$elim.correspondence = dat.1$elim.correspondence
} else if (is.null(dat.1$elim.correspondence) && !is.null(dat.2$elim.correspondence)){
dat.1$elim.correspondence = dat.2$elim.correspondence
}
## geno dist info (just keep if both datasets contain this information)
if (exists('geno', where = dat.1) && exists('geno', where = dat.2)){
#dat.1$geno = rbind(dat.1$geno,dat.2$geno)
ind <- NULL
dat.1$geno <- dplyr::bind_rows(dat.1$geno, dat.2$geno) %>% arrange(ind)
} else dat.1$geno = NULL
if(!is.null(dat.1$chisq.pval) | !any(is.na(dat.1$chisq.pval)))
dat.1$chisq.pval <- dat.1$chisq.pval[dat.1$mrk.names]
## Fixing possible issues with marker names
names(dat.1$dosage.p1) = names(dat.1$dosage.p2) = names(dat.1$genome.pos) = names(dat.1$chrom) = names(dat.1$chisq.pval) = dat.1$mrk.names
return(dat.1)
}
#' Summary maps
#'
#' This function generates a brief summary table of a list of \code{mappoly.map} objects
#' @param map.list a list of objects of class \code{mappoly.map}
#' @param verbose if \code{TRUE} (default), the current progress is shown; if
#' \code{FALSE}, no output is produced
#' @return a data frame containing a brief summary of all maps contained in \code{map.list}
#' @examples
#' tetra.sum <- summary_maps(solcap.err.map)
#' tetra.sum
#' @author Gabriel Gesteira, \email{gdesiqu@ncsu.edu}
#' @export summary_maps
summary_maps = function(map.list, verbose = TRUE){
## Check data
if (any(!sapply(map.list, inherits, "mappoly.map")))
stop(deparse(substitute(map.list)),
" is not a list containing 'mappoly.map' objects.")
md <- unlist(lapply(map.list, function(x) sum(get_tab_mrks(x)) - sum(get_tab_mrks(x)[rbind(c(2,2),c(x$info$ploidy,x$info$ploidy))]) - sum(get_tab_mrks(x)[rbind(c(1,2),c(2,1),c(x$info$ploidy,(x$info$ploidy+1)),c((x$info$ploidy+1),x$info$ploidy))])))
if(map.list[[1]]$info$ploidy == 2)
md[] <- 0
results = data.frame("LG" = as.character(seq(1,length(map.list),1)),
"Genomic sequence" = as.character(unlist(lapply(map.list, function(x) paste(unique(x$info$chrom), collapse = "-")))),
"Map length (cM)" = unlist(lapply(map.list, function(x) round(sum(c(0, imf_h(x$maps[[1]]$seq.rf))), 2))),
"Markers/cM" = round(unlist(lapply(map.list, function(x) x$info$n.mrk/(round(sum(c(0, imf_h(x$maps[[1]]$seq.rf))), 2)))),2),
"Simplex" = unlist(lapply(map.list, function(x) sum(get_tab_mrks(x)[rbind(c(1,2),c(2,1),c(x$info$ploidy,(x$info$ploidy+1)),c((x$info$ploidy+1),x$info$ploidy))]))),
"Double-simplex" = unlist(lapply(map.list, function(x) sum(get_tab_mrks(x)[rbind(c(2,2),c(x$info$ploidy,x$info$ploidy))]))),
"Multiplex" = md,
"Total" = unlist(lapply(map.list, function(x) x$info$n.mrk)),
"Max gap" = unlist(lapply(map.list, function(x) round(imf_h(max(x$maps[[1]]$seq.rf)),2))),
check.names = FALSE, stringsAsFactors = F)
results = rbind(results, c('Total', NA, sum(as.numeric(results$`Map length (cM)`)), round(mean(as.numeric(results$`Markers/cM`)),2), sum(as.numeric(results$Simplex)), sum(as.numeric(results$`Double-simplex`)), sum(as.numeric(results$Multiplex)), sum(as.numeric(results$Total)), round(mean(as.numeric(results$`Max gap`)),2)))
if (verbose){
all.mrks = unlist(lapply(map.list, function(x) return(x$info$mrk.names)))
if (!any(get(map.list[[1]]$info$data.name, pos = 1)$elim.correspondence$elim %in% all.mrks))
message("\nYour dataset contains removed (redundant) markers. Once finished the maps, remember to add them back with the function 'update_map'.\n")
}
return(results)
}
#' Get table of dosage combinations
#'
#' Internal function
#' @param x an object of class \code{mappoly.map}
#' @author Gabriel Gesteira, \email{gdesiqu@ncsu.edu}
get_tab_mrks = function(x){
tab = table(get(x$info$data.name, pos = 1)$dosage.p1[which(get(x$info$data.name, pos = 1)$mrk.names %in% x$info$mrk.names)], get(x$info$data.name, pos = 1)$dosage.p2[which(get(x$info$data.name, pos = 1)$mrk.names %in% x$info$mrk.names)])
doses = as.character(seq(0,x$info$ploidy,1))
# checking dimensions
if (!all(doses %in% colnames(tab))){
newmat = matrix(NA, nrow(tab), length(doses))
newmat[,which(doses %in% colnames(tab))] = tab[,which(colnames(tab) %in% doses)]
newmat[,which(!doses %in% colnames(tab))] = 0
rownames(newmat) = rownames(tab)
colnames(newmat) = doses
tab = newmat
}
if (!all(doses %in% rownames(tab))){
newmat = matrix(NA, length(doses), ncol(tab))
newmat[which(doses %in% rownames(tab)),] = tab[which(rownames(tab) %in% doses),]
newmat[which(!doses %in% rownames(tab)),] = 0
colnames(newmat) = colnames(tab)
rownames(newmat) = doses
tab = newmat
}
return(tab)
}
#' Update map
#'
#' This function takes an object of class \code{mappoly.map} and checks for
#' removed redundant markers in the original dataset. Once redundant markers
#' are found, they are re-added to the map in their respective equivalent positions
#' and another HMM round is performed.
#'
#' @param input.maps a single map or a list of maps of class \code{mappoly.map}
#' @param verbose if TRUE (default), shows information about each update process
#' @return an updated map (or list of maps) of class \code{mappoly.map}, containing the original map(s) plus redundant markers
#' @author Gabriel Gesteira, \email{gdesiqu@ncsu.edu}
#' @examples
#' orig.map <- solcap.err.map
#' up.map <- lapply(solcap.err.map, update_map)
#' summary_maps(orig.map)
#' summary_maps(up.map)
#' @export update_map
#'
update_map = function(input.maps, verbose = TRUE){
## Checking object
if (inherits(input.maps, "mappoly.map"))
input.maps = list(input.maps)
if(!inherits(input.maps, "list"))
stop(deparse(substitute(input.maps)),
" is not an object of class 'mappoly.map' neither a list containing 'mappoly.map' objects.")
if (any(!sapply(input.maps, inherits, "mappoly.map")))
stop(deparse(substitute(input.maps)),
" is not an object of class 'mappoly.map' neither a list containing 'mappoly.map' objects.")
## Checking the existence of redundant markers
if (is.null(get(input.maps[[1]]$info$data.name, pos = 1)$elim.correspondence))
stop('Your dataset does not contain redundant markers. Please check it and try again.')
## Creating list to handle results
results = list()
for (i in 1:length(input.maps)){
if (verbose) cat("Updating map", i , "\n")
input.map = input.maps[[i]]
## Checking if redundant markers belong to the informed map
map.kept.mrks = which(as.character(get(input.map$info$data.name, pos = 1)$elim.correspondence$kept) %in% input.map$info$mrk.names)
if (is.null(map.kept.mrks)){
if (verbose) cat("There is no redundant marker on map ",i,". Skipping it.\n")
next
}
corresp = get(input.map$info$data.name, pos = 1)$elim.correspondence[which(as.character(get(input.map$info$data.name, pos = 1)$elim.correspondence$kept) %in% input.map$info$mrk.names),]
## Check if redundant markers were not already added to the map
if (any(corresp$elim %in% input.map$info$mrk.names)) {
if (verbose) cat("Some redundant markers were already added to the map ", i,". These markers will be skipped.\n")
corresp = corresp[!(corresp$elim %in% input.map$info$mrk.names),]
}
## Updating number of markers
input.map$info$n.mrk = input.map$info$n.mrk + nrow(corresp)
## Adding markers to the sequence
while (nrow(corresp) > 0){
pos.kep = match(as.character(corresp$kept), get(input.map$info$data.name, pos = 1)$mrk.names)
input.map$info$seq.num = append(input.map$info$seq.num, NA, after = which(input.map$info$seq.num == pos.kep[1]))
input.map$info$chisq.pval = append(input.map$info$chisq.pval, input.map$info$chisq.pval[which(input.map$info$seq.num == pos.kep[1])], after = which(input.map$info$seq.num == pos.kep[1]))
input.map$info$seq.dose.p1 = append(input.map$info$seq.dose.p1, input.map$info$seq.dose.p1[which(input.map$info$seq.num == pos.kep[1])], after = which(input.map$info$seq.num == pos.kep[1]))
input.map$info$seq.dose.p2 = append(input.map$info$seq.dose.p2, input.map$info$seq.dose.p2[which(input.map$info$seq.num == pos.kep[1])], after = which(input.map$info$seq.num == pos.kep[1]))
input.map$info$chrom = as.numeric(append(input.map$info$chrom, as.character(corresp$chrom[1]), after = which(input.map$info$seq.num == pos.kep[1])))
input.map$info$genome.pos = as.numeric(append(input.map$info$genome.pos, as.character(corresp$genome.pos[1]), after = which(input.map$info$seq.num == pos.kep[1])))
input.map$info$mrk.names = append(input.map$info$mrk.names, as.character(corresp$elim[1]), after = which(input.map$info$mrk.names == as.character(corresp$kept[1])))
if (!is.null(input.map$info$seq.ref) && !is.null(input.map$info$seq.alt)){
input.map$info$seq.ref = append(input.map$info$seq.ref, as.character(corresp$seq.ref[1]), after = which(input.map$info$seq.num == pos.kep[1]))
input.map$info$seq.alt = append(input.map$info$seq.alt, as.character(corresp$seq.alt[1]), after = which(input.map$info$seq.num == pos.kep[1]))
}
for (j in 1:length(input.map$maps)){
input.map$maps[[j]]$seq.rf = append(input.map$maps[[j]]$seq.rf, 0.000001, after = which(input.map$info$seq.num == pos.kep[1]))
input.map$maps[[j]]$seq.ph$P = append(input.map$maps[[j]]$seq.ph$P, input.map$maps[[j]]$seq.ph$P[paste0(pos.kep[1])], after = which(input.map$info$seq.num == pos.kep[1]))
input.map$maps[[j]]$seq.ph$Q = append(input.map$maps[[j]]$seq.ph$Q, input.map$maps[[j]]$seq.ph$Q[paste0(pos.kep[1])], after = which(input.map$info$seq.num == pos.kep[1]))
}
corresp = corresp[-1,]
}
## Renaming objects and updating map information
names(input.map$info$chrom) = names(input.map$info$genome.pos) = names(input.map$info$chisq.pval) = names(input.map$info$seq.dose.p1) = names(input.map$info$seq.dose.p2) = names(input.map$info$seq.num) = input.map$info$mrk.names
for (j in 1:length(input.map$maps)){
names(input.map$maps[[j]]$seq.ph$P) = names(input.map$maps[[j]]$seq.ph$Q) = as.character(input.map$info$seq.num)
input.map$maps[[j]]$seq.num = input.map$info$seq.num
}
if (!is.null(input.map$info$seq.ref) && !is.null(input.map$info$seq.alt))
names(input.map$info$seq.ref) = names(input.map$info$seq.alt) = input.map$info$mrk.names
results[[i]] = input.map
}
if (length(results) == 1) return(results[[1]])
else return(results)
}
#' Random sampling of dataset
#' @param x an object of class \code{mappoly.data}
#' @param n number of individuals or markers to be sampled
#' @param percentage if \code{n == NULL}, the percentage of individuals or markers to be sampled
#' @param type should sample individuals or markers?
#' @param selected.ind a vector containing the name of the individuals to select. Only has effect
#' if \code{type = "individual"}, \code{n = NULL} and \code{percentage = NULL}
#' @param selected.mrk a vector containing the name of the markers to select. Only has effect
#' if \code{type = "marker"}, \code{n = NULL} and \code{percentage = NULL}
#' @return an object of class \code{mappoly.data}
#' @keywords internal
#' @export
#' @importFrom magrittr "%>%"
#' @importFrom dplyr filter
sample_data <- function(input.data, n = NULL,
percentage = NULL,
type = c("individual", "marker"),
selected.ind = NULL,
selected.mrk = NULL){
if(!is.null(selected.mrk))
type <- "marker"
else type <- match.arg(type)
if(type == "individual"){
if(!is.null(n)){
selected.ind.id <- sort(sample(input.data$n.ind, n))
} else if(!is.null(percentage)){
selected.ind.id <- sample(input.data$n.ind, ceiling(input.data$n.ind * percentage/100))
} else if(!is.null(selected.ind)){
selected.ind.id <- match(selected.ind, input.data$ind.names)
} else {
stop("Inform 'n', 'percentage' or selected.ind.")
}
ind <- mrk <- NULL
selected.ind <- input.data$ind.names[selected.ind.id]
if(length(selected.ind.id) >= input.data$n.ind) return(input.data)
if(is.prob.data(input.data))
input.data$geno <- input.data$geno %>%
dplyr::filter(ind%in%selected.ind)
input.data$geno.dose <- input.data$geno.dose[,selected.ind.id]
input.data$ind.names <- input.data$ind.names[selected.ind.id]
input.data$n.ind <- ncol(input.data$geno.dose)
##Computing chi-square p.values
if(!is.null(input.data$chisq.pval)){
ploidy <- input.data$ploidy
Ds <- array(NA, dim = c(ploidy+1, ploidy+1, ploidy+1))
for(i in 0:ploidy)
for(j in 0:ploidy)
Ds[i+1,j+1,] <- segreg_poly(ploidy = ploidy, dP = i, dQ = j)
Dpop <- cbind(input.data$dosage.p1, input.data$dosage.p2)
M <- t(apply(Dpop, 1, function(x) Ds[x[1]+1, x[2]+1,]))
dimnames(M) <- list(input.data$mrk.names, c(0:ploidy))
M <- cbind(M, input.data$geno.dose)
input.data$chisq.pval <- apply(M, 1, mrk_chisq_test, ploidy = ploidy)
}
return(input.data)
}
else if(type == "marker"){
if(!is.null(n)){
selected.mrk.id <- sort(sample(input.data$n.mrk, n))
} else if(!is.null(percentage)){
selected.mrk.id <- sort(sample(input.data$n.mrk, ceiling(input.data$n.mrk * percentage/100)))
} else if(!is.null(selected.mrk)){
selected.mrk.id <- match(selected.mrk, input.data$mrk.names)
} else{
stop("Inform 'n', 'percentage' or selected.mrk.")
}
selected.mrk <- input.data$mrk.names[selected.mrk.id]
if(length(selected.mrk.id) >= input.data$n.mrk) return(input.data)
if(is.prob.data(input.data))
input.data$geno <- input.data$geno %>%
dplyr::filter(mrk%in%selected.mrk)
input.data$geno.dose <- input.data$geno.dose[selected.mrk.id, , drop = FALSE]
input.data$n.mrk <- nrow(input.data$geno.dose)
input.data$mrk.names <- input.data$mrk.names[selected.mrk.id]
input.data$dosage.p1 <- input.data$dosage.p1[selected.mrk.id]
input.data$dosage.p2 <- input.data$dosage.p2[selected.mrk.id]
input.data$chrom <- input.data$chrom[selected.mrk.id]
input.data$genome.pos <- input.data$genome.pos[selected.mrk.id]
input.data$seq.ref <- input.data$seq.ref[selected.mrk.id]
input.data$seq.alt <- input.data$seq.alt[selected.mrk.id]
input.data$all.mrk.depth <- input.data$all.mrk.depth[selected.mrk.id]
input.data$kept <- intersect(input.data$mrk.names, input.data$kept)
input.data$elim.correspondence <- input.data$elim.correspondence[input.data$elim.correspondence$kept%in%input.data$mrk.names, , drop = FALSE]
input.data$chisq.pval <- input.data$chisq.pval[names(input.data$chisq.pval)%in%input.data$mrk.names]
if(!is.null(input.data$chisq.pval))
input.data$chisq.pval <- input.data$chisq.pval[names(input.data$chisq.pval)%in%input.data$mrk.names]
return(input.data)
}
else stop("Inform type")
}
#' Get weighted ordinary least squared map give a sequence and rf matrix
#'
#' @param void internal function to be documented
#' @keywords internal
#' @export
#' @importFrom zoo na.approx
get_ols_map <- function(input.seq, input.map, weight = TRUE){
id <- input.seq$seq.mrk.names
y <- as.numeric((imf_h(as.dist(input.map$rec.mat[id,id]))))
w <- as.numeric((imf_h(as.dist(input.map$lod.mat[id,id]))))
v <- t(combn(id,2))
rf <- get_rf_from_mat(input.map$rec.mat[id,id])
rf <- zoo::na.approx(rf)
z <- cumsum(imf_h(c(0,rf)))
names(z) <- id
x <- numeric(nrow(v))
names(x) <- names(y) <- apply(v, 1, paste0, collapse = "-")
for(i in 1:nrow(v))
x[i] <- z[v[i,2]]-z[v[i,1]]
if(weight)
model <- lm(y ~ x-1, weights = w)
else
model <- lm(y ~ x-1)
new <- data.frame(x = z)
u <- predict(model, new)
d <- cumsum(imf_h(c(0, mf_h(diff(u)))))
names(d) <- id
d
}
#' Get dosage type in a sequence
#'
#' @param void internal function to be documented
#' @keywords internal
#' @export
get_dosage_type <- function(input.seq){
p <- abs(abs(input.seq$seq.dose.p1 - input.seq$ploidy/2) - input.seq$ploidy/2)
q <- abs(abs(input.seq$seq.dose.p2 - input.seq$ploidy/2) - input.seq$ploidy/2)
s.p <- p == 1 & q == 0
s.q <- p == 0 & q == 1
ds <- p == 1 & q == 1
list(simplex.p = input.seq$seq.mrk.names[s.p],
simplex.q = input.seq$seq.mrk.names[s.q],
double.simplex = input.seq$seq.mrk.names[ds],
multiplex = input.seq$seq.mrk.names[!(s.p | s.q | ds)])
}
#' Aggregate matrix cells (lower the resolution by a factor)
#'
#' @param void internal function to be documented
#' @keywords internal
#' @export
aggregate_matrix <- function(M, fact){
id <- seq(1,ncol(M), by = fact)
id <- cbind(id, c(id[-1]-1, ncol(M)))
R <- matrix(NA, nrow(id), nrow(id))
for(i in 1:(nrow(id)-1)){
for(j in (i+1):nrow(id)){
R[j,i] <- R[i,j] <- mean(M[id[i,1]:id[i,2], id[j,1]:id[j,2]], na.rm = TRUE)
}
}
R
}
#' Get states and emission in one informative parent
#'
#' @param void internal function to be documented
#' @keywords internal
#' @export
get_states_and_emission_one_parent <- function(ploidy, ph, global.err, D){
n.mrk <- nrow(D)
n.ind <- ncol(D)
A <- matrix(0, nrow = choose(ploidy, ploidy/2), ncol = length(ph))
for(i in 1:length(ph)){
id1 <- numeric(ploidy)
id1[ph[[i]]] <- 1
A[,i] <- apply(combn(id1, ploidy/2), 2, sum)
}
if(round(global.err, 4) == 0.0){
e <- h <- vector("list", n.mrk)
for(j in 1:n.mrk){
e.temp <- h.temp <- vector("list", n.ind)
for(i in 1:n.ind){
h.temp[[i]] <- which(A[,j] == D[j,i]) - 1
if(length(h.temp[[i]]) == 0)
h.temp[[i]] <- 1:nrow(A) - 1
#e.temp[[i]] <- rep(1, length(h.temp[[i]]))/length(h.temp[[i]])
e.temp[[i]] <- rep(1, length(h.temp[[i]]))
}
e[[j]] <- e.temp
h[[j]] <- h.temp
}
} else if(round(global.err, 4) > 0.0){
e <- h <- vector("list", n.mrk)
for(j in 1:n.mrk){
e.temp <- h.temp <- vector("list", n.ind)
for(i in 1:n.ind){
h.temp[[i]] <- 1:nrow(A) - 1
s <- which(A[,j] == D[j,i])
if(length(s) == 0)
e.temp[[i]] <- rep(1/nrow(A), nrow(A))
else{
e.temp0 <- numeric(nrow(A))
e.temp0[-s] <- global.err/(nrow(A) - length(s))
e.temp0[s] <- (1- global.err)/length(s)
e.temp[[i]] <- e.temp0
}
}
e[[j]] <- e.temp
h[[j]] <- h.temp
}
}
list(states = h, emission = e)
}
#' Conversion: vector to matrix
#'
#' @param void internal function to be documented
#' @keywords internal
#' @export
v_2_m <- function(x, n){
y <- base::matrix(NA, n, n)
y[base::lower.tri(y)] <- as.numeric(x)
y[base::upper.tri(y)] <- t(y)[base::upper.tri(y)]
y
}
#' Compare a list of maps
#'
#' Compare lengths, density, maximum gaps and log likelihoods in a list of maps.
#' In order to make the maps comparable, the function uses the intersection of
#' markers among maps.
#'
#' @param ... a list of objects of class \code{mappoly.map}
#'
#' @return A data frame where the lines correspond to the maps in the order provided in input list list
#'
#' @export
compare_maps <- function(...){
l <- list(...)
id <- lapply(l, function(x) x$info$mrk.names)
id <- Reduce(intersect, id)
if(length(id) < 2)
stop("At least two markers must be present in all maps")
map.out <- vector("list", length(l))
for(i in 1:length(l)){
id.temp <- which(l[[i]]$info$mrk.names%in%id)
map.temp <- filter_map_at_hmm_thres(get_submap(l[[i]], mrk.pos = id.temp,
reestimate.rf = FALSE, verbose = FALSE), 10e-5)
map.out[[i]] <-loglike_hmm(map.temp)
}
a <- summary_maps(map.out, verbose = FALSE)
a <- cbind(a[-nrow(a), -c(1,5,6,7,8)], sapply(map.out, function(x) x$maps[[1]]$loglike))
a <- cbind(a, rep("-", nrow(a)))
a[which.max(a[,5]),6] <- "*"
colnames(a)[c(5,6)] <- c("loglike", "max_likelog")
return(as.data.frame(a))
}
# Skeleton to test CPP functions
# @export
# test_CPP<-function(m, rf)
# .Call("rec_number", as.integer(m), as.numeric(rf), PACKAGE = "mappoly")
#
#' @export
.mappoly_data_skeleton<-function()
structure(list(ploidy = NA,
n.ind = NA,
n.mrk = NA,
ind.names = NA,
mrk.names = NA,
dosage.p1 = NA,
dosage.p2 = NA,
chrom = NA,
genome.pos = NA,
seq.ref = NA,
seq.alt = NA,
all.mrk.depth = NA,
prob.thres = NA,
geno.dose = NA,
nphen = NA,
phen = NA,
kept = NA,
chisq.pval = NA,
elim.correspondence = NA),
class = "mappoly.data")
#' Split map into sub maps given a gap threshold
#'
#' @param void internal function to be documented
#' @keywords internal
#' @export
split_mappoly <- function(input.map,
gap.threshold = 5,
size.rem.cluster = 1,
phase.config = "best",
tol.final = 10e-4,
verbose = TRUE){
if (!inherits(input.map, "mappoly.map")) {
stop(deparse(substitute(input.map)), " is not an object of class 'mappoly.map'")
}
## choosing the linkage phase configuration
LOD.conf <- get_LOD(input.map, sorted = FALSE)
if(phase.config == "best") {
i.lpc <- which.min(LOD.conf)
} else if (phase.config > length(LOD.conf)) {
stop("invalid linkage phase configuration")
} else i.lpc <- phase.config
id <- which(imf_h(input.map$maps[[i.lpc]]$seq.rf) > gap.threshold)
if(length(id) == 0){
if(verbose) cat("no submaps found\n")
return(input.map)
}
id <- cbind(c(1, id+1), c(id, input.map$info$n.mrk))
temp.map <- input.map
## Selecting map segments larger then the specified threshold
segments <- id[apply(id, 1, diff) > size.rem.cluster - 1, , drop = FALSE]
if(length(segments) == 0) stop("all markers were eliminated\n")
## Dividing map in sub-maps
temp.maps <- vector("list", nrow(segments))
if (verbose) {
ns <- nrow(segments)
if(ns == 1){
cat("one submap found ...\n")
map <- get_submap(input.map, c(segments[1, 1]:segments[1, 2]), tol.final = tol.final, verbose = FALSE)
return(filter_map_at_hmm_thres(map, 10e-4))
}
else cat(ns, "submaps found ...\n")
}
for(i in 1:length(temp.maps)){
temp.id <- c(segments[i, 1]:segments[i, 2])
if(length(temp.id) > 1)
temp.maps[[i]] <- get_submap(input.map, temp.id, reestimate.rf = FALSE, verbose = FALSE)
else
temp.maps[[i]] <- input.map$info$mrk.names[temp.id]
}
temp.maps
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.