Nothing
# Assign dosage scores to parents when using probabilistic genotype data
#' @param chk Output list as returned by function \code{\link{checkF1}}
#' @param probgeno_df A data.frame as read from the scores file produced by function
#' \code{saveMarkerModels} of R package \code{fitPoly}, or alternatively, a data.frame containing the following columns:
#' \itemize{
#' \item{SampleName}{
#' Name of the sample (individual)
#' }
#' \item{MarkerName}{
#' Name of the marker
#' }
#' \item{P0}{
#' Probabilities of dosage score '0'
#' }
#' \item{P1...}{
#' Probabilities of dosage score '1' etc. (up to max offspring dosage, e.g. P4 for tetraploid population)
#' }
#' \item{maxP}{
#' Maximum genotype probability identified for a particular individual and marker combination
#' }
#' \item{maxgeno}{
#' Most probable dosage for a particular individual and marker combination
#' }
#' \item{geno}{
#' Most probable dosage for a particular individual and marker combination, if \code{maxP} exceeds a user-defined threshold (e.g. 0.9), otherwise \code{NA}}
#' }
#' @param scorefile filename for tab-separated text file with the dosages. If NA no file is written
#' @return A data.frame with three columns: "MarkerName", "parent1" and "parent2".
#' @examples
#' data("chk1","gp_df")
#' pardose<-assign_parental_dosage(chk=chk1,probgeno_df=gp_df)
#' @noRd
assign_parental_dosage <- function(chk,
probgeno_df,
scorefile = NA){
probgeno_df <- test_probgeno_df(probgeno_df)
result <- compareProbes.gp(chk = chk,
scores = probgeno_df,
probe.suffix = NA,
fracdiff.threshold = 0.04,
qall_flavor = "qall_mult",
compfile = NA,
combscorefile = scorefile)
r <- result$combscores
r <- r[,c("MarkerName","parent1","parent2")]
r <- r[!is.na(r$parent1) & !is.na(r$parent2),]
return(invisible(r))
} #assign_parental_dosage
#' calc_binning_thresholds
#' @description Calculate LOD and recombination frequency thresholds for marker binning
#' @param dosage_matrix An integer matrix with markers in rows and individuals in columns.
#' @noRd
calc_binning_thresholds <- function(dosage_matrix){
Ntot <- ncol(dosage_matrix) #exclude marker name and 2 parent cols
the_na_rate <-
sum(is.na(dosage_matrix)) / length(dosage_matrix) #0.02054193
N_e <- Ntot * (1 - the_na_rate)
r_thresh <- 1 / N_e
lod_thresh <- 23.42072 + 0.115817 * N_e # See Bourke et al. (2016) for details
out <- c(r_thresh, lod_thresh)
names(out)<- c("r_thresh", "lod_thresh")
return(out)
} #calc_binning_thresholds
#'@title Calculate potential shifts in marker dosages
#'@description Internal function for \code{correctDosages} function
#'@param segtypes Vector of segregation types
#'@param ploidy The ploidy of the species
#'@return Data.frame with columns:
#'\item{segtype}{ The segtype code before the underscore i.e. without the first dosage class
#'}
#'\item{mindos}{ The first dosage class
#'}
#'\item{shift}{ -1, 0 or 1: potental shifts to try out for each segtype whether a shift of +/- 1
#'should be tried or not (0) (only when number of dosages in segtype in <= ploidy-2 and
#'min.dosage==1 or max.dosage==ploidy-1)
#'}
#'@noRd
calcPotShifts <- function(segtypes, ploidy) {
shift <- rep(0, length(segtypes))
segtypes[is.na(segtypes) | segtypes==""] <- "x_0" #avoid problems with strsplit
segtypes <- do.call(rbind, strsplit(as.character(segtypes), split="_"))
numdos <- nchar(segtypes[,1]) #number of dosage classes in segtype
compl <- substr(segtypes[,1],1,1) == "c"
lencompl <- as.integer(substr(segtypes[compl, 1], 2, numdos[compl]-1))
#nr of dosages in complex segtype
numdos[compl] <- lencompl
mindos <- as.integer(segtypes[,2])
#shift down: numdos <= ploidy-2 and lowest dosage = 1:
shift[numdos <= ploidy-2 & mindos == 1] <- -1
#shift up: numdos <= ploidy-2 and highest dosage = ploidy-1:
maxdos <- mindos + numdos - 1
shift[numdos <= ploidy-2 & maxdos == ploidy-1] <- 1
data.frame(seg=segtypes[,1], mindos=mindos, shift=shift,
stringsAsFactors=FALSE)
} #calcPotShifts
#'calc_q1 ***********************************************************
#'@description Calculate q1 quality score. q1 is a measure of the fit of selfit.
#'It is based on Pvalue and the fraction of valid F1 scores. If either is below their
#'threshold the q1 value is 0. Also, if bestParentfit is different from bestfit this indicates a segregation distortion.
#'@param Pvalue P-value of bestParentfit segtype
#'@param fracInvalid Fraction invalid scores of bestParentfit segtype
#'@param bestfit segtype of best fit of population
#'@param bestParentfit segtype of best fit of parents
#'@param Pvalue_threshold Threshold P-value of bestParentfit segtype
#'@param fracInvalid_threshold a maximum threshold for the fracInvalid of the bestParentfit segtype
#' (with a larger fraction of invalid dosages in the F1 the q1 quality parameter will be set to 0)
#' @noRd
calc_q1 <- function(Pvalue,
fracInvalid,
bestfit,
bestParentfit,
Pvalue_threshold,
fracInvalid_threshold) {
#x compares bestfit and bestParentfit:
if (bestfit == bestParentfit) {
x <- 1
} else {
x <- 0.5
}
#Version 20150220: nonzero but low y values also below P=0.001 depending
#on threshold:
if (Pvalue < Pvalue_threshold) y <- 0 else {
if (Pvalue < 0.01) {
if (Pvalue < 0.001) y <- linterpol(Pvalue, c(0,0), c(0.001, 0.05)) else
y <- linterpol(Pvalue, c(0.001, 0.05), c(0.01, 0.6))
} else if (Pvalue < 0.15) {
if (Pvalue < 0.05) y <- linterpol(Pvalue, c(0.01, 0.6), c(0.05, 0.8)) else
y <- linterpol(Pvalue, c(0.05, 0.8), c(0.15, 1))
} else y <- 1
}
#z is determined by fraction F1 scores in valid categories for the selfit
#segtype. z is equal to 1-fracInvalid, but below 1-frqinvalid.threshold z=0.
z <- ifelse(fracInvalid > fracInvalid_threshold, 0, 1 - fracInvalid)
x * y * z
} #calc_q1
#'calc_q2 ***********************************************************
#'@description q2 is about how well the parents are scored and fit with the selfit segtype
#'@param par.conflicts Amount of conflicts in parental scores
#'@param par.NAfrac Amount of missing values in the parents
#'@param matchParents Should parents match F1? One of the following: "No", "Unknown", "OneOK" or "Yes"
#'@param parentsScoredWithF1 Logical, if \code{TRUE} then q2 is determined not by matchParents but
#'by the amount of conflicts and missing values in the parents. If \code{FALSE}, par.conflicts and
#'par.NAfrac are not used and the information on the match between parents and F1 segtype has to come
#'from matchParents.
#'@noRd
calc_q2 <- function(par.conflicts,
par.NAfrac,
matchParents,
parentsScoredWithF1) {
if (parentsScoredWithF1) {
q2 <- max(0, 1 - 0.5 * sum(par.conflicts) - 0.5 * sum(par.NAfrac))
#In the version of 20150220 matchParents cannot be "No": selfit is always
#bestParentfit (sometimes after promoting lowconf parental scores).
#Therefore the remaining matchParents codes Unknown, OneOK and Yes are
#correlated with the q2 score, no point to use the matchParents
} else {
# !parentsScoredWithF1
# Note that matchParents=="No" cannot occur in the versions since 20150220.
q2 <- match(matchParents,
c("No", "Unknown", "OneOK", "Yes")) #1:4 or NA
#q2 <- c(0, 0.5, 0.9, 1)[q2] #translate into 0..1 values
q2 <- c(0, 0.65, 0.9, 1)[q2] #translate into 0..1 values
#version of 20160324: the penalty for Unknown is less if the parents
#are not scored with the F1: there is still the lack of information, but
#the lack of scores does not mean that the (F1) genotyping is less reliable.
}
q2
} #calc_q2
#'calc_q3 ***********************************************************
#'@description #q3 is about the fraction missing F1 scores. If that is larger than NA.threshold, q3=0;
#'else q3 depends on the fraction scored F1's
#'@param F1.NAfrac Fraction of missing values in the F1
#'@param fracNA_threshold Threshold for fraction missing values
#'@noRd
calc_q3 <- function(F1.NAfrac,
fracNA_threshold) {
if (F1.NAfrac > fracNA_threshold) {
q3 <- 0
} else if (fracNA_threshold > 0.2) {
if (F1.NAfrac >= 0.2) {
# 0.2<=F1.NAfrac<threshold
q3 <- linterpol(F1.NAfrac, c(0.2, 0.5), c(fracNA_threshold, 0))
} else {
# 0<=F1.NAfrac<0.2
q3 <- linterpol(F1.NAfrac, c(0, 1), c(0.2, 0.5))
}
} else {
# fracNA_threshold <= 0.2, 0<=F1.NAfrac<threshold
#: let q3 go from 1 at frac=0 to 0.5 at frac=threshold:
q3 <- linterpol(F1.NAfrac, c(0, 1), c(fracNA_threshold, 0.5))
}
q3
} #calc_q3
#'calc_qall ***********************************************************
#'@param Pvalue_threshold Threshold can be disabled by setting to 0
#'@param fracInvalid_threshold Threshold can be disabled by setting to 1
#'@param fracNA_threshold Threshold can be disabled by setting to 1
#'@param Pvalue P-value of bestParentfit segtype
#'@param fracInvalid Fraction invalid scores of the bestParentfit segtype
#'@param F1.NAfrac fraction missing scores among F1 samples
#'@param matchParents Should parents match F1? One of the following: "No", "Unknown", "OneOK" or "Yes"
#'@param bestfit segtype of best fit of population
#'@param bestParentfit segtype of best fit of parents
#'@param par.conflicts Logical vector: is there a conflict for P1, P2
#'@param par.NAfrac number of NA scores for the samples of P1, P2
#'@param critweight numeric vector of length 3 (3 quality criteria) with their weights,
#'or NA (in that case qall = q1*q2*q3, not a weighted average)
#'@param parentsScoredWithF1 Were parents scored with F1?
#'@return vector of length 4 or 5. The last (two) components of the vector are qall_mult
#'(obtained by multiplication of q1..q3), and if critweights is supplied, qall_weights
#'(a weighted average of q1..q3)
#'@noRd
calc_qall <- function(Pvalue_threshold,
fracInvalid_threshold,
fracNA_threshold,
Pvalue,
fracInvalid,
F1.NAfrac,
matchParents,
bestfit,
bestParentfit,
par.conflicts,
par.NAfrac,
critweight,
parentsScoredWithF1) {
q <- numeric(ifelse (length(critweight) == 3, 5, 4)) #last (two) are qall
q[1] <- calc_q1(Pvalue, fracInvalid, bestfit, bestParentfit,
Pvalue_threshold, fracInvalid_threshold)
q[2] <- calc_q2(par.conflicts, par.NAfrac, matchParents,
parentsScoredWithF1)
q[3] <- calc_q3(F1.NAfrac, fracNA_threshold)
q[4] <- prod(q[1:3]) #q[4] is qall_mult
if (length(critweight) == 3) {
q[5] <- sum(q[1:3] * critweight) / sum(critweight) #q[5] is qall_weights
}
q
} #calc_qall
#'checkFilename ***********************************************************
#'@description Check if a filename is valid for writing by trying to create the file.
#'If the filename contains a path, result will only be TRUE if the whole path already exists
#'@param filename The filename to check
#'@param overwrite if TRUE (default) an existing file of that name will be deleted (if it is not
#' protected by being locked, read-only, owned by another user etc)
#' @noRd
checkFilename <- function(filename, overwrite=TRUE) {
filename <- filename[1] # we test only the first filename
if (!overwrite && file.exists(filename)) return(FALSE)
tryCatch(
{ suppressWarnings(
if (file.create(filename)) {
file.remove(filename)
TRUE
} else FALSE)
}, error = function(e) { FALSE }
)
} #checkFilename
#'chk2integer ***********************************************************
#'@description Convert data from checkF1 to integer
#'@param chk a data frame as returned by checkF1
#'@return the same data frame, but with all columns from Parent1 to last ancestor
#'converted to integer (as these are sometimes factors with levels not equal to values
#'after reading checkF1 from file)
#'@noRd
chk2integer <- function(chk) {
#find out the ploidy:
F1cap <- names(chk)[which(names(chk) == "F1_NA") - 1]
ploidyF1 <- as.integer(sub("F1_", "", F1cap))
#find the column range to convert to integers:
firstcol <- which(names(chk) == "parent1")
#determine which column has the last parent or ancestor sample:
#if 3 columns for each segtype are present (showAll TRUE), then
#the first column after the last sample has name frqInvalid_<segtype> (where
#<segtype> depends on the ploidy).
#else (showAll FALSE) the next column has name "bestfit", followed
#by "frqInvalid_bestfit"
lastcol <- which(leftstr(names(chk), 10) == "frqInvalid")[1]
if (rightstr(names(chk)[lastcol], 7) == "bestfit") {
lastcol <- lastcol - 2
} else {
lastcol <- lastcol - 1
}
if (length(firstcol) != 1 || is.na(lastcol) ||
lastcol - firstcol < 3 + ploidyF1) {
stop("chk2integer: column names of chk incorrect")
}
for (i in firstcol:lastcol) {
suppressWarnings(chk[[i]] <- as.integer(as.character(chk[[i]])))
#warnings caused by NAs
}
chk
} #chk2integer
#' Add colour bar scale to heatplot
#' @description Function to generate a scale for heatplots
#' @param col.data vector of colours
#' @param min minimum colour
#' @param max maximum colour
#' @param cex.ticks size of ticks on colour bar
#' @param nticks number of ticks on colour bar
#' @param ticks vector of positions of ticks on colour bar
#' @param title optional title for colour bar
#' @param ylab optional y-axis label for colour bar
#' @param cex.lab size of labels on colour bar
#' @noRd
colour.bar <- function(col.data, min, max=-min, cex.ticks = 1.2, nticks=11,
ticks=seq(min, max, len=nticks), title='', ylab = '',
cex.lab = 1) {
scale <- length(col.data)/(max-min)
plot(c(0,10), c(min,max), type='n', bty='n', xaxt='n', xlab='', yaxt='n', ylab=ylab, main=title, cex.lab = cex.lab)
axis(2, ticks, las=1, cex.axis = cex.ticks)
for (i in 1:length(col.data)) {
y = (i-1)/scale + min
rect(0,y,10,y+1/scale, col=col.data[i], border=NA)
}
} #colour.bar
#' Function to compare probes of the same marker and eventually assist merge
#' @description compareProbes.gp is used to compare probes of the same marker
#' @param chk Output list as returned by function \code{\link{checkF1}}
#' @param scores Input data, probabilistic genotype scores
#' @param probe.suffix Codes used to identify probes targetting same SNP
#' @param fracdiff.threshold Acceptable proportion of differences in probes to still be considered identical
#' @param parent1 Parent 1 identifier
#' @param parent2 Parent 2 identifier
#' @param qall_flavor Which quality criterion to use? By default qall_mult
#' @param shiftParents Logical, parental scores shifted?
#' @param compfile optional filename to write comparison info to
#' @param combscorefile Optional file to write comparison scores
#' @noRd
compareProbes.gp <- function (chk,
scores,
probe.suffix = c("P", "Q", "R"),
fracdiff.threshold = 0.04,
qall_flavor = "qall_mult",
compfile,
combscorefile){
parent1 <- chk$meta$parent1
parent2 <- chk$meta$parent2
F1 <- chk$meta$F1
ancestors <- chk$meta$ancestors
polysomic <- chk$meta$polysomi
disomic <- chk$meta$disomic
mixed <- chk$meta$mixed
ploidy <- chk$meta$ploidy
ploidy2 <- ifelse(is.null(chk$meta$ploidy2),chk$meta$ploidy,chk$meta$ploidy2)
shiftParents <- chk$meta$shiftParents
chk <- chk$checked_F1
noprobes <- length(probe.suffix) == 1 && is.na(probe.suffix)
if (noprobes){
funcname <- "writeScoresfile"
} else{
funcname <- "compareProbes"
}
if (!noprobes && (length(probe.suffix) != 3 || sum(is.na(probe.suffix)) >
0 || max(nchar(probe.suffix)) != min(nchar(probe.suffix)) ||
sum(abs(match(probe.suffix, probe.suffix) - c(1, 2, 3))) >
0))
stop("compareProbes: probe.suffix invalid")
if (ploidy%%2 != 0)
stop(paste(funcname, ": odd parental ploidy not allowed",
sep = ""))
if (missing(ploidy2) || is.na(ploidy2))
ploidy2 <- ploidy
else if (ploidy2%%2 != 0)
stop(paste(funcname, ": odd parental ploidy2 not allowed",
sep = ""))
ploidyF1 <- (ploidy + ploidy2)/2
segtypecol <- which(tolower(names(chk)) == "segtype")
if (length(segtypecol) != 1)
segtypecol <- which(tolower(names(chk)) == "selfit")
if (length(segtypecol) != 1)
segtypecol <- which(tolower(names(chk)) == "bestparentfit")
if (length(segtypecol) != 1)
stop(paste(funcname, ": chk should have one column segtype or bestParentfit",
sep = ""))
# if (class(chk[, segtypecol]) != "character") #failed CRAN check 1.1.3
if (!inherits(chk[, segtypecol], "character"))
chk[, segtypecol] <- as.character(chk[, segtypecol])
# if (class(chk$MarkerName) != "character") #failed CRAN check 1.1.3
if (!inherits(chk$MarkerName, "character"))
chk$MarkerName <- as.character(chk$MarkerName)
chk <- chk[!is.na(chk[, segtypecol]) & chk[, segtypecol] !=
"", ]
qallcol <- which(tolower(names(chk)) == qall_flavor)
if (length(qallcol) > 1)
qallcol <- qallcol[1]
shiftspresent <- FALSE
shiftcol <- which(tolower(names(chk)) == "shift")
if (length(shiftcol) > 1) {
stop(paste(funcname, ": chk should have at most one column 'shift'"),
sep = "")
}else if (length(shiftcol) == 0) {
chk$shift <- rep(0, nrow(chk))
shiftcol <- which(tolower(names(chk)) == "shift")
shiftParents <- TRUE
}else {
if (missing(shiftParents) || is.na(shiftParents)) {
if (ploidy == ploidy2)
shiftParents <- TRUE
else stop(paste(funcname, ": if ploidy2 != ploidy, shiftParents must be specified (FALSE/TRUE)",
sep = ""))
}
if (sum(is.na(chk[, shiftcol])) > 0)
stop(paste(funcname, ": column 'shift' in chk may not contain missing values",
sep = ""))
shiftspresent <- TRUE
}
progenitors <- c(parent1, parent2, ancestors)
progenitors <- progenitors[!is.na(progenitors)]
if (length(progenitors) == 0)
shiftParents <- FALSE
allsamp <- c(progenitors, F1)
if (shiftParents)
shfsamp <- rep(TRUE, length(allsamp))
else shfsamp <- c(rep(FALSE, length(progenitors)), rep(TRUE, length(F1)))
missamp <- allsamp[!(allsamp %in% unique(as.character(scores$SampleName)))]
if (length(missamp) > 0)
stop(paste(funcname, ": some samples not in scores:",
paste(missamp, collapse = " ")))
# missamp <- progenitors[!(progenitors %in% names(chk))]
# if (length(missamp) > 0)
# stop(paste(funcname, ": some samples not in chk:", paste(missamp,
# collapse = " ")))
if (length(F1) != round(sum(chk[1, 5:(6 + ploidyF1)])))
stop(paste(funcname, ": F1 has", length(F1), "samples but in chk",
sum(chk[1, 5:(6 + ploidyF1)]), "values counted"))
which_shf <- which(rightstr(chk$MarkerName, 4) == "_shf")
chk$SNPname <- chk$MarkerName
chk$SNPname[which_shf] <- leftstr(chk$MarkerName[which_shf],
-4)
if (sum(is.na(match(chk$SNPname, unique(as.character(scores$MarkerName))))) >
0)
stop(paste(funcname, ": not all markers from chk appear in scores",
sep = ""))
if (noprobes) {
chk$probe <- rep("", nrow(chk))
}else {
mnl <- nchar(chk$SNPname)
chk$probe <- substr(chk$SNPname, mnl - nchar(probe.suffix[1]) +
1, mnl)
chk$SNPname <- substr(chk$SNPname, 1, mnl - nchar(probe.suffix[1]))
}
if (!noprobes && sum(!(chk$probe %in% probe.suffix[1:2])) >
0)
stop("compareProbes: not all marker names have a specified probe suffix")
if (!noprobes && sum(nchar(chk$SNPname) == 0) > 0)
stop("compareProbes: not all marker names have a specified probe suffix")
if (!is.na(compfile) && !checkFilename(compfile))
stop(paste(funcname, ": compfile not valid or not writable",
sep = ""))
if (!is.na(combscorefile) && !checkFilename(combscorefile))
stop(paste(funcname, ": combscorefile not valid or not writable",
sep = ""))
if (is.factor(chk$parent1))
chk$parent1 <- as.integer(as.character(chk$parent1))
if (is.factor(chk$parent2))
chk$parent2 <- as.integer(as.character(chk$parent2))
if (is.factor(chk[, shiftcol]))
chk[, shiftcol] <- as.integer(as.character(chk[, shiftcol]))
if (is.factor(parent1))
parent1 <- as.character(parent1)
if (is.factor(parent2))
parent2 <- as.character(parent2)
if (is.factor(F1))
F1 <- as.character(F1)
if (is.factor(ancestors))
ancestors <- as.character(ancestors)
snpnames <- sort(unique(chk$SNPname))
segtype <- array(NA, dim = c(length(snpnames), 2, 2), dimnames = list(snpname = snpnames,
probe = c("probe1", "probe2"), shift = c("false", "true")))
chkshift <- segtype
qall <- segtype
cons.parent1 <- segtype
cons.parent2 <- segtype
Rsegtype <- segtype
for (sh in 1:2) {
if (sh == 1)
sel.sh <- chk$shift == 0
else sel.sh <- chk$shift != 0
if (noprobes)
proberange <- 1
else proberange <- 1:2
for (prb in proberange) {
if (noprobes) {
prbdat <- chk[sel.sh, ]
}
else {
prbdat <- chk[chk$probe == probe.suffix[prb] &
sel.sh, ]
}
rows <- match(prbdat$SNPname, snpnames)
segtype[rows, prb, sh] <- prbdat[, segtypecol]
chkshift[rows, prb, sh] <- prbdat[, shiftcol]
if (length(qallcol) == 1)
qall[rows, prb, sh] <- prbdat[, qallcol]
cons.parent1[rows, prb, sh] <- prbdat$parent1
cons.parent2[rows, prb, sh] <- prbdat$parent2
}
}
seginfo <- selSegtypeInfo(calcSegtypeInfo(ploidy, ploidy2),
polysomic, disomic, mixed)
if (sum(is.na(match(unique(chk[, segtypecol]), names(seginfo)))) >
0)
stop("compareProbes: chk contains segtypes not expected with the settings\nof polysomic/disomic/mixed/ploidy/ploidy2")
scores <- scores[scores$SampleName %in% allsamp, ]
combscores <- makeCombscoresDf(parent1, parent2, F1, ancestors,
nrow = nrow(chk))
combrow <- 0
batchsize <- 100
batchnr <- 1
batchscores <- list()
while (batchsize * (batchnr - 1) < length(snpnames)) {
cat(paste("batch", batchnr, "\n"))
minmrk <- batchsize * (batchnr - 1) + 1
maxmrk <- min(length(snpnames), batchsize * batchnr)
batchmarkers <- snpnames[minmrk:maxmrk]
if (!noprobes) {
batchmarkers <- c(paste(batchmarkers, probe.suffix[1],
sep = ""), paste(batchmarkers, probe.suffix[2],
sep = ""))
}
batchscores <- scores[scores$MarkerName %in% batchmarkers,
]
for (mrk in minmrk:maxmrk) {
sc <- list()
progen.sc <- array(NA, dim = c(length(progenitors),
2, 2), dimnames = list(sample = progenitors,
probe = c("probe1", "probe2"), shift = c("FALSE",
"TRUE")))
F1.sc <- array(NA, dim = c(length(F1), 2, 2), dimnames = list(sample = F1,
probe = c("probe1", "probe2"), shift = c("FALSE",
"TRUE")))
parent.sc <- array(NA, dim = c(2, 2, 2), dimnames = list(sample = c("parent1",
"parent2"), probe = c("probe1", "probe2"), shift = c("FALSE",
"TRUE")))
P0col <- which(names(scores) == "P0")
if (length(P0col) != 1)
stop("compareProbes: scores should contain columns P0 .. P<ploidy>")
for (prb in 1:2) if (sum(!is.na(segtype[mrk, prb,
])) > 0) {
if (noprobes) {
mrkname_scores <- snpnames[mrk]
}
else {
mrkname_scores <- paste(snpnames[mrk], probe.suffix[prb],
sep = "")
}
sc[[prb]] <- list()
sc[[prb]][[1]] <- batchscores[batchscores$MarkerName ==
mrkname_scores & batchscores$SampleName %in%
allsamp, ]
sc[[prb]][[1]] <- sc[[prb]][[1]][match(allsamp,
sc[[prb]][[1]]$SampleName), ]
if (!is.na(segtype[mrk, prb, 2])) {
sc[[prb]][[2]] <- sc[[prb]][[1]]
sv <- chkshift[mrk, prb, 2]
sc[[prb]][[2]]$geno[shfsamp] <- sc[[prb]][[2]]$geno[shfsamp] +
sv
inval <- shfsamp & !(sc[[prb]][[2]]$geno %in%
0:ploidyF1)
sc[[prb]][[2]]$geno[inval] <- NA
if (sv > 0) {
sc[[prb]][[2]][shfsamp, P0col + ploidyF1] <- sum(sc[[prb]][[2]][shfsamp,
P0col + ((ploidyF1 - sv):ploidyF1)])
if (sv < ploidy)
sc[[prb]][[2]][shfsamp, P0col + (sv:(ploidyF1 -
1))] <- sc[[prb]][[2]][shfsamp, P0col +
(0:(ploidyF1 - 1 - sv))]
sc[[prb]][[2]][shfsamp, P0col + (0:(sv -
1))] <- 0
}
else {
sc[[prb]][[2]][shfsamp, P0col] <- sum(sc[[prb]][[2]][shfsamp,
P0col + (0:(-sv))])
if ((-sv) < ploidyF1)
sc[[prb]][[2]][shfsamp, P0col + (1:(ploidyF1 +
sv))] <- sc[[prb]][[2]][shfsamp, P0col +
((1 - sv):ploidyF1)]
sc[[prb]][[2]][shfsamp, P0col + (ploidyF1 +
sv + 1):ploidyF1] <- 0
}
suppressWarnings(sc[[prb]][[2]]$maxP <- apply(sc[[prb]][[2]][P0col +
(0:ploidyF1)], 1, max, na.rm = TRUE))
}
for (sh in 1:2) {
if (!is.na(segtype[mrk, prb, sh])) {
mrkname_out <- mrkname_scores
if (shiftspresent)
mrkname_out <- paste(mrkname_out, c("n",
"s")[sh], sep = "")
progensc <- sc[[prb]][[sh]][sc[[prb]][[sh]]$SampleName %in%
progenitors, ]
progen.sc[match(progensc$SampleName, progenitors),
prb, sh] <- progensc$geno
F1.sc[, prb, sh] <- sc[[prb]][[sh]]$geno[sc[[prb]][[sh]]$SampleName %in%
F1]
parent.sc[, prb, sh] <- getPargeno(cons.parent1[mrk,
prb, sh], cons.parent2[mrk, prb, sh], segtype = segtype[mrk,
prb, sh], seginfo = seginfo)
combrow <- combrow + 1
if (combrow > nrow(combscores))
combscores <- rbind(combscores, makeCombscoresDf(parent1,
parent2, F1, ancestors, nrow = 5000))
combscores$MarkerName[combrow] <- mrkname_out
combscores$segtype[combrow] <- segtype[mrk,
prb, sh]
combscores[combrow, 3:length(combscores)] <- c(progen.sc[,
prb, sh], parent.sc[, prb, sh], F1.sc[,
prb, sh])
}
}
}
if (!noprobes) {
numcomb <- 0
for (sh1 in 1:2) for (sh2 in 1:2) if (!is.na(segtype[mrk,
1, sh1]) && !is.na(segtype[mrk, 2, sh2]) &&
segtype[mrk, 1, sh1] == segtype[mrk, 2, sh2]) {
bothscored <- sum(!is.na(sc[[1]][[sh1]]$geno) &
!is.na(sc[[2]][[sh2]]$geno))
diff <- sc[[1]][[sh1]]$geno != sc[[2]][[sh2]]$geno
Ndifferent <- sum(diff, na.rm = TRUE)
fracdiff <- Ndifferent/bothscored
if (!is.na(fracdiff) && fracdiff <= fracdiff.threshold) {
Rsegtype[mrk, sh1, sh2] <- segtype[mrk, 1,
sh1]
numcomb <- numcomb + 1
combgeno <- sc[[1]][[sh1]]$geno
w <- which(is.na(sc[[1]][[sh1]]$geno) & !is.na(sc[[2]][[sh2]]$geno))
combgeno[w] <- sc[[2]][[sh2]]$geno[w]
w <- which(!is.na(sc[[1]][[sh1]]$geno) &
!is.na(sc[[2]][[sh2]]$geno) & sc[[2]][[sh2]]$geno !=
sc[[1]][[sh1]]$geno & sc[[2]][[sh2]]$maxP >
sc[[1]][[sh1]]$maxP)
combgeno[w] <- sc[[2]][[sh2]]$geno[w]
parent.na <- matrix(NA, nrow = 2, ncol = 2)
parent.na[1, ] <- is.na(parent.sc[, 1, sh1])
parent.na[2, ] <- is.na(parent.sc[, 2, sh2])
combparent <- parent.sc[, 1, sh1]
w <- which(parent.na[1, ] & !parent.na[2,
])
combparent[w] <- parent.sc[w, 2, sh2]
w <- which(!parent.na[1, ] & !parent.na[2,
] & parent.sc[, 1, sh1] != parent.sc[,
2, sh2])
combparent[w] <- NA
if (xor(parent.na[1, 1], parent.na[1, 2]) &&
xor(parent.na[2, 1], parent.na[2, 2]) &&
xor(parent.na[1, 1], parent.na[2, 1]) &&
getMatchParents(combparent, seginfo[[segtype[mrk,
prb]]]) == "No") {
combparent <- c(NA, NA)
}
if (shiftspresent) {
mrkname_cmb <- paste(snpnames[mrk], "R",
paste(c("n", "s")[c(sh1, sh2)], collapse = ""),
sep = "")
}
else {
mrkname_cmb <- paste(snpnames[mrk], "R",
sep = "")
}
combrow <- combrow + 1
if (combrow > nrow(combscores))
combscores <- rbind(combscores, makeCombscoresDf(parent1,
parent2, F1, ancestors, nrow = 5000))
combscores$MarkerName[combrow] <- mrkname_cmb
combscores$segtype[combrow] <- segtype[mrk,
1, sh1]
combscores[combrow, 3:length(combscores)] <- c(combgeno[1:length(progenitors)],
combparent, combgeno[(length(progenitors) +
1):length(combgeno)])
}
}
}
}
batchnr <- batchnr + 1
}
if (combrow == 0)
combscores <- combscores[0, ]
else combscores <- combscores[1:combrow, ]
if (!is.na(combscorefile))
write.table(combscores, combscorefile, quote = FALSE,
sep = "\t", na = "", row.names = FALSE, col.names = TRUE)
if (noprobes) {
compstat <- NA
}else {
compstat <- data.frame(SNPname = snpnames)
sufx <- matrix(c(paste(probe.suffix[1], "n", sep = ""),
paste(probe.suffix[1], "s", sep = ""), paste(probe.suffix[2],
"n", sep = ""), paste(probe.suffix[2], "s", sep = "")),
2, 2, byrow = TRUE)
max_sh <- 2
Rsufx <- matrix(c("nn", "ns", "sn", "ss"), 2, 2, byrow = TRUE)
if (!shiftspresent) {
max_sh <- 1
sufx <- substr(sufx, 1, 1)
Rsufx <- matrix("")
}
for (prb in 1:2) for (sh in 1:max_sh) {
compstat[[paste("segtype", sufx[prb, sh], sep = "")]] <- segtype[,
prb, sh]
if (length(qallcol) == 1) {
compstat[[paste("qall", sufx[prb, sh], sep = "")]] <- qall[,
prb, sh]
}
}
for (sh1 in 1:max_sh) for (sh2 in 1:max_sh) {
compstat[[paste("segtype", probe.suffix[3], Rsufx[sh1,
sh2], sep = "")]] <- Rsegtype[, sh1, sh2]
}
for (prb in 1:2) {
compstat[[paste("count", probe.suffix[prb], sep = "")]] <- rowSums(!is.na(segtype[,
prb, ]))
}
compstat[[paste("count", probe.suffix[3], sep = "")]] <- apply(!is.na(Rsegtype),
1, sum)
rownames(compstat) <- NULL
if (!is.na(compfile))
write.table(compstat, compfile, row.names = FALSE,
col.names = TRUE, sep = "\t", quote = FALSE,
na = "")
}
return(invisible(list(combscores = combscores, compstat = compstat)))
} #compareProbes.gp
#' Convert a matrix to data-frame
#' @description Function to convert output from mappoly to data-frame compatible with polymapR, for use in probabilitic genotype pipeline
#' @param m matrix, output of mappoly::rf_list_to_matrix
#' @param dat an object of class 'mappoly.data'
#' @noRd
convert_matrix_to_df <- function(m, dat){
x1 <- reshape2::melt(m$rec.mat) #rf
x2 <- reshape2::melt(m$lod.mat) #LOD
x3 <- reshape2::melt(m$ShP) #how many homologs share alleles (doses) between markers in P1
x4 <- reshape2::melt(m$ShQ)#how many homologs share alleles (doses) between markers in P2
df.rec <- data.frame(marker_a = as.character(x1[,1]),
marker_b = as.character(x1[,2]),
dose_a_P1 = dat$dosage.p[as.character(x1[,1])],
dose_b_P1 = dat$dosage.p[as.character(x1[,2])],
dose_a_P2 = dat$dosage.q[as.character(x1[,1])],
dose_b_P2 = dat$dosage.q[as.character(x1[,2])],
r = round(x1[,3], 6),
LOD = round(x2[,3], 3),
phase_p1 = x3[,3],
phase_p2 = x4[,3])
a <- combn(colnames(m$rec.mat), 2)
id1 <- apply(a, 2, paste0, collapse = "-")
id2 <- apply(df.rec[,1:2], 1, paste0, collapse = "-")
df.rec <- df.rec[match(id1, id2), ]
return(df.rec)
} #convert_matrix_to_df
#' User interface for specifying linkage group for a homologue
#' @param LG_hom_stack A data.frame defining clusters
#' @param LG_number Number of chromosomes (linkage groups)
#' @noRd
createChmHomList <- function(LG_hom_stack, LG_number=12){
## This is intended to allow the user to define which cluster elements should be
## put together into a single file. The user defines how many chromosomes are identified
## in the data:
homolog_list <- sapply(1:LG_number, function(x) NULL)
for(c in 1:LG_number){
cat(paste("Linkage group ",c," :\n"))
cat("_______________________________________\n")
cluster_tracker <- NULL #To make sure unique clusters are entered
temp_hom <- readline("Enter first cluster number: ")
temp_group <- NULL
while(temp_hom != "x") {
if(temp_hom %in% cluster_tracker){
cat("Error, this cluster was already assigned\n")
temp_hom <- readline("Please re-enter next cluster number (or x to exit): ")
} else if(temp_hom %in% names(LG_hom_stack) == FALSE){
cat("Error, impossible cluster assignment\n")
temp_hom <- readline("Please re-enter next cluster number (or x to exit): ")
}else{
cluster_tracker <- c(cluster_tracker,temp_hom)
temp_group <- c(temp_group,temp_hom)
temp_hom <- readline("Enter next cluster number (or x to exit): ")
}
}
homolog_list[[c]] <- temp_group
cat("_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ \n")
}
cat("Clusters defined as: \n")
print(homolog_list)
user_abort <- "n"
suppressWarnings(
if(length(setdiff(1:length(LG_hom_stack),as.numeric(unlist(homolog_list)))) != 0){
cat("The following cluster(s) have not been assigned to any linkage group:\n")
print(setdiff(1:length(LG_hom_stack),as.numeric(unlist(homolog_list))))
user_abort <- readline("Do you wish to abort and try again? (y/n) ")
})
if(user_abort != "y"){
## Now we should go and collect the marker names from each cluster, add chromosome numbers
names(homolog_list)<-as.character(1:length(homolog_list))
homolog_m<-stack(homolog_list)
colnames(homolog_m)<-c("homologue", "LG")
marker_cluster<-stack(LG_hom_stack)
colnames(marker_cluster)<-c("SxN_Marker", "homologue")
output_df<-merge(marker_cluster, homolog_m, by="homologue")
output_df<-output_df[,c("SxN_Marker","LG","homologue")]
} else{
cat("Attempt to define linkage groups has been aborted. Please re-run function.\n")
}
return(output_df)
} #createChmHomList
#'digamfrq ***********************************************************
#'@description Get the integer ratios of gametes with dosage 0:(ploidy/2)
#'from an allopolyploid parent (disomic inheritance) with given ploidy and dosage
#'@param ploidy The ploidy level of the parent
#'@param dosage The parental dosage
#'@noRd
digamfrq <- function(ploidy, dosage) {
gp <- ploidy/2 #gamete ploidy
#first step: calculate all possible compositions of diploid parental genomes
#i.e. the numbers of subgenomes with nulliplex, simplex and duplex dosages
maxdup <- dosage %/% 2 #max nr of subgenomes with duplex dosage
mindup <- max(dosage-gp, 0) #min nr of subgenomes with duplex dosage
genomes <- matrix(integer((maxdup-mindup+1)*3), ncol=3)
colnames(genomes) <- c("nulli", "sim", "du")
for (i in 1:nrow(genomes)) {
genomes[i,3] <- mindup + i - 1
genomes[i,2] <- dosage - 2*genomes[i,3]
genomes[i,1] <- gp - sum(genomes[i, 2:3])
}
#next step: per genomic composition calculate the gamete distributions:
gam <- matrix(numeric(nrow(genomes)*(gp+1)), nrow=nrow(genomes)) #all 0
colnames(gam) <- 0:gp
for (i in 1:nrow(genomes)) {
mingamdos <- genomes[i,3] #each duplex genome contributes 1
maxgamdos <- gp-genomes[i,1] #each nulliplex genome contributes 0
bincoeff <- choose(genomes[i,2], 0:genomes[i,2])
# to return fractions:
#gam[i, (mingamdos+1):(maxgamdos+1)] <- bincoeff / sum(bincoeff)
#to return integer ratios:
gam[i, (mingamdos+1):(maxgamdos+1)] <- bincoeff
#the smallest bincoeff is always 1 so division by the gcd is not needed
}
#list(genomes, gam)
gam
} #digamfrq
#'gcd ***********************************************************
#'@description Return the greatest common divisor of two integers (vectorized: x and y may be vectors)
#'@param x integer (or vector of integers)
#'@param y integer (or vector of integers)
#'@noRd
gcd <- function(x,y) {
r <- x %% y;
return(ifelse(r, gcd(y, r), y))
} #gcd
#'gcd_all ***********************************************************
#'@description Return the greatest common divisor of all elements of vector x
#'@param x Input vector
#'@noRd
gcd_all <- function(x) {
if (length(x) == 1) x else {
if (length(x) == 2) gcd(x[1], x[2]) else
gcd_all(c(gcd(x[1], x[2]), x[3:length(x)]))
}
} #gcd_all
#'getConsensusGeno ***********************************************************
#'@description Get consensus genotype
#'@param geno A vector with scores (0..ploidy or NA) for all samples of one parent.
#'@param maxNAfrac The maximum fraction of missing scores allowed to assign a high-confidence
#'consensus geno score; if more are missing a missing geno score is returned (the default requires
#'MORE than half of the samples to be scored, so one missing out of 2 samples already causes a missing geno)
#'@param lowconf.NAfrac if the fraction missing scores is more than \code{maxNAfrac} but less than \code{lowconf.NAfrac},
#'or if there is only one sample, a \code{lowconf.geno} score is assigned
#'@return Returns a list with 4 components:
#'\item{geno}{
#'The consensus of the dosages in vector geno. NA if geno is empty,
#'if all elements of geno are NA, if there are different non-NA
#'values in geno, or if the fraction of NA values in geno larger than \code{maxNAfrac}
#'}
#'\item{lowconf.geno}{
#'If there is no conflict and the fraction of missing scores
#'is between \code{maxNAfrac} and \code{lowconf.NAfrac}, the consensus is assigned
#'as a "low-confidence" geno - this needs confirmation from the
#'F1 segregation to be accepted}
#'\item{conflict}{
#'\code{TRUE} if there are different non-NA values in geno, else \code{FALSE}
#'}
#'\item{NAfrac}{
#'The fraction NA in geno (0.5 if length(geno)==0)
#'}
#'@noRd
getConsensusGeno <- function(geno,
maxNAfrac=0.499,
lowconf.NAfrac=0.751) {
nwgeno <- NA; nwlogeno <- NA; conflict <- FALSE; NAfrac <- 0
if (length(geno) == 0) NAfrac <- 0.5 else { #we need a NAfrac value for q2
NAfrac <- sum(is.na(geno) / length(geno))
if (NAfrac < 1.0) {
if (length(geno) == 1) nwlogeno <- geno else {
if (max(geno, na.rm=TRUE) != min(geno, na.rm=TRUE)) {
conflict <- TRUE
} else if (NAfrac <= maxNAfrac) {
nwgeno <- min(geno, na.rm=TRUE)
} else if (NAfrac <= lowconf.NAfrac) {
nwlogeno <- min(geno, na.rm=TRUE)
}
}
}
}
list(geno=nwgeno, lowconf.geno=nwlogeno, conflict=conflict, NAfrac=NAfrac)
} #getConsensusGeno
#'getMatchParents ***********************************************************
#'@description Does the specified segtype match the parental genotypes?
#'@param parGeno integer vector of length 2 with the two parental (consensus) dosages (can each be NA)
#'@param seginfoItem one of the items (segtypes) of a list as returned by \code{\link{calcSegtypeInfo}}
#'@noRd
getMatchParents <- function(parGeno,
seginfoItem) {
p <- which(!is.na(parGeno))
if (length(p) == 0) {
#both parental genotypes unknown
return("Unknown")
} else if (length(p) == 1) {
#only one of the parental genotypes known
if (sum(seginfoItem$pardosage[, p] == parGeno[p]) > 0)
return("OneOK") else return("No")
} else {
#both parental genotypes known
i <- which(seginfoItem$pardosage[, 1] == parGeno[1])
if (length(i) == 1 && parGeno[2] == seginfoItem$pardosage[i, 2])
return("Yes") else return("No")
}
} #getMatchParents
#' getPargeno
#' @description Get parental genotypes
#' @param P1consensus the consensus scores of parent1
#' @param P2consensus the consensus scores of parent2
#' @param segtype the name of the selected segtype (selfit or (in the new polyploid version) bestParentfit from \code{checkF1};
#' should occur in names(seginfo); OR the number of the segtype in seginfo
#' @param matchParents "Yes" if both consensus parental dosages match segtype
#' (bestParentfit.matchParents from \code{checkF1})
#' @param seginfo a list as returned by \code{calcSegtypeInfo}, with the par.dosages
#' limited to those fitting the auto, allo and mixed parameters used to calculate \code{checkF1}
#' @param allparentscores if FALSE (default) and more than one combination would be
#' possible, c(NA, NA) is returned. If TRUE then a matrix with
#' one row for each possibility and two columns for the 2 parental
#' dosages is returned.
#' @return an integer vector of two elements: the inferred dosages
#' of parent 1 and 2; or (if allparentscores is TRUE and there are
#' multiple combinations) a 2-column matrix with one row per parental combination
#' @noRd
getPargeno <- function(P1consensus,
P2consensus,
segtype,
matchParents=NULL,
seginfo,
allparentscores=FALSE) {
# we try to fill in the expected parental dosage
# based on the segtype and the consensus of the observed parental scores:
parent.sc <- c(P1consensus, P2consensus)
if (is.character(segtype)) {
segtype <- which(names(seginfo) == segtype)
if (length(segtype) != 1) stop("internal error in getPargeno")
} #else segtype is already the number of the segtype
exp.par <- seginfo[[segtype]]$pardosage #already selected to match
# auto/allo/mixed
# now get the final parental dosage from consensus and segtype:
if (nrow(exp.par) == 1) {
# both parents must have the same genotype so we can fill these in,
# irrespective of the observed parental genotype and the matchParent status:
parent.sc <- exp.par[1,]
} else {
if (is.null(matchParents)) {
matchParents <- getMatchParents(parGeno=parent.sc,
seginfoItem=seginfo[[segtype]])
}
if (matchParents != "Yes") {
# there is more than one possible parental combination.
# if matchParents=="Yes" we don't need to do anything;
# else (matchParents= Unknown, OneOK or No) we can only fill in the parents
# if one parent matches one combination and the other matches none
pmatch <- rep(NA, 2)
for (p in 1:2) pmatch[p] <- match(parent.sc[p], exp.par[, p])
if (sum(!is.na(pmatch)) == 1) {
#one parent matches one expected parental combination and the other
#doesn't match any
r <- min(pmatch, na.rm=TRUE)
parent.sc <- exp.par[r,]
}
else {
# two possibilities:
# - both parents each match a different row of exp.par
# - none of the parents matches a row of exp.par
# (if both parents would match the same row, matchParents would be Yes)
# In both cases we cannot assign the parental genotypes:
if (allparentscores) {
parent.sc <- exp.par #a 2-column matrix with 2 or more rows
} else parent.sc <- rep(NA, 2)
}
} #matchParents!="Yes"
} #nrow(exp.par) == 1 else
parent.sc
} #getPargeno
#'@title Get substrings from the lefthand side
#'@description Get substrings from the lefthand side
#'@usage leftstr(x, n)
#'@param x a character vector (or something having an as.character method)
#'@param n a single number: if n>=0, the leftmost n characters of each element
#'of x are selected, if n<0 the (-n) rightmost characters are omitted
#'@return character vector with leftside substrings of x
#'@noRd
leftstr <- function(x, n) {
#vectorized for x, not n
#n>=0: take the leftmost n characters
#n<0: take all but the rightmost (-n) characters
if (n >= 0) {
substr(x, 1, n) #automatically converts x to character if needed
} else {
#n < 0
if (!is.character(x)) x <- as.character(x)
len <- nchar(x)
substr(x, 1, len + n)
}
} #leftstr
#' Calculate recombination frequency, LOD and phase using genotype probabilities, using the maximum likelihood estimation of
#' recombination frequency estimated using an approximated approach (using sums of genotype probabilities per dosage class as proxy for discrete counts)
#' @description \code{linkage.gp.approx} is used to calculate recombination frequency, LOD and phase
#' @param probgeno_df A data frame as read from the scores file produced by function \code{saveMarkerModels} of R package \code{fitPoly}, or alternatively, a data frame containing the following columns:
#' \itemize{
#' \item{SampleName}{
#' Name of the sample (individual)
#' }
#' \item{MarkerName}{
#' Name of the marker
#' }
#' \item{P0}{
#' Probabilities of dosage score '0'
#' }
#' \item{P1...}{
#' Probabilities of dosage score '1' etc. (up to max offspring dosage, e.g. P4 for tetraploid population)
#' }
#' \item{maxP}{
#' Maximum genotype probability identified for a particular individual and marker combination
#' }
#' \item{maxgeno}{
#' Most probable dosage for a particular individual and marker combination
#' }
#' \item{geno}{
#' Most probable dosage for a particular individual and marker combination, if \code{maxP} exceeds a user-defined threshold (e.g. 0.9), otherwise \code{NA}}
#' }
#' @param chk Output list as returned by function \code{\link{checkF1}}
#' @param pardose The most likely (discrete) parental dosage scores, used mainly for internal calls of this function
#' @param markertype1 A vector of length 2 specifying the first markertype to compare. The first element specifies the dosage in \code{target_parent}
#' (and the second in the other parent). By default \code{NULL}, in which case all marker types are analysed.
#' @param markertype2 A vector of length 2 specifying the second markertype to compare. This argument is optional. If not specified, the function will calculate
#' linkage within the markertype as specified by \code{markertype1}, or if both arguments \code{markertype1} and \code{markertype2} are \code{NULL}, between all marker types.
#' The first element specifies the dosage in \code{target_parent} (and the second in the other parent).
#' @param target_parent Which parent is being targeted (only acceptable options are "P1" or "P2"), ie. which parent is of specific interest?
#' If this is the maternal parent, please specify as "P1". If the paternal parent, please use "P2". The actual identifiers of the two parents are
#' entered using the arguments \code{parent1_replicates} and \code{parent2_replicates}.
#' @param F1 Offspring names
#' @param parent1_replicates Names of maternal replicates
#' @param parent2_replicates Names of paternal replicates
#' @param ploidy Ploidy of parent 1
#' @param ploidy2 Ploidy of parent 2
#' @param LOD_threshold Minimum LOD score of linkages to report. Recommended to use for large number (> millions) of marker comparisons in order to reduce memory usage.
#' @param G2_test Logical. See \code{\link{linkage.gp}}
#' @param prefPars Preferential pairing parameters. See \code{\link{linkage.gp}}
#' @param combinations_per_iter Combinations per iteration. See \code{\link{linkage.gp}}
#' @param iter_RAM RAM per iteration. See \code{\link{linkage.gp}}
#' @param verbose Write messages to standard output?
#' @param ncores Number of cores to use.
#' @return
#' Returns a data.frame with columns:
#' \itemize{
#' \item{marker_a}{
#' first marker of comparison
#' }
#' \item{marker_b}{
#' second marker of comparison
#' }
#' \item{r}{
#' estimated recombination frequency
#' }
#' \item{LOD}{
#' associated LOD score of the r estimate
#' }
#' \item{phase}{
#' phase between markers
#' }
#' }
#' @noRd
linkage.gp.approx <- function(probgeno_df,
chk,
pardose,
markertype1,
markertype2,
target_parent,
F1,
parent1_replicates,
parent2_replicates,
ploidy,
ploidy2,
LOD_threshold = 0,
G2_test,
prefPars = c(0, 0),
combinations_per_iter = NULL,
iter_RAM = 500,
verbose = TRUE,
ncores){
win <- Sys.info()["sysname"] == "Windows"
if (win) {
cl <- parallel::makeCluster(ncores)
doParallel::registerDoParallel(cl)
} else {
doParallel::registerDoParallel(cores = ncores)
}
prog_ploidy <- (ploidy + ploidy2)/2
## Here we still use dosage_matrix instead of dosage_data because we do not filter for the marker type
## Find all marker's P1 and P2
score_P <- probgeno_df[probgeno_df$SampleName %in% parent1_replicates | probgeno_df$SampleName %in% parent2_replicates,]
markers <- as.character(unique(score_P$MarkerName))
#make sure markers appear in the PossibleMarkerType
markers <- markers[markers %in% pardose$MarkerName]
convert_probability_score <- function(MT_mat = RealMT,
expected_MT = markertype1,
target_parent = "P1",
scores_mat = probgeno_df,
ploidy = ploidy){
if(target_parent == "P1"){
expected_MT <- paste0(expected_MT,collapse = ".")
Tpart <- 1
NTpart <- 2
}else{
expected_MT <- paste0(c(expected_MT[2],expected_MT[1]),collapse = ".")
Tpart <- 2
NTpart <- 1
}
MT_mat$MT <- paste0(MT_mat$parent1, ".",MT_mat$parent2)
if(any(MT_mat$MT != expected_MT)){
change_markers <- MT_mat[which(MT_mat$MT != expected_MT),]
sub_scores_mat <- scores_mat[scores_mat$MarkerName %in% as.character(change_markers$MarkerName),]
#Step 1: reverse all: 0 to 4, 1 to 3, 2 keep as 2
dosc <- list()
for(c in 0:ploidy){
dosc[[as.character(c)]] <- sub_scores_mat[[paste0("P",c)]]
}
for(c in 0:ploidy){
sub_scores_mat[paste0("P",c)] <- dosc[[as.character(ploidy-c)]]
}
change_markers_new <- ploidy - change_markers[,c(2,3)]
change_markers_new$MT <- paste0(change_markers_new$parent1, ".",change_markers_new$parent2)
change_markers_new <- cbind("MarkerName" = change_markers$MarkerName,change_markers_new)
scores_mat[scores_mat$MarkerName %in% as.character(change_markers$MarkerName),] <- sub_scores_mat
#Step 2: make the 2 - 0
if(any(change_markers[paste0("parent",Tpart)] < change_markers_new[paste0("parent",Tpart)])){
change_markers_new1 <- change_markers_new[which(change_markers[paste0("parent",Tpart)] < change_markers_new[paste0("parent",Tpart)]),]
sub_scores_mat <- scores_mat[scores_mat$MarkerName %in% as.character(change_markers_new1$MarkerName),]
for(t in 0:(ploidy/2-1)){
sub_scores_mat[paste0("P",t)] <- sub_scores_mat[paste0("P",(ploidy/2 + t))] + sub_scores_mat[paste0("P",t)]
}
change_markers_new1[paste0("parent",Tpart)]<- change_markers[which(change_markers[paste0("parent",Tpart)] < change_markers_new[paste0("parent",Tpart)]),
paste0("parent",Tpart)]
change_markers_new1[paste0("parent",NTpart)] <- replicate(nrow(change_markers_new1),0)
change_markers_new[which(change_markers[paste0("parent",Tpart)] < change_markers_new[paste0("parent",Tpart)]),]<- change_markers_new1
change_markers_new$MT <- paste0(change_markers_new$parent1,".",change_markers_new$parent2)
scores_mat[scores_mat$MarkerName %in% as.character(change_markers_new1$MarkerName),] <- sub_scores_mat
}
#Step 3: substract by ploidy/2
if(any(change_markers_new$MT != expected_MT)){
change_markers_new2 <- change_markers_new[which(change_markers_new$MT != expected_MT),]
sub_scores_mat <- scores_mat[scores_mat$MarkerName %in% as.character(change_markers_new2$MarkerName),]
dosc <- list()
for(c in 0:ploidy){
dosc[[as.character(c)]] <- sub_scores_mat[[paste0("P",c)]]
}
for(c in 0:ploidy){
sub_scores_mat[paste0("P",c)] <- dosc[[as.character(ploidy-c)]]
}
change_markers_new3 <- ploidy - change_markers_new2[,c(2,3)]
change_markers_new3$MT <- paste0(change_markers_new3$parent1, ".",change_markers_new3$parent2)
change_markers_new3 <- cbind("MarkerName" = change_markers_new2$MarkerName,change_markers_new3)
change_markers_new[which(change_markers_new$MT != expected_MT),] <- change_markers_new3
scores_mat[scores_mat$MarkerName %in% as.character(change_markers_new3$MarkerName),] <- sub_scores_mat
if(any(change_markers_new2[paste0("parent",Tpart)] < change_markers_new3[paste0("parent",Tpart)])){
change_markers_new4 <- change_markers_new3[which(change_markers_new2[paste0("parent",Tpart)] < change_markers_new3[paste0("parent",Tpart)]),]
sub_scores_mat <- scores_mat[scores_mat$MarkerName %in% as.character(change_markers_new4$MarkerName),]
for(t in 0:(ploidy/2-1)){
sub_scores_mat[paste0("P",t)] <- sub_scores_mat[paste0("P",(ploidy/2 + t))]+ sub_scores_mat[paste0("P",t)]
}
change_markers_new4[paste0("parent",Tpart)]<- change_markers_new2[which(change_markers_new2[paste0("parent",Tpart)] < change_markers_new3[paste0("parent",Tpart)]),
paste0("parent",Tpart)]
change_markers_new4[paste0("parent",NTpart)] <- replicate(nrow(change_markers_new4),0)
change_markers_new[change_markers_new$MarkerName %in% as.character(change_markers_new4$MarkerName),] <- change_markers_new4
change_markers_new$MT <- paste0(change_markers_new$parent1,".",change_markers_new$parent2)
scores_mat[scores_mat$MarkerName %in% as.character(change_markers_new4$MarkerName),] <- sub_scores_mat
}
}
if(any(change_markers_new$MT != expected_MT)){
writeLines("There is some errors")
}
}
return(scores_mat)
} #convert_probability_score
pardose$markertype <- paste0(pardose$parent1, ".",pardose$parent2)
#filter the possible markertype
if(nrow(pardose) != 0){
#if these is no markertype2, only pick out the markers of markertype1
if(is.null(markertype2)){
#check whether there is a need to convert dosages
if(target_parent == "P1"){
MT <- markertype1
}else{
MT <- c(markertype1[2],markertype1[1])
}
#find all possible marker type
if(any(MT %in% c(0,ploidy))){
zero_loc <- which(MT %in% c(0,ploidy))
non_zero_loc <- which(!MT %in% c(0,ploidy))
zero <- c(0,4)
non_zero <- unique(c(MT[non_zero_loc],ploidy-MT[non_zero_loc]))
if(length(non_zero) == 0){
MT_matrix <- data.frame("Var1" = zero,
"Var2" = zero)
MT_sum <- paste0(MT_matrix$Var1,".", MT_matrix$Var2) #all possible MT - character
}else{
if(zero_loc < non_zero_loc){
MT_matrix <- expand.grid(zero,non_zero)
MT_sum <- paste0(MT_matrix$Var1,".", MT_matrix$Var2) #all possible MT - character
}else{
MT_matrix <- expand.grid(non_zero,zero)
MT_sum <- paste0(MT_matrix$Var1,".", MT_matrix$Var2)
}
}
}else{
MT_c <- ploidy - MT
MT_sum <- unique(c(paste0(MT, collapse = "."), paste0(MT_c, collapse = ".")))
}
PossibleMarker <- as.character(pardose[pardose$markertype %in% MT_sum,]$MarkerName)
if(length(PossibleMarker) != 0){
if(length(PossibleMarker) == 1){
combinations <- data.frame(PossibleMarker,PossibleMarker)
}else{
combinations <- t(combn(PossibleMarker,2))
}
colnames(combinations) <- c("markera","markerb")
rownames(combinations) <- seq(1, nrow(combinations),1)
#Do the marker type conversion
RealMT <- pardose[pardose$MarkerName %in% PossibleMarker,c("MarkerName","parent1","parent2")]
probgeno_df1 <- convert_probability_score(MT_mat = RealMT,
expected_MT = markertype1,
target_parent = target_parent,
scores_mat = probgeno_df,
ploidy = ploidy)
probgeno_df <- probgeno_df1
}else{
message("There are not enough markers")
return(NULL)
}
}else{ #there is markertype1 and markertype2
if(target_parent == "P1"){
MT1 <- markertype1
MT2 <- markertype2
}else{
MT1 <- c(markertype1[2], markertype1[1])
MT2 <- c(markertype2[2], markertype2[1])
}
#find all possibilities of MT
if(any(MT1 %in% c(0,ploidy))){
zero_loc <- which(MT1 %in% c(0,ploidy))
non_zero_loc <- which(!MT1 %in% c(0,ploidy))
zero <- c(0,ploidy) #BUG? replaced 4 with ploidy..
non_zero <- unique(c(MT1[non_zero_loc],ploidy-MT1[non_zero_loc]))
if(length(non_zero) == 0){
MT1_matrix <- data.frame("Var1" = zero,
"Var2" = zero)
MT1_sum <- paste0(MT1_matrix$Var1,".", MT1_matrix$Var2) #all possible MT - character
}else{
if(zero_loc < non_zero_loc){
MT1_matrix <- expand.grid(zero,non_zero)
MT1_sum <- paste0(MT1_matrix$Var1,".", MT1_matrix$Var2) #all possible MT - character
}else{
MT1_matrix <- expand.grid(non_zero,zero)
MT1_sum <- paste0(MT1_matrix$Var1,".", MT1_matrix$Var2) #all possible MT - character
}
}
}else{
MT1_c <- ploidy - MT1
MT1_sum <- unique(c(paste0(MT1, collapse = "."), paste0(MT1_c, collapse = ".")))
}
if(any(MT2 %in% c(0,ploidy))){
zero_loc <- which(MT2 %in% c(0,ploidy))
non_zero_loc <- which(!MT2 %in% c(0,ploidy))
zero <- c(0,ploidy)
non_zero <- unique(c(MT2[non_zero_loc],ploidy-MT2[non_zero_loc]))
if(length(non_zero) == 0){
MT2_matrix <- data.frame("Var1" = zero,
"Var2" = zero)
MT2_sum <- paste0(MT2_matrix$Var1,".", MT2_matrix$Var2) #all possible MT - character
}else{
if(zero_loc < non_zero_loc){
MT2_matrix <- expand.grid(zero,non_zero)
MT2_sum <- paste0(MT2_matrix$Var1,".", MT2_matrix$Var2) #all possible MT - character
}else{
MT2_matrix <- expand.grid(non_zero,zero)
MT2_sum <- paste0(MT2_matrix$Var1,".", MT2_matrix$Var2) #all possible MT - character
}
}
}else{
MT2_c <- ploidy - MT2
MT2_sum <- unique(c(paste0(MT2, collapse = "."), paste0(MT2_c, collapse = ".")))
}
Possiblemarker1 <- as.character(pardose[pardose$markertype %in% MT1_sum,]$MarkerName)
Possiblemarker2 <- as.character(pardose[pardose$markertype %in% MT2_sum,]$MarkerName)
if(length(Possiblemarker1) != 0 & length(Possiblemarker2 != 0)){
combinations <- expand.grid(Possiblemarker1,Possiblemarker2)
colnames(combinations) <- c("markera","markerb")
rownames(combinations) <- seq(1, nrow(combinations),1)
RealMT1 <- pardose[pardose$MarkerName %in% Possiblemarker1,c("MarkerName","parent1","parent2")]
RealMT2 <- pardose[pardose$MarkerName %in% Possiblemarker2,c("MarkerName","parent1","parent2")]
probgeno_df1 <- convert_probability_score(MT_mat = RealMT1,
expected_MT = markertype1,
target_parent = target_parent,
scores_mat = probgeno_df,
ploidy = ploidy)
probgeno_df2 <- convert_probability_score(MT_mat = RealMT2,
expected_MT = markertype2,
target_parent = target_parent,
scores_mat = probgeno_df1,
ploidy = ploidy)
probgeno_df <- probgeno_df2
}else{
combinations <- data.frame()
message("There is not enough markers")
return(NULL)
}
}
} #the return is after filtering, the already made marker combinations
seginfo <- calcSegtypeInfo(ploidy, ploidy2) #obtain the seginfo for further use
polysomic <- chk$meta$polysomic
disomic <- chk$meta$disomic
mixed <- chk$meta$mixed
seginfo <- selSegtypeInfo(seginfo, polysomic, disomic, mixed)
seginfoSummary <- segtypeInfoSummary(seginfo)
# seginfo <- polymapR:::selSegtypeInfo(seginfo, polysomic, disomic, mixed)
# seginfoSummary <- polymapR:::segtypeInfoSummary(seginfo)
if(polysomic){
pairing <- "random"
pairing_abbr <- "r"
}
if(disomic){
if(polysomic) stop("Cannot run linkage.gp function if both polysomic and disomic are TRUE. Please re-run checkF1 with one of these options FALSE")
pairing <- "preferential"
pairing_abbr <- "p"
}
##Here we need to call the segregation table according to the ploidy
if(is.null(markertype2)){
fname <- paste0(pairing_abbr, prog_ploidy, "_", paste0(markertype1, collapse = "."), "_", paste0(markertype1, collapse = "."))
}else{
fname <- paste0(pairing_abbr, prog_ploidy, "_", paste0(markertype1, collapse = "."), "_", paste0(markertype2, collapse = "."))
}
seg.fname <- paste0("seg_p", ploidy, "_", pairing)
seg <- get(seg.fname)
# seg <- get(seg.fname, envir = getNamespace("polymapR"))
segpos <- c(0:prog_ploidy)
segoff <- seg[, 3:ncol(seg)]
segpar <- seg[, c("dosage1", "dosage2")]
if (is.null(combinations_per_iter)) {
expected_nr_comparisons <- nrow(combinations)
bites_needed <- 14 * expected_nr_comparisons
reserve_RAM <- iter_RAM * 1e+06
number_of_iterations <- ceiling(bites_needed/reserve_RAM)
combinations_per_iter <- ceiling(nrow(combinations)/number_of_iterations)
if (number_of_iterations == 1)
reserve_RAM <- bites_needed
if (verbose) {
message(paste0("Number of combinations per iteration: ",
combinations_per_iter, "\nReserving approximately ",
round((reserve_RAM + (110 * combinations_per_iter))/1e+06),
"MB RAM per iteration"))
}
}
split_factor <- ceiling((1:nrow(combinations))/combinations_per_iter)
combinations_list <- split(as.data.frame(combinations), split_factor,drop = TRUE)
if (verbose) {
message(paste("In total", nrow(combinations), "combinations, which will run in",
length(unique(split_factor)), "iteration(s)...",
sep = " "))
}
if(is.null(markertype2)){
offspring_dosage1 <- offspring_dosage2 <- segpos[segoff[segpar$dosage1 == markertype1[1] &
segpar$dosage2 == markertype1[2], ] > 0]
}else{
offspring_dosage1 <- segpos[segoff[segpar$dosage1 == markertype1[1] &
segpar$dosage2 == markertype1[2], ] > 0]
offspring_dosage2 <- segpos[segoff[segpar$dosage1 == markertype2[1] &
segpar$dosage2 == markertype2[2], ] > 0]
}
dosage_combinations <- expand.grid(offspring_dosage1, offspring_dosage2)
dosage_levels <- paste0("n_", dosage_combinations[, 1], dosage_combinations[, 2]) #make the name looks like: n_00, n_10, n_01, n_11
rownames(dosage_combinations) <- dosage_levels
convert_scores <- function(scores,
markername,
samplename){
TempArray <- array(0, dim = c(length(samplename),ploidy+1,length(markername)),
dimnames = list(c(samplename),
paste0("P",seq(0,ploidy)),
c(markername)))
for(m in markername){
O_temp <- scores[scores$MarkerName == m,]
rownames(O_temp) <- O_temp$SampleName
TempArray[,,m] <- as.matrix(O_temp[samplename,paste0("P",seq(0,ploidy))])
}
return(TempArray)
}
array <- convert_scores(scores = probgeno_df,
markername = unique(c(as.character(combinations[,1]),as.character(combinations[,2]))),
samplename = as.character(unique(probgeno_df$SampleName))[as.character(unique(probgeno_df$SampleName)) %in% F1])
rfun <- get(fname)
# rfun <- get(fname, envir = getNamespace("polymapR"))
r_LOD_tot <- foreach::foreach(i = 1:length(combinations_list),
.combine = rbind, .inorder = F) %dopar% {
combs <- as.matrix(combinations_list[[i]])
if(nrow(combs) > 1){
a <- array[,,combs[,1]]
b <- array[,,combs[,2]]
compare_vec <- matrix(nrow = nrow(combs),ncol = length(dosage_levels))
for (l in 1:length(dosage_levels)){
proba <- a[,paste0("P",as.character(dosage_combinations[l, 1])),]
probb <- b[,paste0("P",as.character(dosage_combinations[l, 2])),]
colnames(proba) <- colnames(probb) <- NULL
sum <- colSums(proba * probb, na.rm = TRUE)
compare_vec[,l] <- sum
}
}else if(nrow(combs) == 1){
a <- array[,,combs[,1]]
b <- array[,,combs[,2]]
compare_vec <- matrix(nrow = nrow(combs),ncol = length(dosage_levels))
for (l in 1:length(dosage_levels)){
proba <- a[,paste0("P",as.character(dosage_combinations[l, 1]))]
probb <- b[,paste0("P",as.character(dosage_combinations[l, 2]))]
colnames(proba) <- colnames(probb) <- NULL
sum <- sum(proba * probb, na.rm = TRUE)
compare_vec[,l] <- sum
}
}else{
compare_vec <- NULL
}
count_matrix <- compare_vec
if (G2_test) {
off1coords <- sapply(offspring_dosage1, function(n) which(n == dosage_combinations[, 1]))
off2coords <- sapply(offspring_dosage2, function(n) which(n == dosage_combinations[, 2]))
offspring1sums <- matrix(sapply(1:ncol(off1coords),
function(c) rowSums(count_matrix[, off1coords[,c], drop = F])), ncol = ncol(off1coords))
offspring2sums <- matrix(sapply(1:ncol(off2coords),
function(c) rowSums(count_matrix[, off2coords[,c], drop = F])), ncol = ncol(off2coords))
Totals <- rowSums(count_matrix)
Sumcounts <- cbind(offspring1sums, offspring2sums)
combos <- expand.grid(1:length(offspring_dosage1),
(length(offspring_dosage1) + 1):(length(offspring_dosage1) + length(offspring_dosage2)))
expected_counts <- matrix(sapply(1:nrow(combos),
function(r) Sumcounts[, combos[r, 1]] * Sumcounts[,combos[r, 2]]/Totals), ncol = nrow(combos))
G <- 2 * rowSums(matrix(sapply(1:ncol(count_matrix),
function(c) count_matrix[, c, drop = F] * (log(pmax(count_matrix[,c, drop = F], 1)) - log(pmax(expected_counts[,
c, drop = F], 1)))), ncol = ncol(count_matrix)))
df <- (length(offspring_dosage1) - 1) * (length(offspring_dosage2) - 1)
if (df > 1) {
e <- exp(-G/(2 * (df - 1)))
G1 <- ((4 - e) * e - 3) * (df - 1) + G
LOD_independence <- G1/(2 * log(10))
}else {
LOD_independence <- G/(2 * log(10))
}
}
summary_linkage <- data.frame()
if(!is.null(compare_vec)){
colnames(count_matrix) <- dosage_levels
count <- count_matrix
if(pairing == "random"){
r_list <- rfun(count)
r_list1 <- rfun(round(count))
if(any(r_list$r_mat < 0) & all(r_list1$r_mat > 0)) {
r_list$r_mat[r_list$r_mat < 0] <- r_list1$r_mat[r_list$r_mat < 0]
}
}
if(pairing == "preferential"){
r_list <- rfun(count, p1 = prefPars[1], p2 = prefPars[2])
r_list1 <- rfun(round(count), p1 = prefPars[1], p2 = prefPars[2])
if(any(r_list$r_mat < 0) & all(r_list1$r_mat > 0)) {
r_list$r_mat[which(r_list$r_mat < 0),] <- r_list1$r_mat[which(r_list$r_mat < 0),]
}
}
r_over_0.5 <- r_list$r_mat >= 0.5
r_list$r_mat[r_over_0.5] <- 1
# based on phasing strategy chose phasing
if (r_list$phasing_strategy == "MLL") {
if (!any(is.na(r_list$logL_mat))) {
# normal case
r_list$logL_mat[r_over_0.5] <- -1e4
phase_num <- apply(r_list$logL_mat, 1, which.max)
} else {
# exceptional case (happens only for few linkage functions)
# works with missing values
r_list$logL_mat[r_over_0.5] <- NA
phase_num <- vector(length = nrow(r_list$logL_mat))
for (i in seq(nrow(r_list$logL_mat))) {
max_logL <- which.max(r_list$logL_mat[i,])
if (length(max_logL) == 0) {
phase_num[i] <- which.min(r_list$r_mat[i,])
} else if (is.na(r_list$r_mat[i, max_logL])) {
phase_num[i] <- which.min(r_list$r_mat[i,])
} else {
phase_num[i] <- max_logL
}
}
}
} else if (r_list$phasing_strategy == "MINR") {
phase_num <- apply(r_list$r_mat, 1, function(x) {
which.min(x)[1]
}
)
} else {
stop("Unknown phasing strategy. Check rf functions.")
}
# fix for NA values of r
if(any(is.na(phase_num))){
if(!"unknown" %in% r_list$possible_phases)
r_list$possible_phases <- c(r_list$possible_phases,"unknown")
phase_num[is.na(phase_num)] <- length(r_list$possible_phases)
}
phase <- r_list$possible_phases[phase_num]
# r based on choice of phase
r_mat_phase <- cbind(r_list$r_mat, phase_num)
r <- apply(r_mat_phase, 1, function(x) {
x[x["phase_num"]]
})
# LOD based on choice of phase
LOD_mat_phase <- cbind(r_list$LOD_mat, phase_num)
LOD <- apply(LOD_mat_phase, 1, function(x) {
x[x["phase_num"]]
})
rm(r_list)
r_LOD <- data.frame(combs, r, LOD, phase)
negative_r <- which(r_LOD$r < 0 | r_LOD$r == 1)
r_LOD$r[negative_r] <- 0.499
r_LOD$LOD[negative_r] <- 0
levels(r_LOD$phase) <- c(levels(r_LOD$phase), "unknown")
r_LOD$phase[negative_r] <- "unknown"
r_LOD$LOD[r_LOD$LOD < 0] <- 0
r_LOD <- r_LOD[r_LOD$LOD >= LOD_threshold,]
colnames(r_LOD)[1:2] <- c("marker_a", "marker_b")
if(G2_test){
r_LOD <-
cbind(r_LOD[, c("marker_a", "marker_b", "r", "LOD", "phase")], LOD_independence)
} else {
r_LOD <-
r_LOD[, c("marker_a", "marker_b", "r", "LOD", "phase")]
}
summary_linkage <- rbind(summary_linkage, r_LOD)
}
return(summary_linkage)
}
if (win) parallel::stopCluster(cl)
return(r_LOD_tot)
}
#' Calculate recombination frequency, LOD and phase using genotype probabilities, using the maximum likelihood estimation of
#' recombination frequency estimated using the mappoly package
#' @description \code{linkage.gp.mappoly} is used to calculate recombination frequency, LOD and phase, using mappoly to perform unbiased estimation of recombination frequency.
#' @param probgeno_df A data frame as read from the scores file produced by function \code{saveMarkerModels} of R package \code{fitPoly}, or alternatively, a data frame containing the following columns:
#' \itemize{
#' \item{SampleName}{
#' Name of the sample (individual)
#' }
#' \item{MarkerName}{
#' Name of the marker
#' }
#' \item{P0}{
#' Probabilities of dosage score '0'
#' }
#' \item{P1...}{
#' Probabilities of dosage score '1' etc. (up to max offspring dosage, e.g. P4 for tetraploid population)
#' }
#' \item{maxP}{
#' Maximum genotype probability identified for a particular individual and marker combination
#' }
#' \item{maxgeno}{
#' Most probable dosage for a particular individual and marker combination
#' }
#' \item{geno}{
#' Most probable dosage for a particular individual and marker combination, if \code{maxP} exceeds a user-defined threshold (e.g. 0.9), otherwise \code{NA}}
#' }
#' @param chk Output list as returned by function \code{\link{checkF1}}
#' @param pardose The most likely (discrete) parental dosage scores, used mainly for internal calls of this function
#' @param markertype1 A vector of length 2 specifying the first markertype to compare. The first element specifies the dosage in \code{target_parent}
#' (and the second in the other parent). By default \code{NULL}, in which case all marker types are analysed.
#' @param markertype2 A vector of length 2 specifying the second markertype to compare. This argument is optional. If not specified, the function will calculate
#' linkage within the markertype as specified by \code{markertype1}, or if both arguments \code{markertype1} and \code{markertype2} are \code{NULL}, between all marker types.
#' The first element specifies the dosage in \code{target_parent} (and the second in the other parent).
#' @param target_parent Which parent is being targeted (only acceptable options are "P1" or "P2"), ie. which parent is of specific interest?
#' If this is the maternal parent, please specify as "P1". If the paternal parent, please use "P2". The actual identifiers of the two parents are
#' entered using the arguments \code{parent1_replicates} and \code{parent2_replicates}.
#' @param F1 Offspring names
#' @param parent1_replicates Names of maternal replicates
#' @param parent2_replicates Names of paternal replicates
#' @param ploidy Ploidy of parent 1
#' @param ploidy2 Ploidy of parent 2
#' @param LOD_threshold Minimum LOD score of linkages to report. Recommended to use for large number (> millions) of marker comparisons in order to reduce memory usage.
#' @param ncores Number of cores to use.
#' @return
#' Returns a data.frame with columns:
#' \itemize{
#' \item{marker_a}{
#' first marker of comparison
#' }
#' \item{marker_b}{
#' second marker of comparison
#' }
#' \item{r}{
#' estimated recombination frequency
#' }
#' \item{LOD}{
#' associated LOD score of the r estimate
#' }
#' \item{phase}{
#' phase between markers
#' }
#' }
#' @noRd
linkage.gp.mappoly <- function(probgeno_df,
chk,
pardose,
markertype1,
markertype2,
target_parent,
F1,
parent1_replicates,
parent2_replicates,
ploidy,
ploidy2,
ncores,
LOD_threshold = 0){
###########################################################################
## Generate marker conversion information (used later to re-code phasing)
dummy.mat <- as.matrix(pardose[,c("parent1","parent2")])
rownames(dummy.mat) <- pardose$MarkerName
colnames(dummy.mat) <- c("P1","P2")
dummy.mat <- cbind(dummy.mat,dummy=0) #add a dummy offspring
conv <- convert_marker_dosages(dosage_matrix = dummy.mat,
ploidy = ploidy,
ploidy2 = ploidy2,
marker_conversion_info = TRUE)
pc <- conv$dosage_matrix[,c("P1","P2")]
##############################################################
if(!is.null(markertype1) | !is.null(markertype2)){ #so restrict the data to particular markertype(s)
if(identical(markertype1, markertype2)) {
markertype2 <- NULL
}
other_parent <- setdiff(c("P1","P2"), target_parent)
if(is.null(markertype2)){ #single marker type
mrks <- pc[,target_parent] == markertype1[1] & pc[,other_parent] == markertype1[2]
} else{ #two marker types
mrks <- (pc[,target_parent] == markertype1[1] & pc[,other_parent] == markertype1[2]) |
(pc[,target_parent] == markertype2[1] & pc[,other_parent] == markertype2[2])
# Note: this means there is an inefficiency in the approach, as estimates of a marker type with itself are also included here
# Also means that we should filter out inter-markertype estimates from the output at the end, otherwise unexpected behaviour in downstream functions will ensue
}
if(length(which(mrks)) == 0) stop("No markers of type specified in markertype 1 (or markertype2) found!")
probgeno_df <- probgeno_df[probgeno_df$MarkerName %in% names(which(mrks)),]
probgeno_df <- droplevels(probgeno_df)
pardose <- pardose[pardose$MarkerName %in% names(which(mrks)),]
}
## I would like to avoid use of <<- here, but it seems necessary due to how mappoly is programmed.
## Argument pos = 1 in get() call of mappoly functions => Global Env. is environment searched for, while the call is within a function
mpdat <- mappoly::import_data_from_polymapR(input.data = probgeno_df,
ploidy = ploidy,
parent1 = parent1_replicates,
parent2 = parent2_replicates,
input.type = "probabilistic",
pardose = pardose)
## Try the following work-around to avoid use of <<- in previous call, but satisfy CRAN checks:
pos <- 1
envir = as.environment(pos)
assign(x="mpdat", value=mpdat, envir = envir)
s <- mappoly::make_seq_mappoly(input.obj = mpdat, arg = "all")
## Need to load the following function from the mappoly NameSpace to run subsequent mappoly function call:
paralell_pairwise_probability <- get("paralell_pairwise_probability", envir = getNamespace("mappoly"))
tpt.prob <- mappoly::est_pairwise_rf(input.seq = s, ncpus = ncores,
est.type = "prob")
m.prob <- mappoly::rf_list_to_matrix(input.twopt = tpt.prob, shared.alleles = TRUE)
df.prob <- convert_matrix_to_df(m = m.prob, dat = mpdat)
## Remove non-linked pairs (r = NA)
df.prob <- df.prob[!is.na(df.prob$r),]
## Now we need to filter out inter-markertype estimates, if they exist:
if(!is.null(markertype1) & !is.null(markertype2)){
mrk_a <- names(which(pc[,target_parent] == markertype1[1] & pc[,other_parent] == markertype1[2]))
mrk_b <- names(which(pc[,target_parent] == markertype2[1] & pc[,other_parent] == markertype2[2]))
set1 <- which(df.prob[,"marker_a"] %in% mrk_a & df.prob[,"marker_b"] %in% mrk_b)
set2 <- which(df.prob[,"marker_a"] %in% mrk_b & df.prob[,"marker_b"] %in% mrk_a)
## We need to reverse set2, if this is non-empty:
df.prob_out <- df.prob[set1,]
if(length(set2) > 0){
df.prob_set2 <- df.prob[set2,]
## reverse the names:
df.prob_set2$marker_a <- df.prob[set2,"marker_b"]
df.prob_set2$marker_b <- df.prob[set2,"marker_a"]
## reverse the dose codes:
df.prob_set2$dose_a_P1 <- df.prob[set2,"dose_b_P1"]
df.prob_set2$dose_a_P2 <- df.prob[set2,"dose_b_P2"]
df.prob_set2$dose_b_P1 <- df.prob[set2,"dose_a_P1"]
df.prob_set2$dose_b_P2 <- df.prob[set2,"dose_a_P2"]
## the phase codes are the same I think. I will have to check this works OK for different target and other parents.
df.prob_out <- rbind(df.prob_out,df.prob_set2)
}
} else{
df.prob_out <- df.prob
}
## extract the marker conversion info
mci <- conv$marker_conversion_info
## Extract conversion info of marker pairs in P1 and P2
P1.ci <- cbind(mci[df.prob_out[,"marker_a"],"P1"],mci[df.prob_out[,"marker_b"],"P1"])
P2.ci <- cbind(mci[df.prob_out[,"marker_a"],"P2"],mci[df.prob_out[,"marker_b"],"P2"])
######
## P1
######
P1.ci <- cbind(mci[df.prob_out[,"marker_a"],"P1"],mci[df.prob_out[,"marker_b"],"P1"])
P2.ci <- cbind(mci[df.prob_out[,"marker_a"],"P2"],mci[df.prob_out[,"marker_b"],"P2"])
p1homozygous <- pc[df.prob_out[,"marker_a"],"P1"] %in% c(0,ploidy) & pc[df.prob_out[,"marker_b"],"P1"] %in% c(0,ploidy)
p2homozygous <- pc[df.prob_out[,"marker_a"],"P2"] %in% c(0,ploidy2) & pc[df.prob_out[,"marker_b"],"P2"] %in% c(0,ploidy2)
d.a <- df.prob_out[,"dose_a_P1"]
d.b <- df.prob_out[,"dose_b_P1"]
ns <- df.prob_out[,"phase_p1"]
n10 <- d.a - ns
n01 <- d.b - ns
n00 <- ploidy - (ns + n10 + n01)
n11 <- ns
n11[P1.ci[,1] & !P1.ci[,2]] <- n01[P1.ci[,1] & !P1.ci[,2]]
n11[!P1.ci[,1] & P1.ci[,2]] <- n10[!P1.ci[,1] & P1.ci[,2]]
n11[P1.ci[,1] & P1.ci[,2]] <- n00[P1.ci[,1] & P1.ci[,2]]
df.prob_out$phase_p1.conv <- ""
df.prob_out$phase_p1.conv[n11 > 0] <- "coupling" #ignore mixed
df.prob_out$phase_p1.conv[n11 == 0 & !p1homozygous] <- "repulsion"
######
## P2
######
d.a <- df.prob_out[,"dose_a_P2"]
d.b <- df.prob_out[,"dose_b_P2"]
ns <- df.prob_out[,"phase_p2"]
n10 <- d.a - ns
n01 <- d.b - ns
n00 <- ploidy2 - (ns + n10 + n01)
n11 <- ns
n11[P2.ci[,1] & !P2.ci[,2]] <- n01[P2.ci[,1] & !P2.ci[,2]]
n11[!P2.ci[,1] & P2.ci[,2]] <- n10[!P2.ci[,1] & P2.ci[,2]]
n11[P2.ci[,1] & P2.ci[,2]] <- n00[P2.ci[,1] & P2.ci[,2]]
df.prob_out$phase_p2.conv <- ""
df.prob_out$phase_p2.conv[n11 > 0] <- "coupling" #ignore mixed
df.prob_out$phase_p2.conv[n11 == 0 & !p2homozygous] <- "repulsion"
## Select the target parent to generate correct order of composite phase
if(target_parent == "P1"){
phase <- paste0(df.prob_out[,"phase_p1.conv"],df.prob_out[,"phase_p2.conv"])
} else{
phase <- paste0(df.prob_out[,"phase_p2.conv"],df.prob_out[,"phase_p1.conv"])
}
## Assign phase unknown to pairs with no phase in either parent
phase[nchar(phase) == 0] <- "unknown"
df.prob_out$phase <- phase
## Use filtering on LOD threshold
df.prob_out <- df.prob_out[df.prob_out$LOD >= LOD_threshold,]
# time_end <- Sys.time()
# timediff <- as.numeric(time_end - time_start, units = "mins")
#
# write(
# paste(
# "\nFor",
# nrow(df.prob_out),
# "marker combinations LOD, r and phase written to output. Calculated on a",
# Sys.info()["sysname"],
# "machine using",
# ncores,
# "cores, taking",
# round(timediff, 2),
# "minutes"
# ),
# log.conn
# )
# if (!is.null(log)) close(log.conn)
return(df.prob_out[,c("marker_a","marker_b","r","LOD","phase")])
} #linkage.gp.mappoly
#'linterpol ***********************************************************
#'@description linear interpolation: returns the y values matching the x values (vector)
#'on the line through points pnt1 and pnt2 (both are vectors of length 2)
#'possible error if both X-coordinates equal not checked or caught
#'@param x x-coordinate for which corresponding y-coordinate is sought.
#'@param pnt1 Point 1
#'@param pnt2 Point 2
#'@noRd
linterpol <- function(x, pnt1, pnt2) {
a <- (pnt2[2] - pnt1[2]) / (pnt2[1] - pnt1[1])
b <- pnt1[2] - a * pnt1[1]
a * x + b
} #linterpol
#' makeCombscoresDf
#' @description Create an empty data frame with the correct size and column names and types
#' @param parent1 character vector with sampleIDs for parent 1. (0, 1 or multiple samples per
#' parent allowed, 0 samples are specified as character(0))
#' @param parent2 character vector with sampleIDs for parent 2. (0, 1 or multiple samples per
#' parent allowed, 0 samples are specified as character(0)
#' @param F1 character vector with sampleIDs for F1 individuals (one sample per F1 individual)
#' @param ancestors character vector of other ancestor samples listed in the \code{checkF1} output; character(0) if none
#' @param other character vector with sampleIDs of any other samples; character(0) if none
#' @param nrow the number of rows to create
#' @noRd
makeCombscoresDf <- function(parent1, parent2, F1, ancestors=character(0),
other=character(0), nrow){
ncol <- length(parent1) + length(parent2) + length(ancestors) + 2 +
length(F1) + length(other)
scores <- matrix(rep(NA_integer_, nrow * ncol), nrow=nrow)
df <- data.frame(
MarkerName=rep("", nrow),
segtype=rep("", nrow),
scores,
stringsAsFactors=FALSE
)
names(df) <- c("MarkerName", "segtype", parent1, parent2, ancestors,
"parent1", "parent2", F1, other)
return(df)
} #makeCombscoresDf
#'makeProgeny ***********************************************************
#'@description Returns a vector with the integer ratios of the F1 progeny
#'@param gametes1 vector of the integer ratios of the gametes produced by parent 1
#'@param gametes2 vector of the integer ratios of the gametes produced by parent 2
#'@noRd
makeProgeny <- function(gametes1, gametes2) {
gp1 <- length(gametes1)-1 #gametes1 ploidy
gp2 <- length(gametes2)-1 #gametes2 ploidy
m <- matrix(0, nrow=gp1+gp2+1, ncol=gp2+1)
for (i in 1:(gp2+1))
m[i:(i+gp1), i] <- gametes1 * gametes2[i]
D <- round(rowSums(m)) #the integer ratios of the F1 generation
names(D) <- 0:(length(D)-1)
d <- gcd_all(D[D>0])
round(D/d) #the integer ratios of the F1 generation simplified
} #makeProgeny
#' Merge marker assignments
#' @description \code{merge_marker_assignments} Merges 1.0 backbone object with marker assignment objects
#' @param dosage_matrix An integer matrix with markers in rows and individuals in columns.
#' @param target_parent Character string specifying target parent.
#' @param other_parent Character string specifying other parent.
#' @param LG_hom_stack data.frame specifying 1.0 marker assignments to linkage groups and homologues.
#' @param SN_linked_markers a list of marker assignment objects
#' @param ploidy Ploidy level of plant species.
#' @param LG_number Number of linkage groups (chromosomes).
#' @param log Character string specifying the log filename to which standard output should be written. If NULL log is send to stdout.
#' @return Returns a matrix with marker assignments. Number of linkages of 1.0 markers are artificial.
#' @examples
#' data("screened_data3", "LGHomDf_P1_1", "P1_SxS_Assigned", "P1_DxN_Assigned")
#' merged_assignment<-merge_marker_assignments(screened_data3, target_parent="P1",
#' other_parent="P2",
#' LG_hom_stack=LGHomDf_P1_1,
#' SN_linked_markers=list(P1_SxS_Assigned, P1_DxN_Assigned),
#' ploidy=4,
#' LG_number=5)
#' @noRd
merge_marker_assignments <- function(dosage_matrix,
target_parent = "P1",
other_parent = "P2",
LG_hom_stack,
SN_linked_markers,
ploidy,
LG_number,
log = NULL) {
LG_hom_stack <- test_LG_hom_stack(LG_hom_stack)
dosage_matrix <- test_dosage_matrix(dosage_matrix)
if(!target_parent %in% colnames(dosage_matrix) | !other_parent %in% colnames(dosage_matrix))
stop("Incorrect column name identifiers supplied for parents (target_parents and/or other_parent). Please check!")
markers_LG_hom_stack <- as.character(LG_hom_stack[, "SxN_Marker"])
LG_hom_stack <- LG_hom_stack[, c("LG", "homologue")]
rownames(LG_hom_stack) <- markers_LG_hom_stack
colnames(LG_hom_stack) <- c("Assigned_LG", "Assigned_Homolog")
comb <- as.matrix(do.call(rbind, SN_linked_markers))
#Add SxN markers
SN_LG_mat <- t(matrix(sapply(LG_hom_stack[, "Assigned_LG"], function(x) {
a <- rep(0, LG_number)
a[x] <- 1
return(a)
}),nrow = LG_number, dimnames = list(paste0("LG", levels(LG_hom_stack$Assigned_LG)),markers_LG_hom_stack)
))
LG_mat <- rbind(SN_LG_mat, comb[, paste0("LG", levels(LG_hom_stack$Assigned_LG)), drop = FALSE])
LG_mat <-
cbind(c(as.numeric(as.character(LG_hom_stack[, "Assigned_LG"])), comb[, "Assigned_LG"]), LG_mat)
rownames(LG_mat) <- c(markers_LG_hom_stack, rownames(comb))
colnames(LG_mat)[1] <- "Assigned_LG"
SN_hom_mat <- t(matrix(sapply(LG_hom_stack[, "Assigned_Homolog"], function(x) {
a <- rep(0, ploidy)
a[x] <- 1
return(a)
}),nrow = ploidy, dimnames = list(paste0("Hom", 1:ploidy),markers_LG_hom_stack)
))
counts_hom_mat <- rbind(SN_hom_mat, comb[, paste0("Hom", 1:ploidy)])
assigned_hom_mat <- matrix(c(LG_hom_stack[, "Assigned_Homolog"],
rep(NA, (ploidy - 1) * nrow(LG_hom_stack))),
ncol = ploidy)
##incorporate number of linkages per homologue
assigned_hom_mat <-
rbind(assigned_hom_mat, comb[, paste0("Assigned_hom", 1:ploidy)])
Assigned_LG_hom <- cbind(LG_mat, counts_hom_mat, assigned_hom_mat)
matched_rows <- match(rownames(Assigned_LG_hom),rownames(dosage_matrix))
if(any(is.na(matched_rows))){
stop("Could not find all assigned markers in dosage_matrix. Please check supplied dosage_matrix is correct.")
}
parental_dosages <-
dosage_matrix[matched_rows, c(target_parent, other_parent)]
Assigned_LG_hom <-
as.matrix(cbind(parental_dosages, Assigned_LG_hom))
class(Assigned_LG_hom) <- "integer"
if (is.null(log)) {
log.conn <- stdout()
} else {
matc <- match.call()
write.logheader(matc, log)
log.conn <- file(log, "a")
}
write(paste0("\n#### Marker numbers parent ", target_parent, "\n"),
log.conn)
count_table <-
table(
paste0(Assigned_LG_hom[, target_parent], "x", Assigned_LG_hom[, other_parent]),
Assigned_LG_hom[, "Assigned_LG"],
dnn = list("markertype", "linkage group")
)
write(knitr::kable(count_table),
log.conn)
if (!is.null(log))
close(log.conn)
return(Assigned_LG_hom)
}
merge_marker_assignments.gp <- function(MarkerType,
target_parent = "P1",
other_parent = "P2",
LG_hom_stack,
SN_linked_markers,
ploidy,
LG_number,
log = NULL) {
LG_hom_stack <- test_LG_hom_stack(LG_hom_stack)
markers_LG_hom_stack <- as.character(LG_hom_stack[, "SxN_Marker"])
LG_hom_stack <- LG_hom_stack[, c("LG", "homologue")]
rownames(LG_hom_stack) <- markers_LG_hom_stack
colnames(LG_hom_stack) <- c("Assigned_LG", "Assigned_Homolog")
comb <- as.matrix(do.call(rbind, SN_linked_markers))
#Add SxN markers
SN_LG_mat <- t(matrix(sapply(LG_hom_stack[, "Assigned_LG"], function(x) {
a <- rep(0, LG_number)
a[x] <- 1
return(a)
}),nrow = LG_number, dimnames = list(paste0("LG", 1:LG_number),markers_LG_hom_stack)
))
LG_mat <- rbind(SN_LG_mat, comb[, paste0("LG", 1:LG_number), drop = FALSE])
LG_mat <-
cbind(c(LG_hom_stack[, "Assigned_LG"], comb[, "Assigned_LG"]), LG_mat)
rownames(LG_mat) <- c(markers_LG_hom_stack, rownames(comb))
colnames(LG_mat)[1] <- "Assigned_LG"
SN_hom_mat <- t(matrix(sapply(LG_hom_stack[, "Assigned_Homolog"], function(x) {
a <- rep(0, ploidy)
a[x] <- 1
return(a)
}),nrow = ploidy, dimnames = list(paste0("Hom", 1:ploidy),markers_LG_hom_stack)
))
counts_hom_mat <- rbind(SN_hom_mat, comb[, paste0("Hom", 1:ploidy)])
assigned_hom_mat <- matrix(c(LG_hom_stack[, "Assigned_Homolog"],
rep(NA, (ploidy - 1) * nrow(LG_hom_stack))),
ncol = ploidy)
##incorporate number of linkages per homologue
assigned_hom_mat <-
rbind(assigned_hom_mat, comb[, paste0("Assigned_hom", 1:ploidy)])
Assigned_LG_hom <- cbind(LG_mat, counts_hom_mat, assigned_hom_mat)
if(target_parent == "P1"){
colNme <- c("parent1","parent2")
}else{
colNme <- c("parent2","parent1")
}
parental_dosages <- MarkerType[match(rownames(Assigned_LG_hom),MarkerType$MarkerName),colNme]
colnames(parental_dosages) <- c(target_parent, other_parent)
rownames(parental_dosages) <- rownames(Assigned_LG_hom)
Assigned_LG_hom <-
as.matrix(cbind(parental_dosages, Assigned_LG_hom))
class(Assigned_LG_hom) <- "integer"
if (is.null(log)) {
log.conn <- stdout()
} else {
matc <- match.call()
write.logheader(matc, log)
log.conn <- file(log, "a")
}
write(paste0("\n####Marker numbers parent ", target_parent, "\n"),
log.conn)
count_table <-
table(
paste0(Assigned_LG_hom[, target_parent], "x", Assigned_LG_hom[, other_parent]),
Assigned_LG_hom[, "Assigned_LG"],
dnn = list("markertype", "linkage group")
)
write(knitr::kable(count_table),
log.conn)
if (!is.null(log))
close(log.conn)
return(Assigned_LG_hom)
} #merge_marker_assignments.gp
#' Calculate the mode of a vector
#' @param x Vector input, can be numeric, character or factor
#' @noRd
Mode <- function(x) {
ux <- unique(x)
ux[which.max(tabulate(match(x, ux)))]
} #Mode
#' Plot links between 1.0 markers
#' @description Make a plot with recombination frequency versus LOD score.
#' @param linkage_df A linkage data.frame.
#' @param LG_hom_stack A \code{data.frame} as a result of \code{\link{bridgeHomologues}}
#' @param LG Linkage group (LG) number.
#' @param h1 Homologue to be compared.
#' @param h2 Homologue to be compared against h1
#' @param ymax Maximum limit of y-axis
#' @noRd
plot_SNlinks <- function(linkage_df,LG_hom_stack,
LG, h1, h2, ymax=NULL){
h1Markers <- LG_hom_stack[LG_hom_stack[,"LG"] == LG & LG_hom_stack[,"homologue"] == h1,"SxN_Marker"]
# find markers in cluster to combine
h2Markers <- LG_hom_stack[LG_hom_stack[,"LG"] == LG & LG_hom_stack[,"homologue"] == h2,"SxN_Marker"]
# find markers that are in both clusters
subdata1 <- linkage_df[linkage_df[,"marker_a"] %in% h1Markers & linkage_df[,"marker_b"] %in% h2Markers,]
subdata2 <- linkage_df[linkage_df[,"marker_a"] %in% h2Markers & linkage_df[,"marker_b"] %in% h1Markers,]
plotdata <- rbind(subdata1,subdata2)
if(is.null(ymax)){ ymax<- max(plotdata$LOD)}
if(nrow(plotdata) > 0){
plot_linkage_df(linkage_df = plotdata, #there was droplevels() here..removed July 2020
main=paste0("LG",LG," h",h1," & h",h2),
add_legend = F)
# plot(NULL,xlab="r",ylab="LOD",
# xlim=c(0,0.5),ylim=c(0,ymax),
# main=paste0("LG",LG," h",h1," & h",h2))
# with(plotdata[plotdata$phase=="coupling",],
# points(r,LOD,pch=19,col="limegreen"))
# with(plotdata[plotdata$phase=="repulsion",],
# points(r,LOD,pch=19,col="dodgerblue"))
} else{
warning(paste("No SxN linkage found between h",h1," and h",h2," (LOD too low?)",sep=""))
}
} #plot_SNlinks
#'polygamfrq ***********************************************************
#'@description Get the integer ratios of gametes with dosage 0:(ploidy/2)
#'from an autopolyploid parent (polysomic inheritance) with given ploidy and dosage
#'the result can be converted to fractions by y <- x/(sum(x))
#'@param ploidy The ploidy level of the parent
#'@param dosage The parental dosage
#'@noRd
polygamfrq <- function(ploidy, dosage) {
gp <- ploidy/2 #gamete ploidy
#using hypergeometric distribution:
#dhyper(0:gp, dosage, ploidy-dosage, gp)
#using n-over-k: function choose, in order to return integer ratios:
A <- choose(dosage, 0:gp)
B <- choose(ploidy-dosage, gp-(0:gp))
#C <- choose(ploidy, gp)
#A * B / C this would give the answer as fractions,
# like dhyper(0:gp, dosage, ploidy-dosage, gp)
D <- round(A*B)
d <- gcd_all(D[D>0])
round(D/d)
} #polygamfrq
#' Prepare pwd for mapping scripts e.g. MDSmap
#' @description Prepare a dataframe with pairwise recombination frequency and LOD score for mapping by re-ordering
#' @param pwd A pwd data.frame
#' @noRd
prepare_pwd <- function(pwd) {
pwd[,c("marker_a", "marker_b")] <-
t(apply(pwd[,c("marker_a", "marker_b")], 1, function(x)
x[order(x)]))
switched.pwd<-rbind(pwd[,c("marker_a","marker_b")],
data.frame("marker_a"=pwd$marker_b,"marker_b"=pwd$marker_a))
dupes<-which(duplicated(switched.pwd,fromLast = T))
if(any(dupes <= nrow(pwd))) pwd <- pwd[-dupes[dupes <= nrow(pwd)],]
pwd <- pwd[order(pwd$marker_a, pwd$marker_b),]
pwd <- pwd[!duplicated(paste0(pwd$marker_a, pwd$marker_b)),]
return(pwd[,c("marker_a", "marker_b", "r", "LOD")])
} #prepare_pwd
#'@title Get substrings from the righthand side
#'@description Get substrings from the righthand side
#'@usage rightstr(x, n)
#'@param x a character vector (or something having an as.character method)
#'@param n a single number: if n>=0, the rightmost n characters of each element
#'of x are selected, if n<0 the (-n) leftmost characters are omitted
#'@return character vector with rightside substrings of x
#'@noRd
rightstr <- function(x, n) {
#vectorized for x, not n
#n>=0: take the rightmost n characters
#n<0: take all but the leftmost (-n) characters
if (!is.character(x)) x <- as.character(x)
len <- nchar(x)
if (n >= 0) {
start <- len - n + 1
substr(x, start, len)
} else {
#n < 0
substr(x, -n + 1, len)
}
} #rightstr
# segtypeInfoSummary *********************************************************
#'@title Summarize the segtypeInfo list
#'@description From a list of segregation types as produced by calcSegtypeInfo
#'or selSegtypeInfo, produce a data frame that only lists the parental
#'dosage combinations for each segtype and whether these produce the
#'segtype under polysomic, disomic and/or mixed inheritance.
#'Useful to quickly look up which segtypes match a given parental dosage
#'combination.
#'@usage segtypeInfoSummary(segtypeInfo)
#'
#'@param segtypeInfo a list as returned by calcSegtypeInfo or selSegtypeInfo
#'@return A data frame summarizing the segtypeInfo list, with columns:
#'\itemize{
#'\item{segtype: the name of the segregation type (see details of
#'calcSegtypeInfo)}
#'\item{segtypenr: the sequential number of the segtype in parameter segtypeInfo}
#'\item{parent1, parent2: dosages of the two parents}
#'\item{par.poly, par.di, par.mixed: whether these parental dosages produce this
#'segtype under polysomic, disomic and/or mixed inheritance}
#\item{segtype.poly, segtype.di, segtype.mixed: whether this segtype does occur
#under polysomic, disomic and/or mixed inheritance}
#'}
#'@noRd
segtypeInfoSummary <- function(segtypeInfo) {
totrows <- 0
for (i in 1:length(segtypeInfo))
totrows <- totrows + nrow(segtypeInfo[[i]]$pardosage)
result <- data.frame (segtype=character(totrows),
segtypenr=integer(totrows),
parent1=integer(totrows),
parent2=integer(totrows),
par.poly=integer(totrows),
par.di=integer(totrows),
par.mixed=integer(totrows),
#segtype.poly=integer(totrows),
#segtype.di=integer(totrows),
#segtype.mixed=integer(totrows),
stringsAsFactors=FALSE)
r <- 1
for (i in 1:length(segtypeInfo)) {
for (p in 1:nrow(segtypeInfo[[i]]$pardosage)) {
result$segtype[r] = names(segtypeInfo)[i]
result$segtypenr[r] = i
result$parent1[r] = segtypeInfo[[i]]$pardosage[p,1]
result$parent2[r] = segtypeInfo[[i]]$pardosage[p,2]
result$par.poly[r] = segtypeInfo[[i]]$parmode[p,1]
result$par.di[r] = segtypeInfo[[i]]$parmode[p,2]
result$par.mixed[r] = segtypeInfo[[i]]$parmode[p,3]
#result$segtype.poly[r] = 0 + segtypeInfo[[i]]$polysomic
#result$segtype.di[r] = 0 + segtypeInfo[[i]]$disomic
#result$segtype.mixed[r] = 0 + segtypeInfo[[i]]$mixed
r <- r + 1
}
}
result
} #segtypeInfoSummary
# selSegtypeInfo ***********************************************************
#'@title Restrict a list of segregation types to specified inheritance modes
#'@description From a list of segregation types as produced by calcSegtypeInfo,
#'this function selects only those segtypes that occur with polysomic,
#'disomic and/or mixed inheritance if the corresponding parameters are set to
#'TRUE, and from those segtypes only the parental dosages with the same
#'restrictions are retained.
#'@usage selSegtypeInfo(segtypeInfo, polysomic, disomic, mixed)
#'@param segtypeInfo a list as returned by calcSegtypeInfo
#'@param polysomic If TRUE all segtypes with poly TRUE are retained, and from
#'those segtypes all parental dosage combinations with parmode[,1] TRUE
#'@param disomic If TRUE all segtypes with di TRUE are retained, and from
#'those segtypes all parental dosage combinations with parmode[,2] TRUE
#'@param mixed If TRUE all segtypes with mixed TRUE are retained, and from
#'those segtypes all parental dosage combinations with parmode[,3] TRUE
#'@return A list like segtypeInfo, modified as specified by parameters polysomic,
#'disomic and mixed
#'@noRd
selSegtypeInfo <- function(segtypeInfo, polysomic, disomic, mixed) {
result <- list()
selected <- integer(0)
for (s in seq_along(segtypeInfo)) {
if ((polysomic && segtypeInfo[[s]]$polysomic) ||
(disomic && segtypeInfo[[s]]$disomic) ||
(mixed && segtypeInfo[[s]]$mixed)) {
selected <- c(selected, s)
r <- length(result) + 1
result[[r]] <- segtypeInfo[[s]]
selpar <- (polysomic & result[[r]]$parmode[,1]) |
(disomic & result[[r]]$parmode[,2]) |
(mixed & result[[r]]$parmode[,3])
result[[r]]$pardosage <- result[[r]]$pardosage[selpar,, drop=FALSE]
result[[r]]$parmode <- result[[r]]$parmode[selpar,, drop=FALSE]
}
}
names(result) <- names(segtypeInfo)[selected]
result
} #selSegtypeInfo
#' Error and warning handling for cluster_stack
#' @param cluster_stack A data.frame with a column "marker" specifying markernames,
#' and a column "cluster" specifying marker cluster.
#' @noRd
test_cluster_stack <- function(cluster_stack){
cn <- colnames(cluster_stack)
cnw <- c("marker", "cluster")
if(!all(cnw %in% cn)){
warning("The names \"marker\" and \"cluster\" should be part of the columnames of cluster_stack")
message("Trying to change columnames..")
colnames(cluster_stack) <- cnw
}
return(cluster_stack)
} #test_cluster_stack
#' Error and warning handling for dosage_matrix
#' @param dosage_matrix An integer matrix with markers in rows and individuals in columns.
#' @noRd
test_dosage_matrix <- function(dosage_matrix){
if(inherits(dosage_matrix,"data.frame")){
warning("dosage_matrix should be a matrix, now it's a data.frame.")
message("Trying to convert it to matrix, assuming markernames are in the first column..")
rownames(dosage_matrix) <- dosage_matrix[,1]
dosage_matrix <- as.matrix(dosage_matrix[,-1])
class(dosage_matrix) <- "integer"
} else if(inherits(dosage_matrix,"matrix")){
rn <- rownames(dosage_matrix)
cn <- colnames(dosage_matrix)
if(is.null(rn)) stop("The rownames of dosage_matrix should contain markernames. Now NULL")
if(is.null(cn)) stop("The columnnames of dosage_matrix should contain genotype names. Now NULL")
if(!(typeof(dosage_matrix) =="integer" | typeof(dosage_matrix) =="double")){
warning("dosage_matrix should be integer or numeric. Trying to convert it.. ")
class(dosage_matrix) <- "integer"
}
} else {
stop("dosage_matrix should be a matrix of integers.
See the manual of this function for more information.")
}
if(any(duplicated(rownames(dosage_matrix)))){
warning("Duplicated marker names detected. Removing duplicates as quick-fix.. (but advise to re-check your input also!)")
dosage_matrix <- dosage_matrix[!duplicated(dosage_matrix),]
}
return(dosage_matrix)
} #test_dosage_matrix
#' Error and warning handling for LG_hom_stack
#' @param LG_hom_stack A data.frame with a column "SxN_Marker" specifying markernames,
#' a column "homologue" specifying homologue cluster and "LG" specifying linkage group.
#' @noRd
test_LG_hom_stack <- function(LG_hom_stack){
cn <- colnames(LG_hom_stack)
cnw <- c("SxN_Marker", "homologue", "LG")
if(!all(cnw %in% cn)){
warning("The names \"SxN_Marker\", \"homologue\" and \"LG\" should be part of the columnames of LG_hom_stack")
message("Trying to change columnames..")
colnames(LG_hom_stack) <- cnw
}
if(!is.factor(LG_hom_stack$LG)) LG_hom_stack$LG <- as.factor(LG_hom_stack$LG)
return(LG_hom_stack)
} #test_LG_hom_stack
#' Error and warning handling for linkage_df
#' @param linkage_df A linkage data.frame as output of \code{\link{linkage}} calculating linkage between 1.0 markers.
#' @noRd
test_linkage_df <- function(linkage_df){
if(!is.data.frame(linkage_df)){
stop("linkage_df should be an object of class data.frame")
} else {
cn <- colnames(linkage_df)
cnw <- c("marker_a","marker_b","r","LOD","phase")
if(!identical(cn[1:5], cnw)){
warning(
"The first five columnnames of linkage_df are not of format c(\"marker_a\",\"marker_b\",\"r\",\"LOD\",\"phase\")")
message("Trying to convert colnames..")
colnames(linkage_df)[1:5] <- cnw
}
linkage_df[,c("marker_a", "marker_b")]<-
sapply(linkage_df[,c("marker_a", "marker_b")], as.character)
linkage_df$phase <- as.factor(linkage_df$phase)
classes <- sapply(linkage_df, class)
names(classes) <- NULL
classesw <- c("character", "character", "numeric", "numeric", "factor")
if(!identical(classes[1:5], classesw)){
stop(paste("The first five columns of linkage_df should be of classes:
", paste(classesw, collapse = " ")))
}
}
return(linkage_df)
} #test_linkage_df
#' Error and warning handling for probgeno_df data-frame of probabilistic genotypes (scores)
#' @param probgeno_df A data frame as read from the scores file produced by function
#' \code{saveMarkerModels} of R package \code{fitPoly}, or alternatively, a data frame containing the following columns:
#' \itemize{
#' \item{SampleName}{
#' Name of the sample (individual)
#' }
#' \item{MarkerName}{
#' Name of the marker
#' }
#' \item{P0}{
#' Probabilities of dosage score '0'
#' }
#' \item{P1...}{
#' Probabilities of dosage score '1' etc. (up to max offspring dosage, e.g. P4 for tetraploid population)
#' }
#' \item{maxP}{
#' Maximum genotype probability identified for a particular individual and marker combination
#' }
#' \item{maxgeno}{
#' Most probable dosage for a particular individual and marker combination
#' }
#' \item{geno}{
#' Most probable dosage for a particular individual and marker combination, if \code{maxP} exceeds a user-defined threshold (e.g. 0.9), otherwise \code{NA}}
#' }
#' @noRd
test_probgeno_df <- function(probgeno_df){
if(!inherits(probgeno_df, "data.frame")) {
warning("probgeno_df should be a data-frame. Attempting to coerce...")
probgeno_df <- as.data.frame(probgeno_df)
}
if(!all(c("MarkerName", "SampleName", "geno","maxgeno","maxP") %in% colnames(probgeno_df))) stop("colnames MarkerName, SampleName, maxgeno, geno, and maxP are required!")
if(!is.factor(probgeno_df$SampleName)) probgeno_df$SampleName <- as.factor(probgeno_df$SampleName)
return(probgeno_df)
} #test_probgeno_df
#' Large vector or in standardized matrix
#' @description Turns a vector in a matrix with fixed number of columns
#' @param x A vector
#' @param n.columns Integer, number of columns
#' @noRd
vector.to.matrix <- function(x, n.columns){
if(length(x)>n.columns){
x<-c(x, rep("", n.columns-length(x)%%n.columns))
} else {
n.columns <- length(x)
}
x.m <- matrix(x, ncol=n.columns, byrow=T)
colnames(x.m)<-rep("_", n.columns)
return(x.m)
} #vector.to.matrix
#' Write a header for the log file
#' @description Functionalized writing of function name and arguments as start for log paragraph.
#' @param matc A object of class \code{call}
#' @param log A character string specifying the log file
#' @noRd
write.logheader <- function(matc, log){
args <- as.list(matc)
mod <- "w"
if(file.exists(log)) mod <- "a"
log.conn <- file(log, mod)
if(mod=="w") write(c("<style type=\"text/css\">.table {width: 40%;}</style>",
"## polymapR log file",
"This file is written in [markdown](https://en.wikipedia.org/wiki/Markdown).",
"Use knitr or [Markable](https://markable.in) to export it as a nicely formatted html or word file."),
log.conn)
close(log.conn)
mod <- "a"
log.conn <- file(log, mod)
write(c(paste0("\n### Log for function call of `", args[[1]], "`"),
as.character(Sys.time())),file = log.conn)
close(log.conn)
log.conn <- file(log, "a")
write("\nWith the following call:\n", log.conn)
write("```", log.conn)
sink(log.conn)
print(matc)
sink()
write("```\n", log.conn)
close(log.conn)
} #write.logheader
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.