#' Construct linkage groups using 2-point recombination fraction estimates
#'
#' Use two-point recombination fraction estimates to group markers into the specified number of linkage group, The input \code{initial} can be used to select groups of markers that will be assigned to the same group. Grouping is performed using hierarchical clustering (\code{hclust}) using either average, complete or single linkage. The matrix used for clustering can be either theta (the recombination fraction matrix), lod (the log likelihood ratio) or a combination of the two.
#' @importFrom stats na.omit
#' @importFrom stats hclust
#' @importFrom stats as.dist
#' @importFrom stats cutree
#' @export
#' @param mpcross Object of class \code{mpcross}
#' @param groups The number of groups to be formed
#' @param initial A list, with each entry containing markers which are to be assigned to the same group. Markers can be referenced by name or by index
#' @param clusterBy The type of data to cluster by. Can be one of "theta" for recombination fraction, "lod" for log likelihood ration, or "combined" for a combination of the two
#' @param method The clustering method to use. Must be one of "single", "complete" or "average"
#' @return A copy of the input mpcross object, with an additional "lg" entry containing the groupings of the markers. In addition the recombination fraction estimates and genetic data are reordered according to the created groupings of markers.
#' \item{lg$groups}{ Numeric vector giving the group to which each marker belongs}
#' \item{lg$all.groups}{ Numeric vector giving the numbers for any groups that are present}
#' @examples
#' map <- qtl::sim.map(len=rep(100, 2), n.mar=11, eq.spacing=TRUE, include.x=FALSE)
#' sim.ped <- sim.mpped(4, 1, 500, 6, 1)
#' sim.dat <- sim.mpcross(map=map, pedigree=sim.ped,
#' qtl=matrix(data=c(1, 50, .4, 0, 0, 0),
#' nrow=1, ncol=6, byrow=TRUE), seed=1)
#' dat.rf <- mpestrf(sim.dat)
#' grouped <- mpgroup(dat.rf, groups=2, clusterBy="combined", method="average")
#' grouped$lg
mpgroup <- function(mpcross, groups, initial = NULL, clusterBy="combined", method="average")
{
if(!(clusterBy %in% c("combined", "theta", "lod")))
{
stop("Input clusterBy must be one of 'combined', 'theta' or 'lod'")
}
if(!(method %in% c("average", "complete", "single")))
{
stop("Input method must be one of 'average', 'complete' or 'single'")
}
if (missing(mpcross))
{
stop("Input mpcross cannot be missing")
}
if (is.null(mpcross$rf)&is.null(mpcross$map))
{
stop("Must calculate recombination fractions prior to grouping loci")
}
if(!is.null(mpcross$map)) {
cat("Using map groupings for groups. Remove map object if you want to regroup.\n")
initial <- lapply(mpcross$map, function(x) match(names(x), colnames(mpcross$finals)))
groups <- length(mpcross$map)
output <- mpcross
grpassignment <- vector(length=ncol(mpcross$finals))
for (ii in 1:groups) grpassignment[initial[[ii]]] <- ii
output$lg <- list(all.groups=1:groups, groups=grpassignment)
names(output$lg$groups) <- unlist(lapply(mpcross$map, names))
return(output)
## don't rearrange order
}
lod <- mpcross$rf$lod
theta <- mpcross$rf$theta
#Reverse lod so that small values indicate similarity
lod[is.na(mpcross$rf$lod)] <- 0
lod <- max(lod) - lod
diag(lod) <- 0
theta[is.na(mpcross$rf$theta)] <- 0.5
if(method == "average")
{
linkFunction <- function(x) mean(x, na.rm=TRUE)
}
else if(method == "complete")
{
linkFunction <- function(x) max(x, na.rm=TRUE)
}
else
{
linkFunction <- function(x) min(x, na.rm=TRUE)
}
if(clusterBy == "combined")
{
distMat <- theta + lod / max(lod) * min(abs(diff(mpcross$rf$r)))
}
else if(clusterBy == "theta")
{
distMat <- theta
}
else
{
distMat <- lod
}
if(is.null(initial))
{
clustered <- hclust(as.dist(distMat), method=method)
}
else
{
#The number of values we're going to shrink the distance matrix by, as a result of the pre-grouping
nLess <- sum(unlist(lapply(initial, function(x) length(x) - 1)))
if(nLess == 0)
{
clustered <- hclust(as.dist(distMat), method=method)
}
else
{
#We actually have groups input. These groups go first, followed by the ungrouped values
ungrouped <- setdiff(colnames(mpcross$rf$theta), unlist(initial))
new.dist <- matrix(data=0, nrow=ncol(mpcross$rf$theta) - nLess, ncol = ncol(mpcross$rf$theta) - nLess)
#Are there even any ungrouped values? If there aren't, skip this bit
onlyGrouped <- length(initial) == ncol(mpcross$rf$theta) - nLess
if(!onlyGrouped)
{
new.dist[(length(initial)+1):(ncol(mpcross$rf$theta) - nLess), (length(initial)+1):(ncol(mpcross$rf$theta) - nLess)] <- distMat[ungrouped, ungrouped]
}
for(i in 1:length(initial))
{
markersI <- initial[[i]]
for(j in 1:length(initial))
{
markersJ <- initial[[j]]
new.dist[i, j] <- new.dist[j, i] <- linkFunction(distMat[markersI, markersJ])
}
if(!onlyGrouped)
{
for(j in 1:length(ungrouped))
{
new.dist[i, j+length(initial)] <- linkFunction(distMat[markersI, ungrouped[j]])
}
}
}
clustered <- hclust(as.dist(new.dist), members=c(unlist(lapply(initial, length)), rep(1, length(ungrouped))), method=method)
}
}
cut <- cutree(clustered, k=groups)
if(!is.null(initial))
{
specifiedGroups <- cut[1:length(initial)]
cut <- cut[-(1:length(initial))]
names(cut) <- ungrouped
for(i in 1:length(initial))
{
new <- rep(specifiedGroups[i], length(initial[[i]]))
names(new) <- initial[[i]]
cut <- c(cut, new)
}
cut <- cut[colnames(mpcross$rf$theta)]
}
else
{
names(cut) <- colnames(mpcross$rf$theta)
}
output <- mpcross
output$lg <- list(all.groups=1:groups, groups=cut)
return(subset(output, markers = colnames(output$founders)[order(output$lg$groups)]))
}
mpsplitgroup <- function(mpcross, toSplit, nSplits, clusterBy="combined", method="average")
{
if(!(method %in% c("average", "complete", "single")))
{
stop("Input method must be one of 'average', 'complete' or 'single'")
}
if(!(clusterBy %in% c("combined", "theta", "lod")))
{
stop("Input clusterBy must be one of 'combined', 'theta' or 'lod'")
}
if (missing(mpcross))
{
stop("Input mpcross cannot be missing")
}
if (is.null(mpcross$rf))
{
stop("Must calculate recombination fractions prior to grouping loci")
}
if(is.null(mpcross$lg))
{
stop("Must have an existing grouping structure to call mpsubgroup")
}
lod <- mpcross$rf$lod
theta <- mpcross$rf$theta
#Reverse lod so that small values indicate similarity
lod[is.na(mpcross$rf$lod)] <- 0
lod <- max(lod) - lod
diag(lod) <- 0
theta[is.na(mpcross$rf$theta)] <- 0.5
if(clusterBy == "combined")
{
distMat <- theta + lod / max(lod) * min(abs(diff(mpcross$rf$r)))
}
else if(clusterBy == "theta")
{
distMat <- theta
}
else
{
distMat <- lod
}
new.groups <- mpcross$lg$groups
current.group <- which(mpcross$lg$groups == toSplit)
subdist <- as.dist(distMat[current.group, current.group])
clustered <- hclust(subdist, method=method)
cut <- cutree(clustered, k=nSplits)
new.groups[new.groups > toSplit] <- new.groups[new.groups > toSplit] + nSplits-1
new.groups[current.group] <- cut + toSplit - 1
output <- mpcross
all.groups <- mpcross$lg$all.groups
all.groups[all.groups > toSplit] <- all.groups[all.groups > toSplit] + nSplits - 1
all.groups <- unique(c(all.groups, toSplit:(toSplit + nSplits-1)))
output$lg <- list(all.groups=all.groups, groups=new.groups)
return(subset(output, markers = colnames(output$founders)[order(output$lg$groups)]))
}
mpsubgroup <- function(mpcross, subgroups, clusterBy="combined", method="average")
{
if(!(method %in% c("average", "complete", "single")))
{
stop("Input method must be one of 'average', 'complete' or 'single'")
}
if(!(clusterBy %in% c("combined", "theta", "lod")))
{
stop("Input clusterBy must be one of 'combined', 'theta' or 'lod'")
}
if (missing(mpcross))
{
stop("Input mpcross cannot be missing")
}
if (is.null(mpcross$rf))
{
stop("Must calculate recombination fractions prior to grouping loci")
}
if(is.null(mpcross$lg))
{
stop("Must have an existing grouping structure to call mpsubgroup")
}
lod <- mpcross$rf$lod
theta <- mpcross$rf$theta
#Reverse lod so that small values indicate similarity
lod[is.na(mpcross$rf$lod)] <- 0
lod <- max(lod) - lod
diag(lod) <- 0
theta[is.na(mpcross$rf$theta)] <- 0.5
if(clusterBy == "combined")
{
distMat <- theta + lod / max(lod) * min(abs(diff(mpcross$rf$r)))
}
else if(clusterBy == "theta")
{
distMat <- theta
}
else
{
distMat <- lod
}
new.groups <- vector(mode="integer", length=length(mpcross$lg$groups))
names(new.groups) <- names(mpcross$lg$groups)
for(index in 1:length(mpcross$lg$all.groups))
{
group <- mpcross$lg$all.groups[index]
current.group <- which(mpcross$lg$groups == group)
subdist <- as.dist(distMat[current.group, current.group])
clustered <- hclust(subdist, method=method)
cut <- cutree(clustered, k=subgroups)
new.groups[current.group] <- cut + ((index -1)*subgroups)
}
output <- mpcross
output$lg <- list(n.groups=length(mpcross$lg$all.groups) * subgroups, groups=new.groups)
return(subset(output, markers = colnames(output$founders)[order(output$lg$groups)]))
}
fineTuneGroups <- function(grouped, excludeGroups=c())
{
if(is.null(names(grouped$lg$groups))) stop("Invalid mpcross object input")
originalChromosome <- newChromosome <- markerName <- newAverage <- c()
for(i in 1:length(grouped$lg$all.groups))
{
indicesI <- which(grouped$lg$groups == grouped$lg$all.groups[i])
diagonal <- grouped$rf$theta[indicesI, indicesI, drop=FALSE]
originalAverages <- apply(diagonal, 1, function(x) mean(x, na.rm=TRUE))
for(j in grouped$lg$all.groups[-i])
{
indicesJ <- which(grouped$lg$groups == j)
offDiagonal <- grouped$rf$theta[indicesI, indicesJ, drop=FALSE]
averages <- apply(offDiagonal, 1, function(x) mean(x, na.rm=TRUE))
#Ok, for these markers names chromosome j is better than chromosome i.
betterMarkerNames <- names(which(averages < originalAverages))
#For these ones we have no previously better chromosome, so add them straight in
firstSelection <- betterMarkerNames[!(betterMarkerNames %in% markerName)]
#For these we've already made a selection. But is j also better than other any other previously looked choices. If it's mostly unlinked to its current chromosome then a lot of other bad chromosomes will probably be a little bit better too
secondSelection <- betterMarkerNames[betterMarkerNames %in% markerName]
markerName <- c(markerName, firstSelection)
originalChromosome <- c(originalChromosome, rep(grouped$lg$all.groups[i], length(firstSelection)))
newChromosome <- c(newChromosome, rep(j, length(firstSelection)))
newAverage <- c(newAverage, averages[firstSelection])
#Which of these ones to keep
keep <- newAverage[match(secondSelection, markerName)] > averages[secondSelection]
#And the corresponding bits of the previous relocation data that has to be removed
overwrite <- match(secondSelection[keep], markerName)
secondSelection <- secondSelection[keep]
newChromosome[overwrite] <- rep(j, length(secondSelection))
newAverage[overwrite] <- averages[secondSelection]
}
}
#Actually do the re-arranging
for(index in 1:length(markerName))
{
currentNewChromosome <- newChromosome[index]
if(!(currentNewChromosome %in% excludeGroups))
{
currentMarkerName <- markerName[index]
original <- originalChromosome[index]
grouped$lg$groups[currentMarkerName] <- currentNewChromosome
}
}
remove <- c()
for(i in grouped$lg$all.groups)
{
indicesI <- which(grouped$lg$groups == i)
diagonal <- grouped$rf$theta[indicesI, indicesI, drop=FALSE]
additionalRemove <- which(apply(diagonal, 1, function(x) mean(x, na.rm=TRUE)) > 0.41)
remove <- c(remove, additionalRemove)
}
markers <- colnames(grouped$founders)[order(grouped$lg$groups)]
markers <- markers[-match(names(remove), markers)]
newGrouped <- subset(grouped, markers = markers)
return(newGrouped)
}
joinGroups <- function(mpcross, join)
{
#The value contained at index i is the new group ID for that group
new.group.id <- vector(mode="numeric", length=max(mpcross$lg$all.groups))
new.group.id[] <- NA
next.group.id <- 1
for(i in 1:length(join))
{
group <- as.integer(join[[i]])
#None of these have existing groups, so we're creating a new group
if(all(is.na(new.group.id[group])))
{
new.group.id[group] <- next.group.id
next.group.id <- next.group.id + 1
}
else
{
#At least one of these is already part of a group, in fact there might be two groups being joined here.
existing <- unique(na.omit(new.group.id[group]))
#We're joining different groups
if(length(existing) > 1)
{
new.group.id[unique(c(group, which(new.group.id %in% existing)))] <- min(existing)
}
else
{
new.group.id[group] <- existing
}
}
}
for(i in mpcross$lg$all.groups)
{
if(is.na(new.group.id[i]))
{
new.group.id[i] <- next.group.id
next.group.id <- next.group.id + 1
}
}
previous.names <- names(mpcross$lg$groups)
mpcross$lg$groups <- new.group.id[mpcross$lg$groups]
names(mpcross$lg$groups) <- previous.names
mpcross$lg$all.groups <- unique(mpcross$lg$groups)
return(mpcross)
}
findJoinPoint <- function(mpcross, marker1, marker2, joins)
{
group1 <- mpcross$lg$groups[marker1]
group2 <- mpcross$lg$groups[marker2]
for(i in 1:length(joins))
{
join <- joins[[i]]
if(join[[1]] == "join")
{
joinCommand <- c(join[[2]], join[[3]])
if(all(c(group1, group2) %in% joinCommand))
{
return(i)
}
else if(group1 %in% joinCommand)
{
group1 <- min(joinCommand)
}
else if(group2 %in% joinCommand)
{
group2 <- min(joinCommand)
}
}
}
return(-1)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.