#IBD calculation -----------------
#' Calculation of IBD
#'
#' This function computes Identity-by-descent probabilities according to a different IBD models:
#' - Naive model based on random assortment of markers.
#' - HMM model as implemented in `polyqtlR`
#' - Heuristic model as implmented in `polyqtlR`
#'
#' It requires homologue matrices (`p1hom` and `p2hom`) indicating which homologues of each parent carry
#' a genotype of 1 or 0 for the tracked allele.
#' Based on this, a probability matrix is returned: for each individual how likely it is,
#' at each marker, that it has inherited each of the homologues.
#' As a generic method, it can use a single genotype of one marker (and two vectors
#' of homologue assignment); a vector of markers (and two homologue matrices) or a matrix of markers
#' (and two homologue matrices).
#'
#' @param geno integer or matrix containing genotypes (genotypes of allele A, from 0 to ploidy).
#' If `geno` is a matrix, each row must be labelled with a marker name, and each column is
#' interpreted as an individual. Note that for many markers of one single individual, a
#' 1 column matrix must be provided.
#' @param p1hom numeric vector or matrix indicating which parental homologue, from parent 1, contains allele A.
#' If `geno` is an integer, `p1hom` must be a vector of length ploidy. If `geno` is a matrix, `p1hom` must be a matrix of ncol = ploidy and nrow = nrow(geno) and the same rownames
#' as `geno`.
#' @param p2hom numeric vector or matrix indicating which parental homologue, from parent 2, contains allele A.
#' If `geno` is an integer, `p2hom` must be a vector of length ploidy. If `geno` is a matrix,`p2hom` must be a matrix of ncol = ploidy and nrow = nrow(geno) and the same rownames
#' as `geno`.
#' @param ploidy numeric, indicating the ploidy of an organism. Only even ploidies allowed and
#' all individuals are expected to have the same ploidy.
#' @param method character indicating IBD method to use. Either "naive" (default), "hmm" for hidden markov model
#' or "heuristic" for the heuristic model of `polyqtlR`
#' @param map data.frame containing "position" and "marker" columns. Required for "hmm" and "heuristic"
#' methods
#' @param p1name character, column name containing the first parent name in the geno matrix. Required
#' for "heuristic" and "hmm" methods
#' @param p2name character, column name containing the second parent name in the geno matrix. Required
#' for "heuristic" and "hmm" methods
#' @param hmm.error numeric, if "hmm" method is used, the estimated error percentage in the genotypes.
#' The default (0.01) performs quite well if there's not a large number of errors.
#'
#' @return The naive probability of having inherited each parental homologue. If the
#' offspring genotype is impossible given the parental homologues, NA is returned.
#' If `geno`is a vector, returns a one-row matrix, each column containing the
#' probability of having inherited that parental homologue (the first columns corresponding
#' to parent 1, and the rest to parent 2). If `geno` is a matrix, a list of matrices is returned.
#' Each element is the probabilities for all individuals of having inherited one parental
#' homologue. There should be ploidy*2 elements in the list.
#' @export
#'
#' @examples
#' data("genotype")
#' geno <- geno[,-1:-2] #we take out parental genotypes
#' data("homologue")
#'
#' #For a single genotype
#' IBD <- naive_IBD(geno[1,1],hom[1,1:2],hom[1,3:4], ploidy = 2)
#' #For a single individual
#' #The genotype must be a matrix of one column!
#' IBD <- naive_IBD(geno[,1,drop = FALSE],hom[,1:2],hom[,3:4], ploidy = 2)
#' #For all individuals
#' IBD <- naive_IBD(geno,hom[,1:2],hom[,3:4], ploidy = 2)
calc_IBD <- function(geno,p1hom,p2hom,map = NULL,ploidy = 2,method = "naive",
p1name = "P1",p2name = "P2",hmm.error = 0.01){
method <- match.arg(method, c("naive","hmm","heuristic"))
geno_p1 <- identical(class(geno),class(p1hom))
geno_p2 <- identical(class(geno),class(p2hom))
if(any(class(geno) %in% "data.frame")) geno <- as.matrix(geno)
if(any(class(p1hom) %in% "data.frame")) p1hom <- as.matrix(p1hom)
if(any(class(p2hom) %in% "data.frame")) p2hom <- as.matrix(p2hom)
if(!(geno_p1 & geno_p2)){
stop("All data provided to IBD must be of the same class (i.e. all numeric or all matrices)")
}else{
switch(method,
naive = UseMethod("naive_IBD",geno),
hmm = UseMethod("hmm_IBD",geno),
heuristic = UseMethod("heur_IBD",geno))
}
}
### SD Implementation ---------
naive_IBD <- function(geno,p1hom,p2hom,ploidy = 2,p1name = "P1",p2name = "P2",...){
geno_p1 <- identical(class(geno),class(p1hom))
geno_p2 <- identical(class(geno),class(p2hom))
if(!(geno_p1 & geno_p2)){
stop("All data provided to IBD must be of the same class (i.e. all numeric or all matrices)")
}else{
UseMethod("naive_IBD",geno)
}
}
#' Calculation of IBD for a single data point
#'
#' @describeIn naive_IBD Function for a single marker of a single individual.
naive_IBD.numeric <- function(geno,p1hom,p2hom,ploidy = 2,...){
if(all(is.na(geno))) return(NA)
assertthat::assert_that(all(geno <= ploidy))
assertthat::assert_that(length(p1hom) == ploidy)
assertthat::assert_that(length(p2hom) == ploidy)
configlist <- config(geno,ploidy = ploidy)
parenthom <- list(p1 = p1hom,
p2 = p2hom)
parentcols <- list(p1 = 1:(ploidy/2),
p2 = (ploidy/2+1):ploidy)
#This expresses the maximum and minimum allele genotype
#that each parent can give. It's useful to find out
#impossible inheritances
parentrange <- sapply(parenthom,function(homp){
min <- sum(sort(homp)[1:(ploidy/2)])
max <- sum(sort(homp,decreasing = T)[1:(ploidy/2)])
return(c(min = min,max = max))
})
#We will calculate the homologue probability
#per each configuration
config_prob <- lapply(configlist,function(con){
#This calculates the homologue probabilites
#per parent for the configuration at which we're at
homprobs <- lapply(c(1,2),function(parent){
conp <- con[parentcols[[parent]]]
homp <- parenthom[[parent]]
if(any(is.na(homp)) | sum(conp) > parentrange["max",parent] | sum(conp) < parentrange["min",parent]){
impossible <- T
}else{
impossible <- F
}
homprobs <- t(sapply(conp,function(inherited){
sapply(homp,function(hom){
if(impossible) return(NA)
res <- (inherited == hom)/sum(homp == inherited)
if(is.nan(res)) res <- 0
return(res)
})
}))
rownames(homprobs) <- conp
return(homprobs)
})
#Here something else needs to be added in order to make it
#for polyploids
homprobs <- do.call(cbind,homprobs)
homprobs <- colSums(homprobs)
#probability of parent 1 to give first genotypes
probp1 <- sapply(con[parentcols$p1],function(gamete) mean(parenthom$p1 == gamete) )
probp1 <- prod(probp1,na.rm = T)
#probability of parent 2 to give other genotypes
probp2 <- sapply(con[parentcols$p2],function(gamete) mean(parenthom$p2 == gamete) )
probp2 <- prod(probp2,na.rm = T)
return(list(conprob = probp1*probp2,
homprob = homprobs))
})
#Now each configuration probability
#First the probability of each configuration
conprob <- extract(config_prob,"conprob")
homprob <- extract(config_prob,"homprob")
homprob <- matrix(homprob,nrow = length(p1hom) + length(p2hom))
possible_configs <- !is.na(colSums(homprob))
conprob <- conprob[possible_configs]
homprob <- homprob[,possible_configs,drop = F]
#Now we normalize it (is this correct?)
conprob <- conprob/sum(conprob,na.rm = T)
#Then we multiply the probability of the configuration times the
#probability of each homologue. Summing the homologue probabilities
#provides us with the final homologue probabilities
homprob <- t(homprob %*% conprob)
colnames(homprob) <- c(names(p1hom),names(p2hom))
if(all(!possible_configs)) return(matrix(NA,ncol = length(p1hom) + length(p2hom)))
return(homprob)
}
#' Calculation of IBD for multiple markers and/or individuals
#'
#' @describeIn naive_IBD Function for handling matrix input to naive_IBD
naive_IBD.matrix <- function(geno,p1hom,p2hom,ploidy = 2, p1name = "P1",p2name = "P2",...){
if(is.vector(geno)){
nams <- names(geno)
geno <- matrix(geno,ncol = 1)
rownames(geno) <- nams
}
if(is.null(rownames(geno))) stop("genotype matrix should contain rownames")
if(is.null(rownames(p1hom))) stop("P1 homologue matrix should contain rownames")
if(is.null(rownames(p2hom))) stop("P2 homologue matrix should contain rownames")
markers_p1 <- rownames(geno) %in% rownames(p1hom)
if(!all(markers_p1)) stop("Some rownames in genotype matrix were not found in P1 homologue matrix: ",
paste(rownames(geno)[!markers_p1],collapse = " "))
markers_p2 <- rownames(geno) %in% rownames(p2hom)
if(!all(markers_p2)) stop("Some rownames in genotype matrix were not found in P2 homologue matrix:\n",
paste(rownames(geno)[!markers_p2],collapse = " "))
assertthat::assert_that(ncol(p1hom) == ploidy)
assertthat::assert_that(ncol(p2hom) == ploidy)
pindex <- which(colnames(geno) %in% c(p1name,p2name))
if(length(pindex) > 0) geno <- geno[,-pindex,drop=F]
#To save time, we only calculate each unique case
#To identify unique cases we transform the data into one long vector
#This is memory expensive, might need to rework for big individual x genotype matrices
#Alterative method to identify cases
homcases <- apply(cbind(p1hom,p2hom),1,paste0,collapse = "_")
unhom <- unique(homcases)
ung <- unique(as.vector(geno))
whichom <- lapply(unhom,function(h) homcases == h)
whichg <- lapply(ung, function(g) geno == g)
cases <- lapply(whichg,function(g){
lapply(whichom, function(h){
mh <- matrix(rep(h,ncol(g)),nrow = length(h))
which(as.logical(mh*g))
})
})
cases <- unlist(cases,recursive = F)
names(cases) <- apply(expand.grid(unhom,ung),1,paste,collapse = "_")
#Then we fill in a result matrix
#we only calculate one time each combination
matfill <- rep(NA, (ncol(p1hom) + ncol(p2hom)))
res <- lapply(matfill,matrix,
ncol = ncol(geno),
nrow = nrow(geno))
parentcols <- list(p1 = 1:ncol(p1hom),
p2 = ncol(p1hom) + 1:ncol(p2hom))
for(i in 1:length(cases)){
cas <- names(cases)[i]
dat <- suppressWarnings(as.numeric(strsplit(cas,"_")[[1]]))
prob <- naive_IBD.numeric(geno = dat[ncol(p1hom) + ncol(p2hom) + 1],
p1hom = dat[parentcols$p1],
p2hom = dat[parentcols$p2],ploidy = ploidy)
for(j in 1:length(res)){
res[[j]][cases[[i]]] <- prob[j]
}
}
for(i in 1:length(res)){
dimnames(res[[i]]) <- dimnames(geno)
}
names(res) <- c(colnames(p1hom),colnames(p2hom))
return(res)
}
#' Configuration function
#'
#' Small internal function that returns all possible configurations of
#' parental inheritance given a genotype
#'
#' @param g integer, a genotype <= ploidy and >=0
#' @param ploidy integer, indicating ploidy.
#'
#' @return a list of vectors, where each element indicates
#' one possible parental chromosome inheritance configuration.
config <- function(g,ploidy = 2){
genoes <- c(rep(1,g),rep(0,ploidy-g))
return(unique(combinat::permn(genoes)))
}
### HMM implementation ----------
#Based on polyqtlR's hmm model
hmm_IBD <- function(geno,p1hom,p2hom,map,ploidy = 2,
p1name = "P1", p2name = "P2", hmm.error = 0.01,...){
if(is.null(map)) stop("Argument map cannot be null if hmm method is used")
if(!c("marker","position") %in% colnames(map)) stop("Map must be a data.frame with marker and position columns")
UseMethod("hmm_IBD",geno)
}
hmm_IBD.matrix <- function(geno,p1hom,p2hom,map,ploidy = 2,
p1name = "P1", p2name = "P2",hmm.error = 0.01,...){
homnames <- c(colnames(p1hom),colnames(p2hom))
phased_map <- list(LG1 = data.frame(marker = map$marker,
position = map$position,
p1hom[map$marker,],p2hom[map$marker,]))
colnames(phased_map[[1]]) <- c("marker","position",paste0("h",1:(ploidy*2)))
IBD <- polyqtlR::estimate_IBD(phased_maplist = phased_map, genotypes = geno,
ploidy = ploidy, method = "hmm",
parent1 = p1name,parent2 = p2name,
error = hmm.error)
homArray <- per_homologue(IBD$LG1$IBDarray)
IBD <- lapply(1:dim(homArray)[[2]],function(i) homArray[,i,])
names(IBD) <- homnames
return(IBD)
}
hmm_IBD.numeric <- function(...){
stop("Numeric method for hmm is not implemented: at least two markers required")
}
#Using polyqtlR's estimate_IBD
per_homologue <- function(IBDarray){
#Function expects an IBD array with following dimensions:
#[locus, genotypeclass, individuals]
#And translates it to an array with following dimensions:
#[locus, homologue, individuals]
#Genotypeclass should be expressed as a combination
#of letters where each letter represents a parental homologue
#First create indicator matrix
genoclass <- dimnames(IBDarray)[[2]]
homnames <- unique(unlist(strsplit(genoclass,"")))
indicatorMatrix <- sapply(homnames,function(x) stringr::str_count(genoclass,x) )
indicatorMatrix <- indicatorMatrix[,sort(colnames(indicatorMatrix))]
#Then multiply the matrices
haploIBD <- apply(IBDarray, c(1,3), function(x) x %*% indicatorMatrix)
haploIBD <- aperm(haploIBD,c(2,1,3))
dimnames(haploIBD)[[2]] <- homnames
return(haploIBD)
}
### Heuristic implementation ----------
heur_IBD <- function(geno,p1hom,p2hom,map = NULL,ploidy = 2,
p1name = "P1",p2name = "P2",...){
UseMethod("heur_IBD",geno)
}
heur_IBD.numeric <- function(geno,p1hom,p2hom,ploidy = 2,
p1name = "P1",p2name = "P2",...){
homnames <- c(names(p1hom),names(p2hom))
map <- data.frame(marker = "m1",position = 1)
phased_map <- list(LG1 = data.frame(marker = map$marker,
position = map$position,
p1hom,p2hom))
colnames(phased_map[[1]]) <- c("marker","position",paste0("h",1:(ploidy*2)))
g <- matrix(c(sum(p1hom),sum(p2hom),geno),nrow = 1)
rownames(g) <- "m1"
colnames(g) <- c("P1","P2","geno")
IBD <- faster_IBD(phased_maplist = phased_map, dosage_matrix = g,
ploidy = ploidy, p1name = p1name, p2name = p2name,
method.exp = "interval")
IBD <- IBD$LG1$IBDarray
names(IBD) <- homnames
return(IBD)
}
heur_IBD.matrix <- function(geno,p1hom,p2hom,map,ploidy = 2,
p1name = "P1",p2name = "P2",...){
if(is.null(map)) stop("Argument map cannot be null if heuristic method is used")
homnames <- c(colnames(p1hom),colnames(p2hom))
phased_map <- list(LG1 = data.frame(marker = map$marker,
position = map$position,
p1hom[map$marker,],p2hom[map$marker,]))
colnames(phased_map[[1]]) <- c("marker","position",paste0("h",1:(ploidy*2)))
IBD <- faster_IBD(phased_maplist = phased_map, dosage_matrix = geno,
ploidy = ploidy, p1name = p1name, p2name = p2name,
method.exp = "simple", double_reduction = F)
IBD <- lapply(1:dim(IBD$LG1$IBDarray)[[2]],function(i) IBD$LG1$IBDarray[,i,])
names(IBD) <- homnames
return(IBD)
}
#' Extremely fast estimation of identity-by-descent (IBD) probabilities.
#' @description The method of "quick-and-dirty" IBD estimation was originally developed by Bourke (2014) for tetraploid data only, and was subsequently
#' generalised by van Geest et al. (2017). Can be useful for a first quick analysis, particularly in large hexaploid datasets. However, the higher accuracy of IBD
#' probabilities generated by \code{\link{hmm_IBD}} makes that function the preferred choice.
#' @param phased_maplist A list of linkage maps calculated by \code{polymapR::create_phased_maplist}
#' @param dosage_matrix An integer matrix with markers in rows and individuals in columns
#' @param map_function The mapping function to calculate recombination frequency based on map distance (haldane or kosambi)
#' @param ploidy Ploidy level of parents or of the first parent
#' @param ploidy2 Ploidy level of the second parent. By default \code{NULL}, if parents have equal ploidy levels.
#' @param fix_threshold The threshold to fix the IBD probabilities while correcting for the sum of probabilities.
#' @param factor_dist Factor to increase or decrease the recombination frequencies as calculated from the map distances.
#' @param ncores Number of cores to use for multi-core processing.
#' @param p1name Name of parent 1 in the column names of \code{dosage_matrix}
#' @param p2name Name of parent 2 in the column names of \code{dosage_matrix}
#' @param method.exp Model to use for probability expansion. Either "simple" or "interval"
#' @return
#' A nested list (with the same length as phased_maplist). Each list element contains the following items:
#' \item{IBDtype}{Always "haplotypeIBD" for the output of this function}
#' \item{IBDarray}{An array of IBD probabilities. The dimensions of the array are: markers, homologues and individuals.}
#' \item{map}{Integrated linkage map positions of markers used in IBD calculation}
#' \item{parental_phase}{The parental marker phasing, coded in 1 and 0's}
#' \item{biv_dec}{\code{NULL}}
#' \item{gap}{The gap size used in IBD interpolation, by default \code{NULL}. See \code{\link{spline_IBD}}}
#' \item{genocodes}{\code{NULL}}
#' \item{pairing}{\code{NULL}}
#' \item{ploidy}{ploidy of parent 1}
#' \item{ploidy2}{ploidy of parent 2}
#' \item{method}{The method used, here "heur" (heuristic)}
#' \item{error}{The error prior used, not relevant here thus \code{NULL}}
#' @references
#' Bourke P.M. (2014) QTL analysis in polyploids: Model testing and power calculations. Wageningen University (MSc thesis)
#' @examples
#' data("phased_maplist.4x","SNP_dosages.4x")
#' IBD_list.4x <- fast_IBD(phased_maplist = phased_maplist.4x,
#' dosage_matrix = SNP_dosages.4x,
#' ploidy = 4)
faster_IBD <- function(phased_maplist,
dosage_matrix,
map_function = "haldane",
ploidy,
ploidy2 = NULL,
fix_threshold = 0.1,
factor_dist = 1,
ncores = 1,
p1name = "P1",
p2name = "P2",
method.exp = "simple",
double_reduction = T){
if(is.null(ploidy2)) ploidy2 <- ploidy
ploidy1 <- ploidy
map_function <- match.arg(map_function, choices = c("kosambi", "haldane"))
# check compatibility of dosages and phased maps
p_check <- function(){
pcols <- list(p1 = 3:(2+ploidy1) ,
p2 = 3:(2+ploidy2) + ploidy1)
res <- sapply(1:2,function(p){
sapply(seq(phased_maplist),function(n){
pname <- switch(p, '1' = p1name, '2' = p2name)
phom <- phased_maplist[[n]][,pcols[[p]]]
all(dosage_matrix[rownames(phom),pname] == rowSums(phom,na.rm = T))
})
})
return(all(res))
}
if(!p_check())
stop("Inconsistent input detected.\nIt is likely that unconverted dosages are being combined with a converted phased maplist, or vice versa.")
if(map_function == "kosambi"){
mapfun <- function(d, f = 1){
d <- d/100/f
0.5*((exp(4*d)-1)/(exp(4*d)+1))
}
} else if (map_function == "haldane"){
mapfun <- function(d, f = 1){
d <- d/100/f
0.5*(1-exp(-2*d))
}
}
outlist <- lapply(seq(phased_maplist),function(i){
ph <- phased_maplist[[i]]
posdf <- ph[,c("marker", "position"),drop = F]
## If marker data is conflicting and co-located, can lead to problems. Jitter positions if necessary
while(length(unique(posdf$position)) < nrow(posdf)){
warning("Non-unique marker positions detected. Jittering positions.")
posdf[duplicated(posdf$position),"position"] <- jitter(posdf[duplicated(posdf$position),"position"],amount=0.1)
#Map might no longer be in order:
ph <- ph[order(posdf$position),,drop = F]
posdf <- posdf[order(posdf$position),,drop = F]
}
posvec <- posdf$position
mark <- as.character(posdf$marker)
names(posvec) <- mark
#Same distmat calculation without the need of reshape2
#or additional objects
distmat <- abs(outer(posvec,posvec,FUN = "-"))
k0_mat <- mapfun(distmat, f = factor_dist)
rownames(ph) <- ph$marker
ph <- ph[,-which(colnames(ph) %in% c("marker", "position"))]
ph <- as.matrix(ph)
scd <- dosage_matrix[rownames(ph),,drop = F]
# get parental scores
#Names should not be assumed
par <- scd[,c(p1name,p2name),drop = F]
progeny <- scd[,-which(colnames(scd) %in% c(p1name,p2name)),drop = F]
# make a garray (=genotype probability array) with only 0.5
garray <- array(data = 0.5, dim = c(nrow(ph), ploidy1 + ploidy2, ncol(scd)-2),
dimnames = list(rownames(scd), colnames(ph), colnames(scd)[3:ncol(scd)]))
# get all markers with fully informative dosage
P1_hp <- scd[,p1name,drop = F] == 0.5*ploidy1
P2_hp <- scd[,p2name,drop = F] == 0.5*ploidy2
# get all maximum scores per marker which are informative in offspring
sumP <- rowSums(par)
# fill in garray
for(m in seq(nrow(scd))){
# get progeny that is informative - this is the kernel of the original approach:
# progeny score which equals (or exceeds, DR?) the sum of the parental scores have inherited
# all the parental alleles, and so we assign all these homologues with a probability of 1
progS <- colnames(progeny)[progeny[m,] >= sumP[m] & !is.na(progeny[m,])]
# assign 1 if so.
garray[m,ph[m,] == 1,progS] <- 1
# special case where dosage is fully informative (dosage = 0.5*ploidy)
# only if one of the P1 dosages should be 0.5*ploidy
# and marker is assigned to 0.5*ploidy homologues
# apply zeros to all others
if((P1_hp[m] | P2_hp[m]) & sum(ph[m,]) == sumP[m]){
if(P1_hp[m])
garray[m,c(ph[m,1:ploidy1] == 0, rep(FALSE, ploidy2)),progS] <- 0
if(P2_hp[m])
garray[m,c(rep(FALSE, ploidy1) ,ph[m,(ploidy1+1):(ploidy1 + ploidy2)] == 0),progS] <- 0
}
# if allele is absent in progeny, also informative:
prog0 <- colnames(progeny)[progeny[m,] == 0 & !is.na(progeny[m,])]
garray[m,ph[m,] == 1,prog0] <- 0
}
# order garray
garray <- garray[colnames(distmat),,]
# find nearest informative and change genotype probability accordingly
if(method.exp == "simple"){
garray <- simple_expansion(garray,k0_mat,posvec)
}else{
garray <- interval_expansion(garray,k0_mat,posvec)
}
# call correct_to_sumScore to array:
for(g in dimnames(garray)[[3]]){
corr_P1 <- correct_to_sumScore(subg_P = garray[,1:ploidy1,g],
threshold = fix_threshold, ploidy = ploidy1)
garray[,1:ploidy1,g] <- corr_P1
corr_P2 <- correct_to_sumScore(subg_P = garray[,(ploidy1 +1):(ploidy1 + ploidy2),g],
threshold = fix_threshold, ploidy = ploidy2)
garray[,(ploidy1 +1):(ploidy1 + ploidy2),g] <- corr_P2
}
return(list("IBDtype" = "haplotypeIBD",
"IBDarray" = garray,
"map" = cbind("chromosome" = i,
posdf),
"parental_phase" = ph,
"biv_dec" = NULL,
"gap" = NULL,
"genocodes" = NULL,
"pairing" = NULL,
"ploidy" = ploidy,
"ploidy2" = ploidy2,
"method" = "heur",
"error" = NULL))
})
if(!is.null(names(phased_maplist))){
names(outlist) <- names(phased_maplist)
} else{
warning(paste("phased_maplist had no LG names assigned. Assigning names",
paste0("'LG",seq(length(outlist)),"'", collapse = ", "),"to output list."))
names(outlist) <- paste0("LG",seq(length(outlist)))
}
return(outlist)
} #fast_IBD
#This is the reduction approach (HeurB)
interval_expansion <- function(garray,k0_mat,posvec){
#Expand probabilities
for(h in dimnames(garray)[[2]]){
gp <- garray[,h,]
inf1 <- gp == 1
inf0 <- gp == 0
informative <- inf1 | inf0
intervals <- group_informative(informative)
case <- unique(unlist(intervals))
caseindex <- sapply(intervals,function(i) case %in% i)
rownames(caseindex) <- case
for(ca in case){
bounds <- as.numeric(strsplit(ca,":")[[1]])
#Start and end boundaries are computed differently
#So we need to change it in case
if(bounds[1] == -1 & is.infinite(bounds[2])){
next()
}
if(bounds[1] == -1){
marks <- 1:bounds[2]
bounds[1] <- bounds[2]
}else if(is.infinite(bounds[2])){
marks <- bounds[1]:length(posvec)
bounds[2] <- bounds[1]
}else{
marks <- bounds[1]:bounds[2]
}
#All markers that are not boundary are non-informative
notinf <- marks[!marks %in% bounds]
near_inf <- (posvec[notinf] > mean(posvec[bounds])) + 1
#To which individuals should these results be allocated
locate <- caseindex[ca,]
newval <- k0_mat[cbind(notinf,bounds[near_inf])]
#We check the type (repulsion/coupling)
closest <- gp[bounds[near_inf],locate,drop = F]
#All results are equal (but if in coupling they must be reversed)
newval <- matrix(newval,ncol = ncol(closest),nrow = nrow(closest))
#If in coupling the probability is reversed
newval[closest == 1] <- 1 - newval[closest == 1]
gp[notinf,locate] <- newval
}
garray[,h,] <- gp
}
return(garray)
}
#This is a helper function for HeurB
group_informative <- function(mat){
#Given a logical matrix, returns a list
#Where each element contains the intervals of
#False values in each column. The first
#Interval starts at -1 and the last ends at Inf
intervals <- apply(mat,2,function(i){
m <- which(i)
if(length(m) == 0) return("-1:Inf")
move <- diff(m)
#We define boundaries of non-informative intervals
m[move != 1]
starts <- c(m[move != 1])
ends <- c(m[-1][move != 1])
#Not sure why but this is necessary for correct interval definition
if(length(ends) != length(starts)) starts <- starts[-length(starts)]
intervals <- paste0(starts,":",ends)
if(all(intervals == ":")) intervals <- NULL
intervals <- c(paste0(c(-1,m[length(m)]),":",c(m[1],Inf)),intervals)
})
}
#This is equivalent to the previous approach (HeurA)
simple_expansion <- function(garray,k0_mat,posvec){
for(g in dimnames(garray)[[3]]){
for(h in dimnames(garray)[[2]]){
gp <- garray[,h,g]
inf1 <- gp == 1
inf0 <- gp == 0
informative <- inf1 | inf0
if(any(informative)){
mdist <- k0_mat[names(gp)[!informative], names(gp)[informative], drop = FALSE]
near_inf <- findNearest(posvec[!informative],posvec[informative])
#The k1 is equal to 1-k0 so this makes sense:
if(any(inf1)) mdist[,names(inf1)[inf1]] <- 1 - mdist[,names(inf1)[inf1]]
garray[rownames(mdist),h,g] <- mdist[cbind(seq(near_inf), near_inf)]
}
}
}
return(garray)
}
#This function simplifies finding the nearest marker
findNearest <- function(a,b){
span <- range(c(a,b))
intervals <- c(span[1] - 1,
(b[-length(b)] + b[-1])/2,
span[2] + 1)
findInterval(a,intervals)
}
# function to correct total sum of genotype probabilties to 0.5*ploidy
correct_to_sumScore <- function(subg_P, threshold = 0.1, ploidy){
sumScore <- 0.5*ploidy
parentalDose <- rowSums(subg_P)
#Make sure that IBDs don't exceed 0.5*ploidy
exceed <- parentalDose > sumScore
subg_P[exceed,] <- subg_P[exceed,]*sumScore/parentalDose[exceed]
#Fully informative if 1/2 is 0
inf0 <- rowSums(subg_P == 0) == 0.5*ploidy
subg_P[inf0,][subg_P[inf0,] != 0] <- 1
#Fully informative if 1/2 is 1
inf1 <- rowSums(subg_P == 1) == 0.5*ploidy
subg_P[inf1,][subg_P[inf1,] != 1] <- 0
#Those that are not informative are threshold corrected
#Not sure what is happening here
fix <- (subg_P < threshold) | (subg_P > (1 - threshold))
fixable <- rowSums(fix) == ploidy
fix[fixable,] <- subg_P[fixable,] == 1 | subg_P[fixable,] == 0
corval <- subg_P
corval[!fix] <- 0
corr <- sumScore - rowSums(corval)
corval <- subg_P
corval[fix] <- 0
corval <- corval*corr/rowSums(corval)
subg_P[!fix] <- corval[!fix]
# Some probabilites can end up >1. Correct for this:
while(any(subg_P > 1)){
over <- rowSums(subg_P > 1) > 0
need_cor <- subg_P[over,,drop = F]
need_cor <- t(apply(need_cor,1,function(gp){
addp <- gp < 1 & gp > 0#& gp>=0.5
remp <- gp > 1
gp[addp] <- gp[addp] + (sum(gp[gp > 1] - 1)/sum(addp))
gp[remp] <- 1
return(gp)
}))
subg_P[over,] <- need_cor
}
return(subg_P)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.