Nothing
#' @title Count opposing homozygous SNPs between pairs of individuals
#'
#' @description Quick identification of likely parents, as the number of
#' opposing homozygous (OH) SNPs is expected to be zero for parent- offspring
#' pairs in absence of genotyping errors, and greater than zero for all other
#' pairs.
#'
#' @param x Either a matrix, dataframe or similar where the first two columns
#' are individual IDs, or a vector with IDs. In the second case, you may
#' provide \code{ID2}, and the output will be an ID1 x ID2 matrix; else the
#' output will be an ID1 x ID1 matrix. Non-genotyped individuals are included
#' in the results with all \code{NA}'s, and a warning.
#' @param ID2 optional second vector with IDs
#' @param GenoM numeric matrix with genotype data: One row per individual, one
#' column per SNP, coded as 0, 1, 2, missing values as a negative number or
#' NA. Row names must be individual IDs, column names are ignored. You can
#' reformat data with \code{\link{GenoConvert}}, or use other packages to get
#' it into a genlight object and then use \code{as.matrix}.
#' @param max_OH stop counting OH's for a pair if this value is reached, to
#' reduce computation time. Ignored if negative value or equal to total number
#' of SNPs.
#' @param quiet suppress messages
#'
#' @details Counting the number of opposing homozygous (OH) SNPs is much faster
#' than calculating likelihoods, and does not rely on an estimated genotyping
#' error rate. It can therefore be useful during quality control, or to help
#' figure out problems when assignment rate with \code{\link{sequoia}} is
#' lower than expected.
#'
#' @seealso \code{\link{CalcPairLL}} to calculate likelihoods for pairs,
#' \code{\link{CalcOHLLR}} to calculate OH for a pedigree,
#' \code{\link{CalcMaxMismatch}} for calculation of the maximum OH used by
#' \code{\link{sequoia}} to filter potential parent-offspring pairs.
#'
#' @examples
#' offspring_ids <- with(LH_HSg5, ID[BirthYear==2001])
#' candidate_father_ids <- with(LH_HSg5, ID[BirthYear==2000 & Sex==2])
#'
#' OH_matrix <- CountOH(offspring_ids, candidate_father_ids, GenoM=Geno_HSg5)
#'
#' hist(c(OH_matrix), breaks=c(0:40)-.5)
#' # with high quality SNP data, there is often a clear separation in OH counts
#' # between parent-offspring pairs (here: OH<3) and others (here: OH>4).
#' # BUT: non-PO close relatives may have very low OH counts by chance,
#' # and true PO pairs may have fairly high OH counts due to genotyping errors.
#'
#'
#' @useDynLib sequoia, .registration = TRUE
#'
#' @export
CountOH <- function(x = NULL,
ID2 = NULL,
GenoM = NULL,
max_OH = -1,
quiet = FALSE)
{
if (length(dim(x)) == 2) { # x is matrix or dataframe or similar
OutFormat = 'Dataframe'
ID1 <- as.character(x[[1]])
ID2 <- as.character(x[[2]])
} else if (is.null(x)) { # TODO: check that x is a vector
stop('Please provide either a matrix/dataframe or two vectors with IDs')
} else {
OutFormat <- 'Matrix'
ID1 <- as.character(x)
if (is.null(ID2)) {
if (!quiet) cli::cli_alert_info('No ID2 provided, so using ID2 = ID1')
ID2 <- as.character(x)
} else {
ID2 <- as.character(ID2)
}
}
if (!all(c(ID1,ID2) %in% rownames(GenoM)))
if (!quiet) cli::cli_alert_info('Some IDs are not among rownames of GenoM')
GenoM <- CheckGeno(GenoM, quiet = TRUE, Plot = FALSE)
if (OutFormat == 'Dataframe') {
Pairs <- as.data.frame(x[,1:2])
names(Pairs) <- c('ID1', 'ID2')
} else {
Pairs <- data.frame(ID1 = rep(ID1, each = length(ID2)),
ID2 = rep(ID2, times = length(ID1)))
}
gID <- rownames(GenoM)
GenoNums <- setNames(seq_along(gID), gID)
for (x in 1:2) {
Pairs[, paste0('ID',x,'.num')] <- ifelse(Pairs[, paste0('ID',x)] %in% gID,
GenoNums[Pairs[, paste0('ID',x)]], 0)
}
Np <- nrow(Pairs)
if (max_OH < 0) max_OH <- ncol(GenoM)
TMP <- .Fortran(countpairoh,
ng = as.integer(nrow(GenoM)),
nm = as.integer(ncol(GenoM)), # TODO: check for consistent naming
np = as.integer(Np),
maxoh = as.integer(max_OH),
genofr = as.integer(GenoM),
pairids = as.integer(c(Pairs$ID1.num, Pairs$ID2.num)),
ohrf = integer(Np))
TMP$ohrf[ TMP$ohrf <0 ] <- NA # either or both not genotyped
if (OutFormat == 'Dataframe') {
return(
cbind(x, OH = TMP$ohrf )
)
} else {
return(
matrix(TMP$ohrf, byrow = TRUE,
nrow=length(ID1), ncol=length(ID2),
dimnames = list(ID1 = ID1, ID2 = ID2))
)
}
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.