# R/mungeData.R In HWxtest: Exact Tests for Hardy-Weinberg Proportions

#### Documented in alleleCountsclearUpperdf.to.matricesfillUppermatrix.to.vecremove.missing.allelesvec.to.matrix

# Functions for arranging data used in HW test
# (c) William R. Engels, 2014
#'

#' @title
#' mungeData
#' @description
#' Utility functions for handling genotype counts and arranging data
#'

#' @details
#' Interconvert between different formats for genotype counts.
#'
#' Let \code{k} be the number of alleles:
#' \itemize{

#' \item \code{clearUpper} fills the upper-right half of the \eqn{k x k} matrix with \code{NA}
#' \item \code{fillUpper} makes the \eqn{k x k} matrix symmetrical by filling the upper-right half with numbers from the lower half.
#' \item \code{vec.to.matrix} converts genotype counts in vector form and returns a matrix. The vector must have \eqn{k(k+1)/2} non-negative integers.
#' \item  \code{matrix.to.vec} converts a \eqn{k x k} matrix of genotype counts to a vector of length \eqn{k(k+1)/2}
#' \item \code{alleleCounts} returns a vector of length \eqn{k} containing the numbers of each allele. The sum of this vector will be twice the number of diploids in the sample.
#' \item \code{remove.missing.alleles} returns a matrix with no \code{0}'s for allele counts
#' \item \code{df.to.matrices} converts a data frame to a list of genotype count matrices. The data frame should be of the kind produced in the package \code{adegenet} with \code{genind2df}
#' }
#'
#'
#' @param gmat a matrix of non-negative integers representing genotype counts. In a matrix of genotype counts, \code{a[i,j]} and \code{a[j,i]} both represent the same heterozygote. Only the lower-left half of \code{gmat} is used. Numbers along the diagonal represent counts of the homozygotes.
#'
#' @param gvec vector containing \code{k(k+1)/2} genotype counts. All non-negative integers.  Genotype counts should be in the order: \code{a11, a21, a22, a31, a32, ..., akk}
#'
#' @param alleleNames an optional list of names for the alleles. The length should be \eqn{k}
#' @param df a dataframe containing individual genotypes. Each row represents an individual. The first column, named \dQuote{pop} names the population. Each other column is named for a particular locus. The genotypes are as \dQuote{123/124}
#' @param sep For a dataframe, this is the separator character. typically \dQuote{/}
#'
#'
#' @examples
#' gvec <- c(0,3,1,5,18,1,3,7,5,2)
#' gmat <- vec.to.matrix(gvec, alleleNames=letters[1:4])
#' alleleCounts(gmat)
#'
#'
#' @rdname mungeData
#' @export
fillUpper <-
function(gmat){
if(!(is.matrix(gmat) && (nrow(gmat)==ncol(gmat)))) stop("Must be square matrix at least 2x2")
k <- nrow(gmat);
if(k<2) return(gmat)
for(j in 2:k) {gmat[1:(j-1), j] <- gmat[j,1:(j-1)]};
gmat
}

#' @rdname mungeData
#' @export
alleleCounts <-
function(gmat) {
if(class(gmat)!="matrix") gmat <- vec.to.matrix(gmat)
t <- fillUpper(gmat);
k <- dim(t)[1];
m <- integer(k);
for(i in 1:k) {m[i] <- sum(t[i,]) + t[i,i]};
if(!is.null(rownames(gmat))) names(m) <- rownames(gmat)
m
}

#' @title vec.to.matrix
#' @rdname mungeData
#' @export
vec.to.matrix <-
function(gvec, alleleNames=""){
if(!(is.vector(gvec) && is.numeric(gvec))) stop("\nMust be a vector")
nGenotypes <- length(gvec)
nAlleles <- as.integer((sqrt(8*nGenotypes + 1) - 1)/2)
if(nGenotypes != nAlleles*(nAlleles + 1)/2) stop("\nWrong number of genotype counts")
t <- matrix(NA, nAlleles, nAlleles)
for(i in 1:nAlleles){t[i, 1:i] <- gvec[(i*(i-1)/2 + 1):(i*(i+1)/2)]}
if(length(alleleNames)>=nAlleles) {
rownames(t) <- alleleNames;
colnames(t) <- alleleNames;
}
t
}
#'
#' @title remove.missing.alleles
#' @description remove missing alleles
#' @details none
#' @rdname mungeData
#' @export
remove.missing.alleles <-
function(gmat) {
if(class(gmat)!="matrix") gmat <- vec.to.matrix(gmat)
m <- alleleCounts(gmat)
zm <- which(m==0)
if(length(zm)) gmat <- gmat[-zm,-zm]
gmat
}

#'
#' @title matrix.to.vec
#' @description converts matrix to vector
#' @details none
#' @rdname mungeData
#' @export
matrix.to.vec <-
function(gmat){
if(!(is.matrix(gmat) && (nrow(gmat)==ncol(gmat)))) stop("Must be square matrix")
v <- c();
k <- nrow(gmat)
for(i in 1:k){v <- append(v,gmat[i,1:i])}
names(v) <- NULL;
v
}

#' @title clearUpper
#' @description Clears upper-right of matrix
#' @rdname mungeData
#' @export
clearUpper <-
function(gmat){
if(!(is.matrix(gmat) && (nrow(gmat)==ncol(gmat)))) stop("Must be square matrix")
k <- nrow(gmat);
for(j in 2:k) {gmat[1:(j-1), j] <- NA};
gmat
}

#' @title df.to.matrices
#' @rdname mungeData
#' @export
df.to.matrices <- function(df, sep="/"){
if(!is.data.frame(df)) stop("Must be a data frame")
pnames <- levels(factor(df$pop)) # population names gnames <- colnames(df)[-1] #locus names (with "pop" removed) res <- list() for(pn in pnames){ popn <- list() popsel <- df$pop==pn
for(gn in gnames){
anames <- levels(factor(unlist(strsplit(df[[gn]][popsel], sep))))
k <- length(anames)
ta <- table(factor(df[[gn]][popsel]))
gtypes <- names(ta)
t <- matrix(0, k,k)
rownames(t) <- anames
colnames(t) <- anames
if(k > 1) {
for(g in gtypes) {
diploid <- unlist(strsplit(g,sep))
t[diploid[[1]], diploid[[2]]] <- ta[[g]]
}}
t <- t + t(t)
diag(t)  <- diag(t)/2
popn[[gn]] <- NA
if(k > 1) popn[[gn]] <- clearUpper(t)
} # for gn
res[[pn]] <- popn
} #for pn
res
}
NULL

# #' Convert a list of genotypes into a genotype count matrix
# #'
# #' Genotype lists as are used by packages genetics and adegenet are converted to an array of genotype counts.
# #' This function requires package genetics
# #'
# #' @param g List of text objects indicating genotypes. Alleles are separated by \dQuote{/}
# #'
# #' @return matrix of \eqn{k x k} genotype counts
# #'
# #' @examples
# #' g <- c(rep("a/b",3),
# #'		"b/b",
# #'		rep("a/c", 5),
# #'		rep("b/c", 18),
# #'		"c/c",
# #'		rep("a/d",3),
# #'		rep("b/d", 7),
# #'		rep("c/d", 5),
# #'		rep("d/d", 2))
# #' genotypeList.to.matrix(g)

# #' @export
# genotypeList.to.matrix <-
# function(g){
# if(!require(genetics)) stop("\ngenetics package is required")
# x <- genotype(g);
# tab <- table(factor(allele(x, 1), levels = allele.names(x)), factor(allele(x, 2), levels = allele.names(x)));
# t(tab)
# }


## Try the HWxtest package in your browser

Any scripts or data that you put into this service are public.

HWxtest documentation built on May 29, 2017, 3:33 p.m.