argyle: Basic interface and QC for genotypes from Illumina Infinium arrays

## genetics.R
## implement simple genetics logic on 'genotypes' object

## for input checking, but not used to avoid repeated copying of big objects
.check.and.warn <- function(x) {
	
	if (is.matrix(x)) {
		if (!inherits(x, "genotypes")) {
			warning(paste("Input is not a 'genotypes' object; assuming it is a matrix of",
						  "genotypes with shape (markers x samples)."))
			if (!is.numeric(x)) {
				stop("Input is not numeric.")
			}
			else {
				return(TRUE)
			}
		}
		else {
			if (!is.numeric(x)) {
				stop("Input is not numeric.")
			}
			else {
				return(TRUE)
			}
		}
	}
	else {
		stop("Input is not a matrix.")
	}
	
}

#' Calculate allele frequencies by marker or sample
#' 
#' @param gty a \code{genotypes} object with numeric allele encoding
#' @param by margin over which to calculate frequencies (usually \code{"markers"})
#' @param na.rm skip over missing genotypes in frequency calculations
#' @param counts logical; if \code{TRUE}, report allele counts instead of relative frequencies
#' @param ... ignored
#' 
#' @return a named vector of allele frequencies
#' 
#' @export
freq <- function(gty, by = c("markers","samples"), na.rm = TRUE, counts = FALSE, ...) {
	
	#if (!.check.and.warn(gty))
	#	stop("Can't understand input.")
	
	if (!is.numeric(gty))
		stop("Input is not numeric.")
	
	by <- match.arg(by)
	if (by == "markers")
		if (!counts)
			return( rowMeans(gty, na.rm = na.rm)/2 )
		else
			return( rowSums(gty, na.rm = na.rm) )
	else
		if (!counts)
			return( colMeans(gty, na.rm = na.rm)/2 )
		else
			return( colSums(gty, na.rm = na.rm) )
	
}



#' Calculate minor-allele frequency (MAFs) from a genotypes matrix
#' 
#' Just an alias or \code{freq(x, "markers")}.  Sticklers for nomenclature will be aware that this
#' function does not truly return the MAF unless the allele encoding is \code{"relative"}.
#' 
#' @param ... passed through to call to \code{freq()}
#' 
#' @seealso \code{\link{freq}}
#'
#' @export
maf <- function(...) freq(..., by = "markers")

#' Count number of missing genotypes by marker or sample
#'
#' @param gty a \code{genotypes} object
#' @param by margin over which to count missing calls
#' @param ... ignored
#'
#' @return a named vector of missing-genotype *counts*
#'
#' @export
nmiss <- function(gty, by = c("markers","samples"), ...) {
	
	#if (!.check.and.warn(gty))
	#	stop("Can't understand input.")
	
	by <- match.arg(by)
	if (by == "markers")
		dim.along <- 1
	else
		dim.along <- 2
	
	apply(gty, dim.along, function(x) sum(is.na(x)))
	
}

#' Calculate rate of missing genoypes by marker or sample
#' 
#' @param gty a \code{genotypes} object
#' @param by margin over which to count missing calls
#' @param ... ignored
#' 
#' @return a named vector of missing-genotype *proportions*
#' 
#' @export
missingness <- function(gty, by = c("markers","samples"), ...) {
	
	by <- match.arg(by)
	rez <- nmiss(gty, by = by, ...)
	denom <- if (by == "markers") ncol(gty) else nrow(gty)
	return(rez/denom)
	
}

#' Identify the consensus genotype call among a group of samples
#' 
#' @param gty a \code{genotypes} object
#' @param nas.allowed maximum proportion of missing genotypes to ignore 
#' @param ... ignored
#' 
#' @return a named vector of consensus genotypes, with length equal to number of rows in \code{gty}
#' 
#' @details Define the "consensus" genotype to be the genotype with greatest frequency among a group
#' 	of samples, with ties broken in favor of the major allele.  This function computes that consensus
#' 	ignoring missing values, and then masks sites with missingness greater than \code{nas.allowed}.
#' 
#' @export
consensus <- function(gty, nas.allowed = 0.0, ...) {
	
	#if (!.check.and.warn(gty))
	#	stop("Can't understand input.")
	
	rez <- apply(gty, 1, function(x) which.max(tabulate(x+1, nbins = 3))-1)
	nas <- (missingness(gty, "markers") > nas.allowed)
	rez[nas] <- NA
	
	return(rez)
	
}

## helper function to determine if a marker is segregating
is.segregating <- function(x, allow.het = TRUE, ...) {
	
	tbl <- tabulate(x+1, nbins = 3)
	if (allow.het) {
		flag <- sum(tbl > 0) > 1
		flag <- flag | any(x == 1, na.rm = TRUE)
	}	
	else {
		flag <- sum(tbl[c(1,3)] > 0) > 1
	}
	return(flag)
	
}

#' Identify segregating sites among a group of samples
#' 
#' @param gty a \code{genotypes} object
#' @param nas.allowed maximum proportion of missing genotypes to ignore
#' @param allow.het logical; if \code{FALSE}, ignore samples with heterozygous genotype
#' @param ... ignored
#' 
#' @return a named logical vector, of length equal to the number of rows in \code{gty},
#' 	indicating whether a site is segregating
#' 	
#' @details A site is defined to be "segregating" if two or more alleles are present in one or
#' 	more copies across all samples.  Sites at which missingness is greater than \code{nas.missing}
#' 	are set to \code{NA}.
#' 	
#' 	When \code{allow.het = FALSE}, this is equivalent to asking whether at least one pair of samples
#' 	has a fixed difference at each site.
#' 	
#' @seealso \code{\link{fixed.diffs}}
#' 
#' @export
segregating <- function(gty, nas.allowed = 0.0, allow.het = TRUE, ...) {
	
	#if (!.check.and.warn(gty))
	#	stop("Can't understand input.")
	
	rez <- apply(gty, 1, is.segregating, allow.het = allow.het)
	nas <- (missingness(gty, "markers") > nas.allowed)
	rez[nas] <- NA
	
	return(rez)
	
}

#' Identify fixed differences between a pair of samples
#' 
#' @param gty a \code{genotypes} object
#' @param cols indexing vector for selecting columns from \code{gty}
#' @param ... ignored
#' 
#' @return a named logical vector, of length equal to the number of rows in \code{gty},
#' 	indicating whether a site is a fixed difference between the two samples
#' 	
#' @details A site is defined to have a "fixed difference" between two samples if they have opposite
#' 	homozygous genotypes.  In contrast to \code{segregating()}, this function rejects sites at which either
#' 	sample is heterozygous or has a missing genotype, rather than marking them with \code{NA}s.
#' 	
#' 	If more than two samples are present in the input, an error occurs.
#' 	
#' @seealso \code{\link{segregating}}
#' 
#' @export
fixed.diffs <- function(gty, cols = TRUE, ...) {
	
	#if (!.check.and.warn(gty))
	#	stop("Can't understand input.")
	
	gty <- gty[ ,TRUE, drop = FALSE ]
	if (ncol(gty) != 2)
		stop("A 'fixed difference' can only exist between exactly two samples.")
	
	diffs <- (gty[ ,1 ] != gty[ ,2 ])
	ns <- is.na(rowSums(gty))
	hs <- (gty[ ,1 ] == 1) | (gty[ ,2 ] == 1)
	
	return(setNames( as.vector(diffs & !ns & !hs), rownames(gty) ))
	
}

#' Find markers informative between sample groups
#' 
#' @param gty a \code{genotypes} object
#' @param between expression for grouping samples (see Details)
#' @param ... other parameters to be passed to \code{consensus()} (such as \code{nas.allowed})
#' 
#' @return a named logical vector, of length equal to the number of rows in \code{gty},
#' 	indicating whether a marker is informative in the given samples
#' 	
#' @details A marker is defined to be "informative" if it is not monomorphic across a group of samples.
#' 	In the context of a genetic cross (eg. an F2 intercross), an informative marker is one with a fixed
#' 	difference between the two parental lines.  This function generalizes that idea to more than two groups:
#' 	a marker is informative if it has a fixed difference between any pair of groups, as defined in the
#' 	\code{between} expression.
#' 	
#'  The expression \code{between} can be protected for evaluation in the environment of the sample metadata
#'  in \code{attr(gty, "ped")} (view it with \code{samples(gty)}) by wrapping it in \code{.()}.  To group
#'  on more than one variable, provide multiple expressions in a list or separate them with commas like
#'  \code{.(expr1, expr2)}.  (This syntax will be familiar to users of the \code{plyr} package.)
#'  
#'  Consensus genotypes are first computed within the groups defined by unique values of \code{between},
#'  and then markers with fixed differences between any two groups are idenfitied.
#' 	
#' @seealso \code{\link{segregating}}, \code{\link{fixed.diffs}}
#' 
#' @export
informative <- function(gty, between, ...) {
	
	if (!inherits(gty, "genotypes"))
		stop("Please supply an object of class 'genotypes'.")
	
	#message("Calculating consensus genotypes...")
	cons <- do.call("cbind", genoapply(gty, 2, between, consensus, ..., strip = TRUE))
	
	#message("Iterating over groups...")
	seg <- segregating(cons, allow.het = FALSE, ...)
	
	return(seg)
	
}

#' Predict genotype of an F1 individual given genotypes of its parents
#' 
#' @param object a \code{genotypes} object
#' @param na.rm logical; should missing genotypes be ignored?
#' @param ... ignored
#' 
#' @return a new \code{genotypes} object containing predicted genotypes for all possible F1s between
#' 	the parents in \code{object}
#' 	
#' @details Each column in the input is assumed to be either a representative individual from an inbred
#' 	line, or a consensus genotype across several such individuals.  No checks are performed on the homzygosity
#' 	of the parents, but parents with high heterozygosity will result in F1 predictions with mostly missing
#' 	genotypes.
#' 	
#' 	When \code{na.rm = TRUE}, a missing genotype in either parent causes an \code{NA} to be emitted for
#' 	the F1; when \code{na.rm = FALSE}, the non-missing parental genotype (if any) is emitted. Heterozygous
#' 	calls in either parent always generate an \code{NA} in the F1.
#' 
#' 	F1s are named like "parent1::parent2".
#' 
#' 	Sex chromosomes and mitochonria receive no special treatment, so genotypes there will probably be bogus
#' 	(except for the case of chrX in F1 females.)
#' 	
#' @seealso \code{\link{segregating}}, \code{\link{fixed.diffs}}
#' 
#' @export predict.f1
predict.f1 <- function(object, na.rm = FALSE, ...) {
	
	if (!inherits(object, "genotypes"))
		stop("Please supply an object of class 'genotypes'.")
	
	pairs <- combn(ncol(object), 2)
	f1.names <- paste(colnames(object)[ pairs[1,] ], colnames(object)[ pairs[2,] ],
					  sep = "::")
	rez <- matrix(NA, ncol = ncol(pairs), nrow = nrow(object),
				  dimnames = list(rownames(object), f1.names))
	
	map <- attr(object, "map")
	object <- .copy.matrix.noattr(object)
	message(paste("Predicting F1 genotypes for", ncol(pairs), "pairs of parents..."))
	for (i in seq_len(ncol(pairs))) {
		f1 <- as.vector(rowMeans(object[ ,pairs[,i] ], na.rm = na.rm))
		het <- apply(object, 1, function(x) any(x == 1, na.rm = na.rm))
		f1[het] <- NA
		rez[,i] <- f1
	}
	
	rez <- genotypes(rez, map = map, alleles = "01")
	return(rez)
	
}

pairwise.diffs <- function(gty, ...) {
	
	pairs <- combn(ncol(gty), 2)
	pnames <- paste(colnames(gty)[ pairs[1,] ], colnames(gty)[ pairs[2,] ],
					 sep = "::")
	rez <- matrix(NA, ncol = ncol(pairs), nrow = nrow(gty),
				  dimnames = list(rownames(gty), pnames))
	
	map <- attr(gty, "map")
	gty <- .copy.matrix.noattr(gty)
	message(paste("Calculating pairwise differences for", ncol(pairs), "pairs of parents..."))
	for (i in seq_len(ncol(pairs))) {
		is.diff <- as.integer(gty[ pairs[1,i] ] == gty[ pairs[2,i] ])
		rez[,i] <- is.diff
	}
	
	rez <- genotypes(rez, map = map, alleles = "01")
	return(rez)
	
}

#' Calculate the proportion of sites at which each sample is heterozygous
#' 
#' @param gty a \code{genotypes} object
#' @param na.rm logical; ignore missing genotypes when counting heterozygous sites?
#' @param ... ignored
#' 
#' @return a named vector with the proportion of sites of heterozygous for each sample
#' 
#' @export
prop.het <- function(gty, na.rm = TRUE, ...) {
	
	gty[ gty != 1 ] <- 0
	colMeans(gty, na.rm = na.rm)
	
}

#' Calculate heterozygosity by marker
#' 
#' @param gty a \code{genotypes} object
#' @param hwe if \code{TRUE}, compute the MLE \code{hat{h}} assuming Hardy-Weinberg equilibrium
#' @param na.rm logical; ignore missing genotypes?
#' @param ... ignored
#' 
#' @return a named vector with the heterozygosity of each marker
#'
#' @export
heterozygosity <- function(gty, hwe = FALSE, na.rm = TRUE, ...) {
	
	
	if (!hwe) {
		gty[ gty != 1 ] <- 0
		rowMeans(gty, na.rm = na.rm)
	}
	else {
		apply(gty, 1, function(x) 1 - mean(x == 0, na.rm = na.rm)^2 - mean(x == 2, na.rm = na.rm))
	}
		
	
}

## cheap estimate of pairwise LD via genotypic correlation
ld <- function(gty, force = FALSE, ...) {
	
	if (!inherits(gty, "genotypes"))
		stop("Please supply an object of class 'genotypes'.")
	
	if (prod(dim(gty)) > 1e6 && !force)
		stop("Resulting matrix will have >1M entries. Use force = TRUE to proceed anyway.")
	
	ldmat <- cor(t(gty))^2
	class(ldmat) <- c("LD.matrix", class(ldmat))
	return(ldmat)
	
}

#' Compute rate of Mendelian inconsistency between an individual and possible mother-father pairs.
#' 
#' @param gty a \code{genotypes} object containing exactly one target individual
#' @param parents a \code{genotypes} object containing possible parents for the target individual
#' @param verbose logical; if \code{TRUE}, send progress messages to terminal
#' @param ... ignored
#' 
#' @return a dataframe of mother-father pairs and corresponding Mendel distances
#' 
#' @details Define the "Mendel distance" between an individual and two possible parents as the mean
#' 	rate of Mendelian inconsistencies -- that is, markers at which zero alleles are shared IBS between
#' 	one parent and one offspring -- across the pairs (father, offspring) and (mother, offspring). Given
#' 	a set of possible parents (in \code{parents}), this function creates all possible male-female pairs
#' 	and computes the Mendel distance between each pair and a single target individual (in \code{gty}).
#' 	
#' 	If the sex of the target individual is non-missing, the sex chromosomes will be handled specially:
#' 	for males, chrY is compared to the father's chrY and chrX to the mother's chrX; for females, chrX
#' 	is compared to both parents and chrY is ignored.  If the sex of the target individual is missing,
#' 	the sex chromosomes will be ignored, and mitochondrial markers are always ignored because they are
#' 	so few in number and tend to be homoplasic.
#' 
#' @seealso \code{\link{guess.parents}}
#' 
#' @export
mendel.distance <- function(gty, parents, verbose = TRUE, ...) {
	
	if (!inherits(gty, "genotypes") || !inherits(parents, "genotypes"))
		stop("Both input (gty) and possible parents (parents) must be of class 'genotypes'.")
	
	if (attr(gty, "alleles") != attr(parents, "alleles") || !is.numeric(gty))
		stop("Alleles should be in same numeric encoding in unknown (gty) and parents (parents).")
	
	if (ncol(gty) > 1)
		stop("Can only handle one sample of unknown parentage at a time.")
	
	## extract parents of each sex
	moms <- parents[ ,attr(parents, "ped")$sex == 2 ]
	dads <- parents[ ,attr(parents, "ped")$sex == 1 ]
	if (nrow(moms) < 1 || nrow(dads) < 1)
		stop("Not enough parents.")
	
	## keep only markers shared between parents and offspring...
	keep <- intersect(rownames(gty), rownames(moms))
	gty <- gty[ keep, ]
	moms <- moms[ keep, ]
	dads <- dads[ keep, ]
	
	## generate possible pairs
	#pairs <- as.matrix(expand.grid( colnames(moms), colnames(dads), stringsAsFactors = FALSE ))
	if (verbose)
		message("Investigating ", nrow(pairs), " possible parents...")
	
	## get chromosomes by inheritance pattern
	autos <- autosomes(gty[,1])
	chrx <- xchrom(gty[,1])
	chry <- ychrom(gty[,1])
	
	## weighting factors
	A <- nrow(autos)
	X <- nrow(chrx)
	Y <- nrow(chry)
	
	## autosomal distances
	dm <- apply(autosomes(moms), 2, ibs0, autos)
	dd <- apply(autosomes(dads), 2, ibs0, autos)
	
	if (sex(gty) == 1) {
		## males: chrX distances to mom only, plus chrY to dad
		dm <- (A/(A+X))*dm + (X/(A+X))*apply(xchrom(moms), 2, ibs0, chrx)
		dd <- (A/(A+Y))*dd + (Y/(A+Y))*apply(ychrom(dads), 2, ibs0, chry) 
	}
	else if (sex(gty) == 2) {
		## females: chrX distances to both parents
		dm <- (A/(A+X))*dm + (X/(A+X))*apply(xchrom(moms), 2, ibs0, chrx)
		dd <- (A/(A+X))*dd + (X/(A+X))*apply(xchrom(dads), 2, ibs0, chrx)
	}
	
	rez <- data.frame()
	if (ncol(moms))
		rez <- rbind(rez, data.frame(iid = colnames(gty), parent = colnames(moms), sex = 2, score = dm[ colnames(moms) ]))
	
	if (ncol(dads))
		rez <- rbind(rez, data.frame(iid = colnames(gty), parent = colnames(dads), sex = 2, score = dd[ colnames(dads) ]))
		
	return(rez)
	
}

ibs0 <- function(x,y) sum(abs(x-y) == 2, na.rm = TRUE)/sum(!is.na(x+y))

ibs0.fancy <- function(x, y, ...) {
	
	if (!all(inherits(x, "genotypes"), inherits(y, "genotypes")))
		stop("Both input objects must be of class 'genotypes'.")
	
	## make sure only one sample in each object
	x <- x[,1]
	y <- y[,1]
	
	## get sexes
	sx <- sex(x)
	sy <- sex(y)
	
	## autosomal genotypes
	xa <- autosomes(x)
	ya <- autosomes(y)
	
	## chrX genotypes
	xx <- x[ grepl("X", attr(x, "map")$chr), ]
	yx <- y[ grepl("X", attr(x, "map")$chr), ]
	
	## chrY genotypes
	xy <- x[ grepl("Y", attr(x, "map")$chr), ]
	yy <- y[ grepl("Y", attr(x, "map")$chr), ]
	
	## chrM genotypes
	xm <- x[ grepl("M", attr(x, "map")$chr), ]
	ym <- y[ grepl("M", attr(x, "map")$chr), ]
	
	## autosomal distance
	da <- sum(abs(xa-ya) == 2, na.rm = TRUE)
	na <- sum(!is.na(xa+ya))
	
	## sex chr distances
	dx <- sum(abs(xx-yx))
	nx <- sum(!is.na(xx+yx))
	if (sx == 1 && sy == 1) {
		## only check chrY if both samples male
		dy <- sum(abs(xy-yy) == 2, na.rm = TRUE)
		ny <- sum(!is.na(xy+yy))
	}
	else {
		dy <- 0
		ny <- 0
	}
	
	## mitochondria distances
	if (sx == 2 && sy == 2) {
		## only check mitochondria if both samples female
		dm <- sum(abs(xm-ym) == 2, na.rm = TRUE)
		nm <- sum(!is.na(xm+ym))
	}
	else {
		dm <- 0
		nm <- 0
	}
	
	return( (dx+dy+dm+da)/(nx+ny+nm+na) )

}

#' Attempt to guess the mother-father pair corresponding to offspring
#' 
#' @param gty a \code{genotypes} object containing target individuals
#' @param parents a \code{genotypes} object containing possible parents for the target individuals
#' @param ... ignored
#' 
#' @return the sample metedata from the input object, with \code{mom} and \code{dad} values replaced
#' 	by the function's best guesses
#' 
#' @details This wrapper calls \code{mendel.distance()} for all mother-father pairs available from
#' 	\code{parents} and all offspring in \code{gty}, and for each offspring reports the mother-father
#' 	pair with the smallest Mendel distance.  No guarantees are made with respect to near-misses.
#' 
#' @export
guess.parents <- function(gty, parents, ...) {
	## TODO: fix this
	if (!inherits(gty, "genotypes"))
		stop("Please supply an object of class 'genotypes'.")
	
	message("Attempting to guess parents of ", ncol(gty), " samples.")
	
	fam <- attr(gty, "ped")
	for (i in colnames(gty)) {
		message("\t... ", i)
		scores <- mendel.distance(gty[,i], parents, verbose = FALSE)
		guess <- scores[ which.min(scores$score),c("mom","dad") ]
		fam[ i,"mom" ] <- guess[1,"mom"]
		fam[ i,"dad" ] <- guess[1,"dad"]
	}
	
	message("Done.\n")
	return(fam)
	
}

#' Calculate a simple genetic distance: proportion of alleles shared IBS
#' 
#' @param gty a \code{genotypes} object
#' @param ... ignored
#' 
#' @return a \code{dist} object whose entires are the proportion of alleles shared
#' 	identical-by-state between samples
#' 	
#' @details To get a plain distance matrix, do \code{as.matrix(dist(x))}.
#' 
#' @seealso \code{\link[stats]{dist}}
#' 
#' @export
dist.genotypes <- function(gty, ...) {
	
	if (!inherits(gty, "genotypes"))
		stop("Please supply an object of class 'genotypes'.")
	
	if (!is.numeric(gty))
		gty <- recode(gty, "01")
	
	message("Computing distance matrix...")
	## compute distance matrix with Rcpp, super fast
	d <- dist_ibs(gty)
	dimnames(d) <- list(colnames(gty), colnames(gty))
	return( as.dist(d) )
	
}
#' @export
dist <- function(x, ...) UseMethod("dist")

weir.fst <- function(gty, subpop = NULL, per.locus = FALSE, ...) {
	
	## Copyright Eva Chan 2008
	## eva@evachan.org
	##
	## A script to estimate the variance components and fixation indices as described in
	## Weir & Cockerham 1984 Evolution 38(6): 1358-1370.
	##
	## Arguments
	## =========
	## geno:   matrix of genotypes with rows corresp. to markers and columns to individuals; 
	##         notation for genotyeps are {0,1,2} indicating the number of one of the two alleles
	## subpop: vector indicting the sub-popln to which the individuals belong to
	##
	## Output
	## ======
	## list of two objects: perloc and global
	## perloc: matrix of 6 columns and as many rows as markers in geno
	##         the 6 columns contain the estiamted variance components and fixation indices per locus
	##         a = component of variance between subpops
	##         b = component of variance between individuals within subpops
	##         c = component of variance between gametes within individuals
	## global: numeric vector of three values corresponding to the esimated F (Fit), theta (Fst), & 
	##         f (Fis) across all loci
	##
	## Note
	## ====
	## R/HIERFSTAT also estimate F-statistics using variance component estimation.
	## Results from that package is not too different to those from this function; I suspect
	## there are two sources of differences:
	## 1) all estimates of variance component from HIERFSTAT are doubled in magnitude to those
	##    from this function (i.e. scaled by factor of 2);
	## 2) rounding off variations may also be present.
	## The scaled difference in the estimates of variance components poses no problem when
	## calcualting fixation indicies as the values scaling factor is cancelled out in the
	## calculation of the ratios.
	
	if (!inherits(gty, "genotypes"))
		stop("Please supply an object of class 'genotypes'.")
	if (!is.numeric(gty))
		geno <- recode(gty, "01")
	else
		geno <- gty
	
	if (is.null(subpop))
		subpop <- as.character(attr(geno, "ped")$fid)
	else
		if (length(subpop) != ncol(geno))
			stop("Length of population labels doesn't match dimension of genotypes matrix.")
	
	spop <- unique(as.character(subpop))    ## unique spops
	r <- length(spop)
	message("Calculating Fst with following population labels:\n",
			paste(spop, collapse = ", "))
	
	n11 <- n12 <- n22 <- matrix(NA, ncol=r, nrow=nrow(geno))
	for(i in 1:r) {
		inds <- which(subpop == spop[i])
		n11[,i] <- rowSums(geno[,inds] == 0,na.rm = TRUE)
		n12[,i] <- rowSums(geno[,inds] == 1,na.rm = TRUE)
		n22[,i] <- rowSums(geno[,inds] == 2,na.rm = TRUE)
	}
	ni <- n11 + n12 + n22
	pi_tilda <- ((2 * n11) + n12) / (2 * ni)
	hi_tilda <- n12 / ni
	n_bar <- rowSums(ni, na.rm = TRUE)/r
	#      C_square <- ( apply(ni*ni,1,sum,na.rm=T) - (n_bar*n_bar*r) ) / ( (n_bar*n_bar) * (r-1) )    ## mod 2/3/2008
	#      nc <- n_bar * (1 - (C_square/r))
	nc <- ((r*n_bar) - rowSums(((ni*ni)/(r*n_bar)), na.rm = TRUE)) / (r - 1)
	p_bar <- rowSums( (ni*pi_tilda)/(r*n_bar), na.rm = TRUE )
	s_square <- rowSums( (ni*((pi_tilda-p_bar)^2)) / ((r-1)*n_bar), na.rm = TRUE )
	h_bar <- rowSums((ni*hi_tilda)/(r*n_bar), na.rm = TRUE )
	
	#      F_hat = 1 - ( (h_bar*(1-(C_square/r))) / ( (2*p_bar*(1-p_bar)*(1-((n_bar*C_square)/(r*(n_bar-1))))) + (2*(s_square/r)*(1+(((r-1)*(n_bar*C_square))/(r*(n_bar-1))))) + ((h_bar/2)*(C_square/(r*(n_bar-1)))) ))
	#      theta_hat <- (s_square - ((1/(n_bar-1))*((p_bar*(1-p_bar)) - (((r-1)/r)*s_square) - (h_bar/4)))) / (((1-((n_bar*C_square)/(r*(n_bar-1))))*p_bar*(1-p_bar)) + ((1+(((r-1)*n_bar*C_square)/(r*(n_bar-1))))*(s_square/r)) + ((C_square/(r*(n_bar-1)))*(h_bar/4)))
	#      f_hat <- 1 - (h_hat / ((((2*n_bar)/(n_bar-1))*p_bar*(1-p_bar)) - (((2*n_bar*(r-1))/(r*(n_bar-1)))*s_square) - ((1/(n_bar-1))*(h_bar/2))))
	
	a_hat <- (n_bar/nc) * ( s_square - ((1/(n_bar-1))*((p_bar*(1-p_bar)) - (((r-1)/r)*s_square) - ((1/4)*h_bar))) )
	b_hat <- (n_bar/(n_bar-1)) * ((p_bar*(1-p_bar)) - (((r-1)/r)*s_square) - ((((2*n_bar)-1)/(4*n_bar))*h_bar))
	c_hat <- h_bar/2
	
	F_hat <- 1 - (c_hat / (a_hat + b_hat + c_hat))
	theta_hat <- a_hat / (a_hat + b_hat + c_hat)
	f_hat <- 1 - (c_hat / (b_hat + c_hat))
	
	F_hat_w <- 1 - (sum(c_hat,na.rm=T) / sum((a_hat + b_hat + c_hat),na.rm=T))
	theta_hat_w <- sum(a_hat,na.rm=T) / sum((a_hat + b_hat + c_hat),na.rm=T)
	f_hat_w <- 1 - (sum(c_hat,na.rm=T) / sum((b_hat + c_hat),na.rm=T))
	
	if (per.locus)
		list( perloc=cbind(a_hat=a_hat, b_hat=b_hat, c_hat=c_hat, F_hat=F_hat, theta_hat=theta_hat, f_hat=f_hat),
		 	 global=c(F_hat=F_hat_w, theta_hat=theta_hat_w, f_hat=f_hat_w) )
	else
		return(theta_hat_w)
	
}


#' Thin markers to a specified density on the genetic map
#'
#' @param gty a \code{genotypes} object
#' @param spacing numeric; target distance between retained markers, in cM
#' @param ... ignored
#' 
#' @return a new \code{genotypes} object with fewer markers
#' 
#' @aliases thin
#' @export
thin.genotypes <- function(gty, spacing = 5.0, ...) {
	
	## start borrowing from ggplot2
	find.zero <- function (x_range, width, boundary) {
		shift <- floor((x_range[1] - boundary)/width)
		boundary + shift * width
	}
	fixed.bins <- function (x, width, center = NULL, boundary = NULL, closed = c("right","left")) {
		x <- as.numeric(x)
		width <- as.numeric(width)
		closed <- match.arg(closed)
		x_range <- range(x, na.rm = TRUE, finite = TRUE)
		if (length(x_range) == 0) {
			return(x)
		}
		if (!is.null(boundary) && !is.null(center)) {
			stop("Only one of 'boundary' and 'center' may be specified.")
		}
		if (is.null(boundary)) {
			if (is.null(center)) {
				boundary <- width/2
			}
			else {
				boundary <- center - width/2
			}
		}
		boundary <- as.numeric(boundary)
		min_x <- find.zero(x_range, width, boundary)
		max_x <- max(x, na.rm = TRUE) + (1 - 1e-08) * width
		breaks <- seq(min_x, max_x, width)
		cut(x, breaks, include.lowest = TRUE, right = (closed == "right"))
	}
	## end borrowing
	
	if (!inherits(gty, "genotypes"))
		stop("Please supply an object of class 'genotypes'.")
	
	if (!.has.valid.map(gty))
		stop("Input object must have a valid genetic map.")
	
	# subsample by chromosome
	message("Starting with ", nrow(gty), " markers...")
	suppressMessages({
		
		thinner <- function(d) {
			binned <- fixed.bins(markers(d)$cM, spacing)
			.keep1 <- function(z) z[1]
			keep <- sapply(split(seq_len(nrow(d)), binned), .keep1)
			keep <- union(keep, c(1,nrow(d)))
			d[ sort(keep), ]
		}
		
		gty <- subset(gty, !is.na(cM))
		vals <- split(seq_len(nrow(gty)), attr(gty, "map")$chr)
		ll <- lapply(vals, function(v) {
			x <- gty[ v,,drop = FALSE ]
			if (nrow(x))
				thinner(x)
			else
				return(NULL)
		})
		nulls <- sapply(ll, function(f) is.null(f))
		ll <- ll[ !nulls ]
		thinned <- Reduce(rbind, ll)
	})
	
	dropped <- setdiff(attr(gty, "map")$chr, attr(thinned, "map")$chr)
	
	message("Thinned to ", nrow(thinned), " markers.")
	if (length(dropped))
		message("Dropped these chromosomes without genetic positions: ", paste(dropped, collapse = ", "))
	return(thinned)
	
}
#' @export
thin <- function(gty, ...) UseMethod("thin")
andrewparkermorgan/argyle documentation built on May 10, 2019, 11:08 a.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
andrewparkermorgan/argyle
Basic interface and QC for genotypes from Illumina Infinium arrays

R/genetics.R
In andrewparkermorgan/argyle: Basic interface and QC for genotypes from Illumina Infinium arrays

R Package Documentation

Browse R Packages

We want your feedback!

andrewparkermorgan/argyle Basic interface and QC for genotypes from Illumina Infinium arrays

R/genetics.R In andrewparkermorgan/argyle: Basic interface and QC for genotypes from Illumina Infinium arrays

R Package Documentation

Browse R Packages

We want your feedback!

andrewparkermorgan/argyle
Basic interface and QC for genotypes from Illumina Infinium arrays

R/genetics.R
In andrewparkermorgan/argyle: Basic interface and QC for genotypes from Illumina Infinium arrays