R/pca.R

## pca.R
## functions for performing PCA on genotypes and hybridization intensities

## impute allele freq in place of missing values
.fudge.missing.genotypes <- function(g, check.variance = TRUE, eps = 1e-6, ...) {
	
	X <- t(.copy.matrix.noattr(g))
	## replace missing values with column mean
	if (any(is.na(colMeans(X)))) {
		message("\treplacing missing values with minor-allele frequency...")
		X <- apply(X, 2, function(x) {
			x[ is.na(x) ] <- mean(x, na.rm = TRUE)
			return(x)
		})
	}
	
	## remove zero-variance columns
	if (check.variance) {
		const <- abs(colVars(X) - 0) <= eps
		const[ is.na(const) ] <- TRUE
		X <- X[ ,!const ]
	}
	
	return(X)
	
}

## internal function for actually doing PCA
.do.pca.genotypes <- function(gty, extras = NULL, what = c("genotypes","intensity","norm"), weights = NULL,
							  fast = FALSE, scale = TRUE, center = TRUE, eps = 1e-6, K = 1:3, ...) {
	
	if (!inherits(gty, "genotypes"))
		stop("Please supply an object of class 'genotypes.'")
	
	if (!is.null(extras) && !inherits(extras, "genotypes"))
		stop(paste("If supplying extra samples to project onto pre-computed PCs",
				   "they should also be suppled as class 'genotypes'."))
	
	what <- match.arg(what)
	.make.pca.input <- function(g) {
		if (what == "genotypes") {
			if (!is.numeric(g)) {
				g <- recode.genotypes(g, "01")
			}
			X <- t(.copy.matrix.noattr(g))
			## replace missing values with column mean
			if (any(is.na(colMeans(X)))) {
				message("\treplacing missing values with minor-allele frequency...")
				X <- apply(X, 2, function(x) {
					x[ is.na(x) ] <- mean(x, na.rm = TRUE)
					return(x)
				})
			}
			f <- colMeans(X)/2
			sigma <- sqrt(f*(1-f))
			X <- scale(X, center = f, scale = sigma)
			if (!is.null(weights)) {
				if (length(weights) == ncol(X)) {
					message("\tapplying weights ...")
					weights <- weights/sum(weights)*ncol(X)
					X <- scale(X, center = FALSE, scale = 1/weights)
				}
				else {
					message("\t[weights provided but don't match dimensions; ignoring them]")
				}
			}
		}
		else if (what == "intensity") {
			if (.has.valid.intensity(g)) {
				x <- with(attr(g, "intensity"), .copy.matrix.noattr(x))
				y <- with(attr(g, "intensity"), .copy.matrix.noattr(y))
				rownames(x) <- paste0(rownames(x), "_x")
				rownames(y) <- paste0(rownames(y), "_y")
				x[ is.na(x) ] <- 0
				y[ is.na(y) ] <- 0
				X <- t(rbind(x,y))	
			}
			else {
				stop("Need valid intensity matrices in order to do PCA on intensity.")
			}
		}
		else if (what == "norm") {
			if (.has.valid.baflrr(g)) {
				x <- attr(g, "baf")
				#y <- attr(g, "lrr")
				rownames(x) <- paste0(rownames(x), "_x")
				#rownames(y) <- paste0(rownames(y), "_y")
				x[ is.na(x) ] <- 0
				#y[ is.na(y) ] <- 0
				#X <- t(rbind(x,y))	
				X <- t(x)
			}
			else {
				stop("Need valid BAF and LRR matrices in order to do PCA on normalized intensity.")
			}
		}
		return(X)
	}
	
	## remove zero-variance columns, if scaling requested
	.check.variance <- function(x) {
		const <- abs(colVars(x) - 0) <= eps
		const[ is.na(const) ] <- TRUE
		return(!const)
	}

	## finally do PCA
	message("Preparing input matrices...")
	
	if (!is.null(extras)) {
		suppressMessages(both <- merge(gty, extras, check.alleles = TRUE))
		both <- .make.pca.input(both)
		keep <- .check.variance(both[ colnames(gty), ]) & .check.variance(both[ colnames(extras), ])
		features <- both[ colnames(gty),keep ]
		topredict <- both[ colnames(extras),keep ]
	}
	else {
		features <- .make.pca.input(gty)
		keep <- .check.variance(features)
		features <- features[ ,keep ]
	}
		
	message(paste("Computing principal components of", what, "matrix (", paste(dim(features), collapse = " x ") ,")..."))
	if (fast) {
		message("\t(using corpcor::fast.svd() ...)")
		pc <- .fast.pca(features, K = K, .center = center, .scale = scale)	
	}
	else {
		message("\t(using base::prcomp() ...)")
		if (what == "genotypes") {
			pc <- prcomp(features, center = TRUE, scale. = FALSE)
		}
		else {
			pc <- prcomp(features, center = center, scale. = scale)
		}
	}
	proj <- predict(pc)
	if (!is.null(extras)) {
		message("Projecting extra samples onto existing PCs...")
		predicted <- scale(topredict, pc$center, pc$scale) %*% pc$rotation
		#predicted <- predict(pc, newdata = topredict)
		proj <- rbind(proj, predicted)
	}
	
	## add %variance explained
	attr(proj, "effective.dim") <- dim(features)
	attr(proj, "pca") <- pc
	attr(proj, "explained") <- pc$sdev^2/sum(pc$sdev^2)
	
	message("Done.")
	return(proj)
	
}

## do (faster?) PCA via SVD using corpcor::fast.svd()
.fast.pca <- function(x, K = 1:3, .scale = TRUE, .center = TRUE, ...) {
	
	xx <- scale(x, scale = .scale, center = .center)
	y <- xx/(ncol(xx)-1)
	udv <- corpcor::fast.svd(y)
	
	proj <- udv$u
	rownames(proj) <- rownames(x)
	colnames(proj) <- paste0("PC", 1:ncol(proj)) 
	rot <- udv$v
	rownames(rot) <- rownames(rot)
	colnames(rot) <- paste0("PC", 1:ncol(rot)) 
	
	rez <- list(sdev = udv$d,
				rotation = rot,
				x = proj,
				center = .center, scale = .scale)
	class(rez) <- c("prcomp", class(rez))
	return(rez)
	
}

#' Perform PCA on a \code{genotypes} object
#'
#' Performs principal components analysis (PCA) on either (numerically-coded) genotypes or
#' on the underliny 2D hybridization-intensity matrices.
#'
#' @param x an object of class \code{genotypes}
#' @param extras a second dataset of class \code{genotypes}, to be projected onto PCs computed
#' 	from the first. NB: this function *does not* verify that the underlying features (ie. markers) match.
#' @param what \code{"genotypes"} to do PCA on genotypes (coded 0/1/2); \code{"intensity"} to do PCA on
#' 	underlying 2D intensities.  The latter triggers an error if intensity matrices are absent.
#' @param K how many PCs to return.
#' @param fast if \code{TRUE}, use \code{corpcor::fast.svd()} to speed up calculations
#' @param weights a vector of weights by which to pre-multiply the genotypes matrix
#' @param ... other parameters for call to \code{prcomp}, such as \code{center} and \code{scale}
#' 	(both \code{TRUE} by default)
#'
#' @details Uses base-\code{R}'s \code{prcomp} under the hood (unless \code{fast = TRUE}). By default,
#' 	columns are centered and scaled; not doing so will likely produce a strange result.  When doing PCA on genotypes,
#' 	missing values are replaced with the column mean, which in many circumstances can be interpreted as the
#' 	minor-allele frequency. (This is very similar to the behavior of PLINK.)  When doing PCA on intensities, missing
#' 	values are set to zero -- but even no-call genotypes have nonmissing intensities on Illumina arrays, so this is unlikely
#' 	to have any effect in practice.
#'
#' @return a dataframe with as many rows as samples, in which the first columns are sample IDs and any associated
#' 	metadata (as returned by \code{samples(x)}), followed by the first K PCs.  Scaled eigenvalues (ie. percent
#' 	of variance explained) are provided as \code{attr(,"explained")}.
#'
#' @seealso \code{pca.plink()} for using PLINK's (much faster and more powerful) implementation
#'
#' @export
pca.genotypes <- function(x, extras = NULL, what = c("genotypes","intensity","norm"), K = 3, fast = FALSE, weights = NULL, ...) {
	
	if (!inherits(x, "genotypes"))
		stop("Please supply an object of class 'genotypes.'")
	
	if (!is.numeric(K) || K < 1)
		stop("You probably don't want to do PCA with <1 dimension.")
	
	what <- match.arg(what)

	## run the PCA
	if (is.null(weights))
		rez <- .do.pca.genotypes(x, extras = extras, what = what, fast = fast, K = K, ...)
	else
		rez <- .do.pca.genotypes(x, extras = extras, what = what, fast = fast, K = K, weights = weights, scale = FALSE, ...)
	
	## add sample metatdata, if any
	meta <- data.frame(iid = rownames(rez))
	rownames(meta) <- rownames(rez)
	if (.has.valid.ped(x)) {
		fam <- attr(x, "ped")
		if (!is.null(extras))
			fam <- rbind(fam, attr(extras, "ped"))
		meta <- merge(meta, fam, by = "iid")
		rownames(meta) <- as.character(meta$iid)
		if (!(nrow(meta) == nrow(rez) && all(rownames(rez) %in% meta$iid)))
			stop("Sample metadata and PCA result are out of sync.")
	}
	rez.df <- data.frame(meta, rez[ rownames(meta),1:K ])
	
	## copy over the %variance explained
	class(rez.df) <- c("pca.result", class(rez.df))
	attr(rez.df, "explained") <- attr(rez, "explained")[1:K]
	if ("effective.dim" %in% names(attributes(rez)))
		attr(rez.df, "effective.dim") <- attr(rez, "effective.dim")
	
	return(rez.df)
	
}
#' @export
pca <- function(x, ...) UseMethod("pca")
andrewparkermorgan/argyle documentation built on May 10, 2019, 11:08 a.m.