R/plink.R

## plink.R
## wrappers for interacting with plink

#' Read a PLINK binary fileset into a \code{genotypes} object
#' 
#' @param prefix path to a PLINK fileset, excluding \code{*.bed} suffix
#' @param chr chromosome(s) to keep
#' @param from keep markers at physical position greater than or equal to this
#' @param to keep markers at physical position less than or equal to this
#' @param markers keep *markers* with these names
#' @param keep keep *samples* with these names
#' @param intensity attempt to read intensity matrices (\code{*.bii}), if present
#' @param keep.chrnames whether to keep chromosome names as-is, or allow them to be converted to PLINK's style
#' @param ... ignored
#' 
#' @return a \code{genotypes} object, with alleles in the "01" encoding (see \code{\link{recode.genotypes}})
#' 
#' @details Reads a PLINK binary fileset using pure \code{R}.  The fileset should be generated in 
#' 	the "variant-major" format (the default in all recent versions of PLINK.) Missing genotypes are
#' 	represented as \code{NA}s.
#' 	
#' 	If any of \code{chr,from,to,markers,keep} are specified, the file will be read in random-access mode. Positional
#' 	filters are applied only if one or more chromosomes are specified.  Marker-name filters are applied after
#' 	chromosome/position filters; it rarely makes sense to specify both a set of names and a set of positions.
#' 	Chromosome names must match exactly those used in the marker map (\code{*.bim} file), and sample names must match
#' 	exactly those used in the sample metadata file (\code{*.fam} file).
#' 	
#' 	Random-access mode is not guaranteed to be more efficient than reading the whole dataset into the \code{R}
#' 	session and then subsetting it.  Efficiency gains will depend on how much of the data happends to be found in
#' 	contiguous blocks, and also on hardware (eg. SSD versus standard hard disk).
#' 	
#' 	If intensities are present, they are expected to be encoded as described in the help page for
#' 	\code{\link{write.plink()}}.
#' 	
#' @seealso \code{\link{write.plink}}
#' 
#' @references
#' PLINK v1.9: \url{https://www.cog-genomics.org/plink2}
#' 
#' Purcell S et al. (2007) PLINK: a toolset for whole-genome association and population-based 
#' 	linkage analysis. Am J Hum Genet 81(3): 559-575. doi:10.1086/519795.
#' 
#' @export read.plink
read.plink <- function(prefix, intensity = FALSE, chr = NULL, from = NULL, to = NULL, markers = NULL, keep = NULL, keep.chrnames = FALSE, ...) {
	
	## construct filenames; check that all are present and accounted for
	prefix <- gsub("\\.bed$", "", prefix)
	bed <- paste0(prefix, ".bed")
	bim <- paste0(prefix, ".bim")
	fam <- paste0(prefix, ".fam")
	
	if (!all(sapply(c(bed, bim, fam), file.exists)))
		stop("One or more of the bed/bim/fam files for this plink fileset doesn't exist.")
	
	message(paste0("Reading family info from: <", fam,">"))
	message(paste0("Reading marker info from: <", bim,">"))
	message(paste0("Reading binary genotypes from: <", bed,">"))
	
	## read map and family file
	bim <- read.map(bim, keep.chrnames)
	fam <- read.fam(fam)
	
	## check if we're doing random access
	kr <- rep(TRUE, nrow(bim))
	ki <- rep(TRUE, nrow(fam))
	# first: subset by variants
	if (!is.null(chr)) {
		chr <- as.character(chr)
		kr <- kr & (bim$chr %in% chr)
		if (!is.null(from))
			kr <- kr & (bim$pos >= from)
		if (!is.null(to))
			kr <- kr & (bim$pos <= to)
	}
	if (!is.null(markers)) {
		markers <- na.omit(as.character(markers))
		kr <- kr & (bim$marker %in% markers)
	}
	kr[ is.na(kr) ] <- FALSE
	
	# next: subset by samples
	if (!is.null(keep)) {
		keep <- as.character(keep)
		ki <- ki & (fam$iid %in% keep)
	}
	ki[ is.na(ki) ] <- FALSE
	
	## are we keeping all samples and variants?
	subsetting <- !all(ki,kr)
	kr <- which(kr)
	ki <- which(ki)
	nr.keep <- length(kr)
	ni.keep <- length(ki)
	
	message(paste0("\texpecting a matrix of size ", nr.keep," markers x ", ni.keep, " samples..."))
	
	## calculate number of markers and samples
	nr <- nrow(bim)
	ni <- nrow(fam)
	## block size in bytes: (number of individuals)/4, to nearest byte
	bsz <- ceiling(ni/4)
	
	## open bed file and check its magic number
	bed <- file(bed, open = "rb")
	magic <- readBin(bed, "raw", 3)
	if (!all(magic[1] == "6c", magic[2] == "1b", magic[3] == "01"))
		stop("Wrong magic number for bed file; should be -- 0x6c 0x1b 0x01 --.")
	
	if (!subsetting) {
		## read *all* genotypes block by block
		intr <- interactive()
		if (intr)
			pb <- txtProgressBar(min = 0, max = nr, style = 3)
		gty <- matrix(NA, nrow = ni, ncol = nr)
		for (i in 1:nr) {
			geno.raw <- as.logical(rawToBits(readBin(bed, "raw", bsz)))
			j <- seq(1,length(geno.raw),2)
			## express genotypes as minor allele dosage (0,1,2)
			geno <- geno.raw[ j ] + geno.raw[ j+1 ]
			## recall that 0/1 is het, but 1/0 is missing
			geno[ geno.raw[ j ] == 1 & geno.raw[ j+1 ] == 0 ] <- NA
			gty[ ,i ] <- geno[1:ni]
			if (intr)
				setTxtProgressBar(pb, i)
		}
	}
	else {
		## we must do random-access; prepare genotypes object
		gty <- matrix(NA, nrow = ni.keep, ncol = nr.keep)
		
		## loop on variants
		intr <- interactive()
		if (intr)
			pb <- txtProgressBar(min = 0, max = nr.keep, style = 3)
		for (i in seq_along(kr)) {
			
			# calculate offset for this variant; go there
			v <- kr[i]-1
			offset <- v*bsz+3
			seek(bed, offset, "start")
			
			# read the genotypes
			geno.raw <- as.logical(rawToBits(readBin(bed, "raw", bsz)))
			j <- seq(1,length(geno.raw),2)
			## express genotypes as minor allele dosage (0,1,2)
			geno <- geno.raw[ j ] + geno.raw[ j+1 ]
			## recall that 0/1 is het, but 1/0 is missing
			geno[ geno.raw[ j ] == 1 & geno.raw[ j+1 ] == 0 ] <- NA
			# get rid of samples we don't want; this is a hack
			gty[ ,i ] <- (geno[1:ni])[ ki ]
			if (intr)
				setTxtProgressBar(pb, i)
		}
		
	}
	
	close(bed)
	
	## add rownames (markers) and colnames (samples)
	gty <- t(gty)
	rownames(gty) <- as.character(bim$marker)[kr]
	colnames(gty) <- as.character(fam$iid)[ki]
	
	## read intensities, if requested
	## unlike genotypes, these are in sample-major order
	ilist <- NULL
	if (intensity) {
		bii <- paste0(prefix, ".bii")
		if (file.exists(bii)) {
			message(paste0("Reading binary intensity matrices from: <", bii,">"))
			bii <- file(bii, "rb")
			
			## set up containers for results
			x <- matrix(NA, nrow = nr.keep, ncol = ni.keep,
						dimnames = dimnames(gty))
			y <- x
			
			## set offset for second part of intensity data
			ystart <- nr*ni*2
			
			## loop on samples to keep
			intr <- interactive()
			if (intr)
				pb <- txtProgressBar(min = 0, max = ni.keep, style = 3)
			for (i in seq_along(ki)) {
				# calculate offset for this sample
				s <- ki[i]-1
				offset.x <- s*nr*2
				offset.y <- ystart + s*nr*2
				
				# read intensity data
				seek(bed, offset.x, "start")
				X <- readBin(bii, integer(), 2*nr, size = 2, signed = FALSE, endian = "little")
				seek(bed, offset.y, "start")
				Y <- readBin(bii, integer(), 2*nr, size = 2, signed = FALSE, endian = "little")
				x[ ,i ] <- X[kr]/10e3
				y[ ,i ] <- Y[kr]/10e3
				
				if (intr)
					setTxtProgressBar(pb, i)
				
			}
			
			close(bii)
			ilist <- list(x = x, y = y)
		}
	}
	
	## make it a 'genotypes' object
	gty <- genotypes(gty, map = bim[kr,], ped = fam[ki,], alleles = "01",
					 intensity = ilist)
	return(gty)
	
}

#' Write a \code{genotypes} object as a PLINK binary fileset
#' 
#' @param gty an \code{genotypes} object
#' @param prefix path to the PLINK fileset to generate, excluding \code{*.bed} suffix
#' @param map a valid marker map; overrides existing map
#' @param fam sample metadata to override any existing metadata
#' @param intensity whether to write intensity matrices, if present (to \code{*.bii} file)
#' @param keep.chrnames whether to keep chromosome names as-is, or allow them to be converted to PLINK's style
#' @param ... ignored
#' 
#' @return \code{TRUE} on completion
#' 
#' @details Writes a PLINK binary fileset using pure \code{R}.  The fileset is generated in the
#' 	"variant-major" format (the default in all recent versions of PLINK), with appropriate magic
#' 	number.  Complete sample metadata and a complete marker map, including reference alleles (columns "A1"
#' 	and "A2"), are both required.
#' 	
#' 	This function only supports writing from character (allele enconding \code{"native"}) genotypes,
#' 	in order to avoid ambiguity around the correspondence between alleles and numeric genotypes.  But
#' 	numeric genotypes can be converted back to character via \code{recode.genotypes(,"native")}.
#' 	
#' 	If intensity data is saved, it is written to a binary file with suffix \code{*.bii} in little-endian
#' 	encoding.  First the X-intensity and then the Y-intensity matrix are convered to a single vector in
#' 	column-major fashion, and the result is written to disk.  Intensities are pre-multiplied by 10,000 and
#' 	truncated to integers of size 2 bytes (max 2^16-1) to save space; this means values will be clipped to
#' 	6.5535 or less. In practice, for Illumina arrays, this makes essentially no difference at all for
#' 	downstream analysis.
#' 	
#' @seealso \code{\link{read.plink}}
#' 
#' @references
#' PLINK v1.9: \url{https://www.cog-genomics.org/plink2}
#' 
#' Purcell S et al. (2007) PLINK: a toolset for whole-genome association and population-based 
#' 	linkage analysis. Am J Hum Genet 81(3): 559-575. doi:10.1086/519795.
#' 
#' @export write.plink
write.plink <- function(gty, prefix, map = NULL, fam = NULL, intensity = FALSE, keep.chrnames = FALSE, ...) {
	
	if (!inherits(gty, "genotypes"))
		warning("Input has not been blessed as genotype data; proceeding with skepticism.")
	
	if (!is.matrix(gty))
		gty <- as.matrix(gty)
	
	if (!is.character(gty))
		stop("Only character genotypes allowed for now, to avoid problems with allele coding.")
	
	if (is.null(map))
		if (!is.null(attr(gty, "map")))
			map <- attr(gty, "map")
	else
		stop("Must supply a marker map (columns: chr, marker, cM, pos, A1, A2).")
	
	if (is.null(fam))
		if (!is.null(attr(gty, "ped")))
			fam <- attr(gty, "ped")
	else
		stop("Must supply a family file (columns: fid, iid, mom, dad, sex, pheno).")
	
	fam.cols <- c("fid","iid","mom","dad","sex","pheno")
	if (!all(fam.cols %in% colnames(fam)))
		stop("Must supply a family file (columns: fid, iid, mom, dad, sex, pheno).")
	fam <- fam[ ,c(fam.cols, setdiff(colnames(fam), fam.cols)) ]
	
	if (!keep.chrnames)
		map[,1] <- convert.names(map[,1], to = "plink")
	map[is.na(map[,3]),3] <- 0
	map[is.na(map[,4]),4] <- 0
	
	o <- with(map, order(chr, pos, cM))
	map <- map[o,]
	gty <- gty[o,]
	justcalls <- .copy.matrix.noattr(gty)
	
	if (nrow(justcalls) != nrow(map))
		stop(paste0("Dimensions of genotype matrix (", nrow(justcalls) ,") and marker map (", nrow(map) ,") don't match."))
	if (ncol(justcalls) != nrow(fam))
		stop(paste0("Dimensions of genotype matrix (", ncol(justcalls) ,") and family file (", nrow(fam) ,") don't match."))
	
	message("Creating plink fileset <", prefix ,".*>...")
	message(paste0("\twriting family file (", ncol(justcalls)," individuals) ..."))
	write.plink.file(fam, paste0(prefix, ".fam"))
	message(paste0("\twriting marker map (", nrow(justcalls)," markers) ..."))
	write.plink.file(map, paste0(prefix, ".bim"))
	
	message(paste0("\tpreparing to convert genotypes to binary format..."))
	nr <- nrow(justcalls)
	ni <- ncol(justcalls)
	## block size in bytes: (number of individuals)/4, to nearest byte
	bsz <- ceiling(ni/4)
	
	## open bed file and write magic number
	bed <- file(paste0(prefix, ".bed"), open = "wb")
	magic <- packBits( c( rev(as.logical(c(0,1,1,0,1,1,0,0))),
						  rev(as.logical(c(0,0,0,1,1,0,1,1))),
						  rev(as.logical(c(0,0,0,0,0,0,0,1)))) )
	writeBin(magic, bed)
	
	intr <- interactive()
	if (intr)
		pb <- txtProgressBar(min = 0, max = nrow(justcalls), style = 3)
	for (i in seq_len(nrow(justcalls))) {
		row <- justcalls[ i, ]
		row[ is.na(row) ] <- "N"
		a1 <- row == map[i,5]
		a2 <- row == map[i,6]
		het <- row == "H"
		miss <- row == "N" | is.na(row)
		row2 <- logical(2*length(row))
		if (any(a2)) {
			j <- which(a2)-1
			row2[ 2*j + 1 ] <- TRUE
			row2[ 2*j + 2 ] <- TRUE
		}
		if (any(miss))
			row2[ 2*(which(miss)-1) + c(1) ] <- TRUE
		if (any(het))
			row2[ 2*(which(het)-1) + c(2) ] <- TRUE
		towrite <- logical(8*bsz)
		towrite[ 1:length(row2) ] <- row2
		writeBin(packBits(towrite, "raw"), bed)
		if (intr)
			setTxtProgressBar(pb, i)
	}
	close(bed)
	
	## write intensity matrices if needed
	if (intensity && .has.valid.intensity(gty)) {
		message(paste0("\tsaving intensities to binary format..."))
		# convert intensities to integer values (may lose precision)
		X <- as.integer(pmin(10e3*attr(gty , "intensity")$x, 2^16-1))
		Y <- as.integer(pmin(10e3*attr(gty , "intensity")$y, 2^16-1))
		# open file
		bii <- file(paste0(prefix, ".bii"), "wb")
		# write values, always little-endian
		writeBin(X, bii, size = 2, endian = "little")
		writeBin(Y, bii, size = 2, endian = "little")
		# close file
		close(bii)
	}
	
	invisible(TRUE)
	
}

### utilities and internals

## convert plink-style to UCSC-style chromosome identifiers
unplink.chroms <- function(x, ...) {
	
	mm <- paste0("chr", c(1:19,"X","Y","M"))
	chrs <- setNames( mm, c(1:19,23,24,26) )
	chrs <- c(chrs, "X" = "chrX", "Y" = "chrY", "MT" = "chrM")
	factor(chrs[ as.character(x) ], levels = mm)
	
}

## create a fam-formatted dataframe for plink
make.fam <- function(ids, fid = NULL, mom = 0, dad = 0, sex = 0, pheno = -9, ...) {
	
	if (is.character(ids)) {
		resex <- rep(0, length(ids))
		resex[ grepl("^[mM]", sex) ] <- 1
		resex[ grepl("^[fF]", sex) ] <- 2
		resex[ is.na(resex) ] <- 0
	}
	else {
		resex <- factor(sex, levels = c(1,2))
		resex <- as.numeric(resex)
		resex[ is.na(resex) ] <- 0
	}
	
	pheno[ is.na(pheno) ] <- -9
	ids <- gsub(" ","", as.character(ids))
	mom <- gsub(" ","", as.character(mom))
	dad <- gsub(" ","", as.character(dad))
	if (is.null(fid))
		fid <- ids
	if (length(ids) != length(unique(ids)))
		warning("IDs are not unique; plink might complain.")
	
	fam <- data.frame(fid = fid, iid = ids,
					  mom = mom, dad = dad, sex = resex, pheno = pheno,
					  stringsAsFactors = FALSE)
	rownames(fam) <- as.character(fam$iid)
	return(fam)
	
}

## shortcut to write tab-separated output files without rownames, colnames, quotes
write.plink.file <- function(x, ..., fix.missing = NA) {
	
	## convert NAs to specific missing-value code for plink
	if (!is.na(fix.missing)) {
		for (i in colnames(x))
			x[ is.na(x[,i]),i ] <- fix.missing
	}
	
	write.table(x, ..., col.names = FALSE, row.names = FALSE, quote = FALSE, sep = "\t")
}

#' Create a pointer to a PLINK fileset
#' 
#' @param gty a length-1 character vector containing the path to an existing PLINK fileset, OR a
#' 	\code{genotypes} object which will be converted to one
#' @param where if \code{gty}, directory in which to write the new PLINK fileset
#' @param prefix basename of the fileset to be created
#' @param flags (ignored)
#' @param ... ignored
#' 
#' @return a length-1 character vector with path to a PLINK binary fileset, of class \code{"plink"}
#' 
#' @details Command-line PLINK takes its input from filesets on disk. This function creates a pointer to such
#' 	a fileset after checking that all its members (\code{*.bed}, \code{*.bim}, \code{*.fam}) are present.
#' 	Wrapper functions for various PLINK analyses accept this pointer as input.
#' 	
#' 	When the fileset alraedy exists, an additional attribute (\code{"where"}) is added to the result
#' 	specifying where to place the output of any future PLINK calls.  By default such files are routed
#' 	to the temporary directory corresponding to the current \code{R} session, to avoid cluttering the
#' 	directory containing the source fileset.
#' 	
#'	When the fileset does not exist and \code{gty} is a \code{genotypes} object, a new fileset is created
#'	in the location specified by \code{where} via a call to \code{write.plink(gty)}.  Then the path to
#'	that fileset is returned. 
#'
#' @export
plinkify <- function(gty, where = tempdir(), prefix = "stuff", flags = "", ...) {
	
	## allow shortcut to bless paths of existing plink files
	if (inherits(gty, "character") && is.atomic(gty)) {
		gty <- gsub("\\.bed$", "", gty)
		paths <- sapply(c("bim","bed","fam"), function(x) file.exists(paste0(gty, ".", x)))
		if (all(paths)) {
			prefix <- as.character(gty)
			attr(prefix, "working") <- where
			class(prefix) <- c("plink", class(prefix))
			return(prefix)
		}
		else {
			stop("Can't find a plink fileset by this name in specified location.")
		}
	}
	
	## only move forward if input is in right form
	if (!inherits(gty, "genotypes"))
		stop("Plinkification of non-'genotypes' objects is not yet implemented.")
	
	prefix <- file.path(where, prefix)
	attr(prefix, "working") <- where
	system(paste0("rm ", prefix, "*"))
	write.plink(gty, paste0(prefix))
	
	class(prefix) <- c("plink", class(prefix))
	return(prefix)
	
}

#' @export
summary.plink <- function(object, ...) {
	
	cat("-- Pointer to a PLINK fileset -- ","\n")
	cat("Source:", normalizePath(paste0(object, ".bed")), "\n")
	cat("Ouput dir:", normalizePath(attr(object, "working")), "\n")
	
}

#' @export
print.plink <- function(x, ...) {
	summary.plink(x, ...)
}

## read a plink fam/tfam file
read.fam <- function(f, pheno.missing = c(0,-9), ...) {
	
	ff <- f
	if (!file.exists(ff)) {
		ff <- paste0(f, ".fam")
		if (!file.exists(ff)) {
			ff <- paste0(f, ".tfam")
			if (!file.exists(ff))
				stop("Oops -- can't find a plink family file by that name.")
		}
	}
	fam <- read.table(ff, header = FALSE, comment.char = "", stringsAsFactors = FALSE)
	colnames(fam) <- c("fid","iid","mom","dad","sex","pheno")
	rownames(fam) <- as.character(fam$iid)
	fam$iid <- as.character(fam$iid)
	
	## convert plink's missing-data characters to proper NAs
	fam$pheno[ fam$pheno %in% c(pheno.missing) ] <- NA
	
	return(fam)
	
}

## read plink bim/map file
read.map <- function(f, keep.chrnames = FALSE, ...) {
	
	ff <- f
	if (!file.exists(ff)) {
		ff <- paste0(f, ".bim")
		if (!file.exists(ff)) {
			ff <- paste0(f, ".map")
			if (!file.exists(ff))
				stop("Oops -- can't find a plink bim/map file by that name.")
		}
	}
	fam <- read.table(ff, header = FALSE, stringsAsFactors = FALSE)
	colnames(fam) <- c("chr","marker","cM","pos","A1","A2")
	if (!keep.chrnames)
		fam$chr <- unplink.chroms(fam$chr)
	fam$cM[ fam$cM == 0 ] <- NA
	rownames(fam) <- as.character(fam$marker)
	
	return(fam)
	
}


### wrappers for specific plink commands

## run a plink command; check that expected output is present
.plink.command <- function(prefix, command, expected = list(), where = NULL,
						   suffix = NULL, overwrite = TRUE, capture = FALSE, 
						   remove = NULL, keep = NULL, flags = "", ...) {
	
	## require that <prefix> has been blessed as a plink fileset
	if (!inherits(prefix, "plink"))
		stop("Not convinced the input is a pointer to a plink fileset.")
	if (is.null(where))
		if (!is.null(attr(prefix, "working")))
			where <- attr(prefix, "working")
	else
		where <- dirname(prefix)
	outprefix <- file.path(where, basename(prefix))
	if (!is.null(suffix))
		outprefix <- paste0(outprefix, ".", gsub("^\\.", "", suffix))
	
	if (!file.exists(paste0(prefix, ".bed")) & file.exists(paste0(prefix, ".tped"))) {
		cmd <- paste0("plink --tfile ", prefix, " --make-bed --out ", outprefix)
		system(cmd, intern = FALSE)
	}
	
	expected <- sapply(expected, function(f) paste0(outprefix, ".", f))
	for (f in expected) {
		if (file.exists(f)) {
			warning(paste0("File <", f, "> already exists; this command will overwrite it."))
			if (!overwrite)
				return(FALSE)
		}
	}
	
	if (!is.null(remove)) {
		rm.ff <- tempfile()
		write.table(remove, rm.ff, quote = FALSE, row.names = FALSE, col.names = FALSE, sep = "\t")
		flags <- paste("--remove", rm.ff, flags)
	}
	
	if (!is.null(keep)) {
		keep.ff <- tempfile()
		write.table(keep, keep.ff, quote = FALSE, row.names = FALSE, col.names = FALSE, sep = "\t")
		flags <- paste("--keep", keep.ff, flags)
	}
	
	cmd <- paste0("plink --bfile ", prefix, " ", command, " --out ", outprefix, " --allow-extra-chr ", flags)
	rez <- system(cmd, intern = capture)
	success <- sapply(expected, file.exists)
	if (capture) {
		return(rez)
	}
	else {
		if (all(success) | !length(success)) {
			return(outprefix)
		}
		else {
			warning( paste("Call to plink failed; command was '", cmd, "'") )
			warning( paste("Couldn't find output file(s):", paste(expected[!success], collapse = ",")) )
			return(FALSE)
		}
	}
	
}

#' Prune markers by pairwise LD with PLINK
#' 
#' @param prefix a pointer to a PLINK fileset (of class \code{plink})
#' @param prune.by command-line flags to control LD-pruning
#' @param ... ignored
#' 
#' @return a pointer to a new, pruned PLINK fileset
#' 
#' @details See the relevant PLINK documentation for details of the underlying calculations.
#'
#' @references
#' PLINK v1.9: \url{https://www.cog-genomics.org/plink2}
#' 
#' Purcell S et al. (2007) PLINK: a toolset for whole-genome association and population-based 
#' 	linkage analysis. Am J Hum Genet 81(3): 559-575. doi:10.1086/519795.
#' 
#' @export
prune.plink <- function(prefix, prune.by = "--make-founders --indep 50 5 2", ...) {
	
	success <- .plink.command(prefix, prune.by, list("prune.in"))
	keep <- paste0(success, ".prune.in")
	cmd <- paste("--extract", keep, "--make-bed")
	success <- .plink.command(prefix, cmd, list("bed","bim","fam"), suffix = "pruned", ...)
	if (is.character(success)) {
		class(success) <- c("plink", class(success))
		return(success)
	}
	else {
		stop( paste("LD pruning failed.") )
	}
	
}

#' Perform a genome-wide association scan with PLINK
#' 
#' @param prefix a pointer to a PLINK fileset (of class \code{plink})
#' @param model statistical model for association between genotype and phenotype (\code{"assoc"} for
#' case-control contingency table; \code{"linear"} for linear model on quantitative trait;
#' \code{"logistic"} for case-control logistic regression)
#' @param test family of hypothesis test(s) to perform
#' @param perms logical: establish significance thresholds using adaptive permutation, or skip it?
#' @param nperms integer; maximum number of permutations
#' @param geno.missing drop markers with call rate lower than this threshold
#' @param ind.missing drop samples with call rate lower than this threshold
#' @param maf drop markers with minor-allele frequency lower than this threhsold
#' @param hwe drop markers p-value less than this threshold for test of Hardy-Weinbery equilibrium
#' @param flags additional command-line flags passed directly to PLINK call
#' @param ... ignored
#' 
#' @return a dataframe with association results, having the following columns:
#' \itemize{
#' \item chr
#' \item pos
#' \item marker
#' \item A1 -- reference allele (interpretation depends on dataset)
#' \item p.value -- p-value for hypothesis test
#' \item OR -- odds ratio (in case-control context)
#' \item n -- count of non-missing genotypes at this marker
#' \item test -- which family of hypothesis test this is
#' \item p.value.perm -- empirical p-value from permutations, if applicable
#' }
#' 
#' @details See the relevant PLINK documentation for details of the underlying calculations. Note
#' 	that PLINK doesn't perform kinship correction.  There now exists other, smarter software for
#' 	GWAS including \code{EMMAX}, \code{FastLMM} and \code{LIMIX}.
#'
#' @references
#' PLINK v1.9: \url{https://www.cog-genomics.org/plink2}
#' 
#' Purcell S et al. (2007) PLINK: a toolset for whole-genome association and population-based 
#' 	linkage analysis. Am J Hum Genet 81(3): 559-575. doi:10.1086/519795.
#' 
#' @export
assoc.plink <- function(prefix, model = c("assoc","linear","logistic"),
						test = c("additive","genotypic","hethom","dominant","recessive"),
						perms = TRUE, nperms = 1e5, geno.missing = 0, ind.missing = 0, maf = 0, hwe = 1,
						flags = "--keep-allele-order --allow-no-sex --nonfounders", ...) {
	
	model <- match.arg(model)
	test <- match.arg(test)
	if (test == "additive") {
		test <- ""
	}
	if (model == "assoc") {
		suffix <- ".assoc"
		cmd <- paste("--assoc")
	}
	else if (model == "linear") {
		suffix <- ".assoc.linear"
		cmd <- paste("--linear standard-beta", test)
	}
	else if (model == "logistic") {
		suffix <- ".assoc.logistic"
		cmd <- paste("--logistic", test)
	}
	
	if (perms & n.perms > 0)
		cmd <- paste(cmd, "perm --aperm 5", formatC(n.perms, format = "d"))
	cmd <- paste(cmd, flags)
	message(cmd)
	
	where <- dirname(prefix)
	if (geno.missing > 0)
		cmd <- paste(cmd, "--geno", geno.missing)
	if (ind.missing > 0)
		cmd <- paste(cmd, "--mind", ind.missing)
	if (maf > 0)
		cmd <- paste(cmd, "--maf", maf)
	if (hwe < 1)
		cmd <- paste(cmd, "--hwe", hwe, "midp include-nonctrl")
	
	expect <- list(gsub("^\\.", "", suffix))
	if (perms)
		expect <- c(expect, paste0(expect[[1]], ".perm"))
	success <- .plink.command(prefix, cmd, expect, ...)
	if (is.character(success)) {
		.df <- read.table(paste0(success, suffix), header = TRUE, stringsAsFactors = FALSE)
		if (model == "assoc") {
			df <- with(.df, data.frame(chr = CHR, pos = BP, marker = SNP, A1 = A1,
									   p.value = P, OR = OR, stringsAsFactors = FALSE))
		}
		else if (model == "logistic") {
			df <- with(.df, data.frame(chr = CHR, pos = BP, marker = SNP, A1 = A1,
									   p.value = P, OR = OR, n = NMISS, test = TEST,
									   stringsAsFactors = FALSE))
		}
		else if (model == "linear") {
			df <- with(.df, data.frame(chr = CHR, pos = BP, marker = SNP, A1 = A1,
									   p.value = P, beta = BETA, score = STAT, n = NMISS, test = TEST,
									   stringsAsFactors = FALSE))
		}
		df$chr <- unplink.chroms(df$chr)
		if (perms) {
			pdf <- read.table(paste0(success, paste0(suffix, ".perm")), header = TRUE, stringsAsFactors = FALSE)
			rownames(pdf) <- as.character(pdf$SNP)
			df$p.value.perm <- NA
			df$p.value.perm <- pdf[ as.character(df$marker),"EMP1" ]
		}
		return(df)
	}
	else {
		stop("Association-mapping failed.")
	}
	
}

#' Perform family-based transmission diseqiulibrium test (TDT) with PLINK
#' 
#' @param prefix a pointer to a PLINK fileset (of class \code{plink})
#' @param perms logical: establish significance thresholds using adaptive permutation, or skip it?
#' @param geno.missing drop markers with call rate lower than this threshold
#' @param ind.missing drop samples with call rate lower than this threshold
#' @param maf drop markers with minor-allele frequency lower than this threhsold
#' @param hwe drop markers p-value less than this threshold for test of Hardy-Weinbery equilibrium
#' @param flags additional command-line flags passed directly to PLINK call
#' @param ... ignored
#' 
#' @return a dataframe with association results, having the following columns:
#' \itemize{
#' \item chr
#' \item pos
#' \item marker
#' \item A1 -- reference allele (interpretation depends on dataset)
#' \item p.value -- p-value for hypothesis test
#' \item OR -- odds ratio (in case-control context)
#' \item n -- count of non-missing genotypes at this marker
#' \item test -- which family of hypothesis test this is
#' \item p.value.perm -- empirical p-value from permutations, if applicable
#' }
#' 
#' @details Requires a case-control phenotype. See the relevant PLINK documentation for details
#' 	of the underlying calculations.
#'
#' @references
#' PLINK v1.9: \url{https://www.cog-genomics.org/plink2}
#' 
#' Purcell S et al. (2007) PLINK: a toolset for whole-genome association and population-based 
#' 	linkage analysis. Am J Hum Genet 81(3): 559-575. doi:10.1086/519795.
#' 
#' @export
tdt.plink <- function(prefix, perms = FALSE, geno.missing = 0, ind.missing = 0, maf = 0, hwe = 1,
						flags = "--keep-allele-order --allow-no-sex --nonfounders", ...) {
	
	cmd <- paste("--tdt", flags)
	if (perms)
		cmd <- paste(cmd, "--perm")
	
	where <- dirname(prefix)
	if (geno.missing > 0)
		cmd <- paste(cmd, "--geno", geno.missing)
	if (ind.missing > 0)
		cmd <- paste(cmd, "--mind", ind.missing)
	if (maf > 0)
		cmd <- paste(cmd, "--maf", maf)
	if (hwe < 1)
		cmd <- paste(cmd, "--hwe", hwe, "midp include-nonctrl")
	
	expect <- list(gsub("^\\.", "", "tdt"))
	if (perms)
		expect <- c(expect, paste0(expect[[1]], ".perm"))
	success <- .plink.command(prefix, cmd, expect, ...)
	if (is.character(success)) {
		.df <- read.table(paste0(success, ".tdt"), header = TRUE, stringsAsFactors = FALSE)
		df <- with(.df, data.frame(chr = CHR, pos = BP, marker = SNP, A1 = A1,
									   p.value = P_COM, OR = OR))
		df$chr <- unplink.chroms(df$chr)
		if (perms) {
			pdf <- read.table(paste0(success, paste0("tdt", ".perm")), header = TRUE, stringsAsFactors = FALSE)
			rownames(pdf) <- as.character(pdf$SNP)
			df$p.value.perm <- NA
			df$p.value.perm <- pdf[ as.character(df$marker),"EMP1" ]
		}
		return(df)
	}
	else {
		stop("Association-mapping failed.")
	}
	
}

## compute genome-wide IBD estimate (\hat{pi}) with plink, optionally with initial LD pruning step
ibd.plink <- function(prefix, flags = "--nonfounders", prune = FALSE, as.matrix = TRUE, ...) {
	
	prefix.new <- prefix
	if (prune) {
		prefix.new <- prune.plink(prefix, ...)
	}
	
	cmd <- paste("--genome full --min 0.0", flags)
	success <- .plink.command(prefix.new, cmd, list("genome"), ...)
	if (is.character(success)) {
		ibd.file <- paste0(success, ".genome")
		ibd <- read.table(ibd.file, header = TRUE, strip.white = TRUE, stringsAsFactors = FALSE)
		if (as.matrix) {
			iid <- union( ibd$IID1, ibd$IID2 )
			nind <- length(iid)
			K <- matrix(0, nrow = nind, ncol = nind)
			colnames(K) <- iid
			rownames(K) <- iid
			for (i in 1:nrow(ibd)) {
				K[ ibd$IID1[i], ibd$IID2[i] ] <- ibd$PI_HAT[i]
			}
			return(Matrix::Matrix(K, sparse = TRUE))
		}
		else
			return(ibd)
	}
	else {
		stop( paste0("IBD detection failed.") )
	}
	
}

#' Compute realized kinship matrix (aka GRM) with PLINK
#' 
#' @param prefix a pointer to a PLINK fileset (of class \code{plink})
#' @param flags command-line flags passed directly to underlying PLINK call
#' @param method command-line flags for \code{PLINK}, controlling underlying calculation
#' @param prune logical; if \code{TRUE}, do a round of LD-pruning before calculating kinship matrix
#' @param ... ignored
#' 
#' @return a square matrix of genetic distances (or similarities), with row and column names
#' 	equal to the sample IDs
#' 
#' @details See the relevant PLINK documentation for details of the underlying calculations.
#' 	The default is to compute the 'Mahnattan distance' between samples, ie. 1 minus the mean
#' 	number alleles shared identical-by-state across all markers.
#'
#' @references
#' PLINK v1.9: \url{https://www.cog-genomics.org/plink2}
#' 
#' Purcell S et al. (2007) PLINK: a toolset for whole-genome association and population-based 
#' 	linkage analysis. Am J Hum Genet 81(3): 559-575. doi:10.1086/519795.
#' 
#' @export
kinship.plink <- function(prefix, method = "--distance square 1-ibs", flags = "", prune = FALSE, ...) {
	
	prefix.new <- prefix
	if (prune) {
		prefix.new <- prune.plink(prefix, ...)
	}
	
	suffix <- "mdist"
	if (grepl("--make-rel", method))
		suffix <- "rel"
	
	cmd <- paste(method, flags)
	success <- .plink.command(prefix.new, cmd, list(suffix, paste0(suffix, ".id")), ...)
	if (is.character(success)) {
		kin.file <- paste0(success, ".", suffix)
		kin.ids <- paste0(success, ".", suffix, ".id")
		K <- as.matrix(read.table(kin.file, stringsAsFactors = FALSE))
		colnames(K) <- rownames(K) <- read.table(kin.ids, stringsAsFactors = FALSE)[,2]
		return(K)
	}
	else {
		stop("Kinship estimation failed.")
	}
	
}

#' Compute Weir & Cockerham's F_st estimator using PLINK
#' 
#' @param prefix a pointer to a PLINK fileset (of class \code{plink})
#' @param by a vector of population labels; if not specified, family ID is used
#' @param per.locus logical: if \code{TRUE}, return one value per marker; else return aggregate
#' @param chr restrict analysis to this chromosome (if \code{NULL}, all chromosomes)
#' @param flags command-line flags passed directly to underlying PLINK call
#' @param ... ignored
#' 
#' @return a square matrix of pairwise F_st values; diagonal has mean heterozygosity within
#' 	each population as defined by the labels in \code{by}
#' 
#' @details The F_st statistic as proposed by Sewall Wright is a measure of allele-frequency
#' 	differentiation between two populations. See the relevant PLINK documentation for details
#' 	of the underlying calculations, and Weir & Cockerham (1984) for derivation of the estimator.
#'
#' @references
#' PLINK v1.9: \url{https://www.cog-genomics.org/plink2}
#' 
#' Purcell S et al. (2007) PLINK: a toolset for whole-genome association and population-based 
#' 	linkage analysis. Am J Hum Genet 81(3): 559-575. doi:10.1086/519795.
#' 	
#' Wright S (1931) Evolution in Mendelian populations. Genetics 16: 97-159.
#' 
#' Weir BS & Cockerham CC (1984) Estimating F-statistics for the analysis of population
#' 	structure. Evolution 38(6): 1358-1370.
#' 
#' @aliases weir.fst
#' @export weir.fst
weir.fst.plink <- function(prefix, by = NULL, per.locus = FALSE, chr = NULL, flags = "", ...) {
	
	if (!inherits(prefix, "plink"))
		stop("Not convinced the input is a pointer to a plink fileset.")
	
	if (!is.null(chr))
		prefix <- filter.plink(prefix, chr = chr)
	print(prefix)
	
	fam <- read.fam(prefix)
	by.flags <- "--family"
	if (!is.null(by)) {
		ff <- tempfile()
		if (length(by) != nrow(fam))
			stop("Group labels and family file have different lengths.")
		write.table(data.frame(fid = fam$fid, iid = fam$iid, group = by), ff,
					col.names = FALSE, row.names = FALSE, quote = FALSE)
		by.flags <- paste0("--within ", ff)
	}
	else {
		by <- fam$fid
	}
	
	## compute off-diagonals: Fst
	cl <- levels(factor(by))
	pairs <- combn(cl, 2)
	fst <- matrix(0, nrow = length(cl), ncol = length(cl))
	loci <- NULL
	rownames(fst) <- cl
	colnames(fst) <- cl
	for (i in seq_len(ncol(pairs))) {
		by.flags.pair <- paste(by.flags, "--keep-cluster-names", paste(as.vector(pairs[,i]), collapse = " "))
		cmd <- paste("--fst", by.flags.pair, flags)
		print(cmd)
		rez <- .plink.command(prefix, cmd, list("fst"), capture = TRUE, ...)
		if (is.character(rez)) {
			fst[ pairs[1,i], pairs[2,i] ] <- as.numeric(unlist(strsplit(rez[ length(rez) ], ":"))[2])
			fst[ pairs[2,i], pairs[1,i] ] <- fst[ pairs[1,i], pairs[2,i] ]
			if (per.locus) {
				rez <- read.table(paste0(file.path(attr(prefix, "working"), basename(prefix)), ".fst"), header = TRUE, stringsAsFactors = FALSE)
				colnames(rez) <- c("chr","marker","pos","n","fst")
				rez$pop1 <- factor(pairs[1,i], levels = cl)
				rez$pop2 <- factor(pairs[2,i], levels = cl)
				rez$chr <- unplink.chroms(rez$chr)
				loci <- rbind(loci, rez)
			}
			
		}
		else
			stop("Calculation of F_st failed.")
	}
	
	## compute on-diagonals: mean F
	cmd <- paste("--het")
	success <- .plink.command(prefix, cmd, list("het"), ...)
	if (is.character(success)) {
		rez <- read.table(paste0(success, ".het"), header = TRUE, stringsAsFactors = FALSE)
		colnames(rez) <- c("fid","iid","hom.o","hom.e","nmar","fhat")
	}
	else
		stop("Estimation of inbreeding coefficients failed.")
	
	fhat <- tapply(rez$fhat, by, mean)
	for (i in cl) {
		fst[ i,i ] <- fhat[i]
	}
	
	if (per.locus)
		return( list(matrix = fst, loci = loci) )
	else
		return(fst)
}

## get count of Hs and Ns by sample with plink
## NOT WORKING
qc.plink <- function(prefix, flags = "--nonfounders", ...) {
	
	# missingness
	missfile <- paste0(prefix, ".imiss")
	cmd <- paste("--missing --out", prefix, flags)
	success <- .plink.command(prefix, cmd, list(missfile))
	if (!success)
		stop("Attempt to compute missing-genotype rate failed.")
	
	# heterozygosity
	hetfile <- paste0(prefix, ".het")
	cmd <- paste("--het --out", prefix, flags)
	success <- .plink.command(prefix, cmd, list(hetfile), ...)
	if (!success)
		stop("Attempt to compute heterozygosity failed.")
	
	nmiss <- read.table(missfile, header = TRUE, stringsAsFactors = FALSE)
	nhom <- read.table(hetfile, header = TRUE, stringsAsFactors = FALSE)
	nmiss <- cbind(nmiss, nhom[ ,-(1:2) ])
	nmiss$H <- with(nmiss, (N_GENO - O.HOM. - N_MISS + 1))
	nmiss$N <- nmiss$N_MISS
	nmiss$valid <- nmiss$N_GENO
	rez <- nmiss[ ,c("FID","IID","H","N","valid") ]
	
	return(rez)
	
}

#' Compute pairwise LD between markers with PLINK
#' 
#' @param prefix a pointer to a PLINK fileset (of class \code{plink})
#' @param dprime logical; if \code{TRUE}, compute Lewontin D-prime instead of R2
#' @param index.snp name of the marker used to anchor the calculation
#' @param markers compute all pairwise LD values between markers in this list
#' @param chr limit analysis to this chromosome
#' @param from with \code{chr}, limit analysis to window starting at this position
#' @param to with \code{chr}, limit analysis to window ending at this position
#' @param window (ignored)
#' @param window.r2 keep calculating until pairwise LD falls below this threshold
#' @param flags command-line flags passed directly to underlying PLINK call
#' @param ... ignored
#' 
#' @return a \code{data.table} of pairwise LD values, one marker pair per row
#' 
#' @details Linkage disequlibrium (LD) is estimated here via the squared genetic correlation (r^2)
#' 	between loci.  Since the size of the output scales with the square of the number of markers and
#' 	can easily grow to gigabytes for a dataset with >10k markers, it is highly recommended
#' 	to limit the calculation using the parameters above.
#' 	
#' 	See the relevant PLINK documentation for details of the underlying calculations.
#'
#' @references
#' PLINK v1.9: \url{https://www.cog-genomics.org/plink2}
#' 
#' Purcell S et al. (2007) PLINK: a toolset for whole-genome association and population-based 
#' 	linkage analysis. Am J Hum Genet 81(3): 559-575. doi:10.1086/519795.
#' 	
#' Lewontin RC (1960) The interaction of selection and linkage. I. General considerations; heterotic
#' 	models. Genetics 49(1): 49-67.
#' 
#' @export
ld.plink <- function(prefix, dprime = FALSE, index.snp = NULL, markers = NULL, chr = NULL, from = NULL, to = NULL,
					 window = NULL, window.r2 = 0, flags = "", ...) {
	
	if (!inherits(prefix, "plink"))
		stop("Not convinced the input is a pointer to a plink fileset.")
	where <- dirname(prefix)
	
	cmd <- paste0("--r2 inter-chr")
	if (dprime)
		cmd <- paste(cmd, "dprime")
	if (!is.null(index.snp))
		cmd <- paste0(cmd, " --ld-snp ", index.snp)
	if (!is.null(markers)) {
		mkfile <- tempfile()
		write.table(data.frame(markers = markers), mkfile, quote = FALSE, col.names = FALSE, row.names = FALSE)
		cmd <- paste0(cmd, " --ld-snp-list ", mkfile)
	}
	if (!is.null(chr))
		cmd <- paste0(cmd, " --chr ", paste(gsub("^chr","", as.character(chr))))
	if (!is.null(from))
		cmd <- paste0(cmd, " --from-bp ", formatC(from, format = "d"))
	if (!is.null(to))
		cmd <- paste0(cmd, " --to-bp ", formatC(to, format = "d"))
	if (!is.null(window.r2))
		cmd <- paste0(cmd, " --ld-window-r2 ", window.r2)
	cmd <- paste(cmd, flags)
	
	success <- .plink.command(prefix, cmd, list("ld"), ...)
	if (is.character(success)) {
		ld.file <- paste0(success, ".ld")
		ld <- data.table::data.table(read.table(ld.file, header = TRUE, strip.white = TRUE, stringsAsFactors = FALSE))
		return(ld)
	}
	else {
		stop( paste0("LD estimation failed.") )
	}
	
}

#' Filter markers and samples from a dataset with PLINK
#' 
#' @param prefix a pointer to a PLINK fileset (of class \code{plink})
#' @param out filename prefix for filtered PLINK fileset; overrides \code{prefix}
#' @param chr keep only markers on this chromosome
#' @param from with \code{chr}, keep only markers with position greater than this
#' @param to with \code{chr}, keep only markers with position less than this
#' @param maf drop markers with minor-allele frequency lower than this threhsold
#' @param hwe drop markers p-value less than this threshold for test of Hardy-Weinbery equilibrium
#' @param geno.missing drop markers with call rate lower than this threshold
#' @param ind.missing drop samples with call rate lower than this threshold
#' @param attrib with \code{attrib.file}, keep only samples with these attribute(s) (labels)
#' @param attrib.file with \code{attrib}, a file containing family IDs, sample IDs and attribute(s)
#' @param remove character vector of samples to exclude
#' @param keep character vector of samples to keep
#' @param remove.fam character vector of family IDs to exclude
#' @param keep.fam character vector of family IDs to keep
#' @param flags additional command-line flags passed directly to PLINK call
#' @param ... ignored
#' 
#' @return a pointer (of class \code{plink}) to a new PLINK binary fileset after application
#' 	of the above filters
#' 	
#' @details See the relevant PLINK documentation for details of the filters and precedence rules
#' 	governing how they are applied.
#'
#' @references
#' PLINK v1.9: \url{https://www.cog-genomics.org/plink2}
#' 
#' Purcell S et al. (2007) PLINK: a toolset for whole-genome association and population-based 
#' 	linkage analysis. Am J Hum Genet 81(3): 559-575. doi:10.1086/519795.
#' 
#' @export
filter.plink <- function(prefix, out = NULL, chr = NULL, from = NULL, to = NULL,
						 maf = 0, hwe = 1, geno.missing = 0,
						 ind.missing = 0, attrib = NULL, attrib.file = NULL,
						 remove = NULL, keep = NULL, remove.fam = NULL, keep.fam = NULL,
						 flags = "", ...) {
	
	if (!inherits(prefix, "plink"))
		stop("Not convinced the input is a pointer to a plink fileset.")
	where <- dirname(prefix)
	
	cmd <- "--make-bed"
	if (!is.null(attrib)) {
		if (file.exists(attrib.file))
			cmd <- paste(cmd, "--attrib-indiv", attrib.file, paste(attrib, collapse = ","))
		else
			stop("Filtering of individuals by attributes was requested but attribute file couldn't be found.")
	}
	if (!is.null(chr))
		cmd <- paste0(cmd, " --chr ", paste(gsub("^chr","", as.character(chr))))
	if (!is.null(from))
		cmd <- paste0(cmd, " --from-bp ", formatC(from, format = "d"))
	if (!is.null(to))
		cmd <- paste0(cmd, " --to-bp ", formatC(to, format = "d"))
	if (ind.missing > 0)
		cmd <- paste(cmd, "--mind", ind.missing)
	if (maf > 0)
		cmd <- paste(cmd, "--maf", maf)
	if (hwe < 1)
		cmd <- paste(cmd, "--hwe", hwe, "midp include-nonctrl")
	cmd <- paste(cmd, flags)
	
	if (!is.null(remove.fam)) {
		fam <- read.fam(prefix)
		ff <- tempfile()
		write.plink.file(subset(fam, fid %in% remove.fam)[,1:2], ff)
		cmd <- paste(cmd, "--remove", ff)
	}
	
	if (!is.null(remove)) {
		fam <- read.fam(prefix)
		ff <- tempfile()
		write.plink.file(subset(fam, iid %in% remove)[,1:2], ff)
		cmd <- paste(cmd, "--remove", ff)
	}
	
	if (!is.null(keep.fam)) {
		fam <- read.fam(prefix)
		ff <- tempfile()
		write.plink.file(subset(fam, fid %in% keep.fam)[,1:2], ff)
		cmd <- paste(cmd, "--keep", ff)
	}
	
	if (!is.null(keep)) {
		fam <- read.fam(prefix)
		ff <- tempfile()
		write.plink.file(subset(fam, iid %in% keep)[,1:2], ff)
		cmd <- paste(cmd, "--keep", ff)
	}
	
	## allow output to user-specified location instead of TMPDIR
	if (!is.null(out)) {
		attr(prefix, "working") <- dirname(out)
		sfx <- basename(out)
	}
	else {
		sfx <- "filtered"
	}
	success <- .plink.command(prefix, cmd, list("bed","bim","fam"), suffix = sfx, ...)
	if (is.character(success)) {
		class(success) <- c("plink", class(success))
		if (!is.null(attr(prefix, "working")))
			attr(success, "working") <- attr(prefix, "working")
	}
	else {
		stop( paste0("Filtering of plink dataset failed.") )
	}
	
	return(success)
	
}

#' Perform classical multidimensional scaling (MDS) with PLINK
#' 
#' @param prefix a pointer to a PLINK fileset (of class \code{plink})
#' @param flags command-line flags passed directly to underlying PLINK call
#' @param K project samples onto this many dimensions
#' @param ... ignored
#' 
#' @return a dataframe with individual IDs, family IDs, and then projections in columns
#' 	"MDS1"..."MDS{k}".
#' 
#' @details See the relevant PLINK documentation for details of the underlying calculations.
#' 	The default is to perform MDS on autosomal genotypes only.  Scaled eigenvalues (ie. percent
#' 	variance explained by each dimension) are provided in \code{attr(,"eigvals")}.
#'
#' @references
#' PLINK v1.9: \url{https://www.cog-genomics.org/plink2}
#' 
#' Purcell S et al. (2007) PLINK: a toolset for whole-genome association and population-based 
#' 	linkage analysis. Am J Hum Genet 81(3): 559-575. doi:10.1086/519795.
#' 
#' @seealso \code{\link{pca.plink}}
#' 
#' @export
mds.plink <- function(prefix, flags = "--autosome", K = 3, ...) {
	
	if (!inherits(prefix, "plink"))
		stop("Not convinced the input is a pointer to a plink fileset.")
	where <- dirname(prefix)
	
	if (K < 1 || !is.numeric(K))
		stop("Can't do MDS with no dimensions; specify a number K > 0.")
	
	cmd <- paste("--cluster --mds-plot", K, "eigvals", flags)
	success <- .plink.command(prefix, cmd, list("mds","mds.eigvals"), ...)
	if (is.character(success)) {
		mds.file <- paste0(success, ".mds")
		mds <- read.table(mds.file, header = TRUE, strip.white = TRUE, stringsAsFactors = FALSE)
		colnames(mds) <- c("fid","iid","cluster",paste0("MDS", seq_len(K)))
		eigs <- read.table(paste0(success, ".mds.eigvals"), header = FALSE, stringsAsFactors = FALSE)[,1]
		attr(mds, "eigvals") <- eigs/sum(eigs)
		return(mds)
	}
	else {
		stop( paste0("MDS procedure failed.") )
	}
	
}

#' Perform PCA on genotypes with PLINK
#' 
#' @param x a pointer to a PLINK fileset (of class \code{plink})
#' @param flags command-line flags passed directly to underlying PLINK call
#' @param K return the projection of samples onto the top K PCs
#' @param by project individuals (\code{"indiv"}) or markers (\code{"var"}) onto PCs?
#' @param ... ignored
#' 
#' @return When \code{by = "indiv"} (the default), a dataframe with individual IDs, family IDs,
#' and then projections in columns "PC1"..."PC{k}".  When \code{by == "var"}, a dataframe with
#' columns chromosome and marker name followed by PCs.  The object inherits from \code{pca.result}
#' so it can be autoploted with \code{plot()}.
#' 
#' @details See the relevant PLINK documentation for details of the underlying calculations.
#' 	The default is to perform PCA on autosomal genotypes only.  Scaled eigenvalues (ie. percent
#' 	variance explained by each dimension) are provided in \code{attr(,"eigvals")}.
#' 	
#' 	A recently-proposed test for natural selection uses the squared loadings of each marker
#' 	against the top PCs as an analog of F_st.  To obtain those loadings use \code{by = "var"}.
#'
#' @references
#' PLINK v1.9: \url{https://www.cog-genomics.org/plink2}
#' 
#' Purcell S et al. (2007) PLINK: a toolset for whole-genome association and population-based 
#' 	linkage analysis. Am J Hum Genet 81(3): 559-575. doi:10.1086/519795.
#' 	
#' Duforet-Frebourg N et al. (2015) Detecting genomic signatures of natural selection with
#' 	principal componentanalysis: application to the 1000 Genomes data
#' 	\url{http://arxiv.org/abs/1504.04543}
#' 
#' @seealso \code{\link{mds.plink}}, \code{\link{plot.pca.result}}
#' 
#' @aliases pca
#' @export
pca.plink <- function(x, flags = "--autosome", K = 20, by = c("indiv","var"), ...) {
	
	if (!inherits(x, "plink"))
		stop("Not convinced the input is a pointer to a plink fileset.")
	where <- dirname(x)
	
	if (K < 1 || !is.numeric(K))
		stop("You probably don't want a PCA result with no dimensions; specify a number K > 0.")
	
	by <- match.arg(by)
	cmd <- paste(flags, "--pca", K)
	if (by == "indiv") {
		cols <- c("fid","iid", paste0("PC", seq_len(K)))
		expect <- list("eigenvec","eigenval")
	}
	else if (by == "var") {
		cmd <- paste(cmd, "var-wts")
		cols <- c("chr","marker", paste0("PC", seq_len(K)))
		expect <- list("eigenvec.var","eigenval")
	}
	else {
		stop()
	}
	
	success <- .plink.command(x, cmd, expect, ...)
	if (is.character(success)) {
		pcs.file <- paste0(success, ".", expect[[1]])
		pcs <- read.table(pcs.file, header = FALSE, strip.white = TRUE, stringsAsFactors = FALSE)
		colnames(pcs) <- cols[1:ncol(pcs)]
		eigs <- read.table(paste0(success, ".", expect[[2]]), header = FALSE, stringsAsFactors = FALSE)[,1]
		attr(pcs, "eigvals") <- eigs/sum(eigs)
		attr(pcs, "explained") <- attr(pcs, "eigvals")
		class(pcs) <- c("pca.result", class(pcs))
		return(pcs)
	}
	else {
		stop( paste0("PCA procedure failed.") )
	}
	
}
andrewparkermorgan/argyle documentation built on May 10, 2019, 11:08 a.m.