R/admixture.R

Defines functions .sort_indiv theme_admixture sort_by_cluster plot_admixture_labelled plot_admixture read_loglik_files tidy.admixture_Q tidy bootstrap_Q read_Q_bootstraps read_Fst_matrix read_Q_matrix

Documented in plot_admixture plot_admixture_labelled read_loglik_files read_Q_matrix sort_by_cluster .sort_indiv theme_admixture tidy

## admixture.R
# functions for handling output from ADMIXTURE (Alexander DH 2009 Genome Res)

#' Read ancestry proportions (`Q-matrix`) from ADMIXTURE run
#' 
#' @param infile path to matrix of ancestry proportions (*.Q file) from ADMIXTURE run
#' @param K number of clusters; if \code{NULL} (default), attempt to infer from filename
#' @param iids sample IDs (in same order as rows in matrix); if \code{NULL} (default),
#'  look for corresponding *.fam file
#' @param ... extra arguments, ignored
#' @details If \code{K} and \code{iids} are both \code{NULL} (default), assumes that
#'  estimation was performed from a PLINK *.bed file, and that the corresponding *.fam
#'  file resides in same directory as output. If standard errors for ancestrty proportions
#'  are availbale (*.Q_se file), they are read as well and stored in \code{attr(result, "se")}.
#' 
#' @export
read_Q_matrix <- function(infile, K = NULL, iids = NULL, ...) {
	
	infile <- infile[1]
	fam.path <- NULL
	if (is.null(iids)) {
		if (grepl("\\.\\d+\\.Q$", infile)) {
			fam.path <- gsub("\\.\\d+\\.Q$",".fam", infile)
		}
		else {
			fam.path <- paste0(infile, ".fam")
			infile <- paste0(infile, ".Q")
		}
	}
	
	sefile <- paste0(infile, "_se")
	do.se <- file.exists(sefile)
	
	# figure out how many clusters
	if (is.null(K))
		K <- as.integer(stringr::str_match(infile, "([0-9]+)\\.Q$")[,2])
	
	message("Reading ancestry proportions from <", infile, ">...")
	Q <- readr::read_delim(infile, delim =" ", col_names = FALSE)
	
	if (do.se) {
		message("Reading standard errors from <", sefile, ">...")
		se <- readr::read_delim(sefile, delim = " ", col_names = FALSE)
	}
	
	if (!is.null(fam.path)){
		message("Reading sample metadata from <", fam.path, ">...")
		fam <- read_fam(fam.path)
	}
	else {
		message("Using input sample IDs ...")
		fam <- tibble::tibble(fid = iids, iid = iids, mom = 0, dad = 0, sex = 0, pheno = -9)
	}
		
	#rownames(Q) <- fam$iid
	colnames(Q) <- paste0("pop", seq_len(ncol(Q)))
	Q <- dplyr::bind_cols(iid = fam$iid, fid = as.character(fam$fid), Q)
	
	if (do.se) {
		colnames(se) <- paste0("pop", seq_len(ncol(se)))
		se <- dplyr::bind_cols(iid = fam$iid, fid = as.character(fam$fid), se)
		attr(Q, "se") <- se
	}
	
	class(Q) <- c("admixture_Q", class(Q))
	attr(Q, "K") <- K
	return(Q)
	
}

#' @export
read_Fst_matrix <- function(infile, melted = TRUE, lower.tri = TRUE, ...) {
	
	infile <- infile[1]
	
	message("Reading Fst matrix from <", infile, "> ...")
	lines <- readLines(infile)
	popfile <- gsub("\\.\\d+\\.out$", ".pop", infile)
	
	keep <- grepl("^Pop\\d+", lines, perl = TRUE)
	lines <- lines[keep]
	ll <- stringr::str_split(lines, "\\t")
	
	npop <- length(ll[[1]])+1
	if (file.exists(popfile)) {
		message("Reading population labels from <", popfile, "> ...")
		pops <- readr::read_tsv(popfile, col_names = "pop", na = "-", progress = FALSE)$pop
		pops <- pops[ !is.na(pops) ]
		pops <- pops[ !duplicated(pops) ]
		po <- sort(pops)
	}
	else {
		message("Using default population labels from ADMIXTURE ...")
		pops <- ll[[1]]
		po <- pops
	}
	
	Fst <- matrix(0.0, nrow = npop, ncol = npop)
	for (ii in seq_along(ll[-(1:2)])) {
		values <- as.numeric( ll[[ ii+2 ]][ -1 ] )
		Fst[ ii+1,1:length(values) ] <- values
	}
	
	Fst <- Fst + t(Fst)
	rownames(Fst) <- colnames(Fst) <- pops
	cl <- stats::hclust( stats::as.dist(Fst+0.001), method = "centroid" )
	o <- cl$order
	po <- po[o]
	Fst <- Fst[ po,po ]
	
	if (melted) {
		
		if (lower.tri) {
			Fst[ upper.tri(Fst) ] <- NA
		}
		Fst <- dplyr::bind_cols(pop = rownames(Fst), tibble::as_tibble(Fst))
		Fst <- tidyr::gather(Fst, key = other, value = fst, -pop)
		Fst <- subset(Fst, !is.na(fst))
		Fst$pop <- factor(Fst$pop, levels = po)
		Fst$other <- factor(Fst$other, levels = po)
		
	}
	
	attr(Fst, "source") <- infile
	attr(Fst, "clustering") <- cl
	return(Fst)
	
}

# deprecated and ill-conceived
read_Q_bootstraps <- function(prefix, k, ...) {
	
	where <- dirname(prefix)
	bname <- basename(prefix)
	pattern <- paste0(bname, "\\..+\\.", k, "\\.Q")
	print(pattern)
	ff <- list.files(path = where, pattern = pattern, full.names = TRUE)
	print(ff)
	#id <- seq_along(ff)
	#df <- tibble::tibble(run = id)
	lapply(ff, read_Q_matrix)
	
}

# deprecated and ill-conceived
bootstrap_Q <- function(ql, ...) {
	
	K <- attr(ql[[1]], "K")
	iids <- ql[[1]][ ,c("iid","fid") ]
	
	Q <- simplify2array(lapply(ql, function(f) as.matrix(f[,-(1:2)])))
	mu <- apply(Q, c(1,2), mean, na.rm = TRUE)
	sigma <- apply(Q, c(1,2), sd, na.rm = TRUE)/sqrt(dim(Q)[3])
	
	mu <- dplyr::bind_cols(iids, tibble::as_tibble(mu))
	sigma <- dplyr::bind_cols(iids, tibble::as_tibble(sigma))
	
	mu <- mutate(tidyr::gather(mu, key = pop, value = mu, -iid, -fid), K = K)
	sigma <- mutate(tidyr::gather(sigma, key = pop, value = se, -iid, -fid), K = K)
	
	dplyr::inner_join(mu, sigma)
	
}

#' Convert admixture proportions to tidy format.
#' 
#' @param Q a matrix of admixture proprtions
#' @param ... extra arguments; if first one is a data frame, treat it as additional sample metadata,
#'   with column \code{"iid"} as primary key
#' @return a dataframe of admixture proportions (and standard errors if available),
#'   one row per component-individual pair
#' 
#' @export
tidy <- function(Q, ...){
	UseMethod("tidy")
}

#' @export
tidy.admixture_Q <- function(Q, ...) {

	if (!inherits(Q, "admixture_Q"))
		warning("Q may not be a properly-formatted ADMIXTURE result.")
	
	K <- attr(Q, "K")
	qq <- dplyr:: mutate(tidyr::gather(Q, key = variable, value = value, -iid, -fid), K = K)
	
	if (!is.null(attr(Q, "se"))) {
		se <- attr(Q, "se")
		se <- dplyr::mutate(tidyr::gather(se, key = variable, value = se, -iid, -fid), K = K)
		qq <- dplyr::inner_join(qq, se)
	}
	
	extras <- list(...)
	if (length(extras)) {
		if (is.data.frame(extras[[1]])) {
			meta <- tibble::as_tibble(extras[[1]])
			qq <- dplyr::left_join(qq, meta, by = "iid")
		}
	}
	
	return(qq)
		
}

#' Read log-likelihoods from ADMIXTURE runs
#' @export
read_loglik_files <- function(prefix, ...) {
	
	d <- dirname(prefix)
	f <- basename(prefix)
	ff <- list.files(d, paste0(f, "\\.[0-9]+\\.llik"), full.names = TRUE)
	k <- stringr::str_match(ff, "\\.([0-9]+)\\.llik")[,2]
	llik <- sapply(ff, readLines)
	tibble::tibble(K = as.integer(k), llik = as.numeric(llik))
	
}

#' Make standard stacked-bars plot of ancestry proportions.
#' 
#' @param Q a matrix of admixture proportions, melted or not
#' @param label show individual IDs along axis
#' @param sort_order apply pre-determined sort order to individuals, eg. by cluster membership
#' @param pop_order named vector to map arbitrary population labels ('pop1') onto meaningful ones
#' @param ... extra arguments, ignored
#' 
#' @export
plot_admixture <- function(Q, label = FALSE, sort_order = NULL, pop_order = NULL, ...) {
	
	
	if (inherits(Q, "admixture_Q"))
		qq <- tidy.admixture_Q(Q, ...)
	else
		qq <- tibble::as_tibble(Q)
	
	if (!is.null(pop_order)) {
		qq$variable <- factor(qq$variable, labels = pop_order)
		levels(qq$variable) <- names(pop_order)
	}
	if (!is.null(sort_order)) {
		qq$iid <- factor(qq$iid, sort_order)
	}
	
	qq <- dplyr::arrange(qq, variable)
	p0 <- ggplot2::ggplot(qq) +
		ggplot2::geom_bar(ggplot2::aes(x = iid, y = value, fill = variable), stat = "identity") +
		theme_admixture()
	
	if (!label)
		p0 <- p0 +
		ggplot2::theme(axis.ticks.x = ggplot2::element_blank(),
					   axis.text.x = ggplot2::element_blank())
	
	return(p0)
	
}

#' Make stacked-bar plot of ancestry proportions, with population labels
#' 
#' @param Q a matrix of admixture proportions, melted or not; must have a column `pop` with population labels
#' @param label show individual IDs along axis
#' @param sort_order pre-determined sort order for individuals, eg. by cluster membership
#' @param pop_order pre-determined sort order for populations
#' @param cluster_order named vector to map arbitrary ancestry-component labels ('pop1') onto meaningful ones
#' @param label_pops should population labels be drawn?
#' @param label_ind should individuals be labelled along x-axis (probaly not, if more than a few samples)
#' @param label_size text size for population labels
#' @param nudge extra buffer around population labels
#' @param ymax how much extra space to allow for population labels; about 1.5 is probably enough
#' @param ... extra arguments, ignored
#' 
#' @export
plot_admixture_labelled <- function(Q, sort_order = NULL, pop_order = NULL, cluster_order = NULL, label_pops = TRUE,
									label_size = 3, opaque_labs = FALSE, label_ind = FALSE, nudge = 0.05, ymax = 1.5, ...) {
	
	if (inherits(Q, "admixture_Q"))
		qq <- tidy.admixture_Q(Q, ...)
	else
		qq <- tibble::as_tibble(Q)
	
	if (!is.null(cluster_order)) {
		qq$variable <- factor(qq$variable, labels = cluster_order)
		if (!is.null(names(cluster_order))) {
			levels(qq$variable) <- names(cluster_order)
		} else {
			levels(qq$variable) <- cluster_order
		}
	} else {
		qq$variable <- factor(qq$variable)
	}
	if (!is.null(sort_order)) {
		qq$iid <- factor(qq$iid, sort_order)
	} else {
		qq$iid <- factor(qq$iid)
		sort_order <- levels(qq$iid)
	}
	if (!is.null(pop_order)) {
		qq$pop <- factor(qq$pop, levels = pop_order)
	} else {
		qq$pop <- factor(qq$pop)
		pop_order <- levels(qq$pop)
	}
	
	qq <- dplyr::arrange(qq, variable)
	qq1 <- subset(qq, as.numeric(variable) == 1)
	so <- with(qq1, order(pop, iid))
	new_so <- as.character(qq1$iid[so])
	qq$iid <- factor(qq$iid, new_so)
	qq1$iid <- factor(qq1$iid, new_so)
	qq1 <- dplyr::arrange(qq1, iid)
	
	pop_labs <- pop_order
	nudge_by <- length(new_so)*nudge
	pop_spacing <- (length(new_so)-nudge_by)/length(pop_labs)
	pop_pos <- (seq_along(pop_labs)-1)*pop_spacing+nudge_by
	#print(setNames(pop_pos, pop_labs))
	first_ind <- sapply(pop_labs, function(f) min(which(qq1$pop == f)))
	last_ind <- sapply(pop_labs, function(f) max(which(qq1$pop == f)))
	
	#print(first_ind)
	pop_df <- tibble::tibble(pop = pop_labs,
							 pos = pop_pos,
							 first = first_ind,
							 last = last_ind)
	
	p0 <- ggplot2::ggplot(qq) +
		ggplot2::geom_bar(ggplot2::aes(x = iid, y = value, fill = variable), stat = "identity") +
		ggplot2::geom_segment(data = pop_df, 
							  ggplot2::aes(x = first-1, xend = first-1, y = 0, yend = 1),
							  colour = "grey60") +
		ggplot2::scale_y_continuous(limits = c(0, ymax),
									breaks = c(0, 0.5, 1.0)) +
		theme_admixture()
	
	if (label_pops) {
		if (opaque_labs) {
			p0 <- p0 +
				ggplot2::geom_label(data = pop_df, 
									ggplot2::aes(x = pos, y = 1.2, label = pop),
									fill = "white", label.r = ggplot2::unit(0, "points"),
									angle = 90, hjust = 0, nudge_x = 0.05, nudge_y = 0.02, size = label_size)
		}
		else {
			p0 <- p0 +
				ggplot2::geom_text(data = pop_df,
								   ggplot2::aes(x = pos, y = 1.2, label = pop),
								   angle = 90, hjust = 0, nudge_x = 0.05, nudge_y = 0.02, size = label_size)
		}
		p0 <- p0 +
			ggplot2::geom_segment(data = pop_df, 
								  ggplot2::aes(x = pos, xend = first-1, y = 1.2, yend = 1.0),
								  colour = "grey60")
	}
	
	if (!label_ind)
		p0 <- p0 +
		ggplot2::theme(axis.ticks.x = ggplot2::element_blank(),
					   axis.text.x = ggplot2::element_blank())
	
	if (label_pops) {
		attr(p0, "pop_labels") <- pop_df
	}
	
	return(p0)
	
}

#' Sort samples by admixture components using crude heuristics
#' 
#' @param Q an ADMIXTURE Q-matrix from \code{read_Q_matrix()}
#' @param method clustering heuristic; currently only works with "all"
#' @param ... extra arguments, ignored
#' @return character vector of individual IDs in sorted order
#' 
#' @export
sort_by_cluster <- function(Q, method = "all", ...) {
	
	if (!inherits(Q, "admixture_Q"))
		stop("Only works for bona fide un-melted ADMIXTURE result.")
	
	pop.cols <- paste0("pop", seq_len(attr(Q, "K")))
	QQ <- matrix(as.matrix(Q[ ,pop.cols ]), nrow = nrow(Q), dimnames = list(Q$iid))
	rez <- .sort_indiv(QQ, sortind = method)
	so <- rownames(rez$dframe)
	return(so)
	
}

#' A ggplot theme for ADMIXTURE plots
#' @export
theme_admixture <- function(...) {
	
	ggplot2::theme_bw(...) +
		ggplot2::theme(...,
					   axis.title.x = ggplot2::element_blank(),
					   axis.text.x = ggplot2::element_text(angle = 90, hjust = 1, size = 6),
					   panel.background = ggplot2::element_blank(),
					   panel.grid = ggplot2::element_blank(),
					   panel.border = ggplot2::element_blank(),
					   panel.spacing.x = grid::unit(0.15, "lines"),
					   strip.background = ggplot2::element_rect(colour = NA)
		)

}

#' Internal helper function for sorting rows in ADMIXTURE Q-matrix
#' @details Code stolen whole-cloth from <https://github.com/royfrancis/pophelper/blob/master/R/pophelper.R> 
#' @export
.sort_indiv <- function(dframe = NULL, grplab = NA, selgrp = NA, sortind = NA, grplabpos = NA, linepos = NA)
{
	if(is.null(dframe)) stop("sortInd: Argument 'dframe' is empty.")
	if(!all(is.na(grplab))) 
	{
		if(is.na(selgrp)) selgrp <- names(grplab)[1]
		if(!is.character(selgrp)) stop("sortInd: Argument 'selgrp' must be a character datatype.")
		if(length(selgrp)>1) stop("sortInd: Argument 'selgrp' must be a character datatype of length 1.")
		if(!any(selgrp %in% names(grplab))) stop(paste0("sortInd: Argument 'selgrp' contains (",selgrp,") which is not in the 'grplab' titles (",paste0(names(grpla),collapse=", "),")."))
	}
	if(all(!is.na(sortind)))
	{
		if(length(sortind) > 1) stop("sortInd: Argument 'sortind' must be of length 1. Use 'all','label' or a cluster name like 'Cluster1'.")
		#if(sortind != "all" && sortind != "label" && !grepl("Cluster[0-9+]",sortind)) stop("sortInd: Argument 'sortind' must be set to 'all', 'label' or a cluster like 'Cluster1'.")
	}
	
	#sorting without grplab
	if(any(is.na(grplab)))
	{
		if(!is.na(sortind))
		{
			if(sortind=="all")
			{
				maxval <- apply(dframe,1,max)
				matchval <- vector(length=nrow(dframe))
				for(j in 1:nrow(dframe)) matchval[j] <- match(maxval[j],dframe[j,])
				dftemp <- dframe
				dftemp$maxval <- maxval
				dftemp$matchval <- matchval
				
				dframe <- dframe[with(dftemp,order(matchval,-maxval)),,drop=FALSE]
			}
			
			if(sortind=="label") dframe[order(rownames(dframe)),,drop=FALSE]
			
			if(sortind!="all" && sortind!="label")
			{
				if(!(sortind %in% colnames(dframe))) stop(paste0("sortInd: 'sortind' value (",sortind,") not found in file header (",paste0(colnames(dframe),collapse=", "),")."))
				dframe <- dframe[order(dframe[[sortind]]),,drop=FALSE]
			}
		}
		labelposbind <- NA
		markerposbind <- NA
	}
	
	#sorting with grplab
	if(!all(is.na(grplab)))
	{
		gnames <- names(grplab)
		
		if(!is.na(sortind))
		{
			if(sortind=="all")
			{
				maxval <- apply(dframe,1,max)
				matchval <- vector(length=nrow(dframe))
				for(j in 1:nrow(dframe)) matchval[j] <- match(maxval[j],dframe[j,])
				dftemp <- dframe
				dftemp$maxval <- maxval
				dftemp$matchval <- matchval
				
				if(length(intersect(colnames(dftemp),colnames(grplab)))!=0) stop(paste0("sortInd: One or more header labels in the run file are duplicated in grplab header. Change labels to be unique. Following are the duplicate label(s): (",paste0(intersect(colnames(dftemp),colnames(grplab)),collapse=", "),")."))
				dftemp <- cbind(dftemp,grplab)
				
				rle1 <- rle(as.character(unlist(grplab[selgrp])))
				grplabnames <- rle1$values
				tovec <- cumsum(rle1$lengths)
				fromvec <- (tovec - rle1$lengths)+1
				dftemplist <- vector("list",length=length(grplabnames))
				for(k in 1:length(tovec))
				{
					dftemp1 <- dftemp[fromvec[k]:tovec[k],,drop=FALSE]
					dftemp1$grp <- NULL
					dftemplist[[k]] <- dftemp1[with(dftemp1,order(matchval,-maxval)),,drop=FALSE]
				}
				
				dframe <- do.call("rbind",dftemplist)
				grplab <- dframe[,gnames,drop=FALSE]
				dframe[,gnames] <- NULL
				dframe$maxval <- NULL
				dframe$matchval <- NULL
			}
			
			if(sortind=="label")
			{
				dftemp <- dframe
				if(length(intersect(colnames(dftemp),colnames(grplab)))!=0) stop(paste0("sortInd: One or more header labels in the run file are duplicated in grplab header. Change labels to be unique. Following are the duplicate label(s): (",paste0(intersect(colnames(dftemp),colnames(grplab)),collapse=", "),")."))
				dftemp <- cbind(dftemp,grplab)
				
				rle1 <- rle(as.character(unlist(grplab[selgrp])))
				grplabnames <- rle1$values
				tovec <- cumsum(rle1$lengths)
				fromvec <- (tovec - rle1$lengths)+1
				dftemplist <- vector("list",length=length(grplabnames))
				for(k in 1:length(tovec))
				{
					dftemp1 <- dftemp[fromvec[k]:tovec[k],,drop=FALSE]
					dftemp1$grp <- NULL
					dftemplist[[k]] <- dftemp1[order(rownames(dftemp1)),,drop=FALSE]
				}
				
				dframe <- do.call("rbind",dftemplist)
				grplab <- dframe[,gnames,drop=FALSE]
				dframe[,gnames] <- NULL
			}
			
			
			if(sortind!="all" && sortind!="label")
			{
				if(!(sortind %in% colnames(dframe))) stop(paste0("sortInd: 'sortind' value (",sortind,") not found in file header (",paste0(colnames(dframe),collapse=", "),")."))
				clnum <- which(sortind==colnames(dframe))
				dftemp <- dframe
				if(length(intersect(colnames(dftemp),colnames(grplab)))!=0) stop(paste0("sortInd: One or more header labels in the run file are duplicated in grplab header. Change labels to be unique. Following are the duplicate label(s): (",paste0(intersect(colnames(dftemp),colnames(grplab)),collapse=", "),")."))
				dftemp <- cbind(dftemp,grplab)
				
				rle1 <- rle(as.character(unlist(grplab[selgrp])))
				grplabnames <- rle1$values
				tovec <- cumsum(rle1$lengths)
				fromvec <- (tovec - rle1$lengths)+1
				dftemplist <- vector("list",length=length(grplabnames))
				for(k in 1:length(tovec))
				{
					dftemp1 <- dftemp[fromvec[k]:tovec[k],,drop=FALSE]
					dftemp1$grp <- NULL
					dftemplist[[k]] <- dftemp1[order(dftemp1[[sortind]]),,drop=FALSE]
				}
				
				dframe <- do.call("rbind",dftemplist)
				grplab <- dframe[,gnames,drop=FALSE]
				dframe[,gnames] <- NULL
			}
		}
		
		#create labelpos and markerpos
		gnames <- names(grplab)
		markerlist <- vector("list",length=length(gnames))
		labellist <- vector("list",length=length(gnames))
		intlablist <- vector("list",length=length(gnames))
		for(k in 1:length(gnames))
		{
			rlegrp <- rle(unlist(grplab[gnames[k]]))
			labelpos <- data.frame(label=rlegrp$values,freq=rlegrp$lengths,stringsAsFactors=F)
			markerpos <- data.frame(markerxpos=c(0,cumsum(labelpos$freq)),stringsAsFactors=F)
			labelpos$labxpos <- round((diff(markerpos$markerxpos)/2)+markerpos$markerxpos[1:length(markerpos$markerxpos)-1],1)
			labelpos$labypos <- rep(grplabpos,nrow(labelpos))
			markerpos$temp <- factor(rep(1,nrow(markerpos)))
			markerpos$markerypos <- rep(linepos,nrow(markerpos))
			
			markerpos$count <- gnames[k]
			markerlist[[k]] <- markerpos
			labelpos$count <- gnames[k]
			labellist[[k]] <- labelpos
		}
		
		markerposbind <- do.call("rbind",markerlist)
		markerposbind$count <- factor(markerposbind$count,levels=gnames)
		labelposbind <- do.call("rbind",labellist)
		labelposbind$count <- factor(labelposbind$count,levels=gnames)
		
		rownames(markerposbind) <- 1:nrow(markerposbind)
		rownames(labelposbind) <- 1:nrow(labelposbind)
		
		#adjust divider position
		markerposbind$divxpos <- markerposbind$markerxpos+0.5
	}
	
	
	
	return(list(dframe=dframe,grplab=grplab,labelpos=labelposbind,markerpos=markerposbind))
}
andrewparkermorgan/popcorn documentation built on July 8, 2023, 12:42 p.m.