Nothing
#' Run ADMIXTURE from R
#'
#' This function runs ADMIXTURE, taking either a `gen_tibble` or a file as an
#' input. This is a wrapper that runs ADMIXTURE from the command line, and reads
#' the output into R. It can run multiple values of `k` and multiple repeats for
#' each `k`.
#'
#' @details This is a wrapper for the command line program ADMIXTURE. It can
#' either use a binary present in the main environment, or use a copy
#' installed in a conda environment.
#' @param x a `gen_tibble` or a character giving the path of the input PLINK bed
#' file
#' @param k an integer giving the number of clusters
#' @param n_runs the number of runs for each k value (defaults to 1)
#' @param crossval boolean, should cross validation be used to assess the fit
#' (defaults to FALSE)
#' @param n_cores number of cores (defaults to 1)
#' @param seed the seed for the random number generator (defaults to NULL)
#' @param conda_env the name of the conda environment to use. "none" forces the
#' use of a local copy, whilst any other string will direct the function to
#' use a custom conda environment.
#' @return an object of class `gt_admix` consisting of a list with the following
#' elements:
#' - `k` the number of clusters
#' - `Q` a matrix with the admixture proportions
#' - `P` a matrix with the allele frequencies
#' - `log` a log of the output generated by ADMIXTURE
#' (usually printed on the screen when running from the command line)
#' - `cv` the cross validation error (if `crossval` is TRUE)
#' - `loglik` the log likelihood of the model
#' - `id` the id column of the input `gen_tibble` (if applicable)
#' - `group` the group column of the input `gen_tibble` (if applicable)
#' @export
#' @examples
#' # run the example only if we have the package installed
#' \dontrun{
#' bed_file <-
#' system.file("extdata", "lobster", "lobster.bed", package = "tidypopgen")
#' lobsters <- gen_tibble(bed_file,
#' backingfile = tempfile("lobsters"),
#' quiet = TRUE
#' )
#' lobsters <- lobsters %>% group_by(population)
#' gt_admixture(lobsters,
#' k = 2:3, seed = c(1, 2),
#' n_runs = 2, crossval = TRUE
#' )
#' }
# If the package `tidygenclust` is installed, and its conda environment has been
# set up with `ADMIXTURE` in it (the default), it will automatically use that
# version unless you change `conda_env` to "none". If set to "auto", the
# default, a copy from `tidygenclust` will be preferred if available, otherwise
# a local copy will be used.
gt_admixture <- function(
x,
k,
n_runs = 1,
crossval = FALSE,
n_cores = 1,
seed = NULL,
conda_env = "auto") {
# check that we have the right number of repeats
if (length(seed) != n_runs) {
stop("'seed' should be a vector of length 'n_runs'")
}
# if x is a character, check that it is file that exists
if (is.character(x)) {
if (!file.exists(x)) {
stop("The file ", x, " does not exist")
}
input_file <- x
} else {
# if x is a gen_tibble
if (!inherits(x, "gen_tbl")) {
stop("x must be a gen_tibble or a character")
}
# write the gen_tibble to a temp file
input_file <- gt_as_plink(x, file = tempfile(), chromosomes_as_int = TRUE)
}
# expand path to file to be full path
input_file <- normalizePath(input_file)
# cast k as an integer
k <- as.integer(k)
# set and check the conda environment
# if there is no reticulate
if (!requireNamespace("reticulate", quietly = TRUE)) {
# but we have not set conda_env to none
if (conda_env != "none") {
# if we have auto, set to none
if (conda_env == "auto") {
conda_env <- "none"
} else {
stop(paste(
"The package reticulate is needed to use",
"a conda environemnt in R."
))
}
}
} else {
# if reticulate is available
# if we have "auto"
if (conda_env == "auto") {
conda_envs <- reticulate::conda_list()[["name"]]
# if we are on Linux and have ctidygenclust installed
if (
(Sys.info()["sysname"] == "Linux") &&
("ctidygenclust" %in% conda_envs)
) {
conda_env <- "ctidygenclust"
} else if (
(Sys.info()["sysname"] == "Darwin") &&
("cadmixture86" %in% conda_envs)
) {
# else on osx use cadmixture86
conda_env <- "cadmixture86"
} else { # if we have no conda environment
conda_env <- "none"
}
# check that the conda environment does exist
if (
(conda_env != "none") &&
(!conda_env %in% reticulate::conda_list()[["name"]])
) {
# nolint
stop("The conda environment ", conda_env, " does not exist.")
}
}
}
# if conda is "none", check that we have admixture installed
if (conda_env == "none") {
if (system2("which", args = "admixture", stdout = NULL) != 0) {
stop("ADMIXTURE is not installed in this path")
}
}
# set the working path to the tempdir, where admixture will dump text files
out <- tempdir()
# store working directory
wd <- getwd()
# change to the directory of the input file
setwd(out)
on.exit(setwd(wd))
# initialise list to store results
adm_list <- list(
k = integer(),
Q = list(),
P = list(),
log = list(),
loglik = numeric()
)
if (crossval) {
adm_list$cv <- numeric()
}
class(adm_list) <- c("gt_admix", "list")
# loop over values of k and number of repeats
index <- 1
for (this_k in as.integer(k)) {
for (this_rep in seq_len(n_runs)) {
# set the arguments for admixture
admixture_args <- paste(input_file, this_k)
if (crossval) {
admixture_args <- paste(admixture_args, "--cv")
}
if (n_cores > 1) {
admixture_args <- paste(admixture_args, paste0("-j", n_cores))
}
if (!is.null(seed)) {
admixture_args <- paste(admixture_args, paste0("-s ", seed[index]))
}
# if we conda_env is none
if (conda_env == "none") {
adm_out <- system2("admixture", args = admixture_args, stdout = TRUE)
# change back to the original working directory
} else {
adm_out <- reticulate::conda_run2(
"admixture",
args = admixture_args,
envname = conda_env,
intern = TRUE
)
}
# build expected .Q filename
q_file <- file.path(out, paste0(
gsub(".bed$", "", basename(input_file)),
".", this_k, ".Q"
))
# check if no .Q files were written and if adm_out contains "Error:"
# stop and print adm_out if both are true
if (
!file.exists(q_file) &&
length(grep("Error:", adm_out)) > 0
) {
# nolint
stop(adm_out)
}
# read the output
output_prefix <- file.path(out, gsub(".bed", "", basename(input_file)))
adm_list$k[index] <- this_k
adm_list$Q[[index]] <-
q_matrix(utils::read.table(
paste(output_prefix, this_k, "Q", sep = "."),
header = FALSE
))
adm_list$P[[index]] <- utils::read.table(
paste(output_prefix, this_k, "P", sep = "."),
header = FALSE
) # nolint
adm_list$loglik[index] <- as.numeric(strsplit(
grep("^Loglikelihood", adm_out, value = TRUE),
":"
)[[1]][2]) # nolint
adm_list$log[[index]] <- adm_out
if (crossval) {
# extract value from line with CV error (number after :)
adm_list$cv[index] <- as.numeric(strsplit(
grep("CV error", adm_out, value = TRUE),
":"
)[[1]][2]) # nolint
}
index <- index + 1
}
}
# add metadata if x is a gen_tibble
if (inherits(x, "gen_tbl")) {
adm_list$id <- x$id
# if it is grouped, add the group
if (inherits(x, "grouped_gen_tbl")) {
adm_list$group <- x[[dplyr::group_vars(x)]]
}
}
# add info on algorithm
adm_list$algorithm <- "ADMIXTURE"
return(adm_list)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.