Nothing
#' Utility function to run Structure
#'
#' These functions were copied from package strataG, which is no longer on CRAN
#' (maintained by Eric Archer)
#' @export
#' @author Bernd Gruber (bugs? Post to
#' \url{https://groups.google.com/d/forum/dartr}); original implementation of
#' Eric Archer \url{https://github.com/EricArcher/strataG}
#'
#' @param g a gtypes object [see \code{strataG}].
#' @param k.range vector of values to for \code{maxpop} in multiple runs. If set
#' to \code{NULL}, a single STRUCTURE run is conducted with \code{maxpops}
#' groups. If specified, do not also specify \code{maxpops}.
#' @param num.k.rep number of replicates for each value in \code{k.range}.
#' @param label label to use for input and output files
#' @param delete.files logical. Delete all files when STRUCTURE is finished?
#' @param exec name of executable for STRUCTURE. Defaults to "structure".
#' @param burnin Number of burnin reps [default 10000].
#' @param numreps Number of MCMC replicates [default 1000].
#' @param noadmix Logical. No admixture? [default TRUE].
#' @param freqscorr Logical. Correlated frequencies? [default FALSE].
#' @param randomize Randomize [default TRUE].
#' @param seed Set random seed [default 0].
#' @param pop.prior A character specifying which population prior model to use:
#' "locprior" or "usepopinfo" [default NULL].
#' @param locpriorinit Parameterizes locprior parameter r - how informative the
#' populations are. Only used when pop.prior = "locprior" [default 1].
#' @param maxlocprior Specifies range of locprior parameter r. Only used when
#' pop.prior = "locprior" [default 20].
#' @param gensback Integer defining the number of generations back to test for
#' immigrant ancestry. Only used when pop.prior = "usepopinfo" [default 2].
#' @param migrprior Numeric between 0 and 1 listing migration prior. Only used
#' when pop.prior = "usepopinfo" [default 0.05].
#' @param pfrompopflagonly Logical. update allele frequencies from individuals
#' specified by popflag. Only used when pop.prior = "usepopinfo" [default TRUE].
#' @param popflag A vector of integers (0, 1) or logicals identifiying whether
#' or not to use strata information. Only used when pop.prior = "usepopinfo"
#' [default NULL].
#' @param inferalpha Logical. Infer the value of the model parameter # from the
#' data; otherwise is fixed at the value alpha which is chosen by the user.
#' This option is ignored under the NOADMIX model. Small alpha implies that
#' most individuals are essentially from one population or another, while
#' alpha > 1 implies that most individuals are admixed [default FALSE].
#' @param alpha Dirichlet parameter for degree of admixture. This is the
#' initial value if inferalpha = TRUE [default 1].
#' @param unifprioralpha Logical. Assume a uniform prior for alpha which runs
#' between 0 and alphamax. This model seems to work fine; the alternative model
#' (when unfprioralpha = 0) is to take alpha as having a Gamma prior, with
#' mean alphapriora × alphapriorb, and variance alphapriora × alphapriorb^2
#' [default TRUE].
#' @param alphamax Maximum for uniform prior on alpha when
#' unifprioralpha = TRUE [default 20].
#' @param alphapriora Parameters of Gamma prior on alpha when
#' unifprioralpha = FALSE [default 0.05].
#' @param alphapriorb Parameters of Gamma prior on alpha when
#' unifprioralpha = FALSE [default 0.001].
#'
#' @return \describe{ \item{\code{structureRun}}{a list where each element is a
#' list with results from \code{structureRead} and a vector of the filenames
#' used} \item{\code{structureWrite}}{a vector of the filenames used by
#' STRUCTURE} \item{\code{structureRead}}{a list containing: \describe{
#' \item{\code{summary}}{new locus name, which is a combination of loci in
#' group} \item{\code{q.mat}}{data.frame of assignment probabilities for each
#' id} \item{\code{prior.anc}}{list of prior ancestry estimates for each
#' individual where population priors were used} \item{\code{files}}{vector of
#' input and output files used by STRUCTURE} \item{\code{label}}{label for the
#' run} } } }
utils.structure.run <- function(g,
k.range,
num.k.rep,
label,
delete.files = TRUE,
exec,
burnin,
numreps,
noadmix,
freqscorr,
randomize,
seed,
pop.prior,
locpriorinit,
maxlocprior,
gensback,
migrprior,
pfrompopflagonly,
popflag,
inferalpha,
alpha,
unifprioralpha,
alphamax,
alphapriora,
alphapriorb
# ,
# ...
) {
################################################################
.structureParseQmat <- function(q.mat.txt,
pops) {
q.mat.txt <- sub("[*]+", "", q.mat.txt)
q.mat.txt <- sub("[(]", "", q.mat.txt)
q.mat.txt <- sub("[)]", "", q.mat.txt)
q.mat.txt <- sub("[|][ ]+$", "", q.mat.txt)
cols1to4 <- c("row", "id", "pct.miss", "orig.pop")
strsplit(q.mat.txt, " ") %>%
purrr::map(function(q) {
q <- q[!q %in% c("", " ", ":")] %>%
as.character() %>%
rbind() %>%
as.data.frame(stringsAsFactors = FALSE)
stats::setNames(
q,
c(
cols1to4,
paste("Group", 1:(ncol(q) - 4), sep = ".")
)
)
}) %>%
dplyr::bind_rows() %>%
dplyr::mutate_at(dplyr::vars(
"row",
"pct.miss",
"orig.pop",
dplyr::starts_with("Group.")
), as.numeric) %>%
dplyr::mutate(orig.pop = if (!is.null(pops)) {
pops[.data$orig.pop]
} else {
.data$orig.pop
})
}
################################################################
.alleles2integer <- function(g,
min.val = 0) {
g$data %>%
dplyr::group_by(.data$locus) %>%
dplyr::mutate(allele = min.val - 1 + as.integer(factor(.data$allele))) %>%
dplyr::ungroup()
}
################################################################
.stackedAlleles <- function(g,
alleles2integer = FALSE,
na.val = NULL,
...) {
x <- if (alleles2integer) {
.alleles2integer(g, ...)
} else {
g$data
}
if (!is.null(na.val)) {
x$allele[is.na(x$allele)] <- na.val
}
x %>%
dplyr::arrange(.data$id, .data$locus) %>%
dplyr::mutate(a = rep(1:g$ploidy, dplyr::n() / g$ploidy)) %>%
tidyr::spread(.data$locus, .data$allele) %>%
dplyr::rename(allele = "a") %>%
dplyr::select(.data$id, .data$stratum, .data$allele, dplyr::everything())
}
####################################################
.getFileLabel <- function(g,
label = NULL) {
desc <- g$description
label <- if (!is.null(label)) {
label
} else if (!is.null(desc)) {
desc
} else {
"strataG.gtypes"
}
gsub("[[:punct:]]", ".", label)
}
####################################################
structureWrite <- function(g,
label = NULL,
maxpops = 1:(dplyr::n_distinct(g$data$stratum)),
burnin = 1000,
numreps = 1000,
noadmix = TRUE,
freqscorr = FALSE,
randomize = TRUE,
seed = 0,
pop.prior = NULL,
locpriorinit = 1,
maxlocprior = 20,
gensback = 2,
migrprior = 0.05,
pfrompopflagonly = TRUE,
popflag = NULL,
inferalpha = FALSE,
alpha = 1,
unifprioralpha = TRUE,
alphamax = 20,
alphapriora = 0.05,
alphapriorb = 0.001) {
if (!is.null(pop.prior)) {
if (!pop.prior %in% c("locprior", "usepopinfo")) {
stop(error("'pop.prior' must be 'locprior' or 'usepopinfo'."))
}
}
if (is.null(popflag)) {
popflag <- rep(1, length(unique(g$data$id)))
}
if (length(popflag) != length(unique(g$data$id))) {
stop(error(" 'popflag' should be the same length as the number of individuals in 'g'."))
}
if (!all(popflag %in% c(0, 1))) {
stop(error(" All values in 'popflag' must be 0 or 1."))
}
if (is.null(names(popflag))) {
names(popflag) <- unique(g$data$id)
}
in.file <- ifelse(is.null(label), "data", paste(label, "data", sep = "_"))
out.file <- ifelse(is.null(label), "out", paste(label, "out", sep = "_"))
main.file <- ifelse(is.null(label), "mainparams", paste(label, "mainparams", sep = "_"))
extra.file <- ifelse(is.null(label), "extraparams", paste(label, "extraparams", sep = "_"))
mat <- .stackedAlleles(g, alleles2integer = TRUE, na.val = -9) %>%
dplyr::select(-.data$allele) %>%
dplyr::mutate(
id = gsub(" ", "_", .data$id),
stratum = as.numeric(factor(.data$stratum)),
popflag = popflag[.data$id]
) %>%
dplyr::select(.data$id, .data$stratum, .data$popflag, dplyr::everything()) %>%
as.matrix()
write(paste(sort(unique(g$data[["locus"]])), collapse = " "), file = in.file)
for (i in 1:nrow(mat)) {
write(paste(mat[i, ], collapse = " "),
file = in.file,
append = TRUE
)
}
main.params <- c(
paste("MAXPOPS", as.integer(maxpops)),
paste("BURNIN", as.integer(burnin)),
paste("NUMREPS", as.integer(numreps)),
paste("INFILE", in.file),
paste("OUTFILE", out.file),
paste("NUMINDS", length(unique(g$data$id))),
paste("NUMLOCI", length(unique(g$data$locus))),
"MISSING -9", "LABEL 1", "POPDATA 1",
"POPFLAG 1", "LOCDATA 0", "PHENOTYPE 0",
"EXTRACOLS 0", "MARKERNAMES 1"
)
main.params <- paste("#define", main.params)
write(main.params, file = main.file)
extra.params <- c(
paste("NOADMIX", as.integer(noadmix)),
paste("FREQSCORR", as.integer(freqscorr)),
paste("INFERALPHA", as.integer(inferalpha)),
paste("ALPHA", as.numeric(alpha)),
"FPRIORMEAN 0.01", "FPRIORSD 0.05", "LAMBDA 1.0",
paste("UNIFPRIORALPHA", as.integer(unifprioralpha)),
paste("ALPHAMAX", as.numeric(alphamax)),
paste("ALPHAPRIORA", as.numeric(alphapriora)),
paste("ALPHAPRIORB", as.numeric(alphapriorb)),
"COMPUTEPROB 1",
paste("ADMBURNIN", max(0, as.integer(burnin / 2))),
"ALPHAPROPSD 0.025", "STARTATPOPINFO 0",
paste("RANDOMIZE", as.integer(randomize)),
paste("SEED", as.integer(seed)),
"METROFREQ 10", "REPORTHITRATE 0"
)
if (!is.null(pop.prior)) {
pop.prior <- tolower(pop.prior)
prior.params <- if (pop.prior == "locprior") {
c("LOCPRIOR 1", "LOCISPOP 1", paste(
"LOCPRIORINIT",
locpriorinit
), paste("MAXLOCPRIOR", maxlocprior))
} else if (pop.prior == "usepopinfo") {
c(
"USEPOPINFO 1", paste("GENSBACK", trunc(gensback)),
paste("MIGRPRIOR", migrprior), paste(
"PFROMPOPFLAGONLY",
as.integer(pfrompopflagonly)
)
)
}
extra.params <- c(extra.params, prior.params)
}
extra.params <- extra.params[!is.na(extra.params)]
extra.params <- paste("#define", extra.params)
write(extra.params, file = extra.file)
invisible(list(
files = c(
data = in.file,
mainparams = main.file,
extraparams = extra.file,
out = out.file
),
pops = sort(unique(g$data$stratum))
))
}
###########################################################
structureRead <- function(file,
pops = NULL) {
if (!file.exists(file)) {
stop(error(paste("the file '", file, "' can't be found.",
sep = ""
)))
}
result <- scan(file, "character", quiet = TRUE)
loc <- grep("Estimated", result,
ignore.case = FALSE,
value = FALSE
)
est.ln.prob <- as.numeric(result[loc[1] + 6])
loc <- grep("likelihood", result,
ignore.case = FALSE,
value = FALSE
)
mean.lnL <- as.numeric(result[loc[1] + 2])
var.lnL <- as.numeric(result[loc[2] + 2])
loc <- grep("MAXPOPS", result, value = F)
maxpops <- result[loc]
maxpops <- sub("MAXPOPS=", "", maxpops)
maxpops <- as.integer(sub(",", "", maxpops))
loc <- grep("GENSBACK", result, value = F)
gensback <- result[loc]
gensback <- sub("GENSBACK=", "", gensback)
gensback <- as.integer(sub(",", "", gensback))
smry <- c(
k = maxpops, est.ln.prob = est.ln.prob, mean.lnL = mean.lnL,
var.lnL = var.lnL
)
result <- scan(file, "character",
sep = "\n",
quiet = TRUE
)
first <- grep("(%Miss)", result, value = FALSE) + 1
last <- grep("Estimated Allele", result, value = FALSE) -
1
tbl.txt <- result[first:last]
tbl.txt <- sub("[*]+", "", tbl.txt)
tbl.txt <- sub("[(]", "", tbl.txt)
tbl.txt <- sub("[)]", "", tbl.txt)
tbl.txt <- sub("[|][ ]+$", "", tbl.txt)
prior.lines <- grep("[|]", tbl.txt)
no.prior <- if (length(prior.lines) < length(tbl.txt)) {
no.prior.q.txt <- if (length(prior.lines) == 0) {
tbl.txt
} else {
tbl.txt[-prior.lines]
}
.structureParseQmat(no.prior.q.txt, pops)
} else {
NULL
}
if (maxpops == 1) {
no.prior$row <- NULL
return(list(summary = smry, q.mat = no.prior, prior.anc = NULL))
}
has.prior <- if (length(prior.lines) > 0) {
prior.txt <- strsplit(tbl.txt[prior.lines], "[|]")
prior.q.txt <- unlist(lapply(prior.txt, function(x) x[1]))
df <- .structureParseQmat(prior.q.txt, pops)
prior.anc <- purrr::map(prior.txt, function(x) {
anc.mat <- matrix(NA, nrow = maxpops, ncol = gensback + 1)
rownames(anc.mat) <- paste("Pop", 1:nrow(anc.mat), sep = ".")
colnames(anc.mat) <- paste("Gen", 0:gensback, sep = ".")
x <- sapply(strsplit(x[-1], "\\s|[:]"), function(y) {
y <- y[y != ""]
y[-1]
})
for (i in 1:ncol(x)) {
pop <- as.numeric(x[1, i])
anc.mat[pop, ] <- as.numeric(x[-1, i])
}
anc.mat
}) %>% stats::setNames(df$id)
prob.mat <- t(sapply(1:nrow(df), function(i) {
pop.probs <- rowSums(prior.anc[[i]])
pop.probs[is.na(pop.probs)] <- df$Group.1[i]
pop.probs
}))
colnames(prob.mat) <- paste("Group", 1:ncol(prob.mat),
sep = "."
)
df$Group.1 <- NULL
df <- cbind(df, prob.mat)
list(df = df, prior.anc = prior.anc)
} else {
NULL
}
has.prior.df <- if (is.null(has.prior)) {
NULL
} else {
has.prior$df
}
q.mat <- rbind(no.prior, has.prior.df)
q.mat <- q.mat[order(q.mat$row), ]
q.mat$row <- NULL
rownames(q.mat) <- NULL
q.mat[, -(1:3)] <- t(apply(q.mat[, -(1:3)], 1, function(i) i / sum(i)))
prior.anc <- if (is.null(has.prior)) {
NULL
} else {
has.prior$prior.anc
}
list(summary = smry, q.mat = q.mat, prior.anc = prior.anc)
}
###########################################################
label <- g$description
label <- paste(label, "structureRun", sep = ".")
label <- gsub("[[:space:]]", ".", label)
label <- gsub(":", ".", label)
unlink(label, recursive = TRUE, force = TRUE)
dir.create(label)
if (!utils::file_test("-d", label)) {
stop(error(paste("'", label, "' is not a valid folder.",
sep = ""
)))
}
label <- file.path(label, label)
if (is.null(k.range)) {
k.range <- 1:(dplyr::n_distinct(g$data$stratum))
}
rep.df <- expand.grid(rep = 1:num.k.rep, k = k.range)
rownames(rep.df) <- paste(label, ".k", rep.df$k, ".r",
rep.df$rep,
sep = ""
)
out.files <- lapply(rownames(rep.df), function(x) {
sw.out <- structureWrite(g,
label = x,
maxpops = rep.df[x, "k"],
burnin = burnin,
numreps = numreps,
noadmix = noadmix,
freqscorr = freqscorr,
randomize = randomize,
seed = seed,
pop.prior = pop.prior,
locpriorinit = locpriorinit,
maxlocprior = maxlocprior,
gensback = gensback,
migrprior = migrprior,
pfrompopflagonly = pfrompopflagonly,
popflag = popflag,
inferalpha = inferalpha,
alpha = alpha,
unifprioralpha = unifprioralpha,
alphamax = alphamax,
alphapriora = alphapriora,
alphapriorb = alphapriorb
)
files <- sw.out$files
cmd <- paste0(
exec, " -m ", files["mainparams"],
" -e ", files["extraparams"], " -i ",
files["data"], " -o ", files["out"]
)
err.code <- system(cmd)
if (err.code == 127) {
stop("You do not have STRUCTURE installed.")
} else if (!err.code == 0) {
stop(paste(
"Error running STRUCTURE. Error code",
err.code, "returned."
))
}
files["out"] <- paste(files["out"], "_f",
sep = ""
)
result <- structureRead(files["out"], sw.out$pops)
if (file.exists("seed.txt")) {
file.remove("seed.txt")
}
files <- if (delete.files) {
NULL
} else {
files
}
result <- c(result, list(files = files, label = basename(x)))
fname <- paste(x, ".ws.rdata", sep = "")
save(result, file = fname)
fname
})
run.result <- lapply(out.files, function(f) {
result <- NULL
load(f)
result
})
names(run.result) <- sapply(run.result, function(x) x$label)
class(run.result) <- c("structure.result", class(run.result))
if (delete.files) {
unlink(dirname(label), recursive = TRUE, force = TRUE)
}
run.result
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.