R/utils.structure.run.r

Defines functions utils.structure.run

Documented in utils.structure.run

#' Utility function to run Structure
#'
#' These functions were copied from package strataG, which is no longer on CRAN
#'  (maintained by Eric Archer)
#' @export
#' @author Bernd Gruber (bugs? Post to
#'  \url{https://groups.google.com/d/forum/dartr}); original implementation of
#'  Eric Archer \url{https://github.com/EricArcher/strataG}
#'
#' @param g a gtypes object [see \code{strataG}].
#' @param k.range vector of values to for \code{maxpop} in multiple runs. If set
#'   to \code{NULL}, a single STRUCTURE run is conducted with \code{maxpops}
#'   groups. If specified, do not also specify \code{maxpops}.
#' @param num.k.rep number of replicates for each value in \code{k.range}.
#' @param label label to use for input and output files
#' @param delete.files logical. Delete all files when STRUCTURE is finished?
#' @param exec name of executable for STRUCTURE. Defaults to "structure".
#' @param burnin Number of burnin reps [default 10000].
#' @param numreps Number of MCMC replicates [default 1000].
#' @param noadmix Logical. No admixture? [default TRUE].
#' @param freqscorr Logical. Correlated frequencies? [default FALSE].
#' @param randomize Randomize [default TRUE].
#' @param seed Set random seed [default 0].
#' @param pop.prior A character specifying which population prior model to use:
#'  "locprior" or "usepopinfo" [default NULL].
#' @param locpriorinit Parameterizes locprior parameter r - how informative the
#'  populations are. Only used when pop.prior = "locprior" [default 1].
#' @param maxlocprior Specifies range of locprior parameter r. Only used when 
#' pop.prior = "locprior" [default 20].
#' @param gensback Integer defining the number of generations back to test for 
#' immigrant ancestry. Only used when pop.prior = "usepopinfo" [default 2].
#' @param migrprior Numeric between 0 and 1 listing migration prior. Only used 
#' when pop.prior = "usepopinfo" [default 0.05].
#' @param pfrompopflagonly Logical. update allele frequencies from individuals 
#' specified by popflag. Only used when pop.prior = "usepopinfo" [default TRUE].
#' @param popflag A vector of integers (0, 1) or logicals identifiying whether 
#' or not to use strata information. Only used when pop.prior = "usepopinfo"
#'  [default NULL].
#' @param inferalpha Logical. Infer the value of the model parameter # from the 
#' data; otherwise is fixed at the value alpha which is chosen by the user. 
#' This option is ignored under the NOADMIX model. Small alpha implies that 
#' most individuals are essentially from one population or another, while 
#' alpha > 1 implies that most individuals are admixed [default FALSE].
#' @param alpha Dirichlet parameter for degree of admixture. This is the 
#' initial value if inferalpha = TRUE [default 1].
#' @param unifprioralpha Logical. Assume a uniform prior for alpha which runs 
#' between 0 and alphamax. This model seems to work fine; the alternative model
#'  (when unfprioralpha = 0) is to take alpha as having a Gamma prior, with 
#'  mean alphapriora × alphapriorb, and variance alphapriora × alphapriorb^2 
#'  [default TRUE].
#' @param alphamax Maximum for uniform prior on alpha when 
#' unifprioralpha = TRUE [default 20].
#' @param alphapriora Parameters of Gamma prior on alpha when 
#' unifprioralpha = FALSE [default 0.05].
#' @param alphapriorb Parameters of Gamma prior on alpha when 
#' unifprioralpha = FALSE [default 0.001].
#'
#' @return \describe{ \item{\code{structureRun}}{a list where each element is a
#' list with results from \code{structureRead} and a vector of the filenames
#' used} \item{\code{structureWrite}}{a vector of the filenames used by
#' STRUCTURE} \item{\code{structureRead}}{a list containing: \describe{
#' \item{\code{summary}}{new locus name, which is a combination of loci in
#' group} \item{\code{q.mat}}{data.frame of assignment probabilities for each
#' id} \item{\code{prior.anc}}{list of prior ancestry estimates for each
#' individual where population priors were used} \item{\code{files}}{vector of
#' input and output files used by STRUCTURE} \item{\code{label}}{label for the
#' run} } } }

utils.structure.run <- function(g,
                                k.range,
                                num.k.rep,
                                label,
                                delete.files = TRUE,
                                exec,
                                burnin,
                                numreps,
                                noadmix,
                                freqscorr,
                                randomize,
                                seed,
                                pop.prior,
                                locpriorinit,
                                maxlocprior,
                                gensback,
                                migrprior,
                                pfrompopflagonly,
                                popflag,
                                inferalpha,
                                alpha,
                                unifprioralpha,
                                alphamax,
                                alphapriora,
                                alphapriorb
                                # ,
                                # ...
                                ) {
  ################################################################

  .structureParseQmat <- function(q.mat.txt,
                                  pops) {
    q.mat.txt <- sub("[*]+", "", q.mat.txt)
    q.mat.txt <- sub("[(]", "", q.mat.txt)
    q.mat.txt <- sub("[)]", "", q.mat.txt)
    q.mat.txt <- sub("[|][ ]+$", "", q.mat.txt)
    cols1to4 <- c("row", "id", "pct.miss", "orig.pop")

    strsplit(q.mat.txt, " ") %>%
      purrr::map(function(q) {
        q <- q[!q %in% c("", " ", ":")] %>%
          as.character() %>%
          rbind() %>%
          as.data.frame(stringsAsFactors = FALSE)

        stats::setNames(
          q,
          c(
            cols1to4,
            paste("Group", 1:(ncol(q) - 4), sep = ".")
          )
        )
      }) %>%
      dplyr::bind_rows() %>%
      dplyr::mutate_at(dplyr::vars(
        "row",
        "pct.miss",
        "orig.pop",
        dplyr::starts_with("Group.")
      ), as.numeric) %>%
      dplyr::mutate(orig.pop = if (!is.null(pops)) {
        pops[.data$orig.pop]
      } else {
        .data$orig.pop
      })
  }

  ################################################################

  .alleles2integer <- function(g,
                               min.val = 0) {
    g$data %>%
      dplyr::group_by(.data$locus) %>%
      dplyr::mutate(allele = min.val - 1 + as.integer(factor(.data$allele))) %>%
      dplyr::ungroup()
  }

  ################################################################

  .stackedAlleles <- function(g,
                              alleles2integer = FALSE,
                              na.val = NULL,
                              ...) {
    x <- if (alleles2integer) {
      .alleles2integer(g, ...)
    } else {
      g$data
    }

    if (!is.null(na.val)) {
      x$allele[is.na(x$allele)] <- na.val
    }

    x %>%
      dplyr::arrange(.data$id, .data$locus) %>%
      dplyr::mutate(a = rep(1:g$ploidy, dplyr::n() / g$ploidy)) %>%
      tidyr::spread(.data$locus, .data$allele) %>%
      dplyr::rename(allele = "a") %>%
      dplyr::select(.data$id, .data$stratum, .data$allele, dplyr::everything())
  }

  ####################################################
  .getFileLabel <- function(g,
                            label = NULL) {
    desc <- g$description
    label <- if (!is.null(label)) {
      label
    } else if (!is.null(desc)) {
      desc
    } else {
      "strataG.gtypes"
    }
    gsub("[[:punct:]]", ".", label)
  }

  ####################################################

  structureWrite <- function(g,
                             label = NULL,
                             maxpops = 1:(dplyr::n_distinct(g$data$stratum)),
                             burnin = 1000,
                             numreps = 1000,
                             noadmix = TRUE,
                             freqscorr = FALSE,
                             randomize = TRUE,
                             seed = 0,
                             pop.prior = NULL,
                             locpriorinit = 1,
                             maxlocprior = 20,
                             gensback = 2,
                             migrprior = 0.05,
                             pfrompopflagonly = TRUE,
                             popflag = NULL,
                             inferalpha = FALSE,
                             alpha = 1,
                             unifprioralpha = TRUE,
                             alphamax = 20,
                             alphapriora = 0.05,
                             alphapriorb = 0.001) {
    if (!is.null(pop.prior)) {
      if (!pop.prior %in% c("locprior", "usepopinfo")) {
        stop(error("'pop.prior' must be 'locprior' or 'usepopinfo'."))
      }
    }

    if (is.null(popflag)) {
      popflag <- rep(1, length(unique(g$data$id)))
    }

    if (length(popflag) != length(unique(g$data$id))) {
      stop(error("  'popflag' should be the same length as the number of individuals in 'g'."))
    }
    if (!all(popflag %in% c(0, 1))) {
      stop(error("  All values in 'popflag' must be 0 or 1."))
    }

    if (is.null(names(popflag))) {
      names(popflag) <- unique(g$data$id)
    }

    in.file <- ifelse(is.null(label), "data", paste(label, "data", sep = "_"))
    out.file <- ifelse(is.null(label), "out", paste(label, "out", sep = "_"))
    main.file <- ifelse(is.null(label), "mainparams", paste(label, "mainparams", sep = "_"))
    extra.file <- ifelse(is.null(label), "extraparams", paste(label, "extraparams", sep = "_"))
    mat <- .stackedAlleles(g, alleles2integer = TRUE, na.val = -9) %>%
      dplyr::select(-.data$allele) %>%
      dplyr::mutate(
        id = gsub(" ", "_", .data$id),
        stratum = as.numeric(factor(.data$stratum)),
        popflag = popflag[.data$id]
      ) %>%
      dplyr::select(.data$id, .data$stratum, .data$popflag, dplyr::everything()) %>%
      as.matrix()

    write(paste(sort(unique(g$data[["locus"]])), collapse = " "), file = in.file)

    for (i in 1:nrow(mat)) {
      write(paste(mat[i, ], collapse = " "),
        file = in.file,
        append = TRUE
      )
    }

    main.params <- c(
      paste("MAXPOPS", as.integer(maxpops)),
      paste("BURNIN", as.integer(burnin)),
      paste("NUMREPS", as.integer(numreps)),
      paste("INFILE", in.file),
      paste("OUTFILE", out.file),
      paste("NUMINDS", length(unique(g$data$id))),
      paste("NUMLOCI", length(unique(g$data$locus))),
      "MISSING -9", "LABEL 1", "POPDATA 1",
      "POPFLAG 1", "LOCDATA 0", "PHENOTYPE 0",
      "EXTRACOLS 0", "MARKERNAMES 1"
    )

    main.params <- paste("#define", main.params)

    write(main.params, file = main.file)

    extra.params <- c(
      paste("NOADMIX", as.integer(noadmix)),
      paste("FREQSCORR", as.integer(freqscorr)),
      paste("INFERALPHA", as.integer(inferalpha)),
      paste("ALPHA", as.numeric(alpha)),
      "FPRIORMEAN 0.01", "FPRIORSD 0.05", "LAMBDA 1.0",
      paste("UNIFPRIORALPHA", as.integer(unifprioralpha)),
      paste("ALPHAMAX", as.numeric(alphamax)),
      paste("ALPHAPRIORA", as.numeric(alphapriora)),
      paste("ALPHAPRIORB", as.numeric(alphapriorb)),
      "COMPUTEPROB 1",
      paste("ADMBURNIN", max(0, as.integer(burnin / 2))),
      "ALPHAPROPSD 0.025", "STARTATPOPINFO 0",
      paste("RANDOMIZE", as.integer(randomize)),
      paste("SEED", as.integer(seed)),
      "METROFREQ 10", "REPORTHITRATE 0"
    )

    if (!is.null(pop.prior)) {
      pop.prior <- tolower(pop.prior)
      prior.params <- if (pop.prior == "locprior") {
        c("LOCPRIOR 1", "LOCISPOP 1", paste(
          "LOCPRIORINIT",
          locpriorinit
        ), paste("MAXLOCPRIOR", maxlocprior))
      } else if (pop.prior == "usepopinfo") {
        c(
          "USEPOPINFO 1", paste("GENSBACK", trunc(gensback)),
          paste("MIGRPRIOR", migrprior), paste(
            "PFROMPOPFLAGONLY",
            as.integer(pfrompopflagonly)
          )
        )
      }
      extra.params <- c(extra.params, prior.params)
    }

    extra.params <- extra.params[!is.na(extra.params)]
    extra.params <- paste("#define", extra.params)

    write(extra.params, file = extra.file)

    invisible(list(
      files = c(
        data = in.file,
        mainparams = main.file,
        extraparams = extra.file,
        out = out.file
      ),
      pops = sort(unique(g$data$stratum))
    ))
  }

  ###########################################################
  structureRead <- function(file,
                            pops = NULL) {
    if (!file.exists(file)) {
      stop(error(paste("the file '", file, "' can't be found.",
        sep = ""
      )))
    }

    result <- scan(file, "character", quiet = TRUE)
    loc <- grep("Estimated", result,
      ignore.case = FALSE,
      value = FALSE
    )
    est.ln.prob <- as.numeric(result[loc[1] + 6])
    loc <- grep("likelihood", result,
      ignore.case = FALSE,
      value = FALSE
    )
    mean.lnL <- as.numeric(result[loc[1] + 2])
    var.lnL <- as.numeric(result[loc[2] + 2])
    loc <- grep("MAXPOPS", result, value = F)
    maxpops <- result[loc]
    maxpops <- sub("MAXPOPS=", "", maxpops)
    maxpops <- as.integer(sub(",", "", maxpops))
    loc <- grep("GENSBACK", result, value = F)
    gensback <- result[loc]
    gensback <- sub("GENSBACK=", "", gensback)
    gensback <- as.integer(sub(",", "", gensback))
    smry <- c(
      k = maxpops, est.ln.prob = est.ln.prob, mean.lnL = mean.lnL,
      var.lnL = var.lnL
    )
    result <- scan(file, "character",
      sep = "\n",
      quiet = TRUE
    )
    first <- grep("(%Miss)", result, value = FALSE) + 1
    last <- grep("Estimated Allele", result, value = FALSE) -
      1
    tbl.txt <- result[first:last]
    tbl.txt <- sub("[*]+", "", tbl.txt)
    tbl.txt <- sub("[(]", "", tbl.txt)
    tbl.txt <- sub("[)]", "", tbl.txt)
    tbl.txt <- sub("[|][ ]+$", "", tbl.txt)
    prior.lines <- grep("[|]", tbl.txt)

    no.prior <- if (length(prior.lines) < length(tbl.txt)) {
      no.prior.q.txt <- if (length(prior.lines) == 0) {
        tbl.txt
      } else {
        tbl.txt[-prior.lines]
      }
      .structureParseQmat(no.prior.q.txt, pops)
    } else {
      NULL
    }

    if (maxpops == 1) {
      no.prior$row <- NULL
      return(list(summary = smry, q.mat = no.prior, prior.anc = NULL))
    }

    has.prior <- if (length(prior.lines) > 0) {
      prior.txt <- strsplit(tbl.txt[prior.lines], "[|]")
      prior.q.txt <- unlist(lapply(prior.txt, function(x) x[1]))
      df <- .structureParseQmat(prior.q.txt, pops)

      prior.anc <- purrr::map(prior.txt, function(x) {
        anc.mat <- matrix(NA, nrow = maxpops, ncol = gensback + 1)
        rownames(anc.mat) <- paste("Pop", 1:nrow(anc.mat), sep = ".")
        colnames(anc.mat) <- paste("Gen", 0:gensback, sep = ".")

        x <- sapply(strsplit(x[-1], "\\s|[:]"), function(y) {
          y <- y[y != ""]
          y[-1]
        })

        for (i in 1:ncol(x)) {
          pop <- as.numeric(x[1, i])
          anc.mat[pop, ] <- as.numeric(x[-1, i])
        }
        anc.mat
      }) %>% stats::setNames(df$id)
      prob.mat <- t(sapply(1:nrow(df), function(i) {
        pop.probs <- rowSums(prior.anc[[i]])
        pop.probs[is.na(pop.probs)] <- df$Group.1[i]
        pop.probs
      }))
      colnames(prob.mat) <- paste("Group", 1:ncol(prob.mat),
        sep = "."
      )
      df$Group.1 <- NULL
      df <- cbind(df, prob.mat)
      list(df = df, prior.anc = prior.anc)
    } else {
      NULL
    }

    has.prior.df <- if (is.null(has.prior)) {
      NULL
    } else {
      has.prior$df
    }

    q.mat <- rbind(no.prior, has.prior.df)
    q.mat <- q.mat[order(q.mat$row), ]
    q.mat$row <- NULL
    rownames(q.mat) <- NULL
    q.mat[, -(1:3)] <- t(apply(q.mat[, -(1:3)], 1, function(i) i / sum(i)))
    prior.anc <- if (is.null(has.prior)) {
      NULL
    } else {
      has.prior$prior.anc
    }

    list(summary = smry, q.mat = q.mat, prior.anc = prior.anc)
  }

  ###########################################################

  label <- g$description
  label <- paste(label, "structureRun", sep = ".")
  label <- gsub("[[:space:]]", ".", label)
  label <- gsub(":", ".", label)

  unlink(label, recursive = TRUE, force = TRUE)

  dir.create(label)

  if (!utils::file_test("-d", label)) {
    stop(error(paste("'", label, "' is not a valid folder.",
      sep = ""
    )))
  }

  label <- file.path(label, label)

  if (is.null(k.range)) {
    k.range <- 1:(dplyr::n_distinct(g$data$stratum))
  }

  rep.df <- expand.grid(rep = 1:num.k.rep, k = k.range)
  rownames(rep.df) <- paste(label, ".k", rep.df$k, ".r",
    rep.df$rep,
    sep = ""
  )

  out.files <- lapply(rownames(rep.df), function(x) {
    sw.out <- structureWrite(g, 
                             label = x, 
                             maxpops = rep.df[x, "k"],
                             burnin = burnin,
                             numreps = numreps,
                             noadmix = noadmix,
                             freqscorr = freqscorr,
                             randomize = randomize,
                             seed = seed,
                             pop.prior = pop.prior,
                             locpriorinit = locpriorinit,
                             maxlocprior = maxlocprior,
                             gensback = gensback,
                             migrprior = migrprior,
                             pfrompopflagonly = pfrompopflagonly,
                             popflag = popflag,
                             inferalpha = inferalpha,
                             alpha = alpha,
                             unifprioralpha = unifprioralpha,
                             alphamax = alphamax,
                             alphapriora = alphapriora,
                             alphapriorb = alphapriorb
                             )

    files <- sw.out$files
    cmd <- paste0(
      exec, " -m ", files["mainparams"],
      " -e ", files["extraparams"], " -i ",
      files["data"], " -o ", files["out"]
    )
    err.code <- system(cmd)
    if (err.code == 127) {
      stop("You do not have STRUCTURE installed.")
    } else if (!err.code == 0) {
      stop(paste(
        "Error running STRUCTURE. Error code",
        err.code, "returned."
      ))
    }
    files["out"] <- paste(files["out"], "_f",
      sep = ""
    )
    result <- structureRead(files["out"], sw.out$pops)
    if (file.exists("seed.txt")) {
      file.remove("seed.txt")
    }
    files <- if (delete.files) {
      NULL
    } else {
      files
    }
    result <- c(result, list(files = files, label = basename(x)))
    fname <- paste(x, ".ws.rdata", sep = "")
    save(result, file = fname)
    fname
  })

  run.result <- lapply(out.files, function(f) {
    result <- NULL
    load(f)
    result
  })
  names(run.result) <- sapply(run.result, function(x) x$label)
  class(run.result) <- c("structure.result", class(run.result))
  if (delete.files) {
    unlink(dirname(label), recursive = TRUE, force = TRUE)
  }
  run.result
}

Try the dartR.popgen package in your browser

Any scripts or data that you put into this service are public.

dartR.popgen documentation built on June 28, 2024, 1:06 a.m.