R/gl.read.structure.r
In dartR.popgen: Analysing 'SNP' and 'Silicodart' Data Generated by Genome-Wide Restriction Fragment Analysis

Documented in gl.read.structure

#' @name gl.read.structure
#'
#' @title Read output files produced by the program STRUCTURE
#'
#' @description
#' Reads and processes STRUCTURE output files, extracting run summaries and Q-matrices.
#' Optionally associates Q-matrices with population information from a genlight object.
#'
#' @param folder.path Path to folder containing STRUCTURE output files [required].
#' @param x Optional genlight object used to attach population labels to individuals [default NULL].
#' @param pattern Optional regex to filter files in folder.path (e.g. ".*_f$" or "out$") [default NULL].
#' @param recursive Logical; search folder recursively [default FALSE].
#' @param rename_files Logical; if TRUE, renames source files on disk based on K and replicate [default FALSE].
#' @param prefix Optional prefix used for renaming/labels. If NULL, uses longest common prefix of filenames [default NULL].
#' @param verbose Verbosity (as in dartR) [default 2 / gl.set.verbosity()].
#'
#' @return A list of class "structure.result". Each element contains:
#' \itemize{
#'   \item summary: named numeric vector (k, est.ln.prob, mean.lnL, var.lnL)
#'   \item q.mat: data.frame (id, pct.miss, orig.pop, Group.1..Group.K)
#'   \item prior.anc: optional list of ancestry matrices (if present)
#'   \item files: file path
#'   \item label: run label
#' }
#'
#' @export
gl.read.structure <- function(folder.path,
                              x = NULL,
                              pattern = NULL,
                              recursive = FALSE,
                              rename_files = FALSE,
                              prefix = NULL,
                              verbose = NULL) {
  
  # SET VERBOSITY
  verbose <- gl.check.verbosity(verbose)
  
  funname <- match.call()[[1]]
  utils.flag.start(func = funname, build = "Jody", verbose = verbose)
  
  # Dependencies (keep explicit, but do not attach)
  req_packages <- c("purrr", "dplyr")
  for (pkg in req_packages) {
    if (!requireNamespace(pkg, quietly = TRUE)) {
      stop(error("Package ", pkg, " is needed for this function to work. Please install it."))
    }
  }
  
  if (!dir.exists(folder.path)) {
    stop(error("The folder '", folder.path, "' doesn't exist."))
  }
  
  file_paths <- list.files(
    folder.path,
    full.names = TRUE,
    recursive = recursive
  )
  if (!is.null(pattern)) {
    file_paths <- file_paths[grepl(pattern, basename(file_paths))]
  }
  if (length(file_paths) == 0) {
    stop(error("No files found in folder '", folder.path, "' with the requested criteria."))
  }
  
  # ---- helpers ----
  
  longest_common_prefix <- function(strings) {
    strings <- as.character(strings)
    if (length(strings) == 0) return("")
    s0 <- strings[which.min(nchar(strings))]
    if (nchar(s0) == 0) return("")
    for (i in seq_len(nchar(s0))) {
      ch <- substr(s0, i, i)
      if (any(substr(strings, i, i) != ch)) {
        return(if (i == 1) "" else substr(s0, 1, i - 1))
      }
    }
    s0
  }
  
  # Detect K: prefer MAXPOPS=, fallback to "populations assumed"
  detect_k <- function(file) {
    lines <- readLines(file, warn = FALSE, n = 300)
    
    # MAXPOPS= is typically present in STRUCTURE outputs
    mx <- grep("MAXPOPS\\s*=", lines, value = TRUE)
    if (length(mx) > 0) {
      k <- suppressWarnings(as.integer(sub(".*MAXPOPS\\s*=\\s*([0-9]+).*", "\\1", mx[1])))
      if (!is.na(k)) return(k)
    }
    
    # fallback
    pa <- grep("populations assumed", lines, ignore.case = TRUE, value = TRUE)
    if (length(pa) > 0) {
      k <- suppressWarnings(as.integer(gsub("[^0-9]", "", pa[1])))
      if (!is.na(k)) return(k)
    }
    
    NA_integer_
  }
  
  clean_q_line <- function(s) {
    s <- gsub("\\*+", "", s)
    s <- gsub("[()]", "", s)
    s <- sub("\\|\\s*$", "", s)
    s
  }
  
  .structureParseQmat2 <- function(q.mat.txt, pops = NULL, popdata = 1L) {
    q.mat.txt <- clean_q_line(q.mat.txt)
    
    # split by whitespace; keep stable columns
    cols_fixed <- c("row", "id", "pct.miss", "orig.pop")
    
    rows <- strsplit(q.mat.txt, "\\s+") |>
      purrr::map(function(q) {
        q <- q[q != "" & q != ":"]
        if (length(q) == 0) return(NULL)
        
        # If POPDATA==0 then orig.pop is absent; insert a dummy "1" after pct.miss.
        if (popdata == 0L) {
          if (length(q) < 4) return(NULL)
          q <- c(q[1:3], "1", q[4:length(q)])
        }
        
        df <- as.data.frame(t(q), stringsAsFactors = FALSE)
        n_groups <- max(0, ncol(df) - 4)
        colnames(df) <- c(cols_fixed, paste0("Group.", seq_len(n_groups)))
        df
      }) |>
      purrr::compact()
    
    if (length(rows) == 0) return(NULL)
    
    out <- dplyr::bind_rows(rows)
    
    # numeric coercion (suppress warnings from any stray text)
    num_cols <- c("row", "pct.miss", "orig.pop", grep("^Group\\.", names(out), value = TRUE))
    out[num_cols] <- lapply(out[num_cols], function(z) suppressWarnings(as.numeric(z)))
    
    if (!is.null(pops)) {
      out$orig.pop <- pops[out$orig.pop]
    }
    
    out
  }
  
  structureRead2 <- function(file, pops = NULL) {
    if (!file.exists(file)) stop(error("The file '", file, "' can't be found."))
    
    # read token stream for summary stats (fast enough, keeps your original logic)
    tokens <- scan(file, what = "character", quiet = TRUE)
    
    get_after <- function(pattern, offset, which_hit = 1L) {
      loc <- grep(pattern, tokens, fixed = FALSE)
      if (length(loc) < which_hit) return(NA_real_)
      suppressWarnings(as.numeric(tokens[loc[which_hit] + offset]))
    }
    
    est.ln.prob <- get_after("Estimated", 6, 1)
    mean.lnL    <- get_after("likelihood", 2, 1)
    var.lnL     <- get_after("likelihood", 2, 2)
    
    # Robust parameter extraction from tokens
    extract_param_int <- function(param) {
      loc <- grep(param, tokens, fixed = TRUE)
      if (length(loc) == 0) return(NA_integer_)
      v <- tokens[loc[1]]
      v <- sub(paste0(".*", param, "="), "", v)
      v <- sub(",", "", v, fixed = TRUE)
      suppressWarnings(as.integer(v))
    }
    
    maxpops  <- extract_param_int("MAXPOPS")
    gensback <- extract_param_int("GENSBACK")
    popdata  <- extract_param_int("POPDATA")
    if (is.na(popdata)) popdata <- 1L
    
    smry <- c(
      k = maxpops,
      est.ln.prob = est.ln.prob,
      mean.lnL = mean.lnL,
      var.lnL = var.lnL
    )
    
    # line-based parsing for Q-matrix
    lines <- scan(file, what = "character", sep = "\n", quiet = TRUE)
    
    first <- grep("\\(%Miss\\)", lines) + 1
    last  <- grep("Estimated Allele", lines) - 1
    if (length(first) == 0 || length(last) == 0 || first[1] > last[1]) {
      stop(error("Could not locate Q-matrix block in file: ", basename(file)))
    }
    
    tbl.txt <- lines[first[1]:last[1]]
    tbl.txt <- clean_q_line(tbl.txt)
    
    prior.lines <- grep("\\|", tbl.txt)
    
    no.prior <- NULL
    if (length(prior.lines) < length(tbl.txt)) {
      no.prior.q.txt <- if (length(prior.lines) == 0) tbl.txt else tbl.txt[-prior.lines]
      no.prior <- .structureParseQmat2(no.prior.q.txt, pops = pops, popdata = popdata)
    }
    
    # K=1 shortcut
    if (!is.na(maxpops) && maxpops == 1L) {
      if (!is.null(no.prior)) no.prior$row <- NULL
      return(list(summary = smry, q.mat = no.prior, prior.anc = NULL))
    }
    
    has.prior.df <- NULL
    prior.anc <- NULL
    
    if (length(prior.lines) > 0) {
      prior.parts <- strsplit(tbl.txt[prior.lines], "\\|", fixed = FALSE)
      
      prior.q.txt <- vapply(prior.parts, function(x) x[1], character(1))
      df <- .structureParseQmat2(prior.q.txt, pops = pops, popdata = popdata)
      
      # build prior ancestry matrices per individual (keyed by id)
      prior.anc <- purrr::imap(prior.parts, function(x, idx) {
        anc.mat <- matrix(NA_real_, nrow = maxpops, ncol = gensback + 1)
        rownames(anc.mat) <- paste0("Pop.", seq_len(nrow(anc.mat)))
        colnames(anc.mat) <- paste0("Gen.", 0:gensback)
        
        # x[-1] contains ancestry strings after '|'
        bits <- strsplit(x[-1], "\\s|:", perl = TRUE)
        bits <- lapply(bits, function(y) y[y != ""])
        # expected: first token is pop, remaining are gens 0..gensback
        for (b in bits) {
          if (length(b) < 2) next
          pop_i <- suppressWarnings(as.integer(b[1]))
          if (is.na(pop_i) || pop_i < 1 || pop_i > maxpops) next
          vals <- suppressWarnings(as.numeric(b[-1]))
          if (length(vals) == ncol(anc.mat)) anc.mat[pop_i, ] <- vals
        }
        anc.mat
      })
      
      # name prior.anc by id if available
      if (!is.null(df) && "id" %in% names(df) && length(prior.anc) == nrow(df)) {
        names(prior.anc) <- df$id
      }
      
      # reconstruct group probs (rowSums), fallback to first group when missing
      prob.mat <- t(vapply(seq_len(nrow(df)), function(i) {
        id_i <- df$id[i]
        anc <- prior.anc[[id_i]]
        p <- rowSums(anc, na.rm = TRUE)
        # if everything is NA, fallback using first group column (STRUCTURE behaviour varies)
        if (all(is.na(p)) || all(p == 0)) {
          # if Group.1 exists, replicate it; otherwise NA
          g1 <- if ("Group.1" %in% names(df)) df$Group.1[i] else NA_real_
          p <- rep(g1, maxpops)
        }
        p
      }, numeric(maxpops)))
      
      colnames(prob.mat) <- paste0("Group.", seq_len(ncol(prob.mat)))
      df$Group.1 <- NULL
      has.prior.df <- cbind(df, prob.mat, stringsAsFactors = FALSE)
    }
    
    q.mat <- dplyr::bind_rows(no.prior, has.prior.df)
    if (is.null(q.mat) || nrow(q.mat) == 0) {
      stop(error("Parsed Q-matrix is empty for file: ", basename(file)))
    }
    
    # order by original row and drop helper
    if ("row" %in% names(q.mat)) {
      q.mat <- q.mat[order(q.mat$row), , drop = FALSE]
      q.mat$row <- NULL
    }
    rownames(q.mat) <- NULL
    
    # normalize Q rows to sum to 1 (Groups only)
    group_cols <- grep("^Group\\.", names(q.mat), value = TRUE)
    if (length(group_cols) > 0) {
      q <- as.matrix(q.mat[, group_cols, drop = FALSE])
      rs <- rowSums(q, na.rm = TRUE)
      rs[rs == 0] <- NA_real_
      q <- q / rs
      q.mat[, group_cols] <- q
    }
    
    list(summary = smry, q.mat = q.mat, prior.anc = prior.anc)
  }
  
  # ---- file inventory ----
  
  k_vals <- vapply(file_paths, detect_k, integer(1))
  file_info <- data.frame(
    f_name = file_paths,
    k = k_vals,
    stringsAsFactors = FALSE
  )
  
  if (all(is.na(file_info$k))) {
    stop(error("Could not detect K for any file in: ", folder.path))
  }
  
  file_info <- file_info[order(file_info$k, file_info$f_name), , drop = FALSE]
  file_info <- dplyr::group_by(file_info, .data$k)
  file_info <- dplyr::mutate(file_info, rep = dplyr::row_number())
  file_info <- as.data.frame(file_info)
  
  # label/prefix
  file_names <- basename(file_paths)
  if (is.null(prefix)) {
    prefix <- longest_common_prefix(file_names)
    if (prefix == "") prefix <- "structure"
  }
  
  file_info$label <- paste0(prefix, ".k", file_info$k, ".r", file_info$rep)
  
  # optionally rename files on disk
  if (isTRUE(rename_files)) {
    if (verbose >= 2) cat(report("Renaming STRUCTURE files on disk.\n"))
    new_paths <- file.path(dirname(file_info$f_name), paste0(file_info$label, "_out"))
    ok <- file.rename(file_info$f_name, new_paths)
    if (!all(ok)) warning(warn("Not all files were successfully renamed."))
    file_info$f_name <- ifelse(ok, new_paths, file_info$f_name)
  }
  
  # ---- parse all runs ----
  
  if (verbose >= 2) cat(report("Processing ", nrow(file_info), " STRUCTURE output files.\n"))
  
  run_results <- purrr::pmap(
    list(file_info$f_name, file_info$label),
    function(f, lab) {
      if (verbose >= 3) cat(report("Processing: ", basename(f), "\n"))
      out <- structureRead2(file = f)
      out$files <- f
      out$label <- lab
      out
    }
  )
  
  names(run_results) <- file_info$label
  
  # ---- attach pop data from genlight (optional) ----
  if (!is.null(x)) {
    datatype <- utils.check.datatype(x, verbose = 0)
    if (!datatype %in% c("SNP", "SilicoDArT")) {
      stop(error("The x parameter must be a genlight object containing SNP or SilicoDArT data."))
    }
    
    pop_tbl <- data.frame(id = indNames(x), pop = pop(x), stringsAsFactors = FALSE)
    
    run_results <- lapply(run_results, function(y) {
      if (is.null(y$q.mat) || !("id" %in% names(y$q.mat))) return(y)
      merged <- merge(y$q.mat, pop_tbl, by = "id", all.x = TRUE, sort = FALSE)
      # keep original ordering from y$q.mat
      merged <- merged[match(y$q.mat$id, merged$id), , drop = FALSE]
      merged$orig.pop <- merged$pop
      merged$pop <- NULL
      y$q.mat <- merged
      y
    })
  }
  
  class(run_results) <- c("structure.result", class(run_results))
  
  if (verbose >= 1) cat(report("\nCompleted: ", funname, "\n"))
  run_results
}

Any scripts or data that you put into this service are public.

dartR.popgen documentation built on March 16, 2026, 9:07 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

dartR.popgen
Analysing 'SNP' and 'Silicodart' Data Generated by Genome-Wide Restriction Fragment Analysis

R/gl.read.structure.r
In dartR.popgen: Analysing 'SNP' and 'Silicodart' Data Generated by Genome-Wide Restriction Fragment Analysis

Defines functions gl.read.structure

Documented in gl.read.structure

Try the dartR.popgen package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

dartR.popgen Analysing 'SNP' and 'Silicodart' Data Generated by Genome-Wide Restriction Fragment Analysis

R/gl.read.structure.r In dartR.popgen: Analysing 'SNP' and 'Silicodart' Data Generated by Genome-Wide Restriction Fragment Analysis

Defines functions gl.read.structure

Documented in gl.read.structure

Try the dartR.popgen package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

dartR.popgen
Analysing 'SNP' and 'Silicodart' Data Generated by Genome-Wide Restriction Fragment Analysis

R/gl.read.structure.r
In dartR.popgen: Analysing 'SNP' and 'Silicodart' Data Generated by Genome-Wide Restriction Fragment Analysis