R/gl.assign.pa.r

Defines functions gl.assign.pa

Documented in gl.assign.pa

#' @name gl.assign.pa
#' @title Eliminates populations as possible source populations for an
#' individual of unknown provenance, using private alleles
#' @description
#' This script eliminates from consideration as putative source populations,
#' those populations for which the individual has too many private alleles. The
#' populations that remain are putative source populations, subject to further
#' consideration.
#'
#' The algorithm identifies those target populations for which the individual
#' has no private alleles or for which the number of private alleles does not
#' exceed a user specified threshold.
#'
#' An excessive count of private alleles is an indication that the unknown does
#' not belong to a target population (provided that the sample size is
#' adequate, say >=10).
#'
#' @param x Name of the input genlight object [required].
#' @param unknown SpecimenID label (indName) of the focal individual whose
#' provenance is unknown [required].
#' @param nmin Minimum sample size for a target population to be included in the
#' analysis [default 10].
#' @param threshold Populations to retain for consideration; those for which the
#'  focal individual has less than or equal to threshold loci with private
#'  alleles [default 0].
#' @param n.best If given a value, dictates the best n=n.best populations to
#' retain for consideration (or more if their are ties) based on private alleles
#'  [default NULL].
#' @param verbose Verbosity: 0, silent or fatal errors; 1, begin and end; 2,
#' progress log; 3, progress and results summary; 5, full report
#'  [default 2 or as specified using gl.set.verbosity].
#' @return A genlight object containing the focal individual (assigned to
#' population 'unknown') and populations for which the focal individual is not
#' distinctive (number of loci with private alleles less than or equal to the
#' threshold). If no such populations, the genlight object contains only data
#' for the unknown individual.
#'
#' @importFrom stats dnorm qnorm
#' @export
#'
#' @author Custodian: Arthur Georges -- Post to
#'   \url{https://groups.google.com/d/forum/dartr}
#' @examples
#' # Test run with a focal individual from the Macleay River (EmmacMaclGeor)
#' test <- gl.assign.pa(testset.gl,
#'   unknown = "UC_00146", nmin = 10, threshold = 1)
#'
#' @seealso \code{\link{gl.assign.pca}}

gl.assign.pa <- function(x,
                         unknown,
                         nmin = 10,
                         threshold = 0,
                         n.best = NULL,
                         verbose = NULL) {
  # SET VERBOSITY
  verbose <- gl.check.verbosity(verbose)

  # FLAG SCRIPT START
  funname <- match.call()[[1]]
  utils.flag.start(
    func = funname,
    build = "Josh",
    verbose = verbose
  )

  # CHECK DATATYPE
  datatype <- utils.check.datatype(x, verbose = verbose)

  # FUNCTION SPECIFIC ERROR CHECKING

  test <- unknown %in% indNames(x)
  if (!all(test, na.rm = FALSE)) {
    stop(
      error(
        "Fatal Error: nominated focal individual (of unknown provenance)
                is not present in the dataset!\n"
      )
    )
  }
  if (nmin <= 0 & verbose >= 1) {
    if(verbose>0) cat(
      warn(
        "  Warning: the minimum size of the target population must be
                greater than zero, set to 10\n"
      )
    )
    nmin <- 10
  }
  if (threshold < 0 & verbose >= 1) {
    if(verbose>0) cat(
      warn(
        "  Warning: the threshold for private alleles must be
                non-negative, set to 0\n"
      )
    )
    threshold <- 0
  }

  if (!is.null(n.best)) {
    if (n.best < 1 & verbose >= 1) {
      cat(
        warn(
          "  Warning: the n.best parameter for retention of best
                    match populations must be a positive integer, set to NULL\n"
        )
      )
      n.best <- NULL
    }
  }

  # DO THE JOB

  # Set a hard recommended minimum population size
  hard.min <- 10
  if (verbose >= 1 & (nmin < hard.min)) {
    cat(warn(
      "  Warning: The specified minimum sample size is less than 10
            individuals\n"
    ))
    cat(
      warn(
        "    Risk of alleles present in the unknown being missed during
                sampling of populations with sample sizes less than 10\n"
      )
    )
  }

  # Assign the unknown individual to population 'unknown'
  vec <- as.vector(pop(x))
  vec[indNames(x) == unknown] <- "unknown"
  # Note, population containing the unknown has been reduced in size by 1
  pop(x) <- as.factor(vec)

  # Remove loci scored as NA for the unknown
  a <- x[pop(x) == "unknown", ]
  # Fuck, it changed the locus names, replaced hyphens with periods
  b <- data.frame(as.matrix(a))
  # Change them back
  names(b) <- locNames(a)
  c <- names(b)[is.na(b)]
  if (length(c) > 0) {
    x <- gl.drop.loc(x, loc.list = c, verbose = 0)
  }

  # Split the genlight object into one containing the unknown and one
  # containing the remaining populations
  unknowns <- gl.keep.pop(x, pop.list = "unknown", verbose = 0)
  knowns <- gl.drop.pop(x, pop.list = "unknown", verbose = 0)

  # Remove all known populations with less than nmin individuals
  pop.keep <- levels(pop(knowns))[table(pop(knowns)) >= nmin]
  pop.toss <- levels(pop(knowns))[table(pop(knowns)) < nmin]
  if (verbose >= 2) {
    cat(report(
      "  Discarding",
      length(pop.toss),
      "populations with sample size <",
      nmin,
      ":\n"
    ))
    if (verbose >= 3) {
      cat(paste(pop.toss, collapse = ", "), "\n")
    }
  }

  # knowns <- knowns[pop(knowns) %in% pop.keep]
  knowns <- gl.keep.pop(knowns, pop.list = pop.keep, verbose = 0)

  # Warn of populations retained with sizes less than the hard wired minimum
  pop.warn <- levels(pop(knowns))[table(pop(knowns)) < hard.min]
  if (length(pop.warn >= 1)) {
    if (verbose >= 1) {
      cat(
        warn(
          "  Warning: Some retained populations have sample sizes less
                    than",
          hard.min,
          ":",
          pop.warn,
          "\n"
        )
      )
    }
    if (verbose >= 1) {
      cat(warn(
        "    Substantial risk of private alleles arising as sampling
                error\n\n"
      ))
    }
  }

  # Report number of focal individuals (1) and number of target populations
  n <- length(pop(unknowns))
  N <- length(levels(pop(knowns)))
  if (n != 1) {
    if (verbose >= 0) {
      stop(
        error(
          "Fatal Error: Number of unknown focal individuals > 1;
                    population label 'unknown' already in use\n"
        )
      )
    }
  }
  if (verbose >= 2) {
    cat(
      report(
        "  Assigning",
        n,
        "unknown individual",
        unknown,
        "to",
        N,
        "target populations using",
        nLoc(x),
        "loci\n"
      )
    )
  }

  # CALCULATE NUMBER OF LOCI WITH PRIVATE ALLELES

  # Genotype of the unknown individual
  unknown.ind <- as.matrix(unknowns)
  # for each population
  pop.list <- seppop(knowns)
  count <- rep(0, length(pop.list))
  # For each population
  for (i in 1:length(pop.list)) {
    gen <- as.matrix(pop.list[[i]])
    # For each locus
    for (j in 1:nLoc(pop.list[[i]])) {
      unknown.gen <- unknown.ind[j]
      pop.gen <- gen[, j]
      if (length(pop.gen) >= nmin) {
        # Where the unknown focal individual is homozygous reference
        # allele [aa]
        if (unknown.gen == 0) {
          # If all individuals in the target population are bb [not
          # aa or ab] then focal individual has private allele [a]
          if (all(pop.gen == 2, na.rm = TRUE)) {
            count[i] <- count[i] + 1
          }
        }
        # Where the unknown focal individual is homozygous for the
        # alternate allele [bb]
        if (unknown.gen == 2) {
          # If all individuals in the target population are aa [not
          # bb or ab] then focal individual has private allele [b]
          if (all(pop.gen == 0, na.rm = TRUE)) {
            count[i] <- count[i] + 1
          }
        }
        # Where the unknown focal individual is heterozgous [ab]
        if (unknown.gen == 1) {
          # If all individuals in the target population are aa, then
          # [b] is private or if bb, then [a] is private
          if ((all(pop.gen == 0, na.rm = TRUE)) ||
            (all(pop.gen == 2, na.rm = TRUE))) {
            count[i] <- count[i] + 1
          }
        }
      }
    }
  }

  # Print out results

  if (verbose >= 3) {
    cat("  Table showing populations against number of loci with private
            alleles\n")
  }
  counter <- 1
  retain <- NULL
  for (m in levels(as.factor(count))) {
    if (as.numeric(as.character(m)) > threshold) {
      if (verbose >= 3) {
        cat(paste0("  >", threshold, " ---"))
      }
    }
    if (verbose >= 3) {
      cat("  ", m, levels(pop(knowns))[count == m], "\n")
    }
    if (!is.null(n.best)) {
      if (counter < n.best) {
        retain <- c(retain, levels(pop(knowns))[count == m])
        counter <-
          counter + length(levels(pop(knowns))[count == m])
      }
    }
  }

  # Save the data in a new gl object

  if (is.null(n.best)) {
    gl <-
      gl.keep.pop(
        x,
        pop.list = c(
          levels(pop(knowns))[count <= threshold],
          "unknown"
        ),
        mono.rm = TRUE,
        verbose = 0
      )
  } else {
    gl <-
      gl.keep.pop(
        x,
        pop.list = c(retain, "unknown"),
        mono.rm = TRUE,
        verbose = 0
      )
  }

  # Check that there is more than one population to assign (excluding
  # 'unknown')
  if (is.null(n.best)) {
    if (verbose >= 2) {
      if (nPop(gl) == 1) {
        # Taking into account the unknown as a population
        if (verbose >= 2) {
          cat(
            report(
              "  There are no populations retained for assignment.",
              "  The unknown may not belong to one of the target populations.",
              "  Returning genlight object for the unknown individual only\n"
            )
          )
        }
      } else {
        if (verbose >= 2) {
          cat(
            report(
              "  Identified and retained",
              nPop(gl) - 1,
              "putative source populations for",
              unknown,
              " (the unknown) based on the specified threshold\n"
            )
          )
          cat(report("  Monomorphic loci removed\n"))
        }
      }
    }
  } else {
    if (verbose >= 2) {
      if (length(retain) == 0) {
        if (verbose >= 2) {
          cat(
            report(
              "  There are no populations retained for assignment.",
              "  The unknown may not belong to one of the target populations.",
              "  Returning genlight object for the unknown individual only\n"
            )
          )
        }
      } else {
        if (verbose >= 2) {
          cat(
            report(
              "  Identified and retained the top",
              length(retain),
              "best listed putative source populations for",
              unknown,
              "\n"
            )
          )
          cat(report("  Monomorphic loci removed\n"))
        }
      }
    }
  }

  # FLAG SCRIPT END

  if (verbose > 0) {
    cat(report("Completed:", funname, "\n"))
  }

  return(gl)
}

Try the dartR.captive package in your browser

Any scripts or data that you put into this service are public.

dartR.captive documentation built on April 3, 2025, 7:34 p.m.