R/mat_scramble.R

Defines functions mat_scramble

Documented in mat_scramble

#' Scramble a matrix of genotype data
#'
#' This function assumes that M is a matrix with L rows (number of markers) and
#' 2 * N (N = number of individuals) columns.
#' There are two ways that the data might be permuted.  In the first,
#' obtained with `preserve_haplotypes = FALSE`,
#' the position of missing data within the matrix is held constant, but all
#' non-missing sites within a row (i.e. all gene copies at a locus) get
#' scrambled amongst the samples.  In the second way, just the columns are
#' permuted.  This preserves haplotypes in the data, if there are any.
#' The second approach should only be used if haplotypes are inferred in
#' the individuals.
#'
#' There is now an additional way of permuting:  if
#' `preserve_individuals = TRUE`, then entire individuals are permuted.
#' If `preserve_haplotypes = FALSE`, then the gene copies at each locus
#' are randomly ordered within each individual before permuating them.
#' If `preserve_haplotypes = TRUE` then that initial permutation is not
#' done.  This should only be done if the individuals are phased and that
#' phasing is represented in how the genotypes are stored in the matrix.
#' @param M a matrix with L rows (number of markers) and 2 * N columns
#' where N is the number of individuals. Missing data must be coded
#' as NA
#' @param preserve_haplotypes logical indicating whether the haplotypes
# should be preserved.  If `row_groups` is not null, this is automatically
#' set to be TRUE
#' @param preserve_individuals logical indicating whether the genes within
#' each individual should stay togeter.
#' @param row_groups if not NULL must be a list of indexes of adjacent rows
#' that are all in the same groups.  For example: `list(1:10, 11:15, 16:30)`.
#' They should be in order and complete.
#' In practice, these should correspond to the indexes of markers on different
#' chromosomes.
#' @return This function returns a matrix of the same dimensions and storage.mode
#' as the input, `M`; however the elements have been permuted according to the
#' options specified by the users.
#' @export
#' @examples
#' # make a matrix with alleles named as I.M.g, where I is individual
#' # number, M is marker number, and g is either "a" or "b" depending
#' # on which gene copy in the diploid it is.  4 indivs and 7 markers...
#' Mat <- matrix(
#'  paste(
#'    rep(1:4, each = 7 * 2),
#'    rep(1:7, 4 * 2),
#'    rep(c("a", "b"), each = 7),
#'    sep = "."
#'  ),
#'  nrow = 7
#' )
#'
#' # without preserving haplotypes
#' S1 <- mat_scramble(Mat)
#'
#' # preserving haplotypes with markers 1-7 all on one chromosome
#' S2 <- mat_scramble(Mat, preserve_haplotypes = TRUE)
#'
#' # preserving haplotypes with markers 1-3 on one chromosome and 4-7 on another
#' S3 <- mat_scramble(Mat, row_groups = list(1:3, 4:7))
#'
#' # preserving individuals, but not haplotypes, with two chromosomes
#' S4 <- mat_scramble(Mat, row_groups = list(1:3, 4:7), preserve_individuals = TRUE)
#'
#' # preserving individuals by chromosome, but not haplotypes, with two chromosomes
#' S5 <- mat_scramble(Mat, row_groups = list(1:3, 4:7), preserve_individuals = "BY_CHROM")
#'
#' # preserving individuals by chromosome, and preserving haplotypes, with two chromosomes
#' S6 <- mat_scramble(Mat, row_groups = list(1:3, 4:7),
#'                  preserve_individuals = "BY_CHROM", preserve_haplotypes = TRUE)

mat_scramble <- function(M, preserve_haplotypes = FALSE, row_groups = NULL, preserve_individuals = FALSE) {
  if(!is.null(row_groups)) {
    idxs <- unlist(row_groups)
    if(any(duplicated(idxs))) stop("Duplicated values in row_groups.")
    if(!all( (1:nrow(M)) %in% idxs)) stop("row_groups does not contain all indexes.")
  }


  # here is a block that will return if preserve_individuals is TRUE
  if(preserve_individuals == TRUE || preserve_individuals == "BY_CHROM") {
    if(preserve_haplotypes == FALSE) {
      # cycle over individuals and sample the gene copies at each locus within them
      M2 <- lapply(seq(1, ncol(M), by = 2), function(i) {  # cycle over individuals
        apply(M[, c(i, i+1)], 1, sample)
      }) %>%
        do.call(rbind, .) %>%
        t()
    } else {  # if preserve haplotypes == TRUE, we don't do that step
      M2 <- M
    }

    if(preserve_individuals == TRUE || (preserve_individuals == "BY_CHROM" && is.null(row_groups))) {
      # now, we permute the individuals around M2 and return
      num_ind <- ncol(M2) / 2
      ind_ord <- sample(1:num_ind)
      ind_pick <- (2 * rep(ind_ord, each = 2)) - c(1, 0)
      return(M2[, ind_pick])
    }

    if(preserve_individuals == "BY_CHROM") {
      # now, we permute the individuals around M2 and return
      num_ind <- ncol(M2) / 2
      ret <- lapply(row_groups, function(x) {
        ind_ord <- sample(1:num_ind)
        ind_pick <- (2 * rep(ind_ord, each = 2)) - c(1, 0)
        M2[x, ind_pick]
      }) %>%
        do.call(rbind, args = .)

      return(ret)
    }


  }


  # This only happens if preserve_individuals was FALSE
  if(preserve_haplotypes == FALSE) {
    ret <- apply(M, 1, function(x) {
      x[!is.na(x)] <- sample(x[!is.na(x)])
      x
    }) %>%
      t()
  } else {
    if(is.null(row_groups)) {
      ret <- M[, sample(1:ncol(M))]
    } else {
      ret <- lapply(row_groups, function(x) {
        M[x, sample(1:ncol(M))]
      }) %>%
        do.call(rbind, args = .)
    }
  }
  return(ret)
}
eriqande/gscramble documentation built on March 5, 2024, 4:22 p.m.