R/SimulatePop.R

Defines functions SimulatePop

Documented in SimulatePop

#' Simulate an admixed polyploid population characterized for multi-allelic markers
#'
#' @param K
#' Number of ancestral groups (positive integer superior or equal to 2)
#' @param N
#' Number of individuals (positive integer superior or equal to 2)
#' @param P
#' Ploidy level (positive integer) either as a single value to specify the same
#' ploidy for all simulated individuals, or as a vector of size `N`
#' @param M
#' Number of markers (positive integer superior or equal to 2)
#' @param L
#' Number of alleles (positive integer superior or equal to 2) either as a
#' single value to specify the same number of alleles for all simulated markers,
#' or as a vector of size `M`
#' @param C
#' Number of chromosomes (positive integer). By default, markers are evenly
#' distributed between chromosomes and evenly spaced
#' @param SmoothParam
#' Smoothing parameter corresponding to the number of generations since admixture
#' @param GeneticMap
#' An optional genetic map that overrides the number of markers `M` and chromosomes
#' `C`, with three named columns: `Marker`, `Chromosome`, `Distance`. The distances
#' must be in centiMorgan
#' @param Prop
#' Preconfigured matrix of admixture proportions that overrides the number of
#' individuals `N` and ancestral groups `K`
#' @param Freq
#' Preconfigured list of matrices of allele frequencies in ancestral groups
#' that overrides the number of markers `M`, alleles `L` and ancestral groups `K`
#' @param AlphaProp
#' Concentration parameter from the Dirichlet distribution
#' (positive numeric value) used for sampling admixture proportions
#' @param AlphaFreq
#' Concentration parameter from the Dirichlet distribution
#' (positive numeric value) used for sampling allele frequencies in
#' ancestral groups
#' @param Depth 
#' (optional) Total read depth (positive integer), either as a single value 
#' to specify the same depth for all datapoints, or as a matrix of 
#' dimension `N` x `M`
#' @param SeqError 
#' (optional) Sequencing error (numeric value between 0 and 1) rate to be used 
#' when sampling sequencing reads
#' @param NbThreads
#' Number of threads to be used (positive integer) with a default value of 0
#' setting automatically all threads available
#' @param Seed
#' Seed for reproducible inference (integer)
#'
#' @return
#' A list of four items: a list of genotying matrices (`Geno`), a matrix of
#' admixture proportions (`Prop`), a list of matrices of allele frequencies in
#' ancestral groups (`Freq`), and a vector of log-likelihood values
#' over iterations (`LogLik`)
#'
#' @description
#' This function simulates a polyploid admixed population
#'
#' @importFrom magrittr %>%
#' @importFrom magrittr set_names
#' @import dplyr
#'
#' @export
#'
#' @examples
#' ## Simulate a polyploid admixed population
#' DataSim <- SimulatePop(K=3L, N=10L, P=6L, M=50L, C=5L, L=10L, Seed=123)
#'
#' ## Genotyping matrices
#' head(DataSim$Geno$Marker1)
#'
#' ## Ancestry matrices by chromosome
#' head(DataSim$Ancestry$Ind1$Chr1)
#'
#' ## Admixture proportions
#' head(DataSim$Prop)
#'
#' ## Allele frequencies in ancestral groups
#' DataSim$Freq$Marker1
#'
#' ## Genetic map
#' head(DataSim$GeneticMap)
SimulatePop <- function(K=3L, N=100L, P=6L, M=1000L, C=5L, L=10L, SmoothParam=1,
                        GeneticMap=NULL, Prop=NULL, Freq=NULL, AlphaProp=1, AlphaFreq=1,
                        Depth=NULL, SeqError=1e-2, NbThreads=0L, Seed=123L){
  ## Initial checks
  stopifnot("K must be an integer superior or equal to 2" =
              K>=2L && length(K)==1)
  stopifnot("N must be an integer superior or equal to 2" =
              N>=2L && length(N)==1)
  stopifnot("P must be positive integers, either as single value or as a vector" =
              all(P>=1L))
  stopifnot("L must be integers superior or equal to 2, either as single value or as a vector" =
              all(L>=2L))
  stopifnot("C must be a positive integer" =
              C>=1L && length(C)==1)
  stopifnot("M must be an integer superior or equal to 2C" =
              M>=2*C && length(M)==1)
  stopifnot("SmoothParam must be a positive numeric value" =
              SmoothParam>0)
  stopifnot("When informed, GeneticMap must be a dataframe with three named columns
             (Marker, Chromosome and Distance) and a minimum of 2 markers per
             chromosome" =
              is.null(GeneticMap) ||
              (is.data.frame(GeneticMap) &&
                 all(c("Marker","Chromosome","Distance")%in%colnames(GeneticMap)) &&
                 min(table(GeneticMap$Chromosome)==2)))
  stopifnot("When informed, Prop must be a matrix of numeric values" =
              is.null(Prop) || (is.matrix(Prop) && is.numeric(Prop)))
  stopifnot("When informed, Freq must be a list of numeric matrices of size superior or equal to 2C" =
              is.null(Freq) || (length(Freq) >= 2*C && is.matrix(Freq[[1]]) && is.numeric(Freq[[1]])))
  stopifnot("When informed, Freq matrices must have same number of rows as Prop" =
              is.null(Freq) || is.null(Prop) || ncol(Prop)==nrow(Freq[[1]]))
  stopifnot("AlphaProp must be a positive numeric value" =
              AlphaProp>0)
  stopifnot("AlphaFreq must be a positive numeric value" =
              AlphaFreq>0)
  stopifnot("SeqError must be a positive numeric value between 0 and 1" =
              SeqError>=0 && SeqError<1)
  stopifnot("When informed, Depth must be a single value or a matrix of size N x M with positive integers" =
              is.null(Depth) || (all(Depth>=1L) && (length(Depth)==1) || (is.matrix(Depth) && nrow(Depth)==N && ncol(Depth)==M)))

  ## Set Seeds
  set.seed(Seed)

  ## Set initial quantities
  . <- NULL
  Ind_names <- paste0("Ind",1:N) %>% set_names(.,.)
  Group_names <- paste0("K",1:K) %>% set_names(.,.)
  Marker_names <- paste0("Marker",1:M) %>% set_names(.,.)
  Chr_names <- paste0("Chr",1:C) %>% set_names(.,.)
  if(!is.null(GeneticMap)){
    GeneticMap <- GeneticMap %>%
      arrange(Chromosome,Distance)
    Chr_names <- unique(GeneticMap$Chromosome) %>% set_names(.,.)
    Marker_names <- GeneticMap$Marker %>% set_names(.,.)
    M <- nrow(GeneticMap)
    C <- length(Chr_names)
  }
  if(!is.null(Prop)){
    if(!is.null(rownames(Prop))) Ind_names <- rownames(Prop)
    if(!is.null(colnames(Prop))) Group_names <- colnames(Prop)
    K <- ncol(Prop)
    N <- nrow(Prop)
  }
  if(!is.null(Freq)){
    if(!is.null(names(Freq))) Marker_names <- names(Freq)
    if(!is.null(rownames(Freq[[1]]))) Group_names <- rownames(Freq[[1]])
    M <- length(Freq)
    K <- nrow(Freq[[1]])
    L <- sapply(Freq,function(m)ncol(m))
  }else{
    L <- sapply(Marker_names,function(m)L)
  }
  if(length(P)==1) P <- rep(P,N)

  ## Additional checks
  stopifnot("When informed, Depth must be either a scalar or a matrix of positive integers" = 
              is.null(Depth) || (length(Depth)==1 && Depth>0) || 
              (is.matrix(Depth) && is.numeric(Depth) && ncol(Depth)==M && nrow(Depth)==N && all(Depth>=0)))

  ## Simulate admixture proportions
  if(is.null(Prop)){
    Prop <- .SimulateProp(Ind_names,Group_names,AlphaProp,Seed)
  }

  ## Simulate allele frequencies in ancestral groups
  if(is.null(Freq)){
    Freq <- .SimulateFreq(Marker_names,Group_names,L,AlphaFreq,Seed,NbThreads)
  }

  ## Genetic map formatting
  Chromosome <- NULL
  Distance <- NULL
  if(is.null(GeneticMap)){
    M_per_C <- floor(M/C) + (seq_len(C) <= M %%floor(M/C))
    Chr_vect <- lapply(1:C,function(chr)rep(Chr_names[chr],M_per_C[chr])) %>% unlist
    GeneticMap <- data.frame(Marker = Marker_names, row.names = NULL) %>%
      mutate(Chromosome = Chr_vect) %>%
      {lapply(Chr_names,function(chr)
        filter(.,.$Chromosome==chr) %>%
          mutate(Distance=seq(0,100,100/(nrow(.)-1))))} %>%
      bind_rows()
  }

  ## Calculate distances between marker pairs
  PairwiseDist <- lapply(Chr_names,function(chr){
    GeneticMap_chr <- GeneticMap %>% filter(Chromosome==chr)
    res <- data.frame(Marker_1 = GeneticMap_chr$Marker[-nrow(GeneticMap_chr)],
                      Marker_2 = GeneticMap_chr$Marker[-1],
                      Chromosome = chr,
                      Dist = as.numeric(as.character(GeneticMap_chr$Distance))[-1] -
                        as.numeric(as.character(GeneticMap_chr$Distance))[-nrow(GeneticMap_chr)])
    return(res)
  })

  ## Simulate ancestry and genotyping
  ResSim <- .SimulateGeno(Prop,Freq,PairwiseDist,P,SmoothParam,Seed,NbThreads)

  ## Deduce admixture proportions from ancestry dosages along chromosomes
  Prop <- .DeduceProp(ResSim$Ancestry)
  
  ## Generate output
  Res <- list(Geno = ResSim$Geno,
              Ancestry = ResSim$Ancestry,
              Prop = Prop,
              Freq = Freq,
              GeneticMap = GeneticMap)
  
  ## Sample sequencing reads
  if(!is.null(Depth)){
    if(length(Depth)==1) Depth <- matrix(Depth,N,M)
    AlleleDepth <- .SampleReads(ResSim$Geno,Depth,SeqError,Seed,NbThreads)
    Res <- append(list(AlleleDepth = AlleleDepth),Res)
  }

  ## Return outputs
  return(Res)
}

Try the AdmixPoly package in your browser

Any scripts or data that you put into this service are public.

AdmixPoly documentation built on June 18, 2026, 1:06 a.m.