pbsim: Simulations of Plant Breeding Experiments

Documented in sim_phenoval sim_trial

#' Simulate phenotypic values
#' 
#' @description 
#' Simulates phenotypic observations given the genotypic value of individuals and the 
#' heritability of a quantitative trait.
#' 
#' @param pop An object of class \code{pop}.
#' @param h2 The heritability of the trait or traits. May be a numeric vector of 
#' length 1 (heritability is the same for all traits) or a numeric vector of 
#' length n_trait.
#' @param n.env The number of environments in which to phenotype.
#' @param n.rep The number of replicates of each individual in each environment.
#' @param return.eff Should the variance components and effects be returned?
#' @param ... Other arguments. See \code{Details}.
#' 
#' @details
#' 
#' Other arguments that can be specified are :
#' \describe{
#'   \item{\code{V_E}}{The variance of environmental effects. May be a scalar 
#'   (V_E is the same for all traits) or a numeric vector of length n_trait. }
#'   \item{\code{V_R}}{The variance of the residual effects. May be a scalar 
#'   (V_R is the same for all traits) or a numeric vector of length n_trait. }
#' }
#' 
#' Environmental effects are drawn from a normal distribution such that \eqn{e ~ N(0, V_E)},
#' where \code{V_E} is the environmental variance. If \code{V_E} is not provided, it 
#' is 8 times the genotypic variance. Genotype-environmental interaction effects are drawn
#' from a normal distribution such that \eqn{ge ~ N(0, V_GE)}, where \code{V_GE} is the genotype-environment 
#' interaction variance. If \code{V_GE} is not provided, it defaults to 2 times the genotypic variance. 
#' Residual effects are drawn from a normal distribution such that \eqn{\epsilon ~ N(0, V_R)}, where \code{V_R} is 
#' the residual variance. If \code{V_R} is not provided, it is calculated from \code{h2}, where 
#' \eqn{V_R = n.rep * n.env * (\frac{Vg}{h2} - Vg)}.
#' 
#' @return 
#' An object of class \code{pop} with all information in the input \code{pop} object,
#' plus simulated phenotypes.
#' 
#' @examples 
#' 
#' # Create a genome
#' n.mar  <- c(505, 505, 505)
#' len <- c(120, 130, 140)
#' 
#' genome <- sim_genome(len, n.mar)
#' 
#' # Simulate a a trait with 15 QTL
#' qtl.model <- matrix(nrow = 15, ncol = 4)
#' 
#' genome <- sim_gen_model(genome, qtl.model, add.dist = "geometric", max.qtl = 15)
#' 
#' pop <- sim_pop(genome = genome, n.ind = 200)
#' 
#' ## Default simulates environment and residual effects, but no GxE
#' pop1 <- sim_phenoval(pop = pop, h2 = 0.5, n.env = 2, n.rep = 2)
#' 
#' 
#' # Simulate GxE
#' # First simulate a population
#' pop <- sim_pop(genome = genome, n.ind = 200, ignore.gen.model = TRUE)
#' 
#' # Second, create a gene model with QxE
#' genome <- sim_gen_model(genome = genome, qtl.model = qtl.model, geno = pop$geno,
#'                         add.dist = "geometric", V_GE.scale = 2)
#'                         
#' # Re-save the population
#' pop <- create_pop(genome = genome, geno = do.call("cbind", pop$geno))
#' 
#' # Generate phenotypes 
#' pop1 <- sim_phenoval(pop = pop, h2 = 0.5, n.env = 2, n.rep = 2)
#' 
#' 
#' 
#' 
#' @export 
#' 
sim_phenoval <- function(pop, h2, n.env = 1, n.rep = 1, ...) {
  
  # Make sure pop inherits the class "pop"
  if (!inherits(pop, "pop")) stop("The input 'pop' must be of class 'pop'.")
  
  # Grab the genotypic values
  geno_val <- gv(pop)
  
  # Does the pop object have genotypic values
  if (is.null(geno_val)) stop("The 'pop' object must have the data.frame of genotypic values")
  
  # Number of traits
  n_trait <- sum(startsWith(x = names(geno_val), "trait"))
  
  # Capture the other arguments
  other.args <- list(...)
  
  # Other variances
  V_E <- other.args$V_E
  V_R <- other.args$V_R
  
  # The same goes for V_E and V_R - this will pass if both or either are NULL.
  if (length(V_E) > 1 & length(V_E) != n_trait)
    stop("The length of V_E, if not 1, must be the same as the number of traits.")
  
  # The same goes for V_E and V_R - this will pass if both or either are NULL.
  if (length(V_R) > 1 & length(V_R) != n_trait)
    stop("The length of h2, if not 1, must be the same as the number of traits.")
  
  
  # If this list is not empty, make sure that the elements are correctly named
  if (length(other.args) != 0) {
    
    # Warn if all are not provided
    if (!all(c("V_E", "V_R") %in% names(other.args)))
      warning("V_E or V_R might have been passed, but were not detected. Check 
              your arguments.")
    
    # If none of the variance components are passed; you must provide the heritability
  } else {
    
    if (missing(h2)) stop("Variance components were not passed; you must provide a hertiability (h2).")
    
    # If the h2 vector is longer than 1, the length must be the same as the number
    # of traits
    if (length(h2) > 1 & length(h2) != n_trait)
      stop("The length of h2, if not 1, must be the same as the number of traits.")
    
    # Heritability must be between 0 and 1
    if (!all(h2 >= 0, h2 <= 1))
      stop("Heritability must be between 0 and 1.")
    
  }
  
  # Calculate genetic variance for each trait
  V_G <- varG(pop) 
    
  
  # Calculate environment variance if not provided
  if (is.null(V_E)) V_E <- V_G * 8
  
  # Calculate residual variance if not provided
  if (is.null(V_R)) V_R <- n.rep * n.env * ( (V_G / h2) - V_G )

  # Number of individuals
  n_ind <- nind(pop)
  
  # List of variance components
  var_comp <- list(V_G = V_G, V_E = V_E, V_R = V_R)
  
  # Generate environment effects
  t_eff <- lapply(X = V_E, function(varE) {
    sim <- rnorm(n = n.env, mean = 0, sd = sqrt(varE))
    structure(sim, names = paste("env", seq(n.env), sep = ""))
  })
    
  t_eff1 <- lapply(t_eff, matrix, nrow = n_ind, ncol = n.env * n.rep, byrow = TRUE)
  
  # Get the gxe slopes
  gt_slopes <- pop$gxe_slope
  
  if (!is.null(gt_slopes)) {
    gt_slopes <- subset(gt_slopes, select = -ind, drop = FALSE)
    
  } else {
    gt_slopes <- replicate(n = ncol(geno_val) - 1, rep(0, length = nind(pop)), simplify = FALSE)
    
  }

  # Generate GxE effects
  gt_eff <- mapply(gt_slopes, t_eff1, V_E, FUN = function(gxe, teff, ve) {
    # Rescale the slopes using V_E
    gxe1 <- gxe / sqrt(ve)
    
    # Get the genotype-specific slopes
    # Remember to add 1
    geno_b <- matrix(data = 1 + gxe1, nrow = n_ind, ncol = ncol(teff)) 
    # Multiply the slopes by the environmental effects
    geno_b * teff
  }, SIMPLIFY = FALSE)

  
  # Generate residual effects
  epsilon <- lapply(X = V_R, function(varR) {
    sim <- rnorm(n = n.env * n.rep * n_ind, mean = 0, sd = sqrt(varR))
    matrix(sim, nrow = n_ind, ncol = n.env * n.rep, byrow = TRUE)
  })
  
  g <- subset(geno_val, select = -ind, drop = FALSE)
  
  # Apply over all of the list
  p <- mapply(g, gt_eff, epsilon, FUN = function(g1, gt1, ep) {
    
    # Reform the g1 matrix
    g1 <- matrix(g1, nrow = n_ind, ncol = n.env * n.rep)
    
    # Sum
    p <- g1 + gt1 + ep
    
    p1 <- structure(p, dimnames = list(indnames(pop), paste( paste("env", seq(n.env), sep = ""), rep(paste("rep", seq(n.rep), sep = ""), each = n.env), sep = "_" )) )
    
    p2 <- data.frame(ind = row.names(p1), p1, stringsAsFactors = FALSE, row.names = NULL)
    p2_reshape <- reshape(p2, direction = "long", varying = list(2:ncol(p2)), timevar = "env", v.names = "phenoval", idvar = "ind", new.row.names = NULL)
    p2_reshape$env <- names(p2)[-1][p2_reshape$env]
    p2_reshape$rep <- sapply(strsplit(p2_reshape$env, split = "_"), FUN = "[", 2)
    p2_reshape$env <- sapply(strsplit(p2_reshape$env, split = "_"), FUN = "[", 1)
 
    return(p2_reshape)
    
  }, SIMPLIFY = FALSE)
  
  
  # Tidy the phenotypes
  p_df <- mapply(names(p), p, FUN = function(tr, ph) data.frame(trait = tr, ph, stringsAsFactors = FALSE), SIMPLIFY = FALSE)
  p_df1 <- do.call("rbind", p_df)
  row.names(p_df1) <- NULL
    
  
  # Calculate the mean phenotypic value
  mu_p <- aggregate(formula = phenoval ~ ind + trait, data = p_df1, FUN = mean)
  mu_p1 <- reshape(data = mu_p, idvar = "ind", direction = "wide", v.names = "phenoval", timevar = "trait")
  names(mu_p1)[-1] <- names(p)
  
  pheno_val <- list(
      var_comp = var_comp,
      pheno_obs = p_df1,
      pheno_mean = mu_p1
  )
    

  pop[["pheno_val"]] <- pheno_val
  
  
  # Return the pop
  return(pop)
  
} # Close the function




#' Simulate an experimental field trial
#' 
#' @description 
#' Simulates a trial in which phenotypic data on a quantitative trait is gathered.
#' 
#' @param pop An object of class \code{pop}, the individuals in which are to be
#' phenotyped.
#' @param h2 The heritability of the trait or traits. May be a \code{double} 
#' (heritability is the same for all traits) or a numeric vector with length equal
#' to the number of traits.
#' @param n.env The number of environments in which to phenotype.
#' @param n.rep The number of replicates of each experimental (i.e. non-check)
#' individual in each environment.
#' @param check.pop A \code{pop} object with individuals to be used as repeated 
#' checks (e.g. for an augmented design). If missing (default), then no checks
#' are included.
#' @param check.rep A scalar specifying the number of times each check is replicated
#' in each environment.
#' @param ... Other arguments. See \code{Details}.
#' 
#' @return 
#' An object of class \code{pop} with all information in the input \code{pop} object,
#' plus simulated phenotypes.
#' 
#' @details
#' 
#' Other arguments that can be specified are :
#' \describe{
#'   \item{\code{V_E}}{The variance of environmental effects. May be a \code{double} 
#'   (V_E is the same for all traits) or a numeric vector with length equal
#'   to the number of traits. }
#'   \item{\code{V_R}}{The variance of the residual effects. May be a \code{double} 
#'   (V_R is the same for all traits) or a numeric vector with length equal
#'   to the number of traits. }
#' }
#' 
#' @examples 
#' 
#' \dontrun{
#' 
#' # Load some historic data
#' data("s2_cap_genos")
#' data("s2_snp_info")
#' 
#' map <- s2_snp_info %>% 
#'   select(-alleles) %>% 
#'   data.frame(row.names = .$rs, stringsAsFactors = FALSE) %>% 
#'   select(-rs) %>% 
#'   table_to_map()
#' 
#' # Create a genome with genetic architecture
#' genome <- sim_genome(map = map)
#' 
#' # Simulate a a trait with 15 QTL
#' qtl.model <- matrix(nrow = 15, ncol = 4)
#' 
#' genome <- sim_gen_model(genome, qtl.model, add.dist = "geometric", max.qtl = 15)
#' 
#' pop <- create_pop(genome = genome, geno = s2_cap_genos)
#' 
#' # Sample the population for checks
#' set.seed(1035)
#' check_pop <- pop %>%
#'   subset_pop(individual = sample(indnames(.), 5))
#'   
#' # Remove the checks from the original pop
#' exp_pop <- pop %>% 
#'   subset_pop(setdiff(indnames(.), indnames(check_pop)))
#' 
#' # Simulate a trial
#' pop_pheno <- sim_trial(pop = exp_pop, h2 = 0.5, n.env = 3, n.rep = 1,
#'   check.pop = check_pop, check.rep = 3)
#'   
#' }
#'   
#' @import dplyr
#' @import tidyr
#' 
#' 
sim_trial <- function(pop, h2, n.env = 1, n.rep = 1, check.pop, check.rep, ...) {
  
  # Make sure pop inherits the class "pop"
  if (!inherits(pop, "pop"))
    stop("The input 'pop' must be of class 'pop'.")
  
  # n.rep and n.env must be integers
  n.rep <- as.integer(n.rep)
  n.env <- as.integer(n.env)
  
  # Check the checks, if present
  if (!missing(check.pop)) {
    if (!inherits(check.pop, "pop"))
      stop("The input 'pop' must be of class 'pop'.")
    
    if (missing(check.rep))
      stop("If 'check.pop' is passed, so too must 'check.rep'")
    
  }
  
  # Check the checks, if present
  if (!missing(check.rep)) {
    if (missing(check.pop))
      stop("If 'check.rep' is passed, so too must 'check.pop'")
    
    check.rep <- as.integer(check.rep)
    
  }
  
  # Capture the other arguments
  other.args <- list(...)
  
  # Other variances
  V_E <- other.args$V_E
  V_R <- other.args$V_R
  
  # Simulate phenotypes for the experimental individuals
  exp_pheno <- sim_phenoval(pop = pop, h2 = h2, n.env = n.env, n.rep = n.rep, 
                            V_E = V_E, V_R = V_R)
  
  # Extract the variance components from the exp_pheno for use with the checks
  V_R <- as.numeric(exp_pheno$pheno_val$var_comp$V_R)
  V_E <- 0
  
  # Note the environment variance is 0 so as to add the environmental effects
  # from the previous simulation
  check_pheno <- sim_phenoval(pop = check_pop, h2 = h2, n.env = n.env, n.rep = check.rep,
                              V_E = V_E, V_R = V_R)
  
  # Extract the phenotypic observations and add the environmental effects
  pheno_obs <- check_pheno$pheno_val$pheno_obs
  
  env_eff <- exp_pheno$pheno_val$effects$env %>% 
    lapply(function(eff) data.frame(env = names(eff), effect = eff, row.names = NULL))
  
  env_eff <- lapply(names(env_eff), function(trait) mutate(env_eff[[trait]], trait = trait)) %>%
    bind_rows()
  
  # Merge and add env eff
  pheno_obs1 <- merge(x = pheno_obs, y = env_eff) %>% 
    mutate(phenoval = phenoval + effect) %>% 
    select(ind, trait, env, rep, phenoval) %>%
    arrange(trait, env, rep)
  
  # Calculate the means
  pheno_mean <- pheno_obs1 %>% 
    group_by(ind, trait) %>% 
    summarize(phenomean = mean(phenoval)) %>% 
    spread(trait, phenomean) %>% 
    as.data.frame()
  
  # Adjust the check phenos
  check_pheno$pheno_val <- list(var_comp = exp_pheno$pheno_val$var_comp, 
                                effects = exp_pheno$pheno_val$effects, 
                                pheno_obs = pheno_obs1, 
                                pheno_mean = pheno_mean)
  
  # Merge populations
  final_pop <- combine_pop(list(exp_pheno, check_pheno))
  
  # Add variance components
  final_pop$pheno_val <- list(var_comp = exp_pheno$pheno_val$var_comp, 
                              effects = exp_pheno$pheno_val$effects, 
                              pheno_obs = final_pop$pheno_val$pheno_obs, 
                              pheno_mean = final_pop$pheno_val$pheno_mean)
  
  # Return
  return(final_pop)
  
}