R/sim_run_generative_model_with_TD.R

Defines functions sim_run_generative_model_with_TD

Documented in sim_run_generative_model_with_TD

#' This function runs the generative model to simulate input sparse gamete data for rhapsodi incorporating transmission distortion through either gamete killing or gene conversion
#' 
#' This function runs the generative model to simulate input sparse gamete data for rhapsodi incorporating transmission distortion through either gamete killing or gene conversion. In addition,
#' to returning the sparse gamete data, the function also returns the fully known generated gamete data,
#' the diploid donor phased haplotypes, and the true recombination break points for each gamete. This function also returns the identity of the SNP used for TD simulation. 
#' The following variables of the simulation can all be controlled:
#' the number of gametes, the number of SNPs, the sequencing coverage (or missing genotype rate), 
#' the average recombination rate, whether to simulate sequencing error, the sequencing error rate to use,
#' whether to add de novo mutations, values parameterizing how many de novo mutations there are and how many gametes are affected by the de novo mutations,
#' the type of transmission distortion (gamete killing or gene conversion), the SNP and haplotype used for simulation of transmission distortion, the probability of a gamete with the causal SNP to undergo a gene conversion or gamete killing event, the size of gene conversion tracts,
#' and the random seed for reproducibility
#'
#' @param num_gametes an integer, the number of gametes, or the number of columns for the sparse gamete data you want generated
#' @param num_snps an integer, the number of SNPs, or the number of rows for the sparse gamete data you want generated. Note: not all of these will be heterozygous due to the coverage and therefore this number won't necessarily equal the number of SNPs following filtering at the end of the generation 
#' @param coverage a numeric, input if input_cov is TRUE, suggested NULL otherwise
#' @param TD_type a string, if `gk`, gamete killing is simulated; else any other string, gene conversion is simulated; default = "gk"
#' @param p_kill a numeric, used within gamete killing simulation, the probability that a gamete containing the SNP subject to TD will be removed from the dataset through simulated gamete killing; default = 0.5
#' @param p_convert a numeric, used within gene conversion simulation, the probability that a gamete with the TD allele will undergo a gene conversion event; default = 0.5
#' @param converted_snp an integer, used within gene conversion simulation, but not required, indicating the specific SNP which will be subject so TD. Randomly selected if not provided by the user; default = NULL
#' @param converted_haplotype an integer, 0 or 1, used within gene conversion simulation, but not required, indicating which haplotype will be subject to transmission distortion. Randomly selected if not provided by the user; default = NULL
#' @param conversion_lambda a numeric, used within gene conversion simulation as lambda in the Poisson distribution to determine the length of the gene conversion event; default = 4
#' @param killer_snp an integer, used within gamete killing simulation, but not required, indicating the specific SNP which will be subject to TD. Randomly selected if not provided by the user; default = NULL
#' @param killer_haplotype an integer, 0 or 1, used within gamete killing simulation, but not required, indicating which haplotype will be subject to transmission distortion. Randomly selected if not provided by the user; default = NULL
#' @param recomb_lambda a numeric, the average rate of recombination expected for the simulation; default = 1
#' @param random_seed an integer, the random seed which will be set for the simulation, default=42
#' @param input_cov a logical, TRUE if coverage (i.e. like 0.01 (x)) will be input rather than missing genotype rate 
#' @param input_mgr a logical, TRUE if missing genotype rate (i.e. like 80 (%) or 0.8) will be inpupt rather than coverage, default = FALSE
#' @param missing_genotype_rate a numeric, input if input_mgr is TRUE and input_COV is FALSE, suggested NULL otherwise, default=NULL
#' @param add_seq_error a logical, TRUE if you want to add sequencing error to the generated data, default=TRUE
#' @param seqError_add a numeric, the sequencing error rate if adding sequencing error to the generated data, default=0.005
#' @param add_de_novo_mut a logical, TRUE if you want to add de novo mutations to the generated data, default=FALSE
#' @param de_novo_lambda an integer, default=5, parameterizes a poisson distribution to find the number of de novo mutations (DNM) total
#' @param de_novo_alpha a numeric, default=7.5, shape parameter for a gamma distribution to find the number of gametes affected per DNM
#' @param de_novo_beta a numeric, default=10, scale parameter for a gamma distribution to find the number of gametes affected per DNM
#'
#' @return a list containing: generated_data a named list returning the generated input and full truth data, specifically `gam_na` for the sparse rhapsodi input, `gam_full` for the fully known gamete data input equivalent, `recomb_spots` for the true recombination spots for each gamete, and `donor_haps` for the diploid donor phased haplotypes; TD_SNP, an integer denoting the identity of the causal SNP used in TD simulations.
#'
#' @export

sim_run_generative_model_with_TD  <- function(num_gametes, num_snps, coverage, 
                                              TD_type='gk', p_kill = 0.5,
                                              p_convert = 0.5, converted_snp = NULL, converted_haplotype = NULL, conversion_lambda = 4,
                                              killer_snp = NULL, killer_haplotype = NULL,
                                              recomb_lambda=1, random_seed=42, 
                                              input_cov=TRUE, input_mgr=FALSE, missing_genotype_rate=NULL,
                                              add_seq_error=TRUE, seqError_add=0.005, 
                                              add_de_novo_mut=FALSE, de_novo_lambda=5, de_novo_alpha=7.5, de_novo_beta=10){
  generated_data <- list()
  
  # From sim_run_generative_model - get MGR or coverage
  if (input_cov){
    missing_genotype_rate <- sim_find_mgr_from_cov(coverage)
  } else if (input_mgr){
    coverage <- sim_find_cov_from_mgr(missing_genotype_rate)
  }
  
  # From sim_run_generative_model - get number of NAs from MGR
  num_nas <- sim_find_num_nas(num_gametes, num_snps, missing_genotype_rate)
  
  set.seed(random_seed)
  
  # Generate gametes and simulate TD
  if (TD_type == 'gk'){
    TD_simulation <- sim_gamete_killing(num_gametes = num_gametes, num_snps = num_snps, p_kill=p_kill, killer_snp = killer_snp,
                                        killer_haplotype = killer_haplotype, recomb_lambda = recomb_lambda)
  }
  else{
    TD_simulation <- sim_gene_conversion(num_gametes = num_gametes, num_snps = num_snps, p_convert=p_convert, converted_snp = converted_snp, 
                                         converted_haplotype = converted_haplotype, conversion_lambda = conversion_lambda, recomb_lambda = recomb_lambda)
  }
  
  sim_gam <- TD_simulation[[1]]
  TD_SNP <- TD_simulation[[2]]
  donor_haps <- TD_simulation[[4]]

  gam_haps <- sapply(sim_gam, "[[", 3)
  
  crossover_indices <- sapply(sim_gam, "[[", 1)
  names(crossover_indices) <- paste0(rep("gam", num_gametes), 1:num_gametes, "_")
  
  unlist_ci <- unlist(crossover_indices, use.names=TRUE)
  tci_dt <- data.table::data.table(gam=sapply(strsplit(names(unlist_ci), "_"), `[`, 1), start =(unlist_ci-1), end=(unlist_ci))
  
  gam_mat <- sapply(sim_gam, "[[", 2)
  if (missing_genotype_rate > 0.5){
    gam_mat_with_na <- sim_add_to_na_flatten(gam_mat, num_nas, num_gametes, num_snps)
  } else if (missing_genotype_rate <= 0.5){
    gam_mat_with_na <- sim_add_na_flatten(gam_mat, num_nas, num_gametes, num_snps)
  }
  
  if (add_de_novo_mut){
    dnm_out <- sim_add_de_novo_mut(de_novo_lambda, de_novo_alpha, de_novo_beta, num_snps, num_gametes, gam_haps, gam_mat, gam_mat_with_na, donor_haps, unlist_ci, missing_genotype_rate)
    num_snps <- dnm_out$num_snps
    
    donor_haps <- dnm_out$donor_haps
    
    gam_mat_with_na <- dnm_out$gam_mat_with_na
    
    gam_mat <- dnm_out$gam_mat
    
    unlist_ci <- dnm_out$unlist_ci
    tci_dt <- data.table::data.table(gam=sapply(strsplit(names(unlist_ci), "_"), `[`, 1), start =(unlist_ci-1), end=(unlist_ci))
    
    new_dnm_rows <- dnm_out$new_rows
  } else {new_dnm_rows <- c() }
  if (add_seq_error){
    gam_mat_with_na <- sim_add_seq_error(num_snps, num_gametes, seqError_add, gam_mat_with_na)
  }
  gam_na_df <- data.frame(pseudo_pos = 1:nrow(gam_mat_with_na), gam_mat_with_na) %>% `colnames<-`(c("positions", paste0("gam", 1:num_gametes, "_")))
  gam_full_df <- data.frame(pseudo_pos = 1:nrow(gam_mat), gam_mat) %>% `colnames<-`(c("positions", paste0("gam", 1:num_gametes, "_")))
  
  #Filtering
  filtered_out <- sim_filter_generated_data(gam_na_df, gam_full_df, donor_haps, new_dnm_rows, add_de_novo_mut)
  gam_na_df <- filtered_out$gam_na_df
  gam_full_df <- filtered_out$gam_full_df
  donor_haps <- filtered_out$donor_haps
  num_snps <- filtered_out$num_snps
  
  generated_data$recomb_spots <- tci_dt
  generated_data$gam_full <- gam_full_df
  generated_data$gam_na <- gam_na_df
  generated_data$donor_haps <- donor_haps
  
  return(list(generated_data, TD_SNP))
}

  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
mccoy-lab/rhapsodi documentation built on July 27, 2022, 3:56 a.m.