R/simulate_data_scripts.R

Defines functions simulate_background

simulate_background <- function(blocks, snp_distance_kb) {
  blocks[,.(chr,start=seq(from = round(start/1000), to = round(end/1000), by = snp_distance_kb)*1000),by=id][,.(chr,start,end=start)][]
}

simulate_background_from_genes <- function(genes, snp_distance_kb) {
  distances <- vector(mode="numeric",length=genes[,.N]-1)
  for(i in 1:(genes[,.N]-1)) {
    if(genes[i,chr] == genes[i+1,chr]){
      distances[i] <- genes[i+1,start] - genes[i,end]
    }
    if(genes[i,chr] != genes[i+1,chr]){
      distances[i] <- NA
    }
  }
  distances <- distances[!distances %in% NA]
  distances <- distances[distances > 0]
  mean_distance <- mean(distances)
  genes[,.(chr,start=seq(from = round( (min(start)-mean_distance) /1000), to = round( (max(end)+mean_distance)/1000), by = snp_distance_kb)*1000),by=chr][,.(chr,start,end=start)][]
}

simulate_random_candidates <- function(background,n_candidates){
  background[sample(.N,size = n_candidates,replace = F)][order(chr,start)][]
}

simulate_genic_enriched_candidates <- function(background,features,n_candidates,genic_fraction=0.5){
  if(!identical(data.table::key(background), c("chr", "start", "end"))){
    data.table::setkey(background,chr,start,end)
  }
  if(!identical(data.table::key(features), c("chr", "start", "end"))){
    data.table::setkey(features,chr,start,end)
  }
  intersect <- data.table::foverlaps(background,features)
  # if enrichment is 10, this means n_nongenic/n_genic == 0.9
  n_genic <- round(n_candidates * genic_fraction)
  n_non_genic <- n_candidates - n_genic
  genic_cands <- intersect[!gene %in% NA,.(chr,start=i.start,end=i.end)][sample(.N,n_genic,replace = F)]
  non_genic_cands <- intersect[gene %in% NA,.(chr,start=i.start,end=i.end)][sample(.N,n_non_genic,replace = F)]
  data.table::rbindlist(list(genic_cands,non_genic_cands))[]
}



# simulate a genome
poisson_snp_positions <- function(chr,chr_length_Mb, mean_snps_per_kb){
  # expected number of snps is the poison rate parameter, lambda
  lambda <- chr_length_Mb * 1000 * mean_snps_per_kb
  # a n of snps can is form the (log) distribution function
  n_snps <- sample(1:(chr_length_Mb * 1000),size = 1, prob = ppois(1:(chr_length_Mb * 1000), lambda = lambda, lower.tail = FALSE))
  snp_positions <- cumsum(round(rexp(n = n_snps,rate = lambda)*chr_length_Mb*1e6))
  snp_positions <- round(snp_positions/max(snp_positions)*chr_length_Mb*1e6)
  snp_positions <- data.table::data.table(chr=chr,start=snp_positions,end=snp_positions)
  return(snp_positions)
}

# chrs <- 1:22
# lengths <- c(250,240,200,190,180,170,160,145,140,133,135,133,115,107,101,90,83,80,60,64,46,50)
# data.table::data.table(chr=)
joshuamschmidt/multiPermr documentation built on Oct. 12, 2020, 11:42 a.m.