R/optimise_block_size_functions.R

Defines functions optimise_blocks

optimise_blocks <- function(background, 
                            candidates, 
                            features,
                            initial_block_size_kb=500,
                            tolerance_percent=0.5,
                            n_perms=25,
                            increment_step_size_kb=25,
                            genic_only=TRUE,
                            combined_test="multiple_hits") {
  candidatesDT <- rbindlist(candidates,idcol = T)[,.(chr,start, end,stat= .id)]
  setkey(candidatesDT,chr,start,end)
  backgroundDT <- rbindlist(background,idcol = T)[,.(chr,start, end,stat= .id)]
  setkey(backgroundDT,chr,start,end)
  #backgroundDT <- unique(backgroundDT, by = key(backgroundDT), fromLast = TRUE)
  setkey(features, chr, start, end)
  before_remap_gene <- foverlaps(candidatesDT, features, nomatch = NULL)[,.(gene=unique(gene)),by=stat]
  # initial blocks
  remap_list <- blocks_to_remapped(block_size = initial_block_size_kb*1000,
                                   background = backgroundDT,
                                   candidates = candidatesDT,
                                   features = features,
                                   genic_only = genic_only)
  after_remap_gene <- data.table::foverlaps(data.table::setkey(remap_list[[2]],chr,start,end),
                                              data.table::setkey(remap_list[[3]][,.(chr=id,start=block.relative_start,end=block.relative_end,gene)],chr,start,end),
                                              nomatch = 0)[,.(gene=unique(gene)),by=stat]
  # tolerance range of genic n
  obs_gene_number <- 0
  if(combined_test=="multiple_hits"){
    obs_gene_number <-  after_remap_gene[,.N]
  }
  if(combined_test=="unique_hits"){
    obs_gene_number <-  after_remap_gene[,uniqueN(gene)]
  }
  # independent option is signposted here, but not yet implemented
  if(combined_test=="independent"){
    obs_gene_number <-  after_remap_gene[,.(N=uniqueN(gene)),by=stat]
  }
  upper_limit <- round((100+tolerance_percent)/100*obs_gene_number)
  lower_limit <- round((100-tolerance_percent)/100*obs_gene_number)
  # average permutation genic n in permutations with initial block size 
  test_permutations <- data.table::rbindlist(parallel::mclapply(1:n_perms,
                     function(x) { permuted_genes <- permute_genes(features = remap_list[[3]]);
                                   out <- data.table::foverlaps(remap_list[[2]],
                                                                permuted_genes,
                                                                nomatch = NULL)[,.(gene=unique(gene),perm_n=x),by=stat];}))
  mean_perm_n <- 0
  if(combined_test=="multiple_hits"){
    mean_perm_n <-  round(test_permutations[,.N,by=perm_n][,mean(N)])
  }
  if(combined_test=="unique_hits"){
    obs_gene_number <-  round(test_permutations[,uniqueN(gene),by=perm_n][,mean(V1)])
  }
  if(mean_perm_n >= lower_limit & mean_perm_n <= upper_limit){
    # already optimised... lucky!
    return(list(remap_list,optimised_block_size=initial_block_size_kb*1000,before_remap_gene,after_remap_gene))
  }
  max_increments <- round((initial_block_size_kb - increment_step_size_kb) 
                          / increment_step_size_kb)
  # we assume that making the blocks SMALLER will increase the genic n
  if(mean_perm_n < lower_limit) {
    increment_step <- 1
    optimised_block_size_kb <- 0
    while(mean_perm_n < lower_limit & increment_step < max_increments) {
      optimised_block_size_kb <- initial_block_size_kb - (increment_step_size_kb*increment_step)
      remap_list <- blocks_to_remapped(block_size = optimised_block_size_kb*1000,
                                       background = backgroundDT,
                                       candidates = candidatesDT,
                                       features = features,
                                       genic_only = genic_only)
      test_permutations <- data.table::rbindlist(parallel::mclapply(1:n_perms,
                                                                    function(x) { permuted_genes <- permute_genes(features = remap_list[[3]]);
                                                                    out <- data.table::foverlaps(remap_list[[2]],
                                                                                                 permuted_genes,
                                                                                                 nomatch = NULL)[,.(gene=unique(gene),perm_n=x),by=stat];}))
      if(combined_test=="multiple_hits"){
        mean_perm_n <-  round(test_permutations[,.N,by=perm_n][,mean(N)])
      }
      if(combined_test=="unique_hits"){
        obs_gene_number <-  round(test_permutations[,uniqueN(gene),by=perm_n][,mean(V1)])
      }
      increment_step <- increment_step+1
    }
    after_remap_gene <- data.table::foverlaps(data.table::setkey(remap_list[[2]],chr,start,end),
                                              data.table::setkey(remap_list[[3]][,.(chr=id,start=block.relative_start,end=block.relative_end,gene)],chr,start,end),
                                              nomatch = 0)[,.(gene=unique(gene)),by=stat]
    return(list(remap_list,optimised_block_size=optimised_block_size_kb*1000,before_remap_gene,after_remap_gene))
    #return(list(remap_list,optimised_block_size_kb*1000,before_remap_gene,after_remap_gene))
  }
  # if(mean_perm_n > upper_limit) {
  #   increment_step <- 1
  #   optimised_block_size_kb <- 0
  #   while(mean_perm_n < lower_limit & increment_step < max_increments) {
  #     optimised_block_size_kb <- initial_block_size_kb + (increment_step_size_kb*increment_step)
  #     remap_list <- blocks_to_remapped(block_size = optimised_block_size_kb*1000,
  #                                      background = background,
  #                                      candidates = candidates,
  #                                      features = features,
  #                                      genic_only = genic_only)
  #     mean_perm_n <- mean(make_permutation_n_vector(n_perms = n_perms,
  #                                                   stats = remap_list[[2]],
  #                                                   features = remap_list[[3]],
  #                                                   blocks = remap_list[[1]]))
  #     increment_step <- increment_step+1
  #   }
  #   return(list(remap_list,optimised_block_size_kb*1000))
  # }
}

blocks_to_remapped <- function(block_size,
                               background,
                               candidates,
                               features,
                               genic_only) {
  blocks <- if (genic_only) {
    block_maker(
      block_size = block_size,
      stats = background,
      features = features
    )
  } else {
    block_maker(
      block_size = block_size,
      stats = background,
      features = NULL
    )
  }
  clean_blocks <- block_cleaner(blocks,
    stats = background,
    features = NULL
  )
  remapped_candidates <- remapper(blocks = clean_blocks, to_remap = candidates)[,.(chr=id,start=block.relative_start,end=block.relative_end,stat)]
  setkey(remapped_candidates,chr,start,end)
  remapped_features <- remapper(blocks = clean_blocks, to_remap = features)
  return(list(clean_blocks, remapped_candidates, remapped_features))
}

make_permutation_n_vector <- function(n_perms, stats, features, blocks) {
  perm_gene_n <- vector(mode = "numeric", length = n_perms)
  for (i in 1:n_perms) {
    perm_gene_n[i] <- melt_permuation_list(lapply(stats, function(x) data.table::foverlaps(x, permute_blocks(features = features, blocks = blocks), nomatch = 0, by.x = c("id", "block.relative_start", "block.relative_end"))))[,.N]#[, uniqueN(gene)]
  }
  return(perm_gene_n)
}
joshuamschmidt/multiPermr documentation built on Oct. 12, 2020, 11:42 a.m.