R/annotation_functions.R

Defines functions annotate_permutation_multi gene_set_enrichment annotate_by_feature

annotate_by_feature <- function(stat, feature) {
  if (!all.equal(key(stat), c("chr", "start", "end"))) {
    data.table::setkey(stat, chr, start, end)
  }
  if (!all.equal(key(feature), c("chr", "start", "end"))) {
    data.table::setkey(feature, chr, start, end)
  }
  annotated <- foverlaps(stat, remapped.genes)[, c(1, 9, 10, 8, 11, 12, 7), with = F]
  data.table::setnames(annotated, c("i.start", "i.end", "i.id", "i.block.relative_start", "i.block.relative_end"), c("start", "end", "id", "block.relative_start", "block.relative_end"))
  cols <- names(annotated)[-7]
  annotated[, gene = list(gene), by = cols]
  return(annotated)
}

gene_set_enrichment <- function(stats, features, feature_set_map, n_permutations) {
  # stat_block_ids <- vector()
  # if (inherits(stats, "list")) {
  #   stat_block_ids <- sort(Reduce(union, lapply(stats, function(x) x[, unique(id)])))
  # }
  # if (inherits(stats, "data.table")) {
  #   stat_block_ids <- stats[, sort(unique(id))]
  # }
  # n_associated_blocks <- length(stat_block_ids)
  n.cores <- data.table::getDTthreads()
  observed <- get_observed_multi_stat2(
    statDT = stats,
    features = features,
    feature_set_map = feature_set_map,
    permutation_n = 0,
    allow_multiple_hits = T
  )
  permuted <- data.table::rbindlist(parallel::mclapply(1:n_permutations,
    function(x) {
      get_permutation_multi_stat2(statDT = stats,
                                  features = features,
                                  feature_set_map = feature_set_map,
                                  permutation_n = x,
                                  allow_multiple_hits = T)
        
    },
    mc.cores = n.cores
  ))
  # include observed set as perm_n 0
  permuted <- data.table::rbindlist(list(permuted, observed))
  return(permuted[])
}


annotate_permutation_multi <- function(permDT, feature_set_map, permutation_n, output_genes) {
  permDT[, stat_n_gene := .N]
  permDT[, stat_unique_gene := uniqueN(gene)]
  permDT <- unique(feature_set_map[permDT, on = .(gene)
                                   ][!id %in% NA
                                     ][, .(stat = "multi_hits",
                                           set_n = length(gene),
                                           stat_n_gene,
                                           stat_unique_gene,
                                           genes = ifelse(output_genes,
                                                          paste(sort(gene),
                                                                collapse = " "),
                                                          NA)), by = .(id)])[]
  if(permDT[,uniqueN(id)] < feature_set_map[,uniqueN(id)]) {
    permDT <- rbind(permDT,
                    data.table(id = feature_set_map[!id %in% permDT$id,
                                                    unique(id)],
                               stat = "multi_hits",
                               set_n = 0,
                               stat_n_gene = permDT[1]$stat_n_gene,
                               stat_unique_gene = permDT[1]$stat_unique_gene,
                               genes = NA))
  }
  permDT[, set_pr := ifelse(set_n > 0, set_n / stat_n_gene, 0)]
  permDT[, perm_n := permutation_n]
  return(permDT)
}

annotate_permutation_unique <- function(permDT, feature_set_map, permutation_n, output_genes) {
  permDT[, total_n_gene := uniqueN(gene)]
  permDT <- unique(feature_set_map[out, on = .(gene)][!id %in% NA][, .(stat = "unique_hits", set_n = uniqueN(gene), stat_n_gene = total_n_gene, genes = ifelse(output_genes, paste(sort(gene), collapse = " "), NA)), by = .(id)])
  permDT <- rbind(permDT, data.table(id = feature_set_map[!id %in% permDT$id, unique(id)], stat = "unique_hits", set_n = 0, stat_n_gene = out[1]$stat_n_gene, genes = NA))
  permDT[, set_pr := ifelse(set_n > 0, set_n / stat_n_gene, 0)]
  permDT[, perm_n := permutation_n]
  return(permDT)
}

get_permutation_multi_stat <- function(stats,
                                       features,
                                       feature_set_map,
                                       blocks,
                                       stat_block_ids,
                                       n_associated_blocks,
                                       permutation_n,
                                       allow_multiple_hits) {
  perm <- permute_blocks(features = features, blocks = blocks)
  out <- melt_permuation_list(lapply(stats, function(x) data.table::foverlaps(x, perm, nomatch = 0, by.x = c("id", "block.relative_start", "block.relative_end"))))
  if (allow_multiple_hits) {
    out <- annotate_permutation_multi(permDT = out, feature_set_map = feature_set_map, permutation_n = permutation_n, output_genes = FALSE)
    #   out[,total_n_gene:= .N]
    #   out[,unique_n_gene:= uniqueN(gene)]
    #   out <- unique(feature_set_map[out,on=.(gene)][!id %in% NA][,.(stat='multi_hits',set_n=length(gene),stat_n_gene=total_n_gene,stat_unique_gene=unique_n_gene,genes=NA),by=.(id)])[]
    #   out <- rbind(out,data.table(id=feature_set_map[!id %in% out$id,unique(id)],stat='multi_hits',set_n=0,stat_n_gene=out[1]$stat_n_gene,stat_unique_gene=out[1]$stat_unique_gene,genes=NA))
  } else {
    out <- annotate_permutation_unique(permDT = out, feature_set_map = feature_set_map, permutation_n = permutation_n, output_genes = FALSE)
    #   out[,total_n_gene:=uniqueN(gene)]
    #   out <-  unique(feature_set_map[out,on=.(gene)][!id %in% NA][,.(stat='unique_hits',set_n=uniqueN(gene),stat_n_gene=total_n_gene,genes=NA),by=.(id)])
    #   out <- rbind(out,data.table(id=feature_set_map[!id %in% out$id,unique(id)],stat='unique_hits',set_n=0,stat_n_gene=out[1]$stat_n_gene,genes=NA))
  }
  # out[,set_pr:=ifelse(set_n>0,set_n/stat_n_gene,0)]
  # out[,perm_n:=permutation_n]
  return(out[])
}


get_permutation_multi_stat2 <- function(statDT,
                                       features,
                                       feature_set_map,
                                       permutation_n,
                                       allow_multiple_hits) {
  permuted_genes <- permute_genes(features = features)
  setkey(permuted_genes,chr,start,end)
  # out <- data.table::foverlaps(statDT,
  #                              data.table::setkey(features[,.(chr=id,start=block.relative_start,end=block.relative_end,gene)],chr,start,end),
  #                              nomatch = NULL)[,.(gene=unique(gene)),by=stat]
  # 
  out <- data.table::foverlaps(statDT,
                               permuted_genes,
                               nomatch = NULL)[,.(gene=unique(gene)),by=stat]
  
  if (allow_multiple_hits) {
    out <- annotate_permutation_multi(permDT = out,
                                      feature_set_map = feature_set_map,
                                      permutation_n = permutation_n,
                                      output_genes = FALSE)
  } else {
    out <- annotate_permutation_unique(permDT = out,
                                       feature_set_map = feature_set_map,
                                       permutation_n = permutation_n,
                                       output_genes = FALSE)
    }
  return(out[])
}


permute_genes <- function(features){
  perm.genes <- features[features[,.(id=unique(id),
                                     new=sample(unique(id),
                                                uniqueN(id),
                                                replace = FALSE)),by=chr
  ], on=.(id)][,.(chr=new,
                  start=block.relative_start,
                  end=block.relative_end,gene)]
  setkey(perm.genes,chr,start,end)[]
}
get_observed_multi_stat2 <- function(statDT,
                                     features,
                                     feature_set_map,
                                     permutation_n,
                                     allow_multiple_hits){
  out <- data.table::foverlaps(data.table::setkey(statDT,chr,start,end),
                                            data.table::setkey(features[,.(chr=id,start=block.relative_start,end=block.relative_end,gene)],chr,start,end),
                                            nomatch = 0)[,.(gene=unique(gene)),by=stat]
  if (allow_multiple_hits) {
    out <- annotate_permutation_multi(permDT = out,
                                      feature_set_map = feature_set_map,
                                      permutation_n = permutation_n,
                                      output_genes = TRUE)
  } else {
    out <- annotate_permutation_unique(permDT = out,
                                       feature_set_map = feature_set_map,
                                       permutation_n = permutation_n,
                                       output_genes = TRUE)
  }
  return(out[])
}

get_observed_multi_stat <- function(stats,
                                    features,
                                    feature_set_map,
                                    stat_block_ids,
                                    allow_multiple_hits) {
  out <- melt_permuation_list(lapply(stats, function(x) data.table::foverlaps(x, setkey(features[id %in% stat_block_ids][, .(id, block.relative_start, block.relative_end, gene)], id, block.relative_start, block.relative_end), nomatch = 0, by.x = c("id", "block.relative_start", "block.relative_end"))))
  if (allow_multiple_hits) {
    out <- annotate_permutation_multi(permDT = out, feature_set_map = feature_set_map, permutation_n = 0, output_genes = TRUE)
    # out[,total_n_gene:= .N]
    # out[,unique_n_gene:= uniqueN(gene)]
    # out <- unique(feature_set_map[out,on=.(gene)][!id %in% NA][,.(stat='multi_hits',set_n=length(gene),stat_n_gene=total_n_gene,stat_unique_gene=unique_n_gene,genes=paste(sort(gene),collapse = " ")),by=.(id)])[]
    # out <- rbind(out,data.table(id=feature_set_map[!id %in% out$id,unique(id)],stat='multi_hits',set_n=0,stat_n_gene=out[1]$stat_n_gene,stat_unique_gene=out[1]$stat_unique_gene,genes=NA))
  } else {
    out <- annotate_permutation_unique(permDT = out, feature_set_map = feature_set_map, permutation_n = 0, output_genes = TRUE)
    # out[,total_n_gene:=uniqueN(gene)]
    # out <-  unique(feature_set_map[out,on=.(gene)][!id %in% NA][,.(stat='unique_hits',set_n=uniqueN(gene),stat_n_gene=total_n_gene,genes=paste(sort(gene),collapse = " ")),by=.(id)])
    # out <- rbind(out,data.table(id=feature_set_map[!id %in% out$id,unique(id)],stat='unique_hits',set_n=0,stat_n_gene=out[1]$stat_n_gene,genes=NA))
  }
  # out[,set_pr:=ifelse(set_n>0,set_n/stat_n_gene,0)]
  # out[,perm_n:=0]
  return(out[])
}

# permute feature locations, and get new stat -> feature mapping
# permute_blocks <- function(features, blocks, stat_block_ids, n_associated_blocks) {
#   setkey(features[blocks[,.(old=stat_block_ids,id=sample(unique(id),size = n_associated_blocks,replace = F))],on=.(id)][!gene %in% NA,.(id=old,block.relative_start,block.relative_end,gene)], id, block.relative_start,block.relative_end)[]
# }

permute_blocks <- function(features, blocks) {
  blocks$id2 <- blocks$id
  setkey(features[blocks[sample(.N, .N, replace = F), .(chr, start, end, old = id, id = blocks$id2)], on = .(id)][!gene %in% NA][, .(id = old, block.relative_start, block.relative_end, gene)], id, block.relative_start, block.relative_end)[]
}


force_genic_permute_blocks <- function(features, blocks, stat_block_ids, n_associated_blocks) {
  features[blocks[, .(chr, start, end, id)], on = .(id)][, uniqueN(gene)]
  blocks[features[, .(chr, start, end, id, gene)], on = .(id)][, uniqueN(gene)]
}

#genic_stat_block_ids <- sort(Reduce(union, lapply(remapped_stats, function(x) data.table::fovrelapsx[, unique(id)])))

# melt output of permute_blocks to a two column DT. (gene,stat)
melt_permuation_list <- function(permutation_list) {
  list_names <- names(permutation_list)
  feature <- (names(permutation_list[[1]])[4])
  data.table::rbindlist(lapply(seq_along(permutation_list), function(i) permutation_list[[i]][, .(gene = unique(get(feature)), stat = names(permutation_list)[[i]])]))
}


melt_permuation_list2 <- function(permutation_list) {
  list_names <- names(permutation_list)
  feature <- (names(permutation_list[[1]])[4])
  data.table::rbindlist(lapply(seq_along(permutation_list), function(i) permutation_list[[i]][, .(gene = unique(get(feature)), stat = names(permutation_list)[[i]])]))
}

##

# roll blocks within chromosome
roll_within <- function(chr_ids, shift, direction) {
  # n_ids <- length(chr_ids)
  range_ids <- range(chr_ids)
  max_ids <- range_ids[2] - range_ids[1] + 1
  new_ids <- chr_ids + shift * direction
  ifelse(new_ids > range_ids[2], new_ids - max_ids, ifelse(new_ids < range_ids[1], new_ids + max_ids, new_ids))
}
joshuamschmidt/multiPermr documentation built on Oct. 12, 2020, 11:42 a.m.