package_workflow.R

# we have two types of input data
# stats, and features.
# in a simple go test, a stat is any genome wide statistic i.e. GWAS, selection test
# a feature is the genes.

# workflow
# if we have a change to gene definition, like gene +-2kb.
#genes_2kb <- genes[,`:=`(start=start-2000,end=end+2000)]
#data.table::setkey(genes,chr,start,end)

## or from GTF

# genes <- gtf_reader("/Users/joshuaschmidt/Dropbox/Gene_lists/input_data/genome_annotation/chimp.genes.human.names.homologs.and.amb.homology_mar.17.gtf")
# # if we have a change to gene defintion, like gene +-2kb.
# genes[,`:=`(start=start-2000,end=end+2000)]
# data.table::setkey(genes,chr,start,end)


# 4: make genome blocks. We need to control for the 'callable genome'. 
# Each block has at least one stat and one feature.
# have to pick a block size. 250-500kb is suitable.
# make a general function if passing multiple stats
# blocks <- block_maker(
#   block_size = 250e3,
#   stats = list("eastern" = eastern_background,
#                "central" = central_background),
#   features = genes
# )
# clean_blocks <- block_cleaner(
#   blocks,
#   stats = list("eastern"=eastern_background,
#                "central"=central_background),
#   features = NULL
# )

data.table::setDTthreads()
system.time(optimised_blocks <- optimise_blocks(
  background = list("eastern" = pbs_background#,
                    #"central" = pbs_background,
                    #"internal" = internal_background_focal),
  ),
  candidates = list("eastern" = eastern_candidates,
                    "central" = central_candidates,
                    "internal" = internal_candidates_focal),
  features = genes_2kb,
  initial_block_size_kb = 500,
  tolerance_percent = 1,
  n_perms = 20,
  increment_step_size_kb = 25,
  genic_only = FALSE
))

# any features overlap dropped/changed windows?
changed_genes <- trimmed_feature_finder(
  clean_blocks = optimised_blocks[[1]][[1]],
  feature = genes_2kb
)

# remove low coverage genes from gene list and from enrichment gene set

clean_genes <- remove_low_coverage_from_feature_set(
  feature_set = genes_2kb,
  missing_features = changed_genes,
  coverage_threshold = 1
)

clean_kegg <- sync_feature_set_map_to_feature_set(
  feature_set_map = kegg_long,
  clean_feature_set = clean_genes,
  min_n_set = 10
)

# remapped_stats <- remapper(
#   blocks = clean_blocks,
#   to_remap=list("eastern"=eastern_candidates,
#                 "central"=central_candidates)
# )
# 
# remapped_genes <- remapper(blocks = clean_blocks, to_remap = clean_genes)
# 

# permutations
system.time(permutations <- gene_set_enrichment(stats = optimised_blocks[[1]][[2]],
                                                features = optimised_blocks[[1]][[3]],
                                                feature_set_map = clean_kegg,
                                                n_permutations = 100e3)
)
#---* change that optimised_blocks has list of lists.....*-----#
# note that i get some permutations with id NA..... but this is inconsistent!
#permutations <- permutations[!id %in% NA]
system.time(permutations <- calculate_empP(permutations))
system.time(permutations <- correct_fdr_permDT(permutations))

# tidy/final results table
results <- create_results_table(permDT = permutations,
                                feature_set_map = clean_kegg)

results_table <- plot_results_table(results,show_n = 10,
                                    sort_by = "enriched_p")  		

results_table <- plot_results_table(results,show_n = 10,
                                    sort_by = "depleted_p")  		

# resamples have higher genic content then obs data?
# could be if obs data has a relative depletion in genic?
# test with simulated data (SNPs a fixed amount per bp (1 ^-5kbp))

simulated_background <- simulate_background(blocks = clean_blocks,
                                            snp_distance_kb = 1)

#simulated_background <- simulate_background_from_genes(genes = genes,snp_distance_kb = 0.1)
simulated_eastern_candidates <- simulate_random_candidates(background = simulated_background, n_candidates = 6000)
simulated_central_candidates <- simulate_random_candidates(background = simulated_background, n_candidates = 6000) # rbind(simulated_eastern_candidates[sample(.N,size = 500,replace = F)], simulate_random_candidates(background = simulated_background, n_candidates = 3500))

simulated_blocks <- block_maker(block_size = 500e3,
                      stats = list("eastern"=simulated_background,"central"=simulated_background))

simulated_clean_blocks <- block_cleaner(simulated_blocks,
                              stats = list("eastern"=simulated_background,"central"=simulated_background),
                              features = NULL)

# any features overlap dropped/changed windows?
simulated_changed_genes <- trimmed_feature_finder(clean_blocks = simulated_clean_blocks,
                                        feature = genes)

# remove low coverage genes from gene list and from enrichment gene set

simulated_clean_genes <- remove_low_coverage_from_feature_set(feature_set = genes, 
                                                    missing_features = simulated_changed_genes, 
                                                    coverage_threshold = 50)

simulated_clean_kegg <- sync_feature_set_map_to_feature_set(feature_set_map = kegg_long, 
                                                  clean_feature_set = simulated_clean_genes)

simulated_remapped_stats <- remapper(blocks = simulated_clean_blocks,
                           to_remap=list("eastern"=simulated_eastern_candidates,"central"=simulated_central_candidates))

simulated_remapped_genes <- remapper(blocks = simulated_clean_blocks, to_remap = simulated_clean_genes)


# permutations
system.time(simulated_permutations <- gene_set_enrichment(stats=simulated_remapped_stats,features = simulated_remapped_genes,blocks = simulated_clean_blocks, n_permutations = 1000, feature_set_map = simulated_clean_kegg))


system.time(simulated_permutations <- calculate_empP(simulated_permutations))


system.time(simulated_permutations <- correct_fdr_permDT(simulated_permutations))


simulated_permutations[perm_n!=0][,mean(stat_n_gene)]
simulated_permutations[perm_n==0][order(two_tailed_fdr)]

# simulated, with some genic enrichment in candidates versus background....

simulated_eastern_candidates <- simulate_genic_enriched_candidates(background = simulated_background,features = simulated_clean_genes, n_candidates = 4000, genic_fraction = 0.05)
simulated_central_candidates <- simulate_genic_enriched_candidates(background = simulated_background,features = simulated_clean_genes, n_candidates = 4000, genic_fraction = 0.05)
simulated_blocks <- block_maker(block_size = 250e3,
                                stats = list("eastern"=simulated_background,"central"=simulated_background))

simulated_clean_blocks <- block_cleaner(simulated_blocks,
                                        stats = list("eastern"=simulated_background,"central"=simulated_background),
                                        features = NULL)

# any features overlap dropped/changed windows?
simulated_changed_genes <- trimmed_feature_finder(clean_blocks = simulated_clean_blocks,
                                                  feature = genes)

# remove low coverage genes from gene list and from enrichment gene set

simulated_clean_genes <- remove_low_coverage_from_feature_set(feature_set = genes, 
                                                              missing_features = simulated_changed_genes, 
                                                              coverage_threshold = 50)

simulated_clean_kegg <- sync_feature_set_map_to_feature_set(feature_set_map = kegg_long, 
                                                            clean_feature_set = simulated_clean_genes)

simulated_remapped_stats <- remapper(blocks = simulated_clean_blocks,
                                     to_remap=list("eastern"=simulated_eastern_candidates,"central"=simulated_central_candidates))

simulated_remapped_genes <- remapper(blocks = simulated_clean_blocks, to_remap = simulated_clean_genes)


# permutations
system.time(simulated_permutations <- gene_set_enrichment(stats=simulated_remapped_stats,features = simulated_remapped_genes,blocks = simulated_clean_blocks, n_permutations = 1000, feature_set_map = simulated_clean_kegg))


system.time(simulated_permutations <- calculate_empP(simulated_permutations))


system.time(simulated_permutations <- correct_fdr_permDT(simulated_permutations))








data.table::setkey(features,chr,start,end)
data.table::setkey(windows,chr,start,end)
genes.windows <- data.table::foverlaps(features,windows, nomatch=NULL)
# think easiest thing is associate genes with windows, and relativise coordinates
# essentially window becomes chr in overlap. genes 'split over' chromosomes.
# candidate SNPs 'stay' where they are
genes.windows[,win.relative_start:=ifelse(start < i.start, i.start-start,1)]
genes.windows[,win.relative_end:=ifelse(i.end < end, i.end-start,end-start)]
genes.windows <- genes.windows[,.(chr=id,start=win.relative_start,end=win.relative_end,gene)]
# map stat to windows
data.table::setkey(windows,chr,start,end)
data.table::setkey(stat,chr,start,end)
stat.windows <- data.table::foverlaps(stat,windows)
stat.windows[,win.relative_start:=ifelse(start < i.start, i.start-start,1)]
stat.windows[,win.relative_end:=ifelse(i.end < end, i.end-start,end-start)]


# make the object for genes, relative to windows
test.genes <- genes.windows[,.(chr=id,start=win.relative_start,end=win.relative_end,gene)]
saveRDS(test.genes,"test.genes.rds")
setkey(test.snps,chr,start,end)
setkey(test.genes,chr,start,end)
if(allowMultiHits==TRUE){
  obs.genes <- sort(foverlaps(test.snps,test.genes,nomatch = NULL)[order(gene),unique(gene),by=stat]$V1)
} else{obs.genes <- foverlaps(test.snps,test.genes,nomatch = NULL)[order(gene),unique(gene)]}
joshuamschmidt/multiPermr documentation built on Oct. 12, 2020, 11:42 a.m.