# we have two types of input data
# stats, and features.
# in a simple go test, a stat is any genome wide statistic i.e. GWAS, selection test
# a feature is the genes.
# workflow
# if we have a change to gene definition, like gene +-2kb.
#genes_2kb <- genes[,`:=`(start=start-2000,end=end+2000)]
#data.table::setkey(genes,chr,start,end)
## or from GTF
# genes <- gtf_reader("/Users/joshuaschmidt/Dropbox/Gene_lists/input_data/genome_annotation/chimp.genes.human.names.homologs.and.amb.homology_mar.17.gtf")
# # if we have a change to gene defintion, like gene +-2kb.
# genes[,`:=`(start=start-2000,end=end+2000)]
# data.table::setkey(genes,chr,start,end)
# 4: make genome blocks. We need to control for the 'callable genome'.
# Each block has at least one stat and one feature.
# have to pick a block size. 250-500kb is suitable.
# make a general function if passing multiple stats
# blocks <- block_maker(
# block_size = 250e3,
# stats = list("eastern" = eastern_background,
# "central" = central_background),
# features = genes
# )
# clean_blocks <- block_cleaner(
# blocks,
# stats = list("eastern"=eastern_background,
# "central"=central_background),
# features = NULL
# )
data.table::setDTthreads()
system.time(optimised_blocks <- optimise_blocks(
background = list("eastern" = pbs_background#,
#"central" = pbs_background,
#"internal" = internal_background_focal),
),
candidates = list("eastern" = eastern_candidates,
"central" = central_candidates,
"internal" = internal_candidates_focal),
features = genes_2kb,
initial_block_size_kb = 500,
tolerance_percent = 1,
n_perms = 20,
increment_step_size_kb = 25,
genic_only = FALSE
))
# any features overlap dropped/changed windows?
changed_genes <- trimmed_feature_finder(
clean_blocks = optimised_blocks[[1]][[1]],
feature = genes_2kb
)
# remove low coverage genes from gene list and from enrichment gene set
clean_genes <- remove_low_coverage_from_feature_set(
feature_set = genes_2kb,
missing_features = changed_genes,
coverage_threshold = 1
)
clean_kegg <- sync_feature_set_map_to_feature_set(
feature_set_map = kegg_long,
clean_feature_set = clean_genes,
min_n_set = 10
)
# remapped_stats <- remapper(
# blocks = clean_blocks,
# to_remap=list("eastern"=eastern_candidates,
# "central"=central_candidates)
# )
#
# remapped_genes <- remapper(blocks = clean_blocks, to_remap = clean_genes)
#
# permutations
system.time(permutations <- gene_set_enrichment(stats = optimised_blocks[[1]][[2]],
features = optimised_blocks[[1]][[3]],
feature_set_map = clean_kegg,
n_permutations = 100e3)
)
#---* change that optimised_blocks has list of lists.....*-----#
# note that i get some permutations with id NA..... but this is inconsistent!
#permutations <- permutations[!id %in% NA]
system.time(permutations <- calculate_empP(permutations))
system.time(permutations <- correct_fdr_permDT(permutations))
# tidy/final results table
results <- create_results_table(permDT = permutations,
feature_set_map = clean_kegg)
results_table <- plot_results_table(results,show_n = 10,
sort_by = "enriched_p")
results_table <- plot_results_table(results,show_n = 10,
sort_by = "depleted_p")
# resamples have higher genic content then obs data?
# could be if obs data has a relative depletion in genic?
# test with simulated data (SNPs a fixed amount per bp (1 ^-5kbp))
simulated_background <- simulate_background(blocks = clean_blocks,
snp_distance_kb = 1)
#simulated_background <- simulate_background_from_genes(genes = genes,snp_distance_kb = 0.1)
simulated_eastern_candidates <- simulate_random_candidates(background = simulated_background, n_candidates = 6000)
simulated_central_candidates <- simulate_random_candidates(background = simulated_background, n_candidates = 6000) # rbind(simulated_eastern_candidates[sample(.N,size = 500,replace = F)], simulate_random_candidates(background = simulated_background, n_candidates = 3500))
simulated_blocks <- block_maker(block_size = 500e3,
stats = list("eastern"=simulated_background,"central"=simulated_background))
simulated_clean_blocks <- block_cleaner(simulated_blocks,
stats = list("eastern"=simulated_background,"central"=simulated_background),
features = NULL)
# any features overlap dropped/changed windows?
simulated_changed_genes <- trimmed_feature_finder(clean_blocks = simulated_clean_blocks,
feature = genes)
# remove low coverage genes from gene list and from enrichment gene set
simulated_clean_genes <- remove_low_coverage_from_feature_set(feature_set = genes,
missing_features = simulated_changed_genes,
coverage_threshold = 50)
simulated_clean_kegg <- sync_feature_set_map_to_feature_set(feature_set_map = kegg_long,
clean_feature_set = simulated_clean_genes)
simulated_remapped_stats <- remapper(blocks = simulated_clean_blocks,
to_remap=list("eastern"=simulated_eastern_candidates,"central"=simulated_central_candidates))
simulated_remapped_genes <- remapper(blocks = simulated_clean_blocks, to_remap = simulated_clean_genes)
# permutations
system.time(simulated_permutations <- gene_set_enrichment(stats=simulated_remapped_stats,features = simulated_remapped_genes,blocks = simulated_clean_blocks, n_permutations = 1000, feature_set_map = simulated_clean_kegg))
system.time(simulated_permutations <- calculate_empP(simulated_permutations))
system.time(simulated_permutations <- correct_fdr_permDT(simulated_permutations))
simulated_permutations[perm_n!=0][,mean(stat_n_gene)]
simulated_permutations[perm_n==0][order(two_tailed_fdr)]
# simulated, with some genic enrichment in candidates versus background....
simulated_eastern_candidates <- simulate_genic_enriched_candidates(background = simulated_background,features = simulated_clean_genes, n_candidates = 4000, genic_fraction = 0.05)
simulated_central_candidates <- simulate_genic_enriched_candidates(background = simulated_background,features = simulated_clean_genes, n_candidates = 4000, genic_fraction = 0.05)
simulated_blocks <- block_maker(block_size = 250e3,
stats = list("eastern"=simulated_background,"central"=simulated_background))
simulated_clean_blocks <- block_cleaner(simulated_blocks,
stats = list("eastern"=simulated_background,"central"=simulated_background),
features = NULL)
# any features overlap dropped/changed windows?
simulated_changed_genes <- trimmed_feature_finder(clean_blocks = simulated_clean_blocks,
feature = genes)
# remove low coverage genes from gene list and from enrichment gene set
simulated_clean_genes <- remove_low_coverage_from_feature_set(feature_set = genes,
missing_features = simulated_changed_genes,
coverage_threshold = 50)
simulated_clean_kegg <- sync_feature_set_map_to_feature_set(feature_set_map = kegg_long,
clean_feature_set = simulated_clean_genes)
simulated_remapped_stats <- remapper(blocks = simulated_clean_blocks,
to_remap=list("eastern"=simulated_eastern_candidates,"central"=simulated_central_candidates))
simulated_remapped_genes <- remapper(blocks = simulated_clean_blocks, to_remap = simulated_clean_genes)
# permutations
system.time(simulated_permutations <- gene_set_enrichment(stats=simulated_remapped_stats,features = simulated_remapped_genes,blocks = simulated_clean_blocks, n_permutations = 1000, feature_set_map = simulated_clean_kegg))
system.time(simulated_permutations <- calculate_empP(simulated_permutations))
system.time(simulated_permutations <- correct_fdr_permDT(simulated_permutations))
data.table::setkey(features,chr,start,end)
data.table::setkey(windows,chr,start,end)
genes.windows <- data.table::foverlaps(features,windows, nomatch=NULL)
# think easiest thing is associate genes with windows, and relativise coordinates
# essentially window becomes chr in overlap. genes 'split over' chromosomes.
# candidate SNPs 'stay' where they are
genes.windows[,win.relative_start:=ifelse(start < i.start, i.start-start,1)]
genes.windows[,win.relative_end:=ifelse(i.end < end, i.end-start,end-start)]
genes.windows <- genes.windows[,.(chr=id,start=win.relative_start,end=win.relative_end,gene)]
# map stat to windows
data.table::setkey(windows,chr,start,end)
data.table::setkey(stat,chr,start,end)
stat.windows <- data.table::foverlaps(stat,windows)
stat.windows[,win.relative_start:=ifelse(start < i.start, i.start-start,1)]
stat.windows[,win.relative_end:=ifelse(i.end < end, i.end-start,end-start)]
# make the object for genes, relative to windows
test.genes <- genes.windows[,.(chr=id,start=win.relative_start,end=win.relative_end,gene)]
saveRDS(test.genes,"test.genes.rds")
setkey(test.snps,chr,start,end)
setkey(test.genes,chr,start,end)
if(allowMultiHits==TRUE){
obs.genes <- sort(foverlaps(test.snps,test.genes,nomatch = NULL)[order(gene),unique(gene),by=stat]$V1)
} else{obs.genes <- foverlaps(test.snps,test.genes,nomatch = NULL)[order(gene),unique(gene)]}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.