library(corrplot) library(devtools) library(DT) library(here) library(ggplot2) library(GeneOverlap) library(LDSCforRyten) library(tidyverse) library(readxl) library(UpSetR) devtools::load_all(path = "/home/rreynolds/projects/MarkerGenes/") predictions <- read_csv(file = "~/misc_projects/JuanBotia_G2PML/data/20190718_allpredictions.csv") %>% dplyr::mutate(panel = str_replace(panel, ".rds", ""), # Rename panel = str_replace(panel, "Early_onset_dementia_\\(encompassing_fronto-temporal_dementia_and_prion_disease\\)", "Early_onset_dementia_with_FTD_and_prion"), panel = str_replace_all(panel, "_", ".")) %>% dplyr::select(-X1) knitr::opts_chunk$set(echo = T, message= F, warning = F)
predictions %>% dplyr::filter(quality >= 1) %>% dplyr::group_by(panel) %>% summarise(n = n())
predictions %>% dplyr::filter(quality >= 0.8) %>% dplyr::group_by(panel) %>% summarise(n = n())
filtered_predictions <- # Create dataframe containing panels and gene predictions with quality >= 1 predictions %>% dplyr::filter(quality >= 0.8) %>% dplyr::mutate(annot = "prediction.80") %>% # Bind additional dataframe containing panels and gene predictions with quality >= 0.8 bind_rows(predictions %>% dplyr::filter(quality >= 1) %>% dplyr::mutate(annot = "prediction.100"))
# Create lists of genes to perform overlap gene_names_list <- setNames(filtered_predictions %>% dplyr::filter(annot == "prediction.100") %>% group_split(panel), filtered_predictions %>% dplyr::filter(annot == "prediction.100") %>% .[["panel"]] %>% unique() %>% sort()) %>% lapply(., function(x) x %>% .[["gene"]]) upset(fromList(gene_names_list), sets = names(gene_names_list), keep.order = TRUE, nintersects = 30, order.by = "freq")
gom.self <- newGOM(gene_names_list, gene_names_list, genome.size = 21196) All.jaccard <- getMatrix(gom.self, "Jaccard") col <- colorRampPalette(c("#BB4444", "#EE9988", "#FFFFFF", "#77AADD", "#4477AA")) corrplot(All.jaccard, method="color", col=col(100), cl.lim = c(0,1), type="lower", # addCoef.col = "black", number.cex = 0.75, # Add coefficient of correlation tl.col="black", tl.srt=45, tl.cex = 0.5, mar=c(0,0,1,0), #Text label color and rotation p.mat = All.jaccard, sig.level = (0.05), insig = "p-value", number.cex = 0.5, diag = TRUE)
LDSCforRyten
package.# The following snippet only needs to be run once. # Extract positional information and add 100kb window around genes within dataframe filtered_predictions <- filtered_predictions %>% LDSCforRyten::AddGenePosDetails_RemoveXandYandMT(., columnToFilter = "gene", mart = 37, attributes = c("hgnc_symbol","chromosome_name", "start_position","end_position"), filter = c("hgnc_symbol")) %>% LDSCforRyten::AddBPWindow(windowsize = 100000) %>% # Add column combining panel and prediction quality into one name, to allow splitting of dataframe into lists dplyr::mutate(combined = str_c(annot, panel, sep = ":")) %>% dplyr::select(combined, everything()) # Split dataframe by panel/prediction quality into individual dataframes and store within a named list # NOTE: Names assigned to each individual dataframe will be used to output files to a directory, and as the file name. It is VERY important that these names contain no '_', as this will downstream use of Assimilate_H2_results() function, which will remove anything prior to the last _ in the annotation name. gene_list <- setNames(filtered_predictions %>% group_split(combined), filtered_predictions %>% .[["combined"]] %>% unique() %>% sort()) # Loading baseline and creating Granges object from baseline model BM <- LDSCforRyten::creating_baseline_df(baseline_model = "97") BM_GR <- LDSCforRyten::df2GRanges(BM, seqname_col = "CHR", start_col = "BP", end_col = "BP") # Find overlapping regions between genes in each annotation and BM (baseline model) hits <- gene_list %>% LDSCforRyten::overlap_annot_list(BM_GR, seqname_col = "chromosome_name", start_col = "Gene.Start.MinusKB", end_col = "Gene.end.PlusKB") # Annotate BM with 1s where annotation overlaps with the baseline model list.BM <- hits %>% overlap_annot_hits_w_baseline(BM) # Exporting files annot_basedir <- "/home/rreynolds/data/Extra_Annotations/Juan2019.ML/" LDSCforRyten::create_annot_file_and_export(list.BM, annot_basedir = annot_basedir)
GWASs <- as.character("AD2018,ALS2018,Anxiety2016.cc,Anxiety2016.fs,ILAE.Epilepsy2018.All,ILAE.Epilepsy2018.Focal,ILAE.Epilepsy2018.Generalised,Height2018,Intelligence2018,MDD2018_ex23andMe,MS,OCD2017,PD2018_meta5_ex23andMe,RA2014.EUR,SCZ2018,WHR") %>% str_split(",") %>% unlist() read_excel(path = here::here("misc", "LDSC_GWAS_details.xlsx")) %>% dplyr::filter(Original_name %in% GWASs) %>% dplyr::select(-contains("Full_paths"), -Original_name, -Notes) %>% DT::datatable(rownames = FALSE, options = list(scrollX = TRUE), class = 'white-space: nowrap')
LDSC_Pipeline_Entire.R
script. This script has multiple optional flags, including:--GWAS
: If called, this flag allows the user to select which GWAS to run with. If this option is not called, then the script will default to running all available GWASs. Note that running with all GWASs will slow the process.--baseline_model
: Use this flag to specify which baseline model should be used to estimate heritability. By default, the 97 annotation will be used, which includes annotations relating to genetic architecture, LD, MAF, conservation, etc.--calculate_LD_alone
: If user only wants to calculate LD scores for an annotation, argument 'TRUE' must be supplied.--calculate_h2_alone
: If user only wants to calculate heritability estimates, argument 'TRUE' must be supplied.-h
flag.```{bash LDSC arguments, eval = F, tidy = T} cd /home/rreynolds/data/Extra_Annotations/Juan2019.ML/
nohup Rscript \ /home/rreynolds/packages/LDSCforRyten/pipelines/LDSC_Pipeline_Entire.R \ /home/rreynolds/data/Extra_Annotations/ \ Juan2019.ML \ prediction.100:Cerebral.vascular.malformations,prediction.100:Congenital.muscular.dystrophy,prediction.100:Congenital.myopathy,prediction.100:Distal.myopathies,prediction.100:Early.onset.dementia.with.FTD.and.prion,prediction.100:Early.onset.dystonia,prediction.100:Genetic.epilepsy.syndromes,prediction.100:Intellectual.disability,prediction.100:Parkinson.Disease.and.Complex.Parkinsonism,prediction.100:Skeletal.Muscle.Channelopathies \ --GWAS=AD2018,ALS2018,Anxiety2016.cc,Anxiety2016.fs,ILAE.Epilepsy2018.All,ILAE.Epilepsy2018.Focal,ILAE.Epilepsy2018.Generalised,Height2018,Intelligence2018,MDD2018_ex23andMe,MS,OCD2017,PD2018_meta5_ex23andMe,RA2014.EUR,SCZ2018,WHR \ --baseline_model=97 \ &>/home/rreynolds/misc_projects/JuanBotia_G2PML/nohup_logs/20190731_LDSC_Prediction_100.log&
# Results - A number of functions are availble to help collect the output of LDSC. In the sample code below, only `Assimilate_H2_results()` was used. There are also, however, functions for plotting the results of these (not used in this tutorial). For details, please refer to `LDSC_SummariseOutput_Functions.R` found in the `R` directory of this package. ## Collecting the results ```r results <- LDSCforRyten::Assimilate_H2_results(path_to_results = list.files(path = "/home/rreynolds/data/Extra_Annotations/Juan2019.ML/Output/", pattern = ".results", full.names = T)) %>% Calculate_enrichment_SE_and_logP(one_sided = "+") %>% separate(annot_name, into = c("prediction_quality", "panel"),sep = ":") %>% dplyr::select(GWAS, prediction_quality, panel, everything()) %>% dplyr::mutate(GWAS = str_replace(GWAS, "ILAE.Epilepsy2018.All", "EPI All"), GWAS = str_replace(GWAS, "ILAE.Epilepsy2018.Focal", "EPI Focal"), GWAS = str_replace(GWAS, "ILAE.Epilepsy2018.Generalised", "EPI Generalised"), GWAS = str_replace(GWAS, "PD2018.ex23andMe", "PD2018.ex23&Me")) # # Saved once # write_csv(x = results %>% # dplyr::select(-Enrichment.Lower.SE, -Enrichment.Upper.SE, -Log_P, -Coefficient.Lower.SE, -Coefficient.Upper.SE), # path = "/home/rreynolds/misc_projects/JuanBotia_G2PML/results/20190802_LDSCsummary.csv") # Only first three results shown results[1:3,] %>% dplyr::filter(prediction_quality == "prediction.100") %>% dplyr::select(-Enrichment_p, -Enrichment.Lower.SE, -Enrichment.Upper.SE, -Log_P, -Coefficient.Lower.SE, -Coefficient.Upper.SE)
ggplot(data = results[1:3,] %>% dplyr::filter(prediction_quality == "prediction.100"), aes(x = MarkerGenes::reorder_within(x = panel, by = Z_score_logP, within = GWAS, fun = max, desc = TRUE), y = Z_score_logP)) + geom_point(aes(), shape = 19, size = 1.5) + geom_hline(aes(yintercept = -log10(0.05/(1*10))), linetype = "77", size = 0.6) + MarkerGenes::scale_x_reordered() + coord_flip() + facet_wrap(~ GWAS, scales = "free_y", ncol = 2) + labs(x = "", y = expression("Coefficient P-value (-log"[1][0]~"p)"), title = "") + theme_bw() + theme(axis.title = element_text(size = 8, face = "bold"), axis.text.x = element_text(size = 6, angle = 45, hjust = 1, vjust = 1), strip.text = element_text(size = 6), legend.position = "none", legend.key.size = unit(1,"line")) + scale_colour_manual(guide = FALSE, name = element_blank())
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.