library(corrplot)
library(devtools)
library(DT)
library(here)
library(ggplot2)
library(GeneOverlap)
library(LDSCforRyten)
library(tidyverse)
library(readxl)
library(UpSetR)
devtools::load_all(path = "/home/rreynolds/projects/MarkerGenes/")

predictions <- read_csv(file = "~/misc_projects/JuanBotia_G2PML/data/20190718_allpredictions.csv") %>% 
  dplyr::mutate(panel = str_replace(panel, ".rds", ""),
                # Rename 
                panel = str_replace(panel, "Early_onset_dementia_\\(encompassing_fronto-temporal_dementia_and_prion_disease\\)", "Early_onset_dementia_with_FTD_and_prion"),
                panel = str_replace_all(panel, "_", ".")) %>% 
  dplyr::select(-X1)

knitr::opts_chunk$set(echo = T, message= F, warning = F)

Background

Running LDSC

Restricting gene lists

predictions %>% 
  dplyr::filter(quality >= 1) %>% 
  dplyr::group_by(panel) %>% 
  summarise(n = n())
predictions %>% 
  dplyr::filter(quality >= 0.8) %>% 
  dplyr::group_by(panel) %>% 
  summarise(n = n())

Creating gene lists

filtered_predictions <- 
  # Create dataframe containing panels and gene predictions with quality >= 1
  predictions %>% 
  dplyr::filter(quality >= 0.8) %>% 
  dplyr::mutate(annot = "prediction.80") %>% 
  # Bind additional dataframe containing panels and gene predictions with quality >= 0.8
  bind_rows(predictions %>% 
              dplyr::filter(quality >= 1) %>% 
              dplyr::mutate(annot = "prediction.100"))

Overlap between predicted genes across panels

Prediction quality >= 1

# Create lists of genes to perform overlap
gene_names_list <- setNames(filtered_predictions %>% 
                              dplyr::filter(annot == "prediction.100") %>% 
                              group_split(panel),
                            filtered_predictions %>% 
                              dplyr::filter(annot == "prediction.100") %>% 
                              .[["panel"]] %>% 
                              unique() %>% 
                              sort()) %>% 
  lapply(., function(x) x %>% .[["gene"]])

upset(fromList(gene_names_list), sets = names(gene_names_list), keep.order = TRUE, nintersects = 30, order.by = "freq")
gom.self <- newGOM(gene_names_list, gene_names_list, genome.size = 21196)
All.jaccard <- getMatrix(gom.self, "Jaccard")
col <- colorRampPalette(c("#BB4444", "#EE9988", "#FFFFFF", "#77AADD", "#4477AA"))
corrplot(All.jaccard, method="color", col=col(100), cl.lim = c(0,1), 
          type="lower", 
          # addCoef.col = "black", number.cex = 0.75, # Add coefficient of correlation
          tl.col="black", tl.srt=45, tl.cex = 0.5, mar=c(0,0,1,0), #Text label color and rotation
          p.mat = All.jaccard, sig.level = (0.05), insig = "p-value", number.cex = 0.5,  
          diag = TRUE)

Creating LDSC annotations

# The following snippet only needs to be run once.
# Extract positional information and add 100kb window around genes within dataframe
filtered_predictions <- filtered_predictions %>% 
  LDSCforRyten::AddGenePosDetails_RemoveXandYandMT(.,
                                     columnToFilter = "gene", 
                                     mart = 37, 
                                     attributes = c("hgnc_symbol","chromosome_name", 
                                                             "start_position","end_position"),
                                     filter = c("hgnc_symbol")) %>% 
  LDSCforRyten::AddBPWindow(windowsize = 100000) %>% 
  # Add column combining panel and prediction quality into one name, to allow splitting of dataframe into lists
  dplyr::mutate(combined = str_c(annot, panel, sep = ":")) %>% 
  dplyr::select(combined, everything())


# Split dataframe by panel/prediction quality into individual dataframes and store within a named list
# NOTE: Names assigned to each individual dataframe will be used to output files to a directory, and as the file name. It is VERY important that these names contain no '_', as this will downstream use of Assimilate_H2_results() function, which will remove anything prior to the last _ in the annotation name.  
gene_list <- setNames(filtered_predictions %>% 
                           group_split(combined),
                         filtered_predictions %>% 
                           .[["combined"]] %>% 
                           unique() %>% 
                           sort())

# Loading baseline and creating Granges object from baseline model
BM <- LDSCforRyten::creating_baseline_df(baseline_model = "97")
BM_GR <- LDSCforRyten::df2GRanges(BM, 
                                  seqname_col = "CHR", 
                                  start_col = "BP",
                                  end_col = "BP")

# Find overlapping regions between genes in each annotation and BM (baseline model)
hits <- gene_list %>%
  LDSCforRyten::overlap_annot_list(BM_GR, 
                                   seqname_col = "chromosome_name", 
                                   start_col = "Gene.Start.MinusKB", 
                                   end_col = "Gene.end.PlusKB")

# Annotate BM with 1s where annotation overlaps with the baseline model
list.BM <- hits %>%
  overlap_annot_hits_w_baseline(BM)

# Exporting files
annot_basedir <- "/home/rreynolds/data/Extra_Annotations/Juan2019.ML/"
LDSCforRyten::create_annot_file_and_export(list.BM, annot_basedir = annot_basedir)

Running LDSC

GWASs <- as.character("AD2018,ALS2018,Anxiety2016.cc,Anxiety2016.fs,ILAE.Epilepsy2018.All,ILAE.Epilepsy2018.Focal,ILAE.Epilepsy2018.Generalised,Height2018,Intelligence2018,MDD2018_ex23andMe,MS,OCD2017,PD2018_meta5_ex23andMe,RA2014.EUR,SCZ2018,WHR") %>% 
  str_split(",") %>% 
  unlist()

read_excel(path = here::here("misc", "LDSC_GWAS_details.xlsx")) %>% 
  dplyr::filter(Original_name %in% GWASs) %>% 
  dplyr::select(-contains("Full_paths"), -Original_name, -Notes) %>% 
  DT::datatable(rownames = FALSE,
                options = list(scrollX = TRUE),
                class = 'white-space: nowrap')

```{bash LDSC arguments, eval = F, tidy = T} cd /home/rreynolds/data/Extra_Annotations/Juan2019.ML/

Prediction quality >= 1

nohup Rscript \ /home/rreynolds/packages/LDSCforRyten/pipelines/LDSC_Pipeline_Entire.R \ /home/rreynolds/data/Extra_Annotations/ \ Juan2019.ML \ prediction.100:Cerebral.vascular.malformations,prediction.100:Congenital.muscular.dystrophy,prediction.100:Congenital.myopathy,prediction.100:Distal.myopathies,prediction.100:Early.onset.dementia.with.FTD.and.prion,prediction.100:Early.onset.dystonia,prediction.100:Genetic.epilepsy.syndromes,prediction.100:Intellectual.disability,prediction.100:Parkinson.Disease.and.Complex.Parkinsonism,prediction.100:Skeletal.Muscle.Channelopathies \ --GWAS=AD2018,ALS2018,Anxiety2016.cc,Anxiety2016.fs,ILAE.Epilepsy2018.All,ILAE.Epilepsy2018.Focal,ILAE.Epilepsy2018.Generalised,Height2018,Intelligence2018,MDD2018_ex23andMe,MS,OCD2017,PD2018_meta5_ex23andMe,RA2014.EUR,SCZ2018,WHR \ --baseline_model=97 \ &>/home/rreynolds/misc_projects/JuanBotia_G2PML/nohup_logs/20190731_LDSC_Prediction_100.log&

# Results
- A number of functions are availble to help collect the output of LDSC. In the sample code below, only `Assimilate_H2_results()` was used. There are also, however, functions for plotting the results of these (not used in this tutorial). For details, please refer to `LDSC_SummariseOutput_Functions.R` found in the `R` directory of this package.

## Collecting the results

```r

results <- LDSCforRyten::Assimilate_H2_results(path_to_results = list.files(path = "/home/rreynolds/data/Extra_Annotations/Juan2019.ML/Output/", 
                                                                            pattern = ".results", 
                                                                            full.names = T)) %>% 
  Calculate_enrichment_SE_and_logP(one_sided = "+") %>% 
  separate(annot_name, into = c("prediction_quality", "panel"),sep = ":") %>% 
  dplyr::select(GWAS, prediction_quality, panel, everything()) %>% 
  dplyr::mutate(GWAS = str_replace(GWAS, "ILAE.Epilepsy2018.All", "EPI All"),
                GWAS = str_replace(GWAS, "ILAE.Epilepsy2018.Focal", "EPI Focal"),
                GWAS = str_replace(GWAS, "ILAE.Epilepsy2018.Generalised", "EPI Generalised"),
                GWAS = str_replace(GWAS, "PD2018.ex23andMe", "PD2018.ex23&Me"))

# # Saved once
# write_csv(x = results %>%
#   dplyr::select(-Enrichment.Lower.SE, -Enrichment.Upper.SE, -Log_P, -Coefficient.Lower.SE, -Coefficient.Upper.SE),
#   path = "/home/rreynolds/misc_projects/JuanBotia_G2PML/results/20190802_LDSCsummary.csv")

# Only first three results shown
results[1:3,] %>% 
  dplyr::filter(prediction_quality == "prediction.100") %>% 
  dplyr::select(-Enrichment_p, -Enrichment.Lower.SE, -Enrichment.Upper.SE, -Log_P, -Coefficient.Lower.SE, -Coefficient.Upper.SE)

Plotting the results

ggplot(data = results[1:3,] %>% 
         dplyr::filter(prediction_quality == "prediction.100"),
       aes(x = MarkerGenes::reorder_within(x = panel,
                                           by = Z_score_logP,
                                           within = GWAS,
                                           fun = max,
                                           desc = TRUE),
           y = Z_score_logP)) + 
  geom_point(aes(), shape = 19, size = 1.5) + 
  geom_hline(aes(yintercept = -log10(0.05/(1*10))), linetype = "77", size = 0.6) + 
  MarkerGenes::scale_x_reordered() +
  coord_flip() +
  facet_wrap(~ GWAS, scales = "free_y", ncol = 2) +
  labs(x = "", y = expression("Coefficient P-value (-log"[1][0]~"p)"), title = "") +
  theme_bw() + 
  theme(axis.title = element_text(size = 8, face = "bold"),
        axis.text.x = element_text(size = 6, angle = 45, hjust = 1, vjust = 1),
        strip.text = element_text(size = 6),
        legend.position = "none",
        legend.key.size = unit(1,"line")) +
  scale_colour_manual(guide = FALSE,
                      name = element_blank())


RHReynolds/LDSCforRyten documentation built on Sept. 27, 2021, 5:20 p.m.