# R_tmp/validation.R

#' Merge validation assays into a single plot
#'
#' @family VALIDATION
#' @examples
#' \dontrun{
#' plt.ALL <- VALIDATION.super_plot(root="/sc/arion/projects/pd-omics/brian/Fine_Mapping/Data/GWAS/Nalls23andMe_2019/_genome_wide")
#' }
VALIDATION.super_plot <- function(root="/sc/arion/projects/pd-omics/brian/Fine_Mapping/Data/GWAS/Nalls23andMe_2019/_genome_wide",
                                  height=10, width=12,
                                  layout="horiz",
                                  show_plot=TRUE,
                                  save_plot=FALSE){
  # Assemble the four validation sub-plots (S-LDSC h2, IMPACT, SuRE MPRA,
  # deep learning) into one patchwork figure.
  #
  # root      : directory containing the precomputed per-assay result files.
  # layout    : "horiz" or "vert" arrangement of the merged figure.
  # save_plot : FALSE, or a file path to save the merged figure to.
  #
  # Fail fast on an unsupported layout; previously an unknown value left
  # `plt.ALL` undefined and crashed later at print().
  if(!layout %in% c("horiz","vert")){
    stop("layout must be one of 'horiz' or 'vert'.", call. = FALSE)
  }
  # ggplot2 is needed for theme()/unit()/element_blank()/ggsave();
  # patchwork supplies the `/` and `|` plot-composition operators.
  library(ggplot2)
  library(patchwork)
  snp.groups_list <- echodata::snp_group_filters()
  snp_groups <- names(snp.groups_list) # e.g. "GWAS lead","UCS (-PolyFun)","UCS","Consensus"
  # All groups except the "Support"-level ones (kept for reference/reuse).
  expanded_groups <- grep("Support", names(snp.groups_list),
                          value = TRUE, invert = TRUE)

  #### S-LDSC h2 ####
  res.h2 <- data.table::fread(file.path(root,"PolyFun/Nalls23andMe_2019.h2_enrich.snp_groups.csv.gz"))
  plt.h2 <- POLYFUN.h2_enrichment_SNPgroups_plot(RES = res.h2,
                                                 snp_groups = snp_groups,
                                                 save_path = file.path(root,"PolyFun/Nalls23andMe_2019.h2_enrich.snp_groups_expanded.png"),
                                                 remove_outliers = TRUE,
                                                 show_padj = FALSE,
                                                 show_signif = FALSE,
                                                 vjust_signif = 0.5,
                                                 show_xtext = FALSE,
                                                 show_plot = TRUE)

  #### IMPACT ####
  res.IMPACT <- data.table::fread(file.path(root,"IMPACT/TOP_IMPACT_all.csv.gz"))
  plt.IMPACT <- IMPACT.snp_group_boxplot(TOP_IMPACT_all = res.IMPACT,
                                         snp_groups = snp_groups,
                                         title = "IMPACT",
                                         ylabel = "IMPACT score",
                                         save_path = file.path(root,"IMPACT/Nalls23andMe_2019.IMPACT.snp_groups_expanded.png"),
                                         show_padj = FALSE,
                                         show_signif = FALSE,
                                         vjust_signif = 0.5,
                                         show_xtext = FALSE,
                                         show_plot = TRUE)

  #### SuRE MPRA ####
  res.SURE <- data.table::fread(file.path(root,"SURE/Nalls23andMe_2019.SURE.snp_groups.mean.csv.gz"))
  plt.SURE <- SURE.plot(sure.melt = res.SURE,
                        snp_groups = snp_groups,
                        facet_formula = ".~ Cell_type",
                        save_path = file.path(root,"SURE/Nalls23andMe_2019.SURE.snp_groups_expanded.mean.png"),
                        show_plot = TRUE,
                        show_padj = FALSE,
                        show_signif = FALSE,
                        vjust_signif = 0.5,
                        width = 8)

  #### Deep learning ####
  res.DL <- data.table::fread(file.path(root,"Dey_DeepLearning/Nalls23andMe_2019.Dey_DeepLearning.annot.Allelic_Effect.snp_groups_mean.csv.gz"))
  plt.DL <- DEEPLEARNING.plot(annot.melt = res.DL,
                              snp_groups = snp_groups,
                              facet_formula = "Assay ~ Model +Tissue",
                              model_metric = "MAX",
                              remove_outliers = TRUE,
                              show_padj = FALSE,
                              show_signif = FALSE,
                              vjust_signif = 0.5,
                              save_path = gsub("\\.csv\\.gz",".png",file.path(root,"Dey_DeepLearning/Nalls23andMe_2019.Dey_DeepLearning.annot.Allelic_Effect.snp_groups_mean_expanded.ALL.csv.gz")),
                              show_plot = TRUE)

  #### Merge plots ####
  # Shared zero-margin theme applied to the three small panels.
  no_margin <- theme(plot.margin = unit(rep(0, 4), "cm"))
  if(layout == "horiz"){
    plt.ALL <- (((plt.h2 + no_margin) /
                   (plt.IMPACT + no_margin) /
                   (plt.SURE + no_margin)) | plt.DL) +
      patchwork::plot_layout(widths = c(.25, 1)) +
      patchwork::plot_annotation(tag_levels = letters)
  } else {
    plt.ALL <- (((plt.h2 + no_margin) |
                   (plt.IMPACT + no_margin) |
                   (plt.SURE + no_margin +
                      theme(axis.text.x = element_blank(),
                            axis.title.x = element_blank()))) / plt.DL) +
      patchwork::plot_layout(heights = c(.25, 1)) +
      patchwork::plot_annotation(tag_levels = letters)
    # The vertical arrangement needs a larger canvas.
    height <- 15; width <- 20;
  }

  if(show_plot) print(plt.ALL)
  # `save_plot` doubles as a flag (FALSE) or an output path (character).
  if(!isFALSE(save_plot)){
    ggsave(save_plot, plt.ALL, dpi = 400,
           height = height, width = width)
  }
  return(plt.ALL)
}




#### MUST USE FULL DATASET (all loci merged) to get a proper null distribution
# Alternatively, could use a uniform distribution (but this is an assumption).
## But for IMPACT, a uniform distribution may actually be higher-impact than reality.


#' Validation permutation tests
#'
#' @family VALIDATION
#' @source
#' \href{https://www.datanovia.com/en/lessons/wilcoxon-test-in-r/}{Wilcox test tutorial}
#' @examples
#' \dontrun{
#' save_path <- root <- "/sc/arion/projects/pd-omics/brian/Fine_Mapping/Data/GWAS/Nalls23andMe_2019/_genome_wide"
#'
#' #### h2 ####
#' ## mean
#' path <- file.path(root,"PolyFun/Nalls23andMe_2019.h2_enrich.snp_groups.csv.gz")
#' metric <- "h2.enrichment"
#'
#' #### IMPACT ####
#' ## mean
#' path <- file.path(root,"IMPACT/TOP_IMPACT_all.csv.gz");
#' metric <- "mean_IMPACT"
#' ## raw
#' path <- file.path(root,"IMPACT/IMPACT_overlap.csv.gz")
#' metric <- "IMPACT_score"
#'
#'
#' ## Import and Process ##
#' metric_df <- data.table::fread(path, nThread=8)
#' if(metric=="IMPACT_score") metric_df <- subset(metric_df, select=c(SNP,leadSNP,ABF.Credible_Set,ABF.PP,SUSIE.Credible_Set,SUSIE.PP,POLYFUN_SUSIE.Credible_Set,POLYFUN_SUSIE.PP,FINEMAP.Credible_Set,FINEMAP.PP,Consensus_SNP,Support,Locus,IMPACT_score))
#' if(metric=="mean_IMPACT") metric_df <- echodata::find_consensus_snps_no_polyfun(metric_df)
#'
#' #### run bootstrap ####
#' permute_res <- VALIDATION.permute(metric_df=metric_df, metric=metric )
#' }
VALIDATION.permute <- function(metric_df,
                               metric,
                               locus_means=FALSE,
                               snp_groups=c("Random","GWAS lead","UCS (-PolyFun)","UCS","Consensus (-PolyFun)","Consensus"),
                               show_plot=FALSE){
  # Approximate-permutation independence tests of `metric` for each SNP group
  # against the "Random" reference group.
  #
  # metric_df   : per-SNP (or per-locus-mean) table containing `metric`.
  # metric      : name of the column to test.
  # locus_means : if TRUE, rows are per-locus means identified by an existing
  #               SNP_group column rather than by the echodata filters.
  # show_plot   : if TRUE, also print a barplot of the permutation z-scores.
  #               (New parameter; default FALSE preserves prior behavior --
  #               this plotting code was previously unreachable.)
  #
  # Returns a data.table with one row per target group (stat/z/p/etc.).
  snp_filters <- echodata::snp_group_filters(random_sample_size = 1000)
  if(locus_means){
    # Per-locus means: select rows by their precomputed SNP_group label.
    snp_filters <- setNames(paste0("SNP_group=='",snp_groups,"'"), snp_groups)
  } else {
    # Per-SNP input: make sure the no-PolyFun consensus column exists.
    if(!"Consensus_SNP_noPF" %in% colnames(metric_df)){
      try({
        metric_df <- echodata::find_consensus_snps_no_polyfun(metric_df)
      })
    }
  }

  # Prepare data: one long table of Locus/[SNP]/metric/SNP_group.
  message("VALIDATION:: Preparing data...")
  dat <- lapply(snp_groups, function(snp_group,
                                     .metric_df=metric_df,
                                     .locus_means=locus_means,
                                     .metric=metric){
    messager(snp_group)
    snp_filt <- snp_filters[[snp_group]]
    d <- subset(.metric_df, eval(parse(text=snp_filt)),
                select=c("Locus",if(.locus_means) NULL else "SNP",.metric))
    d$SNP_group <- snp_group
    return(d)
  }) %>% data.table::rbindlist() %>%
    dplyr::mutate(SNP_group=as.factor(SNP_group),
                  Locus=as.factor(Locus))

  messager("VALIDATION:: Beginning permutation testing.")
  ref_group <- "Random"
  target_groups <- snp_groups[!snp_groups %in% ref_group]
  RES <- lapply(target_groups, function(target,
                                        .ref_group=ref_group){
    print(target)
    ## NOTE: distribution="exact" requires LOTS of memory;
    ## use approximate() instead.
    distribution <- coin::approximate(nresample=10000)
    coin_res <- coin::independence_test(data=subset(dat, SNP_group %in% c(.ref_group,target)),
                                        as.formula(paste(metric,"~ SNP_group")),
                                        distribution = distribution)
    res <- data.frame(stat = coin::statistic(coin_res, type="standardized")[[1]],
                      z = coin::statistic(coin_res, type="standardized")[[1]],
                      stat_linear = coin::statistic(coin_res, type="linear")[[1]],
                      stat_centered =  coin::statistic(coin_res, type="centered")[[1]],
                      p = coin::pvalue(coin_res),
                      p_int_lower=coin::pvalue_interval(coin_res)[[1]],
                      p_int_upper=coin::pvalue_interval(coin_res)[[2]],
                      group1=.ref_group,
                      group2=target,
                      metric=metric,
                      method="coin::independence_test")
    return(res)
  }) %>% data.table::rbindlist()

  # Fixed: this block used to sit AFTER return(RES) (unreachable) and
  # referenced an undefined `show_plot` variable.
  if(show_plot){
    messager("VALIDATION:: Plotting permutation results.")
    library(ggplot2)
    colorDict <- echodata::snp_group_colorDict()
    gp <- ggplot(data = RES, aes(x=group2, y= z, fill=group2, label=paste("p =",p))) +
      geom_bar(stat = "identity", alpha=.5) +
      geom_text(aes(y=ifelse(z<0, z-.25, z+.25))) +
      scale_fill_manual(values = colorDict) +
      labs(title="Permuted independence tests",
           subtitle = paste0("Metric: ",metric,", ","Reference: ",ref_group)) +
      theme_bw() +
      theme(axis.text.x = element_text(angle=45, hjust=1))
    print(gp)
  }
  return(RES)
}






#' Perform validation bootstrap procedure
#'
#' @family VALIDATION
#' @source
#' \href{https://www.datanovia.com/en/lessons/wilcoxon-test-in-r/}{Wilcox test tutorial}
#' @examples
#' \dontrun{
#' save_path <- root <- "/sc/arion/projects/pd-omics/brian/Fine_Mapping/Data/GWAS/Nalls23andMe_2019/_genome_wide"
#'
#' #### h2 ####
#' ## mean
#' path <- file.path(root,"PolyFun/Nalls23andMe_2019.h2_enrich.snp_groups.csv.gz")
#' metric <- "h2.enrichment"
#' ## raw
#' path <- file.path(root,"PolyFun/h2_merged.csv.gz")
#' metric <- "SNPVAR"
#'
#' #### IMPACT ####
#' ## mean
#' path <- file.path(root,"IMPACT/TOP_IMPACT_all.csv.gz");
#' metric <- "mean_IMPACT"
#' ## raw
#' path <- file.path(root,"IMPACT/IMPACT_overlap.csv.gz")
#' metric <- "IMPACT_score"
#'
#'
#' ## Import and Process ##
#' metric_df <- data.table::fread(path, nThread=8)
#' if(metric=="IMPACT_score") metric_df <- subset(metric_df, select=c(SNP,leadSNP,ABF.Credible_Set,ABF.PP,SUSIE.Credible_Set,SUSIE.PP,POLYFUN_SUSIE.Credible_Set,POLYFUN_SUSIE.PP,FINEMAP.Credible_Set,FINEMAP.PP,Consensus_SNP,Support,Locus,IMPACT_score))
#' if(metric=="mean_IMPACT") metric_df <- echodata::find_consensus_snps_no_polyfun(metric_df)
#'
#' #### run bootstrap ####
#' boot_res <- VALIDATION.bootstrap(metric_df=metric_df, metric=metric, nThread=8, save_path=gsub("\\.csv\\.gz",".bootstrap.coin_wilcox_test.csv.gz",path) )
#' }
VALIDATION.bootstrap <- function(metric_df,
                                 metric,
                                 predictor = "SNP_group",
                                 validation_method=NULL,
                                 synthesize_random=FALSE,
                                 snp_groups=c("Random","GWAS lead","UCS (-PolyFun)","UCS","Consensus (-PolyFun)","Consensus"),
                                 test_method="coin_wilcox_test",
                                 locus_means=TRUE,
                                 iterations=1000,
                                 save_path=FALSE,
                                 nThread=1,
                                 verbose=TRUE){
  # Bootstrap test of `metric`: for each SNP group, repeatedly (x iterations)
  # draw a 20-SNP target sample and a 20-SNP random background sample and
  # test whether `metric` differs between them with `test_method`.
  #
  # synthesize_random : replace sampled metric values with uniform noise over
  #                     the observed metric range (use when the full dataset
  #                     is not available as a null).
  # save_path         : FALSE, or a file path to write the result table to.
  #
  # Returns a data.table of per-iteration test results (columns vary by
  # test_method), with Metric/test_method/SNP_group annotation columns.
  snp_filters <- echodata::snp_group_filters()
  if(locus_means){
    # Per-locus means: rows carry a precomputed SNP_group label.
    snp_filters <- setNames(paste0("SNP_group=='",snp_groups,"'"), snp_groups)
  } else {
    # Per-SNP input: make sure the no-PolyFun consensus column exists.
    if(!"Consensus_SNP_noPF" %in% colnames(metric_df)){
      try({
        metric_df <- echodata::find_consensus_snps_no_polyfun(metric_df)
      })
    }
  }

  # Bootstrap without replacement (within a given iteration).
  messager("VALIDATION:: Using",test_method,v=verbose)
  RES_GROUPS <- lapply(snp_groups, function(snp_group,
                                            .iterations=iterations,
                                            .test_method=test_method){
    message(snp_group)
    parallel::mclapply(seq_len(.iterations), function(i,
                                                      .snp_group=snp_group,
                                                      .synthesize_random=synthesize_random,
                                                      test_method=.test_method){
      replace <- FALSE
      res <- NULL
      # try() keeps a failed iteration (e.g. a degenerate sample) from
      # aborting the whole bootstrap; such iterations return NULL.
      try({
        if(i %% 100 == 0) print(i)
        snp_filt <- snp_filters[[.snp_group]]
        #### Random (background) sample ####
        bg_size <- 20
        random_snps <- metric_df %>%
          dplyr::sample_n(size=bg_size, replace=replace) %>%
          dplyr::mutate(SNP_group="Random")
        #### Optional: synthesize uniform noise instead of real values ####
        if(.synthesize_random){
          random_snps[[metric]] <- runif(n = bg_size,
                                         min=min(metric_df[[metric]], na.rm = TRUE),
                                         max=max(metric_df[[metric]], na.rm = TRUE))
        }

        #### Target (foreground) sample ####
        fg_size <- 20
        if(.snp_group == "Random"){
          # Random-vs-random serves as a negative control.
          .snp_group <- "Random_target"
          target_snps <- metric_df %>%
            dplyr::sample_n(size=fg_size, replace=replace) %>%
            dplyr::mutate(SNP_group=.snp_group)
          if(.synthesize_random){
            target_snps[[metric]] <- runif(n = fg_size,
                                           min=min(metric_df[[metric]], na.rm = TRUE),
                                           max=max(metric_df[[metric]], na.rm = TRUE))
          }
        } else {
          target_snps <- subset(metric_df, eval(parse(text=snp_filt))) %>%
            dplyr::sample_n(size = fg_size) %>%
            dplyr::mutate(SNP_group=.snp_group)
        }
        # Merge fg/bg; coin requires factor grouping columns.
        dat <- rbind(random_snps, target_snps) %>%
          dplyr::mutate(SNP_group=as.factor(SNP_group),
                        Locus=as.factor(Locus))

        #### Permutation testing ####
        ## NOTE: pairwise functions are normally applied after an omnibus
        ## independence test; here only two groups are compared at once,
        ## so their output format is simply convenient.

        #### coin: generic independence test ####
        if(test_method == "coin_independence_test"){
          coin_res <- coin::independence_test(data=dat,
                                   as.formula(paste(metric,"~",predictor)))
          res <- data.frame(stat= coin::statistic(coin_res),
                            p= coin::pvalue(coin_res))
        }
        #### rcompanion: pairwise permutation test ####
        if(test_method == "rcompanion"){
          correction_method <- "fdr"
          res <- rcompanion::pairwisePermutationTest(data = dat,
                                                     formula=as.formula(paste(metric,"~",predictor)),
                                                     method = correction_method) %>%
            dplyr::mutate(Comparison = gsub(" = 0","",Comparison),
                          adjust.method=correction_method) %>%
            tidyr::separate(col = "Comparison", sep=" - ", into=c("group1","group2"))
        }
        #### coin: Wilcoxon rank-sum ####
        if(test_method == "coin_wilcox_test"){
          # NOTE(review): coin::wilcox_test() takes a `distribution=`
          # argument, not `method=`; "exact" here is likely ignored -- confirm.
          coin_res <- coin::wilcox_test(data=dat,
                                        as.formula(paste(metric,"~",predictor)),
                                        method="exact")
          res <- data.frame(stat = coin::statistic(coin_res, type="standardized")[[1]],
                            z = coin::statistic(coin_res, type="standardized")[[1]],
                            stat_linear = coin::statistic(coin_res, type="linear")[[1]],
                            stat_centered =  coin::statistic(coin_res, type="centered")[[1]],
                            p = coin::pvalue(coin_res))
        }
        #### ggpubr ####
        if(test_method == "ggpubr_wilcox"){
          res <- ggpubr::compare_means(data = dat,
                                       formula = as.formula(paste(metric,"~",predictor)),
                                       method="wilcox.test")
        }
        #### stats ####
        if(test_method == "stats_wilcox.test"){
          res_wilcox <- stats::wilcox.test(as.formula(paste(metric,"~",predictor)),
                                           data=dat, conf.int=TRUE)
          # Approximate z from the two-sided p-value; r is an effect size.
          z <- stats::qnorm(res_wilcox$p.value/2)
          res <- data.frame(stat=res_wilcox$statistic,
                            z = z,
                            r = abs(z)/sqrt(nrow(dat)),
                            p=res_wilcox$p.value,
                            confInt_lower=res_wilcox$conf.int[1],
                            confInt_upper=res_wilcox$conf.int[2],
                            estimate=res_wilcox$estimate)
        }
        if(test_method == "stats_pairwise.wilcox.test"){
          res <- stats::pairwise.wilcox.test(x=dat[[metric]],
                                             g = dat[["SNP_group"]])
        }
        res$Metric <- metric
        res$test_method <- test_method
      }) ## End try()
      return(res)
    }, mc.cores = nThread) %>% data.table::rbindlist(fill=TRUE) %>%
      dplyr::mutate(SNP_group=snp_group)
  }) %>% data.table::rbindlist(fill=TRUE)

  # `save_path` doubles as a flag (FALSE) or an output path (character).
  if(!isFALSE(save_path)){
    dir.create(dirname(save_path), showWarnings = FALSE, recursive = TRUE)
    messager("VALIDATION:: Saving bootstrapping results ==>",save_path,v=verbose)
    data.table::fwrite(RES_GROUPS, save_path)
  }
  return(RES_GROUPS)
}





#' Conduct bootstrap procedure on multiple columns
#'
#' @family VALIDATION
#' @examples
#' \dontrun{
#' save_path <- root <-  "/sc/arion/projects/pd-omics/brian/Fine_Mapping/Data/GWAS/Nalls23andMe_2019/_genome_wide"
#'
#'
#' #### SURE MPRA #####
#' ## mean
#' path <- file.path(root,"SURE/Nalls23andMe_2019.SURE.snp_groups.mean.csv.gz")
#' metric_names <- "p"
#' ## raw
#' path <- file.path(root,"SURE/Nalls23andMe_2019.SURE.csv.gz")
#' metric_names <- c("k562.wilcox.p.value","hepg2.wilcox.p.value")
#' validation_method <- "SuRE MPRA"
#'
#' #### Dey_DeepLearning ####
#' ## mean
#' path <- file.path(root,"Dey_DeepLearning/Nalls23andMe_2019.Dey_DeepLearning.annot.Allelic_Effect.snp_groups_mean.csv.gz")
#' metric_names <- "value"; grouping_var="annot"
#' ## raw
#' path <- file.path(root,"Dey_DeepLearning/Nalls23andMe_2019.Dey_DeepLearning.annot.Allelic_Effect.csv.gz")
#' ## path <- file.path(root,"Dey_DeepLearning/Nalls23andMe_2019.Dey_DeepLearning.annot.Variant_Level.csv.gz")
#' validation_method = "Dey_DeepLearning"
#'
#'
#' # -----Import and process ------
#' metric_df <- data.table::fread(path, nThread=8)
#' ## metric_names <- grep("Basenji.*MAX|DeepSEA.*MAX|Roadmap.*MAX", colnames(metric_df), value = TRUE)
#' ## metric_df <- metric_df %>% dplyr::mutate(annot=paste(Model,Tissue,Assay,Type,Metric,sep="_"))
#' boot_res <- VALIDATION.bootstrap_multimetric(metric_df=metric_df, metric_names=metric_names, validation_method=validation_method, save_path=gsub("\\.csv\\.gz",".bootstrap.coin_wilcox_test.csv.gz",path), grouping_var=grouping_var, iterations=10000, nThread=12)
#' }
VALIDATION.bootstrap_multimetric <- function(metric_df,
                                             metric_names,
                                             validation_method=NULL,
                                             locus_means=TRUE,
                                             grouping_var=NULL,
                                             test_method="coin_wilcox_test",
                                             iterations=1000,
                                             nThread=1,
                                             save_path=FALSE,
                                             verbose=TRUE){
  # Run VALIDATION.bootstrap() once per metric column (or, in locus-means
  # mode, once per level of `grouping_var` using the first metric name) and
  # row-bind the results into a single table.
  if(locus_means){
    # Fail early with a clear message; metric_df[[NULL]] would otherwise
    # produce a cryptic error below.
    if(is.null(grouping_var)){
      stop("`grouping_var` must be supplied when locus_means=TRUE.",
           call. = FALSE)
    }
    RES_GROUPS <- lapply(unique(metric_df[[grouping_var]]), function(group,
                                                .validation_method=validation_method,
                                                .test_method=test_method,
                                                .metric_names=metric_names,
                                                .iterations=iterations,
                                                .nThread=nThread){
      # Locus-means mode uses a single shared metric column per group.
      metric <- .metric_names[[1]]
      message(group,":",metric)
      VALIDATION.bootstrap(metric_df[metric_df[[grouping_var]]==group,],
                           metric=metric,
                           validation_method = .validation_method,
                           locus_means = TRUE,
                           iterations=.iterations,
                           test_method=.test_method,
                           nThread = .nThread,
                           save_path=FALSE) %>%
        # Disambiguate the Metric label by prefixing the group name.
        dplyr::mutate(Metric = paste(group,metric,sep="_"))
    }) %>% data.table::rbindlist(fill=TRUE)
  } else {
    RES_GROUPS <- lapply(metric_names, function(metric,
                                                .validation_method=validation_method,
                                                .test_method=test_method,
                                                .iterations=iterations,
                                                .nThread=nThread){
      message(metric)
      VALIDATION.bootstrap(metric_df,
                           metric=metric,
                           validation_method = .validation_method,
                           # Fixed: previously not forwarded, so the inner
                           # call silently used its locus_means=TRUE default.
                           locus_means = FALSE,
                           test_method=.test_method,
                           iterations=.iterations,
                           nThread=.nThread,
                           save_path=FALSE)
    }) %>% data.table::rbindlist(fill=TRUE)
  }

  # `save_path` doubles as a flag (FALSE) or an output path (character).
  if(!isFALSE(save_path)){
    messager("VALIDATION:: Saving results ==>",save_path,v=verbose)
    data.table::fwrite(RES_GROUPS,save_path)
  }
  return(RES_GROUPS)
}





#' Test whether permutation p-value distributions are different
#'
#' @family VALIDATION
#' @examples
#' \dontrun{
#' root <- "/sc/arion/projects/pd-omics/brian/Fine_Mapping/Data/GWAS/Nalls23andMe_2019"
#' permute.IMPACT <-  data.table::fread(file.path(root,"_genome_wide/IMPACT/Nalls23andMe_2019.IMPACT.permutations.csv.gz"))
#' res <- VALIDATION.permute_compare_results(permute_res=permute.IMPACT)
#' }
VALIDATION.compare_bootstrap_distributions <- function(boot_res,
                                                       formula_str="stat ~ SNP_group"){
  # Compare bootstrap statistic distributions between SNP groups via a GLM,
  # and estimate a Poisson lambda for the metric's distribution.
  #
  # boot_res    : output of VALIDATION.bootstrap() (one row per iteration).
  # formula_str : "<metric> ~ SNP_group" formula as a string.
  #
  # Returns a data.frame of per-group coefficients:
  # Estimate/StdErr/z/p/signif plus the fitted `lambda`.
  #
  # Design notes (from earlier experiments, dead code removed):
  # - A Poisson family inside glm() removed the significance of the signal,
  #   so the default gaussian family is used here.
  # - Negative-binomial (MASS::glm.nb) is intended for count data.
  # - Plain wilcox/independence tests do not account for the zero-inflated
  #   nature of the p-value distributions; a GLM fit approximates them better.
  fit <- stats::glm(data = boot_res,
                    formula = as.formula(formula_str))
  print(summary(fit))

  ## Extract the coefficient table and p-value significance symbols.
  coefs <- data.frame(coef(summary(fit))) %>%
    `colnames<-`(c("Estimate","StdErr","z","p")) %>%
    tibble::rownames_to_column("SNP_group") %>%
    dplyr::mutate(SNP_group=gsub("SNP_group","",SNP_group),
                  signif = pvalues_to_symbols(p))

  ## Estimate lambda (Poisson rate) on the non-missing metric values.
  ## (Direct column access replaces the previous eval(parse()) subset.)
  metric <- strsplit(formula_str," ~ ")[[1]][1]
  metric_vals <- boot_res[[metric]][!is.na(boot_res[[metric]])]
  lambda <- MASS::fitdistr(x = metric_vals,
                           densfun = "Poisson")
  coefs$lambda <- as.numeric(lambda[[1]])
  return(coefs)
}




#' Aggregate permutation results
#'
#' Tidies a table of permutation results: orders the SNP groups, splits the
#' metric name into its components, and strips the Assay suffix back out of
#' the test column.
#'
#' @param permute_res data.frame/data.table of permutation results. Must
#'   contain `SNP_group` and a `.y.` column -- presumably the metric-name
#'   column emitted by ggpubr-style test output (TODO confirm upstream).
#' @return `permute_res` with `SNP_group` as character and `.y.` cleaned;
#'   the temporary Metric/Model/Tissue/Assay columns are dropped.
#' @family VALIDATION
VALIDATION.aggregate_permute_res <- function(permute_res){
  permute_agg <- permute_res %>%
    # Freeze the group order as encountered, and copy `.y.` so it can be
    # split without destroying the original column.
    dplyr::mutate(SNP_group = factor(SNP_group, levels = unique(permute_res$SNP_group), ordered = TRUE),
                  Metric=.y.) %>%
    # Metric names look like "Model_Tissue_Assay..." (underscore-delimited);
    # extra pieces beyond Assay are dropped.
    tidyr::separate(Metric, sep="_", into=c("Model","Tissue","Assay"), extra="drop", remove=FALSE) %>%
    # Remove any Assay token from `.y.` and flatten the factor back to character.
    dplyr::mutate(.y.=gsub(paste(unique(Assay),collapse="|"),"",.y.),
                  SNP_group=as.character(SNP_group)) %>%
    # Drop the helper columns; only the cleaned originals are returned.
    dplyr::select(-c(Metric,Model,Tissue,Assay))
  return(permute_agg)
}



#' Plot permutation test results
#'
#' @family VALIDATION
#' @examples
#' \dontrun{
#' root <- "/sc/arion/projects/pd-omics/brian/Fine_Mapping/Data/GWAS/Nalls23andMe_2019/_genome_wide"
#'
#' #### h2 ####
#' validation_method <- "S-LDSC heritability"
#' ## mean
#' path <- file.path(root,"PolyFun/Nalls23andMe_2019.h2_enrich.snp_groups.bootstrap.coin_wilcox_test.csv.gz")
#' ## raw
#' path <-  file.path(root,"PolyFun/Nalls23andMe_2019.h2.bootstrap.coin_wilcox_test.csv.gz")
#'
#' #### IMPACT ####
#' validation_method <- "IMPACT"
#' ### mean
#' path <- file.path(root,"IMPACT/TOP_IMPACT_all.bootstrap.coin_wilcox_test.csv.gz")
#' ### raw
#' path <- file.path(root,"IMPACT/Nalls23andMe_2019.IMPACT_score.permutations.csv.gz")
#'
#' #### SURE MPRA ####
#' validation_method <- "SuRE MPRA"
#' ## mean
#' path <- file.path(root,"SURE/Nalls23andMe_2019.SURE.snp_groups.mean.bootstrap.stats_wilcox.test.csv.gz")
#' ### raw
#' path <- file.path(root,"SURE/Nalls23andMe_2019.SURE.bootstrap.stats_wilcox.test.csv.gz")
#'
#'
#' #### Dey_DeepLearning ####
#' validation_method <- "Dey_DeepLearning"
#' ## mean
#' path <- file.path(root,"Dey_DeepLearning/Nalls23andMe_2019.Dey_DeepLearning.annot.Allelic_Effect.snp_groups_mean.bootstrap.coin_wilcox_test_subset.csv.gz")
#' ## raw
#' path <- file.path(root,"Dey_DeepLearning/Nalls23andMe_2019.Dey_DeepLearning.annot.Allelic_Effect.bootstrap.stats_wilcox.test.csv.gz")
#' path <- file.path(root,"Dey_DeepLearning/Nalls23andMe_2019.Dey_DeepLearning.annot.Variant_Level.bootstrap.stats_wilcox.test.csv.gz")
#'
#' # ------  Import
#' boot_res <-  data.table::fread(path, nThread=1)
#' if(validation_method=="SuRE MPRA") boot_res$z <- -boot_res$z
#'
#' # ---Plot ---
#' library(patchwork)
#' if(validation_method=="Dey_DeepLearning") boot_res <- data.frame(boot_res)[grepl("H3K4ME3", boot_res$Metric, fixed=TRUE) & grepl("Basenji", boot_res$Metric, fixed=TRUE) & grepl("_MAX_", boot_res$Metric, fixed=TRUE),]
#' facet_formula <- if(validation_method=="Dey_DeepLearning") "Assay  ~ Model" else ".~."
#' facet_formula <- if(validation_method=="SuRE MPRA") ".  ~ Cell_type" else ".~."
#' gp1 <- VALIDATION.bootstrap_plot(boot_res=boot_res, validation_method=validation_method, y_var="z", save_plot=gsub("\\.csv\\.gz",".stat_values.png",path), width=9, facet_formula=facet_formula, override_metric_count = TRUE)
#' gp2 <- VALIDATION.bootstrap_plot(boot_res=subset(boot_res, stat>0), validation_method=validation_method, y_var="p", save_plot=gsub("\\.csv\\.gz",".p_values.png",path), width=9,facet_formula = facet_formula, override_metric_count = TRUE)
#' gp12 <- (gp1 / gp2) + patchwork::plot_annotation(tag_levels = "a")
#' ggsave(gsub("\\.csv\\.gz",".png",path),gp12, dpi=400, height=9, width=15)
#'
#' }
VALIDATION.bootstrap_plot <- function(boot_res,
                                      validation_method=NULL,
                                      facet_formula=". ~ .",
                                      y_var="z",
                                      override_metric_count=FALSE,
                                      remove_random=TRUE,
                                      show_plot=TRUE,
                                      save_plot=FALSE,
                                      box.padding=.5,
                                      font_size=3,
                                      height=5,
                                      width=7,
                                      verbose=TRUE){
  # Density plot of bootstrapped statistics (`y_var`) per SNP group,
  # annotated at each group's density peak with GLM-derived p-values/z-scores.
  #
  # validation_method : NULL or an assay name; "Dey_DeepLearning" and
  #                     "SuRE MPRA" trigger metric-name parsing for faceting.
  # save_plot         : FALSE, or a file path to save the figure to.
  library(ggplot2)
  colorDict <- echodata::snp_group_colorDict()
  if(remove_random) boot_res <- subset(boot_res, SNP_group!="Random")
  plot_dat <- boot_res %>%
    dplyr::mutate(SNP_group = factor(SNP_group, levels = names(colorDict), ordered = TRUE))
  boot_res <- boot_res %>%
    dplyr::mutate(SNP_group = factor(SNP_group, levels = names(colorDict)))

  # Conduct GLM on the bootstrap distributions (one per metric when several).
  metric_count <- length(unique(boot_res$Metric))
  # `||`: scalar short-circuit logic (was the elementwise `|`).
  if(metric_count > 1 || override_metric_count){
    messager("VALIDATION:: Facetting plots by metric.",v=verbose)
    glm_res <- lapply(unique(boot_res$Metric), function(metric){
      print(metric)
      VALIDATION.compare_bootstrap_distributions(boot_res=subset(boot_res, Metric==metric),
                                                 formula_str = paste(y_var,"~ SNP_group")) %>%
        dplyr::mutate(Metric=metric)
    }) %>% data.table::rbindlist()
    # Postprocess: drop the intercept row; fix the group ordering.
    glm_res <- glm_res %>%
      subset(SNP_group!="(Intercept)") %>%
      dplyr::mutate(SNP_group = factor(SNP_group, levels = unique(SNP_group), ordered = TRUE))

    # isTRUE() keeps these checks NULL-safe: validation_method defaults to
    # NULL, and `NULL == "..."` yields logical(0), which crashes if().
    if(isTRUE(validation_method == "Dey_DeepLearning")){
      glm_res <- glm_res %>%
        tidyr::separate(Metric, sep="_", into=c("Model","Tissue","Assay"), extra="drop", remove=FALSE)
      plot_dat <- plot_dat %>%
        tidyr::separate(Metric, sep="_", into=c("Model","Tissue","Assay"), extra="drop", remove=FALSE)
    }
    if(isTRUE(validation_method == "SuRE MPRA")){
      glm_res <- glm_res %>%
        tidyr::separate(Metric, sep="_", into=c("unit","Cell_type"), extra="drop", remove=FALSE)
      plot_dat <- plot_dat %>%
        tidyr::separate(Metric, sep="_", into=c("unit","Cell_type"), extra="drop", remove=FALSE)
    }
    # Diagnostic violin plot of the per-metric GLM p-values.
    glm_violin <-
      ggplot(data=glm_res, aes(x=SNP_group, y=p, fill=SNP_group)) +
      geom_violin(alpha=.5) +
      geom_boxplot(alpha=.5) +
      ggpubr::yscale("-log1p", .format = TRUE) +
      geom_jitter(height = 0, alpha=.1) +
      scale_fill_manual(values = colorDict) +
      facet_grid(facets = as.formula(facet_formula),
                 scales="free_y") +
      theme_bw() +
      theme(axis.text.x = element_text(angle=45, hjust=1))
    print(glm_violin)
  } else {
    glm_res <- VALIDATION.compare_bootstrap_distributions(boot_res=boot_res,
                                                          formula_str = paste(y_var,"~ SNP_group"))
  }

  # Main density plot of the bootstrap distributions.
  iterations <- max((plot_dat %>% dplyr::group_by(SNP_group,Metric) %>% dplyr::tally())$n)
  # `.data[[y_var]]` replaces eval(parse(text=y_var)) for programmatic aes().
  gp <- ggplot(data = plot_dat, aes(x=.data[[y_var]], fill=SNP_group, color=SNP_group,
                                    linetype=SNP_group)) +
    geom_density(position = "identity", adjust=1, alpha=.1) +
    scale_fill_manual(values = colorDict) +
    scale_color_manual(values = colorDict) +
    labs(title=paste("Bootstrapped tests:",validation_method,paste0("(",iterations," iterations)")),
         x=paste0("bootstrapped ",y_var,"-values")) +
    facet_grid(facets = as.formula(facet_formula)) +
    theme_bw() +
    theme(strip.background = element_rect(fill="grey20"),
          strip.text = element_text(color='white'))

  # Label each group's density peak with its GLM p-value/z-score.
  if(metric_count == 1 || override_metric_count){
    b <- ggplot_build(gp)
    density_peaks <- b$data[[1]] %>%
      dplyr::group_by(fill) %>%
      dplyr::top_n(n = 1, wt = y) %>%
      # Map the fill color back to its SNP group name.
      dplyr::mutate(SNP_group = setNames(names(colorDict), unname(colorDict))[[fill]]) %>%
      data.table::data.table() %>%
      data.table::merge.data.table(glm_res, by="SNP_group") %>%
      # Guard against p==0 before multiplying/adjusting.
      dplyr::mutate(p=ifelse(p==0, .Machine$double.xmin, p)) %>%
      dplyr::mutate(p_adj= stats::p.adjust(p = p, method="fdr"),
                    p_norm= ifelse(p*10000*nrow(.)>1, 1, p*10000*nrow(.))) %>%
      dplyr::mutate(signif = pvalues_to_symbols(p_norm))
    gp_lab <- gp +
      ggrepel::geom_label_repel(data=density_peaks,
                                aes(x=x, y=y, label=paste(SNP_group,"\n",
                                                          "p =",format(as.numeric(p_norm), digits = 2), signif,"\n",
                                                          "z =",format(as.numeric(z), digits = 2),paste0("(se = ",format(as.numeric(StdErr), digits = 2),")")
                                                          )
                                    ),
                                show.legend = FALSE,
                                box.padding =  box.padding,
                                alpha=.75, segment.alpha = .5,
                                color="black",
                                size=font_size)
  } else {
    gp_lab <- gp
  }

  if(show_plot) print(gp_lab)
  # `save_plot` doubles as a flag (FALSE) or an output path (character).
  if(!isFALSE(save_plot)){
    ggsave(save_plot, gp_lab, dpi=400, height=height, width=width)
  }
  return(gp_lab)
}








#' Check whether Support level correlates with some variable
#'
#' @examples
#' \dontrun{
#' # S-LDSC h2
#' h2_stats <- data.table::fread("/sc/arion/projects/pd-omics/brian/Fine_Mapping/Data/GWAS/Nalls23andMe_2019/_genome_wide/PolyFun/Nalls23andMe_2019.grouped_snpvar_stats.csv.gz")
#' }
VALIDATION.compare_support <- function(plot_dat,
                                       variable="SNPVAR_min",
                                       method="wilcox.test"){
  # Violin plot testing whether `variable` differs across fine-mapping
  # Support levels (pairwise tests between every pair of levels).
  #
  # Fixed: the `plot_dat` argument was previously overwritten by a global
  # `h2_stats` object; `variable` and `method` were hard-coded locals and
  # are now parameters (defaults preserve the old values).
  #
  # plot_dat : data.frame with a `Support` column and the `variable` column.
  # variable : numeric column to compare (log1p-transformed for display).
  # method   : test passed to ggpubr::stat_compare_means().
  plot_dat$log <- log1p(plot_dat[[variable]])
  # All pairwise Support-level comparisons.
  comparisons <- utils::combn(x = as.character(unique(plot_dat$Support)),
                              m = 2,
                              simplify = FALSE) %>% purrr::compact()
  ggpubr::ggviolin(data = plot_dat,
                   x="Support", y="log",
                   fill="Support", alpha=.5,
                   add="boxplot", add.params = list(alpha=.1)) +
    ggpubr::stat_compare_means(method=method, comparisons = comparisons,
                               label = "p.signif") +
    ggplot2::geom_jitter(alpha=.1, height=0)
}
# Source: RajLabMSSM/echolocatoR documentation built on Jan. 29, 2023, 6:04 a.m.