R/validate_associations_mkg.R

Defines functions validate_associations_mkg

Documented in validate_associations_mkg

#' Validate associations with the Monarch Knowledge Graph
#'
#' This function validates the associations between phenotypes and cell types
#' with the Monarch Knowledge Graph (MKG). It computes the number of phenotypes
#' and cell types that are present in the MKG and the number of associations
#' that are present in the MKG.
#' @inheritParams prioritise_targets
#' @param kg A data.table with phenotype-cell type relationships
#' ("from" and "to" columns, respectively) gathered from
#'  the Monarch Knowledge Graph.
#' @param cl Cell Ontology object.
#' @param alt_ids Alternative IDs to map CL IDs into.
#' @param metric Ontological distance metric to use for plotting.
#' @export
#' @examples
#' results <- load_example_results()
#' kg <- validate_associations_mkg(result=results)
validate_associations_mkg <- function(results=load_example_results(),
                                      kg=get_data("monarch_kg_cells.csv"),
                                      q_threshold=0.05,
                                      cl=KGExplorer::get_ontology(
                                        name = "cl",
                                        remove_cyclic_paths = TRUE,
                                        remove_rings = TRUE),
                                      alt_ids=c("CL:0000111"="CL:2000032"),
                                      metric = c("dist_nca.min_adj",
                                                 "dist_lca.min_adj",
                                                 "dist_nca.min",
                                                 "dist_lca.min")[1]
                                      ){
  from <- hpo_id <- NULL;
  # kg <- data.table::fread(here::here("data/monarch_kg_cells.csv"))
  add_logfc(results)
  kg_og <- data.table::copy(kg)
  kg <- kg[grepl("HP:",from)][from %in% unique(results$hpo_id)]
  message(paste(
    "Remaining:",length(unique(kg$from)),
    "phenotypes across",length(unique(kg$to)),"celltypes."
  ))
  #### Merge results with Knowledge Graph ####
  kg_res <- data.table::merge.data.table(
    kg[,c("from","to","label.to")],
    results[q<q_threshold],
    by.x="from",
    by.y="hpo_id") |>
    data.table::setorderv(c("from","logFC"), c(1,-1))
  #### Compute average number of celltypes / phenotype ####
  # kg[,list(N=data.table::uniqueN(to)), by="from"]$N|>summary()
  # kg_res[,list(N=data.table::uniqueN(CellType)), by="from"]$N|>summary()

  missing_phenos <- setdiff(kg$from, kg_res$from)
  proportion_phenotypes_captured <- (1-(length(missing_phenos) /
                                        length(unique(kg$from))))
  message(format(proportion_phenotypes_captured*100,digits = 4),
          "% phenotypes recovered.")
  #### Get missing phenotypes and associations ####
  kg_missing <- kg[from%in% missing_phenos]
  res_missing <- results[hpo_id%in% missing_phenos]
  #### Compute increase in knowledge ####
  # n_before <- kg$from|>unique()|>length()
  # n_after <- res[q<0.05]$hpo_id|>unique()|>length()
  n_before <- nrow(kg)
  n_after <- nrow(results[q<q_threshold])
  #### Compute % recall at each cell type ontological distance #####
  cl_distance_results <- validate_associations_mkg_dist(
    results=results,
    kg=kg,
    cl=cl,
    q_threshold=q_threshold,
    alt_ids=alt_ids,
    metric = metric)
  return(
    list(
      kg=kg_og,
      kg_filt=kg,
      kg_res=kg_res,
      proportion_phenotypes_captured=proportion_phenotypes_captured,
      missing_phenos=missing_phenos,
      kg_missing=kg_missing,
      res_missing=res_missing,
      n_before=n_before,
      n_after=n_after,
      increased_knowledge=n_after/n_before,
      cl_distance_results=cl_distance_results
    )
  )
}
neurogenomics/MultiEWCE documentation built on Sept. 28, 2024, 2:27 a.m.