R/gpt_annot_read.R

Defines functions gpt_annot_read

Documented in gpt_annot_read

#' Read annotations from GPT
#'
#' Read in phenotype annotations generated by GPT and
#'  do some initial preprocessing (e.g. adding HPO IDs).
#'
#' @details
#' Processing steps:
#' \enumerate{
#'   \item Read \code{"gpt4_hpo_annotations.csv.gz"}, rename the
#'   \code{phenotype} column to \code{hpo_name} (if present), drop rows with
#'   a missing \code{hpo_name}, and add HPO IDs.
#'   \item Read \code{"mismatched_hpo_names_fixed.csv.gz"} and use its rows to
#'   replace any rows of the main table that share an \code{hpo_name}.
#'   \item Merge with the unique \code{hpo_id}/\code{hpo_name} pairs in
#'   \code{phenotype_to_genes}, keeping all annotation rows.
#'   \item Optionally drop phenotypes without gene associations
#'   (see \code{include_nogenes}).
#'   \item Convert empty strings to \code{NA} and add a \code{pheno_count}
#'   column giving the number of rows per \code{hpo_name}.
#' }
#' @inheritParams main
#' @inheritParams make_
#' @inheritParams get_
#' @param force_new If \code{TRUE}, the data will be downloaded from GitHub
#' even if it already exists locally.
#' @param verbose Print messages.
#' @param include_nogenes Include phenotypes with no associated genes.
#' If \code{FALSE}, only rows whose \code{hpo_id} appears in
#' \code{phenotype_to_genes} are kept (rows with a missing \code{hpo_id}
#' are therefore dropped as well).
#' @inheritDotParams get_data
#' @returns data.table of phenotype annotations, including the columns
#' \code{hpo_name}, \code{hpo_id} and \code{pheno_count}.
#'
#' @export
#' @examples
#' gpt_annot <- gpt_annot_read()
gpt_annot_read <- function(save_dir=KGExplorer::cache_dir(package="HPOExplorer"),
                           phenotype_to_genes = load_phenotype_to_genes(save_dir = save_dir),
                           force_new=FALSE,
                           hpo=get_hpo(),
                           include_nogenes=TRUE,
                           verbose=TRUE,
                           ...){
  # Bind the column names that are used unquoted in data.table expressions.
  pheno_count <- hpo_name <- hpo_id <- NULL

  #### Read the main annotation table ####
  save_path <- get_data(file = "gpt4_hpo_annotations.csv.gz",
                        save_dir = save_dir,
                        overwrite = force_new,
                        ...)
  d <- data.table::fread(save_path, header = TRUE)
  data.table::setnames(d,"phenotype","hpo_name", skip_absent = TRUE)
  d <- d[!is.na(hpo_name)]
  d <- add_hpo_id(d, hpo = hpo)

  #### Add subset with fixed hpo_names ####
  # https://github.com/neurogenomics/RareDiseasePrioritisation/issues/31#issuecomment-1989079044
  save_path_fixmap <- get_data(file = "mismatched_hpo_names_fixed.csv.gz",
                               save_dir = save_dir,
                               overwrite = force_new,
                               ...)
  fixmap <- data.table::fread(save_path_fixmap)
  # Rows whose name has a corrected entry are replaced by the corrected rows;
  # fill=TRUE tolerates columns that exist in only one of the two tables.
  d <- rbind(d[!hpo_name %in% unique(fixmap$hpo_name)],
             fixmap, fill=TRUE)

  #### Check phenotype names ####
  d <- merge(d,
             unique(phenotype_to_genes[,c("hpo_id","hpo_name")]),
             all.x = TRUE,
             by=c("hpo_name","hpo_id"))

  #### Optionally drop phenotypes with no associated genes ####
  # Previously `include_nogenes` was accepted but never used; honour it here.
  if(isFALSE(include_nogenes)){
    ids_with_genes <- unique(phenotype_to_genes$hpo_id)
    d <- d[hpo_id %in% ids_with_genes]
  }

  #### Convert empty strings to NA ####
  # Round-trip through data.frame so the `d==""` mask can be applied across
  # all columns in a single assignment.
  d <- data.frame(d)
  d[d==""] <- NA
  d <- data.table::data.table(d)

  #### Count rows per phenotype name ####
  d[,pheno_count:=table(d$hpo_name)[hpo_name]]

  #### Ensure no phenos are missing HPO IDs ####
  missing_phenos <- length(unique(d[is.na(hpo_id)]$hpo_name))
  if(missing_phenos>0){
    messager(missing_phenos,
             "phenotypes do not have matching HPO IDs.",v=verbose)
  }
  messager("Reading in GPT annotations for",
           formatC(length(unique(d$hpo_id)),big.mark = ","),
           "phenotypes.",
           v=verbose)
  return(d)
}
neurogenomics/HPOExplorer documentation built on Aug. 24, 2024, 1:39 a.m.