#' Read annotations from GPT
#'
#' Read in phenotype annotations generated by GPT and
#' do some initial preprocessing (e.g. adding HPO IDs).
#'
#' Two files are retrieved with \code{get_data}: the main annotation table
#' (\code{"gpt4_hpo_annotations.csv.gz"}) and a table of corrected rows
#' (\code{"mismatched_hpo_names_fixed.csv.gz"}). Rows of the main table whose
#' \code{hpo_name} appears in the corrected table are replaced by the
#' corrected rows. Empty strings are then converted to \code{NA} and a
#' \code{pheno_count} column (number of rows per \code{hpo_name}) is added.
#' @inheritParams main
#' @inheritParams make_
#' @inheritParams get_
#' @param force_new If \code{TRUE}, the data will be downloaded from GitHub
#' even if it already exists locally.
#' @param verbose Print messages.
#' @param include_nogenes Include phenotypes with no associated genes.
#' If \code{FALSE}, only rows whose \code{hpo_id} is present in
#' \code{phenotype_to_genes} are returned.
#' @inheritDotParams get_data
#' @returns data.table of phenotype annotations
#'
#' @export
#' @examples
#' gpt_annot <- gpt_annot_read()
gpt_annot_read <- function(save_dir=KGExplorer::cache_dir(package="HPOExplorer"),
                           phenotype_to_genes = load_phenotype_to_genes(save_dir = save_dir),
                           force_new=FALSE,
                           hpo=get_hpo(),
                           include_nogenes=TRUE,
                           verbose=TRUE,
                           ...){
  # Bind column names used via data.table non-standard evaluation
  # so that R CMD check does not flag them as undefined globals.
  pheno_count <- hpo_name <- hpo_id <- NULL

  #### Import main annotation table ####
  save_path <- get_data(file = "gpt4_hpo_annotations.csv.gz",
                        save_dir = save_dir,
                        overwrite = force_new,
                        ...)
  d <- data.table::fread(save_path, header = TRUE)
  # Older versions of the file may call the name column "phenotype".
  data.table::setnames(d,"phenotype","hpo_name", skip_absent = TRUE)
  d <- d[!is.na(hpo_name)]
  d <- add_hpo_id(d, hpo = hpo)

  #### Add subset with fixed hpo_names ####
  # https://github.com/neurogenomics/RareDiseasePrioritisation/issues/31#issuecomment-1989079044
  save_path_fixmap <- get_data(file = "mismatched_hpo_names_fixed.csv.gz",
                               save_dir = save_dir,
                               overwrite = force_new,
                               ...)
  fixmap <- data.table::fread(save_path_fixmap)
  # Drop the rows that have a corrected counterpart, then append the
  # corrected rows (fill=TRUE tolerates differing column sets).
  d <- rbind(d[!hpo_name %in% unique(fixmap$hpo_name)],
             fixmap, fill=TRUE)

  #### Check phenotype names ####
  # Left join on both key columns: the right-hand table only contains the
  # key columns themselves, so no new columns are added here.
  d <- merge(d,
             unique(phenotype_to_genes[,c("hpo_id","hpo_name")]),
             all.x = TRUE,
             by=c("hpo_name","hpo_id"))

  #### Convert empty strings to NA ####
  # NOTE: the round trip through data.frame() uses base R's default
  # check.names behaviour, so non-syntactic column names may be altered.
  d <- data.frame(d)
  d[d==""] <- NA
  d <- data.table::data.table(d)

  #### Optionally drop phenotypes without gene associations ####
  # Previously `include_nogenes` was accepted but ignored.
  if(isFALSE(include_nogenes)){
    ids_with_genes <- unique(phenotype_to_genes$hpo_id)
    d <- d[hpo_id %in% ids_with_genes]
  }

  #### Count rows per phenotype name ####
  d[,pheno_count:=table(d$hpo_name)[hpo_name]]

  #### Ensure no phenos are missing HPO IDs ####
  missing_phenos <- length(unique(d[is.na(hpo_id)]$hpo_name))
  if(missing_phenos>0){
    messager(missing_phenos,
             "phenotypes do not have matching HPO IDs.",v=verbose)
  }
  # Count only real IDs: NA would otherwise be tallied as one extra phenotype.
  n_phenos <- length(unique(d$hpo_id[!is.na(d$hpo_id)]))
  messager("Reading in GPT annotations for",
           formatC(n_phenos,big.mark = ","),
           "phenotypes.",
           v=verbose)
  return(d)
}
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.