#' Check annotations from GPT
#'
#' Check GPT phenotype annotations using a several metrics.
#' @param annot GPT-generated phenotype annotations.
#' @param query_hits A named list of HPO IDs for each query,
#' generated by \link[HPOExplorer]{search_hpo}.
#' These will be used as the ground truth when trying to identify
#' true positive annotations.
#' @param response_map A named list of responses to map onto.
#' Used for standardisation of responses.
#' @param pos_values Positive values.
#' @param neg_values Negative values.
#' @param verbose Print messages.
#' @returns Named list
#'
#' @export
#' @import data.table
#' @importFrom stats na.omit
#' @examples
#' checks <- gpt_annot_check()
gpt_annot_check <- function(annot = gpt_annot_read(),
query_hits = search_hpo(),
response_map = list(
no="never",
yes="always"
),
pos_values=c("sometimes","often","always"),
neg_values = c("never","rarely"),
verbose = TRUE
){
pheno_count <- NULL;
#### Proportion of HPO_IDs annotated before/after chatGPT ####
# hpo <- get_hpo()
# prior_ids <- unique(HPOExplorer::hpo_modifiers$hpo_id)
# new_ids <- unique(annot$hpo_id)
# length(new_ids)/length(prior_ids)
# length(prior_ids)/length(hpo@terms)
# length(new_ids)/length(hpo@terms)
#### Check annotation consistency ####
nm <- names(annot)[!grepl("hpo_name|justification|hpo_id|hpo_name|pheno_count",names(annot),
ignore.case = TRUE)]
response_counts <- table(tolower(unlist(annot[,nm,with=FALSE])), useNA = "always")
# opts <- unlist(sapply(annot[,nm,with=FALSE], unique)) |> unique()
#### Standardise responses ####
for(n in nm){
annot[,(n):=tolower(get(n))]
annot[,(n):=ifelse(get(n) %in% names(response_map),response_map[[get(n)]], get(n)), by=.I]
annot[,(n):=ifelse(get(n) %in% c(pos_values,neg_values),get(n),NA), by=.I]
}
response_counts_standard <- table(tolower(unlist(annot[,nm,with=FALSE])), useNA = "always")
#### Compute number of non-negative answers within each column####
if(nrow(annot[pheno_count>1])==0){
messager("No duplicate phenotypes to check consistency for.")
annot_mean <- NULL
}else {
#### Check for relaxed consistency (only distguish between positive and negative )####
annot_mean <- annot[pheno_count>1][,lapply(.SD,function(x){
mean(!na.omit(x) %in% neg_values)
}),.SDcols=nm,by="hpo_name"]
#### Check for stringent consistency ####
annot_stringent_mean <- annot[pheno_count>1][,lapply(.SD,function(x){
data.table::uniqueN(x)==1
}),.SDcols=nm,by="hpo_name"]
}
#### Check ontology classifications #####
annot_check <- lapply(seq(nrow(annot)), function(i){
r <- annot[i,]
cbind(
r[,c("hpo_name","hpo_id")],
lapply(stats::setNames(names(query_hits),
names(query_hits)),
function(x){
if(r$hpo_id %in% query_hits[[x]]){
tolower(r[,x,with=FALSE][[1]]) %in% pos_values
} else {
NA
}
}) |> data.table::as.data.table()
)
}) |> data.table::rbindlist()
#### Compute consistency within each column ####
if(!is.null(annot_mean)){
consistency_count <- sapply(annot_mean[,-c("hpo_name")],
function(x)nrow(annot_mean))
consistency_rate <- sapply(annot_mean[,-c("hpo_name")],
function(x)sum(x%in%c(0,1)/nrow(annot_mean)))
consistency_stringent_rate <- sapply(annot_stringent_mean[,-c("hpo_name")],
function(x)sum(x)/nrow(annot_mean))
} else {
consistency_count <- consistency_rate <- consistency_stringent_rate <- NULL
}
### Proportion of rows where annotation is not NA
checkable_rate <- sapply(
annot_check[,names(query_hits),with=FALSE],
function(x){sum(!is.na(x))/length(x)})
checkable_count <- sapply(
annot_check[,names(query_hits),with=FALSE],
function(x){sum(!is.na(x))})
### Proportion of rows where the annotation was checkable and TRUE
true_pos_rate <- sapply(
annot_check[,names(query_hits),with=FALSE],
function(x){sum(stats::na.omit(x)==TRUE)/length(stats::na.omit(x))})
### Proportion of rows where the annotation was checkable and FALSE
false_neg_rate <- sapply(
annot_check[,names(query_hits),with=FALSE],
function(x){sum(stats::na.omit(x)==FALSE)/length(stats::na.omit(x))})
#### Gather results ####
checks <- list(
annot=annot,
annot_mean=annot_mean,
annot_stringent_mean=annot_stringent_mean,
consistency_count=consistency_count,
consistency_rate=consistency_rate,
consistency_stringent_count=consistency_count,
consistency_stringent_rate=consistency_stringent_rate,
annot_check=annot_check,
checkable_rate=checkable_rate,
checkable_count=checkable_count,
true_pos_count=checkable_count,
true_pos_rate=true_pos_rate,
false_neg_rate=false_neg_rate,
response_counts=response_counts,
response_counts_standard=response_counts_standard
)
#### Plot ####
checks[["plot"]] <- gpt_annot_check_plot(checks=checks)
#### Return ####
return(checks)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.