# data-raw/drug_utilities.R

## Read URL
#'
#' Reads the text behind a URL (or any file path / connection description
#' accepted by `readLines()`) line by line, turning failures into sentinel
#' values instead of stopping. The original header describes its use for
#' PubChem public user gateway (PUG) queries; the function itself does no
#' parsing of the response.
#'
#' @param q_url Query URL
#' @return A character vector with one element per line read. `NA` if
#'   reading signals an error, `NULL` if reading signals a warning (a
#'   warning aborts the read; e.g. a resource that cannot be opened
#'   typically warns before it errors, so it yields `NULL`).
#'
readUrl <- function(q_url) {
  tryCatch(
    readLines(con = q_url, warn = FALSE),
    # Sentinel for hard failures
    error = function(cond) NA,
    # Sentinel for reads that raised a warning
    warning = function(cond) NULL
  )
}


### NCI DRUG DISPLAY LABELS

#' Load NCI Thesaurus display labels
#'
#' Reads `<path_data_raw>/nci_thesaurus/ThesaurusLabels.tsv`. If
#' `ThesaurusInferred.owl` is present in the same folder and the labels file
#' is missing (or `overwrite` is `TRUE`), the labels file is first
#' (re)generated by running `data-raw/parse_owl_labels.py` (path relative to
#' the working directory) and redirecting its output to the labels file.
#'
#' @param path_data_raw Directory holding the raw data; must contain an
#'   `nci_thesaurus` sub-folder.
#' @param overwrite Regenerate the labels file even if it already exists?
#' @param python_bin Python interpreter used to run the OWL parser. The
#'   default is the interpreter path that was previously hard-coded.
#' @return A data frame with columns `nci_t`, `nci_cd_name` and `cui`,
#'   without rows where `nci_t` is missing.
#'
process_nci_labels <- function(
    path_data_raw,
    overwrite = FALSE,
    python_bin = "/Users/sigven/miniconda3/bin/python") {

  if (!is.character(path_data_raw) || length(path_data_raw) != 1L ||
      is.na(path_data_raw)) {
    stop("'path_data_raw' must be a single directory path", call. = FALSE)
  }

  nci_dir <- file.path(path_data_raw, "nci_thesaurus")
  nci_thesaurus_labels_filepath <- file.path(nci_dir, "ThesaurusLabels.tsv")
  nci_thesaurus_owl_filepath <- file.path(nci_dir, "ThesaurusInferred.owl")

  # Scalar conditions: use short-circuiting operators
  if (file.exists(nci_thesaurus_owl_filepath) &&
      (!file.exists(nci_thesaurus_labels_filepath) || isTRUE(overwrite))) {
    # Quote every path so that spaces/special characters survive the shell
    system(paste0(
      shQuote(python_bin), " data-raw/parse_owl_labels.py ",
      shQuote(nci_thesaurus_owl_filepath),
      " > ", shQuote(nci_thesaurus_labels_filepath)))
  }

  # Fail with an explicit message instead of a generic connection error
  if (!file.exists(nci_thesaurus_labels_filepath)) {
    stop("NCI Thesaurus labels file not found: ",
         nci_thesaurus_labels_filepath, call. = FALSE)
  }

  nci_labels <- read.table(
    file = nci_thesaurus_labels_filepath, sep = "\t", comment.char = "",
    stringsAsFactors = FALSE, quote = "", header = FALSE) |>
    magrittr::set_colnames(c('nci_t','nci_cd_name','cui')) |>
    dplyr::filter(!is.na(nci_t))

  return(nci_labels)
}

### CHEMBL-PUBCHEM COMPOUND CROSS-REFERENCE

#' ChEMBL - PubChem compound cross-reference
#'
#' Reads the UniChem whole-source mapping between source 1 and source 22
#' (ChEMBL and PubChem according to the section header and the column names
#' assigned here) from
#' `<path_data_raw>/chembl/chembl_pubchem_mapping_<datestamp>.txt.gz`,
#' downloading it from the EBI FTP server first if the file is missing or
#' `update` is `TRUE`.
#'
#' @param datestamp Date tag used in the local file name.
#' @param chembl_release Release label stored in column `chembl_db_version`.
#' @param path_data_raw Directory holding the raw data; must contain a
#'   `chembl` sub-folder.
#' @param update Force a fresh download even if the local file exists?
#' @return A data frame with columns `molecule_chembl_id`, `pubchem_cid`
#'   and `chembl_db_version`.
#'
get_chembl_pubchem_xref <- function(datestamp = '20241024',
                                             chembl_release = "v34",
                                             path_data_raw = NULL,
                                             update = FALSE){
  # The NULL default previously surfaced as "argument is of length zero"
  if (!is.character(path_data_raw) || length(path_data_raw) != 1L ||
      is.na(path_data_raw)) {
    stop("'path_data_raw' must be a single directory path", call. = FALSE)
  }

  chembl_pubchem_xref_fname <- file.path(
    path_data_raw, "chembl",
    paste0("chembl_pubchem_mapping_", datestamp,".txt.gz"))

  if (!file.exists(chembl_pubchem_xref_fname) || isTRUE(update)) {
    download.file(
      "ftp://ftp.ebi.ac.uk/pub/databases/chembl/UniChem/data/wholeSourceMapping/src_id1/src1src22.txt.gz",
      destfile = chembl_pubchem_xref_fname,
      # gzip payload: always transfer as binary
      mode = "wb")
  }

  # First line of the mapping file is skipped (treated as a header)
  chembl_pubchem_compound_xref <- read.table(
    gzfile(chembl_pubchem_xref_fname), header = FALSE, skip = 1,
    stringsAsFactors = FALSE, sep = "\t", quote = "") |>
    magrittr::set_colnames(c('molecule_chembl_id','pubchem_cid')) |>
    dplyr::mutate(chembl_db_version = chembl_release)

  return(chembl_pubchem_compound_xref)
}



### TARGETED ANTICANCER COMPOUNDS FROM OPEN TARGETS
#' Drug-target-indication records from Open Targets, flagged for cancer relevance
#'
#' Reads a pre-built Open Targets drug table
#' (`<path_data_raw>/opentargets/opentargets_drugs_<ot_version>.rds`),
#' annotates each record with a `drug_cancer_relevance` label, adds
#' per-drug indication statistics, collapses clinical identifiers/sources,
#' recomputes the maximum clinical phase per drug, and returns the records
#' of non-withdrawn compounds (cancer-relevant records plus records of all
#' compounds without any cancer-relevant record).
#'
#' @param path_data_raw Directory holding the raw data; also passed as
#'   `cache_dir` to `phenOncoX::get_terms()`.
#' @param ot_version Open Targets version tag used in the RDS file name.
#' @return A data frame of distinct drug records; in addition to the columns
#'   selected from the RDS file it carries `cui`, `cui_name`, `primary_site`,
#'   `drug_cancer_relevance`, `drug_n_indications`,
#'   `drug_approved_noncancer` and `drug_frac_cancer_indications`.
#'
get_otp_cancer_drugs <-
  function(path_data_raw = NULL,
           ot_version = "2024.09"){

    # Phenotype terms from phenOncoX; only records carrying an EFO
    # identifier are kept and keyed as `disease_efo_id` for the join below.
    cancer_terms <- list()
    cancer_terms[['all']] <- phenOncoX::get_terms(
      cache_dir = path_data_raw
    )
    
    cancer_terms[['efo']] <- cancer_terms[['all']]$records |>
      dplyr::select(cui, cui_name, efo_id, primary_site) |> 
      dplyr::filter(!is.na(efo_id)) |>
      dplyr::distinct() |>
      dplyr::rename(disease_efo_id = efo_id)

  fname <- paste0(path_data_raw,
                  paste0("/opentargets/opentargets_drugs_",
                         ot_version,".rds"))
  ot_compounds <- as.data.frame(
    readRDS(file = fname) |>
      dplyr::select(target_genename,
                    target_symbol,
                    target_type,
                    target_ensembl_gene_id,
                    target_entrezgene,
                    disease_efo_id,
                    disease_efo_label,
                    drug_name,
                    drug_type,
                    drug_max_ct_phase,
                    drug_max_phase_indication,
                    molecule_chembl_id,
                    drug_action_type,
                    drug_moa,
                    drug_tradenames,
                    drug_synonyms,
                    drug_clinical_source,
                    drug_clinical_id,
                    drug_description,
                    drug_withdrawn,
                    drug_approved_indication,
                    drug_blackbox_warning,
                    drug_year_first_approval)) |>
    # Identifiers are rewritten from "PREFIX_123" to "PREFIX:123" so that
    # they match the `disease_efo_id` values of the phenOncoX terms.
    dplyr::mutate(
      disease_efo_id = stringr::str_replace_all(
        disease_efo_id, "_", ":")) |>
    dplyr::distinct()  |>
    dplyr::left_join(
      cancer_terms[['efo']], 
      by = "disease_efo_id", 
      multiple = "all", 
      relationship = "many-to-many") |>
    ## general cancer ontology terms (neoplasm, cancer, carcinoma, squamous cell carcinoma)
    # A record is labelled "by_cancer_condition_otp" when its indication
    # matched a phenOncoX term (non-missing `primary_site`) or is one of the
    # listed general terms. When `primary_site` and `disease_efo_id` are both
    # missing the condition is NA, so the label stays NA at this point.
    dplyr::mutate(
      drug_cancer_relevance =
        dplyr::if_else(
          !is.na(primary_site) |
            disease_efo_id == 'EFO:0000616' |
            disease_efo_id == 'EFO:0000311' |
            disease_efo_id == 'EFO:0000313' |
            disease_efo_id == 'MONDO:0004992' |
            disease_efo_id == 'MONDO:0002120' |
            disease_efo_id == 'EFO:0003826' |
            disease_efo_id == 'EFO:0000707' |
            disease_efo_id == 'EFO:0000228',
          "by_cancer_condition_otp", "by_other_condition_otp")
    ) |>
    # Records without any indication are labelled "by_cancer_target_otp"
    # when drug type, target name pattern and action type all match.
    # NOTE(review): this compares against drug_type == "Unknown", but the
    # "unknown" -> "Unknown" normalisation only happens in the next step,
    # so lower-case "unknown" records never match here - confirm intended.
    dplyr::mutate(
      drug_cancer_relevance = dplyr::if_else(
        is.na(disease_efo_id) &
          (drug_type == "Antibody" | 
             drug_type == "Unknown" |
             drug_type == "Small molecule") &
          !is.na(target_genename) &
          stringr::str_detect(
            tolower(target_genename),
            paste0(
              "(kinase|epidermal growth factor|tubulin|",
              "oncogene|fibroblast growth factor|",
              "dna topoisomerase|tnf receptor superfamily|",
              "protein tyrosine phosphatase|",
              "receptor tyrosine kinase|",
              "platelet derived growth factor)"
              )) &
          (drug_action_type == "INHIBITOR" | 
             drug_action_type == "BINDING AGENT" |
             drug_action_type == "ANTAGONIST"),
        "by_cancer_target_otp",
        as.character(drug_cancer_relevance)
      )
    ) |>
    dplyr::mutate(
      drug_type = dplyr::if_else(
        drug_type == "unknown",
        "Unknown",
        as.character(drug_type)
      )
    ) |>
    # dplyr::filter(
    #   !is.na(drug_action_type)
    # ) |>
    # dplyr::filter(
    #   !stringr::str_detect(
    #     tolower(drug_action_type),
    #     "allosteric|modulator|substrate|releasing|opener"
    #   )
    # ) |>
    # Drop the listed drug types; rows with a missing `drug_type` are
    # dropped as well, because the comparison evaluates to NA for them.
    dplyr::filter(
      drug_type != "Cell" &
        drug_type != "Enzyme" &
        drug_type != "Gene" &
        drug_type != "Oligonucleotide" &
        drug_type != "Oligosaccharide" 
    )
  
  # Number of distinct cancer indication labels per drug
  cancer_indication_stats <- ot_compounds |> 
    dplyr::filter(
      !is.na(drug_cancer_relevance) & 
        drug_cancer_relevance == "by_cancer_condition_otp") |> 
    dplyr::select(drug_name, disease_efo_label) |> 
    dplyr::filter(!is.na(disease_efo_label)) |>
    dplyr::distinct() |> 
    dplyr::group_by(drug_name) |> 
    dplyr::summarise(n_cancer_indications = dplyr::n())
  
  # Per drug: number of distinct (label, approval flag) combinations among
  # non-cancer indications, and whether any of them is flagged as approved
  # (detected via the string "TRUE" in the pasted approval flags).
  non_cancer_indication_stats <- ot_compounds |>
    dplyr::filter(
      !is.na(drug_cancer_relevance) & 
        drug_cancer_relevance == "by_other_condition_otp") |>
    #dplyr::filter(drug_cancer_relevance == F) |> 
    dplyr::filter(!is.na(disease_efo_label)) |>
    dplyr::select(drug_name, disease_efo_label,
                  drug_approved_indication) |> 
    dplyr::distinct() |> 
    dplyr::group_by(drug_name) |> 
    dplyr::summarise(n_noncancer_indications = dplyr::n(),
                     approved_non_cancer = paste(
                       unique(drug_approved_indication),
                       collapse = ";"
                     ), .groups = "drop") |>
    dplyr::mutate(approved_non_cancer = dplyr::if_else(
      stringr::str_detect(approved_non_cancer,"TRUE"),
      as.logical(TRUE),
      as.logical(FALSE)
    ))
  
  # Combine both counts per drug (missing counts become 0) and derive the
  # total number of indications and the fraction that are cancer indications.
  indication_stats <- cancer_indication_stats |>
    dplyr::full_join(non_cancer_indication_stats, by = "drug_name") |>
    dplyr::mutate(n_cancer_indications = dplyr::if_else(
      is.na(n_cancer_indications),
      as.numeric(0),
      as.numeric(n_cancer_indications)
    )) |>
    dplyr::mutate(n_noncancer_indications = dplyr::if_else(
      is.na(n_noncancer_indications),
      as.numeric(0),
      as.numeric(n_noncancer_indications)
    )) |>
    dplyr::mutate(drug_n_indications = n_cancer_indications +
                    n_noncancer_indications) |>
    dplyr::mutate(drug_frac_cancer_indications = dplyr::if_else(
      n_cancer_indications == 0,
      as.numeric(0),
      round(as.numeric(n_cancer_indications / 
                   (n_cancer_indications + n_noncancer_indications)),
            digits = 3)
    )) |>
    dplyr::mutate(drug_approved_noncancer = dplyr::if_else(
      is.na(approved_non_cancer),
      FALSE,
      as.logical(approved_non_cancer)
    )) |>
    dplyr::select(drug_name,
                  drug_n_indications,
                  drug_approved_noncancer,
                  drug_frac_cancer_indications)
  
  # Attach the per-drug statistics to every record; drugs without any
  # statistics get 0 indications.
  # NOTE(review): `drug_n_indications` is numeric at this point, so the
  # comparison with the string "NA" looks like a leftover - confirm.
  ot_compounds <- ot_compounds |>
    dplyr::left_join(indication_stats, by = "drug_name") |>
    dplyr::mutate(
      drug_n_indications = dplyr::if_else(
        is.na(drug_n_indications) | 
          drug_n_indications == "NA",
        0,
        as.integer(drug_n_indications)
      )
    )

  # Collapse records that differ only in `drug_clinical_id` into one row
  # with comma-separated identifiers.
  # NOTE(review): grouping is on every column except `drug_clinical_id`,
  # which includes `drug_clinical_source`, yet that column is also
  # re-computed in summarise() - verify this behaves as intended with the
  # dplyr version in use.
  targeted_compounds <- as.data.frame(
    ot_compounds |>
      dplyr::distinct() |>
      dplyr::group_by_at(
        dplyr::vars(-c(drug_clinical_id))) |>
      dplyr::summarise(
        drug_clinical_id = paste(
          unique(drug_clinical_id),
          collapse = ","),
        drug_clinical_source = paste(
          unique(sort(drug_clinical_source)),
          collapse = ","),
        .groups = "drop") |>
      dplyr::ungroup() |>
      dplyr::mutate(
        drug_name_lc = tolower(drug_name))
  )
  
  
  ## adjust max ct phase
  # Per (drug_name, molecule_chembl_id): the maximum of
  # `drug_max_phase_indication` over records where both phase columns are
  # present; it replaces the original `drug_max_ct_phase` below.
  drugs_with_max_phase_adj <- as.data.frame(
    targeted_compounds |>
    #dplyr::filter(drug_cancer_relevance == T) |>
    dplyr::filter(!is.na(drug_max_ct_phase) & 
                    !is.na(drug_max_phase_indication)) |>
    dplyr::group_by(drug_name, molecule_chembl_id) |>
    dplyr::summarise(drug_max_ct_phase = max(drug_max_phase_indication),
                     .groups = "drop")
  )

  targeted_compounds <- as.data.frame(targeted_compounds |>
    dplyr::select(-drug_max_ct_phase) |>
    dplyr::left_join(
      drugs_with_max_phase_adj, 
      by = c("drug_name","molecule_chembl_id"),
      multiple = "all", relationship = "many-to-many")
  )
  
  # Cancer-relevant records (by condition or by target) of compounds that
  # are not withdrawn; missing statistics are filled with 0 / FALSE.
  targeted_cancer_compounds <- targeted_compounds |>
    dplyr::filter(
      !is.na(drug_cancer_relevance) &
        (drug_cancer_relevance == "by_cancer_condition_otp" |
           drug_cancer_relevance == "by_cancer_target_otp")) |>
    dplyr::mutate(drug_n_indications = dplyr::if_else(
      is.na(drug_n_indications),as.numeric(0),
      as.numeric(drug_n_indications)
    )) |>
    dplyr::mutate(drug_frac_cancer_indications = dplyr::if_else(
      is.na(drug_frac_cancer_indications),as.numeric(0),
      as.numeric(drug_frac_cancer_indications)
    )) |>
    dplyr::mutate(drug_approved_noncancer = dplyr::if_else(
      is.na(drug_approved_noncancer),as.logical(FALSE),
      as.logical(drug_approved_noncancer)
    )) |>
    dplyr::filter(drug_withdrawn == F)
  
  # All records of compounds (by `molecule_chembl_id`) that have no row in
  # the cancer-relevant set above, again restricted to non-withdrawn drugs.
  targeted_noncancer_compounds <- targeted_compounds |>
    # dplyr::filter(
    #   !is.na(drug_cancer_relevance) & 
    #     drug_cancer_relevance == "by_other_condition_otp") |> 
    dplyr::anti_join(
      dplyr::select(
        targeted_cancer_compounds, molecule_chembl_id
      ), by = "molecule_chembl_id"
    ) |>
    dplyr::filter(drug_withdrawn == F)
  
  # The lower-cased helper column is not part of the returned table
  targeted_noncancer_compounds$drug_name_lc <- NULL
  targeted_cancer_compounds$drug_name_lc <- NULL
  
  ot_recs <- dplyr::bind_rows(
    targeted_cancer_compounds,
    targeted_noncancer_compounds) |> 
    dplyr::distinct()
  
  return(ot_recs)

}

#' ATC drug classification table
#'
#' Reads `<path_data_raw>/atc/ATC.csv.gz`, strips the ontology URI prefix
#' from the class and parent identifiers, and links the rows with
#' `atc_level` 2, 3, 4 and 5 into one table (called levels 1-4 here, the
#' last one being the drug entry). Selected level-3 classes get short
#' display names, rows from `<path_data_raw>/atc/custom_classification.csv`
#' are appended, and a few level-3 codes/names are harmonised.
#'
#' @param path_data_raw Directory holding the raw data; must contain an
#'   `atc` sub-folder with both input files.
#' @return A data frame with the code/name columns of levels 1-3 and
#'   `atc_drug_entry` (plus any extra columns of the custom file).
#'
get_atc_drug_classification <- function(
    path_data_raw = NULL){

  atc_uri_prefix <- "http://purl.bioontology.org/ontology/ATC/"

  ## Short display names for selected level-3 classes
  level3_display <- c(
    "L03AX" = "Other immunostimulants",
    "L01CD" = "Taxanes",
    "L04AX" = "Other immunosuppressants",
    "L04AB" = "TNF-alpha inhibitors",
    "L02BX" = "Other hormone antagonists",
    "L02AE" = "GnRH analogs",
    "L01XK" = "PARP inhibitors",
    "L01XX" = "Other antineoplastic agents",
    "L02AX" = "Other hormones for endocrine therapy",
    "L02BB" = "Anti-androgens",
    "L01XH" = "HDAC inhibitors",
    "L01XG" = "Proteasome inhibitors",
    "L01XA" = "Platinum compounds",
    "L01FX" = "Other MaBs and ADCs",
    "L01FF" = "PD-1/PDL-1 inhibitors",
    "L01FG" = "VEGFR inhibitors",
    "L01FE" = "EGFR inhibitors",
    "L01FD" = "HER2 inhibitors",
    "L01FC" = "CD38 inhibitors",
    "L01FB" = "CD22 inhibitors",
    "L01FA" = "CD20 inhibitors",
    "L01EX" = "Other protein kinase inhibitors",
    "L01EN" = "FGFR inhibitors",
    "L01EM" = "PI3K inhibitors",
    "L01EL" = "BTK inhibitors",
    "L01EK" = "VEGFR inhibitors",
    "L01EJ" = "JAK inhibitors",
    "L01EH" = "HER2 inhibitors",
    "L01EG" = "mTOR inhibitors",
    "L01EF" = "CDK inhibitors",
    "L01EE" = "MEK inhibitors",
    "L01ED" = "ALK inhibitors",
    "L01EC" = "BRAF inhibitors",
    "L01EB" = "EGFR inhibitors",
    "L01EA" = "BCR-ABL inhibitors",
    "L01DC" = "Other cytotoxic antibiotics",
    "L01CX" = "Other plant alkaloids and natural products",
    "L01CE" = "TOP1 inhibitors",
    "L01CC" = "Colchicine derivatives",
    "L01CB" = "Podophyllotoxin derivatives",
    "L01CA" = "Vinca alkaloids and analogues",
    "L01BC" = "Pyrimidine analogues",
    "L01BB" = "Purine analogues",
    "L01BA" = "Folic acid analogues",
    "L01AX" = "Other alkylating agents",
    "L01XB" = "Methylhydrazines",
    "L01AG" = "Epoxides",
    "L01AD" = "Nitrosoureas",
    "L01AC" = "Ethylene imines"
  )

  ## Drug entries that are moved into custom level-3 classes
  ctla4_antibodies <- c("ipilimumab", "tremelimumab")
  tcell_engagers <- c("epcoritamab", "glofitamab", "mosunetuzumab",
                      "talquetamab", "teclistamab")

  ## Raw ATC table with cleaned identifiers
  atc_raw <- readr::read_csv(
    file = file.path(path_data_raw, "atc", "ATC.csv.gz"),
    show_col_types = FALSE)

  atc_codes <- as.data.frame(atc_raw) |>
    janitor::clean_names() |>
    dplyr::mutate(
      class_id = stringr::str_replace(class_id, atc_uri_prefix, ""),
      parents = stringr::str_replace(parents, atc_uri_prefix, ""),
      ## any non-missing value counts as "is a drug class"
      is_drug_class = !is.na(is_drug_class)) |>
    dplyr::select(class_id, preferred_label, synonyms, obsolete,
                  parents, atc_level, is_drug_class) |>
    dplyr::arrange(atc_level, class_id, parents)

  ## One table per hierarchy level (input levels 2-5 become levels 1-4)
  lvl1 <- atc_codes |>
    dplyr::filter(atc_level == 2) |>
    dplyr::select(atc_code_level1 = class_id,
                  atc_level1 = preferred_label)

  lvl2 <- atc_codes |>
    dplyr::filter(atc_level == 3) |>
    dplyr::select(atc_code_level2 = class_id,
                  atc_level2 = preferred_label,
                  parent_code = parents)

  lvl3 <- atc_codes |>
    dplyr::filter(atc_level == 4) |>
    dplyr::select(atc_code_level3 = class_id,
                  atc_level3 = preferred_label,
                  parent_code = parents)

  lvl4 <- atc_codes |>
    dplyr::filter(atc_level == 5) |>
    dplyr::select(atc_code_level4 = class_id,
                  atc_level4 = preferred_label,
                  parent_code = parents)

  ## Chain the levels through their parent codes
  atc_hierarchy <- lvl1 |>
    dplyr::left_join(
      lvl2, by = c("atc_code_level1" = "parent_code")) |>
    dplyr::left_join(
      lvl3, by = c("atc_code_level2" = "parent_code")) |>
    dplyr::left_join(
      lvl4, by = c("atc_code_level3" = "parent_code")) |>
    dplyr::rename(atc_drug_entry = atc_level4)

  ## Tidy up class names; codes without a short display name keep
  ## their (cleaned) original name
  atc_hierarchy <- atc_hierarchy |>
    dplyr::mutate(
      atc_level2 = stringr::str_replace(
        atc_level2,
        "in ATC|, ANTINEOPLASTIC DRUGS|, ANTINEOPLASTIC AND IMMUNOMODULATING AGENTS",
        ""),
      atc_level3 = stringr::str_replace(atc_level3, "in ATC", ""),
      atc_level2 = stringr::str_replace(
        atc_level2, "AND ANTIBODY DRUG CONJUGATES", "AND ADCs")) |>
    dplyr::mutate(
      atc_level3 = dplyr::if_else(
        atc_code_level3 %in% names(level3_display),
        unname(level3_display[atc_code_level3]),
        as.character(atc_level3))) |>
    dplyr::select(-atc_code_level4) |>
    dplyr::distinct()

  ## Manually curated additions ("." marks missing values)
  atc_custom <- as.data.frame(
    readr::read_csv(
      file.path(path_data_raw, "atc", "custom_classification.csv"),
      col_names = TRUE, show_col_types = FALSE, na = c(".")))

  atc_drug_classification <- atc_hierarchy |>
    dplyr::bind_rows(atc_custom) |>
    dplyr::arrange(atc_code_level3) |>
    dplyr::distinct() |>
    dplyr::mutate(atc_code_level3 = dplyr::case_when(
      ## JAK inhibitors are assigned to both L01EJ and L04AF,
      ## use L01EJ for consistency with other kinase inhibitors
      atc_code_level3 == "L04AF" ~ "L01EJ",
      ## mTOR inhibitors are assigned to both L04AH and L01EG,
      ## use L01EG for consistency with other kinase inhibitors
      atc_code_level3 == "L04AH" ~ "L01EG",
      atc_drug_entry %in% ctla4_antibodies ~ "L01FXA",
      atc_drug_entry %in% tcell_engagers ~ "L01FXE",
      TRUE ~ as.character(atc_code_level3)
    )) |>
    ## names follow the (possibly re-assigned) level-3 codes
    dplyr::mutate(atc_level3 = dplyr::case_when(
      atc_code_level3 == "L01EJ" ~ "JAK inhibitors",
      atc_code_level3 == "L01EG" ~ "mTOR inhibitors",
      atc_drug_entry %in% ctla4_antibodies ~ "Other ICIs - CTLA4 inhibitors",
      atc_drug_entry %in% tcell_engagers ~ "T-cell Engagers",
      TRUE ~ as.character(atc_level3)
    ))

  return(atc_drug_classification)
}



## NCI THESAURUS CANCER DRUGS/TREATMENTS
#' NCI Thesaurus drug/treatment entries
#'
#' Downloads (if not present locally) the FLAT/OWL/inferred-OWL archives of
#' the given NCI Thesaurus release plus the list of antineoplastic agents
#' into `<path_data_raw>/nci_thesaurus`, filters the thesaurus down to
#' treatment-related concepts, applies a series of manual curation rules and
#' caches the result as
#' `<path_data_processed>/nci_thesaurus/nci_treatment_thesaurus_antineo_v2.rds`.
#' If that cache file exists and `overwrite` is not `TRUE`, the cached
#' object is returned instead of being rebuilt.
#'
#' @param nci_db_release NCI Thesaurus release identifier (required). The
#'   former default referred to itself and could never be evaluated.
#' @param overwrite Rebuild the cached result (and the display labels)?
#' @param path_data_raw Directory holding the raw data.
#' @param path_data_processed Directory holding the processed data.
#' @return A list with element `no_chembl_id`: a data frame with columns
#'   `nci_t`, `nci_concept_definition`, `nci_cd_name`, `nci_drug_name`
#'   (lower-cased synonym) and `nci_concept_synonym_all`.
#'
get_nci_drugs <- function(nci_db_release = NULL,
                          overwrite = FALSE,
                          path_data_raw = NULL,
                          path_data_processed = NULL){

  if (is.null(nci_db_release) || length(nci_db_release) != 1L ||
      is.na(nci_db_release)) {
    stop("'nci_db_release' must be provided as a single release identifier",
         call. = FALSE)
  }
  if (!is.character(path_data_raw) || length(path_data_raw) != 1L) {
    stop("'path_data_raw' must be a single directory path", call. = FALSE)
  }
  if (!is.character(path_data_processed) ||
      length(path_data_processed) != 1L) {
    stop("'path_data_processed' must be a single directory path",
         call. = FALSE)
  }

  nci_raw_dir <- file.path(path_data_raw, "nci_thesaurus")
  nci_rds_fpath <- file.path(
    path_data_processed,
    "nci_thesaurus",
    "nci_treatment_thesaurus_antineo_v2.rds")

  nci_ftp_base <- paste0(
    "https://evs.nci.nih.gov/ftp1/NCI_Thesaurus/archive/",
    nci_db_release,
    "_Release/")

  nci_thesaurus_files <- list()
  nci_thesaurus_files[['flat']] <-
    paste0("Thesaurus_", nci_db_release,".FLAT.zip")
  nci_thesaurus_files[['owl']] <-
    paste0("Thesaurus_", nci_db_release,".OWL.zip")
  nci_thesaurus_files[['inf_owl']] <-
    paste0("ThesaurusInf_", nci_db_release,".OWL.zip")

  ## Large archives: raise the download timeout, but only for the
  ## duration of this call
  old_opts <- options(timeout = 50000)
  on.exit(options(old_opts), add = TRUE)

  for (elem in c('flat','owl','inf_owl')) {
    remote_file <- paste0(
      nci_ftp_base, nci_thesaurus_files[[elem]])
    local_file <- file.path(
      nci_raw_dir,
      nci_thesaurus_files[[elem]])
    if (!file.exists(local_file)) {
      download.file(
        url = remote_file,
        destfile = local_file,
        quiet = TRUE)
      ## paths are quoted so that spaces survive the shell
      system(paste0('unzip -d ',
                    shQuote(nci_raw_dir),
                    ' -o -u ', shQuote(local_file)))
    }
  }

  antineo_agents_url <-
    'https://evs.nci.nih.gov/ftp1/NCI_Thesaurus/Drug_or_Substance/Antineoplastic_Agent.txt'
  antineo_agents_local <-
    file.path(
      nci_raw_dir,
      "Antineoplastic_Agent.txt")
  if (!file.exists(antineo_agents_local)) {
    download.file(
      url = antineo_agents_url,
      destfile = antineo_agents_local, quiet = TRUE)
  }


  if (!file.exists(nci_rds_fpath) || isTRUE(overwrite)) {

    sorafenib_definition <-
      'A synthetic compound targeting growth signaling and angiogenesis. Sorafenib blocks the enzyme RAF kinase, a critical component of the RAF/MEK/ERK signaling pathway that controls cell division and proliferation; in addition, sorafenib inhibits the VEGFR-2/PDGFR-beta signaling cascade, thereby blocking tumor angiogenesis.'

    nci_display_labels <- process_nci_labels(
      path_data_raw = path_data_raw,
      overwrite = overwrite)

    ## Agents/compounds marked as antineoplastic according to NCI
    nci_antineo_agents <-
      read.table(file = antineo_agents_local,
                 header = TRUE, stringsAsFactors = FALSE, comment.char = "",
                 quote = "", sep = "\t") |>
      janitor::clean_names() |>
      dplyr::rename(nci_t = code) |>
      dplyr::select(nci_t) |>
      dplyr::mutate(antineoplastic_agent = TRUE) |>
      dplyr::distinct()


    ## parse all entries in nci thesaurus where the
    ## semantic concept type is treatment-related
    nci_antineo_thesaurus_raw <- as.data.frame(
      read.table(file = file.path(nci_raw_dir, "Thesaurus.txt"),
                 header = FALSE, stringsAsFactors = FALSE, sep = "\t",
                 comment.char = "", quote = "") |>
        dplyr::rename(nci_t = V1, nci_concept_name = V2,
                      nci_t_parent = V3, concept_synonym = V4,
                      nci_concept_definition = V5,
                      nci_cd_name = V6, nci_concept_status = V7,
                      nci_concept_semantic_type = V8) |>

        dplyr::filter(
          stringr::str_detect(
            nci_concept_semantic_type,
            "Chemical|Substance|Therapeutic|Drug|Immunologic")) |>
        dplyr::filter(
          nci_concept_semantic_type != "Drug Delivery Device" &
          nci_concept_semantic_type != "Chemical Viewed Functionally"
        ) |>
        dplyr::filter(
          !stringr::str_detect(
            tolower(nci_concept_definition), "surgical"
          )
        ) |>
        ## procedures are only kept when a synonym contains "/" and the
        ## definition does not mention a regimen
        dplyr::filter(
          nci_concept_semantic_type != "Therapeutic or Preventive Procedure" |
            (nci_concept_semantic_type == "Therapeutic or Preventive Procedure" &
               stringr::str_detect(
                 tolower(concept_synonym), "/"
               ) &
               !stringr::str_detect(
                 tolower(nci_concept_definition), "regimen"
               )
            )
        ) |>
        dplyr::left_join(
          nci_antineo_agents, by = c("nci_t"),
          multiple = "all", relationship = "many-to-many") |>
        dplyr::select(-c(nci_concept_name, nci_cd_name)) |>
        ## keep the full synonym string, then expand to one row per synonym
        dplyr::mutate(nci_concept_synonym_all = concept_synonym) |>
        tidyr::separate_rows(concept_synonym,sep="\\|") |>
        dplyr::mutate(nci_concept_synonym = tolower(concept_synonym)) |>
        dplyr::select(-concept_synonym) |>
        dplyr::filter(
          nci_t != 'C147908' & nci_t != 'C71622') |> ## Hormone Therapy Agent
        dplyr::mutate(
          nci_concept_semantic_type =
            dplyr::if_else(
              nci_t == 'C61948','Pharmacologic Substance',
              nci_concept_semantic_type)) |> ## redundant Sorafenib entries
        dplyr::mutate(
          nci_concept_definition =
            dplyr::if_else(
              nci_t == 'C61948', sorafenib_definition,
              nci_concept_definition)) |> ## redundant Sorafenib entries
        dplyr::distinct() |>
        dplyr::left_join(
          nci_display_labels, by = c("nci_t"),
          multiple = "all", relationship = "many-to-many") |>
        dplyr::filter(
          !(nci_t == "C1806" & nci_concept_synonym == "gemtuzumab")) |>
        dplyr::filter(
          !(nci_t == "C405" & nci_concept_synonym == "ctx")) |>
        dplyr::mutate(nci_db_version = nci_db_release) |>
        dplyr::filter(!is.na(nci_cd_name)) |>
        dplyr::filter(!stringr::str_detect(
          tolower(nci_concept_definition), "coronavirus")) |>
        dplyr::filter(!stringr::str_detect(
          nci_concept_synonym_all,
          "SARS-CoV-2|COVID-19|CoV-19|Coronary|Corona|Covid-19|covid-19")) |>
        ## NOTE(review): concepts absent from the antineoplastic agent list
        ## are set to TRUE here, which makes the flag TRUE for every row -
        ## confirm whether FALSE was intended (the column is not part of
        ## the returned table).
        dplyr::mutate(antineoplastic_agent = dplyr::if_else(
          is.na(antineoplastic_agent),
          as.logical(TRUE),
          as.logical(antineoplastic_agent)
        )) |>
        ## drop non-drug concepts based on display-name patterns
        dplyr::filter(!stringr::str_detect(nci_cd_name," (Gel|Oil|Cream|Seed|Block|Field|Supplement|Factor)$")) |>
        dplyr::filter(!stringr::str_detect(nci_cd_name,"(Vaccination|Lotion|Therapeutic Heat|Procedure|Rehabilitation|Prevention|Rinse)$")) |>
        dplyr::filter(!stringr::str_detect(nci_cd_name,"(Epitope|Exract|Influenza|Ginseng|Ointment|Management|Injection|Tool)$")) |>
        dplyr::filter(!stringr::str_detect(nci_cd_name,"^(Vitamin A Compound|Inactivated Poliovirus|Antineoplastic Immune Cell|Topical)")) |>
        dplyr::filter(!stringr::str_detect(nci_cd_name,"^(Sheng-Yu|Ginseng|Dry Cleaning|Boost|Tobacco|Microwave)")) |>
        dplyr::filter(!stringr::str_detect(nci_cd_name,"^(Blood|Interruption of|Gum Arabic|Vaginal Cylinder|Laser Ablation|Wheatgrass)")) |>
        dplyr::filter(!stringr::str_detect(nci_cd_name,"^(Chemical Challenge|Prevention of|Magic Mouthwash|Wood Dust|Soot|Cocaine)")) |>
        dplyr::filter(!stringr::str_detect(nci_cd_name,"^(Antibody|Antigen|Antioxidant|Vaccination|Acetate|Antiserum|Asbestos|Aspirate|Autoantigen|Cytokine)$")) |>
        dplyr::filter(!stringr::str_detect(nci_cd_name," Spray| Extract| Antidiabetic| Implant|(Green Tea|Living Healthy|Pollutant|Probe|Protective Agent|Supportive Care|Caffe)"))
        #dplyr::filter(
        #  !stringr::str_detect(
        #    tolower(nci_concept_definition),
        #    "chinese |antidiabet|diabetes|antidepress|analgesic|pulmonary edema|nutritional|human carcinogen|anesthetic|nonsedating|sedative|antihyper|antiinflamma|antiarrythm|antiangin|antihist|muscle|neurotransmitter"))
    )


    nci_antineo_thesaurus <- nci_antineo_thesaurus_raw

    ## add apatinib as an alias (is missing in NCI thesaurus)
    apatinib_alias_entry <- nci_antineo_thesaurus |>
      dplyr::filter(nci_concept_synonym == "rivoceranib") |>
      dplyr::mutate(nci_concept_synonym = "apatinib") |>
      dplyr::mutate(nci_concept_synonym_all =
                      paste(nci_concept_synonym_all,"Apatinib",sep="|"))


    ## add nab-paclitaxel as an alias (is missing in NCI thesaurus)
    ## NOTE(review): the assignments below assume exactly one distinct
    ## "Paclitaxel" concept row and at least one "Nab-paclitaxel" row;
    ## they fail otherwise - confirm this holds for every release.
    paclitaxel_entry <- nci_antineo_thesaurus |>
      dplyr::filter(nci_cd_name == "Paclitaxel") |>
      dplyr::select(nci_t, nci_t_parent, nci_concept_definition,
                    nci_concept_status, nci_concept_semantic_type,
                    nci_concept_synonym_all, cui) |>
      dplyr::distinct()

    nab_paclitaxel_entry <- nci_antineo_thesaurus |>
      dplyr::filter(nci_cd_name == "Nab-paclitaxel")

    ## re-label the nab-paclitaxel rows as paclitaxel
    nab_paclitaxel_entry$nci_t <- paclitaxel_entry$nci_t
    nab_paclitaxel_entry$nci_t_parent <- paclitaxel_entry$nci_t_parent
    nab_paclitaxel_entry$cui <- paclitaxel_entry$cui
    nab_paclitaxel_entry$nci_concept_definition <- paclitaxel_entry$nci_concept_definition
    nab_paclitaxel_entry$nci_concept_status <- paclitaxel_entry$nci_concept_status
    nab_paclitaxel_entry$nci_concept_semantic_type <- paclitaxel_entry$nci_concept_semantic_type
    nab_paclitaxel_entry$nci_cd_name <- "Paclitaxel"

    ## union of the synonyms of both concepts, shared by all rows
    aliases_paclitaxel <- stringr::str_split(
      paclitaxel_entry$nci_concept_synonym_all, "\\|")[[1]]
    aliases_nab_paclitaxel <- stringr::str_split(
      nab_paclitaxel_entry$nci_concept_synonym_all[1], "\\|")[[1]]
    all_paclitaxel_aliases <-
      paste(sort(unique(c(aliases_paclitaxel, aliases_nab_paclitaxel))),
                 collapse="|")

    nab_paclitaxel_entry$nci_concept_synonym_all <-
      all_paclitaxel_aliases

    paclitaxel_entries <- nci_antineo_thesaurus |>
      dplyr::filter(nci_cd_name == "Paclitaxel")
    paclitaxel_entries$nci_concept_synonym_all <-
      all_paclitaxel_aliases

    nci_antineo_thesaurus <- nci_antineo_thesaurus |>
      dplyr::filter(nci_cd_name != "Paclitaxel" &
                      nci_cd_name != "Nab-paclitaxel") |>
      dplyr::bind_rows(nab_paclitaxel_entry) |>
      dplyr::bind_rows(paclitaxel_entries) |>
      dplyr::bind_rows(apatinib_alias_entry) |>
      dplyr::arrange(nci_cd_name) |>
      dplyr::rename(nci_drug_name = nci_concept_synonym) |>
      dplyr::filter(!(nci_cd_name == "Sorafenib Tosylate" &
                        nci_drug_name == "sorafenib")) |>
      dplyr::select(-c(nci_t_parent,cui)) |>
      dplyr::distinct()

    nci_drugs <- list()

    ## NCI anticancer drugs (non-targeted) - lacking compound identifier (CHEMBL)
    nci_drugs[['no_chembl_id']] <- nci_antineo_thesaurus |>
      dplyr::select(nci_t,
                    nci_concept_definition,
                    nci_cd_name,
                    nci_drug_name,
                    nci_concept_synonym_all) |>
      dplyr::filter(!stringr::str_detect(
        nci_drug_name,
        "(material|medication|medicine|medicinal|medical|^was |^agent |no therapy|-line therapy)"
      )) |>
      dplyr::filter(!stringr::str_detect(
        tolower(nci_cd_name),
        "(material|medication|medicine|medicinal|medical|^was |^agent |no therapy|-line therapy)"
      )) |>

      ## remove general drug terms ("XX Agent" etc)
      dplyr::mutate(num_words = stringr::str_count(
        nci_cd_name, " ") + 1) |>
      dplyr::filter(
        !(num_words <= 4 &
            stringr::str_detect(tolower(nci_cd_name), " agent") &
            !stringr::str_detect(nci_cd_name, "[0-9]{1,}")
        )) |>
      dplyr::select(-num_words) |>
      dplyr::distinct()

    saveRDS(nci_drugs, file = nci_rds_fpath)

  } else {
    nci_drugs <- readRDS(file = nci_rds_fpath)
  }
  return(nci_drugs)
}


#' Merge Open Targets Platform (OTP) drugs with NCI Thesaurus drugs
#'
#' Cross-references the OTP drug set with the NCI drug set lacking ChEMBL
#' identifiers (by lower-cased drug name), appends NCI drugs that are not
#' found among the OTP drug names/synonyms, removes entries listed in the
#' custom ignore lists, harmonises the display name (`nci_cd_name`), and
#' adds the flags `opentargets`, `is_salt` and `is_adc`. Withdrawn drugs
#' (`drug_withdrawn == TRUE`) are removed, and missing values in the
#' indication/approval columns are replaced by defaults.
#'
#' The two ignore lists are read from `data-raw/drug_names_ignore.tsv` and
#' `data-raw/molecule_chembl_ids_ignore.tsv`, i.e. relative to the current
#' working directory, whereas `salts.tsv` is read from `path_data_raw`.
#'
#' @param drug_sets list with elements `otp` (data frame of OTP drugs) and
#'   `nci` (list holding the data frame `no_chembl_id`)
#' @param path_data_raw directory that contains `salts.tsv`
#' @return data frame with the merged set of drugs
#'
merge_nci_opentargets <- function(
    drug_sets = NULL,
    path_data_raw = NULL){

  nci_no_chembl <- drug_sets[['nci']][['no_chembl_id']]

  ## X-ref Open Targets and NCI by drug name
  otp_drugs_all <- drug_sets[['otp']] |>
    dplyr::mutate(drug_name_lc = tolower(drug_name)) |>
    dplyr::left_join(
      dplyr::filter(nci_no_chembl, !is.na(nci_drug_name)),
      by = c("drug_name_lc" = "nci_drug_name"),
      multiple = "all", relationship = "many-to-many") |>
    dplyr::select(-c("drug_name_lc")) |>
    dplyr::distinct() |>
    dplyr::mutate(opentargets = TRUE)

  ## list all drug aliases in targeted drugs from OTP
  all_drug_aliases <- otp_drugs_all |>
    tidyr::separate_rows(drug_synonyms, sep = "\\|") |>
    dplyr::select(drug_synonyms) |>
    dplyr::mutate(nci_drug_name = tolower(drug_synonyms)) |>
    dplyr::distinct()

  ## Identify drugs in NCI that are in Open Targets
  found_drugs <- nci_no_chembl |>
    dplyr::semi_join(all_drug_aliases,
                     by = "nci_drug_name") |>
    dplyr::select(nci_t, nci_drug_name) |>
    dplyr::distinct()

  ## NCI concepts with no name/synonym match in Open Targets
  nci_missing <- nci_no_chembl |>
    dplyr::anti_join(found_drugs,
                     by = "nci_t") |>
    dplyr::select(-nci_drug_name) |>
    dplyr::mutate(opentargets = FALSE) |>
    dplyr::distinct()

  ## ignore some drugs
  custom_name_ignore <- as.data.frame(readr::read_tsv(
    file = "data-raw/drug_names_ignore.tsv",
    col_names = FALSE, show_col_types = FALSE))
  colnames(custom_name_ignore) <- c("nci_cd_name")

  ## ignore some molecule identifiers (mislabeled)
  custom_chembl_ignore <- as.data.frame(readr::read_tsv(
    file = "data-raw/molecule_chembl_ids_ignore.tsv",
    col_names = FALSE, show_col_types = FALSE))
  colnames(custom_chembl_ignore) <- c("molecule_chembl_id")

  ## molecules for which the display name is the title-cased OTP drug name
  title_case_chembl_ids <- c(
    "CHEMBL3545427", "CHEMBL5315068", "CHEMBL2109325",
    "CHEMBL5316170", "CHEMBL31965", "CHEMBL5314999",
    "CHEMBL2109402", "CHEMBL1790041", "CHEMBL4650278",
    "CHEMBL3545401")

  ## NOTE: the expressions inside mutate() are evaluated sequentially,
  ## each one sees the result of the previous one. `%in%` never returns
  ## NA, so no explicit NA guard on 'molecule_chembl_id' is needed.
  all_cancer_drugs <- otp_drugs_all |>
    dplyr::bind_rows(nci_missing) |>
    dplyr::anti_join(
      custom_chembl_ignore,
      by = "molecule_chembl_id") |>
    dplyr::distinct() |>
    dplyr::mutate(
      ## missing display name, drug name without digits: capitalize
      nci_cd_name = dplyr::if_else(
        is.na(nci_cd_name) &
          !stringr::str_detect(drug_name, "[0-9]"),
        Hmisc::capitalize(tolower(drug_name)),
        nci_cd_name),
      ## missing display name, drug name with digits: keep as is
      nci_cd_name = dplyr::if_else(
        is.na(nci_cd_name) &
          stringr::str_detect(drug_name, "[0-9]"),
        drug_name,
        nci_cd_name),
      nci_cd_name = dplyr::if_else(
        !is.na(nci_cd_name) &
          molecule_chembl_id %in% "CHEMBL535",
        "Sunitinib",
        nci_cd_name),
      nci_cd_name = dplyr::if_else(
        !is.na(drug_name) &
          !is.na(nci_cd_name) &
          molecule_chembl_id %in% title_case_chembl_ids,
        stringr::str_to_title(drug_name),
        nci_cd_name),
      ## NCI-only entries have no OTP drug name - use the display name
      drug_name = dplyr::if_else(
        is.na(drug_name),
        nci_cd_name,
        drug_name)) |>
    dplyr::anti_join(
      custom_name_ignore,
      by = "nci_cd_name")

  rownames(all_cancer_drugs) <- NULL

  salt_patterns <-
    readr::read_tsv(
      file.path(path_data_raw, "salts.tsv"),
      show_col_types = FALSE, col_names = FALSE)

  ## matches a name that ends with " <salt pattern>"
  salt_patterns_regex <- paste0(
    "( (",
    paste(salt_patterns$X1, collapse="|"),
    "))$")

  ## Salt forms: OTP entries with a salt suffix for which the name with
  ## the suffix removed is also present among the drug names.
  ## NOTE(review): the suffix is detected on the lower-cased name but
  ## removed from the original-case name; when the suffix is capitalised
  ## the replacement is a no-op and the entry then joins to itself -
  ## confirm whether a case-insensitive replacement is intended.
  salt_forms <- all_cancer_drugs |>
    dplyr::filter(
      stringr::str_detect(
        tolower(nci_cd_name),
        salt_patterns_regex)) |>
    dplyr::filter(!is.na(opentargets_version)) |>
    dplyr::mutate(tradename = stringr::str_replace(
      nci_cd_name,
      salt_patterns_regex,
      "")) |>
    dplyr::select(tradename, nci_cd_name) |>
    dplyr::distinct() |>
    dplyr::mutate(is_salt = TRUE) |>
    dplyr::inner_join(
      dplyr::select(all_cancer_drugs, nci_cd_name),
      by = c("tradename" = "nci_cd_name"),
      multiple = "all", relationship = "many-to-many") |>
    dplyr::distinct() |>
    dplyr::select(-tradename)

  all_cancer_drugs <- all_cancer_drugs |>
    dplyr::left_join(
      salt_forms,
      by = "nci_cd_name",
      multiple = "all",
      relationship = "many-to-many") |>
    dplyr::mutate(is_salt = dplyr::if_else(
      is.na(is_salt),
      FALSE,
      as.logical(is_salt)
    )) |>
    dplyr::distinct()

  ## antibody drug conjugates
  ##
  ## Candidates: name contains "mab " or the definition mentions
  ## ADC / antibody-drug conjugate. A candidate is unflagged when the
  ## definition mentions "ADCC" without also describing a conjugate
  ## (for drugs typed as small molecule/antibody/unknown), and flagged
  ## again when the name itself looks like "<..>mab <payload>".
  ##
  ## The unflagging rule is guarded by !is.na(nci_concept_definition):
  ## without the guard a missing definition yields an NA condition,
  ## dplyr::if_else() then returns NA for 'is_adc', and the NA is turned
  ## into FALSE further below - i.e. every name-based candidate lacking
  ## a definition would silently lose its ADC flag.
  adc_candidates <- all_cancer_drugs |>
    dplyr::filter(
      (!is.na(nci_cd_name) &
         stringr::str_detect(tolower(nci_cd_name), "mab ")) |
        stringr::str_detect(
          nci_concept_definition, "ADC|antibody(-| )drug conjugate")) |>
    dplyr::mutate(
      is_adc = TRUE,
      is_adc = dplyr::if_else(
        !is.na(nci_concept_definition) &
          stringr::str_detect(nci_concept_definition, "ADCC") &
          !(stringr::str_detect(
            tolower(nci_concept_definition),
            "(antibody(-| )drug conjugate)") &
              (is.na(drug_type) |
                 drug_type == "Small molecule" |
                 drug_type == "Antibody")),
        FALSE,
        is_adc),
      is_adc = dplyr::if_else(
        !is_adc &
          !is.na(nci_cd_name) &
          stringr::str_detect(
            tolower(nci_cd_name), "mab ") &
          stringr::str_detect(
            tolower(nci_cd_name), "(tin|ine|tan|can|tox)$"
          ),
        TRUE,
        is_adc)) |>
    dplyr::filter(!is.na(nci_cd_name)) |>
    dplyr::select(nci_cd_name, is_adc) |>
    dplyr::distinct()

  ## add ADC flag, drop withdrawn drugs, fill in defaults for missing values
  all_cancer_drugs <- all_cancer_drugs |>
    dplyr::left_join(
      adc_candidates,
      by = "nci_cd_name",
      multiple = "all",
      relationship = "many-to-many") |>
    dplyr::mutate(is_adc = dplyr::if_else(
      is.na(is_adc),
      FALSE,
      as.logical(is_adc)
    )) |>
    dplyr::filter(drug_withdrawn == FALSE | is.na(drug_withdrawn)) |>
    dplyr::mutate(
      ## empty definitions are treated as missing
      nci_concept_definition = dplyr::if_else(
        !is.na(nci_concept_definition) &
          nchar(nci_concept_definition) == 0,
        as.character(NA),
        as.character(nci_concept_definition)),
      drug_n_indications = dplyr::if_else(
        is.na(drug_n_indications),
        as.numeric(0),
        as.numeric(drug_n_indications)),
      drug_frac_cancer_indications = dplyr::if_else(
        is.na(drug_frac_cancer_indications),
        as.numeric(0),
        as.numeric(drug_frac_cancer_indications)),
      drug_approved_noncancer = dplyr::if_else(
        is.na(drug_approved_noncancer),
        FALSE,
        as.logical(drug_approved_noncancer)),
      ## NCI-only drugs without a relevance label
      drug_cancer_relevance = dplyr::if_else(
        is.na(drug_cancer_relevance) &
          opentargets == FALSE,
        "by_nci",
        as.character(drug_cancer_relevance))) |>
    dplyr::distinct()

  return(all_cancer_drugs)

}



#' Assign curated targets to target-lacking NCI inhibitors
#'
#' Reads custom drug-target regular expressions from
#' `custom_drug_target_regex_nci.tsv` (in `path_data_raw`), joins them with
#' `gene_info` by `symbol`, and matches each pattern against drugs in
#' `drug_df` that lack a target (`target_symbol` is NA) and whose name or
#' definition indicates an inhibitor/antagonist/antibody/blocker. Matched
#' drugs get target annotations, a drug type, an action type, and - when
#' no indication is recorded - a generic cancer indication. The matched
#' records replace the corresponding records of `drug_df` (by
#' `nci_cd_name`).
#'
#' @param gene_info data frame with gene annotations; joined by `symbol`
#'   and expected to provide the columns `genename`, `target_entrezgene`,
#'   `target_type` and `target_ensembl_gene_id` unless the pattern file
#'   does
#' @param path_data_raw directory with `custom_drug_target_regex_nci.tsv`
#' @param drug_df data frame with drugs (one record per drug-target pair)
#' @return list with two data frames: `curated` (all drugs, including the
#'   custom-mapped ones) and `nonmapped` (name and definition of
#'   target-lacking candidates with cancer-related definitions that were
#'   not matched by any custom pattern)
#'
map_curated_targets <- function(gene_info = NULL,
                                   path_data_raw = NULL,
                                   drug_df = NULL){

  drug_df$target_entrezgene <- as.numeric(
    drug_df$target_entrezgene
  )

  drug_target_patterns <-
    read.table(file = file.path(
      path_data_raw,
      "custom_drug_target_regex_nci.tsv"),
      sep = "\t", header = TRUE,
      stringsAsFactors = FALSE, quote = "") |>
    dplyr::inner_join(
      gene_info, by = "symbol", multiple = "all",
      relationship = "many-to-many") |>
    dplyr::distinct()

  ## two-letter endings of (lower-cased) drug names
  drugname_suffix <-
    paste0("(ab|al|an|at|cl|co|da|de|dy|ea|ed|el|en|er|es|ex|fa|ib|ic|",
           "id|il|im|in|ir|is|le|ls|lt|me|na|ne|ns|nt|od|ol|on|or|pt|",
           "py|ra|rd|re|rm|rt|se|ta|te|ts|ue|um|us|yl)$")
  drug_inhibitor_regex <-
    paste0(drugname_suffix,"|^(anti-|inhibitor of)|inhibitor")

  ## target-lacking drugs that are candidates for custom target mapping
  all_inhibitors_no_target <- drug_df |>
    dplyr::filter(is.na(target_symbol)) |>
    dplyr::filter(
      stringr::str_detect(
        tolower(nci_cd_name),
        "inhibitor|antagonist|antibody|blocker|sepantronium| mimetic") |
        stringr::str_detect(
          tolower(nci_cd_name),
          drug_inhibitor_regex) |
        (stringr::str_detect(nci_concept_definition,"KRAS") &
           stringr::str_detect(nci_concept_definition,"inhibitor"))) |>
    dplyr::filter(!stringr::str_detect(
      nci_cd_name,
      " CAR T|(T|t)herapy|SARS-CoV-2|( (R|r)egimen|(A|a)cid|Hydrochloride)$")) |>
    dplyr::filter(!stringr::str_detect(
      nci_concept_definition,
      "SARS-CoV-2")) |>
    dplyr::filter(nchar(nci_concept_definition) > 0)

  ## Collect the annotated hits per pattern in a list and bind them once
  ## (avoids growing a data frame row by row). seq_len() makes the loop
  ## a no-op for an empty pattern table (1:0 would iterate over 1 and 0).
  hits_per_pattern <- vector("list", nrow(drug_target_patterns))
  for(i in seq_len(nrow(drug_target_patterns))){
    pattern <- drug_target_patterns[i, "pattern"]
    target_symbol <- drug_target_patterns[i, "symbol"]
    target_genename <- drug_target_patterns[i, "genename"]
    target_entrezgene <- drug_target_patterns[i, "target_entrezgene"]
    target_type <- drug_target_patterns[i, "target_type"]
    target_ensembl_gene_id <- drug_target_patterns[i, "target_ensembl_gene_id"]

    ## pattern found in the drug name, or - for inhibitor-like names -
    ## in the concept definition
    hits <- all_inhibitors_no_target |>
      dplyr::filter(stringr::str_detect(
        nci_cd_name,
        pattern = pattern) |
          (stringr::str_detect(
            tolower(nci_cd_name),
            drug_inhibitor_regex) &
             stringr::str_detect(
               nci_concept_definition, pattern))
      )

    if(nrow(hits) == 0){
      next
    }

    name_lc <- tolower(hits$nci_cd_name)

    hits$drug_type <- ifelse(
      stringr::str_detect(
        name_lc, "(mab|art)$|monoclonal antibody|^anti-"),
      "Antibody",
      "Small molecule")

    ## default action is INHIBITOR; "blocker" takes precedence over
    ## "antagonist" when both occur in the name
    action_type <- rep("INHIBITOR", nrow(hits))
    action_type[stringr::str_detect(name_lc, "antagonist")] <- "ANTAGONIST"
    action_type[stringr::str_detect(name_lc, "blocker")] <- "BLOCKER"
    hits$drug_action_type <- action_type

    hits$target_symbol <- target_symbol
    hits$target_genename <- target_genename
    hits$target_type <- target_type
    hits$target_entrezgene <- target_entrezgene
    hits$target_ensembl_gene_id <- target_ensembl_gene_id
    hits$drug_clinical_source <- "nci_thesaurus_custom"
    hits$drug_cancer_relevance <- "by_cancer_target_nci"
    hits$drug_n_indications <- 0
    hits$drug_frac_cancer_indications <- 0
    hits$drug_approved_noncancer <- FALSE

    ## set general indications for unknown cases
    no_indication <-
      is.na(hits$disease_efo_id) &
      is.na(hits$disease_efo_label) &
      is.na(hits$cui) &
      is.na(hits$cui_name)
    if(any(no_indication)){
      hits$disease_efo_id[no_indication] <- "EFO:0000311"
      hits$disease_efo_label[no_indication] <- "cancer"
      hits$cui[no_indication] <- "C0006826"
      hits$cui_name[no_indication] <- "Malignant neoplastic disease"
    }

    hits_per_pattern[[i]] <- hits
  }

  custom_nci_targeted_drugs <- dplyr::bind_rows(hits_per_pattern)
  if(ncol(custom_nci_targeted_drugs) == 0){
    ## no pattern produced a hit: keep a zero-row frame that still has
    ## the columns used by the joins below ('nci_cd_name' etc.)
    custom_nci_targeted_drugs <-
      all_inhibitors_no_target[0, , drop = FALSE]
  }


  ### CHECK HOW MANY TARGET-LACKING INHIBITORS ARE MISSING
  ### FROM THE CUSTOM NCI MATCHING ROUTINE

  inhibitors_no_target_nonmapped <- all_inhibitors_no_target |>
    dplyr::anti_join(custom_nci_targeted_drugs, by = "nci_cd_name") |>
    dplyr::filter(!stringr::str_detect(
      nci_concept_definition,
      "(A|a)ntibody(-| )drug conjugate \\(ADC\\)"
    )) |>
    dplyr::filter(!stringr::str_detect(
      tolower(nci_cd_name),
      paste0(
        "^(allogeneic|regimen |copper |fluorine f |indium |iodine |",
        "carbon c|autologous |recombinant |lutetium |yttrium |y 90)|",
        "vaccine$")
    )) |>
    dplyr::filter(
      stringr::str_detect(
        nci_concept_definition, "antineoplastic|tumor|cancer"
      )
    ) |>
    dplyr::select(nci_cd_name,
                  nci_concept_definition) |>
    dplyr::distinct()


  ## same (integer) type in both frames before they are combined
  drug_df$target_entrezgene <-
    as.integer(drug_df$target_entrezgene)

  custom_nci_targeted_drugs$target_entrezgene <-
    as.integer(custom_nci_targeted_drugs$target_entrezgene)

  ot_nci_drugs_curated <-
    dplyr::anti_join(drug_df, custom_nci_targeted_drugs,
                     by = "nci_cd_name") |>
    dplyr::bind_rows(custom_nci_targeted_drugs) |>
    dplyr::arrange(target_symbol, nci_cd_name) |>
    ## missing action type: INHIBITOR for "inhibitor" names and for
    ## names ending with "mab"/"art"
    dplyr::mutate(drug_action_type = dplyr::if_else(
      (stringr::str_detect(tolower(nci_cd_name),"inhibitor") &
         is.na(drug_action_type)) |
        (!is.na(nci_cd_name) &
           stringr::str_detect(nci_cd_name,"(mab|art)$") &
           is.na(drug_action_type)),
      "INHIBITOR",
      as.character(drug_action_type))) |>
    ## missing relevance: infer from cancer-related terms in the
    ## definition or name
    dplyr::mutate(drug_cancer_relevance = dplyr::if_else(
      is.na(drug_cancer_relevance) &
        (stringr::str_detect(
          tolower(nci_concept_definition),
          paste0(
            "anti-tumor|chemotherapy|cancer vaccine|immunothera|",
            "monoclonal antibody|antineoplastic|treatment of cancer|",
            "treatment of metastat")) |
           stringr::str_detect(
             tolower(nci_cd_name),
             paste0(
               " regimen|recombinant|carcinoma|immune checkpoint|",
               "anti-programmed cell death "))),
      as.character("by_cancer_definition_nci"),
      as.character(drug_cancer_relevance)
    )) |>
    ## Append "_OTHER" to a few action types. The test must be
    ## !is.na(): with is.na() the condition is never TRUE (a pattern
    ## match on a missing value is NA), so the suffix was never added.
    dplyr::mutate(drug_action_type = dplyr::if_else(
      !is.na(drug_action_type) &
        stringr::str_detect(
          drug_action_type,
          "^(SUBSTRATE|HYDROLYTIC ENZYME|RELEASING AGENT)"),
      paste0(drug_action_type,"_OTHER"),
      as.character(drug_action_type)
    ))


  ## drugs with more than one relevance label: list the labels that do
  ## not contain "otp" - those records are removed below
  duplicated_drugs <- ot_nci_drugs_curated |>
    dplyr::group_by(nci_cd_name) |>
    dplyr::summarise(drug_cancer_relevance = paste(
      sort(unique(drug_cancer_relevance)), collapse="@")) |>
    dplyr::filter(stringr::str_detect(drug_cancer_relevance,"@")) |>
    tidyr::separate_rows(drug_cancer_relevance, sep = "@") |>
    dplyr::filter(!stringr::str_detect(drug_cancer_relevance,"otp"))

  ot_nci_drugs_curated <- ot_nci_drugs_curated |>
    dplyr::anti_join(duplicated_drugs,
                     by = c("nci_cd_name","drug_cancer_relevance"))

  return(list('curated' = ot_nci_drugs_curated,
              'nonmapped' = inhibitors_no_target_nonmapped))
}

assign_drug_category <- function(drug_df = NULL,
                                  path_data_raw = NULL){

  atc_classification <- 
    get_atc_drug_classification(path_data_raw = path_data_raw)

  atc_classification_with_drugs <- atc_classification |>
    dplyr::mutate(atc_drug_entry = dplyr::case_when(
      !is.na(atc_drug_entry) &
      (tolower(atc_drug_entry) == "sotorasib" |
         tolower(atc_drug_entry) == "ivosidenib" |
         tolower(atc_drug_entry) == "enasidenib" |
         tolower(atc_drug_entry) == "venetoclax" |
        tolower(atc_drug_entry) == "adagrasib") ~ as.character(NA),
      TRUE ~ as.character(atc_drug_entry)
    )) |>
    dplyr::filter(!is.na(atc_drug_entry))
  
  atc_classification_clean <- atc_classification |>
    dplyr::filter(is.na(atc_drug_entry)) |>
    dplyr::bind_rows(atc_classification_with_drugs) |>
    dplyr::select(-atc_drug_entry) |>
    dplyr::distinct() |>
    dplyr::filter(!is.na(atc_code_level3))
  
  classified_drugs <- list()

  classified_drugs[['pre_classified_atc']] <- drug_df |>
    dplyr::mutate(drug_entry = tolower(nci_cd_name)) |>
    dplyr::left_join(
      dplyr::select(
        atc_classification_with_drugs, 
        atc_drug_entry, 
        atc_code_level3),
      by = c("drug_entry" = "atc_drug_entry"),
      relationship = "many-to-many") |>
    dplyr::filter(!is.na(atc_code_level3)) |>
    dplyr::distinct()
  
  drugs_non_classified <- drug_df |>
    dplyr::mutate(drug_entry = tolower(nci_cd_name)) |>
    dplyr::anti_join(
      classified_drugs[['pre_classified_atc']], 
      by = "drug_entry") |>
    dplyr::distinct()

  custom_target_classifications <- drugs_non_classified |>
    dplyr::filter(!is.na(target_symbol)) |>
    dplyr::group_by(drug_name, nci_cd_name, 
                    drug_entry,
                    nci_concept_definition,
                    molecule_chembl_id, drug_action_type) |>
    dplyr::reframe(
      target_symbol = paste(
        sort(unique(target_symbol)), collapse="|")) |>
    dplyr::mutate(atc_code_level3 = dplyr::case_when(
      target_symbol == "ABL1" | target_symbol == "ABL1|BCR" ~ "L01EA",
      target_symbol == "BRAF" ~ "L01EC",
      stringr::str_detect(target_symbol,"TOP1(\\|TOP2)?") ~ "L01CE",
      stringr::str_detect(
        target_symbol, "^((PSM(A|B|C|D)[0-9]{1,})|ADRM1|SEM1)") ~ "L01XG",
      stringr::str_detect(
        target_symbol, "^((BRAF\\|(KDR|RAF))|(ARAF\\|BRAF))") ~ "L01EXJ",
      target_symbol == "PLK1" | target_symbol == "PLK4" ~ "L01EXK",
      target_symbol == "EGFR" &
        !is.na(drug_name) &
          stringr::str_detect(drug_name, "MAB") ~ "L01FE",
      target_symbol == "ERBB2" &
        !is.na(drug_name) &
        stringr::str_detect(drug_name, "MAB") ~ "L01FD",
      target_symbol == "ERBB2" &
        (is.na(drug_name) |
           (!is.na(drug_name) &
              !stringr::str_detect(drug_name, "MAB"))) ~ "L01EH",
      !is.na(drug_name) &
        drug_name == "SIROLIMUS" ~ "L01EG",
      target_symbol == "EGFR" &
        (is.na(drug_name) |
        (!is.na(drug_name) &
        !stringr::str_detect(drug_name, "MAB"))) ~ "L01EB",
      (!is.na(drug_name) &
         stringr::str_detect(drug_name, "ABIVERTINIB")) ~ "L01EB",
      target_symbol == "ALK" ~ "L01ED",
      (drug_action_type == "INHIBITOR" |
         drug_action_type == "ANTAGONIST") & 
        target_symbol == "CTLA4" ~ "L01FXA",
      (drug_action_type == "INHIBITOR" |
         drug_action_type == "ANTAGONIST") & 
        target_symbol == "LAG3"  ~ "L01FXB",
      (drug_action_type == "INHIBITOR" |
         drug_action_type == "ANTAGONIST") & 
           target_symbol == "TIGIT" ~ "L01FXC",
      (drug_action_type == "INHIBITOR" |
         drug_action_type == "ANTAGONIST") & 
           (target_symbol == "CD274|CTLA4" |
           target_symbol == "CTLA4|PDCD1") ~ "L01FXD",
      (!is.na(drug_name) & drug_name == "BRENTUXIMAB VEDOTIN") |
      (stringr::str_detect(
        target_symbol,"^(TUBA|TUBB)") &
        (drug_action_type == "INHIBITOR" |
           drug_action_type == "DISRUPTING AGENT")) ~ "L01XXK",
      stringr::str_detect(
        target_symbol, "^(PARP[1-9]{1}(\\|)?){1,}$") ~ "L01XK",
      stringr::str_detect(
        target_symbol, "^(HDAC[0-9]{1,}(\\|)?){1,}$") ~ "L01XH",
      stringr::str_detect(
        target_symbol, "^MAP2K[0-9]") ~ "L01EE",
      stringr::str_detect(
        target_symbol, "^CDK[0-9]{1,}") ~ "L01EF",
      stringr::str_detect(
        target_symbol, "^(BIRC|XIAP)") ~ "L01XXF",
      stringr::str_detect(
        target_symbol, "^JAK[1-2]") ~ "L01EJ",
      stringr::str_detect(
        target_symbol, "^AURK(A|B|C)") ~ "L01EXB",
      stringr::str_detect(
        target_symbol, "EGFR") &
        stringr::str_detect(
          target_symbol,"ERBB") ~ "L01EXH",
      stringr::str_detect(
        target_symbol, "^FGFR[1-4]{1}") ~ "L01EN",
      stringr::str_detect(
        target_symbol, "^IDH[1-2]{1}") ~ "L01XXD",
      stringr::str_detect(
        target_symbol, "^((IGF1\\|)|IGF1R)") ~ "L01XXN",
      !is.na(drug_name) & 
        stringr::str_detect(
          tolower(drug_name), "^aberaterone") ~ "L02BX",
      !is.na(drug_name) & 
        stringr::str_detect(
          tolower(drug_name), "^(gemcitabine)") ~ "L01BC",
      !is.na(drug_name) & 
        stringr::str_detect(
          tolower(drug_name), "^(fludarabine)") ~ "L01BB",
      stringr::str_detect(
        target_symbol, "^(K|N|H)RAS") ~ "L01XXC",
      stringr::str_detect(
        target_symbol, "^MET$") ~ "L01EXA",
      stringr::str_detect(
        target_symbol, "^(CD3D\\|CD3E\\|CD3G)") ~ "L01FXE",
      stringr::str_detect(
        target_symbol, "^(AKT[0-9](\\|)?){1,}") |
        (!is.na(drug_name) &
           stringr::str_detect(
             drug_name, "GSK-690693")) ~ "L01EXC",
      stringr::str_detect(
        target_symbol, "^ATR$") ~ "L01XXI",
      stringr::str_detect(
        target_symbol, "^PDGFR(A|B)|^KIT") ~ "L01EXE",
      stringr::str_detect(
        target_symbol, "^(NTRK[0-9]\\|?){1,}$") |
        (!is.na(drug_name) & drug_name == "TALETRECTINIB") ~ "L01EXF",
      stringr::str_detect(
        target_symbol, "^(CHEK(1|2)\\|?){1,2}$") ~ "L01XXJ",
      stringr::str_detect(
        target_symbol, "^BCL2") ~ "L01XXM",
      stringr::str_detect(
        target_symbol, "^GNRH") ~ "L02AE",
      stringr::str_detect(
        target_symbol, "^BRD(T|[1-9]{1})") ~ "L01XXA",
      stringr::str_detect(
        target_symbol, "^MTOR\\|PIK3") ~ "L01XXG",
      stringr::str_detect(
        target_symbol, "^PIK3") |
        (!is.na(drug_name) & 
        (stringr::str_detect(drug_name,"UMBRALISIB") |
        (drug_name == "OMIPALISIB" |
           drug_name == "PF-04691502"))) ~ "L01EM",
      stringr::str_detect(
        target_symbol, "^BTK$") ~ "L01EL",
      (!is.na(drug_name) & 
         (drug_name == "PRALSETINIB" |
            drug_name == "SELPERCATINIB")) ~ "L01EXL",
      stringr::str_detect(
        target_symbol, "^MTOR$") ~ "L01EG",
      stringr::str_detect(
        target_symbol, "^(KDR|FLT1|FLT3|FLT4)") ~ "L01EK",
      stringr::str_detect(
        target_symbol, "^(MS4A1)") ~ "L01FA",
      stringr::str_detect(
        target_symbol, "^(ESR1)") ~ "L02BA",
      stringr::str_detect(
        target_symbol, "^(CD38)") ~ "L01FC",
      stringr::str_detect(
        target_symbol, "^(CD22)") ~ "L01FB",
      target_symbol == "MAPK1|MAPK3" ~ "L01XXE",
      stringr::str_detect(target_symbol, "^AR$") & 
        (!is.na(nci_concept_definition) &
           stringr::str_detect(
             nci_concept_definition, "androgen receptor")) ~ "L02BB",
      (target_symbol == "PDCD1" |
         target_symbol == "CD274") ~ "L01FF",
      TRUE ~ as.character(NA)
    )) |>
    dplyr::mutate(atc_code_level3 = dplyr::if_else(
      (!is.na(drug_name) &
         stringr::str_detect(
           drug_name,
           paste0(
             "DOVITINIB|BRIVANIB|UCN-01|AT-9283|",
             "SURUFATINIB|ORANTINIB|LUCITANIB|",
             "ALTIRATINIB|CEP-11981|CRENOLANIB|",
             "FORETINIB|FAMITINIB|MOTESANIB|",
             "LESTAURTINIB|OSI-930|PACRITINIB|KW-2449|",
             "CABOZANTINIB|BMS-817378|BMS-794833|",
             "GOLVATINIB|TAK-593|XL-820|TANDUTINIB|IMATINIB|",
             "CERDULATINIB|CEP-2563|FEDRATINIB|IBCASERTIB|",
             "VANDETANIB|TESEVATINIB|TARLOXOTINIB|",
             "TAK-285|SKLB1028|PUQUITINIB|KBP5209|REPOTRECTINIB|",
             "VATALANIB|QUIZARTINIB|BMS-690514|IMATINIB|DASATINIB|",
             "CANERTINIB|CEP-32496|REGORAFENIB|GUSACITINIB|",
             "SU-014813|X-82|XL-999|LINIFANIB|NINGETINIB|",
             "PEXIDARTINIB|RG-1530|SITRAVATINIB|SORAFENIB|SUNITINIB|",
             "ENTRECTINIB"))) |
      (!is.na(nci_concept_definition) &
         stringr::str_detect(
           tolower(nci_concept_definition), 
        paste0(
          "multikinase|multi(-)?targeted|multi-kinase inhibitor|",
          "targets multiple|inhibitor of multiple|multiple-receptor")) &
        stringr::str_count(target_symbol, pattern = "\\|") >= 2),
      "L01EXI",
      as.character(atc_code_level3)
    )) |>
    dplyr::left_join(
      dplyr::select(
        atc_classification_clean,
        atc_code_level3,
      ),
      by = "atc_code_level3",
      relationship = "many-to-many"
    ) |>
    #dplyr::filter(!is.na(atc_code_level3)) |>
    dplyr::select(
      #molecule_chembl_id,
      drug_entry,
      atc_code_level3
    ) |>
    dplyr::group_by(drug_entry) |>
    dplyr::summarise(
      atc_code_level3 = paste(
        unique(atc_code_level3), collapse="|")) |>
    dplyr::distinct() |>
    dplyr::mutate(
      atc_code_level3 = stringr::str_replace(
        atc_code_level3, "^(NA\\|)|(\\|(NA|L01XXK))$",""
      )
    )
  
  classified_drugs[['classified_targeted_custom']] <- 
    drugs_non_classified |>
    dplyr::filter(!is.na(target_symbol)) |>
    dplyr::left_join(
      custom_target_classifications, by = "drug_entry",
      relationship = "many-to-many") |>
    dplyr::distinct()
  
  classified_drugs_all<- 
    classified_drugs[['pre_classified_atc']] |>
    dplyr::bind_rows(
      classified_drugs[['classified_targeted_custom']]) |>
    dplyr::bind_rows(
      dplyr::filter(drugs_non_classified, is.na(target_symbol))) |>
    dplyr::mutate(atc_code_level3 = dplyr::case_when(
      (is.na(atc_code_level3) | atc_code_level3 == "NA") & 
        (!is.na(nci_concept_definition) &
         stringr::str_detect(
           tolower(nci_concept_definition),
           "anthracycline|anthracenedione")) ~ "L01DB",
      (is.na(atc_code_level3) | atc_code_level3 == "NA") & 
        stringr::str_detect(tolower(drug_entry),"hydroxyurea|leucovorin|eniluracil") ~ "L01BA",
      (is.na(atc_code_level3) | atc_code_level3 == "NA") & 
        stringr::str_detect(tolower(drug_entry),"arsenic trioxide") ~ "L01BA",
      (is.na(atc_code_level3) | atc_code_level3 == "NA") & 
        stringr::str_detect(tolower(drug_entry),
                            "thioguanine|aspacytarabine|aspacytarabine|troxacitabine") ~ "L01BB",
      (is.na(atc_code_level3) | atc_code_level3 == "NA") & 
        !stringr::str_detect(drug_entry,"/| ") &
        stringr::str_detect(drug_entry, "xel$") ~ "L01CD",
      (is.na(atc_code_level3) | atc_code_level3 == "NA") & 
        !stringr::str_detect(drug_entry,"/| ") &
        stringr::str_detect(drug_entry, "platin$") ~ "L01XA",
      (is.na(atc_code_level3) | atc_code_level3 == "NA") & 
        !is.na(nci_concept_definition) &
        stringr::str_detect(tolower(nci_concept_definition), "anti-estrogen") ~ "L02BA",
      (is.na(atc_code_level3) | atc_code_level3 == "NA") & 
        !is.na(nci_concept_definition) &
        stringr::str_detect(tolower(nci_concept_definition), "aromatase inhibitor") ~ "L02BG",
      (is.na(atc_code_level3) | atc_code_level3 == "NA") & 
        !is.na(nci_concept_definition) &
        stringr::str_detect(tolower(nci_concept_definition), "nitrogen mustard") ~ "L01AA",
      (is.na(atc_code_level3) | atc_code_level3 == "NA") & 
        stringr::str_detect(
        tolower(nci_concept_definition), "purine( nucleoside)? analog") ~ "L01BB",
      (is.na(atc_code_level3) | atc_code_level3 == "NA") & 
        stringr::str_detect(
        tolower(nci_concept_definition), "pyrimidine( nucleoside)? analog") ~ "L01BC",
      (is.na(atc_code_level3) | atc_code_level3 == "NA") &
        !is.na(drug_entry) &
        !stringr::str_detect(drug_entry, "/") &
        stringr::str_detect(
          tolower(nci_concept_definition), "vinca alkaloid") ~ "L01CA",
      (is.na(atc_code_level3) | atc_code_level3 == "NA") & 
      ((!is.na(nci_concept_definition) &
          stringr::str_detect(
            tolower(nci_concept_definition),
            "antineoplastic activit|anti-tumor activit"
          )) |
         (!is.na(drug_max_ct_phase) &
            stringr::str_detect(
              drug_entry,"(in|ib|ide|ine|ax|il|an|ate| alfa)$") &
            drug_max_ct_phase >= 2 &
            (!is.na(drug_n_indications) &
               drug_n_indications > 2) &
            (!is.na(drug_frac_cancer_indications) &
               drug_frac_cancer_indications > 0.4))) ~ "L01XX",
      TRUE ~ as.character(atc_code_level3)
    )) |>
    dplyr::distinct() |>
    dplyr::group_by(dplyr::across(-c("atc_code_level3"))) |>
    dplyr::summarise(atc_code_level3 = paste(unique(atc_code_level3), collapse="|"),
                     .groups = "drop") |>
    dplyr::mutate(atc_code_level3 = dplyr::case_when(
      stringr::str_detect(
        atc_code_level3, "^(L0[A-Z0-9]{1,}\\|((S|M)01(XA|LA|AH)))$") ~ 
        stringr::str_replace_all(
          atc_code_level3, "(\\|((S|M)01(XA|LA|AH)))$", ""),
      stringr::str_detect(
        atc_code_level3, "L0[A-Z0-9]{1,}\\|L01XX") ~ 
        stringr::str_replace_all(
          atc_code_level3, "\\|L01XX", ""),
      atc_code_level3 == "D11AH|L01EJ" |
        atc_code_level3 == "D11AH|L01XF" |
        atc_code_level3 == "G03DA|L02AB" |
        atc_code_level3 == "D10AD|L01XF" |
        atc_code_level3 == "D11AH|L04AD" ~ 
        stringr::str_replace_all(
          atc_code_level3, "^(D10AD|G03DA|D11AH)\\|", ""),
      atc_code_level3 == "L01FG|S01LA" |
        atc_code_level3 == "L01EG|S01XA" |
        atc_code_level3 == "L01BB|L04AA" |
        atc_code_level3 == "L01BA|L04AX" |
        atc_code_level3 == "L01XX|M01AH" ~ 
        stringr::str_replace_all(
          atc_code_level3, "\\|(S01LA|S01XA|L04AA|L04AX|M01AH)", ""),
      TRUE ~ atc_code_level3
    )) |>
    dplyr::distinct()
  
  
  atc_classified_drugs <- classified_drugs_all |>
    dplyr::filter(!is.na(drug_entry)) |>
    dplyr::select(atc_code_level3, drug_entry) |>
    dplyr::filter(!is.na(atc_code_level3)) |>
    tidyr::separate_rows(atc_code_level3, sep = "\\|") |>
    dplyr::filter(!is.na(atc_code_level3)) |>
    dplyr::left_join(
      dplyr::select(
        atc_classification_clean,
        atc_code_level1,
        atc_level1,
        atc_code_level2,
        atc_level2,
        atc_code_level3,
        atc_level3
      ), relationship ="many-to-many"
    ) |>
    dplyr::group_by(drug_entry) |>
    dplyr::summarise(
      atc_code_level1 = paste(unique(atc_code_level1), collapse="|"),
      atc_level1 = paste(unique(stringr::str_trim(atc_level1)), collapse="|"),
      atc_code_level2 = paste(unique(atc_code_level2), collapse="|"),
      atc_level2 = paste(unique(stringr::str_trim(atc_level2)), collapse="|"),
      atc_code_level3 = paste(unique(atc_code_level3), collapse="|"),
      atc_level3 = paste(unique(stringr::str_trim(atc_level3)), collapse="|")) |>
    dplyr::mutate(atc_code_level1 = dplyr::if_else(
      !is.na(atc_code_level1),
      stringr::str_replace(
        atc_code_level1,"(^NA\\|)|\\|NA$",""),
      as.character(atc_code_level1)
    )) |>
    dplyr::mutate(atc_level1 = dplyr::if_else(
      !is.na(atc_level1),
      stringr::str_replace(
        atc_level1,"(^NA\\|)|\\|NA$",""),
      as.character(atc_level1)
    )) |>
    dplyr::mutate(atc_code_level2 = dplyr::if_else(
      !is.na(atc_code_level2),
      stringr::str_replace(
        atc_code_level2,"(^NA\\|)|\\|NA$",""),
      as.character(atc_code_level2)
    )) |>
    dplyr::mutate(atc_level2 = dplyr::if_else(
      !is.na(atc_level2),
      stringr::str_replace(
        atc_level2,"(^NA\\|)|\\|NA$",""),
      as.character(atc_level2)
    )) |>
    dplyr::mutate(atc_code_level3 = dplyr::if_else(
      !is.na(atc_code_level3),
      stringr::str_replace(
        atc_code_level3,"(^NA\\|)|\\|NA$",""),
      as.character(atc_code_level3)
    )) |>
    dplyr::mutate(atc_level3 = dplyr::if_else(
      !is.na(atc_level3),
      stringr::str_replace(
        atc_level3,"(^NA\\|)|\\|NA$",""),
      as.character(atc_level3)
    )) |>
    dplyr::mutate(atc_code_level3 = dplyr::if_else(
      !is.na(atc_code_level3),
      stringr::str_replace(
        atc_code_level3,"L01XX\\|L01XXK","L01XXK"),
      as.character(atc_code_level3)
    )) |>
    dplyr::mutate(atc_level3 = dplyr::if_else(
      !is.na(atc_level3),
      stringr::str_replace(
        atc_level3,
        "Other antineoplastic agents\\|Tubulin inhibitors",
        "Tubulin inhibitors"),
      as.character(atc_level3)
    )) |>
    dplyr::mutate(atc_level3 = dplyr::if_else(
      !is.na(atc_level3),
      stringr::str_replace(
        atc_level3,
        "Folic acid analogues|Other immunosuppressants",
        "Folic acid analogues"),
      as.character(atc_level3)
    ))
  

  classified_drugs_all_final <- classified_drugs_all |>
    dplyr::select(-atc_code_level3) |>
    dplyr::left_join(atc_classified_drugs, by = "drug_entry",
                     relationship = "many-to-many") |>
    dplyr::mutate(atc_treatment_category = "unknown") |>
    dplyr::mutate(atc_treatment_category = dplyr::case_when(
      !is.na(atc_code_level2) &
        stringr::str_detect(
          atc_code_level2, "(\\|L02(A|B))|(^L02(A|B))") ~ 
        "cancer_hormone_therapy",
      !is.na(target_symbol) & 
      !is.na(atc_code_level2) &
        stringr::str_detect(
          atc_code_level2, "(\\|L01(E|F))|(^L01(E|F))"
        ) ~ "cancer_targeted_therapy",
      !is.na(target_symbol) &
      !is.na(atc_code_level3) &
        stringr::str_detect(
          atc_code_level3, "(\\|L01XX[A-J])|(^L01XX[A-J])"
        ) ~ "cancer_targeted_therapy",
      !is.na(target_symbol) &
        !is.na(atc_code_level3) &
        stringr::str_detect(
          atc_code_level3, "(\\|L01XXM)|(^L01XXM)"
        ) ~ "cancer_targeted_therapy",
      !is.na(target_symbol) &
        !is.na(atc_code_level3) &
        stringr::str_detect(
          atc_code_level3, "(\\|L04)|(^L04)"
        ) ~ "cancer_immuno_suppressants",
      !is.na(target_symbol) &
        stringr::str_detect(drug_cancer_relevance,"^by_cancer") & 
        (is.na(atc_code_level3) | 
           atc_code_level3 == "NA" | 
           atc_code_level3 == "L01XX")
         ~ "cancer_unclassified",
      !is.na(target_symbol) &
      !is.na(atc_code_level3) &
        stringr::str_detect(
          atc_code_level3, "(\\|L01X[A-L])|(^L01X[A-L])"
        ) ~ "cancer_targeted_therapy",
      !is.na(atc_code_level2) &
        !stringr::str_detect(
          atc_code_level2, "(\\|L01(E|F))|(^L01(E|F))") &
        
        ((!is.na(atc_code_level3) &
            drug_type != "Antibody" &
        stringr::str_detect(atc_code_level3, "L01XXK")) |
          (!is.na(atc_code_level3) &
          stringr::str_detect(atc_code_level3,"L01X(A|B)")) |
        (stringr::str_detect(
          atc_code_level2, "(L01(A|B|C|D))"
          ))) ~ "cancer_chemo_therapy",
      (!is.na(atc_code_level3) &
          drug_type == "Antibody" &
          stringr::str_detect(
            atc_code_level3, "L01XXK")
       ) ~ "cancer_adc_targeted_therapy",
      !is.na(target_symbol) &
      !is.na(atc_code_level2) &
        atc_code_level2 != "NA" &
        !stringr::str_detect(
          atc_code_level2, "\\|L0|^L0"
          ) ~ "other_targeted_therapy",
      TRUE ~ as.character(atc_treatment_category)
    ))
  
  
  cancer_targets_classified <- classified_drugs_all_final |>
    dplyr::filter(
      (drug_cancer_relevance == "by_cancer_condition_otp" |
         drug_cancer_relevance == "by_cancer_target_otp" |
        drug_cancer_relevance == "by_cancer_target_nci") & 
        atc_treatment_category == "cancer_targeted_therapy") |>
    dplyr::select(target_symbol) |>
    dplyr::filter(!is.na(target_symbol)) |>
    dplyr::filter(!stringr::str_detect(target_symbol,"^(RP(S|L)|TUB)")) |>
    dplyr::distinct() |>
    dplyr::mutate(cancer_target_classified = TRUE)
  
  classified_drugs_all_final <- classified_drugs_all_final |>
    dplyr::left_join(cancer_targets_classified) |>
    dplyr::mutate(cancer_target_classified = dplyr::if_else(
      is.na(target_symbol) | is.na(cancer_target_classified),
      FALSE,
      as.logical(cancer_target_classified)
    ))
    
  drug_df <- remove_duplicate_chembl_ids(
    drug_df = classified_drugs_all_final)
  
  drug_df$drug_entry <- NULL
  
  return(drug_df)
  
}


#' Remove black-listed drug entries
#'
#' Despite its name, this function does not inspect ChEMBL identifiers:
#' it drops every row of `drug_df` whose `nci_cd_name` appears in a
#' black-list file (read as a header-less TSV; only the first column
#' is used).
#'
#' @param drug_df Data frame of drug records with an `nci_cd_name` column.
#' @param black_list_file Path to the black-list file. The default is the
#'   location that used to be hard-coded; it is resolved relative to the
#'   current working directory.
#' @return `drug_df` without the rows whose `nci_cd_name` is black-listed.
#'
remove_duplicate_chembl_ids <- function(
    drug_df = NULL,
    black_list_file = "data-raw/drug_name_black_list.txt"){

  black_list <-
    readr::read_tsv(
      file = black_list_file,
      col_names = FALSE, show_col_types = FALSE)

  ## If the black list holds no rows (or no first column 'X1'), there
  ## is nothing to remove - and the join below would fail on the
  ## missing key column
  if(nrow(black_list) == 0 || !("X1" %in% colnames(black_list))){
    return(drug_df)
  }

  ## Name the key column after the column it is matched against; force
  ## character so that type guessing by readr cannot break the join
  black_list$nci_cd_name <- as.character(black_list$X1)

  drug_df <- drug_df |> dplyr::anti_join(
    black_list, by = "nci_cd_name"
  )

  return(drug_df)

}

#' Clean the final drug list and split it into lookup tables
#'
#' Removes records that are out of scope (selected formulations,
#' radioimmunoconjugates, vaccines, compounds whose name starts with an
#' element name/symbol such as "Carbon C"), harmonises the drug name,
#' aggregates per-drug annotations (action type, max clinical phase,
#' NCI identifiers/definitions, black box warnings, inhibition flag) and
#' finally splits the result into tables keyed by a numeric `drug_id`.
#'
#' @param drug_df Data frame of drug records. It must provide the columns
#'   referenced below (e.g. `drug_name`, `nci_cd_name`, `drug_type`,
#'   `drug_action_type`, `molecule_chembl_id`, `nci_concept_definition`,
#'   `nci_concept_synonym_all`, `drug_synonyms`, the `target_*`,
#'   `disease_efo_*`, `cui*`, `atc_*` columns, ...).
#' @return A named list of data frames: `id2name`, `id2target`,
#'   `id2indication`, `id2synonym` and `id2basic`. If the drug name to
#'   drug id mapping turns out not to be unique, a fatal message is
#'   logged and `0` is returned instead.
#'
clean_final_drug_list <- function(drug_df = NULL){

  ## ---- 1. Drop out-of-scope records, fill in missing names ----
  pharmaoncox <- drug_df |>
    dplyr::filter(
      is.na(nci_cd_name) |
      !stringr::str_detect(
        nci_cd_name,
        "Sustained-release| Bead(s)?| Compound|Vector|Pegylated")
    ) |>
    ## NOTE(review): '(immuno)' is a mandatory group here, i.e. only
    ## "radioimmunoconjugate" is matched, not "radioconjugate" -
    ## confirm that this is intended
    dplyr::filter(
      is.na(nci_concept_definition) |
      !stringr::str_detect(
        tolower(nci_concept_definition),
        "a( (synthetic|diagnostic|targeted))?( radio(immuno)conjugate)")
    ) |>
    dplyr::filter(
      is.na(drug_action_type) |
      !stringr::str_detect(tolower(drug_action_type),"vaccine")
    ) |>
    dplyr::filter(
      is.na(nci_concept_definition) |
        !stringr::str_detect(
          tolower(nci_concept_definition),
          "(^(a|any)|cancer|tumor|dna|autologous|cell-based|synthetic|cell|peptide|valent)( cell)? vaccine")
    ) |>
    dplyr::filter(
      is.na(nci_cd_name) |
      !stringr::str_detect(
        nci_cd_name,
        "^(Carbon C|Fluorine F|Gallium Ga|Indium In|Iodine I|Lutetium Lu|Technetium Tc|Yttrium Y)"
      )
    ) |>
    dplyr::filter(is.na(nci_cd_name) |
                    nci_cd_name != "Abivertinib Maleate") |>
    ## fall back to the (title-cased) drug name when no NCI name exists
    dplyr::mutate(nci_cd_name = dplyr::if_else(
      is.na(nci_cd_name),
      stringr::str_to_title(drug_name),
      as.character(nci_cd_name)
    )) |>
    dplyr::mutate(is_salt = dplyr::if_else(
      nci_cd_name == "Avitinib Maleate",
      as.logical(NA),
      as.logical(is_salt)
    )) |>
    dplyr::mutate(drug_name = dplyr::if_else(
      nci_cd_name == "Doxycycline" &
        !is.na(molecule_chembl_id) &
        molecule_chembl_id == "CHEMBL1433",
      "DOXYCYCLINE",
      as.character(drug_name)))


  ## ---- 2. One (collapsed) action type per NCI drug name ----
  ## missing action types end up as the string "NA" after paste()
  drug_action_types <- as.data.frame(
    pharmaoncox |>
      dplyr::select(nci_cd_name, drug_action_type) |>
      dplyr::distinct() |>
      dplyr::group_by(nci_cd_name) |>
      dplyr::summarise(drug_action_type = paste(
        drug_action_type, collapse = "/"
      ))
  )

  pharmaoncox$drug_action_type <- NULL
  pharmaoncox <- pharmaoncox |>
    dplyr::left_join(
      drug_action_types,
      by = "nci_cd_name",
      relationship = "many-to-many") |>
    dplyr::select(drug_name,
                  nci_cd_name,
                  drug_type,
                  drug_action_type,
                  molecule_chembl_id,
                  drug_moa,
                  drug_max_phase_indication,
                  dplyr::everything())


  ## ---- 3. Max clinical trial phase per NCI drug name ----
  ## NOTE(review): max() is called without na.rm, so a single missing
  ## phase makes the maximum of that drug NA - confirm this is intended
  drug_max_ct_phase <- as.data.frame(
    pharmaoncox |>
      dplyr::select(nci_cd_name, drug_max_ct_phase) |>
      dplyr::group_by(nci_cd_name) |>
      dplyr::summarise(drug_max_ct_phase = max(drug_max_ct_phase))
  )

  pharmaoncox$drug_max_ct_phase <- NULL

  ## ---- 4. Synonyms, column order, ASCII conversion ----
  pharmaoncox <- pharmaoncox |>
    dplyr::left_join(drug_max_ct_phase,
                     by = "nci_cd_name",
                     relationship = "many-to-many") |>
    dplyr::select(-c(drug_moa)) |>
    ## use 'drug_synonyms' only when no NCI synonyms are available
    dplyr::mutate(nci_concept_synonym2 = dplyr::if_else(
      is.na(nci_concept_synonym_all) & !is.na(drug_synonyms),
      as.character(tolower(drug_synonyms)),
      as.character(tolower(nci_concept_synonym_all))
    )) |>
    tidyr::separate_rows(nci_concept_synonym2,
                         sep="\\|") |>
    dplyr::rename(nci_concept_synonym = nci_concept_synonym2) |>
    dplyr::select(-c(drug_synonyms,
                     drug_tradenames,
                     drug_description)) |>
    dplyr::distinct() |>
    dplyr::select(drug_name, nci_cd_name, drug_type,
                  drug_action_type, drug_cancer_relevance,
                  molecule_chembl_id,
                  drug_max_phase_indication, drug_max_ct_phase,
                  target_genename, target_symbol,
                  target_type, target_ensembl_gene_id,
                  target_entrezgene,
                  disease_efo_id, disease_efo_label,
                  cui, cui_name, primary_site,
                  nci_concept_synonym,
                  nci_concept_synonym_all,
                  dplyr::everything()) |>
    dplyr::mutate(nci_concept_definition =
                    stringi::stri_enc_toascii(nci_concept_definition)) |>
    dplyr::mutate(nci_concept_synonym_all =
                    stringi::stri_enc_toascii(nci_concept_synonym_all)) |>
    dplyr::mutate(nci_concept_synonym =
                    stringi::stri_enc_toascii(nci_concept_synonym)) |>
    dplyr::mutate(drug_name =
                    stringi::stri_enc_toascii(drug_name)) |>
    dplyr::mutate(
      nci_cd_name =
        stringi::stri_enc_toascii(nci_cd_name)) |>
    ## strip the "NA" parts introduced by the paste() in step 2
    dplyr::mutate(drug_action_type = stringr::str_replace_all(
      drug_action_type, "/NA|NA/",""
    )) |>
    dplyr::mutate(idx = dplyr::row_number())


  ## ---- 5. Partition records by indication ----
  ## Simplify records with only "cancer" indications,
  ## mapping them to a unique
  ## EFO/CUI cross-ref, avoiding similar records
  ## with "neoplasm", "carcinoma" etc.

  pharmaoncox_cancer_no_indication <- pharmaoncox |>
    dplyr::filter(is.na(disease_efo_id))

  pharmaoncox_non_cancer <- pharmaoncox |>
    dplyr::filter(is.na(primary_site) & !is.na(disease_efo_id) &
                    drug_cancer_relevance == "by_other_condition_otp")

  ## indications without a primary site are mapped to the generic
  ## "cancer" term, and their clinical ids are collapsed per record
  pharmaoncox_cancer_NOS <- as.data.frame(
    pharmaoncox |>
      dplyr::filter(is.na(primary_site) & !is.na(disease_efo_id) &
                      drug_cancer_relevance != "by_other_condition_otp") |>
      dplyr::mutate(disease_efo_id = "EFO:0000311",
                    disease_efo_label = "cancer",
                    cui = "C0006826",
                    cui_name = "Malignant neoplastic disease") |>
      dplyr::group_by(
        dplyr::across(-dplyr::ends_with(c("drug_clinical_id")))) |>
      dplyr::summarise(
        drug_clinical_id = paste(unique(drug_clinical_id), collapse=","),
        .groups = "drop"
      ) |>
      dplyr::distinct()
  )

  pharmaoncox_cancer_specific <- pharmaoncox |>
    dplyr::filter(!is.na(primary_site))


  ## ---- 6. Re-assemble and decide on the final drug name ----
  pharmaoncox <- pharmaoncox_cancer_no_indication |>
    dplyr::bind_rows(pharmaoncox_non_cancer) |>
    dplyr::bind_rows(pharmaoncox_cancer_specific) |>
    dplyr::bind_rows(pharmaoncox_cancer_NOS) |>
    dplyr::arrange(nci_cd_name) |>
    dplyr::mutate(nci_cd_name = dplyr::if_else(
      stringr::str_detect(nci_cd_name,";-"),
      drug_name,
      as.character(nci_cd_name)
    )) |>
    dplyr::filter(!is.na(nci_cd_name)) |>
    dplyr::mutate(drug_name_final = dplyr::case_when(
      ## names differ, 'drug_name' contains digits: keep the NCI name
      ## when it ends in "ib", "mab" or "in", else the 'drug_name'
      !is.na(drug_name) &
        !is.na(nci_cd_name) &
        tolower(nci_cd_name) != tolower(drug_name) &
        stringr::str_detect(
          tolower(nci_cd_name),"(ib|mab|in)$") &
        stringr::str_detect(
          drug_name, "[0-9]{1,}") ~ nci_cd_name,
      !is.na(drug_name) &
        !is.na(nci_cd_name) &
        tolower(nci_cd_name) != tolower(drug_name) &
        !stringr::str_detect(
          tolower(nci_cd_name),"(ib|mab|in)$") &
        stringr::str_detect(
          drug_name, "[0-9]{1,}") ~ drug_name,
      !is.na(drug_name) &
        !is.na(nci_cd_name) &
        tolower(nci_cd_name) == tolower(drug_name) ~
        stringr::str_to_title(tolower(nci_cd_name)),
      is.na(drug_name) & !is.na(nci_cd_name) ~ nci_cd_name,
      TRUE ~ as.character(
        stringr::str_to_title(tolower(drug_name))
      )
    )) |>
    ## '%in%' (rather than '==') keeps records with a missing ChEMBL
    ## id: a comparison against NA would make filter() drop them
    dplyr::filter(
      !(drug_name_final == "CFI-400945" &
          molecule_chembl_id %in% "CHEMBL3408947") &
        !(drug_name_final == "Relebactam" &
            molecule_chembl_id %in% "CHEMBL3112741") &
        !(drug_name_final == "SR16234" &
            molecule_chembl_id %in% "CHEMBL3545210")
    )


  ## the final name replaces both 'drug_name' and 'nci_cd_name'
  pharmaoncox$drug_name <- NULL
  pharmaoncox$drug_name <- pharmaoncox$drug_name_final
  pharmaoncox$nci_cd_name <- NULL
  pharmaoncox$drug_name_final <- NULL
  pharmaoncox$nci_concept_synonym <- NULL

  ## ---- 7. Collapse per-drug annotations ----
  blackbox_warnings <- as.data.frame(pharmaoncox |>
    dplyr::select(drug_name, drug_blackbox_warning) |>
    dplyr::distinct() |>
      dplyr::group_by(drug_name) |>
    dplyr::summarise(drug_blackbox_warning = paste(
      unique(drug_blackbox_warning), collapse=","))
  )

  ## '.groups' belongs to summarise() (it used to be handed to
  ## as.data.frame(), where it was silently ignored)
  nci_t_map <- as.data.frame(pharmaoncox |>
    dplyr::select(drug_name, nci_t, nci_concept_definition) |>
    dplyr::distinct() |>
    dplyr::group_by(drug_name) |>
    dplyr::summarise(
      nci_t = paste(
        unique(nci_t), collapse=","),
      nci_concept_definition = paste(
        unique(nci_concept_definition),
        collapse = ","),
      .groups = "drop"
    )
  )

  pharmaoncox$nci_t <- NULL
  pharmaoncox$nci_concept_definition <- NULL
  pharmaoncox$drug_blackbox_warning <- NULL

  ## ---- 8. Infer missing action/drug types, inhibition flag ----
  pharmaoncox2 <- pharmaoncox |>
    dplyr::left_join(
      nci_t_map, by = "drug_name",
      relationship = "many-to-many") |>
    dplyr::left_join(
      blackbox_warnings, by = "drug_name",
      relationship = "many-to-many") |>
    dplyr::distinct() |>
    ## Drugs without an action type, whose name ends in "ib" and whose
    ## definition mentions "inhibitor", are taken to be small molecule
    ## inhibitors. 'drug_type' has to be set BEFORE 'drug_action_type':
    ## both rules test drug_action_type == "NA", so overwriting the
    ## action type first would prevent the drug type rule from ever
    ## matching
    dplyr::mutate(drug_type = dplyr::if_else(
      drug_action_type == "NA" &
        stringr::str_detect(tolower(drug_name),
                            "ib |ib$") &
        stringr::str_detect(tolower(nci_concept_definition),
                            "inhibitor"),
      "Small molecule",
      as.character(drug_type)
    )) |>
    dplyr::mutate(drug_action_type = dplyr::if_else(
      drug_action_type == "NA" &
        stringr::str_detect(tolower(drug_name),
                            "ib |ib$") &
        stringr::str_detect(tolower(nci_concept_definition),
                            "inhibitor"),
      "INHIBITOR",
      as.character(drug_action_type)
    )) |>

    ## inhibition flag: action type does not start with any of the
    ## listed terms, and neither the drug type nor the target type
    ## rules it out
    dplyr::mutate(inhibition_moa = dplyr::if_else(
      !stringr::str_detect(
        drug_action_type,
        paste0(
          "^(POSITIVE|NEGATIVE|AGONIST|HYDROLYTIC|",
          "CHELATING|PARTIAL|NA|INVERSE|OPENER|",
          "RELEASING|ANTISENSE|STABILISER|MODULATOR|",
          "SEQUESTERING|OXIDATIVE|OTHER|ACTIVATOR|RNA|",
          "VACCINE|DEGRADER|PROTEOLYTIC|SUBSTRATE)")) &
      (is.na(drug_type) |
         (drug_type != "Protein" &
            drug_type != "Gene" &
            drug_type != "Cell")) &
      (is.na(target_type) |
         target_type != "protein_protein_interaction"),
      as.logical(TRUE),
      as.logical(FALSE),
      as.logical(FALSE)
    )) |>
    ## antibodies with action type "OTHER" are flagged as well;
    ## '%in%' keeps a missing drug type from turning the flag into NA
    dplyr::mutate(inhibition_moa = dplyr::if_else(
      inhibition_moa == FALSE &
        drug_action_type == "OTHER" &
        (stringr::str_detect(
          tolower(drug_name),"mab$") |
           drug_type %in% "Antibody"),
      as.logical(TRUE),
      as.logical(inhibition_moa)
    ))

  ## one flag per drug: mixed values (pasted with "|") count as TRUE
  inhibition_moa_df <- as.data.frame(pharmaoncox2 |>
    dplyr::select(drug_name, inhibition_moa) |>
    dplyr::distinct() |>
    dplyr::group_by(drug_name) |>
    dplyr::summarise(inhibition_moa = paste(
      unique(inhibition_moa), collapse="|"
    ), .groups = "drop")
  ) |>
    dplyr::mutate(inhibition_moa = dplyr::if_else(
      stringr::str_detect(inhibition_moa, "\\|"),
      as.logical(TRUE),
      as.logical(inhibition_moa)
    ))

  ## ---- 9. Manually curated exclusions ----
  ## (ChEMBL id, drug name) combinations that are to be removed
  excluded_names_by_chembl_id <- c(
    "CHEMBL1742994" = "Brentuximab vedotin",
    "CHEMBL1908394" = "GSK-461364",
    "CHEMBL2158685" = "ABC-294640",
    "CHEMBL2331680" = "RG-7603",
    "CHEMBL253969" = "OSI-632",
    "CHEMBL283120" = "AXL-1717",
    "CHEMBL52885" = "ENMD-981693",
    "CHEMBL4650827" = "SNDX-5613 FREE BASE",
    "CHEMBL1645462" = "AC-480",
    "CHEMBL4303241" = "BAY-1161909",
    "CHEMBL4298098" = "SAR-408701",
    "CHEMBL4091801" = "APG115",
    "CHEMBL3623290" = "AZD-3759",
    "CHEMBL103" = "Mycophenolic Acid",
    "CHEMBL3545007" = "RG-7602",
    "CHEMBL3545003" = "MSC-2363318A",
    "CHEMBL4297310" = "ONC-201",
    "CHEMBL4297930" = "CYC-065"
  )
  excluded_id_name_pairs <- data.frame(
    molecule_chembl_id = names(excluded_names_by_chembl_id),
    drug_name = unname(excluded_names_by_chembl_id),
    stringsAsFactors = FALSE
  )

  ## ChEMBL ids that are removed only when no NCI identifier is
  ## attached (nci_t missing, or the string "NA" produced by paste())
  chembl_ids_requiring_nci_t <- c(
    "CHEMBL3899477", "CHEMBL3126004", "CHEMBL376408",
    "CHEMBL4297489", "CHEMBL17157", "CHEMBL1201231",
    "CHEMBL1433"
  )

  ## drugs removed regardless of identifier
  excluded_drug_names <- c(
    "Risedronate Sodium", "Reminertant", "Ibandronate Sodium"
  )

  pharmaoncox2$inhibition_moa <- NULL
  pharmaoncox2 <- pharmaoncox2 |>
    dplyr::left_join(
      inhibition_moa_df,
      by = "drug_name",
      relationship = "many-to-many") |>
    ## records with a missing ChEMBL id never match a listed pair
    dplyr::anti_join(
      excluded_id_name_pairs,
      by = c("molecule_chembl_id", "drug_name")) |>
    dplyr::filter(
      !(molecule_chembl_id %in% chembl_ids_requiring_nci_t &
          (nci_t == "NA" | is.na(nci_t)))) |>
    dplyr::filter(!(drug_name %in% excluded_drug_names))


  ## ---- 10. Build the lookup tables ----
  drug_maps <- list()
  drug_maps[['id2name']] <- pharmaoncox2 |>
    dplyr::select(drug_name) |>
    dplyr::distinct() |>
    dplyr::mutate(
      drug_id = dplyr::row_number())

  ## sanity check: exactly one id per drug name
  if(nrow(drug_maps[['id2name']]) != length(
    unique(pharmaoncox2$drug_name))){
    lgr::lgr$fatal("Ambiguous drug names")
    return(0)
  }

  pharmaoncox2 <- pharmaoncox2 |>
    dplyr::left_join(
      drug_maps[['id2name']],
      by = "drug_name",
      multiple = "all")

  drug_maps[['id2target']] <- pharmaoncox2 |>
    dplyr::select(drug_id,
                  target_symbol,
                  target_entrezgene,
                  target_genename,
                  target_ensembl_gene_id,
                  target_type) |>
    dplyr::distinct()

  drug_maps[['id2indication']] <- pharmaoncox2 |>
    dplyr::select(drug_id,
                  drug_max_phase_indication,
                  drug_approved_indication,
                  drug_frac_cancer_indications,
                  drug_approved_noncancer,
                  drug_n_indications,
                  drug_clinical_source,
                  drug_year_first_approval,
                  drug_clinical_id,
                  drug_max_ct_phase,
                  disease_efo_id,
                  disease_efo_label,
                  cui,
                  cui_name,
                  primary_site) |>
    dplyr::distinct()

  drug_maps[['id2synonym']] <- pharmaoncox2 |>
    dplyr::select(drug_id,
                  nci_concept_synonym_all) |>
    dplyr::distinct()

  drug_maps[['id2basic']] <- as.data.frame(
    pharmaoncox2 |>
    dplyr::select(
      drug_id,
      molecule_chembl_id,
      drug_type,
      drug_action_type,
      drug_cancer_relevance,
      inhibition_moa,
      is_salt,
      is_adc,
      drug_blackbox_warning,
      nci_concept_definition,
      nci_t,
      opentargets,
      atc_code_level1,
      atc_level1,
      atc_code_level2,
      atc_level2,
      atc_code_level3,
      atc_level3,
      atc_treatment_category) |>
      dplyr::distinct())

  return(drug_maps)

}


#' Expand drug aliases
#'
#' Collects aliases for every drug in `drug_index_map` from three sources
#' (primary drug name, NCI synonyms, PubChem synonyms of ChEMBL-mapped
#' compounds) and keeps only those that point to a single drug, are not
#' ordinary English words, and are not listed in
#' `data-raw/drug_names_ignore.tsv`.
#'
#' @param drug_index_map List of drug lookup tables; the elements
#'   `id2name`, `id2synonym` and `id2basic` are used.
#' @param path_data_raw Directory holding the raw data; PubChem synonym
#'   files are looked up in its `pubchem` sub-directory.
#' @param chembl_pubchem_datestamp Datestamp of the ChEMBL-PubChem mapping,
#'   handed on to `get_chembl_pubchem_xref()`. Must be provided (the former
#'   default referred to the argument itself, so omitting it always failed
#'   with a "recursive default argument reference" error).
#' @return A data frame with the columns `drug_id` (integer), `alias` and
#'   `alias_source`.
#'
expand_drug_aliases <- function(drug_index_map = NULL,
                                path_data_raw = NULL,
                                chembl_pubchem_datestamp = NULL){

  ## fail early, with an understandable message
  if(is.null(chembl_pubchem_datestamp)){
    stop("Argument 'chembl_pubchem_datestamp' must be provided",
         call. = FALSE)
  }

  chembl_pubchem_xref <-
    get_chembl_pubchem_xref(
      datestamp = chembl_pubchem_datestamp,
      path_data_raw = path_data_raw)

  ## the primary name of each drug is an alias of its own
  drugAliasPrimary <-
    drug_index_map[['id2name']] |>
    dplyr::select(drug_id, drug_name) |>
    dplyr::mutate(alias = drug_name) |>
    dplyr::select(-drug_name) |>
    dplyr::mutate(alias_source = "primaryName")


  ## NCI synonyms: one row per '|'-separated synonym
  drugAliasNCI <- as.data.frame(
    drug_index_map[['id2synonym']] |>
      dplyr::select(drug_id, nci_concept_synonym_all) |>
      tidyr::separate_rows(nci_concept_synonym_all, sep="\\|") |>
      dplyr::distinct() |>
      dplyr::rename(alias = nci_concept_synonym_all) |>
      dplyr::filter(nchar(alias) > 3) |>
      dplyr::mutate(alias_source = "nci")
  )

  ## Extend aliases with those found in PubChem

  ## get drug set that contains PubChem cross-references
  unique_chembl_pubchem <- drug_index_map[['id2basic']] |>
    dplyr::select(drug_id, molecule_chembl_id) |>
    dplyr::filter(!is.na(molecule_chembl_id)) |>
    dplyr::distinct() |>
    dplyr::left_join(
      chembl_pubchem_xref,
      by = "molecule_chembl_id",
      multiple = "all", relationship = "many-to-many") |>
    dplyr::filter(!is.na(pubchem_cid)) |>
    dplyr::select(-c(chembl_db_version))


  ## Retrieve aliases for drugs with PubChem x-refs
  lgr::lgr$info("Retrieving additional drug aliases/synonyms for PubChem-mapped compounds")
  pubchem_synonym_files <-
    sort(list.files(path = file.path(path_data_raw, "pubchem"),
                    pattern = "CID-Synonym-filtered_",
                    full.names = TRUE))

  ## Process one synonym file at a time and bind the matches once at the
  ## end, instead of growing a data frame inside a loop
  pubchem_alias_chunks <- lapply(pubchem_synonym_files, function(f){
    synonym_data <- as.data.frame(readr::read_tsv(
      f, col_names = c('pubchem_cid','alias'),
      col_types = "dc",
      progress = FALSE
    ))

    pubchem_alias_df <- synonym_data |>
      dplyr::inner_join(
        unique_chembl_pubchem,
        by = "pubchem_cid",
        multiple = "all", relationship = "many-to-many")

    if(nrow(pubchem_alias_df) == 0){
      return(NULL)
    }

    pubchem_alias_df |>
      dplyr::select(-c(pubchem_cid, molecule_chembl_id)) |>
      dplyr::mutate(alias_source = "pubchem")
  })
  drugAliasPubchem <- dplyr::bind_rows(
    Filter(Negate(is.null), pubchem_alias_chunks))

  ## ignore some drugs
  custom_name_ignore <- as.data.frame(readr::read_tsv(
    file = "data-raw/drug_names_ignore.tsv",
    col_names = FALSE, show_col_types = FALSE))
  colnames(custom_name_ignore) <- c("alias_lc")
  custom_name_ignore$alias_lc <- tolower(
    custom_name_ignore$alias_lc)


  drugAliasAll <- as.data.frame(drugAliasPrimary |>
    dplyr::bind_rows(drugAliasNCI) |>
    dplyr::bind_rows(drugAliasPubchem) |>
    dplyr::filter(nchar(alias) > 3 &
                    nchar(alias) < 250) |>
    dplyr::distinct() |>
    dplyr::left_join(drug_index_map[['id2name']],
                     by = "drug_id") |>

    ## avoid drug aliases for a particular drug that are identical
    ## to the primary drug name of another drug
    dplyr::filter(
      !(tolower(alias) != tolower(drug_name) &
          tolower(alias) %in% tolower(drugAliasPrimary$alias))) |>

    ## consider unambiguous drug aliases only: drug ids are pasted per
    ## alias, and any alias with more than one id (a comma) is dropped
    dplyr::group_by(alias) |>
    dplyr::summarise(
      alias_source = paste(unique(alias_source), collapse="|"),
      drug_id = paste(unique(drug_id), collapse=","),
      .groups = "drop") |>
      dplyr::filter(!stringr::str_detect(drug_id,",")) |>
      dplyr::distinct() |>
      dplyr::mutate(
        alias = stringi::stri_enc_toascii(alias)
      ) |>
      dplyr::mutate(
        alias_lc = tolower(alias)) |>

      ## remove aliases that co-incide with ordinary
      ## english words (pubchem): dictionary words are kept only
      ## when longer than 7 characters and not solely from PubChem
      dplyr::left_join(
        words::words,
        by = c("alias_lc" = "word"),
        multiple = "all", relationship = "many-to-many") |>
      dplyr::filter(
        is.na(word_length) |
          (!is.na(word_length) &
             word_length > 7 &
             (alias_source != "pubchem" ))) |>
      dplyr::anti_join(
        custom_name_ignore, by = "alias_lc") |>
      dplyr::select(-c(word_length,alias_lc)) |>
      dplyr::distinct() |>
      dplyr::select(drug_id, alias, alias_source) |>
      dplyr::mutate(
        drug_id = as.integer(drug_id)
      )

  )

  return(drugAliasAll)


}


#' Get MeSH drug categories
#'
#' Reads the MeSH descriptor file `d2022.bin` with
#' `whatamesh::read_mesh_file()` and builds a table with one row per
#' record: the `MH` field as drug name and the `PA` field(s), joined by
#' ";", as category.
#'
#' @param path_data_raw Base directory; the file is read from
#'   `<path_data_raw>/data-raw/mesh/d2022.bin`.
#'   NOTE(review): "data-raw" is appended to `path_data_raw` here, unlike
#'   in the other functions of this file - confirm the path is correct.
#' @return A data frame with the columns `drugname` and `category`
#'   (previously the assembled table was never returned and the function
#'   yielded `NULL`).
#'
get_mesh_drug_categories <- function(path_data_raw = NULL){

  mesh_drug_data <- whatamesh::read_mesh_file(
    file.path(path_data_raw,"data-raw",
              "mesh","d2022.bin")
  )

  ## one single-row data frame per record carrying a 'PA' entry;
  ## all of them are bound together once at the end
  drug_rows <- lapply(seq_len(nrow(mesh_drug_data)), function(i){
    mesh_record <- mesh_drug_data[i,]
    if(is.null(mesh_record$PA)){
      return(NULL)
    }
    data.frame(
      'drugname' = mesh_record$MH,
      'category' = paste(unlist(mesh_record$PA),
                         collapse=";"),
      stringsAsFactors = FALSE
    )
  })

  all_drugs <- as.data.frame(
    dplyr::bind_rows(Filter(Negate(is.null), drug_rows)))

  return(all_drugs)

}
# sigven/oncoPharmaDB documentation built on Dec. 24, 2024, 9:46 p.m.