data-raw/biomarker_utilities.R

clean_variant_consequence <- function(variant_df){
  
  variant_df <- variant_df |>
    dplyr::mutate(
      variant_consequence =
        dplyr::if_else(
          stringr::str_detect(variant,"Overexpression") &
            variant_consequence == "N/A",
          "transcript_amplification",
          as.character(variant_consequence))) |>
    dplyr::mutate(
      variant_consequence =
        dplyr::if_else(
          stringr::str_detect(variant,"Underexpression") &
            variant_consequence == "N/A",
          "transcript_ablation",
          as.character(variant_consequence))) |>
    dplyr::mutate(
      variant_consequence =
        dplyr::if_else(
          stringr::str_detect(
            variant_consequence,
            "^(plus_1_frameshift|minus_1_frameshift|frameshift_)"),
          "frameshift_variant",
          as.character(variant_consequence))) |>
    dplyr::mutate(variant_consequence = dplyr::if_else(
      variant_consequence == "" &
        stringr::str_detect(variant,"EXON") &
        stringr::str_detect(variant,"DELETION"),
      "exon_loss_variant",
      as.character(variant_consequence)
    )) |>
    dplyr::mutate(variant_consequence = dplyr::if_else(
      variant_consequence == "" &
        stringr::str_detect(variant,"Exon") &
        stringr::str_detect(variant,"Mutation"),
      "exon_variant",
      as.character(variant_consequence)
    )) |>
    dplyr::mutate(variant_consequence = dplyr::if_else(
      variant_consequence == "" &
        stringr::str_detect(variant,"^(Loss|Loss-of-function)$"),
      "loss_of_function_variant",
      as.character(variant_consequence)
    )) |>
    dplyr::mutate(variant_consequence = dplyr::if_else(
      stringr::str_detect(variant,"^(Activating |Gain-of)") &
        (is.na(variant_consequence) | nchar(variant_consequence) == 0),
      "gain_of_function_variant",
      as.character(variant_consequence)
    )) |>
    dplyr::mutate(variant_consequence = dplyr::if_else(
      stringr::str_detect(variant,"^Wildtype") &
        (is.na(variant_consequence) | nchar(variant_consequence) == 0),
      "wild_type",
      as.character(variant_consequence)
    ))

  return(variant_df)
}

set_alteration_type <- function(variant_df){
  
  variant_df <- variant_df |>
    dplyr::mutate(
      alteration_type =
        dplyr::if_else(
          stringr::str_detect(toupper(variant),"EXPRESSION"),
          "EXP",as.character(alteration_type))) |>
    dplyr::mutate(
      alteration_type =
        dplyr::case_when(
          alteration_type == "EXP" &
            stringr::str_detect(
              toupper(variant),"OVER") ~ "EXP_OVER",
          alteration_type == "EXP" &
            stringr::str_detect(
              toupper(variant),"UNDER") ~ "EXP_UNDER",
          TRUE ~ as.character(alteration_type)
        )
    ) |>
    dplyr::mutate(alteration_type = dplyr::if_else(
      variant_consequence == "stop_gained",
      "MUT_LOF",as.character(alteration_type)
    )) |>
    dplyr::mutate(
      alteration_type =
        dplyr::if_else(
          stringr::str_detect(
            toupper(variant),"LOSS-OF-FUNCTION|LOSS|TRUNCAT|INACTIVAT") |
            stringr::str_detect(
              variant_consequence, 
              "stop_gained,loss_of_function_variant|loss_of_function_variant,stop_gained") |
            (stringr::str_detect(
              variant_consequence, "loss_of_function_variant") &
               toupper(variant) == "MUTATION") |
            stringr::str_detect(variant_consequence,"truncation"),
          "MUT_LOF",as.character(alteration_type))) |>
    
    dplyr::mutate(alteration_type = dplyr::if_else(
      toupper(variant) == "BIALLELIC INACTIVATION",
      "MUT_LOF_BIALLELIC",
      as.character(alteration_type)
    )) |>
    dplyr::mutate(
      alteration_type =
        dplyr::if_else(
          alteration_type == 'MUT_LOF' &
            (stringr::str_detect(
              toupper(variant_consequence),"FRAMESHIFT") |
               stringr::str_detect(
                 variant_consequence,"feature_truncation")),
          "MUT_LOF_FS",as.character(alteration_type))) |>
    dplyr::mutate(
      alteration_type =
        dplyr::if_else(
          stringr::str_detect(
            toupper(variant), 
            "AMPLIFICATION|DELETION|COPY") &
            stringr::str_detect(
              variant_consequence, "_amplification|_ablation"),
          "CNA",as.character(alteration_type))) |>
    
    dplyr::mutate(
      alteration_type =
        dplyr::if_else(
          stringr::str_detect(
            toupper(variant_consequence),
            "TRANSCRIPT_FUSION|REGION_FUSION") |
            stringr::str_detect(toupper(variant),"FUSION") |
            (variant_consequence == "" & 
               stringr::str_detect(variant,"::")),
          "FUSION",
          as.character(alteration_type))) |>
    dplyr::mutate(
      alteration_type =
        dplyr::if_else(
          stringr::str_detect(
            toupper(variant),"REARRANGEMENT") &
            variant_consequence == "",
          "REARRANGEMENT",
          as.character(alteration_type)))
  
  return(variant_df)
  
  
}

clean_variant_naming <- function(
    variant_df, col = "molecular_profile_name"){
  
  variant_df[,"cc_clean"] <- variant_df[,col]
  variant_df <- variant_df |>
    dplyr::mutate(cc_clean = stringr::str_trim(cc_clean)) |>
    dplyr::mutate(cc_clean = dplyr::if_else(
      stringr::str_detect(cc_clean,"RS[0-9]{5,}"),
      stringr::str_replace_all(cc_clean,"RS","rs"),
      as.character(cc_clean))) |>
    dplyr::mutate(cc_clean = stringr::str_replace_all(
      cc_clean, 
      paste0(
        "(ENST[0-9]{1,}|",
        "NM_[0-9]{1,})(\\.[0-9]{1,})?:c\\."),"c."
    )) |>
    dplyr::mutate(cc_clean = stringr::str_replace_all(
      cc_clean, 
      paste0(
        "(ENSP[0-9]{1,}|NP_[0-9]{1,}",
        ")(\\.[0-9]{1,})?:p\\."),"p."
    )) |>
    dplyr::mutate(cc_clean = dplyr::if_else(
      stringr::str_detect(cc_clean, "^NC_[0-9]{1,}"),
      "", as.character(cc_clean)
    )) |>
    dplyr::mutate(cc_clean = dplyr::case_when(
      cc_clean == "D835H/Y" ~ "D835H,D835Y",
      cc_clean == "G12/G13" ~  "G12,G13",
      cc_clean == "Q157P/R" ~ "Q157P,Q157R",
      cc_clean == "S310F/Y" ~ "S310F,S310Y",
      cc_clean == "S34Y/F" ~ "S34Y,S34F",
      cc_clean == "S893A/T" ~ "S893A,S893T",
      TRUE ~ as.character(cc_clean)
    )) |> 
   
    dplyr::mutate(cc_clean = dplyr::if_else(
      stringr::str_detect(
        cc_clean, "^(CD44s|p16|PRPS1|IKZF1|NUCLEAR) "),
      stringr::str_replace_all(
        cc_clean, "^(CD44s|p16|PRPS1|IKZF1|NUCLEAR) ", ""),
      as.character(cc_clean))) |>
    dplyr::mutate(cc_clean = dplyr::if_else(
      stringr::str_detect(
        cc_clean, " NUCLEAR "),
      stringr::str_replace_all(
        cc_clean, " NUCLEAR ", ""),
      as.character(cc_clean))) |>
    dplyr::mutate(cc_clean = dplyr::if_else(
      stringr::str_detect(cc_clean, "IVS2\\+1G>A"),
      stringr::str_replace_all(
        cc_clean, "^IVS2\\+1G>A", "c.444+1G>A"),
      as.character(cc_clean))) |>
    dplyr::mutate(cc_clean = dplyr::if_else(
      stringr::str_detect(
        cc_clean, " HOMOZYGOSITY"),
      stringr::str_replace_all(
        cc_clean, " HOMOZYGOSITY", ""),
      as.character(cc_clean))) |>
    dplyr::mutate(cc_clean = dplyr::if_else(
      stringr::str_detect(
        cc_clean, "deletion and mutation"),
      stringr::str_replace_all(
        cc_clean, "deletion and mutation", "Mutation"),
      as.character(cc_clean))) |>
    dplyr::mutate(cc_clean = stringr::str_replace(
      cc_clean, " \\(.+\\)$","")) |>
    dplyr::mutate(cc_clean = dplyr::if_else(
      stringr::str_detect(cc_clean, "\\*$"),
      stringr::str_replace_all(cc_clean, "\\*$", "X"),
      as.character(cc_clean))) |>
    dplyr::mutate(cc_clean = dplyr::if_else(
      stringr::str_detect(cc_clean, "^\\*"),
      stringr::str_replace_all(cc_clean, "^\\*", "X"),
      as.character(cc_clean))) |>
    dplyr::mutate(cc_clean = dplyr::if_else(
      stringr::str_detect(cc_clean, "\\* "),
      stringr::str_replace_all(cc_clean, "\\* ", "X "),
      as.character(cc_clean))) |>
    dplyr::mutate(cc_clean = dplyr::if_else(
      stringr::str_detect(cc_clean, "Rare "),
      stringr::str_replace_all(cc_clean, "Rare ", ""),
      as.character(cc_clean))) |>
    dplyr::mutate(cc_clean = dplyr::if_else(
      stringr::str_detect(cc_clean, " expression"),
      stringr::str_replace_all(
        cc_clean, " expression", " Expression"),
      as.character(cc_clean))) |>
    dplyr::mutate(cc_clean = dplyr::if_else(
      stringr::str_detect(cc_clean, "BIALLELIC"),
      stringr::str_replace_all(
        cc_clean, "BIALLELIC", "Biallelic"),
      as.character(cc_clean))) |>
    dplyr::mutate(cc_clean = dplyr::if_else(
      stringr::str_detect(cc_clean, "MUTATION|mutation"),
      stringr::str_replace_all(
        cc_clean, "MUTATION|mutation", "Mutation"),
      as.character(cc_clean))) |>
    dplyr::mutate(cc_clean = dplyr::if_else(
      stringr::str_detect(cc_clean, "DOMAIN"),
      stringr::str_replace_all(
        cc_clean, "DOMAIN", "Domain"),
      as.character(cc_clean))) |>
    dplyr::mutate(cc_clean = dplyr::if_else(
      stringr::str_detect(cc_clean, "CONSERVED"),
      stringr::str_replace_all(
        cc_clean, "CONSERVED", "Conserved"),
      as.character(cc_clean))) |>
    dplyr::mutate(cc_clean = dplyr::if_else(
      stringr::str_detect(cc_clean, "DELETION"),
      stringr::str_replace_all(
        cc_clean, "DELETION", "Deletion"),
      as.character(cc_clean))) |>
    dplyr::mutate(cc_clean = dplyr::if_else(
      stringr::str_detect(
        cc_clean, "FRAMESHIFT|FRAME SHIFT"),
      stringr::str_replace_all(
        cc_clean, "FRAMESHIFT|FRAME SHIFT", "Frameshift"),
      as.character(cc_clean))) |>
    dplyr::mutate(cc_clean = dplyr::if_else(
      stringr::str_detect(cc_clean, "INACTIVATING"),
      stringr::str_replace_all(
        cc_clean, "INACTIVATING", "Inactivating"),
      as.character(cc_clean))) |>
    dplyr::mutate(cc_clean = dplyr::if_else(
      stringr::str_detect(cc_clean, "INACTIVATION"),
      stringr::str_replace_all(
        cc_clean, "INACTIVATION", "Inactivation"),
      as.character(cc_clean))) |>
    dplyr::mutate(cc_clean = dplyr::if_else(
      stringr::str_detect(cc_clean, "OVEREXPRESSION"),
      stringr::str_replace_all(
        cc_clean, "OVEREXPRESSION", "Overexpression"),
      as.character(cc_clean))) |>
    dplyr::mutate(cc_clean = dplyr::if_else(
      stringr::str_detect(cc_clean, "UNDEREXPRESSION"),
      stringr::str_replace_all(
        cc_clean, "UNDEREXPRESSION", "Underexpression"),
      as.character(cc_clean))) |>
    dplyr::mutate(cc_clean = dplyr::if_else(
      stringr::str_detect(cc_clean, "EXPRESSION"),
      stringr::str_replace_all(
        cc_clean, "EXPRESSION", "Expression"),
      as.character(cc_clean))) |>
    dplyr::mutate(cc_clean = dplyr::if_else(
      stringr::str_detect(cc_clean, "EXON"),
      stringr::str_replace_all(cc_clean, "EXON", "Exon"),
      as.character(cc_clean))) |>
    dplyr::mutate(cc_clean = dplyr::if_else(
      stringr::str_detect(cc_clean, "LOSS$"),
      stringr::str_replace_all(cc_clean, "LOSS", "Loss"),
      as.character(cc_clean))) |>
    dplyr::mutate(cc_clean = dplyr::if_else(
      stringr::str_detect(cc_clean, "AMPLIFICATION"),
      stringr::str_replace_all(
        cc_clean, "AMPLIFICATION", "Amplification"),
      as.character(cc_clean))) |>
    dplyr::mutate(cc_clean = dplyr::if_else(
      stringr::str_detect(cc_clean, "TRUNCATING"),
      stringr::str_replace_all(
        cc_clean, "TRUNCATING", "Truncating"),
      as.character(cc_clean))) |>
    dplyr::mutate(cc_clean = dplyr::if_else(
      stringr::str_detect(cc_clean, "TRUNCATION"),
      stringr::str_replace_all(
        cc_clean, "TRUNCATION", "Truncation"),
      as.character(cc_clean))) |>
    dplyr::mutate(cc_clean = dplyr::if_else(
      stringr::str_detect(
        cc_clean, "TRANSLOCATION|rearrangement"),
      stringr::str_replace_all(
        cc_clean, "TRANSLOCATION|rearrangement", "Rearrangement"),
      as.character(cc_clean)))
  
  if(col != "molecular_profile_name"){
    variant_df <- variant_df |>
      dplyr::mutate(cc_clean = dplyr::if_else(
        stringr::str_detect(cc_clean, "Binding Domain Mutation") &
          feature_name == "TP53",
        "aa_region:100-288",
        as.character(cc_clean))) |>
      dplyr::mutate(cc_clean = dplyr::if_else(
        stringr::str_detect(cc_clean, "SH2 Domain") &
          feature_name == "STAT3",
        "aa_region:584-651",
        as.character(cc_clean))) |>
      dplyr::mutate(cc_clean = dplyr::if_else(
        stringr::str_detect(cc_clean, "Kinase Domain") &
          feature_name == "ERBB2",
        "aa_region:722-975",
        as.character(cc_clean))) |>
      dplyr::mutate(cc_clean = dplyr::if_else(
        stringr::str_detect(cc_clean, "TKD ") &
          feature_name == "FLT3",
        "aa_region:610-942",
        as.character(cc_clean))) |>
      dplyr::mutate(cc_clean = dplyr::if_else(
        stringr::str_detect(cc_clean, "Conserved Domain") &
          feature_name == "TP53",
        paste0(
          "aa_region:117-143,aa_region:171-181,",
          "aa_region:234-258,aa_region:270-286"),
        as.character(cc_clean)))
    }
  variant_df <- variant_df |>
    dplyr::mutate(cc_clean = dplyr::if_else(
      stringr::str_detect(cc_clean, "N/A"),
      stringr::str_replace_all(cc_clean, "N/A", ""),
      as.character(cc_clean))) |>
    dplyr::distinct()
    
  variant_df[,col] <- variant_df[,"cc_clean"]
  variant_df[,"cc_clean"] <- NULL
  return(variant_df)
}

get_literature_references <- function(biomarker_items){
  
  ## set logging layout
  lgr::lgr$appenders$console$set_layout(
    lgr::LayoutFormat$new(timestamp_fmt = "%Y-%m-%d %T"))
  
  biomarker_items[['literature']] <- as.data.frame(
    biomarker_items[['clinical']] |>
      dplyr::filter(!is.na(citation_id)) |>
      tidyr::separate_rows(citation_id, sep=";") |> 
      dplyr::mutate(
        citation_id = stringr::str_trim(citation_id)) |> 
      dplyr::filter(
        !stringr::str_detect(
          tolower(citation_id),
          "asco|aacr|abstr|suppl|confex|jci|blood|annonc|caris")) |> 
      dplyr::distinct() |>
      dplyr::mutate(citation_id = dplyr::case_when(
        citation_id == "PMC3638050" ~ "PMID:22038996",
        citation_id == "PMC3936420" ~ "PMID:24265155",
        TRUE ~ as.character(
          stringr::str_replace(citation_id,"PMID: ","PMID:"))
      )) |>
      dplyr::filter(!stringr::str_detect(citation_id, "^[0-9]{1,5}$")) |>
      dplyr::filter(citation_id != "PMID:238900088" &
                      citation_id != "PMID:24569783" &
                      citation_id != "2556735") |>
      dplyr::mutate(citation_id = dplyr::case_when(
        stringr::str_detect(citation_id, "^FDA") ~ "FDA",
        stringr::str_detect(citation_id, "NCCN") ~ "NCCN",
        TRUE ~ as.character(citation_id)
      )) |>
      dplyr::mutate(citation_id = stringr::str_replace(
        citation_id, "PMID:",""
      )) |>
      
      dplyr::select(citation_id, evidence_id) |>
      dplyr::rename(source_id = citation_id) |>
      dplyr::mutate(source_id = as.character(source_id)) |>
      dplyr::group_by(source_id) |>
      dplyr::summarise(evidence_id = paste(
        unique(sort(evidence_id)), collapse = ";"
      ), .groups = "drop") |>
      dplyr::left_join(
        literatureVault::vault,
        by = "source_id", 
        relationship = "many-to-many",
        multiple = "all"
      )
  )
  
  biomarker_items[['clinical']]$citation_id <- NULL
  
  not_found <- biomarker_items$literature |>
    dplyr::filter(is.na(link)) |>
    dplyr::select(source, source_id)
  
  if(NROW(not_found) > 0){
    lgr::lgr$info(
      "Citations missing in sigven/literatureVault::vault - update needed")
    return(not_found)
  }else{
    lgr::lgr$info(
      "Found all literature references in sigven/literatureVault::vault"
    )
  }
  
  ## ignore molecular profiles/biomarkers that lack literature support
  biomarkers_with_literature_support <- as.data.frame(
    biomarker_items[['literature']] |>
      dplyr::select(evidence_id, source_id) |>
      tidyr::separate_rows(evidence_id, sep = ";") |>
      dplyr::group_by(evidence_id) |>
      dplyr::mutate(source_id = paste(
        unique(source_id), collapse = ";"
      )) |>
      dplyr::distinct()
  )
  
  biomarker_items[['clinical']] <- biomarker_items[['clinical']] |>
    dplyr::inner_join(
      biomarkers_with_literature_support,
      by = "evidence_id", multiple = "all",
      relationship = "many-to-many")
  
  return(biomarker_items)
  
}

#' #' A function that splits an array into chunks of equal size
#' chunk <- function(x,n) split(x, factor(sort(rank(x)%%n)))
#' 
#' #' A function that returns a citation with first author, journal and year for a PubMed ID
#' #'
#' #' @param pmid An array of Pubmed IDs
#' #' @param chunk_size Size of PMID chunks
#' #'
#' #' @export
#' #'
#' #' @return citation PubMed citation, with first author, journal and year
#' #'
#' get_citations_pubmed <- function(
#'     pmid,
#'     pmid_cache_dir = NA,
#'     chunk_size = 2){
#'   
#'   ## make chunk of maximal 400 PMIDs from input array (limit by EUtils)
#'   pmid_chunks <- chunk(pmid, ceiling(length(pmid)/chunk_size))
#'   j <- 0
#'   all_citations <- data.frame()
#'   cat('Retrieving PubMed citations for PMID list, total length', length(pmid))
#'   cat('\n')
#'   while(j < length(pmid_chunks)){
#'     #cat(unlist(pmid_chunks),"\n")
#'     pmid_chunk <- pmid_chunks[[as.character(j)]]
#'     #cat(unlist(pmid_chunk),"\n")
#'     #cat('Processing chunk ',j,' with ',length(pmid_chunk),'PMIDS')
#'     #cat('\n')
#'     pmid_string <- paste(pmid_chunk,collapse = " ")
#'     res <- RISmed::EUtilsGet(
#'       RISmed::EUtilsSummary(pmid_string, type="esearch", db="pubmed", retmax = 5000)
#'     )
#'     
#'     
#'     year <- RISmed::YearPubmed(res)
#'     authorlist <- RISmed::Author(res)
#'     pmid_list <- RISmed::PMID(res)
#'     i <- 1
#'     first_author <- c()
#'     while(i <= length(authorlist)){
#'       
#'       
#'       if(length(authorlist[[i]]) == 5){
#'         first_author <- c(
#'           first_author,
#'           paste(authorlist[[i]][1,]$LastName," et al.",sep=""))
#'       }else{
#'         first_author <- c(
#'           first_author, as.character("Unknown et al.")
#'         )
#'       }
#'       i <- i + 1
#'     }
#'     journal <- RISmed::ISOAbbreviation(res)
#'     citations <- data.frame('pmid' = as.integer(pmid_list),
#'                             'citation' = paste(first_author,year,journal,sep=", "),
#'                             stringsAsFactors = F)
#'     citations$link <- paste0('<a href=\'https://www.ncbi.nlm.nih.gov/pubmed/',
#'                              citations$pmid,'\' target=\'_blank\'>',
#'                              citations$citation,'</a>')
#'     all_citations <- dplyr::bind_rows(
#'       all_citations, citations)
#'     j <- j + 1
#'   }
#'   
#'   return(all_citations)
#'   
#' }
#' 
#' 
#' get_literature_links <- function(literature_df) {
#'   
#'   stopifnot(is.data.frame(literature_df))
#'   assertable::assert_colnames(
#'     literature_df, c("source","source_id","evidence_id"),
#'     quiet = T)
#'   
#'   pubmed_source_ids <- literature_df |>
#'     dplyr::filter(source == "PubMed")
#'   
#'   other_source_ids <- literature_df |>
#'     dplyr::filter(source != "PubMed")
#'   
#'   
#'   if (NROW(pubmed_source_ids) > 0) {
#'     pubmed_source_ids <- get_citations_pubmed(
#'       pmid = pubmed_source_ids$source_id, chunk_size = 100) |>
#'       dplyr::rename(
#'         name = citation,
#'       ) |>
#'       dplyr::mutate(source_id = as.character(pmid)) |>
#'       dplyr::select(-pmid) |>
#'       dplyr::left_join(
#'         pubmed_source_ids, by = "source_id")
#'   }
#'     
#'   if (NROW(other_source_ids) > 0) {
#'     other_source_ids <- other_source_ids |>
#'       dplyr::filter(source != "PubMed") |>
#'       dplyr::mutate(name = source_id) |>
#'       dplyr::mutate(link = dplyr::case_when(
#'         source == "FDA" ~
#'           "<a href='https://www.fda.gov/drugs/resources-information-approved-drugs/oncology-cancer-hematologic-malignancies-approval-notifications' target='_blank'>FDA approvals</a>",
#'         source == "NCCN" ~
#'           "<a href='https://www.nccn.org/guidelines/category_1' target='_blank'>NCCN guidelines</a>",
#'         source == "EMA" ~ "<a href='https://www.ema.europa.eu/en/medicines/field_ema_web_categories%253Aname_field/Human' target='_blank'>European Medicines Agency (EMA)</a>",
#'         source == "clinicaltrials.gov" ~ paste0(
#'           "<a href='https://clinicaltrials.gov/ct2/show/", 
#'           source_id, "' target='_blank'>", 
#'           source_id, "</a>")
#'       ))
#'   }
#'   
#'   all_literature_links <- pubmed_source_ids |>
#'     dplyr::select(source, source_id, 
#'                   evidence_id, 
#'                   dplyr::everything())
#'   
#'   if(NROW(other_source_ids) > 0){
#'     all_literature_links <- 
#'       other_source_ids |>
#'       dplyr::bind_rows(pubmed_source_ids)
#'   }
#'   
#'   return(all_literature_links)
#'   
#'   
#' }


get_amino_acid_dictionary <- function() {
  aa_df <- 
    data.frame('one_letter' = names(Biostrings::AMINO_ACID_CODE), 
               'three_letter' = Biostrings::AMINO_ACID_CODE, 
               'three_letter_uc' = toupper(Biostrings::AMINO_ACID_CODE),
               stringsAsFactors = F) |>
    dplyr::filter(one_letter != "B" &
                    one_letter != "J" &
                    one_letter != "O" &
                    one_letter != "U" &
                    one_letter != "X" &
                    one_letter != "Z")
  
  rownames(aa_df) <- NULL
  return(aa_df)
  
  
}


expand_hgvs_terms <- function(var, aa_dict, add_codon_markers = FALSE) {
  hits <- c()
  
  if (stringr::str_detect(toupper(var),"([A-Z]{3}[0-9]{1,}(([A-Z]{3}) {0,1})([0-9]{1,})?)") &
     !stringr::str_detect(toupper(var),"(DEL|INS|DUP)|=|-|\\*")) {
    aa <- stringr::str_to_title(strsplit(var, "[0-9]{1,}")[[1]])
    codon <- stringr::str_match(var, "[0-9]{1,}")[[1]]

    
    if (length(aa) == 1) {
      alt1 <- paste0(aa[1],codon)
      alt2 <- paste0('p.', alt1)
      if (aa[1] != "Ter") {
        if (aa[1] %in% aa_dict$three_letter) {
          aa1 <- seqinr::a(aa[1])
          if (!is.na(aa1)) {
            alt3 <- paste0(seqinr::a(aa[1]), codon)
            hits <- c(alt1, alt2, alt3)
          }
        }
      }else{
        alt3 <- paste0("X", codon)
        hits <- c(alt1, alt2, alt3)

      }
    }else{
      if (length(aa) == 2) {
        alt1 <- stringr::str_replace(
          stringr::str_replace(paste0(aa[1],codon, aa[2]),"(FS|Fs)$","fs"),
          "ter","Ter")
        alt2 <- stringr::str_replace(
          stringr::str_replace(paste0('p.', alt1),"(FS|Fs)$","fs"),
          "ter","Ter")
        if (aa[1] != "Ter" & 
           aa[2] != "Ter" & 
           aa[2] != "Fs" &
           nchar(aa[1]) == 3 & 
           nchar(aa[2]) == 3 &
           aa[1] %in% aa_dict$three_letter &
           aa[2] %in% aa_dict$three_letter) {
          alt3 <- paste0(seqinr::a(aa[1]), codon, seqinr::a(aa[2]))
          alt4 <- paste0('p.', alt3)
          hits <- c(alt1, alt2, alt3, alt4)
        }
        else{
          if (aa[1] == "Ter") {
            alt3 <- paste0("X", codon, seqinr::a(aa[2]))
            alt4 <- paste0('p.', alt3)
            hits <- c(alt1, alt2, alt3, alt4)
          }else{
            if (aa[2] == "Ter") {
              if (aa[1] %in% aa_dict$three_letter) {
                alt3 <- paste0(seqinr::a(aa[1]), codon, "X")
                alt4 <- paste0('p.', alt3)
                hits <- c(alt1, alt2, alt3, alt4)
              }
            }
            if (aa[2] == "Fs") {
              if (aa[1] %in% aa_dict$three_letter) {
                alt3 <- paste0(seqinr::a(aa[1]), codon, "fs")
                alt4 <- paste0('p.', alt3)
                hits <- c(alt1, alt2, alt3, alt4)
              }
            }

          }
        }
      }
    }
  }
  if (stringr::str_detect(var,"^([A-Z]{1}[0-9]{1,}[A-Z]{0,1})$") &
     !stringr::str_detect(var, "FS|DUP|DEL|INS|fs|dup|del|ins|\\*")) {
    aa <- stringr::str_to_title(strsplit(var, "[0-9]{1,}")[[1]])
    codon <- stringr::str_match(var, "[0-9]{1,}")[[1]]

    if (length(aa) == 1) {
      alt1 <- paste0(aa[1],codon)
      alt2 <- paste0('p.', alt1)
      if (aa[1] != "X" & aa[1] %in% aa_dict$one_letter) {
        alt3 <- paste0(seqinr::aaa(aa[1]), codon)
        hits <- c(alt1, alt2, alt3)
      }else{
        alt3 <- paste0("Ter", codon)
        hits <- c(alt1, alt2, alt3)
      }
    }else{
      if (length(aa) == 2) {
        alt1 <- paste0(aa[1],codon, aa[2])
        alt2 <- paste0('p.', alt1)
        if (aa[1] != "X" & 
           aa[2] != "X" & 
           aa[1] %in% aa_dict$one_letter &
           aa[2] %in% aa_dict$one_letter) {
          alt3 <- paste0(seqinr::aaa(aa[1]),
                         codon, seqinr::aaa(aa[2]))
          alt4 <- paste0('p.', alt3)
          
          if(aa[2] == aa[1]){
            alt5 <-  paste0("p.", seqinr::aaa(aa[1]),
                            codon, "=")
            alt6 <- paste0("p.", aa[1],
                           codon, "=")
            hits <- c(alt1, alt2, alt3, alt4, alt5, alt6)
          }else{
            hits <- c(alt1, alt2, alt3, alt4)
            
          }
          
        }else{
          if (aa[1] == "X" & aa[2] %in% aa_dict$one_letter) {
            alt3 <- paste0("Ter",
                           codon, seqinr::aaa(aa[2]))
            alt4 <- paste0('p.', alt3)
            hits <- c(alt1, alt2, alt3, alt4)
          }else{
            if (aa[2] == "X" & aa[1] %in% aa_dict$one_letter) {
              alt3 <- paste0(seqinr::aaa(aa[1]),
                             codon, "Ter")
              alt4 <- paste0('p.', alt3)
              hits <- c(alt1, alt2, alt3, alt4)
            }

          }
        }

      }
    }
  }
  if (stringr::str_detect(var,"^C\\.")) {
    var <- stringr::str_replace(var,"^C\\.","c.")
    #hits <- res
  }
  if (stringr::str_detect(var,"^P\\.")) {
    var <- stringr::str_replace(var,"^P\\.","p.")
    #hits <- res
  }
  if (stringr::str_detect(var,"FS$")) {
    var <- stringr::str_replace(var,"FS$","fs")
  }
  if (stringr::str_detect(var,";") & stringr::str_detect(var,"\\(")) {
    var <- tolower(var)
  }
  if (stringr::str_detect(var,"^([A-Z]{1}[0-9]{1,}[A-Z]{1})$")) {
    res1 <- paste0('p.', var)
    if(add_codon_markers == TRUE){
      res2 <- stringr::str_match(var,"^([A-Z]{1}[0-9]{1,})")[[1]]
      hits <- c(hits,res1,res2)
    }else{
      hits <- c(hits, res1)
    }
  }
  if (stringr::str_detect(var,"FS$")) {
    res <- stringr::str_replace(var,"FS$","fs")
    hits <- res
  }
  if (stringr::str_detect(var,"(DEL|Del|INS|Ins|Dup|DUP)") &
     !stringr::str_detect(var, "INSERTION|DUPLICATION|NM_|NP_|DELETERIOUS|DELETION|MUTATION|Mutation|Deletion|")) {
    var <- stringr::str_replace(var,"(DEL|Del)","del")
    var <- stringr::str_replace(var,"(INS|Ins)","ins")
    var <- stringr::str_replace(var,"(DUP|Dup)","dup")
    if (!(stringr::str_detect(var,"^(c|p|rs|;|[0-9])"))) {
      res2 <- paste0('p.', var)
      hits <- c(var, res2)
    }
  }

  if (length(hits) == 0 & var != "") {
    if (!(stringr::str_detect(var,"^(c|rs|p|[0-9])")) &
       stringr::str_detect(var, "[0-9]{1,}") &
       !stringr::str_detect(
         var,"EXON|;|MUTATION|Mutation|-|NM_|NP_|DELETERIOUS|Deletion|INTRON|DELETION|DUPLICATION")) {

      if (stringr::str_detect(var,"FS$|DUP|DELINS|FS\\*")) {
        var <- stringr::str_replace(var, "FS","fs")
        var <- stringr::str_replace(var, "DUP","dup")
        var <- stringr::str_replace(var, "DELINS","delins")
        var <- stringr::str_replace(var, "DEL","del")
      }
      res1 <- paste0('p.',var)
      hits <- c(var, res1)
    }else{
      hits <- c(var)
    }
  }

  hits <- unique(hits)

  return(hits)
}

load_civic_biomarkers <- function(
    datestamp = '20240709',
    compound_synonyms = NULL,
    hg38_fasta = 
      "/Users/sigven/research/DB/hg38/hg38.fa",
    crossmap_chainfile_dir = 
      "/Users/sigven/research/DB/chainFiles",
    cache_dir = NA) {
  

  gene_alias <- geneOncoX::get_alias(
    cache_dir = cache_dir
  )
  civic_local_fnames <- list()
  for(e in c("VariantSummaries",
             "civic_accepted",
             "ClinicalEvidenceSummaries",
             "MolecularProfileSummaries")){
    
    
    civic_local_fnames[[e]] <- file.path(
      cache_dir,
      paste0("civic_", e, "_", datestamp,".tsv")
    )
    
    remote_url <- paste0(
      "https://civicdb.org/downloads/nightly/nightly-", e, ".tsv")
    if(e == "civic_accepted"){
      remote_url <- paste0(
        "https://civicdb.org/downloads/nightly/nightly-", e, ".vcf")
      civic_local_fnames[['vcf_grch37']] <- file.path(
        cache_dir,
        paste0("civic_", datestamp,"_grch37.vcf")
      )
      civic_local_fnames[['vcf_grch38']] <- file.path(
        cache_dir,
        paste0("civic_", datestamp,"_grch38.vcf")
      )
    }
    
    
    
    if(e == "civic_accepted"){
      if(!file.exists(civic_local_fnames[['vcf_grch37']])){
        download.file(
          url = remote_url,
          destfile = civic_local_fnames[['vcf_grch37']])
      }
    }else{
      if(!file.exists(civic_local_fnames[[e]])){
        download.file(
          url = remote_url,
          destfile = civic_local_fnames[[e]])
      }
    }

    if(e == 'civic_accepted'){
      vcfhelpR::crossmap_vcf(
        direction = "hg19Tohg38",
        chain_file_directory = crossmap_chainfile_dir,
        target_genome_file = hg38_fasta,
        source_vcf = civic_local_fnames[['vcf_grch37']],
        target_vcf = civic_local_fnames[['vcf_grch38']])
      
      system(paste0('bgzip -dc ',civic_local_fnames[['vcf_grch38']],
                    '.gz > ', 
                    civic_local_fnames[['vcf_grch38']]))
    }
  }
  
  aa_dict <- get_amino_acid_dictionary()
  
  variant_coordinates <- data.frame()
  for(assembly in c('grch37','grch38')){
    skip_lines <- 8
    if(assembly == "grch38"){
      skip_lines <- 7
    }
    
    variant_coordinates <- as.data.frame(
      dplyr::bind_rows(
        variant_coordinates,
        readr::read_tsv(
          file = civic_local_fnames[[paste0('vcf_',assembly)]], 
          skip = skip_lines, 
          show_col_types = F, 
          col_names = T, 
          guess_max = 1000) |>
          janitor::clean_names() |>
          dplyr::rename(variant_id = id) |>
          dplyr::mutate(
            vcf_coord = paste0(
              assembly, ":", 
              paste(number_chrom, pos, ref, alt, sep="_"))
          ) |>
          dplyr::select(variant_id, vcf_coord)
      )
    )
  }
  
  ## ignore these biomarkers
  biomarker_names_skip_regex <- 
    paste0(
      "methylat|phosphoryl|serum lev|mislocali|peri-therap|",
      "alternative|cytoplasmic|p-loop|reg_|alternative|polymorphism|",
      "loss of heterozygosity|philadelphia|internal|n-terminal")
  
  molecularProfileSummary <- as.data.frame(
    data.table::fread(
      civic_local_fnames[['MolecularProfileSummaries']],
      select = c(1:13), fill = T)) |>
    dplyr::rename(variant_id = variant_ids,
                  molecular_profile_name = name,
                  molecular_profile_summary = summary) |>
    dplyr::select(variant_id, 
                  molecular_profile_id, 
                  molecular_profile_name,
                  molecular_profile_summary) |>
    ## ignore profiles with aberration combinations for now
    tidyr::separate_rows(variant_id, sep = ",") |>
    ##dplyr::filter(!stringr::str_detect(variant_id, ",")) |>
    dplyr::mutate(variant_id = as.integer(variant_id)) |>
    clean_variant_naming(col = "molecular_profile_name") |>
    dplyr::filter(!stringr::str_detect(
      tolower(molecular_profile_name),
      biomarker_names_skip_regex))

  clinicalEvidenceSummary <- as.data.frame(
    readr::read_tsv(
      civic_local_fnames[['ClinicalEvidenceSummaries']],
      show_col_types = F, guess_max = 5000)) |>
    dplyr::filter(source_type == "PubMed") |>
    dplyr::mutate(citation_id = as.character(citation_id)) |>
    dplyr::select(molecular_profile_id,
                  evidence_id,
                  evidence_civic_url,
                  disease,
                  doid,
                  therapies,
                  citation_id,
                  evidence_statement,
                  evidence_type,
                  evidence_level,
                  evidence_direction,
                  significance,
                  variant_origin,
                  rating) |>
    dplyr::mutate(citation_id = dplyr::case_when(
      citation_id == "27577079" ~ "28947956",
      citation_id == "27556863" ~ "28978004",
      TRUE ~ as.character(citation_id)
    )) |>
    dplyr::mutate(disease = dplyr::if_else(
      is.na(disease),
      "Cancer",
      as.character(disease)
    )) |>
    dplyr::mutate(doid = dplyr::if_else(
      is.na(doid),
      "162",
      as.character(doid)
    )) |>
    dplyr::rename(disease_ontology_id = doid,
                  clinical_significance = significance,
                  evidence_description = evidence_statement,
                  evidence_url = evidence_civic_url,
                  cancer_type = disease) |>
    dplyr::mutate(
      clinical_significance =
        dplyr::case_when(
          clinical_significance == "Resistance" |
            clinical_significance == "Reduced Sensitivity" ~ "Resistance/Non-response",
          TRUE ~ as.character(clinical_significance))) |>
  dplyr::mutate(disease_ontology_id = paste0(
    "DOID:", disease_ontology_id)) |>
    dplyr::mutate(
      evidence_level =
        dplyr::case_when(
          evidence_level == 'A' ~ 'A: Validated',
          evidence_level == 'B' ~ 'B: Clinical evidence',
          evidence_level == 'C' ~ 'C: Case study',
          evidence_level == 'D' ~ 'D: Preclinical evidence',
          evidence_level == 'E' ~ 'E: Indirect evidence',
          TRUE ~ as.character(evidence_level)
        )
    ) |>
    dplyr::mutate(
      variant_origin =
        dplyr::if_else(
          variant_origin == "N/A" | 
            variant_origin == "Unknown",
          as.character(NA),
          as.character(
            stringr::str_replace(
              variant_origin,"Rare |Common ",""))))


  therapeutic_contexts <- as.data.frame(
    clinicalEvidenceSummary |>
      dplyr::select(evidence_id, therapies) |>
      dplyr::filter(!is.na(therapies) &
                      nchar(therapies) > 0) |>
      tidyr::separate_rows(therapies, sep = ",") |>
      dplyr::mutate(therapies = tolower(therapies)) |>
      dplyr::left_join(
        dplyr::select(compound_synonyms,
                      alias_lc,
                      drug_name,
                      molecule_chembl_id),
        by = c("therapies" = "alias_lc"), 
        multiple = "all", 
        relationship = "many-to-many"
      ) |>
      dplyr::filter(!is.na(drug_name)) |>
      dplyr::select(evidence_id, molecule_chembl_id) |>
      dplyr::distinct() |>
      dplyr::group_by(evidence_id) |>
      dplyr::summarise(molecule_chembl_id = paste(
        unique(sort(molecule_chembl_id)), collapse = "|"),
        .groups = "drop"
      ) |>
      dplyr::ungroup() |>
      dplyr::mutate(molecule_chembl_id = stringr::str_replace(
        molecule_chembl_id,"^NA\\||\\|NA$",""
      ))
  )

  clinicalEvidenceSummary <- clinicalEvidenceSummary |>
    dplyr::left_join(
      therapeutic_contexts,
      by = "evidence_id", multiple = "all",
      relationship = "many-to-many")
  
  variantSummary <- as.data.frame(
    data.table::fread(civic_local_fnames[['VariantSummaries']],
                      select = c(1:25), fill = T)) |>
    ## Ignore the "Factor" feature type (Kataegis, CK, methylation)
    dplyr::filter(feature_type != "Factor") |>
    #dplyr::rename(molecular_profile_id = variant_id) |>
    dplyr::mutate(alteration_type = "MUT") |>
    dplyr::mutate(variant = stringr::str_trim(variant)) |>
    dplyr::mutate(variant = dplyr::if_else(
      !is.na(variant) & 
        (variant == "Fusion" | '::' %in% variant),
      as.character(feature_name),
      as.character(variant)
    )) |>
    dplyr::mutate(variant_aliases = dplyr::if_else(
      gene == "BRAF" & variant == "V600",
      paste(variant_aliases, "V640", sep = ","),
      as.character(variant_aliases)
    )) |>
    dplyr::mutate(variant_aliases = dplyr::if_else(
      !is.na(hgvs_descriptions) & 
        hgvs_descriptions != "NA" & 
        nchar(hgvs_descriptions) > 0,
      paste(variant_aliases, hgvs_descriptions, sep=","),
      as.character(variant_aliases))) |>
    dplyr::select(-c("hgvs_descriptions")) |>
    dplyr::mutate(variant_aliases = stringr::str_replace(
      variant_aliases, "^,", ""
    )) |>
    tidyr::separate_rows(variant_aliases, sep = ",") |>
    tidyr::separate_rows(variant, sep=",") |>
    clean_variant_naming(col = "variant") |>
    clean_variant_naming(col = "variant_aliases") |>
    dplyr::rename(variant_consequence = variant_types,
                  molecular_profile_id = 
                    single_variant_molecular_profile_id,
                  variant_alias = variant_aliases) |>
    clean_variant_consequence() |>
    #dplyr::filter(variant != "Alu insertion") |>
    dplyr::filter(
        !stringr::str_detect(
            tolower(variant),
            biomarker_names_skip_regex)) |>
    dplyr::mutate(variant_alias = dplyr::if_else(
      feature_type == "Fusion" &
        nchar(variant_alias) == 0 | 
        stringr::str_detect(
          variant_alias,"multiple partners"),
      paste0(
        stringr::str_replace(
          feature_name,"v::|::v","")," Fusion"),
      as.character(variant_alias)
    )) |>
    dplyr::mutate(variant = dplyr::if_else(
      nchar(variant_alias) > 0 &
        feature_type == "Fusion" &
        stringr::str_detect(variant,"^e[0-9]{1,}::"),
      stringr::str_replace(variant_alias," E[0-9]{1,}-E[0-9]{1,}$",""),
      as.character(variant))) |>
    dplyr::mutate(gene = dplyr::if_else(
      feature_type == "Fusion",
      as.character(variant),
      as.character(gene)
    )) |>
    set_alteration_type() |>
    dplyr::select(variant_id,
                  molecular_profile_id,
                  variant_consequence,
                  variant,
                  alteration_type,
                  gene,
                  variant_alias) |>
    dplyr::mutate(
      variant = stringr::str_replace_all(
        variant, 
        paste0(
          "((Inactivating|Activating|Deleterious|",
          "Promoter|Truncating|TKD) Mutation)|Loss-of-function"),
        "Mutation"
      )
    ) |>    
    dplyr::mutate(variant_alias = paste(
      variant_alias, variant, sep=","
    )) |>
    tidyr::separate_rows(
      variant_alias, sep=","
    ) |>
    dplyr::filter(
      !stringr::str_detect(
        tolower(variant_alias),
        biomarker_names_skip_regex)) |>
    dplyr::filter(
      (!(stringr::str_detect(variant,"H$") &
           stringr::str_detect(toupper(variant_alias),"TYR$"))) &
        (!(stringr::str_detect(variant,"Y$") &
             stringr::str_detect(toupper(variant_alias),"HIS$")))
      
    ) |>
    dplyr::distinct() |>
    dplyr::mutate(
      variant_name_primary = dplyr::case_when(
        alteration_type == "CNA" |
          (alteration_type == "MUT_LOF" & !endsWith(variant,"X")) |
          stringr::str_detect(
            variant,
            "Mutation|Wildtype|Splice|Copy Number|Deletion|Amplification") |
          stringr::str_detect(alteration_type,"^EXP") ~
          paste0(gene," ",stringr::str_to_title(variant)),
        variant_consequence == "transcript_fusion" |
          (nchar(variant_consequence) == 0 & variant == "FUSION") ~
          paste0(gene, " Fusion"),
        variant == "REARRANGEMENT" ~ 
          paste0(gene, " Rearrangement"),
        TRUE ~ as.character(
          paste0(gene, " ", variant)
        ))
    ) |>
    dplyr::mutate(variant_consequence = dplyr::if_else(
      stringr::str_detect(variant,"^aa_region:"),
      as.character("missense_variant"),
      as.character(variant_consequence))) |>
    dplyr::filter(!nchar(variant_alias) == 0) |>
    dplyr::mutate(variant_name_primary = stringr::str_replace(
      variant_name_primary, "FS","fs")
    ) |>
    dplyr::mutate(variant_name_primary = stringr::str_replace(
      variant_name_primary, "DELINS","delins")
    ) |>
    dplyr::mutate(variant_name_primary = stringr::str_replace(
      variant_name_primary, "DEL","del")
    ) |>
    dplyr::mutate(variant_name_primary = stringr::str_replace(
      variant_name_primary, "INS$","ins")
    )
  
  variants_expanded <- list()
  variants_expanded[['mut']] <- data.frame()

  for (i in unique(variantSummary$variant_id)) {
    vrows <- variantSummary[variantSummary$variant_id == i,]

    variant_entries <- unique(c(vrows$variant,
                                vrows$variant_alias))
    gene <- unique(vrows$gene)
    variant_consequence <- unique(vrows$variant_consequence)
    alteration_type <- unique(vrows$alteration_type)
    variant_name_primary <- unique(vrows$variant_name_primary)

    all_aliases <- c()

    if (stringr::str_detect(alteration_type,"MUT")) {

      for (e in variant_entries) {
        #cat(e,'\n')
        clean_aliases <- expand_hgvs_terms(e, aa_dict = aa_dict)
        for (c in clean_aliases) {
         all_aliases <- unique(c(all_aliases, c))
        }
      }


      for (variant_alias in all_aliases) {
        df <- data.frame(
          'variant_id' = i,
          'symbol' = gene,
          'variant_name_primary' = variant_name_primary,
          'variant_alias' = variant_alias,
          'alteration_type' = alteration_type,
          'variant_consequence' = variant_consequence,
          stringsAsFactors = F)
        variants_expanded[['mut']] <- dplyr::bind_rows(
          variants_expanded[['mut']], df
        )
      }
    }

  }

  variants_expanded[['mut']] <- variants_expanded[['mut']] |>
    dplyr::mutate(
      alteration_type =
        dplyr::if_else(
          stringr::str_detect(
            variant_alias,
            "^(p\\.)?(([A-Z]{1}[0-9]{1,})|([A-Z]{1}[a-z]{2}[0-9]{1,}))$"),
          as.character("CODON"),
          as.character(alteration_type)
        )) |>
    dplyr::mutate(
      variant_alias = dplyr::if_else(
        stringr::str_detect(variant_alias,"FS"),
        stringr::str_replace(variant_alias,"FS","fs"),
        as.character(variant_alias)
      )
    ) |>
    dplyr::mutate(
      variant_alias = dplyr::if_else(
        stringr::str_detect(variant_alias,"DEL") &
          !stringr::str_detect(variant_alias, "DELETION|DELETERIOUS"),
        stringr::str_replace(variant_alias,"DEL","del"),
        as.character(variant_alias)
      )
    ) |>
    dplyr::mutate(
      variant_alias = dplyr::if_else(
        stringr::str_detect(variant_alias,"INS") &
          !stringr::str_detect(variant_alias,"INSERTION"),
        stringr::str_replace(variant_alias,"INS","ins"),
        as.character(variant_alias)
      )
    ) |>
    dplyr::mutate(
      variant_alias = dplyr::if_else(
        stringr::str_detect(variant_alias,"TER|ter") &
          !stringr::str_detect(
            tolower(variant_alias), 
            "alteration"),
        stringr::str_replace(variant_alias,"ter|TER","Ter"),
        as.character(variant_alias)
      )
    ) |>
    dplyr::mutate(
      variant_alias = dplyr::if_else(
        stringr::str_detect(variant_alias,"DUP") &
          !stringr::str_detect(variant_alias,"DUPLICATION"),
        stringr::str_replace(variant_alias,"DUP","dup"),
        as.character(variant_alias)
      )
    ) |>
    dplyr::filter(
      variant_alias != "p.=" &
        !stringr::str_detect(variant_alias, "^p\\.\\(") &
        !stringr::str_detect(variant_alias, "^p\\.Exon") &
        !stringr::str_detect(variant_alias, "ENSP|ENST") &
        !stringr::str_detect(variant_alias, "SPLICE"))
  
  
  aa_dict <- get_amino_acid_dictionary()
  
  for (i in 1:nrow(aa_dict)) {
    aa <- aa_dict[i,"three_letter_uc"]
    aa_three_lc <- aa_dict[i, "three_letter"]
    
    if (nrow(variants_expanded[['mut']][!is.na(variants_expanded[['mut']]$variant_alias) & 
                                 stringr::str_detect(variants_expanded[['mut']]$variant_alias,aa),]) > 0) {
      variants_expanded[['mut']][!is.na(variants_expanded[['mut']]$variant_alias) & 
                             stringr::str_detect(variants_expanded[['mut']]$variant_alias, aa),]$variant_alias <-
        stringr::str_replace_all(variants_expanded[['mut']][!is.na(variants_expanded[['mut']]$variant_alias) & 
                                                    stringr::str_detect(variants_expanded[['mut']]$variant_alias, aa),]$variant_alias,
                             aa,
                             aa_three_lc)
    }
  }
  
  variants_expanded[['mut']] <- variants_expanded[['mut']] |>
    dplyr::mutate(alias_type = "hgvsp") |>
    dplyr::mutate(variant_alias = stringr::str_trim(variant_alias)) |>
    dplyr::mutate(alias_type = dplyr::case_when(
      stringr::str_detect(variant_alias, "^rs[0-9]{1,}$") ~ "dbsnp",
      stringr::str_detect(variant_alias, "^aa_region") ~ "aa_region",
      stringr::str_detect(variant_alias, "^c\\.[0-9]{1,}") ~ "hgvsc",
      stringr::str_detect(variant_alias, "^Exon ") ~ "exon",
      stringr::str_detect(
        variant_alias, 
        "Mutation|Copy|Wildtype|Loss|Truncation|Alteration|Frameshift") ~ "other_gene",
      TRUE ~ as.character(alias_type)
    )) |>
    dplyr::left_join(variant_coordinates, 
                     by = "variant_id", 
                     relationship = "many-to-many") |>
    dplyr::mutate(variant_alias = dplyr::if_else(
      !is.na(vcf_coord),
      paste(variant_alias, vcf_coord, sep=","),
      as.character(variant_alias)
    )) |>
    tidyr::separate_rows(
      variant_alias, sep = ","
    ) |>
    dplyr::select(-vcf_coord) |>
    dplyr::distinct() |>
    dplyr::mutate(alias_type = dplyr::if_else(
      stringr::str_detect(variant_alias, "^grch37"),
      "genomic_grch37",
      as.character(alias_type)
    )) |>
    dplyr::mutate(alias_type = dplyr::if_else(
      stringr::str_detect(variant_alias, "^grch38"),
      "genomic_grch38",
      as.character(alias_type)
    )) |>
    dplyr::mutate(variant_alias = dplyr::if_else(
      stringr::str_detect(alias_type, "^genomic_grch37"),
      stringr::str_replace(variant_alias, "grch37:",""),
      as.character(variant_alias)
    )) |>
    dplyr::mutate(variant_alias = dplyr::if_else(
      stringr::str_detect(alias_type, "^genomic_grch38"),
      stringr::str_replace(variant_alias, "grch38:",""),
      as.character(variant_alias)
    )) |>
    dplyr::mutate(alteration_type = dplyr::if_else(
      stringr::str_detect(alias_type, "^genomic"),
      "MUT",
      as.character(alteration_type)
    )) |>
    dplyr::filter(
      symbol != "TERT" |
        (symbol == "TERT" &
           !stringr::str_detect(variant_alias, "Cys|^p\\.") &
           alteration_type != "CODON")) |>
    dplyr::mutate(variant_alias = dplyr::if_else(
      symbol == "TERT" & variant_alias == "-146C>T",
      paste(variant_alias, 
            "grch37:5_1295250_G_A,grch38:5_1295135_G_A", 
            sep=","),
      as.character(variant_alias)
    )) |>
    dplyr::mutate(variant_alias = dplyr::if_else(
      symbol == "MET" & 
        variant_alias == "Exon 14 Skipping Mutation",
      paste(
        variant_alias, 
        "c.3028+1G>T",
        "c.3028+1G>C",
        "c.3028+1G>A",
        "c.3028+2T>C",
        "c.3028+2T>G", 
        "c.3082+1G>T",
        "c.3082+1G>C",
        "c.3082+1G>A",
        "c.3082+2T>C",
        "c.3082+2T>G", 
        sep=","),
      as.character(variant_alias)
    )) |>
    dplyr::mutate(variant_alias = dplyr::if_else(
      symbol == "TERT" & 
        variant_alias == "-124C>T",
      paste(variant_alias, 
            "grch37:5_1295228_G_A,grch38:5_1295113_G_A", 
            sep=","),
      as.character(variant_alias)
    )) |>
    tidyr::separate_rows(
      variant_alias, sep = ","
    ) |>
    
    dplyr::mutate(alias_type = dplyr::if_else(
      stringr::str_detect(variant_alias, "^grch37"),
      "genomic_grch37",
      as.character(alias_type)
    )) |>
    dplyr::mutate(alias_type = dplyr::if_else(
      stringr::str_detect(variant_alias, "^grch38"),
      "genomic_grch38",
      as.character(alias_type)
    )) |>
    dplyr::mutate(variant_alias = dplyr::if_else(
      stringr::str_detect(alias_type, "^genomic_grch37"),
      stringr::str_replace(variant_alias, "grch37:",""),
      as.character(variant_alias)
    )) |>
    dplyr::mutate(variant_alias = dplyr::if_else(
      stringr::str_detect(alias_type, "^genomic_grch38"),
      stringr::str_replace(variant_alias, "grch38:",""),
      as.character(variant_alias)
    )) |>
    dplyr::mutate(alteration_type = dplyr::if_else(
      stringr::str_detect(alias_type, "^genomic"),
      "MUT",
      as.character(alteration_type)
    )) |>

    dplyr::mutate(alias_type = dplyr::if_else(
      stringr::str_detect(variant_alias, "^c\\."),
      "hgvsc",
      as.character(alias_type)
    )) |>
  
    dplyr::mutate(variant_exon = dplyr::if_else(
      stringr::str_detect(variant_alias, "Exon"),
      stringr::str_replace_all(
        variant_alias, 
        "Exon |( (Frameshift|((Skipping )?Mutation)|Deletion|Insertion))",""),
      as.character(NA)
    )) |>
    dplyr::mutate(variant_exon = dplyr::if_else(
      !is.na(variant_exon) & 
        variant_consequence == "exon_variant",
      stringr::str_replace_all(
        variant_exon,"18-21",
        "18|19|20|21"),
      as.character(variant_exon))
    ) |>
    dplyr::mutate(variant_exon = dplyr::if_else(
      !is.na(variant_exon) & 
        variant_consequence == "exon_variant",
      stringr::str_replace_all(
        variant_exon,
        "1-2","1|2"),
      as.character(variant_exon))
    ) |>
    dplyr::mutate(variant_exon = dplyr::if_else(
      !is.na(variant_exon) & 
        variant_consequence == "exon_variant",
      stringr::str_replace_all(
        variant_exon,
        "10 OR 21","10|21"),
      as.character(variant_exon))
    ) |>
    dplyr::mutate(variant_exon = dplyr::if_else(
      !is.na(variant_exon) & 
        variant_consequence == "exon_variant",
      stringr::str_replace_all(
        variant_exon,
        "10 AND 21","10&21"),
      as.character(variant_exon))
    ) |>
    dplyr::filter(
      is.na(variant_exon) 
      | (variant_exon != "9 AND 20" & 
           !stringr::str_detect(
             variant_exon,"Deletion"))) |>
    dplyr::distinct()
  
  
  ## Make sure alteration types for genomic coordinates
  ## are correct
  variants_nongenomic <- variants_expanded[['mut']] |>
    dplyr::filter(
      !stringr::str_detect(
        alias_type, "genomic_"
      )
    ) |>
    dplyr::group_by(
      variant_id
    ) |>
    dplyr::summarise(
      alteration_type =
        paste(unique(alteration_type), collapse=";")
    ) |>
    dplyr::mutate(
      alteration_type = dplyr::if_else(
        stringr::str_detect(alteration_type, ";"),
        stringr::str_replace(
          alteration_type, ";?CODON;?",""
        ),
        as.character(alteration_type)
      )
    )
  
  
  variants_expanded_mut1 <- variants_expanded[['mut']] |>
    dplyr::filter(!stringr::str_detect(
      alias_type, "genomic"))
  
  variants_expanded_mut2 <- variants_expanded[['mut']] |>
    dplyr::filter(stringr::str_detect(
      alias_type, "genomic"))
  variants_expanded_mut2$alteration_type <- NULL
  variants_expanded_mut2 <- variants_expanded_mut2 |>
    dplyr::left_join(variants_nongenomic, by = "variant_id",
                     relationship = "many-to-many")
  
  variants_expanded[['mut']] <- variants_expanded_mut1 |>
    dplyr::bind_rows(variants_expanded_mut2)
  
  
  ## Combine the two dataframes
  variants_expanded[['other']]  <- as.data.frame(
    variantSummary |>
      dplyr::filter(!stringr::str_detect(alteration_type,"MUT")) |>
      dplyr::select(variant_id, variant, variant_consequence,
                    variant_alias, variant_name_primary, alteration_type,
                    gene) |>
      dplyr::rename(symbol = gene) |>
      dplyr::mutate(
        alias_type = "other_gene"
      ) |>
      dplyr::distinct()
  )
  attr(variants_expanded[['other']],"groups") <- NULL
  
  gene_aliases <- gene_alias$records |>
    dplyr::filter(.data$n_primary_map == 1 & 
                    source == "NCBI") |>
    dplyr::select(alias, symbol, entrezgene) |>
    dplyr::distinct()
  
  biomarker_items <- list()
  biomarker_items[['variant']] <- 
    dplyr::bind_rows(
      variants_expanded[['mut']], 
      variants_expanded[['other']]) |>
    dplyr::select(variant_id, variant_alias, 
                  alias_type, dplyr::everything()) |>
    dplyr::rename(gene = symbol) |>
    dplyr::left_join(
      gene_aliases,
      by = c("gene" = "alias"),
      relationship = "many-to-one"
    ) |>
    dplyr::mutate(variant_consequence = dplyr::case_when(
      nchar(variant_consequence) == 0 & stringr::str_detect(
        variant_alias, "delins([A-Z]{1,4})+$") ~ "inframe_insertion",
      nchar(variant_consequence) == 0 & stringr::str_detect(
        variant_alias, "ins([A-Z]{1,4})+$") ~ "inframe_insertion",
      nchar(variant_consequence) == 0 & stringr::str_detect(
        variant_alias, "[0-9]{1,}?del([A-Z]{1,4})+$") ~ "inframe_deletion",
      nchar(variant_consequence) == 0 & stringr::str_detect(
        variant_alias, "fs") ~ "frameshift_variant",
      nchar(variant_consequence) == 0 & stringr::str_detect(
        variant_alias, "^Mutation$") ~ "protein_altering_variant",
      nchar(variant_consequence) == 0 & 
        alias_type == "hgvsp" ~ "missense_variant",
      TRUE ~ as.character(variant_consequence)
    )) |> 
    dplyr::mutate(variant_alias = dplyr::case_when(
      (is.na(variant_alias) | nchar(variant_alias) == 0) &
        !is.na(gene)  &
        alias_type == "other_gene" &
        stringr::str_detect(gene,"-") ~ gene,
      (is.na(variant_alias) | nchar(variant_alias) == 0) &
        !is.na(gene)  &
        alias_type == "other_gene" &
        !stringr::str_detect(gene,"-") ~ 
        paste0(gene, " Fusion"),
      TRUE ~ as.character(variant_alias)
    )) |>
    dplyr::distinct()
  
  biomarker_items[['clinical']] <- 
    clinicalEvidenceSummary |> 
    dplyr::full_join(
      molecularProfileSummary, by = "molecular_profile_id",
      multiple = "all", relationship = "many-to-many") |> 
    dplyr::select(molecular_profile_id, molecular_profile_name, 
                  molecular_profile_summary, 
                  variant_id, dplyr::everything()) |>
    dplyr::rename(therapeutic_context = therapies) |>
    dplyr::mutate(evidence_id = paste0("EID",evidence_id)) |>
    dplyr::filter(evidence_id != "EIDNA") |>
    dplyr::mutate(biomarker_source = "civic",
                  biomarker_source_datestamp = datestamp,
                  biomarker_entity = T) |>
    dplyr::inner_join(dplyr::select(
      biomarker_items[['variant']], variant_id), 
      by = "variant_id",
      multiple = "all", relationship = "many-to-many") |>
    dplyr::distinct() |>
    dplyr::arrange(evidence_level, molecular_profile_id)
  
  biomarker_items[['clinical']] <- map_biomarker_phenotypes(
    biomarker_items[['clinical']], 
    cache_dir = cache_dir)
  
  biomarker_items <- get_literature_references(biomarker_items)
  if(is.data.frame(biomarker_items)){
    if(unique(colnames(biomarker_items) == c("source","source_id")) == T){
      lgr::lgr$error("literature missing ")
      return(biomarker_items)
    }
  }
  
  biomarker_items[['clinical']] <- biomarker_items[['clinical']] |>
    dplyr::mutate(molecular_profile_type = "Single") |>
    dplyr::mutate(molecular_profile_type = dplyr::case_when(
      stringr::str_detect(molecular_profile_name, " AND|and|And ") ~
      paste0(
        "Combo_", stringr::str_count(variant_id, " AND|and|And ")[1] + 1),
      stringr::str_detect(molecular_profile_name, " OR|or|Or ") ~
      paste0(
        "Combo_Opt_", stringr::str_count(variant_id, ",")[1] + 1),
      TRUE ~ molecular_profile_type
    )) |>
    dplyr::select(
      biomarker_source,
      biomarker_source_datestamp,
      biomarker_entity,
      molecular_profile_id,
      molecular_profile_name,
      molecular_profile_summary,
      molecular_profile_type,
      variant_id,
      source_id,
      evidence_id,
      variant_origin,
      primary_site,
      cancer_type,
      disease_ontology_id,
      do_name,
      efo_id,
      efo_name,
      cui,
      cui_name,
      dplyr::everything()
    )
  
  biomarker_items[['variant']] <- as.data.frame(
    biomarker_items[['variant']] |>
      dplyr::mutate(biomarker_source = "civic",
                    biomarker_source_datestamp = datestamp) |>
      dplyr::select(
        biomarker_source,
        biomarker_source_datestamp,
        variant_id,
        variant_alias,
        variant_name_primary,
        alteration_type,
        alias_type,
        variant_consequence,
        variant_exon,
        gene,
        symbol,
        entrezgene,
        dplyr::everything()
      ) |>
      dplyr::inner_join(
        dplyr::select(biomarker_items[['clinical']],
                      variant_id, molecular_profile_id),
        by = "variant_id",
        relationship = "many-to-many"
      ) |>
      dplyr::distinct() |>
      dplyr::select(-c("variant")) |>
      dplyr::select(
        biomarker_source,
        biomarker_source_datestamp,
        variant_id,
        molecular_profile_id,
        dplyr::everything()
      )
  )

  return(biomarker_items)

}


load_cgi_biomarkers <- function(compound_synonyms = NULL,
                                datestamp = '20221027',
                                cache_dir = NA) {

  aa_dict <- get_amino_acid_dictionary()
  gene_alias <- geneOncoX::get_alias(
    cache_dir = cache_dir
  )
  cancer_type_abbreviations <-
    read.table(
      file = file.path(cache_dir, "cgi_cancer_subtypes.tsv"),
      sep = "\t", header = T,
      stringsAsFactors = F, quote = "", 
      na.strings = "",fill = T) |>
    dplyr::mutate(
      disease_ontology_id = paste0(
        "DOID:",disease_ontology_id))

  ## SNVs/InDels
  ## SKIP/IGNORE mutations that are
  # 1) not explicitly mapped to protein alterations (MUT)
  # 2) marked with uncertainty, rechecking etc (Comments)
  # 3) involved in toxicity
  # 4) curated from abstracts etc (not PMIDs or guidelines)
  # 5) Non-malignant/Non-cancer conditions
  
  ## NEW
  cgi_biomarkers <- as.data.frame(
    readr::read_tsv(
      file.path(cache_dir, "cgi_biomarkers_20221017.tsv"),
      col_names = T, show_col_types = F, guess_max = 1000) |>
      janitor::clean_names() |>
      dplyr::mutate(
        biomarker = 
          stringr::str_replace_all(
            biomarker,
            "\\.",""
          )) |>
      dplyr::select(-c(curator, curation_date, targeting,
                       drug_family, drug_status, drug, tcgi_included,
                       metastatic_tumor_type, 
                       primary_tumor_type_full_name, assay_type)) |>
      dplyr::distinct() |>
      dplyr::mutate(biomarker = stringr::str_replace_all(
        biomarker, "12,13,59,61,117,146",
        "G12,G13,A59,Q61,K117,A146")) |>
      dplyr::mutate(alteration_type = stringr::str_replace(
        alteration_type, "EXPR", "EXP"
      )) |>
      dplyr::mutate(alteration_type = stringr::str_replace(
        alteration_type, "FUS", "FUSION"
      )) |>
      dplyr::mutate(biomarker = stringr::str_replace_all(
        biomarker, "\\*","X")) |>
      dplyr::mutate(biomarker = stringr::str_replace_all(
        biomarker, "618,620,634,768,791,891,918",
        "C618,C620,C634,E768,Y791,S891,M918")) |>
      dplyr::mutate(biomarker = stringr::str_replace_all(
        biomarker, "12,13","G12,G13")) |>
      dplyr::mutate(biomarker = stringr::str_replace(
        biomarker, "undexpression","Underexpression")) |>
      dplyr::mutate(biomarker = stringr::str_replace_all(
        biomarker, ",537,538,",",Y537,D538,")) |>
      dplyr::rename(
        molecular_profile = biomarker
      ) |>
      dplyr::mutate(molecular_profile = stringr::str_replace_all(
        molecular_profile, ";|(, )", ","
      )) |>
      dplyr::mutate(molecular_profile = stringr::str_replace_all(
        molecular_profile, " (\\+|-)$", ""
      )) |>
      dplyr::filter(
        !stringr::str_detect(source, "^(ENA|ESMO|ASCO|AACR)")
      ) |>
      tidyr::separate_rows(
        primary_tumor_type, sep = ";") |>
      dplyr::left_join(
        cancer_type_abbreviations,
        by = c("primary_tumor_type" = "cgi_abbreviation"),
        relationship = "many-to-many") |>
      dplyr::mutate(molecular_profile_id = dplyr::row_number()) |>
      dplyr::mutate(molecular_profile_type = dplyr::if_else(
        stringr::str_detect(molecular_profile, " \\+ "),
        "Combo",
        "Single"
      )) |>
      dplyr::mutate(molecular_profile_name = molecular_profile) |>
      tidyr::separate_longer_delim(molecular_profile, delim = " + ")
  )
      
  combo_index <- cgi_biomarkers |>
    dplyr::group_by(molecular_profile_id) |>
    dplyr::reframe(combo_index = dplyr::row_number()) |>
    dplyr::select(-molecular_profile_id)
  
  cgi_biomarkers <- as.data.frame(
    cgi_biomarkers |>
      dplyr::bind_cols(combo_index) |>                               
      dplyr::mutate(molecular_profile_type = dplyr::if_else(
        molecular_profile_type == "Combo",
        paste(
          molecular_profile_type, combo_index, sep="_"),
        as.character(molecular_profile_type)
      )) |>
      dplyr::select(-combo_index) |>
      dplyr::mutate(alteration2 = 
                      stringr::str_replace_all(
                        stringr::str_match(
                          molecular_profile, "\\([A-Z0-9]+(,[A-Z0-9]+){0,}\\)")[,1],
                        "\\)|\\(","")
      ) |>
      dplyr::mutate(alteration_type = dplyr::case_when(
        stringr::str_detect(molecular_profile_type, "Combo") &
          stringr::str_detect(molecular_profile, " amplification") ~ "CNA",
        stringr::str_detect(molecular_profile_type, "Combo") &
          stringr::str_detect(molecular_profile, " deletion") &
          !stringr::str_detect(molecular_profile, "inframe") ~ "CNA",
        stringr::str_detect(molecular_profile_type, "Combo") &
          stringr::str_detect(molecular_profile, " fusion") ~ "FUSION",
        stringr::str_detect(molecular_profile_type, "Combo") &
          stringr::str_detect(tolower(molecular_profile), "overexpression") ~ "EXP_OVER",
        stringr::str_detect(molecular_profile_type, "Combo") &
          stringr::str_detect(tolower(molecular_profile), "underexpression") ~ "EXP_OVER",
        stringr::str_detect(molecular_profile_type, "Combo") &
          stringr::str_detect(molecular_profile, " expression") ~ "EXP",
        stringr::str_detect(molecular_profile_type, "Combo") &
          stringr::str_detect(alteration_type,"MUT") & 
          stringr::str_detect(tolower(molecular_profile), "oncogenic|\\(") ~ "MUT_ONC",
        TRUE ~ as.character(alteration_type)
      )) |>
      tidyr::separate(gene, c("gene1","gene2"), sep = ";") |>
      dplyr::mutate(gene = dplyr::if_else(
        !is.na(gene1) & !is.na(gene2) & 
          startsWith(molecular_profile, gene2),
        as.character(gene2),
        as.character(gene1)
      )) |>
      dplyr::select(-c(gene1,gene2)) |>
      dplyr::select(molecular_profile, gene, dplyr::everything()) |>
      dplyr::mutate(alteration_type = dplyr::case_when(
        alteration_type == "BIA" ~ "MUT_LOF_BIALLELIC",
        stringr::str_detect(
          molecular_profile, " wildtype") ~ "WT",
        stringr::str_detect(
          molecular_profile, " oncogenic") ~ "MUT_ONC",
        stringr::str_detect(
          tolower(molecular_profile), "overexpression")  ~ "EXP_OVER",
        stringr::str_detect(
          tolower(molecular_profile), "underexpression") ~ "EXP_UNDER",
        stringr::str_detect(
          molecular_profile, " fusion$") ~ "FUSION",
        TRUE ~ as.character(alteration_type)
      )) |>
      
      dplyr::mutate(variant_consequence = dplyr::case_when(
        
        alteration_type == "WT" ~ as.character(NA),
        alteration_type == "CNA" & 
          stringr::str_detect(
            alteration,":amp") ~ "transcript_amplification",
        alteration_type == "CNA" & 
          stringr::str_detect(
            alteration,":del") ~ "transcript_ablation",
        alteration_type == "EXP_OVER" ~ "transcript_amplification",
        alteration_type == "FUSION" ~ "transcript_fusion",
        alteration_type == "EXP_UNDER" ~ "transcript_ablation",
        alteration_type == "MUT_LOF_BIALLELIC" ~ "loss_of_function_variant",
        alteration_type == "MUT_ONC" ~ "protein_altering_variant",
        (alteration_type == "MUT" | alteration_type == "MUT_ONC") &
          !stringr::str_detect(molecular_profile,"inframe|splice|insertion|deletion") ~ "missense_variant",
        (alteration_type == "MUT" | alteration_type == "MUT_ONC") &
          stringr::str_detect(molecular_profile,"splice donor") ~ "splice_donor_variant",
        (alteration_type == "MUT" | alteration_type == "MUT_ONC") &
          stringr::str_detect(molecular_profile,"splice acceptor") ~ "splice_acceptor_variant",
        (alteration_type == "MUT" | alteration_type == "MUT_ONC") &
          stringr::str_detect(molecular_profile,"inframe") &
          stringr::str_detect(molecular_profile,"insertion") &
          stringr::str_detect(molecular_profile, "deletion") ~ "inframe_deletion,inframe_insertion",
        (alteration_type == "MUT" | alteration_type == "MUT_ONC") &
          stringr::str_detect(molecular_profile,"inframe") &
          stringr::str_detect(molecular_profile, "deletion") ~ "inframe_deletion",
        molecular_profile == "FLT3-ITD" ~ "inframe_insertion",
        molecular_profile == "PDGFRA exon 18 mutations" ~ "inframe_deletion,inframe_insertion,missense_variant",
        (alteration_type == "MUT" | alteration_type == "MUT_ONC") &
          stringr::str_detect(molecular_profile,"inframe") &
          stringr::str_detect(molecular_profile, "insertion") ~ "inframe_insertion",
        (alteration_type == "MUT") &
          stringr::str_detect(molecular_profile, " exon ") &
          stringr::str_detect(alteration, "inframe_insertion") ~ "inframe_insertion",
        (alteration_type == "MUT") &
          stringr::str_detect(molecular_profile, " exon ") &
          stringr::str_detect(alteration, "inframe_deletion") ~ "inframe_deletion",
        TRUE ~ as.character(NA)
      )) |>
      dplyr::rename(
        variant_alias = alteration2
      ) |>
      dplyr::mutate(variant_alias = dplyr::case_when(
        alteration_type == "EXP" ~ "Expression",
        stringr::str_detect(molecular_profile, "wildtype") ~ "Wildtype",
        is.na(variant_alias) & 
          (alteration_type == "MUT" | 
             alteration_type == "MUT_ONC" |
             alteration_type == "MUT_LOF_BIALLELIC") ~ "Mutation",
        alteration_type == "CNA" & 
          variant_consequence == "transcript_amplification" ~ "Amplification",
        alteration_type == "CNA" & 
          variant_consequence == "transcript_ablation" ~ "Deletion",
        alteration_type == "FUSION" ~ "Fusion",
        alteration_type == "EXP_UNDER" & 
          variant_consequence == "transcript_ablation" ~ "Underexpression",
        alteration_type == "EXP_OVER" & 
          variant_consequence == "transcript_amplification" ~ "Overexpression",
        TRUE ~ as.character(variant_alias)
      )) |>
      dplyr::mutate(gene = dplyr::if_else(
        alteration_type == "FUSION" &
          stringr::str_detect(molecular_profile, "-"),
        stringr::str_replace(
          stringr::str_replace(molecular_profile, " fusion",""),
          "ABL1-BCR","BCR-ABL1"), 
        as.character(gene))
      ) |>
      dplyr::mutate(gene = dplyr::case_when(
        gene == "MLL2" ~ "KMT2D",
        TRUE ~ as.character(gene)
      )) |>
      dplyr::mutate(variant_origin = "Somatic") |>
      dplyr::mutate(variant_origin = dplyr::if_else(
        !is.na(comments) & 
          stringr::str_detect(tolower(comments),"germline"),
        "Germline",
        as.character(variant_origin)
      )) |>
      dplyr::mutate(evidence_type = 'Predictive',
                    therapeutic_context = drug_full_name,
                    clinical_significance = NA) |>
      dplyr::mutate(
        clinical_significance =
          dplyr::case_when(
            association == "Resistant" ~ "Resistance/Non-response",
            association == "No Responsive" ~ "Resistance/Non-response",
            association == "Responsive" ~ "Sensitivity/Response",
            stringr::str_detect(association, "Toxicity") ~ "Toxicity",
            TRUE ~ as.character("Other"))) |>
      dplyr::rename(evidence_level_raw = evidence_level) |>
      dplyr::mutate(mapping_rank = 1) |>
      dplyr::mutate(
        evidence_level =
          dplyr::if_else(
            stringr::str_detect(evidence_level_raw,"guidelines"),
            "A: FDA/NCCN/ELN guidelines",
            as.character(NA))) |>
      dplyr::mutate(
        evidence_level =
          dplyr::if_else(
            stringr::str_detect(evidence_level_raw,"Late trials"),
            "B1: Clinical evidence: late trials",
            as.character(evidence_level))) |>
      dplyr::mutate(
        evidence_level =
          dplyr::if_else(
            stringr::str_detect(evidence_level_raw,"Early trials"),
            "B2: Clinical evidence: early trials",
            as.character(evidence_level))) |>
      dplyr::mutate(
        evidence_level =
          dplyr::if_else(
            stringr::str_detect(evidence_level_raw,"Case report"),
            "C: Case study",
            as.character(evidence_level))) |>
      dplyr::mutate(
        evidence_level =
          dplyr::if_else(
            stringr::str_detect(evidence_level_raw,"Pre-clinical"),
            "D: Preclinical evidence",
            as.character(evidence_level))) |>
      dplyr::filter(!is.na(evidence_level_raw)) |>
      dplyr::select(-comments) |>
      dplyr::mutate(alteration = dplyr::if_else(
        molecular_profile == "KIT mutation in exon 9,11,13,14 or 17",
        "KIT:449-514,550-592,627-664,664-714,788-828",
        as.character(alteration)
      )) |>
      dplyr::mutate(alteration = dplyr::if_else(
        stringr::str_detect(molecular_profile, " exon |-ITD$"),
        stringr::str_replace_all(
          alteration, 
          "KIT:|([A-Z0-9]{3,}::consequence::(skipping_mutation|(inframe_(insertion|variant|deletion))):)",
          ""
        ),
        as.character(alteration)
      )) |>
      dplyr::mutate(alteration = dplyr::if_else(
        stringr::str_detect(molecular_profile, " exon |-ITD$"),
        stringr::str_replace_all(
          alteration, "::(inframe_insertion|missense_variant):", ""),
        as.character(alteration)
      )) |>
      dplyr::mutate(variant_alias = dplyr::if_else(
        stringr::str_detect(molecular_profile, " exon |-ITD$") &
          !is.na(alteration),
        as.character(alteration),
        as.character(variant_alias)
      )) |>
      tidyr::separate_longer_delim(variant_alias, delim = ",") |>
      dplyr::distinct() |>
      dplyr::mutate(variant_alias = dplyr::if_else(
        stringr::str_detect(variant_alias, "-"),
        paste0("aa_region:", variant_alias),
        as.character(variant_alias)
      )) |>
      dplyr::mutate(variant_alias = dplyr::if_else(
        stringr::str_detect(
          molecular_profile, "^(IL7R inframe insertion \\(237)"),
        "aa_region:237-255",
        as.character(variant_alias)
      )) |>
      dplyr::left_join(
        dplyr::select(gene_alias$records, 
                      alias, symbol, entrezgene),
        by = c("gene" = "alias"), multiple = "all",
        relationship = "many-to-many"
      ) |>
      dplyr::mutate(alteration = stringr::str_replace_all(
        alteration,"__\\.","::v")
      ) |>
      dplyr::mutate(variant_consequence = dplyr::if_else(
        stringr::str_detect(variant_alias, "X$"),
        "stop_gained",
        as.character(variant_consequence)
      )) |>
      dplyr::mutate(alteration_type = dplyr::if_else(
        stringr::str_detect(
          variant_consequence, 
          "stop_gained|splice_donor|splice_acceptor"),
        "MUT_LOF",
        as.character(alteration_type)
      ))
  )
      
  unique_variants <- as.data.frame(
    cgi_biomarkers  |>
      dplyr::filter(!is.na(variant_alias) &
                      !is.na(gene) &
                      !is.na(alteration_type)) |>
      dplyr::select(gene, symbol, entrezgene, variant_alias,
                    alteration_type, variant_consequence) |>
      dplyr:: arrange(symbol, alteration_type, variant_alias) |>
      dplyr::distinct() |>
      dplyr::mutate(variant_id = dplyr::row_number()) |>
      dplyr::mutate(variant_name_primary = dplyr::case_when(
        !is.na(symbol) ~ paste(symbol, variant_alias),
        is.na(symbol) &
          !is.na(gene) ~ paste(gene, variant_alias),
        TRUE ~ as.character(NA)
      ))
  )

  cgi_variants <- data.frame()
  for (i in 1:nrow(unique_variants)) {

    alt <- unique_variants[i,"variant_alias"]
    terms <- expand_hgvs_terms(alt, aa_dict = aa_dict)
    for (t in terms) {
      df <- data.frame(
        'variant_id' = 
          unique_variants[i,"variant_id"],
        'variant_alias' = t,
        'variant_name_primary' = unique_variants[i,"variant_name_primary"],
        'gene' = unique_variants[i,"gene"],
        'entrezgene' = unique_variants[i,"entrezgene"],
        'symbol' = unique_variants[i,"symbol"],
        'variant_consequence' = unique_variants[i,"variant_consequence"],
        'alteration_type' = unique_variants[i,"alteration_type"],
        stringsAsFactors = F)
      cgi_variants <- cgi_variants |>
        dplyr::bind_rows(df)
    }
  }
  
  cgi_variants <- cgi_variants |>
    dplyr::mutate(alias_type = "hgvsp") |>
    dplyr::mutate(alias_type = dplyr::case_when(
      stringr::str_detect(variant_alias, "aa") ~ "aa_region",
      stringr::str_detect(variant_alias, "^(Mut|Wild|Exp|Overexp|Underexp|Fus|Ampli|Delet)") ~ "other_gene",
      TRUE ~ as.character(alias_type)
    )) |>
    dplyr::mutate(variant_exon = NA) |>
    dplyr::mutate(biomarker_source = "cgi") |>
    dplyr::mutate(biomarker_source_datestamp = datestamp)  |>
    dplyr::mutate(
      alteration_type =
        dplyr::if_else(
          !is.na(alteration_type) & !is.na(variant_alias) & 
            stringr::str_detect(
              variant_alias, 
              "^(([A-Z]{1}[0-9]{1,})|([A-Z]{1}[a-z]{2}[0-9]{1,}))$"),
          as.character("CODON"),
          as.character(alteration_type)
        )) |>
    dplyr::select(
      biomarker_source,
      biomarker_source_datestamp,
      variant_id,
      variant_alias,
      variant_name_primary,
      alteration_type,
      alias_type,
      variant_consequence,
      variant_exon,
      gene,
      symbol,
      entrezgene,
    ) |>
    dplyr::distinct()
    

  cgi_clinical <- cgi_biomarkers |>

    ## get variant id's per molecular profile
    dplyr::inner_join(
      dplyr::select(
        cgi_variants, variant_id, 
        gene, alias_type, variant_alias),
      by = c("gene", "variant_alias"),
      relationship = "many-to-many"
    ) |>
    
    dplyr::select(-c(entrezgene, variant_consequence, variant_alias,
                     alias_type, gene, symbol, alteration_type,
                     alteration)) |>
    dplyr::distinct() |>
    dplyr::mutate(therapeutic_context = dplyr::if_else(
      stringr::str_detect(
        therapeutic_context,
        "(ib|ide|ole|ant|stat|ine|us|one|mab|in|el|[0-9])( |  )\\("),
      stringr::str_replace(
        therapeutic_context,
        "( |  )\\(.*\\)( )?$",""),
      as.character(therapeutic_context)
    ))
  

  therapeutic_contexts <- as.data.frame(
    cgi_clinical |>
      dplyr::select(molecular_profile_id, therapeutic_context) |>
      dplyr::filter(!is.na(therapeutic_context) &
                      nchar(therapeutic_context) > 0) |>
      tidyr::separate_rows(therapeutic_context, sep = " \\+ ") |>
      dplyr::mutate(therapeutic_context = tolower(therapeutic_context)) |>
      dplyr::left_join(
        dplyr::select(compound_synonyms,
                      alias_lc, 
                      drug_name, 
                      molecule_chembl_id),
        by = c("therapeutic_context" = "alias_lc"), 
        multiple = "all", relationship = "many-to-many"
      ) |>
      dplyr::filter(!is.na(drug_name)) |>
      dplyr::select(molecular_profile_id, molecule_chembl_id) |>
      dplyr::distinct() |>
      dplyr::group_by(molecular_profile_id) |>
      dplyr::summarise(molecule_chembl_id = paste(
        unique(sort(molecule_chembl_id)), collapse="|"),
        .groups = "drop"
      ) |>
      dplyr::ungroup() |>
      dplyr::mutate(molecule_chembl_id = stringr::str_replace(
        molecule_chembl_id,"^NA\\||\\|NA$",""
      ))
  )

  cgi_clinical <- cgi_clinical |>
    dplyr::left_join(
      therapeutic_contexts, by = "molecular_profile_id", 
      multiple = "all", relationship = "many-to-many") |>
    dplyr::mutate(evidence_url = "https://www.cancergenomeinterpreter.org/biomarkers") |>
    dplyr::mutate(biomarker_source = "cgi") |>
    dplyr::mutate(biomarker_source_datestamp = datestamp) |>
    dplyr::mutate(evidence_description = NA,
                  evidence_id = paste0("CGI", molecular_profile_id),
                  molecular_profile_summary = NA,
                  rating = NA,
                  biomarker_entity = T) |>
    dplyr::select(-c(association, evidence_level_raw,
                     primary_tumor_type, drug_full_name)) |>
    dplyr::rename(cancer_type = disease_name,
                  citation_id = source)

  cgi_clinical <- map_biomarker_phenotypes(
    biomarkers_clinical = cgi_clinical,
    cache_dir = cache_dir) |>
    dplyr::distinct()
  
  biomarker_items <- list()
  
 
  biomarker_items[['clinical']] <- cgi_clinical
  biomarker_items[['clinical']]$molecular_profile <- NULL
  
  
  biomarker_items <- get_literature_references(biomarker_items)
  biomarker_items[['clinical']] <- biomarker_items[['clinical']] |>
    dplyr::select(
      biomarker_source,
      biomarker_source_datestamp,
      biomarker_entity,
      molecular_profile_id,
      molecular_profile_name,
      molecular_profile_summary,
      molecular_profile_type,
      variant_id,
      source_id,
      evidence_id,
      variant_origin,
      primary_site,
      cancer_type,
      disease_ontology_id,
      do_name,
      efo_id,
      efo_name,
      cui,
      cui_name,
      dplyr::everything()
    ) |>
    dplyr::distinct()
  
  biomarker_items[['variant']] <- as.data.frame(
    cgi_variants |>
    dplyr::inner_join(
      dplyr::select(
        biomarker_items$clinical,
        variant_id
      ), by = "variant_id", 
      relationship = "many-to-many"
    ) |>
     dplyr::distinct() 
  )

  return(biomarker_items)
}

map_biomarker_phenotypes <- function(biomarkers_clinical = NULL,
                                     cache_dir = NA) {

  cancer_term_map <- 
    phenOncoX::get_terms(cache_dir = cache_dir)$records |>
    dplyr::select(do_id, efo_id, efo_name, do_name,
                  cui, cui_name, primary_site) |>
    dplyr::filter(!is.na(do_id) & !is.na(cui)) |>
    dplyr::rename(disease_ontology_id = do_id) |>
    dplyr::distinct()
  
  cancer_aux_pheno_maps <- 
    phenOncoX::get_aux_maps(cache_dir = cache_dir)

  umls_terms_all <- 
    cancer_aux_pheno_maps$records$umls$concept |>
    dplyr::filter(main_term == T) |>
    dplyr::select(cui, cui_name)
  
  do_terms_all <- 
    cancer_aux_pheno_maps$records$do |>
    dplyr::select(
      cui, do_id, do_name
    ) |>
    dplyr::distinct()
  
  efo_terms_all <- 
    cancer_aux_pheno_maps$records$efo$efo2xref |>
    dplyr::select(
      cui, efo_id, efo_name, primary_site) |>
    dplyr::distinct()

  biomarkers_all_phenotypes <- biomarkers_clinical |>
    dplyr::left_join(
      cancer_term_map, by = "disease_ontology_id", 
      multiple = "all", relationship = "many-to-many")

  ## if conditions in biomarkers is not found in phenOncoX terms
  ## match against individual dictionaries
  missing_do_phenotypes <- biomarkers_all_phenotypes |>
    dplyr::filter(is.na(cui)) |>
    dplyr::select(-c(cui, cui_name, primary_site,
                     efo_id, do_name, efo_name)) |>
    dplyr::inner_join(
      do_terms_all,
      by = c("disease_ontology_id" = "do_id"), 
      multiple = "all", relationship = "many-to-many") |>
    dplyr::left_join(
      umls_terms_all, by = "cui", 
      multiple = "all", relationship = "many-to-many") |>
    dplyr::left_join(
      efo_terms_all,
      by = "cui", multiple = "all",  
    relationship = "many-to-many") |>
    dplyr::distinct() |>
    dplyr::mutate(primary_site = dplyr::case_when(
      do_name == "cancer" ~ as.character(NA),
      is.na(primary_site) & stringr::str_detect(do_name,"brain") ~ "CNS/Brain",
      is.na(primary_site) & stringr::str_detect(do_name,"breast") ~ "Breast",
      is.na(primary_site) & stringr::str_detect(do_name,"colon|rectum") ~ "Colon/Rectum",
      is.na(primary_site) & stringr::str_detect(do_name,"gastric|stomach|esophag") ~ "Esophagus/Stomach",
      is.na(primary_site) & stringr::str_detect(do_name,"prostate") ~ "Prostate",
      is.na(primary_site) & stringr::str_detect(do_name,"pancrea") ~ "Pancreas",
      is.na(primary_site) & stringr::str_detect(do_name,"lung") ~ "Lung",
      is.na(primary_site) & stringr::str_detect(do_name,"myeloid") ~ "Myeloid",
      is.na(primary_site) & stringr::str_detect(do_name,"lymphoma") ~ "Lymphoid",
      is.na(primary_site) & stringr::str_detect(do_name,"ovary|ovarian") ~ "Ovary",
      TRUE ~ as.character(primary_site)
    ))

  biomarkers_phenotype_mapped <-
    dplyr::filter(biomarkers_all_phenotypes, !is.na(cui)) |>
    dplyr::bind_rows(missing_do_phenotypes)

  return(biomarkers_phenotype_mapped)
}

load_custom_fusion_db <- function() {

  other_translocations <- as.data.frame(read.table(
    file = "data-raw/biomarkers/translocation_patterns.tsv", header = T,
    sep = "\t", stringsAsFactors = F) |>
      dplyr::select(ngram_pattern, biomarker) |>
      dplyr::rename(alias = ngram_pattern,
                    variant = biomarker) |>
      dplyr::mutate(biomarker_entity = T,
                    symbol = variant,
                    alias = toupper(alias)) |>
      tidyr::separate_rows(alias, sep = "\\|") |>
      dplyr::mutate(biomarker_source_db = "fusion_customdb") |>
      dplyr::distinct()
  )

  other_translocations_lc <- as.data.frame(read.table(
    file = "data-raw/biomarkers/translocation_patterns.tsv", header = T,
    sep = "\t", stringsAsFactors = F) |>
      dplyr::select(ngram_pattern, biomarker) |>
      dplyr::rename(alias = ngram_pattern,
                    variant = biomarker) |>
      dplyr::mutate(biomarker_entity = T,
                    symbol = variant) |>
      tidyr::separate_rows(alias, sep = "\\|") |>
      dplyr::mutate(biomarker_source_db = "fusion_customdb") |>
      dplyr::distinct()
  )

  biomarker_items <- list()
  
  biomarker_items[['variant']] <- other_translocations |>
    dplyr::bind_rows(other_translocations_lc) |>
    dplyr::distinct() |>
    dplyr::rename(variant_alias = alias,
                  gene = symbol) |>
    dplyr::mutate(alteration_type = "FUSION",
                  variant_consequence = "transcript_fusion",
                  variant_id = variant,
                  variant_name_primary = variant) |>
    dplyr::mutate(alias_type = "other_gene") |>
    dplyr::select(variant_id, variant_name_primary,
                  variant_alias, alias_type,
                  gene, alteration_type, variant_consequence)

  return(biomarker_items)
}

load_depmap_fusions <- function(db_datestamp = "24Q4"){
  
  # Load DepMap fusions
  depmap_data <- list()
  depmap_data[['fusions']] <- as.data.frame(read.csv(
    file = "data-raw/depmap/OmicsFusionFiltered.csv", header = T))
  
  depmap_data[['models']] <- as.data.frame(read.csv(
    file = "data-raw/depmap/Model.csv", header = T)) |>
    dplyr::select(
      ModelID, CellLineName, OncotreeLineage,
      OncotreePrimaryDisease, OncotreeCode,
      Age, Sex, PrimaryOrMetastasis, SampleCollectionSite,
      SourceType
    )
  return(depmap_data)
}

load_mitelman_db <- function(cache_dir = NA,
                             db_datestamp = "20241015") {

  # Load Mitelman database
  # dos2unix -q -n MBCA.TXT.DATA MBCA.TXT
  
  morphology <- read.table(
    file = file.path(
      cache_dir, "mitelmandb", "KODER.TXT.DATA"),
    sep = "\t", stringsAsFactors = F, 
    header = T, fill = T) |>
    dplyr::filter(KodTyp == "MORPH" & nchar(Kod) > 0) |>
    dplyr::mutate('morphology_code' = as.character(Kod),
                  'morphology_name' = Benamning) |>
    dplyr::select(morphology_code, morphology_name) |>
    dplyr::distinct()
  
  topography <- read.table(
    file = file.path(
      cache_dir, "mitelmandb", "KODER.TXT.DATA"),
    sep = "\t", stringsAsFactors = F, 
    header = T, fill = T) |>
    dplyr::filter(KodTyp == "TOP" & nchar(Kod) > 0) |>
    dplyr::mutate('topography_code' = as.character(Kod),
                  'topography_name' = Benamning) |>
    dplyr::select(topography_code, topography_name) |>
    dplyr::distinct()

  pmid_data <- readr::read_tsv(
    file = file.path(
      cache_dir, "mitelmandb", "REF.TXT.DATA"),
    quote = "none", show_col_types = F, guess_max = 1000000) |>
    dplyr::select(RefNo, Pubmed) |>
    dplyr::filter(nchar(Pubmed) > 0) |>
    dplyr::mutate(reference_id = as.character(RefNo)) |>
    dplyr::distinct() |>
    dplyr::rename(source_id = Pubmed) |>
    dplyr::select(reference_id, source_id) |>
    dplyr::distinct()

  fusion_event_data <- as.data.frame(readr::read_tsv(
    file = file.path(
      cache_dir, "mitelmandb", "MBCA.TXT.DATA"),
    show_col_types = F, guess_max = 100000)) |>
    dplyr::filter(stringr::str_detect(GeneShort,"::")) |>
    dplyr::rename(variant = GeneShort,
                  karyotype = KaryShort,
                  reference_id = RefNo,
                  morphology_code = Morph,
                  topography_code = Topo) |>
    dplyr::mutate(reference_id = as.character(reference_id),
                  morphology_code = as.character(morphology_code)) |>
    dplyr::select(MolClin, 
                  reference_id, 
                  morphology_code, 
                  topography_code,
                  variant, karyotype) |>
    tidyr::separate_rows(variant, sep = ",") |>
    dplyr::left_join(pmid_data, by = "reference_id", 
                     multiple = "all", 
                     relationship = "many-to-many") |>
    dplyr::left_join(morphology, by = "morphology_code", 
                     multiple = "all", 
                     relationship = "many-to-many") |>
    dplyr::left_join(topography, by = "topography_code", 
                     multiple = "all", 
                     relationship = "many-to-many") |>
    dplyr::filter(stringr::str_detect(variant,"::")) |>
    dplyr::mutate(variant = stringr::str_replace(
      variant, "\\+", "")) |>
    dplyr::mutate(evidence_id = dplyr::row_number()) |>
    dplyr::mutate(evidence_id = paste0("MITDB_",evidence_id)) |>
    dplyr::filter(stringr::str_count(variant,",") ==
                    stringr::str_count(karyotype,",")) |>
    tidyr::separate_rows(c(variant,karyotype),
                         sep = ",") |>
    dplyr::mutate(topography_name2 = dplyr::case_when(
      is.na(topography_name) & 
        stringr::str_detect(
          tolower(morphology_name), "leukemia|myeloma") ~ "Myeloid",
      is.na(topography_name) & 
        stringr::str_detect(
        tolower(morphology_name), "lymphoma") ~ "Lymphoid",
      topography_name == "Ovary" ~ "Ovary/Fallopian Tube",
      topography_name == "Fallopian tube" ~ "Ovary/Fallopian Tube",
      topography_name == "Uterus, corpus" ~ "Uterus",
      stringr::str_detect(morphology_name, 
        paste0(
          "Nasal cavity/Paranasal sinuses|Nasopharynx|",
          "LarynxOral cavity|Oro- and hypopharynx|",
          "Larynx|Teeth")) ~ "Head and Neck",
      topography_name == "Gallbladder/Biliary system" ~ "Biliary Tract",
      topography_name == "Soft tissue" ~ "Soft Tissue",
      topography_name == "Brain" ~ "CNS/Brain",
      morphology_name == "Mature B-cell neoplasm, NOS" ~ "Lymphoid",
      topography_name == "Brain stem" ~ "CNS/Brain",
      topography_name == "Cerebellum" ~ "CNS/Brain",
      topography_name == "Vagina" ~ "Vulva/Vagina",
      topography_name == "Skeleton" ~ "Bone",
      stringr::str_detect(
        tolower(morphology_name),
        "myelodysplastic|myeloproliferative") ~ "Myeloid",
      stringr::str_detect(
        tolower(morphology_name),
        "miscellaneous hematopoietic/lymphoid|hodgkin disease") ~ "Lymphoid",
      topography_name == "Ureter" ~ "Kidney",
      topography_name == "Unknown site" ~ "Any",
      topography_name == "Urethra" ~ "Bladder/Urinary Tract",
      topography_name == "Spinal cord" ~ "Peripheral Nervous System",
      topography_name == "Uterus, cervix" ~ "Cervix",
      topography_name == "Large intestine" ~ "Colon/Rectum",
      topography_name == "Bladder" ~ "Bladder/Urinary Tract",
      topography_name == "Adrenal" ~ "Adrenal Gland",
      topography_name == "Intraabdominal" ~ "Esophagus/Stomach",
      topography_name == "Oesophagus" ~ "Esophagus/Stomach",
      TRUE ~ topography_name
    )) |>
    dplyr::rename(primary_diagnosis = morphology_name,
                  primary_site = topography_name2) |>
    dplyr::select(variant, 
                  karyotype, 
                  primary_diagnosis,
                  primary_site,
                  source_id, 
                  evidence_id) |>
    dplyr::distinct() |>
    dplyr::mutate(variant_id = paste(
      variant, karyotype, sep = " - "
    ))

  evidence_id <- nrow(fusion_event_data) + 1

  fusion_event_data$variant_alias <- paste(
    fusion_event_data$variant,
    paste(
      stringr::str_split_fixed(fusion_event_data$variant,"::",2)[,2],
      stringr::str_split_fixed(fusion_event_data$variant,"::",2)[,1],
      sep = "-"
    ),
    paste(
      stringr::str_split_fixed(fusion_event_data$variant,"::",2)[,1],
      stringr::str_split_fixed(fusion_event_data$variant,"::",2)[,2],
      sep = "/"
    ),
    paste(
      stringr::str_split_fixed(fusion_event_data$variant,"::",2)[,1],
      stringr::str_split_fixed(fusion_event_data$variant,"::",2)[,2],
      sep = "-"
    ),
    fusion_event_data$karyotype,
    sep = "@@@"
  )
  
  fusion_event_data$variant_name_primary <- fusion_event_data$variant

  bcr_abl_custom <-
    data.frame('variant_id' = "BCR::ABL1 - t(9;22)(q34;q11)",
               'variant_alias' = 'BCR::ABL',
               'karyotype' = 't(9;22)(q34;q11)',
               'variant_name_primary' = 'BCR::ABL1',
               'evidence_id' = paste0('MITDB_',evidence_id),
               'primary_diagnosis' = NA,
               'primary_site' = NA,
               'source_id' = NA,
               'symbol' = 'BCR-ABL1',
               stringsAsFactors = F)

  mitelman_db <- as.data.frame(
    fusion_event_data |>
      dplyr::select(variant_alias, 
                    variant_id,
                    karyotype,
                    variant_name_primary,
                    primary_diagnosis, 
                    primary_site,
                    evidence_id, 
                    source_id) |>
      tidyr::separate_rows(variant_alias, sep = "@@@") |>
      #dplyr::rename(variant_alias = alias) |>
      dplyr::filter(
        !stringr::str_detect(
          variant_alias,"RARA::PML|ABL1::BCR|ERG::TMPRSS2")) |>
      
      dplyr::bind_rows(
        bcr_abl_custom
      ) |>
      dplyr::mutate(alteration_type = "FUSION",
                    variant_origin = "somatic",
                    variant_consequence = "transcript_fusion",
                    evidence_url = "https://mitelmandatabase.isb-cgc.org/",
                    biomarker_source = "mitelmandb",
                    #biomarker_entity = T,
                    #alias_type = "other_gene",
                    biomarker_source_datestamp = db_datestamp) |>
      dplyr::arrange(evidence_id) |>
      dplyr::rename(citation_id = source_id) |>
      dplyr::select(biomarker_source,
                    biomarker_source_datestamp,
                    variant_id,
                    variant_alias,
                    #alias_type,
                    variant_name_primary,
                    alteration_type,
                    dplyr::everything())

  )

  mitelman_db$symbol <- stringr::str_split_fixed(
    mitelman_db$variant_id, " - ", 2)[,1]

  
  biomarker_items <- list()
  biomarker_items[['clinical']] <- mitelman_db
  
  biomarker_items[['variant']] <- biomarker_items[['clinical']] |>
    #dplyr::select(variant_id, variant_alias, alteration_type, symbol) |>
    dplyr::mutate(alias_type = "other_gene") |>
    dplyr::rename(gene = symbol) |>
    #dplyr::rename(variant_alias = variant) |>
    dplyr::select(biomarker_source,
                  biomarker_source_datestamp,
                  variant_id, 
                  variant_alias, 
                  variant_name_primary,
                  alteration_type,
                  alias_type,
                  variant_consequence,
                  gene) |>
    dplyr::distinct()
  
  biomarker_items[['clinical']] <- biomarker_items[['clinical']] |>
    dplyr::select(-c("variant_alias", "variant_name_primary",
                     "alteration_type","variant_consequence",
                     "symbol")) |>
    dplyr::distinct()
  
  biomarker_items <- 
    get_literature_references(biomarker_items)
  
  return(biomarker_items)
}
sigven/oncoPharmaDB documentation built on Dec. 24, 2024, 9:46 p.m.