data_processing_code/data_utility_functions.R

#' A function that splits an array into chunks of equal size
chunk <- function(x,n) split(x, factor(sort(rank(x)%%n)))

#' A function that returns a citation with first author, journal and year for a PubMed ID
#'
#' @param pmid An array of Pubmed IDs
#' @param chunk_size Size of PMID chunks
#'
#' @export
#'
#' @return citation PubMed citation, with first author, journal and year
#'
get_citations_pubmed <- function(
    pmid,
    chunk_size = 100){


  ## set logging layout
  lgr::lgr$appenders$console$set_layout(
    lgr::LayoutFormat$new(timestamp_fmt = "%Y-%m-%d %T"))

  ## make chunk of maximal 400 PMIDs from input array (limit by EUtils)
  pmid_chunks <- chunk(
    pmid, ceiling(length(pmid)/chunk_size))
  j <- 0
  all_citations <- data.frame()
  lgr::lgr$info(
    paste0('Retrieving PubMed citations for PMID list, total length: ',
           length(pmid)))
  while (j < length(pmid_chunks)) {
    pmid_chunk <- pmid_chunks[[as.character(j)]]
    lgr::lgr$info(
      paste0('Processing chunk ',j,' with ',length(pmid_chunk),' PMIDS'))
    pmid_string <- paste(pmid_chunk,collapse = " ")
    res <- RISmed::EUtilsGet(
      RISmed::EUtilsSummary(
        pmid_string, type = "esearch", db = "pubmed", retmax = 5000)
    )


    year <- RISmed::YearPubmed(res)
    authorlist <- RISmed::Author(res)
    pmid_list <- RISmed::PMID(res)
    i <- 1
    first_author <- c()
    while (i <= length(authorlist)) {


      if (length(authorlist[[i]]) == 5) {
        first_author <- c(
          first_author,
          paste(authorlist[[i]][1,]$LastName," et al.",sep = ""))
      } else{
        first_author <- c(
          first_author, as.character("Unknown et al.")
        )
      }
      i <- i + 1
    }
    journal <- RISmed::ISOAbbreviation(res)
    citations <- data.frame(
      'pmid' = as.integer(pmid_list),
      'citation' = paste(
        first_author, year, journal, sep = ", "),
      stringsAsFactors = F)
    citations$link <- paste0(
      '<a href=\'https://www.ncbi.nlm.nih.gov/pubmed/',
      citations$pmid,'\' target=\'_blank\'>',
      citations$citation,'</a>')
    all_citations <- dplyr::bind_rows(
      all_citations, citations)
    j <- j + 1
  }

  return(all_citations)

}

#' A function that returns a citation with first author, journal and year for a PubMed ID
#'
#' @param pmid An array of Pubmed IDs
#' @param raw_db_dir base directory
#' @param chunk_size Size of PMID chunks
#' @return citation PubMed citation, with first author, journal and year
#'
get_citations_pubmed2 <- function(
    pmid,
    raw_db_dir = NULL,
    chunk_size = 300){

  ## make chunk of maximal 400 PMIDs from input array (limit by EUtils)
  pmid_chunks <- chunk(pmid, ceiling(length(pmid)/chunk_size))
  j <- 0
  all_citations <- data.frame()
  cat('Retrieving PubMed citations for PMID list, total length', length(pmid))
  cat('\n')

  eutils_medline_fname <-
    file.path(
      raw_db_dir,
      paste0("tmp_edirect_results_",
             stringi::stri_rand_strings(
               1, 20, pattern = "[A-Za-z0-9]"),
             ".txt"
      )
    )

  eutils_medline_sh <-
    file.path(
      raw_db_dir,
      paste0("tmp_edirect_cmd_",
             stringi::stri_rand_strings(
               1, 20, pattern = "[A-Za-z0-9]"),
             ".sh"
      )
    )

  write("#!/bin/sh\n\n", file = eutils_medline_sh)

  all_cmds <- c()
  while(j < length(pmid_chunks)){
    pmid_chunk <- pmid_chunks[[as.character(j)]]
    cat('Processing chunk (edirect utils) ',j,' with ',length(pmid_chunk),'PMIDs')
    cat('\n')
    pmid_string <- paste(pmid_chunk,collapse = ",")

    eutils_cmd <- paste0(
      "/Users/sigven/edirect/esearch -db pubmed -query '",
      pmid_string,
      "' | /Users/sigven/edirect/efetch -format medline >> ",
      eutils_medline_fname)

    all_cmds <- c(all_cmds, eutils_cmd)
    j <- j + 1
  }

  write(all_cmds, file = eutils_medline_sh, append = T)
  system(paste0('chmod a+rx ',eutils_medline_sh))
  system(eutils_medline_sh, ignore.stdout = T, ignore.stderr = T)

  all_citations <- data.frame()

  if(file.exists(eutils_medline_fname)){

    con <- file(eutils_medline_fname, open = "r")
    lines <- readLines(con)
    fau <- NA
    pmid <- NA
    journal <- NA
    year <- NA

    for(l in lines){
      if(stringr::str_detect(l,"^PMID-")){

        if(!is.na(fau) & !is.na(pmid) & !is.na(journal) & !is.na(year)){
          citation <- data.frame(
            'pmid' = as.integer(pmid),
            'citation' = paste(fau,year,journal,sep=", "),
            'link' =  paste0('<a href=\'https://www.ncbi.nlm.nih.gov/pubmed/',
                             pmid,'\' target=\'_blank\'>',
                             paste(fau,year,journal,sep=", ")
                             ,'</a>'),
            stringsAsFactors = F)

          fau <- NA
          pmid <- NA
          journal <- NA
          year <- NA

          all_citations <- all_citations |>
            dplyr::bind_rows(citation)
        }

        pmid <- stringr::str_replace(l, "PMID- ","")
        cat(pmid,'\n')
      }
      if(stringr::str_detect(l,"^FAU - ") & is.na(fau)){
        fau <- paste0(
          stringr::str_split_fixed(
            stringr::str_replace(l, "FAU - ",""),",",2)[1],
          " et al.")
        cat(fau,'\n')
      }
      if(stringr::str_detect(l,"^DP(\\s+)-")){
        year <- stringr::str_split(
          stringr::str_replace(l, "DP(\\s+)- ","")," ")[[1]][1]
        cat(year,'\n')
      }
      if(stringr::str_detect(l,"^TA(\\s+)-")){
        journal <- stringr::str_replace(l, "TA(\\s+)- ","")
        cat(journal,'\n')
      }

    }
    close(con)

  }

  system('rm -f ', eutils_medline_fname)

  return(all_citations)

}

get_msigdb_signatures <- function(
    raw_db_dir = NULL,
    db_version = 'v2023.1.Hs (March 2023)'){

  ## get full dataset: Broad Institute's Molecular Signatures Database (v7.1)
  invisible(assertthat::assert_that(
    dir.exists(raw_db_dir),
    msg = paste0("Directory '",
                 raw_db_dir,"' does not exist")))
  msigdb_xml_fname <- file.path(
    raw_db_dir, "msigdb", "msigdb.xml")
  invisible(assertthat::assert_that(
    file.exists(msigdb_xml_fname),
    msg = paste0("File '",
                 msigdb_xml_fname,
                 "' does not exist")))

  msig_data_xml <- xml2::read_xml(msigdb_xml_fname)

  ## make data frame with signatures, one record pr. gene-signature association
  all_genesets <- msig_data_xml |> xml2::xml_find_all("//GENESET")
  category_code <- all_genesets |> xml2::xml_attr("CATEGORY_CODE")
  all_msigdb <- data.frame('category_code' = category_code, stringsAsFactors = F)
  all_msigdb$description <- all_genesets |> xml2::xml_attr("DESCRIPTION_BRIEF")
  all_msigdb$standard_name <- all_genesets |> xml2::xml_attr("STANDARD_NAME")
  all_msigdb$organism <- all_genesets |> xml2::xml_attr("ORGANISM")
  all_msigdb$pmid <- all_genesets |> xml2::xml_attr("PMID")
  all_msigdb$systematic_name <- all_genesets |> xml2::xml_attr("SYSTEMATIC_NAME")
  all_msigdb$subcategory_code <- all_genesets |> xml2::xml_attr("SUB_CATEGORY_CODE")
  all_msigdb$entrezgene <- all_genesets |> xml2::xml_attr("MEMBERS_EZID")
  all_msigdb$contributor <- all_genesets |> xml2::xml_attr("CONTRIBUTOR")
  all_msigdb$exact_source <- all_genesets |> xml2::xml_attr("EXACT_SOURCE")
  all_msigdb$external_url <- all_genesets |> xml2::xml_attr("EXTERNAL_DETAILS_URL")
  all_msigdb <- all_msigdb |>
    tidyr::separate_rows(entrezgene,sep=",") |>
    dplyr::filter(organism == "Homo sapiens") |>
    dplyr::filter(subcategory_code != 'CP:KEGG') |>
    dplyr::arrange(category_code) |>
    dplyr::mutate(pmid = dplyr::if_else(
      nchar(pmid) == 0,
      as.character(NA),
      as.character(pmid))) |>
    dplyr::mutate(subcategory_code = dplyr::if_else(
      nchar(subcategory_code) == 0,
      as.character("ALL"),
      as.character(subcategory_code))) |>
    dplyr::filter(
      category_code != "ARCHIVED" &
        category_code != "C1") |>
    dplyr::mutate(external_url = dplyr::if_else(
      nchar(external_url) == 0,
      paste0("http://software.broadinstitute.org/gsea/msigdb/cards/",standard_name),
      as.character(external_url))) |>
    dplyr::mutate(description = stringr::str_replace_all(
      description,
      "( \\[ICI [0-9]{1,}(;[0-9]{1,})*\\]( )?)|( \\[GeneID=[0-9]{1,}(;[0-9]{1,})*\\]( )?)|( \\[PubChem=[0-9]{1,}(;[0-9]{1,})*\\]( )?)",""))

  msigdb_category_description <- read.table(
    file = file.path(
      raw_db_dir, "msigdb",
      "msigdb_collection_description.tsv"),
    sep = "\t", header = T, stringsAsFactors = F)

  msigdb_complete <- as.data.frame(all_msigdb |>
    dplyr::left_join(msigdb_category_description,
                     by = c("category_code", "subcategory_code"),
                     relationship = "many-to-many") |>
    dplyr::mutate(db = "MSigDB", db_version = db_version) |>
    dplyr::select(db, db_version, category_code, category_description,
                  subcategory_code, subcategory_description,
                  standard_name, description, organism,
                  entrezgene) |>
    dplyr::rename(signature_description = description,
                  signature_name = standard_name)
  )


  msigdb <- list()
  msigdb[['VERSION']] <- version
  msigdb[['TERM2SOURCE']] <- all_msigdb |>
    dplyr::filter(subcategory_code != "MIR:MIR_Legacy") |>
    dplyr::filter(subcategory_code != "TFT:TFT_Legacy") |>
    dplyr::filter(subcategory_code != "CP:WIKIPATHWAYS") |>
    dplyr::filter(subcategory_code != "VAX") |>
    dplyr::select(standard_name, exact_source, external_url,
                  category_code, subcategory_code) |>
    dplyr::distinct() |>
    dplyr::mutate(external_url = stringr::str_replace(
      external_url,"\\|https://reactome.org/PathwayBrowser/","")) |>
    dplyr::mutate(external_url = dplyr::if_else(
      nchar(external_url) == 0,as.character(NA),as.character(external_url))) |>
    dplyr::mutate(db = dplyr::if_else(
      stringr::str_detect(standard_name,"^GO_"),
      subcategory_code,as.character(NA))) |>
    dplyr::mutate(db = dplyr::if_else(
      stringr::str_detect(standard_name,"^HP_"),
      subcategory_code,as.character(NA))) |>
    dplyr::mutate(db = dplyr::if_else(
      stringr::str_detect(standard_name,"^REACTOME_"),
      "REACTOME",as.character(db))) |>
    dplyr::mutate(db = dplyr::if_else(
      stringr::str_detect(standard_name,"^BIOCARTA_"),
      "BIOCARTA",as.character(db))) |>
    dplyr::mutate(db = dplyr::if_else(
      stringr::str_detect(standard_name,"^PID_"),
      "PATHWAY_INTERACTION_DB",as.character(db))) |>
    dplyr::mutate(db = dplyr::if_else(
      category_code == "H",
      "HALLMARK",as.character(db))) |>
    dplyr::mutate(db = dplyr::if_else(
      category_code == "C2" & subcategory_code == "CGP",
      "CHEM_GEN_PERTURB",as.character(db))) |>
    dplyr::mutate(db = dplyr::if_else(
      category_code == "C2" & subcategory_code == "CP",
      "CANONICAL_PATHWAY",as.character(db))) |>
    dplyr::mutate(db = dplyr::if_else(
      category_code == "C3" & subcategory_code == "MIR:MIRDB",
      "MICRORNA_TARGET_MIRDB",as.character(db))) |>
    dplyr::mutate(db = dplyr::if_else(
      category_code == "C3" & subcategory_code == "TFT:GTRD",
      "TF_TARGET_GTRD",as.character(db))) |>
    dplyr::mutate(db = dplyr::if_else(
      category_code == "C4" & subcategory_code == "CGN",
      "CANCER_NEIGHBOURHOOD",as.character(db))) |>
    dplyr::mutate(db = dplyr::if_else(
      category_code == "C4" & subcategory_code == "CM",
      "CANCER_MODULE",as.character(db))) |>
    dplyr::mutate(db = dplyr::if_else(
      category_code == "C6","ONCOGENIC",as.character(db))) |>
    dplyr::mutate(db = dplyr::if_else(
      category_code == "C7","IMMUNESIGDB",as.character(db))) |>
    dplyr::mutate(db = dplyr::if_else(
      category_code == "C8","CELLTYPE_SIGNATURES",as.character(db))) |>
    dplyr::select(-c(category_code,subcategory_code)) |>
    dplyr::filter(!is.na(db))


  msigdb[['COLLECTION']] <- list()

  for(c in c('H','C2','C3','C4','C5','C6','C7','C8')){
    msigdb[['COLLECTION']][[c]] <- list()
    subcategories <-
      unique(all_msigdb[all_msigdb$category_code == c,]$subcategory_code)
    for(scat in subcategories){
      subcat <- stringr::str_replace(scat,"GO:","")
      if(subcat == "CP:WIKIPATHWAYS" |
         subcat == "VAX" |
         subcat == "MIR:MIR_Legacy" |
         subcat == "TFT:TFT_Legacy"){
        next
      }

      msigdb[['COLLECTION']][[c]][[subcat]] <- list()
      msigdb[['COLLECTION']][[c]][[subcat]][['TERM2GENE']] <- data.frame()
      msigdb[['COLLECTION']][[c]][[subcat]][['TERM2GENE']] <-
        dplyr::filter(all_msigdb,
                      category_code == c &
                        subcategory_code == scat) |>
        dplyr::select(standard_name, entrezgene) |>
        dplyr::rename(entrez_gene = entrezgene)

      msigdb[['COLLECTION']][[c]][[subcat]][['TERM2NAME']] <-
        dplyr::filter(all_msigdb,
                      category_code == c &
                        subcategory_code == scat) |>
        dplyr::select(standard_name, description) |>
        dplyr::rename(name = description) |>
        dplyr::distinct()
    }
  }
  rm(all_msigdb)
  return(list('db' = msigdb, 'df' = msigdb_complete))
}


get_opentarget_associations <-
  function(raw_db_dir = NULL,
           min_overall_score = 0.1,
           min_num_sources = 2,
           release = "2023.06",
           direct_associations_only = F){

    opentarget_targets <- as.data.frame(
      readRDS(
        file.path(raw_db_dir,
               "opentargets",
               paste0("opentargets_target_",
               release,
               ".rds"))) |>
        dplyr::select(target_ensembl_gene_id,
                      SM_tractability_category,
                      SM_tractability_support,
                      AB_tractability_category,
                      AB_tractability_support) |>
        dplyr::rename(ensembl_gene_id = target_ensembl_gene_id)
      )

    opentarget_associations_raw <- as.data.frame(
      readRDS(
        file.path(raw_db_dir,
                  "opentargets",
                  paste0(
                    "opentargets_association_direct_HC_",
                    release,
                    ".rds"))
      )
    )

    opentarget_datatype_support <- as.data.frame(
      opentarget_associations_raw |>
        dplyr::select(disease_id, target_ensembl_gene_id, datatype_items) |>
        tidyr::separate_rows(datatype_items, sep=",") |>
        tidyr::separate(datatype_items,
                        into = c("datatype","n_evidence","score"),
                        sep="\\|") |>
        dplyr::group_by(disease_id, target_ensembl_gene_id) |>
        dplyr::summarise(datatype_support = paste(
          unique(sort(datatype)), collapse=";"),
          .groups = "drop")
      )

    opentarget_associations_raw <- opentarget_associations_raw |>
      dplyr::left_join(
        opentarget_datatype_support,
        by = c("disease_id", "target_ensembl_gene_id"),
        relationship = "many-to-many")


    opentarget_associations <- as.data.frame(
      opentarget_associations_raw |>
        dplyr::rename(disease_efo_id = disease_id,
                      ensembl_gene_id = target_ensembl_gene_id) |>
        dplyr::mutate(disease_efo_id = stringr::str_replace(
          disease_efo_id,":","_")) |>
        dplyr::filter(score >= min_overall_score) |>
        dplyr::filter(stringr::str_count(datatype_items,",") >= min_num_sources - 1) |>
        dplyr::mutate(association_key =
                        paste(disease_efo_id,
                              "T",
                              datatype_support,
                              score,
                              sep=":")) |>
        dplyr::group_by(ensembl_gene_id) |>
        dplyr::summarise(
          ot_association = paste(association_key, collapse = "&"),
          .groups = "drop")
    )

    ot_associations <- opentarget_targets |>
      dplyr::left_join(opentarget_associations,
                       by = "ensembl_gene_id",
                       relationship = "many-to-many")


    return(ot_associations)

  }


get_cancer_hallmarks <- function(raw_db_dir = NULL,
                                 gene_info = NULL,
                                 opentargets_version = "2023.06",
                                 update = T){

  rds_fname <-
    file.path(raw_db_dir, "opentargets",
              paste0("opentargets_hallmarkdata_",
              opentargets_version, ".rds"))

  if(update == F & file.exists(rds_fname)){
    hallmark_data <- readRDS(file = rds_fname)
    return(hallmark_data)
  }

  opentargets_target_rds <- file.path(
    raw_db_dir, "opentargets",
    paste0("opentargets_target_", opentargets_version, ".rds")
  )

  hallmark_data_long <- as.data.frame(
    readRDS(file = opentargets_target_rds) |>
      dplyr::select(target_ensembl_gene_id, hgnc_id, cancer_hallmark) |>
      dplyr::rename(ensembl_gene_id = target_ensembl_gene_id) |>
      dplyr::filter(!is.na(cancer_hallmark)) |>
      tidyr::separate_rows(cancer_hallmark, sep="@@@") |>
      tidyr::separate(cancer_hallmark,
                      into = c('hallmark','promote','suppress','literature_support'),
                      remove = F, sep = '\\|') |>
      dplyr::rename(promotes = promote, suppresses = suppress) |>
      dplyr::mutate(hallmark = stringr::str_to_title(hallmark)) |>
      tidyr::separate_rows(literature_support, sep="&") |>
      dplyr::mutate(promotes = as.logical(stringr::str_replace(promotes,"PROMOTE:",""))) |>
      dplyr::mutate(suppresses = as.logical(stringr::str_replace(suppresses,"SUPPRESS:",""))) |>
      tidyr::separate(literature_support, into = c('pmid','pmid_summary'), sep="%%%") |>
      dplyr::filter(pmid != "37937225") |>
      dplyr::mutate(pmid_summary = R.utils::capitalize(pmid_summary))
  )

  pmid_data <- get_citations_pubmed(
    unique(hallmark_data_long$pmid),
    chunk_size = 15) |>
    dplyr::mutate(pmid = as.character(pmid))


  hallmark_data_long <- hallmark_data_long |>
    dplyr::left_join(pmid_data, by = "pmid",  relationship = "many-to-many")

  hallmark_data_short <- as.data.frame(
    hallmark_data_long |>
      dplyr::select(hgnc_id, ensembl_gene_id, hallmark,
                    promotes, suppresses, pmid_summary,
                    link) |>
      dplyr::mutate(literature_support = paste0(pmid_summary," (",link,")")) |>
      dplyr::group_by(hgnc_id, ensembl_gene_id, hallmark,
                      promotes, suppresses) |>
      dplyr::summarise(literature_support = paste(
        literature_support, collapse=". "),
        .groups = "drop") |>
      dplyr::left_join(dplyr::select(gene_info, hgnc_id,
                                     symbol, entrezgene),
                       by = "hgnc_id",  relationship = "many-to-many") |>
      dplyr::mutate(symbol =
                      paste0("<a href ='http://www.ncbi.nlm.nih.gov/gene/",
                             entrezgene,"' target='_blank'>",symbol,"</a>")) |>
      dplyr::select(-c("hgnc_id")) |>
      dplyr::select(symbol, entrezgene, ensembl_gene_id,
                    hallmark, promotes, suppresses,
                    literature_support)
  )

  hallmark_data <- list('long' = hallmark_data_long,
                        'short' = hallmark_data_short)
  saveRDS(hallmark_data, file = rds_fname)
  return(hallmark_data)

}

get_tf_target_interactions <- function(
  raw_db_dir = NULL,
  update = F){


  rds_fname = file.path(
    raw_db_dir,
    "omnipathdb",
    "omnipath_tf_interactions.rds")

  if(update == F & file.exists(rds_fname)){
    tf_target_interactions <- readRDS(file = rds_fname)
    return(tf_target_interactions)
  }


  tf_target_interactions_dorothea_all <-
    OmnipathR::import_transcriptional_interactions(
      organism = "9606",
      dorothea_levels = c("A","B","C","D"),
      references_by_resource = F) |>
    dplyr::filter(!is.na(dorothea_level)) |>
    dplyr::group_by(source_genesymbol, target_genesymbol) |>
    dplyr::summarise(
      references = paste(unique(references), collapse=";"),
      interaction_sources = paste(unique(sources), collapse="; "),
      dorothea_level = paste(unique(dorothea_level), collapse=";"),
      .groups = "drop") |>
    dplyr::filter(!stringr::str_detect(source_genesymbol,"_")) |>
    dplyr::mutate(interaction_sources = stringr::str_squish(
      stringr::str_replace_all(
        interaction_sources, ";","; "))) |>

    ## Not available for non-academic use (entries provided solely with TRED)
    dplyr::filter(!stringr::str_detect(
      interaction_sources,"^(RegNetwork_DoRothEA; TRED_DoRothEA)$")) |>
    dplyr::mutate(interaction_source = stringr::str_replace_all(
      interaction_sources, "; TRED_DoRothEA$", ""
    )) |>
    dplyr::mutate(interaction_source = stringr::str_replace_all(
      interaction_sources, "; TRED_DoRothEA; ", "; "
    ))

  tf_target_pmids <- tf_target_interactions_dorothea_all |>
    dplyr::filter(!is.na(references) & nchar(references) > 0) |>
    dplyr::select(source_genesymbol, target_genesymbol, references) |>
    tidyr::separate_rows(references, sep=";") |>
    dplyr::filter(nchar(references) > 0) |>
    dplyr::distinct()

  tf_target_citations <- get_citations_pubmed(
    pmid = unique(tf_target_pmids$references),
    chunk_size = 200)

  tf_target_pmids <- as.data.frame(
    tf_target_pmids |>
      dplyr::mutate(references = as.integer(references)) |>
      dplyr::left_join(
        tf_target_citations, by = c("references" = "pmid"),
        relationship = "many-to-many") |>
      dplyr::group_by(source_genesymbol, target_genesymbol) |>
      dplyr::summarise(
        tf_target_literature_support = paste(
          link, collapse= ","),
        tf_target_literature = paste(
          citation, collapse="|"
        ),
        .groups = "drop"
      )
  )

  tf_target_interactions_dorothea_all <- tf_target_interactions_dorothea_all |>
    dplyr::left_join(tf_target_pmids,
                     by = c("source_genesymbol",
                            "target_genesymbol"),
                     relationship = "many-to-many") |>
    dplyr::rename(regulator = source_genesymbol,
                  target = target_genesymbol) |>
    dplyr::select(-references) |>
    dplyr::mutate(confidence_level = dplyr::case_when(
      stringr::str_detect(dorothea_level,"A|A;B|A;B;D|A;D") ~ "A",
      stringr::str_detect(dorothea_level,"B|B;D|B;C") ~ "B",
      stringr::str_detect(dorothea_level,"C;D|C") ~ "C",
      stringr::str_detect(dorothea_level,"D") ~ "D",
      TRUE ~ as.character(dorothea_level),
    )) |>
    dplyr::select(-dorothea_level) |>
    dplyr::mutate(mode_of_regulation = NA)


  tf_target_interactions_dorothea_pancancer <-
    dorothea::dorothea_hs_pancancer |>
    dplyr::mutate(interaction_sources = "DoRothEA_hs_pancancer") |>
    dplyr::rename(regulator = tf,
                  confidence_level = confidence,
                  mode_of_regulation = mor) |>
    dplyr::mutate(mode_of_regulation = dplyr::if_else(
      mode_of_regulation == 1,"Stimulation",
      "Repression")) |>
    dplyr::mutate(tf_target_literature_support = NA,
                  tf_target_literature = NA) |>
    dplyr::select(regulator, target,
                  interaction_sources,
                  confidence_level,
                  mode_of_regulation,
                  tf_target_literature_support,
                  tf_target_literature)

  tf_target_interactions <- list()
  tf_target_interactions[['global']] <-
    tf_target_interactions_dorothea_all
  tf_target_interactions[['pancancer']] <-
    tf_target_interactions_dorothea_pancancer

  saveRDS(tf_target_interactions, file = rds_fname)

  return(tf_target_interactions)
}

get_cancer_drugs <- function(raw_db_dir = NULL){

  cancer_drugs <- list()
  target_drug_edges <- list()
  target_drug_nodes <- list()
  drugs_per_gene <- list()
  for(p in c('early_phase','late_phase')){
    cancer_drugs[[p]] <- data.frame()
    target_drug_edges[[p]] <- data.frame()
    target_drug_nodes[[p]] <- data.frame()
    drugs_per_gene[[p]] <- data.frame()
  }


  approved_inhibitors <- as.data.frame(
    pharmOncoX::get_drugs(
      cache_dir = raw_db_dir,
      drug_targeted_agent = T,
      drug_action_inhibition = T,
      drug_cancer_indication = T,
      drug_is_approved = T)$records |>
      dplyr::filter(!is.na(primary_site)) |>
      dplyr::filter(
        stringr::str_detect(drug_clinical_source,"FDA|DailyMed|ATC")) |>
      dplyr::select(
        disease_efo_label, target_entrezgene,
        drug_name, molecule_chembl_id) |>
      dplyr::rename(entrezgene = target_entrezgene) |>
      dplyr::mutate(entrezgene = as.integer(entrezgene)) |>
      dplyr::mutate(
        drug_link =
          paste0("<a href = 'https://platform.opentargets.org/drug/",
                 molecule_chembl_id,
                 "' target='_blank'>",drug_name,"</a>")) |>
      dplyr::group_by(entrezgene, drug_link) |>
      dplyr::summarise(indications = paste(
        sort(unique(disease_efo_label)), collapse="; "
      ), .groups = "drop") |>
      dplyr::ungroup() |>
      dplyr::mutate(indications = stringr::str_replace(
        indications,
        "breast cancer; breast carcinoma",
        "breast cancer"
      )) |>
      dplyr::mutate(drug_indications = paste0(
        drug_link, " (",indications,")")) |>
      dplyr::group_by(entrezgene) |>
      dplyr::summarise(approved_drugs = paste(
        drug_indications, collapse=", "),
        .groups = "drop")
  )

  black_list <- data.frame(
    'drug_name' = c('8h9 131i','Abc-294640',
                  'Axl-1717','Azd-3759',
                  'Apg115','Bay-1161909',
                  'Sndx-5613 Free Base'))

  cancer_drugs[['late_phase']] <-
    pharmOncoX::get_drugs(
      cache_dir = raw_db_dir,
      drug_targeted_agent = T,
      drug_action_inhibition = T,
      drug_cancer_indication = F,
      drug_classified_cancer = F,
      drug_minimum_phase_any_indication = 3)$records |>
    dplyr::filter(!is.na(molecule_chembl_id)) |>
    dplyr::filter(drug_cancer_relevance != "by_other_condition_otp" &
                    drug_cancer_relevance != "by_cancer_target_otp") |>
    dplyr::select(target_entrezgene,
                  drug_name,
                  primary_site,
                  molecule_chembl_id,
                  drug_max_ct_phase) |>
    dplyr::anti_join(black_list, by = "drug_name") |>
    dplyr::distinct() |>
    dplyr::rename(entrezgene = target_entrezgene) |>
    dplyr::mutate(entrezgene = as.integer(entrezgene)) |>
    dplyr::distinct()

  cancer_drugs[['early_phase']] <-
    pharmOncoX::get_drugs(
      cache_dir = raw_db_dir,
      drug_targeted_agent = T,
      drug_action_inhibition = F,
      drug_cancer_indication = F,
      drug_classified_cancer = F,
      drug_minimum_phase_any_indication = 0)$records |>
    dplyr::filter(!is.na(molecule_chembl_id)) |>
    dplyr::filter(drug_cancer_relevance != "by_other_condition_otp" &
                    drug_cancer_relevance != "by_cancer_target_otp") |>
    dplyr::anti_join(
      cancer_drugs[['late_phase']], by = "molecule_chembl_id") |>
    dplyr::select(target_entrezgene,
                  drug_name,
                  primary_site,
                  molecule_chembl_id,
                  drug_max_ct_phase) |>
    dplyr::anti_join(black_list, by = "drug_name") |>
    dplyr::rename(entrezgene = target_entrezgene) |>
    dplyr::mutate(entrezgene = as.integer(entrezgene)) |>
    dplyr::distinct()

  ## check for duplicate chembl IDs
  dups <- cancer_drugs$early_phase |>
    dplyr::bind_rows(cancer_drugs$late_phase) |>
    dplyr::group_by(molecule_chembl_id) |>
    dplyr::summarise(
      drug_name = paste(unique(drug_name),
                        collapse="@")) |>
    dplyr::filter(
      stringr::str_detect(drug_name, "@")
    )

  if(NROW(dups) > 0){
    lgr::lgr$error("ERROR: duplicate chembl IDs")
    return(NULL)
  }

  for(p in c('early_phase','late_phase')){

    cancer_drugs[[p]] <- cancer_drugs[[p]] |>
      dplyr::mutate(to = paste0("s", molecule_chembl_id)) |>
      dplyr::mutate(from = paste0("s", entrezgene)) |>
      dplyr::distinct()

    drugs_per_gene[[p]] <- as.data.frame(
      cancer_drugs[[p]] |>
        dplyr::select(entrezgene,
                      drug_name,
                      drug_max_ct_phase,
                      molecule_chembl_id) |>
        dplyr::distinct() |>
        dplyr::mutate(
          drug_link =
            paste0("<a href = 'https://platform.opentargets.org/drug/",
                   molecule_chembl_id,"' target='_blank'>",drug_name,"</a>")) |>
        dplyr::arrange(entrezgene, desc(drug_max_ct_phase)) |>
        dplyr::group_by(entrezgene) |>
        dplyr::summarise(
          targeted_cancer_drugs =
            paste(unique(drug_link),
                  collapse = ", "), .groups = "drop")
    )
    if(p == 'early_phase'){
      drugs_per_gene[[p]] <- drugs_per_gene[[p]] |>
        dplyr::rename(
          targeted_cancer_drugs_ep = targeted_cancer_drugs)
    }else{
      drugs_per_gene[[p]] <- drugs_per_gene[[p]] |>
        dplyr::rename(
          targeted_cancer_drugs_lp = targeted_cancer_drugs)
    }

    target_drug_edges[[p]] <- cancer_drugs[[p]] |>
      dplyr::select(drug_name,
                    from,
                    to,
                    #target_symbol,
                    entrezgene,
                    molecule_chembl_id) |>
      dplyr::distinct() |>
      dplyr::mutate(width = 1)

    target_drug_nodes[[p]] <- as.data.frame(
      cancer_drugs[[p]] |>
        dplyr::select(to,
                      drug_name,
                      primary_site) |>
        dplyr::distinct() |>
        dplyr::rename(id = to,
                      label = drug_name) |>
        dplyr::group_by(id, label) |>
        dplyr::summarise(
          primary_site = paste(sort(unique(primary_site)),
                               collapse = ", "),
          .groups = "drop") |>
        dplyr::ungroup() |>
        dplyr::mutate(
          primary_site =
            dplyr::if_else(is.na(primary_site) |
                             nchar(primary_site) == 0,
                           "Unspecific indication",
                           as.character(primary_site))) |>
        dplyr::mutate(
          title = paste0(label, " (", primary_site,")")) |>
        dplyr::mutate(
          shadow = F, shape = "diamond",
          color.background = "orange",
          font.color = "black")
    )
    if(p == 'early_phase'){
      target_drug_nodes[[p]] <- target_drug_nodes[[p]] |>
        dplyr::mutate(shadow = F,
                      shape = "diamond",
                      color.background = "purple",
                      font.color = "black")
    }

  }
  cancerdrugdb <- list()
  cancerdrugdb[['approved_per_target']] <- approved_inhibitors
  cancerdrugdb[['drug_per_target']] <- list()
  cancerdrugdb[['drug_per_target']][['early_phase']] <-
    drugs_per_gene[['early_phase']]
  cancerdrugdb[['drug_per_target']][['late_phase']] <-
    drugs_per_gene[['late_phase']]

  cancerdrugdb[['network']] <- list()
  cancerdrugdb[['network']][['edges']] <-
    dplyr::bind_rows(target_drug_edges[['early_phase']],
                     target_drug_edges[['late_phase']]) |>
    dplyr::select(from, to, width) |>
    dplyr::distinct()

  cancerdrugdb[['network']][['nodes']] <-
    dplyr::bind_rows(target_drug_nodes[['early_phase']],
                     target_drug_nodes[['late_phase']]) |>
    dplyr::distinct()

  return(cancerdrugdb)


}

get_pathway_annotations <- function(
  raw_db_dir = NULL,
  gene_info = NULL,
  wikipathways = T,
  kegg = T,
  netpath = T,
  msigdb = T,
  omnipathdb = NULL,
  msigdb_version = NULL,
  wikipathways_version = NULL,
  kegg_version = NULL,
  netpath_version = NULL){


  wikipathways_gmt <- file.path(
    raw_db_dir, "wikipathways",
    paste0("wikipathways-", wikipathways_version,
           "-gmt-Homo_sapiens.gmt")
  )

  netpath_mapping_fname <- file.path(
    raw_db_dir, "netpath",
    "id_mapping.tsv")

  keggdb_fname <- file.path(
    raw_db_dir,
    "kegg", "kegg.pathway.gene.tsv.gz"
  )

  assertable::assert_colnames(
    gene_info,
    c("symbol","entrezgene"),
    only_colnames = F,
    quiet = T)

  pathwaydb <- list()

  if(wikipathways == T){
    if(!file.exists(wikipathways_gmt)){
      rWikiPathways::downloadPathwayArchive(
        organism = "Homo sapiens",
        date = wikipathways_version,
        destpath = file.path(raw_db_dir,"wikipathways"),
        format = "gmt")
    }

    wikipathways <- clusterProfiler::read.gmt(
      wikipathways_gmt)
    wp2gene <- wikipathways |>
      tidyr::separate(term, c("name","version","wpid","org"), "%")
    wikipathwaydb <- list()
    wikipathwaydb[['VERSION']] <- wikipathways_version
    wikipathwaydb[['TERM2GENE']] <- wp2gene |>
      dplyr::select(wpid, gene) |>
      dplyr::rename(standard_name = wpid, entrez_gene = gene)
    wikipathwaydb[['TERM2NAME']] <- wp2gene |>
      dplyr::select(wpid, name) |>
      dplyr::rename(standard_name = wpid) |>
      dplyr::distinct()

    pathwaydb[['wikipathways']] <- wikipathwaydb
  }


  if(netpath == T & !is.null(omnipathdb) & !is.null(gene_info)){
      netpath_idmapping <-
      read.table(netpath_mapping_fname,
                 stringsAsFactors = F, header = F, sep = "\t",
                 col.names = c("name", "standard_name")) |>
      dplyr::mutate(standard_name = paste0("NetPath_",standard_name))

    netpath_pathway_data <- omnipathdb |>
      dplyr::filter(source == "NetPath") |>
      dplyr::select(genesymbol, value) |>
      dplyr::rename(name = value, symbol = genesymbol) |>
      dplyr::left_join(netpath_idmapping, by = "name",  relationship = "many-to-many") |>
      dplyr::mutate(name = paste0(name," signaling pathway")) |>
      dplyr::arrange(name, standard_name, symbol) |>
      dplyr::left_join(dplyr::select(
        gene_info, entrezgene, symbol),
        by = "symbol",  relationship = "many-to-many") |>
      dplyr::rename(entrez_gene = entrezgene) |>
      dplyr::mutate(entrez_gene = as.character(entrez_gene)) |>
      dplyr::select(standard_name, name, entrez_gene)
    netpathdb <- list()
    netpathdb[['VERSION']] <- netpath_version
    netpathdb[['TERM2GENE']] <- netpath_pathway_data |>
      dplyr::select(standard_name, entrez_gene)
    netpathdb[['TERM2NAME']] <- netpath_pathway_data |>
      dplyr::select(standard_name, name) |>
      dplyr::distinct()

    pathwaydb[['netpath']] <- netpathdb
  }


  if(kegg == T){
    kegg_pathway_genes <- as.data.frame(
      readr::read_tsv(file = keggdb_fname,
                 col_names = T, quote = "",
                 show_col_types = F))
    keggdb <- list()
    keggdb[['VERSION']] <- kegg_version
    keggdb[['TERM2NAME']] <- kegg_pathway_genes |>
      dplyr::select(name,pathway_id) |>
      dplyr::rename(standard_name = pathway_id) |>
      dplyr::select(standard_name, name) |>
      dplyr::distinct()

    keggdb[['TERM2GENE']] <- kegg_pathway_genes |>
      dplyr::select(pathway_id, gene_id) |>
      dplyr::rename(standard_name = pathway_id,
                    entrez_gene = gene_id) |>
      dplyr::select(standard_name,entrez_gene) |>
      dplyr::mutate(entrez_gene = as.character(
        entrez_gene)) |>
      dplyr::distinct()

    pathwaydb[['kegg']] <- keggdb
  }

  if(msigdb == T){

    msigdb <-
      get_msigdb_signatures(raw_db_dir = raw_db_dir,
                            db_version = msigdb_version)

    pathwaydb[['msigdb']] <- msigdb$db

  }

  return(pathwaydb)

}

get_protein_complexes <- function(
    raw_db_dir = NULL,
    update = F){

    rds_fname <- file.path(
      raw_db_dir,
      "protein_complexes", "omnipath_complexdb.rds")

    humap2_complexes_tsv <-file.path(
      raw_db_dir,
      "protein_complexes", "humap2_complexes.tsv")

    corum_complexes_tsv <- file.path(
      raw_db_dir,
      "protein_complexes", "coreComplexes.txt")

    humap_url <-
      "http://humap2.proteincomplexes.org/static/downloads/humap2/humap2_complexes_20200809.txt"


    if(update == F & file.exists(rds_fname)){
      proteincomplexdb <- readRDS(rds_fname)
      return(proteincomplexdb)
    }

    ## Protein complex data from OmniPath - multiple underlying databases
    ## - CORUM, ComplexPortal, Compleat etc

    protein_complexes <- list()

    protein_complexes[['OmniPath']] <- OmnipathR::import_omnipath_complexes() |>
      dplyr::filter(!is.na(references) & !is.na(name)) |>
      dplyr::rename(pmid = references,
                    uniprot_acc = components,
                    complex_name = name) |>
      dplyr::select(complex_name, uniprot_acc,
                    pmid, sources) |>
      dplyr::mutate(complex_name = Hmisc::capitalize(complex_name)) |>
      dplyr::distinct() |>
      dplyr::filter(stringr::str_detect(uniprot_acc,"_")) |>
      tidyr::separate_rows(uniprot_acc, sep = "_") |>
      tidyr::separate_rows(sources, sep=";") |>
      tidyr::separate_rows(pmid, sep=";") |>
      dplyr::filter(nchar(pmid) > 3) |>
      dplyr::distinct() |>
      dplyr::mutate(complex_name = dplyr::case_when(
        complex_name == "26S proteasome" ~ "26S Proteasome",
        complex_name == "Abi1-Wasl complex" ~ "ABI1-WASL complex",
        complex_name == "Brm-associated complex" ~ "BRM-associated complex",
        complex_name == "Ski Complex" ~ "SKI complex",
        complex_name == "SF3b complex" ~ "SF3B complex",
        complex_name == "Sin3 complex" ~ "SIN3 complex",
        complex_name == "Sos1-Abi1-Eps8 complex" ~ "SOS1-ABI1-EPS8 complex",
        complex_name == "DCS complex (Ptbp1, Ptbp2, Hnrph1, Hnrpf)" ~
          "DCS complex (PTBP1, PTBP2, HNRPH1, HNRPF)",
        complex_name == "Tiam1-Efnb1-Epha2 complex" ~ "TIAM1-EFNB1-EPHA2 complex",
        complex_name == "Tip5-Dnmt-Hdac1 complex" ~ "TIP5-DNMT-HDAC1 complex",
        TRUE ~ as.character(complex_name)
      )) |>
      dplyr::group_by(complex_name) |>
      dplyr::summarise(
        uniprot_acc = paste(sort(unique(uniprot_acc)),
                            collapse = "_"),
        pmid = paste(sort(unique(pmid)),
                     collapse=";"),
        sources = paste(sort(unique(sources)),
                        collapse=";")) |>
      dplyr::distinct() |>
      dplyr::mutate(num_sources = stringr::str_count(sources,";") + 1) |>
      dplyr::mutate(complex_name = stringr::str_replace_all(
        complex_name,"\\\\u2013","-"
      )) |>
      dplyr::mutate(sources = stringr::str_replace_all(
        sources, ";", "; ")) |>
      dplyr::mutate(complex_id = dplyr::row_number())

     pmid2complex <- protein_complexes[['OmniPath']] |>
       dplyr::select(complex_id,
                     pmid) |>
       tidyr::separate_rows(pmid, sep=";") |>
       dplyr::filter(stringr::str_detect(pmid,"^[0-9]{4,}$"))

     protein_complex_citations <- get_citations_pubmed(
       pmid = unique(pmid2complex$pmid))

     pmid2complex <- pmid2complex |>
       dplyr::mutate(pmid = as.integer(pmid)) |>
       dplyr::left_join(protein_complex_citations, by = "pmid", relationship = "many-to-many") |>
       dplyr::group_by(complex_id) |>
       dplyr::summarise(
         complex_literature_support = paste(
           link, collapse= ", "),
         complex_literature = paste(
           citation, collapse="|"
         )
       )

     protein_complexes[['OmniPath']] <-
       protein_complexes[['OmniPath']] |>
       dplyr::left_join(pmid2complex,
                       by = "complex_id",
                       relationship = "many-to-many")

    ## CORUM annotations (complex comments, disease comments, purification methods)
    protein_complexes[['CORUM']] <- as.data.frame(
      readr::read_tsv(
        corum_complexes_tsv,
        show_col_types = F) |>
        janitor::clean_names() |>
        dplyr::rename(pmid = pub_med_id,
                      uniprot_acc = subunits_uni_prot_i_ds) |>
        dplyr::mutate(
          complex_name = stringr::str_replace_all(
            complex_name, "\\\\u2013", "-"
          )
        ) |>
        dplyr::mutate(
          complex_name = dplyr::if_else(
            complex_name == "VHL-ElonginB-ElonginC complex",
            "VHL-TCEB1-TCEB2 complex",
            as.character(complex_name)
          )
        ) |>
        dplyr::mutate(
          complex_name = dplyr::if_else(
            complex_name == "TGF-beta receptor II-TGF-beta3 complex",
            "TGF-betaR2-TGF-beta3 complex",
            as.character(complex_name)
          )
        ) |>
        dplyr::filter(organism == "Human") |>
        tidyr::separate_rows(uniprot_acc, sep = ";") |>
        tidyr::separate_rows(pmid, sep = ";") |>
        dplyr::filter(nchar(pmid) > 2) |>
        dplyr::group_by(complex_name) |>
        dplyr::summarise(
          purification_method =
            paste(unique(protein_complex_purification_method),
                  collapse = "; "),
          complex_comment =
            paste(unique(complex_comment),
                  collapse="; "),
          disease_comment =
            paste(unique(disease_comment),
                  collapse="; "),
          # pmid =
          #   paste(unique(pmid),
          #         collapse=";"),
          # uniprot_acc = paste(sort(unique(uniprot_acc)),
          #                     collapse = "_"),
          .groups = "drop"
        ) |>
        dplyr::distinct()
    )


    # missing_corum_entries <- protein_complexes[['CORUM']] |>
    #   dplyr::anti_join(protein_complexes[['OmniPath']], by = "complex_name") |>
    #   dplyr::anti_join(protein_complexes[['OmniPath']], by = "uniprot_acc") |>
    #   dplyr::filter(stringr::str_detect(
    #     uniprot_acc, "_"
    #   ))

    protein_complexes[['OmniPath']] <-
      protein_complexes[['OmniPath']] |>
      dplyr::left_join(
        protein_complexes[['CORUM']],
        by = "complex_name",
        relationship = "many-to-many") |>
      #dplyr::select(-pmid) |>
      dplyr::mutate(confidence = NA) |>
      dplyr::mutate(complex_id = as.character(complex_id))

    if(!file.exists(humap2_complexes_tsv)){
      download.file(url = humap_url,
                    destfile = humap2_complexes_tsv)
    }

    complex_humap <- readr::read_csv(
      file = humap2_complexes_tsv,
      show_col_types = F) |>
      janitor::clean_names() |>
      dplyr::rename(complex_id = hu_map2_id,
                    uniprot_acc = uniprot_ac_cs) |>
      dplyr::select(-genenames) |>
      dplyr::mutate(uniprot_acc = stringr::str_replace_all(
        uniprot_acc, " ","_"
      )) |>
      dplyr::mutate(complex_name = complex_id) |>
      dplyr::mutate(sources = "hu.MAP 2.0") |>
      dplyr::anti_join(
        protein_complexes[['OmniPath']],
        by = "uniprot_acc")

    complex_all <- protein_complexes[['OmniPath']] |>
      dplyr::bind_rows(complex_humap)

    proteincomplexdb <- list()
    proteincomplexdb[["db"]] <- complex_all |>
      dplyr::select(-uniprot_acc) |>
      dplyr::distinct()

    proteincomplexdb[["up_xref"]] <- complex_all |>
      dplyr::select(complex_id, uniprot_acc) |>
      tidyr::separate_rows(uniprot_acc, sep="_") |>
      dplyr::distinct()


    saveRDS(proteincomplexdb,
            file = rds_fname)
    return(proteincomplexdb)
}

clean_project_score_cell_lines <- function(
  mat){

  ## Ignore cell lines that fail QC
  mat <- mat[
    , mat[3,] != "False"
  ]

  rownames(mat) <-
    mat[,"model_name"]
  mat <- mat[,-1]
  colnames(mat) <-
    mat["model_id",]
  mat <- mat[-1,]
  mat <- mat[-2,]
  mat <- mat[-2,]

  duplicate_cell_lines <-
    plyr::count(colnames(mat)) |>
    dplyr::filter(freq == 2)

  ## Mark cell lines included for analysis
  ## - if duplicates - keep Sanger cell line
  cell_line_identifiers <- colnames(mat)
  for(i in 1:length(cell_line_identifiers)){
    if(cell_line_identifiers[i] %in% duplicate_cell_lines$x){
      if(mat[1,i] == "Sanger"){
        mat[1,i] <- "Inc"
      }else{
        mat[1,i] <- "Ex"
      }
    }else{
      mat[1,i] <- "Inc"
    }
  }

  ## Exclude duplicate cell lines (Broad)
  mat <- mat[
    , mat[1,] != "Ex"
  ]
  mat <- mat[-1,]

  return(mat)

}

get_fitness_data_crispr <- function(
    raw_db_dir = NULL,
    gene_info = NULL){

    cell_lines <- read.csv2(
      file = file.path(
        raw_db_dir,
        "crispr_viability",
        "model_list.csv"),
      comment.char="",sep=",",
      header = T, stringsAsFactors = F,
      fill = T,na.strings = c("")) |>
      janitor::clean_names() |>
      dplyr::select(model_id, model_name, synonyms, model_type,
                    crispr_ko_data, tissue, cancer_type,
                    tissue_status, cancer_type_detail, cancer_type_ncit_id,
                    sample_site, gender, species, ethnicity, mutational_burden,
                    ploidy_wes, msi_status) |>
      dplyr::filter(model_type == "Cell Line" &
                      species == "Homo Sapiens") |>
      dplyr::filter(tissue != "Unknown" &
                      tissue != "Adrenal Gland" &
                      tissue != "Placenta" &
                      tissue != "Vulva") |>
      dplyr::mutate(tissue = dplyr::if_else(tissue == "Central Nervous System",
                                            "CNS/Brain",
                                            as.character(tissue))) |>
      dplyr::mutate(tissue = dplyr::if_else(tissue == "Large Intestine",
                                            "Colon/Rectum",
                                            as.character(tissue))) |>
      dplyr::mutate(tissue = dplyr::if_else(tissue == "Small Intestine",
                                            "Stomach",
                                            as.character(tissue)))

    gene_identifiers <-
      read.csv(file = file.path(
        raw_db_dir,
        "crispr_viability",
        "gene_identifiers.csv"),
        stringsAsFactors = F) |>
      dplyr::rename(gene_id_project_score = gene_id,
                    entrezgene = entrez_id) |>
      dplyr::filter(!is.na(entrezgene)) |>
      dplyr::left_join(
        dplyr::select(
          gene_info, symbol,
          entrezgene), by = "entrezgene",
        relationship = "many-to-many") |>
      dplyr::select(gene_id_project_score, entrezgene, symbol)

    bayes_factors <- as.matrix(
      readr::read_tsv(
        file = file.path(
          raw_db_dir,
          "crispr_viability",
          "scaledBayesianFactors.tsv.gz"),
        show_col_types = F)) |>
      clean_project_score_cell_lines()

    bayes_factors <-
      as.data.frame(setNames(reshape2::melt(bayes_factors, na.rm = T),
                             c('symbol', 'model_id', 'scaled_BF'))) |>
      dplyr::mutate(model_id = as.character(model_id),
                    symbol = as.character(symbol),
                    scaled_BF = as.numeric(scaled_BF) * -1) |>
      dplyr::filter(!is.na(scaled_BF)) |>
      dplyr::filter(scaled_BF < 0)

    ## Fitness scores - new
    cell_fitness_scores <- as.matrix(readr::read_tsv(
      file = file.path(
        raw_db_dir,
        "crispr_viability",
        "binaryFitnessScores.tsv.gz"),
        show_col_types = F)) |>
      clean_project_score_cell_lines()


    projectscoredb <- list()
    projectscoredb[['fitness_scores']] <-
      as.data.frame(setNames(reshape2::melt(cell_fitness_scores, na.rm = T),
                             c('symbol', 'model_id', 'loss_of_fitness'))) |>
      dplyr::mutate(model_id = as.character(model_id),
                    symbol = as.character(symbol),
                    loss_of_fitness = as.integer(loss_of_fitness)) |>
      dplyr::filter(loss_of_fitness == 1) |>
      dplyr::left_join(
        dplyr::select(cell_lines, model_id, model_name, tissue,
                      cancer_type, sample_site, tissue_status),
        by = c("model_id"),  relationship = "many-to-many") |>
      dplyr::filter(!is.na(model_name)) |>
      dplyr::left_join(gene_identifiers,by=c("symbol"),  relationship = "many-to-many") |>
      dplyr::filter(!is.na(gene_id_project_score)) |>
      dplyr::left_join(
        bayes_factors, by =
          c("symbol", "model_id"),  relationship = "many-to-many") |>
      dplyr::arrange(symbol, scaled_BF)

    ## Target priority scores
    projectscoredb[['target_priority_scores']] <-
      read.csv(file = file.path(
        raw_db_dir,
        "crispr_viability", "depmap-priority-scores.csv"),
        header = T, quote="", stringsAsFactors = F) |>
      purrr::set_names(
        c("tractability_bucket","gene_id","priority_score",
          "symbol","analysis_id","tumor_type")) |>
      dplyr::mutate(priority_score = as.numeric(
        stringr::str_replace_all(priority_score,"\\\"",""))) |>
      dplyr::mutate(symbol = as.character(
        stringr::str_replace_all(symbol,"\\\"",""))) |>
      dplyr::mutate(gene_id = as.character(
        stringr::str_replace_all(gene_id,"\\\"",""))) |>
      dplyr::mutate(tumor_type = as.character(
        stringr::str_replace_all(tumor_type,"\\\"",""))) |>
      dplyr::select(symbol, gene_id, tumor_type,
                    priority_score) |>
      dplyr::mutate(priority_score =
                      round(priority_score,
                            digits = 3)) |>
      dplyr::mutate(tumor_type = dplyr::case_when(
        tumor_type == "Pan-Cancer" ~ "Pancancer",
        tumor_type == "Haematopoietic and Lyphoid" ~
          "Haematopoietic and Lymphoid",
        TRUE ~ as.character(tumor_type)
      )) |>
      dplyr::distinct()

    ## order symbols by pancancer priority rank
    priority_order <-
      projectscoredb$target_priority_scores |>
      dplyr::filter(tumor_type == "Pancancer") |>
      dplyr::arrange(desc(priority_score))

    projectscoredb[['target_priority_scores']] <-
      projectscoredb[['target_priority_scores']] |>
      dplyr::mutate(symbol = factor(symbol,
                                    levels = priority_order$symbol))

    return(projectscoredb)

}


get_survival_associations <- function(
  raw_db_dir = NULL,
  gene_info = NULL){

  for(feature_type in
      c('CNAs','Mutations','Gene expression','Methylation',
        'miRNA expression','Protein expression')){
    destfile_fname <-
      file.path(raw_db_dir, "km_survival_cshl",
                paste0(
                  tolower(
                    stringr::str_replace(
                      stringr::str_replace(feature_type," ","_"),
                      "s$","")),
                  ".xlsx"))
    if(!file.exists(destfile_fname)){
      download.file(url = paste0(
        "https://storage.googleapis.com/cancer-survival-km-2021/cancer-survival-km/full-downloads/",
        stringr::str_replace(feature_type," ","%20"),".xlsx"),
        destfile = destfile_fname,
        quiet = T)
    }

  }

  project_survival <- list()
  for(feature_type in c('cna','mutation',
                        'gene_expression',
                        'methylation')){

    fname <- file.path(raw_db_dir,
                       "km_survival_cshl",
                       paste0(feature_type,".xlsx"))
    data <- openxlsx::read.xlsx(fname)

    if(feature_type == 'protein_expression'){
      rownames(data) <- data$Protein
      data$Protein <- NULL
    }else{
      rownames(data) <- data$Gene
      data$Gene <- NULL
    }

    feattype_brief <- feature_type
    if(feattype_brief == "mutation"){
      feattype_brief <- "mut"
    }
    if(feattype_brief == "gene_expression"){
      feattype_brief <- "exp"
    }
    if(feattype_brief == "methylation"){
      feattype_brief <- "meth"
    }

    data <- as.matrix(data)
    project_survival_df <-
      as.data.frame(setNames(reshape2::melt(data, na.rm = T),
                             c('symbol', 'tcga_cohort', 'z_score'))) |>
      #dplyr::mutate(feature_type = feattype_brief) |>
      dplyr::mutate(z_score = as.numeric(stringr::str_trim(z_score))) |>
      dplyr::mutate(symbol = stringr::str_replace(
        symbol,"^'","")) |>
      dplyr::filter(!stringr::str_detect(tcga_cohort,"^Stouffer")) |>
      dplyr::left_join(dplyr::select(gene_info,
                                     symbol, gene_biotype),
                       by = "symbol",  relationship = "many-to-many") |>
      ## limit associations to protein-coding genes
      dplyr::filter(gene_biotype == "protein-coding") |>
      dplyr::select(-gene_biotype)

    pancancer_trend <- as.data.frame(
      project_survival_df |>
        dplyr::group_by(symbol) |>
        dplyr::summarise(tot_z_score = sum(z_score)) |>
        dplyr::arrange(desc(tot_z_score)))

    project_survival[[feattype_brief]] <- project_survival_df |>
      dplyr::mutate(
        symbol = factor(symbol, levels = pancancer_trend$symbol))
  }
  projectsurvivaldb <- project_survival

  return(projectsurvivaldb)

}

get_unique_transcript_xrefs <- function(
  raw_db_dir = NULL,
  gene_oncox = NULL,
  update = F){

  rds_fname <- file.path(
    raw_db_dir,
    "transcript_xref",
    "transcript_xref_db.rds"
  )

  if(update == F & file.exists(rds_fname)){
    transcript_xref_db <- readRDS(file = rds_fname)
    return(transcript_xref_db)
  }


  gene_oncox$alias$records <-
    gene_oncox$alias$records |>
    dplyr::filter(n_primary_map == 1 &
                    alias != symbol) |>
    dplyr::select(alias, entrezgene) |>
    dplyr::arrange(entrezgene) |>
    dplyr::mutate(property = "alias") |>
    dplyr::rename(value = alias) |>
    dplyr::mutate(entrezgene = as.integer(
      entrezgene
    ))

  gencode_merged <-
    gene_oncox$gencode$records[['grch37']] |>
    dplyr::bind_rows(gene_oncox$gencode$records[['grch38']]) |>
    dplyr::select(
      ensembl_gene_id,
      symbol,
      entrezgene,
      refseq_mrna,
      refseq_peptide,
      uniprot_acc,
      name,
      ensembl_transcript_id,
      ensembl_protein_id,
      gene_biotype
    ) |>
    dplyr::filter(
      !stringr::str_detect(entrezgene,"&")) |>
    dplyr::mutate(
      ensembl_protein_id = stringr::str_replace(
        ensembl_protein_id, "\\.[0-9]{1,}$",""
      ))

  transcript_xref_db <- data.frame()

  for(xref in c('ensembl_transcript_id',
                'uniprot_acc',
                'ensembl_gene_id',
                'ensembl_protein_id',
                'refseq_mrna',
                'symbol',
                'refseq_peptide')){

    tmp <- gencode_merged |>
      dplyr::select(entrezgene, !!rlang::sym(xref)) |>
      tidyr::separate_rows(!!rlang::sym(xref), sep ="&") |>
      dplyr::distinct() |>
      dplyr::filter(!is.na(!!rlang::sym(xref))) |>
      dplyr::group_by(!!rlang::sym(xref)) |>
      dplyr::summarise(n = dplyr::n(),
                       entrezgene = paste(unique(entrezgene),collapse=",")) |>
      dplyr::filter(n == 1) |>
      dplyr::mutate(property = xref,
                    value = !!rlang::sym(xref)) |>
      dplyr::select(entrezgene, property, value)

    transcript_xref_db <- dplyr::bind_rows(
      transcript_xref_db, tmp
    )

  }

  gene_oncox$alias$records$entrezgene <-
    as.character(gene_oncox$alias$records$entrezgene)

  transcript_xref_db <- dplyr::bind_rows(
    transcript_xref_db, gene_oncox$alias$records) |>
    dplyr::mutate(entrezgene = as.integer(entrezgene))

  saveRDS(transcript_xref_db, file = rds_fname)
  return(transcript_xref_db)

}

get_omnipath_gene_annotations <- function(
  raw_db_dir = NULL,
  gene_info = NULL,
  update = F){

  rds_fname <- file.path(
    raw_db_dir,
    "omnipathdb",
    "omnipathdb.rds")

  if(update == F & file.exists(rds_fname)){
    omnipathdb <- readRDS(file = rds_fname)
    return(omnipathdb)
  }

  protein_coding_genes <- gene_info |>
    dplyr::filter(gene_biotype == "protein-coding") |>
    dplyr::select(symbol) |>
    dplyr::filter(!stringr::str_detect(symbol,"-")) |>
    dplyr::filter(symbol != "XYLB") |>
    dplyr::distinct()

  i <- 1
  omnipathdb <- data.frame()
  while(i <= nrow(protein_coding_genes) - 200){
    genes <-
      protein_coding_genes$symbol[i:min(nrow(protein_coding_genes),i + 199)]
    annotations <-
      OmnipathR::import_omnipath_annotations(proteins = genes) |>
      dplyr::filter(
        !stringr::str_detect(
          source, "^(HPA_tissue|DisGeNet|KEGG|MSigDB|ComPPI|DGIdb|LOCATE|Vesiclepedia|Ramilowski2015)$")) |>
      dplyr::rename(uniprot_acc = uniprot) |>
      dplyr::mutate(record_id = as.character(record_id))
    omnipathdb <-
      dplyr::bind_rows(
        omnipathdb, annotations)
    cat(i, min(nrow(protein_coding_genes), i + 199), '\n')
    i <- i + 200
  }
  genes <-
    protein_coding_genes$symbol[i:nrow(protein_coding_genes)]
  annotations <-
    OmnipathR::import_omnipath_annotations(
      select_genes = genes) |>
    dplyr::filter(!stringr::str_detect(
      source, "^(HPA_tissue|DisGeNet|KEGG|MSigDB|ComPPI|DGIdb|LOCATE|Vesiclepedia|Ramilowski2015)$")) |>
    dplyr::rename(uniprot_acc = uniprot) |>
    dplyr::mutate(record_id = as.character(record_id))

  if(nrow(annotations) > 0){
    omnipathdb <-
      dplyr::bind_rows(omnipathdb, annotations)
  }

  saveRDS(omnipathdb, file = rds_fname)
  return(omnipathdb)
}

get_gene_go_terms <- function(
  raw_db_dir = NULL){


  goa_human_fname <-
    file.path(
      raw_db_dir,
      "gene_ontology",
      "goa_human.gaf.gz")

  go_terms <- data.frame()
  go_structure <- as.list(GO.db::GOTERM)
  for(n in names(go_structure)){
    term_df <-
      data.frame(
        'go_id' = as.character(n),
        'go_term' =
          as.character(AnnotationDbi::Term(go_structure[[n]])),
        stringsAsFactors = F)
    go_terms <- dplyr::bind_rows(go_terms, term_df)
  }

  go_annotations_qc <-
    read.table(gzfile(goa_human_fname),
               sep = "\t", comment.char = "!", header = F,
               stringsAsFactors = F, quote = "") |>
    dplyr::select(V3, V5, V7, V9, V14) |>
    dplyr::rename(symbol = V3, go_id = V5,
                  go_evidence_code = V7,
                  go_ontology = V9, go_annotation_date = V14) |>
    dplyr::distinct() |>
    dplyr::filter(go_evidence_code != "IEA" &
                    go_evidence_code != "ND") |>
    dplyr::filter(go_ontology != "C") |>
    dplyr::select(-go_annotation_date) |>
    dplyr::distinct() |>
    dplyr::left_join(
      go_terms, by = "go_id", relationship = "many-to-many"
    ) |>
    dplyr::mutate(
      go_term_link =
        paste0('<a href=\'http://amigo.geneontology.org/amigo/term/',
               .data$go_id,'\' target=\'_blank\'>',
               .data$go_term, '</a>'))

  go_function_terms_prgene <- go_annotations_qc |>
    dplyr::filter(go_ontology == "F") |>
    dplyr::group_by(symbol) |>
    dplyr::summarise(
      num_ids_function = length(unique(go_id)),
      go_term_link_mf = paste(
        go_term_link, collapse = ", "
      ),
      .groups = "drop"
    )

  go_process_terms_prgene <- go_annotations_qc |>
    dplyr::filter(go_ontology == "P") |>
    dplyr::group_by(symbol) |>
    dplyr::summarise(
      num_ids_process = length(unique(go_id)),
      go_term_link_proc = paste(
        go_term_link, collapse = ", "
      ),
      .groups = "drop"
    )

  go_terms <- as.data.frame(
    dplyr::full_join(go_function_terms_prgene,
                     go_process_terms_prgene,
                     by = "symbol",  relationship = "many-to-many") |>
      dplyr::filter(symbol != "") |>
      dplyr::mutate(
        num_ids_process = dplyr::if_else(
          is.na(num_ids_process),
          as.integer(0),
          as.integer(num_ids_process)
        )
      ) |>
      dplyr::mutate(
        num_ids_function = dplyr::if_else(
          is.na(num_ids_function),
          as.integer(0),
          as.integer(num_ids_function)
        )
      ) |>
      dplyr::mutate(num_go_terms = num_ids_function +
                      num_ids_process) |>
      dplyr::mutate(
        go_term_link = dplyr::if_else(
          num_go_terms <= 3 &
            num_go_terms > 0,
          paste0(go_term_link_proc,
                 ", ",
                 go_term_link_mf),
          as.character("")
        )
      ) |>
      dplyr::mutate(go_term_link = stringr::str_replace(
        go_term_link, "NA, |, NA",""
      )) |>
      dplyr::select(symbol, go_term_link, num_go_terms)
  )

  return(go_terms)
}
assign_unknown_function_rank <- function(
  gene_xref = NULL){

  gene_xref <- gene_xref |>
    dplyr::mutate(
      unknown_function_rank =
        dplyr::case_when(
          stringr::str_detect(tolower(name),
                              "uncharacterized|open reading frame") &
            is.na(gene_summary) &
            num_go_terms == 0 ~ as.integer(1),
          stringr::str_detect(tolower(name),
                              "uncharacterized|open reading frame") &
            is.na(gene_summary) &
            num_go_terms > 0 &
            num_go_terms <= 3 ~ as.integer(2),
          is.na(gene_summary) &
            num_go_terms == 0 &
            !stringr::str_detect(tolower(name),
                                 "uncharacterized|open reading frame") ~ as.integer(3),
          is.na(gene_summary) &
            num_go_terms == 1 &
            !stringr::str_detect(tolower(name),
                                 "uncharacterized|open reading frame") ~ as.integer(4),
          !is.na(gene_summary) &
            num_go_terms == 0 &
            !stringr::str_detect(tolower(name),
                                 "uncharacterized|open reading frame") ~ as.integer(5),
          is.na(gene_summary) &
            num_go_terms == 2 &
            !stringr::str_detect(tolower(name),
                                 "uncharacterized|open reading frame") ~ as.integer(6),
          TRUE ~ as.integer(7)
        )
    ) |>

    ## make exception for family of clustered histones (not properly mapped with GO)
    dplyr::mutate(unknown_function_rank = dplyr::if_else(
      stringr::str_detect(name, "clustered histone|H3.3 histone"),
      as.integer(8),
      as.integer(unknown_function_rank)
    ))


  return(gene_xref)

}


generate_gene_xref_df <- function(
  raw_db_dir = NULL,
  gene_info = NULL,
  transcript_xref_db = NULL,
  ts_oncogene_annotations = NULL,
  opentarget_associations = NULL,
  gene_oncox = NULL,
  cancerdrugdb = NULL,
  otdb = NULL,
  go_terms_pr_gene = NULL,
  update = T){

  rds_fname <- file.path(
    raw_db_dir,
    "gene_xref",
    "gene_xref.rds"
  )

  if(update == F & file.exists(rds_fname)){
    gene_xref <- readRDS(file = rds_fname)
  }


  invisible(assertthat::assert_that(!is.null(gene_info)))
  invisible(assertthat::assert_that(!is.null(transcript_xref_db)))
  invisible(assertthat::assert_that(!is.null(ts_oncogene_annotations)))
  invisible(assertthat::assert_that(!is.null(cancerdrugdb)))
  invisible(assertthat::assert_that(!is.null(otdb)))
  invisible(assertthat::assert_that(!is.null(go_terms_pr_gene)))
  invisible(assertthat::assert_that(!is.null(opentarget_associations)))

  invisible(assertthat::assert_that(is.data.frame(gene_info)))
  invisible(assertthat::assert_that(is.data.frame(transcript_xref_db)))
  invisible(assertthat::assert_that(is.data.frame(ts_oncogene_annotations)))
  invisible(assertthat::assert_that(typeof(cancerdrugdb) == "list"))
  invisible(assertthat::assert_that(typeof(otdb) == "list"))
  invisible(assertthat::assert_that(is.data.frame(go_terms_pr_gene)))
  invisible(assertthat::assert_that(is.data.frame(opentarget_associations)))

  invisible(assertthat::assert_that(
    "drug_per_target" %in% names(cancerdrugdb)))
  invisible(assertthat::assert_that(
    "early_phase" %in% names(cancerdrugdb[['drug_per_target']])))
  invisible(assertthat::assert_that(
    "late_phase" %in% names(cancerdrugdb[['drug_per_target']])))
  invisible(assertthat::assert_that(
    "max_site_rank" %in% names(otdb)))


  assertable::assert_colnames(
    otdb[['max_site_rank']],
    c('ensembl_gene_id','cancer_max_rank'), only_colnames = F,
    quiet = T)
  assertable::assert_colnames(
    gene_info, c('entrezgene', 'name', 'symbol',
                  'hgnc_id', 'gene_biotype'), only_colnames = F,
    quiet = T)
  assertable::assert_colnames(
    transcript_xref_db, c('entrezgene', 'property',
                 'value'), quiet = T)

  assertable::assert_colnames(
    go_terms_pr_gene,
    c('symbol','num_go_terms','go_term_link'),
    quiet = T)

  assertable::assert_colnames(
    ts_oncogene_annotations,
    c('entrezgene','tumor_suppressor','oncogene', 'cancer_driver'),
    quiet = T, only_colnames = F)

  assertable::assert_colnames(
    opentarget_associations,
    c('ensembl_gene_id','SM_tractability_category','SM_tractability_support',
      'AB_tractability_category','AB_tractability_support'),
    quiet = T, only_colnames = F)


  go2entrez <- transcript_xref_db |>
    dplyr::filter(property == "symbol") |>
    dplyr::rename(symbol = value) |>
    dplyr::left_join(go_terms_pr_gene, by = "symbol", relationship = "many-to-many") |>
    dplyr::filter(!is.na(num_go_terms)) |>
    dplyr::select(-c(property, symbol)) |>
    dplyr::distinct() |>
    dplyr::group_by(entrezgene) |>
    dplyr::summarise(
      num_go_terms = max(num_go_terms),
      go_term_link = paste(
        unique(go_term_link), collapse=", "),
      .groups = "drop"
  )

  upsummary2entrez <- gene_oncox$basic$records |>
    dplyr::select(entrezgene, dbnsfp_function_description) |>
    dplyr::mutate(gene_summary_uniprot = dplyr::if_else(
      !is.na(dbnsfp_function_description),
      paste0("<b>UniProt:</b> ",
             dbnsfp_function_description),
      as.character(NA))) |>
    dplyr::select(entrezgene, gene_summary_uniprot)

  ncbisummary2entrez <- gene_oncox$basic$records |>
    dplyr::select(entrezgene, ncbi_function_summary) |>
    dplyr::mutate(gene_summary_ncbi = dplyr::if_else(
      !is.na(ncbi_function_summary),
      paste0(
        "<b>NCBI/RefSeq/OMIM:</b> ",
        ncbi_function_summary
    ), as.character(NA))) |>
    dplyr::select(entrezgene, gene_summary_ncbi)

  ensembl2entrez <- transcript_xref_db |>
    dplyr::filter(property == "ensembl_gene_id") |>
    dplyr::rename(ensembl_gene_id = value) |>
    dplyr::select(entrezgene, ensembl_gene_id) |>
    dplyr::distinct()


  ## exclude ensembl gene id's which are mapped to
  ## an entrezgene, but where there is a different ensembl
  ## gene id that has a match in Open Targets
  opentargetEns2Entrez <- opentarget_associations |>
    dplyr::inner_join(ensembl2entrez, by = "ensembl_gene_id") |>
    dplyr::select(entrezgene, ensembl_gene_id) |>
    dplyr::mutate(opentargets = T)

  ensembl2ignore <- ensembl2entrez |>
    dplyr::left_join(opentargetEns2Entrez,
                     by = c("ensembl_gene_id",
                            "entrezgene")) |>
    dplyr::mutate(opentargets = dplyr::if_else(
      is.na(opentargets),
      as.logical(FALSE),
      as.logical(opentargets)
    )) |>
    dplyr::filter(opentargets == F) |>
    dplyr::inner_join(
      dplyr::select(opentargetEns2Entrez, entrezgene),
      by = "entrezgene",
      relationship = "many-to-many")

  gene_xref <- gene_info |>
    dplyr::select(symbol, hgnc_id, entrezgene,
                  name, gene_biotype) |>
    dplyr::left_join(
      ensembl2entrez, by = "entrezgene", relationship = "many-to-many") |>
    dplyr::left_join(
      go2entrez, by = "entrezgene", relationship = "many-to-many") |>
    dplyr::filter(!is.na(entrezgene) & !is.na(symbol)) |>
    dplyr::distinct() |>
    dplyr::mutate(
      genename =
        paste0("<a href ='http://www.ncbi.nlm.nih.gov/gene/",
               entrezgene,
               "' target='_blank'>",name,"</a>")) |>
    dplyr::left_join(opentarget_associations,
                     by = "ensembl_gene_id", relationship = "many-to-many") |>
    dplyr::anti_join(
      dplyr::select(
        ensembl2ignore, c("ensembl_gene_id", "entrezgene")),
      by = c("ensembl_gene_id", "entrezgene")
    ) |>
    dplyr::left_join(ts_oncogene_annotations,
                     by = "entrezgene", relationship = "many-to-many") |>
    dplyr::left_join(upsummary2entrez,
                     by = "entrezgene", relationship = "many-to-many") |>
    dplyr::left_join(ncbisummary2entrez,
                     by = "entrezgene", relationship = "many-to-many") |>
    dplyr::left_join(cancerdrugdb[['drug_per_target']][['early_phase']],
                     by = "entrezgene", relationship = "many-to-many") |>
    dplyr::left_join(cancerdrugdb[['drug_per_target']][['late_phase']],
                     by = "entrezgene", relationship = "many-to-many") |>
    dplyr::left_join(cancerdrugdb[['approved_per_target']],
                     by = "entrezgene", relationship = "many-to-many") |>
    dplyr::mutate(tumor_suppressor = dplyr::if_else(
      is.na(tumor_suppressor),
      FALSE,
      as.logical(tumor_suppressor))) |>
    dplyr::mutate(oncogene = dplyr::if_else(
      is.na(oncogene),
      FALSE,
      as.logical(oncogene))) |>

    dplyr::mutate(tsg_confidence_level = dplyr::if_else(
      tumor_suppressor == FALSE,
      "NONE/LIMITED",
      as.character(tsg_confidence_level))) |>
    dplyr::mutate(oncogene_confidence_level = dplyr::if_else(
      oncogene == FALSE,
      "NONE/LIMITED",
      as.character(oncogene_confidence_level))) |>
    dplyr::mutate(cancer_driver = dplyr::if_else(
      is.na(cancer_driver),
      FALSE,
      as.logical(cancer_driver))) |>
    dplyr::mutate(num_go_terms = dplyr::if_else(
      is.na(num_go_terms),
      as.integer(0),
      as.integer(num_go_terms)
    )) |>
    dplyr::mutate(gene_summary = dplyr::case_when(
      !is.na(gene_summary_ncbi) &
        !is.na(gene_summary_uniprot) ~
        paste0(gene_summary_ncbi," ",
               gene_summary_uniprot),
      !is.na(gene_summary_ncbi) &
        is.na(gene_summary_uniprot) ~
        gene_summary_ncbi,
      is.na(gene_summary_ncbi) &
        !is.na(gene_summary_uniprot) ~
        gene_summary_uniprot,
      TRUE ~ as.character(NA)
    )) |>
    dplyr::select(-c(gene_summary_ncbi, gene_summary_uniprot)) |>

    dplyr::mutate(has_gene_summary = dplyr::if_else(
      !is.na(gene_summary),TRUE,FALSE
    )) |>
    dplyr::left_join(otdb$max_site_rank,
                     by = "ensembl_gene_id", relationship = "many-to-many") |>
    dplyr::mutate(cancer_max_rank = dplyr::if_else(
      is.na(cancer_max_rank), 0, as.numeric(cancer_max_rank)
    )) |>
    assign_unknown_function_rank()
    #remove_duplicate_ensembl_genes()

  saveRDS(gene_xref, file = rds_fname)
  return(gene_xref)

}
#
get_tissue_celltype_specificity <- function(
  raw_db_dir = NULL){

    ## https://www.proteinatlas.org/about/assays+annotation#gtex_rna
    ##
    ## Consensus transcript expression levels summarized per gene in
    ## 55 tissues based on transcriptomics data from HPA and GTEx.
    ## The tab-separated file includes Ensembl
    ## gene identifier ("Gene"), analysed sample ("Tissue"),
    ## transcripts per million ("TPM"), protein-transcripts
    ## per million ("pTPM") and normalized expression ("nTPM").

    tissue_cell_expr <- list()
    tissue_cell_expr[['tissue']] <- list()
    tissue_cell_expr[['tissue']][['expr_df']] <- data.frame()
    tissue_cell_expr[['tissue']][['unit']] <- NULL
    tissue_cell_expr[['tissue']][['te_SE']] <- NULL
    tissue_cell_expr[['tissue']][['te_df']] <- data.frame()

    ## https://www.proteinatlas.org/about/assays+annotation#singlecell_rna
    ##
    ## Transcript expression levels summarized per gene in 79 cell types
    ## types from 30 studies. The tab-separated file includes Ensembl
    ## gene identifier ("Gene"), gene name ("Gene name"), analysed
    ## sample ("Cell type") and normalized expresion ("nTPM").

    tissue_cell_expr[['single_cell']] <- list()
    tissue_cell_expr[['single_cell']][['expr_df']] <- data.frame()
    tissue_cell_expr[['single_cell']][['unit']] <- NULL
    tissue_cell_expr[['single_cell']][['te_SE']] <- NULL
    tissue_cell_expr[['single_cell']][['te_df']] <- data.frame()

    for(t in c("rna_tissue_consensus",
               "rna_single_cell_type")){

      local_hpa_file <- file.path(
        raw_db_dir,
        "hpa",
        paste0(t, ".tsv.zip")
      )

      if(!file.exists(local_hpa_file)){
        download.file(
          url = paste0("https://www.proteinatlas.org/download/",
                       t, ".tsv.zip"),
          destfile = local_hpa_file
        )
      }

      ##set max number of tissues/cell_types for
      ##determination of groups in group-enriched genes
      max_types_in_group <- 5

      data <- readr::read_tsv(
        local_hpa_file,
        show_col_types = F,
        col_names = T)

      num_tissues_per_gene <- data |>
        dplyr::group_by(Gene) |>
        dplyr::summarise(n = dplyr::n()) |>
        dplyr::filter(n == 1)

      data <- data |>
        dplyr::anti_join(num_tissues_per_gene,
                         by = "Gene")
      data <- as.data.frame(
        data[, c(1,3,4)] |>
          purrr::set_names(
            c("ensembl_gene_id","category","exp1")) |>
          dplyr::mutate(
            category =
              stringr::str_replace_all(category," |, ","_")) |>
          dplyr::group_by(ensembl_gene_id, category) |>
          dplyr::summarise(exp = mean(exp1), .groups = "drop") |>
          tidyr::pivot_wider(names_from = category,
                             values_from = exp)
      )
      rownames(data) <- data$ensembl_gene_id
      data$ensembl_gene_id <- NULL
      se <- SummarizedExperiment::SummarizedExperiment(
        assays = S4Vectors::SimpleList(
          as.matrix(data)),
        rowData = row.names(data),
        colData = colnames(data)
      )
      te_gene_retrieval_se <-
        TissueEnrich::teGeneRetrieval(
          expressionData = se,
          foldChangeThreshold = 4,
          maxNumberOfTissues = max_types_in_group)

      if(t == "rna_tissue_consensus"){
        tissue_cell_expr[['tissue']][['expr_df']] <- data
        tissue_cell_expr[['tissue']][['te_SE']] <-
          te_gene_retrieval_se
        tissue_cell_expr[['tissue']][['unit']] <- 'nTPM'
        tissue_cell_expr[['tissue']][['te_df']] <-
          as.data.frame(
            as.data.frame(
              SummarizedExperiment::assay(
                tissue_cell_expr[['tissue']][['te_SE']])) |>
              dplyr::rename(ensembl_gene_id = Gene,
                            category = Group) |>
              dplyr::group_by(ensembl_gene_id,
                              category) |>
              dplyr::summarise(
                tissue = paste(
                  sort(unique(Tissue)), collapse=", "),
                .groups = "drop")
          ) |>
          dplyr::mutate(
            tissue = stringr::str_to_title(tissue)
          ) |>
          dplyr::mutate(
            category = dplyr::case_when(
              category == "Expressed-In-All" ~ "Low tissue specificity",
              category == "Group-Enriched" ~ "Group enriched",
              category == "Tissue-Enhanced" ~ "Tissue enhanced",
              category == "Tissue-Enriched" ~ "Tissue enriched",
              category == "Not-Expressed" ~ "Not detected",
              TRUE ~ as.character("Mixed"))
          ) |>
          dplyr::mutate(
            category = factor(
              category,
              levels = c("Tissue enriched",
                         "Group enriched",
                         "Tissue enhanced",
                         "Mixed",
                         "Low tissue specificity",
                         "Not detected"))
          )


      }else{
        tissue_cell_expr[['single_cell']][['expr_df']] <- data
        tissue_cell_expr[['single_cell']][['unit']] <- 'nTPM'
        tissue_cell_expr[['single_cell']][['te_SE']] <-
          te_gene_retrieval_se
        tissue_cell_expr[['single_cell']][['te_df']] <-
          as.data.frame(
            as.data.frame(
              SummarizedExperiment::assay(
                tissue_cell_expr[['single_cell']][['te_SE']])
            ) |>
              dplyr::rename(ensembl_gene_id = Gene,
                            category = Group) |>
              dplyr::group_by(ensembl_gene_id,
                              category) |>
              dplyr::summarise(
                cell_type = paste(sort(unique(Tissue)), collapse=", "),
                .groups = "drop")
          ) |>
          dplyr::mutate(
            cell_type = stringr::str_to_title(cell_type)
          ) |>
          dplyr::mutate(
            category = dplyr::case_when(
              category == "Expressed-In-All" ~ "Low cell type specificity",
              category == "Group-Enriched" ~ "Group enriched",
              category == "Tissue-Enhanced" ~ "Cell type enhanced",
              category == "Tissue-Enriched" ~ "Cell type enriched",
              category == "Not-Expressed" ~ "Not detected",
              TRUE ~ as.character("Mixed"))
          ) |>
          dplyr::mutate(
            category =
              factor(category,
                     levels = c("Cell type enriched",
                                "Group enriched",
                                "Cell type enhanced",
                                "Mixed",
                                "Low cell type specificity",
                                "Not detected"))
          )
      }

    }
    return(tissue_cell_expr)

  }

quantify_gene_cancer_relevance <- function(
    cache_dir = NA,
    ot_associations = NULL){


  phenotype_auxiliary_maps <- phenOncoX::get_aux_maps(
    cache_dir = cache_dir
  )

  phenotype_cancer_map <- phenOncoX::get_terms(
    cache_dir = cache_dir
  )$records

  umls_concept_map <-
    phenotype_auxiliary_maps$records$umls$concept

  efo_name_map <-
    phenotype_auxiliary_maps$records$efo$efo2name

  phenotype_cancer_efo <-
    umls_concept_map |>
    dplyr::filter(main_term == T) |>
    dplyr::select(cui, cui_name) |>
    dplyr::distinct() |>
    dplyr::inner_join(
      dplyr::select(phenotype_cancer_map,
                    efo_id, cui,
                    cui_name, primary_site),
      by = c("cui", "cui_name"), multiple = "all",
      relationship = "many-to-many") |>
    dplyr::rename(disease_efo_id = efo_id) |>
    dplyr::filter(!is.na(disease_efo_id)) |>
    dplyr::mutate(cancer_phenotype = TRUE) |>
    dplyr::distinct()

  phenotypes_non_primary <- phenotype_cancer_efo |>
    dplyr::filter(is.na(primary_site))

  phenotypes_primary <- phenotype_cancer_efo |>
    dplyr::filter(!is.na(primary_site))

  phenotypes_non_primary <- phenotypes_non_primary |>
    dplyr::anti_join(
      phenotypes_primary, by = c("disease_efo_id","cui")) |>
    dplyr::select(-primary_site)

  ### Hereditary breast/ovarian cancer syndrome
  df <- data.frame(primary_site = 'Breast', cui = 'C0677776', stringsAsFactors = F)
  df <- dplyr::bind_rows(df, data.frame(primary_site = 'Ovary/Fallopian Tube', cui = 'C0677776', stringsAsFactors = F))

  phenotypes_hered_brca_ov <- phenotypes_non_primary |>
    dplyr::left_join(df, by = "cui", multiple = "all",
                     relationship = "many-to-many") |>
    dplyr::filter(!is.na(primary_site))
  ###

  phenotypes_other <- phenotypes_non_primary |>
    dplyr::anti_join(phenotypes_hered_brca_ov, by = "cui") |>
    dplyr::mutate(primary_site = dplyr::case_when(
      stringr::str_detect(tolower(cui_name), "breast") ~ "Breast",
      stringr::str_detect(tolower(cui_name), "meningioma") ~ "CNS/Brain",
      stringr::str_detect(tolower(cui_name), "cancer-predisposing") ~ "Other/Unknown",
      stringr::str_detect(tolower(cui_name), "ovarian|fallopian tube") ~ "Ovary/Fallopian Tube",
      stringr::str_detect(tolower(cui_name), "wilms|papillary renal cell|renal cell carcinoma|renal cell cancer") ~ "Kidney",
      stringr::str_detect(tolower(cui_name), "pancreatic") ~ "Pancreas",
      stringr::str_detect(tolower(cui_name), "lynch|polyposis") ~ "Colon/Rectum",
      stringr::str_detect(tolower(cui_name), "hereditary gastric") ~ "Esophagus/Stomach",
      stringr::str_detect(tolower(cui_name), "prostate") ~ "Prostate",
      stringr::str_detect(tolower(cui_name), "xeroderma|melanoma") ~ "Skin",
      stringr::str_detect(tolower(cui_name), "retinoblastoma") ~ "Eye",
      stringr::str_detect(tolower(cui_name), "medullary thyroid|follicular thyroid|papillary thyroid") ~ "Thyroid",
      TRUE ~ as.character(NA)
    ))


  all_cancer_phenotypes_efo <-
    phenotypes_primary |>
    dplyr::bind_rows(phenotypes_hered_brca_ov) |>
    dplyr::bind_rows(phenotypes_other)

  otdb_tmp <- as.data.frame(
    dplyr::select(ot_associations,
                  ot_association,
                  #symbol,
                  ensembl_gene_id) |>
      dplyr::filter(!is.na(ot_association)) |>
      tidyr::separate_rows(ot_association, sep="&") |>
      tidyr::separate(
        ot_association,
        sep=":",
        c('disease_efo_id',
          'direct_ot_association',
          'ot_datatype_support',
          'ot_association_score')) |>
      dplyr::mutate(ot_association_score =
                      as.numeric(ot_association_score)) |>
      dplyr::filter(!is.na(disease_efo_id)) |>

      dplyr::mutate(
        disease_efo_id = stringr::str_replace(disease_efo_id,"_",":")) |>
      dplyr::left_join(
        dplyr::select(efo_name_map,
                      efo_id, efo_name),
        by = c("disease_efo_id" = "efo_id"),
        multiple = "all",
        relationship = "many-to-many") |>
      ## exclude non-specific cancer associations
      dplyr::filter(
        !stringr::str_detect(
          tolower(efo_name),
          "^(((urogenital|mixed|papillary|hereditary|familial|susceptibility|syndrome|small cell|clear cell|squamous cell|digestive system|endocrine|metastatic|invasive|pharynx|vascular|intestinal|cystic|mucinous|epithelial|benign) )?(neoplasm|carcinoma|adenocarcinoma|cancer))$")
      ) |>
      dplyr::filter(
        !stringr::str_detect(
          tolower(efo_name)," neoplasm$"
        )
      ) |>
      dplyr::left_join(
        dplyr::select(all_cancer_phenotypes_efo,
                      disease_efo_id,
                      primary_site,
                      cancer_phenotype),
        by=c("disease_efo_id"),
        multiple = "all",
        relationship = "many-to-many") |>
      dplyr::mutate(
        cancer_phenotype =
          dplyr::if_else(
            is.na(cancer_phenotype) &
              stringr::str_detect(
                tolower(efo_name),
                "carcinoma|tumor|cancer|neoplasm|melanoma|myeloma|hemangioma|astrocytoma|leiomyoma|leukemia|lymphoma|barrett|glioma|sarcoma|blastoma|teratoma|seminoma"),
            TRUE, as.logical(cancer_phenotype))) |>
      dplyr::mutate(
        cancer_phenotype =
          dplyr::if_else(
            is.na(cancer_phenotype) &
              stringr::str_detect(
                tolower(efo_name),
                "hereditary|familial|susceptibility|syndrome") &
              stringr::str_detect(
                tolower(efo_name),
                "tumor|cancer|lynch|carcinoma|sarcoma|melanoma|blastoma"),
            TRUE, as.logical(cancer_phenotype))) |>
      dplyr::mutate(
        ot_link = paste0(
          "<a href='https://platform.opentargets.org/evidence/",
          ensembl_gene_id,"/",
          stringr::str_replace(disease_efo_id,":","_"),
          "' target=\"_blank\">", stringr::str_to_title(efo_name),"</a>")) |>
      dplyr::distinct() |>
      dplyr::distinct()
  )

  set1 <- otdb_tmp |>
    dplyr::filter(!is.na(primary_site)) |>
    dplyr::distinct()

  set2 <- otdb_tmp |>
    dplyr::filter(is.na(primary_site) & cancer_phenotype == T) |>
    dplyr::select(-c(efo_name, primary_site)) |>
    dplyr::anti_join(
      dplyr::select(set1, disease_efo_id, ensembl_gene_id),
      by = c("disease_efo_id","ensembl_gene_id")) |>
    dplyr::inner_join(
      dplyr::select(efo_name_map,
                    efo_id, efo_name, primary_site),
      by = c("disease_efo_id" = "efo_id"),
      multiple = "all",
      relationship = "many-to-many") |>
    dplyr::distinct() |>
    dplyr::filter(!is.na(primary_site))

  set12 <- dplyr::bind_rows(set1, set2)

  set3 <- otdb_tmp |>
    dplyr::anti_join(
      dplyr::select(set12, disease_efo_id, ensembl_gene_id),
      by = c("disease_efo_id","ensembl_gene_id")) |>
    dplyr::distinct()

  otdb_all <- set12 |>
    dplyr::bind_rows(set3) |>
    dplyr::arrange(primary_site, ensembl_gene_id,
                   desc(ot_association_score))

  lgr::lgr$info(
    glue::glue(
      "Found {NROW(otdb_all[otdb_all$cancer_phenotype == T,])} ",
      "gene-cancer associations"))

  otdb_tissue_scores <- as.data.frame(
    otdb_all |>
      dplyr::filter(!is.na(primary_site)) |>
      dplyr::group_by(primary_site, ensembl_gene_id) |>
      dplyr::summarise(
        tissue_assoc_score =
          round(sum(ot_association_score), digits = 6),
        .groups = "drop"
      ) |>
      dplyr::arrange(
        primary_site, desc(tissue_assoc_score))
  )

  otdb_site_rank <- data.frame()
  for(s in unique(otdb_tissue_scores$primary_site)){
    tmp <- otdb_tissue_scores |>
      dplyr::filter(primary_site == s) |>
      dplyr::mutate(
        tissue_assoc_rank =
          round(dplyr::percent_rank(tissue_assoc_score),
                digits = 6)
      )
    otdb_site_rank <- otdb_site_rank |>
      dplyr::bind_rows(tmp)
  }

  otdb_global_rank <- as.data.frame(
    otdb_site_rank |>
      dplyr::group_by(.data$ensembl_gene_id) |>
      dplyr::summarise(
        global_assoc_score = sum(
          .data$tissue_assoc_rank),
        .groups = "drop") |>
      dplyr::ungroup() |>
      dplyr::arrange(
        dplyr::desc(.data$global_assoc_score)) |>
      dplyr::mutate(
        global_assoc_rank = round(
          dplyr::percent_rank(.data$global_assoc_score),
          digits = 5))
  )

  otdb_max_site_rank <- as.data.frame(
    otdb_site_rank |>
      dplyr::arrange(desc(tissue_assoc_rank)) |>
      dplyr::select(ensembl_gene_id, tissue_assoc_rank) |>
      dplyr::group_by(ensembl_gene_id) |>
      dplyr::summarise(
        cancer_max_rank = max(tissue_assoc_rank),
        .groups = "drop")
  )

  otdb <- list()
  otdb$all <- otdb_all
  otdb$gene_rank <- otdb_site_rank |>
    dplyr::left_join(
      otdb_global_rank,
      by = "ensembl_gene_id", multiple = "all",
      relationship = "many-to-many")

  #otdb$site_rank <- otdb_site_rank
  otdb$max_site_rank <- otdb_max_site_rank

  return(otdb)

}

get_ligand_receptors <- function(
  raw_db_dir = NULL,
  keggdb = NULL,
  update = F){

  ligand_receptordb <- list()
  ligand_receptordb[['cellchatdb']] <- NULL
  ligand_receptordb[['celltalkdb']] <- NULL

  for(db in c('cellchatdb','celltalkdb')){

    if(db == 'cellchatdb'){
      rds_fname <- file.path(
        raw_db_dir,
        "cellchatdb",
        "cellchatdb_interactions.rds")

      if(update == F & file.exists(rds_fname)){
        ligand_receptordb[[db]] <- readRDS(file=rds_fname)
        next
      }

      ligand_receptor_db <- CellChat::CellChatDB.human$interaction |>
        magrittr::set_rownames(NULL) |>
        dplyr::rename(interaction_members = interaction_name) |>
        dplyr::rename(interaction_name = interaction_name_2) |>
        dplyr::mutate(evidence = stringr::str_replace_all(
          evidence,"PMC: 4393358", "PMID: 25772309"
        )) |>
        dplyr::mutate(evidence = stringr::str_replace_all(
          evidence,"PMC4571854", "PMID: 26124272"
        )) |>
        dplyr::mutate(evidence = stringr::str_replace_all(
          evidence,"PMC2194005", "PMID: 12208882"
        )) |>
        dplyr::mutate(evidence = stringr::str_replace_all(
          evidence,"PMC2194217", "PMID: 14530377"
        )) |>
        dplyr::mutate(evidence = stringr::str_replace_all(
          evidence,"PMC1237098", "PMID: 16093349"
        )) |>
        dplyr::mutate(evidence = stringr::str_replace_all(
          evidence,"PMC2431087", "PMID: 18250165"
        )) |>
        dplyr::mutate(evidence = stringr::str_squish(
          stringr::str_replace_all(
            evidence,
            ",","; "))
        ) |>
        dplyr::mutate(evidence = stringr::str_replace_all(
          evidence,
          " ","")) |>
        dplyr::mutate(interaction_id = dplyr::row_number()) |>
        dplyr::select(interaction_id, interaction_name,
                      annotation, pathway_name, dplyr::everything())


      ligand_receptor_kegg_support <- ligand_receptor_db |>
        dplyr::select(interaction_id, evidence) |>
        tidyr::separate_rows(evidence, sep=";") |>
        dplyr::filter(stringr::str_detect(evidence,"KEGG")) |>
        dplyr::mutate(evidence = stringr::str_replace(
          evidence, "KEGG:",""
        )) |>
        dplyr::left_join(keggdb$TERM2NAME,
                         by = c("evidence" = "standard_name"), relationship = "many-to-many") |>
        dplyr::mutate(ligand_receptor_kegg_support = paste0(
          "<a href='https://www.genome.jp/pathway/",
          stringr::str_replace(evidence,"hsa","map"),
          "' target='_blank'>",name,"</a>")) |>
        dplyr::select(-evidence)

      ligand_receptor_literature <- as.data.frame(
        ligand_receptor_db |>
          dplyr::select(interaction_id, evidence) |>
          tidyr::separate_rows(evidence, sep=";") |>
          dplyr::filter(stringr::str_detect(evidence,"PMID")) |>
          dplyr::mutate(evidence = stringr::str_replace(
            evidence, "PMID:",""
          )) |>
          dplyr::rename(pmid = evidence)
      )

      ligand_receptor_citations <- readRDS(
        file.path(
          raw_db_dir,
          "cellchatdb",
          "cellchatdb_citations.rds"))

      #ligand_receptor_citations <- get_citations_pubmed(
      #  pmid = unique(ligand_receptor_literature$pmid),
      #  chunk_size = 10)

      ligand_receptor_literature <- as.data.frame(
        ligand_receptor_literature |>
          dplyr::mutate(pmid = as.integer(pmid)) |>
          dplyr::left_join(ligand_receptor_citations,
                           by = c("pmid" = "pmid"), relationship = "many-to-many") |>
          dplyr::filter(!is.na(link)) |>
          dplyr::group_by(interaction_id) |>
          dplyr::summarise(
            ligand_receptor_literature_support = paste(
              link, collapse= ","),
            ligand_receptor_literature = paste(
              citation, collapse="|"
            )
          )
      )

      ligand_receptor_db_final <- ligand_receptor_db |>
        dplyr::left_join(dplyr::select(
          ligand_receptor_kegg_support, interaction_id,
          ligand_receptor_kegg_support), by = "interaction_id", relationship = "many-to-many") |>
        dplyr::left_join(dplyr::select(
          ligand_receptor_literature, interaction_id,
          ligand_receptor_literature_support), relationship = "many-to-many") |>
        dplyr::mutate(literature_support = dplyr::if_else(
          is.na(ligand_receptor_kegg_support),
          as.character(ligand_receptor_literature_support),
          paste(ligand_receptor_kegg_support,
                ligand_receptor_literature_support,
                sep = ", ")
        )) |>
        dplyr::mutate(literature_support = stringr::str_replace_all(
          literature_support,",?NA$",""
        )) |>
        dplyr::select(-c(ligand_receptor_literature_support,
                         ligand_receptor_kegg_support,
                         evidence))

      interaction2ligands <- as.data.frame(
        ligand_receptor_db_final |>
          dplyr::select(interaction_id, ligand) |>
          dplyr::rename(symbol = ligand) |>
          dplyr::mutate(class = "ligand")
      )

      interaction2receptor <- as.data.frame(
        ligand_receptor_db_final |>
          dplyr::select(interaction_id, receptor) |>
          dplyr::rename(symbol = receptor) |>
          tidyr::separate_rows(symbol, sep = "_") |>
          dplyr::mutate(symbol = dplyr::if_else(
            stringr::str_detect(symbol,"^(R2|TGFbR2)$"),
            "TGFBR2",
            as.character(symbol)
          )) |>
          dplyr::mutate(class = "receptor") |>
          dplyr::mutate(symbol = stringr::str_replace(
            symbol,"b","B"
          ))
      )

      cellchatdb <- list()
      cellchatdb[['db']] <- ligand_receptor_db_final
      cellchatdb[['xref']] <- interaction2ligands |>
        dplyr::bind_rows(interaction2receptor)

      saveRDS(cellchatdb, file = rds_fname)

      ligand_receptordb[[db]] <- cellchatdb
    }

    if(db == "celltalkdb"){

      rds_fname <- file.path(
        raw_db_dir,
        "celltalkdb",
        "celltalkdb_interactions.rds")

      if(update == F & file.exists(rds_fname)){
        celltalkdb <- readRDS(file=rds_fname)
        next
      }

      celltalkdb_raw <- readr::read_tsv(
        file = file.path(
          raw_db_dir,
          "celltalkdb",
          "human_lr_pair.txt"),
        col_types = "cccddccccc",
        show_col_types = F) |>
        dplyr::select(ligand_gene_id,
                      receptor_gene_id,
                      evidence) |>
        dplyr::mutate(interaction_id = dplyr::row_number()) |>
        dplyr::rename(entrezgene_ligand = ligand_gene_id,
                      entrezgene_receptor = receptor_gene_id,
                      pmid = evidence) |>
        dplyr::mutate(pmid = stringr::str_replace(
          pmid, ",321961154", ""))


      celltalkdb_literature <- as.data.frame(
        celltalkdb_raw |>
          dplyr::select(interaction_id, pmid) |>
          tidyr::separate_rows(pmid, sep=",") |>
        dplyr::filter(nchar(pmid) > 2)
      )

      ligand_receptor_citations <- readRDS(
        file.path(
          raw_db_dir,
          "celltalkdb",
          "celltalkdb_citations.rds"))
      # ligand_receptor_citations <- get_citations_pubmed(
      #   pmid = unique(celltalkdb_literature$pmid),
      #   chunk_size = 100)

      celltalkdb_literature <- as.data.frame(
        celltalkdb_literature |>
          dplyr::mutate(pmid = as.integer(pmid)) |>
          dplyr::left_join(ligand_receptor_citations,
                           by = c("pmid" = "pmid"), relationship = "many-to-many") |>
          dplyr::filter(!is.na(link)) |>
          dplyr::group_by(interaction_id) |>
          dplyr::summarise(
            ligand_receptor_literature_support = paste(
              link, collapse= ","),
            ligand_receptor_literature = paste(
              citation, collapse="|"
            )
          )
      )

      celltalkdb_final <- as.data.frame(
        celltalkdb_raw |>
          dplyr::select(entrezgene_ligand,
                        entrezgene_receptor,
                        interaction_id) |>
          dplyr::left_join(
            dplyr::select(
              celltalkdb_literature,
              interaction_id,
              ligand_receptor_literature_support,
              ligand_receptor_literature),
            by = "interaction_id", relationship = "many-to-many") |>
          dplyr::rename(
            literature_support = ligand_receptor_literature_support) |>
          dplyr::select(
            interaction_id,
            entrezgene_ligand,
            entrezgene_receptor,
            dplyr::everything()
          )
      )

      ligand_receptordb[[db]] <- celltalkdb_final

      saveRDS(celltalkdb_final, file = rds_fname)

    }
  }

  return(ligand_receptordb)
}

get_hpa_associations <- function(
  raw_db_dir = NULL,
  gene_xref = NULL,
  update = F){

  assertable::assert_colnames(
    gene_xref,
    c("symbol", "ensembl_gene_id"),
    only_colnames = F,
    quiet = T
  )

  rds_fname <- file.path(
    raw_db_dir,
    "hpa",
    "hpa.rds")

  if(update == F & file.exists(rds_fname)){
    hpa <- readRDS(file = rds_fname)
    return(hpa)
  }

  hpa <- readr::read_tsv(
    file = file.path(
      raw_db_dir,
      "hpa",
      "proteinatlas.tsv.gz"
    ),
    show_col_types = F) |>
    janitor::clean_names() |>
    dplyr::select(
      dplyr::matches(
        paste0(
          "ensembl|pathology|antibody|rna_cancer",
          "|rna_tissue|rna_single_cell"))) |>
    dplyr::select(
      !dplyr::matches(
        "_score"
      )
    ) |>
    dplyr::rename(
      ensembl_gene_id = ensembl,
    ) |>
    tidyr::pivot_longer(
      !ensembl_gene_id,
      names_to = "property",
      values_to = "value") |>
    dplyr::mutate(
      property = dplyr::case_when(
        property == "rna_cancer_specific_fpkm" ~
          "rna_cancer_specific_FPKM",
        property == "rna_single_cell_type_specific_n_tpm" ~
          "rna_single_cell_type_specific_nTPM",
        property == "rna_tissue_specific_n_tpm" ~
          "rna_tissue_specific_nTPM",
        TRUE ~ as.character(property)
      )
    ) |>
    dplyr::filter(
      !is.na(value) & nchar(value) > 0
    ) |>
    dplyr::mutate(
      value = dplyr::if_else(
        property == "antibody_rrid",
        stringr::str_replace_all(
          stringr::str_trim(
            stringr::str_replace_all(
              stringr::str_replace_all(
              value,
              "((HPA|CAB)[0-9]{6}:( )?)",
              ""),
              "(, ){1,}","|")
            ),
          "^\\||\\|$",""),
        as.character(value)
      )) |>
    dplyr::filter(property != "antibody") |>
    dplyr::filter(
      !stringr::str_detect(
        value, "unprognostic"
      )
    ) |>
    dplyr::mutate(value = stringr::str_replace_all(
     value, "prognostic |\\)",""
    )) |>
    dplyr::mutate(value = stringr::str_replace(
      value, "orable \\(","orable|"
    ))

  saveRDS(hpa, file = rds_fname)
  return(hpa)

}
#
#   variables_json <- c('RNA tissue specificity',
#                       'RNA tissue distribution',
#                       'RNA tissue specific nTPM',
#                       'RNA single cell type specificity',
#                       'RNA single cell type distribution',
#                       'RNA single cell type specific nTPM',
#                       'RNA cancer specificity',
#                       'RNA cancer distribution',
#                       'RNA cancer specific FPKM',
#                       'RNA cell line specificity',
#                       'RNA cell line distribution',
#                       'RNA cell line specific nTPM',
#                       'Pathology prognostics - Breast cancer',
#                       'Pathology prognostics - Cervical cancer',
#                       'Pathology prognostics - Colorectal cancer',
#                       'Pathology prognostics - Endometrial cancer',
#                       'Pathology prognostics - Glioma',
#                       'Pathology prognostics - Head and neck cancer',
#                       'Pathology prognostics - Liver cancer',
#                       'Pathology prognostics - Lung cancer',
#                       'Pathology prognostics - Melanoma',
#                       'Pathology prognostics - Ovarian cancer',
#                       'Pathology prognostics - Pancreatic cancer',
#                       'Pathology prognostics - Prostate cancer',
#                       'Pathology prognostics - Renal cancer',
#                       'Pathology prognostics - Stomach cancer',
#                       'Pathology prognostics - Testis cancer',
#                       'Pathology prognostics - Thyroid cancer',
#                       'Pathology prognostics - Urothelial cancer',
#                       'Antibody RRID')
#
#   variables_df <- c('rna_tissue_specificity',
#                     'rna_tissue_distribution',
#                     'rna_tissue_specific_nTPM',
#                     'rna_single_cell_type_specificity',
#                     'rna_single_cell_type_distribution',
#                     'rna_single_cell_type_specificity_nTPM',
#                     'rna_cancer_specificity',
#                     'rna_cancer_distribution',
#                     'rna_cancer_specific_FPKM',
#                     'rna_cell_line_specificity',
#                     'rna_cell_line_distribution',
#                     'rna_cell_line_specific_nTPM',
#                     'pathology_prognostics_breast_cancer',
#                     'pathology_prognostics_cervical_cancer',
#                     'pathology_prognostics_colorectal_cancer',
#                     'pathology_prognostics_endometrial_cancer',
#                     'pathology_prognostics_glioma',
#                     'pathology_prognostics_head_and_neck_cancer',
#                     'pathology_prognostics_liver_cancer',
#                     'pathology_prognostics_lung_cancer',
#                     'pathology_prognostics_melanoma',
#                     'pathology_prognostics_ovarian_cancer',
#                     'pathology_prognostics_pancreatic_cancer',
#                     'pathology_prognostics_prostate_cancer',
#                     'pathology_prognostics_renal_cancer',
#                     'pathology_prognostics_stomach_cancer',
#                     'pathology_prognostics_testis_cancer',
#                     'pathology_prognostics_thyroid_cancer',
#                     'pathology_prognostics_urothelial_cancer',
#                     'antibody_rrid')
#
#   hpa <- data.frame()
#   k <- 1
#
#   for (i in 1:nrow(gene_xref_hpa)) {
#     g <- gene_xref_hpa[i,]
#
#     # if(g$gene_biotype != "protein-coding"){
#     #   next
#     # }
#     # if(is.na(g$ensembl_gene_id)){
#     #   next
#     # }
#
#     if(!startsWith(g$ensembl_gene_id, "ENSG") |
#        stringr::str_detect(g$ensembl_gene_id, "PAR")){
#       next
#     }
#
#     local_json <-
#       file.path(
#         raw_db_dir,
#         "hpa",
#         "json2",
#         paste0(g$ensembl_gene_id,".json"))
#
#     if(!file.exists(local_json)){
#       protein_atlas_url <-
#         paste0("https://www.proteinatlas.org/search/",
#                g$ensembl_gene_id,"?format=json")
#
#       if(RCurl::url.exists(protein_atlas_url) == TRUE){
#         cat(k, ' - ', protein_atlas_url, '\n')
#         k <- k + 1
#
#         download.file(
#           protein_atlas_url,
#           destfile = local_json,
#           quiet = T)
#         Sys.sleep(1)
#       }
#     }
#   }
#
#
#     if(file.exists(local_json)){
#       pa_data <- jsonlite::fromJSON(txt = local_json)
#
#       for(j in 1:length(variables_json)){
#         var_name_json <- variables_json[j]
#         var_name_df <- variables_df[j]
#         if(!is.null(pa_data[[var_name_json]])){
#           if(!startsWith(var_name_df, "pathology") &
#              !stringr::str_detect(var_name_df,"(TPM|FPKM|rrid)$")){
#             df <- data.frame(
#               'ensembl_gene_id' = g$ensembl_gene_id,
#               'property' = var_name_df,
#               'value' = pa_data[[var_name_json]],
#               stringsAsFactors = F)
#
#           }else{
#             if(startsWith(var_name_df, "pathology")
#                | endsWith(var_name_df,"rrid")){
#               value <- paste(unlist(pa_data[[var_name_json]]),
#                              collapse="|")
#               if(stringr::str_detect(value,"TRUE") |
#                  var_name_df == "antibody_rrid"){
#                 df <- data.frame(
#                   'ensembl_gene_id' = g$ensembl_gene_id,
#                   'property' = var_name_df,
#                   'value' = value,
#                   stringsAsFactors = F)
#               }
#             }else{
#               df <- data.frame(
#                 'ensembl_gene_id' = g$ensembl_gene_id,
#                 'property' = var_name_df,
#                 'value' = paste(sort(names(pa_data[[var_name_json]])),
#                                 collapse=", "),
#                 stringsAsFactors = F)
#             }
#           }
#           if(NROW(df) > 0){
#             hpa <- dplyr::bind_rows(hpa, df)
#             df <- data.frame()
#           }
#         }
#       }
#     }
#
#   }
#
#   hpa <- hpa |>
#     dplyr::mutate(
#       value =  dplyr::if_else(
#         property == "antibody_rrid",
#         stringr::str_replace_all(value, "^(NA\\|){1,}|(\\|NA){1,}$",""),
#         as.character(value)
#       )
#     ) |>
#     dplyr::mutate(
#       value =  dplyr::if_else(
#         property == "antibody_rrid",
#         stringr::str_replace_all(value, "(\\|NA\\|)","|"),
#         as.character(value)
#       )
#     ) |>
#     dplyr::mutate(
#       value =  dplyr::if_else(
#         property == "antibody_rrid" & value == "NA",
#         as.character(''),
#         as.character(value)
#       )
#     )
#
#
#   saveRDS(hpa, file = rds_fname)
#   return(hpa)
#
#
# }


get_mean_median_tpm <- function(
  tpm_matrix,
  detectable_tpm_level = 1,
  tumor_code = "BRCA_1",
  filter_on = "median"){

  tpm_pr_gene <- as.data.frame(
    data.frame(
      symbol = rownames(tpm_matrix),
      median_tpm_exp = round(matrixStats::rowMedians(
        tpm_matrix), digits = 4
      ),
      mean_tpm_exp = round(matrixStats::rowMeans2(
        tpm_matrix), digits = 4
      ),
      tumor = tumor_code,
      stringsAsFactors = F
    ) |>
      dplyr::group_by(symbol) |>
      dplyr::summarise(
        median_tpm_exp = round(mean(median_tpm_exp), digits = 4),
        mean_tpm_exp = round(mean(mean_tpm_exp), digits = 4),
        tumor = paste(unique(tumor), collapse=","),
        .groups = "drop") |>
      dplyr::filter(median_tpm_exp >= detectable_tpm_level)) |>
    dplyr::mutate(median_tpm_subtype = paste(
      tumor, median_tpm_exp, sep=":"
    )) |>
    dplyr::select(symbol, median_tpm_subtype)


  return(tpm_pr_gene)


}


get_tcga_db <- function(
  raw_db_dir = NULL,
  gene_xref = NULL,
  tcga_release = "release37_20230329",
  update = F){


  rds_fname <- file.path(
    raw_db_dir,
    "tcga",
    "tcgadb.rds")


  ## NOTE: The processed data from Genomic Data Commons
  ## linked to here is retrieved with
  ## https://github.com/sigven/GDComics

  rnaseq_path <- file.path(
    raw_db_dir,
    "tcga",
    "current_release",
    "rnaseq"
  )

  coexpression_tsv <- file.path(
    raw_db_dir,
    "tcga",
    paste0(
      "co_expression_strong_moderate.",
      tcga_release,
      ".tsv.gz"))

  tcga_clinical_rds <- file.path(
    raw_db_dir,
    "tcga",
    "current_release",
    "clinical",
    "tcga_clinical.rds"
  )

  tcga_aberration_rds <- file.path(
    raw_db_dir,
    "tcga",
    "current_release",
    "gene",
    "tcga_gene_aberration_rate.rds"
  )

  maf_codes_tsv <- file.path(
    raw_db_dir,
    "maf_codes.tsv"
  )
  maf_path <- file.path(
    raw_db_dir,
    "tcga",
    "current_release",
    "snv_indel")


  recurrent_variants_tsv <- file.path(
    raw_db_dir,
    "tcga",
    "current_release",
    "vcf",
    "tcga_recurrent_coding_gvanno.grch38.tsv.gz"
  )

  gene_xref <- gene_xref |>
    dplyr::mutate(oncogene = dplyr::if_else(
      .data$oncogene_confidence_level == "MODERATE",
      FALSE,
      as.logical(.data$oncogene)
    )) |>
    dplyr::mutate(tumor_suppressor = dplyr::if_else(
      .data$tsg_confidence_level == "MODERATE",
      FALSE,
      as.logical(.data$tumor_suppressor)
    ))


  if(update == F & file.exists(rds_fname)){
    tcgadb <- readRDS(file = rds_fname)
    return(tcgadb)
  }


  tcga_clinical <-
    readRDS(file = tcga_clinical_rds)
  tcga_aberration_stats <-
    readRDS(file = tcga_aberration_rds) |>
    dplyr::filter(
      clinical_strata == "site" |
        (clinical_strata == "site_diagnosis" &
           percent_mutated >= 1))

  tcga_aberration_stats$genomic_strata <- NULL
  tcga_aberration_stats$entrezgene <- NULL
  tcga_aberration_stats$consensus_calls <- NULL
  tcga_aberration_stats$fp_driver_gene <- NULL

  site_code <- tcga_aberration_stats |>
    dplyr::select(primary_site) |>
    dplyr::distinct() |>
    dplyr::arrange(primary_site) |>
    dplyr::mutate(site_code = dplyr::row_number())

  diagnosis_code <- tcga_aberration_stats |>
    dplyr::select(primary_diagnosis_very_simplified) |>
    dplyr::distinct() |>
    dplyr::rename(
      primary_diagnosis = primary_diagnosis_very_simplified) |>
    dplyr::arrange(primary_diagnosis) |>
    dplyr::mutate(
      diagnosis_code = dplyr::row_number())

  clinical_strata_code <- tcga_aberration_stats |>
    dplyr::select(clinical_strata) |>
    dplyr::distinct() |>
    dplyr::mutate(
      clinical_strata_code = dplyr::row_number())


  tcga_aberration_stats <- tcga_aberration_stats |>
    dplyr::rename(
      primary_diagnosis = primary_diagnosis_very_simplified) |>
    dplyr::left_join(
      clinical_strata_code, by = "clinical_strata",
      relationship = "many-to-many") |>
    dplyr::left_join(
      diagnosis_code, by = "primary_diagnosis",
      relationship = "many-to-many") |>
    dplyr::left_join(
      site_code, by = "primary_site",
      relationship = "many-to-many") |>
    dplyr::select(-c(primary_site, primary_diagnosis,
                     clinical_strata))

  maf_codes <- read.table(
    file = maf_codes_tsv,
    header = T, sep = "\t", quote ="")

  maf_datasets <- list()
  i <- 1
  while(i <= nrow(maf_codes)){
    primary_site <- maf_codes[i,]$primary_site
    maf_code <- maf_codes[i,]$code
    maf_file <- file.path(
      maf_path, paste0(
        "tcga_mutation_grch38_",
        maf_code,"_0.maf.gz"))
    if(file.exists(maf_file)){
      tmp <- read.table(gzfile(maf_file), quote="",
                        header = T, stringsAsFactors = F,
                        sep="\t", comment.char="#")
      tmp$primary_site <- NULL
      tmp$site_diagnosis_code <- NULL
      tmp$Tumor_Sample_Barcode <- stringr::str_replace(
        tmp$Tumor_Sample_Barcode,"-[0-9][0-9][A-Z]$","")

      clinical <- tcga_clinical$slim |>
        dplyr::filter(primary_site == primary_site) |>
        dplyr::select(bcr_patient_barcode, primary_diagnosis_very_simplified,
                      MSI_status, Gleason_score, ER_status,
                      PR_status, HER2_status,
                      pancan_subtype_selected) |>
        dplyr::rename(Diagnosis = primary_diagnosis_very_simplified,
                      Tumor_Sample_Barcode = bcr_patient_barcode,
                      PanCancer_subtype = pancan_subtype_selected) |>
        dplyr::semi_join(
          tmp, by = "Tumor_Sample_Barcode")

      write.table(tmp, file="tmp.maf", quote=F, row.names = F,
                  col.names = T, sep ="\t")
      system('gzip -f tmp.maf', intern = T)
      maf <- maftools::read.maf(
        "tmp.maf.gz", verbose = F, clinicalData = clinical)
      maf_datasets[[maf_code]] <- maf
    }
    i <- i + 1
    cat(primary_site,'\n')

  }
  system('rm -f tmp.maf.gz')


  pfam_domains <-
    as.data.frame(PFAM.db::PFAMDE) |>
    dplyr::rename(PFAM_ID = ac,
                  PFAM_DOMAIN_NAME = de)

  ##TCGA recurrent SNVs/InDels
  recurrent_tcga_variants <- as.data.frame(readr::read_tsv(
    file = recurrent_variants_tsv,
    skip = 1, na = c("."), show_col_types = F) |>
      dplyr::select(TCGA_SITE_RECURRENCE,
                    CHROM, POS, REF, ALT,
                    TCGA_TOTAL_RECURRENCE,
                    PFAM_DOMAIN, HGVSc, LoF,
                    MUTATION_HOTSPOT,
                    MUTATION_HOTSPOT_MATCH,
                    HGVSp_short,
                    ONCOGENICITY_CLASSIFICATION,
                    ONCOGENICITY_SCORE,
                    ENSEMBL_TRANSCRIPT_ID,
                    SYMBOL,
                    COSMIC_MUTATION_ID,
                    Consequence,
                    AMINO_ACID_START,
                    VEP_ALL_CSQ) |>
      dplyr::mutate(VAR_ID = paste(
        CHROM, POS, REF, ALT, sep = "_")
      ) |>
      dplyr::rename(PFAM_ID = PFAM_DOMAIN) |>
      dplyr::mutate(LOSS_OF_FUNCTION = FALSE) |>
      dplyr::mutate(LOSS_OF_FUNCTION = dplyr::if_else(
        !is.na(LoF) & LoF == "HC",
        as.logical(TRUE),
        as.logical(LOSS_OF_FUNCTION),
      )) |>
      dplyr::rename(CONSEQUENCE = Consequence,
                    TOTAL_RECURRENCE = TCGA_TOTAL_RECURRENCE,
                    PROTEIN_CHANGE = HGVSp_short) |>
      dplyr::select(SYMBOL,
                    VAR_ID,
                    CONSEQUENCE,
                    PROTEIN_CHANGE,
                    AMINO_ACID_START,
                    PFAM_ID,
                    HGVSc,
                    MUTATION_HOTSPOT,
                    MUTATION_HOTSPOT_MATCH,
                    ONCOGENICITY_CLASSIFICATION,
                    ONCOGENICITY_SCORE,
                    LOSS_OF_FUNCTION,
                    ENSEMBL_TRANSCRIPT_ID,
                    COSMIC_MUTATION_ID,
                    TCGA_SITE_RECURRENCE,
                    TOTAL_RECURRENCE,
                    VEP_ALL_CSQ) |>
      tidyr::separate_rows(TCGA_SITE_RECURRENCE, sep=",") |>
      tidyr::separate(TCGA_SITE_RECURRENCE, into =
                        c("PRIMARY_SITE","SITE_RECURRENCE", "TCGA_SAMPLES"),
                      sep = ":",
                      remove = T) |>
      dplyr::select(-TCGA_SAMPLES) |>
      dplyr::distinct() |>
      dplyr::mutate(PRIMARY_SITE = dplyr::case_when(
        PRIMARY_SITE == "CNS_Brain" ~ "CNS/Brain",
        PRIMARY_SITE == "Colon_Rectum" ~ "Colon/Rectum",
        PRIMARY_SITE == "Head_and_Neck" ~ "Head and Neck",
        PRIMARY_SITE == "Esophagus_Stomach" ~ "Esophagus/Stomach",
        PRIMARY_SITE == "Bladder_Urinary_Tract" ~ "Bladder/Urinary Tract",
        PRIMARY_SITE == "Adrenal_Gland" ~ "Adrenal Gland",
        PRIMARY_SITE == "Biliary_Tract" ~ "Biliary Tract",
        PRIMARY_SITE == "Ovary_Fallopian_Tube" ~ "Ovary/Fallopian Tube",
        PRIMARY_SITE == "Soft_Tissue" ~ "Soft Tissue",
        TRUE ~ as.character(PRIMARY_SITE))
      ) |>
      dplyr::filter(!stringr::str_detect(
        CONSEQUENCE,"^(intron|intergenic|mature|non_coding|synonymous|upstream|downstream|3_prime|5_prime)"))
  )

  ## reduce number of properties in VEP_ALL_CSQ
  csq_all_slim <- as.data.frame(
    recurrent_tcga_variants |>
    dplyr::select(VAR_ID, VEP_ALL_CSQ) |>
    tidyr::separate_rows(VEP_ALL_CSQ, sep=",") |>
    tidyr::separate(
      VEP_ALL_CSQ, c("V1","V2","V3","V4",
                     "V5","V6","V7","V8"),
                     sep = ":") |>
    dplyr::mutate(VEP_ALL_CSQ = paste(
      V1,V7,V4,V5, sep=":"
    )) |>
    dplyr::group_by(VAR_ID) |>
    dplyr::summarise(VEP_ALL_CSQ = paste(
      unique(VEP_ALL_CSQ), collapse=", "
    ), .groups = "drop")
  )

  recurrent_tcga_variants$VEP_ALL_CSQ <- NULL
  recurrent_tcga_variants <- recurrent_tcga_variants |>
    dplyr::left_join(
      csq_all_slim, by = "VAR_ID",
      relationship = "many-to-many") |>
    dplyr::select(
      SYMBOL, VAR_ID, CONSEQUENCE,
      PROTEIN_CHANGE, MUTATION_HOTSPOT,
      ONCOGENICITY_CLASSIFICATION,
      ONCOGENICITY_SCORE,
      AMINO_ACID_START, PFAM_ID,
      LOSS_OF_FUNCTION, ENSEMBL_TRANSCRIPT_ID,
      MUTATION_HOTSPOT_MATCH,
      COSMIC_MUTATION_ID, PRIMARY_SITE,
      SITE_RECURRENCE, TOTAL_RECURRENCE,
      VEP_ALL_CSQ
    )


  ##TCGA co-expression data
  raw_coexpression <-
    readr::read_tsv(
      coexpression_tsv,
      col_names = c("symbol_A","symbol_B",
                    "r","p_value","tumor"),
      show_col_types = F)
  coexpression_genes1 <- raw_coexpression |>
    dplyr::rename(symbol = symbol_A,
                  symbol_partner = symbol_B) |>
    dplyr::left_join(
      dplyr::select(
        gene_xref, symbol,
        tumor_suppressor,
        oncogene, cancer_driver),
      by = "symbol",  relationship = "many-to-many") |>
    dplyr::filter(
      tumor_suppressor == T |
        oncogene == T |
        cancer_driver == T)

  coexpression_genes2 <- raw_coexpression |>
    dplyr::rename(symbol = symbol_B,
                  symbol_partner = symbol_A) |>
    dplyr::left_join(
      dplyr::select(
        gene_xref, symbol, tumor_suppressor,
        oncogene, cancer_driver),
      by = "symbol",  relationship = "many-to-many") |>
    dplyr::filter(
      tumor_suppressor == T |
        oncogene == T |
        cancer_driver == T)

  rm(raw_coexpression)

  tcga_coexp_db <- dplyr::bind_rows(
    coexpression_genes1,
    coexpression_genes2) |>
    dplyr::arrange(desc(r)) |>
    dplyr::mutate(p_value = signif(
      p_value, digits = 4)) |>
    dplyr::filter(p_value < 1e-6) |>
    dplyr::mutate(r = signif(r, digits = 3)) |>
    dplyr::filter((r <= -0.7 & r < 0) | (r >= 0.7))

  gdc_projects <- as.data.frame(TCGAbiolinks::getGDCprojects()) |>
    dplyr::filter(is.na(dbgap_accession_number) &
                    startsWith(id,"TCGA"))

  all_tcga_mean_median_tpm <- data.frame()

  for(i in 1:nrow(gdc_projects)){
    tumor_code <- gdc_projects[i,]$tumor
    rnaseq_rds_fname <-
      file.path(
        rnaseq_path,
        paste0("rnaseq_",
               tumor_code,
               ".rds")
      )

    cat(tumor_code, '\n')
    if(file.exists(rnaseq_rds_fname)){
      exp_data <- readRDS(file = rnaseq_rds_fname)

      tpm_matrix <- exp_data$matrix$tpm$tumor
      rm(exp_data)

      colnames(tpm_matrix) <-
        stringr::str_replace(
          colnames(tpm_matrix),"-0[0-9][A-Z]$","")

      clinical_subtype_samples <- tcga_clinical$slim |>
        dplyr::filter(tumor == tumor_code) |>
        dplyr::filter(primary_diagnosis_very_simplified !=
                        "Other subtype(s)") |>
        dplyr::select(site_diagnosis_code, bcr_patient_barcode)

      all_tpm_data <-  get_mean_median_tpm(
        tpm_matrix,
        tumor_code = tumor_code
      )

      if(length(unique(clinical_subtype_samples$site_diagnosis_code)) > 1){

        subtypes <- unique(clinical_subtype_samples$site_diagnosis_code)

        for(subtype in subtypes){

          subtype_samples <- clinical_subtype_samples[
            clinical_subtype_samples$site_diagnosis_code == subtype,]$bcr_patient_barcode

          tpm_matrix_subtype <- tpm_matrix[
            , colnames(tpm_matrix) %in% subtype_samples
          ]

          tpm_data <- get_mean_median_tpm(
            tpm_matrix_subtype,
            tumor_code = subtype
          )

          all_tpm_data <- all_tpm_data |>
            dplyr::bind_rows(tpm_data)

        }

      }
      all_tcga_mean_median_tpm <- all_tcga_mean_median_tpm |>
        dplyr::bind_rows(all_tpm_data)
      rm(all_tpm_data)

    }
  }

  all_tcga_tpm_above_1 <- as.data.frame(
    all_tcga_mean_median_tpm |>
      dplyr::group_by(symbol) |>
      dplyr::summarise(
        median_tpm_subtype = paste(
          sort(median_tpm_subtype), collapse="|"
        )
      )) |>
    dplyr::left_join(
      dplyr::select(
        gene_xref, symbol, gene_biotype),
      relationship = "many-to-many"
    ) |>
    dplyr::filter(gene_biotype == "protein-coding")


  tcgadb <- list()
  tcgadb[['coexpression']] <- tcga_coexp_db
  tcgadb[['aberration']] <- tcga_aberration_stats
  tcgadb[['recurrent_variants']] <- recurrent_tcga_variants
  tcgadb[['median_ttype_expression']] <- all_tcga_tpm_above_1
  tcgadb[['pfam']] <- pfam_domains
  tcgadb[['maf_codes']] <- maf_codes
  tcgadb[['maf']] <- maf_datasets
  tcgadb[['site_code']] <- site_code
  tcgadb[['diagnosis_code']] <- diagnosis_code
  tcgadb[['clinical_strata_code']] <- clinical_strata_code

  saveRDS(tcgadb, file = rds_fname)

  return(tcgadb)

}

get_paralog_SL_predictions <- function(
  raw_db_dir = NULL,
  gene_info = NULL){


  predicted_SL_pairs_csv <- file.path(
    raw_db_dir,
    "synlethdb",
    "ryan_predicted_paralog_SL_pairs.csv"
  )

  predicted_SL_data <- readr::read_csv(
    file = predicted_SL_pairs_csv,
    show_col_types = F) |>
    dplyr::select(
      A1_entrez, A2_entrez, prediction_score,
      prediction_rank, prediction_percentile,
      min_sequence_identity, fet_ppi_overlap,
      conservation_score,
      family_size,
      has_pombe_ortholog,
      has_cerevisiae_ortholog
    ) |>
    dplyr::left_join(
      dplyr::select(gene_info, symbol, entrezgene),
      by = c("A1_entrez" = "entrezgene"),  relationship = "many-to-many"
    ) |>
    dplyr::rename(ppi_overlap = fet_ppi_overlap) |>
    dplyr::rename(symbol_A1 = symbol) |>
    dplyr::left_join(
      dplyr::select(gene_info, symbol, entrezgene),
      by = c("A2_entrez" = "entrezgene"),  relationship = "many-to-many"
    ) |>
    dplyr::rename(symbol_A2 = symbol) |>
    dplyr::mutate(
      sequence_identity_pct = round(
        min_sequence_identity * 100, digits = 3
      )
    ) |>
    dplyr::mutate(prediction_score = round(
      prediction_score, digits = 3
    )) |>
    dplyr::mutate(ppi_overlap = round(
      ppi_overlap, digits = 2
    )) |>
    dplyr::rename(entrezgene_A1 = A1_entrez,
                  entrezgene_A2 = A2_entrez) |>
    dplyr::select(entrezgene_A1,
                  symbol_A1,
                  entrezgene_A2,
                  symbol_A2,
                  prediction_score,
                  prediction_percentile,
                  sequence_identity_pct,
                  family_size,
                  conservation_score,
                  ppi_overlap,
                  has_pombe_ortholog,
                  has_cerevisiae_ortholog
                  )

  return(predicted_SL_data)


}

get_synthetic_lethality_pairs <- function(
  raw_db_dir = NULL){

  sl_pairs_csv <- file.path(
    raw_db_dir,
    "synlethdb",
    "Human_SL.csv"
  )

  sl_pairs_data_raw <- as.data.frame(
    readr::read_csv(
      sl_pairs_csv, show_col_types = F) |>
      janitor::clean_names() |>
      dplyr::mutate(
        entrezgene_a = as.integer(gene_a_identifier)) |>
      dplyr::mutate(
        entrezgene_b = as.integer(gene_b_identifier)) |>
      dplyr::rename(pmid = sl_pubmed_id) |>
      dplyr::select(
        entrezgene_a, entrezgene_b,
        pmid, sl_source, sl_cell_line,
        sl_statistic_score) |>
      dplyr::mutate(sl_id = dplyr::row_number()) |>
      tidyr::separate_rows(pmid, sep=";") |>
      tidyr::separate_rows(pmid, sep="/") |>
      dplyr::filter(!stringr::str_detect(pmid, ",")) |>
      tidyr::separate_rows(sl_cell_line, sep=";") |>
      dplyr::distinct() |>
      dplyr::group_by(entrezgene_a, entrezgene_b, sl_id,
                      pmid, sl_source, sl_statistic_score) |>
      dplyr::summarise(sl_cell_line = paste(
        sl_cell_line, collapse=";"), .groups = "drop") |>
      dplyr::ungroup() |>
      dplyr::mutate(sl_statistic_score = round(
        as.numeric(sl_statistic_score), digits = 3)) |>
      dplyr::filter(stringr::str_detect(
        sl_source, "GenomeRNAi|CRISPR"))
  )

  pmid_data <-
    get_citations_pubmed(unique(sl_pairs_data_raw$pmid),
                         chunk_size = 100) |>
    dplyr::mutate(pmid = as.character(pmid)) |>
    dplyr::rename(sl_citation = citation,
                  sl_citation_link = link)

  sl_pairs_data <- as.data.frame(
    sl_pairs_data_raw |>
      dplyr::left_join(
        pmid_data, by = "pmid",
        relationship = "many-to-many") |>
      dplyr::group_by(
        sl_id, entrezgene_a, entrezgene_b,
        sl_source, sl_cell_line, sl_statistic_score) |>
      dplyr::summarise(
        sl_pmid = paste(pmid, collapse=";"),
        sl_citation = paste(sl_citation, collapse="; "),
        sl_citation_link = paste(
          sl_citation_link, collapse= ", "),
        .groups = "drop")
  )

  return(sl_pairs_data)

}


get_subcellular_annotations <- function(
    raw_db_dir = NA,
    transcript_xref_db = NA,
    update = F){


  rds_fname <- file.path(
    raw_db_dir,
    "compartments",
    "compartmentdb.rds")

  if(update == F & file.exists(rds_fname)){
    compartmentdb <- readRDS(file = rds_fname)
    return(compartmentdb)
  }

  go_terms <- data.frame()
  go_structure <- as.list(GO.db::GOTERM)
  for(n in names(go_structure)){
    term_df <-
      data.frame(
        'go_id' = as.character(n),
        'go_term' =
          as.character(AnnotationDbi::Term(go_structure[[n]])),
        'go_ontology' =
          AnnotationDbi::Ontology(go_structure[[n]]),
        stringsAsFactors = F)
    go_terms <- dplyr::bind_rows(go_terms, term_df)
  }

  gganatogram_map_xlsx <- file.path(
    raw_db_dir,
    "compartments",
    "compartment_mapping_jw.xlsx")

  go_gganatogram_map <-
    openxlsx::read.xlsx(gganatogram_map_xlsx,
                        sheet = 2) |>
    janitor::clean_names() |>
    dplyr::select(organ, go_id) |>
    dplyr::mutate(organ = stringr::str_trim(organ)) |>
    dplyr::left_join(gganatogram::cell_key$cell, by = "organ",  relationship = "many-to-many") |>
    dplyr::select(-c(type,value,colour)) |>
    dplyr::rename(ggcompartment = organ)

  ensembl_protein_xref <- transcript_xref_db |>
    dplyr::filter(property == "ensembl_protein_id") |>
    dplyr::select(entrezgene, value) |>
    dplyr::rename(ensembl_protein_id = value) |>
    dplyr::mutate(
      ensembl_protein_id = stringr::str_replace(
        ensembl_protein_id, "\\.[0-9]{1,}$",""
      ))


  subcell_evidence <- list()
  for (e in c('experiments','knowledge','textmining')) {
    fname <- file.path(
      raw_db_dir, "compartments", paste0(
        "human_compartment_",
        e, "_full.tsv.gz"
      )
    )

    subcell_evidence[[e]] <- as.data.frame(
      readr::read_tsv(
        file = fname, col_names = F,
        show_col_types = F)
    )

    if (e == 'experiments') {
      subcell_evidence[[e]] <- subcell_evidence[[e]] |>
        dplyr::mutate(channel = "Experimental") |>
        dplyr::rename(
          ensembl_protein_id = X1,
          go_id = X3,
          source = X5,
          confidence = X7,
        ) |>
        dplyr::mutate(
          confidence = ceiling(as.numeric(confidence))
        ) |>
        dplyr::select(
          ensembl_protein_id,
          go_id,
          channel,
          confidence
        ) |>
        dplyr::filter(
          confidence > 2
        ) |>
        dplyr::distinct() |>
        dplyr::mutate(source = "HPA")
    }

    if (e == 'textmining') {
      subcell_evidence[[e]] <- subcell_evidence[[e]] |>
        dplyr::mutate(channel = "Text mining", source = "PubMed") |>
        dplyr::rename(
          ensembl_protein_id = X1,
          go_id = X3,
          confidence = X6,
          literature = X7,
        ) |>
        dplyr::rowwise() |>
        dplyr::mutate(
          confidence = min(ceiling(as.numeric(confidence)), 4)
        ) |>
        dplyr::filter(
          stringr::str_detect(
            ensembl_protein_id,"ENSP"
          )
        ) |>
        dplyr::select(
          ensembl_protein_id,
          go_id,
          channel,
          confidence
        ) |>
        dplyr::filter(
          confidence > 2
        ) |>
        dplyr::mutate(source = "PubMed")
    }

    if (e == 'knowledge') {
      subcell_evidence[[e]] <- subcell_evidence[[e]] |>
        dplyr::mutate(channel = "Knowledge") |>
        dplyr::rename(
          ensembl_protein_id = X1,
          go_id = X3,
          confidence = X7,
          source = X5,
          source2 = X6) |>
        dplyr::mutate(source = paste(
          source, source2, sep =" - "
        )) |>
        dplyr::group_by(
          ensembl_protein_id,
          go_id,
          channel,
          confidence,
        ) |>
        dplyr::summarise(
          source = paste(unique(source), collapse=", "),
          .groups = "drop"
        ) |>
        dplyr::arrange(
          ensembl_protein_id, go_id, channel, confidence
        ) |>
        dplyr::group_by(
          ensembl_protein_id, go_id, channel
        ) |>
        dplyr::top_n(-1, confidence) |>
        dplyr::filter(
          confidence > 2
        )
    }

  }

  generic_locs_skip <- dplyr::bind_rows(
    data.frame('go_id' = 'GO:0005575', stringsAsFactors = F),
    data.frame('go_id' = 'GO:0110165', stringsAsFactors = F),
    data.frame('go_id' = 'GO:0005694', stringsAsFactors = F),
    data.frame('go_id' = 'GO:0005622', stringsAsFactors = F),
    data.frame('go_id' = 'GO:0043226', stringsAsFactors = F),
    data.frame('go_id' = 'GO:0043227', stringsAsFactors = F),
    data.frame('go_id' = 'GO:0043229', stringsAsFactors = F),
    data.frame('go_id' = 'GO:0043231', stringsAsFactors = F)
  )

  subcell_loc_compartments <-
    do.call(rbind, subcell_evidence) |>
    dplyr::anti_join(
      generic_locs_skip,  by = "go_id"
    ) |>
    dplyr::left_join(
      dplyr::select(
        go_terms, go_id, go_term
      ),
      by = "go_id",  relationship = "many-to-many") |>
    dplyr::left_join(
      ensembl_protein_xref,
      by = "ensembl_protein_id",  relationship = "many-to-many"
    ) |>
    dplyr::rename(
      annotation_source = source,
      annotation_channel = channel
    ) |>
    dplyr::mutate(
      entrezgene = as.integer(entrezgene)
    ) |>
    dplyr::select(
      entrezgene,
      go_id,
      go_term,
      annotation_source,
      annotation_channel,
      confidence
    )

  subcell_figure_legend <- list()
  cell_key[['cell']]$colour <- "#ffd700"
    for (i in 1:nrow(cell_key[['cell']])) {
      subcell_figure_legend[[i]] <-
        gganatogram::gganatogram(
          data=cell_key[['cell']][i,],
          outline = T,
          fillOutline='#a6bddb',
          organism="cell",
          fill="colour")  +
        ggplot2::theme_void() +
        ggplot2::ggtitle(cell_key[['cell']][i,]$organ) +
        ggplot2::theme(
          plot.title = ggplot2::element_text(
            hjust=0.5, size=10)
        ) +
        ggplot2::coord_fixed()
    }

  compartmentdb <- list()
  compartmentdb[['compartments']] <- subcell_loc_compartments
  compartmentdb[['go_gganatogram_map']] <- go_gganatogram_map
  compartmentdb[['gganatogram_legend']] <- subcell_figure_legend

  saveRDS(compartmentdb, file = rds_fname)


  return(compartmentdb)

}


# get_subcellular_annotations <- function(
#   raw_db_dir = NULL,
#   transcript_xref_db = NULL){
#
#
#
#
#
#
#
#
#
#   uniprot_xref <- transcript_xref_db |>
#     dplyr::filter(property == "uniprot_acc") |>
#     dplyr::select(entrezgene, value) |>
#     dplyr::rename(uniprot_acc = value)
#
#   go_terms <- data.frame()
#   go_structure <- as.list(GO.db::GOTERM)
#   for(n in names(go_structure)){
#     term_df <-
#       data.frame(
#         'go_id' = as.character(n),
#         'go_term' =
#           as.character(AnnotationDbi::Term(go_structure[[n]])),
#         'go_ontology' =
#           AnnotationDbi::Ontology(go_structure[[n]]),
#         stringsAsFactors = F)
#     go_terms <- dplyr::bind_rows(go_terms, term_df)
#   }
#
#   comppi_fname <- file.path(
#     raw_db_dir,
#     "comppi",
#     "comppi_proteins.txt")
#
#   comppi_locations_large_minor_yaml <- file.path(
#     raw_db_dir,
#     "comppi",
#     "largelocs.yml"
#   )
#
#   comppi_locations_large_minor_manual <- file.path(
#     raw_db_dir,
#     "comppi",
#     "largelocs_manual.tsv"
#   )
#
#   gganatogram_map_xlsx <- file.path(
#     raw_db_dir,
#     "comppi",
#     "compartment_mapping_jw.xlsx")
#
#   go_gganatogram_map <-
#     openxlsx::read.xlsx(gganatogram_map_xlsx,
#                         sheet = 2) |>
#     janitor::clean_names() |>
#     dplyr::select(organ, go_id) |>
#     dplyr::mutate(organ = stringr::str_trim(organ)) |>
#     dplyr::left_join(gganatogram::cell_key$cell, by = "organ") |>
#     dplyr::select(-c(type,value,colour)) |>
#     dplyr::rename(ggcompartment = organ)
#
#
#   large_locs_yml <- yaml::read_yaml(file=comppi_locations_large_minor_yaml)
#   largeLoc2MinorLoc <- data.frame()
#   for(loc in names(large_locs_yml$largelocs)){
#     df <- data.frame('major_loc' = loc,
#                      'go_id' = large_locs_yml$largelocs[[loc]]$childrenIncluded,
#                      stringsAsFactors = F)
#     largeLoc2MinorLoc <- largeLoc2MinorLoc |>
#       dplyr::bind_rows(df)
#
#   }
#
#   large_locs_tsv <- as.data.frame(
#     readr::read_tsv(
#       file = comppi_locations_large_minor_manual, show_col_types = F
#     ) |>
#       dplyr::distinct()
#   )
#   largeLoc2MinorLoc <- largeLoc2MinorLoc |>
#     dplyr::bind_rows(large_locs_tsv) |>
#     dplyr::distinct()
#
#
#
#   comppidb <- as.data.frame(
#     read.table(
#       file = comppi_fname,
#       sep = "\t", header = T, quote = "",
#       stringsAsFactors = F,comment.char = "") |>
#       janitor::clean_names() |>
#       dplyr::filter(naming_convention == "UniProtKB/Swiss-Prot/P") |>
#       dplyr::select(
#         major_loc_with_loc_score,
#         protein_name,
#         minor_loc,
#         experimental_system_type,
#         localization_source_database,
#         pubmed_id) |>
#       tidyr::separate_rows(
#         minor_loc, pubmed_id,
#         localization_source_database,
#         experimental_system_type, sep="\\|") |>
#       dplyr::rename(
#         go_id = minor_loc,
#         uniprot_acc = protein_name,
#         annotation_source = localization_source_database) |>
#       dplyr::left_join(go_terms,by=c("go_id")) |>
#       dplyr::filter(!is.na(go_term)) |>
#       tidyr::separate_rows(
#         major_loc_with_loc_score,
#         sep = "\\|"
#       ) |>
#       tidyr::separate(
#         major_loc_with_loc_score,
#         into = c("major_loc","major_loc_score"),
#         sep = ":",
#         remove = T
#       ) |>
#       dplyr::mutate(
#         major_loc_score = as.numeric(major_loc_score)
#       ) |>
#       dplyr::inner_join(
#         largeLoc2MinorLoc, by = c("major_loc","go_id")
#       ) |>
#       dplyr::filter(major_loc != "N/A") |>
#       dplyr::filter(major_loc_score > 0.7) |>
#       dplyr::mutate(
#         annotation_type = dplyr::if_else(
#           stringr::str_detect(experimental_system_type,
#                               "Experimental"),
#           "Experimental","Predicted/Unknown")) |>
#       dplyr::select(-experimental_system_type) |>
#       dplyr::distinct() |>
#       dplyr::left_join(uniprot_xref,by=c("uniprot_acc")) |>
#       dplyr::filter(!is.na(uniprot_acc)) |>
#       dplyr::group_by(uniprot_acc,
#                       go_id,
#                       go_term) |>
#       dplyr::summarise(
#         annotation_source =
#           paste(sort(unique(annotation_source)), collapse="|"),
#         annotation_type =
#           paste(sort(unique(annotation_type)), collapse="|"),
#         .groups = "drop") |>
#       dplyr::mutate(
#         confidence =
#           stringr::str_count(annotation_source,
#                              pattern = "\\|") + 1)
#   )
#
#
#   subcell_figure_legend <- list()
#   cell_key[['cell']]$colour <- "#ffd700"
#   for (i in 1:nrow(cell_key[['cell']])) {
#     subcell_figure_legend[[i]] <-
#       gganatogram::gganatogram(
#         data=cell_key[['cell']][i,],
#         outline = T,
#         fillOutline='#a6bddb',
#         organism="cell",
#         fill="colour")  +
#       ggplot2::theme_void() +
#       ggplot2::ggtitle(cell_key[['cell']][i,]$organ) +
#       ggplot2::theme(
#         plot.title = ggplot2::element_text(
#           hjust=0.5, size=10)
#       ) +
#       ggplot2::coord_fixed()
#   }
#
#   subcelldb <- list()
#   subcelldb[['comppidb']] <- comppidb
#   subcelldb[['go_gganatogram_map']] <- go_gganatogram_map
#   subcelldb[['gganatogram_legend']] <- subcell_figure_legend
#
#   return(subcelldb)
#
# }
sigven/oncoEnrichR documentation built on Aug. 31, 2023, 8:05 a.m.