R/protein_complex.R

Defines functions annotate_protein_domain annotate_protein_complex

annotate_protein_complex <- function(query_entrez,
                                     genedb = NULL,
                                     complex_db = NULL,
                                     complex_up_xref = NULL,
                                     transcript_xref = NULL,
                                     otdb_gene_rank = NULL) {

  lgr::lgr$info( "OmniPath: retrieval of protein complex information for target set - multiple underlying sources")
  stopifnot(is.integer(query_entrez))
  stopifnot(!is.null(genedb))
  stopifnot(!is.null(complex_db))
  stopifnot(!is.null(complex_up_xref))
  stopifnot(!is.null(transcript_xref))
  stopifnot(!is.null(otdb_gene_rank))
  validate_db_df(genedb, dbtype = "genedb")
  validate_db_df(complex_db, dbtype = "protein_complex")
  validate_db_df(transcript_xref, dbtype = "transcript_xref")

  otdb_gene_rank <- otdb_gene_rank |>
    dplyr::select(c("ensembl_gene_id",
                  "global_assoc_rank")) |>
    dplyr::distinct()

  uniprot_xref <- transcript_xref |>
    dplyr::filter(.data$property == "uniprot_acc") |>
    dplyr::rename(uniprot_acc = "value") |>
    dplyr::select(c("entrezgene", "uniprot_acc"))


  target_complex_overlap <- list()
  target_complex_overlap[['omnipath']] <- data.frame()
  target_complex_overlap[['humap2']] <- data.frame()

  all_protein_complexes <- as.data.frame(
    complex_up_xref |>
      dplyr::inner_join(
        uniprot_xref,
        by = "uniprot_acc",
        relationship = "many-to-many")
  )

  if(NROW(all_protein_complexes) == 0){
    return(target_complex_overlap)
  }

  all_protein_complexes <- as.data.frame(
    all_protein_complexes |>
      dplyr::inner_join(
        dplyr::select(genedb,
                      c("entrezgene",
                      "symbol",
                      "ensembl_gene_id")),
        by = "entrezgene",
        relationship = "many-to-many") |>
      dplyr::left_join(
        dplyr::select(otdb_gene_rank,
                      c("ensembl_gene_id",
                      "global_assoc_rank")),
        by = "ensembl_gene_id",
        relationship = "many-to-many") |>
      dplyr::mutate(global_assoc_rank = dplyr::if_else(
        is.na(.data$global_assoc_rank),
        as.numeric(0),
        as.numeric(.data$global_assoc_rank)
      )) |>
      dplyr::select(-c("ensembl_gene_id")) |>
      dplyr::distinct() |>
      dplyr::arrange(.data$complex_id, .data$symbol) |>
      dplyr::mutate(
        genelink = paste0(
          "<a href ='http://www.ncbi.nlm.nih.gov/gene/",
          .data$entrezgene,
          "' target='_blank'>", .data$symbol,"</a>")) |>
      dplyr::group_by(.data$complex_id) |>
      dplyr::summarise(
        complex_genes = paste(
          unique(.data$genelink), collapse = ", "),
        num_complex_members = dplyr::n(),
        complex_cancer_rank_score = round(mean(
          .data$global_assoc_rank), digits = 3),
        .groups = "drop")) |>
    dplyr::left_join(
      complex_db, by = "complex_id",
      relationship = "many-to-many") |>
    dplyr::filter(
      .data$num_complex_members > 2) |>
    dplyr::arrange(
      dplyr::desc(.data$complex_cancer_rank_score),
      dplyr::desc(.data$num_complex_members)) |>
    dplyr::rename(annotation_source = "sources")

  target_complex_genes <- list()


  target_complex_genes[['omnipath']] <-
    data.frame("entrezgene" = query_entrez, stringsAsFactors = F) |>
    dplyr::left_join(uniprot_xref, by = "entrezgene", relationship = "many-to-many") |>
    dplyr::distinct() |>
    dplyr::filter(!is.na(.data$uniprot_acc))

  target_complex_genes[['humap2']] <-
    target_complex_genes[['omnipath']]


  for (class in c('omnipath', 'humap2')) {

    if (nrow(target_complex_genes[[class]]) > 0) {

      target_complex_overlap[[class]] <- as.data.frame(
        complex_up_xref |>
          dplyr::inner_join(target_complex_genes[[class]],
                            by = "uniprot_acc", relationship = "many-to-many")
      )

      if (nrow(target_complex_overlap[[class]]) > 0) {

        target_complex_overlap[[class]] <- target_complex_overlap[[class]] |>
          dplyr::left_join(
            dplyr::select(genedb,
                          c("entrezgene",
                          "symbol")),
            by = "entrezgene", relationship = "many-to-many")

        if (class == "humap2") {
          target_complex_overlap[[class]] <- target_complex_overlap[[class]] |>
            dplyr::filter(stringr::str_detect(.data$complex_id,"HuMAP2"))
        } else {
          target_complex_overlap[[class]] <- target_complex_overlap[[class]] |>
            dplyr::filter(!stringr::str_detect(.data$complex_id,"HuMAP2"))
        }

        if (nrow(target_complex_overlap[[class]]) > 0) {
          target_complex_overlap[[class]] <- as.data.frame(
            target_complex_overlap[[class]] |>
              dplyr::group_by(.data$complex_id) |>
              dplyr::summarise(
                target_genes = paste(unique(sort(.data$symbol)),
                                     collapse = ", ")) |>
              dplyr::ungroup() |>
              dplyr::mutate(num_target_members =
                              stringr::str_count(.data$target_genes,",") + 1) |>
              #dplyr::arrange(dplyr::desc(.data$num_target_members)) |>
              dplyr::filter(!is.na(.data$complex_id)) |>
              dplyr::inner_join(all_protein_complexes,
                                by = "complex_id", relationship = "many-to-many") |>
              dplyr::mutate(complex_cancer_rank_bin = dplyr::case_when(
                .data$complex_cancer_rank_score > 0.9 &
                  .data$complex_cancer_rank_score <= 1 ~ 1,
                .data$complex_cancer_rank_score > 0.8 &
                  .data$complex_cancer_rank_score <= 0.9 ~ 2,
                .data$complex_cancer_rank_score > 0.7 &
                  .data$complex_cancer_rank_score <= 0.8 ~ 3,
                .data$complex_cancer_rank_score > 0.6 &
                  .data$complex_cancer_rank_score <= 0.7 ~ 4,
                .data$complex_cancer_rank_score > 0.5 &
                  .data$complex_cancer_rank_score <= 0.6 ~ 5,
                .data$complex_cancer_rank_score > 0.4 &
                  .data$complex_cancer_rank_score <= 0.5 ~ 6,
                .data$complex_cancer_rank_score > 0.3 &
                  .data$complex_cancer_rank_score <= 0.4 ~ 7,
                .data$complex_cancer_rank_score > 0.2 &
                  .data$complex_cancer_rank_score <= 0.3 ~ 8,
                .data$complex_cancer_rank_score > 0.1 &
                  .data$complex_cancer_rank_score <= 0.2 ~ 9,
                TRUE ~ as.numeric(10)
              )) |>
              dplyr::arrange(
                .data$complex_cancer_rank_bin,
                dplyr::desc(.data$num_target_members),
                .data$confidence) |>
                #dplyr::desc(.data$complex_cancer_rank_score),
                #             .data$confidence) |>
              dplyr::select(-c("complex_cancer_rank_bin")) |>
              dplyr::select(c("complex_name",
                            "target_genes",
                            "complex_literature_support",
                            "complex_genes",
                            "annotation_source",
                            "disease_comment",
                            "complex_cancer_rank_score",
                            "num_target_members",
                            "complex_comment",
                            "confidence",
                            "purification_method")) |>
              dplyr::rename(literature = "complex_literature_support")
          )


          if (class == "humap2") {
            target_complex_overlap[[class]] <- target_complex_overlap[[class]] |>
              dplyr::mutate(complex_id = .data$complex_name) |>
              dplyr::select(c("complex_name",
                            "target_genes",
                            "complex_genes",
                            "complex_id",
                            "complex_cancer_rank_score",
                            "num_target_members",
                            "confidence")) |>

              dplyr::mutate(complex_id = paste0(
                "<a href='http://humap2.proteincomplexes.org/displayComplexes?complex_key=",
                .data$complex_id,
                "' target='_blank'>", .data$complex_id, "</a>"
              ))

          }
        }
      }
    }
  }

  return(target_complex_overlap)

}

annotate_protein_domain <- function(query_entrez,
                                    genedb = NULL,
                                    pfamdb = NULL,
                                    transcript_xref = NULL) {

  lgr::lgr$appenders$console$set_layout(
    lgr::LayoutFormat$new(timestamp_fmt = "%Y-%m-%d %T"))

  lgr::lgr$info( "PFAM: retrieval of protein domain information for target set")
  stopifnot(is.integer(query_entrez))
  stopifnot(!is.null(genedb))
  stopifnot(!is.null(pfamdb))
  validate_db_df(genedb, dbtype = "genedb")
  validate_db_df(pfamdb, dbtype = "protein_domain")
  validate_db_df(transcript_xref, dbtype = "transcript_xref")

  uniprot_xref <- transcript_xref |>
    dplyr::filter(.data$property == "uniprot_acc") |>
    dplyr::rename(uniprot_acc = "value") |>
    dplyr::select(c("entrezgene", "uniprot_acc"))

  all_protein_domains <- as.data.frame(
    pfamdb |>
      dplyr::inner_join(
        uniprot_xref,
        by = "uniprot_acc", relationship = "many-to-many") |>
      dplyr::distinct()
  )

  target_protein_domains <-
    data.frame("entrezgene" = query_entrez,
               stringsAsFactors = F) |>
    dplyr::left_join(
      all_protein_domains,
      by = "entrezgene", relationship = "many-to-many") |>
    dplyr::distinct() |>
    dplyr::filter(!is.na(.data$uniprot_acc))


  if (NROW(target_protein_domains) > 0) {
    target_protein_domains <-
      as.data.frame(
        target_protein_domains |>
          dplyr::inner_join(
            dplyr::select(
              genedb,
              c("entrezgene",
              "symbol")),
            by = "entrezgene", relationship = "many-to-many") |>
          dplyr::arrange(
            .data$pfam_id,
            .data$symbol) |>
          dplyr::mutate(
            pfam_id = stringr::str_replace(
              .data$pfam_id, "\\.[0-9]{1,}",""
            )
          ) |>
          dplyr::mutate(
            protein_domain =
              paste0(
                "<a href=\"https://www.ebi.ac.uk/interpro/entry/pfam/",
                .data$pfam_id,
                "\" target='_blank'>",
                .data$pfam_long_name,
                "</a>")
          ) |>
          dplyr::mutate(
            pfam_domain_gene =
              paste0(
                "<a href=\"https://www.ebi.ac.uk/interpro/protein/UniProt/",
                .data$uniprot_acc,
                "\" target='_blank'>",
                .data$symbol,
                "</a>:",
                .data$domain_freq
              )
          ) |>
          dplyr::arrange(
            .data$protein_domain,
            dplyr::desc(.data$domain_freq)
          ) |>
          dplyr::group_by(
            .data$protein_domain,
            ) |>
          dplyr::summarise(
            target_genes = paste(
              unique(.data$pfam_domain_gene),
              collapse = ", "
            ),
            target_set_frequency =
              sum(.data$domain_freq),
            .groups = "drop")
      ) |>
      dplyr::mutate(
        source_db = "Pfam"
      ) |>
      dplyr::arrange(
        dplyr::desc(.data$target_set_frequency))
  }

  return(target_protein_domains)

}
sigven/oncoEnrichR documentation built on Aug. 31, 2023, 8:05 a.m.