R/get_literature_score.R

Defines functions get_literature_score

Documented in get_literature_score

#' #' .query_pubmed
#'
#' Auxiliary function for getting the list score
#' @param search_topic Item to search on PubMed via rentrez
#' @param wait_time Time between searches
#' @param ret_max Number of IDs to be returned. Defaults to 1.
#' @return The rentrez search result (a list)
.query_pubmed <-
  function (search_topic,
            wait_time = 0,
            ret_max = 1) {
    out <- tryCatch({
      s <- rentrez::entrez_search(
        db = "pubmed",
        term = search_topic,
        retmax = ret_max,
        use_history = TRUE
      )
      return(s)
    },
    
    error = function(e) {
      tryCatch({
        message("Query failed, but I'm trying again")
        Sys.sleep(wait_time)
        s <- rentrez::entrez_search(
          db = "pubmed",
          term = search_topic,
          retmax = ret_max,
          use_history = TRUE
        )
        return(s)
        
      },
      
      error = function(e) {
        message("Query failed! Not your lucky day, but I'm trying again")
        Sys.sleep(wait_time)
        s <- rentrez::entrez_search(db = "pubmed",
                                    term = search_topic,
                                    retmax = ret_max)
        message("Actually, you are safe.")
        return(s)
      })
    })
    return(out)
  }

# Pairwise search on pubmed for detection of relevance to a certain topic
# lubianat 28/09/2018

#' get_literature_score
#'
#' Calculates the importance score for a given gene.
#' The importance score is the total counts of articles in the pubmed
#' database that are a result for that gene AND each term of a list
#'
#' @param terms_of_interest A list of terms of interest related to the topic you want to find the relevance for
#' @param genes A vector with multiple genes.
#' @param wait_time How long should be the interval between two requests to the ENTREZ database when it fails.
#' Defaults to 0. In seconds.
#' @param gene2pubmed logical defining if gene2pubmed db is going to be used.
#' If used, the vector of genes has to be of HUMAN genes in the hgcn_symbol format.
#' @param return_all Only to be used with gene2pubmed! logical defining if the
#' all_counts table is going to be returned here. Usually it is generated by the
#' test_score function.
#' @param show_progress If TRUE, a progress bar is displayed. Defaults to TRUE.
#' @param verbose If TRUE, will display the index of the search occuring. Defaults to FALSE.
#' @import rentrez
#' @import progress
#' @return A dataframe with the literature scores.
#' @export
#' @examples
#' genes <- c('CD8A', 'CD4')
#' terms_of_interest <-
#'   c(
#'     "CD4 T cell",
#'     "CD14+ Monocyte",
#'     "B cell",
#'     "CD8 T cell",
#'     "FCGR3A+ Monocyte",
#'     "NK cell",
#'     "Dendritic cell",
#'     "Megakaryocyte",
#'     'immunity'
#'   )
#' get_literature_score(genes, terms_of_interest)

get_literature_score <-
  function(genes,
           terms_of_interest,
           gene2pubmed = FALSE,
           return_all = FALSE,
           wait_time = 0,
           show_progress = TRUE,
           verbose = FALSE) {
    if (gene2pubmed) {
      small_counts_full <-
        data.frame("Genes" = "n",
                   "Topic" = "n",
                   "count" = "n")
      small_counts_full <- small_counts_full[-1, ]
      all_counts <-
        data.frame("Genes" = "n",
                   "Topic" = "n",
                   "count" = "n")
      all_counts <- all_counts[-1,]
      
      message("Querying Pubmed. Might take a while for common terms.")
      
      for (term in terms_of_interest) {
        s <- .query_pubmed(term, ret_max = 999999)
        
        genes_db <-
          gene2pubmed_db$GeneID[which(gene2pubmed_db$PubMed_ID %in% s$ids)]
        gene_counts <- as.data.frame(table(genes_db))
        genes_entrezgene_id <-
          hgcn_entrez_reference$entrezgene_id[which(hgcn_entrez_reference$hgnc_symbol %in% genes)]
        
        # Changing IDs from entrezgene_id to hgnc_symbol
        gene_counts$hgnc_symbol <-
          hgcn_entrez_reference$hgnc_symbol[hgcn_entrez_reference$entrezgene_id %in% gene_counts$genes_db]
        
        
        small_counts <-
          gene_counts[gene_counts$hgnc_symbol %in% genes, ]
        
        small_counts <- small_counts[, c("hgnc_symbol", "Freq")]
        colnames(small_counts) <- c("Genes", "count")
        
        
        if (nrow(small_counts)) {
          small_counts$Topic <- term
          small_counts <-
            small_counts[, c("Genes", "Topic", "count")]
          small_counts_full <-
            rbind(small_counts_full, small_counts)
          
        }
        
        if (length(genes[!genes %in% small_counts$Genes])) {
          small_counts_zeroes <-
            data.frame(Genes = genes[!genes %in% small_counts$Genes],
                       Topic = term,
                       count = 0)
          small_counts_full <-
            rbind(small_counts_full, small_counts_zeroes)
        }
        
        if (return_all) {
          gene_counts <- gene_counts[, c("hgnc_symbol", "Freq")]
          colnames(gene_counts) <- c("Genes", "count")
          gene_counts$Topic <- term
          gene_counts <- gene_counts[, c("Genes", "Topic", "count")]
          all_counts <- rbind(all_counts, gene_counts)
        }
      }
      
      if (return_all) {
        return(all_counts)
      }
      return(small_counts_full)
    }
    
    all_combinations <- expand.grid(genes, terms_of_interest)
    all_combinations$count <- -1
    colnames(all_combinations) <- c('Genes', 'Topic', 'count')
    
    
    number_of_combinations <- nrow(all_combinations)
    # This shows a progress bar to the user.
    pb <-
      progress::progress_bar$new(format = "[:bar] :current/:total (:percent) eta: :eta", total = number_of_combinations)
    if (show_progress) {
      message('Running PubScore:')
    }
    for (index in seq_len(nrow(all_combinations))) {
      if (show_progress) {
        pb$tick()
      }
      if (verbose) {
        message(index)
      }
      search_topic <-
        paste0(all_combinations$Genes[index], ' AND ', all_combinations$Topic[index])
      
      s <- .query_pubmed(search_topic)
      all_combinations$count[index] <- s$count
    }
    return(all_combinations)
  }
csbl-usp/PubScore documentation built on March 13, 2021, 11:03 p.m.