# nlpUtilityBelt: Auxiliary NLP tools

# If installing or loading rJava fails with an error message (macOS), symlink the JVM library first: sudo ln -s $(/usr/libexec/java_home)/jre/lib/server/libjvm.dylib /usr/local/lib

################################################################################
#' @title Gene name finder
#' @description \code{getCompleteSymbols} extracts all symbols from a data frame
#'   and looks up their gene names, synonyms (aliases), and InterPro protein,
#'   complex, and domain annotations. The function returns a data frame with a
#'   single column that lists all unique symbols, symbol synonyms, gene names,
#'   and InterPro annotations. The species must be specified by the user as a
#'   string (e.g., "Human").
#' @param data a data frame that contains gene symbols in a column named
#'   \code{biogrid_symbols}.
#' @param species a string specifying the species type.
#' @return The function \code{getCompleteSymbols} returns a data frame with a
#'   single character column, \code{symbol_list}, that lists all unique
#'   symbols, symbol synonyms, gene names, and InterPro annotations. Empty
#'   strings and missing values are removed.
#' @details
#' This function requires the \code{mygene} library; the lookup is performed
#' with \code{queryMany}. The original documentation also lists the
#' \code{getPatternList} function as a requirement, although it is not called
#' here.
#' @examples
#' ## get a single list of all symbols and names from a data frame
#'  symbols <- getCompleteSymbols(data)
################################################################################
getCompleteSymbols <- function(data, species) {
  # require() is kept (instead of requireNamespace()) so mygene stays attached
  # exactly as before, but a missing package is now an explicit error instead
  # of a later "could not find function" failure.
  if (!require(mygene, quietly = TRUE)) {
    stop("Package 'mygene' is required by getCompleteSymbols().", call. = FALSE)
  }
  if (is.null(data$biogrid_symbols)) {
    stop("`data` must contain a 'biogrid_symbols' column.", call. = FALSE)
  }

  # as.character() protects against factor columns, whose integer codes would
  # otherwise leak into the combined symbol list below.
  symbols <- as.character(unlist(data$biogrid_symbols))

  # get aliases + gene names + protein domains (interpro) for all symbols
  gene_query <- suppressWarnings(
    queryMany(symbols,
              scopes = "symbol",
              fields = c("alias", "name", "interpro"),
              species = species,
              returnall = TRUE,
              return.as = "DataFrame")
  )
  response <- gene_query$response

  # create one list of symbols; the alias field was requested but previously
  # never added, so the documented synonyms were missing from the result
  gene_info <- data.frame(
    symbol_list = unique(c(
      symbols,
      unlist(response$alias),
      unlist(response$interpro),
      unlist(response$name),
      unlist(lapply(response$interpro, function(x) unlist(x$desc)))
    )),
    stringsAsFactors = FALSE
  )

  # convert empty cells to NA for removal
  gene_info$symbol_list[gene_info$symbol_list == ""] <- NA
  gene_info <- na.omit(gene_info)

  return(gene_info)
}


################################################################################
#' @title Identifies symbols and sentences containing the symbols in a match matrix
#' @description \code{matchFinder} searches through each row (sentence) and
#'   column (symbol) of a table of matches. A column is kept when at least one
#'   of its values is greater than 0 and its name occurs (as a fixed substring)
#'   in the master symbol list \code{regex$symbols}. If exactly one symbol
#'   remains, every sentence containing it is returned; if several remain, only
#'   sentences containing at least two of them are returned.
#' @param sentence_matches A data frame of matches with one row per sentence
#'   and one named column per candidate symbol. Values greater than 0 indicate
#'   a match (e.g. match positions from \code{regexec}); other values indicate
#'   no match.
#' @param regex a large data frame of symbols and regular expression patterns;
#'   only the \code{symbols} column is used here.
#' @param sentences a character vector of sentences, the title and abstract,
#'   in the same order as the rows of \code{sentence_matches}.
#' @return The function \code{matchFinder} returns a list with two unnamed
#'   elements: the first contains the matched sentences and the second the
#'   matched symbols. If nothing matches, the string \code{"No Matches"} is
#'   returned instead.
#' @examples
#' ## identifying sentences containing specific symbols
#' matches <- matchFinder(sentence_matches, regex, sentences)
################################################################################
matchFinder <- function(sentence_matches, regex, sentences) {
  symbol_names <- names(sentence_matches)

  # A column qualifies when any of its cells holds a value > 0. unlist() copes
  # with list columns (cells produced by regexec); isTRUE() treats NA as
  # "no match" instead of raising an error in the condition.
  has_hit <- vapply(
    sentence_matches,
    function(column) isTRUE(any(unlist(column) > 0)),
    logical(1),
    USE.NAMES = FALSE
  )

  # return only those matches that are actual symbols; the symbol test is only
  # evaluated for columns that have a hit
  is_symbol <- has_hit
  is_symbol[has_hit] <- vapply(
    symbol_names[has_hit],
    function(symbol) isTRUE(any(grepl(symbol, regex$symbols, fixed = TRUE))),
    logical(1),
    USE.NAMES = FALSE
  )

  # get table of sentences (rows) by matched symbol (columns)
  kept_names <- unique(symbol_names[is_symbol])
  match_matrix <- sentence_matches[symbol_names %in% kept_names]
  n_symbols <- ncol(match_matrix)

  # if there are no matches left (this also covers empty input)
  if (n_symbols == 0) {
    return("No Matches")
  }

  hits <- match_matrix > 0

  # if only 1 symbol is found: every sentence containing it
  if (n_symbols == 1) {
    return(list(sentences[which(hits)], names(match_matrix)))
  }

  # if more than 1 symbol is found: keep rows having at least 2 matches
  keep_rows <- rowSums(hits, na.rm = TRUE) >= 2
  if (!any(keep_rows)) {
    return("No Matches")
  }

  # Row positions (not row names) index the sentences, so the lookup also
  # works when the data frame carries non-default row names.
  matched_sentences <- sentences[unname(which(keep_rows))]

  # remove columns with no matches among the kept sentences
  co_hits <- hits[keep_rows, , drop = FALSE]
  matched_symbols <- names(match_matrix)[colSums(co_hits, na.rm = TRUE) > 0]

  list(matched_sentences, matched_symbols)
}


################################################################################
#' @title Identifies user-defined keywords and sentences containing these keywords
#' @description \code{getInteractionMatches} searches among each sentence that
#'   contains two proteins using a user provided list of keywords for matches.
#'   If a match is found, the sentences containing the keywords and the matched
#'   keywords are returned as a nested list.
#' @param data a character vector of sentences separated by '|'. The text is
#'   split on both '.' and '|'.
#' @param keywords a list of keywords to identify protein-protein interactions.
#'   Keywords are matched as regular expressions.
#' @return The function \code{getInteractionMatches} returns a list with two
#'   elements: \code{sents}, the sentences containing one or more of the
#'   keywords, and \code{kws}, the matched keywords found in those sentences.
#'   If no sentence contains any of the provided keywords, \code{sents} is
#'   "No keywords found in sentences" and \code{kws} is "No keywords".
#' @details
#' This function requires the \code{quanteda} library.
#' @examples
#' ## identifying sentences containing specific keywords
#' interaction_info <- getInteractionMatches(data, keywords)
################################################################################
getInteractionMatches <- function(data, keywords) {

  if (!require(quanteda, quietly = TRUE)) {
    stop("Package 'quanteda' is required by getInteractionMatches().",
         call. = FALSE)
  }

  # Each '.'- or '|'-delimited piece becomes ONE token, so the "keyword" column
  # of the kwic() result holds whole sentences that match a keyword.
  sentence_tokens <- quanteda::as.tokens(strsplit(data, split = "[.|]"))
  int_sent <- quanteda::kwic(sentence_tokens, keywords,
                             window = 10, valuetype = "regex")

  if (is.null(int_sent) || nrow(int_sent) == 0) {
    return(list(sents = "No keywords found in sentences", kws = "No keywords"))
  }

  # Second pass: tokenise the matched sentences into words to recover the
  # keywords themselves. tokens() is used because kwic() on a corpus is
  # deprecated in quanteda 3 and no longer supported in quanteda 4.
  int_words <- quanteda::kwic(quanteda::tokens(int_sent$keyword), keywords,
                              window = 10, valuetype = "regex")

  # `=` names the elements; the previous `sents <- ...` inside list() was an
  # assignment and left the elements unnamed. Positional access still works.
  list(sents = int_sent$keyword, kws = int_words$keyword)
}

################################################################################
#' @title Find Protein-Protein Interactions
#' @description \code{getPPIs} identifies proteins in PubMed titles and
#'   abstracts and, if a match is found, returns matching information as a data
#'   frame. The function searches among each sentence that contains two
#'   proteins using a user provided list of keywords for matches. If a match is
#'   found, the sentences containing the keywords and the matched keywords are
#'   added to the data frame.
#' @param data A data frame containing PubMed Ids, gene A and B symbols,
#'   synonyms list (each symbol separated by '|'), gene A and B names, and
#'   article title and abstract. The columns \code{PMID}, \code{Title}, and
#'   \code{Abstract} are read by this function.
#' @param regex a large data frame of symbols and regular expression patterns
#'   (columns \code{symbols} and \code{patterns}).
#' @param getInteractionMatches - see specific function documentation \code{nlpUtilityBelt::getInteractionMatches}.
#' @param keywords a list of keywords to identify PPIs.
#' @return The function \code{getPPIs} returns a data frame with one row per
#'   row of \code{data} and eight character columns:
#'   \describe{
#'     \item{pmid}{PubMed IDs}
#'     \item{article_title}{PubMed article title}
#'     \item{article_abstract}{PubMed article abstract}
#'     \item{matched_symbols}{identified protein symbols, separated by '|'}
#'     \item{matched_sentences}{sentences containing a matched symbol, separated by '|'}
#'     \item{int_sentences}{first element returned by \code{getInteractionMatches} (sentences containing a matched symbol and one or more of the keywords), separated by '|'}
#'     \item{int_keywords}{second element returned by \code{getInteractionMatches} (matched keywords), separated by '|'}
#'     \item{match}{'1' when \code{matchFinder} found matching sentences for the article, otherwise '0'}
#'   }
#'   For articles without a match the four matched_*/int_* columns are empty
#'   strings. If matched sentences do not contain any of the provided keywords,
#'   \code{nlpUtilityBelt::getInteractionMatches} supplies
#'   "No keywords found in sentences" and "No keywords".
#' @details
#' This function requires the package \code{stringr}. It also uses the
#' \code{matchFinder}, \code{getInteractionMatches}, and \code{extractPOS}
#' functions.
#' @examples
#' ## interaction keywords
#' keywords <- c("bind", 
#'              "interact",
#'              "associate",
#'              "regulation",
#'              "bound",
#'              "localize",
#'              "stimulation",
#'              "regulate",
#'              "effect",
#'              "target",
#'              "component",
#'              "member",
#'              "mediate")
#'
#' ## loop over abstracts - get subset of the data for testing
#' PPI_results <- getPPIs(merged_biogrid_pubmed_results, patterns, getInteractionMatches, keywords)
#'
#' ## write out sentences
#' write.table(PPI_results, "PPI_BIOGRID_results.txt", quote  = FALSE, sep = '\t', col.names = TRUE, row.names = FALSE)
################################################################################
getPPIs <- function(data, regex, getInteractionMatches, keywords) {

  # `quietly = TRUE` must be a named argument; the previous `quietly <- TRUE`
  # was passed positionally as `lib.loc`. No stringr function is called
  # directly in this body, so a missing package is left non-fatal (require()
  # warns). NOTE(review): presumably extractPOS depends on it - confirm.
  require(stringr, quietly = TRUE)

  n_articles <- nrow(data)

  # Character columns from the start: every row is filled from a character
  # vector below, so this matches the types of the populated result.
  df <- data.frame(pmid = character(n_articles),
                   article_title = character(n_articles),
                   article_abstract = character(n_articles),
                   matched_symbols = character(n_articles),
                   matched_sentences = character(n_articles),
                   int_sentences = character(n_articles),
                   int_keywords = character(n_articles),
                   match = character(n_articles),
                   stringsAsFactors = FALSE)

  # nothing to scan (txtProgressBar also requires max > min)
  if (n_articles == 0) {
    return(df)
  }

  # progress bar; on.exit() guarantees it is closed, which the previous
  # close(pb) placed after return() never did
  pb <- txtProgressBar(min = 0, max = n_articles, style = 3)
  on.exit(close(pb), add = TRUE)

  for (i in seq_len(n_articles)) {
    pmid <- as.integer(as.character(data[i, 'PMID']))
    abstract <- as.character(data[i, 'Abstract'])
    title <- as.character(data[i, 'Title'])
    text <- paste(title, abstract)

    # POS tag symbols that are NNP or C and remove '(', ')', '[', ']' and ','
    tags <- gsub('[\\),\\]]|[\\(,\\[]', '', unique(unlist(extractPOS(text, "NNP|C"))))
    # remove tag: keep the part before the first '/'
    POS_tags <- vapply(strsplit(tags, '/', fixed = TRUE),
                       function(parts) parts[1], character(1))
    POS_tags <- POS_tags[!is.na(POS_tags) & POS_tags != ""]
    POS_tags <- gsub('\\.($)', '', POS_tags) # removing period at ends of word

    # find POS matches among the patterns.
    # NOTE(review): patterns are matched as fixed strings, not as regular
    # expressions - confirm this is intended.
    POS_matches <- unique(unlist(
      lapply(regex$patterns,
             function(p) grep(p, POS_tags, value = TRUE, fixed = TRUE)),
      use.names = FALSE
    ))

    # defaults for an article without any match
    matched_symbols <- ""
    matched_sentences <- ""
    int_sentences <- ""
    int_keywords <- ""
    match_flag <- 0

    if (length(POS_matches) > 0) {
      # title+abstract (we have this here because the other sentence tokenizers
      # don't allow a sentence to start with a lower case letter)
      sents <- unlist(strsplit(text, ". ", fixed = TRUE))

      # one column per matched symbol, one row per sentence; a value > 0 is the
      # position of the symbol in the sentence, -1 means "not found"
      position_cols <- lapply(POS_matches, function(symbol) {
        as.integer(regexpr(symbol, sents, fixed = TRUE))
      })
      names(position_cols) <- POS_matches
      POS_sent_matches <- data.frame(position_cols, check.names = FALSE,
                                     stringsAsFactors = FALSE)

      # find tag matches in sentences and look for interaction keywords.
      # matchFinder is called once; it returns a list on success and the
      # string "No Matches" otherwise, so is.list() is a scalar-safe test
      # (comparing the list with != yields a length > 1 condition).
      found <- matchFinder(POS_sent_matches, regex, sents)

      if (is.list(found)) {
        matched_sentences <- paste(unlist(found[[1]]), collapse = '|')
        matched_symbols <- paste(unlist(found[[2]]), collapse = '|')

        # get sentences with interaction words and keywords
        mint <- getInteractionMatches(matched_sentences, keywords)
        int_sentences <- paste(unlist(mint[[1]]), collapse = '|')
        int_keywords <- paste(unlist(mint[[2]]), collapse = '|')
        match_flag <- 1
      }
    }

    # No unique() here: identical field values (e.g. matched and interaction
    # sentences) must not shorten the row.
    df[i, ] <- c(pmid, title, abstract, matched_symbols, matched_sentences,
                 int_sentences, int_keywords, match_flag)

    # monitor and print progress of function
    setTxtProgressBar(pb, i)
  }

  return(df)
}