# R/text_to_orgs.R

# Defines functions: text_to_orgs

# Documented in: text_to_orgs

#' Match messy text data to social organizations using a 'funneling' method
#'
#' This function matches unstructured text data to various dictionaries of 
#' organizations by extracting and iterating through consecutive word sequences 
#' (or n-grams). To do this, the function extracts n-grams using the tidytext 
#' package, matching all sequences in the unstructured data that have n words 
#' and then 'funneling' through all sequences of n-1, n-2, etc. words before 
#' matching the single tokens. Once an id has been matched at a given n-gram 
#' length it is excluded from all shorter passes. This process returns a 
#' dataframe of ids and organizations for only those rows matched within the 
#' sector specified.
#'
#' @param data A data frame or data frame extension (e.g. a tibble).
#' @param id A numeric or character vector unique to each entry.
#' @param input Character vector of messy or unstructured text that will
#' be unnested as n-grams and matched to dictionary of organizations in specified sector.
#' @param output Output column to be created as string or symbol.
#' @param sector Sector to match by organizations, given as a string. One of 
#' "academic" (the default), "business", "government", or "nonprofit".
#'
#' @return A data frame with one row per matched term, holding the `id` 
#' column and the `output` column with the standardized organization name.
#'
#' @examples
#'
#' library(tidyverse)
#' library(tidyorgs)
#' data(github_users)
#'
#' classified_by_text <- github_users %>%
#'   text_to_orgs(login, company, organization, sector = "academic")
#'
#' @export
text_to_orgs <- function(data, id, input, output, 
                             sector = c("academic", "business", 
                                        "government", "nonprofit")){
  # 1. capture the column arguments and validate the sector; arg_match() 
  #    aborts on anything other than the four choices listed in the signature
  id <- dplyr::enquo(id)
  input <- dplyr::enquo(input)
  output <- dplyr::enquo(output)
  sector <- rlang::arg_match(sector)
  `%notin%` <- Negate(`%in%`)
  
  # 2. pick the sector dictionary and apply the sector-specific text cleaning
  #    NOTE(review): str_replace() only rewrites the first occurrence in each 
  #    string; confirm whether str_replace_all() was intended for these rules
  if (sector == "academic"){
    
    dictionary <- academic_institutions 
    data <- data %>%
      tidyr::drop_na(!!input) %>%
      dplyr::mutate("{{input}}" := tolower(!!input),
                    "{{input}}" := stringr::str_replace(!!input, "\\b(?i)(univ\\.)\\b", "university"),
                    "{{input}}" := stringr::str_replace(!!input, "\\b(?i)(univesity)\\b", "university"),
                    "{{input}}" := stringr::str_replace(!!input, "\\b(?i)(univeristy)\\b", "university"),
                    "{{input}}" := stringr::str_replace(!!input, "\\b(?i)(universoty)\\b", "university"),
                    "{{input}}" := stringr::str_replace(!!input, "\\b(?i)(universit)\\b", "université"),
                    "{{input}}" := stringr::str_replace(!!input, "\\b(?i)(universitt)\\b", "universität"),
                    "{{input}}" := stringr::str_replace(!!input, "\\b(?i)(technolgy)\\b", "technology"),
                    "{{input}}" := stringr::str_replace(!!input, "\\b(?i)(simn bolvar)\\b", "simón bolívar"),
                    "{{input}}" := stringr::str_replace(!!input, "\\b(?i)(mnster)\\b", "münster"),
                    "{{input}}" := stringr::str_replace(!!input, "\\b(?i)(king's)\\b", "kings"),
                    "{{input}}" := stringr::str_replace(!!input, "\\b(?i)(queen's)\\b", "queens"),
                    "{{input}}" := stringr::str_replace(!!input, "\\b(?i)(xi'an jiaotong)\\b", "xian jiaotong"),
                    "{{input}}" := stringr::str_replace(!!input, "a & m", "a&m"),
                    "{{input}}" := stringr::str_replace(!!input, "@", ""),
                    "{{input}}" := stringr::str_replace(!!input, " & ", " and "))
    
  } else if (sector == "business"){ 
    dictionary <- business_data 
    data <- data %>%
      tidyr::drop_na(!!input) %>% 
      dplyr::mutate("{{input}}" := stringr::str_replace(!!input, "@", ""),
                    "{{input}}" := tolower(!!input))
    
  } else if (sector == "government"){ 
    dictionary <- government_data 
    data <- data %>%
      tidyr::drop_na(!!input) %>% 
      dplyr::mutate("{{input}}" := stringr::str_replace(!!input, "@", ""),
                    "{{input}}" := stringr::str_replace(!!input, "\\.", ""),
                    "{{input}}" := tolower(!!input))
    
  } else { 
    # only "nonprofit" can reach this branch after arg_match()
    dictionary <- nonprofit_data 
    data <- data %>%
      tidyr::drop_na(!!input) %>% 
      dplyr::mutate("{{input}}" := stringr::str_replace(!!input, "@", ""),
                    "{{input}}" := tolower(!!input))
  }
  
  # 3. expand the pipe-delimited catch terms once (not on every pass) and 
  #    record how many words each term contains
  catch_terms <- unlist(base::strsplit(as.character(dictionary[["catch_terms"]]), "\\|"), 
                        use.names = FALSE)
  catch_terms <- catch_terms[!is.na(catch_terms)]
  term_lengths <- lengths(base::strsplit(catch_terms, "\\W+"))
  # guard against an empty dictionary, where max() would return -Inf
  max_n <- if (length(term_lengths) > 0) max(term_lengths) else 0L
  
  ids_to_filter <- c("nonexistent-user")
  funnelized <- data.frame()
  
  # 4. funnel match the n-grams from the longest dictionary term down to 2 words;
  #    the guard stops max_n:2 from counting upwards when max_n is below 2
  if (max_n >= 2) {
    for (n_word in max_n:2) {
      subdictionary <- catch_terms[term_lengths == n_word]
      funnelized <- data %>%
        dplyr::filter((!!id) %notin% ids_to_filter) %>%
        tidytext::unnest_tokens(words, !!input, token="ngrams", n=n_word, to_lower = TRUE) %>%
        dplyr::filter(words %in% subdictionary) %>%
        dplyr::select(!!id, words) %>%
        dplyr::bind_rows(funnelized)
      # pull() always yields a plain vector; `[, 1]` on a tibble returns a 
      # one-column tibble, which turned ids_to_filter into a list and 
      # silently disabled the exclusion of already-matched ids
      ids_to_filter <- unique(c(ids_to_filter, 
                                as.character(dplyr::pull(funnelized, !!id))))
    }
  }
  
  # 5. funnel match on all of the single tokens for ids not matched above
  subdictionary <- catch_terms[term_lengths == 1]
  funnelized <- data %>%
    dplyr::filter((!!id) %notin% ids_to_filter) %>%
    tidytext::unnest_tokens(words, !!input) %>%
    dplyr::filter(words %in% subdictionary) %>%
    dplyr::select(!!id, words) %>%
    dplyr::bind_rows(funnelized)
  
  # 6. standardize all of the organizations: each recode pattern is wrapped 
  #    as a case-insensitive, word-bounded regex and mapped to its 
  #    organization name
  standardizer_dictionary <- dictionary %>%
    dplyr::mutate(recode_column = paste0("\\b(?i)(", recode_column, ")\\b")) %>%
    dplyr::select(recode_column, organization_name) %>% 
    tibble::deframe()
  standardized <- funnelized %>%
    dplyr::mutate("{{output}}" := tolower(words)) %>%
    dplyr::mutate("{{output}}" := stringr::str_replace_all(!!output, 
                                                           standardizer_dictionary)) %>%
    dplyr::select(!!id, !!output)
  standardized
}
# brandonleekramer/tidyorgs documentation built on Dec. 19, 2021, 11:42 a.m.