R/quanteda.R

Defines functions .q_features

.q_features <-
  function(text,
           method = "word",
           remove_words = c(
             "\\.",
             "\\-",
             "\\#",
             "\\'",
             "\\,",
             "\\;",
             "\\_",
             "\\DESCRIPTION:",
             "\\:",
             "\\SBIR",
             "\\I ",
             "\\II ",
             "\\III ",
             "PHASE"
           ),
           stem = F,
           dfm_dictionary = NULL,
           remove_numbers = T,
           remove_punct = T,
           remove_symbols = T,
           remove_separators = TRUE,
           remove_twitter = T,
           remove_hyphens = T,
           collocation_size = 3,
           include_textstat = T,
           remove_url = FALSE,
           stop_sources = c("smart", "snowball", "stopwords-iso"),
           n_gram_tokens = 2,
           n_top_features = 5,
           include_dfm = F,
           verbose = T) {
    if (length(remove_words) > 0) {
      remove_slug <-
        remove_words %>%
        str_c(collapse = "|")
      text <-
        text %>% str_remove_all(remove_slug)
    }
    text <-
      text %>% str_replace_all("\\+", " ") %>%
      str_trim() %>%
      gsub("\\(","", .) %>%
      gsub("\\)", "", .)

    if (remove_numbers) {
      text <- text %>% str_remove_all("[0-9]")
    }

    if (length(dfm_dictionary) > 0) {
      if (class(dfm_dictionary) == "list") {
        dfm_dictionary <- dictionary(dfm_dictionary)
      }
    }

    toks_base <-
      text %>%
      char_tolower() %>%
      tokens(
        what = method,
        remove_numbers = remove_numbers,
        remove_punct = remove_punct,
        remove_symbols = remove_symbols,
        remove_separators = remove_separators,
        remove_twitter = remove_twitter ,
        remove_hyphens = remove_hyphens,
        remove_url = remove_url
      )

    if (length(stop_sources) > 0) {
      stop_sources %>%
        walk(function(stop_source){
          toks_base <<-
            toks_base %>%
            tokens_remove(stopwords(language = "en", source = stop_source))
        })
    }

    if (stem) {
      if (verbose) {
        "Stemming" %>% message()
      }
      toks_base <-
        toks_base %>%
        tokens_wordstem()
    }





    if (include_textstat) {
      df_col <-
        toks_base %>%
        textstat_collocations(size = collocation_size, tolower = FALSE) %>%
        as_tibble()
    }

    if (length(n_gram_tokens) > 0) {
      toks_base <-
        toks_base %>% tokens_ngrams(n = n_gram_tokens)
    }

    dfm_text <-
      toks_base %>% dfm(verbose = verbose, stem = stem, dictionary = dfm_dictionary)
    features <-
      dfm_text %>% featnames()

    data <- tibble(features = list(features))

    if (length(n_top_features) > 0) {
      df_features <-
        dfm_text %>%
        topfeatures(n = n_top_features)

      df_features <-
        tibble(feature = names(df_features), count = as.numeric(df_features))
      data <-
        data %>%
        mutate(dataTopFeatures = list(df_features))
    }

    if (include_textstat) {
      if (nrow(df_col) > 0) {
        data <- data %>%
          mutate(dataTextCollocation = list(df_col))
      }
    }

    if (include_dfm) {
      data <- data %>% mutate(dfm = list(dfm_text))
    }
    data
  }

.df_q_features <-
  function(data,
           method = "word",
           text_column = "descriptionAward",
           id_column = "idSBIR",
           join_data = T,
           remove_words = c(
             "\\.",
             "\\-",
             "\\#",
             "\\'",
             "\\,",
             "\\;",
             "\\_",
             "\\DESCRIPTION:",
             "\\:",
             "\\SBIR",
             "\\I ",
             "\\II ",
             "\\III ",
             "PHASE"
           ),
           stem = F,
           remove_numbers = T,
           remove_punct = T,
           remove_symbols = T,
           remove_separators = TRUE,
           remove_twitter = T,
           remove_hyphens = T,
           dfm_dictionary = NULL,
           n_top_features = 10,
           collocation_size = 3,
           include_textstat = T,
           remove_url = FALSE,
           stop_sources = c("smart", "snowball", "stopwords-iso"),
           n_gram_tokens = 1:3,
           exclude_features  = F,
           include_dfm = F,
           verbose = T) {
    if (length(id_column) == 0) {
      data <- data %>%
        mutate(id = 1:n())
      id_column <- "id"
    }

    df <-
      data %>%
      filter(!(is.na(!!sym(text_column))))


    df_text <-
      df %>%
      select(one_of(c(id_column, text_column)))

    .q_features_safe <- possibly(.q_features, tibble())
    all_rows <- 1:nrow(df_text)

    all_data <-
      all_rows %>%
      map_dfr(function(x) {
        glue::glue("modeling {x} of {max(all_rows)} for {text_column}") %>% message()
        df_row <- df_text %>% dplyr::slice(x)
        df <- .q_features_safe(
          text = df_row %>% pull(text_column),
          remove_words = remove_words,
          stem = stem,
          remove_numbers = remove_numbers,
          remove_punct = remove_punct,
          remove_symbols = remove_symbols,
          remove_separators = remove_separators,
          remove_twitter = remove_twitter,
          remove_hyphens = remove_hyphens,
          collocation_size = collocation_size,
          include_textstat = include_textstat,
          remove_url = remove_url ,
          stop_sources = stop_sources,
          n_gram_tokens = n_gram_tokens,
          include_dfm = include_dfm,
          verbose = verbose,
          method = method,
          dfm_dictionary = dfm_dictionary,
          n_top_features = n_top_features
        )

        if (nrow(df) == 0) {
          return(tibble())
        }

        slug <-
          str_c(
            text_column %>% substr(1, 1) %>% str_to_upper(),
            text_column %>% substr(2, nchar(text_column)),
            collapse = ""
          )
        names(df) <- names(df) %>% str_c(slug)
        df <-
          df %>%
          mutate(UQ(id_column) := df_row %>% pull(id_column)) %>%
          select(one_of(id_column), everything())
        df
      })

    if (exclude_features) {
      all_data <- all_data %>%
        select(-matches("^features"))
    }

    if (!join_data) {
      return(all_data)
    }

    data <-
      data %>%
      left_join(all_data, by = id_column)



    data
  }

#' Quanteda analysis of text vector
#'
#' @param texts vector of text
#' @param method what the unit for splitting the text, available alternatives are:
#' \describe{ \item{\code{"word"}}{(recommended default) smartest, but slowest, word tokenization method; see
#' \link[stringi]{stringi-search-boundaries} for details.}
#' \item{\code{"fasterword"}}{dumber, but faster, word tokenization metho
#' uses \code{\link[stringi]{stri_split_charclass}(x,
#' "[\\\\p{Z}\\\\p{C}]+")}} \item{\code{"fastestword"}}{dumbest, but fastest,
#'   word tokenization method, calls \code{\link[stringi]{stri_split_fixed}(x, "
#'   ")}} \item{\code{"character"}}{tokenization into individual characters}
#'   \item{\code{"sentence"}}{sentence segmenter, smart enough to handle some
#'   exceptions in English such as "Prof. Plum killed Mrs. Peacock." (but
#'   far
#'   from perfect).} }
#' @param remove_words
#' @param stem if \code{TRUE} stem words
#' @param remove_numbers logical; if \code{TRUE} remove tokens that
#' consist only
#'   of numbers, but not words that start with digits, e.g. \code{2day}
#' @param remove_punct
#' @param remove_symbols
#' @param remove_separators logical; if \code{TRUE} remove separators and
#'   separator characters (Unicode "Separator" [Z] and "Control [C]"
#'   categories). Only applicable for \code{what = "character"} (when you
#'   probably want it to be \code{FALSE}) and for \code{what = "word"} (when you
#'   probably want it to be \code{TRUE}).
#' @param remove_twitter logical; if \code{TRUE} remove Twitter characters
#'   \code{@@} and \code{#}; set to \code{TRUE} if you wish to eliminate these.
#'   Note that this will always be set to \code{FALSE} if \code{remove_punct =
#'   FALSE}.
#' @param remove_hyphens logical; if \code{TRUE} split words that are connected
#'   by hyphenation and hyphenation-like characters in between words, e.g.
#'   \code{"self-storage"} becomes \code{c("self", "storage")}.  Default is
#'   \code{FALSE} to preserve such words as is, with the hyphens.  Only applies
#'   if \code{method = "word"} or \code{what = "fasterword"}.
#' @param collocation_size integer collocation size for texstat parameter
#' @param include_textstat if \code{TRUE} applies textstat algorithm to text vector
#' @param remove_url logical; if \code{TRUE} find and eliminate URLs beginning
#'   with http(s) -- see section "Dealing with URLs".
#' @param n_gram_tokens \code{integer} of n-gram tokens - default 2L
#' @param include_dfm if \code{TRUE} includes document feature matrix
#' @param verbose if \code{TRUE} vervbose
#' @param dfm_dictionary if not \code{NULL} dictionary of word meanings
#' @param n_top_features if not \code{NULL} number of top features for feature count
#' @param exclude_features if \code{TRUE} remove nested feature list
#' @param stop_sources stop word source \itemize{
#' \item NULL
#' \item "smart"
#' \item "stopwords-iso"
#' \item  "snowball"
#' }
#'
#' @return
#' @import quanteda stopwords
#' @export
#'
#' @examples
#' qe_texts(texts = "HIGH SURFACE AREA NON-OXIDE CERAMIC ELECTRODES FOR ULTRACAPACITORS", n_gram_tokens = 1:4)
#'
qe_texts <-
  function(texts = NULL,
           method = "word",
           remove_words = c(
             "\\.",
             "\\-",
             "\\#",
             "\\'",
             "\\,",
             "\\;",
             "\\_",
             "\\DESCRIPTION:",
             "\\:",
             "\\SBIR",
             "\\I ",
             "\\II ",
             "\\III ",
             "PHASE"
           ),
           dfm_dictionary = NULL,
           n_top_features = 10,
           stem = F,
           exclude_features = F,
           remove_numbers = T,
           remove_punct = T,
           remove_symbols = T,
           remove_separators = TRUE,
           remove_twitter = T,
           remove_hyphens = T,
           collocation_size = 3,
           include_textstat = T,
           remove_url = FALSE,
           stop_sources = c("smart", "snowball", "stopwords-iso"),
           n_gram_tokens = 2,
           include_dfm = F,
           verbose = T) {
    if (length(texts) == 0) {
      stop("Enter text vector")
    }
    .q_features_safe <- possibly(.q_features, tibble())
    all_data <-
      texts %>%
      map_dfr(function(text) {
        .q_features_safe(
          text = text,
          method = method,
          remove_words = remove_words,
          stem = stem,
          remove_numbers = remove_numbers,
          remove_punct = remove_punct,
          remove_symbols = remove_symbols,
          remove_separators = remove_separators,
          remove_twitter = remove_twitter,
          remove_hyphens = remove_hyphens,
          collocation_size = collocation_size,
          include_textstat = include_textstat,
          remove_url = remove_url,
          stop_sources = stop_sources,
          n_gram_tokens = n_gram_tokens ,
          include_dfm = include_dfm,
          verbose = verbose,
          dfm_dictionary = dfm_dictionary,
          n_top_features = n_top_features
        ) %>%
          mutate(text) %>%
          select(text, everything())
      }) %>%
      filter(!is.na(text))
    d_f_cols <- all_data %>%
      select(matches("data|feature")) %>%
      names()

    if (length(d_f_cols) > 0) {
      df_features <- all_data %>%
        select(matches("data|feature")) %>%
        mutate_at(d_f_cols, list(function(x) {
          x %>% map_dbl(length) > 0
        }))

      new_name <- d_f_cols %>% substr(1, 1) %>% str_to_upper() %>%
        str_c(d_f_cols %>% substr(2, nchar(d_f_cols)), sep = "")
      new_names <- str_c("has", new_name)
      names(df_features)[names(df_features) %in% d_f_cols] <-
        new_names

      all_data <-
        all_data %>%
        bind_cols(df_features)

    }

    if (exclude_features) {
      all_data <-
        all_data %>%
        select(-matches("^features"))
    }



    all_data
  }


#' Apply quanteda to text data
#'
#' @param data \code{tibble}
#' @param text_columns vector text columns
#' @param id_column if not \code{NULL} id column
#' @param join_data if \code{TRUE} joins to original data
#' @param method what the unit for splitting the text, available alternatives are:
#' \describe{ \item{\code{"word"}}{(recommended default) smartest, but slowest, word tokenization method; see
#' \link[stringi]{stringi-search-boundaries} for details.}
#' \item{\code{"fasterword"}}{dumber, but faster, word tokenization metho
#' uses \code{\link[stringi]{stri_split_charclass}(x,
#' "[\\\\p{Z}\\\\p{C}]+")}} \item{\code{"fastestword"}}{dumbest, but fastest,
#'   word tokenization method, calls \code{\link[stringi]{stri_split_fixed}(x, "
#'   ")}} \item{\code{"character"}}{tokenization into individual characters}
#'   \item{\code{"sentence"}}{sentence segmenter, smart enough to handle some
#'   exceptions in English such as "Prof. Plum killed Mrs. Peacock." (but
#'   far
#'   from perfect).} }
#' @param remove_words
#' @param stem if \code{TRUE} stem words
#' @param remove_numbers logical; if \code{TRUE} remove tokens that
#' consist only
#'   of numbers, but not words that start with digits, e.g. \code{2day}
#' @param remove_punct
#' @param remove_symbols
#' @param remove_separators logical; if \code{TRUE} remove separators and
#'   separator characters (Unicode "Separator" [Z] and "Control [C]"
#'   categories). Only applicable for \code{what = "character"} (when you
#'   probably want it to be \code{FALSE}) and for \code{what = "word"} (when you
#'   probably want it to be \code{TRUE}).
#' @param remove_twitter logical; if \code{TRUE} remove Twitter characters
#'   \code{@@} and \code{#}; set to \code{TRUE} if you wish to eliminate these.
#'   Note that this will always be set to \code{FALSE} if \code{remove_punct =
#'   FALSE}.
#' @param remove_hyphens logical; if \code{TRUE} split words that are connected
#'   by hyphenation and hyphenation-like characters in between words, e.g.
#'   \code{"self-storage"} becomes \code{c("self", "storage")}.  Default is
#'   \code{FALSE} to preserve such words as is, with the hyphens.  Only applies
#'   if \code{method = "word"} or \code{what = "fasterword"}.
#' @param collocation_size integer collocation size for texstat parameter
#' @param include_textstat if \code{TRUE} applies textstat algorithm to text vector
#' @param remove_url logical; if \code{TRUE} find and eliminate URLs beginning
#'   with http(s) -- see section "Dealing with URLs".
#' @param stop_sources stopword source \itemize{
#' \item smart
#' \item snowball
#' \item stopwords-iso
#' }
#' @param n_gram_tokens \code{integer} of n-gram tokens - default 2L
#' @param include_dfm if \code{TRUE} includes document feature matrix
#' @param verbose if \code{TRUE} vervbose
#' @param dfm_dictionary if not \code{NULL} dictionary of word meanings
#' @param n_top_features if not \code{NULL} number of top features for feature count
#' @param exclude_features if \code{TRUE} exclude feature columns
#'
#' @return
#' @export
#' @import quanteda stopwords
#'
#' @examples
#' library(tidyverse)
#' data <- tibble(idSBIR = 190558, nameAward = "WEB-BASED NUTRITION EDUCATION FOR COLLEGE STUDENTS", descriptionAward = "THE NEGATIVE HEALTH AND SOCIAL CONSEQUENCES OF POOR NUTRITION ARE WELL DOCUMENTED IN COLLEGE STUDENTS AND THE OUTCOME OF POOR EATING HABITS IS MANIFEST IN BOTH OBESITY AND A VARIETY OFHEALTH CONCERNS. THIS APPLICATION PROPOSES THE DEVELOPMENT OF A COLLEGE STUDENTWEBSITE CALLED MYSTUDENTBODY.COM (NUTRITION), TO BE BASED AT A COLLEGE PERSONALHEALTH INTERNET PORTAL CALLED MYSTUDENTBODY.COM. THE PROGRAM WILL BE OFFEREDTHROUGH COLLEGES AND UNIVERSITIES TO EDUCATE STUDENTS ABOUT HEALTHY NUTRITIONAND LEARN EFFECTIVE, TAILORED HEALTHY EATING STRATEGIES. USING INTERACTIVE,WEB-BASED TECHNOLOGY, THIS PSYCHOEDUCATIONAL PROGRAM WILL BE SUPPORTED BY ANUMBER OF UNIQUE FEATURES THAT WILL MAKE IT A TRUE INNOVATION IN THE AREA OFNUTRITION EDUCATION FOR COLLEGE STUDENTS. MYSTUDENTBODY.COM (NUTRITION) WILLGUIDE STUDENTS THROUGH AN INTERACTIVE PROGRAM DESIGNED TO TEACH EFFECTIVENUTRITION EDUCATION IN AN INTERNET CONTEXT THAT IS INFORMATIVE, ENGAGING, ANDDRAMATIC. THE CURRENT APPLICATION COMBINES STATE-OF-THE-ART KNOWLEDGE ABOUTTAILORING STRATEGIES WITH ADVANCES IN INTERNET-BASED TECHNOLOGIES.MYSTUDENTBODY.COM (NUTRITION) OFFERS AN ONLINE PERSONALIZED NUTRITION EDUCATIONPROGRAM ALLOWING STUDENTS TO RECEIVE EMPIRICALLY-BASED INFORMATION AND FEEDBACKIN A CONFIDENTIAL MANNER.")
#'
#' qe_data(data = data, id_column = "idSBIR", text_columns = c("nameAward", "descriptionAward"), n_gram_tokens = 2:3, n_top_features = 5) %>% glimpse()
#'
qe_data <-
  function(data,
           text_columns = NULL,
           id_column = NULL,
           join_data = T,
           method = "word",
           remove_words = c(
             "\\.",
             "\\-",
             "\\#",
             "\\'",
             "\\,",
             "\\;",
             "\\_",
             "\\DESCRIPTION:",
             "\\:",
             "\\SBIR",
             "\\I ",
             "\\II ",
             "\\III ",
             "PHASE"
           ),
           dfm_dictionary = NULL,
           n_top_features = 10,
           stem = F,
           remove_numbers = T,
           remove_punct = T,
           remove_symbols = T,
           exclude_features = F,
           remove_separators = TRUE,
           remove_twitter = T,
           remove_hyphens = T,
           collocation_size = 2,
           include_textstat = F,
           remove_url = T,
           stop_sources = c("smart", "snowball", "stopwords-iso"),
           n_gram_tokens = 2:3,
           include_dfm = F,
           verbose = T) {

    .df_q_features_safe <- possibly(.df_q_features, NULL)
    all_data <-
      text_columns %>%
      map(function(text_column) {
        .df_q_features_safe(
          data = data,
          method = method,
          text_column = text_column,
          id_column = id_column,
          remove_words = remove_words,
          stem = stem,
          remove_numbers = remove_numbers,
          remove_punct = remove_punct,
          remove_symbols = remove_symbols,
          remove_separators = remove_separators,
          remove_twitter = remove_twitter,
          exclude_features = exclude_features,
          remove_hyphens = remove_hyphens,
          collocation_size = collocation_size,
          include_textstat = include_textstat,
          remove_url = remove_url,
          stop_sources = stop_sources,
          n_gram_tokens = n_gram_tokens,
          include_dfm = include_dfm,
          join_data = F,
          verbose = verbose,
          dfm_dictionary = dfm_dictionary,
          n_top_features = n_top_features
        )
      }) %>%
      discard(function(x){
        length(x) == 0
      })

    all_data <- all_data %>% purrr::reduce(left_join)

    d_f_cols <- all_data %>%
      select(matches("data|feature")) %>%
      names()

    if (length(d_f_cols) > 0) {
      df_features <-
        all_data %>%
        select(one_of(id_column), matches("data|feature")) %>%
        mutate_at(d_f_cols, list(function(x) {
          x %>% map_dbl(length) > 0
        }))

      new_name <- d_f_cols %>% substr(1, 1) %>% str_to_upper() %>%
        str_c(d_f_cols %>% substr(2, nchar(d_f_cols)), sep = "")
      new_names <- str_c("has", new_name)
      names(df_features)[names(df_features) %in% d_f_cols] <-
        new_names

      all_data <-
        all_data %>%
        left_join(df_features, by = id_column)

    }


    if (!join_data) {
      return(all_data)
    }

    data <-
      data %>%
      left_join(all_data, by = id_column)

    data

  }
abresler/govtrackR documentation built on July 11, 2020, 12:30 a.m.