R/slowrate-helpers.R

Defines functions process_keyword_df calc_keyword_scores split_hyphenated_words filter_words get_cand_words split_string append_split_txt stop_pos_tags handle_pos_error get_pos_tags

# get_pos_tags adapted from:
# http://martinschweinberger.de/docs/articles/PosTagR.pdf
get_pos_tags <- function(txt, word_token_annotator, pos_annotator) {
  str_txt <- NLP::as.String(txt)
  a2 <- NLP::Annotation(1L, "sentence", 1L, nchar(str_txt))
  a2 <- NLP::annotate(str_txt, word_token_annotator, a2)
  a3 <- NLP::annotate(str_txt, pos_annotator, a2)
  a3w <- a3[a3$type == "word"]
  pos <- unlist(lapply(a3w$features, `[[`, "POS"))
  data.frame(
    word = str_txt[a3w],
    pos = pos,
    stringsAsFactors = FALSE
  )
}

handle_pos_error <- function(c_obj) {
  er <- c_obj$message
  if (grepl("memory", er, ignore.case = TRUE)) {
    stop(
      er, "\n\nSee the second FAQ in the 'Frequently asked questions' ",
      "vignette for how to fix this."
    )
  } else {
    stop(er)
  }
}

stop_pos_tags <- function(pos_word_df, stop_pos) {
  in_stop_pos <- pos_word_df$pos %in% stop_pos
  pos_word_df$word[in_stop_pos] <- "."
  paste(pos_word_df$word, collapse = " ")
}

append_split_txt <- function(stop_words, phrase_delims) {
  if (!is.null(stop_words)) {
    stop_words_wrds <- paste0("\\b", tolower(stop_words), "\\b")
    stop_regex <- paste0(stop_words_wrds, collapse = "|")
    paste0(stop_regex, "|", phrase_delims)
  } else {
    phrase_delims
  }
}

split_string <- function(txt, regex) {
  x <- strsplit(txt, regex)[[1]]
  strsplit(x, " ")
}

get_cand_words <- function(txt, stop_words, phrase_delims) {
  regex <- append_split_txt(stop_words, phrase_delims)
  split_string(txt, regex)
}

filter_words <- function(cand_words, word_min_char) {
  temp_vec <- lapply(cand_words, function(x)
    x[x != "" & grepl("[[:alpha:]]", x) & nchar(x) >= word_min_char])
  temp_vec[vapply(temp_vec, length, numeric(1)) > 0]
}

split_hyphenated_words <- function(cand_words) {
  lapply(cand_words, function(x) {
    split <- strsplit(x, "-")
    vec <- unlist(split)
    vec[vec != ""]
  })
}

calc_keyword_scores <- function(cand_words) {
  # Get a list of unique words in each keyword so we don't double count (e.g.,
  # don't double count "vector" in "vector times vector").
  unq_wrds <- unlist(lapply(cand_words, unique))

  wrd_cnts <- as.matrix(table(unq_wrds))

  temp_score1 <- vapply(
    rownames(wrd_cnts), function(x)
      sum(
        vapply(
          cand_words, function(q) ifelse(x %in% q, length(q) - 1, 0), numeric(1)
        )
      )
    , numeric(1)
  )

  degree <- temp_score1 + wrd_cnts[, 1]

  word_scores <- structure(degree / wrd_cnts, names = rownames(wrd_cnts))
  unlist(lapply(cand_words, function(x) sum(word_scores[x])))
}

process_keyword_df <- function(keyword_df) {
  key_cnts <- table(keyword_df$keyword)
  key_cntsdf <- as.data.frame(key_cnts, stringsAsFactors = FALSE)
  colnames(key_cntsdf) <- c("keyword", "freq")
  key_df <- merge(key_cntsdf, keyword_df, by = "keyword")
  out_df <- unique(key_df[order(key_df$score, decreasing = TRUE), ])
  row.names(out_df) <- NULL
  out_df
}
crew102/slowraker documentation built on Sept. 5, 2024, 11:22 a.m.