R/util.R

Defines functions extractText unAccent countExpression

Documented in countExpression extractText unAccent

#' Extract the main text of a web page
#'
#' Downloads the page at `url`, parses it as HTML and concatenates the text
#' nodes found under `<body>` that are longer than `minchar` characters.
#'
#' @param url URL of the page to download.
#' @param minchar Minimal length for a sentence: only text nodes whose length
#'   is strictly greater than this value are kept. Must be a single number
#'   (or a value coercible to one).
#' @param filter Boolean selecting basic (`FALSE`, the default) or strict
#'   (`TRUE`) filtering. Basic filtering drops text located inside `select`,
#'   `script`, `style`, `noscript` and `form` elements. Strict filtering
#'   additionally drops whitespace-only nodes and text located inside any
#'   element whose `id` contains "footer", "sidebar" or "comment".
#'
#' @details
#' When the download fails, a message is emitted and the string `"error"` is
#' returned instead of signalling an error. An invalid `minchar` is rejected
#' with an error before any download is attempted.
#'
#' @examples
#' \dontrun{
#' extractText("https://www.wikipedia.com", 20, filter = FALSE)
#' extractText("https://www.wikipedia.com", 20, filter = TRUE)
#' }
#' @return A character string with the main text (text nodes joined by a
#'   single space), or `"error"` when the page could not be downloaded.
#' @author Vincent Terrasi
#' @export
extractText <- function(url, minchar, filter = FALSE) {

  # minchar is interpolated into an XPath expression, so make sure it is one
  # plain number before building the query.
  min_len <- suppressWarnings(as.numeric(minchar))
  if (length(min_len) != 1L || is.na(min_len)) {
    stop("`minchar` must be a single number", call. = FALSE)
  }
  strict <- isTRUE(as.logical(filter))

  # download html
  # NOTE(review): ssl.verifypeer = FALSE disables TLS certificate checks; it
  # is kept for backward compatibility but should be reconsidered.
  html <- tryCatch(
    getURL(url, followlocation = TRUE, ssl.verifypeer = FALSE, useragent = "R"),
    error = function(e) e
  )
  if (inherits(html, "error")) {
    message("error url ", url, ": ", conditionMessage(html))
    return("error")
  }

  # parse html
  doc <- htmlParse(html, asText = TRUE)

  # Predicates shared by both filtering modes: skip text nested in these tags.
  excluded_tags <- c("select", "script", "style", "noscript", "form")
  predicates <- paste0("[not(ancestor::", excluded_tags, ")]")

  if (strict) {
    # Strict mode also skips blank nodes and common boilerplate containers.
    excluded_ids <- c("footer", "sidebar", "comment")
    predicates <- c(
      "[normalize-space()]",
      predicates,
      paste0("[not(ancestor::*[contains(@id,\"", excluded_ids, "\")])]")
    )
  }

  # format(..., scientific = FALSE) keeps large thresholds such as 100000
  # from being rendered as "1e+05", which is not valid XPath.
  xpath <- paste0(
    "//body//text()",
    paste(predicates, collapse = ""),
    "[string-length(.) > ", format(min_len, scientific = FALSE), "]"
  )

  plain_text <- xpathSApply(doc, xpath, xmlValue)

  paste(plain_text, collapse = " ")
}

#' Remove all accents from text
#'
#' @param text your text; a character vector (other types are coerced to
#'   character).
#'
#' @details
#' Quote-like characters (`'`, backtick, `^`, `~` and `"`) are first replaced
#' by a space. The text is then transliterated to ASCII, and any quote-like
#' characters produced by the transliteration itself are removed. The exact
#' ASCII rendering of an accented character depends on the platform's
#' `iconv` implementation.
#'
#' @examples
#' \dontrun{
#' unAccent(text)
#' }
#' @return a text without accents, as a character vector of the same length
#'   as `text`
#' @author Vincent Terrasi
#' @export
unAccent <- function(text) {
  # Replace quote-like characters up front so they survive as word breaks.
  text <- gsub("['`^~\"]", " ", text)
  # Normalise every element to UTF-8 so that a single vectorised iconv() call
  # works for vectors and for inputs with mixed declared encodings.
  text <- iconv(enc2utf8(text), from = "UTF-8", to = "ASCII//TRANSLIT//IGNORE")
  # Some iconv implementations render accents as an extra quote-like
  # character next to the letter; strip those.
  gsub("['`^~\"]", "", text)
}

#' Count each expression in a text
#'
#' @param text your text
#' @param searchword your expression; when several are given, a match of any
#'   of them is counted
#'
#' @details
#' Both `text` and `searchword` are passed through `unAccent()` and
#' lower-cased before counting, so matching ignores case and accents. The
#' search words are joined with `|` and used as a regular expression. Search
#' words that are `NA` or blank after normalisation are ignored; when none
#' remain the count is 0.
#'
#' @examples
#' \dontrun{
#' nb <- countExpression(text,searchword)
#' }
#' @return Number of expression: an integer vector with one count per element
#'   of `text`
#' @author Vincent Terrasi
#' @export
countExpression <- function(text, searchword) {
  text <- str_to_lower(unAccent(text))
  searchword <- str_to_lower(unAccent(searchword))

  # An empty alternative would make the pattern match between characters
  # (stringr treats "" as a character boundary), inflating the count.
  usable <- !is.na(searchword) & nzchar(trimws(searchword))
  searchword <- searchword[usable]
  if (length(searchword) == 0L) {
    return(rep(0L, length(text)))
  }

  str_count(text, paste(searchword, collapse = "|"))
}
voltek62/writingAssistantR documentation built on May 4, 2019, 7:41 p.m.