R/feature_engineering.R
In crfsuite: Conditional Random Fields for Labelling Sequential Data in Natural Language Processing

Documented in crf_cbind_attributes txt_feature

#' @title Enrich a data.frame by adding frequently used CRF attributes
#' @description The CRF attributes which are implemented in this function 
#' are merely the neighbouring information of a certain field.
#' For example the previous word, the next word, the combination of the previous 2 words.
#' This function \code{cbind}s these neighbouring attributes as columns to the provided data.frame.\cr
#' 
#' By default it adds the following columns to the data.frame 
#' \itemize{
#'  \item{the term itself \code{(term[t])}}
#'  \item{the next term \code{(term[t+1])}}
#'  \item{the term after that \code{(term[t+2])}}
#'  \item{the previous term \code{(term[t-1])}}
#'  \item{the term before the previous term \code{(term[t-2])}}
#'  \item{as well as all combinations of these terms (bigrams/trigrams/...) where up to \code{ngram_max}
#' number of terms are combined.}
#' }
#' See the examples.
#' @param data a data.frame which will be coerced to a data.table (cbinding will be done by reference on the existing data.frame)
#' @param terms a character vector of column names which are part of \code{data} 
#' for which the function will look to the preceding and following rows in order to cbind this information to the \code{data}
#' @param by a character vector of column names which are part of \code{data} indicating the fields which define the sequence. 
#' Preceding/following terms will be looked for within data of \code{by}. 
#' Typically this will be a document identifier or sentence identifier in an NLP context.
#' @param from integer, by default set to -2, indicating to look up to 2 terms before the current term
#' @param to integer, by default  set to 2, indicating to look up to 2 terms after the current term
#' @param ngram_max integer indicating the maximum number of terms to combine (2 means bigrams, 3 trigrams, ...)
#' @param sep character indicating how to combine the previous/next/current terms. Defaults to '-'.
#' @export
#' @examples 
#' x <- data.frame(doc_id = sort(sample.int(n = 10, size = 1000, replace = TRUE)))
#' x$pos <- sample(c("Art", "N", "Prep", "V", "Adv", "Adj", "Conj", 
#'                   "Punc", "Num", "Pron", "Int", "Misc"), 
#'                   size = nrow(x), replace = TRUE)
#' x <- crf_cbind_attributes(x, terms = "pos", by = "doc_id", 
#'                           from = -1, to = 1, ngram_max = 3)
#' head(x)
#' 
#' \donttest{
#' ## Example on some real data
#' x <- ner_download_modeldata("conll2002-nl")
#' x <- crf_cbind_attributes(x, terms = c("token", "pos"), 
#'                           by = c("doc_id", "sentence_id"),
#'                           ngram_max = 3, sep = "|")
#' }
crf_cbind_attributes <- function(data, terms, by, from = -2, to = 2, ngram_max = 3, sep = "-"){
  for(column in terms){
    data <- crf_cbind_attributes_single(data = data, field = column, by = by, from = from, to = to, ngram_max = ngram_max, sep = sep)
  }
  data
}


crf_cbind_attributes_single <- function(data, field, by, from = -2, to = 2, ngram_max = 3, sep = "-"){
  .SD <- .N <- NULL
  stopifnot(inherits(data, "data.frame"))
  stopifnot(is.character(field))
  stopifnot(length(field) == 1)
  stopifnot(all(by %in% colnames(data)))
  stopifnot(all(field %in% colnames(data)))
  ngram_max <- as.integer(ngram_max)
  stopifnot(ngram_max >= 1)
  data <- setDT(data)
  
  posprocessfields <- c()
  ##
  ## LEVEL 1: just term itself and terms in the neighbourhood (previous/next term)
  ## term itself, look ahead and look back
  elements <- seq(from, to, by = 1L)
  level1 <- c()
  
  look <- elements[elements < 0]
  newfields <- sprintf("%s[t%s]", field, look)
  level1 <- append(level1, newfields)
  data[, c(newfields) := Map(x = .SD, n = look, f=function(x, n) data.table::shift(x, n = -n, type = "lag")), by = by, .SDcols = rep(field, length(newfields))]
  look <- 0
  newfields <- sprintf("%s[t]", field)
  level1 <- append(level1, newfields)
  data[, c(newfields) := Map(x = .SD, n = look, f=function(x, n) x), by = by, .SDcols = rep(field, length(newfields))]
  look <- elements[elements > 0]
  newfields <- sprintf("%s[t+%s]", field, look)
  level1 <- append(level1, newfields)
  data[, c(newfields) := Map(x = .SD, n = look, f=function(x, n) data.table::shift(x, n = n, type = "lead")), by = by, .SDcols = rep(field, length(newfields))]
  
  posprocessfields <- append(posprocessfields, level1)
  
  ##
  ## Higher levels, get all combinations of fields + paste the fields together using the separator
  ##
  for(n in 2:ngram_max){
    combinations <- combn(level1, m = n, simplify=FALSE)
    for(fields in combinations){
      newfield <- paste(fields, collapse = sep)
      fmt <- paste(rep("%s", length(fields)), collapse = sep)
      addfmt <- function(x, fmt){
        x$fmt <- fmt
        x
      }
      data[, c(newfield) := do.call(txt_sprintf, addfmt(.SD, fmt)), .SDcols = fields]
      posprocessfields <- append(posprocessfields, newfield)
    }
  }
  
  ##
  ## Add also column name as for CRFsuite all attributes are the same, this allows to distinguish the attributes among the columns
  ##
  for(newfield in posprocessfields){
    data[, c(newfield) := txt_sprintf("%s=%s", newfield, unlist(.SD)), .SDcols = newfield]
  }
  setDF(data)
}

#' @title Extract basic text features which are useful for entity recognition
#' @description Extract basic text features which are useful for entity recognition
#' @param x a character vector
#' @param type a character string, which can be one of 'is_capitalised', 'is_url', 'is_email', 'is_number', 'prefix', 'suffix', 'shape'
#' @param n for type 'prefix' or 'suffix', the number of characters of the prefix/suffix
#' @return 
#' For type 'is_capitalised', 'is_url', 'is_email', 'is_number': a logical vector of the same length as \code{x}, indicating if \code{x} is capitalised, a url, an email or a number\cr
#' For type 'prefix', 'suffix': a character vector of the same length as \code{x}, containing the prefix or suffix \code{n} number of characters of \code{x}\cr
#' For type 'shape': a character vector of the same length as \code{x}, where lowercased elements are replaced with x and uppercased elements with X
#' @export
#' @examples 
#' txt_feature("Red Devils", type = "is_capitalised")
#' txt_feature("red devils", type = "is_capitalised")
#' txt_feature("http://www.bnosac.be", type = "is_url")
#' txt_feature("info@google.com", type = "is_email")
#' txt_feature("hi there", type = "is_email")
#' txt_feature("1230000", type = "is_number")
#' txt_feature("123.15", type = "is_number")
#' txt_feature("123,15", type = "is_number")
#' txt_feature("123abc", type = "is_number")
#' txt_feature("abcdefghijklmnopqrstuvwxyz", type = "prefix", n = 3)
#' txt_feature("abcdefghijklmnopqrstuvwxyz", type = "suffix", n = 3)
#' txt_feature("Red Devils", type = "shape")
#' txt_feature("red devils", type = "shape")
txt_feature <- function(x, type = c('is_capitalised', 'is_url', 'is_email', 'is_number', 'prefix', 'suffix', 'shape'), n = 4){
  type <- match.arg(type)
  missing <- is.na(x)
  miss <- as.logical(NA)
  if(type == "is_capitalised"){
    result <- grepl("^[[:upper:]]", x)
  }else if(type == "is_url"){
    result <- grepl(pattern = "https?", x = x, ignore.case = TRUE)
  }else if(type == "is_email"){
    result <- grepl(".+@.+\\.", x)
  }else if(type == "is_number"){
    result <- grepl("^[[:digit:].,]+$", x)
  }else if(type == "prefix"){
    result <- substr(x, start = 1, stop = n)
  }else if(type == "suffix"){
    x_size <- nchar(x)
    start <- x_size - (n - 1)
    result <- substr(x, start = ifelse(start < 1, 1, start), stop = x_size)
  }else if(type == "shape"){
    result <- gsub("[[:lower:]]", "x", x)
    result <- gsub("[[:upper:]]", "X", result)
  }
  result[missing] <- miss
  result
}

Any scripts or data that you put into this service are public.

crfsuite documentation built on Sept. 17, 2023, 1:06 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

crfsuite
Conditional Random Fields for Labelling Sequential Data in Natural Language Processing

R/feature_engineering.R
In crfsuite: Conditional Random Fields for Labelling Sequential Data in Natural Language Processing

Defines functions txt_feature crf_cbind_attributes_single crf_cbind_attributes

Documented in crf_cbind_attributes txt_feature

Try the crfsuite package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

crfsuite Conditional Random Fields for Labelling Sequential Data in Natural Language Processing

R/feature_engineering.R In crfsuite: Conditional Random Fields for Labelling Sequential Data in Natural Language Processing

Defines functions txt_feature crf_cbind_attributes_single crf_cbind_attributes

Documented in crf_cbind_attributes txt_feature

Try the crfsuite package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

crfsuite
Conditional Random Fields for Labelling Sequential Data in Natural Language Processing

R/feature_engineering.R
In crfsuite: Conditional Random Fields for Labelling Sequential Data in Natural Language Processing