R/utils.R

#' @title Collapse a character vector while removing missing data.
#' @description Collapse a character vector while removing missing data.
#' @param x a character vector or a list of character vectors
#' @param collapse a character string to be used to collapse the vector. Defaults to a space: ' '.
#' @return a character vector of length 1 with the content of x collapsed using paste
#' @export
#' @seealso \code{\link{paste}}
#' @examples 
#' txt_collapse(c(NA, "hello", "world", NA))
#' 
#' x <- list(a = c("h", "i"), b = c("some", "more", "text"), 
#'           c = character(), d = NA)
#' txt_collapse(x, collapse = " ")
txt_collapse <- function(x, collapse = " "){
  if(!is.list(x)){
    x <- as.character(x)
    x <- x[!is.na(x)]
    if(length(x) == 0){
      return(NA_character_)
    }else if(length(x) > 1){
      x <- paste(x, collapse = collapse)
    } 
  }else{
    x <- sapply(x, FUN = function(x, collapse){
      x <- as.character(x)
      x <- x[!is.na(x)]
      if(length(x) == 0){
        return(NA_character_)
      }else if(length(x) > 1){
        x <- paste(x, collapse = collapse)
      } 
      x
    }, collapse = collapse)
  }
  x
}
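
## Illustration, kept inside `if (FALSE)` so that nothing is executed when this file
## is sourced: as the code above is written, an input which is empty after dropping
## missing values collapses to NA_character_ rather than to an empty string.
if (FALSE) {
  txt_collapse(c(NA, NA))                      # NA, not ""
  txt_collapse(list(a = character(), b = NA))  # c(a = NA, b = NA)
}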

#' @title Concatenate strings with options on how to handle missing data
#' @description An NA-friendly way of concatenating strings
#' @param ... character vectors
#' @param collapse a character string to be used to paste the vectors together. Defaults to a space: ' '.
#' @param na.rm logical, if set to \code{TRUE}, will replace NA with ''. If set to \code{FALSE}, will have a resulting value of NA
#' if at least one element is \code{NA}, in a similar spirit as \code{mean}. Defaults to \code{FALSE}.
#' @return a character vector
#' @export
#' @seealso \code{\link{paste}}
#' @examples 
#' x <- c(1, 2, 3, NA, NA)
#' y <- c("a", "b", "c", NA, "OK")
#' paste(x, y, sep = "-")
#' txt_paste(x, y, collapse = "-", na.rm = TRUE)
#' txt_paste(x, y, collapse = "-", na.rm = FALSE)
#' 
#' x <- c(NA, "a", "b")
#' y <- c("1", "2", NA)
#' z <- c("-", "*", NA)
#' txt_paste(x, y, z, collapse = "", na.rm = TRUE)
#' txt_paste(x, y, z, "_____", collapse = "", na.rm = TRUE)
#' txt_paste(x, y, z, "_____", collapse = "", na.rm = FALSE)
txt_paste <- function(..., collapse = " ", na.rm = FALSE){
  x <- data.frame(list(...), stringsAsFactors = FALSE)
  if(na.rm){
    apply(x, MARGIN = 1, FUN = function(x){
      x <- x[!is.na(x)]
      if(length(x) == 0){
        return(NA_character_)
      }
      paste(x, collapse = collapse)
    })    
  }else{
    apply(x, MARGIN = 1, FUN = function(x){
      if(anyNA(x)){
        return(NA_character_)
      }
      paste(x[!is.na(x)], collapse = collapse)   
    })
  }
}


#' @title Look up multiple patterns and indicate their presence in text
#' @description A variant of \code{\link{grepl}} which allows you to specify multiple regular expressions
#' and to combine the results of these into one logical vector.\cr 
#' You can specify how to combine the results of the regular expressions by specifying
#' an aggregate function like \code{\link{all}}, \code{\link{any}}, \code{\link{sum}}.
#' @param x a character vector 
#' @param pattern a character vector containing one or several regular expressions
#' @param FUN a function to apply to combine the results of the different regular expressions for each element of \code{x}. 
#' Defaults to \code{\link{all}}.
#' @param ignore.case passed on to \code{\link{grepl}}
#' @param perl passed on to \code{\link{grepl}}
#' @param fixed passed on to \code{\link{grepl}}
#' @param useBytes passed on to \code{\link{grepl}}
#' @param ... further arguments passed on to \code{FUN}
#' @return a vector with the same length as \code{x}, containing for each element of \code{x} the result of \code{FUN} applied to the outcomes of \code{\link{grepl}} for all patterns
#' @export
#' @seealso \code{\link{grepl}}
#' @examples 
#' x <- c("--A--", "--B--", "--ABC--", "--AC--", "Z")
#' txt_grepl(x, pattern = c("A", "C"), FUN = all)
#' txt_grepl(x, pattern = c("A", "C"), FUN = any)
#' txt_grepl(x, pattern = c("A", "C"), FUN = sum)
#' data.frame(x = x, 
#'            A_and_C = txt_grepl(x, pattern = c("A", "C"), FUN = all),
#'            A_or_C  = txt_grepl(x, pattern = c("A", "C"), FUN = any),
#'            A_C_n   = txt_grepl(x, pattern = c("A", "C"), FUN = sum))
#' txt_grepl(x, pattern = "A|C")
txt_grepl <- function(x, pattern, FUN = all, ignore.case = FALSE, perl = FALSE,
                      fixed = FALSE, useBytes = FALSE, ...){
  found <- lapply(pattern, FUN = function(onepattern, x, ignore.case, perl, fixed, useBytes){
    grepl(x = x, pattern = onepattern, ignore.case = ignore.case, perl = perl, fixed = fixed, useBytes = useBytes)
  }, x = x, ignore.case = ignore.case, perl = perl, fixed = fixed, useBytes = useBytes)
  found <- matrix(unlist(found), nrow = length(x))
  apply(found, MARGIN = 1, FUN = FUN, ...)
}
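
## Illustration (non-executing block): FUN can be any function that summarises the
## per-pattern logical results row-wise, e.g. mean() gives the fraction of the
## supplied patterns found in each element.
if (FALSE) {
  txt_grepl(c("--A--", "--ABC--", "Z"), pattern = c("A", "C"), FUN = mean)
  #> [1] 0.5 1.0 0.0
}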




#' @title Boilerplate function to sample one or more elements from a vector.
#' @description Boilerplate function to sample one or more elements from a vector.
#' @param x a vector
#' @param na.exclude logical indicating to remove NA values before taking a sample
#' @param n integer indicating the number of items to sample from \code{x}. Defaults to 1.
#' @return \code{n} elements sampled from the vector \code{x}
#' @export
#' @seealso \code{\link{sample.int}}
#' @examples 
#' txt_sample(c(NA, "hello", "world", NA))
txt_sample <- function(x, na.exclude = TRUE, n = 1){
  if(na.exclude){
    x <- x[!is.na(x)]
  }
  x[sample.int(length(x), size = n)]
}
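
## Illustration (non-executing block): with n larger than 1 several elements are
## returned; note there is no guard against n exceeding the number of remaining
## elements, in which case sample.int() itself will raise an error.
if (FALSE) {
  txt_sample(c(NA, "hello", "world", NA), n = 2)
  txt_sample(month.name, n = 3)
}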

#' @title Boilerplate function to cat only 1 element of a character vector.
#' @description Boilerplate function to cat only 1 element of a character vector.
#' @param x a character vector
#' @return invisible
#' @export
#' @seealso \code{\link{txt_sample}}
#' @examples 
#' txt_show(c("hello \n\n\n world", "world \n\n\n hello"))
txt_show <- function(x){
  x <- txt_sample(x)
  cat(x, sep = "\n")
  invisible()
}


#' @title Recode text to other categories
#' @description Recode text to other categories. 
#' Values of \code{x} which correspond to \code{from[i]} will be recoded to \code{to[i]}
#' @param x a character vector
#' @param from a character vector with values of \code{x} which you want to recode
#' @param to a character vector with the values you want to recode to, where values of \code{x}
#' which correspond to \code{from[i]} will be replaced by \code{to[i]}
#' @param na.rm logical, if set to TRUE, will set all values of \code{x} which have no
#' matching value in \code{from} to NA. Defaults to \code{FALSE}
#' @return a character vector of the same length as \code{x} where values of \code{x}
#' which are given in \code{from} will be replaced by the corresponding element in \code{to}
#' @seealso \code{\link{match}}
#' @export
#' @examples 
#' x <- c("NOUN", "VERB", "NOUN", "ADV")
#' txt_recode(x = x,
#'            from = c("VERB", "ADV"),
#'            to = c("conjugated verb", "adverb"))
#' txt_recode(x = x,
#'            from = c("VERB", "ADV"),
#'            to = c("conjugated verb", "adverb"),
#'            na.rm = TRUE)
#' txt_recode(x = x,
#'            from = c("VERB", "ADV", "NOUN"),
#'            to = c("conjugated verb", "adverb", "noun"),
#'            na.rm = TRUE)
txt_recode <- function(x, from = c(), to = c(), na.rm = FALSE){
  if(length(x) == 0){
    return(x)
  }
  stopifnot(length(from) == length(to))
  if(na.rm){
    return(recode(x = x, from = from, to = to))
  }
  nongiven <- unique(x[!is.na(x)])
  nongiven <- setdiff(nongiven, from)
  if(length(nongiven) > 0) {
    from <- append(x = from, values = nongiven)
    to   <- append(x = to, values = nongiven)
  }
  to[match(x, from)]
}

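## Internal helper used by txt_recode() when na.rm = TRUE: values of x without a
## counterpart in `from` become NA because match() returns NA for them.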
recode <- function(x, from, to){
  to[match(x, from)]
}


#' @title Based on a vector with a word sequence, get n-grams (looking forward)
#' @description If you have annotated your text using \code{\link{udpipe_annotate}},
#' your text is tokenised in a sequence of words. Based on this vector of words in sequence
#' getting n-grams comes down to looking at the next word and the word after that, and so forth.
#' These words can be \code{pasted} together to form an n-gram containing
#' the current word, the next word, the word after that, ...
#' @param x a character vector where each element is just 1 term or word
#' @param n an integer indicating the ngram. A value of 1 will keep \code{x} as is, a value of 2 will
#' append the next term to the current term, a value of 3 will append the next term and the term
#' following that term to the current term
#' @param sep a character element indicating how to \code{\link{paste}} the subsequent words together
#' @return a character vector of the same length as \code{x} with the n-grams
#' @seealso \code{\link{paste}}, \code{\link[data.table]{shift}}
#' @export
#' @examples 
#' x <- sprintf("%s%s", LETTERS, 1:26)
#' txt_nextgram(x, n = 2)
#' 
#' data.frame(words = x,
#'            bigram = txt_nextgram(x, n = 2),
#'            trigram = txt_nextgram(x, n = 3, sep = "-"),
#'            quatrogram = txt_nextgram(x, n = 4, sep = ""),
#'            stringsAsFactors = FALSE)
#' 
#' x <- c("A1", "A2", "A3", NA, "A4", "A5")
#' data.frame(x, 
#'            bigram = txt_nextgram(x, n = 2, sep = "_"),
#'            stringsAsFactors = FALSE)
txt_nextgram <- function(x, n = 2, sep = " "){
  n <- as.integer(n)
  stopifnot(n >= 1)
  if(n == 1){
    return(x)
  }
  nextel <- n - 1L
  out <- list()
  out[[1]] <- x
  idx <- is.na(out[[1]])
  for(i in 1:nextel){
    out[[i+1]] <- txt_next(x, n = i)
    idx <- idx | is.na(out[[i+1]])
  }
  out$sep <- sep
  out <- do.call(paste, out)
  out[max((length(out)-(nextel-1L)), 1L):length(out)] <- NA
  out[idx] <- NA
  out
}
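
## Illustration (non-executing block): the last n - 1 positions of the result are
## NA because no following token is available there to complete the n-gram.
if (FALSE) {
  txt_nextgram(c("brown", "fox", "jumps"), n = 2)  # "brown fox" "fox jumps" NA
  txt_nextgram(c("brown", "fox", "jumps"), n = 3)  # "brown fox jumps" NA NA
}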


#' @title Based on a vector with a word sequence, get n-grams (looking backward)
#' @description If you have annotated your text using \code{\link{udpipe_annotate}},
#' your text is tokenised in a sequence of words. Based on this vector of words in sequence
#' getting n-grams comes down to looking at the previous word and the word before that, and so forth.
#' These words can be \code{pasted} together to form an n-gram containing
#' the second previous word, the previous word and the current word ...
#' @param x a character vector where each element is just 1 term or word
#' @param n an integer indicating the ngram. A value of 1 will keep \code{x} as is, a value of 2 will
#' prepend the previous term to the current term, a value of 3 will prepend the second previous term
#' and the previous term to the current term
#' @param sep a character element indicating how to \code{\link{paste}} the subsequent words together
#' @return a character vector of the same length as \code{x} with the n-grams
#' @seealso \code{\link{paste}}, \code{\link[data.table]{shift}}
#' @export
#' @examples 
#' x <- sprintf("%s%s", LETTERS, 1:26)
#' txt_previousgram(x, n = 2)
#' 
#' data.frame(words = x,
#'            bigram = txt_previousgram(x, n = 2),
#'            trigram = txt_previousgram(x, n = 3, sep = "-"),
#'            quatrogram = txt_previousgram(x, n = 4, sep = ""),
#'            stringsAsFactors = FALSE)
#' 
#' x <- c("A1", "A2", "A3", NA, "A4", "A5")
#' data.frame(x, 
#'            bigram = txt_previousgram(x, n = 2, sep = "_"),
#'            stringsAsFactors = FALSE)
txt_previousgram <- function(x, n = 2, sep = " "){
  n <- as.integer(n)
  stopifnot(n >= 1)
  if(n == 1){
    return(x)
  }
  nextel <- n - 1L
  out <- list()
  out[[n]] <- x
  idx <- is.na(out[[n]])
  for(i in 1:nextel){
    out[[n-i]] <- txt_previous(x, n = i)
    idx <- idx | is.na(out[[n-i]])
  }
  out$sep <- sep
  out <- do.call(paste, out)
  #out[max((length(out)-(nextel-1L)), 1L):length(out)] <- NA
  out[idx] <- NA
  out
}

#' @title Based on a vector with a word sequence, get n-grams (looking forward + backward)
#' @description If you have annotated your text using \code{\link{udpipe_annotate}},
#' your text is tokenised in a sequence of words. Based on this vector of words in sequence
#' getting n-grams comes down to looking at the previous/next word and the word before/after that, and so forth.
#' These words can be \code{pasted} together to form an n-gram.
#' @param x a character vector where each element is just 1 term or word
#' @param n an integer vector indicating how many terms to look back and ahead
#' @param sep a character element indicating how to \code{\link{paste}} the subsequent words together
#' @param na.rm logical, if set to \code{TRUE}, will keep all text even if it cannot look back/ahead the amount specified by \code{n}. 
#' If set to \code{FALSE}, will have a resulting value of \code{NA}
#' if at least one element is \code{NA} or it cannot look back/ahead the amount specified by \code{n}.
#' @return a character vector of the same length as \code{x} with the n-grams
#' @seealso \code{\link{txt_paste}}, \code{\link{txt_next}}, \code{\link{txt_previous}}, \code{\link[data.table]{shift}}
#' @export
#' @examples 
#' x <- c("We", "walked", "anxiously", "to", "the", "doctor", "!")
#' 
#' ## Look 1 word before + word itself
#' y <- txt_context(x, n = c(-1, 0), na.rm = FALSE)
#' data.frame(x, y)
#' ## Look 1 word before + word itself + 1 word after
#' y <- txt_context(x, n = c(-1, 0, 1), na.rm = FALSE)
#' data.frame(x, y)
#' y <- txt_context(x, n = c(-1, 0, 1), na.rm = TRUE)
#' data.frame(x, y)
#' 
#' ## Look 2 words before + word itself + 1 word after 
#' ## even if not all words are there
#' y <- txt_context(x, n = c(-2, -1, 0, 1), na.rm = TRUE, sep = "_")
#' data.frame(x, y)
#' y <- txt_context(x, n = c(-2, -1, 1, 2), na.rm = FALSE, sep = "_")
#' data.frame(x, y)
#' 
#' x <- c("We", NA, NA, "to", "the", "doctor", "!")
#' y <- txt_context(x, n = c(-1, 0), na.rm = FALSE)
#' data.frame(x, y)
#' y <- txt_context(x, n = c(-1, 0), na.rm = TRUE)
#' data.frame(x, y)
#' 
#' library(data.table)
#' data(brussels_reviews_anno, package = "udpipe")
#' x      <- as.data.table(brussels_reviews_anno)
#' x      <- subset(x, doc_id %in% txt_sample(unique(x$doc_id), n = 10))
#' x      <- x[, context := txt_context(lemma), by = list(doc_id, sentence_id)]
#' head(x, 20)
#' x$term <- sprintf("%s/%s", x$lemma, x$upos)
#' x      <- x[, context := txt_context(term), by = list(doc_id, sentence_id)]
#' head(x, 20)
txt_context <- function(x, n = c(-1, 0, 1), sep = " ", na.rm = FALSE){
  context <- lapply(n, FUN=function(i){
    if(i >= 0){
      out <- txt_next(x, n = i)
    }else{
      out <- txt_previous(x, n = -i)
    }
    out
  })
  context$collapse <- sep
  context$na.rm <- na.rm
  do.call(txt_paste, context)
}

#' @title Get the n-th previous element of a vector
#' @description Get the n-th previous element of a vector
#' @param x a character vector where each element is just 1 term or word
#' @param n an integer indicating how far to look back. Defaults to 1.
#' @return a character vector of the same length as \code{x} with the previous element
#' @seealso \code{\link[data.table]{shift}}
#' @export
#' @examples 
#' x <- sprintf("%s%s", LETTERS, 1:26)
#' txt_previous(x, n = 1)
#' 
#' data.frame(word = x,
#'            word_previous1 = txt_previous(x, n = 1),
#'            word_previous2 = txt_previous(x, n = 2),
#'            stringsAsFactors = FALSE)
txt_previous <- function(x, n = 1){
  n <- as.integer(n)
  stopifnot(n >= 0)
  data.table::shift(x, n = n, type = "lag")
}

#' @title Get the n-th next element of a vector
#' @description Get the n-th next element of a vector
#' @param x a character vector where each element is just 1 term or word
#' @param n an integer indicating how far to look ahead. Defaults to 1.
#' @return a character vector of the same length as \code{x} with the next element
#' @seealso \code{\link[data.table]{shift}}
#' @export
#' @examples 
#' x <- sprintf("%s%s", LETTERS, 1:26)
#' txt_next(x, n = 1)
#' 
#' data.frame(word = x,
#'            word_next1 = txt_next(x, n = 1),
#'            word_next2 = txt_next(x, n = 2),
#'            stringsAsFactors = FALSE)
txt_next <- function(x, n = 1){
  n <- as.integer(n)
  stopifnot(n >= 0)
  data.table::shift(x, n = n, type = "lead")
}
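
## Illustration (non-executing block): both txt_previous() and txt_next() accept
## n = 0, in which case data.table::shift() leaves the vector untouched; txt_context()
## relies on this to include the current word via n = 0.
if (FALSE) {
  txt_next(LETTERS, n = 0)      # returns LETTERS unchanged
  txt_previous(LETTERS, n = 0)  # returns LETTERS unchanged
}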



#' @title Frequency statistics of elements in a vector
#' @description Frequency statistics of elements in a vector
#' @param x a vector
#' @param exclude a vector of values to exclude from the frequency table. Defaults to NA and NaN.
#' @param order logical indicating to order the resulting dataset in order of frequency. Defaults to TRUE.
#' @return a data.frame with columns key, freq and freq_pct indicating how 
#' many times each value of the vector \code{x} is occurring
#' @export
#' @examples 
#' x <- sample(LETTERS, 1000, replace = TRUE)
#' txt_freq(x)
#' x <- factor(x, levels = LETTERS)
#' txt_freq(x, order = FALSE)
txt_freq <- function(x, exclude = c(NA, NaN), order=TRUE){
  if(inherits(x, "data.frame")){
    stop("x should be a vector")
  }
  default <- data.frame(key = character(), freq = integer(), freq_pct = numeric(), stringsAsFactors = FALSE)
  
  if(is.factor(x)){
    ## For factors, we want to keep all factor levels, even if they don't appear in the data
    tab <- table(x, exclude = exclude)
    if(length(tab) == 0){
      return(default)
    }
    tab <- as.data.frame.table(tab, responseName = "freq")
    setnames(tab, old = colnames(tab)[1], new = "key")
    if(is.factor(tab$key)){
      tab$key <- as.character(tab$key)
    }
    tab$freq_pct <- 100 * tab$freq / sum(tab$freq)
    if(order){
      tab <- tab[order(tab$freq, decreasing=TRUE), ]  
    }
    rownames(tab) <- NULL
    return(tab)
  }else{
    ## For other non-factor data like characters, just calculate how many times each key is occurring
    .N <- key <- freq <- freq_pct <- NULL
    if(length(x) == 0){
      return(default)
    }
    x <- data.table(content = x)
    x <- setnames(x, old = "content", new = "key")
    x <- x[, list(freq = .N), by = list(key)]
    x <- x[, freq_pct := 100 * freq / sum(freq)]
    x <- x[!(key %in% exclude), ]
    if(order){
      x <- setorder(x, -freq)
    }
    x <- setDF(x)
    return(x)
  }
}
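
## Illustration (non-executing block): for non-factor input the excluded values
## (NA and NaN by default) are dropped from the output, but freq_pct is computed
## before they are dropped, so the percentages need not sum to 100 when missing
## values are present.
if (FALSE) {
  txt_freq(c("a", "a", "b", NA))
  #>   key freq freq_pct
  #> 1   a    2       50
  #> 2   b    1       25
}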


#' @title Highlight words in a character vector
#' @description Highlight words in a character vector. The words provided in \code{terms} are 
#' highlighted in the text by wrapping them in the following character: |.
#' So 'I like milk and sugar in my coffee' would give 'I like |milk| and sugar in my coffee' if you 
#' want to highlight the word milk
#' @param x a character vector with text
#' @param terms a vector of words to highlight which appear in \code{x}
#' @return A character vector of the same length as \code{x} where the terms provided in \code{terms}
#' are put in between | characters to highlight them
#' @export
#' @examples 
#' x <- "I like milk and sugar in my coffee."
#' txt_highlight(x, terms = "sugar")
#' txt_highlight(x, terms = c("milk", "my"))
txt_highlight <- function(x, terms){
  terms <- paste(terms, collapse = "|")
  gsub(pattern = sprintf("(%s)", terms), replacement = "\\|\\1\\|", x = x, fixed = FALSE)
}
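
## Illustration (non-executing block): the terms are pasted into a single regular
## expression (fixed = FALSE), so every substring match is wrapped and regex
## metacharacters in `terms` keep their special meaning.
if (FALSE) {
  txt_highlight("I like milk and milkshakes", terms = "milk")
  #> [1] "I like |milk| and |milk|shakes"
}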



#' @title Recode words with compound multi-word expressions
#' @description Replace tokens in a character vector of tokens with compound multi-word expressions,
#' so that \code{c("New", "York")} will become \code{c("New York", NA)}. 
#' @param x a character vector of words where you want to replace tokens with compound multi-word expressions.
#' This is generally a character vector as returned by the token column of \code{as.data.frame(udpipe_annotate(txt))}
#' @param compound a character vector of compound multi-word expressions indicating terms which can be considered as one word. 
#' For example \code{c('New York', 'Brussels Hoofdstedelijk Gewest')}.
#' @param ngram an integer vector of the same length as \code{compound} indicating how many terms there are in the specific compound multi-word expressions
#' given by \code{compound}, where \code{compound[i]} contains \code{ngram[i]} words. 
#' So if \code{compound} is \code{c('New York', 'Brussels Hoofdstedelijk Gewest')}, the ngram would be \code{c(2, 3)}
#' @param sep separator used when the compounds were constructed by combining the words together into a compound multi-word expression. Defaults to a space: ' '.
#' @return the same character vector \code{x} where elements of \code{x} will be replaced by compound multi-word expressions. 
#' It will give preference to replacing with compounds with higher ngrams if these occur. See the examples.
#' @export
#' @seealso \code{\link{txt_nextgram}}
#' @examples 
#' x <- c("I", "went", "to", "New", "York", "City", "on", "holiday", ".")
#' y <- txt_recode_ngram(x, compound = "New York", ngram = 2, sep = " ")
#' data.frame(x, y)
#' 
#' keyw <- data.frame(keyword = c("New-York", "New-York-City"), ngram = c(2, 3))
#' y <- txt_recode_ngram(x, compound = keyw$keyword, ngram = keyw$ngram, sep = "-")
#' data.frame(x, y)
#' 
#' ## Example replacing adjectives followed by a noun with the full compound word
#' data(brussels_reviews_anno)
#' x <- subset(brussels_reviews_anno, language == "nl")
#' keyw <- keywords_phrases(x$xpos, term = x$token, pattern = "JJNN", 
#'                          is_regex = TRUE, detailed = FALSE)
#' head(keyw)
#' x$term <- txt_recode_ngram(x$token, compound = keyw$keyword, ngram = keyw$ngram)
#' head(x[, c("token", "term", "xpos")], 12)
txt_recode_ngram <- function(x, compound, ngram, sep = " "){
  ngram <- as.integer(ngram)
  
  if(length(ngram) != 1){
    stopifnot(length(ngram) == length(compound))
    keywords <- data.frame(keyword = compound, ngram = ngram, stringsAsFactors = FALSE)
    
    ## Loop over all ngrams, first replace the keywords with the most number of words in there
    ngrams <- unique(keywords$ngram)
    ngrams <- sort(ngrams, decreasing = TRUE)
    for(i in ngrams){
      x <- txt_recode_ngram(x, compound = keywords$keyword[keywords$ngram == i], ngram = i, sep = sep)
    }
  }else{
    keywords <- as.character(compound)
    if(length(keywords) == 0){
      return(x)
    }
    y <- txt_nextgram(x, n = ngram, sep = sep)
    idx <- which(y %in% keywords)
    ## Overwrite word with bigram/trigram/n-gram
    x[idx] <- y[idx]
    ## Set the next values to NA
    size <- length(x)
    if(ngram > 1){
      for (i in 1:(ngram - 1)) {
        loc <- idx + i
        loc <- loc[loc <= size]
        if(length(loc) > 0){
          x[loc] <- NA_character_
        }
      }
    }
  }
  x
}


#' @title Identify a contiguous sequence of tags as being 1 entity 
#' @description 
#' This function allows you to identify contiguous sequences of text which have the same label or 
#' which follow the IOB scheme.\cr 
#' Named Entity Recognition or Chunking frequently follows the IOB tagging scheme 
#' where "B" means the token begins an entity, "I" means it is inside an entity,
#' "E" means it is the end of an entity and "O" means it is not part of an entity. 
#' An example of such an annotation would be 'New', 'York', 'City', 'District' which can be tagged as 
#' 'B-LOC', 'I-LOC', 'I-LOC', 'E-LOC'.\cr
#' The function looks for such sequences which start with 'B-LOC' and combines all subsequent 
#' labels of the same tagging group into 1 category. This sequence of words also gets a unique identifier such 
#' that the terms 'New', 'York', 'City', 'District' would get the same sequence identifier.
#' @param x a character vector of categories in the order of occurrence (e.g. B-LOC, I-LOC, I-PER, B-PER, O, O, B-PER)
#' @param entities a list of groups, where each list element contains
#' \itemize{
#'  \item{start: }{A length 1 character string with the start element identifying a sequence start. E.g. 'B-LOC'}
#'  \item{labels: }{A character vector containing all the elements which are considered being part of a same labelling sequence, including the starting element. 
#'  E.g. \code{c('B-LOC', 'I-LOC', 'E-LOC')}}
#' }
#' The list name of the group defines the label that will be assigned to the entity. If \code{entities} is not provided each possible value of \code{x}
#' is considered an entity. See the examples.
#' @return a list with elements \code{entity_id} and \code{entity} where 
#' \itemize{
#'  \item{entity is a character vector of the same length as \code{x} containing entities, 
#'        constructed by recoding \code{x} to \code{names(entities)}}
#'  \item{entity_id is an integer vector of the same length as \code{x} containing unique identifiers identifying the compound label sequence such that 
#'        e.g. the sequence 'B-LOC', 'I-LOC', 'I-LOC', 'E-LOC' (New York City District) would get the same \code{entity_id} identifier.}
#' }
#' See the examples.
#' @export
#' @examples 
#' x <- data.frame(
#'   token = c("The", "chairman", "of", "the", "Nakitoma", "Corporation", 
#'            "Donald", "Duck", "went", "skiing", 
#'             "in", "the", "Niagara", "Falls"),
#'   upos = c("DET", "NOUN", "ADP", "DET", "PROPN", "PROPN", 
#'            "PROPN", "PROPN", "VERB", "VERB", 
#'            "ADP", "DET", "PROPN", "PROPN"),
#'   label = c("O", "O", "O", "O", "B-ORG", "I-ORG", 
#'             "B-PERSON", "I-PERSON", "O", "O", 
#'             "O", "O", "B-LOCATION", "I-LOCATION"), stringsAsFactors = FALSE)
#' x[, c("sequence_id", "group")] <- txt_tagsequence(x$upos)
#' x
#' 
#' ##
#' ## Define entity groups following the IOB scheme
#' ## and combine B-LOC I-LOC I-LOC sequences as 1 group (e.g. New York City) 
#' groups <- list(
#'  Location = list(start = "B-LOC", labels = c("B-LOC", "I-LOC", "E-LOC")),
#'  Organisation =  list(start = "B-ORG", labels = c("B-ORG", "I-ORG", "E-ORG")),
#'  Person = list(start = "B-PER", labels = c("B-PER", "I-PER", "E-PER")), 
#'  Misc = list(start = "B-MISC", labels = c("B-MISC", "I-MISC", "E-MISC")))
#' x[, c("entity_id", "entity")] <- txt_tagsequence(x$label, groups)
#' x
txt_tagsequence <- function(x, entities){
  stopifnot(is.character(x))
  if(missing(entities)){
    entities <- unique(x)
    names(entities) <- entities
    entities <- lapply(entities, FUN=function(x) list(start = x, labels = x))
    iob <- FALSE
  }else{
    iob <- TRUE
  }
  ## Data checks
  if(!all(sapply(entities, FUN=function(x) "start" %in% names(x) & "labels" %in% names(x)))){
    stop("entities should be list with elements start and labels")
  }
  
  ## Some data preparation on the entity groups
  starts_with <- sapply(entities, FUN=function(x) x$start)
  restentities <- lapply(entities, FUN=function(x) setdiff(x$labels, x$start))
  names(restentities) <- starts_with
  
  ## START
  newgroup <- rep(TRUE, length(x))
  x_prev <- txt_previous(x, n = 1)
  ## if the current label is the same as the previous label, treat both as part of 1 group, unless (in the IOB case) the current label is a sequence-starting category
  if(iob){
    newgroup[which(x == x_prev & !x %in% starts_with)] <- FALSE  
  }else{
    newgroup[which(x == x_prev)] <- FALSE
  }
  ## if the current label is a continuation label (not a start label), check whether it continues the entity group started by the previous label
  idx <- x %in% unlist(restentities)
  if(sum(idx) > 0){
    is_same_grp <- mapply(x[idx], x_prev[idx], FUN=function(current, previous){
      current %in% restentities[[previous]]
    })
    newgroup[idx][is_same_grp] <- FALSE  
  }
  
  ## Create a new ID and a label
  out <- list()
  out$entity_id <- cumsum(newgroup)
  out$entity <- txt_recode(x = x, 
                           from = unlist(lapply(entities, FUN=function(x) x$labels)), 
                           to = rep(names(entities), sapply(entities, FUN=function(x) length(x$labels))))
  out
}


#' @title Check if text contains a certain pattern
#' @description Look up text which has a certain pattern. This pattern lookup is performed by executing a regular expression using \code{\link{grepl}}.
#' @param x a character vector with text
#' @param patterns a regular expression which might be contained in \code{x}, a vector of these
#' or a list of pattern elements where the list elements \code{include} and \code{exclude} indicate to find a pattern in \code{x}
#' while excluding elements which have another pattern
#' @param value logical, indicating to return the elements of \code{x} where the pattern was found or just a logical vector. Defaults to FALSE indicating to return a logical.
#' @param ignore.case logical, if set to \code{FALSE}, the pattern matching is case sensitive and if TRUE, case is ignored during matching. Passed on to \code{\link{grepl}}
#' @param ... other parameters which can be passed on to \code{\link{grepl}} e.g. fixed/perl/useBytes
#' @export
#' @seealso \code{\link{grepl}}
#' @return  
#' a logical vector of the same length as \code{x} indicating if one of the patterns was found in \code{x}.\cr 
#' Or the vector of elements of \code{x} where the pattern was found in case argument \code{value} is set to \code{TRUE}
#' @examples
#' x <- c("The cats are eating catfood", 
#'        "Our cat is eating the catfood", 
#'        "the dog eats catfood, he likes it", 
#'        NA)
#' txt_contains(x, patterns = c("cat", "dog")) 
#' txt_contains(x, patterns = c("cat", "dog"), value = TRUE) 
#' txt_contains(x, patterns = c("eats"), value = TRUE) 
#' txt_contains(x, patterns = c("^The"), ignore.case = FALSE, value = TRUE) 
#' txt_contains(x, patterns = list(include = c("cat"), exclude = c("dog")), 
#'              value = TRUE) 
#' txt_contains(x, "cat") & txt_contains(x, "dog")
txt_contains <- function(x, patterns, value = FALSE, ignore.case = TRUE, ...){
  
  if(is.list(patterns)){
    include <- rep_len(FALSE, length(x))
    exclude <- rep_len(FALSE, length(x))
    for(pattern in patterns$include){
      include <- include | grepl(pattern = pattern, x = x, ignore.case = ignore.case, ...)
    }
    for(pattern in patterns$exclude){
      exclude <- exclude | grepl(pattern = pattern, x = x, ignore.case = ignore.case, ...)
    }
    result <- include & !exclude
  }else{
    result <- rep_len(FALSE, length(x))
    for(pattern in patterns){
      result <- result | grepl(pattern = pattern, x = x, ignore.case = ignore.case, ...)
    }
  }
  if(value == TRUE){
    result <- x[result]
  }else{
    idx <- which(is.na(x))
    result[idx] <- as.logical(NA)
  }
  result
}


#' @title Count the number of times a pattern is occurring in text
#' @description Count the number of times a pattern is occurring in text. 
#' Pattern counting is performed by executing a regular expression using \code{\link{gregexpr}} and 
#' checking how many times the regular expression occurs.
#' @param x a character vector with text
#' @param pattern a text pattern which might be contained in \code{x}
#' @param ... other arguments, passed on to \code{\link{gregexpr}}
#' @return an integer vector of the same length as \code{x} indicating how many times the pattern is occurring in \code{x}
#' @export
#' @examples 
#' x <- c("abracadabra", "ababcdab", NA)
#' txt_count(x, pattern = "ab")
#' txt_count(x, pattern = "AB", ignore.case = TRUE)
#' txt_count(x, pattern = "AB", ignore.case = FALSE)
txt_count <- function(x, pattern, ...){
  result <- gregexpr(pattern = pattern, text = x, ...)
  result <- sapply(result, FUN = function(x){
    test <- length(x) == 1 && x < 0
    if(is.na(test)){
      return(NA_integer_) 
    }
    if(test){
      0L
    }else{
      length(x)
    }
  }, USE.NAMES = FALSE)
  as.integer(result)
}

#' @title Get the overlap between 2 vectors
#' @description Get the overlap between 2 vectors
#' @param x a vector
#' @param y a vector
#' @return a vector with elements of \code{x} which are also found in \code{y}
#' @export
#' @examples 
#' x <- c("a", "b", "c")
#' y <- c("b", "c", "e", "z")
#' txt_overlap(x, y)
#' txt_overlap(y, x)
txt_overlap <- function(x, y){
  y[match(x, y, nomatch = 0L)]
}

#' @title Create a unique identifier for each combination of fields in a data frame
#' @description Create a unique identifier for each combination of fields in a data frame. 
#' This unique identifier is unique for each combination of the elements of the fields. 
#' The generated identifier is like a primary key or a secondary key on a table.
#' This is just a small wrapper around \code{\link[data.table]{frank}}
#' @param x a data.frame
#' @param fields a character vector of columns from \code{x}
#' @param start_from integer number indicating to start from that number onwards
#' @return an integer vector of the same length as the number of rows in \code{x} 
#' containing the unique identifier
#' @export
#' @examples 
#' data(brussels_reviews_anno)
#' x <- brussels_reviews_anno
#' x$doc_sent_id <- unique_identifier(x, fields = c("doc_id", "sentence_id"))
#' head(x, 15)
#' range(x$doc_sent_id)
#' x$doc_sent_id <- unique_identifier(x, fields = c("doc_id", "sentence_id"), start_from = 10)
#' head(x, 15)
#' range(x$doc_sent_id)
unique_identifier <- function(x, fields, start_from = 1L){
  id <- data.table::frankv(x, cols = fields, ties.method = "dense")
  if(!missing(start_from)){
    id <- id - 1L + as.integer(start_from)
  }
  id
}
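
## Illustration (non-executing block): the identifiers follow the dense rank of the
## sorted field combinations, not the order in which the combinations first appear
## in the data.
if (FALSE) {
  d <- data.frame(doc_id = c("b", "a", "a", "b"), sentence_id = c(1, 1, 2, 1))
  unique_identifier(d, fields = c("doc_id", "sentence_id"))
  #> [1] 3 1 2 3
}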


#' @title Concatenate text of each group of data together
#' @description This function is similar to \code{\link{paste}}
#' but works on a data.frame, hence paste.data.frame. 
#' It concatenates text belonging to groups of data together in one string. 
#' The function is the inverse operation of \code{\link{strsplit.data.frame}}.
#' @param data a data.frame or data.table
#' @param term a string with a column name or a character vector of column names from \code{data} which you want to concatenate together using \code{\link{paste}}
#' @param group a string with a column name or a character vector of column names from \code{data} indicating identifiers of groups. 
#' The text in \code{term} will be concatenated by group.
#' @param collapse a character string that you want to use to collapse the text data together. 
#' Defaults to a single space.
#' @return A data.frame with 1 row per group containing the columns from \code{group} and \code{term} 
#' where all the text in \code{term} for each group will be \code{\link{paste}-d} together, separated by the \code{collapse} argument.
#' @seealso \code{\link{strsplit.data.frame}}, \code{\link{paste}}
#' @export
#' @examples 
#' data(brussels_reviews_anno, package = "udpipe")
#' head(brussels_reviews_anno)
#' x <- paste.data.frame(brussels_reviews_anno, 
#'                       term = "lemma", 
#'                       group = c("doc_id", "sentence_id"))
#' str(x)
#' x <- paste.data.frame(brussels_reviews_anno, 
#'                       term = c("lemma", "token"), 
#'                       group = c("doc_id", "sentence_id"), 
#'                       collapse = "-")
#' str(x)                       
paste.data.frame <- function(data, term, group, collapse=" "){
  .SDcols <- .SD <- NULL
  stopifnot(inherits(data, "data.frame"))
  stopifnot(inherits(term, "character"))
  stopifnot(inherits(group, "character"))
  stopifnot(all(c(term, group) %in% colnames(data)))
  if(inherits(data, "data.table")){
    x <- data
  }else{
    x <- data.table::as.data.table(data[, c(term, group)])  
  }
  x <- x[, lapply(.SD, FUN=function(x) paste(x, collapse = collapse)), by = group, .SDcols = term]
  x <- data.table::setDF(x)
  x
}

#' @title Obtain a tokenised data frame by splitting text using a regular expression 
#' @description Obtain a tokenised data frame by splitting text using a regular expression. 
#' This is the inverse operation of \code{\link{paste.data.frame}}.
#' @param data a data.frame or data.table
#' @param term a character with a column name from \code{data} which you want to split into tokens
#' @param group a string with a column name or a character vector of column names from \code{data} indicating identifiers of groups. 
#' The text in \code{term} will be split into tokens by group.
#' @param split a regular expression indicating how to split the \code{term} column. 
#' Defaults to splitting by spaces, punctuation symbols or digits. This will be passed on to \code{\link{strsplit}}.
#' @param ... further arguments passed on to \code{\link{strsplit}}
#' @return A tokenised data frame containing one row per token.\cr
#' This data.frame has the columns from \code{group} and \code{term} where the text in column \code{term}
#' will be split by the provided regular expression into tokens. 
#' @seealso \code{\link{paste.data.frame}}, \code{\link{strsplit}}
#' @export
#' @examples 
#' data(brussels_reviews, package = "udpipe")
#' x <- strsplit.data.frame(brussels_reviews, term = "feedback", group = "id")
#' head(x)
#' x <- strsplit.data.frame(brussels_reviews, 
#'                          term = c("feedback"), 
#'                          group = c("listing_id", "language"))
#' head(x)  
#' x <- strsplit.data.frame(brussels_reviews, term = "feedback", group = "id", 
#'                          split = " ", fixed = TRUE)
#' head(x)                          
strsplit.data.frame <- function(data, term, group, split = "[[:space:][:punct:][:digit:]]+", ...){
  .SDcols <- .SD <- NULL
  stopifnot(inherits(data, "data.frame"))
  stopifnot(inherits(term, "character"))
  stopifnot(inherits(group, "character"))
  stopifnot(all(c(term, group) %in% colnames(data)))
  if(length(term) > 1){
    warning("strsplit.data.frame is not intended to be called with several columns in term")
  }
  if(!inherits(data, "data.table")){
    data <- data.table::as.data.table(data[, c(term, group)])  
  }
  x <- data[, lapply(.SD, FUN=function(txt, ...){
    terms <- unlist(strsplit(txt, split = split, ...))
    terms <- as.character(terms)
    terms <- terms[!is.na(terms)]
    terms <- terms[nchar(terms) > 0]
    terms
  }, ...), by = group, .SDcols = term]
  x <- data.table::setDF(x)
  x
}
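
## Illustration (non-executing block): a small roundtrip on a toy data.frame showing
## that strsplit.data.frame() and paste.data.frame() act as inverse operations
## (up to the split/collapse characters used).
if (FALSE) {
  d <- data.frame(doc_id = c("d1", "d2"),
                  text   = c("hello world", "bye"),
                  stringsAsFactors = FALSE)
  tokens <- strsplit.data.frame(d, term = "text", group = "doc_id")
  tokens
  #>   doc_id  text
  #> 1     d1 hello
  #> 2     d1 world
  #> 3     d2   bye
  paste.data.frame(tokens, term = "text", group = "doc_id")
  #>   doc_id        text
  #> 1     d1 hello world
  #> 2     d2         bye
}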


#' @title Create a data.frame from a list of tokens
#' @description Create a data.frame from a list of tokens. 
#' @param x a list where the list elements are character vectors of tokens
#' @return the data of \code{x} converted to a data.frame.
#' This data.frame has columns doc_id and token where the doc_id is taken from the list names of x
#' and token contains the data of \code{x}
#' @export
#' @examples 
#' x <- setNames(c("some text here", "hi  there understand this?"), c("a", "b"))
#' x <- strsplit(x, split = " ")
#' x
#' unlist_tokens(x)
unlist_tokens <- function(x){
  stopifnot(is.list(x))
  doc_id <- rep(x = names(x), times = sapply(x, length))
  token  <- unlist(x, use.names = FALSE, recursive = FALSE)
  if(length(token) == 0){
    x <- data.frame(doc_id = character(), 
                    token = character(), 
                    stringsAsFactors = FALSE)
  }else{
    x <- data.frame(doc_id = doc_id, 
                    token = token, 
                    stringsAsFactors = FALSE)  
  }
  x
}
