R/nlp_morphological.R
In udpipe: Tokenization, Parts of Speech Tagging, Lemmatization and Dependency Parsing with the 'UDPipe' 'NLP' Toolkit

Documented in cbind_morphological

#' @title Add morphological features to an annotated dataset
#' @description The result of \code{\link{udpipe_annotate}} which is put into a data.frame 
#' returns a field called \code{feats} containing morphological features as defined at 
#' \url{https://universaldependencies.org/u/feat/index.html}. If there are several of these features,
#' these are concatenated with the \code{|} symbol. This function extracts each of these morphological 
#' features separately and adds these as extra columns to the data.frame
#' @param x a data.frame or data.table as returned by \code{as.data.frame(udpipe_annotate(...))}
#' @param term the name of the field in \code{x} which contains the morphological features. Defaults to 'feats'.
#' @param which a character vector with names of morphological features to uniquely parse out. These 
#' features are one of the 24 lexical and grammatical properties of words defined at \url{https://universaldependencies.org/u/feat/index.html}.
#' Possible values are:
#' \itemize{
#' \item{"lexical": "PronType", "NumType", "Poss", "Reflex", "Foreign", "Abbr", "Typo"}
#' \item{"inflectional_noun": "Gender", "Animacy", "NounClass", "Number", "Case", "Definite", "Degree"}
#' \item{"inflectional_verb": "VerbForm", "Mood", "Tense", "Aspect", "Voice", "Evident", "Polarity", "Person", "Polite", "Clusivity"}
#' }
#' See the examples.
#' @return \code{x} in the same order with extra columns added (at least the column has_morph is added indicating
#' if any morphological features are present and as well extra columns for each possible morphological feature in the data)
#' @export
#' @examples 
#' \dontrun{
#' udmodel <- udpipe_download_model(language = "english-ewt")
#' udmodel <- udpipe_load_model(file = udmodel$file_model)
#' x <- udpipe_annotate(udmodel, 
#'                      x = "The economy is weak but the outlook is bright")
#' x <- as.data.frame(x)
#' x <- cbind_morphological(x, term = "feats")
#' }
#' 
#' f <- system.file(package = "udpipe", "dummydata", "traindata.conllu")
#' x <- udpipe_read_conllu(f)
#' x <- cbind_morphological(x, term = "feats")
#' 
#' f <- system.file(package = "udpipe", "dummydata", "traindata.conllu")
#' x <- udpipe_read_conllu(f)
#' x <- cbind_morphological(x, term = "feats", 
#'                          which = c("Mood", "Gender", "VerbForm", "Polarity", "Polite"))
#' 
#' # extract all features from the feats column even if not present in the data
#' f <- system.file(package = "udpipe", "dummydata", "traindata.conllu")
#' x <- udpipe_read_conllu(f)
#' x <- cbind_morphological(x, term = "feats", 
#'                          which = c("lexical", "inflectional_noun", "inflectional_verb"))
cbind_morphological <- function(x, term = "feats", which){
  stopifnot(inherits(x, "data.frame"))
  stopifnot(length(term) == 1)
  stopifnot(term %in% colnames(x))
  oldfields <- colnames(x)
  out <- txt_morphological(x[[term]])
  out <- data.table::setnames(out, 
                              old = colnames(out), 
                              new = c("has_morph", sprintf("morph_%s", setdiff(colnames(out), "has_morph"))))
  newfields <- setdiff(colnames(out), c("has_morph", oldfields))
  if(inherits(x, "data.table")){
    x <- setDF(x)
    x[, colnames(out)] <- out  
    x <- setDT(x)
  }else if(inherits(x, "data.frame")){
    x[, colnames(out)] <- out  
  }
  
  if(!missing(which)){
    morpho_feats <- list()
    morpho_feats$lexical <- c("PronType", "NumType", "Poss", "Reflex", "Foreign", "Abbr", "Typo")
    morpho_feats$inflectional_noun <- c("Gender", "Animacy", "NounClass", "Number", "Case", "Definite", "Degree")
    morpho_feats$inflectional_verb <- c("VerbForm", "Mood", "Tense", "Aspect", "Voice", "Evident", "Polarity", "Person", "Polite", "Clusivity")
    if(any(!which %in% c("lexical", "inflectional_noun", "inflectional_verb"))){
      morpho_feats <- as.character(which)
    }else{
      morpho_feats <- c(unlist(morpho_feats[c("lexical", "inflectional_noun", "inflectional_verb") %in% which]), setdiff(which, c("lexical", "inflectional_noun", "inflectional_verb"))) 
    }
    morpho_feats <- tolower(morpho_feats)
    morpho_feats <- sprintf("morph_%s", morpho_feats)
    missing <- setdiff(morpho_feats, colnames(x))
    if(inherits(x, "data.table")){
      for(field in missing){
        x[, eval(field) := NA_character_]  
      }
      x <- x[, unique(c(oldfields, "has_morph", morpho_feats)), drop = FALSE]
      x <- data.table::setcolorder(x, neworder = unique(c(oldfields, "has_morph", morpho_feats)))
    }else if(inherits(x, "data.frame")){
      x <- data.table::setDT(x)
      for(field in missing){
        x[, eval(field) := NA_character_]  
      }
      x <- data.table::setDF(x)
      x <- x[, unique(c(oldfields, "has_morph", morpho_feats)), drop = FALSE]
      x <- data.table::setDT(x)
      x <- data.table::setcolorder(x, neworder = unique(c(oldfields, "has_morph", morpho_feats)))
      x <- data.table::setDF(x)
    }
  }
  x
}



txt_morphological <- function(x){
  ## R CMD check happiness
  id <- has_morph <- NULL
  
  ## Split field according to |, next put it into data.frame and reshape it to wide format
  ## Add an indicator has_morph indicating if morphological features are uberhaupt present
  morpho <- strsplit(x, split = "\\|")
  morpho <- lapply(morpho, FUN=function(x) list(feats = x))
  morpho <- data.table::rbindlist(morpho, idcol = "id")
  ## NA values in morpohological features are set to None=None
  morpho$feats[is.na(morpho$feats)] <- "has_morph=FALSE"
  morpho$key <- gsub("^(.+)=(.+)$", "\\1", morpho$feats)
  morpho$key <- tolower(morpho$key) 
  morpho$value <- gsub("^(.+)=(.+)$", "\\2", morpho$feats)
  morpho <- data.table::dcast.data.table(data = morpho, formula = id ~ key, value.var = "value")
  if("has_morph" %in% colnames(morpho)){
    morpho <- morpho[, has_morph := as.logical(has_morph)]
    morpho$has_morph[is.na(morpho$has_morph)] <- TRUE
  }else{
    morpho <- morpho[, has_morph := TRUE]
  }
  morpho <- morpho[, id := NULL]
  morpho <- data.table::setcolorder(morpho, c("has_morph", setdiff(colnames(morpho), "has_morph")))
  morpho <- data.table::setDF(morpho)
  morpho
}

Any scripts or data that you put into this service are public.

udpipe documentation built on Jan. 30, 2026, 5:09 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

udpipe
Tokenization, Parts of Speech Tagging, Lemmatization and Dependency Parsing with the 'UDPipe' 'NLP' Toolkit

R/nlp_morphological.R
In udpipe: Tokenization, Parts of Speech Tagging, Lemmatization and Dependency Parsing with the 'UDPipe' 'NLP' Toolkit

Defines functions txt_morphological cbind_morphological

Documented in cbind_morphological

Try the udpipe package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

udpipe Tokenization, Parts of Speech Tagging, Lemmatization and Dependency Parsing with the 'UDPipe' 'NLP' Toolkit

R/nlp_morphological.R In udpipe: Tokenization, Parts of Speech Tagging, Lemmatization and Dependency Parsing with the 'UDPipe' 'NLP' Toolkit

Defines functions txt_morphological cbind_morphological

Documented in cbind_morphological

Try the udpipe package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

udpipe
Tokenization, Parts of Speech Tagging, Lemmatization and Dependency Parsing with the 'UDPipe' 'NLP' Toolkit

R/nlp_morphological.R
In udpipe: Tokenization, Parts of Speech Tagging, Lemmatization and Dependency Parsing with the 'UDPipe' 'NLP' Toolkit