R/udpipe_reconstruct.R
In udpipe: Tokenization, Parts of Speech Tagging, Lemmatization and Dependency Parsing with the 'UDPipe' 'NLP' Toolkit

# if(FALSE){
#   library(udpipe)
#   txt <- "  Maxime  y su mujer\\ hicieron que nuestra estancia 
#     fuera lo mas comoda posible. \n  
#     El primer dia Maxime nos espero hasta tarde para recibirnos y 
#     darnos todas las indicaciones posibles del apartamento y 
#     de la situacion de aparcamiento en el barrio ya que fuimos 
#     desde Espana con el coche ( es todo zona azul de 9:00 a 18:00 pero 
#     como saliamos pronto y llegabamos tarde no nos afectaba).\n  
#     El apartamento es muy completo, la verdad es como aparece 
#     en el anuncio, es mas, incluso tiene una barandilla 
#     en la escaleras que dan a la habitacion que en la foto no sale.\n   
#     El jardin esta muy bien para desayunar o cenar ya que 
#     tiene una mesa grande para ello.\n   
#     El barrio es muy tranquilo con bastantes tiendas y restaurantes.\n
#        En general estuvimos muy comodos durante nuestra estancia, 
#        repetiriamos ahora mismo.\n   Muchas gracias por todo Maxime.  "
#   ud_model <- udpipe_download_model(language = "spanish")
#   ud_model <- udpipe_load_model(ud_model$file_model)
#   x <- udpipe_annotate(ud_model, x = txt)
#   x <- as.data.frame(x, detailed = TRUE)
#   original <- udpipe_reconstruct(sentence_id = x$sentence_id, token = x$token, token_id = x$token_id, misc = x$misc)
#   
#   ## TEST
#   library(udpipe)
#   data(brussels_reviews, package = "udpipe")
#   comments <- subset(brussels_reviews, language %in% "es")
#   comments <- data.frame(doc_id = comments$id, text = comments$feedback, stringsAsFactors = FALSE)
#   x <- udpipe(comments, "spanish-ancora", trace = 50)
#   x <- merge(x, comments, by = "doc_id", sort = FALSE)
#   test <- mapply(text = x$text, start = x$start, end = x$end, token = x$token, FUN=function(text, start, end, token){
#     substr(text, start, end) == token
#   }, USE.NAMES = FALSE)
#   sum(!test, na.rm=TRUE) == 0
# }


udpipe_reconstruct <- function(sentence_id, token, token_id, misc, only_from_to = FALSE){
  
  ##
  ## FROM THE UDPIPE DOCS: 
  ##
  
  # The markup uses the following MISC fields on tokens (not words in multi-word tokens):
  # SpacesBefore=content (by default empty): spaces/other content preceding the token
  # SpacesAfter=content (by default a space if SpaceAfter=No feature is not present, empty otherwise): spaces/other content following the token
  # SpacesInToken=content (by default equal to the FORM of the token): FORM of the token including original spaces (this is needed only if tokens are allowed to contain spaces and a token contains a tab or newline characters)
  
  # The content of all the three fields must be escaped to allow storing tabs and newlines. The following C-like schema is used:
  # \s: space
  # \t: tab
  # \r: CR character
  # \n: LF character
  # \p: | (pipe character)
  # \\: \ (backslash character)
  
  rawtxt <- token
  
  has_spacesafter_no <- grepl(pattern = "SpaceAfter=No", misc)
  has_spacesafter    <- grepl(pattern = "SpacesAfter=", misc)
  has_spacesbefore   <- grepl(pattern = "SpacesBefore=", misc)
  has_spacesintoken  <- grepl(pattern = "SpacesInToken=", misc)
  has_multiple       <- grepl(pattern = "\\|", misc)
  
  ##
  ## Spaces after
  ##
  after <- rep("", length(token))
  ## if no spaceafter feature, there is a space
  after[!has_spacesafter] <- " "
  ## if missing, there is a space after
  after[is.na(misc)] <- " "
  ## if contains SpaceAfter=No, there is nothing to add
  after[has_spacesafter_no] <- ""
  ## if contains SpacesAfter=, add the spaces to the after part
  idx <- which(has_spacesafter)
  #addme <- gsub(pattern = "(SpacesAfter=)(.+)($|Spaces)", "\\2", misc[idx])
  addme <- gsub(pattern = "(SpacesAfter=)(.+)", "\\2", misc[idx])
  idx_multiple <- which(has_spacesafter & has_multiple)
  if(length(idx_multiple) > 0){
    addme_multiple <- sapply(strsplit(misc[idx_multiple], split = "\\|"), FUN=function(x) grep(pattern = "SpacesAfter", x = x, value = TRUE))
    addme_multiple <- gsub(pattern = "SpacesAfter=", replacement = "", addme_multiple)
    addme[which(idx_multiple %in% idx)] <- addme_multiple
  }
  #addme <- sapply(strsplit(misc[idx], split = "\\|"), FUN=function(x) grep(pattern = "SpacesAfter", x = x, value = TRUE))
  #addme <- gsub(pattern = "SpacesAfter=", replacement = "", addme)
  addme <- gsub("\\\\s", " ", addme)
  addme <- gsub("\\\\n", "\n", addme)
  addme <- gsub("\\\\t", "\t", addme)
  addme <- gsub("\\\\r", "\r", addme)
  addme <- gsub("\\\\p", "|", addme)
  addme <- gsub("\\\\", "\\", addme)
  after[idx] <- addme
  ## Fix for using std::istringstream in udpipe_annotate as it always ends with a newline character
  after[length(after)] <- gsub("\n$", "", after[length(after)])
  
  ##
  ## Spaces before
  ##
  before <- rep("", length(token))
  ## if contains SpacesBefore=, add the spaces to the after part
  idx <- which(has_spacesbefore)
  #addme <- gsub(pattern = "(SpacesBefore=)(.+)($|Spaces)", "\\2", misc[idx])
  addme <- gsub(pattern = "(SpacesBefore=)(.+)", "\\2", misc[idx])
  idx_multiple <- which(has_spacesbefore & has_multiple)
  if(length(idx_multiple) > 0){
    addme_multiple <- sapply(strsplit(misc[idx_multiple], split = "\\|"), FUN=function(x) grep(pattern = "SpacesBefore", x = x, value = TRUE))
    addme_multiple <- gsub(pattern = "SpacesBefore=", replacement = "", addme_multiple)
    addme[which(idx_multiple %in% idx)] <- addme_multiple
  }
  #addme <- sapply(strsplit(misc[idx], split = "\\|"), FUN=function(x) grep(pattern = "SpacesBefore", x = x, value = TRUE))
  #addme <- gsub(pattern = "SpacesBefore=", replacement = "", addme)
  addme <- gsub("\\\\s", " ", addme)
  addme <- gsub("\\\\n", "\n", addme)
  addme <- gsub("\\\\t", "\t", addme)
  addme <- gsub("\\\\r", "\r", addme)
  addme <- gsub("\\\\p", "|", addme)
  addme <- gsub("\\\\", "\\", addme)
  before[idx] <- addme
  
  ##
  ## SpacesInToken - MISC field stores form of the token including original spaces if there is a space in the token which can not be handled by FORM
  ##
  idx <- which(has_spacesintoken)
  #token[idx] <- gsub(pattern = "(SpacesInToken=)(.+)($|Spaces)", "\\2", misc[idx])
  addme <- gsub(pattern = "(SpacesInToken=)(.+)", "\\2", misc[idx])
  idx_multiple <- which(has_spacesintoken & has_multiple)
  if(length(idx_multiple) > 0){
    addme_multiple <- sapply(strsplit(misc[idx_multiple], split = "\\|"), FUN=function(x) grep(pattern = "SpacesInToken", x = x, value = TRUE))
    addme_multiple <- gsub(pattern = "SpacesInToken=", replacement = "", addme_multiple)
    addme[which(idx_multiple %in% idx)] <- addme_multiple
  }
  #addme <- sapply(strsplit(misc[idx], split = "\\|"), FUN=function(x) grep(pattern = "SpacesInToken", x = x, value = TRUE))
  #addme <- gsub(pattern = "SpacesInToken=", replacement = "", addme)
  token[idx] <- addme
  
  ##
  ## Construct original text
  ##
  original_txt <- sprintf("%s%s%s", before, token, after)
  
  ##
  ## Multi-word tokens are not considered
  ##
  is_multi_word <- grepl("-", token_id)
  ids <- sprintf("%s.%s", sentence_id, token_id)
  ids_remove <- mapply(sentence_id = sentence_id[is_multi_word],
                       token_id = token_id[is_multi_word], 
                       FUN=function(sentence_id, token_id){
                         sprintf("%s.%s", sentence_id, unlist(strsplit(token_id, split = "-")))
                       }, SIMPLIFY = TRUE, USE.NAMES = FALSE)
  idx <- which(ids %in% ids_remove)
  original_txt[idx] <- ""
  
  ##
  ## Construct from-to
  ##
  before[idx] <- ""
  after[idx] <- ""
  
  nchars <- nchar(original_txt)
  original_to <- cumsum(nchars)
  original_from <- original_to - nchars + 1L
  from <- original_from + nchar(before)
  to <- original_to - nchar(after)
  from[idx] <- NA_integer_
  to[idx] <- NA_integer_
  
  
  if(only_from_to){
    return(list(from = from, to = to))  
  }else{
    return(list(text = paste(original_txt, collapse = ""),
                from = from,
                to = to))  
  }
}

Any scripts or data that you put into this service are public.

udpipe documentation built on Jan. 30, 2026, 5:09 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

udpipe
Tokenization, Parts of Speech Tagging, Lemmatization and Dependency Parsing with the 'UDPipe' 'NLP' Toolkit

R/udpipe_reconstruct.R
In udpipe: Tokenization, Parts of Speech Tagging, Lemmatization and Dependency Parsing with the 'UDPipe' 'NLP' Toolkit

Defines functions udpipe_reconstruct

Try the udpipe package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

udpipe Tokenization, Parts of Speech Tagging, Lemmatization and Dependency Parsing with the 'UDPipe' 'NLP' Toolkit

R/udpipe_reconstruct.R In udpipe: Tokenization, Parts of Speech Tagging, Lemmatization and Dependency Parsing with the 'UDPipe' 'NLP' Toolkit

Defines functions udpipe_reconstruct

Try the udpipe package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

udpipe
Tokenization, Parts of Speech Tagging, Lemmatization and Dependency Parsing with the 'UDPipe' 'NLP' Toolkit

R/udpipe_reconstruct.R
In udpipe: Tokenization, Parts of Speech Tagging, Lemmatization and Dependency Parsing with the 'UDPipe' 'NLP' Toolkit