R/udpipe_reconstruct.R

Defines functions udpipe_reconstruct

# if(FALSE){
#   library(udpipe)
#   txt <- "  Maxime  y su mujer\\ hicieron que nuestra estancia 
#     fuera lo mas comoda posible. \n  
#     El primer dia Maxime nos espero hasta tarde para recibirnos y 
#     darnos todas las indicaciones posibles del apartamento y 
#     de la situacion de aparcamiento en el barrio ya que fuimos 
#     desde Espana con el coche ( es todo zona azul de 9:00 a 18:00 pero 
#     como saliamos pronto y llegabamos tarde no nos afectaba).\n  
#     El apartamento es muy completo, la verdad es como aparece 
#     en el anuncio, es mas, incluso tiene una barandilla 
#     en la escaleras que dan a la habitacion que en la foto no sale.\n   
#     El jardin esta muy bien para desayunar o cenar ya que 
#     tiene una mesa grande para ello.\n   
#     El barrio es muy tranquilo con bastantes tiendas y restaurantes.\n
#        En general estuvimos muy comodos durante nuestra estancia, 
#        repetiriamos ahora mismo.\n   Muchas gracias por todo Maxime.  "
#   ud_model <- udpipe_download_model(language = "spanish")
#   ud_model <- udpipe_load_model(ud_model$file_model)
#   x <- udpipe_annotate(ud_model, x = txt)
#   x <- as.data.frame(x, detailed = TRUE)
#   original <- udpipe_reconstruct(sentence_id = x$sentence_id, token = x$token, token_id = x$token_id, misc = x$misc)
#   
#   ## TEST
#   library(udpipe)
#   data(brussels_reviews, package = "udpipe")
#   comments <- subset(brussels_reviews, language %in% "es")
#   comments <- data.frame(doc_id = comments$id, text = comments$feedback, stringsAsFactors = FALSE)
#   x <- udpipe(comments, "spanish-ancora", trace = 50)
#   x <- merge(x, comments, by = "doc_id", sort = FALSE)
#   test <- mapply(text = x$text, start = x$start, end = x$end, token = x$token, FUN=function(text, start, end, token){
#     substr(text, start, end) == token
#   }, USE.NAMES = FALSE)
#   sum(!test, na.rm=TRUE) == 0
# }


udpipe_reconstruct <- function(sentence_id, token, token_id, misc, only_from_to = FALSE){
  
  ##
  ## FROM THE UDPIPE DOCS: 
  ##
  
  # The markup uses the following MISC fields on tokens (not words in multi-word tokens):
  # SpacesBefore=content (by default empty): spaces/other content preceding the token
  # SpacesAfter=content (by default a space if SpaceAfter=No feature is not present, empty otherwise): spaces/other content following the token
  # SpacesInToken=content (by default equal to the FORM of the token): FORM of the token including original spaces (this is needed only if tokens are allowed to contain spaces and a token contains a tab or newline characters)
  
  # The content of all the three fields must be escaped to allow storing tabs and newlines. The following C-like schema is used:
  # \s: space
  # \t: tab
  # \r: CR character
  # \n: LF character
  # \p: | (pipe character)
  # \\: \ (backslash character)
  
  rawtxt <- token
  
  has_spacesafter_no <- grepl(pattern = "SpaceAfter=No", misc)
  has_spacesafter    <- grepl(pattern = "SpacesAfter=", misc)
  has_spacesbefore   <- grepl(pattern = "SpacesBefore=", misc)
  has_spacesintoken  <- grepl(pattern = "SpacesInToken=", misc)
  has_multiple       <- grepl(pattern = "\\|", misc)
  
  ##
  ## Spaces after
  ##
  after <- rep("", length(token))
  ## if no spaceafter feature, there is a space
  after[!has_spacesafter] <- " "
  ## if missing, there is a space after
  after[is.na(misc)] <- " "
  ## if contains SpaceAfter=No, there is nothing to add
  after[has_spacesafter_no] <- ""
  ## if contains SpacesAfter=, add the spaces to the after part
  idx <- which(has_spacesafter)
  #addme <- gsub(pattern = "(SpacesAfter=)(.+)($|Spaces)", "\\2", misc[idx])
  addme <- gsub(pattern = "(SpacesAfter=)(.+)", "\\2", misc[idx])
  idx_multiple <- which(has_spacesafter & has_multiple)
  if(length(idx_multiple) > 0){
    addme_multiple <- sapply(strsplit(misc[idx_multiple], split = "\\|"), FUN=function(x) grep(pattern = "SpacesAfter", x = x, value = TRUE))
    addme_multiple <- gsub(pattern = "SpacesAfter=", replacement = "", addme_multiple)
    addme[which(idx_multiple %in% idx)] <- addme_multiple
  }
  #addme <- sapply(strsplit(misc[idx], split = "\\|"), FUN=function(x) grep(pattern = "SpacesAfter", x = x, value = TRUE))
  #addme <- gsub(pattern = "SpacesAfter=", replacement = "", addme)
  addme <- gsub("\\\\s", " ", addme)
  addme <- gsub("\\\\n", "\n", addme)
  addme <- gsub("\\\\t", "\t", addme)
  addme <- gsub("\\\\r", "\r", addme)
  addme <- gsub("\\\\p", "|", addme)
  addme <- gsub("\\\\", "\\", addme)
  after[idx] <- addme
  ## Fix for using std::istringstream in udpipe_annotate as it always ends with a newline character
  after[length(after)] <- gsub("\n$", "", after[length(after)])
  
  ##
  ## Spaces before
  ##
  before <- rep("", length(token))
  ## if contains SpacesBefore=, add the spaces to the after part
  idx <- which(has_spacesbefore)
  #addme <- gsub(pattern = "(SpacesBefore=)(.+)($|Spaces)", "\\2", misc[idx])
  addme <- gsub(pattern = "(SpacesBefore=)(.+)", "\\2", misc[idx])
  idx_multiple <- which(has_spacesbefore & has_multiple)
  if(length(idx_multiple) > 0){
    addme_multiple <- sapply(strsplit(misc[idx_multiple], split = "\\|"), FUN=function(x) grep(pattern = "SpacesBefore", x = x, value = TRUE))
    addme_multiple <- gsub(pattern = "SpacesBefore=", replacement = "", addme_multiple)
    addme[which(idx_multiple %in% idx)] <- addme_multiple
  }
  #addme <- sapply(strsplit(misc[idx], split = "\\|"), FUN=function(x) grep(pattern = "SpacesBefore", x = x, value = TRUE))
  #addme <- gsub(pattern = "SpacesBefore=", replacement = "", addme)
  addme <- gsub("\\\\s", " ", addme)
  addme <- gsub("\\\\n", "\n", addme)
  addme <- gsub("\\\\t", "\t", addme)
  addme <- gsub("\\\\r", "\r", addme)
  addme <- gsub("\\\\p", "|", addme)
  addme <- gsub("\\\\", "\\", addme)
  before[idx] <- addme
  
  ##
  ## SpacesInToken - MISC field stores form of the token including original spaces if there is a space in the token which can not be handled by FORM
  ##
  idx <- which(has_spacesintoken)
  #token[idx] <- gsub(pattern = "(SpacesInToken=)(.+)($|Spaces)", "\\2", misc[idx])
  addme <- gsub(pattern = "(SpacesInToken=)(.+)", "\\2", misc[idx])
  idx_multiple <- which(has_spacesintoken & has_multiple)
  if(length(idx_multiple) > 0){
    addme_multiple <- sapply(strsplit(misc[idx_multiple], split = "\\|"), FUN=function(x) grep(pattern = "SpacesInToken", x = x, value = TRUE))
    addme_multiple <- gsub(pattern = "SpacesInToken=", replacement = "", addme_multiple)
    addme[which(idx_multiple %in% idx)] <- addme_multiple
  }
  #addme <- sapply(strsplit(misc[idx], split = "\\|"), FUN=function(x) grep(pattern = "SpacesInToken", x = x, value = TRUE))
  #addme <- gsub(pattern = "SpacesInToken=", replacement = "", addme)
  token[idx] <- addme
  
  ##
  ## Construct original text
  ##
  original_txt <- sprintf("%s%s%s", before, token, after)
  
  ##
  ## Multi-word tokens are not considered
  ##
  is_multi_word <- grepl("-", token_id)
  ids <- sprintf("%s.%s", sentence_id, token_id)
  ids_remove <- mapply(sentence_id = sentence_id[is_multi_word],
                       token_id = token_id[is_multi_word], 
                       FUN=function(sentence_id, token_id){
                         sprintf("%s.%s", sentence_id, unlist(strsplit(token_id, split = "-")))
                       }, SIMPLIFY = TRUE, USE.NAMES = FALSE)
  idx <- which(ids %in% ids_remove)
  original_txt[idx] <- ""
  
  ##
  ## Construct from-to
  ##
  before[idx] <- ""
  after[idx] <- ""
  
  nchars <- nchar(original_txt)
  original_to <- cumsum(nchars)
  original_from <- original_to - nchars + 1L
  from <- original_from + nchar(before)
  to <- original_to - nchar(after)
  from[idx] <- NA_integer_
  to[idx] <- NA_integer_
  
  
  if(only_from_to){
    return(list(from = from, to = to))  
  }else{
    return(list(text = paste(original_txt, collapse = ""),
                from = from,
                to = to))  
  }
}

Try the udpipe package in your browser

Any scripts or data that you put into this service are public.

udpipe documentation built on Jan. 6, 2023, 5:06 p.m.