R/rdr_pos_tagging.R

#' @title Lists all installed Ripple Down Rules-based Part-Of-Speech Tagger language models
#' @description Lists all language models currently installed in the RDRPOSTagger package folder.
#' These current consists of language models for
#' \itemize{
#'  \item{MORPH: }{detailed morphological tagging for languages}
#'  \item{POS: }{basic parts of speech tagging for languages}
#'  \item{UniversalPOS: }{universal POS tagging for languages. Based on data from http://universaldependencies.org version 2.0}
#' }
#' @return a list of data.frames with elements MORPH, POS and UniversalPOS containing the
#' language, the location of the dictionary and the location of the ripple down rules
#' @export
#' @seealso \code{\link{rdr_model}}
#' @examples
#' models <- rdr_available_models()
#' models
#'
#' models$MORPH$language
#' models$POS$language
#' models$UniversalPOS$language
rdr_available_models <- function(){
  available <- list()
  available$MORPH$language <- list.files(system.file("Models", "MORPH", package = "RDRPOSTagger"), pattern = ".zip$", full.names = TRUE)
  available$POS$language <- list.files(system.file("Models", "POS", package = "RDRPOSTagger"), pattern = ".zip$", full.names = TRUE)
  available$UniversalPOS$language <- list.files(system.file("Models", "UniversalPOS", package = "RDRPOSTagger"), pattern = ".zip$", full.names = TRUE)
  
  x <- lapply(available$MORPH$language, FUN=function(f) unzip(f, list = TRUE)$Name)
  available$MORPH$dictionary <- sapply(x, FUN=function(f) file.path(system.file("Models", "MORPH", package = "RDRPOSTagger"), grep(".DICT$", f, value = TRUE)))
  available$MORPH$rules <- sapply(x, FUN=function(f) file.path(system.file("Models", "MORPH", package = "RDRPOSTagger"), grep(".RDR$", f, value = TRUE)))
  x <- lapply(available$POS$language, FUN=function(f) unzip(f, list = TRUE)$Name)
  available$POS$dictionary <- sapply(x, FUN=function(f) file.path(system.file("Models", "POS", package = "RDRPOSTagger"), grep(".DICT$", f, value = TRUE)))
  available$POS$rules <- sapply(x, FUN=function(f) file.path(system.file("Models", "POS", package = "RDRPOSTagger"), grep(".RDR$", f, value = TRUE)))
  x <- lapply(available$UniversalPOS$language, FUN=function(f) unzip(f, list = TRUE)$Name)
  available$UniversalPOS$dictionary <- mapply(file_path_sans_ext(basename(available$UniversalPOS$language)), x, FUN=function(language, f) file.path(system.file("Models", "UniversalPOS", package = "RDRPOSTagger"), language, grep(".DICT$", f, value = TRUE)), USE.NAMES = FALSE)
  available$UniversalPOS$rules <- mapply(file_path_sans_ext(basename(available$UniversalPOS$language)), x, FUN=function(language, f) file.path(system.file("Models", "UniversalPOS", package = "RDRPOSTagger"), language, grep(".RDR$", f, value = TRUE)), USE.NAMES = FALSE)
  available$MORPH$language <- file_path_sans_ext(basename(available$MORPH$language))
  available$POS$language <- file_path_sans_ext(basename(available$POS$language))
  available$UniversalPOS$language <- file_path_sans_ext(basename(available$UniversalPOS$language))
  
  available <- lapply(available, as.data.frame, stringsAsFactors = FALSE)
  class(available) <- "rdr_models"
  available
}


rdr_unzipped_models <- function(){
  default <- data.frame(language = character(),
                        dictionary = character(),
                        rules = character(),
                        stringsAsFactors = FALSE)
  available <- list()
  available$MORPH <- file_path_sans_ext(list.files(system.file("Models", "MORPH", package = "RDRPOSTagger"), pattern = ".DICT$|.RDR$"))
  available$POS <- file_path_sans_ext(list.files(system.file("Models", "POS", package = "RDRPOSTagger"), pattern = ".DICT$|.RDR$"))
  available$UniversalPOS <- file_path_sans_ext(list.files(system.file("Models", "UniversalPOS", package = "RDRPOSTagger")))
  available$UniversalPOS <- unique(available$UniversalPOS)
  available$UniversalPOS <- available$UniversalPOS[dir.exists(file.path(system.file("Models", "UniversalPOS", package = "RDRPOSTagger"), available$UniversalPOS))]
  available$UniversalPOS <- setdiff(available$UniversalPOS, c("Readme", "UDv1.3_results"))
  available <- lapply(available, unique)
  if(length(available$MORPH) > 0){
    available$MORPH <- data.frame(language = available$MORPH,
                                  dictionary = system.file("Models", "MORPH", sprintf("%s.DICT", available$MORPH), package = "RDRPOSTagger"),
                                  rules = system.file("Models", "MORPH", sprintf("%s.RDR", available$MORPH), package = "RDRPOSTagger"),
                                  stringsAsFactors = FALSE)  
  }else{
    available$MORPH <- default
  }
  if(length(available$POS) > 0){
    available$POS <- data.frame(language = available$POS,
                                dictionary = system.file("Models", "POS", sprintf("%s.DICT", available$POS), package = "RDRPOSTagger"),
                                rules = system.file("Models", "POS", sprintf("%s.RDR", available$POS), package = "RDRPOSTagger"),
                                stringsAsFactors = FALSE)
  }else{
    available$POS <- default
  }
  if(length(available$UniversalPOS) > 0){
    available$UniversalPOS <- data.frame(language = available$UniversalPOS,
                                         dictionary = sapply(available$UniversalPOS, FUN=function(loc) list.files(system.file("Models", "UniversalPOS", loc, package = "RDRPOSTagger"), pattern = "\\.DICT$", full.names=TRUE)),
                                         rules = sapply(available$UniversalPOS, FUN=function(loc) list.files(system.file("Models", "UniversalPOS", loc, package = "RDRPOSTagger"), pattern = "\\.RDR$", full.names=TRUE)),
                                         stringsAsFactors = FALSE)
  }else{
    available$UniversalPOS <- default
  }
  class(available) <- "rdr_models"
  available
}



rdr_unzip <- function(language, annotation){
  zipfile <- file.path(system.file("Models", annotation, sprintf("%s.zip", language), package = "RDRPOSTagger"))
  if(!file.exists(zipfile)){
    stop(sprintf("Language %s not part of the possible languages for annotation %s. Run rdr_available_models() to see the available language/annotation models.", language, annotation))
  }
  existing <- rdr_unzipped_models()
  existing <- existing[[annotation]]$language
  existsalready <- language %in% existing
  if(annotation %in% c("MORPH", "POS")){
    storeat <- dirname(zipfile)
  }else if(annotation %in% "UniversalPOS"){
    storeat <- file.path(dirname(zipfile), file_path_sans_ext(basename(zipfile)))
  } 
  if(!existsalready){
    if(!dir.exists(storeat)){
      dir.create(storeat)  
    }
    unzip(zipfile = zipfile, exdir = storeat)
  }
}

# rdr_read_file <- function(file){
#   readLines(file, encoding = "UTF-8")
#   x <- list.files(system.file(package = "RDRPOSTagger"), recursive = TRUE, full.names = TRUE, pattern = ".DICT|.RDR")
#   x <- lapply(x, FUN = function(file) readLines(file, encoding = "UTF-8"))
#   save(x, file = "test.RData", compress = "xz")
# }
# rdr_init <- function(){
#   modelszip <- system.file("Models.zip", package = "RDRPOSTagger")
#   modelsfolder <- system.file(package = "RDRPOSTagger")
#   unzip(m, overwrite = TRUE, exdir = system.file(package = "RDRPOSTagger"))
# }



#' @export
print.rdr_models <- function(x, ...){
  cat('Riple down Rule based available taggers:', sep = "\n")
  cat('----------------------------------------', sep = "\n")
  
  cat("\n", sep = "")
  cat('1/ POS tagging for languages:', sep = "\n\n")
  if(length(x$POS$language) > 0){
    cat(paste(x$POS$language, collapse = ", "), sep = "\n")
  }else{
    cat("No languages for this type of POS tagging")
  }
  cat("\n", sep = "")
  cat('2/ MORPH tagging for languages:', sep = "\n\n")
  if(length(x$MORPH$language) > 0){
    cat(paste(x$MORPH$language, collapse = ", "), sep = "\n")
  }else{
    cat("No languages for this type of POS tagging")
  }
  cat("\n", sep = "")
  cat('3/ UniversalPOS tagging for languages:', sep = "\n")
  if(length(x$UniversalPOS$language) > 0){
    cat(paste(x$UniversalPOS$language, collapse = ", "), sep = "\n")
  }else{
    cat("No languages for this type of POS tagging")
  }
}

#' @title Set up a Ripple Down Rules-based Part-Of-Speech Tagger for tagging sentences
#' @description Set up a Ripple Down Rules-based Part-Of-Speech Tagger for tagging sentences
#' Possible languages are:
#' \itemize{
#'  \item{MORPH: }{Bulgarian, Czech, Dutch, French, German, Portuguese, Spanish, Swedish}
#'  \item{POS: }{English, French, German, Hindi, Italian, Thai, Vietnamese}
#'  \item{UniversalPOS: }{Ancient_Greek, Ancient_Greek-PROIEL, Arabic, Basque, Belarusian, 
#'  Bulgarian, Catalan, Chinese, Coptic, Croatian, Czech, Czech-CAC, Czech-CLTT, 
#'  Danish, Dutch, Dutch-LassySmall, English, English-LinES, English-ParTUT, Estonian, 
#'  Finnish, Finnish-FTB, French, French-ParTUT, French-Sequoia, Galician, Galician-TreeGal, 
#'  German, Gothic, Greek, Hebrew, Hindi, Hungarian, Indonesian, Irish, Italian, 
#'  Italian-ParTUT, Japanese, Korean, Latin, Latin-ITTB, Latin-PROIEL, Latvian, Lithuanian, 
#'  Norwegian-Bokmaal, Norwegian-Nynorsk, Old_Church_Slavonic, Persian, Polish, Portuguese, 
#'  Portuguese-BR, Romanian, Russian, Russian-SynTagRus, Slovak, Slovenian, Slovenian-SST, 
#'  Spanish, Spanish-AnCora, Swedish, Swedish-LinES, Tamil, Turkish, Urdu, Vietnamese}
#' }
#' @param language the language which is one of the languages for the annotation shown in \code{\link{rdr_available_models}}
#' @param annotation the type of annotation. Either one of 'MORPH', "POS' or 'UniversalPOS'
#' @return An object of class RDRPOSTagger which is a list with elements model (the location of the dictionary and the rules of that language),
#' the type of annotation and java objects tagger, initialtagger, dictionary and utility.
#' This model object can be used to tag sentences based on the specified POS tags.
#' @seealso \code{\link{rdr_pos}}
#' @export
#' @examples
#' \dontrun{
#' ## MORPH models
#' tagger <- rdr_model(language = "Bulgarian", annotation = "MORPH")
#' tagger <- rdr_model(language = "Czech", annotation = "MORPH")
#' tagger <- rdr_model(language = "Dutch", annotation = "MORPH")
#' tagger <- rdr_model(language = "French", annotation = "MORPH")
#' tagger <- rdr_model(language = "German", annotation = "MORPH")
#' tagger <- rdr_model(language = "Portuguese", annotation = "MORPH")
#' tagger <- rdr_model(language = "Spanish", annotation = "MORPH")
#' tagger <- rdr_model(language = "Swedish", annotation = "MORPH")
#' ## POS models
#' tagger <- rdr_model(language = "English", annotation = "POS")
#' tagger <- rdr_model(language = "French", annotation = "POS")
#' tagger <- rdr_model(language = "German", annotation = "POS")
#' tagger <- rdr_model(language = "Hindi", annotation = "POS")
#' tagger <- rdr_model(language = "Italian", annotation = "POS")
#' tagger <- rdr_model(language = "Thai", annotation = "POS")
#' tagger <- rdr_model(language = "Vietnamese", annotation = "POS")
#' ## UniversalPOS models
#' tagger <- rdr_model(language = "Ancient_Greek", annotation = "UniversalPOS")
#' tagger <- rdr_model(language = "Ancient_Greek-PROIEL", annotation = "UniversalPOS")
#' tagger <- rdr_model(language = "Arabic", annotation = "UniversalPOS")
#' tagger <- rdr_model(language = "Basque", annotation = "UniversalPOS")
#' tagger <- rdr_model(language = "Belarusian", annotation = "UniversalPOS")
#' tagger <- rdr_model(language = "Bulgarian", annotation = "UniversalPOS")
#' tagger <- rdr_model(language = "Catalan", annotation = "UniversalPOS")
#' tagger <- rdr_model(language = "Chinese", annotation = "UniversalPOS")
#' tagger <- rdr_model(language = "Coptic", annotation = "UniversalPOS")
#' tagger <- rdr_model(language = "Croatian", annotation = "UniversalPOS")
#' tagger <- rdr_model(language = "Czech", annotation = "UniversalPOS")
#' tagger <- rdr_model(language = "Czech-CAC", annotation = "UniversalPOS")
#' tagger <- rdr_model(language = "Czech-CLTT", annotation = "UniversalPOS")
#' tagger <- rdr_model(language = "Danish", annotation = "UniversalPOS")
#' tagger <- rdr_model(language = "Dutch", annotation = "UniversalPOS")
#' tagger <- rdr_model(language = "Dutch-LassySmall", annotation = "UniversalPOS")
#' tagger <- rdr_model(language = "English", annotation = "UniversalPOS")
#' tagger <- rdr_model(language = "English-LinES", annotation = "UniversalPOS")
#' tagger <- rdr_model(language = "English-ParTUT", annotation = "UniversalPOS")
#' tagger <- rdr_model(language = "Estonian", annotation = "UniversalPOS")
#' tagger <- rdr_model(language = "Finnish", annotation = "UniversalPOS")
#' tagger <- rdr_model(language = "Finnish-FTB", annotation = "UniversalPOS")
#' tagger <- rdr_model(language = "French", annotation = "UniversalPOS")
#' tagger <- rdr_model(language = "French-ParTUT", annotation = "UniversalPOS")
#' tagger <- rdr_model(language = "French-Sequoia", annotation = "UniversalPOS")
#' tagger <- rdr_model(language = "Galician", annotation = "UniversalPOS")
#' tagger <- rdr_model(language = "Galician-TreeGal", annotation = "UniversalPOS")
#' tagger <- rdr_model(language = "German", annotation = "UniversalPOS")
#' tagger <- rdr_model(language = "Gothic", annotation = "UniversalPOS")
#' tagger <- rdr_model(language = "Greek", annotation = "UniversalPOS")
#' tagger <- rdr_model(language = "Hebrew", annotation = "UniversalPOS")
#' tagger <- rdr_model(language = "Hindi", annotation = "UniversalPOS")
#' tagger <- rdr_model(language = "Hungarian", annotation = "UniversalPOS")
#' tagger <- rdr_model(language = "Indonesian", annotation = "UniversalPOS")
#' tagger <- rdr_model(language = "Irish", annotation = "UniversalPOS")
#' tagger <- rdr_model(language = "Italian", annotation = "UniversalPOS")
#' tagger <- rdr_model(language = "Italian-ParTUT", annotation = "UniversalPOS")
#' tagger <- rdr_model(language = "Japanese", annotation = "UniversalPOS")
#' tagger <- rdr_model(language = "Korean", annotation = "UniversalPOS")
#' tagger <- rdr_model(language = "Latin", annotation = "UniversalPOS")
#' tagger <- rdr_model(language = "Latin-ITTB", annotation = "UniversalPOS")
#' tagger <- rdr_model(language = "Latin-PROIEL", annotation = "UniversalPOS")
#' tagger <- rdr_model(language = "Latvian", annotation = "UniversalPOS")
#' tagger <- rdr_model(language = "Lithuanian", annotation = "UniversalPOS")
#' tagger <- rdr_model(language = "Norwegian-Bokmaal", annotation = "UniversalPOS")
#' tagger <- rdr_model(language = "Norwegian-Nynorsk", annotation = "UniversalPOS")
#' tagger <- rdr_model(language = "Old_Church_Slavonic", annotation = "UniversalPOS")
#' tagger <- rdr_model(language = "Persian", annotation = "UniversalPOS")
#' tagger <- rdr_model(language = "Polish", annotation = "UniversalPOS")
#' tagger <- rdr_model(language = "Portuguese", annotation = "UniversalPOS")
#' tagger <- rdr_model(language = "Portuguese-BR", annotation = "UniversalPOS")
#' tagger <- rdr_model(language = "Romanian", annotation = "UniversalPOS")
#' tagger <- rdr_model(language = "Russian", annotation = "UniversalPOS")
#' tagger <- rdr_model(language = "Russian-SynTagRus", annotation = "UniversalPOS")
#' tagger <- rdr_model(language = "Slovak", annotation = "UniversalPOS")
#' tagger <- rdr_model(language = "Slovenian", annotation = "UniversalPOS")
#' tagger <- rdr_model(language = "Slovenian-SST", annotation = "UniversalPOS")
#' tagger <- rdr_model(language = "Spanish", annotation = "UniversalPOS")
#' tagger <- rdr_model(language = "Spanish-AnCora", annotation = "UniversalPOS")
#' tagger <- rdr_model(language = "Swedish", annotation = "UniversalPOS")
#' tagger <- rdr_model(language = "Swedish-LinES", annotation = "UniversalPOS")
#' tagger <- rdr_model(language = "Tamil", annotation = "UniversalPOS")
#' tagger <- rdr_model(language = "Turkish", annotation = "UniversalPOS")
#' tagger <- rdr_model(language = "Urdu", annotation = "UniversalPOS")
#' tagger <- rdr_model(language = "Vietnamese", annotation = "UniversalPOS")
#' }
rdr_model <- function(language,
                      annotation = c("MORPH", "POS", "UniversalPOS")){
  
  
  ## Check if model exists
  annotation <- match.arg(annotation)
  ## Unzip the model
  rdr_unzip(language, annotation)
    
  available_models <- rdr_unzipped_models()
  models <- available_models[[annotation]]
  idx <- which(models$language %in% language)
  idx <- head(idx, 1)
  if(length(idx) == 0){
    stop(sprintf("Language %s not part of the possible languages for annotation %s. Pick one of these languages: %s", language, annotation, paste(models$language, collapse = ", ")))
  }
  models <- models[idx, ]
  stopifnot(file.exists(models$rules))
  stopifnot(file.exists(models$dictionary))

  ## Construct the tagger
  utility <- rJava::.jnew("Utils")

  tagger <- .jnew("RDRPOSTagger")
  initialtagger <- .jnew("InitialTagger")

  tagger$constructTreeFromRulesFile(models$rules)
  dictionary <- utility$getDictionary(models$dictionary)

  z <- list(model = models, annotation = annotation,
            tagger = tagger, initialtagger = initialtagger, dictionary = dictionary, utility = utility)
  class(z) <- "RDRPOSTagger"
  z
}

#' @export
print.RDRPOSTagger <- function(x, ...){
  cat(sprintf("RDRPOSTagger %s annotation for %s", x$annotation, x$model$language), sep = "\n")
  cat(sprintf("Based on dictionary of %s items", length(readLines(x$model$dictionary))), sep = "\n")
  cat(sprintf("Top 10 Ripple Down Rules at %s", x$model$rules), sep = "\n")
  cat(head(readLines(x$model$rules, encoding = "UTF-8"), 10), sep = "\n")
  cat(sprintf("..."), sep = "\n")
}


#' @title Part-Of-Speech Tagging for tagging sentences based on Ripple Down Rules
#' @description Part-Of-Speech Tagging for tagging sentences based on Ripple Down Rules
#' @param object And object of class RDRPOSTagger as returned by \code{\link{rdr_model}}
#' @param x a character vector in UTF-8 encoding where each element of the character vector contains text which you like to tag.
#' @param doc_id an identifier of a document with the same length as \code{x}.
#' @param add_space_around_punctuations logical indicating to add a space around punctuations before doing the RDR tagging. Defaults
#' to \code{TRUE} as the RDRPOStagger requires this.
#' @return a data frame with fields doc_id, token_id, token, pos where the pos field is the Parts of Speech tag.
#' If you want to find out the meaning of the different POS tags, visit http://universaldependencies.org.
#' @seealso \code{\link{rdr_model}}
#' @export
#' @examples
#' x <- c("Dus godvermehoeren met pus in alle puisten, zei die schele van Van Bukburg.", 
#'   "Er was toen dat liedje van tietenkonttieten kont tieten kontkontkont",
#'   "  ", "", NA)
#' tagger <- rdr_model(language = "Dutch", annotation = "MORPH")
#' rdr_pos(tagger, x = x)
#' 
#' tagger <- rdr_model(language = "Dutch", annotation = "UniversalPOS")
#' rdr_pos(tagger, x = x)
#' 
#' \dontrun{
#' x <- c("Oleg Borisovich Kulik is a Ukrainian-born Russian performance artist, 
#'   sculptor, photographer and curator.")
#' tagger <- rdr_model(language = "English", annotation = "POS")
#' rdr_pos(tagger, x = x)
#' }
rdr_pos <- function(object, x, doc_id = paste("d", seq_along(x), sep=""), add_space_around_punctuations=TRUE){
  x <- trimLeadingTrailing(x)
  if(add_space_around_punctuations){
    x <- rdr_add_space_around_punctuations(x)
  }
  x_tagged <- mapply(FUN = function(object, txt, doc){
    rdr_pos_one(object = object, x = txt, doc.id = doc)
  }, 
  txt = x,
  doc = doc_id,
  SIMPLIFY = FALSE, USE.NAMES = FALSE, MoreArgs = list(object = object))
  x_tagged <- rbindlist(x_tagged)
  setDF(x_tagged)
  x_tagged
}

rdr_pos_one <- function(object, x, doc.id){
  if(is.na(x) || x == ""){
    return(data.table(doc.id = doc.id, word.id = 0L, word = NA_character_, word.type = NA_character_))
  }
  wordtags <- object$initialtagger$InitTagger4Sentence(object$dictionary, x)
  size <- wordtags$size()
  if(size == 0){
    return(data.table(doc.id = doc.id, word.id = 0L, word = NA_character_, word.type = NA_character_))
  }
  x <- lapply(0L:(size-1L), FUN=function(i){
    setDT(list(doc.id = doc.id, word.id = i+1L, word = wordtags$get(i)$word, word.type = wordtags$get(i)$tag))
  })
  x <- rbindlist(x)
  Encoding(x$word) <- "UTF-8"
  setcolorder(x, neworder = c("doc.id", "word.id", "word", "word.type"))
  setnames(x, old = c("doc.id", "word.id", "word", "word.type"), new = c("doc_id", "token_id", "token", "pos"))
  x
}

trimLeadingTrailing <- function(x){
  gsub("^([[:space:]])+|([[:space:]])+$", "", x)
}





#' @title Add space around punctuations so that it can be used in \code{rdr_pos}
#' @description Add space around punctuations so that it can be used in \code{rdr_pos}
#' and points/punctuations are not added to 1 specific word/term.
#' @param x a character vector
#' @param symbols a character class of regular expressions to be used to identify punctuation symbols
#' @return the character vector \code{x} where a space is put around punctuations
#' @export
#' @examples
#' x <- c("Dus godvermehoeren met pus in alle puisten, zei die schele van Van Bukburg.Nieuwe zin.", 
#'   "  ", "", NA)
#' rdr_add_space_around_punctuations(x)
rdr_add_space_around_punctuations <- function(x, symbols = "[!,-.:;?]"){
  idx <- which(is.na(x))
  r <- gregexpr(pattern = symbols, text = x)
  rm <- regmatches(x, r)
  regmatches(x, r) <- lapply(rm, FUN=function(x){
    if(length(x) == 0) return(x)
    sprintf(" %s ", x)
  }) 
  x[idx] <- NA
  x
}
bnosac/RDRPOSTagger documentation built on May 8, 2019, 3:43 p.m.