R/data.R


#' @title CRF Training data: download training data for doing Named Entity Recognition (NER)
#' @description Download training data for doing Named Entity Recognition (NER)
#' @param type a character string with the type of data to download. See the function usage for all possible values.
#' The data will be downloaded from one of the following repositories:
#' \itemize{
#'  \item{NLTK-data forked repository: }{\url{https://github.com/bnosac-dev/nltk_data/blob/gh-pages/packages/corpora/conll2002.zip}}
#'  \item{FOX forked repository of GermanNER: }{\url{https://github.com/bnosac-dev/FOX/tree/master/input/GermanNER}}
#'  \item{FOX forked repository of WikiNER: }{\url{https://github.com/bnosac-dev/FOX/tree/master/input/Wikiner}}
#' }
#' Please consult the license information on these repositories before you use these data in any commercial product.
#' @param docs integer indicating how many documents to sample from the data (only used for data from the NLTK repository).
#' It is mainly intended to reduce the training time of the examples of this R package during CRAN's R CMD check.
#' @return a data.frame with training data for a Named Entity Recognition task, or an object of class try-error if downloading the data failed
#' @export
#' @examples 
#' \dontrun{
#' x <- ner_download_modeldata("conll2002-nl")
#' x <- ner_download_modeldata("conll2002-es")
#' x <- ner_download_modeldata("GermanNER")
#' x <- ner_download_modeldata("wikiner-en-wp2")
#' x <- ner_download_modeldata("wikiner-nl-wp3")
#' x <- ner_download_modeldata("wikiner-fr-wp3")
#' }
#' ## reduce number of docs
#' x <- ner_download_modeldata("conll2002-es", docs = 10)
ner_download_modeldata <- function(type = c("conll2002-nl", "conll2002-es", "GermanNER", 
                                            "wikiner-de-wp2",
                                            "wikiner-de-wp3",
                                            "wikiner-en-wp2",
                                            "wikiner-en-wp3",
                                            "wikiner-es-wp2",
                                            "wikiner-es-wp3",
                                            "wikiner-fr-wp2",
                                            "wikiner-fr-wp3",
                                            "wikiner-it-wp2",
                                            "wikiner-it-wp3",
                                            "wikiner-nl-wp2",
                                            "wikiner-nl-wp3",
                                            "wikiner-pl-wp3",
                                            "wikiner-pt-wp3",
                                            "wikiner-ru-wp2",
                                            "wikiner-ru-wp3"), 
                                   docs = -Inf){
  .N <- sentence_id <- doc_id <- txt <- NULL ## silence R CMD check notes about data.table non-standard evaluation variables
  
  msg <- suppressWarnings(try({
    type <- match.arg(type)
    temporary_file <- tempfile()
    if(type == "conll2002-nl"){
      url <- "https://raw.githubusercontent.com/bnosac-dev/nltk_data/gh-pages/packages/corpora/conll2002.zip"
      download.file(url, temporary_file, mode = "wb")
      rawdata <- list()
      f <- unz(temporary_file, filename = "conll2002/ned.train")
      rawdata$ned.train <- readLines(f, encoding = "UTF-8")
      close(f)
      f <- unz(temporary_file, filename = "conll2002/ned.testa")
      rawdata$testa <- readLines(f, encoding = "UTF-8")
      close(f) 
      f <- unz(temporary_file, filename = "conll2002/ned.testb")
      rawdata$testb <- readLines(f, encoding = "UTF-8")
      close(f)    
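      ## The Dutch CoNLL-2002 files contain one token per line as 'token POS label',
      ## with blank lines between sentences and a '-DOCSTART- -DOCSTART- O' line
      ## marking the start of each document; the parsing below relies on that layout.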
      rawdata <- lapply(rawdata, FUN=function(x){
        x <- data.frame(txt = x, stringsAsFactors = FALSE)
        x$doc_id <- cumsum(x$txt == "-DOCSTART- -DOCSTART- O")
        if(is.finite(docs) && docs > 0){
          d <- unique(x$doc_id)
          x <- x[x$doc_id %in% sample(d, size = min(docs, length(d))), ]
        }
        x$sentence_id <- cumsum(x$txt == "") + 1L
        x <- x[x$txt != "" & x$txt != "-DOCSTART- -DOCSTART- O", ]
        x$txt <- strsplit(x$txt, " ")
        x$token <- sapply(x$txt, FUN=function(x) x[1])
        x$pos <- sapply(x$txt, FUN=function(x) x[2])
        x$label <- sapply(x$txt, FUN=function(x) x[3])
        x[, c("doc_id", "sentence_id", "token", "pos", "label")]
      })
      rawdata <- data.table::rbindlist(rawdata, idcol = "data")
      rawdata$doc_id <- as.integer(factor(sprintf("%s-%s", rawdata$data, rawdata$doc_id)))
      file.remove(temporary_file)
    }else if(type == "conll2002-es"){
      url <- "https://raw.githubusercontent.com/bnosac-dev/nltk_data/gh-pages/packages/corpora/conll2002.zip"
      download.file(url, temporary_file, mode = "wb")
      rawdata <- list()
      f <- unz(temporary_file, filename = "conll2002/esp.train")
      rawdata$train <- readLines(f, encoding = "UTF-8")
      close(f)
      f <- unz(temporary_file, filename = "conll2002/esp.testa")
      rawdata$testa <- readLines(f, encoding = "UTF-8")
      close(f) 
      f <- unz(temporary_file, filename = "conll2002/esp.testb")
      rawdata$testb <- readLines(f, encoding = "UTF-8")
      close(f) 
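      ## The Spanish CoNLL-2002 files have the same 'token POS label' layout
      ## but carry no document markers; each blank-line-separated sentence is
      ## therefore treated as a separate document below.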
      rawdata <- lapply(rawdata, FUN=function(x){
        x <- data.frame(txt = x, stringsAsFactors = FALSE)
        x$doc_id <- cumsum(x$txt == "") + 1L
        if(is.finite(docs) && docs > 0){
          d <- unique(x$doc_id)
          x <- x[x$doc_id %in% sample(d, size = min(docs, length(d))), ]
        }
        x <- x[x$txt != "", ]
        x$txt <- strsplit(x$txt, " ")
        x$token <- sapply(x$txt, FUN=function(x) x[1])
        x$pos <- sapply(x$txt, FUN=function(x) x[2])
        x$label <- sapply(x$txt, FUN=function(x) x[3])
        x[, c("doc_id", "token", "pos", "label")]
      })
      rawdata <- data.table::rbindlist(rawdata, idcol = "data")
      rawdata$doc_id <- as.integer(factor(sprintf("%s-%s", rawdata$data, rawdata$doc_id)))
      file.remove(temporary_file)
    }else if(type == "GermanNER"){
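      ## GermanNER is a tab-separated file with one 'token<TAB>label' pair per
      ## line; blank-line-separated sentences become separate documents below.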
      rawdata <- readLines("https://raw.githubusercontent.com/bnosac-dev/FOX/master/input/GermanNER/full_train.tsv", encoding = "UTF-8")
      rawdata <- data.frame(txt = rawdata, stringsAsFactors = FALSE)
      rawdata$doc_id <- cumsum(rawdata$txt == "") + 1L
      rawdata <- rawdata[rawdata$txt != "", ]
      rawdata$txt <- strsplit(rawdata$txt, "\t")
      rawdata$token <- sapply(rawdata$txt, FUN=function(x) x[1])
      rawdata$label <- sapply(rawdata$txt, FUN=function(x) x[2])
      rawdata <- rawdata[, c("doc_id", "token", "label")]
    }else if(type %in% c("wikiner-de-wp2",
                         "wikiner-de-wp3",
                         "wikiner-en-wp2",
                         "wikiner-en-wp3",
                         "wikiner-es-wp2",
                         "wikiner-es-wp3",
                         "wikiner-fr-wp2",
                         "wikiner-fr-wp3",
                         "wikiner-it-wp2",
                         "wikiner-it-wp3",
                         "wikiner-nl-wp2",
                         "wikiner-nl-wp3",
                         "wikiner-pl-wp3",
                         "wikiner-pt-wp3",
                         "wikiner-ru-wp2",
                         "wikiner-ru-wp3")){
      url <- sprintf("https://raw.githubusercontent.com/bnosac-dev/FOX/master/input/Wikiner/aij-%s.bz2", type)
      download.file(url, temporary_file, mode = "wb")
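      ## WikiNER files contain one sentence per line, with tokens separated by
      ## spaces and each token encoded as 'token|POS|label'; the code below
      ## treats blank lines as document boundaries.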
      rawdata <- data.frame(txt = readLines(temporary_file, encoding = "UTF-8"), stringsAsFactors = FALSE)
      rawdata$doc_id <- cumsum(rawdata$txt == "")
      rawdata <- rawdata[rawdata$txt != "", ]
      rawdata <- data.table::setDT(rawdata)
      rawdata <- rawdata[, sentence_id := 1:.N, by = list(doc_id)]
      rawdata <- rawdata[, list(txt = unlist(strsplit(txt, " "))), by = list(doc_id, sentence_id)]
      rawdata <- rawdata[, c("token", "pos", "label") := data.table::tstrsplit(txt, "\\|")]
      rawdata <- data.table::setDF(rawdata)
      rawdata <- rawdata[, c("doc_id", "sentence_id", "token", "pos", "label")]
      file.remove(temporary_file)
    }
    data.table::setDT(rawdata)
  }, silent = TRUE))
  if(inherits(msg, "try-error")){
    return(msg)
  }
  rawdata
}
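
## Sketch (not run): one possible way to plug the downloaded training data
## into a CRF model with crf() from this package. The attributes built with
## crf_cbind_attributes() and the training options below are illustrative
## assumptions, not recommended settings.
if(FALSE){
  x <- as.data.frame(ner_download_modeldata("conll2002-nl"))
  ## add attributes of neighbouring tokens in a window around each token
  x <- crf_cbind_attributes(x, terms = c("token", "pos"),
                            by = c("doc_id", "sentence_id"))
  attributes <- grep("token|pos", colnames(x), value = TRUE)
  model <- crf(y = x$label, x = x[, attributes], group = x$doc_id,
               method = "lbfgs", options = list(max_iterations = 25))
  scores <- predict(model, newdata = x[, attributes], group = x$doc_id)
}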



#' @title Dutch reviews by AirBnB customers of Brussels address locations, available at www.insideairbnb.com
#' @description The data contains 500 reviews, written in Dutch, from people who visited an AirBnB apartment in Brussels. \cr
#' The data frame contains the fields 
#' \itemize{
#'  \item{doc_id: }{a unique identifier of the review}
#'  \item{listing_id: }{the airbnb address identifier}
#'  \item{text: }{text with the feedback of a customer on his visit to the AirBnB apartment}
#' }
#' @name airbnb
#' @docType data
#' @source \url{http://insideairbnb.com/brussels}: information as of 2015-10-03
#' @seealso \code{\link{airbnb_chunks}}
#' @examples
#' data(airbnb)
#' str(airbnb)
#' head(airbnb)
NULL
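
## Sketch (not run) using only the documented fields: how many reviews does
## each AirBnB listing have?
if(FALSE){
  data(airbnb)
  head(sort(table(airbnb$listing_id), decreasing = TRUE))
}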


#' @title Dutch reviews by AirBnB customers of Brussels address locations, manually tagged with entities
#' @description The \code{\link{airbnb}} dataset was manually annotated with the shiny app inside this R package.
#' The annotations are chunks of text which have been flagged with the categories PERSON, LOCATION and DISTANCE.
#' The dataset is an object of class \code{chunkrange}, which is also a data.frame, containing the following fields:
#' \itemize{
#'  \item{doc_id: }{a unique identifier of the review, which is also available in \code{\link{airbnb}}}
#'  \item{listing_id: }{the airbnb address identifier}
#'  \item{text: }{text with the feedback of a customer on his visit to the AirBnB apartment}
#'  \item{chunk_id: }{a chunk identifier}
#'  \item{chunk_entity: }{a chunk entity label}
#'  \item{chunk: }{the text of the chunk which is a substring of \code{text}}
#'  \item{start: }{the starting position in \code{text} where the \code{chunk} is found}
#'  \item{end: }{the end position in \code{text} where the \code{chunk} is found}
#' }
#' @name airbnb_chunks
#' @docType data
#' @seealso \code{\link{airbnb}}
#' @examples
#' data(airbnb_chunks)
#' str(airbnb_chunks)
#' head(airbnb_chunks)
NULL
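
## Sketch (not run), assuming that start/end are 1-based inclusive character
## offsets into the review text: recover each annotated chunk from the text
## and compare it with the stored chunk column.
if(FALSE){
  data(airbnb_chunks)
  extracted <- substr(airbnb_chunks$text, airbnb_chunks$start, airbnb_chunks$end)
  all(extracted == airbnb_chunks$chunk)
  ## distribution of the entity categories
  table(airbnb_chunks$chunk_entity)
}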
