R/1_tcorpus.r

Defines functions get_context safe_selection is_tcorpus as.tcorpus.default as.tcorpus.tCorpus as.tcorpus summary.tCorpus rebuild_tcorpus refresh_tcorpus print.tCorpus

Documented in as.tcorpus as.tcorpus.default as.tcorpus.tCorpus print.tCorpus refresh_tcorpus summary.tCorpus

## Here the main tCorpus class is created
## Additional methods are specified in the matching r files (e.g. dtm method in document_term_matrix.r)

## The additional methods should where possible use only the accessor methods (instead of working with self$tokens and self$meta directly).
## This way, all validations, assumptions and optimizations stay intact

## this file has to be named 1_*, because in building the package the r files are processed in order by name,
## and the tCorpus class has to be created before the additional methods

tCorpus <- R6::R6Class("tCorpus",
   cloneable=FALSE,

   private = list(
     .p = list(),

     sync = function(){
       doc_ids = intersect(unique(self$tokens$doc_id), unique(self$meta$doc_id))
       self$tokens = self$tokens[list(doc_ids),]
       self$meta = self$meta[list(doc_ids),]
       private$set_keys()
     },

     check_unique_rows = function(d){
       if (anyDuplicated(d, by = c('doc_id','token_id'))) stop('After transformation, token_id is not unique within documents')
     },

     set_keys = function(){
       if (!identical(key(self$tokens), c('doc_id', 'token_id'))) setkey(self$tokens, 'doc_id', 'token_id')
       if (!identical(key(self$meta), c('doc_id'))) setkey(self$meta, 'doc_id')
     },

     droplevels = function(){
       self$tokens = base::droplevels(self$tokens)
       self$meta = base::droplevels(self$meta)
       invisible(self)
     },

     select_rows = function(selection, keep_meta=F) {
       selection = safe_selection(self$tokens, selection)
       self$tokens = subset(self$tokens, selection)
       if (!keep_meta) {
         self$meta = self$meta[as.character(unique(self$tokens$doc_id)),,nomatch=0]
         self$meta$doc_id = as.character(self$meta$doc_id)
       }
       private$set_keys()
     },

     select_meta_rows = function(selection, keep_data=F) {
       selection = safe_selection(self$meta, selection)
       self$meta = subset(self$meta, selection)
       self$meta$doc_id = as.character(self$meta$doc_id)
       if (!keep_data) self$tokens = self$tokens[as.character(unique(self$meta$doc_id)),,nomatch=0]
       private$set_keys()
     }

   ),

   public = list(
     help = function() ?tCorpus,

     tokens = NULL,
     meta = NULL,
     model = NULL,

     validate = function() {
        self$validate_tokens()
        self$validate_meta()
     },
     validate_tokens = function() {
        cnames = self$names
        e = c()
        if (!'doc_id' %in% cnames) e = c(e, '\t- tokens data does not have a "doc_id" column.')
        if (!'token_id' %in% cnames) e = c(e, '\t- tokens data does not have "token_id" column.')
        if (any(grepl('^\\.', cnames))) e = c(e, '\t- column names in a tCorpus cannot start with a dot. You can change the column names in $tokens or with the $set_name method')
        if (length(e) > 0) {
          e_message = paste0('TOKEN DATA BROKEN:\n', paste(e, collapse='\n'))
          stop(e_message)
        }

        if (!identical(data.table::key(self$tokens), c('doc_id','token_id'))) data.table::setkeyv(self$tokens, c('doc_id','token_id'))
     },

     validate_meta = function() {
       cnames = self$meta_names
       e = c()
       if (!'doc_id' %in% cnames) e = c(e, '\t- meta data does not have a "doc_id" column. You can try to fix the data in $meta')
       if (any(grepl('^\\.', cnames)))  e = c(e, '\t- column names in a tCorpus cannot start with a dot. You can change the column names in $meta or with the $set_meta_name method')
       if (length(e) > 0) {
         e_message = paste0('META DATA BROKEN:\n', paste(e, collapse='\n'))
         stop(e_message)
       }

       if (!identical(data.table::key(self$meta), 'doc_id')) data.table::setkeyv(self$meta, 'doc_id')
     },

     initialize = function(tokens, meta, model) {
       self$tokens = data.table(tokens)
       self$meta = data.table(meta)
       self$model = model
       private$set_keys()
     },

     finalize = function() {
       #forget_all_mem()
     },

     copy = function(){
       tCorpus$new(tokens = data.table::copy(self$tokens),
                   meta = data.table::copy(self$meta),
                   model = self$model)
     },

## SHOW/GET DATA METHODS ##
    get = function(columns=NULL, keep_df=F, as.df=F, subset=NULL, doc_id=NULL, token_id=NULL, copy=F) {
      self$validate_tokens()

      if (class(substitute(subset)) %in% c('call', 'name')) subset = self$eval(substitute(subset), parent.frame())
      if (!is.null(doc_id) && !is.null(subset)) stop('Cannot filter by subset and doc_ids at the same time')
      if (is.null(doc_id) && !is.null(token_id)) stop('token_id can only be given in pairs with doc_id')

      if (is.null(doc_id) && is.null(subset)) {
        if (is.null(columns)) {
          d = self$tokens
        } else {
          d = self$tokens[,columns,with=F]
        }
      } else {
        if (is.null(columns)) columns = colnames(self$tokens)
        if (!is.null(subset)) d = self$tokens[subset,columns,with=F]
        if (!is.null(doc_id)) {
          if (is.null(token_id)) {
            positions = list(doc_ids = as.character(doc_id))
            d = self$tokens[positions,columns,with=F]
          } else {
            if (!length(doc_id) == length(token_id)) stop('token_id can only be given in pairs with doc_id')
            positions = list(doc_ids = as.character(doc_id), token_is = as.numeric(token_id))
            d = self$tokens[positions,columns,with=F]
          }
        }
      }

      if (ncol(d) == 1 && !keep_df) d = d[,1][[1]]
      if (as.df) d = as.data.frame(d)
      if(copy) data.table::copy(d) else d
    },

    get_levels = function(column){
      if (!column %in% self$names) stop(sprintf('"%s" column does not exists in tokens', column))
      if (!is.factor(self$tokens[[column]])) stop(sprintf('"%s" is not a factor', column))
      data.table::copy(levels(self$tokens[[column]]))
    },

    get_meta = function(columns=NULL, keep_df=F, as.df=F, per_token=F, subset=NULL, doc_id=NULL, copy=T) {
      self$validate()

      if (class(substitute(subset)) %in% c('call', 'name')) subset = self$eval_meta(substitute(subset), parent.frame())
      if (!is.null(doc_id) && !is.null(subset)) stop('Cannot filter by subset and doc_ids at the same time')


      if (is.null(doc_id) && is.null(subset)) {
        if (is.null(columns)) {
          d = self$meta
        } else {
          d = self$meta[,columns,with=F]
        }
      } else {
        if (is.null(columns)) columns = colnames(self$meta)
        if (!is.null(doc_id)) {
          doc_ids = as.character(doc_id)
          d = self$meta[list(doc_ids),columns,with=F]
        }
        if (!is.null(subset)) self$meta[subset,columns,with=F]
      }

      if (ncol(d) == 1 && !keep_df) d = d[,1][[1]]
      if (as.df) d = as.data.frame(d)
      if (per_token) {
        exp_i = match(self$tokens$doc_id, self$meta$doc_id)
        d = if (inherits(d, 'data.frame')) d[exp_i,,drop=!keep_df & !is.null(columns)] else d[exp_i]
      }
      if(copy) data.table::copy(d) else d
    },

    get_meta_levels = function(column){
      if (!column %in% self$meta_names) stop(sprintf('"%s" column does not exists in meta', column))
      if (!is.factor(self$meta[[column]])) stop(sprintf('"%s" is not a factor', column))
      data.table::copy(levels(self$meta[[column]]))
    },

    context = function(context_level = c('document','sentence'), with_labels=T){
       get_context(self, context_level = context_level, with_labels=with_labels)
     },

    eval = function(x, enclos=parent.frame()) eval(x, self$tokens, enclos),
    eval_meta = function(x, enclos=parent.frame()) eval(x, self$meta, enclos),

    get_token_id = function(doc_id=NULL, token_id=NULL, subset=NULL, subset_meta=NULL, window=NULL, inverse=F){
      self$validate()

      if (class(substitute(subset)) %in% c('call', 'name')) subset = self$eval(substitute(subset), parent.frame())
      if (class(substitute(subset_meta)) %in% c('call', 'name')) subset_meta = self$eval_meta(substitute(subset_meta), parent.frame())
      if (is.null(doc_id) && !is.null(token_id)) stop('token_id can only be given in pairs with doc_id')

      ## enable subset to be called from a character string. (e.g. used in search_features)
      if(inherits(subset, 'character')) subset = eval(parse(text=subset), self$tokens, parent.frame())
      if(inherits(subset_meta, 'character')) subset_meta = eval(parse(text=subset_meta), self$meta, parent.frame())

      i = NULL
      if (!is.null(doc_id)) {
        pos = if (is.null(token_id)) list(doc_id) else list(doc_id, token_id)
        i = self$tokens[pos, which=T]
      }
      if (!is.null(subset)) {
        subset[is.na(subset)] = F
        i = if (is.null(i)) which(subset) else intersect(i, which(subset))
      }
      if (!is.null(subset_meta)) {
        subset_meta[is.na(subset_meta)] = F
        doc_ids = self$get_meta('doc_id')[subset_meta]
        meta_i = self$tokens[list(doc_ids), which=T]
        i = if (is.null(i)) meta_i else intersect(i, meta_i)
      }

      if (!is.null(window)) i = which(i_window(self, i=i, window=window))
      if (inverse) i = !1:self$n %in% i
      i
    },


## DATA MODIFICATION METHODS ##

     set = function(column, value, subset=NULL, subset_value=T){
       self$validate_tokens()

       if (column == 'doc_id') stop('Cannot change doc_id. If you want to change doc_id labels, you can overwrite $doc_id_levels.')
       if (class(substitute(subset)) %in% c('call', 'name')) subset = self$eval(substitute(subset), parent.frame())
       if (class(substitute(value)) %in% c('call', 'name')) value = self$eval(substitute(value), parent.frame())
       if (grepl('^\\.', column)) stop('column names in a tCorpus cannot start with a dot')

       if (!is.null(subset)){
         if (subset_value && length(value) > 1) value = value[subset]
         if (!column %in% colnames(self$tokens)) {
           self$tokens[,(column) := NA]
           if (is.factor(value)) {
             self$tokens[,(column) := fast_factor(self$tokens[[column]])]
           } else {
             self$tokens[,(column) := methods::as(self$tokens[[column]], class(value))]
           }
         }

         if (column %in% c('sentence','token_id')) {  ## for position columns, first perform checks (inefficient, but this should be a rare case anyway)
            if (!is.numeric(value)) stop('position column has to be numeric/integer')
            value = as.numeric(value)
            mod = if ('sentence' %in% self$names) self$tokens[,c('doc_id','sentence','token_id')] else self$tokens[,c('doc_id','token_id')]
            mod[subset, (column) := value]
            check_unique_rows(mod)
         }

         .value = value
         self$tokens[subset, (column) := .value]


       } else {
         if (column %in% c('sentence','token_id')) {
           if (!is.numeric(value)) stop('position column has to be numeric/integer')
           value = as.numeric(value)
           mod = if ('sentence' %in% self$names) self$tokens[,c('doc_id','sentence','token_id')] else self$tokens[,c('doc_id','token_id')]
           suppressWarnings(mod[, (column) := value])
           check_unique_rows(mod)
         }
         if (is.character(value)) value = fast_factor(value)  ## force the use of factors for character vectors.
         ## ugly suppress. Should look into why data.table give the (seemingly harmless) internal.selfref warning
         suppressWarnings(self$tokens[,(column) := value])
       }

       ## the assign by reference in data.table sometimes adds NA as a factor level...
       if (anyNA(levels(self$tokens[[column]]))) {
         self$tokens[,(column) := fast_factor(self$tokens[[column]])]
       }

       private$set_keys()
       self$tokens[]
       self$meta[]
       invisible(self)
     },

    merge = function(df, by=NULL, by.x=NULL, by.y=NULL, columns=NULL) {
      if (is.null(by) && is.null(by.x) && is.null(by.y)) by = intersect(self$names, colnames(df))
      if (!all(c(by,by.x) %in% self$names)) stop('Not all columns specified in by / by.x exist in $tokens')
      if (!all(c(by,by.y) %in% colnames(df))) stop('Not all columns specified in by / by.y exist in df')
      
      if (inherits(df, 'data.table'))
        if (any(duplicated(df[,c(by,by.y), with=F]))) stop('Columns specified in by (or by.y) must be unique in df')
      else
        if (any(duplicated(df[,c(by,by.y)]))) stop('Columns specified in by (or by.y) must be unique in df')
      
      if (!is.null(columns)) {
        if (!is.null(by)) 
          df = df[,c(by, columns)] 
        else
          df = df[,c(by.y, columns)]
      }
        
      cnames = setdiff(colnames(df), c(by,by.x,by.y))
      in_tc = intersect(cnames, self$names)
      if (length(in_tc) > 0) stop(sprintf('DUPLICATE COLUMN NAMES: Some columns that are to be merged have names that are already used in $tokens (%s)', paste(in_tc, collapse=', ')))
      
      if (!is.null(by)) {
        self$tokens = merge(self$tokens, data.table::as.data.table(df), by=by, all.x=T)
      } else { 
        self$tokens = merge(self$tokens, data.table::as.data.table(df), by.x=by.x, by.y=by.y, all.x=T)
      }
      self$validate_tokens()
      invisible(self)
    },

     set_levels = function(column, levels) {
       if (!column %in% self$names) stop(sprintf('"%s" column does not exists in tokens', column))
       if (!is.factor(self$tokens[[column]])) stop(sprintf('"%s" is not a factor', column))
       if (!length(levels) == length(levels(self$tokens[[column]]))) stop('new levels of different length than current levels')
       if (column == 'doc_id') {
         self$meta$doc_id = levels[match(levels(self$tokens$doc_id), self$meta$doc_id)]
       }
       data.table::setattr(self$tokens[[column]], 'levels', levels)
     },

     delete_columns = function(cnames){
       protected_cols = intersect(self$names, c('doc_id', 'token_id'))
       if (any(protected_cols %in% cnames)) stop("The position columns doc_id and token_id cannot be deleted")
       for (col in cnames) self$tokens[,(col) := NULL]
       self$tokens[]
       invisible(self)
     },

     set_name = function(oldname, newname) {
       if (oldname %in% c('doc_id','token_id')) stop('The position columns (doc_id, token_id) cannot be set or changed')
       if (grepl('^\\.', newname)) stop('column names in a tCorpus cannot start with a dot')

       data.table::setnames(self$tokens, oldname, newname)
       self$tokens[]
       invisible(self)
     },

     set_meta = function(column, value, subset=NULL, subset_value=T){
       self$validate_meta()

       if (column == 'doc_id') stop('Cannot change doc_id. If you want to change doc_id labels, you can overwrite $doc_id_levels.')
       if (class(substitute(subset)) %in% c('call', 'name')) subset = self$eval_meta(substitute(subset), parent.frame())
       if (class(substitute(value)) %in% c('call', 'name')) value = self$eval_meta(substitute(value), parent.frame())

       if (grepl('^\\.', column)) stop('column names in a tCorpus cannot start with a dot')

       if (!is.null(subset)){
         if (subset_value && length(value) > 1) value = value[subset]

         if (!column %in% colnames(self$meta)) {
           self$meta[[column]] = NA
           if (is.factor(value)) {
             self$meta[[column]] = fast_factor(self$meta[[column]])
           } else {
             self$meta[,(column) := methods::as(self$meta[[column]], class(value))]
           }
         }

         .value = value
         self$meta[subset, (column) := .value]

       } else {
         ## ugly suppress. Should look into why data.table give the (seemingly harmless) internal.selfref warning
         suppressWarnings(self$meta[,(column) := value])
       }

       ## strangely, the assign by reference in data.table sometimes adds NA as a factor level...
       if (anyNA(levels(self$meta[[column]]))) {
         self$meta[,(column) := fast_factor(self$meta[[column]])]
       }
       self$tokens[]
       self$meta[]
       invisible(self)
     },

      merge_meta = function(df, by=NULL, by.x=NULL, by.y=NULL) {
        if (is.null(by) && is.null(by.x) && is.null(by.y)) return(invisible(self))
        if (!all(c(by,by.x) %in% self$meta_names)) stop('Not all columns specified in by / by.x exist in $meta')
        if (!all(c(by,by.x) %in% colnames(df))) stop('Not all columns specified in by / by.y exist in df')
        
        if (inherits(df, 'data.table'))
          if (any(duplicated(df[,c(by,by.y), with=F]))) stop('Columns specified in by (or by.y) must be unique in df')
        else
          if (any(duplicated(df[,c(by,by.y)]))) stop('Columns specified in by (or by.y) must be unique in df')
        
        
        cnames = setdiff(colnames(df), c(by,by.x,by.y))
        in_tc = intersect(cnames, self$meta_names)
        if (length(in_tc) > 0) stop(sprintf('DUPLICATE COLUMN NAMES: Some columns that are to be merged have names that are already used in $meta (%s)', paste(in_tc, collapse=', ')))
        
        if (!is.null(by))
          self$meta = merge(self$meta, data.table::as.data.table(df), by=by, all.x=T)
        else 
          self$meta = merge(self$meta, data.table::as.data.table(df), by.x=by.x, by.y=by.y, all.x=T)
        self$validate_meta()
        invisible(self)
      },

     set_meta_levels = function(column, levels) {
        if (!column %in% self$meta_names) stop(sprintf('"%s" column does not exists in meta', column))
        if (!is.factor(self$meta[[column]])) stop(sprintf('"%s" is not a factor', column))
        data.table::setattr(self$meta[[column]], 'levels', levels)
      },

     delete_meta_columns = function(cnames){
        protected_cols = intersect(self$names, c('doc_id'))
        if (any(protected_cols %in% cnames)) stop('doc_id cannot be deleted')
        for (col in cnames) self$meta[,(col) := NULL]
        self$meta[]
        invisible(self)
      },

     set_meta_name = function(oldname, newname) {
       if (oldname %in% c('doc_id')) stop('The doc_id column cannot be set or changed')
       if (grepl('^\\.', newname)) stop('column names in a tCorpus cannot start with a dot')
       setnames(self$meta, oldname, newname)
       self$meta[]
       invisible(self)
     },

     subset = function(subset=NULL, subset_meta=NULL, hits=NULL, window=NULL, copy=F, ...){
       self$validate()

       if (class(substitute(subset)) %in% c('call', 'name')) subset = self$eval(substitute(subset), parent.frame())
       if (class(substitute(subset_meta)) %in% c('call', 'name')) subset_meta = self$eval_meta(substitute(subset_meta), parent.frame())
       if (copy) {
         selfcopy = self$copy()$subset(subset=subset, subset_meta=subset_meta, hits=hits, window=window, copy=F)
         return(selfcopy)
       }

       if (!is.null(subset_meta)) subset_meta[is.na(subset_meta)] = F

       if (!is.null(subset)){
         subset[is.na(subset)] = F
         if (!is.null(window)){
           global_i = get_global_i(self, max_window_size=window)
           global_r = global_i[subset]
           global_window = rep(global_r, window*2 + 1) + rep(-window:window, each=length(global_r)) ## add window
           subset = global_i %in% global_window
         }
       }

       ## subset the data, using different solutions if one or both subsets (data and meta) are used for optimalisation
       if (!is.null(subset_meta) && !is.null(subset)) {
         private$select_meta_rows(subset_meta, keep_data = T) ## if both subsets are used, first perform both without subseting the other, then match on doc_ids.
         private$select_rows(subset, keep_meta = T)           ## (we cannot subset one before the other, because subset call's can contains vectors for which the rows should match)
         private$sync()
       }
       if (!is.null(subset_meta) && is.null(subset)) {
         private$select_meta_rows(subset_meta, keep_data = F)
       }
       if (is.null(subset_meta) && !is.null(subset)) {
         private$select_rows(subset, keep_meta = F)
       }

       private$droplevels()
       self$tokens[]
       self$meta[]
       if (!is.factor(self$tokens$doc_id)) self$tokens$doc_id = fast_factor(self$tokens$doc_id)
       invisible(self)
     },

     subset_meta = function(subset=NULL, copy=F){
        ## subset also has a subset_meta argument, but we add this for consistency with other _meta methods
        if (class(substitute(subset)) %in% c('call', 'name')) subset = self$eval_meta(substitute(subset), parent.frame())
        .subset = subset
        self$subset(subset_meta = .subset, copy=copy)
      },

      indices = function() data.table::indices(self$tokens),
      clear_indices = function() data.table::setindex(self$tokens, NULL)
   ),



   active = list(
     n = function() nrow(self$tokens),
     n_meta = function() nrow(self$meta),
     feature_names = function(e=NULL) {
       if (!is.null(e)) stop('Cannot change tcorpus$feature_names by assignment. Instead, use the set_name() function')
       fnames = colnames(self$tokens)[!colnames(self$tokens) %in% c('doc_id','sentence','token_id')]
     },
     names = function(e=NULL) {
       if (!is.null(e)) stop('Cannot change tcorpus$names by assignment. Instead, use the set_name() function')
       colnames(self$tokens)
     },
     meta_names = function(e=NULL) {
       if (!is.null(e)) stop('Cannot change tcorpus$meta_names by assignment. Instead, use the set_meta_name() function')
       colnames(self$meta)
     },

     doc_id_levels = function(mod=NULL) {
       if (!is.null(mod)){
         self$meta$doc_id = mod[match(levels(self$tokens$doc_id), self$meta$doc_id)]
         levels(self$tokens$doc_id) = mod
       }
       levels(self$get('doc_id'))
     }
   )
)

#' S3 print for tCorpus class
#'
#' @param x a tCorpus object
#' @param ... not used
#'
#' @method print tCorpus
#' @examples
#' tc = create_tcorpus(c('First text', 'Second text'))
#' print(tc)
#' @export
print.tCorpus <- function(x, ...) {
  sent_info = if ('sentence' %in% x$names) paste(' and sentences (n = ', nrow(unique(x$get(c('doc_id','sentence')))), ')', sep='') else ''
  cat('tCorpus containing ', x$n, ' tokens',
      '\ngrouped by documents (n = ', x$n_meta, ')', sent_info,
      '\ncontains:',
      '\n  - ', length(x$names), ' column', if (length(x$names) > 1) 's', ' in $tokens', ':\t', paste(x$names, collapse=', '),
      '\n  - ', length(x$meta_names), ' column', if (length(x$meta_names) > 1) 's', ' in $meta', ':  \t', paste(x$meta_names, collapse=', '),
      '\n', sep='')
}


#' Refresh a tCorpus object using the current version of corpustools
#'
#' As an R6 class, tCorpus contains its methods within the class object (i.e. itself). Therefore, if you use a new version of corpustools with an older tCorpus object (e.g., stored as a .rds. file), then the methods are not automatically updated. You can then use refresh_tcorpus() to reinitialize the tCorpus object with the current version of corpustools.
#'
#' @param tc a tCorpus object
#'
#' @return a tCorpus object
#' @examples
#' tc = create_tcorpus(c('First text', 'Second text'))
#' refresh_tcorpus(tc)
#' @export
refresh_tcorpus <- function(tc){
  tCorpus$new(tokens = data.table::copy(tc$get(keep_df = T)),
              meta = data.table::copy(tc$get_meta(keep_df = T)),
              model = tc$model)
}

rebuild_tcorpus <- function(tc) {
  tokens_to_tcorpus(tokens = tc$get(),
                    doc_col = 'doc_id',
                    sentence_col = ifelse('sentence' %in% tc$names, T, F),
                    token_id_col = 'token_id',
                    meta = tc$get_meta())
}

#' Summary of a tCorpus object
#'
#' @param object A tCorpus object
#' @param ... not used
#'
#' @method summary tCorpus
#' @examples
#' tc = create_tcorpus(c('First text', 'Second text'))
#' summary(tc)
#' @export
summary.tCorpus <- function(object, ...) object

#' Force an object to be a tCorpus class
#'
#' @param x the object to be forced
#' @param ... not used
#'
#' @export
as.tcorpus <- function(x, ...) UseMethod('as.tcorpus')

#' Force an object to be a tCorpus class
#'
#' @param x the object to be forced
#' @param ... not used
#'
#' @examples
#' tc = create_tcorpus(c('First text', 'Second text'))
#' as.tcorpus(tc)
#' @export
as.tcorpus.tCorpus <- function(x, ...) x

#' Force an object to be a tCorpus class
#'
#' @param x the object to be forced
#' @param ... not used
#'
#' @examples
#' \dontrun{
#' x = c('First text','Second text')
#' as.tcorpus(x) ## x is not a tCorpus object
#' }
#' @export
as.tcorpus.default <- function(x, ...) stop('x has to be a tCorpus object')
## params: preprocess_params=list, filter_params,

is_tcorpus <- function(x){
  if (!class(x)[1] %in% c('tCorpus')) stop('not a tCorpus object')
  TRUE
}

###  utility

safe_selection <- function(d, selection){
  if (any(is.na(selection))) stop('selection cannot contain NA')
  if (!is.numeric(selection) && !is.logical(selection)) stop('selection has to be either a logical vector or a numerical vector (indices for TRUE values)')
  if (is.numeric(selection)) selection = 1:nrow(d) %in% selection
  selection
}


get_context <- function(tc, context_level = c('document','sentence'), with_labels=T){
  context_level = match.arg(context_level)

  if (context_level == 'document') {
    context = tc$get('doc_id')
    if (!with_labels) levels(context) = 1:length(levels(context))
  }
  if (context_level == 'sentence') {
    if (!'sentence' %in% tc$names) stop('Sentence level not possible, since no sentence information is available. To enable sentence level analysis, use split_sentences = T in "create_tcorpus()" or specify sentence_col in "tokens_to_tcorpus()"')
    d = tc$get(c('doc_id','sentence'))
    if (with_labels){
      ucontext = unique(d[,c('doc_id','sentence')])
      ucontext = stringi::stri_paste(ucontext$doc_id, ucontext$sentence, sep=' #')
      context = fast_factor(global_position(d$sentence, d$doc_id, presorted = T, position_is_local=T))
      levels(context) = ucontext
    } else {
      fast_factor(global_position(d$sentence, d$doc_id, presorted = T, position_is_local=T))
      context = fast_dummy_factor(global_position(d$sentence, d$doc_id, presorted = T, position_is_local=T))
    }
  }
  context
}
kasperwelbers/tcorpus documentation built on May 10, 2023, 5:10 p.m.