R/wrap_documents.r
In tokenbrowser: Create Full Text Browsers from Annotated Token Lists

Documented in wrap_documents

# Build the header string for every document.
#
# meta     data.frame-like object with one row per document
# doc_col  name of the document id column in meta
# nav      navigation content per document (already formatted by the caller),
#          or NULL to leave the navigation element out of the header
#
# Returns the result of pasting, per document and separated by newlines, the
# title, the (optional) hidden navigation element and the meta tables.
# add_tag, tag_attr, attr_style and create_meta_tables are defined elsewhere
# in the package; presumably they produce html strings -- verify there.
create_doc_headers <- function(meta, doc_col='doc_id', nav=doc_col) {
  has_nav = !is.null(nav)

  doc_title = add_tag(meta[[doc_col]], 'doc_id')

  ## the nav content is placed in a div that is styled with display: none
  if (has_nav) {
    hidden_nav = add_tag(nav, 'div', tag_attr(style=attr_style(display="none")))
  }

  ## the document id column is passed as ignore_col to create_meta_tables
  meta_tables = create_meta_tables(meta, ignore_col = doc_col)

  if (has_nav) {
    stringi::stri_paste(doc_title, hidden_nav, meta_tables, sep='\n')
  } else {
    stringi::stri_paste(doc_title, meta_tables, sep='\n')
  }
}

# Paste the tokens of each document together into one html paragraph.
#
# tokens     data.frame with one row per token
# doc_col    name of the document id column
# token_col  name of the token column
# space_col  optional name of a column holding the string to put after each
#            token; if NULL or not a column of tokens, a single space is used
#
# Returns a character vector with one '<p>...</p>' string per document, in
# order of first appearance of each document in tokens.
wrap_tokens <- function(tokens, doc_col='doc_id', token_col='token', space_col=NULL){
  ## missing tokens are treated as empty strings
  tok = tokens[[token_col]]
  tok_missing = is.na(tok)
  if (any(tok_missing)) {
    ## a factor needs '' as a level before it can be assigned
    if (is.factor(tok)) levels(tok) = union(levels(tok), '')
    tok[tok_missing] = ''
  }

  ## split() orders its groups by the grouping value, so group on the index of
  ## first appearance to keep the documents in their original order
  doc_ids = tokens[[doc_col]]
  doc_index = match(doc_ids, unique(doc_ids))

  has_space_col = !is.null(space_col) && space_col %in% colnames(tokens)
  space = if (has_space_col) tokens[[space_col]] else ' '

  tokens[[token_col]] = stringi::stri_paste(tok, space, sep='')

  per_doc = split(tokens[[token_col]], f = doc_index)
  doc_text = stringi::stri_paste_list(per_doc, sep='')

  ## newlines become html line breaks
  doc_text = gsub('\\n', '<br>', doc_text)
  sprintf('<p>%s</p>', pretty_text_wrap(doc_text))
}




# Tidy up the spacing around punctuation and brackets in pasted token text.
#
# x  character vector
#
# Returns x with the substitutions below applied in order.
pretty_text_wrap <- function(x){
  rules = list(
    ## NOTE(review): as written this replaces a space with a space, which
    ## changes nothing. Presumably the pattern was meant to be something else
    ## (e.g. a double or non-breaking space) -- confirm against upstream.
    c(' ', ' '),
    ## drop the space before closing punctuation: . , ? ! : ; > )
    c(" ([.,?!:;>)])", '\\1'),
    ## drop the space after an opening ( or <
    c('([(<]) ', '\\1')
  )
  for (rule in rules) {
    x = gsub(rule[1], rule[2], x)
  }
  x
}

#top_category <- function(meta, tokens, category, doc_col){
#  agg = stats::aggregate(category, by=list(tokens[[doc_col]], category), FUN=length)
#  agg = agg[order(-agg$x),]
#  agg = agg[!duplicated(agg[[1]]),]
#  agg[[2]][match(meta[[doc_col]], agg[[1]])]
#}

#' Wrap tokens into document html strings
#'
#' Pastes the tokens into articles, and returns an <article> html element.
#'
#' @param tokens     A data.frame with a column for document ids (doc_col)
#'                   and a column for tokens (token_col)
#' @param meta       A data.frame with a column for document ids (doc_col). All other columns are added
#'                   to the browser as document meta. Can be NULL, in which case only the document ids
#'                   found in tokens are used.
#' @param doc_col    The name of the document id column
#' @param token_col  The name of the token column
#' @param space_col  Optionally, a column with space indications (e.g., newline) per token (which is how some NLP parsers indicate spaces)
#' @param nav        The column in meta used for nav. Defaults to doc_col. If NULL (and token_nav is not used),
#'                   no navigation tags are added to the document headers.
#' @param token_nav  Alternative to nav (which uses meta), a column in tokens used for navigation
#' @param top_nav    If token_nav is used, navigation filters will only apply to the top x values with highest token occurrence in a document
#' @param thres_nav  Like top_nav, but specifying a threshold for the minimum number of tokens.
#'
#' @return A named vector, with document ids as names and the document html strings as values
#' @export
#' @examples
#' docs = wrap_documents(sotu_data$tokens, sotu_data$meta)
#' head(names(docs))
#' docs[[1]]
wrap_documents <- function(tokens, meta, doc_col='doc_id', token_col='token', space_col=NULL, nav=doc_col, token_nav=NULL, top_nav=NULL, thres_nav=NULL) {
  if (!methods::is(tokens, 'data.frame')) tokens = as.data.frame(tokens)

  ## fail early with a clear message instead of an obscure error further down
  if (is.null(tokens[[doc_col]])) {
    stop(sprintf('doc_col "%s" is not a column in tokens', doc_col), call. = FALSE)
  }
  if (is.null(tokens[[token_col]])) {
    stop(sprintf('token_col "%s" is not a column in tokens', token_col), call. = FALSE)
  }

  ## documents are processed in order of first appearance in tokens
  doc_id = unique(tokens[[doc_col]])
  if (!is.null(meta)) {
    meta = as.data.frame(meta)
    if (is.null(meta[[doc_col]])) {
      stop(sprintf('doc_col "%s" is not a column in meta', doc_col), call. = FALSE)
    }
    ## align the meta rows with doc_id (documents without meta get an NA row)
    meta = meta[match(doc_id, meta[[doc_col]]), , drop=FALSE]
  } else {
    meta = data.frame(doc_id = doc_id)
    colnames(meta) = doc_col
  }

  ## token_nav takes precedence over nav; a NULL nav is passed on as is
  if (!is.null(token_nav)) {
    nav = token_nav_string(tokens, meta, doc_col, token_nav, top_nav, thres_nav)
  } else if (!is.null(nav)) {
    if (is.null(meta[[nav]])) {
      stop(sprintf('nav "%s" is not a column in meta', nav), call. = FALSE)
    }
    nav = sprintf('<tag>%s</tag>', meta[[nav]])
  }
  header = create_doc_headers(meta, doc_col = doc_col, nav = nav)

  texts = wrap_tokens(tokens, doc_col=doc_col, token_col=token_col, space_col=space_col)
  docs = stringi::stri_paste(header, texts, sep='\n')

  docs = add_tag(docs, 'article', tag_attr(insearch="1",infilter="1"))
  names(docs) = doc_id

  docs
}

Any scripts or data that you put into this service are public.

tokenbrowser documentation built on Oct. 23, 2020, 6:54 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

tokenbrowser
Create Full Text Browsers from Annotated Token Lists

R/wrap_documents.r
In tokenbrowser: Create Full Text Browsers from Annotated Token Lists

Defines functions wrap_documents pretty_text_wrap wrap_tokens create_doc_headers

Documented in wrap_documents

Try the tokenbrowser package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

tokenbrowser Create Full Text Browsers from Annotated Token Lists

R/wrap_documents.r In tokenbrowser: Create Full Text Browsers from Annotated Token Lists

Defines functions wrap_documents pretty_text_wrap wrap_tokens create_doc_headers

Documented in wrap_documents

Try the tokenbrowser package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

tokenbrowser
Create Full Text Browsers from Annotated Token Lists

R/wrap_documents.r
In tokenbrowser: Create Full Text Browsers from Annotated Token Lists