R/generic_reshape.r

Defines functions add_sub_id increment_sub_id get_fill_nodes get_node_position get_linked_node_vars get_node_vars .nodes_eval add_to_fill add_to_tokens is_deparsed_call remove_isolate_fill do_copy_fill copy_fill copy_nodes do_remove_fill remove_fill remove_nodes mutate_nodes subset_nodes selected_nodes unselect_nodes reselect_nodes select_nodes

Documented in copy_fill copy_nodes mutate_nodes remove_fill remove_nodes reselect_nodes selected_nodes select_nodes subset_nodes unselect_nodes

#' Apply tquery to initiate reshape operations
#'
#' @param tokens            A tokenIndex data.table, or any data.frame coercible with \link{as_tokenindex}.
#' @param tquery            A \link{tquery} that selects and labels the nodes that are used in the reshape operations
#' @param fill              Logical, should fill be used?
#' @param fill_only_first   Logical, should a node only be filled once, with the nearest (first) labeled node?
#' @param .one_per_sentence If true, only one match per sentence is used, giving priority to paterns closest to the root (or fartest from the root if .order = -1). 
#'                          This is sometimes necessary to deal with recursion.
#' @param .order            If .one_per_sentence is used, .order determines whether the paterns closest to (1) or farthest away (-1) are used.
#'                        
#' @return A tokenIndex with a .nodes attribute, that enables the use of reshape operations on the selected nodes
#' @export
#' @examples 
#' tokens = tokens_spacy[tokens_spacy$doc_id == 'text4',]
#' 
#' ## use a tquery to label the nodes that you want to manipulate
#' tq = tquery(relation = "relcl", label = "relative_clause")
#' 
#' ## apply query to select nodes
#' tokens2 = select_nodes(tokens, tq) 
#' 
#' ## as an example, we make the parent of the relative_clause
#' ## nodes NA, effectively cutting of the relcl from the tree
#' tokens2 = mutate_nodes(tokens2, "relative_clause", parent=NA)
#' 
#' tokens2
#' 
#' \donttest{
#' if (interactive()) plot_tree(tokens2)
#' 
#' ## this is designed to work nicely with magrittr piping
#' if (interactive()) {
#' tokens %>%
#'   select_nodes(tq) %>%
#'   mutate_nodes("relative_clause", parent=NA) %>%
#'   plot_tree()
#' }
#' }
select_nodes <- function(tokens, tquery, fill=TRUE, fill_only_first=TRUE, .one_per_sentence=FALSE, .order=1){
  if (rsyntax_threads() != data.table::getDTthreads()) {
    old_threads = data.table::getDTthreads()
    on.exit(data.table::setDTthreads(old_threads))
    data.table::setDTthreads(rsyntax_threads())
  }
  
  .ROOT_DIST = NULL
  
  ## fill mode: all, only_first, 
  tokens = as_tokenindex(tokens)
  nodes = find_nodes(tokens, tquery, fill = FALSE, melt = FALSE, root_dist = .one_per_sentence)
  
  if (!is.null(nodes)) {
    if (.one_per_sentence) {
      if (!.order %in% c(-1,1)) stop('.order has to be 1 or -1')
      nodes = data.table::setorderv(nodes, '.ROOT_DIST', order = .order)
      nodes = nodes[!duplicated(nodes, by=c('doc_id','sentence'), fromLast = FALSE),]
      nodes[,.ROOT_DIST := NULL]
    }
    if (fill) {
      if (fill_only_first) {
        ids = unique(add_fill(tokens, nodes, tquery, block=nodes))
      } else {
        ids = unique(add_fill(tokens, nodes, tquery, block=NULL))
      }
    } else ids = NULL
    
    if (!is.null(ids)) {
      ids = melt_nodes_list(ids, fill_only_first=fill_only_first)
      ids = unique(ids, by=c('doc_id','sentence','token_id'))
      
      is_fill = ids$.FILL_LEVEL > 0
      fill_table = subset(ids, is_fill)
      #if (nrow(fill_table) == 0) browser()
      data.table::setindexv(fill_table, '.ROLE')
    }
    l = list(nodes=nodes, fill=fill_table, prov = list(tquery=tquery, fill=fill, fill_only_first=fill_only_first))
  } else {
    l = list(nodes =data.table::data.table(), fill=data.table::data.table(), prov = list(tquery=tquery, fill=fill, fill_only_first=fill_only_first))
  }
  
  data.table::setattr(tokens, '.nodes', value = l)  ## .nodes is assigned to attribute. This way it will automatically be deleted
                                                    ## if tokens is changes (which in this case is convenient)
  tokens[]
}


#' Within a chain of reshape operations, reapply the tquery
#'
#' @param .tokens A tokenIndex in which nodes are selected with \link{select_nodes}.  
#'
#' @return A tokenIndex with a .nodes attribute
#' @export
#' @examples
#' tokens = tokens_spacy[tokens_spacy$doc_id == 'text4',]
#' 
#' ## use a tquery to label the nodes that you want to manipulate
#' tq = tquery(relation = "relcl", label = "relative_clause")
#' 
#' ## apply query to select nodes
#' tokens2 = select_nodes(tokens, tq) 
#' 
#' ## reuses the tq, that is stored in tokens2
#' ## this makes it easy to make the selection anew after a transformation
#' tokens2 = reselect_nodes(tokens2)
reselect_nodes <- function(.tokens) {
  .nodes = selected_nodes(.tokens)
  select_nodes(.tokens, tquery = .nodes$prov$tquery, fill = .nodes$prov$fill, fill_only_first = .nodes$prov$fill_only_first)
}


#' Undo select_nodes
#'
#' Not strictly required. Only available for elegance and minor memory efficiency
#'
#' @param .tokens A tokenIndex in which nodes are selected with \link{select_nodes}.  
#'
#' @return A tokenIndex (without a .nodes attribute)
#' @export
#' @examples
#' tokens = tokens_spacy[tokens_spacy$doc_id == 'text4',]
#' 
#' tq = tquery(relation = "relcl", label = "relative_clause")
#' tokens = select_nodes(tokens, tq) 
#' selected_nodes(tokens)
#' 
#' tokens = unselect_nodes(tokens)
#' 
#' is.null(attr(tokens, '.nodes'))
unselect_nodes <- function(.tokens) {
  data.table::setattr(.tokens, '.nodes', NULL)
  .tokens
}

#' If select_nodes() is used, the selected nodes can be extracted with selected_nodes().
#' This is mainly for internal use, but it can also be usefull for debugging, and to controll
#' loops of reshape operation (e.g. break if no selected nodes left)
#'
#' @param .tokens A tokenIndex in which nodes are selected with \link{select_nodes}.  
#'
#' @return A tokenIndex with a .nodes attribute
#' @export
#' @examples 
#' tokens = tokens_spacy[tokens_spacy$doc_id == 'text4',]
#' 
#' ## use a tquery to label the nodes that you want to manipulate
#' tq = tquery(relation = "relcl", label = "relative_clause")
#' 
#' ## apply query to select nodes
#' tokens2 = select_nodes(tokens, tq) 
#' 
#' ## Get selected nodes from tokenindex
#' selected_nodes(tokens2)
selected_nodes <- function(.tokens) {
  if (is.null(attr(.tokens, '.nodes'))) stop('.tokens data.table does not have selected nodes. Please use select_nodes(tokens, tquery, ...) before using this function. See ?select_nodes documentation for details')
  attr(.tokens, '.nodes')
}



#' Subset a select_nodes selection 
#' 
#' Enables more control in reshape operations
#'
#' @param .tokens A tokenIndex in which nodes are selected with \link{select_nodes}.  
#' @param subset  A subset expression (that evaluates to a logical vector). The token column for each labeled node in the tquery can be referred to as label$column.
#' @param copy    If TRUE, make a deep copy of .tokens. Use if output does not overwrite .tokens
#'
#' @return A tokenIndex with a .nodes attribute
#' @export
#' @examples
#' tokens = tokens_spacy[tokens_spacy$doc_id == 'text4',]
#' 
#' ## use a tquery to label the nodes that you want to manipulate
#' tq = tquery(label='verb', children(relation='nsubj'))
#' 
#' ## apply query to select nodes
#' tokens2 = select_nodes(tokens, tq) 
#' 
#' selected_nodes(tokens2)$nodes
#' tokens2 = subset_nodes(tokens2, verb$relation == 'ROOT')
#' selected_nodes(tokens2)$nodes
subset_nodes <- function(.tokens, subset, copy=TRUE) {
  if (rsyntax_threads() != data.table::getDTthreads()) {
    old_threads = data.table::getDTthreads()
    on.exit(data.table::setDTthreads(old_threads))
    data.table::setDTthreads(rsyntax_threads())
  }
  
  copy=TRUE ## make optional if tested
  .nodes = selected_nodes(.tokens)
  if (nrow(.nodes$nodes) == 0) return(.tokens)
  if (copy) .tokens = data.table::copy(.tokens)
  
  .tokens = data.table::copy(.tokens)
  linked_node_vars = get_linked_node_vars(.tokens, .nodes)
  
  if (!is_deparsed_call(subset)) subset = deparse(substitute(subset)) 
  subset = if (subset == 'NULL') NULL else eval(parse(text=subset), envir=linked_node_vars, parent.frame())
  
  if (is.null(subset)) if (copy) .tokens[] else invisible(.tokens)
  if (all(subset)) if (copy) .tokens[] else invisible(.tokens)
  
  .nodes$nodes = .nodes$nodes[subset,]
  node_ids = unique(as.character(.nodes$nodes$.ID))
  .nodes$fill = .nodes$fill[list(.ID=node_ids), on='.ID', nomatch=0]
  
  data.table::setattr(.tokens, '.nodes', value = .nodes)
  if (copy) .tokens[] else invisible(.tokens)
}

#' Mutate nodes
#'
#' @param .tokens A tokenIndex in which nodes are selected with \link{select_nodes}.  
#' @param node    The name of the node that is to be mutated
#' @param ...     named arguments. The name should be a column in tokens
#' @param subset  A subset expression (that evaluates to a logical vector). The token column for each labeled node in the tquery can be referred to as label$column.
#'
#' @return A tokenIndex with a .nodes attribute
#' @export
#' @examples 
#' tokens = tokens_spacy[tokens_spacy$doc_id == 'text4',]
#' 
#' ## use a tquery to label the nodes that you want to manipulate
#' tq = tquery(relation = "relcl", label = "relative_clause")
#' 
#' ## apply query to select nodes
#' tokens2 = select_nodes(tokens, tq) 
#' 
#' ## as an example, we make the parent of the relative_clause
#' ## nodes NA, effectively cutting of the relcl from the tree
#' tokens2 = mutate_nodes(tokens2, "relative_clause", parent=NA)
#' 
#' tokens2
mutate_nodes <- function(.tokens, node, ..., subset=NULL) {
  if (rsyntax_threads() != data.table::getDTthreads()) {
    old_threads = data.table::getDTthreads()
    on.exit(data.table::setDTthreads(old_threads))
    data.table::setDTthreads(rsyntax_threads())
  }
  
  copy = TRUE ## make optional if tested
  .nodes = selected_nodes(.tokens)
  if (nrow(.nodes$nodes) == 0) return(.tokens)
  if (!is.character(node)) stop('"node" argument has to be a character value')
    
  l = tidyselect::quos(...)
  if (length(l) == 0) return(.tokens) 
  if (copy) .tokens = data.table::copy(.tokens)
  
  linked_node_vars = get_linked_node_vars(.tokens, .nodes)
  
  token_i = get_node_position(.tokens, .nodes, node)
  if (!is_deparsed_call(subset)) subset = deparse(substitute(subset)) 
  subset = if (subset == 'NULL') NULL else eval(parse(text=subset), envir=linked_node_vars, parent.frame())
  for (i in seq_along(l)) {
    col = names(l)[i]
    e = rlang::quo_get_expr(l[[i]])
    if (is.character(e)) e = parse(text=e)
    if (is.expression(e)) {
      .VAL = as.character(e[[1]])
    } else {
      .VAL = eval(e, linked_node_vars, parent.frame())
    }
    
    if (!is.null(subset)) {
      .tokens[token_i[subset], (col) := .VAL[subset]]
    } else {
      .tokens[token_i, (col) := .VAL]
    }
  }

  data.table::setattr(.tokens, '.nodes', value = .nodes)
  if (copy) .tokens[] else invisible(.tokens)
}


#' Remove nodes
#'
#' @param .tokens          A tokenIndex in which nodes are selected with \link{select_nodes}.  
#' @param node             The name of the node that is to be mutated
#' @param rm_subset        A subset expression (that evaluates to a logical vector) to more specifically specify which nodes to remove. The token column for each labeled node in the tquery can be referred to as label$column.
#' @param with_fill        If TRUE, also remove the fill nodes
#' @param rm_subset_fill   A subset on the fill nodes. Can only directly use token column. For example, use pos == 'VERB' to remove only verbs
#' @param keep_shared      If there is another node that has the same fill nodes, should the fill nodes that are shared also be removed?
#'
#' @return A tokenIndex with a .nodes attribute
#' @export
#' @examples 
#' tokens = tokens_spacy[tokens_spacy$doc_id == 'text1',]
#' 
#' ## use a tquery to label the nodes that you want to manipulate
#' tq = tquery(pos = 'VERB',
#'             children(label = 'object', relation='dobj'))
#' 
#' ## apply query to select nodes
#' tokens2 = select_nodes(tokens, tq) 
#' 
#' remove_nodes(tokens2, 'object')
#' remove_nodes(tokens2, 'object', with_fill=FALSE)
remove_nodes <- function(.tokens, node, rm_subset=NULL, with_fill=TRUE, rm_subset_fill=NULL, keep_shared=FALSE) {
  if (rsyntax_threads() != data.table::getDTthreads()) {
    old_threads = data.table::getDTthreads()
    on.exit(data.table::setDTthreads(old_threads))
    data.table::setDTthreads(rsyntax_threads())
  }
  
  .nodes = selected_nodes(.tokens)
  if (nrow(.nodes$nodes) == 0) return(.tokens)
  
  if (!is.character(node)) stop('"node" argument has to be a character value')
  
  if (!is_deparsed_call(rm_subset)) rm_subset = deparse(substitute(rm_subset)) 
  rm_subset = if (rm_subset == 'NULL') NULL else .nodes_eval(.tokens, .nodes, rm_subset)
  if (!is.null(rm_subset)) {
    if (!any(rm_subset)) return(.tokens) ## if there are no nodes that meet the rm_subset condition, nothing is removed
  } else rm_subset = rep(TRUE, nrow(.nodes$nodes))
  
  ## remove fill first because nodes are used to find fill nodes
  if (with_fill) {
    fill_nodes = get_fill_nodes(.tokens, .nodes, node)
    if (!is_deparsed_call(rm_subset_fill)) rm_subset_fill = deparse(substitute(rm_subset_fill)) 
    rm_subset_fill = if (rm_subset_fill == 'NULL') NULL else eval(parse(text=rm_subset_fill), envir = fill_nodes, parent.frame())
    .tokens = do_remove_fill(.tokens, .nodes, fill_nodes, node, rm_subset_fill, rm_subset, keep_shared)
    .nodes = selected_nodes(.tokens)
  }
  
  drop_ids = .nodes$nodes[rm_subset, c('doc_id','sentence',node), with=FALSE]
  drop_ids = stats::na.omit(drop_ids)
  if (nrow(drop_ids) > 0) {
    data.table::setnames(drop_ids, old=node, new='token_id')
    .tokens = .tokens[!drop_ids, on=c('doc_id','sentence','token_id')]
    .nodes$nodes[[node]] = ifelse(rm_subset, NA, .nodes$nodes[[node]])
  }
  .tokens = fix_missing_parents(.tokens, warn = FALSE)  ## (function in token_index.r)
  
  data.table::setattr(.tokens, '.nodes', value = .nodes)
  .tokens[]
}

#' Remove fill
#' 
#' Like remove_nodes, but only removing the fill nodes
#' 
#' @param .tokens          A tokenIndex in which nodes are selected with \link{select_nodes}.  
#' @param node             The name of the node that is to be mutated
#' @param rm_subset_fill   A subset on the fill nodes. Can only directly use token column. For example, use pos == 'VERB' to remove only verbs
#' @param rm_subset        A subset expression (that evaluates to a logical vector) to more specifically specify which nodes to remove. The token column for each labeled node in the tquery can be referred to as label$column.
#' @param keep_shared      If there is another node that has the same fill nodes, should the fill nodes that are shared also be removed?
#'
#' @return A tokenIndex with a .nodes attribute
#' @export
#' @examples 
#' tokens = tokens_spacy[tokens_spacy$doc_id == 'text1',]
#' 
#' ## use a tquery to label the nodes that you want to manipulate
#' tq = tquery(pos = 'VERB',
#'             children(label = 'object', relation='dobj'))
#' 
#' ## apply query to select nodes
#' tokens2 = select_nodes(tokens, tq) 
#' 
#' remove_fill(tokens2, 'object')
remove_fill <- function(.tokens, node, rm_subset_fill=NULL, rm_subset=NULL, keep_shared=FALSE) {
  if (rsyntax_threads() != data.table::getDTthreads()) {
    old_threads = data.table::getDTthreads()
    on.exit(data.table::setDTthreads(old_threads))
    data.table::setDTthreads(rsyntax_threads())
  }
  
  .nodes = selected_nodes(.tokens)
  if (nrow(.nodes$nodes) == 0) return(.tokens)
  
  if (!is.character(node)) stop('"node" argument has to be a character value')
  
  if (!is_deparsed_call(rm_subset)) rm_subset = deparse(substitute(rm_subset)) 
  rm_subset = if (rm_subset == 'NULL') NULL else .nodes_eval(.tokens, .nodes, rm_subset)
  
  fill_nodes = get_fill_nodes(.tokens, .nodes, node)
  if (!is_deparsed_call(rm_subset_fill)) rm_subset_fill = deparse(substitute(rm_subset_fill)) 
  rm_subset_fill = if (rm_subset_fill == 'NULL') NULL else eval(parse(text=rm_subset_fill), envir = fill_nodes, parent.frame())
  
  do_remove_fill(.tokens, .nodes, fill_nodes, node, rm_subset_fill, rm_subset, keep_shared)
}

do_remove_fill <- function(.tokens, .nodes, fill_nodes, node, rm_subset_fill, rm_subset, keep_shared=FALSE) {
  ## here node must already be a character, and rm_subset and rm_subset_fill must already be logical vectors (or NULL)
  node_ids = .nodes$nodes$.ID

  if (!is.null(rm_subset_fill)) {
    fill_nodes = fill_nodes[rm_subset_fill,] 
  }
  if (nrow(fill_nodes) == 0) return(.tokens)
  
  if (!is.null(rm_subset)) {
    if (!any(rm_subset)) return(.nodes) ## if there are no nodes that meet the rm_subset condition, nothing is removed
    node_ids = unique(stats::na.omit(node_ids[rm_subset]))
    fill_nodes = fill_nodes[list(node_ids), on='.ID', nomatch=0, allow.cartesian=TRUE]
  }
  if (nrow(fill_nodes) == 0) return(.tokens)
  
  .nodes$fill = .nodes$fill[!fill_nodes, on=c('.ID','.ROLE')]
  
  if (keep_shared) {
    fill_nodes = fill_nodes[!.nodes$fill, on=c('doc_id','sentence','token_id')]  
    if (nrow(fill_nodes) == 0) return(.tokens)
  } 
  
  .tokens = .tokens[!fill_nodes, on=c('doc_id','sentence','token_id')]
  .tokens = fix_missing_parents(.tokens, warn=FALSE)  ## (function in token_index.r)
  
  data.table::setattr(.tokens, '.nodes', value = .nodes)
  .tokens[]
}

#' Copy nodes
#'
#' @param .tokens          A tokenIndex in which nodes are selected with \link{select_nodes}.  
#' @param node             The name of the node that is to be copied
#' @param new              The name given to the copy
#' @param subset           A subset expression (that evaluates to a logical vector). The token column for each labeled node in the tquery can be referred to as label$column.
#' @param keep_relation    If FALSE, remove relation (making node a root)
#' @param copy_fill        If TRUE, also copy the fill
#' @param subset_fill      A subset on the fill nodes. Can only directly use token column. For example, use pos == 'VERB' to copy only verbs
#' @param only_new         If TRUE, direct fill children will only be copied to to_node if it does not already have nodes of this relation. This is a good heuristic for dealing with argument drop. 
#' 
#' @return A tokenIndex with a .nodes attribute
#' @export
#' @examples
#' tokens = tokens_spacy[tokens_spacy$doc_id == 'text1',]
#' 
#' tq = tquery(label='object', relation='dobj')
#'             
#' tokens2 = select_nodes(tokens, tq)
#' selected_nodes(tokens2)
#' 
#' copy_nodes(tokens2, 'object', 'new_object')
#' 
#' tokens3 = copy_nodes(tokens2, 'object', 'new_object', copy_fill=TRUE)
#' 
#' \donttest{
#' if (interactive()) plot_tree(tokens3, token, pos)
#' }
copy_nodes <- function(.tokens, node, new, subset=NULL, keep_relation=TRUE, copy_fill=FALSE, subset_fill=NULL, only_new=NULL) {
  if (rsyntax_threads() != data.table::getDTthreads()) {
    old_threads = data.table::getDTthreads()
    on.exit(data.table::setDTthreads(old_threads))
    data.table::setDTthreads(rsyntax_threads())
  }
  
  parent = NULL; relation = NULL
  
  .nodes = selected_nodes(.tokens)  
  if (nrow(.nodes$nodes) == 0) return(.tokens)
  
  if (!is.character(node)) stop('"node" argument has to be a character value')
  if (!is.character(new)) stop('"new" argument has to be a character value')
  if (!node %in% colnames(.nodes$nodes)) stop(sprintf('node (%s) is not a valid node in the tquery results (use the label= argument)', node))
  
  if (!is_deparsed_call(subset)) subset = deparse(substitute(subset)) 
  subset = if (subset == 'NULL') NULL else .nodes_eval(.tokens, .nodes, subset)
  if (is.null(subset)) subset = rep(TRUE, nrow(.nodes$nodes))
  
  node_ids = .nodes$nodes[,c('doc_id','sentence',node),with=FALSE]
  node_ids = add_sub_id(.tokens, node_ids[subset])$token_id ## beware of the subset being used here
  .nodes$nodes[[new]] = double(nrow(.nodes$nodes))
  .nodes$nodes[subset, (new) := node_ids]
  
  node_vars = get_node_vars(.tokens, .nodes, node)
  if (nrow(node_vars) > nrow(.nodes$nodes)) browser()
  #.tokens[list(doc_id='text1042', sentence=1, token_id=7.3),]
  
  node_vars = node_vars[subset]
  node_vars$token_id = node_ids
  if (!keep_relation) {
    node_vars[, parent := NA]
    node_vars[, relation := 'ROOT']
  }
  .tokens = add_to_tokens(.tokens, node_vars)
  
  data.table::setattr(.tokens, '.nodes', value = .nodes)
  
  if (copy_fill) {
    fill_nodes = get_fill_nodes(.tokens, .nodes, node)
    if (!is_deparsed_call(subset_fill)) subset_fill = deparse(substitute(subset_fill)) 
    subset_fill = if (subset_fill == 'NULL') NULL else eval(parse(text=subset_fill), envir = fill_nodes, parent.frame())
    .tokens = do_copy_fill(.tokens, .nodes, fill_nodes, node, new, subset, subset_fill, only_new=NULL)
  }
  .tokens[]
}

#' Copy nodes
#'
#' @param .tokens          A tokenIndex in which nodes are selected with \link{select_nodes}.  
#' @param from_node        The name of the node from which fill is copied
#' @param to_node              The name of the node to which fill is copied
#' @param subset           A subset expression (that evaluates to a logical vector). The token column for each labeled node in the tquery can be referred to as label$column.
#' @param subset_fill      A subset on the fill nodes. Can only directly use token column. For example, use pos == 'VERB' to copy only verbs
#' @param only_new         If TRUE, direct fill children will only be copied to to_node if it does not already have nodes of this relation. This is a good heuristic for dealing with argument drop. 
#'
#' @return    A tokenIndex with a .nodes attribute
#' @export
#' @examples
#' tokens = tokens_spacy[tokens_spacy$doc_id == 'text1',]
#' 
#' tq = tquery(label='object', relation='dobj')
#'             
#' tokens2 = select_nodes(tokens, tq)
#' selected_nodes(tokens2)
#' 
#' tokens3 = copy_nodes(tokens2, 'object', 'new_object')
#' copy_fill(tokens3, 'object', 'new_object')
copy_fill <- function(.tokens, from_node, to_node, subset=NULL, subset_fill=NULL, only_new=NULL) {
  if (rsyntax_threads() != data.table::getDTthreads()) {
    old_threads = data.table::getDTthreads()
    on.exit(data.table::setDTthreads(old_threads))
    data.table::setDTthreads(rsyntax_threads())
  }
  
  .nodes = selected_nodes(.tokens)
  if (nrow(.nodes$nodes) == 0) return(.tokens)
  
  if (!is.character(from_node)) stop('"from_node" argument has to be a character value')
  if (!is.character(to_node)) stop('"to_node" argument has to be a character value')
  if (!from_node %in% colnames(.nodes$nodes)) stop(sprintf('from_node (%s) is not a valid node in the tquery results (use the label= argument)', from_node))
  if (!to_node %in% colnames(.nodes$nodes)) stop(sprintf('to_node (%s) is not a valid node in the tquery results (use the label= argument)', to_node))
  
  if (!is_deparsed_call(subset)) subset = deparse(substitute(subset)) 
  subset = if (subset == 'NULL') NULL else .nodes_eval(.tokens, .nodes, subset)
  
  fill_nodes = get_fill_nodes(.tokens, .nodes, from_node)
  
  if (!is_deparsed_call(subset_fill)) subset_fill = deparse(substitute(subset_fill)) 
  subset_fill = if (subset_fill == 'NULL') NULL else eval(parse(text=subset_fill), envir = fill_nodes, parent.frame())
  
  do_copy_fill(.tokens, .nodes, fill_nodes, from_node, to_node, subset, subset_fill, only_new)
}

do_copy_fill <- function(.tokens, .nodes, fill_nodes, from_node, to_node, subset=NULL, subset_fill=NULL, only_new=NULL) {
  .NEW_ID = NULL; token_id = NULL; parent = NULL; .ROLE = NULL
  ## here from_node and to_node must already be characters, and subset and subset_fill must already be logical vectors (or NULL)
  
  if (nrow(fill_nodes) == 0) return(.tokens)
  
  if (!is.null(subset_fill)) {
    fill_nodes = fill_nodes[subset_fill]
  }
  
  if (nrow(fill_nodes) == 0) return(.tokens)
  
  if (!is.null(only_new)) {
    only_new = 'relation' ## could be any column, but think only this makes sense
    first_children_parents = .nodes$nodes[,c('doc_id','sentence',to_node,'.ID'), with=FALSE]
    first_children = rbind(
      merge(.tokens, first_children_parents, by.x=c('doc_id','sentence','token_id'), by.y=c('doc_id','sentence',to_node)),
      merge(.tokens, first_children_parents, by.x=c('doc_id','sentence','parent'), by.y=c('doc_id','sentence',to_node))
    )
    first_children$.FILL_LEVEL = 1
    if (any(!only_new %in% colnames(first_children))) stop('some values in only_new are not valid token columns')
    remove_fill = first_children[,c('.FILL_LEVEL','.ID',only_new),with=FALSE]
    fill_nodes = fill_nodes[!remove_fill, on=c('.FILL_LEVEL','.ID',only_new)]
  }
  if (nrow(fill_nodes) == 0) return(.tokens)
  
  
  if (!is.null(subset)) {
    if (!all(subset)) {
      node_ids = unique(.nodes$nodes$.ID[subset])
      fill_nodes = fill_nodes[list(.ID=node_ids), on='.ID', nomatch=0, allow.cartesian=TRUE]
    }
  }
  if (nrow(fill_nodes) == 0) return(.tokens)
  
  id_index = match(fill_nodes$.ID, .nodes$nodes$.ID)
  from_node_ids = .nodes$nodes[[from_node]][id_index]
  to_node_ids = .nodes$nodes[[to_node]][id_index]
  
  fill_nodes[,.NEW_ID := add_sub_id(.tokens, fill_nodes[,c('doc_id','sentence','token_id')])$token_id]
  
  parent_index = fill_nodes[,c('doc_id','sentence','token_id','parent','.ID','token')]
  parent_index[, .I := 1:nrow(parent_index)]
  
  match_id = fill_nodes[,c('doc_id','sentence','token_id','.ID','.NEW_ID')]
  parent_index = merge(parent_index, match_id, by.x=c('doc_id','sentence','parent','.ID'), by.y=c('doc_id','sentence','token_id','.ID'), allow.cartesian=TRUE)
  .NEW_PARENT = rep(NA, nrow(fill_nodes))
  parent_index = stats::na.omit(parent_index)
  .NEW_PARENT[parent_index$.I] = parent_index$.NEW_ID
  
  fill_nodes[, token_id := .NEW_ID]
  ## if parent is the from_node id (i.e. fill is a direct child that connects the fill tree to the node), replace parent with node id, otherwise create the sub id
  is_direct_child = fill_nodes$parent == from_node_ids
  if (any(is_direct_child)) {
    fill_nodes[is_direct_child, parent := to_node_ids[is_direct_child]]
  }
  if (any(!is_direct_child)) {
    fill_nodes[!is_direct_child, parent := as.double(.NEW_PARENT[!is_direct_child])]
    fill_nodes = remove_isolate_fill(fill_nodes)
  }
  fill_nodes[, .ROLE := to_node]  
  .tokens = add_to_tokens(.tokens, fill_nodes)  
  .nodes = add_to_fill(.nodes, fill_nodes)  
  
  data.table::setattr(.tokens, '.nodes', value = .nodes)
  .tokens[]
}

remove_isolate_fill <- function(fill_nodes) {
  parent = NULL
  
  iso = fill_nodes[is.na(parent),] ## removes isolates, which can result from subsetting or using only_new
  fill_nodes = fill_nodes[!is.na(parent),]
  while (nrow(iso) > 0) {
    iso_i = fill_nodes[list(iso$doc_id, iso$sentence, iso$token_id), on=c('doc_id','sentence','parent'), nomatch=0, which=TRUE]
    if (length(iso_i) > 0) {
      iso = fill_nodes[iso_i,]
      fill_nodes = fill_nodes[!iso_i,]
    } else break
  }
  fill_nodes
}

is_deparsed_call <- function(x) {
  ## for use in functions to pass on deparsed calls, in particular for subset arguments
  ## determine whether x is a deparsed call (call as a string)
  tryCatch(is.character(eval(x)), error = function(e) FALSE, finally = TRUE)
}



add_to_tokens <- function(.tokens, new_tokens) {
  if (!identical(colnames(.tokens), colnames(new_tokens))) new_tokens = subset(new_tokens, select = colnames(.tokens))
  .tokens = unique(data.table::rbindlist(list(.tokens, new_tokens), use.names = TRUE, fill=TRUE))
  .tokens = as_tokenindex(.tokens)
  .tokens
}

add_to_fill <- function(.nodes, new_fill) {
  if (!identical(colnames(.nodes$fill), colnames(new_fill))) new_fill = subset(new_fill, select = colnames(.nodes$fill))
  .nodes$fill = unique(data.table::rbindlist(list(.nodes$fill, new_fill), use.names = TRUE, fill=TRUE))
  data.table::setindexv(.nodes$fill, '.ROLE')
  .nodes
}

.nodes_eval <- function(.tokens, .nodes, x) {
  ## x has to be an expressions given as a character value (deparse(substitute(x)))
  if (is.null(x)) return(NULL)
  linked_node_vars = get_linked_node_vars(.tokens, .nodes)
  eval(parse(text=x), envir = linked_node_vars, parent.frame())
}

get_node_vars <- function(.tokens, .nodes, node) {
  node_ids = .nodes$nodes[,c('doc_id','sentence',node),with=FALSE]
  data.table::setnames(node_ids, old=node, new='token_id')
  node_vars = .tokens[node_ids, on=c('doc_id','sentence','token_id')]
  #unique(node_vars, by=c('doc_id','sentence','token_id'))
  node_vars
}
  
get_linked_node_vars <- function(.tokens, .nodes) {
  node_names = setdiff(colnames(.nodes$nodes), c('.ID','doc_id','sentence'))
  node_vars = lapply(node_names, function(node) get_node_vars(.tokens, .nodes, node))
  names(node_vars) = node_names
  node_vars
}

get_node_position <- function(.tokens, .nodes, node) {
  node_ids = .nodes$nodes[,c('doc_id','sentence',node), with=FALSE]
  data.table::setnames(node_ids, old=node, new='token_id')
  .tokens[node_ids,on=c('doc_id','sentence','token_id'),which=TRUE]
}

get_fill_nodes <- function(.tokens, .nodes, node) {
  fill_table = .nodes$fill[list(node), on='.ROLE', nomatch=0]
  data.table::setkeyv(fill_table, c('doc_id','sentence','token_id'))
  merge(fill_table, .tokens, by=c('doc_id','sentence','token_id'))
}

increment_sub_id <- function(x) {
  ## use decimals as sub_id by mirroring increment. so increments 3.99 to 3.001 (mirroring 99 to 100), 3.001 to 3.101, etc. 
  int_x = floor(x) 
  dec = round(x - int_x, 6) ## see round at end, for some reason it has to go here as well
  dec = ifelse(dec > 0, stringi::stri_extract_first_regex(x, '[0-9]+$'), '0')
  dec = as.numeric(stringi::stri_reverse(dec)) + 1
  sub_id = as.numeric(stringi::stri_reverse(sprintf('%.1f', dec)))
  round(int_x + sub_id, 6) ## round is necessary, because sub_id is sliiiiiigly more than what you see, which messes up the stri_count_regex for number of digits 
}

add_sub_id <- function(.tokens, ids) {
  token_id = NULL
  
  .EXISTS = NULL
  ## ids should be a data.table with the first three columns being: doc_id, sentence, token_id
  ids = data.table::copy(ids)
  data.table::setnames(ids, c('doc_id','sentence','token_id'))
  data.table::setkeyv(ids, c('doc_id','sentence','token_id'))
  
  #matched = rep(1, nrow(ids)) ## ids should initially exists
  #exists = rep(TRUE, nrow(ids))
  ids[, .EXISTS := TRUE]
  while (any(ids$.EXISTS)) {
    ids[ids$.EXISTS, token_id := increment_sub_id(ids$token_id[ids$.EXISTS])]
    e = .tokens[unique(ids[ids$.EXISTS]), on=c('doc_id','sentence','token_id'), which=FALSE, nomatch=0]
    if (nrow(e) == 0) break
    ids[, .EXISTS := FALSE]
    ids[e, on=c('doc_id','sentence','token_id'), .EXISTS := TRUE]
  }
  
  dup = duplicated(ids, by = c('doc_id','sentence','token_id'))
  while (any(dup)) {
    ids[dup, token_id := increment_sub_id(ids$token_id[dup])]
    dup = duplicated(ids, by = c('doc_id','sentence','token_id'))
  }
  
  e = .tokens[unique(ids), on=c('doc_id','sentence','token_id'), which=FALSE, nomatch=0]
  if (nrow(e) > 0) {
    ids[, .EXISTS := NULL]
    #stop('en nog een keer!!')
    ids = add_sub_id(.tokens, ids)
  }
  
  ids
}
vanatteveldt/rsyntax documentation built on June 14, 2022, 7:42 a.m.