R/find_nodes.r
In rsyntax: Extract Semantic Relations from Text by Querying and Reshaping Syntax

find_nodes <- function(tokens, tquery, block=NULL, use_index=TRUE, name=NA, fill=TRUE, melt=TRUE, root_dist=FALSE) {
  .MATCH_ID = NULL; .DROP = NULL; .ID = NULL ## declare data.table bindings
  tokens = as_tokenindex(tokens)  
  block = get_long_ids(block)
  
  nodes = filter_tokens(tokens, lookup=tquery$lookup, .G_ID=tquery$g_id, .BLOCK=block, use_index=use_index)
  if (nrow(nodes) == 0) return(NULL)
  nodes = subset(nodes, select = c('doc_id','sentence','token_id'))

  any_nonfill_nested = any(sapply(tquery$nested, function(x) !methods::is(x, 'tQueryFill')))
  if (any_nonfill_nested) {
    nodes = find_nested(tokens, nodes, tquery, block, fill=FALSE, block_loop=F)
  } else {
    data.table::setnames(nodes, old = 'token_id', new='.ID')
    if (!is.na(tquery$label)) nodes[,(tquery$label) := .ID]
  } 

  if (is.null(nodes)) return(NULL)  
  if (nrow(nodes) == 0) return(NULL)

  ### possible solution for removing block within rec_search
  nodes = get_root_dist(tokens, nodes)
  nodes = get_unique_patterns(nodes)
  if (!root_dist) nodes$.ROOT_DIST = NULL
  
  if (fill) nodes = add_fill(tokens, nodes, tquery, block=nodes)

  
  nodes = create_unique_key(nodes, name, tquery)
  if (melt) {
    nodes = melt_nodes_list(nodes)
  }
  nodes[]
}

find_nested <- function(tokens, nodes, tquery, block, fill, block_loop) {
  .ID = NULL; .MATCH_ID = NULL

  nodes = rec_find(tokens, ids=nodes, ql=tquery$nested, block=block, fill=fill, block_loop=block_loop)
  
  if (nrow(nodes) == 0) return(NULL)
  nodes[, .ID := .MATCH_ID]
  data.table::setcolorder(nodes, c('.ID', setdiff(colnames(nodes), '.ID')))
  
  if (is.na(tquery$label)) {
    nodes[,.MATCH_ID := NULL]
  } else {
    data.table::setnames(nodes, '.MATCH_ID', tquery$label)
  }
  
  dropcols = grep('.DROP.*', colnames(nodes), value=TRUE)
  if (length(dropcols) > 0) nodes[, (dropcols) := NULL]
  
  unique(nodes)
}

add_fill <- function(tokens, nodes, tquery, block, level=1) {
  is_fill = sapply(tquery$nested, methods::is, 'tQueryFill')

  if (any(!is_fill)) {
    for (tq in tquery$nested[!is_fill]) {
      nodes = add_fill(tokens, nodes, tq, block, level+1)
    }
  } 
  
  if (any(is_fill)) {
    if (is.na(tquery$label)) {
      if (level == 1) match_id = '.ID' else return(nodes)
    } else match_id = tquery$label
    if (!match_id %in% colnames(nodes)) return(nodes)
    ids = subset(nodes, select = c('doc_id','sentence',match_id))
    ids = unique(stats::na.omit(ids))
    add = rec_find(tokens, ids, tquery$nested[is_fill], block = block, fill=TRUE, block_loop=T)
    
    if (grepl('#', tquery$label)) {
      label = gsub('#.*', '', tquery$label)
      label = paste0('^', label, '\\_')
      colnames(add) = gsub(label, tquery$label, colnames(add))
    }

    if (nrow(add) > 0) {
      setkeyv(nodes, c('doc_id','sentence',match_id))
      nodes = merge(nodes, add, by.x=c('doc_id','sentence',match_id), by.y=c('doc_id','sentence','.MATCH_ID'), all.x=TRUE, allow.cartesian=TRUE)
      dropcols = grep('.DROP.*', colnames(nodes), value=TRUE)
      if (length(dropcols) > 0) nodes[, (dropcols) := NULL]
    }
  }
  unique(nodes)
}

get_top_label <- function(tquery) {
  ## get the first label in a tquery.
  if (!is.na(tquery$label)) return(tquery$label)
  if (!is.null(tquery$nested)) {
    for (nested in tquery$nested) {
      label = get_top_label(nested)
      if (!is.na(label)) return(label)
    }
  }
  return(NA)
}

create_unique_key <- function(nodes, name, tquery){
  id_col = get_top_label(tquery)
  if (!is.na(name)) {
    key = paste0(name, '#', nodes$doc_id, '.', nodes$sentence, '.', nodes[[id_col]])
  } else {
    key = paste0(nodes$doc_id, '.', nodes$sentence, '.', nodes[[id_col]])
  }

  nodes$.ID = paste0(nodes$doc_id, '...', nodes$sentence, '...', nodes$.ID) ## quick fix for matching on 3 columns
  key = key[match(nodes$.ID, nodes$.ID)] ## give same id to nodes with same .ID
  #key = paste0(name, '#', 1:nrow(nodes))
  nodes$.ID = key
  return(nodes)
}

get_root_dist <- function(tokens, nodes) {
  .ROOT_DIST = NULL
  tf = token_family(tokens, unique(data.table(doc_id=nodes$doc_id, sentence=nodes$sentence, token_id=nodes$.ID)), 
                    depth=Inf, level='parents', minimal=TRUE, show_level=TRUE, replace=TRUE)
  tf = data.table::setorderv(tf, cols = '.FILL_LEVEL', order = -1)
  tf = unique(tf, by=c('doc_id','sentence','.MATCH_ID'))
  data.table::setnames(tf, c('.FILL_LEVEL', '.MATCH_ID'), c('.ROOT_DIST', '.ID'))
  tf = subset(tf, select=c('doc_id','sentence','.ID','.ROOT_DIST'))
  nodes = merge(nodes, tf, by = c('doc_id','sentence','.ID'), all.x=TRUE)
  #nodes = nodes[list(tf$doc_id, tf$sentence, tf$.MATCH_ID), .ROOT_DIST := tf$.FILL_LEVEL, on=c('doc_id','sentence','.ID')]
  nodes[is.na(nodes$.ROOT_DIST), .ROOT_DIST := 0]
  nodes
}



get_unique_patterns <- function(nodes) {
  ln = nodes
  ln$i = 1:nrow(ln)
  ln = data.table::melt(ln, id.vars=c('doc_id','sentence','.ID','i','.ROOT_DIST'))
  ln = ln[!is.na(ln$value),]
  data.table::setorderv(ln, c('doc_id','sentence','.ID','i'))

  ## rm duplicate i-value pairs
  rm_i = unique(ln$i[duplicated(ln[,c('i','value')])])
  if (length(rm_i > 0)) ln = ln[-ln[list(i=rm_i), on='i', which=T]]
  
  ## If nodes are matched multiple times, remove the ones where the root dist is higher
  ## (these are most often nested in the other pattern, unless very compliated tqueries are used)
  possible_dupl = unique(ln$value[duplicated(ln[,c('doc_id','sentence','value')])])
  possible_dupl = unique(ln[list(value=possible_dupl),,on='value']$i)
  dup = get_duplicates(ln[list(i=possible_dupl),,on='i'])
  
  ## A complication is that once we remove a duplicate/nested pattern, 
  ## It might also solve another duplicate. So we can't just remove
  ## all duplicates. We can see which duplicates certainly need to be removed
  ## by looking which duplicates are not 'solved' by removing other duplicates
  ## We then repeat this until no remain
  ## this loop is guaranteed to remove one pattern per sentence per iteration (so it's fairly short) 
  rm_j = rep(F, nrow(nodes))
  while (TRUE) {
    dupl_pat = unique(dup$i.x)    ## what are patterns (by index i in nodes) that seem to be duplicates?
    possible_nondupl = dup[list(i.y = dupl_pat),,on='i.y',which=T,nomatch=0] ## get all duplicates where the matched pattern is a duplicate in another pattern 
    if (length(possible_nondupl) > 0) {
      certain_dupl = dup[-possible_nondupl]    ## if the matched pattern is not a duplicate in another pattern, we can be sure it's a definitive duplicate
      rm_j[certain_dupl$i.x] = T
    } else {
      rm_j[dup$i.x] = T
      break
    }
    possible_dupl = setdiff(possible_nondupl, unique(certain_dupl$i.x))  ## now repeat for remaining possible duplicates
    dup = get_duplicates(ln[list(i=possible_dupl),,on='i'])
  }
  rm_j = which(rm_j)

  nodes
  if (length(rm_i) > 0 || length(rm_j) > 0)
    nodes = nodes[-unique(c(rm_i, rm_j)),]
  
  nodes
  
}  

get_duplicates <- function(ln, priority='higher') {
  ln_m = merge(ln[,!colnames(ln) == 'variable', with=F], 
               ln[,c('doc_id','sentence','.ID','.ROOT_DIST','value','i')], 
               by=c('doc_id','sentence','value'), allow.cartesian = T)
  ln_m = ln_m[ln_m$.ID.x != ln_m$.ID.y,]
  
  if (priority == 'higher') {
    dupl = ifelse(ln_m$.ROOT_DIST.x != ln_m$.ROOT_DIST.y, 
                  ln_m$.ROOT_DIST.x > ln_m$.ROOT_DIST.y,       ## remove x if x lower in tree
                  ln_m$.ID.x > ln_m$.ID.y)                     ## and if same height remove x if to the left in sentence
  } else {
    dupl = ifelse(ln_m$.ROOT_DIST.x != ln_m$.ROOT_DIST.y, 
                  ln_m$.ROOT_DIST.x < ln_m$.ROOT_DIST.y,       ## remove x if x higher in tree
                  ln_m$.ID.x > ln_m$.ID.y)                     ## and if same height remove x if to the left in sentence
  }
  ui = unique(ln_m$i.x[dupl])
  ln_m[list(i.x=ui),,on='i.x']
}

Any scripts or data that you put into this service are public.

rsyntax documentation built on June 7, 2022, 9:07 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

rsyntax
Extract Semantic Relations from Text by Querying and Reshaping Syntax

R/find_nodes.r
In rsyntax: Extract Semantic Relations from Text by Querying and Reshaping Syntax

Defines functions get_duplicates get_unique_patterns get_root_dist create_unique_key get_top_label add_fill find_nested find_nodes

Try the rsyntax package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

rsyntax Extract Semantic Relations from Text by Querying and Reshaping Syntax

R/find_nodes.r In rsyntax: Extract Semantic Relations from Text by Querying and Reshaping Syntax

Defines functions get_duplicates get_unique_patterns get_root_dist create_unique_key get_top_label add_fill find_nested find_nodes

Try the rsyntax package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

rsyntax
Extract Semantic Relations from Text by Querying and Reshaping Syntax

R/find_nodes.r
In rsyntax: Extract Semantic Relations from Text by Querying and Reshaping Syntax