R/coll_sentmatch.R

Defines functions colloc_sentmatch

Documented in colloc_sentmatch

#' Sentence match retriever
#'
#' \Sexpr[results=rd, stage=render]{lifecycle::badge("experimental")}
#'
#' @description The function extract full sentence-match for a (set of significant) collocate(s) for a given nodeword.
#' @param collout List output of \code{\link{colloc_leipzig}}.
#' @param colloc Character vector of the collocate(s) whose sentence match(es) to be retrieved.
#' @param wspan Character vector of the window span in which the collocates occur. Default to \code{NULL}, which will retrieve the collocate's occurrence in all span.
#' @param nodeword Character vector specifying one of the nodewords if search parameter in \code{\link{colloc_leipzig}} includes more than one nodeword.
#' @param sampled Integer vector indicating the number of random sample of the sentence match to be retrieve. Default to \code{NULL}, which will retrieve all sentence-matches.
#'
#' @return Character vector of sentence-match(es).
#'
#' @seealso \code{\link{colloc_sentmatch_tagged}} for tagged and data frame version of the output.
#'
#' @export
#'
#' @importFrom assertthat assert_that
#'
#' @examples
#' collout <- colloc_leipzig(leipzig_corpus_list = demo_corpus_leipzig,
#'                      pattern = "mengatakan",
#'                      window = "r",
#'                      span = 3,
#'                      save_interim = FALSE)
#'
#' colloc_sentmatch(collout,
#'                  colloc = "bahwa",
#'                  sampled = 10)
#'
#' # This will produce message indicating that
#' # the queried sample number is higher than
#' # the sentence match for "akan"
#' colloc_sentmatch(collout,
#'                  colloc = "akan",
#'                  sampled = 10)
colloc_sentmatch <- function(collout, colloc = NULL, wspan = NULL, nodeword = NULL, sampled = NULL) {

  assertthat::assert_that(!is.null(colloc),
                          msg = "`colloc` argument cannot be NULL. Specify with one or more collocates!")
  w <- dplyr::quo(w)
  span <- dplyr::quo(span)
  node <- dplyr::quo(node)

  if(is.null(nodeword)) {

    if(!is.null(wspan)) {

      sent <- subset(collout[[1]], !!w %in% colloc & !!span %in% wspan)$sent_match

    } else {

      sent <- subset(collout[[1]], !!w %in% colloc)$sent_match

    }

  } else if (!is.null(nodeword)) {

    if(!is.null(wspan)) {

      sent <- subset(collout[[1]], !!w %in% colloc & !!node %in% nodeword & !!span %in% wspan)$sent_match

    } else {

      sent <- subset(collout[[1]], !!w %in% colloc & !!node %in% nodeword)$sent_match

    }

  }

  if(!is.null(sampled)) {

    if(length(sent) > sampled) {

      return(sample(sent, sampled, TRUE))

    } else {

      warning(paste("Returning all ", length(sent),"matches!\n  Length of matches (",
                        length(sent),
                        ") is lower than the number of the queried sample (",
                        sampled,
                        ").\n",
                        sep = ""))
      return(sent)

    }

  } else if(is.null(sampled)) {

    return(sent)

  }

}
gederajeg/collogetr documentation built on April 16, 2020, 11:58 a.m.