R/corp-df2corp.R

Defines functions post2qcorp post2tmcorp comment2qcorp comment2tmcorp

Documented in comment2qcorp comment2tmcorp post2qcorp post2tmcorp

#' Convert post data frame to corpus objects
#'
#' These functions convert a post data frame to corpus objects
#' identical to those constructed with
#' \code{\link[quanteda]{corpus}}, \code{\link[tm]{VCorpus}},
#' and \code{\link[tm]{PCorpus}}. The list-column
#' \code{comment} of the original post data frame is dropped
#' to avoid complicating the data structure of the corpus.
#'
#' @param df Data frame. A data frame generated by
#'   \code{\link{post2df}}.
#' @param PCorpus Logical. Whether to construct corpus with
#'   \code{\link[tm]{PCorpus}}. If \code{FALSE}, uses
#'   \code{\link[tm]{VCorpus}}. Defaults to \code{FALSE}.
#' @param ... Additional arguments passed on to
#'   \code{\link[quanteda]{corpus}}, \code{\link[tm]{VCorpus}},
#'   or \code{\link[tm]{PCorpus}}.
#'
#' @seealso \code{\link{comment2qcorp}}, \code{\link{comment2tmcorp}}
#' @name post2corpus
NULL


#' Convert post data frame to quanteda corpus
#' @rdname post2corpus
#'
#' @return \code{post2qcorp} returns the object created by
#'   \code{\link[quanteda]{corpus}}, with the \code{title} column
#'   used as document id and the \code{content} column as text.
#'
#' @examples
#' library(quanteda)
#' post_df  <- example_posts()
#' post_corp <- post2qcorp(post_df)
#'
#' summary(post_corp)
#'
#' @importFrom dplyr rename
#' @importFrom magrittr %>%
#' @importFrom quanteda corpus
#' @export
post2qcorp <- function(df, ...) {
  # Drop the `comment` list-column with a logical mask. The previous
  # `-which(...)` idiom selected *zero* columns when `comment` was absent,
  # because `-integer(0)` is an empty index.
  df <- df[, colnames(df) != "comment", drop = FALSE]

  # Return the corpus visibly (the function used to end on an assignment,
  # which made the result invisible at the console).
  corpus(df,
         docid_field = "title",
         text_field = "content",
         ...)
}


#' Convert post data frame to tm corpus
#' @rdname post2corpus
#'
#' @return \code{post2tmcorp} returns the object created by
#'   \code{\link[tm]{PCorpus}} when \code{PCorpus} is \code{TRUE},
#'   otherwise the one created by \code{\link[tm]{VCorpus}}. The
#'   \code{link} column is used as \code{doc_id} and the
#'   \code{content} column as \code{text}.
#'
#' @examples
#' library(tm)
#' post_df  <- example_posts()
#' post_corp <- post2tmcorp(post_df)
#'
#' inspect(post_corp)
#' meta(post_corp, "author")
#'
#' @importFrom tm PCorpus VCorpus DataframeSource
#' @importFrom dplyr rename select everything
#' @importFrom magrittr %>%
#' @export
post2tmcorp <- function(df, PCorpus = FALSE, ...) {
  # Drop the `comment` list-column with a logical mask. The previous
  # `-which(...)` idiom selected *zero* columns when `comment` was absent,
  # because `-integer(0)` is an empty index.
  df <- df[, colnames(df) != "comment", drop = FALSE]

  # Rename and reorder so that `doc_id` and `text` are the first two
  # columns before handing the data frame to DataframeSource().
  df <- rename(df, "doc_id" = "link", "text" = "content")
  df <- select(df, "doc_id", "text", everything())

  # Plain if/else instead of a scalar ifelse() with assignments inside its
  # arguments. The logical argument `PCorpus` shadows tm's function of the
  # same name, but R skips non-function bindings when resolving a call, so
  # `PCorpus(...)` below still reaches the imported function.
  if (isTRUE(PCorpus)) {
    PCorpus(DataframeSource(df), ...)
  } else {
    VCorpus(DataframeSource(df), ...)
  }
}




#' Convert 'comment' list-column to 'corpus' list-column
#'
#' These functions convert data frames stored in the list-column
#' 'comment' of the data frame returned by \code{\link{post2df}}
#'  to corpus objects in \code{\link[quanteda]{corpus}},
#'  \code{\link[tm]{VCorpus}}, and \code{\link[tm]{PCorpus}}.
#'
#' @param df Data frame. A data frame generated by
#'   \code{\link{post2df}}.
#' @param PCorpus Logical. Whether to construct corpus with
#'   \code{\link[tm]{PCorpus}}. If \code{FALSE}, uses
#'   \code{\link[tm]{VCorpus}}. Defaults to \code{FALSE}.
#' @param ... Additional arguments passed on to
#'   \code{\link[quanteda]{corpus}}, \code{\link[tm]{VCorpus}},
#'   or \code{\link[tm]{PCorpus}}.
#'
#' @seealso \code{\link{post2qcorp}}, \code{\link{post2tmcorp}}
#' @name comment2corpus
NULL




#' Convert comment list-column to quanteda corpus list-column
#'
#' @rdname comment2corpus
#'
#' @return \code{comment2qcorp} returns a data frame with two columns:
#'   \code{post_id} (taken from \code{df$link}) and \code{comment}, a
#'   list-column holding one \code{\link[quanteda]{corpus}} per post, or
#'   \code{NULL} for posts whose \code{comment} entry is \code{NULL}.
#'
#' @importFrom quanteda corpus
#' @importFrom tibble data_frame
#' @export
comment2qcorp <- function(df, ...) {
  # Pre-filled with NULL, one slot per post.
  comment_corp <- vector("list", length = nrow(df))

  for (i in seq_along(df$comment)) {
    comment_df <- df$comment[[i]]
    # Posts without comments keep their pre-filled NULL slot. Assigning
    # NULL through `[[<-` would *delete* the element and shorten the list,
    # which broke the result whenever the last post(s) had no comments.
    if (!is.null(comment_df)) {
      comment_corp[[i]] <- corpus(comment_df,
                                  text_field = "comment",
                                  docid_field = "time",
                                  ...)
    }
  }

  # NOTE(review): data_frame() is deprecated in recent tibble releases in
  # favour of tibble(); kept because it is the name this file imports.
  data_frame(post_id = df$link,
             comment = comment_corp)
}
# toks <- tokens(corp)
# key_in_context <- kwic(toks, "魯")




#' Convert comment list-column to tm corpus list-column
#'
#' @rdname comment2corpus
#'
#' @return \code{comment2tmcorp} returns a data frame with two columns:
#'   \code{post_id} (taken from \code{df$link}) and \code{comment}, a
#'   list-column holding one tm corpus per post (\code{\link[tm]{PCorpus}}
#'   when \code{PCorpus} is \code{TRUE}, otherwise
#'   \code{\link[tm]{VCorpus}}), or \code{NULL} for posts whose
#'   \code{comment} entry is \code{NULL}.
#'
#' @importFrom tm PCorpus VCorpus DataframeSource
#' @importFrom dplyr mutate rename select everything
#' @importFrom magrittr %>%
#' @export
comment2tmcorp <- function(df, PCorpus = FALSE, ...) {
  # Resolve the corpus constructor once. The logical argument `PCorpus`
  # shadows tm's function of the same name, but R skips non-function
  # bindings when resolving a call, so `PCorpus(...)` still reaches tm.
  use_pcorpus <- isTRUE(PCorpus)
  build_corpus <- function(src, ...) {
    if (use_pcorpus) {
      PCorpus(src, ...)
    } else {
      VCorpus(src, ...)
    }
  }

  # Shape one comment data frame for DataframeSource(): a running integer
  # `doc_id` and the comment body as `text`, placed first.
  as_source <- function(comment_df) {
    # seq_len() rather than 1:nrow(): the latter yields c(1, 0) for an
    # empty comment data frame.
    comment_df <- mutate(comment_df, "doc_id" = seq_len(nrow(comment_df)))
    comment_df <- rename(comment_df, "text" = "comment")
    select(comment_df, "doc_id", "text", everything())
  }

  # Pre-filled with NULL, one slot per post.
  comment_corp <- vector("list", length = nrow(df))
  for (i in seq_along(df$comment)) {
    comment_df <- df$comment[[i]]
    # Posts without comments keep their pre-filled NULL slot. Assigning
    # NULL through `[[<-` would *delete* the element and shorten the list,
    # which broke the result whenever the last post(s) had no comments.
    if (!is.null(comment_df)) {
      comment_corp[[i]] <- build_corpus(DataframeSource(as_source(comment_df)),
                                        ...)
    }
  }

  # NOTE(review): data_frame() is deprecated in recent tibble releases in
  # favour of tibble(); kept because it is the name this file imports.
  data_frame(post_id = df$link,
             comment = comment_corp)
}
liao961120/pttR documentation built on Dec. 16, 2019, 2:19 a.m.