# R/describe.r
#
# Defines functions: print.docx, docx_describe_cmnts, docx_describe_tbls
#
# Documented in: docx_describe_cmnts, docx_describe_tbls, print.docx

#' Returns a description of all the tables in the Word document
#'
#' This function will attempt to discern the structure of each of the tables
#' in \code{docx} and print this information. For every table it prints the
#' total number of cells, the number of rows, whether all rows have the same
#' number of cells, and whether the first row looks like a header.
#'
#' @param docx \code{docx} object read with \code{read_docx}
#' @return Called for its printed output; returns \code{NULL} invisibly.
#' @export
#' @examples
#' complx <- read_docx(system.file("examples/complex.docx", package="docxtractr"))
#' docx_tbl_count(complx)
#' docx_describe_tbls(complx)
docx_describe_tbls <- function(docx) {

  ensure_docx(docx)
  if (!(docx_tbl_count(docx) > 0)) {
    message("No tables in document")
    return(invisible(NULL))
  }

  ns <- docx$ns
  tbls <- docx$tbls

  cat(sprintf("Word document [%s]\n\n", docx$path))

  # seq_along() never yields the bogus c(1, 0) sequence that 1:length() does
  for (i in seq_along(tbls)) {

    tbl <- tbls[[i]]

    rows <- xml2::xml_find_all(tbl, "./w:tr", ns = ns)

    # number of cells (w:tc) directly under each row (w:tr)
    cell_count_by_row <- purrr::map_int(rows, function(row) {
      length(xml2::xml_find_all(row, "./w:tc", ns = ns))
    })

    # every cell counted here is a w:tr/w:tc child of the table, so the sum
    # of the per-row counts is the table's total cell count
    total_cells <- sum(cell_count_by_row)

    cat(sprintf("Table %d\n  total cells: %d\n  row count  : %d\n",
                i, total_cells, length(rows)))

    # simplistic test for whether table is uniform: all rows hold the same
    # number of cells. Comparing the distinct counts (rather than
    # max * rows == cells) also avoids calling max() on an empty vector
    # when a table has no rows.
    distinct_counts <- unique(cell_count_by_row)
    if (length(distinct_counts) <= 1) {
      cat("  uniform    : likely!\n")
    } else {
      cat(sprintf(
        "  uniform    : unlikely => found differing cell counts (%s) across some rows\n",
        paste0(distinct_counts, collapse = ", ")))
    }

    # microsoft has a tag for some table structure info. examine it to
    # see if the creator of the table made the first row special which
    # will likely mean it's a header candidate. An NA result from
    # has_header() is reported as "unlikely".
    hdr <- has_header(tbl, rows, ns)
    if (is.na(hdr)) {
      cat("  has header : unlikely\n")
    } else {
      cat(sprintf("  has header : likely! => possibly [%s]\n", hdr))
    }

    cat("\n")

  }

  invisible(NULL)

}

#' Returns information about the comments in the Word document
#'
#' Prints the document path, the total number of comments and a table of
#' comment counts per author, ordered from most to fewest comments.
#'
#' @param docx \code{docx} object read with \code{read_docx}
#' @return \code{NULL} (invisibly) when the document has no comments;
#'   otherwise the value returned by \code{print()} for the per-author
#'   summary table.
#' @export
#' @examples
#' cmnts <- read_docx(system.file("examples/comments.docx", package="docxtractr"))
#' docx_cmnt_count(cmnts)
#' docx_describe_cmnts(cmnts)
docx_describe_cmnts <- function(docx) {

  ensure_docx(docx)
  if (!(docx_cmnt_count(docx) > 0)) {
    message("No comments in document")
    return(invisible(NULL))
  }

  cmnts <- docx$cmnts

  cat(sprintf("Word document [%s]\n\n", docx$path))

  cat(sprintf("Found %d comments.\n", length(cmnts)))

  # turn each comment's attribute vector into a one-row data frame
  # (attribute names become columns) and stack the rows
  meta <- purrr::map_df(xml2::xml_attrs(cmnts), function(x) {
    as.data.frame(
      t(cbind.data.frame(x, stringsAsFactors = FALSE)),
      stringsAsFactors = FALSE
    )
  })

  # attach the text of each comment alongside its attributes
  cmnt_df <- dplyr::bind_cols(
    meta,
    cbind.data.frame(comment_text = xml2::xml_text(cmnts),
                     stringsAsFactors = FALSE)
  )

  # comments per author, most prolific first
  aut_df <- dplyr::count(cmnt_df, author)
  aut_df <- dplyr::arrange(aut_df, dplyr::desc(n))

  print(dplyr::select(aut_df, author, `# Comments` = n))

}

#' Display information about the document
#'
#' Prints the table description followed by the comment description for
#' \code{x}.
#'
#' @param x \code{docx} object
#' @param ... ignored
#' @return \code{x}, invisibly, as is conventional for \code{print} methods.
#' @export
print.docx <- function(x, ...) {
  docx_describe_tbls(x)
  docx_describe_cmnts(x)
  # print methods hand back their argument invisibly so the object can be
  # piped or assigned after printing
  invisible(x)
}
# hrbrmstr/docxtractr documentation built on July 11, 2020, 10:06 p.m.