R/pmc_table.R

Defines functions pmc_table

Documented in pmc_table

#' Convert table nodes to tibbles
#'
#' Convert PubMed Central table nodes into a list of tibbles
#'
#' @param doc \code{xml_document} from PubMed Central
#'
#' @return a list of tibbles
#'
#' @note Saves the caption and footnotes as attributes and collapses multiline
#' headers, expands all rowspan and colspan attributes and adds
#' subheadings to column one.
#'
#' @author Chris Stubben
#'
#' @examples
#' # doc <- pmc_xml("PMC2231364")
#' doc <- xml2::read_xml(system.file("extdata/PMC2231364.xml",
#'   package = "tidypmc"
#' ))
#' x <- pmc_table(doc)
#' sapply(x, dim)
#' x
#' attributes(x[[1]])
#' @export

pmc_table <- function(doc) {
  if (class(doc)[1] != "xml_document") {
    stop("doc should be an xml_document from PubMed Central")
  }
  twn <- length(xml2::xml_find_all(doc, "//table-wrap"))
  ## Avoid table-wrap without table node, usually link to image only
  z <- xml2::xml_find_all(doc, "//table-wrap/table/..")
  if (length(z) == 0) {
    message("No tables found")
    if (twn > 0) message("Table-wrap with link to image?")
    tbls <- NULL
  } else {
    tbl_nodes <- xml2::xml_find_all(z, "./table")
    message("Parsing ", length(z), " tables")
    if (twn > length(z)) {
      message(twn - length(n), " /table-wrap with link to image?")
    }
    ## START table function
    #  t1 <- xml2::xml_find_all(doc, "//table")[1]
    tbls <- lapply(tbl_nodes, function(t1) {
      # PARSE HEADER
      x <- xml2::xml_find_all(t1, ".//thead/tr")
      # cat(as.character(x))
      ## missing header
      if (length(x) == 0) {
        thead <- NA
        ## 1 header row...
      } else if (length(x) == 1) {
        colspan <- as.numeric(xml2::xml_attr(
          xml2::xml_find_all(x, ".//td|.//th"), "colspan",
          default = "1"
        ))
        thead <- xml2::xml_text(xml2::xml_find_all(x, ".//td|.//th"))
        # repeat across colspan
        if (any(colspan > 1)) {
          thead <- rep(thead, colspan)
        }
        # mutliline header - collapse into single row
        # SEE  tables 1 and 2 in PMC3109299
      } else {
        nr <- length(x)
        nc <- max(vapply(x, function(y) sum(as.numeric(xml2::xml_attr(
            xml2::xml_find_all(y, ".//td|.//th"), "colspan",
            default = "1"
          ))), double(1)))
        c2 <- data.frame(matrix(NA, nrow = nr, ncol = nc))
        for (i in seq_len(nr)) {
          rowspan <- as.numeric(xml2::xml_attr(xml2::xml_find_all(
            x[[i]], ".//td|.//th"
          ), "rowspan", default = "1"))
          colspan <- as.numeric(xml2::xml_attr(xml2::xml_find_all(
            x[[i]], ".//td|.//th"
          ), "colspan", default = "1"))
          thead <- xml2::xml_text(xml2::xml_find_all(
            x[[i]], ".//td|.//th"
          ))
          if (any(colspan > 1)) {
            thead <- rep(thead, colspan)
            rowspan <- rep(rowspan, colspan)
          }
          # fill values into empty cells
          n <- which(is.na(c2[i, ]))
          ## truncate to avoid warning - see PMC3119406
          if (length(thead) != length(n)) thead <- thead[seq_along(n)]
          c2[ i, n] <- thead
          if (any(rowspan > 1)) {
            for (j in seq_along(rowspan)) {
              if (rowspan[j] > 1) {
                ## repeat value down column
                c2[(i + 1):(i + (rowspan[j] - 1)), n[j]] <- thead[j]
              }
            }
          }
        }
        ## COLLAPSE into single row...
        ## some rowspans may extend past nr!  see table 1 PMC3109299
        if (nrow(c2) > nr) c2 <- c2[seq_len(nr), ]
        ## collaps3 column names and row values uses ";" as separator
        thead <- apply(c2, 2, function(x)
          paste(unique(x), collapse = ": "))
        # some mutliline rows with horizontal lines only
        thead <- gsub(": : ", ": ", thead)
        thead <- gsub("^: ", "", thead)
        thead <- gsub(": $", "", thead)
      }
      #-------------------------------------------------------------------
      # PARSE TABLE
      # Do not repeat values with colspans across rows (usually table
      # subheaders). Repeat values with rowspan down columns
      x <- xml2::xml_find_all(t1, ".//tbody/tr")
      # number of rows
      nr <- length(x)
      nc <- max(vapply(x, function(y) sum(as.numeric(xml2::xml_attr(
          xml2::xml_find_all(y, ".//td|.//th"), "colspan",
          default = "1"
        ))), double(1)))
      c2 <- data.frame(matrix(NA, nrow = nr, ncol = nc))
      for (i in seq_len(nr)) {
        ## some table use //th  see table1 PMC3031304
        rowspan <- xml2::xml_attr(xml2::xml_find_all(
          x[[i]], ".//td|.//th"
        ), "rowspan", default = "1")
        colspan <- xml2::xml_attr(xml2::xml_find_all(
          x[[i]], ".//td|.//th"
        ), "colspan", default = "1")
        # PMC6358641 with rowspan=""
        rowspan <- as.numeric(ifelse(rowspan == "", 1, rowspan))
        colspan <- as.numeric(ifelse(colspan == "", 1, colspan))
        val <- xml2::xml_text(xml2::xml_find_all(x[[i]], ".//td|.//th"))
        # NO-BREAK, EN or EM SPACE
        val <- gsub("\u00A0|\u2002|\u2003", " ", val)
        val <- trimws(val)
        if (any(colspan > 1)) {
          val <- rep(val, colspan)
          ##  only display subheader in column 1?
          val[-1][val[-1] == val[-length(val)]] <- NA
          rowspan <- rep(rowspan, colspan)
        }
        # fill values into empty cells
        n <- which(is.na(c2[i, ]))

        # some tables have extra td tags  see table 2  PMC3109299
        # <td align="left" rowspan="1" colspan="1"/>
        # truncate to avoid warning??
        if (length(val) != length(n)) {
          val <- val[seq_along(n) ]
        }
        c2[ i, n] <- val
        if (any(rowspan > 1)) {
          for (j in seq_along(rowspan)) {
            if (rowspan[j] > 1) {
              ## repeat value down column
              c2[ (i + 1):(i + (rowspan[j] - 1)), n[j]] <- val[j]
            }
          }
        }
      }
      x <- c2
      #-------------------------------------
      if (!is.na(thead[1])) {
        thead[thead == ""] <- "X"
        tbn <- ncol(x)
        thn <- length(thead)
        if (tbn != thn) {
          message("Warning: number of column in /thead and /tbody do not match")
          if (tbn > thn) {
            thead <- append(thead, rep("X", tbn - thn))
          } else {
            ## see table 3 from PMC3020393
            thead <- thead[seq_len(tbn)]
          }
        }
        thead <- gsub("\n", " ", thead)
        thead <- make.unique(thead)
        colnames(x) <- thead
      }
      # DELETE empty rows  -
      if (nrow(x) > 1) {
        nX <- apply(x, 1, function(y) sum(!(is.na(y) | y == "")))
        x <- x[nX != 0, , FALSE] # use FALSE in case only 1 column in TABLE
      }
      # FIX column typess
      ## errors if newlines and tabs in cells(or colnames!)
      colnames(x) <- gsub("\n *", "", colnames(x))
      x <- tibble::as_tibble(x)
      x <- suppressMessages(repeat_sub(x))
      x
    })
    ### END table functino
    #----------------------------------------------------
    ## should have label and caption?
    f1 <- vapply(z, function(x) xml2::xml_text(
        xml2::xml_find_first(x, "./label")
      ), character(1))
    f2 <- vapply(z, function(x) xml2::xml_text(
        xml2::xml_find_first(x, "./caption")
      ), character(1))
    # check length, some table-wrap with more than 1 /table tag
    if (length(f1) == length(tbls)) {
      names(tbls) <- f1
    }
    else {
      message("Number of /table nodes is not the sampe as /table-wrap")
    }
    if (length(f2) == length(tbls)) {
      for (i in seq_along(tbls)) {
        attr(tbls[[i]], "caption") <- f2[i]
      }
    }
    ## footnotes
    fn <- vapply(z, function(x) xml2::xml_text(
        xml2::xml_find_first(x, "./table-wrap-foot")
      ), character(1))
    n <- which(!is.na(fn))
    if (length(n) > 0) {
      message("Adding footnotes to Table ", paste(n, collapse = ","))
      for (i in n) {
        attr(tbls[[i]], "footnotes") <- fn[i]
      }
    }
  }
  tbls
}

Try the tidypmc package in your browser

Any scripts or data that you put into this service are public.

tidypmc documentation built on Aug. 1, 2019, 5:05 p.m.