R/read_worksheet.R

## [ ] 18.3.1.1  anchor (Object Cell Anchor)
## [ ] 18.3.1.2  autoFilter (AutoFilter Settings)
## [ ] 18.3.1.3  brk (Break)
## [x] 18.3.1.4  c (Cell) -- xlsx_ct_cell
## [ ] 18.3.1.5  cellSmartTag (Cell Smart Tag)
## [ ] 18.3.1.6  cellSmartTagPr (Smart Tag Properties)
## [ ] 18.3.1.7  cellSmartTags (Cell Smart Tags)
## [ ] 18.3.1.8  cellWatch (Cell Watch Item)
## [ ] 18.3.1.9  cellWatches (Cell Watch Items)
## [ ] 18.3.1.10 cfRule (Conditional Formatting Rule)
## [ ] 18.3.1.11 cfvo (Conditional Format Value Object)
## [ ] 18.3.1.12 chartsheet (Chart Sheet)
## [x] 18.3.1.13 col (Column Width & Formatting) -- xlsx_ct_cols
## [ ] 18.3.1.14 colBreaks (Vertical Page Breaks)
## [ ] 18.3.1.15 color (Data Bar Color)
## [ ] 18.3.1.16 colorScale (Color Scale)
## [x] 18.3.1.17 cols (Column Information) -- xlsx_ct_cols
## [ ] 18.3.1.18 conditionalFormatting (Conditional Formatting)
## [ ] 18.3.1.19 control (Embedded Control)
## [ ] 18.3.1.20 controlPr (Embedded Control Properties)
## [ ] 18.3.1.21 controls (Embedded Controls)
## [ ] 18.3.1.22 customPr (Custom Property)
## [ ] 18.3.1.23 customProperties (Custom Properties)
## [ ] 18.3.1.24 customSheetView (Custom Chart Sheet View)
## [ ] 18.3.1.25 customSheetView (Custom Sheet View)
## [ ] 18.3.1.26 customSheetViews (Custom Chart Sheet Views)
## [ ] 18.3.1.27 customSheetViews (Custom Sheet Views)
## [ ] 18.3.1.28 dataBar (Data Bar)
## [ ] 18.3.1.29 dataConsolidate (Data Consolidate)
## [ ] 18.3.1.30 dataRef (Data Consolidation Reference)
## [ ] 18.3.1.31 dataRefs (Data Consolidation References)
## [ ] 18.3.1.32 dataValidation (Data Validation)
## [ ] 18.3.1.33 dataValidations (Data Validations)
## [ ] 18.3.1.34 dialogsheet (Dialog Sheet)
## [ ] 18.3.1.35 dimension (Worksheet Dimensions)
## [ ] 18.3.1.36 drawing (Drawing)
## [ ] 18.3.1.37 drawingHF (Drawing Reference in Header Footer)
## [ ] 18.3.1.38 evenFooter (Even Page Footer)
## [ ] 18.3.1.39 evenHeader (Even Page Header)
## [ ] 18.3.1.40 f (Formula)
## [ ] 18.3.1.41 firstFooter (First Page Footer)
## [ ] 18.3.1.42 firstHeader (First Page Header)
## [ ] 18.3.1.43 formula (Formula)
## [ ] 18.3.1.44 formula1 (Formula 1)
## [ ] 18.3.1.45 formula2 (Formula 2)
## [ ] 18.3.1.46 headerFooter (Header Footer Settings)
## [ ] 18.3.1.47 hyperlink (Hyperlink)
## [ ] 18.3.1.48 hyperlinks (Hyperlinks)
## [ ] 18.3.1.49 iconSet (Icon Set)
## [ ] 18.3.1.50 ignoredError (Ignored Error)
## [ ] 18.3.1.51 ignoredErrors (Ignored Errors)
## [ ] 18.3.1.52 inputCells (Input Cells)
## [ ] 18.3.1.53 is (Rich Text Inline)
## [x] 18.3.1.54 mergeCell (Merged Cell) -- xlsx_ct_merge_cell
## [x] 18.3.1.55 mergeCells (Merge Cells) -- xlsx_read_merged
## [ ] 18.3.1.56 objectPr (Embedded Object Properties)
## [ ] 18.3.1.57 oddFooter (Odd Page Footer)
## [ ] 18.3.1.58 oddHeader (Odd Header)
## [ ] 18.3.1.59 oleObject (Embedded Object)
## [ ] 18.3.1.60 oleObjects (Embedded Objects)
## [ ] 18.3.1.61 outlinePr (Outline Properties)
## [ ] 18.3.1.62 pageMargins (Page Margins)
## [ ] 18.3.1.63 pageSetup (Page Setup Settings)
## [ ] 18.3.1.64 pageSetup (Chart Sheet Page Setup)
## [ ] 18.3.1.65 pageSetUpPr (Page Setup Properties)
## [x] 18.3.1.66 pane (View Pane) -- xlsx_ct_pane
## [ ] 18.3.1.67 picture (Background Image)
## [ ] 18.3.1.68 pivotArea (Pivot Area)
## [ ] 18.3.1.69 pivotSelection (PivotTable Selection)
## [ ] 18.3.1.70 printOptions (Print Options)
## [ ] 18.3.1.71 protectedRange (Protected Range)
## [ ] 18.3.1.72 protectedRanges (Protected Ranges)
## [ ] 18.3.1.73 row (Row)
## [ ] 18.3.1.74 rowBreaks (Horizontal Page Breaks (Row))
## [ ] 18.3.1.75 scenario (Scenario)
## [ ] 18.3.1.76 scenarios (Scenarios)
## [ ] 18.3.1.77 securityDescriptor (Security Descriptor)
## [ ] 18.3.1.78 selection (Selection)
## [ ] 18.3.1.79 sheetCalcPr (Sheet Calculation Properties)
## [x] 18.3.1.80 sheetData (Sheet Data) -- xlsx_read_sheet_data
## [ ] 18.3.1.81 sheetFormatPr (Sheet Format Properties)
## [ ] 18.3.1.82 sheetPr (Sheet Properties)
## [ ] 18.3.1.83 sheetPr (Chart Sheet Properties)
## [ ] 18.3.1.84 sheetProtection (Chart Sheet Protection)
## [ ] 18.3.1.85 sheetProtection (Sheet Protection Options)
## [-] 18.3.1.86 sheetView (Chart Sheet View)
## [x] 18.3.1.87 sheetView (Worksheet View) -- xlsx_ct_worksheet_view
## [x] 18.3.1.88 sheetViews (Sheet Views) -- xlsx_ct_worksheet_views
## [-] 18.3.1.89 sheetViews (Chart Sheet Views)
## [ ] 18.3.1.90 smartTags (Smart Tags)
## [ ] 18.3.1.91 sortCondition (Sort Condition)
## [ ] 18.3.1.92 sortState (Sort State)
## [ ] 18.3.1.93 tabColor (Sheet Tab Color)
## [ ] 18.3.1.94 tablePart (Table Part)
## [ ] 18.3.1.95 tableParts (Table Parts)
## [ ] 18.3.1.96 v (Cell Value)
## [ ] 18.3.1.97 webPublishItem (Web Publishing Item)
## [ ] 18.3.1.98 webPublishItems (Web Publishing Items)
## [ ] 18.3.1.99 worksheet (Worksheet)

## 18.3.1.17 cols
xlsx_ct_cols <- function(xml, ns) {
  classes <- c(best_fit="logical", collapsed="logical",
               custom_width="logical", hidden="logical",
               min="integer", max="integer", outline_level="integer",
               style="integer", width="numeric")
  process_container(xml, xlsx_name("cols", ns), ns, xlsx_ct_col,
                    classes=classes)
}

## NOTE: width is a funny measurement that is a hybrid of pixels and
## character width:
##
## > Column width measured as the number of characters of the maximum
## > digit width of the numbers 0, 1, 2, ..., 9 as rendered in the
## > normal style's font. There are 4 pixels of margin padding (two on
## > each side), plus 1 pixel padding for the gridlines.
##
## But the actual calculation of how much space there is will vary
## with things like the border and padding thickness, boldness, etc.
xlsx_ct_col <- function(xml, ns) {
  at <- as.list(xml2::xml_attrs(xml))
  tibble::tibble(
    best_fit = attr_bool(at$bestFit, FALSE),
    collapsed = attr_bool(at$collapsed, FALSE),
    custom_width = attr_bool(at$customWidth, FALSE),
    hidden = attr_bool(at$hidden, FALSE),
    min = attr_integer(at$min),
    max = attr_integer(at$max),
    outline_level = attr_integer(at$outlineLevel),
    ## phonetic = attr_bool(at$phonetic),
    style = attr_integer(at$style), # for new cols only
    width = attr_numeric(at$width))
}

## 18.3.1.55 mergeCells (Merge Cells)
xlsx_read_merged <- function(xml, ns) {
  merged <- xml2::xml_children(
    xml2::xml_find_first(xml, xlsx_name("mergeCells", ns), ns))
  lapply(merged, xlsx_ct_merge_cell)
}

## 18.3.1.54 mergeCell (Merged Cell)
xlsx_ct_merge_cell <- function(x) {
  cellranger::as.cell_limits(xml2::xml_attr(x, "ref"))
}

## 18.3.1.80 sheetData (Sheet Data)
xlsx_read_sheet_data <- function(xml, ns, strings) {
  rows <- xml2::xml_children(
    xml2::xml_find_first(xml, xlsx_name("sheetData", ns), ns))
  dat <- lapply(rows, xlsx_ct_row, ns, strings)

  cells <- rbind_df(unlist(lapply(dat, "[[", "cells"), FALSE),
                    c(ref="character", style="integer", type="character",
                      formula="character", value="list"))
  rows <- rbind_df(lapply(dat, "[[", "row"),
                   vcapply(xlsx_ct_row(NULL)$row, storage.mode))

  list(rows=rows, cells=cells)
}

## 18.3.1.73 row (Row)
xlsx_ct_row <- function(xml, ns, strings) {
  ## NOTE: the r attribute is optional and the <cell> 'r' attribute is
  ## optional so it seems possible that we could end up unable to
  ## determine where rows and cells are?  Unless the the fact that
  ## rows are an xsd:sequence comes in to help?
  at <- xml_attrs_list(xml)

  row <- list(
    r = attr_integer(at$r),
    spans = attr_character(at$spans),
    s = attr_integer(at[["s"]]), # style
    custom_format = attr_bool(at$customFormat, FALSE),
    ht = attr_numeric(at$ht), # height -- vs <col> that uses "width" :-/
    hidden = attr_bool(at$hidden, FALSE),
    custom_height = attr_bool(at$customHeight),
    outline_level = attr_integer(at$outlineLevel),
    collapsed = attr_bool(at$collapsed, FALSE),
    ## ph = attr_bool(at$ph, FALSE))
    thick_top = attr_bool(at$thickTop, FALSE),
    thick_bot = attr_bool(at$thickBot, FALSE))

  if (is.null(xml)) {
    cells <- list()
  } else {
    cells <- lapply(xml2::xml_find_all(xml, xlsx_name("c", ns), ns),
                    xlsx_ct_cell, ns, strings)
  }

  list(row=row, cells=cells)
}

## 18.3.1.4  c (Cell)
##
## > While a cell can have a formula element f and a value element v,
## > when the cell's type t is inlineStr then only the element is is
## > allowed as a child element.
##
## Ignoring attributes:
## * cm metadata
## * vm metadat
## * ph phonetic information
##
## Cell types are enumerated in 18.18.11:
##
##   b: Boolean
##   d: Date (in the ISO 8601 format)
##   e: Error
##   inlineStr: Inline String (i.e., one not in the shared string
##              table. If this cell type is used, then the cell value
##              is in the is element rather than the v element in the
##              cell (c element).)
##   n: Number
##   s: Shared String
##   str: String (a formula string)
xlsx_ct_cell <- function(xml, ns, strings) {
  at <- as.list(xml2::xml_attrs(xml))

  type <- attr_character(at$t)
  if (identical(type, "inlineStr")) { # avoid missingness
    formula <- NA_character_
    ## value here _must_ be present, so no conditional (vs below)
    value <- xlsx_ct_rst(xml2::xml_find_first(xml, xlsx_name("is", ns), ns), ns)
  } else {
    formula <- xml2::xml_text(xml2::xml_find_first(xml, xlsx_name("f", ns), ns))
    v <- xml2::xml_find_first(xml, xlsx_name("v", ns), ns)
    value <- if (inherits(v, "xml_missing")) NULL else xml2::xml_text(v)

    ## String substitutions from the string table:
    if (identical(type, "s")) {
      value <- strings[as.integer(unlist(value)) + 1L]
    } else if (identical(type, "b")) {
      ## TODO: This is probably worth doing?
      ## value <- as.logical(as.integer(value))
      value <- as.numeric(value)
    } else if (!is.null(value) &&
               (is.na(type) || identical(type, "n") || identical(type, "d"))) {
      ## TODO: Consider being more careful with the 'd' class here?
      value <- as.numeric(value)
    }
  }

  list(
    ref = attr_character(at$r),
    style = attr_integer(at$s) + 1L,
    type = type,
    formula = formula,
    value = value)
}

## 18.3.1.88 sheetViews
xlsx_ct_worksheet_views <- function(xml, ns) {
  xpath <- sprintf("./%s/%s",
                   xlsx_name("sheetViews", ns), xlsx_name("sheetView", ns))
  els <- xml2::xml_find_all(xml, xpath, ns)
  if (length(els) == 0L) {
    NULL
  } else if (length(els) == 1L) {
    xlsx_ct_worksheet_view(els[[1L]], ns)
  } else {
    tmp <- lapply(els, xlsx_ct_worksheet_view, ns)
    empty <- vlapply(tmp, is.null)
    if (all(empty)) {
      NULL
    } else {
      ## TODO: check the Enron corpus;
      ##    larry_campbell__21047__EOTT tanks tx NM.xlsx
      ## for a file where this is the case.  It needs opening in Excel
      ## to see what this resolves as.  Given we largely ignore these,
      ## we'll take the first for now.
      tmp[[which(!empty)[[1L]]]]
    }
  }
}

## 18.3.1.87 sheetView
xlsx_ct_worksheet_view <- function(xml, ns) {
  pane <- xml2::xml_find_first(xml, xlsx_name("pane", ns), ns)
  if (inherits(pane, "xml_missing")) NULL else xlsx_ct_pane(pane)
}

## 18.3.1.66 pane (View Pane)
xlsx_ct_pane <- function(xml, ns) {
  at <- xml_attrs_list(xml)
  ## x_split: Horizontal position of the split, in 1/20th of a point;
  ##    0 if none.  If the pane is frozen, this value indicates the
  ##    number of columns visible in the top pane.
  ## y_split: Vertical position of the split, in 1/20th of a point;
  ##    0 if none.  If the pane is frozen, this value indicates the
  ##    number of columns visible in the left pane.
  ## state: one of frozen / frozenSplit / split
  state <- attr_character(at$state)
  list(x_split = attr_numeric(at$xSplit, 0),
       y_split = attr_numeric(at$ySplit, 0),
       top_left = attr_character(at$topLeftCell),
       state = state,
       frozen = state != "split")
}
rsheets/rexcel documentation built on May 28, 2019, 3:32 a.m.