tidycells-intro.R
In tidycells: Read Tabular Data from Diverse Sources and Easily Make Them Tidy

## ---- include = FALSE---------------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

## ----setup, include = FALSE---------------------------------------------------
library(tidycells)

## ---- eval=FALSE--------------------------------------------------------------
#  read_cells(file_name)

## -----------------------------------------------------------------------------
fold <- system.file("extdata", "messy", package = "tidycells", mustWork = TRUE)
# this is because extension is intentionally given wrong  
# while filename is actual identifier of the file type
fcsv <- list.files(fold, pattern = "^csv.", full.names = TRUE)[1]

## ---- eval=FALSE--------------------------------------------------------------
#  utils::read.csv(fcsv)

## ---- echo=FALSE--------------------------------------------------------------
knitr::kable(utils::read.csv(fcsv), align = "l", longtable = FALSE)

## ---- eval=FALSE--------------------------------------------------------------
#  read_cells(fcsv)

## ---- echo=FALSE--------------------------------------------------------------
knitr::kable(read_cells(fcsv))

## ---- eval=FALSE--------------------------------------------------------------
#  read.csv(fcsv) %>% tidyr::gather(measure, val_of_measure, -Kid.Name)

## ---- echo=FALSE--------------------------------------------------------------
knitr::kable(read.csv(fcsv) %>% tidyr::gather(measure, val_of_measure, -Kid.Name), align = "l", longtable = FALSE)

## ---- eval=FALSE--------------------------------------------------------------
#  read_cells()

## ---- out.width = "550px", echo=FALSE, dpi=300--------------------------------
knitr::include_graphics("ext/read_cells_out.png")

## -----------------------------------------------------------------------------
d1 <- read_cells(fcsv, at_level = "va_classify") %>% 
  read_cells()
d2 <- read_cells(fcsv, at_level = 2) %>% 
  read_cells(at_level = 4) %>% 
  read_cells()
identical(d1, d2)

## ---- out.width = "356px", echo=FALSE, fig.align='center', dpi=300------------
knitr::include_graphics("ext/read_cells.svg")

## -----------------------------------------------------------------------------
x <- iris %>% head() 
cd <- x %>% as_cell_df(take_col_names = TRUE)

## ---- eval=TRUE, out.width = "356px"------------------------------------------
plot(cd, adaptive_txt_size = FALSE, txt_size = 2.5)

## ---- eval=FALSE--------------------------------------------------------------
#  dummy_dat <- tibble::tibble(name = c("Mr. X", "Mr. Y"), sal = c("1,000","12,000"), age = c(35, 60))
#  dummy_dat

## ---- echo=FALSE--------------------------------------------------------------
dummy_dat <- tibble::tibble(name = c("Mr. X", "Mr. Y"), sal = c("1,000","12,000"), age = c(35, 60))
knitr::kable(dummy_dat)

## -----------------------------------------------------------------------------
dummy_dat_cells <- dummy_dat %>% 
  as_cell_df(take_col_names = TRUE) 

## -----------------------------------------------------------------------------
va_cells <- numeric_values_classifier(dummy_dat_cells)

## -----------------------------------------------------------------------------
va_cells <- value_attribute_classify(dummy_dat_cells, 
                                     classifier = numeric_values_classifier())

## ---- fig.width=3, fig.height=2, eval=TRUE, out.width = "256px"---------------
plot(va_cells, adaptive_txt_size = FALSE, txt_size = 3)

## ---- eval=TRUE, out.width = "356px"------------------------------------------
# let's resume with iris example
cd <- numeric_values_classifier(cd)
plot(cd, adaptive_txt_size = FALSE, txt_size = 2.5)

## ---- echo=FALSE--------------------------------------------------------------
cd <- numeric_values_classifier(cd)

## -----------------------------------------------------------------------------
ca <- analyze_cells(cd)

## ---- eval=TRUE, out.width = "356px"------------------------------------------
plot(ca)

## ---- eval=FALSE--------------------------------------------------------------
#  dcomp <- compose_cells(ca)
#  # discarding other columns for printing (you can check them!)
#  head(dcomp[1:7])

## ---- echo=FALSE--------------------------------------------------------------
dcomp <- compose_cells(ca)
knitr::kable(head(dcomp[1:7]))

## ---- eval= FALSE-------------------------------------------------------------
#  # let's check the same for va_cells (earlier dummy example)
#  va_cells %>%
#    analyze_cells() %>%
#    compose_cells(discard_raw_cols = TRUE)

## ---- echo=FALSE--------------------------------------------------------------
va_cells %>% 
  analyze_cells() %>% 
  compose_cells(discard_raw_cols = TRUE) %>% 
  knitr::kable()

## ---- fig.width=3, fig.height=2-----------------------------------------------
cell_composition_traceback(ca, trace_row = 1)
cell_composition_traceback(ca, trace_row = 10)

## ---- eval=FALSE--------------------------------------------------------------
#  dcc <- collate_columns(dcomp)
#  head(dcc)

## ---- echo=FALSE--------------------------------------------------------------
dcc <- collate_columns(dcomp)
head(dcc) %>% knitr::kable()

## ---- eval=FALSE--------------------------------------------------------------
#  # below should match with
#  # read_cells(fcsv) %>% dplyr::select(-table_tag)
#  # * which (the column table_tag) is added to identify
#  # * potential multiple tables.
#  fcsv %>%
#    read.csv() %>%
#    as_cell_df(take_col_names = TRUE) %>%
#    numeric_values_classifier() %>%
#    analyze_cells() %>%
#    compose_cells() %>%
#    collate_columns()

## ---- echo=FALSE--------------------------------------------------------------
fcsv %>% 
  read.csv() %>% 
  as_cell_df(take_col_names = TRUE) %>% 
  numeric_values_classifier() %>% 
  analyze_cells() %>% 
  compose_cells() %>% 
  collate_columns() %>% 
  knitr::kable()

## -----------------------------------------------------------------------------
rc_part <- read_cells(fcsv, at_level = 2)
# it is a list with `read_cells_stage` attribute
# which indicate the last processed stage in read_cells
str(rc_part)
# sample_based_classifier is another VA classifier
# for details see coresponding documentation
rc_part[[1]] <- rc_part[[1]] %>% 
  numeric_values_classifier() %>% 
  sample_based_classifier(empty_sample = "6")
  

# below should be similar to 
# rc_part[[1]] %>% 
#   analyze_cells() %>% 
#   compose_cells(discard_raw_cols = TRUE)
rc_part %>% read_cells(from_level = 3)

## ---- eval=FALSE--------------------------------------------------------------
#  system.file("extdata", "marks.xlsx", package = "tidycells", mustWork = TRUE)

## ---- out.width = "451px", echo=FALSE, eval=TRUE, dpi=300---------------------
knitr::include_graphics("ext/marks.png")

## ---- eval=FALSE--------------------------------------------------------------
#  system.file("extdata", "marks.xlsx", package = "tidycells", mustWork = TRUE) %>%
#    read_cells()

## ---- echo=FALSE--------------------------------------------------------------
if(rlang::is_installed("tidyxl")){
  system.file("extdata", "marks.xlsx", package = "tidycells", mustWork = TRUE) %>% 
    read_cells() %>% 
    knitr::kable()
}

## ---- eval=FALSE--------------------------------------------------------------
#  # if you have tidyxl installed
#  d <- system.file("extdata", "marks.xlsx", package = "tidycells", mustWork = TRUE) %>%
#    read_cells(at_level = "make_cells") %>%
#    .[[1]]

## -----------------------------------------------------------------------------
# or you may do
d <- system.file("extdata", "marks_cells.rds", package = "tidycells", mustWork = TRUE) %>% 
  readRDS()
d <- numeric_values_classifier(d)
da <- analyze_cells(d)

## ---- echo=TRUE, eval=FALSE---------------------------------------------------
#  dc <- compose_cells(da, print_attribute_overview = TRUE)

## ---- out.width = "451px", echo=FALSE, eval=TRUE, dpi=300---------------------
knitr::include_graphics("ext/compose_cells_cli1.png")
dc <- compose_cells(da)

## -----------------------------------------------------------------------------
# bit tricky and tedious unless you do print_attribute_overview = TRUE in above line
dcfine <- dc %>% 
  dplyr::mutate(name = dplyr::case_when(
    data_block == 1 ~ major_row_left_2_1,
    data_block == 2 ~ major_col_bottom_1_1,
    data_block == 3 ~ major_row_left_1_1
  ),
  sex = dplyr::case_when(
    data_block == 1 ~ major_row_left_1_1,
    data_block == 2 ~ major_col_bottom_2_1,
    data_block == 3 ~ minor_row_right_1_1
  ),
  school = dplyr::case_when(
    data_block == 1 ~ minor_col_top_1_1,
    data_block == 2 ~ minor_corner_topLeft_1_1,
    data_block == 3 ~ minor_col_top_1_1
  )) %>% 
  dplyr::select(school,sex, name, value)


## ---- echo=FALSE--------------------------------------------------------------
knitr::kable(head(dcfine), align = c(rep("l", 3), "c"))

## ---- eval=FALSE--------------------------------------------------------------
#  collate_columns(dc) %>%
#    head()

## ---- echo=FALSE--------------------------------------------------------------
knitr::kable(head(collate_columns(dc)), align = c(rep("l", 5), "c"))

## -----------------------------------------------------------------------------
dm <- tibble::tibble(fn = list.files(fold, full.names = T))
dm$fn %>% basename()

## ---- echo=FALSE, include=FALSE-----------------------------------------------
# filter based on file-type
dm <- dm %>%
    dplyr::mutate(original = dm$fn %>%
      purrr::map_chr(~ basename(.x) %>%
        stringr::str_split("\\.") %>%
        purrr::map_chr(1)))
if(!(rlang::is_installed("readxl") | rlang::is_installed("xlsx"))){
  dm <- dm %>% dplyr::filter(original!="xls")
}
if(!(rlang::is_installed("tidyxl"))){
  dm <- dm %>% dplyr::filter(original!="xlsx")
}
if(!(rlang::is_installed("docxtractr"))){
  dm <- dm %>% dplyr::filter(original!="docx" & original!="doc")
}
if(!(rlang::is_installed("tabulizer"))){
  dm <- dm %>% dplyr::filter(original!="pdf")
}
if(!(rlang::is_installed("XML"))){
  dm <- dm %>% dplyr::filter(original!="html")
}

## -----------------------------------------------------------------------------
dcomps <- dm$fn %>% purrr::map(read_cells)
dcomps_sel <- dcomps %>%
    purrr::map(~ .x %>%
      dplyr::select(value, collated_1, collated_2))
# all of them are same [intentionaly made. but the file types are totally different]
dcomps_sel %>% 
  purrr::map_lgl(~identical(.x, dcomps_sel[[1]])) %>% 
  all()
# check one sample
dcomps_sel[[1]]


## ---- out.width = "516px", echo=FALSE, dpi=300--------------------------------
knitr::include_graphics("ext/v12.png")

## ---- out.width = "516px", echo=FALSE, dpi=300--------------------------------
knitr::include_graphics("ext/v34.png")

## ---- out.width = "516px", echo=FALSE, dpi=300--------------------------------
knitr::include_graphics("ext/v56.png")

Any scripts or data that you put into this service are public.

tidycells documentation built on March 26, 2020, 7:35 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

tidycells
Read Tabular Data from Diverse Sources and Easily Make Them Tidy

inst/doc/tidycells-intro.R
In tidycells: Read Tabular Data from Diverse Sources and Easily Make Them Tidy

Try the tidycells package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

tidycells Read Tabular Data from Diverse Sources and Easily Make Them Tidy

inst/doc/tidycells-intro.R In tidycells: Read Tabular Data from Diverse Sources and Easily Make Them Tidy

Try the tidycells package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

tidycells
Read Tabular Data from Diverse Sources and Easily Make Them Tidy

inst/doc/tidycells-intro.R
In tidycells: Read Tabular Data from Diverse Sources and Easily Make Them Tidy