background-image: url(pics/manbeer.jpg) background-size: 100% class: middle, center
library(knitr) options(htmltools.dir.version = FALSE, cache=TRUE) opts_chunk$set(comment = NA, prompt=TRUE) #opts_chunk$set(dev.args=list(bg="transparent"), fig.width=15, fig.height=7) source("kutheme.R")
class: center
# Simulated measurements for six subjects over three days. Days 2 and 3 start
# with missing entries and contain hard-coded values (88, 99, 40) that lie
# outside the sampled 1-25 range.
DF <- data.frame(
  day1 = sample.int(25, 6),
  day2 = c(NA, NA, sample.int(25, 3), 88),
  day3 = c(NA, 99, 40, sample.int(25, 3))
)

# Show only the first four rows
head(DF, n = 4)
class: center
head(DF, 6)
background-image: url(pics/mau.png) background-size: 80% class: center
library(extrafont) loadfonts() library(ggplot2) library(ggthemr) claus <- define_palette(swatch = c('#eeeeee', '#FFFACD','#901A1E', 'lightblue', 'green', 'blue', 'brown', 'purple', 'yellow'), gradient = c(lower = 'red', upper = 'green'), background = "#222222", text = c("#eeeeee", "#eeeeee"), line = c("#ee1e1e", "#6e6e6e"), # Axis lines gridline = "#434343") # Gridlines ggthemr(claus, layout="scientific", type = 'outer', spacing=1.2) tlfb <- data.frame(units=dget("tlfb.txt")) p <- ggplot(tlfb, aes(units)) + geom_dotplot() + # ggtitle("Baseline alcohol") + xlab("Units") + ylab("Frequency") + theme(text=element_text(size=24, family="Comic Sans MS"), axis.text.y=element_blank(), axis.ticks.y=element_blank()) theme_xkcd <- theme( # panel.background = element_rect(fill="transparent"), # plot.background = element_rect(fill="transparent"), #axis.ticks = element_line(colour=NA), ## panel.grid = element_line(colour="white"), #axis.text.y = element_text(colour=NA), ## axis.text.x = element_text(colour="black"), text = element_text(size=36, family="Humor Sans") ) ## p + theme_xkcd # Rotate the dot plot # p + coord_flip() #p <- ggplot(mtcars, aes(x=wt, y=mpg)) + geom_point(size=3) + # ggtitle("Fuel Efficiency of 32 Cars") + # xlab("Weight (x1000 lb)") + ylab("Miles per Gallon") + # theme(text=element_text(size=16, family="Comic Sans MS")) #p
What didn't we check?
--
class: center
library(dataMaid) data(toyData)
.small[
toyData
]
class: middle
library(dataMaid) data(toyData) makeDataReport(toyData) #<<
Documentation to be read and evaluated by a human.
See github.com/ekstroem/dataMaid for more info. Stable version on CRAN.
background-image: url(pics/flowchart2.png) background-size: 100% class: center
dataMaid
flowchartlibrary(DiagrammeR) library(DiagrammeRsvg) library(svglite) library(rsvg) grViz(" digraph { graph[layout = neato, overlap=false, rankdir = 'LR', bgcolor='#000000', sep=.2] node[fontsize=20, fillcolor = DimGray, fontcolor = White, fillcolor='#222222', style=filled, color=White] edge [color = White, penwidth=4] 'data frame' -> 'summarize' subgraph detail { rank='same'; 'summarize' -> 'visualize' -> 'check'; 'check' -> 'summarize'; } 'check' -> '.Rmd / render'; }")
knitr::include_graphics('pics/summ.png', dpi = NA)
background-image: url(pics/miss.png) background-size: 100%
background-image: url(pics/out1.png) background-size: 100%
background-image: url(pics/out2.png) background-size: 100%
`dataMaid` common arguments

.small[
| Argument | Description |
| ------------- |:--------------------------------|
| `mode` | Tasks to perform. `c("summarize", "visualize", "check")` is default |
| `replace` | Logical. Should existing dataMaid reports be overwritten? Default `FALSE` |
| `output` | Output format. Choices are `"pdf"`, `"html"`, `"word"` |
| `onlyProblematic` | Logical. Show only variables with problems. Default `FALSE` |
| `maxProbVals` | Maximum number of unique values printed. Positive integer or `Inf` (default 10) |
]
class: inverse, middle
`dataMaid` package

`makeDataReport()` to generate a data report

`bigPresidentData` data for errors we did not find before.

Hunt for errors!
class: center
.small[
toyData
]
`dataMaid` interactively

.footnotesize[
check(toyData$events) check(toyData$events, numericChecks = "identifyMissing")
]
???
check er de funktioner, der bliver checket for
Kan sætte specifikke (også i makeDataReport)
Vis, at det er en liste med 2 elementer, og den form, som de har.
`check` functions

.footnotesize[
allCheckFunctions()
]
.footnotesize[
```
-----------------------------------------------------------------------------
name                 description                     classes
-------------------- ------------------------------- ------------------------
identifyCaseIssues   Identify case issues            character, factor

identifyLoners       Identify levels with < 6 obs.   character, factor

identifyMissing      Identify miscoded missing       character, Date, factor,
                     values                          integer, labelled,
                                                     logical, numeric
```
]
and more
`dataMaid` - visualizations

.pull-left[ .footnotesize[
visualize(toyData$events)
Can also check the available visual
functions
allVisualFunctions()
] ]
.pull-right[
visualize(toyData$events) # Visualize variable
]
`dataMaid` - summaries

.footnotesize[
summarize(toyData$events)
]
.small[
```
> allSummaryFunctions()
-----------------------------------------------------------------------
name           description                     classes
-------------- ------------------------------- ------------------------
centralValue   Compute median or mode          character, Date, factor,
                                               integer, labelled,
                                               logical, numeric

countMissing   Compute ratio of missing obs.   character, Date, factor,
                                               integer, labelled,
                                               logical, numeric

minMax         Find min and max values         integer, numeric, Date

quartiles      Compute 1st and 3rd quartiles   Date, integer, numeric

uniqueValues   Count number of unique values   character, Date, factor,
                                               integer, labelled,
                                               logical, numeric

variableType   Data class of variable          character, Date, factor,
                                               integer, labelled,
                                               logical, numeric
-----------------------------------------------------------------------
```
]
dataMaid
Custom check, visual, or summary functions.
Few requirements --- input and output formats.
Easiest to work with a template system and modify one of the templates.
Check the vignette `vignette("extending_dataMaid")` for detailed instructions. Or the exercises!
`summaryFunction` - template

.footnotesize[
# Template for a custom summary function. The bracketed parts are placeholders
# to be replaced with real code before the function can run.
#
# v:   the variable to summarize
# ...: further arguments (not used by the template itself)
mySummaryFunction <- function(v, ...) {
  # Placeholder: the raw result of the summary computed from v
  val <- [ result of whatever summary we are doing ]
  # Placeholder: the same result, escaped so it is safe to print
  res <- [ properly escaped version of val ]
  # Wrap the feature name, printable result and raw value in a summaryResult
  summaryResult(list(feature = "[Feature name]", result = res, value = val))
}
]
Example (centralValue for numeric/integer)
.footnotesize[
# Median summary of a variable.
#
# v:           the variable to summarize
# maxDecimals: number of decimals used when rounding the printed result
function (v, maxDecimals = 2) {
  # Drop missing values before computing the median
  v <- na.omit(v)
  val <- median(v)
  # `result` carries the rounded median, `value` the unrounded one
  summaryResult(list(feature = "Median", result = round(val, maxDecimals), value = val))
}
]
`checkFunction` - template

.small[
# Check whether a variable appears to contain US social security numbers,
# i.e. values matching the pattern ddd-dd-dddd.
#
# v:    the variable (vector) to check; only character, factor and labelled
#       variables are inspected, everything else is reported as unproblematic
# nMax: not used in the body; presumably kept so the signature matches other
#       check functions (verify against the dataMaid conventions)
# ...:  ignored
#
# Returns a list with the elements
#   problem:       TRUE if at least one value matches the SSN pattern
#   message:       a warning text when a problem was found, otherwise ""
#   problemValues: "Will not show" when a problem was found (the offending
#                  values are deliberately not echoed), otherwise NULL
isSSN <- function(v, nMax = NULL, ...) {
  out <- list(problem = FALSE, message = "", problemValues = NULL)

  # inherits() handles objects with several classes (e.g. ordered factors have
  # class c("ordered", "factor")). Testing `class(v) %in% ...` inside if()
  # yields a condition of length > 1 for those, which is an error in R >= 4.2.
  if (inherits(v, c("character", "factor", "labelled"))) {
    # grepl() returns one logical per element, so any() needs no coercion
    if (any(grepl("\\d{3}-\\d{2}-\\d{4}", v))) {
      out$problem <- TRUE
      out$message <- "Warning: Seems to contain SSNs."
      out$problemValues <- "Will not show"
    }
  }

  out
}
]

.footnotesize[
# Demo data with two character columns: `ids` uses the dashed ddd-dd-dddd
# layout, `id2` holds undashed digit strings. Strings are kept as character
# vectors rather than converted to factors.
DF <- data.frame(
  ids = c("111-22-3333", "123-45-6789", "111-22-3333"),
  id2 = c("111223333", "123456789", "4728491283"),
  stringsAsFactors = FALSE
)

# Run check() with isSSN as the check for character variables
check(DF, characterChecks = "isSSN")
]
class: inverse
How to tailor `dataMaid` to work with your dataset:

`dataMaid` (exercise 2b)

Pick whatever you want. Or jump back and forth.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.