# Attach the packages used by the tutorial chunks (order kept as-is, since
# attach order decides which package masks which).
library(learnr)
library(tidyverse)
library(knitr)
library(here)
library(twitterwidget)
library(rlang)
library(ggrepel)
library(viridis)
library(gghighlight)
library(patchwork)

# Chunk defaults: do not echo chunk source, and center figures.
knitr::opts_chunk$set(
  echo = FALSE,
  fig.align = "center"
)
# Setup: read the tutorial data sets and source the helper scripts.
#
# Every statement sits on its own line. The `###` markers are plain separator
# comments and must never share a line with code: anything that follows them
# on the same line is treated as a comment and silently skipped.
#
# NOTE(review): these are absolute paths on one developer's machine, so the
# tutorial cannot be rendered elsewhere until they are made portable.
path <- "/Users/matthewhirschey/Dropbox/DUKE/PROJECTS/bespokeDS/bespokelearnr/inst/extdata"
proteins <- readRDS(file.path(path, "bespoke_dataframe.Rds"))
###
proteins_join <- readRDS(file.path(path, "bespoke_dataframe_join.Rds"))
# Right join: keep every row of proteins_join, matched to proteins on `id`.
joined <- proteins %>%
  right_join(proteins_join, by = "id")
###
df_input <- proteins # this line is for bespoke.R to get proper var

# The helper scripts live in the sibling `content` directory of `path`.
content_dir <- file.path(dirname(path), "content")
source(file.path(content_dir, "metadata.R"))
source(file.path(content_dir, "bespoke.R"))
# Embed the Unsplash photo that is credited in the caption line below.
url <- "https://source.unsplash.com/701-FJcjLAQ/500x500"
knitr::include_graphics(path = url)
Photo by National Cancer Institute on Unsplash
Volume
- Data collection & storage allows access to huge amounts of medical information
Ubiquity
- Data are available anywhere across geography, social, and economic classes
Latency
- Technology enables access to data with no delay
Data-driven decision making!
# Embed the DIKW pyramid figure hosted on Wikimedia Commons.
url <- "https://upload.wikimedia.org/wikipedia/commons/0/06/DIKW_Pyramid.svg"
knitr::include_graphics(path = url)
https://en.wikipedia.org/wiki/DIKW_pyramid
# Embed the data science Venn diagram (source link given on the next line).
url <- "https://images.squarespace-cdn.com/content/v1/5150aec6e4b0e340ec52710a/1364352051365-HZAS3CLBF7ABLE3F5OBY/ke17ZwdGBToddI8pDm48kB2M2-8_3EzuSSXvzQBRsa1Zw-zPPgdn4jUwVcJE1ZvWQUxwkmyExglNqGp0IvTJZUJFbgE-7XRK3dMEBRBhUpxPe_8B-x4gq2tfVez1FwLYYZXud0o-3jV-FAs7tmkMHY-a7GzQZKbHRGZboWC-fOc/Data_Science_VD.png?format=1500w"
knitr::include_graphics(path = url)
http://drewconway.com/zia/2013/3/26/the-data-science-venn-diagram
# Embed the second Venn diagram (attributed on the next line).
url <- "https://3.bp.blogspot.com/-bvQxcwfqATQ/V-E_uTBc4VI/AAAAAAAAMGQ/Qa1Ntef-rs0E-mWx5pkVu-CPlREdvD0TwCLcB/s1600/VennDiagram2.png"
knitr::include_graphics(path = url)
Joel Grus via KDnuggets
# Show the DIKW pyramid figure from Wikimedia Commons again.
url <- "https://upload.wikimedia.org/wikipedia/commons/0/06/DIKW_Pyramid.svg"
knitr::include_graphics(path = url)
https://en.wikipedia.org/wiki/DIKW_pyramid
# Build the widget once and keep it in `tweet` so it can be emitted inline.
# The argument is presumably the tweet/status id -- confirm against the
# twitterwidget documentation.
tweet <- twitterwidget("1125268670324695041")
`r tweet`
# Bar chart comparing the number of Excel users with the number of users of
# four programming languages, ordered from most to fewest users.
# TODO: need to add source
excel <- tibble(
  name = c("Excel", "Java", "C", "C++", "Python"),
  num = c(100000000, 9000000, 6000000, 4000000, 3000000)
)

excel_plot <- ggplot(excel) +
  geom_col(
    aes(x = fct_rev(fct_reorder(name, num)), y = num),
    fill = "navy"
  ) +
  labs(x = "", y = "Number of Users (Million)") +
  # Pin the breaks and derive the labels from them. A bare vector of five
  # labels only works while ggplot2 happens to choose exactly five default
  # breaks, and would mislabel or error if the data range ever changed.
  scale_y_continuous(
    breaks = seq(0, 100000000, by = 25000000),
    labels = function(x) as.character(x / 1e6)
  ) +
  theme_minimal()

excel_plot
Data scientists and programmers have strong opinions about the differences in languages
The focus here will be on the R programming language
Any questions?
# Embed the Unsplash photo that is credited in the caption line below.
url <- "https://source.unsplash.com/ZzWsHbu2y80/256x455"
knitr::include_graphics(path = url)
Photo by Hannah Wright on Unsplash
<-
(looks like an arrow pointing left) Assign an object; remember, no quotes on name
name <- 4
Return that object by typing its name
name
Try this in the code chunk below, then hit "Run Code"
round(x, digits = 3)
Try this in the code chunk below, then hit "Run Code"
round(pi, digits = 3)
# Quiz: tell a bare number apart from quoted strings and from an object name.
# Only the unquoted 1 is correct; each wrong option carries its own feedback.
question(
  "Which of these are numbers?",
  answer("1", correct = TRUE),
  answer('"1"', message = "Because it has quotes, it is a string"),
  answer('"one"', message = "Because it has quotes, it is a string"),
  answer("one", message = "R recognizes this as an object"),
  allow_retry = TRUE,
  random_answer_order = TRUE
)
Suppose one <- 1
# Quiz: which log() calls succeed, given the assignment stated in the line
# above. Two options are marked correct: the literal number and the object.
question(
  "Which of these will work?",
  answer("log(1)", correct = TRUE),
  answer(
    'log("1")',
    message = "You cannot log transform a word (string)"
  ),
  answer(
    'log("one")',
    message = "You cannot log transform a word (string)"
  ),
  answer("log(one)", correct = TRUE),
  allow_retry = TRUE,
  random_answer_order = TRUE
)
Try it for yourself!
one <- 1
Data stored in a dataframe are conceptually equivalent to a spreadsheet with rows and columns
This is a sample from the `r dataframe_name` dataset
# Preview the data: first five columns, first five rows, rendered as a table.
df_input %>%
  select(1:5) %>%
  slice(1:5) %>%
  knitr::kable()
Data stored in a dataframe are conceptually equivalent to a spreadsheet with rows and columns
This is a sample from the `r dataframe_name` dataset
# Preview the data as a tibble: first five columns, first five rows.
# The conversion has to be its own pipe step. Writing print(as_tibble())
# would hand print() a zero-argument as_tibble() call as an extra argument
# instead of converting the piped data.
df_input %>%
  select(1:5) %>%
  slice(1:5) %>%
  as_tibble() %>%
  print()
# Draw one column name of df_input at random and keep it in `col`.
col <- sample(colnames(df_input), size = 1)
You can call a single part of the data frame
`r dataframe_name`$`r col`
# Show only the column whose name is stored (as a string) in `col`.
df_input %>%
  select(all_of(col))
Write the R code required to extract a variable from the `r dataframe_name` dataset:
Remember, the format is: `r dataframe_name`$`r col`
You can also save a part of the dataframe as an object for later use
`r col` <- `r dataframe_name`$`r col`
In the code chunk below:
1. On the first line, write the R code to save a single column to a new object
2. On the second line, type the object name - this will print out the new object
3. Run the code
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.