# rmarkdown::run("dataframe.Rmd")
# Packages used by the tutorial; each call sits on its own line so that it is
# actually executed instead of being part of the comment above.
library(learnr)
library(tutor)
library(tidyverse)
# Initialise gradethis before any exercise is checked.
gradethis::gradethis_setup()
Create a 3 column data.frame
{.R} called df
containing three columns x
, y
and z
with:
- x: a vector from $-\pi$ to $\pi$ of length 10
- y: their sine
- z: the sum of the first two columns.

x <- ___ y <- ___ z <- ___ df <- data.frame()
# Reference solution: build the three columns, then assemble the data.frame.
# `length.out` is spelled in full rather than relying on partial matching.
x <- seq(-pi, pi, length.out = 10)  # 10 evenly spaced values from -pi to pi
y <- sin(x)                         # sine of each element of x
z <- x + y                          # element-wise sum of the first two columns
df <- data.frame(x = x, y = y, z = z)
grade_code()
Print the 4 first lines of the table.
# Take a look at the head() function
head(df, 4)
grade_code()
Print the second (i.e. y
) column in three different ways.
df[___]
df[___]
df$___
# Three ways to print the second column (`y`) of `df`:
df[, 2]    # by column position
df[, "y"]  # by column name
df$y       # with the `$` accessor
grade_code()
Modify the column z
so that it contains its value minus its minimum.
df$z <- df$z - min(df$z)
grade_code()
Print the average of the third column.
# look at the mean() function
mean(df$z)
grade_code()
Using plot(x,y)
{.R} where x
and y
are vectors, plot the 2nd column as a function of the first.
plot(___, ___)
plot(df$x, df$y)
grade_code()
Look into the function write.table()
{.R} to write a text file containing this data.frame
{.R}.
write.table(___)
write.table(df, "~/Downloads/some_data.dat", quote = FALSE, row.names = FALSE)
grade_code()
Do all the same things with a tibble
{.R}.
df_tib <- tibble(___)
library(tidyverse)

# Same table as before, built as a tibble; `y` and `z` reuse the columns
# defined just before them in the same call.
df_tib <- tibble(
  x = seq(-pi, pi, length.out = 10),
  y = sin(x),
  z = x + y
)

head(df_tib, 4)  # first four rows
df_tib[, 2]      # second column, selected with `[`
df_tib[[2]]      # second column, extracted with `[[`
mean(df_tib$z)   # average of the third column
write_csv(df_tib, "~/Downloads/some_data.csv")
plot(df_tib$x, df_tib$y)
grade_code()
We will work with 3 different files:
- tutor_examples("rubis_01.txt")
- tutor_examples("population.csv")
- tutor_examples("FTIR_rocks.xlsx")
Load them into separate data.frames
{.R}. Look into the options of read.table()
{.R}, read.csv()
{.R}, readxl::read_excel()
{.R}, to get the proper data fields. Make sure that the rubis_01
data.frame has w
and intensity
as column names.
rubis_01 <- ___(tutor_examples("rubis_01.txt"), ___) population <- ___(tutor_examples("population.csv")) FTIR_rocks <- ___(tutor_examples("FTIR_rocks.xlsx"))
# One reader per file format.
# rubis_01.txt: the column names are supplied while reading.
rubis_01 <- read.table(
  tutor_examples("rubis_01.txt"),
  col.names = c("w", "intensity")
)
population <- read.csv(tutor_examples("population.csv"))
FTIR_rocks <- readxl::read_excel(tutor_examples("FTIR_rocks.xlsx"))
grade_code()
Print their dimensions and column names.
# Dimensions rubis_01 population FTIR_rocks # Names rubis_01 population FTIR_rocks
# Dimensions, then column names, of each loaded table.
dim(rubis_01)
names(rubis_01)

dim(population)
names(population)

dim(FTIR_rocks)
names(FTIR_rocks)
grade_code()
Do the same things by loading directly into tibbles.
library(tidyverse)

# readr counterparts of read.table() / read.csv().
rubis_01 <- read_table(
  tutor_examples("rubis_01.txt"),
  col_names = c("w", "intensity")
)
population <- read_csv(tutor_examples("population.csv"))
grade_code()
We will use the TGA data file tutor_examples("ATG.txt")
(click link to take a look at the file).
Load it into a data.frame
{.R}. Look into the options of read.table()
{.R} to get the proper data fields.
d <- read.table(tutor_examples("ATG.txt"), ___ ) d
# check how many lines you have to read # check how many lines you have to skip before reading # you need to skip the line with the unit
# Two versions

# Version 1: skip the first 12 lines, read 4088 rows without a header line,
# then name the columns by hand.
d <- read.table(
  tutor_examples("ATG.txt"),
  skip = 12,
  header = FALSE,
  nrows = 4088
)
names(d) <- c("Index", "t", "Ts", "Tr", "Value")
head(d)

# Version 2: skip only 10 lines so that the header line is read as column
# names. `comment.char = "["` makes read.table() ignore everything from a "["
# onwards -- presumably this is what drops the units line; check ATG.txt.
d <- read.table(
  tutor_examples("ATG.txt"),
  skip = 10,
  comment.char = "[",
  header = TRUE,
  nrows = 4088
)
head(d)
Do the same using the tidyverse
function read_table()
:
library(tidyverse) d <- read_table(tutor_examples("ATG.txt"), ___ ) d
library(tidyverse)

# Skip the first 10 lines and treat "[" as the comment marker, then discard
# the rows that still contain missing values.
d <- read_table(
  tutor_examples("ATG.txt"),
  skip = 10,
  comment = "["
) %>%
  drop_na()
d
Load the tutor_examples("population.csv")
file into a tibble
{.R} called popul
.
library(___) popul <- read____(tutor_examples("population.csv"))
library(tidyverse)

# read_csv() returns a tibble.
popul <- read_csv(tutor_examples("population.csv"))
grade_code()
What are the names of the columns? What are the dimensions of the table?
popul
names(popul); dim(popul)
grade_code()
Are the data tidy? Make the table tidy if needed.
popul popul.tidy <- popul %>% pivot_longer( cols = ___, # what are the columns we want to keep? -> -these names_to = ___, # name of the column gathering the original column names values_to= ___ # name of the column gathering the original column values )
head(popul)
# Answer to "are the data tidy?": no.

# Reshape to long format: every column except `year` is gathered, the former
# column names go to `city` and the former cell values go to `pop`.
popul.tidy <- popul %>%
  pivot_longer(
    cols = -year,
    names_to = "city",
    values_to = "pop"
  )
popul.tidy
grade_code()
Create a subset containing the data for Montpellier using a filtering function from the tidyverse
.
# Recreates `popul` and `popul.tidy` as built in the previous exercises.
library(tidyverse)

popul <- read_csv(tutor_examples("population.csv"))
popul.tidy <- popul %>%
  pivot_longer(
    cols = -year,
    names_to = "city",
    values_to = "pop"
  )
mtp <- popul.tidy %>% ___
# Keep only the rows whose city is Montpellier.
mtp <- popul.tidy %>%
  filter(city == "Montpellier")
grade_code()
# Recreates `popul`, `popul.tidy` and `mtp` as built in the previous exercises.
library(tidyverse)

popul <- read_csv(tutor_examples("population.csv"))
popul.tidy <- popul %>%
  pivot_longer(
    cols = -year,
    names_to = "city",
    values_to = "pop"
  )
mtp <- popul.tidy %>%
  filter(city == "Montpellier")
mtp
# Summary statistics of the Montpellier population column.
max(mtp$pop)
min(mtp$pop)
range(mtp$pop)  # minimum and maximum in a single call
mean(mtp$pop)
grade_code()
What is the total population in 2012?
# Recreates `popul`, `popul.tidy` and `mtp` as built in the previous exercises.
library(tidyverse)

popul <- read_csv(tutor_examples("population.csv"))
popul.tidy <- popul %>%
  pivot_longer(
    cols = -year,
    names_to = "city",
    values_to = "pop"
  )
mtp <- popul.tidy %>%
  filter(city == "Montpellier")
popul.tidy %>% ___ %>% # You need to filter the data for the year 2012 ___ %>% # Then select the right column ___ # And perform the sum of its data
# Base R version: logical row subsetting, then sum the `pop` column.
sum(popul.tidy[popul.tidy$year == 2012, "pop"])

# tidyverse version: filter the year, keep the column, sum it.
popul.tidy %>%
  filter(year == 2012) %>%
  select(pop) %>%
  sum()
grade_code()
What is the total population per year?
# Recreates `popul`, `popul.tidy` and `mtp` as built in the previous exercises.
library(tidyverse)

popul <- read_csv(tutor_examples("population.csv"))
popul.tidy <- popul %>%
  pivot_longer(
    cols = -year,
    names_to = "city",
    values_to = "pop"
  )
mtp <- popul.tidy %>%
  filter(city == "Montpellier")
popul.tidy %>% ___ %>% # You need to group data per year ___ # Then summarize the data of each year as # the total population of each group
# One row per year, holding the summed population of that year.
popul.tidy %>%
  group_by(year) %>%
  summarise(pop_tot = sum(pop))
grade_code()
What is the average population per city over the years?
# Recreates `popul`, `popul.tidy` and `mtp` as built in the previous exercises.
library(tidyverse)

popul <- read_csv(tutor_examples("population.csv"))
popul.tidy <- popul %>%
  pivot_longer(
    cols = -year,
    names_to = "city",
    values_to = "pop"
  )
mtp <- popul.tidy %>%
  filter(city == "Montpellier")
popul.tidy %>% ___ %>% # You need to group data per...? ___ # Then...?
# One row per city, holding its mean population over the years.
popul.tidy %>%
  group_by(city) %>%
  summarise(pop_ave = mean(pop))
grade_code()
First, load the tidyverse
and lubridate
packages.
___ ___
# One library() call per line.
library(tidyverse)
library(lubridate)
grade_code()
Load tutor_examples("people1.csv")
and tutor_examples("people2.csv")
into pp1
and pp2
, and take a look at them.
tutor_examples("people1.csv") tutor_examples("people2.csv") pp1 <- read____(___) pp2 <- read____(___)
# Read both people files into tibbles.
pp1 <- read_csv(tutor_examples("people1.csv"))
pp2 <- read_csv(tutor_examples("people2.csv"))
grade_code()
Create a new tibble pp
by using the pipe operator (%>%
{.R}) and successively:
- joining pp1 with pp2 using inner_join()
{.R}
- adding a column age
containing the age in years (use lubridate::time_length(x, 'years')
{.R} with x a time difference in days) by using mutate()
{.R}

pp <- pp1 %>% ___ %>% # you need to join with pp2 mutate(___) # then add a column `age` computing the right thing
# Join the two tables, then add `age`: the time elapsed between `dateofbirth`
# and today, expressed in years.
pp <- pp1 %>%
  inner_join(pp2) %>%
  mutate(age = time_length(today() - dateofbirth, "years"))
grade_code()
# Recreates `pp1`, `pp2` and `pp` as built in the previous exercises.
library(tidyverse)
library(lubridate)

pp1 <- read_csv(tutor_examples("people1.csv"))
pp2 <- read_csv(tutor_examples("people2.csv"))
pp <- pp1 %>%
  inner_join(pp2) %>%
  mutate(age = time_length(today() - dateofbirth, "years"))
Display a summary of the table using str()
{.R}
str(pp)
grade_code()
Using group_by()
{.R} and summarize()
{.R}:
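- Show the number of males and females in the table (use the counter n()
{.R})
- Show the average age per gender
- Show the average size per gender and institution
- Show the number of people from each country, sorted by descending population (use arrange()
{.R})

library(tidyverse) library(lubridate) pp1 <- read_csv(tutor_examples("people1.csv")) pp2 <- read_csv(tutor_examples("people2.csv")) pp <- pp1 %>% inner_join(pp2) %>% mutate(age = time_length(today() - dateofbirth, "years"))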
# - Show the number of males and females in the table (use the counter `n()`) pp %>% ___ %>% ___ # - Show the average age per gender pp %>% ___ %>% ___ # - Show the average size per gender and institution pp %>% ___ %>% ___ # - Show the number of people from each country, sorted by descending population pp %>% ___ %>% ___ %>% ___
# - Show the number of males and females in the table (use the counter `n()`)
pp %>%
  group_by(gender) %>%
  summarize(count = n())

# - Show the average age per gender
pp %>%
  group_by(gender) %>%
  summarize(age = mean(age))

# - Show the average size per gender and institution
pp %>%
  group_by(gender, institution) %>%
  summarize(size = mean(size))

# - Show the number of people from each country, sorted by descending
#   population
pp %>%
  group_by(origin) %>%
  summarize(count = n()) %>%
  arrange(desc(count))
grade_code()
Using select()
{.R}, display:
# Recreates `pp1`, `pp2` and `pp` as built in the previous exercises.
library(tidyverse)
library(lubridate)

pp1 <- read_csv(tutor_examples("people1.csv"))
pp2 <- read_csv(tutor_examples("people2.csv"))
pp <- pp1 %>%
  inner_join(pp2) %>%
  mutate(age = time_length(today() - dateofbirth, "years"))
# - only the name and age columns pp ___ # - all but the name column pp ___
# - only the name and age columns
pp %>%
  select(c(name, age))

# - all but the name column
pp %>%
  select(-name)
grade_code()
Using filter()
{.R}, show data only for:
- Chinese people
- People from institution ECL and UCBL
- People older than 22
- People with an e
in their name

library(tidyverse) library(lubridate) pp1 <- read_csv(tutor_examples("people1.csv")) pp2 <- read_csv(tutor_examples("people2.csv")) pp <- pp1 %>% inner_join(pp2) %>% mutate(age = time_length(today() - dateofbirth, "years"))
# - Chinese people pp ___ # - From institution ECL and UCBL pp ___ # - People older than 22 pp ___ # - People with a `e` in their name pp ___
# - Chinese people
pp %>%
  filter(origin == "China")

# - From institution ECL and UCBL
pp %>%
  filter(institution %in% c("ECL", "UCBL"))

# - People older than 22
pp %>%
  filter(age > 22)

# - People with a `e` in their name
pp %>%
  filter(grepl("e", name))
grade_code()
Here we will see how to load many files at once with the tidyverse
and how to perform some data wrangling.
We will work with the files whose paths are stored in the vector flist
. These files are all .csv
files containing two columns and a header. We can use the fact that read_csv()
accepts vectors as argument and read them all at once.
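- Read all the files listed in flist into a single tibble called tib.
- Add a column "file" containing the list of filenames: look at the id parameter of read_csv().
- Modify the "file" column so that it contains just the file name and not the full path – look at the basename() function.

flist <- tutor_examples("sample")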
tib <- read_csv(___, # what do we want to read? id = ___) %>% # what is the name of the column containing the file names ? mutate(___) # modify this column so that it contains just the file name and not the full path
# Read every file of `flist` at once; `id = "file"` stores each row's source
# path in a `file` column, which is then reduced to the bare file name.
tib <- read_csv(flist, id = "file") %>%
  mutate(file = basename(file))
grade_code()
We also want to get information from our file names, such as the sample number, the temperature, the time, and the time unit. Use the function separate()
to split the file
column into sample
, T
, time
and time_unit
. If applicable, make sure that the resulting columns are numeric by getting rid of the annoying characters.
# Recreates `flist` and `tib` as built in the previous exercise.
flist <- tutor_examples("sample")
tib <- read_csv(flist, id = "file") %>%
  mutate(file = basename(file))
tib <- tib %>% separate(col = ___, into = ___, convert = ___) %>% mutate(sample = as.numeric(str_replace(___)), T = ___, time = ___, time_unit = ___ ) tib
# Split the file name into its pieces. The trailing NA in `into` discards the
# last piece (presumably the file extension -- check the file names).
# `convert = TRUE` lets separate() turn purely numeric pieces into numbers;
# `sample` and `T` still carry text, which is stripped before as.numeric().
tib <- tib %>%
  separate(
    col = file,
    into = c("sample", "T", "time", "time_unit", NA),
    convert = TRUE
  ) %>%
  mutate(
    sample = as.numeric(str_replace(sample, "sample", "")),
    T = as.numeric(str_replace(T, "K", ""))
  )
tib
grade_code()
Now we want all times to be in the same unit. Using mutate()
and ifelse()
, convert the minutes to seconds, then get rid of the time_unit
column.
# Recreates `tib` as left by the previous exercises: files read, file name
# split into columns, `sample` and `T` converted to numbers.
flist <- tutor_examples("sample")
tib <- read_csv(flist, id = "file") %>%
  mutate(file = basename(file)) %>%
  separate(
    col = file,
    into = c("sample", "T", "time", "time_unit", NA),
    convert = TRUE
  ) %>%
  mutate(
    sample = as.numeric(str_replace(sample, "sample", "")),
    T = as.numeric(str_replace(T, "K", ""))
  )
tib <- tib %>% mutate(time = ifelse(test, yes, no)) %>% # convert minutes to seconds select(___) # get rid of the `time_unit` column tib
# Rows given in minutes are multiplied by 60; the other rows are kept as they
# are. Once every time shares one unit, `time_unit` is dropped.
tib <- tib %>%
  mutate(time = ifelse(time_unit == "min", time * 60, time)) %>%
  select(-time_unit)
tib
grade_code()
Before going further, we want to take a look at our data using ggplot
. Modify the following code so that a color is added as a function of the sample number, and the plots are gathered on a grid showing the time as a function of the temperature.
# Recreates `tib` as left by the previous exercises, including the conversion
# of minutes to seconds and the removal of `time_unit`.
flist <- tutor_examples("sample")
tib <- read_csv(flist, id = "file") %>%
  mutate(file = basename(file)) %>%
  separate(
    col = file,
    into = c("sample", "T", "time", "time_unit", NA),
    convert = TRUE
  ) %>%
  mutate(
    sample = as.numeric(str_replace(sample, "sample", "")),
    T = as.numeric(str_replace(T, "K", ""))
  ) %>%
  mutate(time = ifelse(time_unit == "min", time * 60, time)) %>%
  select(-time_unit)
tib %>% ggplot(aes(x = x, y = y, color = ___)) + geom_point() + facet_grid(___ ~ ___)
# Points coloured by sample number (as a discrete factor), laid out on a grid
# with one row per `time` value and one column per `T` value.
tib %>%
  ggplot(aes(x = x, y = y, color = factor(sample))) +
  geom_point() +
  facet_grid(time ~ T)
grade_code()
We want to nest our data to be able to perform operations on them – like fitting them. Using the nest()
function, nest the data so that we end up with only 2 columns: file
and data
.
# Recreates `tib` as left by the previous exercises, including the conversion
# of minutes to seconds and the removal of `time_unit`.
flist <- tutor_examples("sample")
tib <- read_csv(flist, id = "file") %>%
  mutate(file = basename(file)) %>%
  separate(
    col = file,
    into = c("sample", "T", "time", "time_unit", NA),
    convert = TRUE
  ) %>%
  mutate(
    sample = as.numeric(str_replace(sample, "sample", "")),
    T = as.numeric(str_replace(T, "K", ""))
  ) %>%
  mutate(time = ifelse(time_unit == "min", time * 60, time)) %>%
  select(-time_unit)
tib <- tib %>% nest(___) tib
# What are the columns to nest ? data = c(these_columns)
# Pack the `x` and `y` columns into a list-column named `data`.
tib <- tib %>%
  nest(data = c(x, y))
tib
grade_code()
Now we can fit all our data at once with a linear model:
# Recreates `tib` as left by the previous exercises, with `x` and `y` nested
# into the `data` list-column.
flist <- tutor_examples("sample")
tib <- read_csv(flist, id = "file") %>%
  mutate(file = basename(file)) %>%
  separate(
    col = file,
    into = c("sample", "T", "time", "time_unit", NA),
    convert = TRUE
  ) %>%
  mutate(
    sample = as.numeric(str_replace(sample, "sample", "")),
    T = as.numeric(str_replace(T, "K", ""))
  ) %>%
  mutate(time = ifelse(time_unit == "min", time * 60, time)) %>%
  select(-time_unit) %>%
  nest(data = c(x, y))
# Fit one linear model per nested data set and store it in a `fit` column.
tib <- tib %>%
  mutate(fit = map(data, ~ lm(data = ., x ~ y)))
tib
# Fit one linear model per nested data set and store it in a `fit` column.
# NOTE(review): the formula `x ~ y` models x as a function of y, whereas the
# earlier plot puts y on the vertical axis -- confirm `y ~ x` is not intended.
tib <- tib %>%
  mutate(fit = map(data, ~ lm(data = ., x ~ y)))
tib
grade_code()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.