# rmarkdown::run("dataframe.Rmd")
library(learnr)
library(tutor)
library(tidyverse)
gradethis::gradethis_setup()

Exercise 1

Create a three-column data.frame{.R} called df, containing the columns x, y and z, where x is a sequence of 10 values from -pi to pi, y is sin(x), and z is x + y:

# Exercise template: the `___` placeholders are meant to be filled in.
x <- ___
y <- ___
z <- ___
df <- data.frame()
# Presumably the reference solution for the template above.
# x: 10 evenly spaced values from -pi to pi (`length` partially matches
# the `length.out` argument of seq()).
x <- seq(-pi, pi, length=10)
# y: sine of each element of x.
y <- sin(x)
# z: element-wise sum of x and y.
z <- x + y
# Assemble the three vectors into a data.frame with columns x, y and z.
df <- data.frame(x=x, y=y, z=z)
# gradethis checker call. NOTE(review): presumably compares the submitted
# code with the solution above -- confirm against the gradethis docs.
grade_code()

Print the first 4 lines of the table.


# Hint: take a look at the head() function.
# Show the first 4 rows of df.
head(df, 4)
# gradethis checker call (presumably grades the submission against the
# solution above -- confirm).
grade_code()

Print the second (i.e. y) column in three different ways.

# Exercise template: fill in the three blanks.
df[___]
df[___]
df$___
# Presumably the reference solution: three ways to select column y.
# By position (second column).
df[,2]
# By column name.
df[,"y"]
# With the `$` accessor.
df$y
# gradethis checker call (presumably grades the submission -- confirm).
grade_code()

Modify the column z so that it contains its value minus its minimum.


# Subtract the minimum of z from every element of z, so that the smallest
# value of the column becomes 0.
df$z <- df$z - min(df$z)
# gradethis checker call (presumably grades the submission -- confirm).
grade_code()

Print the average of the third column.


# Hint: look at the mean() function.
# Average of column z (the third column of df).
mean(df$z)
# gradethis checker call (presumably grades the submission -- confirm).
grade_code()

Using plot(x,y){.R} where x and y are vectors, plot the 2nd column as a function of the first.

# Exercise template: fill in the x and y vectors.
plot(___, ___)
# Presumably the reference solution: column y plotted against column x.
plot(df$x, df$y)
# gradethis checker call (presumably grades the submission -- confirm).
grade_code()

Look into the function write.table(){.R} to write a text file containing this data.frame{.R}.

# Exercise template: fill in the arguments of write.table().
write.table(___)
# Presumably the reference solution: write df to a text file under
# ~/Downloads, without quoting strings and without row names.
write.table(df, "~/Downloads/some_data.dat", quote = FALSE, row.names = FALSE)
# gradethis checker call (presumably grades the submission -- confirm).
grade_code()

Do all the same things with a tibble{.R}.

# Exercise template: fill in the tibble() call.
df_tib <- tibble(___)
# Presumably the reference solution. tibble() and write_csv() come from
# the tidyverse.
library(tidyverse)
# y is computed from the x column defined just before it, and z from
# both x and y.
df_tib <- tibble(x = seq(-pi, pi, length = 10), 
                 y = sin(x), 
                 z = x + y)
# First 4 rows.
head(df_tib, 4)
# Second column selected with single-bracket indexing.
df_tib[, 2]
# Second column extracted with double-bracket indexing.
df_tib[[2]]
# Average of column z.
mean(df_tib$z)
# Write the tibble to a CSV file under ~/Downloads.
write_csv(df_tib, "~/Downloads/some_data.csv")
# Column y plotted against column x.
plot(df_tib$x, df_tib$y)
# gradethis checker call (presumably grades the submission -- confirm).
grade_code()

Exercise 2

We will work with 3 different files:

- tutor_examples("rubis_01.txt")
- tutor_examples("population.csv")
- tutor_examples("FTIR_rocks.xlsx")

Load them into separate data.frames{.R}. Look into the options of read.table(){.R}, read.csv(){.R}, readxl::read_excel(){.R}, to get the proper data fields. Make sure that the rubis_01 data.frame has w and intensity as column names.

# Exercise template: choose the reading function (and options) for each file.
rubis_01   <- ___(tutor_examples("rubis_01.txt"), ___)
population <- ___(tutor_examples("population.csv"))
FTIR_rocks <- ___(tutor_examples("FTIR_rocks.xlsx"))
# Presumably the reference solution.
# Text file: read with read.table(), naming the columns "w" and "intensity".
rubis_01   <- read.table(tutor_examples("rubis_01.txt"), col.names = c("w", "intensity"))
# CSV file: read with read.csv() and its defaults.
population <- read.csv(tutor_examples("population.csv"))
# Excel file: read with readxl::read_excel() (called with its namespace,
# so readxl does not need to be attached).
FTIR_rocks <- readxl::read_excel(tutor_examples("FTIR_rocks.xlsx"))
# gradethis checker call (presumably grades the submission -- confirm).
grade_code()

Print their dimensions and column names.

# Exercise template: wrap each object in the appropriate function call.
# Dimensions
rubis_01
population
FTIR_rocks
# Names
rubis_01
population
FTIR_rocks
# Presumably the reference solution: for each table, dim() returns its
# dimensions and names() its column names.
dim(rubis_01);   names(rubis_01)
dim(population); names(population)
dim(FTIR_rocks); names(FTIR_rocks)
# gradethis checker call (presumably grades the submission -- confirm).
grade_code()

Do the same things by loading directly into tibbles.


# read_table() and read_csv() are the tidyverse (readr) readers.
library(tidyverse)
# Text file read with read_table(), naming the columns "w" and "intensity".
rubis_01 <- read_table(tutor_examples("rubis_01.txt"), col_names = c("w", "intensity"))
# CSV file read with read_csv() and its defaults.
population <- read_csv(tutor_examples("population.csv"))
# NOTE(review): FTIR_rocks.xlsx is not reloaded in this step.
# gradethis checker call (presumably grades the submission -- confirm).
grade_code()

Exercise 3

We will use the TGA data file tutor_examples("ATG.txt") (click link to take a look at the file).

Load it into a data.frame{.R}. Look into the options of read.table(){.R} to get the proper data fields.

# Exercise template: fill in the read.table() options.
d <- read.table(tutor_examples("ATG.txt"),
                ___
                )
d
# Hints:
# check how many lines you have to read
# check how many lines you have to skip before reading
# you need to skip the line with the unit
# Two versions
# Version 1: skip the first 12 lines, read no header, stop after 4088
# rows, then set the column names by hand.
d <- read.table(tutor_examples("ATG.txt"),
    skip = 12,
    header = FALSE,
    nrows = 4088
)
names(d) <- c("Index", "t", "Ts", "Tr", "Value")
head(d)
# Version 2: skip only 10 lines and take the column names from the file's
# header line. `[` is declared as the comment character; presumably this
# is what discards the units line mentioned in the hints -- confirm
# against the file.
d <- read.table(tutor_examples("ATG.txt"),
    skip = 10,
    comment.char = "[",
    header = TRUE,
    nrows = 4088
)
head(d)

Do the same using the tidyverse function read_table():

# Exercise template: fill in the read_table() options.
library(tidyverse)
d <- read_table(tutor_examples("ATG.txt"),
                ___
                )
d
# Presumably the reference solution: skip the first 10 lines, treat `[`
# as the comment marker, then drop every row that contains a missing
# value.
library(tidyverse)
d <- read_table(tutor_examples("ATG.txt"), 
                skip    = 10,
                comment = "[") %>% 
        drop_na()
d

Exercise 4

Load the tutor_examples("population.csv") file into a tibble{.R} called popul.

# Exercise template: fill in the package name and complete the reader name.
library(___)
popul <- read____(tutor_examples("population.csv"))
# Presumably the reference solution: read the CSV file into a tibble
# with read_csv().
library(tidyverse)
popul <- read_csv(tutor_examples("population.csv"))
# gradethis checker call (presumably grades the submission -- confirm).
grade_code()

What are the names of the columns? What are the dimensions of the table?

# Exercise template: starting point for the answer.
popul
# Presumably the reference solution: column names, then table dimensions.
names(popul); dim(popul)
# gradethis checker call (presumably grades the submission -- confirm).
grade_code()

Are the data tidy? Make the table tidy if needed.

# Exercise template: inspect the table, then fill in pivot_longer().
popul
popul.tidy <- popul %>% 
    pivot_longer(
        cols     = ___, # what are the columns we want to keep? -> -these
        names_to = ___, # name of the column gathering the original column names
        values_to= ___  # name of the column gathering the original column values
        )
# Presumably the reference solution.
head(popul) # no, the data are not tidy
# Pivot every column except `year`: the original column names go to
# `city` and their values go to `pop`.
popul.tidy <- popul %>% 
    pivot_longer(cols      = -year,
                 names_to  = "city",
                 values_to = "pop"
                )
popul.tidy
# gradethis checker call (presumably grades the submission -- confirm).
grade_code()

Create a subset containing the data for Montpellier using a filtering function from the tidyverse.

# Setup: rebuild popul and popul.tidy as in the previous steps.
library(tidyverse)
popul <- read_csv(tutor_examples("population.csv"))
popul.tidy <- popul %>%
    pivot_longer(
        cols = -year,
        names_to = "city",
        values_to = "pop"
    )
# Exercise template: fill in the filtering step.
mtp <- popul.tidy %>% ___
# Presumably the reference solution: keep only the rows whose city is
# "Montpellier".
mtp <- popul.tidy %>% filter(city == "Montpellier")
# gradethis checker call (presumably grades the submission -- confirm).
grade_code()
# NOTE(review): the prompt for the statistics below appears to be missing
# from this page; the code starts again from the same setup.
library(tidyverse)
popul <- read_csv(tutor_examples("population.csv"))
popul.tidy <- popul %>%
    pivot_longer(
        cols = -year,
        names_to = "city",
        values_to = "pop"
    )
mtp <- popul.tidy %>% filter(city == "Montpellier")
mtp
# Maximum, minimum, range (min and max together) and mean of the
# Montpellier population values.
max(mtp$pop)
min(mtp$pop)
range(mtp$pop)
mean(mtp$pop)
# gradethis checker call (presumably grades the submission -- confirm).
grade_code()

What is the total population in 2012?

# Setup: rebuild popul, popul.tidy and mtp as in the previous steps.
library(tidyverse)
popul <- read_csv(tutor_examples("population.csv"))
popul.tidy <- popul %>%
    pivot_longer(
        cols = -year,
        names_to = "city",
        values_to = "pop"
    )
mtp <- popul.tidy %>% filter(city == "Montpellier")
# Exercise template: fill in the three pipeline steps.
popul.tidy %>% 
    ___ %>%         # You need to filter the data for the year 2012
    ___ %>%         # Then select the right column
    ___             # And perform the sum of its data
# Presumably the reference solutions: two ways to get the same total.
# Base-R indexing: rows where year is 2012, column "pop", then sum.
sum(popul.tidy[popul.tidy$year == 2012, "pop"])
# Pipeline: filter the 2012 rows, keep the pop column, then sum it.
popul.tidy %>%
    filter(year == 2012) %>%
    select(pop) %>%
    sum()
# gradethis checker call (presumably grades the submission -- confirm).
grade_code()

What is the total population per year?

# Setup: rebuild popul, popul.tidy and mtp as in the previous steps.
library(tidyverse)
popul <- read_csv(tutor_examples("population.csv"))
popul.tidy <- popul %>%
    pivot_longer(
        cols = -year,
        names_to = "city",
        values_to = "pop"
    )
mtp <- popul.tidy %>% filter(city == "Montpellier")
# Exercise template: fill in the grouping and summarising steps.
popul.tidy %>% 
    ___ %>%    # You need to group data per year
    ___        # Then summarize the data of each year as 
               # the total population of each group
# Presumably the reference solution: one row per year, with pop_tot
# holding the sum of pop over that year's rows.
popul.tidy %>% 
    group_by(year) %>% 
    summarise(pop_tot = sum(pop))
# gradethis checker call (presumably grades the submission -- confirm).
grade_code()

What is the average population per city over the years?

# Setup: rebuild popul, popul.tidy and mtp as in the previous steps.
library(tidyverse)
popul <- read_csv(tutor_examples("population.csv"))
popul.tidy <- popul %>%
    pivot_longer(
        cols = -year,
        names_to = "city",
        values_to = "pop"
    )
mtp <- popul.tidy %>% filter(city == "Montpellier")
# Exercise template: fill in the grouping and summarising steps.
popul.tidy %>%
    ___ %>%  # You need to group data per...?
    ___      # Then...?
# Presumably the reference solution: one row per city, with pop_ave
# holding the mean of pop over that city's rows.
popul.tidy %>%
    group_by(city) %>%
    summarise(pop_ave = mean(pop))
# gradethis checker call (presumably grades the submission -- confirm).
grade_code()

Exercise 5

First, load the tidyverse and lubridate packages.

# Exercise template: one blank per package to load.
___
___
# Presumably the reference solution: attach both packages.
library(tidyverse)
library(lubridate)
# gradethis checker call (presumably grades the submission -- confirm).
grade_code()

Load tutor_examples("people1.csv") and tutor_examples("people2.csv") into pp1 and pp2, and take a look at them.

# Exercise template: the two tutor_examples() calls refer to the two
# input files; fill in the reader name and its argument.
tutor_examples("people1.csv")
tutor_examples("people2.csv")
pp1 <- read____(___)
pp2 <- read____(___)
# Presumably the reference solution: read both CSV files with read_csv().
pp1 <- read_csv(tutor_examples("people1.csv"))
pp2 <- read_csv(tutor_examples("people2.csv"))
# gradethis checker call (presumably grades the submission -- confirm).
grade_code()

Create a new tibble pp by using the pipe operator (%>%{.R}) and successively: joining pp1 with pp2, then adding a column age computed from the date of birth.

# Exercise template: fill in the join and the mutate() expression.
pp <- pp1 %>%
    ___ %>%       # you need to join with pp2
    mutate(___)   # then add a column `age` computing the right thing
# Presumably the reference solution.
# inner_join() is called without `by`, so the join keys are left to its
# default. `age` is the time elapsed between dateofbirth and today,
# expressed in years.
pp <- pp1 %>%
    inner_join(pp2) %>%
    mutate(age = time_length(today() - dateofbirth, "years"))
# gradethis checker call (presumably grades the submission -- confirm).
grade_code()

# Setup: rebuild pp1, pp2 and pp exactly as in the previous steps
# (presumably a shared setup chunk for the exercises that follow).
library(tidyverse)
library(lubridate)
pp1 <- read_csv(tutor_examples("people1.csv"))
pp2 <- read_csv(tutor_examples("people2.csv"))
# Join the two tables, then add `age` as the time elapsed since
# dateofbirth, in years.
pp <- pp1 %>%
    inner_join(pp2) %>%
    mutate(age = time_length(today() - dateofbirth, "years"))

Display a summary of the table using str(){.R}


# Compact display of the structure of pp.
str(pp)
# gradethis checker call (presumably grades the submission -- confirm).
grade_code()

Using group_by(){.R} and summarize(){.R}:

# Setup: rebuild pp1, pp2 and pp as in the previous steps.
library(tidyverse)
library(lubridate)
pp1 <- read_csv(tutor_examples("people1.csv"))
pp2 <- read_csv(tutor_examples("people2.csv"))
pp <- pp1 %>%
    inner_join(pp2) %>%
    mutate(age = time_length(today() - dateofbirth, "years"))
# Exercise template: fill in the blanks of each pipeline.
# - Show the number of males and females in the table (use the counter `n()`)
pp %>%
    ___ %>%
    ___
# - Show the average age per gender
pp %>%
    ___ %>%
    ___
# - Show the average size per gender and institution
pp %>%
    ___ %>%
    ___
# - Show the number of people from each country, sorted by descending population
pp %>%
    ___ %>%
    ___ %>%
    ___
# Presumably the reference solutions, in the same order as the template.
# - Show the number of males and females in the table (use the counter `n()`)
pp %>%
    group_by(gender) %>%
    summarize(count = n())
# - Show the average age per gender
pp %>%
    group_by(gender) %>%
    summarize(age = mean(age))
# - Show the average size per gender and institution
pp %>%
    group_by(gender, institution) %>%
    summarize(size = mean(size))
# - Show the number of people from each country, sorted by descending population
# (the country is taken from the `origin` column)
pp %>%
    group_by(origin) %>%
    summarize(count = n()) %>%
    arrange(desc(count))
# gradethis checker call (presumably grades the submission -- confirm).
grade_code()

Using select(){.R}, display:

# Setup: rebuild pp1, pp2 and pp as in the previous steps.
library(tidyverse)
library(lubridate)
pp1 <- read_csv(tutor_examples("people1.csv"))
pp2 <- read_csv(tutor_examples("people2.csv"))
pp <- pp1 %>%
    inner_join(pp2) %>%
    mutate(age = time_length(today() - dateofbirth, "years"))
# Exercise template: complete each line after `pp`.
# - only the name and age columns
pp ___
# - all but the name column
pp ___
# Presumably the reference solutions, in the same order as the template.
# - only the name and age columns
pp %>% select(c(name, age))
# - all but the name column (a leading minus excludes a column)
pp %>% select(-name)
# gradethis checker call (presumably grades the submission -- confirm).
grade_code()

Using filter(){.R}, show data only for:

# Setup: rebuild pp1, pp2 and pp as in the previous steps.
library(tidyverse)
library(lubridate)
pp1 <- read_csv(tutor_examples("people1.csv"))
pp2 <- read_csv(tutor_examples("people2.csv"))
pp <- pp1 %>%
    inner_join(pp2) %>%
    mutate(age = time_length(today() - dateofbirth, "years"))
# Exercise template: complete each line after `pp`.
# - Chinese people
pp ___
# - From institution ECL and UCBL
pp ___
# - People older than 22
pp ___
# - People with a `e` in their name
pp ___
# Presumably the reference solutions, in the same order as the template.
# - Chinese people (rows whose origin is "China")
pp %>% filter(origin == "China")
# - From institution ECL and UCBL (rows whose institution is either one)
pp %>% filter(institution %in% c("ECL", "UCBL"))
# - People older than 22
pp %>% filter(age > 22)
# - People with a `e` in their name (grepl() tests each name for the
#   pattern "e"; the match is case-sensitive by default)
pp %>% filter(grepl("e", name))
# gradethis checker call (presumably grades the submission -- confirm).
grade_code()

Exercise 6

Here we will see how to load many files at once with the tidyverse and how to perform some data wrangling.

Loading the data

We will work with the files whose paths are stored in the vector flist. These files are all .csv files containing two columns and a header. We can use the fact that read_csv() accepts vectors as argument and read them all at once.

# flist holds the paths of the files to read (see the paragraph above).
flist <- tutor_examples("sample")
# Exercise template: fill in the blanks.
tib <- read_csv(___,           # what do we want to read?
               id = ___) %>%  # what is the name of the column containing the file names ?
        mutate(___)           # modify this column so that it contains just the file name and not the full path
# Presumably the reference solution: read all files at once, record each
# row's source path in a `file` column, then reduce that path to the bare
# file name with basename().
tib <- read_csv(flist, id = "file") %>% 
        mutate(file = basename(file))
# gradethis checker call (presumably grades the submission -- confirm).
grade_code()

Operations on strings

We also want to get information from our file names, such as the sample number, the temperature, the time, and the time unit. Use the function separate() to split the file column into sample, T, time and time_unit. If applicable, make sure that the resulting columns are numeric by getting rid of the annoying characters.

# Setup: rebuild tib as in the previous step.
flist <- tutor_examples("sample")
tib <- read_csv(flist, id = "file") %>%
    mutate(file = basename(file))
# Exercise template: fill in the blanks.
tib <- tib %>% 
    separate(col = ___, 
             into = ___, 
             convert = ___) %>% 
    mutate(sample = as.numeric(str_replace(___)),
           T = ___,
           time = ___,
           time_unit = ___
           )
tib
# Presumably the reference solution.
# Split `file` into four named pieces; the trailing NA in `into` omits
# the fifth piece from the output (presumably the file extension --
# confirm against the file names). convert = TRUE asks separate() to
# convert the new columns to suitable types.
# Then remove the "sample" prefix and the "K" suffix so that sample and T
# can be turned into numbers.
tib <- tib %>% 
    separate(col = file, into = c("sample", "T", "time", "time_unit", NA), convert= TRUE) %>% 
    mutate(sample = as.numeric(str_replace(sample, "sample", "")),
           T = as.numeric(str_replace(T, "K", ""))
           )
tib
# gradethis checker call (presumably grades the submission -- confirm).
grade_code()

Now we want all times to be in the same unit. Using mutate() and ifelse(), convert the minutes to seconds, then get rid of the time_unit column.

# Setup: rebuild tib as in the previous steps (read, split the file name,
# make sample and T numeric).
flist <- tutor_examples("sample")
tib <- read_csv(flist, id = "file") %>%
    mutate(file = basename(file)) %>% 
    separate(col = file, into = c("sample", "T", "time", "time_unit", NA), convert= TRUE) %>% 
    mutate(sample = as.numeric(str_replace(sample, "sample", "")),
           T = as.numeric(str_replace(T, "K", ""))
           )
# Exercise template: replace test/yes/no and the select() blank.
tib <- tib %>% 
    mutate(time = ifelse(test, yes, no)) %>% # convert minutes to seconds
    select(___) # get rid of the `time_unit` column
tib
# Presumably the reference solution: where time_unit is "min", multiply
# time by 60; otherwise keep it unchanged. Then drop the time_unit column.
tib <- tib %>% 
    mutate(time = ifelse(time_unit=="min", time*60, time)) %>% 
    select(-time_unit)
tib
# gradethis checker call (presumably grades the submission -- confirm).
grade_code()

Plotting data

Before going further, we want to take a look at our data using ggplot. Modify the following code so that a color is added as a function of the sample number, and the plots are gathered on a grid showing the time as a function of the temperature.

# Setup: rebuild tib as in the previous steps (read, split the file name,
# make sample and T numeric, convert minutes to seconds, drop time_unit).
flist <- tutor_examples("sample")
tib <- read_csv(flist, id = "file") %>%
    mutate(file = basename(file)) %>% 
    separate(col = file, into = c("sample", "T", "time", "time_unit", NA), convert= TRUE) %>% 
    mutate(sample = as.numeric(str_replace(sample, "sample", "")),
           T = as.numeric(str_replace(T, "K", ""))
           ) %>% 
    mutate(time = ifelse(time_unit=="min", time*60, time)) %>% 
    select(-time_unit)
# Exercise template: fill in the colour mapping and the facet formula.
tib %>% 
    ggplot(aes(x = x, y = y, color = ___)) +
        geom_point() + 
        facet_grid(___ ~ ___)
# Presumably the reference solution: points coloured by sample number
# (turned into a factor so that the colour scale is discrete), on a grid
# of panels with one row per time value and one column per T value.
tib %>% 
    ggplot(aes(x = x, y = y, color = factor(sample))) +
        geom_point() + 
        facet_grid(time ~ T)
# gradethis checker call (presumably grades the submission -- confirm).
grade_code()

Nesting data

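We want to nest our data to be able to perform operations on them – like fitting them. Using the nest() function, nest the x and y columns into a single list-column called data, so that each row holds the identifying columns (sample, T and time, extracted from the file name) together with its data.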

# Setup: rebuild tib as in the previous steps (read, split the file name,
# make sample and T numeric, convert minutes to seconds, drop time_unit).
flist <- tutor_examples("sample")
tib <- read_csv(flist, id = "file") %>%
    mutate(file = basename(file)) %>% 
    separate(col = file, into = c("sample", "T", "time", "time_unit", NA), convert= TRUE) %>% 
    mutate(sample = as.numeric(str_replace(sample, "sample", "")),
           T = as.numeric(str_replace(T, "K", ""))
           ) %>% 
    mutate(time = ifelse(time_unit=="min", time*60, time)) %>% 
    select(-time_unit)
# Exercise template: fill in the nest() call.
tib <- tib %>% 
    nest(___)
tib
# Hint:
# What are the columns to nest ? data = c(these_columns)
# Presumably the reference solution: pack the x and y columns into a
# list-column named `data`; the remaining columns are kept as they are.
tib <- tib %>% 
    nest(data = c(x, y))
tib
# gradethis checker call (presumably grades the submission -- confirm).
grade_code()

Fitting all data

Now we can fit all our data at once with a linear model:

# Setup: rebuild the nested tib as in the previous steps.
flist <- tutor_examples("sample")
tib <- read_csv(flist, id = "file") %>%
    mutate(file = basename(file)) %>% 
    separate(col = file, into = c("sample", "T", "time", "time_unit", NA), convert= TRUE) %>% 
    mutate(sample = as.numeric(str_replace(sample, "sample", "")),
           T = as.numeric(str_replace(T, "K", ""))
           ) %>% 
    mutate(time = ifelse(time_unit=="min", time*60, time)) %>% 
    select(-time_unit) %>%
    nest(data = c(x,y))
# Code given to the student: fit one linear model per row. map() applies
# lm() to each nested table in `data` and stores the fitted models in a
# new list-column `fit`.
# NOTE(review): the formula `x ~ y` models x as a function of y, whereas
# the plotting step puts x on the horizontal axis -- confirm that `y ~ x`
# was not intended.
tib <- tib %>% 
    mutate(fit = map(data, ~lm(data=., x~y)))
tib
# Presumably the reference solution (same code, reformatted).
tib <- tib %>%
    mutate(fit = map(data, ~ lm(data = ., x ~ y)))
tib
# gradethis checker call (presumably grades the submission -- confirm).
grade_code()


colinbousige/tutor documentation built on Jan. 29, 2023, 7:35 p.m.