In colinbousige/tutor: Interactive tutorials for learning R

# rmarkdown::run("dataframe.Rmd")
library(learnr)
library(tutor)
library(tidyverse)
gradethis::gradethis_setup()

Exercise 1

Create a 3 column data.frame{.R} called df containing three columns x, y and z with:

x a vector from $-\pi$ to $\pi$ of length 10
y their sine
and z the sum of the two first columns.

x <- ___
y <- ___
z <- ___
df <- data.frame()

# Reference solution: build the three columns, then assemble the data.frame.
# `length.out` is written in full rather than relying on partial matching
# of the abbreviated `length` argument name.
x <- seq(-pi, pi, length.out = 10)  # 10 evenly spaced values from -pi to pi
y <- sin(x)                         # sine of each x
z <- x + y                          # sum of the first two columns
df <- data.frame(x = x, y = y, z = z)

grade_code()

Print the 4 first lines of the table.

# Take a look at the head() function

head(df, 4)

grade_code()

Print the second (i.e. y) column with three different ways.

df[___]
df[___]
df$___

# Three equivalent ways to extract column `y` of the data.frame as a vector:
df[,2]       # by column position
df[,"y"]     # by column name
df$y         # with the `$` accessor

grade_code()

Modify the column z so that it contains its value minus its minimum.

df$z <- df$z - min(df$z)

grade_code()

Print the average of the third column.

# look at the mean() function

mean(df$z)

grade_code()

Using plot(x,y){.R} where x and y are vectors, plot the 2nd column as a function of the first.

plot(___, ___)

plot(df$x, df$y)

grade_code()

Look into the function write.table(){.R} to write a text file containing this data.frame{.R}.

write.table(___)

# Write `df` to a plain-text file. `quote = FALSE` leaves character values
# unquoted and `row.names = FALSE` omits the row-name column.
# NOTE(review): the output path assumes a `~/Downloads` directory exists.
write.table(df, "~/Downloads/some_data.dat", quote = FALSE, row.names = FALSE)

grade_code()

Do all the same things with a tibble{.R}.

df_tib <- tibble(___)

# Reference solution: the same steps as above, using a tibble.
library(tidyverse)
# tibble() evaluates its arguments in order, so `y` and `z` can refer to the
# `x` column defined just before them. `length.out` is written in full rather
# than relying on partial matching of `length`.
df_tib <- tibble(x = seq(-pi, pi, length.out = 10),
                 y = sin(x),
                 z = x + y)
head(df_tib, 4)        # first 4 rows
df_tib[, 2]            # second column, kept as a one-column tibble
df_tib[[2]]            # second column, extracted as a plain vector
mean(df_tib$z)         # average of the third column
# NOTE(review): the output path assumes a `~/Downloads` directory exists.
write_csv(df_tib, "~/Downloads/some_data.csv")
plot(df_tib$x, df_tib$y)

grade_code()

Exercise 2

We will work with 3 different files:

- tutor_examples("rubis_01.txt")
- tutor_examples("population.csv")
- tutor_examples("FTIR_rocks.xlsx")

Load them into separate data.frames{.R}. Look into the options of read.table(){.R}, read.csv(){.R}, readxl::read_excel(){.R}, to get the proper data fields. Make sure that the rubis_01 data.frame has w and intensity as column names.

rubis_01   <- ___(tutor_examples("rubis_01.txt"), ___)
population <- ___(tutor_examples("population.csv"))
FTIR_rocks <- ___(tutor_examples("FTIR_rocks.xlsx"))

# Each file format is read with its matching reader.
# rubis_01.txt: `col.names` supplies the column names "w" and "intensity".
rubis_01   <- read.table(tutor_examples("rubis_01.txt"), col.names = c("w", "intensity"))
# population.csv: read with the read.csv() defaults.
population <- read.csv(tutor_examples("population.csv"))
# FTIR_rocks.xlsx: read with readxl, called through its namespace.
FTIR_rocks <- readxl::read_excel(tutor_examples("FTIR_rocks.xlsx"))

grade_code()

Print their dimensions and column names.

# Dimensions
rubis_01
population
FTIR_rocks
# Names
rubis_01
population
FTIR_rocks

# For each table: dim() gives c(rows, columns), names() gives the column names.
dim(rubis_01);   names(rubis_01)
dim(population); names(population)
dim(FTIR_rocks); names(FTIR_rocks)

grade_code()

Do the same things by loading directly into tibbles.

# readr counterparts of read.table()/read.csv(); note the argument is spelled
# `col_names` here (underscore) instead of `col.names`.
# NOTE(review): FTIR_rocks is not reloaded in this chunk -- presumably because
# readxl::read_excel() already returned a tibble; confirm.
library(tidyverse)
rubis_01 <- read_table(tutor_examples("rubis_01.txt"), col_names = c("w", "intensity"))
population <- read_csv(tutor_examples("population.csv"))

grade_code()

Exercise 3

We will use the TGA data file tutor_examples("ATG.txt") (click link to take a look at the file).

Load it into a data.frame{.R}. Look into the options of read.table(){.R} to get the proper data fields.

d <- read.table(tutor_examples("ATG.txt"),
                ___
                )
d

# check how many lines you have to read
# check how many lines you have to skip before reading
# you need to skip the line with the unit

# Two versions
# Version 1: skip the first 12 lines, read 4088 rows without a header,
# then name the columns by hand.
d <- read.table(tutor_examples("ATG.txt"),
    skip = 12,
    header = FALSE,
    nrows = 4088
)
names(d) <- c("Index", "t", "Ts", "Tr", "Value")
head(d)
# Version 2: skip only 10 lines and take the column names from the file
# (`header = TRUE`). Text from a "[" onward is treated as a comment --
# presumably this is what discards the units line; confirm against the file.
d <- read.table(tutor_examples("ATG.txt"),
    skip = 10,
    comment.char = "[",
    header = TRUE,
    nrows = 4088
)
head(d)

Do the same using the tidyverse function read_table():

library(tidyverse)
d <- read_table(tutor_examples("ATG.txt"),
                ___
                )
d

# readr version: skip the 10 leading lines and ignore text after "[".
# No row count is given here; drop_na() removes every row containing an NA.
# NOTE(review): presumably this is what discards the non-data lines that the
# read.table() versions exclude with `nrows`; confirm against the file.
library(tidyverse)
d <- read_table(tutor_examples("ATG.txt"), 
                skip    = 10,
                comment = "[") %>% 
        drop_na()
d

Exercise 4

Load the tutor_examples("population.csv") file into a tibble{.R} called popul.

library(___)
popul <- read____(tutor_examples("population.csv"))

library(tidyverse)
popul <- read_csv(tutor_examples("population.csv"))

grade_code()

What are the names of the columns? What's the dimension of the table ?

popul

names(popul); dim(popul)

grade_code()

Are the data tidy? Make the table tidy if needed.

popul
popul.tidy <- popul %>% 
    pivot_longer(
        cols     = ___, # what are the columns we want to keep? -> -these
        names_to = ___, # name of the column gathering the original column names
        values_to= ___  # name of the column gathering the original column values
        )

head(popul) # no: every column except `year` holds the values of one city (wide format)
# Gather all columns except `year` into name/value pairs:
# the former column names go to `city`, their values go to `pop`.
popul.tidy <- popul %>% 
    pivot_longer(cols      = -year,
                 names_to  = "city",
                 values_to = "pop"
                )
popul.tidy

grade_code()

Create a subset containing the data for Montpellier using a filtering function from the tidyverse.

library(tidyverse)
popul <- read_csv(tutor_examples("population.csv"))
popul.tidy <- popul %>%
    pivot_longer(
        cols = -year,
        names_to = "city",
        values_to = "pop"
    )

mtp <- popul.tidy %>% ___

mtp <- popul.tidy %>% filter(city == "Montpellier")

grade_code()

What is the max and min of population in this city?
The average population over time?

library(tidyverse)
popul <- read_csv(tutor_examples("population.csv"))
popul.tidy <- popul %>%
    pivot_longer(
        cols = -year,
        names_to = "city",
        values_to = "pop"
    )
mtp <- popul.tidy %>% filter(city == "Montpellier")

mtp

max(mtp$pop)
min(mtp$pop)
range(mtp$pop)
mean(mtp$pop)

grade_code()

What is the total population in 2012?

library(tidyverse)
popul <- read_csv(tutor_examples("population.csv"))
popul.tidy <- popul %>%
    pivot_longer(
        cols = -year,
        names_to = "city",
        values_to = "pop"
    )
mtp <- popul.tidy %>% filter(city == "Montpellier")

popul.tidy %>% 
    ___ %>%         # You need to filter the data for the year 2012
    ___ %>%         # Then select the right column
    ___             # And perform the sum of its data

# Base R: keep the rows where `year` is 2012, take column "pop", and sum it.
sum(popul.tidy[popul.tidy$year == 2012, "pop"])
# Same computation with dplyr verbs; sum() receives the one-column tibble.
popul.tidy %>%
    filter(year == 2012) %>%
    select(pop) %>%
    sum()

grade_code()

What is the total population per year?

library(tidyverse)
popul <- read_csv(tutor_examples("population.csv"))
popul.tidy <- popul %>%
    pivot_longer(
        cols = -year,
        names_to = "city",
        values_to = "pop"
    )
mtp <- popul.tidy %>% filter(city == "Montpellier")

popul.tidy %>% 
    ___ %>%    # You need to group data per year
    ___        # Then summarize the data of each year as 
               # the total population of each group

# One row per year, with `pop_tot` the sum of `pop` over all cities that year.
popul.tidy %>% 
    group_by(year) %>% 
    summarise(pop_tot = sum(pop))

grade_code()

What is the average population per city over the years?

library(tidyverse)
popul <- read_csv(tutor_examples("population.csv"))
popul.tidy <- popul %>%
    pivot_longer(
        cols = -year,
        names_to = "city",
        values_to = "pop"
    )
mtp <- popul.tidy %>% filter(city == "Montpellier")

popul.tidy %>%
    ___ %>%  # You need to group data per...?
    ___      # Then...?

# One row per city, with `pop_ave` the mean of `pop` over all years.
popul.tidy %>%
    group_by(city) %>%
    summarise(pop_ave = mean(pop))

grade_code()

Exercise 5

First, load the tidyverse and lubridate packages.

___
___

library(tidyverse)
library(lubridate)

grade_code()

Load tutor_examples("people1.csv") and tutor_examples("people2.csv") into pp1 and pp2, and take a look at them.

tutor_examples("people1.csv")
tutor_examples("people2.csv")
pp1 <- read____(___)
pp2 <- read____(___)

pp1 <- read_csv(tutor_examples("people1.csv"))
pp2 <- read_csv(tutor_examples("people2.csv"))

grade_code()

Create a new tibble pp by using the pipe operator (%>%{.R}) and successively:

joining the two tibbles into one using inner_join(){.R}
adding a column age containing the age in years (use lubridate::time_length(x, 'years'){.R} with x a time difference in days) by using mutate(){.R}

pp <- pp1 %>%
    ___ %>%       # you need to join with pp2
    mutate(___)   # then add a column `age` computing the right thing

# No `by` is given, so inner_join() matches on the columns that pp1 and pp2
# have in common (dplyr reports which ones it used).
# `today() - dateofbirth` is a time difference; time_length() expresses it
# in years, which is stored in the new `age` column.
pp <- pp1 %>%
    inner_join(pp2) %>%
    mutate(age = time_length(today() - dateofbirth, "years"))

grade_code()

library(tidyverse)
library(lubridate)
pp1 <- read_csv(tutor_examples("people1.csv"))
pp2 <- read_csv(tutor_examples("people2.csv"))
pp <- pp1 %>%
    inner_join(pp2) %>%
    mutate(age = time_length(today() - dateofbirth, "years"))

Display a summary of the table using str(){.R}

str(pp)

grade_code()

Using group_by(){.R} and summarize(){.R}:

Show the number of males and females in the table (use the counter n(){.R})
Show the average age per gender
Show the average size per gender and institution
Show the number of people from each country, sorted by descending population (arrange(){.R})

library(tidyverse)
library(lubridate)
pp1 <- read_csv(tutor_examples("people1.csv"))
pp2 <- read_csv(tutor_examples("people2.csv"))
pp <- pp1 %>%
    inner_join(pp2) %>%
    mutate(age = time_length(today() - dateofbirth, "years"))

# - Show the number of males and females in the table (use the counter `n()`)
pp %>%
    ___ %>%
    ___
# - Show the average age per gender
pp %>%
    ___ %>%
    ___
# - Show the average size per gender and institution
pp %>%
    ___ %>%
    ___
# - Show the number of people from each country, sorted by descending population
pp %>%
    ___ %>%
    ___ %>%
    ___

# - Show the number of males and females in the table (use the counter `n()`)
pp %>%
    group_by(gender) %>%
    summarize(count = n())
# - Show the average age per gender
pp %>%
    group_by(gender) %>%
    summarize(age = mean(age))
# - Show the average size per gender and institution
pp %>%
    group_by(gender, institution) %>%
    summarize(size = mean(size))
# - Show the number of people from each country, sorted by descending population
pp %>%
    group_by(origin) %>%
    summarize(count = n()) %>%
    arrange(desc(count))

grade_code()

Using select(){.R}, display:

only the name and age columns
all but the name column

library(tidyverse)
library(lubridate)
pp1 <- read_csv(tutor_examples("people1.csv"))
pp2 <- read_csv(tutor_examples("people2.csv"))
pp <- pp1 %>%
    inner_join(pp2) %>%
    mutate(age = time_length(today() - dateofbirth, "years"))

# - only the name and age columns
pp ___
# - all but the name column
pp ___

# - only the name and age columns
pp %>% select(c(name, age))
# - all but the name column
pp %>% select(-name)

grade_code()

Using filter(){.R}, show data only for:

Chinese people
People from institution ECL or UCBL
People older than 22
People with an e in their name

library(tidyverse)
library(lubridate)
pp1 <- read_csv(tutor_examples("people1.csv"))
pp2 <- read_csv(tutor_examples("people2.csv"))
pp <- pp1 %>%
    inner_join(pp2) %>%
    mutate(age = time_length(today() - dateofbirth, "years"))

# - Chinese people
pp ___
# - From institution ECL and UCBL
pp ___
# - People older than 22
pp ___
# - People with a `e` in their name
pp ___

# - Chinese people
pp %>% filter(origin == "China")
# - From institution ECL and UCBL
pp %>% filter(institution %in% c("ECL", "UCBL"))
# - People older than 22
pp %>% filter(age > 22)
# - People with a `e` in their name
pp %>% filter(grepl("e", name))

grade_code()

Exercise 6

Here we will see how to load many files at once with the tidyverse and how to perform some data wrangling.

Loading the data

We will work with the files whose paths are stored in the vector flist. These files are all .csv files containing two columns and a header. We can use the fact that read_csv() accepts vectors as argument and read them all at once.

Read them all in a tidy tibble called tib.
Make sure to add a column named "file" containing the list of filenames: look at the id parameter
Modify this "file" column so that it contains just the file name and not the full path – look at the basename() function.

flist <- tutor_examples("sample")

tib <- read_csv(___,           # what do we want to read?
               id = ___) %>%  # what is the name of the column containing the file names ?
        mutate(___)           # modify this column so that it contains just the file name and not the full path

# read_csv() is given the whole vector of paths and stacks the files into one
# tibble; `id = "file"` adds a column named "file" identifying the source of
# each row. basename() then strips the directory part, keeping the file name.
tib <- read_csv(flist, id = "file") %>% 
        mutate(file = basename(file))

grade_code()

Operations on strings

We also want to get information from our file names, such as the sample number, the temperature, the time, and the time unit. Use the function separate() to split the file column into sample, T, time and time_unit. If applicable, make sure that the resulting columns are numeric by getting rid of the annoying characters.

flist <- tutor_examples("sample")
tib <- read_csv(flist, id = "file") %>%
    mutate(file = basename(file))

tib <- tib %>% 
    separate(col = ___, 
             into = ___, 
             convert = ___) %>% 
    mutate(sample = as.numeric(str_replace(___)),
           T = ___,
           time = ___,
           time_unit = ___
           )
tib

# Split `file` into its parts. The `NA` entry in `into` discards the last
# piece (presumably the file extension -- confirm against the file names).
# `convert = TRUE` converts the new columns to a suitable type where possible.
tib <- tib %>% 
    separate(col = file, into = c("sample", "T", "time", "time_unit", NA), convert= TRUE) %>% 
    # Remove the literal "sample" and "K" so that both columns become numeric.
    mutate(sample = as.numeric(str_replace(sample, "sample", "")),
           T = as.numeric(str_replace(T, "K", ""))
           )
tib

grade_code()

Now we want all times to be in the same unit. Using mutate() and ifelse(), convert the minutes into seconds, then get rid of the time_unit column.

flist <- tutor_examples("sample")
tib <- read_csv(flist, id = "file") %>%
    mutate(file = basename(file)) %>% 
    separate(col = file, into = c("sample", "T", "time", "time_unit", NA), convert= TRUE) %>% 
    mutate(sample = as.numeric(str_replace(sample, "sample", "")),
           T = as.numeric(str_replace(T, "K", ""))
           )

tib <- tib %>% 
    mutate(time = ifelse(test, yes, no)) %>% # convert minutes to seconds
    select(___) # get rid of the `time_unit` column
tib

# Rows whose unit is "min" get their time multiplied by 60; all other rows
# keep their time unchanged. The `time_unit` column is then dropped.
tib <- tib %>% 
    mutate(time = ifelse(time_unit=="min", time*60, time)) %>% 
    select(-time_unit)
tib

grade_code()

Plotting data

Before going further, we want to take a look at our data using ggplot. Modify the following code so that a color is added as a function of the sample number, and the plots are gathered on a grid showing the time as a function of the temperature.

flist <- tutor_examples("sample")
tib <- read_csv(flist, id = "file") %>%
    mutate(file = basename(file)) %>% 
    separate(col = file, into = c("sample", "T", "time", "time_unit", NA), convert= TRUE) %>% 
    mutate(sample = as.numeric(str_replace(sample, "sample", "")),
           T = as.numeric(str_replace(T, "K", ""))
           ) %>% 
    mutate(time = ifelse(time_unit=="min", time*60, time)) %>% 
    select(-time_unit)

tib %>% 
    ggplot(aes(x = x, y = y, color = ___)) +
        geom_point() + 
        facet_grid(___ ~ ___)

# Color the points by sample number (factor() makes the color scale discrete)
# and lay the panels out on a grid: one row per `time`, one column per `T`.
tib %>% 
    ggplot(aes(x = x, y = y, color = factor(sample))) +
        geom_point() + 
        facet_grid(time ~ T)

grade_code()

Nesting data

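We want to nest our data to be able to perform operations on them – like fitting them. Using the nest() function, nest the data so that the x and y columns are gathered into a single list-column called data (the file column was already split into sample, T and time in the previous steps, so the result has the columns sample, T, time and data).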

flist <- tutor_examples("sample")
tib <- read_csv(flist, id = "file") %>%
    mutate(file = basename(file)) %>% 
    separate(col = file, into = c("sample", "T", "time", "time_unit", NA), convert= TRUE) %>% 
    mutate(sample = as.numeric(str_replace(sample, "sample", "")),
           T = as.numeric(str_replace(T, "K", ""))
           ) %>% 
    mutate(time = ifelse(time_unit=="min", time*60, time)) %>% 
    select(-time_unit)

tib <- tib %>% 
    nest(___)
tib

# What are the columns to nest ? data = c(these_columns)

# Pack the `x` and `y` columns into a list-column named `data`; the remaining
# columns identify the groups, giving one row per group.
tib <- tib %>% 
    nest(data = c(x, y))
tib

grade_code()

Fitting all data

Now we can fit all our data at once with a linear model:

flist <- tutor_examples("sample")
tib <- read_csv(flist, id = "file") %>%
    mutate(file = basename(file)) %>% 
    separate(col = file, into = c("sample", "T", "time", "time_unit", NA), convert= TRUE) %>% 
    mutate(sample = as.numeric(str_replace(sample, "sample", "")),
           T = as.numeric(str_replace(T, "K", ""))
           ) %>% 
    mutate(time = ifelse(time_unit=="min", time*60, time)) %>% 
    select(-time_unit) %>%
    nest(data = c(x,y))

tib <- tib %>% 
    mutate(fit = map(data, ~lm(data=., x~y)))
tib

# Fit one linear model per nested data set and store the model objects in a
# new list-column `fit`.
# NOTE(review): the formula `x ~ y` models x as a function of y, while the
# plot above puts y on the vertical axis -- `y ~ x` may have been intended;
# confirm (the pre-filled exercise code uses the same formula).
tib <- tib %>%
    mutate(fit = map(data, ~ lm(data = ., x ~ y)))
tib

grade_code()

colinbousige/tutor documentation built on Jan. 29, 2023, 7:35 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

colinbousige/tutor
Interactive tutorials for learning R

In colinbousige/tutor: Interactive tutorials for learning R

Exercise 1

Exercise 2

Exercise 3

Exercise 4

Exercise 5

Exercise 6

Loading the data

Operations on strings

Plotting data

Nesting data

Fitting all data

R Package Documentation

Browse R Packages

We want your feedback!

colinbousige/tutor Interactive tutorials for learning R

In colinbousige/tutor: Interactive tutorials for learning R

Exercise 1

Exercise 2

Exercise 3

Exercise 4

Exercise 5

Exercise 6

Loading the data

Operations on strings

Plotting data

Nesting data

Fitting all data

R Package Documentation

Browse R Packages

We want your feedback!

colinbousige/tutor
Interactive tutorials for learning R