Using the Student and Country Data

# Vignette setup: turn off the rmarkdown vignette title check and set the
# default knitr options applied to every chunk.
options(rmarkdown.html_vignette.check_title = FALSE)
knitr::opts_chunk$set(
  echo = TRUE,
  collapse = TRUE,
  comment = "#>",
  warning = FALSE,
  message = FALSE,
  error = FALSE,
  # The knitr chunk option is spelled `out.width`; the previous `outwidth`
  # is not a recognised option, so the width setting had no effect.
  out.width = "100%",
  fig.width = 8,
  fig.height = 6,
  fig.align = "center"
)

Introduction

The goal of learningtower is to provide a user-friendly R package that gives easy access to a subset of variables from the PISA data collected by the OECD. Version `r utils::packageVersion("learningtower")` of this package provides the data for the years `r learningtower:::data_range()`. The survey data is published every three years. This is an excellent real-world dataset for data exploration, data visualisation and statistical computation.

This vignette documents how to access the data, and shows a few typical methods to explore the data.

Exploring the student data

Usage of the subset of the student data

Below is a quick example of loading the 2018 subset student data.

library(dplyr)
library(ggplot2)
library(learningtower)

# Load the 2018 student subset and the countrycode data in a single call,
# then print a compact overview of the student columns.
data(student_subset_2018, countrycode)

dplyr::glimpse(student_subset_2018)
# Country codes (as matched against the `country` column) to compare.
selected_countries <- c(
  "AUS", "USA", "TUR", "SWE",
  "CHE", "NZL", "BEL", "DEU"
)

# Box plots of the 2018 math scores by gender for the selected countries.
# No group_by() is used: filter(), left_join() and ggplot() do not rely on
# dplyr groups here, and the former grouping was never removed afterwards.
student_subset_2018 |>
  dplyr::filter(country %in% selected_countries) |>
  # The join makes `country_name` available for the y axis
  # (presumably supplied by `countrycode` -- verify against the data).
  dplyr::left_join(countrycode, by = "country") |>
  ggplot(aes(x = math, y = country_name, fill = gender)) +
  geom_boxplot() +
  scale_fill_manual(values = c("#FF7F0EFF", "#1F77B4FF")) +
  theme_classic() +
  labs(x = "Math score", y = "")

Usage of the entire student data

# Load the entire student data for the year 2018
student_data_2018 <- load_student(2018)

# Load the entire student data for two of the years (2012, 2018)
student_data_2012_2018 <- load_student(c(2012, 2018))

# Load the entire student data for all available years
student_data_all <- load_student("all")
# Loaded here so this chunk can run on its own.
# NOTE(review): the same call appears in the usage examples above; those may
# not be evaluated when the vignette is built -- confirm before removing.
student_data_2012_2018 <- load_student(c(2012, 2018))

# Average math score per country and year for the selected countries, plus
# the text and x position of the label attached to each value.
plot_data <- student_data_2012_2018 |>
  # Filter first so only the selected countries are grouped and summarised.
  dplyr::filter(country %in% selected_countries) |>
  dplyr::group_by(country, year) |>
  # `.groups = "drop"` returns an ungrouped result. Previously the result
  # stayed grouped by `country`, so the select() below silently kept that
  # column despite not listing it.
  dplyr::summarise(
    avg_math = mean(math, na.rm = TRUE),
    .groups = "drop"
  ) |>
  dplyr::left_join(countrycode, by = "country") |>
  dplyr::select(country_name, year, avg_math) |>
  dplyr::mutate(
    # 2012 labels are placed at x = 2010, all other labels at x = 2019.
    label_x_pos = ifelse(year == 2012, 2012 - 2, 2018 + 1),
    # 2012 labels carry the country name; the others show only the score.
    label = ifelse(
      year == 2012,
      paste0(country_name, ", ", round(avg_math)),
      round(avg_math)
    )
  )

# Line plot of the average math score per country across the two years,
# with text labels; the legend and the y-axis ticks/text are hidden.
plot_data |>
  ggplot(aes(
    x = year,
    y = avg_math,
    label = label,
    colour = country_name,
    group = country_name
  )) +
  geom_point() +
  geom_line() +
  # One layer draws both dashed reference lines (formerly two identical
  # geom_vline() calls that differed only in `xintercept`).
  geom_vline(
    xintercept = c(2012, 2018),
    linetype = "dashed",
    linewidth = 0.1
  ) +
  # Labels are drawn at the precomputed `label_x_pos`; the former
  # position_nudge(y = 0) shifted nothing and has been dropped.
  geom_text(aes(x = label_x_pos)) +
  # The limits are wider than the two breaks, presumably to leave room for
  # the labels positioned at 2010 and 2019.
  scale_x_continuous(
    breaks = c(2012, 2018),
    limits = c(2008, 2020)
  ) +
  scale_colour_manual(
    values = c(
      "#1F77B4FF", "#FF7F0EFF", "#2CA02CFF", "#D62728FF",
      "#9467BDFF", "#8C564BFF", "#E377C2FF", "#7F7F7FFF"
    )
  ) +
  labs(x = "", y = "Average maths score") +
  theme_classic() +
  theme(
    axis.ticks.y = element_blank(),
    axis.text.y = element_blank(),
    legend.position = "none"
  )


Try the learningtower package in your browser

Any scripts or data that you put into this service are public.

learningtower documentation built on April 4, 2025, 2:27 a.m.