oica: Data on Vehicle Production and Sales from the OICA

library(tidyverse)
library(rvest)
library(janitor)
library(readxl)
options(dplyr.width = Inf)

# Standard country / area codes from UN

world_regions <- read_excel(
    file.path('data-raw', 'raw', 'un_country_data.xlsx')) %>%
    clean_names() %>%
    select(
        country = country_or_area,
        region = region_name,
        subregion = sub_region_name,
        intermediate_region = intermediate_region_name,
        least_developed = least_developed_countries_ldc,
        land_locked_developing = land_locked_developing_countries_lldc,
        small_island_developing = small_island_developing_states_sids,
        code_region = region_code,
        code_subregion = sub_region_code,
        code_intermediate_region = intermediate_region_code,
        code_m49 = m49_code,
        code_iso_alpha2 = iso_alpha2_code,
        code_iso_alpha3 = iso_alpha3_code
    ) %>%
    mutate(
        least_developed = ifelse(is.na(least_developed), 0, 1),
        land_locked_developing = ifelse(is.na(land_locked_developing), 0, 1),
        small_island_developing = ifelse(is.na(small_island_developing), 0, 1)
    ) %>%
    # Make column for mis-matching country names to match OICA names
    mutate(
        country_oica = case_when(
            country == 'Bolivia (Plurinational State of)' ~ 'Bolivia',
            country == 'Bosnia and Herzegovina' ~ 'Bosnia',
            country == 'Brunei Darussalam' ~ 'Brunei',
            country == 'China, Hong Kong Special Administrative Region' ~ 'Hong Kong',
            country == 'Czechia' ~ 'Czech Republic',
            country == 'Iran (Islamic Republic of)' ~ 'Iran',
            country == "Lao People's Democratic Republic" ~ 'Laos',
            country == 'Republic of Korea' ~ 'South Korea',
            country == 'Republic of Moldova' ~ 'Moldova',
            country == 'Réunion' ~ 'Reunion',
            country == 'Russian Federation' ~ 'Russia',
            country == 'State of Palestine' ~ 'Palestine',
            country == 'Syrian Arab Republic' ~ 'Syria',
            country == 'United Kingdom of Great Britain and Northern Ireland' ~ 'United Kingdom',
            country == 'United Republic of Tanzania' ~ 'Tanzania',
            country == 'United States of America' ~ 'USA',
            country == 'Venezuela (Bolivarian Republic of)' ~ 'Venezuela',
            country == 'Viet Nam' ~ 'Vietnam',
            TRUE ~ country
        )
    ) %>%
    # Add Taiwan
    rbind(data.frame(
        country = 'Taiwan',
        region = 'Asia',
        subregion = 'Eastern Asia',
        intermediate_region = NA,
        least_developed = 0,
        land_locked_developing = 0,
        small_island_developing = 0,
        code_region = 142,
        code_subregion = 30,
        code_intermediate_region = NA,
        code_m49 = NA,
        code_iso_alpha2 = 'CN-TW',
        code_iso_alpha3 = 'TWN',
        country_oica = 'Taiwan'
    ))

write_csv(world_regions, file.path("data-raw", "world_regions.csv"))




# Production Data ----

# Function for scraping the production data table out of the web page
# for a given year
parse_table <- function(year) {
    url_base <- "https://www.oica.net/category/production-statistics/"
    url <- paste0(url_base, year, "-statistics/")
    df <- read_html(url) %>%
        html_table(header = TRUE, trim = TRUE)
    df <- clean_names(df[[1]]) %>%
        mutate(year = year)
    return(df)
}

# Get all the tables
tables <- lapply(seq(2006, 2023), parse_table)

# Make the data frame
production <- do.call(rbind, tables) %>%
    mutate(
        country = str_to_title(country_region),
        pv = parse_number(cars),
        cv = parse_number(commercial_vehicles, na = c("N.A.", "-"))) %>%
    select(year, country, pv, cv) %>%
    filter(! country %in% c("Total", "Totals")) %>%
    pivot_longer(
        names_to = "type",
        values_to = "n",
        pv:cv
    ) %>%
    # Harmonize some country names
    mutate(
        country = case_when(
            country == "Czech Rep." ~ "Czech Republic",
            country == "Usa" ~ "USA",
            country == "Uk" ~ "United Kingdom",
            country == "Supplementary" ~ "Others",
            TRUE ~ country
        )
    ) %>%
    # Join region data
    left_join(
        world_regions %>%
            select(-country) %>%
            rename(country = country_oica),
        by = 'country')

# Save as csv
write_csv(production, file.path("data-raw", "production.csv"))



# Sales Data ----

# Raw data downloaded from
# https://www.oica.net/category/sales-statistics/

# pc = passenger cars
# cv = commercial vehicles

# 2005 - 2019

pc <- read_excel(file.path('data-raw', 'raw', 'pc-sales-2019.xlsx'), skip = 5)
pc$country <- str_to_title(pc$'REGIONS/COUNTRIES')
pc <- pc %>%
    gather(year, sales, '2005':'2019') %>%
    select(country, year, sales) %>%
    mutate(type = 'pc')
cv <- read_excel(file.path('data-raw', 'raw', 'cv-sales-2019.xlsx'), skip = 5)
cv$country <- str_to_title(cv$'REGIONS/COUNTRIES')
cv <- cv %>%
    gather(year, sales, '2005':'2019') %>%
    select(country, year, sales) %>%
    mutate(type = 'cv')
df_05_19 <- rbind(pc, cv) %>%
    filter(is.na(country)==F) %>%
    mutate(
        country = str_replace(country, '\\*', ''),
        year = as.numeric(year)) %>%
    # Drop footnote rows
    filter(
        country != '1 Only Lv',
        country != '2 Including Heavy Trucks, Buses And Coaches'
    )

# 2020 - 2021
pc <- read_excel(file.path('data-raw', 'raw', 'pc-sales-2021.xlsx'), skip = 3)
pc$country <- str_to_title(pc$'REGIONS/COUNTRIES')
pc <- pc %>%
    gather(year, sales, 'Q1-Q4 2019':'Q1-Q4 2021') %>%
    select(country, year, sales) %>%
    mutate(type = 'pc')
cv <- read_excel(file.path('data-raw', 'raw', 'cv-sales-2021.xlsx'), skip = 3)
cv$country <- str_to_title(cv$'REGIONS/COUNTRIES')
cv <- cv %>%
    gather(year, sales, 'Q1-Q4 2019':'Q1-Q4 2021') %>%
    select(country, year, sales) %>%
    mutate(type = 'cv')
df_20_21 <- rbind(pc, cv) %>%
    filter(is.na(country)==F) %>%
    mutate(
        country = str_replace(country, '\\*', ''),
        year = as.numeric(str_replace(year, "Q1-Q4 ", ""))) %>%
    filter(year > 2019)

# 2022 - 2023
pc <- read_excel(file.path('data-raw', 'raw', 'pc-sales-2023.xlsx'), skip = 3)
pc$country <- str_to_title(pc$'REGIONS/COUNTRIES')
pc <- pc %>%
    gather(year, sales, 'Q1-Q4 2021':'Q1-Q4 2023') %>%
    select(country, year, sales) %>%
    mutate(type = 'pc')
cv <- read_excel(file.path('data-raw', 'raw', 'cv-sales-2023.xlsx'), skip = 3)
cv$country <- str_to_title(cv$'REGIONS/COUNTRIES')
cv <- cv %>%
    gather(year, sales, 'Q1-Q4 2021':'Q1-Q4 2023') %>%
    select(country, year, sales) %>%
    mutate(type = 'cv')
df_22_23 <- rbind(pc, cv) %>%
    filter(is.na(country)==F) %>%
    mutate(
        country = str_replace(country, '\\*', ''),
        year = as.numeric(str_replace(year, "Q1-Q4 ", ""))) %>%
    filter(year > 2021)


# Merge years and fix country name irregularities
df <- rbind(df_05_19, df_20_21, df_22_23) %>%
    mutate(
        country = case_when(
            country == 'Switzerland (+Fl)' ~ 'Switzerland',
            country == 'Congo Kinshasa' ~ 'Congo',
            country == 'Moldavia' ~ 'Moldova',
            country == 'United States Of America' ~ 'USA',
            country == 'Azerbaidjan' ~ 'Azerbaijan',
            country == 'Cambodge' ~ 'Cambodia',
            country == 'Hong-Kong' ~ 'Hong Kong',
            country == 'Irak' ~ 'Iraq',
            country == 'Kazakstan' ~ 'Kazakhstan',
            country == 'Kirghizistan' ~ 'Kyrgyzstan',
            country == 'Tadjikistan' ~ 'Tajikistan',
            country == 'Tahiti French Polynesia' ~ 'French Polynesia',
            country == 'Tahiti' ~ 'French Polynesia',
            country == 'Tukmenistan' ~ 'Turkmenistan',
            country == 'Trinidad' ~ 'Trinidad and Tobago',
            country == 'Burkina' ~ 'Burkina Faso',
            country == 'Guiana (French)' ~ 'French Guiana',
            country == 'Guiana' ~ 'French Guiana',
            country == 'Bulgaria1' ~ 'Bulgaria',
            country == 'Mexico2' ~ 'Mexico',
            TRUE ~ country
        )
    )

# Join region data
df <- df %>%
    left_join(
    world_regions %>%
        select(-country) %>%
        rename(country = country_oica),
    by = 'country') %>%
    select(year, country, type, n = sales, everything())

# Separate out countries and regions
sales_region <- df %>%
    filter(is.na(region) == TRUE) %>%
    select(year, region = country, type, n) %>%
    mutate(
        region = ifelse(
            region %in% c(
                "Eu 27 Countries + Efta + Uk", "Eu 28 Countries + Efta"),
            "EU", region
        )
    ) %>%
    filter(! region %in% c('Eu 15 Countries + Efta', 'Europe New Members')) %>%
    arrange(region, year)

sales_country <- df %>%
    filter(is.na(region) == F) %>%
    arrange(country)

# Save data frames
write_csv(sales_country, file.path('data-raw', 'sales_country.csv'))
write_csv(sales_region, file.path('data-raw', 'sales_region.csv'))



# Save the datasets
usethis::use_data(production, overwrite = TRUE)
usethis::use_data(sales_country, overwrite = TRUE)
usethis::use_data(sales_region, overwrite = TRUE)

jhelvy/oica documentation built on June 11, 2025, 5:40 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

jhelvy/oica
Data on Vehicle Production and Sales from the OICA

data-raw/format_data.R
In jhelvy/oica: Data on Vehicle Production and Sales from the OICA

R Package Documentation

Browse R Packages

We want your feedback!

jhelvy/oica Data on Vehicle Production and Sales from the OICA

data-raw/format_data.R In jhelvy/oica: Data on Vehicle Production and Sales from the OICA

R Package Documentation

Browse R Packages

We want your feedback!

jhelvy/oica
Data on Vehicle Production and Sales from the OICA

data-raw/format_data.R
In jhelvy/oica: Data on Vehicle Production and Sales from the OICA