#' ---
#' filename: build-actc.R
#' created: 2017-09-28
#' updated: <2019-07-17 16:55:42 david at grover>
#' author: David Mitchell <david.p.mitchell@homemail.com.au>
#' description: Build Australian Customs Tariff Classification
#' Script automatically downloads the ACTC from the DIBP
#' Tariff Classification (Schedule 3)
#' notes: Check `base_url` is still current
#' ---
### Set ACTC base URL
base_url_2017 <- "https://www.border.gov.au/Busi/cargo-support-trade-and-goods/importing-goods/tariff-classification-of-goods/current-tariff-classification/schedule-3";
base_url_2019 <- "https://www.abf.gov.au/importing-exporting-and-manufacturing/tariff-classification/current-tariff/schedule-3"
base_url <- base_url_2019;
###### Section 0 - Libraries & settings
library(magrittr);
library(rvest); ## Web scraping functions
library(urltools); ## URL parsing and composing
library(XML); ##
library(httr); ##
library(readxl); ## Read Excel files
library(dplyr);
library(tidyr);
### Settings
DEBUG <- FALSE
### Functions
#' @name which_link
#' @title return URL link number matching specified \code{pattern}
#' @description return the link numbers
#' @import xml2::read_html rvest::html_nodes, rvest::html_text
#' @param x a session
#' @param node a node set or a single mode
#' @param pattern character string containing a regular expression to be matched against the node set string
#' @return returns integer specifying the number of the matching link
which_link <- function(x, node, pattern) {
x %>% xml2::read_html(.) %>%
rvest::html_nodes(node) %>%
rvest::html_text() %>%
grep(pattern, .)
}
#' @name capwords
#' @title Capitalise words
#' @param s string
#' @param split character on which to split the original string
#' @param strict enforce capitalisation
#' @return returns capitalised character string
function (s, split = c("\\W"), strict = FALSE)
{
if (length(s) == 0)
stop("Cannot capitalise a blank string.")
if (!is.character(s))
s <- as.character(s)
replace(s, is.na(s), "")
cap <- function(s) paste(toupper(substring(s, 1, 1)), {
s <- substring(s, 2)
if (strict)
tolower(s)
else s
}, sep = "", collapse = " ")
sapply(strsplit(s, split = split), cap, USE.NAMES = !is.null(names(s)))
}
##
httr::set_config(config(ssl_verifypeer = 0L));
###### Section 1 - Import ACTC Schedule 3
#### Return Schedule 3 - Section numbers, names and URLs
actc_sections <- base_url %>%
html_session %>%
read_html %>%
html_nodes("a") %>%
Map(function(x) data.frame(href = html_attr(x, "href"),
text = html_text(x)),
.) %>%
bind_rows %>%
filter(grepl("section-\\d+\\.aspx", href)) %>%
mutate(type = ifelse(grepl("Section\\W*\\w+", text, ignore.case=TRUE), "Number", "Name")) %>%
spread(type, text) %>%
mutate(section_number = sub("Section\\W*(\\w+)", "\\1", Number),
section_name = Name) %>%
rename(section_href = href) %>%
select(section_href, section_number, section_name);
if (DEBUG) {
actc_sections %>%
write.csv(file=file.path(tempdir(), "Sections.csv"), row.names=FALSE)
}
#### Return Schedule 3 - Chapter numbers, names and hrefs
raw_actc_chapters <- actc_sections$section_href %>%
Map(function(x) base_url %>% url_parse %>%
inset("path", value=x) %>%
url_compose %>%
html_session %>%
read_html %>%
html_nodes("a"),
.);
actc_chapters <- Map(function(x, i) data.frame(section_href = i,
href = html_attr(x, "href"),
text = html_text(x)),
raw_actc_chapters, names(raw_actc_chapters)) %>%
bind_rows %>%
filter(grepl("chapter-\\d+\\.aspx$", href)) %>%
mutate(type = ifelse(grepl("Chapter\\W*\\w+", text, ignore.case=TRUE), "Number", "Name")) %>%
distinct(section_href, href, text, type) %>% ## Remove duplicates
filter(!grepl("^\\d+$", text) & !grepl("^$", text)) %>% ## Remove blank/numeric-only text
filter(!grepl("^actc_chapters\\W*\\d+", text)) %>% ## Remove text == 'actc_chapters xx'
spread(type, text) %>%
filter(!is.na(Name)) %>% ## Remove NA names
mutate(chapter_number = sub("Chapter\\W*(\\w+)", "\\1", Number),
chapter_name = Name) %>%
rename(chapter_href = href) %>%
arrange(chapter_number) %>%
select(section_href, chapter_href, chapter_number, chapter_name);
if (DEBUG) {
actc_chapters %>%
write.csv(file=file.path(tempdir(), "actc_chapters.csv"), row.names=FALSE)
}
#### Return Schedule 3 - Tables
Tables <-
Map(function(x)
base_url %>% url_parse %>%
inset("path", value=x) %>%
url_compose %>%
html_session %>%
read_html %>%
html_nodes("table") %>%
html_table(fill=TRUE) %>%
## Rename table columns - note use of seq_along(..) to allow for column-varying tables
lapply(.,
function(y)
y %>% set_names(c("reference_number","statistical_code","unit","goods","rate",
"X_1","X_2")[seq_along(names(.))]) %>%
mutate(chapter_href = x) %>%
mutate_at(vars(statistical_code, reference_number), funs(as.character))
),
actc_chapters$chapter_href) %>%
unlist(recursive=FALSE) %>%
bind_rows;
if (DEBUG) {
Tables %>%
write.csv(file=file.path(tempdir(), "Tables.csv"), row.names=FALSE)
}
#### Create ACTC data frame
actc <- Tables %>%
## Join Section and Chapter numbrers/names
left_join(actc_chapters, by="chapter_href") %>%
left_join(actc_sections, by="section_href") %>%
## Create 8-character reference_number
mutate(clean_reference_number = reference_number %>%
gsub("\\.", "", .),
tmp_reference_number = clean_reference_number %>%
stringr::str_pad(width=8, side="right", pad="0")) %>%
## Clean up 'goods'' text: i) remove leading '-'; ii) remove trailing ':';
## iii) capitalise first letter; iv) remove \r\n and multiple whitespace (also in 'rate')
mutate_at(vars(goods), funs(sub("^\\s*-*\\s*(.+)$", "\\1", .))) %>%
mutate_at(vars(goods), funs(sub("^(.+):$", "\\1", .))) %>%
mutate_at(vars(goods), funs(paste0(toupper(substring(., 1, 1)),
tolower(substring(., 2))))) %>%
mutate_at(vars(goods, rate), funs(gsub("\\s+", " ", .))) %>%
mutate(heading_number = tmp_reference_number %>% gsub(".{4}$", "", .),
subheading_number_5digit = tmp_reference_number %>% gsub(".{3}$", "", .),
subheading_number_6digit = tmp_reference_number %>% gsub(".{2}$", "", .),
heading_name = ifelse(grepl("0{4}$", tmp_reference_number), goods, NA_character_) %>%
zoo::na.locf(na.rm=FALSE),
subheading_name_5digit = ifelse(grepl("[1-9]0{3}$", tmp_reference_number), goods, NA_character_) %>%
zoo::na.locf(na.rm=FALSE),
subheading_name_6digit = ifelse(grepl("[1-9]0{2}$", tmp_reference_number), goods, NA_character_) %>%
zoo::na.locf(na.rm=FALSE),
commodity_name = paste(heading_name, subheading_name_5digit, subheading_name_6digit,
sep="; ") %>% gsub("; NA", "", .))
## Clean and refine
actc <- actc %>%
filter(nchar(actc$clean_reference_number) == 8) %>%
mutate(reference_number = tmp_reference_number %>% as.integer) %>%
rename(notes = X_1) %>%
select(section_number, section_name,
chapter_number, chapter_name,
heading_number, heading_name,
subheading_number_5digit, subheading_name_5digit,
subheading_number_6digit, subheading_name_6digit,
reference_number, statistical_code, unit, commodity_name, rate, notes);
if (DEBUG) {
actc %>%
write.csv(file=file.path(tempdir(), "ACTC.csv"), row.names=FALSE)
}
### Write results to actc_2019 data frame
actc_2019 <- actc;
###### Section 4 - Write data sets files
devtools::use_data(actc_2019, overwrite=TRUE);
## -------------------------------- EOF ---------------------------------------
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.