library(data.table)
library(magrittr)
library(hutils)
library(hutilscpp)
library(default)
load("R/sysdata.rda")
default(fread) <- list(showProgress = FALSE)
LATEST <- normalizePath("~/Data/PSMA-Geocoded-Address-202105/",
winslash = "/")
ADDRESS_DETAIL_PID__by__LATLON <-
dir(pattern = "_ADDRESS_DEFAULT_GEOCODE_psv",
recursive = TRUE,
full.names = TRUE,
path = LATEST) %>%
lapply(fread,
na.strings = "",
select = c("ADDRESS_DETAIL_PID",
"LATITUDE",
"LONGITUDE"),
key = "ADDRESS_DETAIL_PID") %>%
rbindlist %>%
setkeyv("ADDRESS_DETAIL_PID")
STREET_PID_vs_ADDRESS_PID <-
dir(pattern = "_ADDRESS_DETAIL_psv.psv$",
path = LATEST,
recursive = TRUE,
full.names = TRUE) %>%
lapply(fread,
na.strings = "",
select = c("ADDRESS_DETAIL_PID",
# "DATE_CREATED",
# "DATE_LAST_MODIFIED",
# "DATE_RETIRED",
"BUILDING_NAME",
# "LOT_NUMBER_PREFIX",
"LOT_NUMBER",
# "LOT_NUMBER_SUFFIX",
# "FLAT_TYPE_CODE",
# "FLAT_NUMBER_PREFIX",
"FLAT_NUMBER",
# "FLAT_NUMBER_SUFFIX",
# "LEVEL_TYPE_CODE",
# "LEVEL_NUMBER_PREFIX",
# "LEVEL_NUMBER",
# "LEVEL_NUMBER_SUFFIX",
# "NUMBER_FIRST_PREFIX",
"NUMBER_FIRST",
# "NUMBER_FIRST_SUFFIX",
# "NUMBER_LAST_PREFIX",
# "NUMBER_LAST",
# "NUMBER_LAST_SUFFIX",
"STREET_LOCALITY_PID",
# "LOCATION_DESCRIPTION",
# "LOCALITY_PID",
# "ALIAS_PRINCIPAL",
"POSTCODE"
# "PRIVATE_STREET",
# "LEGAL_PARCEL_ID",
# "CONFIDENCE",
# "ADDRESS_SITE_PID",
# "LEVEL_GEOCODED_CODE",
# "PROPERTY_PID",
# "GNAF_PROPERTY_PID",
# "PRIMARY_SECONDARY"
)) %>%
rbindlist(use.names = TRUE, fill = TRUE) %>%
setkey(ADDRESS_DETAIL_PID)
STREET_LOCALITY_PID__STREET_NAME_STREET_TYPE_CODE <-
dir(pattern = "_STREET_LOCALITY_psv.psv$",
path = LATEST,
recursive = TRUE,
full.names = TRUE) %>%
lapply(fread,
na.strings = "",
select = c("STREET_LOCALITY_PID",
"STREET_NAME",
"STREET_TYPE_CODE")) %>%
rbindlist(use.names = TRUE, fill = TRUE) %>%
# Some unnamed streets
.[]
# Reduce the size of lookup tables by converting
# character columns to ints
ADDRESS_DETAIL_PID_by_ID <-
ADDRESS_DETAIL_PID__by__LATLON %>%
.[, list(ADDRESS_DETAIL_INTRNL_ID = .I,
ADDRESS_DETAIL_PID)]
ADDRESS_DETAIL_ID__by__LATLON <-
ADDRESS_DETAIL_PID__by__LATLON[ADDRESS_DETAIL_PID_by_ID,
j = list(ADDRESS_DETAIL_INTRNL_ID,
LATITUDE,
LONGITUDE),
on = "ADDRESS_DETAIL_PID"]
STREET_PID_vs_ADDRESS_ID <-
STREET_PID_vs_ADDRESS_PID[ADDRESS_DETAIL_PID_by_ID,
on = "ADDRESS_DETAIL_PID"] %>%
.[, "ADDRESS_DETAIL_PID" := NULL] %>%
set_cols_first("ADDRESS_DETAIL_INTRNL_ID") %>%
setkeyv("ADDRESS_DETAIL_INTRNL_ID") %>%
.[]
STREET_ID_vs_STREET_PID <-
STREET_LOCALITY_PID__STREET_NAME_STREET_TYPE_CODE %>%
.[, list(STREET_LOCALITY_INTRNL_ID = .I,
STREET_LOCALITY_PID)] %>%
setkey(STREET_LOCALITY_INTRNL_ID)
STREET_ID_vs_ADDRESS_ID <-
STREET_ID_vs_STREET_PID[STREET_PID_vs_ADDRESS_ID, on = "STREET_LOCALITY_PID"] %>%
.[, "STREET_LOCALITY_PID" := NULL] %>%
set_cols_first("ADDRESS_DETAIL_INTRNL_ID") %>%
setkeyv("ADDRESS_DETAIL_INTRNL_ID") %>%
.[]
STREET_LOCALITY_ID__STREET_NAME_STREET_TYPE_CODE <-
STREET_ID_vs_STREET_PID[STREET_LOCALITY_PID__STREET_NAME_STREET_TYPE_CODE, on = "STREET_LOCALITY_PID"] %>%
.[, "STREET_LOCALITY_PID" := NULL] %>%
set_cols_first("STREET_LOCALITY_INTRNL_ID") %>%
setkeyv("STREET_LOCALITY_INTRNL_ID") %>%
.[]
street_type_decoder <- fread("data-raw/street_type_decoder.tsv")
set_unique_key(street_type_decoder, street_abbrev)
# 2big4Github
#
# devtools::use_data(ADDRESS_DETAIL_ID__by__LATLON,
# STREET_ID_vs_ADDRESS_ID,
# STREET_LOCALITY_ID__STREET_NAME_STREET_TYPE_CODE,
# street_type_decoder,
# internal = TRUE,
# overwrite = TRUE)
provide.dir("tsv")
fwrite(ADDRESS_DETAIL_ID__by__LATLON, "tsv/ADDRESS_DETAIL_ID__by__LATLON.tsv", sep = "\t")
fwrite(STREET_ID_vs_ADDRESS_ID, "tsv/STREET_ID_vs_ADDRESS_ID.tsv", sep = "\t")
fwrite(STREET_LOCALITY_ID__STREET_NAME_STREET_TYPE_CODE, "tsv/STREET_LOCALITY_ID__STREET_NAME_STREET_TYPE_CODE.tsv", sep = "\t")
write_dat_fst <- function(x) {
fst::write_fst(x, paste0("inst/extdata/", deparse(substitute(x)), ".fst"), compress = 100)
}
write_dat_fst2 <- function(x) {
fst::write_fst(get(x), paste0("inst/extdata/", x, ".fst"), compress = 100)
}
address2 <-
ADDRESS_DETAIL_ID__by__LATLON %>%
.[, .(ADDRESS_DETAIL_INTRNL_ID, LATITUDE, LONGITUDE,
lat_int = as.integer(LATITUDE),
lat_rem = as.integer(10^7 * (LATITUDE - as.integer(LATITUDE))),
lon_int = as.integer(LONGITUDE),
lon_rem = as.integer(10^7 * (LONGITUDE - as.integer(LONGITUDE))))] %>%
setkey(ADDRESS_DETAIL_INTRNL_ID)
# Breaks 13
addressB13 <-
ADDRESS_DETAIL_ID__by__LATLON %>%
unique(by = c("LATITUDE", "LONGITUDE"))
lon_range <- addressB13[, minmax(LONGITUDE)]
lat_range <- addressB13[, minmax(LATITUDE)]
hutilscpp:::cut_DT(addressB13,
depth = 13L,
x_range = lon_range,
y_range = lat_range)
setkeyv(addressB13, c("xbreaks13", "ybreaks13"))
stopifnot(addressB13[, last(xbreaks13)] == 8192L)
the_8192_seq_lon <- seq(lon_range[1], lon_range[2], length.out = 8092L)
addressB13_ranges <-
addressB13[, .(lat_min = min(LATITUDE),
lat_max = max(LATITUDE),
lon_min = min(LONGITUDE),
lon_max = max(LONGITUDE)),
keyby = .(xbreaks13, ybreaks13)] %>%
.[, all_min_lat := min(lat_min), keyby = "ybreaks13"] %>%
.[, all_max_lat := max(lat_max), keyby = "ybreaks13"] %>%
.[, all_min_lon:= min(lon_min), keyby = "xbreaks13"] %>%
.[, all_max_lon:= max(lon_max), keyby = "xbreaks13"] %>%
.[, d_lon_s := all_min_lon - shift(all_max_lon)] %>%
.[, d_lon_n := shift(all_min_lon, type = "lead", fill = 180) - all_max_lon] %>%
setkey(ybreaks13) %>%
.[, d_lat_e := all_min_lat - shift(all_max_lat, fill = 0)] %>%
.[, d_lat_w := shift(all_min_lat, type = "lead", fill = 90) - all_max_lat] %>%
.[]
# Need to break up to avoid GitHub file size limits
# Australia is skewed...
median_xbreaks13 <- PSMA:::median_xbreaks13
addressB13_west <- addressB13[.(1:median_xbreaks13), on = "xbreaks13"]
addressB13_east <- addressB13[.(median_xbreaks13:8192), on = "xbreaks13"] # overlaps
for (i in 6:12) {
assign(paste0("addressB", i),
value = {
x <-
copy(address2) %>%
.[, .(ADDRESS_DETAIL_INTRNL_ID, LATITUDE, LONGITUDE)] %>%
unique(by = c("LATITUDE", "LONGITUDE"))
hutilscpp:::cut_DT(x,
depth = i,
x_range = lon_range,
y_range = lat_range)
setkeyv(x, paste0(c("xbreaks", "ybreaks"), i))
the_L <- L1_20[[i]]
col_xbreak <- paste0("xbreaks", i)
col_ybreak <- paste0("ybreaks", i)
centre <-
x[the_L, on = key(x), nomatch=0L]
})
}
for (addressBs in paste0("addressB", 6:12)) {
write_dat_fst2(addressBs)
}
address2[, c("LATITUDE", "LONGITUDE") := NULL]
write_dat_fst(address2)
write_dat_fst(addressB13_west)
write_dat_fst(addressB13_east)
# write_dat_fst(ADDRESS_DETAIL_ID__by__LATLON)
write_dat_fst(STREET_ID_vs_ADDRESS_ID)
write_dat_fst(STREET_LOCALITY_ID__STREET_NAME_STREET_TYPE_CODE)
find_data_with_noms <- function(noms, just_act = TRUE) {
all_files.psv <- dir(LATEST,
pattern = "\\.psv$",
full.names = TRUE,
recursive = TRUE)
have_noms <-
sapply(all_files.psv,
function(file.psv) {
if (just_act && !grepl("ACT", file.psv)) {
return(FALSE)
}
all(noms %chin% names(fread(file = file.psv, nrows = 0, sep = "|")))
})
all_files.psv[have_noms]
}
rbindfiles <- function(files.psv, ste = TRUE,
showProgress = FALSE,
sep = "|",
na.strings = c("", "NA"),
select = NULL) {
L <- lapply(files.psv,
fread,
showProgress = showProgress,
sep = "|",
na.strings = na.strings,
select = select)
names(L) <- trim_common_affixes(files.psv)
if (ste) {
rbindlist(L, use.names = TRUE, idcol = "STE")
} else {
rbindlist(L, use.names = TRUE)
}
}
LOCALITY_vs_LOCALITY_PID <-
find_data_with_noms(c("LOCALITY_PID", "NAME"), just_act = FALSE) %>%
rbindfiles
write_dat_fst(LOCALITY_vs_LOCALITY_PID)
LOCALITY_VS_POSTCODE <-
dir(LATEST, pattern = "ADDRESS_DETAIL_psv", recursive = TRUE,
full.names = TRUE) %>%
lapply(function(file.psv) {
fread(file = file.psv,
showProgress = FALSE,
sep = "|",
na.strings = c("", "NA"),
select = c("ADDRESS_DETAIL_PID",
"LOCALITY_PID", "POSTCODE"))
}) %>%
setNames(trim_common_affixes(
dir(LATEST, pattern = "ADDRESS_DETAIL_psv", recursive = TRUE,
full.names = TRUE))) %>%
rbindlist(use.names = TRUE,
idcol = "STE") %>%
.[, .N, keyby = .(STE, LOCALITY_PID, POSTCODE)]
write_dat_fst(LOCALITY_VS_POSTCODE)
STREET_NAMES_BY_ID <-
dir(LATEST, pattern = "STREET_LOCALITY_psv\\.psv$", recursive = TRUE,
full.names = TRUE) %>%
lapply(function(file.psv) {
fread(file = file.psv,
showProgress = FALSE,
sep = "|",
na.strings = c("", "NA"),
select = c("STREET_LOCALITY_PID",
"LOCALITY_PID",
"STREET_NAME",
"STREET_TYPE_CODE"))
}) %>%
rbindlist(use.names = TRUE, fill = TRUE)
STREET_BY_POSTCODE <-
STREET_NAMES_BY_ID[LOCALITY_VS_POSTCODE, on = "LOCALITY_PID", nomatch = 0L] %>%
.[, .(STREET_NAME, STREET_TYPE_CODE, POSTCODE)] %>%
unique %>%
setkey(POSTCODE, STREET_NAME) %>%
.[]
write_dat_fst(STREET_BY_POSTCODE)
usethis::use_data(street_type_decoder, overwrite = TRUE)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.