#' Update data
#'
#' @param path Directory of unzipped folder of the `GNAF` unzipped folder. If
#' \code{NULL}, the default, an attempt will be made to infer the latest PSMA
#' url and unzipped in a temporary directory.
#'
#' @param just_tsv \code{bool}. If \code{TRUE}, only the .tsv files are produced.
#' This is intended to be useful only for this package's development.
#'
#' @return
#' Called for its side-effect. When called, data from `path` are incorporated
#' into the PSMA package folder.
#'
#' @export
update_data <- function(path = NULL, just_tsv = FALSE) {
if (is.null(path)) {
return(update_data_auto())
}
stopifnot(is.character(path), length(path) == 1, dir.exists(path),
length(grep("\\.psv$", dir(path = path, recursive = TRUE))) >= 8)
.latest2fst(path, just_tsv = just_tsv)
}
update_data_auto <- function() {
url <- readLines("https://raw.githubusercontent.com/HughParsonage/PSMA/master/data-raw/LATEST_PSMA_URL_ZIP")
tempf.zip <- tempfile(fileext = ".zip")
status <- download.file(url, destfile = tempf.zip, mode = "wb")
if (status) {
stop("status code ", status)
}
current_dir <- getwd()
setwd(dirname(tempf.zip))
new_dir <- paste0("./", basename(url), "/")
unzipped.files <- unzip(tempfile, exdir = new_dir)
update_data(new_dir)
}
.latest2fst <- function(LATEST, progress = 2L, just_tsv = FALSE) {
cat <- function(...) {
base::cat(format(Sys.time(), "%H:%M"), ...)
}
ADDRESS_DETAIL_PID <- LATITUDE <- LONGITUDE <- NULL
ADDRESS_DETAIL_PID__by__LATLON <-
dir(pattern = "_ADDRESS_DEFAULT_GEOCODE_psv",
recursive = TRUE,
full.names = TRUE,
path = LATEST) %>%
lapply(fread,
na.strings = "",
showProgress = FALSE,
select = c("ADDRESS_DETAIL_PID",
"LATITUDE",
"LONGITUDE"),
key = "ADDRESS_DETAIL_PID") %>%
rbindlist %>%
setkeyv("ADDRESS_DETAIL_PID")
if (progress) {
cat("ADDRESS_DETAIL_PID__by__LATLON created.\n")
}
STREET_PID_vs_ADDRESS_PID <-
dir(pattern = "_ADDRESS_DETAIL_psv.psv$",
path = LATEST,
recursive = TRUE,
full.names = TRUE) %>%
lapply(fread,
showProgress = FALSE,
na.strings = "",
select = c("ADDRESS_DETAIL_PID",
# "DATE_CREATED",
# "DATE_LAST_MODIFIED",
# "DATE_RETIRED",
"BUILDING_NAME",
# "LOT_NUMBER_PREFIX",
"LOT_NUMBER",
# "LOT_NUMBER_SUFFIX",
# "FLAT_TYPE_CODE",
# "FLAT_NUMBER_PREFIX",
"FLAT_NUMBER",
# "FLAT_NUMBER_SUFFIX",
# "LEVEL_TYPE_CODE",
# "LEVEL_NUMBER_PREFIX",
# "LEVEL_NUMBER",
# "LEVEL_NUMBER_SUFFIX",
# "NUMBER_FIRST_PREFIX",
"NUMBER_FIRST",
# "NUMBER_FIRST_SUFFIX",
# "NUMBER_LAST_PREFIX",
# "NUMBER_LAST",
# "NUMBER_LAST_SUFFIX",
"STREET_LOCALITY_PID",
# "LOCATION_DESCRIPTION",
# "LOCALITY_PID",
# "ALIAS_PRINCIPAL",
"POSTCODE"
# "PRIVATE_STREET",
# "LEGAL_PARCEL_ID",
# "CONFIDENCE",
# "ADDRESS_SITE_PID",
# "LEVEL_GEOCODED_CODE",
# "PROPERTY_PID",
# "GNAF_PROPERTY_PID",
# "PRIMARY_SECONDARY"
)) %>%
rbindlist(use.names = TRUE, fill = TRUE) %>%
setkeyv("ADDRESS_DETAIL_PID")
if (progress) {
cat("STREET_PID_vs_ADDRESS_PID created.\n")
}
STREET_LOCALITY_PID <- STREET_NAME <- STREET_TYPE_CODE <- NULL
STREET_LOCALITY_PID__STREET_NAME_STREET_TYPE_CODE <-
dir(pattern = "_STREET_LOCALITY_psv.psv$",
path = LATEST,
recursive = TRUE,
full.names = TRUE) %>%
lapply(fread,
showProgress = FALSE,
na.strings = "",
select = c("STREET_LOCALITY_PID",
"STREET_NAME",
"STREET_TYPE_CODE")) %>%
rbindlist(use.names = TRUE, fill = TRUE) %>%
# Some unnamed streets
.[]
if (progress) {
cat("STREET_LOCALITY_PID__STREET_NAME_STREET_TYPE_CODE created.\n")
}
ADDRESS_DETAIL_INTRNL_ID <- NULL
# Reduce the size of lookup tables by converting
# character columns to ints
ADDRESS_DETAIL_PID_by_ID <-
ADDRESS_DETAIL_PID__by__LATLON %>%
.[, list(ADDRESS_DETAIL_INTRNL_ID = .I,
ADDRESS_DETAIL_PID)]
ADDRESS_DETAIL_ID__by__LATLON <-
ADDRESS_DETAIL_PID__by__LATLON[ADDRESS_DETAIL_PID_by_ID,
j = list(ADDRESS_DETAIL_INTRNL_ID,
LATITUDE,
LONGITUDE),
on = "ADDRESS_DETAIL_PID"]
STREET_PID_vs_ADDRESS_ID <-
STREET_PID_vs_ADDRESS_PID[ADDRESS_DETAIL_PID_by_ID,
on = "ADDRESS_DETAIL_PID"] %>%
.[, "ADDRESS_DETAIL_PID" := NULL] %>%
set_cols_first("ADDRESS_DETAIL_INTRNL_ID") %>%
setkeyv("ADDRESS_DETAIL_INTRNL_ID") %>%
.[]
STREET_LOCALITY_INTRNL_ID <- NULL
STREET_ID_vs_STREET_PID <-
STREET_LOCALITY_PID__STREET_NAME_STREET_TYPE_CODE %>%
.[, list(STREET_LOCALITY_INTRNL_ID = .I,
STREET_LOCALITY_PID)] %>%
setkeyv("STREET_LOCALITY_INTRNL_ID")
STREET_ID_vs_ADDRESS_ID <-
STREET_ID_vs_STREET_PID[STREET_PID_vs_ADDRESS_ID, on = "STREET_LOCALITY_PID"] %>%
.[, "STREET_LOCALITY_PID" := NULL] %>%
set_cols_first("ADDRESS_DETAIL_INTRNL_ID") %>%
setkeyv("ADDRESS_DETAIL_INTRNL_ID") %>%
.[]
STREET_LOCALITY_ID__STREET_NAME_STREET_TYPE_CODE <-
STREET_ID_vs_STREET_PID[STREET_LOCALITY_PID__STREET_NAME_STREET_TYPE_CODE, on = "STREET_LOCALITY_PID"] %>%
.[, "STREET_LOCALITY_PID" := NULL] %>%
set_cols_first("STREET_LOCALITY_INTRNL_ID") %>%
setkeyv("STREET_LOCALITY_INTRNL_ID") %>%
.[]
# 2big4Github
#
# devtools::use_data(ADDRESS_DETAIL_ID__by__LATLON,
# STREET_ID_vs_ADDRESS_ID,
# STREET_LOCALITY_ID__STREET_NAME_STREET_TYPE_CODE,
# street_type_decoder,
# internal = TRUE,
# overwrite = TRUE)
if (dir.exists("tsv")) {
fwrite(ADDRESS_DETAIL_ID__by__LATLON, "tsv/ADDRESS_DETAIL_ID__by__LATLON.tsv", sep = "\t")
fwrite(STREET_ID_vs_ADDRESS_ID, "tsv/STREET_ID_vs_ADDRESS_ID.tsv", sep = "\t")
fwrite(STREET_LOCALITY_ID__STREET_NAME_STREET_TYPE_CODE, "tsv/STREET_LOCALITY_ID__STREET_NAME_STREET_TYPE_CODE.tsv", sep = "\t")
if (just_tsv) {
return(invisible(NULL))
}
}
write_dat_fst <- function(x) {
bd <- "./inst/extdata/"
if (dir.exists("./inst/extdata/")) {
b.fst <- paste0("inst/extdata/", deparse(substitute(x)), ".fst")
} else {
b.fst <- paste0(deparse(substitute(x)), ".fst")
b.fst <- system.file("extdata", b.fst, package = "PSMA")
}
fst::write_fst(x, b.fst, compress = 100)
}
write_dat_fst2 <- function(x) {
stopifnot(is.character(x), length(x) == 1)
if (dir.exists("./inst/extdata/")) {
b.fst <- paste0("inst/extdata/", x, ".fst")
} else {
b.fst <- paste0(x, ".fst")
b.fst <- system.file("extdata", b.fst, package = "PSMA")
}
fst::write_fst(get(x), b.fst, compress = 100)
}
lat_int <- lat_rem <- lon_int <- lon_rem <- NULL
address2 <-
ADDRESS_DETAIL_ID__by__LATLON %>%
.[, .(ADDRESS_DETAIL_INTRNL_ID, LATITUDE, LONGITUDE,
lat_int = as.integer(LATITUDE),
lat_rem = as.integer(10^7 * (LATITUDE - as.integer(LATITUDE))),
lon_int = as.integer(LONGITUDE),
lon_rem = as.integer(10^7 * (LONGITUDE - as.integer(LONGITUDE))))] %>%
setkeyv("ADDRESS_DETAIL_INTRNL_ID")
# Breaks 13
addressB13 <-
ADDRESS_DETAIL_ID__by__LATLON %>%
unique(by = c("LATITUDE", "LONGITUDE"))
lon_range <- addressB13[, minmax(LONGITUDE)]
lat_range <- addressB13[, minmax(LATITUDE)]
xbreaks13 <- ybreaks13 <- NULL
..cut_DT(addressB13,
depth = 13L,
x_range = lon_range,
y_range = lat_range)
setkeyv(addressB13, c("xbreaks13", "ybreaks13"))
stopifnot(addressB13[, last(xbreaks13)] == 8192L)
the_8192_seq_lon <- seq(lon_range[1], lon_range[2], length.out = 8092L)
all_min_lat <-
all_max_lat <-
all_min_lon <-
all_max_lon <-
lat_min <-
lat_max <-
lon_min <-
lon_max <-
d_lon_s <-
d_lon_n <-
d_lat_e <-
d_lat_w <- NULL
addressB13_ranges <-
addressB13[, .(lat_min = min(LATITUDE),
lat_max = max(LATITUDE),
lon_min = min(LONGITUDE),
lon_max = max(LONGITUDE)),
keyby = .(xbreaks13, ybreaks13)] %>%
.[, all_min_lat := min(lat_min), keyby = "ybreaks13"] %>%
.[, all_max_lat := max(lat_max), keyby = "ybreaks13"] %>%
.[, all_min_lon:= min(lon_min), keyby = "xbreaks13"] %>%
.[, all_max_lon:= max(lon_max), keyby = "xbreaks13"] %>%
.[, d_lon_s := all_min_lon - shift(all_max_lon)] %>%
.[, d_lon_n := shift(all_min_lon, type = "lead", fill = 180) - all_max_lon] %>%
setkey(ybreaks13) %>%
.[, d_lat_e := all_min_lat - shift(all_max_lat, fill = 0)] %>%
.[, d_lat_w := shift(all_min_lat, type = "lead", fill = 90) - all_max_lat] %>%
.[]
# Need to break up to avoid GitHub file size limits
# Australia is skewed...
median_xbreaks13 <- copy(median_xbreaks13)
addressB13_west <- addressB13[.(1:median_xbreaks13), on = "xbreaks13"]
addressB13_east <- addressB13[.(median_xbreaks13:8192), on = "xbreaks13"] # overlaps
for (i in 6:12) {
if (progress) {
cat("Assigning `addressB", i, "`\r", sep = "")
}
assign(paste0("addressB", i),
value = {
x <-
copy(address2) %>%
.[, .(ADDRESS_DETAIL_INTRNL_ID, LATITUDE, LONGITUDE)] %>%
unique(by = c("LATITUDE", "LONGITUDE"))
..cut_DT(x,
depth = i,
x_range = lon_range,
y_range = lat_range)
setkeyv(x, paste0(c("xbreaks", "ybreaks"), i))
the_L <- L1_20[[i]]
col_xbreak <- paste0("xbreaks", i)
col_ybreak <- paste0("ybreaks", i)
centre <-
x[the_L, on = key(x), nomatch=0L]
})
}
if (progress) cat("\n")
for (addressBs in paste0("addressB", 6:12)) {
if (progress) {
cat("Writing `addressB", i, "`\r", sep = "")
}
write_dat_fst2(addressBs)
}
address2[, c("LATITUDE", "LONGITUDE") := NULL]
write_dat_fst(address2)
write_dat_fst(addressB13_west)
write_dat_fst(addressB13_east)
write_dat_fst(STREET_ID_vs_ADDRESS_ID)
write_dat_fst(STREET_LOCALITY_ID__STREET_NAME_STREET_TYPE_CODE)
find_data_with_noms <- function(noms, just_act = TRUE) {
all_files.psv <- dir(LATEST,
pattern = "\\.psv$",
full.names = TRUE,
recursive = TRUE)
have_noms <-
sapply(all_files.psv,
function(file.psv) {
if (just_act && !grepl("ACT", file.psv)) {
return(FALSE)
}
all(noms %chin% names(fread(file = file.psv, nrows = 0, sep = "|")))
})
all_files.psv[have_noms]
}
rbindfiles <- function(files.psv, ste = TRUE,
showProgress = FALSE,
sep = "|",
na.strings = c("", "NA"),
select = NULL) {
L <- lapply(files.psv,
fread,
showProgress = showProgress,
sep = "|",
na.strings = na.strings,
select = select)
names(L) <- hutils::trim_common_affixes(files.psv)
if (ste) {
rbindlist(L, use.names = TRUE, idcol = "STE")
} else {
rbindlist(L, use.names = TRUE)
}
}
LOCALITY_vs_LOCALITY_PID <-
find_data_with_noms(c("LOCALITY_PID", "NAME"), just_act = FALSE) %>%
rbindfiles
write_dat_fst(LOCALITY_vs_LOCALITY_PID)
LOCALITY_VS_POSTCODE <-
dir(LATEST, pattern = "ADDRESS_DETAIL_psv", recursive = TRUE,
full.names = TRUE) %>%
lapply(function(file.psv) {
fread(file = file.psv,
showProgress = FALSE,
sep = "|",
na.strings = c("", "NA"),
select = c("ADDRESS_DETAIL_PID",
"LOCALITY_PID", "POSTCODE"))
}) %>%
setNames(trim_common_affixes(
dir(LATEST, pattern = "ADDRESS_DETAIL_psv", recursive = TRUE,
full.names = TRUE))) %>%
rbindlist(use.names = TRUE,
idcol = "STE") %>%
.[, .N, keyby = c("STE", "LOCALITY_PID", "POSTCODE")]
write_dat_fst(LOCALITY_VS_POSTCODE)
STREET_NAMES_BY_ID <-
dir(LATEST, pattern = "STREET_LOCALITY_psv\\.psv$", recursive = TRUE,
full.names = TRUE) %>%
lapply(function(file.psv) {
fread(file = file.psv,
showProgress = FALSE,
sep = "|",
na.strings = c("", "NA"),
select = c("STREET_LOCALITY_PID",
"LOCALITY_PID",
"STREET_NAME",
"STREET_TYPE_CODE"))
}) %>%
rbindlist(use.names = TRUE, fill = TRUE)
STREET_BY_POSTCODE <-
STREET_NAMES_BY_ID[LOCALITY_VS_POSTCODE, on = "LOCALITY_PID", nomatch = 0L] %>%
.[, .(STREET_NAME, STREET_TYPE_CODE, POSTCODE)] %>%
unique %>%
setkey(POSTCODE, STREET_NAME) %>%
.[]
write_dat_fst(STREET_BY_POSTCODE)
}
# from hutilscpp
..cut_DT <- function (DT, depth = 1L, x_range = NULL, y_range = NULL) {
if (anyNA(match(c("LATITUDE", "LONGITUDE"), names(DT), nomatch = NA_integer_))) {
stop("`DT` lacked columns 'LATITUDE' and 'LONGITUDE'.")
}
LONGITUDE <- LATITUDE <- NULL
if (is.null(x_range)) {
x_range <- minmax(.subset2(DT, "LONGITUDE"))
}
if (is.null(y_range)) {
y_range <- minmax(.subset2(DT, "LATITUDE"))
}
DT[, `:=`("xbreaks", .bincode(LONGITUDE,
include.lowest = TRUE,
breaks = seq.int(from = x_range[1],
to = x_range[2],
length.out = 2^depth + 1)))]
setnames(DT, "xbreaks", paste0("xbreaks", depth))
DT[, `:=`("ybreaks", .bincode(LATITUDE,
include.lowest = TRUE,
breaks = seq.int(from = y_range[1],
to = y_range[2],
length.out = 2^depth + 1)))]
setnames(DT, "ybreaks", paste0("ybreaks", depth))
DT[]
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.