```r
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)
```

```r
library(fhidata)
library(data.table)
```
Note that fhidata has RStudio addins that help you create skeletons!
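Before the worked examples, it may help to see the central function in isolation. The sketch below is not taken from the examples; it follows the same pattern they use, in which every `granularity_geo` is assigned to a group, and assumes data is only expected at the national level.

```r
# a minimal sketch: a day skeleton for January 2020 where data is only
# expected at the national level, so every other granularity_geo is
# assigned to the "nodata" group (mirroring the examples below)
skeleton <- fhidata::make_skeleton(
  date_min = as.Date("2020-01-01"),
  date_max = as.Date("2020-01-31"),
  granularity_geo = list(
    "nodata" = c(
      "wardoslo", "extrawardoslo", "missingwardoslo",
      "wardbergen", "missingwardbergen",
      "wardstavanger", "missingwardstavanger",
      "baregion",
      "municip", "notmainlandmunicip", "missingmunicip",
      "county", "notmainlandcounty", "missingcounty"
    ),
    "nation" = c("nation")
  )
)
skeleton$nation[]
```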
*(Workflow figure: clean the data, convert MSIS location numbers with `fhidata::norway_locations_msis_to_fhidata` (you may also do redistricting at this point), then create the multiskeleton.)*
This is a simple example to introduce you to the concept of skeletons.
```r
# setting up the data
data <- expand.grid(
  date = seq.Date(as.Date("2020-01-01"), as.Date("2020-01-20"), by = 1),
  location_msis_county = c(3, 3, 50, 52, NA), # duplicates and an NA on purpose
  stringsAsFactors = FALSE
)
setDT(data)
data[, cases_n := rpois(.N, 5)]
data[]

# 1. Create a variable (possibly a list) to hold the data
# 2. Clean the data
d_agg <- copy(data)

# 3. Replace NAs as appropriate for MSIS location numbers
d_agg[is.na(location_msis_county), location_msis_county := 99]

# 4. Convert MSIS location numbers to location_codes using
# `fhidata::norway_locations_msis_to_fhidata`
# you may also do redistricting (kommunesammenslaaing) at this point
# (fhidata::norway_locations_redistricting())
d_agg[, location_code := fhidata::norway_locations_msis_to_fhidata(location_msis_county)]

# 5. Re-aggregate your data to different geographical levels
# to ensure that duplicates have now been removed
d_agg <- d_agg[
  ,
  .(
    cases_n = sum(cases_n)
  ),
  keyby = .(
    date,
    location_code
  )
]

# 6. Pull out important dates
date_min <- min(d_agg$date)
date_max <- max(d_agg$date)

# 7. Create `multiskeleton`
# granularity_geo should have the following groups:
# - nodata (when no data is available, and there is no 'finer' data available to aggregate up)
# - all levels of granularity_geo where you have data available
# If you do not have data for a specific granularity_geo, but 'finer' data is available,
# then you should not include this granularity_geo in the multiskeleton, because you will
# create it later when you aggregate up your data (here: nation)
multiskeleton_day <- fhidata::make_skeleton(
  date_min = date_min,
  date_max = date_max,
  granularity_geo = list(
    "nodata" = c(
      "wardoslo", "extrawardoslo", "missingwardoslo",
      "wardbergen", "missingwardbergen",
      "wardstavanger", "missingwardstavanger",
      "baregion",
      "municip", "notmainlandmunicip", "missingmunicip"
    ),
    "county" = c(
      "county", "notmainlandcounty", "missingcounty"
    )
  )
)

# 8. Merge in the information you have at different geographical granularities
# county level
multiskeleton_day$county[
  d_agg,
  on = c("location_code", "date"),
  cases_n := cases_n
]
multiskeleton_day$county[is.na(cases_n), cases_n := 0]

# 9. Aggregate up to higher geographical granularities
multiskeleton_day$nation <- multiskeleton_day$county[
  ,
  .(
    cases_n = sum(cases_n),
    granularity_geo = "nation",
    location_code = "norge"
  ),
  keyby = .(
    granularity_time,
    date
  )
]

# combine all the different granularity_geo's
skeleton_day <- rbindlist(multiskeleton_day, fill = TRUE)

# 10. (If desirable) aggregate up to higher time granularities
# (weekly aggregation is shown in the realistic example below)
```
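Before merging real data onto a finished skeleton, it can be worth asserting its defining property: exactly one row per `location_code`/`date` combination, covering every day between `date_min` and `date_max`. The checks below are an illustrative sketch, not part of the numbered steps above; they assume `skeleton_day`, `date_min`, and `date_max` are still in scope.

```r
# illustrative sanity checks (not part of the numbered steps above):
# a day skeleton should have exactly one row per location_code/date pair
stopifnot(nrow(skeleton_day[, .N, by = .(location_code, date)][N > 1]) == 0)

# and it should cover every single day between date_min and date_max
stopifnot(uniqueN(skeleton_day$date) == as.integer(date_max - date_min) + 1)
```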
This is a more realistic example, where person-level data is provided at multiple geographical levels (nation, county, notmainlandcounty, missingcounty, municip, notmainlandmunicip, missingmunicip). The data must be aggregated inside the secure zone and then distributed onwards for processing and the creation of a skeleton.
```r
# ---------------
# setting up the data that would be seen from inside the "secure zone"
data <- expand.grid(
  date = seq.Date(as.Date("2020-01-01"), as.Date("2020-01-20"), by = 1),
  location_msis_municip = c(301, 1106, 5001, 5001, 2200, NA, NA),
  variable = c(1:5), # to get some extra rows in the data
  stringsAsFactors = FALSE
)
setDT(data)
data[, location_msis_county := floor(location_msis_municip / 100)]
data[variable %in% 1:3 & is.na(location_msis_municip), location_msis_county := 3]
data[, variable := NULL]
data[]

# ---------------
# this code occurs inside the secure zone
# we need to aggregate it down to make it less sensitive
# and then distribute it as multiple data sets
d_person <- copy(data)

d_agg <- list()

# this includes granularity_geo = municip, notmainlandmunicip, missingmunicip
d_agg$day_municip <- d_person[, .(
  cases_n = .N
), keyby = .(
  location_msis_municip,
  date
)]

# this includes granularity_geo = county, notmainlandcounty, missingcounty
d_agg$day_county <- d_person[, .(
  cases_n = .N
), keyby = .(
  location_msis_county,
  date
)]

# this includes granularity_geo = nation
d_agg$day_nation <- d_person[, .(
  cases_n = .N,
  location_msis_nation = 0
), keyby = .(
  date
)]

d_agg[]

# ---------------
# this code occurs in the ordinary zone
# the data is loaded in (probably from a .rds file)
data <- copy(d_agg)

# 1. Create a variable (possibly a list) to hold the data
d_agg <- list()
d_agg$day_municip <- copy(data$day_municip)
d_agg$day_county <- copy(data$day_county)
d_agg$day_nation <- copy(data$day_nation)

# 2. Clean the data
# 3. Replace NAs as appropriate for MSIS location numbers
d_agg$day_municip[is.na(location_msis_municip), location_msis_municip := 9999]
d_agg$day_county[is.na(location_msis_county), location_msis_county := 99]

# 4. Convert MSIS location numbers to location_codes using
# `fhidata::norway_locations_msis_to_fhidata`
# you may also do redistricting (kommunesammenslaaing) at this point
# (fhidata::norway_locations_redistricting())
d_agg$day_municip[, location_code := fhidata::norway_locations_msis_to_fhidata(location_msis_municip)]
d_agg$day_county[, location_code := fhidata::norway_locations_msis_to_fhidata(location_msis_county)]
d_agg$day_nation[, location_code := fhidata::norway_locations_msis_to_fhidata(location_msis_nation)]

# redistricting
for (i in seq_along(d_agg)) {
  d_agg[[i]][, calyear := lubridate::year(date)]
  d_agg[[i]] <- merge(
    d_agg[[i]],
    fhidata::norway_locations_redistricting()[, -"granularity_geo"],
    by.x = c("location_code", "calyear"),
    by.y = c("location_code_original", "calyear"),
    all.x = TRUE
  )
}

# 5. Re-aggregate your data to different geographical levels
# to ensure that duplicates have now been removed
# this will also fix the redistricting/kommunesammenslaaing issues
for (i in seq_along(d_agg)) {
  d_agg[[i]] <- d_agg[[i]][, .(
    cases_n = round(sum(cases_n * weighting))
  ), keyby = .(
    location_code = location_code_current,
    date
  )]
}
d_agg[]

# 6. Pull out important dates
date_min <- min(d_agg$day_nation$date)
date_max <- max(d_agg$day_nation$date)

# 7. Create `multiskeleton`
# granularity_geo should have the following groups:
# - nodata (when no data is available, and there is no 'finer' data available to aggregate up)
# - all levels of granularity_geo where you have data available
# If you do not have data for a specific granularity_geo, but 'finer' data is available,
# then you should not include this granularity_geo in the multiskeleton, because you will
# create it later when you aggregate up your data (here: baregion)
multiskeleton_day <- fhidata::make_skeleton(
  date_min = date_min,
  date_max = date_max,
  granularity_geo = list(
    "nodata" = c(
      "wardoslo", "extrawardoslo", "missingwardoslo",
      "wardbergen", "missingwardbergen",
      "wardstavanger", "missingwardstavanger"
    ),
    "municip" = c(
      "municip", "notmainlandmunicip", "missingmunicip"
    ),
    "county" = c(
      "county", "notmainlandcounty", "missingcounty"
    ),
    "nation" = c(
      "nation"
    )
  )
)

# 8. Merge in the information you have at different geographical granularities
# one level at a time

# municip
multiskeleton_day$municip[
  d_agg$day_municip,
  on = c("location_code", "date"),
  cases_n := cases_n
]
multiskeleton_day$municip[is.na(cases_n), cases_n := 0]
multiskeleton_day$municip[]

# county
multiskeleton_day$county[
  d_agg$day_county,
  on = c("location_code", "date"),
  cases_n := cases_n
]
multiskeleton_day$county[is.na(cases_n), cases_n := 0]
multiskeleton_day$county[]

# nation
multiskeleton_day$nation[
  d_agg$day_nation,
  on = c("location_code", "date"),
  cases_n := cases_n
]
multiskeleton_day$nation[is.na(cases_n), cases_n := 0]
multiskeleton_day$nation[]

# 9. Aggregate up to higher geographical granularities
multiskeleton_day$baregion <- multiskeleton_day$municip[
  fhidata::norway_locations_hierarchy(
    from = "municip",
    to = "baregion"
  ),
  on = c(
    "location_code==from_code"
  )
][, .(
  cases_n = sum(cases_n),
  granularity_geo = "baregion"
), by = .(
  granularity_time,
  date,
  location_code = to_code
)]
multiskeleton_day$baregion[]

# combine all the different granularity_geo's
skeleton_day <- rbindlist(multiskeleton_day, fill = TRUE)
skeleton_day[]

# 10. (If desirable) aggregate up to higher time granularities
# if necessary, it is now easy to aggregate up to weekly data from here
skeleton_week <- copy(skeleton_day)

# it is better to use the fhi::isoyearweek_c function, but we cannot use it
# inside a 'fhidata' vignette due to circular dependency issues
# (the week number is zero-padded so that isoyearweek strings sort correctly)
skeleton_week[, isoyearweek := paste0(
  lubridate::isoyear(date), "-",
  formatC(lubridate::isoweek(date), width = 2, flag = "0")
)]
skeleton_week <- skeleton_week[
  ,
  .(
    cases_n = sum(cases_n),
    granularity_time = "week"
  ),
  keyby = .(
    isoyearweek,
    granularity_geo,
    location_code
  )
]
skeleton_week[]
```
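A payoff of the skeleton structure is that downstream calculations can safely assume a complete, gap-free date sequence for every location. As an illustration (not part of the vignette's numbered steps), the sketch below computes a trailing 7-day case count per location with `data.table::frollsum`; the column name `cases_n_last7days` is our own invention.

```r
# illustrative follow-up: because skeleton_day is gap-free, a trailing 7-day
# sum per location is simply a rolling window over consecutive rows
# (rows from the "nodata" granularities have cases_n = NA and stay NA)
setorder(skeleton_day, location_code, date)
skeleton_day[, cases_n_last7days := frollsum(cases_n, n = 7), by = .(location_code)]
skeleton_day[]
```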