In folkehelseinstituttet/fhidata: Structural Data for Norway

# Chunk defaults for this vignette: `collapse = TRUE` and output lines
# prefixed with "#>"
knitr::opts_chunk$set(collapse = TRUE, comment = "#>")

library(fhidata)
library(data.table)

Checklists

Note that fhidata has RStudio addins that help you create skeletons!

Secure zone

Aggregate the data to multiple geographical granularities

Ordinary zone

Create a variable (possibly a list) to hold the data
Clean the data
Replace NAs as appropriate for MSIS location numbers
Convert MSIS location numbers to location_code's using fhidata::norway_locations_msis_to_fhidata (you may also do redistricting now)
Re-aggregate your data to different geographical levels to ensure that duplicates have now been removed
Pull out important dates
Create multiskeleton
Merge in the information you have at different geographical granularities
Aggregate up to higher geographical granularities
(If desirable) aggregate up to higher time granularities

Simple example: Only county data is given

This is a simple example that introduces the concept of skeletons.

# setting up the data
# Fix the RNG seed so that the simulated case counts (and therefore every
# table printed further down) are identical each time the vignette is built.
set.seed(4)
data <- expand.grid(
  date = seq.Date(as.Date("2020-01-01"), as.Date("2020-01-20"), by = 1),
  # county 3 appears twice, so the grid contains duplicated date/county rows
  # (see the re-aggregation in step 5); NA stands for a missing county
  location_msis_county = c(3, 3, 50, 52, NA),
  stringsAsFactors = FALSE
)
# convert the data.frame to a data.table by reference
setDT(data)
# simulated case counts: one Poisson(5) draw per row
data[, cases_n := rpois(.N, 5)]
data[]

# Steps 1-2. Create a variable to hold the data and clean it
# (work on a copy, so that `data` itself is left untouched)
d_agg <- copy(data)

# Step 3. Replace NAs as appropriate for MSIS location numbers
# (a missing county is recoded to 99)
rows_missing_county <- which(is.na(d_agg[["location_msis_county"]]))
set(d_agg, i = rows_missing_county, j = "location_msis_county", value = 99)

# Step 4. Convert MSIS location numbers to location_code's using
# `fhidata::norway_locations_msis_to_fhidata`
# (redistricting/kommunesammenslaaing could also be done here with
# fhidata::norway_locations_redistricting())
d_agg[
  ,
  `:=`(
    location_code = fhidata::norway_locations_msis_to_fhidata(location_msis_county)
  )
]

# Step 5. Re-aggregate the data so that each date/location_code combination
# occurs exactly once (duplicates are summed)
d_agg <- d_agg[, .(cases_n = sum(cases_n)), keyby = .(date, location_code)]

# 6. Pull out important dates
# (first and last date present in the aggregated data)
date_min <- d_agg[, min(date)]
date_max <- d_agg[, max(date)]

# 7. Create `multiskeleton`
# granularity_geo should have the following groups:
# - nodata (when no data is available, and there is no 'finer' data available to aggregate up)
# - all levels of granularity_geo where you have data available
# If you do not have data for a specific granularity_geo, but there is 'finer' data available
# then you should not include this granularity_geo in the multiskeleton, because you will create
# it later when you aggregate up your data (nation)

# levels for which this example has no data at all
geo_nodata <- c(
  "wardoslo", "extrawardoslo", "missingwardoslo",
  "wardbergen", "missingwardbergen",
  "wardstavanger", "missingwardstavanger",
  "baregion",
  "municip", "notmainlandmunicip", "missingmunicip"
)
# levels covered by the county data
geo_county <- c("county", "notmainlandcounty", "missingcounty")

multiskeleton_day <- fhidata::make_skeleton(
  date_min = date_min,
  date_max = date_max,
  granularity_geo = list(nodata = geo_nodata, county = geo_county)
)

# 8. Merge in the information you have at different geographical granularities
# county level
# Update join: for every skeleton row that matches `d_agg` on location_code
# and date, copy the case count across.
# The `i.` prefix states explicitly that the value comes from `d_agg` (the
# table in `i`). A bare `cases_n` resolves to the skeleton's own column as soon
# as that column exists, so re-running this chunk would silently keep the old
# values instead of merging in the new ones.
multiskeleton_day$county[
  d_agg,
  on = c("location_code", "date"),
  cases_n := i.cases_n
]
# skeleton rows without a match in `d_agg` had no reported cases
multiskeleton_day$county[is.na(cases_n), cases_n := 0]

# 9. Aggregate up to higher geographical granularities
# The national series is the sum over every row of the county-level skeleton,
# computed per granularity_time and date
multiskeleton_day[["nation"]] <- multiskeleton_day[["county"]][
  ,
  list(
    cases_n = sum(cases_n),
    granularity_geo = "nation",
    location_code = "norge"
  ),
  keyby = c("granularity_time", "date")
]

# 10. (If desirable) aggregate up to higher time granularities
# No time aggregation is done in this simple example: the per-granularity
# tables are only stacked into one data.table (`fill = TRUE` pads columns
# that are missing from some of the tables with NA)
skeleton_day <- rbindlist(l = multiskeleton_day, fill = TRUE)

Realistic example: Person-level data is given, multiple levels of data

This is a more realistic example, where person-level data is available at multiple geographical levels (nation, county, notmainlandcounty, missingcounty, municip, notmainlandmunicip, missingmunicip). The data must first be aggregated inside the secure zone, and is then distributed further for processing and the creation of a skeleton.

# ---------------
# setting up the data that would be seen from inside the "secure zone"
sim_dates <- seq.Date(as.Date("2020-01-01"), as.Date("2020-01-20"), by = 1)
# municipality 5001 and NA are each listed twice, giving repeated rows
sim_municips <- c(301, 1106, 5001, 5001, 2200, NA, NA)
data <- expand.grid(
  date = sim_dates,
  location_msis_municip = sim_municips,
  variable = 1:5, # to get some extra rows in the data
  stringsAsFactors = FALSE
)
setDT(data)

# the county number is the municipality number without its last two digits
data[, location_msis_county := floor(location_msis_municip / 100)]
# some of the rows with an unknown municipality still have a known county (3)
data[is.na(location_msis_municip) & variable %in% 1:3, location_msis_county := 3]
# `variable` was only needed to multiply the rows
data[, variable := NULL]

data[]

# ---------------
# this code occurs inside the secure zone
# we need to aggregate it down to make it less sensitive
# and then distribute it as multiple data sets
d_person <- copy(data)

# one table of daily case counts (number of person-level rows) per level
d_agg <- list(
  # this includes granularity_geo = municip, notmainlandmunicip, missingmunicip
  day_municip = d_person[
    ,
    .(cases_n = .N),
    keyby = .(location_msis_municip, date)
  ],
  # this includes granularity_geo = county, notmainlandcounty, missingcounty
  day_county = d_person[
    ,
    .(cases_n = .N),
    keyby = .(location_msis_county, date)
  ],
  # this includes granularity_geo = nation
  # (the nation is given the MSIS location number 0)
  day_nation = d_person[
    ,
    .(cases_n = .N, location_msis_nation = 0),
    keyby = .(date)
  ]
)

d_agg[]

# ---------------
# this code occurs in the ordinary zone

# the data is loaded in (probably from a .rds file)
data <- copy(d_agg)

# 1. Create a variable (possibly a list) to hold the data
# Each table is copied, so the cleaning done by reference further down does
# not touch the tables stored in `data`
level_names <- c("day_municip", "day_county", "day_nation")
d_agg <- lapply(data[level_names], copy)

# 2. Clean the data

# fixing location codes
# 3. Replace NAs as appropriate for MSIS location numbers
# (missing municipality -> 9999, missing county -> 99)
d_agg[["day_municip"]][is.na(location_msis_municip), location_msis_municip := 9999]
d_agg[["day_county"]][is.na(location_msis_county), location_msis_county := 99]

# 4. Convert MSIS location numbers to location_code's using `fhidata::norway_locations_msis_to_fhidata`
# you may also do redistricting (kommunesammenslaaing) at this point (fhidata::norway_locations_redistricting())
# name of the column holding the MSIS number in each table
msis_column <- c(
  day_municip = "location_msis_municip",
  day_county = "location_msis_county",
  day_nation = "location_msis_nation"
)
for (level in names(msis_column)) {
  d_agg[[level]][
    ,
    location_code := fhidata::norway_locations_msis_to_fhidata(get(msis_column[[level]]))
  ]
}

# redistricting, followed by
# 5. Re-aggregate your data to different geographical levels to ensure that duplicates have now been removed
# this will also fix the redistricting/kommunesammenslaaing issues
d_agg <- lapply(d_agg, function(d_level) {
  # the redistricting table is looked up per calendar year
  d_level[, calyear := lubridate::year(date)]

  # attach the current location code and its weighting to every row
  # (left join: all rows of the data are kept)
  d_redistricted <- merge(
    d_level,
    fhidata::norway_locations_redistricting()[, -"granularity_geo"],
    by.x = c("location_code", "calyear"),
    by.y = c("location_code_original", "calyear"),
    all.x = TRUE
  )

  # weighted case counts summed per current location and date, then rounded
  d_redistricted[
    ,
    .(cases_n = round(sum(cases_n * weighting))),
    keyby = .(location_code = location_code_current, date)
  ]
})

d_agg[]

# 6. Pull out important dates
# (first and last date present in the national table)
date_min <- d_agg$day_nation[, min(date)]
date_max <- d_agg$day_nation[, max(date)]

# 7. Create `multiskeleton`
# granularity_geo should have the following groups:
# - nodata (when no data is available, and there is no 'finer' data available to aggregate up)
# - all levels of granularity_geo where you have data available
# If you do not have data for a specific granularity_geo, but there is 'finer' data available
# then you should not include this granularity_geo in the multiskeleton, because you will create
# it later when you aggregate up your data (baregion)

# one entry per group of the skeleton; "nodata" holds the levels without data
granularity_groups <- list(
  nodata = c(
    "wardoslo", "extrawardoslo", "missingwardoslo",
    "wardbergen", "missingwardbergen",
    "wardstavanger", "missingwardstavanger"
  ),
  municip = c("municip", "notmainlandmunicip", "missingmunicip"),
  county = c("county", "notmainlandcounty", "missingcounty"),
  nation = c("nation")
)

multiskeleton_day <- fhidata::make_skeleton(
  date_min = date_min,
  date_max = date_max,
  granularity_geo = granularity_groups
)

# 8. Merge in the information you have at different geographical granularities
# one level at a time
# Each merge is an update join: for every skeleton row that matches the
# aggregated data on location_code and date, the case count is copied across.
# The `i.` prefix states explicitly that the value comes from the aggregated
# table (the table in `i`). A bare `cases_n` resolves to the skeleton's own
# column as soon as that column exists, so re-running a merge would silently
# keep the old values instead of merging in the new ones.
# Skeleton rows without a match are then set to 0 cases.

# municip
multiskeleton_day$municip[
  d_agg$day_municip,
  on = c("location_code", "date"),
  cases_n := i.cases_n
]
multiskeleton_day$municip[is.na(cases_n), cases_n := 0]

multiskeleton_day$municip[]

# county
multiskeleton_day$county[
  d_agg$day_county,
  on = c("location_code", "date"),
  cases_n := i.cases_n
]
multiskeleton_day$county[is.na(cases_n), cases_n := 0]

multiskeleton_day$county[]

# nation
multiskeleton_day$nation[
  d_agg$day_nation,
  on = c("location_code", "date"),
  cases_n := i.cases_n
]
multiskeleton_day$nation[is.na(cases_n), cases_n := 0]

multiskeleton_day$nation[]

# 9. Aggregate up to higher geographical granularities
# lookup table from municip codes (from_code) to baregion codes (to_code)
municip_to_baregion <- fhidata::norway_locations_hierarchy(
  from = "municip",
  to = "baregion"
)

# attach the municip-level skeleton rows to each row of the lookup table
municip_with_baregion <- multiskeleton_day$municip[
  municip_to_baregion,
  on = "location_code==from_code"
]

# sum the municip counts within each baregion, per granularity_time and date
multiskeleton_day[["baregion"]] <- municip_with_baregion[
  ,
  .(cases_n = sum(cases_n), granularity_geo = "baregion"),
  by = .(granularity_time, date, location_code = to_code)
]

multiskeleton_day$baregion[]

# combine all the different granularity_geo's
# (`fill = TRUE` pads columns that are missing from some of the tables with NA)
skeleton_day <- rbindlist(l = multiskeleton_day, fill = TRUE)

skeleton_day[]

# 10. (If desirable) aggregate up to higher time granularities
# if necessary, it is now easy to aggregate up to weekly data from here
skeleton_week <- copy(skeleton_day)
# it is better to use fhi::isoyearweek_c function
# but we cannot use this function inside a 'fhidata' vignette due to circular dependency issues
# The ISO week is zero-padded to two digits ("2020-01", not "2020-1") so that
# the labels have a fixed width and sort chronologically as character strings;
# without the padding "2020-10" would be ordered before "2020-2" by `keyby`.
skeleton_week[
  ,
  isoyearweek := sprintf(
    "%d-%02d",
    as.integer(lubridate::isoyear(date)),
    as.integer(lubridate::isoweek(date))
  )
]
# sum the daily counts within each ISO week, per granularity_geo and location
skeleton_week <- skeleton_week[
  ,
  .(
    cases_n = sum(cases_n),
    granularity_time = "week"
  ),
  keyby = .(
    isoyearweek,
    granularity_geo,
    location_code
  )
]

skeleton_week[]