# Chunk defaults: do not echo code into the rendered report ----
knitr::opts_chunk$set(echo = FALSE)

# Record the start time (used to report the run time) ----
startTime <- proc.time()

# Packages needed in this .Rmd file ----
rmdLibs <- c(
  "readxl", # reading xlsx
  "skimr"   # for skim
)
# Load them using the GREENGridData helper
GREENGridData::loadLibraries(rmdLibs)


# Local parameters

\newpage

About

Report circulation:

License


Citation

If you wish to use any of the material from this report please cite as:

This work is (c) `r lubridate::year(today())` the University of Southampton.

History


Support


\newpage

Introduction


The purpose of this report is to:

The resulting cleaned data has no identifying information such as names, addresses, email addresses, telephone numbers and is therefore safe to share across all partners.

The data contains a unique household id (linkID) which can be used to link it to the NZ GREEN Grid time use diaries and dwelling/appliance surveys.

Load data

Household master dataset

Table \@ref(tab:loadMasterData) shows the number of households in each area.

# Household master file: contains a list of all the household IDs and some
# basic meta information
# data type: safe
# (getHouseholdData cleans the data as it loads it)
hhMasterDT <- GREENGridData::getHouseholdData(ggrParams$gsHHMasterFile)
setkey(hhMasterDT, linkID)

# Households per sample location; useNA = "always" keeps a row for missing
# locations even when there are none
locationCounts <- table(Location = hhMasterDT[["Location"]], useNA = "always")
kableExtra::kable(locationCounts, caption = "Sample location (master file)") %>%
  kable_styling()

In total we have `r nrow(hhMasterDT)` households in two sample areas.

Appliance data

Household appliance ownership was recorded during recruitment using a detailed survey. However, this data is not readily available for all households at present. Table \@ref(tab:applianceTable) shows the number of households in each area for whom summary appliance data exists.

# Appliance summary: a separate data file that was derived from the
# household audits
# data type: safe but 'sample' is over-detailed, so that column is dropped
applianceSheet <- readxl::read_xlsx(ggrParams$ApplianceData)
hhAppliancesDT <- data.table::as.data.table(applianceSheet)
hhAppliancesDT$sample <- NULL # too detailed
hhAppliancesDT$source <- "Appliance audit"

# Join to the master file on linkID (intent: use the master file location)
setkey(hhAppliancesDT, linkID)
applianceByMaster <- hhAppliancesDT[hhMasterDT]
applianceCounts <- with(
  applianceByMaster,
  table(Location, hasApplianceSummary, useNA = "always")
)
kableExtra::kable(
  applianceCounts,
  caption = "Sample (appliance summary file, NA indicates no data)"
) %>%
  kable_styling()

The appliances recorded in this summary are shown in Table \@ref(tab:applianceNames). Note that some of this information is also recorded in the household survey data.

# Appliance types included via the audit: every column of the appliance data
# except the ID and flag columns (some of this overlaps with the EC survey)
xcols <- c("linkID", "hasApplianceSummary")
al <- names(hhAppliancesDT[, !xcols, with = FALSE])
kableExtra::kable(al, caption = "Appliances recorded") %>%
  kable_styling()

Household survey data

The Energy Cultures 2 survey [@ec2Survey2015] was used to collect data on household energy and transport attitudes/behaviours and usages. The long form of the survey was used for some households and the short form for others.

# Household attributes as produced by processHouseholdAttributes.R
# Do not do any re-coding here - put that in the above .R script
# data type: safe
attributesCsv <- readr::read_csv(ggrParams$hhAttributes)
hhAttributesDT <- data.table::as.data.table(attributesCsv)
hhAttributesDT$source <- "Household survey"

Table \@ref(tab:responseTable) shows the number of households in each region who responded to each survey.

# Number of households for each combination of location and survey flags
surveyCounts <- hhAttributesDT[
  ,
  list(`n Households` = .N),
  keyby = list(Location, hasLongSurvey, hasShortSurvey)
]
kableExtra::kable(
  surveyCounts,
  caption = "Survey responses (NA indicates no survey)"
) %>%
  kable_styling()

Final household attribute data description

Table \@ref(tab:skimTable) reports the final household attribute variables and their internal distributions. The question labels are found in Table \@ref(tab:loadEc2Labels) below.

# Descriptive summary of every household attribute variable
# (alternative: Hmisc::describe(hhAttributesDT))
skimDF <- skimr::skim(hhAttributesDT)
# https://github.com/ropensci/skimr/blob/master/README.md

skimCaption <- "Descriptive summary (all variables)"
if (utils::packageVersion("skimr") >= "2.0.0") {
  # NOTE(review): skimr 2.x is understood to no longer provide its own kable();
  # confirm against the skimr release notes. The skim result is a data frame,
  # so it is rendered with knitr::kable instead.
  knitr::kable(skimDF, caption = skimCaption)
} else {
  # skimr 1.x: keep using the kable method supplied by skimr itself
  skimr::kable(skimDF, caption = skimCaption)
}

Household attribute frequency tables

This section reports tables of the key household attributes by sample location. Note that NA usually means not known.

Main heat source

# Main heat source (Q20_coded) cross-tabulated against sample location;
# useNA = "always" keeps missing values visible as their own row/column
heatSourceTable <- with(
  hhAttributesDT,
  table(Q20_coded, Location, useNA = "always")
)

# Counts, with row and column totals added
kableExtra::kable(
  addmargins(heatSourceTable),
  caption = "Main heat source by location"
) %>%
  kable_styling()

# Column percentages (share of each location's households). The caption is
# marked "(%)" so that the two tables do not carry identical captions, and the
# proportions are scaled by 100 so the values are percentages as labelled.
kableExtra::kable(
  100 * prop.table(heatSourceTable, margin = 2),
  caption = "Main heat source by location (%)",
  digits = 1
) %>%
  kable_styling()

Majority of light bulbs

# Majority light bulb type (Q49_coded) cross-tabulated against sample
# location; useNA = "always" keeps missing values visible
lightBulbTable <- with(
  hhAttributesDT,
  table(Q49_coded, Location, useNA = "always")
)

# Counts, with row and column totals added
kableExtra::kable(
  addmargins(lightBulbTable),
  caption = "Majority of light bulbs by location"
) %>%
  kable_styling()

# Column percentages: prop.table() returns proportions in [0, 1], so scale by
# 100 to match the "(%)" in the caption
kableExtra::kable(
  100 * prop.table(lightBulbTable, margin = 2),
  caption = "Majority of light bulbs by location (%)",
  digits = 1
) %>%
  kable_styling()

Number of adults

# Households by number of adults and sample location (missing values shown)
adultCounts <- with(
  hhAttributesDT,
  table(nAdults, Location, useNA = "always")
)
kableExtra::kable(
  adultCounts,
  caption = "Number of adults in household by location"
) %>%
  kable_styling()

Number of teenagers

# Households by number of teenagers and sample location (missing values shown)
teenagerCounts <- with(
  hhAttributesDT,
  table(nTeenagers13_18, Location, useNA = "always")
)
kableExtra::kable(
  teenagerCounts,
  caption = "Number of teenagers in household by location"
) %>%
  kable_styling()

Number of children

# Households by number of children and sample location (missing values shown)
childCounts <- with(
  hhAttributesDT,
  table(nChildren0_12, Location, useNA = "always")
)
kableExtra::kable(
  childCounts,
  caption = "Number of children in household by location"
) %>%
  kable_styling()

Notes variable

The notes column may have been set for any number of reasons and means the monitoring data for a given household should be used with caution.

# List only the households that have a note recorded
householdNotes <- hhAttributesDT[
  !is.na(notes),
  list(linkID, notes, Location)
]
kableExtra::kable(householdNotes, caption = "Notes by location") %>%
  kable_styling()

Summary

The following table (\@ref(tab:dataHeader)) shows the key columns of the household attributes file. The data can be linked to the GridSpy data using linkID. linkID is also used to flag the two re-used GridSpy units. As an example, unit rf_15 was re-used in a different household. We have therefore created linkID so that the correct household data (rf_15a or rf_15b) can be linked to the GridSpy data (coded rf_15) at the correct date.

Note also that the GridSpy power demand data includes households for which no household data exists (e.g. rf_01 & rf_02).

# Key columns of the household attributes data, shown as a preview table
dt <- hhAttributesDT[, c("hhID", "linkID", "Location", "nAdults", "nChildren0_12", "r_stopDate",
                                "Electric heater", "Heat pump number", "Other Appliance", "PV Inverter", "Energy Storage","Other Generation Device",
                                "notes")]
# clean up table: show missing values as "." - but only in string columns, as
# the caption states. A "." cannot be stored in a numeric or date column, so
# those columns are left untouched rather than risking an error or coercion.
stringCols <- names(dt)[vapply(dt, is.character, logical(1))]
for (j in stringCols) {
  set(dt, i = which(is.na(dt[[j]])), j = j, value = ".")
}
kableExtra::kable(caption = "Household data: key columns (NA set to '.' in string variables for clarity)", 
             dt[order(hhID)]) %>%
  kable_styling()

Runtime

# Wall-clock seconds since startTime was recorded
# (the third element of a proc.time() difference is "elapsed")
elapsed <- (proc.time() - startTime)[["elapsed"]]

Analysis completed in `r round(elapsed,2)` seconds (`r round(elapsed/60,2)` minutes) using knitr in RStudio with `r R.version.string` running on `r R.version$platform`.

R environment

R packages used

Session info

# Record the R version, platform and attached packages for reproducibility
utils::sessionInfo()

Energy Cultures 2 Long Survey Questions

Table \@ref(tab:loadEc2Labels) lists the full Energy Cultures 2 survey [@ec2Survey2015]. As you will see only a subset of these variables are currently available in the GREEN Grid safe data package released via ReShare. Requests to add further variables should be made via a GitHub repository issue.

# When new variables are added to the reshare releases you will need to
# update the list in the label file read below EVEN if you have run the
# data processing code and added new variables.
# This lets us add new variables to the processed data without releasing them
# just yet (so the documentation is correct).
ec2LabsDT <- data.table::fread(ggrParams$ec2LongSurveyLabels)
setkey(ec2LabsDT, question)

# Variables present in the processed household attributes data
# (minus the columns listed in xcols)
processedVars <- names(hhAttributesDT[, !xcols, with = FALSE])
qlDT <- data.table::data.table(question = processedVars)
qlDT <- qlDT[, inProcessedData := "Yes"]
setkey(qlDT, question)

# Merge the two lists of questions, keeping rows found in either one, as we
# don't have the appliance list in the survey labels
fullQlDT <- merge(ec2LabsDT, qlDT, all = TRUE)

# Label the source of each variable in three passes; later passes overwrite
# earlier ones.
# 1. Everything in the processed dataset starts as "Appliance audit"
#    (not really true, but the next two passes correct the others)
fullQlDT <- fullQlDT[inProcessedData == "Yes", source := "Appliance audit"]

# 2. Meta-data variables
metaDataVars <- c(
  "linkID", "hhID", "StartDate", "r_stopDate", "endDate", "Location",
  "notes", "hasShortSurvey", "hasLongSurvey", "nAdults", "nChildren0_12",
  "nTeenagers13_18", "source"
)
fullQlDT <- fullQlDT[question %in% metaDataVars, source := "Meta-data"]

# 3. Survey variables: any name containing "Q", plus the survey start date
fullQlDT <- fullQlDT[
  grepl("Q", question) | question == "surveyStartDate",
  source := "Household survey"
]

# Everything in the processed data is flagged as included in the reshare data
fullQlDT <- fullQlDT[
  inProcessedData == "Yes",
  `included in reshare data?` := "Yes"
]

# The remaining flags come in as empty strings (not NA): set them to "No"
fullQlDT <- fullQlDT[
  `included in reshare data?` == "",
  `included in reshare data?` := "No"
]

# Columns to report, ordered with the included variables first
labelTable <- fullQlDT[, .(question, questionLabel, source, `included in reshare data?`, `in processed data?` = inProcessedData)]
labelTable <- labelTable[order(- `included in reshare data?`, `in processed data?`, source, question)]
kableExtra::kable(
  labelTable,
  caption = "GREEN Grid/Energy Cultures 2 Survey labels (long)"
) %>%
  kable_styling()

References



CfSOtago/GREENGridData documentation built on June 10, 2022, 8:03 p.m.