# Knitr defaults ----
# By default do not echo chunk code into the rendered report.
knitr::opts_chunk$set(echo = FALSE)

# Set start time ----
# Kept so the total run time can be computed at the end of the report.
startTime <- proc.time()

# Packages needed in this .Rmd file ----
rmdLibs <- c(
  "readxl", # reading xlsx
  "skimr"   # for skim
)
# Load them via the GREENGridData helper.
GREENGridData::loadLibraries(rmdLibs)

# Local parameters
\newpage
If you wish to use any of the material from this report please cite as:
`r ggrParams$Authors` (`r lubridate::year(today())`) `r params$title` `r params$subtitle`, `r ggrParams$pubLoc`.

This work is (c) `r lubridate::year(today())` the University of Southampton.
\newpage
The purpose of this report is to:
The resulting cleaned data contains no identifying information (such as names, addresses, email addresses or telephone numbers) and is therefore safe to share across all partners.
The data contains a unique household id (`linkID`) which can be used to link it to the NZ GREEN Grid time use diaries and dwelling/appliance surveys.
Table \@ref(tab:loadMasterData) shows the number of households in each area.
# The master file contains a list of all the household IDs and some basic
# meta information. Data type: safe.
# getHouseholdData() cleans the data as it loads.
hhMasterDT <- GREENGridData::getHouseholdData(ggrParams$gsHHMasterFile)
setkey(hhMasterDT, linkID)

# Number of households per sample location; NA is kept as its own category.
t <- with(hhMasterDT, table(Location, useNA = "always"))
kable_styling(
  kableExtra::kable(t, caption = "Sample location (master file)")
)
In total we have `r nrow(hhMasterDT)` households in two sample areas.
Household appliance ownership was recorded during recruitment using a detailed survey. However, this data is not yet readily available for all households. Table \@ref(tab:applianceTable) shows the number of households in each area for whom summary appliance data exists.
# Load the separate appliance data file that was derived from the household
# audits. Data type: safe, but the 'sample' column is over-detailed.
hhAppliancesDT <- ggrParams$ApplianceData %>%
  readxl::read_xlsx() %>%
  data.table::as.data.table()

hhAppliancesDT$sample <- NULL              # too detailed
hhAppliancesDT$source <- "Appliance audit" # label the data source
# Key the appliance data on linkID and join it onto the master file so that
# the master file location is used for every household.
setkey(hhAppliancesDT, linkID)
t <- with(
  hhAppliancesDT[hhMasterDT],
  table(Location, hasApplianceSummary, useNA = "always")
)
kable_styling(
  kableExtra::kable(
    t,
    caption = "Sample (appliance summary file, NA indicates no data)"
  )
)
The appliances recorded in this summary are shown in Table \@ref(tab:applianceNames). Note that some of this information is also recorded in the household survey data.
# Extract the types of appliances included via the audit (some of this
# overlaps with the EC survey).
# xcols holds the ID/flag columns that are not appliances. It is reused later
# in this file, so its value is deliberately left unchanged here.
xcols <- c("linkID", "hasApplianceSummary")

# 'source' is the provenance label added when the appliance file was loaded,
# not an appliance, so it is excluded from the list too.
al <- setdiff(names(hhAppliancesDT), c(xcols, "source"))

kableExtra::kable(caption = "Appliances recorded", al) %>%
  kable_styling()
The Energy Cultures 2 survey [@ec2Survey2015] was used to collect data on household energy and transport attitudes/behaviours and usages. The long form of the survey was used for some households and the short form for others.
# Read in the data produced by processHouseholdAttributes.R.
# Do not do any re-coding here - put that in the above .R script.
# Data type: safe.
hhAttributesDT <- ggrParams$hhAttributes %>%
  readr::read_csv() %>%
  data.table::as.data.table()

hhAttributesDT$source <- "Household survey" # label the data source
Table \@ref(tab:responseTable) shows the number of households in each region who responded to each survey.
# Count households for each combination of location and survey form.
t <- hhAttributesDT[
  ,
  .(`n Households` = .N),
  keyby = .(Location, hasLongSurvey, hasShortSurvey)
]
kable_styling(
  kableExtra::kable(t, caption = "Survey responses (NA indicates no survey)")
)
Table \@ref(tab:skimTable) reports the final household attribute variables and their internal distributions. The question labels are found in Table \@ref(tab:loadEc2Labels) below.
# Descriptive summary of every column in the household attributes table.
# Alternative kept for reference: Hmisc::describe(hhAttributesDT)
skimDF <- skimr::skim(hhAttributesDT)

# See https://github.com/ropensci/skimr/blob/master/README.md
# NOTE(review): skimr::kable() appears to exist only in older (1.x) skimr
# releases - confirm against the installed version before upgrading.
skimr::kable(
  skimDF,
  caption = "Descriptive summary (all variables)"
)
This section reports tables of the key household attributes by sample location. Note that NA usually means not known.
# Main heat source (Q20_coded) by sample location; NA is kept as a category.
t <- with(hhAttributesDT, table(Q20_coded, Location, useNA = "always"))

# Counts, with row and column totals added.
kableExtra::kable(caption = "Main heat source by location", addmargins(t)) %>%
  kable_styling()

# Column percentages (each location sums to 100). prop.table() returns
# proportions in 0-1, so scale by 100, and mark the caption with "(%)" so this
# table is distinguishable from the counts table above.
kableExtra::kable(
  caption = "Main heat source by location (%)",
  100 * prop.table(t, margin = 2),
  digits = 1
) %>%
  kable_styling()
# Majority light bulb type (Q49_coded) by sample location; NA is kept as a
# category.
t <- with(hhAttributesDT, table(Q49_coded, Location, useNA = "always"))

# Counts, with row and column totals added.
kableExtra::kable(
  caption = "Majority of light bulbs by location",
  addmargins(t)
) %>%
  kable_styling()

# Column percentages (each location sums to 100). prop.table() returns
# proportions in 0-1, so scale by 100 to match the "(%)" in the caption.
kableExtra::kable(
  caption = "Majority of light bulbs by location (%)",
  100 * prop.table(t, margin = 2),
  digits = 1
) %>%
  kable_styling()
# Cross-tabulate number of adults against location, keeping NA as a category.
t <- with(hhAttributesDT, table(nAdults, Location, useNA = "always"))
kable_styling(
  kableExtra::kable(t, caption = "Number of adults in household by location")
)
# Cross-tabulate number of teenagers (13-18) against location, keeping NA as
# a category.
t <- with(hhAttributesDT, table(nTeenagers13_18, Location, useNA = "always"))
kable_styling(
  kableExtra::kable(
    t,
    caption = "Number of teenagers in household by location"
  )
)
# Cross-tabulate number of children (0-12) against location, keeping NA as a
# category.
t <- with(hhAttributesDT, table(nChildren0_12, Location, useNA = "always"))
kable_styling(
  kableExtra::kable(
    t,
    caption = "Number of children in household by location"
  )
)
The `notes` column may have been set for any number of reasons and means the monitoring data for a given household should be used with caution.
# List every household that has a note recorded, with its location.
t <- hhAttributesDT[!is.na(notes), c("linkID", "notes", "Location")]
kable_styling(
  kableExtra::kable(t, caption = "Notes by location")
)
Table \@ref(tab:dataHeader) shows the key columns of the household attributes file. The data can be linked to the GridSpy data using `linkID`. `linkID` is used to flag the two re-used GridSpy units. As an example, unit rf_15 was re-used in a different household. We have therefore created `linkID` so that the correct household data (rf_15a or rf_15b) can be linked to the GridSpy data (coded rf_15) at the correct date.
Note also that the GridSpy power demand data includes households for whom no household data exists (e.g. rf_01 & rf_02).
# Select the key columns of the household attributes table for display.
# The subset is a new data.table, so the in-place edits below do not touch
# hhAttributesDT.
dt <- hhAttributesDT[, c(
  "hhID", "linkID", "Location", "nAdults", "nChildren0_12", "r_stopDate",
  "Electric heater", "Heat pump number", "Other Appliance", "PV Inverter",
  "Energy Storage", "Other Generation Device", "notes"
)]

# Clean up the table: show NA as "." - but only in character columns.
# The caption promises this for string variables only, and "." cannot be
# stored in a numeric or date column (data.table would have to coerce it to
# the column's type), so non-character columns are left as they are.
charCols <- names(dt)[vapply(dt, is.character, logical(1))]
for (j in charCols) {
  set(dt, which(is.na(dt[[j]])), j, ".")
}

kableExtra::kable(
  caption = "Household data: key columns (NA set to '.' in string variables for clarity)",
  dt[order(hhID)]
) %>%
  kable_styling()
# Time taken since startTime was recorded; the third element of a proc.time()
# result is the elapsed (wall-clock) time in seconds.
t <- proc.time() - startTime
elapsed <- t[["elapsed"]]
Analysis completed in `r round(elapsed,2)` seconds (`r round(elapsed/60,2)` minutes) using knitr in RStudio with `r R.version.string` running on `r R.version$platform`.
# Print the R version, platform and loaded packages used for this run.
utils::sessionInfo()
Table \@ref(tab:loadEc2Labels) lists the full Energy Cultures 2 survey [@ec2Survey2015]. As you will see, only a subset of these variables is currently available in the GREEN Grid safe data package released via ReShare. Requests to add further variables should be made via a GitHub repository issue.
# When new variables are added to the reshare releases you will need to
# update the list in the following file EVEN if you have run the data
# processing code and added new variables. This helps us to add new variables
# but not to release them just yet (so the documentation is correct).
ec2LabsDT <- data.table::fread(ggrParams$ec2LongSurveyLabels)
setkey(ec2LabsDT, question)

# Get the list of variables in the processed data (xcols columns dropped).
question <- names(hhAttributesDT[, -..xcols])
qlDT <- as.data.table(question)
qlDT <- qlDT[, inProcessedData := "Yes"]
setkey(qlDT, question)

# Merge the two lists of questions, as we don't have the appliance list in
# the survey labels.
fullQlDT <- merge(ec2LabsDT, qlDT, all = TRUE)

# Label each variable's source. The order of the steps matters because later
# steps overwrite earlier ones:
#   1. everything in the processed dataset starts as "Appliance audit" (not
#      really, but the next two steps correct the ones that are not);
#   2. meta-data variables;
#   3. survey variables.
# Then flag what is included in the reshare data; the flag comes in from the
# labels file as an empty string, not NA.
fullQlDT <- fullQlDT[
  inProcessedData == "Yes",
  source := "Appliance audit"
][
  question %in% c(
    "linkID", "hhID", "StartDate", "r_stopDate", "endDate", "Location",
    "notes", "hasShortSurvey", "hasLongSurvey", "nAdults", "nChildren0_12",
    "nTeenagers13_18", "source"
  ),
  source := "Meta-data"
][
  question %like% "Q" | question == "surveyStartDate",
  source := "Household survey"
][
  inProcessedData == "Yes",
  `included in reshare data?` := "Yes"
][
  `included in reshare data?` == "",
  `included in reshare data?` := "No"
]

kable_styling(
  kableExtra::kable(
    fullQlDT[
      ,
      .(
        question,
        questionLabel,
        source,
        `included in reshare data?`,
        `in processed data?` = inProcessedData
      )
    ][order(-`included in reshare data?`, `in processed data?`, source, question)],
    caption = "GREEN Grid/Energy Cultures 2 Survey labels (long)"
  )
)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.