knitr::opts_chunk$set(echo = FALSE) # do not echo code by default
# Housekeeping ----
rm(list=ls(all=TRUE)) # remove all objects from workspace

# Set start time ----
startTime <- proc.time()

# Local parameters ----

dPath <- "/Volumes/hum-csafe/Research Projects/GREEN Grid/" # HPS
diaryPath <- paste0(dPath,"_RAW DATA/Time Use Diaries/") # location of data
outPath <- paste0(dPath, "Clean_data/safe/TUD/") # place to save them 

powerCoPath <- paste0(diaryPath, "Powerco/Powerco Annexes/")
unisonPath <- paste0(diaryPath, "Unison/Unison Raw Data/Raw data with paper diaries included/Cleaned excel data files/")

linkLUT <- paste0(dPath, "_RAW DATA/TUD_2_GridSpyLookup.xlsx")

# Load greenGrid package ----
library(nzGREENGrid) # local utilities

# Other packages ----
reqLibs <- c("data.table", # for data munging
             "lubridate", # for date/time munging
             "ggplot2", # for fancy graphs
             "readr", # for read/write_csv
             "dplyr", # for selecting columns
             "readxl", # for loading xlsx files
             #"dtplyr", # if needed
             "knitr" # for kable
)

nzGREENGrid::loadLibraries(reqLibs)

# Local functions ----

\newpage

Citation

If you wish to use any of the material from this report please cite as:

\newpage

Introduction

Report circulation:

Purpose

This report is intended to:

Requirements:

Time use survey data held in r diaryPath:

A lookup table to correct mis-coding of household IDs (r linkLUT).
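Since both inputs must be present for the processing below to run, a quick existence check can be useful; a minimal sketch using the paths defined in the setup chunk:

# Hedged check: confirm the required inputs exist (run after the setup chunk above)
stopifnot(dir.exists(diaryPath)) # time use diary data folder
stopifnot(file.exists(linkLUT))  # household ID lookup table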

History

Generally tracked via our git.soton repo:

Support

This work was supported by:

This work is (c) r as.POSIXlt(Sys.time())$year + 1900 the University of Southampton.

We do not 'support' the code, but if you have a problem check the issues on our repo and, if it doesn't already exist, open one. We might be able to fix it :-)

PowerCo

This consists of 1 file found in r powerCoPath:

This is a version of TUD (Merged data).csv with:

Load & process

tudPowerCoDT <- fread(paste0(powerCoPath, "TUD (Merged data)_BA.csv"))

nRows <- nrow(tudPowerCoDT)
print(paste0("Found ", tidyNum(nRows), " rows of data"))
# Remove identifying data ----
unsafe <- c("RowNum", "Name","EmailAddress")
print(paste0("Removing unsafe vars: ", unsafe))
tudPowerCoDT <- tudPowerCoDT[, eval(unsafe) := NULL]

# Fix names of variables ----
print(paste0("Fixing variable names"))
tudPowerCoDT <- data.table::setnames(tudPowerCoDT, 
                                    c("Family size", "Choose the date of your diary / entry:"), 
                                    c("ba_nPeople", "diaryDate")
)

# Fix dates ----
print(paste0("Fixing dates"))
tudPowerCoDT <- tudPowerCoDT[, r_diaryDate := lubridate::mdy(diaryDate)]

# Fix the hhid
print(paste0("Fixing hhID"))
tudPowerCoDT <- tudPowerCoDT[, hhID := paste0("rf_", HHCODE)]
tudPowerCoDT <- tudPowerCoDT[, hhID := ifelse(as.integer(HHCODE) < 10, 
                                              paste0("rf_0", HHCODE), # single digit so needs '0'
                                                     hhID)]

The following table summarises the cleaned PowerCo diary data. In theory we should have one diary per person per day, so over the week a one-person household should have produced 7 diaries, while a three-person household with two adults and one child might only produce 14 if the child did not complete a diary (for example).

What was the age cut off for diary completion?

# Summary table ----
t <- tudPowerCoDT[, .(nDiaries = .N,
                      familySize = mean(ba_nPeople, na.rm = TRUE),
                      minDiaryDate = min(r_diaryDate),
                      maxDiaryDate = max(r_diaryDate)), keyby = .(hhID)]

knitr::kable(caption = "Summary of PowerCo diaries by household", t)

# save out safe file ----
ofile <- paste0(outPath, "powerCoTUDsafe.csv")
print(paste0("Saving PowerCo cleaned time use diary to ", ofile))
readr::write_csv(tudPowerCoDT, ofile)
print("Done")

Tests

Should all be in August 2014...

myCaption <- paste0("Data source: ", powerCoPath)

plotDT <- tudPowerCoDT[, .(nDiaries = .N), keyby = .(r_diaryDate)]
ggplot2::ggplot(plotDT, aes(x = r_diaryDate, y = nDiaries)) +
  geom_point() +
  scale_x_date(date_labels = "%a %d %b %Y", date_breaks = "1 day") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 0.5)) +
  labs(title = "Number of PowerCo diaries per day",
       caption = myCaption,
       x = "Date",
       y = "Total number of diaries")
ggplot2::ggsave(paste0(outPath, "powerCoTUDdates.png"))

In total we have r nrow(tudPowerCoDT) diaries from r uniqueN(tudPowerCoDT$hhID) PowerCo households.

Unison

This consists of 5 files found in r unisonPath:

As before, these are copies of the original versions with slight edits to correct dates and to ease processing. The relationship between them is currently unclear!

Load & process

#fList <- c("TUDAdult_ONE_Child_Unison_forSAS_BA.xlsx", "TUDAdult_TWO_Children_Unison_forSAS_BA.xlsx",
#           "TUDAdult-THREE-Children-Unison_forSAS_BA.xlsx", "TUDAdult-Unison-forSAS_BA.xlsx", "TUDTeenagerorChild-Unison_forSAS_BA.xlsx")

# load and add sourceFile for easy tracking of errors
tudUnison1chDT <- data.table::as.data.table(read_xlsx(paste0(unisonPath, "TUDAdult_ONE_Child_Unison_forSAS_BA.xlsx")))
tudUnison1chDT$sourceFile <- "TUDAdult_ONE_Child_Unison_forSAS_BA.xlsx"
tudUnison2chDT <- data.table::as.data.table(read_xlsx(paste0(unisonPath, "TUDAdult_TWO_Children_Unison_forSAS_BA.xlsx")))
tudUnison2chDT$sourceFile <- "TUDAdult_TWO_Children_Unison_forSAS_BA.xlsx"
tudUnison3chDT <- data.table::as.data.table(read_xlsx(paste0(unisonPath, "TUDAdult-THREE-Children-Unison_forSAS_BA.xlsx")))
tudUnison3chDT$sourceFile <- "TUDAdult-THREE-Children-Unison_forSAS_BA.xlsx"
tudUnisonAdultDT <- data.table::as.data.table(read_xlsx(paste0(unisonPath, "TUDAdult-Unison-forSAS_BA.xlsx")))
tudUnisonAdultDT$sourceFile <- "TUDAdult-Unison-forSAS_BA.xlsx"
tudUnisonTeenChDT <- data.table::as.data.table(read_xlsx(paste0(unisonPath, "TUDTeenagerorChild-Unison_forSAS_BA.xlsx")))
tudUnisonTeenChDT$sourceFile <- "TUDTeenagerorChild-Unison_forSAS_BA.xlsx"

# Join the various Unison files together ----
# note: produces a column name explosion since the files do not share identical columns
l <- list(tudUnison1chDT,tudUnison2chDT,tudUnison3chDT,tudUnisonAdultDT,tudUnisonTeenChDT)
tudUnisonAllDT <- data.table::rbindlist(l, fill = TRUE)

print(paste0("Found ", tidyNum(nrow(tudUnisonAllDT)), " rows in total"))
# Remove identifying data ----
unsafe <- c("Name","EmailAddress","IPAddress")
print(paste0("Removing unsafe vars: ", unsafe))
tudUnisonAllDT <- tudUnisonAllDT[, eval(unsafe) := NULL]

# do not rename the original columns (it makes errors hard to trace); add new ones instead
print(paste0("Fixing variable names"))
tudUnisonAllDT <- tudUnisonAllDT[, r_diaryDate := `Choose the date of your diary / entry:`]
tudUnisonAllDT <- tudUnisonAllDT[, code := `Please enter your designated / CODE`]

# Fix dates ----
#dt <- dt[, r_diaryDate := lubridate::dmy(diaryDate)] # not needed as read_xls gets it right :-)
#dt <- dt[, r_surveyStart := lubridate::dmy_hms(StartDate)]
#dt <- dt[, r_surveyEnd := lubridate::dmy_hms(EndDate)]

# Fix hhID ----
print(paste0("Fixing hhID"))
tudUnisonAllDT <- tudUnisonAllDT[, tudCode := substr(code, 1, 2)] # extract the first two characters of the code

The following table lists rows where the diary date did not parse (for error checking).

# Check for non-parsed diary dates
t <- head(tudUnisonAllDT[is.na(r_diaryDate),.(ResponseID,r_diaryDate, code, tudCode, StartDate, EndDate)])
knitr::kable(caption = "Test diaryDates that did not parse", t)

The following table reports any diaries where the dates were manually edited before loading.

# report edited diary dates (done in .xlsx)
t <- tudUnisonAllDT[!is.na(dateNote),.(r_diaryDate, tudCode, dateNote, sourceFile)]
knitr::kable(caption = "Report diaries with edited diary dates (done in .xlsx before loading)", t)

The following table summarises the Unison diary data.

# Summary table ----
t <- tudUnisonAllDT[, .(nDiaries = .N,
                      minDiaryDate = min(r_diaryDate),
                      maxDiaryDate = max(r_diaryDate)), keyby = .(tudCode)]

knitr::kable(caption = "Summary of Unison diaries by household", t)

Note that the tudCodes found in the diary files are not the gridSpy household IDs; we need to create these using the unison sheet in r linkLUT.

unisonLinkLUTDT <- data.table::as.data.table(read_xlsx(paste0(linkLUT), sheet = "unison"))

knitr::kable(caption = "Unison linking table", unisonLinkLUTDT)

tudUnisonAllDT <- tudUnisonAllDT[, linkCode := as.character(tudCode)]
setkey(tudUnisonAllDT, linkCode)
unisonLinkLUTDT <- unisonLinkLUTDT[, linkCode := as.character(CODE)]
setkey(unisonLinkLUTDT, linkCode)
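# Hedged check (assumption: every tudCode in the diary data should appear in the
# linking table); any codes listed here would be dropped by the join below
unmatchedCodes <- setdiff(unique(tudUnisonAllDT$linkCode), unisonLinkLUTDT$linkCode)
if (length(unmatchedCodes) > 0) {
  print(paste0("tudCodes with no match in the linking table: ",
               paste(unmatchedCodes, collapse = ", ")))
}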

tudUnisonAllDT <- tudUnisonAllDT[unisonLinkLUTDT] # link them

tudUnisonAllDT <- tudUnisonAllDT[, hhID := tag_gridSpy_Hhid]

# check

t <- tudUnisonAllDT[, .(nDiaries = .N), keyby = .(linkCode, hhID)]
knitr::kable(caption = "Check linkage: there should be n * 7 diaries for each combination", t)

In total we have r nrow(tudUnisonAllDT) diaries from r uniqueN(tudUnisonAllDT$hhID) Unison households.

# save out safe file ----
ofile <- paste0(outPath, "unisonTUDsafe.csv")
print(paste0("Saving Unison cleaned time use diary to ", ofile))
readr::write_csv(tudUnisonAllDT, ofile)
print("Done")

Tests

All of the diaries should be in July/August 2015...

myCaption <- paste0("Data source: ", unisonPath)

plotDT <- tudUnisonAllDT[, .(nDiaries = .N), keyby = .(r_diaryDate)]
ggplot2::ggplot(plotDT, aes(x = as.Date(r_diaryDate), y = nDiaries)) +
  geom_point() +
  scale_x_date(date_labels = "%a %d %b %Y", date_breaks = "1 day") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 0.5)) +
  labs(title = "Number of Unison diaries per day",
       caption = myCaption,
       x = "Date",
       y = "Total number of diaries")
ggplot2::ggsave(paste0(outPath, "unisonTUDdates.png"))

If any of them are earlier than July 2015 they are flagged below for ease of fixing.

t <- tudUnisonAllDT[r_diaryDate < "2015-07-01", .(nDiaries = .N), keyby = .(r_diaryDate, code, tudCode, sourceFile)]
cap <- paste0("Households with potential diary date errors")
knitr::kable(caption = cap, t)

Summary

Total responses:
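A minimal sketch of these totals, assuming the cleaned PowerCo and Unison data.tables created above are still in memory:

# Hedged summary: total diaries and households by sample
summaryDT <- data.table::data.table(
  sample = c("PowerCo", "Unison"),
  nDiaries = c(nrow(tudPowerCoDT), nrow(tudUnisonAllDT)),
  nHouseholds = c(uniqueN(tudPowerCoDT$hhID), uniqueN(tudUnisonAllDT$hhID))
)
knitr::kable(caption = "Total diary responses and households by sample", summaryDT)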

Runtime

t <- proc.time() - startTime

elapsed <- t[[3]]

Analysis completed in r elapsed seconds (r round(elapsed/60,2) minutes) using knitr in RStudio with r R.version.string running on r R.version$platform.

R environment

R packages used: r reqLibs

sessionInfo()

References


