#' Checks the dates of a date_char column in a data.table
#'
#' \code{checkDates} takes a data.table and splits the `date_char` column on `/`, falling back to `-` if that split fails.
#'
#' Expects `date_char` to have the format x-x-x or x/x/x
#'
#' Puts the results into 3 new columns (`date_char1`, `date_char2`, `date_char3`), infers the likely date format into a new `dateFormat` column and returns the dt.
#'
#' @param dt the data.table
#'
#' @importFrom data.table tstrsplit
#'
#' @author Ben Anderson, \email{b.anderson@@soton.ac.uk}
#' @export
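#' @examples
#' \dontrun{
#' # A minimal sketch with hypothetical input (not package data):
#' dt <- data.table::data.table(date_char = c("31/12/2015", "1/1/2016"))
#' dt <- checkDates(dt)
#' dt$dateFormat # "dmy - definite" since 31 > 12 forces day-first
#' }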
#'
#'
checkDates <- function(dt) {
# Check the date format as it could be y-m-d or d/m/y or m/d/y :-(
dt <- dt[, c("date_char1","date_char2", "date_char3") := data.table::tstrsplit(date_char, "/")]
# if this split failed then tstrsplit puts the dateVar in each one so we can check
# this assumes we never have 9-9-9 10-10-10 or 11-11-11 or 12-12-12 !
# would be better if data.table::tstrsplit returned an error if the split failed? We could then check for NA?
dt <- dt[, splitFailed := date_char1 == date_char2 & date_char1 == date_char3]
# and then split on - instead
dt <- dt[splitFailed == TRUE, c("date_char1","date_char2", "date_char3") := data.table::tstrsplit(date_char, "-")] # requires data.table
dt$dateFormat <- "ambiguous" # default
# Days: 1-31
# Months: 1 - 12
# Years: could be 2 digit 15 - 18 or 4 digit 2015 - 2018 (+)
max1 <- max(as.integer(dt$date_char1), na.rm = TRUE) # na.rm in case any parts failed to parse to integer
#print(paste0("max1 = " , max1))
max2 <- max(as.integer(dt$date_char2), na.rm = TRUE)
#print(paste0("max2 = " , max2))
max3 <- max(as.integer(dt$date_char3), na.rm = TRUE)
#print(paste0("max3 = " , max3))
if(max1 > 31){
# char 1 = year so default is ymd
dt$dateFormat <- "ymd - default (but day/month value <= 12)"
if(max2 > 12){
# char 2 = day - very unlikely
dt$dateFormat <- "ydm"
}
if(max3 > 12){
# char 3 = day
dt$dateFormat <- "ymd - definite"
}
}
if(max2 > 31){
# char 2 is year - this is very unlikely
if(max1 > 12){
# char 1 = day
dt$dateFormat <- "dym"
}
if(max3 > 12){
# char 3 = day
dt$dateFormat <- "myd"
}
}
if(max3 > 31){
# char 3 is year so default is dmy
dt$dateFormat <- "dmy - default (but day/month value <= 12)"
if(max1 > 12){
# char 1 = day so char 2 = month
dt$dateFormat <- "dmy - definite"
}
if(max2 > 12){
# char 2 = day so char 1 = month
dt$dateFormat <- "mdy - definite"
}
}
# Setting ambiguous to dmy seems OK as this only seems to happen with UTC which is dmy by default
# NB: because we relabel here, downstream checks for dateFormat == "ambiguous" will rarely trigger
dt <- dt[dateFormat == "ambiguous",
dateFormat := "dmy - inferred"]
return(dt)
}
#' Fixes ambiguous dates
#'
#' \code{fixAmbiguousDates} takes a data.table, fixes dates in dateFormat column and returns the dt.
#'
#' @param dt the data.table
#'
#' @author Ben Anderson, \email{b.anderson@@soton.ac.uk}
#' @export
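#' @examples
#' \dontrun{
#' # Sketch only: a hypothetical metadata row with the columns this function expects
#' dt <- data.table::data.table(file = "rf_xx/1Jan2015-2Jan2015at1.csv",
#'    dateColName = "date UTC", dateExample = "1/1/2015", dateFormat = "ambiguous")
#' dt <- fixAmbiguousDates(dt)
#' dt$dateFormat # "dmy - inferred"
#' }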
#'
fixAmbiguousDates <- function(dt){
aList <- dt[dateFormat == "ambiguous",
.(file, dateColName, dateExample, dateFormat)]
# Setting to dmy seems OK
dt <- dt[dateFormat == "ambiguous",
dateFormat := "dmy - inferred"]
# report how many we fixed
print(paste0("#-> Fixed ", nrow(aList), " files with an ambiguous dateFormat"))
return(dt)
}
#' Gets a list of GridSpy data files matching a pattern
#'
#' \code{getGridSpyFileList} takes a file path, searches for files matching pattern and returns the filelist as a data.table with metadata.
#'
#'
#' @param fpath where to look (recursively)
#' @param pattern the pattern to match
#' @param dataThreshold the file size threshold (files smaller than this will be ignored as we assume they have no data)
#'
#' @import dplyr
#' @import data.table
#' @import lubridate
#' @import progress
#'
#' @author Ben Anderson, \email{b.anderson@@soton.ac.uk}
#' @export
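#' @examples
#' \dontrun{
#' # Sketch only: path, pattern & threshold are hypothetical and gSpyParams
#' # is assumed to have been set up elsewhere in the package processing flow
#' fListDT <- getGridSpyFileList(fpath = "~/GridSpyData/",
#'    pattern = "at1.csv", # e.g. 1 minute data files
#'    dataThreshold = 3000) # ignore files smaller than ~3 kb
#' }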
#'
getGridSpyFileList <- function(fpath, pattern, dataThreshold){
print(paste0("#-> Looking for data using pattern = ", pattern, " in ", fpath, " - could take a while..."))
# > Get the file list as a data.table ----
fList <- list.files(path = fpath, pattern = pattern, # use to filter e.g. 1m from 30s files
recursive = TRUE)
if(length(fList) == 0){ # if there are no files in the list...
print(paste0("#--> No matching data files found, please check your path (", fpath,
") or your search pattern (", pattern, ")"))
dt <- data.table::as.data.table(NULL)
} else {
dt <- data.table::as.data.table(fList) # the column name will be fList
}
nFiles <- nrow(dt)
print(paste0("#-> Found ", tidyNum(nFiles), " files"))
#> Process file metadata ----
if(nrow(dt) == 0){
# Then no files were found - should have been caught previously but...
stop(paste0("#-> No matching data files found, please check your path (", fpath,
") or your search pattern (", pattern1Min, "). If using /hum-csafe/ are you connected to it?!"))
} else {
print(paste0("#-> Processing file list and getting file meta-data including checking date formats."))
dt <- dt[, c("hhID","fileName") := data.table::tstrsplit(fList, "/")]
dt <- dt[, fullPath := paste0(fpath, hhID,"/",fileName)]
loopCount <- 1
# now loop over the files and collect metadata
pb <- progress::progress_bar$new(total = nrow(dt)) # set progress bar
for(f in dt[,fullPath]){
pb$tick()
rf <- path.expand(f) # just in case of ~ etc
fsize <- file.size(rf)
fmtime <- lubridate::ymd_hms(file.mtime(rf), tz = "Pacific/Auckland") # requires lubridate
dt <- dt[fullPath == f, fSize := fsize]
dt <- dt[fullPath == f, fMTime := fmtime]
dt <- dt[fullPath == f, fMDate := as.Date(fmtime)]
dt <- dt[fullPath == f, dateColName := paste0("Unknown - ignore as fsize ( ",
fsize, " ) < dataThreshold ( ",
dataThreshold, " )")]
# only try to read files where we think there might be data
loadThis <- ifelse(fsize > dataThreshold,
paste0("Loading (fsize (", fsize, ") > ", dataThreshold, ")"),
paste0("Skipping (fsize (", fsize, ") < ", dataThreshold, ")")
)
if(gSpyParams$fullFb){print(paste0("Checking file ", loopCount, " of ", nFiles ,
" (", round(100*(loopCount/nFiles),2), "% checked): ", loadThis))}
if(fsize > dataThreshold){
if(gSpyParams$fullFb){print(paste0("fSize (", fsize, ") > ",
dataThreshold, ") -> loading ", f))}
row1DT <- fread(f, nrows = 1)
# what is the date column called?
dt <- dt[fullPath == f, dateColName := "unknown - can't tell"]
if(ncol(dplyr::select(row1DT, dplyr::contains("NZ"))) > 0){ # requires dplyr - do any col names contain 'NZ'?
# we have NZT
setnames(row1DT, 'date NZ', "dateTime_orig")
row1DT <- row1DT[, dateColName := "date NZ"]
dt <- dt[fullPath == f, dateColName := "date NZ"]
}
if(ncol(dplyr::select(row1DT, dplyr::contains("UTC"))) > 0){ # requires dplyr - do any col names contain 'UTC'?
# we have UTC
setnames(row1DT, 'date UTC', "dateTime_orig")
row1DT <- row1DT[, dateColName := "date UTC"]
dt <- dt[fullPath == f, dateColName := "date UTC"]
}
# split dateTime
row1DT <- row1DT[, c("date_char", "time_char") := data.table::tstrsplit(dateTime_orig, " ")]
# add example of date to metadata - presumably they are the same in each file?!
dt <- dt[fullPath == f, dateExample := row1DT[1, date_char]]
if(gSpyParams$fullFb){print(paste0("Checking date formats in ", f))}
testDT <- checkDates(row1DT)
dt <- dt[fullPath == f, dateFormat := testDT[1, dateFormat]]
if(gSpyParams$fullFb){print(paste0("Done ", f))}
}
loopCount <- loopCount + 1
}
print("#-> All files checked")
# any date formats that are still ambiguous need a deeper inspection using the full file - could be slow
fAmbig <- dt[dateFormat == "ambiguous", fullPath] # get ambiguous files as a list
if(length(fAmbig) > 0){ # there were some
print(paste0("#-> Checking ambiguous date formats"))
pbA <- progress::progress_bar$new(total = length(fAmbig))
for(fa in fAmbig){
if(gSpyParams$fullFb){print(paste0("#-> Checking ambiguous date formats in ", fa))}
ambDT <- fread(fa)
pbA$tick()
if(ncol(dplyr::select(ambDT, dplyr::contains("NZ"))) > 0){ # requires dplyr - do any col names contain 'NZ'?
setnames(ambDT, 'date NZ', "dateTime_orig")
}
if(ncol(dplyr::select(ambDT, dplyr::contains("UTC"))) > 0){ # requires dplyr - do any col names contain 'UTC'?
setnames(ambDT, 'date UTC', "dateTime_orig")
}
ambDT <- ambDT[, c("date_char", "time_char") := data.table::tstrsplit(dateTime_orig, " ")]
ambDT <- checkDates(ambDT)
# set what we now know (or guess!)
dt <- dt[fullPath == fa, dateFormat := ambDT[1,dateFormat]]
}
print(paste0("#-> Ambiguous date formats checked"))
}
dt <- setnames(dt, "fList", "file")
}
return(dt)
}
#' Loads and processes a list of GridSpy data files for a given household
#'
#' \code{processHhGridSpyData} takes a file list and loads & processes data before returning the data.table for checks & saving.
#'
#' What it does:
#'
#' - Updates the per input file metadata as it loops over each file
#'
#' - Fixes circuit labels for rf_46
#'
#' - Concatenates (rbinds) the files into a data.table and converts to long form for easier re-use
#'
#' - Splits original circuit name by $ into a label (first string) and an id (second string)
#'
#' - Attempts to create a correct r_dateTime in UTC but with tz set to "Pacific/Auckland"
#'
#' - Removes duplicates by r_dateTime <-> circuit <-> power caused by data from duplicate files. Note that this will retain the DST induced duplicate dateTimes if the power values are different (see below)
#'
#' - Removes any cases where power = NA
#'
#' - Returns the data.table
#'
#' Things to note...
#'
#' The function assumes all data in the fileList is for one household (this can only be inferred from the input file path)
#'
#' The original data is sometimes stored as UTC (auto-downloads) & sometimes as NZ time (manual downloads).
#'
#' If the original data was actually NZ time then we parse it with tz = Pacific/Auckland. It is not clear what will happen at the DST break.
#'
#' If the original data was actually UTC then we parse it with the default tz = UTC and then set to Pacific/Auckland. Note that this will create
#' duplicate r_dateTimes during the DST break when there is an extra hour and thus two moments of time with the same r_dateTime.
#'
#' Any true duplicates by r_dateTime <-> circuit <-> power are then removed (see above) to deal with duplicate data files.
#' However this will also remove observations around the DST break which are different moments of time but have the same
#' r_dateTime (as one of them is in the DST break hour) and by chance the same power values in the same circuits. Got that?
#'
#' Due to uncertainties over the timezones and dateTimes, we recommend that analysis should exclude the days on which DST changes occur.
#'
#' #YMMV
#'
#' Loading the resulting saved data from .csv will probably set the tz of r_dateTime to your local time. Be sure to set it
#' correctly using lubridate::with_tz(r_dateTime, tzone = "Pacific/Auckland")
#'
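#' A sketch of the UTC case (illustrative values, not drawn from the data):
#'
#' \preformatted{
#' utc <- lubridate::dmy_hm("06/04/2014 02:30") # parsed as UTC by default
#' nz <- lubridate::with_tz(utc, tzone = "Pacific/Auckland") # same instant, NZ clock time
#' }
#'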
#' @param hh the household id
#' @param fileList list of files to load (assumed to be all from hh)
#'
#' @import dplyr
#' @import data.table
#' @import lubridate
#' @import progress
#' @import reshape2
#'
#' @author Ben Anderson, \email{b.anderson@@soton.ac.uk}
#' @export
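#' @examples
#' \dontrun{
#' # Sketch only: assumes fListToLoadDT & gSpyParams exist in the calling
#' # environment (set up elsewhere in the package processing flow)
#' hhFiles <- fListToLoadDT[hhID == "rf_06", fullPath]
#' hhLongDT <- processHhGridSpyData("rf_06", hhFiles)
#' }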
#'
processHhGridSpyData <- function(hh, fileList){
# Gets all files listed, loads, processes, cleans & merges, saves out in long form
# Does not check that hh matches file path - XX TO DO XX
tempHhDT <- data.table::data.table() # (re)create new hh GridSpy data collector for each loop (hh)
pbF <- progress::progress_bar$new(total = length(fileList)) # set progress bar using n files to load
# X > start of per-file loop ----
for(f in fileList){
if(gSpyParams$fullFb){print(paste0("#--> Loading ", f))}
fDT <- data.table::fread(f)
pbF$tick()
# add hhid for ease of future loading etc
fDT <- fDT[, hhID := hh]
# Fix labels in households where strange things seemed to have happened
if(hh == "rf_24"){
# rf_24 has an additional circuit in some files but value is always NA
#print(paste0("#--> ", hh, ": Fixing rf_24 labels"))
fDT <- fixCircuitLabels_rf_24(fDT)
}
if(hh == "rf_46"){
# rf_46 has 3 different versions of the circuit labels
#print(paste0("#--> ", hh, ": Fixing rf_46 labels"))
fDT <- fixCircuitLabels_rf_46(fDT)
}
# >> set default dateTimes ----
# This will also have consequences for time - esp related to DST:
# smb://storage.hcs-p01.otago.ac.nz/hum-csafe/Research Projects/GREEN Grid/_RAW DATA/GridSpyData/README.txt says:
# "Note that the timestamps may well be problematic - the GridSpy source data used local time,
# and thus there's instability of data around daylight savings time changes."
# e.g. rf_01 file 1Jan2014-24May2014at1.csv at DST switch on April 6th 2014 simply repeats the 02:00 - 03:00 hour.
# So this day has 25 hours in it. The date column is named 'date NZ'
# Large files which seem to have been manually downloaded use 'date NZ' and this
# appears to be local (lived) time with/without DST as appropriate
# The small daily files which were set to auto-download have 'date UTC' as a column name
# and do indeed appear to be UTC. Where this has DST change, no hours are repeated (as the time is UTC)
# See e.g. rf_06 1Apr2018-2Apr2018at1.csv (DST @ 02:00 1st April 2018 Pacific/Auckland)
# Using the pre-inferred dateFormat
fDT <- fDT[, dateFormat := fListToLoadDT[fullPath == f, dateFormat]] # NB: fListToLoadDT is assumed to exist in the calling environment
# what is the date column called?
# Use this to work out which TZ is being applied
if(ncol(dplyr::select(fDT, dplyr::contains("NZ"))) > 0){ # requires dplyr - do any col names contain 'NZ'?
# if so, all rows will be NZ time
TZ_orig <- "date NZ"
setnames(fDT, 'date NZ', "dateTime_orig")
fDT <- fDT[, TZ_orig := "date NZ"]
# Original data is NZ time - what this means for the DST break is unclear
fDT <- fDT[dateFormat %like% "mdy",
r_dateTime := lubridate::mdy_hm(dateTime_orig, tz = "Pacific/Auckland")]
fDT <- fDT[dateFormat %like% "dmy",
r_dateTime := lubridate::dmy_hm(dateTime_orig, tz = "Pacific/Auckland")]
fDT <- fDT[dateFormat %like% "ydm",
r_dateTime := lubridate::ydm_hm(dateTime_orig, tz = "Pacific/Auckland")]
fDT <- fDT[dateFormat %like% "ymd",
r_dateTime := lubridate::ymd_hm(dateTime_orig, tz = "Pacific/Auckland")]
}
if(ncol(dplyr::select(fDT, dplyr::contains("UTC"))) > 0){ # requires dplyr - do any col names contain 'UTC'?
# as above but UTC
TZ_orig <- "date UTC"
setnames(fDT, 'date UTC', "dateTime_orig")
fDT <- fDT[, TZ_orig := "date UTC"]
# Original data is UTC time
fDT <- fDT[dateFormat %like% "mdy",
r_dateTimeUTC := lubridate::mdy_hm(dateTime_orig)] # UTC by default
fDT <- fDT[dateFormat %like% "dmy",
r_dateTimeUTC := lubridate::dmy_hm(dateTime_orig)] # UTC by default
fDT <- fDT[dateFormat %like% "ydm",
r_dateTimeUTC := lubridate::ydm_hm(dateTime_orig)] # UTC by default
fDT <- fDT[dateFormat %like% "ymd",
r_dateTimeUTC := lubridate::ymd_hm(dateTime_orig)] # requires lubridate
fDT <- fDT[, r_dateTime := lubridate::with_tz(r_dateTimeUTC, tzone = "Pacific/Auckland")] # set to NZ
fDT$r_dateTimeUTC <- NULL # not needed, and it messes up the reshape as it may or may not be present
}
fDT$dateFormat <- NULL # no longer needed
# >> set some file stats ----
#print("Getting file stats")
fileStat <- list()
fileStat$hhID <- hh
fileStat$fullPath <- f
fileStat$nObs <- nrow(fDT) # could include duplicates
fileStat$minDateTime <- min(fDT$r_dateTime)
fileStat$maxDateTime <- max(fDT$r_dateTime)
fileStat$TZ_orig <- toString(unique(fDT$TZ_orig))
fileStat$dateFormat <- fListToLoadDT[fullPath == f, dateFormat]
fileStat$mDateTime <- fListToLoadDT[fullPath == f, fMTime]
fileStat$fSize <- fListToLoadDT[fullPath == f, fSize]
# check for the number of circuits - all seem to contain "$"
fileStat$nCircuits <- ncol(dplyr::select(fDT, dplyr::contains("$")))
# check the names of circuits - all seem to contain "$"; sort them to make it easier to compare them
# - this is the only way we have to check if data from different households has been placed in the wrong folder.
fileStat$circuitLabels <- toString(sort(colnames(dplyr::select(fDT, dplyr::contains("$")))))
# >> save file stats ----
#print("Saving file stats")
ofile <- paste0(gSpyParams$fLoadedStats, gSpyParams$suffix, ".csv")
#print(paste0("Saving ", ofile, "..."))
data.table::fwrite(data.table::as.data.table(fileStat), # convert to DT for writing out
ofile,
append = TRUE) # will only write out col names on first pass
# >> rbind data to the hh data collector ----
tempHhDT <- rbind(tempHhDT, fDT, fill = TRUE) # fill just in case there are different numbers of columns or columns with different names (quite likely - circuit labels may vary!)
} # X > end of per file loop ----
print(paste0("#--> ", hh, ": Done, cleaning rbound files"))
#> Switch to long form ----
# this turns each circuit label (column) into a label within 'variable' and
# sets value to be the power measurement
print(paste0("#--> ", hh, ": wide form variables -> ", toString(names(tempHhDT))))
# we then relabel them for clarity
hhLongDT <- reshape2::melt(tempHhDT, id=c("hhID","dateTime_orig", "TZ_orig", "r_dateTime"))
data.table::setnames(hhLongDT, "value", "power")
data.table::setnames(hhLongDT, "variable", "circuit")
print(paste0("#--> ", hh, ": converted to long form"))
# > Force power to be numeric ----
hhLongDT <- hhLongDT[, powerW := as.numeric(power)]
# remove NA after conversion to numeric if present
hhLongDT <- hhLongDT[!is.na(powerW)]
hhLongDT$power <- NULL # remove to save space/memory
print(paste0("#--> ", hh, ": removed powerW = NA"))
# > Remove any duplicates by dateTime, circuit & power ----
dupsBy <- c("r_dateTime", "circuit", "powerW")
nDups <- sum(duplicated(hhLongDT, by = dupsBy)) # count duplicates (NB: anyDuplicated() returns an index, not a count)
pcDups <- round(100*(nDups/nrow(hhLongDT)), 2)
print(paste0("#--> ", hh, ": removing ", nDups, " (", pcDups,"%) duplicates by ", toString(dupsBy)))
hhLongDT <- unique(hhLongDT, by=dupsBy)
setkey(hhLongDT, r_dateTime, circuit) # force dateTime & circuit order
print(paste0("#--> ", hh, ": final long form variables -> ", toString(names(hhLongDT))))
return(hhLongDT) # for saving etc
}
#' Fixes circuit labels in rf_24
#'
#' \code{fixCircuitLabels_rf_24} is a stub that could be used to fix circuit labels.
#' rf_24 has an additional circuit in some files but value is always NA so we ignore them (no fix).
#'
#' @param dt wide-form (original) data.table to fix & return
#'
#' @author Ben Anderson, \email{b.anderson@@soton.ac.uk}
#' @export
#'
#'
fixCircuitLabels_rf_24 <- function(dt){
# rf_24 has an additional circuit in some files but value is always NA so we ignore them
return(dt)
}
#' Fixes circuit labels in rf_46
#'
#' \code{fixCircuitLabels_rf_46} is used to fix circuit labels.
#'
#' Inspection of the raw data shows that rf_46 has several different label sets.
#'
#' rf_46 was not re-used and the similarity of the label-sets seems to indicate typos.
#'
#' It appears that power, imaginary power and voltage were all measured in this household. But the labels are
#' confused. See https://github.com/CfSOtago/GREENGridData/issues/1#issuecomment-520690467
#'
#' This function does nothing (yet). For now it just passes the circuit labels back into the 'clean' data.
#'
#' @param dt wide-form (original) data.table to fix & return
#'
#' @author Ben Anderson, \email{b.anderson@@soton.ac.uk}
#' @export
#'
fixCircuitLabels_rf_46 <- function(dt){
if(gSpyParams$fullFb){print("Checking circuit labels for rf_46")}
# don't do anything for now until we work out what https://github.com/CfSOtago/GREENGridData/blob/master/plots/rf_46_circuitLabels.png means.
# # check if we have the second form of labels - they have 'Power1$4232' in one col label
# checkCols2 <- ncol(dplyr::select(dt,dplyr::contains("Power1$4232")))
# if(checkCols2 == 1){
# # we got label set 2
# if(gSpyParams$fullFb){print(paste0("Found circuit labels set 2 in ", f))}
# setnames(dt, c("Heat Pumps (2x) & Power1$4232", "Heat Pumps (2x) & Power2$4399", # <- old names
# "Hot Water - Controlled1$4231", "Hot Water - Controlled2$4400",
# "Incomer - Uncontrolled1$4230", "Incomer - Uncontrolled2$4401",
# "Incomer Voltage$4405", "Kitchen & Bedrooms1$4229",
# "Kitchen & Bedrooms2$4402","Laundry & Bedrooms1$4228",
# "Laundry & Bedrooms2$4403", "Lighting1$4233", "Lighting2$4404"), # <- new names - are these duplicates too??
# c("Heat Pumps (2x) & Power$4232", "Heat Pumps (2x) & Power$4399", "Hot Water - Controlled$4231",
# "Hot Water - Controlled$4400", "Incomer - Uncontrolled$4230", "Incomer - Uncontrolled$4401",
# "Incomer Voltage$4405", "Kitchen & Bedrooms$4229", "Kitchen & Bedrooms$4402",
# "Laundry & Bedrooms$4228", "Laundry & Bedrooms$4403", "Lighting$4233", "Lighting$4404"))
# }
# # check if we have the third form of labels - they have 'Power_Imag$4399' in one col label
# checkCols3 <- ncol(dplyr::select(dt,dplyr::contains("Power_Imag$4399")))
# if(checkCols3 == 1){
# # we got label set 3
# if(gSpyParams$fullFb){print(paste0("Found circuit labels set 3 in ", f))}
# # be careful to get this order correct so that it matches the label 1 order
# setnames(dt, c("Heat Pumps (2x) & Power$4232", "Heat Pumps (2x) & Power_Imag$4399",
# "Hot Water - Controlled$4231", "Hot Water - Controlled_Imag$4400",
# "Incomer - Uncontrolled$4230", "Incomer - Uncontrolled_Imag$4401", "Incomer Voltage$4405",
# "Kitchen & Bedrooms$4229", "Kitchen & Bedrooms_Imag$4402",
# "Laundry & Bedrooms$4228", "Laundry & Bedrooms_Imag$4403",
# "Lighting$4233", "Lighting_Imag$4404"),
# c("Heat Pumps (2x) & Power$4232", "Heat Pumps (2x) & Power$4399",
# "Hot Water - Controlled$4231", "Hot Water - Controlled$4400",
# "Incomer - Uncontrolled$4230", "Incomer - Uncontrolled$4401", "Incomer Voltage$4405",
# "Kitchen & Bedrooms$4229", "Kitchen & Bedrooms$4402",
# "Laundry & Bedrooms$4228", "Laundry & Bedrooms$4403",
# "Lighting$4233", "Lighting$4404"))
# }
return(dt)
}
#' Loads cleaned grid spy power data into a data.table
#'
#' \code{getCleanGridSpyFile}
#'
#' Loads a clean Grid Spy data file using \code{readr::read_csv}. This enables .gz files to be autoloaded.
#'
#' We STRONGLY advise that you use this function to load the cleaned Grid Spy files as it:
#'
#' - forces dateTime_orig to be col_character to prevent readr attempting to parse it (incorrectly).
#' - forces powerW to be col_double to stop readr assuming an integer from parsing the first few lines
#'
#' We allow readr to auto-parse the r_dateTime column but note that this will set the timezone to the one
#' correct for your current location. This may not be what you intend.
#'
#' Note that you can also use this function to load extracts from the Grid Spy files. It will just
#' complain (throw a warning) if any of the columns it expects are missing.
#'
#' @param f file to load
#'
#' @import data.table
#' @import readr
#'
#' @author Ben Anderson, \email{b.anderson@@soton.ac.uk}
#' @export
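#' @examples
#' \dontrun{
#' # Sketch only: the file path is hypothetical
#' dt <- getCleanGridSpyFile("~/data/cleanGridSpy/rf_06_all_1min_data.csv.gz")
#' # re-set the timezone after loading (see note above)
#' dt[, r_dateTime := lubridate::with_tz(r_dateTime, tzone = "Pacific/Auckland")]
#' }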
#'
#'
getCleanGridSpyFile <- function(f){
dt <- data.table::as.data.table(readr::read_csv(f,
col_types = cols(hhID = col_character(),
linkID = col_character(),
dateTime_orig = col_character(), # <- this is crucial otherwise readr attempts to parse this as a dateTime and FAILS
TZ_orig = col_character(),
r_dateTime = col_datetime(format = ""),
circuit = col_character(),
powerW = col_double() # <- also crucial otherwise readr seems to assume an integer
),
progress = FALSE # no feedback
)
)
return(dt)
}
#' Loads cleaned grid spy power data for a given circuit between two dates into a data.table
#'
#' \code{extractCleanGridSpyCircuit} loops over all clean household grid spy data files and loads each in turn using \code{readr::read_csv}. It filters each file to
#' extract data for particular circuits between two dates and creates some derived time/date variables before
#' using \code{rbind} to create a single data.table which is saved but not returned.
#'
#' Function matches \code{circuitPattern} to extract specific circuits and selects observations between
#' \code{dateFrom} and \code{dateTo}. Use this to extract any circuit you want between any given dates.
#'
#' \code{circuitPattern} is passed to the \code{data.table} operator \code{\%like\%} so wild cards & stuff may work. YMMV
#'
#' Use of \code{readr::read_csv} enables .gz files to be autoloaded and proper parsing of dateTimes.
#'
#' @param fPath location of files to load
#' @param exFile location to save the results
#' @param circuitPattern the circuit pattern to match
#' @param dateFrom date to start extract (inclusive)
#' @param dateTo date to end extract (inclusive)
#'
#' @import data.table
#' @import readr
#' @import hms
#'
#' @author Ben Anderson, \email{b.anderson@@soton.ac.uk}
#' @export
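#' @examples
#' \dontrun{
#' # Sketch only: paths, pattern and dates are hypothetical
#' dataDT <- extractCleanGridSpyCircuit(fPath = "~/data/cleanGridSpy/",
#'    exFile = "~/data/extracts/heatPump_2015.csv",
#'    circuitPattern = "Heat Pump",
#'    dateFrom = "2015-04-01",
#'    dateTo = "2015-04-30")
#' }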
#'
#'
extractCleanGridSpyCircuit <- function(fPath, exFile, circuitPattern, dateFrom, dateTo) {
# check files to load
fPattern <- "*.csv.gz"
print(paste0("#-> Looking for data using pattern = ", fPattern, " in ", fPath, " - could take a while..."))
#> Get the file list as a data.table ----
# This will list all the individual household data files
fListDT <- data.table::as.data.table(list.files(path = fPath, pattern = fPattern))
nFiles <- nrow(fListDT)
print(paste0("#-> Found ", tidyNum(nFiles), " files"))
fListDT <- fListDT[, fullPath := paste0(fPath, V1)] # add in full path as list.files() doesn't return it
filesToLoad <- fListDT[, fullPath]
print(paste0("#-> Looking for circuits matching: ", circuitPattern))
print(paste0("#-> Filtering on date range: ", dateFrom, " - ", dateTo))
# loop over household files in list and rbind them
# rbind into a single data.table
nFiles <- length(filesToLoad)
print(paste0("#-> Loading ",nFiles, " files..."))
# don't use parallel for file reading - no performance gain
# http://stackoverflow.com/questions/22104858/is-it-a-good-idea-to-read-write-files-in-parallel
dataDT <- data.table::data.table()
# file load loop ----
for(f in filesToLoad){# should use lapply but...
print(paste0("#--> Loading ", f))
dt <- getCleanGridSpyFile(f)
# check household id
hh <- unique(dt$linkID)
# remove cols we don't need & which break the rbind if they parsed differently due to different TZs
dt$dateTime_orig <- NULL
dt$TZ_orig <- NULL
# filter on circuit label pattern and dates (inclusive)
filteredDT <- dt[circuit %like% circuitPattern & # match circuitPattern
as.Date(r_dateTime) >= dateFrom & # filter by dateFrom
as.Date(r_dateTime) <= dateTo] # filter by dateTo
print(paste0("#--> ", hh," : Found ", tidyNum(nrow(filteredDT)), " that match -> ", circuitPattern,
" <- between ", dateFrom, " and ", dateTo,
" out of ", tidyNum(nrow(dt))))
if(nrow(filteredDT) > 0){# if any matches...
print("#--> Summary of extracted rows:")
print(summary(filteredDT))
print(table(filteredDT$circuit))
dataDT <- rbind(dataDT, filteredDT)
}
}
print("#-> Finished extraction")
if(nrow(dataDT) > 0){
# we got a match
print(paste0("#-> Found ", tidyNum(nrow(dataDT)),
" observations matching -> ", circuitPattern, " <- in ",
uniqueN(dataDT$linkID), " households between ", dateFrom, " and ", dateTo))
print("#-> Summary of all extracted rows:")
print(summary(dataDT))
#> Save the data out for future re-use ----
print(paste0("#-> Saving ", exFile))
data.table::fwrite(dataDT, exFile)
# compress it
cmd <- paste0("gzip -f ", "'", path.expand(exFile), "'") # gzip it - use quotes in case of spaces in file name, expand path if needed
try(system(cmd)) # in case it fails - if it does there will just be .csv files (not gzipped) - e.g. under windows
print(paste0("#-> Gzipped ", exFile))
} else {
# no matches -> fail
stop(paste0("#-> No matching data found, please check your search pattern (", circuitPattern,
") or your dates..."))
}
print(paste0("#-> Extracted ", tidyNum(nrow(dataDT)), " rows of data"))
# return summary table of DT
print(summary(dataDT))
return(dataDT) # for testing
}
#' Loads cleaned grid spy power data for all households between two dates into a single data.table
#'
#' \code{loadCleanGridSpyData} loops over all clean household grid spy data files and loads each in turn using \code{readr::read_csv}. It filters each file to
#' extract data between two dates and creates some derived time/date variables before
#' using \code{rbind} to create a single data.table which is saved and returned.
#'
#' Function selects observations between
#' \code{dateFrom} and \code{dateTo}.
#'
#' Use of \code{readr::read_csv} enables .gz files to be autoloaded and proper parsing of dateTimes.
#'
#' @param fPath location of files to load
#' @param dateFrom date to start extract (inclusive)
#' @param dateTo date to end extract (inclusive)
#'
#' @import data.table
#' @import readr
#' @import hms
#'
#' @author Ben Anderson, \email{b.anderson@@soton.ac.uk}
#' @export
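#' @examples
#' \dontrun{
#' # Sketch only: path & dates are hypothetical; outPath is assumed to be set
#' # in the calling environment (it is used to save the extract, see body)
#' dataDT <- loadCleanGridSpyData(fPath = "~/data/cleanGridSpy/",
#'    dateFrom = "2015-04-01", dateTo = "2015-04-30")
#' }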
#'
#'
loadCleanGridSpyData <- function(fPath, dateFrom, dateTo) {
# check files to load
fPattern <- "*.csv.gz"
print(paste0("Looking for data using pattern = ", fPattern, " in ", fPath, " - could take a while..."))
#> Get the file list as a data.table ----
fListDT <- data.table::as.data.table(list.files(path = fPath, pattern = fPattern))
nFiles <- nrow(fListDT)
print(paste0("Found ", tidyNum(nFiles), " files"))
fListDT <- fListDT[, fullPath := paste0(fPath, V1)] # add in full path as list.files() doesn't return it
filesToLoad <- fListDT[, fullPath]
print(paste0("# Filtering on date range: ", dateFrom, " - ", dateTo))
# loop over files in list and rbind them
# load into a single data.table
nFiles <- length(filesToLoad)
print(paste0("# Loading ",nFiles, " files..."))
# don't use parallel for file reading - no performance gain
# http://stackoverflow.com/questions/22104858/is-it-a-good-idea-to-read-write-files-in-parallel
dataDT <- data.table::data.table()
# file load loop ----
for(f in filesToLoad){# should use lapply but...
print(paste0("# Loading ", f))
df <- readr::read_csv(f, col_types = cols(
hhID = col_character(),
linkID = col_character(),
dateTime_orig = col_character(), # force to stay character as visual check if you need it
TZ_orig = col_character(), # this is the visual check if you ever need it
r_dateTime = col_datetime(), # parse to date time which _might_ be your local tzone. Check
circuit = col_character(),
powerW = col_double())
) # decodes .gz on the fly, requires readr, auto-parses dateTimes, correctly parses powerW
dt <- data.table::as.data.table(df)
dt[,r_dateTime := lubridate::with_tz(r_dateTime, tzone = "Pacific/Auckland")] # make sure this is set to NZ time otherwise all sorts of confusion will arise
# filter on dates (inclusive)
filteredDT <- dt[as.Date(r_dateTime) >= dateFrom & # filter by dateFrom inclusive
as.Date(r_dateTime) <= dateTo] # filter by dateTo inclusive
print(paste0("# Found: ", tidyNum(nrow(filteredDT)),
" between ", dateFrom, " and ", dateTo,
" out of ", tidyNum(nrow(dt))))
if(nrow(filteredDT) > 0){# if any matches...
print("Summary of extracted rows:")
print(summary(filteredDT))
dataDT <- rbind(dataDT, filteredDT)
}
}
print("# Finished extraction")
if(nrow(dataDT) > 0){
# we got a match
# derived variables ----
print("# > Setting useful dates & times (slow)")
dataDT <- dataDT[, timeAsChar := format(r_dateTime, format = "%H:%M:%S")] # creates a char
dataDT <- dataDT[, obsHourMin := hms::as.hms(timeAsChar)] # creates an hms time, makes graphs easier
dataDT$timeAsChar <- NULL # drop to save space
print(paste0("# Found ", tidyNum(nrow(dataDT)),
" observations in ", uniqueN(dataDT$hhID),
" households between ", dateFrom, " and ", dateTo))
print("Summary of all extracted rows:")
print(summary(dataDT))
#> Save the data out for future re-use ----
fName <- paste0(dateFrom, "_", dateTo, "_observations.csv")
ofile <- paste0(outPath, "dataExtracts/", fName) # NB: outPath is assumed to be set in the calling environment
print(paste0("Saving ", ofile))
data.table::fwrite(dataDT, ofile)
# do not compress so can use fread to load back in
} else {
# no matches -> fail
stop(paste0("No matching data found, please check your search pattern (", circuitPattern,
") or your dates..."))
}
print(paste0("# Loaded ", tidyNum(nrow(dataDT)), " rows of data"))
# return DT
return(dataDT)
}