# Knitr defaults ----
# Do not echo code in the rendered report unless a chunk overrides this.
knitr::opts_chunk$set(echo = FALSE)

# Set start time ----
startTime <- proc.time()

# Libraries ----
# Packages this report needs, passed to the GREENGridData loader in one call.
localLibs <- c("data.table", "ggplot2", "kableExtra", "lubridate")
GREENGridData::loadLibraries(localLibs)

# Local parameters ----
# Byte conversion factors (http://whatsabyte.com/P1/byteconverter.htm)
b2Kb <- 1024
b2Mb <- 1024 * 1024
# Where to put the plots.
# NOTE(review): ggrParams is not created in this file - presumably it is set
# up by the GREENGridData package environment; confirm before running alone.
plotLoc <- paste0(ggrParams$repoLoc, "/docs/plots/")


# Local functions ----

\newpage

About

Report circulation:

License


Citation


History


Support


\newpage

Introduction


This report provides an analysis of circuit level negative and outlier power values to be found in the GREEN Grid project [@stephenson_smart_2017] research data.

There are a number of observations that have recorded either very large or negative power. There are at least two potential reasons for this:

The following analysis of the incidence of outliers and negative values makes recommendations on actions to take.

# Read the household attributes file named in ggrParams and hold it as a
# data.table for the keyed/grouped operations used below.
hhDT <- data.table::as.data.table(readr::read_csv(ggrParams$hhAttributes))

The project research sample comprises `r nrow(hhDT)` households. Table \@ref(tab:sampleSurveys) shows the number for whom valid appliance and survey data is available in this data package. Note that even those which appear to lack appliance data may have sufficient survey data to deduce appliance ownership (see question numbers Q19_* and Q40_*).

# Count households for each combination of location and data availability flags
sampleCounts <- hhDT[
  ,
  .(nHouseholds = .N),
  keyby = .(Location, hasShortSurvey, hasLongSurvey, hasApplianceSummary)
]

kableExtra::kable(sampleCounts, caption = "Sample information") %>%
  kableExtra::kable_styling()

Key attributes

Table \@ref(tab:allHhData) shows key attributes for the recruited sample. Note that two GridSpy monitors were re-used and so require new hhIDs to be set from the date of re-use using the linkID variable. This is explained in more detail in the GridSpy processing report. Linkage between the survey and GridSpy data should therefore always use linkID to avoid errors.

# Columns shown in the sample detail tables
keyCols <- c(
  "hhID", "linkID", "Location", "surveyStartDate", "nAdults",
  "notes", "r_stopDate", "hasApplianceSummary",
  "PV Inverter", "Energy Storage"
)

# All households, sorted by linkID, restricted to the key columns
sampleDetails <- hhDT[order(linkID), ..keyCols]
kableExtra::kable(sampleDetails, caption = "Sample details") %>%
  kableExtra::kable_styling()

# Number of households whose `PV Inverter` attribute is exactly "yes".
# The matches are counted directly: taking the first cell of table() would
# report whichever value sorts first (e.g. "no") and fails when the column is
# entirely missing. na.rm = TRUE leaves households with no value uncounted.
nPV <- sum(hhDT$`PV Inverter` == "yes", na.rm = TRUE)

As we can see there were `r nPV` households with PV inverters. These were:

# Households with a PV inverter, sorted by linkID, showing the keyCols columns
pvHouseholds <- hhDT[`PV Inverter` == "yes"][order(linkID), ..keyCols]
kableExtra::kable(pvHouseholds, caption = "Sample details (PV households)") %>%
  kableExtra::kable_styling()

# make easy to use flag: "Has PV" where `PV Inverter` is "yes", else "No PV"
# (%in% is FALSE for missing values, so those households are "No PV" too)
hhDT <- hhDT[, hasPV := ifelse(`PV Inverter` %in% "yes", "Has PV", "No PV")]

Check outliers

We would rather not load the entire data for all households to do this so instead we start by investigating negative power in extracts for three different circuit labels:

These were extracted from the processed data using the example extraction code provided in this package.

Negative power: Heat Pump

As Figure \@ref(fig:hpBoxplot) shows negative power observations for Heat Pumps are spread across a number of households. It also shows that there is at least one very large outlier reading for rf_27.

# Folder holding the circuit-level data extracts
dataExtracts <- paste0(
  ggrParams$dataLoc,
  "cleanData/safe/gridSpy/1min/dataExtracts/"
)

# Load the Heat Pump extract; use the package reader to ensure proper number
# etc formats
hpFile <- paste0(dataExtracts, "Heat Pump_2015-04-01_2016-03-31_observations.csv.gz")
hpDT <- GREENGridData::getCleanGridSpyFile(hpFile)
summary(hpDT)

# Box plot of power for each linkID, flipped so households run down the y axis
hpBox <- ggplot2::ggplot(
  hpDT,
  ggplot2::aes(x = linkID, y = powerW, group = linkID)
) +
  ggplot2::geom_boxplot()

hpBox + ggplot2::coord_flip()

As we can see from Table \@ref(tab:maxOutlier) there is only 1 value where power > 10kW. We recommend that this value be excluded from analysis.

# Records with power above 10000 W (head() keeps at most the first six rows)
hpOver10kW <- head(hpDT[powerW > 10000])

kableExtra::kable(hpOver10kW, caption = "All Heat Pump records where power > 10kW") %>%
  kableExtra::kable_styling()

# aggregate to hours: min/max power and number of observations per linkID
hourlyHpDT <- hpDT[
  ,
  .(minPower = min(powerW), maxPower = max(powerW), nObs = .N),
  keyby = .(
    linkID,
    obsHourDate = lubridate::floor_date(r_dateTime, unit = "hour")
  )
]

# match to hhDT: key both tables on linkID
setkey(hourlyHpDT, linkID)
setkey(hhDT, linkID)
keepCols <- c("linkID", "hasPV")
#hourlyHpDT <- hhDT[, ..keepCols][hourlyHpDT]

# Tile plot of the linkID-hours whose minimum power is negative.
# NOTE(review): fill = nObs counts every observation in the hour, not only the
# negative ones, and the x axis is a date-time rather than an hour of day -
# confirm both are intended.
hpNegHours <- hourlyHpDT[minPower < 0]
hpNegTile <- ggplot2::ggplot(
  hpNegHours,
  ggplot2::aes(x = obsHourDate, y = linkID, fill = nObs)
) +
  ggplot2::geom_tile()

hpNegTile +
  ggplot2::scale_fill_gradient(low = "green", high = "red") +
  ggplot2::labs(x = "Hour of day")

Turning to the negative values, Figure \@ref(fig:hpNeg) shows that the vast majority of negative power values are due to one household (rf_46, which does not have PV). The others appear on isolated days and are most likely to be due to instrument installation error.

Table \@ref(tab:negHpTable) lists the households where any negative heat pump power values are observed while Figure \@ref(fig:negHpTable) shows a density plot for each household. The latter (together with Figure \@ref(fig:hpNeg)) suggests that the negative values continually observed for rf_46 may constitute some form of intermittent error since positive values were also recorded (c.f. Table \@ref(tab:negHpTable)). The other households on the other hand report a few negative observations suggesting instrument installation error that was quickly corrected.

# Min, mean and max heat pump power for each linkID
negHHsHpDT <- hpDT[
  ,
  .(
    minPower = min(powerW),
    meanPower = mean(powerW),
    maxPower = max(powerW)
  ),
  keyby = .(linkID)
]

# Households with at least one negative reading
hpNegHouseholds <- negHHsHpDT[minPower < 0]
kableExtra::kable(hpNegHouseholds, caption = "Households where negative power observed") %>%
  kableExtra::kable_styling()

# Keep those households with heat-pump-specific column names, keyed on linkID
negHH <- hpNegHouseholds[, .(linkID, minPowerHP = minPower, maxPowerHP = maxPower)]
setkey(negHH, linkID)

# Histogram of the negative readings, one panel per linkID
ggplot2::ggplot(hpDT[powerW < 0], ggplot2::aes(x = powerW, fill = linkID)) +
  ggplot2::geom_histogram() +
  ggplot2::facet_grid(linkID ~ ., scales = "free_y") +
  ggplot2::labs(caption = "Note y scale varies across panel")

We therefore recommend that when analysing heat pump data:

Negative power: Hot Water

As Figure \@ref(fig:hwBoxplot) shows negative power observations for Hot Water are spread across a number of households. In this case though, there are no very large positive outlier readings.

# Load the Hot Water extract; use the package reader to ensure proper number
# etc formats
hwFile <- paste0(dataExtracts, "Hot Water_2015-04-01_2016-03-31_observations.csv.gz")
hwDT <- GREENGridData::getCleanGridSpyFile(hwFile)
summary(hwDT)

# Box plot of power for each linkID, flipped so households run down the y axis
hwBox <- ggplot2::ggplot(
  hwDT,
  ggplot2::aes(x = linkID, y = powerW, group = linkID)
) +
  ggplot2::geom_boxplot()

hwBox + ggplot2::coord_flip()
# aggregate to hours: min/max power and number of observations per linkID
hourlyHwDT <- hwDT[, .(minPower = min(powerW), 
                       maxPower = max(powerW),
                       nObs = .N), 
                   keyby = .(linkID, obsHourDate = lubridate::floor_date(r_dateTime, unit = "hour"))]

# match to hhDT
# Key the hot water table itself; keying hourlyHpDT (the heat pump table) here
# would leave this table untouched and fail if the heat pump chunk had not run.
setkey(hourlyHwDT, linkID)
setkey(hhDT, linkID)
keepCols <- c("linkID", "hasPV")
#hourlyHwDT <- hhDT[, ..keepCols][hourlyHwDT]

# Tile plot of the linkID-hours whose minimum power is negative.
# NOTE(review): fill = nObs counts every observation in the hour, not only the
# negative ones - confirm this is intended.
hwNegTile <- ggplot2::ggplot(hourlyHwDT[minPower < 0], aes(x = obsHourDate, y = linkID, fill = nObs)) +
  geom_tile()

hwNegTile + scale_fill_gradient(low = "green", high = "red") +
  labs(x = "Hour of day")

Turning to the negative values, Figure \@ref(fig:hwNeg) shows that the vast majority of negative power values are again due to one household (rf_46, which does not have PV) but with the addition of a number from rf_14. The others appear on isolated days and as before are most likely to be due to instrument installation error.

Table \@ref(tab:negHwTable) lists the households where any negative hot water power values are observed while Figure \@ref(fig:negHwTable) shows a density plot for each household. Note that these households do not necessarily match to those reported above for heat pumps partly because not all households have heat pumps. The latter (together with Figure \@ref(fig:hwNeg)) suggests that the negative values continually observed for rf_46 may constitute some form of intermittent error since positive values were also recorded (c.f. Table \@ref(tab:negHwTable)). The other households on the other hand (including rf_14) report negative observations on specific days suggesting instrument installation error that was quickly corrected.

# Min, mean and max hot water power for each linkID
negHHsHwDT <- hwDT[
  ,
  .(
    minPower = min(powerW),
    meanPower = mean(powerW),
    maxPower = max(powerW)
  ),
  keyby = .(linkID)
]

# Households with at least one negative reading
hwNegHouseholds <- negHHsHwDT[minPower < 0]
kableExtra::kable(hwNegHouseholds, caption = "Households where negative power observed") %>%
  kableExtra::kable_styling()

# Add the hot water columns to negHH; all = TRUE keeps households that appear
# in either table
dt <- hwNegHouseholds[, .(linkID, minPowerHW = minPower, maxPowerHW = maxPower)]
negHH <- merge(negHH, dt, by = "linkID", all = TRUE)

# Histogram of the negative readings, one panel per linkID
ggplot2::ggplot(hwDT[powerW < 0], ggplot2::aes(x = powerW, fill = linkID)) +
  ggplot2::geom_histogram(binwidth = 10) +
  ggplot2::facet_grid(linkID ~ ., scales = "free_y") +
  ggplot2::labs(caption = "Note y scale varies across panel")

We therefore recommend that when analysing hot water data:

Lighting

As Figure \@ref(fig:lBoxplot) shows negative power observations for Lighting are found in only one household. There are also some relatively large values across at least 2 or 3 households which may indicate that the lighting circuits did not just power lights.

# Load the Lighting extract; use this function to ensure proper number etc
# formats
lDT <- GREENGridData::getCleanGridSpyFile(paste0(dataExtracts, "Lighting_2015-04-01_2016-03-31_observations.csv.gz"))
# Summarise the lighting data just loaded (not the hot water table)
summary(lDT)

# Box plot of power for each linkID, flipped so households run down the y axis
lBox <- ggplot2::ggplot(lDT, aes(x = linkID, y = powerW, group = linkID)) +
  geom_boxplot()

lBox + coord_flip()

This is clear when we consider the distribution of power by household ID and circuit name (Table \@ref(tab:lCheck)). Analysis of 'lighting' for these households should therefore be done with care.

# Min and max power for each linkID and circuit label, largest maximum first
circuitRange <- lDT[
  ,
  .(minPower = min(powerW), maxPower = max(powerW)),
  keyby = .(linkID, circuit)
]
kableExtra::kable(circuitRange[order(-maxPower)], caption = "Power distribution by household and circuit label") %>%
  kableExtra::kable_styling()

# aggregate to hours: min/max power and number of observations per linkID
hourlyDT <- lDT[
  ,
  .(minPower = min(powerW), maxPower = max(powerW), nObs = .N),
  keyby = .(
    linkID,
    obsHourDate = lubridate::floor_date(r_dateTime, unit = "hour")
  )
]

# match to hhDT: key both tables on linkID
setkey(hourlyDT, linkID)
setkey(hhDT, linkID)
keepCols <- c("linkID", "hasPV")
#hourlyDT <- hhDT[, ..keepCols][hourlyDT]

# Tile plot of the linkID-hours whose minimum power is negative.
# NOTE(review): fill = nObs counts every observation in the hour, not only the
# negative ones - confirm this is intended.
lNegHours <- hourlyDT[minPower < 0]
lNegTile <- ggplot2::ggplot(
  lNegHours,
  ggplot2::aes(x = obsHourDate, y = linkID, fill = nObs)
) +
  ggplot2::geom_tile()

lNegTile +
  ggplot2::scale_fill_gradient(low = "green", high = "red") +
  ggplot2::labs(x = "Hour of day")

Turning to the negative values, Figure \@ref(fig:lNeg) shows that all negative power values are again due to one household (rf_46, which does not have PV).

Table \@ref(tab:neglTable) lists the households where any negative lighting power values are observed while Figure \@ref(fig:neglTable) shows a density plot for each household. The latter (together with Figure \@ref(fig:lNeg)) again suggests that the negative values continually observed for rf_46 may constitute some form of intermittent error since positive values were also recorded (c.f. Table \@ref(tab:neglTable)).

# Min, mean and max lighting power for each linkID
negHHslDT <- lDT[
  ,
  .(
    minPower = min(powerW),
    meanPower = mean(powerW),
    maxPower = max(powerW)
  ),
  keyby = .(linkID)
]

# Households with at least one negative reading
lNegHouseholds <- negHHslDT[minPower < 0]
kableExtra::kable(lNegHouseholds, caption = "Households where negative power observed") %>%
  kableExtra::kable_styling()

# Add the lighting columns to negHH; all = TRUE keeps households that appear
# in either table
dt <- lNegHouseholds[, .(linkID, minPowerL = minPower, maxPowerL = maxPower)]
negHH <- merge(negHH, dt, by = "linkID", all = TRUE)

# Histogram of the negative readings, one panel per linkID
ggplot2::ggplot(lDT[powerW < 0], ggplot2::aes(x = powerW, fill = linkID)) +
  ggplot2::geom_histogram(binwidth = 10) +
  ggplot2::facet_grid(linkID ~ ., scales = "free_y") +
  ggplot2::labs(caption = "Note y scale varies across panel")

We therefore recommend that when analysing lighting data:

Overall patterns of negative observations

Table \@ref(tab:allNeg) shows that there is only one household where negative values are recorded on multiple non-PV circuits. This confirms our view that data for rf_46 should be treated separately (see Section \@ref(rf46)) but that other data should be removed on a day by day/household by household basis.

# Combined table of the min/max columns collected in negHH for each circuit type
allNegCaption <- "All households where negative power observed for heat pumps, hot water or lighting"
kableExtra::kable(negHH, caption = allNegCaption) %>%
  kableExtra::kable_styling()

The mysterious case of rf_46 {#rf46}

19th August 2019: updated - see https://github.com/CfSOtago/GREENGridData/issues/1

The analysis above has identified this household's data as being very strange. We detected a lot of negative power values over a long time but they seem to be interspersed with positive values.

We know that rf_46 did not have PV nor anything else of note. Let's take a look at rf_46 a bit more closely...

# Load all 1 minute data for rf_46; use the package reader to ensure proper
# number etc formats
rf46File <- paste0(
  ggrParams$dataLoc,
  "cleanData/safe/gridSpy/1min/data/rf_46_all_1min_data.csv.gz"
)
rf_46DT <- GREENGridData::getCleanGridSpyFile(rf46File)

# Column-wise summary rendered as a table
rf46Summary <- summary(rf_46DT)
kableExtra::kable(rf46Summary, caption = "Summary of grid spy data for rf_46") %>%
  kableExtra::kable_styling()

Figure \@ref(fig:plotMinMaxrf46) shows the max and min power per hour for each circuit. This suggests that most circuits had two monitors of which only one was recording the 'power demand' we are interested in. The second monitor in each pair appears to have been measuring something which was an order of magnitude smaller (see y scales) than the power demand we are interested in and which fluctuated between relatively small positive and negative values.

# Bin each observation into the hour it falls in. floor_date() is used, as for
# the other circuit extracts in this report, so a bin holds hh:00 to hh:59;
# round_date() would instead centre each bin on the hour (hh:30 to hh+1:29).
rf_46DT <- rf_46DT[, obsHourDate := lubridate::floor_date(r_dateTime, unit = "hour")]

# Hourly min and max of powerW / 1000 for each circuit
plotDT <- rf_46DT[, .(minPower = min(powerW/1000),
                      maxPower = max(powerW/1000)),
                  keyby = .(circuit, obsHourDate)]

# One panel per circuit: red line = hourly minimum, blue line = hourly maximum.
# The x label names the hourly bin actually plotted.
p <- ggplot2::ggplot(plotDT, aes(x = obsHourDate)) +
  geom_line(aes(y = minPower), colour = "red") +
  geom_line(aes(y = maxPower), colour = "blue") +
  facet_grid(circuit ~ ., scales = "free_y") +
  labs(y = "Max/min kW",
       x = "Hour (unlabelled for clarity)")

# rotate facet grid labels for clarity
p + theme(strip.text.y = element_text(angle = 0))

This is confirmed by Table \@ref(tab:minMaxTablerf46) which indicates which of each pair of circuit labels should be excluded from any analysis that uses this household.

# Min, mean and max of powerW / 1000 for each circuit label of rf_46
rf46ByCircuit <- rf_46DT[
  ,
  .(
    minkW = min(powerW / 1000),
    meankW = mean(powerW / 1000),
    maxkW = max(powerW / 1000)
  ),
  keyby = .(circuit)
]

kableExtra::kable(
  rf46ByCircuit,
  caption = "Summary of grid spy data for rf_46 by circuit",
  digits = 3
) %>%
  kableExtra::kable_styling()

19th August 2019: updated - see https://github.com/CfSOtago/GREENGridData/issues/1 - we recommend not using rf_46 data in any analysis at the present time.

Runtime

# Time since startTime was recorded; the third entry of a proc.time()
# difference is the elapsed (wall clock) time in seconds
runTime <- proc.time() - startTime
elapsed <- runTime[[3]]

Analysis completed in `r round(elapsed,2)` seconds ( `r round(elapsed/60,2)` minutes) using knitr in RStudio with `r R.version.string` running on `r R.version$platform`.

R environment

R packages used

Session info

# Print the R version, platform and attached/loaded package versions so the
# report records the environment it was built in
sessionInfo()

References



CfSOtago/GREENGridData documentation built on June 10, 2022, 8:03 p.m.