# R/util-regression.R

# Copyright 2016 The Board of Trustees of the Leland Stanford Junior University.
# Direct inquiries to Sam Borgeson (sborgeson@stanford.edu)
# or professor Ram Rajagopal (ramr@stanford.edu)

#library(sandwich) # Contains the NeweyWest function for calculating a modified vcov
#library(lmtest)   # provides coeftest, which can perform the z/t/p tests using NW
#library(cvTools)  # tools for cross validation cvFit

# Model Descriptor and related classes ------------------------------------
#
# Model Descriptors specify all the information required to run one or more
# regression (or other) models and implement a 'run' method that runs the
# model using data from the MeterDataClass instance and returns its results
# in a well specified format, as implemented in summarizeModel.
#' @export
ModelDescriptor = function(name,formula,subset=NULL,preRun=NULL,cvReps=0,keepResiduals=F,step=F,datesToIgnore=NULL){
  if (is.null(preRun)){ preRun = function(r,df) { return( c()) } }
  if (is.null(subset)){ subset     = list(all="TRUE")                }
  obj = list (
    name     = name,
    formula  = formula,
    subset   = subset,
    preRun   = preRun,
    cvReps   = cvReps,
    keepResiduals = keepResiduals
  )

  obj$run = function(resData,df=NULL,doOccModel=F,datesToIgnore=NULL) {
    summaries <- c()
    if(is.null(df)) { df = regressorDF(resData,norm=FALSE) }
    if(! is.null(datesToIgnore)) {
      idZip=paste(resData$id,resData$zip,sep='.')
      #print(paste('datesToIgnore',idZip))
      xDates = datesToIgnore[[idZip]] # idZip lookup
      if (! is.null(xDates)) {
        xd = xDates$dateCounts       # awkward. I forgot that the outliers are saved with lots of additional info
        xdts = xd$dates[xd$Freq > 2]    # dateCounts has the dates and their frequency between 1 and 3 from sliding window regressions
        #print(paste('removing ',length(xdts),'dates'))
        #print(paste('from',nrow(df)))
        df = df[! df$dates %in% xdts,] # keep only entries we are not supposed to ignore
        #print(paste('to',nrow(df)))
      }
    }
    #print(paste('[ModelDescriptor.run]',obj$name,'(',names(obj$subset),'):',obj$formula))
    for(snm in names(obj$subset)) {
      df$sub    = eval(parse(text=obj$subset[[snm]]),envir=df)  # load the subset criteria as text and parse as booleans into the data.frame
      lm.result = lm(obj$formula,df,x=T,y=T,subset=sub==T,na.action=na.exclude) # run the lm on the subset indicated in the data frame
      if(step) {
        lmr = lm('kwh ~ 1',df,subset=sub==T)
        parts = strsplit(obj$formula,'~')[[1]]
        #steps = step(lm(paste(parts[1],'~1'),df,subset=sub==T),scope=paste('~',parts[2]),direction='forward',trace=1)
        steps = step(lmr,scope=list(lower='kwh ~ 1',upper=paste('kwh ~ ',parts[2])),direction='forward',trace=1)

        #print(summary(steps))
        print(anova(steps))
        print(names(anova(steps)))
        print(class(anova(steps)))
      }
      summaries = rbind(summaries, summarizeModel(lm.result,
                                                  df,
                                                  modelDescriptor = obj,
                                                  nm              = obj$name,
                                                  id              = resData$id,
                                                  zip             = resData$zip,
                                                  subnm           = snm,
                                                  cvReps          = obj$cvReps,
                                                  keepResiduals   = obj$keepResiduals,
                                                  formula         = obj$formula,
                                                  doOccModel      = doOccModel,
                                                  subset          = obj$subset[[snm]]) )
    }
    return(list(summaries=summaries)) # using list so other functions can return additional data
  }

  class(obj) = "ModelDescriptor"
  return(obj)
}
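# A minimal usage sketch (hypothetical names; assumes `r` is a MeterDataClass
# instance whose regressorDF() output contains tout65 and HOD columns):
#   md  = ModelDescriptor(name    = 'toutHOD',
#                         formula = 'kw ~ tout65 + HOD - 1',
#                         subset  = list(all = 'TRUE'),
#                         cvReps  = 2)
#   out = md$run(r)          # out$summaries has one row per subset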

# Custom function that runs when print(modelDescriptor) is called.
#' @export
print.ModelDescriptor = function(md) {
  print(paste('ModelDescriptor',md$name))
  print(paste('  Formula:',md$formula))
  print(paste('  Subset:',md$subset,collapse=','))
}

# DescriptorGenerators generate model descriptors and additional regressor columns
# for custom model configurations. For example, the hourly change point model
# returns a separate set of piecewise temperature regressors for each hour.
#' @export
DescriptorGenerator = function(genImpl,name='',formula=NULL,subset=NULL,cvReps=0,terms=NULL,datesToIgnore=NULL,...){
  obj = list (
    name       = name,
    formula    = formula,
    subset     = subset,
    terms      = terms,
    genImpl    = genImpl,
    cvReps     = cvReps,
    datesToIgnore = datesToIgnore
  )

  obj$run = function(r,df=NULL,doOccModel=F) {
    summaries = c()
    other     = c()
    ####
    #### todo: add code to look up and remove "outliers" as determined using the occupancy model
    ####
    if(is.null(df)) { df = regressorDF(r,norm=FALSE) }
    updates = obj$genImpl(r,df,name,formula,subset=subset,cvReps=cvReps,terms=terms,...)
    if(! plyr::empty(updates$regressors)) {
      df = cbind(df,updates$regressors)
    }
    for(modelDescriptor in updates$descriptors) {
      runOut    = modelDescriptor$run(r,df,doOccModel,obj$datesToIgnore) # returns a list, with a named entry 'summaries'
      summaries = rbind(summaries,runOut$summaries)
    }
    # the rest....
    updates['regressors']  <- NULL
    updates['descriptors'] <- NULL
    other = rbind(other,data.frame(updates))
    return(list(summaries=summaries,other=other))
  }
  class(obj) = "DescriptorGenerator"
  return(obj)
}
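# A minimal usage sketch (hypothetical; assumes `r` is a MeterDataClass
# instance): wrap a generator implementation and run the models it defines.
#   dg  = DescriptorGenerator(genImpl = toutDailyCPGenerator,
#                             name    = 'dailyCP',
#                             cvReps  = 2)
#   out = dg$run(r, df = as.daily.df(r))  # list(summaries=..., other=...)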

# Custom print function that runs when print(descriptorGenerator) is called.
#' @export
print.DescriptorGenerator = function(dg) {
  print(paste('DescriptorGenerator',dg$name))
  print(paste('  Subset:',dg$subset,collapse=','))
}

# Custom Model Generators -------------------------------------------------
#
# These *Generator functions define custom data frame columns and ModelDescriptors for
# special-case models.
# Each generator returns a list with members:
# regressors: a data.frame (or compatible) containing additional required regressor columns
# descriptors: a list of ModelDescriptor objects that specify a model formula and other
#              parameters required to run the custom model assuming data is drawn from
#              the cbind() of the original data.frame df and the new regressors.
# other named values: an arbitrary set of additional named fields containing information
#              related to the outputs of the custom pre-processing.
#              For example - the hourly change point model will return the change points
#                            it is using.
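# A minimal sketch of that contract (hypothetical generator; assumes df has
# tout and HOD columns as produced by regressorDF):
#   exampleGenerator = function(r, df, namePrefix, formula, subset=NULL, cvReps=0, terms=NULL, ...) {
#     newCol = as.matrix(pmax(0, df$tout - 70))  # one extra regressor column
#     colnames(newCol) = 'tout70'
#     list(regressors  = newCol,
#          descriptors = list(ModelDescriptor(name=paste(namePrefix,'70',sep=''),
#                                             formula='kw ~ tout70 + HOD - 1',
#                                             subset=subset, cvReps=cvReps)),
#          note        = 'any other named values come back in the `other` output')
#   }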

# lagGenerator calculates a single separate temperature coefficient for the range of lags specified.
# It would need change point detection to fit data reasonably well.
#' @export
lagGenerator = function(r,df,namePrefix,formula,subset=NULL,cvReps=0,terms=NULL,hrs=0:18) {
  lagRegressors = apply(t(hrs),2,function(x) return(lag(df$tout,x))) # construct a regressor matrix with lagged columns
  colnames(lagRegressors) <- paste('tout_L',hrs,sep='')
  lagStr = paste('tout_L',hrs,sep='',collapse='+')
  hourlyLags = paste('kw ~',lagStr,'+ HOD')
  out = list(
    regressors   = lagRegressors,
    descriptors  = list( hourlyLags=ModelDescriptor(
      name=paste(namePrefix,'hourlyLags', sep=''),
      formula=hourlyLags,
      subset=subset,cvReps=cvReps)
    )
  )
  return(out)
}

#' @export
partsGenerator = function(r,df,namePrefix,formula,subset=NULL,cvReps=0,terms=NULL,hrs=0:18) {

  print(names(df))
  cold = subset(df,subset=tout < 60)

  print(cfg$outDir)
  path = file.path(getwd(),cfg$outDir,paste(r$zip,r$id,'parts.png',sep='_'))
  print(path)
  #png(path)
  hist(cold$kw,breaks=30)
  plot(r$tout,r$kw)
  points(cold$tout,cold$kw,col='blue')
  mn  = mean(cold$kw,na.rm=T)
  dev = sd(cold$kw,na.rm=T) # note: dev was referenced below but never defined; assumed here to be the sd of the cold-weather kw
  tRange = floor(min(r$tout,na.rm=T)):floor(max(r$tout,na.rm=T))
  points(tRange,rep(mn,length(tRange)),col='green',type='l')
  points(tRange,rep(mn+ 2*dev,length(tRange)),col='gray',type='l')
  points(tRange,rep(mn- 2*dev,length(tRange)),col='gray',type='l')
  upper = subset(df,kw > mn+2*dev & tout > 60 )
  points(upper$tout,upper$kw,col='red')
  ulm = lm('kw ~ tout',upper)
  points(upper$tout,ulm$fitted.values,col='black',type='l')


  # next:
  # fit upper points.
  # histogram of upper points; select the middle of the highest peak on the histogram
  # refit these
  # intersect lower and upper lines at the CP
  #dev.off()

  lagRegressors = apply(t(hrs),2,function(x) return(lag(df$tout,x))) # construct a regressor matrix with lagged columns
  colnames(lagRegressors) <- paste('tout_L',hrs,sep='')
  lagStr = paste('tout_L',hrs,sep='',collapse='+')
  hourlyLags = paste('kw ~',lagStr,'+ HOD')
  out = list()
  #  regressors   = lagRegressors,
  #  descriptors  = list( hourlyLags=ModelDescriptor(
  #    name=paste(namePrefix,'hourlyLags', sep=''),
  #    formula=hourlyLags,
  #    subset=subset,cvReps=cvReps)
  #  )
  #)
  return(out)
}

# geometricLagGenerator finds the best fit for a single parameter 'a' that determines geometric
# decay for hourly lag terms in the form a^k for the kth lag. This form is consistent
# with a simple physical model of heat transfer through a wall with resistance and
# capacitance.
#' @export
geometricLagGenerator = function(r,df,namePrefix,formula,subset=NULL,cvReps=0,terms=NULL,hrs=18) {
  # grid search for the best-fit geometric decay term such that the coefficient for the kth lag hour is a^k.
  # this can be implemented as a single moving average output with weights a^k.
  # wma(t) = Sum_{k=0}^hrs [ Tout(t-k) * a^k ]
  # returns the r.squared value for a model run with a geometric decay for lagged weights
  modR2 = function(guess,df,hrs) {
    weights = guess ** c(1:hrs)
    weights = weights / sum(weights) # normalize
    wma = ma(df$tout,hrs,weights)
    pieces = regressor.piecewise(wma,c(55,65,75))
    dfwma = cbind(df,wma,pieces)
    #model = lm('kw ~ wma',dfwma) # very different results based on the model used
    # it may be that the lagged Tout calc is incompatible with HOD interactions (one assumes IID, the other does not)
    model = lm(paste('kw ~',paste(colnames(pieces),collapse=" + ",sep=''),'+ HOD - 1'),dfwma)
    return(summary(model)$r.squared)
  }
  # initialize a search over a = (0,1]
  step = 0.05
  guesses = seq(step,1,step) # a coefficients can't be zero

  fits = apply(t(guesses),2,modR2,df,hrs) # run modR2 for all guess values
  bestGuess = guesses[which.max(fits)]    # choose the best one
  #plot(guesses,fits) # see what the fit(a) looks like
  print(paste('best guess for a:',bestGuess))
  weights = bestGuess ** c(1:hrs)
  #weights = rep(1,hrs) # debugging override (uniform weights), disabled so the fitted geometric decay is actually used
  weights = weights / sum(weights) # normalize
  wmaRegressor = as.matrix(ma(df$tout,hrs,weights))
  colnames(wmaRegressor) <- c('ToutWeightedMA')
  pieces = regressor.piecewise(wmaRegressor,c(55,65,75))
  out = list(
    regressors   = cbind(wmaRegressor,pieces),
    fits         = fits,
    descriptors  = list(
      simple=ModelDescriptor( name=paste(namePrefix,'Simple', sep=''),
                              formula=paste('kw ~ ToutWeightedMA'),
                              subset=subset,cvReps=cvReps),
      pieces=ModelDescriptor( name=paste(namePrefix,'Pieces24', sep=''),
                              formula=paste('kw ~',paste(colnames(pieces),collapse=" + ",sep=''),'+ HOD - 1'),
                              subset=subset,cvReps=cvReps)
    )
  )
  return(out)
}
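# Illustration of the geometric weights being searched over (a sketch; `ma`
# is assumed to be this package's weighted moving average helper):
#   a       = 0.8                      # one candidate decay rate
#   weights = a ^ (1:18)               # coefficient for the kth lag is a^k
#   weights = weights / sum(weights)   # normalize so the weights sum to 1
#   # wma(t) = Sum_k Tout(t-k) * a^k is then ma(df$tout, 18, weights)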

# cp24Generator calculates a separate temperature change point for every hour of the day.
# It can also include terms for the sun's sky position (aka solar geometry), currently commented out.
#' @export
cp24Generator = function(r,df,namePrefix,formula,subset=NULL,cvReps=0,terms=NULL,diverge=F,...) {
  #tout65     = pmax(0,tout-65) # TODO: should this be here?
  hourlyFits=hourlyChangePoint(df,as.list(1:24),trange=c(50:(max(df$tout,na.rm=T)-5)))
  cps = hourlyFits['cp',]
  # if the cp model failed the f-test against the no-cp model, don't include it
  # (guarded: nullModelTest is only returned when evalCP's null-model comparison is enabled)
  if('nullModelTest' %in% rownames(hourlyFits)) {
    cps[hourlyFits['nullModelTest',] > 0.05] <- NA
  }
  splitToutList = plyr::alply(cps,.fun=piecewise.regressor,.margin=1,r$tout,diverge=diverge) # get a list of hourly piecewise splits for tout
  # combine the list into a sensible set of regressors
  hourlyCP = c() # hourly change point pieces
  for (hr in 1:length(cps)) { # for every hour, get the piecewise regressors, select just the right hours, and combine with cbind
    splitTout = splitToutList[[hr]]
    if (dim(splitTout)[2] == 1) colnames(splitTout) <- c(paste('tout_',hr,sep='')) # no split made
    if (dim(splitTout)[2] == 2) colnames(splitTout) <- paste(c('tout_lower_','tout_upper_'),hr,sep='') # 2 cols: above and below cp
    hrFilter = df$HOD %in% paste('H',sprintf('%02i',(hr-1)),sep='')
    splitTout[!hrFilter,] <- 0 # zero out values not matching the hour the change point comes from
    hourlyCP = cbind(hourlyCP,splitTout)
  }
  #tic()
  #sg = solarGeom(r$dates,r$zip)
  #toc()
  #newCols = cbind(sg$zone,((sg$zone != 'night') * 1))
  #colnames(newCols) <- c('solarZone','dayTime')
  hourlyCPf  = paste('kw ~',paste(colnames(hourlyCP),collapse=" + ",sep=''),'+ HOD - 1')
  hourlyCPfZ = paste('kw ~',paste(colnames(hourlyCP),collapse=" + ",sep=''),'+ dayTime + DOW - 1')
  #hourlyCPfZ = paste('kw ~',paste(colnames(hourlyCP),collapse=" + ",sep=''),'+ dayTime:solarZone + HOD - 1')
  changePoints = list(id=r$id,changePoints=hourlyFits)
  if(is.character(subset) && subset == 'idxSteps') { # '&&' avoids a zero-length condition when subset is NULL or a list
    step = 30*24 # 30 days
    width = 3 # 3 x 30 days = 90 days
    steps = (1-width):(length(df$idx) %/% (step) -1)
    #if(steps < 0) steps = 0
    subset = list()
    for(i in steps) {
      subset[[paste('sub',i,sep='')]] = paste('idx > ',i*step, '& idx <= ',i*step + step*width)
    }
    print('Generating subsets spanning multiple steps')
    #print(subset)
  }
  out = list(
    regressors   = hourlyCP, # had to wait to add the newCols until after the formula generation above
    #regressors   = cbind(hourlyCP,newCols), # had to wait to add the newCols until after the formula generation above
    changePoints = changePoints,
    descriptors  = list ( #hourlyCP  = ModelDescriptor(
      #  name=paste(namePrefix,'hourlyCP', sep=''),
      #  formula=hourlyCPf,
      #  subset=subset,cvReps=cvReps  ),
      hourlyCPZ = ModelDescriptor(
        name=paste(namePrefix,'hourlyCP',sep=''),
        formula=hourlyCPf,
        subset=subset,cvReps=cvReps,... )
    )
  )
  return(out)
}

# toutPieces24Generator breaks temperature into piecewise segments, broken at 55,65,75F
# and regresses these with different coefficients for every hour of the day.
#' @export
toutPieces24Generator = function(r,df,namePrefix,formula,subset=NULL,cvReps=0,terms='+ HOW - 1',basics=NULL,breaks=c(55,65,75),diverge=F,datesToIgnore=NULL) {
  toutPIECES = regressor.piecewise(r$tout,breaks,diverge=diverge)
  if(is.character(subset) && subset == 'idxSteps') { # '&&' avoids a zero-length condition when subset is NULL or a list
    step = 30*24
    width = 3
    steps = (1-width):(length(df$idx) %/% (step) -1)
    #if(steps < 0) steps = 0
    subset = list()
    for(i in steps) {
      subset[[paste('sub',i,sep='')]] = paste('idx > ',i*step, '& idx <= ',i*step + step*width)
    }
    print('Generating subsets spanning multiple steps')
    #print(subset)
  }
  out = list(
    regressors   = cbind(toutPIECES),
    descriptors  = list(
      Pieces24=ModelDescriptor( # regression with the best fit lag
        name=paste(namePrefix,'24', sep=''),
        formula=paste('kw ~',paste(colnames(toutPIECES),':HOD',collapse='+',sep=''),terms),
        subset=subset,cvReps=cvReps,datesToIgnore=datesToIgnore)
    )
  )
  return(out)
}

# toutPieces24LagGenerator finds a single lag k such that it maximizes corr(kw,lag(tout,k))
# Physical reality suggests that much of the heat energy entering a home comes via lags in
# the thermal mass of the walls, roof, and ceilings.
# This T* modified (lagged) temperature is then broken into piecewise segments for every
# hour of the day.
#' @export
toutPieces24LagGenerator = function(r,df,namePrefix,formula,subset=NULL,cvReps=0,terms=NULL,basics=NULL) {
  # basic lagged correlations are calculated as a part of basicFeatures. Here we pull out the
  # single lag with the max correlation with kW demand
  if(length(basics) == 0) { basics = basicFeatures(r) }
  lagCorrs = basics[grep('^lag[0-9]+',names(basics))]
  lagVal   = max(lagCorrs)
  lagHrs   = which.max(lagCorrs) - 1 # first col is 0 lag
  if(lagVal < 0.2) lagHrs = 0
  print(paste('lag by',lagHrs,lagVal))
  toutL = lag(r$tout,lagHrs)
  lagDiff = toutL - r$tout
  toutPIECESL = regressor.piecewise(toutL,c(55,65,75))

  # define regression formula that uses the piecewise pieces
  piecesf24L     = paste('kw ~',paste(colnames(toutPIECESL),':HOD',collapse=" + ",sep=''),'+ HOD - 1')
  piecesf24Ldiff = paste('kw ~',paste(colnames(toutPIECESL),':HOD',collapse=" + ",sep=''),'+ lagDiff + HOD - 1')
  out = list(
    regressors   = cbind(toutPIECESL,lagDiff),
    lagCorrs      = lagCorrs,
    descriptors  = list(
      Pieces24L=ModelDescriptor( # regression with the best-fit lag
        name=paste(namePrefix,'24L', sep=''),
        formula=piecesf24L,
        subset=subset,cvReps=cvReps),
      Pieces24Ldiff=ModelDescriptor( # regression using the diff between the best fit lag and current temp
        name=paste(namePrefix,'24Ldiff', sep=''),
        formula=piecesf24Ldiff,
        subset=subset,cvReps=cvReps)

    )
  )
  return(out)
}

# toutPieces24MAGenerator finds a single averaging width k such that it maximizes corr(kw,ma(tout,k))
# where ma calculates a moving average using a window of width k. Physical reality suggests that
# much of the heat energy entering a home comes via lags in the thermal mass of the walls, roof,
# and ceilings. It is sensible to presume that the thermal storage allows for averaging over some
# characteristic time period.
# This T* modified (averaged) temperature is then broken into piecewise segments for every
# hour of the day.
#' @export
toutPieces24MAGenerator = function(r,df,namePrefix,formula,subset=NULL,cvReps=0,terms=NULL,basics=NULL) {
  # basic moving average correlations are calculated as a part of basicFeatures. Here we pull out the
  # single averaging window with the max correlation with kW demand
  if(length(basics) == 0) { basics = basicFeatures(r) }
  maCorrs = basics[grep('^ma[0-9]+',names(basics))]
  maVal = max(maCorrs)
  maHrs = which.max(maCorrs)
  if(maVal < 0.2) {
    print('Not much correlation for ma(tout) so now what?')
    maVal = 0 # if there isn't much correlation
  }
  print(paste('moving average width',maHrs,maVal))
  toutMA = ma(r$tout,maHrs)
  toutPIECESMA = regressor.piecewise(toutMA,c(55,65,75))
  # define regression formula that uses the piecewise pieces
  piecesf24MA  = paste('kw ~',paste(colnames(toutPIECESMA),collapse=" + ",sep=''),'+ HOD - 1')
  out = list(
    regressors   = toutPIECESMA,
    maCorrs      = maCorrs,
    descriptors  = list(
      Pieces24MA=ModelDescriptor( # regression with the best-fit moving average window
        name=paste(namePrefix,'24MA', sep=''),
        formula=piecesf24MA,
        subset=subset,cvReps=cvReps)
    )
  )
  return(out)
}

#' @export
toutDailyFixedCPGenerator = function(r,df,namePrefix,formula,subset=NULL,cvReps=0,terms=NULL,basics=NULL,diverge=F) {
  return(toutDailyCPGenerator(r,df,namePrefix,formula,subset=subset,cvReps=cvReps,basics=basics,terms=terms,diverge=diverge,forceCP=65))
}

#' @export
toutDailyNPCPGenerator = function(r,df,namePrefix,formula,subset=NULL,cvReps=0,terms=NULL,basics=NULL,diverge=F) {
  return(toutDailyCPGenerator(r,df,namePrefix,formula,subset=subset,cvReps=cvReps,basics=basics,terms=terms,diverge=diverge,forceCP=c(55,65,75)))
}

#' @export
toutDailyDivergeCPGenerator = function(...) {
  return(toutDailyCPGenerator(...,diverge=T))
}

# toutDailyCP finds a single change point for a model of daily kWh usage or takes any number of forced
# change points as arguments
#' @export
toutDailyCPGenerator = function(r,df,namePrefix,formula,subset=NULL,cvReps=0,terms='+ DOW - 1',basics=NULL,forceCP=NULL,diverge=F) {
  if(is.null(terms)) { terms = ''}
  changeModel = NULL
  if(is.null(forceCP)) {
    changeModel = toutChangePointFast(df=df,reweight=F)
    #print(changeModel)
    modelCP = changeModel[grep('^cp',names(changeModel))]
  }
  else { modelCP = forceCP }
  pieces = regressor.piecewise(df$tout.mean,modelCP,diverge=diverge) # get a list of daily piecewise splits for tout.mean
  nSegs = dim(pieces)[2]
  if (nSegs == 1) colnames(pieces) <- c('tout.mean') # no split made
  if (nSegs == 2) colnames(pieces) <- c('tout.mean_lower','tout.mean_upper') # 2 cols: above and below cp
  if(nSegs > 2) {
    middle = paste('tout.mean_middle_',1:(nSegs-2),sep='')
    colnames(pieces) <- c('tout.mean_lower',middle,'tout.mean_upper')
  }
  # define regression formula that uses the piecewise pieces
  dailyCPf  = paste('kwh ~',paste(colnames(pieces),collapse=" + ",sep=''),terms)
  out = list(
    regressors   = pieces,
    changeModel  = changeModel,
    descriptors  = list(
      DailyCP=ModelDescriptor( # regression with the best fit daily change point
        name=paste(namePrefix,'DailyCP', sep=''),
        formula=dailyCPf,
        subset=subset,cvReps=cvReps)
    )
  )
  return(out)
}

#' @export
toutDailyFlexCPGenerator = function(r,df,namePrefix,formula,subset=NULL,cvReps=0,terms='+ DOW - 1',basics=NULL,forceCP=NULL,diverge=F) {
  # todo: test 1,2,and 3 segment change point models.
  #coolCP = 70
  bestFit = toutDoubleChangePoint(df)
  cp = bestFit[grep('^cp',names(bestFit))]
  pieces = regressor.piecewise(df$tout.mean,cp,diverge=diverge) # get a list of daily piecewise splits for tout.mean
  middle = c()
  nSegs = dim(pieces)[2]
  if(nSegs > 2) middle = paste('tout.mean_middle_',1:(nSegs-2),sep='')
  colnames(pieces) <- c('tout.mean_lower',middle,'tout.mean_upper')
  # define regression formula that uses the piecewise pieces
  dailyCPf  = paste('kwh ~',paste(colnames(pieces),collapse=" + ",sep=''),terms)
  out = list(
    regressors   = pieces,
    changeModel  = bestFit,
    descriptors  = list(
      DailyFlexCP=ModelDescriptor( # regression with the best fit daily change point
        name=paste(namePrefix,'DailyFlexCP', sep=''),
        formula=dailyCPf,
        subset=subset,cvReps=cvReps)
    )
  )
  return(out)
}

# Helper functions --------------------------------------------------------

# this looks really complicated, but it is just converting an arbitrary
# id (numerical, text, or otherwise) into a number suitable for seeding
# random numbers to make cvFold repeatable per id.
idHash = function(obj) {
  md5hash = digest::digest(obj,algo='md5')
  intOfHash = strtoi(substr(md5hash,27,32),base=16)
  return(intOfHash)
}
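# For example (hypothetical id), set.seed(idHash('meter_123')) always seeds
# identically for that id, so CV folds are reproducible per meter across runs.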

# summarize regression model for future use
# designed to be friendly to storing a lot of results in sequence
# so it separates out the simple scalar metrics from the more complicated
# coefficients
#' @export
summarizeModel = function(m,df,modelDescriptor,nm,id,zip,subnm=NULL,cv=F,cvReps=1,doNewey=F,doOccModel=F,keepResiduals=F,formula='',subset='') {
  #lm(m,subset=m$y > 1)
  basics = list()
  basics$id          <- id
  basics$zip         <- zip
  basics$model.name  <- nm        # name of model run
  basics$subset.name <- subnm     # name of sample subset criteria (i.e. "summer" or "afternoon" or "weekdays")
  basics$formula     <- formula   # string of lm model call
  basics$subset      <- subset    # string of lm subset argument
  basics$logLik      <- logLik(m) # log likelihood for the model
  basics$AIC         <- AIC(m)    # Akaike information criterion

  #print(names(m))
  #print(summary(m, correlation=T))
  if(doNewey) {
    basics$BG <- unclass(lmtest::bgtest(m,order=1)) # Breusch-Godfrey test for first-order serial correlation
    DW             <- unclass(lmtest::dwtest(m)) # Durbin-Watson test for serial corr (range is 0-4; 2 means no corr, DW near 1 is cause for concern)
    basics$DW      <- DW$statistic
    basics$DW.pval <- DW$p.value
    basics$coefficientsNW <- unclass(lmtest::coeftest(m,vcov.=sandwich::NeweyWest))
    basics$pacf = unclass(pacf(m$residuals,plot=F))$acf
    basics$pacf.1 <- basics$pacf[1]
    basics$pacf.significance <- qnorm((1 + 0.95)/2)/sqrt(sum(!is.na(m$residuals)))

  }

  s <- c(basics,as.list(summary(m, correlation=FALSE))) # generic list is more friendly for adding to a data.frame
  class(s) <- 'list'         # make sure the class is no longer summary.lm
  s$hist          <- hist(s$residuals,breaks=100,plot=F)
  s$kurtosis      <- timeDate::kurtosis(s$residuals)

  s$total         <- sum(predict(m),na.rm=T)
  s$dates         <- df$dates[eval(parse(text=subset),envir=df)]
  s$residuals     <- residuals(m) # include NAs in the residuals (na.exclude pads them back in)

  if(doOccModel) {
    # note that we can only assume that the length of residuals and dates
    # are the same if the regression uses na.action=na.exclude and we call
    # residuals(m), not s$residuals. If the lengths differ, we will report
    # incorrect dates
    #print('running occ models')
    dens = quantileDensities(s$residuals,df$dates)
    s$occ.hr   = dens$hr
    s$occ.wday = dens$wday
    s$occ.wknd = dens$wknd
    s$occ.wkdy = dens$wkdy
    s$mon      = dens$mon
    mdens = quantileDensities(s$residuals,df$dates,monthly=T)
    s$occ.hr.m   = mdens$hr
    s$occ.wday.m = mdens$wday
    s$occ.wknd.m = mdens$wknd
    s$occ.wkdy.m = mdens$wkdy
    s$mon.m      = mdens$mon
  }


  s$call          <- c()     # lm model call (depends on variable scope and can be junk)
  s$terms         <- c()     # lm model terms (depends on variable scope and can be junk)
  if(keepResiduals) {
    s$prediction    <- predict(m)
  } else {
    s$residuals     <- c()     # residuals scaled by weights assigned to the model
  }
  s$cov.unscaled  <- c()     # p x p matrix of (unscaled) covariances
  #s$aliased      <- c()     # named logical vector showing if the original coefficients are aliased
  s$na.action     <- c()     # get rid of extra meta info from the model
  #s$sigma                   # the square root of the estimated variance of the random error
  #s$adj.r.squared           # penalizing for higher p
  #s$r.squared               # 'fraction of variance explained by the model'
  #s$fstatistic              # 3-vector with the value of the F-statistic with its numerator and denominator degrees of freedom
  #s$df                      # degs of freedom, vector (p,n-p,p*), the last being the number of non-aliased coefficients.
  #s$coefficients            # a p x 4 matrix: cols = coefficient, standard error, t-stat, (two-sided) p-value

  # net contribution of each coefficient
  # todo: there are a lot of negative numbers in this, so the contributions aren't directly interpretable
  if("x" %in% names(m)) {
    s$contribution <- colSums(t(m$coefficients * t(m$x)))
    #s$contributionCP1 <- colSums(t(m$coefficients * t(changeCP(m$x,1))))
    #s$contributionCP2 <- colSums(t(m$coefficients * t(changeCP(m$x,2))))
    #s$contributionCP5 <- colSums(t(m$coefficients * t(changeCP(m$x,5))))
  }

  # k-fold prediction error
  if (cvReps > 0) {
    #s$fold.rmse <- kFold(df,modelDescriptor,K=5)  # hand rolled cross validation function
    # this use of the id as the random number seed might be a bit strange, but it ensures that
    # all cvFits across different model specifications use the same subsets of data
    # this ensures consistency for the purposes of comparison
    s$cv.rmse   <- cvFold(df,modelDescriptor,K=5,R=cvReps,seed=idHash(s$id)) # pre-rolled function from cvTools
    s$cv.mape   <- cvFold(df,modelDescriptor,K=5,R=cvReps,costfn=cvTools::mape,seed=idHash(s$id) ) # pre-rolled function from cvTools

  }
  PLOT_RESIDUALS = F
  if(PLOT_RESIDUALS) {
    tryCatch( {
      kwh = df$kwh
      lowPts = kwh < 1
      png(file.path(getwd(),'residuals',paste(zip,id,'residuals.png',sep='_')))
      op <- par(no.readonly = TRUE)
      par( mfrow=c(2,2), oma=c(2,2,3,0),mar=c(2,2,2,2))# Room for the title
      plot(m$y,col='black',main=paste('R2=',sprintf("%0.2f",s$r.squared),'cv.RMSE=',sprintf("%0.2f",s$cv.rmse)))
      points(m$fitted.values,col='grey')
      plot(s$hist)
      plot(s$residuals,col='blue',main=paste('Residuals; kurtosis=',s$kurtosis))
      pacf(s$residuals,main='PACF')
    },
    error = function(e) { print(e) },
    finally = {
      par(op)
      dev.off()
    } )
  }
  return(s)
}

#' @export
changeCP = function(data,dt=2) {
  # find and alter the temperature components
  # to simulate a change in the setpoint.
  # currently only works for toutCP models.
  # WARNING: doesn't work with negative changes!!!
  upChanged = data[,'tout.mean_upper'] - dt
  data[,'tout.mean_lower'] = data[,'tout.mean_lower'] - pmin(0,upChanged)
  data[,'tout.mean_upper'] = pmax(0,upChanged)
  return(data)
}
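# Worked example (a sketch): with a change point at 65 and dt=2, a tout.mean
# of 70 enters as (lower=65, upper=5); changeCP shifts 2 degrees out of the
# upper piece (upper: 5 -> 3), i.e. a setpoint raised from 65 to 67.
# For tout.mean = 66 (lower=65, upper=1): upChanged = -1, so upper -> 0 and
# lower -> 66; the row total is preserved in both cases.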

#' @title
#' Split a single column of values into multiple columns as determined by group membership
#' 
#' @description 
#' Split a column of data by membership in discrete groups (as with factor levels)
#' into multiple columns containing just the values corresponding to the membership
#' indicator associated with each column, and zeros otherwise.
#' 
#' @return 
#' A matrix of length(sort(unique(membership))) columns that is all
#' zeros except for the regressor values that match the membership values
#' corresponding to the column. This supports regression with separate
#' coefficients for each group defined by the membership.
#' For example, regressor.split(Tout,dates$hour) will return a matrix with 24
#' columns, where the only non-zero entry per row contains the Tout value in
#' the column corresponding to the hour of day it was recorded.
#' 
#' @param regressor The vector of data to be split
#' 
#' @param membership A vector of the same length as the regressor that assigns group membership categorically.
#' If the regressor is a factor, no membership assignment is required.
#' 
#' @export
regressor.split = function(regressor,membership=NULL) {
  mat <- c()
  nm  <- c()
  if(is.null(membership)) {
    if(class(regressor) == 'factor') {
      membership = regressor
      regressor=rep(1,length(regressor))
    }
  }
  for (i in sort(unique(membership))) {
    mat <- cbind(mat,ifelse(membership==i,1,0)) # add a column of 1's and 0's
    nm <- c(nm,i)
  }
  colnames(mat) <- nm
  return(mat * regressor)
}
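# Example (a sketch):
#   regressor.split(c(1.5, 2.5, 3.5), c('a', 'b', 'a'))
#   #        a   b
#   # [1,] 1.5 0.0
#   # [2,] 0.0 2.5
#   # [3,] 3.5 0.0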

#' @title
#' Break a vector into piecewise regressor columns, using the specified breaks
#' 
#' @description 
#' Break a vector out for continuous piecewise regression (i.e. the fitted
#' segments will join each other at their endpoints) into a matrix whose row
#' totals are the original values, but whose columns divide the value across
#' a set of bins, so 82 across bins with boundaries c(50,60,65,70,80,90)
#' becomes the row 50,10,5,5,10,2,0, which sums to 82.
#' This is very useful for finding rough change points in thermal response,
#' as is expected for buildings with clear setpoints.
#' 
#' @param regressor The single array of regressor values to be broken out into piecewise columns.
#' 
#' @param bins - numerical array whose values are the break points defining the piecewise regressors
#' 
#' @param diverge - Default FALSE: whether the first column contains the distance of
#' the value from the bottom change point (rather than from 0)
#'
#' @details
#' TODO: This can create a column of zeros, which should break the regression
#' so we might need to prune the columns when we're done and keep track of
#' which bins are in play when comparing across regressions
#' 
#' @seealso \code{\link{piecewise.regressor}} for 'apply' syntax
#'
#' @export
regressor.piecewise = function(regressor,bins,diverge=F) {
  if(any(is.na(bins))) return(as.matrix(regressor)) # if bins itself is NA or any of its values are NA, return the original data
  binLower = 0
  mat <- c()
  nm = c()
  for(binUpper in c(bins,Inf)) {
    col = regressor - binLower
    # this is to avoid columns of zeros when the bins are outside the data
    # values are scaled by the first bin value to make sure they aren't too
    # big or too small for the data
    smalls = bins[1] * runif(length(col),0.00001,0.00009)
    negs = col < 0
    negs[is.na(negs)] = F # force NA values to become False boolean values
    col[negs] = 0 #smalls[negs]
    col[(col > binUpper-binLower)] = binUpper-binLower
    mat = cbind(mat,col)
    nm = c(nm,paste('tout',binLower,'_',binUpper,sep=''))
    binLower = binUpper
  }
  if(diverge) {
    mat[,1] = pmax(bins[1] - regressor,0) # the first column contains the distance of
    # the value from the bottom change point (rather than from 0)
  }
  colnames(mat) <- nm
  return(mat)
}
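# Example (a sketch), matching the description above:
#   regressor.piecewise(82, c(50,60,65,70,80,90))
#   #      tout0_50 tout50_60 tout60_65 tout65_70 tout70_80 tout80_90 tout90_Inf
#   # [1,]       50        10         5         5        10         2          0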

#' @title
#' Break a vector into piecewise regressor columns, using the specified breaks (aka bins)
#' 
#' @description 
#' convenience reordering of \code{\link{regressor.piecewise}} putting the bins first for apply style calls...
#' 
#' Break a vector out for continuous piecewise regression (i.e. the fitted
#' segments will join each other at their endpoints) into a matrix whose row
#' totals are the original values, but whose columns divide the value across
#' a set of bins, so 82 across bins with boundaries c(50,60,65,70,80,90)
#' becomes the row 50,10,5,5,10,2,0, which sums to 82.
#' This is very useful for finding rough change points in thermal response,
#' as is expected for buildings with clear setpoints.
#' 
#' @param bins - numerical array whose values are the break points defining the piecewise regressors
#'
#' @param regressor The single array of regressor values to be broken out into piecewise columns.
#' 
#' @param diverge - Default FALSE: whether the first column contains the distance of
#' the value from the bottom change point (rather than from 0)
#'
#' @details
#' This can be used in an \code{apply} setting to run a parametric grid search of break points.
#' 
#' TODO: This can create a column of zeros, which should break the regression
#' so we might need to prune the columns when we're done and keep track of
#' which bins are in play when comparing across regressions
#' 
#' @seealso \code{\link{regressor.piecewise}} for 'normal' syntax
#'
#' @export
piecewise.regressor = function(bins,regressor,...) return(regressor.piecewise(regressor, bins,...))
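# Apply-style example (a sketch; assumes `df` came from regressorDF):
# piecewise splits for several candidate change points in one call:
#   lapply(list(60, 65, c(55,75)), piecewise.regressor, regressor = df$tout)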

# pre-compute holidays to save time below - the dates must be a superset of all potential dates.
# holidayNYSE is a function from the timeDate package
library(timeDate)
hdays = as.Date(holidayNYSE(2008:2011))

#' @title
#' data.frame formatting of \code{\link{MeterDataClass}} data
#' 
#' @description 
#' Given a MeterDataClass instance (meterData), regressorDF returns a data.frame consisting of a standard set of
#' regressor columns suitable for passing into a call to lm.
#' 
#' @param meterData The meter data class to be converted
#' 
#' @export
regressorDF = function(meterData,norm=FALSE,rm.na=FALSE) {
  wday       = meterData$dates$wday
  WKND       = c('WK','ND')[(wday == 0 | wday == 6) * 1 + 1] # weekend indicator
  dateDays   = as.Date(meterData$dates)
  vac        = factor(dateDays %in% hdays)
  hStr       = paste('H',sprintf('%02i',meterData$dates$hour),sep='')
  hwkndStr   = paste(WKND,hStr,sep='')
  dStr       = paste('D',wday,sep='')
  mStr       = paste('M',meterData$dates$mon,sep='')
  howStrs    = paste(dStr,hStr,sep='')
  idx        = 1:length(meterData$kw)
  MOY        = factor(mStr,levels=sort(unique(mStr)))         # month of year
  DOW        = factor(dStr,levels=sort(unique(dStr)))         # day of week
  HOD        = factor(hStr,levels=sort(unique(hStr)))         # hour of day
  HODWK      = factor(hwkndStr,levels=sort(unique(hwkndStr))) # hour of day for weekdays and weekends
  HOW        = factor(howStrs,levels=sort(unique(howStrs))) # hour of week
  tout       = meterData$tout
  pout       = meterData$pout
  rain       = meterData$rain
  dp         = meterData$dp
  rh         = meterData$rh
  kw         = meterData$kw

  tout65     = pmax(0,tout-65) # todo: we know that 65 (or any other fixed change point) is a bad assumption,
  # but it is commonly made. Maybe we should eliminate this, but maybe we
  # should keep it to allow for comparisons with other models...

  if(norm) kw = meterData$norm(kw)

  #kw_min = kw - quantile(kw,na.rm=TRUE,c(0.02)) # remove the min for regression w/o const term
  print(paste(length(pout),length(rain),length(rh),length(meterData$dates),length(vac),length(MOY)))
  df = data.frame(
    #tout65_l1 = lag(tout65,1),
    #tout65_l3 = lag(tout65,3),
    #tout_d1 = diff2(tout,1),
    #tout65_d1 = diff2(tout,1)*(tout65 > 0),
    #tout_d3 = diff2(tout,3),
    #kw_min=kw_min,
    idx=idx,
    kw=kw,
    tout=tout,
    tout65=tout65,
    pout=pout,
    rain=rain,
    rh=rh,
    dates=meterData$dates,
    vac=vac,
    MOY,DOW,HOD,HODWK,HOW,WKND   )
  if(rm.na) { df = df[!rowSums(is.na(df)),] }
  #df = cbind(df,toutPIECES) # add the columns with names from the matrix to the df
  return(df)
}
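# Example (a sketch; assumes `r` is a MeterDataClass instance):
#   df  = regressorDF(r, norm=FALSE)
#   fit = lm('kw ~ tout65 + HOD - 1', df)  # fixed 65F change point, hourly shape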

#' @export
rDFG = function(meterData,norm=F,bp=65,rm.na=FALSE) {
  df = data.frame(therms=meterData$therms,tout=meterData$gasTout)
  df$tout.65 = pmax(0,65 - df$tout)
  return(df)
}

#' @export
rDFA = function( meterData, norm=F, bp=65, rm.na=FALSE ) {
  return( as.daily.df( meterData, norm, bp, rm.na )  )
}

#' @export
# TODO: this is the main function in use for converting a MeterDataClass into a data frame. Rename it to something more descriptive
as.daily.df = function(meterData,norm=F,bp=65,rm.na=FALSE) {
  numDays = dim(meterData$kwMat)[1]
  dts  = meterData$dates
  days = as.POSIXlt(paste(meterData$days,'00:00'),format='%Y-%m-%d %H:%M' )
  #days = dts[(0:(numDays-1)*24)+1]
  df = data.frame(day = format(days,'%Y-%m-%d'))
  df$mon   = days$mon
  df$wday  = days$wday
  df$DOW   = paste('D',days$wday,sep='')  # Su=0 ... Sa=6
  df$DOW   = factor(df$DOW, levels=sort(unique(df$DOW)))
  df$MOY   = format(days,'%y-%m')   # as.POSIXlt(df$dates)$mon
  df$MOY   = factor(df$MOY, levels=sort(unique(df$MOY)))
  df$WKND  = (df$wday == 0 | df$wday == 6) * 1 # weekend indicator
  df$kwh   = meterData$daily('kw',sum)
  df$kw.mean = meterData$daily('kw',mean)
  df$kw.max = meterData$daily('kw',max)
  df$CDH = meterData$daily('tout',function(tout,bp=65,na.rm=T) sum(pmax(0,tout-bp),na.rm=na.rm))
  df$HDH = meterData$daily('tout',function(tout,bp=65,na.rm=T) sum(pmax(0,bp-tout),na.rm=na.rm))
  w = meterData$weather
  dayMatch = match(as.Date(df$day),as.Date(w$dayStats$day))

  # "day" "tout_mean" "pout_mean" "rain_mean" "dp_mean" "tout_min" "pout_min" "rain_min" "dp_min" "tout_max" "pout_max" "rain_max" "dp_max"
  # todo: add NAs for days that are in df$day, but not dayStats (caused by > 24 hrs of data missing)
  df$tout.mean  = w$dayStats[dayMatch,'tout_mean'] 
  df$tout.min   = w$dayStats[dayMatch,'tout_min']
  df$tout.max   = w$dayStats[dayMatch,'tout_max']
  df$tout.mean.65 = pmax(0,df$tout.mean-65)
  tPieces = regressor.piecewise(df[['tout.mean']],c(65))
  df$tout.mean.65lower = tPieces[,1] # lower data for fixed 65 CP
  df$tout.mean.65upper = tPieces[,2] # upper data for fixed 65 CP
  df$tout.mean.65.l1 = lag(pmax(0,(df$tout.mean-65)),1) # 1 day lagged tout.mean over 65F
  dl            = w$dayLengths[dayMatch,'dayMeans']
  df$day.length = NULL # day length is optional because it takes a long time to compute
  if(length(dl) > 0) { df$day.length = dl - min(dl) }
  #   df$tout.mean = meterData$daily('tout',mean)
  #   df$tout.max  = meterData$daily('tout',max)
  #   df$tout.min  = meterData$daily('tout',min)
  df$pout.mean = w$dayStats[dayMatch,'pout_mean']
  df$pout.min  = w$dayStats[dayMatch,'pout_min']
  df$pout.max  = w$dayStats[dayMatch,'pout_max']
  #   df$pout.mean = meterData$daily('pout',mean)
  #   df$pout.max  = meterData$daily('pout',max)
  #   df$pout.min  = meterData$daily('pout',min)
  #   df$rh.mean = w$rh(df$tout.mean,w$dayMeans[dayMatch,'dp'])
  df$rh.mean = meterData$daily('rh',mean)
  #df$rh.max  = meterData$daily('rh',max)
  #df$rh.min  = meterData$daily('rh',min)
  # add vacation day flags
  # holidayNYSE is a function from the timeDate package
  #hdays      = as.Date(holidayNYSE((days[1]$year+1900):(days[length(days)]$year+1900)))
  df$vac  = factor(as.Date(days,tz="PST8PDT") %in% hdays)

  return(df)
}
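# Example (a sketch; assumes `r` is a MeterDataClass instance):
#   dd  = as.daily.df(r)
#   fit = lm('kwh ~ tout.mean.65lower + tout.mean.65upper + DOW - 1', dd)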
# Given a MeterDataClass instance, returns a list of data.frames with rows aggregated to 1 per day,
# typically via averaging. These can be used as input into daily or monthly regression models.
regressorDFAggregated = function(meterData,norm=F,bp=65,rm.na=FALSE) {
  # uses melt and cast to reshape and aggregate data
  df = meterData$df() # kw, tout, dates
  if(norm) df$kw_norm = meterData$norm(df$kw)
  df$kw_min = df$kw - quantile(df$kw,na.rm=TRUE,c(0.02)) # remove the min for regression w/o const term

  df$pout  = meterData$pout
  dp       = meterData$dp
  df$rh    = meterData$rh

  df$day   = format(df$dates,'%Y-%m-%d') # melt has a problem with dates
  df$wday  = as.POSIXlt(df$dates)$wday   # raw for subsetting Su=0 ... Sa=6
  df$DOW   = paste('D',df$wday,sep='')  # Su=0 ... Sa=6
  df$WKND  = (df$wday == 0 | df$wday == 6) * 1 # weekend indicator
  df$DOW   = factor(df$DOW, levels=sort(unique(df$DOW)))
  month    = format(df$dates,'%y-%m')   # as.POSIXlt(df$dates)$mon
  df$mon   = as.POSIXlt(df$dates)$mon   # raw month data for subset functions Jan=0 ... Dec=11
  df$MOY   = factor(month, levels=sort(unique(month)))
  df <- subset(df, select = -c(dates) )  # melt has a problem with dates but we don't need anymore
  # melt and cast (from the reshape package) to reshape data into monthly and daily time averages
  dfm = melt(df,id.vars=c("day",'DOW','MOY','mon','wday','WKND'),measure.vars=c('kw','tout','pout','rh'),na.rm=TRUE)

  monthly = cast(dfm,MOY + mon ~ variable,fun.aggregate=c(sum,mean,function(ar1,bp=65) sum(ar1 > bp),function(ar2,bp=65) sum(ar2 < bp)),subset= variable %in% c('kw','tout'))
  colnames(monthly) <- c('MOY','mon','kwh','kw.mean','junk1','junk2','junk3','tout.mean','CDH','HDH')
  monthly <- subset(monthly, select = -c(junk1, junk2, junk3) )
  #monthly = c()
  daily = cast(dfm, MOY + day + DOW + mon + wday + WKND ~ variable,fun.aggregate=c(sum,mean,max,function(ar1,bp=65) sum(ar1 > bp),function(ar2,bp=65) sum(ar2 < bp)),subset= variable %in% c('kw','tout','pout','rh'))
  colnames(daily) <- c('MOY','day','DOW','mon','wday','WKND','kwh','kw.mean','kw.max','junk1','junk2','junk3','tout.mean','tout.max','CDH','HDH','junk4','pout.mean','pout.max','junk5','junk6','junk7','rh.mean','rh.max','junk8','junk9')
  daily <- subset(daily, select = grep("^junk", colnames(daily), invert=TRUE) )

  # add vacation day flags
  dateDays   = as.Date(daily$day,tz="PST8PDT")
  # holidayNYSE is a function from the timeDate package
  hdays      = as.Date(holidayNYSE(min(as.POSIXlt(dateDays)$year+1900):max(as.POSIXlt(dateDays)$year+1900)),tz="PST8PDT")
  daily$vac  = factor(dateDays %in% hdays)

  if(FALSE) {
    M <- rbind(c(1, 2), c(3, 4), c(5, 6))
    layout(M)
    pacf(daily$kwh)
    acf(daily$kwh)
    plot(daily$pout.mean, daily$kwh)
    plot(daily$rh.mean, daily$kwh)
    plot(daily$tout.max, daily$kwh)
    plot(daily$kwh,type='l',main=paste('',meterData$id))
    Sys.sleep(1)
  }
  if(rm.na) {
    daily   = daily[!rowSums(is.na(daily)),]
    monthly = monthly[!rowSums(is.na(monthly)),]
  }
  return(list(daily=daily,monthly=monthly))
}

# Change point helper functions -------------------------------------------
#' @export
hourlyChangePoint = function(df,hourBins=list(1:24),trange=NULL,fast=T,reweight=F) {
  # hourBins should be a list of n numeric arrays such that each member of the list
  # is 1 or more hours of the day to use with the subset command to get
  # n change point estimates. So the default list(1:24) corresponds to using
  # all the data. Whereas as.list(1:24) would fit 24 subsets...
  if(class(hourBins) != 'list') hourBins = as.list(hourBins)
  if(fast) {
    cps = sapply(hourBins,toutChangePointFast,subset(df),trange,reweight)
  }
  else {
    cps = sapply(hourBins,toutChangePoint,subset(df),trange,reweight)
  }
  return(cps)
}
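# Example (a sketch; `df` from regressorDF): one change point per hour of day
#   fits = hourlyChangePoint(df, as.list(1:24), trange=50:90)
#   cps  = fits['cp', ]   # 24 change point estimates, one per hour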

# this runs all the models in trange and chooses the minimum one.
# likely a waste of CPU on models past the min. See the faster impl below
#' @export
toutChangePoint = function(hrs=NULL,df,trange=NULL,reweight=F) {
  toutStr = 'tout'
  if(! is.null(hrs)) {
    sub = df$HOD %in% paste('H',sprintf('%02i',(hrs-1)),sep='') # pull out hrs subset (#0-23 in the df)
    df = df[sub,]
    df = df[!is.na(df$kw),]
  }
  else { toutStr = 'tout.mean' }
  if(is.null(trange)) {
    rng = floor(quantile(df[[toutStr]],c(0.1,0.90),na.rm=T))
    trange = c( rng[1]:rng[2]  )
  }
  steps = sapply(trange,FUN=evalCP,df,reweight)  # run all the models in the range
  #print(steps['SSR',])
  bestFit = steps[,which.min(steps['SSR',])] # find and return the min SSR in the range
  return(bestFit)
}

# find all minima by scanning all values
allMins = function(trange,fn,valCol='SSR',tCol='cp', ...) {
  cur.val = fn(trange[1],...)
  old.delta = 1 # as long as it starts positive, we are fine
  mins = c()
  for(cur.t in trange[-1]) {
    #print(cur.t)
    old.val = cur.val
    cur.val = fn(cur.t,...)
    cur.delta = cur.val[valCol] - old.val[valCol]
    #print(c(old.delta,cur.delta))
    if(old.delta < 0 & cur.delta >= 0) { # if it was descending, but isn't anymore
      mins = rbind(mins,old.val) # record the min
    }
    old.t     = cur.t
    old.delta = cur.delta
  }
  rownames(mins) <- NULL
  return(mins)
}

#' @export
descend = function(trange,fn,valCol='SSR',tCol='cp',default.t=60, ...) {
  results = data.frame( trange )
  names(results) = tCol
  if ( default.t %in% trange[-1] ) { # see if the default is at least one in from the lower bound
    start = default.t
  } else {
    start = trange[floor(length(trange)/2)] # use the mid point
  }
  cur.t        = start - 1
  start.val    = fn(start,...)
  neighbor.val = fn(cur.t,...)
  # to initialize, we need to figure out which way is down
  # the rest of the code assumes that cur.t is down slope, so we need
  # to switch labels at the start to ensure that condition
  if(start.val[valCol] < neighbor.val[valCol]) { # we are descending from neighbor towards start
    old.t   = cur.t
    old.val = neighbor.val
    cur.t   = start
    cur.val = start.val

  } else {
    old.t   = start
    old.val = start.val
    #cur.t   = cur.t # unchanged
    cur.val = neighbor.val

  }
  while(cur.t %in% trange) { # note %in% is checking for membership, not looping through trange values
    if ( cur.val[valCol] < old.val[valCol] ) { # keep going
      dt      = cur.t - old.t
      old.t   = cur.t
      old.val = cur.val
      cur.t   = cur.t + dt
      cur.val = fn(cur.t, ...)
    } else { # if they are equal or greater, we found the (a?) min
      return(old.val)
    }
  }
  cur.val[tCol] = NA
  return(cur.val) # return the last value computed, which is by definition the lowest
}
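# Self-contained example (a sketch): descend over a toy convex curve; fn must
# return a named vector containing valCol:
#   sq = function(t) c(cp = t, SSR = (t - 63)^2)
#   descend(55:75, sq, valCol='SSR', tCol='cp', default.t=60)
#   # walks downhill from 60 and returns c(cp=63, SSR=0)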

#' @export
toutChangePointFast2 = function(hrs=NULL,df,trange=NULL,reweight=F,method='descend',warn=T) {
  toutStr = 'tout'
  if(! is.null(hrs)) {
    sub = df$HOD %in% paste('H',sprintf('%02i',(hrs-1)),sep='')  # define hrs subset (#0-23 in the df)
    df = df[sub,]                                                # pull out the subset from the df
    df = df[!is.na(df$kw),]                                      # no NA's. Breaks the algorithm
  }
  else { toutStr = 'tout.mean' }
  if(is.null(trange)) {
    rng = floor(quantile(df[[toutStr]],c(0.1,0.90),na.rm=T))
    trange = c( rng[1]:rng[2]  )
  }
  if(method=='descend') {
    return( descend(trange,evalCP,valCol='SSR',tCol='cp',default.t=60,df,reweight) )
  } else if (method == 'allMins') {
    return( allMins(trange,evalCP,valCol='SSR',tCol='cp',df,reweight) )
  } else {
    print(paste('toutChangePointFast2: Unrecognized method',method))
    return(NULL)
  }

}

# this takes advantage of the fact that for one change point,
# the SSR will be convex so it stops when the change in SSR is positive
# note that each cp in trange could be a vector like c(40,50,60,70) or just a number
toutChangePointFast = function(hrs=NULL,df,trange=NULL,reweight=F,warn=T) {
  toutStr = 'tout'
  if(! is.null(hrs)) {
    sub = df$HOD %in% paste('H',sprintf('%02i',(hrs-1)),sep='')  # define hrs subset (#0-23 in the df)
    df = df[sub,]                                                # pull out the subset from the df
    df = df[!is.na(df$kw),]                                      # no NA's. Breaks the algorithm
  }
  else { toutStr = 'tout.mean' }
  if(is.null(trange)) {
    rng = floor(quantile(df[[toutStr]],c(0.1,0.90),na.rm=T))
    trange = c( rng[1]:rng[2]  )
  }

  prev = c(cp=-1,SSR=Inf)                                      # init the compare options
  warnMulti = F
  for(cp in trange) {
    #if(length(cp)>1) warnMulti = T # there is no guarantee against global minima
    out = evalCP(cp,df,reweight)                               # run piecewise regression
    if(out['SSR'] > prev['SSR']) { # the previous value was the min
      #plot(df$tout,df$kw,main=paste('Hr',paste(hrs,collapse=',')))
      #points(quickEst(cp,prev['(Intercept)'],prev['lower'],prev['upper']),type='l')
      if(warnMulti) {
        print(paste('warning: toutChangePointFast returning multiple change points: ',
                    paste(prev[grep('^cp[0-9]+',names(prev))],collapse=','),'. ',
                    'May be a local minimum. Consider toutChangePoint instead.',sep=''))
      }
      return(prev)
    }
    prev = out
  }
  # failed search
  if (warn) {
    print(paste('Warning. SSR min not found for hr ',paste(hrs,collapse=','),
                '. Increase your temperature range? Returning highest value: ',
                paste(prev[grep('^cp',names(prev),value=T)],collapse=','),sep=''))
  }
  return(prev)
}


toutDoubleChangePoint = function(df) {
  tMin = floor(min(df$tout.mean,na.rm=T)+5)
  tMax = floor(max(df$tout.mean,na.rm=T)-5)
  changeModels = lapply(tMin:tMax,
                        function(coolCP) {
                          # note: ':' binds tighter than '-', so this is ((coolCP-1):tMin) - 1, made explicit here
                          trange = lapply(((coolCP-1):tMin) - 1, function(x) c(x,coolCP))
                          toutChangePointFast(df=df,trange=trange,reweight=F,warn=F)
                        })
  cm = do.call(rbind,changeModels)
  bestFit = cm[which.min(cm[,'SSR']),]
  return(bestFit)
}

#' @export
evalCP = function(cp,df,reweight=F) {
  out = list()
  lhs = 'kw'
  toutStr = 'tout'
  if(! is.null(df$kwh)) {
    lhs = 'kwh'
    toutStr = 'tout.mean'
  }
  tPieces = regressor.piecewise(df[[toutStr]],c(cp))
  middle = c()
  nSegs = dim(tPieces)[2]
  if(nSegs > 2) middle = paste('middle_',1:(nSegs-2),sep='')
  colnames(tPieces) <- c('lower',middle,'upper')
  # calculate weights such that each partition of Tout data makes the same
  # potential contribution to the SSR. So if, for example the data is partitioned
  # into 1/4 and 3/4 of obs in each, the minority data would be weighted at 3x
  n = dim(tPieces)[1]
  m = dim(tPieces)[2]
  df$w = rep(1,n) # default weights
  if(reweight) {
    # find the index of the last column with a non-zero value
    # for a piecewise regressor, this happens to be the number of non-zero
    # values per row
    highestCol = rowSums(tPieces != 0,na.rm=T)
    colCounts  = table(highestCol)
    # only alter the weights when all columns are participating
    if(length(colCounts) == m) {
      names(colCounts) <- colnames(tPieces)
      nobs       = sum(colCounts)
      ncols      = length(colCounts)
      colWeights = (nobs/ncols) / colCounts # spread equal weight across segments
      # even if one has fewer obs than the other
      #print(paste(sum(colWeights[highestCol]),nobs))
      #print(colWeights)
      if(colWeights[1] < 1) { # only re-weight if it improves cooling estimate
        df$w = colWeights[highestCol]
      }
    }
  }
  #colSums(tPieces != 0) # debugging leftover; result was unused
  df = cbind(df,tPieces) # add the columns from the matrix to the df
  # define regression formula that uses the piecewise pieces
  #    f_0  = paste(lhs,'~',toutStr)
  f_cp = paste(lhs,'~',paste(colnames(tPieces),collapse=" + ",sep=''))
  #    fit_0  = lm(f_0, df) #,weights=w)
  fit_cp = lm(f_cp,df) #,weights=w)
  s_cp = summary(fit_cp)
  #    s_0  = summary(fit_0)
  coefficients = s_cp$coefficients[,'Estimate'] # regression model coefficients
  pvals        = s_cp$coefficients[,'Pr(>|t|)'] # coefficient pvalues
  # if one of our partitions has no data, we need to fill in the missing columns
  # in the returned data to ensure that sapply comes back as a matrix, not a list
  if(any(colSums(tPieces != 0,na.rm=T)==0)) {
    missingCols = setdiff(colnames(tPieces),names(coefficients))
    coefficients[missingCols] = NA
    pvals[missingCols]        = NA
  }
  names(pvals) = paste('pval',names(pvals))

  # weights to enforce a bayesian idea that higher temps should matter more
  # t > 75 gets a double weighting in the SSR
  #w = (df$tout > 75)*3 + 1
  #print(length(df$w))
  #print(length(fit_cp$residuals))
  SSR_cp = (df$w * fit_cp$residuals) %*% (df$w * fit_cp$residuals)
  #    SSR_0  = (df$w * fit_0$residuals ) %*% (df$w * fit_0$residuals )
  k = 1 # we have one regressor, so k = 1
  n = length(fit_cp$residuals)
  out$SSR = SSR_cp
  out$RMSE = sqrt(out$SSR/(n - 1))
  #    out$AIC_cp <- AIC(fit_cp)
  #    out$AIC_0  <- AIC(fit_0)
  k_cp = s_cp$df[1] #- 1 # degs of freedom, assuming intercept doesn't count
  #    k_0  = s_0$df[1]  #- 1
  # for comparison of models, see also f-test http://en.wikipedia.org/wiki/F-test#Regression_problems
  #    fstat = ( (SSR_0 - SSR_cp) / (k_cp - k_0) ) / ( SSR_cp / (n - k_cp) )
  #print(paste((SSR_0 - SSR_cp),(k_cp - k_0),SSR_cp,(n - k_cp)))
  #    plower = pf(fstat,k_cp - k_0, n - k_cp) # single sided
  #    nullModelTest = 2 * min(plower, 1 - plower)      # double sided p test for whether the cp model improves on a non-cp model
  return(c(cp=cp,SSR=out$SSR,
           #    AIC_cp=out$AIC_cp,
           #    AIC_0=out$AIC_0,nullModelTest=nullModelTest,
           coefficients,pvals))
}
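# Example (a sketch): score one candidate change point against a daily
# data.frame from as.daily.df (which provides kwh and tout.mean):
#   evalCP(65, dd)  # named vector: cp, SSR, coefficients, and their p-values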

# Model evaluation --------------------------------------------------------
#' @export
kFold = function(df,modelDescriptor,K=5) {
  folds <- sample(1:K, dim(df)[1], replace=T)
  residuals = c()
  for (i in 1:K) {
    fld = folds == i
    df$fold = fld
    fmla = modelDescriptor$formula
    subm = lm(fmla, data=df, subset=(!fold),na.action=na.omit)
    yhat = predict(subm,newdata=df[fld,])
    #print(length(yhat))
    ynm = as.character(formula(fmla))[[2]]
    residuals = c(residuals,df[,ynm][fld] - yhat) # accumulate the errors from all predictions
  }
  #plot(residuals)
  rmspe = sqrt(mean(residuals^2,na.rm=TRUE)) # root mean squared prediction error (RMSPE)
  return(rmspe)
}

#' @export
cvFold = function(df,modelDescriptor,K=5,R=1,costfn=cvTools::rmspe,seed=NULL) {
  fmla = formula(modelDescriptor$formula)
  cvOut = cvTools::cvFit(lm,formula=fmla,data=df,K=K,R=R,foldType='random',cost=costfn,seed=seed) #root mean squared prediction error
  #cvMape = cvTools::cvFit(lm,formula=fmla,data=df,K=K,R=R,foldType='random',cost=cvTools::mape,seed=seed) #root mean squared prediction error

  #print(cvOut)
  #print(names(cvOut))
  #print(cvOut$reps)
  return(mean(cvOut$cv))
}
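# Example (a sketch; `dd` from as.daily.df, id hypothetical):
#   md = ModelDescriptor('dailyCP65', 'kwh ~ tout.mean.65 + DOW - 1')
#   cvFold(dd, md, K=5, R=2, seed=idHash('meter_123'))  # mean CV RMSPE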

smartCP = function(r) {
  df = regressorDF(r)
  simpleModel = lm('kw ~ tout + HOD -1',df)

  plot(cumsum(simpleModel$residuals))
}

#smartCP(rCO)