# R/simple_prediction.R
#
# Defines functions SimData_forAggModel FitAggModel PredAggModel AssessMod Estim_ErrDistWrt

#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# author: Reza Hosseini


## functions for prediction using simple aggregate functions
# these are designed to be easy to implement in
# simple frameworks like SQL that have little statistical modeling
# capability;
# however, the approaches here can be superior to seemingly
# more advanced methods in some contexts.

# Simulate a toy data set for trying out the aggregate-model functions.
# Args:
#   ss: number of rows to simulate
#   bucketNum: modulus used to assign ids to buckets (bucket = id %% bucketNum)
# Returns:
#   data.table with columns id, bucket, x1, x2, x3, expt ("cont" / "treat")
#   and y, where y is linear in x1, x2 and the treatment flag plus
#   uniform noise
SimData_forAggModel = function(ss=1000, bucketNum=20) {

  id = seq_len(ss)
  bucket = id %% bucketNum
  x1 = runif(ss, -0.5, 0.5)
  x2 = runif(ss, -0.5, 0.5)
  x3 = runif(ss, -0.5, 0.5)

  # x3 is uniform on (-0.5, 0.5), so cutting at 0 puts roughly half of the
  # rows in treatment; a cut-off of 0.5 can never be exceeded and would
  # leave every row in control with no treatment effect in y
  expt_numeric = x3 > 0
  expt = rep("cont", ss)
  expt[expt_numeric] = "treat"

  noise = 0.1 * runif(ss, -0.5, 0.5)
  y = 0.1 + 2*x1 - 2*x2 + 0.1*expt_numeric + noise

  df = data.frame(id, bucket, x1, x2, x3, expt, y)
  dt = data.table(df)

  return(dt)

}


# Fit an "aggregate model": summarise the response column within each
# combination of the predictor columns.
# Args:
#   dt: table holding the response and predictor columns
#   yCol: name (string) of the response column
#   predCols: character vector with the names of the grouping columns
#   AggFunc: function applied to the response values of each group;
#     defaults to the mean with NAs removed
# Returns:
#   data.table with the predCols columns followed by the aggregated
#   response, stored under the original yCol name
FitAggModel = function(
    dt,
    yCol,
    predCols,
    AggFunc=function(x) mean(x, na.rm=TRUE)) {

  # coerce so that the data.table `[ , j, by]` syntax below is available
  dt = data.table(dt)

  # the default must be a function: a bare `mean(x, na.rm=TRUE)` default is
  # a call that gets evaluated (with no `x` in scope) as soon as it is used
  dtAgg = dt[ , AggFunc(get(yCol)), by=predCols]
  colnames(dtAgg) = c(predCols, yCol)
  return(dtAgg)

}


# Attach aggregate-model predictions to new data with a left join on
# predCols: every row of newDt is kept, and rows whose predictor
# combination is missing from dtAgg get NA in the columns coming from dtAgg.
# Args:
#   newDt: table to predict for
#   dtAgg: aggregated table holding the predCols columns and the predictions
#   predCols: character vector with the names of the join columns
# Returns:
#   the merged table
PredAggModel = function(newDt, dtAgg, predCols) {

  merge(
      x=newDt,
      y=dtAgg,
      by=predCols,
      all.x=TRUE,
      all.y=FALSE)

}


# Assess predictions against observed values.
# Only positions where both y and yPred are non-missing enter the metrics,
# so all four metrics are computed on the same set of pairs.
# Args:
#   y: numeric vector of observed values
#   yPred: numeric vector of predicted values, same length as y
# Returns:
#   list with elements
#     rmse: root mean squared error of y - yPred
#     corr: correlation between y and yPred
#     covar: covariance between y and yPred
#     r2: 1 - rmse^2 / var(y)
#   corr, covar and r2 are NA when fewer than two complete pairs exist.
AssessMod = function(y, yPred) {

  stopifnot(length(y) == length(yPred))

  # restrict everything to the complete pairs
  ok = !is.na(y) & !is.na(yPred)
  yObs = y[ok]
  yFit = yPred[ok]

  d = yObs - yFit
  rmse = sqrt(mean(d^2))

  if (length(d) < 2) {
    # variance-based metrics are undefined for fewer than two pairs
    corr = NA_real_
    covar = NA_real_
    r2 = NA_real_
  } else {
    corr = cor(yObs, yFit)
    covar = cov(yObs, yFit)
    # note: rmse^2 divides by n while var() divides by n - 1
    r2 = 1 - (rmse^2) / var(yObs)
  }

  return(list(rmse=rmse, corr=corr, covar=covar, r2=r2))

}




## get uncertainty for prediction model in two-stage approach
# Writes an error column (observed minus predicted) into `dt` under the
# name given by errColName.
# Args:
#   dt: table holding the observed and predicted value columns
#     (presumably a data.table, as elsewhere in this file -- TODO confirm)
#   valueCol_obs: name of the column with observed values
#   valueCol_pred: name of the column with predicted values
#   predCols: not used anywhere in the body as written
#   errColName: name of the error column to write (default "err")
# NOTE(review): `Col` is not defined in this file; presumably it turns the
#   selected column into a plain vector -- confirm against its definition.
# NOTE(review): the body ends on the assignment and never returns `dt`, so
#   a call yields the assigned value invisibly rather than the updated
#   table. The function looks unfinished (no error distribution is estimated
#   yet) -- confirm the intended return value before relying on it.
Estim_ErrDistWrt = function(
    dt, valueCol_obs, valueCol_pred, predCols, errColName="err") {

  # error = observed - predicted, stored under errColName
  dt[ , errColName] = Col(dt[ , valueCol_obs]) - Col(dt[ , valueCol_pred])

}
# Reza1317/funcly documentation built on Feb. 5, 2020, 4:06 a.m.