Analysis/001-estimate-supply-production-function-ols.R

## ------------------- Code for paper:---------------------------
# "A New Approach to Estimating the Production Function for Housing"
# by Dennis Epple, Brett Gordon, and Holger Sieg, forthcoming AER.
#
# This code is used to estimate the supply and production function.
#
# Originally Written by Brett Gordon, June 2008. 
# Comments should be sent to brg2114@columbia.edu.
#
# Modified to include additional functions for the purpose of replication

#Since R.basic is not availbale in CRAN anymore, it can be installed from the below repository. 
#This package is required to compute the standard errors for the supply and production functions
#install.packages(c("R.basic"), contriburl="http://www.braju.com/R/repos/")

## ----load-the-libraries------

library(MASS)
library(splines)
library(deSolve) #Has been used insted of odesolve in our replication
library(lokern)
library(here)
library(stargazer)
require(splines)
require(R.basic)
#source(here("Analysis/002-gradient-descent-estimator.R"))


## ----read-the-data----

# Just pland, v, and lotarea
data = read.csv(here("Data/Pittsburgh_post1995.txt"), header=TRUE)
N = dim(data)[1]
v = data$v
pland = data$pland
lotarea = data$lotarea
muni = data$muni
tcog = data$tcog

sorted = sort(v,index.return=TRUE, method="quick")
pland = pland[sorted$ix]
lotarea = lotarea[sorted$ix]
muni = muni[sorted$ix]
tcog = tcog[sorted$ix]
v = sorted$x

v.full = v
pland.full = pland

#Calculate 1 and 99 percentile for v
percentile <- quantile(data$v, probs = c(0.01,0.99))

# The 99 percentile = 144.4683 and 1 percentile = 0.923818
goodv = (((v.full < percentile[2])*(v > percentile[1])) == 1)
v = v.full[goodv]
pland = pland.full[goodv]
lotarea = lotarea[goodv]
muni = muni[goodv]

N = length(v)
logpland = log(pland)
logv = log(v)
v2 = v^2
v3 = v^3
v4 = v^4

v.sd = sqrt(var(v))
logv.sd = sqrt(var(logv))

remove(data)

# plot(logv, logpland, main="Value per unit land vs. Price of land \n Allegheny County Residential Properties",xlab="log(v)", ylab="log(pland)",cex.main = 1)


## ----estimate-the-regressions-----

# Basic regressions

lm.loglin = lm(logpland ~ logv)
lm.lin = lm(pland ~ v - 1)
lm.quad = lm(pland ~ v + v2 - 1)
lm.cub = lm(pland ~ v + v2 + v3 - 1)
lm.quart = lm(pland ~ v + v2 + v3 + v4 - 1)

#Log Linear model using Gaussian Generalized Linear Model (variant introduced for the purpose of replication)
glm.loglin = glm(logpland ~ logv)

#Log Linear model with a different Gradient Descent Loss Function (variant introduced for the purpose of replication)
thetaValues = gradientDescentLinearModel(logv, logpland, 0.1, 1000)


stargazer(lm.loglin, lm.lin, lm.quad, lm.cub, glm.loglin,
          summary=FALSE, 
          out = "r(v) estimates.html", 
          type = "text",
          column.labels = c('Log-Linear','Linear','Quadratic','Cubic','Generalized-Log-Linear'),
          keep.stat = c('n'),
          dep.var.labels.include =FALSE,
          dep.var.caption  = "OLS and GLM estimates",
          model.names = FALSE, model.numbers = FALSE,
          title = "Table 2 - Estimates of Equilibrium locus")

#Getting the estimates of gradient descent based log-lin model
grad_descent = data.frame('Gradient_descent-Log-linear' = c(thetaValues[1],thetaValues[2])) 
row.names(grad_descent) = c('Constant','log(v)')
print(grad_descent)

## ----plot-the-fitted-models----

# Non-parametric estimation of r(v)

gkern.deriv = glkerns(v, pland, deriv=1,n.out=300, hetero=TRUE,bandwidth=v.sd*1.1)
gkern = glkerns(v, pland, deriv=0,n.out=500, hetero=TRUE,bandwidth=v.sd*1.1)
gkern.interp = interpSpline(gkern$x.out,gkern$est,na.action=na.omit,bSpline=TRUE)

gkern.log = glkerns(logv, logpland, deriv=0,n.out=300, hetero=TRUE)
gkern.log.interp = interpSpline(gkern.log$x.out,gkern.log$est,na.action=na.omit,bSpline=TRUE)
gkern.log.fitted = predict(gkern.log.interp, logv)


# plot(gkern.deriv$x.out,gkern.deriv$est, type="l")
# plot(gkern.log$x.out,gkern.log$est, type="l", col="red")
# plot(gkern$x.out,gkern$est, type="l")

# Plot the regressions
# par(mfrow=c(1,1))
# plot(logv, lm.loglin$fitted.values, type="l",xlim=c(-2,5), ylim=c(-3,3.5))
# lines(logv, lm.loglin$fitted.values, col="red")
# lines(logv, log(lm.lin$fitted.values), col="blue")
# lines(logv, log(lm.quad$fitted.values), col="green")
# lines(logv, log(lm.cub$fitted.values), col="orange")
# 
# #Plot to add line generated by Gaussian Generalized Log-Linear Model
# lines(logv, glm.loglin$fitted.values, col = '#009E73')

#Plot to add line using Log linear model with Gradient Descent Loss Function
#abline(coef=thetaValues, col = '#F0E442')


#Calculate the predicted values from using Log linear model with Gradient Descent Loss Function
predictedValuesGD <- calculatePredictionValues(logv, logpland, thetaValues[2], thetaValues[1])

#Added two new model prediction
#1) Gaussian Generalized Log Linear Model: Gaus.LLM
#2) Gradient Descent Log Linear Model: GradDes.LLM

regs = data.frame("Log-Linear"=lm.loglin$fitted.values, "Linear"=log(lm.lin$fitted.values),"Quadratic"=log(lm.quad$fitted.values),
                  "Cubic"=log(lm.cub$fitted.values), "Log Kernel"=gkern.log.fitted$y, 'Gaus.LLM'=glm.loglin$fitted.values,
                  'Grad.des.LLM'=predictedValuesGD)

matplot(cbind(logv,logv,logv,logv,logv,logv,logv), regs, col=1:ncol(regs), lty=1:ncol(regs),lwd=c(2,2,2,2), type="l",
        main="Fitted Regressions for r(v)", xlab="Value per unit of land", ylab="Price of land", xlim=c(-0.5,5), ylim=c(-2.2,3.5),
        cex.main = 0.9,cex.lab = 0.9,cex.axis =0.9)
legend(3.1, 0.9, names(regs), col=1:ncol(regs), lty=1:ncol(regs), cex = 0.75)



## ----derive-supply-functions-for-each-model----

vseq.n = 1000
vseq = seq(1,100,length.out=vseq.n)

### Calculate supply functions for quad and cubic cases
r = lm.quad$coefficients
p.quad = (vseq^r[1])*exp(2*r[2]*(vseq-1))
s.quad = vseq/p.quad

r = lm.cub$coefficients
c = 2*r[2] + 1.5*r[3]
p.cub = (vseq^r[1])*exp(2*r[2]*(vseq-1) + 1.5*r[3]*(vseq^2 - 1))
s.cub = vseq/p.cub

# Now create price grid given this upperbound
lb = min(range(p.quad)[1],range(p.cub)[1])
ub = min(range(p.quad)[2],range(p.cub)[2])
pgrid = seq(lb,ub,length.out=vseq.n)


### Calculate supply functions for linear and log-linear cases

alpha = as.numeric(exp(lm.loglin$coefficients[1])*lm.loglin$coefficients[2])
beta = as.numeric(lm.loglin$coefficients[2] - 1)

### Calculate supply function for GLM Log-linear case

alpha_glm = as.numeric(exp(glm.loglin$coefficients[1])*glm.loglin$coefficients[2])
beta_glm = as.numeric(glm.loglin$coefficients[2] - 1)

# Determine upper bound on p, since beta < 0 implies (c+log(p)) < 0
### For log-linear model
ec = exp(-alpha/beta)

### FOr GLM Log-linear model
ec_glm = exp(-alpha_glm/beta_glm)


### Linear case
k = lm.lin$coefficients[1]
s.lin = pgrid^((1-k)/k)

### Log-linear case
s.loglin = (1/pgrid)*((1 + (beta/alpha)*log(pgrid))^(1/beta))

### GLM Log-Linear case
sglm.loglin = (1/pgrid)*((1 + (beta_glm/alpha_glm)*log(pgrid))^(1/beta_glm))

### Solve ODE for kernel supply function

rprime = interpSpline(gkern.deriv$x.out,gkern.deriv$est,na.action=na.omit,bSpline=TRUE)

calc.supply.deriv = function(t,y, params) list((y/t)*((1/(predict(rprime,y*t))$y)-1))


xstart = c(1)
pgrid.kern = c(1, seq(1.001,ub,length.out=vseq.n-1))

ode.out = lsoda(xstart, pgrid.kern, calc.supply.deriv, parms=1, rtol=1e-5, atol=1e-7)
s.kern = data.frame(s=ode.out[,2],p=ode.out[,1])

### Plot all the supply functions

s.funcs = data.frame("Log-Linear"=s.loglin,"Linear"=s.lin,"Quadratic"=s.quad,
                     "Cubic"=s.cub, "Kernel"=s.kern$s, "GLM Log-Linear" = sglm.loglin)
p.grids = cbind(pgrid,pgrid,p.quad,p.cub,s.kern$p,pgrid)

s.lin.range = range(s.lin*pgrid)
s.loglin.range = range(s.loglin*pgrid)
s.quad.range = range(s.quad*p.quad)
s.cub.range = range(s.cub*p.cub)
s.kern.range = range(s.kern$s*s.kern$p)
sglm.loglin.range = range(sglm.loglin*pgrid)

matplot(s.funcs,p.grids,col=1:ncol(s.funcs), lty=1:ncol(s.funcs),lwd=c(2,2,2,2), type="l",
        main="Supply Functions", xlab="Supply", ylab="Price",xlim=c(0,39),
        cex.main = 0.9,cex.lab = 0.9,cex.axis =0.9)
legend(20, 1.6, names(s.funcs), col=1:ncol(s.funcs), lty=1:ncol(s.funcs),cex = 0.75)


## ----production-function-estimation------

v.lin.hat = s.lin*pgrid
v.loglin.hat = s.loglin*pgrid
v.quad.hat = s.quad*p.quad
v.cub.hat = s.cub*p.cub
v.kernel = s.kern$p*s.kern$s
v.glmloglin.hat = sglm.loglin*pgrid

lm.lin.fitted = lm.lin$coefficients[1]*v.lin.hat
lm.loglin.fitted = exp(lm.loglin$coefficients[1] + lm.loglin$coefficients[2]*log(v.loglin.hat))
lm.quad.fitted = lm.quad$coefficients[1]*v.quad.hat + lm.quad$coefficients[2]*v.quad.hat^2
lm.cub.fitted = lm.cub$coefficients[1]*v.cub.hat + lm.cub$coefficients[2]*v.cub.hat^2 + lm.cub$coefficients[3]*v.cub.hat^3
kernel.fitted = predict(gkern.interp,v.kernel)$y
glm.loglin.fitted = exp(glm.loglin$coefficients[1] + glm.loglin$coefficients[2]*log(v.glmloglin.hat))


m.lin = v.lin.hat-lm.lin.fitted
m.loglin = v.loglin.hat-lm.loglin.fitted
m.quad = v.quad.hat-lm.quad.fitted
m.cub = v.cub.hat-lm.cub.fitted
m.kernel = v.kernel - kernel.fitted
m.glmloglin = v.glmloglin.hat-glm.loglin.fitted

##Normalize the production function for the kernel case
m.kernel = m.kernel + (1-m.kernel[1])

prod.funcs = data.frame("Log-Linear"=s.loglin,"Linear"=s.lin,"Quadratic"=s.quad, "Cubic"=s.cub, "Kernel"=s.kern$s,"GLM.LL"=sglm.loglin)

m.grids = cbind(m.loglin, m.lin, m.quad, m.cub,m.kernel,m.glmloglin)

matplot(m.grids,prod.funcs, col=1:ncol(prod.funcs), lty=1:ncol(prod.funcs),lwd=c(2,2,2,2), type="l",
        main="Production Functions", xlab="m = (M/L)", ylab="Output", xlim=c(0,80), ylim=c(0,50),
        cex.main = 0.9,cex.lab = 0.9,cex.axis =0.9)
legend(50, 26, names(prod.funcs), col=1:ncol(prod.funcs), lty=1:ncol(prod.funcs),cex=0.75)



## ----simulate-supply-and-production-functions-to-calculate-standard-errors-----

sup.loglin = function(grid,parms){
  alpha = as.numeric(exp(parms[1])*parms[2])
  beta = as.numeric(parms[2] - 1)
  return((1/grid)*((1 + (beta/alpha)*log(grid))^(1/beta)))
}

sup.glmloglin = function(grid,parms){
  alpha = as.numeric(exp(parms[1])*parms[2])
  beta = as.numeric(parms[2] - 1)
  return((1/grid)*((1 + (beta/alpha)*log(grid))^(1/beta)))
}


prod.loglin = function(q,s){
  v.hat = s.loglin*pgrid
  lm.loglin.fitted = exp(lm.loglin$coefficients[1] + lm.loglin$coefficients[2]*log(v.hat))
  m.loglin = v.hat-lm.loglin.fitted
  return(list(m=m.loglin,q=s))
}

prod.glmloglin = function(q,s){
  v.hat = sglm.loglin*pgrid
  glm.loglin.fitted = exp(glm.loglin$coefficients[1] + glm.loglin$coefficients[2]*log(v.hat))
  m.loglin = v.hat-glm.loglin.fitted
  return(list(m=m.loglin,q=s))
}


sim.sup.prod = function(mod,grid,sfunc,pfunc,nsim){
  r = mod$coefficients[,"Estimate"]
  sd = mod$coefficients[,"Std. Error"]
  r.sim = matrix(NA,nrow=nsim,ncol=length(r))
  for(i in 1:length(r)){
    r.sim[,i] = rnorm(nsim,mean=r[i],sd=sd[i])
  }

  s.sd = rep(NA,length(grid))
  s.mat = matrix(NA,nrow=length(grid),ncol=nsim)
  for(k in 1:length(grid)){
    for(i in 1:nsim){
      s.mat[k,i] = sfunc(grid[k],r.sim[i,])
    }
    s.sd[k] = sd(s.mat[k,])
  }

  p.sd = rep(NA,length(grid))
  p.mat = matrix(NA,nrow=length(grid),ncol=nsim)
  for(i in 1:nsim){
    ## Calculate the prod function for this supply function
    pfunc.temp = pfunc(grid,s.mat[,i])
    p.mat[,i] = pfunc.temp$q
  }
  for(k in 1:length(grid)) p.sd[k] = sd(p.mat[k,])

  return(list(s.sd=s.sd,p.sd=p.sd))
}

sim.funcs = sim.sup.prod(summary(lm.loglin),pgrid,sup.loglin,prod.loglin,100)

s.loglin.ub = s.loglin + 2*sim.funcs$s.sd
s.loglin.lb = s.loglin - 2*sim.funcs$s.sd
prod.loglin.lb = s.loglin - 2*sim.funcs$p.sd
prod.loglin.ub = s.loglin + 2*sim.funcs$p.sd


# Supply and production plots in GLM log-log scale
sim.glmfuncs = sim.sup.prod(summary(glm.loglin),pgrid,sup.glmloglin,prod.glmloglin,100)

s.glmloglin.ub = sglm.loglin + 2*sim.glmfuncs$s.sd
s.glmloglin.lb = sglm.loglin - 2*sim.glmfuncs$s.sd
prod.glmloglin.lb = sglm.loglin - 2*sim.glmfuncs$p.sd
prod.glmloglin.ub = sglm.loglin + 2*sim.glmfuncs$p.sd


## ----glm-supply-func-confidence-band----
plot(log(sglm.loglin),log(pgrid),type="l",main="GLM-Log-linear Supply Function with \n 95% Confidence Band",
     xlab="log(Supply)",ylab="log(Price)", xlim=c(0,log(40)), lwd=2,
     cex.main = 0.9,cex.lab = 0.9,cex.axis =0.9)
lines(log(s.glmloglin.lb),log(pgrid),col="red",lty=2)
lines(log(s.glmloglin.ub),log(pgrid),col="red",lty=2)

## ----glm-production-func-confidence-band----
plot(log(m.glmloglin),log(sglm.loglin), type="l",main="GLM-Log-linear Production Function with \n 95% Confidence Band",
     xlab="log(m)",ylab="log(q)", xlim=c(0,log(80)), lwd=2,
     cex.main = 0.9,cex.lab = 0.9,cex.axis =0.9)
lines(log(m.glmloglin), log(prod.glmloglin.ub), col="red",lty=2)
lines(log(m.glmloglin), log(prod.glmloglin.lb), col="red",lty=2)
UW-MSDS-DATA-598-Reproducibility-WI20/dutta-gupta-manohar-prabhakar-rathod-replication-project documentation built on March 17, 2020, 12:08 a.m.