Nothing
LnRegMisrepEM<-function(formula, v_star, data, lambda = c(0.6,0.4), epsilon = 1e-08, maxit = 10000, maxrestarts = 20, verb = FALSE) {
# Simple checks to make sure the response and the v_star
# variable are contained within the data object;
if(!any(v_star == colnames(data))){
stop(paste("variable", v_star, "not present in dataframe" ))
}
# First check to see if formula is correctly specified.
# needs to be: log(response) ~ x
if(identical(grep("log()", as.character(formula)[[2]]), integer(0))){
# LHS is not 'log(response)'
stop("formula must be 'log(response) ~ terms'. See 'Details'")
}
# The name of the misrepresented variable;
v_star_name <- v_star
# v_star object needs to be a vector of 1's and 0's,
# with class 'numeric'
# Note that v_star changes from being a character to a vector
v_star <- data[, v_star_name]
# If v_star is a numeric, then do nothing
if(is.numeric(v_star)){
}else{
# But if it isn't numeric, then check to see if it's class is factor;
if(is.factor(v_star)){
# This is a safe way of coercing a factor to a numeric, while
# retaining the original numeric vales
v_star <- as.numeric(levels(v_star))[v_star]
}else{
# and if it's not numeric, and not a factor, then something is
# seriously wrong;
stop("v_star variable must be of class 'factor' or 'numeric'")
}
}
if(length(unique(v_star)) != 2){
stop("v_star variable must contain two unique values")
}
if(sort(unique(v_star))[1] != 0 | sort(unique(v_star))[2] != 1){
stop("v_star variable must be coded with ones and zeroes")
}
if(sum(lambda) != 1){
stop("Lambda vector must sum to one")
}
if(length(lambda) != 2){
stop("Lambda vector must contain two elements")
}
if( !is.null(alias(lm(formula = formula, data = data))$Complete) ){
stop("Linear dependencies exist in the covariates")
}
# obtain initial values from lm
naive <- lm(formula = formula, data = data, x = TRUE, y = TRUE)
# This is a final error check that is done to ensure that the v* variable is
# also included in the formula specification;
if( any(colnames(naive$x) == v_star_name) ){
}else{
stop("v_star variable must be specified in 'formula'")
}
sd <- sigma(naive)
coef.reg <- naive$coefficients
coef.reg[1] <- coef.reg[1]+sd^2/2 # convert to intercept under lognormal regression on its mean
coef.reg <- c( "sigma" = sd, coef.reg)
theta <- coef.reg
# Make design matrix
x <- model.matrix(object = terms(formula), data = data)
# This other design matrix is made by first setting the v* column within the dataframe
# to be fixed at one.
data[,v_star_name] <- 1
# Notice capital X
X <- model.matrix(object = terms(formula), data = data)
if( length(theta[-1][ -grep(v_star_name, names(theta[-1])) ]) == 1 ){
xbeta <- as.vector(x[, -grep(v_star_name, colnames(x)) ] * theta[-1][ -grep(v_star_name, names(theta[-1])) ] )
}else{
xbeta <- as.vector(x[, -grep(v_star_name, colnames(x)) ] %*% theta[-1][ -grep(v_star_name, names(theta[-1])) ] )
}
iter <- 0
diff <- epsilon+1
attempts <- 1
# The response
y <- naive$y
n <- length(y)
# This is a term that occurs in the log-normal log-likelihood,
# but not the normal log-likelihood.
sum_log_resp <- sum(y)
# observed loglikelihood (partial LL, eq. 3 from Akakpo, Xia, Polansky 2018).
obs.ll <- function(lambda, coef){
sum( v_star *log( dnorm(x = y, sd = coef[1], mean = -coef[1]^2/2 + x %*% coef[-1] )))+
sum((1-v_star)*log( lambda[2]*dnorm(x = y, sd = coef[1], mean = -coef[1]^2/2 + X %*% coef[-1] )+
lambda[1]*dnorm(x = y, sd = coef[1], mean = -coef[1]^2/2 + x %*% coef[-1] )))
}
# M step loglikelihood
mstep.ll <- function(theta, z){
-sum( log(dnorm(x = y[v_star==1], sd = theta[1], mean = (as.vector(-theta[1]^2/2 + x %*% theta[-1] )[v_star==1] ) )))-
sum((1-z[v_star==0])*log(dnorm(x = y[v_star==0], sd = theta[1], mean = (as.vector(-theta[1]^2/2 + X %*% theta[-1] )[v_star==0] ) ))+
z[v_star==0] *log(dnorm(x = y[v_star==0], sd = theta[1], mean = (as.vector(-theta[1]^2/2 + x %*% theta[-1] )[v_star==0] ) )))
}
old.obs.ll <- obs.ll(lambda, coef.reg) - sum_log_resp
ll <- old.obs.ll
# Number of digits (to the right of decimal point) printed to console will
# depend on default user settings;
num_digits <- getOption("digits")
while(diff > epsilon && iter < maxit){
# E-step
dens1 <- lambda[1]*dnorm(x = y, sd = theta[1], mean = (-theta[1]^2/2 + xbeta))
dens2 <- lambda[2]*dnorm(x = y, sd = theta[1], mean = (-theta[1]^2/2 + X %*% theta[-1]))
z <- dens1/(dens1+dens2)
lambda.hat <- c(mean(z[v_star==0]), (1-mean(z[v_star==0])))
#Non-linear minimization.
m <- try(suppressWarnings(nlm(f = mstep.ll, p = theta, z = z)), silent = TRUE)
theta.hat <- m$estimate
# Annoyingly, nlm() does not provide m$estimate as a named vector,
# which consequently makes updating the xbeta object impossible.
names(theta.hat) <- names(theta)
new.obs.ll <- obs.ll(lambda.hat, theta.hat) - sum_log_resp
diff <- new.obs.ll-old.obs.ll
old.obs.ll <- new.obs.ll
ll <- c(ll,old.obs.ll)
lambda <- lambda.hat
theta <- theta.hat
if( length(theta[-1][ -grep(v_star_name, names(theta[-1])) ]) == 1 ){
xbeta <- as.vector(x[, -grep(v_star_name, colnames(x)) ] * theta[-1][ -grep(v_star_name, names(theta[-1])) ] )
}else{
xbeta <- as.vector(x[, -grep(v_star_name, colnames(x)) ] %*% theta[-1][ -grep(v_star_name, names(theta[-1])) ] )
}
iter <- iter+1
# If TRUE, print EM routine updates to the console;
if(verb){
message("iteration = ", iter,
" log-lik diff = ", format(diff, nsmall = num_digits),
" log-like = ", format(new.obs.ll, nsmall = num_digits) )
}
# stop execution and throw an error if the max iterations has been reached,
# and if the max num. of attempts has been made;
if(iter == maxit && attempts == maxrestarts){
stop("NOT CONVERGENT! Failed to converge after ", attempts, " attempts", call. = F)
}
# If the max iterations is reached, but we can make another attempt, then
# restart the EM routine with new mixing prop., but only notify user
# of this if verb = TRUE
if(iter == maxit && attempts < maxrestarts){
if(verb){
warning("Failed to converge. Restarting with new mixing proportions", immediate. = TRUE,
call. = FALSE)
}
# Update the number of attempts made.
attempts <- attempts + 1
# Reset iter to zero
iter <- 0
cond <- TRUE
while(cond){
lambda.new <- c(0,0)
lambda.new[2] <- runif(1)
lambda.new[1] <- 1-lambda.new[2]
if(min(lambda.new) < 0.15){
cond <- TRUE
lambda <- lambda.new
}else{
cond <- FALSE
}
}
# With the new mixing proportions, re-calculate the old.obs.ll,
old.obs.ll <- obs.ll(lambda, coef.reg) - sum_log_resp
ll <- old.obs.ll
}
}
message("number of iterations = ", iter)
# Make empty Hessian matrix;
hess <- matrix(data = 0, nrow = length(theta) + 1, ncol = length(theta) + 1,
dimnames = list( c("lambda", names(theta)), c("lambda", names(theta)) ) )
sigma <- as.numeric(theta[1])
# Element (1,1)
hess[1,1] <- -sum( (1-v_star)* ( ( exp(-(y + sigma^2/2 - X%*%theta[-1])^2/(2*sigma^2) ) - exp(-(y + sigma^2/2 - x%*%theta[-1])^2/(2*sigma^2) ) ) / ( lambda[2]*exp(-(y + sigma^2/2 - X%*%theta[-1])^2/(2*sigma^2) ) + lambda[1]*exp(-(y + sigma^2/2 - x%*%theta[-1])^2/(2*sigma^2) ) ) )^2 )
# Element (2,2)
hess[2,2] <- 1/sigma^2 * sum( v_star*( (2*sigma^2*(y+sigma^2/2-x%*%theta[-1]) - 3*(y+sigma^2/2-x%*%theta[-1])^2 )/sigma^2 -sigma^2 + y +sigma^2/2-x%*%theta[-1] + 1 ) +
(1-v_star)*( y - sigma^2 + 1 + ( lambda[2]^2*exp(-(y+sigma^2/2-X%*%theta[-1])^2/sigma^2) * ( ( 2*sigma^2*(y+sigma^2/2-X%*%theta[-1]) -3*(y+sigma^2/2-X%*%theta[-1])^2 )/sigma^2 +sigma^2/2-X%*%theta[-1] ) + lambda[2]*lambda[1]*exp( -( (y+sigma^2/2-X%*%theta[-1])^2 + (y+sigma^2/2-x%*%theta[-1])^2 )/(2*sigma^2) ) * ( ( (y+sigma^2/2-x%*%theta[-1])^2 - (y+sigma^2/2-X%*%theta[-1])^2 + sigma^2*(x-X)%*%theta[-1] )^2/sigma^4 + ( 2*sigma^2*(2*y+sigma^2-(x+X)%*%theta[-1]) -3*( (y+sigma^2/2-x%*%theta[-1])^2 + (y+sigma^2/2-X%*%theta[-1])^2 ) )/sigma^2 + sigma^2-(x+X)%*%theta[-1] ) + lambda[1]^2*exp( -(y+sigma^2/2-x%*%theta[-1])^2/sigma^2 )*( (2*sigma^2*(y+sigma^2/2-x%*%theta[-1]) - 3*(y+sigma^2/2-x%*%theta[-1])^2 )/sigma^2 +sigma^2/2-x%*%theta[-1] ) )
/ ( lambda[2]*exp( -(y+sigma^2/2-X%*%theta[-1])^2/(2*sigma^2) ) + lambda[1]*exp(-(y+sigma^2/2-x%*%theta[-1])^2/(2*sigma^2)) )^2 ) )
# Element (2,1), (1,2)
hess[2,1] <- sum( (1-v_star) * ( exp(-( (y+sigma^2/2-X%*%theta[-1])^2 + (y+sigma^2/2-x%*%theta[-1])^2 )/(2*sigma^2) ) * ( ( (y+sigma^2/2-X%*%theta[-1])^2 - (y+sigma^2/2-x%*%theta[-1])^2 )/sigma^3 + (X-x)%*%theta[-1]/sigma ) ) / ( lambda[2]*exp(-(y+sigma^2/2-X%*%theta[-1])^2/(2*sigma^2)) + lambda[1]*exp(-(y+sigma^2/2-x%*%theta[-1])^2/(2*sigma^2)) )^2 )
hess[1,2] <- sum( (1-v_star) * ( exp(-( (y+sigma^2/2-X%*%theta[-1])^2 + (y+sigma^2/2-x%*%theta[-1])^2 )/(2*sigma^2) ) * ( ( (y+sigma^2/2-X%*%theta[-1])^2 - (y+sigma^2/2-x%*%theta[-1])^2 )/sigma^3 + (X-x)%*%theta[-1]/sigma ) ) / ( lambda[2]*exp(-(y+sigma^2/2-X%*%theta[-1])^2/(2*sigma^2)) + lambda[1]*exp(-(y+sigma^2/2-x%*%theta[-1])^2/(2*sigma^2)) )^2 )
# Main diagonal elements that pertain to regression coefficients
for(j in 1:ncol(x)){
k <- j
hess[k+2, j+2] <- -1/sigma^2 * sum(v_star*x[,j]*x[,k]
+ (1-v_star) * (lambda[2]^2*exp(-(y+sigma^2/2-X%*%theta[-1])^2/sigma^2)*X[,j]*X[,k] - lambda[2]*lambda[1]*exp( -( (y+sigma^2/2-X%*%theta[-1])^2 + (y+sigma^2/2-x%*%theta[-1])^2 )/(2*sigma^2) ) * ( 1/sigma^2 * (x[,k]*(y+sigma^2/2-x%*%theta[-1]) - X[,k]*(y+sigma^2/2-X%*%theta[-1])) * (x[,j]*(y+sigma^2/2-x%*%theta[-1]) - X[,j]*(y+sigma^2/2-X%*%theta[-1])) - x[,j]*x[,k] - X[,j]*X[,k] ) + lambda[1]^2*exp(-(y+sigma^2/2-x%*%theta[-1])^2/sigma^2) * x[,j]*x[,k] )
/ ( lambda[2]*exp(-(y+sigma^2/2-X%*%theta[-1])^2/(2*sigma^2)) + lambda[1]*exp(-(y+sigma^2/2-x%*%theta[-1])^2/(2*sigma^2)) )^2 )
}
# Off diagonal elements that pertain to regression coefficients
for(i in 1:choose(ncol(x),2)){
j <- combn(x = 1:ncol(x), m = 2)[1,i]
k <- combn(x = 1:ncol(x), m = 2)[2,i]
hess[k + 2, j + 2] <- -1/sigma^2 * sum(v_star*x[,j]*x[,k]
+ (1-v_star) * (lambda[2]^2*exp(-(y+sigma^2/2-X%*%theta[-1])^2/sigma^2)*X[,j]*X[,k] - lambda[2]*lambda[1]*exp( -( (y+sigma^2/2-X%*%theta[-1])^2 + (y+sigma^2/2-x%*%theta[-1])^2 )/(2*sigma^2) ) * ( 1/sigma^2 * (x[,k]*(y+sigma^2/2-x%*%theta[-1]) - X[,k]*(y+sigma^2/2-X%*%theta[-1])) * (x[,j]*(y+sigma^2/2-x%*%theta[-1]) - X[,j]*(y+sigma^2/2-X%*%theta[-1])) - x[,j]*x[,k] - X[,j]*X[,k] ) + lambda[1]^2*exp(-(y+sigma^2/2-x%*%theta[-1])^2/sigma^2) * x[,j]*x[,k] )
/ ( lambda[2]*exp(-(y+sigma^2/2-X%*%theta[-1])^2/(2*sigma^2)) + lambda[1]*exp(-(y+sigma^2/2-x%*%theta[-1])^2/(2*sigma^2)) )^2 )
hess[j + 2, k + 2] <- -1/sigma^2 * sum(v_star*x[,j]*x[,k]
+ (1-v_star) * (lambda[2]^2*exp(-(y+sigma^2/2-X%*%theta[-1])^2/sigma^2)*X[,j]*X[,k] - lambda[2]*lambda[1]*exp( -( (y+sigma^2/2-X%*%theta[-1])^2 + (y+sigma^2/2-x%*%theta[-1])^2 )/(2*sigma^2) ) * ( 1/sigma^2 * (x[,k]*(y+sigma^2/2-x%*%theta[-1]) - X[,k]*(y+sigma^2/2-X%*%theta[-1])) * (x[,j]*(y+sigma^2/2-x%*%theta[-1]) - X[,j]*(y+sigma^2/2-X%*%theta[-1])) - x[,j]*x[,k] - X[,j]*X[,k] ) + lambda[1]^2*exp(-(y+sigma^2/2-x%*%theta[-1])^2/sigma^2) * x[,j]*x[,k] )
/ ( lambda[2]*exp(-(y+sigma^2/2-X%*%theta[-1])^2/(2*sigma^2)) + lambda[1]*exp(-(y+sigma^2/2-x%*%theta[-1])^2/(2*sigma^2)) )^2 )
}
# Covariance of regression coefficients -- lambda
for(j in 1:ncol(x)){
hess[j + 2, 1] <- 1/sigma^2 * sum( (1-v_star)*( exp( -( (y+sigma^2/2-X%*%theta[-1])^2 + (y+sigma^2/2-x%*%theta[-1])^2 )/(2*sigma^2) ) * ( X[,j]*(y+sigma^2/2-X%*%theta[-1]) - x[,j]*(y+sigma^2/2-x%*%theta[-1]) ) ) / ( lambda[2]*exp( -(y+sigma^2/2-X%*%theta[-1])^2/(2*sigma^2) ) + lambda[1]*exp(- (y+sigma^2/2-x%*%theta[-1])^2/(2*sigma^2) ) )^2 )
hess[1, j + 2] <- 1/sigma^2 * sum( (1-v_star)*( exp( -( (y+sigma^2/2-X%*%theta[-1])^2 + (y+sigma^2/2-x%*%theta[-1])^2 )/(2*sigma^2) ) * ( X[,j]*(y+sigma^2/2-X%*%theta[-1]) - x[,j]*(y+sigma^2/2-x%*%theta[-1]) ) ) / ( lambda[2]*exp( -(y+sigma^2/2-X%*%theta[-1])^2/(2*sigma^2) ) + lambda[1]*exp(- (y+sigma^2/2-x%*%theta[-1])^2/(2*sigma^2) ) )^2 )
}
# Covariance of regression coefficients -- sigma
for(j in 1:ncol(x)){
hess[j + 2, 2] <- sum( v_star*( sigma^2*x[,j]-2*x[,j]*(y+sigma^2/2-x%*%theta[-1]) )/sigma^3
+ (1 - v_star) * ( lambda[2]^2*exp( -(y+sigma^2/2-X%*%theta[-1])^2/sigma^2 )*( sigma^2*X[,j] -2*X[,j]*(y+sigma^2/2-X%*%theta[-1]) )/sigma^3 + lambda[2]*lambda[1]*exp( -( (y+sigma^2/2-X%*%theta[-1])^2 + (y+sigma^2/2-x%*%theta[-1])^2 )/(2*sigma^2) ) * ( ( (x[,j]*(y+sigma^2/2-x%*%theta[-1]) - X[,j]*(y+sigma^2/2-X%*%theta[-1]))/sigma^2 ) * ( ( (y+sigma^2/2-x%*%theta[-1])^2 - (y+sigma^2/2-X%*%theta[-1])^2 )/sigma^3 + (x-X)%*%theta[-1]/sigma ) + ( sigma^2*(x[,j]+X[,j]) -2*(x[,j]*(y+sigma^2/2-x%*%theta[-1]) + X[,j]*(y+sigma^2/2-X%*%theta[-1]) ) )/sigma^3 ) + lambda[1]^2*exp(-(y+sigma^2/2-x%*%theta[-1])^2/sigma^2)*(sigma^2*x[,j] -2*x[,j]*(y+sigma^2/2-x%*%theta[-1]))/sigma^3 )
/ (lambda[2]*exp(-(y+sigma^2/2-X%*%theta[-1])^2/(2*sigma^2)) + lambda[1]*exp(-(y+sigma^2/2-x%*%theta[-1])^2/(2*sigma^2)) )^2 )
hess[2, j + 2] <- sum( v_star*( sigma^2*x[,j]-2*x[,j]*(y+sigma^2/2-x%*%theta[-1]) )/sigma^3
+ (1 - v_star) * ( lambda[2]^2*exp( -(y+sigma^2/2-X%*%theta[-1])^2/sigma^2 )*( sigma^2*X[,j] -2*X[,j]*(y+sigma^2/2-X%*%theta[-1]) )/sigma^3 + lambda[2]*lambda[1]*exp( -( (y+sigma^2/2-X%*%theta[-1])^2 + (y+sigma^2/2-x%*%theta[-1])^2 )/(2*sigma^2) ) * ( ( (x[,j]*(y+sigma^2/2-x%*%theta[-1]) - X[,j]*(y+sigma^2/2-X%*%theta[-1]))/sigma^2 ) * ( ( (y+sigma^2/2-x%*%theta[-1])^2 - (y+sigma^2/2-X%*%theta[-1])^2 )/sigma^3 + (x-X)%*%theta[-1]/sigma ) + ( sigma^2*(x[,j]+X[,j]) -2*(x[,j]*(y+sigma^2/2-x%*%theta[-1]) + X[,j]*(y+sigma^2/2-X%*%theta[-1]) ) )/sigma^3 ) + lambda[1]^2*exp(-(y+sigma^2/2-x%*%theta[-1])^2/sigma^2)*(sigma^2*x[,j] -2*x[,j]*(y+sigma^2/2-x%*%theta[-1]))/sigma^3 )
/ (lambda[2]*exp(-(y+sigma^2/2-X%*%theta[-1])^2/(2*sigma^2)) + lambda[1]*exp(-(y+sigma^2/2-x%*%theta[-1])^2/(2*sigma^2)) )^2 )
}
# FIM is the negative of the Hessian;
FIM <- -hess
# Then find std. errors
cov.pars.estimates <- solve(FIM)
std.error <- sqrt(diag(cov.pars.estimates))
# Calculate t values for regression coefficients
t_vals <- rep(NA, length(theta[-1]))
t_vals <- theta[-1] / std.error[-c(1,2)]
# Calculate p-values of regression coefficients
# argument df: '-1' because theta doesn't include lambda parameter.
p_vals <- rep(NA, length(t_vals))
p_vals <- 2 * pt(q = abs(t_vals), lower.tail = F, df = n - length(theta) - 1 )
# AIC, AICc, BIC
# Note that theta does not contain lamdba1, hence the '+1' included.
perf_metrics <- rep(NA, 3)
AIC <- 2 * (length(theta) + 1 - new.obs.ll)
AICc <- AIC + (2 * (length(theta) + 1)^2 + 2 * (length(theta) + 1) )/(n - (length(theta) + 1) - 1)
BIC <- log(n) * (length(theta) + 1) - 2 * new.obs.ll
perf_metrics <- c(AIC, AICc, BIC)
names(perf_metrics) <- c("AIC", "AICc", "BIC")
# Output
a=list(y = y, lambda = lambda[2], params = theta, loglik = new.obs.ll,
posterior = as.numeric(z), all.loglik = ll, cov.estimates = cov.pars.estimates,
std.error = std.error, t.values = t_vals, p.values = p_vals,
ICs = perf_metrics, ft="LnRegMisrepEM", formula = formula, v_star_name = v_star_name)
class(a) = "misrepEM"
a
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.