#case setwd("C:/Users/Administrator/Documents/frmselection/data-raw") dat <- read.table("401kjae.txt") summary(dat) #The standard deviation of variables sqrt(diag(var(dat))) y <- dat[,1] x <- cbind(dat[,2], log(dat[,3]), log(dat[,3])^2, dat[,4], (dat[,4])^2, dat[,5]) colnames(x) <- c("mrate", "lemp", "lemp2","age", "age2","sole")
#QMLE library(frm) f <- frm(y, x, linkfrac = "logit", table = FALSE) f$p #The standard deviation of estimated coefficients sqrt(diag(f$p.var))
x <- as.data.frame(x, row.names = TRUE) #OLS ols <- lm(y ~ x$mrate + x$lemp + x$lemp2 + x$age + x$age2 +x$sole) coef(ols)
#frmselect() to do the model selection #The four examples x <- as.matrix(x, row.names = TRUE) library(frmselection) frmselect(x,y, criterion = "AIC",linkfrac = "logit", plotit=TRUE)#The default is forward #frmselect(x,y, criterion = "AIC",linkfrac = "logit", method = "allsubsets") #frmselect(x,y, criterion = "AIC",linkfrac = "logit", method = "backward") #frmselect(x,y, criterion = "AIC",linkfrac = "logit", method = "both")
#betaselect() to do the model selection #The four examples newdat <- data.frame(y, x) subdat <- newdat[y<1,] suby <- subdat[,1] subx <- subdat[,-1] subx <- as.matrix(subdat[,-1]) nrow(subx) betaselect(subx, suby, plotit = TRUE) #betaselect(subx, suby, method = "backward") #betaselect(subx, suby, method = "both") #betaselect(subx, suby, method = "allsubsets")
#Another way by replacing 1 with 0.9999 y[y==1] <- 0.9999 betaselect(x, y) #betaselect(x, y, method = "backward") #betaselect(x, y, metihod = "both") #betaselect(x, y, method = "allsubsets")
#Testing whether the modified quasi-binomial family frm_bamlss()function works x <- as.data.frame(x, row.names = TRUE) y <- dat[,1] d <- data.frame(y,x) formula <- as.formula(paste("y ~ ", paste(colnames(x), collapse = "+"))) library(bamlss) b <- bamlss(formula, data = d, family = frm_bamlss(link = "logit") ,sampler = FALSE, multiple = FALSE) coef(b) f <- frm::frm(y, x, linkfrac = "logit", table = FALSE) f$p
#probit link bsub <- bamlss(formula, data = subdat, family = frm_bamlss(link = "probit"), sampler = FALSE, multiple = FALSE) coef(bsub) f1 <- frm(suby, subx, linkfrac = "probit", table = FALSE) f1$p
#Lasso Procedure #Using quasi-binomial family dat <- read.table("401kjae.txt") y <- dat[,1] x <- cbind(dat[,2], log(dat[,3]), log(dat[,3])^2, dat[,4], (dat[,4])^2, dat[,5]) colnames(x) <- c("mrate", "lemp", "lemp2","age", "age2","sole") x <- as.data.frame(x, row.names = TRUE) x$sole <- as.factor(x$sole) d <- data.frame(y,x) xname <- colnames(x) ind <- sapply(x, is.factor) ind <- which(ind) #find the index of the factor variables f1 <- as.formula(paste(" ~ ", paste(xname[-ind], collapse = "+"))) f2 <- as.formula(paste(" ~ ", paste(xname[ind], collapse = "+"))) formula <- y ~ la(f1) + la(f2) #form the formula with two kinds of variables b <- bamlss(formula, data = d, family = frm_bamlss(link = "logit"), sampler = FALSE, optimizer = lasso, nlambda = 100, upper = 1e+08, lower = 1e+03, multiple = FALSE) coefi <- lasso.coef(b) coefi
lasso.select <- function(x, coefficent, threshold = 1e-3){ if(!is.numeric(threshold)){ stop("Threshold should be a number!") } n <- length(coefficent) #n is the number of the coefficients ncommon <- length(x[,-ind]) #the number of the common variables nfactor <- length(levels(x[,ind])) #find the number of levels of factor variables #nfactor <- length(unique(x[,ind])) #Extract the penalized parameters of common and factor variables separately. tau.common <- coefficent[ncommon+1] tau.factor <- coefficent[ncommon+nfactor+1] #The relation between tau and lambda is an inverse relationship lambda.common <- 1/tau.common names(lambda.common) <- "mu.s.la(f1).lambda" lambda.factor <- 1/tau.factor names(lambda.factor) <- "mu.s.la(f2).lambda" index.tau <- c(ncommon+1, ncommon+nfactor+1) coefi.new <- coefficent[-index.tau] #the new coeffients doesn't contain the penalized parameters lasso.index <- NULL #Users can set this threshold as whatever they like. The default value is 1e-3 for(i in 1:length(coefi.new)){ if(abs(coefi.new[i]) < threshold){ coefi.new[i] = 0 lasso.index <- c(lasso.index, i) } } return(list(lambda.common = lambda.common, lambda.factor = lambda.factor, lasso.index = lasso.index, modified.coefficients = coefi.new)) } lasso.select(x, coefi)
#Using beta family subdat <- d[y<1,] suby <- subdat[,1] subx <- subdat[,-1] xname <- colnames(subx) ind1 <- sapply(subx, is.factor) ind1 <- which(ind1) #find the index of the factor variables f11 <- as.formula(paste(" ~ ", paste(xname[-ind], collapse = "+"))) f21 <- as.formula(paste(" ~ ", paste(xname[ind], collapse = "+"))) formula1 <- y ~ la(f11) + la(f21) #form the formula with two kinds of variables b1 <- bamlss(formula1, data = subdat, family = "beta", sampler = FALSE, optimizer = lasso, nlambda = 10, upper = 10e+8, lower = 10e+3, multiple = TRUE) coefi1 <- lasso.coef(b1) coefi1
#Using beta family lasso.select(subx, coefi1)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.