```{css, echo=FALSE}
pre code { white-space: pre-wrap; }
```
```{r}
# Global knitr chunk options for this vignette: merge source and output into
# one block and prefix printed output with "#>".
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)
```
The `model.matrix()` function in R is a convenient way to transform a training dataset for modeling. However, it does not save any of the parameters used in the transformation, so it is hard to apply the same transformation to a test dataset or to new data. The ModelMatrixModel package was created to solve this problem.
```{r}
# Install from GitHub if the package is not available yet:
# devtools::install_github("xinyongtian/R_ModelMatrixModel")
library(ModelMatrixModel)

# Training data: x1 is a character column, x3 is a factor column.
set.seed(10)
traindf <- data.frame(
  x1 = sample(LETTERS[1:5], size = 20, replace = TRUE),
  x2 = rnorm(20, 100, 5),
  x3 = factor(sample(c("U", "L", "P"), size = 20, replace = TRUE)),
  y = rnorm(20, 10, 2)
)

# New data to be transformed with the parameters learned from traindf.
set.seed(20)
newdf <- data.frame(
  x1 = sample(LETTERS[1:5], size = 3, replace = TRUE),
  x2 = rnorm(3, 100, 5),
  x3 = sample(c("U", "L", "P"), size = 3, replace = TRUE)
)

head(traindf)
# Input categorical variables can be either character or factor.
vapply(traindf, class, character(1))
```
```{r}
# Plain model.matrix() builds the design matrix independently for each data
# set, so the two results are not guaranteed to have the same columns.
f1 <- formula("~x1+x2")
head(model.matrix(f1, traindf), 2)
head(model.matrix(f1, newdf), 2)
```
Note that the number of columns differs between the two outputs, which is problematic when applying a fitted model to new data. To avoid this, column `x1` in both datasets needs to be converted to a factor with exactly the same levels. That is cumbersome when there are many categorical columns. In addition, the parameters of other transformations, such as orthogonal polynomials, also need to be saved.
```{r}
# The explicit "1" is needed in order to output the intercept column.
f2 <- formula("~ 1+x1+x2")
mm <- ModelMatrixModel(f2, traindf, remove_1st_dummy = TRUE, sparse = FALSE)
```
```{r}
class(mm)
# Note: "_Intercept_" is the intercept column.
head(mm$x, 2)
```
```{r}
# Apply the transformation stored in mm to the new data.
mm_pred <- predict(mm, newdf)
head(mm_pred$x, 2)
```
```{r}
# remove_1st_dummy = FALSE is the default: the first dummy variable is kept.
mm <- ModelMatrixModel(~ x1 + x2 + x3, traindf, remove_1st_dummy = FALSE)
data.frame(as.matrix(head(mm$x, 2)))

mm_pred <- predict(mm, newdf)
data.frame(as.matrix(head(mm_pred$x, 2)))
```
```{r}
mm <- ModelMatrixModel(~ x2 + x3 + x2:x3, traindf)
# ':' in a column name is replaced with '_X_'.
data.frame(as.matrix(head(mm$x, 2)))

mm_pred <- predict(mm, newdf)
data.frame(as.matrix(head(mm_pred$x, 2)))
```
```{r}
# Same kind of model written with the `*` formula operator, this time dropping
# the first dummy variable.
mm <- ModelMatrixModel(~ x2 * x3, traindf, remove_1st_dummy = TRUE)
data.frame(as.matrix(head(mm$x, 2)))

mm_pred <- predict(mm, newdf)
data.frame(as.matrix(head(mm_pred$x, 2)))
```
It is common for a categorical column in new data to contain an invalid level, i.e. one that does not occur in the training data. This can be handled as follows.
```{r}
mm <- ModelMatrixModel(~ x2 + x3, traindf)
data.frame(as.matrix(head(mm$x, 2)))

newdf2 <- newdf
newdf2[1, "x3"] <- "z" # create an invalid level

# handleInvalid = "keep" is the default: the invalid row is kept and all of its
# dummy variables are set to 0. With handleInvalid = "error", an error is
# thrown instead.
mm_pred <- predict(mm, newdf2, handleInvalid = "keep")
data.frame(as.matrix(head(mm_pred$x, 2)))
```
ModelMatrixModel can save the parameters of orthogonal polynomials.
```{r}
# Orthogonal polynomial terms for x2, fitted on the training data and then
# applied to the new data.
mm <- ModelMatrixModel(~ poly(x2, 3) + x3, traindf)
data.frame(as.matrix(head(mm$x, 2)))

mm_pred <- predict(mm, newdf)
data.frame(as.matrix(head(mm_pred$x, 2)))
```
It also works with raw polynomial transformations.
```{r}
# Raw (non-orthogonal) polynomial terms for x2.
mm <- ModelMatrixModel(~ poly(x2, 3, raw = TRUE) + x3, traindf)
data.frame(as.matrix(head(mm$x, 2)))

mm_pred <- predict(mm, newdf)
data.frame(as.matrix(head(mm_pred$x, 2)))
```
The training dataset can be scaled, and the same scaling parameters can then be applied to a new dataset.
```{r}
# Center and scale when fitting on the training data; predict() is then called
# on the new data with the fitted object.
mm <- ModelMatrixModel(~ x2 + x3, traindf, scale = TRUE, center = TRUE)
data.frame(as.matrix(head(mm$x, 2)))

mm_pred <- predict(mm, newdf)
data.frame(as.matrix(head(mm_pred$x, 2)))
```
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.