tests/testToPmml.R
In Causata: Analysis utilities for binary classification and Causata users.

library(testthat)
library(Causata)
library(glmnet)
library(XML)

equals <- testthat::equals

context("ToPmml")

# testing data
# creat a data frame with two independent variables and a dependent variable
set.seed(65432)
np <- 10000
# x1 is continuous with some missing values
x1.temp <- rnorm(np)
x1 <- x1.temp
x1[x1>2] <- NA
# x2 is a factor with some missing values
# start with an integer vector that will be used to compute y
x2.int <- sample(-5:5, np, replace=TRUE, prob=c(0.02,0.02,0.02, rep(0.94/8, 8)))
x2.temp <- x2.int
x2.temp[x2.temp == -5] <- NA
x2 <- as.factor(x2.temp)

# binary dependent variable y is a function of x1, x2, and noise
y.temp <- abs(x1.temp)*10 + x2.int + rnorm(np, sd=2)
y <- rep(0,np)
y[y.temp > 15] <- 1

# preprocess data
# using a Discretize transformation
varname <- 'x1__AP'
dvname <- 'y__AP'
df <- data.frame(x1__AP=x1, x2__AP=x2, y__AP=y)
causataData <- CausataData(df, dependent.variable=dvname)
causataData <- ReplaceOutliers(causataData, varname, lowerLimit=-3.5, upperLimit=3.5)
# set breakpoints
breaks <- c(-3.5, -2, -1, 0, 1, 2, 3.5)
fiv <- cut(causataData$df$x1__AP, breaks, include.lowest=TRUE)
# Set weight of evidence, the discrete values that continuous values are mapped to
#woe <- Woe(fiv, causataData$df[[dvname]])
#   [-3.5,-2]      (-2,-1]       (-1,0]        (0,1]        (1,2]      (2,3.5]        BLANK 
# 5.347107531  0.003030305 -4.323101956 -4.360874260 -0.013072082  9.210340372  4.787491743 
woe.levels <- c(5.3, 0, -4.3, -4.4, -0.01, 9.2, 4.8)
# Discretize
causataData <- Discretize(causataData, varname, breaks, woe.levels)

# replace missing values
causataData <- CleanNaFromContinuous(causataData)
causataData <- CleanNaFromFactor(causataData)

# create a model
formula.obj <- formula('y__AP ~ x1__AP + x2__AP')
x.matrix <- model.matrix(formula.obj, data=causataData$df)
cv.glmnet.obj <- cv.glmnet(x.matrix, y=df$y__AP, family='binomial', alpha=1, nfolds=3)

# generate string of PMML
model.def <- ModelDefinition(cv.glmnet.obj, causataData,
  formula.obj, cv.glmnet.obj$lambda.1se )
variable.def <- VariableDefinition(
  name = "score-test-model",
  display.name = "Score: Test Model",
  description = "A logistic regression model.",
  author = "Test User" )
pmml.obj <- ToPmml(model.def, variable.def)

# extract the coefficients from PMML
coefs_pmml <- xpathSApply(pmml.obj, "//x:NumericPredictor", xmlGetAttr, name='coefficient', namespaces='x')
# convert string coefs to numeric
coefs_pmml <- as.numeric(coefs_pmml)
# extract coef names
names(coefs_pmml) <- xpathSApply(pmml.obj, "//x:NumericPredictor", xmlGetAttr, name='name',        namespaces='x')
# replace $ with __ in coef names
names(coefs_pmml) <- gsub('\\$', '__', names(coefs_pmml))
# we have all coefficients except the intercept, add that
coefs_pmml <- c(coefs_pmml, as.numeric(xpathApply(pmml.obj, "//x:RegressionTable", xmlGetAttr, name='intercept', namespaces='x')[1]))
names(coefs_pmml)[length(coefs_pmml)] <- '(Intercept)'

# extract coefficients from glmnet model
coefmatrix <- coef(cv.glmnet.obj)
coefs_glmnet <- as.numeric(coefmatrix)
idx.nonzero <- abs(coefs_glmnet) > 0
coefs_glmnet <- coefs_glmnet[idx.nonzero] # get only nonzero coefficients
names(coefs_glmnet) <- rownames(coefmatrix)[idx.nonzero]

#
# run tests
#

test_that("PMML contains sections for Header, DataDictionary, RegressionModel", 
  expect_that(
    sort(names(xmlChildren(pmml.obj))),
    equals( sort(c('Header', 'DataDictionary', 'RegressionModel')) )
  )
)

test_that("PMML RegressionModel section contains MiningSchema, LocalTransformations, RegressionTable",
  expect_that(
    sort(names(xmlChildren(xmlChildren(pmml.obj)$RegressionModel))),
    equals( sort(c("MiningSchema", "LocalTransformations", "RegressionTable")))
  )
)

test_that("Coefficients in PMML match those in glmnet",
  expect_that(
    # coefficients are ordered by name before the comparison
    coefs_glmnet[order(names(coefs_glmnet))], equals(coefs_pmml[order(names(coefs_pmml))])
  )
)

Any scripts or data that you put into this service are public.

Causata documentation built on May 2, 2019, 3:26 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

Causata
Analysis utilities for binary classification and Causata users.

tests/testToPmml.R
In Causata: Analysis utilities for binary classification and Causata users.

Try the Causata package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

Causata Analysis utilities for binary classification and Causata users.

tests/testToPmml.R In Causata: Analysis utilities for binary classification and Causata users.

Try the Causata package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

Causata
Analysis utilities for binary classification and Causata users.

tests/testToPmml.R
In Causata: Analysis utilities for binary classification and Causata users.