Causata-vignette.R
In Causata: Analysis utilities for binary classification and Causata users.

## ----setup, include=FALSE, cache=FALSE-----------------------------------
# Set global chunk options for knitr documentation
library(knitr)
opts_chunk$set(fig.width=6.5, fig.height=4.5, size="small", tidy=FALSE)

# Generate R code from the sweave/knitr documentation using the purl command as shown below
#purl(input="inst/doc/Causata-vignette.rnw", output="inst/doc/Causata-vignette.R", documentation=0)

## ----eval=FALSE----------------------------------------------------------
#  library(Causata)
#  library(glmnet)
#  
#  # Load data using the SQL interface.
#  # A page view event is selected as a focal point.
#  conn <- Connect(group='fsdemo')
#  query <- paste(
#    'SELECT mortgage_application_approved__n7d, total_assets__AP,',
#    'web_has_browsed_on_ipad__l7d',
#    'FROM Scenarios S, page_view P',
#    'WHERE where S.profile_id = P.profile_id',
#    'AND S.focal_point = P.timestamp',
#    'LIMIT 20000')
#  df <- GetData(conn, query)

## ----eval=FALSE----------------------------------------------------------
#  formula.str <- paste('mortgage_application_approved__n30d ~',
#    'total_assets__AP + web_has_browsed_on_ipad__AP')
#  
#  # Create a CausataData object to track preprocessing steps.
#  causataData <- CausataData(df, 'mortgage_application_approved__n30d')
#  
#  # Remove missing values.
#  causataData <- CleanNaFromContinuous(causataData)
#  causataData <- CleanNaFromFactor(causataData)
#  
#  # build a model
#  modelmatrix <- model.matrix(formula(formula.str), data=causataData$df)
#  cv.glmnet.obj <- cv.glmnet(modelmatrix,
#    causataData$df$mortgage_application_approved__n30d,
#    alpha=0.0, family='binomial')

## ----eval=FALSE----------------------------------------------------------
#  # upload model
#  model.def <- ModelDefinition(
#    cv.glmnet.obj, causataData, formula(formula.str),
#    cv.glmnet.obj$lambda.min)
#  variable.def <- VariableDefinition(
#    name='score-mortgage-application',
#    display.name='Score: Mortgage Application',
#    description='Logistic regression model for demo.', author='Justin',
#    labels='Scores')
#  result <- UploadModel(CausataConfig(group='fsdemo'), model.def,
#    variable.def)

## ----results="hide", message=FALSE---------------------------------------
library(Causata)
library(glmnet)
library(pROC)

# Set a random seed so random numbers are repeated
set.seed(87352)

# Load example data from the Causata package
data(df.causata)

## ------------------------------------------------------------------------
# Set the dependent variable
dvname <- "has.responded.mobile.logoff_next.hour_466"

## ------------------------------------------------------------------------
# Create an index of training data.
idx.train <- sample.int(nrow(df.causata), round(nrow(df.causata)*0.75))
# Split data into training and testing data frames by row.
df.train <- df.causata[ idx.train, ]
df.test  <- df.causata[-idx.train, ]

## ------------------------------------------------------------------------
# Run a univariate analysis for each variable in the data frame
binaryPredictorSummary <- BinaryPredictor(df.train, df.train[[dvname]], 
  min.robustness = 0.1)

# Generate summary information and store it in a data frame.
df.summary <- print(binaryPredictorSummary, silent=TRUE)

## ------------------------------------------------------------------------
# Create a CausataData object with training data.
causataData <- CausataData(df.train[, df.summary$keep], dvname)

## ------------------------------------------------------------------------
# Loop for each variable, apply discretization to nonlinear variables
for (varname in names(causataData$df)){
  if (varname %in% c(dvname)){next} # skip the dependent variable
  # Get summary information for this variable
  idx <- which(varname == df.summary$name)
  # Discretize nonlinear variables where the predictive power is > 0.3
  # and the linearity is < 0.9
  if (df.summary$class[idx] %in% c('integer','numeric') &
      df.summary$predictivePower[idx] > 0.03 &
      abs(df.summary$linearity[idx]) < 0.9){
    # Set outlier limits before discretizing
    causataData <- ReplaceOutliers(causataData, varname, 
      lowerLimit=min(causataData$df[[varname]], na.rm=TRUE), 
      upperLimit=max(causataData$df[[varname]], na.rm=TRUE))
    # Find breakpoints
    bclist <- BinaryCut(causataData$df[[varname]], 
      causataData$df[[dvname]], minBin=0, bins=TRUE)
    # Compute weight of evidence
    woe <- Woe(bclist$fiv, causataData$df[[dvname]])
    # Discretize
    causataData <- Discretize(causataData, varname, bclist$breaks,
      woe$woe.levels)
  }
}

## ------------------------------------------------------------------------
# Replace missing values
causataData <- CleanNaFromContinuous(causataData)
causataData <- CleanNaFromFactor(causataData)

## ------------------------------------------------------------------------
# Merge levels in factors with # levels exceeding a threshold
causataData <- MergeLevels(causataData, max.levels=10)

## ------------------------------------------------------------------------
# Extract a set of independent variable names, exclude the dependent
# variable.
indep.vars <- colnames(causataData$df)
indep.vars <- indep.vars[!(indep.vars == dvname) ]
# Build a formula string
formula.string <- paste(dvname, "~", paste(indep.vars, collapse=" + "))
formula.object <- formula(formula.string)
# Build a model matrix
x.matrix.train <- model.matrix(formula.object, data=causataData$df)
# Set the dependent variable
y.train <- causataData$df[[dvname]]
y.test  <- df.test[[dvname]]

## ------------------------------------------------------------------------
# Get a function that applies transformations to a data frame.
Transformer <- GetTransforms(causataData)
# Apply the transformations to the test data.
df.test.transformed <- Transformer(df.test)

## ----glmnet-diagnostic---------------------------------------------------
# Build model, select alpha value near 1 
# since we expect most coefficients to be zero
cv.glmnet.obj <- cv.glmnet(x.matrix.train, y.train, alpha=0.8, 
  family=c("binomial"))
plot(cv.glmnet.obj)

## ------------------------------------------------------------------------
# Use the model to predict responses for training data
predicted.train <- predict(cv.glmnet.obj, newx=x.matrix.train, 
  type="response", s="lambda.min")

# Compute predictions for test data.
model.def <- ModelDefinition(cv.glmnet.obj, causataData, 
  formula.object, cv.glmnet.obj$lambda.min )
predict.result <- predict(model.def, df.test.transformed)
predicted.test <- predict.result$predicted

## ----roc-plot------------------------------------------------------------
# Compute area under the ROC curve using the pROC package
roc.train <- roc(y.train, predicted.train)
roc.test  <- roc(y.test,  predicted.test )
cat("Training / testing area under ROC curve: ", 
  roc.train$auc, ", ", roc.test$auc, "\n")
plot(roc.test, 
  main=sprintf("ROC plot for glmnet model test data.  AUC=%6.4f", 
  roc.test$auc))

## ------------------------------------------------------------------------
# Prepare to import the model into Causata
variable.def <- VariableDefinition(
  name = "score-test-model", 
  display.name = "Score: Test Model",
  description = "A logistic regression model.",
  author = "Test User" )

# Generate a string of PMML representing the model and 
# preprocessing transformations.
# This step is not required to upload a model, 
# it's shown for illustration purposes only
pmml.string <- ToPmml(model.def, variable.def)

# Upload model to Causata.
# The parameters below are for illustration only.
causata.config <- CausataConfig(
  config.server.host = "123.456.789.10",
  config.server.port = 8002,
  config.username = "testuser",
  config.password = "1234abcd")

## ----eval=FALSE----------------------------------------------------------
#  result <- UploadModel(causata.config, model.def, variable.def)

## ------------------------------------------------------------------------
# extract nonzero coefficients
coefs.all <- as.matrix(coef(cv.glmnet.obj, s="lambda.min"))
idx <- as.vector(abs(coefs.all) > 0)
coefs.nonzero <- as.matrix(coefs.all[idx])
rownames(coefs.nonzero) <- rownames(coefs.all)[idx]

## ----size="tiny"---------------------------------------------------------
print(coefs.nonzero)

Any scripts or data that you put into this service are public.

Causata documentation built on May 2, 2019, 3:26 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

Causata
Analysis utilities for binary classification and Causata users.

inst/doc/Causata-vignette.R
In Causata: Analysis utilities for binary classification and Causata users.

Try the Causata package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

Causata Analysis utilities for binary classification and Causata users.

inst/doc/Causata-vignette.R In Causata: Analysis utilities for binary classification and Causata users.

Try the Causata package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

Causata
Analysis utilities for binary classification and Causata users.

inst/doc/Causata-vignette.R
In Causata: Analysis utilities for binary classification and Causata users.