Nothing
#
# R source code for preprocessing and scoring
#
No_Causata_Name <- "No Causata Name" #
#
# constructors for classes
#
CausataVariable <- function(variableName, values, causata.name=variableName) {
# initialize a Causata Variable object
# - variableName: a string containing the variable name
# - values: a vector of values
variableObject <- list()
class(variableObject) <- "CausataVariable"
variableObject$name <- variableName # variable name
variableObject$causata.name <- causata.name # Name that Causata uses for this variable
variableObject$class <- class(values) # class of data: numeric, integer, factor
variableObject$steps <- c() # list of preprocessing steps executed for this variable
variableObject$outlierUpperLimit <- NULL # upper limit for outlier removal
variableObject$outlierLowerLimit <- NULL # lower limit for outlier removal
variableObject$binRight <- NULL # logical, indicating if the intervals should be closed on the right (and open on the left) or vice versa
variableObject$binLimits <- NULL # limit values for continous bins
variableObject$binValues <- NULL # map continuous values to discrete, e.g. weight of evidence binning
variableObject$missingReplacement <- NULL # missing values will be replaced with this
variableObject$unmergedLevels <- NULL # levels that remained after MergeLevels
variableObject$categorizing.attribute <- NULL # Some input variables are categorized.
variableObject$category.value <- NULL # the category to use for a categorized variable.
variableObject$time.domain <- NULL # The variable time domain
# transformation is a function that takes a vector (column) from a data.frame and applies
# all the transformations to it that have been recorded in this CausataVariable, returning the transformed vector
variableObject$transformation <- identity
return(variableObject)
}
is.CausataVariable <- function(this) inherits(this, "CausataVariable")
CausataData <- function(dataframe, dependent.variable=NULL, query=NULL) {
# initialzie a Causata Data object
# - dataframe: a dataframe containing independent variables
# - dependent.variable: a vector of dv values, or a name of a variable in the data frame that contains dependent variable
# - dvName: the name of the dependent variable
dataObject <- list()
class(dataObject) <- "CausataData"
if (is.null(dependent.variable)){
# if dependent variable argument is missing then require that "dependent.variable" is defined in data frame
dataObject$dvName <- "dependent.variable"
if (!(dataObject$dvName %in% names(dataframe))){
stop("No argument provided for dependent.variable, so a column named 'dependent.variable' must be provided in the data frame.")
}
# copy the dependent variable
dv <- dataframe$dependent.variable
} else if (class(dependent.variable) == "character"){
# character was provided, test if it's a column in the data frame
if (!(dependent.variable %in% names(dataframe))){
stop("A variable named ", dependent.variable, " must be present in the data frame.")
}
# store the name
dataObject$dvName <- dependent.variable
# copy the data to a variable called dv
dv <- dataframe[[dataObject$dvName]]
} else {
# assume that the dependent variable is an array of values with length matching data frame
# copy the data to a variable called dv
if (!(nrow(dataframe) == length(dependent.variable))){
stop("dependent.variable length ", length(dependent.variable),
" must match rows in dataframe ", nrow(dataframe))
}
if (is.null(names(dependent.variable))){
# no name provided, so give it a default name
dataObject$dvName <- "dependent.variable"
} else {
# name provided, copy it
dataObject$dvName <- names(dependent.variable)
}
# copy the values to the data frame
dataframe[[dataObject$dvName]] <- dependent.variable
dv <- dependent.variable
}
# dv values can't be missing
if (any(is.na(dv))){
stop("Missing values not allowed in dependent variable ", dataObject$dvName)
}
# initialize a variable object for every variable in the data frame
dataObject$variableList <- list()
dataObject$skippedVariables <- c()
for (varname in names(dataframe)) {
# if the dependent variable is in the data frame then skip it, don't create a causata variable for it
if (varname == dataObject$dvName){
next # skip to next variable
}
# find causata name corresponding to this column of data frame
#causata.name <- r.to.causata.name.mapping[ match(varname, names(dataframe)) ]
causata.name <- RToCausataNames(varname)
if (causata.name == No_Causata_Name){
# a variable in the data frame does not have a causata name, e.g. profile_id
# do not create a variable object for this column
dataObject$skippedVariables <- c(dataObject$skippedVariables, varname)
next
} else {
dataObject$variableList[[varname]] <- CausataVariable(
varname, dataframe[[varname]], causata.name)
}
}
# if variables were skipped then generate a warning
if (length(dataObject$skippedVariables) > 0){
warning("CausataData did not create variables for ", length(dataObject$skippedVariables),
" that did not conform to variable naming conventions.")
}
dataObject$df <- dataframe
dataObject$query <- query
return(dataObject)
}
# Converts Causata variable names to R variable names.
# For example:
# CausataToRNames(c("total-spend$All Past", "page-view-count$Last 30 Days"))
# returns list("total-spend$All Past"="total.spend__All.Past", "page-view-count$Last 30 Days"="page.view.count__Last.30.Days")
#
CausataToRNames <- function(name.vector) {
safe.names <- make.names(str_replace(as.vector(name.vector), "\\$", "__"), unique=TRUE)
ToRName <- function(index) {
l <- list()
l[name.vector[[index]]] <- safe.names[[index]]
return(l)
}
return(unlist(lapply(1:length(name.vector), ToRName)))
}
# Converts R variable column names to Causata variable names.
# For example:
# RToCausataNames(c("total.spend__All.Past", "page.view.count__Last.30.Days"))
# returns list("total.spend_All.Past"="total-spend$All Past", "page.view.count_Last.30.Days"="page-view-count$Last 30 Days")
#
RToCausataNames <- function(name.vector) {
DotsToDashes <- function(string) return(str_replace_all(string, "\\.", "-"))
DotsToSpaces <- function(string) return(str_replace_all(string, "\\.", " "))
ToCausataName <- function(name) {
parts <- str_split(name, "__")[[1]] # split at double underscore for data exported from SQL
parts.csv <- str_split(name, "_")[[1]] # split at single underscore for data exported from csv
if (length(parts) == 2) {
#stopifnot(length(parts) == 2)
return(paste(DotsToDashes(parts[[1]]), DotsToSpaces(parts[[2]]), sep="$", collapse="$"))
} else if (length(parts.csv)==3 & length(parts.csv[2]>0)) {
# this variable fits a pattern where there are two underscores with text in between,
# assume this is a csv variable, return the name unaltered
return(name)
} else {
# the name doesn't fit pattern name__interval, so set a default non-match value
warning("The variable named ", name, " does not match Causata naming conventions.")
return(No_Causata_Name)
}
}
# return a vector, not a list, so use as.vector on results of sapply
v <- unlist(lapply(name.vector, ToCausataName))
names(v) <- name.vector
return(v)
}
#
# define generic functions
#
CleanNaFromContinuous <- function(x, ...) {
# generic function to replace missing values
UseMethod("CleanNaFromContinuous")
}
CleanNaFromFactor <- function(x, ...){
# generic function to replace missing values
UseMethod("CleanNaFromFactor")
}
Discretize <- function(this, ...){
# generic function to replace missing values
UseMethod("Discretize")
}
GetVariable <- function(this, ...) {
UseMethod("GetVariable")
}
GetTransforms <- function(this, ...) {
UseMethod("GetTransforms")
}
GetQuery <- function(this, ...) {
UseMethod("GetQuery")
}
ReplaceOutliers.CausataData <- function(this, variableName, lowerLimit=NULL, upperLimit=NULL, ...){
# Given a causata data object, replaces outliers with specified values.
stopifnot(variableName %in% names(this$variableList))
# extract the causata variable
causataVariable <- this$variableList[[variableName]]
# store this step in the list of steps
causataVariable$steps <- c(causataVariable$steps, "ReplaceOutliers")
# replace missing values
this$df[[variableName]] <- ReplaceOutliers( this$df[[variableName]], lowerLimit=lowerLimit, upperLimit=upperLimit )
# update the variable object
causataVariable$outlierUpperLimit <- upperLimit
causataVariable$outlierLowerLimit <- lowerLimit
this$variableList[[variableName]] <- causataVariable
# return the data object
return(this)
}
CleanNaFromFactor.CausataVariable <- function(x, dataObject, replacement, ...) {
# given a causata variable object, replaces missing values with BLANK
varname <- x$name
# store this step in the list of steps
x$steps <- c(x$steps, "CleanNaFromFactor")
x$missingReplacement <- replacement
# replace missing values
dataObject$df[[varname]] <- CleanNaFromFactor( dataObject$df[[varname]], replacement=replacement )
# update the variable object
dataObject$variableList[[varname]] <- x
# return the data object
return(dataObject)
}
CleanNaFromFactor.CausataData <- function(x, variableName=NULL, replacement="BLANK", ...) {
# allow users to pass in a bare, unquoted variable name, this will convert to a string
if (!is.null(variableName)){
# a single variable name was provided, confirm that it's in the causataData object
if (!(variableName %in% names(x$variableList))){
# the parsed string doesn't match, throw an error
stop("The variable ", variableName, " was not found in the causataData input.")
}
variableList <- variableName
} else {
# no variable name was provided, so use all of them
variableList <- names(x$variableList)
}
for (varname in variableList) {
# test if this is a factor
if (class(x$df[[varname]]) == "factor") {
x <- CleanNaFromFactor.CausataVariable( x$variableList[[varname]], x, replacement=replacement )
}
}
# return the data object
return(x)
}
CleanNaFromContinuous.CausataVariable <- function(x, dataObject, method="median", ...) {
# given a causata variable object, replaces missing values within
# get the variable name
varname <- x$name
# store this step in the list of steps
x$steps <- c(x$steps, "CleanNaFromContinuous")
# replace missing values
outList <- CleanNaFromContinuous( dataObject$df[[varname]], method=method, return.replacement=TRUE )
# update the data object and store the value used to replace missing values
dataObject$df[[varname]] <- outList[[1]]
# check if the missing replacement value was already set, which may have happened in Discretize. If it was set then do not reset here.
if (is.null(x$missingReplacement)){
x$missingReplacement <- outList[[2]]
}
# update the variable object
dataObject$variableList[[varname]] <- x
# return the data object
return(dataObject)
}
CleanNaFromContinuous.CausataData <- function(x, variableName=NULL, method="median", ...) {
# allow users to pass in a bare, unquoted variable name, this will convert to a string
if (!is.null(variableName)){
# a single variable name was provided, confirm that it's in the causataData object
if (!(variableName %in% names(x$variableList))){
# the parsed string doesn't match, throw an error
stop("The variable ", variableName, " was not found in the causataData input.")
}
variableList <- variableName
} else {
# no variable name was provided, so use all of them
variableList <- names(x$variableList)
}
for (varname in variableList) {
# test if this is a numeric or integer variable
varClass <- class(x$df[[varname]])
if (any(varClass=="numeric") || any(varClass=="POSIXct") || any(varClass=="integer")) {
x <- CleanNaFromContinuous.CausataVariable(x$variableList[[varname]], x, method=method)
}
}
return(x)
}
Discretize.CausataData <- function(this, variableName, breaks, discrete.values, verbose=FALSE, ...) {
# Given a causata data object, replaces outliers with specified values.
stopifnot(variableName %in% names(this$variableList))
# copy variable from list
varObject <- this$variableList[[variableName]]
# require that this variable is numeric
stopifnot(varObject$class %in% c("numeric","integer"))
# ensure that missing values were not replaced before this step, missing values should not be replaced before Discretize
if ("CleanNaFromContinuous" %in% varObject$steps) {
stop("The CleanNaFromContinuous step must not be executed before the Discretize step for variable ", variableName)
}
# ensure that outliers were replaced before this step
if (!("ReplaceOutliers" %in% varObject$steps)){
stop("The ReplaceOutliers step must be run before the Discretize step for variable ", variableName)
}
# require that the outlier replacement upper limit is <= the highest bin value
if (is.null(varObject$outlierUpperLimit) || varObject$outlierUpperLimit > breaks[length(breaks)]){
stop("The outlier limit is not set or greater than the last breaks value in ", variableName)
}
if (is.null(varObject$outlierLowerLimit)){
stop("The outlier lower limit must be set before the Discretize step for variable ",variableName)
}
#
# check the length of inputs and for missing values
#
if (any(is.na(this$df[[variableName]]))){
#
# missing values are present in numeric variable, must have N+1 breaks and N values
#
if (length(breaks) != length(discrete.values)){
stop("Discretize requires number of breaks to match number of discrete values when missing values are present for variable ", variableName,
", found ", length(breaks), " breaks and ", length(discrete.values), " discrete values.")
}
# create an artificial bin for missing values, make it above the max value in the data
newbreak <- 0.5 * (max(breaks) - min(breaks)) + max(breaks)
breaks <- c(breaks, newbreak)
# map missing values to the new last break, make it slightly smaller than last artificial bin boundary
varObject$missingReplacement <- newbreak * 0.99
this$df[[variableName]] <- CleanNaFromContinuous(this$df[[variableName]], replacement.value = varObject$missingReplacement)
# update the variable object
varObject$binLimits <- breaks
varObject$binValues <- discrete.values
# set a label to be appended to the last level name for verbose output
last.level.label.append <- " (Missing)" # append missing label since last level contains missing values
} else {
#
# no missing values present, must have N+1 breaks and N discrete values
#
if (length(breaks) != (length(discrete.values)+1)){
stop("Discretize requires number of breaks N+1 and N discrete values for variable ", variableName,
", found ", length(breaks), " breaks and ", length(discrete.values), " discrete values.")
}
# no missing values found, copy over breaks and woe levels as normal
# update the variable object
varObject$binLimits <- breaks
varObject$binValues <- discrete.values
# set a label to be appended to the last level name for verbose output
last.level.label.append <- "" # append blank since the last level does not contain missing values
}
# store this step in the list of steps
varObject$steps <- c(varObject$steps, "Discretize")
# set bins to right, closed on right and open on left, default for cut command
varObject$binRight <- TRUE
# use cut to discretize the variable
fiv <- cut(this$df[[variableName]], breaks=breaks, include.lowest=TRUE)
# replace continuous values with discrete values in the causataData object
discrete.variable <- rep(0, nrow(this$df))
level.vec <- levels(fiv)
if (verbose) {cat("Mapping values to", length(discrete.values), "discrete bins, n is number of values in bins:\n")}
# loop for each level in the discretized data
for (i in 1:length(discrete.values)){
idx <- level.vec[i] == fiv
if (sum(idx)>0){discrete.variable[idx] <- discrete.values[i]}
# print message if verbose flag set, add label to level with missing values if present
if (verbose) {cat(" ", level.vec[i], "->", discrete.values[i], " n =", sum(idx),
if(i==length(discrete.values)) last.level.label.append else "", "\n")}
}
this$df[[variableName]] <- discrete.variable
# update the causata variable and return the causata data object
this$variableList[[variableName]] <- varObject
return(this)
}
MergeLevels.CausataVariable <- function(this, causataData, max.levels, ...) {
# Given a Causata Variable object for a factor, merges the smallest levels
# in the factor so that the total number of levels does not exceed maxlevels.
# get the variable name
varname <- this$name
# store this step in the list of steps
this$steps <- c(this$steps, "MergeLevels")
# Store the levels before merging
levels1 <- levels(causataData$df[[varname]])
# merge levels and update the data object
causataData$df[[varname]] <- MergeLevels(causataData$df[[varname]], max.levels)
# store a vector of the levels that were not merged into "Other" (all other values become "Other")
this$unmergedLevels <- levels(causataData$df[[varname]])
# update the variable object stored in the data object
causataData$variableList[[varname]] <- this
# return the data object
return(causataData)
}
MergeLevels.CausataData <- function(this, variableName=NULL, max.levels, other.name="Other", verbose=FALSE, ...) {
# given a Causata Data object, merges levels in all of the factors within where
# the number of levels exceeds maxlevels
if (!is.null(variableName)){
# a single variable name was provided, confirm that it's in the causataData object
stopifnot(variableName %in% names(this$variableList))
variableList <- variableName
} else {
# no variable name was provided, so use all of them
variableList <- names(this$variableList)
}
if (verbose) {cat("\nMerging levels in factors:\n")}
for (varname in variableList) {
# test if this is a factor
if (class(this$df[[varname]]) == "factor") {
# it is a factor, check the number of levels
numLevels <- length(levels(this$df[[varname]]))
# check if the number of levels exceeds a limit
if (numLevels > max.levels){
# this factor exceeds the limit, merge levels
if (verbose) {cat(" ", varname, "\n")}
}
# run mergelevels regardless of whether the threshold number of levels is exceeded. this way
# we record which levels were present
this <- MergeLevels.CausataVariable( this$variableList[[varname]], this, max.levels )
}
}
# return the data object
return(this)
}
GetVariable.CausataData <- function(this, r.name=NULL, ...) {
for (variable in this$variableList) {
if (variable$name == r.name) {
# exact match found, return variable
return(variable)
} else {
# look for a partial match from a dummy variable produced by model.matrix
# e.g. var.name__APlevel, where we should match var.name__AP
idx <- grep( paste("^", variable$name, sep=""), r.name )
if (length(idx) == 1){
# match found
return(variable)
}
}
}
# if no match was found then throw an error
stop("No matching variable name found for ", r.name)
}
DiscretizeTransform <- function(this) {
column.function <- function(column) {
# if missing values are present then replace them first, before mapping to discrete values
if (!is.null(this$missingReplacement)){
# missing replacement value is set, so replace missing values
column[is.na(column)] <- this$missingReplacement
}
this$binValues[ cut(column, breaks=this$binLimits, include.lowest=TRUE, labels=FALSE, right=this$binRight) ]
}
ColumnarTransformation(this, column.function)
}
ReplaceOutliersTransform <- function(this) {
column.function <- function(column) {
ReplaceOutliers(column, lowerLimit=this$outlierLowerLimit, upperLimit=this$outlierUpperLimit)
}
ColumnarTransformation(this, column.function)
}
CleanNaFromContinuousTransform <- function(this) {
column.function <- function(column) {
column[is.na(column)] <- this$missingReplacement
return(column)
}
ColumnarTransformation(this, column.function)
}
CleanNaFromFactorTransform <- function(this) {
ColumnarTransformation(this, function(column) CleanNaFromFactor(column, this$missingReplacement))
}
MergeLevelsTransform <- function(this) {
stopifnot(is.CausataVariable(this))
unit.of.work <- function(value) {
if (is.na(value)) {
NA
} else if (value %in% this$unmergedLevels) {
value
} else {
"Other"
}
}
return(VectorizedFactorTransformation(this, unit.of.work))
}
# Takes a CausataVariable and a function that operates on a single column vector
# returns a function that operates on a data.frame, and applies the given function to the column vector
# that corresponds to the given CausataVariable
#
ColumnarTransformation <- function(this, column.function) {
function(df) {
#df[,this$name] <- column.function(df[,this$name])
# using set from data.table as it is much more efficient with memory
colnumber <- which(this$name == names(df))
if (length(colnumber)!=1){
# require that one column matches
stop("One column match required but found ", length(colnumber) , " for column ", this$name)
}
set(df, j=colnumber, value=column.function(df[, this$name]))
return(df)
}
}
# Takes a CausataVariable and a function that operates on a single value in a factor,
# returns a function that operates on a data.frame, and applies the given function to the factor
# that corresponds to the given CausataVariable
#
VectorizedFactorTransformation <- function(this, work.unit, desired.levels=NULL) {
function(df) {
col <- df[,this$name]
stopifnot(is.factor(col))
new.col <- as.character(col)
for (i in 1:length(new.col)) {
new.col[i] <- work.unit(new.col[i])
}
new.col <- if (length(desired.levels)) {
factor(new.col, levels=desired.levels)
} else {
factor(new.col)
}
result <- df
result[this$name] <- new.col
return(result)
}
}
# Returns a function that applies the given transforms (functions) in sequence
# Could possibly use Reduce (like a foldleft function)
#
Sequence <- function(functions) {
function(df) {
for (f in functions) {
df <- f(df)
}
return(df)
}
}
GetTransforms.CausataData <- function(this, ...) {
# iterate over steps, getting a function for each, and the return a function that applies
# each of those transforms in sequence.
#
transforms <- NULL
for (variable in this$variableList) {
transforms <- c(transforms, GetTransformFunctionForVariable(variable))
}
return(Sequence(transforms))
}
GetQuery.CausataData <- function(this, ...) {
this$query
}
GetTransformFunctionForVariable <- function(this) {
stopifnot(is.CausataVariable(this))
transforms <- NULL
for (step in this$steps) {
transforms <- c(transforms, switch (step,
"Discretize"=DiscretizeTransform(this),
"ReplaceOutliers"=ReplaceOutliersTransform(this),
"CleanNaFromFactor"=CleanNaFromFactorTransform(this),
"CleanNaFromContinuous"=CleanNaFromContinuousTransform(this),
"MergeLevels"=MergeLevelsTransform(this),
stop("Unknown transformation")
))
}
return(Sequence(transforms))
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.