In mdavis29/modelToolKit: Predictive Models for basic tasks at MUSC

Predictive Model Performance

Get predictive model performance by category. Check whether a model performs differently across groups or categories.

# Start from a fresh copy of modelToolKit: detach it if it is attached.
# try() lets the script continue when the package was not attached.
try(detach(package:modelToolKit, unload = TRUE))
library(modelToolKit)
library(treemap)
library(caret)
library(rpart)

# Load the iris data and fix the RNG seed so the fits below are repeatable.
data(iris)
set.seed(2012)

# Classification tree for Species using every other column as a predictor.
fit <- rpart(Species ~ ., iris)

# k-means with 3 centers on the four measurement columns.
km <- kmeans(iris[, 1:4], 3)

# Predicted class, observed class, and the cluster id used as the grouping
# category in the performance-by-category examples.
preds <- predict(fit, iris, type = "class")
obs <- iris[["Species"]]
cats <- as.factor(km$cluster)

Plot performance by class

# Summarise predictions against observations within each category, then plot
# the resulting object.
x <- preformanceByCatagory(preds, obs, cats)
plot(x)

Performance by class in the case of regression models

# Regression example: model sale price on the Sacramento data with all other
# columns as predictors.
data("Sacramento")
fit <- rpart(price ~ ., data = Sacramento)

# Predicted price, observed price, and city as the grouping category.
preds <- predict(fit, Sacramento)
obs <- Sacramento[["price"]]
cats <- as.factor(Sacramento$city)

# catName labels the category as "city".
x <- preformanceByCatagory(preds, obs, cats, catName = "city")
# NOTE(review): the second positional argument presumably selects which plot
# to draw -- confirm against the plot method in modelToolKit.
plot(x, 2)

Cluster Model Utilities

Assign clusters to new data using a fitted kmeans model

# Cluster counts when the fitted k-means model is applied to the full data.
table(predictKmeans(km, iris)[, "cluster"])

# Same call on a copy whose first column is entirely NA.
x <- iris
x[, 1] <- NA
table(predictKmeans(km, x)[, "cluster"])

Find the optimal number of clusters

# Search for the number of clusters using the four measurement columns only.
findClusters(iris[, 1:4])

Data Frame and Factor Utilities

Find differences between two factors

# Compare two factors built over the levels a/b/c.
# The seed makes the sampled factor `y` reproducible.
set.seed(12012)

# `x` is a fixed repeating pattern (40 values); `y` is 30 random draws.
# Uses `<-` for assignment, matching the rest of this file.
x <- as.factor(rep(c("a", "b", "c", "a"), 10))
y <- as.factor(sample(c("a", "b", "c"), 30, replace = TRUE))

# Compute the differences between the two factors and print the result.
z <- factorDiffs(x, y)
z

Find differences between all factors in two data frames

# Number of rows in each example data frame.
# Uses `<-` for assignment, matching the rest of this file.
size <- 100

# Two data frames with identically named factor columns. Repeating a value in
# the sampling pool (e.g. "a" twice) makes that value more likely to be drawn.
df1 <- data.frame(
  x = as.factor(sample(c("a", "b", "c", "d"), size, replace = TRUE)),
  y = as.factor(sample(c("a", "b", "c", "a"), size, replace = TRUE))
)

df2 <- data.frame(
  x = as.factor(sample(c("a", "b", "c", "c"), size, replace = TRUE)),
  y = as.factor(sample(c("a", "b", "c", "b"), size, replace = TRUE))
)

# Compare the factor columns of the two data frames.
dataFrameFactDiffs(df1, df2)

Dataframe Transforms

Reorder all factor levels according to frequency; works on a data frame or a factor

# Example data with two categorical columns whose values occur with different
# frequencies. stringsAsFactors = TRUE is set explicitly: since R 4.0.0
# data.frame() keeps strings as character by default, and this example is
# about reordering factor levels.
mydata <- data.frame(
  x = c(rep("a", 10), rep("b", 15), rep("c", 10)),
  y = c(rep("a", 10), rep("b", 5), rep("c", 20)),
  stringsAsFactors = TRUE
)

# Show the structure before and after reordering.
str(mydata)
str(reOrderAll(mydata))

Level Matcher: apply one set of factor levels to another data frame. In the case where levels are in the new data frame but not in the original, the values are mapped to a fallback value (presumably NA; see the levelMatcher documentation).

# Reference data: x has levels a/b, y has levels c/d.
data1 <- data.frame(
  x = as.factor(rep(c("a", "b"), 10)),
  y = as.factor(rep(c("c", "d"), 10))
)

# New data containing values ("GG", "FF") that are absent from data1.
data2 <- data.frame(
  x = as.factor(rep(c("a", "b", "GG"), 10)),
  y = as.factor(rep(c("c", "FF", "GG"), 10))
)

# Apply data1's levels to data2 and inspect the resulting structure.
str(levelMatcher(newData = data2, oldData = data1))

Map all NA values to an unknown category, or to a specific number if the column is numeric

# Create some data: two categorical and two numeric columns, each with an NA
# in every fourth position. stringsAsFactors = TRUE is set explicitly: since
# R 4.0.0 data.frame() keeps strings as character by default, and x1/x2 are
# meant to be factors that receive a new level.
mydata <- data.frame(
  x1 = rep(c("a", "b", NA, "c"), 10),
  x2 = rep(c("a", "b", NA, "c"), 10),
  y1 = rep(c(1, 2, 3, NA), 10),
  y2 = rep(c(1, 2, 3, NA), 10),
  stringsAsFactors = TRUE
)

# Replace NAs using numVal = 0 and newLevName = "unKnown"; show the first 10
# rows of the result.
head(NAtoValueDf(mydata, numVal = 0, newLevName = "unKnown"), 10)

Variable Contributions

Generate a predictive model for whether a house's price is above the mean housing price, using the Sacramento data set inside caret. This model will be used to demonstrate the functions that compute logistic differences between observations.

library(caret)
library(ranger)
library(rpart)

# Build the modelling frame: five predictors plus a binary class that is 1
# when the sale price is above the mean price and 0 otherwise.
data("Sacramento")
predVars <- c("city", "beds", "sqft", "baths", "type")
rd <- Sacramento[, predVars]
rd$class <- as.factor(ifelse(Sacramento$price > mean(Sacramento$price), 1, 0))
f <- formula(class ~ .)

# Logistic regression and its fitted response-scale predictions.
fit <- glm(f, rd, family = "binomial")
preds <- predict(fit, rd, type = "response")

# Probability forest with impurity importance. The argument is spelled out as
# num.trees rather than relying on partial matching of `num.tree`.
fitranger <- ranger(
  f, rd,
  num.trees = 10,
  probability = TRUE,
  importance = "impurity"
)

# Decision tree on the same formula and data.
fitrpart <- rpart(f, rd)

Find the different factor contributions between a reference observation and a test observation. Uses the most likely observation to test against.

# Contributions for the second row of rd (predictor columns only), with the
# full rd data frame passed as mydata.
logisticDifference(fitranger, rd[2, predVars], mydata = rd)

Find the different factor contributions between a reference observation and a test data frame of observations. At the moment this does not scale very well.

library(reshape)
library(modelToolKit)

# Contributions for the first four rows of rd (predictor columns only), using
# rd as the reference data.
x <- logisticDifferenceDf(
  model = fitranger,
  testObs = rd[1:4, predVars],
  refData = rd,
  p = 0.4,
  verbose = TRUE
)

# Print the output element of the result, then plot it.
print(x$output)
# NOTE(review): `indvidualPlot` is kept exactly as in the original call;
# confirm the spelling against the plot method's signature before changing it.
plot(x, indvidualPlot = TRUE, n.vars = 3)