Get predictive model performance by category. Check whether a model performs differently across a group or category.
# Detach any already-attached copy of modelToolKit before loading it again;
# detach() errors when the package is not attached, hence the try().
try(detach(package:modelToolKit, unload = TRUE))
library(modelToolKit)
library(treemap)
library(caret)
library(rpart)

data(iris)
set.seed(2012)

# Classification tree for Species on all other columns, plus a 3-centre
# k-means on the first four columns whose cluster ids act as the category.
fit <- rpart(Species ~ ., iris)
km <- kmeans(iris[, 1:4], 3)

preds <- predict(fit, iris, type = "class")
obs <- iris[, "Species"]
cats <- as.factor(km$cluster)
Plot performance by class
# Summarise predictions against observations within each category, then plot.
x <- preformanceByCatagory(preds, obs, cats)
plot(x)
# Same workflow for a numeric outcome: regress price on every other column
# of Sacramento and break the results down by city.
data("Sacramento")
fit <- rpart(price ~ ., data = Sacramento)
preds <- predict(fit, Sacramento)
obs <- Sacramento[, "price"]
cats <- as.factor(Sacramento$city)

x <- preformanceByCatagory(preds, obs, cats, catName = "city")
plot(x, 2)
Assign clusters to new data using a fitted kmeans model
# Cluster counts when the fitted k-means object is applied to the full data.
table(predictKmeans(km, iris)[, "cluster"])

# Repeat with the first column set entirely to NA
# (presumably to show assignment with a missing variable; confirm against
# the predictKmeans documentation).
x <- iris
x[, 1] <- NA
table(predictKmeans(km, x)[, "cluster"])
Find the optimal number of clusters
# Run the cluster-count search on the first four columns of iris.
findClusters(iris[, 1:4])
Find differences between two factors
# Compare two factors built from the same three labels.
# NOTE(review): x has 40 elements (4 labels x 10) while y has 30; confirm
# that factorDiffs() does not require equal-length inputs.
set.seed(12012)
x <- as.factor(rep(c("a", "b", "c", "a"), 10))
y <- as.factor(sample(c("a", "b", "c"), 30, replace = TRUE))

z <- factorDiffs(x, y)
z
Find the differences between all factors in two data frames
# Two data frames with identically named factor columns. Labels are repeated
# in the sampling pools (e.g. "a" twice) so the level frequencies differ
# between the two frames.
size <- 100

df1 <- data.frame(
  x = as.factor(sample(c("a", "b", "c", "d"), size, replace = TRUE)),
  y = as.factor(sample(c("a", "b", "c", "a"), size, replace = TRUE))
)
df2 <- data.frame(
  x = as.factor(sample(c("a", "b", "c", "c"), size, replace = TRUE)),
  y = as.factor(sample(c("a", "b", "c", "b"), size, replace = TRUE))
)

dataFrameFactDiffs(df1, df2)
Reorder all factor levels according to frequency; works on a data frame or a factor
# Two columns with different label frequencies. stringsAsFactors = TRUE is
# set explicitly: since R 4.0 data.frame() keeps strings as character by
# default, which would leave no factor levels for the example to reorder.
mydata <- data.frame(
  x = rep(c("a", "b", "c"), times = c(10, 15, 10)),
  y = rep(c("a", "b", "c"), times = c(10, 5, 20)),
  stringsAsFactors = TRUE
)

# Compare the structure before and after reordering.
str(mydata)
str(reOrderAll(mydata))
Level Matcher: apply one set of factor levels to another data frame
In the case where levels are in the new data frame, but not in the original, the values are mapped to
# data1 holds the reference levels; data2 has the same columns but also
# contains labels ("GG", "FF") that never occur in data1.
data1 <- data.frame(
  x = as.factor(rep(c("a", "b"), 10)),
  y = as.factor(rep(c("c", "d"), 10))
)
data2 <- data.frame(
  x = as.factor(rep(c("a", "b", "GG"), 10)),
  y = as.factor(rep(c("c", "FF", "GG"), 10))
)

str(levelMatcher(newData = data2, oldData = data1))
Map all NA values to an unknown category, or to a specific number if the column is numeric
# Create some data with NA in every column: x1/x2 are categorical, y1/y2 are
# numeric. The comment sits on its own line so the code below is not
# commented out. stringsAsFactors = TRUE keeps x1/x2 as factors on R >= 4.0,
# where data.frame() no longer converts strings by default.
mydata <- data.frame(
  x1 = rep(c("a", "b", NA, "c"), 10),
  x2 = rep(c("a", "b", NA, "c"), 10),
  y1 = rep(c(1, 2, 3, NA), 10),
  y2 = rep(c(1, 2, 3, NA), 10),
  stringsAsFactors = TRUE
)

# Show the first ten rows after replacement.
head(NAtoValueDf(mydata, numVal = 0, newLevName = "unKnown"), 10)
Generate a predictive model for whether a housing price is over the mean housing price, using the Sacramento data set from caret. This model will be used to demonstrate the logistic-difference-between-observations functions.
library(caret)
library(ranger)
library(rpart)

data("Sacramento")

# Predictor columns used by every model below.
predVars <- c("city", "beds", "sqft", "baths", "type")
rd <- Sacramento[, predVars]

# Binary target: 1 when the price is above the mean price, otherwise 0.
rd$class <- as.factor(ifelse(Sacramento$price > mean(Sacramento$price), 1, 0))

f <- formula(class ~ .)

# Logistic regression and its fitted probabilities.
fit <- glm(f, rd, family = "binomial")
preds <- predict(fit, rd, type = "response")

# Probability forest; the argument is spelled out as num.trees rather than
# relying on partial matching of `num.tree`.
fitranger <- ranger(
  f,
  rd,
  num.trees = 10,
  probability = TRUE,
  importance = "impurity"
)

fitrpart <- rpart(f, rd)
Find the different factor contributions between a reference observation and a test observation. Uses the most likely observation to test against.
# Test observation: row 2 of rd, restricted to the predictor columns.
logisticDifference(fitranger, rd[2, predVars], mydata = rd)
Find the different factor contributions between a reference observation and a test data frame of observations. At the moment this does not scale very well.
library(reshape)
library(modelToolKit)

# Run the comparison for the first four rows of rd against the full data.
x <- logisticDifferenceDf(
  model = fitranger,
  testObs = rd[1:4, predVars],
  refData = rd,
  p = 0.4,
  verbose = TRUE
)

print(x$output)
plot(x, indvidualPlot = TRUE, n.vars = 3)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.