scoringmodule <- function(filename,modelSel,prevSessionid) {
library(VIF)
library(EvaluationMeasures)
library(pROC)
library(SDMTools)
library(party)
library(caret)
library(randomForest)
library(ROSE)
library(e1071)
library(car)
library(plyr)
library(dplyr)
library(caTools)
library(glmnet)
library(VIF)
library(plotly)
library(woeBinning)
library(tidyverse)
loc <- getServerPath(prevSessionid,getwd())
cleanedDataLoc <- paste0(loc,'/cleaned_data.csv')
data<-read.csv(file=cleanedDataLoc,stringsAsFactors = FALSE)
if(!grepl(".csv$", filename)){
stop("Uploaded filename must be a .csv filename!");
}
df_full_bin<-utils::read.csv(filename, header = TRUE, stringsAsFactors = FALSE )
list(message = paste("Read Successful" ))
test<-data.frame(df_full_bin)
thresholdLoc <- paste0(loc,'/threshold.csv')
data_pts<-read.csv(file=thresholdLoc,stringsAsFactors = FALSE)
target.var.name <- data_pts$DVName
typeof(target.var.name)
preprocessing <- function(df_data,dv){
data <- df_data
#add string for data summary
Summary_df <- data.frame(unclass(summary(data)), check.names = FALSE, stringsAsFactors = FALSE)
#consider target variable name given as input in HybridFS function as DV'
n <- dv
if(length(unique(data[[n]]))!=2)
{
stop("Error: Dependent variables should be 2 for binary classification!")
}
names(data)[names(data)==n] <- "DV"
df_temp<-data
#get the list of categorical variables
cat_var=data.frame()
df_temp<-df_temp[, names(df_temp) != "DV"]
#get the list of character variables
char <- df_temp[sapply(df_temp,is.character)]
cat_var<-char
#get the list of logical variables
logcl <- df_temp[sapply(df_temp, is.logical)]
cat_var<-cbind(cat_var,logcl)
if(ncol(cat_var) == 0)
{
fact <- df_temp[sapply(df_temp, is.factor)]
cat_var<-cbind(cat_var,fact)
}
#get the list of Factors
#fact <- df_temp[sapply(df_temp, is.factor)]
#cat_var<-cbind(cat_var,fact)
#removing the categorical variables in df_temp
df_temp<-df_temp[, !sapply(df_temp,is.logical)]
df_temp<-df_temp[, !sapply(df_temp,is.character)]
#determining other categorical variables with less than 52 levels
unique_lvl_cnt<-df_temp[lengths(lapply(df_temp, unique)) <= 52]
disc_var_names<-list()
disc_var_names<-names(unique_lvl_cnt)
discrete <- list(discrete=I(disc_var_names))
cat_var<-cbind(cat_var,unique_lvl_cnt)
cat_var_names<-list()
cat_var_names<-names(cat_var)
#display the list of categorical variables
#cat_var_names
categorical <- list(categorical=I(cat_var_names))
df_cont <- data[, names(data) != "DV"]
for(i in names(cat_var))
{
df_cont<-df_cont[names(df_cont) != i]
}
cont_var_names<-list()
cont_var_names<-names(df_cont)
#display the list of continuous variables
#cont_var_names
continuous <- list(continuous=I(cont_var_names))
#store the variables as list of lists
final_list <- list(discrete,categorical,continuous)
#add string for variable list
lapply(final_list, function(x) write.table( data.frame(x), 'LogFile.csv' , append= T, sep=',' ))
######################################################################################################################################################
#Categorical variables treatment
######################################################################################################################################################
#Steps
#Replace missing values as "Unknown" in categorical variables
#Check for categorical variables with more than 52 levels and reduce them to the top 10 levels that occur frequently
#convert categorical variables to factors and unique variables treatment
for(j in cat_var_names)
{
print(j)
data[,j]<-as.factor(data[,j])
if(length(unique(data[[j]])) >= 0.9*nrow(data))
{
data<-data[, names(data) != j]
}
}
#add string for cat var treatment
cat_var_df <- data.frame(unclass(summary(data)), check.names = FALSE, stringsAsFactors = FALSE)
write.table(cat_var_df, "LogFile.csv", sep = ",", col.names = T, append = T)
#Identify replace the missing values as Unknown in categorical variables
df_cat = data[sapply(data, is.factor) & colnames(data) != "DV"]
#View(df_cat)
#df_cat <- data[,sapply(data,is.factor)]
#df_cat <- df_cat[, names(df_cat) != "DV"]
if(ncol(df_cat)>0)
{
for (i in 1:ncol(df_cat))
{
levels <- levels(df_cat[,i])
if('Unknown' %in% levels)
{}else{
levels[length(levels) + 1] <- 'Unknown'
df_cat[,i] <- factor(df_cat[,i], levels = levels)
}
# refactor to include "Unknown" as a factor level
# and replace NA, null, blanks and ? with "Unknown"
df_cat[,i] <- factor(df_cat[,i], levels = levels)
df_cat[,i][is.na(df_cat[,i])] <- "Unknown"
df_cat[,i][is.null(df_cat[,i])] <- "Unknown"
df_cat[,i] <- sub("[?]", "Unknown", df_cat[,i])
df_cat[,i] <- sub("^$", "Unknown", df_cat[,i])
df_cat[,i]<-as.factor(df_cat[,i])
}
#add string for missing value treatment
missing_df <- data.frame(unclass(summary(df_cat)), check.names = FALSE, stringsAsFactors = FALSE)
write.table(missing_df, "LogFile.csv", sep = ",", col.names = T, append = T)
#Check for categorical variables with more than 52 levels and reduce them to the top 10 levels that
#occur frequently
for(i in names(df_cat))
{
column<-df_cat[,i]
uniq_lvls_cnt <- length(unique(column))
temp<-as.data.frame(column)
if (uniq_lvls_cnt>52)
{ temp<-data.frame()
cat_freq_cnt <- data.frame(table(column))
cat_freq_cnt <- cat_freq_cnt[ which(cat_freq_cnt$column!='Unknown' ),]
cat_sort <- cat_freq_cnt[order(-cat_freq_cnt$Freq),]
top_1<-head(cat_sort,10)
top<-as.character(top_1[,1])
data_cnt<-length(column)
levels = unique(column)
levels=as.character(levels)
temp <- factor(temp, levels = levels)
if('Unknown' %in% levels)
{}else{
levels[length(levels) + 1] <- 'Unknown'
temp <- factor(temp, levels = levels)
}
for(k in 1:data_cnt)
{
value<-column[k]
if(value %in% top)
{
temp<-rbind(temp,as.character(value))
}else
{
temp<-rbind(temp,'Unknown')
}
}}
df_cat[,i]<-temp
}
#add string to show reduced categorical values > 52
reduce_cat_df <- data.frame(unclass(summary(df_cat)), check.names = FALSE, stringsAsFactors = FALSE)
write.table(reduce_cat_df, "LogFile.csv", sep = ",", col.names = T, append = T)
#**********dv leakage code begin*************
df_factor_check<-data.frame()
dvleak_data<-df_cat
dvleak_data$DV <- ifelse(data$DV==unique(data$DV)[1], 0, 1)
for (fac in colnames(dvleak_data))
{
len<-1
if (class(dvleak_data[,fac])=="factor")
{
df_factor<-dvleak_data[,c("DV",fac)]
num_dv<-aggregate(DV~.,df_factor,sum)
num_levels <- aggregate(DV~.,df_factor,length)
df_factor_final<-merge(num_levels,num_dv,by=fac)
while(len<=nrow(num_dv))
{
output_matrix <- as.data.frame(matrix(data=c(
fac,
as.character(df_factor_final[len,1]),
df_factor_final[len,2],
df_factor_final[len,3]
),nrow=1,ncol=4))
df_factor_check <- rbind(df_factor_check, output_matrix)
len = len + 1
}
}
}
sum_dv<-sum(df_factor$DV==1)
df_factor_check<-cbind(df_factor_check,sum_dv)
colnames(df_factor_check)[1]<-"Variable"
colnames(df_factor_check)[2]<-"Factor_levels"
colnames(df_factor_check)[3]<-"Records_each_level"
colnames(df_factor_check)[4]<-"Num_Records_DV=1"
colnames(df_factor_check)[5]<-"Sum_of_DV"
df_factor_check$`Num_Records_DV=1`<-as.numeric(as.character(df_factor_check$`Num_Records_DV=1`))
for ( i in 1:nrow(df_factor_check))
{
df_factor_check$dv_leak[i]<-(df_factor_check$Sum_of_DV[i])-(df_factor_check$`Num_Records_DV=1`[i])
}
# subset for dv_leak =0 and delete the variables in the original data set.
#df_factor_check$`Num_Employees_DV=1`<-as.numeric(as.character(df_factor_check$`Num_Employees_DV=1`))
rem_names=as.character(df_factor_check[df_factor_check$dv_leak==0,]$Variable) #Response same for 90% of the employees
if(length(rem_names)!=0)
{
for ( i in 1:length(rem_names))
{
df_cat[,rem_names[i]]<-NULL
}
}
}
#******DV leakage code ends**************
write.csv(df_cat, file = "final_out.csv")
######################################################################################################################################################
#continuous variables treatment
######################################################################################################################################################
#Steps
#first get the correlation matrix
#get the variable pairs that highly correlated
#bin the variables using woe binning
#get the significance of these binned variables using chi square test.
#Remove variables from highly correlated variable list that are not significant
#check if muliticollinearity still exists and keep removing variables until vif drops below 5 for all variables
#get the binned version of the continuous variables
df<-data
#Removing categorical variables from data
for(i in cat_var_names)
{
df<-df[names(df) != i]
}
#unique variables treatment if ID is not categorical
for(i in names(df))
{
if(grepl("id", i) | grepl("ID", i))
{
if(length(unique(df[[i]])) >= 0.9*nrow(df))
{
df<-df[names(df) != i]
}
}
}
df1<-df%>%data.frame
#creating correlation matrix for continuous variables
if(length(df1)>1)
{
df1<-df1[complete.cases(df1),]
##New Change - Sai - DV should not be sent for correlation check
corr<-round(cor(df1[,names(df1) != 'DV']),2)
corr_val<-corr
corr_val[lower.tri(corr_val,diag=TRUE)]=NA # make all the diagonal elements as NA
corr_val<-as.data.frame(as.table(corr_val)) # as a dataframe
corr_val<-na.omit(corr_val) # remove NA
corr_val<-corr_val[with(corr_val, order(-Freq)), ] # order by correlation
corr_test_var<-subset(corr_val,Freq>=0.85)
#add string to show continuous var treatment
reduce_cat_df <- data.frame(unclass(summary(df_cat)), check.names = FALSE, stringsAsFactors = FALSE)
#write.table(reduce_cat_df, "LogFile.csv", sep = ",", col.names = T, append = T)
#adding ".binned" to each variable
##New Change - Sai - Check if the df is not empty before operation
if(nrow(corr_test_var) > 0)
{
corr_test_var$Var1<-paste(corr_test_var$Var1,"binned",sep = ".")
corr_test_var$Var2<-paste(corr_test_var$Var2,"binned",sep = ".")
}
#woe binning
var_del<-as.character(names(df))
binning <- woeBinning::woe.binning(df, 'DV', df)
tabulate.binning <- woeBinning::woe.binning.table(binning)
data_cont_binned <- woeBinning::woe.binning.deploy(data, binning)
#names(data_cont_binned)
#add string to show binned variables
bin_df <- data.frame(unclass(summary(data_cont_binned)), check.names = FALSE, stringsAsFactors = FALSE)
#write.table(bin_df, "LogFile.csv", sep = ",", col.names = T, append = T)
#removing original values of variables that have been binned
for(i in var_del)
{
data_cont_binned<-data_cont_binned[, names(data_cont_binned) != i]
}
data_cont_binned[is.na(data_cont_binned)] <- "Missing"
data_cont_binned$DV<-data$DV
#getting the correlated variables as a unique list
##New Change - Sai - Check if there are any correlated variables before
##applying treatments
if(nrow(corr_test_var) > 0)
{
corr_var<-list()
corr_var<-corr_test_var$Var1
corr_var<-c(corr_var,corr_test_var$Var2)
corr_var_unique<-unique(corr_var)
#getting the chi sq for each highly correlated variable
corr_var_chsq <- data.frame()
for(each in corr_var_unique)
{
p_val=chisq.test((data_cont_binned[[each]]),data_cont_binned$DV)$p.value
c <- data.frame('Var_name' = each,'p_value' = p_val)
corr_var_chsq <- rbind(corr_var_chsq,c)
}
#add string to show Chi sq test
chisq_df <- data.frame(unclass(summary(corr_var_chsq)), check.names = FALSE, stringsAsFactors = FALSE)
#write.table(chisq_df, "LogFile.csv", sep = ",", col.names = T, append = T)
#remove the highly correlated variables that are not significant
corr_var_insig<-as.character(corr_var_chsq[which(corr_var_chsq$p_value>0.05),1])
#stripping off the "'binned" from variable names
corr_var_insig_strip<-substr(corr_var_insig, 1, nchar(corr_var_insig)-7)
#removing the insignificant variables
for (f in corr_var_insig) {
df1[[f]] <- NULL
}
}
df1$DV <- ifelse(df1$DV==unique(df1$DV)[1], 0, 1)
#checking if we still have multi collinearity and removing variables with very high vif until no such variable exists
# Fit a model to the data
fit=glm(DV ~ ., data=df1,family=binomial)
##New Change - Nithya - Check if there are linearly dependent variables in the model and remove it
df_alias <- attributes(alias(fit)$Complete)$dimnames[[1]]
if(!is.null(df_alias))
{
for(i in df_alias)
{
df1<-df1[, names(df1) != i]
}
# Fit a model to the data
fit=glm(DV ~ ., data=df1,family=binomial)
}
# Calculating VIF for each independent variable
car::vif(fit)
# Set a VIF threshold. All the variables having higher VIF than threshold
#are dropped from the model
threshold=5
# Sequentially drop the variable with the largest VIF until
# all variables have VIF less than threshold
drop=TRUE
aftervif=data.frame()
while(drop==TRUE) {
vfit=car::vif(fit)
aftervif=rbind.fill(aftervif,as.data.frame(t(vfit)))
if(max(vfit)>threshold) { fit=
update(fit,as.formula(paste(".","~",".","-",names(which.max(vfit))))) }
else { drop=FALSE }}
# How variables were removed sequentially
t_aftervif= as.data.frame(t(aftervif))
# Final (uncorrelated) variables and their VIFs
vfit_d= as.data.frame(vfit)
#add string to show VIF
vif_df <- data.frame(unclass(summary(vfit_d)), check.names = FALSE, stringsAsFactors = FALSE)
#write.table(vif_df, "LogFile.csv", sep = ",", col.names = T, append = T)
rem_var<-as.character(rownames(vfit_d))
#retaining only the uncorrelated variables in the final data
df<-df[ , rem_var]
#getting the concatenated version of variables that needs to be retained
rem_var<-paste(rem_var,"binned",sep=".")
#getting the binned continuous variables
data_cont_binned_fin<-data_cont_binned[,rem_var]
df_final <- df_cat
df_cat <- cbind(df_cat, data_cont_binned_fin)
}
######################################################################################################################################################
#getting the final data frame with the required variables
######################################################################################################################################################
df_final <- cbind(df_final, df)
write.csv(df_final, file = "clean.csv")
##New Change - Sai - DV should be added since the df_cat
## is used in building final dataframe
final_data_after_processing=data.frame()
final_data_after_processing<-df_final
final_data_after_processing<-cbind(final_data_after_processing,select(data,.data$DV))
#add string to show summary of final pre-processed data
#final_df <- data.frame(unclass(summary(final_data_after_processing)), check.names = FALSE, stringsAsFactors = FALSE)
#write.table(final_df, "LogFile.csv", sep = ",", col.names = T, append = T)
return(final_data_after_processing)
}
test <- preprocessing(test,target.var.name)
names(data)[names(data)==target.var.name] <- "DV"
names(test)[names(test)==target.var.name] <- "DV"
#############################################
#data_nDV<-data.frame(filename)
library(dplyr)
data_names<-names(data %>% select(contains("binned")))
test_names<-sub(".binned","",data_names)
names<-list()
for (value in data_names){
names[[value]]<-unique(data[value])
}
# for(i in 1:2){
# for(j in 1:length(unlist(names[[i]]))){
# print(unlist(names[[i]])[j])
# }
# }
#
hold_data <- list()
for( i in 1:length(names)){
data_final <- c()
tt <- names[[i]]
colnames(tt) <- NULL
for(z in 1:nrow(tt)){
m <- tt[z,]
l <- unlist( strsplit(as.character(m),","))
lower <- NA
upper <- NA
for(z_z in 1:length(l)){
o <- l[z_z]
x <- gsub("\\(|\\[|\\)|\\]","",o)
holdit <- gsub("^\\s+|\\s+$", "", x)
hold_me <- holdit
if( z_z == 1){
lower <- hold_me
}else{
upper <- hold_me
}
}
#temp <- data.frame(lower,upper)
temp <- c(lower,upper)
data_final <- rbind(data_final,temp)
}
data_final <- as.matrix(data_final)
colnames(data_final) <- c("Lower","Upper")
rownames(data_final) <- NULL
hold_data[[i]] <- data_final
}
names(hold_data) <- names(names)
for (i in data_names){
test[,i]<-NA
}
for (row in 1:nrow(test)) {
for (i in 1:length(data_names)){
nam<-data_names[i]
y=length(hold_data[[i]])/2
for (x in 1:y){
if(is.na(as.integer(hold_data[[i]][x,]["Lower"]))){
if(test[test_names[i]][row,]<=as.numeric(hold_data[[i]][x,]["Upper"])){
test[row,][nam]<-(paste("(",hold_data[[i]][x,]["Lower"],",",hold_data[[i]][x,]["Upper"],"]", sep=""))
}
}
else if(is.na(as.integer(hold_data[[i]][x,]["Upper"]))){
if(test[test_names[i]][row,]>as.numeric(hold_data[[i]][x,]["Lower"])){
test[row,][nam]<-paste("(",hold_data[[i]][x,]["Lower"],", ",hold_data[[i]][x,]["Upper"],"]", sep="")
}
}
else{
if(test[test_names[i]][row,]>as.numeric(hold_data[[i]][x,]["Lower"]) && test[test_names[i]][row,]<=as.numeric(hold_data[[i]][x,]["Upper"])){
test[row,][nam]<-paste("(",hold_data[[i]][x,]["Lower"],",",hold_data[[i]][x,]["Upper"],"]", sep="")
}
}
}
}}
test$DV <- NULL
test <- test[, !colnames(test) %in% test_names]
write.csv(test,'test_binned.csv')
testD<-test
###To Convert into categorical an variables
variablesLoc <- paste0(loc,'/variable_list.csv')
data_type<-read.csv(file=variablesLoc,stringsAsFactors = FALSE)
cat_var<- as.vector(data_type$categorical)
cat_var <- cat_var[!is.na(cat_var)]
num_var<- as.vector(data_type$continuous)
num_var <- num_var[!is.na(num_var)]
for (value in cat_var){
#print(typeof(testD[value]))
testD[value]<- as.factor(testD[[value]])
}
for (value in num_var){
testD[value]<- as.numeric(testD[[value]])
}
#
model<-data_pts$ModelName
if (model == "SVM") {
typeResp <- 'prob'
} else if(model == "NB"){
typeResp <- 'raw'
} else {
typeResp <- 'response'
}
posit_class<-data_pts$PredictorClass
if(is.null(posit_class))
{
if(is.numeric(testD$DV))
{
posit_class <- 1
}
else if(is.factor(testD$DV))
{
dvList <- tolower(unique(testD$DV))
if("yes" %in% dvList)
{
posit_class <- "yes"
}
else
{
posit_class <- names(which.max(table(testD$DV)))
}
}
positive_class <- posit_class
}
if(posit_class==1) {
negClass <- 0
} else {
uniqLvls <- as.character(unique(testD$DV))
negClass <- uniqLvls[uniqLvls != posit_class]
}
#threshold<-k_stat_value(modelInput,trainD,testD,posit_class,model)
threshold<-data_pts$Threshold
modelName <- paste(tolower(modelSel),'_model.RData',sep="")
finalmodelPath <- paste0(loc,paste0('/',modelName))
#the_model<-load(paste(modelPath,modelName,sep=""))
the_model <- load(finalmodelPath)
sum_model <- get(the_model)
if(! (model %in% c('SVM','NB'))) {
pred <- predict(sum_model, newdata=testD, type=typeResp,se.fit=FALSE)
} else {
pred <- predict(sum_model, newdata=testD, type=typeResp,se.fit=FALSE)[,posit_class]
}
testD$Prob <- pred
testD$Predicted[pred>max(threshold)] <- posit_class
testD$Predicted[pred<=max(threshold)] <- negClass
names(testD)[names(testD)=="Predicted"] <- target.var.name
write.csv(testD,'scoredData.csv')
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.