Nothing
#' Contour plots and MCA function for classification modeling
#'
#' This function presents visual graphics by means of Multiple correspondence Analysis projection.
#' Interval variables are categorized to bins.
#' Dependent classification variable is set as supplementary variable.
#' Machine learning algorithm predictions are presented in a filled contour setting.
#' @usage mcacontour(dataf=dataf,listconti,listclass,vardep,proba="",bins=8,
#' Dime1="Dim.1",Dime2="Dim.2",classvar=1,intergrid=0,selec=0,
#' title="",title2="",listacol="",depcol="",alpha1=0.8,alpha2=0.8,alpha3=0.7,modelo="glm",
#' nodos=3,maxit=200,decay=0.01,sampsize=400,mtry=2,nodesize=5,
#' ntree=400,ntreegbm=500,shrink=0.01,bag.fraction=1,n.minobsinnode=10,C=100,gamma=10)
#' @param bins Number of bins for categorize interval variables .
#' @inheritParams famdcontour
#' @keywords MCA, classification, contour_curves
#' @export
#' @import MASS
#' @import ggplot2
#' @importFrom MBA mba.points
#' @importFrom magrittr %>%
#' @importFrom nnet nnet
#' @import gbm
#' @import e1071
#' @importFrom stats aggregate formula glm terms predict family binomial
#' @importFrom dplyr inner_join
#' @importFrom randomForest randomForest
#' @importFrom pROC roc
#' @examples
#' data(breastwisconsin1)
#' dataf<-breastwisconsin1
#' listconti=c( "clump_thickness","uniformity_of_cell_shape","mitosis")
#' listclass=c("")
#' vardep="classes"
#' result<-mcacontour(dataf=dataf,listconti,listclass,vardep)
#' @details
#' This function applies MCA (Multiple Correspondence Analysis) in order to project points and categories of
#' class variables in the same plot. In addition, interval variables listed in listconti are categorized to
#' the number given in bins parameter (by default 8 bins). Further explanation about machine learning classification
#' and contour curves, see the famdcontour function documentation.
#' @return A list with the following objects:\describe{
#' \item{graph1}{plot of points on MCA two dimensions}
#' \item{graph2}{plot of points and variables}
#' \item{graph3}{plot of points and contour curves}
#' \item{graph4}{plot of points, contour curves and variables}
#' \item{graph5}{plot of points colored by fitted probability}
#' \item{graph6}{plot of points colored by abs difference}
#' \item{df1}{ dataset used for graph1}
#' \item{df2}{ dataset used for graph2}
#' \item{df3}{ dataset used for graph3}
#' \item{df4}{ dataset used for graph4}
#' \item{listconti}{ interval variables used}
#' \item{listclass}{ class variables used}
#' \item{...}{color schemes and other parameters }
#' }
######################################################################
#
# mcacontour.R
#
# copyright (c) 2020-13-10, Javier Portela
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License,
# version 3, as published by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful,
# but without any warranty; without even the implied warranty of
# merchantability or fitness for a particular purpose. See the GNU
# General Public License, version 3, for more details.
#
# A copy of the GNU General Public License, version 3, is available
# at http://www.r-project.org/Licenses/GPL-3
#
######################################################################
mcacontour<-function(dataf=dataf,listconti,listclass,vardep,proba="",bins=8,
Dime1="Dim.1",Dime2="Dim.2",classvar=1,intergrid=0,selec=0,
title="",title2="",listacol="",depcol="",alpha1=0.8,alpha2=0.8,alpha3=0.7,
modelo="glm",nodos=3,maxit=200,decay=0.01,sampsize=400,mtry=2,nodesize=5,ntree=400,
ntreegbm=500,shrink=0.01,bag.fraction=1,n.minobsinnode=10,C=100,gamma=10)
{
if (any(listacol=="")==TRUE)
{
listacol<-c("#377EB8", "#4DAF4A", "#984EA3", "#FF7F00",
"#A65628", "#F781BF", "#999999", "#1B9E77", "#D95F02", "#7570B3",
"#E7298A", "#66A61E","#E6AB02", "#A6761D", "#666666", "#8DD3C7",
"#FFFFB3", "#BEBADA", "#FB8072", "#80B1D3", "#FDB462", "#B3DE69",
"#FCCDE5", "#D9D9D9","#A6CEE3","#1F78B4","#B2DF8A","#33A02C","#FB9A99","#E31A1C",
"#FDBF6F","#FF7F00","#CAB2D6","#6A3D9A","#FFFF99","#B15928")
}
minori<-numeric()
dimf1<-numeric()
dimf2<-numeric()
Frecu<-numeric()
Variable=character()
fontface=numeric()
z<-numeric()
dife<-numeric()
tabla1<-as.data.frame(table(dataf[,vardep]))
tabla1<-tabla1[order(tabla1$Freq),]
minoritaria<-as.character(tabla1[1,c("Var1")])
tabla1<-tabla1[order(-tabla1$Freq),]
mayoritaria<-as.character(tabla1[1,c("Var1")])
cosa<-as.data.frame(prop.table(table(dataf[[vardep]])))
fremin<-100*round(min(cosa$Freq),2)
totalobs=nrow(dataf)
minori=1
cosa<-as.data.frame(table(dataf[[vardep]]))
totalmin<-round(min(cosa$Freq),2)
salida<-mcamodelobis(dataf=dataf,listconti=listconti,listclass=listclass,vardep=vardep,
Dime1=Dime1,Dime2=Dime2,bins=bins,selec=selec)
daf<-as.data.frame(salida[[1]])
etiquetas<-as.data.frame(salida[[2]])
datafnuevo<-as.data.frame(salida[[3]])
listconti<-salida[4]
listclass<-salida[5]
if (length(listclass)==0)
{listclass<-c("")}
ejex<-salida[6]
ejey<-salida[7]
variss<-salida[[8]]
formu<-paste("factor(",vardep,")~.")
formu<-stats::formula(formu)
# *********************
# MODELOS
# *********************
if (any(proba=="")==TRUE)
{
if (modelo=="nnet") {
model <- nnet( formu,data=datafnuevo, size = nodos,
maxit = maxit, trace = FALSE,decay=decay,linout=FALSE)
proba<- predict(model,dataf,type="raw")
}
if (modelo=="glm") {
model <- glm(formula(formu),family=binomial(link='logit'),data=datafnuevo)
proba<- predict(model,dataf,type="response")
}
if (modelo=="rf") {
if (sampsize>nrow(dataf)) {sampsize=nrow(dataf)}
model <- randomForest(formu,data=datafnuevo,sampsize=sampsize,
mtry=mtry,nodesize=nodesize,ntree=ntree)
proba <- predict(model, dataf,type="prob")
proba<-as.vector(proba[,2])
}
if (modelo=="gbm") {
formu<-paste("clasefin~.")
datafnuevo$clasefin<-ifelse(datafnuevo[,c(vardep)]==minoritaria,1,0)
# datafnuevo$clasefin<-as.character(datafnuevo$clasefin)
a2<-datafnuevo
a2[,c(vardep)]<-NULL
dataf$clasefin<-ifelse(dataf[,c(vardep)]==minoritaria,1,0)
# dataf$clasefin<-as.character(dataf$clasefin)
a3<-dataf
a3[,c(vardep)]<-NULL
model <- gbm(formula(formu),distribution="bernoulli", data=a2,
n.trees=ntreegbm,shrinkage = shrink,
bag.fraction = bag.fraction,n.minobsinnode =n.minobsinnode ,
interaction.depth = 2)
proba <- predict(model,a3,type="response",n.trees=model$n.trees)
}
if (modelo=="svm") {
a3<-datafnuevo
a3$vardep2<-ifelse(a3[,c(vardep)]==minoritaria,1,0)
a3[,c(vardep)]<-NULL
formu<-paste("factor(vardep2)~.")
model <- svm(formu, data=a3,
kernel = "radial",cost=C,gamma=gamma,probability = TRUE)
proba<- predict(model, dataf,probability = TRUE)
proba<-as.data.frame(attr(proba, "probabilities","dimnames"))
proba<-as.data.frame(proba[,c("1")])
names(proba)<-"V1"
datafnuevo$vardep2<-NULL
}
}
proba<-as.data.frame(proba)
names(proba)<-"V1"
union<-cbind(proba,daf)
curvaroc<-suppressMessages(pROC::roc(response=union$vardep,predictor=union$V1))
auc<-round(curvaroc$auc,2)
if (any(title2=="")==TRUE)
{
title2<-paste(vardep," ","minor=",fremin,"% ","totalobs=",totalobs," ","totalmin=",totalmin)
title3<-paste(vardep," ","minor=",fremin,"% ","totalobs=",totalobs," ","totalmin=",totalmin," ",
"algorithm=", modelo," auc=",auc)
}
else if (any(title2=="")==FALSE)
{
title3=title2
}
union$x<-union$dimf1
union$y<-union$dimf2
union$z<-union$V1
unia<-union[,c("x","y","z")]
unia<-unia[order(unia$x,unia$y,unia$z),]
filasorig<-nrow(unia)
aggdata <-aggregate(unia, by=list(unia$x,unia$y),
FUN=mean, na.rm=TRUE)
filasfin<-nrow(aggdata)
ag<-aggdata[order(aggdata$x,aggdata$y),]
# Aquí miro min max
minx<-min(aggdata$x)
miny<-min(aggdata$y)
maxx<-max(aggdata$x)
maxy<-max(aggdata$y)
filas<-nrow(aggdata)
rango1<-maxy-miny
rango2<-maxx-minx
if (intergrid==0) {
intergrid=0.5*rango1*rango2/filas
intergrid=sqrt(intergrid)
}
f<-data.frame()
for (x in seq(minx,maxx,intergrid)) {
for (y in seq(miny,maxy,intergrid)){
b<-cbind(x,y)
f<-rbind(f,b)
}
}
ag<-ag[,c("x","y","z")]
# AQUÍ SE INTERPOLA EN LOS PUNTOS CREADOS EN EL GRID F ANTERIOR, CON BASE EN EL dataf AG
otro<-mba.points(ag,xy.est=f)
otro<-as.data.frame(otro)
otro<-otro[order(otro$xyz.est.x,otro$xyz.est.y),]
t<-otro
t$x<-t$xyz.est.x
t$y<-t$xyz.est.y
t$z<-t$xyz.est.z
t$z<-ifelse(t$z<0,0.00000001,t$z)
t$z<-ifelse(t$z>1,0.9999999,t$z)
t<-t[,c("x","y","z")]
daf$x<-daf$dimf1
daf$y<-daf$dimf2
uni<-etiquetas
uni$x<-uni$dimf1
uni$y<-uni$dimf2
uni$Variable<-as.character(uni$Variable)
# GRAFICOS
# COLORES
if (any(depcol=="")==TRUE)
{
depcol<-c("darkolivegreen3", "red")
}
la<-c(mayoritaria,minoritaria)
colScale <- scale_colour_manual(name = vardep,values = depcol,labels=la)
# k colors from list
# a<-paste(".",vardep,sep="")
# uni<-uni[which(uni$Variable!=a),]
ta<-as.data.frame(table(uni$Variable))
longi<-nrow(ta)
ta<-as.character(ta$Var1)
listabis<-listacol[c(1:longi)]
la2<-c(la,ta)
colScale2<- scale_colour_manual(name = "",values =c(depcol,listabis),labels=la2)
mxim<-max(as.numeric(names(table(daf$Frecu))))
k<-nrow(table(daf$Frecu))
vectorFre<-as.numeric(names(table(daf$Frecu)))
if (k<=4)
{
bre<-vectorFre
rango<-c(1,k)
}
if (k>=4)
{
bre<-numeric()
for(i in seq(1,mxim,by=mxim/4))
{
bre<-append(bre,ceiling(i))
}
rango<-c(1,4)
}
union$minori<-as.numeric(union$minori)
union$dife<-(union$minori-union$z)
breaks=c(0,0.25,0.50,0.75,0.95,1)
miniz<-min(t$z)
maxiz<-max(t$z)
for (i in seq(1,5,by=1))
{
if (breaks[i]<=miniz & miniz<=breaks[i+1])
{minimo<-i}
if (breaks[i]<=maxiz & maxiz<=breaks[i+1])
{maximo<-i}
}
inicio=0.8-0.12*(minimo-1)
fin=0.2+0.12*(5-maximo)
g2<-ggplot(daf, aes(x=x, y=y)) + theme_bw()+
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
geom_point(aes(colour=minori,size=Frecu,alpha=minori))+
scale_alpha_manual(values = c(alpha1, alpha2),guide="none")+
scale_size(name="Freq",breaks=bre,range=rango)+
ggtitle(title,subtitle=title2)+
theme(
plot.title = element_text(hjust=0.5,color="blue"),
plot.subtitle= element_text(hjust=0.5,color="orangered4")
)
if (classvar==1)
{
g2<-g2+
geom_text(data = variss,aes(x =dimf1, y =dimf2,label = rownames(variss),
fontface=2),size=5,show.legend=F,colour = "orangered4")
}
g2<-g2+colScale+geom_density2d(colour = "gray80",alpha=1,show.legend=FALSE)+
xlab(ejex)+ylab(ejey)+
guides(colour = guide_legend("", override.aes = list(size = 4,alpha = 1)),
size=guide_legend(title="Freq", override.aes = list(alpha = 1))
)
g3<-g2+ geom_contour_filled(data=t, aes(x=x, y=y,z=z),alpha=0.65,breaks=breaks)+
scale_fill_grey(start=inicio,end=fin)+
ggtitle(title, subtitle = title3)+
theme(
plot.title = element_text(hjust=0.5,color="blue"),
plot.subtitle= element_text(hjust=0.5,color="orangered4")
)
g4<-ggplot(daf, aes(x=x, y=y)) + theme_bw()+
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
geom_point(aes(colour=minori,size=Frecu,alpha =minori))+
scale_alpha_manual(values = c(alpha1, alpha2),guide="none")+
scale_size(name="Freq",breaks=bre,range=rango)
if (classvar==1)
{
g4<-g4+
geom_text(data = variss,aes(x =dimf1, y =dimf2,label = rownames(variss),
fontface=2),size=5,show.legend=F,colour = "orangered4")
}
g4<-g4+geom_density2d(colour = "gray80",alpha=1,show.legend=FALSE)+
xlab(ejex)+ylab(ejey)+
geom_text(data = uni,aes(x =x, y =y,label = rownames(uni),
colour = Variable,fontface=1),size=sort(uni$tama),show.legend =F)+
colScale2
g4<-g4+ ggtitle(title, subtitle = title3)+
theme(
plot.title = element_text(hjust=0.5,color="blue"),
plot.subtitle= element_text(hjust=0.5,color="orangered4")
)+
guides(colour = guide_legend("", override.aes = list(size = 4,alpha = 1)),
size=guide_legend(title="Freq", override.aes = list(alpha = 1))
)
g3<-g2+ geom_contour_filled(data=t, aes(x=x, y=y,z=z),alpha=0.65,breaks=breaks)+scale_fill_grey(start=inicio,end=fin)
g5<-g4+ geom_contour_filled(data=t, aes(x=x, y=y,z=z),alpha=0.65,breaks=breaks)+scale_fill_grey(start=inicio,end=fin)+
ggtitle(title, subtitle = title3)+
theme(
plot.title = element_text(hjust=0.5,color="blue"),
plot.subtitle= element_text(hjust=0.5,color="orangered4")
)+
guides(colour = guide_legend("", override.aes = list(size = 4,alpha = 1)),
size=guide_legend(title="Freq", override.aes = list(alpha = 1))
)
gradi<-ggplot(union, aes(x=x, y=y)) + theme_bw()+
geom_point(aes(colour=z,size=Frecu),alpha =alpha3)+
scale_size(name="Freq",breaks=bre,range=rango)+
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())
gradi<-gradi+scale_colour_gradient(low= "green1",high = "red",name="Fitted probability")+
xlab(ejex)+ylab(ejey)+ ggtitle(title,subtitle=title3)+
theme(
plot.title = element_text(hjust=0.5,color="blue"),
plot.subtitle= element_text(hjust=0.5,color="orangered4")
)+
guides(size=guide_legend(title="Freq", override.aes = list(alpha = 1)))
gradi2<-ggplot(union, aes(x=x, y=y)) + theme_bw()+
geom_point(aes(fill=dife,size=Frecu),alpha =alpha3,shape=21,color="orange")+
scale_size(name="Freq",breaks=bre,range=rango)+
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())
gradi2<-gradi2+
scale_fill_gradient2(midpoint = 0, low = "blue4", mid = "white",
high = "darkred", space = "Lab",name="vardep-p" )
gradi2<-gradi2+ xlab(ejex)+ylab(ejey)+ ggtitle(title,subtitle=title3)+
theme(
plot.title = element_text(hjust=0.5,color="blue"),
plot.subtitle= element_text(hjust=0.5,color="orangered4")
)+
guides(size=guide_legend(title="Freq", override.aes = list(alpha = 1)))
cosa<-list(g2,g4,g3,g5,gradi,gradi2,daf,uni,t,union,listconti,listclass,
colScale,colScale2,ejex,ejey,title2,title3)
names(cosa)<-c("graph1","graph2","graph3","graph4","graph5","graph6","df1",
"df2","df3","df4","listconti","listclass")
return(cosa)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.