#----------------------------------
# load variable selection functions
#----------------------------------
#--- set random number seed
setSeed <- function(SEED){
if( is.null(SEED) ){
SEED = as.integer(Sys.time())
set.seed(SEED)
} else {
set.seed(SEED)
}
return(SEED)
}
Min <- function(x){ min(x,na.rm=T) }
Max <- function(x){ max(x,na.rm=T) }
Extract <- function(x, ii){ x[ii] }
dealWithNA <- function(MAT,NAval=-1){
MAT[which(is.na(MAT),arr.ind=TRUE)]=NAval
return(MAT)
}
rough.fixNA <- function(MAT){
## fill NA with it's column median
## Ref: https://cran.r-project.org/web/packages/RRF/RRF.pdf
indx = which(is.na(MAT),arr.ind=TRUE)
Cols = unique(indx[,2])
for( i in 1:length(Cols) ){
val = MAT[,Cols[i]]
med = median(val,na.rm=T)
MAT[,Cols[i]] = ifelse(is.na(val),med,val)
}
return(MAT)
}
fscore <- function(X){
classes <- levels(as.factor(X[,1]))
pred <- levels(as.factor(X[,2]))
Nclasses <- length(classes)
Npred <- length(pred)
## binary confusion matrix
## asd con
## asd TP FN
## con FP TN
##------------------------
conmat <- matrix(NA,nrow=Nclasses,ncol=Npred)
rownames(conmat) <- classes
colnames(conmat) <- pred
for( i in 1:Nclasses ){
for( j in 1:Npred ){
conmat[i,j] = sum(X[,1] == classes[i] & X[,2] == pred[j])
}
}
## precision = TP / (TP+FP)
prec = rep(NA,(Nclasses+1))
names(prec) = c(classes,"total")
## recall = TP / (TP+FN)
recall = rep(NA,Nclasses+1)
names(recall) = c(classes,"total")
TP = diag(conmat)
prec[1] = TP[1]/sum(conmat[,1]);
prec[2] = TP[2]/sum(conmat[2,]);
prec[3] = sum(TP)/sum(conmat);
recall[1] = TP[1]/sum(conmat[1,]);
recall[2] = TP[2]/sum(conmat[,2]);
recall[3] = sum(TP)/sum(conmat);
Fscore = (2*prec*recall)/(prec+recall);
names(Fscore) = c(classes,"total")
return(list(conmat=conmat,prec=prec,recall=recall,Fscore=Fscore))
}
logBase <- function(x, BASE=0){
x = as.numeric(x)
if( BASE == 2 ){
return(log2(x))
} else if ( BASE == 10 ){
return(log10(x))
}
return(log(x))
}
lgrad <- function(n,k,p){
## Ref: https://mc-stan.org/docs/2_21/functions-reference/binomial-distribution.html
return( k/(p+SMALL) - (n-k)/(1-p+SMALL) );
}
lbin <- function(n,k){
## Ref: https://mc-stan.org/docs/2_21/functions-reference/binomial-distribution.html
return(lgamma(n+1) - lgamma(k+1) - lgamma(n-k+1))
}
lbinomial <- function(n,k,p){
## Ref: https://mc-stan.org/docs/2_21/functions-reference/binomial-distribution.html
return( lbin(n,k) + k * log(p+SMALL) + (n-k) * log(1-p+SMALL) );
}
milog <- function(pxyz, pxz, pyz, pz, BASE=0){
#SMALL <- 1.0e-100
if( pxyz == 0 ){ return(0) }
if( pxz == 0 ){ pxz = pxz + SMALL }
if( pyz == 0 ){ pyz = pyz + SMALL }
if( pz == 0 ){ pz = pz + SMALL }
return(pxyz*logBase( (pxyz*pz)/(pxz*pyz), BASE ))
}
entropy <- function(x, BASE=0){
x <- as.numeric(x)
if( is.na(x) ){ return(NA) }
else if( x == 0 ){ return(0) }
else { return(-x*logBase(x,BASE)) }
}
MutualInformation <- function(X,Y,BASE=0){
RES <- list()
Hx = 0
Hy = 0
Hxy = 0
MIxy = 0
NMIxy = 0
X = as.numeric(X)
Y = as.numeric(Y)
indx = !is.na(X) & !is.na(Y)
x = X[indx]
y = Y[indx]
if( length(x) > 0 && length(y) > 0 ){
ct2 = table(x,y)
N = sum(ct2)
ix = nrow(ct2)
jy = ncol(ct2)
# H(X)
for( i in 1:ix ){
nx = sum(ct2[i,])
px = nx/N
Hx = Hx + entropy(px,BASE)
}
# H(Y) and H(X|Y)
for( j in 1:jy ){
ny = sum(ct2[,j])
py = ny/N
Hy = Hy + entropy(py,BASE)
sum = 0
for( i in 1:ix ){
nxy = ct2[i,j]
pxy = nxy/ny
sum = sum + entropy(pxy,BASE)
}
Hxy = Hxy + py*sum
}
MIxy = Hx - Hxy
NMIxy = 2*MIxy/(Hy+Hx)
}
RES$Ha = Hx
RES$Hb = Hy
RES$Hab = Hxy
RES$MI = MIxy
RES$NMI = NMIxy
return(RES)
}
conditionalMI <- function(X,Y,Z,BASE=0){
RES <- list()
MIxyCz = 0
MIxzCy = 0
MIxyz = 0
IIxyz = 0
SRxyz = 0
X = as.numeric(X)
Y = as.numeric(Y)
Z = as.numeric(Z)
indx = !is.na(X) & !is.na(Y) & !is.na(Z)
x = X[indx]
y = Y[indx]
z = Z[indx]
if( length(x) > 0 && length(y) > 0 && length(z) > 0 ){
MIxy = MutualInformation(X,Y,BASE)
MIxz = MutualInformation(X,Z,BASE)
MIyz = MutualInformation(Y,Z,BASE)
ctxyz = table(x,y,z)
N = sum(ctxyz)
II = dim(ctxyz)
#-----------------------------------------
# Interaction Information: II(X;Y;Z) = I(X;Y) - I(X;Y|Z)
# II(X;Y;Z) = I(X;Y) - I(X;Y|Z)
#-----------------------------------------
# Conditional MI: I(X;Y|Z) (is symmetric)
# I(X;Y|Z) = H(X;Z) + H(Y;Z) - H(X,Y,Z) - H(Z)
# = H(X|Z) + H(Y|Z) - H(X,Y,Z) + H(Z)
# = Sum_{x,y,z} p(x,y,z) * log( (p(x,y,z)*p(z))/(p(x,z)*p(y,z)) )
#-----------------------------------------
# Joint Mutual Information: I(X,Y;Z)
# I(X,Y;Z) = I(X;Z) + I(Y;Z) + I(X;Y|Z) - I(X;Y)
#-----------------------------------------
# Refs: https://math.stackexchange.com/questions/168316/calculating-conditional-entropy-given-two-random-variables
# : https://en.wikipedia.org/wiki/Mutual_information
# : https://en.wikipedia.org/wiki/Interaction_information
# : https://www.mdpi.com/1099-4300/21/9/869/htm
# : https://arxiv.org/pdf/2001.06089.pdf
#-----------------------------------------
MIxyCz = 0
Hxyz = 0
Hyz = 0
Hxz = 0
Hz = 0
for( zi in 1:II[3] ){
nz = sum(ctxyz[,,zi])
pz = nz/N
Hz = Hz + entropy(pz,BASE)
for( yi in 1:II[2] ){
nyz = sum(ctxyz[,yi,zi])
pyz = nyz/N
Hyz = Hyz + entropy(pyz,BASE)
for( xi in 1:II[1] ){
nxyz = ctxyz[xi,yi,zi]
pxyz = nxyz/N
Hxyz = Hxyz + entropy(pxyz,BASE)
}
}
for( xi in 1:II[1] ){
nxz = sum(ctxyz[xi,,zi])
pxz = nxz/N
Hxz = Hxz + entropy(pxz,BASE)
}
}
#Hy=0
#Hxy=0
#for( yi in 1:II[2] ){
# ny = sum(ctxyz[,yi,])
# py = ny/N
# Hy = Hy + entropy(py,BASE)
# for( xi in 1:II[1] ){
# nxy = sum(ctxyz[xi,yi,])
# pxy = nxy/N
# Hxy = Hxy + entropy(pxy,BASE)
# }
#}
# Conditional MI
#MIxyCz = MIxz$Hab + MIyz$Hab - Hxyz + MIxz$Hb#Hz
MIxyCz = Hxz + Hyz - Hxyz - Hz
# Correct for any rounding error
if( MIxyCz < 0 ){ MIxyCz = 0 }
# Joint MI
MIxyz = MIxz$MI + MIyz$MI + MIxyCz - MIxy$MI
##MIxyz2 = Hxy - Hxyz + Hz
# Interaction Information
IIxyz = MIxy$MI - MIxyCz
# SR = I(X,Y;Z)/H(X,Y,Z)
SRxyz = MIxyz/Hxyz
# Normalised Conditonal Mutual Information
# I(X,Y|Z)/H(Y|Z)
NMI = MIxyCz/Hyz
}
RES$MIabCc <- MIxyCz
RES$MIacb <- MIxyz
#RES$MIacb2 <- MIxyz2
RES$IIabc <- IIxyz
RES$Habc <- Hxyz
RES$SRabc <- SRxyz
RES$NMI <- NMI
RES$MIxy <- MIxy
RES$MIxz <- MIxz
RES$MIyz <- MIyz
#RES$HxCz <- MIxz$Hab
#RES$HyCz <- MIyz$Hab
#RES$Hz1 <- MIxz$Hb
#RES$Hz2 <- MIyz$Hb
#RES$Hx <- MIxz$Ha
#RES$Hy <- MIyz$Ha
#RES$Hz <- Hz
#RES$Hxz <- Hxz
#RES$Hyz <- Hyz
return(RES)
}
## feature selection
Jcmi <- function( S, IXkY, IXjXk, IXjXkCY, beta=1, gamma=1 ){
Nf <- length(S[,1])
Sin <- seq_along(1:Nf)[as.logical(S[,2])]
Sout <- seq_along(1:Nf)[as.logical(S[,3])]
Jcmi <- rep(NA,Nf)
if( length(Sin) > 0 && length(Sout) > 0 ){
for( k in 1:length(Sout) ){
val1 = as.numeric(IXkY[Sout[k]])
val1 = ifelse( is.na(val1), 0, val1)
term1 = val1#as.numeric(IXkY[Sout[k]])
term2 = 0
term3 = 0
for( j in 1:length(Sin) ){
val2 = IXjXk [Sin[j],Sout[k]]
val2 = ifelse( is.na(val2), 0, val2)
val3 = IXjXkCY[Sin[j],Sout[k]]
val3 = ifelse( is.na(val3), 0, val3)
term2 = term2 + val2#IXjXk [Sin[j],Sout[k]]
term3 = term3 + val3#IXjXkCY[Sin[j],Sout[k]]
}
Jcmi[Sout[k]] = term1 - (beta * term2) + (gamma * term3)
}
}
return(Jcmi)
}
## feature selection
Fcbf <- function( S, IXkY, IXjXk, beta=1 ){
Nf <- length(S[,1])
Sin <- seq_along(1:Nf)[as.logical(S[,2])]
Sout <- seq_along(1:Nf)[as.logical(S[,3])]
fcbf <- rep(NA,Nf)
if( length(Sin) > 0 && length(Sout) > 0 ){
for( k in 1:length(Sout) ){
val1 = as.numeric(IXkY[Sout[k]])
val1 = ifelse( is.na(val1), 0, val1)
term1 = val1#as.numeric(IXkY[Sout[k]])
term2 = 0
for( j in 1:length(Sin) ){
val2 = IXjXk [Sin[j],Sout[k]]
val2 = ifelse( is.na(val2), 0, val2)
term2 = term2 + val2
}
fcbf[Sout[k]] = term1 - (beta * term2)
}
}
return(fcbf)
}
jointEntropy <- function(x,y){
x <- as.numeric(x)
y <- as.numeric(y)
indx <- !is.na(x) & !is.na(y)
x <- x[indx]
y <- y[indx]
N <- length(x)
sum <- 0
for( i in 1:N ){
for( j in 1:N ){
if( i <= j ){
x.y <- x[i]*y[j]
if( !is.na(x.y) ){
if( i == j ){
sum <- sum + entropy(x.y)
} else {
sum <- sum + 2*entropy(x.y)
}
}
}
}
}
return(sum)
}
printPer6Months <- function(VATTS, DES, TARGET,PRINT=TRUE){
N = length(VATTS[,1])
CN = colnames(VATTS)
Indx = match(TARGET,CN)
val = as.vector(VATTS[,Indx[1]])
val = as.numeric(ifelse(val=="unknown",-1,val))
perMonths = c("unknown",6,12,18,24,30,36,42,48,52)
months = matrix(NA,ncol=length(perMonths),nrow=1)
colnames(months) = perMonths
x = rep("",N)
for( i in 1:length(perMonths) ){
Mrange = colnames(months)[i]
tally = 0
switch(Mrange,
"unknown" = { indx = val < 0; },
"6" ={ indx = val > 0 & val <= 6; },
"12"={ indx = val > 6 & val <= 12; },
"18"={ indx = val > 12 & val <= 18; },
"24"={ indx = val > 18 & val <= 24; },
"30"={ indx = val > 24 & val <= 30; },
"36"={ indx = val > 30 & val <= 36; },
"42"={ indx = val > 36 & val <= 42; },
"48"={ indx = val > 42 & val <= 48; },
"52"={ indx = val > 52; }
)
x[indx] = Mrange; tally = sum(indx);
months[1,i] = tally;
}
if( PRINT ){
cat("---------------\n")
cat("", DES, " variable: ", colnames(VATTS)[Indx[1]], "\n" );
print(months)
months = (months/N)*100
print(months)
#print((table(months)/N)*100)
cat("---------------\n")
}
return(x);
}
printVattVariables <- function(VATTS, DES, TARGETS){
N = length(VATTS[,1])
CN = colnames(VATTS)
indx = match(TARGETS,CN)
if( length(indx) != 0 ){
for( i in 1:length(indx) ){
cat("", DES, " variable: ", colnames(VATTS)[indx[i]], "\n" );
print(table(VATTS[,indx[i]]))
print((table(VATTS[,indx[i]])/N)*100)
cat("----- DONE -----\n")
}
}
}
buildVattsTable <- function(GG, VATTS){
NR = length(V(GG))
NC = length(VATTS)
oo=matrix(NA,NR,NC)
colnames(oo) = VATTS
for( i in 1:NC ){
x = igraph::get.vertex.attribute(GG,VATTS[i])
if( !is.null(x) ){
oo[,i] = x
}
}
return(oo)
}
addPrior <- function(x, alpha){
x[x==0] = alpha
x = x/sum(x)
}
shuffle <- function(MAT,DIAG=FALSE,P=1){
## randomly shuffle P% elements in upper and lower diagonal MAT
X = NULL
if( !is.null(MAT) ){
NR = dim(MAT)[1]
NC = dim(MAT)[2]
if( P==0 ){ return(MAT); }
if( NR == NC && NR > 1 ){
if( P == 1 ){ #shuffle all
indx = upper.tri(MAT,diag=DIAG)
MAT[indx] = sample(MAT[indx])
indx = lower.tri(MAT,diag=DIAG)
MAT[indx] = sample(MAT[indx])
X = MAT
colnames(X) = colnames(MAT)
rownames(X) = rownames(MAT)
} else {#shuffle fraction (P) of elements
indx = upper.tri(MAT,diag=DIAG)
ele = cbind(which(indx,arr.ind=T),MAT[indx])
Nele = dim(ele)[1]
Subele = ele[sample(Nele,floor(Nele*P),replace=F),]
Subele[,3] = Subele[sample(dim(Subele)[1]),3]
MAT[Subele[,1:2]] = Subele[,3]
indx = lower.tri(MAT,diag=DIAG)
ele = cbind(which(indx,arr.ind=T),MAT[indx])
Nele = dim(ele)[1]
Subele = ele[sample(Nele,floor(Nele*P),replace=F),]
Subele[,3] = Subele[sample(dim(Subele)[1]),3]
MAT[Subele[,1:2]] = Subele[,3]
X = MAT
colnames(X) = colnames(MAT)
rownames(X) = rownames(MAT)
}
}
}
return(X)
}
shuffle2 <- function(MAT,P=1){
## randomly shuffle P% elements in MAT
X = NULL
if( !is.null(MAT) ){
NR = dim(MAT)[1]
NC = dim(MAT)[2]
if( P == 0 ){ return(MAT); }
if( P == 1 ){
if( NR == 1 ){
X = MAT[1,sample.int(NC)]
names(X) = colnames(MAT)
} else {
X = MAT[sample.int(NR),sample.int(NC)]
colnames(X) = colnames(MAT)
rownames(X) = rownames(MAT)
}
} else {
if( NR == 1 ){
X = matrix(NA,nrow=NR,ncol=NC)
Indx = sample.int(floor(P*NC))
RIndx = sample(Indx,replace=F)
X[1,RIndx] = MAT[1,Indx]
X[1,] = ifelse(is.na(X[1,]),MAT[1,],X[1,])
#names(X) = colnames(MAT)
colnames(X) = colnames(MAT)
rownames(X) = rownames(MAT)
} else {
X = matrix(NA,nrow=NR,ncol=NC)
Indx = seq(1,(NR*NC),1)
Indx = sample(Indx, floor(P*NR*NC),replace=F)
Rndx = sample(Indx)
X[Rndx] = MAT[Indx]
indx = which(is.na(X),arr.ind=T)
X[indx] = MAT[indx]
colnames(X) = colnames(MAT)
rownames(X) = rownames(MAT)
}
}#ifelse
}##null
return(X)
}
shuffle3 <- function(MAT,DIAG=FALSE,P=1){
## randomly select P% elements in upper and lower diagonal MAT and then set values randonly between [0,1]
X = NULL
if( !is.null(MAT) ){
NR = dim(MAT)[1]
NC = dim(MAT)[2]
if( P == 0 ){ return(MAT); }
if( NR == NC && NR > 1 ){
if( P == 1 ){
indx = upper.tri(MAT,diag=DIAG)
MAT[indx] = runif(length(indx),0,1)
indx = lower.tri(MAT,diag=DIAG)
MAT[indx] = runif(length(indx),0,1)
X = MAT
colnames(X) = colnames(MAT)
rownames(X) = rownames(MAT)
} else {
indx = upper.tri(MAT,diag=DIAG)
ele = cbind(which(indx,arr.ind=T),MAT[indx])
Nele = dim(ele)[1]
Subele = ele[sample(Nele,floor(Nele*P),replace=F),]
Subele[,3] = runif(dim(Subele)[1],0,1)
MAT[Subele[,1:2]] = Subele[,3]
indx = lower.tri(MAT,diag=DIAG)
ele = cbind(which(indx,arr.ind=T),MAT[indx])
Nele = dim(ele)[1]
Subele = ele[sample(Nele,floor(Nele*P),replace=F),]
Subele[,3] = runif(dim(Subele)[1],0,1)
MAT[Subele[,1:2]] = Subele[,3]
X = MAT
colnames(X) = colnames(MAT)
rownames(X) = rownames(MAT)
}
}
}
return(X)
}
shuffle4 <- function(MAT,P=1){
## randomly select P% elements in MAT and then set values randonly between [0,1]
X = NULL
if( !is.null(MAT) ){
NR = dim(MAT)[1]
NC = dim(MAT)[2]
if( P == 0 ){ return(MAT); }
if( P == 1 ){
if( NR == 1 ){
X = matrix(runif(NC,0,1),nrow=NR,ncol=NC)
names(X) = colnames(MAT)
} else {
X = matrix(runif(NR*NC,0,1),nrow=NR,ncol=NC)
colnames(X) = colnames(MAT)
rownames(X) = rownames(MAT)
}
} else {
if( NR == 1 ){
X = matrix(NA,nrow=NR,ncol=NC)
indx = seq(1,NC,1)
indx = sample(indx,floor(P*NC),replace=F)
X[1,indx] = runif(floor(P*NC),0,1)
X[1,] = ifelse(is.na(X[1,]),MAT[1,],X[1,])
#names(X) = colnames(MAT)
colnames(X) = colnames(MAT)
rownames(X) = rownames(MAT)
} else {
X = matrix(NA,nrow=NR,ncol=NC)
indx = seq(1,(NR*NC),1)
indx = sample(indx,floor(P*NR*NC),replace=F)
X[indx] = runif(floor(P*NR*NC),0,1)
indx = which(is.na(X),arr.ind=T)
X[indx] = MAT[indx]
colnames(X) = colnames(MAT)
rownames(X) = rownames(MAT)
}
}#ifelse
}#null
return(X)
}
zScore <- function(x, useLOG=FALSE){
x = as.numeric(x)
if( useLOG ){
x[x<=0]=NA
x = log(x)
}
x = (x-mean(x,na.rm=T))/sd(x,na.rm=T)
return(x)
}
## KL divergence KL(P || Q)
kl_divergence <- function(p,q){
p = as.numeric(p)
q = as.numeric(q)
if( is.na(p) || is.na(q) ){return(NA)}
else if(p==0 || q==0){ return(NA) }
else { return(p*log(p/q)) }
}
## Jensen-Shannon divergence JS(P || Q)
js_divergence <- function(p,q){
p = as.numeric(p)
q = as.numeric(q)
m = 0.5*(p+q)
return( (0.5*(kl_divergence(p,m)) + 0.5*(kl_divergence(q,m))) )
}
rankDistance <- function(X,Y){
X = as.numeric(X)
Y = as.numeric(Y)
if( is.na(X) && is.na(Y) ){ return(NA); }
if( is.na(X) ){ X=1e100; }
if( is.na(Y) ){ Y=1e100; }
return( sqrt( (X-Y)^2 ) );
}
Stabplot <- function(DF,YLAB="",TIT="",DIR="",subDIR="",shortTIT="",groups=FALSE){
DF = as.data.frame(DF)
Xmin = min(as.numeric(as.vector(DF$errRate)))
Xmax = max(as.numeric(as.vector(DF$errRate)))
if( groups ){
colours <- c('firebrick2','royalblue2')
shapes <- c(16,17)
gplot = ggplot(DF,aes(x=as.numeric(as.vector(errRate)),y=as.numeric(as.vector(stab)),colour=as.vector(group)))+
geom_point(aes(colour=as.vector(group), shape=as.vector(group)))+
geom_errorbar(aes(ymin=as.numeric(as.vector(CI_lower)), ymax=as.numeric(as.vector(CI_upper))),show.legend=FALSE)+
theme(legend.key=element_blank())+
labs(x="error Rate",y=YLAB,title=TIT)+
scale_x_continuous(breaks = seq(Xmin, Xmax, by = 0.1))+
theme(
title=element_text(face="bold",size=rel(1.5)),
axis.title.x=element_text(face="bold",size=rel(1.0)),
axis.title.y=element_text(face="bold",size=rel(1.0)),
axis.text.x = element_text(angle = 40, hjust = 1, face="bold", size=rel(1.0)),
legend.title=element_text(face="bold",size=rel(1.0)),
legend.text=element_text(face="bold",size=rel(1.0)),
legend.key=element_blank())+
theme(panel.grid.major = element_line(colour = "grey40",size=0.2),
panel.grid.minor = element_line(colour="grey40",size=0.1),
panel.background = element_rect(fill = "white"),
panel.border = element_rect(linetype="solid",fill=NA))+
guides(colour = guide_legend(override.aes = list(shape = shapes, size=rel(7))),
size = FALSE,
shape = FALSE)+
scale_color_manual("group",breaks=levels(factor(DF$group)),values=c(colours))
} else {
gplot = ggplot(DF,aes(x=as.numeric(as.vector(errRate)),y=as.numeric(as.vector(stab))))+
geom_point()+geom_errorbar(aes(ymin=as.numeric(as.vector(CI_lower)), ymax=as.numeric(as.vector(CI_upper))))+
labs(x="error Rate",y=YLAB,title=TIT)+
scale_x_continuous(breaks = seq(Xmin, Xmax, by = 0.1))+
theme(
title=element_text(face="bold",size=rel(1.5)),
axis.title.x=element_text(face="bold",size=rel(1.0)),
axis.title.y=element_text(face="bold",size=rel(1.0)),
axis.text.x = element_text(angle = 40, hjust = 1, face="bold", size=rel(1.0)),
legend.title=element_text(face="bold",size=rel(1.0)),
legend.text=element_text(face="bold",size=rel(1.0)),
legend.key=element_blank())+
theme(panel.grid.major = element_line(colour = "grey40",size=0.2),
panel.grid.minor = element_line(colour="grey40",size=0.1),
panel.background = element_rect(fill = "white"),
panel.border = element_rect(linetype="solid",fill=NA))
}
png(sprintf("%s/%s/%s.png",DIR,subDIR,shortTIT),width=WIDTH,height=HEIGHT,units="px");
print(gplot)
dev.off()
return(gplot)
}
Stabdf <- function( errRate, DF, group="" ){
N = length(errRate)
OUT = matrix(NA,ncol=5,nrow=N)
colnames(OUT) = c("errRate","stab","CI_lower","CI_upper","group")
k=1
for( e in 1:N ){
if( errRate[e] == 0 ){
OUT[k,1] = errRate[e]
OUT[k,2] = 1
OUT[k,3] = 1
OUT[k,4] = 1
OUT[k,5] = group
} else {
OUT[k,1] = errRate[e]
OUT[k,2] = as.numeric(DF[[e]]$stab)
OUT[k,3] = as.numeric(DF[[e]]$CIlower)
OUT[k,4] = as.numeric(DF[[e]]$CIupper)
OUT[k,5] = group
}
k=k+1
}
return(OUT)
}
avgFeatureProb <- function(DF){
R = dim(DF)[1];
N = dim(DF)[2];
Pc = 0;
tmp = rep(0,R);
for( i in 1:R ){ tmp[i] = sum(DF[i,2:N]==1)/(N-1); }
Pc = sum(tmp)/R;
return(Pc);
}
featureFreq <- function(DF){
R = dim(DF)[1];
N = dim(DF)[2];
freq = rep(0,R);
for( i in 1:R ){ freq[i] = sum(DF[i,2:N]==1); }
return(freq);
}
stabilityCI <- function(DF,Pi,Kbar,stab,alpha=0.05){
## Ref: https://www.jmlr.org/papers/volume18/17-514/17-514.pdf
F = dim(DF)[1]; #number of features
N = dim(DF)[2];
M = (N-1); #number of random samples
Kf = Kbar/F;
Norm = 1/( Kf*(1-Kf) );
Thetai = rep(NA,M);
for( i in 1:(M-1) ){
ii = (i+1);
ki = sum(DF[,ii],na.rm=T);
term1 = 0;
for( f in 1:F ){ term1 = term1 + DF[f,ii] * Pi[f]; }
term1 = (1/F) * term1;
term2 = (ki*Kf)/F;
term3 = (stab/2) * ( (2*Kf*ki)/F - ki/F - Kf + 1 );
Thetai[ii] = Norm * ( term1 - term2 + term3 );
}
## the average value of Thetai
Thetai_avg = 1/M * sum(Thetai,na.rm=T);
## estimate of the variance of stab
Thetai_var = 4/M^2 * sum( (Thetai - Thetai_avg)^2, na.rm=T);
## the inverse cumulative of a standard normal distribution at 1−alpha/2
## the inverse cumulative of a standande normal at x is calculated using qnorm(x)
Zstar = qnorm( (1-alpha/2) );
upper = stab + Zstar * sqrt( Thetai_var );
lower = stab - Zstar * sqrt( Thetai_var );
return(list(var=Thetai_var,upper=upper,lower=lower))
}
featureStability <- function(DF, alpha=0.05){
## Ref: https://www.jmlr.org/papers/volume18/17-514/17-514.pdf
F = dim(DF)[1]; #number of features
N = dim(DF)[2];
M = (N-1); #number of random samples
Fn = as.character(DF[,1]); #feature names
Fi = featureFreq(DF); #frequency that feature i is selected
Pi = Fi/M; #probability that feature i is selected
Kbar= sum(Fi)/M; #average number of features selected
Kf = Kbar/F; #probability of a feature begin selected
Efi = Kf * M; #expected number times a feature is selected
Sf = (M/(M-1)) * Pi * (1-Pi); #variance of feature i being selected
##negative log Binomial probability for selecting each feature
lBin= rep(NA,F);
lGad= rep(NA,F);
for( i in 1:F ){
if( Pi[i] > 0 ){
lBin[i] = -lbinomial(M,Fi[i],Pi[i]);
lGad[i] = lgrad(M,Fi[i],Pi[i]);
}
}
## sample feature selection stability measure
stab= ( (1/F) * sum( Sf, na.rm=T ) );
stab= stab / ( Kf*(1-Kf) );
stab= 1 - stab;
## confidence interval on sample feature statbility
ci = stabilityCI(DF=DF,Pi=Pi,Kbar=Kbar,stab=stab,alpha=alpha);
## expect population feature selection stability
Pbar = sum(Pi)/F;
Estab = (1/F) * sum( Pi * (1-Pi) );
Estab = Estab / ( Pbar*(1-Pbar) );
Estab = 1 - Estab;
##feature robustness measure
Sigma = sqrt( Kf*(1-Kf)*M );
Zscore = rep(NA,F);
if( Sigma != 0 ){ Zscore = (Fi - Efi)/Sigma; }
CN = c("feature","Fi","Pi","Sf","nlBin","lgrad","robst")
OUT = matrix(NA,ncol=length(CN),nrow=F);
colnames(OUT) = CN
OUT = as.data.frame(OUT)
OUT[,1] = as.character(Fn)
OUT[,2] = as.numeric(Fi)
OUT[,3] = as.numeric(Pi)
OUT[,4] = as.numeric(Sf)
OUT[,5] = as.numeric(lBin)
OUT[,6] = as.numeric(lGad)
OUT[,7] = as.numeric(Zscore)
return(list(stab=stab,Estab=Estab, Vstab=ci$var, CIupper=ci$upper, CIlower=ci$lower,
alpha=alpha, OUT=OUT));
}
#fullSetRobustness <- function(){}
featureRobustness <- function(DF){
# Ref: http://memetic-computing.org/publication/journal/MBGA.pdf
F = dim(DF)[1]; #number of features
N = dim(DF)[2];
M = (N-1); #number of random samples
Fi = as.character(DF[,1]) #feature names
Si = featureFreq(DF) #frequency that feature i is selected
Sc = sum( (Si/M) ); #average number of features selected
Psi = Sc/F; #probability of a feature begin selected
Esi = Psi * M; #expected number of times a feature is selected
Sigma = sqrt( Psi*(1-Psi)*M ); #the standard deviation
CN = c("feature","robustness")
Zscore = matrix(NA,ncol=length(CN),nrow=F);
colnames(Zscore) = CN
Zscore[,1] = as.character(Fi)
if( Sigma != 0 ){ Zscore[,2] = (Si - Esi) / Sigma; }
return(Zscore);
}
linearRankSel <- function(X,decreasing=FALSE, negative=FALSE){
# Ref: https://stackoverflow.com/questions/20290831/how-to-perform-rank-based-selection-in-a-genetic-algorithm
F = dim(X)[1]
if( !negative ){
Fsub = X[as.numeric(X[,2]) > 0,]
} else {
Fsub = X
}
indx = order(as.numeric(Fsub[,2]),decreasing=!decreasing)
Pndx = order(as.numeric(Fsub[,2]),decreasing=decreasing)
R = dim(Fsub)[1]
Rank = seq(1,R,1)
SumR = (R+1)*(R/2)
Pf = Rank/SumR
CN = c(colnames(X),"rank","prob")
out = matrix(NA,ncol=length(CN),nrow=F)
colnames(out) = CN
out[,1] = as.character(X[,1])
out[,2] = as.character(X[,2])
out[match(Fsub[indx,1],out[,1]),3] = Rank
out[match(Fsub[Pndx,1],out[,1]),4] = Pf
return(out)
}
buildPhenotypeProbMatrix <- function(VAR, ii, CT){
VALS = names(table(VAR))
xx = CT[match(VAR,VALS),ii]
return(xx/sum(xx,na.rm=T))
}
buildCommunitySizeTable <- function(VATTS, ID="SPARKID", ALG=NULL ){
RES = NULL
if( !is.null(ALG) ){
IDindx = which(colnames(VATTS)==ID)
Ccindx = which(colnames(VATTS)==ALG)
if( length(IDindx) !=0 && length(Ccindx) != 0 ){
Cmax = max(VATTS[,Ccindx])
C = seq_along(1:Cmax)
RES = matrix(0,Cmax,2)
RES[,1]= C
for( c in 1:Cmax ){
RES[c,2] = sum(VATTS[,Ccindx]==c)
}
}
}
return(RES)
}
buildPhenotypeMatrix <- function( VATTS, FIN, ID="SPARKID" ){
RES = NULL
FN = length(names(FIN))
if( !is.null(ID) ){
IDindx = which(colnames(VATTS)==ID)
if( length(IDindx) !=0 ){
IDS <- as.vector(VATTS[,IDindx])
RN <- length(IDS)
CN <- c()
for( f in 1:FN ){
N = length(colnames(FIN[[f]]))
ii = seq(3,N,1)
CN <- c(CN,sprintf("%s_%s",names(FIN)[f],colnames(FIN[[f]])[ii]))
}
RES <- matrix(NA,RN,length(CN))
colnames(RES) <- CN
for( f in 1:FN ){
N = length(colnames(FIN[[f]]))
ii = seq(3,N,1)
CN = sprintf("%s_%s",names(FIN)[f],colnames(FIN[[f]])[ii])
id = as.character(unlist(FIN[[f]][,.SD,.SDcol=1]))
Rindx = match(IDS,id)
for( i in 1:length(ii) ){
val = as.vector(unlist(FIN[[f]][Rindx,.SD,.SDcol=ii[i]]))
Cindx = match(CN[i],colnames(RES))
RES[,Cindx] = val
}
}
}
}
return(RES)
}
buildPhenotypeCorrelationMatrix <- function( MAT ){
COR = NULL
CORz = NULL
if( !is.null(MAT) ){
CN = colnames(MAT)
N = length(CN)
COR = matrix(NA,N,N)
CORz = matrix(NA,N,N)
colnames(COR) = CN
rownames(COR) = CN
colnames(CORz) = CN
rownames(CORz) = CN
for( i in 1:N ){
for( j in 1:N ){
if( i <= j ){
x = MAT[,i]
y = MAT[,j]
indx = !is.na(x) & !is.na(y)
val = cor(x[indx],y[indx])
COR[i,j] = val
COR[j,i] = val
xz = x[indx]
yz = y[indx]
xz = (xz - mean(xz))/sd(xz)
yz = (yz - mean(yz))/sd(yz)
val = cor(xz,yz)
CORz[i,j] = val
CORz[j,i] = val
}
}
}
}
return(list(COR=COR,CORz=CORz))
}
buildPhenotypeNMImatrix <- function( MAT ){
NMI = NULL
if( !is.null(MAT) ){
CN = colnames(MAT)
N = length(CN)
NMI = matrix(NA,N,N)
colnames(NMI) = CN
rownames(NMI) = CN
for( i in 1:N ){
for( j in 1:N ){
if( i <= j ){
res = MutualInformation(MAT[,i],MAT[,j])
NMI[i,j] = res$NMI
NMI[j,i] = res$NMI
}
}
}
}
return(list(NMI=NMI))
}
extractSubMatrix <- function( MAT, TARGETS ){
RES <- NULL
if( !is.null(MAT) && !is.null(TARGETS) ){
indx = match(TARGETS, colnames(MAT))
if( length(indx) != 0 ){
RES = MAT[indx,indx]
#colnames(RES) = colnames(MAT)
#rownames(RES) = colnames(MAT)
}
}
return(RES)
}
saveNMImatrix <- function( MAT, DIR, subDIR, TITLE="" ){
cat("Save ", TITLE,"...")
saveRDS(MAT, sprintf("%s/%s/%s.RDS",DIR,subDIR,TITLE),compress=TRUE)
cat("done.\n")
}
saveMImatrices <- function( MAT, GROUPS, DIR, subDIR, TITLE="" ){
cat("building Conditional MI ", TITLE,"...")
## build I(X,Y|Z) matrix for Z = GROUPS
res = buildPhenotypeIImatrix(MAT,GROUPS)
cat("done! ")
saveRDS(res, sprintf("%s/%s/%s.RDS",DIR,subDIR,TITLE),compress=TRUE)
cat("Saved ", TITLE, ".\n")
}
CSFplot <- function(DF,YLAB="",TIT="",DIR="",subDIR="",shortTIT=""){
DF = as.data.frame(DF)
Xmin = min(as.numeric(as.vector(DF$errRate)))
Xmax = max(as.numeric(as.vector(DF$errRate)))
gplot = ggplot(DF,aes(x=as.numeric(as.vector(errRate)),y=as.numeric(as.vector(median))))+
geom_point()+geom_errorbar(aes(ymin=as.numeric(as.vector(min)), ymax=as.numeric(as.vector(max))))+
labs(x="error Rate",y=YLAB,title=TIT)+
scale_x_continuous(breaks = seq(Xmin, Xmax, by = 0.1))+
theme(
title=element_text(face="bold",size=rel(1.5)),
axis.title.x=element_text(face="bold",size=rel(1.0)),
axis.title.y=element_text(face="bold",size=rel(1.0)),
axis.text.x = element_text(angle = 40, hjust = 1, face="bold", size=rel(1.0)),
legend.title=element_text(face="bold",size=rel(1.0)),
legend.text=element_text(face="bold",size=rel(1.0)),
legend.key=element_blank())+
theme(panel.grid.major = element_line(colour = "grey40",size=0.2),
panel.grid.minor = element_line(colour="grey40",size=0.1),
panel.background = element_rect(fill = "white"),
panel.border = element_rect(linetype="solid",fill=NA))
png(sprintf("%s/%s/%s.png",DIR,subDIR,shortTIT),width=WIDTH,height=HEIGHT,units="px");
print(gplot)
dev.off()
return(gplot)
}
CFSdf <- function( errRate, DF, INDX=1 ){
N = length(errRate)
OUT = matrix(NA,ncol=5,nrow=N)
colnames(OUT) = c("errRate","median","min","max","max.ind")
k=1
for( e in 1:N ){
if( errRate[e] == 0 ){
OUT[k,1] = errRate[e]
OUT[k,2] = DF[[e]][INDX]
OUT[k,3] = DF[[e]][INDX]
OUT[k,4] = DF[[e]][INDX]
OUT[k,5] = 2
} else {
OUT[k,1] = errRate[e]
OUT[k,2] = median(DF[[e]][,INDX])
OUT[k,3] = min(DF[[e]][,INDX])
OUT[k,4] = max(DF[[e]][,INDX])
OUT[k,5] = (which.max(DF[[e]][,INDX])+1)
}
k=k+1
}
return(OUT)
}
CFS <- function(X=NULL, fNMI=NULL, cNMI=NULL ){
## Correlation feature selection, using symmetric uncertainty (i.e. NMI).
## Ref: https://en.wikipedia.org/wiki/Feature_selection
## X ==> feature selection vector,
## X[i] == 1 if feature selected and X[i] == 0 if feature not selected
## fNMI ==> feature-feature correlation
## cNMI ==> feature-class correlation
CFS = NA
if( !is.null(X) && !is.null(fNMI) && !is.null(cNMI) ){
X[is.na(X)]=0
cNMI[1,is.na(cNMI[1,])]=0
fNMI[is.na(fNMI)]=0
diag(fNMI)=0
num = cNMI[1,] %*% X;
term1 = sum( X );
term2 = 2 * X %*% fNMI %*% X;
CFS = num/(term1 + term2);
}
return(CFS)
}
CCFS <- function(X=NULL, fNMI=NULL, cNMI=NULL, CNMI=NULL ){
## Conditional Correlation feature selection, using symmetric uncertainty (i.e. NMI).
## Ref: https://en.wikipedia.org/wiki/Feature_selection
## X ==> feature selection vector,
## X[i] == 1 if feature selected and X[i] == 0 if feature not selected
## fNMI ==> feature-feature correlation
## cNMI ==> feature-class correlation
## CNMI ==> feature-feature given class correlation
CCFS = NA
if( !is.null(X) && !is.null(fNMI) && !is.null(cNMI) && !is.null(CNMI) ){
X[is.na(X)]=0
cNMI[1,is.na(cNMI[1,])]=0
fNMI[is.na(fNMI)]=0
diag(fNMI)=0
CNMI[is.na(CNMI)]=0
diag(CNMI)=0
num = cNMI[1,] %*% X;
term1 = sum( X );
term2 = 2 * X %*% (fNMI-CNMI) %*% X;
CCFS = num/(term1 + term2);
}
return(CCFS)
}
getNextFi <- function( Fsel, Fi ){
indx = which(Fsel==Fi)
if( length(indx) == 0 ){ return(NULL); }
else { return(Fsel[indx+1]); }
}
addFi <- function( Fset, Fi ){
if( is.null(Fset) ){ return(Fi); }
indx = which(Fset==Fi)
if( length(indx) == 0 ){ return(c(Fi,Fset)); }
else { return(Fset); }
}
removeFi <- function( Fsel, Fi ){
indx = which(Fsel==Fi)
if( length(indx) == 0 ){ return(Fsel) }
else { return(Fsel[-indx]); }
}
FCCFrule <- function( Fi, Fj, fSU, cSU, pSU, CSU, NCLASSES ){
## Fast Class Correlation Filter rule (FCCF)
## Does the feature Fj form a Markov Blacket around feature Fi (in other words,
## is feature Fi subsume/absorb by feature Fj), if so
## feature Fi is redundat and we can remove it.
## return 1 => remove feature
## return 0 => don't remove feature
ii = which(colnames(pSU)==Fi)
jj = which(colnames(pSU)==Fj)
if( length(ii) == 0 || length(jj) == 0 ){ return(0); }
if( is.na(fSU[ii,jj]) || is.na(cSU[ii]) ){ return(0); }
perClass = pSU[,jj] >= pSU[,ii]
if( (sum(perClass,na.rm=T) == NCLASSES) && (fSU[ii,jj] >= cSU[ii]) ){
## feature Fi redundant and can be removed.
return(1);
}
## feature Fi not redundant so keep.
return(0);
}
FCBFrule <- function( Fi, Fj, fSU, cSU, pSU, CSU, NCLASSES ){
## Fast Correlation Based Filter rule (FCCF)
## Does the feature Fj form a Markov Blacket around feature Fi (in other words,
## is feature Fi subsume/absorb by feature Fj), if so
## feature Fi is redundat and we can remove it.
## return 1 => remove feature
## return 0 => don't remove feature
#cat("Fi: ", Fi, " Fj: ", Fj, "\n")
ii = which(colnames(pSU)==Fi)
jj = which(colnames(pSU)==Fj)
#cat("ii: ", ii, " jj: ", jj, "\n")
#cat("cSU[ii]: ", cSU[ii], " fSU: ", fSU[ii,jj], " cSU[jj]: ", cSU[jj], "\n")
if( length(ii) == 0 || length(jj) == 0 ){ return(0); }
if( is.na(cSU[ii]) || is.na(cSU[jj]) || is.na(fSU[ii,jj]) ){ return(0); }
if( (cSU[jj] >= cSU[ii]) && (fSU[ii,jj] >= cSU[ii]) ){
## feature Fi redundant and can be removed.
return(1);
}
## feature Fi not redundant so keep.
return(0);
}
Jcmirule <- function( Fi, Fj, fSU, cSU, pSU, CSU, NCLASSES ){
## Jcmi rule
## Does the feature Fj form a Markov Blacket around feature Fi (in other words,
## is feature Fi subsume/absorb by feature Fj), if so
## feature Fi is redundat and we can remove it.
## Jcmi(Fi) = NMI(Xi;Y) - NMI(Fi;Fj) + NMI(Fi;Fj|Y)
## NMI(Fi;Fj) - NMI(Fi;Fj|Y) >= NMI(Xi;Y)
## return 1 => remove feature
## return 0 => don't remove feature
ii = which(colnames(pSU)==Fi)
jj = which(colnames(pSU)==Fj)
if( length(ii) == 0 || length(jj) == 0 ){ return(0); }
## expected feature correlation condition on class
#condSU = 0
#for( i in 1:NCLASSES ){ condSU = condSU + CSU[[i]][ii,jj]; }
#condSU = condSU/NCLASSES;
condSU = CSU[ii,jj]
## -----
if( is.na(cSU[ii]) || is.na(condSU) || is.na(fSU[ii,jj]) ){ return(0); }
## let's keep this... matches condition in original FCBF rule
if( (cSU[jj] >= cSU[ii]) && (fSU[ii,jj] - condSU) >= cSU[ii] ){
## old condition, which is also fine.
##if( (fSU[ii,jj] - condSU) >= cSU[ii] ){
## feature Fi redundant and can be removed.
return(1);
}
## feature Fi not redundant so keep.
return(0);
}
CFCCFrule <- function( Fi, Fj, fSU, cSU, pSU, CSU, NCLASSES ){
## Conditional Fast Class Correlation Filter rule (CFCCF)
## Does the feature Fj form a Markov Blacket around feature Fi (in other words,
## is feature Fi subsume/absorb by feature Fj), if so
## feature Fi is redundat and we can remove it.
## return 1 => remove feature
## return 0 => don't remove feature
ii = which(colnames(pSU)==Fi)
jj = which(colnames(pSU)==Fj)
if( length(ii) == 0 || length(jj) == 0 ){ return(0); }
perClass = pSU[,jj] >= pSU[,ii]
#condSU = 0
#for( i in 1:NCLASSES ){ condSU = condSU + CSU[[i]][ii,jj]; }
#condSU = condSU/NCLASSES;
condSU = CSU[ii,jj]
if( (sum(perClass) == NCLASSES) && (fSU[ii,jj] >= (cSU[ii] + condSU)) ){
## feature Fi redundant and can be removed.
return(1);
}
## feature Fi not redundant so keep.
return(0);
}
##REFS:
## [1]: FEATURE SELECTION VIA JOINT LIKELIHOOD, 2012 (http://www.cs.man.ac.uk/~gbrown/publications/pocockPhDthesis.pdf).
## [2]: Efficient Feature Selection via Analysis ofRelevance and Redundancy. Lei Yu & Huan Liu, Journal of Machine Learning Research 5 (2004) 1205–1224 (https://jmlr.org/papers/volume5/yu04a/yu04a.pdf)
## [3]: Scalable Feature Selection for Multi-class Problems, Boris Chidlovskii and Loıc Lecerf, 2008 (https://link.springer.com/content/pdf/10.1007%2F978-3-540-87479-9_33.pdf)
FCBF <- function( fNMI, cNMI, pNMI, CNMI, delta=NULL, excSET=NULL, print=FALSE, rule="FCCF", runSel=FALSE ){
## Fast Correlation-Based Filter Algorithm (FCBF)
Fout <- NULL
## Fast Class Correlation Filter
## fNMI => feature-by-feature NMI
## cNMI => class-by-feature NMI
## pNMI => per class-by-feature NMI
## delta => irrelevance threshold
if( !is.null(fNMI) && !is.null(cNMI) && !is.null(pNMI) ){
#if( (rule == 3 || rule == 4) && is.null(CNMI) ){ break; }
if( print ){ cat("using rule", rule, "\n"); }
if( runSel == TRUE ){
if( (rule == "FCBF") || (rule == "FCCF") ){
score = variableSelectionFCBF( cNMI=cNMI, fNMI=fNMI );
}
if( (rule == "Jcmi") || (rule == "CFCCF") ){
score = variableSelectionJcmi( cNMI=cNMI, fNMI=fNMI, CNMI=CNMI );
}
}
CLASSES = dim(pNMI)[1] ## number of classes
N = dim(pNMI)[2] ## number of features
Irr = c()
Rel = seq_along(1:N)
sig = data.frame(a=as.character(),b=as.numeric());
## step 1, select subset of relevant features
if( is.null(excSET) ){
## select subset of relevant features using cut-off on SU(Y,Fi) values.
if( is.null(delta) ){ delta = quantile(pNMI)[2]; }
RELindx = cNMI > delta
FpNMI = pNMI[,RELindx]
if( is.vector(cNMI) ){ FcNMI = cNMI[RELindx]; }
if( is.matrix(cNMI) ){ FcNMI = cNMI[,RELindx]; }
FfNMI = fNMI[RELindx,RELindx]
if( !is.null(CNMI) ){ CNMI = CNMI[RELindx,RELindx]; }
#if( !is.null(CNMI) ){
# for( i in 1:length(names(CNMI)) ){ CNMI[[i]] = CNMI[[i]][RELindx,RELindx]; }
#}
tmp = data.frame( a=as.character(colnames(pNMI)[!RELindx]),
b=as.numeric(rep(-1,sum(!RELindx))) );
sig = rbind(sig,tmp);
Irr = !RELindx
Rel = RELindx
} else {
## user defined sub set of features to exclude
EXCindx = match(excSET,colnames(pNMI))
EXCindx = EXCindx[!is.na(EXCindx)]
if( length(EXCindx) != 0 ){
FpNMI = pNMI[,-EXCindx]
if( is.vector(cNMI) ){ FcNMI = cNMI[-EXCindx]; }
if( is.matrix(cNMI) ){ FcNMI = cNMI[,-EXCindx]; }
FfNMI = fNMI[-EXCindx,-EXCindx]
if( !is.null(CNMI) ){ CNMI = CNMI[-EXCindx,-EXCindx]; }
#if( !is.null(CNMI) ){
# for( i in 1:length(names(CNMI)) ){ CNMI[[i]] = CNMI[[i]][-EXCindx,-EXCindx]; }
#}
Irr = EXCindx
Rel = Rel[-EXCindx]
} else {
FpNMI = pNMI
FcNMI = cNMI
FfNMI = fNMI
}
tmp = data.frame( a=as.character(colnames(pNMI)[EXCindx]),
b=as.numeric(rep(-1,length(EXCindx))) );
sig = rbind(sig,tmp);
}
## step 2, select predominat features from relevant ones,
## i.e. remove redundant features which have a Markov Blanket.
if( is.vector(FcNMI)){ Fsel = names(FcNMI); }
if( is.matrix(FcNMI)){ Fsel = colnames(FcNMI); }
## order features
Frank = order(FcNMI,decreasing=T)
Fsel = Fsel[Frank]
Frank = cbind(Fsel,seq(1,length(Fsel),1))
## get first feature
Fj = Fsel[1]
while( !is.null(Fj) ){
##Fopt = addFi(Fpivot, Fopt);
Fi = getNextFi(Fsel=Fsel, Fi=Fj);
while( !is.null(Fi) ){
switch(rule,
"FCBF"={
signal = FCBFrule(Fi=Fi, Fj=Fj, fSU=FfNMI, cSU=FcNMI, pSU=FpNMI, CSU=CNMI, NCLASSES=CLASSES);
},
"FCCF"={
signal = FCCFrule(Fi=Fi, Fj=Fj, fSU=FfNMI, cSU=FcNMI, pSU=FpNMI, CSU=CNMI, NCLASSES=CLASSES);
},
"Jcmi"={
signal = Jcmirule(Fi=Fi, Fj=Fj, fSU=FfNMI, cSU=FcNMI, pSU=FpNMI, CSU=CNMI, NCLASSES=CLASSES);
},
"CFCCF"={
signal = CFCCFrule(Fi=Fi, Fj=Fj, fSU=FfNMI, cSU=FcNMI, pSU=FpNMI, CSU=CNMI, NCLASSES=CLASSES);
})
if( signal == 1 ){
Fsel = removeFi(Fsel=Fsel, Fi=Fi);
if( print ){ cat("remove: ", Fi, "\n"); }
tmp = data.frame(a=as.character(Fi),b=as.numeric(signal) );
sig = rbind(sig,tmp);
}
Fi = getNextFi(Fsel=Fsel, Fi=Fi);
}
Fj = getNextFi(Fsel=Fsel, Fi=Fj);
}
tmp = data.frame(a=as.character(Fsel),b=as.numeric(rep(2,length(Fsel))));
sig = rbind(sig,tmp);
##output
CN = c("feature","irrelevant","relevant","predominat","rank","score","signal")
Fout = matrix(0,ncol=length(CN),nrow=N)
colnames(Fout) = CN
Fout[,1] = colnames(pNMI)
if( length(Irr) != 0 ){ Fout[Irr,2] = 1; }
if( length(Rel) != 0 ){ Fout[Rel,3] = 1; }
#Fout[!RELindx,2] = 1
#Fout[RELindx,3] = 1
Fout[match(Fsel,Fout[,1]),4] = 1;
if( runSel ){
Fout[match(score[,1],Fout[,1]),5] = score[,4];
Fout[match(score[,1],Fout[,1]),6] = score[,5];
} else {
Fout[,5] = rep(NA,length(Fout[,5]));
Fout[,6] = rep(NA,length(Fout[,6]));
}
#Fout[match(Frank[,1],Fout[,1]),5] = Frank[,2];
#Fout[,5] = ifelse( Fout[,6] == 0, NA, Fout[,6]);
Fout[match(sig[,1],Fout[,1]),7] = sig[,2];
}
return(Fout);
}
symmetricUncertainty <- function( MAT, GROUPS ){
NMI = NULL
cNMI = NULL
if( !is.null(MAT) ){
CN = colnames(MAT)
RN = colnames(GROUPS)
Ncol = length(CN)
Nrow = length(RN)
NMI = matrix(NA,Nrow,Ncol)
colnames(NMI) = CN
rownames(NMI) = RN
for( i in 1:Nrow ){
for( j in 1:Ncol ){
res = MutualInformation(GROUPS[,i],MAT[,j])
NMI[i,j] = res$NMI
}
}
cNMI = matrix(NA,1,Ncol)
colnames(cNMI) = CN
rownames(cNMI) = "ALL"
if( Nrow == 1 ){
cNMI = NMI
rownames(cNMI) = "ALL"
} else {
for( i in 1:(Nrow-1) ){
GROUPS[,(i+1)] = GROUPS[,(i+1)] + max(GROUPS[,i])
}
a = GROUPS[,1]
for( i in 2:Nrow ){
a = c(a,GROUPS[,i])
}
for( j in 1:Ncol ){
b=rep(MAT[,j],Nrow)
res=MutualInformation(a,b)
cNMI[1,j] = res$NMI
}
}#ifelse
}
return(list(NMI=NMI, cNMI=cNMI))
}
variableSelection <- function( VARS, Y, RES ){
S = NULL
Sn = NULL
## step 1: calculate I(X_k;Y) for each variable
temp = list()
Nf = length(colnames(VARS))
F = seq_along(1:Nf)
S = matrix(NA,ncol=5,nrow=Nf)
S[,1] = colnames(VARS)
S[,2] = rep(FALSE,Nf)
S[,3] = rep(TRUE,Nf)
colnames(S) <- c("variable name","S","!S","rank","Jcmi")
Sn = matrix(NA,ncol=5,nrow=Nf)
Sn[,1] = colnames(VARS)
Sn[,2] = rep(FALSE,Nf)
Sn[,3] = rep(TRUE,Nf)
colnames(Sn) <- c("variable name","S","!S","rank","Jcmi")
for( k in 1:Nf ){
temp[[k]] <- MutualInformation(VARS[,k],Y)
names(temp)[k] <- colnames(VARS)[k]
}
MIXkY <- cbind( sapply(temp,Extract,ii=4) )
NMIXkY <- cbind( sapply(temp,Extract,ii=5) )
rownames(MIXkY) <- NULL
rownames(NMIXkY) <- NULL
## step 2: select variable with highest I(X_k;Y) and put into set S
Fadd1 = which.max(as.numeric(MIXkY))
S[Fadd1,2] = TRUE
S[Fadd1,3] = FALSE
S[Fadd1,4] = 1
S[Fadd1,5] = round(max(as.numeric(MIXkY),na.rm=T),3) #round(res$MI[Fadd1,Fadd1],3)
Fnadd1 = which.max(as.numeric(NMIXkY))
Sn[Fnadd1,2] = TRUE
Sn[Fnadd1,3] = FALSE
Sn[Fnadd1,4] = 1
Sn[Fnadd1,5] = round(max(as.numeric(NMIXkY),na.rm=T),3) #round(res$MI[Fadd1,Fadd1],3)
## step 3: keep selecting the variable with the next highest Jcmi score, and add to set S
for( i in 1:(Nf-1) ){
jcmi = Jcmi(S=S, IXkY=MIXkY, IXjXk=RES$IXY, IXjXkCY=RES$MI)
Jmax = max(jcmi,na.rm=T)
Fadd = which.max(as.numeric(jcmi))
if( !is.na(jcmi[Fadd]) && length(Fadd) > 0 ){
S[Fadd,2] = TRUE
S[Fadd,3] = FALSE
S[Fadd,4] = (i+1)
S[Fadd,5] = round(Jmax,3)
}
jcmi = Jcmi(S=Sn, IXkY=NMIXkY, IXjXk=RES$NMIXY, IXjXkCY=RES$NMI)
Jmax = max(jcmi,na.rm=T)
Fadd = which.max(as.numeric(jcmi))
if( !is.na(jcmi[Fadd]) && length(Fadd) > 0 ){
Sn[Fadd,2] = TRUE
Sn[Fadd,3] = FALSE
Sn[Fadd,4] = (i+1)
Sn[Fadd,5] = round(Jmax,3)
}
}
return(list(S=S,Sn=Sn))
}
variableSelectionJcmi <- function( cNMI, fNMI, CNMI ){
Sn = NULL
## step 1: calculate I(X_k;Y) for each variable
temp = list()
Nf = length(colnames(cNMI))
F = seq_along(1:Nf)
Sn = matrix(NA,ncol=5,nrow=Nf)
Sn[,1] = colnames(cNMI)
Sn[,2] = rep(FALSE,Nf)
Sn[,3] = rep(TRUE,Nf)
colnames(Sn) <- c("variable name","S","!S","rank","Jcmi")
## step 2: select variable with highest I(X_k;Y) and put into set Sn
Fnadd1 = which.max(as.numeric(cNMI))
Sn[Fnadd1,2] = TRUE
Sn[Fnadd1,3] = FALSE
Sn[Fnadd1,4] = 1
Sn[Fnadd1,5] = round(max(as.numeric(cNMI),na.rm=T),3) #round(res$MI[Fadd1,Fadd1],3)
## step 3: keep selecting the variable with the next highest Jcmi score, and add to set S
for( i in 1:(Nf-1) ){
jcmi = Jcmi(S=Sn, IXkY=cNMI, IXjXk=fNMI, IXjXkCY=CNMI)
Jmax = max(jcmi,na.rm=T)
Fadd = which.max(as.numeric(jcmi))
if( !is.na(jcmi[Fadd]) && length(Fadd) > 0 ){
Sn[Fadd,2] = TRUE
Sn[Fadd,3] = FALSE
Sn[Fadd,4] = (i+1)
Sn[Fadd,5] = round(Jmax,3)
}
}
return(Sn=Sn)
}
variableSelectionFCBF <- function( cNMI, fNMI ){
Sn = NULL
## step 1: calculate I(X_k;Y) for each variable
temp = list()
Nf = length(colnames(cNMI))
F = seq_along(1:Nf)
Sn = matrix(NA,ncol=5,nrow=Nf)
Sn[,1] = colnames(cNMI)
Sn[,2] = rep(FALSE,Nf)
Sn[,3] = rep(TRUE,Nf)
colnames(Sn) <- c("variable name","S","!S","rank","fcbf")
## step 2: select variable with highest I(X_k;Y) and put into set Sn
Fnadd1 = which.max(as.numeric(cNMI))
Sn[Fnadd1,2] = TRUE
Sn[Fnadd1,3] = FALSE
Sn[Fnadd1,4] = 1
Sn[Fnadd1,5] = round(max(as.numeric(cNMI),na.rm=T),3) #round(res$MI[Fadd1,Fadd1],3)
## step 3: keep selecting the variable with the next highest Jcmi score, and add to set S
for( i in 1:(Nf-1) ){
fcbf = Fcbf(S=Sn, IXkY=cNMI, IXjXk=fNMI)
Jmax = max(fcbf,na.rm=T)
Fadd = which.max(as.numeric(fcbf))
if( !is.na(fcbf[Fadd]) && length(Fadd) > 0 ){
Sn[Fadd,2] = TRUE
Sn[Fadd,3] = FALSE
Sn[Fadd,4] = (i+1)
Sn[Fadd,5] = round(Jmax,3)
}
}
return(Sn=Sn)
}
buildPhenotypeIImatrix <- function( MAT, GROUPS ){
MI =NULL
II =NULL
NMI =NULL
IXY =NULL
IXZ =NULL
NMIXY=NULL
NMIXZ=NULL
if( !is.null(MAT) ){
CN = colnames(MAT)
N = length(CN)
MI = matrix(NA,N,N)
II = matrix(NA,N,N)
NMI = matrix(NA,N,N)
IXY = matrix(NA,N,N)
IXZ = matrix(NA,N,N)
NMIXY = matrix(NA,N,N)
NMIXZ = matrix(NA,N,N)
colnames(MI) = CN
rownames(MI) = CN
colnames(II) = CN
rownames(II) = CN
colnames(NMI) = CN
rownames(NMI) = CN
colnames(IXY) = CN
rownames(IXY) = CN
colnames(IXZ) = CN
rownames(IXZ) = CN
colnames(NMIXY) = CN
rownames(NMIXY) = CN
colnames(NMIXZ) = CN
rownames(NMIXZ) = CN
for( i in 1:N ){
for( j in 1:N ){
if( i <= j ){
res = conditionalMI(MAT[,i],MAT[,j], GROUPS)
MI[i,j] = res$MIabCc
MI[j,i] = res$MIabCc
II[i,j] = res$IIabc
II[j,i] = res$IIabc
NMI[i,j] = res$NMI
NMI[j,i] = res$NMI
IXY[i,j] = res$MIxy$MI
IXY[j,i] = res$MIxy$MI
IXZ[i,j] = res$MIxz$MI
IXZ[j,i] = res$MIxz$MI
NMIXY[i,j] = res$MIxy$NMI
NMIXY[j,i] = res$MIxy$NMI
NMIXZ[i,j] = res$MIxz$NMI
NMIXZ[j,i] = res$MIxz$NMI
}
}
}
}
return(list(MI=MI, NMI=NMI, II=II, IXY=IXY, NMIXY=NMIXY, IXZ=IXZ, NMIXZ=NMIXZ))
}
buildCommunityPhenotypeTable <- function( VATTS, FIN, ID="SPARKID", ALG=NULL, normROW=TRUE ){
RES = NULL
RAW = NULL
FN = length(names(FIN))
if( !is.null(ALG) ){
IDindx = which(colnames(VATTS)==ID)
Ccindx = which(colnames(VATTS)==ALG)
if( length(IDindx) !=0 && length(Ccindx) != 0 ){
Cmax = max(VATTS[,Ccindx])
C = seq_along(1:Cmax)
OO <- list()
RW <- list()
TOT <- list()
for( f in 1:FN ){
N = length(colnames(FIN[[f]]))
ii = seq(3,N,1)
oo <- matrix(0,Cmax,length(ii))
colnames(oo) <- sprintf("%s_%s",names(FIN)[f],colnames(FIN[[f]])[ii])
tot <- matrix(0,1,length(ii))
colnames(tot)<- sprintf("%s_%s",names(FIN)[f],colnames(FIN[[f]])[ii])
for( i in 1:length(ii) ){
val = as.vector(unlist(FIN[[f]][,.SD,.SDcol=ii[i]]))
tot[1,i] = sum(val,na.rm=T)
}
OO[[f]] <- oo
names(OO) <- names(FIN)[f]
RW[[f]] <- oo
names(RW) <- names(FIN)[f]
TOT[[f]] <- tot
names(TOT)<- names(FIN)[f]
}
for( f in 1:FN ){
N = length(colnames(FIN[[f]]))
ii = seq(3,N,1)
for( c in 1:Cmax ){
indx = ifelse(!is.na(match(VATTS[,Ccindx],C[c])),TRUE,FALSE)
cids = as.character(VATTS[indx,IDindx])
id = as.character(unlist(FIN[[f]][,.SD,.SDcol=1]))
Findx = ifelse(!is.na(match(id,cids)),TRUE,FALSE)
for( i in 1:length(ii) ){
val = as.vector(unlist(FIN[[f]][Findx,.SD,.SDcol=ii[i]]))
Raw = sum(val != 0 & !is.na(val))
Sum = sum(val,na.rm=T)
Norm = TOT[[f]][1,i]
term = 0
if( Norm != 0 ){ term = Sum/Norm }
RW[[f]][C[c],i] = RW[[f]][C[c],i] + Raw
OO[[f]][C[c],i] = OO[[f]][C[c],i] + term
}
}
}
CN <- c()
for( f in 1:FN ){
CN <- c(CN,colnames(OO[[f]]))
}
RES <- matrix(0,Cmax,length(CN))
colnames(RES) <- CN
RAW <- matrix(0,Cmax,length(CN))
colnames(RAW) <- CN
for( f in 1:FN ){
Cindx = match(colnames(OO[[f]]),colnames(RES))
RES[,Cindx] = OO[[f]]
RAW[,Cindx] = RW[[f]]
}
if( normROW ){
for( i in 1:Cmax ){
Norm = sum(RES[i,])
if( Norm != 0 ){
RES[i,] = RES[i,]/Norm
}
}
}
}
}
return(list(RES=RES,RAW=RAW))
}
# Get lower triangle of the correlation matrix
get_lower_tri<-function(cormat){
cormat[upper.tri(cormat)] <- NA
return(cormat)
}
# Get upper triangle of the correlation matrix
get_upper_tri <- function(cormat){
cormat[lower.tri(cormat)]<- NA
return(cormat)
}
plotCor <- function(MAT=NULL, TARGETS=NULL, DIR="", subDIR="",
TIT="", shortTIT="", gradientTIT="Pearson\nCorrelation",
WIDTH=480, HEIGHT=480, valSIZE=3){
gplot1 = NULL
BORDERCOL= "grey50"#"black"#grey60
LINECOL = BORDERCOL
LABELCOL = "black"
NACOL = "grey90"
YMIN = -4.0
YMAX = -4.0
if( !is.null(MAT) ){
CN = colnames(MAT)
N = length(CN)
ii = seq_along(1:N)
if( !is.null(TARGETS) ){
Nt = length(TARGETS)
MIN = rep(NA,Nt)
MAX = rep(NA,Nt)
MID = rep(NA,Nt)
LABELS = list()
for( i in 1:Nt ){
temp = ii[grepl(TARGETS[i],CN)]
MIN[i] = temp[1]
MAX[i] = temp[length(temp)]
MID[i] = MIN[i] + ((MAX[i]-MIN[i])/2)
#cat(MIN[i], ", ", MAX[i], ", ", MID[i], "\n")
LABELS[[i]] = textGrob(TARGETS[i], gp=gpar(fontsize=15, fontface="bold",col=LABELCOL))
}
}
Nlabs = length(LABELS)
colnames(MAT) = ii
rownames(MAT) = ii
cormat <- round(MAT,2)
# Melt the correlation matrix
melted_cormat <- reshape2::melt(cormat)#, na.rm=T)
df = melted_cormat
colnames(df) <- c("V1","V2","value")
# Create a ggheatmap
gplot1 <- ggplot(df, aes(df$V2, df$V1, fill = df$value))+
labs(x="",y="",title=TIT)+
geom_tile(color = "white")+
scale_fill_gradient2(low = "blue", high = "red", mid = "white",
midpoint = 0, limit = c(-1,1), space = "Lab",
name=gradientTIT,#"Pearson\nCorrelation",
na.value=NACOL) +
#theme_minimal()+ # minimal theme
theme_grey(base_size=10)+
#theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1))+
coord_fixed()+
theme(
axis.title.x = element_blank(),
axis.title.y = element_blank(),
panel.grid.major = element_blank(),
panel.border = element_rect(colour = BORDERCOL, fill=NA, size=1),
panel.background = element_blank(),
axis.line = element_line(colour = BORDERCOL),
axis.ticks = element_blank(),
legend.justification = c("right","center"),
legend.margin=margin(grid::unit(0,"cm")),
legend.position = "right",
legend.direction = "vertical")+
guides(fill = guide_colorbar(barwidth = 1, barheight = 7,
title.position = "top", title.hjust = 0.5))+
{if( !is.na(MAX[1]))geom_vline(xintercept = MAX[1], color=LINECOL);}+
{if( !is.na(MAX[1]))geom_hline(yintercept = MAX[1], color=LINECOL);}+
{if( !is.na(MAX[2]))geom_vline(xintercept = MAX[2], color=LINECOL);}+
{if( !is.na(MAX[2]))geom_hline(yintercept = MAX[2], color=LINECOL);}+
{if( !is.na(MAX[3]))geom_vline(xintercept = MAX[3], color=LINECOL);}+
{if( !is.na(MAX[3]))geom_hline(yintercept = MAX[3], color=LINECOL);}+
{if( 1 <= Nlabs )annotation_custom(LABELS[[1]],xmin=MID[1],xmax=MID[1],ymin=YMIN,ymax=YMAX);}+
{if( 2 <= Nlabs )annotation_custom(LABELS[[2]],xmin=MID[2],xmax=MID[2],ymin=YMIN,ymax=YMAX);}+
{if( 3 <= Nlabs )annotation_custom(LABELS[[3]],xmin=MID[3],xmax=MID[3],ymin=YMIN,ymax=YMAX);}+
{if( 4 <= Nlabs )annotation_custom(LABELS[[4]],xmin=MID[4],xmax=MID[4],ymin=YMIN,ymax=YMAX);}
png(sprintf("%s/%s/%s.png",DIR,subDIR,shortTIT),width=WIDTH,height=HEIGHT,units="px");
print(gplot1)
dev.off()
}
return(gplot1)
}
plotMI <- function(MAT=NULL, TARGETS=NULL, DIR="", subDIR="",
TIT="", shortTIT="",
WIDTH=480, HEIGHT=480, valSIZE=3){
gplot1 = NULL
BORDERCOL= "grey50"#"black"#grey60
LINECOL = BORDERCOL
LABELCOL = "black"
NACOL = "grey90"
YMIN = -4.0
YMAX = -4.0
if( !is.null(MAT) ){
CN = colnames(MAT)
N = length(CN)
ii = seq_along(1:N)
if( !is.null(TARGETS) ){
Nt = length(TARGETS)
MIN = rep(NA,Nt)
MAX = rep(NA,Nt)
MID = rep(NA,Nt)
LABELS = list()
for( i in 1:Nt ){
temp = ii[grepl(TARGETS[i],CN)]
MIN[i] = temp[1]
MAX[i] = temp[length(temp)]
MID[i] = MIN[i] + ((MAX[i]-MIN[i])/2)
#cat(MIN[i], ", ", MAX[i], ", ", MID[i], "\n")
LABELS[[i]] = textGrob(TARGETS[i], gp=gpar(fontsize=15, fontface="bold",col=LABELCOL))
}
}
Nlabs <- length(LABELS)
colnames(MAT) = ii
rownames(MAT) = ii
cormat <- round(MAT,2)
# Melt the correlation matrix
melted_cormat <- reshape2::melt(cormat)#, na.rm=T)
df = melted_cormat
colnames(df) <- c("V1","V2","value")
top = (max(as.numeric(as.vector(df$value))))
bot = (min(as.numeric(as.vector(df$value))))
mid = ((top-bot)/2) - abs(bot)
# Create a ggheatmap
gplot1 <- ggplot(df, aes(df$V2, df$V1, fill = df$value))+
labs(x="",y="",title=TIT)+
geom_tile(color = "white")+
scale_fill_gradient2(low = "cyan", high = "red", mid="black",
midpoint = mid, limit = c(bot,top), space = "Lab",
name="MI",
na.value=NACOL) +
#theme_minimal()+ # minimal theme
theme_grey(base_size=10)+
#theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1))+
coord_fixed()+
theme(
axis.title.x = element_blank(),
axis.title.y = element_blank(),
panel.grid.major = element_blank(),
panel.border = element_rect(colour = BORDERCOL, fill=NA, size=1),
panel.background = element_blank(),
axis.line = element_line(colour = BORDERCOL),
axis.ticks = element_blank(),
legend.justification = c("right","center"),
legend.margin=margin(grid::unit(0,"cm")),
legend.position = "right",
legend.direction = "vertical")+
guides(fill = guide_colorbar(barwidth = 1, barheight = 7,
title.position = "top", title.hjust = 0.5))+
{if( !is.na(MAX[1]))geom_vline(xintercept = MAX[1], color=LINECOL);}+
{if( !is.na(MAX[1]))geom_hline(yintercept = MAX[1], color=LINECOL);}+
{if( !is.na(MAX[2]))geom_vline(xintercept = MAX[2], color=LINECOL);}+
{if( !is.na(MAX[2]))geom_hline(yintercept = MAX[2], color=LINECOL);}+
{if( !is.na(MAX[3]))geom_vline(xintercept = MAX[3], color=LINECOL);}+
{if( !is.na(MAX[3]))geom_hline(yintercept = MAX[3], color=LINECOL);}+
{if( 1 <= Nlabs )annotation_custom(LABELS[[1]],xmin=MID[1],xmax=MID[1],ymin=YMIN,ymax=YMAX);}+
{if( 2 <= Nlabs )annotation_custom(LABELS[[2]],xmin=MID[2],xmax=MID[2],ymin=YMIN,ymax=YMAX);}+
{if( 3 <= Nlabs )annotation_custom(LABELS[[3]],xmin=MID[3],xmax=MID[3],ymin=YMIN,ymax=YMAX);}+
{if( 4 <= Nlabs )annotation_custom(LABELS[[4]],xmin=MID[4],xmax=MID[4],ymin=YMIN,ymax=YMAX);}
png(sprintf("%s/%s/%s.png",DIR,subDIR,shortTIT),width=WIDTH,height=HEIGHT,units="px");
print(gplot1)
dev.off()
}
return(gplot1)
}
plotII <- function(MAT=NULL, TARGETS=NULL, DIR="", subDIR="",
TIT="", shortTIT="",
WIDTH=480, HEIGHT=480, valSIZE=3){
gplot1 = NULL
BORDERCOL= "grey50"#"black"#grey60
LINECOL = BORDERCOL
LABELCOL = "black"
NACOL = "grey90"
YMIN = -4.0
YMAX = -4.0
if( !is.null(MAT) ){
CN = colnames(MAT)
N = length(CN)
ii = seq_along(1:N)
if( !is.null(TARGETS) ){
Nt = length(TARGETS)
MIN = rep(NA,Nt)
MAX = rep(NA,Nt)
MID = rep(NA,Nt)
LABELS = list()
for( i in 1:Nt ){
temp = ii[grepl(TARGETS[i],CN)]
MIN[i] = temp[1]
MAX[i] = temp[length(temp)]
MID[i] = MIN[i] + ((MAX[i]-MIN[i])/2)
#cat(MIN[i], ", ", MAX[i], ", ", MID[i], "\n")
LABELS[[i]] = textGrob(TARGETS[i], gp=gpar(fontsize=15, fontface="bold",col=LABELCOL))
}
}
Nlabs <- length(LABELS)
colnames(MAT) = ii
rownames(MAT) = ii
cormat <- round(MAT,2)
# Melt the correlation matrix
melted_cormat <- reshape2::melt(cormat)#, na.rm=T)
df = melted_cormat
colnames(df) <- c("V1","V2","value")
top = (max(as.numeric(as.vector(df$value))))
bot = (min(as.numeric(as.vector(df$value))))
mid = ((top-bot)/2) - abs(bot)
# Create a ggheatmap
gplot1 <- ggplot(df, aes(df$V2, df$V1, fill = df$value))+
labs(x="",y="",title=TIT)+
geom_tile(color = "white")+
scale_fill_gradient2(low = "cyan", high = "red",
limit = c(bot,top), space = "Lab",
name="II",
na.value=NACOL) +
#theme_minimal()+ # minimal theme
theme_grey(base_size=10)+
#theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1))+
coord_fixed()+
theme(
axis.title.x = element_blank(),
axis.title.y = element_blank(),
panel.grid.major = element_blank(),
panel.border = element_rect(colour = BORDERCOL, fill=NA, size=1),
panel.background = element_blank(),
axis.line = element_line(colour = BORDERCOL),
axis.ticks = element_blank(),
legend.justification = c("right","center"),
legend.margin=margin(grid::unit(0,"cm")),
legend.position = "right",
legend.direction = "vertical")+
guides(fill = guide_colorbar(barwidth = 1, barheight = 7,
title.position = "top", title.hjust = 0.5))+
{if( !is.na(MAX[1]))geom_vline(xintercept = MAX[1], color=LINECOL);}+
{if( !is.na(MAX[1]))geom_hline(yintercept = MAX[1], color=LINECOL);}+
{if( !is.na(MAX[2]))geom_vline(xintercept = MAX[2], color=LINECOL);}+
{if( !is.na(MAX[2]))geom_hline(yintercept = MAX[2], color=LINECOL);}+
{if( !is.na(MAX[3]))geom_vline(xintercept = MAX[3], color=LINECOL);}+
{if( !is.na(MAX[3]))geom_hline(yintercept = MAX[3], color=LINECOL);}+
{if( 1 <= Nlabs )annotation_custom(LABELS[[1]],xmin=MID[1],xmax=MID[1],ymin=YMIN,ymax=YMAX);}+
{if( 2 <= Nlabs )annotation_custom(LABELS[[2]],xmin=MID[2],xmax=MID[2],ymin=YMIN,ymax=YMAX);}+
{if( 3 <= Nlabs )annotation_custom(LABELS[[3]],xmin=MID[3],xmax=MID[3],ymin=YMIN,ymax=YMAX);}+
{if( 4 <= Nlabs )annotation_custom(LABELS[[4]],xmin=MID[4],xmax=MID[4],ymin=YMIN,ymax=YMAX);}
png(sprintf("%s/%s/%s.png",DIR,subDIR,shortTIT),width=WIDTH,height=HEIGHT,units="px");
print(gplot1)
dev.off()
}
return(gplot1)
}
plotNMI <- function(MAT=NULL, TARGETS=NULL, DIR="", subDIR="",
TIT="", shortTIT="", gradientTIT="NMI",
WIDTH=480, HEIGHT=480, valSIZE=3){
gplot1 = NULL
BORDERCOL= "grey50"#"black"#grey60
LINECOL = BORDERCOL
LABELCOL = "black"
NACOL = "grey90"
YMIN = -4.0
YMAX = -4.0
if( !is.null(MAT) ){
CN = colnames(MAT)
N = length(CN)
ii = seq_along(1:N)
if( !is.null(TARGETS) ){
Nt = length(TARGETS)
MIN = rep(NA,Nt)
MAX = rep(NA,Nt)
MID = rep(NA,Nt)
LABELS = list()
for( i in 1:Nt ){
temp = ii[grepl(TARGETS[i],CN)]
MIN[i] = temp[1]
MAX[i] = temp[length(temp)]
MID[i] = MIN[i] + ((MAX[i]-MIN[i])/2)
#cat(MIN[i], ", ", MAX[i], ", ", MID[i], "\n")
LABELS[[i]] = textGrob(TARGETS[i], gp=gpar(fontsize=15, fontface="bold",col=LABELCOL))
}
}
Nlabs = length(LABELS)
colnames(MAT) = ii
rownames(MAT) = ii
cormat <- round(MAT,2)
# Melt the correlation matrix
melted_cormat <- reshape2::melt(cormat)#, na.rm=T)
df = melted_cormat
colnames(df) <- c("V1","V2","value")
# Create a ggheatmap
gplot1 <- ggplot(df, aes(df$V2, df$V1, fill = df$value))+
labs(x="",y="",title=TIT)+
geom_tile(color = "white")+
scale_fill_gradient2(low = "cyan", high = "red", mid="black",
midpoint = 0.5, limit = c(0,1), space = "Lab",
name=gradientTIT,
na.value=NACOL) +
#theme_minimal()+ # minimal theme
theme_grey(base_size=10)+
#theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1))+
coord_fixed()+
theme(
axis.title.x = element_blank(),
axis.title.y = element_blank(),
panel.grid.major = element_blank(),
panel.border = element_rect(colour = BORDERCOL, fill=NA, size=1),
panel.background = element_blank(),
axis.line = element_line(colour = BORDERCOL),
axis.ticks = element_blank(),
legend.justification = c("right","center"),
legend.margin=margin(grid::unit(0,"cm")),
legend.position = "right",
legend.direction = "vertical")+
guides(fill = guide_colorbar(barwidth = 1, barheight = 7,
title.position = "top", title.hjust = 0.5))+
{if( !is.na(MAX[1]))geom_vline(xintercept = MAX[1], color=LINECOL);}+
{if( !is.na(MAX[1]))geom_hline(yintercept = MAX[1], color=LINECOL);}+
{if( !is.na(MAX[2]))geom_vline(xintercept = MAX[2], color=LINECOL);}+
{if( !is.na(MAX[2]))geom_hline(yintercept = MAX[2], color=LINECOL);}+
{if( !is.na(MAX[3]))geom_vline(xintercept = MAX[3], color=LINECOL);}+
{if( !is.na(MAX[3]))geom_hline(yintercept = MAX[3], color=LINECOL);}+
{if( 1 <= Nlabs )annotation_custom(LABELS[[1]],xmin=MID[1],xmax=MID[1],ymin=YMIN,ymax=YMAX);}+
{if( 2 <= Nlabs )annotation_custom(LABELS[[2]],xmin=MID[2],xmax=MID[2],ymin=YMIN,ymax=YMAX);}+
{if( 3 <= Nlabs )annotation_custom(LABELS[[3]],xmin=MID[3],xmax=MID[3],ymin=YMIN,ymax=YMAX);}+
{if( 4 <= Nlabs )annotation_custom(LABELS[[4]],xmin=MID[4],xmax=MID[4],ymin=YMIN,ymax=YMAX);}
png(sprintf("%s/%s/%s.png",DIR,subDIR,shortTIT),width=WIDTH,height=HEIGHT,units="px");
print(gplot1)
dev.off()
}
return(gplot1)
}
binomialModel <- function( data, alpha0=NULL, beta0=NULL, plot=TRUE ){
##Ref: http://dm13450.github.io/2019/02/22/Nonparametric-Prior.html
gplot=NULL
if( is.null(alpha0) ){ alpha0 = 0.01 }
if( is.null(beta0) ){ beta0 = 0.01 }
alphaPosterior <- alpha0 + sum(data[,1])
betaPosterior <- beta0 + sum(data[,2]) - sum(data[,1])
thetaDraws <- rbeta(1000, alphaPosterior, betaPosterior)
if( plot ){
xGrid <- seq(0.01, 1-0.01, 0.01)
bayesFrame <- data.frame(Theta=xGrid,
Prior = dbeta(xGrid, alpha0, beta0),
Likelihood = vapply(xGrid, function(x) sum(dbinom(data[,1], data[,2], x, log=T)), numeric(1)),
Posterior = vapply(xGrid, function(x) dbeta(x, alphaPosterior, betaPosterior), numeric(1)))
bayesFrame %>% gather(Part, Value, -Theta) -> bayesFrameTidy
bayesFrameTidy$Part <- factor(bayesFrameTidy$Part, levels=c("Prior", "Likelihood", "Posterior"))
gplot = ggplot(bayesFrameTidy, aes(x=Theta, y=Value, colour=Part)) +
geom_line() +
facet_wrap(~Part, scales="free") +
theme(legend.position = "bottom")
}
return(list(THETA=thetaDraws,GPLOT=gplot))
}
plotJcmi <- function(DF, DIR="", subDIR="", TIT="", shortTIT="", WIDTH=480, HEIGHT=480, STEPS=0.05, INTmax=30, runSCALED=FALSE, scaledMIN=-0.5, scaledMAX=0.5 ){
DF <- as.data.frame(DF)
CLASSES <- levels(DF[,2])
PLOTS <- list()
for( i in 1:length(CLASSES) ){
DF2 = DF[grepl(CLASSES[i], DF[,2]),]
MIN <- min(as.numeric(as.vector(DF2[,1])))
MAX <- max(as.numeric(as.vector(DF2[,1])))
INT <- ceiling((MAX-MIN)/STEPS)
#print(INT)
if( INT > INTmax ){ INT = INTmax }
BW <- round((MAX-MIN)/INT,3)
gplot <- ggplot(DF2, aes(as.numeric(as.vector(jcmi))))+#,fill=class))+
geom_histogram(binwidth=BW,fill="gray80",color="black")+
scale_x_continuous(breaks = seq(MIN, MAX, by = BW))+
stat_bin(binwidth = BW, aes(label=..count..), vjust=-0.5, geom = "text")+
geom_vline(xintercept = 0, color = "red", size=1)+
labs(x=shortTIT,y="count",title=CLASSES[i])+
theme(
title=element_text(face="bold",size=rel(1.5)),
axis.title.x=element_text(face="bold",size=rel(1.0)),
axis.title.y=element_text(face="bold",size=rel(1.0)),
axis.text.x = element_text(angle = 40, hjust = 1, face="bold", size=rel(1.0)),
legend.title=element_text(face="bold",size=rel(1.0)),
legend.text=element_text(face="bold",size=rel(1.0)),
legend.key=element_blank())+
theme(panel.grid.major = element_line(colour = "grey40",size=0.2),
#panel.grid.minor = element_line(colour="grey40",size=0.1),
panel.background = element_rect(fill = "white"),
panel.border = element_rect(linetype="solid",fill=NA))
png(sprintf("%s/%s/%s_%s.png",DIR,subDIR,shortTIT,CLASSES[i]),width=WIDTH,height=HEIGHT,units="px");
print(gplot)
dev.off()
PLOTS[[i]] <- gplot
names(PLOTS)[i] <- CLASSES[i]
}
MIN <- min(as.numeric(as.vector(DF[,1])))
MAX <- max(as.numeric(as.vector(DF[,1])))
gplot <- ggplot(DF) +
geom_density(aes(x = as.numeric(as.vector(jcmi)), fill = class), alpha = 0.2)+
geom_vline(xintercept = 0, color = "red", size=1)+
labs(x=shortTIT,y="count",title="ALL Classes")+
xlim(c(MIN,MAX))+
theme(
title=element_text(face="bold",size=rel(1.5)),
axis.title.x=element_text(face="bold",size=rel(1.0)),
axis.title.y=element_text(face="bold",size=rel(1.0)),
axis.text.x = element_text(angle = 40, hjust = 1, face="bold", size=rel(1.0)),
legend.title=element_text(face="bold",size=rel(1.0)),
legend.text=element_text(face="bold",size=rel(1.0)),
legend.key=element_blank())+
theme(panel.grid.major = element_line(colour = "grey40",size=0.2),
#panel.grid.minor = element_line(colour="grey40",size=0.1),
panel.background = element_rect(fill = "white"),
panel.border = element_rect(linetype="solid",fill=NA))
png(sprintf("%s/%s/%s.png",DIR,subDIR,shortTIT),width=WIDTH,height=HEIGHT,units="px");
print(gplot)
dev.off()
i=(i+1)
PLOTS[[i]] <- gplot
names(PLOTS)[i] <- "ALL"
if( runSCALED ){
MIN = scaledMIN
MAX = scaledMAX
gplot <- ggplot(DF) +
geom_density(aes(x = as.numeric(as.vector(jcmi)), fill = class), alpha = 0.2)+
geom_vline(xintercept = 0, color = "red", size=1)+
labs(x=shortTIT,y="count",title="ALL Classes")+
xlim(c(MIN,MAX))+
theme(
title=element_text(face="bold",size=rel(1.5)),
axis.title.x=element_text(face="bold",size=rel(1.0)),
axis.title.y=element_text(face="bold",size=rel(1.0)),
axis.text.x = element_text(angle = 40, hjust = 1, face="bold", size=rel(1.0)),
legend.title=element_text(face="bold",size=rel(1.0)),
legend.text=element_text(face="bold",size=rel(1.0)),
legend.key=element_blank())+
theme(panel.grid.major = element_line(colour = "grey40",size=0.2),
#panel.grid.minor = element_line(colour="grey40",size=0.1),
panel.background = element_rect(fill = "white"),
panel.border = element_rect(linetype="solid",fill=NA))
png(sprintf("%s/%s/%s_scaled.png",DIR,subDIR,shortTIT),width=WIDTH,height=HEIGHT,units="px");
print(gplot)
dev.off()
i=(i+1)
PLOTS[[i]] <- gplot
names(PLOTS)[i] <- "ALLscaled"
}
return(PLOTS)
}
binomialDPModel <- function( data, alpha0=NULL, beta0=NULL, ITS=20, FIT=TRUE ){
##Ref: http://dm13450.github.io/2019/02/22/Nonparametric-Prior.html
saveList <- NULL
if(FIT){
xGrid <- seq(0,1,by=0.01)
if( is.null(alpha0) ){ alpha0 = 0.01 }
if( is.null(beta0) ){ beta0 = 0.01 }
N = length(data[,1])
alphaPosterior <- alpha0 + sum(data[,1])
betaPosterior <- beta0 + sum(data[,2]) - sum(data[,1])
thetaDirichlet <- rbeta(N, alphaPosterior, betaPosterior)
dp <- DirichletProcessBeta( thetaDirichlet,
1,
mhStep = c(0.002, 0.005),
#(alpha,beta) shape parameters for beta distribution
#https://en.wikipedia.org/wiki/Beta_distribution
alphaPrior = c(2,2))#c(2,0.5) )
dp$numberClusters <- N
dpclusterLabels <- seq_len(N)
dp$clusterParameters <- PriorDraw(dp$mixingDistribution, N)
dp$pointsPerCluster <- rep_len(1, N)
dp <- Fit(dp,1)
postFuncEval <- matrix(ncol=ITS, nrow=length(xGrid))
muPostVals <- matrix(ncol=ITS, nrow=N)
nuPostVals <- matrix(ncol=ITS, nrow=N)
pb <- txtProgressBar(max=ITS, width=50, char="-", style=3)
Error = 1
Problem = PosteriorClusters(dp)$weights
Theta = thetaDirichlet
tryCatch(
expr = {
for( i in seq_len(ITS) ){
postClusters <- PosteriorClusters(dp)
postFuncEval[,i] <- PosteriorFunction(dp)(xGrid)
if( length(postClusters$weights) > 0 &&
sum(is.na(postClusters$weights)) == 0 ){
wk <- sample.int(length(postClusters$weights),
N,
replace=T,
prob=postClusters$weights)
muPost <- postClusters$params[[1]][,,wk]
nuPost <- postClusters$params[[2]][,,wk]
aPost <- muPost * nuPost
bPost <- (1-muPost) * nuPost
muPostVals[,i] <- muPost
nuPostVals[,i] <- nuPost
newTheta <- rbeta(N, aPost+data[,1], bPost+data[,2] - data[,1])
dp <- ChangeObservations(dp, newTheta)
dp <- Fit(dp, 100, updatePrior=T, progressBar=F)
Error <- 0
Problem <- postClusters$weights
Theta <- newTheta
#generate an error
#log("text")
}
setTxtProgressBar(pb, i)
}#for
},
error = function(e){
message('Caught an error!')
print(e)
Error = 1
Problem = postClusters$weights
}
#,
#
# warning = function(w){
# message('Caught an warning!')
# #print(w)
# }
)
saveList <- list()
saveList$muPostVals <- muPostVals
saveList$nuPostVals <- nuPostVals
saveList$postFuncEval <- postFuncEval
saveList$dp <- dp
saveList$Error <- Error
saveList$Problem <- Problem
saveList$Theta <- Theta
}
return(saveList)
}
variablePlotBiomModel <- function(DF, DIR, subDIR, WIDTH, HEIGTH ){
gplot = ggplot(DF, aes(x=as.numeric(as.vector(mean))) ) +
stat_bin(breaks=c(-0.004, seq(0.001,1.0, by=0.005)),color="#E69F00")+
labs(x="mean(theta)",y="count")+
theme(
title=element_text(face="bold",size=rel(1.5)),
axis.title.x=element_text(face="bold",size=rel(1.5)),
axis.title.y=element_text(face="bold",size=rel(1.5)),
legend.title=element_text(face="bold",size=rel(2.0)),
legend.text=element_text(face="bold",size=rel(2.0)),
legend.key=element_blank())+
theme(panel.grid.major = element_line(colour = "grey40",size=0.2),
panel.grid.minor = element_line(colour="grey40",size=0.1),
panel.background = element_rect(fill = "white"),
panel.border = element_rect(linetype="solid",fill=NA))
png(sprintf("%s/%s/%s.png",DIR,subDIR,"Mn_theta"),width=WIDTH,height=HEIGHT,units="px");
print(gplot)
dev.off()
gplot = ggplot(DF, aes(x=as.numeric(as.vector(mean)),color=set, fill=set) ) +
stat_bin(breaks=c(-0.004, seq(0.001,1.0, by=0.005)))+
labs(x="mean(theta)",y="count")+
scale_color_manual(values=c("#E69F00", "#999999", "#56B4E9", "#CC79A7"))+
theme(
title=element_text(face="bold",size=rel(1.5)),
axis.title.x=element_text(face="bold",size=rel(1.5)),
axis.title.y=element_text(face="bold",size=rel(1.5)),
legend.title=element_text(face="bold",size=rel(2.0)),
legend.text=element_text(face="bold",size=rel(2.0)),
legend.key=element_blank(),
legend.background=element_blank() )+
theme(panel.grid.major = element_line(colour = "grey40",size=0.2),
panel.grid.minor = element_line(colour="grey40",size=0.1),
panel.background = element_rect(fill = "white"),
panel.border = element_rect(linetype="solid",fill=NA))
png(sprintf("%s/%s/%s.png",DIR,subDIR,"Mn_theta_bySet"),width=WIDTH,height=HEIGHT,units="px");
print(gplot)
dev.off()
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.