knitr::opts_chunk$set(echo = TRUE)

library(FDclassifieR)
library(caret)     # createDataPartition() used below
library(magrittr)  # %>% pipe used below

Prepare Data

set.seed(1024)

#yeast <- read_table(url("http://archive.ics.uci.edu/ml/machine-learning-databases/yeast/yeast.data"))
tmp <- read.table('data/yeast.csv')
names(tmp)<- c("SequenceName", "mcg", "gvh", "alm", 
               "mit", "erl", "pox", "vac", "nuc", "LocalizationSite")

#head(tmp)
#table(tmp$LocalizationSite)

# keep only the 'CYT' and 'NUC' classes and drop SequenceName
Yeast <- tmp[tmp$LocalizationSite %in% c('CYT', 'NUC'), 2:10]
names(Yeast)[ncol(Yeast)] <- 'Site'
Yeast$Site <- factor(Yeast$Site, c('CYT', 'NUC'))

inTraining0 <- createDataPartition(Yeast$Site, p = .75, list = FALSE)  # stratified 75/25 split
training <- Yeast[ inTraining0,]
testing  <- Yeast[-inTraining0,]
testingY <- to_label(Yeast[-inTraining0, ncol(Yeast)])  # test-set labels for performance evaluation
table(Yeast$Site)
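
A quick check that the stratified split keeps the two classes balanced in both partitions:

# class balance in the training and test partitions
table(training$Site)
table(testing$Site)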

Data analysis

pca <- princomp(tmp[, 2:9], cor=T) # principal components analysis using correlation matrix
pc.comp <- pca$scores
PrincipalComponent1 <- -1*pc.comp[,1] # principal component 1 scores (negated for convenience)
PrincipalComponent2 <- -1*pc.comp[,2] # principal component 2 scores (negated for convenience)
clustering.data <- cbind(PrincipalComponent1, PrincipalComponent2)
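
Before clustering on only the first two components, it helps to check how much of the variance they capture; summary() on the princomp object reports this directly.

# variance explained by each principal component
summary(pca)
screeplot(pca, type = "lines")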

K-Means Clustering

set.seed(100)
km <- kmeans(clustering.data, 8, iter.max = 30, nstart = 30)  # k-means with 8 centers and 30 random starts
#km
km$cluster
plot(PrincipalComponent1, PrincipalComponent2, col=km$cluster)
points(km$centers, pch=16)

aggregate(tmp[, 2:9],by=list(km$cluster),mean)
table(km$cluster, tmp$LocalizationSite)
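
The number of clusters is fixed at 8 above; a quick elbow plot of the total within-cluster sum of squares (a minimal base-R sketch) can be used to judge that choice:

# total within-cluster sum of squares for k = 1..12
set.seed(100)
wss <- sapply(1:12, function(k)
  kmeans(clustering.data, k, iter.max = 30, nstart = 10)$tot.withinss)
plot(1:12, wss, type = "b",
     xlab = "Number of clusters k", ylab = "Total within-cluster SS")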

Spectral Clustering

library(kknn)
cl <- specClust(clustering.data, centers=8, nn=50, iter.max=100)  # spectral clustering on a 50-nearest-neighbour similarity graph
#cl
plot(PrincipalComponent1, PrincipalComponent2, col=cl$cluster)

table(cl$cluster, tmp$LocalizationSite)

aggregate(tmp[, 2:9],by=list(cl$cluster),mean)
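
The cross-tabulations above can be condensed into a single agreement score; a sketch using the adjusted Rand index from the mclust package (an extra dependency, assumed to be installed):

# adjusted Rand index of each clustering against the true localization sites
library(mclust)
adjustedRandIndex(km$cluster, tmp$LocalizationSite)
adjustedRandIndex(cl$cluster, tmp$LocalizationSite)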

Hierarchical Clustering

d_yeast <- dist(clustering.data)                  # Euclidean distances in the PC1/PC2 plane
hclusters <- hclust(d_yeast, method = "average")  # average-linkage hierarchical clustering
clusterCut <- cutree(hclusters, 8)                # cut the dendrogram into 8 clusters
#clusterCut
table(clusterCut, tmp$LocalizationSite)
aggregate(tmp[, 2:9],by=list(clusterCut),mean)

plot(PrincipalComponent1, PrincipalComponent2, col=clusterCut)
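
The 8-cluster cut can also be inspected on the dendrogram itself:

# dendrogram with the 8-cluster cut highlighted
plot(hclusters, labels = FALSE, hang = -1, main = "Average-linkage dendrogram")
rect.hclust(hclusters, k = 8, border = "red")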

Train models

model_list <- c('nnet', 'rda', 'svmLinear', 'svmRadial', 'pls', 'knn', 'earth', 'avNNet', 'mlp', 'nb', 'rf', 'rpart', 'ctree', 'C5.0', 'gbm', 'bayesglm', 'glm', 'glmnet', 'simpls')
t1 <- mtrainer(model_list, dataInfo = 'Yeast')
t1 <- train(t1, Site~., training, update=TRUE)
plot(t1)
summary(t1)
t1 <- t1 %>%
  addmodel.mtrainer(c('ctree', 'C5.0', 'gbm', 'svmLinear', 'svmRadial', 'pls', 'earth', 'avNNet')) %>%
  train(Site~., training) 
t1 <- predict(t1, newdata=testing)
#auclist <- apply(t1$predictions, 2, auc.rank, testingY)
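
For comparison, any single base learner from model_list can be trained directly with caret on the same split; a minimal sketch using a random forest (illustrative only, not part of the FiDEL pipeline; assumes the randomForest package is installed):

# one caret model on the same training/testing split, for reference
set.seed(1024)
fit_rf <- caret::train(Site ~ ., data = training, method = 'rf',
                       trControl = caret::trainControl(method = 'cv', number = 5))
caret::confusionMatrix(predict(fit_rf, newdata = testing), testing$Site)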

fde1 <- fde(t1$predictions)
fde1 <- calculate_performance(fde1, testingY, alpha=7)
#fde1 <- predict_performance(fde1, auclist, attr(testingY, 'rho'))
#plot_performance(fde1, nsample=100, trendline=F)
plot_performance_nmethods(fde1, nmethod_list = 3:15, nsample=100)
plot_cor(fde1, class_flag='positive')

plot_cor(fde1, class_flag='negative')
fde1 <- fde(t1$predictions, testingY)
plot_single(fde1, 'score')
store.mtrainer(t1, 'yeast_m8_pre.RData')
saveRDS(testingY, 'yeast_m8_y.RData')
saveRDS(t1, 'yeast_all.RData')
plot_ensemble(fde1, method='invauc', alpha=0.8)
y <- to_label(fde1@actual_label, class1 = 'NUC')
p1 <- pcr(fde1@predictions[,10], y, sample_size = 100, sample_n = 1000)
plot(p1)

