In yhhc2/machinelearnr: Package with various functions for preprocessing, clustering, and classification

Installation

# Fetch and install machinelearnr from its GitHub repository.
# This call goes through the devtools package.
devtools::install_github(repo = "yhhc2/machinelearnr")

# Attach the package for the examples that follow.
library(machinelearnr)

Examples

All functions with example code are run in this section. The functions are listed below in alphabetical order with example code to illustrate how each function should be used. The example code should be very similar to the example code in the function reference.

To see detailed descriptions for each function, please visit the package's website.

Examples for clustering

CalcOptimalNumClustersForKMeans()

# Toy data set: 22 points described by two numeric columns, x and y.
x.values <- c(18, 21, 22, 24, 26, 26, 27, 30, 31, 35, 39,
              40, 41, 42, 44, 46, 47, 48, 49, 54, 35, 30)
y.values <- c(10, 11, 22, 15, 12, 13, 14, 33, 39, 37, 44,
              27, 29, 20, 28, 21, 30, 31, 23, 24, 40, 45)
example.data <- data.frame(x = x.values, y = y.values)

# Scatter plot of the raw points.
# dev.new()
plot(example.data$x, example.data$y)

# Results should say that 3 clusters is optimal
output <- CalcOptimalNumClustersForKMeans(example.data, c("x", "y"))

# The first list element is kept as the elbow plot, the second as the
# CH and ASW plot.
elbow.plot <- output[[1]]
ch.and.asw.plot <- output[[2]]

# dev.new()
print(elbow.plot)

# dev.new()
print(ch.and.asw.plot)

generate.2D.clustering.with.labeled.subgroup()

# Toy data set: 22 points with numeric columns x and y plus a constant
# column z (every value is 1).
example.data <- data.frame(
  x = c(18, 21, 22, 24, 26, 26, 27, 30, 31, 35, 39,
        40, 41, 42, 44, 46, 47, 48, 49, 54, 35, 30),
  y = c(10, 11, 22, 15, 12, 13, 14, 33, 39, 37, 44,
        27, 29, 20, 28, 21, 30, 31, 23, 24, 40, 45),
  z = rep(1, times = 22)
)

# Columns fed to both k-means and PCA.
feature.columns <- c("x", "y", "z")

# dev.new()
plot(example.data$x, example.data$y)

# k-means with 3 centers; the cluster assignment of every row is kept in
# `grouped`.
km.res <- stats::kmeans(example.data[, feature.columns], 3,
                        nstart = 25, iter.max = 10)
grouped <- km.res$cluster

# The prcomp() argument is spelled `scale.`; writing `scale` only works
# through partial argument matching, so the full name is used here.
pca.results <- stats::prcomp(example.data[, feature.columns], scale. = FALSE)

# Known subgroup of each row: the first 7 rows are "A", the other 15 are "B".
actual.group.label <- rep(c("A", "B"), times = c(7, 15))

results <- generate.2D.clustering.with.labeled.subgroup(pca.results, grouped,
                                                        actual.group.label)

# PC1 vs PC2
print(results[[1]])

# PC1 vs PC3
print(results[[2]])

# Chi-square results
print(results[[3]])

# Table
print(results[[4]])

generate.3D.clustering.with.labeled.subgroup()

# Toy data set: 22 points with numeric columns x, y and z, where z takes
# the value 1 for the first 7 rows, 2 for the next 6 and 3 for the last 9.
example.data <- data.frame(
  x = c(18, 21, 22, 24, 26, 26, 27, 30, 31, 35, 39,
        40, 41, 42, 44, 46, 47, 48, 49, 54, 35, 30),
  y = c(10, 11, 22, 15, 12, 13, 14, 33, 39, 37, 44,
        27, 29, 20, 28, 21, 30, 31, 23, 24, 40, 45),
  z = rep(c(1, 2, 3), times = c(7, 6, 9))
)

# Columns fed to both k-means and PCA.
feature.columns <- c("x", "y", "z")

# dev.new()
plot(example.data$x, example.data$y)

# k-means with 3 centers; the cluster assignment of every row is kept in
# `grouped`.
km.res <- stats::kmeans(example.data[, feature.columns], 3,
                        nstart = 25, iter.max = 10)
grouped <- km.res$cluster

# The prcomp() argument is spelled `scale.`; writing `scale` only works
# through partial argument matching, so the full name is used here.
pca.results <- stats::prcomp(example.data[, feature.columns], scale. = FALSE)

# Known subgroup of each row: the first 7 rows are "A", the other 15 are "B".
actual.group.label <- rep(c("A", "B"), times = c(7, 15))

results <- generate.3D.clustering.with.labeled.subgroup(pca.results, grouped,
                                                        actual.group.label)

# The result list is unpacked by position: three axis labels followed by
# the three coordinate vectors.
xlab.values <- results[[1]]
ylab.values <- results[[2]]
zlab.values <- results[[3]]
xdata.values <- results[[4]]
ydata.values <- results[[5]]
zdata.values <- results[[6]]

data.to.plot <- data.frame(xdata.values, ydata.values, zdata.values)

# #Interactive 3d plot, but cannot display in html or md
# rgl::rgl.bg(color = "white")
# rgl::plot3d(x= xdata.values, y= ydata.values, z= zdata.values,
# xlab = xlab.values, ylab = ylab.values, zlab = zlab.values, col=(grouped+1), pch=20, cex=2)
# rgl::text3d(x= xdata.values, y= ydata.values, z= zdata.values, text= actual.group.label, cex=1)
# rgl::rglwidget()

# Non-interactive 3d plot; points are colored by cluster id + 1.
s3d <- scatterplot3d::scatterplot3d(data.to.plot[, 1:3], pch = 16,
                                    color = (grouped + 1),
                                    xlab = xlab.values, ylab = ylab.values,
                                    zlab = zlab.values)

# One legend entry per distinct cluster id, using the same id + 1 color
# mapping as the points.
cluster.ids <- sort(unique(grouped))
legend(s3d$xyz.convert(-30, 20, 1), legend = as.character(cluster.ids),
       col = cluster.ids + 1, pch = 16)

# Annotate every point with its known subgroup label.
text(s3d$xyz.convert(data.to.plot[, 1:3]), labels = actual.group.label,
     cex = 0.7, col = "steelblue")

generate.plots.comparing.clusters()

# Toy data set: 22 points described by two numeric columns, x and y.
x.values <- c(18, 21, 22, 24, 26, 26, 27, 30, 31, 35, 39,
              40, 41, 42, 44, 46, 47, 48, 49, 54, 35, 30)
y.values <- c(10, 11, 22, 15, 12, 13, 14, 33, 39, 37, 44,
              27, 29, 20, 28, 21, 30, 31, 23, 24, 40, 45)
example.data <- data.frame(x = x.values, y = y.values)

# Open a new graphics device, then draw the raw points on it.
dev.new()
plot(example.data$x, example.data$y)

# k-means with 3 centers on the x and y columns.
clustering.columns <- c("x", "y")
km.res <- stats::kmeans(example.data[, clustering.columns], 3,
                        nstart = 25, iter.max = 10)

# Cluster assignment of every row.
grouped <- km.res$cluster

generate.plots.comparing.clusters(example.data, grouped, c("x", "y"))

GenerateParcoordForClusters()

# Toy data set: 22 points described by two numeric columns, x and y.
x.values <- c(18, 21, 22, 24, 26, 26, 27, 30, 31, 35, 39,
              40, 41, 42, 44, 46, 47, 48, 49, 54, 35, 30)
y.values <- c(10, 11, 22, 15, 12, 13, 14, 33, 39, 37, 44,
              27, 29, 20, 28, 21, 30, 31, 23, 24, 40, 45)
example.data <- data.frame(x = x.values, y = y.values)

plot(example.data$x, example.data$y)

# k-means with 3 centers on the x and y columns.
clustering.columns <- c("x", "y")
km.res <- stats::kmeans(example.data[, clustering.columns], 3,
                        nstart = 25, iter.max = 10)

# Cluster assignment of every row, appended to the data as column "grouped".
grouped <- km.res$cluster
example.data <- cbind(example.data, grouped = grouped)

parcoord.output <- GenerateParcoordForClusters(example.data, "grouped",
                                               c("x", "y"))
print(parcoord.output)

HierarchicalClustering()

# Sample labels, coordinates and a two-level grouping factor for 22 samples.
id <- c("1a", "1b", "1c", "1d", "1e", "1f", "1g", "2a", "2b", "2c", "2d",
        "3h", "3i", "3a", "3b", "3c", "3d", "3e", "3f", "3g", "2g", "2h")

x <- c(18, 21, 22, 24, 26, 26, 27, 30, 31, 35, 39,
       40, 41, 42, 44, 46, 47, 48, 49, 54, 35, 30)

y <- c(10, 11, 22, 15, 12, 13, 14, 33, 39, 37, 44,
       27, 29, 20, 28, 21, 30, 31, 23, 24, 40, 45)

# Alternating 0/1 grouping, stored as a factor with levels "0" and "1".
color <- factor(rep(c(0, 1), times = 11))

example.data <- data.frame(id, x, y, color)

# Scatter plot of the samples, annotated with their ids.
# dev.new()
plot(example.data$x, example.data$y)
text(example.data$x, example.data$y, labels = id, cex = 0.9, font = 2)

results <- HierarchicalClustering(working.data = example.data,
                       clustering.columns = c("x", "y"),
                       label.column.name = "id",
                       grouping.column.name = "color",
                       number.of.clusters.to.use = 3,
                       distance_method = "euclidean",
                       correlation_method = NULL,
                       linkage_method_type = "ward.D",
                       Use.correlation.for.hclust = FALSE,
                       terminal.branch.font.size = 1,
                       title.to.use = "Clustering based on x and y data")

# The result list is unpacked by position.
hclust.res <- results[[1]]
dend <- results[[2]]
kcboot.res <- results[[3]]
title.to.use <- results[[4]]
labeled.samples <- results[[5]]
stability.table <- results[[6]]

# Plot dendrogram
plot(dend, main = title.to.use, cex.main = 0.75)

# Add legend to dendrogram: one entry per level of the color factor, drawn
# in color number (level + 1).
legend.labels.to.use <- levels(example.data[, "color"])
col.to.use <- as.integer(levels(example.data[, "color"])) + 1
pch.to.use <- rep(20, times = length(legend.labels.to.use))
graphics::legend("topright",
      legend = legend.labels.to.use,
      col = col.to.use,
      pch = pch.to.use, bty = "n", pt.cex = 1.5, cex = 0.8,
      text.col = "black", horiz = FALSE, inset = c(0, 0.1),
      title = "Color")

# Display sample assignment
labeled.samples

# Display stability of clusters.
# Start a fresh grid page so the table gets a clean canvas instead of being
# drawn on top of an existing plot.
grid::grid.newpage()
gridExtra::grid.table(stability.table)

# Display kcboot full output
kcboot.res

Examples for classification

eval.classification.results()

# 22 samples: ids "1a".."1g", "2a".."2f" and "3a".."3i".
id <- c(paste0("1", letters[1:7]), paste0("2", letters[1:6]),
        paste0("3", letters[1:9]))

x <- c(18, 21, 22, 24, 26, 26, 27, 30, 31, 35, 39, 35, 30,
       40, 41, 42, 44, 46, 47, 48, 49, 54)

y <- c(10, 11, 22, 15, 12, 13, 14, 33, 39, 37, 44, 40, 45,
       27, 29, 20, 28, 21, 30, 31, 23, 24)

# Features a and b are constant (all 1) across the samples.
a <- rep(1, times = 22)

b <- rep(1, times = 22)

# Class labels: 7 samples of class "1", 6 of class "2", 9 of class "3".
actual <- factor(rep(c("1", "2", "3"), times = c(7, 6, 9)))

example.data <- data.frame(id, x, y, a, b, actual)

# Fit a random forest on the four features; the seed is fixed first.
set.seed(2)
predictor.columns <- c("x", "y", "a", "b")
rf.result <- randomForest::randomForest(x = example.data[, predictor.columns],
                                        y = example.data[, "actual"],
                                        proximity = TRUE)

# Compare the predictions stored on the model with the known labels.
predicted <- rf.result$predicted
actual <- example.data[, "actual"]

eval.classification.results(as.character(actual), as.character(predicted),
                            "Example")

find.best.number.of.trees()

# 22 samples: ids "1a".."1g", "2a".."2f" and "3a".."3i".
id <- c("1a", "1b", "1c", "1d", "1e", "1f", "1g",
        "2a", "2b", "2c", "2d", "2e", "2f",
        "3a", "3b", "3c", "3d", "3e", "3f", "3g", "3h", "3i")

x <- c(18, 21, 22, 24, 26, 26, 27, 30, 31, 35, 39, 35, 30,
       40, 41, 42, 44, 46, 47, 48, 49, 54)

y <- c(10, 11, 22, 15, 12, 13, 14, 33, 39, 37, 44, 40, 45,
       27, 29, 20, 28, 21, 30, 31, 23, 24)

# Features a and b are constant (all 1) across the samples.
a <- rep(1, times = 22)

b <- rep(1, times = 22)

# Class labels: 7 samples of class "1", 6 of class "2", 9 of class "3".
actual <- factor(rep(c("1", "2", "3"), times = c(7, 6, 9)))

example.data <- data.frame(id, x, y, a, b, actual)

# Fit a 50-tree random forest; the seed is fixed first.
set.seed(1)
rf.result <- randomForest::randomForest(x = example.data[, c("x", "y", "a", "b")],
                                        y = example.data[, "actual"],
                                        proximity = TRUE, ntree = 50)

# Out-of-bag error after each tree. The component and column are accessed
# by name rather than by position, so the example does not depend on the
# internal ordering of the randomForest object.
error.oob <- rf.result$err.rate[, "OOB"]

best.tree <- find.best.number.of.trees(error.oob)

# Plot the OOB error against the number of trees.
trees <- seq_along(error.oob)

plot(trees, error.oob, type = "l")

# Scatter plot of the samples, annotated with their ids.
# dev.new()
plot(example.data$x, example.data$y)
text(example.data$x, example.data$y, labels = example.data$id)

LOOCVPredictionsRandomForestAutomaticMtryAndNtree()

# 22 samples: ids "1a".."1g", "2a".."2f" and "3a".."3i".
id <- c(paste0("1", letters[1:7]), paste0("2", letters[1:6]),
        paste0("3", letters[1:9]))

x <- c(18, 21, 22, 24, 26, 26, 27, 30, 31, 35, 39, 35, 30,
       40, 41, 42, 44, 46, 47, 48, 49, 54)

y <- c(10, 11, 22, 15, 12, 13, 14, 33, 39, 37, 44, 40, 45,
       27, 29, 20, 28, 21, 30, 31, 23, 24)

# Features a and b are constant (all 1) across the samples.
a <- rep(1, times = 22)

b <- rep(1, times = 22)

# Class labels: 7 samples of class "1", 6 of class "2", 9 of class "3".
actual <- factor(rep(c("1", "2", "3"), times = c(7, 6, 9)))

example.data <- data.frame(id, x, y, a, b, actual)

result <- LOOCVPredictionsRandomForestAutomaticMtryAndNtree(
  example.data,
  predictors.that.PCA.can.be.done.on = c("x", "y", "a", "b"),
  predictors.that.should.not.PCA = NULL,
  should.PCA.be.used = FALSE,
  target.column.name = "actual",
  seed = 2,
  percentile.threshold.to.keep = 0.5
)

# The first element of the result is used as the predictions and is
# compared with the known labels.
predicted <- result[[1]]
actual <- example.data[, "actual"]

eval.classification.results(as.character(actual), as.character(predicted),
                            "Example")

# Show the second element of the result.
result[[2]]

LOOCVRandomForestClassificationMatrixForPheatmap()

# Make example data where samples with 1, 2, and 3 in their ID names can be
# predicted using features x and y, while samples with 4 and 5 in their ID
# names can be predicted using features a and b.

# 36 samples: ids "1a".."1g", "2a".."2f", "3a".."3i", "4a".."4g", "5a".."5g".
id <- c(paste0("1", letters[1:7]), paste0("2", letters[1:6]),
        paste0("3", letters[1:9]), paste0("4", letters[1:7]),
        paste0("5", letters[1:7]))

# x and y vary over the first 22 samples and are constant for the last 14.
x <- c(18, 21, 22, 24, 26, 26, 27, 30, 31, 35, 39, 35, 30,
       40, 41, 42, 44, 46, 47, 48, 49, 54,
       rep(1, times = 14))

y <- c(10, 11, 22, 15, 12, 13, 14, 33, 39, 37, 44, 40, 45,
       27, 29, 20, 28, 21, 30, 31, 23, 24,
       rep(1, times = 14))

# a and b are constant for the first 22 samples and vary over the last 14.
a <- c(rep(1, times = 22),
       18, 21, 22, 24, 26, 26, 27, 40, 41, 42, 44, 46, 47, 48)

b <- c(rep(1, times = 22),
       10, 11, 22, 15, 12, 13, 14, 27, 29, 20, 28, 21, 30, 31)

# Subsetting label: "1/2/3" for the first 22 samples, "4/5" for the last 14.
sep.xy.ab <- rep(c("1/2/3", "4/5"), times = c(22, 14))

# Class labels "1".."5" with 7, 6, 9, 7 and 7 samples respectively.
actual <- factor(rep(c("1", "2", "3", "4", "5"), times = c(7, 6, 9, 7, 7)))

example.data <- data.frame(id, x, y, a, b, sep.xy.ab, actual)

# Samples in the x/y plane, annotated with their ids.
# dev.new()
plot(example.data$x, example.data$y)
text(example.data$x, example.data$y, labels = example.data$id)

# Samples in the a/b plane, annotated with their ids.
# dev.new()
plot(example.data$a, example.data$b)
text(example.data$a, example.data$b, labels = example.data$id)

result <- LOOCVRandomForestClassificationMatrixForPheatmap(
  input.data = example.data,
  factor.name.for.subsetting = "sep.xy.ab",
  name.of.predictors.to.use = c("x", "y", "a", "b"),
  target.column.name = "actual",
  seed = 2,
  should.mtry.and.ntree.be.optimized = FALSE,
  percentile.threshold.to.keep = 0.5
)

# The first element of the result is the matrix handed to pheatmap.
matrix.for.pheatmap <- result[[1]]

pheatmap_RF <- pheatmap::pheatmap(matrix.for.pheatmap,
                                  fontsize_col = 12, fontsize_row = 12)


# The pheatmap shows that the points in groups 1, 2, and 3 can be predicted
# with features x and y. While points in group 4 and 5 can be predicted with
# features a and b.

# dev.new()
pheatmap_RF

RandomForestAutomaticMtryAndNtree()

# 22 samples: ids "1a".."1g", "2a".."2f" and "3a".."3i".
id <- c(paste0("1", letters[1:7]), paste0("2", letters[1:6]),
        paste0("3", letters[1:9]))

x <- c(18, 21, 22, 24, 26, 26, 27, 30, 31, 35, 39, 35, 30,
       40, 41, 42, 44, 46, 47, 48, 49, 54)

y <- c(10, 11, 22, 15, 12, 13, 14, 33, 39, 37, 44, 40, 45,
       27, 29, 20, 28, 21, 30, 31, 23, 24)

# Features a and b are constant (all 1) across the samples.
a <- rep(1, times = 22)

b <- rep(1, times = 22)

# Class labels: 7 samples of class "1", 6 of class "2", 9 of class "3".
actual <- factor(rep(c("1", "2", "3"), times = c(7, 6, 9)))

example.data <- data.frame(id, x, y, a, b, actual)

rf.result <- RandomForestAutomaticMtryAndNtree(example.data,
                                               c("x", "y", "a", "b"),
                                               "actual",
                                               seed = 2)

# Compare the predictions stored on the result with the known labels.
predicted <- rf.result$predicted
actual <- example.data[, "actual"]

# Result is not perfect because RF model does not over fit to the training data.
eval.classification.results(as.character(actual), as.character(predicted),
                            "Example")

RandomForestClassificationGiniMatrixForPheatmap()

# Make example data where samples with 1, 2, and 3 in their ID names can be
# predicted using features x and y, while samples with 4 and 5 in their ID
# names can be predicted using features a and b.

# 36 samples: ids "1a".."1g", "2a".."2f", "3a".."3i", "4a".."4g", "5a".."5g".
id <- c("1a", "1b", "1c", "1d", "1e", "1f", "1g",
        "2a", "2b", "2c", "2d", "2e", "2f",
        "3a", "3b", "3c", "3d", "3e", "3f", "3g", "3h", "3i",
        "4a", "4b", "4c", "4d", "4e", "4f", "4g",
        "5a", "5b", "5c", "5d", "5e", "5f", "5g")

# x and y vary over the first 22 samples and are constant for the last 14.
x <- c(18, 21, 22, 24, 26, 26, 27, 30, 31, 35, 39, 35, 30,
       40, 41, 42, 44, 46, 47, 48, 49, 54,
       rep(1, times = 14))

y <- c(10, 11, 22, 15, 12, 13, 14, 33, 39, 37, 44, 40, 45,
       27, 29, 20, 28, 21, 30, 31, 23, 24,
       rep(1, times = 14))

# a and b are constant for the first 22 samples and vary over the last 14.
a <- c(rep(1, times = 22),
       18, 21, 22, 24, 26, 26, 27, 40, 41, 42, 44, 46, 47, 48)

b <- c(rep(1, times = 22),
       10, 11, 22, 15, 12, 13, 14, 27, 29, 20, 28, 21, 30, 31)

# Subsetting label: "1/2/3" for the first 22 samples, "4/5" for the last 14.
sep.xy.ab <- rep(c("1/2/3", "4/5"), times = c(22, 14))

# Class labels "1".."5" with 7, 6, 9, 7 and 7 samples respectively.
actual <- factor(rep(c("1", "2", "3", "4", "5"), times = c(7, 6, 9, 7, 7)))

example.data <- data.frame(id, x, y, a, b, sep.xy.ab, actual)

# Samples in the x/y plane, annotated with their ids.
# dev.new()
plot(example.data$x, example.data$y)
text(example.data$x, example.data$y, labels = example.data$id)

# Samples in the a/b plane, annotated with their ids.
# dev.new()
plot(example.data$a, example.data$b)
text(example.data$a, example.data$b, labels = example.data$id)

results <- RandomForestClassificationGiniMatrixForPheatmap(
  input.data = example.data,
  factor.name.for.subsetting = "sep.xy.ab",
  name.of.predictors.to.use = c("x", "y", "a", "b"),
  target.column.name = "actual",
  seed = 2
)

matrix.for.pheatmap <- results[[1]]

# Add MCC to column names. The MCC values are read from the last row of the
# matrix. seq_len() is used so that a matrix with zero columns is skipped
# instead of being indexed with c(1, 0).
mcc.row.index <- nrow(matrix.for.pheatmap)
for (i in seq_len(ncol(matrix.for.pheatmap))) {
  old.name <- colnames(matrix.for.pheatmap)[[i]]
  MCC.val <- matrix.for.pheatmap[mcc.row.index, i]
  new.name <- paste(old.name, ", MCC: ", format(round(MCC.val, 2), nsmall = 2))
  colnames(matrix.for.pheatmap)[[i]] <- new.name
}

# Remove MCC row from matrix. A negative index drops exactly the last row,
# and drop = FALSE keeps the two-dimensional shape.
matrix.for.pheatmap.MCC.row.removed <-
  matrix.for.pheatmap[-mcc.row.index, , drop = FALSE]

pheatmap_RF <- pheatmap::pheatmap(matrix.for.pheatmap.MCC.row.removed,
                                  fontsize_col = 12, fontsize_row = 12)


# The pheatmap shows that the points in groups 1, 2, and 3 can be predicted
# with features x and y. While points in group 4 and 5 can be predicted with
# features a and b.
# dev.new()
pheatmap_RF

# Note that the 1/2/3 column is exactly the same as the random forest
# model created in the example for eval.classification.results.
# Since this pheatmap is using the number of LOOCV rounds
# as the input matrix, the colors can be misleading. It looks
# like features x and y are more important for 1/2/3 than features a and b
# are for 4/5. However, this is not the case. The mean decrease
# in gini index cannot be used to compare features between models. It
# can only be used to compare features within a single model.

RandomForestClassificationPercentileMatrixForPheatmap()

# Make example data where samples with 1, 2, and 3 in their ID names can be
# predicted using features x and y, while samples with 4 and 5 in their ID
# names can be predicted using features a and b.

# 36 samples: ids "1a".."1g", "2a".."2f", "3a".."3i", "4a".."4g", "5a".."5g".
id <- c(paste0("1", letters[1:7]), paste0("2", letters[1:6]),
        paste0("3", letters[1:9]), paste0("4", letters[1:7]),
        paste0("5", letters[1:7]))

# x and y vary over the first 22 samples and are constant for the last 14.
x <- c(18, 21, 22, 24, 26, 26, 27, 30, 31, 35, 39, 35, 30,
       40, 41, 42, 44, 46, 47, 48, 49, 54,
       rep(1, times = 14))

y <- c(10, 11, 22, 15, 12, 13, 14, 33, 39, 37, 44, 40, 45,
       27, 29, 20, 28, 21, 30, 31, 23, 24,
       rep(1, times = 14))

# a and b are constant for the first 22 samples and vary over the last 14.
a <- c(rep(1, times = 22),
       18, 21, 22, 24, 26, 26, 27, 40, 41, 42, 44, 46, 47, 48)

b <- c(rep(1, times = 22),
       10, 11, 22, 15, 12, 13, 14, 27, 29, 20, 28, 21, 30, 31)

# Subsetting label: "1/2/3" for the first 22 samples, "4/5" for the last 14.
sep.xy.ab <- rep(c("1/2/3", "4/5"), times = c(22, 14))

# Class labels "1".."5" with 7, 6, 9, 7 and 7 samples respectively.
actual <- factor(rep(c("1", "2", "3", "4", "5"), times = c(7, 6, 9, 7, 7)))

example.data <- data.frame(id, x, y, a, b, sep.xy.ab, actual)

# Samples in the x/y plane, annotated with their ids.
# dev.new()
plot(example.data$x, example.data$y)
text(example.data$x, example.data$y, labels = example.data$id)

# Samples in the a/b plane, annotated with their ids.
# dev.new()
plot(example.data$a, example.data$b)
text(example.data$a, example.data$b, labels = example.data$id)

results <- RandomForestClassificationPercentileMatrixForPheatmap(
  input.data = example.data,
  factor.name.for.subsetting = "sep.xy.ab",
  name.of.predictors.to.use = c("x", "y", "a", "b"),
  target.column.name = "actual",
  seed = 2
)

# The first element of the result is the matrix handed to pheatmap.
matrix.for.pheatmap <- results[[1]]

pheatmap_RF <- pheatmap::pheatmap(matrix.for.pheatmap,
                                  fontsize_col = 12, fontsize_row = 12)


# The pheatmap shows that the points in groups 1, 2, and 3 can be predicted
# with features x and y. While points in group 4 and 5 can be predicted with
# features a and b.

# dev.new()
pheatmap_RF

CVPredictionsRandomForest()

# Example data set supplied by the package.
example.data <- GenerateExampleDataMachinelearnr()

# Put the rows in a random order; the seed is fixed first.
set.seed(1)
shuffled.row.order <- sample(nrow(example.data))
example.data.shuffled <- example.data[shuffled.row.order, ]

# Cross-validation with as many folds as there are rows in the data.
result.CV <- CVPredictionsRandomForest(
  inputted.data = example.data.shuffled,
  name.of.predictors.to.use = c("x", "y", "a", "b"),
  target.column.name = "actual",
  seed = 2,
  percentile.threshold.to.keep = 0.5,
  number.of.folds = nrow(example.data)
)

# Predicted
result.CV[[1]]

# Feature importance
result.CV[[2]]

CVRandomForestClassificationMatrixForPheatmap()

# Example data set supplied by the package.
example.data <- GenerateExampleDataMachinelearnr()

# Put the rows in a random order; the seed is fixed first.
set.seed(1)
shuffled.row.order <- sample(nrow(example.data))
example.data.shuffled <- example.data[shuffled.row.order, ]

# Two-fold cross-validation, run per level of the "sep.xy.ab" column.
result.two.fold.CV <- CVRandomForestClassificationMatrixForPheatmap(
  input.data = example.data.shuffled,
  factor.name.for.subsetting = "sep.xy.ab",
  name.of.predictors.to.use = c("x", "y", "a", "b"),
  target.column.name = "actual",
  seed = 2,
  percentile.threshold.to.keep = 0.5,
  number.of.folds = 2
)

# The first element of the result is the matrix handed to pheatmap.
matrix.for.pheatmap <- result.two.fold.CV[[1]]

pheatmap_RF <- pheatmap::pheatmap(matrix.for.pheatmap,
                                  fontsize_col = 12, fontsize_row = 12)

yhhc2/machinelearnr documentation built on Dec. 23, 2021, 7:19 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

yhhc2/machinelearnr
Package with various functions for preprocessing, clustering, and classification

In yhhc2/machinelearnr: Package with various functions for preprocessing, clustering, and classification

Installation

Examples

Examples for clustering

CalcOptimalNumClustersForKMeans()

generate.2D.clustering.with.labeled.subgroup()

generate.3D.clustering.with.labeled.subgroup()

generate.plots.comparing.clusters()

GenerateParcoordForClusters()

HierarchicalClustering()

Examples for classification

eval.classification.results()

find.best.number.of.trees()

LOOCVPredictionsRandomForestAutomaticMtryAndNtree()

LOOCVRandomForestClassificationMatrixForPheatmap()

RandomForestAutomaticMtryAndNtree()

RandomForestClassificationGiniMatrixForPheatmap()

RandomForestClassificationPercentileMatrixForPheatmap()

CVPredictionsRandomForest()

CVRandomForestClassificationMatrixForPheatmap()

R Package Documentation

Browse R Packages

We want your feedback!

yhhc2/machinelearnr Package with various functions for preprocessing, clustering, and classification

In yhhc2/machinelearnr: Package with various functions for preprocessing, clustering, and classification

Installation

Examples

Examples for clustering

CalcOptimalNumClustersForKMeans()

generate.2D.clustering.with.labeled.subgroup()

generate.3D.clustering.with.labeled.subgroup()

generate.plots.comparing.clusters()

GenerateParcoordForClusters()

HierarchicalClustering()

Examples for classification

eval.classification.results()

find.best.number.of.trees()

LOOCVPredictionsRandomForestAutomaticMtryAndNtree()

LOOCVRandomForestClassificationMatrixForPheatmap()

RandomForestAutomaticMtryAndNtree()

RandomForestClassificationGiniMatrixForPheatmap()

RandomForestClassificationPercentileMatrixForPheatmap()

CVPredictionsRandomForest()

CVRandomForestClassificationMatrixForPheatmap()

R Package Documentation

Browse R Packages

We want your feedback!

yhhc2/machinelearnr
Package with various functions for preprocessing, clustering, and classification