library(here)
library(tidyverse)
# ---- Central tendency with base R ----
data(algae, package = "DMwR2")
mean(algae$a1)
mean(algae$NO3) # mean() returns NA when the column holds missing values
mean(algae$NO3, na.rm = TRUE)
median(algae$a3)
median(algae$mxPH, na.rm = TRUE)

# ---- Central tendency with dplyr ----
# as_tibble() replaces the deprecated tbl_df().
alg <- as_tibble(algae)
summarise(alg, avgNO3 = mean(NO3, na.rm = TRUE), medA1 = median(a1))

# A named list of lambdas replaces the deprecated funs(); the result columns
# keep the <column>_<function> naming (e.g. mxPH_mean, mxPH_median).
alg %>%
  select(mxPH:Cl) %>%
  summarise_all(list(
    mean = ~ mean(., na.rm = TRUE),
    median = ~ median(., na.rm = TRUE)
  ))

# Number of observations and median of a7 per season/size combination,
# sorted from the largest median down.
alg %>%
  group_by(season, size) %>%
  summarize(nObs = n(), mA7 = median(a7)) %>%
  ungroup() %>%
  arrange(desc(mA7))
# Most frequent value of a vector (the statistical mode).
#
# x      Vector of any atomic type (factors work as well).
# na.rm  If TRUE, missing values are dropped before counting; if FALSE, NA
#        competes like any other value.
#
# Returns a length-one subset of the distinct values of `x`. On ties the
# value that appears first in `x` wins, because which.max() returns the first
# maximum.
Mode <- function(x, na.rm = FALSE) {
  if (na.rm) {
    x <- x[!is.na(x)]
  }
  candidates <- unique(x)
  # match() maps every element to the index of its distinct value, so
  # tabulate() yields one count per candidate, in candidate order.
  counts <- tabulate(match(x, candidates))
  candidates[which.max(counts)]
}
# ---- Mode / central value ----
Mode(algae$mxPH, na.rm = TRUE)
Mode(algae$season)
library(DMwR2)
centralValue(algae$a1)
centralValue(algae$speed)

# ---- Spread ----
var(algae$a1)
sd(algae$Cl, na.rm = TRUE)
IQR(algae$mxPH, na.rm = TRUE)
quantile(algae$a3)
quantile(algae$a3, probs = c(0.2, 0.8))
range(algae$a1)
max(algae$a5) - min(algae$a5)

# Passing the function directly replaces the deprecated funs(var); the result
# columns keep their original names (a1 ... a7).
alg %>%
  select(a1:a7) %>%
  summarise_all(var)

# ---- Quantiles per group ----
data(iris)
aggregate(iris$Sepal.Length, list(Species = iris$Species), quantile)
aggregate(Sepal.Length ~ Species, data = iris, quantile)

# ---- Missing values ----
# rowSums(is.na()) counts NAs per row without apply()'s coercion of the whole
# data frame to a character matrix.
nasRow <- rowSums(is.na(algae))
cat("The Algae dataset contains ", sum(nasRow), " NA values.\n")
cat(
  "There are ",
  sum(!complete.cases(algae)),
  " rows that have at least one NA value.\n"
)
# Box-plot rule for univariate outliers.
#
# A value is flagged when it lies below Q1 - const * IQR or above
# Q3 + const * IQR, with the quartiles computed on the non-missing values.
#
# x          Numeric vector; may contain NAs (they are never flagged).
# const      Multiplier applied to the inter-quartile range (default 1.5).
# positions  If FALSE (default) return the flagged values; if TRUE return
#            their indices.
#
# Returns the flagged values, or - with positions = TRUE - integer indices
# into `x` exactly as the caller supplied it, so that x[result] yields the
# flagged values even when `x` contains NAs.
bpRule <- function(x, const = 1.5, positions = FALSE) {
  # unname() keeps the "25%"/"75%" labels from leaking into the result.
  qs <- unname(quantile(x, probs = c(0.25, 0.75), na.rm = TRUE))
  iqr <- qs[2] - qs[1]
  # which() discards NA comparisons, so x can stay intact and the indices
  # stay aligned with the caller's vector.
  flagged <- which(x < qs[1] - const * iqr | x > qs[2] + const * iqr)
  if (positions) {
    flagged
  } else {
    x[flagged]
  }
}
# ---- Box-plot rule applied to two algae columns ----
bpRule(algae[["a1"]])
bpRule(algae[["NO3"]])
bpRule(algae[["NO3"]], positions = TRUE)

# ---- Whole-dataset summaries ----
summary(iris)
library(Hmisc)
Hmisc::describe(iris)
# The grouping expression is left as-is: by() prints it as the group header.
by(algae[, 2:5], algae$season, summary)
# ---- Base graphics warm-up ----
plot(sin(seq(0, 10, by = 0.1)), type = "l")

# ---- Bar charts: frequency of each season ----
library(ggplot2)
data(algae, package = "DMwR2")
freqOcc <- table(algae$season)
barplot(freqOcc, main = "Frequency of the Seasons")
ggplot(algae, aes(x = season)) +
  geom_bar() +
  labs(title = "Frequency of the Seasons")

# ---- Box plots ----
data(iris)
boxplot(iris$Sepal.Width, ylab = "Sepal Width")
# A constant factor on the x axis yields a single box; its tick label is
# hidden because it carries no information.
ggplot(iris, aes(x = factor(0), y = Sepal.Width)) +
  geom_boxplot() +
  labs(x = "", y = "Sepal Width") +
  theme(axis.text.x = element_blank())

boxplot(Sepal.Length ~ Species, data = iris, ylab = "Sepal.Length")
ggplot(iris, aes(x = Species, y = Sepal.Length)) +
  geom_boxplot()

# ---- Conditioned histograms ----
ggplot(algae, aes(x = a1)) +
  geom_histogram() +
  facet_grid(size ~ speed)

# ---- Scatter plots ----
plot(
  iris$Sepal.Length,
  iris$Sepal.Width,
  main = "Relationship between Sepal Length and Width",
  xlab = "Sepal Length",
  ylab = "Sepal Width"
)
ggplot(iris, aes(x = Sepal.Length, y = Sepal.Width)) +
  geom_point() +
  labs(
    x = "Sepal Length",
    y = "Sepal Width",
    title = "Relationship between Sepal Length and Width"
  )

ggplot(algae, aes(x = a1, y = a2, color = season)) +
  geom_point() +
  facet_wrap(~ season)

# ---- Multivariate views ----
library(GGally)
ggpairs(algae, columns = 12:16)
ggpairs(algae, columns = 2:5)
ggparcoord(algae, columns = 12:18, groupColumn = "season")
# ---- Association rules on the Boston housing data ----
library(arules)
library(dplyr)
data(Boston, package = "MASS")
b <- Boston

# chas is a 0/1 dummy where 1 means the tract bounds the Charles River.
# factor() orders the levels 0, 1, so the labels must follow that order;
# the levels are spelled out to make the mapping explicit.
b$chas <- factor(b$chas, levels = c(0, 1), labels = c("noriver", "river"))
b$rad <- factor(b$rad)
b$black <- cut(
  b$black,
  breaks = 4,
  labels = c(">31.5%", "18.5-31.5%", "8-18.5%", "<8%")
)

# Discretize a numeric vector into four equal-width, ordered bins.
discr <- function(x) {
  cut(x, breaks = 4, labels = c("low", "medLow", "medHigh", "high"))
}

# Discretize every remaining numeric column, then re-attach the three columns
# that were converted by hand above. Passing `discr` directly replaces the
# deprecated funs().
b <- select(b, -one_of(c("chas", "rad", "black"))) %>%
  mutate_all(discr) %>%
  bind_cols(select(b, one_of(c("chas", "rad", "black"))))

b <- as(b, "transactions")
summary(b)
itemFrequencyPlot(b, support = 0.3, cex.names = 0.8)

ars <- apriori(b, parameter = list(support = 0.025, confidence = 0.75))
ars
table(discr(Boston$medv))

# Top-5 rules by confidence for selected consequents / items.
inspect(head(subset(ars, subset = rhs %in% "medv=high"), 5, by = "confidence"))
inspect(head(subset(ars, subset = rhs %in% "medv=low"), 5, by = "confidence"))
inspect(
  head(
    subset(ars, subset = lhs %in% "nox=high" | rhs %in% "nox=high"),
    5,
    by = "confidence"
  )
)

# ---- Visualizing the rules ----
library(arulesViz)
plot(ars)
somerules <- subset(
  ars,
  subset = rhs %in% c("medv=high", "medv=medHigh") & confidence > 0.75
)
plot(somerules, method = "matrix", measure = "lift")
somerules <- subset(ars, subset = rhs %in% "medv=high" & confidence > 0.95)
plot(somerules, method = "graph", control = list(type = "itemsets"))
# 3.4.3
# ---- Distance measures on random data ----
set.seed(1234)
randDat <- matrix(rnorm(50), nrow = 5)
dist(randDat) # Euclidean distance (default)
dist(randDat, method = "manhattan")
dist(randDat, method = "minkowski", p = 4)

# ---- k-means on iris ----
set.seed(1234) # setting a seed for the random number generator
data(iris)
ir3 <- kmeans(iris[, -5], centers = 3, iter.max = 200) # not using Species info.
ir3
table(ir3$cluster, iris$Species)
cm <- table(ir3$cluster, iris$Species)
# NOTE(review): reading the diagonal as "correct" assumes cluster ids happen
# to line up with the species order - confirm against the printed table.
1 - sum(diag(cm)) / sum(cm)

# ---- Silhouette of the 3-cluster solution ----
library(cluster)
s <- silhouette(ir3$cluster, dist(iris[, -5]))
plot(s)

# ---- Average silhouette width for k = 2..6 ----
set.seed(1234)
d <- dist(iris[, -5])
ks <- 2:6
# vapply() builds the result in one typed vector instead of growing it with
# c() on every iteration; k is visited in the same order, so the random
# number stream is consumed exactly as before.
avgS <- vapply(
  ks,
  function(k) {
    cl <- kmeans(iris[, -5], centers = k, iter.max = 200)
    mean(silhouette(cl$cluster, d)[, 3])
  },
  numeric(1)
)
data.frame(nClus = ks, Silh = avgS)
# ---- Partitioning around medoids (PAM) with k = 3 ----
pc <- pam(iris[-5], k = 3)
# Cross-tabulate cluster membership against the true species.
(cm <- table(pc$clustering, iris$Species))
# Share (in %) of observations that fall off the table's diagonal.
100 * (1 - sum(diag(cm)) / sum(cm))
pc$silinfo$avg.width

# ---- Let pamk() choose k in 2..10 by average silhouette width ----
library(fpc)
sol <- pamk(iris[-5], krange = 2:10, criterion = "asw", usepam = TRUE)
sol
# ---- Agglomerative hierarchical clustering on standardized iris ----
d <- dist(scale(iris[, -5]))
h <- hclust(d)
plot(h, hang = -0.1, labels = iris[["Species"]], cex = 0.5)
clus3 <- cutree(h, 3)
(cm <- table(clus3, iris$Species))
100 * (1 - sum(diag(cm)) / sum(cm))
plot(h, hang = -0.1, labels = iris[["Species"]], cex = 0.5)
rect.hclust(h, k = 3)

# ---- Average silhouette width per linkage method and number of clusters ----
# `d` from above is reused; it was previously recomputed with the same call.
methds <- c("complete", "single", "average")
ks <- 2:6
avgS <- matrix(
  NA_real_,
  nrow = length(ks),
  ncol = length(methds),
  dimnames = list(ks, methds)
)
for (m in seq_along(methds)) {
  # The dendrogram does not depend on k, so it is built once per method.
  h <- hclust(d, method = methds[m])
  for (i in seq_along(ks)) {
    memb <- cutree(h, ks[i]) # not named `c`, which would shadow base::c
    s <- silhouette(memb, d)
    avgS[i, m] <- mean(s[, 3])
  }
}
avgS
# ---- Divisive hierarchical clustering (DIANA) ----
di <- diana(iris[-5], metric = "euclidean", stand = TRUE)
di3 <- cutree(di, 3)
(cm <- table(di3, iris$Species))
100 * (1 - sum(diag(cm)) / sum(cm))
# Swap rows 2 and 3 of the table and evaluate the off-diagonal share again.
cm <- cm[c(1, 3, 2), ]
100 * (1 - sum(diag(cm)) / sum(cm))

# ---- Density-based clustering (DBSCAN) on standardized iris ----
library(fpc)
d <- scale(iris[-5])
db <- dbscan(d, eps = 0.9, MinPts = 5)
db
table(db$cluster, iris$Species)
# 3.4.4
# 3.4.4.1
# Flag univariate outliers with Grubbs' test.
#
# x         Numeric vector; NAs are removed before any computation.
# p.thresh  Observations whose p-value is <= p.thresh are reported.
#
# Returns a list with
#   zs         absolute z-scores of the non-missing values,
#   pvals      1 - pgrubbs(z, n, type = 10) for each z-score,
#   outliers   the flagged values,
#   positions  indices of the flagged values within the NA-stripped vector
#              (i.e. aligned with `zs` and `pvals`, not with the original x).
#
# Stops with an error when the 'outliers' package is not installed.
grubbs.outliers <- function(x, p.thresh = 0.05) {
  # require() would only return FALSE and let the failure surface later as a
  # confusing "could not find function" error.
  if (!requireNamespace("outliers", quietly = TRUE)) {
    stop(
      "Package 'outliers' is required by grubbs.outliers().",
      call. = FALSE
    )
  }
  x <- x[!is.na(x)]
  n <- length(x)
  zs <- abs(x - mean(x)) / sd(x)
  # vapply() guarantees a numeric vector, including for empty input.
  pvals <- 1 - vapply(
    zs,
    function(z) outliers::pgrubbs(z, n, type = 10),
    numeric(1)
  )
  posOuts <- which(pvals <= p.thresh)
  list(
    zs = zs,
    pvals = pvals,
    outliers = x[posOuts],
    positions = posOuts
  )
}
# Grubbs-based outliers of the a2 column.
data(algae, package = "DMwR2")
grubbs.outliers(algae[["a2"]])$outliers
# Relative frequency of each season.
table(algae$season) / nrow(algae)
#3.4.4.2
# Treat the observations that DBSCAN leaves unclustered as outliers.
#
# data  Data frame or matrix of observations (one row per case).
# ...   Further arguments forwarded to fpc::dbscan() (e.g. eps, MinPts,
#       scale).
#
# Returns a list with
#   positions      row indices whose cluster id is 0,
#   outliers       the corresponding rows of `data`,
#   dbscanResults  the object returned by fpc::dbscan().
#
# Stops with an error when the 'fpc' package is not installed.
dbscan.outliers <- function(data, ...) {
  if (!requireNamespace("fpc", quietly = TRUE)) {
    stop("Package 'fpc' is required by dbscan.outliers().", call. = FALSE)
  }
  # Namespace-qualified so that another attached package exporting a
  # `dbscan` function cannot mask the intended implementation.
  cl <- fpc::dbscan(data, ...)
  posOuts <- which(cl$cluster == 0)
  list(
    positions = posOuts,
    # drop = FALSE keeps the result two-dimensional for single-column or
    # matrix input, so nrow() on it stays meaningful.
    outliers = data[posOuts, , drop = FALSE],
    dbscanResults = cl
  )
}
# ---- Glass data: collapse the six glass types into rare vs. normal ----
library(forcats)
data(Glass, package = "mlbench")
count(Glass, Type)
g <- mutate(
  Glass,
  Type = forcats::fct_collapse(
    Type,
    rare = c("3", "5", "6"),
    normal = c("1", "2", "7")
  )
)
count(g, Type) %>%
  mutate(prop = 100 * n / nrow(g))

# ---- DBSCAN-based outliers (column 10 is dropped from the input) ----
outs <- dbscan.outliers(g[, -10], eps = 1, scale = TRUE)
head(outs$outliers)
nrow(outs$outliers)
slice(g, outs$positions) %>%
  count(Type)
count(g, Type)

# ---- Top 40 cases according to outliers.ranking() ----
og <- DMwR2::outliers.ranking(select(g, -Type))
g %>%
  slice(og$rank.outliers[1:40]) %>%
  count(Type)

# ---- Top 40 cases according to the LOF score ----
library(DMwR2)
lof.scores <- lofactor(select(g, -Type), 10)
top_lof <- order(lof.scores, decreasing = TRUE)[1:40]
slice(g, top_lof) %>%
  count(Type)
# ---- Re-sampling the Glass data with UBL ----
library(UBL)
count(g, Type)

# Random under-sampling with the default settings.
newg <- RandUnderClassif(Type ~ ., g)
count(newg, Type)

# Random under-sampling with explicit per-class percentages.
newg2 <- RandUnderClassif(Type ~ ., g, list(normal = 0.4, rare = 1))
count(newg2, Type)

# Random over-sampling with the default settings.
newg3 <- RandOverClassif(Type ~ ., g)
count(newg3, Type)

# ---- One-class SVM trained on the "normal" cases only ----
library(e1071)
trainD <- g %>%
  filter(Type == "normal") %>%
  select(-Type)
s <- svm(trainD, y = NULL, type = "one-classification", nu = 0.5)
# Cross-tabulate the true class against the SVM's prediction on every case.
(cm <- table(g$Type, predict(s, select(g, -Type))))
# 3.4.5 -> Predictive Analytics
# Only did some notes for the different classification metrics
# 3.4.5.2 -> Tree-Based models
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.