library(ggplot2) library(reshape2) library(mice) library(cluster)
tmlzz <- read.csv("D:\\data\\thesis\\201610\\tmldata\\tmlzz.csv",header = T) rownames(tmlzz) <- tmlzz$X tmlzz <- tmlzz[,-1] dim(tmlzz)
sum(!complete.cases(tmlzz))
tmlzz[8,232] <- 28.625 sum(!complete.cases(tmlzz))
days <- rownames(tmlzz) days
原来的距离是欧氏距离。
$$d_{ab}=\sqrt{(x_1-x_2)^2+(x_2-y_2)^2+\dots+(x_n-y_n)^2}$$
$$d_{ab}=\sqrt{\sum_{i=1}^n(x_i-y_i)^2}$$
fit_hc_euclidean <- hclust(dist(tmlzz)) plot(fit_hc_euclidean)
样本1和样本2的欧式距离检验
distance12 <- sqrt(sum((tmlzz[1,]-tmlzz[2,])**2)) distance12
验证无误
$$d_{ab}=|x_1-y_1|+|x_2-y_2|+\dots+|x_n-y_n|$$
$$d_{ab}=\sum_{i=1}^n |x_i-y_i|$$
fit_hc_manhattan <- hclust(dist(tmlzz,method = "manhattan")) plot(fit_hc_manhattan)
$$d_{ab}=\sum_{i=1}^n \frac{|a_i-b_i|}{|a_i|+|b_i|}$$
fit_hc_canberra <- hclust(dist(tmlzz,method = "canberra")) plot(fit_hc_canberra)
相比欧氏距离,余弦距离更加注重两个向量在方向上的差异。并不适用于交通流相似性比较。
$$S=\frac{x·y}{|x||y|}=\frac{\sum_{i=1}^nx_iy_i}{\sqrt{\sum_{i=1}^nx_i^2}*\sqrt{\sum_{i=1}^ny_i^2}}$$
dim(tmlzz)
21*21
cosdis <- matrix(rep(0,441),21,21)
for(i in 1:21){ for(j in 1:21){ if(i>j){ cosdis[i,j]=sum(t(tmlzz[i,])*tmlzz[j,])/sqrt((sum(tmlzz[i,]^2))*sum(tmlzz[j,]^2)) } } } cosdis <- as.data.frame(cosdis) cosdis
names(cosdis) <- days
rownames(cosdis) <- days
sum(is.na(cosdis))
str(dist(tmlzz))
as.dist(cosdis)
fit_hc_cos <- hclust(as.dist(cosdis)) plot(fit_hc_cos)
可见余弦距离的聚类没有什么用。
考虑欧氏距离:
$$d_{ab}=\sqrt{\sum_{i=1}^n(x_i-y_i)^2}$$
只考虑了两向量各维度上的距离,没有考虑距离的平稳性。
x <- seq(0,8,1) y <- -(x-4)**2+100 y1 <- y-40 y2 <- c(50,60,80,75,70,65,70,80,60) plot(x,y,type="b",ylim = c(0,100),pch=16,xlab = "",ylab="") lines(x,y1,col="red",type = "b",pch=15) lines(x,y2,col="blue",type = "b",pch=17) legend("bottomright",title = "traffic flow",legend=c("y","y1","y2"),pch=c(16,15,17),col=c("black","red","blue"))
z1与y的欧氏距离
dist(rbind(y,z1,z2))
sqrt(sum((y-z1)**2))
sqrt(sum((y-z2)**2))
显示y与z1的距离远,y与z2的距离近,与交通流情况不符。
考虑欧氏距离:
$$d_{ab}=\sqrt{\sum_{i=1}^n(x_i-y_i)^2}$$
尝试加标准差
$$z_i=x_i-y_i$$
$$d_{ab}=R_1\sqrt{\sum_{i=1}^nz_i^2}+R_2\sqrt{\sum_{i=1}^n(z_i-\bar{z_i})^2}$$
欧氏距离
z <- (tmlzz[1,]-tmlzz[2,])
sqrt(sum(z**2))
正则项
sqrt(sum((z-mean(as.matrix(z)))**2))
newdis <- matrix(rep(0,441),21,21) for(i in 1:21){ for(j in 1:21){ if(i>j){ z=tmlzz[i,]-tmlzz[j,] newdis[i,j]=sqrt(sum(z**2))+sqrt(sum((z-mean(as.matrix(z)))**2)) } } } newdis
names(newdis) <- days rownames(newdis) <- days as.dist(newdis)
fit_hc_new <- hclust(as.dist(newdis)) plot(fit_hc_new)
group_k3 <- cutree(fit_hc_new,k=3) group_k3 <- as.data.frame(group_k3) group_k3
tmlallnew <- read.csv("D:\\data\\thesis\\201610\\tmldata\\tmlallnew.csv",header = T) dim(tmlallnew)
names(tmlallnew)
group_k3$日期 <- rownames(group_k3) names(group_k3)
tmltotal <- merge(tmlallnew,group_k3,by="日期") dim(tmltotal)
names(tmltotal)
str(tmltotal)
tmltotal$group_k3 <- as.factor(tmltotal$group_k3) names(tmltotal)[5] <- "层次聚类结果" names(tmltotal)
ggplot(tmltotal,aes(tmltotal$时间序号,tmltotal$机动车当量,group=tmltotal$日期,color=tmltotal$层次聚类结果))+ geom_line(alpha=1/5)+ geom_smooth(method = "loess",span=0.2,se = F)+ xlab("时间序号")+ylab("车流量")+scale_color_hue("类别簇")+ scale_x_continuous(breaks = seq(0,288,24),limits = c(0,288))+ scale_y_continuous(breaks = seq(0,120,20)) ggsave("D:\\王致远\\论文\\大论文\\实验\\绘图\\原始数据改进距离.jpg",width=7.29,height=4.5,dpi=600)
ggplot(tmltotal,aes(tmltotal$时间序号,tmltotal$机动车当量,group=tmltotal$日期,color=tmltotal$日期))+ stat_summary(aes(group=tmltotal$层次聚类结果,color=tmltotal$层次聚类结果),fun.y=mean,geom="line")+ xlab("时间序号")+ylab("车流量")+scale_color_hue("类别簇")+ scale_x_continuous(breaks = seq(0,288,24),limits = c(0,288))+ scale_y_continuous(breaks = seq(0,120,20)) ggsave("D:\\王致远\\论文\\大论文\\实验\\绘图\\原始数据改进距离平均值.jpg",width=7.29,height=4.5,dpi=600)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.