library(ggplot2)

10月1日数据

tml1001 <- read.csv("D:\\data\\thesis\\201610\\tmldata\\tml1001all.csv",header = T)
dim(tml1001)
str(tml1001)
names(tml1001)
tml1001 <- tml1001[,c(5,15)]
names(tml1001)
ggplot(data=tml1001,aes(tml1001$时间序号,tml1001$机动车当量))+
  geom_point(colour="steelblue")+
  geom_line(colour="steelblue")
rm(tml1001)

产生历史数据库

长型数据

tmlall <- read.csv("D:\\data\\thesis\\201610\\tmldata\\tmlall.csv",header = T)
dim(tmlall)
table(tmlall$日期)

此为没有填补缺失值的数据,没有利用价值,删除

rm(tmlall)
tmlallnew <- read.csv("D:\\data\\thesis\\201610\\tmldata\\tmlallnew.csv",header = T)
table(tmlallnew$日期)
dim(tmlallnew)
names(tmlallnew)

此为填补了缺失值的数据

宽型数据

tmlzz <- read.csv("D:\\data\\thesis\\201610\\tmldata\\tmlzz.csv",header = T)
dim(tmlzz)
rownames(tmlzz) <- tmlzz[,1]
tmlzz <- tmlzz[,-1]
dim(tmlzz)

查看是否有缺失值

sum(is.na(tmlzz))

缺失值定位

which(is.na(tmlzz),arr.ind = T) 
tmlzz[8,232] = 28.625
sum(is.na(tmlzz))

缺失值填补完

写入新数据

write.csv(tmlzz,"D:\\data\\thesis\\201610\\tmldata\\tmlzznew.csv")

可以猜想,可以利用宽型数据作为数据库

绘制平行坐标图,查看规律

ggplot(tmlallnew,aes(tmlallnew$时间序号,tmlallnew$机动车当量,group=tmlallnew$日期,color=tmlallnew$日期))+
  geom_line(alpha=2/5)+
  geom_smooth(method = "loess",span=0.1)+
  xlab("时间序号")+ylab("车流量")+scale_color_hue("日期")
tmlallnew[tmlallnew$时间序号==332,2] = 232
write.csv(tmlallnew,"D:\\data\\thesis\\201610\\tmldata\\tmlallnew.csv")
ggplot(tmlallnew,aes(tmlallnew$时间序号,tmlallnew$机动车当量,group=tmlallnew$日期,color=tmlallnew$日期))+
  geom_line(alpha=2/5)+
  geom_smooth(method = "loess",span=0.1)+
  xlab("时间序号")+ylab("车流量")+scale_color_hue("日期")+
  scale_x_continuous(breaks = seq(0,288,24))+
  scale_y_continuous(breaks = seq(0,120,20))
tmlallnew$是否为节假日 <- as.factor(tmlallnew$是否为节假日)

96~120

ggplot(tmlallnew,aes(tmlallnew$时间序号,tmlallnew$机动车当量,group=tmlallnew$日期,color=tmlallnew$是否为节假日))+
  #geom_line(alpha=2/5)+
  geom_smooth(method = "loess",span=0.2,se = F)+
  xlab("时间序号")+ylab("车流量")+scale_color_hue("是否为节假日")+
  scale_x_continuous(breaks = seq(0,288,24),limits = c(96,120))+
  scale_y_continuous(breaks = seq(0,120,20))

120~144

ggplot(tmlallnew,aes(tmlallnew$时间序号,tmlallnew$机动车当量,group=tmlallnew$日期,color=tmlallnew$是否为节假日))+
  #geom_line(alpha=2/5)+
  geom_smooth(method = "loess",span=0.2,se = F)+
  xlab("时间序号")+ylab("车流量")+scale_color_hue("是否为节假日")+
  scale_x_continuous(breaks = seq(0,288,24),limits = c(120,144))+
  scale_y_continuous(breaks = seq(0,120,20))

144~168

ggplot(tmlallnew,aes(tmlallnew$时间序号,tmlallnew$机动车当量,group=tmlallnew$日期,color=tmlallnew$是否为节假日))+
  #geom_line(alpha=2/5)+
  geom_smooth(method = "loess",span=0.2,se = F)+
  xlab("时间序号")+ylab("车流量")+scale_color_hue("是否为节假日")+
  scale_x_continuous(breaks = seq(0,288,24),limits = c(144,168))+
  scale_y_continuous(breaks = seq(0,120,20))

168~192

ggplot(tmlallnew,aes(tmlallnew$时间序号,tmlallnew$机动车当量,group=tmlallnew$日期,color=tmlallnew$是否为节假日))+
  #geom_line(alpha=2/5)+
  geom_smooth(method = "loess",span=0.2,se = F)+
  xlab("时间序号")+ylab("车流量")+scale_color_hue("是否为节假日")+
  scale_x_continuous(breaks = seq(0,288,24),limits = c(168,192))+
  scale_y_continuous(breaks = seq(0,120,20))
ggplot(tmlallnew,aes(tmlallnew$时间序号,tmlallnew$机动车当量,group=tmlallnew$日期,color=tmlallnew$是否为节假日))+
  #geom_line(alpha=2/5)+
  geom_smooth(method = "loess",span=0.2,se = F)+
  xlab("时间序号")+ylab("车流量")+scale_color_hue("日期")+
  scale_x_continuous(breaks = seq(0,288,24),limits = c(192,216))+
  scale_y_continuous(breaks = seq(0,120,20))

聚类尝试

Kmeans 0~120

fit_km1 <- kmeans(tmlzz[,1:120],centers = 5)
fit_km1$cluster

dist 0~120

as.matrix(dist(tmlzz[96:120]))[,1]
sort(as.matrix(dist(tmlzz[96:120]))[,1])

结果不尽如人意,有非节假日混入

dist 120~144

sort(as.matrix(dist(tmlzz[120:144]))[,1])

半自动K近邻算法

算法想法

  1. 通过之前2小时的预测之后1小时的;
  2. 每次预测更新;

算法描述

  1. 选择需要预测的某日交通流,即目标序列;
  2. 生成不包含目标序列的序列库;
  3. 设定起始时间序号如73;目标序列在起始时间序号之前的预测值为真实值;
  4. 计算观察窗73-12*2=49,观察窗为49~72;计算预测窗73+12-1=84,预测窗为73~84;
  5. 计算观察窗内,目标序列与序列库中序列的距离,并从小到大排序,选择前K个作为近邻;
  6. 计算近邻在73~84的合成值,例如平均值,作为目标序列73~84的预测值;;
  7. 推进时间序号到84+1=85;重复4~6;

数据准备

选择目标序列,例如10月6日;生成序列库;

rownames(tmlzz)
dim(tmlzz)
tmlobj <- tmlzz[6,]
rownames(tmlobj)
tmlbase <- tmlzz[-6,]
rownames(tmlbase)
dim(tmlbase)

写函数

flowknn <- function(obj,base,start,k,lag_duration,fore_duration){


  ld = lag_duration
  fd = fore_duration
  st = start


  foreflow <- obj
  foreflow[,] <- 0


  foreflow[,1:st] = obj[,1:st]


  fl = st

  flowall = rbind(obj,base)

  while(fl<(ncol(obj)-1)){


    fwin = fl - ld
    bwin = fl + fd - 1


    knnames = names(sort(as.matrix(dist(flowall[,fwin:fl-1]))[,1]))[2:(2+k-1)]


    kn <- base[knnames,fl:bwin]
    foreflow[,fl:bwin] <- colMeans(kn)



    fl = bwin+1
  }
  return(foreflow)
}
pre <- flowknn(tmlobj,tmlbase,start = 73,k = 3,lag_duration = 24,fore_duration = 12)
pre
plot(1:288,pre,type="l",col="red")
points(1:288,tmlobj,col="blue")
tmlboth <- rbind(tmlobj,pre)
tmlboth <- as.data.frame(tmlboth)
tmlboth <- t(tmlboth)
names(tmlboth) <- c("true","pre")
ggplot(tmlboth,aes(1:288,tmlboth$`06-10月-16`))+geom_point(colour="steelblue")+
  geom_line(aes(1:288,tmlboth$`06-10月-161`),colour="red",size=1)+
  xlab("时间序号")+ylab("车流量")+scale_color_hue("日期")+
  scale_x_continuous(breaks = seq(0,288,24))+
  scale_y_continuous(breaks = seq(0,120,20))


ahorawzy/TFTSA documentation built on May 13, 2019, 12:18 p.m.