rm(list=ls())

1. 熟悉数据

library(TFTSA)
library(tidyverse)
jd201610 <- read.csv("D:\\data\\thesis\\201610\\201610.csv",stringsAsFactors = F)
dim(jd201610)
names(jd201610)

给变量改名字

names(jd201610) <- c("station_index","station_name","date","hour","minute","CDH","SX",
                     "timestamp","small_pass_car","big_pass_car","small_fre_car",
                     "mid_fre_car","big_fre_car","large_fre_car","box_car","mt_car","tlj_car")
table(jd201610$CDH,jd201610$SX)

车道号11,12,13属于上行方向;车道号31,32,33属于下行方向

head(jd201610,10)
unique(jd201610$station_name)

一共有12个观测站

table(jd201610$station_name)

黑水/乌沙数据量不够充足,可以剔除

jd201610 <- filter(jd201610,station_name != "黑水",station_name != "乌沙")
unique(jd201610$station_name)
dim(jd201610)

数据量减少了不到1万行

10月13/14/15/16的数据,除了松坎观测站外,普遍缺失。

2. 数据预处理

2.1 日期的处理

以“-”为分隔符,拆分字符串

str_split_todf <- function(strvec,sep){
  tep <- strsplit(strvec,sep)
  len <- length(tep[[1]])
  return(as.data.frame(matrix(unlist(tep),ncol=len,byrow = T),stringsAsFactors=F))
}
jd201610 <- cbind(jd201610,str_split_todf(jd201610$date,"-"))
names(jd201610)[18:20] <- c("day","month","year")
jd201610[jd201610$month=="9月 ",19] <- "Sep"
jd201610[jd201610$month=="10月",19] <- "Oct"
jd201610 <- mutate(jd201610,date=str_c(month,day,sep = "-"))
jd201610 <- select(jd201610,-(day:year))
date_order <- unique(jd201610$date)
date_order
head(jd201610)
names(jd201610)

2.2 当量计算

jd201610 <- mutate(jd201610,
                   volume=small_pass_car+small_fre_car+
                     1.5*big_pass_car+1.5*mid_fre_car+
                     3*big_fre_car+4*large_fre_car+4*box_car+
                     mt_car+tlj_car) %>% 
  select(station_index:timestamp,volume) %>% 
  select(-(hour:minute))
names(jd201610)
save(jd201610,file="D:\\data\\jd201610.RData")

2.3 分组加合

by_timestamp <- group_by(jd201610,station_name,date,timestamp)
jd201610tt <- summarise(by_timestamp,ttvolume=sum(volume,na.rm = T))
head(jd201610tt)

2.4 数据转置

unique(jd201610tt$station_name)

桐木岭实验

tml <- filter(jd201610tt,station_name=="桐木岭站")
tml <- arrange(tml, date, timestamp)
x <- reshape2::dcast(tml,date~timestamp)
rownames(x) <- x[[1]]
x <- x[-1]

拓展到全数据集

jdzzl <- plyr::dlply(jd201610tt,"station_name",
                     function(x) reshape2::dcast(x,date~timestamp))
jdzzl <- lapply(jdzzl, function(x) {
  rownames(x) <- x[[1]]
  x <- x[-1]
})

2.5 插补缺失值

桐木岭实验

缺失值识别

rowSums(is.na(x))

缺失值剔除

剔除缺失值在15个以上的日期

rowSums(is.na(x)) < 15

按日期排序,并剔除缺失值多的日期

x <- x[date_order,]
x <- x[rowSums(is.na(x)) < 15,]

展开矩阵

y <- as.matrix(x)
y <- t(y)
y <- as.vector(y)

缺失值插补

na_loc <- which(is.na(y))
notna_loc <- which(!is.na(y))
for(i in na_loc){
  y[i] <- mean(y[notna_loc[which(abs(i-notna_loc)<5)]])
}

重构矩阵

y <- matrix(y,ncol = 288,byrow = T)
#y <- as.data.frame(matrix(y,ncol = 288,byrow = T))
rownames(y) <- rownames(x)
colnames(y) <- 1:288
y <- as.data.frame(y)
rm(x,y)

拓展到全数据集

缺失值识别

lapply(jdzzl, function(x) rowSums(is.na(x)))

缺失值剔除

jdzzl <- lapply(jdzzl, function(x){
  x <- x[date_order,]
  x <- x[rowSums(is.na(x))<15,]
})
lapply(jdzzl, function(x) rowSums(is.na(x)))

展开矩阵

jdzzl <- lapply(jdzzl, function(x){
  x <- as.matrix(x)
  y <- t(x)
  y <- as.vector(y)
})

缺失值插补

jdzzl_try <- jdzzl
fillupna <- function(y){
  na_loc <- which(is.na(y))
  notna_loc <- which(!is.na(y))
  for(i in na_loc){
    y[i] <- mean(y[notna_loc[which(abs(i-notna_loc)<7)]])
  }
  return(y)
}
jdzzl_try <- lapply(jdzzl_try, fillupna)
lapply(jdzzl_try, function(x) sum(is.na(x)))
jdzzl <- lapply(jdzzl,function(y){
  na_loc <- which(is.na(y))
  notna_loc <- which(!is.na(y))
  for(i in na_loc){
    y[i] <- mean(y[notna_loc[which(abs(i-notna_loc)<7)]])
  }
  return(y)
})
lapply(jdzzl, function(x) sum(is.na(x)))

步骤整合

jdzzl <- plyr::dlply(jd201610tt,"station_name",
                     function(x) reshape2::dcast(x,date~timestamp))
jdzzl <- lapply(jdzzl, function(x) {
  rownames(x) <- x[[1]]
  x <- x[-1]
})
jdzzl <- lapply(jdzzl, function(x){

  # 缺失值剔除
  x <- x[date_order,]
  x <- x[rowSums(is.na(x))<15,]

  # 展开矩阵
  x <- as.matrix(x)
  dates <- rownames(x)
  x <- t(x)
  x <- as.vector(x)

  # 缺失值插补
  na_loc <- which(is.na(x))
  notna_loc <- which(!is.na(x))
  for(i in na_loc){
    x[i] <- mean(x[notna_loc[which(abs(i-notna_loc)<7)]])
  }

  # 重构矩阵
  x <- matrix(x,ncol = 288,byrow = T)
  rownames(x) <- dates
  colnames(x) <- 1:288
  return(x)
})

2.6 LOESS平滑处理

曾经使用的原始版的handle_loess

handle_loess <- function(df,exp,resp,sp){
  result <- rep(NA,nrow(df))
  result <- stats::loess(df[,resp]~df[,exp],span=sp)
  return(result$fitted)
}

这个handle_loess有一些问题:

  1. 该步骤是在转置之前做的,而现在已经对数据做了转置;要考虑是再开发个转置之后的版本,还是继续放在转置之前;
  2. 对数据框形式要求的局限性较大,按loess的原理,理论上只需要vector型输入即可。

桐木岭实验

names(jdzzl)
x <- jdzzl[[10]]

绘制平行坐标图

reshape2::melt(x) %>% arrange(Var1,Var2) %>% 
  ggplot(aes(x = Var2, y = value, group = Var1, color = Var1))+
  geom_line()+
  labs(x="timestamp",y="traffic volume")

开发LOESS函数

handle_loess_fordf <- function(df,sp){
  ndays <- nrow(df)
  timestamps <- 1:ncol(df)
  for(i in 1:ndays){
    loess_result <- stats::loess(df[i,]~timestamps,span=sp)
    df[i,] <- loess_result$fitted
  }
  return(df)
}
smoothed_x <- handle_loess_fordf(x,sp=0.2)
reshape2::melt(smoothed_x) %>% arrange(Var1,Var2) %>% 
  ggplot(aes(x = Var2, y = value, group = Var1, color = Var1))+
  geom_line()+
  labs(x="timestamp",y="traffic volume")
rm(x)

拓展到全数据集

jdzzl_loess <- lapply(jdzzl, handle_loess_fordf, sp=0.2)

检验

reshape2::melt(jdzzl_loess[[10]]) %>% arrange(Var1,Var2) %>% 
  ggplot(aes(x = Var2, y = value, group = Var1, color = Var1))+
  geom_line()+
  labs(x="timestamp",y="traffic volume")

没什么问题,将jdzzl和jdzzl_loess写入包中

修改交调站名

names(jdzzl) <- c("丙妹","黑石","麻尾","南宁","平关","平胜","普宜","松坎","台盘","桐木岭")
names(jdzzl_loess) <- c("丙妹","黑石","麻尾","南宁","平关","平胜","普宜","松坎","台盘","桐木岭")


ahorawzy/TFTSA documentation built on May 13, 2019, 12:18 p.m.