##' load the raw file from PeMS
##'
##' Will scan the data and load up, then return the result of scanning
##'
##' @title load.raw.file
##' @param file the file to scan
##' @param skip the number of lines to skip at the beginning, in case there was a problem on the prior run. passed to scan
##' @param nlines the number of lines to scan passed to scan
##' @return the result of scanning the file
##' @author James E. Marca
load.raw.file <- function (file="",skip=0,nlines=0){
whatlist <- list(ts="",vds_id=0,
n1=0,o1=0,s1=0,
n2=0,o2=0,s2=0,
n3=0,o3=0,s3=0,
n4=0,o4=0,s4=0,
n5=0,o5=0,s5=0,
n6=0,o6=0,s6=0,
n7=0,o7=0,s7=0,
n8=0,o8=0,s8=0)
testScan <-
scan(file, what=whatlist, skip=skip,nlines=nlines,multi.line=FALSE,flush=TRUE,sep=',')
testScan
}
##' load a given file, whether by scanning the raw file, or by loading
##' an existing RData file
##'
##' A side effect is that if the raw data file is scanned, then an
##' RData file will be saved to the file system to speed things up
##' next time
##'
##' @title load.file
##' @param f the full file name for the raw text. optional if there
##' is an RData file, but if there isn't an RData file you need this
##' @param fname the business end of the name. Typically the part
##' that identifies the device id and the year. It will be passed
##' to the dir() function as part of the pattern to match when
##' looking for the RData file.
##' @param year the year of data. Used when searching for RData file
##' @param path the path to the root of the thing.
##' @return a data frame with the data in it
##' @export
##' @author James E. Marca
##'
load.file <- function(f,fname,year,path){
## is there a df available?
df <- data.frame()
target.file =paste(fname,'.df.*',year,'.RData',sep='')
isa.df <- dir(path, pattern=target.file,full.names=TRUE, ignore.case=TRUE,recursive=TRUE,all.files=TRUE)
need.to.save <- FALSE
if(length(isa.df)>0){
print (paste('loading', isa.df[1]))
load.result <- load(file=isa.df[1])
}else{
if(!missing(f)){
print (paste('scanning', f))
fileScan <- load.raw.file(f)
## pre-process the vds data
ts <- as.POSIXct(strptime(fileScan$ts,"%m/%d/%Y %H:%M:%S",tz='GMT'))
fileScan$ts <- ts
df <- trim.empty.lanes(fileScan)
if(dim(df)[2]>2 ## sometimes df is totally NA
& is.element("n1",names(df))
& is.element("o1",names(df)) ## sometimes get random interior lanes
){
df <- recode.lanes(df[,-1]) ## drop time for recode lanes
## put back time
df$ts <- ts
}
keep <- ! is.na(df$ts)
df <- df[keep,]
savepath <- get.save.path(f)
save(df,
file=paste(savepath,'/',fname,'.df.',year,'.RData',sep=''),
compress='xz')
}
}
df
}
##' The raw data often has empty lanes. This will get rid of them.
##'
##' @title trim.empty.lanes
##' @param testScan the scanned data file
##' @return a data frame without lanes.
##' @author James E. Marca
trim.empty.lanes <- function(testScan){
df <- data.frame(testScan)
## bug bit me in the behind. If the first row has missing values but a non-missing time,
## then this will fail
# pattern <- ! is.na(df[1,])
## so going with mean, which chokes on na's
## bleh. Another bug, a little more difficult. sometimes speed in
## one lane seems to be out. so make sure if you have x lanes of n
## and o, you also have x lanes of speed or none at all
## old way: pattern <- ! is.nan(mean(df,na.rm=TRUE))
pattern <- rep(TRUE,dim(df)[2])
names(pattern) <- colnames(df)
pattern[1]=FALSE ## keep time in the end, but ignore for now
pattern[2]=FALSE ## don't need the vdsid for every record
means <- colMeans(df[,pattern],na.rm=TRUE)
pattern[pattern] <- ( ! is.nan(means) ) ## old way
## have seen cases with reasonable n, but nothing for o, and mostly NAs
## so cut off cases with more than 95% missing as junk
pattern[pattern] <- (apply(df[pattern],2,pct.na) < 0.95)
## if any speeds are non null, then make sure speed lanes equals v and o lanes
speed.names <- grepl( pattern="^s\\d",x=names(df),perl=TRUE)
if(any( speed.names & pattern ) ){
vol.names <- grepl( pattern="^n\\d",x=names(df),perl=TRUE)
vol.lanes.with.data <- pattern[vol.names] & !is.na(pattern[vol.names])
lanes <- length(vol.lanes.with.data[vol.lanes.with.data] )
while(lanes>0){
pattern[paste('s',lanes,sep='')] <- TRUE
lanes <- lanes-1
}
}
## keep time
pattern[1] <- TRUE
df[,pattern]
}
##' Determine the percent of entries that have NA values
##'
##' @title pct.na
##' @param v the vector to examine
##' @return the percentage the entries in the vector that are NA.
##' @author James E. Marca
pct.na <- function(v){
sum(is.na(v))/length(v)
}
##' Get the canonical lane numbering system for a given number of lanes
##'
##' ##' For example, if you have count and occupancy, you'd pass in
##' c('n','o')
##' @return a vector consisting of the correct naming for the
##' variables. For example, if you have one lane and pass in
##' c('n','o'), you will get back the vector c("nr1","or1"). If you
##' have three lanes and still just volume and occupancy c('n','o')
##' then you will get back c("nl1", "ol1","nr2", "or2","nr1", "or1")
##'
##' Note that this enforces my special naming scheme, in which a site
##' always has a right lane, and then a left lane, and then all other
##' lanes are coded from the right as r2, r3, etc. So a 5 lanes site
##' will have, from left to right, l1, r4, r3, r2, r1.
##'
##' @title vds.lane.numbers
##' @param lanes the number of lanes in the desired data set
##' @param raw.data a vector of the values recorded for each lane.
##'
##' @author James E. Marca
vds.lane.numbers <- function(lanes,raw.data){
Y <- paste(raw.data,"l1", sep="")
YL <- paste(raw.data,"l1", sep="")
YR <- paste(raw.data,"r1", sep="")
if(lanes>2){
## interior lanes
YM <- paste(raw.data,
"r",
rep(sort(rep(c((lanes-1):2),length(raw.data)),decreasing=TRUE)),
sep="")
Y <- c(YL,YM,YR)
}else{
## no interior lanes, could also be a ramp
if(lanes==1){
Y <- c(YR)
}else{
## lanes == 2
Y <- c(YL,YR)
}
}
Y
}
recode.lanes <- function(df){
# run this only after you've
# run trim empty lanes
## This is broken if the lanes are screwy after trim empty lanes
## example:
## > summary(df)
## n1 n2 n5 o5 n6 o6
## Min. : 0 Min. : 0 Min. :0.000e+00 Min. :0.000e+00 Min. : 0 Min. : 0
## 1st Qu.: 0 1st Qu.: 0 1st Qu.:0.000e+00 1st Qu.:0.000e+00 1st Qu.: 0 1st Qu.: 0
## Median : 0 Median : 0 Median :0.000e+00 Median :0.000e+00 Median : 0 Median : 0
## Mean : 0 Mean : 0 Mean :6.646e-02 Mean :9.154e-04 Mean : 0 Mean : 0
## 3rd Qu.: 0 3rd Qu.: 0 3rd Qu.:0.000e+00 3rd Qu.:0.000e+00 3rd Qu.: 0 3rd Qu.: 0
## Max. : 0 Max. : 0 Max. :1.800e+01 Max. :9.044e-01 Max. : 0 Max. : 0
## NA's :521354 NA's :521354 NA's :2.013e+05 NA's :2.013e+05 NA's :201302 NA's :201302
## here I have n1, n2, but not n3, n4, and not o[1..4], so recode thinks there are 2 lanes, and recodes the names on teh wrong variables.
##
## recode to be right lane (r1), right lane but one (r2), r3, ... and then
## left lane (l1)
##
names.data <- names(df)
## testing:
## names.data <-c('n1', 'n2', 'n5', 'o5', 'n6', 'o6')
## eventually recode this to use grep.
YR <- NULL
for (site.lanes in 1:8){
r.d <- NULL
if( is.element(paste("n",site.lanes,sep=''),names.data)){
r.d <- c('n')
}
if( is.element(paste("o",site.lanes,sep=''),names.data)){
r.d <- c(r.d,'o')
}
if( is.element(paste("s",site.lanes,sep=''),names.data)){
r.d <- c(r.d,'s')
}
if(!is.null(r.d)){
if(site.lanes == 1){
YR <- c(YR, paste(r.d,'l1',sep=""))
}else{
YR <- c(YR, paste(r.d,paste('r',site.lanes-1,sep=""),sep=""))
}
}
}
names(df) <- YR
df
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.