R/just.amelia.call.R

## everything options are:
## ,count.pattern = "^(not|empty|heavyheavy|nl|nr)"
## ,mean.pattern="(^ol|^or|_weight|_axle|_length|_speed)"
## ,mean.exclude.pattern="^(mean)"
## ){



## old version
                            ## ,count.pattern = "(heavyheavy|^nl|^nr\\d|count_all_veh_speed)"
                            ## ,mean.pattern="(^ol|^or\\d|_weight|_axle|_length|_speed||wgt_spd_all_veh_speed)"
                            ## ,mean.exclude.pattern="^(mean|mt_|tr_)"

#' Impute best estimate of truck counts and features.
#'
#' This function takes the merged VDS and WIM sets
#'
#' @param df the merged VDS and WIM input data
#' @param count.pattern how to identify the count variables.  Default
#' is:
#'    "(heavyheavy|^nl|^nr\\d|_weight|_axle|_length|_speed|_all_veh_speed)"
#' @param mean.pattern how to identify the "mean" variables.  Not to
#' be taken literally.  This is a pattern to identify variables that
#' aren't count variables, but that are included in the
#' imputation. For example, occupancy.  Default is "(^ol|^or\\d)"
#' @param exclude.pattern a pattern to identify variables that should
#' be excluded from the imputation.  The default is "^(mean|mt_|tr_)"
#' @param maxiter the maximum number of iterations before Amelia
#' should quit.  Default is 200, but seriously it is pretty rare when
#' iterating 200 times will produce a usable outcome.  On the other
#' hand, there are times when iteraing say, 100 times does.  So set
#' lower to run faster through better behaved data, and then increase
#' to catch some that require a little more time.
#' @return the Amelia output
fill.truck.gaps <- function(df
                            ,count.pattern = "(heavyheavy|^nl|^nr\\d|_weight|_axle|_length|_speed|_all_veh_speed)"
                            ,mean.pattern="(^ol|^or\\d)"
                            ,exclude.pattern="^(mean|mt_|tr_)"
                            ,maxiter=200
                            ){

  ## truncate negative values to zero
  negatives <-  !( df>=0 | is.na(df) )
  df[negatives] <- 0

  ## sort out upper limits
  occ.pattern <- "(^ol1$|^or\\d$)"
  ic.names <- names(df)
  ic.names <- grep( pattern=exclude.pattern,x=ic.names ,perl=TRUE,value=TRUE,invert=TRUE)
  sd.vars <-   grep( pattern="sd(\\.|_)[r|l]\\d+$",x=ic.names,perl=TRUE,value=TRUE)
  ic.names <-  grep( pattern="sd(\\.|_)[r|l]\\d+$",x=ic.names,perl=TRUE,value=TRUE,invert=TRUE)
  count.vars <- grep( pattern=count.pattern,x=ic.names,perl=TRUE,value=TRUE)
  ic.names<- grep( pattern=count.pattern,x=ic.names,perl=TRUE,value=TRUE,invert=TRUE)
  mean.vars <- grep( pattern=mean.pattern,x=ic.names,perl=TRUE,value=TRUE)
  occ.vars <-  grep( pattern=occ.pattern,x=mean.vars ,perl=TRUE,value=TRUE)
  M <- 10000000000  #arbitrary bignum
  ic.names <- names(df)
  pos.count <- (1:length(ic.names))[is.element(ic.names, c(count.vars))]
  max.vals <- apply( df[,count.vars], 2, max ,na.rm=TRUE)
  pos.bds <- cbind(pos.count,0,1.10*max.vals)
  ## limit the mean vars less, but exclude occ vars
  if(length(setdiff(mean.vars,occ.vars))>0){
    pos.count <- (1:length(ic.names))[is.element(ic.names, setdiff(mean.vars,occ.vars))]
    pos.bds <- rbind(pos.bds,cbind(pos.count,0,M))
  }
  ## now limit occupancy to (0,1)
  pos.count <- (1:length(ic.names))[is.element(ic.names, occ.vars)]
  pos.bds <- rbind(pos.bds,cbind(pos.count,0,1))

  print("bounds:")
  print(pos.bds)

  exclude.as.id.vars <- setdiff(ic.names,c(mean.vars,count.vars,'tod','day'))
  print(paste("count vars:",paste(count.vars,collapse=' ')))
  print(paste("mean vars:", paste(mean.vars,collapse=' ')))
  print(paste("excluded:",  paste(exclude.as.id.vars,collapse=' ')))
  ## this version is thorough, but does not run fast enough for the moment
  ## df.amelia <-
  ##   Amelia::amelia(df,idvars=exclude.as.id.vars,
  ##          ts="tod",
  ##          splinetime=6,
  ##          lags =count.vars,leads=count.vars,
  ##          sqrts=count.vars,
  ##          cs="day",
  ##          intercs=TRUE,
  ##          emburn=c(2,maxiter),
  ##          bounds = pos.bds, max.resample=10,empri = 0.05 *nrow(df))
  ## this one works
  ## df.amelia <-
  ##   Amelia::amelia(df,idvars=exclude.as.id.vars,
  ##          ts="tod",
  ##          splinetime=3,
  ##          ## lags =count.vars,leads=count.vars,
  ##          sqrts=count.vars,
  ##          cs="day",
  ##          intercs=TRUE,
  ##          emburn=c(2,maxiter),
  ##          bounds = pos.bds, max.resample=10,empri = 0.05 *nrow(df)
  ##          )
  ## this one is going for speed
  df$toddow <- 24 * df$day + df$tod

  exclude.as.id.vars <- c('tod','day',exclude.as.id.vars)
    df.amelia <-
    Amelia::amelia(df,idvars=exclude.as.id.vars,
           ts="toddow",
           splinetime=6,
           #lags =count.vars,
           #leads=count.vars,
           sqrts=count.vars,
           #cs="day",
           #intercs=TRUE,
           emburn=c(2,maxiter),
           bounds = pos.bds, max.resample=10,empri = 0.05 *nrow(df)
           ##,m=1 ## desperate measures!  set to limit the imputations
           )
  df.amelia
}

## old version
                            ## ,count.pattern = "(heavyheavy|^nl|^nr1|^nr2)"
                            ## ,mean.pattern="(^ol|^or1|^or2|_weight|_axle|_length|_speed)"
                            ## ,mean.exclude.pattern="^(mean|mt_|tr_)"

#' process variable names in preparation for Amelia call
#'
#' I split this out so that I could test different things
#'
#' @param names.df the dataframe's names
#' @param count.pattern how to recognize the count variables, default is
#' "(heavyheavy|^nl|^nr\\d|_weight|_axle|_length|_speed|_all_veh_speed)"
#' @param mean.pattern how to recognize what I used to call mean
#' variables but now that I by default ignore mean variables, there
#' you go.  Default is "(^ol|^or\\d)" which will grab occupancy
#' variables
#' @param exclude.pattern what variables to exclude from the Amelia
#' run as id variables.  Default is "^(mean|mt_|tr_)"
#' @param df the data frame.  Optional but if you pass it in, then
#' will be used to set up boundaries too
#' @return a list of things,
#' list(exclude.as.id.vars=exclude.as.id.vars,
#'      pos.count =pos.count,
#'      mean.vars =mean.vars,
#'      count.vars=count.vars,
#'      pos.bds   =pos.bds)
#'
#' which is what you need in the above amelia function
names.munging <- function(names.df
                            ,count.pattern = "(heavyheavy|^nl|^nr\\d|_weight|_axle|_length|_speed|_all_veh_speed)"
                            ,mean.pattern="(^ol|^or\\d)"
                            ,exclude.pattern="^(mean|mt_|tr_)"
                            ,df=data.frame()
                          ){

  ic.names <- names.df
  occ.pattern <- "(^ol1$|^or\\d$)"
  ic.names <- grep( pattern=exclude.pattern,x=ic.names ,perl=TRUE,value=TRUE,invert=TRUE)
  sd.vars <-   grep( pattern="sd(\\.|_)[r|l]\\d+$",x=ic.names,perl=TRUE,value=TRUE)
  ic.names <-  grep( pattern="sd(\\.|_)[r|l]\\d+$",x=ic.names,perl=TRUE,value=TRUE,invert=TRUE)
  count.vars <- grep( pattern=count.pattern,x=ic.names,perl=TRUE,value=TRUE)
  ic.names<- grep( pattern=count.pattern,x=ic.names,perl=TRUE,value=TRUE,invert=TRUE)
  mean.vars <- grep( pattern=mean.pattern,x=ic.names,perl=TRUE,value=TRUE)
  occ.vars <-  grep( pattern=occ.pattern,x=mean.vars ,perl=TRUE,value=TRUE)

  M <- 10000000000  #arbitrary bignum
  ic.names <- names.df

  pos.count <- (1:length(ic.names))[is.element(ic.names, c(count.vars))]

  pos.bds <- c()
  if(length(df)>0){
    max.vals <- apply( df[,count.vars], 2, max ,na.rm=TRUE)
    pos.bds <- cbind(pos.count,0,1.10*max.vals)
    ## limit the mean vars less, but exclude occ vars
    if(length(setdiff(mean.vars,occ.vars))>0){
      pos.count <- (1:length(ic.names))[is.element(ic.names, setdiff(mean.vars,occ.vars))]
      pos.bds <- rbind(pos.bds,cbind(pos.count,0,M))
    }
    ## now limit occupancy to (0,1)
    pos.count <- (1:length(ic.names))[is.element(ic.names, occ.vars)]
    pos.bds <- rbind(pos.bds,cbind(pos.count,0,1))
  }

  pos.count <- (1:length(ic.names))[is.element(ic.names, c(count.vars))]
  exclude.as.id.vars <- setdiff(ic.names,c(mean.vars,count.vars,'tod','day'))

  ##  print(paste("count vars:",paste(count.vars,collapse=' ')))
  ##  print(paste("mean vars:", paste(mean.vars,collapse=' ')))
  list(exclude.as.id.vars=exclude.as.id.vars,
       pos.count=pos.count,
       mean.vars=mean.vars,
       count.vars=count.vars,
       pos.bds=pos.bds
       )
}


## the commented version


## fill.truck.gaps <- function(df
##                             ,count.pattern = "^(truck|heavyheavy|nl|nr)"
##                             ,truck.count.pattern = "^(truck|heavyheavy)"
##                             ,mean.pattern="(^ol|^or|_weight|_axle|_len|_speed)"
##                             ,mean.exclude.pattern="^(mean)"
##                             ){


##   ## run amelia on the combined vds and wim data.  the vds data has
##   ## already been imputed on itself, so should have no gaps.  ditto
##   ## for the wim data

##   ## imputed combined is the merged set of aggregated, imputed,  wim data merged with vds data, and also merged with a non-paired vds data

##   ##
##   ## I had the idea that first I want to just impute counts, and then
##   ## later I want to impute variables that depend on the counts, like
##   ## weight, speed, and axles
##   ##

##   ic.names <- names(df)
##   ## keep standard deviation data from the data
##   sd.vars <-   grep( pattern="sd(\\.|_)[r|l]\\d+$",x=ic.names,perl=TRUE,value=TRUE)
##   ic.names <-  grep( pattern="sd(\\.|_)[r|l]\\d+$",x=ic.names,perl=TRUE,value=TRUE,invert=TRUE)

##   count.vars <- grep( pattern=count.pattern,x=ic.names,perl=TRUE,value=TRUE)
##   ic.names<- grep( pattern=count.pattern,x=ic.names,perl=TRUE,value=TRUE,invert=TRUE)


##   ## sort out the "mean variables" that are really sums at this point
##   mean.vars <- grep( pattern=mean.pattern,x=ic.names,perl=TRUE,value=TRUE)

##   ## exclude some too so things don't get non-invertible and highly correlated
##   mean.vars <- grep( pattern=mean.exclude.pattern,x=mean.vars ,perl=TRUE,value=TRUE,invert=TRUE)


##   M <- 10000  #arbitrary bignum
##   # reset ic.names now
##   ic.names <- names(df)

##   truck.related.vs <- grep( pattern=truck.count.pattern,x=count.vars,perl=TRUE,value=TRUE)
##   not.truck.vs <- grep( pattern='^truck_',x=truck.related.vs,perl=TRUE,value=TRUE,invert=TRUE)
##   ## reset each of these non-truck count values to a fraction of the
##   ## truck count
##   truck.vs <- grep( pattern='^truck_',x=truck.related.vs,perl=TRUE,value=TRUE)

##   ## what I am doing here is getting rid of truck variables.  each
##   ## truck related category gets split into two categories, it, and
##   ## not it. so truck counts can always be restored by adding it and
##   ## not it. And I am using new.cnt.vs to track that migration without
##   ## mucking about with more regexes than I'm already (ab)using.
##   new.cnt.vs <- setdiff(count.vars,truck.related.vs)
##   new.cnt.vs <- c(new.cnt.vs,not.truck.vs)

##   lane.types <- strsplit(truck.vs,"^truck_")
##   for(l.type in lane.types){
##     ## match corresponding lanes in not.truck.vars
##     this.lane.vars <- grep(pattern=l.type[2],x=not.truck.vs,perl=TRUE,value=TRUE)
##     this.truck.var <- grep(pattern=l.type[2],x=truck.vs,perl=TRUE,value=TRUE)
##     for(vvar in this.lane.vars){
##       new.var <- paste('not',vvar,sep='.')
##       df[,new.var] <- df[,this.truck.var] - df[,vvar]
##       new.cnt.vs <- c(new.cnt.vs,new.var)
##     }
##   }

##   ## now build up all the final variables for the amelia call

##   ## first get all the names in the data frame
##   ic.names <- names(df)

##   ##make sure there are not dupes

##   ## the count variables
##   new.cnt.vs <- intersect(new.cnt.vs,ic.names)

##   ## generate the position of each count variable
##   pos.count <- (1:length(ic.names))[is.element(ic.names, c(new.cnt.vs))]

##   ## for the count variables,
##   ## make a max allowed value of 110% of observed values and a min value of 0
##   max.val <- max(df[,new.cnt.vs],na.rm=TRUE)
##   pos.bds <- cbind(pos.count,0,1.10*max.val)

##   ## for other variables, make a looser bound of zero to M
##   ## this fairly loose bound
##   ## generate the position of each count variable
##   pos.count <- (1:length(ic.names))[is.element(ic.names, mean.vars)]
##   pos.bds <- rbind(pos.bds,cbind(pos.count,0,M))


##   ## Impute some but not all of the variables.  I want to impute the
##   ## mean vars, but not nasty vars, sd vars, the truck counts, and any
##   ## other variables that weren't selected by the count var passed
##   ## pattern.

##   ## so instead of listing each, figure out which aren't

##   exclude.as.id.vars <- setdiff(ic.names,c(mean.vars,new.cnt.vs,'tod','day'))

##   df.amelia <-
##     Amelia::amelia(df,idvars=exclude.as.id.vars,
##            ts="tod",
##            ##splinetime=3,
##            ##lags =new.cnt.vs,leads=new.cnt.vs,
##            sqrts=new.cnt.vs,
##            cs="day",
##            # intercs=TRUE,
##            emburn=c(2,maxiter),
##            bounds = pos.bds, max.resample=10,empri = 0.05 *nrow(df))

##   df.amelia

## }
jmarca/calvad_rscripts documentation built on May 19, 2019, 1:50 p.m.