## R/get.medianed.amelia.vds.R

## Turn this back on when I turn on remote files again
## source('node_modules/rstats_remote_files/remoteFiles.R',chdir=TRUE)

#' get amelia vds file
#'
#' go grab the output of amelia for this detector from the file server
#'
#' Currently a little out of date; review the code before using.
#' Also, obviously, it needs a server running at the other end.
#'
#' @param vdsid the VDS id
#' @param path the root path for the web service request.  Use it if
#' the service is mounted under some prefix, say, /vile/file/server
#' @param year the year
#' @param server defaults to http://lysithia.its.uci.edu
#' @param serverfile the already cached server file, if any.  If
#' 'none' (the default) then an initial hit to the server will be done
#' to find the file's REST path, and also ascertain whether that file
#' exists.
#' @return either 'todo', indicating that the file is not on the
#' server; 'rejected' plus the file name if the remote fetch failed;
#' or a dataframe containing the Amelia output
get.amelia.vds.file <- function(vdsid,path='/',year,server='http://lysithia.its.uci.edu'
                               ,serverfile='none'){
    stop('do not use before fixing and removing this')
  ## df.vds.agg.imputed <- list()
  ## files = c(serverfile)
  ## if(serverfile == 'none'){
  ##   path <- toupper(path)
  ##   amelia.dump.file <- make.amelia.output.pattern(vdsid,year)
  ##   files <- get.filenames(server=server,base.dir=path, pattern=amelia.dump.file)
  ##   if(length(files)==0){
  ##     return('todo')
  ##   }
  ## }
  ## result <- 'fail'
  ## attempt <- 0
  ## serverfile = files[1]
  ## while(attempt < 10){
  ##   attempt <-  attempt + 1
  ##   ## print(paste('try',attempt,'loading stored vds amelia object from file',serverfile))
  ##   fetched <- fetch.remote.file(server,service='vdsdata',root=paste(path,'/',sep=''),file=serverfile)
  ##   if(fetched == 'df.vds.agg.imputed') break
  ##   ##print(paste('attempt ',attempt))
  ## }
  ## if(result == 'reject'){
  ##   return(paste('rejected',files[1]))
  ## }
  ## if (attempt>9){
  ##   stop('no data from data server')
  ## }
  ## df.vds.agg.imputed
}

#' get Amelia VDS file from the local file system
#'
#' Rather than hitting a remote server, just get the Amelia output
#' from the local file system.  Use this if everything is running on
#' one machine, ya?
#'
#' @param vdsid the VDS id
#' @param path the root path of where the Amelia files are stashed
#' @param year the year of the data
#' @param server unused; kept only so this function has the same
#' signature as \code{get.amelia.vds.file}
#' @param serverfile also unused.  The point is that you can slot this
#' function in place of the remote one above and use the same function
#' call; these last two parameters will simply be ignored.
#' @return either 'todo' indicating that the file is not on this
#' machine or not yet done, or a dataframe containing the amelia
#' output
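#' @examples
#' ## A minimal usage sketch, assuming the Amelia output .RData files
#' ## live under /data/amelia/d12 (an illustrative path) for detector
#' ## 1211682 in 2012.  server and serverfile can be omitted; they are
#' ## never evaluated.
#' \dontrun{
#' aout <- get.amelia.vds.file.local(vdsid=1211682,
#'                                   path='/data/amelia/d12',
#'                                   year=2012)
#' if(identical(aout,'todo')){
#'     print('no Amelia output for that detector and year yet')
#' }
#' }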
#' @export
get.amelia.vds.file.local <- function(vdsid,path='/',year,server,serverfile){
  df.vds.agg.imputed <- NULL

  target.file <- make.amelia.output.pattern(vdsid,year)
  isa.df <- dir(path, pattern=target.file,full.names=TRUE, ignore.case=TRUE,recursive=TRUE,all.files=TRUE)
  ## print(paste(path,target.file,paste(isa.df,collapse=','),sep=' : '))
  if(length(isa.df)==0){
    return('todo')
  }
  ## keep the file with the correct year
  right_file <- grep(pattern=paste('_',year,sep=''),x=isa.df,value=TRUE)
  if(length(right_file) == 0){
    print(paste('failed to find year',year,'in list',paste(isa.df,collapse=',')))
    return('todo')
  }
  if(length(right_file) > 1){
    print(paste('failed to isolate one file from list',paste(isa.df,collapse=','),'got',paste(right_file,collapse=',')))
    stop(2)
  }
  env <- new.env()
  res <- load(file=right_file,envir=env)
  return(env[[res]])
}

#' unget Amelia VDS output file
#'
#' You need this when you fetch from a remote server, because you can
#' end up with lots of temp files lying around filling up your hard
#' drive.  This call will unmap any temp files generated by the prior call
#'
#' @param vdsid the VDS id
#' @param path the root path to ping the web service.  Use it if there
#' is some sort of path like, say, /vile/file/server for the thing
#' @param year the year
#' @param server defaults to http://calvad.ctmlabs.net
#' @return nothing at all
unget.amelia.vds.file <- function(vdsid,path='/',year,server='http://calvad.ctmlabs.net'){
    stop('currently not useful, as all files are local')
  ## path <- toupper(path)
  ## amelia.dump.file <- make.amelia.output.pattern(vdsid,year)
  ## files <- get.filenames(server=server,base.dir=path, pattern=amelia.dump.file)
  ## df.vds.agg.imputed <- list()
  ## if(length(files)==0){
  ##   return()
  ## }
  ## unmap.temp.files(files[1])
  ## return()
}

## tempfix.borkborkbork <- function(df){
##   summing.zoo.combiner(df)
## }
## summing.zoo.combiner <- function(df){
##   ## use zoo to combine a mean of occupancy, sums of volumes
##   varnames <- names(df)
##   varnames <- grep( pattern="^ts",x=varnames,perl=TRUE,inv=TRUE,value=TRUE)

##   df.z <- zooreg(df[,varnames]
##                  ,order.by=as.numeric(df$ts))

##   hour=3600
##     print('make it an hour')
##   df.z$tick <- 1
##     df.z <-  aggregate(df.z,
##                        as.numeric(time(df.z)) -
##                        as.numeric(time(df.z)) %% hour,
##                        sum, na.rm=TRUE)
##   names.occ <- grep( pattern="(^o(l|r)\\d+$)",x=names(df.z),perl=TRUE)
##   df.z[,names.occ] <-  df.z[,names.occ] / df.z[,'tick']
##   df.z$tick <- NULL
##   unzoo.incantation(df.z)
## }

#' Medianed aggregate dataframe: generate an hourly aggregate of the imputation results
#'
#' @param df_combined the multiple imputations, rbind-ed into a single dataframe
#' @param op defaults to median, but you can also try mean.  This is
#' how the multiple imputations are merged into a single imputation
#' result.  Note that the current sqldf-based code hard-codes median,
#' so this parameter is effectively ignored for now.
#'
#' @return a dataframe holding one entry per hour.  The multiple
#' imputations are first merged into one value per original Amelia
#' time step (using median), and then aggregated up to one hour by
#' summing the counts and averaging the occupancies
#'
#' Speeds are ignored because imputing speed has never worked out so far
medianed.aggregate.df <- function(df_combined,op=median){
    ## print(paste('use sqldf to aggregate imputations'))

    varnames <- names(df_combined)
    varnames <- grep( pattern="^ts",x=varnames,perl=TRUE,invert=TRUE,value=TRUE)
    varnames <- setdiff(varnames,c('tod','day'))

    n.names <- grep(pattern="^n(l|r)\\d+",x=varnames,perl=TRUE,value=TRUE)
    o.names <- grep(pattern="^o(l|r)\\d+",x=varnames,perl=TRUE,value=TRUE)

    other.names <- grep(pattern="^(n|o)(l|r)\\d+",x=varnames,invert=TRUE,perl=TRUE,value=TRUE)

    ## use sqldf...so much faster than zoo, aggregate

    ## fix ts to make sure it is  POSIXct
    df_combined$ts <- as.POSIXct(df_combined$ts)
    attr(df_combined$ts,'tzone') <- 'UTC'

    sqlstatement <- paste("select ts,",
                          paste('median(',varnames,') as ',varnames,sep=' ',collapse=','),
                          'from df_combined group by ts',
                          sep=' ',collapse=' '
                          )

    ## aggregate the multiple imputations, resulting in one value per
    ## time step
    ## print(sqlstatement)
    temp_df <- sqldf::sqldf(sqlstatement,drv="SQLite")
    attr(temp_df$ts,'tzone') <- 'UTC'


    ## aggregate up to one hour
    hour <-  3600 ## seconds per hour
    temp_df$hourly <- as.numeric(temp_df$ts) - as.numeric(temp_df$ts) %% hour

    temp_df$tick <- 1 ## a value to sum up # of records per hour, to
                      ## compute averages of occupancy (because summed
                      ## occupancy is meaningless!)

    sqlstatement2 <- paste("select min(ts) as ts,",
                           paste('total(',c(varnames,'tick'),') as ',c(varnames,'tick'),sep=' ',collapse=','),
                           'from temp_df group by hourly',
                           sep=' ',collapse=' '
                           )
    ## print(sqlstatement2)
    ## generate the hourly summation
    df_hourly <- sqldf::sqldf(sqlstatement2,drv="SQLite")

    ## divide out the number of intervals summed to create average
    ## occupancy per hour

    df_hourly[,o.names] <- df_hourly[,o.names]/df_hourly[,'tick']

    ## assign the correct timezone again
    df_hourly$ts <- trunc(df_hourly$ts,units='hours')
    df_hourly$ts <- as.POSIXct(df_hourly$ts)
    attr(df_hourly$ts,'tzone') <- 'UTC'

    df_hourly$tick <- NULL

    ts.lt <- as.POSIXlt(df_hourly$ts)
    df_hourly$tod   <- ts.lt$hour + (ts.lt$min/60)
    df_hourly$day   <- ts.lt$wday

    ## all done, return value
    df_hourly
}
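
## A minimal sketch of what medianed.aggregate.df expects, kept as a
## comment so nothing runs at package load time.  The column names
## (nr1, or1) and 120-second time step are illustrative assumptions.
##
##   ts <- as.POSIXct('2012-01-02 00:00',tz='UTC') + seq(0,7199,by=120)
##   one.imp <- data.frame(ts=ts,
##                         nr1=rpois(length(ts),20),
##                         or1=runif(length(ts),0,0.2))
##   ## stack m=3 imputations of the same time steps, as condense.amelia.output does
##   df_combined <- rbind(one.imp,one.imp,one.imp)
##   df_hourly <- medianed.aggregate.df(df_combined)
##   ## df_hourly has one row per hour: nr1 is summed, or1 is averaged,
##   ## and tod/day columns are added from the hourly timestamp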

## medianed.aggregate.df.oldway <- function(df.combined,op=median){
##   print('use zoo to combine a value')
##   varnames <- names(df.combined)
##   varnames <- grep( pattern="^ts",x=varnames,perl=TRUE,invert=TRUE,value=TRUE)

##   df.z <- zooreg(df.combined[,varnames]
##                        ,order.by=as.numeric(df.combined$ts))

##   ## using median not mean because median is resistant to extreme outliers
##   df.z <- aggregate(df.z, identity, op, na.rm=TRUE )
##   hour=3600
##   print(table(df.z$tod))
##   print('make it an hour')
##   df.z$tick <- 1
##   df.z <-  aggregate(df.z,
##                      as.numeric(time(df.z)) -
##                      as.numeric(time(df.z)) %% hour,
##                      sum, na.rm=TRUE)
##   names.occ <- grep( pattern="(^o(l|r)\\d+$)",x=names(df.z),perl=TRUE)

##   df.z[,names.occ] <-  df.z[,names.occ] / df.z[,'tick']
##   df.z$tick <- NULL
##   df.z
## }

#' Condense Amelia output
#'
#' This combines the "imputations" part of the Amelia output object
#' into a single dataframe, with all the data first rbind-ed together,
#' and then combined into a single record per timestamp using whatever
#' op requested (defaults to median, which is resistant to possible
#' outliers that occasionally result from the imputation process)
#'
#' @param aout the Amelia output
#' @param op usually median or mean.  With the new way, it is pretty
#' much ignored for now and you're stuck with median.
#' @return a dataframe containing the aggregated results of the Amelia
#' imputations, combined one per timestamp with median.
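#' @examples
#' ## A hedged sketch: run Amelia over a detector dataframe (a ts
#' ## column plus per-lane volume and occupancy columns is assumed),
#' ## then condense the imputations to one hourly dataframe.
#' \dontrun{
#' aout <- Amelia::amelia(df.detector, m=5, ts='ts')
#' df.hourly <- condense.amelia.output(aout)
#' summary(df.hourly)
#' }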
#' @export
condense.amelia.output <- function(aout,op=median){
    ## as with the WIM data, using median

    df.c <- NULL
    for(i in 1:length(aout$imputations)){
        df.c <- rbind(df.c,aout$imputations[[i]])
    }

    ## gather the extra variable names and remove them
    extravars <- grep('_times_',x=aout$orig.vars,perl=TRUE,value=TRUE)

    for(name_o_n in extravars){
        df.c[,name_o_n] <- NULL
    }


    df.agg <- medianed.aggregate.df(df.c,op)
    df.agg
}
## alias for backwards compatibility

#' Condense Amelia output
#'
#' old way of calling condense.amelia.output
#'
#' This combines the "imputations" part of the Amelia output object
#' into a single dataframe, with all the data first rbind-ed together,
#' and then combined into a single record per timestamp using whatever
#' op requested (defaults to median, which is resistant to possible
#' outliers that occasionally result from the imputation process)
#'
#' @param aout the Amelia output
#' @param op usually median or mean.  With the new way, it is pretty
#' much ignored for now and you're stuck with median.
#' @return a dataframe containing the aggregated results of the Amelia
#' imputations, combined one per timestamp with median.
condense.amelia.output.into.zoo <- function(aout,op){
    print('old way of accessing condense.amelia.output')
    condense.amelia.output(aout,op)
}

#' Get aggregated VDS Amelia file
#'
#' Yet another way to call the above stack!
#'
#' This is a bit of a wrapper.  It will either hit the local or the
#' remote server to get the raw Amelia output, and then will run that
#' output through the aggregation function above, producing a
#' dataframe with just one observation/imputation per timestamp
#'
#' @param vdsid the VDS id
#' @param serverfile a cached list of the server's files
#' @param path either 'none', the remote server's service path, or the
#' local file system's root directory for Amelia output
#' @param remote default is TRUE, meaning hit a remote server, or
#' FALSE meaning hit the local server
#' @param server defaults to http://lysithia.its.uci.edu
#' @param year the year
#' @return the condensed, aggregated Amelia imputation results as a
#' dataframe
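#' @examples
#' ## Sketch of a local (non-remote) call; the path and detector id are
#' ## placeholders.  With path='none' the district would instead be
#' ## derived from the detector id and used as the path.
#' \dontrun{
#' df.hourly <- get.aggregated.vds.amelia(vdsid=1211682,
#'                                        path='/data/amelia/d12',
#'                                        remote=FALSE,
#'                                        year=2012)
#' }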
get.aggregated.vds.amelia <- function(vdsid,
                                      serverfile='none',
                                      path='none',
                                      remote=TRUE,
                                      server='http://lysithia.its.uci.edu',
                                      year){
    ## print(paste('in get.aggregated.vds.amelia, remote is',remote))
    df.vds.agg.imputed <- NULL
    if(path=='none'){
        path <- district.from.vdsid(vdsid)
    }
    if(remote){
        df.vds.agg.imputed <- get.amelia.vds.file(vdsid,path=path,year=year,serverfile=serverfile)
    }else{
        ## print('calling local version')
        df.vds.agg.imputed <- get.amelia.vds.file.local(vdsid,path=path,year=year,serverfile=serverfile)
    }

    if(is.null(df.vds.agg.imputed) || length(df.vds.agg.imputed) == 1){
        print("amelia run for vds not good")
        return(NULL)
    }else if(!length(df.vds.agg.imputed)>0 || !length(df.vds.agg.imputed$imputations)>0 || df.vds.agg.imputed$code!=1 ){
        print("amelia run for vds not good")
        return(NULL)
    }
    condense.amelia.output(df.vds.agg.imputed)
}


#' Get and plot VDS Amelia file
#'
#' Same as call to get.aggregated.vds.amelia, but then it will plot
#' the output results.
#'
#' Which now that I think about it is sort of stupid, because really I
#' probably also want to be able to trigger the native Amelia object
#' plot functions.
#'
#' @param pair the VDS id or the wim id or the vds wim pair, I guess
#' @param year the year
#' @param doplots default is TRUE; set to FALSE if you want to skip
#' plotting, but then, why are you running this if you don't want
#' plots?
#' @param remote default is TRUE, meaning hit a remote server, or
#' FALSE meaning hit the local server
#' @param server defaults to http://lysithia.its.uci.edu
#' @param path either 'none', the remote server's service path, or the
#' local file system's root directory for Amelia output
#' @param force.plot defaults to FALSE, if TRUE will replot even if
#' the plot appears to exist already
#' @param trackingdb the CouchDB database that is being used to track
#' detector status.  Will push the generated plot files to this
#' database as attached files
#' @return the condensed, aggregated dataframe if all went well, NULL
#' otherwise
#'
#' And plots get made and saved as a side effect
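#' @examples
#' ## Hedged sketch: condense and plot one detector-year from local
#' ## Amelia output.  The path, detector id, and tracking database
#' ## name are illustrative assumptions.
#' \dontrun{
#' df.hourly <- get.and.plot.vds.amelia(pair=1211682, year=2012,
#'                                      remote=FALSE,
#'                                      path='/data/amelia/d12',
#'                                      trackingdb='vdsdata\%2ftracking')
#' }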
#' @export
get.and.plot.vds.amelia <- function(pair,year,doplots=TRUE,
                                    remote=TRUE,
                                    server='http://lysithia.its.uci.edu',
                                    path,
                                    force.plot=FALSE,
                                    trackingdb){
    ## load the imputed file for this site, year
    aout <- NULL
    if(remote){
        aout <- get.amelia.vds.file(vdsid=pair,year=year,path=path,server=server)
    }else{
        aout <- get.amelia.vds.file.local(vdsid=pair,year=year,path=path)
    }
    ## bail out if the fetch failed: NULL, the 'todo' marker, or an
    ## already-condensed dataframe rather than an Amelia output object
    if(is.null(aout) || is.character(aout) || is.data.frame(aout)){
        return (NULL)
    }

    df.vds.agg <- NULL
    if(aout$code == 1){

        df.vds.agg <- condense.amelia.output(aout)

        ## prior to binding, weed out excessive flows
        varnames <- names(df.vds.agg)
        flowvars <- grep('^n(r|l)\\d',x=varnames,perl=TRUE,value=TRUE)
        for(lane in flowvars){
            idx <- df.vds.agg[,lane] > 2500
            if(length(idx[idx])>0){
                df.vds.agg[idx,lane] <- 2500
                print(paste('Hey, got excessive flows for ',pair,year))
            }
        }
        ## and drop flat days

        config <- rcouchutils::get.config()
        sqldf_postgresql(config)
        print('going to look for flat days of data')
        print(summary(df.vds.agg$nr1))
        df.vds.agg <- good.high.clustering.vds(df.vds.agg)
        print(summary(df.vds.agg$nr1))


        ## cruft, but may as well keep it up
        ##rcouchutils::couch.set.state(year,pair,doc=list('occupancy_averaged'=1)
        ##                            ,db=trackingdb)

        if(doplots){

            ## do the amelia default plot functions
            plotvars <- grep('^(o|n)(r|l)\\d',x=varnames,perl=TRUE,value=TRUE)

            attach.files <- plot_amelia.plots(aout=aout,
                                              plotvars=plotvars,
                                              vdsid=pair,
                                              year=year,
                                              force.plot=force.plot,
                                              path=path,
                                              trackingdb = trackingdb)

            for(f2a in c(attach.files)){
                rcouchutils::couch.attach(trackingdb,pair,f2a)
            }

            subhead='\npost-imputation data'
            fileprefix='imputed'
            attach.files <- plot_vds.data(df.vds.agg,pair,year,
                                          fileprefix,subhead,
                                          force.plot=force.plot,
                                          path=path,
                                          trackingdb=trackingdb)
            for(f2a in c(attach.files)){
                rcouchutils::couch.attach(trackingdb,pair,f2a)
            }
        }
    }
    df.vds.agg
}

#' Get and plot raw VDS data, prior to imputations
#'
#' Basically the same as the above, except the initial processing of
#' the data is geared towards the raw data, not the output of the
#' imputations.
#'
#' Will load the raw data, aggregate up to one hour (whereas the
#' imputation step only aggregates up to two minutes these days), and
#' then make the plots
#'
#' @param fname the base name of the file, will be passed to load.file
#' @param f (optional) the full file name of a raw text file, will be
#'     passed to load.file
#' @param path either 'none', the remote server's service path, or the
#'     local file system's root directory for Amelia output
#' @param year the year
#' @param vds.id the VDS id or the wim id or the vds wim pair, I guess
#' @param remote default is FALSE, meaning hit the local file system;
#'     pass TRUE if you want to hit a remote server.  Note that
#'     accessing the remote file server is currently disabled
#' @param server defaults to lysithia.its.uci.edu.  Note that
#'     accessing the remote file server is currently disabled
#' @param force.plot defaults to FALSE, if TRUE will replot even if
#'     the plot appears to exist already
#' @param trackingdb the CouchDB database that is being used to track
#'     detector status.  Will push the generated plot files to this
#'     database as attached files
#' @return TRUE if plots were made, FALSE or a status value if there
#'     were problems.  This function is run mainly for the side effect
#'     that plots get made and saved
#' @export
#'
plot_raw.data <- function(fname,f,path,year,vds.id,
                          remote=FALSE,
                          server='lysithia.its.uci.edu',
                          force.plot=FALSE,
                          trackingdb){
  ## plot the data out of the detector
  fileprefix='raw'
  if(!force.plot){
      have.plot <- check.for.plot.attachment(vds.id,year,fileprefix,
                                             trackingdb=trackingdb)
      if(have.plot){
          print('already have plots')
          return (1)
      }
  }

  ## get the raw data
  ## currently totally ignoring remote file stuff
  ts <- data.frame()
  df <- data.frame()
  ## df.pattern =paste('**/',fname,'*df*',year,'RData',sep='')
  ##rdata.file <- make.amelia.output.file(path,fname,seconds,year)
  if(remote){
      stop('remote file stuff is untested at this time')
      ## fetched <- fetch.remote.file(server,service='vdsdata',root=path,file=f)
      ## r <- try(result <- load(file=fetched))
      ## if(class(r) == "try-error") {
      ##     print (paste('need to get the raw file.  hold off for now'))
      ##     return (FALSE)
      ## }
      ## unlink(x=fetched)
  }else{
      df <- load.file(f,fname,year,path)
  }
  ## break out ts
  ts <- df$ts
  df$ts <- NULL

  ## aggregate up to an hour?
  ## sometimes with raw data, you get empty, so loop here
  sec_seq <- c(3600,1800,600,120)
  idx <- 1
  ## going for a reasonable number of periods for plot
  goodratio <- 10
  df.vds.agg <- NULL
  while(idx <= length(sec_seq) && goodratio > 5){
    seconds <- sec_seq[idx]
    df.vds.agg <- vds.aggregate(df,ts,seconds=seconds)
    have_enough <- df.vds.agg$obs_count == seconds/30
    ## print(table(have_enough))
    if(length(have_enough[have_enough]) > 0){
      goodratio <- length(have_enough)/length(have_enough[have_enough])
    }
    if(goodratio > 5){
      print('trying smaller time aggregation')
      idx <- idx + 1
    }
  }
  have_enough <- df.vds.agg$obs_count == seconds/30
  ## print(paste('goodratio',goodratio,'have enough len',length(have_enough[have_enough])))

  if(goodratio == 10){
    ## cannot plot nothing at all
    rcouchutils::couch.set.state(year,
                                 id=vds.id,
                                 doc=list('raw_plots'='not enough good periods to plot'),
                                 db=trackingdb)

    return (FALSE)
  }

  subhead='\npre-imputation data'
  attach.files <- plot_vds.data(df.vds.agg,
                                vds.id,
                                year,
                                fileprefix,subhead,
                                force.plot=force.plot,
                                path=path,
                                trackingdb=trackingdb)

  for(f2a in attach.files){
      rcouchutils::couch.attach(trackingdb,vds.id,f2a)
  }

  return (TRUE)
}


#' Check CouchDB for whether or not plots have been previously created
#' and attached
#'
#' If TRUE, then you can skip the work; if FALSE, then you need to
#' make all the plots.
#'
#' It doesn't check each file, just the 4th plot created, figuring
#' that if that one has been saved, then 1, 2, and 3 have also been
#' saved.
#'
#' @param vdsid the detector id
#' @param year the year
#' @param fileprefix the file prefix for the plot file
#' @param trackingdb the tracking database; deliberately has no
#' default, to help track down a bug
#' @return TRUE or FALSE whether the doc (based on VDSID) has the attached file
#'
#' What this routine does is re-create the expected filename (I
#' should modularize that routine), figure out the doc id, and then
#' use couchUtils.R to hit the db and check whether the plot file
#' number 4 has been attached to the database yet.
#'
#' In my code I use this to test whether or not to redo the plots.
check.for.plot.attachment <- function(vdsid,year,
                                      fileprefix=NULL,
                                      trackingdb){
  imagefileprefix <- paste(vdsid,year,sep='_')
  if(!is.null(fileprefix)){
    imagefileprefix <- paste(vdsid,year,fileprefix,sep='_')
  }
  fourthfile <- paste(imagefileprefix,'004.png',sep='_')
  ## print(paste('checking for ',fourthfile,'in tracking doc'))
  return (rcouchutils::couch.has.attachment(trackingdb,vdsid,fourthfile))
}
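
## For example (a sketch, with an illustrative detector id and the
## trackingdb name used elsewhere in this file), the call
##
##   check.for.plot.attachment(1211682, 2012, fileprefix='imputed',
##                             trackingdb='vdsdata%2ftracking')
##
## looks for an attachment named '1211682_2012_imputed_004.png' on the
## tracking document for detector 1211682.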

#' Run the native Amelia plot method for VDS data and save the
#' resulting plots to the file system and CouchDB tracking database
#'
#' This calls Amelia's own plot function on the output object for the
#' requested variables.  It only works for VDS data of course; use the
#' fileprefix parameter to note what is being plotted in the saved
#' file names
#'
#'
#' @param aout the amelia output
#' @param plotvars the variables to plot
#' @param vdsid the VDS id
#' @param year the year
#' @param force.plot defaults to FALSE.  If FALSE and the plot
#' attachment already exists in the tracking database, plotting is
#' skipped; if TRUE, the plots are regenerated regardless
#' @param fileprefix helps name the output file, and also to find it.
#' By default the plot file will be named via the pattern
#'
#'     imagefileprefix <- paste(vdsid,year,sep='_')
#'
#' But if you include the fileprefix parameter, then the image file
#' naming will have the pattern
#'
#'     imagefileprefix <- paste(vdsid,year,fileprefix,sep='_')
#'
#' So you can add something like "imputed" to the file name to
#' differentiate the imputed plots from the input data plots.
#' @param path where to put the plots
#' @param trackingdb the CouchDB tracking database, used for checking
#' whether the plots have already been attached (no default)
#' @return files.to.attach the files that you need to send off to the
#' couchdb tracking database
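#' @examples
#' ## Hedged sketch: run the native Amelia plot method over the imputed
#' ## lane variables.  The aout object, detector id, path, and tracking
#' ## database name are illustrative assumptions.
#' \dontrun{
#' plotvars <- grep('^(o|n)(r|l)[0-9]', x=names(aout$imputations[[1]]),
#'                  perl=TRUE, value=TRUE)
#' files <- plot_amelia.plots(aout=aout, plotvars=plotvars,
#'                            vdsid=1211682, year=2012,
#'                            path='.', trackingdb='vdsdata\%2ftracking')
#' }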
#' @export
plot_amelia.plots  <- function(aout,plotvars,vdsid,year,force.plot=FALSE,
                               fileprefix='amelia',path='.',
                               trackingdb){

    if(!force.plot){
        have.plot <- check.for.plot.attachment(vdsid,year,
                                               fileprefix,
                                               trackingdb=trackingdb)
        if(have.plot){
            return (1)
        }
    }
    ## print('need to make plots')
    savepath <- paste(path,'images',sep='/')
    if(!file.exists(savepath)){dir.create(savepath)}
    savepath <- paste(savepath,vdsid,sep='/')
    if(!file.exists(savepath)){dir.create(savepath)}

    imagefileprefix <- paste(vdsid,year,sep='_')
    if(!is.null(fileprefix)){
        imagefileprefix <- paste(vdsid,year,fileprefix,sep='_')
    }

    imagefilename <- paste(savepath,paste(imagefileprefix,'%03d.png',sep='_'),sep='/')

    png(filename = imagefilename, width=1600, height=1200, bg="transparent",pointsize=24)
    plotoutput <- plot(aout,which.vars=plotvars,ask=FALSE)
    print(plotoutput)

    dev.off()

    files.to.attach <- dir(savepath,pattern=paste(imagefileprefix,'0',sep='_'),full.names=TRUE,all.files=TRUE)

    return(files.to.attach)

}



#' Plot VDS data and save the resulting plots to the file system and
#' CouchDB tracking database
#'
#' This is more or less a generic function to plot data either before
#' or after running Amelia.  It only works for VDS data of course, but
#' it doesn't care if imputations have been done or not, so indicate
#' so by including a note in the fileprefix parameter
#'
#'
#' @param df.merged the dataframe to plot
#' @param vdsid the VDS id
#' @param year the year
#' @param fileprefix helps name the output file, and also to find it.
#' By default the plot file will be named via the pattern
#'
#'     imagefileprefix <- paste(vdsid,year,sep='_')
#'
#' But if you include the fileprefix parameter, then the image file
#' naming will have the pattern
#'
#'     imagefileprefix <- paste(vdsid,year,fileprefix,sep='_')
#'
#'     So you can add something like "imputed" to the file name to
#'     differentiate the imputed plots from the input data plots.
#' @param subhead Written on the plot
#' @param force.plot defaults to FALSE.  If FALSE and the plot
#'     attachment already exists in the tracking database, plotting is
#'     skipped; if TRUE, the plots are regenerated regardless
#' @param path where to put the plots
#' @param trackingdb the CouchDB tracking database, used for checking
#'     whether the plots have already been attached (no default)
#' @return files.to.attach the files that you need to send off to the
#'     couchdb tracking database
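#' @examples
#' ## Hedged sketch: plot an already-condensed hourly dataframe.  The
#' ## dataframe, detector id, and tracking database name are
#' ## illustrative assumptions.
#' \dontrun{
#' files <- plot_vds.data(df.hourly, vdsid=1211682, year=2012,
#'                        fileprefix='imputed',
#'                        path='.',
#'                        trackingdb='vdsdata\%2ftracking')
#' }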
#' @export
#'
plot_vds.data  <- function(df.merged,vdsid,year,fileprefix=NULL,subhead='\npost imputation',force.plot=FALSE,path='.',trackingdb){
    if(!force.plot){
        have.plot <- check.for.plot.attachment(vdsid,year,
                                               fileprefix,
                                               trackingdb=trackingdb)
        if(have.plot){
            return (1)
        }
    }
    ## print('need to make plots')
    varnames <- names(df.merged)
    ## make some diagnostic plots

    z <- difftime(df.merged$ts[2],df.merged$ts[1])
    formatted_time <- format(z)
    ## fix the "1 hours" formatting irritant
    if(as.numeric(z,units='secs') == 3600){
        ## one hour
        formatted_time <- '1 hour'
    }

    ## set up a reconfigured dataframe
    recoded <- recode.df.vds( df.merged )
    print('summary of data about to be plotted:')
    print(summary(recoded))

    ## for coloring occ
    occmidpoint <- mean(sqrt(recoded$occupancy),na.rm=TRUE)
    volmidpoint <- mean((recoded$volume),na.rm=TRUE)
    daymidpoint <- 12

    savepath <- paste(path,'images',sep='/')
    if(!file.exists(savepath)){dir.create(savepath)}
    savepath <- paste(savepath,vdsid,sep='/')
    if(!file.exists(savepath)){dir.create(savepath)}

    imagefileprefix <- paste(vdsid,year,sep='_')
    if(!is.null(fileprefix)){
        imagefileprefix <- paste(vdsid,year,fileprefix,sep='_')
    }

    imagefilename <- paste(savepath,paste(imagefileprefix,'%03d.png',sep='_'),sep='/')

    ## print(paste('plotting to',imagefilename))

    numlanes <- length(levels(recoded$lane))
    plotheight = 400 * numlanes
    png(filename = imagefilename, width=1600, height=plotheight, bg="transparent",pointsize=24)

    p <- ggplot2::ggplot(recoded)

    q <- p +
        ggplot2::labs(list(title=paste(formatted_time,"volume in each lane, by time of day and day of week, for",year,"at site",vdsid,subhead),
                           x="time of day",
                           y="hourly volume per lane")) +
        ggplot2::geom_point(ggplot2::aes(x = tod, y = volume,  color=occupancy),alpha=0.7) +
        ggplot2::facet_grid(lane~day)+
        ggplot2::scale_color_gradient2(midpoint=occmidpoint,high=("blue"), mid=("red"), low=("yellow"))

    print(q)
    q <- p +
        ggplot2::labs(list(title=paste(formatted_time,"avg occupancy in each lane, by time of day and day of week, for",year,"at site",vdsid,subhead),
                           x="time of day",
                           y="hourly avg occupancy per lane")) +
        ggplot2::geom_point(ggplot2::aes(x = tod, y = occupancy,  color=volume),alpha=0.7) +
        ggplot2::facet_grid(lane~day)+
        ggplot2::scale_color_gradient2(midpoint=volmidpoint,high=("blue"), mid=("red"), low=("yellow"))

    print(q)


    q <- p +
        ggplot2::labs(list(title=paste(formatted_time,"volume vs occupancy in each lane, by day of week, for",year,"at site",vdsid,subhead),
                           y="hourly volume per lane",
                           x="hourly avg occupancy per lane")) +
        ggplot2::geom_point(ggplot2::aes(y = volume, x = occupancy,  color=tod),alpha=0.7) +
        ggplot2::facet_grid(lane~day)+
        ggplot2::scale_color_gradient2(midpoint=daymidpoint,high=("blue"), mid=("red"), low=("lightblue"),name="hour")

    print(q)

    ## this last plot is gross if too many time periods
    if(length(levels(as.factor(recoded$tod)))<30){

        q <- p +
            ggplot2::labs(list(title=paste(formatted_time,"volume vs time in each lane, by hour of day, for",year,"at site",vdsid,subhead),
                               y="hourly volume per lane",
                               x="date")) +
            ggplot2::geom_point(ggplot2::aes(x = ts, y = volume,  color=lane),alpha=0.6) +
            ggplot2::facet_wrap(~tod,ncol=4) +
            ggplot2::scale_color_hue()

    }else{
        q <- p +
            ggplot2::labs(list(title=paste(formatted_time,"volume vs time in each lane, by hour of day, for",year,"at site",vdsid,subhead),
                               y="hourly volume per lane",
                               x="date")) +
            ggplot2::geom_point(ggplot2::aes(x = ts, y = volume,  color=lane),alpha=0.6) +
            ggplot2::facet_grid(lane ~ .) +
            ggplot2::scale_color_hue()

    }

    print(q)

    dev.off()

    files.to.attach <- dir(savepath,pattern=paste(imagefileprefix,'0',sep='_'),full.names=TRUE,all.files=TRUE)

    files.to.attach
}

## plot.vds.data.oldway  <- function(df.merged,vdsid,year,fileprefix=NULL,subhead='\npost imputation',force.plot=FALSE){
##     if(!force.plot){
##         have.plot <- check.for.plot.attachment(vdsid,year,fileprefix,subhead)
##         if(have.plot){
##             return (1)
##         }
##     }
##   print('need to make plots')
##   varnames <- names(df.merged)
##   ## make some diagnostic plots
##   ## set up a reconfigured dataframe
##   plotvars <- grep('^n(r|l)\\d',x=varnames,perl=TRUE,value=TRUE)

##   recode <- data.frame()
##   for(nlane in plotvars){
##     lane <- substr(nlane,2,3)
##     olane <- paste('o',lane,sep='')
##     keepvars <- c(nlane,olane,'tod','day','ts')
##     subset <- data.frame(df.merged[,keepvars])
##     subset$lane <- lane
##     names(subset) <- c('n','o','tod','day','ts','lane')
##     if(length(recode)==0){
##       recode <- subset
##     }else{
##       recode <- rbind(recode,subset)
##     }
##   }


##   numlanes <- length(levels(as.factor(recode$lane)))
##   plotheight = 300 * numlanes

##   savepath <- 'images'
##   if(!file.exists(savepath)){dir.create(savepath)}
##   savepath <- paste(savepath,vdsid,sep='/')
##   if(!file.exists(savepath)){dir.create(savepath)}

##   imagefileprefix <- paste(vdsid,year,sep='_')
##   if(!is.null(fileprefix)){
##     imagefileprefix <- paste(vdsid,year,fileprefix,sep='_')
##   }

##   imagefilename <- paste(savepath,paste(imagefileprefix,'%03d.png',sep='_'),sep='/')

##   print(paste('plotting to',imagefilename))

##   png(filename = imagefilename, width=900, height=plotheight, bg="transparent",pointsize=24)

##   plotvars <- grep('^n',x=varnames,perl=TRUE,value=TRUE)
##   f <- formula(paste( paste(plotvars,collapse='+' ),' ~ tod | day'))
##   strip.function.a <- strip.custom(which.given=1,factor.levels=day.of.week, strip.levels = TRUE )
##   strip.function.b <- strip.custom(which.given=2,factor.levels=lane.defs[1:length(plotvars)], strip.levels = TRUE )
##   a <- xyplot( n ~ tod | day + lane, data=recode
##               ,main=paste("Scatterplot volume in each lane, by time of day and day of week, for ",year," at site",vdsid,subhead),
##               ,strip = function(...){
##                 strip.function.a(...)
##                 strip.function.b(...)
##               }
##               ,ylab=list(label='hourly volume, by lane', cex=2)
##               ,xlab=list(label='time of day', cex=2)
##               ,type='p' ## or 'l
##               ,pch='*'
##               ,scales=list(cex=1.8)
##               ,par.settings=simpleTheme(lty=1:length(plotvars),lwd=3)
##               ,panel=pf
##               ,auto.key = list(
##                  space='bottom',
##                  points = TRUE, lines = FALSE,columns=length(plotvars),padding.text=10,cex=2
##                  )
##               )
##   print(a)

##   plotvars <- grep('^o(r|l)\\d',x=varnames,perl=TRUE,value=TRUE)
##   f <- formula(paste( paste(plotvars,collapse='+' ),' ~ tod | day'))
##   a <- xyplot( o ~ tod | day + lane, data=recode
##               ,main=paste("Scatterplot occupancy in each lane, by time of day and day of week, for ",year," at site",vdsid,subhead)
##               ,strip = function(...){
##                 strip.function.a(...)
##                 strip.function.b(...)
##               }
##               ,ylab=list(label='hourly occupancy, by lane', cex=2)
##               ,xlab=list(label='time of day', cex=2)
##               ,type='p' ## or 'l'
##               ,pch='*'
##               ,scales=list(cex=1.8)
##               ,par.settings=simpleTheme(lty=1:length(plotvars),lwd=3)
##               ,panel=pf
##               ,auto.key = list(
##                  space='bottom',
##                  points = TRUE, lines = FALSE,columns=length(plotvars),padding.text=10,cex=2
##                  )
##               )

##   print(a)


##   a <- xyplot(n ~ o | day + lane, data=recode
##               ,main=paste("Scatterplot hourly volume vs occupancy per lane, by day of week, for ",year," at site",vdsid,subhead)
##               ,strip = function(...){
##                 strip.function.a(...)
##                 strip.function.b(...)
##               }
##               ,ylab=list(label='hourly volume', cex=2)
##               ,xlab=list(label='hourly occupancy', cex=2)
##               ,type='p' ## or 'l'
##               ,pch='*'
##               ,scales=list(cex=1.8)
##               ,par.settings=simpleTheme(lty=1:length(plotvars),lwd=3)
##               ,panel=pf
##               ,auto.key = list(
##                  space='bottom',
##                  points = TRUE, lines = FALSE,columns=length(plotvars),padding.text=10,cex=2
##                  )
##               )

##   print(a)

##   strip.function.b <- strip.custom(which.given=1,factor.levels=lane.defs[1:length(plotvars)], strip.levels = TRUE )

##   a <- xyplot(n ~ ts | lane, data=recode
##               ,main=paste("Scatterplot hourly volume vs time, per lane, for ",year," at site",vdsid,subhead)
##               ,strip = function(...){
##                 strip.function.b(...)
##               }
##               ,ylab=list(label='hourly volume', cex=2)
##               ,xlab=list(label='date', cex=2)
##               ,type='p' ## or 'l'
##               ,scales=list(cex=1.8)
##               ,par.settings=simpleTheme(lty=1:length(plotvars),lwd=3)
##               ,auto.key = list(
##                  space='bottom',
##                  points = TRUE, lines = FALSE,columns=length(plotvars),padding.text=10,cex=2
##                  )
##               )
##   print(a)

##   dev.off()

##   files.to.attach <- dir(savepath,pattern=paste(imagefileprefix,'0',sep='_'),full.names=TRUE,all.files=TRUE)
##   for(f2a in files.to.attach){
##     couch.attach(trackingdb,vdsid,f2a)
##   }
##   rm(recode)
##   gc()
##   TRUE
## }

#' Recode VDS dataframe for plotting
#'
#'
#' @param df the dataframe
#' @return a new dataframe melted into long format (one row per lane
#' per timestamp, with volume and occupancy columns) for easy plotting
recode.df.vds <- function( df ){
    varnames <- names(df)
    plotvars <- grep('^(n|o)(r|l)\\d+',x=varnames,perl=TRUE,value=TRUE)
    n.idx <- grep('^n',x=plotvars,perl=TRUE,value=TRUE)
    o.idx <- grep('^o',x=plotvars,perl=TRUE,value=TRUE)

    id.vars <- intersect(c('ts','tod','day','obs_count'),varnames)
    melt_1 <- reshape2::melt(data=df,
                             measure.vars=n.idx,
                             id.vars=id.vars,
                             variable.name='lane',
                             value.name='volume')
    melt_1$lane <- as.factor(substr(melt_1$lane,2,3))

    melt_2 <- reshape2::melt(data=df,
                             measure.vars=o.idx,
                             id.vars=id.vars,
                             variable.name='lane',
                             value.name='occupancy')

    melt_2$lane <- as.factor(substr(melt_2$lane,2,3))

    melded <- merge(x=melt_1,y=melt_2,all=TRUE)

    ## some useful factor things
    melded$day <- factor(melded$day,
                         labels=c('Su','Mo','Tu','We','Th','Fr','Sa'))

    ## lanes?

    lanes <- levels(as.factor(melded$lane))

    lane.names <- c("left","right")
    if(length(lanes)==1){
        lane.names <- c("right")
    }else{
        numbering <- length(lanes)
        if(length(lanes)>2){
            for(i in 3:numbering){
                lane.names[i]=paste("lane",(numbering-i+2))
            }
            ## a little switcheroo here
            lanes <- c(lanes[1],rev(lanes[-1]))
            lane.names <- c(lane.names[1],rev(lane.names[-1]))

        }
    }


    melded$lane <- factor(melded$lane,
                          levels=lanes,
                          labels=lane.names)

    melded
}
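
## A sketch of the recoding, kept as a comment so nothing runs on load.
## The single-lane column names (nr1, or1) and one week of hourly data
## are illustrative assumptions.
##
##   ts <- as.POSIXct('2012-01-02 00:00',tz='UTC') + 3600*(0:167)
##   ts.lt <- as.POSIXlt(ts)
##   df <- data.frame(ts=ts, tod=ts.lt$hour, day=ts.lt$wday,
##                    nr1=rpois(length(ts),20),
##                    or1=runif(length(ts),0,0.2))
##   recoded <- recode.df.vds(df)
##   ## recoded has columns ts, tod, day, lane, volume, occupancy, with
##   ## lane as a factor ('right' when there is just one lane)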

recode.df.vds.oldway <- function( df,plotvars ){
    recod_df <- NULL
    for(nlane in plotvars){
        lane <- substr(nlane,2,3)
        olane <- paste('o',lane,sep='')
        keepvars <- c(nlane,olane,'tod','day','ts')
        subset <- data.frame(df[,keepvars])
        subset$lane <- lane
        names(subset) <- c('n','o','tod','day','ts','lane')
        recod_df <- rbind(recod_df,subset)
    }
    recod_df
}