grid_data: Process CalVAD 4km grid data

##' A little script to save typing out the same bit of sql
##'
##' @title select.grids.in.basin
##' @param basin the basin
##' @return a string for use as a with or select statement
##' @author James E. Marca
select.grids.in.basin <- function(basin){

    select_statement <- paste(
        "select i_cell,j_cell,"
       ,"st_centroid(grids.geom4326) as centroid"
       ,", grids.geom4326 as geom4326"
       ," from carbgrid.state4k grids ,public.carb_airbasins_aligned_03 basins"
       ," where ab='",basin
       ,"' and st_contains(basins.geom_4326,st_centroid(grids.geom4326))"
        ##        ,"' and basins.geom_4326 && grids.geom4326"
       ,sep='')
    if(basin ==  'California'){

        select_statement <- paste(
            "select i_cell,j_cell,"
           ,"st_centroid(grids.geom4326) as centroid"
           ,", grids.geom4326 as geom4326"
           ," from carbgrid.state4k grids"
           ,sep='')

    }
    return (select_statement)
}

##' Get all the grids in an airbasin shape with hpms data
##'
##' This function will hit postgresql and fetch all of the grid cells
##' that lie inside of the passed-in airbasin shape.  If you stare at
##' the query, you should see that the rule "inside" is that the
##' centroid of the grid lies inside of the shape.
##'
##' @title get.grids.with.hpms
##' @param basin the name of the basin (two letter abbreviation)
##' @param hpms_geom_table the hpms table to use defaults to hpms.hpms_geom
##' @return the result of the query: rows of i_cell,j_cell, centroid
##'     lon, centroid lat
##' @author James E. Marca
get.grids.with.hpms <- function(basin,hpms_geom_table='hpms.hpms_geom'){
    if(is.null(hpms_geom_table) || hpms_geom_table == ''){
        hpms_geom_table <- 'hpms.hpms_geom'
    }
    grid.with = paste("with basingrids as (",select.grids.in.basin(basin),")",sep='')
    ## select grid cells with hpm records in them
    grid.query <- paste(grid.with
                       ," select i_cell,j_cell,st_x(centroid) as lon, st_y(centroid) as lat"
                       ," from basingrids"
                       ," join ",hpms_geom_table," hg on st_intersects(basingrids.geom4326,hg.geom)"
                       ##," join hpms.hpms_link_geom hd on (hg.id=hd.geo_id)"
                       ," group by i_cell,j_cell,lon,lat"
                       ,sep='')
    print(grid.query)
    rs <- RPostgreSQL::dbSendQuery(spatialvds.con,grid.query)
    df.grid <- RPostgreSQL::fetch(rs,n=-1)
    df.grid$geo_id <- paste(df.grid$i_cell,df.grid$j_cell,sep='_')


    df.grid
}


##' Get all the grids in an airbasin shape with hpms data
##'
##' This function will first try to get from fs, then couchdb, then
##' will call postgresql version.
##'
##' @title load.grids.with.hpms
##' @param basin the name of the basin (two letter abbreviation)
##' @param year the year, for couchdb call/save
##' @return the result of the query: rows of i_cell,j_cell, centroid
##'     lon, centroid lat
##' @author James E. Marca
load.grids.with.hpms <- function(basin,year){

    df.grid <- load.grid.data.from.fs('hpms',basin,year)
    if(nrow(df.grid) == 0){
        df.grid <- load.grid.data.from.couchdb('hpms',basin,year)
    }
    if(nrow(df.grid) == 0){
        df.grid <- NULL
        if(year > 2010 && year <=  2015){
            df.grid <- get.grids.with.hpms(basin,config$postgresql$hpms_2014_table)
        }else{
            df.grid <- get.grids.with.hpms(basin,config$postgresql$hpms_table)
        }
        res <- attach.grid.data.to.couchdb('hpms',df.grid,basin,year)
        ## print(res)
    }
    return(df.grid)
}




##' Get grid cells overlapping highway detector segments (vds or wim).
##'
##' Get the grid cells that are likely to have data from the VDS/WIM detectors.
##'
##' @title get.grids.with.detectors
##' @param basin the air basin, two letter abbreviation
##' @return the results of the sql query, rows of
##'     {i_cell,j_cell,centroid lon,centroid lat}
##' @author James E. Marca
get.grids.with.detectors <- function(basin){
    grid.with = paste("with basingrids as (",select.grids.in.basin(basin),")",sep='')

    ## select grid cells with tdetectors records in them
    grid.query <- paste(grid.with
                       ," select i_cell,j_cell,st_x(centroid) as lon, st_y(centroid) as lat"
                       ," from basingrids"
                       ," join tempseg.tdetector ttd on  st_intersects(ttd.geom,geom4326)"
                       ," group by i_cell,j_cell,lon,lat"
                       ,sep='')
    ## print(grid.query)
    rs <- RPostgreSQL::dbSendQuery(spatialvds.con,grid.query)
    df.grid <- RPostgreSQL::fetch(rs,n=-1)
    df.grid$geo_id <- paste(df.grid$i_cell,df.grid$j_cell,sep='_')
    df.grid
}

##' Get all the grids in an airbasin shape with hwy data
##'
##' This function will first try to get from fs, then couchdb, then
##' will call postgresql version.
##'
##' @title load.grids.with.hwy
##' @param basin the name of the basin (two letter abbreviation)
##' @param year the year, for couchdb call/save
##' @return the result of the query: rows of i_cell,j_cell, centroid
##'     lon, centroid lat
##' @author James E. Marca
load.grids.with.hwy <- function(basin,year){

    df.grid <- load.grid.data.from.fs('hwy',basin,year)
    if(nrow(df.grid) == 0){
        df.grid <- load.grid.data.from.couchdb('hwy',basin,year)
    }
    if(nrow(df.grid) == 0){
        df.grid <- get.grids.with.detectors(basin)
        res <- attach.grid.data.to.couchdb('hwy',df.grid,basin,year)
        ## print(res)
    }
    return(df.grid)
}

##' Compute the manhattan distance between two lat, lon points
##'
##' @title compute.manhattan.distance
##' @param a a value with lat, lon
##' @param b a value with lat, lon
##' @return manhattan distance---that is, how many blocks over plus how many up
##' @author James E. Marca
compute.manhattan.distance <- function(a,b){
    abs(a$lat - b$lat) + abs(a$lon - b$lon)
}

##' Assign an HPMS grid cell to a cluster
##'
##' Assign an HPMS grid cell to a cluster.  Uses manhattan distance,
##' and clusters based on a next nearest neighbor rule.  Clustering is
##' needed because doing eerything at once chokes the ram and takes an
##' age to run.
##'
##' @title assign.hpms.grid.cell
##' @param centers the centers of clusters, for computing the intial distances
##' @return a function that will do the assigning.
##' @author James E. Marca
assign.hpms.grid.cell <- function(centers){
    assign.cell <- function(lon,lat=NULL){
        if(is.data.frame(lon)){
            lat <- lon$lat
            lon <- lon$lon
        }
        ## centers.distance <- ddply(centers,.(lat,lon,clustering),.fun=function(a){compute.manhattan.distance(a,hpms.cell)})
        centers.distance <- as.matrix(dist(matrix(c(lon,as.vector(centers$lon),lat,as.vector(centers$lat)),ncol=2,byrow=FALSE)))[,1]
        ## drop own distance
        centers.distance <-  centers.distance[-1]
        ## pick off which cluster has the min distance, ignoring ties
        assigned.cluster <- centers$clustering[centers.distance == min(centers.distance)]
        assigned.cluster <- c(assigned.cluster)[1]
        ##return (assigned.cluster)
        return(assigned.cluster)
        ##return(data.frame(cluster=assigned.cluster))
    }
}



##' The main function that does everything.
##'
##' @title runme
##' @return null
##' @author James E. Marca
##' @export
runme <- function(){

    year = as.numeric(Sys.getenv(c("CARB_GRID_YEAR"))[1])
    month = as.numeric(Sys.getenv(c("CARB_GRID_MONTH"))[1])
    day = as.numeric(Sys.getenv(c("CARB_GRID_DAY"))[1])
    basin = Sys.getenv(c("AIRBASIN"))[1]

    cl <- NULL
    df.hpms.grids <- NULL
    df.grid.data <- NULL

    retrieve.result <- fetch.outer.data(year,month,day,basin)


    if(length(retrieve.result) == 3){
        cl <- retrieve.result$cl
        df.hpms.grids <- retrieve.result$df.hpms.grids
        df.grid.data <- retrieve.result$df.grid.data
    }
    if(length(dim(df.hpms.grids))>0){
        # okay data saved
    }else{
        ## load data from couchdb attachments, if available
        df.grid <- load.grids.with.hwy(basin,year)

        ## cluster **ONLY** the grid cells with valid data
        data.count <- get.rowcount.of.grids(df.grid,
                                            day=day,month=month,year=year)

        df.grid.data <- df.grid[data.count>0,]



        if(nrow(df.grid.data) == 0){
            print('skipping, no data')
            return (0)
        }
        print('dim df hwy grid')
        print(dim(df.grid.data))


        df.hpms.grids <- load.grids.with.hpms(basin,year)
        print('dim df hpms grid')
        print(dim(df.hpms.grids))


        ## remember, fake days have no data, so you're safe here

        ## want clusters of about 20
        numclust <- ceiling(dim(df.grid.data)[1] / 20)
		print(paste('numclust before adjust is ',numclust,'num grid cells is',nrow(df.grid.data)))
        if(numclust > 10) numclust = 10
        print(paste('numclust is ',numclust,'num grid cells is',nrow(df.grid.data)))
        cl <- NULL
        if(numclust > 1){
            cl <- cluster::clara(as.matrix(df.grid.data[,c('lon','lat')]),numclust,pamLike = TRUE,samples=100)
            centers <- as.data.frame(cl$medoids)
            centers$clustering = cl$clustering[rownames(cl$medoids)]

            ## create the assigning function based on the clustered centers
            ascl <- assign.hpms.grid.cell(centers)
            df.hpms.grids$cluster <- -1
            for(i in 1:length(df.hpms.grids$lat)){
                df.hpms.grids$cluster[i] <- ascl(df.hpms.grids[i,])
            }
        }else{
            ## everything is in one cluster
            df.hpms.grids$cluster <- 1
            cl <- data.frame('clustering'=1)
        }

        ## stash all of: cl, df.hpms.grids, df.grid.data by year, month,
        ## day, basin

        stash(year,month,day,basin,cl,df.hpms.grids,df.grid.data)
    }

    print(paste('processing',basin,year,month,day))

    ## first make sure that the clusters are not too big.  if so, catch next pas
    numclust <- max(df.hpms.grids$cluster)

    returnval <- -2
    maxiter <- max(1,ceiling(10/numclust))
                                        # temporary hacking for all_california run
    maxiter <- 1
    print(paste('starting model loop with maxiter=',maxiter))

    for(cl.i in 1:numclust){
        print(paste('cluster',cl.i,'of',numclust))
        grid.idx <- cl$clustering==cl.i
        hpms.idx <- df.hpms.grids$cluster==cl.i
        somereturnval <- process.data.by.day(df.grid.data[grid.idx,]
                                            ,df.hpms.grids[hpms.idx,],year=year
                                            ,month=month
                                            ,day=day
                                            ,basin=paste(basin,cl.i,numclust,sep='_')
                                            ,maxiter=maxiter)
        returnval <- max(returnval,somereturnval )
        ## add a break statement here.  The california-wide modeling
        ## runs have very large clusters, so running multiple clusters
        ## per R job results in a *lot* of leaked RAM.  So break if
        ## there something productive was done here
        if(returnval >=  0){
            returnval <- 1 ## did something, quitting.  force a revisit to this date
            break()
        }
    }
	if(returnval == 0){
        ## save dummies to FS to reduce space
        ## stash(year,month,day,basin,list(),list(),list())
     returnval <- 10
    }
    if(returnval < 0){
        ## that means every iteration above returned "already
        ## done". but I don't want to return -1 to the caller, because
        ## that would be misunderstood.
        returnval <- 0
    }


    return (returnval)
}