In sidjai/biosplit: Model for continental seasonal migration of insects

knitr::opts_chunk$set(fig.width=6.5, fig.height=3.5,fig.pos='center',echo=TRUE,comment='>',dpi=800)
library(ggplot2)
library(biosplit)

figList <- list()
source(paste(realWd,"docs/rmdFunctions.R",sep='/'))
theme_set(getNiceTheme())

meteoLoc <- "C:/Users/Siddarta.Jairam/Desktop/sid/MeteoInfo1.2"
goldLoc <- "C:/Program Files/Golden Software/Surfer 12"
RLoc <- "C:/Users/Siddarta.Jairam/Documents/R/R-3.2.0"

Setup

When the package is installed there is a blank "config.txt" file created in the package directory. This holds all the information about the run and the file folders. Ultimately, most of the file folders are optional but the parameters are vital to how the biological side of the program runs. One file folder that is necessary is the location of the HYSPLIT working directory and the installed "execs" file. This program assumes that the HYSPLIT is installed.

cfg <- loadConfig()
tail(names(cfg))

Make the grids of the different manipulation steps:

boBox <- raster::extent(cfg$xmin, cfg$xmax, cfg$ymin, cfg$ymax)
cropGrid <- raster::raster(boBox,
    crs = cfg$cropProj,resolution = cfg$spc)
niceGrid <- raster::projectExtent(cropGrid,cfg$niceProj)

Data Collection

Crop

Download the files directly from NASS cropscape

NASS2TIFs(cfg$CropFold[1], cfg$year, cropGrid)

convert the downloaded Tiff 30m blocks to 40 km blocks and get rid of projection

finCrop <- paste(cfg$CropFold[2],paste0("aggCrop_", cfg$year, ".nc"), sep='/')
rawCrop2nicenc(cfg$CropFold[1], finCrop, cropGrid)

add in Canada

addCanada <- "C:/Users/Siddarta.Jairam/Documents/Documentation/Untitled Polygon.kml"
finAddCrop <- gsub("[.]nc", "add.nc", finCrop)
addCropArea(finCrop, addCanada, cropAmt = 1, pathOut = finAddCrop)

Meterological

Go through the ARL data Require MeteoInfo

runMeteo <- makeRunFun(meteoLoc,"py")
runMeteo("ARL2GRD.py",
                 c(cfg$MetARLFold, cfg$proARLDir, cfg$wantedMetVars, recursive = TRUE), TRUE)

Get all the variables cleaned up unprojected and netCDFs.

#get the units for the variables (comes from readme of the met data used
unitsDict <- list( T02M = 'K', V10M = 'm/s North', TPP3 = 'm', SOLT = 'K')
vapply(cfg$wantedMetVars, function(var){
    namu <- paste0(var, "Fold")
    rawMet2nicenc(dirTreeIn = cfg[[namu]],
                                projKey = cfg$MetMappingLoc,
                                unit = unitsDict[[var]],
                                niceGrid = niceGrid)
    TRUE},TRUE)

Running the program

Transform predictive data into the derived variables used by the program

collectAprioriVars(cfg)

Call the model

runBiosplit(cfg)

Validation

Grab the trap data and get a list of all the trap locations and their captures

pathTrapXlsx <- "C:/Users/Siddarta.Jairam/Documents/Documentation/PestWatchDump.xlsx"

pathScrubTrap <- "C:/Users/Siddarta.Jairam/Documents/Documentation/scrubTrap.csv"
trapRec <- scrubTrap(pathTrapXlsx, cfg$year)

Go through the haplotype data and relate it back to the trap data for a smaller subset.

pathHapXlsx <- "C:/Users/Siddarta.Jairam/Documents/Documentation/hapDat.xlsx"
pathTrapDict <- "C:/Users/Siddarta.Jairam/Documents/Documentation/County2TrapLoc.csv"
csvTab <- scrubHaplo(pathHapXlsx, pathTrapDict)
csvTab <- scrubHaplo(pathHapXlsx, pathTrapDict, pathCsvOut = gsub("[.]xlsx", paste0(cfg$year, ".csv"), pathHapXlsx), onlyYr = cfg$year)

Summarize the validation on a per year basis in the form of a single column csv file. After this, one can further summarize the data using area of interests that either calculate the first occurrence in that area using the minimum date of arrival or the average mixing ratio that takes the sum of all the moths in that entered that area as a population.

pathHapSum <- "C:/Users/Siddarta.Jairam/Documents/Documentation/sumHapTrap.csv"
sumTab <- summarizeValid(trapRec, cfg$year, hapIn = csvTab)
valAreaStats(pathHapSum,"C:/Users/Siddarta.Jairam/Documents/Documentation/Central PA.kml", niceGrid)

Post-processing

Make the final csv tables comparing all the different validation and simulation data.

ncdf2trapdata(cfg$SimOutFold, cfg$TrapLoc[paste0(cfg$year)], shDoSum = TRUE)

runs <- c("runInterYear","runAbsFlightLimit","runFlightAndNight","runNightDurDel")
pushLoc <- paste('X:/2 WESTBROOK/Sid/Hysplit Out Moth table',cfg$year, sep='/')
vapply(runs, function(x){
    direc <- gsub(cfg$runName, x, cfg$SimOutFold)
    direc[2] <- paste(pushLoc, x, sep='/')
    ncdf2trapdata(direc[1], cfg$TrapLoc[paste0(cfg$year)])
    pathCSV <- paste0(direc, "/TrapFinal")
    vapply(c('H.csv','V.csv'), function(x){

        file.copy(paste0(pathCSV[1],x), paste0(pathCSV[2],x), overwrite = TRUE)
    },TRUE)

},rep(TRUE,2))

You can also do the regional statistics on the simulation data using a similar scheme.

simAreaStats(cfg$SimOutFold, "C:/Users/Siddarta.Jairam/Documents/Documentation/Central PA.kml")

Spatially average the output over the whole grid. This compares the trap data on the same scale as the simulation data.

predObv <- ncdf2trapgrid(cfg$SimOutFold,
                                                "C:/Users/Siddarta.Jairam/Documents/Documentation/Result Files/firstOccTrap.grd",
                                                paste(cfg$SimOutFold, "Sim-TrapFirstOcc.nc",sep='/'))
#The spatial average distance between the weeks
mean(abs(predObv[,,]),na.rm=TRUE)

Along with the general comparison of the rasters, one can look at the difference with the first occurrence and the mixing ratios in both cases spatially averaged to the base grid size. This now is the third way of calculating these values for three different spatial scales (point and time specific, regional over the year, spatially averaged per point over the year).

ncdf2trapgrid(cfg$SimOutFold,
                            pathOut = paste(cfg$SimOutFold, "SimFirstOcc.nc",sep='/'),
                            shUseSum = TRUE)
ncdf2trapgrid(cfg$SimOutFold,
                            pathOut = paste(cfg$SimOutFold, "SimMixRatio.nc",sep='/'),
                            shDoMix = TRUE,
                            shUseSum = TRUE)

Visualization

There are 3 different maps used in this analysis

Contour maps of the simulation weekly output snapshots
Class post maps of the exact location of the simulation output
Contour plots of the result of post-processing with overlays of the validation data

These are done using a provided script running function to call scripter as follows:

runScripter <- makeRunFun(goldLoc, "BAS")

The contour maps are the first thing to look at with a given simulation. You can see the progress through the year and have a good idea about mixing areas and population densities. Mixing areas are hard to ascertain but first occurrence is mapped precisely.

runScripter("Mothtxt2Contour.BAS",cfg$SimOutFold)

The contours are made for almost thumbnail- like size for use in the autoreport. This means that areas have been smoothed and complexities have been eliminated. To get a clearer look at what is happening in each grid cell the class post maps are used. All the complexities are shown in gory details including individual populations in a specific grid point. The downside is that these are overlayed on top of each other which makes the graph extremely messy after June or so. These are not ready for publication and should be used as raw verification outputs.

runScripter("Mothtxt2ClassPost.BAS",cfg$SimOutFold)

After the post-processing is done the last graph is used. Without too much manipulation, these graphs were arrived upon for showing the data as they are. It takes an netCDF output of one of the statistics used, mixing ratio or first occurrence as well as a table of scrubbed validation data. It has flags to manipulate the number of bins and if color is used.

pathMix <- paste(cfg$SimOutFold, "SimMixRatio.nc",sep='/')
pathFirst <- paste(cfg$SimOutFold, "SimFirstOcc.nc",sep='/')
pathHap <- "C:/Users/Siddarta.Jairam/Documents/Documentation/hapDat2013.csv"
pathSum <- "C:/Users/Siddarta.Jairam/Documents/Documentation/sumTrapHap2013.csv"

runScripter("ncdf2contour.BAS", c(pathMap, 3, "bw", pathHap, 3, 2, 8, 1))
runScripter("ncdf2contour.BAS", c(pathFirst, 4, "bw", pathSum, 3, 2, 4, 0, "First Occ (PestWatch wk)"))

The order of the flags (the character vector given after the script name) are:

Simulation netCDF location
Number of levels = 5
Color or bw? = BW
Class post table location
Longitude column = 1
Latitude column = 2
Z column = 3
offset? = 0
Class post title = "Haplotype Ratio (h2/h4)"

Tools and documentation

For the pre-processing, this document should suffice as the individual steps may change with the specific project. As with all the functions, roxygen comments are used to produce the final man pages for more detailed documentation.

An attempt to summarize the biophysical models that go into the model can be found in "docs/BiologicalEqs.rmd". An explanation of the HYSPLIT model can be located at the NOAA website.

batchRun.R in the scripts folder details the modeling steps while running a multiple runs in a single source call.

The most complete collection of the post processing steps and scientific underpinnings can be found in "docs/AutoReport.rmd". This is a parameterized report that compares two or more runs with each other and largely runs through the post-processing steps. It has a required parameter other than the specific runs so the call in the final output grid as follows:

rmarkdown::render(system.file("docs", "AutoReport.rmd", package = "biosplit"),
    params = list(
    runs = rep("runWithCanada",4),
    years = c(2011, 2012, 2013, 2014),
    niceGrid = niceGrid
    )
)