knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.path = "READMEfigs/"
)
package.root <- "~/git/pkgButterfly"
data.input.folder <- paste0( package.root, "/nogit/data_input" )
data.output.folder <- paste0( package.root, "/nogit/data_output" )

pkgButterfly

Install and load

Install pkgButterfly from github if not installed yet.

devtools::install_github( "rossholmberg/pkgButterfly" )

Load the package as well as a few other packages we'll use in this process.

library( pkgButterfly )
library( magrittr )
library( data.table )
library( plyr )

Chlorophyll analysis

The parameters to the costCalcMaster function are as follows.

Central point from which foraging occurs (note areas of land will not be taken into account, so a coastline point is suitable here)

colony <- c( 145.2, -38.8 )
central.foraging.area <- c( 145.014879, -38.752039 )

A file containing currents data needs to be input to the calculations. This data will be used to calculate an "area of influence" (a.k.a "butterfly", due to a shape of which this area may take the form).

currents.file <- paste0( data.input.folder, "/dataset-armor-3d-v4-cmems-v2_1486508039772.nc" )

A folder to be used to temporarily store data created during processing.

temp.output.folder <- paste0( data.output.folder, "/Rtemp/" )

We should also decide on how much leeway to give when matching dates. It is often necessary to find a nearest date match, when no exact match is available. In those cases, how much leeway do we want to allow before giving up (and returning NA)?

buffer.days <- 30

If variable names are logical within the .nc file, costCalcMaster will find them automatically. If any are strangely names, we may need to specify them in the function call. To get a list of variable names within the file, we can run:

nc_variableNames( currents.file )

Note the names in this file are fairly logical. We have "lat", "lon", "ZonalCurrent", and "MeridionalCurrent". There is also a "depth" variable here, meaning there may be more than one depth value available. We will only analyse one depth value (multiple values can be analysed by calling costCalcMaster within, for example, lapply or similar). If more than one depth value is available, it will be necessary to specify which depth to use in the analysis, and which dimension in the currents data represents that depth dimension. We can look at the depths available in the file:

input.data <- ncdf4::nc_open( currents.file )
depths <- ncdf4::ncvar_get( input.data, varid = "depth" )
depths

And we can look at the dimensions of the currents data:

currents.dim <- dim( ncdf4::ncvar_get( input.data, varid = "zvelocity" ) )
# we can now close the file
ncdf4::nc_close( input.data )
currents.dim

Notice that there is only one dimension in that data array matching the length of the depths vector. That dimension (here "3") is what the creators of this .nc file have deemed the "depth" dimension. Our next call will need this value. Note that in a case like this, where only one dimension matches the size of the depth vector, we can obtain the apropriate value programmatically like so (where there is more than one match, the user will need to input specifically which one represents depths):

depth.dimension <- which( currents.dim == length( depths ) )
depth.dimension

Run the main "butterfly" analysis. This can be a very long process. Expect this step to take 10-90mins, depending on the size of the dataset, and the parameters set on input (particularly dates.range and cell.size).

conductance.table <- costCalcMaster( output.folder = temp.output.folder,
                                     # dates.range = as.Date( c( "2015-01-01", "2015-08-30" ) ), # uncomment to limit dataset
                                     buffer.days = buffer.days,
                                     currents.file = currents.file,
                                     depth.touse = 0,
                                     depth.dimension = depth.dimension,
                                     POI = central.foraging.area,
                                     foraging.distance = 50,
                                     cell.size = 0.1,
                                     circuitscape.link = ifelse( Sys.info()['sysname'] == "Windows",
                                                                 'C:/"Program Files"/Circuitscape/cs_run.exe',
                                                                 'python2.7' ),
                                     parallel = 6L
)
gc() # collect garbage to clear RAM
saveRDS( conductance.table, paste0( data.output.folder, "/condtable.rds" ) )
conductance.table <- readRDS( paste0( data.output.folder, "/condtable.rds" ) )

In some cases, the "dates" values, which now reside in the column names of conductance.table, are not in a logical format. It may be a good idea to change these accordingly. Here we use the ncdf4 package to read in what the creators of this file designated for its time format.

# get all of the element names (this will include every data value as well)
ncfile <- ncdf4::nc_open( currents.file )
allnames <- names( unlist( ncfile ) )

# remove those names ending with a number
# (these are generally repeated names, meaning they represent data itself, rather than a name)
names <- allnames[ !grepl( "[0-9]$", allnames ) ]

# keep only names beginning with "dim". This *should* represent the dimension values,
# meaning it's the place to look for information about the dimensions of the data
names[ grepl( "^dim", names ) ]

Note that the above represents every available element of the dataset, with nesting levels separated by ".". Note the values beginning with "dim.time" here represent the timeseries data (this may sometimes be named "date", "timestamp", etc. depending on the decisions of the file's creators).

Looking through the names above, it looks like "dim.time.units" is probably what we're looking for. We can look at that element by replacing the "." characters with "$" in an R call, to specify nested list elements:

ncfile$dim$time$units
ncdf4::nc_close( ncfile )

Now we see what the .nc file's creators decided as the most appropriate formatting structure for timeseries values. In this case they have represented those values as numeric, specifying the number of "hours since 1950-01-01".

We can use that information to replace the current column names to useful date values. Note we're only changing the values for columns starting from 3, since the first 2 columns are "lat" and "lon"

library( magrittr )
names( conductance.table )[ 3:ncol( conductance.table ) ] %<>%
    as.numeric() %>% # make sure they're numeric
    divide_by( 24 ) %>% # convert hours to days
    as.Date( origin = "1950-01-01" ) %>% # convert to date, specifying origin
    as.character() # convert to character for use as column names

So now the column names in our main dataset should be in a logical date format.

head( names( conductance.table ), 6 )

We can also look at the conductance data itseld. The best way to do this is visually as a spatial plot. Here, we look at a single point in the butterfly over time.

pkgButterfly::plotButterfly( conductance.table, round( ncol( conductance.table ) - 2L ) * 0.5 )

We will specify only one Chlorophyll data file for processing here. It may be worthwhile at times to pass multiple Chlorophyll files for processing against a given conductance dataset, which can be achieved using a looping call such as plyr::ldply. Below we will use plyr::ldply simply to allow for this possibility should it be needed.

chl.file <- paste0( data.input.folder, "/dataset-global-analysis-forecast-bio-001-014_1486686773262.nc" )

The pkgButterfly functions will attempt to find logical ways in which data has been stored within the given .nc file. However, since there doesn't seem to be standard for how to store data in .nc files (we should standardise this!), we may need to extract certain things from the file manually. In this case, we'll extract the timeseries data maually, convert it to a logical date format, then pass that to the next function chlorophyllCalc to override any automated extraction of that.

variables.available <- nc_variableNames( chl.file )
variables.available

This time, the timeseries data has been named "time_counter"

ncfile <- ncdf4::nc_open( chl.file )
times <- ncdf4::ncvar_get( ncfile, "time_counter" )
head( times )

Note again that the timeseries is not in a logical date format, so we must again look into the file to see how they formatted this (this time without the comments, look at the previous similar process for details on each step here).

allnames <- names( unlist( ncfile ) )
names <- allnames[ !grepl( "[0-9]$", allnames ) ]
names[ grepl( "^dim", names ) ]

Seeing "dim.time_counter.units", we can now find the date syntax used in this file.

ncfile$dim$time_counter$units
ncdf4::nc_close( ncfile )

Note that as with the currents file (both of these files came from the same data source, the European Union's "Copernicus" repository), we need to convert the times from a numeric "hourse since 1950-01-01 00:00:00":

library( magrittr )
times %<>%
    as.numeric() %>%
    divide_by( 24 ) %>%
    as.Date( origin = "1950-01-01" )
head( times )

Then process the chlorophyll file to achieve an output chlorophyll measurement for each available date. Note that each dated chlorophyll matrix must have conductance data against which to apply the algorithm. We can specify a buffer time as max.day.diff which will allow some leeway here, such that slight mismatches in conductance data and chlorophyll data can be tolerated (either a single match, or extrapolated data between nearest before and after data points will be used).

Note that in the following call we use depth.dimension = NULL (let the function attempt to automatically find which dimension represents the depth variable) and depth.touse = NULL (use, by default, the shallowest depth value available). Each of these values can be left out of the following call, since NULL is the default for both; they are shown here only to highlight them for your awareness.

dates.chlorophyll <- plyr::ldply( .data = chl.file,
                                  .fun = chlorophyllCalc,
                                  data.variable = "chl",
                                  depth.dimension = NULL,
                                  depth.touse = NULL,
                                  timeseries = times,
                                  conductance.data = conductance.table,
                                  max.day.diff = buffer.days,
                                  .parallel = FALSE ) %>%
  setDT() %>%
  # for duplicates (useful in cases where more than one file has been analysed) take an average
  .[ , mean( mean.chlorophyll ), by = date ] %>%
  # sort by date
  setorder( date ) %>%
  # make sure the column names are appropriate
  setnames( c( "date", "mean.chlorophyll" ) )
gc() # collect garbage to clear RAM
saveRDS( dates.chlorophyll, paste0( data.output.folder, "/dateschl.rds" ) )
dates.chlorophyll <- readRDS( paste0( data.output.folder, "/dateschl.rds" ) )

What we're left with is a data frame displaying dates and chlorophyll values.

head( dates.chlorophyll )

We can now plot the results:

library( ggplot2 )
ggplot( data = dates.chlorophyll, mapping = aes( x = as.Date( date ), y = mean.chlorophyll ) ) +
    geom_point() +
    geom_smooth( method = "loess", span = 0.2 ) +
    ggtitle( "Mean Chlorophyll concentration, based on local dynamic Butterfly" ) +
    xlab( "Date" ) + ylab( "Chlorophyll concentration (mg/m^3)" )

The above shows interpolated chlorophyll level over time, for the specified area of interest (a radius of foraging.distance km around POI).

Analysis of other variables

Note that what we've done so far is to compute a "Butterfly" area, then apply that area to interpolate available Chlorophyll data in the most meaningful way. This is what pkButterfly was designed to do, being based on the research of Afan et.al on Chlorophyll specifically.

If we so wish however, we can apply the butterfly area to variables other than Chlorophyll. For example, the file referenced here contains data for "net_primary_productivity_of_carbon" as well:

dates.PP <- plyr::ldply( .data = chl.file,
                         .fun = chlorophyllCalc,
                         data.variable = "PP",
                         timeseries = times,
                         conductance.data = conductance.table,
                         max.day.diff = buffer.days,
                         .parallel = FALSE ) %>%
    setDT() %>%
    # for duplicates (useful in cases where more than one file has been analysed) take an average
    .[ , mean( mean.chlorophyll ), by = date ] %>%
    # sort by date
    setorder( date ) %>%
    # make sure the column names are appropriate
    setnames( c( "date", "mean.pp" ) )
gc() # collect garbage to clear RAM
saveRDS( dates.PP, paste0( data.output.folder, "/datespp.rds" ) )
dates.PP <- readRDS( paste0( data.output.folder, "/datespp.rds" ) )
library( ggplot2 )
ggplot( data = dates.PP, mapping = aes( x = as.Date( date ), y = mean.pp ) ) +
    geom_point() +
    geom_smooth( method = "loess", span = 0.2 ) +
    ggtitle( "Mean Primary Production, based on local dynamic Butterfly" ) +
    xlab( "Date" ) + ylab( "net Primary production of Carbon (g/m^3/day)" )

Likewise, we have included in this file the data for "mole_concentration_of_phytoplankton_expressed_as_carbon_in_sea_water":

dates.Phyto <- plyr::ldply( .data = chl.file,
                         .fun = chlorophyllCalc,
                         data.variable = "PHYC",
                         timeseries = times,
                         conductance.data = conductance.table,
                         max.day.diff = buffer.days,
                         .parallel = FALSE ) %>%
    setDT() %>%
    # for duplicates (useful in cases where more than one file has been analysed) take an average
    .[ , mean( mean.chlorophyll ), by = date ] %>%
    # sort by date
    setorder( date ) %>%
    # make sure the column names are appropriate
    setnames( c( "date", "mean.phyto" ) )
gc() # collect garbage to clear RAM
saveRDS( dates.Phyto, paste0( data.output.folder, "/datesphyto.rds" ) )
dates.Phyto <- readRDS( paste0( data.output.folder, "/datesphyto.rds" ) )
library( ggplot2 )
ggplot( data = dates.Phyto, mapping = aes( x = as.Date( date ), y = mean.phyto ) ) +
    geom_point() +
    geom_smooth( method = "loess", span = 0.2 ) +
    ggtitle( "Mean Phytoplankton, based on local dynamic Butterfly" ) +
    xlab( "Date" ) + ylab( "Phytoplankton concentration (mmol/m^3)" )


rossholmberg/pkgButterfly documentation built on May 27, 2019, 11:36 p.m.