In flahn/scidbst: Spatio Temporal Extension for SciDB (scidbst)

library(scidbst)
library(rgdal)
library(plyr)
source("/home/lahn/GitHub/scidbst/vignettes/assets/credentials.R")
scidbconnect(host=host,port=port,user=user,password=password,protocol="https",auth_type = "digest")

Aim and Goals

In this Use Case, we are going to estimate model parameter for a linear regression using the 8 neighboring pixel for temporal chunks of one year. The main use of this will be to create a use case for a simple spatio-temporal analysis using r_exec on SciDB.

First we are going to create a smaller subset of the TRMM data set to develop a function that is able to perform the mentioned task above. Then we will run the analysis on a spatial subset, before we apply the analysis to the whole dataset.

Assumptions

We will use a worldwide TRMM data set from 01.01.1998 to 01.01.2015.

The data is reparted under the following conditions:

the overlap in space is 1 pixel
the overlap in time is (number of years/4)+(1 or 2) -> (15/4)+2 = ca. 6 days
the time dimension is chunked into 365 days
the dimensions are added as attributes with a prefixed "dim", e.g. "dimx","dimy" and "dimt"

This structure allows us to guarantee that combined with the overlap the data will accomodate the leap years and it will contain one year of data per chunk.

RScript operation

Now I will state a first version of the function that will be applied to the data chunk using the previous lines of code. The procedures are the following:

resolve the dot parameter and make the reference variables available in the function
find the data with a one year time span based on the minimum boundary of the whole array
add the neighboring cells to the original observed data in preparation for adapting the linear regression model a. use a focal operation to create RasterLayers with the neighboring cells for each pixel b. retransform the layer stacks into a data.frame
calculate the linear models per pixel and store relevant information in a data.frame for output

Funcion details: x is the data.frame of the chunk and with ... we will pass the spatial and temporal reference parameter.

Resolve dot parameter

Using 'lapply' we establish the named variables in the functions environment ('linearRegressionNeighborhood') from the additional parameters. Those parameters are the spatio-temporal references of the scidbst object.

dot.input = list(...)

i <- 1
lapply(dot.input, function(x,y) {
    assign(x=y[i],value=x,envir=parent.env(environment()))
    i <<- i+1
    x},
    names(dot.input))
rm(i)

Restrict data to one exactly one year

Due to the overlap of the chunks and the leap year problem we cannot use equal 365 days years. So we need to find the first and last day of the year indices. To achieve that we assign a helper function that makes use of the sequence function for POSIXt timestamps ( seq.POSIXt). For the output later on we will calculate the number of the year, since the output will be a yearwise aggregation.

 # helper function to add a certain time to a date
addTime = function(x, i, what) {
    byText = paste(i,what)
    seq(x,by=byText,length=2)[2]
}

# determine the first and last index of the yearly subset in the chunk
first = 0
last = 0

while (! any(first %in% x$dimt) ) {
    indexDate = addTime(t0,first,tunit) # current date of first
    first <- as.numeric(difftime(addTime(indexDate,1,"year"),t0)) # 1 year later of indexDate
    last <- as.numeric(difftime(addTime(addTime(indexDate,2,"year"),-1,"day"),t0)) # 2 years later - 1 day from indexDate
}

numberOfYear = as.POSIXlt(addTime(t0,first,tunit))$year - as.POSIXlt(t0)$year

Prepare the data for calculation

At first we will create a data structure that contains the neighboring cells in addition to each original value. Since this is a raster im age operation we will create a daily spatial raster layer timeline in each chunk. Therefore we use a focal matrix operation on the raster to select the correct neighboring cells for each cell and stack them.

addNeighboringValues = function(x) {
    t = unique(x$dimt)
    x= x[,-which("dimt" == names(x))]
    coordinates(x) = ~dimx+dimy
    gridded(x) = TRUE
    layer = as(x,"RasterLayer")

    p1 = focal(layer, w=matrix(c(1,0,0, 0, 0, 0, 0, 0, 0),nr = 3, nc=3,byrow=TRUE))
    p2 = focal(layer, w=matrix(c(0,1,0, 0, 0, 0, 0, 0, 0),nr = 3, nc=3,byrow=TRUE))
    p3 = focal(layer, w=matrix(c(0,0,1, 0, 0, 0, 0, 0, 0),nr = 3, nc=3,byrow=TRUE))
    p4 = focal(layer, w=matrix(c(0,0,0, 1, 0, 0, 0, 0, 0),nr = 3, nc=3,byrow=TRUE))

    p6 = focal(layer, w=matrix(c(0,0,0, 0, 0, 1, 0, 0, 0),nr = 3, nc=3,byrow=TRUE))
    p7 = focal(layer, w=matrix(c(0,0,0, 0, 0, 0, 1, 0, 0),nr = 3, nc=3,byrow=TRUE))
    p8 = focal(layer, w=matrix(c(0,0,0, 0, 0, 0, 0, 1, 0),nr = 3, nc=3,byrow=TRUE))
    p9 = focal(layer, w=matrix(c(0,0,0, 0, 0, 0, 0, 0, 1),nr = 3, nc=3,byrow=TRUE))

    stack = stack(list(orig.values=layer,p1=p1,p2=p2,p3=p3,p4=p4,p6=p6,p7=p7,p8=p8,p9=p9))
    list(t=t,data=stack)
}
layer.timeline = dlply(x, .variables=c("dimt"), .fun = addNeighboringValues)

The output of the operation is a list of lists containing the time index 't' and the layer stack 'data'. To perform the linear regression algorithm we need to restructure the data as a data.frame. As a precaution we remove the any NA values that might interfere with a successful calculation.

rebuildDataFrame = function(x) {
    b = cbind(dimy=coordinates(x$data)[,"y"],dimx=coordinates(x$data)[,"x"],dimt=x$t,getValues(x$data))
    as.data.frame(b)
}
neighbor.df = ldply(layer.timeline[], .fun = rebuildDataFrame)
neighbor.df = na.omit(neighbor.df)

Running the linear regression and prepare output

Using again a plyr function we sort the data.frame ascending in time and adapt a linear regression model, based on the the neighboring cell values. For the output we will create a data.frame containing the coordinates, model parameter and some statistical quality values.

lmCalculation = function(x,year) {
    #sort by t ascending by time
    x = arrange(x,dimt)

    model = lm(orig.values~p1+p2+p3+p4+p6+p7+p8+p9, x,na.action=na.omit)
    c = coefficients(model)
    s = summary(model)
    data.frame(dimy=unique(x$dimy),
               dimx=unique(x$dimx),
               dimt=as(year*1.0,"double"),
               intercept = c[1],
               c_p1= c[2],
               c_p2= c[3],
               c_p3= c[4],
               c_p4= c[5],
               c_p6= c[6],
               c_p7= c[7],
               c_p8= c[8],
               c_p9= c[9],
               r_sq = s$r.squared,
               sum_res_sq = sum(s$residuals^2))
}
calculated.df = ddply(neighbor.df, .variables=c("dimy","dimx"), .fun = lmCalculation, year=numberOfYear)
calculated.df = na.omit(calculated.df)

Lastly, the complete function that will be integrated into the R-Script:

linearRegressionNeighborhood = function(x, ...) {
    #make the dots named parameters available in this function
    dot.input = list(...)

    i <- 1
    lapply(dot.input, function(x,y) {
        assign(x=y[i],value=x,envir=parent.env(environment()))
        i <<- i+1
        x},
        names(dot.input))
    rm(i)

    cat("processing chunk",file="/tmp/r_exec/lahn/trmm_linear.log",append=TRUE,sep="\n")
    additionalParams = names(list(...))
    cat(paste("Using additional parameter through ...:"),file="/tmp/r_exec/lahn/trmm_linear.log",append=TRUE,sep="\n")
    cat(paste(additionalParams,collapse=", "),file="/tmp/r_exec/lahn/trmm_linear.log",append=TRUE,sep="\n")

    # helper function to add a certain time to a date
    addTime = function(x, i, what) {
        byText = paste(i,what)
        seq(x,by=byText,length=2)[2]
    }

    # determine the first and last index of the yearly subset in the chunk
    first = 0
    last = 0

    while (! any(first %in% x$dimt) ) {
        indexDate = addTime(t0,first,tunit) # current date of first
        first <- as.numeric(difftime(addTime(indexDate,1,"year"),t0)) # 1 year later of indexDate
        last <- as.numeric(difftime(addTime(addTime(indexDate,2,"year"),-1,"day"),t0)) # 2 years later - 1 day from indexDate
    }

    numberOfYear = as.POSIXlt(addTime(t0,first,tunit))$year - as.POSIXlt(t0)$year


    # create a subset to use only data from one year

    x = na.omit(x[which(x$dimt %in% first:last),])

    # add the neighboring values to each of the pixel values using raster operations
    addNeighboringValues = function(x) {
        t = unique(x$dimt)
        x= x[,-which("dimt" == names(x))]
        coordinates(x) = ~dimx+dimy
        gridded(x) = TRUE
        layer = as(x,"RasterLayer")

        p1 = focal(layer, w=matrix(c(1,0,0, 0, 0, 0, 0, 0, 0),nr = 3, nc=3,byrow=TRUE))
        p2 = focal(layer, w=matrix(c(0,1,0, 0, 0, 0, 0, 0, 0),nr = 3, nc=3,byrow=TRUE))
        p3 = focal(layer, w=matrix(c(0,0,1, 0, 0, 0, 0, 0, 0),nr = 3, nc=3,byrow=TRUE))
        p4 = focal(layer, w=matrix(c(0,0,0, 1, 0, 0, 0, 0, 0),nr = 3, nc=3,byrow=TRUE))

        p6 = focal(layer, w=matrix(c(0,0,0, 0, 0, 1, 0, 0, 0),nr = 3, nc=3,byrow=TRUE))
        p7 = focal(layer, w=matrix(c(0,0,0, 0, 0, 0, 1, 0, 0),nr = 3, nc=3,byrow=TRUE))
        p8 = focal(layer, w=matrix(c(0,0,0, 0, 0, 0, 0, 1, 0),nr = 3, nc=3,byrow=TRUE))
        p9 = focal(layer, w=matrix(c(0,0,0, 0, 0, 0, 0, 0, 1),nr = 3, nc=3,byrow=TRUE))

        stack = stack(list(orig.values=layer,p1=p1,p2=p2,p3=p3,p4=p4,p6=p6,p7=p7,p8=p8,p9=p9))
        list(t=t,data=stack)
    }
    layer.timeline = dlply(x, .variables=c("dimt"), .fun = addNeighboringValues)

    # create a data.frame from the RasterStacks and omit the NA values in the overlap border
    # careful x and y are the assigned names for coordinates in package raster
    rebuildDataFrame = function(x) {
        b = cbind(dimy=coordinates(x$data)[,"y"],dimx=coordinates(x$data)[,"x"],dimt=x$t,getValues(x$data))
        as.data.frame(b)
    }
    neighbor.df = ldply(layer.timeline[], .fun = rebuildDataFrame)
    neighbor.df = na.omit(neighbor.df)

    # fit linear regression models on each pixel, using year as parameter
    lmCalculation = function(x,year) {
        #sort by t ascending by time
        x = arrange(x,dimt)

        model = lm(orig.values~p1+p2+p3+p4+p6+p7+p8+p9, x,na.action=na.omit)
        c = coefficients(model)
        s = summary(model)
        data.frame(dimy=unique(x$dimy),
                   dimx=unique(x$dimx),
                   dimt=as(year*1.0,"double"),
                   intercept = c[1],
                   c_p1= c[2],
                   c_p2= c[3],
                   c_p3= c[4],
                   c_p4= c[5],
                   c_p6= c[6],
                   c_p7= c[7],
                   c_p8= c[8],
                   c_p9= c[9],
                   r_sq = s$r.squared,
                   sum_res_sq = sum(s$residuals^2))
    }
    calculated.df = ddply(neighbor.df, .variables=c("dimy","dimx"), .fun = lmCalculation, year=numberOfYear)

    calculated.df = na.omit(calculated.df)
    cat(paste("Finished year:",numberOfYear),file="/tmp/r_exec/lahn/trmm_linear.log",append=TRUE,sep="\n")
    return(calculated.df)
}

Notes

Debugging the user defined R code will be somewhat hard, because r_exec in SciDB does not return detailed error information. Be advised to test your code locally on a small data chunk, before going "BIG". Also avoid the following pitfalls:

make sure that the resulting values are of type double: meaning remove all NA and multiply integer values with 1.0
when using the raster package remember that the labels for coordinates are "x" and "y"

Test the Script on a spatial subset

Prepare a TRMM spatial subset to run the query.

name = "trmm_ethiopia_sub_reparted"
arrays = scidbls()

if (! name %in% arrays) {
  trmm = scidbst("TRMM3B42_DAILY_PREC")
  ex = extent(32,46,1,15)
  trmm.sub = subarray(trmm,ex,between=TRUE)
  trmm.sub = scidbsteval(trmm.sub,"trmm_ethiopia_sub")
  trmm.sub = transform(trmm.sub,dimy="double(y)",dimx="double(x)",dimt="double(t)")
  trmm.reparted = repart(trmm.sub,schema="<band1:double,dimy:double,dimx:double,dimt:double> [y=0:399,30,1,x=0:1439,30,1,t=0:*,365,6]")
  trmm.reparted = scidbsteval(trmm.reparted,"trmm_ethiopia_sub_reparted")
} else {
  trmm.reparted = scidbst("trmm_ethiopia_sub_reparted")
}

Perform a dry-run and show the script that was passed to as expression to r_exec.

script <- r.apply(x=trmm.reparted,
                   f = linearRegressionNeighborhood,
                   array = "trmm_yearly_linear_model",
                   packages = c("raster","sp","rgdal","plyr"),
                   aggregates=c(),
                   output = list(dimy="double",
                                 dimx="double",
                                 dimt="double",
                                 intercept="double",
                                 c_p1= "double",
                                 c_p2= "double",
                                 c_p3= "double",
                                 c_p4= "double",
                                 c_p6= "double",
                                 c_p7= "double",
                                 c_p8= "double",
                                 c_p9= "double",
                                 r_sq = "double",
                                 sum_res_sq = "double"),
                   eval=FALSE,
                   return="rscript"
                   )
cat(script)

Run the function

sa.array <- r.apply(x=trmm.reparted,
                   f = linearRegressionNeighborhood,
                   array = "trmm_yearly_linear_model",
                   packages = c("raster","sp","rgdal","plyr"),
                   aggregates=c(),
                   output = list(dimy="double",
                                 dimx="double",
                                 dimt="double",
                                 intercept="double",
                                 c_p1= "double",
                                 c_p2= "double",
                                 c_p3= "double",
                                 c_p4= "double",
                                 c_p6= "double",
                                 c_p7= "double",
                                 c_p8= "double",
                                 c_p9= "double",
                                 r_sq = "double",
                                 sum_res_sq = "double")
                   )

trmm.reparted = scidbst("trmm_ethiopia_sub_reparted")
img.1 = as(aggregate(subset(trmm.reparted,"t>=0 and t < 365"),by=c("x","y"),"avg(band1)"),"RasterBrick")
img.2 = as(aggregate(subset(trmm.reparted,"t>=365 and t < 730"),by=c("x","y"),"avg(band1)"),"RasterBrick")
img.3 = as(aggregate(subset(trmm.reparted,"t>=730 and t < 1096"),by=c("x","y"),"avg(band1)"),"RasterBrick")
img.4 = as(aggregate(subset(trmm.reparted,"t>=1096 and t < 1461"),by=c("x","y"),"avg(band1)"),"RasterBrick")

spplot(img)

array = scidbst("trmm_yearly_linear_model")
estimateFileSize(array)
df = as(array,"data.frame")
createRasterLayer = function(x) {
  x = suppressWarnings( x[,-which("i" == names(x))])
  t = unique(x[,"dimt"])
  coordinates(x) = ~dimx+dimy
  gridded(x) <- TRUE
  stack = as(x,"RasterStack")
  list(t=t,data=stack)
}
l = dlply(df,.variables=c("dimt"),.fun=createRasterLayer)


spplot(stack(img.1,img.2,img.3,img.4))
spplot(stack(subset(l[[1]]$data,"sum_res_sq"),
             subset(l[[2]]$data,"sum_res_sq"),
             subset(l[[3]]$data,"sum_res_sq"),
             subset(l[[4]]$data,"sum_res_sq")))

aplot = function(l, var) {
  spplot(stack(subset(l[[1]]$data,var),
             subset(l[[2]]$data,var),
             subset(l[[3]]$data,var),
             subset(l[[4]]$data,var)))
}

aplot(l, "c_p1")
aplot(l, "c_p2")
aplot(l, "c_p3")
aplot(l, "c_p4")
aplot(l, "c_p6")
aplot(l, "c_p7")
aplot(l, "c_p8")
aplot(l, "c_p9")


cp1.mod = cp1.3
cp1.mod[cp1.mod > 6] <- NA 
spplot(cp1.mod)
cp1.3 = subset(l[[3]]$data,"c_p1")


# TODO 
# visualize data
# perform redimensioning