knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "",
  fig.width = 5,
  fig.height = 5
)
options(rmarkdown.html_vignette.check_title = FALSE)
library(pbsEDM)

Introduction

Example use of the pbsSmap() function for implementing the S-map algorithm, giving the results reported in the manuscript.

Again we use the simulated population time series from the manuscript, with population size denoted $N_t$ and first-differenced values denoted $Y_t$ at $t = 1, 2, 3, ..., 99, 100$. We extract them from the saved tibble NY_lags_example_3:

input <-  dplyr::select(NY_lags_example_3, t, N_t, Y_t) %>%
  dplyr::rename(Time = t)
input

In the negative_predictions vignette we used the simplex algorithm and found the optimal embedding dimension to be $E = 3$, so we use that here.

E_fix <- 3

Calculations using pbsEDM

Since $E = 3$ we use lags of 0, 1, and 2, and test over a range of $\theta$, where $\theta$ represents the degree of local weighting:

theta_vec = seq(0, 5, 0.1)
rho_vec <- smap_thetavec(input,
                         lags = list(Y_t = 0:2),
                         theta_vec = theta_vec)

plot(theta_vec,
     rho_vec,
     xlab = expression("Degree of local weighting, " * theta),
     ylab = expression("Forecast skill, " * rho),
     ylim = c(0.7, 1))

Thus we can see that the forecast skill (measured by the correlation coefficient $\rho$) increases as $\theta$ increases. The value $\theta = 0$ represents the global linear solution, with the solutions becoming more locally weighted (and hence nonlinear) as $\theta$ increases.

The maximum value of $\rho$ is, with corresponding $\theta$:

max(rho_vec)
theta_max <- theta_vec[which.max(rho_vec)]
theta_max

We then test if this is signficantly different to the linear system with $\theta = 0$:

rho_vec[1]     # rho at theta=0
p_val <- smap_surrogates(input,
                         lags = list(Y_t = 0:2),
                         theta = theta_max)
p_val

Since the p-value is $>0.05$, we conclude that there is not sufficient evidence that the system is nonlinear, since the fit is not significantly better to that from using $\theta = 0$. However, the focus of our application is in making projections, so we use the optimal $\theta$ for that:

Smap_optimal <- pbsSmap(input,
                        lags = list(Y_t = 0:2),
                        theta = theta_max)

Yhat_100 <- Smap_optimal$Y_forecast[100]
Yhat_100
Nhat_101 <- Yhat_100 + input$N_t[100]
Nhat_101

The latter gives $\hat{N}_{101}$, the forecast of the population at time step 101. This is different to the -0.017 calculated using just the simplex algorithm with $E=3$ in the negative_predictions vignette. So prediction of a negative future population is avoided here, but there is no guarantee of this with the S-map algorithm.

Related to this, we can check how many negative projected values (across the whole time series) arise from S-map calculations:

pbsSmap_pred_Y <- Smap_optimal$Y_forecast[-length(Smap_optimal$Y_forecast)]
                                           # Just the predictions for Y_t, excluding
                                           #  t=101, so for t = 1:100
pbsSmap_pred_N = c(NA,
                   input$N_t + pbsSmap_pred_Y)
pbsSmap_pred_N
which(pbsSmap_pred_N < 0)

Thus there are only three time indices for which we obtain negative predictions of $N_t$ when using S-map, compared to six when using the simplex algorithm. This difference may be due to the higher correlation coefficient comparing predictions of $Y_t$ to actual values when using S-map compared to using simplex.

Calculations using rEDM

We now do the same S-map calculations using code from the rEDM package, to see how the library of candidate neighbours compares to our default pbsEDM library.

rEDM_Smap <- rEDM::PredictNonlinear(dataFrame = input,
                                    columns = "Y_t",
                                    target = "Y_t",
                                    lib = "1 99",
                                    pred = "1 99",
                                    theta = theta_vec,
                                    E = 3,
                                    verbose = TRUE,
                                    showPlot = FALSE)
plot(theta_vec,
     rho_vec,
     xlab = expression("Degree of local weighting, " * theta),
     ylab = expression("Forecast skill, " * rho),
     ylim = c(0.7, 1))
lines(theta_vec,
      rEDM_Smap$rho,
      col = "red")

The red line gives the rEDM results, which are close, but not exactly the same, as those from pbsEDM (black circles) derived earlier. We now use rEDM::SMap() for the previously calculated theta_max, as it gives more detailed output:

rEDM_Smap_full <- rEDM::SMap(dataFrame = input,
                             columns = "Y_t",
                             target = "Y_t",
                             lib = "1 99",
                             pred = "1 99",
                             theta = theta_max,
                             E = 3,
                             verbose = TRUE)
rEDM_Smap_pred_Y <- c(NA,
                      NA,
                      rEDM_Smap_full$predictions$Predictions)

plot(rEDM_Smap_pred_Y,
     pbsSmap_pred_Y,
     xlab = expression("rEDM predictions of " * Y_t),
     ylab = expression("pbsSmap predictions of " * Y_t))
abline(0, 1)

So the predicted values from the two sets of code are certainly similar, but not exactly the same.

Work out which ones appear most different:

epsilon <- 0.14     # Gives the four most different
different <- dplyr::tibble(t = input$Time,
                           rEDM_Smap_pred_Y,
                           pbsSmap_pred_Y) %>%
  dplyr::mutate(diff = rEDM_Smap_pred_Y - pbsSmap_pred_Y)

different_few <- dplyr::filter(different,
                               abs(diff) > epsilon)

different_few

plot(rEDM_Smap_pred_Y,
     pbsSmap_pred_Y,
     xlab = expression("rEDM predictions of " * Y_t),
     ylab = expression("pbsSmap predictions of " * Y_t))
abline(0, 1)

points(different_few$rEDM_Smap_pred_Y,
       different_few$pbsSmap_pred_Y,
       col = "red",
       pch = 20)

So the largest difference occurs for $Y_{31}$, for which $t^* = 30$.

The pbsEDM output contains full details of calculations, so we can look at the neighbours of $\bf{x}_{30}$ (with the closest given first):

t_star <- 30
Smap_optimal$neighbour_index[t_star, ]

By sorting them we can see the indices in the library:

sort(Smap_optimal$neighbour_index[t_star, ])

This matches equation (7) in the manuscript, with the omitted indices in equation (6) given by

c(1:(E_fix - 1),
  setdiff(E_fix:nrow(input),
          sort(Smap_optimal$neighbour_index[t_star, ])))

We wish to see whether some of those indices are included in the library of candidate neighbours for the S-map calculations in rEDM, but detailed output regarding nearest neighbours etc. is not available from the rEDM calculations.

So first we visualise the focal point ${\bf x}{t^}$ and the three subsequent points that should be omitted from the library (as per equations 6 or 7), namely ${\bf x}_{t^+1}$, ${\bf x}{t^+2}$, and ${\bf x}_{t^+3}$. We can use the pbsEDM::plot_phase_3d() function (which requires an input of class pbsEDM so we do the calculation here); for $t^* = 30$:

pbsEDM_simplex <- pbsEDM(NY_lags_example,
                         lags = list(Y_t = 0:2))
plot_phase_3d(pbsEDM_simplex,
              tstar = 30,
              early.col.lines = NA)

The blue dot is ${\bf x}{30}$, and the red dots are ${\bf x}{31}$, ${\bf x}{32}$, and ${\bf x}{33}$. These are not the nearest neighbours in this case, but note that their inclusion would still affect the mean distance of all points from the focal point (equation S.14 in the manuscript), and hence affect the estimate of ${\bf x}_{31}$.

However, for $t^*=93$ (the value with the second largest different between the pbsEDM and rEDM results) we have:

plot_phase_3d(pbsEDM_simplex,
              tstar = 93,
              early.col.lines = NA)

for which one of the red points is very close to the blue ${\bf x}{93}$, and so its inclusion in the library would surely affect the prediction of ${\hat Y}{94}$.

# keep for reference
#plot_phase_3d(pbsEDM_simplex, tstar = 60, early.col.lines = NA, angle = 120) # excluded are
   # very close
#plot_phase_3d(pbsEDM_simplex, tstar = 14, early.col.lines = NA) # looks like excluded are
                                        # far away, therefore may affect mean

We can test that assertion by including the extra three points as later dummy data and re-running the analysis in pbsEDM; being later in the time series they will not get automatically excluded because they will not be close in time to $t^*$:

t_star = 93
pbsEDM_simplex$X[t_star:(t_star+3), ]    # x(tstar) and next three

# It looks like the third one here is the closest to the the first row, verified
#  by the 0.677 value in:
dist(pbsEDM_simplex$X[t_star:(t_star+3), ])

# These are the distances of the four nearest neighbours from the simplex calc:
pbsEDM_simplex$neighbour_distance[t_star, ]

# So x(tstar+2) would have been the second nearest neighbour, but it is
#  excluded here from the simplex calculation.
# So now add the necessary values as dummy data, re-run S-map from pbsSmap, and
#  see if it gets close to the rEDM calculation:

input_dummy <- rbind(input[1:99, ],
                     input[(t_star - 1):(t_star + 3), ],
                     input[100, ])

Smap_optimal_dummy <- pbsSmap(input_dummy,
                              lags = list(Y_t = 0:2),
                              theta = theta_max)

Smap_optimal_dummy$Y_forecast[t_star+1]

That estimate is very close to that from rEDM:

rEDM_Smap_pred_Y[t_star+1]

and somewhat different to the original pbsEDM estimate:

Smap_optimal$Y_forecast[t_star+1]

The actual nearest neighbours are

Smap_optimal_dummy$neighbour_index[t_star, ]

So the second nearest neighbour now is indeed for $t=103$, which is the dummy point in the state space added in, corresponding to $t=95$. So $t=95$ was originally correctly excluded from the library of neighbours in pbsSmap, but by adding it as dummy data later we allow that neighbour to be included as $t=103$. The result is close to that from rEDM, suggesting that $t=95$ is included in the library for $t^* = 93$, unlike in pbsSmap.

Note that the estimates of $\hat{Y}{94}$ from rEDM and from pbsEDM with the dummy data added do not match exactly because the dummy $Y_t$ values create extra ${\bf x}_t$ points, which affect the mean of the distances of all points from ${\bf x}{93}$.

Test that pbsEDM and rEDM results have not changed

Just check that the results have not changed (due to any updates in the packages)::

testthat::expect_equal(Nhat_101,
                       0.5405147,
                       tolerance = 0.000001)       # If no errors then they match
testthat::expect_equal(different_few$diff,
                       c(-0.1724516, -0.2447201,  0.1473963, -0.1927386),
                       tolerance = 0.000001)


luke-a-rogers/pbsedm documentation built on June 3, 2024, 5:20 a.m.