knitr::opts_chunk$set(
  fig.retina = 3,
  fig.align = "center",
  fig.width = 6.93,
  fig.height = 6.13,
  out.width = "70%",
  echo = FALSE
)

library("drake")
library("magrittr")
R.utils::sourceDirectory("R")
R.utils::sourceDirectory("code")

# load drake objects
loadd(
  vi_data, nri_data,
  bands_data, coords_vi_nri_clean,

  filter_values,
  filter_info_gain_nbins,

  trees_with_bands,
  trees_with_bands_no_buffer
)
library("DataExplorer")
library("dplyr")
library("ggsci")
library("ggpubr")
library("ggplot2")
library("knitr")
library("purrr")
library("sp")
library("raster")
library("fs")
library("patchwork")
library("here")

Datasets {.tabset .tabset-fade}

VI

Overview

intro <- introduce(vi_data)
intro_df <- data.frame(
  "Name" = c(
    "Rows", "Columns",
    "Discrete columns", "Continuous columns", "All missing columns",
    "Missing observations", "Complete Rows",
    "Total observations"
  ),
  "Value" = c(
    format(intro[["rows"]], big.mark = ","),
    format(intro[["columns"]], big.mark = ","),
    format(intro[["discrete_columns"]], big.mark = ","),
    format(intro[["continuous_columns"]], big.mark = ","),
    format(intro[["all_missing_columns"]], big.mark = ","),
    format(intro[["total_missing_values"]], big.mark = ","),
    format(intro[["complete_rows"]], big.mark = ","),
    format(intro[["total_observations"]], big.mark = ",")
  )
)
knitr::kable(intro_df)

Histograms

plot_histogram(vi_data)

PCA

# remove response
vi_data_no_defol_no_response <- vi_data
vi_data_no_defol_no_response$defoliation <- NULL

pca_vi <- DataExplorer::plot_prcomp(vi_data_no_defol_no_response,
  variance_cap = 0.95,
  ggtheme = ggpubr::theme_pubclean(),
  geom_label_args = list(size = 2.5, nudge_y = 0.03),
  parallel = TRUE
)
pca_vi <- pca_vi[[1]] +
  labs(title = "Target POV: 95%")
pca_vi[[1]]

Corr

plot_correlation(vi_data)

NRI

Overview

intro <- introduce(nri_data)
intro_df <- data.frame(
  "Name" = c(
    "Rows", "Columns",
    "Discrete columns", "Continuous columns", "All missing columns",
    "Missing observations", "Complete Rows",
    "Total observations"
  ),
  "Value" = c(
    format(intro[["rows"]], big.mark = ","),
    format(intro[["columns"]], big.mark = ","),
    format(intro[["discrete_columns"]], big.mark = ","),
    format(intro[["continuous_columns"]], big.mark = ","),
    format(intro[["all_missing_columns"]], big.mark = ","),
    format(intro[["total_missing_values"]], big.mark = ","),
    format(intro[["complete_rows"]], big.mark = ","),
    format(intro[["total_observations"]], big.mark = ",")
  )
)
kable(intro_df)

Histograms

No histograms for NRI -> too many features.

PCA

# remove response
nri_data_no_response <- nri_data
nri_data_no_response$defoliation <- NULL

pca_nri <- DataExplorer::plot_prcomp(nri_data_no_response,
  variance_cap = 0.95,
  ggtheme = ggpubr::theme_pubclean(),
  geom_label_args = list(size = 2.5, nudge_y = 0.03),
  parallel = TRUE
)
pca_nri <- pca_nri[[1]] +
  labs(title = "Target POV: 95%")
pca_nri

Corr

No correlation plot for NRI -> too many features.

HR

Overview

intro <- introduce(bands_data)
intro_df <- data.frame(
  "Name" = c(
    "Rows", "Columns",
    "Discrete columns", "Continuous columns", "All missing columns",
    "Missing observations", "Complete Rows",
    "Total observations"
  ),
  "Value" = c(
    format(intro[["rows"]], big.mark = ","),
    format(intro[["columns"]], big.mark = ","),
    format(intro[["discrete_columns"]], big.mark = ","),
    format(intro[["continuous_columns"]], big.mark = ","),
    format(intro[["all_missing_columns"]], big.mark = ","),
    format(intro[["total_missing_values"]], big.mark = ","),
    format(intro[["complete_rows"]], big.mark = ","),
    format(intro[["total_observations"]], big.mark = ",")
  )
)
kable(intro_df)

Histograms

plot_histogram(bands_data)

PCA

# remove response
bands_data_no_response <- bands_data
bands_data_no_response$defoliation <- NULL

pca_hr <- DataExplorer::plot_prcomp(bands_data_no_response,
  variance_cap = 0.98,
  geom_label_args = list(size = 2.5, nudge_y = 0.03),
  ggtheme = ggpubr::theme_pubclean()
)
pca_hr <- pca_hr[[1]] +
  labs(title = "Target POV: 95%")
pca_hr

Corr

plot_correlation(bands_data)

Custom plots

vi_data_plot <- vi_data %>%
  mutate(plot = factor(rep(
    c("Laukiz 1", "Laukiz 2", "Luiando", "Oiartzun"),
    c(559, 451, 301, 497)
  )))

Mean defoliation per plot

mean_defol <- vi_data_plot %>%
  group_by(plot) %>%
  summarise(mean(defoliation)) %>%
  pull(.)
vi_data_plot %>%
  group_by(plot) %>%
  summarise(mean(defoliation))

Coefficient of variation

cov_defol <- vi_data_plot %>%
  group_by(plot) %>%
  summarise((sd(defoliation) / mean(defoliation)) * 100) %>%
  pull(.)
vi_data_plot %>%
  group_by(plot) %>%
  summarise((sd(defoliation) / mean(defoliation)) * 100)

sd / skewness

sd_skewness_defol <- vi_data_plot %>%
  group_by(plot) %>%
  summarise(((sd(defoliation) / mean(defoliation)) * 100) / e1071::skewness(defoliation)) %>%
  pull(.)
vi_data_plot %>%
  group_by(plot) %>%
  summarise(((sd(defoliation) / mean(defoliation)) * 100) / e1071::skewness(defoliation))
sd_defol <- vi_data_plot %>%
  group_by(plot) %>%
  summarise(sd(defoliation)) %>%
  pull(.)
vi_data_plot %>%
  group_by(plot) %>%
  summarise(sd(defoliation))
boxplot_defol <- vi_data_plot %>%
  group_by(plot) %>%
  ggboxplot(
    x = "plot", y = "defoliation", color = "plot",
    add = "jitter", add.params = list(size = "defoliation")
  ) +
  scale_size(range = c(0.5, 0.5)) +
  annotate("text",
    label = expression(bold(atop("n = 559", bar(x) ~ "= 57.23"))), x = 1,
    y = 112, size = 4, colour = "#BC3C29", fontface = 2
  ) +
  annotate("text",
    label = expression(bold(atop("n = 451", bar(x) ~ "= 13.54"))), x = 2,
    y = 112, size = 4, colour = "#0072B5", fontface = 2
  ) +
  annotate("text",
    label = expression(bold(atop("n = 301", bar(x) ~ "= 68.36"))), x = 3,
    y = 112, size = 4, colour = "#E18727", fontface = 2
  ) +
  annotate("text",
    label = expression(bold(atop("n = 497", bar(x) ~ "= 69.07"))), x = 4,
    y = 112, size = 4, colour = "#20854E", fontface = 2
  ) +
  scale_color_nejm() +
  ggpubr::theme_pubr(base_size = 14) +
  theme(legend.position = "none") +
  labs(y = "Total defoliation per tree (%)", x = "Plot")
boxplot_defol

Point density

Mean average distance between all trees in a given plot.

Calculated via raster::pointDistance(allpairs = TRUE).

plots <- list("Laukiz 1", "Laukiz 2", "Luiando", "Oiartzun")
dist <- map(plots, ~ {
  coords <- coords_vi_nri_clean %>%
    mutate(plot = factor(rep(
      c("Laukiz 1", "Laukiz 2", "Luiando", "Oiartzun"),
      c(559, 451, 301, 497)
    ))) %>%
    filter(plot == .x) %>%
    dplyr::select(-plot)
  points <- SpatialPoints(
    coords = coords,
    # EPSG: 25830
    proj4string = CRS("+proj=utm +zone=30 +ellps=WGS84 +datum=WGS84 +units=m +no_defs")
  )
  distance <- mean(as.dist(pointDistance(points,
    allpairs = TRUE,
    lonlat = FALSE
  )))
})
set_names(dist, plots)
dist_plots <- unlist(dist)
# file_move("docs/figure/eda.Rmd/defoliation-distribution-plot-1.pdf",
#           "code/98-paper/journal/")
# file_move("docs/figure/eda.Rmd/pca-bands-1.pdf",
# "code/98-paper/journal/")

PCA feature set comparison

Comparing the "proportion of variance explained" (POV) for all feature sets.

pca_vi / pca_nri / pca_hr *
  theme(
    plot.title = element_text(size = 10),
    axis.title.y = element_text(size = 8),
    axis.title.x = element_text(size = 10)
  ) +
  patchwork::plot_annotation(
    tag_levels = list(c("VI", "NRI", "HR")),
    title = "Proportion of variance explained by principal components",
    subtitle = "Labels indicate cumulative % of explained variance"
  )

Buffer extraction scenario sketches

Standard scenario (somewhere within the pixel)

knitr::include_graphics(here("docs/figure/tree-buffer-vis.png"))

Extreme scenario (at the border of one or multiple pixels)

knitr::include_graphics(here("docs/figure/tree-buffer-vis-max.png"))


pat-s/2019-feature-selection documentation built on Dec. 24, 2021, 8:40 a.m.