knitr::opts_chunk$set(
  fig.retina = 3,
  fig.align = "center",
  fig.width = 8.5,
  fig.asp = 0.66,
  out.width = "70%",
  echo = FALSE
)

options(
  scipen = 999
)

library("drake")
library("hsdar")
library("dplyr")
library("ggplot2")
library("ggpubr")
library("ggpmisc")
library("patchwork")
library("iml")

# load drake objects
loadd(
  # Slurm resources suggestion: at least 20 cores and 2 GB / core = 40 GB pro job
  fi_permut_hr,
  fi_permut_vi,

  fi_ale_hr,
  fi_ale_vi,
  fi_ale_hr_gs20,
  fi_ale_vi_gs20,

  df_wavelengths_from_indices,
  spec_sigs
)

Preview the ordered feature importance results for datasets "HR" and "VI".

fi_ranked_hr <- fi_permut_hr$res %>%
  tibble::rownames_to_column("measure") %>%
  tidyr::pivot_longer(
    cols = starts_with("B"),
    values_to = "importance", names_to = "feature"
  ) %>%
  dplyr::mutate(wavelength = seq(420, 995, 4.75)) %>%
  dplyr::mutate(numeric_id = seq(5, 126, 1)) %>%
  dplyr::arrange(desc(importance)) %>%
  dplyr::mutate(rank = row_number()) %>%
  dplyr::select(-measure)
fi_ranked_hr

fi_ranked_vi <- fi_permut_vi$res %>%
  tibble::rownames_to_column("measure") %>%
  dplyr::select(-contains("ID..")) %>% 
  tidyr::pivot_longer(
    cols = !measure,
    values_to = "importance", names_to = "feature"
  ) %>%
  # mutate(wavelength = seq(420, 995, 4.75)) %>%
  dplyr::arrange(desc(importance)) %>%
  dplyr::mutate(rank = row_number()) %>%
  dplyr::select(-measure) %>%
  dplyr::mutate(feature = stringr::str_replace(feature, "bf2_", ""))
fi_ranked_vi

Create a virtual Spectral Signature (mean) of vegetation using PROSAIL.

PROSAIL is a algorithm simulating Spectral Signature (mean)s of vegetation, see ?hsdar::PROSAIL. Reflectance is scaled to 0-10 to be able to plot it in the same plot as the feature importance rankings -> the axis limits for the y and z axis needs to match.

PROSAIL returns a Spectral Signature (mean) from 400 nm to 2500 nm -> we take the values only and subset to 400 nm - 1000 nm. Because we order from 1 - 10 with 1 being the best rank, we have to reverse the scaling of the reflectance values.

spectra_sim <- hsdar::PROSAIL()
spectra_df <- data.frame(
  reflectance = as.vector(spectra_sim@spectra@spectra_ma),
  wavelength = seq(400, 2500, 1)
) %>%
  dplyr::filter(wavelength < 1000) %>%
  # scale the reflectance to [0, 2] to play nicely with the y-axis later (mean dec in rmse)
  dplyr::mutate(reflectance = scale(reflectance,
    center = FALSE,
    scale = max(reflectance, na.rm = TRUE) / 2
  ))

Next we bind the simulated data with the feature importance rankings. To join both data.frames we need to round the reflectance centers of the bands to integers to match with the reflectance values created by PROSAIL.

# round the wavelengths of the HR dataset to match with the simulated ones
fi_ranked_hr$wavelength <- round(fi_ranked_hr$wavelength)

data_hr_merged <- dplyr::left_join(spectra_df, fi_ranked_hr, by = c("wavelength")) %>%
  dplyr::left_join(spec_sigs, by = c("wavelength")) %>%
  dplyr::mutate(class = "HR") %>%
  dplyr::mutate(reflectance = as.numeric(reflectance))

To label only a subset of the data, a custom data.frame is created.

df_wavelengths_from_indices_imp <- df_wavelengths_from_indices %>%
  dplyr::left_join(fi_ranked_vi, by = c("class" = "feature")) %>%
  na.omit()

df_wavelengths_from_indices_label <- df_wavelengths_from_indices_imp %>%
  arrange(rank) %>%
  dplyr::group_by(class, rank) %>%
  dplyr::slice(1) %>%
  dplyr::ungroup() %>%
  dplyr::arrange(rank) %>%
  dplyr::slice(1:10)

df_wavelengths_from_hr <- data_hr_merged %>%
  dplyr::arrange(rank) %>%
  dplyr::group_by(class, rank) %>%
  dplyr::slice(1) %>%
  dplyr::ungroup() %>%
  dplyr::arrange(rank) %>%
  dplyr::slice(1:10) %>%
  dplyr::select(-class) %>%
  dplyr::rename(class = feature)

df_label_all <- df_wavelengths_from_indices_label %>%
  dplyr::bind_rows(df_wavelengths_from_hr)
plot_reflectance_imp_hr <- function(data) {
  ggplot(data, aes(x = .data[["wavelength"]], y = .data[["importance"]])) +
    geom_line(
      data = data_hr_merged[!is.na(data_hr_merged$laukiz1), ],
      aes(
        x = wavelength, y = laukiz1,
        color = "Spectral Signature (mean) of Laukiz1"
      ),
      na.rm = TRUE,
      linetype = "solid",
      size = 0.6
    ) +
    geom_line(
      data = data_hr_merged[!is.na(data_hr_merged$laukiz2), ],
      aes(
        x = wavelength, y = laukiz2,
        color = "Spectral Signature (mean) of Laukiz2"
      ),
      na.rm = TRUE,
      linetype = "solid",
      size = 0.6
    ) +
    geom_line(
      data = data_hr_merged[!is.na(data_hr_merged$luiando), ],
      aes(
        x = wavelength, y = luiando,
        color = "Spectral Signature (mean) of Luiando"
      ),
      na.rm = TRUE,
      linetype = "solid",
      size = 0.6
    ) +
    geom_line(
      data = data_hr_merged[!is.na(data_hr_merged$oiartzun), ],
      aes(
        x = wavelength, y = oiartzun,
        color = "Spectral Signature (mean) of Oiartzun"
      ),
      na.rm = TRUE,
      linetype = "solid",
      size = 0.6
    ) +

    # HR: plot only rank 11:last
    geom_point(
      data = data_hr_merged[which(!(data_hr_merged$rank %in% c(1:10))), ],
      aes(color = "HR: Rank 11:122"),
      size = 2.5,
      shape = 1,
      na.rm = TRUE
    ) +
    # HR: plot only rank 1:10
    geom_point(
      data = data_hr_merged[which(data_hr_merged$rank %in% c(1:10)), ],
      aes(color = "HR: Rank 1:10"),
      shape = 20,
      size = 3,
      na.rm = TRUE
    ) +

    scale_color_manual(values = c(
      "HR: Rank 11:122" = "grey",
      "HR: Rank 1:10" = "black",
      "Spectral Signature (mean) of Laukiz1" = "#BC3C29",
      "Spectral Signature (mean) of Laukiz2" = "#0072B5",
      "Spectral Signature (mean) of Luiando" = "#E18727",
      "Spectral Signature (mean) of Oiartzun" = "#20854E"
    )) +
    scale_x_continuous(limits = c(400, 1000), breaks = scales::pretty_breaks()) +
    scale_y_continuous(
      sec.axis = sec_axis(~ scale(-.,
        center = FALSE,
        scale = max(., na.rm = TRUE)
      ),
      labels = c(1.0, 0.75, 0.55, 0.25, 0),
      name = "Scaled Reflectance [0, 1]"
      )
    ) +
    guides(color = guide_legend(
      title = NULL,
      override.aes = list(
        linetype = c("blank", "blank", "solid", "solid", "solid", "solid"),
        shape = c(20, 1, NA, NA, NA, NA),
        color = c("black", "grey", "#BC3C29", "#0072B5", "#E18727", "#20854E")
      )
    )) +
    labs(
      title = "Permutation-based Variable Importance for dataset 'HR'",
      subtitle = paste0(
        "The ten most important features are labeled by their band number."
      ),
      caption = "Learner: SVM; 100 Monte-Carlo Iterations",
      y = "Mean Decrease in RMSE", x = "Wavelength [nm]"
    ) +
    ggrepel::geom_label_repel(
      # only label the best 10 features
      data = df_wavelengths_from_hr,
      label = df_wavelengths_from_hr$class,
      size = 4,
      na.rm = TRUE
    ) +
    ggpubr::theme_pubclean(base_size = 14)
}

plot_reflectance_imp_vi <- function(data) {
  ggplot(data, aes(x = .data[["wavelength"]], y = .data[["importance"]])) +
    geom_line(
      data = data_hr_merged[!is.na(data_hr_merged$laukiz1), ],
      aes(
        x = wavelength, y = laukiz1,
        color = "Spectral Signature (mean) of Laukiz1"
      ),
      na.rm = TRUE,
      linetype = "solid",
      size = 0.6
    ) +
    geom_line(
      data = data_hr_merged[!is.na(data_hr_merged$laukiz2), ],
      aes(
        x = wavelength, y = laukiz2,
        color = "Spectral Signature (mean) of Laukiz2"
      ),
      na.rm = TRUE,
      linetype = "solid",
      size = 0.6
    ) +
    geom_line(
      data = data_hr_merged[!is.na(data_hr_merged$luiando), ],
      aes(
        x = wavelength, y = luiando,
        color = "Spectral Signature (mean) of Luiando"
      ),
      na.rm = TRUE,
      linetype = "solid",
      size = 0.6
    ) +
    geom_line(
      data = data_hr_merged[!is.na(data_hr_merged$oiartzun), ],
      aes(
        x = wavelength, y = oiartzun,
        color = "Spectral Signature (mean) of Oiartzun"
      ),
      na.rm = TRUE,
      linetype = "solid",
      size = 0.6
    ) +

    # VI: rank 1:10
    geom_point(
      data = df_wavelengths_from_indices_imp[which((df_wavelengths_from_indices_imp$rank %in% c(1:10))), ],
      aes(x = wavelength, y = importance, group = class),
      size = 3, shape = 20
    ) +
    # VI: rank 1:10
    geom_line(
      data = df_wavelengths_from_indices_imp[which((df_wavelengths_from_indices_imp$rank %in% c(1:10))), ],
      aes(x = wavelength, y = importance, group = class, color = "VI: Rank 1:10"),
      size = 0.2, linetype = "dashed"
    ) +

    # VI: rank 11:89
    geom_point(
      data = df_wavelengths_from_indices_imp[which(!(df_wavelengths_from_indices_imp$rank %in% c(1:10))), ],
      aes(x = wavelength, y = importance, group = class),
      color = "grey",
      shape = 1, size = 2.5
    ) +
    # VI: rank 11:89
    geom_line(
      data = df_wavelengths_from_indices_imp[which(!(df_wavelengths_from_indices_imp$rank %in% c(1:10))), ],
      aes(x = wavelength, y = importance, group = class, color = "VI: Rank 11:89"),
      size = 0.2, linetype = "dashed"
    ) +

    scale_color_manual(values = c(
      "VI: Rank 11:89" = "grey",
      "VI: Rank 1:10" = "black",
      "Spectral Signature (mean) of Laukiz1" = "#BC3C29",
      "Spectral Signature (mean) of Laukiz2" = "#0072B5",
      "Spectral Signature (mean) of Luiando" = "#E18727",
      "Spectral Signature (mean) of Oiartzun" = "#20854E"
    )) +
    scale_x_continuous(limits = c(400, 1000), breaks = scales::pretty_breaks()) +
    scale_y_continuous(
      sec.axis = sec_axis(~ scale(-.,
        center = FALSE,
        scale = max(., na.rm = TRUE)
      ),
      labels = c(1.0, 0.75, 0.55, 0.25, 0),
      name = "Scaled Reflectance [0, 1]"
      )
    ) +
    guides(color = guide_legend(
      title = NULL,
      override.aes = list(
        linetype = c("solid", "solid", "solid", "solid", "dashed", "dashed"),
        shape = c(NA, NA, NA, NA, 20, 1),
        color = c("#BC3C29", "#0072B5", "#E18727", "#20854E", "black", "grey")
      )
    )) +
    labs(
      title = "Permutation-based Variable Importance for dataset 'VI'",
      subtitle = paste0(
        "The ten most important features are labeled by their index name."
      ),
      caption = "Learner: SVM; 100 Monte-Carlo Iterations",
      y = "Mean Decrease in RMSE", x = "Wavelength [nm]"
    ) +
    ggrepel::geom_label_repel(
      # only label the best 10 features
      data = df_wavelengths_from_indices_label,
      label = df_wavelengths_from_indices_label$class,
      size = 4,
      na.rm = TRUE
    ) +
    ggpubr::theme_pubclean(base_size = 14)
}

plot_reflectance_imp_absolute <- function(data, x_identifier, class) {
  pl <- ggplot(data, aes(x = .data[[x_identifier]], y = .data[["importance"]])) +
    labs(y = "Importance", x = "Band number") +
    geom_segment(aes(
      x = .data[[x_identifier]], y = 0,
      xend = .data[[x_identifier]], yend = .data[["importance"]]
    ),
    color = "grey", show.legend = FALSE
    ) +
    geom_point(size = 1, color = "black", show.legend = T) +
    labs(
      title = glue::glue("Permutation-based Variable Importance for Dataset '{class}'"),
      subtitle = "Absolute importance values by band",
      caption = "Learner: SVM; 100 Monte-Carlo iterations"
    ) +
    ggpubr::theme_pubclean()

  if (is.character(data[[x_identifier]])) {
    pl
  } else {
    pl + scale_x_continuous(breaks = seq(5, 125, 5))
    pl
  }
}

P1 Main plot

p11 <- data_hr_merged %>%
  plot_reflectance_imp_hr()
p11
p12 <- data_hr_merged %>%
  plot_reflectance_imp_vi()
p12
p11 / p12

Plots by dataset {.tabset .tabset-fade}

HR

P2 Absolute permutation based Var Imp

p2 <- fi_ranked_hr %>%
  plot_reflectance_imp_absolute("numeric_id", class = "HR")
p2

VI

P3 Absolute permutation based Var Imp

p3 <- fi_ranked_vi %>%
  plot_reflectance_imp_absolute("feature", class = "VI") +
  ggpubr::rotate_x_text()
p3

Vogelmann2 $(R_{734}-R_{747})/(R_{715}+R_{726})$ Vogelmann et al. (1993)

Vogelmann4 $(R_{734}-R_{747})/(R_{715}+R_{720})$ Vogelmann et al. (1993)

Vogelmann3 $D_{715}/D_{705}$ Vogelmann et al. (1993)

Vogelmann $R_{740}/R_{720}$ Vogelmann et al. (1993)

NPCI $(R_{680}-R_{430})/(R_{680}+R_{430})$

D2 $D_{705}/D_{722}$

Datt3 $D_{754}/D_{704}$

PWI $R_{900}/R_{970}$

SR7 $R_{440}/R_{690}$

SRPI $R_{430}/R_{680}$

Dxxx: First derivation of reflectance values at wavelength 'xxx'. Rxxx: Reflectance at wavelength 'xxx'.

Reference: ?hsdar::vegindex()

Combined

p2 / p3

ALE plots {.tabset .tabset-fade}

ALE plots via package {iml}

P2 HR

Grid size: 100

Top ten HR features from permutation Vimp

ale_p1 = fi_ale_hr$plot(
  features = df_wavelengths_from_hr$class,
  ncol = 2
) & scale_y_continuous(
  n.breaks = 2
) &
  labs(y = "ALE") &
  theme_pubr(base_size = 14) &
  theme(
    legend.text = element_text(size = 13),
    legend.title = element_text(size = 13),
    axis.title.y = element_text(size = 8),
    axis.text.y = element_text(size = 9)
  )

# Custom y axis breaks for subplots
ale_p1[[2]] = ale_p1[[2]] + scale_y_continuous(
  n.breaks = 3
)

ale_p1[[3]] = ale_p1[[3]] + scale_y_continuous(
  n.breaks = 4
)

ale_p1[[7]] = ale_p1[[7]] + scale_y_continuous(
  n.breaks = 4
)

ale_p1[[8]] = ale_p1[[8]] + scale_y_continuous(
  n.breaks = 4
)

ale_p1[[9]] = ale_p1[[9]] + scale_y_continuous(
  n.breaks = 4
)

ale_p1[[10]] = ale_p1[[10]] + scale_y_continuous(
  n.breaks = 4
)
ale_p1

Grid size: 20

Top ten HR features from permutation Vimp

ale_p2 = fi_ale_hr_gs20$plot(
  features = df_wavelengths_from_hr$class,
  ncol = 2
) & scale_y_continuous(
  n.breaks = 2
) &
  labs(y = "ALE") &
  theme_pubr(base_size = 14) &
  theme(
    legend.text = element_text(size = 13),
    legend.title = element_text(size = 13),
    axis.title.y = element_text(size = 8),
    axis.text.y = element_text(size = 9)
  )

# Custom y axis breaks for subplots
ale_p2[[1]] = ale_p2[[1]] + scale_y_continuous(
  n.breaks = 4
)
ale_p2[[2]] = ale_p2[[2]] + scale_y_continuous(
  n.breaks = 5
)
ale_p2[[3]] = ale_p2[[3]] + scale_y_continuous(
  n.breaks = 4
)
ale_p2[[4]] = ale_p2[[4]] + scale_y_continuous(
  n.breaks = 4
)
ale_p2[[6]] = ale_p2[[6]] + scale_y_continuous(
  n.breaks = 4
)
ale_p2[[7]] = ale_p2[[7]] + scale_y_continuous(
  n.breaks = 4
)
ale_p2[[10]] = ale_p2[[10]] + scale_y_continuous(
  n.breaks = 3
)
ale_p2

P3 VI

Grid size: 100

Top ten VI features from permutation Vimp

ale_p3 = fi_ale_vi$plot(
  features = df_wavelengths_from_indices_label$class,
  ncol = 2
) & scale_y_continuous(
  n.breaks = 2
) &
  labs(y = "ALE") &
  theme_pubr(base_size = 14) &
  theme(
    legend.text = element_text(size = 13),
    legend.title = element_text(size = 13),
    axis.title.y = element_text(size = 8),
    axis.text.y = element_text(size = 9)
  )

# Custom y axis breaks for subplots
ale_p3[[1]] = ale_p3[[1]] + scale_y_continuous(
  n.breaks = 4
)
ale_p3[[2]] = ale_p3[[2]] + scale_y_continuous(
  n.breaks = 4
)
ale_p3[[3]] = ale_p3[[3]] + scale_y_continuous(
  n.breaks = 4
)
ale_p3[[4]] = ale_p3[[4]] + scale_y_continuous(
  n.breaks = 4
)
ale_p3

Grid size: 20

Top ten HR features from permutation Vimp

ale_p4 = fi_ale_vi_gs20$plot(
  features = df_wavelengths_from_indices_label$class,
  ncol = 2
) & scale_y_continuous(
  n.breaks = 2
) &
  labs(y = "ALE") &
  theme_pubr(base_size = 14) &
  theme(
    legend.text = element_text(size = 13),
    legend.title = element_text(size = 13),
    axis.title.y = element_text(size = 8),
    axis.text.y = element_text(size = 9)
  )

# Custom y axis breaks for subplots
ale_p4[[2]] = ale_p3[[2]] + scale_y_continuous(
  n.breaks = 4
)
ale_p4[[3]] = ale_p3[[3]] + scale_y_continuous(
  n.breaks = 4
)
ale_p4[[4]] = ale_p3[[4]] + scale_y_continuous(
  n.breaks = 4
)
ale_p4


pat-s/2019-feature-selection documentation built on Dec. 24, 2021, 8:40 a.m.