load, clean and join data

library(tidyverse)
#load traits and aop
setwd("..")
nComp = 10
invisible(base_data <- readr::read_csv("./indir/ch3.csv"))

traits_names <- c("ntrgnPr",  "crbnPrc", "extrCBC",  "extrcCC"
                  , "extrCAC",  "lgnnPrc",  "clllsPr"  
                  #, "lm_frs_",  "lm_dry_", "cl_lg__"
                  , "leafAre" , "lfMssPA",  "dryMssF")
random_effects <- c("domanID",  "nlcdCls") #taxonID, soilID, plantStatus
local_environment <- c("ope", "ect", "DTM")
plant_status <- c("CHM", "plntStt")
reflectance <- colnames(base_data)[grep("hsi", colnames(base_data))]
bands_clusters <- readr::read_csv("./indir/bands_clusters_kld.csv")

# load climate trends
invisible(climate_trends <- readr::read_csv("./indir/climate_features.csv"))
colnames(bands_clusters) <- "KLDcluster"
#apply KL-D to the data and reduce hyperspectral info into 15 aggregated pseudo-bands
reflectance_data <- base_data[colnames(base_data) %in% reflectance] %>%
  t %>%
  cbind.data.frame(bands_clusters) %>%
  group_by(KLDcluster) %>%
  summarise_all(list(mean)) %>%
  mutate_all(~ . /10000) %>%
  t %>% data.frame

reflectance_data <-  cbind.data.frame(base_data[["indvdID"]], reflectance_data[-1,])
colnames(reflectance_data)[1] <- "individualID"

log scale and normalize responses

# log scale and normalize responses
leaf_traits_data <- base_data[colnames(base_data) %in% traits_names] %>%
  mutate_all(log)# %>%
  #scale

leaf_traits_data <-  cbind.data.frame(base_data[["indvdID"]], leaf_traits_data)
colnames(leaf_traits_data) <- c( "individualID", "ntrgnPr",  "crbnPrc", "extrCBC",  "extrcCC"
                                 , "extrCAC", "lgnnPrc",  "clllsPr"  
                                 #, "lm_frs",  "lm_dry", "cl_lg" 
                                 , "leafAre","lfMssPA",  "dryMssF"
                                 )

reduce climate data dimensionality using PCA

climate_features <- climate_trends %>%
  dplyr::filter(individualID %in% base_data[["indvdID"]])
prm <-  FactoMineR::PCA(climate_features[-1], ncp=10)
climate_features <- data.frame(climate_features[["individualID"]], prm$ind$coord)
colnames(climate_features)[1] <- "individualID"

get soil categories by querying soilDB

#load data.frame with plots coordiantes in lat/long
#setwd("..")
cfc_df <- readr::read_csv("indir/cfc_field_data.csv") %>%
  dplyr::filter(individualID %in% base_data[["individualID"]]) %>%
  dplyr::select(individualID, plotID, siteID, taxonID, decimalLongitude, decimalLatitude) %>%
  unique
soil_data <- list()
for(st in unique(base_data$siteID)){
  soil_data[[st]] <- cfc_df %>% 
    dplyr::filter(siteID == st) %>%  
    get_soil_features
}
setwd("..")
soil_data = readRDS("./indir/soil_data.rds")
s_data <- do.call(rbind.data.frame, soil_data) %>%
  dplyr::select(individualID, areasymbol, spatialversion, 
                musym, nationalmusym, mukey, muareaacres, mupolygonkey, geometry)

get species correaltion

setwd("..")
# log scale and normalize responses
species_list <- base_data %>%
  dplyr::select(taxonID, scntfcN)
colnames(species_list)[2] <- "scientificName"
sp_list2 <- readr::read_csv("./indir/vegetation_structure.csv") %>%
  dplyr::select(taxonID, scientificName) 

species_list <- rbind.data.frame(species_list, sp_list2) %>%
  unique 
species_list <-  get_cophenetic_distance(species_list, sc_name = "scientificName")

species_features <- base_data %>% 
  dplyr::select(indvdID, scntfcN)
species_features[["scntfcN"]] <- word(species_features[["scntfcN"]], 1,2)
species_features[["scntfcN"]] <- 
  str_replace(species_features[["scntfcN"]], " spp.", "")
species_features[["scntfcN"]] <- 
  str_replace(species_features[["scntfcN"]], " sp.", "")
species_features[["scntfcN"]] <- tolower(species_features[["scntfcN"]])
species_features <-left_join(species_features, species_list$resolved_names, 
                             by = c("scntfcN" = "search_string")) %>%
  select(indvdID, taxonID)
colnames(species_features) <- c("individualID", "taxonID")
#get meaningful soil characters
in.statement <- soilDB::format_SQL_in_statement((s_data$mukey))
q <- paste("SELECT
component.mukey,  taxclname,
           taxorder, taxsuborder, taxgrtgroup, taxsubgrp
           FROM legend
           INNER JOIN mapunit ON mapunit.lkey = legend.lkey
LEFT OUTER JOIN component ON component.mukey = mapunit.mukey
           WHERE mapunit.mukey IN ", in.statement, sep="")
# run query, process results, and return as data.frame object
res <- soilDB::SDA_query(q)
res <- res[complete.cases(res),]
res$mukey
local_soil <- c("mukey")
soil_features <- s_data %>%dplyr::select(individualID, local_soil)
soil_features <- unique(soil_features) 
soil_features <- left_join(soil_features, res)

get local environment features

# log scale and normalize responses
site_features <- base_data[colnames(base_data) %in% local_environment] %>%
  scale

site_features <-  cbind.data.frame(base_data[["indvdID"]], site_features)
colnames(site_features)[1] <- "individualID"
status_features <- base_data[colnames(base_data) %in% plant_status] 
status_features <-  cbind.data.frame(base_data[["indvdID"]], status_features)
colnames(status_features)[1] <- "individualID"

define random effects

random_effects_data <- base_data[colnames(base_data) %in% random_effects] 
random_effects_data <-  cbind.data.frame(base_data[["indvdID"]], random_effects_data)
colnames(random_effects_data)[1] <- "individualID"

train test split

# log scale and normalize responses
cfc_df <- cfc_df[complete.cases(cfc_df),]
train_data <- cfc_df %>% dplyr::select(individualID, taxonID, siteID) %>%
  group_by(taxonID, siteID) %>%
  sample_frac(0.8)
test_data <- cfc_df %>% dplyr::select(individualID, taxonID, siteID) %>%
  filter(!individualID %in% train_data[["individualID"]])
train_dataset <- train_data %>%
  ungroup() %>%
  dplyr::select(-one_of("taxonID")) %>%
  inner_join(leaf_traits_data) %>%
  inner_join(climate_features) %>%
  inner_join(site_features) %>%
  inner_join(random_effects_data) %>%
  inner_join(reflectance_data) %>% 
  inner_join(species_features) %>%
  inner_join(soil_features) %>%
  inner_join(status_features) %>%
  unique

train_dataset<- train_dataset[!is.na(train_dataset[["taxonID"]]),]
train_dataset[!train_dataset[["taxonID"]] 
         %in% colnames(species_list$cov_taxa_eff),"taxonID"] %>% 
        unique %>% 
        print
train_dataset[train_dataset[["taxonID"]]=="ABIES","taxonID"] = "ABBA"
train_dataset[train_dataset[["taxonID"]]=="BETUL","taxonID"] = "BELE"
train_dataset[train_dataset[["taxonID"]]=="BOURR","taxonID"] = "BOSU2"
train_dataset[train_dataset[["taxonID"]]=="FRAXI","taxonID"] = "FRAM2"
train_dataset[train_dataset[["taxonID"]]=="HALES","taxonID"] = "HACA3"
train_dataset[train_dataset[["taxonID"]]=="NYSY","taxonID"] = "NYBI"
train_dataset[train_dataset[["taxonID"]]=="OXYDE","taxonID"] = "OXAR"
train_dataset[train_dataset[["taxonID"]]=="PINUS","taxonID"] = "PIEC2"
train_dataset[train_dataset[["taxonID"]]=="SASSA","taxonID"] = "SAAL5"

train_dataset = train_dataset[complete.cases(train_dataset),]
train_dataset = mutate_if(train_dataset, is.numeric, scale)
imp_data <- mice::mice(train_dataset, m = 5, print = FALSE)

Now we can finally use all these variables to generate models! Let's check first how well everything but reflectance represents our traits data

library(brms)

#define formula and build the model
list_formulas <- paste(paste("mvbind("
                       , paste(colnames(leaf_traits_data)[-1], collapse = " , "), ") ~ ", sep = "")
                       #climate variables
                       , paste("s(",colnames(climate_features)[2:6],  ")", collapse = " + ", sep = "")  
                      # , paste(colnames(climate_features)[2:6], collapse = " + ", sep = "")
                       , " + "
                       , paste(local_environment, collapse = " + ", sep = "") , " + "
                       #, paste("(1 | soil| ",local_soil, ")", collapse = "+"), " + "
                       # , paste("(1 | gr(",random_effects
                       #, "(1 | reg | domanID)" , "+"
                       , "(1 | hlt | plntStt)" , "+"
                       , "(1 | soil | musym)" , "+"
                       , "(1 | eco | nlcdCls)" , "+"
                       , "(1 | evo | taxonID"
                       , ")", collapse = "+")

formula_reflectance_only <- paste(paste("mvbind("
                       , paste(colnames(leaf_traits_data)[-1], collapse = " , "), ") ~ ", sep = "")
                       , paste("s(", colnames(reflectance_data)[-1], ")", collapse = " + ", sep = "") , " + "
                       , "(1 | site | siteID"
                       , ")", collapse = "+")
#check if positive definite, in case slightly modify it to be invertible, according to: 
taxaPD <- Matrix::nearPD( species_list$cov_taxa_eff)
fit <- brm(list_formulas, data = train_dataset, cores =2, chains = 2, iter = 2000,  
           family=brmsfamily("gaussian"), cov_ranef = list(taxonID = as.matrix(taxaPD$mat)))

refl_fit <- brm(formula_reflectance_only, data = train_dataset, cores =2, chains = 2, iter = 6000,  
           family=brmsfamily("gaussian"), prior = prior(horseshoe(), class = "b"))
arameters_summary <- VarCorr(refl_fit)

test dataset

test_dataset <- test_data %>%
  ungroup() %>%
  dplyr::select(-one_of("taxonID")) %>%
  inner_join(leaf_traits_data) %>%
  inner_join(climate_features) %>%
  inner_join(site_features) %>%
  inner_join(random_effects_data) %>%
  inner_join(reflectance_data) %>% 
  inner_join(species_features) %>%
  inner_join(soil_features) %>%
  inner_join(status_features) %>%
  unique

test_dataset<- test_dataset[!is.na(test_dataset[["taxonID"]]),]
test_dataset[!test_dataset[["taxonID"]] 
         %in% colnames(species_list$cov_taxa_eff),"taxonID"] %>% 
        unique %>% 
        print
test_dataset[test_dataset[["taxonID"]]=="ABIES","taxonID"] = "ABBA"
test_dataset[test_dataset[["taxonID"]]=="BETUL","taxonID"] = "BELE"
test_dataset[test_dataset[["taxonID"]]=="BOURR","taxonID"] = "BOSU2"
test_dataset[test_dataset[["taxonID"]]=="FRAXI","taxonID"] = "FRAM2"
test_dataset[test_dataset[["taxonID"]]=="HALES","taxonID"] = "HACA3"
test_dataset[test_dataset[["taxonID"]]=="NYSY","taxonID"] = "NYBI"
test_dataset[test_dataset[["taxonID"]]=="OXYDE","taxonID"] = "OXAR"
test_dataset[test_dataset[["taxonID"]]=="PINUS","taxonID"] = "PIEC2"
test_dataset[test_dataset[["taxonID"]]=="SASSA","taxonID"] = "SAAL5"

test_dataset = test_dataset %>% dplyr::filter(siteID %in% unique(train_dataset$siteID))
test_dataset = test_dataset[complete.cases(test_dataset),]
#test model

bR2 <- bayes_R2(fit, newdata= test_dataset, robust = T)
print(bR2)

parameters_summary <- VarCorr(refl_fit)
obj = list(mod = refl_fit, R2 = bR2, params =parameters_summary )
saveRDS(obj, "./outdir/refl_mod.rds")


MarconiS/hyperspectral_meteR documentation built on April 25, 2020, 12:59 a.m.