# Scripts/Regularized_regression_model.R

###############################################
# Regularized regression model of tree growth #
###############################################

# Author: Stuart Graham
# Created: 10/3/2020
# Last edited: 11/10/2020

# Load TreeNeighborhood package
devtools::load_all()

# Load other required packages. dplyr is attached explicitly because this
# script calls filter(), inner_join(), left_join(), select(), group_by(),
# summarize() and the %>% pipe, which load_all() does not place on the
# search path on its own.
library(dplyr)
library(glmnet)
library(ggplot2)

######################
# Part 1. Loading data
######################

# Load mapping data (use TRUE/FALSE rather than T/F, which are reassignable)
mapping <- read.csv("Data/Cleaned_mapping_2017.csv", stringsAsFactors = FALSE)

# Load tree measurement data
tree <- read.csv("Data/Cleaned_tree_growth_2017.csv", stringsAsFactors = FALSE)

# Load stand elevation data
elev <- read.csv("Data/stand_elevations.csv", stringsAsFactors = FALSE)

################################
# Part 2. Creating neighborhoods
################################

# Obtain all neighborhood data within a 15 m radius of each focal tree
neighbors <- graph_mat_all(mapping, radius = 15)

# Keep only focal trees whose full neighborhood lies inside the stand
# boundary, and drop small competitors, in a single filtering step
# (comma-separated filter() conditions are combined with AND)
neighbors <- neighbors %>%
  filter(x_coord >= 15, x_coord <= 85,
         y_coord >= 15, y_coord <= 85,
         size_cat_comp == "regular")

###################################
# Part 3. Calculating annual growth
###################################

# Calculate annual growth for all trees
growth <- growth_summary(tree)

# Produces some NA values. Remove those caused by a tree being measured only once,
# which would result in an annual growth measurement of NA
growth <- growth[growth$first_record != growth$last_record, ]

# Check for any remaining NA annual growth values
sum(is.na(growth$annual_growth))

# Check for remaining NaNs in size corrected growth
sum(is.nan(growth$size_corr_growth))

# NaNs could be caused by negative annual growth values. Remove these.
# Guard with !is.na(): a bare `>= 0` comparison on an NA growth value
# returns NA, and subsetting with NA would retain a row of NAs instead
# of dropping it.
growth <- growth[!is.na(growth$annual_growth) & growth$annual_growth >= 0, ]

# Confirm no remaining NaNs in size corrected growth
sum(is.nan(growth$size_corr_growth))

################################################
# Part 4. Combining growth and neighborhood data
################################################

# Keep only the growth columns needed downstream
growth_cols <- growth %>% select(tree_id, midpoint_size, size_corr_growth)

# Attach growth to neighborhoods (inner join: focals measured only once or
# with negative growth have no growth record), then add elevation and side
full <- neighbors %>%
  inner_join(growth_cols, by = "tree_id") %>%
  left_join(elev, by = "stand_id")

# Stand identifier is categorical, so store it as a factor
full$stand_id <- as.factor(full$stand_id)

##############################################
# Part 5. Dividing into training and test data
##############################################

# Half the neighborhood radius: buffer width between spatial folds so that
# training and test neighborhoods do not overlap
half_rad <- 15 / 2

# Specify training data subsets (alternative folds kept commented below)
full_sub1 <- full %>% filter(y_coord <= 50 - half_rad) # Training 1
full_sub2 <- full %>% filter(x_coord >= 50 + half_rad, y_coord > 50 - half_rad) # Training 1
#full_sub1 <- full %>% filter(x_coord <= 50 - (15 / 2)) # Training 2
#full_sub2 <- full %>% filter(y_coord <= 50 - (15 / 2) & x_coord > 50 - (15 / 2)) # Training 2
#full_sub1 <- full %>% filter(y_coord >= 50 + (15 / 2)) # Training 3
#full_sub2 <- full %>% filter(x_coord <= 50 - (15 / 2) & y_coord < 50 + (15 / 2)) # Training 3
#full_sub1 <- full %>% filter(x_coord >= 50 + (15 / 2)) # Training 4
#full_sub2 <- full %>% filter(y_coord >= 50 + (15 / 2) & x_coord < 50 + (15 / 2)) # Training 4

# Combine the two spatial subsets into the full training set
training <- rbind(full_sub1, full_sub2)

# Define test data (alternative test quadrants kept commented below)
test <- full %>% filter(y_coord >= 50 + half_rad, x_coord <= 50 - half_rad) # Test 1
#test <- full %>% filter(y_coord >= 50 + (15 / 2) & x_coord >= 50 + (15 / 2)) # Test 2
#test <- full %>% filter(y_coord <= 50 - (15 / 2) & x_coord >= 50 + (15 / 2)) # Test 3
#test <- full %>% filter(y_coord <= 50 - (15 / 2) & x_coord <= 50 - (15 / 2)) # Test 4


######################################
# Part 6. Creating model design matrix
######################################

# Subset the training data to a single focal species
# NOTE(review): the training species here is "CANO", but Part 8 subsets the
# test data to "ABAM" and the coefficient table is saved as "ABAM1_elev.csv" —
# confirm which species this run is meant to model before using the output.
sing_sp <- droplevels(training[training$species == "CANO", ])

# Explore rare competitors (exploratory; kept commented)
#rare_comp_table <- table(sing_sp$sps_comp)[which(table(sing_sp$sps_comp) < 100)]
#length(rare_comp_table)
#sum(rare_comp_table)
#rare_comp_table
#prop.table(rare_comp_table)

# Find common competitors (>= 100 neighborhood records)
comps <- names(which(table(sing_sp$sps_comp) >= 100))

# Change rare competitor species to OTHR (use !(x %in% y) rather than
# comparing the logical vector against F)
sing_sp$sps_comp[!(sing_sp$sps_comp %in% comps)] <- "OTHR"

# Create OTHR density column by summing densities over rare competitors
comps <- c(comps, "all")
density_cols <- rep(NA, times = length(comps))
for (i in seq_along(comps)) {
  # NOTE(review): grep() does partial regex matching, so this assumes each
  # species code matches exactly one column name; a code matching several
  # columns would make this assignment error — confirm column naming is
  # unambiguous
  density_cols[i] <- grep(comps[i], names(sing_sp))
}
other_cols <- setdiff(grep("density", names(sing_sp)), density_cols)
# rowSums() is the vectorized equivalent of apply(x, 1, sum)
sing_sp$OTHR_density <- rowSums(sing_sp[, other_cols])

# Add indicator for conspecific (intra = 1) vs. heterospecific (intra = 0)
# competition
sing_sp$intra <- 0
sing_sp$intra[sing_sp$sps_comp == sing_sp$species] <- 1

# Convert competitor species and side to factor for model.matrix()
sing_sp$sps_comp <- as.factor(sing_sp$sps_comp)
sing_sp$side <- as.factor(sing_sp$side)

# Create matrix of factor variables, keeping all factor levels
# (contrasts = FALSE: no reference level is dropped; the regularized model
# handles the resulting collinearity). Alternative formulas kept commented.
#dm_fac <- model.matrix(size_corr_growth ~ sps_comp + stand_id, sing_sp,
#                       contrasts.arg = list(sps_comp = contrasts(sing_sp$sps_comp, contrasts = F),
#                       stand_id = contrasts(sing_sp$stand_id, contrasts = F)))
#dm_fac <- model.matrix(size_corr_growth ~ sps_comp, sing_sp,
#                       contrasts.arg = list(sps_comp = contrasts(sing_sp$sps_comp, contrasts = F)))
dm_fac <- model.matrix(size_corr_growth ~ sps_comp + side, sing_sp,
                       contrasts.arg = list(sps_comp = contrasts(sing_sp$sps_comp, contrasts = FALSE),
                       side = contrasts(sing_sp$side, contrasts = FALSE)))

# Combine factor predictors with continuous predictors (prox, size_comp_dbh, all_density, species-specific densities)
# NOTE(review): columns are selected by hard-coded position (9, 12, 36:38);
# selecting by name would be safer if the upstream column layout changes —
# TODO confirm these indices against the current neighborhood output.
dm <- cbind(dm_fac, as.matrix(sing_sp[c(9, 12, density_cols, 36:38)])) # including intra and elev

# Standardize variables except for first column (intercept)
dm[, 2:ncol(dm)] <- apply(dm[, 2:ncol(dm)], 2, z_trans)

# Change columns of NaNs (no variation, so standardization divides by zero)
# to zeros
dm[, which(is.nan(dm[1, ]))] <- 0

###########################
# Part 7. Fitting the model
###########################

# Fit the model 100 times
all_fits_list <- mod_iter(des_mat = dm, growth_dat = sing_sp, runs = 100)

# Separate coefficients table from model in output
all_fits <- all_fits_list[[1]]
mod <- all_fits_list[[2]]

# Identify parameter columns needed to test hypotheses
comp_id_cols <- grep("sps_comp", names(all_fits))
prox_col <- which(names(all_fits) == "prox")
size_comp_col <- which(names(all_fits) == "size_comp_dbh")
intra_col <- which(names(all_fits) == "intra")
dens_cols <- grep("density", names(all_fits))

# Create new summation columns needed to test hypotheses
# (rowSums() replaces the slower apply(x, 1, sum))
all_fits$nbhds <- abs(rowSums(all_fits[, c(comp_id_cols, prox_col, size_comp_col,
                                           intra_col, dens_cols)]))
all_fits$compID <- abs(rowSums(all_fits[, c(comp_id_cols, intra_col)]))

# In what percentage of model runs is competitive neighborhood important?
pct_dif(all_fits$nbhds, "greater", 0)

# In what percentage of model runs is species identity of competitors important?
pct_dif(all_fits$compID, "greater", 0)

# In what percentage of model runs is conspecific competition strongest?
pct_dif(all_fits$intra, "less", 0)

# In what percentage of model runs is heterospecific competition strongest?
pct_dif(all_fits$intra, "greater", 0)

# In what percentage of models is each species a strong competitor?
apply(all_fits[, comp_id_cols], 2, pct_dif, direction = "less", value = 0)

# In what percentage of models is each species a weak competitor?
apply(all_fits[, comp_id_cols], 2, pct_dif, direction = "greater", value = 0)

# Predict the training data with the best model, then collapse the multiple
# neighborhood rows of each focal tree to one observed/predicted pair
int_pred <- predict(mod, newx = dm, s = "lambda.1se")
int_obs <- sing_sp %>% select(tree_id, size_corr_growth)
comparison <- data.frame(tree_id = int_obs$tree_id,
                         obs = int_obs$size_corr_growth,
                         pred = as.numeric(int_pred))
comparison <- comparison %>%
  group_by(tree_id) %>%
  summarize(observations = obs[1],
            predictions = mean(pred))

# Common axis limit so the 1:1 line is comparable on both axes
axis_max <- max(comparison$predictions, comparison$observations) + 0.01

# Plot predictions vs. observations with a 1:1 reference line
ggplot(comparison, aes(x = predictions, y = observations)) +
  geom_hex() +
  theme_bw() +
  ylim(-0.01, axis_max) +
  xlim(-0.01, axis_max) +
  geom_abline(intercept = 0, slope = 1)

# Compute coefficient of determination
coef_det(comparison)

# Slope of observed vs. predicted growth through the origin
slope_fit <- lm(observations ~ 0 + predictions, data = comparison)
coef(slope_fit)

# Extract model coefficients
# Extract model coefficients (kept for reference; not run)
#mod_coef <- as.matrix(coef(mod))
#mod_coef <- cbind(mod_coef, as.matrix(coef(mod)))

# Save model coefficients table
# NOTE(review): filename says ABAM but the training subset in Part 6 filters
# to "CANO" — confirm the intended species/run before overwriting this file
write.csv(all_fits, "Data/ABAM1_elev.csv")



################################









##############################
# Part 8. Predicting test data
##############################

# Subset the test data to a single focal species
# NOTE(review): species here is "ABAM" while the training subset in Part 6
# uses "CANO" — confirm both parts are meant to model the same species.
test_ss <- droplevels(test[test$species == "ABAM", ])

# Change rare competitor species to OTHR, using the common-competitor set
# (comps) and rare-density columns (other_cols) defined from the TRAINING
# data so train and test share the same categories
test_ss$sps_comp[!(test_ss$sps_comp %in% comps)] <- "OTHR"

# Create OTHR density column by summing densities over rare competitors
# (rowSums() is the vectorized equivalent of apply(x, 1, sum))
test_ss$OTHR_density <- rowSums(test_ss[, other_cols])

# Add indicator for conspecific (intra = 1) vs. heterospecific (intra = 0)
# competition
test_ss$intra <- 0
test_ss$intra[test_ss$sps_comp == test_ss$species] <- 1

# Convert competitor species and side to factor for model.matrix()
test_ss$sps_comp <- as.factor(test_ss$sps_comp)
test_ss$side <- as.factor(test_ss$side)

# Create matrix of factor variables
#dm_fac_test <- model.matrix(size_corr_growth ~ sps_comp + stand_id, test_ss,
#                            contrasts.arg = list(sps_comp = contrasts(test_ss$sps_comp, contrasts = F),
#                                                 stand_id = contrasts(test_ss$stand_id, contrasts = F)))
#dm_fac_test <- model.matrix(size_corr_growth ~ sps_comp, test_ss,
#                            contrasts.arg = list(sps_comp = contrasts(test_ss$sps_comp, contrasts = F)))
dm_fac_test <- model.matrix(size_corr_growth ~ sps_comp + side, test_ss,
                       contrasts.arg = list(sps_comp = contrasts(test_ss$sps_comp, contrasts = F),
                                            side = contrasts(test_ss$side, contrasts = F)))

# Combine factor predictors with continuous predictors (prox, size_comp_dbh, all_density, species-specific densities)
#dm_test <- cbind(dm_fac_test, as.matrix(test_ss[c(9, 12, density_cols, 35, 36)])) # including intra
dm_test <- cbind(dm_fac_test, as.matrix(test_ss[c(9, 12, density_cols, 36:38)])) # including intra and elev

# Add any columns to dm_test that are missing (but present in dm)
missing_cols <- colnames(dm)[colnames(dm) %in% colnames(dm_test) == F]
for(i in 1:length(missing_cols)){
  dm_test <- cbind(dm_test, rep(0, times = nrow(dm_test)))
  colnames(dm_test)[ncol(dm_test)] <- missing_cols[i]
}
dm_test <- dm_test[, match(colnames(dm), colnames(dm_test))]

# Standardize variables except for first column (intercept)
dm_test[, 2:ncol(dm_test)] <- apply(dm_test[, 2:ncol(dm_test)], 2, z_trans)

# Change columns of NaNs (no variation) to zeros
dm_test[, which(is.nan(dm_test[1, ]))] <- 0

# Predict the held-out test data, then collapse the multiple neighborhood
# rows of each focal tree to one observed/predicted pair
test_pred <- predict(mod, newx = dm_test, s = "lambda.1se")
test_obs <- test_ss %>% select(tree_id, size_corr_growth)
comparison_test <- data.frame(tree_id = test_obs$tree_id,
                              obs = test_obs$size_corr_growth,
                              pred = as.numeric(test_pred))
comparison_test <- comparison_test %>%
  group_by(tree_id) %>%
  summarize(observations = obs[1],
            predictions = mean(pred))

# Common axis limit so the 1:1 line is comparable on both axes
axis_max <- max(comparison_test$predictions, comparison_test$observations) + 0.01

# Plot predictions vs. observations with a 1:1 reference line
ggplot(comparison_test, aes(x = predictions, y = observations)) +
  geom_hex() +
  theme_bw() +
  ylim(-0.01, axis_max) +
  xlim(-0.01, axis_max) +
  geom_abline(intercept = 0, slope = 1)

# Compute coefficient of determination
coef_det(comparison_test)

# Slope of observed vs. predicted growth through the origin
slope_fit <- lm(observations ~ 0 + predictions, data = comparison_test)
coef(slope_fit)
# sgraham9319/ForestPlotR documentation built on April 15, 2022, 4:01 p.m.