data-raw/data_collation.R

# Check full dataset for obvious mistakes
library(tidyverse)
library(forcats)

seedlings <- read_csv("data-raw/seedling_list.csv")

biomass <- read_csv("data-raw/biomass/biomass_weights.csv")
pretransplant <- read_csv("data-raw/biomass/pretransplant_weights.csv")

seed <- read_csv("data-raw/seed/seed_weight_count.csv")

leaf <- read_csv("data-raw/sla/sla.csv")

pots <- read_csv("data-raw/study_design.csv")



# Relabel A. bigeniculata as B. hordaeceus, relevel factors
samples <- full_join(seedlings, biomass) %>%
  full_join(seed) %>%
  full_join(leaf) %>%
  left_join(pots) %>%
  filter(!grepl("died|not planted", planting_notes)) %>%
  mutate(fertility = fct_relevel(fertility, "LOW", "MED", "HIGH"),
         species_code = fct_recode(species_code, BHOR = "ABIG"),
         species_code = fct_relevel(species_code, "ABAR", "BDIA", "ECUR")) %>%
  group_by(pot) %>%
  mutate(total_density = n()) %>%
  ungroup() %>%
  select(-native_dens, -exotic_dens, -total_dens,
         -weight_notes, -seed_notes, -leaf_weight_notes)


# Impute missing ABAR seed weight
abar <- filter(samples, species_code == "ABAR") %>%
  select(seedling_id, sample_id, biomass_weight_g,
         seed_weight_g, seed_number, husk_number,
         fertility, total_density, block, pot) %>%
  mutate(single_seed = seed_weight_g / seed_number)

# Choose 1:1 husk-seed ratio
ggplot(abar, aes(x = seed_number, y = husk_number)) +
  geom_point() +
  geom_abline(aes(intercept = 0, slope = 1)) +
  geom_abline(aes(intercept = 0, slope = 2), color = "red") +
  labs(title= "Husk to seed ratio in ABAR",
       subtitle = "black = 1:1, red = 2:1")

# Regress avg. seed weight against biomass
lm(single_seed ~ biomass_weight_g, data = abar)

# Impute using individaul avg. weight and biomass regression methods
imp <-  group_by(abar, pot) %>%
  mutate(single_seed_pot =
           ifelse(is.na(seed_weight_g),
                  mean(single_seed, na.rm = T),
                  single_seed)) %>%
  ungroup() %>%
  mutate(est_weight_pot = husk_number * single_seed_pot,
         est_weight_reg = husk_number * (0.0078 + 0.0065 * biomass_weight_g))

# Calculate RMSE
mutate(imp, pot_err = est_weight_pot - seed_weight_g,
            reg_err = est_weight_reg - seed_weight_g) %>%
  summarise(pot_rmse = sqrt(mean(pot_err ^ 2, na.rm = T)),
            reg_rmse = sqrt(mean(reg_err ^ 2, na.rm = T)))

# Plot methods
gather(imp, method, value, est_weight_pot:est_weight_reg) %>%
  ggplot(., aes(seed_weight_g, value)) +
  geom_point() +
  geom_abline(aes(intercept = 0, slope = 1)) +
  facet_grid(~method) +
  theme(aspect.ratio = 1) +
  labs(x = "Observed seed weight (g)",
       y = "Imputed total seed weight (g)",
       title = "Comparison of imputation methods against observed data",
       subtitle = "Black line describes 1:1 correspondence")


# Compare against other species
corrected_samples <-
  select(imp, seedling_id, est_weight_pot, est_weight_reg) %>%
  left_join(samples, .) %>%
  mutate(est_weight_pot = ifelse(is.na(est_weight_pot), seed_weight_g, est_weight_pot),
         est_weight_reg = ifelse(is.na(est_weight_reg), seed_weight_g, est_weight_reg))


gather(corrected_samples,
       method, val,
       est_weight_pot:est_weight_reg) %>%
  filter(!is.na(val)) %>%
    ggplot(., aes(x = biomass_weight_g, y = val)) +
  geom_point(aes(color = species_code)) +
  geom_abline(aes(intercept=0, slope = 1)) +
  facet_grid(method ~ fertility) +
  theme(aspect.ratio = 1) +
  annotate(geom = "segment", x = -Inf, xend = -Inf, y = -Inf, yend = Inf) +
  annotate(geom = "segment", x = -Inf, xend = Inf, y = -Inf, yend = -Inf) +
  labs(x = "Vegetative biomass (g)",
       y = "Total seed weight (g)",
        color = "",
       title = "Biomass, seed output comparison of four species")

ggsave(last_plot(), filename = 'figures/Sxx_corrected_seedmass.png',
  device = "png", dpi = 600, width = 10, height = 8)

# Use regression method for corrected seed weight
samples <- group_by(corrected_samples, seedling_id) %>%
  mutate(aboveground_biomass_g =
           sum(biomass_weight_g,
           est_weight_reg,
           total_leaf_weight_g, na.rm = T)) %>%
  ungroup() %>%
  select(seedling_id,
         block,
         pot,
         fertility,
         mixture = competition,
         total_density,
         position,
         species_code,
         date_planted,
         age_days,
         native,
         exotic,
         aboveground_biomass_g,
         vegetative_biomass_g = biomass_weight_g,
         corrected_seed_weight_g = est_weight_reg,
         total_SLA)

usethis::use_data(samples, overwrite = T)



# Summarise pre-transplant seedling biomass
biomass_t0 <- mutate(pretransplant,
                     species_code = fct_recode(species_code, BHOR = "ABIG"),
                     species_code = fct_relevel(species_code, "ABAR", "BDIA", "ECUR")) %>%
  group_by(species_code, sheet) %>%
  summarise(n_individuals = sum(n),
    aboveground_sample_mean = sum(aboveground) / n_individuals)

usethis::use_data(biomass_t0, overwrite = T)
aornugent/competition documentation built on Oct. 3, 2019, 6:33 p.m.