R/summary_figures.R

## An R script to summarize and look at various things
## based on discussion with Mark, Ian, and Colleeen
## Essentially, these things are meant to
## get everyone on the same page
## with how the urban scores are working for species


# packages
library(readr)
library(sf)
library(dplyr)
library(ggplot2)
library(scales)
library(ggcorrplot)
library(tidyr)

# assigned urbanness scores
urbanness <- readRDS("data/urbanness_summaries.RDS")

# read in cnc only data
cnc_data <- readRDS("data/inat_cnc.rds")

# read in regional/semi-continental lvl2 data
lvl2 <- readRDS("data/lvl2.rds")

# read in sf of cnc_area
cnc_area <- readRDS("data/cnc_area.rds")

# read in viirs_lights_obs
lights_data <- readRDS("data/viirs_lights_obs.RDS")

# lights for each town data
lights_towns <- readRDS("data/viirs_lights_towns.RDS")


# trim urbanness values to those with > 100 observations
urbanness.2 <- urbanness %>%
  dplyr::filter(number_of_records>=100)

# this trims the number of species from 15,595 to 1,728

# look at the relationship between the 0.25, median, mean etc.
# for each species
# based off lights data (i.e., urbanization level)
lvl2 %>%
  dplyr::filter(species %in% urbanness.2$species) %>%
  left_join(., lights_data, by="catalogNumber") %>%
  group_by(species) %>%
  summarize(med=median(lights, na.rm=TRUE),
            mean=mean(lights, na.rm=TRUE),
            quantile_0.25=quantile(lights, 0.25, na.rm=TRUE),
            quantile_0.75=quantile(lights, 0.75, na.rm=TRUE)) %>%
  dplyr::select(-species) %>%
  cor(.) %>%
  ggcorrplot(lab=TRUE)

ggsave("outputs/graphics/distribution_summary_correlation.png", width=6.8, height=5, units="in")

# all highly correlated
# so will continue with the 'median' for now as the urban score

# plot histogram of these 1700 species
ggplot(urbanness.2, aes(x=median_lights))+
  geom_histogram(fill="orange", color="black", bins=40)+
  theme_classic()+
  theme(axis.text=element_text(color="black"))+
  ylab("Count")+
  xlab("Urban score (median lights)")

ggsave("outputs/graphics/urban_score_histogram.png", width=6.8, height=5, units="in")

# re make this plot, but with a log-transformed axis
ggplot(urbanness.2, aes(x=median_lights))+
  geom_histogram(fill="orange", color="black", bins=40)+
  theme_classic()+
  scale_x_log10()+
  theme(axis.text=element_text(color="black"))+
  ylab("Count")+
  xlab("log Urban score (median lights)")

ggsave("outputs/graphics/urban_score_histogram_log-scale.png", width=6.8, height=5, units="in")

# now try and get a list of 6 or so species to plot where they occur along this axis
# first create a tiny dataframe with this
example_species <- urbanness.2 %>%
  dplyr::filter(species %in% c("Acalypha virginica",
                               "Acer negundo",
                               "Oxydendrum arboreum",
                               "Iris pseudacorus",
                               "Alces alces",
                               "Vicia villosa")) %>%
  mutate(y=c(40, 70, 60, 90, 170, 120))


# now combine with previous plot
ggplot()+
  geom_histogram(data=urbanness.2, aes(x=median_lights), 
                 fill="orange", color="black", bins=40)+
  geom_segment(data=example_species, aes(x=median_lights, xend=median_lights, 
                                         y=y, yend=0), color="red", 
               size=1.5, linetype="dashed")+
  geom_label(data=example_species, aes(x=median_lights, y=y, label=species))+
  theme_classic()+
  scale_x_log10()+
  theme(axis.text=element_text(color="black"))+
  ylab("Count")+
  xlab("log Urban score (median lights)")

ggsave("outputs/graphics/urban_score_histogram_log-scale_with_examples.png", width=7.5, height=5, units="in")


# make example plots of the distributions
# for our 6 example species
light_dat_subsetted <- lvl2 %>%
  dplyr::filter(species %in% example_species$species) %>%
  left_join(., lights_data, by="catalogNumber") %>%
  group_by(species) %>%
  mutate(med_lights=median(lights, na.rm=TRUE))

ggplot(light_dat_subsetted, aes(x=lights, fill=species))+
  geom_density(color="black")+
  scale_x_log10()+
  facet_wrap(~species)+
  theme_bw()+
  theme(axis.text=element_text(color="black"))+
  xlab(bquote('Average radiance ('* 'nW' ~cm^-2~sr^-1*')'))+
  ylab("Density")+
  geom_vline(aes(xintercept=med_lights), colour='red')+
  guides(fill=FALSE)+
  scale_fill_brewer(palette="Dark2")

ggsave("outputs/graphics/species_distribution_examples.png", width=7.5, height=5, units="in")

# plot the same thing, but with free scales
ggplot(light_dat_subsetted, aes(x=lights, fill=species))+
  geom_density(color="black")+
  scale_x_log10()+
  facet_wrap(~species, scales="free_y")+
  theme_bw()+
  theme(axis.text=element_text(color="black"))+
  xlab(bquote('Average radiance ('* 'nW' ~cm^-2~sr^-1*')'))+
  ylab("Density")+
  geom_vline(aes(xintercept=med_lights), colour='red')+
  guides(fill=FALSE)+
  scale_fill_brewer(palette="Dark2")

ggsave("outputs/graphics/species_distribution_examples_free_y_scales.png", width=7.5, height=5, units="in")

# Now lets look at the relationship between the 'continental/regional' scores
# and the score sif they were assigned locally
# first get a list of the 100 most abundant species within the cnc area
common_species_in_cnc <- cnc_data %>%
  group_by(species) %>%
  summarize(N=n()) %>%
  dplyr::filter(N>=50) %>%
  .$species

cnc_urbanness <- cnc_data %>%
  dplyr::filter(species %in% common_species_in_cnc) %>%
  left_join(., lights_data, by="catalogNumber") %>%
  group_by(species) %>%
  summarize(cnc_urban_score=median(lights, na.rm=TRUE))

# now look at the cnc urban score versus the regional urban score
plot_temp <- cnc_urbanness %>%
  left_join(., urbanness, by="species") %>%
  dplyr::select(species, median_lights, number_of_records, cnc_urban_score) %>%
  rename(regional_urban_score=median_lights) %>%
  dplyr::filter(number_of_records>=100)

ggplot(plot_temp, aes(x=cnc_urban_score, y=regional_urban_score))+
  geom_point()+
  theme_bw()+
  theme(axis.text=element_text(color="black"))+
  xlab("CNC urbanness (log)")+
  ylab("Regional urbanness (log)")+
  scale_x_log10(labels=comma)+
  scale_y_log10(labels=comma)+
  geom_smooth(method="lm", color="orange")+
  ggtitle(paste0("N = ", nrow(plot_temp), "species"))

ggsave("outputs/graphics/regional_vs_local_urbanness_scores.png", width=7.5, height=5, units="in")


# number of obs in CNC area that will be used
# i.e., removing the species in the cnc area which have < 100 observations througout
# the level 2 terra region as a whole

# first get list of species with >100 obs
greater_than_100_obs <- urbanness %>%
  dplyr::filter(number_of_records>=100) %>%
  .$species

# remove obs
cnc_trimmed <- cnc_data %>%
  dplyr::filter(species %in% greater_than_100_obs)


(nrow(cnc_trimmed)/nrow(cnc_data))*100

# Now do some spatial stuff
# looking at neighborhoods as the spatial unit of interest

# read in towns
towns <- st_read("data/towns_in_cnc_area/towns_in_cnc_area.shp")

inat_points <- st_as_sf(cnc_data, coords=c("decimalLongitude", "decimalLatitude"), crs=st_crs(cnc_area))

cnc_trimmed_points <- st_as_sf(cnc_trimmed, coords=c("decimalLongitude", "decimalLatitude"),
                               crs=st_crs(towns))

towns_joined <- cnc_trimmed_points %>%
  st_intersects(towns) %>%
  as.data.frame() %>%
  right_join(., cnc_trimmed_points %>%
               mutate(row.id=1:nrow(cnc_trimmed_points)), by="row.id") %>%
  left_join(., towns %>%
              st_set_geometry(NULL) %>%
              mutate(col.id=1:nrow(towns)), by="col.id")

# make a histogram of the number of data points from
# all the towns
towns_joined %>%
  group_by(TOWN) %>%
  summarize(N=n()) %>%
  arrange(N) %>%
  ggplot(., aes(x=N))+
  geom_histogram(color="black", fill="blue")+
  theme_bw()+
  theme(axis.text=element_text(color="black"))+
  xlab("Number of observations")+
  ylab("Count")

ggsave("outputs/graphics/number_of_obs_per_town.png", width=7.5, height=5, units="in")

sample_size <- towns_joined %>%
  group_by(TOWN) %>%
  summarize(N=n())

plot_sample_size_dat <- towns %>%
  dplyr::select(TOWN) %>%
  left_join(., sample_size, by="TOWN")

ggplot()+
  geom_sf(data=cnc_area)+
  geom_sf(data=plot_sample_size_dat, aes(fill=N))+
  theme_classic()+
  ggtitle("Number of iNat submissions")

ggsave("outputs/graphics/number_of_obs_per_town_map.png", width=7.5, height=5, units="in")

ggplot()+
  geom_sf(data=cnc_area)+
  geom_sf(data=plot_sample_size_dat, aes(fill=log10(N)))+
  scale_fill_continuous(labels=comma)+
  theme_classic()+
  ggtitle("Number of iNat submissions")

ggsave("outputs/graphics/number_of_obs_per_town_map_log10.png", width=7.5, height=5, units="in")

# Now start looking at the relationship between
# the night-time lights within a town
# and the observation sampling within a town
# let's pick 6 towns and then remake the plots for the 'species' example above
example_towns <- c("WALTHAM", "LINCOLN", "PLYMOUTH", "WELLESLEY", "CONCORD", "QUINCY")

# make example plots of the distributions
# for our 6 example cities
town_dat_subsetted <- lights_towns %>%
  dplyr::filter(TOWN %in% example_towns) %>%
  group_by(TOWN) %>%
  mutate(med_lights=median(viirs_values, na.rm=TRUE)) %>%
  ungroup() %>%
  dplyr::filter(complete.cases(.))

ggplot(town_dat_subsetted, aes(x=viirs_values, fill=TOWN))+
  geom_density(color="black")+
  scale_x_log10()+
  facet_wrap(~TOWN)+
  theme_bw()+
  theme(axis.text=element_text(color="black"))+
  xlab(bquote('Average radiance ('* 'nW' ~cm^-2~sr^-1*')'))+
  ylab("Density")+
  geom_vline(aes(xintercept=med_lights), colour='red')+
  guides(fill=FALSE)+
  scale_fill_brewer(palette="Dark2")

ggsave("outputs/graphics/town_light_distribution_examples.png", width=7.5, height=5, units="in")

# get the median lights of all pixels for each town
town_urban_summary <- lights_towns %>%
  group_by(TOWN) %>%
  summarize(median_underlying_score=median(viirs_values, na.rm=TRUE),
            sd_underlying_score=sd(viirs_values, na.rm=TRUE),
            number_pixels=n(viirs_values))

# now to make sure this makes sense
# lets make a map showing the median scores for each town polygon
plot_median_scores_dat <- towns %>%
  dplyr::select(TOWN) %>%
  left_join(., town_urban_summary, by="TOWN")

ggplot()+
  geom_sf(data=cnc_area)+
  geom_sf(data=plot_median_scores_dat, aes(fill=log(median_underlying_score)))+
  theme_classic()+
  ggtitle("Urbanness of towns")

ggsave("outputs/graphics/map_of_town_urbanness.png", width=7.5, height=5, units="in")

# it appears to make logical sense

# now we want to look at the distribution of the actual observations in a given town/spatial unit
# once again, use the example six towns from above
town_dat_observations <- towns_joined %>%
  left_join(., lights_data, by="catalogNumber") %>%
  dplyr::filter(TOWN %in% example_towns) %>%
  group_by(TOWN) %>%
  mutate(med_lights=median(lights, na.rm=TRUE)) %>%
  ungroup()

ggplot(town_dat_observations, aes(x=lights, fill=TOWN))+
  geom_density(color="black")+
  scale_x_log10()+
  facet_wrap(~TOWN)+
  theme_bw()+
  theme(axis.text=element_text(color="black"))+
  xlab(bquote('Average radiance ('* 'nW' ~cm^-2~sr^-1*')'))+
  ylab("Density")+
  geom_vline(aes(xintercept=med_lights), colour='red')+
  guides(fill=FALSE)+
  scale_fill_brewer(palette="Dark2")

ggsave("outputs/graphics/town_observation_distribution_examples.png", width=7.5, height=5, units="in")


# now summarize the observations within a town, regardless of species
town_observations_summary <- towns_joined %>%
  group_by(TOWN) %>%
  left_join(., lights_data, by="catalogNumber") %>%
  summarize(observation_median=median(lights, na.rm=TRUE),
            observation_sd=sd(lights, na.rm=TRUE))

# look at the relationship between the town observations
# and the town underlying pixels
town_observations_summary %>%
  left_join(., town_urban_summary) %>%
  ggplot(., aes(x=observation_median, median_underlying_score))+
  geom_point()+
  scale_x_log10()+
  scale_y_log10()+
  theme_bw()+
  theme(axis.text=element_text(color="black"))+
  xlab("Median score of all observations")+
  ylab("Median score of pixels in a town")+
  geom_smooth(method="lm", color="orange")

ggsave("outputs/graphics/relationship_between_sampling_and_town_urbanization.png", width=7.5, height=5, units="in")


# now summarize the median of all the species in a town
# for a third distribution to compare
town_species_summary <- towns_joined %>%
  left_join(., urbanness) %>%
  group_by(TOWN) %>%
  summarize(species_median_all_obs=median(median_lights),
            sd_all_obs=sd(median_lights),
            number_obs=n(species),
            number_species=length(unique(species))) %>%
  left_join(., towns_joined %>%
              left_join(., urbanness) %>%
              dplyr::select(TOWN, species, median_lights) %>%
              distinct() %>%
              group_by(TOWN) %>%
              summarize(species_median_distinct=median(median_lights),
                        sd_median_distinct=sd(median_lights)), by="TOWN")



# plot an example of two towns, with the three distributions for each town
town_dat_lights_ex <- town_dat_subsetted %>%
  dplyr::filter(TOWN %in% c("WALTHAM", "CONCORD")) %>%
  dplyr::rename(lights=viirs_values) %>%
  dplyr::select(TOWN, lights, med_lights) %>%
  mutate(distribution="Town pixels")

town_dat_obs_ex <- towns_joined %>%
  left_join(., lights_data, by="catalogNumber") %>%
  dplyr::filter(TOWN %in% c("WALTHAM", "CONCORD")) %>%
  group_by(TOWN) %>%
  mutate(med_lights=median(lights, na.rm=TRUE)) %>%
  ungroup() %>%
  dplyr::select(TOWN, lights, med_lights) %>%
  mutate(distribution="Town observations")

town_dat_species_ex <- towns_joined %>%
  left_join(., urbanness) %>%
  dplyr::filter(TOWN %in% c("WALTHAM", "CONCORD")) %>%
  dplyr::select(TOWN, median_lights) %>%
  dplyr::rename(lights=median_lights) %>%
  group_by(TOWN) %>%
  mutate(med_lights=median(lights)) %>%
  ungroup() %>%
  mutate(distribution="Species-specific urban scores")

bind_rows(town_dat_lights_ex,
          town_dat_obs_ex,
          town_dat_species_ex) %>%
  ggplot(., aes(x=lights, fill=distribution))+
  geom_density(color="black", alpha=0.6)+
  scale_x_log10()+
  facet_wrap(~TOWN)+
  theme_bw()+
  theme(axis.text=element_text(color="black"))+
  xlab(bquote('Average radiance ('* 'nW' ~cm^-2~sr^-1*')'))+
  ylab("Density")+
  geom_vline(aes(xintercept=med_lights), colour='red')+
  scale_fill_brewer(palette="Dark2")
            
ggsave("outputs/graphics/two_towns_example_of_three_distributions.png", width=7.5, height=5, units="in")            
            
            
# create a large table that can be used for comparisons
summary_dataframe <- town_urban_summary %>%
  left_join(., town_observations_summary) %>%
  left_join(., town_species_summary)
            
write_csv(summary_dataframe, "C:/Users/CTC/Desktop/summary_of_towns_for_Mark.csv")            
iozeroff/cncpointR documentation built on Feb. 4, 2020, 6:18 p.m.