# Default knitr chunk options for the rendered document: `collapse = TRUE`
# merges each chunk's source and its output into one block, and `comment`
# sets the prefix put in front of every printed output line.
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)
library(alohakez)
library(tidyverse)

take a look at data first

# Preview the first rows of the data set.
vcr_fish %>% head()
# Storage type of every column. vapply() pins the result to a named
# character vector; sapply() would silently return a list for a table with
# no columns.
vapply(vcr_fish, typeof, FUN.VALUE = character(1))

data pre-processing

# Derive calendar and clock components from the raw sampling columns.
# Converts sample_date to Date and adds the integer columns sample_year,
# sample_month, sample_day (from sample_date) and sample_hour, sample_min
# (from sample_time).
# NOTE(review): assumes sample_date is "YYYY-MM-DD" text and sample_time is
# "HH:MM" text, as implied by the format strings -- confirm against the data.
vcr_fish <- vcr_fish %>%
  mutate(
    sample_date = as.Date(sample_date, "%Y-%m-%d"),
    sample_year = as.integer(format(sample_date, format = "%Y")),
    sample_month = as.integer(format(sample_date, format = "%m")),
    sample_day = as.integer(format(sample_date, format = "%d"))
  )
vcr_fish <- vcr_fish %>%
  mutate(
    # Parse the clock time once. A bare "%H:%M" string is completed with the
    # current date, so in the session's local time zone a time that falls in
    # a daylight-saving gap would turn into NA; pinning the parse to UTC
    # avoids that. The helper column is dropped again at the end.
    .sample_clock = as.POSIXct(sample_time, format = "%H:%M", tz = "UTC"),
    sample_hour = as.integer(format(.sample_clock, format = "%H")),
    sample_min = as.integer(format(.sample_clock, format = "%M")),
    .sample_clock = NULL
  )

check

# Re-check the storage type of every column after pre-processing.
# vapply() guarantees a named character vector regardless of the input shape.
vapply(vcr_fish, typeof, FUN.VALUE = character(1))

look at fish length

# Histogram of all recorded lengths, using bins 30 units wide.
ggplot(vcr_fish, aes(x = length)) +
  geom_histogram(binwidth = 30)

there is one super long fish

let's see how many fish were counted in each record (the distribution of the `count` column)

# Tally how many rows share each value of the `count` column.
count(vcr_fish, count)

filter out rows where count or length is NA, since we focus on fish for now

# Keep only the rows where neither `count` nor `length` is missing.
fish <- filter(vcr_fish, !(is.na(count) | is.na(length)))
# To inspect the rows that were dropped instead:
#vcr_fish %>% filter(is.na(count) | is.na(length))
# Number of remaining rows per site.
count(group_by(fish, site))

We want to know how many of each species we have

# Sum the `count` column within each species to get one total per species.
species_count <- summarise(
  group_by(fish, species_name),
  total_count = sum(count)
)
species_count

from the plot, we can see there are 32 species in this data

# Horizontal bar chart: one bar per species, bar length = total_count.
ggplot(species_count, aes(y = species_name, weight = total_count)) +
  geom_bar()
# Names of the species whose total exceeds 300, ordered from the smallest
# total to the largest.
main_species_name <- species_count %>%
  filter(total_count > 300) %>%
  arrange(total_count) %>%
  pull(species_name)
main_species_name

"Bay Anchovy" "Silver Perch" "Silversides" "Pinfish" "Pipefish"

look at these species

# Restrict the fish records to the species selected in main_species_name.
main_species_r <- filter(fish, species_name %in% main_species_name)
# Total count per species within that subset.
summarise(
  group_by(main_species_r, species_name),
  total_count = sum(count)
)
# Records whose count is not 1, largest counts first.
main_species_r %>%
  select(species_name, count, length) %>%
  filter(count != 1) %>%
  arrange(desc(count))

let's see a histogram based on species

# Length histogram (bins 10 units wide), bars filled by species.
ggplot(main_species_r, aes(x = length, fill = species_name)) +
  geom_histogram(binwidth = 10)

we suspect there is an outlier, remove it

# Set aside the record(s) with the largest length in case they are needed
# later, then drop every record with that length from the working data.
pinfish_outlier <- main_species_r %>% slice_max(length) # store it in case
# slice_max() keeps ties, so pinfish_outlier may hold more than one row.
# `%in%` stays well-defined for any number of stored rows, whereas `!=`
# against a multi-element vector would recycle it element-wise.
main_species <- main_species_r %>%
  filter(!(length %in% pinfish_outlier$length))

let's plot it again

# Same length histogram as before, now drawn from main_species.
ggplot(main_species, aes(x = length, fill = species_name)) +
  geom_histogram(binwidth = 10)

there are rows where the count is bigger than one; if we want to analyze the data, it should be taken into account that those rows carry more weight

# List the records whose count is not 1, largest counts first, showing only
# the species, the count and the length.
main_species_r %>%
  select(species_name, count, length) %>%
  filter(count != 1) %>%
  arrange(desc(count))

duplicate these rows

# Expand the data to one row per fish: every record is repeated `count`
# times, so a record with count k ends up as k identical rows.
# uncount() handles any count value, instead of a hand-maintained list of the
# counts that happen to occur (2, 3, 5, 6, 19, 50, 56). `.remove = FALSE`
# keeps the `count` column in the result.
# The copies of a record now sit next to each other rather than being
# appended at the end; the set of rows is the same for the counts listed
# above.
# NOTE(review): a record with count 0 would be dropped, and a non-integer
# count would raise an error -- confirm neither occurs in the data.
main_species <- main_species %>%
  uncount(count, .remove = FALSE)

draw density plots of these main species

# Length histogram (bins 10 units wide), bars filled by species.
# Length is on the x axis and the bar height is a row count, so the axes are
# labelled accordingly (the y axis was previously labelled "fish length").
main_species %>%
  ggplot() +
  geom_histogram(aes(x = length, fill = species_name), binwidth = 10) +
  labs(x = "fish length",
       y = "count",
       title = "Histogram of fish length based on species")
# Density-scaled length histogram with a kernel density curve per species
# drawn on top.
main_species %>%
  ggplot(aes(x = length)) +
  # after_stat(density) is the supported spelling of the deprecated
  # `..density..` notation; it rescales bar heights to densities so they
  # share a scale with geom_density().
  geom_histogram(aes(y = after_stat(density),
                     fill = species_name, color = species_name),
                 binwidth = 10) +
  geom_density(aes(fill = species_name), alpha = 0.8, size = 0.2) +
  # Length is on x; y is a density (it was previously labelled
  # "fish length").
  labs(x = "fish length",
       y = "density",
       title = "density plot of fish length")
# Kernel density of length per species, with the species curves stacked.
main_species %>%
  ggplot(aes(length, fill = species_name, color = species_name)) +
  geom_density(alpha = 0.6, position = "stack") +
  # Length is on x; y is a density (it was previously labelled
  # "fish length").
  labs(x = "fish length",
       y = "density",
       title = "stacked density plot of fish length")

see each species

# One length histogram per species (bins 15 units wide), one panel each.
ggplot(main_species, aes(length, fill = species_name)) +
  geom_histogram(binwidth = 15) +
  facet_wrap(~species_name)
# One kernel density curve of length per species, one panel each.
main_species %>%
  ggplot(aes(length, fill = species_name, color = species_name)) +
  # alpha is an opacity in [0, 1]; the previous value of 2 was out of range.
  # 1 (fully opaque) is the closest valid value.
  # NOTE(review): lower it if a translucent fill was intended.
  geom_density(alpha = 1) +
  facet_wrap(~species_name)

look at pipefish: interesting, why are there 2 clusters?

# Number of Pipefish rows per sampling year and month.
count(
  group_by(
    filter(main_species, species_name == "Pipefish"),
    sample_year, sample_month
  )
)

# Number of Pipefish rows per site.
count(
  group_by(
    filter(main_species, species_name == "Pipefish"),
    site
  )
)
# Pipefish length histogram (bins 10 units wide), one panel per site.
ggplot(
  filter(main_species, species_name == "Pipefish"),
  aes(length, fill = species_name, color = species_name)
) +
  geom_histogram(binwidth = 10) +
  facet_wrap(~site)
# Pipefish length against sampling date, points coloured by site, with an
# x-axis tick every 3 months and the tick labels rotated to vertical.
ggplot(
  filter(main_species, species_name == "Pipefish"),
  aes(x = sample_date, y = length, color = site)
) +
  geom_point() +
  scale_x_date(date_breaks = "3 months", date_labels = "%b %Y") +
  theme(axis.text.x = element_text(angle = 90, hjust = 0, vjust = 0))

# Pipefish length against sampling date, one panel per site, with yearly
# x-axis ticks and the tick labels rotated to vertical.
ggplot(
  filter(main_species, species_name == "Pipefish"),
  aes(x = sample_date, y = length, color = site)
) +
  geom_point() +
  scale_x_date(date_breaks = "1 year", date_labels = "%Y") +
  theme(axis.text.x = element_text(angle = 90, hjust = 0, vjust = 0)) +
  facet_wrap(~site)


karenezhao/alohakez documentation built on June 27, 2021, 10:22 p.m.