############################################################################
##
## TCRENV 2016
##
###########################################################################
# ------------------------------------------------------------------------
# 01 Coding R - Bjarki
# 29/2/2016
# ------------------------------------------------------------------------
# make a vector
(x1<-seq(1,10))
# the same
(x1<-1:10)
(x2<-seq(0,1,length=11))
(x3<-seq(0,1,by=0.1))
rep(1,10)
rep(c(1,2,3),2)
rep(c(1,2,3),each=2)
# length of a vector
length(x1)
(x3<-c(6,3,8,9,4))
# sort the elements of the vector
sort(x3)
# find the rank of each element
rank(x3)
# gives an index for reordering
order(x3)
# join two vectors
c(x1,x2)
cbind(x1,x2)
rbind(x1,x2)
x <- rnorm(100)
mean(x)
median(x)
quantile(x)
# vector indices ------------------------------
x4<-c(4,7,5,3,4,8,9)
# first element of x4
x4[1]
x3[5]
# first and fourth
x4[c(1,4)]
# use logicals
x4[x4>=7]
x4[x4 != 3]
x4[x4>7 & x4 != 9]
# indices that satify a condition
which(x4>=7)
which(x4!=3) # != not equal to
which(x4==3) # == equal to
# get everything except the first element
x4[-1]
# get everything except the first element and second element
x4[-c(1,2)]
# dealing with NA
x5 <- c(1,54,7,NA)
is.na(x5)
x5[!is.na(x5)]
# Data frames ----------------------------
# Read the minke whale data
minke <- read.csv("http://www.hafro.is/~bthe/minke.csv")
minke["length"]
head(minke)
head(minke[c("date.caught","length")])
head(minke[,c(2,7)])
install.packages("stringr")
# ------------------------------------------------------------------------
# 02 GGPLOT - Einar
# 29/2/2016
# ------------------------------------------------------------------------
# install.packages(c("geo","dplyr","tidyr", "mapproj"))
library(geo)
library(ggplot2)
library(dplyr) # not used here?
library(tidyr) # not used here?
library(maps)
library(mapdata)
library(mapproj)
library(ggmap) # to get e.g. google or OSM map
# Read in data Minke whale data
d <- read.csv("http://www.hafro.is/~einarhj/data/minke.csv",
stringsAsFactors = FALSE)
dim(d)
str(d) #give me the structure
# Read in cod assessment data
d2 <- read.csv("http://data.hafro.is/assmt/2015/cod/summary.csv")
str(d2)
# Simple GGplot
ggplot(d, aes(x = age, y = length)) +
geom_point()
# different syntax, same results
ggplot(d, aes(x = age, y = length)) + geom_point()
ggplot(d, aes(age, length)) + geom_point()
ggplot() + geom_point(data = d, aes(age, length))
ggplot(data = d) + geom_point(aes(x = age, y = length))
ggplot(d) + geom_point(aes(age, length))
d %>% ggplot() + geom_point(aes(age, length))
# generating a ggplot object
p <- ggplot(d, aes(age, length)) + geom_point()
class(p)
str(p)
# plotting with separate colouring
p <- ggplot(d)
p + geom_point(aes(age, length, colour = sex))
p + geom_point(aes(age, length, colour = area))
# Manual control of colours:
p <- ggplot(d)
p + geom_point(aes(age, length, colour = sex)) +
scale_colour_manual(values = c("orange","brown"))
p + geom_point(aes(age, length, colour = area)) +
scale_colour_manual(values = c("orange","brown"))
# Exercise
p + geom_point(aes(age, length, colour = maturity))
p + geom_point(aes(age, length, shape = maturity))
# Different formatting
p + geom_point(aes(age, length, size = stomach.volume))
p + geom_point(aes(age, length, colour = sex, size = stomach.volume))
# Reveal overlays in the data
# Alpha = 1 is no transparency, Alpha = 0 is fully transparent.
p + geom_point(aes(age, length, size = stomach.volume), alpha = 0.6)
p + geom_point(aes(age, length, colour= sex, size = stomach.volume), alpha = 0.6)
p + geom_point(aes(age, length, size = stomach.volume), alpha = 0.3, col = "red")
# Facetting
ggplot(d) +
geom_point(aes(age, length, colour = sex)) +
facet_wrap(~ area)
# facet grid approach
ggplot(d) +
geom_point(aes(age, length, colour = sex)) +
facet_grid(area ~ maturity)
# need to remove the NAs before the ggplot is called.
x <- d[!is.na(d$maturity),] #give me all the rows where this vector is true
ggplot(x) +
geom_point(aes(age, length, colour = sex)) +
facet_grid(area ~ maturity)
# Add multiple layers; e.g. smoother layers
p <- ggplot(d, aes(age, length))
p + geom_point() + geom_smooth()
p + geom_point() + geom_smooth(span = 0.1)
# linear smoothing
p + geom_point() + geom_smooth(method = "lm")
# loess smoothing with parameters
loess(age~length, d)
scatter.smooth (d$age,d$length)
# Boxplots and violin plots
p <- ggplot(d, aes(sex, length))
p + geom_boxplot()
p + geom_violin()
# Jitter to randomply displace points a little bit
p + geom_boxplot() + geom_jitter()
p + geom_violin() + geom_boxplot() + geom_jitter()
p + geom_violin() + geom_boxplot() + geom_jitter(width=0.25, height=0.25)
# Layer: frequency polygons
p <- ggplot(d, aes(length))
p + geom_freqpoly()
p + geom_freqpoly(aes(colour = sex))
# Layer: histograms
p + geom_histogram()
p + geom_histogram(aes(fill = sex), bins=15)
p + geom_histogram(aes(fill = sex), bins=30)
p + geom_histogram(aes(fill = sex)) + facet_wrap(~ sex, ncol = 1)
# Layer: barchart
ggplot(d, aes(maturity)) + geom_bar()
ggplot(d, aes(maturity)) + geom_bar() + facet_wrap(~area, ncol=1)
# Layer: line (on icelandic cod!)
ggplot(d2, aes(Year, Rec)) + geom_line()
ggplot(d2, aes(Year, Rec)) + geom_line() + expand_limits(y = 0)
ggplot(d2, aes(Year, Rec)) + geom_line() + expand_limits(y = 0) + geom_smooth(span=0.5)
# Exercise
# line graph of biomass
ggplot(d2, aes(Year, B4plus)) + geom_line() + expand_limits(y = 0) +
scale_y_continuous(breaks=seq(0,2500, by=500)) +
geom_line(aes(y=SSB), col="red")
# NOTE: it is not possible to have data on a second axis !!!
# bargraph of landings
ggplot(d2, aes(Year, Landings)) + geom_bar(stat="identity")
# Layer: path
ggplot(d2, aes(SSB, Fbar)) + geom_path()
ggplot(d2, aes(SSB, Fbar)) + geom_path() +
geom_point(aes(colour = Year), size = 3)
# LIMITS
p <- ggplot(d, aes(maturity, length))
p + geom_jitter()
p + geom_jitter() + ylim(600, 800)
p + geom_jitter() + ylim(NA, 800) # setting only one limit
p + geom_jitter() + ylim(600,800) + xlim("immature","mature")
# careful when using boxplots on limited graphs; boxplots changes. It takes away
# all the data outside the range.
# Solution
p + geom_boxplot() + coord_cartesian(ylim = c(600, 800))
# Mini GIS
?island
str(island)
p <- ggplot(island, aes(lon, lat)) + labs(x = NULL, y = NULL)
p + geom_point()
p + geom_path() #follow the path
p + geom_path() + coord_map() +
geom_point(data=d, aes(lon,lat, colour=sex, size = age), alpha=0.6)
# Map data of the world
m <- map_data("world")
sort(unique(m$region))
ggplot(m) +
geom_polygon(aes(long, lat, group = group)) +
coord_quickmap()
ggplot(m) +
geom_path(aes(long, lat, group = group)) +
coord_quickmap()
#higher resolution
m <- map_data("worldHires")
m1 <- filter(m, long > 5 & long < 15 & lat > 54, lat < 58)
ggplot(m1) +
geom_polygon(aes(long, lat, group = group)) +
coord_quickmap()
ggplot(m1) +
geom_path(aes(long, lat, group = group)) +
coord_quickmap()
# Get a fixed position map e.g. OSM, google
m2 <- ggmap::get_map(location=c(-20,65), zoom=6, source="osm")
ggmap(m2)
ggmap(m2) +
geom_point(data=d, aes(lon,lat, colour=sex, size = age), alpha=0.6)
# ------------------------------------------------------------------------
# 05 DPLYR and TIDYR
# ------------------------------------------------------------------------
library(tidyr)
library(dplyr)
library(pander)
minke <- read.csv("http://www.hafro.is/~einarhj/data/minke.csv",
stringsAsFactors = FALSE)
## FILTERING
minke.2003males <- dplyr::filter(minke,year==2003,sex !="Female")
View(minke.2003males)
## create a dataset with only females
minke.females <-
filter(minke,sex=='Female')
## or non-pregnant
minke.nonpregnant <-
filter(minke,sex=='Female', maturity != 'pregnant')
## SELECTING (columns)
## select id, age, length and sex
minke.redux <-
dplyr::select(data=minke,id,age,length,sex)
## change variable names to capitals as well.
minke.AgeLength <- select (minke, Age=age, Length=length)
## select all column except stomach volume and weight
minke.noStom <-
select(minke,-contains('stomach'))
## select id and all column between length and sex
minke.min <-
select(minke,c(id,length:sex))
minke.neg <-
select(minke,-date)
## ARRANGING
## arrange by length (ascending)
minke.lasc <-
dplyr::arrange(minke,length)
## arrange by length (descending)
minke.ldesc <-
arrange(minke,desc(length))
## arrange by age and then length (ascending)
minke.alasc <-
arrange(minke,age,length)
## combining arrange with select
head(arrange(select(minke, id, age, length),age,length))
## MUTATING: e.g. create a number of columns
## add new column in the traditional r style
minke$approx.weight <- 3.85*1e-06*minke$length^3.163
## add new column using mutate
minke <-
dplyr::mutate(minke,
approx.weight = 3.85*1e-06*length^3.163)
## not really useful until you add more than one
minke <-
mutate(minke,
approx.weight = 3.85*1e-06*length^3.163,
#
adj.weight = ifelse(is.na(weight),approx.weight, weight))
library(lubridate)
minke.mutate <- mutate(minke,
date2 = ymd_hms(date),
month = month(date2),
year = year(data2))
## SUMMARISE
## create summaries
dplyr::summarise(minke,num.obs = n())
## you can have as many as you like
summarise(filter(minke, sex == 'Female', year == 2003),
num.obs = n(),
num.fem = sum(sex == 'Female'),
num.large = sum(length > 650))
summarise(filter(minke, sex == 'Female', year == 2003),
num.obs = n(),
num.fem = sum(stomach.weight, na.rm=TRUE),
num.large = sum(length > 650))
## But not particularly interesting until one can group the data
minke.2 <- group_by(minke, area, sex)
summarise(minke.2,
num.obs = n(),
num.fem = sum(sex == 'Female'),
num.large = sum(length > 650))
## have to submit but to github Hadley!!
## this is the same as splitting things up
minke.N <- filter(minke,area=='North')
summarise(minke.N,
num.obs = n(),
num.fem = sum(sex == 'Female'),
num.large = sum(length > 650))
minke.S <- filter(minke,area=='South')
summarise(minke.S,
num.obs = n(),
num.fem = sum(sex == 'Female'),
num.large = sum(length > 650))
## TIDYR
## RESHAPING tools
## GATHER: gather columns into rows
## read in data in a wide format and convert to long format
catatage <-
read.csv('http://data.hafro.is/assmt/2015/cod/catage.csv')
catlong <-
# takes df, takes name of the header values, takes values,
gather(catatage, #df
key=age, #names of new variables
value=number, # values
X3:X14) #apply to what columns
catlong <-
mutate(catlong,
age = as.numeric(gsub('X', '', age)))
## SPREAD: from long to wide, Spread rows into columns
## Traditional way of getting from long to wide
minke.y.a <- group_by(minke, year, area)
num.minke.y.a <- summarise(minke.y.a, num = n())
spread(num.minke.y.a,year,num)
catwide <- spread(catlong, key=age,value=number)
## !!!! This is very useful for generating report tables
catwide2 <- spread(catlong, key=Year, value=number)
pandoc.table(catwide2, split.tables=80)
## SEPARATE into different columns
minke.by.day <-
separate(minke,
date,
c('yr', 'm', 'd'),
sep = '-') #split date based on hyphens
minke.by.day <-
separate(minke.by.day,
d,
c('d2', 'time'),
sep = ' ') #split date based on space
minke.by.day <-
separate(minke.by.day,
time,
c('hh', 'mm', 'ss'),
sep = ':') #split date based on semicolon
# In one go
separate(select(minke,date),
date,
c('yr', 'mo', 'da', "hh", "mm", "ss"))
## unite: Unite several columns into one
## combine
minke.m.s <-
unite(minke,mat.sex,c(sex,maturity),sep='-')
## CHAINING operations -----------------------------
## chaining operations together can be quite complex
summarise(
group_by(
filter(minke,!is.na(weight)),
sex),
num.whale=n(),
m.weight = mean(weight)
)
## using the %>% operator
minke %>%
filter(!is.na(weight)) %>%
group_by(sex) %>%
summarise(num.whale = n(),
m.weight = mean(weight))
## lets do more
minke %>%
filter(area == 'North') %>%
group_by(maturity) %>%
summarise(n=n(),
num.na = sum(is.na(age)),
m.age = mean(age,na.rm=TRUE),
sd.age = sd(age,na.rm=TRUE)) %>%
mutate(prop = n/sum(n)) %>%
arrange(prop)
## play with ungroup
minke %>%
group_by(maturity) %>%
mutate(lnorm.in = length/mean(length)) %>%
ungroup() %>%
mutate(lnorm.out = length/mean(length)) %>%
select(whale.id,maturity,lnorm.in,lnorm.out)
## class excercise
## num caught by year
minke %>%
group_by(year) %>%
summarise(num.caught = n())
## num which are females
minke %>%
group_by(year) %>%
summarise(num.caught = n(),
num.fem = sum(sex=='Female'),
prop.fem = mean(sex=='Female')) %>%
mutate(prop2 = num.fem/num.caught)
## mean age and sd by maturity
minke %>%
group_by(maturity) %>%
summarise(ml = mean(length),
sdl = sd(length),
ma = mean(age,na.rm=TRUE),
sda = sd(age,na.rm=TRUE))
##
minke %>%
group_by(year,area) %>%
summarise(num = n()) %>%
spread(year,num)
# number of whales caught each year
m1 <- group_by(minke, year)
m1a <- summarise(m1,Ntot=n())
# proportion caught of which are females each year
m2 <- filter(m1, sex =="Female")
m2a <- cbind(summarise(m2,Nfem=n()),m1a$Ntot)
m2b <- mutate(m2a, FemProp = Nfem/m1a$Ntot)
# Quicker way
summarise(m1,propfem = mean(sex == "Female"))
# Calculate the mean length and age along with standard deviation
# grouped by maturity
m3 <- group_by(minke, maturity)
summarise(
m3,
ml = mean(length),
sl = sd(length),
ma = mean(age, na.rm = TRUE),
sa = sd(age, na.rm = TRUE)
)
# Using %>% and spread, calculate number of whales caught by area (rows) and year (columns)
minke %>%
group_by(area,year) %>%
summarise(Ntot=n()) %>%
spread(key=year,Ntot)
# ------------------------------------------------------------------------
# 06 Applied data and graph grammar
# Icelandic survey raising example
# ------------------------------------------------------------------------
library(dplyr)
library(tidyr)
library(ggplot2)
library(geo)
library(maps)
library(mapdata)
library(lubridate)
Station <-
# read.csv("http://www.hafro.is/~einarhj/data/tcrenv2016/Station.csv") %>%
read.csv("Station.csv") %>%
mutate(year = year(date1)) %>%
select(id, year, towlength, lon = lon1, lat = lat1) %>%
tbl_df() # print in simple format
Subsampling <-
# read.csv("http://www.hafro.is/~einarhj/data/tcrenv2016/Subsampling.csv") %>%
read.csv("Subsampling.csv") %>%
tbl_df()
Length <-
# read.csv("http://www.hafro.is/~einarhj/data/tcrenv2016/Length.csv") %>%
read.csv("Length.csv") %>%
select(id, species, length, n) %>%
tbl_df()
# 1. Pick a year
Species <- 1 # pick a species
Year <- 2015 # pick a year
# 2. Subset (filter) the station table (only the year I want)
st <-
Station %>%
filter(year %in% Year) %>%
select(id, towlength, lon, lat)
Station %>% group_by(year) %>% summarise(count=n())
# plot the stations
ggplot(st, aes(lon,lat)) + geom_point()
# plot towlength histogram
ggplot(st, aes(towlength)) + geom_histogram()
# 3. Subset (filter) the station table
# The year is not stored in the Length dataframe.
# So limit the data by using the station id in the st dataframe
le <-
Length %>%
filter(id %in% st$id, # ID in length table same as station table
species == Species) %>% # only required species
select(id, length, n) # keep only three variables
# number of length measurement
sum(le$n)
# 4. Get the subsampling information
ss <-
Subsampling %>%
filter(id %in% st$id,
species == Species) %>%
select(id, n.total, n.measured)
Subsampling %>% group_by(species) %>% summarize(count=sum(n.total))
# and calculate the raising factor
ss <-
ss %>%
mutate(r = n.total/n.measured,
r = ifelse(is.na(r), 1, r)) %>% # just in case
select(id, r) # only columns needed
# plot raising factor histogram
ggplot(ss, aes(r)) + geom_histogram()
# how many samples with r > 1
summarise(ss,sum(r>1))
sum(ss$r>1)
# 5. Raise the numbers measured
# Here we join the subsampling table (ss) with the length table (le)
# If you don't specify the by statement
le <-
le %>%
left_join(ss) %>% # Left join: add it to the left of the x
mutate(n = n * r) # Apply raising factor
# Filter the data such that you βseeβ the first few records were length was raised
# β¦ and calculate the weight
le <-
le %>%
mutate(wt = (n * 0.01 * length^3)/1000) # kilos already multiplied by number of fish
# 6. Calculate the catch per tow
# Each id represents a tow, hence we can group by it and then summarise
d <-
le %>%
group_by(id) %>% # group by ID
summarise(n = sum(n), # summarize by number of fish and weight
wt = sum(wt))
nrow(d)
nrow(st) # there are stations where no cod was caught
# 7. Join the calculation with the station table
# We do this because the station table is the only place were the tow postion
# is recorded
d <-
st %>%
left_join(d) %>%
mutate(n = ifelse(is.na(n), 0, n)/1e3, # replace NAs by 0
wt = ifelse(is.na(wt), 0, wt)/1e3)
# raise for towlength
d <-
d %>%
mutate(n = n * 4 / towlength,
wt = wt * 4 / towlength)
# 8. Plot the results
ggplot(d, aes(lon, lat)) +
geom_point(aes(size = wt), alpha = 0.3, colour = "red") +
scale_size_area(max_size = 20, breaks = c(0, .1, .5, 1, 3)) +
labs(x = NULL, y = NULL, size = "tonnes/tow") +
coord_quickmap()
iceland <- map_data("worldHires", region = "iceland")
ggplot(d, aes(lon, lat)) +
theme_bw() +
geom_polygon(data=iceland, aes(long, lat, group=group), fill = "grey90") +
geom_point(aes(size = wt), alpha = 0.3, colour = "red") +
scale_size_area(max_size = 20, breaks = c(0, .1, .5, 1, 3)) +
labs(x = NULL, y = NULL, size = "tonnes/tow") +
coord_quickmap()
# CONDENSED VERSION
Species <- 1
Year <- 2015
st <-
Station %>%
filter(year %in% Year) %>%
select(id, towlength, lon, lat)
d <-
Length %>%
filter(id %in% st$id,
species %in% Species) %>%
select(id, length, n) %>%
left_join(
Subsampling %>% # Note: here we do the raising
filter(species %in% Species) %>% # "inside" the left_join function
mutate(r = n.total / n.measured,
r = ifelse(is.na(r), 1, r)) %>%
select(id, r)
) %>%
mutate(n = n * r,
wt = (n * 0.01 * length ^ 3) / 1000) %>% # kilos
group_by(id) %>% # summarise per tow
summarise(n = sum(n),
wt = sum(wt)) %>%
right_join(st) %>% # join with station table
mutate(n = ifelse(is.na(n), 0, n) / 1e3, # note the right_join
wt = ifelse(is.na(wt), 0, wt) / 1e3) %>%
mutate(n = n * 4 / towlength,
wt = wt * 4 / towlength)
# CONVERTING THE CODE INTO A FUNCTION
catch_per_tow <- function(Year, Species) {
st <-
Station %>%
filter(year %in% Year) %>%
select(id, towlength, lon, lat)
d <-
Length %>%
filter(id %in% st$id, # this ensures we only work with SMB length
species %in% Species) %>%
select(id, length, n) %>%
left_join(
Subsampling %>%
filter(species %in% Species) %>%
mutate(r = n.total / n.measured,
r = ifelse(is.na(r), 1, r)) %>%
select(id, r),
by = c("id")
) %>%
mutate(n = n * r,
wt = (n * 0.01 * length ^ 3) / 1000) %>%
group_by(id) %>%
summarize(n = sum(n),
wt = sum(wt)) %>%
right_join(st, by = c("id")) %>%
mutate(n = ifelse(is.na(n), 0, n) / 1e3,
wt = ifelse(is.na(wt), 0, wt) / 1e3)
return(d)
}
source("R/catch_per_tow.r")
catch_per_tow(Year = 2015, Species = 1, TowCorrect = TRUE) %>%
ggplot(aes(lon, lat)) +
theme_bw() +
geom_polygon(data = iceland, aes(long, lat, group = group), fill = "grey90") +
geom_point(aes(size = wt), alpha = 0.3, colour = "red") +
scale_size_area(max_size = 20, breaks = c(0, .1, .25, 0.5, 1)) +
labs(x = NULL, y = NULL, size = "tonnes/tow") +
coord_quickmap()
# alternative presentation
d <- catch_per_tow(Year = 2015, Species = 2, TRUE)
d2 <-
d %>%
mutate(statrec = geo::d2ir(lat, lon)) %>% # convert to rectangles
group_by(statrec) %>%
summarise(count = n(),
n = mean(n),
wt = mean(wt)) %>%
mutate(lon = geo::ir2d(statrec)$lon, # set positions to centre rect.
lat = geo::ir2d(statrec)$lat)
ggplot(d2) +
# set theme
theme_bw() +
# add rectangle density (by weight)
geom_tile(aes(lon, lat, fill = wt)) +
# change the colouring of the tiles
scale_fill_continuous(low = "yellow", high = "red") +
# add iceland
geom_polygon(data = iceland, aes(long, lat, group = group), fill = "grey90") +
# add the number of observations
geom_text(aes(lon, lat, label = round(count, 0)), size = 2) +
# add the station locations
geom_point(data = d, aes(lon, lat), col = "blue", size = 0.05, alpha = 0.5) +
# select coordinates and mapping
coord_quickmap(range(d$lon), range(d$lat)) +
# do labelling
labs(x = NULL, y = NULL, fill = "t/tow")
# B. Many years, one species
# Exercises
# Modify the function such that it can handle multiple years.
# Create a bubble plot of some selected years (e.g. every tenth year) for some species
# 1. Simple mean and median
# calculate indices by year; create long df
d <-
catch_per_tow(Year = 1985:2015, Species = 1) %>%
group_by(year) %>%
summarise(Median = median(wt),
Mean = mean(wt)) %>%
gather(key = Statistic, value = value, -year)
# plot the species
d %>%
ggplot(aes(year, value, colour = Statistic)) +
geom_point() +
geom_line()
# now for all species
# calculate indices by year; create long df
d <-
catch_per_tow(Year = 1985:2015, Species = 1:60) %>%
group_by(species, year) %>%
summarise(Median = median(wt),
Mean = mean(wt)) %>%
gather(key = Statistic, value = value, Median:Mean)
# plot the species
d %>%
ggplot(aes(year, value, colour = Statistic)) +
theme_bw() +
geom_point() +
geom_line() +
facet_wrap(~species, scale="free_y")
# ------------------------------------------------------------------------
# Getting data into R - BJarki
# ------------------------------------------------------------------------
## read in the minke data properly
minke <-
read.table(
file = "http://www.hafro.is/~einarhj/data/tcrenv2016/Station.csv",
header = TRUE,
dec = '.', ## decimal sign
sep = ',', ## column separator symbol
skip = 0, ## num lines at top to skip
stringsAsFactors = FALSE,
comment.char = '#' ## indicating comments
)
## more low level:
minke.lines <- readLines("http://www.hafro.is/~einarhj/data/tcrenv2016/Station.csv", n=4)
## read the header in using scan
## this is useful for the PFA length data !!!!!!!
header <- scan(text = minke.lines[4],
what = 'character',sep=',')
minke <- read.table(text = minke.lines[-c(1:4)],
sep = ',',dec='.')
## use the header read with scan to name the columns
names(minke) <- header
## simple sanity checks
head(minke)
tail(minke)
dim(minke)
names(minke)
summary(minke)
str(minke)
glimpse(minke)
## one can write the data to file
write.table(minke,
file = 'minke-class.csv', ## file name
col.names = TRUE, ## write header
row.names = FALSE, ## write row names
quote = FALSE, ## characters qouted?
sep = ',',
dec = '.')
## or simply
write.csv(minke,file='minke.csv')
## location of files
minke <-
read.csv('http://www.hafro.is/~bthe/minke.csv')
catatage <-
read.csv('http://data.hafro.is/assmt/2015/cod/catage.csv')
## excel files
install.packages("readxl")
library(readxl)
library(openxlsx)
?read_excel
minke.xls <- read_excel("minke.xlsx")
pfa <- read_excel("C:/DATA/DATA/Selfsampling/PFA 20160217 selfsampling data.xlsm",
sheet = 2)
excel_sheets(pfa)
haul <- read.xlsx("C:/DATA/DATA/Selfsampling/PFA 20160217 selfsampling data.xlsm",
sheet="Trek", detectDates = TRUE, skipEmptyRows = TRUE)
## write excel files
library(openxlsx)
write.xlsx(minke,file='minke.xlsx')
fish <- read.csv2('lec1/fish.csv')
write.xlsx(list("minke"=minke,'fish'=fish),file='minkeandfish.xlsx')
## list sheets
excel_sheets('minkeandfish.xlsx')
## read in only the fish data
fish.xls <- read_excel('minkeandfish.xlsx',sheet = 'fish')
## database connectivity
library(dplyr)
## setup a connection to a database
## by creating it
db <- src_sqlite('minke.db',create = TRUE)
## dump data into it
tmp <- copy_to(db,minke,'minke_tbl')
## query the database
minke.tbl <- tbl(db,'minke_tbl')
## run sql on the database
num.minke <- tbl(db,sql('select count(1) from minke_tbl'))
## describe the table (analogous to str in R)
tbl_vars(minke.tbl)
glimpse(minke.tbl)
## list all tables in a data base
db_list_tables(db$con)
## for more db stuff refer to
?db_list_tables
## DATRAS
# install.packages("httr")
# install.packages("xml2")
library("httr")
library("xml2")
source('R/datras.R')
## get station data
st <- get_datras(record = "HH", survey = 'NS-IBTS',
year = 2000, quarter = 1)
## get length data
le <- get_datras(record = "HL", survey = 'NS-IBTS',
year = 2000, quarter = 1)
## get age data
age <- get_datras(record = "CA", survey = 'NS-IBTS',
year = 2000, quarter = 1)
# ------------------------------------------------------------------------
# Friday 4 March 2016
# Fitting statistical models
# ------------------------------------------------------------------------
# Statistics
# https://cran.r-project.org/doc/contrib/Faraway-PRA.pdf
# https://www.openintro.org/stat/?stat_book=os
library(dplyr)
library(tidyr)
library(ggplot2)
library(broom) # Package broom for dealing with model fits.
cod.ca <-
read.csv("~/TCRENC 2016/Course/example-ca.csv") %>%
tbl_df()
ggplot(cod.ca,aes(length,ungutted)) + geom_point()
ggplot(cod.ca,aes(log(length),log(ungutted))) + geom_point()
# linear regression
# Often one sees the length weight relationship charaterised as:
# W = a Lb
# Now this is a non-linear relationship (if b ΜΈ= 1) but is easy to linearise
# using log:
# log(W) = log(a) + b log(L)
wlFun <- function(dat, a, b) {
p <- ggplot(dat, aes(log(length), log(ungutted))) +
geom_point() +
geom_abline(intercept = log(a), slope = b) +
ylim(c(0, 10)) + xlim(c(0, 10))
ss <- sum((log(dat$ungutted) - log(a) -
b * log(dat$length)) ^ 2)
return(list(p = p, ss = ss))
}
wlFun(cod.ca,0.01,3)
# lm (y~x, data)
fit <- lm(log(ungutted)~log(length), cod.ca)
summary(fit)
coefficients(fit)[1]
wlFun(cod.ca,exp(coefficients(fit)[1]),coefficients(fit)[2])
# Interaction
fit2 <- lm(log(ungutted)~log(length)*sex, cod.ca)
summary(fit2)
plot(fit2)
AIC(fit2) # just the AIC value of the model
step(fit2) # estimate which parameters contribute to the AIC
# Interaction and extra variable
fit3 <- lm(log(ungutted)~log(length)*sex + liver, cod.ca)
summary(fit3)
# plot(fit3)
step(fit3, na.rm=TRUE)
# Estimate a maturity ogive using a logistic model; proportions at length and age
cod.ca <-
mutate(cod.ca,
mat1 = ifelse(maturity==1,0,1)) %>%
filter(!is.na(mat1))
fit.mat <- glm(mat1~length, family="binomial", data=cod.ca)
summary(fit.mat)
plot(fit.mat)
broom::tidy(fit.mat)
output <- broom::augment(fit.mat, type.predict = "response")
ggplot(output, aes(length, .fitted)) +
geom_line()
L50 <- output %>%
filter(.fitted <= 0.50) %>%
summarise(L50 = max(length))
ggplot(output, aes(length, .fitted)) +
geom_line() +
geom_jitter(data=cod.ca, aes(length,mat1), alpha=0.1, height=0.1) +
geom_vline(xintercept=64)
# Fit by age
fit.mat2 <- glm(mat1~age, family="binomial", data=cod.ca)
output2 <- broom::augment(fit.mat2, type.predict = "response")
ggplot(output2, aes(age, .fitted)) +
geom_line() +
geom_jitter(data=cod.ca, aes(age,mat1), alpha=0.1, height=0.11, width=0.5) +
geom_vline(xintercept=5)
output2 %>%
filter(.fitted <= 0.50) %>%
summarise(A50 = max(age)) # does not work very wel because of integer values
# Non-linear models (e.g. VBGF) - nls()
minke <- read.csv("http://www.hafro.is/~bthe/minke.csv")
age.data <- filter(minke, !is.na(age))
minke.vonB.par <-
nls(length ~ Linf * (1 - exp(-K * (age - t0))),
data = age.data,
start = list(Linf = 1100, K = 0.1, t0 = -1))
minke.vonB.par
# fisheries package in R (too many things at the same time) ??
# variables that are not in the data will be estimated
vonB.par <-
nls(length ~ Linf * (1 - exp(-K * (age - t0))),
data = cod.ca,
start = list(Linf = 100, K = 0.1, t0 = -1))
summary(vonB.par)
# confidence intervals
confint(vonB.par)
head(tmp)
confint(vonB.par)
tidy(vonB.par)
tmp <- augment(vonB.par)
ggplot(tmp, aes(age, .fitted)) +
geom_line(colour="blue") +
geom_point(data=cod.ca, aes(age,length), alpha=0.1, colour="red") +
geom_hline(yintercept=149, colour="blue")
# plot residuals vs. age
ggplot(tmp, aes(age, .resid)) +
geom_point()
install.packages("devtools")
library(devtools)
install_github("einarhjorleifsson/wices") # install wices through github
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.