Practical 01 - Exploring the Chrons

The aim of this exercise is to download ICES exchange format data from www.ices.dk and to extract and visualise the chron (haul) records.

  1. Install R (http://cran.r-project.org/), the packages below and perhaps Tinn-R (http://sourceforge.net/projects/tinn-r/).

Install the packages. Remember you only need to do this once.

install.packages('maps')
install.packages('mapdata')   # provides the 'worldHires' map used below
install.packages('maptools')
install.packages('rgdal')
install.packages('sp')
install.packages('spatial')
install.packages('PBSmapping')

Attach them to the search path:

library(sp)
library(maptools)
library(rgdal)
library(maps)
library(mapdata)   # for the 'worldHires' database
library(PBSmapping)
library(knitr)
  2. Select some data from the ICES website - http://datras.ices.dk/Data_products/Download/Download_Data_public.aspx

  3. Save the data to a directory of your choice, e.g. 'c:/datras/data', and extract them from the zip files.

  4. Install the R library datras. Do this by downloading datras*zip from the sharepoint. Open R, go to Packages, and install the package from the local zip file.

  5. Attach the datras package (written by me) to the R search path

library(datrasR)
  6. Extract the chron records in the relevant directory. In the first instance choose a survey area in which you are interested. The suggestions below are simply options. Remember that you will need to adjust the working directory (wd) path so that R can find the data. The ICES website only allows relatively small subsets of data to be downloaded at once, so for the time being stick to fairly small subsets.

Addendum (einar) - do only once:

download.file("http://www.hafro.is/~einarhj/data/Exchange%20Data_2011-10-05%2016_30_08.csv",
              destfile = "Exchange Data_x.csv")
chrons <- parseExchangeFormatChrons(wd = "/home/einarhj/r/Pakkar/datrasr")
  7. Here we only examine valid hauls, drop duplicates, and keep hauls where we have both shoot and haul positions.
chrons <- chrons[chrons$haulval == 'V', ]        # only valid hauls
chrons <- chrons[!duplicated(chrons), ]          # chuck any duplicates
chrons <- chrons[!is.na(chrons$haullong), ]      # don't want data without haul longs
chrons <- chrons[chrons$country != 'NOR', ]      # throw out Norway because shoot and haul positions are the same.
  8. Print and examine the first few lines of the chron file
head(chrons)
  9. Explore the data with the R function table
table(chrons$haulval)                                             # number of valid hauls (check)
table(chrons$hauldur)                                             # number of hauls at different durations
table(chrons$gear)                                                # number of gears
table(chrons$quarter,chrons$year)                                 # number of hauls by year and quarter
table(chrons$country,chrons$ship)                                 # number of hauls by country and ship
head(table(chrons$depth, chrons$daynight))                        # number of hauls by day/night and depth                                   
table(chrons$ship,cut(chrons$depth,breaks=c(seq(0,250,by=25))))   # number of hauls by ship and depth bin.
head(table(chrons$statrec,as.character(chrons$ship)))             # number of hauls by statistical rectangle and ship
table(chrons$year)                                                # number of hauls by year
  10. Some simple summary statistics
summary(chrons$depth)
summary(chrons$hauldur)
summary(chrons$datim.shot)
  11. Average temperature (if available) in each statistical rectangle (not run).
tapply(chrons$surtemp,list(chrons$quarter, chrons$statrec), mean,na.rm=T)
  12. Average depth (if available) in each statistical rectangle (not run).
tapply(chrons$depth, list(chrons$quarter, chrons$statrec), mean,na.rm=T)
  13. Average number, duration and distance of hauls by ICES area
chrons$ices_area <- ICESarea(chrons, roman = FALSE)                               # add on ICES area using the datras function ICESarea
tapply(chrons$hauldur, chrons$ices_area, length)                                  # number of hauls in each ICES area
round(tapply(chrons$hauldur, chrons$ices_area, sum,na.rm=T)/60)                   # total haul duration in each ICES area
round(tapply(chrons$distance, chrons$ices_area, sum,na.rm=T)/1000)                # total distance hauled in each ICES area
  14. You might want to write the data out into a csv file for reading with Excel or some other software (not run):
write.table(chrons,file='chrons.csv',sep=',',row.names=F)
  15. Explore the data with graphs

Simple map of shoot and haul locations

par(mfrow=c(1,1))
plot(chrons$shootlong,chrons$shootlat,pch=".",col='blue',ylab='',xlab='',cex=2)
# add points
points(chrons$haullong,chrons$haullat,pch=".",col='red',cex=2)
# add lines for stat rectangles
abline(h=0:80,v=seq(-30,30,by=.5),lty=2,col='grey',lwd=.5)
# add map
library(maps)
map('worldHires',add=T,col='darkseagreen',fill=T)   


# Add depth contours (different files for different areas since the data sets are so big).

# EINAR - DELIBERATELY EXCLUDED
#data(nseaBathy)   # north sea bathymetry
#data(nwestBathy)  # north west atlantic shelf bathymetry
#data(balticBathy) # baltic sea bathymetry

#contour(nseaBathy$x, nseaBathy$y, nseaBathy$z, levels=c(20,40,60,100), add=T)    # add on depth contours for the north sea
#contour(nwestBathy$x, nwestBathy$y, nwestBathy$z, levels=c(100,200,500), add=T)  # add on depth contours for the north west shelf
  16. Explore the data in various ways, looking for potential confounding
what.quarter <- 1 # set the quarter you are examining

chrons1 <- chrons[chrons$quarter == what.quarter,]
par(mfrow=c(1,2),mar=c(4,4,2,2))
plot(chrons1$shootlong,chrons1$shootlat,pch=16,col='blue',ylab='',xlab='')  # just shoot long
# add lines for stat rectangles
abline(h=0:80,v=seq(-30,30,by=.5),lty=2,col='grey')
# add map
library(maps)
map('world',add=T,col='green',fill=T)   # low res map. Try 'worldHires' too.
tt <- tapply(chrons1$depth,format(chrons1$datim.shot,"%H"),median,na.rm=T) # calculate median depth at different times of day
plot(as.numeric(names(tt)),tt,type='l',ylab="median depth",xlab="time of day")

Plot the survey by year

chrons1 <- chrons[chrons$quarter == what.quarter,]  # extract data according to what.quarter

uy <- sort(unique(chrons$year))
#print(uy)
#print(length(uy))                # how many unique years in the chron file ??
par(mfrow=c(3,4),mar=c(2,2,2,2)) # use the result of print(length(uy)) to set up plot for number of graphs. Remember mfrow=c(3,4) means 'plot 12 graphs'


for (i in uy) {
  plot(chrons1$shootlong[chrons1$year == i],chrons1$shootlat[chrons1$year==i],pch=16,col='blue',ylab='',xlab='')
  # add points
  #points(jitter(chrons1$haullong),jitter(chrons1$haullat),pch=16,col='red')
  # add lines for stat rectangles
  abline(h = 0:80, v = seq(-30,30, by=.5), lty = 2, col = 'grey')
  # add map
  library(maps)
  map('world', add=T, col='green', fill=T)   # low res map. Try 'worldHires' too.
  title(as.character(i))
}

Comment on the distribution of the survey stations

  17. Find chrons within a polygon. Note: the 'polygon' can also come from a shapefile.

First create a data frame with the Round 3 Zone 5 Boundary Co-ordinates (a proposed area for an offshore wind farm).

dat.r3 <- data.frame(
lon.d = c(3,3,2,2,2,1,1,2,2,2),
lon.m = c(4,10,27,6,8,52,55,13,14,2),
lon.s = c(15.035,46.476,11.669,56.469,30.217,6.891,56.591,32.66,29.436,40.5),
lat.d = c(53,52,51,52,52,52,52,52,52,52),
lat.m = c(14,51,59,0,17,15,24,26,36,49),
lat.s = c(46.991,46.076,49.19,33.467,25.312,57.710,7.55,12.405,1.973,33.309))

# Convert to decimal lat and long co-ordinates 
x <- ((dat.r3$lon.m*60) + dat.r3$lon.s)/ 3600
dat.r3$lon <- dat.r3$lon.d + x
y <- ((dat.r3$lat.m*60) + dat.r3$lat.s) / 3600
dat.r3$lat <- dat.r3$lat.d + y
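
A quick sanity check of the conversion for the first vertex: 3° 4' 15.035" should come out as 3 + (4*60 + 15.035)/3600, i.e. roughly 3.0708 decimal degrees.

dat.r3$lon[1]   # should print roughly 3.070843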

# Create polyset data.frame (from PBSmapping but there are many other alternatives). 
round.3  <- data.frame(PID = rep(1,11), POS = 1:11, X = c(dat.r3$lon,dat.r3$lon[1]), Y = c(dat.r3$lat,dat.r3$lat[1]))
round.3  <- as.PolySet(round.3, projection='LL')   # decimal lon/lat data, so 'LL' rather than 'UTM'

head(round.3) # examine data (typical method for storage of spatial data). 

data(europa)  # map of europe (with a detailed Dutch coastline) included in the package datras

par(mfrow=c(1,1))

plotMap(europa, xlim = c(-1,5), ylim = c(50,54), col = 'darkgreen', xlab = '', ylab = '')
addLines(round.3,lwd=2)
addPolys(round.3,col='red')
points(chrons$shootlong,chrons$shootlat,pch=".",cex=2)

# Identify the points within the polygon

events <- data.frame(EID=1:nrow(chrons), X=chrons$shootlong, Y=chrons$shootlat) # must first create an events data frame
events <- as.EventData(events, projection="LL")
r3 <- findPolys(events, round.3, maxRows=1e+06)

str(r3) #examine the output

points(chrons$shootlong[r3$EID],chrons$shootlat[r3$EID],col='blue',pch='*')   # plot points INSIDE the polygon
points(chrons$shootlong[-r3$EID],chrons$shootlat[-r3$EID],col='black',pch='+')   # plot points OUTSIDE the polygon
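
As an aside, the sp package (already attached) offers a point-in-polygon test as well; a minimal alternative sketch using the objects created above (0 = outside, 1 = inside, 2/3 = on an edge or vertex):

inside <- point.in.polygon(chrons$shootlong, chrons$shootlat, round.3$X, round.3$Y)
table(inside)   # how many shoot positions fall outside/inside the Round 3 polygon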
  18. Create and plot points and tow tracks using the R library sp. Output the result as kml (Google Earth) and shapefiles.
spDF <- Chrons2SpatialPointsDF(data=chrons) # Convert the data to a spatial points data.frame

# Plot map in R
data(europa) # in built map
plotMap(europa,xlim=c(-15,10),ylim=c(47,62),xlab='',ylab='')  # Note: change xlim and ylim to focus in on correct area.
points(spDF,pch=".",col='red',cex=2)

The magic of Google Earth. Note you can change the driver for other formats (not run):

writeOGR(obj=spDF, dsn="chrons.kml", layer="shoot-locations", driver="KML") 
writeOGR(obj=spDF, dsn="chrons.shp", layer="shoot-locations", driver="ESRI Shapefile")  # A shapefile.

print(getwd())  # see which directory the file has gone to, find it and open it in Google Earth

Similarly, tow tracks instead of points:

uships <- unique(chrons$ship) # how many unique ships in the dataset?
print(uships)
spLDF <- ChronsTrack2SpatialLineDF(input=chrons,what.ship = c("TRI2"))   # convert to a spatial line data frame for a particular ship. Note the function takes a long time if you select too many ships/data.
par(mfrow=c(1,1))
plotMap(europa,xlim=c(-5,10),ylim=c(47,62),xlab='',ylab='',col='darkseagreen')  #  Plot to see 
lines(spLDF,col='blue',lwd=1)

(not run)

writeOGR(spLDF, "towtracks.kml", "tracks", "KML") 
print(getwd())  # see which directory the file has gone to, find it and open the file towtracks.kml in Google Earth

FURTHER EXERCISES

Practical 02 - Exploring the Length-Frequencies

The aim of this exercise is to download ICES exchange format data from www.ices.dk and to extract, analyse and visualise the length-frequency data for individual species.

  1. Extract the length-frequency data from the relevant directory
#library(sp);library(mda);library(doBy);library(PBSmapping);library(maps);library(maptools);library(rgdal);library(spatial);library(fields)  # attach to R search path
lfs <- parseExchangeFormatLFs(wd="/home/einarhj/r/Pakkar/datrasr")
  2. Add the scientific name using the TSN codes (downloaded from the ITIS website).
data(tsn)  # ITIS database of all species in the world
head(tsn)  # first few lines

lfs$scientific.name <- as.character(tsn$completename[match(lfs$speccode,tsn$tsn)])   # match scientific names using tsn 

unique(lfs$scientific.name)[1:10] # print some fish species in the data set. 
  3. Tidy up the data
lfs <- lfs[!duplicated(lfs),]                  #  remove any duplicates
lfs <- lfs[lfs$speccode != -9,]                #  remove missing species codes
lfs <- lfs[lfs$lngtclass != -9,]               #  remove missing length classes
lfs <- lfs[!is.na(lfs$lngtclass),]             #  remove missing length classes
lfs <- lfs[!is.na(lfs$lngtcode),]              #  remove missing length codes
lfs <- lfs[!is.na(lfs$scientific.name),]       #  remove any missing scientific names 
lfs$hlnoatlngt <- lfs$hlnoatlngt*lfs$subfact   #  Multiply by the subfactor
  4. Standardise length codes to cm (some are in mm) and create 1 cm bins labelled at their midpoints (x.5).
table(lfs$lngtcode)

lfs$lngtclass[lfs$lngtcode == "."] <- lfs$lngtclass[lfs$lngtcode == '.']/10
lfs$lngtclass[lfs$lngtcode == "0"] <- lfs$lngtclass[lfs$lngtcode == "0"]/10

lfs$lngtclass <- floor(lfs$lngtclass)  # round length down to the whole cm...
lfs$lngtclass <- lfs$lngtclass+0.5     # ...and label each bin by its midpoint
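
For example, a 23.7 cm fish ends up labelled 23.5, the midpoint of the 23-24 cm bin:

floor(23.7) + 0.5   # 23.5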
  5. Estimate the weight of each fish.

Note: it can be useful to quote results in terms of weight rather than numbers. This is usually done with parameters (a and b) from the non-linear length-weight relationship, weight in grammes = aL^b, where L is length.
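
For instance, with purely illustrative parameter values (not the ones in the package data) of a = 0.01 and b = 3, a 30 cm fish weighs about 270 g:

a.demo <- 0.01; b.demo <- 3   # illustrative values only
a.demo * 30^b.demo            # weight in grammes of a 30 cm fish: 270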

data(length.weight)   # attach a list of parameters courtesy of IMARES and Marine Scotland Science.

# Match a and b parameters onto the correct species

lfs$a <- length.weight$a[match(lfs$scientific.name,length.weight$scientific.name)]
lfs$b <- length.weight$b[match(lfs$scientific.name,length.weight$scientific.name)]

lfs$hlwtatlngt <- (lfs$a*lfs$lngtclass^lfs$b)*lfs$hlnoatlngt/1000    # calculate weight in kilos

print(head(lfs))   # are the weights reasonable ?
  6. Extract a species. These are just examples but you can let your curiosity roam freely!
plaice <- lfs[lfs$scientific.name == 'Pleuronectes platessa',]
#cod <- lfs[lfs$scientific.name == 'Gadus morhua',]
#dab    <- lfs[lfs$scientific.name == 'Limanda limanda',]
#lemon  <- lfs[lfs$scientific.name == 'Microstomus kitt',]
#haddock <-lfs[lfs$scientific.name == 'Melanogrammus aeglefinus',]

# plot some distributions 
barplot(table(rep(plaice$lngtclass,plaice$hlnoatlngt)),space=0)  # the entire length distribution for all the fish in the data you chose
  7. Merge the lfs with the chrons. One reason is that only the +ve part of the lf data is typically stored, and you need a complete list of stations to find the zero hauls.
# Get the chrons for the same survey (e.g. Practical01)
chrons <- parseExchangeFormatChrons(wd="/home/einarhj/r/Pakkar/datrasr")
# Tidy up the chrons
chrons <- chrons[chrons$haulval=='V',]          #only valid hauls
chrons <- chrons[!duplicated(chrons),]          # chuck any duplicates

Do the merging, i.e. merge the positive component (the length frequencies) with the chrons. The result is a file with zeros for hauls where you fished but caught nothing.

merged.plaice  <- mergeChronsLfs(chrons=chrons,length.frequencies=plaice)
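
mergeChronsLfs comes with the package; conceptually it is just an outer join of hauls and catches with the missing catches set to zero. A rough base-R sketch of the idea (the join keys are whatever columns identify a haul):

# m <- merge(chrons, plaice, all.x = TRUE)   # keep every haul, matched or not
# m$hlnoatlngt[is.na(m$hlnoatlngt)] <- 0     # no length record for a haul means zero catch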
  8. It is always useful to examine plots of length frequencies and/or probability density functions by year (why?)
fdat <- merged.plaice
table(chrons$year,chrons$quarter)   # find range of data 

par(mar=c(3,3,1,1),oma=c(3,3,3,3),mfcol=c(3,4))
plot.Lfs(fish=fdat,what.quarter=1,chrons1=chrons)              # e.g. plot the length frequencies

plot.PDensity(fish=fdat,what.quarter=1,chrons1=chrons,bw=.5)   # e.g. plot the density function
  9. Plot CPUE by year and quarter

Note: in my opinion it is always preferable to keep the catch and effort databases separate, choose a level of aggregation, aggregate over that level, and then divide the catch by the effort.

fish <- merged.plaice

fish <- fish[!is.na(fish$haullong),]     # sometimes there are incomplete chron records.

fish$cpue.by.n  <- fish$hlno/fish$hauldur # divide catch by effort (n)
fish$cpue.by.wt <- fish$hlwt/fish$hauldur # divide catch by effort (weight)

nfish <- aggregate(list(cpue.by.n=fish$cpue.by.n,cpue.by.wt=fish$cpue.by.wt),
                   list(year=fish$year,quarter=fish$quarter),mean,na.rm=T)     # average cpue over year and quarter. Is this correct?


# Put on absolute time (qtrend).

nfish$qtrend <- qtrend.f(nfish$year,nfish$quarter,start.year=min(nfish$year))
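
qtrend.f is a package helper; conceptually it just turns the year/quarter pairs into a continuous quarter counter, something like the following (an assumption about the exact convention used):

# nfish$qtrend <- (nfish$year - min(nfish$year)) * 4 + nfish$quarter   # quarters elapsed since the first year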

par(mfrow=c(2,1),mar=c(2,4,2,2))
plot(nfish$qtrend,nfish$cpue.by.n,type='b',xlab='time',ylab='cpue',xaxt='n');title('cpue by n')
axis(side=1,at=nfish$qtrend,labels=as.character(nfish$year))
plot(nfish$qtrend,nfish$cpue.by.wt,type='b',xlab='time',ylab='cpue',xaxt='n');title('cpue by wt')
axis(side=1,at=nfish$qtrend,labels=as.character(nfish$year))
  10. Map total numbers of individuals caught and/or weights

How about Bubble plots?

# Select data (species) 
fish <- merged.plaice   # choose plaice or whatever
#fish <- merged.dab
#fish <- merged.angler
#fish <- merged.cod
#fish <- merged.haddock

fish <- fish[!is.na(fish$haullong),]      # sometimes there are incomplete chron records.

fish$cpue.by.n  <- fish$hlno/fish$hauldur # divide catch by effort (n)
fish$cpue.by.wt <- fish$hlwt/fish$hauldur # divide catch by effort (weight)

nfish2 <- aggregate(list(cpue.by.n=fish$cpue.by.n,cpue.by.wt=fish$cpue.by.wt),
list(scientific.name=fish$scientific.name,year=fish$year,quarter=fish$quarter,shootlat=fish$shootlat,shootlong=fish$shootlong),mean,na.rm=T) # average cpues caught over year and quarter

table(nfish2$year,nfish2$quarter)         # investigate the range of the data to get year quarter combination to plot

par(mfrow=c(1,1))                         # plot data for quarter 1 2007
plotMapBlobs(input=nfish2,what.year=2007,what.quarter=1,what.cpue='cpue.by.n',xlim0=c(-15,15),ylim0=c(48,62),scaling.factor=.5)

# Compare spatial distributions among groups of years

print(sort(unique(nfish2$year)))  # how many years? adjust mfrow accordingly & remember the correct quarter argument

par(mfrow=c(2,2),cex.main=.8)
for(i in 2007:2010){
plotMapBlobs(input=nfish2,what.year=i,what.quarter=1,what.cpue='cpue.by.n',xlim0=c(-2,8),ylim0=c(48,62),scaling.factor=.5) 
}

#
# Grid of colors scaled to abundance
#

# Note: same thing here. We want to first choose our level of aggregation, sum the numbers or weights over it, do the same for the effort, and then divide one by the other.

fish <- merged.plaice    # choose the species
#fish <- merged.dab
#fish <- merged.angler
#fish <- merged.cod
#fish <- merged.haddock
#

fish <- fish[!is.na(fish$haullong),]     # sometimes there are incomplete chron records.


grid.size.y <- 0.5 # grid size here is ICES statistical rectangle but this is entirely flexible depending on the resolution of your survey
grid.size.x <- 1  

we <- -16             # Set the boundaries for your grid
ea <- 10
so <- 48
no <- 62

par(mfrow=c(1,1))    # number of plots

# Take out data for 1 year and 1 quarter

table(chrons$year,chrons$quarter) # examine distribution of observations using the chron data

what.year <- 2010
what.quarter <- 1

fish.yq <- fish[fish$quarter== what.quarter & fish$year == what.year,]         # create new dataset with selected year and quarter
chrons.yq <- chrons[chrons$quarter== what.quarter & chrons$year == what.year,] # create new dataset with selected year and quarter

Maps of total weights (not run, overlay function deprecated):

wts <-TrawlSurveyGridPlot(fish.yq, we=we,ea=ea,so=so,no=no,nameLon = "shootlong", nameLat = "shootlat",plotMap=T,
nameVarToSum="hlwtatlngt",cellsizeX = grid.size.x, cellsizeY = grid.size.y,
legendx='bottomright',numcats=10,plotPoints=T,legendncol=2,paletteCats = "topo.colors",addICESgrid=TRUE,legendtitle="weights")

Maps of total numbers (not run, overlay function deprecated):

nos <-TrawlSurveyGridPlot(fish.yq, we=we,ea=ea,so=so,no=no,nameLon = "shootlong", nameLat = "shootlat",plotMap=T,
nameVarToSum="hlnoatlngt",cellsizeX = grid.size.x, cellsizeY = grid.size.y,
legendx='bottomright',numcats=10,plotPoints=T,legendncol=2,paletteCats = "topo.colors",outGridFile="text.txt",addICESgrid=TRUE,legendtitle="numbers")

# map of trawling effort from chron files

effort<-TrawlSurveyGridPlot(chrons.yq, we=we,ea=ea,so=so,no=no,nameLon = "shootlong", nameLat = "shootlat",plotMap=T,
nameVarToSum="hauldur",cellsizeX = grid.size.x, cellsizeY = grid.size.y,
legendx='bottomright',numcats=10,plotPoints=T,legendncol=2,paletteCats="heat.colors",addICESgrid=TRUE,legendtitle="minutes trawled")


# Cpue using the output from TrawlSurveyGridPlot

im.nos <-  as.image.SpatialGridDataFrame(nos, attr=2)     # extract nos from output of TrawlSurveyGridPlot
im.eff <-  as.image.SpatialGridDataFrame(effort, attr=2)  # extract effort from output of TrawlSurveyGridPlot

im.cpue <- im.nos
im.cpue$z <- im.nos$z/im.eff$z                             # divide catch by effort


data(nseaBathy)    # North Sea
data(nwestBathy)   # North west

library(fields)

image.plot(im.nos$x,im.cpue$y,log(im.cpue$z+1),col=topo.colors(10),xlab='',ylab='',xlim=c(-5,8))                     # plot the data on a map
contour(nseaBathy$x,nseaBathy$y,nseaBathy$z,add=T,level=c(20,40,60),col='black')         # add on depth contour
contour(nwestBathy$x,nwestBathy$y,nwestBathy$z,add=T,level=c(100,200,500,1000,2000,4000),col='black') 

map("worldHires", add=TRUE, col='darkseagreen', fill=TRUE, bg="white",
regions=c('uk','ireland','france','germany','netherlands', 'norway','belgium',
'spain','luxembourg','denmark', 'sweden','iceland', 'portugal','italy','sicily','ussr','sardinia','albania','monaco','turkey','austria',
'switzerland','czechoslovakia','finland','libya', 'hungary','yugoslavia','poland','greece','romania','bulgaria', 'slovakia','morocco',
'tunisia','algeria','egypt' ))

FURTHER EXERCISES

Practical 03 - Exploring the Age Length Keys

  1. Install R (http://cran.r-project.org/) packages and Tinn-R (http://sourceforge.net/projects/tinn-r/).
library(sp);library(mda);library(doBy);library(PBSmapping);library(maps);library(maptools);library(rgdal);library(spatial);library(fields)  # attach to R search path
  2. Select some data from the ICES website - http://datras.ices.dk/Data_products/Download/Download_Data_public.aspx
  3. Save the data to a directory of your choice, e.g. 'c:/datras/data', and extract them from the zip files.
  4. Install the R library datras. Do this by downloading datras*zip from the sharepoint. Open R, go to Packages, and install the package from the local zip file.
  5. Attach the datras package to the search path
  6. Extract the age-length, sex and maturity data from the relevant directory
alk <- parseExchangeFormatALKs(wd = "/home/einarhj/r/Pakkar/datrasr")
head(alk) # have a look at the data
  7. Standardise to cm length categories
alk$lngtclass[alk$lngtcode == "."] <- alk$lngtclass[alk$lngtcode == '.']/10
alk$lngtclass[alk$lngtcode == 0] <- alk$lngtclass[alk$lngtcode == 0]/10
  8. Round down length classes & add 0.5
alk$lngtclass[alk$lngtcode != "5"] <- floor(alk$lngtclass[alk$lngtcode != "5"])  # round down (as in Practical 02)
alk$lngtclass[alk$lngtcode != "5"] <- alk$lngtclass[alk$lngtcode != "5"]+0.5     # and label the bin midpoint
print(table(alk$lngtclass)) # print distribution of ALL length categories
print(table(alk$age))      # print distribution of ALL ages
  9. Put on the species name (see Practical 02).
data(tsn)  # (if not already attached in Practical 02)
alk$scientific.name <- as.character(tsn$completename[match(alk$speccode,tsn$tsn)])   # match scientific name onto species code
sort(unique(alk$scientific.name)) # List of species with at least some age data
# Note: you tend to only get more commercially important species being aged etc.
table(alk$scientific.name,alk$age) # print distribution of ages by species (Note: this is for all ages combined).
  10. Tidy up
alk <- alk[!duplicated(alk),]     #  remove any duplicates
alk <- alk[alk$speccode != -9,]   #  missing species codes
alk <- alk[alk$lngtclass != -9,]  #  missing length classes
alk <- alk[alk$age != -9,]        # missing ages
alk <- alk[!is.na(alk$lngtcode),] #  missing length codes
alk <- alk[!is.na(alk$scientific.name),] # missing scientific names
alk$age <- as.numeric(as.character(alk$age))
  11. Select a species for a particular year and quarter and examine the age length key
table(alk$year,alk$scientific.name) # investigate range of years covered for each species
table(alk$quarter)                  # see range of quarters
  12. Choose a species, year and quarter to examine
what.species    <- 'Pleuronectes platessa'   
what.year      <- 2001
what.quarter   <- 1

alk1 <- alk[alk$scientific.name == what.species & alk$year == what.year & alk$quarter == what.quarter,]

alk1t <- table(alk1$lngtclass,alk1$age) # create a table of the distribution of ages at length for the chosen species

print(alk1t)                          # have a look at the age distribution by length

rowSms <- apply(alk1t,1,sum,na.rm=T) # use apply to calculate row sums

alk1tp <- round(alk1t/rowSms,3)      # divide numbers by total to get proportions

print(alk1tp)                        # have a look at the proportions
  13. Plot the data.
# Try a barplot
par(mfrow=c(1,1))
barplot(t(alk1tp),space=0,col=1:dim(alk1tp)[[2]],ylab="P",xlab="Length(cms)",
legend.text=dimnames(alk1tp)[[2]],args.legend=list(x='topleft',bg='wheat'))

# Try a matrix plot 

matplot(alk1tp,type='l',ylab='P(age)',xlab="Length(cms)")

# Since the data are a matrix, why not try an image (heat map) plot?

alk1tp[alk1tp==0] <- NA
library(fields)
image.plot(as.numeric(dimnames(alk1tp)[[1]]), as.numeric(dimnames(alk1tp)[[2]]),alk1tp,col=topo.colors(100),xlab='Length(cms)',ylab='Age')
contour(as.numeric(dimnames(alk1tp)[[1]]), as.numeric(dimnames(alk1tp)[[2]]),alk1tp,add=T)
  14. Apply the ALK to the length frequency data

The simplest approach is to multiply the estimated proportion of each age at each length by the number of individuals caught at that length. If you do this you must ensure that for every length in your length frequency data there is a corresponding length in your ALK.
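
A toy illustration of the principle with made-up numbers: if the ALK says that 30% of 25 cm fish are age 2 and 70% are age 3, then 100 fish measured at 25 cm contribute 30 age-2 and 70 age-3 fish.

p.at.age <- c(age2 = 0.3, age3 = 0.7)   # made-up ALK proportions at 25 cm
100 * p.at.age                          # numbers at age contributed by 100 fish at that length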

lfs <- parseExchangeFormatLFs(wd="/home/einarhj/r/Pakkar/datrasr")
#  Again make sure to add on scientific name using TSN (downloaded from ITIS website). 
#data(tsn)  # itis database of all species in the world
#head(tsn)  # first 5 lines
lfs$scientific.name <- as.character(tsn$completename[match(lfs$speccode,tsn$tsn)])   # match scientific name 
# Tidy up
lfs <- lfs[!duplicated(lfs),]                  #  remove any duplicates
lfs <- lfs[lfs$speccode != -9,]                #  remove missing species codes
lfs <- lfs[lfs$lngtclass != -9,]               #  remove missing length classes
lfs <- lfs[!is.na(lfs$lngtclass),]             #  remove missing length classes
lfs <- lfs[!is.na(lfs$lngtcode),]              #  remove missing length codes
lfs <- lfs[!is.na(lfs$scientific.name),]       #  remove any missing scientific names 
lfs$hlnoatlngt <- lfs$hlnoatlngt*lfs$subfact   #  multiply by the subfactor
# Standardise length codes to cms (some are in mms).
lfs$lngtclass[lfs$lngtcode == "."] <- lfs$lngtclass[lfs$lngtcode == '.']/10
lfs$lngtclass[lfs$lngtcode == "0"] <- lfs$lngtclass[lfs$lngtcode == "0"]/10
lfs$lngtclass[lfs$lngtcode != "5"] <- floor(lfs$lngtclass[lfs$lngtcode != "5"])  # round down (don't do this to fish binned at 5cm intervals)
lfs$lngtclass[lfs$lngtcode != "5"] <- lfs$lngtclass[lfs$lngtcode != "5"]+0.5     # and label the bin midpoint
# Useful to insert a weight for each length category for each species.
# Remember that this is usually done with parameters (a & b) from
# non-linear equations where Weight in grammes = aL^b  where L = length.
data(length.weight)   # (if not already attached in Practical 02)
# Match a and b parameters onto the correct species
lfs$a <- length.weight$a[match(lfs$scientific.name,length.weight$scientific.name)]
lfs$b <- length.weight$b[match(lfs$scientific.name,length.weight$scientific.name)]
# Use the parameters to estimate the weight in kgs.  Remember that weight = aL^b
lfs$hlwtatlngt <- (lfs$a*lfs$lngtclass^lfs$b)*lfs$hlnoatlngt/1000
  15. Extract a species to 'play' with. Remember that this should be one for which age length keys are available.
plaice <- lfs[lfs$scientific.name == 'Pleuronectes platessa',]
  16. Get the chrons (e.g. Practical 01) because it's easier to use them to estimate effort at different spatial levels of aggregation.
chrons <- parseExchangeFormatChrons(wd="/home/einarhj/r/Pakkar/datrasr")
#  Usual 'tidy up' procedure.
chrons <- chrons[chrons$haulval=='V',]          # only valid hauls
chrons <- chrons[!duplicated(chrons),]          # chuck any duplicates
  17. Make sure we have zeros in the dataset for stations that were trawled but where no individuals were caught.
merged.plaice  <- mergeChronsLfs(chrons=chrons,length.frequencies=plaice)
fish <- merged.plaice 
  18. Calculate a very simple index, to see the principles, using the year/quarter combination selected above.
lf1 <- fish[fish$year == what.year & fish$quarter == what.quarter,]  # select data for what.year and what.quarter
head(lf1)
lf2 <- tapply(lf1$hlnoatlngt,lf1$lngtclass,sum,na.rm=T) # the total sum of individuals caught on that survey 
print(lf2) #Examine the data
# See if the available lengths correspond to the available aged lengths
length(lf2)
dim(alk1tp)
alk2tp <- data.frame(alk1tp)
dimnames(alk2tp)[[2]] <- c("lngtclass","age","p")   # Convert to a dataframe which can make life easier (relational database format)
alk2tp$lngtclass <- as.numeric(as.character(alk2tp$lngtclass))  # convert factor levels to numeric (via character)
alk2tp$age <- as.numeric(as.character(alk2tp$age))              #
head(alk2tp)
# Match the length frequencies with the lengths
alk2tp$hlnoatlngt <- lf2[match(alk2tp$lngtclass,as.numeric(dimnames(lf2)[[1]]))]
alk2tp$no.at.age  <- alk2tp$hlnoatlngt * alk2tp$p         # multiply the frequencies by the proportions
alk2tp$no.at.age[is.na(alk2tp$no.at.age)] <- 0            # replace NAs with zeros
# create the most straightforward index possible for that survey
index <- aggregate(list(no.at.age=alk2tp$no.at.age),list(age=alk2tp$age),sum,na.rm=T)

Plot the index for the year selected.

par(mfrow=c(1,1))
plot(index$age,index$no.at.age,ylab='nos at age', xlab='age',type='b',lwd=1)  # Linear

plot(index$age,log(index$no.at.age+1),ylab='log nos at age', xlab='age',type='b')   # log scale
abline(lm(log(no.at.age+1)~age,data=index),lty=3,col='blue')  # a catch curve: the slope roughly approximates -Z (total mortality)
  19. Now create a dataset of numbers at age per ICES statistical rectangle using the function NosatAgebyYearStatSquare
out <- NosatAgebyYearStatSquare(lfdat = merged.plaice,alk = alk, chrons=chrons,  what.species ='Pleuronectes platessa') 
print(head(out))
# Plot the nos by year age and stat rectangle.
library(lattice)
xyplot(log(hlnoatage)~year|age,groups=statrec,type='l',data=out)
  20. Compare indices calculated in different ways
# total numbers divided by total effort
a <- aggregate(list(hlnoatage=out$hlnoatage), list(age=out$age,year=out$year),sum,na.rm=T) # sum the total numbers
hd <- aggregate(list(haulduration=chrons$hauldur/60), list(year=chrons$year),sum,na.rm=T)            # sum the effort 
b0 <- merge(a,hd,all=T)                        # merge
b0$cpue <- b0$hlnoatage/b0$haulduration        # calculate cpue
b0 <- b0[!is.na(b0$hlnoatage),]
print(head(b0)) # examine the data b0
# versus AVERAGE of the separate cpue in each statistical rectangle
out$cpue <- out$hlnoatage/out$haulduration
b1 <- aggregate(list(cpue=out$cpue), list(age=out$age,year=out$year),mean,na.rm=T)
print(head(b1))
# Plot them both to compare
par(mfrow=c(3,4),mar=c(2,2,3,2))
# This bit just finds the range of the data for plotting
u.ages <- sort(unique(b0$age))
r0 <- tapply(b0$cpue,b0$age,range)
r1 <- tapply(b1$cpue,b1$age,range)
r2 <- as.list(u.ages)
for (i in u.ages){
  r2[[i+1]] <- range(c(r0[[as.character(i)]], r1[[as.character(i)]]))
}
# Do the plotting
for(i in u.ages) {
  plot(b0$year[b0$age==i],b0$cpue[b0$age==i],type='l',xlab='',ylab='',ylim=r2[[i+1]])
  lines(b1$year[b1$age==i],b1$cpue[b1$age==i],type='l',col='blue')
  title(paste('age',i,sep=' '),cex.main=1.2)
}
  21. Now create a dataset of numbers at age at each station (e.g. by lat and long) using the function NosatAgebyYearLonLat
out <- NosatAgebyYearLonLat(lfdat = merged.plaice,alk = alk, chrons=chrons,  what.species ='Pleuronectes platessa') 
out[1:5,]                                        # First 5 lines
out$cpue.by.n <- out$hlnoatage/out$haulduration  # add on CPUE 
out$cpue.by.wt <- out$hlwtatage/out$haulduration
  22. Plot the data by age

On a bubble plot:

par(mfrow=c(1,1))
plotMapBlobs(out[out$age==3,],what.quarter=1,what.year = 2007,what.cpue='cpue.by.n',xlim0=c(-16,12),ylim0=c(45,62),scaling.factor=.5)

Or a grid

grid.size.y <- .5                                                # grid size here is ICES statistical rectangle but this is entirely flexible
grid.size.x <- 1  

we <- -16                                                        # east and west boundaries
ea <- 10
so <- 48
no <- 62

par(mfrow=c(1,1))                                                # number of plots

out3 <- out[is.na(out$age) | (out$age==3 & out$year ==2007),]   # keep the zero hauls (age NA) plus the age-3 records for 2007


nos <-TrawlSurveyGridPlot(out3, we=we,ea=ea,so=so,no=no,nameLon = "shootlong", nameLat = "shootlat",plotMap=T,
nameVarToSum="hlnoatage",cellsizeX = grid.size.x, cellsizeY = grid.size.y,
legendx='bottomright',numcats=10,plotPoints=T,legendncol=2,paletteCats = "topo.colors",addICESgrid=TRUE,legendtitle="nos")

effort <-TrawlSurveyGridPlot(out3, we=we,ea=ea,so=so,no=no,nameLon = "shootlong", nameLat = "shootlat",plotMap=T,
nameVarToSum="haulduration",cellsizeX = grid.size.x, cellsizeY = grid.size.y,
legendx='bottomright',numcats=10,plotPoints=T,legendncol=2,paletteCats = "topo.colors",addICESgrid=TRUE,legendtitle="haul duration")

im.nos <-  as.image.SpatialGridDataFrame(nos, attr=2)      # extract nos from output of TrawlSurveyGridPlot
im.eff <-  as.image.SpatialGridDataFrame(effort, attr=2)   # extract effort from output of TrawlSurveyGridPlot

im.cpue <- im.nos
im.cpue$z <- im.nos$z/im.eff$z                             # divide catch by effort

#data(nseaBathy)                                            # North Sea
#data(nwestBathy)                                           # North west

library(fields)

image.plot(im.nos$x,im.cpue$y,log(im.cpue$z+1),col=topo.colors(10),xlab='',ylab='')                     # plot the data on a map
contour(nseaBathy$x,nseaBathy$y,nseaBathy$z,add=T,level=c(20,40,60),col='black')         # add on depth contour
#contour(nwestBathy$x,nwestBathy$y,nwestBathy$z,add=T,level=c(100,200,500,1000,2000,4000),col='black') 

map("worldHires", add=TRUE, col='darkseagreen', fill=TRUE, bg="white",
regions=c('uk','ireland','france','germany','netherlands', 'norway','belgium',
'spain','luxembourg','denmark', 'sweden','iceland', 'portugal','italy','sicily','ussr','sardinia','albania','monaco','turkey','austria',
'switzerland','czechoslovakia','finland','libya', 'hungary','yugoslavia','poland','greece','romania','bulgaria', 'slovakia','morocco',
'tunisia','algeria','egypt' ))

Multinomial logit modelling of age length keys. The data are packed full of missing lengths at various categories, so with the simple arithmetic approach above there are often no data available. There may be instances where the key can be modelled over missing length categories to provide more information.

alk <- parseExchangeFormatALKs(wd="/home/einarhj/r/Pakkar/datrasr")
#Note: There is something wrong with the French IBTS data? 
#alk$lngtclas[alk$ship == 'THA2' & alk$year ==2006] <- alk$lngtclas[alk$ship == 'THA2' & alk$year ==2006]/10
alk$lngtclass[alk$lngtcode == "."] <- alk$lngtclass[alk$lngtcode == '.']/10
alk$lngtclass[alk$lngtcode == 0]   <- alk$lngtclass[alk$lngtcode == 0]/10
# Round it down
alk$lngtclass <- floor(alk$lngtclass)
# Put on species name (see Practical 2).  
alk$scientific.name <- as.character(tsn$completename[match(alk$speccode,tsn$tsn)])   # match scientific name onto species code 
#  Tidy up
alk <- alk[!duplicated(alk),]     #  remove any duplicates
alk <- alk[alk$speccode != -9,]   #  missing species codes
alk <- alk[alk$lngtclass != -9,]  #  missing length classes
alk <- alk[!is.na(alk$lngtcode),] #  missing length codes
alk$age <- as.numeric(as.character(alk$age))
# Select a species for a particular year and quarter and examine the age length key 
# Chose a species, year and quarter to examine
what.species    <- 'Gadus morhua' # These are examples.
alk1 <- alk[alk$scientific.name == what.species,]
#Extract the columns of interest
alk2 <- data.frame(lngtclass=alk1$lngtclass,age=alk1$age,scientific.name = alk1$scientific.name)
xtabs(~age+lngtclass,data=alk2)     # examine the data
# install.packages('mlogit') # Install package if necessary
library(mlogit)
mldata <- mlogit.data(alk2,varying=NULL,choice='age',shape='wide')
mlogit.model<- mlogit(age~1|lngtclass, data = mldata, reflevel="1")
summary(mlogit.model)
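
Rather than retyping the estimates (as done below), the coefficients can be pulled straight from the fitted object; a small sketch, noting that the element names depend on the age levels present in your data:

cf <- coef(mlogit.model)   # named vector, with names like "0:(intercept)", "0:lngtclass", ...
head(cf)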
#  Calculate the logit values
newdata <- data.frame(lngtclass = 1:90)

EINAR: TO DO - below is somewhat strange (the coefficients are hard-coded from one particular fit and will not match other data)

logit0 <- 13.9517 - 0.8576*newdata$lngtclass
logit1 <- rep(0, nrow(newdata))                 # age 1 is the reference level
logit2 <- -6.5674 + 0.1988*newdata$lngtclass
logit3 <- -10.099 + 0.2625*newdata$lngtclass
logit4 <- -15.8707 + 0.3558*newdata$lngtclass
logit5 <- -18.9503 + 0.3840*newdata$lngtclass
logit6 <- -19.8364 + 0.3791*newdata$lngtclass
logit7 <- -21.9500 + 0.3995*newdata$lngtclass
# We can exponentiate these logit values and scale them, knowing that the probabilities for all the ages must sum to one.
logits <- cbind(logit0, logit1, logit2, logit3,logit4,logit5,logit6,logit7)
p.unscaled <- exp(logits)
p <- cbind(newdata, (p.unscaled / rowSums(p.unscaled)))
colnames(p) <- c("lngtclass", "age0", "age1", "age2", "age3","age4","age5","age6","age7")
apply(p[,-1],1,sum) # check: each row should sum to 1

np <- p[,-1]

par(mfrow=c(1,1),mar=c(5,5,3,3))
barplot(t(np),space=0,col=1:dim(np)[[2]],ylab="P",xlab="Length(cms)",
        legend.text=dimnames(np)[[2]],args.legend=list(x='topleft',bg='wheat'))


# This basic approach can be extended to incorporate other 'predictor' variables, e.g. sex.

# As an exercise see if you can expand the model to do this and apply the predicted or modeled ALKs to the appropriate length-frequency data.


alk2 <- data.frame(lngtclass=alk1$lngtclass,age=alk1$age,sex=alk1$sex)

# Just males and females and no missing data

alk3 <- alk2[alk2$sex %in% c('M','F'),]
alk3$sex <- ifelse(alk3$sex =="M",0,1)   # recode: male = 0, female = 1


xtabs(~age+lngtclass+sex,data=alk3)     # examine the data

mldata <- mlogit.data(alk3,varying=NULL,choice='age',shape='wide')

mlogit.model<- mlogit(age~1|lngtclass + sex, data = mldata, reflevel="1")

summary(mlogit.model)

Practical 04 - Exploring trawl data with statistical models

The aim of this exercise is to download ICES exchange format data from www.ices.dk, create numbers-at-age data and investigate the output with various types of interpolation and regression models. We can then look at the implications of various assumptions.

library(mgcv)
library(akima)
  1. Select some data from the ICES website - http://datras.ices.dk/Data_products/Download/Download_Data_public.aspx
  2. Save the data to a directory of your choice, eg. 'c:/datras/data' and extract them from the zip files.
  3. Install the R library datras. Do this by downloading datras*zip from the sharepoint. Open R, go to packages, install package from local zip files.
  4. Attach datras package and others to search path
  5. Extract the chrons, lf and alk data for the survey of your choice from the relevant directory
alk <- parseExchangeFormatALKs(wd="/home/einarhj/r/Pakkar/datrasr")
lfs <- parseExchangeFormatLFs(wd="/home/einarhj/r/Pakkar/datrasr")
chrons <- parseExchangeFormatChrons(wd="/home/einarhj/r/Pakkar/datrasr")
  6. Tidy up the data, this time using the tidy** functions, which simply make things tidier.
alk    <- tidyALKdata(input=alk)
lfs    <- tidyLfData(input=lfs)
chrons <- tidyChronData(input=chrons)
lfs$scientific.name[lfs$scientific.name == 'Solea vulgaris'] <- 'Solea solea'      # recode Solea vulgaris to Solea solea (the a and b length-weight parameters are stored under the latter name)
alk$scientific.name[alk$scientific.name == 'Solea vulgaris'] <- 'Solea solea'
head(alk) # have a look at the data to make sure they are still there. 
head(lfs)
head(chrons)
#  NOTE: you can ONLY proceed in this Practical for species with age data available; look at the species with ALK data
# sort(unique(alk$scientific.name))
ww <- grep('Culex', alk$scientific.name)      # There are mosquitoes in the NS IBTS data
alk[ww,][1:5,]
  7. Merge the length-frequencies and chrons
what.species <- 'Gadus morhua'  # choose a species
merged.fish  <- mergeChronsLfs(chrons=chrons,length.frequencies=lfs[lfs$scientific.name==what.species,])
  8. Get the numbers at age at each station. Note: all the modelling work below will be based on these files.
nage <- NosatAgebyYearLonLat(lfdat = merged.fish,alk = alk, chrons=chrons,  what.species = what.species)
head(nage,15) # examine the first 15 lines
  9. Put on CPUEs at each station
nage$cpue.by.n   <- nage$hlnoatage/nage$haulduration
nage$cpue.by.wt  <- nage$hlwtatage/nage$haulduration
  10. Simple linear interpolation can be handy.
what.age  <- 4   # choose an age
grid.size <- 40  # choose grid size (larger number = finer grid)
nage.a <- nage[is.na(nage$age) | nage$age == what.age,]    # select age to plot. Remember this is all the data for all the years. 
summary(nage.a$cpue.by.n) # check you have any +ve data. Sometimes you just have zeros which makes interpolation tricky!
nage.a <- nage.a[!is.na(nage.a$cpue.by.n),]
inage <- interp(jitter(nage.a$shootlong),jitter(nage.a$shootlat),nage.a$cpue.by.n,
                xo=seq(min(nage.a$shootlong), max(nage.a$shootlong), length = grid.size),
                yo=seq(min(nage.a$shootlat), max(nage.a$shootlat), length = grid.size)) # do the interpolation
par(mfrow=c(1,1))
image.plot(inage$x,inage$y,log(inage$z+1),col=topo.colors(100),xlab='longitude',ylab='latitude') # plot the nos at age
contour(inage$x,inage$y,log(inage$z+1),add=T)                                                  # contour too
#data(nseaBathy)
#contour(nseaBathy$x,nseaBathy$y,nseaBathy$z,col='white',levels=c(40,100),add=T)                # depths  too
map("worldHires", add=TRUE, col='darkseagreen', fill=TRUE, bg="white",                         
regions=c('uk','ireland','france','germany','netherlands', 'norway','belgium',
'spain','luxembourg','denmark', 'sweden','iceland', 'portugal','italy','sicily','ussr','sardinia','albania','monaco','turkey','austria',
'switzerland','czechoslovakia','finland','libya', 'hungary','yugoslavia','poland','greece','romania','bulgaria', 'slovakia','morocco',
'tunisia','algeria','egypt' ))                                                                  # add a map

IDEAS for EXERCISES

In this file (nage) we only have year, quarter, shootlong, shootlat, and age as potential PREDICTOR variables. There are of course many other possibilities, e.g. depth, temperature and salinity, and these can be incorporated from the chron files if need be. For now, however, the complexity of even these few PREDICTORS is enough to be vexing, as you will see.

  13. Linear models

13.1 Model the CPUE as a function of year and age using linear models
nage$age <- as.factor(nage$age)
table(nage$age)
mdat <- nage[nage$age != "0",]    # optional
m0 <- lm(cpue.by.n ~ 1,data=mdat)         # The 'null' model against which others will be selected
m1 <- lm(cpue.by.n ~ year,data=mdat)      # The linear effect of year
m2 <- lm(cpue.by.n ~ year+age,data=mdat)  # plus the linear effect of age
m3 <- lm(cpue.by.n ~ year*age,data=mdat)  # plus the interaction between year and age. Are the lines parallel ?
anova(m0,m1,m2,m3)         # which model would you choose?
summary(m3)                # what do you think of this model?
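
An information-criterion comparison of the same fits (smaller is better) is a useful complement to the F-tests:

AIC(m0, m1, m2, m3)   # compare the four fits on AIC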
# Create a data set of years and ages over which to demonstrate the results
newdata <- expand.grid(year = min(mdat$year,na.rm=T):max(mdat$year,na.rm=T),
                       age  = as.factor(min(as.numeric(as.character(mdat$age)),na.rm=T ):
                                          max(as.numeric(as.character(mdat$age)),na.rm=T )))
# Have a look at the 'new data'
head(newdata)
# Predict over the newdata using the model 
newdata$pred <- predict(m3,newdata=newdata,type='response')
xyplot(pred ~ year | age, data= newdata)  # anything odd? negative predictions?
# Investigate what might be odd patterns. Try again without the age 0s, since there are so
# few observations that they distort the output.

13.2 Select a single age group and model the trend with bendy (spline) functions. NOT RUN - SOME ERROR

library(splines)                                   # ns() comes from the splines package (its absence may be the error noted above)
mdat <- nage[nage$age != "0",]
z0 <- lm(cpue.by.n ~ 1,data=mdat)                  # the 'null' model against which others will be selected
z1 <- lm(cpue.by.n ~ ns(year,df=5),data=mdat)      # a smooth (natural spline) effect of year
z2 <- lm(cpue.by.n ~ ns(year,df=5)+age,data=mdat)  # plus the effect of age
z3 <- lm(cpue.by.n ~ ns(year,df=5)*age,data=mdat)  # plus the interaction between year and age
# Have a look at the model params 
summary(z3)
# Predict over the same new data using the model 
newdata$pred <- predict(z3,newdata=newdata,type='response')

xyplot(log(pred) ~ year | age, data= newdata)   # compare the plots output from the different models, i.e. substitute z2 for z3.
  14. Generalized Linear and Additive Models

Often data are not normal or log-normal. These models give you the ability to model data from different (discrete) distributions such as the binomial (Bernoulli), Poisson, etc. With these models the abundance measure (usually a count) should be modelled with the effort measure (hours towed, distance towed) as an offset. Beware, however: just because these models are more sophisticated doesn't make them necessarily better. In general you should use the simplest model possible.

14.1 Model the probability of catching a fish at each age in each year using a binomial GLM.

mdat      <- nage[nage$age != "0",]        # remove age 0s as usual 
mdat$bin  <- ifelse(mdat$hlnoatage==0,0,1) # create a binary vector (0=0 and 1= anything greater than zero). 

q0 <- glm(bin ~ offset(haulduration)+1,data=mdat,family=binomial)                  # The 'null' model against which others will be selected
q1 <- glm(bin ~ offset(haulduration)+ year,data=mdat,family=binomial)      # The  effect of year
q2 <- glm(bin ~ offset(haulduration)+ year+age,data=mdat,family=binomial)  # plus the  effect of age
q3 <- glm(bin ~ offset(haulduration)+ year*age,data=mdat,family=binomial)  # plus the interaction between year and age. Are the lines parallel ?
anova(q0,q1,q2,q3,test='Chi')    # which model would you choose?

# We can assess the fit with
1-pchisq(q3$null.deviance-q3$deviance, q3$df.null-q3$df.residual)

The chi-square of q3$null.deviance-q3$deviance with q3$df.null - q3$df.residual degrees of freedom and an associated p-value of less than 0.001 tells us that our model as a whole fits significantly better than an empty model. This is sometimes called a likelihood ratio test (the deviance is -2*log likelihood, up to a constant).

Make some predictions with the model. Note: now that we have the 'offset' in the model we must add it to the prediction grid. Let's standardise all our predictions to a common haul duration.

newdata$haulduration <- 1   # a common haul duration for every prediction
newdata$pred <- predict(q3,newdata=newdata,type='response')
xyplot(log(pred) ~ year | age, data= newdata, type='l')   # compare the plots output from the different models, i.e. substitute q2 for q3.

14.2 Model counts of fish using a GLM.

mdat      <- nage[nage$age != "0",] # remove age 0s as usual 
mdat$freq <- round(mdat$cpue.by.n)  # make sure numbers are integers. A bit of a fudge this and there are probably better ways...

p0 <- glm(freq ~ offset(haulduration)+1,data=mdat,family=quasipoisson)                  # The 'null' model against which others will be selected
p1 <- glm(freq ~ offset(haulduration)+ year,data=mdat,family=quasipoisson)      # The  effect of year
p2 <- glm(freq ~ offset(haulduration)+ year+age,data=mdat,family=quasipoisson)  # plus the  effect of age
p3 <- glm(freq ~ offset(haulduration)+ year*age,data=mdat,family=quasipoisson)  # plus the interaction between year and age. Are the lines parallel ?
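
One caveat worth knowing: with a log link the effort offset is conventionally entered on the log scale. A variant of the fullest model in that form, for comparison (not the form used above):

p3.log <- glm(freq ~ offset(log(haulduration)) + year*age, data=mdat, family=quasipoisson)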

anova(p0,p1,p2,p3,test='Chi')    # which model would you choose?

# We can assess the fit with
1-pchisq(p3$null.deviance-p3$deviance, p3$df.null-p3$df.residual)

The chi-square of p3$null.deviance-p3$deviance with p3$df.null - p3$df.residual degrees of freedom and an associated p-value of less than 0.001 tells us that our model as a whole fits significantly better than an empty model. This is sometimes called a likelihood ratio test (the deviance is -2*log likelihood, up to a constant).

Make some predictions with the model. Note: now that we have the 'offset' in the model we must add it to the prediction grid. Let's standardise all our predictions to 30 minutes.

newdata$haulduration <- 30
newdata$pred <- predict(p3,newdata=newdata,type='response')
xyplot(log(pred) ~ year | age, data= newdata, type='l')   # compare the plots output from the different models, i.e. substitute p2 for p3.

14.3 Model counts of fish using a GAM.

These models use 'non-parametric' smoothing functions based on the data to model the abundances. Can be handy for summarizing spatial patterns.

library(mgcv)
mdat      <- nage[nage$age != "0",] # remove age 0s as usual 
mdat$freq <- round(mdat$cpue.by.n)  # make sure numbers are integers. A bit of a fudge this and there are probably better ways...
mdat.a    <- mdat[mdat$age==7,]     # in this case reduce the complexity and model a single age category.

g0 <- gam(freq ~ offset(haulduration)+1,data=mdat.a,family=quasipoisson)                  # The 'null' model against which others will be selected
g1 <- gam(freq ~ offset(haulduration)+ s(shootlong,k=100),data=mdat.a,family=quasipoisson)      # The  effect of longitude
g2 <- gam(freq ~ offset(haulduration)+ s(shootlong,k=100)+s(shootlat,k=100),data=mdat.a,family=quasipoisson)  # plus the  effect of latitude

g3 <- gam(freq ~ offset(haulduration)+ s(shootlong,shootlat,k=25),data=mdat.a,family=quasipoisson)  # a 2-D smooth of longitude and latitude (their interaction)

anova(g0,g1,g2,g3,test='Chi')    # which model would you choose?

summary(g3)
1-pchisq(g3$null.deviance-g3$deviance, g3$df.null-g3$df.residual)

Do the predictions. This time we have to make a spatial grid. Use the datras function create.grid which creates a grid with a flag for whether or not it is outside the survey area. Handy since you do not want to extrapolate.

lon.range <- c(-3,8)  # set the lon and lat ranges for the grid depending on the survey
lat.range <- c(52,58)
grid.size <- 50       # set the resolution of the grid

grid <- create.grid(lon.range=lon.range, lat.range=lat.range, lon.n=grid.size, lat.n=grid.size, lon.obs=mdat.a$shootlong, lat.obs=mdat.a$shootlat)

# Extract some data from the output of create.grid that are handy for plotting.

lonnie <- grid$lonnie
lattie <- grid$lattie
grid <- grid$grid

plot(grid$shootlong,grid$shootlat,type='n',xlab='',ylab='')   # check grid looks ok
points(grid$shootlong[grid$ok.lat==T & grid$ok.lon==T ],grid$shootlat[grid$ok.lat==T & grid$ok.lon==T],col='red',pch='.')
points(chrons$shootlong,chrons$shootlat,pch=".",col='black')


grid$haulduration <- 1 # standardise to a common haul duration (same units as the model offset)

grid$pred <- predict(g3,grid)                      # note: by default predictions are on the link (log) scale; use type='response' for counts
grid$pred[grid$ok.lon==F | grid$ok.lat==F] <- NA   # blank out cells outside the surveyed area

image.plot(lonnie,lattie,matrix(grid$pred,50,50))
contour(lonnie,lattie,matrix(grid$pred,50,50),add=T)
# points(mdat.a$shootlong,mdat.a$shootlat,pch=16)   # put on locations of raw data to see if some of the extrapolation is justified.
map("worldHires", add=TRUE, col='darkseagreen', fill=TRUE, bg="white",
regions=c('uk','ireland','france','germany','netherlands', 'norway','belgium',
'spain','luxembourg','denmark', 'sweden','iceland', 'portugal','italy','sicily','ussr','sardinia','albania','monaco','turkey','austria',
'switzerland','czechoslovakia','finland','libya', 'hungary','yugoslavia','poland','greece','romania','bulgaria', 'slovakia','morocco',
'tunisia','algeria','egypt' ))
  15. EXERCISES

These are the numbers at a specific age modelled for a group of years, so the last map represents some sort of 'average' spatial distribution. Explore how the average spatial distribution DIFFERS by age. Explore how the spatial distribution for single age groups varies among years. Note this can be done either by looking at each age/year combination separately, or by incorporating age and year into the model, modifying the grid and predicting over it.

Practical 05 - Simple Community Analyses

The aim of this exercise is to download ICES exchange format data from www.ices.dk and to calculate various types of fish community indicators.

library(vegan)
start.year <- min(chrons$year,na.rm=T)
end.year   <- max(chrons$year,na.rm=T)
# Throw out any non-fish or keep them in if you want. You decide.
species.to.omit <- c("Aequipecten opercularis", "Arctica islandica" , "Buccinum undatum",
                     "Cancer pagurus" ,"Homarus gammarus", "Loligo subulata", "Loligo vulgaris",
                     "Loliginidae" ,"Loligo" ,"Loligo forbesii","Pecten maximus" ,
                     "Nephrops norvegicus" ,"Macropipus puber",  "Cephalopoda"  ,
                     "Illex coindetii", "Sepia officinalis","Sepiola","Sepiola atlantica",
                     "Sepiolida","Sepietta owenina","Sepia")

w0 <- which(lfs$scientific.name %in% species.to.omit)

nlfs <- lfs[-w0,]              # Create new data.frame with species that you selected thrown out
  8. Merge the length-frequencies with the chrons.

Make a shorter chron file with some important covariates to make computation quicker

short.chrons<- data.frame(quarter=chrons$quarter,country=chrons$country,ship=chrons$ship,stno=chrons$stno,
year=chrons$year,shootlong=chrons$shootlong,shootlat=chrons$shootlat,
hauldur=(chrons$hauldur/60))
#memory.size(4000)   # this can help sometimes with big jobs.
afish <- merge(nlfs,short.chrons,all=T)
  9. Total numbers of fish divided by the total effort.
afish.yr <- tapply(afish$hlnoatlngt, list(afish$year,as.character(afish$scientific.name)), sum,na.rm=T)
hd <- tapply(short.chrons$hauldur, list(short.chrons$year),sum,na.rm=T)

dat <- matrix(NA,nrow=dim(afish.yr)[1],ncol=dim(afish.yr)[2])
for (i in 1: dim(afish.yr)[2]){
  dat[,i] <- afish.yr[,i]/hd      # divide by effort
}
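
The loop above is equivalent to a single sweep of the effort vector down the rows:

# dat <- sweep(afish.yr, 1, hd, "/")   # divide each year's row of counts by that year's effort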
  10. Calculate diversity indices using the package vegan
dat <- ifelse(is.na(dat),0,dat) # replace NAs with 0s
dat<-round(dat)
# Calculate and plot the Diversity
H <- diversity(dat)
plot(start.year:end.year,H,type='l',xlab='',lwd=2)
abline(v=start.year:end.year,lty=2,col='blue')
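
By default diversity() returns the Shannon index, H = -sum(p_i * log(p_i)); a minimal check on a toy count vector:

counts <- c(10, 5, 1)    # toy counts of three species
p <- counts/sum(counts)  # proportions
-sum(p * log(p))         # the same value that diversity(counts) returns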

# Calculate and plot the Simpson
simp <- diversity(dat, "simpson");
par(mfrow=c(1,1))
plot(start.year:end.year,simp,type='l',xlab='',lwd=2)
abline(v=start.year:end.year,lty=2,col='blue')

# Calculate and plot the Inverse Simpson

invsimp <- diversity(dat, "inv");
plot(start.year:end.year,invsimp,type='l',xlab='',lwd=2)
abline(v=start.year:end.year,lty=2,col='blue')

# Calculate and plot rarefied species richness (expected number of species in samples of 2 individuals)

r.2 <- rarefy(dat, 2)

plot(start.year:end.year,r.2,type='l',xlab='',lwd=2)
abline(v=start.year:end.year,lty=2,col='blue')

# Fisher's alpha

alpha <- fisher.alpha(dat)

plot(start.year:end.year,alpha,type='l',xlab='',lwd=2)
abline(v=start.year:end.year,lty=2,col='blue')

# Plot them all against each other

pairs(cbind(H, simp, invsimp,r.2), pch="+", col="blue")   # they're all so correlated!

# Species richness (S) and Pielou's evenness (J):

S <- specnumber(dat) 

J <- H/log(S)

dat1<-cbind(H, simp, invsimp, r.2, S, J)

# Plot them all together

par(mfrow=c(2,3),mar=c(3,3,3,2),oma=c(4,4,4,4),cex=.9)
tit    <- c("Shannon","Simp","Invsimp","r.2","S","J")
y.labs <- c("H","Simp","Inv","r.2","S","J")
atime <- start.year:end.year
for(j in 1:6){
  plot(atime,dat1[,j],xaxt="n",xlab="",type='b',pch=16,ylab=y.labs[j]); title(tit[j])
  lines(supsmu(atime,dat1[,j],span=0.5), col='red',lwd=3)
  box(lwd=2)
  abline(v=seq(start.year,end.year,by=5),lty=2,col="blue")
  abline(v=c(1989,1994),lwd=2,col='black')
  axis(side=1,at=seq(start.year,end.year,by=1),seq(start.year,end.year,by=1),cex.axis=1)
}
mtext(outer=T,'Piscean Diversity',cex=1.7)

# Create a plot to insert into a Word Document # 
#savePlot(filename = "Piscean-diversity.wmf", type=c("wmf"), device=dev.cur())
  11. Some simple indices based on average lengths

Average length by year and scientific name. Note: you may want to include seasonal information (quarter) if available in your survey.

avl <- aggregate(list(a=(nlfs$lngtclass*nlfs$hlnoatlngt),b=nlfs$hlnoatlngt), list(year=nlfs$year,scientific.name=nlfs$scientific.name),sum,na.rm=T)

avl$mean.length <-avl$a/avl$b

which.species.to.plot <- sort(c("Zeus faber","Solea solea","Gadus morhua", "Platichthys flesus", "Psetta maxima", "Pollachius virens", "Hippoglossoides platessoides",
"Lepidorhombus whiffiagonis", "Mustelus mustelus","Pleuronectes platessa","Melanogrammus aeglefinus","Scyliorhinus canicula","Squalus acanthias"))

xyplot(mean.length~year|scientific.name,data=avl[avl$scientific.name %in% which.species.to.plot,],type='l')

# Average demersal fish length # 

# Would be ridiculous for me to 'hardwire' these choices into any code. Far better for you to make choices based on what you want.

group.to.omit <- c('Alosa agone','Alosa alosa','Alosa fallax','Ammodytes','Ammodytes marinus','Ammodytes tobianus', 'Ammodytidae',
'Clupea harengus','Engraulis encrasicolus','Scomber scombrus','Sardina pilchardus')    # Easier to omit the pelagics

w1 <- which(nlfs$scientific.name %in% group.to.omit)                                   # find where the omitted group is

nlfs0 <- nlfs[-w1,]                                                                    # Create new data.frame with species that you selected thrown out

avl <- aggregate(list(a=(nlfs0$lngtclass*nlfs0$hlnoatlngt),b=nlfs0$hlnoatlngt),
list(year=nlfs0$year),sum,na.rm=T)                                     # Average length over year.

avl$mean.length <- avl$a/avl$b

# Plot average length against year for the group chosen
par(mfrow=c(1,1))
plot(start.year:end.year, avl$mean.length,type='l',xlab='',ylab='')
abline(v=start.year:end.year,lty=2,col='blue')


  12. Large Fish Indicator

# Note: here we calculate the LFI by weight. Substitute hlnoatlngt to calculate it by numbers.

how.big <- 40 #   set your large fish threshold (cm)

big.fish <- afish[!is.na(afish$lngtclass) & afish$lngtclass >= how.big ,]

# By year #

afish.yr <- tapply(afish$hlwtatlngt, list(afish$year), sum,na.rm=T)
bfish.yr <- tapply(big.fish$hlwtatlngt, list(big.fish$year), sum,na.rm=T)

lfi     <- bfish.yr/afish.yr

par(mfrow=c(1,1))
plot(start.year:end.year,lfi,xlab='',ylab='',type='l',lwd=2)
abline(v=start.year:end.year,lty=3,col='blue')
lines(supsmu(start.year:end.year,lfi),col='red')
title('Large Fish Indicator')


# Spatially #

# Sum total weights over year and location 
afish.xy <- aggregate(list(all.hlwt=afish$hlwtatlngt), list(shootlong=afish$shootlong,shootlat=afish$shootlat,year=afish$year), sum,na.rm=T)

# Sum the weights of big fish over year and location

bfish.xy <- aggregate(list(big.hlwt=big.fish$hlwtatlngt), list(shootlong=big.fish$shootlong,shootlat=big.fish$shootlat,year=big.fish$year), sum,na.rm=T)

Not run (bug in function):

# Plot it on a grid/map using TrawlSurveyGridPlot #

grid.size.y <- .5                                                # grid size here is ICES statistical rectangle but this is entirely flexible
grid.size.x <- 1  

we <- -5                                                        # east and west boundaries
ea <- 10
so <- 48
no <- 62

par(mfrow=c(1,1))                                                # number of plots

awt <-TrawlSurveyGridPlot(afish.xy, we=we,ea=ea,so=so,no=no,nameLon = "shootlong", nameLat = "shootlat",plotMap=T,
nameVarToSum="all.hlwt",cellsizeX = grid.size.x, cellsizeY = grid.size.y,
legendx='bottomright',numcats=10,plotPoints=T,legendncol=2,paletteCats = "topo.colors",addICESgrid=TRUE,legendtitle="weight")

bwt <-TrawlSurveyGridPlot(bfish.xy, we=we,ea=ea,so=so,no=no,nameLon = "shootlong", nameLat = "shootlat",plotMap=T,
nameVarToSum="big.hlwt",cellsizeX = grid.size.x, cellsizeY = grid.size.y,
legendx='bottomright',numcats=10,plotPoints=T,legendncol=2,paletteCats = "topo.colors",outGridFile='test.txt',addICESgrid=TRUE,legendtitle="weight")


im.awt <-  as.image.SpatialGridDataFrame(awt, attr=2)      # extract total weights from output of TrawlSurveyGridPlot
im.bwt <-  as.image.SpatialGridDataFrame(bwt, attr=2)      # extract weights of big fish from output of TrawlSurveyGridPlot

im.lfi <- im.awt
im.lfi$z <- im.bwt$z/im.awt$z                              # divide weight of the big fish by weight of all the fish

data(nseaBathy)                                            # North Sea
data(nwestBathy)                                           # North west

library(fields)

image.plot(im.lfi$x,im.lfi$y,im.lfi$z,col=topo.colors(10),xlab='',ylab='')                     # plot the data on a map

contour(nseaBathy$x,nseaBathy$y,nseaBathy$z,add=T,level=c(20,40,60),col='black')         # add on depth contour
#contour(nwestBathy$x,nwestBathy$y,nwestBathy$z,add=T,level=c(100,200,500,1000,2000,4000),col='black') 

map("worldHires", add=TRUE, col='darkseagreen', fill=TRUE, bg="white",
regions=c('uk','ireland','france','germany','netherlands', 'norway','belgium',
'spain','luxembourg','denmark', 'sweden','iceland', 'portugal','italy','sicily','ussr','sardinia','albania','monaco','turkey','austria',
'switzerland','czechoslovakia','finland','libya', 'hungary','yugoslavia','poland','greece','romania','bulgaria', 'slovakia','morocco',
'tunisia','algeria','egypt' ))

Exercises


