R/00_data-processing-fiadb.R

Defines functions `get_dist` `stem_coord` `f`

######################################################################
#
# GRM -- FIA data processing
#
#  Rob Smith, phytomosaic@gmail.com, 29 Mar 2020
#
##  CC-BY-SA 4.0 License (Creative Commons Attribution-ShareAlike 4.0)


### preamble
rm(list=ls())
# devtools::install_github('phytomosaic/ecole')
require(ecole)
require(data.table)
setwd('./data_raw/fia_data/')

### function to read FIADB file
`f` <- function(x, keepcols=NULL, ...) {
  cat('File size (MB):',round(file.info(x)$size/1024^2,1),'--> ')
  cat('Time elapsed:',
      system.time({
        out <- data.table::fread(x, header = T, # sep = ',',
                                 dec = '.', # quote = '',
                                 fill = T, data.table = T,
                                 select = keepcols,
                                 integer64='character', ...)
      })[[3]], '\n')
  return(out)
}

### read GRM files
d <- rbindlist(lapply(list.files('.',pattern='_GRM_COMPONENT.csv'), f,
                      keepcols=c('TRE_CN','PREV_TRE_CN','PLT_CN',
                                 'DIA_BEGIN','DIA_MIDPT','DIA_END',
                                 'ANN_DIA_GROWTH','STEM_COMPONENT')))
setnames(d, names(d), tolower(names(d))) # cleanup column names

### read TREE files
tr <- rbindlist(lapply(list.files('.', pattern='_TREE.csv'), f,
                       keepcols=c('CN','PLT_CN','INVYR','SPCD',
                                  'DIA','AZIMUTH','DIST','SUBP')))
setnames(tr, names(tr), tolower(names(tr))) # cleanup column names
setnames(tr, 'cn', 'tre_cn')                # rename
# assign subplot identifier
tr$subpid <-paste(formatC(tr$plt_cn,wid=2,format='d',flag='0'),
                  formatC(tr$subp,wid=2,format='d',flag='0'), sep='_')
# calculate stem coordinates, and identify seedlings and edge trees
`stem_coord` <- function(a, d, x0=0, y0=0) {
  data.frame(x = x0 + d * cos(a / 180 * pi),
             y = y0 + d * sin(a / 180 * pi))
}
tr <- cbind(tr, stem_coord(-tr$azimuth, tr$dist,0,0)) ; rm(stem_coord)
is_seed <- (tr$dia < 5
            & !is.na(tr$dia)
            & !is.na(tr$x)
            & !(abs(tr$dist) > 6.8))
tr$x[is_seed] <- tr$x[is_seed] + 12 # offset seedlings by 12 ft
tr$is_seed <- is_seed  ;  rm(is_seed)
`get_dist` <- function(x, y) { sqrt((x - 0)^2 + (y - 0)^2) }
tr[, dist      := get_dist(x,y)] # recalc dist AFTER offset seedlings
tr[, is_macro  := any(abs(dist) > 24), by=plt_cn] # 58.9-ft macroplot?
tr[, edge_dist := ifelse(is_macro, 58.9, 24.0)] # assign size
tr[, is_edge := (abs(edge_dist - dist) < 6)] # arbitrary 6-ft buffer
tr[, c('is_macro','edge_dist') := NULL] # squash unneeded columns

### read PLOT files
p <- rbindlist(lapply(list.files('.', pattern='_PLOT.csv')[-1], f,
                      keepcols=c('CN','STATECD','MEASYEAR',
                                 'LAT','LON','ELEV','ECOSUBCD',
                                 'REMPER')))
setnames(p, names(p), tolower(names(p))) # cleanup column names
setnames(p, 'cn', 'plt_cn')              # rename
p[, elev := round(elev * 0.3047851,0)] # ft to m
p$ecosubcd <- ecole::clean_text(p$ecosubcd) # trim leading whitespace

### read REF_SPECIES file
ref <- read.csv('./REF_SPECIES.csv', stringsAsFactors=F)
ref <- ref[,colnames(ref) %in% c('SPCD','COMMON_NAME','GENUS',
                                 'SPECIES', 'VARIETY','SUBSPECIES')]
names( ref ) <- tolower(names( ref ))     # names to lowercase
ref$spp <- paste0(ref$genus,'_',ref$species,'_',
                  ref$variety,'_',ref$subspecies)
ref$spp <- ecole::clean_text(ref$spp, lower=TRUE)
ref     <- as.data.table(ref[,c('spcd','spp')])

### reference table for ecological subsections (to split east-west)
e <- read.csv('./REF_ECOSUBSECTION.csv', stringsAsFactors=F)
names(e) <- tolower(names(e))
e <- as.data.table(e)
e[, province_name := NULL] # drop one column

# ### read CONFIDENTIAL file
# cf <- f('BASE_PLOT.csv', keepcols=c('CN','LAT_DIGITIZED',
#                                       'LON_DIGITIZED','ELEV'))
# setnames(cf, names(cf), tolower(names(cf))) # cleanup col names
# setnames(cf, c('cn','lat_digitized','lon_digitized'),
#          c('plt_cn','latt','lonn'))                 # rename
# cf[, elev := round(elev * 0.3047851,1)] # ft to m

### merging
d <-  tr[d, on = c(tre_cn = 'tre_cn')]   # get tree attributes at *y2*
d <-   p[d, on = c(plt_cn = 'plt_cn')]   # get plot info
# d <-  cf[d, on = c(plt_cn = 'plt_cn')] # get confidential plot info
d <- ref[d, on = c(spcd = 'spcd')]       # get species names
d <- e[d, on = c(ecosubcd = 'ecosubcd')] # merge on ecosubcd
d[, grep('^i\\.', names(d), value=T) := NULL] # omit dupe columns
setnames(d, 'measyear', 'measyr_2')      # rename
d$measyr_1 <- round(d$measyr_2 - d$remper,0)  # original measyear?
rm(tr, p, ref, f, e)

### keep only plots from northern CA, all OR, all WA
d <- d[lat > 38.72,]

### reclassify sides of Cascades, based on elevation (east-west/lo-hi)
d$ewh <- d$ew <- ifelse(d$assigned %in% c('east','eastmont'),
                        'east', 'west')
d$ewh[d$ew == 'west' & d$elev > 1000] <- 'westhi' # 1100 m ~3300 ft
d$ewh[d$ew == 'east' & d$elev > 1500] <- 'easthi' # 1500 m ~4900 ft

### remove some invalid 'species'
d <- d[!c(spp %in% c('tree_broadleaf','tree_evergreen'))]

### remove any infrequent species (< 49 individuals)
rev(sort(table(d$spp)))
d <- d[, frq := .N, by = spp][(frq >= 49)] # remove infrequent spp
d[, frq := NULL ]  # omit frequency column

### change a few names to be compatible with TRY-DB
d$spp[d$spp == 'chrysolepis_chrysophylla_chrysophylla'] <-
  'chrysolepis_chrysophylla'
d$spp[d$spp == 'populus_balsamifera_trichocarpa'] <-
  'populus_trichocarpa'
d$spp[d$spp == 'abies_shastensis'] <-
  'abies_magnifica_var_shastensis'

### temporarily fill NA=0 for new recruits (for next step)
d$dia_begin[is.na(d$dia_begin) & d$stem_component == 'INGROWTH'] <- 0

### *annualize* Y1/Y2 diameters using 'ann_dia_growth'
d$dia_y1   <- d$dia_begin
d$dia_y2   <- d$dia_begin + d$ann_dia_growth

### remove invalid/removed/logged trees
table(d$stem_component)
is_invalid <- d$stem_component %in% c('REMEASURED DEAD','NOT USED',
                                      'UNKNOWN')
was_harvested  <- d$stem_component %in% c('CUT DEAD', 'CUT1', 'CUT2')
sum(c(is_invalid | was_harvested)) / NROW(d) # nearly 20% are invalid
d <- d[!c(is_invalid | was_harvested),]      # CAUTION!

### assign tree fate, recode by recruit/mortality
is_newrecruit  <- d$stem_component == 'INGROWTH'
is_naturaldead <- d$stem_component %in% c('MORTALITY1','MORTALITY2')
is_survivor    <- d$stem_component == 'SURVIVOR'
d$dia_y1[is_newrecruit]  <- NA # if new recruit then expect no Y1
# if dead then expect no Y2 !CAUTION! dead trees still have 'size':
d$dia_y2[is_naturaldead] <- NA
# ### size-distribution of survivor/recruit/mortality classes
# hist(d$ann_dia_growth[is_survivor],    bre=44, col='#00000050',
#      main='', xlab='Annual diameter growth')
# hist(d$ann_dia_growth[is_naturaldead], bre=44, col='#FF000050', add=T)
# hist(d$ann_dia_growth[is_newrecruit],  bre=44, col='#00000050', add=T)
rm(is_invalid,is_naturaldead,is_newrecruit,is_survivor,was_harvested)

### CAUTION: remove if both Y1/Y2 == NA (and therefore untrackable)
sum(is.na(d$dia_y2) & is.na(d$dia_y1)) / NROW(d) # ~16% invalid
d <- d[!(is.na(d$dia_y2) & is.na(d$dia_y1)),]

### tally recruits/mortality/survivors
### identify survivors (if has measurement at both events)
d$surv <- ifelse(c(!is.na(d$dia_y2) & !is.na(d$dia_y1)),1,0)
sum(d$surv)                             # tally survival
sum(is.na(d$dia_y2))                    # tally mortality
sum(is.na(d$dia_y1) & !is.na(d$dia_y2)) # tally new recruits

# remove *negative* diameter growth (only a few hundred cases)
isneg <- c(d$dia_y1 > d$dia_y2)
isneg <- isneg & !is.na(isneg)
sum(isneg) / NROW(d) # 0.2% of cases
d <- d[!isneg,]
rm(isneg)

### COARSE way to get climate info:
###   - fuzzed locations
###   - data from ClimateNA
# TODO: get climate values for *exact* remeasurement period
# TODO: get *actual* plot locations
a <- d[!duplicated(d$plt_cn),c('plt_cn','lat','lon','elev','measyr_1')]
dim(a) # 13716 unique plots
# ### TODO! slice climate values for each measurement period
# ### --> --> --> go to ClimateNA --> --> --> --> --> -->
# prep_climatena(ID1=a$plt_cn, lat=a$lat, long=a$lon, el=a$elev,
#                file = 'C:/Users/Rob/Desktop/to_climatena.csv')
# ### --> --> --> return from ClimateNA --> --> --> --> --> -->
# file.copy(from = 'C:/Users/Rob/Desktop/from_climatena.csv',
#           to = './from_climatena.csv',
#           overwrite = TRUE)   # copy for posterity
# ### --> --> --> return from ClimateNA --> --> --> --> --> -->
aa <- read.csv('./from_climatena.csv')
aa[aa==(-9999)] <- NA   # replace NA value with NA
all(unique(a$plt_cn) %in% unique(as.character(aa$ID1)))
a <- cbind(a, aa[match(a$plt_cn, as.character(aa$ID1)),])
names(a) <- tolower(names(a))
j <- names(a)[-c(grep(paste( # trim some not-useful columns
  c('dd_0','dd5','dd_18','dd18','bffp','effp','mar','rad',
    'id1','id2','elevation','latitude','longitude'),
  collapse='|'), names(a), value=F))]
a <- a[ , ..j]  ;  rm(j)
d <- cbind(d,a[match(d$plt_cn, a$plt_cn),-c(1:5)])# match clim data
rm(aa,a)
d

# ### CAUTION: remove if coordinates (therefore climate) not avail?
# isna <- is.na(d$lat)
# sum(isna) / length(d$lat)
# # d <- d[!isna,]  ;  rm(isna)

### squash unneeded columns
d[,c('assigned','province','section','section_name','dia','dia_begin',
     'dia_midpt','dia_end','ann_dia_growth','stem_component',
     'azimuth','invyr', 'prev_tre_cn' ) := NULL]

### sort remaining columns
names(d)
j <- c('plt_cn','tre_cn','statecd','lat','lon','elev','subp','subpid',
       'ecosubcd','eco_subsection_name','ew','ewh',
       'spcd','spp','azimuth','dist','x','y','is_seed','is_edge',
       'remper','measyr_1','measyr_2','dia_y1','dia_y2','surv',
       'tmax_wt', 'tmax_sp', 'tmax_sm', 'tmax_at', 'tmin_wt',
       'tmin_sp', 'tmin_sm', 'tmin_at', 'tave_wt', 'tave_sp',
       'tave_sm', 'tave_at', 'ppt_wt', 'ppt_sp',  'ppt_sm',  'ppt_at',
       'nffd_wt', 'nffd_sp', 'nffd_sm', 'nffd_at', 'pas_wt','pas_sp',
       'pas_sm',  'pas_at', 'eref_wt','eref_sp','eref_sm','eref_at',
       'cmd_wt',  'cmd_sp', 'cmd_sm', 'cmd_at',
       'rh_wt', 'rh_sp', 'rh_sm', 'rh_at')
d <- d[ , ..j]

### climate PCA
j <- grep('tmax|tmin|tave|ppt|nffd|pas|eref|cmd|rh', names(d))
j <- j[j!=62]
anyNA(d[,..j]) # expect FALSE
(pc <- prcomp(d[,..j], retx=T, center=T, scale.=T, rank.=2))
summary(pc)    # first 2 explain 82.1% variance = 50.6 + 31.4
V <- t(apply(pc$rotation,1,function(a,b){a*b},pc$sdev))[,1:2] # corr
V[,1]    <-  (-1) * V[,1]     # reverse sign of PC1
pc$x[,1] <-  (-1) * pc$x[,1]  # reverse sign of PC1
# write.csv(V, file='./V_climate_pca.csv')
colnames(pc$x) <- tolower(colnames(pc$x))
d <- cbind(d, pc$x[,1:2], stringsAsFactors=F)
### plot the PCs
`h` <- function (x){
  r <- c("#5E4FA2", "#4F61AA",
         "#4173B3", "#3386BC", "#4198B6", "#51ABAE",
         "#62BEA6", "#77C8A4", "#8ED1A4", "#A4DAA4",
         "#B8E2A1", "#CBEA9D", "#DEF199", "#EAF69F",
         "#F2FAAC", "#FAFDB8", "#FEFAB6", "#FEF0A5",
         "#FEE695", "#FDD985", "#FDC978", "#FDB96A",
         "#FCA75E", "#F99254", "#F67D4A", "#F26943",
         "#E85A47", "#DE4B4B", "#D33C4E", "#C1284A",
         "#AF1446", "#9E0142")
  image(1:ncol(x), 1:nrow(x), t(x), col=r, axes=F, xlab='', ylab='')
  axis(3, at=1:ncol(x),lab=c('PC1','PC2'),las=1,tick=F,cex.axis=0.6)
  axis(2, at=1:nrow(x), lab=rownames(x), las=1, cex.axis=0.6)
}
png('./../../fig/fig_01_climate_PCA.png',9,3,'in',bg='transparent',
    res=1000)
set_par(3) ; par(font=2, tcl=-0.2, mgp=c(1.6,0.4,0))
h(V)  # loadings
abline(h=seq(1,NROW(V)+4,by=4)-0.5) ; box()
i <- !duplicated(d$plt_cn) # one point per site
plo(d$lon[i],d$lat[i],cex=0.5,col=colvec(d$pc1[i]),asp=1.6,ylab='',
    xlab='')
add_text(0.03,0.95,'PC1 (Thermal)', cex=1.2)
plo(d$lon[i],d$lat[i],cex=0.5,col=colvec(d$pc2[i]),asp=1.6,ylab='',
    xlab='')
add_text(0.03,0.95,'PC2 (Aridity)', cex=1.2)
dev.off()
rm(pc, V)
# PC1 (50.6% varexpl) = thermal: less montane, hotter temps, little
#                                   precip as snow, greater evaporation
# PC2 (31.4% varexpl) = aridity: larger moisture deficit, lower RH,
#                                   less precip

### save
pnw_tree <- d
save(pnw_tree, file='../../data/pnw_tree.rda')

####    END    ####
phytomosaic/pnw documentation built on April 16, 2020, 7:29 p.m.