## /* segments_clean.R ---
## Filename: segments_clean.R
## Description: Cleaning contour seedling data, seesapmas11
## Author: Noah Peart
## Created: Tue Feb 2 17:07:06 2016 (-0500)
## Last-Updated: Sat Mar 12 00:19:51 2016 (-0500)
## By: Noah Peart
## */
## /* yaml */
##' ---
##' title: "Cleaning seesapmas11"
##' author: "Noah Peart"
##' date: "`r Sys.Date()`"
##' output_format:
##' html_document:
##' theme: journal
##' highlight: zenburn
##' toc: true
##' toc_float:
##' smooth_scroll: true
##' ---
##'
##+setup, include=FALSE, echo=FALSE
## /* knitr setup */
library(knitr)
opts_chunk$set(fig.path='figures/', cache=FALSE, echo=TRUE, message=FALSE) # echo=FALSE
## /* end setup */
##' The goal is to separate out extension growth files, and convert
##' the resulting files to long format, excluding uneccesary columns as well as
##' checking the data for some details. Note that there is a fair amount of extra
##' code to make summary figures/checking that could be removed.
##'
##' __Notes__:
##'
##' + More documentation in _Segment Data History 1988-2012.docx_
##' + `TAG` should be character to avoid floating point issues
##' + unique `CONTNAM`/`STPACE` designates a plot
##' + 1988/1989: 5 PIRU/ABBA measured in each plot and tagged 1:5
##' + Locations of plants defined by `ALONG`/`DISUPDN` (distance along/distance up/down)
##'
##' __Things to clean__:
##'
##' + Set empty strings to NA
##' + Duplicated tags from 1989 for seedlings and saplings
##' + create unique IDs taking into account the duplicates
##' + Should be unique IDs taking into account `CONTNAM`/`STPACE`/`SPEC`
##' + A few rows missing `SPEC`
##' + A few rows missing `ALONG` (a few overlap with those missing `SPEC`)
##' + All heights in cm (check from docs)
##' + Separate extension growth into a new dataset:
##' * Extenstion growths:
##' * in mm or cm (check)
##'
##' __Seedling notes__:
##'
##' + defined under 1m tall (arbitrary here, for analyses)
##' + Only seedlings have substrate measurements
##'
##' __Substrate__:
##'
##' These procedures are very convoluted and year specific.
##' For the current analysis:
##'
##' + Only used 1999 data (the most extensive type system)
##' + All `ON` variables were ignored
##' + Broad types: moss, mixed litter, coniferous litter, deciduous litter
##'
##+load-data
library(data.table)
library(stringi)
library(DT)
library(seedsub.mas) # storing master data in this package
source('utils.R')
dtopts <- list(scrollX=TRUE)
cseed <- copy(seesapmas11)
## Plot keys:
ckeys <- c('CONTNAM', 'STPACE')
cseed[, TAG := as.character(TAG)] # TAG as char
## empty strings to NA
chars <- names(cseed)[vapply(cseed, is.character, logical(1))]
cseed[, chars := lapply(.SD, function(x) { x[x==''] <- NA_character_; x}),
.SDcols=chars, with=FALSE]
## Some variables of interest
hts <- grep('^HT[0-9]+', names(cseed), value=TRUE) # height columns
ids <- c('TAG', 'SPEC') # individual identifies
locs <- c('ALONG', 'DISUPDN') # location variables
cols <- c(ckeys, ids, 'YRTAG', locs, hts) # general cols for tables
## /* end load-data */
##'
##' ------------
##'
##' ## Heights
##' Check heights:
##' - types
##' - NAs
##' - missing rows
##+heights1
types <- cseed[, lapply(.SD, typeof), .SDcols=hts]
nans <- cseed[, lapply(.SD, function(x) sum(is.na(x))), .SDcols = hts] # NAs
tab <- rbind(types, nans)[, TYPE := c('Type', 'NA count')][]
setcolorder(tab, c('TYPE', names(types)))
knitr::kable(tab, caption='Height column types and counts.')
## Rows where there was no height measurement?
nohts <- cseed[, Reduce('&', lapply(.SD, is.na)), .SDcols=hts]
nohts <- which(nohts) # 11 with no height measurements
## /* end heights check */
##'
##' Rows with missing heights (11):
{{prettify(nohts, type='all')}}
##'
##' ----------------------
##'
##' ## Duplicates
##'
##' Check unique tags `CONTNAM`/`STPACE`/`SPEC`
##' There are duplicates when not excluding 1988 and 1989
##' Notes:
##' - duplicates in 1989 were b/c 3 seedlings and 3 saplings
##' of both ABBA and PIRU were sampled in each segment (actually 4 in a couple).
##' These seedlings and saplings were both labelled with tags 1:3 (or 4).
##' However, they are readily separated since only seedlings have substrate measurements
##' and in general the heights will be vastly different (unless a sapling was
##' totally leaned over).
##+dupe-setup
dupes <- cseed[, .(TAG=TAG[duplicated(TAG)]), keyby=c(ckeys, 'SPEC')]
dupes2 <- cseed[!(YRTAG %in% c(1988, 1989)), TAG[duplicated(TAG)],
by=c(ckeys, 'SPEC')] # no dupes when excluding 1989
## Examine these duplicates: 200 of them (100 repeated once each) in 1989
## All have heights, each tag seems to have a height on two scales
dupe1 <- cseed[dupes, on=c(ckeys, 'SPEC', 'TAG')][,
NUM_DUPES := .N, keyby=c(ckeys, 'SPEC', 'TAG')][]
dupeyrs <- table(dupe1$YRTAG) # 200 in 1989
## Table showing these dupes only had heights measured in 1989
dupehts <- dupe1[, lapply(.SD, function(x) sum(!is.na(x))), .SDcols=hts]
## /* end dupe-setup */
##' ### Visualize the duplicates in table
##' Every plot measured in 1989 (25 of them) followed same procedure
##' of id'ing the saplings and seedlings with same tag numbers.
##+dupe-table
dcols <- c(ckeys, 'SPEC', 'TAG', 'HT89', 'NUM_DUPES') # columns to include
## datatable(dupe1[, c(dcols, 'CLASS'), with=FALSE], options=dtopts)
dupe1dt <- datatable(dupe1[, dcols, with=FALSE], options=dtopts) # indiv. dupes
dupe1dt
## /* end dupe-table */
##'
##' Numbers of duplicated tags by `CONTNAM`/`STPACE` (ie. plot).
##+dupe2-table
cdupes <- dupe1[, .(TOTAL_DUPES=.N), by=ckeys]
dupe2dt <- datatable(cdupes, options=dtopts) # number of dupes/plot
dupe2dt
## /* end dupe2-table */
##'
##+unique-ids
## Assign unique IDs, since duplicate TAGs were actually different
## plants in 1989 this is no problem
cseed[, ID := 1:.N]
## /* end unique-ids */
##'
##' ----------------------
##'
##' ## Extension Growth
##'
##' Pull out extension growth data:
##' - key by `ID`
##'
##+extension
## matching patterns
patts <- c(
'^EX[0-9]+', # extension growth
'^PEX[0-9]+', # partial extension (in the yr it was sampled)
'^MINAGE[0-9]+', # minimum age
'^ENOTE[0-9]+', # notes on ext. growths
'^EXCNT[0-9]+', # number of ext. measured
'^AGE[0-9]+', # Total age for 1988 seedlings
'^D[0-9]RM[0-9]+', # distance remaining after last countable extension
'^EXSUM[0-9]+', # summed extension growth
'YRREX', # last measureable year from ext. growths
'^NL[0-9]+_[0-9]', # Derived from ENOTE
'^TERM[0-9]+' # Terminal leader condition
)
patt <- paste(patts, collapse='|')
ecols <- grep(patt, names(cseed), value=TRUE)
ecols <- c('ID', ecols)
## Extract columns into ext. growth dataset
cextgr <- cseed[, ecols, with=FALSE]
## Save raw ext. growth for later: needs to be melted
## if (!file.exists('temp')) dir.create('temp')
## save(cextgr, file='temp/cextgr.rda', compress='bzip2')
## /* end extension */
##'
##' The following columns will be removed from `seesapmas11` and used in the new
##' extension growth dataset.
##'
##' **Extension growth columns**:
{{prettify(ecols)}}
##'
##'
##' For actually, cleaning of the ext. growth data see [cextgr_clean.R](cextgr_clean.R).
##'
##' --------------------------------
##'
##' ## Remove columns
##' - Remove ext. growth columns (save the `ID`)
##' - `SEG88` is redundant, `YRTAG == 88` is the same information.
##' - `CLASS` is all 'S' (segment) so removing.
##' - `TREE` is just derived from height (in year of tagging) to determine sapling/seedling
##' - Removing unknown columns: `CODE98`, `FLAG`
##'
##+remove-cols
cseed[, setdiff(ecols, 'ID') := NULL, with=FALSE]
others <- c('SEG88', 'CLASS', 'CODE98', 'FLAG', 'TREE')
cseed[, others := NULL, with=FALSE]
## /* end remove-cols */
##'
##' ## Soil columns
##' Keeping these in for now...
##+soil-cols
soilcols <- c(
'SOILCL[0-9]?', # soil classes, two soilcl columns...
'HISTO',
'SPODO',
'INCEP',
'MINER',
'ROCK',
'PHISTO'
)
scols <- grep(paste(soilcols, collapse='|'), names(cseed), value=TRUE)
## /* end soil-cols */
##' Columns:
{{prettify(scols)}}
##'
##' ----------------------------------
##'
##' ## Separate joined variable years
##' - `NOTES9899`
##' - `SGLEN8889`, `SGLEN9899`
##' - `SGDSP9899`
##'
##' These should be separated into individual year columns prior to melting
##+sglen-sep
sampyrs <- cseed[, lapply(.SD, function(x) sum(!is.na(x))), .SDcols=hts, by=ckeys]
sglens <- grep('SGLEN', names(cseed), value=TRUE)
cseed[, paste0('SGLEN', c('88', '89')) := SGLEN8889]
cseed[, paste0('SGLEN', c('98', '99')) := SGLEN9899]
cseed[, sglens := NULL, with=FALSE] # remove old ones
cseed[, paste0('NOTES', c('98', '99')) := NOTES9899]
cseed[, NOTES9899 := NULL]
cseed[, paste0('SGDSP', c('98', '99')) := SGDSP9899]
cseed[, SGDSP9899 := NULL]
## /* end sglen-sep */
##'
##' --------------------------------
##'
##' ## Reshaping
##'
##' Gonna do a bunch of melting separately and join back together
##' since the years are different for all the variables and we want a single
##' resulting `YEAR` column. Also, adding a unique plot identifier columns, `PID`,
##' for convenience, and to link the plot-level variables to individual-level variables
##' when the data is separated by this distinction.
##'
##+melt-vars
ids <- c('ID') # melt with these ids
jids <- c(ids, 'YEAR') # join back with these ids
## Constants
consts <- c(ids, 'CONTNAM', 'STPACE', 'SPEC', 'ALONG', 'DISUPDN', 'YRTAG',
'TAG', 'YRMORT', 'ASPCL', 'ELEVCL', 'HRB1', 'HRB2', 'HRB3', 'PALONG',
'SDSP1', 'DSDSP1', 'NSDSP1', 'SDSP2', 'DSDSP2', 'NSDSP2', 'CHECK',
scols)
## varying
sglens <- grep('SGLEN', names(cseed), value=TRUE)
hts <- grep('^HT[0-9]+', names(cseed), value=TRUE)
stats <- grep('STAT[0-9]+', names(cseed), value=TRUE)
notes <- grep('NOTES?[0-9]+', names(cseed), value=TRUE)
decms <- grep('DECM[0-9]+', names(cseed), value=TRUE)
subs <- grep('^SUB[0-9]+', names(cseed), value=TRUE)
subons <- grep('^SUBON[0-9]+', names(cseed), value=TRUE)
sgdsp <- grep('SGDSP[0-9]+', names(cseed), value=TRUE)
stmlens <- grep('STMLEN[0-9]', names(cseed), value=TRUE) # only 2011
tcvrs <- grep('TCVR[0-9]+', names(cseed), value=TRUE) # only 1988
nfcvrs <- grep('NFCVR[0-9]+', names(cseed), value=TRUE) # only 1988
brhts <- grep('BRHT[0-9]+', names(cseed), value=TRUE) # only 1999
ciis <- grep('CII[0-9]+', names(cseed), value=TRUE) # only 2000
## All year-varying variables
allvary <- c(sglens, hts, stats, notes, decms, subs, subons,
stmlens, sgdsp, tcvrs, nfcvrs, brhts, ciis)
## Check got them all
inds <- names(cseed) %in% c(consts, allvary)
checkcols <- !length(names(cseed)[!inds])
if (!checkcols) stop(sprintf('Missed some columns: %s',
toString(names(cseed)[!inds])))
## /* end melt-vars */
##'
##' By __year__ the following variables are considered:
##'
##' - __Constant__:
{{prettify(consts)}}
##' - __Varying__:
{{prettify(allvary)}}
##'
##' The main constraint on the output is that there should be no NA values
##' for the `HT` column. **Actually** this isn't true, because of the 2000 measurements
##' which were only on status and didn't involve heights at all. Is there a better way
##' to handle this -- it leads to a bloating of the data.
##'
##+melting
## Melt HT first, then join each additional to it
res <- melt(cseed[, c(ids, hts), with=FALSE],
id.vars=ids, na.rm=TRUE, value.name='HT', measure.vars=hts,
variable.name='YEAR')
res[, YEAR := stri_extract(YEAR, regex='[0-9]+')]
## All year-varying variables
allvary <- list(sglens, stats, notes, decms, subs, subons,
stmlens, sgdsp, tcvrs, nfcvrs, brhts, ciis)
## For each element, melt by those variables, clean the year column,
## and join back to `ht`
for (v in allvary) {
coln <- stri_extract(v[[1]], regex='^[A-Z]+', mode='first')
tmp <- melt(cseed[, c(ids, v), with=FALSE],
id.vars=ids, na.rm=TRUE, value.name=coln, measure.vars=v,
variable.name='YEAR')
tmp[, YEAR := stri_extract(YEAR, regex='[0-9]+')]
res <- if (all(unique(tmp$YEAR) %in% unique(res$YEAR))) {
tmp[res, on=c(ids, 'YEAR')] # simple join on ID and YEAR
} else {
merge(res, tmp, by=c(ids, 'YEAR'), all.y=TRUE)
}
}
## Check counts
ss <- sum(seesapmas11[, Reduce("sum", lapply(.SD, function(x) !is.na(x))),
.SDcols=hts])
stopifnot(ss == sum(!is.na(res$HT)))
ss <- sum(seesapmas11[, Reduce("sum", lapply(.SD, function(x) x != '')),
.SDcols=stats])
stopifnot(sum(!is.na(res$STAT)) == ss)
ss <- sum(seesapmas11[, Reduce("sum", lapply(.SD, function(x) !is.na(x))),
.SDcols=tcvrs])
stopifnot(sum(!is.na(res$TCVR)) == ss)
ss <- sum(seesapmas11[, Reduce("sum", lapply(.SD, function(x) x != '')),
.SDcols=subs])
stopifnot(sum(!is.na(res$SUB)) == ss)
ss <- sum(seesapmas11[, Reduce("sum", lapply(.SD, function(x) x != '')),
.SDcols=subons])
stopifnot(sum(!is.na(res$SUBON)) == ss)
## Add constants
res <- cseed[, consts, with=FALSE][res, on=ids]
## Check for 88
## res[, length(unique(ID)), by=c("CONTNAM", "STPACE", "YEAR")]
counts <- seesapmas11[!is.na(HT88), .N, by=c("CONTNAM", "STPACE")][,N]
test <- res[!is.na(HT) & YEAR==88, .N, by=c("CONTNAM", "STPACE")][,N]
stopifnot(all(counts == test))
## Add a unique plot identifier
res[,PID := .GRP, by=c('CONTNAM', 'STPACE')]
## Put some of the columns up front, soil cols at the back
coln <- c( # these will go up front for visual convenience
'PID', 'ID', 'CONTNAM', 'STPACE', 'SPEC', 'HT', 'SUB', 'SUBON', 'YEAR',
'YRTAG', 'ALONG', 'DISUPDN', 'YRMORT', 'STAT', 'DECM', 'ELEVCL',
'ASPCL', 'TAG'
)
midcols <- sort(setdiff(names(res), c(coln, scols)))
ord <- c(coln, midcols, sort(scols)) # soil columns in the back
setcolorder(res, ord)
## Convert year column to full years (not abbreviations) and to integers
res[, YEAR := paste0(ifelse(YEAR %in% c('00', '11'), '20', '19'), YEAR)]
res[, YEAR := as.integer(YEAR)]
resDims <- dim(res)
## /* end melting */
##'
##+type-conversions
intInds <- unlist(lapply(res, function(x) {
if (is.character(x)) return(FALSE)
all(x == as.integer(x), na.rm=TRUE)
}))
## Convert to integers
intCols <- names(res)[intInds]
res[, intCols := lapply(.SD, as.integer),
.SDcols=intCols, with=FALSE]
## /* end type-conversions */
##'
##' ## Type Conversions
##' Many of the columns are being stored as doubles, but are just integer values,
##' so here some of these will be converted.
##' The following are converted to integers:
##'
##' + Columns:
{{prettify(names(intInds))}}
##'
##+results
dims <- dim(res)
restab <- datatable(head(res, 200), options=dtopts)
## /* end results */
##' ----------------------
##'
##' ## Results
##'
##' The resulting dataset has
{{dims[[1]]}}
##' rows and
{{dims[[2]]}}
##' columns:
##'
##' + Column names:
{{prettify(names(res))}}
##'
##' See `cseed_summary.R` for some summaries of the data
##'
##'
##' ## Save
##'
##' Saving the whole dataset as **segments**. Also, saving two data tables where
##' the columns that are constant across plots are separated from those that vary,
##' `PID` relating the two. These will be called **segplants** (individual-level) and
##' **segplots** (plot-level).
##+save
## All segment data bundled
segdata <- res
## Plot-level variables to separate out
consts <- c('PID', 'CONTNAM', 'STPACE', 'ELEVCL', 'ASPCL', scols)
segplots <- unique(segdata[, consts, with=FALSE])
segplants <- segdata[, c('PID', setdiff(names(segdata), consts)), with=FALSE]
## If saving
## save(segdata, file='../temp/segdata.rda', compress='bzip2')
## save(segplots, file='../temp/segplots.rda', compress='bzip2')
## save(segplants, file='../temp/segplants.rda', compress='bzip2')
## /* end save */
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.