R/01_data-processing-trydb.R

Defines functions `f`

######################################################################
#
# GRM -- TRY-DB data processing
#
#  Rob Smith, phytomosaic@gmail.com, 29 Mar 2020
#
##  CC-BY-SA 4.0 License (Creative Commons Attribution-ShareAlike 4.0)


### preamble
rm(list=ls())
# devtools::install_github('phytomosaic/ecole')
require(ecole)
require(data.table)
options(stringsAsFactors=F)
setwd('./data_raw/try_data/')

### function to read file
`f` <- function(x) {
        cat('File size (MB):',round(file.info(x)$size/1024^2,1),'--> ')
        cat('Time elapsed:',
            system.time({
                    out <- data.table::fread(x, header = T, sep = '\t',
                                             dec = '.', quote = '',
                                             fill = T, data.table = T)
            })[[3]], '\n')
        return(out)
}

# ### first, break up one ridiculously HUGE monster file (1.27 GB)
# cat('File size (MB):', round(file.info('./8540.txt')$size/1024^2),'\n')
# d <- f('./8540.txt') # ! ! ! TIMEWARN ! ! ! ~90 seconds
# nr  <- NROW(d) # n rows in this big file
# len <- 250000  # desired length of each smaller file
# nr / len       # how many smaller files to be produced
# i <- rep(1:ceiling(nr/len), ea=len)[1:nr] # create index
# # d[, indx := i]   # DONT add index to data.table!
# d[, fwrite(.SD, paste0('8540_',i,'.txt'),sep='\t'), by=i] # save small
# rm(list=ls())

### read ALL files in the directory ! ! ! TIMEWARN ! ! !
fnm <- list.files('.', pattern='.txt')   # all files in dir
fnm <- fnm[fnm != '_readme.txt']         # avoid the readme file
fnm <- fnm[fnm != '8540.txt']            # avoid the monster file
d   <- rbindlist(lapply(fnm, f)) ; rm(f) # TIMEWARN !
setnames(d, names(d), tolower(names(d))) # cleanup column names
rm(fnm, pth)                             # cleanup environment
d[,c('firstname','lastname','dataset',   # drop unneeded columns
     'accspeciesid','obsdataid','origlname','valuekindname',
     'origuncertaintystr','uncertaintyname','replicates',
     'reluncertaintypercent','reference','comment','v28') := NULL]
d <- d[order(accspeciesname, traitid)]   # sort for convenience

### clean species names
d[, speciesname    := ecole::clean_text(speciesname,lower=T)]
d[, accspeciesname := ecole::clean_text(accspeciesname,lower=T)]

### remove one blank row
d <- d[!c(traitname == '' & dataname == ''),]

### remove duplicates based on 'origobsdataid'
i <- which(duplicated(d,by='origobsdataid') & !is.na(d$origobsdataid))
length(i) / NROW(d) # 2% = propn of obsvns that are dupes
d <- d[-i,]  ;  rm(i)

# ### TODO: keep only if error risk < 5 SD from mean? (z-scores < 5)
# hist(d$errorrisk, breaks=44, col='grey', las=1, xaxs='i', yaxs='i')
# # d <- d[is.na(errorrisk) | errorrisk < 5, ]

### keep only species of interest
load('../../data/pnw_tree.rda', verbose=T)
(x <- sort(unique(pnw_tree$spp)))  ;  rm(pnw_tree)    # FIA species
y <- sort(unique(c(d$speciesname, d$accspeciesname))) # TRY species

### strict EXACT match of spp -- beware synonyms/orthovariants...
all(x %in% y)          # all 57 FIA spp are present in TRY
(keep <- y[y %in% x])  # TRY in FIA -- all 57 FIA species!
# y[y %notin% x] # TRY not in FIA (may incl synonyms/orthovariants!!)
d <- d[accspeciesname %in% keep] # strict EXACT match

### summary
cat('Object size (MB):', round(object.size(d)/1024^2,2))
names(d)
hist(table(d$observationid), breaks=33, col='grey',
     xlab='Measurements per entity', main='')
table(d$dataid == 59)         # how many latitude observations?
sort(table(d$speciesname))    # old/mixed species names
sort(table(d$accspeciesname)) # accepted species names
hist(table(d$accspeciesname), breaks=33, col='grey',
     xlab='Measurements per species', main='')
table(d$datasetid)
table(d$unitname)
table(d$traitname)
sort(table(d$traitname))
sort(table(d$dataname))

# ### TODO: make wide if needed:
# # 'observationid' identifies measurements on the same entity
# `f` <- function (data, ...) {
#         if (!is.data.frame(data))
#                 stop("must be dataframe")
#         if (ncol(data) != 3)
#                 stop("must have 3-column format")
#         x <- data[, 1]
#         y <- data[, 2]
#         z <- data[, 3]
#         fx <- as.factor(x)
#         fy <- as.factor(y)
#         m <- matrix(NA, nrow = nlevels(fx), ncol = nlevels(fy),
#                     dimnames = list(levels(fx), levels(fy)))
#         m[cbind(fx, fy)] <- z
#         list(x = sort(unique(x)), y = sort(unique(y)), z = m)
# }
# w <- f(d[,c('observationid','traitname','origvaluestr')])
# # rows = one observationid, cols = traits, elements = trait vals
# str(w)

### save
pnw_traits <- d
save(pnw_traits, file='../../data/pnw_traits.rda')

####    END    ####
phytomosaic/pnw documentation built on April 16, 2020, 7:29 p.m.