R/data.R

Defines functions capFirst parseMetadata translate sift frameRasters euclid convertRaster clearTempFiles updateEcoclim logit2prop pairsData

Documented in capFirst clearTempFiles convertRaster euclid frameRasters logit2prop pairsData parseMetadata sift translate updateEcoclim

####################################

#' Capitalize first letter of a string.
#'
#' Capitalizes the first letter of a string.
#' @param x A character vector.
capFirst <- function(x){
      first <- substr(x, 1, 1)
      rest <- substr(x, 2, nchar(x))
      paste0(toupper(first), rest)
}


######################################

#' Parse factors found in filenames or strings.
#'
#' This function parses a vector of strings, returning a data frame of factors
#' found in those strings. It takes a folder path (or alternatively a character
#' vector) as input, and it returns a data frame with a row for each file in the
#' folder (or for each item in the vector) and columns containing matched
#' factors.
#'
#' @param paths A character vector of complete file directory paths.
#' @param variables Optional character vector selecting the subset of default
#'   keys titles to employ.
#' @param keys A list of named character vectors, representing variable title
#'   and factors.
#' @param is.dir Logical indicating whether path is a file directory, or just a
#'   vector of character strings to parse.
#' @param recursive Logical indicating whether subdirectories should be
#'   searched.
#' @param drops A character vector of path patterns that will cause that path to
#'   be dropped from results.
#' @param skips A character vector of path patterns to ignore during matching,
#'   typically used if folder names contain factor patterns.
#' @param pattern A character indicating a seach pattern to match in file paths.
#'   All files not containing this path will be ignored.
#' @param simplify Logical indicating whether columns with all NA values should
#'   be removed from result.
#'
#' @return Data frame with a column for the parsed strings and for each of the
#'   factors.
#' @aliases prs

parseMetadata <- function(paths, is.dir=T, variables=NULL, keys=NULL, recursive=T, drops=NULL, skips=NULL, pattern=NULL, simplify=T){

      # default keys
      key <- list(
            variable=c("tmin", "tmax", "tmean", "ppt", "tdmean", "vpr", "dpd", "pfog",
                       "Tmin", "Tmax", "Tave", "PPT", "cmd", "CMD", "NFFD", "nffd", "FFP", "ffp",
                       "bio1", "bio2", "bio3", "bio4", "bio5", "bio6", "bio7", "bio8", "bio9", "bio10", "bio11", "bio12", "bio13", "bio14", "bio15", "bio16", "bio17", "bio18", "bio19"),
            month=c("01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12", "14"),
            season=c("sp", "sm", "fl", "at", "wt"),
            stat=c("delta", "anomaly", "percentile", "zscore",
                   "mk_sl", "mk_D", "mk_S", "mk_tau", "mk_varS", "ts_slope", "ts_intercept", "ts_mResidual", "ts_change"),
            scenario=c("B1", "A2", "A1B"),
            model=c("bccr_bcm20", "cccma_cgcm3", "csiro_mk30", "gfdl_cm2", "gfdl_cm21", "giss_eh", "ipsl_cm4", "miroc32_Hires", "miroc32_medres", "mpi_echam5", "mri_cgcm232a", "ncar_ccsm30", "ukmo_hadcm3", "UKMO_HadGEM1"),
            run=c("run1", "run2", "run3", "run4", "run5"),
            year=1800:2200
      )

      # restrict keys to the requested subset, and add extras supplied manually
      if(!is.null(variables)) key <- key[variables]
      if(!is.null(keys)) key <- c(key, keys)

      # get names if needed, and format as data frame
      if(is.dir==T) paths <- list.files(paths, full.names=T, recursive=recursive)
      d <- data.frame(path=paths, stringsAsFactors=F)
      d$info <- d$path

      # standardize punctuation
      for(separator in c("/", "\\.", " ")) d$info <- gsub(separator, "_", d$info)
      d$info <- paste0("_", d$info, "_")

      # mask patterns that need to be ignored
      if(!is.null(skips)) for(skip in skips) d$info <- gsub(skip, "___", d$info)

      # filter to match patterns
      if(!is.null(pattern)) d <- d[grepl(pattern, d$path),]

      # serach and match key set
      for(k in names(key)){
            for(v in key[[k]]){
                  d[grepl(paste0("_", v, "_"), d$info), k] <- v
            }
      }

      # drops results as needed
      if(!is.null(drops)) for(drop in drops) d <- d[grepl(drop, d$path)==F,]
      if(simplify==T) d <- d[,colSums(is.na(d))<nrow(d)] # drop columns with all NAs
      d <- d[,!names(d)=="info"] # drop info column

      d

}




########################################

#' Translate common climate data abbreviations.
#'
#' This is a simple translation dictionary that can be used to look up things
#' like names and units, for climate variables, statistics, and months.
#'
#' @param key A character vector.
#' @param to A character vector, must be either "words", "units", "delta",
#'   "abbv", or "letter". These are not all available for every type of key.
#' @return A character vector of translated results.

translate <- function(key, to="words"){
      dict <- list(tmin=list(words="minimum temperature", units="(deg C)", delta="(deg C)"),
                   tmax=list(words="maximum temperature", units="(deg C)", delta="(deg C)"),
                   tmean=list(words="mean temperature", units="(deg C)", delta="(deg C)"),
                   tdmean=list(words="dew point", units="(deg C)", delta="(deg C)"),
                   ppt=list(words="precipitation", units="(mm)", delta="(mm)"),
                   vpr=list(words="vapor pressure", units="(Pa)", delta="(Pa)"),
                   cmd=list(words="climatic moisture deficit", units="(mm)", delta="(ratio)"),
                   nffd=list(words="number of frost-free days", units="(days)", delta="(days)"),
                   ffp=list(words="frost-free period", units="(days)", delta="(days)"),
                   dpd=list(words="dew point depression", units="(deg C)", delta="(deg C)"),
                   pfog=list(words="fog frequency", units="(proportion)", delta="(absolute proportion)"),
                   pptr=list(words="precipitation", units="(proportion)", delta="(proportion)"),

                   bio1=list(words="mean annual temperature", units="(deg C)", delta="(deg C)"),
                   bio2=list(words="mean diurnal temperature range", units="(deg C)", delta="(deg C)"),
                   bio3=list(words="isothermality", units="(ratio)", delta="(ratio)"),
                   bio4=list(words="temperature seasonality", units="(sd*100)", delta="(sd*100)"),
                   bio5=list(words="maximum temperature of warmest month", units="(deg C)", delta="(deg C)"),
                   bio6=list(words="minimum temperature of coldest month", units="(deg C)", delta="(deg C)"),
                   bio7=list(words="annual temperature range", units="(deg C)", delta="(deg C)"),
                   bio8=list(words="mean temperature of wettest quarter", units="(deg C)", delta="(deg C)"),
                   bio9=list(words="mean temperature of driest quarter", units="(deg C)", delta="(deg C)"),
                   bio10=list(words="mean temperature of warmest quarter", units="(deg C)", delta="(deg C)"),
                   bio11=list(words="mean temperature of coldest quarter", units="(deg C)", delta="(deg C)"),
                   bio12=list(words="total annual precipitation", units="(mm)", delta="(mm)"),
                   bio13=list(words="precipitation of wettest month", units="(mm)", delta="(mm)"),
                   bio14=list(words="precipitation of driest month", units="(mm)", delta="(mm)"),
                   bio15=list(words="precipitation seasonality", units="(cv)", delta="(cv)"),
                   bio16=list(words="precipitation of wettest quarter", units="(mm)", delta="(mm)"),
                   bio17=list(words="precipitation of driest quarter", units="(mm)", delta="(mm)"),
                   bio18=list(words="precipitation of warmest quarter", units="(mm)", delta="(mm)"),
                   bio19=list(words="precipitation of coldest quarter", units="(mm)", delta="(mm)"),

                   mk_sl=list(words="Mann-Kendall p-value", units="MK p-value"),
                   ts_change=list(words="Theil-Sen change magnitude", units="_variable_units_"),
                   delta=list(words="Delta", units="_variable_delta_"),
                   zscore=list(words="Standard (z) score", units="standard\ndeviations"),
                   percentile=list(words="Anomaly percentile", units="percentile"),

                   "01"=list(words="January", abbv="Jan", letter="J"),
                   "02"=list(words="February", abbv="Feb", letter="F"),
                   "03"=list(words="March", abbv="Mar", letter="M"),
                   "04"=list(words="April", abbv="Apr", letter="A"),
                   "05"=list(words="May", abbv="May", letter="M"),
                   "06"=list(words="June", abbv="Jun", letter="J"),
                   "07"=list(words="July", abbv="Jul", letter="J"),
                   "08"=list(words="August", abbv="Aug", letter="A"),
                   "09"=list(words="September", abbv="Sep", letter="S"),
                   "10"=list(words="October", abbv="Oct", letter="O"),
                   "11"=list(words="November", abbv="Nov", letter="N"),
                   "12"=list(words="December", abbv="Dec", letter="D"),
                   "14"=list(words="Annual", abbv="Annual", letter="Avg"))

      sapply(key, FUN=function(x)dict[[x]][[to]])
}



#############################################

#' Subsetting by loop iterators.
#'
#' This convenience function is a wrapper around data frame subsetting, designed
#' for use inside loops that iterate through raster data directories.
#'
#' @param data Data frame from which records will be selected.
#' @param ... Filtration factors, saved as varialbes with names matching data
#'   columns.
#' @return Data frame subsetted to match filtration factors.

sift <- function(data, ...){
      variables <- data.frame(..., stringsAsFactors=F)
      for(var in names(variables)) data <- data[data[,var]==variables[1,var],]
      return(data)
}

#########################################

#' Create a data frame from raster files
#'
#' This function takes a metadata frame (likely generated via parseMetadata) and
#' compiles all the files into a single data frame. Supports multiband files.
#'
#' @param metadata Data frame with a "path" variable and other metadata
#'   variables.
#' @param layer_names Character vector of layer names for multiband rasters that
#'   are missing layer names (must be correct length).
#' @param bands Character vector denoting which layers to keep from multiband
#'   files. Default is all.
#' @param cpus Integer indicating number of CPUs to use for parallel processing.
#' @return A data frame, with a row for each pixel of each raster in metadata.
frameRasters <- function(metadata, layer_names=NA, cpus=1, bands=NA) {
      require(raster)

      if(cpus>1) {
            require(doParallel)
            cl <- makeCluster(cpus)
            registerDoParallel(cl)
            ss <- foreach(i=1:nrow(metadata),
                          .packages=c("raster")) %dopar% {
                                s <- stack(metadata$path[i])
                                if(nlayers(s)==1) names(s) <- "value"
                                if(nlayers(s)>=1 & !is.na(layer_names[1])) names(s) <- layer_names
                                if(nlayers(s)>=1 & !is.na(bands[1])) s <- raster::subset(s, bands)
                                s <- as.data.frame(rasterToPoints(s))

                                # add columns for identifying factors
                                for(j in names(metadata)[2:ncol(metadata)]){
                                      s[,j] <- metadata[,j][i]
                                }

                                s
                          }
            stopCluster(cl)
            return(do.call("rbind", ss))
      }

      for(i in 1:nrow(metadata)){
            s <- stack(metadata$path[i])
            if(nlayers(s)==1) names(s) <- "value"
            if(nlayers(s)>=1 & !is.na(layer_names[1])) names(s) <- layer_names
            if(nlayers(s)>=1 & !is.na(bands[1])) s <- raster::subset(s, bands)
            s <- as.data.frame(rasterToPoints(s))

            # add columns for identifying factors
            for(j in names(metadata)[2:ncol(metadata)]){
                  s[,j] <- metadata[,j][i]
            }

            if(i==1) ss <- s else{ss <- rbind(ss, s)}
      }

      return(ss)
}


########################################


#' Euclidian distance
#'
#' Calculate the Euclidian distance from a set of leg lengths. We use this to
#' calculate SED based on z-scores. Square root of sum of squares.
#'
#' @param z Numeric vector of z-scores or other distances.
euclid <- function(z){
      sqrt(sum(z^2))
}

########################################


#' Convert raster files
#'
#' Convert raster file(s) to a new format. Via popup boxes, specify input
#' files, output file type, and output folder. All input files will be saved in
#' new format to the output folder, with the same filename as the original.
convertRaster <- function() {
      froms <- choose.files(caption="Select raster files to convert")
      ext <- winDialogString("Select an output format\n(.asc, .bil, .grd, .img, .nc, .tif)", ".asc")
      outdir <- choose.dir(caption="Select output folder for converted files")
      for(from in froms){
            name <- basename(tools::file_path_sans_ext(from))
            out <- paste0(outdir, "/", name, ext)
            to <- raster::stack(from)
            raster::writeRaster(to, out)
            writeLines(paste("saved:", out))
      }
}




########################################


#' Delete unused raster temp files
#'
#' The raster library often saves temp files to the C: drive, and standard
#' cleanup methods can fail for lengthy parallel processing jobs, causing the
#' disk to fill up. Use this function to prevent that problem.
#'
#' @param tempdir Character: full file path of the folder where temp files are
#'   stored. Will be searched recursively.
#' @param loop Logical: should the function keep monitoring and deleting temp
#'   files indefinitely until the script is manually terminated?
clearTempFiles <- function(tempdir="C:/Users/matt_kling/AppData/Local/Temp", loop=F){
      run=T
      while(run){
            files <- list.files(tempdir, recursive=T, full.names=T)
            files <- files[substr(files, nchar(files)-2, nchar(files)) %in% c("grd", "gri")]
            for(f in files) try(file.remove(f))
            run <- loop
            if(run) Sys.sleep(5*60)
      }
}



########################################


#' Update the ecoclim package
#'
#' Convenience function -- download the latest version of ecoclim package from
#' github.
updateEcoclim <- function() devtools::install_github("matthewkling/ecoclim")




########################################


#' Convert logit to proportion
#'
#' Logistic regression uses a logistic link function to convert between
#' proportions and the modeled dependent variable. This function back-converts,
#' for use when transforming model predictions into proportions.
#' @param x Numeric: logit.
#' @return Numeric: proportion.
logit2prop <- function(x){exp(x)/(1+exp(x))}



####################

#' Generate data for pairwise scatterplot matrix.
#'
#' Convert a wide dataset into a format that can be fed to ggplot to create a
#' scatterplot matrix.
#'
#' @param data Data frame, in "wide" format (i.e. must contain a column for each
#'   xy_vars and z_vars).
#' @param xy_vars Character vector indicating variables to be paired (these
#'   would be the x and y dimensions if the result is used for a scatterplot
#'   matrix).
#' @param z_vars Optional character vector indicating additional variables to
#'   retain for each record.
#' @param mirror Logical indicating whether to generate data for both upper and
#'   lower trianges of the matrix (default is FALSE).
#' @return A restructured data frame.
pairsData <- function(data, xy_vars, z_vars=NULL, mirror=F){
      combos <- combn(xy_vars, 2)
      for(pair in 1:ncol(combos)){
            v1 <- combos[1,pair]
            v2 <- combos[2,pair]
            dpair <- data.frame(pair_id=as.character(pair), x_var=v2, y_var=v1, x_value=data[,v2], y_value=data[,v1])
            names(dpair)[4:5] <- c("x_value", "y_value")
            zdata <- data[,z_vars]
            if(length(z_vars)==1) zdata <- as.data.frame(zdata)
            names(zdata) <- z_vars
            dpair <- cbind(dpair, zdata)
            if(pair==1) f <- dpair else(f <- rbind(f, dpair))
      }
      if(mirror){
            f2 <- f
            f2$x_var <- f$y_var
            f2$x_value <- f$y_value
            f2$y_var <- f$x_var
            f2$y_value <- f$x_value
            f$triangle <- 1
            f2$triangle <- 2
            f <- rbind(f, f2)
      }
      return(f)
}
matthewkling/ecoclim documentation built on May 21, 2019, 1:04 p.m.