R/utilities.R

Defines functions calc_save predict_save get_model_performance multispread remove_outliers vector_to_df list_shp list_csv list_rasters complete_dateseq detect_datecol

Documented in calc_save complete_dateseq detect_datecol get_model_performance list_csv list_rasters list_shp multispread predict_save remove_outliers vector_to_df

#' Identifying column index or name in Date or POSIXct format
#'
#' This function allows you to identify the index or the name of the columns in a
#' data.frame that are in Date or POSIXct format
#' @param df data.frame
#' @param format string, the format you are looking for, one of Date or POSIXct. If null, both are considered
#' @param return string, what should the function return as result, either the index or the name of the columns,
#' one of index or name.
#' @keywords date, POSIXct
#' @export
#' @examples
#' detect_datecol(df, format = 'Date', return = 'name')

detect_datecol <- function(df, format = NULL, return = 'index') {

  stopifnot(return %in% c('index', 'name'))

  col.classes <- sapply(df, class)

  datecols <- vector('list')

  #Detect Date or POSIXct column(s)
  datecols[['POSIXct']] <- which(sapply(col.classes, function(x) is.element('POSIXct', x)))
  datecols[['Date']] <- which(sapply(col.classes, function(x) is.element('Date', x)))

  if (is.null(format)) format <- c('POSIXct', 'Date')

  datecols <- datecols[names(datecols) %in% format]

  if (all(sapply(datecols, length) == 0)) { stop(stringr::str_c('No column of class ',
                                                 stringr::str_c(format, collapse = ' or '),
                                                 'present')) }

  if (return == 'index') result <- unlist(datecols)
  if (return == 'name') result <- colnames(df)[unlist(datecols)]

  return(result)

}

#' Fill up missing dates with NA values
#'
#' This function allows you to generate a dataframe with a complete datesequence by filling up missing
#' dates with rows filled with NAs
#' @param df data.frame, a column of format Date or POSIXct has to be present
#' @param timestep integer, number of seconds between each row in the data.frame. Optional, if not supplied
#' this number is guessed which may take some time for large data.frames
#' @param datecol.name string, optional name of the datecolumn, otherwise it will be detected automatically
#' using rebecka::detect_datecol
#' @param unit string, one of 'year' or 'day', start timestamp will be rounded down and end timestamp up to this unit
#' and the sequence will then be filled from start to end
#' @param verbose logical, if TRUE and timestep is not given the detected time resolution will be printed
#' @keywords date, POSIXct, complete
#' @export
#' @examples
#' complete_dateseq(df, format = 'Date', group.cols = 'st_id', unit = 'year')

complete_dateseq <- function(df, timestep = NULL, sample.size = 0.5, datecol.name = NULL, unit = 'year', verbose = F) {

  if (is.null(datecol.name)) datecol.name <- rebecka::detect_datecol(df, return = 'name')

  if (any( is.na( dplyr::select(df, !!datecol.name) ) )) stop('NAs in datecol detected!')

  datecol.name <- as.name(datecol.name)

  dates <- df %>% arrange(!!datecol.name) %>% dplyr::pull(!!datecol.name)

  if (is.null(timestep)) {

    #Get indices of random dates to speed up computation and not use full vector
    sample.size <- round(length(dates) * sample.size)
    sample.indices <- base::sample.int(length(dates), size = sample.size)

    ####CONSIDER USING LEAD INSTEAD OF LAG, LEAD SHIFTS NEXT ROW ONE STEP TO THE FRONT
    dates.lag <- dplyr::lag(dates)

    dates.sample <- dates[sample.indices]
    dates.lag.sample <- dates.lag[sample.indices]

    #Calculate interval in s between adjacent dates
    intervals <- purrr::map2(dates.lag.sample, dates.sample, function(x, y) {
      int <- lubridate::interval(x, y)
      return(lubridate::int_length(int))
    })

    intervals <- na.exclude(unlist(intervals))

    #Extract most common interval
    count <- table(intervals)
    timestep <- as.numeric(names(which.max(count[count != 0])))

    if (verbose) print(paste0('Time frequency of: ',
                              timestep,
                              's detected.'))

  }

  if(is.element('Date', class(dates))) timestep <- lubridate::day(lubridate::seconds_to_period(timestep))

  #Complete the timeseries
  df.complete <- df %>%

    dplyr::arrange(!!datecol.name) %>%

    tidyr::complete(!!datecol.name := seq(lubridate::floor_date(min(!!datecol.name, na.rm = T), unit = unit),
                                          lubridate::ceiling_date(max(!!datecol.name, na.rm = T), unit = unit),
                                          by = timestep)) %>%

    dplyr::arrange(!!datecol.name) %>%
    dplyr::slice(1:n()-1)

  #For columns with only 1 unqique value fill these column with this value
  unique.length <- sapply(df.complete, function(x) length(unique(x[!is.na(x)])))
  id.unique <- which(unique.length == 1)

  unique.names <- names(id.unique)
  unique.names <- rlang::syms(unique.names)

  df.complete <-
    df.complete %>%
    dplyr::mutate_at(dplyr::vars(!!!unique.names), function(x) x[is.na(x)] <- unique(x[!is.na(x)]))

  return(df.complete)

}

#' Listing raster tiffs in a folder
#'
#' This function allows you to create a vector with paths to raster files in the specified folder
#' @param path a string, path to the folder with the files
#' @param pattern a regular expression. Only filenames that match pattern will be returned
#' @param full.names a  logical value. If TRUE the full path will be returned, otherwise only the file names
#' @keywords files, list
#' @export
#' @examples
#' list_rasters('a_data/original_data')

list_rasters <- function(path = getwd(), pattern = '.tif$', full.names = T, ...) {

  vec <- list.files(path = path, pattern = pattern, full.names = full.names, ...)

  return(vec)
}

#' Listing csv files in a folder
#'
#' This function allows you to create a vector with paths to csv files in the specified folder
#' @param path a string, path to the folder with the files
#' @param pattern a regular expression. Only filenames that match pattern will be returned
#' @param full.names a  logical value. If TRUE the full path will be returned, otherwise only the file names
#' @keywords files, list
#' @export
#' @examples
#' list_rasters('a_data/original_data')

list_csv <- function(path = getwd(), pattern = '.csv$', full.names = T, ...) {

  vec <- list.files(path = path, pattern = pattern, full.names = full.names, ...)

  return(vec)
}

#' Listing shapefiles in a folder
#'
#' This function allows you to create a vector with paths to shapefiles in the specified folder
#' @param path a string, path to the folder with the files
#' @param pattern a regular expression. Only filenames that match pattern will be returned
#' @param full.names a  logical value. If TRUE the full path will be returned, otherwise only the file names
#' @keywords files, list
#' @export
#' @examples
#' list_rasters('a_data/original_data')

list_shp <- function(path = getwd(), pattern = '.shp$', full.names = T, ...) {

  vec <- list.files(path = path, pattern = pattern, full.names = full.names, ...)

  return(vec)
}

#' Open all the files and append rows to a single df
#'
#' This function allows you to combine all the files in the vector within a single df by reading
#' them first as data.frames and then using bind_rows from dplyr package to combine them
#' @param v vector of paths to the files
#' @param names bool, if true the name of the files will be used to create a new column with the name 'id_col'
#' @param id_col string, name of the column that will be appended to the resulting df if names is TRUE
#' @param ... further parameters given to readr::read_csv
#' @keywords files, list, open
#' @export
#' @examples
#' vector_to_df(v = c(path1, path2), names = T, id_col = 'st_id')

vector_to_df <- function(v, names = F, id_col = 'id', ...) {

  df_list <- purrr:::map(v, readr::read_csv, ...)

  if (names) names(df_list) <- stringr::str_replace(basename(v), '(.*?)\\..*$', '\\1')
  id <- ifelse(names, id_col, NULL)

  df <- dplyr::bind_rows(df_list, .id = id)

  return(df)
}

#' Remove outliers from a vector based on sigma test or interquartile range
#'
#' This function removes outliers from a vector and replaces them with an optional value based
#' on sigma test or the interquartile range
#' @param v numeric vector
#' @param type string, one of iqr for interquartile range or sigma for sigma test
#' @param fill numeric or function, either a fixed value or a function such as mean from which
#' the replacement values can be computed
#' @param range numeric, this number is multiplied with the interquartile range for type = 'iqr'
#' or the standard deviation for type = 'sigma' and determines the width of the window in which
#' values are considered as NOT being outliers
#' @keywords outliers, cleaning
#' @export
#' @examples
#' remove_outliers(c(1, 2, 3, 100))

remove_outliers <- function(v, type = 'iqr', fill = NA, range = ifelse(type == 'iqr', 1.5, 3)) {

  if (type == 'iqr') {

    #####IQR test for outliers
    q25 = quantile(v, c(.25))
    q75 = quantile(v, c(.75))
    iqr = q75 - q25
    upper = q75 + (iqr * range)
    lower = q25 - (iqr * range)

  } else if (type == 'sigma') {

    ####3 sigma test for outliers
    v_mean = mean(v)
    v_sd = sd(v)
    lower = v - (range * v_sd)
    upper = v + (range * v_sd)

  }

  if (is.function(fill)) {
    v_clean <- ifelse(v > upper | v < lower, fill(v), v)
  } else {
    v_clean <- ifelse(v > upper | v < lower, fill, v)
  }

  return(v_clean)

}

#' Spread multiple columns
#'
#' This function spreads multiple columns so that in the end the unique values of the
#' rows are new columns
#' @param df data.frame or tibble
#' @param key/value name of key and value columns
#' @keywords tidyr, spread
#' @export

multispread <- function(df, key, value) {

  # quote key
  keyq <- rlang::enquo(key)

  # break value vector into quotes
  valueq <- rlang::enquo(value)
  s <- rlang::quos(!!valueq)

  df %>%
    tidyr::gather(variable, value, !!!s) %>%
    tidyr::unite(temp, !!keyq, variable) %>%
    tidyr::spread(temp, value)
}

#' Extract model statistics from model object
#'
#' This function extracts performance measures as dataframe from a model object
#' @param model model object or train object
#' @keywords model, mae, rmse, rsquared
#' @export

get_model_performance <- function(model) {

  if('train' %in% class(model)) {
    v <- model$results %>% summarise(mae = min(MAE),
                                     RMSE = min(RMSE),
                                     Rsquared = max(Rsquared))
    v[['vars']] <- paste(sort(model$coefnames), collapse = ', ')

    return(v)

  } else {
    summary(model)$r.squared
  }

}

#' Predict to a raster layer, save it and return the path instead of the raster layer to save memory
#'
#' This takes a raster_layer or raster_stack, a model object and a character vector as filename, predicts a new raster_layer using the model object, saves it to path and returns that path
#' @param object A Raster* object. The names of the single layers have to match the variables in "model".
#' @param model A model  that is used to generate predictions from "object"
#' @param path A character vector, the predicted Raster* object will be stored under this path.
#' @param ... Further parameters passed on to raster::predict
#' @keywords model, predict, raster
#' @export

predict_save <- function(object, model, path, format = 'GTiff', ...) {

  raster_pred <- raster::predict(object = object, model = model,
                                 filename = path, format = format, ...)

  print(basename(path))
  return(path)

}

#' Adds together two raster layers, saves the result under path and returns the path as character vector
#'
#' This takes two raster_layers x and y  and a character vector as filename, calculates a new raster_layer by adding raster x and raster y and then saves it to path and returns that path
#' @param x Raster layer
#' @param y Raster layer
#' @param path A character vector, the calculated Raster_layer will be stored under this path.
#' @param operation A character vector, one of +, -, * or / specifying how x and y should be combined
#' @param ... Further parameters passed on to raster::writeRaster
#' @keywords model, predict, raster
#' @export

calc_save <- function(x, y, path, format = 'GTiff', operation = '+', ...) {

  if (!inherits(x, 'RasterLayer')) x <- raster::raster(x)
  if (!inherits(y, 'RasterLayer')) y <- raster::raster(y)

  if(!raster::compareRaster(x, y, orig = T)) {
    print('Reprojecting y to x using bilinear method')
    y <- projectRaster(from = y, to = x, method = 'bilinear')

  }

  raster_calc <- switch(operation,
                        '+' = x + y,
                        '-' = x - y,
                        '/' = x / y,
                        '*' = x * y)

  raster::writeRaster(x = raster_calc, filename = path, format = format, ...)

  print(basename(path))
  return(path)

}
sitscholl/rebecka_package documentation built on Aug. 25, 2020, 4:20 a.m.