#' Identifying column index or name in Date or POSIXct format
#'
#' This function allows you to identify the index or the name of the columns in a
#' data.frame that are in Date or POSIXct format
#' @param df data.frame
#' @param format string, the format you are looking for, one of Date or POSIXct. If null, both are considered
#' @param return string, what should the function return as result, either the index or the name of the columns,
#' one of index or name.
#' @keywords date, POSIXct
#' @export
#' @examples
#' detect_datecol(df, format = 'Date', return = 'name')
detect_datecol <- function(df, format = NULL, return = 'index') {
stopifnot(return %in% c('index', 'name'))
col.classes <- sapply(df, class)
datecols <- vector('list')
#Detect Date or POSIXct column(s)
datecols[['POSIXct']] <- which(sapply(col.classes, function(x) is.element('POSIXct', x)))
datecols[['Date']] <- which(sapply(col.classes, function(x) is.element('Date', x)))
if (is.null(format)) format <- c('POSIXct', 'Date')
datecols <- datecols[names(datecols) %in% format]
if (all(sapply(datecols, length) == 0)) { stop(stringr::str_c('No column of class ',
stringr::str_c(format, collapse = ' or '),
'present')) }
if (return == 'index') result <- unlist(datecols)
if (return == 'name') result <- colnames(df)[unlist(datecols)]
return(result)
}
#' Fill up missing dates with NA values
#'
#' This function allows you to generate a dataframe with a complete datesequence by filling up missing
#' dates with rows filled with NAs
#' @param df data.frame, a column of format Date or POSIXct has to be present
#' @param timestep integer, number of seconds between each row in the data.frame. Optional, if not supplied
#' this number is guessed which may take some time for large data.frames
#' @param datecol.name string, optional name of the datecolumn, otherwise it will be detected automatically
#' using rebecka::detect_datecol
#' @param unit string, one of 'year' or 'day', start timestamp will be rounded down and end timestamp up to this unit
#' and the sequence will then be filled from start to end
#' @param verbose logical, if TRUE and timestep is not given the detected time resolution will be printed
#' @keywords date, POSIXct, complete
#' @export
#' @examples
#' complete_dateseq(df, format = 'Date', group.cols = 'st_id', unit = 'year')
complete_dateseq <- function(df, timestep = NULL, sample.size = 0.5, datecol.name = NULL, unit = 'year', verbose = F) {
if (is.null(datecol.name)) datecol.name <- rebecka::detect_datecol(df, return = 'name')
if (any( is.na( dplyr::select(df, !!datecol.name) ) )) stop('NAs in datecol detected!')
datecol.name <- as.name(datecol.name)
dates <- df %>% arrange(!!datecol.name) %>% dplyr::pull(!!datecol.name)
if (is.null(timestep)) {
#Get indices of random dates to speed up computation and not use full vector
sample.size <- round(length(dates) * sample.size)
sample.indices <- base::sample.int(length(dates), size = sample.size)
####CONSIDER USING LEAD INSTEAD OF LAG, LEAD SHIFTS NEXT ROW ONE STEP TO THE FRONT
dates.lag <- dplyr::lag(dates)
dates.sample <- dates[sample.indices]
dates.lag.sample <- dates.lag[sample.indices]
#Calculate interval in s between adjacent dates
intervals <- purrr::map2(dates.lag.sample, dates.sample, function(x, y) {
int <- lubridate::interval(x, y)
return(lubridate::int_length(int))
})
intervals <- na.exclude(unlist(intervals))
#Extract most common interval
count <- table(intervals)
timestep <- as.numeric(names(which.max(count[count != 0])))
if (verbose) print(paste0('Time frequency of: ',
timestep,
's detected.'))
}
if(is.element('Date', class(dates))) timestep <- lubridate::day(lubridate::seconds_to_period(timestep))
#Complete the timeseries
df.complete <- df %>%
dplyr::arrange(!!datecol.name) %>%
tidyr::complete(!!datecol.name := seq(lubridate::floor_date(min(!!datecol.name, na.rm = T), unit = unit),
lubridate::ceiling_date(max(!!datecol.name, na.rm = T), unit = unit),
by = timestep)) %>%
dplyr::arrange(!!datecol.name) %>%
dplyr::slice(1:n()-1)
#For columns with only 1 unqique value fill these column with this value
unique.length <- sapply(df.complete, function(x) length(unique(x[!is.na(x)])))
id.unique <- which(unique.length == 1)
unique.names <- names(id.unique)
unique.names <- rlang::syms(unique.names)
df.complete <-
df.complete %>%
dplyr::mutate_at(dplyr::vars(!!!unique.names), function(x) x[is.na(x)] <- unique(x[!is.na(x)]))
return(df.complete)
}
#' Listing raster tiffs in a folder
#'
#' This function allows you to create a vector with paths to raster files in the specified folder
#' @param path a string, path to the folder with the files
#' @param pattern a regular expression. Only filenames that match pattern will be returned
#' @param full.names a logical value. If TRUE the full path will be returned, otherwise only the file names
#' @keywords files, list
#' @export
#' @examples
#' list_rasters('a_data/original_data')
list_rasters <- function(path = getwd(), pattern = '.tif$', full.names = T, ...) {
vec <- list.files(path = path, pattern = pattern, full.names = full.names, ...)
return(vec)
}
#' Listing csv files in a folder
#'
#' This function allows you to create a vector with paths to csv files in the specified folder
#' @param path a string, path to the folder with the files
#' @param pattern a regular expression. Only filenames that match pattern will be returned
#' @param full.names a logical value. If TRUE the full path will be returned, otherwise only the file names
#' @keywords files, list
#' @export
#' @examples
#' list_rasters('a_data/original_data')
list_csv <- function(path = getwd(), pattern = '.csv$', full.names = T, ...) {
vec <- list.files(path = path, pattern = pattern, full.names = full.names, ...)
return(vec)
}
#' Listing shapefiles in a folder
#'
#' This function allows you to create a vector with paths to shapefiles in the specified folder
#' @param path a string, path to the folder with the files
#' @param pattern a regular expression. Only filenames that match pattern will be returned
#' @param full.names a logical value. If TRUE the full path will be returned, otherwise only the file names
#' @keywords files, list
#' @export
#' @examples
#' list_rasters('a_data/original_data')
list_shp <- function(path = getwd(), pattern = '.shp$', full.names = T, ...) {
vec <- list.files(path = path, pattern = pattern, full.names = full.names, ...)
return(vec)
}
#' Open all the files and append rows to a single df
#'
#' This function allows you to combine all the files in the vector within a single df by reading
#' them first as data.frames and then using bind_rows from dplyr package to combine them
#' @param v vector of paths to the files
#' @param names bool, if true the name of the files will be used to create a new column with the name 'id_col'
#' @param id_col string, name of the column that will be appended to the resulting df if names is TRUE
#' @param ... further parameters given to readr::read_csv
#' @keywords files, list, open
#' @export
#' @examples
#' vector_to_df(v = c(path1, path2), names = T, id_col = 'st_id')
vector_to_df <- function(v, names = F, id_col = 'id', ...) {
df_list <- purrr:::map(v, readr::read_csv, ...)
if (names) names(df_list) <- stringr::str_replace(basename(v), '(.*?)\\..*$', '\\1')
id <- ifelse(names, id_col, NULL)
df <- dplyr::bind_rows(df_list, .id = id)
return(df)
}
#' Remove outliers from a vector based on sigma test or interquartile range
#'
#' This function removes outliers from a vector and replaces them with an optional value based
#' on sigma test or the interquartile range
#' @param v numeric vector
#' @param type string, one of iqr for interquartile range or sigma for sigma test
#' @param fill numeric or function, either a fixed value or a function such as mean from which
#' the replacement values can be computed
#' @param range numeric, this number is multiplied with the interquartile range for type = 'iqr'
#' or the standard deviation for type = 'sigma' and determines the width of the window in which
#' values are considered as NOT being outliers
#' @keywords outliers, cleaning
#' @export
#' @examples
#' remove_outliers(c(1, 2, 3, 100))
remove_outliers <- function(v, type = 'iqr', fill = NA, range = ifelse(type == 'iqr', 1.5, 3)) {
if (type == 'iqr') {
#####IQR test for outliers
q25 = quantile(v, c(.25))
q75 = quantile(v, c(.75))
iqr = q75 - q25
upper = q75 + (iqr * range)
lower = q25 - (iqr * range)
} else if (type == 'sigma') {
####3 sigma test for outliers
v_mean = mean(v)
v_sd = sd(v)
lower = v - (range * v_sd)
upper = v + (range * v_sd)
}
if (is.function(fill)) {
v_clean <- ifelse(v > upper | v < lower, fill(v), v)
} else {
v_clean <- ifelse(v > upper | v < lower, fill, v)
}
return(v_clean)
}
#' Spread multiple columns
#'
#' This function spreads multiple columns so that in the end the unique values of the
#' rows are new columns
#' @param df data.frame or tibble
#' @param key/value name of key and value columns
#' @keywords tidyr, spread
#' @export
multispread <- function(df, key, value) {
# quote key
keyq <- rlang::enquo(key)
# break value vector into quotes
valueq <- rlang::enquo(value)
s <- rlang::quos(!!valueq)
df %>%
tidyr::gather(variable, value, !!!s) %>%
tidyr::unite(temp, !!keyq, variable) %>%
tidyr::spread(temp, value)
}
#' Extract model statistics from model object
#'
#' This function extracts performance measures as dataframe from a model object
#' @param model model object or train object
#' @keywords model, mae, rmse, rsquared
#' @export
get_model_performance <- function(model) {
if('train' %in% class(model)) {
v <- model$results %>% summarise(mae = min(MAE),
RMSE = min(RMSE),
Rsquared = max(Rsquared))
v[['vars']] <- paste(sort(model$coefnames), collapse = ', ')
return(v)
} else {
summary(model)$r.squared
}
}
#' Predict to a raster layer, save it and return the path instead of the raster layer to save memory
#'
#' This takes a raster_layer or raster_stack, a model object and a character vector as filename, predicts a new raster_layer using the model object, saves it to path and returns that path
#' @param object A Raster* object. The names of the single layers have to match the variables in "model".
#' @param model A model that is used to generate predictions from "object"
#' @param path A character vector, the predicted Raster* object will be stored under this path.
#' @param ... Further parameters passed on to raster::predict
#' @keywords model, predict, raster
#' @export
predict_save <- function(object, model, path, format = 'GTiff', ...) {
raster_pred <- raster::predict(object = object, model = model,
filename = path, format = format, ...)
print(basename(path))
return(path)
}
#' Adds together two raster layers, saves the result under path and returns the path as character vector
#'
#' This takes two raster_layers x and y and a character vector as filename, calculates a new raster_layer by adding raster x and raster y and then saves it to path and returns that path
#' @param x Raster layer
#' @param y Raster layer
#' @param path A character vector, the calculated Raster_layer will be stored under this path.
#' @param operation A character vector, one of +, -, * or / specifying how x and y should be combined
#' @param ... Further parameters passed on to raster::writeRaster
#' @keywords model, predict, raster
#' @export
calc_save <- function(x, y, path, format = 'GTiff', operation = '+', ...) {
if (!inherits(x, 'RasterLayer')) x <- raster::raster(x)
if (!inherits(y, 'RasterLayer')) y <- raster::raster(y)
if(!raster::compareRaster(x, y, orig = T)) {
print('Reprojecting y to x using bilinear method')
y <- projectRaster(from = y, to = x, method = 'bilinear')
}
raster_calc <- switch(operation,
'+' = x + y,
'-' = x - y,
'/' = x / y,
'*' = x * y)
raster::writeRaster(x = raster_calc, filename = path, format = format, ...)
print(basename(path))
return(path)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.