#' Fast md5 check
#'
#' md5 values are computed based on combination of file path, file size and file modified time.
#'
#' @param path_files a character vector containing absolute file paths.
#'
#' @return a character vector containing md5 values.
#' @export
#'
#' @examples
#' custom_md5_hash(path_files = list.files(full.names = TRUE))
custom_md5_hash <- function(path_files) {
path_files <- gsub(pattern = "\\\\", replacement = "/", x = path_files)
size_files <- sapply(path_files, file.size)
mtime_files <- sapply(path_files, file.mtime)
info_files <- paste0(gsub(pattern = ".*/(.*)$", replacement = "\\1", x = path_files), size_files, mtime_files)
sapply(info_files, digest::digest, algo="md5")
}
#' Missing date range in a date range
#'
#' The result of Date range A minus Date range B.The result is expressed as a data.frame of date range collection.
#'
#' @param base_start_date a date value to represent start of the base date range.
#' @param base_end_date a date value to represent end of the base date range.
#' @param start_date a date value to represent start of the excluded date range.
#' @param end_date a date value to represent end of the excluded range.
#'
#' @return a data.frame containing the collection of date range.
#' @export
#'
#' @examples
#' missing_ranges(base_start_date = as.Date("2020-01-01"),
#' base_end_date = as.Date("2021-01-01"),
#' start_date = as.Date("2020-03-01"), end_date = as.Date("2020-06-01"))
missing_ranges <- function(base_start_date, base_end_date, start_date, end_date) {
df <- data.frame(`Start Date` = as.Date(character()), `End Date` = as.Date(character()), check.names = FALSE)
if(length(base_start_date) != 1 | length(base_end_date) != 1)
return(df)
if(base_start_date > base_end_date)
return(df)
original_range <- df
original_range[1,] <- NA
original_range$`Start Date` <- base_start_date
original_range$`End Date` <- base_end_date
if(length(start_date) != 1 | length(end_date) != 1)
return(original_range)
if(start_date > end_date)
{
return(original_range)
}
else
{
range1 <- df
range1[1,] <- NA
range1$`Start Date` <- max(base_start_date, (end_date+1))
range1$`End Date` <- base_end_date
range2 <- df
range2[1,] <- NA
range2$`Start Date` <- base_start_date
range2$`End Date` <- min(base_end_date, (start_date-1))
missing_range <- rbind(range1, range2)
missing_range <- missing_range[missing_range$`Start Date` <= missing_range$`End Date`,]
return(missing_range)
}
return(df)
}
#' Check overlap for a given date range
#'
#' Check whether a given date range overlaps with the other collection of date range.
#'
#' @param start_value a date value which signifies start of the date range to check for overlap.
#' @param end_value a date value which signifies end of the date range to check for overlap.
#' @param start_vector a date vector which signifies start of the multiple date range to check overlap against.
#' @param end_vector a date vector which signifies end of the multiple date range to check overlap against.
#'
#' @return a logical value to represent overlap.
#' @export
#'
#' @examples
#' overlapping_range(start_value = as.Date("2020-01-01"),
#' end_value = as.Date("2021-01-01"),
#' start_vector = as.Date("2019-03-01"), end_vector = as.Date("2021-06-01"))
overlapping_range <- function(start_value, end_value, start_vector, end_vector) {
if(length(start_value) != 1 | length(end_value) != 1)
return(TRUE)
if(length(start_vector) == 0)
return(FALSE)
if(length(start_vector) != length(end_vector))
return(TRUE)
if(start_value > end_value)
return(TRUE)
if(TRUE %in% (start_vector > end_vector))
return(TRUE)
if(TRUE %in% (start_vector < start_value))
{
if(max(end_vector[which(start_vector < start_value)]) >= start_value)
return(TRUE)
}
if(TRUE %in% (start_vector >= start_value))
{
if(min(start_vector[which(start_vector >= start_value)]) <= end_value)
return(TRUE)
}
return(FALSE)
}
#' Get fiscal year from a date
#'
#' Extract fiscal year from a date provided fiscal year starts in October.
#'
#' @param arg1 a Date value
#'
#' @return a character value representing fiscal year.
#' @export
#'
#' @examples
#' get_fiscal_year(arg1 = as.Date("2020-11-02"))
get_fiscal_year <- function(arg1) {
m <- lubridate::month(arg1)
y <- lubridate::year(arg1)
if(m >=10)
fy <- paste0(y+1)
else
fy <- paste0(y)
return (fy)
}
#' Get fiscal quarter from a date
#'
#' Extract fiscal quarter from a date provided fiscal year starts in October.
#'
#' @param arg1 a Date value
#'
#' @return a character value representing fiscal quarter.
#' @export
#'
#' @examples
#' get_fiscal_period(arg1 = as.Date("2020-11-02"))
get_fiscal_period <- function(arg1) {
m <- lubridate::month(arg1)
y <- lubridate::year(arg1)
if(m >=10)
fy <- paste0(y+1, "-Q1")
else if(m>0 & m<=3)
fy <- paste0(y, "-Q2")
else if(m>3 & m<=6)
fy <- paste0(y, "-Q3")
else
fy <- paste0(y, "-Q4")
return (fy)
}
#' Funtion to create month level date range input
#'
#' Shiny doesn't have a way to restrict selection at month level. This function tries to overcome this shortfall by adding an extra more argument minviewmode.
#'
#' @param inputId The input slot that will be used to access the value.
#' @param label Display label for the control, or NULL for no label.
#' @param start The initial start date. Either a Date object, or a string in yyyy-mm-dd format. If NULL (the default), will use the current date in the client's time zone.
#' @param end The initial end date. Either a Date object, or a string in yyyy-mm-dd format. If NULL (the default), will use the current date in the client's time zone.
#' @param min The minimum allowed date. Either a Date object, or a string in yyyy-mm-dd format.
#' @param max The maximum allowed date. Either a Date object, or a string in yyyy-mm-dd format.
#' @param format The format of the date to display in the browser. Defaults to "yyyy-mm-dd".
#' @param startview The date range shown when the input object is first clicked. Can be "month" (the default), "year", or "decade".
#' @param minviewmode The selection level to stop the date hierarchy, "months" by default.
#' @param weekstart Which day is the start of the week. Should be an integer from 0 (Sunday) to 6 (Saturday).
#' @param language The language used for month and day names. Default is "en". Other valid values include "ar", "az", "bg", "bs", "ca", "cs", "cy", "da", "de", "el", "en-AU", "en-GB", "eo", "es", "et", "eu", "fa", "fi", "fo", "fr-CH", "fr", "gl", "he", "hr", "hu", "hy", "id", "is", "it-CH", "it", "ja", "ka", "kh", "kk", "ko", "kr", "lt", "lv", "me", "mk", "mn", "ms", "nb", "nl-BE", "nl", "no", "pl", "pt-BR", "pt", "ro", "rs-latin", "rs", "ru", "sk", "sl", "sq", "sr-latin", "sr", "sv", "sw", "th", "tr", "uk", "vi", "zh-CN", and "zh-TW".
#' @param separator String to display between the start and end input boxes.
#' @param width The width of the input, e.g. '400px', or '100%'; see validateCssUnit().
#'
#' @return A date range input for shiny.
#' @export
#'
#' @examples
#' if (interactive()) {
#' ui <- shiny::fluidPage(
#' dateRangeMonthsInput("daterange1", "Date range:", start = "2001-01-01"
#' , end = "2010-12-31", minviewmode = "months")
#' )
#' shiny::shinyApp(ui, server = function(input, output) { })
#' }
dateRangeMonthsInput <- function(inputId, label, start = NULL, end = NULL,
min = NULL, max = NULL, format = "yyyy-mm-dd", startview = "month",
minviewmode = "months", # added manually
weekstart = 0, language = "en", separator = " to ", width = NULL) {
# the datePickerDependency is taken from https://github.com/rstudio/shiny/blob/master/R/input-date.R
datePickerDependency <- htmltools::htmlDependency(
"bootstrap-datepicker", "1.6.4", c(href = "shared/datepicker"),
script = "js/bootstrap-datepicker.min.js",
stylesheet = "css/bootstrap-datepicker3.min.css",
# Need to enable noConflict mode. See #1346.
head = "<script>
(function() {
var datepicker = $.fn.datepicker.noConflict();
$.fn.bsDatepicker = datepicker;
})();
</script>")
`%AND%` <- function(x, y) {
if (!is.null(x) && !is.na(x))
if (!is.null(y) && !is.na(y))
return(y)
return(NULL)
}
controlLabel <- function(controlName, label) {
label %AND% htmltools::tags$label(class = "control-label", `for` = controlName, label)
}
# If start and end are date objects, convert to a string with yyyy-mm-dd format
# Same for min and max
if (inherits(start, "Date")) start <- format(start, "%Y-%m-%d")
if (inherits(end, "Date")) end <- format(end, "%Y-%m-%d")
if (inherits(min, "Date")) min <- format(min, "%Y-%m-%d")
if (inherits(max, "Date")) max <- format(max, "%Y-%m-%d")
htmltools::attachDependencies(
shiny::div(id = inputId,
class = "shiny-date-range-input form-group shiny-input-container",
style = if (!is.null(width)) paste0("width: ", shiny::validateCssUnit(width), ";"),
controlLabel(inputId, label),
# input-daterange class is needed for dropdown behavior
shiny::div(class = "input-daterange input-group",
htmltools::tags$input(
class = "input-sm form-control",
type = "text",
`data-date-language` = language,
`data-date-weekstart` = weekstart,
`data-date-format` = format,
`data-date-start-view` = startview,
`data-date-min-view-mode` = minviewmode, # added manually
`data-min-date` = min,
`data-max-date` = max,
`data-initial-date` = start
),
shiny::span(class = "input-group-addon", separator),
htmltools::tags$input(
class = "input-sm form-control",
type = "text",
`data-date-language` = language,
`data-date-weekstart` = weekstart,
`data-date-format` = format,
`data-date-start-view` = startview,
`data-date-min-view-mode` = minviewmode, # added manually
`data-min-date` = min,
`data-max-date` = max,
`data-initial-date` = end
)
)
),
datePickerDependency
)
}
#' Convert a vector of customer names into a vector of customer page links.
#'
#' Add html elements to a character vector to convert it into clickable links.
#'
#' @param to_tab A character value specifying the page name to jump to.
#' @param modify_vector A character vector of customer names to create links.
#' @param from_tab A character value specifying the current page.
#'
#' @return A character vector of customer page links.
#' @export
#'
#' @examples
#' add_openTab("customer", c("Customer A", "Customer B"), "search")
add_openTab <- function(to_tab, modify_vector, from_tab) {
with_double_quote <- grep(pattern = '"', x = modify_vector)
if(length(with_double_quote))
{
if(length(modify_vector[-with_double_quote]))
{
modify_vector[-with_double_quote] <- paste("<a onclick='openTab(\"", to_tab, "\", \"", gsub(pattern = '"', replacement = '"', x = gsub(pattern = "'", replacement = "'", x = modify_vector[-with_double_quote])) ,"\", \"", from_tab, "\")'>", modify_vector[-with_double_quote], "</a>", sep = "")
}
if(length(modify_vector[with_double_quote]))
{
modify_vector[with_double_quote] <- paste("<a onclick=\"openTab('", to_tab, "', '", gsub(pattern = '"', replacement = '"', x = gsub(pattern = "'", replacement = "'", x = modify_vector[with_double_quote])) ,"', '", from_tab, "')\">", modify_vector[with_double_quote], "</a>", sep = "")
}
}
else
{
modify_vector <- paste("<a onclick='openTab(\"", to_tab, "\", \"", gsub(pattern = '"', replacement = '"', x = gsub(pattern = "'", replacement = "'", x = modify_vector)) ,"\", \"", from_tab, "\")'>", modify_vector, "</a>", sep = "")
}
modify_vector
}
#' Convert a vector of customer names into a vector of customer page links for RAC Analytics
#'
#' Add html elements to a character vector to convert it into clickable links.
#'
#' @param to_tab A character value specifying the page name to jump to.
#' @param from_tab A character value specifying the current page.
#' @param jsfunction A character value specifying javascript function.
#' @param modify_vector A character vector of customer names to create links.
#' @param customer_name A character vector specifying customer name.
#' @param linkColor A character vector specifying color.
#'
#' @return A character vector of customer page links.
#' @export
#'
#' @examples
#' add_openTabForRACAnalytics("engagement_tab", "rac_analytics_customers_list_tab",
#' "openTabForRACAnalytics", c("Link A", "Link B"), c("Customer A", "Customer B"), c("green", "red"))
add_openTabForRACAnalytics <- function(to_tab, from_tab, jsfunction, modify_vector, customer_name, linkColor) {
with_double_quote <- grep(pattern = '"', x = modify_vector)
if(length(with_double_quote))
{
if(length(modify_vector[-with_double_quote]))
{
modify_vector[-with_double_quote] <- paste("<a onclick='", jsfunction, "(\"", to_tab, "\", \"", gsub(pattern = '"', replacement = '"', x = gsub(pattern = "'", replacement = "'", x = modify_vector[-with_double_quote])) ,"\", \"", from_tab, "\", \"", customer_name, "\")'", " style=color:", linkColor, ">", modify_vector[-with_double_quote], "</a>", sep = "")
}
if(length(modify_vector[with_double_quote]))
{
modify_vector[with_double_quote] <- paste("<a onclick=\"", jsfunction, "('", to_tab, "', '", gsub(pattern = '"', replacement = '"', x = gsub(pattern = "'", replacement = "'", x = modify_vector[with_double_quote])) ,"', '", from_tab, "\", \"", customer_name, "')\"", " style=color:", linkColor, ">", modify_vector[with_double_quote], "</a>", sep = "")
}
}
else
{
modify_vector <- paste("<a onclick='", jsfunction, "(\"", to_tab, "\", \"", gsub(pattern = '"', replacement = '"', x = gsub(pattern = "'", replacement = "'", x = modify_vector)) ,"\", \"", from_tab, "\", \"", customer_name, "\")'", " style=color:", linkColor, ">", modify_vector, "</a>", sep = "")
}
modify_vector
}
#' Find the number of days in a given month
#'
#' Calculates number of days for a particular month in a given year.
#'
#' @param date A Date value.
#'
#' @return An integer value specifying number of days.
#' @export
#'
#' @examples
#' numberOfDays(date = as.Date("2020-02-02"))
numberOfDays <- function(date) {
m <- format(date, format = "%m")
while (format(date, format = "%m") == m) {
date <- date + 1
}
return(as.integer(format(date - 1, format = "%d")))
}
#' Anonymize data
#'
#' Anonymize given columns in a data.frame. Also take a look at function deanonymise.
#'
#' @param original_df A data.frame containing the original data.
#' @param cols_to_anon A character vector containing the column names to anonymize.
#' @param algo The algorithms to be used to anonymize data; currently available choices are md5, which is also the default, sha1, crc32, sha256, sha512, xxhash32, xxhash64, murmur32, spookyhash and blake3.
#'
#' @return A data.frame containing the anonymized data.
#' @export
#'
#' @examples
#' anonymise(original_df = data.frame(col1 = c("val1", "val2", "val3"), col2 = c(3, 5, 7)),
#' cols_to_anon = c("col1"), algo = "crc32")
anonymise <- function(original_df, cols_to_anon, algo = "md5") {
col_num <- which(x = colnames(original_df) %in% cols_to_anon)
for (i in col_num) {
original_df[[i]] <- unlist(lapply(original_df[[i]], digest::digest, algo = algo))
}
original_df
}
#' Deanonymize data
#'
#' Deanonymize the anonymized columns in a data.frame. Also take a look at function anonymise.
#'
#' @param original_df A data.frame containing the original data.
#' @param modified_df A data.frame containing the anonymized modified data.
#' @param cols_to_deanon A character vector containing the column names to deanonymize.
#' @param algo The algorithm which was used to anonymize data; currently available choices are md5, which is also the default, sha1, crc32, sha256, sha512, xxhash32, xxhash64, murmur32, spookyhash and blake3.
#'
#' @return A data.frame containing the deanonymized data.
#' @export
#'
#' @examples
#' deanonymise(original_df = data.frame(col1 = c("val1", "val2", "val3"), col2 = c(3, 5, 7)),
#' modified_df = data.frame(col1 = c("b33fc2aa", "b33fc2aa", "2a369310"), col2 = c(8, 3, 9)),
#' cols_to_deanon = c("col1"), algo = "crc32")
deanonymise <- function(original_df, modified_df, cols_to_deanon, algo = "md5") {
col_num_orig <- which(x = colnames(original_df) %in% cols_to_deanon)
col_num_modif <- which(x = colnames(modified_df) %in% cols_to_deanon)
for (i in 1:length(cols_to_deanon)) {
map <- data.frame(original = unique(original_df[[col_num_orig[i]]]), anon_value = unlist(lapply(unique(original_df[[col_num_orig[i]]]), digest::digest, algo = algo)))
modified_df[[col_num_modif[i]]] <- map[match(x = modified_df[[col_num_modif[i]]], table = map$anon_value), "original"]
}
modified_df
}
#' Fill missing values in a data.frame or a data.table by reference
#'
#' Intelligently fill missing values in a data.frame or a data.table in memory.
#' If column names are not provided, missing values will be filled in for all the columns where data type of the column matches with the val argument's data type.
#' Use with caution. Since data is modified in memory, no data is returned. Hence no need to specify assignment operator '<-'.
#'
#' @param DT A data.table or a data.frame. A data.frame will be converted to a data.table.
#' @param column_names A character vector of column names to fill missing values. If NULL which is default, missing values will be filled in for all the columns where data type of the column matches with the val argument's data type.
#' @param val Value to replace the missing values. Default is 0.
#'
#' @return A data.table with missing values replace with val.
#' @export
#' @importFrom data.table :=
#' @importFrom data.table .SD
#'
#' @examples
#' dt_sample <- data.table::data.table(Col_1 = c("a", "b", NA), Col_2 = c(2, NA, 5))
#' dt_fill_NAs(DT = dt_sample, val = 0)
dt_fill_NAs <- function(DT, column_names = NULL, val = 0) {
if(!"data.table" %in% class(DT))
{
data.table::setDT(DT)
}
if(class(val) == "numeric")
{
integer_cols <- colnames(DT)[sapply(DT, class) %in% "integer"]
if(length(integer_cols))
{
DT[ , (integer_cols) := lapply(.SD, as.numeric), .SDcols = integer_cols]
}
}
if(is.null(column_names))
{
colnums <- which(x = as.vector(sapply(DT, class)) == class(val))
# or by number (slightly faster than by name) :
for (j in colnums)
data.table::set(DT, which(is.na(DT[[j]])), j, val)
}
else
{
colnums <- colnames(DT)[colnames(DT) %in% column_names]
class_mismatch <- colnames(DT[,colnums, with = FALSE])[as.vector(sapply(DT[,colnums, with = FALSE], class)) != class(val)]
if(length(class_mismatch))
{
if(class(val) == "numeric")
{
DT[ , (class_mismatch) := lapply(.SD, as.character), .SDcols = class_mismatch]
DT[ , (class_mismatch) := lapply(.SD, as.numeric), .SDcols = class_mismatch]
}
if(class(val) == "character")
{
DT[ , (class_mismatch) := lapply(.SD, as.character), .SDcols = class_mismatch]
}
if(class(val) == "Date")
{
DT[ , (class_mismatch) := lapply(.SD, as.Date), .SDcols = class_mismatch]
}
}
for (j in colnums)
data.table::set(DT, which(is.na(DT[[j]])), j, val)
}
}
#' Find columns which introduces duplication in key columns
#'
#' If a lot of columns have the same value in a large dataframe with unique rows,
#' it becomes challenging to figure out which column/columns are responsible for causing this duplication.
#' This function tries to simplify this analysis by figuring out these columns which are wreaking havoc.
#'
#' @param DT A data.table or a data.frame. A data.frame will be converted to a data.table.
#' @param key_cols A character vector containing key column names.
#'
#' @return A character vector with column names which are causing duplicates.
#' @export
#'
#' @examples
#' cols_mult_vals(DT = data.table::data.table(col_key = c("a", "b", "a"),
#' col_1 = c(3 , 5, 3), col_2 = c("v1", "v2", "v3")),
#' key_cols = c("col_key"))
cols_mult_vals <- function(DT, key_cols) {
if(!"data.table" %in% class(DT))
{
DT <- data.table::as.data.table(DT)
}
key_nrow <- nrow(unique(DT[, key_cols, with = FALSE]))
problematic_cols <- c()
if(nrow(unique(DT)) != key_nrow)
{
other_cols <- colnames(DT)[!colnames(DT) %in% key_cols]
for(i in other_cols)
{
check_cols <- c(key_cols, i)
if(nrow(unique(DT[, check_cols, with = FALSE])) != key_nrow)
{
problematic_cols <- c(problematic_cols, i)
}
}
}
else
{
problematic_cols
}
problematic_cols
}
#' Capitalize the first letter of all words
#'
#' Make first letter of every word uppercase. It is advised to use a string with all lower case as input.
#'
#' @param x A character value (string) preferably lowercase string.
#'
#' @return A character value (string) with first letter of each word in capital.
#' @export
#'
#' @examples
#' simpleCap("watch the magic")
simpleCap <- function(x) {
s <- strsplit(x, " ")[[1]]
paste(toupper(substring(s, 1,1)), substring(s, 2),
sep="", collapse=" ")
}
#' Get month's starting date for a given date
#'
#' @param date A Date value.
#'
#' @return A Date value representing start of that month.
#' @export
#'
#' @examples
#' som(as.Date("2020-02-07"))
som <- function(date) {
mon <- as.numeric(format(date,'%m'))
year <- as.numeric(format(date,'%Y'))
as.Date(paste0(year, "-", mon, "-01"))
}
#' Get month's ending date for a given date
#'
#' @param date A Date value.
#'
#' @return A Date value representing end of that month.
#' @export
#'
#' @examples
#' eom(as.Date("2020-02-07"))
eom <- function(date) {
mon <- as.numeric(format(date,'%m')) + 1
year <- as.numeric(format(date,'%Y'))
year <- year + as.integer(mon==13) # if month was December add a year
mon[mon==13] <- 1
as.Date(paste0(year, "-", mon, "-01")) - 1
}
#' Get quarter's starting date for a given date
#'
#' @param date A Date value.
#'
#' @return A Date value representing start of that quarter.
#' @export
#'
#' @examples
#' soq(as.Date("2020-02-07"))
soq <- function(date) {
mon <- as.numeric(format(date,'%m'))
mon <- (((mon-1)%/%3)*3) + 1
year <- as.numeric(format(date,'%Y'))
as.Date(paste0(year, "-", mon, "-01"))
}
#' Get quarter's ending date for a given date
#'
#' @param date A Date value.
#'
#' @return A Date value representing end of that quarter.
#' @export
#'
#' @examples
#' eoq(as.Date("2020-02-07"))
eoq <- function(date) {
mon <- as.numeric(format(date,'%m'))
mon <- (((mon-1)%/%3)*3) + 4
year <- as.numeric(format(date,'%Y'))
year <- year + as.integer(mon==13) # if month was December add a year
mon[mon==13] <- 1
as.Date(paste0(year, "-", mon, "-01")) - 1
}
#' Get half year's starting date for a given date
#'
#' @param date A Date value.
#'
#' @return A Date value representing start of that half year.
#' @export
#'
#' @examples
#' soh(as.Date("2020-02-07"))
soh <- function(date) {
year <- as.numeric(format(date,'%Y'))
if(as.numeric(format(date,'%m')) > 6)
{
mon <- 7
}
else
{
mon <- 1
}
as.Date(paste0(year, "-", mon, "-01"))
}
#' Get half year's ending date for a given date
#'
#' @param date A Date value.
#'
#' @return A Date value representing end of that half year.
#' @export
#'
#' @examples
#' eoh(as.Date("2020-02-07"))
eoh <- function(date) {
if(as.numeric(format(date,'%m')) > 6)
{
mon <- 12
}
else
{
mon <- 6
}
mon <- mon + 1
year <- as.numeric(format(date,'%Y'))
year <- year + as.integer(mon==13) # if month was December add a year
mon[mon==13] <- 1
as.Date(paste0(year, "-", mon, "-01")) - 1
}
#' Get the sum from a numeric vector with NAs
#'
#' @param x A numeric vector.
#'
#' @return A numeric value representing sum of all the values excluding NAs.
#' @export
#'
#' @examples
#' sum_na(c(3,5,NA))
sum_na <- function(x) {
if(FALSE %in% is.na(x))
{
sum(as.numeric(x[!is.na(x)]))
}
else
{
as.numeric(NA)
}
}
#' Get the maximum number from a numeric vector with NAs
#'
#' @param x A numeric vector.
#'
#' @return A numeric value representing maximum of all the values excluding NAs.
#' @export
#'
#' @examples
#' maximum_num(c(3,5,NA))
maximum_num <- function(x) {
if(FALSE %in% is.na(x))
{
max(as.numeric(x[!is.na(x)]))
}
else
{
as.numeric(NA)
}
}
#' Append multiple files in a given folder into one
#'
#' Appends all the files in a folder. Files have to be either csv, xlsx or txt. Note that certain file types are ignored such as those end with .trc, .xlsm, .log and those containing ~$.
#' custom_md5_hash from package "zimplify" is used to generate md5 hash.
#'
#' @param folder_path A character string representing an absolute folder path.
#' @param sheet A character string representing name of the excel sheet.
#'
#' @return A data.table representing the combined data of all the files within that given folder.
#' @export
#'
#' @examples
file_append <- function(folder_path, sheet = NULL) {
output_file <- gsub(pattern = "^(.*)/$", replacement = "\\1", x = folder_path)
# file_list <- grep(pattern = "[^\\.log]$", x = grep(pattern = "^[^\\~\\$].*", x = list.files(path = folder_path), value = TRUE), value = TRUE)
all_files <- list.files(path = folder_path)
file_list <- all_files[!all_files %in% grep(pattern = "\\.trc$|\\.xlsm$|\\.log$|\\~\\$.*", x = all_files, value = TRUE)]
files_md5 <- zimplify::custom_md5_hash(path_files = paste0(output_file, "/", file_list))
if(file.exists(paste0(output_file, "_md5.csv")))
{
read_md5 <- data.table::fread(file = paste0(output_file, "_md5.csv"))
if(FALSE %in% (read_md5$x %in% files_md5))
{
reset_flag <- TRUE
}
else
{
reset_flag <- FALSE
}
if(FALSE %in% (files_md5 %in% read_md5$x))
{
update_flag <- TRUE
}
else
{
update_flag <- FALSE
}
}
else
{
read_md5 <- data.frame(x = character())
update_flag <- FALSE
reset_flag <- TRUE
}
if(!reset_flag)
{
if(file.exists(paste0(output_file, "_merged.feather")))
{
data_final <- feather::read_feather(path = paste0(output_file, "_merged.feather"))
data.table::setDT(data_final)
}
else
{
reset_flag <- TRUE
}
}
if(reset_flag | update_flag)
{
for(i in 1:length(file_list))
{
if((!files_md5[i] %in% read_md5$x) | reset_flag)
{
if(grepl(pattern = "\\.xlsx$", x = file_list[i])) {
read_file <- tibble::as_tibble(readxl::read_excel(path = paste0(folder_path, "/", file_list[i]), sheet = sheet, col_types = "text", .name_repair = "minimal"), .name_repair = "minimal")
#If the file is in the SAP BO raw format
if(read_file[1,1] == "Business Reporting and Analytics")
{
read_file <- tibble::as_tibble(readxl::read_excel(path = paste0(folder_path, "/", file_list[i]), sheet = sheet, col_types = "text", .name_repair = "minimal", skip = 14), .name_repair = "minimal")
read_file <- read_file[,-c(1:3)]
}
if(colnames(read_file)[1] == "" | is.na(colnames(read_file)[1]))
{
read_file[1,][is.na(read_file[1,])] <- ""
colnames(read_file) <- gsub(pattern = "^Snapshot Calender.*", replacement = "", x = colnames(read_file), ignore.case = TRUE)
col_names <- paste(colnames(read_file), read_file[1,])
for(j in 1:length(col_names))
{
read_file[1,j] <- col_names[j]
}
names(read_file) <- gsub(pattern = "^\\s*(.*)", replacement = "\\1", x = read_file[1,])
read_file <- read_file[-1,]
}
}
else if(grepl(pattern = "\\.csv$", x = file_list[i])) {
read_file <- tibble::as_tibble(readr::read_csv(file = paste0(folder_path, "/", file_list[i]), col_types = readr::cols(.default = readr::col_character())), .name_repair = "minimal")
}
else if(grepl(pattern = "\\.txt$", x = file_list[i])) {
read_file <- tibble::as_tibble(readLines(con = paste0(folder_path, "/", file_list[i]), n = 1), stringsAsFactors = FALSE, .name_repair = "minimal")
if(grepl(pattern = "\t", x = read_file[1,1]))
{
read_file <- tibble::as_tibble(utils::read.delim(file = paste0(folder_path, "/", file_list[i]), stringsAsFactors = FALSE), .name_repair = "minimal")
}
else
{
read_file <- tibble::as_tibble(readLines(con = paste0(folder_path, "/", file_list[i]), n = -1, warn = FALSE), stringsAsFactors = FALSE, .name_repair = "minimal")
colnames(read_file) <- "Lines"
}
}
else if(grepl(pattern = "\\.json$", x = file_list[i])) {
read_file <- jsonlite::fromJSON(txt = paste0(folder_path, "/", file_list[i]))
}
else if(grepl(pattern = "\\.feather$", x = file_list[i])) {
read_file <- feather::read_feather(path = paste0(folder_path, "/", file_list[i]))
}
#Set as data.table
data.table::setDT(read_file)
#Rename blank column names
col_rename <- which(colnames(read_file) == "")
if(length(col_rename))
{
colnames(read_file)[col_rename] <- paste0(colnames(read_file)[col_rename-1], " name")
}
if(i == 1 & reset_flag)
{
data_final <- read_file
}
else
{
cols_intersect <- intersect(colnames(data_final), colnames(read_file))
if(length(colnames(data_final)) == length(cols_intersect) & length(colnames(read_file)) == length(cols_intersect))
{
data_final <- data_final[,cols_intersect, with = FALSE]
read_file <- read_file[,cols_intersect, with = FALSE]
data_final <- rbind(data_final, read_file)
}
else
{
max_rows <- nrow(data_final) + nrow(read_file)
data_final <- merge(x = data_final, y = read_file, by = cols_intersect, all = TRUE)
if(nrow(data_final) > max_rows) {
stop("Rows in merged data are more than sum of rows in individual files")
}
}
}
}
}
feather::write_feather(x = data_final, path = paste0(output_file, "_merged.feather"))
if(file.exists(paste0(output_file, "_merged.csv")))
{
if(file.remove(paste0(output_file, "_merged.csv")))
{
data.table::fwrite(x = data_final, file = paste0(output_file, "_merged.csv"), row.names = FALSE)
}
else
{
data.table::fwrite(x = data_final, file = paste0(output_file, "_merged", format(x = Sys.time(), format = "-%Y-%m-%d-%H%M%S-locked"), ".csv"), row.names = FALSE)
}
}
else
{
data.table::fwrite(x = data_final, file = paste0(output_file, "_merged.csv"), row.names = FALSE)
}
}
data_final
}
#' Update the md5 hash once all the files in a given folder have processed
#'
#' A csv file with the same name as the folder name is created/updated with new md5 values. This file with md5 hash values and the given folder has same parent directory.
#'
#' @param folder_path A character string representing an absolute folder path.
#' @param md5_data A character vector representing md5 hash values of the files within the given folder.
#'
#' @return A logical value TRUE if the operation is successful.
#' @export
#'
#' @examples
update_md5_file <- function(folder_path, md5_data) {
dir.create(path = gsub(pattern = "([^/]*)$", replacement = "", x = folder_path), recursive = TRUE, showWarnings = FALSE)
output_file <- gsub(pattern = "^(.*)/$", replacement = "\\1", x = folder_path)
utils::write.csv(x = md5_data, file = paste0(output_file, "_md5.csv"), row.names = FALSE)
TRUE
}
#' Generate md5 hash of the files in a given folder
#'
#' Generates the md5 hash of the files in a given folder. Note that certain file types are ignored such as those end with .trc, .xlsm, .log and those containing ~$.
#' custom_md5_hash from package "zimplify" is used to generate md5 hash.
#'
#' @param folder_path A character string representing an absolute folder path.
#'
#' @return A character vector with md5 hash values of all the files within a given folder.
#' @export
#'
#' @examples
#' read_md5_folder(folder_path = getwd())
read_md5_folder <- function(folder_path) {
output_file <- gsub(pattern = "^(.*)/$", replacement = "\\1", x = folder_path)
all_files <- list.files(path = folder_path)
file_list <- all_files[!all_files %in% grep(pattern = "\\.trc$|\\.xlsm$|\\.log$|\\~\\$.*", x = all_files, value = TRUE)]
files_md5 <- zimplify::custom_md5_hash(path_files = paste0(output_file, "/", file_list))
files_md5
}
#' Update/create a file with the given data at a given path
#'
#' Updates data in a file based on the given path. Backs up the old file if the file with the same name is already present.
#' Appends the file name with the timestamp for the back up files.
#'
#' @param data A data.frame, tibble or data.table.
#' @param file_path A character string representing the path of the output file. Do not include file extension.
#' @param max_limit A numeric value representing the number of most recent back up files to keep. Default is 4.
#'
#' @return A logical value TRUE if the operation is successful.
#' @export
#'
#' @examples
update_output_file <- function(data, file_path, max_limit = 4) {
# if("try-error" %in% class(
# try(expr = write.csv(x = data, file = paste0(file_path, ".csv"), row.names = FALSE),
# silent = TRUE
# )))
dir.create(path = gsub(pattern = "([^/]*)$", replacement = "", x = file_path), recursive = TRUE, showWarnings = FALSE)
if(file.exists(paste0(file_path, ".feather")))
{
if(file.rename(from = paste0(file_path, ".feather"), to = paste0(file_path, format(x = Sys.time(), format = "-%Y-%m-%d-%H%M%S"), ".feather")))
{
feather::write_feather(x = data, path = paste0(file_path, ".feather"))
}
else
{
feather::write_feather(x = data, path = paste0(file_path, format(x = Sys.time(), format = "-%Y-%m-%d-%H%M%S-locked"), ".feather"))
}
}
else
{
feather::write_feather(x = data, path = paste0(file_path, ".feather"))
}
if(file.exists(paste0(file_path, ".csv")))
{
if(file.rename(from = paste0(file_path, ".csv"), to = paste0(file_path, format(x = Sys.time(), format = "-%Y-%m-%d-%H%M%S"), ".csv")))
{
data.table::fwrite(x = data, file = paste0(file_path, ".csv"), row.names = FALSE)
}
else
{
data.table::fwrite(x = data, file = paste0(file_path, format(x = Sys.time(), format = "-%Y-%m-%d-%H%M%S-locked"), ".csv"), row.names = FALSE)
}
}
else
{
data.table::fwrite(x = data, file = paste0(file_path, ".csv"), row.names = FALSE)
}
folder_path <- gsub(pattern = "^(.*)/.+$", replacement = "\\1", x = file_path)
file_name <- gsub(pattern = ".*/(.*)$", replacement = "\\1", x = file_path)
all_extra_csv_files <- sort(list.files(path = folder_path, pattern = paste0("^", file_name, "-.+\\.csv"), full.names = TRUE))
if(length(all_extra_csv_files) > max_limit)
{
files_to_remove <- all_extra_csv_files[1:(length(all_extra_csv_files)-max_limit)]
file.remove(files_to_remove)
}
all_extra_feather_files <- sort(list.files(path = folder_path, pattern = paste0("^", file_name, "-.+\\.feather"), full.names = TRUE))
if(length(all_extra_feather_files) > max_limit)
{
files_to_remove <- all_extra_feather_files[1:(length(all_extra_feather_files)-max_limit)]
file.remove(files_to_remove)
}
TRUE
}
#' Create/update log file with the logged data
#'
#' Log file is created or updated with the latest message along with the log timestamp.
#' Enhanced for multi-threading if multiple threads are trying to log messages at the same time.
#'
#' @param message A character string representing the message to log.
#' @param file_path A character string representing the path of the log file. Do not include file extension.
#'
#' @return A logical value TRUE if the operation is successful.
#' @export
#'
#' @examples
log_messages <- function(message, file_path) {
message_df <- tibble::tibble(Timestamp = as.character(Sys.time()), Message = message)
logs_lock <- filelock::lock(path = paste0(file_path, ".lock"), exclusive = TRUE, timeout = 5000)
if(file.exists(paste0(file_path, ".feather")))
{
existing_file <- feather::read_feather(path = paste0(file_path, ".feather"))
new_file <- rbind(message_df, existing_file)
if("try-error" %in% class(try(expr = feather::write_feather(x = new_file, path = paste0(file_path, ".feather")), silent = TRUE)))
{
feather::write_feather(x = new_file, path = paste0(file_path, format(x = Sys.time(), format = "-%Y-%m-%d-%H%M%S-locked"), ".feather"))
}
if("try-error" %in% class(try(expr = data.table::fwrite(x = new_file, file = paste0(file_path, ".csv")), silent = TRUE)))
{
data.table::fwrite(x = new_file, file = paste0(file_path, format(x = Sys.time(), format = "-%Y-%m-%d-%H%M%S-locked"), ".csv"))
}
}
else
{
feather::write_feather(x = message_df, path = paste0(file_path, ".feather"))
data.table::fwrite(x = message_df, file = paste0(file_path, ".csv"))
}
filelock::unlock(lock = logs_lock)
TRUE
}
#' Sync directory
#'
#' Create a mirror image of a directory. Files will be removed if they are not present in the source directory.
#'
#' @param source A character string representing the path of the source directory.
#' @param destination A character string representing the path of the destination directory. Default: Customer_Success/Input_Data/Software_Downloads_Raw.
#' @param ignore A character vector representing the directory paths to ignore. Default: c("Customer_Success/Input_Data/Software_Downloads_Raw").
#' @param delete A logical value representing whether to delete files/folders absent in source directory. Default is FALSE.
#'
#' @return A logical value TRUE if the operation is successful.
#' @export
#'
#' @examples
sync_directory <- function(source, destination, ignore = c("Customer_Success/Input_Data/Software_Downloads_Raw"), delete = FALSE) {
source <- gsub(pattern = "\\\\", replacement = "/", x = source)
source <- gsub(pattern = "^(.*)/$", replacement = "\\1", x = source)
if(!dir.exists(source)) {
stop("Source directory doesn't exist")
}
destination <- gsub(pattern = "\\\\", replacement = "/", x = destination)
destination <- gsub(pattern = "^(.*)/$", replacement = "\\1", x = destination)
if(!dir.exists(destination)) {
dir.create(destination, recursive = TRUE)
}
# source_dirs <- c(list.dirs(path = source, full.names = TRUE, recursive = TRUE), source)
source_dirs <- c(as.character(fs::dir_ls(path = source, recurse = TRUE, type = "directory")), source)
# source_filtered_dirs <- source_dirs[!grepl(pattern = "Customer_Success/Input_Data/Software_Downloads_Raw", x = source_dirs) & !grepl(pattern = "Software_Analytics/SA", x = source_dirs)]
source_filtered_dirs <- source_dirs[!grepl(pattern = paste0(ignore, collapse = " | "), x = source_dirs)]
destination_create_dirs <- gsub(pattern = source, replacement = destination, x = source_filtered_dirs)
source_global_files <- character()
if(length(source_filtered_dirs)) {
for(i in 1:length(source_filtered_dirs))
{
source_local_files <- list.files(path = source_filtered_dirs[i], full.names = TRUE, recursive = FALSE)
# source_local_files <- as.character(fs::dir_ls(path = source_filtered_dirs[i], recurse = FALSE, type = "file"))
source_local_files <- source_local_files[!grepl(pattern = "\\d{4}-\\d{2}-\\d{2}-\\d{6}\\.", x = source_local_files) & !dir.exists(source_local_files)]
# source_local_files <- source_local_files[!grepl(pattern = "\\d{4}-\\d{2}-\\d{2}-\\d{6}\\.", x = source_local_files) & !fs::dir_exists(source_local_files)]
source_global_files <- c(source_global_files, source_local_files)
}
}
source_dest_files <- gsub(pattern = source, replacement = destination, x = source_global_files)
source_dest_dirs <- gsub(pattern = source, replacement = destination, x = source_filtered_dirs)
# destination_dirs <- c(list.dirs(path = destination, full.names = TRUE, recursive = TRUE), destination)
destination_dirs <- c(as.character(fs::dir_ls(path = destination, recurse = TRUE, type = "directory")), destination)
# destination_filtered_dirs <- destination_dirs[!grepl(pattern = "Customer_Success/Input_Data/Software_Downloads_Raw", x = destination_dirs) & !grepl(pattern = "Software_Analytics/SA", x = destination_dirs)]
destination_filtered_dirs <- destination_dirs[!grepl(pattern = paste0(ignore, collapse = " | "), x = destination_dirs)]
destination_global_files <- character()
if(length(destination_filtered_dirs)) {
for(i in 1:length(destination_filtered_dirs))
{
destination_local_files <- list.files(path = destination_filtered_dirs[i], full.names = TRUE, recursive = FALSE)
# destination_local_files <- as.character(fs::dir_ls(path = destination_filtered_dirs[i], recurse = FALSE, type = "file"))
destination_local_files <- destination_local_files[!grepl(pattern = "\\d{4}-\\d{2}-\\d{2}-\\d{6}\\.", x = destination_local_files) & !dir.exists(destination_local_files)]
# destination_local_files <- destination_local_files[!grepl(pattern = "\\d{4}-\\d{2}-\\d{2}-\\d{6}\\.", x = destination_local_files) & !fs::dir_exists(destination_local_files)]
destination_global_files <- c(destination_global_files, destination_local_files)
}
}
if(delete) {
destination_delete_dirs <- destination_filtered_dirs[!destination_filtered_dirs %in% source_dest_dirs]
sapply(destination_delete_dirs, function(x) {if(fs::file_exists(path = x)) {fs::dir_delete(path = x)}})
destination_delete_files <- destination_global_files[!destination_global_files %in% source_dest_files]
destination_delete_files <- destination_delete_files[fs::file_exists(path = destination_delete_files)]
fs::file_delete(path = destination_delete_files)
}
# source_details <- paste0(file.size(source_global_files), file.mtime(source_global_files))
source_info <- fs::file_info(source_global_files)
source_details <- paste0(source_info$size, source_info$modification_time)
# destination_details <- paste0(file.size(source_dest_files), file.mtime(source_dest_files))
destination_info <- fs::file_info(source_dest_files)
destination_details <- paste0(destination_info$size, destination_info$modification_time)
file_table <- data.table::data.table(source_global_files = source_global_files, source_dest_files = source_dest_files,
source_details = source_details, destination_details = destination_details, source_mtime = source_info$modification_time)
modified_files <- file_table[!(file_table$source_details == file_table$destination_details), ]
sapply(destination_create_dirs[!dir.exists(destination_create_dirs)], dir.create)
# fs::dir_create(path = destination_create_dirs, recurse = TRUE)
if(nrow(modified_files)) {
for (j in 1:nrow(modified_files)) {
tryCatch(expr = {
fs::file_copy(path = modified_files$source_global_files[j], new_path = modified_files$source_dest_files[j], overwrite = TRUE)
fs::file_touch(path = modified_files$source_dest_files[j], modification_time = modified_files$source_mtime[j])
},
error = function(c) {
message(c)
}
)
}
}
TRUE
}
#' Update file
#'
#' Update file if the source file size is larger than the destination file.
#'
#' @param source A character string representing the path of the source file.
#' @param destination A character string representing the path of the destination file.
#'
#' @return A logical value TRUE if the operation is successful or appropriate message.
#' @export
#'
#' @examples
update_file <- function(source, destination) {
source <- gsub(pattern = "\\\\", replacement = "/", x = source)
destination <- gsub(pattern = "\\\\", replacement = "/", x = destination)
if(fs::is_file(source)) {
src_size <- file.info(source)$size
dest_size <- file.info(destination)$size
src_size[is.na(src_size)] <- 0
dest_size[is.na(dest_size)] <- 0
if(src_size > dest_size) {
tryCatch(expr = {
fs::file_copy(path = source, new_path = destination, overwrite = TRUE)
fs::file_touch(path = destination, modification_time = file.info(source)$mtime)
return(paste0("File successfully copied - ", destination))
},
error = function(c) {
return(paste0(c, " - ", destination))
}
)
} else {
return(paste0("Error: Source file size is smaller than or equal to destination - ", destination))
}
} else {
return(paste0("Error: Source is not a valid file path - ", destination))
}
}
#' Send email
#'
#' Send email programmatically
#'
#' @param from A valid email address of the sender.
#' @param to A character vector of recipient valid email addresses.Default: Siemens DISW Business Analytics <analytics_insights.sisw@siemens.com>
#' @param cc A character vector of recipient valid email addresses in cc.
#' @param bcc A character vector of recipient valid email addresses in bcc.
#' @param reply_to A valid email address for the reply.
#' @param subject Subject of the email.
#' @param body Body of the email as text. If the parameter body refers to an existing file location, the text of the file is parsed as body of the email.
#' @param html A boolean value indicating whether the body of the email should be parsed as HTML.
#' @param inline A boolean indicating whether images in the HTML file should be embedded inline.
#' @param smtp_server A character string indicating the SMTP server address.
#' @param port A numeric value indicating port.
#' @param attachments A character vector of paths in the file system linking to files or valid URLs to be attached to the email.
#' @param file_names An optional character vector of display names for the attached files.
#'
#' @return A logical value TRUE if the operation is successful.
#' @export
#'
#' @examples
#' send_email(to = "receiver@gmail.com")
send_email <- function(from = "Siemens DISW Business Analytics <analytics_insights.sisw@siemens.com>", to, cc = NULL, bcc = NULL, reply_to = NULL, subject = NULL, body = " ", html = FALSE, inline = TRUE, smtp_server = "cismtp.net.plm.eds.com", port = 25, attachments = NULL, file_names = NULL) {
mailR::send.mail(from = from,
to = to,
cc = cc,
bcc = bcc,
replyTo = reply_to,
subject = subject,
body = body,
html = html,
inline = inline,
smtp = list(host.name = smtp_server, port = port),
attach.files = attachments,
file.names = file_names,
authenticate = FALSE,
send = TRUE)
TRUE
}
#' Get latest timestamp
#'
#' Fetch the latest timestamp for the files in a particular folder
#'
#' @param folder_path A character string representing an absolute folder path.
#'
#' @return A character value of the latest file timestamp.
#' @export
#'
#' @examples
latest_timestamp_folder <- function(folder_path) {
all_files <- list.files(path = folder_path, full.names = TRUE, recursive = TRUE)
file_list <- all_files[!all_files %in% grep(pattern = "\\.trc$|\\.xlsm$|\\.log$|\\~\\$.*", x = all_files, value = TRUE)]
latest_mtime <- max(file.mtime(file_list))
as.character(latest_mtime)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.