# These are special purpose standardization functions that only make sense when
# used with established CSIS data. They standardize names and colors using
# lookup tables prepared specifically for these variables.
#
# You can learn more about package authoring with RStudio at:
#
# http://r-pkgs.had.co.nz/
#
# Some useful keyboard shortcuts for package authoring:
#
# Build and Reload Package: 'Ctrl + Shift + B'
# Check Package: 'Ctrl + Shift + E'
# Test Package: 'Ctrl + Shift + T'
#***********************Standardize Variable Names
#' Standardize variable names
#'
#' @param data the data frame to be joined
#' @param path the location of the lookup file
#' @param var the variable names to standardize; by default all will be done
#' @param replace_special whether to replaces spaces and special characters in column names with periods
#'
#' @return Data with standardized variable names.
#'
#' @details This function is designed to prepare CSIS data files for lookup
#' application. It primarily smooths out variation between different ways we've
#' written SQL statements. It relies on a pre-existing table of variant variable names.
#' The variable names are matched against that table in a case insensitive manner,
#' though no other procedural standardization is applied at this time.
#'
#' @examples FullData<-standardize_variable_names(
#' FullData,
#' Path)
#'
#' @export
standardize_variable_names<- function(data,
path = "https://raw.githubusercontent.com/CSISdefense/Lookup-Tables/master/style/",
var = NULL,
replace_special = FALSE
){
#Take out spaces
#If there are two blank rows because of SQL server messages, remove them.
if(nrow(data)==0) stop("No rows of data.")
if(all(is.na(data[(nrow(data)-1):nrow(data),])))
data<-data[1:(nrow(data)-2),]
if(!file.exists(file.path(path,"Lookup_StandardizeVariableNames.csv")) || path=="offline")
path<-file.path(get_local_lookup_path(),"style//")
if(replace_special==TRUE){
#First cover any special character names we always have parsed.
standardize_variable_names(data,path,var,replace_special = FALSE)
colnames(data)<-make.names(colnames(data))
# colnames(data)<-gsub("[ ()&*/-]|\r\n",".",colnames(data))
}
if(!is.null(var) & any(!var %in% colnames(data)))
stop(paste(var," is not present in colnames(data)."))
if(is.data.frame(path))
stop("path parameter is a data frame, it should be a file path, e.g. 'https://raw.githubusercontent.com/CSISdefense/Lookup-Tables/master/style/'.")
if(!is.data.frame(data))
stop("data parameter is a not data frame, it should be.")
#Remove nonsense characters sometimes added to start of the input file
data<-remove_bom(data)
#Consider removing non-alphanumerics _s .s etc.
if(is.null(var)) var<-colnames(data)
#***Standardize variable names
NameList<-read.csv(
paste(
path,
"Lookup_StandardizeVariableNames.csv",sep=""),
header=TRUE, sep=",", na.strings=c("NA","NULL",""), dec=".", strip.white=TRUE,
stringsAsFactors=FALSE
)
if(any(is.na(NameList$Original))){
paste(NameList$Repalcement[is.na(NameList$Original)])
stop("Blank row in Original column")
}
if(any(is.na(NameList$Replacement))){
paste(NameList$Original[is.na(NameList$Replacement)])
stop("Blank row in Replacement column")
}
# NameList<-subset(NameList,toupper(Original) %in% toupper(colnames(data)))
for(x in 1:nrow(NameList)){
#Limits it to names in var, if passed. By default, all will be covered.
if(toupper(NameList$Original[[x]]) %in% toupper(var)){
colnames(data)[toupper(colnames(data))==toupper(NameList$Original[[x]])]<-
NameList$Replacement[[x]]
}
}
if(any(duplicated(colnames(data)))){
stop(paste("Duplicated columns (",colnames(data)[duplicated(colnames(data))],
") after name standardization"))
}
data
}
#' Returns data in the appropriate format for the user-specified plot
#'
#' @param data The data to format for the plot, as a tibble
#' @param x_var x-axis
#' @param y_var y-axis
#' @param breakout the facets or other divisions that will be grouped by when summing
#' @param aggregate aggregation function; defaults to sum
#'
#' @return A tibble of formatted data
#'
#' @details Transforms data by summarizing on key display variable
#'
#'
#'
#' @export
group_data_for_plot <-function(
data, # data to format for the plot, as a tibble
x_var,
y_var,
breakout,
aggregate ="sum"
#
# Returns:
# a tibble of formatted data
){
# account for potential spaces in breakout and x_var
# note that this doesn't test for whether quotes already exist
if(!y_var %in% colnames(data)) stop(paste("y_var: ",y_var,"is missing from data."))
if(all(is.na(breakout))) breakout<-NULL
if(grepl(" ", x_var)) x_var <- paste0("`", x_var, "`")
if(!x_var %in% colnames(data)) stop(paste("x_var: ",x_var,"is missing from data."))
if(length(breakout) >= 1){
if(grepl(" ", breakout[1])) breakout[1] <- paste0("`", breakout[1], "`")
if(!breakout[1] %in% colnames(data)) stop(paste("breakout[1]: ",breakout[1],"is missing from data."))
}
if(length(breakout) >= 2){
if(grepl(" ", breakout[2])) breakout[2] <- paste0("`", breakout[2], "`")
if(!breakout[2] %in% colnames(data)) stop(paste("breakout[2]: ",breakout[2],"is missing from data."))
}
if(length(breakout) == 3){
if(grepl(" ", breakout[3])) breakout[3] <- paste0("`", breakout[3], "`")
if(!breakout[3] %in% colnames(data)) stop(paste("breakout[3]: ",breakout[3],"is missing from data."))
}
data<-data %>% filter(!is.na(!! as.name(y_var)))
# aggregate to the level of [fiscal year x breakout]
# the evaluation for dplyr::summarize_ was a pain in the ass to figure out;
# see stack overflow at https://tinyurl.com/z82ywf3
agg_list<-c(breakout,x_var)
agg_list<-agg_list[!duplicated(agg_list)]
if(aggregate=="sum"){
if(length(agg_list) == 1){
data <- data %>%
dplyr::group_by(!! as.name(agg_list)) %>%
summarize_(
agg_val = lazyeval::interp(~sum(var, na.rm = TRUE), var = as.name(y_var)))
} else {
data <- data %>%
dplyr::group_by_(.dots = c(agg_list)) %>%
summarize_(
agg_val = lazyeval::interp(~sum(var, na.rm = TRUE), var = as.name(y_var)))
}
} else if (aggregate=="mean"){
if(length(agg_list) == 1){
data <- data %>%
dplyr::group_by(as.name(!! as.name(agg_list))) %>%
summarize_(
agg_val = lazyeval::interp(~mean(var, na.rm = TRUE), var = as.name(y_var)))
} else {
data <- data %>%
group_by_(.dots = c(agg_list)) %>%
summarize_(
agg_val = lazyeval::interp(~mean(var, na.rm = TRUE), var = as.name(y_var)))
}
} else (stop(paste("group_data_for_plot does not know how to handle aggregate = ",aggregate)))
names(data)[which(names(data) == "agg_val")] <- y_var
return(data)
}
#' Returns data in the appropriate format for the user-specified plot
#'
#' @param data A data frame to format for the plot, as a tibble
#' @param fy_var The fiscal year variable, as string
#' @param y_var The variable to be plotted on the y-axis
#' @param share If TRUE, calculates the share as a percentage
#' @param start_fy Start fiscal year
#' @param end_fy End fiscal Year
#' @param color_var Coloration variable, as string
#' @param facet_var Facet variable, as string
#' @param second_var Facet variable, as string
#' @param labels_and_colors A csis360 lookup data.frame with factor information
#' @param group If TRUE aggregate
#' @param drop_missing_labels If TRUE, drop levels to avoid residual levels from labels_and_colors.
#' @param add_ytextposition If TRUE, add a ytextposition numerical position to aid in adding text to a graph
#'
#' @return Returns a tibble of formatted data
#'
#' @export
format_data_for_plot <- function(data, fy_var,
y_var,
share = FALSE,
start_fy = NA,
end_fy = NA,
color_var="None",
facet_var="None",
second_var=NULL,
alpha_var=NULL,
labels_and_colors=NULL,
group=TRUE,
drop_missing_labels=TRUE,
add_ytextposition=FALSE
#wide=FALSE #' @param wide If TRUE, pivot_wider using the fy_var and arrange for table output
){
shown_data <- data
if(all(!is.null(second_var),facet_var==second_var | second_var=="None")) second_var<-NULL
if(all(!is.null(alpha_var),facet_var==alpha_var | alpha_var=="None")) alpha_var<-NULL
breakout <- c(color_var, facet_var, second_var, alpha_var)
breakout <- breakout[breakout != "None"]
breakout <- breakout[!is.null(breakout)]
breakout <- breakout[!duplicated(breakout)]
if(group){
shown_data<-group_data_for_plot(
shown_data,
fy_var,
y_var,
breakout
)
}
shown_data<-as.data.frame(shown_data)
if(!is.na(start_fy) & !is.na(end_fy)){
# filter by year - see https://tinyurl.com/lm2u8xs
if(is.numeric(shown_data[,fy_var])){
shown_data <-shown_data %>%
filter_(paste0(fy_var, ">=", as.character(start_fy), "&", fy_var,
"<=", as.character(end_fy)))
} else{
shown_data <-shown_data %>%
filter_(paste0("between(year(",fy_var, "),", as.character(start_fy),
",", as.character(end_fy),")"))
}
}
#
# NOTE: NAs replaced with 0 here; potential data quality issue
#
if(color_var!="None")
shown_data <- shown_data %>% replace_nas_with_unlabeled(color_var)
if(facet_var!="None")
shown_data <- shown_data %>% replace_nas_with_unlabeled(facet_var)
if(!is.null(second_var))
shown_data <- shown_data %>% replace_nas_with_unlabeled(second_var)
shown_data[is.na(shown_data)] <- 0
# calculate shares if share checkbox is checked
if(share == TRUE){
if (color_var != "None"){
# share_vars indicates which columns are being used to calculate the shares.
share_list <- c(facet_var,second_var)
if(color_var!="None") #For histograms or the like, fy_var (really x_var) should not be included in grouping
share_list <- c(share_list,fy_var)
share_list <- share_list[!share_list %in% c("None",color_var)]
if(length(share_list) == 1){
shown_data <- shown_data %>%
dplyr::group_by(!! as.name(share_list)) %>%
mutate_(
agg_val = lazyeval::interp(~var/sum(var, na.rm = TRUE), var = as.name(y_var)))
} else {
shown_data <- shown_data %>%
dplyr::group_by_(.dots = c(share_list)) %>%
mutate_(
agg_val = lazyeval::interp(~var/sum(var, na.rm = TRUE), var = as.name(y_var)))
}
shown_data<-shown_data[,colnames(shown_data)!=y_var]
colnames(shown_data)[colnames(shown_data) == "agg_val"] <- y_var
# if(length(share_list)==0)
# share_vars <- c(-1)
# else if (length(share_list)==1)
# share_vars <- c(-1,-2)
# else
# share_vars <- c(-1,-2, -3)
# spread the shares breakout variable across multiple columns
# shown_data<-shown_data %>%
# tidyr::spread(color_var, y_var)
#
# NOTE: NAs replaced with 0 here; potential data quality issue
#
# calculate a total for each row - i.e. the total for the shares breakout
# variable for each fiscal year,
# or for each [fiscal year x facet variable] combo
# shown_data$total <- rowSums(shown_data[share_vars],na.rm=TRUE)
# divide each column by the total column, to get each column as shares
# shown_data[share_vars] <-
# sapply(shown_data[share_vars], function(x){x / shown_data$total})
# shown_data <- shown_data %>% dplyr::select(-total)
# gather the data back to long form
# shown_data <- gather_(
# data = shown_data,
# key_col = color_var,
# value_col = y_var,
# gather_cols = names(shown_data[share_vars])
# )
}
# For the case where the user displays shares not broken out by any variable.
# This is going to make a very boring chart of 100% shares,
# but it's handled here to avoid displaying an error.
if(color_var == "None"){
shown_data<-shown_data %>%
mutate(total = 1)
shown_data <- shown_data[which(names(shown_data) != y_var)]
names(shown_data)[which(names(shown_data) == "total")] <- y_var
}
}
shown_data<-as.data.frame(shown_data)
if(!is.null(labels_and_colors)){
if(color_var!="None"){
if(!color_var %in% labels_and_colors$column) warning("color_var missing from labels_and_colors")
else{
if(!all(unlist(unique(shown_data[,color_var])) %in%
c(subset(labels_and_colors,column==color_var)$variable,"Unlabeled"))){
print(unlist(unique(shown_data[,color_var]))[
!unlist(unique(shown_data[,color_var])) %in% subset(labels_and_colors,column==color_var)$variable])
stop(paste("color_var:",color_var,"is missing labels within labels_and_colors"))
}
shown_data <- shown_data %>% replace_nas_with_unlabeled(color_var)
shown_data[,colnames(shown_data)==color_var]<-
ordered(shown_data[,colnames(shown_data)==color_var],
levels=subset(labels_and_colors,column==color_var)$variable,
labels=subset(labels_and_colors,column==color_var)$Label)
}
}
if(facet_var!="None" & color_var != facet_var){
if(!facet_var %in% labels_and_colors$column) warning("facet_var missing from labels_and_colors")
else{
shown_data[,colnames(shown_data)==facet_var]<-
ordered(shown_data[,colnames(shown_data)==facet_var],
levels=subset(labels_and_colors,column==facet_var)$variable,
labels=subset(labels_and_colors,column==facet_var)$Label
)
}
}
if(all(!is.null(second_var), color_var != second_var)){
if(!second_var %in% labels_and_colors$column) warning("second_var missing from labels_and_colors")
else{
shown_data[,colnames(shown_data)==second_var]<-
ordered(shown_data[,colnames(shown_data)==second_var],
levels=subset(labels_and_colors,column==second_var)$variable,
labels=subset(labels_and_colors,column==second_var)$Label
)
}
}
#If x-axis variable is a factor
if((is.factor(shown_data[,colnames(shown_data)==fy_var])|is.character(shown_data[,colnames(shown_data)==fy_var])) &
fy_var %in% labels_and_colors$column &
!fy_var %in% c(color_var,facet_var,second_var)){
if(length(subset(labels_and_colors,column==fy_var)$variable)==0)
stop(paste("label_and_colors is missing values for x_var:",fy_var))
shown_data[,colnames(shown_data)==fy_var]<-
ordered(shown_data[,colnames(shown_data)==fy_var],
levels=subset(labels_and_colors,column==fy_var)$variable,
labels=c(subset(labels_and_colors,column==fy_var)$Label)
)
}
if(drop_missing_labels==TRUE)
shown_data<-droplevels(shown_data)
# if(wide)
# shown_data<-ordered(shown_data[,colnames(shown_data)]0
shown_data
}
#Add numbers once everything is properly ordered.
if(add_ytextposition){
agg_list<-c(facet_var, second_var,fy_var)
agg_list <- agg_list[agg_list != "None"]
agg_list <- agg_list[!is.null(agg_list)]
agg_list<-agg_list[!duplicated(agg_list)]
if(length(agg_list) == 1){
shown_data<-shown_data %>%
dplyr::arrange(desc(!! as.name(color_var))) %>%
dplyr::group_by(as.name(!! as.name(agg_list))) %>%
mutate_(
ytextposition = lazyeval::interp(~cumsum(var)-0.5*var, var = as.name(y_var)))
} else {
shown_data <- shown_data %>%
dplyr::arrange(desc(!! as.name(color_var))) %>%
dplyr::group_by_(.dots = c(agg_list)) %>%
mutate_(
ytextposition = lazyeval::interp(~cumsum(var)-0.5*var, var = as.name(y_var)))
}
}
# return the ggplot-ready data
return(shown_data)
}
#' Returns data in the appropriate format for the user-specified plot
#'
#' @param data data frame
#' @param period_var The variable with the period designations, grouped into those periods
#' @param y_var The name of variable to plot on y-axis
#' @param breakout Facet and/or color; everything that is to be grouped by for retention
#' @param labels_and_colors A csis360 lookup data.frame with factor information
#'
#' @return Returns the average of the year entries across each period
#'
#' @export
format_period_average <- function(
data,
period_var, #The variable with the period designations, one per entry
y_var,
breakout, #Facet and/or color
labels_and_colors
)
{
breakout <- breakout[breakout != "None"]
data<-group_data_for_plot(
data,
period_var,
y_var,
breakout,
aggregate="mean"
)
data
}
#######Log setting 0s and negatives to NA
#' Set non-positive values to na and then log.
#'
#' @param x A list of numbers
#'
#' @return The list of number logs, with 0 and negative set to NA
#'
#' @details This is a function to use when the data should never
#' be 0s or negatives. It saves the step of setting them to na.
#'
#' @examples x<-c(0,2,3,-4); transform_contract(x)
#'
#' @export
na_non_positive_log<-function(x){
x[x<=0]<-NA
log(x)
}
#***********************Standardize Variable Names
#' Transform contract names
#'
#' @param contract A contract dataset
#'
#' @return contract dataset ready for statistical analysis.
#'
#' @details This function is designed to prepare CSIS data files for lookup
#' application. It primarily smooths out variation between different ways we've
#' written SQL statements. It relies on a pre-existing table of variant names.
#' The var names are matched against that table in a case insensitive manner,
#' though no other procedural standardization is applied at this time.
#'
#' @examples transform_contract(def)
#'
#' @import dplyr
#' @import lubridate
#' @import tidyverse
#' @import Hmisc
#' @export
transform_contract<-function(
contract
){
contract<-standardize_variable_names(contract)
if("Action_Obligation" %in% colnames(contract))
contract$Action_Obligation <- as.numeric(contract$Action_Obligation)
if("Number.Of.Actions" %in% colnames(contract))
contract$Number.Of.Actions <- as.numeric(contract$Number.Of.Actions )
create_naics2<-function(NAICS){
NAICS2<-substring(NAICS,1,2)
NAICS2[NAICS2 %in% c('31','32','33')]<-'31-33'
NAICS2[NAICS2 %in% c('44','45')]<-'44-45'
NAICS2[NAICS2 %in% c('48','49')]<-'48-49'
NAICS2<-factor(NAICS2)
NAICS2
}
# contract$pNewWorkUnmodifiedBaseAndAll<-as.numeric(as.character(contract$pNewWorkUnmodifiedBaseAndAll))
#Newwork and change
# contract$pNewWork3Sig<-round(
# contract$pNewWorkUnmodifiedBaseAndAll,3)
cap<-function(column,cap){
column[column>cap]<-cap
column
}
#Customer
if(!"Is.Defense" %in% colnames(contract) & "Who" %in% colnames(contract)){
contract$Is.Defense<-as.character(contract$Who)
contract$Is.Defense[contract$Is.Defense %in%
c("Air Force","Army",
"Navy","Other DoD","Uncategorized" )
]<-"Defense"
contract$Is.Defense<-factor(contract$Is.Defense)
contract$Who[contract$Who=="Uncategorized"]<-NA
#b_ODoD
# contract$b_ODoD<-contract$Who
# levels(contract$b_ODoD)<- list("1"=c("Other DoD"),
# "0"=c("Air Force","Army","Navy"))
# contract$b_ODoD<-as.integer(as.character(contract$b_ODoD))
# contract$ODoD<-contract$Who
# levels(contract$ODoD)<- list("Military Departments"=c("Air Force","Army","Navy"),
# "Other DoD"=c("Other DoD"))
}
#SumOfisChangeOrder
if("SumOfisChangeOrder" %in% colnames(contract))
contract$qNChg <- Hmisc::cut2(contract$SumOfisChangeOrder,c(1,2,3))
if("What" %in% colnames(contract))
contract$What[contract$What=="Unlabeled"]<-NA
#PSR_What
if("PSR_What" %in% colnames(contract)){
contract$PSR_What<-factor(paste(as.character(contract$PSR),
as.character(contract$What),sep="."))
}
#b_Term
if("Term" %in% colnames(contract)){
#Not sure why Term was swapped to binary, but fixing it.
if(!is.factor(contract$Term)) contract$Term<-factor(contract$Term)
levels(contract)<- list("Partial or Complete Termination"=c("Partial or Complete Termination","Terminated","1",1),
"Unterminated"=c("Unterminated","0",0))
contract$b_Term<-if_else(contract$Term %in% c("Partial or Complete Termination","Terminated",1),1,NA)
contract$b_Term[contract$Term %in% c("Unterminated",0)]<-0
#Create a jittered version of Term for display purposes
#Unlike geom_jitter, this caps values at 0 and 1
contract$j_Term<-jitter_binary(contract$b_Term)
}
#Ceiling Breach
#b_CBre
if("CBre" %in% colnames(contract)){
contract$b_CBre<-if_else(contract$CBre=="Ceiling Breach",1,NA)
contract$b_CBre[contract$CBre=="None"]<-0
#Create a jittered version of CBre for display purposes
#Unlike geom_jitter, this caps values at 0 and 1
contract$j_CBre<-jitter_binary(contract$b_CBre)
}
#Overrides
contract<-read_and_join_experiment( contract,
"CSIS_contract_inspection.csv",
path="https://raw.githubusercontent.com/CSISdefense/Lookup-Tables/master/",
directory="contract/",
by=c("CSIScontractID"),
# add_var=c("EntityID","UnmodifiedEntityID"),
new_var_checked=FALSE,
create_lookup_rdata=FALSE
)
#Ceilings
if ("UnmodifiedCeiling" %in% colnames(contract) ){
#Set entries to NA when we've inspected them and found them to be wrong.
contract$UnmodifiedCeiling[contract$override_unmodified_ceiling==TRUE]<-NA
#Deflate the dolla figures
contract<-deflate(contract,
money_var = "Action_Obligation",
# deflator_var="OMB.2019",
fy_var="StartFY"
)
contract<-deflate(contract,
money_var = "UnmodifiedCeiling",
# deflator_var="OMB.2019",
fy_var="StartFY"
)
#cln_Ceil
contract$cln_Ceil<-arm::rescale(na_non_positive_log(contract$UnmodifiedCeiling_OMB20_GDP18))
# lowroundedcutoffs<-c(15000,100000,1000000,30000000)
highroundedcutoffs<-c(15000,100000,1000000,10000000,75000000)
# contract$qLowCeiling <- Hmisc::cut2(contract$UnmodifiedCeiling_OMB20_GDP18,cuts=lowroundedcutoffs)
contract$qHighCeiling <- Hmisc::cut2(contract$UnmodifiedCeiling_OMB20_GDP18,cuts=highroundedcutoffs)
rm(highroundedcutoffs)#lowroundedcutoffs,
if (all(levels(contract$qHighCeiling)[1:5]==c("[0.00e+00,1.50e+04)",
"[1.50e+04,1.00e+05)",
"[1.00e+05,1.00e+06)",
"[1.00e+06,1.00e+07)",
"[1.00e+07,7.50e+07)"))|
all(levels(contract$qHighCeiling)[1:5]==c("[0.0e+00,1.5e+04)",
"[1.5e+04,1.0e+05)",
"[1.0e+05,1.0e+06)",
"[1.0e+06,1.0e+07)",
"[1.0e+07,7.5e+07)"))
){
contract$qHighCeiling<-factor(contract$qHighCeiling,
levels=levels(contract$qHighCeiling),
labels=c("[0,15k)",
"[15k,100k)",
"[100k,1m)",
"[1m,10m)",
"[10m,75m)",
"[75m+]")
)
}
# if (all(levels(contract$qLowCeiling)[1:4]==c("[0.00e+00,1.50e+04)",
# "[1.50e+04,1.00e+05)",
# "[1.00e+05,1.00e+06)",
# "[1.00e+06,3.00e+07)"))){
# contract$qLowCeiling<-factor(contract$qLowCeiling,
#
# levels=c("[0.00e+00,1.50e+04)",
# "[1.50e+04,1.00e+05)",
# "[1.00e+05,1.00e+06)",
# "[1.00e+06,3.00e+07)",
# levels(contract$qLowCeiling)[5]),
# labels=c("[0,15k)",
# "[15k,100k)",
# "[100k,1m)",
# "[1m,30m)",
# "[30m+]"),
# ordered=TRUE
# )
# }
contract<-contract %>% group_by(qHighCeiling) %>%
mutate(ceil.median.wt = median(UnmodifiedCeiling_OMB20_GDP18))
if (identical(levels(contract$qHighCeiling),c("[0,15k)",
"[15k,100k)",
"[100k,1m)",
"[1m,10m)",
"[10m,75m)",
"[75m+]"
))){
contract$Ceil.Simple<-contract$qHighCeiling
levels(contract$Ceil.Simple)<- list("0k - <100k"=c("[15k,100k)",
"[0,15k)"),
"100k - <10m"=c("[1m,10m)",
"[100k,1m)"),
"10m+"=c("[75m+]",
"[10m,75m)"))
contract$Ceil.Big<-contract$qHighCeiling
levels(contract$Ceil.Big)<- list("0k - <100k"=c("[15k,100k)",
"[0,15k)"),
"100k - <10m"=c("[1m,10m)",
"[100k,1m)"),
"10m - <75m"=c("[10m,75m)"),
"75m+"=c("[75m+]"))
contract$Ceil.1m<-contract$qHighCeiling
levels(contract$Ceil.1m)<- list("0k - <1m"=c("[0,15k)",
"[15k,100k)",
"[100k,1m)"
),
"1m - <10m"=c("[1m,10m)"),
"10m - <75m"=c("[10m,75m)"),
"75m+"=c("[75m+]"))
} else if (identical(levels(contract$qHighCeiling),c("0 - <15k",
"15k - <100k",
"100k - <1m",
"1m - <10m",
"10m - <75m",
"75m+"
))){
contract$Ceil.Simple<-contract$qHighCeiling
levels(contract$Ceil.Simple)<- list("0k - <100k"=c("15k - <100k",
"0 - <15k"),
"100k - <10m"=c("1m - <10m",
"100k - <1m"),
"10m+"=c("75m+",
"10m - <75m"))
contract$Ceil.Big<-contract$qHighCeiling
levels(contract$Ceil.Big)<- list("0k - <100k"=c("15k - <100k",
"0 - <15k"),
"100k - <10m"=c("1m - <10m",
"100k - <1m"),
"10m - <75m"=c("10m - <75m"),
"75m+"=c("75m+"))
contract$Ceil.1m<-contract$qHighCeiling
levels(contract$Ceil.1m)<- list("0k - <1m"=c("15k - <100k",
"0 - <15k",
"100k - <1m"),
"1m - <10m"=c("1m - <10m"),
"10m - <75m"=c("10m - <75m"),
"75m+"=c("75m+"))
}
#ChangeOrderCeilingGrowth
if("ChangeOrderCeilingGrowth" %in% colnames(contract)){
#Set entries to NA when we've inspected them and found them to be wrong.
if(!"n_CBre" %in% colnames(contract)) stop("n_CBre is missing. Rerun the relevant create dataset file.")
contract$n_CBre[contract$override_change_order_growth==TRUE]<-NA
if(min(contract$n_CBre,na.rm=TRUE)>0) stop("1 has been added to n_CBre. Fix this before proceeding.")
contract$p_CBre<-(contract$n_CBre/
contract$UnmodifiedCeiling_Then_Year)
contract$p_CBre[
is.na(contract$p_CBre) & contract$b_CBre==0]<-0
contract<-deflate(contract,
money_var = "n_CBre",
# deflator_var="OMB.2019",
fy_var="StartFY"
)
contract$pChange3Sig<-round(
contract$p_CBre,3)
contract$qCrai <- Hmisc::cut2(
contract$p_CBre,c(
0,
0.001,
0.15)
)
#lp_CBre
contract$lp_CBre<-na_non_positive_log(contract$p_CBre)
#ln_CBre
contract$ln_CBre_Then_Year<-na_non_positive_log(contract$n_CBre_Then_Year)
contract$ln_CBre_OMB20_GDP18<-na_non_positive_log(contract$n_CBre_OMB20_GDP18)
}
# if ("NewWorkUnmodifiedBaseAndAll" %in% colnames(contract) ){
# contract$pNewWorkUnmodifiedBaseAndAll<-contract$NewWorkUnmodifiedBaseAndAll/
# contract$UnmodifiedCeiling_Then_Year
# contract$pNewWorkUnmodifiedBaseAndAll[
# is.na(contract$pNewWorkUnmodifiedBaseAndAll) & contract$SumOfisChangeOrder==0]<-0
# contract$pNewWorkUnmodifiedBaseAndAll<-as.numeric(as.character(contract$pNewWorkUnmodifiedBaseAndAll))
# contract$pChange3Sig<-round(
# contract$pNewWorkUnmodifiedBaseAndAll,3)
# }
}
if ("UnmodifiedCurrentCompletionDate" %in% colnames(contract) ){
contract$UnmodifiedCurrentCompletionDate<-as.Date(contract$UnmodifiedCurrentCompletionDate)
}
#Rename standardization
colnames(contract)[colnames(contract)=="Dur"]<-"qDuration"
#l_Days
if("UnmodifiedDays" %in% colnames(contract)){
contract$UnmodifiedDays[contract$UnmodifiedDays<0]<-NA
contract$capped_UnmodifiedDays <- if_else(contract$UnmodifiedDays > 3650, 3650, contract$UnmodifiedDays)
contract$cln_Days<-arm::rescale(na_non_positive_log(contract$capped_UnmodifiedDays))
contract$UnmodifiedYearsFloat<-contract$UnmodifiedDays/365.25
contract$UnmodifiedYearsCat<-floor(contract$UnmodifiedYearsFloat)
#Break the count of days into four categories.
if (!"qDuration" %in% colnames(contract)){
contract$qDuration<-Hmisc::cut2(contract$UnmodifiedDays,cuts=c(61,214,366,732))
}
if (levels(contract$qDuration)[[2]]=="[ 61, 214)"){
levels(contract$qDuration)<- list(
"[0 months,~2 months)"=c("[ 0, 61)","[ 1, 61)"),
"[~2 months,~7 months)"="[ 61, 214)",
"[~7 months-~1 year]"="[ 214, 366)",
"(~1 year,~2 years]"="[ 366, 732)",
"(~2 years+]"=levels(contract$qDuration)[5])
}
contract$qDuration[contract$UnmodifiedYearsCat<0]<-NA
contract$Dur.Simple<-contract$qDuration
levels(contract$Dur.Simple)<- list(
"<~1 year"=c("[0 months,~2 months)","[~2 months,~7 months)","[~7 months-~1 year]"),
"(~1 year,~2 years]"="(~1 year,~2 years]",
"(~2 years+]"="(~2 years+]")
}
#n_Fixed
if("FxCb" %in% colnames(contract)){
contract$n_Fixed<-contract$FxCb
levels(contract$n_Fixed)<- list("1"=c("Fixed-Price","Fixed"),
"0.5"=c("Combination or Other","Combo/Other"),
"0"=c("Cost-Based","Cost"))
levels(contract$FxCb)<- list("Fixed"=c("Fixed-Price","Fixed"),
"Combo/Other"=c("Combination or Other","Combo/Other"),
"Cost"=c("Cost-Based","Cost"))
contract$n_Fixed<-as.numeric(as.character(contract$n_Fixed))
#n_Incent
contract$n_Incent<-contract$Fee
levels(contract$n_Incent) <-
list("1"=c("Incentive"),
"0.5"=c("Combination"),
"0"=c("Award Fee", "FFP or No Fee", "Fixed Fee", "Other Fee"))
contract$n_Incent<-as.numeric(as.character(contract$n_Incent))
#n_NoFee
contract$n_NoFee<-contract$Fee
levels(contract$n_NoFee) <-
list("1"=c("FFP or No Fee"),
"0.5"=c("Combination"),
"0"=c("Award Fee", "Incentive", "Fixed Fee", "Other Fee"))
contract$n_NoFee<-as.numeric(as.character(contract$n_NoFee))
contract$Pricing<-as.character(contract$FxCb )
summary(contract$Fee)
summary(factor(contract$Pricing))
contract$Pricing[contract$Pricing %in% c("Fixed","Fixed-Price") & contract$Fee=="FFP or No Fee"]<-"FFP"
contract$Pricing[contract$Pricing %in% c("Fixed","Fixed-Price") & contract$Fee!="FFP or No Fee"]<-"Other FP"
contract$Pricing[contract$Pricing %in% c("Cost","Cost-Based") & contract$Fee=="Other Fee"]<-"T&M/LH/FPLOE"
contract$Pricing[contract$Pricing %in% c("Combo/Other")]<-"Combination or Other"
contract$Pricing[contract$Pricing %in% c("Cost")]<-"Cost-Based"
contract$Pricing<-factor(contract$Pricing,c("FFP","Other FP","Combination or Other",
"Cost-Based","T&M/LH/FPLOE"))
summary(contract$Fee)
summary(factor(contract$Pricing))
contract$PricingFee<-as.character(contract$Pricing)
contract$PricingFee[contract$Fee=="Incentive"]<-"Incentive"
# contract$PricingFee[contract$PricingFee %in% c("Other FP","FFP")] <-"Other FP"
contract$PricingFee[contract$PricingFee %in% c("Cost-Based")] <-"Other CB"
# contract$PricingFee<-factor(contract$PricingFee,c("Other FP","Incentive",
# "Combination or Other",
# "Other CB","T&M/LH/FPLOE"))
contract$PricingFee<-factor(contract$PricingFee,c("FFP","Other FP","Incentive",
"Combination or Other",
"Other CB","T&M/LH/FPLOE"))
summary(contract$PricingFee)
# summary(factor(contract$PricingFee))
contract$PricingUCA<-as.character(contract$PricingFee)
contract$PricingUCA[is.na(contract$UCA)]<-NA
# summary(factor(contract$PricingUCA))
contract$PricingUCA[contract$UCA=="UCA"]<-"UCA"
contract$PricingUCA<-factor(contract$PricingUCA,c("FFP","Other FP","Incentive",
"Combination or Other",
"Other CB","T&M/LH/FPLOE","UCA"))
summary(contract$PricingUCA)
}
#Competition
if("Comp" %in% colnames(contract)){
#Right now comp is not actually a factor, so don't need to process it
contract$b_Comp<-contract$Comp #Fix in Rdata, and add back comp
levels(contract$b_Comp) <-
list("0"="No Comp.",
"1"="Comp.")
contract$b_Comp<-as.integer(as.character(contract$b_Comp))
#n_Comp
# contract$n_Comp<-contract$EffComp #Fix in Rdata, and add back comp
# levels(contract$n_Comp) <-
# list("0"="No Comp.",
# "0.5"="1 offer",
# "1"="2+ offers")
# contract$n_Comp<-as.numeric(as.character(contract$n_Comp))
contract$q_Offr<-Hmisc::cut2(contract$UnmodifiedNumberOfOffersReceived,c(2,3,5))
levels(contract$q_Offr) <-
list("1"=c("1"," 1"),
"2"=c("2"," 2"),
"3-4"=c("[ 3, 5)"),
"5+"=c("[ 5,999]")
)
#Set number of offers =1 when there is a NA and no competition
#This seems to be redundant, but no harm in it.
contract$q_Offr[is.na(contract$q_Offr)&
!is.na(contract$b_Comp)&
contract$b_Comp==0
]<-"1"
contract$CompOffr<-as.character(contract$q_Offr)
contract$CompOffr[contract$b_Comp==0 & !is.na(contract$b_Comp)]<-"No Competition"
contract$CompOffr[is.na(contract$b_Comp)]<-NA
contract$CompOffr<-factor(contract$CompOffr)
levels(contract$CompOffr) <-
list("No Competition"="No Competition",
"1 offer"="1",
"2 offers"="2",
"3-4 offers"="3-4",
"5+ offers"="5+")
#l_Offr
contract$l_Offr<-na_non_positive_log(contract$UnmodifiedNumberOfOffersReceived)
# contract$cn_Offr<-arm::rescale(contract$nq_Offr)
# contract$cln_Offr<-arm::rescale(contract$l_Offr)
#Urgency
contract$b_Urg<-NA
contract$b_Urg<-if_else(contract$Urg=="Urgency Except.",1,NA)
contract$b_Urg[contract$Urg=="Not Urgency"]<-0
contract$NoComp<-NA
contract$NoComp<-if_else(contract$Urg=="Urgency Except.","Urgency",NA)
contract$NoComp[contract$Urg=="Not Urgency"]<-"Other No"
contract$NoComp[contract$b_Comp==1]<-"Any Comp."
contract$NoComp<-factor(contract$NoComp,
c("Any Comp.","Other No","Urgency"))
contract$NoCompOffr<-contract$CompOffr
levels(contract$NoCompOffr) <-
list("No Competition"="No Competition",
"1 offer"="1 offer",
"2-4 offers"=c("2 offers","3-4 offers"),
"5+ offers"="5+ offers")
contract$NoCompOffr<-as.character(contract$NoCompOffr)
contract$NoCompOffr[is.na(contract$NoComp) |
contract$NoComp!="Any Comp."]<-
as.character(contract$NoComp[is.na(contract$NoComp) |
contract$NoComp!="Any Comp."])
contract$NoCompOffr<-factor(contract$NoCompOffr,c(
c("Other No",
"Urgency",
"1 offer",
"2-4 offers",
"5+ offers"
)
))
contract$Comp1or5<-contract$CompOffr
levels(contract$Comp1or5)<-
list("No Competition"="No Competition",
"1 offer"="1 offer",
"2-4 offers"=c("2 offers","3-4 offers"),
"5+ offers"="5+ offers")
summary(contract$Comp1or5)
}
else if ("Offr" %in% colnames(contract) & !"Comp1or5" %in% colnames(contract)){
contract$Comp1or5<-as.character(contract$EffComp)
contract$Comp1or5[!is.na(contract$Comp1or5)&
contract$Comp1or5=="2+ offers"]<-
as.character(contract$Offr[!is.na(contract$Comp1or5)&
contract$Comp1or5=="2+ offers"])
contract$Comp1or5<-factor(contract$Comp1or5)
levels(contract$Comp1or5)<-
list("No Comp."=c("No Competition","No Comp."),
"1 offer"=c("1 offer","1 Offer"),
"2-4 offers"=c("2 offers","3-4 offers","2","3-4"),
"5+ offers"=c("5+ offers","5+"))
}
if("Intl" %in% colnames(contract)){
#b_Intl
contract$Intl <- factor(contract$Intl,
c("Just U.S.", "Any International")) #Manually remove "NA" from levels of variable Intl
levels(contract$Intl)<- list("Just U.S."=c("Just U.S."),
"Any Intl."=c("Any Intl.","Any International"))
contract$b_Intl<-contract$Intl
contract$b_Intl[contract$b_Intl=="Unlabeled"]<-NA
levels(contract$b_Intl) <-
list("0"=c("Just U.S."),
"1"=c("Any Intl.","Any International"))
contract$b_Intl<-as.integer(as.character(contract$b_Intl))
}
if("UCA" %in% colnames(contract)){
#b_UCA
contract$b_UCA<-contract$UCA
levels(contract$b_UCA) <-
list("0"=c("Not UCA"),
"1"=c("UCA"))
contract$b_UCA<-as.integer(as.character(contract$b_UCA))
}
#
# if(!"Is.Defense" %in% colnames(contract)){
# contract$Is.Defense<-as.character(contract$Who)
# contract$Is.Defense[contract$Is.Defense %in%
# c("Air Force","Army",
# "Navy","Other DoD","Uncategorized" )
# ]<-"Defense"
# contract$Is.Defense<-factor(contract$Is.Defense)
# }
if("Veh" %in% colnames(contract)){
levels(contract$Veh)[levels(contract$Veh)=="SINGLE AWARD IDC"]<-"S-IDC"
levels(contract$Veh)[levels(contract$Veh)=="MULTIPLE AWARD IDC"]<-"M-IDC"
levels(contract$Veh)[levels(contract$Veh)=="def_detail/Pur"]<-"Def/Pur"
contract$Veh<-factor(contract$Veh,c("Def/Pur",
"S-IDC",
"M-IDC",
"FSS/GWAC",
"BPA/BOA"))
}
if("Crisis" %in% colnames(contract)){
#Crisis Dataset
# contract$ARRA<-0
# contract$ARRA[contract$MaxOfDecisionTree=="ARRA"]<-1
# contract$Dis<-0
# contract$Dis[contract$MaxOfDecisionTree=="Disaster"]<-1
# contract$OCO<-0
# contract$OCO[contract$MaxOfDecisionTree=="OCO"]<-1
contract$Crisis<-factor(contract$Crisis)
levels(contract$Crisis) <-
list( "Other"=c( "Other","Excluded"),
"ARRA"=c("ARRA"),
"Dis"=c("Dis","Disaster"),
"OCO"=c("OCO"))
contract$Crisis[is.na(contract$Crisis)]<-"Other"
}
#Calendar Year
if("MinOfSignedDate" %in% colnames(contract)){
contract$StartCY<-lubridate::year(contract$MinOfSignedDate)
}
#NAICS
#Note that this must be placed a new in each repository.
#In theory we could store a version in csis360, something to consider for the future.
local_semi_clean_path<-"..\\data\\semi_clean\\"
if(!dir.exists(local_semi_clean_path)& dir.exists("data\\semi_clean\\"))
local_semi_clean_path<-"data\\semi_clean\\"
else if(!dir.exists(local_semi_clean_path))
stop("Don't know where local_semi_clean directory is")
if("NAICS" %in% colnames(contract) & "StartCY" %in% colnames(contract) ){
naics.file<-NA
#Vendor repository location
if(file.exists("../output/naics_join.Rdata")) naics.file<-"../output/naics_join.Rdata"
else if(file.exists("output/naics_join.Rdata")) naics.file<-"output/naics_join.Rdata"
else if(file.exists(paste(local_semi_clean_path,"naics_join.Rdata",sep="")))
naics.file<-paste(local_semi_clean_path,"naics_join.Rdata",sep="")
else if(file.exists("../data/clean/naics_join.Rdata")) naics.file<-"../data/clean/naics_join.Rdata"
else if(file.exists("data/clean/naics_join.Rdata")) naics.file<-"data/clean/naics_join.Rdata"
if(!is.na(naics.file)){
load(naics.file)
# contract<-left_join(contract,NAICS_join, by=c("StartFY"="StartFY",
# "NAICS"="NAICS_Code"))
contract$NAICS<-as.integer(as.character(contract$NAICS))
contract$NAICS5<-as.integer(substr(contract$NAICS,1,5))
contract$NAICS4<-as.integer(substr(contract$NAICS,1,4))
contract$NAICS3<-as.integer(substr(contract$NAICS,1,3))
contract$NAICS2<-create_naics2(contract$NAICS)
#This critical NAICS6 split in 2 from 2012 to 2017 and would prevent analysis of 7% of obligations if not reunited.
contract$NAICS[substr(contract$NAICS,1,5)==54171 &
!is.na(contract$NAICS)]<-54171
if(!"def6_HHI_lag1" %in% colnames(contract))
contract<-left_join(contract,NAICS6_join, by=c("StartCY"="CalendarYear",
"NAICS"="NAICS6"))
if(!"def5_HHI_lag1" %in% colnames(contract))
contract<-left_join(contract,NAICS5_join, by=c("StartCY"="CalendarYear",
"NAICS5"="NAICS5"))
if(!"def4_HHI_lag1" %in% colnames(contract))
contract<-left_join(contract,NAICS4_join, by=c("StartCY"="CalendarYear",
"NAICS4"="NAICS4"))
if(!"def3_HHI_lag1" %in% colnames(contract))
contract<-left_join(contract,NAICS3_join, by=c("StartCY"="CalendarYear",
"NAICS3"="NAICS3"))
if(!"def2_HHI_lag1" %in% colnames(contract))
contract<-left_join(contract,NAICS2_join, by=c("StartCY"="CalendarYear",
"NAICS2"="NAICS2"))
#Remove 0s, they make no sense, source must be one contractors in field have 0 obligations, which is just missing data really
contract$def6_HHI_lag1[contract$def6_HHI_lag1==0]<-NA
contract$cn_def6_HHI_lag1<-arm::rescale(contract$def6_HHI_lag1)
contract$l_def6_HHI_lag1<-na_non_positive_log(contract$def6_HHI_lag1)
contract$cln_Def6HHI<-arm::rescale(contract$l_def6_HHI_lag1)
contract$def5_HHI_lag1[contract$def5_HHI_lag1==0]<-NA
contract$cn_def5_HHI_lag1<-arm::rescale(contract$def5_HHI_lag1)
contract$l_def5_HHI_lag1<-na_non_positive_log(contract$def5_HHI_lag1)
contract$cln_def5_HHI_lag1<-arm::rescale(contract$l_def5_HHI_lag1)
contract$def4_HHI_lag1[contract$def4_HHI_lag1==0]<-NA
contract$cn_def4_HHI_lag1<-arm::rescale(contract$def4_HHI_lag1)
contract$l_def4_HHI_lag1<-na_non_positive_log(contract$def4_HHI_lag1)
contract$cln_def4_HHI_lag1<-arm::rescale(contract$l_def4_HHI_lag1)
contract$def3_HHI_lag1[contract$def3_HHI_lag1==0]<-NA
contract$cn_def3_HHI_lag1<-arm::rescale(contract$def3_HHI_lag1)
contract$l_def3_HHI_lag1<-na_non_positive_log(contract$def3_HHI_lag1)
contract$cln_Def3HHI<-arm::rescale(contract$l_def3_HHI_lag1)
contract$def2_HHI_lag1[contract$def2_HHI_lag1==0]<-NA
contract$cn_def2_HHI_lag1<-arm::rescale(contract$def2_HHI_lag1)
contract$l_def2_HHI_lag1<-na_non_positive_log(contract$def2_HHI_lag1)
contract$cln_def2_HHI_lag1<-arm::rescale(contract$l_def2_HHI_lag1)
contract$capped_def6_ratio_lag1<-cap(contract$def6_ratio_lag1,1)
contract$clr_Def6toUS<-arm::rescale(na_non_positive_log(contract$capped_def6_ratio_lag1))
contract$capped_def5_ratio_lag1<-cap(contract$def5_ratio_lag1,1)
contract$clr_Def5toUS<-arm::rescale(na_non_positive_log(contract$capped_def5_ratio_lag1))
contract$capped_def4_ratio_lag1<-cap(contract$def4_ratio_lag1,1)
contract$clr_Def4toUS<-arm::rescale(na_non_positive_log(contract$capped_def4_ratio_lag1))
contract$capped_def3_ratio_lag1<-cap(contract$def3_ratio_lag1,1)
contract$clr_Def3toUS<-arm::rescale(na_non_positive_log(contract$capped_def3_ratio_lag1))
contract$capped_def2_ratio_lag1<-cap(contract$def2_ratio_lag1,1)
contract$clr_Def2toUS<-arm::rescale(na_non_positive_log(contract$capped_def2_ratio_lag1))
contract$l_def6_obl_lag1<-na_non_positive_log(contract$def6_obl_lag1)
contract$cln_Def6Obl<-arm::rescale(contract$l_def6_obl_lag1)
contract$l_def5_obl_lag1<-na_non_positive_log(contract$def5_obl_lag1)
contract$cln_def5_obl_lag1<-arm::rescale(contract$l_def5_obl_lag1)
contract$l_def4_obl_lag1<-na_non_positive_log(contract$def4_obl_lag1)
contract$cln_def4_obl_lag1<-arm::rescale(contract$l_def4_obl_lag1)
contract$l_def3_obl_lag1<-na_non_positive_log(contract$def3_obl_lag1)
contract$cln_def3_obl_lag1<-arm::rescale(contract$l_def3_obl_lag1)
contract$l_def2_obl_lag1<-na_non_positive_log(contract$def2_obl_lag1)
contract$cln_def2_obl_lag1<-arm::rescale(contract$l_def2_obl_lag1)
contract$cln_US6_avg_sal_lag1<-arm::rescale(na_non_positive_log(contract$US6_avg_sal_lag1))
contract$cln_US5_avg_sal_lag1<-arm::rescale(na_non_positive_log(contract$US5_avg_sal_lag1))
contract$cln_US4_avg_sal_lag1<-arm::rescale(na_non_positive_log(contract$US4_avg_sal_lag1))
contract$cln_US3_avg_sal_lag1<-arm::rescale(na_non_positive_log(contract$US3_avg_sal_lag1))
contract$cln_US2_avg_sal_lag1<-arm::rescale(na_non_positive_log(contract$US2_avg_sal_lag1))
}
}
colnames(contract)[colnames(contract)=="ProductOrServiceCode"]<-"ProdServ"
if("ProdServ" %in% colnames(contract)){
contract$ProdServ[contract$ProdServ==""]<-NA
contract<-read_and_join_experiment( contract,
"ProductOrServiceCodes.csv",
path="https://raw.githubusercontent.com/CSISdefense/Lookup-Tables/master/",
directory="",
by=c("ProdServ"="ProductOrServiceCode"),
add_var=c("Simple",
"ProductServiceOrRnDarea",
"ProductOrServiceArea",
"HostNation3Category",
"CrisisProductOrServiceArea",
"ProductOrServiceCodeText"
),
new_var_checked=FALSE,
lookup_char_as_factor=TRUE)
contract$ProductServiceOrRnDarea<-factor(contract$ProductServiceOrRnDarea)
contract$ProductOrServiceArea<-factor(contract$ProductOrServiceArea)
contract$HostNation3Category<-factor(contract$HostNation3Category)
contract$CrisisProductOrServiceArea<-gsub(" & ","+",contract$CrisisProductOrServiceArea) #Shortening slightly.
contract$CrisisProductOrServiceArea<-factor(contract$CrisisProductOrServiceArea)
contract$ProductOrServiceCodeText<-factor(contract$ProductOrServiceCodeText)
}
#Office
colnames(contract)[colnames(contract)=="ContractingOfficeCode"]<-"Office"
if("Office" %in% colnames(contract)){
contract<-read_and_join_experiment( contract,
"Office.ContractingOfficeCode.txt",
path="https://raw.githubusercontent.com/CSISdefense/Lookup-Tables/master/",
directory="office\\",
by=c("Office"="ContractingOfficeCode"),
add_var=c("ContractingOfficeName","PlaceIntlPercent","CrisisPercent"),
new_var_checked=FALSE,
lookup_char_as_factor=TRUE,
guess_max=50000)
colnames(contract)[colnames(contract)=="PlaceIntlPercent"]<-"OffIntl"
contract$OffPlace<-Hmisc::cut2(contract$OffIntl,c(0.01,0.50))
levels(contract$OffPlace) <-
list("US99"=c("[0.00,0.01)"),
"Mixed"=c("[0.01,0.50)"),
"Intl"=c("[0.50,1.00]"))
colnames(contract)[colnames(contract)=="CrisisPercent"]<-"OffCri"
contract$c_OffCri<-arm::rescale(contract$OffCri)
if("Intl" %in% colnames(contract)){
contract$Reach6<-factor(paste(contract$OffPlace,contract$Intl,sep="-"))
levels(contract$Reach6) <-
list( "US99-Dom"=c("US99-Just U.S."),
"Mixed-Dom"=c("Mixed-Just U.S."),
"Intl-Dom"=c("Intl-Just U.S."),
"US99-Intl"=c("US99-Any International","US99-Any Intl."),
"Mixed-Intl"=c("Mixed-Any International","Mixed-Any Intl."),
"Intl-Intl"=c("Intl-Any International","Intl-Any Intl."))
contract$Reach<-contract$Reach6
levels(contract$Reach) <-
list( "US50-Dom"=c("US99-Just U.S.","Mixed-Just U.S."),
"Mixed-Dom"=c(),
"Intl-Dom"=c("Intl-Just U.S."),
"US50-Intl"=c("Mixed-Any International","Mixed-Any Intl.","US99-Any International","US99-Any Intl."),
"Intl-Intl"=c("Intl-Any International","Intl-Any Intl."))
}
if(file.exists(paste(local_semi_clean_path,"Office.sp_OfficeHistoryCapacityLaggedConst.txt",sep=""))){
contract<-read_and_join_experiment( contract,
"Office.sp_OfficeHistoryCapacityLaggedConst.txt",
path="",
directory=local_semi_clean_path,
by=c("Office"="ContractingOfficeCode",
"StartFY"="Fiscal_Year"),
add_var=c("office_obligatedamount_1year",
"office_numberofactions_1year",
"office_PBSCobligated_1year",
"office_obligatedamount_7year"),
new_var_checked=FALSE,
create_lookup_rdata=TRUE,
lookup_char_as_factor=TRUE
)
contract$office_numberofactions_1year[is.na(contract$office_numberofactions_1year)]<-0
contract$office_obligatedamount_7year[is.na(contract$office_obligatedamount_7year) |
contract$office_obligatedamount_7year<0]<-0
contract$office_obligatedamount_1year[is.na(contract$office_obligatedamount_1year) |
contract$office_obligatedamount_1year<0]<-0
contract$office_PBSCobligated_1year[is.na(contract$office_PBSCobligated_1year)|
contract$office_PBSCobligated_1year<0]<-0
contract$pPBSC<-contract$office_PBSCobligated_1year/contract$office_obligatedamount_1year
contract$pPBSC[contract$office_obligatedamount_1year==0]<-0
contract$pPBSC[contract$pPBSC>1]<-1
contract$office_numberofactions_1year[is.na(contract$Office)]<-NA
contract$office_obligatedamount_7year[is.na(contract$Office)]<-NA
contract$office_obligatedamount_1year[is.na(contract$Office)]<-NA
contract$office_PBSCobligated_1year[is.na(contract$Office)]<-NA
contract$pPBSC[is.na(contract$Office)]<-NA
contract$cln_OffCA<-arm::rescale(log(contract$office_numberofactions_1year+1))
contract$cln_OffObl7<-arm::rescale(log(contract$office_obligatedamount_7year+1))
# summary(contract$l_OffVol)
# summary(contract$cln_OffObl7)
#
contract$cp_OffPerf7<-arm::rescale(contract$pPBSC)
}
if("ProdServ" %in% colnames(contract) &
file.exists(paste(local_semi_clean_path,"Office.sp_ProdServOfficeHistoryLaggedConst.txt",sep=""))){
contract<-read_and_join_experiment( contract,
"Office.sp_ProdServOfficeHistoryLaggedConst.txt",
path="",
directory=local_semi_clean_path,
by=c("Office"="ContractingOfficeCode",
"StartFY"="Fiscal_Year",
"ProdServ"="ProductOrServiceCode"),
add_var=c("office_psc_obligatedamount_7year"),
new_var_checked=FALSE,
col_types="ccddddc",
create_lookup_rdata=TRUE)
# summary(contract$office_psc_obligatedamount_7year)
contract$office_psc_obligatedamount_7year[is.na(contract$office_psc_obligatedamount_7year)|
contract$office_psc_obligatedamount_7year<0]<-0
contract$pOffPSC<-contract$office_psc_obligatedamount_7year/contract$office_obligatedamount_7year
contract$pOffPSC[contract$office_obligatedamount_7year==0]<-0
contract$pOffPSC[contract$pOffPSC>1]<-1
# summary(contract$pOffPSC)
contract$office_psc_obligatedamount_7year[is.na(contract$Office) |
is.na(contract$ProdServ)]<-NA
contract$pOffPSC[is.na(contract$Office) |
is.na(contract$ProdServ)]<-NA
contract$cp_OffPSC7<-arm::rescale(contract$pOffPSC)
}
# summary(contract$l_OffVol)
# summary(contract$cln_OffObl7)
#
if("EntityID" %in% colnames(contract)){
contract<-read_and_join_experiment( contract,
"Office.sp_EntityIDofficeHistoryLaggedConst.txt",
path="",
directory=local_semi_clean_path,
by=c("EntityID"="EntityID",
"Office"="ContractingOfficeCode",
"StartFY"="Fiscal_Year"),
add_var=c("office_entity_paircount_7year","office_entity_numberofactions_1year",
"office_entity_obligatedamount_7year"),
new_var_checked=FALSE,
create_lookup_rdata=TRUE,
lookup_char_as_factor=TRUE)
# summary(contract$EntityID)
# summary(contract$office_entity_numberofactions_1year)
# summary(contract$office_entity_paircount_7year)
# summary(contract$office_entity_obligatedamount_7year)
contract$office_entity_numberofactions_1year[is.na(contract$office_entity_numberofactions_1year)&
!is.na(contract$EntityID)&!is.na(contract$Office)]<-0
contract$office_entity_paircount_7year[is.na(contract$office_entity_paircount_7year)&
!is.na(contract$EntityID)&!is.na(contract$Office)]<-0
contract$office_entity_obligatedamount_7year[(is.na(contract$office_entity_obligatedamount_7year)|
contract$office_entity_obligatedamount_7year<0)&
!is.na(contract$EntityID)&!is.na(contract$Office)]<-0
contract$pMarket<-contract$office_entity_obligatedamount_7year/contract$office_obligatedamount_7year
contract$pMarket[contract$office_obligatedamount_7year==0 &
!is.na(contract$EntityID)&!is.na(contract$Office)]<-0
contract$pMarket[contract$pMarket>1]<-1
contract<-deflate(contract,
money_var = "office_entity_obligatedamount_7year",
# deflator_var="OMB.2019",
fy_var="StartFY"
)
contract$cln_PairObl7<-arm::rescale(log(contract$office_entity_obligatedamount_7year_OMB20_GDP18+1))
#*********** Options Growth
# summary(contract$pMarket)
contract$cp_PairObl7<-arm::rescale(contract$pMarket)
contract$l_pairCA<-log(contract$office_entity_numberofactions_1year+1)
contract$cln_PairCA<-arm::rescale(contract$l_pairCA)
contract$cn_PairHist7<-arm::rescale(contract$office_entity_paircount_7year)
}
colnames(contract)[colnames(contract)=="ContractingOfficeCode"]<-"Office"
}
#Base and Options
if("UnmodifiedBase" %in% colnames(contract)){
contract$UnmodifiedBase[contract$UnmodifiedBase<=0]<-NA
contract$UnmodifiedBase[contract$override_unmodified_base==TRUE]<-NA
contract<-deflate(contract,
money_var = "UnmodifiedBase",
# deflator_var="OMB.2019",
fy_var="StartFY"
)
contract$Ceil2Base<-contract$UnmodifiedCeiling_Then_Year/contract$UnmodifiedBase_Then_Year
contract$Ceil2Base[contract$Ceil2Base<1 | !is.finite(contract$Ceil2Base)]<-NA
contract$clr_Ceil2Base<-arm::rescale(log(contract$Ceil2Base))
contract$cln_Base<-arm::rescale(na_non_positive_log(contract$UnmodifiedBase_OMB20_GDP18))
if("n_OptGrowth" %in% colnames(contract)){
contract$n_OptGrowth[contract$override_exercised_growth==TRUE]<-NA
contract$p_OptGrowth<-contract$n_OptGrowth/contract$UnmodifiedBase_Then_Year+1
contract$lp_OptGrowth<-log(contract$p_OptGrowth)
contract<-deflate(contract,
money_var = "n_OptGrowth",
# deflator_var="OMB.2019",
fy_var="StartFY"
)
#*********** Options Growth
contract$ln_OptGrowth_OMB20_GDP18<-log(contract$n_OptGrowth_OMB20_GDP18)
contract$Opt<-NA
contract$Opt[contract$AnyUnmodifiedUnexercisedOptions==1]<-"Available Options"
# contract$Opt[contract$AnyUnmodifiedUnexercisedOptions==1& contract$n_OptGrowth_Then_Year>0]<-"Option Growth"
# contract$Opt[(contract$AnyUnmodifiedUnexercisedOptions==1)& contract$n_OptGrowth_Then_Year==0]<-"Not Some Growth"
contract$Opt[contract$AnyUnmodifiedUnexercisedOptions==0]<-"Initial Base=Ceiling"
contract$Opt[contract$UnmodifiedBase_Then_Year>contract$UnmodifiedCeiling_Then_Year]<-NA
contract$Opt<-factor(contract$Opt,levels=c("Initial Base=Ceiling","Available Options"))
}
}
if("Crisis" %in% colnames(contract) &
file.exists(paste(local_semi_clean_path,"ProductOrServiceCode.ProdServHistoryCFTEcoalesceLaggedConst.txt",sep=""))){
# summary(contract$Crisis)
contract$OCO_GF<-contract$Crisis
levels(contract$OCO_GF)<-
list("GF"=c("Other","ARRA","Dis"),
"OCO"="OCO")
# summary(contract$OCO_GF)
contract<-read_and_join_experiment( contract,
"ProductOrServiceCode.ProdServHistoryCFTEcoalesceLaggedConst.txt",
path="",
directory=local_semi_clean_path,
by=c("StartFY"="Fiscal_Year",
"OCO_GF"="OCO_GF",
"ProdServ"="ProductOrServiceCode"),
add_var=c("CFTE_Rate_1year"),
new_var_checked=FALSE,
lookup_char_as_factor=TRUE,
guess_max=100000)
# summary(contract$CFTE_Rate_1year)
contract$l_CFTE<-log(contract$CFTE_Rate_1year)
contract$cln_PSCrate<-arm::rescale(contract$l_CFTE)
}
if("Action_Obligation" %in% colnames(contract)){
contract$ObligationWT<-contract$Action_Obligation
contract$ObligationWT[contract$ObligationWT<0]<-NA
}
if("Action_Obligation_Then_Year" %in% colnames(contract)){
contract$ObligationWT_Then_Year<-contract$Action_Obligation_Then_Year
contract$ObligationWT_Then_Year[contract$ObligationWT_Then_Year<0]<-NA
}
#Removing l_s just to reduce size. They can be derived easily.
contract<-contract[!colnames(contract) %in% colnames(contract)[grep("^l_",colnames(contract))]]
contract<-contract[!colnames(contract) %in% colnames(contract)[grep("^capped_l_",colnames(contract))]]
contract
}
#' Update a sample using a larger data frame.
#'
#' @param smp A data frame of contracts ready for statistical analysis, which must contain CSIScontractID.
#' @param full A data frame of contracts with no key missing data and which must contain CSIScontractID.
#' @param col Speific columns to add, if blank, add all in full missing from sample
#' @param drop_and_replace If true, drop rows from sample missing from full. Then replace them with new rows from full.
#'
#' @return The updated sample
#'
#' @details This is a function that updates samples using an updated
#' version of the population, e.g. new columns, and adds them to
#' existing samples. This might be used if a new column has been added
#' from SQL or if NA values are found in a oolumn being used in.
#' This isn't appropriate if the larger being drawn from has changed
#' in make up, for example adding a new years data.
#'
#' @examples update_sample_col_CSIScontractID(smp,def[complete,],drop_and_replace=TRUE)
#'
#' @export
update_sample_col_CSIScontractID<-function(smp,
full,
col=NULL,
drop_and_replace=FALSE){
if(is.null(full)) stop("full variable is null")
if(is.null(smp)) stop("smp variable is null")
if(nrow(smp)==0) stop("No observations in smp")
if(nrow(full)==0) stop("No observations in full")
#If column(s) are specified
if(!is.null(col)){
toadd<-full[,colnames(full) %in% c("CSIScontractID",col)]
smp<-smp[,!colnames(smp) %in% col]
}
#If no column(s) specified, add all missing columns.
else{
full<-full %>% group_by()
toadd<-full[,!colnames(full) %in% colnames(smp) | colnames(full)=="CSIScontractID"]
}
if(drop_and_replace==FALSE){
missing<-sum(!smp$CSIScontractID %in% full$CSIScontractID)
if(missing>0) stop(paste("There are",missing,"rows in smp not present in full"))
if(ncol(toadd)==1) stop("No columns to add")
smp<-left_join(smp,toadd)
}
else{
original_l<-nrow(smp)
smp<-inner_join(smp,toadd, by="CSIScontractID")
rm(toadd)
missing_l<-original_l-nrow(smp)
if(missing_l>0){
full<-full[,colnames(full) %in% colnames(smp)]
if(ncol(full)<ncol(smp)){
print(paste(colnames(smp)[!colnames(smp) %in% colnames(full)]))
stop("Full is missing columns present in sample")
}
full<-full[!full$CSIScontractID %in% smp$CSIScontractID,]
smp<-dplyr::bind_rows(smp,full[sample(nrow(full),missing_l),])
if(nrow(smp)!=original_l) stop("Mismatched rowcount. Too few in full? This shouldn't happen.")
#
warning(paste(missing_l, "rows removed and replaced due to absence from full"))
}
}
smp
}
#***********************Check Key
#' Check Key
#'
#' @param x the data frame to be checked
#' @param key list of one or more column names that are suspected to be the key
#'
#' @return True if the columns are unique identifiers, false with warning otherwise.
#'
#' @details This function is uesed to check if a provided set of columns act as
#' unique identifiers. This is often particularly valuable before merging two
#' sets of tables, though it has a variety of other uses.
#'
#' @export
check_key<-function(x,key){
if(!all(key %in% colnames(x))){
stop(paste("Key(s) missing from data frame: ",key[!key %in% colnames(x)],"\n"))
}
dupe<-sum(duplicated(x[,key]))
if(dupe>0){
warning(paste("Using pk list (",paste(key,collapse=", "),")",dupe,"out of",nrow(x),"are duplicated"))
return(FALSE)
}
else{
return(TRUE)
}
}
#***********************All Duplicates
#' All Duplicates
#'
#' @param x the data frame to be checked
#' @param key list of one or more column names that are suspected to be the key
#'
#' @return All instances of rows where the primary key is duplicated
#'
#' @details Duplicated just returns the 2nd row of duplicates and doesn't have
#' a trivially easy way of just checking for duplicates in primary keys rather
#' than all rows, this function covers both.
#'
#' @export
all_duplicate<-function(x,key=NULL){
if(is.null(key)) key<-colnames(x)
x[duplicated(x[,key])|duplicated(x[,key],fromLast=TRUE),]
}
#***********************Check Derived
#' Check Derived
#'
#' @param x the data frame to be checked
#' @param key list of one or more column names that are suspected to be the key
#' @param derived_col check whether this variable only varies with the primary key
#' @param na.rm whether to ignore na derived columns
#'
#' @return True if the derived_col varies only with primary key.
#'
#' @details A derived column is one that could be consolidated only to the primary keys
#' and the derived column
#'
#' @export
check_derived<-function(x,key,derived_col,na.rm=FALSE){
if(!all(key %in% colnames(x))){
stop(paste("key(s) missing from data frame: ",key[!key %in% colnames(x)],"\n"))
}
if(!all(derived_col %in% colnames(x))){
stop(paste("derived_col(s) missing from data frame: ",derived_col[!derived_col %in% colnames(x)],"\n"))
}
if(all(is.na(x[,derived_col]))) stop("derived_col is all na")
if(derived_col %in% key) stop("derived_col should not be part of key")
x<-unique(x[,c(key,derived_col)])
if (na.rm)
x<-x[!is.na(x[,derived_col]),]
return(check_key(x,key))
}
#***********************Fill Derived
#' Fill Derived
#'
#' @param x the data frame to be checked
#' @param key list of one or more column names that are suspected to be the key
#' @param derived_col check whether this variable only varies with the primary key
#'
#' @return x with any na values filled in, if all non-na values are consistent.
#'
#' @details A derived column is one that could be consolidated only to the primary keys
#' and the derived column (with any nas removed). If that criteria is met, this function
#' then fills in the nas with those derived alues.
#'
#' @export
fill_derived<-function(x,key,derived_col){
if(!check_derived(x,key,derived_col,na.rm=TRUE)){
stop("Inconsistent derived_col")
}
derived<-unique(x[!is.na(x[,derived_col]),c(key,derived_col)])
x<-x[,derived_col != colnames(x)]
x<-left_join(x,derived)
return(x)
}
#***********************Group By List
#' Group By List
#'
#' @param x the data frame to be checked
#' @param key list of one or more column names that are suspected to be the key
#'
#' @return Group_By using a list of quoted names.
#'
#' @details Replacement for group_by_ now that it has been depricated.
#'
#' @export
group_by_list<-function(x,key){
if(all(key=="") | length(key)==0) return(group_by(x))
x<-x %>% group_by(!!as.name(key[1]),add=FALSE)
for(i in 2:length(key))
x<-x %>% group_by(!!as.name(key[i]),add=TRUE)
x
}
#' #***********************Label Top
#' #' Group By List
#' #'
#' #' @param df the data frame to be checked
#' #' @param label_col The column from which to pull top entries
#' #' @param value_col The column used to determine what counts as top
#' #' @param n=7 The number of top
#' #'
#' #' @return Group_By using a list of quoted names.
#' #'
#' #' @details Replacement for group_by_ now that it has been depricated.
#' #'
#' #' @export
#'
#' colnames(platpscintldef)[colnames(platpscintldef)=="Action_Obligation_Then_Year_Then_Year"]<-
#' "Action_Obligation_Then_Year"
#'
#'
#' topplat<-platpscintldef %>% group_by (Project.Name,PlatformPortfolio) %>%
#' summarise(Action_Obligation_OMB24_GDP22=sum(Action_Obligation_OMB24_GDP22),
#' Action_Obligation_2020=sum(if_else(Fiscal_Year==2020,Action_Obligation_OMB24_GDP22,0)))%>%
#' group_by (PlatformPortfolio) %>%
#' mutate(rank_total=rank(desc(Action_Obligation_OMB24_GDP22)),
#' rank_2020=rank(desc(Action_Obligation_2020)))
#' topplat %>% arrange(desc(Action_Obligation_OMB24_GDP22))
#'
#'
#' topplat$TopProject<-
#' if_else(topplat$rank_2020<=7 | topplat$rank_total<=7,topplat$Project.Name,NA)
#'
#' platpscintldef<-left_join(platpscintldef,topplat %>% select(-Action_Obligation_OMB24_GDP22,Action_Obligation_2020),
#' by=c("Project.Name","PlatformPortfolio"))
#'
#' platpscintldef$TopProject[is.na(platpscintldef$TopProject) & !is.na(platpscintldef$Project.Name)]<-
#' "Other Labeled Project"
#'
#'
#' summary(factor(platpscintldef$TopProject))
#' Save a copy of the plot, a current dollars csv, and an excel copy
#'
#' @param plot a ggplot object
#' @param df the underlying data
#' @param filename the name for the files, excluding extension
#' @param xlsx the excel file to output to
#' @param sheet the sheet to use in excel, typically shorter than the name
#' @param path="..\\output\\" what directory for the output
#' @param second_path=NA for saving to a second location to automatically
#' @param width=6.5 Width for the plot in inches
#' @param height=3.5 Height for the plot in inches
#' @param output_doc_svg=TRUE GGsave a svg of the graph for a document?
#' @param output_doc_png=FALSE GGsave a png of the graph for a document?
#' @param suppress_text=NA Remove titles and captions. If unspecified, treated as TRUE for SVG and FALSE for PNG.
#' @param startRow=1 Start row for excel output
#' @param startCol=NA Start column for excel output
#' @param format=TRUE Format the data rather then listing the df directl
#' @param x_var=NA Override option for x_var
#' @param y_var=NA Override option for y_var
#' @param var_list=NA Override option for what variables to include in addition to x_var and y_var, also sets arrangement order.
#' @param group_unlabeled_facets Whether to all unlabeled facets (but not colors) into a single line
#' @param csv_then_year=TRUE Override the graphed y_var to include nominal dollars in csv output
#' @param excel_then_year=TRUE Override the graphed y_var to include nominal dollars in excel output
#' @param excel_y_var=FALSE Include the graphed y_var (or over)
#' @param excel_share=FALSE Include percent shares for the y_var for each of the facets
#' @param excel_formulas=FALSE Create formulas to accompany the table
#' @param hist_year=2015 Historical anchor to include in summary stats
#' @param cur_year=2023 Most recent complete year to include in summary stats
#' @param group_unlabeled_facets=FALSE Combine all unlabeled facet categories into a single line in the data.
#'
#'
#' @return no value
#'
#'
#'
#' @export
# log_plot2 <- function(plot, df,filename,xlsx,sheet,path="..\\output",
# second_path=NA,
# width=6.5,height=3.5,output_doc_svg=TRUE,output_doc_png=FALSE,
# suppress_text=NA,
# startRow=1,startCol=NA,format=TRUE,
# x_var=NA,y_var=NA,var_list=NA,
# csv_then_year=TRUE,
# excel_then_year=TRUE,excel_y_var=FALSE,excel_share=FALSE,
# excel_formulas=FALSE,
# hist_year=2020, cur_year=2023,
# group_unlabeled_facets=FALSE
# ) {
#
#
# if(format){
# #This may end up breaking with pivoted graphs. But lets cross that bridge when we come to it.
# if(is.na(y_var)) y_var<-plot$plot_env$y_var
# if(is.na(x_var)) x_var<-plot$plot_env$x_var
# if(all(is.na(var_list))){
# var_list<-colnames(plot$data)
# var_list<-var_list[!var_list %in% y_var & !var_list %in% x_var &
# !var_list %in% plot$plot_env$x_var & !var_list %in% "YTD"]
# }
# if(is.na(startCol)) startCol<-10+length(var_list)
# #Swap in Fiscal_Year for dFYear for ease of table readability
# if("dFYear"==x_var & "Fiscal_Year" %in% colnames(df))
# x_var<-"Fiscal_Year"
# if("dtDelivYear"==x_var & "Delivery.year" %in% colnames(df))
# x_var<-"Delivery.year"
# if(excel_then_year | csv_then_year){
# #Add other constant dollar here variables
# if(y_var %in% c("Then_Year_Dollars","Action_Obligation_Then_Year") &
# excel_y_var==FALSE)
# then_year_y_var<-y_var
# else if(y_var %in% c("Action_Obligation_OMB24_GDP22"))
# then_year_y_var<-"Action_Obligation_Then_Year"
# else if(y_var %in% c("Amount_OMB24_GDP22"))
# then_year_y_var<-"Amount_Then_Year"
# else if(y_var %in% c("Action_Obligation_OMB25_GDP23"))
# then_year_y_var<-"Action_Obligation_Then_Year"
# else if(y_var %in% c("Amount_OMB25_GDP23"))
# then_year_y_var<-"Amount_Then_Year"
# else if(y_var %in% c("delivery_BEA22"))
# then_year_y_var<-"delivery_Then_Year"
# else if(y_var %in% c("DefenseObligated_OMB25_GDP23"))
# then_year_y_var<-"DefenseObligated_Then_Year"
# else stop("Unrecognized y_var")
# if(any(is.Date(df[,x_var]) & !is.na(df[,x_var]) & df[,x_var]==""))
# stop("Empty string values in x_var cause a pivot_wider error.")
# then_year_df<-group_data_for_plot(df,x_var=x_var, y_var=then_year_y_var, breakout=var_list) %>%
# arrange(!!as.name(x_var))%>%
# pivot_wider(names_from=!!as.name(x_var),
# values_from=!!as.name(then_year_y_var)) %>%
# arrange(.by_group = TRUE)
# }
# if (excel_y_var)
# y_var_df<-group_data_for_plot(df,x_var=x_var, y_var=y_var, breakout=var_list) %>%
# arrange(!!as.name(x_var))%>%
# pivot_wider(names_from=!!as.name(x_var),
# values_from=!!as.name(y_var)) %>%
# arrange(.by_group = TRUE)
# # I probably should switch to format_data_for_plot to do this
# # if(excel_shared)
# # shared_df<--group_data_for_plot(df,x_var=x_var, y_var=y_var, breakout=var_list) %>%
# # arrange(!!as.name(x_var))%>%
# # pivot_wider(names_from=!!as.name(x_var),
# # values_from=!!as.name(y_var)) %>%
# # arrange(.by_group = TRUE)
#
# }
# #Now that formatting is done, we can efficiently call for a second log path
#
# #To allow for efficient output to two paths, all of the post-formatting output
# #is put together in one function.
# output_log_plot<-function(plot, then_year_df,y_var_df,
# filename,xlsx,sheet,path,
# width,height,output_doc_svg,output_doc_png,suppress_text,
# startRow,startCol,
# x_var,y_var,var_list,
# csv_then_year,
# excel_then_year,excel_y_var,excel_share,
# excel_formulas,
# hist_year, cur_year,
# group_unlabeled_facets){
# if (output_doc_svg==TRUE)
# ggsave600dpi(plot+ifelse(suppress_text | is.na(suppress_text), labs(caption=NULL,title=NULL),labs()),
# file=file.path(path,paste(filename,".svg",sep="")),size=12,caption_fraction=8/12,lineheight=1, height =height, width=width)
# if (output_doc_png==TRUE)
# ggsave600dpi(plot+ifelse(suppress_text & !is.na(suppress_text), labs(caption=NULL,title=NULL),labs()),
# file=file.path(path,paste(filename,".png",sep="")),size=12,caption_fraction=8/12,lineheight=1, height =height+0.25, width=width)
#
# if(csv_then_year){
# if(!dir.exists(file.path(path,"then_year_csv")))
# dir.create(file.path(path,"then_year_csv"))
# write.csv(then_year_df,file=file.path(path,"then_year_csv",paste(filename,".csv",sep="")),row.names = FALSE, na = "")
# }
# if(excel_then_year | excel_y_var | excel_share){
# if(file.exists(file.path(path,xlsx))){
# wb <- openxlsx::loadWorkbook(file.path(path,xlsx))
# }
# else{
# wb<-wb_workbook()
# }
# if(!sheet %in% wb_get_sheet_names(wb))
# wb$add_worksheet(sheet)
# numstyle<-openxlsx::createStyle(numFmt = "0.00,,,\"B\"")
# pstyle<-openxlsx::createStyle(numFmt = "PERCENTAGE")
# if(excel_then_year){
# writeData(wb, then_year_df, sheet = sheet, startRow = startRow, startCol = startCol)
# if(length(var_list)<startCol)
# for (c in 1:length(var_list))
# openxlsx::writeFormula(wb,sheet,c(paste0(openxlsx::int2col(c+startCol-1),(startRow):(startRow+nrow(then_year_df)+1))),
# startRow=startRow,startCol=c)
# gt<-data.frame(Total=c("Grand Total",rep("",length(var_list)-1),
#
# paste0("Sum(",openxlsx::int2col((startCol+length(var_list)):(startCol+ncol(then_year_df)-length(var_list))),
# startRow+1,":",
# openxlsx::int2col((startCol+length(var_list)):(startCol+ncol(then_year_df)-length(var_list))),
# startRow+nrow(then_year_df),")")))
# gt$rn<-rownames(gt)
# gt<-as.data.frame(pivot_wider(gt,values_from=Total,names_from=rn))
# for (i in colnames(gt)[(length(var_list)+1):ncol(gt)]) class(gt[,i])<-c(class(gt[,i]),"formula")
# writeData(wb,sheet=sheet,
# gt,
# startRow=startRow+nrow(then_year_df)+1,
# startCol=startCol,
# colNames=FALSE
# )
#
# openxlsx::addStyle(wb, sheet, numstyle,gridExpand = T,
# rows=(startRow+1):(startRow+nrow(then_year_df)+1),
# cols=(startCol+length(var_list)):(startCol+ncol(then_year_df)+2-length(var_list)))
# startRow<-startRow+nrow(then_year_df)+4 #Header row, total row, check_sum_row, blank row
# }
# if(excel_y_var){
# writeData(wb, y_var_df, sheet = sheet, startRow = startRow, startCol = startCol)
# if(length(var_list)<startCol)
# for (c in 1:length(var_list))
# openxlsx::writeFormula(wb,sheet,c(paste0(openxlsx::int2col(c+startCol-1),(startRow):(startRow+nrow(y_var_df)+1))),
# startRow=startRow,startCol=c)
# if(excel_formulas){
# if(!hist_year %in% colnames(y_var_df))
# stop("hist_year not in provided data")
# #Historic year
# hist_col<-which(colnames(y_var_df)==hist_year)+startCol-1
# openxlsx::writeFormula(wb,sheet,c(paste0(openxlsx::int2col(hist_col),(startRow):(startRow+nrow(y_var_df)+1))),
# startRow=startRow,startCol=length(var_list)+1)
# cur_col<-which(colnames(y_var_df)==cur_year)+startCol-1
# #Year before current
# openxlsx::writeFormula(wb,sheet,c(paste0(openxlsx::int2col(cur_col-1),(startRow):(startRow+nrow(y_var_df)+1))),
# startRow=startRow,startCol=length(var_list)+2)
# #Current year
# openxlsx::writeFormula(wb,sheet,c(paste0(openxlsx::int2col(cur_col),(startRow):(startRow+nrow(y_var_df)+1))),
# startRow=startRow,startCol=length(var_list)+3)
# #Incomplete year
# openxlsx::writeFormula(wb,sheet,c(paste0(openxlsx::int2col(cur_col+1),(startRow):(startRow+nrow(y_var_df)+1))),
# startRow=startRow,startCol=length(var_list)+4)
#
#
# openxlsx::addStyle(wb, sheet, numstyle,gridExpand = T,
# rows=(startRow+1):(startRow+nrow(y_var_df)+1),
# cols=(length(var_list)+1):(length(var_list)+4))
#
#
# #Year before current to current comparison
# openxlsx::writeFormula(wb,sheet,c(#Heading
# paste0(openxlsx::int2col(cur_col-1),startRow,"&\"-\"&",openxlsx::int2col(cur_col),startRow),
# #Formulas
# paste0(openxlsx::int2col(cur_col),(startRow+1):(startRow+nrow(y_var_df)+1),"/",
# openxlsx::int2col(cur_col-1),(startRow+1):(startRow+nrow(y_var_df)+1),"-1")),
# startRow=startRow,startCol=length(var_list)+5)
#
# #Historic year to current comparison
# openxlsx::writeFormula(wb,sheet,c(#Heading
# paste0(openxlsx::int2col(hist_col),startRow,"&\"-\"&",openxlsx::int2col(cur_col),startRow),
# #Formulas
# paste0(openxlsx::int2col(cur_col),(startRow+1):(startRow+nrow(y_var_df)+1),"/",
# openxlsx::int2col(hist_col),(startRow+1):(startRow+nrow(y_var_df)+1),"-1")),
# startRow=startRow,startCol=length(var_list)+6)
#
# #YTD over current comparison
# openxlsx::writeFormula(wb,sheet,c(#Heading
# paste0(openxlsx::int2col(cur_col+1),startRow,"&\"/\"&",openxlsx::int2col(cur_col),startRow),
# #Formulas
# paste0(openxlsx::int2col(cur_col+1),(startRow+1):(startRow+nrow(y_var_df)+1),"/",
# openxlsx::int2col(cur_col),(startRow+1):(startRow+nrow(y_var_df)+1))),
# startRow=startRow,startCol=length(var_list)+7)
#
# #Current year share
# openxlsx::writeFormula(wb,sheet,c(#Heading
# paste0("\"Share \"&",openxlsx::int2col(cur_col),startRow),
# #Formulas
# paste0(openxlsx::int2col(cur_col),(startRow+1):(startRow+nrow(y_var_df)),"/",
# "Sum(",openxlsx::int2col(cur_col),"$",(startRow),":",
# openxlsx::int2col(cur_col),"$",(startRow+nrow(y_var_df)),")"),
# paste0("Sum(",openxlsx::int2col(length(var_list)+8),"$",(startRow),":",
# openxlsx::int2col(length(var_list)+8),"$",(startRow+nrow(y_var_df)),")")),
# startRow=startRow,startCol=length(var_list)+8)
#
# #YTD share
#
# openxlsx::writeFormula(wb,sheet,c(#Heading
# paste0("\"Share \"&",openxlsx::int2col(cur_col+1),startRow),
# #Formulas
# paste0(openxlsx::int2col(cur_col+1),(startRow+1):(startRow+nrow(y_var_df)),"/",
# "Sum(",openxlsx::int2col(cur_col+1),(startRow),":",
# openxlsx::int2col(cur_col+1),"$",(startRow+nrow(y_var_df)),")"),
# paste0("Sum(",openxlsx::int2col(length(var_list)+9),"$",(startRow),":",
# openxlsx::int2col(length(var_list)+9),"$",(startRow+nrow(y_var_df)),")")),
# startRow=startRow,startCol=length(var_list)+9)
#
# openxlsx::addStyle(wb, sheet, pstyle,gridExpand = T,
# rows=(startRow+1):(startRow+nrow(y_var_df)+1),
# cols=(length(var_list)+5):(length(var_list)+9))
#
# gt<-data.frame(Total=c("Grand Total",rep("",length(var_list)-1),
#
# paste0("Sum(",openxlsx::int2col((startCol+length(var_list)):(startCol+ncol(y_var_df)-length(var_list))),
# startRow+1,":",
# openxlsx::int2col((startCol+length(var_list)):(startCol+ncol(y_var_df)-length(var_list))),
# startRow+nrow(y_var_df),")")))
# gt$rn<-rownames(gt)
# gt<-as.data.frame(pivot_wider(gt,values_from=Total,names_from=rn))
# for (i in colnames(gt)[(length(var_list)+1):ncol(gt)]) class(gt[,i])<-c(class(gt[,i]),"formula")
# writeData(wb,sheet=sheet,
# gt,
# startRow=startRow+nrow(y_var_df)+1,
# startCol=startCol,
# colNames=FALSE
# )
# }
#
#
#
# openxlsx::addStyle(wb, sheet, numstyle,gridExpand = T,
# rows=(startRow+1):(startRow+nrow(y_var_df)+1),
# cols=(startCol+length(var_list)):(startCol+ncol(y_var_df)+2-length(var_list)))
#
#
# startRow<-startRow+nrow(y_var_df)+4 #Header row, total row, check_sum_row, blank row
# }
# openxlsx::freezePane(wb,sheet,firstActiveRow = 2,firstActiveCol = 1+length(var_list))
#
# openxlsx::saveWorkbook(wb,file=(file.path(path,xlsx)),overwrite = TRUE)
# rm(wb)
# }
# }
# output_log_plot(plot=plot, then_year_df,y_var_df,filename=filename,
# xlsx=xlsx,sheet=sheet,path=path,
# width=width,height=height,
# output_doc_svg=output_doc_svg,output_doc_png=output_doc_png,
# suppress_text=suppress_text,
# startRow=startRow,startCol=startCol,
# x_var=x_var,y_var=y_var,var_list=var_list,
# csv_then_year=csv_then_year,
# excel_then_year=excel_then_year,
# excel_y_var=excel_y_var,excel_share=excel_share,
# excel_formulas=excel_formulas,
# hist_year=hist_year, cur_year=cur_year,
# group_unlabeled_facets=group_unlabeled_facets
# )
# if (!is.na(second_path))
# output_log_plot(plot=plot, then_year_df,y_var_df,filename=filename,
# xlsx=xlsx,sheet=sheet,path=second_path,
# width=width,height=height,
# output_doc_svg=output_doc_svg,output_doc_png=output_doc_png,
# suppress_text=suppress_text,
# startRow=startRow,startCol=startCol,
# x_var=x_var,y_var=y_var,var_list=var_list,
# csv_then_year=csv_then_year,
# excel_then_year=excel_then_year,
# excel_y_var=excel_y_var,excel_share=excel_share,
# excel_formulas=excel_formulas,
# hist_year=hist_year, cur_year=cur_year,
# group_unlabeled_facets=group_unlabeled_facets
# )
# }
#' Save a copy of the plot, a current dollars csv, and an excel copy
#'
#' @param df the underlying data
#' @param xlsx the excel file to output to
#' @param sheet the sheet to use in excel, typically shorter than the name
#' @param path="..\\output\\" what directory for the output
#' @param second_path=NA for saving to a second location to automatically
#' @param startRow=1 Start row for excel output
#' @param startCol=NA Start column for excel output
#'
#'
#' @return no value
#'
#'
#'
#' @export
export_worksheet <- function(df,xlsx,sheet,path="..\\output",
second_path=NA,
startRow=1,startCol=1
) {
wb <- loadWorkbook(file.path(path,xlsx))
writeData(wb, sheet = sheet, startRow = startRow, startCol = startCol,
df)
saveWorkbook(wb,file=file.path(path,xlsx),overwrite = TRUE)
if(!is.na(second_path)){
wb <- loadWorkbook(file.path(second_path,xlsx))
writeData(wb, sheet = sheet, startRow = startRow, startCol = startCol,
df)
saveWorkbook(wb,file=file.path(second_path,xlsx),overwrite = TRUE)
}
}
#' Save a copy of the plot, a current dollars csv, and an excel copy
#'
#' @param plot a ggplot object
#' @param df the underlying data
#' @param filename the name for the files, excluding extension
#' @param xlsx the excel file to output to
#' @param sheet the sheet to use in excel, typically shorter than the name
#' @param path="..\\output\\" what directory for the output
#' @param second_path=NA for saving to a second location to automatically
#' @param width=6.5 Width for the plot in inches
#' @param height=3.5 Height for the plot in inches
#' @param output_doc_svg=TRUE GGsave a svg of the graph for a document?
#' @param output_doc_png=FALSE GGsave a png of the graph for a document?
#' @param suppress_text=NA Remove titles and captions. If unspecified, treated as TRUE for SVG and FALSE for PNG.
#' @param startRow=1 Start row for excel output
#' @param startCol=NA Start column for excel output
#' @param format=TRUE Format the data rather then listing the df directl
#' @param x_var=NA Override option for x_var
#' @param y_var=NA Override option for y_var
#' @param var_list=NA Override option for what variables to include in addition to x_var and y_var, also sets arrangement order.
#' @param group_unlabeled_facets Whether to all unlabeled facets (but not colors) into a single line
#' @param csv_then_year=TRUE Override the graphed y_var to include nominal dollars in csv output
#' @param excel_then_year=TRUE Override the graphed y_var to include nominal dollars in excel output
#' @param excel_y_var=FALSE Include the graphed y_var (or over)
#' @param excel_share=FALSE Include percent shares for the y_var for each of the facets
#' @param excel_formulas=FALSE Create formulas to accompany the table
#' @param hist_year=2020 Historical anchor to include in summary stats
#' @param cur_year=2023 Most recent complete year to include in summary stats
#' @param YTD=TRUE Include formulas for cur_year+1 if excel_formulas is true.
#' @param group_unlabeled_facets=FALSE Combine all unlabeled facet categories into a single line in the data.
#' @param num_format="0.00,,,\"B\"" How to format data values, default "0.00,,,\"B\""
#'
#'
#' @return no value
#'
#'
#'
#' @export
log_plot <- function(plot, df,filename,xlsx,sheet,path="..\\output",
second_path=NA,
width=6.5,height=3.5,output_doc_svg=TRUE,output_doc_png=FALSE,
suppress_text=NA,
startRow=1,startCol=NA,format=TRUE,
x_var=NA,y_var=NA,var_list=NA,
csv_then_year=TRUE,
excel_then_year=TRUE,excel_y_var=FALSE,excel_share=FALSE,
excel_formulas=FALSE,
hist_year=2020, cur_year=2023,include_YTD=TRUE,
group_unlabeled_facets=FALSE,
num_format="0.00,,,\"B\""
) {
if(format){
#This may end up breaking with pivoted graphs. But lets cross that bridge when we come to it.
if(is.na(y_var)) y_var<-plot$plot_env$y_var
if(is.na(x_var)) x_var<-plot$plot_env$x_var
if(all(is.na(var_list))){
var_list<-colnames(plot$data)
var_list<-var_list[!var_list %in% y_var & !var_list %in% x_var &
!var_list %in% plot$plot_env$x_var & !var_list %in% "YTD"]
}
if(is.na(startCol)) startCol<-11+length(var_list)
#Why 11?
#(1) 1 indexed
#(4) Comparison year, prior year, current year, YTD | blank
#(3) Growth since comparison year; growth since last year; YTD/current | blank
#(2) Share current year, year last year
#(1) Blank column before actual data
#Swap in Fiscal_Year for dFYear for ease of table readability
if("dFYear"==x_var & "Fiscal_Year" %in% colnames(df))
x_var<-"Fiscal_Year"
if("dtDelivYear"==x_var & "Delivery.year" %in% colnames(df))
x_var<-"Delivery.year"
if(excel_then_year | csv_then_year){
#Add other constant dollar here variables
if(y_var %in% c("Then_Year_Dollars","Action_Obligation_Then_Year",
"TIV_delivery_value") &
excel_y_var==FALSE)
then_year_y_var<-y_var
else if(y_var %in% c("Dollars_OMB25_GDP23"))
then_year_y_var<-"Dollars_Then_Year"
else if(y_var %in% c("Action_Obligation_OMB24_GDP22"))
then_year_y_var<-"Action_Obligation_Then_Year"
else if(y_var %in% c("Amount_OMB24_GDP22"))
then_year_y_var<-"Amount_Then_Year"
else if(y_var %in% c("Action_Obligation_OMB25_GDP23"))
then_year_y_var<-"Action_Obligation_Then_Year"
else if(y_var %in% c("Amount_OMB25_GDP23"))
then_year_y_var<-"Amount_Then_Year"
else if(y_var %in% c("delivery_BEA22"))
then_year_y_var<-"delivery_Then_Year"
else if(y_var %in% c("DefenseObligated_OMB25_GDP23"))
then_year_y_var<-"DefenseObligated_Then_Year"
else stop("Unrecognized y_var")
if(any(is.Date(df[,x_var]) & !is.na(df[,x_var]) & df[,x_var]==""))
stop("Empty string values in x_var cause a pivot_wider error.")
then_year_df<-group_data_for_plot(df,x_var=x_var, y_var=then_year_y_var, breakout=var_list) %>%
arrange(!!as.name(x_var))%>%
pivot_wider(names_from=!!as.name(x_var),
values_from=!!as.name(then_year_y_var)) %>%
arrange(.by_group = TRUE)
}
if (excel_y_var)
y_var_df<-group_data_for_plot(df,x_var=x_var, y_var=y_var, breakout=var_list) %>%
arrange(!!as.name(x_var))%>%
pivot_wider(names_from=!!as.name(x_var),
values_from=!!as.name(y_var)) %>%
arrange(.by_group = TRUE)
# I probably should switch to format_data_for_plot to do this
# if(excel_shared)
# shared_df<--group_data_for_plot(df,x_var=x_var, y_var=y_var, breakout=var_list) %>%
# arrange(!!as.name(x_var))%>%
# pivot_wider(names_from=!!as.name(x_var),
# values_from=!!as.name(y_var)) %>%
# arrange(.by_group = TRUE)
}
#Now that formatting is done, we can efficiently call for a second log path
#To allow for efficient output to two paths, all of the post-formatting output
#is put together in one function.
output_log_plot<-function(plot, then_year_df,y_var_df,
filename,xlsx,sheet,path,
width,height,output_doc_svg,output_doc_png,suppress_text,
startRow,startCol,
x_var,y_var,var_list,
csv_then_year,
excel_then_year,excel_y_var,excel_share,
excel_formulas,
hist_year, cur_year,
group_unlabeled_facets){
if (output_doc_svg==TRUE)
ggsave600dpi(plot+ifelse(suppress_text | is.na(suppress_text), labs(caption=NULL,title=NULL),labs()),
file=file.path(path,paste(filename,".svg",sep="")),size=12,caption_fraction=8/12,lineheight=1, height =height, width=width)
if (output_doc_png==TRUE)
ggsave600dpi(plot+ifelse(suppress_text & !is.na(suppress_text), labs(caption=NULL,title=NULL),labs()),
file=file.path(path,paste(filename,".png",sep="")),size=12,caption_fraction=8/12,lineheight=1, height =height+0.25, width=width)
if(csv_then_year){
if(!dir.exists(file.path(path,"then_year_csv")))
dir.create(file.path(path,"then_year_csv"))
write.csv(then_year_df,file=file.path(path,"then_year_csv",paste(filename,".csv",sep="")),row.names = FALSE, na = "")
}
if(excel_then_year | excel_y_var | excel_share){
if(file.exists(file.path(path,xlsx))){
wb <- openxlsx::loadWorkbook(file.path(path,xlsx))
}
else{
wb<-openxlsx::createWorkbook(file.path(path,xlsx))
}
if(!sheet %in% names(wb))
openxlsx::addWorksheet(wb,sheet)
numstyle<-openxlsx::createStyle(numFmt = num_format)
pstyle<-openxlsx::createStyle(numFmt = "PERCENTAGE")
if(excel_then_year){
openxlsx::writeData(wb, then_year_df, sheet = sheet, startRow = startRow, startCol = startCol)
#If and only if there's a blank left on the left, fill in category name links in the first column.
if(length(var_list)<startCol)
for (c in 1:length(var_list))
openxlsx::writeFormula(wb,sheet,c(paste0(openxlsx::int2col(c+startCol-1),(startRow):(startRow+nrow(then_year_df)+1))),
startRow=startRow,startCol=c)
gt<-data.frame(Total=c("Grand Total",rep("",length(var_list)-1),
paste0("Sum(",openxlsx::int2col((startCol+length(var_list)):(startCol+ncol(then_year_df)-length(var_list)+1)),
startRow+1,":",
openxlsx::int2col((startCol+length(var_list)):(startCol+ncol(then_year_df)-length(var_list)+1)),
startRow+nrow(then_year_df),")")))
gt$rn<-rownames(gt)
gt<-as.data.frame(pivot_wider(gt,values_from=Total,names_from=rn))
for (i in colnames(gt)[(length(var_list)+1):ncol(gt)]) class(gt[,i])<-c(class(gt[,i]),"formula")
openxlsx::writeData(wb,sheet=sheet,
gt,
startRow=startRow+nrow(then_year_df)+1,
startCol=startCol,
colNames=FALSE
)
openxlsx::addStyle(wb, sheet, numstyle,gridExpand = T,
rows=(startRow+1):(startRow+nrow(then_year_df)+1),
cols=(startCol+length(var_list)):(startCol+ncol(then_year_df)+2-length(var_list)))
startRow<-startRow+nrow(then_year_df)+4 #Header row, total row, check_sum_row, blank row
}
if(excel_y_var){
openxlsx::writeData(wb, y_var_df, sheet = sheet, startRow = startRow, startCol = startCol)
if(length(var_list)<startCol)
for (c in 1:length(var_list))
openxlsx::writeFormula(wb,sheet,c(paste0(openxlsx::int2col(c+startCol-1),(startRow):(startRow+nrow(y_var_df)+1))),
startRow=startRow,startCol=c)
if(excel_formulas){
if(startCol<10+length(var_list))
stop(paste("startCol",startCol,
"is too small to have room to add an excel formula. Set excel_formula to false ,or increase StartCol>=",
10+length(var_list)))
if(!hist_year %in% colnames(y_var_df))
stop("hist_year not in provided data")
#Historic year
hist_col<-which(colnames(y_var_df)==hist_year)+startCol-1
openxlsx::writeFormula(wb,sheet,c(paste0(openxlsx::int2col(hist_col),(startRow):(startRow+nrow(y_var_df)+1))),
startRow=startRow,startCol=length(var_list)+1)
cur_col<-which(colnames(y_var_df)==cur_year)+startCol-1
#Year before current
openxlsx::writeFormula(wb,sheet,c(paste0(openxlsx::int2col(cur_col-1),(startRow):(startRow+nrow(y_var_df)+1))),
startRow=startRow,startCol=length(var_list)+2)
#Current year
openxlsx::writeFormula(wb,sheet,c(paste0(openxlsx::int2col(cur_col),(startRow):(startRow+nrow(y_var_df)+1))),
startRow=startRow,startCol=length(var_list)+3)
if(include_YTD){
#Incomplete year
openxlsx::writeFormula(wb,sheet,c(paste0(openxlsx::int2col(cur_col+1),(startRow):(startRow+nrow(y_var_df)+1))),
startRow=startRow,startCol=length(var_list)+4)
}
openxlsx::addStyle(wb, sheet, numstyle,gridExpand = T,
rows=(startRow+1):(startRow+nrow(y_var_df)+1),
cols=(length(var_list)+1):(length(var_list)+4))
#Year before current to current comparison
openxlsx::writeFormula(wb,sheet,c(#Heading
paste0(openxlsx::int2col(cur_col-1),startRow,"&\"-\"&",openxlsx::int2col(cur_col),startRow),
#Formulas
paste0(openxlsx::int2col(cur_col),(startRow+1):(startRow+nrow(y_var_df)+1),"/",
openxlsx::int2col(cur_col-1),(startRow+1):(startRow+nrow(y_var_df)+1),"-1")),
startRow=startRow,startCol=length(var_list)+5)
#Historic year to current comparison
openxlsx::writeFormula(wb,sheet,c(#Heading
paste0(openxlsx::int2col(hist_col),startRow,"&\"-\"&",openxlsx::int2col(cur_col),startRow),
#Formulas
paste0(openxlsx::int2col(cur_col),(startRow+1):(startRow+nrow(y_var_df)+1),"/",
openxlsx::int2col(hist_col),(startRow+1):(startRow+nrow(y_var_df)+1),"-1")),
startRow=startRow,startCol=length(var_list)+6)
if(include_YTD){
#YTD over current comparison
openxlsx::writeFormula(wb,sheet,c(#Heading
paste0(openxlsx::int2col(cur_col+1),startRow,"&\"/\"&",openxlsx::int2col(cur_col),startRow),
#Formulas
paste0(openxlsx::int2col(cur_col+1),(startRow+1):(startRow+nrow(y_var_df)+1),"/",
openxlsx::int2col(cur_col),(startRow+1):(startRow+nrow(y_var_df)+1))),
startRow=startRow,startCol=length(var_list)+7)
}
#Current year share
openxlsx::writeFormula(wb,sheet,c(#Heading
paste0("\"Share \"&",openxlsx::int2col(cur_col),startRow),
#Formulas
paste0(openxlsx::int2col(cur_col),(startRow+1):(startRow+nrow(y_var_df)),"/",
"Sum(",openxlsx::int2col(cur_col),"$",(startRow),":",
openxlsx::int2col(cur_col),"$",(startRow+nrow(y_var_df)),")"),
paste0("Sum(",openxlsx::int2col(length(var_list)+8),"$",(startRow),":",
openxlsx::int2col(length(var_list)+8),"$",(startRow+nrow(y_var_df)),")")),
startRow=startRow,startCol=length(var_list)+8)
#YTD share
if(include_YTD){
openxlsx::writeFormula(wb,sheet,c(#Heading
paste0("\"Share \"&",openxlsx::int2col(cur_col+1),startRow),
#Formulas
paste0(openxlsx::int2col(cur_col+1),(startRow+1):(startRow+nrow(y_var_df)),"/",
"Sum(",openxlsx::int2col(cur_col+1),(startRow),":",
openxlsx::int2col(cur_col+1),"$",(startRow+nrow(y_var_df)),")"),
paste0("Sum(",openxlsx::int2col(length(var_list)+9),"$",(startRow),":",
openxlsx::int2col(length(var_list)+9),"$",(startRow+nrow(y_var_df)),")")),
startRow=startRow,startCol=length(var_list)+9)
}
openxlsx::addStyle(wb, sheet, pstyle,gridExpand = T,
rows=(startRow+1):(startRow+nrow(y_var_df)+1),
cols=(length(var_list)+5):(length(var_list)+9))
gt<-data.frame(Total=c("Grand Total",rep("",length(var_list)-1),
paste0("Sum(",openxlsx::int2col((startCol+length(var_list)):(startCol+ncol(y_var_df)-length(var_list)+1)),
startRow+1,":",
openxlsx::int2col((startCol+length(var_list)):(startCol+ncol(y_var_df)-length(var_list)+1)),
startRow+nrow(y_var_df),")")))
gt$rn<-rownames(gt)
gt<-as.data.frame(pivot_wider(gt,values_from=Total,names_from=rn))
for (i in colnames(gt)[(length(var_list)+1):ncol(gt)]) class(gt[,i])<-c(class(gt[,i]),"formula")
openxlsx::writeData(wb,sheet=sheet,
gt,
startRow=startRow+nrow(y_var_df)+1,
startCol=startCol,
colNames=FALSE
)
}
openxlsx::addStyle(wb, sheet, numstyle,gridExpand = T,
rows=(startRow+1):(startRow+nrow(y_var_df)+1),
cols=(startCol+length(var_list)):(startCol+ncol(y_var_df)+2-length(var_list)))
startRow<-startRow+nrow(y_var_df)+4 #Header row, total row, check_sum_row, blank row
}
openxlsx::freezePane(wb,sheet,firstActiveRow = 2,firstActiveCol = 1+length(var_list))
openxlsx::saveWorkbook(wb,file=(file.path(path,xlsx)),overwrite = TRUE)
rm(wb)
}
}
output_log_plot(plot=plot, then_year_df,y_var_df,filename=filename,
xlsx=xlsx,sheet=sheet,path=path,
width=width,height=height,
output_doc_svg=output_doc_svg,output_doc_png=output_doc_png,
suppress_text=suppress_text,
startRow=startRow,startCol=startCol,
x_var=x_var,y_var=y_var,var_list=var_list,
csv_then_year=csv_then_year,
excel_then_year=excel_then_year,
excel_y_var=excel_y_var,excel_share=excel_share,
excel_formulas=excel_formulas,
hist_year=hist_year, cur_year=cur_year,
group_unlabeled_facets=group_unlabeled_facets
)
if (!is.na(second_path))
output_log_plot(plot=plot, then_year_df,y_var_df,filename=filename,
xlsx=xlsx,sheet=sheet,path=second_path,
width=width,height=height,
output_doc_svg=output_doc_svg,output_doc_png=output_doc_png,
suppress_text=suppress_text,
startRow=startRow,startCol=startCol,
x_var=x_var,y_var=y_var,var_list=var_list,
csv_then_year=csv_then_year,
excel_then_year=excel_then_year,
excel_y_var=excel_y_var,excel_share=excel_share,
excel_formulas=excel_formulas,
hist_year=hist_year, cur_year=cur_year,
group_unlabeled_facets=group_unlabeled_facets
)
}
#***********************Get Base Folder
#' Get Base Folder
#'
#' @param folder
#'
#' @return "../../[folder]", "../[folder]", or "[folder]" depending on the relative position.
#'
#' @details When a project is open, R files use the base of the repository
#' as the working directory. RMD files use their own position, typically
#' script or analysis.
#'
#' @export
get_base_folder<-function(folder){
if(dir.exists(folder)) return(folder)
else if(dir.exists(file.path("..",folder))) return(file.path("..",folder))
else if(dir.exists(file.path("..","..",folder))) return(file.path("..","..",folder))
else stop("Directory not found, is it present in repository base and are you in a project?")
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.