R/Apply_Lookups.R

Defines functions apply_standard_lookups label_top get_fiscal_year text_to_bit text_to_number deflate read_and_join_experiment read_and_join remove_bom na_check replace_nas_with_unlabeled get_delim FactorToNumber get_local_sharepoint_path get_local_lookup_path swap_in_zip

Documented in apply_standard_lookups deflate FactorToNumber get_delim get_fiscal_year get_local_lookup_path get_local_sharepoint_path label_top na_check read_and_join read_and_join_experiment remove_bom replace_nas_with_unlabeled swap_in_zip text_to_bit text_to_number

# This file contains functions to join data frames to pre-established CSIS
# lookup files
#
# You can learn more about package authoring with RStudio at:
#
#   http://r-pkgs.had.co.nz/
#
# Some useful keyboard shortcuts for package authoring:
#
#   Build and Reload Package:  'Ctrl + Shift + B'
#   Check Package:       'Ctrl + Shift + E'
#   Test Package:        'Ctrl + Shift + T'


#' When passed a file that doesn't exist, check for the zip version
#'
#' @param filename The name of the data file
#' @param path The location of the data file. Default ""
#' @param directory The directory within the path that holds the lookup, default "Data\\
#'
#' @return The filename name, if the file exists. Otherwise the zip file that contains the file.
#'
#' @details This function is meant for large data files. Our default approach is to zip them
#' up, one zip per file, and name the zip file the same name as the data file, except with zip
#' as an extension instead of .txt or .csv. This checks if the base file is available and if
#' not it handles opening the zip file instead.
#'
#' @examples swap_in_zip(filename="Defense_Contract_SP_ContractSampleCriteriaDetailsCustomer.csv")
#'
#' @export
swap_in_zip<-function(filename,path,directory=""){
  input<-file.path(path,directory,filename)
  if (path=="") input<-file.path(path,directory,filename)
  #File.exist seems only to work for local files.
  if(!file.exists(input) & !tolower(substr(input,1,4)) %in% c("http","ftp:") ){
    zip_file<-paste(substring(input,1,nchar(input)-3),"zip",sep="")
    if (!file.exists(zip_file)){
      stop(paste(input,"does not exist"))
    }
    input<-unz(description=zip_file,filename=filename)
    file_size<-file.info(zip_file)$size
    if (file_size>200000000){
      stop(paste("Zip file size (",file_size,") exceeds 200 megabytes and unz can't handle this. Current solution is to unzip in file system and read in directly."))
    }
  }
  input
}

#' A kludge, tests a variety of local paths to find the lookup-tables reposistory
#'
#' @return The path, if a known one exists. Otherwise it will throw an error.
#'
#' @details The lookup-table repository is a public one and can be found online.
#' However, when updating lookup files, it can be faster to use the local version
#' rather than wait for github to full parse the pushed files. In addition a
#' local path can enable working offline.
#'
#' @examples get_local_lookup_path()
#'
#' @export
get_local_lookup_path<-function(){
  local_path<-"C:\\Users\\Present\\Documents\\Repositories\\Lookup-Tables\\"
  if(file.exists(local_path))
    return(local_path)
  local_path<-"C:\\Users\\grego\\Repos\\Lookup-Tables\\"
  if(file.exists(local_path))
    return(local_path)
    local_path<-"K:\\Users\\Greg\\Repositories\\Lookup-Tables\\"
  if(file.exists(local_path))
    return(local_path)
  local_path<-"C:\\Users\\gsand\\Repositories\\Lookup-Tables\\"
  if(file.exists(local_path))
    return(local_path)
  local_path<-"D:\\Users\\Greg\\Repositories\\Lookup-Tables\\"
  if(file.exists(local_path))
    return(local_path)
  local_path<-"F:\\Users\\Greg\\Repositories\\Lookup-Tables\\"
  if(file.exists(local_path))
    return(local_path)
  local_path<-"D:\\Repositories\\Lookup-Tables\\"
  if(file.exists(local_path))
    return(local_path)
  local_path<-"F:\\Users\\gsanders\\Documents\\Repositories\\Lookup-Tables\\"
  if(file.exists(local_path))
    return(local_path)
  local_path<-"C:\\Users\\grego\\Repositories\\Lookup-Tables\\"
  if(file.exists(local_path))
    return(local_path)
  stop("Could not find local path. Update the list in Apply_Lookups.R")
}



#' A kludge, that should someday be replaced by using Microsoft365 repo.
#'
#'' @param site Which site. The DIIG sharepoint folder is the default.
#'
#' @return The path, if a known one exists. Otherwise it will throw an error.
#'
#' @details This is strictly for internal CSIS usage, as the user will need to
#' separately be syncing sharepoint to their drive. The intent is to update,
#' particularly via log_path,
#'
#' @examples get_local_sharepoint_path()
#'
#' @export
get_local_sharepoint_path<-function(site="DIIG - Documents"){
  local_path<-file.path("C:\\Users\\Present\\Documents\\Center Strategic Intl Studies Inc CSIS",site)
  if(file.exists(local_path))
    return(local_path)
  local_path<-file.path("K:\\Users\\Greg\\Center Strategic Intl Studies Inc CSIS",site)
  if(file.exists(local_path))
    return(local_path)
  local_path<-file.path("C:\\Users\\gsand\\Center Strategic Intl Studies Inc CSIS",site)
  if(file.exists(local_path))
    return(local_path)
  local_path<-file.path("D:\\Users\\Greg\\Center Strategic Intl Studies Inc CSIS",site)
  if(file.exists(local_path))
    return(local_path)
  local_path<-file.path("F:\\Users\\Greg\\Center Strategic Intl Studies Inc CSIS",site)
  if(file.exists(local_path))
    local_path<-file.path("F:\\Users\\Greg\\Center Strategic Intl Studies Inc CSIS",site)
  local_path<-file.path("F:\\Users\\gsanders\\Center Strategic Intl Studies Inc CSIS",site)
  if(file.exists(local_path))
    local_path<-file.path("F:\\Users\\gsanders\\Center Strategic Intl Studies Inc CSIS",site)
  if(file.exists(local_path))
    return(local_path)
  local_path<-file.path("D:\\Center Strategic Intl Studies Inc CSIS",site)
  if(file.exists(local_path))
    return(local_path)
  local_path<-file.path("F:\\Users\\gsanders\\Documents\\Center Strategic Intl Studies Inc CSIS",site)
  if(file.exists(local_path))
    return(local_path)
  local_path<-file.path("C:\\Users\\grego\\Center Strategic Intl Studies Inc CSIS",site)
  if(file.exists(local_path))
    return(local_path)
  stop("Could not find local path. Update the list in Apply_Lookups.R")
}

#' Turn a factor  into a number
#'
#' @param x The factor or character string
#'
#' @return The same column in numerical format.
#'
#' @details Returns the variable after removing $s and ,s.
#'
#' @examples FactorToNumber("5")
#'
#' @export
FactorToNumber<-function(x){
  warning("Deprecated in favor of text_to_number")
  if ((is.factor(x))||(is.character(x))){
    x<-gsub('\\$','',as.character( x))
    x<-as.double(gsub('\\,','',as.character( x)))
  }
  x
}



#' Return the appropriate delimeter for the file type
#'
#' @param filename The name of the data file
#'
#' @return The delimter to use.
#'
#' @details Returns ',' for csv and '/t' for txt files. Creates an error for other file types.
#'
#' @examples get_delim("test.csv")
#'
#' @export
get_delim<-function(filename){
  delim<-NULL
  extension<-substring(filename,nchar(filename)-2,nchar(filename))
  if(extension=="csv") delim<-","
  else if(extension=="txt") delim<-"\t"
  else stop(paste("Unknown file type",extension))
  delim
}


#' Replace NAs in one variable of a data frame with a specified valued
#'
#' @param data A data frame
#' @param var The variable to have NAs removed
#' @param replacement Charater string that will replace NAs, by default "Unlabeled"
#'
#' @return Returns a data frame with the var values replaced with replacement,
#' otherwise identical to the original data frame
#'
#' @details This function is intended for preparing columns for use.
#' As far as the function creator understands it, NAs do not match to NAs. However,
#' we often wish to include NAs in a graph under a proper name, such as Unlabeled
#' and with a color of our choosing. One critical step in this process is the addition
#' of the replacement term to the levels of the var factor. Which means we
#' should probably add input protection when non factors are passed. I also haven't
#' tested the handling if multiple columns are passed, I think it may not work.
#'
#' VAR.long.DF<-replace_nas_with_unlabeled(data=data.DF
#'   ,var="SubCustomer")
#'   replacement = "Uncategorized"
#'   )
#'
#' @export
replace_nas_with_unlabeled<- function(data,
                                      var,
                                      replacement="Unlabeled"){
  data<-as.data.frame(data)
  if(is.factor(var)) var<-as.character(var)

  if(!is.factor(data[,var]))
    data[,var]<-factor(data[,var])


  if(any(is.na(data[,var]))){
    #Make sure the replacement value is in the is within the list of levels
    # if (!(replacement %in% levels(data[,var]))){
    data[,var]<-addNA(data[,var],ifany=TRUE)
    # }
    levels(data[,var])[is.na(levels(data[,var]))] <- replacement
  }
  data
}



#' An internal function to check for NAs in columns after a join
#'
#' @param data A data frame
#' @param input_var The var(s) that had been used to join
#' @param output_var The var(s) that result from the join
#' @param lookup_file The file used in the join
#' @param missing_file Filename to output any unmatched variables for easy of processing
#'
#' @return None. Instead the function raises an error if there are NAs
#'
#' @details This function is intended to catch gaps in lookup tables
#' and to alert the developer before they can come into use. The core intent
#' is to throw an error message that, if needed, will guide the developer to
#' the file they need to update and the rows they need to add.
#'
#' @examples na_check(data,
#'   input_var=by,
#'   output_var=add_var,
#'   lookup_file=lookup_file)
#'
#' @export
na_check<-function(data
                   , input_var
                   , output_var
                   , lookup_file
                   , missing_file=NULL
){


  #Limit just to relevant columns
  na_check.df<-subset(data
                      , select=c(input_var,output_var)
  )

  #Drop cases where there is an na in the input_var
  na_check.df<-na_check.df[complete.cases(na_check.df[,input_var]),]

  #Drop all complete rows
  na_check.df<-na_check.df[!complete.cases(na_check.df),]

  if(nrow(na_check.df)>0){
    if(!is.null(missing_file))
      write.csv(file=missing_file,unique(na_check.df))
    print(unique(na_check.df))
    stop(paste(nrow(na_check.df)
               ,"rows of NAs generated in "
               ,paste(output_var,collapse=", ")
               ,"from "
               ,lookup_file)
    )
  }
}



#' Remove byte order marks, these appear before the first column name:
#'
#' @param data A /code(data.frame)
#'
#' @return /code(data) with the first column stripped of a byte order mark, if present
#'
#' @details Byte order marks are included at the start of UTF
#' text files and if not properly processed on read in, can add
#' nonsense characters to the start of the first column of a the
#' /code(data.frame) read in from a text file.
#'
#' @examples remove_bom(data)
#'
#' @export
remove_bom<-function(data
){
  #Remove nonsense characters sometimes added to start of files
  if(substring(colnames(data)[1],2,3)==".."&
     charToRaw(as.character(substring(colnames(data)[1],1,1)))=='ef')
    colnames(data)[1]<-substring(colnames(data)[1],4)

  colnames(data)[substring(colnames(data),2,3)=="?.."]<-
    substring(colnames(data)[substring(colnames(data),1,3)=="?.."],4)

  data
}


#' Read in an external file and join it with an existing data frame
#'
#' @param data The data frame to be joined
#' @param lookup_file The name of the lookup file
#' @param path="https://github.com/CSISdefense/R-scripts-and-data/tree/master/",
#' The location of the lookup file
#' @param directory="Lookups\\" The directory within the path that holds the lookup
#' @param by The columns used to join, if not provided, matching columns will be used
#' @param replace_na_var Before the join, these columns will have NAs values replaced
#' @param overlap_var_replaced Should the function replace for common columns not used to join?
#' @param add_var, What new columns should be checked for NA values?
#' @param new_var_checked Should only checked new columns be kept?
#' @param skip_check_var List of vars that should not be checked for NA values
#' @param missing_file Filename to output any unmatched variables for easy of processing
#'
#' @return The data frame plus new columns from the lookup file. If new_var_checked is
#' true and only new columns listed in add_var will be kept. Note to self, should
#' add input protection that throws an error if new_var_checked is set to true when
#' add_var is false.
#'
#' @details This function is an elaborate join with various quality check measures thrown in.
#' At its simplest, it just joins the existing data frame with the passed file. But along the way
#' it will make some fixes to common CSV errors and also take advantage of some known facts about
#' how CSIS data is organized.
#'
#' @examples na_check(data,
#'   input_var=by,
#'   output_var=add_var,
#'   lookup_file=lookup_file)
#'
#' @import utils
#' @import stringr
#' @export
read_and_join<-function(
    data,
    lookup_file,
    path="https://raw.githubusercontent.com/CSISdefense/R-scripts-and-data/master/",
    directory="Lookups/",
    by=NULL,
    replace_na_var=NULL,
    overlap_var_replaced=TRUE,
    add_var=NULL,
    new_var_checked=TRUE,
    skip_check_var=NULL,
    missing_file=NULL,
    lookup_char_as_factor=FALSE){


  if(is.data.frame(lookup_file))
    stop("lookup_file parameter is a data frame, it should be a filename, e.g. 'lookup_customer.csv'.")

  if(is.null(nrow(data))) stop("Data parameter passed to read_and_join has no rows.")

  if(any(!by %in% colnames(data))){
    by<-by[!by %in% colnames(data)]
    stop(paste(paste(by,collapse=" & "),"not present in data"))
  }

  #Replace NAs in input column if requested
  if(!is.null(replace_na_var)){
    data<-replace_nas_with_unlabeled(data,
                                     replace_na_var)
  }
  pathdir=file.path(path,directory)
  if(path=="") pathdir<-directory
  #This doesn't  work for URLs. Worth trying again later with some parsing
  # if (!file.exists(file.path(pathdir,lookup_file))){
  # stop(paste(file.path(pathdir,lookup_file)," does not exist.",sep=""))
  # }

  #Read in the lookup file
  lookup<-read.csv(
    file.path(pathdir,lookup_file),
    header=TRUE,
    sep=if_else(substring(lookup_file,nchar(lookup_file)-3)==".csv",",","\t"),
    na.strings=c("NA","NULL"),
    quote = "\"",#Necessary because there are some 's in the names.
    dec=".",
    strip.white=TRUE,
    stringsAsFactors=lookup_char_as_factor  #This can get weird when true, as sometimes it confuses numerical variables and factors
  )


  #Remove byte order marks present in UTF encoded files
  data<-remove_bom(data)
  lookup<-remove_bom(lookup)


  #Raise an error if newly added variables are  absent from the lookup
  if(any(!add_var %in% colnames(lookup))){
    print(colnames(lookup))
    add_var<-add_var[!add_var %in% colnames(lookup)]
    stop(paste(paste(add_var,collapse=" & "), "not present in lookup"))
  }

  #Raise an error if by is missing from either file
  if(any(!by %in% colnames(lookup))){
    by<-by[!by %in% colnames(lookup)]
    stop(paste(paste(by,collapse=" & "), "not present in lookup"))
  }

  if(any(duplicated(lookup[,by]))){
    print(unique(lookup[duplicated(lookup[,by]),by]))
    stop(paste("Duplicate entries in lookup for by variables: ",by))
  }

  #Handle any fields in both data and lookup held in common not used in the joining
  if(!is.null(by)){
    #If add_var is specified, dropped new fields not in add_var
    if(!is.null(add_var)){
      droplist<-names(lookup)[!names(lookup) %in% by
                              &!names(lookup) %in% add_var]
      data<-data[,!names(data) %in% droplist]
    }

    droplist<-names(lookup)[names(lookup) %in% names(data)]
    droplist<-droplist[!droplist %in% by]
    if(length(droplist)>0){
      if(overlap_var_replaced)
        data<-data[,!names(data) %in% droplist]
      else
        lookup<-lookup[,!names(lookup) %in% droplist]
    }
  }

  #Fixes for Excel's penchant to drop leading 0s.
  if("Contracting.Agency.ID" %in% names(lookup) & "data" %in% names(lookup)){
    lookup$Contracting.Agency.ID<-factor(str_pad(lookup$Contracting.Agency.ID,4,side="left",pad="0"))
    data$Contracting.Agency.ID<-as.character(data$Contracting.Agency.ID)
    data$Contracting.Agency.ID[is.na(data$Contracting.Agency.ID=="")]<-"0000"
    data$Contracting.Agency.ID<-factor(str_pad(data$Contracting.Agency.ID,4,side="left",pad="0"))
  }

  #Make sure CSIScontractIDs are numeric and not a factor
  if("CSIScontractID" %in% colnames(lookup)){
    if(!is.numeric(lookup$CSIScontractID)){
      lookup$CSIScontractID<-as.numeric(as.character(lookup$CSIScontractID))
    }
  }

  #Conduct the join
  if(is.null(by)){
    data<- dplyr::left_join(
      data,
      lookup
    )
  }
  else{
    data<- dplyr::left_join(
      data,
      lookup,
      by=by
    )
  }




  if(!is.null(by)&new_var_checked==TRUE){
    #If add_var is not specified, set it equal to all new vars
    if(is.null(add_var))
      add_var<-colnames(lookup)[!colnames(lookup) %in% by]

    if(!is.null(skip_check_var)){
      add_var<-add_var[!add_var %in% skip_check_var]
    }
    #First verify  there are any variables to check
    if(length(add_var)>0){
      na_check(data,
               input_var=by,
               output_var=add_var,
               lookup_file = lookup_file,
               missing_file= missing_file)
    }
  }

  data
}



#' Read in an external file and join it with an existing data frame
#'
#' @param data The data frame to be joined
#' @param lookup_file The name of the lookup file
#' @param path The location of the lookup file
#' @param directory The directory within the path that holds the lookup
#' @param by The columns used to join, if not provided, matching columns will be used
#' @param replace_na_var Before the join, these columns will have NAs values replaced
#' @param overlap_var_replaced Should the function replace for common columns not used to join?
#' @param add_var What new columns should be checked for NA values?
#' @param new_var_checked Should only checked new columns be kept?
#' @param skip_check_var List of vars that should not be checked for NA values
#' @param zip_file The source zip file.
#' @param missing_file Filename to output any unmatched variables for easy of processing
#' @param create_lookup_rdata Whether to create a rdata file using the lookup for ease of future inputing
#'
#' @return The data frame plus new columns from the lookup file. If new_var_checked is
#' true and only new columns listed in add_var will be kept. Note to self, should
#' add input protection that throws an error if new_var_checked is set to true when
#' add_var is false.
#'
#' @details This function is an elaborate join with various quality check measures thrown in.
#' At its simplest, it just joins the existing data frame with the passed file. But along the way
#' it will make some fixes to common CSV errors and also take advantage of some known facts about
#' how CSIS data is organized.
#'
#' @examples na_check(data,
#'   input_var=by,
#'   output_var=add_var,
#'   lookup_file=lookup_file)
#'
#' @import utils
#' @export
read_and_join_experiment<-function(
    data,
    lookup_file,
    path="https://raw.githubusercontent.com/CSISdefense/Lookup-Tables/master/",
    directory="Lookups/",
    by=NULL,
    replace_na_var=NULL,
    overlap_var_replaced=TRUE,
    add_var=NULL,
    new_var_checked=TRUE,
    skip_check_var=NULL,
    zip_file=NULL,
    col_types=NULL,
    case_sensitive=TRUE,
    missing_file=NULL,
    create_lookup_rdata=FALSE,
    lookup_char_as_factor=FALSE,
    guess_max=NULL,
    join_type="left"
){
  if(!is.null(names(by)))
    left_by<-names(by)
  else left_by<-by
  # read.delim doesn't like \\
  path<-gsub("\\\\","//",path)
  directory<-gsub("\\\\","//",directory)

  if(tolower(substr(path,1,4))=="http"&!RCurl::url.exists(file.path(path,directory,lookup_file))
     || path=="offline"){
    warning("Using offline path")
    path<-get_local_lookup_path()
  }

  #Prevent "" from sending the path back to the \ directory rather than the root for the project.
  pathdir=file.path(path,directory)
  if(path=="") pathdir<-directory


  case_match<-function(name, list){
    if(!name %in% (list)){
      if(tolower(name) %in% tolower(list)){
        name<-list[tolower(list)==tolower(name)]
      }
    }
    name
  }


  #Replace NAs in input column if requested
  if(!is.null(replace_na_var)){
    if(any(!replace_na_var %in% colnames(data))) stop("replace_na_var is missing.")
    data<-replace_nas_with_unlabeled(data,
                                     replace_na_var)
  }

  if(is.data.frame(lookup_file))
    stop("lookup_file parameter is a data frame, it should be a filename, e.g. 'lookup_customer.csv'.")

  #If the  file specified is an RDA
  if(tolower(substring(lookup_file,nchar(lookup_file)-3))==".rda"){
    if (!file.exists(file.path(pathdir,lookup_file)))
      stop(paste0(file.path(pathdir,lookup_file)," does not exist"))
    load(file.path(pathdir,lookup_file))
  }
  #If there exists an rda variant of the file passed.
  else if (file.exists(file.path(pathdir,paste0(substring(lookup_file,1,nchar(lookup_file)-3),"rda"))))
    load(file.path(pathdir,paste0(substring(lookup_file,1,nchar(lookup_file)-3),"rda")))

  else{ if(!is.null(zip_file)){
    #Case sensitivity fix for zip filename
    # dir_list<-list.files(file.path(pathdir))
    # zip_file<-case_match(zip_file,dir_list)

    #Read in the lookup file
    if (!file.exists(file.path(pathdir,zip_file))){
      stop(paste(file.path(pathdir,zip_file),"does not exist"))
    }
    file_size<-file.info(file.path(pathdir,zip_file))$size
    if (file_size>200000000){
      stop(paste("Zip file size (",file_size,") exceeds 200 megabytes and unz can't handle this. Current solution is to unzip in file system and read in directly."))
    }

    #Case sensitivity fix for data filename
    file_list<-unzip(file.path(pathdir,zip_file),list=TRUE)
    lookup_file<-case_match(lookup_file,file_list$Name)
    if(!lookup_file %in% (file_list$Name)){
      print(file_list)
      stop(paste(lookup_file,"not present in",zip_file))
    }
    input<-file.path(pathdir,zip_file)#unz(description=paste(pathdir,zip_file,sep=""),filename=lookup_file)

  }
    else{#No zip file
      input<-swap_in_zip(lookup_file,pathdir)
    }
    if(is.null(guess_max)){
      lookup<-readr::read_delim(
        input,
        col_names=TRUE,
        delim=if_else(substring(lookup_file,nchar(lookup_file)-3)==".csv",",","\t"),
        na=c("NA","NULL"),
        trim_ws=TRUE,
        col_types=col_types
      )
    } else{
      lookup<-readr::read_delim(
        input,
        col_names=TRUE,
        delim=if_else(substring(lookup_file,nchar(lookup_file)-3)==".csv",",","\t"),
        na=c("NA","NULL"),
        trim_ws=TRUE,
        col_types=col_types,
        guess_max =guess_max
      )
    }

    #Convert character strings to factors
    if (lookup_char_as_factor==TRUE){
      #Found here: https://stackoverflow.com/questions/2851015/convert-data-frame-columns-from-factors-to-characters/2853231#2853231
      lookup<-lookup %>% mutate_if(is.character, factor)
    }

    if (create_lookup_rdata==TRUE)
      save(lookup,file=file.path(pathdir,
                                 paste0(substring(lookup_file,1,nchar(lookup_file)-3),"rda"))
      )
  }
  #Remove byte order marks present in UTF encoded files
  data<-remove_bom(data)
  lookup<-remove_bom(lookup)


  #Raise an error if by is missing from either file
  if(any(!by %in% colnames(lookup))){
    by<-by[!by %in% colnames(lookup)]
    stop(paste(paste(by,collapse=" & "), "not present in lookup"))
  }
  if(any(!left_by %in% colnames(data))){
    left_by<-left_by[!left_by %in% colnames(data)]
    stop(paste(paste(left_by,collapse=" & "),"not present in data"))
  }
  if(any(!add_var %in% colnames(lookup))){
    add_var<-add_var[!add_var %in% colnames(lookup)]
    stop(paste(paste(add_var,collapse=" & "),"not present in lookup"))
  }

  #Handle any fields in both data and lookup held in common not used in the joining
  #And if add_vars is specified, drop non-add_vars from lookup.
  if(!is.null(by)){
    if(!is.null(add_var))
      lookup<-lookup[,names(lookup) %in% c(by, add_var)]

    droplist<-names(lookup)[names(lookup) %in% names(data)]
    droplist<-droplist[!droplist %in% by]

    if(length(droplist)>0){
      if(overlap_var_replaced)
        data<-data[,!names(data) %in% droplist]
      else{
        if(!is.null(add_var))
          stop(paste("Not replacing overlap, but add_var present in data:",droplist))
        lookup<-lookup[,!names(lookup) %in% droplist]
      }
    }
    rm(droplist)
  }
  #If add_vars is specified but there is no by, drop non add_vars from lookup
  else if( !is.null(add_var)){
    bylist<-names(lookup)[names(lookup) %in% names(data)]
    lookup<-lookup[,names(lookup) %in% c(bylist, add_var)]
    rm(bylist)
  }

  #Fixes for Excel's penchant to drop leading 0s.
  if("Contracting.Agency.ID" %in% names(lookup) & "data" %in% names(lookup)){
    lookup$Contracting.Agency.ID<-factor(str_pad(lookup$Contracting.Agency.ID,4,side="left",pad="0"))
    data$Contracting.Agency.ID<-as.character(data$Contracting.Agency.ID)
    data$Contracting.Agency.ID[is.na(data$Contracting.Agency.ID=="")]<-"0000"
    data$Contracting.Agency.ID<-factor(str_pad(data$Contracting.Agency.ID,4,side="left",pad="0"))
  }

  #Make sure CSIScontractIDs are numeric and not a factor
  if("CSIScontractID" %in% colnames(lookup)){
    if(!is.numeric(lookup$CSIScontractID)){
      lookup$CSIScontractID<-as.numeric(as.character(lookup$CSIScontractID))
    }
  }

  lookup<-as.data.frame(lookup)
  if(any(!complete.cases(lookup[,by]))){
    warning("NAs found in by variable. Filtering them out.")
    lookup<-lookup[complete.cases(lookup[,by]),]
  }

  if(anyDuplicated(lookup[,by])){
    print(unique(lookup[duplicated(lookup[,by]),by]))
    stop(paste("Duplicate entries in lookup for by variables: ",by))
  }

  #Conduct the join
  if(is.null(by)){
    if(case_sensitive==FALSE) stop("Haven't implemented case insensitive yet w/o by parameter")

    #If add_var is not specified, set it equal to all lookup vars not present in data
    if(is.null(add_var))
      add_var<-colnames(lookup)[!colnames(lookup) %in% colnames(data)]
    left_by<-colnames(lookup)[colnames(lookup) %in% colnames(data)]

    if(join_type=="left"){
      data<- dplyr::left_join(
        data,
        lookup
      )
    } else if(join_type=="full"){
      data<- dplyr::full_join(
        data,
        lookup
      )
    }
    else stop("Unrecognized join_type")
  }
  else{
    if(case_sensitive==FALSE){
      #Create a temporary holder for original values of each of the by ariables
      #And then switch them to lower case before the  join is run.
      data<-as.data.frame(data)
      lookup<-as.data.frame(lookup)
      for(i in 1:length(left_by)){
        original_temp_name<-paste(left_by[i],"original",sep="_")
        if(original_temp_name %in% colnames(data)) stop(paste(original_temp_name,"already exists as a column in data, nowhere to store the original values."))
        data[,original_temp_name]<-data[,left_by[i]]
        data[,left_by[i]]<-tolower(data[,left_by[i]])
        # lookup[,original_temp_name]<-lookup[,left_by[i]]
        lookup[,by[i]]<-tolower(lookup[,by[i]])
      }
    }

    if(join_type=="left"){
      data<- dplyr::left_join(
        data,
        lookup,
        by=by
      )
    } else if(join_type=="full"){
      data<- dplyr::full_join(
        data,
        lookup,
        by=by
      )
    }
    else stop("Unrecognized join_type")


    if(case_sensitive==FALSE){
      #Switch back the by variables to their pre-tolower value
      #Lookup isn't kept, commented code was just switching it back for error checking purposes.
      #But creating a lookup column also imports it into data, a needless compllication.
      for(i in 1:length(left_by)){
        original_temp_name<-paste(left_by[i],"original",sep="_")
        data[,left_by[i]]<-data[,original_temp_name]
        data <- data[,!(colnames(data) %in% original_temp_name)]
        # lookup[,left_by[i]]<-lookup[,original_temp_name]
        # lookup <- lookup[,!(colnames(lookup) %in% original_temp_name)]
      }
    }
  }


  if(new_var_checked==TRUE){
    #If add_var is not specified, set it equal to all new vars
    if(is.null(add_var))
      add_var<-colnames(lookup)[!colnames(lookup) %in% by]

    if(!is.null(skip_check_var)){
      add_var<-add_var[!add_var %in% skip_check_var]
    }
    #First verify  there are any variables to check
    if(length(add_var)>0){
      na_check(data,
               input_var=left_by,
               output_var=add_var,
               lookup_file = lookup_file,
               missing_file= missing_file)
    }
  }

  return(data)
}


#' Deflation using GitHub-based CSV file
#'
#' @param data A data frame
#' @param money_var The quoted name of the dollar-value variable
#' @param fy_var The quoted name of the fiscal year variable
#' @param deflator_file The quoted file name of the deflators to use;
#' must be a CSV with the column "Fiscal_Year."
#' @param deflator_var The quoted name of the defalator variable variable,
#' by default "deflator.2017"
#' @param path The path or url for the deflator_file CSV.  By default, checks
#' the CSISdefense Github lookups repository at CSISdefense/csis360/Lookup-Tables/data/economic/
#'
#' @return Returns a data frame with the money_var deflated, using deflator_var
#' otherwise identical to the original data frame
#'
#' @details Warning: This function should be used __in data processing only__,
#' not in a live app.  It reads an external file from GitHub,
#' which will slow down an app substantially if done repeatedly.
#'
#' @examples
#'
#' FullData <- read_csv("2017_SP_CompetitionVendorSizeHistoryBucketPlatformSubCustomer.csv",
#'   col_names = TRUE, col_types = "cccccccccc",na=c("NA","NULL"))
#' PrepareLabelsAndColors(Coloration,FullData,"Customer")
#'
#' @export
deflate <- function(
    data,
    money_var = "Amount",
    fy_var = "Fiscal_Year",
    deflator_file = "Lookup_Deflators.csv",
    deflator_var="OMB25_GDP23",
    path="https://raw.githubusercontent.com/CSISdefense/Lookup-Tables/master/",
    directory="economic/",
    deflator_dropped=TRUE
){
  #Default deflator if none passed.
  if(is.null(deflator_var)) deflator_var<-"OMB25_GDP23"

  #Tibbles run into trouble with the [[]] new variable specifying.
  data<-as.data.frame(data)


  #If the money_var already has _Then_Year, remove it so it can be added again
  if(substr(money_var,nchar(money_var)-9,nchar(money_var))=="_Then_Year"){
    colnames(data)[colnames(data)==money_var]<-substr(money_var,1,nchar(money_var)-10)
    money_var<-substr(money_var,1,nchar(money_var)-10)
  }

  if(!fy_var %in% colnames(data)){
    if(fy_var == "Fiscal.Year" & "Fiscal_Year" %in% colnames(data)) fy_var<-"Fiscal_Year"
    if(fy_var == "fiscal_year" & "Fiscal_Year" %in% colnames(data)) fy_var<-"Fiscal_Year"
    else stop(paste(fy_var," is not present in data."))
  }

  if(!money_var %in% colnames(data)){
    if((paste(money_var,"Then_Year",sep="_") %in% colnames(data)) &
       (paste(money_var,deflator_var,sep="_") %in% colnames(data))){
      warning(paste(money_var," is not present in data, due to prior run of deflate with money_var=",money_var,".",sep=""))
      return(data)
    }
    else{
      stop(paste(money_var,"is not present in data."))
    }
  }


  data[[money_var]] <- as.numeric(data[[money_var]])

  if(tolower(substr(path,1,4))=="http"&!RCurl::url.exists(file.path(path,directory,deflator_file))
     || path=="offline"){
    warning("Using offline path")
    path<-get_local_lookup_path()
  }

  cat(paste("\n Applying\n", deflator_var, "\n in \n", deflator_file, "\n from\n", path, "\n"))
  deflators_retrieved <- readr::read_csv(file.path(path, directory,deflator_file))

  #Rename the Fiscal.Year variable to be match the name used in data
  colnames(deflators_retrieved)[colnames(deflators_retrieved)=="Fiscal_Year"]<-fy_var

  #Drop unneded deflator columns and then join the deflators to the data
  deflators_retrieved<-subset(deflators_retrieved,select=c(fy_var,deflator_var))
  data<-plyr::join(data,deflators_retrieved,by=fy_var)

  #Create current and constant dollar variants of money_var
  data[[paste(money_var,deflator_var,sep="_")]] <- as.numeric(as.character(
    data[[money_var]])) / as.numeric(data[[deflator_var]])

  colnames(data)[colnames(data)==money_var]<-paste(money_var,"Then_Year",sep="_")

  #Drop the deflator var unless deflator_dropped = FALSE
  if(deflator_dropped)
    data<-data[,colnames(data)!=deflator_var]

  #Standardize the newly created names
  data<-standardize_variable_names(data,
                                   var=c(paste(money_var,"Then_Year",sep="_"),
                                         paste(money_var,deflator_var,sep="_")))

  return(data)
}






#' Get Column Key based on the names in a data frame
#'
#' @param x A list of numbers stored as characters or factor
#'
#' @return The list in numerical format
#'
#' @details Converts from factor/character to number and strips
#' out commas and $s along the way.
#'
#' @examples
#'
#' text_to_number(c('10','1,000','$20.00'))
#'
#' @export
text_to_number<-function(x){
  if ((is.factor(x))||(is.character(x))){
    x<-gsub('\\$','',as.character( x))
    #Remove commas
    x<-gsub('\\,','',as.character( x))
    #Switch to negative if handling Handle the ($XX,XXX) case
    x<-gsub('\\(','-',as.character( x))
    x<-gsub('\\)$','',as.character( x))
    # if(length(grep("\\([0-9|\\,|\\.]+\\)", as.character(x)))==1){
    #   x<-gsub('[\\(|\\)]','',as.character( x))
    #   x<-as.double(as.character(x))*-1
    # }
    x<-as.double(as.character(x))
  }
  x
}



#' Get Column Key based on the names in a data frame
#'
#' @param x Text / numerical expression of a bit
#'
#' @return A 0,1,NULL value.
#'
#' @details Converts from character/number to bit
#'
#' @examples
#'
#' text_to_bit(c('Y','1','$20.00'))
#'
#' @export
text_to_bit<-function(x){
  x<-as.character(x)
  yes_list<-c("Y","YES","Y: YES","Y:",
              "1","TRUE","TRUE:","T")
  no_list<-c("N","NO","N: NO",": NO",
              "0","FALSE","FALSE:","F")
  na_list<-c("",":")
  b<-
                 case_when(str_to_upper(x) %in% yes_list ~
                             1,
                           str_to_upper(x) %in% no_list ~
                             0,
                           str_to_upper(x) %in% na_list~
                             NA,
                 )
  if(any(!x %in% c(yes_list,no_list,na_list) & !is.na(x))){
    stop(paste0("text_to_bit does not know how to handle ",
                x[!x %in% c(yes_list,no_list,na_list) & !is.na(x)]))
  }
  x<-as.integer(b)
  x
}


#' Get Column Key based on the names in a data frame
#'
#' @param x A date.
#'
#' @return The fiscal year of the date.
#'
#' @details Converts the date to fiscal year by
#' extracting the date and adding 1 if the month
#' is october or later.
#' @examples
#'
#' get_fiscal_year(as.Date('2019-12-10'))
#'
#' @export
get_fiscal_year<-function(
    x){

  lubridate::year(x)+if_else(lubridate::month(x)<10,0,+1)
  # ymd(paste(          ,"-10-1",sep=""))
}


#' Label Top X entries in a data frame
#'
#' @param df A data frame.
#' @param col The column to be evaluated.
#' @param weight The variable summed to determine top entries, by default Action_Obligation_OMB25_GDP23
#' @param top_name The new column that will include
#' @param group Sub groupings by which to rank, by default not included
#' @param time The variable used for when considering recent top entries, by default Fiscal_Year
#' @param recent Value used to give extra weight to recently top entries rather than just the overall period
#' @param retain_rank Retain any ranked variables created in processing.
#' @param write_file File name in which to record the calculations underlying the labeling. Default is not to record.
#'
#' @return The revised data frame with the new variable
#'
#' @details Add column variants, including summaries, and deflated dollars,
#' to a contract dataset.
#'
#'
#' @export
#'
label_top<-function(df,
                    col,
                    n=5,
                    weight="Action_Obligation_OMB25_GDP23",
                    top_name=NA,
                    group_list=NA,
                    time="Fiscal_Year",
                    recent=NA,
                    retain_rank=FALSE,
                    write_file=NA){


  agg_list<-col
  if(!col %in% colnames(df))
    stop(paste(col,"is not founds in the columns of df"))
  if(!weight %in% colnames(df))
    stop(paste(weight,"is not founds in the columns of df"))
  if(!is.na(group_list)){
    agg_list<-c(col,group_list)
    for(g in group_list){
      if(!g %in% colnames(df))
        stop(paste(g,"is not founds in the columns of df"))
    }
  }

  if(is.na(top_name)) top_name<-paste0("Top_",col)
  if(top_name %in% colnames(df)){
    warning(paste(top_name,"already present in df, removing."))
    df<-df[,colnames(df)!=top_name]
  }

  if(is.na(recent)){
    #Just overall
    if(length(agg_list) == 1){
      agg_df <- df %>%
        dplyr::group_by(!! as.name(agg_list)) %>%
        summarize_(
          agg_val = lazyeval::interp(~sum(var, na.rm = TRUE), weight = as.name(weight)))
      agg_df<- agg_df %>% mutate(
        rank_total=rank(desc(agg_val))) %>%
        arrange(desc(agg_value))
    } else {
      agg_df <- df %>%
        dplyr::group_by_(.dots = c(agg_list)) %>%
        summarize_(
          agg_val = lazyeval::interp(~sum(var, na.rm = TRUE), var = as.name(weight)))
      agg_df<- agg_df %>% dplyr::group_by_(.dots = c(group_list)) %>%
        mutate(
          rank_total=rank(desc(agg_val))) %>%
        arrange(desc(agg_val))

    }
    agg_df[,top_name]<-NA
    agg_df[agg_df$rank_total<=n,top_name]<-
      agg_df[agg_df$rank_total<=n,col]

  } else{
    #Recent
    agg_df<-df
    if("recent" %in% colnames(agg_df))
      stop("'recent' is a column in agg_df and that name is needed.")

    agg_df$recent_weight<-if_else(agg_df[,time]>=recent,agg_df[,weight],NA)
    recent_weight<-"recent_weight"

    if(length(agg_list) == 1){
      agg_df <- agg_df %>%
        dplyr::group_by(!! as.name(agg_list)) %>%
        summarize_(
          agg_val = lazyeval::interp(~sum(var, na.rm = TRUE), var = as.name(weight)),
          agg_val_recent = lazyeval::interp(~sum(var, na.rm = TRUE), var = as.name(recent_weight))
        )
      agg_df<- agg_df %>% mutate(
        rank_total=rank(desc(agg_val)),
        rank_recent=rank(desc(agg_val_recent))) %>%
        arrange(desc(agg_value))
    } else {
      agg_df <- agg_df %>%
        dplyr::group_by_(.dots = c(agg_list)) %>%
        summarize_(
          agg_val = lazyeval::interp(~sum(var, na.rm = TRUE), var = as.name(weight)),
          agg_val_recent = lazyeval::interp(~sum(var, na.rm = TRUE), var = as.name(recent_weight)))
      agg_df<- agg_df %>% dplyr::group_by_(.dots = c(group_list)) %>%
        mutate(
          rank_total=rank(desc(agg_val)),
          rank_recent=rank(desc(agg_val_recent))) %>%
        arrange(desc(agg_val))

    }
    agg_df[,top_name]<-NA
    agg_df[agg_df$rank_total<=n|agg_df$rank_recent<=n ,top_name]<-
      agg_df[agg_df$rank_total<=n|agg_df$rank_recent<=n,col]

  }


  if(retain_rank){
    rank_total_name<-paste0(top_name,"_rank_total")
    if(rank_total_name %in% colnames(df)){
      warning(paste(rank_total_name,"already present in df, removing."))
      df<-df[,colnames(df)!=rank_total_name]
    }
    colnames(agg_df)[colnames(agg_df)=="rank_total"]<-rank_total_name
    if(!is.na(recent)){
      rank_recent_name<-paste0(top_name,"_rank_recent")
      if(rank_recent_name %in% colnames(df)){
        warning(paste(rank_recent_name,"already present in df, removing."))
        df<-df[,colnames(df)!=rank_recent_name]
      }
      colnames(agg_df)[colnames(agg_df)=="rank_recent"]<-rank_recent_name
    }
  }

  write.csv(agg_df,file=write_file,row.names = FALSE)

  df<-left_join(df,agg_df[,!colnames(agg_df) %in% c(weight,"rank_total","rank_recent")],
                by=c(agg_list))


  # agg_df<-df %>% group_by (Project.Name,PlatformPortfolio) %>%
  #             # Action_Obligation_2022=sum(if_else(Fiscal_Year>=2022,Action_Obligation_OMB24_GDP22,0)))%>%
  #   summarise(Action_Obligation_OMB25_GDP23=sum(Action_Obligation_OMB25_GDP23),
  #   group_by (PlatformPortfolio) %>%
  #   mutate(rank_total=rank(desc(Action_Obligation_OMB25_GDP23)),
  #          # rank_2022=rank(desc(Action_Obligation_2022)))
  # agg_df %>% arrange(desc(Action_Obligation_OMB25_GDP23))

  df[is.na(df[,top_name]) & !is.na(df[,col]),top_name]<-
    "Other Labeled"
  df
}

#' Add column variants, including summaries, and deflated dollars, to a contract dataset.
#'
#' @param df A data frame.
#' @param path The path or url for the column key.  By default, checks
#' the CSISdefense Github lookups repository at CSISdefense/Lookup/master/
#'
#' @return The fiscal year of the date.
#'
#' @details Add column variants, including summaries, and deflated dollars,
#' to a contract dataset.
#'
#'
#' @export
apply_standard_lookups<- function(df,path="https://raw.githubusercontent.com/CSISdefense/Lookup-Tables/master/",
                                  deflator_var=NULL){

  if(path=="offline"){
    warning("Using offline path")
    path<-get_local_lookup_path()
  }
  df<-standardize_variable_names(df)

  #Clear out blank/admin/error message rows at the end of the input file.
  while(substring(df[nrow(df),1],1,15) %in% c(
    "Completion time",
    "An error occurr",#ed while executing batch. Error message is: One or more errors occurred
    "Msg 208, Level ",#16, State 1, Procedure
    "Invalid object "#Invalid object name 'contract.FDPSpartia
  )){

    df<-df[-nrow(df),]
  }
  while(stringr::str_sub(df[nrow(df),1],-15,-1) %in% c(
    " row affected)\r",
    "rows affected)\r"
  )){

    df<-df[-nrow(df),]
  }


  #Empty rows
  if("Fiscal_Year" %in% colnames(df))
    if((df[nrow(df),1]=="" | is.na(df[nrow(df),1]))  & is.na(df$Fiscal_Year[nrow(df)]))
      df<-df[-nrow(df),]

  #### Dates #####
  if("action_date" %in% names(df)){
    conversion_test<- as.Date(df$action_date)
    if(any(is.na(conversion_test)&!is.na(df$action_date))){
      stop("Failed action_date conversion")
    }
    df$action_date <- conversion_test
    rm(conversion_test)

  }
  if("signeddate" %in% names(df)){
    conversion_test<- as.Date(df$signeddate)
    if(any(is.na(conversion_test)&!is.na(df$signeddate))){
      stop("Failed signeddate conversion")
    }
    df$signeddate <- conversion_test
    rm(conversion_test)

  }
  if("Fiscal_Year"%in% names(df)){

    df$Fiscal_Year <- text_to_number(df$Fiscal_Year)
    df$dFYear<-as.Date(paste("1/1/",as.character(df$Fiscal_Year),sep=""),"%m/%d/%Y")


    if("fiscal_quarter" %in% colnames(df)){
      df$Fiscal_YQ<-NA
      df$Fiscal_YQ[!is.na(df$fiscal_quarter)]<-text_to_number(paste(df$Fiscal_Year[!is.na(df$fiscal_quarter)],
                                                                    text_to_number(df$fiscal_quarter[!is.na(df$fiscal_quarter)]),sep="."))
      df$Fiscal_YQ[is.na(df$Fiscal_YQ)]<-df$Fiscal_Year[is.na(df$Fiscal_YQ)]
      df$YTD<-if_else(df$Fiscal_Year==max(df$Fiscal_Year),"YTD","Full Year")
    }
    else if ("fiscal_quarter_YTD" %in% colnames(df)){
      df$Fiscal_YQ<-NA
      df$Fiscal_YQ[!is.na(df$fiscal_quarter_YTD)]<-text_to_number(paste(df$Fiscal_Year[!is.na(df$fiscal_quarter_YTD)],
                                                                        text_to_number(df$fiscal_quarter_YTD[!is.na(df$fiscal_quarter_YTD)]),sep="."))
      df$Fiscal_YQ[is.na(df$Fiscal_YQ)]<-df$Fiscal_Year[is.na(df$Fiscal_YQ)]
    }

    # df$Fiscal_Year.End <-as.Date(paste("9/30/",as.character(year(df$Fiscal_Year)),sep=""),"%m/%d/%Y")
    # df$Fiscal_Year.Start <-as.Date(paste("10/1/",as.character(year(df$Fiscal_Year)-1),sep=""),"%m/%d/%Y")
  } else if ("signeddate" %in% colnames(df)){
    df$Fiscal_Year<-get_fiscal_year(df$signeddate)
    df$Fiscal_YQ<-NA
    df$fiscal_quarter<-lubridate::quarter(df$Fiscal_Year,fiscal_start = 10)
    df$Fiscal_YQ[!is.na(df$fiscal_quarter)]<-text_to_number(paste(df$Fiscal_Year[!is.na(df$fiscal_quarter)],
                                                                  text_to_number(df$fiscal_quarter[!is.na(df$fiscal_quarter)]),sep="."))
    df$YTD<-if_else(df$Fiscal_Year==max(df$Fiscal_Year),"YTD","Full Year")

    df$dFYear<-as.Date(paste("1/1/",as.character(df$Fiscal_Year),sep=""),"%m/%d/%Y")
  }
  #
  #   #***Join relevant variables to lookup tables
  #   if("ContractingAgencyID" %in%  names(df) &
  #      "ContractingOfficeID" %in%  names(df)  &
  #      "Fiscal_Year" %in%  names(df)  &
  #      !("ContractingOfficeName" %in%  names(df)|
  #        "MajorCommandID"  %in%  names(df))){
  #
  #     if("MajorCommandCode"%in% names(df)){
  #       df<-subset(df, select=-c(MajorCommandCode))
  #     }
  #
  #     if("MajorCommandName"%in% names(df)){
  #       df<-subset(df, select=-c(MajorCommandName))
  #     }
  #
  #     if("MajorCommandID"%in% names(df)){
  #       df<-subset(df, select=-c(MajorCommandCode))
  #     }
  #
  #
  #     df<-replace_nas_with_unlabeled(df,"ContractingOfficeID")
  #     df<-replace_nas_with_unlabeled(df,"ContractingAgencyID")
  #
  #     df<-read_and_join_experiment(df,
  #                           "Defense_Major_Command_Codes_and_Offices.csv",
  #                           by=c("Fiscal_Year",
  #                                "ContractingAgencyID",
  #                                "ContractingOfficeID"),
  #                           new_var_checked=FALSE)
  #
  #     NA.check.df<-subset(df, is.na(MajorCommandCode) &
  #                           ContractingAgencyID!="Uncategorized" &
  #                           ContractingAgencyID!="ContractingOfficeID" &
  #                           !is.na(Fiscal_Year),
  #                         select=c("Fiscal_Year",
  #                                  "ContractingOfficeID",
  #                                  "ContractingOfficeName",
  #                                  "MajorCommandID",
  #                                  "MajorCommandCode",
  #                                  "MajorCommandName"))
  #     if(nrow(NA.check.df)>0){
  #       print(unique(NA.check.df))
  #       stop(paste(nrow(NA.check.df),"rows of NAs generated in MajorCommandCode"))
  #     }
  #     #
  #     #     NA.check.df<-subset(df,is.na(SubCustomer.detail), select=c("Customer","SubCustomer"))
  #     #     if(nrow(NA.check.df)>0){
  #     #       print(unique(NA.check.df))
  #     #       stop(paste(nrow(NA.check.df),"rows of NAs generated in SubCustomer.detail"))
  #     #     }
  #     #
  #     #     NA.check.df<-subset(df,is.na(SubCustomer.detail), select=c("Customer","SubCustomer"))
  #     #     if(nrow(NA.check.df)>0){
  #     #       print(unique(NA.check.df))
  #     #       stop(paste(nrow(NA.check.df),"rows of NAs generated in SubCustomer.component"))
  #     #     }
  #     #
  #     #Rather than have one uncategorized per year, just manually assigning
  #     df<-replace_nas_with_unlabeled(df,"MajorCommandName","Uncategorized")
  #
  #
  #   }
  #   else if("MajorCommandID" %in%  names(df)){
  #
  #     if("MajorCommandCode"%in% names(df)){
  #       df<-subset(df, select=-c(MajorCommandCode))
  #     }
  #
  #     if("MajorCommandName"%in% names(df)){
  #       df<-subset(df, select=-c(MajorCommandName))
  #     }
  #
  #     # df<-replace_nas_with_unlabeled(df,"MajorCommandName","Uncategorized")
  #
  #     df<-read_and_join_experiment(df,
  #                           "Lookup_MajorCommandID.csv",
  #                           by="MajorCommandID",
  #                           skip_check_var=c("ContractingOfficeCode",
  #                                            "ContractingOfficeName"
  #                           ))
  #
  #     df<-replace_nas_with_unlabeled(df,"MajorCommandID","Uncategorized")
  #
  #
  #   }
  #
  #   else if("CSISofficeName" %in%  names(df)){
  #
  #     df<-replace_nas_with_unlabeled(df,"CSISofficeName","Uncategorized")
  #
  #     df<-read_and_join_experiment(df,
  #                           "LOOKUP_CSISofficeName.txt",
  #                           by="CSISofficeName")
  #
  #   }
  #
  #


  # Organization Funder ####



  if("fundingrequestingagencyid" %in% names(df))
  {

    if("Funding.Department.ID" %in% names(df)){
      df<-subset(df, select=-c(Funding.Department.ID))
    }
    if("Funding.Agency.Name"%in% names(df)){
      df<-subset(df, select=-c(Funding.Agency.Name))
    }

    if("FundingAgencyName"%in% names(df)){
      df<-subset(df, select=-c(FundingAgencyName))
    }
    if("Funder"%in% names(df)){
      df<-subset(df, select=-c(Funder))
    }
    if("SubFunder"%in% names(df)){
      df<-subset(df, select=-c(SubFunder))
    }
    if(is.numeric(df$fundingrequestingagencyid))
      df$fundingrequestingagencyid<-as.character(df$fundingrequestingagencyid)
    df<-read_and_join_experiment(df,
                                 path=path,
                                 "Agency_AgencyID.csv",
                                 directory="",
                                 by=c("fundingrequestingagencyid"="AgencyID"),
                                 add_var=c("Customer","SubCustomer","AgencyIDtext"),#Funding.Agency.ID
                                 skip_check_var=c("Platform","Customer","SubCustomer","AgencyIDtext"),
                                 guess_max=2000)
    colnames(df)[colnames(df)=="AgencyIDtext"]<-"FundingAgencyName"

    if("fundingrequestingofficeid" %in% names(df) & !"FundingMajorCommandID" %in% names(df)){

      colnames(df)[colnames(df)=="MajorCommandID"]<-"ContractingMajorCommandID"
      colnames(df)[colnames(df)=="MajorCommandCode"]<-"ContractingMajorCommandCode"
      colnames(df)[colnames(df)=="MajorCommandName"]<-"ContractingMajorCommandName"

      df<-read_and_join_experiment(df,
                                   path=path,
                                   directory="office\\",
                                   lookup_file = "MajComID.csv",
                                   by =c("Fiscal_Year"="Fiscal_Year",
                                         "fundingrequestingagencyid"="Contracting_Agency_ID",
                                         "fundingrequestingofficeid"="ContractingOfficeID"),
                                   skip_check_var = "MajorCommandID")

      df<-read_and_join_experiment(df,
                                   path=path,
                                   directory="office\\",
                                   lookup_file = "MajComSum.csv")

      colnames(df)[colnames(df)=="MajorCommandID"]<-"FundingMajorCommandID"
      colnames(df)[colnames(df)=="MajorCommandCode"]<-"FundingMajorCommandCode"
      colnames(df)[colnames(df)=="MajorCommandName"]<-"FundingMajorCommandName"


      colnames(df)[colnames(df)=="ContractingMajorCommandID"]<-"MajorCommandID"
      colnames(df)[colnames(df)=="ContractingMajorCommandCode"]<-"MajorCommandCode"
      colnames(df)[colnames(df)=="ContractingMajorCommandName"]<-"MajorCommandName"
    }
  }
  # if("Customer" %in% names(df) && "SubCustomer" %in% names(df)){
  #   if("SubCustomer.sum"%in% names(df)){
  #     df<-subset(df, select=-c(SubCustomer.sum))
  #   }
  #
  #
  #   df<-replace_nas_with_unlabeled(df,"SubCustomer","Uncategorized")
  #   df<-replace_nas_with_unlabeled(df,"Customer","Uncategorized")
  #
  #   #     debug(read_and_join_experiment)
  #   df<-read_and_join_experiment(df,
  #                                         "SubCustomer.csv",
  #                                         by=c("Customer"="Customer","SubCustomer"="SubCustomer"),
  #                                         add_var=c("SubCustomer.platform","SubCustomer.sum"),
  #                                         path=path,
  #                                         directory="office/"
  #   )
  # }
  # else if ("FundingCustomer" %in% names(df) & "FundingSubCustomer" %in% names(df)){
  #   df<-replace_nas_with_unlabeled(df,"FundingSubCustomer","Uncategorized")
  #   df<-read_and_join_experiment(df,
  #                                         "SubCustomer.csv",
  #                                         by=c("FundingCustomer"="Customer","FundingSubCustomer"="SubCustomer"),
  #                                         add_var=c("SubCustomer.platform","SubCustomer.sum"),
  #                                         path=path,
  #                                         directory="office/")
  # }

  # Organization Awarding ####
  if("agencyid" %in% names(df) &
     "PIID" %in% names(df) &
     "idvagencyid" %in% names(df) &
     "idvpiid" %in% names(df)
  )
  {

    df<-df%>% mutate(derived_link=paste("https://www.usaspending.gov/award/CONT_AWD_",PIID,"_",agencyid,"_",
                                        if_else(is.na(idvpiid)|idvpiid=="","-NONE-",idvpiid),"_",
                                        if_else(is.na(idvagencyid)|idvagencyid=="","-NONE-",idvagencyid),"/",sep=""))
  }

  if("Contracting_Agency_ID" %in% names(df))
  {

    if("Contracting.Department.ID" %in% names(df)){
      df<-subset(df, select=-c(Contracting.Department.ID))
    }
    if("ContractingAgencyName"%in% names(df)){
      df<-subset(df, select=-c(ContractingAgencyName))
    }

    if("Contracting.Agency.Name"%in% names(df)){
      df<-subset(df, select=-c(Contracting.Agency.Name))
    }
    if("Customer"%in% names(df)){
      df<-subset(df, select=-c(Customer))
    }
    if("SubCustomer"%in% names(df)){
      df<-subset(df, select=-c(SubCustomer))
    }
    if(is.numeric(df$Contracting_Agency_ID))
      df$Contracting_Agency_ID<-as.character(df$Contracting_Agency_ID)
    df<-read_and_join_experiment(df,
                                 path=path,
                                 "Agency_AgencyID.csv",
                                 directory="",
                                 by=c("Contracting_Agency_ID"="AgencyID"),
                                 add_var=c("Customer","SubCustomer","AgencyIDtext"),#Contracting.Agency.ID
                                 skip_check_var=c("Platform","Customer","SubCustomer","AgencyIDtext"),
                                 guess_max=2000)
    colnames(df)[colnames(df)=="AgencyIDtext"]<-"ContractingAgencyName"

    if("ContractingOfficeID" %in% names(df) & !"MajorCommandID" %in% names(df)){

      df<-read_and_join_experiment(df,
                                   path=path,
                                   directory="office\\",
                                   lookup_file = "MajComID.csv",
                                   by =c("Fiscal_Year"="Fiscal_Year",
                                         "Contracting_Agency_ID"="Contracting_Agency_ID",
                                         "ContractingOfficeID"="ContractingOfficeID"),
                                   skip_check_var = "MajorCommandID")

      df<-read_and_join_experiment(df,
                                   path=path,
                                   directory="office\\",
                                   lookup_file = "MajComSum.csv")




    }
  }

  # if("awarding_agency_code" %in% names(df))
  # {
  #
  #   df<-read_and_join_experiment(df,
  #                                path=path,
  #                                "Awarding_Agency_Code.csv",
  #                                directory="office",
  #                                by=c("awarding_agency_code"="awarding_agency_code"),
  #                                add_var=c("Customer","SubCustomer","AgencyIDtext"),#Contracting.Agency.ID
  #                                skip_check_var=c("Platform","Customer","SubCustomer","AgencyIDtext"),
  #                                guess_max=2000)
  #   colnames(df)[colnames(df)=="AgencyIDtext"]<-"ContractingAgencyName"
  #
  #   if("ContractingOfficeID" %in% names(df) & !"MajorCommandID" %in% names(df)){
  #
  #     df<-read_and_join_experiment(df,
  #                                  path=path,
  #                                  directory="office\\",
  #                                  lookup_file = "MajComID.csv",
  #                                  by =c("Fiscal_Year"="Fiscal_Year",
  #                                        "Contracting_Agency_ID"="Contracting_Agency_ID",
  #                                        "ContractingOfficeID"="ContractingOfficeID"),
  #                                  skip_check_var = "MajorCommandID")
  #
  #     df<-read_and_join_experiment(df,
  #                                  path=path,
  #                                  directory="office\\",
  #                                  lookup_file = "MajComSum.csv")
  #
  #
  #
  #
  #   }
  # }


  if("Customer" %in% names(df) && "SubCustomer" %in% names(df)){
    if("SubCustomer.sum"%in% names(df)){
      df<-subset(df, select=-c(SubCustomer.sum))
    }


    df<-replace_nas_with_unlabeled(df,"SubCustomer","Uncategorized")
    df<-replace_nas_with_unlabeled(df,"Customer","Uncategorized")

    #     debug(read_and_join_experiment)
    df<-read_and_join_experiment(df,
                                          "SubCustomer.csv",
                                          by=c("Customer"="Customer","SubCustomer"="SubCustomer"),
                                          add_var=c("SubCustomer.platform","SubCustomer.sum"),
                                          path=path,
                                          directory="office/"
    )
  }
  else if ("ContractingCustomer" %in% names(df) & "ContractingSubCustomer" %in% names(df)){
    df$ContractingCustomer[df$ContractingCustomer==""]<-NA
    df<-replace_nas_with_unlabeled(df,"ContractingSubCustomer","Uncategorized")
    df<-read_and_join_experiment(df,
                                          "SubCustomer.csv",
                                          by=c("ContractingCustomer"="Customer","ContractingSubCustomer"="SubCustomer"),
                                          add_var=c("SubCustomer.platform","SubCustomer.sum"),
                                          path=path,
                                          directory="office/")
  }

  # else if ("SubCustomer" %in% names(df)){
  #   stop("Customer is missing from the table, SubCustomer does not stand alone.")
  # }
  # else if("Customer" %in% names(df)){
  #   df<-replace_nas_with_unlabeled(df,"Customer")
  #   df<-read_and_join_experiment(df,
  #                         "LOOKUP_Customer.csv",
  #                         by=c("Customer"))
  #   # NA.check.df<-subset(df,is.na(Customer.sum), select=c("Customer","Customer.sum"))
  #   # if(nrow(NA.check.df)>0){
  #   #   print(unique(NA.check.df))
  #   #   stop(paste(nrow(NA.check.df),"rows of NAs generated in Customer.sum"))
  #   # }
  #
  # }


  # classify competition
  if("CompetitionClassification" %in% names(df) & "ClassifyNumberOfOffers" %in% names(df) )
  {
    df<-read_and_join_experiment(df,
                                          "CompetitionClassification.csv",
                                          by=c("CompetitionClassification","ClassifyNumberOfOffers"),
                                          replace_na_var="ClassifyNumberOfOffers",
                                          add_var=c("Competition.sum",
                                                    "Competition.multisum",
                                                    "Competition.effective.only",
                                                    "No.Competition.sum"),
                                          path=path,
                                          directory="contract/"
    )
  }
  if("Vehicle" %in% names(df) ){
    df<-read_and_join_experiment(df,
                                          "Vehicle.csv",
                                          by=c("Vehicle"="Vehicle.detail"),
                                          add_var=c("Vehicle.sum","Vehicle.sum7","Vehicle.AwardTask"),
                                          path=path,
                                          # path="K:/Users/Greg/Repositories/Lookup-Tables/",
                                          directory="contract/"
    )
  }

  # Mechanism Type ####
  if("assistance_type_code" %in% names(df) ){
    df<-df %>% mutate(cfda_num=if_else(
      nchar(cfda_number)>6,
      round(text_to_number(cfda_number),3),
      cfda_number))
    if(any(!is.na(df$cfda_number)&is.na(df$cfda_num)))
      stop("Mangled CFDA number")
    df<-read_and_join_experiment(df,directory="assistance//",lookup_file="assistance_type_code.csv",
                                 by="assistance_type_code",path=path)
  }

  if("assistance_type_code" %in% names(df) ){
    df<-read_and_join_experiment(data=df
                                 ,"assistance_type_code.csv"
                                 ,path=path
                                 ,directory="assistance//"
                                 ,by="assistance_type_code",
                                 ,add_var = c("assistance_type_description"),
                                 # ,skip_check_var = c("PricingInflation","TypeOfContractPricingText")
                                 ,lookup_char_as_factor = TRUE
                                 # ,new_var_checked=FALSE
                                 # ,create_lookup_rdata=TRUE
                                 # ,col_types="dddddddddccc"
    )
  }
  if("costaccountingstandardsclause" %in% names(df))
    df$costaccountingstandardsclause[df$costaccountingstandardsclause==""]<-NA
  if("costorpricingdata" %in% names(df))
    df$costorpricingdata[df$costorpricingdata==""]<-NA


  if("TypeOfContractPricing" %in% names(df) ){

    df<-read_and_join_experiment(data=df
                                 ,"contract.TypeOfContractPricing.csv"
                                 ,path=path
                                 ,directory="contract/"
                                 ,add_var = c("PricingInflation","TypeOfContractPricingText")
                                 ,skip_check_var = c("PricingInflation","TypeOfContractPricingText")
                                 # ,by=c("informationtechnologycommercialitemcategory"="informationtechnologycommercialitemcategory")
                                 # ,new_var_checked=FALSE
                                 # ,create_lookup_rdata=TRUE
                                 # ,col_types="dddddddddccc"
    )
    df$PricingInflation<-factor(df$PricingInflation)
    df$TypeOfContractPricingText<-factor(df$TypeOfContractPricingText)
  }

  if("PricingUCA" %in% names(df) & !"PricingUCA.sum" %in% names(df) ){

    df$PricingUCA.sum<-factor(df$PricingUCA)
    df<-replace_nas_with_unlabeled(df,"PricingUCA.sum")
    levels(df$PricingUCA.sum)<-
      list("FFP"="FFP",
           "Less Common"=c("Other FP","T&M/LH/FPLOE"),
           "Incentive"="Incentive",
           "Other CB"="Other CB",
           "UCA"="UCA",
           "Unclear"=c("Combination/Other","Unlabeled"))
  }

  if("informationtechnologycommercialitemcategory" %in% names(df)){
    df<-read_and_join_experiment(data=df
                                 ,"InformationTechnologyCommercialItemCategory.csv"
                                 ,path=path
                                 ,directory="productorservice/"
                                 ,by=c("informationtechnologycommercialitemcategory"="informationtechnologycommercialitemcategory")
                                 # ,new_var_checked=FALSE
                                 # ,create_lookup_rdata=TRUE
                                 # ,col_types="dddddddddccc"
    )
  }
  #
  #
  #     df<-replace_nas_with_unlabeled(df,"Contracting.Agency.ID","Uncategorized")
  #
  #     df<-read_and_join_experiment(df,
  #                                      "Agency_AgencyID.csv",
  #                                      by=c("Contracting.Agency.ID"="AgencyID"),
  #                                      path=path,
  #                                      directory="")
  #
  #     # NA.check.df<-subset(df, is.na(Contracting.Agency.Name) , select=c("Contracting.Agency.ID"))
  #     # if(nrow(NA.check.df)>0){
  #     #   print(unique(NA.check.df))
  #     #   stop(paste(nrow(NA.check.df),"rows of NAs generated in Contracting.Agency.Name"))
  #     # }
  #
  #   }
  #   else if("Contracting.Department.ID" %in% names(df)){
  #
  #     if("Contracting.Agency.Name"%in% names(df)){
  #       df<-subset(df, select=-c(Contracting.Agency.Name))
  #     }
  #     if("Customer"%in% names(df)){
  #       df<-subset(df, select=-c(Customer))
  #     }
  #     if("SubCustomer"%in% names(df)){
  #       df<-subset(df, select=-c(SubCustomer))
  #     }
  #     names(df)[which(names(df)=="Contracting.Department.ID")]<-"Contracting.Agency.ID"
  #     #     stop("safety")
  #
  #     #     debug(read_and_join_experiment)
  #
  #     df<-read_and_join_experiment(df,
  #                           "LOOKUP_Contracting_Agencies.csv",
  #                           by=c("Contracting.Agency.ID"))
  #     # NA.check.df<-subset(df, is.na(Contracting.Agency.Name), select=c("Fiscal_Year","Contracting.Agency.ID"))
  #     # if(nrow(NA.check.df)>0){
  #     #   print(unique(NA.check.df))
  #     #   stop(paste(nrow(NA.check.df),"rows of NAs generated in Contracting.Agency.Name"))
  #     # }
  #   }
  #
  #
  #
  #
  #   if("Funder" %in% names(df) && "SubFunder" %in% names(df)){
  #
  #     if("SubFunder.Sum"%in% names(df)){
  #       df<-subset(df, select=-c(SubFunder.Sum))
  #     }
  #
  #     df<-read_and_join_experiment(df,
  #                           "LOOKUP_SubFunder.csv",
  #                           by=c("Funder","Subfunder"))
  #     # NA.check.df<-subset(df, is.na(SubFunder.Sum) & !is.na(Funder), select=c("Fiscal_Year","Funder","SubFunder"))
  #     # if(nrow(NA.check.df)>0){
  #     #   print(unique(NA.check.df))
  #     #   stop(paste(nrow(NA.check.df),"rows of NAs generated in SubFunder.Sum"))
  #     # }
  #   }
  #
  #
  #   df<-competition_vehicle_lookups(path,df)
  if("Action_Obligation"%in% names(df))
    df$Action_Obligation<-text_to_number(df$Action_Obligation)
  if("NumberOfActions" %in% colnames(df))
    df$NumberOfActions <- text_to_number(df$NumberOfActions)
  #
  #
  #   if("PoPstateCode" %in% names(df)){
  #
  #     if("StateText"%in% names(df)){
  #       df<-subset(df, select=-c(StateText))
  #     }
  #
  #     if("Census.Region"%in% names(df)){
  #       df<-subset(df, select=-c(Census.Region))
  #     }
  #
  #     if("BEA.Region"%in% names(df)){
  #       df<-subset(df, select=-c(BEA.Region))
  #     }
  #
  #     if("CSIS.Region"%in% names(df)){
  #       df<-subset(df, select=-c(CSIS.Region))
  #     }
  #
  #     df<-read_and_join_experiment(df,
  #                           "LOOKUP_State_Code.csv")
  #     NA.check.df<-subset(df, is.na(StateText) & !is.na(PoPstateCode), select=c("PoPstateCode","StateText"))
  #     if(nrow(NA.check.df)>0){
  #       print(unique(NA.check.df))
  #       stop(paste(nrow(NA.check.df),"rows of NAs generated in StateText"))
  #     }
  #   }
  #
  #
  #
  #
  #   if(("OMBagencyCode" %in% names(df))
  #      #     &"OMBagencyName" %in% names(df)
  #      &"OMBbureauCode" %in% names(df)
  #      #     &"OMBbureauname" %in% names(df)
  #   )
  #   {
  #     #
  #     if("OMBagencyName" %in% names(df)){
  #       df<-subset(df, select=-c(OMBagencyName))
  #     }
  #     if("OMBbureauname"%in% names(df)){
  #       df<-subset(df, select=-c(OMBbureauname))
  #     }
  #     #   if("Customer"%in% names(df)){
  #     #     df<-subset(df, select=-c(Customer))
  #     #   }
  #     #   if("SubCustomer"%in% names(df)){
  #     #     df<-subset(df, select=-c(SubCustomer))
  #     #   }
  #     #       debug(read_and_join_experiment)
  #     df<-read_and_join_experiment(df,
  #                           "LOOKUP_OMBagencyBureau.csv")
  #     NA.check.df<-subset(df
  #                         , is.na(CSISbureau) #& !is.na(OMBbureauCode)
  #                         , select=c("OMBagencyCode"
  #                                    ,"OMBagencyName"
  #                                    ,"OMBbureauCode"
  #                                    ,"OMBbureauname"
  #                                    ,"CSISbureau"
  #                         )
  #     )
  #
  #     if(nrow(NA.check.df)>0){
  #       print(unique(NA.check.df))
  #       stop(paste(nrow(NA.check.df),"rows of NAs generated in CSISbureau"))
  #     }
  #
  #   }
  #
  #If there's existing productorservce descriptions, these may be more precise than
  #What we can provide via lookup tables.
  #### PSC / NAICS / Project ####
  if("ProductOrServiceCode" %in% names(df) & !"Product.or.Service.Description" %in% names(df)&
     !"ProductOrServiceCodeText" %in% names(df))
  {
    if(is.integer(df$ProductOrServiceCode)){
      df$ProductOrServiceCode<-factor(df$ProductOrServiceCode)
    }
    df$ProductOrServiceCode[df$ProductOrServiceCode==""]<-NA

    df<-read_and_join_experiment(df,
                                          "ProductOrServiceCodes.csv",
                                          by=c("ProductOrServiceCode"="ProductOrServiceCode"),
                                          add_var=c("ProductOrServiceCodeText"),
                                          path=path,
                                          skip_check_var = c("ProductServiceOrRnDarea"),
                                          directory=""
    )
  }
  #We can be more precise
  if("ProductOrServiceCode" %in% names(df) &
     !"ProductServiceOrRnDarea" %in% names(df) &
     "Fiscal_Year"  %in% names(df))
  {
    if(is.integer(df$ProductOrServiceCode)){
      df$ProductOrServiceCode<-factor(df$ProductOrServiceCode)
    }
    df$ProductOrServiceCode[df$ProductOrServiceCode==""]<-NA

    df<-df %>% mutate(Fiscal_Year_gt_2020=if_else(Fiscal_Year>2020,1,0))
    df<-read_and_join_experiment(df,
                                          "PSCAtransition.csv",
                                          directory="ProductOrService/",
                                          by=c("ProductOrServiceCode"="ProductOrServiceCode",
                                               "Fiscal_Year_gt_2020"="Fiscal_Year_gt_2020"),
                                          add_var=c("ProductServiceOrRnDarea"),
                                          path=path,
                                          skip_check_var = c("ProductServiceOrRnDarea",
                                                             "TransitionProductServiceOrRnDarea"),
    )
    colnames(df)[colnames(df)=="ProductServiceOrRnDarea"]<-"TransitionProductServiceOrRnDarea"

    df<-read_and_join_experiment(df,
                                          "ProductOrServiceCodes.csv",
                                          by=c("ProductOrServiceCode"="ProductOrServiceCode"),
                                          add_var=c("Simple","ProductOrServiceArea","ProductServiceOrRnDarea"),
                                          path=path,
                                          skip_check_var = c("ProductServiceOrRnDarea"),
                                          directory=""

    )
    df$ProductServiceOrRnDarea[is.na(df$ProductServiceOrRnDarea)]<-
      df$TransitionProductServiceOrRnDarea[is.na(df$ProductServiceOrRnDarea)]
    #Manual check because we have to draw from two sources
    NA.check.df<-subset(df, is.na(ProductServiceOrRnDarea)&!is.na(ProductOrServiceCode),
                        select=c("ProductOrServiceCode","TransitionProductServiceOrRnDarea","ProductServiceOrRnDarea"))
    if(nrow(NA.check.df)>0){
      print(unique(NA.check.df))
      warning(paste(nrow(NA.check.df),"rows of NAs generated in ProductServiceOrRnDarea"))
    }

    df<-df %>% dplyr::select(-Fiscal_Year_gt_2020,-TransitionProductServiceOrRnDarea)

  }
  else if("ProductServiceOrRnDarea" %in% names(df))
  {
    df<-replace_nas_with_unlabeled(df,"ProductServiceOrRnDarea")

    if("ProductServiceOrRnDarea.sum" %in% names(df)){
      df<-subset(df, select=-c(ProductServiceOrRnDarea.sum))
    }
    if("SimpleArea" %in% names(df)){
      df<-subset(df, select=-c(SimpleArea))
    }
    if("Simple" %in% names(df)){
      df<-subset(df, select=-c(Simple))
    }
    # if("ServicesCategory.sum" %in% names(df)){
    #   df<-subset(df, select=-c(ServicesCategory.sum))
    # }
    # if("ProductOrServiceArea" %in% names(df)){
    #   df<-subset(df, select=-c(ProductOrServiceArea))
    # }
    # if("ServicesCategory.detail" %in% names(df)){
    #   df<-subset(df, select=-c(ServicesCategory.detail))
    # }
    #     debug(read_and_join_experiment)

    #Classify Product or Service Codes
    df<-read_and_join_experiment(df,
                                          "ProductServiceOrRnDarea.csv",
                                          # by="ProductOrServiceArea",
                                          by="ProductServiceOrRnDarea",
                                          replace_na_var="ProductServiceOrRnDarea",
                                          add_var=c("ProductServiceOrRnDarea.sum","ServicesCategory.detail","ServicesCategory.sum"),
                                          path=path,
                                          directory="productorservice/"
    )


    NA.check.df<-subset(df, is.na(ServicesCategory.sum), select=c("ProductServiceOrRnDarea"))
    if(nrow(NA.check.df)>0){
      print(unique(NA.check.df))
      stop(paste(nrow(NA.check.df),"rows of NAs generated in ProductServiceOrRnDarea"))
    }

  }
  #   else if("ProductOrServiceArea" %in% names(df))
  #   {
  #     df<-replace_nas_with_unlabeled(df,"ProductOrServiceArea")
  #
  #     if("ServicesCategory.sum" %in% names(df)){
  #       df<-subset(df, select=-c(ServicesCategory.sum))
  #     }
  #     #     debug(read_and_join_experiment)
  #     colnames(df)[colnames(df)=="ProductOrServiceArea"]<-"ProductServiceOrRnDarea"
  #     df<-read_and_join_experiment(df,
  #                           "LOOKUP_Buckets.csv",by="ProductServiceOrRnDarea")
  #     colnames(df)[colnames(df)=="ProductServiceOrRnDarea"]<-"ProductServiceOrRnDarea"
  #
  #     NA.check.df<-subset(df, is.na(ServicesCategory.sum), select=c("ProductOrServiceArea"))
  #     if(nrow(NA.check.df)>0){
  #       print(unique(NA.check.df))
  #       stop(paste(nrow(NA.check.df),"rows of NAs generated in ProductOrServiceArea"))
  #     }
  #
  #     NA.check.df<-subset(df, is.na(ServicesCategory.detail), select=c("ProductOrServiceArea"))
  #     if(nrow(NA.check.df)>0){
  #       print(unique(NA.check.df))
  #       stop(paste(nrow(NA.check.df),"rows of NAs generated in ServicesCategory.detail"))
  #     }
  #
  #   }
  #
  #   else if("ServicesCategory.detail" %in% names(df))
  #   {
  #
  #     if("ServicesCategory.sum" %in% names(df)){
  #       df<-subset(df, select=-c(ServicesCategory.sum))
  #     }
  #
  #     #     debug(read_and_join_experiment)
  #     df<-read_and_join_experiment(df,
  #                           "LOOKUP_Buckets.csv")
  #     NA.check.df<-subset(df, is.na(ServicesCategory.sum), select=c("Fiscal_Year","ServicesCategory.detail"))
  #     if(nrow(NA.check.df)>0){
  #       print(unique(NA.check.df))
  #       stop(paste(nrow(NA.check.df),"rows of NAs generated in ServicesCategory.sum"))
  #     }
  #
  #   }
  #
  #
  #
  if("PlatformPortfolio" %in% names(df)){
    #   {
    #
    #     if("PlatformPortfolio.sum" %in% names(df)){
    #       df<-subset(df, select=-c(PlatformPortfolio.sum))
    #     }
    #
    df<-replace_nas_with_unlabeled(df,"PlatformPortfolio")
    #
    #     df<-read_and_join_experiment(df,
    #                           "LOOKUP_PlatformPortfolio.csv")
    #     NA.check.df<-subset(df, is.na(PlatformPortfolio.sum), select=c("PlatformPortfolio"))
    #     if(nrow(NA.check.df)>0){
    #       print(unique(NA.check.df))
    #       stop(paste(nrow(NA.check.df),"rows of NAs generated in PlatformPortfolio.sum"))
    #     }
    if("IsRemotelyOperated" %in% names(df)){
      df$PlatformPortfolioUAV<-as.character(df$PlatformPortfolio)
      df$PlatformPortfolioUAV[df$IsRemotelyOperated==1& !is.na(df$IsRemotelyOperated)]<-"Remotely Crewed"
    }
    else if ("ProductOrServiceCode" %in% names(df) & "ProjectID" %in% names(df)){

      df<-read_and_join_experiment(df,
                                   lookup_file="ProjectID.txt",
                                   path=path,directory="project/",
                                   add_var = c("IsRemotelyOperated"),
                                   by=c("ProjectID"),
                                   # missing_file="missing_iso.csv",
                                   skip_check_var = c("IsRemotelyOperated"))
      df$IsRemotelyOperated[df$ProductOrServiceCode==1550]<-1
      df$PlatformPortfolioUAV<-as.character(df$PlatformPortfolio)
      df$PlatformPortfolioUAV[df$IsRemotelyOperated==1& !is.na(df$IsRemotelyOperated)]<-"Remotely Crewed"
    }
  }


  #**** ProjectID *******
  if("ProjectID" %in% names(df)){


    if("Project.Name" %in% names(df)){
      df<-subset(df, select=-c(Project.Name))
    }

    df<-read_and_join_experiment(df,
                                 lookup_file="ProjectID.txt",
                                 path=path,directory="project/",
                                 add_var = c("ProjectName","IsUnknown","IsRemotelyOperated"),
                                 by=c("ProjectID"),
                                 # missing_file="missing_iso.csv",
                                 skip_check_var = c("IsRemotelyOperated")
    )

    if ("SubCustomer.platform" %in% names(df) & "ProjectName" %in% names(df)){
      df$SubCustomer.JPO<-as.character(df$SubCustomer.platform)
      df$SubCustomer.JPO[df$ProjectName %in% c("JSF (F-35) ","JSF (F-35)") & !is.na(df$ProjectName)&df$SubCustomer.platform=="Navy"]<-"F-35 JPO"
      df$SubCustomer.JPO<-factor(df$SubCustomer.JPO)
    }

  } else if ("SubCustomer.platform" %in% names(df) & "IsF35" %in% names(df)){
    df$SubCustomer.JPO<-as.character(df$SubCustomer.platform)
    df$SubCustomer.JPO[df$IsF35==1 & !is.na(df$IsF35==1)&df$SubCustomer.platform=="Navy"]<-"F-35 JPO"
    df$SubCustomer.JPO<-factor(df$SubCustomer.JPO)
  }

  #
  #
  #
  #   if("Arms.Type" %in% names(df)){
  #     #     debug(read_and_join_experiment)
  #     df<-read_and_join_experiment(df,
  #                           "LOOKUP_ArmsType.csv")
  #
  #
  #     NA.check.df<-subset(df, is.na(Arms.Summary), select=c("Arms.Type","Arms.Summary")
  #     )
  #
  #     if(nrow(NA.check.df)>0){
  #       print(unique(NA.check.df))
  #       stop(paste(nrow(NA.check.df),"rows of NAs generated in Arms.Type"))
  #     }
  #
  #   }
  #
  #   if("Country" %in% names(df)){
  #     #     debug(read_and_join_experiment)
  #     df<-read_and_join_experiment(df,
  #                           "LOOKUP_Country.csv")
  #
  #
  #     NA.check.df<-subset(df, is.na(Country.Proper), select=c("Country","Country.Proper")
  #     )
  #
  #     if(nrow(NA.check.df)>0){
  #       print(unique(NA.check.df))
  #       stop(paste(nrow(NA.check.df),"rows of NAs generated in Countries"))
  #     }
  #
  #   }
  #   else if("Destination.Country" %in% names(df)){
  #     #     debug(read_and_join_experiment)
  #     df<-read_and_join_experiment(df,
  #                           "LOOKUP_Country.csv")
  #
  #
  #     NA.check.df<-subset(df, is.na(Country.Proper), select=c("Destination.Country","Country.Proper")
  #     )
  #
  #     if(nrow(NA.check.df)>0){
  #       print(unique(NA.check.df))
  #       stop(paste(nrow(NA.check.df),"rows of NAs generated in Countries"))
  #     }
  #   }
  #   #     browser()
  #   if("Pricing.Mechanism" %in% names(df)){
  #     df$Pricing.Mechanism[df$Pricing.Mechanism==""]<-NA
  #
  #     df<-replace_nas_with_unlabeled(df,"Pricing.Mechanism")
  #
  #     if("Pricing.Mechanism.sum" %in% names(df)){
  #       df<-subset(df, select=-c(Pricing.Mechanism.sum))
  #     }
  #
  #     if("Pricing.Mechanism.detail" %in% names(df)){
  #       df<-subset(df, select=-c(Pricing.Mechanism.detail))
  #     }
  #
  #     if("Pricing.Mechanism.Correlation" %in% names(df)){
  #       df<-subset(df, select=-c(Pricing.Mechanism.Correlation))
  #     }
  #
  #     if("Pricing.Mechanism.Graph" %in% names(df)){
  #       df<-subset(df, select=-c(Pricing.Mechanism.Graph))
  #     }
  #
  #     #     stop("hammertiime")
  #
  #
  #     df<-read_and_join_experiment(df,
  #                           "LOOKUP_Pricing_Mechanism.csv",
  #                           by=c("Pricing.Mechanism"),
  #                           skip_check_var=c("IsCostBased","Pricing.Mechanism.Code","IsFixedPrice",	"IsIncentive"),
  #                           replace_na_var=("Pricing.Mechanism")
  #     )
  #   }
  #   #   else if ("Pricing.Mechanism.Code" %in% names(df)){
  #   #     #Replace blank strings with Unlabeled
  #   # #     df$Pricing.Mechanism<-mapvalues(df$Pricing.Mechanism,from=c(""),to=c("Unlabeled"))
  #   # #
  #   # #     #Handle NA values if present
  #   # #     if(any(is.na(df$Pricing.Mechanism))){
  #   # #       #Make sure unlabeled is within the list of levels
  #   # #       if (!("Unlabeled" %in% levels(df$Pricing.Mechanism))){
  #   # #         df$Pricing.Mechanism=factor(df$Pricing.Mechanism,levels=c(unique(df$Pricing.Mechanism),"Unlabeled"))
  #   # #       }
  #   # #     }
  #   # #
  #   # #     #Replace NAs with Uncategorized
  #   # #     df$Pricing.Mechanism[is.na(df$Pricing.Mechanism)]<-"Unlabeled"
  #   # #
  #   #
  #   #
  #   #     if("IsCostBased" %in% names(df) && (!any(!is.na(df$IsCostBased)))){
  #   #       df<-subset(df, select=-c(IsCostBased))
  #   #     }
  #   #
  #   #     if("IsFixedPrice" %in% names(df) && all(is.na(df$IsFixedPrice))){
  #   #       df<-subset(df, select=-c(IsFixedPrice))
  #   #     }
  #   #
  #   #
  #   #     if("IsIncentive" %in% names(df) && (!any(!is.na(df$IsIncentive)))){
  #   #       df<-subset(df, select=-c(IsIncentive))
  #   #     }
  #   #
  #   #     if("Pricing.Mechanism.sum" %in% names(df)){
  #   #       df<-subset(df, select=-c(Pricing.Mechanism.sum))
  #   #     }
  #   #
  #   #     if("Pricing.Mechanism.detail" %in% names(df)){
  #   #       df<-subset(df, select=-c(Pricing.Mechanism.detail))
  #   #     }
  #   #
  #   #     if("Pricing.Mechanism.Correlation" %in% names(df)){
  #   #       df<-subset(df, select=-c(Pricing.Mechanism.Correlation))
  #   #     }
  #   #
  #   #     if("Pricing.Mechanism.Graph" %in% names(df)){
  #   #       df<-subset(df, select=-c(Pricing.Mechanism.Graph))
  #   #     }
  #   #
  #   #     #     stop("hammertiime")
  #   #
  #   #
  #   #
  #   #     df<-read_and_join_experiment(df,"LOOKUP_Pricing_Mechanism.csv")
  #   #
  #   #
  #   #     NA.check.df<-subset(df, !is.na(Pricing.Mechanism.Code) & is.na(Pricing.Mechanism.sum), select=c("Pricing.Mechanism"))
  #   #     if(nrow(NA.check.df)>0){
  #   #       print(unique(NA.check.df))
  #   #       stop(paste(nrow(NA.check.df),"rows of NAs generated in Pricing.Mechanism.sum"))
  #   #     }
  #   #
  #   #     NA.check.df<-subset(df, is.na(Pricing.Mechanism.detail), select=c("Pricing.Mechanism"))
  #   #     if(nrow(NA.check.df)>0){
  #   #       print(unique(NA.check.df))
  #   #       stop(paste(nrow(NA.check.df),"rows of NAs generated in Pricing.Mechanism.detail"))
  #   #     }
  #   #   }
  #
  #
  #   if("Contractor.Size" %in% names(df)){
  #     df<-read_and_join_experiment(df,"LOOKUP_Contractor_Size_named.csv")
  #
  #     NA.check.df<-subset(df, is.na(Contractor.Size.detail), select=c("Contractor.Size"))
  #     if(nrow(NA.check.df)>0){
  #       print(unique(NA.check.df))
  #       stop(paste(nrow(NA.check.df),"rows of NAs generated in Contractor.Size.detail"))
  #     }
  #
  #     NA.check.df<-subset(df, is.na(Contractor.Size.sum), select=c("Contractor.Size"))
  #     if(nrow(NA.check.df)>0){
  #       print(unique(NA.check.df))
  #       stop(paste(nrow(NA.check.df),"rows of NAs generated in Contractor.Size.sum"))
  #     }
  #
  #     NA.check.df<-subset(df, is.na(Contractor.Size.correlation), select=c("Contractor.Size"))
  #     if(nrow(NA.check.df)>0){
  #       print(unique(NA.check.df))
  #       stop(paste(nrow(NA.check.df),"rows of NAs generated in Contractor.Size.correlation"))
  #     }
  #
  #   }
  #
  #

  if("principalnaicscode" %in% names(df))
  {
    df$principalnaicscode[df$principalnaicscode==""]<-NA
    if(any(is.na(text_to_number(df$principalnaicscode))&!is.na(df$principalnaicscode)))
      stop(paste("Text in principalnaicscode:",
                 levels(factor(df$principalnaicscode[is.na(text_to_number(df$principalnaicscode))&!is.na(df$principalnaicscode)]))))
    df$principalnaicscode<-text_to_number(df$principalnaicscode)

    df<-read_and_join_experiment(df,
                                          "Lookup_PrincipalNAICScode.csv",
                                          by=c("principalnaicscode"="principalnaicscode"),
                                          add_var=c("principalnaicscodeText"),
                                          path=path,
                                          skip_check_var = c("principalnaicscodeText"),
                                          directory="economic"
    )
  }

  #### Vendor Size ####
  if("VendorSize" %in% names(df)){
    df<-replace_nas_with_unlabeled(df,"VendorSize")

    df<-read_and_join_experiment(df,
                                          "VendorSize.csv",
                                          by=c("VendorSize"="VendorSize"),
                                          add_var="Shiny.VendorSize",
                                          path=path,
                                          directory="vendor/"
    )

    # df<-read_and_join_experiment(df,"LOOKUP_Contractor_Size.csv")
    #
    # NA.check.df<-subset(df, is.na(Vendor.Size.detail), select=c("Vendor.Size"))
    # if(nrow(NA.check.df)>0){
    #   print(unique(NA.check.df))
    #   stop(paste(nrow(NA.check.df),"rows of NAs generated in Vendor.Size.detail"))
    # }
    #
    # NA.check.df<-subset(df, is.na(Vendor.Size.sum), select=c("Vendor.Size"))
    # if(nrow(NA.check.df)>0){
    #   print(unique(NA.check.df))
    #   stop(paste(nrow(NA.check.df),"rows of NAs generated in Vendor.Size.sum"))
    # }

  }

  #
  #
  #   if("Contract.Size" %in% names(df)){
  #     df<-replace_nas_with_unlabeled(df,"Contract.Size")
  #
  #     df<-read_and_join_experiment(df,"LOOKUP_Contract_Size.csv")
  #
  #     NA.check.df<-subset(df, is.na(Contract.Size.detail), select=c("Contract.Size"))
  #     if(nrow(NA.check.df)>0){
  #       print(unique(NA.check.df))
  #       stop(paste(nrow(NA.check.df),"rows of NAs generated in Contract.Size.detail"))
  #     }
  #
  #     NA.check.df<-subset(df, is.na(Contract.Size.sum), select=c("Contract.Size"))
  #     if(nrow(NA.check.df)>0){
  #       print(unique(NA.check.df))
  #       stop(paste(nrow(NA.check.df),"rows of NAs generated in Contract.Size.sum"))
  #     }
  #
  #   }
  #
  #
  #   if("Extent.Competed" %in% names(df)){
  #     df<-read_and_join_experiment(df,"LOOKUP_Extent_Competed.csv")
  #
  #     NA.check.df<-subset(df, is.na(Extent.Competed.Sum), select=c("Extent.Competed.Sum"))
  #     if(nrow(NA.check.df)>0){
  #       print(unique(NA.check.df))
  #       stop(paste(nrow(NA.check.df),"rows of NAs generated in Extent.Competed.Sum"))
  #     }
  #   }
  #
  #   if("systemequipmentcode" %in% names(df)){
  #     df<-replace_nas_with_unlabeled(df,"systemequipmentcode")
  #
  #     df<-read_and_join_experiment(df,"LOOKUP_systemequipmentcode.csv")
  #
  #     NA.check.df<-subset(df, is.na(systemequipmentcode)|is.na(systemequipmentshorttext), select=c("systemequipmentcode","systemequipmentcodeText","systemequipmentshorttext"))
  #     if(nrow(NA.check.df)>0){
  #       print(unique(NA.check.df))
  #       stop(paste(nrow(NA.check.df),"rows of NAs generated in systemequipmentcodeText or systemequipmentshorttext"))
  #     }
  #   }
  #
  #
  if("EntitySizeCode" %in% names(df)){

    # EntitySizeText.detail = EntitySizeText,
    # EntitySizeText = fct_recode(
    #   EntitySizeText,
    #   Small = "Always Small Vendor",
    #   Small = "Sometimes Small Vendor",
    #   Medium = "Medium Vendor",
    #   "Large+" = "Big Five",
    #   "Large+" = "Large Vendor",
    #   "Large+" = "Large: Big 5 JV"))%>%

    df<-read_and_join_experiment(df,
                                          "EntitySizeCode.csv",
                                          by=c("EntitySizeCode"="EntitySizeCode"),
                                          add_var=c("EntitySizeText","EntitySmall","EntitySizeText.sum"),
                                          path=path,
                                          directory="vendor/")
    df<-replace_nas_with_unlabeled(df,"EntitySizeText",replacement="Unlabeled Vendor")
    df<-replace_nas_with_unlabeled(df,"EntitySizeText.sum",replacement="Unlabeled Vendor")
    df<-replace_nas_with_unlabeled(df,"EntitySmall",replacement="Unlabeled Vendor")
  }



  #Deflators
  if("Fiscal_Year"%in% names(df)){
    if("Action_Obligation"%in% names(df)){
      df<-deflate(df,
                  money_var = "Action_Obligation",
                  fy_var="Fiscal_Year",
                  deflator_var=deflator_var,
                  path=path
      )
    }
  }

  if("fundedbyforeignentity" %in% colnames(df) &
     !"foreign_funding_description" %in% colnames(df)){
    df$fundedbyforeignentity[df$fundedbyforeignentity==""]<-NA
    df<-   read_and_join_experiment(df,lookup_file="Budget_FundedByForeignEntity.csv",
                                     path="https://raw.githubusercontent.com/CSISdefense/Lookup-Tables/master/",directory="budget/",
                                     add_var = c("foreign_funding_description"),
                                     by=c("fundedbyforeignentity")
                                     # missing_file="missing_iso.csv",
                                     # skip_check_var = "territory_capital"
    )
  }

  if("foreign_funding_description" %in% colnames(df) &
     "IsFMSml" %in% colnames(df) &
     "IsFMSmac" %in% colnames(df)){
    df$IsFMS<-NA
    df$IsFMS[df$foreign_funding_description %in% c("Foreign Funds FMS")]<-1
    df$IsFMS[df$foreign_funding_description %in% c("Foreign Funds non-FMS", "Not Applicable")]<-0
    df$IsFMS[is.na(df$IsFMS)]<-df$IsFMSml[is.na(df$IsFMS)]
    df$IsFMS[is.na(df$IsFMS) & df$IsFMSmac==1]<-1
    df$IsFMS[is.na(df$IsFMS) & df$IsFMSmac==0]<-0
    if("mainaccountcode" %in% colnames(df) & "treasuryagencycode" %in% colnames(df)){
      df$IsUnlabeledMAC<-is.na(df$mainaccountcode) | is.na(df$treasuryagencycode)
      df$IsFMS[is.na(df$IsFMS) & df$IsUnlabeledMAC==0]<-0
    }
  }

  if("PlaceOfManufacture" %in% colnames(df)){
    #Place of manufacture
    df<-read_and_join_experiment(df,
                                          "Location_PlaceOfManufacture.csv",
                                          by="PlaceOfManufacture",
                                          add_var=c("PlaceOfManufactureText","PlaceOfManufacture_Sum"),
                                          skip_check_var = c("PlaceOfManufactureText","PlaceOfManufacture_Sum"),
                                          path=path,
                                          directory="location/",
                                          case_sensitive = FALSE
    )
  }

  if("VendorAddressCountry" %in% colnames(df) & !"VendorAddressISOalpha3" %in% colnames(df)){
    if("ISOalpha3" %in% colnames(df))
      df<-subset(df,select=-c(ISOalpha3))
    df$VendorAddressCountry[df$VendorAddressCountry==""]<-NA
    df<-read_and_join_experiment(df,lookup_file="Location_CountryName.csv",
                                 path=path,directory="location/",
                                 add_var = c("ISOalpha3"),
                                 by=c("VendorAddressCountry"="CountryName"),
                                 # skip_check_var=c("NATOyear",	"MajorNonNATOyear","NTIByear"	,"SEATOendYear","RioTreatyStartYear","RioTreatyEndYear","FiveEyes","OtherTreatyName"	,"OtherTreatyStartYear","OtherTreatyEndYear","isforeign"),
                                 missing_file="missing_VendorAddressCountry.csv",
                                 case_sensitive = FALSE)
    colnames(df)[colnames(df)=="ISOalpha3"]<-"VendorAddressISOalpha3"
  }



  if("PlaceISOalpha3" %in% colnames(df)){
    if("PlaceIsForeign" %in% colnames(df))
      df<-subset(df,select=-c(PlaceIsForeign))
    df<-read_and_join_experiment(df,lookup_file="Location_CountryCodes.csv",
                                 path=path,directory="location/",
                                 add_var = c("isforeign"),#"USAID region",
                                 by=c("PlaceISOalpha3"="alpha-3"),
                                 # skip_check_var=c("NATOyear",	"MajorNonNATOyear","NTIByear"	,"SEATOendYear","RioTreatyStartYear","RioTreatyEndYear","FiveEyes","OtherTreatyName"	,"OtherTreatyStartYear","OtherTreatyEndYear","isforeign"),
                                 missing_file="missing_DSCA_iso.csv")
    colnames(df)[colnames(df)=="isforeign"]<-"PlaceIsForeign"
  }

  if("VendorISOalpha3" %in% colnames(df)){
    if("VendorIsForeign" %in% colnames(df))
      df<-subset(df,select=-c(VendorIsForeign))
    df<-read_and_join_experiment(df,lookup_file="Location_CountryCodes.csv",
                                 path=path,directory="location/",
                                 add_var = c("isforeign"),#"USAID region",
                                 by=c("VendorISOalpha3"="alpha-3"),
                                 # skip_check_var=c("NATOyear",	"MajorNonNATOyear","NTIByear"	,"SEATOendYear","RioTreatyStartYear","RioTreatyEndYear","FiveEyes","OtherTreatyName"	,"OtherTreatyStartYear","OtherTreatyEndYear","isforeign"),
                                 missing_file="missing_DSCA_iso.csv")
    colnames(df)[colnames(df)=="isforeign"]<-"VendorIsForeign"
  }


  if("VendorAddressISOalpha3" %in% colnames(df)){
    if("VendorAddressIsForeign" %in% colnames(df))
      df<-subset(df,select=-c(VendorIsForeign))
    df<-read_and_join_experiment(df,lookup_file="Location_CountryCodes.csv",
                                 path=path,directory="location/",
                                 add_var = c("isforeign"),#"USAID region",
                                 by=c("VendorAddressISOalpha3"="alpha-3"),
                                 # skip_check_var=c("NATOyear",	"MajorNonNATOyear","NTIByear"	,"SEATOendYear","RioTreatyStartYear","RioTreatyEndYear","FiveEyes","OtherTreatyName"	,"OtherTreatyStartYear","OtherTreatyEndYear","isforeign"),
                                 missing_file="missing_iso.csv")
    colnames(df)[colnames(df)=="isforeign"]<-"VendorAddressIsForeign"
  }

  if ("Shiny.VendorSize" %in% colnames(df) & "VendorIsForeign" %in% colnames(df)){
    df$VendorSize_Intl<-factor(df$Shiny.VendorSize)
    levels(df$VendorSize_Intl)<-list(
      "Unlabeled"="Unlabeled",
      "International"="International",
      "U.S. Big Five"=c("Big Five","U.S. Big Five"),
      "U.S. Large"=c("Large","U.S. Large"),
      "U.S. Medium"=c("Medium","U.S. Medium"),
      "U.S. Small"=c("Small","U.S. Small")
    )
    df$VendorSize_Intl[df$VendorIsForeign==1]<-"International"
    df$VendorSize_Intl[is.na(df$VendorIsForeign)]<-"Unlabeled"

  }




  #### Duration ####
  if("CurrentDurationCategory" %in% colnames(df)){
    df$CurrentDurationIsYear<-factor(df$CurrentDurationCategory)
    levels(df$CurrentDurationIsYear)<-list(
      "<=1 year" =c("<=2 Months", ">2-7 Months"  ,">7-12 Months"),
      ">1 year"=c(">1-2 Years",   ">2-4 Years",  ">4 years")
    )
    if("PricingInflation" %in% colnames(df)){
      df$PricingInflation.1year<-as.character(df$PricingInflation)
      df$PricingInflation.1year[df$CurrentDurationIsYear=="<=1 year"]<-"<=1 Year (All Types)"
      if("PricingUCA.sum" %in% colnames(df)){
        df$PricingInflation.1yearUCA<-as.character(df$PricingInflation.1year)
        df$PricingInflation.1yearUCA[df$PricingUCA.sum=="UCA"]<-"UCA"
      }
    }

    if("PricingInflationUCA" %in% colnames(df)){
      df$PricingInflation.1yearUCA<-as.character(df$PricingInflationUCA)
      df$PricingInflation.1yearUCA[df$CurrentDurationIsYear=="<=1 year"]<-"<=1 Year (All Types)"
    }


    if("PricingUCA.sum" %in% colnames(df)){
      df$PricingUCA.1year<-as.character(df$PricingUCA.sum)
      df$PricingUCA.1year[df$CurrentDurationIsYear=="<=1 year"]<-"<=1 Year (All Types)"
    }
  }


  if("UnmodifiedUltimateDurationCategory" %in% colnames(df)){
    df$UnmodifiedUltimateDurationIsYear<-factor(df$UnmodifiedUltimateDurationCategory)
    levels(df$UnmodifiedUltimateDurationIsYear)<-list(
      "<=1 year" =c("<=2 Months", ">2-7 Months"  ,">7-12 Months"),
      ">1 year"=c(">1-2 Years",   ">2-4 Years",  ">4 years")
    )
  }

  if("recoveredmaterialclauses" %in% colnames(df)){
    df$recoveredmaterialclauses[df$recoveredmaterialclauses==""]<-"Unlabeled"
  }
  #
  #   if("Fiscal_Year"%in% names(df)){
  #     df<-read_and_join_experiment(df,
  #                           "Lookup_Deflators.csv",
  #                           by="Fiscal_Year",
  #                           new_var_checked=FALSE,
  #                           path=path,
  #                           directory="economic/")
  #     # NA.check.df<-subset(df,  is.na(Deflator.2014) & is.na(Deflator.2013) & !is.na(Fiscal_Year), select=c("Fiscal_Year","Deflator.2013","Deflator.2014"))
  #     # if(nrow(NA.check.df)>0){
  #     #   print(unique(NA.check.df))
  #     #   stop(paste(nrow(NA.check.df),"rows of NAs generated in value"))
  #     # }
  #
  #
  #     if("Action_Obligation"%in% names(df)){
  #       df$Action_Obligation<-text_to_number(df$Action_Obligation)
  #       if("Deflator.2013"%in% names(df)){
  #         df$Obligation.2013<-df$Action_Obligation/df$Deflator.2013/1000000000
  #       }
  #       if("Deflator.2014"%in% names(df)){
  #         df$Obligation.2014<-df$Action_Obligation/df$Deflator.2014/1000000000
  #       }
  #       if("Deflator.2015"%in% names(df)){
  #         df$Obligation.2015<-df$Action_Obligation/df$Deflator.2015/1000000000
  #       }
  #
  #       if("Deflator.2016"%in% names(df)){
  #         df$Obligation.2016<-df$Action_Obligation/df$Deflator.2016/1000000000
  #       }
  #     }
  #
  #
  #
  #     if("GBKdisbursements"%in% names(df)){
  #       df$GBKdisbursements<-text_to_number(df$GBKdisbursements)
  #       if("Deflator.2013"%in% names(df)){
  #         df$GBKdisbursements.ConstantB<-df$GBKdisbursements/df$Deflator.2013/1000000000
  #       }
  #     }
  #
  #     if("GBKobligations"%in% names(df)){
  #       df$SumOfObligations<-text_to_number(df$GBKobligations)
  #       if("Deflator.2013"%in% names(df)){
  #         df$GBKobligations.2013<-df$GBKobligations/df$Deflator.2013/1000000000
  #       }
  #     }
  #
  #     if("Outlay"%in% names(df)){
  #       df$Outlay<-text_to_number(df$Outlay)
  #       if("Deflator.2013"%in% names(df)){
  #         df$Outlay.2013<-df$Outlay/df$Deflator.2013/1000000000
  #       }
  #     }
  #
  #
  #     if("OutlayNoOffsetAccount"%in% names(df)){
  #       df$OutlayNoOffsetAccount<-text_to_number(df$OutlayNoOffsetAccount)
  #       if("Deflator.2013"%in% names(df)){
  #         df$OutlayNoOffsetAccount.2013<-df$OutlayNoOffsetAccount/df$Deflator.2013/1000000000
  #       }
  #     }
  #
  #     if("OutlayOffsetAccount"%in% names(df)){
  #       df$OutlayOffsetAccount<-text_to_number(df$OutlayOffsetAccount)
  #       if("Deflator.2013"%in% names(df)){
  #         df$OutlayOffsetAccount.2013<-df$OutlayOffsetAccount/df$Deflator.2013/1000000000
  #       }
  #     }
  #
  #
  #
  #     if("Fed_Grant_Funding_Amount"%in% names(df)){
  #       df$Fed_Grant_Funding_Amount	   <-text_to_number(df$Fed_Grant_Funding_Amount	   )
  #       if("Deflator.2013"%in% names(df)){
  #         df$Fed_Grant_Funding_Amount.2013<-df$Fed_Grant_Funding_Amount	   /df$Deflator.2013/1000000000
  #       }
  #     }
  #
  #
  #     if("ContractObligatedAmount"%in% names(df)){
  #       df$ContractObligatedAmount<-text_to_number(df$ContractObligatedAmount)
  #       if("Deflator.2013"%in% names(df)){
  #         df$ContractObligatedAmount.2013<-df$ContractObligatedAmount/df$Deflator.2013/1000000000
  #       }
  #     }
  #   }
  #   if("OutlayNoOffsetAccount.2013" %in% names(df)
  #      & "OutlayOffsetAccount.2013" %in% names(df)
  #      & "ContractObligatedAmount.2013" %in% names(df)
  #      #      & "Fed_Grant_Funding_Amount.2013" %in% names(df)
  #      & "Outlay.2013" %in% names(df)
  #   ){
  #
  #
  #     df$ResidualOutlay.2013<-if_else(is.na(df$OutlayNoOffsetAccount.2013)
  #                                        ,0
  #                                        ,df$OutlayNoOffsetAccount.2013
  #     )
  #
  #     df$ResidualOutlay.2013<-df$ResidualOutlay.2013-if_else(is.na(df$ContractObligatedAmount.2013)
  #                                                                   ,0
  #                                                                   ,df$ContractObligatedAmount.2013
  #     )
  #
  #
  #     Measurement.Vars.List=c("OutlayNoOffsetAccount.2013"
  #                             ,"OutlayOffsetAccount.2013"
  #                             ,"ContractObligatedAmount.2013"
  #                             #                             ,"Fed_Grant_Funding_Amount.2013"
  #                             ,"Outlay.2013"
  #                             ,"ResidualOutlay.2013"
  #     )
  #
  #     if("Fed_Grant_Funding_Amount.2013" %in% names(df))
  #     {
  #
  #       df$ResidualOutlay.2013<-df$ResidualOutlay.2013-if_else(is.na(df$Fed_Grant_Funding_Amount.2013)
  #                                                                     ,0
  #                                                                     ,df$Fed_Grant_Funding_Amount.2013
  #       )
  #
  #       Measurement.Vars.List<-rbind(Measurement.Vars.List,"Fed_Grant_Funding_Amount.2013")
  #
  #
  #     }
  #
  #
  #
  #
  #     df<-melt(df,
  #                  #                        id=c("Fiscal_Year"
  #                  #                                                  ,"SubFunder.Detail")
  #                  measure.vars=Measurement.Vars.List,
  #                  variable.name="comparison.dollar.type")
  #
  #
  #
  #     df<-read_and_join_experiment(
  #       df,
  #       "LOOKUP_comparison_dollar_type.csv"
  #     )
  #
  #   }
  #
  #   if("SimpleArea" %in% names(df))
  #   {
  #     df$SimpleArea[df$SimpleArea==""]<-NA
  #     df$SimpleArea<-factor(df$SimpleArea
  #                               ,exclude=NULL
  #                               ,levels = c("Products"
  #                                           ,"Services"
  #                                           ,"R&D"
  #                                           ,NA)
  #                               ,labels = c("Products"
  #                                           ,"Services"
  #                                           ,"R&D"
  #                                           ,"Mixed or Unlabeled")
  #     )
  #   }
  #
  #   if("IsTerminated" %in% names(df))
  #   {
  #     levels(df$IsTerminated)<-list("Unterminated"=c("Unterminated","0",0),
  #                                       "Terminated"=c("Terminated","1",1)
  #     )
  #   }
  #
  #   if("IsClosed" %in% names(df))
  #   {
  #     levels(df$IsClosed)<-list("Unspecified"=c("Unspecified","0",0),
  #                                   "Closed"=c("Closed","1",1)
  #     )
  #
  #     #     addNA(df$IsClosed, ifany = TRUE)
  #   }
  #   if("numberofoffersreceived" %in% names(df)){
  #     df$numberofoffersreceived[df$numberofoffersreceived==0]<-NA
  #   }
  #
  #   if("UnmodifiedNumberOfOffersReceived" %in% names(df))
  #   {
  #     df$UnmodifiedNumberOfOffersReceived<-text_to_number(df$UnmodifiedNumberOfOffersReceived)
  #     df$UnmodifiedNumberOfOffersReceived[df$UnmodifiedNumberOfOffersReceived==0]<-NA
  #     if("numberofoffersreceived" %in% names(df)){
  #       df$UnmodifiedNumberOfOffersReceived[is.na(df$UnmodifiedNumberOfOffersReceived)]<-df$numberofoffersreceived[is.na(df$UnmodifiedNumberOfOffersReceived)]
  #     }
  #
  #     df$UnmodifiedNumberOfOffersSummary[is.na(df$UnmodifiedNumberOfOffersReceived)]<-NA
  #     df$UnmodifiedNumberOfOffersSummary[df$UnmodifiedNumberOfOffersReceived==1]<-1
  #     df$UnmodifiedNumberOfOffersSummary[df$UnmodifiedNumberOfOffersReceived==2]<-2
  #     df$UnmodifiedNumberOfOffersSummary[df$UnmodifiedNumberOfOffersReceived==3 | df$UnmodifiedNumberOfOffersReceived==4]<-3.5
  #     df$UnmodifiedNumberOfOffersSummary[df$UnmodifiedNumberOfOffersReceived>=5]<-5
  #
  #
  #     df$UnmodifiedNumberOfOffersSummary<-factor(df$UnmodifiedNumberOfOffersSummary,
  #                                                    levels = c(NA,1,2,3.5,5),
  #                                                    labels = c("Unlabeled", "1\nOffer","2\noffers","3-4\noffers","5+\noffers"),
  #                                                    exclude=NULL
  #     )
  #
  #
  #   }
  #
  #
  #
  #   if("MaxOfisChangeOrder" %in% names(df))
  #   {
  #     levels(df$MaxOfisChangeOrder)<-list("No Change Order"=c("No Change Order","0",0),
  #                                             "Closed"=c("Change Order(s)","1",1)
  #     )
  #
  #     addNA(df$MaxOfisChangeOrder, ifany = TRUE)
  #   }
  #
  #   if("IsFixedPrice" %in% names(df))
  #   {
  #     levels(df$IsFixedPrice)<-list("Other"=c("Other","0",0),
  #                                       "Fixed Price"=c("Fixed Price","1",1),
  #                                       "Combination \nor Unlabeled"=c("Combination \nor Unlabeled",NA)
  #     )
  #
  #     # df$IsFixedPrice<-factor(df$IsFixedPrice,
  #     #                             exclude=NULL,
  #     #                             levels = c(1,0,NA),
  #     #                             labels = c("Fixed Price", "Other","Combination \nor Unlabeled")
  #     # )
  #     if (!("Combination \nor Unlabeled" %in% levels(df$IsFixedPrice))){
  #       df$IsFixedPrice<-addNA(df$IsFixedPrice,ifany=TRUE)
  #       levels(df$IsFixedPrice)[is.na(levels(df$IsFixedPrice))] <- "Combination \nor Unlabeled"
  #     }
  #   }
  #
  #
  #   if("Action_Obligation" %in% names(df)){
  #     df$LogOfAction_Obligation<-log10(df$Action_Obligation)
  #     df$LogOfAction_Obligation[is.infinite(df$LogOfAction_Obligation)]<-0
  #
  #     if("NewWorkObligatedAmount" %in% names(df)){
  #       df$pNewWorkVsContractObligatedAmount<-df$NewWorkObligatedAmount/df$Action_Obligation
  #       df$pNewWorkVsContractObligatedAmount[is.infinite(df$pNewWorkVsContractObligatedAmount)]<-NA
  #       df$pNewWorkVsContractObligatedAmount[abs(df$pNewWorkVsContractObligatedAmount)>100]<-NA
  #     }
  #     if("ChangeOrderObligatedAmount" %in% names(df)){
  #       df$pChangeOrderVsContractObligatedAmount<-df$ChangeOrderObligatedAmount/df$Action_Obligation
  #       df$pChangeOrderVsContractObligatedAmount[is.infinite(df$pChangeOrderVsContractObligatedAmount)]<-NA
  #       df$pChangeOrderVsContractObligatedAmount[abs(df$pChangeOrderVsContractObligatedAmount)>100]<-NA
  #     }
  #   }
  #   if("ContractBaseAndAllOptionsValue" %in% names(df)){
  #     df$ContractBaseAndAllOptionsValue<-text_to_number(df$ContractBaseAndAllOptionsValue)
  #     df$LogOfContractBaseAndAllOptionsValue<-log10(df$ContractBaseAndAllOptionsValue)
  #     df$LogOfContractBaseAndAllOptionsValue[is.infinite(df$LogOfContractBaseAndAllOptionsValue)]<-0
  #
  #     if("NewWorkBaseAndAllOptionsValue" %in% names(df)){
  #       df$pNewWorkVsContractBaseAndAllOptionsValue<-df$NewWorkBaseAndAllOptionsValue/df$ContractBaseAndAllOptionsValue
  #       df$pNewWorkVsContractBaseAndAllOptionsValue[is.infinite(df$pNewWorkVsContractBaseAndAllOptionsValue)]<-NA
  #       df$pNewWorkVsContractBaseAndAllOptionsValue[abs(df$pNewWorkVsContractBaseAndAllOptionsValue)>100]<-NA
  #     }
  #     if("ChangeOrderBaseAndAllOptionsValue" %in% names(df)){
  #       df$pChangeOrderVsContractBaseAndAllOptionsValue<-df$ChangeOrderBaseAndAllOptionsValue/df$ContractBaseAndAllOptionsValue
  #       df$pChangeOrderVsContractBaseAndAllOptionsValue[is.infinite(df$pChangeOrderVsContractBaseAndAllOptionsValue)]<-NA
  #       df$pChangeOrderVsContractBaseAndAllOptionsValue[abs(df$pChangeOrderVsContractBaseAndAllOptionsValue)>100]<-NA
  #     }
  #   }
  #   if("ContractBaseAndExercisedOptionsValue" %in% names(df)){
  #     df$ContractBaseAndExercisedOptionsValue<-text_to_number(df$ContractBaseAndExercisedOptionsValue)
  #     df$LogOfContractBaseAndExercisedOptionsValue<-log10(df$ContractBaseAndExercisedOptionsValue)
  #     df$LogOfContractBaseAndExercisedOptionsValue[is.infinite(df$LogOfContractBaseAndExercisedOptionsValue)]<-0
  #
  #     if("NewWorkBaseAndExercisedOptionsValue" %in% names(df)){
  #       df$pNewWorkVsContractBaseAndExercised<-df$NewWorkBaseAndExercisedOptionsValue/df$ContractBaseAndExercisedOptionsValue
  #       df$pNewWorkVsContractBaseAndExercised[is.infinite(df$pNewWorkVsContractBaseAndExercised)]<-NA
  #       df$pNewWorkVsContractBaseAndExercised[abs(df$pNewWorkVsContractBaseAndExercised)>100]<-NA
  #     }
  #     if("ChangeOrderBaseAndExercisedOptionsValue" %in% names(df)){
  #       df$pChangeOrderVsContractBaseAndExercised<-df$ChangeOrderBaseAndExercisedOptionsValue/df$ContractBaseAndExercisedOptionsValue
  #       df$pChangeOrderVsContractBaseAndExercised[is.infinite(df$pChangeOrderVsContractBaseAndExercised)]<-NA
  #       df$pChangeOrderVsContractBaseAndExercised[abs(df$pChangeOrderVsContractBaseAndExercised)>100]<-NA
  #     }
  #   }
  #   if("UnmodifiedContractObligatedAmount" %in% names(df)){
  #     df$UnmodifiedContractObligatedAmount<-text_to_number(df$UnmodifiedContractObligatedAmount)
  #     df$LogOfUnmodifiedContractObligatedAmount<-log10(df$UnmodifiedContractObligatedAmount)
  #     df$LogOfUnmodifiedContractObligatedAmount[is.infinite(df$LogOfUnmodifiedContractObligatedAmount)]<-0
  #     if("Action_Obligation" %in% names(df)){
  #       df$pUnmodifiedContractObligated<-df$UnmodifiedContractObligatedAmount/df$Action_Obligation
  #     }
  #     if("NewWorkObligatedAmount" %in% names(df)){
  #       df$pNewWorkVsUnmodifiedObligatedAmount<-df$NewWorkObligatedAmount/df$UnmodifiedContractObligatedAmount
  #       df$pNewWorkVsUnmodifiedObligatedAmount[is.infinite(df$pNewWorkVsUnmodifiedObligatedAmount)]<-NA
  #       df$pNewWorkVsUnmodifiedObligatedAmount[abs(df$pNewWorkVsUnmodifiedObligatedAmount)>100]<-NA
  #     }
  #     if("ChangeOrderObligatedAmount" %in% names(df)){
  #       df$pChangeOrderVsUnmodifiedObligatedAmount<-df$ChangeOrderObligatedAmount/df$UnmodifiedContractObligatedAmount
  #       df$pChangeOrderVsUnmodifiedObligatedAmount[is.infinite(df$pChangeOrderVsUnmodifiedObligatedAmount)]<-NA
  #       df$pChangeOrderVsUnmodifiedObligatedAmount[abs(df$pChangeOrderVsUnmodifiedObligatedAmount)>100]<-NA
  #     }
  #   }
  #   if("UnmodifiedContractBaseAndAllOptionsValue" %in% names(df)){
  #     df$UnmodifiedContractBaseAndAllOptionsValue<-text_to_number(df$UnmodifiedContractBaseAndAllOptionsValue)
  #     df$LogOfUnmodifiedContractBaseAndAllOptionsValue<-log10(df$UnmodifiedContractBaseAndAllOptionsValue)
  #     df$LogOfUnmodifiedContractBaseAndAllOptionsValue[is.infinite(df$LogOfUnmodifiedContractBaseAndAllOptionsValue)]<-0
  #
  #     df$SizeOfUnmodifiedContractBaseAndAll<-CreateSize(df$UnmodifiedContractBaseAndAllOptionsValue)
  #
  #     if("ContractBaseAndAllOptionsValue" %in% names(df)){
  #       df$pUnmodifiedContractBaseAndAll<-df$UnmodifiedContractBaseAndAllOptionsValue/df$ContractBaseAndAllOptionsValue
  #     }
  #     if("NewWorkBaseAndAllOptionsValue" %in% names(df)){
  #       df$pNewWorkVsUnmodifiedBaseAndAll<-df$NewWorkBaseAndAllOptionsValue/df$UnmodifiedContractBaseAndAllOptionsValue
  #       df$pNewWorkVsUnmodifiedBaseAndAll[is.infinite(df$pNewWorkVsUnmodifiedBaseAndAll)]<-NA
  #       df$pNewWorkVsUnmodifiedBaseAndAll[abs(df$pNewWorkVsUnmodifiedBaseAndAll)>100]<-NA
  #     }
  #     if("ChangeOrderBaseAndAllOptionsValue" %in% names(df)){
  #       df$pChangeOrderVsUnmodifiedBaseAndAll<-df$ChangeOrderBaseAndAllOptionsValue/df$UnmodifiedContractBaseAndAllOptionsValue
  #       df$pChangeOrderVsUnmodifiedBaseAndAll[is.infinite(df$pChangeOrderVsUnmodifiedBaseAndAll)]<-NA
  #       df$pChangeOrderVsUnmodifiedBaseAndAll[abs(df$pChangeOrderVsUnmodifiedBaseAndAll)>100]<-NA
  #     }
  #   }
  #   if("UnmodifiedContractBaseAndExercisedOptionsValue" %in% names(df)){
  #     df$UnmodifiedContractBaseAndExercisedOptionsValue<-text_to_number(df$UnmodifiedContractBaseAndExercisedOptionsValue)
  #     df$LogOfUnmodifiedContractBaseAndExercisedOptionsValue<-log10(df$UnmodifiedContractBaseAndExercisedOptionsValue)
  #     df$LogOfUnmodifiedContractBaseAndExercisedOptionsValue[is.infinite(df$LogOfUnmodifiedContractBaseAndExercisedOptionsValue)]<-0
  #     if("ContractBaseAndExercisedOptionsValue" %in% names(df)){
  #       df$pUnmodifiedContractBaseAndExercised<-df$UnmodifiedContractBaseAndExercisedOptionsValue/df$ContractBaseAndExercisedOptionsValue
  #     }
  #     if("NewWorkBaseAndExercisedOptionsValue" %in% names(df)){
  #       df$pNewWorkVsUnmodifiedBaseAndExercised<-df$NewWorkBaseAndExercisedOptionsValue/df$UnmodifiedContractBaseAndExercisedOptionsValue
  #       df$pNewWorkVsUnmodifiedBaseAndExercised[is.infinite(df$pNewWorkVsUnmodifiedBaseAndExercised)]<-NA
  #       df$pNewWorkVsUnmodifiedBaseAndExercised[abs(df$pNewWorkVsUnmodifiedBaseAndExercised)>100]<-NA
  #     }
  #     if("ChangeOrderBaseAndExercisedOptionsValue" %in% names(df)){
  #       df$pChangeOrderVsUnmodifiedBaseAndExercised<-df$ChangeOrderBaseAndExercisedOptionsValue/df$UnmodifiedContractBaseAndExercisedOptionsValue
  #       df$pChangeOrderVsUnmodifiedBaseAndExercised[is.infinite(df$pChangeOrderVsUnmodifiedBaseAndExercised)]<-NA
  #       df$pChangeOrderVsUnmodifiedBaseAndExercised[abs(df$pChangeOrderVsUnmodifiedBaseAndExercised)>100]<-NA
  #     }
  #   }
  #
  #   #
  #   # ChangeOrderObligatedAmount
  #   # ChangeOrderBaseAndExercisedOptionsValue
  #   # ChangeOrderBaseAndAllOptionsValue
  #   # NewWorkObligatedAmount
  #   # NewWorkBaseAndExercisedOptionsValue
  #   # NewWorkBaseAndAllOptionsValue
  #
  #
  #

  #
  #   if("Date.Signed"%in% names(df)){
  #
  #     if(max(nchar(as.character(df$Date.Signed)))==10){
  #       #         if((max(substring(as.character(df$Date.Signed),7,8))=="99" |
  #       #                max(substring(as.character(df$Date.Signed),7,8))<"20") &
  #       #                !max(substring(as.character(df$Date.Signed),1,2))>"12"){
  #       df$Date.Signed <-as.Date(as.character(df$Date.Signed),"%m/%d/%Y")
  #     }
  #     else{
  #       df$Date.Signed <-as.Date(as.character(df$Date.Signed),"%y/%m/%d")
  #     }
  #   }
  #
  #   if("SignedMonth"%in% names(df)){
  #     df$SignedMonth <-as.Date(as.character(df$SignedMonth),"%Y-%m-%d")
  #   }
  #
  #
  #   if("YEAR"%in% names(df)){
  #     df$YEAR <-as.Date(paste("12/31/",as.character(df$YEAR),sep=""),"%m/%d/%Y")
  #   }
  #
  #   if(!("Graph" %in% names(df))){
  #     df$Graph<-TRUE
  #   }
  #   if("ProductOrServiceArea.Graph"%in% names(df)){
  #     df$Graph<-df$Graph&&df$ProductOrServiceArea.Graph
  #     df<-subset(df, select=-c(ProductOrServiceArea.Graph))
  #   }
  #   if("Contract.Size.Graph"%in% names(df)){
  #     df$Graph<-df$Graph&&df$Contract.Size.Graph
  #     df<-subset(df, select=-c(Contract.Size.Graph))
  #   }
  #
  #   if("Competition.Graph"%in% names(df)){
  #     df$Graph<-df$Graph&&df$Competition.Graph
  #     df<-subset(df, select=-c(Competition.Graph))
  #   }
  #   if("Vehicle.Graph"%in% names(df)){
  #     df$Graph<-df$Graph&&df$Vehicle.Graph
  #     df<-subset(df, select=-c(Vehicle.Graph))
  #   }
  #   if("Pricing.Mechanism.Graph"%in% names(df)){
  #     df$Graph<-df$Graph&&df$Pricing.Mechanism.Graph
  #     df<-subset(df, select=-c(Pricing.Mechanism.Graph))
  #   }
  #   if("Customer.Graph"%in% names(df)){
  #     df$Graph<-df$Graph&&df$Customer.Graph
  #     df<-subset(df, select=-c(Customer.Graph))
  #   }
  #   if("LastCurrentCompletionDate"%in% names(df)&"MinOfEffectiveDate"%in% names(df)){
  #
  #     df$CurrentMonths<-as.numeric(difftime(strptime(df$LastCurrentCompletionDate,"%Y-%m-%d")
  #                                               , strptime(df$MinOfEffectiveDate,"%Y-%m-%d")
  #                                               , unit="weeks"
  #     ))
  #     df$CategoryOfCurrentMonths<-CreateDuration(df$CurrentMonths)
  #     df$CurrentMonths<-ceiling(df$CurrentMonths/4)
  #   }
  #
  #
  #   if("UnmodifiedCurrentCompletionDate"%in% names(df)&"MinOfEffectiveDate"%in% names(df)){
  #
  #     df$UnmodifiedMonths<-as.numeric(difftime(strptime(df$UnmodifiedCurrentCompletionDate,"%Y-%m-%d")
  #                                                  , strptime(df$MinOfEffectiveDate,"%Y-%m-%d")
  #                                                  , unit="weeks"
  #     ))
  #     df$CategoryOfUnmodifiedMonths<-CreateDuration(df$UnmodifiedMonths)
  #     df$UnmodifiedMonths<-ceiling(df$UnmodifiedMonths/4)
  #
  #   }


  #   if("UnmodifiedIsSomeCompetition" %in% names(df))
  #   {
  #     levels(df$UnmodifiedIsSomeCompetition)<-list("No Comp."=c("No Comp.","0",0),
  #                                                      "Comp."=c("Comp.","1",1),
  #                                                      "Unlabeled"=c("Unlabeled",NA)
  #     )
  #
  #     # df$UnmodifiedIsSomeCompetition<-factor(df$UnmodifiedIsSomeCompetition,
  #     #                                            exclude=NULL,
  #     #                                            levels = c(1,0,NA),
  #     #                                            labels = c("Comp.", "No Comp.","Unlabeled")
  #     # )
  #   }
  #
  #   if("IsSomeCompetition" %in% names(df))
  #   {
  #
  #
  #
  #     if ("IsFullAndOpen" %in% names(df)&
  #         "IsOnlyOneSource" %in% names(df)){
  #       df$UnmodifiedCompetition[df$IsFullAndOpen==1]<-1
  #       df$UnmodifiedCompetition[df$IsSomeCompetition==1
  #                                    &is.na(df$UnmodifiedCompetition)]<-2
  #       df$UnmodifiedCompetition[df$IsOnlyOneSource==1
  #                                    &is.na(df$UnmodifiedCompetition)]<-3
  #       df$UnmodifiedCompetition[df$IsOnlyOneSource==0
  #                                    &is.na(df$UnmodifiedCompetition)]<-4
  #       df$UnmodifiedCompetition<-factor(df$UnmodifiedCompetition
  #                                            ,exclude=NULL
  #                                            ,levels=c(1,2,3,4,NA)
  #                                            ,labels=c("Full and Open"
  #                                                      ,"Some Comp."
  #                                                      ,"No Comp.\n1 Source"
  #                                                      ,"No Comp.\nOther"
  #                                                      ,"Unlabeled"
  #                                            )
  #       )
  #
  #     }
  #     levels(df$IsSomeCompetition)<-list("No Comp."=c("No Comp.","0",0),
  #                                            "Comp."=c("Comp.","1",1),
  #                                            "Mixed or \nUnlabeled"=c("Mixed or \nUnlabeled",NA)
  #     )
  #
  #
  #     df$IsSomeCompetition<-factor(df$IsSomeCompetition,
  #                                      exclude=NULL,
  #                                      levels = c(1,0,NA),
  #                                      labels = c("Comp.", "No Comp.","Mixed or \nUnlabeled")
  #     )
  #
  #     if("IsFullAndOpen" %in% names(df))
  #     {
  #       df$IsFullAndOpen<-factor(df$IsFullAndOpen,
  #                                    exclude=NULL,
  #                                    levels = c(1,0,NA),
  #                                    labels = c("Full & Open", "Not Full \n& Open","Mixed or \nUnlabeled")
  #       )
  #     }
  #
  #
  #
  #     if("UnmodifiedIsFullAndOpen" %in% names(df))
  #     {
  #       df$UnmodifiedIsFullAndOpen<-factor(df$UnmodifiedIsFullAndOpen,
  #                                              exclude=NULL,
  #                                              levels = c(1,0,NA),
  #                                              labels = c("Full & Open", "Not Full \n& Open","Unlabeled")
  #       )
  #     }
  #     if("IsOnlyOneSource" %in% names(df))
  #     {
  #       df$IsOnlyOneSource<-factor(df$IsOnlyOneSource,
  #                                      exclude=NULL,
  #                                      levels = c(1,0,NA),
  #                                      labels = c("Only One Source", "Not Only Once Source","Unlabeled")
  #       )
  #     }
  #
  #
  #
  #
  #     if ("IsIDV" %in% names(df)&
  #         "multipleorsingleawardidc" %in% names(df)&
  #         "AwardOrIDVcontractactiontype" %in% names(df)
  #     ){
  #       df$UnmodifiedVehicle[is.na(df$IsIDV)]<-NA
  #       df$UnmodifiedVehicle[df$AwardOrIDVcontractactiontype %in% c("Definitive Contract")
  #                                &is.na(df$UnmodifiedVehicle)]<-1
  #       df$UnmodifiedVehicle[df$AwardOrIDVcontractactiontype %in% c("Purchase Order")
  #                                &is.na(df$UnmodifiedVehicle)]<-2
  #       df$UnmodifiedVehicle[df$AwardOrIDVcontractactiontype %in% c("Blanket Purchase Agreement"
  #                                                                           ,"Federal Supply Schedule"
  #                                                                           ,"Government Wide Acquisition Contract"
  #                                                                           ,"Basic Ordering Agreement")
  #                                &is.na(df$UnmodifiedVehicle)]<-5
  #
  #       df$UnmodifiedVehicle[df$multipleorsingleawardidc=="MULTIPLE AWARD"
  #                                &is.na(df$UnmodifiedVehicle)]<-4
  #       df$UnmodifiedVehicle[df$multipleorsingleawardidc=="SINGLE AWARD"
  #                                &is.na(df$UnmodifiedVehicle)]<-3
  #       df$UnmodifiedVehicle[is.na(df$UnmodifiedVehicle)]<-6
  #       df$UnmodifiedVehicle<-factor(df$UnmodifiedVehicle
  #                                        ,exclude=NULL
  #                                        ,levels=c(1
  #                                                  ,2
  #                                                  ,3
  #                                                  ,4
  #                                                  ,5
  #                                                  ,6
  #                                                  ,NA)
  #                                        ,labels=c("Definitive"
  #                                                  ,"Purchase\nOrder"
  #                                                  ,"Single-Award\nIDC"
  #                                                  ,"Multi-Award\nIDC"
  #                                                  ,"Other IDC"
  #                                                  ,"Unlabeled\nIDC"
  #                                                  ,"Unlabeled"
  #                                        )
  #       )
  #
  #     }
  #
  #   }

  standardize_variable_names(df)
}
CSISdefense/csis360 documentation built on April 25, 2024, 12:01 a.m.