priceR: Economics and Pricing Tools

Documented in extract_salary

#' Extract numeric salary from text data
#'
#' Extract numeric salary from text data. `extract_salary` automatically converts weekly and hourly rates to amounts per annum.
#' @name extract_salary
#'
#' @usage extract_salary(salary_text, exclude_below, exclude_above, salary_range_handling,
#' include_periodicity, hours_per_workday, days_per_workweek, working_weeks_per_year)
#'
#' @param salary_text A character string, or vector of character strings.
#' @param exclude_below A lower bound. Anything lower than this number will be replaced with NA.
#' @param exclude_above An upper bound. Anything above this number will be replaced with NA.
#' @param salary_range_handling A method of handling salary ranges. Defaults to returning an average of the range; can also be set to "max" or "min".
#' @param include_periodicity Set to TRUE to return an additional column stating the detected peridicity in the character string. Periodicity is assumed to be 'Annual' unless evidence is found to the contrary.
#' @param hours_per_workday Set assumed number of hours in the workday. Only affects annualisation of rates indentified as Daily. Default is 8 hours.
#' @param days_per_workweek Set assumed number of days per workweek. Only affects annualisation of rates indentified as Daily. Default is 5 days.
#' @param working_weeks_per_year Set assumed number of working weeks in the year. Only affects annualisation of rates indentified as Daily or Weekly. Default is 50 weeks.
#'
#' @return A data.frame of 1 column, or 2 columns if include_periodicity is set to TRUE
#'
#' @import dplyr stringr gsubfn
#'
#' @export
#'
#' @examples
#'
#' # Provide a salary string and 'extract_salary' and will extract the salary and return it
#' extract_salary("$160,000 per annum")
#' # 160000
#'
#'
#' # If a range is present, the average will be taken by default
#' extract_salary("$160,000 - $180000.00 per annum")
#' # 170000
#'
#'
#' # Take the 'min' or 'max' of a salary range by setting salary_range_handling parameter accordingly
#' extract_salary("$160,000 - $180000.00 per annum", salary_range_handling = "min")
#' # 160000
#'
#'
#' # Extract salaries from character string(s)
#' annual_salaries <- c("$160,000 - $180000.00 per annum",
#'                      "$160000.00 - $180000.00 per annum",
#'                      "$145000 - $155000.00 per annum",
#'                      "$70000.00 - $90000 per annum",
#'                      "$70000.00 - $90000.00 per annum plus 15.4% super",
#'                      "$80000.00 per annum plus 15.4% super",
#'                      "60,000 - 80,000",
#'                      "$78,686 to $89,463 pa, plus 15.4% superannuation",
#'                      "80k - 100k")
#'
#' extract_salary(annual_salaries)
#' # 170000 170000 150000  80000  53338  40008  70000  56055  90000
#' # Note the fifth, sixth, and eighth elements are averages including '15' (undesirable)
#' # Using exclude_below parameter avoids this (see below)
#'
#' # Automatically detect, extract, and annualise daily rates
#' daily_rates <- c("$200 daily", "$400 - $600 per day", "Day rate negotiable dependent on experience")
#' extract_salary(daily_rates)
#' # 48000 120000     NA
#'
#'
#' # Automatically detect, extract, and annualise hourly rates
#' hourly_rates <- c("$80 - $100+ per hour", "APS6/EL1 hourly rate contract")
#' extract_salary(hourly_rates)
#' # 172800   6720
#' # Note 6720 is undesirable. Setting the exclude_below and exclude_above sensibly avoids this
#'
#'
#' salaries <- c(annual_salaries, daily_rates, hourly_rates)
#'
#'
#' # Setting lower and upper bounds provides a catch-all to remove unrealistic results
#' # Out of bounds values will be converted to NA
#' extract_salary(salaries, exclude_below = 20000, exclude_above = 600000)
#' # 170000 170000 150000  80000  80000  80000  70000  84074  90000  48000 120000     NA 172800     NA
#'
#'
#' # extract_salary automatically annualises hourly and daily rates
#' # It does so by making assumptions about the number of working weeks in a year,
#' # days per workweek, and hours per workday
#' # And the assumed number of hours per workday can be changed from the default (8)
#' # The assumed number of workdays per workweek can be changed from the default (5)
#' # The assumed number of working weeks in year can be changed from the default (50)
#' # E.g.
#' extract_salary(salaries, hours_per_workday = 7, days_per_workweek = 4,
#'                working_weeks_per_year = 46, exclude_below = 20000)
#' # 170000 170000 150000  80000  53338  40008  70000  56055  90000  36800  92000     NA 115920     NA
#'
#'
#' # To see which salaries were detected as hourly or weekly, set include_periodicity to TRUE
#' extract_salary(salaries, include_periodicity = TRUE, exclude_below = 20000)
#'
#' # salary periodicity
#' # 1  170000      Annual
#' # 2  170000      Annual
#' # 3  150000      Annual
#' # 4   80000      Annual
#' # 5   80000      Annual
#' # 6   80000      Annual
#' # 7   70000      Annual
#' # 8   84074      Annual
#' # 9   90000      Annual
#' # 10  48000       Daily
#' # 11 120000       Daily
#' # 12     NA       Daily
#' # 13 172800      Hourly
#' # 14     NA      Hourly
#'
#'


extract_salary <- function(salary_text, exclude_below, exclude_above, salary_range_handling, include_periodicity, hours_per_workday, days_per_workweek, working_weeks_per_year) {


  ### These steps ensure any digits found via regular expressions are replaced with complete numbers
  # and not scientific notation! - very important.

  # Set existing option(s)
  op <- options("scipen")

  # Give scipen option a new value
  options(scipen = 100)

  # Run this to ensure that on existing the current scope (i.e. function), the option is set back to
  # its original value
  on.exit(options(op))





  if(missing(exclude_above)) { exclude_above <- 9999999999999 }
  if(missing(exclude_below)) { exclude_below <- 0 }
  if(missing(include_periodicity)) { include_periodicity <- FALSE }

  if(missing(hours_per_workday)) { hours_per_workday <- 8 }
  if(missing(days_per_workweek)) { days_per_workweek <- 5 }
  if(missing(working_weeks_per_year)) { working_weeks_per_year <- 50 }

  if(missing(salary_range_handling)) { salary_range_handling <- "average" }
  if(!salary_range_handling %in% c("average", "max", "min")) { stop("salary_range_handling parameter must be either \"average\", \"min\", or \"max\"") }

  # Clean
  salary <- salary_text %>% gsub(",", "", .) %>% str_replace_all(., "\\d+\\.\\d+", function(x) { as.integer(x) }) %>%
    gsub("(\\d+)k", "\\1000", .)  %>% gsub("(\\d+)K", "\\1000", .) # Sub out commas, round decimals, and convert k to 000


  # Identify hourly, daily or annual
  hourly_daily_or_annual <- function(salary) {

    hourly_terms <- c("ph", "p\\.h", "p\\.h\\.", "p/h", "hourly", "hour")
    daily_terms <- c("pd", "p\\.d", "p\\.d\\.", "p/d", "daily", "day")

    period <- salary %>% { ifelse((grepl(paste(daily_terms, collapse="|"), . , ignore.case=TRUE)), "Daily",
                                  ifelse(grepl(paste(hourly_terms, collapse="|"), . , ignore.case=TRUE), "Hourly",
                                         "Annual")) }
    temp <- data.frame(salary, period, stringsAsFactors = FALSE)
    return(temp)
  }
  temp <- hourly_daily_or_annual(salary)


  # For every number regex'd out, these will convert into annual
  hourly_to_annual <- function(hourly_pay) {
    hourly_pay %>% as.numeric %>% {. * hours_per_workday * days_per_workweek * working_weeks_per_year} %>%
      round %>% { ifelse(. > exclude_above, "", .) } %>% { ifelse(. < exclude_below, "", .) }
  }

  salary[temp$period == "Hourly"] <- salary[temp$period == "Hourly"] %>% gsubfn("(\\d+)", ~ { hourly_to_annual(x) }, . , backref = -1) %>% unlist

  daily_to_annual <- function(daily_pay) {
    daily_pay %>% as.numeric %>% {. * days_per_workweek * working_weeks_per_year} %>%
      round %>% { ifelse(. > exclude_above, "", .) } %>% { ifelse(. < exclude_below, "", .) }
  }

  salary[temp$period == "Daily"] <- salary[temp$period == "Daily"]  %>% gsubfn("(\\d+)", ~ { daily_to_annual(x) }, . , backref = -1) %>% unlist


  # Define 3 functions
  extract_average <- function(salary) {
    salary %>% str_extract_all(. , "\\d+") %>% lapply(., function(x) { x[as.numeric(x) >= exclude_below] } ) %>%
      lapply(., function(x) { x[as.numeric(x) <= exclude_above] } ) %>% lapply(., function(x) { mean(as.numeric(x)) } ) %>% unlist %>% {ifelse(. == -Inf, NA_character_, .) }
  }

  extract_max <- function(salary) {
    salary %>% str_extract_all(. , "\\d+") %>% lapply(., function(x) { x[as.numeric(x) >= exclude_below] } ) %>%
      lapply(., function(x) { x[as.numeric(x) <= exclude_above] } )%>% lapply(., function(x) { max(as.numeric(x)) } ) %>% unlist %>% {ifelse(. == -Inf, NA_character_, .) }
  }

  extract_min <- function(salary) {
    salary %>% str_extract_all(. , "\\d+") %>% lapply(., function(x) { x[as.numeric(x) >= exclude_below] } ) %>%
      lapply(., function(x) { x[as.numeric(x) <= exclude_above] } ) %>% lapply(., function(x) { min(as.numeric(x)) } ) %>% unlist %>% {ifelse(. == -Inf, NA_character_, .) }
  }


  switch(salary_range_handling,
         "average"=suppressWarnings({salary <- salary %>% extract_average %>% as.numeric}),
         "min"=suppressWarnings({salary <- salary %>% extract_min %>% as.numeric}),
         "max"=suppressWarnings({salary <- salary %>% extract_max %>% as.numeric})
  )

  salary <- salary %>% { ifelse(. > exclude_above, "", .) } %>% as.numeric(.) %>% { ifelse(. < exclude_below, "", .) } %>% as.numeric(.) %>% round

  annualised_salaries <- data.frame(salary, stringsAsFactors = FALSE)

  if(include_periodicity) {
    annualised_salaries <- data.frame(annualised_salaries, temp$period, stringsAsFactors = FALSE)
    colnames(annualised_salaries)[2] <- "periodicity"
  }


  return(annualised_salaries)
}