R/helpers.R
In cbsREPS: Hedonic and Multilateral Index Methods for Real Estate Price Statistics

Documented in calculate_geometric_average calculate_hedonic_imputation calculate_hedonic_imputationmatrix calculate_hmts_index calculate_index validate_input

### This is the first internal function

#' Calculate imputation averages with the 1st period as base period
#'
#' Prices are estimated based on a provided Hedonic model
#' The model values are calculated for each period in the data
#' With these values, new prices of base period observations are estimated.
#' With this function, imputations according to the Laspeyres and Paasche method can be estimated.
#'
#' @author Farley Ishaak
#' @param dataset_temp table with data 
#' @param period_temp 'period'
#' @param dependent_variable_temp usually the sale price
#' @param independent_variables_temp vector with quality determining variables
#' @param period_list_temp list with all available periods
#' @return
#' Table with imputation averages per period
#' @keywords internal


calculate_hedonic_imputation <- function(dataset_temp = dataset
                                         , period_temp = "period_var_temp"
                                         , dependent_variable_temp = dependent_variable
                                         , independent_variables_temp = independent_variables
                                         , number_of_observations_temp = number_of_observations
                                         , period_list_temp = period_list) {
  
  # Count number of periods
  number_of_periods <- length(period_list_temp)
  
  # Select required variables
  dataset_temp <- dataset_temp |>
    dplyr::select(all_of(c(period_temp, dependent_variable_temp,
                           independent_variables_temp)))
  
  # Remove lines without values
  dataset_temp[dataset_temp == ''] <- NA
  dataset_temp <- stats::na.omit(dataset_temp)
  
  # Remove unused levels. R remembers the original state of the levels, but if a level is not present in a certain period, this may result in an error in the bootstrap.
  dataset_temp <- droplevels(dataset_temp)
  
  
  dependent_variable_temp <- paste0("log(", dependent_variable_temp, ")")
  
  
  ## Model
  
  # Construct the regression formula from independent variables.
  for (indep_var in 1: length(independent_variables_temp)) {
    if (indep_var == 1) {
      model <- paste0(dependent_variable_temp, "~", independent_variables_temp[indep_var])
    } else {
      model <- paste0(model, "+", independent_variables_temp[indep_var])
    }
  }
  
  ## Calculate imputations per period
  
  # Empty vector for the values and numbers
  average_imputations <- c()
  number_observations_total <- c()
  
  for (current_period in 1:number_of_periods) {
    
    # Estimate coefficients of the 1st period
    if (current_period == 1) {
      rekenbestand <- subset(dataset_temp, period_var_temp == period_list_temp[1])
      fitmdl <- stats::lm(model, rekenbestand)
      predictmdl_0 <- mean(stats::predict(fitmdl, rekenbestand))
      

      predictmdl_0 <- exp(predictmdl_0)

      if (number_of_observations_temp == TRUE) {
        number <- nrow(rekenbestand)
      }
    } else {
      # Estimate coefficients of all periods after
      rekenbestand_t <- subset(dataset_temp, period_var_temp == period_list_temp[current_period])
      fitmdl <- stats::lm(model, rekenbestand_t)
      
      # If parameter number_of_observations = TRUE, then calculate numbers
      if (number_of_observations_temp == TRUE) {
        number <- nrow(rekenbestand_t)
      }
    }
    
    # Recoding of values, where the categorical variable has a level that is not estimated in the reference period
    rekenbestand_0 <- rekenbestand
    for (var in names(fitmdl$xlevels)) {
      missend_in_model <- levels(rekenbestand_0[[var]])[!(levels(rekenbestand_0[[var]]) %in% fitmdl$xlevels[[var]])]
      
      # Replace level of the variable by the first level (default). The variable is in fact not taken into account by this step in the calculation.
      sel <- rekenbestand_0[[var]] %in% missend_in_model
      rekenbestand_0[[var]][sel] <- fitmdl$xlevels[[var]][1]
    }
    predictmdl_t <- mean(predict(fitmdl, rekenbestand_0))
    
    predictmdl_t <- exp(predictmdl_t)
    
    average_imputations[current_period] <- predictmdl_t
    
    # If parameter number_of_observations = TRUE, then add numbers to table
    if (number_of_observations_temp == TRUE) {
      number_observations_total[current_period] <- number
    }
  }
  
  # Create table
  tbl_average_imputation <- data.frame(period = period_list_temp)
  
  # If parameter number_of_observations = TRUE, then add numbers to table
  if (number_of_observations_temp == TRUE) {
    tbl_average_imputation$number_of_observations <- number_observations_total
  }
  
  # Add imputations to table
  tbl_average_imputation$average_imputation <- average_imputations
  
  # Result
  return(tbl_average_imputation)
  
}

### This is the second internal function


#' Transform series into index
#'
#' The index can be calculated in two ways:
#' - from a series of values
#' - from a series of mutations (from_growth_rate = TRUE)
#'
#' N.B. with from_growth_rate:
#' The series of mutations must be equally long to the series of values.
#' The vector should, therefore, also contain a mutation for the first period (this is likely 1).
#' In the calculation, this first mutation is not used.
#'
#' N.B. for the reference period:
#' The first value is on default set to 100.
#' An adjusted reference period can be provided in the paramater.
#' The reference period can also be a part of a period.
#' E.g. if the series contains months (2019jan, 2019feb), the reference period can be a year (2019).
#'
#' @author Farley Ishaak
#' @param periods vector/variable with periods (numeric/string)
#' @param values vector/variable with to be transformed values (numeric)
#' @param reference_period period or group of periods that will be set to 100 (numeric/string)
#' @return Index series
#' @keywords internal

calculate_index <- function(periods
                            , values
                            , reference_period = NULL) {
  
  # Check length periods and values
  if (length(periods) != length(values)) {
    stop("The periods variable is not of the same length as the values variable.")
  }
  
  # Check numeric values
  if (is.numeric(values) == FALSE) {
    stop("The values variable is not (fully) numeric.")
  }
  
  # Transforms periods to characters
  periods <- as.character(periods)
  
  # If reference_period is not provided, then reference_period = 1st period from list
  if (is.null(reference_period) == TRUE) {
    reference_period <- periods[1]
    periods_short <- periods
  } else {
    # Determine length reference_period
    length_reference_period <- nchar(reference_period)
    periods_short <- substr(periods, 1, length_reference_period)
  }
  
  # Check reference_period
  if (!(reference_period %in% periods_short)) {
    stop("The provided reference period is not part of the series with periods")
  }
  
  
  # Create table
  tbl_index <- data.frame(period = periods_short, value = values)
  
  average <- tbl_index |>
    dplyr::filter(period == reference_period) |>
    dplyr::summarise(value = mean(value)) |>
    dplyr::pull(value)
  
  # Calculate index
  tbl_index$Index <- tbl_index$value / average * 100
  
  # Result = index series
  return(index = tbl_index$Index)
  
}

### This is the third internal function
#' Validate Input Data for Hedonic Index Calculation
#'
#' This function checks whether the dataset contains all required variables, whether the dependent and continuous variables are numeric, 
#' and whether the period variable is formatted correctly (e.g., "2020Q1", "2020M01").
#' It ensures that the data is suitable for further processing in hedonic index calculations.
#'
#' @param dataset A data.frame containing the dataset to be validated.
#' @param period_variable A string specifying the name of the period variable column.
#' @param dependent_variable A string specifying the name of the dependent variable (usually the sale price).
#' @param continuous_variables A character vector with names of numeric quality-determining variables.
#' @param categorical_variables A character vector with names of categorical variables (including dummies).
#'
#' @return Returns TRUE invisibly if all checks pass. Otherwise, an error is thrown.
#'
#' @author David Pietersz
#' @keywords internal
#' 
#' @importFrom assertthat assert_that has_name
#' @importFrom stringr str_detect

validate_input <- function(dataset, period_variable, dependent_variable, continuous_variables, categorical_variables) {
  
  # Dataset contains all necessary columns
  assertthat::assert_that(assertthat::has_name(dataset, c(period_variable, dependent_variable, continuous_variables, categorical_variables)))
  
  # Dependent and continuous variables only contain numeric values
  numeric_cols <- c(dependent_variable, continuous_variables)
  for (col in numeric_cols) {
    assertthat::assert_that(is.numeric(dataset[[col]]), msg = paste("Column", col, "is not (fully) numeric."))
  }
  
  
  # If log transformation still needs to be performed, dependent variable should contain strictly positive values.
  assertthat::assert_that(all(dataset[[dependent_variable]] > 0), msg = "The dependent variable contains negative values while log transformation needs to be performed.")
 
  
  regex_period <- "^[0-9]{4}([Mm](0?[1-9]|1[0-2])|[Qq](0?[1-4]))$"
  
  assertthat::assert_that(all(stringr::str_detect(dataset[[period_variable]], regex_period)), msg = "The period variable should be in the correct format, for example: 2020Q1, 2020q1, 2020M1, 2020M01, 2020m01, 2020Q4, 2020q04.")
  
  
}

## HTMS helper 1

#' Calculate HMTS index only (Hedonic Multilateral Time series re-estimation Splicing)
#'
#' Based on a hedonic model, an index is calculated in below steps. See also Ishaak, Ouwehand, Remoy & De Haan (2023).
#' 1: for each period, average imputed prices are calculated with the first period as base period.
#' 2: step 1 is repeated for every possible base period. This result in as many series of imputed values as the number of periods.
#' 3: All series with imputed prices are re-estimated with a Kalman filter (also time series model/state space model)
#'    This step can be turned off with a parameter.
#' 4: The series of imputed values are transformed into index series.
#' 5: a specified (parameter) window is chosen of index figures that continues in the calculation.
#'    This step can be turned off with a parameter.
#' 6: Of the remaining index figures, the geometric average per period is calculated.
#'    The remaining index figures form the final index.
#'
#' Parameter 'production_since':
#' To simulate a series, where 1 period a time expires (as in production), a manual choice in the past is possible.
#' Until this period, all periods are imputed. After that, 1 period is added.
#'
#' Parameter 'resting_points':
#' If TRUE, the output is a list of tables. These tables can be called with a $ after the output.
#' $Index table with periods, index and number of observations
#' $Window table with the index figures within the chosen window
#' $Chosen_index_series table with index series before the window splice
#' $Matrix_HMTS_index table with index series based on re-estimated imputations (time series model)
#' $Matrix_HMTS table with re-estimated imputations (time series model)
#' $Matrix_HMTS_index table with index series based on estimated imputations (hedonic model)
#' $Matrix_HMTS table with estimated imputations (time series model)l
#' $Matrix_HMTS_analyse table with diagnostic values of the time series model per base period
#'
#' @author Farley Ishaak
#' @param period_variable variable in the dataset with the period
#' @param dependent_variable usually the sale price
#' @param continuous_variables vector with quality-determining continues variables (numeric, no dummies)
#' @param categorical_variables vector with categorical variables (also dummy)
#' @param reference_period period or group of periods that will be set to 100 (numeric/string)
#' @param number_of_observations number of observations per period (default = TRUE)
#' @param periods_in_year if month, then 12. If quarter, then 4, etc. (default = 4)
#' @param production_since 1 period in the format of the period_variable. See description above (default = NULL)
#' @param number_preliminary_periods number of periods that the index is preliminary. Only works if production_since <> NULL. default = 3
#' @param resting_points should analyses values be returned? (default = FALSE)
#' @return
#' $Matrix_HMTS_index table with index series based on estimations with time series re-estimations
#' $Matrix_HMTS table with estimated values based on time series re-estimations
#' $Matrix_HMS_index table with index series based on estimations with the hedonic model
#' $Matrix_HMS table with estimated values based on the hedonic model
#' $Matrix_HMTS_analysis table with analysis values of the time series model per base period
#' @keywords internal
#' @return table with periods, index and number of observations. If resting_points = TRUE, then list with tables. See general description and examples.


calculate_hmts_index <- function(
    dataset,
    period_variable,
    dependent_variable,
    continuous_variables,
    categorical_variables,
    reference_period,
    periods_in_year,
    production_since = NULL,
    number_preliminary_periods,
    number_of_observations = NULL,
    resting_points) {
  
  period_list <- sort(unique(dataset$period))
  number_of_periods <- length(period_list)
  
  if (is.null(number_preliminary_periods) == TRUE) {
    number_preliminary_periods <- number_of_periods
  }
  
  imputations_complete <- calculate_hedonic_imputationmatrix(dataset = dataset
                                                             , period_variable = "period"
                                                             , dependent_variable = dependent_variable
                                                             , continuous_variables = continuous_variables
                                                             , categorical_variables = categorical_variables
                                                             , periods_in_year = periods_in_year
                                                             , number_of_observations = number_of_observations
                                                             , production_since = production_since
                                                             , number_preliminary_periods = number_preliminary_periods)
  
  matrix_hmts <- as.data.frame(imputations_complete$matrix_hmts)
  matrix_hmts_index <- as.data.frame(imputations_complete$matrix_hmts_index)
  matrix_hmts_analysis <- as.data.frame(imputations_complete$matrix_hmts_analysis)
  imputations <- matrix_hmts_index
  
  start_window <- 2
  
  if (number_preliminary_periods != number_of_periods) {
    
    for (current_period in 1:number_of_periods) {
      
      if (current_period == 1) {
        window <- imputations[current_period, c(current_period:(number_preliminary_periods + start_window))]
        window$period <- NULL
        end_window <- start_window
        start_window_update <- start_window + 1
      }
      
      if (current_period > 1 && current_period <= number_preliminary_periods + 1) {
        # end_window <- end_window + 1
        end_window <- number_preliminary_periods + start_window
        window <- dplyr::bind_rows(window, as.data.frame(imputations[current_period, c(start_window:end_window)]))
      }
      
      if (current_period > number_preliminary_periods + 1) {
        end_window <- end_window + 1
        if (number_preliminary_periods == 0) {
          window_plus_1 <- as.data.frame(imputations[current_period, c((start_window_update - 1):end_window)])
          window_plus_1[, 1] <- NA
          window <- dplyr::bind_rows(window, window_plus_1)
        } else {
          window <- dplyr::bind_rows(window, as.data.frame(imputations[current_period, c(start_window_update:end_window)]))
        }
        start_window_update <- start_window_update + 1
      }
      
    }
    
  }
  
  if (number_preliminary_periods == number_of_periods) {
    window <- imputations[, -1]
  }
  
  window_transposed <- as.data.frame(t(window))
  geometric_averages <- c()
  
  for (current_period in 1:number_of_periods) {
    
    geometric_average <- calculate_geometric_average(na.omit(window_transposed[, current_period]))
    
    geometric_averages[current_period] <- geometric_average
    
  }
  
  window$period <- period_list
  window <- window[, c(number_of_periods + 1, 1:number_of_periods)]
  matrix_hmts$geom_avg <- matrix_hmts_index$geom_avg <- window$geom_avg <- geometric_averages
  imputations$Index <- matrix_hmts$index <- matrix_hmts_index$index <- window$index <- calculate_index(period_list, geometric_averages, reference_period = reference_period)
  
  if (number_of_observations == TRUE) {
    results <- dplyr::select(imputations, dplyr::all_of(c("period", "number_of_observations", "Index")))
  } else {
    results <- dplyr::select(imputations, dplyr::all_of(c("period", "Index")))
  }
  
  if (resting_points == TRUE) {
    results <- list(Index = results
                    , window = window
                    , chosen_index_series = imputations
                    , matrix_hmts_index = matrix_hmts_index
                    , matrix_hmts = matrix_hmts
                    , matrix_hmts_analysis = matrix_hmts_analysis)
  }
  
  return(results)
}


## HMTS helper 2

#' Calculate the geometric average of a series of values
#'
#' The equation for the calculation is:: exp(mean(log(series_values)))
#'
#' @author Farley Ishaak 
#' @param values series with numeric values
#' @return geometric average
#' @keywords Internal

calculate_geometric_average <- function(values){
  
  # Remove NA values
  values <- values[!is.na(values)]
  
  assertthat::assert_that(is.numeric(values), msg = "Values is not (fully) numeric")
  
  return(exp(mean(log(values))))
  
}

## HMTS helper 3

#' Calculate a matrix with hedonic imputation averages, re-estimated time series imputation averages and  corresponding index series.
#'
#' Based on a hedonic model, a series of imputed values is calculated in below steps:
#' 1: for every period average imputed prices are estimated with the 1st period as base period.
#' 2: the above is repeated for each possible base period. This result in an equal number of series as the number of periods.
#' 3: All series are re-estimated with a time series model (state space).
#'    This step is optionally skipped with a parameter (state_space_model = NULL)
#' 4: the series imputed values are transformed into index series.
#' This matrix can be used for an index calculations according to the HMTS method.
#'
#' Parameter 'production_since':
#' To simulate a series, where 1 period a time expires (as in production), a manual choice in the past is possible.
#' Until this period, all periods are imputed. After that, 1 period is added.
#'
#' @author Farley Ishaak
#' @param dataset table with data (does not need to be a selection of relevant variables)
#' @param period_variable variable in the dataset with the period
#' @param dependent_variable usually the sale price
#' @param continuous_variables vector with quality-determining continues variables (numeric, no dummies)
#' @param categorical_variables vector with categorical variables (also dummy)
#' @param number_of_observations number of observations per period (default = TRUE)
#' @param periods_in_year if month, then 12. If quarter, then 4, etc. (default = 4)
#' @param production_since 1 period in the format of the period_variable. See description above (default = NULL)
#' @param number_preliminary_periods number of periods that the index is preliminary. Only works if production_since <> NULL. default = 3
#' @return
#' $Matrix_HMTS_index table with index series based on estimations with time series re-estimations
#' $Matrix_HMTS table with estimated values based on time series re-estimations
#' $Matrix_HMS_index table with index series based on estimations with the hedonic model
#' $Matrix_HMS table with estimated values based on the hedonic model
#' $Matrix_HMTS_analysis table with analysis values of the time series model per base period
#' @keywords internal

calculate_hedonic_imputationmatrix <- function(dataset
                                               , period_variable
                                               , dependent_variable
                                               , continuous_variables
                                               , categorical_variables
                                               , periods_in_year
                                               , number_of_observations = TRUE
                                               , production_since = NULL
                                               , number_preliminary_periods) {
  
  independent_variables <- c(continuous_variables, categorical_variables)
  period_list <- sort(unique(as.character(dataset$period)))
  number_periods <- length(period_list)
  
  if (!is.null(production_since)) {
    production_since_index <- match(production_since, period_list)
    if (is.na(production_since_index)) {
      stop("The provided production_since-period is not part of the periods in the data. Check the notation.")
    }
  } else {
    production_since_index <- number_preliminary_periods <- number_periods
  }
  
  dataset_temp <- dataset[, (names(dataset) %in% c("period", dependent_variable, independent_variables))]
  
  dataset_temp[dataset_temp == ""] <- NA
  dataset_temp <- stats::na.omit(dataset_temp)
  
  dataset_temp <- droplevels(dataset_temp)
  
  
  dependent_variable <- paste0("log(", dependent_variable, ")")
  
  
  for (indep_var in 1:length(independent_variables)) {
    if (indep_var == 1) {
      model <- paste0(dependent_variable, "~", independent_variables[indep_var])
    } else {
      model <- paste0(model, "+", independent_variables[indep_var])
    }
  }
  
  number_observations_total <- c()
  
  matrix_hmts <- matrix_hmts_index <- data.frame(period=period_list)
  
  for (current_period in 1:number_periods) {
    
    if (current_period <= production_since_index - number_preliminary_periods) {
      number_periods_production_since <- production_since_index
    }
    if (current_period > production_since_index - number_preliminary_periods) {
      number_periods_production_since <- current_period + number_preliminary_periods
    }
    if (number_periods_production_since > number_periods) {
      number_periods_production_since <- number_periods
    }
    
    difference_length_series <- number_periods - number_periods_production_since
    
    dataset_base <- subset(dataset_temp, dataset_temp[[period_variable]] == period_list[current_period])
    
    if (number_of_observations == TRUE) {
      number_observations_total[current_period] <- nrow(dataset_base)
    }
    
    hms <- c()
    hmts <- c()
    hmts_index <- c()
    hmts_analysis <- c()
    
    for (reporting_period in 1:number_periods_production_since) {
      dataset_dynamic <- subset(dataset_temp, dataset_temp[[period_variable]] == period_list[reporting_period])
      fitmdl <- stats::lm(model, dataset_dynamic)
      
      for (var in names(fitmdl$xlevels)) {
        missend_in_model <- levels(dataset_base[[var]])[!(levels(dataset_base[[var]]) %in% fitmdl$xlevels[[var]])]
        sel <- dataset_base[[var]] %in% missend_in_model
        dataset_base[[var]][sel] <- fitmdl$xlevels[[var]][1]
      }
      
      predictmdl <- mean(stats::predict(fitmdl, dataset_base))
      predictmdl <- exp(predictmdl)
      hms[reporting_period] <- predictmdl
      
    }
    
    
    hmts_temp <- calculate_trend_line_kfas(original_series = hms, periodicity = periods_in_year, resting_points = TRUE)
    hmts <- hmts_temp$trend_line
    hmts_analysis <- hmts_temp$resting_points
    hmts_index <- calculate_index(periods = c(1:number_periods_production_since), values = hmts)
    
    
    hmts <- c(hmts, rep(NA, difference_length_series))
    hmts_index <- c(hmts_index, rep(NA, difference_length_series))
    
    matrix_hmts[paste0("Base_", period_list[current_period])] <- hmts
    matrix_hmts_index[paste0("Base_", period_list[current_period])] <- hmts_index
    
    if (current_period == 1) {
      matrix_hmts_analysis <- data.frame(reeks = hmts_analysis$reeks) 
    }
    matrix_hmts_analysis[paste0("Base_", period_list[current_period])] <- hmts_analysis$value
  }
  
  # Add numbers to calculation
  if (number_of_observations == TRUE) {
    matrix_hmts["number_of_observations"] <- matrix_hmts_index["number_of_observations"] <- number_observations_total
  }
  
  matrices <- list(matrix_hmts = matrix_hmts
                   , matrix_hmts_index = matrix_hmts_index
                   , matrix_hmts_analysis = matrix_hmts_analysis)
  
  return(matrices)
  
}
Any scripts or data that you put into this service are public.
cbsREPS documentation built on June 8, 2025, 9:38 p.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
cbsREPS
Hedonic and Multilateral Index Methods for Real Estate Price Statistics

R/helpers.R
In cbsREPS: Hedonic and Multilateral Index Methods for Real Estate Price Statistics

Defines functions calculate_hedonic_imputationmatrix calculate_geometric_average calculate_hmts_index validate_input calculate_index calculate_hedonic_imputation

Documented in calculate_geometric_average calculate_hedonic_imputation calculate_hedonic_imputationmatrix calculate_hmts_index calculate_index validate_input

Try the cbsREPS package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

cbsREPS Hedonic and Multilateral Index Methods for Real Estate Price Statistics

R/helpers.R In cbsREPS: Hedonic and Multilateral Index Methods for Real Estate Price Statistics

Defines functions calculate_hedonic_imputationmatrix calculate_geometric_average calculate_hmts_index validate_input calculate_index calculate_hedonic_imputation

Documented in calculate_geometric_average calculate_hedonic_imputation calculate_hedonic_imputationmatrix calculate_hmts_index calculate_index validate_input

Try the cbsREPS package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

cbsREPS
Hedonic and Multilateral Index Methods for Real Estate Price Statistics

R/helpers.R
In cbsREPS: Hedonic and Multilateral Index Methods for Real Estate Price Statistics