R/generateHHID.R
In surveysd: Survey Standard Error Estimation for Cumulated Estimates and their Differences in Complex Panel Designs

Documented in generate.HHID

#' Generate new houshold ID for survey data with rotating panel design taking
#' into account split households
#'
#' Generating a new houshold ID for survey data using a houshold ID and a
#' personal ID.
#' For surveys with rotating panel design containing housholds, houshold members
#' can move from an existing household to a new one, that was not originally in
#' the sample. This leads to the creation of so called split households. Using a
#' peronal ID (that stays fixed over the whole survey), an indicator for
#' different time steps and a houshold ID, a new houshold ID is assigned to the
#' original and the split household.
#'
#' @param dat data table of data frame containing the survey data
#' @param period column name of \code{dat} containing an indicator for the
#'   rotations, e.g years, quarters, months, ect...
#' @param pid column name of \code{dat} containing the personal identifier. This
#'   needs to be fixed for an indiviual throught the whole survey
#' @param hid column name of \code{dat} containing the household id. This needs
#'   to for a household throught the whole survey
#'
#' @return the survey data \code{dat} as data.table object containing a new and
#'   an old household ID. The new household ID which considers the split
#'   households is now named \code{hid} and the original household ID has a
#'   trailing "_orig".
#' @export generate.HHID
#'
#' @examples
#' \dontrun{
#' library(surveysd)
#' library(laeken)
#' library(data.table)
#'
#' eusilc <- surveysd:::demo.eusilc(n=4)
#'
#' # create spit households
#' eusilc[,rb030split:=rb030]
#' year <- eusilc[,unique(year)]
#' year <- year[-1]
#' leaf_out <- c()
#' for(y in year) {
#'   split.person <- eusilc[year==(y-1)&!duplicated(db030)&!db030%in%leaf_out,
#'                          sample(rb030,20)]
#'   overwrite.person <- eusilc[year==(y)&!duplicated(db030)&!db030%in%leaf_out,
#'                              .(rb030=sample(rb030,20))]
#'   overwrite.person[,c("rb030split","year_curr"):=.(split.person,y)]
#'
#'   eusilc[overwrite.person,
#'          rb030split:=i.rb030split,on=.(rb030,year>=year_curr)]
#'   leaf_out <- c(
#'     leaf_out,
#'     eusilc[rb030%in%c(overwrite.person$rb030,overwrite.person$rb030split),
#'     unique(db030)])
#' }
#'
#' # pid which are in split households
#' eusilc[,.(uniqueN(db030)),by=list(rb030split)][V1>1]
#'
#' eusilc.new <- generate.HHID(eusilc, period = "year", pid = "rb030split",
#'                             hid = "db030")
#'
#' # no longer any split households in the data
#' eusilc.new[,.(uniqueN(db030)),by=list(rb030split)][V1>1]
#' }
#'


generate.HHID <- function(dat, period = "RB010", pid = "RB030", hid = "DB030") {

  ID_new <- ID_orig <- ALL_NEW <- ID_new_help <- NULL


  ID_new <- ID_orig <- ALL_NEW <- na.omit
  # check input
  if (!is.data.frame(dat) & !is.data.table(dat)) {
    stop("dat must be a data frame or data table")
  }
  if (!is.data.table(dat)) {
    dat <- data.table(dat)
  }
  dat <- copy(dat)
  c.names <- colnames(dat)

  #
  if (!is.character(period)) {
    stop("period must be a string")
  } else {
    if (length(period) > 1) {
      stop("period must have length 1")
    }
    if (!period %in% c.names) {
      stop(period, " is not a column of dat")
    }
    if (!is.numeric(dat[[period]]) & !is.integer(dat[[period]])) {
      stop(period, " must be an integer or numeric vector")
    }
  }

  if (!is.character(pid)) {
    stop("pid must be a string")
  }else{
    if (length(pid) > 1) {
      stop("pid must have length 1")
    }
    if (!pid %in% c.names) {
      stop(pid, " is not a column of dat")
    }
  }
  if (!is.character(hid)) {
    stop("hid must be a string")
  } else {
    if (length(hid) > 1) {
      stop("hid must have length 1")
    }
    if (!hid %in% c.names) {
      stop(hid, " is not a column of dat")
    }
  }

  # create lookup table starting from first period
  ID_lookup <- dt.eval("dat[", period, "==min(", period, "),.(", pid,
                       ",ID_orig=", hid, ")]")
  ID_lookup[, ID_new := .GRP, by = ID_orig]
  ID_lookup[, ID_orig := NULL]

  periods <- sort(dt.eval("dat[,unique(", period, ")]"))

  for (i in periods[-1]) {

    ID_lookup <- merge(ID_lookup, dt.eval(
      "dat[", period, "==", i, ",.(", pid, ",ID_orig=", hid, ")]"),
      by = pid, all = TRUE)
    ID_lookup[!is.na(ID_orig), ALL_NEW := all(is.na(ID_new)), by = ID_orig]
    ID_lookup[ALL_NEW == FALSE, ID_new := na.omit(ID_new)[1], by = ID_orig]
    ID_next <- ID_lookup[, max(ID_new, na.rm = TRUE)]
    ID_lookup[ALL_NEW == TRUE, ID_new := .GRP + ID_next, by = ID_orig]
    ID_lookup[, c("ID_orig", "ALL_NEW") := NULL]
  }
  dat <- merge(dat, ID_lookup, by = pid)
  # if ID not unique by hid and year
  # leave original grouping for this year
  # this happens if household splits up and people move to already existing
  #   households
  group_broke <- dt.eval("dat[,length(unique(ID_new)),by=list(", period, ",",
                         hid, ")][V1>1,.(", period, ",", hid, ")]")
  if (nrow(group_broke) > 0) {
    setkeyv(dat, c(period, hid))
    dt.eval("dat[group_broke,ID_new_help:=paste0(head(", hid,
            ",1),'_1'),by=list(ID_new)]")
    dat[is.na(ID_new_help), ID_new_help := as.character(ID_new)]
    dat[, ID_new := .GRP, by = ID_new_help]
    dat[, ID_new_help := NULL]
  }

  setnames(dat, hid, paste0(hid, "_orig"))
  setnames(dat, "ID_new", hid)
  return(dat)
}

Any scripts or data that you put into this service are public.

surveysd documentation built on March 17, 2026, 5:06 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

surveysd
Survey Standard Error Estimation for Cumulated Estimates and their Differences in Complex Panel Designs

R/generateHHID.R
In surveysd: Survey Standard Error Estimation for Cumulated Estimates and their Differences in Complex Panel Designs

Defines functions generate.HHID

Documented in generate.HHID

Try the surveysd package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

surveysd Survey Standard Error Estimation for Cumulated Estimates and their Differences in Complex Panel Designs

R/generateHHID.R In surveysd: Survey Standard Error Estimation for Cumulated Estimates and their Differences in Complex Panel Designs

Defines functions generate.HHID

Documented in generate.HHID

Try the surveysd package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

surveysd
Survey Standard Error Estimation for Cumulated Estimates and their Differences in Complex Panel Designs

R/generateHHID.R
In surveysd: Survey Standard Error Estimation for Cumulated Estimates and their Differences in Complex Panel Designs