# R/table_creation.R

#############################################################################
# Erzeugung Aggregierte Tabellen für SOEP-Transfer Projekt
#############################################################################

### Settings that have to be defined before the script is run:
# Machine-specific paths, chosen by the login name of the current user
switch(Sys.info()[["user"]],
  szimmermann = {
    datapath <- "H:/data/"
    # (disabled alternative path: "C:/git/platform-datasets/metadaten_example/")
    metapath <- "H:/Clone/soep-transfer/metadata/"
    exportpath <- "H:/Clone/soep-transfer/"
  }
)

# Run parameters
weight <- "hhrf" # name of the weight column; must be defined
year <- "syear" # name of the survey-year column; must be defined
# Cell-size threshold. NOTE(review): the original note called this the
# "maximum allowed cell size" although the name says "min" - confirm which.
cell.min <- 30
dataset <- "h_statistics" # dataset the values are taken from
#############################################################################

## load packages
loadpackage(c("foreign", "dplyr", "tidyverse", "readstata13", "spatstat",
              "gsubfn", "rjson")) 

## Load the dataset.
## The same Stata file is read twice: once keeping the raw codes
## (convert.factors = FALSE) and once with value labels converted to factors.
dta.path <- paste0(datapath, dataset, ".dta")

data.file.num <- read.dta13(dta.path,
                            convert.factors = FALSE, encoding = "UTF-8")

# Weights of 0 cause problems when the means are computed, so only rows with a
# strictly positive weight are kept (filter() also drops rows whose weight is
# missing). `weight` holds the column NAME as a string, so the column has to be
# looked up via .data[[weight]]; a bare `weight > 0` would compare the string
# itself with 0 and never remove a row.
data.file.num <- data.file.num %>%
  filter(.data[[weight]] > 0)

data.file.fac <- read.dta13(dta.path,
                            convert.factors = TRUE,
                            nonint.factors = TRUE, encoding = "UTF-8")

# Same weight restriction for the factor version of the data
data.file.fac <- data.file.fac %>%
  filter(.data[[weight]] > 0)

# Variable metadata; every column is read as character
meta <- read.csv(paste0(metapath, "variables.csv"), header = TRUE,
                 colClasses = "character")

################################################################################
################################################################################
### Table generation:
# Restrict the metadata to the dataset being processed.
# %in% (instead of ==) never yields NA, so rows with a missing dataset entry
# are dropped rather than turned into all-NA rows.
meta <- meta[meta$dataset %in% dataset, , drop = FALSE]

### Prepare the demo variables
# Variables flagged with meantable == "demo" in the metadata
demo.vars <- meta$variable[meta$meantable %in% "demo"]

# Columns of those variables, taken from the numeric dataset
meta_demo <- data.file.num[, demo.vars, drop = FALSE]

# All differentiations that are tabulated: none (""), every single demo
# variable, and every pair of demo variables (names sorted alphabetically).
# The combination size is capped by the number of available variables so that
# zero or one demo variable does not make combn() fail.
demo.names <- sort(names(meta_demo))
max.diff <- min(2L, length(demo.names))
demo.combos <- lapply(seq_len(max.diff), function(k) {
  combn(demo.names, k, simplify = FALSE)
})
difflist <- c(list(""), unlist(demo.combos, recursive = FALSE))

# Number of differentiation variables per entry of difflist:
# 0 for the leading "" entry, otherwise the length of the combination.
diffcountlist <- as.list(c(0, lengths(difflist)[-1]))


# For every variable in the metadata that is flagged for a mean table and/or a
# proportion table, build one table per entry of difflist (no differentiation,
# single demo variables, pairs of demo variables), protect small cells and
# export the table together with a meta.json file.
for (var in seq_along(meta$variable)) {

  # isTRUE() treats missing flags as "no" instead of failing in if ()
  make.mean <- isTRUE(meta$meantable[var] == "yes")
  make.prop <- isTRUE(meta$probtable[var] == "yes")
  if (!make.mean && !make.prop) {
    next
  }

  variable <- meta$variable[var]
  varlabel <- meta$label_de[meta$variable == variable]

  for (i in seq_along(difflist)) {
    diffcount <- diffcountlist[[i]]
    diffvars <- difflist[[i]]

    # The helpers take up to three differentiation variables as separate
    # arguments; slots that are not used are passed as "".
    diffvar1 <- if (!is.na(diffvars[1])) diffvars[1] else ""
    diffvar2 <- if (!is.na(diffvars[2])) diffvars[2] else ""
    diffvar3 <- if (!is.na(diffvars[3])) diffvars[3] else ""

    if (make.mean) {
      data <- get_data(datasetnum = data.file.num,
                       datasetfac = data.file.fac,
                       variable = variable,
                       year = year,
                       weight = weight,
                       diffcount = diffcount,
                       diffvars = diffvars,
                       vallabel = FALSE)

      # NOTE(review): the literal "year" is passed here, not the `year`
      # setting; presumably get_data() returns the survey year in a column
      # named "year" - confirm against get_data().
      table.values <- get_mean_values(dataset = data,
                                      year = "year",
                                      diffcount = diffcount,
                                      diffvar1 = diffvar1,
                                      diffvar2 = diffvar2,
                                      diffvar3 = diffvar3)

      table.values <- create_table_lables(table = table.values)

      # Use the configured threshold instead of a hard-coded 30
      protected.table <- get_protected_values(dataset = table.values,
                                              cell.size = cell.min)

      protected.table <- expand_table(table = protected.table,
                                      diffvar1 = diffvar1,
                                      diffvar2 = diffvar2,
                                      diffvar3 = diffvar3,
                                      diffcount = diffcount,
                                      tabletype = "mean")

      data.csv <- get_table_export(table = protected.table,
                                   variable = variable,
                                   metadatapath = paste0(metapath, "variables.csv"),
                                   exportpath = exportpath,
                                   diffcount = diffcount,
                                   tabletype = "mean")

      # First and last distinct year of the exported table
      export.years <- unique(data.csv$year)
      json_create_lite(variable = variable,
                       varlabel = varlabel,
                       startyear = as.numeric(export.years[1]),
                       endyear = as.numeric(export.years[length(export.years)]),
                       tabletype = "mean",
                       exportpath = paste0(exportpath, "/numerical/", variable, "/meta.json"),
                       dataset = dataset)

      print(paste("Die Variable", variable, "wird verarbeitet mit Differenzierung",
                  paste(diffvars, collapse = ","), "als Mittelwert-Tabelle"))
    }

    if (make.prop) {
      data <- get_data(datasetnum = data.file.num,
                       datasetfac = data.file.fac,
                       variable = variable,
                       year = year,
                       weight = weight,
                       diffcount = diffcount,
                       diffvars = diffvars,
                       vallabel = TRUE)

      # Grouping columns: the differentiation variables are added unless this
      # is the undifferentiated entry (""). identical() gives a single
      # TRUE/FALSE even when diffvars holds two names, where `diffvars == ""`
      # would be a length-2 condition.
      if (identical(diffvars, "")) {
        columns <- c("usedvariable", "year")
      } else {
        columns <- c("usedvariable", "year", diffvars)
      }

      prop.data <- get_prop_values(dataset = data, groupvars = columns,
                                   alpha = 0.05)

      # Use the configured threshold instead of a hard-coded 30
      protected.table <- get_protected_values(dataset = prop.data,
                                              cell.size = cell.min)

      protected.table <- expand_table(table = protected.table,
                                      diffvar1 = diffvar1,
                                      diffvar2 = diffvar2,
                                      diffvar3 = diffvar3,
                                      diffcount = diffcount,
                                      tabletype = "prop")

      data.csv <- get_table_export(table = protected.table,
                                   variable = variable,
                                   metadatapath = paste0(metapath, "variables.csv"),
                                   exportpath = exportpath,
                                   diffcount = diffcount,
                                   tabletype = "prop")

      # First and last distinct year of the exported table
      export.years <- unique(data.csv$year)
      json_create_lite(variable = variable,
                       varlabel = varlabel,
                       startyear = as.numeric(export.years[1]),
                       endyear = as.numeric(export.years[length(export.years)]),
                       tabletype = "prop",
                       exportpath = paste0(exportpath, "/categorical/", variable, "/meta.json"),
                       dataset = dataset)

      print(paste("Die Variable", variable, "wird verarbeitet mit Differenzierung",
                  paste(diffvars, collapse = ","), "als Prozentwert-Tabelle"))
    }
  }
}
# ddionrails/transfer-pipeline documentation built on April 14, 2022, 3:24 p.m.