R/UnitToIDDDNames.R

#' @title Convert production unit names into their corresponding statistical variable names (IDDD)
#'
#' @description \code{UnitToIDDDNames} returns a \linkS4class{data.table} with the statistical
#' variable name (IDDD + Qualifiers) corresponding to the production unit variable name specified as
#' input argument.
#'
#' @param UnitNames \code{Character} vector with the production unit variable name.
#'
#' @param Correspondence Object with the IDDD variable name.
#'
#' @return Returns a \code{character} vector with all the corresponding IDDD variable names. For
#' objects the classes \link{DD} and \link{StQ} it returns the IDDD in the slot
#' VarNameCorresp of the corresponding DD object.
#'
#' @details IDDD and qualifiers compose together the so-called IDDDname of the variable by pasting
#' the IDDD identifier and each consecutive qualifier with an underscore _.
#'
#' @examples
#' # An example for VNC and DD objects:
#' library(data.table)
#' ### We build the VNC object
#' VarList <- list(ID = data.table(IDQual = c('NumIdEst', rep('', 4)),
#'                                             NonIDQual = c(rep('',5)),
#'                                             IDDD = c('', 'Name', 'Surname', 'PostalAddr',
#'                                                      'PhoneNo'),
#'                                             NumIdEst = c('', rep('.', 4)),
#'                                             UnitName = c('numidest', 'nombre', 'apellidos',
#'                                                       'direccion', 'telefono'),
#'                                             InFiles = rep('FF', 5)),
#'                 MicroData = data.table(
#'                                         IDQual = c('NumIdEst', rep('', 2)),
#'                                         NonIDQual = c('', 'Market', ''),
#'                                         IDDD = c(rep('', 2), 'NewOrders'),
#'                                         NumIdEst = c(rep('', 2), '.'),
#'                                         Market = c(rep('', 2), '2.'),
#'                                         UnitName = c('numidest', '', 'cp09'),
#'                                         InFiles = rep('FF', 3)),
#'                 ParaData = data.table(
#'                                         IDQual = c('NumIdEst', rep('', 2)),
#'                                         NonIDQual = c('', 'Action', ''),
#'                                         IDDD = c(rep('', 2), 'Date'),
#'                                         NumIdEst = c(rep('', 2), '.'),
#'                                         Action = c(rep('', 2), 'Imputation'),
#'                                         UnitName = c('numidest', '', 'FechaImput'),
#'                                         InFiles = rep('FP', 3)),
#'                 Aggregates = data.table(
#'                                          IDQual = c('Province', 'NACE09', '', ''),
#'                                          NonIDQual = c(rep('', 2), 'Market', ''),
#'                                          IDDD = c('', '', '', 'Turnover'),
#'                                          Province = c('', '', '', '.'),
#'                                          NACE09 = c('', '', '', '.'),
#'                                          Market = c('', '', '', '3.'),
#'                                          UnitName = c('provincia', 'actividad', '', 'cn01'),
#'                                          InFiles = rep('FA', 4)))
#'
#' VNC <- BuildVNC(VarList)
#'
#' ### We build the specification data.tables
#' IDdt <- data.table(
#'                      Variable = c('NumIdEst', 'Name', 'Surname', 'PostalAddr', 'PhoneNo'),
#'                      Sort = c('IDQual', rep('IDDD', 4)),
#'                      Class = rep('character', 5),
#'                      Length = c('11', '25', '25', '50', '9'),
#'                      Qual1 = c('', rep('NumIdEst', 4)),
#'                      ValueRegExp = c('[0-9]{9}PP', '.+', '.+', '.+', '(6|9)[0-9]{8}'))
#' Microdt <- data.table(
#'                      Variable = c('NumIdEst', 'Market', 'NewOrders'),
#'                      Sort = c('IDQual', 'NonIDQual', 'IDDD'),
#'                      Class = c(rep('character', 2), 'numeric'),
#'                      Length = c('11', '2', '7'),
#'                      Qual1 = c(rep('', 2), 'NumIdEst'),
#'                      Qual2 = c(rep('', 2), 'Market'),
#'                      ValueRegExp = c('[0-9]{9}PP', '(0|1| )', '([0-9]{1, 10}| )'))
#' Paradt <- data.table(
#'                      Variable = c('NumIdEst', 'Action', 'Date'),
#'                      Sort = c('IDQual', 'NonIDQual', 'IDDD'),
#'                      Class = rep('character', 3),
#'                      Length = c('11', '4', '10'),
#'                      Qual1 = c(rep('', 2), 'NumIdEst'),
#'                      Qual2 = c(rep('', 2), 'Action'),
#'                      ValueRegExp = c('[0-9]{9}PP', 'Collection|Editing|Imputation',
#'                      '(([0-9]{2}-(0[1-9]|1(0-2))-[0-9]{4})| )'))
#' Aggdt <- data.table(
#'                      Variable = c('Province', 'NACE09', 'Turnover'),
#'                      Sort = c(rep('IDQual', 2), 'IDDD'),
#'                      Class = c(rep('character', 2), 'numeric'),
#'                      Length = c('25', '4', '12'),
#'                      Qual1 = c(rep('', 2), 'Province'),
#'                      Qual2 = c(rep('', 2), 'NACE09'),
#'                      ValueRegExp = c('[0-9]{4}', '([0-4][0-9])|(5[0-2])', '([0-9]{1, 15}| )'))
#'
#' DD <- DD(VNC = VNC,
#'           ID = IDdt,
#'           MicroData = Microdt,
#'           ParaData = Paradt,
#'           Aggregates = Aggdt)
#'
#' UnitToIDDDNames(DD, UnitNames = c('cn01', 'cp09'))
#'
#' # An example for StQ objects:
#' data(ExampleStQ)
#' UnitToIDDDNames(ExampleStQ, UnitNames = c('cnae09','C11','C121','C122' , 'EXISTENCIAS', 'B1'))
#'
#'
#' @include getIDQual.R VNC.R DD.R BuildVNC.R BuildDD.R StQ.R getDD.R
#'
#' @export
setGeneric("UnitToIDDDNames", function(UnitNames, Correspondence){standardGeneric("UnitToIDDDNames")})

#' @rdname UnitToIDDDNames
#'
#' @include DD.R VarNamesToDD.R getVNC.R BuildDD.R BuildVNC.R VNC.R
#'
#' @import data.table
#'
#' @export
setMethod(
    f = "UnitToIDDDNames",
    signature = c("character", "DD"),
    function(UnitNames, Correspondence){
        
        UnitName <- IDDDName <- IDDD <- NonIDQual <- Variable <- Unit <- Suffixes <- NULL
        
        IDQualsGlobal <- getIDQual(Correspondence)
        DD <- Correspondence
        Correspondence <- getVNC(DD)
        
        prefix <- gsub("_.+", '', UnitNames)
        all_U <- getUnitName(Correspondence)
        allWithSuffix <- all_U[grep("_\\[.+\\]", all_U)]
        allPrefix <- gsub("_\\[.+\\]", '', allWithSuffix)
        names(allWithSuffix) <- allPrefix
        whichChange <- UnitNames[prefix %in% allPrefix]
        
        if(length(whichChange) > 0){
            prefixChange <- prefix[prefix %in% allPrefix]
            names(whichChange) <- prefixChange
            oriUnitNames <- UnitNames
            suffixChanged <- allWithSuffix[allPrefix %in% prefix]
            suffixChanged <- suffixChanged[names(whichChange)]
            UnitNames[which(prefix %in% allPrefix)] <- suffixChanged
            infoChange <- data.table(Unit = suffixChanged, oriUnit = whichChange)
        }
        
        output.list <- lapply(names(Correspondence), function(nameVNC){
            
            VNC <- Correspondence[[nameVNC]]
            nameVNC <- ExtractNames(nameVNC)
            XLS <- VNC[UnitName %in% UnitNames]
            XLS[, IDDDName := IDDD]
            XLS.Quals <- XLS[IDDD == '']
            XLS.Quals[IDQual != '', IDDDName := IDQual]
            XLS.Quals[NonIDQual != '', IDDDName := NonIDQual]
            XLS.Quals <- XLS.Quals[, c('IDQual', 'NonIDQual', 'UnitName', 'IDDDName', 'InFiles'),
                                   with = F]
            IDQual <- XLS.Quals[IDQual != '']
            IDQual <- IDQual[IDQual != '']
            DotQual <- getDotQual(Correspondence)
            
            XLS <- XLS[IDDD != '']
            XLS <- XLS[, setdiff(names(XLS), IDQual), with = F]
            XLS.list <- split(XLS, XLS[['IDDD']])
            
            XLS.list <- lapply(XLS.list, function(xls){
                
                #ColNames <- setdiff(names(xls), IDQualsGlobal)
                #NotEmptyCols <- c()
                #for (col in ColNames){
                
                #    if (!all(is.na(xls[[col]]) | xls[[col]] == '')) NotEmptyCols <- c(NotEmptyCols, col)
                #
                #}
                
                #xls <- xls[, NotEmptyCols, with = F]
                
                #ColsNotUnit <- setdiff(names(xls), c('IDDD', 'UnitName', 'IDDDName', 'InFiles'))
                #ColsNotUnit <- intersect(names(VNC), ColsNotUnit)
                auxDT <- DD[[nameVNC]][Variable == unique(xls[['IDDDName']])]
                ColsNotUnit <- t(as.matrix(auxDT[, names(auxDT)[grep('Qual', names(auxDT))], with = FALSE]))[,1]
                ColsNotUnit <- setdiff(ColsNotUnit, c(IDQual, DotQual))
                ColsNotUnit <- ColsNotUnit[ColsNotUnit != '']
                
                for (col in ColsNotUnit) {
                    
                    #if (all(xls[[col]] == '.') | all(is.na(xls[[col]]))) next
                    if (any(xls[[col]] == '.')) next
                    xls[, IDDDName := paste(IDDDName, get(col), sep = '_')]
                    
                }
                return(xls)
            })
            
            output <- rbindlist(XLS.list, fill = TRUE)
            output <- rbindlist(list(output, XLS.Quals), fill = TRUE)
            
            aux <- output[, c('UnitName', 'IDDDName'), with = FALSE]
            
            # Patterns in UnitNames : [mm], [aa], [aaaa], [n], etc.
            UnitNames_aux <- unique(aux[['UnitName']])
            patrones <- UnitNames_aux[grep('[[]', UnitNames_aux)]
            UnitToIDDDNames.local <- function(UnitNamesLocal){
                
                outputNewName <- UnitNamesLocal[!UnitNamesLocal %in% output[['UnitName']]]
                
                
                if (length(outputNewName) > 0 & length(patrones) > 0){
                    
                    metaVar <- lapply(patrones, function(patron){
                        
                        patron_aux <- patron
                        patron <- gsub('\\[mm\\]', '(([0][1-9])|([1][0-2]))', patron)
                        patron <- gsub('\\[aa\\]', '[0-9]{2}', patron)
                        patron <- gsub('\\[aaaa\\]', '[0-9]{4}', patron)
                        patron <- gsub('\\[n\\]', '[0-9]+', patron)
                        patron <- gsub('\\[varGestion\\]', '.*', patron)
                        patron <- gsub('\\[ccaa\\]', '[0-9]{2}', patron)
                        patron <- gsub('\\[IDEdit\\]', '[.]+', patron)
                        Var <- lapply(outputNewName, function(name){
                            out <- regexpr(patron, name)
                            out <- regmatches(name, out)
                            names(out) <- rep(aux[UnitName %in% patron_aux][['IDDDName']], length(out))
                            return(out)
                        })
                        
                        return(Var)
                    })
                    
                    metaVar <- unlist(metaVar)
                    #outputNew <- setdiff(outputNewName, metaVar)
                    if (length(metaVar) > 0) {
                        
                        outputMetaVar <- data.table(UnitName = metaVar, IDDDName = names(metaVar))
                        
                    } else {
                        
                        outputMetaVar <- data.table()
                        
                    }
                } else {
                    
                    outputMetaVar <- data.table()
                    #outputNew <- outputNewName
                }
                
                #outputNew <- data.table(UnitName = outputNew, IDDDName = outputNew)
                output <- output[which(output[['UnitName']] %in% UnitNamesLocal),
                                 c('UnitName','IDDDName'), with = F]
                output <- rbindlist(list(output, outputMetaVar))
                out <- output[['IDDDName']]
                names(out) <- output[['UnitName']]
                out <- out[UnitNamesLocal]
                return(out)
            }
            VNCNames <- unique(VNC[['UnitName']])
            UnitNamesLocal <- intersect(UnitNames, VNCNames)
            UnitNamesLocalNewName <- setdiff(UnitNames, UnitNamesLocal)
            UnitNamesLocal <- c(UnitNamesLocal, UnitNamesLocalNewName)
            namesLocal <- UnitToIDDDNames.local(UnitNamesLocal)
            outDT <- data.table(Unit = names(namesLocal), IDDD = namesLocal)
            outDT <- outDT[Unit %in% UnitNames]
            return(outDT)
        })
        
        outDT <- rbindlist(output.list)
        if(length(whichChange) > 0){
            
            addOutDT <- merge(infoChange, outDT, by = "Unit", all.x = TRUE)
            addOutDT <- addOutDT[, Unit := NULL]
            setnames(addOutDT, "oriUnit", "Unit")
            outDT <- rbind(outDT[!addOutDT, on = "IDDD"], addOutDT)
            UnitNames <- oriUnitNames
            
        }
        
        setkeyv(outDT, names(outDT))
        outDT <- outDT[!duplicated(outDT, by = key(outDT))]
        if(nrow(outDT) > 0 ){
            Suf <- sub("^[^_]*_", "", outDT[, Unit])
            outDT[, Suffixes := Suf]
        }
        # outDT[, Suffixes := gsub('([A-Za-z]+_)((\\[.*)|(.*))','\\2', Unit)]
        
        outDT.list <- split(outDT, outDT[['Suffixes']])
        outDT.list <- lapply(outDT.list, function(DT){
            
            DT[, IDDD := gsub('..', unique(Suffixes), IDDD, fixed = TRUE)]
            DT[, Suffixes := NULL]
            return(DT)
        })
        outDT <- rbindlist(outDT.list)
        outVector <- outDT[['IDDD']]
        names(outVector) <- outDT[['Unit']]
        outVector <- outVector[UnitNames]
        return(outVector)
    }
    
)

#' @rdname UnitToIDDDNames
#'
#' @import data.table
#'
#' @export
setMethod(
    f = "UnitToIDDDNames",
    signature = c("character", "StQ"),
    function(UnitNames, Correspondence){
        
        
        DD <- getDD(Correspondence)
        
        output <- UnitToIDDDNames(UnitNames, DD)
        
        return(output)
        
    }
)
david-salgado/StQ documentation built on Aug. 12, 2021, 3:23 p.m.