R/forEasySdcTable.R

Defines functions CrossLevels GroupNrList UniqueNrList FindCommonCells CheckLevels FindDimLists DimFromHier1 DimFromHier FindTableGroup SortRows SortNrList functionRecursive HierarchicalGroups Nlevels FactorLevCorr

Documented in FactorLevCorr FindCommonCells FindDimLists FindTableGroup HierarchicalGroups SortRows

#' Factor level correlation
#'
#' A sort of correlation matrix useful to detect (hierarchical) relationships between the levels of factor variables.
#'
#'
#' @encoding UTF8
#'
#' @param x Input matrix or data frame containing the variables
#'
#' @return Output is a sort of correlation matrix.
#'
#'         Here we refer to ni as the number of present levels of variable i (the number of unique elements) and we refer to nij as the number
#'         of present levels obtained by crossing variable i and variable j (the number unique rows of x[,c(i,j)]).
#'
#'         The diagonal elements of the output matrix contains the number of present levels of each variable (=ni).
#'
#'         The absolute values of off-diagonal elements:
#'         \item{0}{when nij = ni*nj}
#'         \item{1}{when nij = max(ni,nj)}
#'         \item{Other values}{Computed as (ni*nj-nij)/(ni*nj-max(ni,nj))}
#'
#'         So 0 means that all possible level combinations exist in the data and 1 means that the two variables are
#'         hierarchically related.
#'
#'         The sign of off-diagonal elements:
#'         \item{positive}{when ni<nj}
#'         \item{negative}{when ni>nj}
#'
#'         In cases where ni=nj elements will be positive above the diagonal and negative below.
#'
#'
#' @export
#'
#' @examples
#'  x <- rep(c("A","B","C"),3)
#'  y <- rep(c(11,22,11),3)
#'  z <- c(1,1,1,2,2,2,3,3,3)
#'  zy <- paste(z,y,sep="")
#'  m <- cbind(x,y,z,zy)
#'  FactorLevCorr(m)
#'
FactorLevCorr <- function(x) {
  if (NROW(x) <= 1) 
    stop(paste(NROW(x), "row in data."))
  x <- unique(x, MARGIN = 1)
  n <- NCOL(x)
  nLevels <- rep(NaN, n)
  for (i in matlabColon(1, n)) nLevels[i] <- Nlevels(x[, i])
  z <- diag(nLevels, nrow = length(nLevels))  # nrow input to allow length one as input
  for (i in matlabColon(1, n)) for (j in matlabColon(i + 1, n)) {
    ni <- nLevels[i]
    nj <- nLevels[j]
    nij <- Nlevels(x[, c(i, j)])
    multij <- ni * nj
    maxij <- max(ni, nj)
    if (ni <= nj) 
      one <- 1 else one <- -1
    if (nij == maxij) 
      z[i, j] <- one else z[i, j] <- one * (multij - nij)/(multij - maxij)
    z[j, i] <- -z[i, j]
  }
  colnames(z) <- colnames(x)
  rownames(z) <- colnames(x)
  z
}


Nlevels = function(x){
  NROW(unique(x,MARGIN=1))
}



#' Finding hierarchical variable groups
#'
#' According to the (factor) levels of the variables
#'
#' @encoding UTF8
#'
#' @param x Matrix or data frame containing the variables
#' @param mainName When TRUE output list is named according to first variable in group.
#' @param eachName When TRUE variable names in output instead of indices.
#' @param fCorr When non-null x is not needed as input.
#'
#' @return Output is a list containing the groups. First variable has most levels.
#'
#' @export
#'
#' @examples
#'  x <- rep(c("A","B","C"),3)
#'  y <- rep(c(11,22,11),3)
#'  z <- c(1,1,1,2,2,2,3,3,3)
#'  zy <- paste(z,y,sep="")
#'  m <- cbind(x,y,z,zy)
#'  HierarchicalGroups(m)
HierarchicalGroups <- function(x = NULL, mainName = TRUE, eachName = FALSE, fCorr = FactorLevCorr(x)) {
  nLevels <- diag(fCorr)
  if (min(nLevels) <= 1) 
    stop("Number of levels < 2 in a variable")
  ix <- order(nLevels, decreasing = TRUE)
  # print(fCorr[ix,ix])
  if (length(fCorr) > 1) 
    z <- functionRecursive(fCorr[ix, ix], 1:NCOL(fCorr))$l else z <- list(1)
  z1 <- rep(NA, length(z))
  for (i in 1:length(z)) {
    z[[i]] <- ix[z[[i]]]
    z1[i] <- z[[i]][1]
    if (mainName) 
      names(z)[i] <- colnames(fCorr)[z1[i]]
  }
  z <- SortNrList(z)
  for (i in 1:length(z)) {
    if (length(unique(nLevels[z[[i]]])) != length(z[[i]])) 
      warning("There are identical variables")
  }
  if (eachName) {
    for (i in 1:length(z)) z[[i]] <- colnames(fCorr)[z[[i]]]
  }
  z
}


# Recursive algorithm ...
functionRecursive <- function(fCorr, ind) {
  drop <- numeric(0)
  x <- vector("list", 0)
  for (i in ind) if (!(i %in% drop)) {
    z <- functionRecursive(fCorr, (1:NCOL(fCorr))[fCorr[i, ] == -1])
    drop <- c(drop, i, z$drop)
    l <- z$l
    for (k in matlabColon(1, length(l))) l[[k]] <- c(i, l[[k]])
    if (!length(l)) 
      l <- list(i)
    x <- c(x, l)
  }
  list(drop = drop, l = x)
}

# Special sorting function
SortNrList <- function(x, index.return = FALSE) {
  m <- matrix(0, length(x), max(sapply(x, length)))
  for (i in seq_len(length(x))) m[i, seq_len(length(x[[i]]))] <- x[[i]]
  ix <- SortRows(m, index.return = TRUE)
  if (index.return) 
    return(ix)
  x[ix]
}

#' Sorting rows
#'
#' @param m 
#' @param cols 
#' @param index.return 
#'
#' @return sorted m
#' @export
#' @keywords internal
#'
#' @examples
#' SortRows(matrix(sample(1:3,15,TRUE),5,3))
SortRows <- function(m, cols = 1:dim(m)[2], index.return = FALSE) {
  ix <- eval(parse(text = paste("order(", paste("m[,", cols, "]", sep = "", collapse = ","), 
                                ")")))
  if (index.return) 
    return(ix)
  m[ix, , drop = FALSE]
}


#' Finding table(s) of hierarchical variable groups
#'
#' A single table or two linked tables are found
#'
#' @encoding UTF8
#'
#' @param x Matrix or data frame containing the variables
#' @param findLinked When TRUE, two linked tables can be in output
#' @param mainName When TRUE the groupVarInd ouput is named according to first variable in group.
#' @param fCorr When non-null x is not needed as input.
#' @param CheckHandling Function (warning or stop) to be used in problematic situations.
#'
#' @return Output is a list with items
#'  \item{groupVarInd}{List defining the hierarchical variable groups. First variable has most levels.}
#'  \item{table}{List containing one or two tables. These tables are coded as indices referring to elements of groupVarInd.}
#'
#' @export
#'
#' @examples
#'  x <- rep(c('A','B','C'),3)
#'  y <- rep(c(11,22,11),3)
#'  z <- c(1,1,1,2,2,2,3,3,3)
#'  zy <- paste(z,y,sep='')
#'  m <- cbind(x,y,z,zy)
#'  FindTableGroup(m)
#'  FindTableGroup(m,findLinked=TRUE)
FindTableGroup <- function(x = NULL, findLinked = FALSE, mainName = TRUE, fCorr = FactorLevCorr(x), 
                           CheckHandling = warning) {
  hier <- HierarchicalGroups(mainName = mainName, eachName = FALSE, fCorr = fCorr)
  table1 <- UniqueNrList(hier, 1)
  table2 <- UniqueNrList(hier, -1)
  if (identical(table1, table2)) 
    table2 <- NULL
  if (is.null(table2)) {
    if (length(table1) != length(hier)) {
      outside <- seq_len(length(hier))[!(seq_len(length(hier)) %in% table1)]
      table2 <- outside[UniqueNrList(hier[outside])]
    }
  }
  if (!findLinked) {
    # extra check
    uh <- unlist(hier)
    if (length(unique(uh)) == length(uh)) 
      uniqueTable <- TRUE else uniqueTable <- FALSE
      if (uniqueTable & !is.null(table2)) 
        stop("Error detected in unique algorithm")
      table2 <- NULL
      if (!uniqueTable) 
        CheckHandling("Not a single unique table")
  } else {
    if (length(unique(c(table1, table2))) != length(hier)) 
      CheckHandling("All variables could not be used")
  }
  if (is.null(table2)) 
    return(list(groupVarInd = hier, table = list(ind1 = table1)))
  return(list(groupVarInd = hier, table = list(ind1 = table1, ind2 = table2)))
}



DimFromHier <- function(x, hier, addName = FALSE, total = "Total") {
  for (i in matlabColon(1, length(hier))) hier[[i]] <- DimFromHier1(x, hier[[i]], 
                                                                    addName = addName, total = total)
  hier
}

# addName here use '.' as separator
DimFromHier1 <- function(x, indHier = 1:dim(x)[2], addName = FALSE, total = "Total") {
  start <- "@@"
  add <- "@"
  r1 <- data.frame(levels = "@", codes = total, stringsAsFactors = FALSE)
  
  b <- CrossLevels(x[, rev(indHier), drop = FALSE])
  
  m <- NCOL(b)
  n <- NROW(b)
  symbol <- start
  for (i in matlabColon(2, m)) symbol <- c(symbol, paste(symbol[i - 1], add, sep = ""))
  
  symbols <- rep(" ", m * n)
  codes <- rep(" ", m * n)
  k <- 0
  bb <- b[1, , drop = FALSE]
  for (i in matlabColon(1, n)) for (j in matlabColon(1, m)) {
    newrow <- FALSE
    if (i == 1) 
      newrow <- TRUE else if (bb[1, j] != b[i, j]) 
        newrow <- TRUE
      if (newrow) {
        k <- k + 1
        bb[1, j] <- b[i, j]
        symbols[k] <- symbol[j]
        if (addName) 
          codes[k] <- paste(colnames(b)[j], as.character(b[i, j]), sep = ".") else codes[k] <- as.character(b[i, j])
      }
      
  }
  rbind(r1, data.frame(levels = symbols[matlabColon(1, k)], codes = codes[matlabColon(1, 
                                                                                      k)], stringsAsFactors = FALSE))
}


#' Finding dimList
#'
#' Finding lists of level-hierarchy as needed for the input parameter
#' dimList to the function makeProblem in package sdcTable
#'
#' @encoding UTF8
#'
#' @param x Matrix or data frame containing the variables (micro data or cell counts data).
#' @param groupVarInd List of vectors of indices defining the hierarchical variable groups.
#' @param addName When TRUE the variable name is added to the level names, except for variables with most levels.
#' @param sep A character string to separate when addName apply.
#' @param xReturn When TRUE x is also in output, possibly changed according to addName.
#' @param total String used to name totals.
#'
#' @return Output is a list according to the specifications in sdcTable.
#'         When xReturn is TRUE output has an extra list level and x is the first element.
#'
#' @export
#'
#' @examples
#'  x <- rep(c('A','B','C'),3)
#'  y <- rep(c(11,22,11),3)
#'  z <- c(1,1,1,2,2,2,3,3,3)
#'  zy <- paste(z,y,sep='')
#'  m <- cbind(x,y,z,zy)
#'  FindDimLists(m)
FindDimLists <- function(x, groupVarInd = HierarchicalGroups(x = x), addName = FALSE, 
                         sep = ".", xReturn = FALSE, total = "Total") {
  hierGr <- GroupNrList(groupVarInd)
  CheckOk <- TRUE
  if (!addName) 
    for (i in seq_len(length(hierGr))) if (!CheckLevels(x, hierGr[[i]], CheckLevelsHandling = warning)) 
      CheckOk <- FALSE
  if (!CheckOk) {
    warning("Settting addName to TRUE (overriding input)")
    addName <- TRUE
  }
  if (addName) {
    addVar <- NULL
    for (i in matlabColon(1, length(hierGr))) addVar <- c(addVar, hierGr[[i]][matlabColon(2, 
                                                                                          length(hierGr[[i]]))])
    addVar <- unique(addVar)
    for (i in addVar) x[, i] <- paste(colnames(x)[i], x[, i], sep = sep)
  }
  if (addName) 
    for (i in seq_len(length(hierGr))) CheckLevels(x, hierGr[[i]], CheckLevelsHandling = stop)
  for (i in seq_len(length(groupVarInd))) CheckLevels(x, groupVarInd[[i]], CheckLevelsHandling = stop, 
                                                      checkDecreasing = TRUE, total = total)
  dimLists <- DimFromHier(x, groupVarInd, addName = FALSE, total = total)  # addName already done
  if (!xReturn) 
    return(dimLists)
  for (i in seq_len(NCOL(x))) x[, i] <- as.character(x[, i])
  list(x = x, dimLists = dimLists)
}

CheckLevels <- function(data, dimVarInd = 1:NCOL(data), CheckLevelsHandling = warning, 
                        checkDecreasing = FALSE, total = NULL) {
  x <- NULL
  oldlength <- Inf
  for (i in dimVarInd) {
    iunique <- unique(as.character(data[, i]))
    ilength <- length(iunique)
    if (checkDecreasing) 
      if (ilength > oldlength) 
        stop("Number of levels not decreasing")
    oldlength <- ilength
    x <- c(x, iunique)
  }
  if (!is.null(total)) {
    if (total %in% x) 
      CheckLevelsHandling(paste(total, "is used as a level name ...", paste(colnames(data)[dimVarInd], 
                                                                            collapse = ", ")))
  }
  if (length(x) == length(unique(x))) 
    return(TRUE)
  CheckLevelsHandling(paste("Levelnames must be different in", paste(colnames(data)[dimVarInd], 
                                                                     collapse = ", ")))
  return(FALSE)
}


#' Finding commonCells
#'
#' Finding lists defining common cells as needed for the input parameter
#' commonCells to the function protectLinkedTables in package sdcTable.
#' The function handles two tables based on the same main variables
#' but possibly different aggregating variables.
#'
#' @encoding UTF8
#'
#' @param dimList1 As input parameter dimList to the function makeProblem in package sdcTable.
#' @param dimList2 Another dimList with the same names and using the same level names.
#'
#' @return Output is a list according to the specifications in sdcTable.
#'
#' @export
#'
#' @examples
#'  x <- rep(c('A','B','C'),3)
#'  y <- rep(c(11,22,11),3)
#'  z <- c(1,1,1,2,2,2,3,3,3)
#'  zy <- paste(z,y,sep='')
#'  m <- cbind(x,y,z,zy)
#'  fg <- FindTableGroup(m,findLinked=TRUE)
#'  dimLists <- FindDimLists(m,fg$groupVarInd)
#'  # Using table1 and table2 in this example cause error,
#'  # but in other cases this may work well
#'  try(FindCommonCells(dimLists[fg$table$table1],dimLists[fg$table$table2]))
#'  FindCommonCells(dimLists[c(1,2)],dimLists[c(1,3)])
FindCommonCells <- function(dimList1, dimList2) {
  okNames <- TRUE
  if (length(unique(names(dimList1))) != length(dimList1)) 
    okNames <- FALSE
  if (length(unique(names(dimList2))) != length(dimList2)) 
    okNames <- FALSE
  if (!okNames) 
    stop("Elements of dimLists must be named uniquely.")
  commonNames <- names(dimList1)[names(dimList1) %in% names(dimList2)]
  niceProblem <- identical(names(dimList1), names(dimList2))
  if (!niceProblem) 
    stop("Only problems where identical(names(dimList1),names(dimList2))=TRUE implemented.")
  
  commonCells <- vector("list", length(commonNames))
  names(commonCells) <- commonNames
  for (i in seq_len(length(commonNames))) {
    okAll <- (niceProblem & identical(dimList1[[i]], dimList2[[i]]))
    if (okAll) 
      commonCells[[i]] <- vector("list", 3) else commonCells[[i]] <- vector("list", 4)
      commonCells[[i]][[1]] <- commonNames[i]
      commonCells[[i]][[2]] <- commonNames[i]
      if (okAll) 
        commonCells[[i]][[3]] <- "All" else {
          c1 <- dimList1[[which(names(dimList1) == commonNames[i])]]$codes
          c2 <- dimList2[[which(names(dimList2) == commonNames[i])]]$codes
          cc <- c1[c1 %in% c2]
          commonCells[[i]][[3]] <- cc
          commonCells[[i]][[4]] <- cc
        }
  }
  commonCells
}




UniqueNrList <- function(x, sort = 0) {
  if (sort == 0) 
    ix <- seq_len(length(x)) else {
      ix <- SortNrList(x, index.return = TRUE)
      if (sort < 0) 
        ix <- rev(ix)
    }
  z <- NULL
  xz <- NULL
  for (i in ix) {
    if (!any((x[[i]] %in% xz))) {
      z <- c(z, i)
      xz <- c(xz, x[[i]])
    }
  }
  sort(z)
}

# Setter sammen grupper hvis de har noe felles
GroupNrList <- function(x) {
  n <- length(x)
  z <- vector("list", n)
  z[[1]] <- x[[1]]
  k <- 1
  for (i in matlabColon(2, n)) {
    a <- x[[i]]
    jj <- 0
    for (j in seq_len(k)) {
      if (any(x[[i]] %in% z[[j]])) 
        jj <- j
    }
    if (jj == 0) {
      k <- k + 1
      z[[k]] <- x[[i]]
    } else {
      z[[jj]] <- unique(c(z[[jj]], x[[i]]))
    }
  }
  z[seq_len(k)]
}

CrossLevels <- function(x) {
  SortRows(unique(x, MARGIN = 1))
}

Try the SSBtools package in your browser

Any scripts or data that you put into this service are public.

SSBtools documentation built on Aug. 22, 2018, 5:05 p.m.