R/swcGetMapping.R

#' Compute a matching table between two lists of municipality IDs
#'
#' For two lists of Swiss municipality IDs at any two points in time, this
#' function creates a data frame with two columns where each row represents
#' a match between municipality IDs.  This can be used as an intermediate
#' table for merging two data sets with municipality identifiers taken at
#' different, possibly unknown, points in time.
#'
#' It is advisable to use factors as list of municipality IDs.  By that,
#' comparisons and merges for municipality IDs are automatically checked for
#' compatibility.
#'
#' Note that the "from" list must be from an earlier time than the "to" list.
#' Trying to compute the mapping the other way round results in an error.
#' This is intentional: As municipalities are usually merged, it makes sense
#' to use the most recent data set as target for the mapping.  This can also be
#' a file with suitable geometries to allow for visualization.
#'
#' @template swc
#' @param ids.from A list of "source" municipality IDs, preferably a factor
#' @param ids.to A list of "target" municipality IDs, preferably a factor
#'
#' @return A data frame with columns prefixed by \code{from.} and \code{to} that
#'   represents the computed match.  The municipality IDs are stored in the
#'   columns \code{from.mId} and \code{to.mId}.  The columns
#'   \code{from.MergeType} and \code{to.MergeType} contain \code{valid} if
#'   the municipality is contained in both the input and the mapping table,
#'   \code{missing} if the municipality is missing from the input, and
#'   \code{extra} if the municipality is in the input but not in the mapping
#'   table; most columns are \code{NA} for such rows.  In addition, the column
#'   \code{MergeType} offers a summary of the "from" and "to" status: Rows with
#'   values other than \code{"valid"} or \code{"missing"} should be examined.
#'
#' @example example/swcGetMapping.R
#' @export
swcGetMapping <- function(swc = NULL, ids.from, ids.to) {
  if (!is.null(swc)) {
    warning("swc ignored.", call. = FALSE)
  }
  municipality_mutations <- SwissHistMunData::municipality_mutations

  #' @details
  #' For two lists of municipalities, we construct a mapping from the first list
  #' to the second.  First, the most probable mutation number in the
  #' "municipality mutations" data set is computed.
  tid <- function(ids) {
    if (is.factor(ids)) ids <- as.character(ids)
    ids <- as.integer(ids)
    ids
  }
  ids.from <- sort(unique(ids.from))
  ids.from.int <- tid(ids.from)
  mid.from <- getMostProbableMutationId(ids.from.int)
  hist.list.from <- getHistIdList(mid.from)

  ids.to <- sort(unique(ids.to))
  ids.to.int <- tid(ids.to)
  mid.to <- getMostProbableMutationId(ids.to.int)
  hist.list.to <- getHistIdList(mid.to)

  ret <- getMunicipalityMappingWorker(hist.list.from, mid.from, hist.list.to, mid.to)

  mt <- function(c) factor(c, levels=c("valid", "missing", "extra"))
  cn <- function(name, n) paste(name, n, sep='.')

  resultTable <- function(histId, inId, name) {
    ret <- municipality_mutations[match(histId, municipality_mutations$mHistId),
                                 c('mHistId', 'cAbbreviation', 'mId', 'mLongName', 'mShortName')]
    ret$MatchType <- mt(ifelse(ret$mId %in% inId, "valid", "missing"))
    names(ret) <- cn(names(ret), name)
    ret
  }

  extraTable <- function(retId, inId, name) {
    mId <- sort(setdiff(inId, retId))
    if (length(mId) == 0) return(NULL)
    ret <- data.frame(mId=mId)
    ret <- transform(ret, MatchType=mt("extra"))
    names(ret) <- cn(names(ret), name)
    ret
  }

  fixFactor <- function(ret, inId, name) {
    nameId <- paste0("mId.", name)
    nameIdAsNumber <- paste0("mIdAsNumber.", name)
    if (is.factor(inId)) {
      ret[[nameIdAsNumber]] <- ret[[nameId]]
      ret[[nameId]] <- factor(ret[[nameId]], levels=levels(inId))
    }

    ret
  }

  ret.from <- resultTable(ret$from, ids.from.int, "from")
  ret.to <- resultTable(ret$to, ids.to.int, "to")
  ret <- cbind(ret.from, ret.to)
  ret <- plyr::rbind.fill(ret,
                          extraTable(ret.from$mId.from, ids.from.int, "from"),
                          extraTable(ret.to$mId.to, ids.to.int, "to"))
  ret <- fixFactor(ret, ids.from, "from")
  ret <- fixFactor(ret, ids.to, "to")

  dMatchType <- c(
    `valid.valid`="valid",
    `missing.missing`="missing",
    `missing.valid`="missing.from",
    `extra.NA`="extra.from",
    `valid.missing`="missing.to",
    `NA.extra`="extra.to"
  )
  ret$MatchType <- factor(
    dMatchType[paste(ret$MatchType.from, ret$MatchType.to, sep='.')],
    levels=dMatchType)
  ret
}

getMostProbableMutationId <- function(municipalityIds) {
  fitness <- getMunicipalityIdFitness(municipalityIds)
  logging::logdebug("fitness:\n%s", head(plyr::arrange(fitness, -fitness)$fitness, 10))
  logging::logdebug("mutationId:\n%s", head(plyr::arrange(fitness, -fitness)$mMutationId, 10))
  fitness.max <- which.max(fitness$fitness)
  ret <- fitness$mMutationId[fitness.max]
  logging::logdebug("getMostProbableMutationId: %s", ret)
  ret
}

getMunicipalityIdFitness <- function(municipalityIds) {
  mun.mut <- swcGetMutations()
  computeFitness(mun.mut, municipalityIds)
}

seq_by <- function(data, varname) {
  data <- plyr::arrange(data, get(varname))
  lengths <- rle(data[[varname]])$lengths
  var_seq <- unlist(lapply(lengths, seq_len))

  data[["seq"]] <- var_seq
  data
}

computeMunList <- function(mun.mut.m) {
  logging::loginfo("computeMunList")
  # The .y values of the argument contain the newly added, the .x values
  # the dropped column identifiers.  This function verifies that after each
  # .y value there is a corresponding .x value, and vice versa, and that the
  # mutation IDs are in nondecreasing order.  The return value is the list of
  # municipality IDs after all transformations have been applied.
  #
  # Work around performance issues in plyr:
  mun.mut.m$mMutationId <- with(mun.mut.m, as.integer(mMutationId))
  #
  # Split input:
  y <- subset(mun.mut.m, grepl("[.]y$", get("variable")))
  x <- subset(mun.mut.m, grepl("[.]x$", get("variable")))
  stopifnot(nrow(x) + nrow(y) == nrow(mun.mut.m))
  #
  # Assign to each x even sequence values, to each y odd sequence values:
  xs <- seq_by(x, "value")
  xs$seq <- xs$seq * 2L
  ys <- seq_by(y, "value")
  ys$seq <- ys$seq * 2L - 1L
  #
  # Mingle, order by municipality (=value) and sequence number:
  xys <- plyr::arrange(plyr::rbind.fill(xs, ys), get("value"), get("seq"))
  #
  # Municipality groups: Compute "group change points" and end of group:
  xys.dvg <- diff(xys$value) > 0
  xys.endgroup <- c(xys.dvg, TRUE)
  #
  # Mutation IDs must be nondecreasing, and sequence numbers must be increasing
  # by one in each group (i.e., everywhere except perhaps at group boundaries)
  xys.dMutationId <- diff(xys$mMutationId)
  xys.dseq <- diff(as.integer(xys$seq))
  stopifnot(xys.dMutationId[!xys.dvg] >= 0)
  stopifnot(xys.dseq[!xys.dvg] == 1)
  #
  # Return value: All municipalities that have not expired at the end of their
  # group.
  subset(xys[xys.endgroup, c("seq", "value")], seq %% 2 == 1)$value
}

meltMutations <- function(mun.mut, hist) {
  logging::loginfo("meltMutations")
  # The list of mutations is processed in the order of the mutation ID, which
  # is composed of mutation date and mutation number. (Conversation with Ernst
  # Oberholzer end of March 2013.)  All records with the same mutation ID
  # form a mutation.
  measure.vars=paste0(if (hist) "mHistId" else "mId", ".", c("x", "y"))

  mun.mut.m <- reshape2::melt(
    mun.mut, id.vars="mMutationId",
    measure.vars=measure.vars,
    na.rm=TRUE)
  mun.mut.m <- plyr::arrange(mun.mut.m,
                             get("mMutationId"), get("variable"))
  unique(mun.mut.m)
}

computeFitness <- function(mun.mut, municipalityIds) {
  mun.mut.m <- meltMutations(mun.mut, hist = F)

  mun.mut.m <- plyr::mutate(
    mun.mut.m,
    dir = ifelse(grepl("[.]y$", get("variable")), 1L, -1L),
    desired = ifelse(get("value") %in% municipalityIds, 1L, -1L),
    delta = get("dir") * get("desired"))

  mun.mut.c <- reshape2::dcast(data=mun.mut.m, formula=mMutationId~.,
                               fun.aggregate = sum,
                               value.var = "delta")

  mun.mut.c$fitness <- cumsum(mun.mut.c$.)
  mun.mut.c[, c("mMutationId", "fitness")]
}

getHistIdList <- function(mutationId) {
  mun.mut <- subset(
    swcGetMutations(), get("mMutationId") <= mutationId)
  mun.mut.m <- meltMutations(mun.mut, hist = T)
  computeMunList(mun.mut.m)
}

getMunicipalityMappingWorker <- function(hist.list.from, mid.from, hist.list.to, mid.to) {
  # Here, we receive the most probable mutation number in the "municipality
  # mutations" data set as parameter.

  # We assume that the first list is older than the second list:
  stopifnot(mid.from <= mid.to)

  # Then we iterate over all mutations.  We must cover all mutations,
  # because a mutation A -> A' -> B would be lost otherwise if A'
  # is not in either municipality list.
  mun.mut <- swcGetMutations()

  # The mapping will be represented as a linear transformation, encoded
  # as a sparse matrix with rows as "from" and columns as "to" municipalities.
  # Each mutation is converted to such a matrix, the matrix product of
  # all mutations will be the final mapping.  We start with a unit
  # matrix (the identity transformation) of all "from" columns:

  f <- Matrix::sparseMatrix(i=seq_along(hist.list.from),
                            j=seq_along(hist.list.from),
                            x=1,
                            dimnames=list(hist.list.from, hist.list.from))

  # Subsequently, this transformation is augmented with all mutations
  # between "from" and "to"
  trans.list <- plyr::ddply(
    subset(mun.mut, kimisc::in.interval.lo(
      as.numeric(get("mMutationId")), as.numeric(mid.from), as.numeric(mid.to))),
    "mMutationId",
    function(m) {
      rn <- colnames(f)

      abolId <- unique(m$mHistId.x)
      admId <- unique(m$mHistId.y)
      abolId <- subset(abolId, !is.na(abolId))
      admId <- subset(admId, !is.na(admId))
      logging::logdebug('%s: +(%s), -(%s)', m$mMutationId[1], format(admId), format(abolId))

      removedId <- setdiff(abolId, admId)
      remainingId <- setdiff(abolId, removedId)
      addedId <- sort(setdiff(admId, abolId))
      logging::logdebug('%s: ++(%s), =(%s), --(%s)', m$mMutationId[1], format(addedId), format(remainingId), format(removedId))

      stopifnot(sort(remainingId) == sort(setdiff(admId, addedId)))

      cn <- setdiff(c(rn, addedId), removedId)

      idId <- setdiff(rn, abolId)
      idI <- match(idId, rn)
      idJ <- match(idId, cn)
      stopifnot(!is.na(idI))
      stopifnot(!is.na(idJ))

      logging::logdebug('%s: (%s)->(%s)', m$mMutationId[1], format(m$mHistId.x), format(m$mHistId.y))
      trI <- match(m$mHistId.x, rn)
      trJ <- match(m$mHistId.y, cn)

      stopifnot(!is.na(trI))
      stopifnot(!is.na(trJ))
      logging::logdebug('%s: ((%s))->((%s))', m$mMutationId[1], format(trI), format(trJ))
      logging::logdebug('%s: %s x %s', m$mMutationId[1], length(rn), length(cn))
      g <- Matrix::sparseMatrix(c(idI, trI), c(idJ, trJ), x=1, dimnames=list(rn, cn))

      logging::logdebug('%s: %s %%*%% %s', m$mMutationId[1], dim(f), dim(g))
      f <<- f %*% g
      logging::logdebug('%s: %s', m$mMutationId[1], dim(f))

      data.frame(rows=length(rn), columns=length(cn))
    }
  )

  ff <- Matrix::summary(f)
  ff$from <- as.integer(rownames(f)[ff$i])
  ff$to <- as.integer(colnames(f)[ff$j])
  ff <- plyr::arrange(ff, get("from"))
  ff$i <- NULL
  ff$j <- NULL
  ff$x <- NULL
  as.data.frame(ff)
}
nattimmis/Swiss-Communes-Statistical-Data documentation built on May 22, 2019, 3:52 p.m.