Nothing
#' Create ID Crosswalk
#'
#' Link ID columns from multiple data sets. De-identified columns
#' are created to make a crosswalk.
#'
#' \sQuote{visit.id} and \sQuote{uniq.id} may occur multiple
#' times, but should have a one-to-one linkage defined by at least
#' one of the input data sets. A new visit number is generated for
#' each repeated \sQuote{uniq.id}.
#'
#' @param data list of data.frames
#' @param idcols list of character vectors, indicating ID columns
#' found in each data set given in \sQuote{data}
#' @param visit.id character sting indicating visit-level ID variable (default is "subject_id")
#' @param uniq.id character sting indicating subject-level ID variable (default is "subject_uid")
#'
#' @return crosswalk of ID columns and their de-identified versions
#'
#' @examples
#' demo_data <- data.frame(subj_id=c(4.1,4.2,5.1,6.1),
#' pat_id=c(14872,14872,24308,37143),
#' gender=c(1,1,0,1),
#' weight=c(34,42,28,63),
#' height=c(142,148,120,167))
#'
#' conc_data <- data.frame(subj_id=rep((4:6)+0.1,each=5),
#' event=rep(1:5,times=3),
#' conc.level=15*exp(-1*rep(1:5,times=3))+rnorm(15,0,0.1))
#'
#' data <- list(demo_data, conc_data)
#' idcols <- list(c('subj_id', 'pat_id'), 'subj_id')
#' idCrosswalk(data, idcols, visit.id='subj_id', uniq.id='pat_id')
#'
#' @export
idCrosswalk <- function(data, idcols, visit.id="subject_id", uniq.id="subject_uid") {
dl <- length(data)
il <- length(idcols)
stopifnot(dl == il)
res <- vector("list", dl)
for (i in seq(dl)) {
df <- data[[i]][, idcols[[i]], drop = FALSE]
res[[i]] <- unique(df)
}
red <- Reduce(function(x, y) merge(x, y, all = TRUE), res)
# special case: only one ID variable to de-identify
if(visit.id == uniq.id && ncol(red) == 1 && names(red) == visit.id) {
red <- red[order(red[, visit.id], red[, uniq.id]),,drop = FALSE]
uid <- seq(nrow(red))
red[, "mod_visit"] <- 1
red[, "mod_id"] <- uid
red[, "mod_id_visit"] <- paste(uid, red[, "mod_visit"], sep = ".")
} else {
ix <- which(is.na(red[, visit.id]))
red[ix, visit.id] <- ix * -1 + 0.1
red[, "subj"] <- trunc(red[, visit.id])
idBySubj <- function(x) unique(x[!is.na(x)])
# should only have one `uniq.id` per `visit.id`
meanIds <- tapply(red[, uniq.id], red[, "subj"], idBySubj)
idlen <- lengths(meanIds)
meanIds[idlen == 0] <- NA
meanIds <- unlist(meanIds)
ix <- which(idlen > 1)
if (length(ix)) {
badId <- names(ix)
print(unique(red[red[, "subj"] %in% badId, c("subj",
uniq.id)]))
stop("discrepancy with visit.id and uniq.id must be fixed")
}
red[, uniq.id] <- meanIds[match(red[, "subj"], names(meanIds))]
# if missing `visit.id` can be found for `uniq.id`, use that instead
# visit number would be unknown though
# after discussion, impute visit number by using the minimum/first known
revr_check <- tapply(red[,'subj'], red[, uniq.id], idBySubj)
ix <- which(lengths(revr_check) > 1)
if(length(ix)) {
chk_sub_ids <- names(revr_check)[ix]
chk_split <- red[, uniq.id] %in% chk_sub_ids
chk0 <- red[!chk_split,]
chk1 <- red[chk_split,]
chk2 <- do.call(rbind, lapply(split(chk1, chk1[, uniq.id]), function(i) {
subj_ids <- i[,'subj']
u_subj_ids <- unique(subj_ids[subj_ids >= 0])
if(length(u_subj_ids) == 1) {
jix <- subj_ids != u_subj_ids
# use the minimum visit number
i[jix, visit.id] <- min(i[!jix, visit.id])
}
i
}))
red <- rbind(chk0, chk2)
}
red <- red[, c(visit.id, uniq.id)]
ix <- which(is.na(red[, uniq.id]))
red[ix, uniq.id] <- ix * -100
red <- unique(red)
red <- red[order(red[, visit.id], red[, uniq.id]),]
dd <- duplicated(red[, uniq.id])
ddcs <- cumsum(dd)
ddncs <- cumsum(!dd)
red[, "mod_visit"] <- ddcs - ddcs[which(!dd)][ddncs] + 1
red[, "mod_id"] <- ddncs
red[, "mod_id_visit"] <- paste(ddncs, red[, "mod_visit"],
sep = ".")
red[red[, visit.id] <= 0, visit.id] <- NA
red[red[, uniq.id] <= 0, uniq.id] <- NA
}
rownames(red) <- NULL
attr(red, "deidentified_cols") <- c("mod_visit", "mod_id",
"mod_id_visit")
red
}
#' Pull Fake/Mod ID
#'
#' Replace IDs with de-identified version pulled from a crosswalk.
#'
#' @param dat a data.frame
#' @param xwalk a data.frame providing linkage for each ID, e.g. output from \code{\link{idCrosswalk}}
#' @param firstCols name of columns to put at front of output data set
#' @param orderBy name of columns used to reorder output data set
#' @param uniq.id character string indicating subject-level id variable (default is "subject_uid")
#'
#' @return The modified data.frame
#'
#' @examples
#' demo_data <- data.frame(subj_id=c(4.1,4.2,5.1,6.1),
#' pat_id=c(14872,14872,24308,37143),
#' gender=c(1,1,0,1),
#' weight=c(34,42,28,63),
#' height=c(142,148,120,167))
#'
#' # crosswalk w/ same format as idCrosswalk() output
#' xwalk <- data.frame(subj_id=c(4.1,4.2,5.1,6.1),
#' pat_id=c(14872,14872,24308,37143),
#' mod_visit=c(1,2,1,1),
#' mod_id=c(1,1,2,3),
#' mod_id_visit=c(1.1,1.2,2.1,3.1))
#'
#' demo_data_deident <- pullFakeId(demo_data, xwalk,
#' firstCols = c('mod_id','mod_id_visit','mod_visit'),
#' uniq.id='pat_id')
#'
#' @export
pullFakeId <- function(dat, xwalk, firstCols = NULL, orderBy = NULL, uniq.id="subject_uid") {
cmn <- intersect(names(dat), names(xwalk))
# uniq.id (MRN) may have multiple IDs; if it's the key, reduce
if(length(cmn) == 1 && cmn == uniq.id) {
xwalk <- unique(xwalk[, c(uniq.id,'mod_id')])
xwalk <- xwalk[!is.na(xwalk[,uniq.id]),]
}
if('deidentified_cols' %in% names(attributes(xwalk))) {
toremove <- setdiff(names(xwalk), attr(xwalk, 'deidentified_cols'))
} else {
toremove <- cmn
}
if(is.null(orderBy)) {
dat[,'row_order'] <- seq(nrow(dat))
}
x <- merge(dat, xwalk, by = cmn, all.x = TRUE)
x <- x[,-match(toremove, names(x)), drop = FALSE]
if(is.null(orderBy)) {
x <- x[order(x[,'row_order']),-match('row_order', names(x)), drop = FALSE]
} else {
x <- x[do.call(order, as.list(x[,orderBy,drop = FALSE])),, drop = FALSE]
}
rownames(x) <- NULL
if(!is.null(firstCols)) {
x <- x[,c(firstCols, setdiff(names(x), firstCols)), drop = FALSE]
}
x
}
#' Pull Real ID
#'
#' Replace de-identified IDs with identified version pulled from a crosswalk.
#'
#' @param dat a data.frame
#' @param xwalk a data.frame providing linkage for each ID, e.g. output from \code{\link{idCrosswalk}};
#' if NULL, the crosswalk will be pulled from the \sQuote{pkxwalk} option, or otherwise the
#' unmodified data.frame.
#' @param remove.mod.id logical, should the de-identified IDs -- mod_id, mod_visit, mod_id_visit --
#' be removed (default=FALSE)
#'
#' @return The modified data.frame
#'
#' @examples
#' demo_data_deident <- data.frame(mod_id=c(1,1,2,3),
#' mod_id_visit=c(1.1,1.2,2.1,3.1),
#' mod_visit=c(1,2,1,1),
#' gender=c(1,1,0,1),
#' weight=c(34,42,28,63),
#' height=c(142,148,120,167))
#'
#' # crosswalk w/ same format as idCrosswalk() output
#' xwalk <- data.frame(subj_id=c(4.1,4.2,5.1,6.1),
#' pat_id=c(14872,14872,24308,37143),
#' mod_visit=c(1,2,1,1),
#' mod_id=c(1,1,2,3),
#' mod_id_visit=c(1.1,1.2,2.1,3.1))
#'
#' pullRealId(demo_data_deident, xwalk)
#' pullRealId(demo_data_deident, xwalk, remove.mod.id=TRUE)
#'
#' @export
pullRealId <- function(dat, xwalk = NULL, remove.mod.id=FALSE) {
if(is.null(xwalk)) {
xwalkVar <- getOption('pkxwalk')
if(!is.null(xwalkVar) && xwalkVar %in% ls(envir = .GlobalEnv)) {
xwalk <- get(xwalkVar, envir = .GlobalEnv)
} else {
return(dat)
}
}
xnames <- names(xwalk)
dnames <- names(dat)
coi <- c(xnames[!grepl('^mod_', xnames)])
xw <- xwalk[,coi,drop = FALSE]
ix <- FALSE
if('mod_id_event' %in% dnames && 'mod_id_visit' %in% xnames) {
ix <- match(sub('_.*', '', dat[,'mod_id_event']), xwalk[,'mod_id_visit'])
} else if('mod_id_visit' %in% dnames && 'mod_id_visit' %in% xnames) {
ix <- match(dat[,'mod_id_visit'], xwalk[,'mod_id_visit'])
} else if('mod_id' %in% dnames && 'mod_id' %in% xnames) {
ix <- match(dat[,'mod_id'], xwalk[,'mod_id'])
}
if(length(ix) == nrow(dat)) {
dat <- cbind(xw[ix,,drop = FALSE], dat)
}
if(remove.mod.id){
dat <- dat[,!grepl('^mod_',names(dat))]
}
dat
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.