Nothing
#'Probabilistic Patient Record Linkage
#'
#'@param data1 either a binary (\code{1} or \code{0} values only) matrix or binary
#'data frame of dimension \code{n1 x K} whose rownames are the observation identifiers.
#'
#'@param data2 either a binary (\code{1} or \code{0} values only) matrix or a binary
#'data frame of dimension \code{n2 x K} whose rownames are the observation identifiers.
#'Columns should be in the same order as in \code{data1}.
#'
#'@param data1_cont2diff either a matrix or dataframe of continuous features,
#'such as age, for which the similarity measure uses the difference with
#'\code{data2_cont2diff}, whose rownames are . Default is \code{NULL}.
#'
#'@param data2_cont2diff either a matrix or dataframe of continuous features,
#'such as age, for which the similarity measure uses the difference with
#'\code{data2_cont1diff}, whose rownames are . Default is \code{NULL}.
#'
#'@param eps_plus discrepancy rate between \code{data1} and \code{data2}
#'
#'@param eps_minus discrepancy rate between \code{data2} and \code{data1}
#'
#'@param aggreg_2ways a character string indicating how to merge the posterior two
#'probability matrices obtained for each of the 2 databases. Four possibility are
#'currently implemented: \code{"maxnorm"}, \code{"max"}, \code{"min"}, \code{"mean"}
#'and \code{"prod"}. Default is \code{"mean"}.
#'
#'@param min_prev minimum prevalence for the variables used in matching.
#'Default is 1\%.
#'
#'@param d_max a numeric vector of length \code{K} giving the minimum difference
#'from which it is considered a discrepancy.
#'
#'@param use_diff logical flag indicating whether continuous differentiable variables should be used in the
#'
#'@param dates1 matrix or dataframe of dimension \code{n1 x K} including the concatenated dates intervals for each corresponding
#'diagnosis codes in \code{data1}. Default is \code{NULL} in which case dates are not used.
#'
#'@param dates2 matrix or dataframe of dimension \code{n2 x K} including the concatenated dates intervals for each corresponding
#'diagnosis codes in \code{data2}. Default is \code{NULL} in which case dates are not used. See details.
#'
#'@details \code{Dates:} the use of \code{dates1} and \code{dates2} requires that at least one date interval matches across
#'\code{dates1} and \code{dates2} for claiming an agreement on a diagnosis code between \code{data1} and \code{data2},
#'in addition of having that very same code recorded in both.
#'
#@param eps_inf discrepancy rate for the differentiable variables
#'
#'@references Hejblum BP, Weber G, Liao KP, Palmer N, Churchill S, Szolovits P,
#'Murphy S, Kohane I and Cai T, Probabilistic Record Linkage of De-Identified
#'Research Datasets Using Diagnosis Codes, \emph{Scientific Data}, 6:180298 (2019).
#'\doi{10.1038/sdata.2018.298}.
#'
#'@importFrom landpred VTM
#'@importFrom fGarch dsstd sstdFit
#'
#'@return a matrix of size \code{n1 x n2} with the posterior probability of matching for each \code{n1*n2} pair
#'
#'@examples
#'set.seed(123)
#'ncodes <- 500
#'npat <- 200
#'incid <- abs(rnorm(n=ncodes, 0.15, 0.07))
#'bin_codes <- rbinom(n=npat*ncodes, size=1, prob=rep(incid, npat))
#'bin_codes_mat <- matrix(bin_codes, ncol=ncodes, byrow = TRUE)
#'data1_ex <- bin_codes_mat[1:(npat/2+npat/10),]
#'data2_ex <- bin_codes_mat[c(1:(npat/10), (npat/2+npat/10 + 1):npat), ]
#'rownames(data1_ex) <- paste0("ID", 1:(npat/2+npat/10), "_data1")
#'rownames(data2_ex) <- paste0("ID", c(1:(npat/10), (npat/2+npat/10 + 1):npat), "_data2")
#'
#'if(interactive()){
#'res <- recordLink(data1 = data1_ex, data2 = data2_ex,
#' use_diff = FALSE, eps_minus = 0.01, eps_plus = 0.01)
#'round(res[c(1:3, 19:23), c(1:3, 19:23)], 3)
#'}
#'
#'@export
recordLink <- function(data1, data2, dates1 = NULL, dates2 = NULL,
eps_plus, eps_minus, aggreg_2ways = "mean",
min_prev = 0.01,
data1_cont2diff = NULL, data2_cont2diff = NULL,
#eps_inf,
d_max, use_diff=TRUE){
datesNotNull <- (!is.null(dates1) & !is.null(dates2))
nb_feat <- ncol(data1)
if(ncol(data2)!=nb_feat){stop("Number of columns in data2 is different from data1")}
n1 <- nrow(data1)
n2 <- nrow(data2)
if((is.null(data1_cont2diff) | is.null(data2_cont2diff)) && use_diff){
stop("cannot 'use_diff' when 'data1_cont2diff' and/or 'data2_cont2diff' is NULL\n Probably need to set 'use_diff = FALSE'")
}
if((is.null(dates1) & !is.null(dates2)) | (!is.null(dates1) & is.null(dates2))){
stop("missing one of the dates tables")
}
if(use_diff){
ndiff <- ncol(data1_cont2diff)
stopifnot(ncol(data2_cont2diff)==ndiff)
#if(length(eps_inf)==1 && ndiff>1){
# eps_inf <- rep(eps_inf, ndiff)
#}
if(length(d_max)==1 && ndiff>1){
d_max <- rep(d_max, ndiff)
}
#stopifnot(length(eps_inf)==ndiff)
stopifnot(length(d_max)==ndiff)
}
ind1 <- rownames(data1)
ind2 <- rownames(data2)
freq_select <- (colMeans(data1) > min_prev)
data1_bin <- data1[, freq_select, drop=FALSE]
data2_bin <- data2[, freq_select, drop=FALSE]
if(datesNotNull){
dates1_fs <- dates1[, freq_select, drop=FALSE]
dates2_fs <- dates2[, freq_select, drop=FALSE]
}
#rm(list=c("data1", "data2"))
#n_freq10 <- sum(colSums(data1_bin)/nrow(data1_bin)<0.10 | colSums(data1_bin)/nrow(data1_bin)>0.90)
#n_freq5 <- sum(colSums(data1_bin)/nrow(data1_bin)<0.05 | colSums(data1_bin)/nrow(data1_bin)>0.95)
#n_freq1 <- sum(colSums(data1_bin)/nrow(data1_bin)<0.01 | colSums(data1_bin)/nrow(data1_bin)>0.99)
#removing info too rare in any of the 2 database ?
col_in <- (pmax(apply(X=(data1_bin==0), MARGIN=2, FUN=mean), apply(X=data2_bin==0, MARGIN=2, FUN=mean)) < 0.995 &
pmin(apply(data1_bin==0,2,mean),apply(data2_bin==0,2,mean)) > 0.005)
if(sum(col_in)<1){stop("No features whith adequate prevalence")}
data1_bin <- data1_bin[, col_in, drop=FALSE]
data2_bin <- data2_bin[, col_in, drop=FALSE]
if(datesNotNull){
dates1_fs_ci <- dates1_fs[, col_in, drop=FALSE]
dates2_fs_ci <- dates2_fs[, col_in, drop=FALSE]
}
pi1 <- apply(data1_bin==1, MARGIN=2, FUN=mean)
pi2 <- apply(data2_bin==1, MARGIN=2, FUN=mean)
K0 <- ncol(data1_bin)
if(length(eps_plus)==1){
eps_p <- rep(eps_plus, K0)
}else if(length(eps_plus)==nb_feat){
eps_p <- eps_plus[freq_select][col_in]
}else{
stop("Length of 'eps_plus' doesn't make sense")
}
if(length(eps_minus)==1){
eps_n <- rep(eps_minus, K0)
}else if(length(eps_minus)==nb_feat){
eps_n <- eps_minus[freq_select][col_in]
}else{
stop("Length of 'eps_minus' doesn't make sense")
}
# need matrices for the C function
data1_bin <- as.matrix(data1_bin)
data2_bin <- as.matrix(data2_bin)
if(datesNotNull){
dates1_fs_ci <- as.matrix(dates1_fs_ci)
dates2_fs_ci <- as.matrix(dates2_fs_ci)
}
zeros_p <- which(eps_p==0)
if(length(zeros_p)>0){
eps_p[zeros_p] <- eps_p[zeros_p] + 1E-6
}
zeros_n <- which(eps_n==0)
if(length(zeros_n)>0){
eps_n[zeros_n] <- eps_n[zeros_n] + 1E-6
}
keep_eps <- which(eps_p!=1 & eps_n!=1)
if(datesNotNull){
dist_bin <- loglikC_bin_wDates(Bmat = data2_bin[, keep_eps], Amat = data1_bin[, keep_eps],
Bdates = dates2_fs_ci[, keep_eps], Adates = dates1_fs_ci[, keep_eps],
eps_p = eps_p[keep_eps], eps_n = eps_n[keep_eps],
piA = pi1[keep_eps], piB = pi2[keep_eps])
}else{
dist_bin <- loglikC_bin(Bmat = data2_bin[, keep_eps], Amat = data1_bin[, keep_eps], eps_p = eps_p[keep_eps],
eps_n = eps_n[keep_eps], piA = pi1[keep_eps], piB = pi2[keep_eps])
}
if(use_diff){
pi_inf <- numeric(ndiff)
for(j in 1:ndiff){
pi_inf[j] <- mean((unlist(lapply(data1_cont2diff[, j, drop=FALSE], function(x){x-data2_cont2diff[, j, drop=FALSE]}))<d_max[j]))
}
dist_diff <- loglikratioC_diff_arbitrary(Bmat = as.matrix(t(data2_cont2diff)),
Amat = as.matrix(t(data1_cont2diff)),
d_max = d_max,
cost = rep(-1E+4, ndiff)
)
dist_all <- dist_bin + dist_diff
}else{
dist_all <- dist_bin
}
rm(list=c("data1_bin", "data2_bin"))
colnames(dist_all) <- ind2
rownames(dist_all) <- ind1
#sstdFit generates warnings - to be ignored
if(length(dist_all)>10000){
sstdFit_1way <- try(suppressWarnings(fGarch::sstdFit(x = sample(dist_all, 10000))), silent=TRUE)
if(inherits(sstdFit_1way, "try-error")){sstdFit_1way <- try(suppressWarnings(fGarch::sstdFit(x = sample(dist_all,10000))), silent=TRUE)}
tmp_est <- sstdFit_1way$estimate
for(i in 1:4){
sstdFit_1way_sub <- try(suppressWarnings(fGarch::sstdFit(x = sample(dist_all, 10000))), silent=TRUE)
if(inherits(sstdFit_1way_sub, "try-error")){sstdFit_1way_sub <- try(suppressWarnings(fGarch::sstdFit(x = sample(dist_all, 10000))), silent=TRUE)}
tmp_est <- rbind(tmp_est, sstdFit_1way_sub$estimate)
}
sstdFit_1way$est <- colMeans(tmp_est, na.rm = TRUE)
}else{
sstdFit_1way <- try(suppressWarnings(fGarch::sstdFit(x = as.vector(dist_all))), silent=TRUE)
if(inherits(sstdFit_1way, "try-error")){sstdFit_1way <- try(suppressWarnings(fGarch::sstdFit(x = as.vector(dist_all))), silent=TRUE)}
if(inherits(sstdFit_1way, "try-error")){
stop("Error in fitting the Student-t distribution")
}
sstdFit_1way$est <- sstdFit_1way$estimate
}
inter_x <- 0.1
obs_dist_x_1way <- seq(min(dist_all), max(dist_all), by=inter_x)
### select cutoff
# # empirical solution
# # single skew-t parametric estimation
# fitskewt_dens_est <- fGarch::dsstd(obs_dist_x_1way,
# mean=sstdFit_1way$est[1],
# sd=sstdFit_1way$est[2],
# nu=sstdFit_1way$est[3],
# xi=sstdFit_1way$est[4])
# peak_fitskewt_dens_est <- max(fitskewt_dens_est)
# rho1_hat_empirical_1way <- min((obs_dist_x_1way[obs_dist_x_1way >= peak_fitskewt_dens_est])[fitskewt_dens_est[obs_dist_x_1way >= peak_fitskewt_dens_est]/peak_fitskewt_dens_est < 1/min(n1,n2)])
# analytical solution
nu <- sstdFit_1way$est[3]
s <- sstdFit_1way$est[2]
m <- sstdFit_1way$est[1]
xi <- sstdFit_1way$est[4]
Beta <- exp(lgamma(0.5) - lgamma((nu+1)/2) + lgamma(nu/2))
m1 <- 2*sqrt(nu-2)/((nu-1)*Beta)
mu <- m1*(xi-1/xi)
sigma <- sqrt((1-m1^2)*(xi^2+1/xi^2)+2*m1^2-1)
d1max_x <- m-(mu-xi*sqrt((nu-2)/(nu+2)))*s/sigma
d1 <- function(x){
#c <- 2*sigma*gamma((nu+1)/2)/(s*(xi+1/xi)*sqrt(pi*nu-2)*gamma(nu/2))
c <- exp(log(2) + log(sigma) + lgamma((nu+1)/2) - log(s*(xi+1/xi)*sqrt(pi*nu-2)) - lgamma(nu/2))
d <- -c*(nu+1)*sigma/(xi^2*(nu-2)*s)
y <- (x-m)/s*sigma+mu
return(d*y*(1+y^2/(xi^2*(nu-2)))^(-(nu+3)/2))
}
d2 <- function(x){
#c <- 2*sigma*gamma((nu+1)/2)/(s*(xi+1/xi)*sqrt(pi*nu-2)*gamma(nu/2))
c <- exp(log(2) + log(sigma) + lgamma((nu+1)/2) - log(s*(xi+1/xi)*sqrt(pi*nu-2)) - lgamma(nu/2))
d <- -c*(nu+1)*sigma/(xi^2*(nu-2)*s)
y <- (x-m)/s*sigma+mu
return(d*sigma/s*(1+y^2/(xi^2*(nu-2)))^(-(nu+5)/2)*(1-y^2*(nu+2)/xi^2*(nu-2)))
}
rho1_hat <- obs_dist_x_1way[obs_dist_x_1way>d1max_x][min(which(abs(d1(obs_dist_x_1way[obs_dist_x_1way>d1max_x]))<(1/min(n1,n2)) & abs(d2(obs_dist_x_1way[obs_dist_x_1way>d1max_x]))<(1/min(n1,n2))))]
if(is.na(rho1_hat)){
rho1_hat <- obs_dist_x_1way[obs_dist_x_1way>d1max_x][min(which(abs(d1(obs_dist_x_1way[obs_dist_x_1way>d1max_x]))<(1/min(n1,n2))))]
}
if(is.na(rho1_hat)){
rho1_hat <- 0
}
rm(list="obs_dist_x_1way")
dist_all_select <- dist_all > rho1_hat # rho1_hat_empirical_1way
nmatch_hat_1way <- min(sum(colSums(dist_all_select)>=1), sum(rowSums(dist_all_select)>=1)) # Empirical Bayes prior on P(M=1)
rm(list="dist_all_select")
prop_match_1way <- nmatch_hat_1way/(n1*n2)
rank_match_1 <- matchProbs_rank_full_C(dist_all, prop_match=prop_match_1way)
rownames(rank_match_1) <- ind1
colnames(rank_match_1) <- ind2
rank_match_2 <- t(matchProbs_rank_full_C(t(dist_all), prop_match=prop_match_1way))
rm(list="dist_all")
rownames(rank_match_2) <- ind1
colnames(rank_match_2) <- ind2
mode(rank_match_1) <- "numeric"
mode(rank_match_2) <- "numeric"
if(aggreg_2ways == "maxnorm"){
rank_match_add0_1 <- cbind(rank_match_1, 1-rowSums(rank_match_1))
probMax_1 <- apply(rank_match_add0_1, MARGIN=1, FUN=max)
rm(list="rank_match_add0_1")
rank_match_add0_2 <- rbind(rank_match_2, 1-colSums(rank_match_2))
probMax_2 <- apply(rank_match_add0_2, MARGIN=2, FUN=max)
rm(list="rank_match_add0_2")
}
probMatch_mat <- switch(aggreg_2ways,
maxnorm = rank_match_1*rank_match_2/matrix(probMax_1, ncol=1)%*%matrix(probMax_2, nrow=1),
min = pmin(rank_match_1, rank_match_2),
max = pmax(rank_match_1, rank_match_2),
mean = (rank_match_1+rank_match_2)/2,
prod = rank_match_1*rank_match_2
)
return(probMatch_mat)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.