The following set of scripts was developed by a research team led by Prof. Justin Remais at the University of California, Berkeley School of Public Health for the purpose of matching records corresponding to the same individual across and within Chinese-language databases. It executes a full probabilistic record linkage routine on Chinese character data using various fields of identifying information.
We have included an accompanying high-level description of the steps involved in our record linkage approach. The code is still in active development: refinements and expansions of its capability and scalability are expected, and users are encouraged to customize it to their specific needs.
Please feel free to contact the authors directly with any questions or feedback.
NOTE: To execute the algorithms on the example dataset, at least 8 GB of RAM is recommended. Execution takes roughly 1 hour in total.
Philip Collender: pcollender@berkeley.edu | Charles Li: charlesli@berkeley.edu (WeChat ID: charlesli37) | Audrey Webb: awebb@berkeley.edu | Qu Cheng: qcheng@berkeley.edu (WeChat ID: canalcheng)
*Correspondence to Charles Li or Qu Cheng may be in Chinese.
Sys.setlocale(category = 'LC_ALL', locale = 'chs') #'chs' is a Windows locale name; on Unix-like systems, try 'zh_CN.UTF-8'
set.seed(13597)
devtools::install_github('OPTI-SURVEIL/chinsimi') #functions for converting Chinese characters to pinyin and four-corner codes (FCC), and for string similarity calculations
devtools::install_github('OPTI-SURVEIL/corlink') #patch of an existing record linkage package, adding imputation of missing agreement patterns and dependence between fields
library(ChinSimi)
library(corlink)

reqpackages = c('stringdist','foreach','doParallel','parallel','readr','Matrix','tidyverse')
toinstall = !(reqpackages %in% installed.packages())
if(any(toinstall)) install.packages(pkgs = reqpackages[toinstall])

library(stringdist)
library(foreach)
library(doParallel)
library(parallel) #parallelization is recommended to speed up massive calculations, though its use is limited in the vignette presented here
library(readr)
library(Matrix)
library(tidyverse)

nc = detectCores() - 1
#dfA <- read.csv('', stringsAsFactors = FALSE) # '' = name of your file or file path
#dfB <- read.csv('', stringsAsFactors = FALSE) #optional second file
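If your source files are not UTF-8 encoded (GB18030 and GBK are common encodings for Chinese data), the encoding can be declared at read time. A minimal sketch using readr, which is loaded above; the file name and encoding here are assumptions to adjust for your own data:

#dfA <- read_csv('dfA.csv', locale = locale(encoding = 'GB18030')) #file name and encoding are illustrative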
names = read_csv('names.csv')
S1 = data.frame(names[,1]); names(S1) = 'name'
S2 = data.frame(names[,2]); names(S2) = 'name'

yobs = floor(rlnorm(nrow(S1), 7.59, .005))
mobs = round(runif(nrow(S1), .5, 12.5))
dobs = round(runif(nrow(S1), .5, 31.5))
sex = rbinom(nrow(S1), 1, .5)
S1$sex = sex; S1$yob = yobs; S1$mob = mobs; S1$dob = dobs
S2$sex = sex; S2$yob = yobs; S2$mob = mobs; S2$dob = dobs

#introduce 1% error rates into date of birth and sex in S2:
err = rbinom(nrow(S1),1,.01); S2$sex[which(err==1)] = 1 - S2$sex[which(err==1)]
err = rbinom(nrow(S1),1,.01); S2$yob[which(err==1)] = S2$yob[which(err==1)] + round(rnorm(sum(err), 0, 2.5))
err = rbinom(nrow(S1),1,.01); S2$mob[which(err==1)] = S2$mob[which(err==1)] + round(rnorm(sum(err), 0, 1))
err = rbinom(nrow(S1),1,.01); S2$dob[which(err==1)] = S2$dob[which(err==1)] + round(rnorm(sum(err), 0, 5))

#set invalid values to missing
S2$mob[!(S2$mob %in% 1:12)] <- NA
S2$dob[!(S2$dob %in% 1:31)] <- NA

#uncomment these lines to store copies of the generated datasets
#write_csv(S1,'dfA.csv')
#write_csv(S2,'dfB.csv')
clean_list = list('[A-Za-z]', ' ', '["/?]') #patterns to strip: Latin letters, spaces, and stray punctuation
for(i in clean_list){
  S1$name <- gsub(i, '', S1$name)
  S2$name <- gsub(i, '', S2$name)
}
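As a toy illustration (the string below is invented, not from the example data), the same loop strips Latin letters, spaces, and stray punctuation while leaving Chinese characters intact:

x <- '张 San/"?'
for(i in clean_list) x <- gsub(i, '', x)
x # "张"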
sus_names1 <- S1[nchar(S1$name) >= 6, ]
sus_indices1 <- as.numeric(rownames(sus_names1))
sus_names2 <- S2[nchar(S2$name) >= 6, ]
sus_indices2 <- as.numeric(rownames(sus_names2))
#in this example, no names have more than 5 characters
par_names1 <- S1[grepl('\\(',S1$name) & grepl('\\)',S1$name), ]
par_indices1 <- as.numeric(rownames(par_names1))
par_names2 <- S2[grepl('\\(',S2$name) & grepl('\\)',S2$name), ]
par_indices2 <- as.numeric(rownames(par_names2))

#Examine one name with parentheses
par_names1

action1 = cbind(par_indices1, 0)
action1[,2] = 1 #let 1 indicate 'clean' (keep contents, drop parentheses) and 2 indicate 'delete' (drop parentheses and their contents)
#The parentheses appear unnecessary here, so we remove them
#We recommend creating dummy indicator vectors for ambiguity, unnecessary parentheses, or unnecessary information inside parentheses, and treating each index differently according to the indicators
clean_list = c(clean_list,'\\(','\\)')
for(i in 1:length(par_indices1)){
  if(action1[i,2] == 1) S1$name[action1[i,1]] <- gsub('\\(|\\)','',S1$name[action1[i,1]])
  if(action1[i,2] == 2){
    stinds = grep('\\(',unlist(strsplit(S1$name[action1[i,1]],'')))
    endinds = grep('\\)',unlist(strsplit(S1$name[action1[i,1]],'')))
    delinds = unlist(lapply(1:length(endinds),function(n) stinds[n]:endinds[n]))
    S1$name[action1[i,1]] = paste0(unlist(strsplit(S1$name[action1[i,1]],''))[-delinds], collapse='')
  }
}
#dfA$DOB <- as.Date(dfA$DOB, '%m/%d/%Y')
Then store each subfield separately:
#dfA$YOB <- as.numeric(format(dfA$DOB, '%Y')) #year of birth (YOB)
#dfA$MOB <- as.numeric(format(dfA$DOB, '%m')) #month of birth (MOB)
#dfA$dOB <- as.numeric(format(dfA$DOB, '%d')) #day of birth (dOB)
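A self-contained illustration of the same template, using an invented date string:

dob <- as.Date('7/24/1988', '%m/%d/%Y')
as.numeric(format(dob, '%Y')) # 1988
as.numeric(format(dob, '%m')) # 7
as.numeric(format(dob, '%d')) # 24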
S1$lname <- substr(S1$name, 1, 1)
S1$fname <- substr(S1$name, 2, 1000000L)
S1[S1[,]==''] <- NA #label missing data
S1$name <- NULL #erase old column

S2$lname <- substr(S2$name, 1, 1)
S2$fname <- substr(S2$name, 2, 1000000L)
S2[S2[,]==''] <- NA
S2$name <- NULL
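Note that splitting on the first character assumes single-character surnames. Chinese compound surnames (e.g., 欧阳, 司马) occupy two characters; a hedged sketch of how they could be handled before the name column is deleted, assuming a user-supplied lookup vector (the vector below is illustrative, not exhaustive):

#compound_surnames <- c('欧阳','司马','诸葛') #illustrative only; extend for your data
#is_compound <- substr(S1$name, 1, 2) %in% compound_surnames
#S1$lname <- ifelse(is_compound, substr(S1$name, 1, 2), substr(S1$name, 1, 1))
#S1$fname <- ifelse(is_compound, substr(S1$name, 3, 1000000L), substr(S1$name, 2, 1000000L))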
S1$lname.py <- ChStr2py(S1$lname, method = 'tone', multi = TRUE, sep = '')
S1$fname.py <- ChStr2py(S1$fname, method = 'tone', multi = TRUE, sep = '')
S1$lname.fc <- ChStr2fc(S1$lname, sep = '')
S1$fname.fc <- ChStr2fc(S1$fname, sep = '')

S2$lname.py <- ChStr2py(S2$lname, method = 'tone', multi = TRUE, sep = '')
S2$fname.py <- ChStr2py(S2$fname, method = 'tone', multi = TRUE, sep = '')
S2$lname.fc <- ChStr2fc(S2$lname, sep = '')
S2$fname.fc <- ChStr2fc(S2$fname, sep = '')
sub1 = S1[1:100,]
sub2 = S2[1:100,]
cartesian12 = merge(sub1, sub2, by = NULL) #full cross of the two subsets
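Each record pair is a candidate match, so the number of pairs grows as the product of the two file sizes; this is why the full datasets are compared below via hash tables of unique values rather than by materializing the full cross. A quick check on the subset:

nrow(cartesian12) == nrow(sub1) * nrow(sub2) # TRUE: 100 x 100 = 10,000 candidate pairs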
cols_to_link = list('sex','yob','mob','dob','lname.','fname.') #note that 'lname.' and 'fname.' will select both the .fc and .py columns in subsequent code

u_vals = lapply(cols_to_link, function(c){ #store unique nonmissing values for each variable in a named list
  col1 = grep(c, colnames(S1))
  col2 = grep(c, colnames(S2))
  list(na.omit(unique(S1[,col1])), na.omit(unique(S2[,col2])))
})
names(u_vals) = cols_to_link

u_inds = new.env() #store row indices corresponding to each unique value in a hash table
for(c in cols_to_link){
  col1 = grep(c, colnames(S1))
  col2 = grep(c, colnames(S2))
  if(length(col1)>1){
    allvals = unique(rbind(na.omit(unique(S1[,col1])), na.omit(unique(S2[,col2]))))
    u_inds[[c]] = apply(allvals, 1, function(vals){
      inds1 = S1[,col1[1]] == vals[1]
      inds2 = S2[,col2[1]] == vals[1]
      for(i in 2:length(vals)){
        inds1 = inds1 * (S1[,col1[i]] == vals[i])
        inds2 = inds2 * (S2[,col2[i]] == vals[i])
      }
      list(which(inds1==1), which(inds2==1))
    })
    names(u_inds[[c]]) = apply(allvals, 1, paste0, collapse='|')
  }else{
    allvals = na.omit(unique(unlist(c(S1[,col1],S2[,col2]))))
    u_inds[[c]] = lapply(allvals, function(val){
      list(which(S1[,col1] == val), which(S2[,col2] == val))
    })
    names(u_inds[[c]]) = allvals
  }
}

m_inds = lapply(cols_to_link, function(c){ #store missing indices for each variable in a named list
  col1 = grep(c, colnames(S1)) #selects both lname.fc and lname.py, for example
  col2 = grep(c, colnames(S2))
  inds1 = is.na(S1[,col1[1]])
  inds2 = is.na(S2[,col2[1]])
  if(length(col1)>1) for(i in 2:length(col1)){
    inds1 = pmax(inds1, is.na(S1[col1[i]]))
    inds2 = pmax(inds2, is.na(S2[col2[i]]))
  }
  list(which(inds1==1), which(inds2==1))
})
names(m_inds) = cols_to_link
m_inds = list2env(m_inds)
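To see what the hash table stores, look up a single value; for example, the first observed year of birth (the particular value depends on the simulated data):

ex_val <- names(u_inds[['yob']])[1] #first observed year of birth
u_inds[['yob']][[ex_val]] #list of two vectors: matching row indices in S1, then in S2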
sim_vars = c('lname.','fname.')
sim_fun = rep('sim_ld', length(sim_vars)) #Levenshtein (edit) similarity with functionality to handle characters with more than one pronunciation
sims = new.env() #store a hash table of unique similarity values, each with the indices of the unique values in u_vals used to generate them

#st = Sys.time()
cl = makeCluster(nc) #initialize parallel backend
registerDoParallel(cl)
for(v in 1:length(sim_vars)){
  c = sim_vars[v]
  f = match.fun(sim_fun[v])
  u1 = u_vals[[c]][[1]]
  u2 = u_vals[[c]][[2]]
  d1 = max(c(length(u1), nrow(u1)))
  d2 = max(c(length(u2), nrow(u2)))
  slices.1 = lapply(seq_len(d1 %/% 1000 + 1), function(i) (1000*i-999) : pmin(i*1000, d1))
  slices.2 = lapply(seq_len(d2 %/% 1000 + 1), function(i) (1000*i-999) : pmin(i*1000, d2))
  for(i in 1:length(slices.1)){
    for(j in 1:length(slices.2)){
      temp = expand.grid(slices.1[[i]], slices.2[[j]])
      chunksize = ceiling(nrow(temp)/nc)
      chunks = lapply(seq_len(nc), function(k) (chunksize*k-chunksize + 1) : pmin(k*chunksize, nrow(temp)))
      if(class(u_vals[[c]][[1]]) %in% c('data.frame','matrix')){
        sim = foreach(k = 1:nc, .combine = 'c') %dopar% {
          library(stringdist)
          f(u1[temp[chunks[[k]],1],], u2[temp[chunks[[k]],2],])
        }
      }else{
        sim = foreach(k = 1:nc, .combine = 'c') %dopar% {
          library(stringdist)
          f(u1[temp[chunks[[k]],1]], u2[temp[chunks[[k]],2]])
        }
      }
      usims = unique(sim)
      chunksize = ceiling(length(usims)/nc)
      chunks = lapply(seq_len(nc), function(k) (chunksize*k-chunksize + 1) : pmin(k*chunksize, length(usims)))
      sims[[c]][[length(slices.2) * (i-1) + j]] = foreach(k = 1:nc, .combine='c') %dopar% {
        sims_ = list()
        for(s in usims[chunks[[k]]]) sims_[[sprintf("%0.17f",s)]] <- temp[sim == s, ]
        sims_
      }
      cat(((i-1)*length(slices.2) + j)/(length(slices.1)*length(slices.2)) * 100, '% complete for variable', c, '\n')
    }
  }
  usims = unique(do.call('c', lapply(sims[[c]], names)))
  sims[[c]] = lapply(usims, function(s){
    do.call(rbind, lapply(sims[[c]], '[[', s))
  })
  names(sims[[c]]) = usims
}
#en = Sys.time()
stopCluster(cl)
rm(list = c('temp','u1','u2','d1','d2','usims','sim','slices.1','slices.2','chunks','chunksize'))
gc()
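Each element of sims[['lname.']] is keyed by a similarity value (formatted to 17 decimal places, as in the sprintf call above) and stores the index pairs of unique values that achieve it:

head(names(sims[['lname.']])) #observed last-name similarity values
dim(sims[['lname.']][[1]]) #index pairs of unique values achieving the first similarity value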
#save(list = ls(all.names = TRUE), file = 'linkage_1.RData')
#load('linkage_1.RData')
thr = list('fname.' = 0.75, 'lname.' = 0.85)

SpMats = vector('list', length = length(cols_to_link)) #list of sparse matrices, each storing the agreement of all record pairs on one field (e.g., the agreement of S1[4,] and S2[22,] on sex is stored at index [4,22] of the 'sex' matrix)
names(SpMats) = cols_to_link
for(c in cols_to_link){
  if(c %in% sim_vars){
    todo = which(as.numeric(names(sims[[c]])) >= thr[[c]])
    todo = do.call(rbind, sims[[c]][todo])
    uvalnames = lapply(u_vals[[c]], apply, 1, paste0, collapse='|')
    temp = do.call(rbind, apply(todo, 1, function(v){
      ID1 = uvalnames[[1]][v[1]]
      ID2 = uvalnames[[2]][v[2]]
      expand.grid(u_inds[[c]][[ID1]][[1]], u_inds[[c]][[ID2]][[2]])
    }))
  }else{
    temp = do.call(rbind, lapply(u_inds[[c]], expand.grid)) #store coordinates of every exact match
  }
  SpMats[[c]] = sparseMatrix(i = temp[,1], j = temp[,2], x = rep(1, nrow(temp)), dims = c(nrow(S1), nrow(S2)))
}
rm(list = c('temp','uvalnames'))
gc()
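A quick check of the storage convention described above (sex has no missing values in this simulated example, so agreement is simply equality):

SpMats[['sex']][4, 22] == as.numeric(S1$sex[4] == S2$sex[22]) # TRUE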
atleasttwocols = function(v){
  matches = list()
  tmatches = list()
  for(i in 1:(length(v)-1)){
    matches[[i]] = which(SpMats[[v[i]]] == 1)
    tmatches[[i]] = list()
    for(j in (i+1):length(v)){
      tmatches[[i]][[j-i]] = matches[[i]][which(SpMats[[v[j]]][matches[[i]]]==1)]
    }
    matches[[i]] = unique(unlist(tmatches[[i]]))
  }
  unique(unlist(matches))
}

suff_cols = c('fname.','lname.') #fields for which a match qualifies the associated indices to be stored
part_cols = c('sex','yob','mob','dob') #a match on at least two of these fields is required to store the associated indices
todo = unique(unlist(lapply(suff_cols, function(c) which(SpMats[[c]] == 1)))) #vector of linear indices at which to find and store agreement patterns
todo = unique(c(todo, atleasttwocols(part_cols)))

starts = seq(0, length(todo), 1e6)
ends = unique(c(starts[-1] - 1, length(todo)))
slices = lapply(1:length(starts), function(i) starts[i]:ends[i])

cl = makeCluster(nc) #initialize parallel backend
registerDoParallel(cl)
patlist = foreach(i = 1:length(slices)) %dopar% {
  library(Matrix)
  inds = todo[slices[[i]]]
  i_s = (inds - 1) %% nrow(S1) + 1 #row (S1) and column (S2) indices recovered from column-major linear indices
  j_s = (inds - 1) %/% nrow(S1) + 1
  pat = matrix(nrow = length(inds), ncol = length(cols_to_link))
  for(ci in seq_along(cols_to_link)){
    c = cols_to_link[[ci]] #index by field name, so the m_inds environment can be subscripted
    pat[,ci] = SpMats[[c]][inds]
    pat[which(i_s %in% m_inds[[c]][[1]] | j_s %in% m_inds[[c]][[2]]), ci] = 999
  }
  colnames(pat) = unlist(cols_to_link)
  freqtab = plyr::count(pat)
  freqtab
}
stopCluster(cl)

mastertable = do.call(rbind, patlist) #contingency table of the frequency of each observed agreement pattern
colnames(mastertable) = c(unlist(cols_to_link), 'counts')
mastertable = mastertable %>% group_by(sex,yob,mob,dob,lname.,fname.) %>% summarise(counts = sum(counts))
mastertable[mastertable==999] = NA

rm(list = c('patlist','slices','todo'))
#rm('SpMats') #typically we would remove these large sparse matrices here, but we use them later to check model fit, since we know which indices correspond to true matches
gc()
ifit = linkd(mastertable, alg = 'i')
mfit = linkd(mastertable, alg = 'm')
bfit = linkd(mastertable, alg = 'b')
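The three calls differ in their dependence assumptions among linkage fields: 'i' fits an independence model, 'm' allows interactions among fields in the matching category, and 'b' allows interactions in both categories (this labeling follows the plot titles below). To inspect the components of a fit used in the rest of this vignette:

str(ifit, max.level = 1) #top-level components, including fitted_models and fitted_probs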
summary(bfit$fitted_models$model_match)
summary(mfit$fitted_models$model_match)
summary(bfit$fitted_models$model_mismatch)
corplot(fit = ifit, title = 'independence')
corplot(fit = mfit, title = 'match interactions')
corplot(fit = bfit, title = 'all interactions')
ifit$fitted_models$MRC
mfit$fitted_models$MRC
bfit$fitted_models$MRC
matchpats = do.call(cbind, lapply(SpMats, diag)) #agreement patterns of the true matches (record i in S1 matches record i in S2 by construction)
matchpats = plyr::count(matchpats)
matchfreqs = matchpats$freq

allpats = mastertable
allpats[is.na(allpats)] = 999
matchpats = apply(matchpats[,1:length(cols_to_link)], 1, paste0, collapse='')
allpats_ = apply(allpats[,1:length(cols_to_link)], 1, paste0, collapse='')
allcounts = mastertable$counts

trueprobs = allcounts * 0 #true probability that a pair displaying each agreement pattern is a match
for(i in 1:length(matchpats)){
  p = matchpats[i]
  i2 = which(allpats_ == p)
  trueprobs[i2] = matchfreqs[i] / allcounts[i2]
}

iprobs = ifit$fitted_probs$fitted_prob_match
mprobs = mfit$fitted_probs$fitted_prob_match
bprobs = bfit$fitted_probs$fitted_prob_match

df = data.frame(trueprobs, iprobs, mprobs, bprobs, counts = mastertable$counts)
ir2 = summary(lm(trueprobs ~ iprobs))$r.squared
mr2 = summary(lm(trueprobs ~ mprobs))$r.squared
br2 = summary(lm(trueprobs ~ bprobs))$r.squared
ggplot(df) +
  stat_smooth(aes(x = iprobs, y = trueprobs), formula = y ~ x, method = 'lm', col = 'black', alpha = 0.5) +
  geom_point(aes(x = iprobs, y = trueprobs, size = log(counts)), col = 'black', alpha = 0.5) +
  ggtitle(paste('Independence \n R^2 =', ir2))

ggplot(df) +
  stat_smooth(aes(x = mprobs, y = trueprobs), formula = y ~ x, method = 'lm', col = 'navy', alpha = 0.5) +
  geom_point(aes(x = mprobs, y = trueprobs, size = log(counts)), col = 'navy', alpha = 0.5) +
  ggtitle(paste('Interactions in Matching Category \n R^2 =', mr2))

ggplot(df) +
  stat_smooth(aes(x = bprobs, y = trueprobs), formula = y ~ x, method = 'lm', col = 'blue', alpha = 0.5) +
  geom_point(aes(x = bprobs, y = trueprobs, size = log(counts)), col = 'blue', alpha = 0.5) +
  ggtitle(paste('Interactions in both Categories \n R^2 =', br2))
p_FP = 0.01 #tolerated proportion of false positives among declared links
p_FN = 0.05 #tolerated proportion of true matches lost to the declared-nonlink set
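The same threshold search is repeated verbatim for each of the three fits below; as a convenience, the logic can be wrapped in a helper function (a sketch mirroring the inline code, not part of the original scripts):

#Sketch: threshold selection mirroring the inline code below.
#'fit' is one of ifit/mfit/bfit; p_FP and p_FN as set above.
get_thresholds <- function(fit, trueprobs, p_FP, p_FN){
  ft = data.frame(cbind(fit$fitted_probs, trueprobs)) %>% arrange(desc(fitted_prob_match))
  propFP = cumsum(ft$counts * (1 - ft$fitted_prob_match)) / cumsum(ft$counts)
  thr.high = ft$fitted_prob_match[sum(propFP <= p_FP)]
  ft = ft %>% arrange(fitted_prob_match)
  propFN = cumsum(ft$counts * ft$fitted_prob_match) / sum(ft$counts * ft$fitted_prob_match)
  thr.low = max(c(min(ft$fitted_prob_match), ft$fitted_prob_match[sum(propFN <= p_FN)]))
  c(low = thr.low, high = thr.high)
}
#e.g., get_thresholds(ifit, trueprobs, p_FP, p_FN)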
freqtable = data.frame(cbind(ifit$fitted_probs, trueprobs) %>% arrange(desc(fitted_prob_match))) #sort agreement patterns in descending order of match probability
propFP = cumsum(freqtable$counts * (1 - freqtable$fitted_prob_match)) / cumsum(freqtable$counts) #proportion of predicted nonmatches among declared links at each possible upper threshold value
thr.high = freqtable$fitted_prob_match[sum(propFP <= p_FP)]

freqtable = freqtable %>% arrange(fitted_prob_match) #re-sort in ascending order of match probability
propFN = cumsum(freqtable$counts * freqtable$fitted_prob_match) / sum(freqtable$counts * freqtable$fitted_prob_match) #predicted proportion of total matches excluded at each possible lower threshold value
thr.low = max(c(min(freqtable$fitted_prob_match), freqtable$fitted_prob_match[sum(propFN <= p_FN)]))

cat('# declared nonlinks by independence model is', sum(freqtable$counts[freqtable$fitted_prob_match <= thr.low]), '\n')
cat('# declared links by independence model is', sum(freqtable$counts[freqtable$fitted_prob_match >= thr.high]), '\n')
cat('# record pairs for clerical review under independence model is', sum(freqtable$counts[freqtable$fitted_prob_match < thr.high & freqtable$fitted_prob_match > thr.low]), '\n')
cat('# of declared links that are correct under independence model is',
    sum(freqtable$counts[freqtable$fitted_prob_match >= thr.high] * freqtable$trueprobs[freqtable$fitted_prob_match >= thr.high]), '/',
    sum(freqtable$counts[freqtable$fitted_prob_match >= thr.high]), ' = ',
    sum(freqtable$counts[freqtable$fitted_prob_match >= thr.high] * freqtable$trueprobs[freqtable$fitted_prob_match >= thr.high]) / sum(freqtable$counts[freqtable$fitted_prob_match >= thr.high]), '\n')
freqtable = data.frame(cbind(mfit$fitted_probs, trueprobs) %>% arrange(desc(fitted_prob_match))) #sort agreement patterns in descending order of match probability
propFP = cumsum(freqtable$counts * (1 - freqtable$fitted_prob_match)) / cumsum(freqtable$counts) #proportion of predicted nonmatches among declared links at each possible upper threshold value
thr.high = freqtable$fitted_prob_match[sum(propFP <= p_FP)]

freqtable = freqtable %>% arrange(fitted_prob_match) #re-sort in ascending order of match probability
propFN = cumsum(freqtable$counts * freqtable$fitted_prob_match) / sum(freqtable$counts * freqtable$fitted_prob_match) #predicted proportion of total matches excluded at each possible lower threshold value
thr.low = max(c(min(freqtable$fitted_prob_match), freqtable$fitted_prob_match[sum(propFN <= p_FN)]))

cat('# declared nonlinks by match interaction model is', sum(freqtable$counts[freqtable$fitted_prob_match <= thr.low]), '\n')
cat('# declared links by match interaction model is', sum(freqtable$counts[freqtable$fitted_prob_match >= thr.high]), '\n')
cat('# record pairs for clerical review under match interaction model is', sum(freqtable$counts[freqtable$fitted_prob_match < thr.high & freqtable$fitted_prob_match > thr.low]), '\n')
cat('# of declared links that are correct under match interaction model is',
    sum(freqtable$counts[freqtable$fitted_prob_match >= thr.high] * freqtable$trueprobs[freqtable$fitted_prob_match >= thr.high]), '/',
    sum(freqtable$counts[freqtable$fitted_prob_match >= thr.high]), ' = ',
    sum(freqtable$counts[freqtable$fitted_prob_match >= thr.high] * freqtable$trueprobs[freqtable$fitted_prob_match >= thr.high]) / sum(freqtable$counts[freqtable$fitted_prob_match >= thr.high]), '\n')
freqtable = data.frame(cbind(bfit$fitted_probs, trueprobs) %>% arrange(desc(fitted_prob_match))) #sort agreement patterns in descending order of match probability
propFP = cumsum(freqtable$counts * (1 - freqtable$fitted_prob_match)) / cumsum(freqtable$counts) #proportion of predicted nonmatches among declared links at each possible upper threshold value
thr.high = freqtable$fitted_prob_match[sum(propFP <= p_FP)]

freqtable = freqtable %>% arrange(fitted_prob_match) #re-sort in ascending order of match probability
propFN = cumsum(freqtable$counts * freqtable$fitted_prob_match) / sum(freqtable$counts * freqtable$fitted_prob_match) #predicted proportion of total matches excluded at each possible lower threshold value
thr.low = max(c(min(freqtable$fitted_prob_match), freqtable$fitted_prob_match[sum(propFN <= p_FN)]))

cat('# declared nonlinks by full interaction model is', sum(freqtable$counts[freqtable$fitted_prob_match <= thr.low]), '\n')
cat('# declared links by full interaction model is', sum(freqtable$counts[freqtable$fitted_prob_match >= thr.high]), '\n')
cat('# record pairs for clerical review under full interaction model is', sum(freqtable$counts[freqtable$fitted_prob_match < thr.high & freqtable$fitted_prob_match > thr.low]), '\n')
cat('# of declared links that are correct under full interaction model is',
    sum(freqtable$counts[freqtable$fitted_prob_match >= thr.high] * freqtable$trueprobs[freqtable$fitted_prob_match >= thr.high]), '/',
    sum(freqtable$counts[freqtable$fitted_prob_match >= thr.high]), ' = ',
    sum(freqtable$counts[freqtable$fitted_prob_match >= thr.high] * freqtable$trueprobs[freqtable$fitted_prob_match >= thr.high]) / sum(freqtable$counts[freqtable$fitted_prob_match >= thr.high]), '\n')
#retrieving matching indices
savepats = subset(freqtable, fitted_prob_match >= thr.high)

cl = makeCluster(nc) #initialize parallel backend
registerDoParallel(cl)
link_inds = foreach(i = 1:nrow(savepats), .combine = rbind) %dopar% {
  library(Matrix)
  pat = unlist(savepats[i, 1:length(cols_to_link)])
  agrcols = names(pat)[which(pat == 1)] #fields agreeing under this pattern (field names allow lookup in both SpMats and the m_inds environment)
  discols = names(pat)[which(pat == 0)] #fields disagreeing under this pattern
  miscols = names(pat)[which(is.na(pat))] #fields missing under this pattern
  nums = sapply(agrcols, function(c) sum(SpMats[[c]]))
  c = agrcols[which.min(nums)] #start with the narrowest set
  inds = which(SpMats[[c]] == 1)
  for(c in agrcols[-which.min(nums)]){
    inds = inds[which(SpMats[[c]][inds] == 1)]
  }
  for(c in miscols){
    i_s = (inds - 1) %% nrow(S1) + 1
    j_s = (inds - 1) %/% nrow(S1) + 1
    inds = inds[which(i_s %in% m_inds[[c]][[1]] | j_s %in% m_inds[[c]][[2]])]
  }
  for(c in discols){
    i_s = (inds - 1) %% nrow(S1) + 1
    j_s = (inds - 1) %/% nrow(S1) + 1
    inds = inds[which(SpMats[[c]][inds] == 0 & !(i_s %in% m_inds[[c]][[1]]) & !(j_s %in% m_inds[[c]][[2]]))] #disagreement requires both values to be observed
  }
  i_s = (inds - 1) %% nrow(S1) + 1
  j_s = (inds - 1) %/% nrow(S1) + 1
  cbind(i_s, j_s, savepats[i, 'fitted_prob_match'])
}
stopCluster(cl)

linked_db = cbind(S1[link_inds[,1], ], S2[link_inds[,2], ])
linked_db$prob = link_inds[,3]
colnames(linked_db)[1:ncol(S1)] = paste0('S1.', colnames(S1))
colnames(linked_db)[(ncol(S1)+1):(ncol(S1)+ncol(S2))] = paste0('S2.', colnames(S2))
View(linked_db)
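The linked pairs can then be exported for downstream analysis (the file name below is illustrative):

#write_csv(linked_db, 'linked_pairs.csv')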