In difuture-lmu/dsBinVal: ROC and Calibration Analysis for DataSHIELD

# Console width used when rendering chunk output:
options(width = 80)

# Chunk defaults: merge source and output, prefix output lines with "#>" and
# write figures below Readme_files/.
chunk_defaults = list(
  collapse = TRUE,
  comment  = "#>",
  fig.path = "Readme_files/"
)
do.call(knitr::opts_chunk$set, chunk_defaults)

## Helper to determine OPAL version:
## Build the ascending grid of candidate version strings "major.minor.patch"
## (4.0.0, 4.0.1, ..., 10.9.9) that is searched by getOPALVersion().
v_major  = 4:10
v_minor1 = as.numeric(0:9)
v_minor2 = as.numeric(0:9)

# expand.grid() names its columns Var1 (major), Var2 (minor) and Var3 (patch).
versions = expand.grid(v_major, v_minor1, v_minor2)
od = with(versions, order(Var1, Var2, Var3))
versions = versions[od, ]
vstrings = with(versions, paste(Var1, Var2, Var3, sep = "."))

## Determine the version of an OPAL server by walking through a grid of
## candidate version strings.
##
## @param opal     Connection object passed on to
##                 `opalr::opal.version_compare()`.
## @param versions Character vector of candidate versions in ascending order.
## @return The element of `versions` for which
##         `opalr::opal.version_compare()` returns 0.
##
## An error is raised if no candidate matches: the server version is smaller
## than the first candidate, lies between two candidates, or is larger than
## the last one. The search is bounded by `length(versions)`; an unbounded
## search would index `versions[k]` past the end (yielding NA) and could
## never terminate on a match.
getOPALVersion = function(opal, versions) {
  if (length(versions) == 0L) stop("No candidate versions supplied.")

  for (k in seq_along(versions)) {
    ov = opalr::opal.version_compare(opal, versions[k])
    if (ov == 0) return(versions[k])
    if (ov < 0) {
      if (k == 1L) stop("Version is smaller than the smallest one from vector.")
      # The candidates are ascending, so the server version must lie strictly
      # between the previous and the current candidate.
      stop("Version lies between ", versions[k - 1L], " and ", versions[k],
        " and is not contained in the vector.")
    }
  }
  stop("Version is larger than the largest one from vector.")
}

# Client-side dependencies; anything missing is installed from the configured
# repositories plus the OBiBa CRAN mirror.
pkgs = c("here", "opalr", "DSI", "DSOpal", "dsBaseClient")
obiba_repos = c(getOption("repos"), "https://cran.obiba.org")
for (pkg in pkgs) {
  is_available = requireNamespace(pkg, quietly = TRUE)
  if (is_available) next
  install.packages(pkg, repos = obiba_repos)
}
# Install the package from the current source tree:
devtools::install(quiet = TRUE, upgrade = "always")
library(DSI)
library(DSOpal)
library(dsBaseClient)

## Install packages on the DataSHIELD test machine:
surl     = "https://opal-demo.obiba.org/"
username = "administrator"
password = "password"

opal = opalr::opal.login(username = username, password = password, url = surl)

## All work on the open session happens inside tryCatch() so that `finally`
## closes the session even if one of the steps aborts with an error.
tryCatch({
  opal_version = getOPALVersion(opal, vstrings)

  ## Install dsBinVal on the server from GitHub (branch "main"):
  check1 = opalr::dsadmin.install_github_package(opal = opal, pkg = "dsBinVal", username = "difuture-lmu", ref = "main")
  if (! check1)
    stop("[", Sys.time(), "] Was not able to install dsBinVal!")

  ## Publish the methods of the installed package:
  check2 = opalr::dsadmin.publish_package(opal = opal, pkg = "dsBinVal")
  if (! check2)
    stop("[", Sys.time(), "] Was not able to publish methods of dsBinVal!")
}, finally = opalr::opal.logout(opal))

# The example model is fitted locally on the CNSIM data sets, which have to be
# downloaded first from:
# https://github.com/datashield/DSLite/tree/master/data

# Disabled on purpose: run manually to re-create Readme_files/mod.rda.
if (FALSE) {
  dpath = "~/Downloads"
  dnames = file.path(dpath, paste0("CNSIM", seq_len(3), ".rda"))
  # Load one .rda file into the function's environment and return the loaded
  # object whose name contains "CNSIM".
  dLoader = function(n) {
    load(n)
    dn = grep("CNSIM", ls(), value = TRUE)
    return(get(dn))
  }
  # Stack the three data sets, drop incomplete rows and fit the model:
  parts = lapply(dnames, dLoader)
  CNSIM = na.omit(do.call(rbind, parts))
  mod = glm(DIS_DIAB ~ ., data = CNSIM, family = binomial())
  save(mod, file = here::here("Readme_files/mod.rda"))
}

ROC-GLM and Calibration for DataSHIELD

The package provides functionality to conduct and visualize ROC analysis and calibration on decentralized data. The basis is the DataSHIELD infrastructure for distributed computing. This package provides the calculation of the ROC-GLM with AUC confidence intervals as well as calibration curves and the Brier score. In order to calculate the ROC-GLM or assess calibration, it is necessary to push models to the servers and calculate their predictions there, which is also provided by this package. Note that DataSHIELD uses privacy filters from DataSHIELD v5 onwards, and these filters are also used in this package. Additionally, this package uses the old option datashield.privacyLevel (which indicates the minimal number of values required to allow sharing an aggregation) as a fallback. Instead of setting the option, we directly retrieve the fallback privacy level from the DESCRIPTION file each time a function calls for it. This option is set to 5 by default. The methodology of the package is explained in detail here.

Installation

At the moment, there is no CRAN version available. Install the development version from GitHub:

# Install the development version of dsBinVal from GitHub:
remotes::install_github("difuture-lmu/dsBinVal")

Register methods

It is necessary to register the assign and aggregate methods in the OPAL administration. These methods are registered automatically when publishing the package on OPAL (see DESCRIPTION).

Note that the package needs to be installed at both locations, the server and the analyst's machine.

Installation on DataSHIELD

There are two options. The first option is to use the Opal web interface:

Log into Opal and switch to the Administration/DataSHIELD/ tab
Click the Add DataSHIELD package button
Select GitHub as source, and use difuture-lmu as user, dsBinVal as name, and main as Git reference.

The second option is to use the opalr package to install dsBinVal directly from R:

### User credentials (here from the opal test server):
surl     = "https://opal-demo.obiba.org/"
username = "administrator"
password = "password"

### Install package and publish methods:
# Open a session on the Opal server:
opal = opalr::opal.login(username = username, password = password, url = surl)

# Install dsBinVal from the GitHub repository difuture-lmu/dsBinVal (ref
# "main") and publish the package afterwards:
opalr::dsadmin.install_github_package(opal = opal, pkg = "dsBinVal", username = "difuture-lmu", ref = "main")
opalr::dsadmin.publish_package(opal = opal, pkg = "dsBinVal")

# Close the session again:
opalr::opal.logout(opal)

Usage

A more sophisticated example is available here.

library(dsBinVal)

Log into DataSHIELD server

# Credentials of the Opal test server:
surl     = "https://opal-demo.obiba.org/"
username = "administrator"
password = "password"

# Register one login per CNSIM table: server "ds1" uses CNSIM.CNSIM1,
# "ds2" uses CNSIM.CNSIM2 and "ds3" uses CNSIM.CNSIM3.
builder = newDSLoginBuilder()
for (i in seq_len(3L)) {
  builder$append(
    server   = paste0("ds", i),
    url      = surl,
    user     = username,
    password = password,
    table    = paste0("CNSIM.CNSIM", i)
  )
}

# Log into all registered servers:
connections = datashield.login(logins = builder$build(), assign = TRUE)

Load test model, push to DataSHIELD, and calculate predictions

# Load the model fitted locally on CNSIM (restores the object `mod`):
load(here::here("Readme_files/mod.rda"))
# Model was calculated by:
#> glm(DIS_DIAB ~ ., data = CNSIM, family = binomial())

# Push the model to the DataSHIELD servers:
pushObject(connections, mod)

# Create a clean data set without NAs, stored on the servers as "D_complete":
ds.completeCases("D", newobj = "D_complete")

# Calculate scores and save at the servers under the name "pred".
# NOTE(review): `D` inside the prediction string is presumably a placeholder
# for the data set passed as "D_complete" - confirm against the
# predictModel() documentation.
pfun =  "predict(mod, newdata = D, type = 'response')"
predictModel(connections, mod, "pred", "D_complete", predict_fun = pfun)

# List the symbols that now exist on each server:
datashield.symbols(connections)

Calculate l2-sensitivity

# To calculate the ROC-GLM securely, we first have to assess the
# l2-sensitivity. Its value determines how the privacy parameters of
# differential privacy should be chosen:
l2s = dsL2Sens(connections, "D_complete", "pred")
l2s

# Based on the results presented in https://arxiv.org/abs/2203.10828, we set the privacy parameters to
# - epsilon = 0.2, delta = 0.1 if        l2s <= 0.01
# - epsilon = 0.3, delta = 0.4 if 0.01 < l2s <= 0.03
# - epsilon = 0.5, delta = 0.3 if 0.03 < l2s <= 0.05
# - epsilon = 0.5, delta = 0.5 if 0.05 < l2s <= 0.07
# - epsilon = 0.5, delta = 0.5 if 0.07 < l2s, BUT the results may not be good!

Calculate ROC-GLM

# The response must be encoded as integer/numeric vector; the converted
# response is stored on the servers as "truth":
ds.asInteger("D_complete$DIS_DIAB", "truth")
# Calculate the ROC-GLM from the server-side objects "truth" and "pred":
roc_glm = dsROCGLM(connections, truth_name = "truth", pred_name = "pred",
  dat_name = "D_complete", seed_object = "pred")
roc_glm

# Visualize the result:
plot(roc_glm)

Assess calibration

### Calculate the Brier score from the server-side objects "truth" and "pred":
dsBrierScore(connections, "truth", "pred")

### Calculate and plot calibration curve:
cc = dsCalibrationCurve(connections, "truth", "pred")
cc

plot(cc)

Deploy information:

Built by r Sys.info()[["login"]] (r Sys.info()[["sysname"]]) on r as.character(Sys.time()).

This readme is built automatically after each push to the repository and weekly on Monday. The automatic build installs the package on the DataSHIELD test server and is therefore a test of whether the functionality of the package works on DataSHIELD servers. Additionally, the functionality is tested using the GH Actions with tests/testthat/test_on_active_server.R. The system information of the local and remote machines is:

# Session information of the local machine:
ri_l  = sessionInfo()
# Information reported by each connected server via getDataSHIELDInfo():
ri_ds = datashield.aggregate(connections, quote(getDataSHIELDInfo()))
# Packages whose versions are reported for the client and for the servers:
client_pkgs = c("DSI", "DSOpal", "dsBaseClient", "dsBinVal")
remote_pkgs = c("dsBase", "resourcer", "dsBinVal")

Local machine:
- R version: r ri_l$R.version$version.string
- Version of DataSHIELD client packages:

# Table with the installed version of each client package. The versions are
# looked up per package with packageDescription() instead of scanning all
# libraries with installed.packages(), which the R documentation discourages
# for querying a small number of packages. A package that cannot be found is
# reported as NA (with a warning).
dfv = data.frame(
  Package = client_pkgs,
  Version = vapply(client_pkgs, function(pkg) {
    as.character(utils::packageDescription(pkg, fields = "Version"))
  }, character(1), USE.NAMES = FALSE)
)
knitr::kable(dfv)

Remote DataSHIELD machines:
- OPAL version of the test instance: r opal_version
- R version of r names(ri_ds)[1]: r ri_ds[[1]]$session$R.version$version.string
- R version of r names(ri_ds)[2]: r ri_ds[[2]]$session$R.version$version.string
- Version of server packages:

# One single-column table per server holding the "Version" entries of the
# remote packages; the column is labelled "<server>: Version".
version_cols = lapply(names(ri_ds), function(server) {
  vers = ri_ds[[server]]$pcks[remote_pkgs, "Version", drop = FALSE]
  colnames(vers) = paste0(server, ": ", colnames(vers))
  as.data.frame(vers)
})

# Put the servers side by side and turn the row names into a Package column:
dfv = do.call(cbind, version_cols)
dfv = cbind(Package = rownames(dfv), dfv)
rownames(dfv) = NULL
knitr::kable(dfv)