dsBase: DataSHIELD server site base functions

Documented in blackBoxDS

#' @title Secure ranking of "V2BR" (vector to be ranked) across all sources
#' @description The first key serverside function that sets up the V2BR for
#' ranking in the client. 
#' @details Severside assign function called by ds.ranksSecure. 
#' Creates pseudo-data by using the real distribution of values in V2BR
#' to create a large number of synthetic data with a similar distribution to
#' the values in V2BR but with a slightly broader distribution at both ends
#' to ensure that any extreme values in the "combined real+pseudo data vector"
#' are all pseudo-data. Also ensures that the number of decimal places of
#' the values in the V2BR is reflected by the number of decimal places in
#' the pseudodata. Finally, takes the "combined real+pseudo data vector" through
#' seven rounds of rank consistent encryption that involves algorithms
#' themselves generated by a pseudorandom process that selects which
#' transformation to apply and with what parameters. The encryption algorithms
#' are the same in each study ensuring that ranks also remain consistent between
#' studies. After encryption the encrypted "combined real+pseudo data vector" is
#' written to the serverside as a dataframe also including other key component
#' vectors from the first stage of the ranking procedure.
#' For more details about the cluster of functions that
#' collectively enable secure global ranking and estimation of global quantiles
#' see the associated document entitled "secure.global.ranking.docx". Also
#' see the header file for ds.ranksSecure
#' @param input.var.name a character string specifying the name of V2BR. This
#' argument is set by the argument with the same name in the clientside function
#' ds.ranksSecure
#' @param shared.seedval a pseudorandom number seed that ensures that the
#' processes generating the order and parameterisation of the encryption
#' algorithms are the same in each study. This argument is set by the argument
#' <shared.seed.value> in the clientside function ds.ranksSecure. For more
#' details, including future plans to share this starting seed in a more secure
#' way, please see the associated document entitled "secure.global.ranking.docx"
#' and the header file for ds.ranksSecure.
#' @param synth.real.ratio an integer value representing
#' the ratio of synthetic (pseudo-data) values to the real number of 
#' values in V2BR. This argument is set by the argument with the same name
#' in the clientside function ds.ranksSecure. For more details, please see the
#' associated document entitled "secure.global.ranking.docx" and the header file
#' for ds.ranksSecure.
#' @param NA.manage character string indicating how missing values (NAs) in
#' V2BR should be managed. It takes three possible values: "NA.delete",
#' "NA.low","NA.hi". This argument is set by the argument with the same name
#' in the clientside function ds.ranksSecure. For more details, please see the
#' associated document entitled "secure.global.ranking.docx" and the header file
#' for ds.ranksSecure.
#' @return writes a data frame object entitled blackbox.output.df to the
#' serverside. In each study this contains the encrypted
#' "combined real+pseudo data vector" and a range of other key components
#' from the first stage of the ranking procedure. For more details see the
#' associated document entitled "secure.global.ranking.docx"
#' @author Paul Burton 9th November, 2021
#' @export
#'
blackBoxDS <- function(input.var.name=NULL, 
                       shared.seedval, synth.real.ratio, NA.manage){ #START FUNC
  
  #######################################################
  #MODULE 1: CAPTURE THE nfilter SETTINGS
  #thr<-dsBase::listDisclosureSettingsDS()
  #nfilter.tab <- as.numeric(thr$nfilter.tab)
  #nfilter.glm <- as.numeric(thr$nfilter.glm)
  #nfilter.subset <- as.numeric(thr$nfilter.subset)
  #nfilter.string <- as.numeric(thr$nfilter.string)
  #nfilter.stringShort <- as.numeric(thr$nfilter.stringShort)
  #nfilter.kNN <- as.numeric(thr$nfilter.kNN)
  #nfilter.noise <- as.numeric(thr$nfilter.noise)
  #nfilter.levels <- as.numeric(thr$nfilter.levels)
  ########################################################


 input.var <- eval(parse(text=input.var.name), envir = parent.frame())
 
 input.var.orig<-input.var

#manage NAs

if(NA.manage=="NA.delete"){
  input.var <- input.var[stats::complete.cases(input.var)]
  input.var.orig<-input.var
  }
 
 if(NA.manage=="NA.low"){
   input.var[is.na(input.var)]<-min.max.df[,1]
   input.var.orig<-input.var
 }

 if(NA.manage=="NA.hi"){
   input.var[is.na(input.var)]<-min.max.df[,2]
   input.var.orig<-input.var
 }

#ESTIMATED OVERALL MEAN AND SD FROM meanQuantileDS
#SAVED IN input.mean.sd.df BY ds.dmtC2S
max.sd.input.var<-input.mean.sd.df$max.sd.input.var
mean.input.var<-input.mean.sd.df$mean.input.var
 
numsubs.real<-length(input.var.orig)
numsubs.synth<-length(input.var.orig)*synth.real.ratio
numsubs<-numsubs.real+numsubs.synth

#Create indicators for original original sequence order and real/synthetic
ID.seq.real.synth.orig<-1:numsubs
is.real.synth.orig<-c(rep(1,numsubs.real),rep(0,numsubs.synth))

#Allow individual implementations of blackBoxDS for groups of projects to have a 
#unique but shared starting seed (given specified starting seed) but only
#the person setting up the function can know how that seed is perturbed relative
#to the specified seed

restart.seed.transformation.control.n<-37169
restart.seed.other.seed.actions<-67881

#Now set up for repeated transformations
set.seed(shared.seedval)    

#Create long fixed sequence of calls to seed to keep resetting seed at start
null.vector<-stats::runif(restart.seed.transformation.control.n,0,1)

#Create transformation control vectors/values first so all studies
#have the same transformation controls given input random seed 
#even if the lengths of later vectors (eg synthetic concealing data) are
#different between studies so would lead to inconsistent control
#vectors/values

 
control.vector<-sample(c(1,1,2,2,3,3),replace=FALSE)
#control.vector<-c(1,control.vector)
control.vector
  
  
control.value<-stats::runif(6,0.0001,1)
control.value

#Reset seed for other purposes
#Include study-specific component so seeds different in each study

#This code ensures you end up with a valid seed

study.specific.seed<-tryCatch(sum(abs(input.var.orig),na.rm=TRUE)/abs(stats::quantile(input.var.orig,0.25,na.rm=TRUE))+abs(stats::median(input.var.orig,na.rm=TRUE))+abs(stats::quantile(input.var.orig,0.75,na.rm=TRUE)))

if(is.na(study.specific.seed)||is.infinite(study.specific.seed))
{
  #specific seed to ensure reproducibility if tryCatch fails
  set.seed(671628)
  study.specific.seed<-stats::runif(1,1000,1000000)
}

if(study.specific.seed<1000){
  study.specific.seed<-study.specific.seed*1234
  if(study.specific.seed<1||study.specific.seed>5.0E08){
    study.specific.seed<-shared.seedval
  }
}

study.specific.seed<-round(study.specific.seed,0)


set.seed(shared.seedval+restart.seed.other.seed.actions+study.specific.seed)    


#Check that no attempt has been made to enter a fake value for max.sd.input.var
#via the clientside that is close to zero so that the random component of the
#sythetic data is always effectively zero

if(max.sd.input.var<(sqrt(stats::var(input.var, na.rm=TRUE)))/2){
  error.message<-
    paste0("FAILED: the estimated standard deviation being used to generate part of the random component of the synthetic pseudodata is considerably smaller than the actual standard deviation of the input variable (V2BR) suggesting that information may have been fed in from the clientside to create a modified value that is too small. As this increases disclosure risk the whole ranking analysis has been halted")
  stop(error.message, call. = FALSE)
}

#create synthetic data vector
input.var.synth.fixed<-sample(input.var.orig,numsubs.synth,replace=TRUE)
input.var.synth.random<-stats::rnorm(numsubs.synth,0,max.sd.input.var)
input.var.synth.random.rounded<-input.var.synth.random

input.var.synth<-input.var.synth.fixed+input.var.synth.random

#ARE THERE MULTIPLE TIES SUGGESTING LIMITED DECIMAL PLACES OR POWERS OF 10
#IF SO MODIFY input.var.synth ACCORDINGLY


  num.3dp<-sum(round(input.var.orig%%0.001,2)==0)
  prop.3dp<-num.3dp/length(input.var.orig)
  if(prop.3dp>=0.25){
  input.var.synth.random.rounded<-round(input.var.synth.random,3)
  input.var.synth<-input.var.synth.fixed+input.var.synth.random.rounded
  }

  num.2dp<-sum(round(input.var.orig%%0.01,2)==0)
  prop.2dp<-num.2dp/length(input.var.orig)
  if(prop.2dp>=0.25){
    input.var.synth.random.rounded<-round(input.var.synth.random,2)
    input.var.synth<-input.var.synth.fixed+input.var.synth.random.rounded
  }


  num.1dp<-sum(round(input.var.orig%%0.1,2)==0)
  prop.1dp<-num.1dp/length(input.var.orig)
  if(prop.1dp>=0.25){
    input.var.synth.random.rounded<-round(input.var.synth.random,1)
    input.var.synth<-input.var.synth.fixed+input.var.synth.random.rounded
  }
  
  num.integers<-sum(round(input.var.orig%%1,2)==0)
  prop.integers<-num.integers/length(input.var.orig)
  if(prop.integers>=0.25){
    input.var.synth.random.rounded<-round(input.var.synth.random,0)
    input.var.synth<-input.var.synth.fixed+input.var.synth.random.rounded
  }

  num.tens<-sum(round(input.var.orig%%10,2)==0)
  prop.tens<-num.tens/length(input.var.orig)
  if(prop.tens>=0.25){
    input.var.synth.random.rounded<-round(input.var.synth.random,-1)
    input.var.synth<-input.var.synth.fixed+input.var.synth.random.rounded
  }

  num.hundreds<-sum(round(input.var.orig%%100,2)==0)
  prop.hundreds<-num.hundreds/length(input.var.orig)
  if(prop.hundreds>=0.25){
    input.var.synth.random.rounded<-round(input.var.synth.random,-2)
    input.var.synth<-input.var.synth.fixed+input.var.synth.random.rounded
  }
 
  num.thousands<-sum(round(input.var.orig%%1000,2)==0)
  prop.thousands<-num.thousands/length(input.var.orig)
  if(prop.thousands>=0.25){
    input.var.synth.random.rounded<-round(input.var.synth.random,-3)
    input.var.synth<-input.var.synth.fixed+input.var.synth.random.rounded
  }


#Initialise input.var for analysis. Start by converting to values 0 to 1
#in same order as original input.var. Going to use probit function (pnorm) for this.
#This can take any values -inf to +inf but to avoid extreme value rounding errors
#scale input.var to normal 0 1

input.var.real.orig<-input.var.orig

#have to round this because otherwise eg 0.3 -> 0.2999999999999999999
input.var.real.synth.orig<-round(c(input.var.real.orig,input.var.synth),5)


#input.var.probit.temp<-((input.var.real.synth.orig-mean.input.var)/max.sd.input.var)
input.var.probit.temp<-(input.var.real.synth.orig-mean.input.var)/(max.sd.input.var)
input.var.probit<-stats::pnorm(input.var.probit.temp)



if(min(input.var.probit)<=0 | max(input.var.probit)>=1){
  error.message<-
    paste0("FAILED: initialised values should strictly be >0 and <1 this rule has been violated
           there is possiblyly an NA, inf or other error in the input.var.orig")
  stop(error.message, call. = FALSE)
} 


if(min(rank(input.var.real.synth.orig)-rank(input.var.probit))<0 | max(rank(input.var.real.synth.orig)-rank(input.var.probit))>0) {
  error.message<-
    paste0("FAILED: probit initialised values are not in an identical order to the original input variable please check")
  stop(error.message, call. = FALSE)
} 



intermediate.value.matrix<-matrix(NA,ncol=8,nrow=numsubs)

intermediate.value.matrix[,1]<-input.var.real.synth.orig
intermediate.value.matrix[,2]<-input.var.probit

utils::head(intermediate.value.matrix)

for(cv in 3:(length(control.vector)+2)){
  
  if(control.vector[cv-2]==1){
    intermediate.value.matrix[,cv]<-intermediate.value.matrix[,(cv-1)]^(control.value[cv-2])
  }
  if(control.vector[cv-2]==2){
    intermediate.value.matrix[,cv]<-intermediate.value.matrix[,(cv-1)]+(control.value[cv-2])
  }
  if(control.vector[cv-2]==3){
    intermediate.value.matrix[,cv]<-intermediate.value.matrix[,(cv-1)]*(control.value[cv-2])
  }
}

intermediate.value.matrix<-cbind(intermediate.value.matrix,ID.seq.real.synth.orig,is.real.synth.orig)
colnames(intermediate.value.matrix)<-c("input.var.real.synth.orig","input.var.probit",
                                       1:6,"ID.seq.real.synth.orig","is.real.synth.orig")



dim(intermediate.value.matrix)
utils::head(intermediate.value.matrix)
utils::tail(intermediate.value.matrix)


rank.intermediate.value.matrix<-matrix(NA,ncol=8,nrow=numsubs)



for(k in 1:8)
{
  rank.intermediate.value.matrix[,k]<-rank(intermediate.value.matrix[,k])
}

rank.intermediate.value.matrix<-cbind(rank.intermediate.value.matrix,ID.seq.real.synth.orig,is.real.synth.orig)

colnames(rank.intermediate.value.matrix)<-c("input.var.real.synth.orig","input.var.probit",
                                            1:6,"ID.seq.real.synth.orig","is.real.synth.orig")

utils::head(rank.intermediate.value.matrix)
utils::tail(rank.intermediate.value.matrix)


cat("\nRANKS IN ALL COLUMNS ABOVE SHOULD BE THE SAME\n")

control.vector
control.value


#CREATE blackBox OUTPUT DF

intermediate.value.df<-data.frame(intermediate.value.matrix)


encrypted.var<-intermediate.value.df$"X6"

ranks.input.var.real.synth.orig<-rank(intermediate.value.df$input.var.real.synth.orig)
ranks.encrypted.var<-rank(intermediate.value.df$"X6")


output.df<-data.frame(cbind(input.var.real.synth.orig,encrypted.var,ranks.input.var.real.synth.orig,ranks.encrypted.var,ID.seq.real.synth.orig,is.real.synth.orig))

utils::head(output.df)
utils::tail(output.df)



#Sort df by magnitude of key variable
ord.by.val<-order(input.var.real.synth.orig)

output.temp.sort.by.val<-output.df[ord.by.val,]


ID.by.val<-1:nrow(output.temp.sort.by.val)

output.df.sort.by.val<-data.frame(cbind(output.temp.sort.by.val,ID.by.val))


utils::head(output.df.sort.by.val)
utils::tail(output.df.sort.by.val)
dim(output.df.sort.by.val)



blackbox.output.df <- output.df.sort.by.val


if(sum(round(rank(blackbox.output.df[,3])-rank(blackbox.output.df[,4]),2)==0)!=numsubs)
{
  error.message<-
    paste0("FAILED: inconsistent values across different transformations in black box,
            try a different seed. Altenatively this could reflect modification of the
            clientside code which is not recommended. Finally, it can also occur
            if the R session on one or more of the opal data servers runs out
            of memory")
  stop(error.message, call. = FALSE)
}else{
  cat("\nPROCESSING SUCCESSFUL, ALL RANKS AGREE FOR ALL TRANSFORMATIONS\n\n")
}


return(blackbox.output.df)

} #END FUNCTION

#ASSIGN
# blackBoxDS