R/blackBoxRanksDS.R

Defines functions blackBoxRanksDS

Documented in blackBoxRanksDS

#' @title Secure ranking of "V2BR" (vector to be ranked) across all sources
#' @description The second key serverside function that prepares the global
#' ranks of the the real data only generated in the first stage of the
#' ranking procedure and encrypts them in preparation for generating global
#' ranks that correspond 1 to 1 with only the real data in V2BR. 
#' @details Severside assign function called by ds.ranksSecure. It takes the
#' global ranks currently held in sR5.df which reflect the global ranks based on
#' the "combined real+pseudo data vector" as encrypted by blackBoxDS 
#' but with all pseudo-data stripped out. It then uses these global ranks (of
#' the real data) as if they were a new variable to be ranked. This is then
#' equivalent to blackBoxDS with the primary difference that no
#' pseudo-data are needed. This is because the global ranks are fundamentally
#' non-disclosive and so can be transferred to the clientside with no risk of
#' disclosure. However, in order to ensure that the client cannot compare
#' the list of global.ranks in sR4.df (after initial global ranking based on
#' ranking of real and pseudo-data combined) with the global.ranks to be
#' generated by blackBoxRanksDS (based solely on the real data they are 
#' processed through seven more rounds of encryption as before in blackBoxDS.
#' In consequence the client remains unable to determine which of the original
#' global ranks corresponded to real data and which to pseudo-data. In
#' addition, blackBoxRanksDS does not need to determine the number of decimal
#' places in the data because it is only applied to ranks which are assumed to
#' be integers. For more details about the cluster of functions that
#' collectively enable secure global ranking and estimation of global quantiles
#' see the associated document entitled "secure.global.ranking.docx". Also
#' see the header file for ds.ranksSecure and the header file for blackBoxDS
#' @param input.var.name a character string specifying the name of the
#' vector holding the global ranks. This argument is set automatically by
#' the clientside function ds.ranksSecure
#' @param shared.seedval a pseudorandom number seed that ensures that the
#' processes generating the order and parameterisation of the encryption
#' algorithms are the same in each study. This argument is set by the argument
#' <shared.seed.value> in the clientside function ds.ranksSecure. The
#' seed value shared by all studies in setting up the encryption procedures
#' in blackBoxRanksDS is arbitrarily changed from that used to set up the
#' encryption procedures in blackBoxDS, so the the set of 7 encryption
#' algorithms is deliberately different. For more
#' details, including future plans to share this starting seed in a more secure
#' way, please see the associated document entitled "secure.global.ranking.docx"
#' and the header file for ds.ranksSecure.
#' @return writes a data frame object entitled blackbox.ranks.df to the
#' serverside. In each study this contains the encrypted global ranks
#' and a range of other key components from the second stage (ranking of global
#' ranks for real observations only) of the ranking procedure. For more details
#' see the associated document entitled "secure.global.ranking.docx"
#' @author Paul Burton 9th November, 2021
#' @export
#'
blackBoxRanksDS <- function(input.var.name=NULL, shared.seedval){ #START FUNC
  
  #######################################################
  #MODULE 1: CAPTURE THE nfilter SETTINGS                 
  #thr<-dsBase::listDisclosureSettingsDS()                  
  #nfilter.tab <- as.numeric(thr$nfilter.tab)             
  #nfilter.glm <- as.numeric(thr$nfilter.glm)               
  #nfilter.subset <- as.numeric(thr$nfilter.subset)           
  #nfilter.string <- as.numeric(thr$nfilter.string)      
  #nfilter.stringShort <- as.numeric(thr$nfilter.stringShort)
  #nfilter.kNN <- as.numeric(thr$nfilter.kNN)             
  #nfilter.noise <- as.numeric(thr$nfilter.noise)       
  #nfilter.levels <- as.numeric(thr$nfilter.levels)       
  ########################################################


input.var <- eval(parse(text=input.var.name), envir = parent.frame())
input.global.ranks<-input.var

#ESTIMATED OVERALL MEAN AND SD FROM meanQuantileDS
#SAVED IN input.mean.sd.df BY ds.dmtC2S
max.sd.input.ranks<-input.ranks.sd.df$max.sd.input.ranks
mean.input.ranks<-input.ranks.sd.df$mean.input.ranks

numsubs.real<-length(input.global.ranks)

numsubs<-numsubs.real

#Create indicators for original original sequence order and real/synthetic
ID.seq.real.orig<-1:numsubs

#Allow individual implementations of blackBoxDS for groups of projects to have a 
#unique but shared starting seed (given specified starting seed) but only
#the person setting up the function can know how that seed is perturbed relative
#to the specified seed

restart.seed.transformation.control.n<-78615
restart.seed.other.seed.actions<-43

#Now set up for repeated transformations
shared.seedval<-(shared.seedval+19)*7
set.seed(shared.seedval)    

#Create long fixed sequence of calls to seed to keep resetting seed at start
null.vector<-stats::runif(restart.seed.transformation.control.n,0,1)

#Create transformation control vectors/values first so all studies
#have the same transformation controls given input random seed 
#even if the lengths of later vectors (eg synthetic concealing data) are
#different between studies so would lead to inconsistent control
#vectors/values

 
control.vector<-sample(c(1,1,2,2,3,3),replace=FALSE)
#control.vector<-c(1,control.vector)
control.vector
  
  
control.value<-stats::runif(6,0.0001,1)
control.value

#Reset seed for other purposes
set.seed(shared.seedval+restart.seed.other.seed.actions)    


#Initialise input.var for analysis. Start by converting to values 0 to 1
#in same order as original input.var. Going to use probit function (stats::pnorm) for this.
#This can take any values -inf to +inf but to avoid extreme value rounding errors
#scale input.var to normal 0 1

#Unlike blackBoxDS no need to check that no attempt has been made to enter a
#fake value for max.sd.input.var. That is because in blackBoxRanksDS no
#synthetic data are generated so the only impact of changing the value of
#max.sd.input.var is to change the first probit transformation. This will either
#cause the whole analysis to fail if the value of max.sd.input.var is too small
#or large or will have no effect at all. So there is no enhanced disclosure risk
#and no need for the equivalent disclosure trap.

input.var.probit.temp<-((input.global.ranks-mean.input.ranks)/max.sd.input.ranks)
input.var.probit<-stats::pnorm(input.var.probit.temp)


if(min(input.var.probit)<=0 | max(input.var.probit)>=1){
  error.message<-
    paste0("FAILED: initialised values should strictly be >0 and <1 this rule has been violated
           there is possiblyly an NA, inf or other error in the input.global.ranks")
  stop(error.message, call. = FALSE)
} 


if(min(rank(input.global.ranks)-rank(input.var.probit))<0 | max(rank(input.global.ranks)-rank(input.var.probit))>0) {
  error.message<-
    paste0("FAILED: probit initialised values are not in an identical order to the original input variable please check")
  stop(error.message, call. = FALSE)
} 



intermediate.value.matrix<-matrix(NA,ncol=8,nrow=numsubs)

intermediate.value.matrix[,1]<-input.global.ranks
intermediate.value.matrix[,2]<-input.var.probit


for(cv in 3:(length(control.vector)+2)){
  
  if(control.vector[cv-2]==1){
    intermediate.value.matrix[,cv]<-intermediate.value.matrix[,(cv-1)]^(control.value[cv-2])
  }
  if(control.vector[cv-2]==2){
    intermediate.value.matrix[,cv]<-intermediate.value.matrix[,(cv-1)]+(control.value[cv-2])
  }
  if(control.vector[cv-2]==3){
    intermediate.value.matrix[,cv]<-intermediate.value.matrix[,(cv-1)]*(control.value[cv-2])
  }
}

intermediate.value.matrix<-cbind(intermediate.value.matrix,ID.seq.real.orig)
colnames(intermediate.value.matrix)<-c("input.global.ranks.orig","input.var.probit",
                                       1:6,"ID.seq.real.orig")



dim(intermediate.value.matrix)
utils::head(intermediate.value.matrix)
utils::tail(intermediate.value.matrix)


rank.intermediate.value.matrix<-matrix(NA,ncol=8,nrow=numsubs)



for(k in 1:8)
{
  rank.intermediate.value.matrix[,k]<-rank(intermediate.value.matrix[,k])
}

rank.intermediate.value.matrix<-cbind(rank.intermediate.value.matrix,ID.seq.real.orig)

colnames(rank.intermediate.value.matrix)<-c("input.global.ranks.orig","input.var.probit",
                                            1:6,"ID.seq.real.orig")


cat("\nRANKS IN ALL COLUMNS ABOVE SHOULD BE THE SAME\n")

control.vector
control.value


#CREATE blackBox OUTPUT DF

intermediate.value.df<-data.frame(intermediate.value.matrix)

names(intermediate.value.df)
encrypted.global.ranks<-intermediate.value.df$"X6"

ranks.input.global.ranks.orig<-rank(intermediate.value.df$input.global.ranks.orig)
ranks.encrypted.global.ranks<-rank(intermediate.value.df$"X6")


output.df<-data.frame(cbind(input.global.ranks,encrypted.global.ranks,ranks.input.global.ranks.orig,ranks.encrypted.global.ranks,ID.seq.real.orig))

utils::head(output.df)
utils::tail(output.df)



#Sort df by magnitude of key variable
ord.by.val<-order(input.global.ranks)

output.temp.sort.by.val<-output.df[ord.by.val,]

utils::head(output.temp.sort.by.val)


ID.by.val<-1:nrow(output.temp.sort.by.val)

output.df.sort.by.val<-data.frame(cbind(output.temp.sort.by.val,ID.by.val))


utils::head(output.df.sort.by.val)
utils::tail(output.df.sort.by.val)
dim(output.df.sort.by.val)



blackbox.ranks.df <- cbind(sR5.df[,c(5,1,2,3,9,6,10)],output.df.sort.by.val[,1:2])

colnames(blackbox.ranks.df)[3]<-"encrypted.input.var"
colnames(blackbox.ranks.df)[5]<-"global.ranks.input.from.sR5"
colnames(blackbox.ranks.df)[6]<-"is.real"
colnames(blackbox.ranks.df)[8]<-"global.ranks.after.blackbox"
colnames(blackbox.ranks.df)[9]<-"encrypted.global.ranks"



utils::head(blackbox.ranks.df)


if(sum(round(rank(blackbox.ranks.df[,5])-rank(blackbox.ranks.df[,8]),2)==0)!=numsubs)
{
  error.message<-
    paste0("FAILED: inconsistent ranking across different transformations in black box,
            try a different seed. Altenatively this could reflect modification of the
            clientside code which is not recommended. Finally, it can also occur
            if the R session on one or more of the opal data servers runs out
            of memory")
  stop(error.message, call. = FALSE)
}else{
  cat("\nPROCESSING SUCCESSFUL, ALL RANKS AGREE FOR ALL TRANSFORMATIONS\n\n")
}


return(blackbox.ranks.df)

} #END FUNCTION

#ASSIGN
# blackBoxRanksDS
datashield/dsBase documentation built on May 16, 2023, 10:01 p.m.