dsBetaTest5: New DataSHIELD Server Functions

#' @title
#' glmDS2
#'
#' @description
#' This is the second serverside function called by ds.glm. It is an aggregation function that uses the model structure
#' and starting beta.vector constructed by glmDS1 to iteratively fit the generalized linear model that has been
#' been specified. The function glmDS2 also carries out a series of disclosure checks and if the arguments or data fail any of those tests,
#' model construction is blocked and an appropriate serverside error message is created and returned to ds.glm on the clientside.
#' For more details please see the extensive header for ds.glm.
#'
#' @export
#'
glmDS2.b<-function (formula, family, beta.vect, offset, weights, data) {

###########################
#BUG CORRECT
dataName<-data
##################

errorMessage2<-"No errors"
# Get the value of the 'data' parameter provided as character on the client side
# Same is done for offset and weights lower down function

  if(is.null(data)){
    dataTable <- NULL 
  }else{
    dataTable <- eval(parse(text=data))
  }

 
# Rewrite formula extracting variables nested in strutures like data frame or list
# (e.g. D$A~D$B will be re-written A~B)
# Note final product is a list of the variables in the model (yvector and covariates)
# it is NOT a list of model terms - these are derived later

# Convert formula into an editable character string
  formulatext <- Reduce(paste, deparse(formula))

# First save original model formala
  originalFormula <- formulatext

# Convert formula string into separate variable names split by |
  formulatext <- gsub(" ", "", formulatext, fixed=TRUE)
  formulatext <- gsub("~", "|", formulatext, fixed=TRUE)
  formulatext <- gsub("+", "|", formulatext, fixed=TRUE)
  formulatext <- gsub("*", "|", formulatext, fixed=TRUE)
  formulatext <- gsub("||", "|", formulatext, fixed=TRUE)

  
#Remember model.variables and then varnames INCLUDE BOTH yvect AND linear predictor components 
 model.variables <- unlist(strsplit(formulatext, split="|", fixed=TRUE))
 
 varnames <- c()
  for(i in 1:length(model.variables)){
    elt <- unlist(strsplit(model.variables[i], split="$", fixed=TRUE))
    if(length(elt) > 1){
      assign(elt[length(elt)], eval(parse(text=model.variables[i])))
      originalFormula.modified <- gsub(model.variables[i], elt[length(elt)], originalFormula, fixed=TRUE)
      varnames <- append(varnames, elt[length(elt)])
    }else{
      varnames <- append(varnames, elt)
    }
  }
varnames <- unique(varnames)

#varnames.with.df<-varnames

  if(!is.null(data)){
      for(v in 1:length(varnames)){
	varnames[v]<-paste0(dataName,"$",varnames[v])
	cbindraw.text <- paste0("cbind(", paste(varnames, collapse=","), ")")
      }
  } else {
  	    cbindraw.text <- paste0("cbind(", paste(varnames, collapse=","), ")")
         }

#Identify and use variable names to count missings
#	cbindraw.text <- paste0("cbind(", paste(varnames, collapse=","), ")")

	all.data <- eval(parse(text=cbindraw.text))



#COMMENTED OUT TRACER
#return(list(varnames=varnames,all.data[1:10,]))
#}
#glmDS2.d #temp end


#WORKS TO HERE
######################## 





############################

	Ntotal<-dim(all.data)[1]
	
	nomiss.any<-complete.cases(all.data)
	nomiss.any.data<-all.data[nomiss.any,]
	N.nomiss.any<-dim(nomiss.any.data)[1]

	Nvalid<-N.nomiss.any
	Nmissing<-Ntotal-Nvalid


#######################################
  
# Now fit model specified in formula: by using x=TRUE this is how we generate all of the model terms
# and the data that underlie them. This will include a vector of 1s for the intercept and
# any dummy variables required for factors
 
    formula2use <- as.formula(paste0(Reduce(paste, deparse(originalFormula)))) # here we need the formula as a 'call' object
    mod.glm.ds <- glm(formula2use, family=family, x=TRUE, control=glm.control(maxit=1), contrasts=NULL, data=dataTable)

 X.mat.orig <- as.matrix(mod.glm.ds$x)
 y.vect.orig <-as.vector(mod.glm.ds$y)
 f<-mod.glm.ds$family

# Remove rows of offset or weights which contain NA in any Y or X variable
# Rows where offset or weights are missing but Y and X are non-NA, remain at this stage
    cbindtext <- paste0("cbind(", paste(varnames, collapse=","), ")")
    dtemp <- eval(parse(text=cbindtext))
    # now get the above table with no missing values (i.e. complete) and grab the offset variable (the last column)
    row.noNA.YX<-complete.cases(dtemp)

#Both weights and offset
if(!(is.null(weights))&&!(is.null(offset))){
    cbindtext <- paste0("cbind(", paste(varnames, collapse=","), ",", weights, ",", offset,")")
    dtemp <- eval(parse(text=cbindtext))
    # now get the above table with no missing values (i.e. complete) and grab the offset variable (the last column)
    cmplt <- dtemp[row.noNA.YX,]
    offsetvar.orig <- cmplt[, dim(cmplt)[2]] 
    weightsvar.orig <- cmplt[, (dim(cmplt)[2]-1)]    
  }
 
#Offset no weights 
if(is.null(weights)&&!(is.null(offset))){
    cbindtext <- paste0("cbind(", paste(varnames, collapse=","), ",", offset, ")")
    dtemp <- eval(parse(text=cbindtext))
    # now get the above table with no missing values (i.e. complete) and grab the offset variable (the last column)
    cmplt <- dtemp[row.noNA.YX,]
    offsetvar.orig <- cmplt[, dim(cmplt)[2]]
  }
  
#Weights no offset
if(!(is.null(weights))&&(is.null(offset))){
    cbindtext <- paste0("cbind(", paste(varnames, collapse=","), ",", weights, ")")
    dtemp <- eval(parse(text=cbindtext))
    # now get the above table with no missing values (i.e. complete) and grab the offset variable (the last column)
    cmplt <- dtemp[row.noNA.YX,]
    weightsvar.orig <- cmplt[, dim(cmplt)[2]]    
  } 
 
#Now work with y vector and X matrix from actual model (with all terms explicit)

#Strip rows of y, X matrix, offset and weights if missing values in offset or weights
#If an offset is not specified then NAs in it are meaningless and so have no impact
#If weights are not specified then NAs in it are meaningless and so have no impact
 
#Both weights and offset
 if(!(is.null(weights))&&!(is.null(offset))){
 YXWO.orig<-cbind(y.vect.orig,X.mat.orig,weightsvar.orig,offsetvar.orig)
 YXWO.complete<-YXWO.orig[complete.cases(YXWO.orig),]
 numcol.YXWO<-dim(YXWO.orig)[2]
 y.vect<-YXWO.complete[,1]
#NB - must specify X.mat as.matrix because otherwise with a one parameter linear predictor
#ie just the column of 1s for the intercept, X.mat is n x 1 and defaults to vector which does
#not then work in the matrix multiplication code below 
 X.mat<-as.matrix(YXWO.complete[,(2:(numcol.YXWO-2))])
 weightsvar<-YXWO.complete[,numcol.YXWO-1]
 offsetvar<-YXWO.complete[,numcol.YXWO]
 }

#Offset no weights
 if(is.null(weights)&&!(is.null(offset))){
 YXO.orig<-cbind(y.vect.orig,X.mat.orig,offsetvar.orig)
 YXO.complete<-YXO.orig[complete.cases(YXO.orig),]
 numcol.YXO<-dim(YXO.orig)[2]
 y.vect<-YXO.complete[,1]
#NB - must specify X.mat as.matrix because otherwise with a one parameter linear predictor
#ie just the column of 1s for the intercept, X.mat is n x 1 and defaults to vector which does
#not then work in the matrix multiplication code below 
 X.mat<-as.matrix(YXO.complete[,(2:(numcol.YXO-1))])
 weightsvar<-rep(1,length(y.vect))
 offsetvar<-YXO.complete[,numcol.YXO]
 }
 
#Weights no offset
 if(!(is.null(weights))&&(is.null(offset))){
 YXW.orig<-cbind(y.vect.orig,X.mat.orig,weightsvar.orig)
 YXW.complete<-YXW.orig[complete.cases(YXW.orig),]
 numcol.YXW<-dim(YXW.orig)[2]
 y.vect<-YXW.complete[,1]
 X.mat<-as.matrix(YXW.complete[,(2:(numcol.YXW-1))])
 weightsvar<-YXW.complete[,numcol.YXW]
 offsetvar<-rep(0,length(y.vect))
 }
 
 #No weights or offset
 if(is.null(weights)&&(is.null(offset))){
 y.vect<-y.vect.orig
 X.mat<-X.mat.orig
 weightsvar<-rep(1,length(y.vect))
 offsetvar<-rep(0,length(y.vect))
 }
 
 
numsubs<-length(y.vect)

#Convert beta.vect from transmittable (character) format to numeric 

  beta.vect.n <- as.numeric(unlist(strsplit(beta.vect, split=",")))
 
  
#If an offset is specified, add it directly to the values in the linear predictor
  if(!is.null(offset)){
    lp.vect <- (X.mat%*%beta.vect.n)+offsetvar
  }else{
    lp.vect <- (X.mat%*%beta.vect.n)   
  }
   
#Use the available functions for family f to generate the components giving the deviance and
#the working weights for the IRLS algorithm
 
  mu.vect<-f$linkinv(lp.vect)
  mu.eta.val<-f$mu.eta(lp.vect)
  var.vect<-f$variance(mu.vect)
  
#If a prior weights vector is specified multiply the working weights by the prior weights 
  if(!is.null(weights)){
  W.vect<-as.vector(mu.eta.val^2/var.vect)
  W.vect<-W.vect*weightsvar
  dev<-sum(f$dev.resids(y.vect, mu.vect, rep(1, length(y.vect)))*weightsvar)
  }else{
  W.vect<-as.vector(mu.eta.val^2/var.vect)
  dev<-sum(f$dev.resids(y.vect, mu.vect, rep(1, length(y.vect))))
  }

#Generate information matrix as XWX  
  WX.mat<-W.vect*X.mat
  info.matrix<-t(X.mat)%*%WX.mat
  
#Generate score vector as XWz (where z is working response vector on scale of linear predictor)
#See theoretical basis in the .pdf in RELEVANT.GLM.THEORY directory.
#Note mu.et.val is first differential of inverse link function (d.mu by d.eta)
#which is inverse of first diff of link function (g') in thoretical explanation

  u.vect<-(y.vect-mu.vect)*1/mu.eta.val
  W.u.mat<-matrix(W.vect*u.vect)
  score.vect<-t(X.mat)%*%W.u.mat

##########################
#BACKUP DISCLOSURE TRAP
#If y, X or w data are invalid but user has modified clientside
#function (ds.glm) to circumvent trap, model will get to this point without
#giving a controlled shut down with a warning about invalid data.
#So as a safety measure, we will now use the same test that is used to
#trigger a controlled trap in the clientside function to destroy the
#score.vector and information.matrix in the study with the problem.
#So this will make model fail without explanation

#Disclosure code from glmDS1

 dimX<-dim((X.mat))
#SET FILTER THRESHOLDS
#NEEDS SETTING FROM INTERNAL OPAL FILTERS
filter.threshold.tab<-DANGER.nfilter.tab
filter.threshold.glm<-DANGER.nfilter.glm

##############################################################
#FIRST TYPE OF DISCLOSURE TRAP - TEST FOR OVERSATURATED MODEL#
##############################################################
 glm.saturation.invalid<-0
 num.p<-dimX[2]
 num.N<-dimX[1]
   
 if(num.p>filter.threshold.glm*num.N){
 glm.saturation.invalid<-1
 errorMessage2<-"FAILED: Model has too many parameters, there is a possible risk of disclosure - please simplify model"
 }

################################
#SECOND TYPE OF DISCLOSURE TRAP#
################################


#CHECK Y VECTOR VALIDITY
	y.invalid<-0

	unique.values.y<-unique(y.vect)
	unique.values.noNA.y<-unique.values.y[complete.cases(unique.values.y)]

	if(length(unique.values.noNA.y)==2){
		tabvar<-table(y.vect)[table(y.vect)>=1]
		min.category<-min(tabvar)
		if(min.category<filter.threshold.tab){
		   y.invalid<-1
		   errorMessage2<-"FAILED: y vector is binary with one category less than filter threshold for table cell size"
		   }
		}

#CHECK X MATRIX VALIDITY 
#Check no dichotomous X vectors with between 1 and filter.threshold 
#observations at either level 
	dimX<-dim((X.mat))

  	num.Xpar<-dimX[2]

	Xpar.invalid<-rep(0,num.Xpar)

  	for(pj in 1:num.Xpar){
	unique.values<-unique(X.mat[,pj])
	unique.values.noNA<-unique.values[complete.cases(unique.values)]

	if(length(unique.values.noNA)==2){
		tabvar<-table(X.mat[,pj])[table(X.mat[,pj])>=1]
		min.category<-min(tabvar)
		if(min.category<filter.threshold.tab){
		    Xpar.invalid[pj]<-1
		    errorMessage2<-"FAILED: at least one column in X matrix is binary with one category less than filter threshold for table cell size"
            }
	   }
	}


#CHECK W VECTOR VALIDITY
	w.invalid<-0

#Keep same object name as in glmDS1
	w.vect<-weightsvar

	unique.values.w<-unique(w.vect)
	unique.values.noNA.w<-unique.values.w[complete.cases(unique.values.w)]

	if(length(unique.values.noNA.w)==2){
		tabvar<-table(w.vect)[table(w.vect)>=1]
		min.category<-min(tabvar)
		if(min.category<=filter.threshold.tab){
        w.invalid<-1
		errorMessage2<-"FAILED: w vector is binary with one category less than filter threshold for table cell size"
            }
	}




disclosure.risk<-0

########################################################################
#If there is a disclosure risk destroy the info.matrix and score.vector#
########################################################################

if(y.invalid>0||w.invalid>0||sum(Xpar.invalid)>0||glm.saturation.invalid>0){
	info.matrix<-NA
	score.vector<-NA
	disclosure.risk<-1		
	}


  return(list(family=f, info.matrix=info.matrix, score.vect=score.vect, numsubs=numsubs, dev=dev,
	        Nvalid=Nvalid,Nmissing=Nmissing,Ntotal=Ntotal,disclosure.risk=disclosure.risk,
              errorMessage2=errorMessage2
              ))  
}
#glmDS2.b