R/CDFtestingSuite.R

Defines functions MovetoRange getMaxError MaxError_CDF MaxError_PDF findMaxError MaxErrorAt_CDF MaxErrorAt_PDF diffatQuantile diffat25 diffatMedian diffat75 horzdiffatQuantile horzdiffat25 horzdiffatMed horzdiffat75 QuantileFromCDF Medians nodes MSEanalytic L2empiric L1empiric MAE MSE SDempiric getMean MeanDiffpdf SkewDiffpdf VarDiffpdf ModeDiffpdf StdDiffpdf KurtDiffpdf DerivDiff smoothVector2 Abbrev badCDF CDFtestTrack CDFtestTrackx CDFtest

Documented in Abbrev badCDF CDFtest CDFtestTrack CDFtestTrackx DerivDiff diffat25 diffat75 diffatMedian diffatQuantile findMaxError getMaxError getMean horzdiffat25 horzdiffat75 horzdiffatMed horzdiffatQuantile KurtDiffpdf L1empiric L2empiric MAE MaxErrorAt_CDF MaxErrorAt_PDF MaxError_CDF MaxError_PDF MeanDiffpdf Medians ModeDiffpdf MovetoRange MSE MSEanalytic nodes QuantileFromCDF SDempiric SkewDiffpdf smoothVector2 StdDiffpdf VarDiffpdf

#'@importFrom Rcpp evalCpp
#'@useDynLib CDF.PSIdekick

#' @import grDevices
#' @import graphics
#' @import stats
#' @import utils

##########################################################
# DP_CDF Testing Suite: system for determining the utility
# and privacy of DP-CDF creation methods. 
#
# Scribe: Daniel Muise (2015)
# Harvard University

# Unless otherwise specified, the following processes were authored by
# Daniel Muise, Kobbi Nissim, Georgios Kellaris, Mark Bun, Victor Balcer  
##########################################################

## The UI for this testing suite is pasted and commented out at the end

# NULLing to rectify global variable binding
Abounds <- functionname <- AnalyticBounds <- smoothvector <- smoothVector <- NULL

### Functions Available
##########################################################
   # Diagnostic and minor functions defined
   #   MovetoRange        (val, range)
   #   getMaxError        (Y, est)
   #      MaxError_CDF    (Y, est)  
   #      MaxError_PDF    (Y, est) 
   #   findMaxError       (Y, est, databins)
   #       MaxErrorAt_CDF (Y, est, databins)
   #       MaxErrorAt_PDF (Y, est, databins)
   #   diffatQuantile     (Y, est, quantile = 0.50)
   #       diffat25       (Y, est)
   #       diffatMed      (Y, est)
   #       diffat75       (Y, est)
   #   horzdiffatQuantile (Y, est, range, gran, quantile)
   #       horzdiffat25   (Y, est, range, gran)
   #       horzdiffatMed  (Y, est, range, gran)
   #       horzdiffat75   (Y, est, range, gran)
   #   QuantileFromCDF    (est, range, gran, quantile)
   #       Medians        (est, range, gran)
   #   smoothVector       (vector)
   #   MSEanalytic         (eps, range, gran, data)
   #       nodes          (height, k, l) (helper to MSEanalytic)
   #   MAE                (Y, est)
   #   MSE                (Y, est)
   #   SDempiric          (Y, est)
   #   getMean            (est, range, gran)
   #   DerivDiff          (Y, est)
   #   smoothVector       (vector) 
   #      partialMeans    (vector, i, j) (helper to smoothVector)
   #   smoothVector2      (cdf)
   #   Abbrev             (value)
   ############################################


##########################################################
# The following segment defines internal helper functions
#########################################################################
##################################################################

#' @title Clamp a value to a specified range.
#'
#' @description  Returns a vector of elements clamped to the specified minimum
#'    and maximum
#' @param val A value to clamp.
#' @param range A vector of length 2 in the form \code{c(min, max)}
#' @return A single value that is either unchanged or clamped upward to minimum 
#'    or clamped downward to the maximum
#' @export
#' @examples 
#' MovetoRange(11, c(1,10))

MovetoRange <- function(val, range) {
 
  if(val < range[1]) {
    return(range[1])
  }
  else if(val > range[2]) {
    return(range[2])
  }
  else
    return(val)
  }
#################################################
#' @title Determine an approximate CDF's maximum error.
#'
#' @description Find the maximum direct error between a non-private CDF and a 
#'    DP approximation of that CDF.
#'
#' @param Y The vector output of a non-differentially private CDF 
#'    computation (cumulative count bins)
#' @param est The vector output of a differentially private CDF 
#'    computation (cumulative count bins) 
#' @param ... Optionally add additional parameters. This is primarily used to allow automated
#'   execution of varied diagnostic functions.
#' @return A single value, the largest absolute vertical difference between 
#'    parallel observations in the private- and true-CDF vectors.
#' @export
#' @examples 
#' getMaxError(c(.1,.2,.3,.4,.5,.6,.7,.8,.9,1), c(.1,.2,.3,.3,.3,.3,.3,.3,.4,1))
getMaxError <- function(Y, est,...) {
  diff <- (Y-est) #store the point-by-point deviations
  return <-(max(abs(diff))) #output the max
}

#################################################
#' @title Determine an approximate CDF's maximum error.
#'
#' @description Find the maximum direct error between a non-private CDF and a
#'    DP approximation of that CDF.
#' @param Y The vector output of a non-differentially private CDF 
#'    computation (cumulative count bins)
#' @param est The vector output of a differentially private CDF
#'    computation (cumulative count bins) 
#' @param ... Optionally add additional parameters. This is primarily used to allow automated
#'   execution of varied diagnostic functions.
#' @return A single value, the largest absolute vertical difference between 
#'    parallel observations in the private- and true-CDF vectors.
#' @export
#' @examples 
#' MaxError_CDF(c(.1,.2,.3,.4,.5,.6,.7,.8,.9,1), c(.1,.2,.3,.3,.3,.3,.3,.3,.4,1))
MaxError_CDF <- function(Y,est,...){
  return(getMaxError(Y, est,...))
}

#################################################
#' @title Determine an approximate PDF's maximum error.
#'
#' @description Find the maximum direct error between a non-private PDF and a 
#'    DP approximation of that PDF.
#'
#' @param Y The vector output of a non-differentially private PDF 
#'    computation (heights of bins)
#' @param est The vector output of a differentially private PDF
#'    computation (heights of bins) 
#' @param ... Optionally add additional parameters. This is primarily used to allow automated
#'   execution of varied diagnostic functions.
#' @return A single value, the largest absolute vertical difference between 
#'    parallel observations in the private- and true-PDF vectors.
#' @export
#' @examples 
#' MaxError_PDF(c(.1,.2,.3,.4,.5,.6,.7,.8,.9,1), c(.1,.2,.3,.3,.3,.3,.3,.3,.4,1))
MaxError_PDF <- function(Y,est,...){
  return(getMaxError(Y, est,...))
}

#################################################
#' @title Locate where the maximum error occurs between two CDFs
#'
#' @description Find the location of the maximum direct error between a 
#'     non-private CDF and a DP approximation of that CDF.
#'
#' @param Y The vector output of a non-differentially private CDF 
#'     computation (cumulative count bins)
#' @param est The vector output of a differentially private CDF
#'     computation (cumulative count bins) 
#' @param range A vector length 2 containing user-specified min and max to
#'     truncate the universe to
#' @param gran The smallest unit of measurement in the data (one [year] for 
#'     a list of ages)
#' @param ... Optionally add additional parameters. This is primarily used to allow automated
#'   execution of varied diagnostic functions.
#' @return A single value, the value at which the  largest absolute vertical
#'     difference between
#'   parallel observations in the private- and true-CDF vectors occurs.
#' @export
#' @examples 
#' findMaxError(c(.1,.2,.3,.4,.5,.6,.7,.8,.9,1), c(.1,.2,.3,.3,.3,.3,.3,.3,.4,1), c(1,10),1)
findMaxError <- function(Y, est, range, gran,...) {
  databins <- seq(from=range[1], to=range[2], by=gran)
  diff <- (Y-est) #store the point-by-point deviations
  maxx <- (max(abs(diff))) #output the max
  for (i in 1: length(diff)){
    if (abs(diff[i]) == maxx){
      MaxErrorAt <- databins[i]
    }
  }
  return <-(MaxErrorAt) #output the max error location
}

#################################################
# define specific versions to help with syntax in CDFtestTrack

#' @title Locate where the maximum error occurs between two CDFs
#' @description Find the location of the maximum direct error between a 
#'    non-private CDF and a DP approximation of that CDF.
#'
#' @param Y The vector output of a non-differentially private CDF 
#'    computation (cumulative count bins)
#' @param est The vector output of a differentially private CDF
#'    computation (cumulative count bins) 
#' @param range A vector length 2 containing user-specified min and max to 
#'    truncate the universe to
#' @param gran The smallest unit of measurement in the data (one [year] 
#'    for a list of ages)
#' @param ... Optionally add additional parameters. This is primarily used to allow automated
#'   execution of varied diagnostic functions.
#' @return A single value, the value at which the  largest absolute vertical
#'    difference between
#'   parallel observations in the private- and true-CDF vectors occurs.
#' @export
#' @examples 
#' MaxErrorAt_CDF(c(.1,.2,.3,.4,.5,.6,.7,.8,.9,1), c(.1,.2,.3,.3,.3,.3,.3,.3,.4,1),
#'     range= c(1,10), gran =1)
MaxErrorAt_CDF <- function(Y, est,range,gran,...){
  return(findMaxError(Y, est, range,gran,...))
}
#################################################
#' @title Locate where the maximum error occurs between two PDFs
#' @description Find the location of the maximum direct error between a
#'    non-private PDF and a DP approximation of that PDF.
#'
#' @param Y The vector output of a non-differentially private PDF
#'    computation (values within bins)
#' @param est The vector output of a differentially private PDF 
#'    computation (values within bins) 
#' @param range A vector length 2 containing user-specified min and max to
#'    truncate the universe to
#' @param gran The smallest unit of measurement in the data (one [year]
#'    for a list of ages)
#' @param ... Optionally add additional parameters. This is primarily used to allow automated
#'   execution of varied diagnostic functions.
#' @return A single value, the value at which the  largest absolute vertical
#'    difference between
#'   parallel observations in the private- and true-PDF vectors occurs.
#' @export
#' @examples 
#' MaxErrorAt_PDF(c(.1,.2,.3,.4,.5,.6,.7,.8,.9,1), c(.1,.2,.3,.3,.3,.3,.3,.3,.4,1),
#'    range= c(1,10), gran =1)
MaxErrorAt_PDF <- function(Y, est, range, gran,...){
  return(findMaxError(Y, est, range, gran,...))
}

#################################################
#' @title Determine the distance between CDFs at key quantiles.
#' @description Find the error (between 0 and 1) introduced by DP-Noise at a
#'    given quantile in the CDF
#'
#' @param Y The vector output of a non-differentially private CDF
#'    computation (cumulative count bins).
#' @param est The vector output of a differentially private CDF 
#'    computation (cumulative count bins).
#' @param quantile A quantile value between 0 and 1, defaults to 0.5
#'    for the median.
#' @param ... Optionally add additional parameters. This is primarily used to allow automated
#'   execution of varied diagnostic functions.
#' @return The error at the quantile specified by \code{quantile}
#' @export
#' @examples 
#' diffatQuantile(c(.1,.2,.3,.4,.5,.6,.7,.8,.9,1),
#'     c(.1,.2,.3,.3,.3,.3,.3,.3,.4,1), .05)

#################################################

diffatQuantile <- function(Y, est, quantile=0.5,...) {
xY <- c()
xest <- c()
#for storing the real quantile value and the corresponding estimate value
tempvector <- c(0,0)  
#locate the quantile in the CDF vector using the index, and store that value
# the median of quantile values must be found in the case of non-monotonic CDFs
if (min(Y-quantile)<0){
  for (i in 2:(length(Y)-1)) {  
    if ((Y[i]-quantile) >=0)  {
      if ((Y[i-1] -quantile) <0 ){
      xY[length(xY)+1] <- Y[i] 
      tempvector[1] <- median(xY)
      }
    }
  }
}
else{
  tempvector[1]<-Y[1]
}
  # do the same for the private-CDF vector
  if (min(est-quantile)<0){
  for (i in 2:(length(est)-1)) {  
    if ((est[i]-quantile) >=0)  {
      if ((est[i-1] -quantile) <0 ){
     xest[length(xest)+1] <- est[i] 
     tempvector[2] <- median(xest) 
      }
    }
  }
}
else{
  tempvector[2] <-est[1]
}
#find the absolute difference
diff <- abs(tempvector[2] - tempvector[1])
return(diff)
}
#################################################
####################
#' @title Determine the distance between CDFs at the .25 quantile.
#'
#' @description Find the error (between 0 and 1) introduced by DP-Noise at 
#' the .25 quantile.
#'
#' @param Y The vector output of a non-differentially private CDF 
#'     computation (cumulative count bins)
#' @param est The vector output of a differentially private CDF
#'     computation (cumulative count bins) 
#' @param ... Optionally add additional parameters. This is primarily used to allow automated
#'   execution of varied diagnostic functions.
#' @return The error at the .25 quantile
#'@export
#' @examples 
#' diffat25(c(.1,.2,.3,.4,.5,.6,.7,.8,.9,1), c(.1,.2,.3,.3,.3,.3,.3,.3,.4,1))
# define specific versions to help with syntax in CDFtestTrack
diffat25 <- function(Y, est,...){
return(diffatQuantile(Y, est, .25,...)) 
}

#################################################
#' @title Determine the distance between CDFs at the median.
#'
#' @description Find the error (between 0 and 1) introduced by DP-Noise at 
#'    the median
#'
#' @param Y The vector output of a non-differentially private CDF 
#'    computation (cumulative count bins)
#' @param est The vector output of a differentially private CDF 
#'    computation (cumulative count bins) 
#' @param ... Optionally add additional parameters. This is primarily used to allow automated
#'   execution of varied diagnostic functions.
#' @return The error at the .5 quantile
#'@export
#' @examples 
#' diffatMedian(c(.1,.2,.3,.4,.5,.6,.7,.8,.9,1), c(.1,.2,.3,.3,.3,.3,.3,.3,.4,1))
diffatMedian <- function(Y, est,...){
return(diffatQuantile(Y, est, .50,...)) 
}

#################################################
#' @title Determine the distance between CDFs at the .75 quantile.
#'
#' @description Find the error (between 0 and 1) introduced by DP-Noise 
#'   at the .75 quantile.
#'
#' @param Y The vector output of a non-differentially private 
#'     CDF computation (cumulative count bins)
#' @param est The vector output of a differentially private
#'     CDF computation (cumulative count bins) 
#' @param ... Optionally add additional parameters. This is primarily used to allow automated
#'   execution of varied diagnostic functions.
#' @return The error at the .75 quantile
#'@export
#' @examples 
#' diffat75(c(.1,.2,.3,.4,.5,.6,.7,.8,.9,1), c(.1,.2,.3,.3,.3,.3,.3,.3,.4,1))
diffat75 <- function(Y, est,...){
return(diffatQuantile(Y, est, .75,...)) 
}

###########################################
#' @title Determine the distance between the quantile values 
#'   returned by two CDFs.
#' @description Find the distance between the quantile value and that returned
#'   by the dpCDF at a given quantile.
#'
#' @param Y The vector output of a non-differentially private CDF
#'   computation (cumulative count bins)
#' @param est The vector output of a differentially private CDF 
#'   computation (cumulative count bins) 
#' @param range A vector length 2 containing user-specified min and max 
#'   to truncate the universe to
#' @param gran The smallest unit of measurement in the data (one [year]
#'   for a list of ages)
#' @param quantile A quantile value between 0 and 1, defaults to
#'   0.5 for the median
#' @param ... Optionally add additional parameters. This is primarily used to allow automated
#'   execution of varied diagnostic functions.
#' @return The horizontal error at the quantile specified by \code{quantile}
#'@export
#' @examples 
#' diffatQuantile(c(.1,.2,.3,.4,.5,.6,.7,.8,.9,1), 
#'    c(.1,.2,.3,.3,.3,.3,.3,.3,.4,1),c(1,10), 1, .05)
horzdiffatQuantile <- function(Y, est, range, gran, quantile,...) {
databins <- seq(from=range[1], to=range[2], by=gran)

xY <- c()
xest <- c()
tempvector <- c(0,0) #for storing the real and estimated quantile values 
#locate the quantile in the true-CDF vector using the index, then store 
#  the corresponding databins obeservation.
# the median of quantile values must be found in the case of non-monotonic CDFs.
if (min(Y-quantile)<0){
for (i in 2:(length(Y)-1)) {  
  if ((Y[i]-quantile) >=0)  {
    if ((Y[i-1] -quantile) <0 ){
     xY[length(xY)+1] <- databins[i] 
     tempvector[1] <- median(xY)  
   }
  }
 }
}
else{
  tempvector[1]<-databins[1]
}
# do the same for the private-CDF
if (min(est-quantile)<0){
for (i in 2:(length(est)-1)) {  
  if ((est[i]-quantile) >=0)  {
    if ((est[i-1] -quantile) <0 ){
     xest[length(xest)+1] <- databins[i] 
     tempvector[2] <- median(xest)  
   }
  } 
 }
}
else{
  tempvector[2]<-databins[1]
}
# store the absolute difference
diff <- abs(tempvector[2] - tempvector[1])
return(diff)
}
#################################################
########################
# define specific versions to help with syntax in CDFtestTrack
#' @title Determine the distance between the .25 quantile values returned 
#'   by two CDFs.
#' @description Find the distance between the .25 quantile value and that
#'   returned by the dpCDF.
#'
#' @param Y The vector output of a non-differentially private CDF
#'   computation (cumulative count bins)
#' @param est The vector output of a differentially private CDF
#'   computation (cumulative count bins) 
#' @param range A vector length 2 containing user-specified min and max to 
#'   truncate the universe to
#' @param gran The smallest unit of measurement in the data (one [year]
#'   for a list of ages)
#' @param ... Optionally add additional parameters. This is primarily used to allow automated
#'   execution of varied diagnostic functions.
#' @return The horizontal error at the .25 quantile
#' @export
#' @examples 
#' horzdiffat25(c(.1,.2,.3,.4,.5,.6,.7,.8,.9,1), 
#'  c(.1,.2,.3,.3,.3,.3,.3,.3,.4,1),c(1,10), 1)
#################################################
horzdiffat25 <- function(Y, est, range, gran,...){
return(horzdiffatQuantile(Y, est, range, gran, .25,...)) 
}
#' @title Determine the distance between the median values returned by two CDFs.
#' @description Find the distance between the median value and that returned
#'    by the DP CDF.
#'
#' @param Y The vector output of a non-differentially private CDF
#'    computation (cumulative count bins)
#' @param est The vector output of a differentially private CDF
#'    computation (cumulative count bins) 
#' @param range A vector length 2 containing user-specified min and max to
#'    truncate the universe to
#' @param gran The smallest unit of measurement in the data (one [year] 
#'    for a list of ages)
#' @param ... Optionally add additional parameters. This is primarily used to allow automated
#'   execution of varied diagnostic functions.
#' @return The horizontal error at the median
#'@export
#' @examples 
#' horzdiffatMed(c(.1,.2,.3,.4,.5,.6,.7,.8,.9,1), 
#'    c(.1,.2,.3,.3,.3,.3,.3,.3,.4,1),c(1,10), 1)
#################################################
horzdiffatMed <- function(Y, est, range, gran,...){
return(horzdiffatQuantile(Y, est, range, gran, .50,...)) 
}
#' @title Determine the distance between the .75 quantile values
#'    returned by two CDFs.
#' @description Find the distance between the .75 quantile value
#'    and that returned by the DP CDF.
#'
#' @param Y The vector output of a non-differentially private CDF
#'   computation (cumulative count bins)
#' @param est The vector output of a differentially private CDF 
#'   computation (cumulative count bins) 
#' @param range A vector length 2 containing user-specified min
#'   and max to truncate the universe to
#' @param gran The smallest unit of measurement in the data (one [year]
#'   for a list of ages)
#' @param ... Optionally add additional parameters. This is primarily used to allow automated
#'   execution of varied diagnostic functions.
#' @return The horizontal error at the .75 quantile
#'@export
#' @examples 
#' horzdiffat75(c(.1,.2,.3,.4,.5,.6,.7,.8,.9,1),
#'             c(.1,.2,.3,.3,.3,.3,.3,.3,.4,1),c(1,10), 1)
horzdiffat75 <- function(Y, est, range, gran,...){
return(horzdiffatQuantile(Y, est, range, gran, .75,...)) 
}
###########################################

#' @title Retrieve a private quantile estimate from the dpCDF
#' @description Determines a quantile value from a CDF vector.
#'
#' @param est The vector output of a differentially private CDF 
#'    computation (cumulative count bins) 
#' @param range A vector length 2 containing user-specified min and max to 
#'    truncate the universe to
#' @param gran The smallest unit of measurement in the data (one [year] 
#'    for a list of ages),
#'    the Domain (ie gran and range) should be identical to those used to 
#'    create the CDF!
#' @param quantile the quantile score in question (for testing the median,
#'    use quantile = 0.5) 
#' @param ... Optionally add additional parameters. This is primarily used to allow automated
#'   execution of varied diagnostic functions.
#' @return A quantile value obtained from a (differentially private) CDF vector,
#'    not using any extra privacy budget
#'@export
#' @examples 
#' QuantileFromCDF(c(.1,.2,.3,.3,.3,.3,.3,.3,.4,1),c(1,10), 1, .05)
QuantileFromCDF <- function(est, range, gran, quantile,...){
  Domain <- seq(from=range[1], to=range[2], by=gran)
  outputvalue <- 0
  DifferenceVector <- est-quantile

  for (z in 1: length(DifferenceVector)){
    if (DifferenceVector[z] >0) {
      DifferenceVector[z] <- -10
    }
  }
  maxDiff <- max(DifferenceVector)

  for (x in (length(DifferenceVector)):1){
    if (DifferenceVector[x] == maxDiff){
      outputvalue <- Domain[x]
    }
  }
  return(outputvalue)
}
#################################################
###
#' @title Retrieve a median estimate from the dpCDF
#' @description Determines a median value from a CDF vector.
#'
#' @param est The vector output of a differentially private CDF 
#'    computation (cumulative count bins) 
#' @param range A vector length 2 containing user-specified min and max to 
#'    truncate the universe to
#' @param gran The smallest unit of measurement in the data (one [year]
#'    for a list of ages),
#'    the Domain (ie gran and range) should be identical to those used to 
#'    create the CDF!
#' @param ... Optionally add additional parameters. This is primarily used to allow automated
#'   execution of varied diagnostic functions.
#' @return A vector of medians obtained from a (differentially private) 
#'    CDF vector, not using any extra privacy budget,
#'    there may be more than one due to random noise causing the DPCDF doubling
#'    back over the .5 probablity latitude
#'@export
#' @examples 
#' Medians(c(.1,.2,.3,.3,.3,.3,.3,.3,.4,1),c(1,10), 1)
Medians <- function(est, range, gran,...){
  return(QuantileFromCDF(est, range, gran, 0.5,...))
}

################################################
#'@title Node parser.
#'@description Runs through tree nodes (assists MSE analytic)
#'
#' @param height The height of the tree
#' @param k The tree degree
#' @param l The leaf length
#'
#' @return A nodesum containing information for MSEanalytic
#'@export
#' @examples 
#' nodes(10,4,2)
nodes <- function (height, k, l){
  #(FOR MSEanalytic)
  v<-l
  j <- k^height
  sum <- 0
  while (j >= k){
    sum <- sum + v %/% j
    v <- v %% j
    height  <- height - 1
    j <- k^height
  }
  sum <- sum + v %% k
  
  # var[i]=2.12*var[i]/(double)n;

  return(sum)
}
##################
#' @title Determine the expected MSE of a simple DPCDF from its parameters.
#' @description Generates the analytically expected 
#'    Mean Squared Error of a dpCDF.
#'    introduced by random noise, SUPPOSING that the DP-CDF is through the use
#'    of a noisy binary tree.
#'
#' @param eps Epsilon value for differential privacy control
#' @param range A vector length 2 containing user-specified min and max to
#'    truncate the universe to
#' @param gran The smallest unit of measurement in the data (one [year]
#'    for a list of ages)
#' @param data The vector of data from which the DP CDF was/is computed
#' @param ... Optionally add additional parameters. This is primarily used to allow automated
#'   execution of varied diagnostic functions.
#' @return The MSE guaranteed by the given parameter combination assuming 
#'    it's built from 
#'    the min and max inward from a DP-Histogram, with 95% probability
#'@export
#' @examples 
#' MSEanalytic(.01, c(1,10),1, rexp(10000,.4))
MSEanalytic <- function(eps, range, gran, data,...) {

 k <- 2
  universe_size <- floor((range[2] - range[1]) / gran) + 1
  height <- ceiling(log(universe_size)/log(k))

  avg <- 0
  for (i in 1:(universe_size-1) ){
    num1 <- nodes(height, k, i)
    num2 <- nodes(height, k, universe_size-i)
    avg <- avg + (num1*(num2/(num1+num2))*(num2/(num1+num2))*8.0/(eps^2)*(height^2)+num2*(num1/(num1+num2))*(num1/(num1+num2))*8.0/(eps^2)*(height^2))/(universe_size);
  }

  return ((avg)/length(data))
}

############################################################
#' @title Calculate the empirical L2norm between two CDFs.
#' @description Calculates the L2 (squared error) area between the 
#'    non-private CDF and the dpCDF
#' @param Y The vector output of a non-differentially private CDF 
#'    computation (cumulative count bins)
#' @param est The vector output of a differentially private CDF
#'    computation (cumulative count bins) 
#' @param ... Optionally add additional parameters. This is primarily used to allow automated
#'   execution of varied diagnostic functions.
#' @return The empirical L2 norm
#'@export
#' @examples 
#' L2empiric(c(.1,.2,.3,.4,.5,.6,.7,.8,.9,1), c(.1,.2,.3,.3,.3,.3,.3,.3,.4,1))
L2empiric <- function(Y, est, ...){
   sqvect <- c()
   sqvect <- (Y-est)^2
  
L2 <- sqrt(sum(sqvect))
return(L2)
}

################################
#' @title Calculate the area between two CDFs.
#' @description Calculates the L1 (distance error) area between the non-private 
#'    CDF and the dpCDF
#' @param Y The vector output of a non-differentially private CDF
#'    computation (cumulative count bins)
#' @param est The vector output of a differentially private CDF
#'    computation (cumulative count bins) 
#' @param ... Optionally add additional parameters. This is primarily used to allow automated
#'   execution of varied diagnostic functions.
#' @return The empirical L1 norm
#'@export
#' @examples 
#' L1empiric(c(.1,.2,.3,.4,.5,.6,.7,.8,.9,1), c(.1,.2,.3,.3,.3,.3,.3,.3,.4,1))
L1empiric <- function(Y,est,...){

  diffs <- c()    
  diffs <- abs(Y-est)
  L1 <- sum(diffs)
return(L1)
}

################################
#' @title Calculate the MAE of a dpCDF relative to that of the non-private CDF.
#' @description Calculates the Mean Absolute Error area between the
#'    non-private CDF and the dpCDF
#' @param Y The vector output of a non-differentially private 
#'    CDF computation (cumulative count bins)
#' @param est The vector output of a differentially private
#'    CDF computation (cumulative count bins) 
#' @param ... Optionally add additional parameters. This is primarily used to allow automated
#'   execution of varied diagnostic functions.
#' @return The MAE
#'@export
#' @examples 
#' MAE(c(.1,.2,.3,.4,.5,.6,.7,.8,.9,1), c(.1,.2,.3,.3,.3,.3,.3,.3,.4,1))
MAE <- function(Y,est,...){

  diffs <- c()
  diffs <- abs(Y-est)
  L1 <- sum(diffs)
  MAE <- L1/length(Y)

return(MAE)
}
################################
#' @title Calculate the MSE of a DP-CDF relative to the non-private CDF.
#' @description Calculates the Mean Squared Error area between the non-private 
#'     CDF and the DP-CDF
#' @param Y The vector output of a non-differentially private CDF
#'     computation (cumulative count bins)
#' @param est The vector output of a differentially private CDF
#'     computation (cumulative count bins)
#' @param ... Optionally add additional parameters. This is primarily used to allow automated
#'   execution of varied diagnostic functions.
#' @return The MSE
#' 
#' @export
#' @examples 
#' MSE(c(.1,.2,.3,.4,.5,.6,.7,.8,.9,1), c(.1,.2,.3,.3,.3,.3,.3,.3,.4,1))
MSE <- function(Y,est,...){
  sqvect <- c()    
  sqvect <- (Y-est)^2 
  L2 <- sqrt(sum(sqvect))
  MSE <- (L2^2)/length(Y)
return(MSE)
}

#######################################
#' @title Calculate the std. dev. on a DPCDF.
#' @description Calculates the standard deviation across bins between the 
#'    non-private CDF and the DP-CDF
#' @param Y The vector output of a non-differentially private CDF
#'    computation (cumulative count bins)
#' @param est The vector output of a differentially private CDF
#'    computation (cumulative count bins) 
#' @param ... Optionally add additional parameters. This is primarily used to allow automated
#'   execution of varied diagnostic functions.
#' @return The standard deviation
#'@export
#' @examples 
#' SDempiric(c(.1,.2,.3,.4,.5,.6,.7,.8,.9,1), c(.1,.2,.3,.3,.3,.3,.3,.3,.4,1))

SDempiric <- function(Y, est,...){
  sqvect <- c()
  sqvect <- (Y - est)^2
  L2 <- sqrt(sum(sqvect))
  SD <- L2/(sqrt(length(Y)))
return(SD)
}

################################
#' @title Calculate the private mean from the DP-CDF
#' @description Calculates the mean value from a CDF plot.
#'
#' @param est The vector output of a differentially private CDF 
#'    computation (cumulative count bins) 
#' @param gran The smallest unit of measurement in the data (one [year]
#'    for a list of ages)
#' @param range A vector length 2 containing user-specified min and max 
#'    Note that the gran and range must be the same as used to make the DP-CDF!
#' @param ... Optionally add additional parameters. This is primarily used to allow automated
#'   execution of varied diagnostic functions.
#'@export
#' @examples 
#' getMean(c(.1,.2,.3,.3,.3,.3,.3,.3,.4,1), c(1,10), 1)
  getMean <- function(est, range, gran,...){
  Domain <- seq(from=range[1], to=range[2], by=gran)
  PDFvector <- c(est[1])
    for (x in (length(est)):2){
    PDFvector[x] <- est[x] - est[x-1]
    }
  weights <-c()
    for (x in 1:length(Domain)){
      weights[x] <- ( PDFvector[x] * Domain[x] )
    }

  mean <- sum( weights )
return(mean)
}

#####################################################

############################################################
# use it on PDF output to save time

#' @title Error in mean from CDF
#' @description Calculate difference between the private mean and the original mean (from CDFs)
#'
#' @param Y The vector output of a non-differentially private CDF
#'    computation (cumulative count bins)
#' @param est The vector output of a differentially private CDF 
#'    computation (cumulative count bins) 
#' @param gran The smallest unit of measurement in the data (one [year]
#'    for a list of ages)
#' @param range A vector length 2 containing user-specified min and max 
#'    Note that the gran and range must be the same as used to make the DP-CDF!
#' @param ... Optionally add additional parameters. This is primarily used to allow automated
#'   execution of varied diagnostic functions.
#' @return A single difference value
#'
MeanDiffpdf <- function(Y, est, range, gran){

Domain <- seq(from=range[1], to=range[2], by=gran)

trueweights <-c()
dpweights   <-c()

  trueweights<- ( Y * Domain )
    dpweights <- ( est * Domain )

truemean <- sum( trueweights )
dpmean <-sum(dpweights)
return(truemean-dpmean)
}
#############################################
#' @title Error in Skewness from CDF (under development)
#' @description Calculate difference between the private Skewness and the original Skewness (from CDFs)
#'
#' @param Y The vector output of a non-differentially private CDF
#'    computation (cumulative count bins)
#' @param est The vector output of a differentially private CDF 
#'    computation (cumulative count bins) 
#' @param gran The smallest unit of measurement in the data (one [year]
#'    for a list of ages)
#' @param range A vector length 2 containing user-specified min and max 
#'    Note that the gran and range must be the same as used to make the DP-CDF!
#' @param ... Optionally add additional parameters. This is primarily used to allow automated
#'   execution of varied diagnostic functions.
#' @return A single difference value
#'
SkewDiffpdf <- function(Y, est, range, gran){

  #determines the skewness of a univariate distribution taken from a DP-CDF vector

Domain <- seq(from=range[1], to=range[2], by=gran)

trueweights <-c()
dpweights   <-c()

  trueweights <- ( Y * Domain)
    dpweights <- ( est * Domain )

truemean <- sum( trueweights )
dpmean <-sum(dpweights)


truedistances <- c()
  dpdistance  <- c()  

  truedistances <- ((truemean - Domain)^2) * (Y)
    dpdistances <- ((  dpmean - Domain)^2) * (est)


truestddev <- sqrt(sum(truedistances))
dpstddev <- sqrt(sum(dpdistances))

trueskewness <- mean(((trueweights-truemean)/ truestddev)^3)
  dpskewness <- mean(((  dpweights-  dpmean)/   dpstddev)^3)
return(trueskewness-dpskewness)
}

#############################################

#' @title Error in Variance from CDF
#' @description Calculate difference between the private Variance and the original Variance (from CDFs)
#'
#' @param Y The vector output of a non-differentially private CDF
#'    computation (cumulative count bins)
#' @param est The vector output of a differentially private CDF 
#'    computation (cumulative count bins) 
#' @param gran The smallest unit of measurement in the data (one [year]
#'    for a list of ages)
#' @param range A vector length 2 containing user-specified min and max 
#'    Note that the gran and range must be the same as used to make the DP-CDF!
#' @param ... Optionally add additional parameters. This is primarily used to allow automated
#'   execution of varied diagnostic functions.
#' @return A single difference value
#'
VarDiffpdf <- function(Y, est, range, gran){

Domain <- seq(from=range[1], to=range[2], by=gran)

trueweights <-c()
dpweights   <-c()

  trueweights <- ( Y * Domain )
    dpweights <- ( est * Domain )


truemean <- sum( trueweights )
dpmean <-sum(dpweights)


truedistances <- c()
  dpdistance  <- c()  

  truedistances <- ((truemean - Domain)^2) * (Y)
    dpdistances <- ((  dpmean - Domain)^2) * (est)



truevariance <- sum(truedistances)
dpvariance <- sum(dpdistances)
return(truevariance-dpvariance)
}

################################################
#' @title Error in Mode from CDF
#' @description Calculate difference between the private Mode and the original Mode (from CDFs)
#'
#' @param Y The vector output of a non-differentially private CDF
#'    computation (cumulative count bins)
#' @param est The vector output of a differentially private CDF 
#'    computation (cumulative count bins) 
#' @param gran The smallest unit of measurement in the data (one [year]
#'    for a list of ages)
#' @param range A vector length 2 containing user-specified min and max 
#'    Note that the gran and range must be the same as used to make the DP-CDF!
#' @param ... Optionally add additional parameters. This is primarily used to allow automated
#'   execution of varied diagnostic functions.
#' @return A single difference value
#'

ModeDiffpdf <- function(Y,est, range, gran,...){
  Domain <- seq(from=range[1], to=range[2], by=gran)
  
  PDF1k <- Y*10000000000000000
  PDFblank <- PDF1k - (max(PDF1k)-1)
  PDFblank <- (PDFblank + abs(PDFblank)) / 2
  valuevect<- PDFblank * Domain
  
  truemode <- (sum(valuevect)/ sum(PDFblank))
  
  PDF1k <- est*1000000000000000
  PDFblank <- PDF1k - (max(PDF1k)-1)
  
  PDFblank <- (PDFblank + abs(PDFblank)) / 2
  valuevect<- PDFblank * Domain
  
  dpmode <- (sum(valuevect)/ sum(PDFblank))
  
  
  return(truemode-dpmode)
}


#############################################
#' @title Error in Standard Deviation from CDF 
#' @description Calculate difference between the private Standard Deviation and the original Standard Deviation (from CDFs)
#'
#' @param Y The vector output of a non-differentially private CDF
#'    computation (cumulative count bins)
#' @param est The vector output of a differentially private CDF 
#'    computation (cumulative count bins) 
#' @param gran The smallest unit of measurement in the data (one [year]
#'    for a list of ages)
#' @param range A vector length 2 containing user-specified min and max 
#'    Note that the gran and range must be the same as used to make the DP-CDF!
#' @param ... Optionally add additional parameters. This is primarily used to allow automated
#'   execution of varied diagnostic functions.
#' @return A single difference value
#'
StdDiffpdf <- function(Y, est, range, gran){

Domain <- seq(from=range[1], to=range[2], by=gran)

trueweights <-c()
dpweights   <-c()

  trueweights <- ( Y * Domain )
    dpweights <- ( est * Domain )


truemean <- sum( trueweights )
dpmean <-sum(dpweights)


truedistances <- c()
  dpdistance  <- c()  

  truedistances <- ((truemean - Domain)^2) * (Y)
    dpdistances <- ((  dpmean - Domain)^2) * (est)



trueSTD <- sqrt(sum(truedistances))
dpSTD <- sqrt(sum(dpdistances))
return(trueSTD-dpSTD)
}

################################################
#' @title Error in Kurtosis from CDF (under development)
#' @description Calculate difference between the private Kurtosis and the original Kurtosis (from CDFs)
#'
#' @param Y The vector output of a non-differentially private CDF
#'    computation (cumulative count bins)
#' @param est The vector output of a differentially private CDF 
#'    computation (cumulative count bins) 
#' @param gran The smallest unit of measurement in the data (one [year]
#'    for a list of ages)
#' @param range A vector length 2 containing user-specified min and max
#'    Note that the gran and range must be the same as used to make the DP-CDF!
#' @param ... Optionally add additional parameters. This is primarily used to allow automated
#'   execution of varied diagnostic functions.
#' @return A single difference value
#'
KurtDiffpdf <- function(Y, est, gran, range){

Domain <- seq(from=range[1], to=range[2], by=gran)

trueweights <-c()
dpweights   <-c()

  trueweights <- ( Y * Domain )
    dpweights <- ( est * Domain )

truemean <- sum( trueweights )
dpmean <-sum(dpweights)

truedistances <- c()
  dpdistance  <- c()  

  truedistances <- ((truemean - Domain)^2) * (Y)
    dpdistances <- ((  dpmean - Domain)^2) * (est)

truevariance <- sum(truedistances)
truemoment4 <- (mean(truedistances))
truesigma4  <-  truevariance^2
#normalize the 4th moment
truekurtosis <- truemoment4/truesigma4

dpvariance <- sum(dpdistances)
dpmoment4 <- (mean(dpdistances))
dpsigma4  <-  dpvariance^2
#normalize the 4th moment
dpkurtosis <- dpmoment4/dpsigma4


return(truekurtosis-dpkurtosis)

}

###############################################
#' @title Determine how well a single DPCDF matches the shape of its data.
#' @description Calculates a score for how much the DP-CDF's slope varies from
#'    the true CDF's slope at various resolutions.
#'
#' @param Y The vector output of a non-differentially private CDF
#'    computation (cumulative count bins)
#' @param est The vector output of a differentially private CDF
#'    computation (cumulative count bins) 
#' @param ... Optionally add additional parameters. This is primarily used to allow automated
#'   execution of varied diagnostic functions.
#' @return A single so-called derivative score; lower scores suggest
#'    better performance 
#'@export
#' @examples 
#' DerivDiff(c(.1,.2,.3,.4,.5,.6,.7,.8,.9,1),c(.1,.2,.3,.3,.3,.3,.3,.3,.4,1))
DerivDiff <- function(Y, est,...) {

workingY <- Y
workingE <- est

crunch <- c()
Ypdf <- c()
Epdf <- c()
Ypdf[1]<-workingY[1]
Epdf[1]<-workingE[1]
  for(v in length(workingY):2){
                Ypdf[v] <- workingY[v]-  workingY[v-1]
                Epdf[v] <- workingE[v]-  workingE[v-1]
  }
  diff      <- abs(Ypdf - Epdf)
  L1area    <- sum(diff)
  MAE <- L1area/length(Y)

  crunch[length(crunch)+1] <-MAE


for (k in 1:(floor(log2(length(Y))))){
Yx <- c()
Ex <- c()

  for (i in 1:floor(length(workingY)/2)){
    Yx[i] <- (workingY[2*i]+workingY[(2*i)-1])/2
  }

  if ((floor(length(Yx))*2) != length(workingY) ){
    remainder <- c(workingY[length(workingY)-(2*length(Yx))]:workingY[length(workingY)])
    Yx[length(Yx)+1] <- mean(remainder)
  }
####
  for (i in 1:floor(length(workingE)/2)){
    Ex[i] <- (workingE[2*i]+workingE[(2*i)-1])/2
  }
  if ((floor(length(Ex))*2) != length(workingE) ){
    remainder <- c(workingE[length(workingE)-(2*length(Ex))]:workingE[length(workingE)])
    Ex[length(Ex)+1] <- mean(remainder)
  }
    workingE <- Ex
    workingY <- Yx
Ypdf <-c()
Epdf <-c()
Ypdf[1]<-workingY[1]
Epdf[1]<-workingE[1]
if(length(workingY) >1){
  for(v in length(workingY):2){
                Ypdf[v] <- workingY[v]-  workingY[v-1]
                Epdf[v] <- workingE[v]-  workingE[v-1]
  }
  diff      <- abs(Ypdf - Epdf)
  L1area    <- sum(diff)
  MAE <- L1area/length(workingY)
   crunch[length(crunch)+1] <-MAE
}
}
output <- mean(crunch)

return(output)
}
########################################################
# Smooth <- function() {
#   .Call('dpCDFtester_Smooth', PACKAGE = 'dpCDFtester')
# }
############################################################
#' @title Enforce monotnocity on a vector.

#' @description Forces DP-CDFs into the nearest monotonic vector
#'    (by euclidean distance minimization).
#'
#' @param cdf The vector output of a differentially private CDF 
#'    computation (cumulative count bins) 
#' @param ... Optionally add additional parameters. This is primarily used to allow automated
#'   execution of varied diagnostic functions.
#' @return A single monotonically increasing vector which is the post-processed
#'    DP-CDF's Y coordinates
#'@export
#' @examples 
#' smoothVector2(c(.1,.2,.3,.2,.3,.3,.3,.3,1))
smoothVector2 <- function(cdf){
  #Rcpp::sourceCpp("smooth.cpp")
  returnVector  <- Smooth(cdf)
return(returnVector)
}

##########################################

#' @title Tranforms long numbers into short strings.
#' @description Abbreviates long numeric values into visually shorter strings
#'
#' @param value A single numeric value
#' @export
#' @return A string value such as 1k for 1000
#' @export
#' @examples 
#' Abbrev(1700000)
Abbrev <- function(value){
     returnValue <- value

        if(value >= 1000000000000){
            returnValue <- paste(as.numeric(value)/1000000000000,"t", sep="")
        }
        else if(value >= 1000000000){
            returnValue <- paste(as.numeric(value)/1000000000,"b", sep="")
        }
         else if(value >= 1000000){
            returnValue <- paste(as.numeric(value)/1000000,"m", sep="")
        }
         else if (value >= 1000){
            returnValue <- paste(as.numeric(value)/1000,"k", sep="")
              }
     return(returnValue)
} 

############################################################################################################################
#' @title Make a straight-line faux CDF.
#' @description Creates a placeholder CDF (a uniform straight line) 
#'    for demonatration.
#'
#' @param range A vector length 2 containing user-specified min and max to
#'    truncate the universe to
#' @param gran The smallest unit of measurement in the data (one [year]
#'    for a list of ages)
#' @param ... Optionally add additional parameters. This is primarily used to allow automated
#'   execution of varied diagnostic functions.
#' @return A fake CDF for demonstration only.
#' @export
#' @examples 
#' badCDF(c(1,50), 1)
badCDF <- function(range, gran,...){
  Domain <- seq(range[1], range[2], gran)
  theCDF <- Domain/range[2]
return(theCDF)
}

############################################################################################################################
############################################################################################################################
############################################################################################################################
#' @title Test a single CDF implementation with one set of parameters.
#' @description Generates mean/median empirical error measurements, complete
#'    results, single iterations of DP CDFs at each combination of parameters,
#'    and diagnostic functions used.
#'
#' @param funct The differentially-private CDF-generating function to be tested
#' @param eps Epsilon value for Differential privacy control
#' @param cdfstep The step sized used in outputting the approximate CDF; the 
#'   values output are [min, min + cdfstep], [min, min + 2 * cdfstep], etc. 
#'   Setting cdfstep equal to 0 (default) will set cdfstep = granularity
#' @param data A vector of the data (single variable to compute CDFs from) 
#' @param range A vector length 2 containing user-specified min and max to
#'   truncate the universe to.
#' @param gran The smallest unit of measurement in the data (one [year] for a
#'   list of ages). The Domain (ie gran and range) should be identical to those 
#'   used to create the CDF!
#' @param reps The number of times the combination of CDFfunction, dataset, and 
#'   epsilon will be tested
#' @param SmoothAll Applies L2 monotnocity post-processing to every DP-CDF
#' @param ABounds This is a flag and should be set to "true" if the functions 
#'   being tested are expected to output 
#'   analytical variance bounds. The proper output form is 
#'   output = list(DPCDFvector, LowerBoundVector, UpperBoundVector)
#' @param EmpiricBounds When TRUE, outputted graphs
#'   depict the minimum and maximum values taken by each bin across reps
#' @param ExtraTests_CDF If a user wishes to add extra diagnostics, the proper 
#'   syntax would be:
#'   ExtraTests_CDF = list( functionName1 = function1, functionName2 = function2) 
#' @param  ExtraTests_PDF See above
#' @param ... Optionally add additional parameters. This is primarily used to allow automated
#'   execution of varied diagnostic functions.
#' @return A list in the form of:
#' 
#' ...\code{$meanscores} Contains mean diagnostic
#'  results for each diagnostic across reps iterations;
#'  
#' ...\code{$medianscores} Contains median diagnostic 
#'  results for each diagnostic across reps iterations;
#'  
#'...\code{$yourCDFoutput} Containing a single dpCDF iteration;
#'
#'...\code{$yourPDFoutput}  Containing a single dpPDF iteration;
#'
#' ...\code{$realCDFoutput} Containing the real (non-DP) CDF output;
#' 
#' ...\code{$realPDFoutput} Containing the real (non-DP) PDF output;
#' 
#' ...\code{$databins} Containing the domain used to construct the CDFs;
#' 
#' ...\code{$TestPack_CDF} Containing the definitions of diagnostic functions used on dpCDFs;
#' 
#' ...\code{$TestPack_PDF} Containing the definitions of diagnostic functions used on dpPDFs;
#' 
#' ...\code{$allscores} Containing all raw diagnostic output.
#' @export
#' @examples 
#' CDFtestTrack(badCDF, eps = .01, cdfstep = 1, data = rexp(10000,.4),
#' range= c(1,10), gran = .1, reps = 20)
CDFtestTrack <- function(funct, eps, cdfstep = 1, data, range, gran, reps, SmoothAll = FALSE, ABounds = FALSE, EmpiricBounds = FALSE, ExtraTests_CDF = list(), ExtraTests_PDF = list(), ...) {


# determine range, universe_size, and create vectors to hold outputs of
# the non-private and private-function (for reps-many iterations)

range         <- as.vector(range)
universe_size <- floor(((range[2] - range[1]) / gran) +1)
output_size   <- floor(((range[2] - range[1]) / cdfstep) +1)

yourCDFoutput <- matrix(nrow=output_size,ncol= reps)
yourPDFoutput <- matrix(nrow=output_size,ncol= reps)

# set up the vector for the x axis
databins      <- seq(from=(range[1]), to=(range[2]), by=cdfstep) 

# this will all output scores
allscores     <- list()

#this is how we'll check later if bounds should be used
LBoundAnalytic <- 100

CDF           <- ecdf(data)     # use the ecdf function to define my CDF function, for an ordindary CDF
realCDFoutput <- CDF (databins) # fill the realCDF vector
realPDFoutput <-c()
realPDFoutput[1]<-realCDFoutput[1]
  for(v in length(realCDFoutput):2){
                realPDFoutput[v] <- realCDFoutput[v]-  realCDFoutput[v-1]
  }


for(i in 1:reps) {

              #run the function as many times as specified by "reps"
              results<- funct(eps=eps, data=data, range=range,gran= gran, 
                        varbounds=Abounds)
              # if AnalyticBounds is true, the inputted function is expected to output 3 vectors:
              # the DP-CDF, a lower bound and an upper bound

if (SmoothAll == FALSE){

if (cdfstep==gran){
                        if (ABounds == TRUE){ 
                             
                                    yourCDFoutput[,i]  <-results[[1]]                                                      
                                if (i ==1){

                                    LBoundAnalytic<- results[[1]] - results[[2]]
                                    UBoundAnalytic<- results[[1]] + results[[2]]
                                }
                        }
                             else{
                                    yourCDFoutput[,i]  <-results[[1]]                                 
                             }
             }else{
                    output_seq <- (seq(range[1],range[2], by=cdfstep)/gran)-(range[1]/gran)+range[1]
                       if (ABounds == TRUE){                               
                                    WorkingVector <-results[[1]]
                              if (i ==1){
                                    WorkingL<- results[[1]] - results[[2]]
                                    WorkingU<- results[[1]] + results[[2]]

                                   for(c in 1:length(output_seq)){
                                     yourCDFoutput[c,i] <- WorkingVector[output_seq[c]]
                                     LBoundAnalytic<- WorkingL[output_seq[c]]
                                     UBoundAnalytic<- WorkingU[output_seq[c]]
                                   }     
                              }
                       }
                       else if (length(results[[1]]) > 1){
                               WorkingVector <-results[[1]]                     
                               for(c in 1:length(output_seq)){                
                               yourCDFoutput[c,i] <- WorkingVector[output_seq[c]]
                               }
                       }
                       else{
                               for(c in 1:length(output_seq)){
                                 yourCDFoutput[,i] <-results[output_seq[c]]
                               }

                       }
             }

                     yourPDFoutput[1,i] <-yourCDFoutput[1,i]
                   for(k in length(yourCDFoutput[,1]):2){
                       yourPDFoutput[k,i] <- yourCDFoutput[k,i] -  yourCDFoutput[k-1,i]
                   }
                    yourPDFoutput[,i] <- round(yourPDFoutput[,i],3)  
                   for (j in 1:length(yourPDFoutput[,i])){
                    yourPDFoutput[j,i] <- MovetoRange(yourPDFoutput[j,i], c(0,1))
                   }
           
}else{
nonMonoCDFs <- matrix(nrow=output_size,ncol= reps)

 if (cdfstep==gran){
                        if (ABounds == TRUE){
                            nonMonoCDFs[,i]   <- results[[1]]   
                            yourCDFoutput[,i] <- smoothVector2(results[[1]])                                                            
                                if (i ==1){
                                    LBoundAnalytic<- results[[1]] - results[[2]]
                                    UBoundAnalytic<- results[[1]] + results[[2]]
                                }
                        }
                        else if (length(results[[1]]) > 1){
                             nonMonoCDFs[,i]   <- results[[1]]                                  
                             yourCDFoutput[,i] <- smoothVector2(results[[1]])                                 
                             }
                        else{                                 
                            yourCDFoutput[,i] <- smoothVector2(results[[1]])                                 
                            }
             }else{
                    output_seq <- (seq(range[1],range[2], by=cdfstep)/gran)              
                       if (ABounds == TRUE){                               
                           WorkingVector <- smoothVector2(results[[1]])
                           NonMonoWV   <- results[[1]]                               
                              if (i ==1){
                                    WorkingL<- WorkingVector - results[[2]]
                                    WorkingU<- WorkingVector + results[[2]]
                                    LBoundAnalytic <-c()
                                    UBoundAnalytic <-c()
                                   for(c in 1:length(output_seq)){
                                     yourCDFoutput[c,i] <- WorkingVector[output_seq[c]]
                                     LBoundAnalytic[c]<- WorkingL[output_seq[c]]
                                     UBoundAnalytic[c]<- WorkingU[output_seq[c]]
                                     nonMonoCDFs[c,i] <- NonMonoWV[output_seq[c]]
                                   }     
                              }
                       }
                       else if (length(results[[1]]) > 1){                               
                                    WorkingVector <- smoothVector2(results[[1]])   
                                    NonMonoWV     <- results[[1]]                                         
                               for(c in 1:length(output_seq)){                
                               yourCDFoutput[c,i] <- WorkingVector[output_seq[c]]
                               nonMonoCDFs[c,i] <- NonMonoWV[output_seq[c]]
                               }
                       }
                       else{
                               for(c in 1:length(output_seq)){
                               yourCDFoutput[,i] <-results[output_seq[c]]
                               nonMonoCDFs[c,i]  <- NonMonoWV[output_seq[c]]
                               }
                       }
             }

                     yourPDFoutput[1,i] <-nonMonoCDFs[1,i]
                   for(k in length(nonMonoCDFs[,1]):2){
                       yourPDFoutput[k,i] <- nonMonoCDFs[k,i] -  nonMonoCDFs[k-1,i]
                   }
                    yourPDFoutput[,i] <- round(yourPDFoutput[,i],3)     
                   for (j in 1:length(yourPDFoutput[,i])){
                    yourPDFoutput[j,i] <- MovetoRange(yourPDFoutput[j,i], c(0,1))
                   }
      }
}         

################################################
TestPack_CDF <-list(MaxError_CDF   = MaxError_CDF,
                    MaxErrorAt_CDF = MaxErrorAt_CDF,
                    diffat25       = diffat25,
                    diffatMedian   = diffatMedian,
                    diffat75       = diffat75,
                    horzdiffat25   = horzdiffat25,
                    horzdiffatMed  = horzdiffatMed,
                    horzdiffat75   = horzdiffat75,
                    Medians        = Medians,
                    MAE_CDF        = MAE,
                    MSE            = MSE,
                  #  L1emp          = L1empiric,
                  #  L2emp          = L2empiric,
                    DerivScore     = DerivDiff             
              )

TestPack_PDF <-list(MaxError_PDF   = MaxError_PDF,
                    MaxErrorAt_PDF = MaxErrorAt_PDF,
                    MAE_PDF        = MAE,
                    MSE_PDF        = MSE,
                    MeanDiff = MeanDiffpdf,
                     ModeDiff = ModeDiffpdf,
                      StdDiff = StdDiffpdf,
                       VarDiff = VarDiffpdf,
                        SkewDiff = SkewDiffpdf,
                         KurtDiff= KurtDiffpdf)


## if the user has any custom built DP-CDF or DP-PDF empirical diagnostic functions, they're introduced here.

if (length(ExtraTests_CDF) >0){
assist <- names(ExtraTests_CDF)
  
    for(q in 1:length(ExtraTests_CDF)){
      TestPack_CDF[length(TestPack_CDF)+q] <- ExtraTests_CDF[q]
       names(TestPack_CDF)[length(TestPack_CDF)] <- assist[q]
    }
  }

if (length(ExtraTests_PDF) >0){
assist <- names(ExtraTests_PDF)
    for(q in 1:length(ExtraTests_PDF)){
      TestPack_PDF[length(TestPack_PDF)+q] <- ExtraTests_PDF[q]
       names(TestPack_PDF)[length(TestPack_PDF)] <- assist[q]
    }
  }

############################################################
# initialize dataframes to hold the single output of each diagnostic.
TheNames <- names(TestPack_CDF)
      MeanScores   <- data.frame( TheNames = 0)
      MedianScores <- data.frame( TheNames = 0)

for (u in 1:length(TestPack_CDF)){
      TempVector <- rep(0, times = reps)

      #and fill those frames
      for (i in 1:reps){
           TempVector[i] <- TestPack_CDF[[u]](Y = realCDFoutput, est = yourCDFoutput[,i], range = range, gran = gran, eps = eps, data= data)
      }

      MeanScores  [[TheNames[u]]] <- mean  (TempVector)
      MedianScores[[TheNames[u]]] <- median(TempVector)
      allscores   [[TheNames[u]]] <-        TempVector
      }


#####
TheNames <- names(TestPack_PDF)
for (u in 1:length(TestPack_PDF)){
      TempVector <- rep(0, times = reps)
      #and fill those frames

         for (i in 1:reps){

         TempVector[i] <- TestPack_PDF[[u]](Y = realPDFoutput, est = yourPDFoutput[,i], range = range, gran = gran)
         }

      MeanScores  [[TheNames[u]]] <- mean  (TempVector)
      MedianScores[[TheNames[u]]] <- median(TempVector)
      allscores   [[TheNames[u]]] <-        TempVector
}


  #make empirical bounds for the CDF
  if (EmpiricBounds){

        differences <- matrix(0,nrow = length(databins), ncol = reps)

        for (i in 1: reps){
             differences[,i]<- yourCDFoutput[,i] - realCDFoutput
        }
        UBoundEmpiric <- rep(0,length(databins)) 
        LBoundEmpiric <- rep(0,length(databins)) 

        for (b in 1:length(databins)){
             UBoundEmpiric[b] <- max(differences[b,]) + realCDFoutput[b]
             LBoundEmpiric[b] <- min(differences[b,]) + realCDFoutput[b]
        } 
}
############################################################
# store everything! Including two arbitrary iterations of the dp-CDF function,
# used for plotting in the the CDFtest graphical ouput
scores <- list()
scores$meanscores      <-  MeanScores
scores$medianscores    <-  MedianScores
scores$yourCDFoutput   <-  yourCDFoutput[,1]
scores$yourPDFoutput   <-  yourPDFoutput[,1]
scores$realCDFoutput   <-  realCDFoutput
scores$realPDFoutput   <-  realPDFoutput
scores$databins        <-  databins
scores$TestPack_CDF    <-  TestPack_CDF
scores$TestPack_PDF    <-  TestPack_PDF
scores$allscores       <-  allscores

if(EmpiricBounds == TRUE){
    scores$yourCDF_LboundE <-  LBoundEmpiric
    scores$yourCDF_UboundE <-  UBoundEmpiric
}
if(ABounds == TRUE){
    scores$yourCDF_LboundA <-  LBoundAnalytic
    scores$yourCDF_UboundA <-  UBoundAnalytic
}


return(scores)
}

############################################################################
#This is a streamlined version of \code{CDFtestTrack} (see above), used in Data Collection mode. 
#' @title Test a single CDF implementation with one set of parameters.
#' @description Applies diagnostic functions to a single dpCDF, and only 
#'   releases a complete set of diagnostic results (called within\code{CDFtest}
#'   in Data Collection mode --- e.g., when \code{Visualization = FALSE})
#'
#' @param funct The differentially-private CDF-generating function to be tested
#' @param eps Epsilon value for Differential privacy control
#' @param data A vector of the data (single variable to compute CDFs from) 
#' @param range A vector length 2 containing user-specified min and max to
#'   truncate the universe to
#' @param gran The smallest unit of measurement in the data (one [year] for a 
#'   list of ages)
#' @param reps The number of times the combination of CDFfunction, dataset,
#'   and epsilon will be tested
#' @param samplesize The specified sample size is randomly selected from each dataset
#'   without replacement. 
#' @param SmoothAll Applies L2 monotonicity post-processing to every DP-CDF
#' @param ExtraTests_CDF If a user wishes to add extra diagnostics, the proper 
#'   syntax would be:   
#'   \code{ExtraTests_CDF = list( functionName1 = function1, functionName2 = function2)}
#' @param  ExtraTests_PDF See above
#' @param ... Optionally add additional parameters. This is primarily used to allow automated
#'   execution of varied diagnostic functions.

#' @return A complete set of diagnostic results in the form of
#'       \code{...$allscores}, which holds out a row of output for each of \code{reps} results.
#' @export
#' @examples 
#' CDFtestTrackx(badCDF, eps = .01, cdfstep = 0, data = rexp(10000,.4),
#'   range= c(1,10), gran = .1, reps = 20, samplesize = 10000)
CDFtestTrackx <- function(funct, eps, data, range=range, gran, reps,samplesize, SmoothAll = FALSE,ExtraTests_CDF = list(), ExtraTests_PDF = list(),...) {
# source relevant scripts and libraries
# determine range, universe_size, and create vectors to hold outputs of the non-private and private-function (for reps-many iterations)
  smoothVector <- NULL
range         <- as.vector(range)
universe_size <- floor(((range[2] - range[1]) / gran) +1)
output_size   <- universe_size

data <- sample(data, samplesize, replace = TRUE)

yourCDFoutput <- matrix(nrow=output_size,ncol= reps)
yourPDFoutput <- matrix(nrow=output_size,ncol= reps)

# set up the vector for the x axis
databins      <- seq(from=(range[1]), to=(range[2]), by=gran) 

# this will all output scores
allscores     <- list()

#this is how we'll check later if bounds should be used
LBoundAnalytic <- 100

CDF           <- ecdf(data)     # use the ecdf function to define my CDF function, for an ordindary CDF
realCDFoutput <- CDF (databins) # fill the realCDF vector
realPDFoutput <-c()
realPDFoutput[1]<-realCDFoutput[1]
  for(v in length(realCDFoutput):2){
                realPDFoutput[v] <- realCDFoutput[v]-  realCDFoutput[v-1]
  }
MSEanalytic <- c()
for(i in 1:reps) {
              #run the function as many times as specified by "reps"
              results<- funct(eps=eps, data=data, range=range,gran= gran, 
                        Analyticbounds=AnalyticBounds)

              # if AnalyticBounds is true, the inputted function is expected to output 2 vectors:
              # the DP-CDF, and MSEanalytic abs diffs (analytic bounds)

                     yourCDFoutput[,i]  <-results[[1]]
                    MSEanalytic[i]            <-mean(results[[2]])                                  
           
                   CDFshift<- c(0)
                   CDFshift[2:length(yourCDFoutput[,i])]<-yourCDFoutput[(1:length(yourCDFoutput[,i])-1),i]
                   yourPDFoutput[,i] <-yourCDFoutput[,i] - CDFshift
                   yourPDFoutput[,i] <- round(yourPDFoutput[,i],3)  


if (SmoothAll){                               
                     yourCDFoutput[,i] <- smoothVector2(results[[1]])      
      }
}         

################################################
TestPack_CDF <-list(MaxError_CDF   = MaxError_CDF,
                    MaxErrorAt_CDF = MaxErrorAt_CDF,
                    diffat25       = diffat25,
                    diffatMedian   = diffatMedian,
                    diffat75       = diffat75,
                    horzdiffat25   = horzdiffat25,
                    horzdiffatMed  = horzdiffatMed,
                    horzdiffat75   = horzdiffat75,
                    Medians        = Medians,
                    MAE_CDF        = MAE,
                    MSE            = MSE
                  #  L1emp          = L1empiric,
                  #  L2emp          = L2empiric,
                  #  DerivScore     = DerivDiff,
                  # MSEanalytic     = MSEanalytic                
              )

TestPack_PDF <-list(MaxError_PDF   = MaxError_PDF,
                    MaxErrorAt_PDF = MaxErrorAt_PDF,
                    MAE_PDF        = MAE,
                    MSE_PDF        = MSE,
                    MeanDiff = MeanDiffpdf,
                     ModeDiff = ModeDiffpdf,
                      StdDiff = StdDiffpdf,
                       VarDiff = VarDiffpdf,
                        SkewDiff = SkewDiffpdf,
                         KurtDiff= KurtDiffpdf)

## if the user has any custom built DP-CDF or DP-PDF empirical diagnostic functions, they're introduced here.

if (length(ExtraTests_CDF) >0){

assist <- names(ExtraTests_CDF)
  
    for(q in 1:length(ExtraTests_CDF)){
      TestPack_CDF[length(TestPack_CDF)+1] <- ExtraTests_CDF[q]
       names(TestPack_CDF)[length(TestPack_CDF)] <- assist[q]
    }
  }

if (length(ExtraTests_PDF) >0){

assist <- names(ExtraTests_PDF)

    for(q in 1:length(ExtraTests_PDF)){
      TestPack_PDF[length(TestPack_PDF)+1] <- ExtraTests_PDF[q]
       names(TestPack_PDF)[length(TestPack_PDF)] <- assist[q]

    }
  }

############################################################
# initialize dataframes to hold the single output of each diagnostic.

TheNames <- names(TestPack_CDF)
      MeanScores   <- data.frame( TheNames = 0)
      MedianScores <- data.frame( TheNames = 0)
for (u in 1:length(TestPack_CDF)){
      TempVector <- rep(0, times = reps)

      #and fill those frames
      for (i in 1:reps){
           TempVector[i] <- TestPack_CDF[[u]](Y = realCDFoutput, est = yourCDFoutput[,i], range = range, gran = gran, eps = eps, data= data)
      }

      MeanScores  [[TheNames[u]]] <- mean  (TempVector)
      MedianScores[[TheNames[u]]] <- median(TempVector)
      allscores   [[TheNames[u]]] <-        TempVector
      }

#####
TheNames <- names(TestPack_PDF)
for (u in 1:length(TestPack_PDF)){
      TempVector <- rep(0, times = reps)
      #and fill those frames
         for (i in 1:reps){

         TempVector[i] <- TestPack_PDF[[u]](Y = realPDFoutput, est = yourPDFoutput[,i], range = range, gran = gran)

         }

      MeanScores  [[TheNames[u]]] <- mean  (TempVector)
      MedianScores[[TheNames[u]]] <- median(TempVector)
      allscores   [[TheNames[u]]] <-        TempVector
}

      allscores$MSEanalytic        <- MSEanalytic

############################################################
# store everything! Including two arbitrary iterations of the dp-CDF function, used for plotting in the the CDFtest graphical ouput
scores <- list()
scores$allscores       <-  allscores


return(scores)
}

###############################################################################
###############################################################################
###############################################################################
#' @title Comprehensively evaluate and visualize the utility of 
#'     CDF-generating implementations.
#' @description The suite is a system for determining the utility
#'     of differentially private cumulative distribution function (DP-CDF)
#'     algorithm implementations. The system can empirically evaluate and 
#'     provide visualizations for several DP-CDF algorithms simultaneously,
#'     under various parameters. It can also be set to focus strictly on data
#'     collection, rather than spending time on visualization. 
#'     
#'     It comes with several pre-loaded adjustable synthetic datasets, and can
#'     also analyze functions on user-defined datasets.
#'     
#'     dpCDF implementations to test must take the following as arguments:
#'     \code{data, epsilon, granularity, range}, and any number of other inputs.
#'     Use "?functionH" for an example of an implementation drawing on C++ files
#'      through Rcpp.
#'     
#'       USERS SHOULD NOTE: the
#'     following included diagnostic functions are under development:
#'     \code{SkewDiffpdf,KurtDiffpdf, StdDiffpdf}, corresponding to error measurements of
#'     skewness, kurtoses, and standard deviations generated from dpCDFs.
#'     This is evident through the occasional result of \code{NA}.
#' @param Visualization Sets the testing suite into Visualization mode (default, 
#'        \code{Visualization = TRUE}) 
#'     or Data Collection mode \code{(Visualization = FALSE)}
  #'  In Visualization mode (default):
  #'    A \code{.csv} file conatining the mean and median results (across \code{reps}
  #'        iterations) of diagnostic functions on DP-CDF algorithms per each 
  #'        combination of data, function, and epsilon. 
  #'    A \code{.pdf} file containing one graphical example DP CDF for each combination
  #'        of dataset, function, and epsilon, as well as a set of boxplots 
  #'        showing the distribution of all diagnostic results for all 
  #'        combinations of parameters.
  #'  In Data Collection mode (set \code{Visualization = FALSE}):
  #'    A \code{.csv} file containing the entire (raw) results (across \code{reps}
  #'    iterations) of diagnostic functions on DP-CDF algorithms
  #'    per each combination of dataset, and function, seperately looped over
  #'    all epsilons, then all granularities, and all samplesizes. 
#' @param OutputDirectory The location of the folder which will hold the
#'    output (\code{.csv} and \code{.pdf} files). This defaults to the
#'    \code{tempdir()} directory.
#' @param functlist A list of CDF-computing functions to be tested on the
#'    \code{CDFtestTrack} (if \code{visualization = TRUE}) or \code{CDFtestTrackx}
#'    (if \code{Visualization =FALSE})) 
#' @param Fnameslist A vector of function names corresponding to the functions
#' @param epslist  A vector of epsilon values for differential privacy
#' @param datalist A list containing vectors of data, each to be used in a test
#' @param Dnameslist A list of dataset names corresponding to the data/variables 
#'   being tested; used for labelling the output
#' @param SmoothAll Applies L2 monotnocity post-processing to every DP-CDF
#' @param synthsets This script generates pre-defined synthetic datasets upon 
#'  request, and fully incorporates them into testing. To call them, users
#'  should input a string vector containing the names of the sets they desire.
#'    For example, \code{synthsets = list(list(type,size,shape),list(type,size,shape))}.
#'    There are no limits on the amounts of datasets included.
#'    Sets available include:
#'       type:  \code{"age"} (which ranges from about 0 to 100, \code{gran =1})
#'       and \code{"wage"} which ranges from 0 to 500k);
#'       size:  Any positive integer. Type in exact numerical representation
#'         (eg, for ten thousand use 10000 not 10k and not 10^4);
#'       shape: gaussian, sparse, uniform, bimodal;
#'       It is assumed that the data input is rounded to the granularity
#' @param range The range of the domain as a vector \code{c(min, max)}. Defined based 
#'   on user intuition. to preserve differential privacy, the domain is constructed
#'   using this range. Setting the min too high will bias output upward. 
#'   Same in reverse for a low max. However, setting min too low and max 
#'   too high could reveal the true limits of your data,
#'   compromising some privacy.
#' @param gran FOR Visualization MODE ONLY. refer to \code{granlist} for setting 
#'   granularities (thus domain sizes) in Data Collection mode.
#'   This command is irrelevant in Data Collection mode.          
#'   The granularity of the domain between the min and max. ie, if age is
#'   measureds per 1 year of age, \code{gran =1}.
#'   The same granularity is applied to all datasets, so using comparable 
#'   (or scaled) data is necessary.
#' @param granlist FOR Data Collection MODE ONLY. refer to \code{gran} for selecting
#'   samplesizes in Data Collection mode. This command is irrelevant in
#'   Visualization mode. A list of granularities of the domain between the
#'   min and max. ie, if age is measure per 1 year of age, \code{gran =1}.
#' @param samplesize FOR Visualization MODE ONLY. refer to \code{nlist} for selecting
#'   samplesizes in Data Collection mode. This command is irrelevant in 
#'   Data Collection mode. when set to zero, the entire dataset is used.
#'   Otherwise, the specified sample size is randomly selected from each dataset
#'   without replacement. 
#' @param nlist FOR Data Collection MODE ONLY. refer to \code{samplesize} for 
#'   selecting samplesizes in visualization mode. This command is irrelevant in
#'   Visualization mode. Sets the absolute sample sizes to draw from each
#'   dataset, with replacement. Any vector of integer values is appropriate.
#' @param cdfstep The step size used in outputting the approximate CDF;
#' @param reps  The number of times to repeat each diagnostic. higher \code{reps} lends
#'   greater accuracy, but comsumes time and power. Author recommends \code{reps = 10}
#'   for quick examples and \code{reps = 100} for more robust examinations.
#' @param ExtraTests_CDF If a user wishes to add extra diagnostics, the proper
#    syntax would be:
#'     \code{ExtraTests_CDF = list(functionName1=function1, functionName2=function2)}.  
#'    Diagnostic Functions should have inputs such as \code{Y} for a public CDF,
#'     \code{est} for a DP-representation of that CDF,
#'     \code{range} and \code{gran}, and the output should be just one value.
#' @param ExtraTests_PDF See above
#' @param setseed In the function, each combination of data, epsilon, and
#'   function is executed with a separate seed, which by default is randomly
#'   generated and reported. Users interested in replicating specific results
#'   can locate the reported seed and parameter combination to replicate tests.
#' @param comments "Comments written here print to a log in excel" 
#' @param EmpiricBounds FOR Visualization MODE ONLY. When TRUE, outputted graphs
#'   depict the minimum and maximum values taken by each bin across reps
#' @param AnalyticBounds FOR Visualization MODE ONLY. This is a flag and should
#'   be set to \code{TRUE} if the functions being tested are expected to output 
#'  analytical variance bounds. The proper output form for such a function is 
#'   \code{output = list(DPCDFvector, LowerBoundVector, UpperBoundVector)}.
#' @param AnalyticProbSleeve FOR Visualization MODE ONLY. When \code{TRUE}, outputted
#'   DP-CDFs will have a 'fuzzy' analytic sleeve around them, approximating
#'   probabalitity density for each point given by DP. This also requires the
#'   function format specified above in the description for \code{AnalyticBounds}.
#' @param SuppressRealCDF FOR Visualization MODE ONLY. When \code{TRUE}, outputted
#'   graphs will not include real (non-private) CDFs.
#' @param SuppressDPCDF   FOR Visualization MODE ONLY. When \code{TRUE}, outputted
#'   graphs will not include DP-CDFs (but if \code{SmoothAll = TRUE}, monotonized
#'   DP CDFs still appear).
#' @param SuppressLegends FOR Visualization MODE ONLY. When \code{TRUE}, outputted
#'   graphs will not include legends
#' @param ... Optionally add additional parameters. This is primarily used to allow automated
#'   execution of varied diagnostic functions.
#' @export
#' @return If \code{Visualization = TRUE}, a list containing:                        
#'
#'  ...\code{$means} Contains mean diagnostic
#'  results for each diagnostic across reps iterations for each parameter combination;
#'  
#' ..\code{$medians} Contains median diagnostic 
#'  results for each diagnostic across reps iterations for each parameter combination;\\
#'
#'...\code{$yourCDFoutput} Containing a single dpCDF iteration for each parameter combination;\\
#'
#'...\code{$yourPDFoutput}  Containing a single dpPDF iteration for each parameter combination;\\
#'
#'...\code{$realCDFoutput} Containing the real (non-DP) CDF output for each relevant parameter combination;
#'
#'...\code{$realPDFoutput} Containing the real (non-DP) PDF output for each relevant parameter combination;
#'
#'...\code{$databins} Containing the domain used to construct the CDFs;
#'
#'...\code{$TestPack_CDF} Containing the definitions of diagnostic functions used on dpCDFs;
#'
#'...\code{$TestPack_PDF} Containing the definitions of diagnostic functions used on dpPDFs;
#'
#'...\code{$allscores} Containing all raw diagnostic output.
#'
#'...\code{$seed} Containing the list of seeds used in the test
#'
#'...\code{$permetric} holding a rearranged dataframe (ordered by parameter
#'                   combinations) useful for plotting.
#'  
#'  A \code{.pdf} file:
#'      with boxplots showing the distributions of diagnostic outputs, 
#'      and categorized plots of dp-CDF function output. Each such graph with
#'      show one arbitrary CDF iterations and empirical boundaries.
#'      the empirical boundaries are the max and min values reached by that
#'      function (and parameters) during the test.
#'              
#'   A \code{.csv} file:
#'      containing the mean and median scores of each diagnostic on each 
#'      combination of data, eps, function, and the seedlist for reproduction.
#'
#'  Notes on Visualization mode: Both the \code{.pdf} and \code{.csv} components are named with a time stamp index,
#'  in the form of \code{YearMonthDayHourMinuteSecond}. To locate particular tests,
#'  look at the \code{CDFtestindexchart.csv}, which automatically records the
#'  parameters and index of each test. These can be found in the file specified by 
#'  \code{OutputDirectory}, which defaults to the R temp files \code{tempdir()}.
#'  
#'  Alternatively in Data Collection mode (\code{Visualization = FALSE}), a list containing:
#'
#'      ...\code{$allscores} holding the output of each combination of parameters,
#'      which is that each \code{eps} in epslist is varied across the first value
#'      specified in \code{granlist} and \code{nlist}. The same is true for varying
#'      granularity and sample size. In that way, only one variable is varied at a time
#'      while the other two are held fixed. All such combinations of parameters are
#'      executed on all combinations of data and function (specified within       
#'      ...\code{datalist} and \code{functlist});
#'      
#'      ...\code{$seed} holding the list of seeds used in the test.
#'      
#'    A \code{.csv} file conatining the entire (raw) results (across reps iterations) 
#'    of diagnostic functions on DP-CDF algorithms per each combination of 
#'    dataset, and function, looped over epsilon, granularity, and sample size values
#'    as described directly above.\\
#'    This mode was designed for collecting metric data for subsequent supervised 
#'    learning modelling. 
#'
#' @examples 
#' CDFtest( Visualization = TRUE,OutputDirectory = 0, functlist = c(functionH),
#' Fnameslist = c("H"), epslist  = c(.1, .01), datalist = list(),
#' Dnameslist = c(), synthsets= list(list("wage", 100000, "uniform"), 
#'  list("wage",100000,"sparse"), list("wage",100000,"bimodal")),
#'  range    = c(1,500000),gran =1000,granlist =c(2500,1250,1000,500), 
#'  samplesize = 0,nlist = c(100,1000,10000,100000,1000000),
#'  cdfstep  =0, reps = 5,  ExtraTests_CDF = list(),ExtraTests_PDF = list(),
#'  setseed = c(-100),
#'  comments = "x",SmoothAll = FALSE,EmpiricBounds = FALSE,
#'  AnalyticBounds = FALSE,AnalyticProbSleeve = FALSE,
#'  SuppressRealCDF = FALSE,SuppressDPCDF = FALSE,SuppressLegends = FALSE)

CDFtest <- function(
 Visualization = TRUE,  OutputDirectory= 0, 
 functlist, Fnameslist, epslist = c(.05,.1,1),
 datalist,  Dnameslist, synthsets =NULL, range, gran = 1, granlist = c(1), samplesize =0, nlist = (10000), cdfstep =1, 
 reps = 5,  ExtraTests_CDF = list(), ExtraTests_PDF = list(), setseed = -100,  comments="none",
 SmoothAll = FALSE,       EmpiricBounds = FALSE, AnalyticBounds  = FALSE,  AnalyticProbSleeve = FALSE, 
 SuppressRealCDF = FALSE, SuppressDPCDF = FALSE, SuppressLegends = FALSE,... ){


if(OutputDirectory == 0){
  print("No output directory specified; plotting to R temp directory:")
  OutputDirectory <- tempdir()
}else{
  print("Plotting to the following directory:")
}
print(OutputDirectory)

if(Visualization){
print("Using Visualization Mode")
}else{
  print("Using Data Collection Mode")
}

# stops
if (length(epslist)<1){
  print("STOP: At least one epsilon value required")
  return("function stopped")
}
for(w in 1:length(epslist)){
  if (epslist[w] <= 0| is.numeric(epslist[w]) == FALSE){
    print("STOP: Epsilon values must be positive.")
    return("function stopped")
  } 
}

if(range[1] > range[2]){
    print("STOP: range[1] value must be the minimum observation value; range[2] is the maximum.")
    return("function stopped")
}


if (cdfstep == 0){
print("cdfstep set to 0, using maximum resolution (cdfstep = gran)")
cdfstep <- gran
}
else if (Visualization){
 if(cdfstep < gran){
    print("STOP: cdfstep must be greater than gran, or set to 0 to equal gran.")
    return("function stopped")
}}
else{
  for(z in 1:length(granlist)){
    if (cdfstep < granlist[z]){
    print("STOP: cdfstep must always be greater than gran, or set to 0 to equal gran.")
    return("function stopped")
    }
  }
}

if (reps<1 | reps%%1 != 0){
    print("STOP: reps must be a positive integer")
    return("function stopped")
}

if(Visualization){
if (samplesize < 0){
    print("STOP: samplesize must be a positive integer, or 0 to use the full data")
    return("function stopped")
}
else if (samplesize == 0){
  print("samplesize set to 0, using entire dataset")
}
}else{

  for (z in 1:length(nlist)){
    if (nlist[z] <1){
          print("STOP: samplesize must be a positive integer, or 0 to use the full data")
    return("function stopped")
   }
  }

}
################## SAMPLE THE DATASETS #######################

if(length(datalist)!=0){

  if(samplesize != 0){
      for(d in 1:length(datalist)){
        datalist[[d]]<- sample(datalist[[d]],samplesize)
      }
    }
 
 dataSizeNaming<- list()
for(d in 1:length(datalist)){
  dataSizeNaming[d] <- Abbrev(length(datalist[[d]]))
         Dnameslist[d]     <- paste(Dnameslist[d],dataSizeNaming[d], sep = ".")
  }
 }
######################### CREATE SYNTHETIC DATASETS ###############################

if(length(synthsets)!=0){

 for (s in 1:length(synthsets)){

   #########GAUSSIAN-SHAPED##########

   if(!is.null(synthsets[[s]]) && synthsets[[s]][3] == "gaussian"){
      if (synthsets[[s]][1] == "age"){
        set.seed=100
        NewData <- round(rnorm(as.numeric(synthsets[[s]][2]),38,25),0)
        }

      if (synthsets[[s]][1] == "wage"){
        set.seed=100
        NewData <- round(rnorm(as.numeric(synthsets[[s]][2]),220000,100000),-3)
        }
      datalist[[length(datalist)+1]]       <-  NewData

      synthsets[[s]][2] <-Abbrev(synthsets[[s]][2])

      Dnameslist[length(Dnameslist)+1]     <- paste(synthsets[[s]][1],synthsets[[s]][2],synthsets[[s]][3], sep = ".")
   }

   #########UNIFORMLY-SHAPED##########

   if(!is.null(synthsets[[s]]) && synthsets[[s]][3] == "uniform"){
      if (synthsets[[s]][1] == "age"){  
           databins       <- seq(from=(range[1]), to=(range[2]), by=gran)
           tool           <- floor(as.numeric(synthsets[[s]][2])/(range[2]-range[1]+1))
           NewData <- rep(databins, times=tool)
        }
      if (synthsets[[s]][1] == "wage"){
           databins       <- seq(from=(range[1]), to=(range[2]), by=gran)
           tool           <- floor(as.numeric(synthsets[[s]][2])/((range[2]/gran)-((range[1]/gran)+1)))
           NewData <- rep(databins, times=tool)
        }
      datalist[[length(datalist)+1]]       <- NewData

               synthsets[[s]][2] <-Abbrev(synthsets[[s]][2])
                
      Dnameslist[length(Dnameslist)+1]     <- paste(synthsets[[s]][1],synthsets[[s]][2],synthsets[[s]][3], sep=".")
   }

    #########BIMODALLY-SHAPED##########

   if(!is.null(synthsets[[s]]) && synthsets[[s]][3] == "bimodal"){
           if (synthsets[[s]][1] == "age"){
           set.seed=100
           A <- round(rnorm(as.numeric(synthsets[[s]][2])/2,20,10),0)
           set.seed=100
           B <- round(rnorm(as.numeric(synthsets[[s]][2])/2,60,10),0)
           NewData <- c(A,B)
        }
      if (synthsets[[s]][1] == "wage"){

           set.seed=100
           A <- round(rnorm(as.numeric(synthsets[[s]][2]),100000,35000),-3)
           set.seed=100
           B <- round(rnorm(as.numeric(synthsets[[s]][2]),300000,35000),-3)
           NewData <- c(A,B)
        }
      datalist [[length(datalist)  +1]]    <- NewData

             synthsets[[s]][2] <-Abbrev(synthsets[[s]][2])
      Dnameslist[length(Dnameslist)+1]     <- paste(synthsets[[s]][1],synthsets[[s]][2],synthsets[[s]][3], sep=".")
   }
   #########SPARSE##########

   if(!is.null(synthsets[[s]]) && synthsets[[s]][3] == "sparse"){
      if (synthsets[[s]][1] == "age"){

        agerep   <- c(10,11,13,20,21,35,60,61,65)
        timesrep <- round(c(2500,2000,5000,7000,3500,3000,5000,10000,8000)*(35/46),0)
        rand     <- c(15000,30)
        set.seed=100
        NewData <- rpois(round( rand[1]*(as.numeric(synthsets[[s]][2])/50000) ,0),rand[2])
           for(w in 1:9){
           NewData <- c(NewData, rep(agerep[w], times= round((timesrep[w] * (as.numeric(synthsets[[s]][2])/50000)),0)))
          }
         }

      if (synthsets[[s]][1] == "wage"){
       
        wagerep <- c(0,6000,10000,12000,14000,19000,20000,23000,25000,26000,30000,32000,35000,40000,42000,46000,47000,49000,
             50000,53000,60000,65000, 67000, 70000, 75000,80000,86000,87000,92000,95000, 97000,100000,105000, 
             110000,120000,125000,130000,135000,140000,145000,150000,160000, 170000,172000,175000, 180000,190000,200000,211000,220000,225000,230000,240000,250000,260000, 265000, 275000,300000,310000,
             330000,350000,360000,375000, 3850000, 400000)
        timesrep<- c(1000,200,50,2600,300,100,50,50,300,150,150,25,1400,3000,1050,50,100,1750,5600,50,50,1500,900,100,900,250,
              1250,50,50,100,2000,500,25,50,3500,300,200,25,50,1050,50,4000,100,50,150,125,100,1250,2100, 25, 1000, 50,
              1350, 2925, 100, 25, 850, 2050, 100, 75, 25, 25, 2600, 25, 25)
        rand    <- c(5000,200000)
        set.seed=100
       NewData <- rpois(round( rand[1]*(as.numeric(synthsets[[s]][2])/50000) ,0),rand[2])
           for(w in 1:length(wagerep)){
           NewData <- c(NewData, rep(wagerep[w], times= round((timesrep[w] * (as.numeric(synthsets[[s]][2])/50000)),0)))
          }
         }
      datalist[[length(datalist)+1]]       <- NewData

              synthsets[[s]][2] <-Abbrev(synthsets[[s]][2])
      Dnameslist[length(Dnameslist)+1]     <- paste(synthsets[[s]][1],synthsets[[s]][2],synthsets[[s]][3])
   }
}
}

##################################################################################################################################
if (Visualization == FALSE){

  # this takes the given seedlist
seedlist <- setseed  

# if you're looking for a random seed, this will overwrite the 
suppressWarnings

  if (setseed[1] == -100){
          seedlist <- round(runif((length(epslist) * length(functlist)* length(datalist)) + 
                                  (length(granlist)* length(functlist)* length(datalist)) +
                                  (length(nlist)   * length(functlist)* length(datalist)), min=0, max=1000),0)
            }
set.seed(seedlist)

MainOutput <- matrix(nrow = (((length(datalist))*length(granlist)*length(functlist)) +
                             ((length(datalist))*length(epslist)* length(functlist)) + 
                             ((length(datalist))*length(nlist)*   length(functlist)))  *reps,
                     ncol = 0)
MainOutput <- data.frame(MainOutput)

  index <- 1-reps
  # run the function for every CDF function
  for (f in 1:length(functlist)){
  print(paste("__this is functlist",f))

    # run the function for every dataset
    for(d in 1:length(datalist)){
    datalist[[d]] <- as.numeric(lapply(datalist[[d]], function(x) MovetoRange(x, range)))
    print(paste("____this is datalist",d))

      # run the function for every epsilon, hold dSize and n fixed
      for(i in 1:length(epslist)){
      print(paste("_______this is epslist",i))  
   #   index <-  ((f-1)*length(epslist)*length(datalist) + (d-1)*length(epslist)+i + counter)
index <- index+(1*reps)

     dSize <- ceiling(((range[2]-range[1])/1)+1)
     MainOutput$imp  [index:(index+reps-1)] <- Fnameslist[f] 
     MainOutput$eps  [index:(index+reps-1)] <- epslist[i]
     MainOutput$n    [index:(index+reps-1)] <- nlist[1]
     MainOutput$dSize[index:(index+reps-1)] <- dSize
     MainOutput$data [index:(index+reps-1)] <- Dnameslist[d]
     MainOutput$gran [index:(index+reps-1)] <- granlist[1]
     MainOutput$min  [index:(index+reps-1)] <- range[1]
     MainOutput$max  [index:(index+reps-1)] <- range[2]
     MainOutput$seed [index:(index+reps-1)] <- seedlist[index]

          scores1<- CDFtestTrackx(funct = functlist[[f]],
                                eps= epslist[i],
                                data=datalist[[d]],
                                range,
                                gran = granlist[1],
                                samplesize = nlist[1],
                                reps,
                                SmoothAll,
                                ExtraTests_CDF = ExtraTests_CDF,
                                ExtraTests_PDF = ExtraTests_PDF
                                )
       #   MainOutput$Mean          [index:(index+reps-1)] <-scores1$allscores$Mean
          MainOutput$MSEanalytic    [index:(index+reps-1)] <-scores1$allscores$MSEanalytic
          MainOutput$SDempiric     [index:(index+reps-1)] <-scores1$allscores$SDempiric
          MainOutput$MaxError_CDF  [index:(index+reps-1)] <-scores1$allscores$MaxError_CDF 
          MainOutput$MaxErrorAt_CDF[index:(index+reps-1)] <-scores1$allscores$MaxErrorAt_CDF
          MainOutput$diffat25      [index:(index+reps-1)] <-scores1$allscores$diffat25
          MainOutput$diffatMedian  [index:(index+reps-1)] <-scores1$allscores$diffatMedian 
          MainOutput$diffat75      [index:(index+reps-1)] <-scores1$allscores$diffat75
          MainOutput$horzdiffat25  [index:(index+reps-1)] <-scores1$allscores$horzdiffat25
          MainOutput$horzdiffatMed [index:(index+reps-1)] <-scores1$allscores$horzdiffatMed 
          MainOutput$horzdiffat75  [index:(index+reps-1)] <-scores1$allscores$horzdiffat75
          MainOutput$Medians       [index:(index+reps-1)] <-scores1$allscores$Medians
          MainOutput$MAE_CDF       [index:(index+reps-1)] <-scores1$allscores$MAE_CDF
          MainOutput$MSE_CDF       [index:(index+reps-1)] <-scores1$allscores$MSE
       #   MainOutput$DerivScore    [index:(index+reps-1)] <-scores1$allscores$DerivScore
          MainOutput$MaxError_PDF  [index:(index+reps-1)] <-scores1$allscores$MaxError_PDF 
          MainOutput$MaxErrorAt_PDF[index:(index+reps-1)] <-scores1$allscores$MaxErrorAt_PDF
          MainOutput$MAE_PDF       [index:(index+reps-1)] <-scores1$allscores$L1_area_PDF
          MainOutput$MSEanalytic   [index:(index+reps-1)] <-scores1$allscores$MSEanalytic
          MainOutput$MeanDiff      [index:(index+reps-1)] <-scores1$allscores$MeanDiff
          MainOutput$ModeDiff      [index:(index+reps-1)] <-scores1$allscores$ModeDiff                     
          MainOutput$StdDiff       [index:(index+reps-1)] <-scores1$allscores$StdDiff                     
          MainOutput$VarDiff       [index:(index+reps-1)] <-scores1$allscores$VarDiff                     
          MainOutput$SkewDiff      [index:(index+reps-1)] <-scores1$allscores$SkewDiff                     
          MainOutput$KurtDiff      [index:(index+reps-1)] <-scores1$allscores$KurtDiff                                    
}

#     # run the function for every n, hold eps and dSize fixed
    for(i in 1:length(nlist)){
    print(paste("_______this is nlist",i))

index <- index+(1*reps)
     dSize <- ceiling(((range[2]-range[1])/1)+1)
     MainOutput$imp  [index:(index+reps-1)] <- Fnameslist[f] 
     MainOutput$eps  [index:(index+reps-1)] <- epslist[1]  
     MainOutput$n    [index:(index+reps-1)] <- nlist[i]
     MainOutput$dSize[index:(index+reps-1)] <- dSize
     MainOutput$data [index:(index+reps-1)] <- Dnameslist[d] 
     MainOutput$gran [index:(index+reps-1)] <- granlist[1] 
     MainOutput$min  [index:(index+reps-1)] <- range[1]
     MainOutput$max  [index:(index+reps-1)] <- range[2]
     MainOutput$seed [index:(index+reps-1)] <- seedlist[index]

          scores2<- CDFtestTrackx(funct = functlist[[f]],
                                eps= epslist[1],
                                data=datalist[[d]], 
                                range, 
                                gran <- granlist[1],
                                samplesize = nlist[i],
                                reps,
                                SmoothAll,
                                ExtraTests_CDF = ExtraTests_CDF,
                                ExtraTests_PDF = ExtraTests_PDF
                                )
       #   MainOutput$Mean          [index:(index+reps-1)] <-scores2$allscores$Mean
          MainOutput$MSEanalytic    [index:(index+reps-1)] <-scores2$allscores$MSEanalytic
          MainOutput$SDempiric     [index:(index+reps-1)] <-scores2$allscores$SDempiric
          MainOutput$MaxError_CDF  [index:(index+reps-1)] <-scores2$allscores$MaxError_CDF 
          MainOutput$MaxErrorAt_CDF[index:(index+reps-1)] <-scores2$allscores$MaxErrorAt_CDF
          MainOutput$diffat25      [index:(index+reps-1)] <-scores2$allscores$diffat25
          MainOutput$diffatMedian  [index:(index+reps-1)] <-scores2$allscores$diffatMedian 
          MainOutput$diffat75      [index:(index+reps-1)] <-scores2$allscores$diffat75
          MainOutput$horzdiffat25  [index:(index+reps-1)] <-scores2$allscores$horzdiffat25
          MainOutput$horzdiffatMed [index:(index+reps-1)] <-scores2$allscores$horzdiffatMed 
          MainOutput$horzdiffat75  [index:(index+reps-1)] <-scores2$allscores$horzdiffat75
          MainOutput$Medians       [index:(index+reps-1)] <-scores2$allscores$Medians
          MainOutput$MAE_CDF       [index:(index+reps-1)] <-scores2$allscores$MAE_CDF
          MainOutput$MSE_CDF       [index:(index+reps-1)] <-scores2$allscores$MSE
       #   MainOutput$DerivScore    [index:(index+reps-1)] <-scores2$allscores$DerivScore
          MainOutput$MaxError_PDF  [index:(index+reps-1)] <-scores2$allscores$MaxError_PDF 
          MainOutput$MaxErrorAt_PDF[index:(index+reps-1)] <-scores2$allscores$MaxErrorAt_PDF
          MainOutput$MAE_PDF       [index:(index+reps-1)] <-scores2$allscores$L1_area_PDF
          MainOutput$MSEanalytic   [index:(index+reps-1)] <-scores2$allscores$MSEanalytic
          MainOutput$MeanDiff      [index:(index+reps-1)] <-scores2$allscores$MeanDiff
          MainOutput$ModeDiff      [index:(index+reps-1)] <-scores2$allscores$ModeDiff                     
          MainOutput$StdDiff       [index:(index+reps-1)] <-scores2$allscores$StdDiff                     
          MainOutput$VarDiff       [index:(index+reps-1)] <-scores2$allscores$VarDiff                     
          MainOutput$SkewDiff      [index:(index+reps-1)] <-scores2$allscores$SkewDiff                     
          MainOutput$KurtDiff      [index:(index+reps-1)] <-scores2$allscores$KurtDiff      
           
}

    # run the function for every dSize, hold eps and n fixed
    for(i in 1:length(granlist)){
    print(paste("_______this is granlist",i))


index <- index+(1*reps)
     dSize <- ceiling(((range[2]-range[1])/granlist[i])+1)
     MainOutput$imp  [index:(index+reps-1)] <- Fnameslist[f] 
     MainOutput$eps  [index:(index+reps-1)] <- epslist[1]  
     MainOutput$n    [index:(index+reps-1)] <- nlist[1]
     MainOutput$dSize[index:(index+reps-1)] <- dSize
     MainOutput$data [index:(index+reps-1)] <- Dnameslist[d] 
     MainOutput$gran [index:(index+reps-1)] <- granlist[i]
     MainOutput$min  [index:(index+reps-1)] <- range[1]
     MainOutput$max  [index:(index+reps-1)] <- range[2]
     MainOutput$seed [index:(index+reps-1)] <- seedlist[index]

          scores3<- CDFtestTrackx(funct = functlist[[f]],
                                eps= epslist[1],
                                data=datalist[[d]], 
                                range, 
                                gran= granlist[i], 
                                samplesize = nlist[1],
                                reps,
                                SmoothAll,
                                ExtraTests_CDF = ExtraTests_CDF,
                                ExtraTests_PDF = ExtraTests_PDF
                                )
       #   MainOutput$Mean          [index:(index+reps-1)] <-scores3$allscores$Mean
          MainOutput$MSEanalytic    [index:(index+reps-1)] <-scores3$allscores$MSEanalytic
          MainOutput$SDempiric     [index:(index+reps-1)] <-scores3$allscores$SDempiric
          MainOutput$MaxError_CDF  [index:(index+reps-1)] <-scores3$allscores$MaxError_CDF 
          MainOutput$MaxErrorAt_CDF[index:(index+reps-1)] <-scores3$allscores$MaxErrorAt_CDF
          MainOutput$diffat25      [index:(index+reps-1)] <-scores3$allscores$diffat25
          MainOutput$diffatMedian  [index:(index+reps-1)] <-scores3$allscores$diffatMedian 
          MainOutput$diffat75      [index:(index+reps-1)] <-scores3$allscores$diffat75
          MainOutput$horzdiffat25  [index:(index+reps-1)] <-scores3$allscores$horzdiffat25
          MainOutput$horzdiffatMed [index:(index+reps-1)] <-scores3$allscores$horzdiffatMed 
          MainOutput$horzdiffat75  [index:(index+reps-1)] <-scores3$allscores$horzdiffat75
          MainOutput$Medians       [index:(index+reps-1)] <-scores3$allscores$Medians
          MainOutput$MAE_CDF       [index:(index+reps-1)] <-scores3$allscores$MAE_CDF
          MainOutput$MSE_CDF       [index:(index+reps-1)] <-scores3$allscores$MSE
        #  MainOutput$L1empiric     [index:(index+reps-1)] <-scores3$allscores$L1emp
        #  MainOutput$L2empiric     [index:(index+reps-1)] <-scores3$allscores$L2emp
        #  MainOutput$DerivScore    [index:(index+reps-1)] <-scores3$allscores$DerivScore
          MainOutput$MaxError_PDF  [index:(index+reps-1)] <-scores3$allscores$MaxError_PDF 
          MainOutput$MaxErrorAt_PDF[index:(index+reps-1)] <-scores3$allscores$MaxErrorAt_PDF
          MainOutput$MAE_PDF       [index:(index+reps-1)] <-scores3$allscores$L1_area_PDF
          MainOutput$MSEanalytic   [index:(index+reps-1)] <-scores3$allscores$MSEanalytic
          MainOutput$MeanDiff      [index:(index+reps-1)] <-scores3$allscores$MeanDiff
          MainOutput$ModeDiff      [index:(index+reps-1)] <-scores3$allscores$ModeDiff                     
          MainOutput$StdDiff       [index:(index+reps-1)] <-scores3$allscores$StdDiff                     
          MainOutput$VarDiff       [index:(index+reps-1)] <-scores3$allscores$VarDiff                     
          MainOutput$SkewDiff      [index:(index+reps-1)] <-scores3$allscores$SkewDiff                     
          MainOutput$KurtDiff      [index:(index+reps-1)] <-scores3$allscores$KurtDiff      
         
}
}}
MainOutput$comments<- comments

########################## BEGIN PRINTING ##############################
 
  ##### Excel Files w/ means and medians of diagnostic results #####

            t <- Sys.time()
            timestamp <-as.POSIXlt(t)
            timeindex <-strftime(t, "%Y%m%d%H%M%S")

            write.table(MainOutput,
                  paste( OutputDirectory,"/DataCollector",timeindex,".csv", sep =""), row.names=FALSE, col.names=TRUE,  sep=",")

            maxlength <- max(length(datalist), length(epslist), length(functlist), length(range), length(granlist), length(nlist), length(setseed))

            forchart= data.frame( "index"         = c(timeindex, rep(" ",times= maxlength-1)),
                                  "functions"     = c(Fnameslist,rep(" ",times= maxlength-(length(Fnameslist)))),
                                  "datasets"      = c(Dnameslist,rep(" ",times= maxlength-(length(Dnameslist)))),
                                  "epsilons"      = c(epslist,   rep(" ",times= maxlength-(length(epslist)))),
                                  "range"         = c(range,     rep(" ",times= maxlength-2)),
                                  "granularities" = c(granlist,  rep(" ",times= maxlength-length(granlist))),
                                  "nlist"         = c(nlist,     rep(" ",times= maxlength-length(nlist))),
                                  "cdfstep"       = c(cdfstep,   rep(" ",times= maxlength-1)),               
                                  "repetitions"   = c(reps,      rep(" ",times= maxlength-1)),
                                  "SmoothAll"     = c(SmoothAll, rep(" ",times= maxlength-1)),
                                  "setseed"       = c(setseed,   rep(" ",times= maxlength-length(setseed))), 
                                  "comments"      = c(comments,  rep(" ",times= maxlength-1))
                                 )          
            suppressWarnings(
            write.table(forchart,   
                 paste(OutputDirectory,"/CDFtestindexchart.csv", sep =""),    row.names=FALSE, col.names=TRUE,  sep=",", append=TRUE)
            )


return(MainOutput)
}else{

##################################################################################################################################

# this takes the given seedlist
seedlist <- setseed  

# if you're looking for a random seed, this will overwrite the 
suppressWarnings

  if (setseed[1] == -100){
          seedlist <- round(runif((length(epslist)*length(functlist)* length(datalist)), min=0, max=1000),0)
            }


set.seed(seedlist)



universe         <- seq(from=(range[1]), to=(range[2]), by=gran)
databins         <- seq(from=(range[1]), to=(range[2]), by=cdfstep)
yourCDF          <- matrix(nrow=length(databins), ncol= (length(epslist)*length(functlist)* length(datalist)))
realCDF          <- matrix(nrow=length(databins), ncol=length(datalist))

 if (EmpiricBounds){
        yourCDF_LboundE  <- matrix(nrow=length(databins), ncol= (length(epslist)*length(functlist)* length(datalist)))
        yourCDF_UboundE  <- matrix(nrow=length(databins), ncol= (length(epslist)*length(functlist)* length(datalist)))
    }
 if (AnalyticBounds|AnalyticProbSleeve){
        yourCDF_LboundA  <- matrix(nrow=length(databins), ncol= (length(epslist)*length(functlist)* length(datalist)))
        yourCDF_UboundA  <- matrix(nrow=length(databins), ncol= (length(epslist)*length(functlist)* length(datalist)))
    }

yourPDF        <- matrix(nrow=length(databins), ncol= (length(epslist)*length(functlist)* length(datalist)))
realPDF        <- matrix(nrow=length(databins), ncol=length(datalist))

# run the function for every dataset
for(d in 1:length(datalist)){


datalist[[d]] <- as.numeric(lapply(datalist[[d]], function(x) MovetoRange(x, range)))
print(paste("this is datalist",d))

    # run the function for every epsilon
    for(i in 1:length(epslist)){
    print(paste("this is epslist",i))

        # run the function for every CDF function
        for (f in 1:length(functlist)){
        print(paste("this is functlist",f))

            Z <- FALSE
               if(AnalyticBounds|AnalyticProbSleeve){
            Z <- TRUE
   } 
          scores<- CDFtestTrack(funct = functlist[[f]],
                                eps= epslist[i],
                                cdfstep, 
                                data=datalist[[d]], 
                                range, 
                                gran, 
                                samplesize = samplesize,
                                reps,
                                SmoothAll,
                                ABounds = Z,
                                EmpiricBounds   = EmpiricBounds,
                                ExtraTests_CDF, 
                                ExtraTests_PDF
                                )
            # extract everything from the output list. 
                meansBase <- data.frame (  blank    = 0, 
                                           blank    = 0, 
                                           blank    = 0, 
                                           blank    = 0,
                                           blank    = 0,
                                           blank    = 0,
                                           blank    = 0,
                                           blank    = 0
                                     )
                mediansBase <- data.frame (blank    = 0, 
                                           blank    = 0, 
                                           blank    = 0, 
                                           blank    = 0,
                                           blank    = 0,
                                           blank    = 0,
                                           blank    = 0,
                                           blank    = 0
                                     ) 

                scores$meanscores   <- c(meansBase,   scores$meanscores  )
                scores$medianscores <- c(mediansBase, scores$medianscores)
          if ( (f + d + i) == 3){   

                means <- data.frame (  stat          = 0, 
                                       seed          = 0, 
                                       dataset       = 0, 
                                       epsilon       = 0, 
                                       functionName  = 0,
                                       n             = 0,
                                       domainSize    = 0,
                                       minValue      = 0,
                                       maxValue      = 0
                                     )
                medians <- data.frame (stat          = 0, 
                                       seed          = 0, 
                                       dataset       = 0, 
                                       epsilon       = 0, 
                                       functionName  = 0,
                                       n             = 0,
                                       domainSize    = 0,
                                       minValue      = 0,
                                       maxValue      = 0
                                     )

                    # create objects to store outputs
                    for (p in 1:length(scores$TestPack_CDF)){
                    means[p+9] <- 0
                    colnames(means)[p+9]   <- names(scores$TestPack_CDF[p])
                    medians[p+9] <- 0
                    colnames(medians)[p+9] <- names(scores$TestPack_CDF[p])
                    }

                    # create objects to store outputs 
                    for (p in 1:length(scores$TestPack_PDF)){
                    means[p+9+length(scores$TestPack_CDF)]   <- 0
                    colnames(means)[p+9+length(scores$TestPack_CDF)]   <- names(scores$TestPack_PDF[ p ])
                    medians[p+9+length(scores$TestPack_CDF)] <- 0
                    colnames(medians)[p+9+length(scores$TestPack_CDF)]   <- names(scores$TestPack_PDF[ p ])
                    }

          }

      index <-  ((d-1)*length(epslist)*length(functlist) + (i-1)*length(functlist)+f)

            means  [index, ]     <- scores$meanscores
            means  [index,9]     <- range[2]
            means  [index,8]     <- range[1]
            means  [index,7]     <- length(seq(range[1], range[2],gran))
            means  [index,6]     <- length(datalist[[d]])
            means  [index,5]     <- Fnameslist[f]
            means  [index,4]     <- epslist   [i]
            means  [index,3]     <- Dnameslist[d]
            means  [index,2]     <- seedlist[index]
            means  [index,1]     <- "means"

            medians  [index, ]  <- scores$medianscores
            medians  [index,9]     <- range[2]
            medians  [index,8]     <- range[1]
            medians  [index,7]     <- length(seq(range[1], range[2],gran))
            medians  [index,6]     <- length(datalist[[d]])
            medians  [index,5]     <- Fnameslist[f]
            medians  [index,4]     <- epslist   [i]
            medians  [index,3]     <- Dnameslist[d]
            medians  [index,2]     <- seedlist[index]
            medians  [index,1]     <- "medians"
      
           # round each numeric value in the vector (for asthetics, mostly)
             for (k in 10:(length(means[i,]))){             
            means[,k] <- round(means[,k],3)
            medians[,k] <- round(medians[,k],3)
            }

            yourCDF        [,index] <- scores$yourCDFoutput   # this one will be used for plotting
            if (EmpiricBounds){ 
                yourCDF_LboundE[,index] <- scores$yourCDF_LboundE # this one will be used for plotting (empirical lower bound)
                yourCDF_UboundE[,index] <- scores$yourCDF_UboundE # this one will be used for plotting
            }
            if (AnalyticBounds|AnalyticProbSleeve){    
                yourCDF_LboundA[,index] <- scores$yourCDF_LboundA # this one will be used for plotting (analytic Lower bound)
                yourCDF_UboundA[,index] <- scores$yourCDF_UboundA # this one will be used for plotting
            }

            realCDF [,d]           <- scores$realCDFoutput
            yourPDF [,index]       <- scores$yourPDFoutput   # this one will be used for plotting
            realPDF [,d]           <- scores$realPDFoutput
            databins               <- scores$databins

            # store for out. Add the seed as well, for reproducibility.       
            masterlist <- list()
            masterlist$allscores <-scores$allscores
            masterlist$seed      <-seedlist
            masterlist$means     <- means
            masterlist$medians   <- medians

            foroutput          <- scores$allscores
            dataset            <- rep(Dnameslist[d], reps)
            epsilon            <- rep(epslist   [i], reps)
            functName          <- rep(Fnameslist[f], reps)
            foroutput$dataset      <- dataset
            foroutput$epsilon      <- epsilon
            foroutput$functionname <- functName
            foroutput <- data.frame(foroutput)
            if(d+i+f ==3){
              container  <-foroutput
              }

            else{                                 
            container <- rbind(container, foroutput, make.row.names = FALSE)
            }
         } # ends the function forloop
        } # ends the epsilon forloop
       } # ends the dataset forloop

########################## BEGIN PRINTING ##############################
 
  ##### Excel Files w/ means and medians of diagnostic results #####

            t <- Sys.time()
            timestamp <-as.POSIXlt(t)
            timeindex <-strftime(t, "%Y%m%d%H%M%S")
            maxlength <- max(length(datalist), length(epslist), length(functlist), length(range), length(setseed))

             forchart= data.frame("index"          = c(timeindex, rep(" ",times= maxlength-1)),
                                  "functions"      = c(Fnameslist,rep(" ",times= maxlength-(length(Fnameslist)))),
                                  "datasets"       = c(Dnameslist,rep(" ",times= maxlength-(length(Dnameslist)))),
                                  "epsilons"       = c(epslist,   rep(" ",times= maxlength-(length(epslist)))),
                                  "range"          = c(range,     rep(" ",times= maxlength-2)),
                                  "granularity"    = c(gran,      rep(" ",times= maxlength-1)),
                                  "cdfstep"        = c(cdfstep,   rep(" ",times= maxlength-1)),
                                  "samplesize"     = c(samplesize,rep(" ",times= maxlength-1)),
                                  "repetitions"    = c(reps,      rep(" ",times= maxlength-1)),
                                  "SmoothAll"         = c(SmoothAll,            rep(" ",times= maxlength-1)),
                                  "EmpiricBounds"     = c(EmpiricBounds,        rep(" ",times= maxlength-1)),
                                  "AnalyticBounds"    = c(AnalyticBounds,       rep(" ",times= maxlength-1)),
                                  "AnalyticProbSleeve"= c(AnalyticProbSleeve,   rep(" ",times= maxlength-1)), 
                                  "SuppressRealCDF"   = c(SuppressRealCDF,      rep(" ",times= maxlength-1)), 
                                  "ExtraTests_CDF"    = c(names(ExtraTests_CDF),rep(" ",times= maxlength-length(names(ExtraTests_CDF)))), 
                                  "ExtraTests_PDF"    = c(names(ExtraTests_PDF),rep(" ",times= maxlength-length(names(ExtraTests_PDF)))), 
                                  "setseed"           = c(setseed,              rep(" ",times= maxlength-length(setseed))), 
                                  "comments"          = c(comments,             rep(" ",times= maxlength-1))
                                 )          

            write.table(masterlist[[3]],
                  paste( OutputDirectory,"/CDFtesting",timeindex,".csv", sep=""), row.names=FALSE, col.names=TRUE,  sep=",")

            suppressWarnings(
            write.table(masterlist[[4]],
                  paste( OutputDirectory,"/CDFtesting",timeindex,".csv", sep=""), row.names=FALSE, col.names=TRUE,  sep=",", append=TRUE)
            )

            suppressWarnings(
            write.table(forchart,   
                 paste(OutputDirectory,"/CDFtestindexchart.csv", sep=""),    row.names=FALSE, col.names=TRUE,  sep=",", append=TRUE)
            )
########################################################################################
##############################################################
##### PDF file with graphs each showing:
######## REAL CDF,
######## 1 DP CDF,
######## Optional Analytical Bounds and Empirical Bounds ####

## make the PDF file
g <- paste(OutputDirectory,"/CDFtesting",timeindex,".pdf", sep="")
pdf(file =g, onefile = TRUE, paper = "a4r", width = 0, height = 0)   

for(d in 1:length(datalist)){
 par(mfrow =c(length(functlist),length(epslist))) 

  if (length(functlist) == 1 ){
   par(mar= c(2,2,4,1))
  }else{
     par(mar= c(0,2,2,1))
  }

  for (f in 1:length(functlist)){ 

    for (i in 1:length(epslist)){
        
         index <-  ((d-1)*length(epslist)*length(functlist) + (i-1)*length(functlist)+f)
            
            #plot the original CDF for each dataset
          if(SuppressRealCDF){
            fakeline <- rep(1.1, times = (length(databins)))
            if (f==1 ) {
                plot(databins,
                fakeline,
                col  = "white",
                lwd  = 6, 
                type = "l", 
                ylim = c(0, 1.05),
                ylab = "cumulative proportion", 
                xlab = " "
                )  
              title(main = " ")  
                }
            else{
              if (f== length(functlist)){
                plot(databins,
                fakeline,
                col  = "white",
                lwd  = 6, 
                type = "l", 
                ylim = c(0, 1.05),
                ylab = "cumulative proportion",
                xlab = " " 
                )
                 title(main =".",
                sub  = "Functions are shown by line thickness, and epsilon is shown by color.",
                )  
              }
              else{
               plot(databins,
                fakeline,
                col  = "white",
                lwd  = 6, 
                type = "l", 
                ylim = c(0, 1.05),
                ylab = "cumulative proportion",
                xlab= " "
                )
              }
              }   
          }else {

            if (f==1 ) {
                plot(databins,
                realCDF[,d],
                col  = "black",
                lwd  = 6, 
                type = "l", 
                ylim = c(0, 1.05),
                ylab = "cumulative proportion", 
                xlab = " "
                )  
              title(main = " ")  
                }
            else{
              if (f== length(functlist)){
                plot(databins,
                realCDF[,d],
                col  = "black",
                lwd  = 6, 
                type = "l", 
                ylim = c(0, 1.05),
                ylab = "cumulative proportion",
                xlab = "binned data" 
                )
                 title(main =".",
                sub  = "_",
                )  
              }
              else{
               plot(databins,
                realCDF[,d],
                col  = "black",
                lwd  = 6, 
                type = "l", 
                ylim = c(0, 1.05),
                ylab = "cumulative proportion",
                xlab= " "
                )
              }
              }   
            }
# lines(databins, yourCDF        [,index],  col=colors()[26 + 10*((i-1)%%4)],  type="l", lwd=2)

      lines(databins, rep(.25,length(databins)), col="gray", lwd =(1), type="l")
      lines(databins, rep(.50,length(databins)), col="gray", lwd =(1), type="l")
      lines(databins, rep(.75,length(databins)), col="gray", lwd =(1), type="l")


 if (EmpiricBounds){
      lines(databins, yourCDF_LboundE[,index],  col=colors()[26 + 10*((i-1)%%4)],  type="l", lwd=1)
      lines(databins, yourCDF_UboundE[,index],  col=colors()[26 + 10*((i-1)%%4)],  type="l", lwd=1)
 }

## plot the curves
 if (AnalyticBounds){
      lines(databins, yourCDF_LboundA[,index],  col="black", type="l", lwd=1)
      lines(databins, yourCDF_UboundA[,index],  col="black", type="l", lwd=1)
 }

if (AnalyticProbSleeve){
  if(Z){
    SleeveVector <- seq(1,100, 1)
    for(v in 1:100){
       probsleeveLine <- c()

          for(w in 1: length(yourCDF[,index])){
            probsleeveLine[w] <- abs((yourCDF[,index][w])-(yourCDF_LboundA[,index][w]))  /max(SleeveVector) * SleeveVector[v]
          }
      lines(databins, yourCDF        [,index]+ probsleeveLine, col= adjustcolor(colors()[26 + 10*((i-1)%%4)], alpha.f=0),  type="n", lwd=0)
      lines(databins, yourCDF        [,index]- probsleeveLine, col= adjustcolor(colors()[26 + 10*((i-1)%%4)], alpha.f=0),  type="n", lwd=0)
      
      polygon(c(databins, rev(databins)), c((yourCDF [,index]+ probsleeveLine), rev(yourCDF [,index]- probsleeveLine)),
      col = adjustcolor(colors()[26 + 10*((i-1)%%4)], alpha.f=0.01), border = NA)
      # polygon(c(databins, rev(databins)), c((yourCDF [,index]+ probsleeveLine), rev(yourCDF [,index]- probsleeveLine)),
      # col = adjustcolor(colors()[555], alpha=0.01), border = NA)

    }
  
  }else{
  SleeveVector <- rexp(100,40)
  sort(SleeveVector)
  for(v in 1:40){
      lines(databins, yourCDF        [,index]+SleeveVector[v], col= adjustcolor(colors()[26 + 10*((i-1)%%4)], alpha.f=0.017),  type="n", lwd=15)
      lines(databins, yourCDF        [,index]-SleeveVector[v], col= adjustcolor(colors()[26 + 10*((i-1)%%4)], alpha.f=0.017),  type="n", lwd=15)
        }
      }      
}

if(SuppressDPCDF == FALSE){
  lines(databins, yourCDF        [,index],  col=colors()[26 + 10*((i-1)%%4)],  type="l", lwd=2)
#lines(databins, yourCDF        [,index],  col=colors()[555],  type="l", lwd=2)
#lines(databins, yourCDF        [,index],  col="black",  type="l", lwd=6)
}

if(SuppressLegends == FALSE){
     # add the legend

          #shrink ranges if necessary
          printRange <- c()
          printRange[1] <- Abbrev(range[1])
          printRange[2] <- Abbrev(range[2])
          printgran     <- Abbrev(gran)
          printcdfstep  <- Abbrev(cdfstep)

         # Determine plot boundaries, in units of the data
              xmin <- par("usr")[1]
              xmax <- par("usr")[2]
              ymin <- par("usr")[3]
              ymax <- par("usr")[4]
               
         # Now determine the size of the legend you would like to plot.  
              lgd <- legend(x = mean(c(xmin,xmax)), y =  mean(c(ymin,ymax)),
              c(paste("d:",Dnameslist[d]), 
                             paste("f: ",Fnameslist[f]),
                             paste("e:",epslist[i]),
                             paste("r: ",range[1],",",range[2]),
                             paste("gran: ", printgran),
                             paste("binSize: ",printcdfstep)),  
              pch = rep(1, times=6),
              col = rep(colors()[26 + 10*((i-1)%%4)], times=6),
              bg ="white",
              plot = F)

              # Add legend in the lower right corner:
              legend(x = xmax - lgd$rect$w, y =  ymin + lgd$rect$h,
              c(paste("d:",Dnameslist[d]), 
                paste("f: ",Fnameslist[f]),
                paste("e:",epslist[i]),
                paste("r: ",printRange[1],",",printRange[2]),
                paste("gran: ", printgran),
                paste("binSize: ",printcdfstep)), 
              pch = rep(1, times=6),
              col = rep(colors()[26 + 10*((i-1)%%4)], times=6),
              bg ="white",
              plot = T)

       text(c(xmin*1.1,ymax*.95), "CDF", font=4)

         }
        }
       }
      }

#####################################################################
#####  NOW DO THE SAME FOR PROB. DENSE. FUNCTIONS ###
for(d in 1:length(datalist)){
 par(mfrow =c(length(functlist),length(epslist))) 
 par(mar= c(0,2,2,2))

  for (f in 1:length(functlist)){ 

    for (i in 1:length(epslist)){
        
       index <-  ((d-1)*length(epslist)*length(functlist) + (i-1)*length(functlist)+f)
           mx <- max(realPDF)*2


            #plot the original curve for each dataset
            if (f==1 ) {
              
                plot(databins,
                realPDF[,d],
                col  = 1,
                lwd  = 6, 
                type = "l", 
                ylim = c(0,.35),
                ylab = "proportion", 
                xlab = " "
                )  
              title(main = " ")  
                }
            else{
              if (f== length(functlist)){
                plot(databins,
                realPDF[,d],
                col  = 1,
                lwd  = 6, 
                type = "l", 
                ylim = c(0,.35),
                ylab = "proportion",
                xlab = "binned data" 
                )
                 title(main =" ",
                sub  = "Functions are shown by line thickness, and epsilon is shown by color.",
                )  
              }
              else{
               plot(databins,
                realPDF[,d],
                col  = 1,
                lwd  = 6, 
                type = "l", 
                ylim = c(0,.35),
                ylab = "proportion",
                xlab= " "
                )
              }
              }   
if (SuppressDPCDF == FALSE){

tool <- ceiling(length(databins)/50)
LowResDPPDF <- c()
z <-1

  for (w in 1:50){
    if ((z-1+tool)<length(databins)){
  LowResDPPDF[z:(z-1+tool)] <- mean(yourPDF[w:(w+1),index])
  z<- z+tool
  }
}
if(length(LowResDPPDF)<length(databins)){
  LowResDPPDF[(length(LowResDPPDF)+1):length(databins)] <- LowResDPPDF[length(LowResDPPDF)]
}

             lines(databins, LowResDPPDF,  col=colors()[26 + 10*((i-1)%%4)],  type="l", lwd=2)
     }
      # add the legend


if (SuppressLegends==FALSE){
              # Determine plot boundaries, in units of the data
              xmin <- par("usr")[1]
              xmax <- par("usr")[2]
              ymin <- par("usr")[3]
              ymax <- par("usr")[4]
               
              #Now determine the size of the legend you would like to plot.  Right now the exact
              #location is not important, we just want to know the dimension!  Note that we are
              # treating the lengend as a variable and we are NOT plotting the legend on the figure!
              lgd <- legend(x = mean(c(xmin,xmax)), y =  mean(c(ymin,ymax)),
              c(paste("d:",Dnameslist[d]), 
                             paste("f: ",Fnameslist[f]),
                             paste("e:",epslist[i]),
                             paste("r: ",printRange[1],",",printRange[2]),
                             paste("gran: ",printgran),
                             paste("binSize: ",printcdfstep)), 
              pch = rep(1, times=6),
              col = rep(colors()[26 + 10*((i-1)%%4)], times=6),
              bg ="white",
              plot = F)

              # Add legend in the lower right corner:
              legend(x = xmax - lgd$rect$w, y =  ymax,
              c(paste("d:",Dnameslist[d]), 
                paste("f: ",Fnameslist[f]),
                paste("e:",epslist[i]),
                paste("r: ",printRange[1],",",printRange[2]),
                             paste("gran: ",printgran),
                             paste("binSize: ",printcdfstep)),  
              pch = rep(1, times=6),
              col = rep(colors()[26 + 10*((i-1)%%4)], times=6),
              bg ="white",
              plot = T)

       text(c(xmin*1.1,ymax*.95), "PDF", font=4)

            }
           }
          }
         }

################################################
# lastly, create some boxplots representing errors
################################################
par(mfrow= c(1,1))
permetric <- list()
for (p in 1:(length(container)-3)){
    forplotting <- data.frame("1"= reps, stringsAsFactors=FALSE)

    nextstep <- data.frame("epsilon" = container$epsilon, "dataset" = container$dataset, "functionName" = container$functionname, "metric"=container[[p]])
    nextstep$dataset      <- factor(nextstep$dataset)
    nextstep$functionname <- factor(nextstep$functionName)
     
    forplots<- list()
    for(i in 1:length(epslist)){
        forplotting <- data.frame("1"= c(seq(from=1, to=reps,by=1)), stringsAsFactors=FALSE)
        for (d in 1:length(Dnameslist)){
          for (f in 1:length(Fnameslist)){

                A   <- subset(nextstep, epsilon == epslist[i])
                AA  <- subset(A,        dataset == Dnameslist[d])
                AAA <- subset(AA,  functionname == Fnameslist[f])
                forplotting <- cbind(forplotting,AAA$metric)
        }}
        forplotting[,1] <-NULL
        forplots[[i]]<-forplotting
    }

    permetric[[p]] <- forplots
}
masterlist$permetric <- permetric


#######################################################33

for (p in 1:length(permetric)){

    forplots <- permetric[[p]]
    for (i in 1:length(epslist)){
        forplotting <- forplots[[i]]

    #set up the colors of the boxplot columns
        cols <- numeric(0)
        for (f in 1:length(functlist)){
             cols[length(cols)+1] <- f+1
          }
        cols <- rep(cols, times=length(datalist))

    #set up the locations of the boxplot columns (function sequence grouped by dataset)
    # if 3 functions and 3 datasets, creates |F1_D1| |F2_D1| |F3_D1| _____ |F1_D2| |F2_D2| |F3_D2|
        ats <- numeric(0)
        for (d in 1:length(datalist)){
             for (f in 1:length(functlist)){
                  ats[length(ats)+1] <- (d-1)*(length(functlist)+2)+f+1
          }}

        namer <- rep("", times= length(datalist)*length(functlist))
        mx <- 1.25*max(forplotting)

################################################
        labellist<-list()
        labellist[1]  <- paste("Maximum Vertical Error ('Linf norm')\n epsilon=",         epslist[i],",",reps,"repetitions")
        labellist[2]  <- paste("Location of Maximum Vertical Error\n epsilon=",           epslist[i],",",reps,"repetitions")
        labellist[3]  <- paste("Vertical Error at the True 25th Percentile\n epsilon=",   epslist[i],",",reps,"repetitions")
        labellist[4]  <- paste("Vertical Error at the True Median\n epsilon=",            epslist[i],",",reps,"repetitions")
        labellist[5]  <- paste("Vertical Error at the True 75th Percentile\n epsilon=",   epslist[i],",",reps,"repetitions")
        labellist[6]  <- paste("Horizontal Error at the True 25th Percentile\n epsilon=", epslist[i],",",reps,"repetitions")
        labellist[7]  <- paste("Horizontal Error at the True Median\n epsilon=",          epslist[i],",",reps,"repetitions")
        labellist[8]  <- paste("Horizontal Error at the True 75th Percentile\n epsilon=", epslist[i],",",reps,"repetitions")
        labellist[9]  <- paste("Medians Returned\n epsilon=",                             epslist[i],",",reps,"repetitions")
        labellist[10] <- paste("Mean Absolute Error (emp, L1/DomainSize)\n epsilon=",     epslist[i],",",reps,"repetitions")
        labellist[11] <- paste("Mean Squared Error (emp (L2^2)/DomainSize)\n epsilon=",   epslist[i],",",reps,"repetitions")
        labellist[12] <- paste("Derivative Score (higher values are 'better')\n epsilon=", epslist[i],",",reps,"repetitions")


        if (length(ExtraTests_CDF) >0){
        assist <- names(ExtraTests_CDF)              
             TestNames <- names(scores$TestPack_CDF)
             for (L in (length(labellist)+1):(length(labellist)+length(assist))){
                  labellist[L] <- paste("Custom:",assist[L-length(labellist)], "\n epsilon=", epslist[i],",",reps,"repetitions" )
             }
        }
TestPack_PDF <-list(MaxError_PDF   = MaxError_PDF,
                    MaxErrorAt_PDF = MaxErrorAt_PDF,
                    MAE_PDF        = MAE,
                    MSE_PDF        = MSE,
                    MeanDiff = MeanDiffpdf,
                     ModeDiff = ModeDiffpdf,
                      StdDiff = StdDiffpdf,
                       VarDiff = VarDiffpdf,
                        SkewDiff = SkewDiffpdf,
                         KurtDiff= KurtDiffpdf)


        labellist[length(labellist)+1] <- paste("Maximum Vertical Error (PDF)\n epsilon=",            epslist[i],",",reps,"repetitions")
        labellist[length(labellist)+1] <- paste("Location of Maximum Vertical Error (PDF)\n epsilon=",epslist[i],",",reps,"repetitions")
        labellist[length(labellist)+1] <- paste("Mean Absolute Error (PDF)\n epsilon=",               epslist[i],",",reps,"repetitions")
        labellist[length(labellist)+1] <- paste("Mean Squared Error (PDF)\n epsilon=",                epslist[i],",",reps,"repetitions")
        labellist[length(labellist)+1] <- paste("Difference between Means given by CDFs\n epsilon=",                epslist[i],",",reps,"repetitions")
        labellist[length(labellist)+1] <- paste("Difference between Modes given by CDFs\n epsilon=",                epslist[i],",",reps,"repetitions")
        labellist[length(labellist)+1] <- paste("Difference between Standard Deviation given by CDFs\n (under development, see ?CDFtest) epsilon=",                epslist[i],",",reps,"repetitions")
        labellist[length(labellist)+1] <- paste("Difference between Variances given by CDFs\n epsilon=",                epslist[i],",",reps,"repetitions")
        labellist[length(labellist)+1] <- paste("Difference between Skewnesses given by CDFs \n (under development, see ?CDFtest) epsilon=",                epslist[i],",",reps,"repetitions")
        labellist[length(labellist)+1] <- paste("Difference between Kurtoses given by CDFs \n (under development, see ?CDFtest) epsilon=",                epslist[i],",",reps,"repetitions")
 
        if (length(ExtraTests_CDF) >0){
        assist <- names(ExtraTests_PDF)
             TestNames <- names(scores$TestPack_PDF)
             for (L in (length(labellist)+1):(length(labellist)+length(assist))){
                   labellist[L] <- paste(assist[L-length(labellist)], "\n epsilon=", epslist[i],",",reps,"repetitions" )
             }
        }

        ylablist <-list()
        ylablist[1]   <- paste("Vertical Distance between CDFs at maximum distance point")
        ylablist[2]   <- paste("Databin (X-axis) Values")
        ylablist[3]   <- paste("Distance (Error) Values")
        ylablist[4]   <- paste("Distance (Error) Values")
        ylablist[5]   <- paste("Distance (Error) Values")
        ylablist[6]   <- paste("Distance between 25th percentile values")
        ylablist[7]   <- paste("Distance between median values")
        ylablist[8]   <- paste("Distance between 75th percentile values")
        ylablist[9]   <- paste("Median values") 
        ylablist[10]  <- paste("Average distance between real and estimated CDF")
        ylablist[11]  <- paste("SqRt of average squared distance between real and estimated CDF")
        ylablist[12]  <- paste("Normalized difference in derivatives across gran resolutions")


       if (length(ExtraTests_CDF) >0){
        assist <- names(ExtraTests_CDF)
   
             TestNames <- names(scores$TestPack_CDF)
             for (L in (length(ylablist)+1):(length(ylablist)+length(assist))){
                  ylablist[L] <- paste("Custom: user-defined units (CDF)" )
             }
        }




        ylablist[length(ylablist)+1]  <- paste("Vertical Distance between PDFs at maximum distance point")
        ylablist[length(ylablist)+1]  <- paste("Databin (X-axis) Values")
        ylablist[length(ylablist)+1]  <- paste("Average distance between real and estimated CDF")
        ylablist[length(ylablist)+1]  <- paste("Average squared distance between real and estimated CDF")
        ylablist[length(ylablist)+1]  <- paste("Error Introduced by DP noise")
        ylablist[length(ylablist)+1]  <- paste("Error Introduced by DP noise")
        ylablist[length(ylablist)+1]  <- paste("Error Introduced by DP noise")
        ylablist[length(ylablist)+1]  <- paste("Error Introduced by DP noise")
        ylablist[length(ylablist)+1]  <- paste("Error Introduced by DP noise")
        ylablist[length(ylablist)+1]  <- paste("Error Introduced by DP noise")

        if (length(labellist) > length(ylablist)){
            for (Y in length(ylablist):length(labellist)){
              ylablist[Y] <- paste("Custom: user-defined units (PDF)")
            }
        }
        boxplot(forplotting, 
                las = 2,
                col = cols, 
                at = ats,
                names = namer,
                par(mar = c(12, 5, 4, 2) + 0.1),
                ylim = c(0,mx),        
                main = labellist[p],
                ylab = " "
                )    
        mtext(ylablist[p],side=2, line =4)

        for (d in 1:length(datalist)){
        for (f in 1:length(functlist)){
          labelindex <- (d-1)*(length(functlist)+2)+f+1
        mtext(Fnameslist[f], side=1, line=1, at=labelindex, las=2, font=2, col=(f+1))
        }}

        datatitlesAt <- numeric(0)
        for (d in 1:length(datalist)) {
          if(length(functlist)>2){
        datatitlesAt[d] <- (d-1)*(length(functlist)+2)+ mean(1,length(functlist)) +2.5
        }
          if(length(functlist)<=2){
        datatitlesAt[d] <- (d-1)*(length(functlist)+2)+ mean(1,length(functlist)) +1
        }
        text(datatitlesAt[d],.9*mx, Dnameslist[d], font=2)
         if(p==10){
            text(datatitlesAt[d],median(datalist[[d]]),"----------", font=2, col=1)
                  }
                  }                   
    }
}

dev.off()
return(masterlist)
}}


##begin UI:
## To use, highlight from here downwards and press ctrl+/
## some arbitrary example values have been input.
## The 3 functions here are available from the Harvard DP group in privateZelig on Github
#   results <- CDFtest(
# 
#   Visualization = TRUE,
# #           #Sets the testing suite into Visualization mode (default) or Data Collection mode (visualization = FALSE)
# #           #   In Visualization mode (default):
# #           #       An excel sheet conatining the mean and median results (across reps iterations) of diagnostic functions on DP-CDF algorithms
# #           #         per each combination of dataset, function, and epsilon.
# #           #       A .pdf file containing one graphical example DP CDF for each combination of dataset, function, and epsilon,
# #           #        as well as a set of boxplots showing the distribution of all diagnostic results for all combinations of parameters.
# #           #   In Data Collection mode (set Visualization = FALSE):
# #           #       An excel sheet conatining the entire (raw) results (across reps iterations) of diagnostic functions on DP-CDF algorithms
# #           #        per each combination of dataset, and function, sepearately looped over all epsilons, then all granularities, and all samplesizes
# 
#            OutputDirectory = "C:\\Users\\Dan\\Dropbox\\Harv\\CDFtesting\\",
# #           # the location of the folder which will hold the output (csv and pdf files)
# 
#            functlist = c(functionH),
# #           # A list of CDF-computing functions to be tested on the CDFtestTrack
# 
#            Fnameslist = c("H"),
# #           # A list of function names corresponding to the functions being tested; used for labelling the output.
# epslist = c(.2),
# #           epslist   = c(0.01,1,0.1,0.001,0.0001),
# #           # A vector of epsilon values for differential privacy
# 
#            datalist  = list(),
# #           # A list containing vectors of data, each to be used in a test
# 
#            Dnameslist     = c(),
# #           # A list of dataset  names corresponding to the functions being tested; used for labelling the output.
# synthsets = list(list("wage", 10000, "uniform")),
# #           synthsets      = list(list("wage", 100000, "uniform"), list("wage",100000,"sparse"), list("wage",100000,"bimodal"), list("wage", 100000, "gaussian")),
# #           # This script generates pre-defined synthetic datasets upon request, and fulling incorporates them into testing.
# #           # To call them, users should input a character vector containing the names of the sets they desire.
# #           # For example, synthsets = list( list(type,size,shape) , list(type,size,shape) ). There are no limits on the amounts of datasets included.
# #           # sets available include:
# #               # type: ("age" which ranges from about 0 to 100, gran =1) ("wage" which ranges from 0 to 450k, gran = 1000)
# #               # size: Any positive integer. Type in exact numerical representation (eg, for ten thousand use 10000 not 10k and not 10^4)
# #               # shape: gaussian, sparse, uniform, bimodal
# 
#            range     = c(1,500000),
# #           # The range of the universe as a vector (min, max). Defined based on user intuition.
# #           # Setting min too high will bias output upward. Same in reverse for a low max.
# #           # However, setting min too low and max too high could reveal the true limits of your data, compromising some privacy.
# 
#            gran    =1000,
# #           # FOR Visualization MODE ONLY. refer to "granlist" for setting granularities (thus domain sizes) in Data Collection mode.
# #           #     This command is irrelevant in Data Collection mode.
# #           # The granularity of the domain between the min and max. ie, if age is measure per 1 year of age, gran =1
# #           # The same granularity is applied to all datasets, so using comparable (or scaled) data is necessary
#     #  granlist = c(1000,100000,50000,25000,12500,10000,5000,2500,1250,500,250,125,100,50,25,10),  
#           granlist =c(5000),
#         #    granlist =c(100000,50000,25000,12500,10000,5000,2500,1250,1000,500,250,125,100,50,25),
# #           # FOR Data Collection MODE ONLY. refer to "gran" for selecting granularities in Data Collection mode.
# #           #     This command is irrelevant in Visualization mode.
# #           # A list of granularities of the universe between the min and max. ie, if age is measure per 1 year of age, gran =1
# 
#            samplesize = 0,
# #           # FOR Visualization MODE ONLY. refer to "nlist" for selecting samplesizes in Data Collection mode.
# #           #     This command is irrelevant in Data Collection mode.
# #           #     when set to zero, the entire dataset is used.
# #           # Otherwise, the specified sample size is randomly selected from each dataset without replacement.
# #           # the samples are used.
# nlist = c(10000),
# #           nlist = c(100000,100,1000,10000,1000000),
# #           # FOR Data Collection MODE ONLY. refer to "samplesize" for selecting samplesizes in visualization mode.
# #           #     This command is irrelevant in Visualization mode.
# #           #sets the absolute sample sizes to draw from each dataset, with replacement. Any vector of integer values is appropriate.
# 
#            cdfstep   =0,
# #           # The step size used in outputting the approximate CDF.
# #           # The values output are [min, min + cdfstep], [min, min + 2 * cdfstep], etc.
# #           # For best results (maximum CDF resolution), set cdfstep = 0, which gives cdfstep = gran
# 
#            reps    = 50,
# #           # The number of times to repeat each diagnostic. higher reps lends greater accuracy,
# #           # but comsumes time and power. Author recommends reps = 10 for quick examples and 300 for robust examinations
# 
#            ExtraTests_CDF = list(),
# #           # If a user wishes to add extra diagnostics, the proper syntax would be:
# #           # ExtraTests_CDF = list( functionName1 = function1, functionName2 = function2)
# #           # Diagnostic Functions should have similar inputs as diagnostics defined above, and output just one value
# 
#            ExtraTests_PDF = list(),
# #           # see above
# 
#            setseed    = c(-100),
# #           # In the function, each combination of data, epsilon, and function is executed with a separate seed, which
# #           # by default is randomly generated and reported. Users interested in replicating specific results
# #           # can locate the reported seed and parameter combination to replicate tests.
# 
#            comments       = ("x"),
# #           # In the function, each combination of data, epsilon, and function is executed with a separate seed, which
# #           # by default is randomly generated and reported. Users interested in replicating specific results
# #           # can locate the reported seed and parameter combination to replicate tests.
# 
#            SmoothAll      = FALSE,
# #           # Applies a monotonizing post-processing function to all dpCDFs
# #           # Monotonization does not affect prob dense functions
# 
#            EmpiricBounds  = FALSE,
# #           # When TRUE, outputted graphs depict the minimum and maximum values taken by each bin across reps
# 
#            AnalyticBounds = FALSE,
# #           # AnalyticBounds: This is a flag and should be set to "true" if the functions being tested are expected to output
# #           # analytical variance bounds. The proper output form is output = list(DPCDFvector, LowerBoundVector, UpperBoundVector)
# #           # This will not switch on an Analytical bounds flag within the inputted function if the function's flag is set/defaulted to "FALSE".
# #           # However, if this flag here is set FALSE, it will suppress bounds from the inputted function.
# #           # Best practice is to have an analytical bounds flag defaulted to "TRUE" within the inputted function, such that it can be turned on and off
# #           # with this flag.
# 
#            AnalyticProbSleeve = TRUE,
# #           # When TRUE, outputted DP-CDFs will have a fuzzy analytic sleeve aroudn them, approximating probabalitity density for each point.
# 
# #           SuppressRealCDF = FALSE,
# #           # When TRUE, outputted graphs will not include Real (non-private) CDFs
# 
#            SuppressDPCDF = FALSE,
# #           # When TRUE, outputted graphs will not include DP-CDFs (but if SmoothAll = TRUE, monotonized DP CDFs still appear)
# 
#            SuppressLegends = FALSE,
# #           # When TRUE, outputted graphs will not include legends
#           )

Try the CDF.PSIdekick package in your browser

Any scripts or data that you put into this service are public.

CDF.PSIdekick documentation built on May 30, 2017, 5:09 a.m.