R/SDPSyn.R

Defines functions SDPSYN2 fitthemodel fitmodel.rf sample.ctree samplefrompool getnodesfromrules fitmodel.ctree preparepredictorsforctreefit sample.fn fitmodel.fn good.syn.parameters good.sample.parameters good.fit.parameters onlygoodargs

Documented in fitmodel.ctree fitmodel.fn fitmodel.rf fitthemodel getnodesfromrules good.fit.parameters good.sample.parameters good.syn.parameters onlygoodargs preparepredictorsforctreefit sample.ctree sample.fn samplefrompool SDPSYN2

#' Generic function: remove all the elements of a named list which names are not arguments of a specific function.
#' @details 
#' remove all the elements of a list which names are not arguments of a specific function.
#' @param fun a function
#' @param L a list
#' @return a list.  
#' @examples
#' onlygoodargs(lm,list(data=cars,formula=speed~dist,tutu="not an arg from lm"))
onlygoodargs<-function(fun,L){L[intersect(names(L),names(formals(fun)))]}

#' Select only arguments for a specific fitting function
#' @details 
#' Currently only works with method="ctree"
#' Only selects the arguments that match the function partykit::ctree_control
#' @param method a string. currently only method="ctree".
#' @param synparameters a named list.  
#' @return a sublist of synparameters, which names are possible arguments of partykit::ctree_control if method="ctree".
#' @examples
#' good.fit.parameters(method="ctree",list(teststat=30,tutu=3))
#' good.fit.parameters(method="ctree.new",list(rslope="test"))
good.fit.parameters<-function(method,synparameters){
  if(method=="ctree"){sp<-synparameters[intersect(names(synparameters),
                                              names(formals(partykit::ctree_control)))]
  }
  if(method=="ctree.new"){sp<-synparameters[intersect(names(synparameters),
                                                  names(formals(fitmodel.ctree.new)))]
  }
  
  return(sp)}

#' Select only arguments for sampling.
#' @details 
#' In prevision of future developments. returns NULL for the moment
#' @param method a string. currently only method="ctree".
#' @param synparameters a named list.  
#' @return NULL
#' @examples
#' good.sample.parameters(method="ctree",list(teststat=30,tutu=3))
good.sample.parameters<-function(method,synparameters){NULL}
#' Select only arguments for a specific fitting function
#' This function is not used anymore
#' @details 
#' Currently only works with method="ctree"
#' Only selects the arguments that match the function synthpop::syn.ctree
#' @param method a string. currently only method="ctree".
#' @param synparameters a named list.  
#' @return a sublist of synparameters, which names are possible arguments of synthpop::syn.ctree if method="ctree".
#' @examples
#' good.syn.parameters(method="ctree",list(y=c(1:2),smoothing=TRUE,tutu="not in synthpop::syn.ctree arguments"))
good.syn.parameters<-function(method,synparameters){
  if(method=="ctree"){
    synparameters[intersect(names(synparameters),
                            names(formals(synthpop::syn.ctree)))]}}

#' Fit a model with a specific function
#' @param method a string. currently only method="ctree" or "rf" (random forest).
#' @param x a predictors, a dataframe.
#' @param y variable to predict, a vector
#' @param   treeplotsavepath a 
#' @param ... synthetic parameters to pass to the right fit model function. the fit model function name is the concatenation of "fit.model" and method
#' @return a sublist of synparameters, which names are possible arguments of synthpop::syn.ctree if method="ctree".
#' @examples
#' fitmodel.fn(method="ctree",x=iris[,-5],y=iris$Species,nbuckets=30,tutu="not a good argument")
#' fitmodel.fn(method="ctree.new",x=iris[,-5],y=iris$Species,nbuckets=30,tutu="not a good argument")
#' ##other example with MM
#' 
#' Sparameters_i<-list(
#'   splits=list(
#'     split1=list(
#'       condition=expression((TRUE)),
#'       method="ctree.new",
#'       variable="bscore",
#'       synthparameters=list(
#'         random = "schoolid", 
#'         lgmodel = "slope",
#'         rslope = "+ female + sclass"))))
#' good.fit.parameters("ctree.new",list(random = "schoolid", 
#'                                      lgmodel = "slope",
#'                                      rslope = "+ female + sclass"))
#' fitmodel.fn("ctree.new",x=school[1:9],y=school[10],random = "schoolid", 
#'             lgmodel = "slope",
#'             rslope = "+ female + sclass")


#' 
fitmodel.fn<-function(method,x,y,treeplotsavepath=NULL,fit.parameters=NULL,...){
  do.call((get(paste0("fitmodel.",method))),
          c(list(x=x,y=y,treeplotsavepath=treeplotsavepath),
            good.fit.parameters(method,c(fit.parameters,list(...)))))}

#' Sample a model with a specific function
#' @param method a string. currently only method="ctree".
#' @param x a predictors, a dataframe.
#' @param y variable to predict, a vector
#' @param ... synthetic parameters to pass to the right fit model function. the fit model function name is the concatenation of "fit.model" and method
#' @return a sublist of synparameters, which names are possible arguments of synthpop::syn.ctree if method="ctree".
#' @examples
#' sample.fn(method="ctree",
#'           xp=iris[,-5],
#'           fit.model=fitmodel.fn(method="ctree",x=iris[,-5],y=iris$Species,nbuckets=30),
#'           smoothing=FALSE)
sample.fn<-function(method,xp,fit.model,smoothing,...){
  do.call(get(paste0("sample.",method)),
          c(list(xp=xp,fit.model=fit.model,smoothing=smoothing),
            good.sample.parameters(method,list(...))))}

#' Prepare predictors for ctree fit
#' @param method a string. currently only method="ctree".
#' @param x a predictors, a dataframe.
#' @param y variable to predict, a vector
#' @param ... synthetic parameters to pass to the right fit model function. the fit model function name is the concatenation of "fit.model" and method
#' @return a sublist of synparameters, which names are possible arguments of synthpop::syn.ctree if method="ctree".
#' @examples
#' preparepredictorsforctreefit(x,
#'                              keep=NULL)
preparepredictorsforctreefit<-function(x,keep=NULL){
  datesx<-sapply(x,function(xx){inherits(xx,"POSIXct")|inherits(xx,"POSIXt")})
  if(any(datesx)){x[datesx]<-lapply(x[datesx],as.numeric)}
  factorswithtoomanylevels<-sapply(x,is.factor)
  if(any(factorswithtoomanylevels)){
    factorswithtoomanylevels[sapply(x[factorswithtoomanylevels],is.ordered)]<-FALSE
    factorswithtoomanylevels[sapply(x[factorswithtoomanylevels],nlevels)<30]<-FALSE
    x<-x[!factorswithtoomanylevels|is.element(names(x),keep)]}
  x[sapply(x,function(y){length(unique(y))})>1|is.element(names(x),keep)]}


#' Function to fit a ctree model.
#' @param x a dataframe of predictors 
#' @param y a vector :dependent variable
#' @param treeplotsavepath: a path to save the graph as a pdf. if NULL, no pdf is saved
#' @return a named list of 4 elements:
#' "Rules" a data.frame with two variables: terminalnode (a integer vector) and condition  a string that gives the rule for each terminal node.
#'  "y" the values of the predictor
#'  "terminalnodes" a vector: the terminal nodes for each element of $y$.
#'  "shortlist" a character string giving the names of the variables in x that were used for the classification   
#' @examples
#' fitmodel.ctree(x=iris[,-5],y=iris$Species)

# The partykit package is used for fitting ctree
fitmodel.ctree<-function(x,y,treeplotsavepath=NULL,...){
  #1 convert all posix to numeric
  y2<-y
  if(inherits(y2,"POSIXct")|inherits(y2,"POSIXt")){y2<-as.numeric(y2)}
  x2<-preparepredictorsforctreefit(x)
  datact <- partykit::ctree(y2 ~ ., 
                            data=cbind(y2=y2, if(ncol(x2)==0){x}else{x2}), 
                            control = do.call(partykit::ctree_control,
                                              onlygoodargs(partykit::ctree_control,list(...))))
  depthx<-try(treedepth(datact))
  if(!is.null(treeplotsavepath)){try(BigSyn::treetopdf(datact,treeplotsavepath))}
  splitnodeconditions <- daniRules(datact)
  splitnodeconditionsmerge<-paste0(splitnodeconditions,collapse="")
  Rules=  if(length(splitnodeconditions)>1){
    data.frame(terminalnode=strtoi(names(splitnodeconditions)),
               condition=splitnodeconditions,stringsAsFactors = FALSE)}else{
                 data.frame(terminalnode=1,condition="TRUE")}
  terminalnodes<-getnodesfromrules(x,Rules)
  shortlist<-if(ncol(x2)==0){character(0)}else{names(x2)[sapply(names(x2),grepl,x=splitnodeconditionsmerge)]}  
  list(Rules=Rules,
       y=y,
       terminalnodes=terminalnodes,
       shortlist=shortlist,
       depth=depthx,
       width=nrow(Rules),
       treeplotsavepath=treeplotsavepath)}

#' Function to get terminal node from a set of partitioning rules and new predictors
#' 
#' @param x a dataframe of predictors
#' @param Rules a data frame containing 2 character variables: "terminalnode" and "condition"
#' @return a vector of lenth the number of rows of x indicating the terminal nodes. 
#' @examples 
#' getnodesfromrules(x=iris[1:3,-5],Rules=fitmodel.ctree(x=iris[,-5],y=iris$Species)$Rules)

getnodesfromrules<-function(x,Rules){
  terminalnode=rep(NA,nrow(x))
  for(j in 1:nrow(Rules)){
    whh<-eval(parse(text=paste0("with(x,(rep(TRUE,nrow(x)))&(",Rules$condition[j],"))")))
    if(any(whh)){
    terminalnode[whh]<-Rules$terminalnode[j]}}
  terminalnode}

#' Function to sample from a set of partitioning rules
#' 
#' @param y a vector of values to pull from
#' @param terminalnodes a vector of terminal nodes
#' @param newterminalnodes: a path to save the graph
#' @return  a vector of the same size than terminalnodes, obtained by sampling betweenn the values of y such for the same terminal node.  
#' @examples
#' y=iris$Species;x=iris[,-5];fit.mod<-fitmodel.ctree(x,y);terminalnodes<-getnodesfromrules(x,fit.mod$Rules);
#' newterminalnodes<-sample(unique(terminalnodes),10,replace=TRUE);
#' samplefrompool(y,terminalnodes,newterminalnodes)
#' y<-y[terminalnodes!=7]
#' terminalnodes<-terminalnodes[terminalnodes!=7]
#' samplefrompool(y,terminalnodes,newterminalnodes)
samplefrompool<-function(y,terminalnodes,newterminalnodes){
  newy=sample(y,size=length(newterminalnodes),replace=(length(y)<length(newterminalnodes)))
  newy[TRUE]<-NA
  replaced=rep(FALSE,length(newterminalnodes))
  for (i in intersect(unique(newterminalnodes),unique(terminalnodes))){
    whh<-(newterminalnodes == i&!is.na(newterminalnodes == i))
    if(any(whh&!is.na(whh))){
      newy[whh] <- sample(y[terminalnodes ==i], sum(whh), replace = TRUE)
      replaced[whh]<-TRUE}}
  for(i in setdiff(unique(newterminalnodes),unique(terminalnodes))){
    whh<-(newterminalnodes == i&!is.na(newterminalnodes == i))
    distance<-abs(i-unique(terminalnodes))
    siblingnode<-unique(terminalnodes)[which(distance==min(distance))]
    if(any(sapply(unique(terminalnodes),function(jj){is.element(jj,siblingnode)}))){
    if(any(whh&!is.na(whh))){
      newy[whh] <- sample(y[sapply(terminalnodes,function(jj){is.element(jj,siblingnode)})],
                                                   sum(whh), replace = TRUE)
      replaced[whh]<-TRUE}}}
  if(any(!replaced)){newy[!replaced]<-sample(y, sum(!replaced), replace = TRUE)}
  newy}

#' Function to sample from a ctree fitted model
#' 
#' @param y a vector of values to pull from
#' @param terminalnodes a vector of terminal nodes
#' @param newterminalnodes: a path to save the graph
#' @return  a vector of the same size than terminalnodes, obtained by sampling betweenn the values of y such for the same terminal node.  
#' @examples
#' y<-iris$Species;x<-xp<-iris[,-5];fit.model<-fitmodel.ctree(x,y);sample.ctree(x,fit.model)

sample.ctree<-function(xp,fit.model,smoothing="none",...){
  keep<-names(xp)[sapply(names(xp),function(x){
    any(grepl(pattern = x,x =fit.model$Rules$condition))})]
  xp <- preparepredictorsforctreefit(xp,keep=keep)
  # for (i in which(sapply(x, class) != sapply(xp, class))){ xp[,i] <- eval(parse(text = paste0("as.", class(x[, i])[1], "(xp[,i])", sep = "")))}
  newterminalnodes<-getnodesfromrules(xp,fit.model$Rules)
  ysyn <-   samplefrompool(fit.model$y,fit.model$terminalnodes,newterminalnodes)
  if (!is.factor(fit.model$y) & smoothing == "density"){ysyn <- synthpop:::syn.smooth(ysyn, fit.model$y)}
  ysyn}

#' Function to fit a ctree model.
#' @param x a dataframe of predictors 
#' @param y a vector :dependent variable
#' @param treeplotsavepath: a path to save the graph as a pdf. if NULL, no pdf is saved
#' @return a named list of 4 elements:
#' "Rules" a data.frame with two variables: terminalnode (a integer vector) and condition  a string that gives the rule for each terminal node.
#'  "y" the values of the predictor
#'  "terminalnodes" a vector: the terminal nodes for each element of $y$.
#'  "shortlist" a character string giving the names of the variables in x that were used for the classification   
#' @examples
#' fitmodel.ctree(x=iris[,-5],y=iris$Species)
fitmodel.rf<-function(x,y,treeplotsavepath=NULL,...){
  if (is.factor(y)) {
    obslevels <- levels(y)
    y <- droplevels(y)}
  do.call( randomForest::randomForest,onlygoodargs(randomForest::randomForest,c(list(x=y ~ ., data = cbind.data.frame(y, x)), list(...))))}

#' Function to fit the model.
#' 
#' @param Sparameters_i an element of the list output from Sparameters.default.f
#' @param fitmodelsavepath a folder path. Results will either be read from or stored in this folder. If the file exists, by default it is not replaced.
#' @param TtableANAto0 a table  containing the predictors without NAs as well as the outcome
#' @return a list.  
#' @examples
#' #Load the data
#' data(TtableA,package="BigSyn")
#' #define parameters
#' Sparameters<-Sparameters.default.f(ref.table=TtableA)
#' Sparameters_i<-Sparameters[[53]]; 
#' fitmodelsavepath<-NULL; 
#' TtableANAto0<-NAto0(TtableA);
#' redocomputationsevenifexists<-FALSE
#' treeplotsavefolder=tempdir()
#' #fit the model:
#' fitthemodel(Sparameters_i,fitmodelsavepath = NULL,TtableANAto0 = TtableANAto0,
#'             treeplotsavefolder=tempdir())
#' Sparameters_i<-Sparameters[["AA.present_La_La_Lrn1"]]; 
#' treeplotsavefolder=tempdir()
#' fitthemodel(Sparameters_i,NULL,TtableANAto0,treeplotsavefolder=tempdir())
#' 
#' 
#' ###############Example with MM
#' data(school,package="BigSyn")
#' asis=names(school)[1:9] 
#' variables<-Sparameters.variables.reorder.default(names(school))
#' notpredictor<-NULL
#' Sparameters=Sparameters.default.f(
#'   ref.table=school,asis=asis,
#'   notpredictor=NULL,
#'   variables=variables,
#'   predictors.matrix = predictor.matrix.default(variables)[!is.element(variables, asis), !is.element(variables, notpredictor),drop=FALSE],
#'   defaultsynparameters=    eval(formals(Sparameters.default.f)$defaultsynparameters))

#' Sparameters_i<-list(
#'     variable="bscore",
#'      splits=list(
#'          split1=list(
#'         condition=expression((TRUE)),
#'         method="ctree.new",
#'         predictors=names(school),
#'         synthparameters=list(
#'           random = "schoolid", 
#'           lgmodel = "slope",
#'           rslope = "+ female + sclass"))))
#'           redocomputationsevenifexists=FALSE
#'           treeplotsavefolder=NULL
#' TtableANAto0<-school           
#' fitthemodel(Sparameters_i,NULL,school[1:9])

fitthemodel<-function(Sparameters_i,fitmodelsavepath,TtableANAto0,redocomputationsevenifexists=FALSE,treeplotsavefolder=NULL){
  print("________________________________________________________________________",quote = F)
  print(paste0("Now fitting model for ",Sparameters_i$variable),quote = F)
  redocomputations=TRUE
  if(!is.null(fitmodelsavepath)&!redocomputationsevenifexists){
    wheretosavefit<-file.path(fitmodelsavepath,paste0(Sparameters_i$variable,".rda"))
    if(file.exists(wheretosavefit)){
      redocomputations=FALSE;
      print("-- Already done",quote = F)
      return(NULL)}
  }
  if(redocomputations){
    print("......................................................................",quote = F)
    print(paste0("-- Fit model for ", Sparameters_i$rank," - ",Sparameters_i$variable),quote = F)
    if(is.null(names(Sparameters_i$splits))){
      names(Sparameters_i$splits)<-seq_along(Sparameters_i$splits)}else{
        aaa<-names(Sparameters_i$splits)==""
        if(any(aaa)){
        names(Sparameters_i$splits)[aaa]<-
          setdiff(paste0("",seq_along(Sparameters_i$splits)),names(Sparameters_i$splits))[1:sum(aaa)]}}
    splitids<-names(Sparameters_i$splits)
    Sparameters_i$splits<-plyr::llply(seq_along(Sparameters_i$splits),
      function(iii){
        Split=Sparameters_i$splits[[iii]]
        Split$id=splitids[iii]
        start_time <- Sys.time()
        selT<-(rep(TRUE,nrow(TtableANAto0)))&with(TtableANAto0,eval(Split$condition))
        print(paste0("---- ",capture.output(Split$condition)),quote = F)
        print(paste0("------- Number of observations: ",sum(selT)),quote = F)
        Fit=NULL
        Split$method2=Split$method
        print(paste0("------- Method: ",Split$method2),quote = F)
        if(is.element(Split$method,c("ctree","rf","ctree.new"))){
          if(sum(selT)>2){
            if(!is.null(treeplotsavefolder)){Split$treeplotsavepath<-file.path(treeplotsavefolder,paste0(Sparameters_i$variable,'_',Split$id,'.pdf'))}
            x=TtableANAto0[selT,Split$predictors,drop=FALSE]
            y0=TtableANAto0[selT,Sparameters_i$variable]
            #if(is.factor(y0)){
            #  originallevels<-levels(y0)
            #  y0<-droplevels(y0)}
            Split$fit.model<-try(
              fitmodel.fn(Split$method,x,y0,treeplotsavepath=Split$treeplotsavepath,fit.parameters=Split$synthparameters));
            if (is.element("try-error",class(Split$fit.model))){
              print("------ Error -  method changed to sample",quote = F)
              Split$method2="sample"}}else{
                print("------ warning -  cell<3 - method changed to sample",quote = F)
                Split$method2="sample"
              }
        }else{Split$fit.model=NULL}
        end_time <- Sys.time()
        Split$modelbuildingtime=end_time-start_time
        Split
      })}
  if(!is.null(fitmodelsavepath)&redocomputations){
    save(Sparameters_i,file=wheretosavefit);return(NULL)}else{return(Sparameters_i)}
}



# The wrap function to generate SDP.
#' General SDP function.
#' 
#' @param TtableA a dataframe  to synthesize
#' @param asis list of variable names from TtableA to keep as is (e.g. not to synthesize)
#' @param notpredictor list of variable names which should not be used as predictors
#' @param nrep number of synthetic replicates wanted
#' @param synparameters general synthetisation paramters
#' @param Sparameters a list, Specific (variable by variable) synthetisation parameters, splits ...
#' @param STtableA a dataframe
#' @param fitmodelsavepath a path where to save the fitted models
#' @param treeplotsavefolder  a path where to save the tree plots
#' @param samplereportsavepath  a path where to save the sampling report
#' @param stepbystepsavepath a path where to backup the synthetised in case of a crash
#' @param doparallel a boolean indicating whether sampling should be done in parallel for each repliacte
#' @param recode : a vector of character strings or NULL, list of variables to be recoded
#' @param saveeach an integer, indicating every how many variables a backup is done 
#' @param randomfitorder a boolean : fitting for each variable can be done in the order of appearance of each variables or at random
#' @param fitonly a boolean, if TRUE, no sampling is done.
#' @details This function is doing both the fitting and the sampling.
#' 
#'@examples
#'
#'data(TtableA,package="BigSyn")
#'ATtableA=augmentT_f(TtableA,variablesmax="AA.present",
#'                    variablespct="AA.cont1")
#'asis=NULL;notpredictor=asis;nrep=1;synparameters=NULL;
#'Sparameters=
#'  Sparameters.default.f(ref.table=TtableA,
#'                        asis=asis,
#'                        notpredictor=notpredictor,
#'                        preferredmethod="ctree",
#'                        defaultsynparameters=
#'                           c(as.list(synparameters),
#'                             eval(formals(Sparameters.default.f)$defaultsynparameters)[
#'                               setdiff(names(formals(Sparameters.default.f)$defaultsynparameters),
#'                               c("",names(synparameters)))]));
#'SATtableA=plyr::rdply(nrep,ATtableA[asis]);
#'samplereportsavepath=NULL;
#'stepbystepsavepath=NULL;
#'doparallel=FALSE;
#'recode=NULL;
#'randomfitorder=TRUE;
#'fitonly=FALSE;
#'fitmodelsavepath=tempdir()
#'treeplotsavefolder=tempdir()
#'sapply(list.files(tempdir(),full.names = TRUE  ),file.remove)
#'
#'SATtableA<-SDPSYN2(ATtableA,asis=NULL,
#'                   fitmodelsavepath = fitmodelsavepath,
#'                   treeplotsavefolder=treeplotsavefolder)
#'todisplay<-grep("La_La_Lrn1",names(SATtableA[[1]]),value=T);
#'SATtableA[[1]][1:3,todisplay];TtableA[1:3,todisplay]
#'
#' ##############################################################
#' # Controling that AA.present_La=0=>AA.present_La_Lb=0 in synthetic data
#' library(BigSyn)
#' library(reshape2)
#' library(data.table)
#' data(TtableA,package="BigSyn")
#' variablepct="AA.cont1"
#' variablespct=variablepct
#' variablemax="AA.present"
#' variablesmax=variablemax
#' set.seed(1)
#' asis=c("id1a", "id1b")
#'                   fitmodelsavepath=NULL
#'                   treeplotsavefolder=NULL
#'                   samplereportsavepath=NULL
#'                   stepbystepsavepath=NULL
#'                   doparallel=TRUE
#'                   recode=NULL
#'                   saveeach=200
#'                   randomfitorder=TRUE
#'                   fitonly=FALSE
#'                   
#' variablemax="AA.present"
#' variablesmax=variablemax
#' variablepct="AA.cont1"
#' variablespct=variablepct
#' ATtableA<-augmentT_f(TtableA,
#'                     variablesmax=variablesmax,variablespct=variablespct)
#' TtableA<-ATtableA
#' STtableA<-ATtableA[asis]
#' Sparameters=Sparameters.default.f(
#'    ref.table=ATtableA,asis=c("id1a", "id1b"),
#'    notpredictor=NULL,
#'    preferredmethod="ctree",
#'    defaultsynparameters=
#'      eval(formals(Sparameters.default.f)$defaultsynparameters))
#' SATtableA<-BigSyn::SDPSYN2(ATtableA,asis=c("id1a", "id1b"))[[1]]
#' problems<-SATtableA$AA.present_Lb_La==1&SATtableA$AA.present_Lb==0
#' mean(problems)
#' Sparameters[["AA.present_Lb_La"]]
#' library(dplyr)
#' library(ggplot2)
#' xx<-function(x){
#' xxx<-x[sort(grep("present",names(x),value=TRUE))]
#' xxx[xxx==0]<-NA
#' StudyDataTools::ggplot_missing(xxx)}
#' xx(ATtableA)
#' xx(SATtableA)
#' 
#' 
#' ## Example with mixed regression model
#' 
#' 
#' 
#' library(BigSyn)
#' data(school,package="BigSyn")
#' asis=names(school)[1:9]
#' fitmodelsavepath=NULL
#' treeplotsavefolder=NULL
#' samplereportsavepath=NULL
#' stepbystepsavepath=NULL
#' doparallel=FALSE
#' recode=NULL
#' saveeach=NULL
#' randomfitorder=FALSE
#' fitonly=FALSE

#' variables<-Sparameters.variables.reorder.default(names(school))
#' notpredictor<-NULL
#' Sparameters=Sparameters.default.f(
#'     ref.table=school,asis=asis,
#'     notpredictor=NULL,
#'     variables=variables,
#'     predictors.matrix = predictor.matrix.default(variables)[!is.element(variables, asis), !is.element(variables, notpredictor),drop=FALSE],
#'     defaultsynparameters=    eval(formals(Sparameters.default.f)$defaultsynparameters))
  
#' Sparameters<-list(
#'  bscore=list(
#'     variable="bscore",
#'      splits=list(
#'          split1=list(
#'         condition=expression((TRUE)),
#'         method="ctree.new",
#'         predictors=names(school)[1:9],
#'         synthparameters=list(
#'           random = "schoolid", 
#'           lgmodel = "slope",
#'           rslope = "+ female + sclass")))))
#' L<-list(TtableA=school,
#' asis=names(school)[1:9],
#' notpredictor=asis,
#' nrep=1,
#' synparameters=NULL,
#' Sparameters=Sparameters,
#' STtableA=school[1:10,1:9],
#' fitmodelsavepath=getwd(),
#' treeplotsavefolder=NULL,
#' samplereportsavepath=NULL,
#' stepbystepsavepath=NULL,
#' doparallel=TRUE,
#' recode=NULL,
#' saveeach=200,
#' randomfitorder=TRUE,
#' fitonly=FALSE)
#' attach(L)
#' S_school<-do.call(BigSyn::SDPSYN2,L)[[1]]


SDPSYN2<-function(TtableA,
                  asis=NULL,
                  notpredictor=asis,
                  nrep=1,
                  synparameters=NULL,
                  Sparameters=Sparameters.default.f(
                    ref.table=TtableA,
                    asis=asis,
                    notpredictor=notpredictor,
                    preferredmethod="ctree",
                    defaultsynparameters=c(as.list(synparameters),
                                           eval(formals(Sparameters.default.f)$defaultsynparameters)[setdiff(names(formals(Sparameters.default.f)$defaultsynparameters),c("",names(synparameters)))])),
                  STtableA=if(is.null(asis)){data.frame(.n=rep(nrep,each=nrow(TtableA)))}else{plyr::ddply(data.frame(.n=nrep),~.n,function(d){TtableA[asis]})},
                  fitmodelsavepath=NULL,
                  treeplotsavefolder=NULL,
                  samplereportsavepath=NULL,
                  stepbystepsavepath=NULL,
                  doparallel=TRUE,
                  recode=NULL,
                  saveeach=200,
                  randomfitorder=TRUE,
                  fitonly=FALSE){
  print(paste0(Sys.time()," I am Initialising tables and converting NA's"),quote = F)
  STtableANAto0<-NAto0(STtableA)
  TtableANAto0<-NAto0(TtableA)
  logiqueT<-names(TtableA)[sapply(TtableA,function(x){inherits(x,"logical")})]
  logiqueS<-names(STtableA)[sapply(STtableA,function(x){inherits(x,"logical")})]
  STtableANAto0[logiqueS]<-lapply(STtableANAto0[logiqueS],function(x){x*1})
  TtableANAto0[logiqueT]<-lapply(TtableANAto0[logiqueT],function(x){x*1})
  print("###########################################################################",quote = F)
  print(paste0(Sys.time()," I am Fitting  the models"),quote = F)
  stilltofit<-setdiff(names(Sparameters),if(is.null(fitmodelsavepath)){character(0)}else{gsub(".rda","",list.files(fitmodelsavepath))})
  
  print(paste0(Sys.time()," -- ",length(stilltofit)," models remaining, but my friends help me"), quote = F)
  if(randomfitorder&length(stilltofit)>0){stilltofit<-stilltofit[order(runif(length(stilltofit)))]}
  Sparameterswithfit<-plyr::llply(stilltofit,
                                  function(variable){
                                    if(!is.null(fitmodelsavepath)){
                                    if(!is.element(variable,gsub(".rda","",list.files(fitmodelsavepath)))){#We retest to allow parallel computing
                                      fitthemodel(Sparameters_i=Sparameters[[variable]],
                                                  fitmodelsavepath=fitmodelsavepath,
                                                  TtableANAto0=TtableANAto0,
                                                  treeplotsavefolder = treeplotsavefolder)}
                                      NULL}else{
                                                    fitthemodel(Sparameters_i=Sparameters[[variable]],
                                                                fitmodelsavepath=fitmodelsavepath,
                                                                TtableANAto0=TtableANAto0,
                                                                treeplotsavefolder = treeplotsavefolder)
                                                  }},
                                    .progress = "text")
  names(Sparameterswithfit)<-stilltofit
  #  if(!is.null(fitmodelsavepath)){save(Sparameters0,file.path(fitmodelsavepath,"Sparameters0.rda"))}
  
  print("###########################################################################",quote = F)
  print(paste0(Sys.time()," I am sampling from  the models"),quote = F)
  if(!is.null(stepbystepsavepath)){
    if(file.exists(stepbystepsavepath)){load(stepbystepsavepath)
      print("Done loading",quote = F)}
  }
  tosynthesize=setdiff(names(Sparameters),names(STtableA))
  alreadydone<-length(intersect(names(Sparameters),names(STtableA)))
  print(paste0("previously synthesised: ", alreadydone),quote = F)
  if(fitonly){tosynthesize<-NULL}
  for(variable in tosynthesize){
    print("________________________________________________________________________",quote = F)
    print(paste0("Now sampling ",variable),quote = F)
    print(paste0("-- ",which(tosynthesize==variable)+alreadydone,"/",(length(tosynthesize)+alreadydone)),quote = F)
    if(!is.null(fitmodelsavepath)){
      wheretosavefit<-file.path(fitmodelsavepath,paste0(variable,".rda"))
      x<-try(load(wheretosavefit));
      if(is.element("try-error",class(x))){
        print(paste0("-- could not load ",wheretosavefit,": have to refit"),quote = F)
        try(file.remove(wheretosavefit))
        fitthemodel(Sparameters[[variable]],fitmodelsavepath,TtableANAto0)
        load(wheretosavefit)}}else{Sparameters_i<-Sparameterswithfit[[variable]]}
    print(paste0("-- nrow:",nrow(STtableA)),quote = F)
    STtableA[[variable]]<-NA
    SSel<-rep(FALSE,nrow(STtableA))
    ReportonSample<-list(variable=variable,splits=list())
    for(Split in Sparameters_i$splits){
      start_time <- Sys.time()
      print("........................................................................",quote = F)
      print(paste0("---- ",capture.output(print(Split$condition))),quote = F)
      problem=NULL
      selT<-(rep(TRUE,nrow(TtableANAto0)))&with(TtableANAto0,eval(Split$condition))
      selS<-(rep(TRUE,nrow(STtableANAto0)))&with(STtableANAto0,eval(Split$condition))
      donors<-sum(selT);receptors<-sum(selS)
      print(paste0("---- donors: ",donors),quote = F)
      print(paste0("---- receptors: ",receptors),quote = F)
      print(paste0("---- method: ",Split$method),quote = F)
      if(is.null(Split$method2)){Split$method2<-Split$method}
      if(is.element(Split$method2,c("ctree"))){
        #syn.fn<-if(Split$method2=="ctree"){synthpop::syn.ctree}else{synthpop::syn.rf}
        predictors<-intersect(intersect(Split$predictors,names(STtableANAto0)),
                              names(STtableANAto0))
        xp=STtableANAto0[selS,predictors,drop=FALSE]
        x=TtableANAto0[selT,predictors,drop=FALSE]
        #y0=TtableA[selT,variable]
        #if(ncol(x)==0){syn.fn=function(y,xp,...){list(Fit="sample",res=sample(y,size=nrow(xp),replace=TRUE))}}
        #if(logique&Split$method2!="ctree"){y0<-1*y0}
        #syn.cartbug<-is.factor(y0)
        #if(syn.cartbug){
        #  originallevels<-levels(y0)
        #  y0<-droplevels(y0)}
        xx<-try(y<-do.call(sample.ctree,c(list(xp=xp,fit.model=Split$fit.model),
                                          good.syn.parameters(Split$method,Split$synthparameters))))
        if (is.element("try-error",class(xx))){
          print("---- PROBLEM: All values set to missing",quote = F)
          y<-rep(NA,nrow(STtableA[selS,,drop=FALSE]));problem=xx}
        #if(logique&Split$method2!="ctree"){y<-(y==1)}
        #if(syn.cartbug){
        #  y<-factor(levels(y)[y],levels=originallevels)
        #}
      }
      if(is.element(Split$method2,c("ctree.new"))){
        #syn.fn<-if(Split$method2=="ctree"){synthpop::syn.ctree}else{synthpop::syn.rf}
        predictors<-intersect(intersect(Split$predictors,names(STtableANAto0)),names(STtableANAto0))
        xp=STtableANAto0[selS,predictors,drop=FALSE]
        x=TtableANAto0[selT,predictors,drop=FALSE]
        #y0=TtableA[selT,variable]
        #if(ncol(x)==0){syn.fn=function(y,xp,...){list(Fit="sample",res=sample(y,size=nrow(xp),replace=TRUE))}}
        #if(logique&Split$method2!="ctree"){y0<-1*y0}
        #syn.cartbug<-is.factor(y0)
        #if(syn.cartbug){
        #  originallevels<-levels(y0)
        #  y0<-droplevels(y0)}
        xx<-try(y<-do.call(sample.ctree.new,c(list(xp=xp,fit.model=Split$fit.model),
                                          good.syn.parameters(Split$method,Split$synthparameters))))
        if (is.element("try-error",class(xx))){
          print("---- PROBLEM: All values set to missing",quote = F)
          y<-rep(NA,nrow(STtableA[selS,,drop=FALSE]));problem=xx}
        #if(logique&Split$method2!="ctree"){y<-(y==1)}
        #if(syn.cartbug){
        #  y<-factor(levels(y)[y],levels=originallevels)
        #}
      }
      y=if(Split$method2=="sample"){
        if(sum(selT)>0){
          sample(TtableANAto0[selT,variable],size = nrow(STtableA[selS,,drop=FALSE]),replace=TRUE)}else{rep(TtableANAto0[,variable],nrow(STtableA[selS,,drop=FALSE]))}
      }else{if(is.element(Split$method2,c("rf","ctree","ctree.new"))){y
      }else{if(Split$method2=="calculus"){with(STtableANAto0[selS,,drop=FALSE],eval(Split$calculus))}}}
      end_time <- Sys.time()
      # list(sel=(1:nrow(STtableA))[selS],
      #Split=c(Split,list(problem=problem,
      #                  numbersynthesised=nrow(STtableA[selS,]),
      #                  numberused=nrow(TtableA[selT,]),
      #                  synthetizationtime=end_time-start_time)),
      #       y=y)})
      STtableA[[variable]][selS]<-y
      SSel[selS]<-TRUE
      
      ReportonSample$splits<-c(ReportonSample$splits,list(list(donors=donors,receptors=receptors,problem=problem,samplingtime=end_time-start_time)))}
    
    fakesel<-!SSel#setdiff(1:nrow(STtableA),do.call(c, lapply(Syn, function(l) {l$sel})))
    if(any(fakesel)){
      print("......................................................................",quote = F)
      print("---- Fakes",quote = F)
      donors<-0;receptors<-sum(fakesel)
      print("---- donors: 0",quote = F)
      print(paste0("---- receptors:",receptors),quote = F)
      print("---- method: sample",quote = F)
      start_time <- Sys.time()
      STtableA[[variable]][fakesel]<-sample(TtableANAto0[, variable], size = sum(fakesel), replace = TRUE)
      end_time <- Sys.time()
      ReportonSample$splits<-c(ReportonSample$splits,list(list(donors=donors,receptors=receptors,problem="fake",samplingtime=end_time-start_time)))
    }
    
    if(is.factor(TtableA[[variable]])){
      STtableA[[variable]]<-
        factor(levels(TtableA[[variable]])[STtableA[[variable]]],levels=levels(TtableA[[variable]]))}
    STtableANAto0<-cbind(STtableANAto0,NAto0(STtableA[variable]))
    if(!is.null(samplereportsavepath)){save(ReportonSample,file=file.path(samplereportsavepath,paste0(variable,".rda")))}
    if(!is.null(stepbystepsavepath)&which(tosynthesize==variable)%%saveeach==length(tosynthesize)%%saveeach){save(STtableA,STtableANAto0,file=stepbystepsavepath)}
  }
  if(!fitonly){
    print("###########################################################################",quote = F)
    print("#  Now I am reconverting logical                                          #",quote = F)
    STtableA[unique(c(logiqueS,logiqueT))]<-lapply(STtableA[unique(c(logiqueS,logiqueT))], function(x){x==1})
    print("############################################",quote = F)
    print("#  I am changin id                         #",quote = F)
    print("############################################",quote = F)
    if(!is.null(recode)){recode<-intersect(recode,names(STtableA))
    STtableA[recode]<-lapply(STtableA[recode],function(x){as.integer(as.factor(x))})
    }
    return(split(STtableA, STtableA$".n"))}
  }
  
DanielBonnery/BigSyn documentation built on June 28, 2020, 7:18 p.m.