R/carrot_functions_rsq.R


#'Turning a non-numeric variable into a numeric one
#'
#'Function which turns a single categorical (non-numeric) variable into a numeric one (or several) by introducing dummy '0'/'1' variables.
#'
#'@param vari array of values to be transformed
#'@param outcome TRUE/FALSE indicates whether the variable \code{vari} is an outcome (TRUE) or a predictor (FALSE)
#'@param ra indices of the input array \code{vari} which indicate which values will be transformed
#'@param mode \code{'linear'} (linear regression), \code{'binary'} (logistic regression) or \code{'multin'} (multinomial regression)
#'@usage make_numeric(vari,outcome,ra,mode)
#'@return Returned value is an M x N matrix where M is the length of the input array of indices \code{ra} and N is the number of unique values of \code{vari} minus one.
#'@details This function is essentially a standard way to turn categorical non-numeric variables into numeric ones in order to run a regression
#'@export make_numeric
#'@examples
#'#creating a non-numeric set
#'
#'a<-t(rmultinom(100,1,c(0.2,0.3,0.5)))%*%c(1,2,3)
#'
#'a[a==1]='red'
#'a[a==2]='green'
#'a[a==3]='blue'
#'
#'#running the function
#'
#'make_numeric(a,FALSE,sample(1:100,50),"linear")
#'
#'make_numeric(a,TRUE,sample(1:100,50))





make_numeric<-function(vari,outcome,ra,mode=NULL){


  if (outcome==TRUE){ #turning non-numeric outcome into a numeric one

    un=unique(vari); #finding unique elements of the array of outcomes

    lun=length(un);  #number of unique elements

    lunu<-array(NA,lun) #an empty array of the corresponding length

    #counting the number of occurrences of each outcome and writing it into the array initialised above

    for (i in 1:lun){

      lunu[i]=length(vari[vari==un[i]]);

    }

    #creating an order array corresponding to the array of counts

    o<-order(lunu);

    #creating a numeric array for output

    vari1<-as.numeric(as.factor(vari));

    # assigning values from 0 to lun-1 to the values of the array above in the descending order where 0 corresponds to the most frequent one and lun-1 to the least frequent one

    for (j in 1:lun){

      vari1[vari==un[o[j]]]=lun-j;


    }

    #numeric output of the function

    as.numeric(vari1);

  }
  else{ #turning non-numeric variable into a numeric one

    if (mode=='linear'){

      vari1<-model.matrix(~vari-1)

      vari1[ra,2:(dim(vari1)[2])];

    } else{
      un=unique(vari); #similar to the outcome case

      lun=length(un); #similar to the outcome case

      lunu<-array(NA,lun) #similar to the outcome case

      for (i in 1:lun){

        lunu[i]=length(vari[vari==un[i]]); #similar to the outcome case

      }

      o<-order(lunu); #similar to the outcome case

      vari1<-matrix(0,length(vari),lun-1) #creating a matrix of zeros to be turned into lun-1 dummy variables

      for (i in 1:(lun-1)){

        vari1[which(vari==un[o[i]]),i]=1; #assigning values 1 to different columns of the matrix depending on the value of the variable

      }

      #lun-1 dummy variables as an output

      vari1[ra,];
    }
  }


}
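
# Illustrative sketch of the outcome coding above (hypothetical data; kept as a
# comment so that nothing runs at package load):
#
#   a <- c("blue","blue","blue","green","green","red")
#   make_numeric(a, TRUE, 1:6)
#
# 'blue' is the most frequent outcome and is coded as 0, 'green' as 1 and
# 'red' as 2, so the call above returns c(0, 0, 0, 1, 1, 2)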

#'Transforming the set of predictors into a numeric set
#'
#'Function which turns a set of predictors containing non-numeric variables into a fully numeric set
#'
#'@param a An M x N matrix, containing all possible subsets (N overall) of the size M of predictors' indices; therefore each column of \code{a} defines a unique subset of the predictors
#'@param ai array of indices of the array \code{a}
#'@param k index of the array \code{ai}
#'@param vari set of all predictors
#'@param ra array of sample indices of \code{vari}
#'@param l size of the sample
#'@param mode \code{'binary'} (logistic regression), \code{'multin'} (multinomial regression)
#'@usage make_numeric_sets(a,ai,k,vari,ra,l,mode)
#'@return Returns a list containing two objects: \code{tr} and \code{test}\cr
#'\item{tr}{training set transformed into a numeric one}
#'\item{test}{test set transformed into a numeric one}
#'@details Function transforms the whole set of predictors into a numeric set by consecutively calling function \code{make_numeric} for each predictor
#'@seealso \code{\link{make_numeric}}
#'@export make_numeric_sets
#'@examples
#'#creating a categorical numeric variable
#'
#'a<-t(rmultinom(100,1,c(0.2,0.3,0.5)))%*%c(1,2,3)
#'
#'#creating an analogous non-numeric variable
#'
#'c<-array(NA,100)
#'c[a==1]='red'
#'c[a==2]='green'
#'c[a==3]='blue'
#'
#'#creating a data-set
#'
#'b<-data.frame(matrix(c(a,rbinom(100,1,0.3),runif(100,0,1)),ncol=3))
#'
#'#making the first column of the data-set non-numeric
#'
#'b[,1]=data.frame(c)
#'
#'#running the function
#'
#'make_numeric_sets(combn(3,2),1:3,1,b,sample(1:100,60),100,"binary")



make_numeric_sets<-function(a,ai,k,vari,ra,l,mode){

  #initialising arrays for the test and training sets

  testset1<-array(NA,0)
  trset1<-array(NA,0)

  #going through the indices of the corresponding set of predictors

  for (m in 1:length(a[,ai[k]])){


    #turning the non-numeric variable into numeric

    if (is.numeric(vari[,a[m,ai[k]]])==FALSE){ #turning a non-numeric variable into numeric

      #performing this operation for the training and test set

      anum1<-make_numeric(vari[,a[m,ai[k]]],FALSE,ra,mode)
      anum<-make_numeric(vari[,a[m,ai[k]]],FALSE,setdiff(1:l,ra),mode)

      #adding the transformed variable to the existing test and training sets on the right

      testset1<-cbind(testset1,anum)
      trset1<-cbind(trset1,anum1)


    }
    else{

      #if the variable is already numeric we simply append it on the right of the existing test and training sets

      testset1<-cbind(testset1,vari[setdiff(1:l,ra),a[m,ai[k]]])
      trset1<-cbind(trset1,vari[ra,a[m,ai[k]]])


    }

  }

  #output the transformed test and training sets

  list("test" = testset1, "tr" = trset1)


}

#'Weights of predictors
#'
#'Function which computes the weights of the predictors according to the rules of thumb and outputs them into the corresponding array
#'
#'@param vari_col number of predictors
#'@param vari set of predictors
#'@details A continuous or categorical numeric variable with more than 5 categories has weight 1, otherwise it has weight \code{n-1} where \code{n} is the number of categories
#'@return Returns an array of weights of the size \code{vari_col}
#'@usage compute_weights(vari_col, vari)
#'@export compute_weights
#'@references{
#'\insertRef{ref1}{CARRoT}
#'}
#'@references{
#'\insertRef{ref2012-18631-001}{CARRoT}
#'}
#'@importFrom Rdpack reprompt
#'@examples
#'#creating a data-set with four variables
#'
#'a<-matrix(NA,nrow=100,ncol=4)
#'
#'#binary variable
#'
#'a[,1]=rbinom(100,1,0.3)
#'
#'#continuous variable
#'
#'a[,2]=runif(100,0,1)
#'
#'#categorical numeric with fewer than 5 categories
#'
#'a[,3]=t(rmultinom(100,1,c(0.2,0.3,0.5)))%*%c(1,2,3)
#'
#'#categorical numeric with 5 categories
#'
#'a[,4]=t(rmultinom(100,1,c(0.2,0.3,0.3,0.1,0.1)))%*%c(1,2,3,4,5)
#'
#'#running the function
#'
#'compute_weights(4,a)


compute_weights<-function(vari_col,vari){

  #initialising an empty array of weights

  we<-matrix(nrow=0,ncol=1);

  #going through all the predictive variables

  for (i in 1:vari_col){

    #if the variable is numeric it has a weight 1

    if (is.numeric(vari[,i])==TRUE) {

      we<-array(c(we,1))

    } else{

      #otherwise it has a weight of the number of categories minus one

      we<-array(c(we,length(unique(vari[,i]))-1));
    }

  }

  #outputting the array of weights

  we

}

#'Maximum feasible weight of the predictors
#'
#'Function which computes maximal weight (multiplied by the corresponding EPV rule) of a regression according to the rule of thumb applied to the outcome variable. Weight of a regression equals the sum of weights of its predictors.
#'
#'@details For continuous outcomes it equals sample size divided by 10, for multinomial it equals the size of the smallest category divided by 10
#'@param outi set of outcomes
#'@param mode indicates the mode: 'linear' (linear regression), 'binary' (logistic regression), 'multin' (multinomial regression)
#'@usage compute_max_weight(outi,mode)
#'@return returns an integer value of maximum allowed weight multiplied by 10
#'@export compute_max_weight
#'@examples
#'#continuous outcomes
#'
#' compute_max_weight(runif(100,0,1),'linear')
#'
#' #binary outcomes
#'
#' compute_max_weight(rbinom(100,1,0.4),'binary')
#'@references{
#'\insertRef{ref1}{CARRoT}
#'}
#'@importFrom Rdpack reprompt




compute_max_weight<-function(outi,mode){

  if (mode=='linear') { #if the mode is linear maximal weight multiplied by the corresponding EPV rule is defined by the sample size

    numr=length(outi)

  } else{ #otherwise

    unio<-unique(outi); #find all unique elements of the outcome

    lo<-array(NA,length(unio)) #initialise an array of the corresponding length

    for (i in 1:length(unio)) lo[i]=length(which(outi==unio[i])); #count the occurrences of each type of outcome

    numr=min(lo); #choose the smallest one to define the maximal weight multiplied by the corresponding EPV rule
  }

  numr #output the obtained value

}
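
# Worked example of the EPV logic above: for binary outcomes with 30 events and
# 70 non-events the smallest category has 30 observations, so compute_max_weight
# returns 30 and, under the default EPV rule of 10, at most floor(30/10) = 3
# units of predictor weight can enter a single regression.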

#'Cumulative weights of the predictors' subsets
#'
#'Function which computes the sum of predictors' weights for each subset containing a fixed number of predictors
#'
#'@param a an \code{m} x N matrix, containing all possible subsets (N overall) of the size \code{m} of predictors' indices; therefore each column of \code{a} defines a unique subset of the predictors
#'@param m number of elements in each subset of indices
#'@param we array of weights of the predictors
#'@param st a subset of predictors to be always included into a predictive model
#'@usage sum_weights_sub(a,m,we,st)
#'@return Returns an array of weights for predictors defined by each column of the matrix \code{a}
#'@export sum_weights_sub
#'@examples
#'#all two-element subsets of the set 1:3
#'
#'a<-combn(3,2)
#'
#'sum_weights_sub(a,2,c(1,2,1))



sum_weights_sub<-function(a,m,we,st=NULL){

  s<-array(NA,ncol(a)); #the array corresponding to the number of the feasible subsets

  if ((m>1)|(is.null(st)==FALSE)){ #if the size of the subset is greater than 1 or if a fixed subset st is specified

    for (h in 1:ncol(a)){
      #print(h)

      s[h]=sum(we[a[((h-1)*m+1):(h*m)]]); #sum up the corresponding weights
    }

  } else s=we; #otherwise the target value is exactly the array of weights

  #output the value

  s

}
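
# Equivalent vectorised formulation (a sketch; it relies on the matrix a being
# traversed column by column, i.e. R's column-major order):
#
#   colSums(matrix(we[a], nrow = m))
#
# e.g. for a <- combn(3,2) and we <- c(1,2,1) both forms give c(3, 2, 3),
# the weights of the subsets {1,2}, {1,3} and {2,3}.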


#'Finds certain subsets of predictors
#'
#'Reorders the columns of matrix \code{a} according to the ordered elements of array \code{s}
#'@param a A \code{j} x N matrix, containing all possible subsets (N overall) of the size \code{j} of predictors' indices.
#'@param s array of numbers of the size N
#'@param j number of rows in \code{a}
#'@param c array of all indices of the predictors
#'@param st a subset of predictors to be always included into a predictive model
#'@usage find_sub(a,s,j,c,st)
#'@return Returns a submatrix of matrix \code{a} which consists of columns determined by the input array \code{s}
#'@export find_sub
#'@examples
#'#all two-element subsets of 1:3
#'
#'a<-combn(3,2)
#'s<-c(3,2,3)
#'
#'find_sub(a,s,2,1:3)



find_sub<-function(a,s,j,c,st){


  if (j==1){ #if all the subsets are of the size 1

    a=t(matrix(a[,order(s)])); #transforming array of subsets according to the order of the array s

  } else{

    if (dim(a)[2]==1){ #if there is only one subset

      a=matrix(a[,order(s)]);

    } else a=a[,order(s)]; #if there is more than one subset of size larger than 1

  }

  a #outputting the transformed subset

}

#'Maximum number of the regressions
#'
#'Function which computes the maximum number of regressions with fixed number of variables based on the rule of thumb
#'
#'@param vari_col number of predictors
#'@param k maximum weight of the predictors
#'@param c array of all indices of the predictors
#'@param we array of weights of the predictors. Continuous or categorical numerical variable with more then 5 categories has weight 1, otherwise it has weight \code{n-1} where \code{n} is the number of categories
#'@param minx minimum number of predictors, 1 by default
#'@param maxx maximum number of predictors, total number of variables by default
#'@param st a subset of predictors to be always included into a predictive model
#'@import utils
#'@return Integer corresponding to the maximum number of regressions of the same size
#'@usage compute_max_length(vari_col,k,c,we,minx,maxx,st)
#'@seealso Function uses \code{\link[utils]{combn}}
#'@export compute_max_length
#'@references{
#'\insertRef{ref1}{CARRoT}
#'}
#'@references{
#'\insertRef{ref2012-18631-001}{CARRoT}
#'}
#'@importFrom Rdpack reprompt
#'@examples
#'compute_max_length(4,40,1:4,c(1,1,2,1))


compute_max_length<-function(vari_col,k,c,we,minx=1,maxx=NULL,st=NULL){ #

  #initialising an array whose length equals the number of regression sizes to be considered, subject to the parameter k, the number of variables and the minimal and maximal numbers of variables
  lest<-length(st)

  c=setdiff(c,st);

#  minx=max(minx,lest)

  le<-array(NA,min(min(vari_col,k)-minx+1,maxx-max(minx,lest)+1))

  #going through regressions with the number of variables we are willing to consider

  for (m in max(max(minx,1),lest):min(min(vari_col,k),maxx)){
    #print(m)

    a<-combn(c,m-lest); #all subsets of variables of the size m

    if (is.null(st)==FALSE) a<-rbind(matrix(st,ncol=dim(a)[2],nrow=lest),a)

    #compute the weights of each column of a


    s<-sum_weights_sub(a,m,we,st); #computing the corresponding weights


    le[m-max(max(minx,1),lest)+1]=length(which(s<=k)); #number of regressions of the given size satisfying the weight constraint

  }

  max(le) #outputting the size of the largest one


}
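
# In the roxygen example above, compute_max_length(4,40,1:4,c(1,1,2,1)), every
# subset satisfies the weight constraint k = 40, so the number of regressions of
# size m is choose(4, m) and the returned maximum is choose(4, 2) = 6.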


#'Probabilities for multinomial regression
#'
#'Function which computes probabilities of outcomes on the test set by applying regression parameters inferred by a run on the training set. Works for logistic or multinomial regression
#'@param trset values of predictors on the training set
#'@param testset values of predictors on the test set
#'@param outc values of outcomes on the training set
#'@param mode \code{'binary'} (logistic regression) or \code{'multin'} (multinomial regression)
#'@param Rsq whether the R-squared statistic constraint is introduced
#'@param p weight of the model
#'@param n_tr size of the training set
#'@usage get_probabilities(trset,testset,outc,mode,Rsq,p,n_tr)
#'@return Probabilities of the outcomes. In \code{'binary'} mode returns an array of size equal to the number of observations in the test set. In \code{'multin'} mode returns an M x N matrix where M is the number of observations in the test set
#'and N is the number of unique outcomes minus 1.
#'@details In binary mode this function computes the probabilities of the event '0'. In multinomial mode it computes the probabilities of the events '0','1',...,'N-1'.
#'@seealso Function uses \code{\link[nnet]{multinom}}  and \code{\link[stats]{coef}}
#'@import stats
#'@import nnet
#'@export get_probabilities
#'@examples
#'trset<-matrix(c(rbinom(70,1,0.5),runif(70,0.1)),ncol=2)
#'
#'testset<-matrix(c(rbinom(10,1,0.5),runif(10,0.1)),ncol=2)
#'
#'get_probabilities(trset,testset,rbinom(70,1,0.6),'binary')


get_probabilities<-function(trset,testset,outc,mode,Rsq=F,p=NULL,n_tr){

  #dimensions of the test set

  d<-dim(data.matrix(testset));

  #for binary mode compute the coefficients of the regression

  regr<-multinom(-outc~.,data=data.frame(trset),trace=FALSE);

  regr_c<-data.matrix(coef(regr));



  if (mode=='binary'){

    if (Rsq==F){


    #applying coefficients to the test set and immediately transforming the result in order to get probabilities later

    ps=exp(matrix(rep(1,d[1]))%*%regr_c[1,]+data.matrix(testset)%*%regr_c[2:length(regr_c),]);
    } else{



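      # Sketch of the logic below (added comments): L0 approximates the
      # log-likelihood of an intercept-only model (using the fitted intercept),
      # Lm is the log-likelihood of the fitted model and LR = -2*(L0 - Lm) is
      # the likelihood-ratio statistic; predictions are produced only if the
      # R-squared-based criteria below hold, otherwise NA probabilities are
      # returned so that the model is effectively discarded.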
      L0<-sum(log(((1/(1+exp(-regr_c[1,])))^(sum(outc)))*((1-1/(1+exp(-regr_c[1,])))^(n_tr-sum(outc)))))
      Lm<-sum(log(1/(1+exp(-matrix(rep(1,length(outc)))%*%regr_c[1,]-data.matrix(trset)%*%regr_c[2:length(regr_c),]))))

      LR=-2*(L0-Lm)

      s=1-p/LR

      if ((s>=0.9)&(is.na(s)==F)){

        Rsq_v=1-exp(-LR/n_tr)
        Rsq_m=1-exp(2*L0/n_tr)

        if ((Rsq_v*(1-s)/Rsq_m<0.05)&(is.na(Rsq_v*(1-s)/Rsq_m)==F)){

      #applying coefficients to the test set and immediately transforming the result in order to get probabilities later

      ps=exp(matrix(rep(1,d[1]))%*%regr_c[1,]+data.matrix(testset)%*%regr_c[2:length(regr_c),]);

        } else ps=rep(NA,d[1])
      } else ps=rep(NA,d[1])


}

  } else {

    #for multinomial mode compute the coefficients of the regression

    regr_c<-apply(data.matrix(coef(multinom(-outc~.,data=data.frame(trset),trace=FALSE))),2,rev);

    #applying coefficients to the test set and immediately transforming the result in order to get probabilities later

    ps=rowSums(exp(matrix(rep(1,d[1]))%*%regr_c[,1]+data.matrix(testset)%*%t(regr_c[,2:dim(regr_c)[2]])));

  }

  #getting rid of infinite values of ps

  coe=8;

  ps[ps==Inf]=max(ps[ps<Inf])*coe;

  while((length(ps[ps==Inf])>0)&(sum(is.na(ps))!=d[1])){
    coe=coe/2;
    ps[ps==Inf]=max(ps[ps<Inf])*coe;
  }

  #computing the first term in the product in order to get probabilities



  #computing the second term in the product in order to get probabilities for binary and multinomial probabilities

  if (mode=='binary') {

 #   p0=exp(matrix(rep(1,d[1]))%*%regr_c[1,]+data.matrix(testset)%*%regr_c[2:length(regr_c),]);
    p=1/(1+1/ps);

  } else p0=exp(matrix(rep(1,d[1]))%*%regr_c[,1]+data.matrix(testset)%*%t(regr_c[,2:dim(regr_c)[2]]));



   if (mode!='binary') { #computing the probabilities for binary mode
  #
  #   p=t(p1)*p0;
  # } else { #computing the probabilities for multinomial mode and combining them into a matrix
     p1=t(1/(1+ps));

    p=t(p1)*p0[,1]

    for (i in 2:dim(p0)[2]) p=cbind(p,t(p1)*p0[,i])

  }


  # if (sum(is.na(p))>0) { #output an error message in case there are undefined values of p
  #
  #   stop('undefined prediction probabilities')
  #
  # }

  p #output the probabilities


}

#'Predictions for linear regression
#'
#'Function which runs a linear regression on a training set, computes predictions for the test set
#'
#'@param trset values of predictors on the training set
#'@param testset values of predictors on the test set
#'@param outc values of outcomes on the training set
#'@param k length of the test set
#'@param Rsq whether the R-squared statistic constraint is introduced
#'@param Rsq_v value of the R-squared statistic on the training split of the data
#'@param marg margin of error for R-squared statistics constraint
#'@param p weight of the model
#'@param n_tr size of the training set
#'@usage get_predictions_lin(trset,testset,outc,k,n_tr,p,Rsq,Rsq_v,marg)
#'@return An array of continuous variables of the length equal to the size of a \code{testset}
#'@seealso Function uses function \code{\link[stats]{lsfit}} and \code{\link[stats]{coef}}
#'@export get_predictions_lin
#'@examples
#'trset<-matrix(c(rnorm(90,2,4),runif(90,0,0.5),rbinom(90,1,0.5)),ncol=3)
#'
#'testset<-matrix(c(rnorm(10,2,4),runif(10,0,0.5),rbinom(10,1,0.5)),ncol=3)
#'
#'get_predictions_lin(trset,testset,runif(90,0,1),10)

get_predictions_lin<-function(trset,testset,outc,k,n_tr,p,Rsq=F,Rsq_v=NULL,marg=0){

  #write the coefficients of the linear regression fitted to the corresponding training set into an array

  regr<-lsfit(as.matrix(trset),outc)

  regr_c<-data.matrix(coef(regr));



  #initialise an array of predictions

  pred<-array(NA,c(1,k));


  if (sum(outc%%1)==0){# in case the outcomes are integers round the predictions

    if (Rsq==T){

      varre=var(regr$residuals)


      Rr=1-varre/var(outc)

      marg0=0

      if (marg>0) marg0=sqrt(varre/n_tr)*qt(0.975,n_tr-p-1)/abs(regr_c[1,])

      if ((Rr>Rsq_v)&(marg0<=marg)) pred<-round(data.matrix(testset)%*%matrix(regr_c[2:length(regr_c)])+regr_c[1]);

    } else pred<-round(data.matrix(testset)%*%matrix(regr_c[2:length(regr_c)])+regr_c[1]);

  } else{ #otherwise do not round them

    if (Rsq==T){

      varre=var(regr$residuals)

      Rr=1-varre/var(outc)

      marg0=0.0

      if (marg>0) {

        marg0=sqrt(varre/n_tr)*qt(0.975,n_tr-p-1)/regr_c[1,]

      }

      if ((Rr>Rsq_v)&(marg0<=marg)) pred<-data.matrix(testset)%*%matrix(regr_c[2:length(regr_c)])+regr_c[1];


    } else pred<-data.matrix(testset)%*%matrix(regr_c[2:length(regr_c)])+regr_c[1];



  }

  #in case all outcomes are positive assign value zero to all negative predictions

  if (length(outc[outc<0])==0) pred[pred<0]=0;

  #output the array of predictions

  pred


}

#'Predictions for multinomial regression
#'
#'Function which makes a prediction for multinomial/logistic regression based on the given cut-off value and probabilities.
#'@param p probabilities of the outcomes for the test set given either by an array (logistic regression) or by a matrix (multinomial regression)
#'@param k size of the test set
#'@param cutoff cut-off value of the probability
#'@param cmode \code{'det'} or \code{''}; \code{'det'} always predicts the more likely outcome as determined by the odds ratio; \code{''} predicts certain outcome with probability corresponding to its odds ratio (more conservative). Option available for multinomial/logistic regression
#'@param mode \code{'binary'} (logistic regression), \code{'multin'} (multinomial regression)
#'@usage get_predictions(p,k,cutoff,cmode,mode)
#'@seealso Uses \code{\link[stats]{rbinom}}, \code{\link[stats]{rmultinom}}
#'@return Outputs the array of the predictions of the size of \code{p}.
#'@export get_predictions
#'@examples
#'#binary mode
#'
#'get_predictions(runif(20,0.4,0.6),20,0.5,'det','binary')
#'
#'#creating a data-set for multinomial mode
#'
#'p1<-runif(20,0.4,0.6)
#'p2<-runif(20,0.1,0.2)
#'p3<-1-p1-p2
#'
#'#running the function
#'
#'get_predictions(matrix(c(p1,p2,p3),ncol=3),20,0.5,'det','multin')


get_predictions<-function(p,k,cutoff,cmode,mode){

  #initialise the array of predictions

  pred<-array(NA,c(1,k));

  #going through all objects in the testset

  for (m in 1:k){


    if (mode=='binary'){



      if (cmode=='det'){

        #making a deterministic mode prediction based on the cut-off

        pred[m]=ifelse(p[m]>cutoff,0,1)
      } else{

        #making a random mode prediction based on sampling from binomial distribution

        pred[m]=1-rbinom(1,1,p[m])
      }

    } else{

      if (cmode=='det'){ #for multinomial mode deterministic prediction based on maximal probability

        maxpr=which(c(p[m,],1-sum(p[m,]))==max(c(p[m,],1-sum(p[m,]))));

        pred[m]=maxpr-1;

      } else{ #random mode prediction based on sampling from multinomial distribution

        pred[m]=c(0:length(p[m,]))%*%rmultinom(1,1,c(p[m,],1-sum(p[m,]))) #categories are coded 0..(number of categories - 1)

      }
    }
  }

  #outputting the array of predictions

  pred

}


#'Combining in a list
#'
#'Function for combining outputs in a list
#'@param ... an argument of \code{mapply} used by this function
#'@seealso Function \code{\link[base]{mapply}}
#'@export comb
#'@examples
#'#array of numbers to be separated in a list
#'
#'a<-1:4
#'
#'#running the function
#'
#'comb(a)

comb <- function(...) {
  mapply('cbind', ..., SIMPLIFY=FALSE)
}
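
# Sketch of how comb acts as the .combine function in foreach further below:
# given two list-valued cross-validation results it cbinds them element-wise, e.g.
#
#   comb(list(regr = matrix(1:4, 2), nvar = 2),
#        list(regr = matrix(5:8, 2), nvar = 2))
#
# returns a list whose 'regr' entry is the 2 x 4 matrix cbind(matrix(1:4, 2),
# matrix(5:8, 2)) and whose 'nvar' entry is cbind(2, 2).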

#'Area Under the Curve
#'
#'Function which enables efficient computation of the area under the receiver operating characteristic curve (AUC). Source: \url{https://stat.ethz.ch/pipermail/r-help/2005-September/079872.html}
#'@param probs probabilities
#'@param class outcomes
#'@usage AUC(probs, class)
#'@return A value for AUC
#'@export AUC
#'@examples
#'AUC(runif(100,0,1),rbinom(100,1,0.3))

AUC <- function(probs, class) {
  x <- probs
  y <- class
  x1 = x[y==1]; n1 = length(x1);
  x2 = x[y==0]; n2 = length(x2);
  r = rank(c(x1,x2))
  auc = (sum(r[1:n1]) - n1*(n1+1)/2) / n1 / n2
  return(auc)
}
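
# Sanity-check sketch: this rank-based formula is the Mann-Whitney U statistic
# divided by n1*n2, i.e. the fraction of (event, non-event) pairs in which the
# event receives the higher probability; e.g. AUC(c(0.9, 0.8, 0.3), c(1, 0, 0))
# returns 1 because the single event outranks both non-events.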

#'Cross-validation run
#'
#'Function running a single cross-validation by partitioning the data into training and test set
#'
#'@param vari set of predictors
#'@param outi array of outcomes
#'@param c set of all indices of the predictors
#'@param part indicates partition of the original data-set into training and test set in a proportion \code{(part-1):1}
#'@param l number of observations
#'@param we weights of the predictors
#'@param vari_col overall number of predictors
#'@param preds array to write predictions for the test split into, initially empty
#'@param cmode \code{'det'} or \code{''}; \code{'det'} always predicts the more likely outcome as determined by the odds ratio; \code{''} predicts certain outcome with probability corresponding to its odds ratio (more conservative). Option available for multinomial/logistic regression
#'@param mode \code{'binary'} (logistic regression), \code{'multin'} (multinomial regression)
#'@param predm \code{'exact'} or \code{''}; for logistic and multinomial regression; \code{'exact'} computes how many times the exact outcome category was predicted, \code{''} computes how many times either the exact outcome category or its nearest neighbour was predicted
#'@param cutoff cut-off value for logistic regression
#'@param objfun \code{'roc'} for maximising the predictive power with respect to AUC, \code{'acc'} for maximising predictive power with respect to accuracy.
#'@param minx minimum number of predictors to be included in a regression, defaults to 1
#'@param maxx maximum number of predictors to be included in a regression, defaults to maximum feasible number according to one in ten rule
#'@param maxw maximum weight of predictors to be included in a regression, defaults to maximum weight according to one in ten rule
#'@param nr a subset of the data-set, such that \code{1/part} of it lies in the test set and \code{1-1/part} is in the training set, defaults to empty set
#'@param st a subset of predictors to be always included into a predictive model,defaults to empty set
#'@param rule an Events per Variable (EPV) rule, defaults to 10
#'@param corr maximum correlation between a pair of predictors in a model
#'@param Rsq whether the R-squared statistic constraint is introduced
#'@param marg margin of error for the R-squared statistic constraint
#'@param n_tr size of the training set
#'@param preds_tr array to write predictions for the training split into, initially empty
#@usage cross_val(vari,outi,c,rule,part,l,we,vari_col,preds,mode,cmode,predm,cutoff,objfun,minx,maxx,nr,maxw,st,corr)
#'@return
#'\item{regr}{An M x N matrix of sums of the absolute errors for each element of the test set for each feasible regression. M is maximum feasible number of variables included in a regression, N is the maximum feasible number of regressions of the fixed size; the row index indicates the number of variables included in a regression. Therefore each row corresponds to results obtained from running regressions with the same number of variables and columns correspond to different subsets of predictors used.}
#'\item{regrr}{An M x N matrix of sums of the relative errors for each element of the test set (only for \code{mode = 'linear'}) for each feasible regression. M is maximum feasible number of variables included in a regression, N is the maximum feasible number of regressions of the fixed size; the row index indicates the number of variables included in a regression. Therefore each row corresponds to results obtained from running regressions with the same number of variables and columns correspond to different subsets of predictors used.}
#'\item{nvar}{Maximum feasible number of variables in the regression}
#'\item{emp}{An accuracy of always predicting the more likely outcome as suggested by the training set (only for \code{mode = 'binary'} and \code{objfun = 'acc'})}
#'In \code{regr} and \code{regrr} \code{NA} values are possible since for some numbers of variables there are fewer feasible regressions than for the others.
#'@seealso Uses \code{\link{compute_max_weight}}, \code{\link{sum_weights_sub}}, \code{\link{make_numeric_sets}}, \code{\link{get_predictions_lin}}, \code{\link{get_predictions}}, \code{\link{get_probabilities}}, \code{\link{AUC}}, \code{\link[utils]{combn}}
#'@export cross_val
#'@examples
#'#creating variables
#'
#'vari<-matrix(c(1:100,seq(1,300,3)),ncol=2)
#'
#'#creating outcomes
#'
#'out<-rbinom(100,1,0.3)
#'
#'#creating array for predictions
#'
#'pr<-array(NA,c(2,2))
#'
#'pr_tr<-array(NA,c(2,2))
#'
#'#passing the set of indices of the predictors
#'
#'c<-c(1:2)
#'
#'#passing the weights of the predictors
#'
#'we<-c(1,1)
#'
#'#setting the mode
#'
#'m<-'binary'
#'
#'#running the function
#'
#'cross_val(vari,out,c,10,10,100,we,2,pr,m,'det','exact',0.5,'acc',nr=c(1,4),n_tr=90,preds_tr=pr_tr)

cross_val<-function(vari,outi,c,rule,part,l,we,vari_col,preds,mode,cmode,predm,cutoff,objfun,minx=1,maxx=NULL,nr=NULL,maxw=NULL,st=NULL,corr=1,Rsq=F,marg=0.0,n_tr,preds_tr){

  #for linear mode initialising the array of relative errors

  if (mode=='linear') {
    predsr<-preds
   predsr_tr<-preds_tr
  }

  if (is.null(nr)==TRUE){ #creating the partition into training and test set given that no subset is specified to be necessarily present both in the training and test set

    ra=sample(1:l,n_tr);

  } else { #creating the partition when parameter nr is specified

    lnr=length(nr);

    ranr=sample(nr,floor((1-(1/part))*lnr)) #partitioning nr itself

    #partitioning the remaining datapoints

    rar=sample(setdiff(1:l,nr),n_tr-floor((1-(1/part))*lnr));

    #combining the two

    ra=c(ranr,rar);

  }

  #defining the training set

  trset<-vari[ra,];

  #outcomes corresponding to the training set

  outc<-outi[ra];

  #testset

  testset<-vari[setdiff(c(1:l),ra),];

  #computing the maximum allowed weight given the outcomes in the training set and the mode of the prediction

  mw<-compute_max_weight(outc,mode)

  #maximal weight given the input parameter maxw (if specified)

  mw=min(mw,maxw*rule);

  #maximal number of variables allowed in a single regression

  nvar=floor((1/rule)*mw);

  #error message if this number is 0

  if (nvar==0) stop('not enough events to build a regression model');

  #maximal number of variables taking into account restriction by the parameter maxx (if any)

  if (is.null(maxx)==FALSE){

    maxj=min(maxx,min(nvar,vari_col))

  } else {

    maxj=min(nvar,vari_col)
  }

  #minimal number of variables taking into account restriction by the parameter minx (if any)

    minj=max(1,minx)

#subset of indices of the regression without the "fixed subset" defined by parameter st

  c=setdiff(c,st);

  #length of the "fixed subset" defined by parameter st

  lest=length(st)

  #going through all possible sizes of the regression model

  for (j in max(minj,lest):maxj){

    #creating a matrix of all possible regressions of a given size j, taking into account the "fixed subset" defined by st

          a<-combn(c,(j-lest));

          #in case c contains only 1 element and st is empty

      if ((length(c)<2)&(j>lest)) a=matrix(c)

      #in case st is non-empty to each column of a add a column of elements of st

    if (is.null(st)==FALSE) a<-rbind(matrix(st,ncol=dim(a)[2],nrow=lest),a)

    #compute the weights of each column of a

    s<-sum_weights_sub(a,j,we,st)

    #reorders columns of a in ascending order corresponding to the order of the array of weights s

    a<-find_sub(a,s,j,c(st,c),st)

    #sort the array of weights

    s=sort(s);

    #find those weights which satisfy the weight constraint (aka maximal number of variables in the regression)

    ai=which(s<=max(nvar));

    if (length(ai)>0){

      for (k in 1:length(ai)){ #going through all elements of ai


        #transform the corresponding subset of variables into numeric one

        set_num<-make_numeric_sets(a,ai,k,vari,ra,l,mode)

        #numeric test set

        testset1<-set_num$test

        #numeric training set

        trset1<-set_num$tr

        #initialise correlation parameter to 0

        corr1<-0

        if (corr<1){ #if correlation parameter corr is smaller than 1 compute absolute correlations between variables on the training set

          corr1<-abs(cor(trset1))
        }

        #if there is no restriction on highly correlated predictors or if there are no highly correlated predictors in the kth subset

        if ((length(corr1[corr1>corr])<=length(a[,ai[k]]))|(corr==1)){


        if (mode=='linear'){ #linear mode

          #get the prediction for the training set
        if (Rsq==F){

          pred<-get_predictions_lin(trset1,testset1,outc,l-n_tr,n_tr,s[ai[k]],Rsq,Rsq_v)

          pred_tr<-get_predictions_lin(trset1,trset1,outc,n_tr,n_tr,s[ai[k]],Rsq,Rsq_v)

          #difference between the actual value and the predicted one on the test set

          diff<-outi[setdiff(1:l,ra)]-pred;

          diff_tr<-outi[ra]-pred_tr

          #difference between predicted value and the actual one divided by the actual one, aka relative difference

          diffr<-(pred/outi[setdiff(1:l,ra)])-1;

          diffr_tr<-(pred_tr/outi[ra])-1;

          #diffr_tr[diff_tr==0]=0

          #diffr[diff==0]=0

          #sum of all absolute values of differences defined above

          preds[j-minx+1,k]=sum(abs(diff));

          preds_tr[j-minx+1,k]=sum(abs(diff_tr));

          #sum of all absolute values of relative differences defined above

          predsr[j-minx+1,k]=sum(abs(diffr));

          predsr_tr[j-minx+1,k]=sum(abs(diffr_tr));

         } else {

           #
           marg0=0

           if (marg>0.0) marg0=sqrt(max(qchisq(0.975,n_tr-1-s[ai[k]])/(n_tr-1-s[ai[k]]),(n_tr-1-s[ai[k]])/qchisq(0.975,n_tr-1-s[ai[k]])))-1

           if (marg0<=marg){


             Rsq_v=max(1-exp((2-s[ai[k]])/(0.1*n_tr)),(s[ai[k]]-0.05*(n_tr-1-s[ai[k]]))/s[ai[k]])


             pred<-get_predictions_lin(trset1,testset1,outc,l-n_tr,n_tr,s[ai[k]],Rsq,Rsq_v,marg)

             pred_tr<-get_predictions_lin(trset1,trset1,outc,n_tr,n_tr,s[ai[k]],Rsq,Rsq_v,marg)


             #difference between the actual value and the predicted one on the test set

             diff<-outi[setdiff(1:l,ra)]-pred;

             diff_tr<-outi[ra]-pred_tr;

             #difference between predicted value and the actual one divided by the actual one, aka relative difference

             diffr<-(pred/outi[setdiff(1:l,ra)])-1;

             diffr_tr<-(pred_tr/outi[ra])-1;

             #sum of all absolute values of differences defined above

             preds[j-minx+1,k]=sum(abs(diff));

             preds_tr[j-minx+1,k]=sum(abs(diff_tr));

             #sum of all absolute values of relative differences defined above

             predsr[j-minx+1,k]=sum(abs(diffr));

             predsr_tr[j-minx+1,k]=sum(abs(diffr_tr));



           }

        }

        } else{

          if (Rsq==F){

          #computing the probabilities for each outcome on the test set by fitting multinomial/logistic regression to the training set

          p<-get_probabilities(trset1,testset1,outc,mode,Rsq,n_tr=n_tr);

          p_tr<-get_probabilities(trset1,trset1,outc,mode,Rsq,n_tr=n_tr);


          if (objfun=='acc'){ #case of accuracy maximisation

            #transforming probabilities into predictions

            pred<-get_predictions(p,l-floor((1-1/part)*l),cutoff,cmode,mode)

            pred_tr<-get_predictions(p_tr,floor((1-1/part)*l),cutoff,cmode,mode)

            #difference between the actual values and the predicted ones on the test set

            diff<-outi[setdiff(1:l,ra)]-array(pred);

            diff_tr<-outi[ra]-array(pred_tr);


            if (predm=='exact')  { #in case of the exact prediction, aka predicting the exact class

              diff[abs(diff)>0]=1;

              diff_tr[abs(diff_tr)>0]=1;
            } else{

              #in case of "up to a class" prediction, aka consider correct if the correct class is the one neighboring to the predicted one

              diff[abs(diff)<2]=0;
              diff[abs(diff)>1]=1;
            }

            #computing the number of times prediction was correct (non-averaged out accuracy)

            #rows correspond to the number of variables in a regression, column is determined by k

            preds[j-minx+1,k]=l-floor((1-1/part)*l)-sum(abs(diff));

            preds_tr[j-minx+1,k]=floor((1-1/part)*l)-sum(abs(diff_tr));


          } else{

            #computing the AUROC of the prediction

            preds[j-minx+1,k]<-AUC(1-p,outi[setdiff(1:l,ra)])

            preds_tr[j-minx+1,k]<-AUC(1-p_tr,outi[ra])
          }
          } else {

            marg0=0.0

            if (marg>0) marg0=1.96*sqrt(sum(outc)*(n_tr-sum(outc))/n_tr)

            if (marg0<=marg){

              #Rsq_v=1-exp(-s[ai[k]]/(0.1*n_tr))

              #computing the probabilities for each outcome on the test set by fitting multinomial/logistic regression to the training set

              p<-get_probabilities(trset1,testset1,outc,mode,Rsq,s[ai[k]],n_tr);

              p_tr<-get_probabilities(trset1,trset1,outc,mode,Rsq,s[ai[k]],n_tr);



              if (objfun=='acc'){ #case of accuracy maximisation

                #transforming probabilities into predictions

                pred<-get_predictions(p,l-floor((1-1/part)*l),cutoff,cmode,mode)

                pred_tr<-get_predictions(p_tr,floor((1-1/part)*l),cutoff,cmode,mode)

                #difference between the actual values and the predicted ones on the test set

                diff<-outi[setdiff(1:l,ra)]-array(pred);

                diff_tr<-outi[ra]-array(pred_tr);


                if (predm=='exact')  { #in case of the exact prediction, aka predicting the exact class

                  diff[abs(diff)>0]=1;

                  diff_tr[abs(diff_tr)>0]=1;
                } else{

                  #in case of "up to a class" prediction, aka consider correct if the correct class is the one neighboring to the predicted one

                  diff[abs(diff)<2]=0;
                  diff[abs(diff)>1]=1;
                }

                #computing the number of times prediction was correct (non-averaged out accuracy)

                #rows correspond to the number of variables in a regression, column is determined by k

                preds[j-minx+1,k]=l-floor((1-1/part)*l)-sum(abs(diff));

                preds_tr[j-minx+1,k]=floor((1-1/part)*l)-sum(abs(diff_tr));


              } else{

                #computing the AUROC of the prediction

                preds[j-minx+1,k]<-AUC(1-p,outi[setdiff(1:l,ra)])

                preds_tr[j-minx+1,k]<-AUC(1-p_tr,outi[ra])
              }



            }



          }
        }
      }
      }
    }
  }




  if ((mode=='binary')&(objfun=='acc')){

    #empirical prediction based on always choosing the most frequent category

    cpred=sum(outi[setdiff(1:l,ra)])/(l-floor((1-1/part)*l));

#output is a list of (non-averaged) accuracies of all feasible regression models, the corresponding maximal number of variables and the corresponding empirical prediction

    list("regr" = preds, "nvar"=nvar, "emp" = cpred, "regr_tr"=preds_tr)


  } else{

    if (objfun=='roc') {

      #list of AUROCs for all feasible models and the corresponding maximal number of variables

      list("regr" = preds, "nvar"=nvar,"regr_tr"=preds_tr)

    } else{

      if (mode=='multin'){

        #unique elements of the training set outcomes

        uo<-unique(outc);

        #empirical prediction based on always choosing the most frequent category

        cpred=1-length(which(outi[setdiff(1:l,ra)]==uo[which.max(tabulate(match(outc,uo)))]))/(l-floor((1-1/part)*l));

        #output is a list of (non-averaged) accuracies of all feasible regression models, the corresponding maximal number of variables and the corresponding empirical prediction

        list("regr" = preds, "nvar"=nvar, "emp" = cpred,"regr_tr"=preds_tr)

      } else{ #linear mode

        #empirical predictions based on always choosing the mean of the training set (empirical absolute and relative errors respectively)

      cpred=sum(abs(outi[setdiff(1:l,ra)]-mean(outi[ra])))/(l-floor((1-1/part)*l));

      cpredr=sum(abs((outi[setdiff(1:l,ra)]-mean(outi[ra]))/outi[setdiff(1:l,ra)]))/(l-floor((1-1/part)*l));

      #output is a list of non-averaged absolute errors of all feasible regression models,
      #the corresponding maximal number of variables, non-averaged relative errors of all feasible regression models,
      #the absolute error of the empirical prediction and the relative error of the empirical prediction

      list("regr" = preds, "nvar"=nvar, "regrr"=predsr, "emp"=cpred,"empr"=cpredr,"regrr_tr"=predsr_tr,"regr_tr"=preds_tr)

    }
  }

  }


}


#'Averaging out the predictive power
#'
#'Function which averages out the predictive power over all cross-validations
#'@param preds An M x \code{crv}N matrix consisting of \code{crv} horizontally concatenated M x N matrices. These M x N matrices are the matrices of predictive powers for all feasible regressions (M is maximum feasible number of variables included in a regression, N is the maximum feasible number of regressions of the fixed size; the row index indicates the number of variables included in a regression)
#'@param crv number of cross-validations
#'@param k size of the test set for which the predictions are made
#'@usage av_out(preds,crv,k)
#'@return Returns an M x N matrix of average predictive powers where M is maximum feasible number of variables included in a regression, N is the maximum feasible number of regressions of the fixed size; the row index indicates the number of variables included in a regression
#'@export av_out
#'@examples
#'#creating a matrix of predictive powers
#'
#'preds<-cbind(matrix(runif(40,1,4),ncol=10),matrix(runif(40,1.5,4),ncol=10))
#'preds<-cbind(preds,matrix(runif(40,1,3.5),ncol=10))
#'
#'#running the function
#'
#'av_out(preds,3,5)


av_out<-function(preds,crv,k){

  #writing the dimensions of the matrix of predictive powers into an array

  si<-dim(preds);

  #dividing the number of columns in preds by the number of cross-validations, since the results from each next cross-validation are always concatenated with the previous one

  si[2]=si[2]/crv;

  #initialising the array of averaged out predictive powers

  predsp<-array(NA,c(si[1],si[2]))

  for (i in 1:si[1]){
    for (j in 1:si[2]){

      pr<-preds[i,seq(j,si[2]*crv,si[2])]; #predictive power corresponding to the same model from all cross-validations

      #the mean value of the corresponding predictive power divided by the size of the test set

      predsp[i,j]=mean(pr[is.finite(pr)],na.rm=TRUE)/k;

    }
  }

  #output is the matrix of the averaged out predictive powers

  predsp

}
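
# Indexing sketch: for crv = 3 cross-validations with N = 10 columns each,
# predsp[i, j] averages the finite values of preds[i, c(j, j + 10, j + 20)]
# (the same model across all cross-validations) and divides the mean by the
# test-set size k.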

#'Best regression
#'
#'Function which identifies regressions with the highest predictive power
#'
#'@param predsp An M x N matrix of averaged out predictive power values. M is maximum feasible number of variables included in a regression, N is the maximum feasible number of regressions of the fixed size; the row index indicates the number of variables included in a regression.
#'@param nvar array of maximal number of variables for each cross-validation
#'@param c array of all indices of the prediction variables
#'@param we array of all weights of the prediction variables
#'@param st a subset of predictors to be always included into a predictive model
#'@param minx minimum number of predictors, defaults to 1
#'@usage get_indices(predsp,nvar,c,we,st,minx)
#'@return A list of arrays which contain indices of the predictors corresponding to the best regressions
#'@seealso Uses \code{\link{sum_weights_sub}}, \code{\link{find_sub}}, \code{\link[utils]{combn}}
#'@export get_indices
#'@examples
#'#creating a set of averaged out predictive powers
#'
#'predsp<-matrix(NA,ncol=3,nrow=3)
#'
#'predsp[1,]=runif(3,0.7,0.8)
#'predsp[2,]=runif(3,0.65,0.85)
#'predsp[3,1]=runif(1,0.4,0.5)
#'
#'#running the function
#'
#'get_indices(predsp,c(3,3,3),1:3,c(1,1,1))

get_indices<-function(predsp,nvar,c,we,st=NULL,minx=1){

  #creating a list

  ll=list(0)

  if (sum(is.na(predsp))<length(predsp)){

  #finding the index of the array of the averaged out predictive powers which corresponds to the highest predictive power

  nums<-which(predsp==max(predsp[predsp!=0],na.rm=TRUE))

  #dimensions of the array of the averaged out predictive powers

  si=dim(predsp)

  #computing the number of variables of the best predictive model based on the row it corresponds to

  numv=nums%%si[1]

  #finding the column which corresponds to the model with the best predictive power

  numss=ceiling(nums/si[1])

  #array of predictive variables without the "fixed subset" defined by parameter st

  c=setdiff(c,st)

  #length of the "fixed subset"

  lest=length(st)

  #going through all models which exhibited the highest predictive power

  for (i in 1:length(numv)){

    #in case the value of the number of variables is 0 reassign the maximal number of variables to it

    if (numv[i]==0) numv[i]=si[1]

    #add the minimal number of variables defined by parameter minx to the number of variables

   numv1=numv[i]+minx-1

   #all subsets of size numv1 minus the size of the "fixed subset" defined by st

      af<-combn(c,(numv1-lest))

   #in case there is only one predictor and st is empty

    if ((length(c)<2)&(numv1>lest)) af<-matrix(c)

      #in case the "fixed subset" is not empty add to each column of af a column with elements of st

    if (is.null(st)==FALSE) af<-rbind(matrix(st,ncol=dim(af)[2],nrow=lest),af)

      #compute the weights of models corresponding to columns of af

    s<-sum_weights_sub(af,numv1,we,st)

    #reorder the columns of af based on the order of the array of weights s

    af<-find_sub(af,s,numv1,c,st)

    #sort the array of weights

    s=sort(s)

    #find the models with weights satisfying the maximal weight constraint

    aif=which(s<=max(nvar));

    #the column of af exhibiting the best predictive power is written as an ith element of the list

    ll[[i]]<-af[,aif[numss[i]]]



  }
  } else ll=NA

  #output the list of the models exhibiting the best predictive power

  ll
}

#'Indices of the best regressions
#'
#'One of the two main functions of the package. Identifies the predictors included into regressions with the highest average predictive power
#'@param vari set of predictors
#'@param outi array of outcomes
#'@param crv number of cross-validations
#'@param cutoff cut-off value for mode \code{'binary'}
#'@param part for each cross-validation partitions the dataset into training and test set in a proportion \code{(part-1):1}
#'@param cmode \code{'det'} or \code{''}; \code{'det'} always predicts the more likely outcome as determined by the odds ratio; \code{''} predicts certain outcome with probability corresponding to its odds ratio (more conservative). Option available for multinomial/logistic regression
#'@param mode \code{'binary'} (logistic regression), \code{'multin'} (multinomial regression)
#'@param predm \code{'exact'} or \code{''}; for logistic and multinomial regression; \code{'exact'} computes how many times the exact outcome category was predicted, \code{''} computes how many times either the exact outcome category or its nearest neighbour was predicted
#'@param objfun \code{'roc'} for maximising the predictive power with respect to AUC, available only for \code{mode='binary'}; \code{'acc'} for maximising predictive power with respect to accuracy.
#'@param parallel TRUE if using parallel toolbox, FALSE if not. Defaults to FALSE
#'@param cores number of cores to use in case of parallel=TRUE
#'@param minx minimum number of predictors to be included in a regression, defaults to 1
#'@param maxx maximum number of predictors to be included in a regression, defaults to maximum feasible number according to one in ten rule
#'@param maxw maximum weight of predictors to be included in a regression, defaults to maximum weight according to one in ten rule
#'@param nr a subset of the data-set, such that \code{1/part} of it lies in the test set and \code{1-1/part} is in the training set, defaults to empty set. This is to ensure that elements of this subset are included both in the training and in the test set.
#'@param st a subset of predictors to be always included into a predictive model,defaults to empty set
#'@param rule an Events per Variable (EPV) rule, defaults to 10
#'@param corr maximum correlation between a pair of predictors in a model
#'@param Rsq whether the R-squared statistic constraint is introduced
#'@param marg margin of error for the R-squared statistic constraint
#'@return Prints the best predictive power provided by a regression, predictive accuracy of the empirical prediction (value of \code{emp} computed by \code{cross_val} for logistic and linear regression). Returns indices of the predictors included into regressions with the highest predictive power written in a list. For \code{mode='linear'} outputs a list of two lists. First list corresponds to the smallest absolute error, second corresponds to the smallest relative error
#'@export regr_ind
#'@import doParallel
#'@import parallel
#'@import foreach
#'@seealso Uses \code{\link{compute_weights}}, \code{\link{make_numeric}}, \code{\link{compute_max_weight}}, \code{\link{compute_weights}}, \code{\link{compute_max_length}}, \code{\link{cross_val}},\code{\link{av_out}}, \code{\link{get_indices}}
#'@examples
#'#creating variables for linear regression mode
#'
#'variables_lin<-matrix(c(rnorm(56,0,1),rnorm(56,1,2)),ncol=2)
#'
#'#creating outcomes for linear regression mode
#'
#'outcomes_lin<-rnorm(56,2,1)
#'
#'#running the function
#'
#'regr_ind(variables_lin,outcomes_lin,100,mode='linear',parallel=TRUE,cores=2)
#'
#'#creating variables for binary mode
#'
#'vari<-matrix(c(1:100,seq(1,300,3)),ncol=2)
#'
#'#creating outcomes for binary mode
#'
#'out<-rbinom(100,1,0.3)
#'
#'#running the function
#'
#'regr_ind(vari,out,20,cutoff=0.5,part=10,mode='binary',parallel=TRUE,cores=2,nr=c(1,10,20),maxx=1)




regr_ind<-function(vari,outi,crv,cutoff=NULL,part=10,mode,cmode='det',predm='exact',objfun='acc',parallel=FALSE,cores,minx=1,maxx=NULL,nr=NULL,maxw=NULL,st=NULL,rule=10,corr=1,Rsq=F,marg=0){

  #in case of an error close all the connections

  #on.exit(closeAllConnections());


  #error message in case of incompatible mode and objfun parameters

  if ((objfun=='roc')&(mode!='binary')) {

    stop('function "roc" is available for binary mode only')

  }

  if ((length(unique(outi))>2)&(mode=='binary')){

    stop('the outcome you provided appears to be non-binary, please re-run in multin mode')

  }


#if there is only one predictive variable written in an array

  if (is.null(dim(vari))){

    vari=matrix(vari);
  }

  #overall number of predictive variables

  vari_col=ncol(vari);

  #compute weights of all predictive variables and write them into an array

  we<-compute_weights(vari_col,vari)

  #the sample size

  l=nrow(vari);

  n_tr=floor((1-(1/part))*l)


  #the array of all indices of predictive variables

  c<-c(1:vari_col);

  if (is.numeric(outi)==FALSE){ #turning non-numeric outcomes in numeric ones

    outi<-make_numeric(outi,TRUE,1:length(outi),mode)

  }

  #compute maximum weight defined by the outcomes

  numr<-compute_max_weight(outi,mode);

  #compute the maximal number of feasible regressions of the same size

  le<-compute_max_length(vari_col,floor((1.0/rule)*numr),c,we,minx,maxx+1,st)

  #initialising the arrays of predictive powers of all feasible regressions

  preds<-array(NA,c(min(vari_col,min(floor((1/(rule*min(we)))*numr),maxx+1))-minx+1,le));

  preds_tr<-array(NA,c(min(vari_col,min(floor((1/(rule*min(we)))*numr),maxx+1))-minx+1,le));

  #defining a %fun% function

  `%fun%` <- `%do%`


  if (parallel==TRUE){ #in case the parallel mode is on

    `%fun%` <- `%dopar%`

    #creating a cluster of the corresponding form

    cl <- makeCluster(cores,setup_strategy="sequential")

    registerDoParallel(cl)

    #exporting the corresponding libraries to all cores

    clusterEvalQ(cl,rm(list=ls()))
    clusterEvalQ(cl, library(nnet))
    clusterEvalQ(cl, library(foreach))
    clusterEvalQ(cl, library(doParallel))
    clusterEvalQ(cl, library(stats))

  #exporting all necessary functions to all cores

    clusterExport(cl,"cross_val")
    clusterExport(cl,"compute_max_weight")
    clusterExport(cl,"sum_weights_sub")
    clusterExport(cl,"get_probabilities")
    clusterExport(cl,"get_predictions")
    clusterExport(cl,"get_predictions_lin")
    clusterExport(cl,"find_sub")
    clusterExport(cl,"make_numeric")
    clusterExport(cl,"make_numeric_sets")
    clusterExport(cl,"AUC")


  }

  #running the given number of cross-validations

  result <- foreach(i=1:crv, .combine='comb', .multicombine=TRUE,.packages="CARRoT") %fun% {



    cross_val(vari,outi,c,rule,part,l,we,vari_col,preds,mode,cmode,predm,cutoff,objfun,minx,maxx,nr,maxw,st,corr,Rsq,marg,n_tr,preds_tr);



  }

  if(parallel==T) on.exit(stopCluster(cl))
  #writing predictive powers of the regressions in an array

  preds<-result[[1]];

  #writing the maximal number of variables in an array

  nvar<-result[[2]];

  preds_tr<-result[[length(result)]]



  if (mode=='binary') {

    if (objfun=='acc') { #write the array of empirical predictions

      cpred<-result[[3]];

    }
  } else {

    if (mode=='linear'){

      #write the array of relative errors

      predsr<-result[[3]];

      predsr_tr<-result[[length(result)-1]]

      #write the array of absolute errors of empirical predictions

    cpred<-result[[4]];

    #write the array of relative errors of empirical predictions

    cpredr<-result[[5]];

    } else{

      #an array of empirical predictions

      cpred<-result[[3]]

      }

  }




  #stop the cluster

#  if (parallel==TRUE) stopCluster(cl)

#  closeAllConnections()


  #average out the predictive powers over all cross-validations

  if (objfun=='roc'){
    predsp<-av_out(preds,crv,1)

    predsp_tr<-av_out(preds_tr,crv,1)
  } else{

    predsp<-av_out(preds,crv,l-n_tr)

    predsp_tr<-av_out(preds_tr,crv,n_tr)
  }

  if (mode=='linear')  {

    predspr<-av_out(predsr,crv,l-n_tr)

    predspr_tr<-av_out(predsr_tr,crv,n_tr)


  }




  if (((mode=='binary')&(objfun=='acc'))|(mode=='multin')) {

    #print the average accuracy attained by the best predictive model and the empirical accuracy
    t1<-max(predsp[predsp>0],na.rm=TRUE)
    t1_ind<-which(predsp==t1)[1]

    if (is.infinite(t1)) t1=NA
    a1<-c(t1,min(predsp_tr[t1_ind]),1-sum(cpred)/crv)

    print(a1);

  } else{

    if (objfun=='roc') {

      #print the average AUROC of the best predictive model
    t1<-max(predsp[predsp>0],na.rm=TRUE)

    t1_ind<-which(predsp==t1)
    if (is.infinite(t1)) t1=NA

      print(c(t1,predsp_tr[t1_ind]))
      a1<-c(t1,predsp_tr[t1_ind])

    } else{


      if (is.na(sum(cpredr[cpredr<Inf])/(crv-length(cpredr[cpredr==Inf])))==F){

        #in case all average relative errors are finite

        t1<-min(predsp[predsp>0],na.rm=TRUE)
        t2<-min(predspr[predspr>0],na.rm=TRUE)

        t1_ind<-which(predsp==t1)
        t2_ind<-which(predspr==t2)

        if (is.infinite(t1)) t1=NA
        if (is.infinite(t2)) t2=NA


      print(c(t1,t2,min(predsp_tr[t1_ind]),min(predspr_tr[t2_ind]),sum(cpred)/crv,sum(cpredr[cpredr<Inf])/(crv-length(cpredr[cpredr==Inf]))))

      a1<-c(t1,t2,min(predsp_tr[t1_ind]),min(predspr_tr[t2_ind]),sum(cpred)/crv,sum(cpredr[cpredr<Inf])/(crv-length(cpredr[cpredr==Inf])))

        } else {

          #printing NaN values for average relative error in case it is not finite

          t1<-min(predsp[predsp>0],na.rm=TRUE)

          t1_ind<-which(predsp==t1)

          if (is.infinite(t1)) t1=NA

        print(c(t1,NaN,min(predsp_tr[t1_ind]),sum(cpred)/crv,NaN))

          a1<-c(t1,NaN,sum(cpred)/crv,NaN)


      }


    }
  }




  if (mode=='linear') { #find the indices of the variables included into the regression with the best predictive power

   if (sum(is.finite(predspr))>0) {

     #find indices of the variables included in the models corresponding to the lowest absolute and lowest relative error

     list(a1,get_indices(-predsp,nvar,c,we,st,minx),get_indices(-predspr,nvar,c,we,st,minx))

   } else {

     #in case relative error is infinite output NaN

     list(a1,get_indices(-predsp,nvar,c,we,st,minx),NaN)

   }

  } else{

    #find indices of the variables included in the models corresponding to the highest accuracy/AUROC

    list(a1,get_indices(predsp,nvar,c,we,st,minx))

  }






}
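
# Sketch of reading the output (hypothetical numbers): in 'binary'/'acc' mode a
# returned value such as
#
#   list(c(0.81, 0.84, 0.70), list(c(1, 3)))
#
# would mean the best model attains average test accuracy 0.81 (0.84 on the
# training split, against an empirical accuracy of 0.70) and uses predictors 1 and 3.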




#' Pairwise interactions and squares
#'
#' Function transforms a set of predictors into a set of predictors, their squares and pairwise interactions
#'@param A set of predictors
#'@param n first \code{n} predictors, whose interactions with the rest should be taken into account, defaults to all of the predictors
#'@return Returns the predictors including their squares and pairwise interactions
#'@export quadr
#'@examples quadr(cbind(1:100,rnorm(100),runif(100),rnorm(100,0,2)))



quadr<-function(A,n=1000){

  #copying the array of variables into array B

  B<-A

if (n==1000){ #if the n parameter is set to default

  n=dim(A)[2] #the number of variables whose interactions are to be considered is the number of all variables

}

  ind<-c()

for (i in 1:n){

  ind<-c(ind,i:n)
}

B<-cbind(B,A[,rep(1:n,c(n:1))]*A[,ind])

#for (i in 1:n){

#  B=cbind(B,A[,i:dim(A)[2]]*A[,i]) #multiplying variables in order to obtain pairwise interactions

#}

B
}
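
# Column-layout sketch: for n input variables quadr returns n + n*(n+1)/2
# columns, the original variables followed by the products x_i * x_j for
# i <= j in the order (1,1),(1,2),...,(1,n),(2,2),...,(n,n); e.g.
#
#   ncol(quadr(matrix(rnorm(400), ncol = 4)))   # 4 + 10 = 14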

#' Three-way interactions and squares
#'
#' Function transforms a set of predictors into a set of predictors, their squares, pairwise interactions, cubes and three-way interactions
#'@param A set of predictors
#'@param n first \code{n} predictors, whose interactions with the rest should be taken into account, defaults to all of the predictors
#'@return Returns the predictors including their squares, pairwise interactions, cubes and three-way interactions
#'@export cub
#'@examples cub(cbind(1:100,rnorm(100),runif(100),rnorm(100,0,2)))
#'
#'
cub<-function(A,n=1000){

  B<-quadr(A,n) #creating an array of all pairwise interactions

  if (n==1000){#if the n parameter is set to default
  n<-dim(A)[2] #the number of variables whose interactions are to be considered is the number of all variables
  }

  m<-dim(B)[2] #the number of variables and their pairwise interactions

  C<-B[,(n+1):m]

  k<-m-n

  ind<-c()
  ind0<-c()

  for (i in 1:n){

#    ind<-c(ind,i:n)
    ind0<-c(ind0,c((n-i+1):1))
  }

  for (i in 1:n){

    ind<-c(ind,n+1-ind0[(ifelse(i==1,0,sum((n-i+2):n))+1):length(ind0)])
  }

  B<-cbind(B,C[,rep(1:k,ind0)]*A[,ind])

#  for (i in 1:n){
#    B=cbind(B,B[,(n+0.5*(n+n-i+2)*(i-1)+1):m]*A[,i]) #multiplying variables from B (pairwise interactions) and A in order to obtain three-way interactions
#  }

  B

}
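
# Column-count sketch: on top of the n + n*(n+1)/2 columns produced by quadr,
# cub appends the n*(n+1)*(n+2)/6 three-way monomials x_i * x_j * x_k with
# i <= j <= k; e.g.
#
#   ncol(cub(matrix(rnorm(400), ncol = 4)))   # 4 + 10 + 20 = 34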


#' Finding the interacting terms based on the index
#'
#' Function transforms an index of an array of two- or three-way interactions into two or three indices corresponding to the interacting variables
#'@param ind index to transform
#'@param N number of interacting variables
#'@usage find_int(ind,N)
#'@return Returns two or three indices corresponding to a combination of variables written under the given index
#'@export find_int
#'@examples find_int(28,9)


find_int<-function(ind,N){

  if (ind<=N){ #if the index is lower than the number of variables

    ind #it is just a single variable

  } else{

    if (ind<=(N+0.5*(N+1)*N)){ #if the index is smaller than the number of all variables, their squares and two-way interactions

    a<-2*N #starting point is interactions with the first variables

    i<-1

    while (ind>a){ #locating the index by adding the number of possible interacting variables

      a=a+N-i

      i=i+1
    }

    c(i,ind-a+N) #output two interacting variables

    } else{ #in case the interaction goes beyond a pair of variables


      a<-N+0.5*(N+1)*N #starting point two-way interactions

      i<-0

      while (ind>a){ #locating the index by adding the number of possible interacting variables taking into account three-way interactions

        a=a+0.5*(N-i+1)*(N-i)

        i=i+1
      }

      ind1<-i #the index of the first interacting variable

      ind2<-ind-a+0.5*(N-i+2)*(N-i+1) #reducing the problem to finding two interacting variables

      a<-N-i+1 #similar to the two-way interaction case but taking into account the number of the first interacting variable


      while (ind2>a){ #similarly to two-way case locating two variables interacting with each other

        a=a+N-i

        i=i+1
      }

      c(ind1,i,ind2-a+N) #outputting three interacting variables

    }


  }

}
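
# Worked example for the roxygen example above: with N = 9 variables, columns
# 1..9 of quadr() are the variables themselves and columns 10..54 the pairwise
# products ordered (1,1),...,(1,9),(2,2),...,(9,9); index 28 falls into the
# block that starts with (3,3) at column 27, so find_int(28, 9) returns
# c(3, 4), i.e. the product of variables 3 and 4.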
