# R/gilmour.R


#' Critical values for testing at the 5% significance level.
#' The data were published in the original article;
#' crit was taken from the original study (see Table 2)
#' and its values were recalculated by Josef Dolejs
#' for a wider range of degrees of freedom.
#' @name crit
#' @docType data
#' @author Steven G. Gilmour; recalculated by Josef Dolejs for a wider range of degrees of freedom
#' @references \url{https://rss.onlinelibrary.wiley.com/doi/10.2307/2348411}
#' @keywords data
"crit"

#' Illustrative data published in the original article (house prices).
#' The data have practical meaning;
#' Gilmour9p was taken from the original study (see Table 1).
#' @name Gilmour9p
#' @docType data
#' @author Steven G. Gilmour
#' @references \url{https://rss.onlinelibrary.wiley.com/doi/10.2307/2348411}
#' @keywords data
"Gilmour9p"

#' T1 contains simulated data with no real meaning.
#' The null hypothesis is not rejected in the first test,
#' so T1 illustrates the situation where the full model is the final model.
#' @name T1
#' @docType data
#' @author Josef Dolejs
#' @references Josef Dolejs
#' @keywords data
"T1"

#' T2 illustrates the situation where
#' the loop of tests ends with the trivial model.
#' The data have no practical meaning.
#' @name T2
#' @docType data
#' @author Josef Dolejs
#' @references Josef Dolejs
#' @keywords data
"T2"

#' Trivial illustrates the situation where
#' the trivial model is model_min and no testing process is performed.
#' The data have no real meaning.
#' @name Trivial
#' @docType data
#' @author Josef Dolejs
#' @references Josef Dolejs
#' @keywords data
"Trivial"

#' Special illustrative data for the case when more than two tests
#' are performed in the loop of the function final_model().
#' The original Gilmour table was modified;
#' the data have no real meaning.
#' @name Modified_Gilmour9p
#' @docType data
#' @author Josef Dolejs
#' @references Josef Dolejs
#' @keywords data
"Modified_Gilmour9p"

#' Number of visitors in national parks; citation:
#' Stemberk Josef, Josef Dolejs, Petra Maresova, Kamil Kuca.
#' Factors Affecting the Number of Visitors in National Parks
#' in the Czech Republic, Germany and Austria.
#' ISPRS Int. J. Geo-Inf. 2018, 7(3), 124; doi:10.3390/ijgi7030124
#' The data have real meaning.
#' @name Parks5p
#' @docType data
#' @author Stemberk Josef, Josef Dolejs, Petra Maresova, Kamil Kuca
#' @references \url{https://www.mdpi.com/2220-9964/7/3/124}
#' @keywords data
"Parks5p"


#' Number of patents in universities; citation:
#' Maresova Petra, Soukal Ivan, Stemberkova Ruzena, Selamat Ali.
#' Perspective and Suitable Research Area in
#' Public Research-Patent Analysis of the Czech Public Universities.
#' Education and Urban Society, 54(7).
#' The data have real meaning.
#' @name Patents5p
#' @docType data
#' @author Maresova Petra, Soukal Ivan, Stemberkova Ruzena, Selamat Ali
#' @references \doi{10.1177/00131245211027362}
#' @keywords data
"Patents5p"

#' Illustrative econometric data from Eurostat
#' for 5 variables in 17 countries in 2019.
#' Columns: Country, LifExp, HDP, Unempl, Obesity, APassangers.
#' Remove the first column before modelling:
#' rownames(EU2019) = EU2019[,1]; EU2019 = EU2019[,-1]
#' The data have real meaning.
#' @name EU2019
#' @docType data
#' @author Josef Dolejs
#' @references Josef Dolejs
#' @keywords data
"EU2019"

submodels<-function(d){
  # two basic objects and the first level of degrees of freedom in testing is t = n-k-1
  y=d[,1]; X=d[,2:ncol(d)]; k=ncol(d)-1; n=nrow(d); t=n-k-1 # k is number of regressors
  # number of observations should be > number of regressors+3
  if( n<=(k+3) ) {message("the number of observations is not greater than k+3")
    message( paste("reduction of regressors should be done, n =",n,"and k+3 =",k+3))}
  # the second level of degrees of freedom is r=k-q+1
  # full model is labeled as "Model", set of regressors is "x1-xk" for k
  regressors = paste0("x",1:k)
  Model=paste0("y~")
  for (i in 1:(k-1)){ Model=paste0(Model, regressors[i],"+")  } # only k-1 regressors
  Model=paste0(Model, regressors[k])
  # the plus sign is not used after the last regressor
  message( paste("Full Model: ",Model) )
  # reading of data to regressors from table d
  for (i in 1:k){ assign(regressors[i],d[,i+1]) }
  # Results and MSE in the full model
  Results = lm( as.formula(Model) )
  MSE = sigma(Results)^2
  # variables n, k, y, x1,x2...xk, MSE are used
  # matrix Smodels  contains all possible submodels
  s=0 # number of all possible submodels is calculated
  # for each "level" i (i is the number of regressors in a submodel)
  # the number of submodels is ncol(combn(seq(from=1,to=k,by=1),i))
  # and the resulting value s is the sum over all levels "i"
  for (i in 1:k){
    s = s + ncol(combn(seq(from=1,to=k,by=1),i))
  }
  s # e.g. s = 511 in "Gilmour9p" for k=9
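  # equivalently s = 2^k - 1 (the number of non-empty subsets of the k regressors)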
  # matrix of submodels and string expressions of submodels are created
  m=0 # m is the row index in the matrix of submodels and is stored in its first column
  Smodels = matrix(,ncol=4,nrow=s) # matrix of submodels is created (empty for now)
  colnames(Smodels)=c("Submodel number","Number of regressors+1","Cp","Submodel")
  # the trivial model (constant C1, p=1) is calculated
  # function var() calculates the sample variance (the divisor (n-1) is used)
  m=m+1; p=1 # here row m=1 (the first row contains the trivial model "C")
  C1=var(y)*(n-1)/MSE-(n-2*p)-2*(k-p+1)/(n-k-3) # formula from the original article
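  # note: var(y)*(n-1) equals the residual sum of squares of the constant-only model,
  # so C1 is the bias-corrected Cp of the trivial model (p=1)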
  # string expression of submodel is in the 4th column and variable "TT" is used
  Smodels[m,1]=toString(m)
  Smodels[m,2]=toString(p)
  Smodels[m,3]=toString(C1)
  Smodels[m,4]=toString("C") # the end of the trivial model
  # loop through all levels of submodels: p = 1 : k
  # in the full model the level is p = k+1, therefore p in the last submodel is k here
  for (p in 2:(k+1-1)) { # p is level (size) of submodel (number of regressors + 1)
    # level p is fixed and all combinations at the level are calculated
    # all levels except the trivial model; combinations of p-1 regressors are taken
    Comb_p = combn(seq(from=1,to=k,by=1),p-1) # e.g. for p=2 only one regressor
    # nested loop for fixed p
    # the main part creates a textual representation of the submodel
    # and calculates it using the lm() function
    for (i in 1:ncol(Comb_p)){ # i is number of submodels in subset for level p
      subM=matrix(, ncol=p-1,nrow=ncol(Comb_p)) # matrix of regressors, not submodels
      for (j in 1:(p-1)) {subM[i,j]= sprintf("x%s", Comb_p[j,i])} # j indexes regressors
      # the string expression of the submodel TT is built in two alternatives: p>2 or p==2
      TT="y~"
      if(p>2){for (j in 1:(p-2)){TT=paste0(TT, subM[i,j],"+")}
        TT=paste0(TT,subM[i,p-1])}
      if(p==2) {TT=paste0(TT,subM[i,p-1])}
      sublm = lm(as.formula(TT)) # linear regression calculated in submodel using lm()
      Cp=(sum(resid(sublm)^2)/MSE-n+2*p)-(2*(k-p+1)/(n-k-3)) # bias-corrected Cp (formula from the original article)
      # 4 values in 4 columns in matrix Smodels are written to m row
      # the four values are: number of submodel m, level p, Cp, textual representation of the submodel
      m=m+1
      Smodels[m,1]=toString(m); Smodels[m,2]=toString(p)
      Smodels[m,3]=toString(Cp); Smodels[m,4]=toString(TT)
    } # i is number of submodels in subset for level p
  }  # p is level (size) of submodel (number of regressors + 1)
  message(dim(Smodels))
  message(Smodels)   # the output of Smodels
  # selection of candidate submodel with the minimal value of Cp
  min=which.min(as.numeric(Smodels[,3]))
  # minimal Cp = Cq+1 (labeling according to the original article)
  Cq_plus_1 = min(as.numeric(Smodels[,3])) # e.g. p=3 in "y~x1+x2" and q=2 in the original example
  q=as.numeric(Smodels[min,2])-1 # number of regressors in model_min
  message(paste("Number of regressors in model_min ",q))
  model_min=Smodels[min,4]
  if(q>0){ model_min_results= lm(as.formula(model_min)); model_minA=paste( model_min, " + constant") }
  if(q==0){ model_minA ="Trivial model" ; model_min_results= NA ; Cq_plus_1=C1 }
  message(model_minA)
  message(paste("model_min: ", model_minA))
  message(paste("Cq+1: ", Cq_plus_1))

  return( list(y=y,X=X,n=n,k=k,t=t,regressors=regressors, full_model= paste(Model," + constant"),
               full_model_results= Results, MSE = MSE, submodels_number=s, submodels=Smodels,
               Cq_plus_1A= Cq_plus_1, q=q, model_min=model_minA, model_min_results= model_min_results)  )
}
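
# Minimal usage sketch (illustrative assumption: the bundled data set Gilmour9p
# is arranged with the response in column 1 and the regressors in the remaining
# columns, as submodels() expects); wrapped in interactive() so nothing is
# evaluated while the package is built or loaded.
if (interactive()) {
  res <- submodels(Gilmour9p)
  res$submodels_number   # number of all possible submodels (2^k - 1)
  res$model_min          # submodel with the minimal bias-corrected Cp
  res$Cq_plus_1A         # its Cp value (Cq+1 in the original article)
}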





final_model<-function(d){
  # two basic objects and the first level of degrees of freedom in testing is t = n-k-1
  y=d[,1]; X=d[,2:ncol(d)]; k=ncol(d)-1; n=nrow(d); t=n-k-1 # k is number of regressors
  # (n-k-3)> 0 => number of observations should be > number of regressors+3
  if( n<=(k+3) ) {message("the number of observations is not greater than k+3")
    message( paste("reduction of regressors should be done, n =",n,"and k+3 =",k+3))}
  # the second level of degrees of freedom is r=k-q+1
  # full model is labeled as "Model", set of regressors is "x1-xk" for k
  regressors = paste0("x",1:k)
  Model=paste0("y~")
  for (i in 1:(k-1)){ Model=paste0(Model, regressors[i],"+")  } # only k-1 regressors
  Model=paste0(Model, regressors[k]) # the plus sign is not used after the last regressor
  # reading of data to regressors from table d
  for (i in 1:k){ assign(regressors[i],d[,i+1]) }
  # the full model Results, MSE
  full_model_results = lm( as.formula(Model) ); MSE = sigma(full_model_results)^2
  # variables n, k, y, x1,x2...xk, MSE are used
  # matrix Smodels contains all possible submodels
  # sufficient number of observations n should be available (n>k+3)
  s=0 # number of all possible submodels
  # for each "level" i (i is the number of regressors in a submodel)
  # the number of submodels is ncol(combn(seq(from=1,to=k,by=1),i))
  # and the resulting value s is the sum over all levels "i"
  for (i in 1:k){
    s = s + ncol(combn(seq(from=1,to=k,by=1),i))
    }
  s # e.g. it is 511 in "Gilmour9p" for k=9
  # matrix of submodels and string expressions of submodels are created
  m=0 # m is the row index in the matrix of submodels and is stored in its first column
  Smodels = matrix(,ncol=4,nrow=s) # matrix of submodels is created (empty for now)
  colnames(Smodels)=c("Submodel number","Number of regressors+1","Cp","Submodel")
  # firstly, the trivial model (constant C1, p=1) is calculated
  # function var() calculates the sample variance (the divisor (n-1) is used)
  m=m+1; p=1 # row m=1 contains the trivial model "C"
  C1=var(y)*(n-1)/MSE-(n-2*p)-2*(k-p+1)/(n-k-3) # formula from the original article
  # string expression of submodel is in the 4th column (variable "TT")
  Smodels[m,1]=toString(m); Smodels[m,2]=toString(p)
  Smodels[m,3]=toString(C1); Smodels[m,4]=toString("C") # the end of the trivial model
  # loop through all levels of submodels: p = 1 : k
  # in the full model p = k+1, therefore p in the last submodel is k
  for (p in 2:(k+1-1)) { # p is size of submodel (number of regressors + 1)
    # level p is fixed and all combinations at the level are calculated
    # all levels except the trivial model; combinations of p-1 regressors are taken
    Comb_p = combn(seq(from=1,to=k,by=1),p-1) # e.g. for p=2 there is only one regressor
    # nested loop for fixed p
    # creating a textual representation of the submodel
    # and calculating the submodel results using lm()
    for (i in 1:ncol(Comb_p)){ # i is number of submodels in subset for level p
      subM=matrix(, ncol=p-1,nrow=ncol(Comb_p)) # matrix of regressors, not submodels
      for (j in 1:(p-1)) {subM[i,j]= sprintf("x%s", Comb_p[j,i])} # j indexes regressors
      # the string expression of the submodel TT is built in two alternatives: p>2 or p==2
      TT="y~"
      if(p>2){for (j in 1:(p-2)){TT=paste0(TT, subM[i,j],"+")}
        TT=paste0(TT,subM[i,p-1])}
      if(p==2) {TT=paste0(TT,subM[i,p-1])}
      sublm = lm(as.formula(TT)) # results calculated in submodel
      Cp=(sum(resid(sublm)^2)/MSE-n+2*p)-(2*(k-p+1)/(n-k-3)) # bias-corrected Cp (formula from the original article)
      # 4 values in 4 columns in matrix Smodels are written to m row
      # the four values are:
      # m, level p, Cp, textual representation of the submodel
      m=m+1
      Smodels[m,1]=toString(m); Smodels[m,2]=toString(p)
      Smodels[m,3]=toString(Cp); Smodels[m,4]=toString(TT)
    } # i is number of submodels in subset for level p
  }  # p is level (size) of submodel (number of regressors + 1)
  # selection of candidate submodel with the minimal value of Cp
  min=which.min(as.numeric(Smodels[,3]))
  # minimal Cp = Cq+1 (labeling according to the original article)
  Cq_plus_1 = min(as.numeric(Smodels[,3])) # p=3 in "y~x1+x2" and q=2 in the original example
  q=as.numeric(Smodels[min,2])-1 # number of regressors in model_min
  message(paste("Number of regressors in model_min ",q))
  model_minA=Smodels[min,4] ; Cq_plus_1A= Cq_plus_1 ; qA=q
  if(qA>0){ model_min_results= lm(as.formula(model_minA)) }
  if(qA==0){ model_minA ="Trivial model" ; model_min_results= NA ; FinalCp= Cq_plus_1A }
  message(paste("model_min: ", model_minA))
  message(paste("Cq+1:",Cq_plus_1A)) #

  # table "crit" contains critical values at the significance level 5%
 colnames(crit)=c("t=n-k-1 and r = k - q +1",1:30)
  # labeling "q+1=p" and "q" is used as in the original article
  # the number of regressors in specific submodel with the minimal value of Cp=Cq+1 is q and p=q+1
  # e.g. in the example in the original article model_min: "y~x1+x2" q=2
  # generally, the process may be finished at the level p=1
  # and the trivial model may be resulting model
  # model_minA is tested against the best immersed submodel
  # one regressor is removed and the best immersed submodel is used in the first cycle
  # if the null hypothesis is rejected then the best immersed submodel is determined as "model_min"
  # then the loop continue to the next step
  # if the null hypothesis is not rejected then final submodel is found...

  message(paste("The starting value of q for the test of submodel is:  ", qA))
  model_min=model_minA; NullHypothesis=TRUE
  if(qA>0){ for (level in 1:qA) { if(NullHypothesis) {
    # q is the number of regressors in the submodel; the number of possible reduced candidates is also q
    # t = n-k-1 and r = k-q+1 are the degrees of freedom in the test (for critical values)

    if(q == 1){ # if q=1 then the test of submodel with one regressor against the trivial model is calculated
      r=k-qA+1
      F=C1-Cq_plus_1+2*(n-k-2)/(n-k-3); Critical = crit[t-2,r+1]
      if(F<Critical) { message(paste0("For q = ", q,": Ho is rejected and the submodel with one regressor is rejected. Only trivial model (constant) is assumed. C1 = ", round(C1,5)))
        FinalCp=C1 ; q=0; final_model_results=NA ; NullHypothesis=FALSE }

      if(F>Critical) {
        message(paste0("For q = ", q ," : the end, Ho is not rejected and the resulting model is: ", model_min,
                       "+constant, F = ", round(F,5), " Cq+1 = ", round(Cq_plus_1,5) ))
        FinalCp= Cq_plus_1 ;final_model= paste(model_min," + constant") ; final_model_results=lm(as.formula(model_min))
        NullHypothesis=FALSE
      }
    }   # if(q == 1)

    if(q > 1){ # if q > 1 then matrix Qmodels of all reduced submodels is used
      r=k-q+1
      Xpositions=unlist(gregexpr('x', model_min))
      # candidate reduced submodels with q-1 regressors (one regressor removed from model_min)
      Qmodels=matrix(,ncol=4,nrow=q) # matrix of the q candidate submodels, one per removed regressor
      mm=1 # the first reduced model from model_min
      # the first regressor is excluded
      Qmodels[mm,1]=mm; Qmodels[mm,2]=q; # writing mm row to matrix
      Qmodels[mm,4]= paste0(substr(model_min,1,2),substr(model_min,Xpositions[mm+1],nchar(model_min)))
      qlm = lm(as.formula(Qmodels[mm,4]))
      Qmodels[mm,3]=(sum(resid(qlm)^2)/MSE-n+2*q)-(2*(k-q+1)/(n-k-3))

      if (q>2){
        for (i in 2:(q-1)){  # the following reduced models from model_min
          mm=mm+1
          Qmodels[mm,1]=mm; Qmodels[mm,2]=q;
          Qmodels[mm,4]= paste0(substr(model_min,1,Xpositions[mm]-1),
                                substr(model_min,Xpositions[mm+1],nchar(model_min)))
          qlm = lm(as.formula(Qmodels[mm,4]))
          Qmodels[mm,3]=(sum(resid(qlm)^2)/MSE-n+2*q)-(2*(k-q+1)/(n-k-3))
        }
      }
      mm=mm+1 # the last reduced model from model_min
      Qmodels[mm,1]=mm; Qmodels[mm,2]=q
      Qmodels[mm,4]= substr(model_min,1,Xpositions[mm]-2)
      qlm = lm(as.formula(Qmodels[mm,4]))
      Qmodels[mm,3]=(sum(resid(qlm)^2)/MSE-n+2*q)-(2*(k-q+1)/(n-k-3))

      min=which.min(as.numeric(Qmodels[,3])) # position of the minimal Cq in reduced submodel
      CCq=as.numeric(Qmodels[min,3]) # CCq is the minimal Cq in reduced models, it is tested
      ModelTest= Qmodels[min,4] # the best model with the lowest Cq
      F=CCq-Cq_plus_1+2*(n-k-2)/(n-k-3)
      # if Ho is rejected the best reduced submodel becomes model_min, q is decreased by one
      # and the process is repeated within the loop for (level in 1:qA){}
      r=k-q+1; Critical = crit[t-2,r+1]
      if(F<Critical) {message(paste0("For q = ", q,": Ho is rejected, new submodel goes to the next test."))
        model_min= ModelTest;q=q-1 ; Cq_plus_1= CCq
        final_model= paste(model_min," + constant"); FinalCp= Cq_plus_1
      }
      if(F>Critical) { message(paste0("For q = ", q ,": the End, Ho is not rejected, the resulting model is: ",
                                    model_min,"+const.  F= ", round(F,5), " Cq+1 = ", round(Cq_plus_1,5) ))
        final_model= paste(model_min," + constant"); FinalCp= Cq_plus_1 ; NullHypothesis=FALSE
      }
    }  # the end of if(q>1) # if q > 1 then matrix Qmodels of all reduced submodels is used

  } } } # the end of if(NullHypothesis), the loop for (level in 1:qA) and if(qA>0)
  # all tests were done
  if(q == 0){r=k-q+1; message(paste0("For q = ",
                                     q,": the end, only constant is assumed with no regressors. C1 = ", round(C1,5) ))
  final_model="Trivial model"
  message("Trivial model is not tested.")
  # The final values of testing process
  return( list(y=y,X=X,n=n, k=k, t=t, full_model=Model, full_model_results = full_model_results, MSE = MSE,
               submodels_number=s, submodels=Smodels, model_min= model_minA, model_min_results = model_min_results ,
               p=1, Cq_plus_1A= Cq_plus_1A, FinalCp= FinalCp,
               final_model= "Trivial model", final_model_results= NA)  )

  } # trivial model is not tested (no submodel exists)
  message(paste("Final value of q is : ",q))
  if(q > 0){
    final_model= paste(model_min," + constant")
    final_model_results=lm(as.formula(model_min))

    return( list(y=y,X=X,n=n, k=k, t=t, full_model=Model, full_model_results = full_model_results, MSE = MSE,
                 submodels_number=s, submodels=Smodels, model_min= model_minA, model_min_results = model_min_results ,
                 p=qA, Cq_plus_1A= Cq_plus_1A, FinalCp= FinalCp,
                 final_model= final_model, final_model_results= final_model_results)  )
  }
} # the end of final_model=function(d)
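
# Illustrative sketch of the whole testing loop on the EU2019 data documented
# above: the Country column is moved into the row names exactly as the data
# documentation suggests, so the first remaining column (LifExp) is treated as
# the response and the other columns as regressors. Wrapped in interactive()
# so it is not evaluated while the package is built or loaded.
if (interactive()) {
  eu <- EU2019
  rownames(eu) <- eu[, 1]   # keep country names as row labels
  eu <- eu[, -1]            # drop the Country column
  fm <- final_model(eu)
  fm$final_model            # textual form of the selected final model
  fm$FinalCp                # its bias-corrected Cp
  fm$final_model_results    # lm() fit (NA when the trivial model is final)
}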
