R/model_selection.R

Defines functions CorrectAicBic TestCorrectAicBic GlmFitMatr TestGlmFitMatr LmFitQR TestLmFitQR predict.gmod Testpredict.gmod GlmCv ModSelSubset TestModSelSubset ModSel_subsetFormula TestModSel_subsetFormula ModSel_mergeGroup TestModSel_mergeGroup ModSel_mergeGroupFormula TestModSel_mergeGroupFormula CvBmp GlmPredErr TestGlmPredErr Subset_explanList Subset_explanListFormula TestSubset_explanListFormula ModSelSeqMerging TestModSelSeqMerging ModSel_seqMergingFormula TestModSel_seqMergingFormula ModSelSeqMerging_fixedGroups TestModSelSeqMerging_fixedGroups ModSelSeqMerging_fixedGroupsFormula StepwiseGlm

#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# authors: Reza Hosseini, Alireza Hosseini

### This source file contains functions that: fit linear models;
### calculate cross validation error; perform model selection ###
## need the source to calculate correlaltion with missing data


#### correct AIC and BIC of a glm model in R
## n sample size
## p is the number of covariates
CorrectAicBic = function(mod=NULL, p=NULL, n=NULL) {
    
  if (is.null(mod)) {
    aic = Inf 
    bic = Inf 
    logLik = -Inf
  }
  
  if (!is.null(mod)) {
    aic = AIC(mod)
    bic = BIC(mod)
    pNA = sum(is.na(coef(mod)))
    pModel = length(coef(mod)) - pNA

    logLik = (2*pModel - aic) / 2
    if (is.null(p)) {
      p = length(coef(mod))
    } 

    if (is.null(n)) {
      n = length(mod$fitted)      
    }
    aic = -2*logLik + 2*p
    bic = -2*logLik + p*(log(n))
  }

  return(list('AIC'=aic, 'BIC'=bic, 'logLik'=logLik))
}

## warning BIC calculation in R does not match!!
TestCorrectAicBic = function() {
    
  n = 100
  x1 = 1:n
  x2 = 1:n
  y = 2*x1 + rnorm(n)
  mod = glm(y ~ x1)
  AIC(mod) 
  BIC(mod)
  CorrectAicBic(mod, p=2, n=n)

  mod = glm(y ~ x1 + x2)
  AIC(mod)
  BIC(mod)
  CorrectAicBic(mod, p=3, n=n)

  mod = glm(y ~ x1 + x2)
  AIC(mod)
  BIC(mod)
  CorrectAicBic(mod)

  mod = glm(y ~ x1)
  AIC(mod)
  BIC(mod)
}

################### fitting the model with glm and in a way that is suitable for cross validation with categorical
# automatic fitting of the linear model with continuous response start
################### fitting the model with glm and in a way that is suitable for cross validation with categorical
# automatic fitting of the linear model with continuous response start
GlmFitMatr = function(y, explan=NULL, family='gaussian', ...) {
    
  glmMod = NULL
  
  if (is.null(explan)) {
    pNum = 0
    string = paste('glmMod = glm(y ~ 1, family=', family, ')', sep='')
    eval(parse(text=string))
  }

  if (!is.null(explan)) {
    n = length(y)
    explan = as.data.frame(explan)
    pNum = dim(explan)[2]
    for (j in (1:pNum)) {
      str = paste("V", j+1, " = explan[ ,", j, "]", sep="")
      eval(parse(text=str))
    }
    string = "(y ~ V2"
    if (pNum > 1) {
      for (i in 2:(pNum)) {
        k = i + 1
        smallString = paste("+", "V", k, sep="")
        string = paste(string, smallString)
      }
    }
    glmString = paste('glm', string, ',family=', family, ')', sep='')
    tryCatch(
        expr = {
            glmMod = eval(parse(text=glmString))},
        error = function(e){},
        finally = {})
  }

  aic_bic = CorrectAicBic(glmMod)
  aic = aic_bic[['AIC']]
  bic = aic_bic[['AIC']]
  logLik =  aic_bic[['logLik']]
  
  coeff = NA
  fitted = NA
  
  if (!is.null(glmMod)) {
    coeff = glmMod$coeff
    fitted = glmMod$fitted
  }

  out = list(glmMod=glmMod, logLik=logLik, BIC=bic, AIC=aic, coeff=coeff)
  class(out) = 'gmod'
  return(out)
}


TestGlmFitMatr = function() {
  N = 80
  num = 9
  explan = matrix(rnorm(1:(num*N)), N, num)
  y = rnorm(1:N) + 5*explan[ , 1] + 5*explan[ , 2] + 5*explan[ , 3]
  res = GlmFitMatr(y, explan)

  n = 1000
  x = rep(1, n)
  y = (1:n)
  x1 = rep(2, n)
  explan = cbind(x, x1, x, x, x, x, x, x, x, x, x)
  GlmFitMatr(y, explan, family='poisson')

  x = rnorm(10)
  y = (1:10)
  x1 = rnorm(10)
  explan = cbind(x, x1)
  GlmFitMatr(y, explan, family='poisson')
}

# Example y = rnorm(100); GlmFitMatr(y)
################# Quantile regression fitting and prediction in R

LmFitQR = function(y, explan, quant) {
    
  library('quantreg')
  n = length(y)
  explan = data.frame(explan)
  pNum = dim(explan)[2]
  for (j in (1:pNum)) {
    str = paste("V", j+1, " = explan[ , ", j, "]",sep="")
    eval(parse(text=str))
  }
  
  string = "(y~V2"
  if (pNum > 1) {
    for (i in 2:(pNum)) {
      k = i+1
      smallString = paste("+", "V", k, sep="")
      string = paste(string, smallString)
    }
  }
  
  string = paste(string, ",", as.character(quant), ")")
  rqString = paste("rq", string)
  gmod = eval(parse(text=rqString))
  coeff = gmod$coeff
  fitted = gmod$fitted
  out = list(gmod=gmod, coeff=coeff)
  return(out)
}

## Examples
TestLmFitQR = function() {
  n = 80
  num = 9
  explan = matrix(rnorm(1:(num*n)), n, num)
  y = rnorm(1:n) + 5*explan[ , 1] + 5*explan[ , 2] + 5*explan[ , 3]
  LmFitQR(y, explan, 0.5)
}

### a function that turns an explanatory matrix to V2,V3,... and create the prediction string
## need to provide data.frame as input
predict.gmod = function(mod, explan, ...) {
    
  glmMod = mod[['glmMod']]
  response = "response"
  pNum = dim(explan)[2]
  predStr = "out = predict(glmMod, newdata=data.frame(V2=explan[ , 1]"
  
  if (pNum > 1) {
    for (j in (2:pNum)) {
      predStr = paste(predStr, ",V", j+1, "=explan[ , ", j, "]",sep="")
    }
  }
  predStr = paste(predStr, "), type=response)")
  eval(parse(text=predStr))
  return(out)
}

Testpredict.gmod = function() {

  n = 30
  num = 2
  explan = matrix(rnorm(1:(num*n)), n, num)
  y = 6*rnorm(1:n) + 3*explan[ , 1]
  gmod = GlmFitMatr(y, explan)

  prediction = predict.gmod(gmod, explan)
  plot(explan[ , 1], prediction)
  points(explan[ , 1], explan[ , 1], col=2)

  y = rnorm(100)
  gmod = glm(y ~ 1)
  prediction = pred(explan=matrix(NA, 5, 1), gmod)
  plot(explan[ , 1], prediction)
  points(explan[ , 1], explan[ , 1], col=2)

}

###################### cross validation function that can also handle categorical variables#############
###out of date version. fixed the big object issue next, i.e. ommitted fitted_MAT

############## preventing memory breakdown  CV #########################
## modified to handle quantile regression as well
GlmCv = function(
    y,
    explan=NULL,
    putOutNum=1,
    model='lm',
    tau=0.5,
    family='gaussian') {
    
  if (is.null(explan)) {
    model = 'lm'
  }
  
  n = length(y)
  m = putOutNum
  no_explan = FALSE
  if (is.null(explan)) {
    no_explan = TRUE
    explan = matrix(1, length(y), 1)
  }
  
  explan = data.frame(explan)
  pNum = dim(explan)[2]
  subs = combn(1:n, m, fun=NULL, simplify=TRUE)
  N = dim(subs)[2]
  fittedMat = matrix(NA, N, m)
  
  for(i in 1:N) {
    ind = subs[ , i]
    valid = y[ind]
    training = y[-ind]
    explan2 = explan[-ind, ]
    if (no_explan) {explan2 = NULL}
    if (model == "lm") {
      gmod = GlmFitMatr(y=training, explan=explan2, family=family)$gmod
    }
    if (model == "qr") {
      gmod = LmFitQR(y=training, explan=explan2, quant=tau)$gmod
    } 

    explan3 = explan[ind, ]
    explan3 = data.frame(explan3)
    fitted = glmPred(explan=explan3, gmod)
    for (j in 1:m) {
      fittedMat[i, j] = fitted[j]
    }
  }
  
  fittedVec = rep(NA, n)
  for (k in 1:n) {
    tempFit = NULL
    for (l in 1:N) {
      if (sum(subs[ , l] == k) > 0) {
        k2 = which(subs[ , l] == k)
        tempFit[l]=fittedMat[l, k2]}
    }
    fittedVec[k] = mean(tempFit, na.rm=TRUE)
  }
  cc = (abs(y - fittedVec))
  cv1 = mean(na.omit(cc))
  cc = (y - fittedVec)^2
  cc = na.omit(cc)
  len = length(cc)
  cv = sqrt((1/len)*sum(cc))
  cc = (y - fittedVec)^4
  cc = na.omit(cc)
  len = length(cc)
  cv4 = sqrt(sqrt((1/len)*sum(cc)))
  cvr = cor2(fittedVec,y)
  out = list(
    cve1=cv1,
    cve=cv,
    cve4=cv4,
    cvr=cvr,
    predCv=fittedVec,
    size=len)
  return(out)
}

### automatic model selection procedure
### model selection using AIC, BIC, cve, find the optimal model of given number of parameters
## We can easily add cve1,cve4
### picking m top models
##################### Selecting models with covariates that are fixed
##################### We can also specify the number of accepted predictors by varNum
## explanF =  fixed predictors
## explan = predictor set from which we take a subset
## explanF could be NULL
ModSelSubset = function(
    y=NULL,
    explanF=NULL,
    explan=NULL,
    m=1,
    crit='AIC',
    family='gaussian',
    varNum=NULL,
    critSummaryDelta=0.1) {
    
  d = dim(explan)[2]
  varNum2 = d
  if (!is.null(varNum)) {
    if (varNum > d) {
      print('Warning: varNum > data dimension so it was reset to dim[2].')
    }
    varNum2 = min(d, varNum)
  }
  varNum = varNum2
  
  MsFcn = function(x) {
    if (is.null(x)) {
      explan1 = explanF
    } else {
      explan1 = explan[ , x, drop=FALSE]
      if (!is.null(explanF)) {
        explan1 = cbind(explanF, explan1)
      }   
    }
    

    res = GlmFitMatr(y=y, explan=explan1, family=family)
    mod = res[['glmMod']]
    aic_bic = CorrectAicBic(mod=mod)
    aic = aic_bic[['AIC']]
    bic = aic_bic[['BIC']]
    
    if (crit == "AIC") {
      out = c(aic, bic, x, rep(NA, (varNum - length(x))))
    }

    if (crit == "BIC") {
      out = c(bic, aic, x, rep(NA, (varNum - length(x))))
    }
    if (crit == "cve") {
      res = GlmCv(y, explan=explan1, putOutNum=1)
      out = c(res$cve, res$cvr, x, rep(NA, (varNum - length(x))))
    }
    return(out)
  }

  nullModRes = MsFcn(x=NULL)
  
  ## subSets of up to order varNum
  subSets = lapply(1:varNum, function(x) as.list(data.frame(combn(d, x))))
  subSets = unlist(subSets, recursive=FALSE)
  
  resList = lapply(subSets, MsFcn)
  resDf = data.frame()
  n = length(resList)
  for (i in 1:n) {resDf = rbind(resDf, resList[[i]])}
  
  colnames(resDf) = NULL
  resDf = rbind(resDf, nullModRes)
  
  if (crit == 'AIC') {
    colnames(resDf)[1:2] = c('AIC', 'BIC')
  }
  if (crit == 'BIC') {
    colnames(resDf)[1:2] = c('BIC', 'AIC')
  }
  if (crit == 'cve') {
    colnames(resDf)[1:2] = c('cve', 'cvr')
  }
  
  ord = order(resDf[ , 1])
  resDf = resDf[ord, ]
  topModMatrix = resDf[1:m, ]
  critVec = resDf[ , crit]
  critVec = na.omit(critVec)
  critSummary = quantile(critVec, seq(0, 1, critSummaryDelta))
  outList = list(topModMatrix=topModMatrix, critSummary=critSummary)
  return(outList)
}

TestModSelSubset = function() {

    n = 100
    num = 5
    fixedVarNum = 2
    explan = matrix(rnorm(1:(num*n)), n, num)
    explan = cbind(explan,explan[ , 5])
    explanF = matrix(rnorm(1:(num*n)), n, fixedVarNum);
    mu = explanF[ , 1] + 5*explan[ , 1] + 5*explan[ , 4] + 5*explan[ , 5]
    lambda = exp(mu)
    y = rpois(n, lambda)
    ModSelSubset(
        y,
        explanF=explanF,
        explan, 
        m=3,
        crit="AIC",
        family='poisson')

    ModSelSubset(
        y,
        explanF=NULL,
        explan=explan, 
        m=3,
        crit="AIC",
        family='poisson',
        varNum=2)
    
    ModSelSubset(
        y,
        explanF,
        explan=explan, 
        m=3,
        crit="BIC",
        family='poisson',
        varNum=4)
    
    ModSelSubset(
        y,
        explanF=NULL,
        explan=explan, 
        m=3,
        crit="BIC",
        family='poisson',
        varNum=4)
    
    ModSelSubset(
        y,
        explanF=NULL,
        explan=explan, 
        m=3,
        crit="BIC",
        family='poisson',
        varNum=5)
    ModSelSubset(
        y,
        explanF=NULL,
        explan=explan, 
        m=3,
        crit="BIC",
        family='poisson',
        varNum=7)
    
    n = 100
    num = 5
    fixedVarNum = 2
    explan = matrix(rnorm(1:(num*n)), n, num)
    explan = cbind(explan,explan[ , 5])
    explanF = matrix(rnorm(1:(num*n)), n, fixedVarNum);
    mu = explanF[ , 1] + 5*explan[ , 1] + 5*explan[ , 4] + 5*explan[ , 5]
    lambda = exp(mu)
    y = rpois(n, lambda)
    ModSelSubset(y, explanF, explan, m=3, crit="AIC", family='poisson')

    N = 100
    num = 5
    fixedVarNum = 2
    explan = matrix(rnorm(1:(num*N)), N, num)
    explanF = matrix(rnorm(1:(num*N)), N, fixedVarNum)
    y = rnorm(1:N) + 5*explan[ , 1] + 5*explan[ , 4] + 5*explan[ , 5]
    ModSelSubset(y, explanF, explan, m=3, crit="cve")

    N = 50
    num = 5
    fixedVarNum = 2
    explan = matrix(rnorm(1:(num*N)), N, num)
    explanF = NULL
    y = rnorm(1:N)
    ModSelSubset(y, explanF, explan, m=3, crit="BIC")

    N = 100
    num = 5
    fixedVarNum = 2
    explan = matrix(rnorm(1:(num*N)), N, num)
    explanF = matrix(rnorm(1:(num*N)), N, fixedVarNum)
    y = rnorm(1:N)
    ModSelSubset(y, explanF, explan, m=3, crit="BIC")
}

### model selection via GLM or other implemented models
# default is the regular GLM
## explan and explanF contain column names this time, or interaction terms
ModSel_subsetFormula = function(
    df,
    yCol,
    explanColsF=NULL,
    explanCols=NULL,
    m=1,
    crit='AIC',
    family='gaussian',
    ModelFcn=glm,
    varNum=NULL,
    critSummaryDelta=0.1) {
    
  d = length(explanCols)
  varNum2 = d
  if (!is.null(varNum)) {
    if (varNum > d) {
      print('Warning: varNum > data dimension so it was reset to dim[2].')
    }
    varNum2 = min(d, varNum)
  }
  varNum = varNum2
  
  MsFcn = function(x) {
    if (is.null(x)) {
        explanCols1 = explanColsF
    } else {
      explanCols1 = explanCols[x]
      if (!is.null(explanColsF)) {
          explanCols1 = c(explanColsF, explanCols1)
      }   
    }
    
    explanFormula = PlusVec(explanCols1)
    if (explanFormula == '') {
      explanFormula = '1'
    }
    formulaString = paste(yCol, ' ~ ', explanFormula, sep='')        
    mod = ModelFcn(
        formula=as.formula(formulaString), data=df, family=family)
    
    aic_bic = CorrectAicBic(mod=mod)
    aic = aic_bic[['AIC']]
    bic = aic_bic[['BIC']]
    # AIC = AIC(mod)
    # BIC = BIC(mod)
    if (crit == "AIC") {
      out = c(aic, bic, explanCols[x], rep(NA, (varNum - length(x))))
    }
    if (crit == "BIC") {
      out = c(bic, aic, explanCols[x], rep(NA, (varNum - length(x))))
    }
    if (crit == "cve") {
      res = GlmCv(y, explan=explan1, putOutNum=1)
      out = c(res$cve, res$cvr, explanCols, rep(NA, (varNum - length(x))))
    }
    return(out)
  }

  nullModRes = MsFcn(x=NULL)
  
  ## subSets of up to order varNum
  subSets = lapply(
    1:varNum,
    function(x) as.list(data.frame(combn(d, x))))
  subSets = unlist(subSets, recursive=FALSE)   
  resList = lapply(subSets, MsFcn)
  resDf = matrix(NA, length(resList), varNum + 2)
  resDf = rbind(resDf, nullModRes)
  n = length(resList)
  for (i in 1:n) {
    resDf[i, ] = resList[[i]]
  }
  
  colnames(resDf) = NULL
  resDf = data.frame(resDf)
  resDf[ , 1] = as.numeric(as.character(resDf[ ,1]))
  resDf[ , 2] = as.numeric(as.character(resDf[ ,2]))
  
  if (crit == 'AIC') {
    colnames(resDf)[1:2] = c('AIC', 'BIC')
  }
  if (crit == 'BIC') {
    colnames(resDf)[1:2] = c('BIC', 'AIC')
  }
  if (crit == 'cve') {
    colnames(resDf)[1:2] = c('cve', 'cvr')
  }
  
  ord = order(resDf[ , 1])
  resDf = resDf[ord, ]
  topModMatrix = resDf[1:m, ]
  critVec = resDf[ , crit]
  critVec = na.omit(critVec)
  critSummary = quantile(critVec, seq(0, 1, critSummaryDelta))
  outList = list(topModMatrix=topModMatrix, critSummary=critSummary)
  return(outList)
}

TestModSel_subsetFormula = function() {
    
  n = 1000
  num = 5
  fixedVarNum = 2
  explan = matrix(rnorm(1:(num*n)), n, num)
  explan = cbind(explan, explan[ , 5])
  explan = data.frame(explan)
  explanF = matrix(rnorm(1:(num*n)), n, fixedVarNum)
  explanF = data.frame(explanF)
  names(explanF) = paste('F', 1:fixedVarNum, sep='')
  mu = explanF[ , 1] + 5*(explan[ , 1]*explan[ , 1]) + 
      5*explan[ , 4]*explan[ , 3] + 5*explan[ , 5]
  
  lambda = (1.1)^(mu)
  y = rpois(n, lambda)
  explanColsF = names(explanF) 
  explanCols = names(explan)
  explanCols = c(explanCols, 'X1*X1', 'X1*X2', 'X3*X4', 'X2*X3')
  df = cbind(explanF, explan)
  df = cbind(df, y)
  res = ModSel_subsetFormula(
      df=df,
      yCol='y',
      explanColsF=explanColsF, 
      explanCols=explanCols,
      m=3,
      crit="BIC",
      family='poisson')

  res = ModSel_subsetFormula(
      df=df,
      yCol='y',
      explanColsF=NULL, 
      explanCols=explanCols,
      m=3,
      crit="BIC",
      family='poisson')

  ## y is independent of x's
  n = 500
  num = 5
  fixedVarNum = 2
  explan = matrix(rnorm(1:(num*n)), n, num)
  explan = cbind(explan, explan[ , 5])
  explan = data.frame(explan)
  explanF = matrix(rnorm(1:(num*n)), n, fixedVarNum)
  explanF = data.frame(explanF)
  names(explanF) = paste('F', 1:fixedVarNum, sep='')
  mu = rnorm(n)
  lambda = exp(mu)
  y = rpois(n, lambda)
  explanColsF = names(explanF) 
  explanCols = names(explan)
  df = cbind(explanF, explan)
  df = cbind(df, y)
  res = ModSel_subsetFormula(
      df=df,
      yCol='y',
      explanColsF=NULL, 
      explanCols=explanCols,
      m=3,
      crit="BIC",
      family='poisson')
}

##### sequential grouped step-wise model selection with various families
ModSel_mergeGroup = function(
    y,
    explanF=NULL,
    explanList, 
    groupIndex=NULL,
    crit,
    family='gaussian') {
    
  l = length(explanList)
  ## if no group identifier is passed we build a simple one below
  if (is.null(groupIndex)) {
    groupIndex = 1:l
  }
  explanList = explanList[groupIndex]
  l = length(explanList)

  explan1 = NULL
  ## this saves the index within an explan family
  elementVec = NULL
  ## this saves which of the groups that predictor belongs to.
  groupVec = NULL
  for (i in 1:l) {
    explan = explanList[[i]]
    if (!is.null(explan1)) {
      explan1 = cbind(explan1, explan)
    } else {
      explan1 = explan
    }
    ## number of parameters of the i-th explan 
    p = dim(explan)[2]
    elementVec = c(elementVec, 1:p)
    ## assigning the group index to keep track of the group 
    j = groupIndex[i]
    groupVec = c(groupVec, rep(j, p))
  }
  
  res = ModSelSubset(
      y,
      explanF=explanF,
      explan=explan1, 
      m=1,
      crit=crit,
      family=family)
  topModMatrix = res[['topModMatrix']]
  critSummary = res[['critSummary']]
  topMod = as.numeric(topModMatrix)
  crits = topMod[1:2]
  optInd = topMod[3:length(topMod)]
  optInd = na.omit(optInd)
  groupInd = groupVec[optInd]
  elementInd = elementVec[optInd]
  explanInd = data.frame(group=groupInd, element=elementInd) 
  outList = list(groupElemInd=explanInd, critSummary=critSummary)
  return(outList)
}

TestModSel_mergeGroup = function() {
    
  n = 1000
  num1 = 4
  num2 = 2
  num3 = 3
  explan1 = matrix(rnorm(1:(num1*n)), n, num1)
  explan2 = matrix(rnorm(1:(num2*n)), n, num2)
  explan3 = matrix(rnorm(1:(num3*n)), n, num3)
  explanF = matrix(rnorm(1:(num3*n)), n, num3)

  explanList = list(explan1, explan2, explan3)
  mu = explan1[ , 1] + 5*explan3[ , 1] + 5*explan3[ , 3]
  lambda = 1.2^(mu) + 0.1
  lambda = MinComp(lambda, 10000)
  y = rpois(n, lambda)
  
  ModSel_mergeGroup(
      y,
      explanF=explanF,
      explanList=explanList, 
      crit="BIC",
      family='poisson')

  #####
  ModSel_mergeGroup(
      y,
      explanF=NULL,
      explanList,
      groupIndex=c(1, 3),
      crit='BIC', 
      family='poisson')

}

ModSel_mergeGroupFormula = function(
    df,
    yCol,
    explanColsF=NULL,
    explanColsList, 
    groupIndex=NULL,
    crit,
    family='gaussian') {
  
  l = length(explanColsList)

  ## if no group identifier is passed we build a simple one below
  if (is.null(groupIndex)) {
      groupIndex = 1:l
  }
  explanColsList = explanColsList[groupIndex]
  l = length(explanColsList)

  explanCols1 = NULL
  ## this saves the index within an explan family
  elementVec = NULL
  ## this saves which of the groups that predictor belongs to.
  groupVec = NULL

  for (i in 1:l) {
    explanCols = explanColsList[[i]]
    if (!is.null(explan1)) {
      explanCols1 = c(explanCols1, explanCols)
    } else {
      explanCols1 = explanCols
    }
    ## number of parameters of the i-th explan 
    p = length(explanCols)
    elementVec = c(elementVec, 1:p)
    ## assigning the group index to keep track of the group 
    j = groupIndex[i]
    groupVec = c(groupVec, rep(j, p))
  }
  
  res = ModSel_subsetFormula(df=df, yCol=yCol, explanColsF=explanColsF, 
    explanCols=explanCols1, m=1, crit=crit, family=family)

  topModMatrix = res[['topModMatrix']]
  topMod = topModMatrix[1, ]
  names(topMod) = NULL
  critSummary = res[['critSummary']]
  crits = topMod[1:2]
  optInd = topMod[3:length(topMod)]
  optInd = as.character(unlist(optInd))
  optInd = na.omit(optInd)
  optInd = which(explanCols1 %in% optInd)
  
  groupInd = groupVec[optInd]
  elementInd = elementVec[optInd]
  explanInd = data.frame(group=groupInd, element=elementInd) 
  outList = list(groupElemInd=explanInd, critSummary=critSummary)
  return(outList)

}

TestModSel_mergeGroupFormula = function() {
    
  n = 500
  num1 = 4
  num2 = 2
  num3 = 3
  num4 = 3
  explan1 = matrix(rnorm(1:(num1*n)), n, num1)
  explan2 = matrix(rnorm(1:(num2*n)), n, num2)
  explan3 = matrix(rnorm(1:(num3*n)), n, num3)
  explanF = matrix(rnorm(1:(num3*n)), n, num4)
  
  explan1 = data.frame(explan1)
  explan2 = data.frame(explan2)
  explan3 = data.frame(explan3)
  explanF = data.frame(explanF)

  names(explan1) = paste('A', 1:num1, sep='')
  names(explan2) = paste('B', 1:num2, sep='')
  names(explan3) = paste('C', 1:num3, sep='')
  names(explanF) = paste('F', 1:num4, sep='')

  explanColsList = list(names(explan1), names(explan2), names(explan3))

  mu = explan1[ , 1] + 5*explan3[ , 1] + 5*explan3[ , 3]
  lambda = 2^(mu) + 0.1
  lambda = MinComp(lambda, 10000)
  y = rpois(n, lambda)
  df = cbind(explan1, explan2, explan3, explanF, y)

  ModSel_mergeGroupFormula(
    df,
    yCol=yCol,
    explanColsF=explanF, 
    explanColsList=explanColsList,
    crit="BIC",
    family='poisson')

  ModSel_mergeGroupFormula(
    df,
    yCol=yCol,
    explanColsF=NULL, 
    explanColsList=explanColsList,
    crit="BIC",
    family='poisson')

  ModSel_mergeGroupFormula(
    df,
    yCol,
    explanColsF=NULL, 
    explanColsList=explanColsList,
    crit="BIC",
    family='poisson')

  ModSel_mergeGroupFormula(
    df,
    yCol,
    explanColsF=NULL, 
    explanColsList=explanColsList,
    groupIndex=c(1, 3),
    crit="BIC",
    family='poisson')
}

####### Broke mans prediction
CvBmp = function(y) {
    
  y = na.omit(y)
  n = length(y)
  fittedVec = rep(NA, n)
  for (i in 1:n) {
    fittedVec[i] = mean(y[-i])
  }
  cc = (abs(y - fittedVec))
  cv1 = mean(na.omit(cc))
  cc = (y - fittedVec)^2
  cc = na.omit(cc)
  len = length(cc)
  cv = sqrt((1/len)*sum(cc))
  cc = (y - fittedVec)^4
  cc = na.omit(cc)
  len = length(cc)
  cv4 = sqrt(sqrt((1/len)*sum(cc)))
  cvr = cor2(fittedVec, y)
  out = list(cve1=cv1, cve=cv, cve4=cv4, cvr=cvr, size=len)
  return(out)
}

########### True prediction error
## here we provide a very large validation set that can be used to find a true prediction error for a model
GlmPredErr = function(y, explan, vars, validation, family='gaussian') {
    
  y1 = validation[ , 1]
  explan1 = validation[ , -1]
  explan1 = explan1[ , vars]
  expl = explan[ , vars]
  gmod = GlmFitMatr(y, expl, family=family)$gmod
  fitted = glmPred(explan1, gmod)
  tcv1 = (1/length(y1))*(sum(abs(y1 - fitted)))
  tcv = (1/length(y1))*sqrt(sum((y1 - fitted)^2))
  tcv4 = (1/length(y1))*sqrt(sqrt(sum((y1 - fitted)^4)))
  tcvr2 = corNA(fitted, y)
  cvout = GlmCv(y, expl, 1)
  out1 = c(cvout$cve1, cvout$cve, cvout$cve4, cvout$cvr2)
  out2 = c(tcv1, tcv, tcv4, tcvr2)
  out = list(cvErr=out1, trueErr=out2)
  return(OUT)
}

TestGlmPredErr = function() {
    
  n = 100
  num = 5
  explan = matrix(rnorm(1:(num*n)), n, num)
  y = rnorm(1:N) + 15*explan[ , 1] + 25*explan[ , 2] + 35*explan[ , 3]

  n = 200
  explan = matrix(rnorm(1:(num*n)), n, num)
  y = rnorm(1:n) + 15*explan[ , 1] + 25*explan[ , 2] + 35*explan[ , 3]
  validation = cbind(y, explan)
  vars = 1:3 
  GlmPredErr(y, explan, vars, validation)
  vars = 4:5
  GlmPredErr(y, explan, vars, validation)
  vars = 1:2
  GlmPredErr(y, explan, vars, validation)

  putOutNum = 2
  n = 100
  num = 5
  explan = matrix(rnorm(1:(num*n)), n, num)
  y = rnorm(1:n) + 15*explan[ , 1] + 25*explan[ , 2] + 35*explan[ , 3]
  
  n = 200
  explan = matrix(rnorm(1:(num*n)), n, num)
  y = rnorm(1:n) + 15*explan[ , 1] + 25*explan[ , 2] + 35*explan[ , 3]
  validation = cbind(y, explan)
  vars = 1:3
  GlmPredErr2(y, explan, vars, validation, 2)
  vars = 4:5
  GlmPredErr2(y, explan, vars, validation, 2)
  vars = 1:2
  GlmPredErr2(y, explan, vars, validation, 2)
}

#### Model selection sequential functions
## subsetting predictor groups according to given index pairs
## group index and element index
Subset_explanList = function(explanList, groupElemInd) {
    
  pickedGroups = unique(groupElemInd[ , 1])
  l = length(pickedGroups)
  outExplan = NULL
  for (i in 1:l) {
    gr = pickedGroups[i]
    ind = which(groupElemInd[ , 1] == gr)
    elements = groupElemInd[ind, 2]
    explan = explanList[[gr]]
    explan1 = explan[ , elements] 
    if (i == 1) {
      outExplan = explan1
    } else {
      outExplan = cbind(outExplan, explan1)
    }
  }
  return(outExplan)

}

Subset_explanListFormula = function(explanColsList, groupElemInd) {
    
  pickedGroups = unique(groupElemInd[ , 1])
  l = length(pickedGroups)
  outExplan = NULL
  for (i in 1:l) {
    gr = pickedGroups[i]
    ind = which(groupElemInd[ , 1] == gr)
    elements = groupElemInd[ind, 2]
    explanCols = explanColsList[[gr]]
    explanCols1 = explanCols[elements] 
    if (i == 1) {
      outExplanCols = explanCols1
    } else {
      outExplanCols = c(outExplanCols, explanCols1)
    }
  }
  return(outExplanCols)
}

TestSubset_explanListFormula = function() {
    
  names(explan1) = paste('A', 1:num1, sep='')
  names(explan2) = paste('B', 1:num2, sep='')
  names(explan3) = paste('C', 1:num3, sep='')
  names(explanF) = paste('F', 1:num4, sep='')

  explanColsList = list(names(explan1), names(explan2), names(explan3))

  groupElemInd = matrix(NA, 5, 2)
  groupElemInd[ , 1] = c(1, 1, 2, 2, 3)
  groupElemInd[ , 2] = c(1, 3, 1, 2, 2) 

  Subset_explanListFormula(explanColsList, groupElemInd=groupElemInd) 

}

### this function will 
### merge sequentially group of groups given in groupsIndList 
ModSelSeqMerging = function(
    y,
    explanF=NULL,
    explanList=NULL,
    groupsIndList=NULL, 
    crit='AIC',
    family='gaussian') {
    
  ## empty index set
  groupElemInd = NULL
  rowNum = length(groupsIndList)
  critSummaryList = list()
  for (i in 1:rowNum) {
    groupIndex = groupsIndList[[i]]
    res = ModSel_mergeGroup(
        y=y,
        explanF=explanF,
        explanList=explanList,
        groupIndex=groupIndex,
        crit=crit,
        family=family)
    
    groupElemInd1 = res[['groupElemInd']]
    critSummaryList[[i]] = res[['critSummary']]
    #print(groupElemInd1)
    
    ## reset explanF
    if (dim(groupElemInd1)[1] > 0) {
      explanF1 = Subset_explanList(explanList, groupElemInd1)
      if (is.null(explanF)) {
        explanF=explanF1
      } else {
        explanF = cbind(explanF, explanF1)
      }
      groupElemInd = rbind(groupElemInd, groupElemInd1)
    }
  }
  outList = list(
    groupElemInd=groupElemInd,
    critSummaryList=critSummaryList)
  return(outList)
}

TestModSelSeqMerging = function() {  
  n = 1000
  num1 = 4
  num2 = 2
  num3 = 3
  num4 = 2
  explan1 = matrix(rnorm(1:(num1*n)), n, num1)
  explan2 = matrix(rnorm(1:(num2*n)), n, num2)
  explan3 = matrix(rnorm(1:(num3*n)), n, num3)
  explanF = matrix(rnorm(1:(num3*n)), n, num4)

  explanList = list(explan1, explan2, explan3)
  mu = explan1[ , 1] + 5*explan3[ , 1] + 5*explan3[ , 3]
  lambda = 1.2^(mu) + 0.1
  lambda = MinComp(lambda, 10000)
  y = rpois(n, lambda)

  groupsIndList = list(c(1), c(2,3))
  
  ModSelSeqMerging(
    y=y,
    explanF=explanF,
    explanList=explanList, 
    groupsIndList=groupsIndList,
    crit='BIC',
    family='poisson')


  ModSelSeqMerging(
    y=y,
    explanF=NULL,
    explanList=explanList, 
    groupsIndList=groupsIndList,
    crit='BIC',
    family='poisson')

}

ModSel_seqMergingFormula = function(
    df,
    yCol, 
    explanColsF=NULL,
    explanColsList=NULL, 
    groupsIndList=NULL,
    crit='AIC',
    family='gaussian') {
    
  # empty index set
  groupElemInd = NULL
  rowNum = length(groupsIndList)
  critSummaryList = list()
  for (i in 1:rowNum) {
    groupIndex = groupsIndList[[i]]
    
    res = ModSel_mergeGroupFormula(
        df=df, 
        yCol,
        explanColsF=explanColsF, 
        explanColsList=explanColsList,
        groupIndex=groupIndex,
        crit=crit, 
        family=family)
    
    groupElemInd1 = res[['groupElemInd']]
    critSummaryList[[i]] = res[['critSummary']]
    # print(groupElemInd1)
    
    ## reset explanF
    if (dim(groupElemInd1)[1] > 0) {
      explanColsF1 = Subset_explanListFormula(explanColsList, groupElemInd1)
      if (is.null(explanColsF)) {
        explanColsF=explanColsF1
      } else {
        explanF = c(explanColsF, explanColsF1)
      }
      groupElemInd = rbind(groupElemInd, groupElemInd1)
    }
  }
  cols = Subset_explanListFormula(explanColsList, groupElemInd) 
  outList = list(
    groupElemInd=groupElemInd, 
    critSummaryList=critSummaryList,
    cols=cols)
  return(outList)
}

TestModSel_seqMergingFormula = function() {
    
  n = 1000
  num1 = 4
  num2 = 2
  num3 = 3
  num4 = 3
  explan1 = matrix(rnorm(1:(num1*n)), n, num1)
  explan2 = matrix(rnorm(1:(num2*n)), n, num2)
  explan3 = matrix(rnorm(1:(num3*n)), n, num3)
  explanF = matrix(rnorm(1:(num3*n)), n, num4)  
  explan1 = data.frame(explan1)
  explan2 = data.frame(explan2)
  explan3 = data.frame(explan3)
  explanF = data.frame(explanF)

  names(explan1) = paste('A', 1:num1, sep='')
  names(explan2) = paste('B', 1:num2, sep='')
  names(explan3) = paste('C', 1:num3, sep='')
  names(explanF) = paste('F', 1:num4, sep='')

  explanColsList = list(c(names(explan1), 'A1*A1', 'A2*A3', 'A3*A3', 'A2*A2'), 
    names(explan2), names(explan3))

  mu = explan1[ , 1]*explan1[ , 1] + explan1[ , 2]*explan1[ , 3] + 
    5*explan3[ , 1] + 5*explan3[ , 3]

  # lambda = 2^(mu) + 0.1
  # lambda = MinComp(lambda, 10000)
  # y = rpois(n, lambda)
  y = mu
  df = cbind(explan1, explan2, explan3, explanF, y)
  family = 'gaussian'

  groupsIndList = list(c(1), c(2,3))
  
  res = ModSel_seqMergingFormula(
    df=df, yCol='y', explanColsF=explanColsF, 
    explanColsList=explanColsList, 
    groupsIndList=groupsIndList, crit='BIC', family=family)

  res = ModSel_seqMergingFormula(
    df=df, yCol='y', explanColsF=NULL, 
    explanColsList=explanColsList, 
    groupsIndList=groupsIndList, crit='BIC', family=family)

}

## grouped model selection
## y: response
## explanList: list of data frames, each representing a group of variables
## data of two columns, one is group, one is element
## groupsIndList: 
ModSelSeqMerging_fixedGroups = function(
    y=NULL,
    explanList=NULL,
    groupElemIndFixed=NULL,
    groupsIndList=NULL,
    crit='AIC',
    family=NULL) {
    
  if (is.null(groupsIndList)) {
    groupsIndList=as.list(1:length(explanList))
  }
  explanF = NULL
  
  if (!is.null(groupElemIndFixed)) {
    explanF = Subset_explanList(explanList, groupElemIndFixed)
  }
  groupElemInd = NULL
  
  if (!is.null(groupsIndList)) {
    res = ModSelSeqMerging(
      y=y,
      explanF=explanF,
      explanList=explanList, 
      groupsIndList=groupsIndList,
      crit=crit,
      family=family)
    groupElemInd = res[['groupElemInd']]
    critSummaryList = res[['critSummaryList']]
  }
  
  if (!is.null(groupElemIndFixed)) {
    groupElemInd = rbind(as.matrix(groupElemIndFixed), groupElemInd)
  }
  
  outList = list(groupElemInd=groupElemInd, critSummaryList=critSummaryList)
  return(outList)

}

TestModSelSeqMerging_fixedGroups = function() {
    
  n = 500
  ord = c(5, 3, 2, 2) 
  group = c(rep(1, ord[1]), rep(2, ord[2]), rep(3, ord[3]), rep(4, ord[4]))
  element = c(1:ord[1], 1:ord[2], 1:ord[3], 1:ord[4])
  groupElem = cbind(group, element)
  explan1 = matrix(rnorm(ord[1]*n), n, ord[1])
  explan2 = matrix(rnorm(ord[2]*n), n, ord[2])
  explan3 = matrix(rnorm(ord[3]*n), n, ord[3])
  explan4 = matrix(rnorm(ord[4]*n), n, ord[4])    
  explanList = list(explan1, explan2, explan3, explan4)  
  logy = 2*explan1[ , 5] + 2*explan2[ , 2] + 1*explan3[ , 1] + rnorm(n)
  y = 1.1^(logy)
  groupElemIndFixed = groupElem[c(1, 2), ]
  groupsIndList = list(c(1), c(2), c(3, 4))
  
  ModSelSeqMerging_fixedGroups(
      y=y,
      explanList=explanList, 
      groupElemIndFixed=groupElemIndFixed,
      groupsIndList=groupsIndList, 
      crit='BIC',
      family='gaussian')

  ###
  groupElemIndFixed = NULL
  groupsIndList = list(c(1), c(2), c(3, 4))
  ModSelSeqMerging_fixedGroups(y=y, explanList=explanList, 
    groupElemIndFixed=groupElemIndFixed, groupsIndList=groupsIndList, 
    crit='BIC', family='gaussian')

}

ModSelSeqMerging_fixedGroupsFormula = function(
    df,
    yCol,
    explanColsList=NULL,
    groupElemIndFixed=NULL,
    groupsIndList=NULL,
    crit='AIC',
    family=NULL) {
    
  if (is.null(groupsIndList)) {
    groupsIndList=as.list(1:length(explanColsList))
  }
  explanF = NULL
  
  if (!is.null(groupElemIndFixed)) {
    explanColsF = Subset_explanListFormula(explanColsList, groupElemIndFixed)
  }
  groupElemInd = NULL
  
  if (!is.null(groupsIndList)) {
    res = ModSel_seqMergingFormula(
      df=df,
      yCol=yCol,
      explanColsF=explanColsF, 
      explanColsList=explanColsList, 
      groupsIndList=groupsIndList,
      crit=crit,
      family=family)
    groupElemInd = res[['groupElemInd']]
    critSummaryList = res[['critSummaryList']]
  }
  
  if (!is.null(groupElemIndFixed)) {
    groupElemInd = rbind(as.matrix(groupElemIndFixed), groupElemInd)
  }
  
  cols = Subset_explanListFormula(explanColsList, groupElemInd) 
  outList = list(
    groupElemInd=groupElemInd,
    critSummaryList=critSummaryList,
    cols=cols)
  return(outList)
}


## stepwise
StepwiseGlm = function(df, yCol, predCols, familyStr) {

  model = glm(
      as.formula(paste0(yCol, "~1")), data=df, family=familyStr)

  ## building the maximal model formula 
  upperModFormula = as.formula(paste0(
      "~",
      paste0(predCols, collapse="+")))

  stepModel = stepAIC(
      model,
      direction="both", trace=FALSE, k=log(nrow(df)),
      scope=list(
          upper=upperModFormula,
          lower=~1))

  terms = attr(terms(stepModel), "term.labels")
  anova = stepModel[["anova"]]
  return(list("terms"=terms, "anova"=anova))

}
Reza1317/funcly documentation built on Feb. 5, 2020, 4:06 a.m.