R/cv_GIC_indexed.R

Defines functions cv_GIC_indexed

cv_GIC_indexed <- function(X, y, nfolds, model_function, ...) {

        family = list(...)$family

        if (family == "gaussian"){
                n <- length(y)
                real_n <- 0 #recount  of test instances
                foldid <- sample(rep(1:nfolds,length.out=n))   #PP replaces cvfolds by a simpler sample(rep()) function
                err <- list(); rss <- list(); #md <- list()

                model.full <- model_function(X, y, ...)
                lambda.full<- model.full$lambda

                for (fold in 1:nfolds){

                        Xte <- X[foldid == fold, ,drop = FALSE]
                        yte <- y[foldid == fold, drop = FALSE]
                        Xtr <- X[foldid != fold, ,drop = FALSE]
                        ytr <- y[foldid != fold, drop = FALSE]

                        compute_model <- cv_compute_model(model_function, Xtr, ytr, Xte, yte, real_n, lambda.full = lambda.full, ...)   #three letter abbreviations (lambda.full vs lam) make this function call confused, so explicit passing of named parameter i.e. lambda.full=lambda.full is required
                        model<-compute_model$model
                        Xtr<-compute_model$Xtr
                        ytr<-compute_model$ytr
                        Xte<-compute_model$Xte
                        yte<-compute_model$yte
                        real_n<-compute_model$real_n

                        #PP new code
                        rss[[fold]] <- model$rss

                        pred <- predict.DMR(model, newx = as.data.frame(Xte))
                        #PP new code error[[fold]] <- apply(pred, 2, function(z) sum((z - yte)^2))
                        err[[fold]] <- apply(pred, 2, function(z) mean((z - yte)^2))

                }


                len_err <- sapply(err, length)
                foldmin <- min(len_err)
                ERR <- sapply(1:nfolds, function(i) err[[i]][ (len_err[i] - foldmin + 1) : len_err[i] ] )
                if (foldmin == 1) {
                  ERR<-t(as.matrix((ERR)))  #making it a horizontal one-row matrix
                }
                #err <- rowMeans(ERR); kt <- which(err == min(err)); df.min <- dmr$df[kt[length(kt)]]; plot(err, type="o")

                p1 <- model.full$df[1]
                s2 <- model.full$rss[1]/(n-p1)

                p <- ncol(model.full$beta)
                if (is.null(p))
                  p <- length(model.full$beta)

                RIC_constant <- constants()$RIC_gaussian_constant
                Const <- exp(seq(log(RIC_constant/50),log(RIC_constant*50), length=81))
                laGIC <- Const*log(p)*s2
                RSS <- sapply(1:nfolds, function(i) rss[[i]][ (len_err[i] - foldmin + 1) : len_err[i] ] )
                if (is.null(dim(RSS))) {
                  RSS<-t(as.matrix((RSS)))  #making it a horizontal one-row matrix
                }
                #MD <- sapply(1:nfolds, function(i)  md[[i]][ (len_err[i] - foldmin + 1) : len_err[i] ] )
                IND <- apply( RSS, 2, function(r) sapply( laGIC, function(la) which.min(r+la*length(r):1) ) )
                errGIC <- apply( IND, 1, function(ind) mean(ERR[cbind(ind,1:nfolds)]) )
                #mdGIC  <- apply( IND, 1, function(ind) mean(MD[cbind(ind,1:10)]) )
                #plot(mdGIC[length(laGIC):1],errGIC[length(laGIC):1]/s2, xlab="MD", ylab="PE", type="o")

                r <- model.full$rss
                kt <- which(errGIC == min(errGIC))
                indGIC <- kt[length(kt)]    #TODO: why last?
                gic.full <- (r+laGIC[indGIC]*length(r):1)/(real_n*s2)
                #plot(gic.full[length(gic.full):1])

        } else{
                if (family == "binomial"){
                        if (!inherits(y, "factor")){
                                stop("Error: y should be a factor")
                        }
                        lev <- levels(factor(y))
                        if (length(lev) != 2){
                                stop("Error: factor y should have 2 levels")
                        }
                        n1 <- table(y)[1]
                        n2 <- table(y)[2]
                        real_n <- 0 #recount  of test instances

                        foldid1 <- sample(rep(1:nfolds,length.out=n1))  #PP replaces cvfolds by a simpler sample(rep()) function
                        foldid2 <- sample(rep(1:nfolds,length.out=n2))  #PP replaces cvfolds by a simpler sample(rep()) function
                        foldid <- c()
                        foldid[which(y == levels(factor(y))[1])] = foldid1
                        foldid[which(y == levels(factor(y))[2])] = foldid2
                        #PP new code error <- list()
                        err <- list(); loglik <- list(); #md <- list()

                        model.full <- model_function(X, y, ...)
                        lambda.full<- model.full$lambda

                        for (fold in 1:nfolds) {

                                Xte <- X[foldid == fold, , drop = FALSE]
                                yte <- y[foldid == fold, drop = FALSE]
                                Xtr <- X[foldid != fold, , drop = FALSE]
                                ytr <- y[foldid != fold, drop = FALSE]

                                compute_model <- cv_compute_model(model_function, Xtr, ytr, Xte, yte, real_n, lambda.full = lambda.full, ...)   #three letter abbreviations (lambda.full vs lam) make this function call confused, so explicit passing of named parameter i.e. lambda.full=lambda.full is required
                                model<-compute_model$model
                                Xtr<-compute_model$Xtr
                                ytr<-compute_model$ytr
                                Xte<-compute_model$Xte
                                yte<-compute_model$yte
                                real_n<-compute_model$real_n

                                #SzN new code based on PP new code
                                loglik[[fold]] <- -2*model$loglik

                                pred <- predict.DMR(model, newx = as.data.frame(Xte), type = "class")
                                #SzN new code based on PP new code error[[fold]] <- apply(pred, 2, function(z) sum(z != yte))
                                err[[fold]] <- apply(pred, 2, function(z) mean(z != yte))

                        }

                        len_err <- sapply(err, length)
                        foldmin <- min(len_err)
                        ERR <- sapply(1:nfolds, function(i) err[[i]][ (len_err[i] - foldmin + 1) : len_err[i] ] )
                        #err <- rowMeans(ERR); kt <- which(err == min(err)); df.min <- dmr$df[kt[length(kt)]]; plot(err, type="o")


                        if (foldmin == 1) {
                          ERR<-t(as.matrix((ERR)))  #making it a horizontal one-row matrix
                        }

                        p <- ncol(model.full$beta)
                        if (is.null(p))
                          p <- length(model.full$beta)

                        RIC_constant <- constants()$RIC_binomial_constant
                        Const <- exp(seq(log(RIC_constant/50),log(RIC_constant*50), length=81))
                        laGIC <- Const*log(p)
                        LOGLIK <- sapply(1:nfolds, function(i) loglik[[i]][ (len_err[i] - foldmin + 1) : len_err[i] ] )
                        if (is.null(dim(LOGLIK))) {
                          LOGLIK<-t(as.matrix((LOGLIK)))  #making it a horizontal one-row matrix
                        }
                        #MD <- sapply(1:nfolds, function(i)  md[[i]][ (len_err[i] - foldmin + 1) : len_err[i] ] )
                        IND <- apply( LOGLIK, 2, function(ll) sapply( laGIC, function(la) which.min(ll+la*length(ll):1) ) )
                        errGIC <- apply( IND, 1, function(ind) mean(ERR[cbind(ind,1:nfolds)]) )
                        #mdGIC  <- apply( IND, 1, function(ind) mean(MD[cbind(ind,1:10)]) )
                        #plot(mdGIC[length(laGIC):1],errGIC[length(laGIC):1]/s2, xlab="MD", ylab="PE", type="o")

                        ll <- -2*model.full$loglik
                        kt <- which(errGIC == min(errGIC))
                        indGIC <- kt[length(kt)]    #TODO: why last?
                        gic.full <- (ll+laGIC[indGIC]*length(ll):1)/real_n
                        #plot(gic.full[length(gic.full):1])

                }
                else{
                        stop("Error: wrong family, should be one of: gaussian, binomial")
                }
        }
        kt <- which(gic.full == min(stats::na.omit(gic.full))) #kt stores indexes in error equal to a minimum error.
        #if there is more than one such index, the LAST one is the one returned, because LAST means a smaller model.
        indMod <- kt[length(kt)]
        df.min <- model.full$df[indMod]

        kt <- which(gic.full <= min(stats::na.omit(gic.full)) + stats::sd(stats::na.omit(gic.full[gic.full!=Inf & gic.full!=-Inf])))
        if (length(kt) == 0) {
          df.1se <- NULL
        } else {
          indMod <- kt[length(kt)]
          df.1se <- model.full$df[indMod]
        }

        out <- list(df.min = df.min, df.1se = df.1se, dmr.fit = model.full, cvm = gic.full, foldid = foldid)
        return(out)
}

Try the DMRnet package in your browser

Any scripts or data that you put into this service are public.

DMRnet documentation built on Aug. 7, 2023, 5:11 p.m.