R/comps.R

Defines functions clusterRepLasso processClusterLassoInputs protolasso getSelectedSets getClusterSelsFromGlmnet checkGetXglmnetInputs getXglmnet

#' Converts the provided design matrix to an appropriate format for either the
#' protolasso or the cluster representative lasso.
#'
#' Creates design matrix for glmnet by dealing with clusters (for
#' type="protolasso", discards all cluster members except prototype; for
#' type="clusterRepLasso", replaces all cluster members with a simple
#' average of all the cluster members).
#' @param x A numeric matrix; the provided matrix with n observations and p
#' features.
#' @param clusters A named list where each entry is an integer vector of indices
#' of features that are in a common cluster. (The length of list clusters should
#' be equal to the number of clusters.) All identified clusters should be
#' non-overlapping. All features should appear in exactly one cluster (any
#' unclustered features should be put in their own "cluster" of size 1).
#' @param type Character; "protolasso" for the protolasso or "clusterRepLasso"
#' for the cluster representative lasso.
#' @param prototypes Only required for type "protolasso". An integer vector
#' whose length is equal to the number of clusters. Entry i should be the
#' prototype for cluster i (the feature belonging to cluster i that is most
#' highly correlated with y; see Reid and Tibshirani 2016).
#' @return A numeric matrix; the design matrix as required for the protolasso or
#' cluster representative lasso, prepared for input to glmnet. The number of 
#' columns will be equal to length(clusters), and column j will be a feature
#' corresponding to clusters[[j]] (either the prototype from clusters[[j]] or
#' a simple average of the features in clusters[[j]]).
#' @author Gregory Faletto, Jacob Bien
#' @references Reid, S., & Tibshirani, R. (2016). Sparse regression and marginal
#' testing using cluster prototypes. \emph{Biostatistics}, 17(2), 364–376.
#' \url{https://doi.org/10.1093/biostatistics/kxv049}.
getXglmnet <- function(x, clusters, type, prototypes=NA){
    
    # Check inputs
    checkGetXglmnetInputs(x, clusters, type, prototypes)

    n <- nrow(x)
    p <- ncol(x)

    # if(n_clusters > 0){
    for(i in 1:length(clusters)){
        cluster_i <- clusters[[i]]

        if(length(cluster_i) == 1){
            X_glmnet_i <- x[, cluster_i]
        } else{
            stopifnot(length(cluster_i) > 1)

            if(type == "protolasso"){
                prototype_ind_i <- which(prototypes %in% cluster_i)
                stopifnot(length(prototype_ind_i) == 1)
                prototype_i <- prototypes[prototype_ind_i]
                X_glmnet_i <- x[, prototype_i]
            } else {
                stopifnot(type == "clusterRepLasso")
                X_glmnet_i <- rowMeans(x[, cluster_i])
            }
        }
        
        stopifnot(length(X_glmnet_i) == n)
        
        if(i == 1){
            X_glmnet <- as.matrix(X_glmnet_i)
        } else{
            X_glmnet <- cbind(X_glmnet, X_glmnet_i)
        }
    }
    stopifnot(ncol(X_glmnet) == length(clusters))
    
    # if(length(non_cluster_feats) > 0){
    #     X_glmnet <- cbind(X_glmnet, x[, non_cluster_feats])
    # } else{
    #     X_glmnet <- X_glmnet
    # }
    # stopifnot(ncol(X_glmnet) == length(non_cluster_feats) + n_clusters)
    # } else{
    #     X_glmnet <- x
    #     if(type == "protolasso"){
    #         stopifnot(length(prototypes) == 0)
    #     }
    #     cluster_members <- integer()
    #     non_cluster_feats <- 1:p
    # }

    colnames(X_glmnet) <- character()

    # Check output
    stopifnot(is.matrix(X_glmnet))
    stopifnot(nrow(X_glmnet) == n)
    stopifnot(ncol(X_glmnet) <= p)
    stopifnot(ncol(X_glmnet) >= 1)
    
    return(X_glmnet)

}


#' Verifies the inputs for getXglmnet.
#'
#' @param x A numeric matrix; the provided matrix with n observations and p
#' features.
#' @param clusters A named list where each entry is an integer vector of indices
#' of features that are in a common cluster. (The length of list clusters should
#' be equal to the number of clusters.) All identified clusters should be
#' non-overlapping. All features should appear in exactly one cluster (any
#' unclustered features should be put in their own "cluster" of size 1).
#' @param type Character; "protolasso" for the protolasso or "clusterRepLasso"
#' for the cluster representative lasso.
#' @param prototypes Only required for type "protolasso". An integer vector
#' whose length is equal to the number of clusters. Entry i should be the
#' prototype for cluster i (the feature belonging to cluster i that is most
#' highly correlated with y; see Reid and Tibshirani 2016).
#' @author Gregory Faletto, Jacob Bien
#' @references Reid, S., & Tibshirani, R. (2016). Sparse regression and marginal
#' testing using cluster prototypes. \emph{Biostatistics}, 17(2), 364–376.
#' \url{https://doi.org/10.1093/biostatistics/kxv049}.
checkGetXglmnetInputs <- function(x, clusters, type, prototypes){
    stopifnot(is.matrix(x))

    stopifnot(is.list(clusters))
    stopifnot(all(lengths(clusters) >= 1))

    # stopifnot(length(n_clusters) == 1)
    # stopifnot(is.integer(n_clusters) | is.numeric(n_clusters))
    # stopifnot(n_clusters >= 0)
    # if(type=="protolasso"){
    #     stopifnot(n_clusters == length(prototypes))
    # }
    # stopifnot(n_clusters == length(clusters))

    stopifnot(length(type) == 1)
    stopifnot(is.character(type))
    stopifnot(!is.na(type))
    stopifnot(type %in% c("protolasso", "clusterRepLasso"))

    stopifnot(!is.na(prototypes))
    stopifnot(is.integer(prototypes))
    # if(is.list(clusters)){
        # stopifnot(length(prototypes) == length(clusters))
    # } else{
    #     stopifnot(length(prototypes) == 1)
    # }
    stopifnot(all(!is.na(prototypes)))
    stopifnot(length(prototypes) == length(unique(prototypes)))
    stopifnot(all(prototypes %in% 1:ncol(x)))
    
    for(i in 1:length(clusters)){
        cluster_i <- clusters[[i]]
        stopifnot(sum(prototypes %in% cluster_i) == 1)
    }
}

#' Extracts selected clusters and cluster prototypes from the glmnet lasso
#' output
#'
#' @param lasso_sets A list of integer vectors. Each vector represents a set of
#' features selected by the lasso for a given value of the penalty parameter
#' lambda.
#' @param clusters A named list where each entry is an integer vector of indices
#' of features that are in a common cluster. (The length of list clusters is
#' equal to the number of clusters.) All identified clusters must be
#' non-overlapping. All features appear in exactly one cluster (any unclustered
#' features must be in their own "cluster" of size 1).
#' @param prototypes An integer vector whose length must be equal to the number
#' of clusters. Entry i should be the index of the feature belonging to cluster
#' i that is most highly correlated with y (that is, the prototype for the
#' cluster, as in the protolasso; see Reid and Tibshirani 2016).
#' @param feat_names Character vector; the names of the features in X. (If the
#' X provided to protolasso or clusterRepLasso did not have feature names,
#' feat_names will be NA.)
#' @return A list containing the following items: \item{selected_sets}{A list of
#' integer vectors. Entry k of this list contains a selected set of size k
#' yielded by glmnet--each member of the set is the index of a single feature
#' from a cluster selected by either the protolasso or the cluster
#' representative lasso (the prototype from that cluster--the cluster member
#' most highly correlated with y). (If no set of size k was selected, entry k
#' will be NULL.)} \item{selected_clusts_list}{A list of lists; entry k of this
#' list is a list of length k of clusters (the clusters that were selected by
#' the cluster representative lasso). Again, if no set of size k was selected,
#' entry k will be NULL.}
#' @author Gregory Faletto, Jacob Bien
#' @references Reid, S., & Tibshirani, R. (2016). Sparse regression and marginal
#' testing using cluster prototypes. \emph{Biostatistics}, 17(2), 364–376.
#' \url{https://doi.org/10.1093/biostatistics/kxv049}. \cr Bühlmann, P.,
#' Rütimann, P., van de Geer, S., & Zhang, C. H. (2013). Correlated variables in
#' regression: Clustering and sparse estimation.
#' \emph{Journal of Statistical Planning and Inference}, 143(11), 1835–1858.
#' \url{https://doi.org/10.1016/j.jspi.2013.05.019}.
getClusterSelsFromGlmnet <- function(lasso_sets, clusters, prototypes,
    feat_names){
    # checkGetClusterSelsFromGlmnetInput(clusters, prototypes, averaging)

    if(any(!is.na(feat_names))){
        stopifnot(all(!is.na(feat_names)))
    }

    # Largest selected set among all those in lasso_sets
    max_length <- max(vapply(lasso_sets, length, integer(1)))

    # Preparing lists to store 
    selected_sets <- list()
    # to_avg_list <- list()
    selected_clusts_list <- list()
    # weights_list <- list()
    
    for(j in 1:max_length){
        # Lasso selected set of size j
        lasso_sets_j <- lasso_sets[lapply(lasso_sets, length) == j]
        # Are there any lasso selected sets of size j? (If not, we will skip to
        # the next j, and slot j in the list will be empty.)
        if(length(lasso_sets_j) > 0){

            # Select the first set of size j
            lasso_set_j <- lasso_sets_j[[1]]
            stopifnot(length(lasso_set_j) == j)
            
            ret <- getSelectedSets(lasso_set=lasso_set_j, clusters=clusters,
                prototypes=prototypes, feat_names=feat_names)

            selected_sets[[j]] <- ret$selected_set
            selected_clusts_list[[j]] <- ret$selected_clusts_list

            rm(ret)
        }
    }

    stopifnot(length(selected_sets) <= max_length)
    stopifnot(length(selected_clusts_list) <= max_length)

    # if(averaging){
    return(list(selected_sets=selected_sets,
        selected_clusts_list=selected_clusts_list))
    # }
    # else{
    #     return(selected_sets)
    # }
}

#' Converts a selected set from X_glmnet to selected sets and selected clusters
#' from the original feature space of X.
#'
#' @param lasso_set A vector containing the indices of selected cluster
#' representatives or prototypes.
#' @param clusters A named list where each entry is an integer vector of indices
#' of features that are in a common cluster. (The length of list clusters is
#' equal to the number of clusters.) All identified clusters must be
#' non-overlapping. All features appear in exactly one cluster (any unclustered
#' features must be in their own "cluster" of size 1).
#' @param prototypes An integer vector whose length must be equal to the number
#' of clusters. Entry i should be the index of the feature belonging to cluster
#' i that is most highly correlated with y (that is, the prototype for the
#' cluster, as in the protolasso).
#' @param feat_names Character vector; the names of the features in X.
#' @return A list containing two items: \item{selected_set}{An integer vector
#' with length equal to lasso_set containing a set of selected features in the
#' original X matrix. (Selections in lasso_set corresponding to a cluster will
#' be replaced by the cluster's prototype from X.)}
#' \item{selected_clusts_list}{A named list of integer vectors with length equal
#' to selected_set. selected_clusts_list[[k]] will be an integer vector
#' containing the indices of the features in X that are in the cluster
#' containing prototype selected_set[k].}
#' @author Gregory Faletto, Jacob Bien
getSelectedSets <- function(lasso_set, clusters, prototypes, feat_names){
    
    model_size <- length(lasso_set)
    stopifnot(model_size > 0)

    stopifnot(length(unique(lasso_set)) == model_size)
    stopifnot(all(lasso_set <= length(clusters)))

    selected_set <- integer()
    selected_clusts_list <- list()
    # Recover features from original feature space
    for(k in 1:model_size){
        selected_cluster_k <- clusters[[lasso_set[k]]]
        stopifnot(is.integer(selected_cluster_k))
        selected_clusts_list[[k]] <- selected_cluster_k

        if(length(selected_cluster_k) == 1){
            stopifnot(!(selected_cluster_k %in% selected_set))
            selected_set <- c(selected_set, selected_cluster_k)
        } else{
            sel_prototype <- which(prototypes %in% selected_cluster_k)
            stopifnot(length(sel_prototype) == 1)
            stopifnot(!(prototypes[sel_prototype] %in% selected_set))
            selected_set <- c(selected_set, prototypes[sel_prototype])
        }
    }


    # # : deal with
    # # prototype and non-prototype features separately
    # proto_inds <- lasso_set %in% prototypes
    # lasso_set_protos <- lasso_set[proto_inds]
    # lasso_set_non_protos <- lasso_set[!proto_inds]

    # # Recover indices of non-prototype features

    # if(length(lasso_set_non_protos) > 0){
    #     stopifnot(length(lasso_set_non_protos) <=
    #         length(non_cluster_feats))
    #     stopifnot(all(lasso_set_non_protos - n_clusters %in%
    #         1:length(non_cluster_feats)))

    #     recovered_feats <- non_cluster_feats[lasso_set_non_protos -
    #         n_clusters]

    #     stopifnot(length(lasso_set_non_protos) ==
    #         length(recovered_feats))
    #     stopifnot(all.equal(!proto_inds, lasso_set %in%
    #         lasso_set_non_protos))
    #     stopifnot(length(recovered_feats) ==
    #         length(unique(recovered_feats)))

    #     lasso_set_non_protos <- recovered_feats

    #     stopifnot(length(lasso_set_non_protos) == sum(!proto_inds))
    # }

    # stopifnot(length(unique(lasso_set_protos)) == length(lasso_set_protos))
    # stopifnot(length(lasso_set_protos) == sum(proto_inds))

    # proto_selected <- length(lasso_set_protos) > 0

    # stopifnot(all(lasso_set_protos %in% 1:n_clusters))

    # if(proto_selected){
    #     lasso_set_protos <- prototypes[lasso_set_protos]
    #     stopifnot(length(lasso_set_protos) == sum(proto_inds))
    # }

    # stopifnot(length(lasso_set) == length(lasso_set_protos) +
    #     length(lasso_set_non_protos))

    # lasso_set[proto_inds] <- lasso_set_protos
    # lasso_set[!proto_inds] <- lasso_set_non_protos

    # stopifnot(length(unique(lasso_set)) == length(lasso_set))
    
    # if(var_names_provided){
    #     stopifnot(max(lasso_set) <= length(var_names))
    #     selected_sets <- var_names[lasso_set]
    # } else{
    #     selected_sets <- lasso_set
    # }

    stopifnot(length(selected_set) == model_size)
    stopifnot(length(unique(selected_set)) == model_size)
    
    if(any(!is.na(feat_names))){
        names(selected_set) <- feat_names[selected_set]
    }

    # if(averaging){

    # to_avg_list[[model_size]] <- logical(model_size)

    # Finally, identify selected clusters
    
    # weights_list[[model_size]] <- list()

    stopifnot(length(selected_clusts_list) == model_size)
    all_feats <- unlist(selected_clusts_list)
    stopifnot(length(all_feats) == length(unique(all_feats)))

    return(list(selected_set=selected_set,
        selected_clusts_list=selected_clusts_list))


}

# getCrlOutputs <- function(to_avg_list, selected_clusts_list, weights_list,
#         j, lasso_set_j, n_clusters, clusters){

#     ind_j <- matrix(FALSE, nrow=length(lasso_set_j),
#         ncol=n_clusters)
#     stopifnot(is.list(clusters))
#     # if(is.list(clusters)){
        
#     for(k in 1:n_clusters){
#         n_cluster_members_k <- length(clusters[[k]])

#         stopifnot(n_cluster_members_k >= 1)

#         ind_j[, k] <- lasso_set_j == prototypes[k]

#         stopifnot(sum(ind_j[, k]) %in% c(0, 1))

#         if(sum(ind_j[, k]) == 1){
#             to_avg_list[[j]][which(ind_j[, k])] <- TRUE

#             # selected_sets[[j]][lasso_set_j == 1] <- cluster_prototype
#             if(var_names_provided){
#                 selected_clusts_list[[j]][[which(ind_j[, k])]] <-
#                     var_names[clusters[[k]]]
#             } else{
#                 selected_clusts_list[[j]][[which(ind_j[, k])]] <-
#                     clusters[[k]]
#             }

#             weights_list[[j]][[which(ind_j[, k])]] <-
#                 rep(1/n_cluster_members_k, n_cluster_members_k)
#         }
#     }
#     # } else{
#     #     stopifnot(n_clusters <= 1)
#     #     n_cluster_members_k <- length(clusters)
#     #     stopifnot(n_cluster_members_k != 1)

#     #     ind_j[, 1] <- lasso_set_j == prototypes

#     #     stopifnot(sum(ind_j[, 1]) %in% c(0, 1))

#     #     if(sum(ind_j[, 1]) == 1){
#     #         to_avg_list[[j]][which(ind_j[, 1])] <- TRUE

#     #         # selected_sets[[j]][lasso_set_j == 1] <- cluster_prototype
#     #         if(var_names_provided){
#     #             selected_clusts_list[[j]][[which(ind_j[, 1])]] <-
#     #                 var_names[clusters]
#     #         } else{
#     #             selected_clusts_list[[j]][[which(ind_j[, 1])]] <-
#     #                 clusters
#     #         }


            
#     #         weights_list[[j]][[which(ind_j[, 1])]] <-
#     #             rep(1/n_cluster_members_k, n_cluster_members_k)

#     #         stopifnot(length(weights_list[[j]][[which(ind_j[, 1])]]) ==
#     #             length(selected_clusts_list[[j]][[which(ind_j[, 1])]]))
#     #     }
#     # }
        
#     stopifnot(sum(ind_j) != 0)
#     stopifnot(all(rowSums(ind_j) <= 1))

#     return(list(to_avg_list=to_avg_list,
#         selected_clusts_list=selected_clusts_list, weights_list=weights_list))
# }

# checkGetClusterSelsFromGlmnetInput <- function(clusters, prototypes, averaging){

#     stopifnot(is.list(clusters))
#     stopifnot(all(lengths(clusters) >= 1))
    
#     stopifnot(is.integer(prototypes))
#     stopifnot(all(!is.na(prototypes)))
#     stopifnot(length(prototypes) <= length(clusters))
#     stopifnot(length(prototypes) == length(unique(prototypes)))

#     # stopifnot(length(var_names_provided) == 1)
#     # stopifnot(is.logical(var_names_provided))
#     # if(var_names_provided){
#     #     stopifnot(is.character(var_names))
#     #     if(any(is.na(var_names))){
#     #         stop("must provide var_names (with no NAs) if var_names_provided=TRUE")
#     #     }
#     # }

#     stopifnot(length(averaging) == 1)
#     stopifnot(is.logical(averaging))

#     # n_clusters <- sum(lengths(clusters) > 1)
#     # stopifnot(length(prototypes) == n_clusters)

#     # return(n_clusters)

# }



#' Select features via the protolasso (Reid and Tibshirani 2016)
#'
#' @param X An n x p numeric matrix (preferably) or a data.frame (which will
#' be coerced internally to a matrix by the function model.matrix) containing
#' p >= 2 features/predictors
#' @param y The response; A length n numeric (or integer) real-valued vector.
#' @param clusters A list of integer vectors; each vector should contain the 
#' indices of a cluster of features (a subset of 1:p). (If there is only one
#' cluster, clusters can either be a list of length 1 or an integer vector.)
#' All of the provided clusters must be non-overlapping. Every feature not
#' appearing in any cluster will be assumed to be unclustered (that is, they
#' will be treated as if they are in a "cluster" containing only themselves).
#' Default is list() (so no clusters are specified).
#' @param nlambda Integer; the number of lambda values to use in the lasso fit
#' for the protolasso. Default is 100 (following the default for glmnet). For
#' now, nlambda must be at least 2 (using a single lambda is not supported).
#' @return A list with three elements. \item{selected_sets}{A list of integer
#' vectors. Entry k of this list contains a selected set (an integer vector) of
#' size k yielded by the protolasso (If no set of size k was selected, entry k
#' will be empty.)} \item{selected_clusts_list}{A list; each element of the list
#' is a named list of selected clusters. (That is, if a selected set of size k
#' was yielded by the protolasso, then selected_clusts_list[[k]] is a named
#' list of length k, where each member of the list is an integer vector
#' of cluster members. In particular, selected_clusts_lists[[k]][[j]] will be
#' the cluster that contains feature selected_sets[[k]][j].)} \item{beta}{The
#' beta output from glmnet when the lasso was estimated on a matrix of
#' prototypes. (See documentation for the function glmnet from the glmnet
#' package for details.)}
#' @author Gregory Faletto, Jacob Bien
#' @references Reid, S., & Tibshirani, R. (2016). Sparse regression and marginal
#' testing using cluster prototypes. \emph{Biostatistics}, 17(2), 364–376.
#' \url{https://doi.org/10.1093/biostatistics/kxv049}.
#' @export
protolasso <- function(X, y, clusters=list(), nlambda=100){

    # Handle and format inputs; get cluster prototypes
    ret <- processClusterLassoInputs(X, y, clusters, nlambda)

    x <- ret$x
    clusters <- ret$clusters
    prototypes <- ret$prototypes
    feat_names <- ret$var_names

    rm(ret)

    # Format the design matrix for glmnet according to the protolasso procedure
    X_glmnet <- getXglmnet(x, clusters, type="protolasso",
        prototypes=prototypes)

    #  <- getXglmnet_results$X_glmnet
    # cluster_members <- getXglmnet_results$cluster_members
    # non_cluster_feats <- getXglmnet_results$non_cluster_feats

    # rm(getXglmnet_results)

    # Estimate the lasso on the cluster prototypes
    fit <- glmnet::glmnet(x=X_glmnet, y=y, family="gaussian", nlambda=nlambda)
    lasso_sets <- unique(glmnet::predict.glmnet(fit, type="nonzero"))

    # Finally, obtain a tidy list of selected sets--one for each model size
    cluster_sel_results <- getClusterSelsFromGlmnet(lasso_sets, clusters,
        prototypes, feat_names)

    return(list(selected_sets=cluster_sel_results$selected_sets,
        selected_clusts_list=cluster_sel_results$selected_clusts_list,
        beta=fit$beta))
}

#' Check the inputs to protolasso and clusterRepLasso, format clusters, and
#' identify prototypes for each cluster
#'
#' @param X An n x p numeric matrix (preferably) or a data.frame (which will
#' be coerced internally to a matrix by the function model.matrix) containing
#' p >= 2 features/predictors
#' @param y The response; A length n numeric (or integer) real-valued vector.
#' @param clusters A list of integer vectors; each vector should contain the 
#' indices of a cluster of features (a subset of 1:p). (If there is only one
#' cluster, clusters can either be a list of length 1 or an integer vector.)
#' All of the provided clusters must be non-overlapping. Every feature not
#' appearing in any cluster will be assumed to be unclustered (that is, they
#' will be treated as if they are in a "cluster" containing only themselves).
#' Default is list() (so no clusters are specified).
#' @param nlambda Integer; the number of lambda values to use in the lasso fit
#' for the protolasso. Default is 100 (following the default for glmnet). For
#' now, nlambda must be at least 2 (using a single lambda is not supported).
#' @return A list with four elements. \item{x}{The provided X, converted to a
#' matrix if it was provided as a data.frame, and with column names removed.}
#' \item{clusters}{A named list where each entry is an integer vector of indices
#' of features that are in a common cluster. (The length of list clusters is
#' equal to the number of clusters.) All identified clusters are
#' non-overlapping. All features appear in exactly one cluster (any unclustered
#' features will be put in their own "cluster" of size 1).}
#' \item{prototypes}{An integer vector whose length is equal to the number of
#' clusters. Entry i is the index of the feature belonging to cluster i that is
#' most highly correlated with y (that is, the prototype for the cluster, as in
#' the protolasso; see Reid and Tibshirani 2016).} \item{var_names}{If the
#' provided X matrix had column names, the names of the featurrs in the provided
#' X matrix. If no names were provided, feat_names will be NA.}
#' @author Gregory Faletto, Jacob Bien
#' @references Reid, S., & Tibshirani, R. (2016). Sparse regression and marginal
#' testing using cluster prototypes. \emph{Biostatistics}, 17(2), 364–376.
#' \url{https://doi.org/10.1093/biostatistics/kxv049}.
processClusterLassoInputs <- function(X, y, clusters, nlambda){

    stopifnot(is.matrix(X) | is.data.frame(X))

    # Check if x is a matrix; if it's a data.frame, convert to matrix.
    if(is.data.frame(X)){
        p <- ncol(X)

        X <- stats::model.matrix(~ ., X)
        X <- X[, colnames(X) != "(Intercept)"]

        if(p != ncol(X) & length(clusters) > 0){
            stop("When stats::model.matrix converted the provided data.frame X to a matrix, the number of columns changed (probably because the provided data.frame contained a factor variable with at least three levels). Please convert X to a matrix yourself using model.matrix and provide cluster assignments according to the columns of the new matrix.")
        }
    }

    stopifnot(is.matrix(X))
    stopifnot(all(!is.na(X)))

    feat_names <- as.character(NA)
    if(!is.null(colnames(X))){
        feat_names <- colnames(X)
        if(any(is.na(feat_names))){
            stop("Some features in provided X matrix had valid names and some had NA names; please neither name all features in X or remove the names altogether.")
        }
    }

    n <- nrow(X)

    colnames(X) <- character()

    stopifnot(is.numeric(y) | is.integer(y))
    stopifnot(n == length(y))
    stopifnot(all(!is.na(y)))

    # Check clusters argument
    clusters <- checkCssClustersInput(clusters)

    # Format clusters into a list where all features are in exactly one
    # cluster (any unclustered features are put in their own "cluster" of size
    # 1).
    clust_names <- as.character(NA)
    if(!is.null(names(clusters)) & is.list(clusters)){
        clust_names <- names(clusters)
    }

    cluster_results <- formatClusters(clusters, p=ncol(X),
        clust_names=clust_names, get_prototypes=TRUE, x=X, y=y)

    clusters <- cluster_results$clusters
    prototypes <- cluster_results$prototypes

    rm(cluster_results)

    stopifnot(length(clusters) == length(prototypes))

    stopifnot(is.numeric(nlambda) | is.integer(nlambda))
    stopifnot(length(nlambda) == 1)
    stopifnot(!is.na(nlambda))
    stopifnot(nlambda >= 2)
    stopifnot(nlambda == round(nlambda))

    return(list(x=X, clusters=clusters, prototypes=prototypes,
        var_names=feat_names))
}

#' Select features via the cluster representative lasso (Bühlmann et. al. 2013)
#'
#' @param X An n x p numeric matrix (preferably) or a data.frame (which will
#' be coerced internally to a matrix by the function model.matrix) containing
#' p >= 2 features/predictors
#' @param y The response; A length n numeric (or integer) real-valued vector.
#' @param clusters A list of integer vectors; each vector should contain the 
#' indices of a cluster of features (a subset of 1:p). (If there is only one
#' cluster, clusters can either be a list of length 1 or an integer vector.)
#' All of the provided clusters must be non-overlapping. Every feature not
#' appearing in any cluster will be assumed to be unclustered (that is, they
#' will be treated as if they are in a "cluster" containing only themselves).
#' Default is list() (so no clusters are specified).
#' @param nlambda Integer; the number of lambda values to use in the lasso fit
#' for the cluster representative lasso. Default is 100 (following the default
#' for glmnet). For now, nlambda must be at least 2 (using a single lambda is
#' not supported).
#' @return A list with three elements. \item{selected_sets}{A list of integer
#' vectors. Entry k of this list contains a selected set (an integer vector) of
#' size k yielded by the lasso--each member of the set is the index of a single
#' feature from a cluster selected by the cluster representative lasso (the
#' prototype from that cluster--the cluster member most highly correlated with
#' y). (If no set of size k was selected, entry k will be empty.)}
#' \item{selected_clusts_list}{A list; each element of the list is a named list
#' of selected clusters. (That is, if a selected set of size k was yielded by
#' the cluster representative lasso, then selected_clusts_list[[k]] is a named
#' list of length k, where each member of the list is an integer vector
#' of cluster members. Note that selected_clusts_lists[[k]][[j]] will be the
#' cluster that contains feature selected_sets[[k]][j].)} \item{beta}{The beta
#' output from glmnet when the lasso was estimated on a matrix of prototypes.
#' (See documentation for the function glmnet from the glmnet package for
#' details.)}
#' @references Bühlmann, P., Rütimann, P., van de Geer, S., & Zhang, C. H.
#' (2013). Correlated variables in regression: Clustering and sparse estimation.
#' \emph{Journal of Statistical Planning and Inference}, 143(11), 1835–1858.
#' \url{https://doi.org/10.1016/j.jspi.2013.05.019}. \cr Jerome Friedman, Trevor
#' Hastie, Robert Tibshirani (2010). Regularization Paths for Generalized Linear
#' Models via Coordinate Descent. \emph{Journal of Statistical Software}, 33(1)
#' ' 1-22. URL \url{https://www.jstatsoft.org/v33/i01/}.
clusterRepLasso <- function(X, y, clusters=list(), nlambda=100){

    # Handle and format inputs; get cluster prototypes
    ret <- processClusterLassoInputs(X, y, clusters, nlambda)

    x <- ret$x
    clusters <- ret$clusters
    prototypes <- ret$prototypes
    feat_names <- ret$var_names

    rm(ret)

    # Format the design matrix for glmnet according to the cluster
    # representative lasso procedure
    X_glmnet <- getXglmnet(x, clusters, type="clusterRepLasso",
        prototypes=prototypes)

    #  <- getXglmnet_results$X_glmnet
    # cluster_members <- getXglmnet_results$cluster_members
    # non_cluster_feats <- getXglmnet_results$non_cluster_feats

    # rm(getXglmnet_results)

    # Estimate the lasso on the cluster representatives
    fit <- glmnet::glmnet(x=X_glmnet, y=y, family="gaussian", nlambda=nlambda)
    lasso_sets <- unique(glmnet::predict.glmnet(fit, type="nonzero"))

    # Finally, extract the desired information from the lasso fit--all the
    # sets of selected clusters (one for each observed model size), and
    # corresponding sets of selected features
    cluster_sel_results <- getClusterSelsFromGlmnet(lasso_sets, clusters,
        prototypes, feat_names)

    return(list(selected_sets=cluster_sel_results$selected_sets,
        selected_clusts_list=cluster_sel_results$selected_clusts_list,
        beta=fit$beta))
}
gregfaletto/cssr documentation built on March 3, 2023, 1 p.m.