R/utils_weights.R

Defines functions getPeptideProteinWeights

Documented in getPeptideProteinWeights

#' Get PSM-protein weights for summarization with shared peptides
#'
#' @inheritParams getWeightedProteinSummary
#' @return data.table
#'
#' @export
getPeptideProteinWeights = function(feature_data,
                                    norm = "p_norm", norm_parameter = 1,
                                    weights_mode = "contributions",
                                    weights_penalty = FALSE,
                                    weights_penalty_param = 0.1) {
    weights_design = getWeightsDesign(feature_data)
    design_matrix = weights_design[["x"]]
    y = weights_design[["y"]]
    is_non_missing = !is.na(y)

    y = y[is_non_missing]
    design_matrix = design_matrix[is_non_missing, ]

    cols = colnames(design_matrix)
    cols_split = stringr::str_split(cols, "__") # TODO: STRINGR -> STRINGI
    psms_cols = sapply(cols_split, function(x) x[1])
    protein_cols = sapply(cols_split, function(x) if (length(x) == 1) NA else x[-1])

    if (weights_penalty) {
        feature_data[, NumPeptidesPerProtein := uniqueN(ProteinName), by = "PSM"]
        feature_data[, EqualWeights := 1 / NumPeptidesPerProtein]
        equal_weights = unique(feature_data[, .(PSM, ProteinName, EqualWeights)])[, EqualWeights]
        names(equal_weights) = unique(feature_data[, .(PSM, ProteinName, EqualWeights)])[, paste(PSM, ProteinName, sep = "__")]
        colnames(design_matrix)

        equal_weights_vals = equal_weights[colnames(design_matrix)]
        equal_weights_vals = ifelse(is.na(equal_weights_vals), 0, equal_weights_vals)
        equal_weights_vals = unname(equal_weights_vals)
        multiplier = ifelse(equal_weights_vals == 0, 0, 1)
    }

    params_full = CVXR::Variable(ncol(design_matrix)) # n_params defined earlier and passed to function below?
    constraints = getWeightsConstraints(params_full,
                                        design_matrix, weights_mode,
                                        cols, protein_cols, psms_cols)

    # TODO: move below to another function?
    if (weights_penalty) {
        penalty = 0.1 * sum(multiplier * (params_full - equal_weights_vals) ^ 2)
        if (norm == "p_norm") {
            obj = CVXR::p_norm(design_matrix %*% params_full - y, norm_parameter) + penalty
        } else {
            obj = sum(CVXR::huber(design_matrix %*% params_full - y, norm_parameter)) + penalty
        }
    } else {
        if (norm == "p_norm") {
            obj = CVXR::p_norm(design_matrix %*% params_full - y, norm_parameter)
        } else {
            obj = sum(CVXR::huber(design_matrix %*% params_full - y, norm_parameter))
        }
    }

    prob_con = CVXR::Problem(CVXR::Minimize(obj), constraints)
    sol_con = CVXR::solve(prob_con) # TODO: WHAT IF this throws an error?
    alphas = as.vector(sol_con$getValue(CVXR::variables(prob_con)[[1]]))

    result = data.table::data.table(
        ProteinName = protein_cols,
        PSM = psms_cols,
        Weight = alphas
    )
    result = result[!is.na(ProteinName)]
    result = result[Weight > 1e-3] # 1e-4?
    result[, Weight := Weight / sum(Weight), by = "PSM"]
    result
}

#' Get design matrix and response for weights calculation
#' @inheritParams getWeightedProteinSummary
#' @keywords internal
getWeightsDesign = function(feature_data) {
    intensities_tbl = unique(feature_data[, .(PSM, Channel, log2IntensityNormalized)])

    psms_intercept_tbl = unique(feature_data[, .(PSM, Channel, Present = 1)])
    psms_intercept_tbl = data.table::dcast(psms_intercept_tbl,
                                           PSM + Channel ~ PSM, value.var = "Present", fill = 0)
    colnames(psms_intercept_tbl)[3:ncol(psms_intercept_tbl)] = paste0("int_", 1:(ncol(psms_intercept_tbl) - 2))

    psms_protein_tbl = unique(feature_data[, .(PSM, ProteinName, Channel, CenteredAbundance)])
    psms_protein_tbl = data.table::dcast(psms_protein_tbl,
                                         PSM + Channel ~ PSM + ProteinName,
                                         value.var = "CenteredAbundance",
                                         fill = 0, sep = "__")
    wide = merge(
        merge(psms_intercept_tbl, intensities_tbl,
              by = c("PSM", "Channel"), all.x = T, all.y = T),
        psms_protein_tbl,
        by = c("PSM", "Channel"), all.x = T, all.y = T
    )
    wide = data.table::as.data.table(wide)

    y_full = wide$log2IntensityNormalized
    x_full = as.matrix(wide[, -(1:2), with = FALSE])
    x_full = x_full[, !(colnames(x_full) == "log2IntensityNormalized")]
    x_full = cbind(intercept = 1, x_full)

    list(x = x_full,
         y = y_full)
}

#' Get constraints for weights
#' @param params_full object generated by CVXR::Variables
#' @param design_matrix design matrix for weights optimization
#' @inheritParams getWeightedProteinSummary
#' @param cols column names of the design matrix
#' @param protein_cols protein names extracted from column names
#' @param psms_cols feature names extracted from column names
getWeightsConstraints = function(params_full, design_matrix, weights_mode,
                                 cols, protein_cols, psms_cols) {
    unique_psms = unique(psms_cols)
    unique_psms = unique_psms[!grepl("int", unique_psms)]
    n_params = ncol(design_matrix)
    n_conditions = length(unique_psms)

    intercept_conditions = matrix(1, nrow = 1, ncol = n_params)
    intercept_conditions[!grepl("int_[0-9]+", colnames(design_matrix))] = 0

    constraint_matrix_full = matrix(0, nrow = n_conditions, ncol = n_params)
    for (i in seq_along(unique_psms)) {
        psm = unique_psms[i]
        constraint_matrix_full[i, psms_cols == psm] = 1
    }
    constraint_matrix_full = rbind(constraint_matrix_full,
                                   intercept_conditions)

    n_intercepts = sum(grepl("int", cols))
    positive_matrix = diag(1, n_params - n_intercepts)
    positive_matrix = rbind(matrix(0, nrow = n_intercepts,
                                   ncol = ncol(positive_matrix)),
                            positive_matrix)
    positive_matrix = cbind(matrix(0, ncol = n_intercepts,
                                   nrow = nrow(positive_matrix)),
                            positive_matrix)

    if (weights_mode == "contributions") {
        constraints_full = list(positive_matrix %*% params_full >= rep(0, n_params),
                                constraint_matrix_full %*% params_full == c(rep(1, n_conditions),
                                                                            rep(0, nrow(constraint_matrix_full) - n_conditions)))
    } else {
        constraints_full = list(positive_matrix %*% params_full >= rep(0, n_params),
                                positive_matrix %*% params_full <= rep(1, n_params))
    }
    constraints_full
}
Vitek-Lab/SharedPeptidesExplorer documentation built on April 14, 2025, 1:45 p.m.