R/regressionsweep.R

#' Linear regression via a sweep operation on the correlation matrix.
#'
#' @param formula An object of class \code{\link{formula}} (or one that can be
#'   coerced to that class): a symbolic description of the model to be fitted.
#'   The details of type specification are given under \sQuote{Details}.
#' @param data A \code{\link{data.frame}}.
#' @param subset An optional vector specifying a subset of observations to be
#'   used in the fitting process, or the name of a variable in \code{data}. It
#'   may not be an expression.
#' @param weights An optional vector of sampling weights, or the name of a
#'   variable in \code{data}. It may not be an expression.
#' @param ... Additional arguments to be passed to \code{\link[psych]{setCor}}.
#' @details Estimates regression using sweep operations on the correlation matrix.
#' The sample size assumed in tests is the smallest pairwise correlation sample size.
#' @importFrom flipU OutcomeName AllVariablesNames
#' @importFrom flipStatistics CovarianceAndCorrelationMatrix  StandardDeviation Mean
#' @importFrom psych setCor
#' @importFrom flipData CheckForPositiveVariance CheckCorrelationMatrix
#' CheckForLinearDependence CalibrateWeight
#' @importFrom stats complete.cases
#' @importFrom flipFormat Labels BaseDescription
#' @importFrom verbs Sum
#' @export
LinearRegressionFromCorrelations <- function(formula, data = NULL, subset = NULL,
                                                                               weights = NULL, ...)
{
    result <- NULL
    variable.names <- names(data)
    full.variable.names <- Labels(data, show.name = TRUE)
    formula.names <- AllVariablesNames(formula, data)
    indices <- match(formula.names, variable.names)
    outcome.name <- OutcomeName(formula, data)
    outcome.index <- match(outcome.name, variable.names)
    predictors.index <- indices[-match(outcome.name, formula.names)]
    indices <- c(outcome.index, predictors.index)
    factors <- unlist(lapply(data[,indices], is.factor))
    if (any(factors))
        stop(paste0("Factors are not permitted when missing is set to 'Use partial data (pairwise)'.
                    Factors: ", paste(variable.names[indices][factors], collapse = ", ")))
    subset.data <- if(is.null(subset)) data else subset(data, subset)
    n.subset <- nrow(subset.data)
    if (n.subset < length(predictors.index) + 1)
        stop(warningSampleSizeTooSmall())
    n.total <- nrow(data)
    weighted <- !is.null(weights)
    if (weighted) {
        warning("The pairwise correlation option for missing data assumes ",
                "that cases are missing completely at random (MCAR), and ",
                "hence that all cases are drawn from the same population. ",
                "However, using a weight indicates that you do not believe ",
                "this to be the case. Use of weights in this case should be ",
                "treated with caution.")
    }
    if (n.subset < n.total && weighted)
        weights <- subset(weights, subset)
    y <- subset.data[, outcome.index]
    x <- subset.data[, predictors.index]
    if (weighted)
        weights <- CalibrateWeight(weights)
    y.and.x <- subset.data[, c(outcome.index, predictors.index)]
    k <- ncol(y.and.x)
    pairwise.n <- matrix(NA, k, k)
    for (r in 2:ncol(y.and.x))
        for (c in 1:r)
        {
            pairwise.data <- !is.na(y.and.x[,c]) & !is.na(y.and.x[,r])
            pairwise.n[r, c] <- if(weighted) Sum(weights[pairwise.data]) else Sum(pairwise.data)
        }
    min.pairwise.n <- min(pairwise.n, na.rm = TRUE)
    cors <- if (weighted)
        CovarianceAndCorrelationMatrix(y.and.x, weights, TRUE, TRUE)
    else
        cor(y.and.x, use = "pairwise.complete.obs")
    # Checking data
    CheckForPositiveVariance(y.and.x)
    # psych function isCorrelation doesn't take into account machine-epsilon precision of the diagonal
    # terms; it checks max(x) <= 1, which is violated if any diagonal element is 1 + 1e-16
    diag(cors) <- 1
    CheckCorrelationMatrix(cors)
    CheckForLinearDependence(cors)
    # Doing the computation.
    original <- setCor(1, 2:ncol(cors), data = cors, n.obs = min.pairwise.n, plot = FALSE, z = NULL, ...)
    result$original <- original
    scaled.beta <- as.matrix(if (is.null(original$beta)) original$coefficients else  original$beta)
    sds <- StandardDeviation(y.and.x, weights)
    sds.independent <- sds[-1]
    result$original$sd.dependent <- sd.dependent <- sds[1]
    beta <- scaled.beta / (sds.independent / sd.dependent)
    se <- original$se / (sds.independent / sd.dependent)
    partial.coefs <- cbind(beta, se, original$t, original$Probability)
    dimnames(partial.coefs) <- list(variable.names[predictors.index],
                                    c("Estimate", "Std. Error", "t value", "Pr(>|t|)"))
    fitted <- as.matrix(data[, predictors.index]) %*% beta
    resid.valid <- !is.na(fitted) & complete.cases(data[, predictors.index])
    mean.y <- Mean(y[resid.valid], weights[resid.valid])
    mean.y.hat <- Mean(fitted[resid.valid], weights[resid.valid])
    intercept <- mean.y - mean.y.hat
    result$predicted.values <- result$fitted.values <- fitted + intercept
    partial.coefs <- partial.coefs[c(1,1:nrow(partial.coefs)),]
    partial.coefs[1,] <- c(intercept, NA, NA, NA)
    rownames(partial.coefs)[1] <- "(Intercept)"
    result$coef <- partial.coefs[, 1]
    result$subset <- !is.na(data[outcome.index]) & !is.na(fitted) & (rownames(data) %in% rownames(subset.data))
    # Sample description.
    result$n.observations <- n.total <- nrow(data)
    weight.label <- if(weighted <- !is.null(weights)) Labels(weights) else  ""
    rng <- range(pairwise.n[lower.tri(pairwise.n)], na.rm = TRUE)
    n.min <- rng[1]
    if (n.min == rng[2])
        description <- paste0("n = ", n.min,
                              " cases used in estimation\n")
    else
        description <- paste0("Pairwise correlations have been used to estimate this regression.\n",
                              "Sample sizes for the correlations range from ", n.min, " to ", rng[2], "")
    description <- BaseDescription(description,
                                   n.total, attr(subset, "n.subset"),
                                   n.min, Labels(subset), NULL, "")
    if (!is.null(weights))
        description <- paste0(description, " Standard errors and statistical tests use calibrated weights (",
                              weight.label, ").\n")
    result$sample.description <- description
    result$outcome.index <- outcome.index
    result$n.predictors <- length(predictors.index)
    result$estimation.data <- subset.data
    result$original$coefficients <- partial.coefs
    class(result) <- "RegressionCorrelations"
    result
}
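
# Usage sketch (kept as a comment so nothing runs at package load). The data
# frame and variable names below are hypothetical; the flip* and psych
# dependencies imported above are assumed to be installed.
# dat <- data.frame(y = rnorm(100), x1 = rnorm(100), x2 = rnorm(100))
# fit <- LinearRegressionFromCorrelations(y ~ x1 + x2, data = dat)
# fit$coef                 # unstandardized coefficients, including "(Intercept)"
# fit$sample.description   # text describing the pairwise sample sizes used
# summary(fit)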

#' @export
df.residual.RegressionCorrelations <- function(object, ...)
{
    object$original$df[2]
}

#' summary.RegressionCorrelations
#'
#' @param object Regression model.
#' @param ... Other arguments.
#' @method summary RegressionCorrelations
#' @importFrom xml2 read_xml
#' @export
summary.RegressionCorrelations <- function(object, ...)
{
    n <- object$estimation.data
    result <- list(coefficients = object$original$coefficients)
    p <- nrow(object$original$coefficients)
    result$sigma <- object$original$residual * object$original$sd.dependent
    result$df <- object$original$df[c(1:2,1)]
    result$fstatistic <- c(value = object$original$F, numdf = object$original$df[1], dendf = object$original$df[2])
    result$r.squared <- object$original$R2
    result$adj.r.squared <- object$original$shrunkenR2
    class(result) <- "RegressionCorrelationsSummary"
    result
}
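
# Sketch of how the summary components line up with the familiar summary.lm
# fields (assumes 'fit' is a RegressionCorrelations object as in the hypothetical
# sketch above):
# s <- summary(fit)
# s$coefficients   # Estimate, Std. Error, t value, Pr(>|t|)
# s$r.squared      # R2 reported by psych::setCor
# s$fstatistic     # c(value, numdf, dendf), as in summary.lm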

#' @importFrom stats sd
unstandardizeBetas <- function(scaled.betas, x, y)
{
    x.sd <- apply(x, 2, sd, na.rm = TRUE)
    y.sd <- sd(y, na.rm = TRUE)
    scaled.betas / (x.sd / y.sd)
}
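
# Worked check of the rescaling, beta = beta.standardized * sd(y) / sd(x)
# (a sketch with hypothetical simulated data; it compares against a standardized
# lm fit rather than anything specific to this package):
# set.seed(1)
# X <- matrix(rnorm(200), 100, 2, dimnames = list(NULL, c("x1", "x2")))
# y.vec <- X %*% c(2, -1) + rnorm(100)
# std.fit <- lm(scale(y.vec) ~ scale(X) - 1)
# unstandardizeBetas(coef(std.fit), X, y.vec)   # approximately c(2, -1)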


betasFromCorrelations <- function(R)
{
    p <- ncol(R) - 1
    indices <- 1:p
    R11 <- R[indices, indices, drop = FALSE]
    R12 <- R[indices, -indices, drop = FALSE]
    R21 <- R[-indices, indices, drop = FALSE]
    R22 <- R[-indices, -indices, drop = FALSE]
    b <- R21 %*% solve(R11)
    ESS <- R22 - b %*% t(R21)
    list(ESS = ESS, b = b)
}
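
# Sketch verifying the identity b = R_yx %*% solve(R_xx) against a standardized
# lm fit (hypothetical simulated data; the outcome must be the last column of R):
# set.seed(2)
# X <- matrix(rnorm(300), 100, 3)
# y.vec <- X %*% c(1, 0.5, -0.25) + rnorm(100)
# R <- cor(cbind(X, y.vec))
# betasFromCorrelations(R)$b               # 1 x 3 row of standardized betas
# coef(lm(scale(y.vec) ~ scale(X) - 1))    # matches up to floating-point error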

inverseBootstrap <- function(x, probabilities)
{
    n <- nrow(x)
    replicates <- sample.int(n, size = n, replace = TRUE, prob = probabilities)
    x[replicates, ]
}
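
# e.g., a weighted bootstrap resample of rows (hypothetical toy input):
# inverseBootstrap(cbind(a = 1:5, b = 6:10), probabilities = c(0.4, 0.3, 0.1, 0.1, 0.1))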

#' @importFrom stats sd cor
#' @importFrom verbs Sum
# Bootstrapping
regressionFromCorrelations <- function(y, x, weights = rep(1, nrow(x)), b = 999)
{
    data <- as.matrix(cbind(x, y))
    prob <- weights / Sum(weights, remove.missing = FALSE)
    betas <- matrix(NA, b, ncol(x))
    for (i in 1:b)
    {
        dat <- inverseBootstrap(data, prob)
        r <- cor(dat, use = "pairwise.complete.obs")
        # use a distinct name so the replicate count 'b' is not overwritten
        boot.beta <- betasFromCorrelations(r)$b
        betas[i, ] <- boot.beta
    }
    apply(betas, 2, sd)
}
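
# Usage sketch (hypothetical simulated data; 'verbs' must be available for Sum).
# Returns bootstrapped standard errors of the standardized coefficients:
# set.seed(3)
# X <- data.frame(x1 = rnorm(100), x2 = rnorm(100))
# y.vec <- X$x1 - X$x2 + rnorm(100)
# regressionFromCorrelations(y.vec, X, b = 199)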


# swp <- function(x, indices = 1:ncol(x))
# {
#     p <- ncol(x)
#     g <- x
#     result <- x
#     X11 <- x[indices, indices, drop = FALSE]
#     X12 <- x[indices, -indices, drop = FALSE]
#     X21 <- x[-indices, indices, drop = FALSE]
#     X22 <- x[-indices, -indices, drop = FALSE]
#     B <- X21 %*% solve(X11)
#     result[indices, indices] <- -solve(X11)
#     result[-indices, -indices] <- X22 - B %*% t(X21)
#     result[-indices, indices] <- B
#     result[indices, -indices] <- t(B)
#     result
# }
#
# swp(R, 1:6)
#
# C <-  matrix(c(9, 3,  4, -2,  5,3,  8,  6,  5 , 4, 4,  6,  7,  3,  1,-2,  5,  3 , 9,  2,5,  4,  1,  2,  8), ncol = 5, byrow = TRUE)
# swp(C, 1:3)

#
#
# setCor <- function (y, x, data, z = NULL, n.obs = NULL, use = "pairwise",
#           std = TRUE, square = FALSE, main = "Regression Models", plot = TRUE)
# {
#     require("psych")
#     cl <- match.call()
#     if (is.numeric(y))
#         y <- colnames(data)[y]
#     if (is.numeric(x))
#         x <- colnames(data)[x]
#     if (is.numeric(z))
#         z <- colnames(data)[z]
#     if ((dim(data)[1] != dim(data)[2]) | square) {
#         n.obs = dim(data)[1]
#         if (!is.null(z)) {
#             data <- data[, c(y, x, z)]
#         }
#         else {
#             data <- data[, c(y, x)]
#         }
#         if (!is.matrix(data))
#             data <- as.matrix(data)
#         if (!is.numeric(data))
#             stop("The data must be numeric to proceed")
#         C <- cov(data, use = use)
#         if (std) {
#             m <- cov2cor(C)
#             C <- m
#         }
#         else {
#             m <- C
#         }
#         raw <- TRUE
#     }
#     else {
#         raw <- FALSE
#         if (!is.matrix(data))
#             data <- as.matrix(data)
#         C <- data
#         if (std) {
#             m <- cov2cor(C)
#         }
#         else {
#             m <- C
#         }
#     }
#     nm <- dim(data)[1]
#     xy <- c(x, y)
#     numx <- length(x)
#     numy <- length(y)
#     numz <- 0
#     nxy <- numx + numy
#     m.matrix <- m[c(x, y), c(x, y)]
#     x.matrix <- m[x, x, drop = FALSE]
#     xc.matrix <- m[x, x, drop = FALSE]
#     xy.matrix <- m[x, y, drop = FALSE]
#     xyc.matrix <- m[x, y, drop = FALSE]
#     y.matrix <- m[y, y, drop = FALSE]
#     if (!is.null(z)) {
#         numz <- length(z)
#         zm <- m[z, z, drop = FALSE]
#         za <- m[x, z, drop = FALSE]
#         zb <- m[y, z, drop = FALSE]
#         x.matrix <- x.matrix - za %*% solve(zm) %*% t(za)
#         y.matrix <- y.matrix - zb %*% solve(zm) %*% t(zb)
#         xy.matrix <- xy.matrix - za %*% solve(zm) %*% t(zb)
#         m.matrix <- cbind(rbind(y.matrix, xy.matrix), rbind(t(xy.matrix),
#                                                             x.matrix))
#     }
#     if (numx == 1) {
#         beta <- matrix(xy.matrix, nrow = 1)/x.matrix[1, 1]
#     }
#     else {
#         beta <- solve(x.matrix, xy.matrix)
#         beta <- as.matrix(beta)
#     }
#     yhat <- t(xy.matrix) %*% solve(x.matrix) %*% (xy.matrix)
#     resid <- y.matrix - yhat
#     if (numy > 1) {
#         if (is.null(rownames(beta))) {
#             rownames(beta) <- x
#         }
#         if (is.null(colnames(beta))) {
#             colnames(beta) <- y
#         }
#         R2 <- colSums(beta * xy.matrix)/diag(y.matrix)
#     }
#     else {
#         colnames(beta) <- y
#         R2 <- Sum(beta * xy.matrix)/y.matrix
#         R2 <- matrix(R2)
#         rownames(beta) <- x
#         rownames(R2) <- colnames(R2) <- y
#     }
#     px <- principal(x.matrix)
#     keys.x <- diag(as.vector(1 - 2 * (px$loadings < 0)))
#     py <- principal(y.matrix)
#     keys.y <- diag(as.vector(1 - 2 * (py$loadings < 0)))
#     Vx <- Sum(keys.x %*% x.matrix %*% t(keys.x))
#     Vy <- Sum(keys.y %*% y.matrix %*% t(keys.y))
#     ruw <- colSums(abs(xy.matrix))/sqrt(Vx)
#     Ruw <- Sum(diag(keys.x) %*% xy.matrix %*% t(keys.y))/sqrt(Vx *
#                                                                   Vy)
#     if (numy < 2) {
#         Rset <- 1 - det(m.matrix)/(det(x.matrix))
#         Myx <- solve(x.matrix) %*% xy.matrix %*% t(xy.matrix)
#         cc2 <- cc <- T <- NULL
#     }
#     else {
#         if (numx < 2) {
#             Rset <- 1 - det(m.matrix)/(det(y.matrix))
#             Myx <- xy.matrix %*% solve(y.matrix) %*% t(xy.matrix)
#             cc2 <- cc <- T <- NULL
#         }
#         else {
#             Rset <- 1 - det(m.matrix)/(det(x.matrix) * det(y.matrix))
#             if (numy > numx) {
#                 Myx <- solve(x.matrix) %*% xy.matrix %*% solve(y.matrix) %*%
#                     t(xy.matrix)
#             }
#             else {
#                 Myx <- solve(y.matrix) %*% t(xy.matrix) %*% solve(x.matrix) %*%
#                     (xy.matrix)
#             }
#         }
#         cc2 <- eigen(Myx)$values
#         cc <- sqrt(cc2)
#         T <- Sum(cc2)/length(cc2)
#     }
#     if (!is.null(n.obs)) {
#         k <- length(x)
#         uniq <- (1 - smc(x.matrix))
#         se.beta <- list()
#         for (i in 1:length(y)) {
#             df <- n.obs - k - 1
#             se.beta[[i]] <- (sqrt((1 - R2[i])/(df)) * sqrt(1/uniq))
#         }
#         se <- matrix(unlist(se.beta), ncol = length(y))
#         colnames(se) <- colnames(beta)
#         rownames(se) <- rownames(beta)
#         se <- t(t(se) * sqrt(diag(C)[y]))/sqrt(diag(xc.matrix))
#         tvalue <- beta/se
#         prob <- 2 * (1 - pt(abs(tvalue), df))
#         SE2 <- 4 * R2 * (1 - R2)^2 * (df^2)/((n.obs^2 - 1) *
#                                                  (n.obs + 3))
#         SE = sqrt(SE2)
#         F <- R2 * df/(k * (1 - R2))
#         pF <- 1 - pf(F, k, df)
#         shrunkenR2 <- 1 - (1 - R2) * (n.obs - 1)/df
#         u <- numx * numy
#         m1 <- n.obs - max(numy, (numx + numz)) - (numx + numy +
#                                                       3)/2
#         s <- sqrt((numx^2 * numy^2 - 4)/(numx^2 + numy^2 - 5))
#         if (numx * numy == 4)
#             s <- 1
#         v <- m1 * s + 1 - u/2
#         R2set.shrunk <- 1 - (1 - Rset) * ((v + u)/v)^s
#         L <- 1 - Rset
#         L1s <- L^(-1/s)
#         Rset.F <- (L1s - 1) * (v/u)
#         df.m <- n.obs - max(numy, (numx + numz)) - (numx + numy +
#                                                         3)/2
#         s1 <- sqrt((numx^2 * numy^2 - 4)/(numx^2 + numy^2 - 5))
#         if (numx^2 * numy^2 < 5)
#             s1 <- 1
#         df.v <- df.m * s1 + 1 - numx * numy/2
#         Chisq <- -(n.obs - 1 - (numx + numy + 1)/2) * log((1 -
#                                                                cc2))
#     }
#     if (is.null(n.obs)) {
#         set.cor <- list(beta = beta, R = sqrt(R2), R2 = R2, Rset = Rset,
#                         T = T, cancor = cc, cancor2 = cc2, raw = raw, residual = resid,
#                         ruw = ruw, Ruw = Ruw, x.matrix = x.matrix, y.matrix = y.matrix,
#                         Call = cl)
#     }
#     else {
#         set.cor <- list(beta = beta, se = se, t = tvalue, Probability = prob,
#                         R = sqrt(R2), R2 = R2, shrunkenR2 = shrunkenR2, seR2 = SE,
#                         F = F, probF = pF, df = c(k, df), Rset = Rset, Rset.shrunk = R2set.shrunk,
#                         Rset.F = Rset.F, Rsetu = u, Rsetv = df.v, T = T,
#                         cancor = cc, cancor2 = cc2, Chisq = Chisq, raw = raw,
#                         residual = resid, ruw = ruw, Ruw = Ruw, x.matrix = x.matrix,
#                         y.matrix = y.matrix, Call = cl)
#     }
#     class(set.cor) <- c("psych", "setCor")
#     if (plot)
#         setCor.diagram(set.cor, main = main)
#     return(set.cor)
# }