R/uniform_coordinate.R

#' Coordinate ascent algorithm for normal likelihood and mixtures of uniforms.
#'
#'
#' @inheritParams succotash_unif_fixed
#' @inheritParams uniform_succ_given_alpha
#' @param likelihood Can be \code{"normal"} or \code{"t"}.
#' @param df A positive numeric. The degrees of freedom if the
#'     likelihood is t.
#'
#'
#'
fit_succotash_unif_coord <- function(pi_Z, lambda, alpha, Y, a_seq, b_seq, sig_diag,
                                     print_ziter = FALSE, newt_itermax = 100, tol = 10 ^ -4,
                                     var_scale = TRUE,
                                     likelihood = c("normal", "t"), df = NULL) {

    likelihood = match.arg(likelihood, c("normal", "t"))
    if (likelihood == "normal") {
        df <- 1000
    } else if (likelihood == "t" & is.null(df)) {
        stop("t likelihood specified but df is null")
    } else if (likelihood != "t" & likelihood != "normal") {
        stop("needs to be either a t likelihood or a normal likelihood")
    }

    M <- length(a_seq) + length(b_seq) + 1
    ## p <- nrow(Y)
    if (var_scale) {
        k         <- length(pi_Z) - M - 1
        scale_val <- pi_Z[length(pi_Z)]
    } else {
        k         <- length(pi_Z) - M
        scale_val <- 1
    }
    pi_old <- pi_Z[1:M]
    if (k != 0) {
        Z_old <- matrix(pi_Z[(M + 1):(M + k)], nrow = k)
    }

    assertthat::are_equal(length(lambda), M)
    assertthat::are_equal(sum(pi_old), 1)
    assertthat::assert_that(all(a_seq < 0))
    assertthat::assert_that(all(b_seq > 0))


    Z_new <- Z_old
    pi_new <- pi_old

    llike_new <- succotash_llike_unif(pi_Z = pi_Z, lambda = lambda,
                                      alpha = alpha, Y = Y, a_seq = a_seq, b_seq = b_seq,
                                      sig_diag = sig_diag, var_scale = var_scale,
                                      likelihood = likelihood, df = df)

    llike_vec <- llike_new

    err <- tol + 1
    iter_index <- 1
    while (err > tol & iter_index < newt_itermax) {
        llike_old <- llike_new
        pi_Z_old  <- pi_Z
        Z_old     <- Z_new
        pi_old    <- pi_new
        ## update Z with newton step --------------------------------------------------
        optim_out <- stats::optim(par = Z_new, fn = only_Z,
                                  gr = only_Z_grad,
                                  scale_val = scale_val,
                                  pi_vals = pi_new, lambda = lambda,
                                  alpha = alpha, Y = Y, a_seq = a_seq,
                                  b_seq = b_seq, sig_diag = sig_diag,
                                  likelihood = likelihood, df = df,
                                  method = "BFGS",
                                  control = list(fnscale = -1, maxit = 50, reltol = 10 ^ -4))
        Z_new <- optim_out$par

        ## optim_out1 <- optim::optim(Z_new, fn = only_Z, scale_val = scale_val, pi_vals = pi_new,
        ##                    lambda = lambda, alpha = alpha, Y = Y, a_seq = a_seq,
        ##                    method = "BFGS",
        ##                    b_seq = b_seq, sig_diag = sig_diag, control = list(fnscale = -1))
        if (var_scale) {
           pi_Z <- c(pi_new, optim_out$par, scale_val)
        } else {
           pi_Z <- c(pi_new, optim_out$par)
        }
        ## Z_new <- optim_out$par


        ## cat("llike after Z:", optim_out$value, "\n")
        llike_new <- optim_out$value
        llike_vec <- c(llike_vec, llike_new)

        assertthat::are_equal(pi_Z_old[1:M], pi_Z[1:M])
        assertthat::are_equal(sum(pi_new), 1)

        ## update pi with ashr -------------------------------------------------------
        betahat <- Y - alpha %*% Z_new
        sebetahat <- sqrt(sig_diag * scale_val)

        g <- ashr::unimix(pi_new, c(a_seq, rep(0, length(b_seq) + 1)),
                         c(rep(0, length(a_seq) + 1), b_seq))

        if (requireNamespace(package = "Rmosek", quietly = TRUE)) {
            optmethod <- "mixIP"
        } else {
            optmethod <- "mixEM"
        }
        control.default <- list(K = 1, method = 3, square = TRUE,
                               step.min0 = 1, step.max0 = 1, mstep = 4, kr = 1, objfn.inc = 1,
                               tol = 1e-07, maxiter = 500, trace = FALSE)
        ash_out <- ashr:::estimate_mixprop(betahat = c(betahat), sebetahat = sebetahat,
                                           g = g, prior = lambda, null.comp = length(a_seq + 1),
                                           optmethod = optmethod, control = control.default,
                                           df = df)

        ## ashr subtracts off the largest value of the loglikelihood
        ## to increase numerical stability during the optimization
        ## step, so the log-likelihood will be artificially smaller
        ## than what I calculate.

        pi_new <- ash_out$g$pi

        assertthat::are_equal(sum(pi_new), 1)
        if (var_scale) {
            pi_Z <- c(pi_new, Z_new, scale_val)
        } else {
            pi_Z <- c(pi_new, Z_new)
        }

        ## llike_new <- succotash_llike_unif(pi_Z = pi_Z, lambda = lambda,
        ##                                   alpha = alpha, Y = Y, a_seq = a_seq, b_seq = b_seq,
        ##                                   sig_diag = sig_diag, var_scale = var_scale,
        ##                                   likelihood = likelihood, df = df)

        ## cat("llike after ash:", llike_new, "\n")
        ## cat(" llike ash says:", ash_out$loglik, "\n")


        ## Update scale_val with Brent's method --------------------------------------

        if (var_scale) {
            oout <- stats::optim(par = scale_val, fn = only_Z,
                                 Z = Z_new, pi_vals = pi_new,
                                 lambda = lambda, alpha = alpha,
                                 Y = Y, a_seq = a_seq, b_seq = b_seq,
                                 sig_diag = sig_diag, likelihood = likelihood,
                                 df = df,
                                 method = "Brent", lower = 0,
                                 upper = 10,
                                 control = list(fnscale = -1))
            pi_Z <- c(pi_Z[1:(length(pi_Z) - 1)], oout$par)
            scale_val <- oout$par
        }


        err <- sum(abs(pi_Z - pi_Z_old))
        ## err <- abs(llike_new / llike_old - 1)
        ## cat("err:", err, "\n")
    }

    return(list(pi_Z = pi_Z, llike_vec = llike_vec))
}

#' Wrapper for \code{\link{succotash_llike_unif}} but useful for optim where only update Z.
#'
#' @param Z A k by 1 matrix of numerics. The confounders.
#' @param pi_vals A M vector of numerics. The mixing probs.
#' @param scale_val A positive numeric. The variance inflation parameter.
#' @inheritParams succotash_llike_unif
#'
only_Z <- function(Z, pi_vals, scale_val, lambda, alpha, Y, a_seq, b_seq, sig_diag,
                   likelihood = "normal", df = NULL) {
    pi_Z <- c(pi_vals, Z, scale_val)
    llike <- succotash_llike_unif(pi_Z = pi_Z, lambda = lambda,
                                  alpha = alpha, Y = Y, a_seq = a_seq,
                                  b_seq = b_seq, sig_diag = sig_diag,
                                  var_scale = TRUE,
                                  likelihood = likelihood,
                                  df = df)
    return(llike)
}

#' Wrapper for \code{\link{unif_grad_simp}}, but useful for optim where only update Z.
#'
#' @inheritParams only_Z
#'
only_Z_grad <- function(Z, pi_vals, scale_val, lambda, alpha, Y, a_seq, b_seq, sig_diag,
                        likelihood = "normal", df = NULL) {
  pi_Z <- c(pi_vals, Z, scale_val)
  grad_final <- unif_grad_simp(pi_Z = pi_Z, lambda = lambda,
                               alpha = alpha, Y = Y,
                               sig_diag = sig_diag, a_seq = a_seq,
                               b_seq = b_seq, var_scale = TRUE,
                               likelihood = likelihood, df = df)
  return(grad_final)
}

#' My first attempt at calculating the gradient of the likelihood
#' function using uniform mixtures wrt Z.
#'
#' This doesn't seem to work too well.
#'
#' @inheritParams succotash_llike_unif
#'
unif_grad_llike <- function(pi_Z, lambda, alpha, Y, a_seq, b_seq, sig_diag,
                                 var_scale = TRUE) {
    M <- length(a_seq) + length(b_seq) + 1
    ## p <- nrow(Y)
    if (var_scale) {
        k <- length(pi_Z) - M - 1
        scale_val <- pi_Z[length(pi_Z)]
    } else {
        k <- length(pi_Z) - M
        scale_val <- 1
    }
    pi_current <- pi_Z[1:M]
    if (k != 0) {
        Z_current <- matrix(pi_Z[(M + 1):(M + k)], nrow = k)
    }

    assertthat::are_equal(length(lambda), M)
    assertthat::assert_that(all(lambda >= 1))
    assertthat::assert_that(scale_val > 0)
    assertthat::are_equal(sum(pi_current), 1, tol = 10 ^ -4)

    sig_diag <- sig_diag * scale_val

    az <- alpha %*% Z_current

    left_means <- diag(1 / sqrt(sig_diag)) %*% outer(c( (Y - az)), a_seq, "-")
    left_means_zero <- diag(1 / sqrt(sig_diag)) %*% outer(c( (Y - az)), rep(0, length(a_seq)), "-")
    right_means <- diag(1 / sqrt(sig_diag)) %*% outer(c( (Y - az)), b_seq, "-")
    right_means_zero <- diag(1 / sqrt(sig_diag)) %*% outer(c( (Y - az)), rep(0, length(b_seq)), "-")
    zero_means <- stats::dnorm(Y, mean = az, sd = sqrt(sig_diag))

    left_ispos <- left_means > 0
    right_ispos <- right_means > 0

    pnorm_left_diff <- matrix(NA, ncol = ncol(left_means), nrow = nrow(left_means))
    pnorm_right_diff <- matrix(NA, ncol = ncol(right_means), nrow = nrow(right_means))

    pnorm_left_diff[!left_ispos] <-
        stats::pnorm(left_means[!left_ispos]) - stats::pnorm(left_means_zero[!left_ispos])
    pnorm_right_diff[!right_ispos] <-
        stats::pnorm(right_means_zero[!right_ispos]) - stats::pnorm(right_means[!right_ispos])

    pnorm_left_diff[left_ispos] <-
        stats::pnorm(-1 * left_means_zero[left_ispos]) - stats::pnorm(-1 * left_means[left_ispos])
    pnorm_right_diff[right_ispos] <-
        stats::pnorm(-1 * right_means[right_ispos]) - stats::pnorm(-1 * right_means_zero[right_ispos])

    pnorm_left_diff  <- t(t(pnorm_left_diff) / a_seq)
    pnorm_right_diff <- t(t(pnorm_right_diff) / b_seq)

    fkj <- abs(cbind(pnorm_left_diff,
                     zero_means,
                     pnorm_right_diff))

    denom_grad <- rowSums(t(t(fkj) * pi_current))
    ## fkj %*% diag(pi_current)

    mid_part <- zero_means * (Y - az) / sig_diag


    dnorm_left_diff <- stats::dnorm(left_means_zero) - stats::dnorm(left_means)
    dnorm_right_diff <- stats::dnorm(right_means) - stats::dnorm(right_means_zero)

    ##diag(1 / sig_diag) %*% dnorm_left_diff %*% diag(1 / a_seq)
    left_part <- t(t(1 / sqrt(sig_diag) * dnorm_left_diff) / a_seq)
    right_part <- t(t(1 / sqrt(sig_diag) * dnorm_right_diff) / b_seq)

    top_mult <- rowSums(t(t(cbind(left_part, mid_part, right_part)) * pi_current))

    ## t(t(cbind(left_part, mid_part, right_part)) * pi_current) -
    ##      cbind(left_part, mid_part, right_part) %*% diag(pi_current)

    mult_vals <- top_mult / denom_grad

    gradient_val <- colSums(mult_vals * alpha)
    ## diag(mult_vals) %*% alpha

    ## cat(gradient_val, "\n\n")

    if (var_scale) {
        augmented_grad <- c(rep(0, length = length(pi_current)), gradient_val, 0)
    } else {
        augmented_grad <- c(rep(0, length = length(pi_current)), gradient_val)
    }

    return(augmented_grad)
}


#' Me being as careful as possible when calculating the succotash log-likelihood.
#'
#' @param Y a p by 1 matrix of numerics.
#' @param Z a k by 1 matrix of numerics.
#' @param alpha A p by k matrix of numerics.
#' @param sig_diag A p-vector of numerics.
#' @param scale_val A positivie numeric.
#' @param left_seq The left endpoints of the uniforms.
#' @param right_seq The right endpoints of the uniforms
#' @param pi_vals A vector of non-negative numerics that sum to
#'     one. The mixing proportions.
#' @param likelihood Can be \code{"normal"} or \code{"t"}.
#' @param df A positive numeric. The degrees of freedom if the
#'     likelihood is t.
#'
llike_unif_simp <- function(Y, Z, pi_vals, alpha, sig_diag, left_seq, right_seq,
                            scale_val = 1, likelihood = c("normal", "t"), df = NULL) {
    sig_diag <- sig_diag * scale_val

    likelihood = match.arg(likelihood, c("normal", "t"))
    if (likelihood == "normal") {
        df <- Inf
    } else if (likelihood == "t" & is.null(df)) {
        stop("t likelihood specified but df is null")
    }

    p <- nrow(Y)
    ## k <- nrow(Z)
    M <- length(left_seq)
    assertthat::are_equal(M, length(right_seq))

    null_spot <- which(abs(left_seq) < 10 ^ -12 & abs(right_seq) < 10 ^ -12)


    resid_vec <- Y - alpha %*% Z

    mean_mat_left  <- outer(c(resid_vec), left_seq, "-")
    mean_mat_right <- outer(c(resid_vec), right_seq, "-")

    fkj_mat <- matrix(NA, nrow = p, ncol = M)

    var_mat <- matrix(rep(sqrt(sig_diag), M - length(null_spot)), nrow = p, byrow = FALSE)

    pnorm_diff <- pt_wrap(x = mean_mat_left[, -null_spot], df = df, mean = 0, sd = var_mat) -
        pt_wrap(x = mean_mat_right[, -null_spot], df = df, mean = 0, sd = var_mat)

    pnorm_ratio <- pnorm_diff %*% diag(1 / (right_seq[-null_spot] - left_seq[-null_spot]))


    fkj_mat[, -null_spot] <- pnorm_ratio

    assertthat::are_equal(mean_mat_left[, null_spot], mean_mat_right[, null_spot])

    fkj_mat[, null_spot] <- dt_wrap(x = mean_mat_left[, null_spot],
                                    df = df, mean = 0,
                                    sd = sqrt(sig_diag))

    mat_lik <- fkj_mat %*% diag(pi_vals)

    llike <- sum(log(rowSums(mat_lik)))
    return(llike)
}

#' This is me being very careful about calculating the gradient of the
#' likelihood function wrt Z.
#'
#' @inheritParams succotash_llike_unif
#' @inheritParams llike_unif_simp
#'
#' @return The gradient for Z.
#'
unif_grad_simp <- function(pi_Z, lambda, alpha, Y,  sig_diag,
                           left_seq = NULL, right_seq = NULL,
                           a_seq = NULL, b_seq = NULL,
                           var_scale = TRUE,
                           likelihood = c("normal", "t"), df = NULL) {

    assertthat::assert_that(!( (is.null(left_seq) | is.null(right_seq)) &
                              (is.null(a_seq) | is.null(b_seq))))

    likelihood = match.arg(likelihood, c("normal", "t"))
    if (likelihood == "normal") {
        df <- 1000
    } else if (likelihood == "t" & is.null(df)) {
        stop("t likelihood specified but df is null")
    }

    if (is.null(left_seq) | is.null(right_seq)) {
        left_seq <- c(a_seq, rep(0, length = length(b_seq) + 1))
        right_seq <- c(rep(0, length = length(a_seq) + 1), b_seq)
    }

    M <- length(left_seq)
    p <- nrow(Y)
    if (var_scale) {
        k <- length(pi_Z) - M - 1
        scale_val <- pi_Z[length(pi_Z)]
    } else {
        k <- length(pi_Z) - M
        scale_val <- 1
    }
    pi_current <- pi_Z[1:M]
    if (k != 0) {
        Z_current <- matrix(pi_Z[(M + 1):(M + k)], nrow = k)
    }

    assertthat::are_equal(length(left_seq), length(right_seq))
    assertthat::are_equal(length(lambda), M)
    assertthat::assert_that(all(lambda >= 1))
    assertthat::assert_that(scale_val > 0)
    assertthat::are_equal(sum(pi_current), 1, tol = 10 ^ -4)

    sig_diag <- sig_diag * scale_val

    az <- alpha %*% Z_current

    resid_vec <- Y - az

    null_spot <- which(abs(left_seq) < 10 ^ -12 & abs(right_seq) < 10 ^ -12)

    left_means  <- matrix(rep(left_seq, p), nrow = p, byrow = TRUE)
    right_means <- matrix(rep(right_seq, p), nrow = p, byrow = TRUE)

    quants <- matrix(rep(resid_vec, M), nrow = p, byrow = FALSE)
    sd_mat <- matrix(rep(sig_diag, M), nrow = p, byrow = FALSE)

    dnorm_mat_left  <- dt_wrap(x = quants, df = df, mean = left_means, sd = sd_mat)
    dnorm_mat_right <- dt_wrap(x = quants, df = df, mean = right_means, sd = sd_mat)

    dnorm_diff_mat <- dnorm_mat_right[, -null_spot] - dnorm_mat_left[, -null_spot]

    dnorm_diff_mat_scaled <- matrix(NA, nrow = p, ncol = M)
    dnorm_diff_mat_scaled[, -null_spot] <-
        dnorm_diff_mat %*% diag(1 / (right_seq[-null_spot] - left_seq[-null_spot]))

    assertthat::are_equal(dnorm_mat_left[, null_spot], dnorm_mat_right[, null_spot])

    if (likelihood == "normal") {
        dnorm_diff_mat_scaled[, null_spot] <- dnorm_mat_left[, null_spot] * (resid_vec / sig_diag)
    } else if (likelihood == "t") {
        dnorm_diff_mat_scaled[, null_spot] <- dnorm_mat_left[, null_spot] *
            (resid_vec / sig_diag) * (1 +  (resid_vec) ^ 2 / (sig_diag * df)) ^ -1 * (df + 1) / df
    } else {
        stop("likelihood not normal or t")
    }

    dnorm_diff_mat_pluspi <- dnorm_diff_mat_scaled %*% diag(pi_current)

    top_vec <- rowSums(dnorm_diff_mat_pluspi)


    ## Now do likelihood for bottom part ----------------------------------------------------
    mean_mat_left  <- outer(c(resid_vec), left_seq, "-")
    mean_mat_right <- outer(c(resid_vec), right_seq, "-")

    fkj_mat <- matrix(NA, nrow = p, ncol = M)

    var_mat <- matrix(rep(sqrt(sig_diag), M - length(null_spot)), nrow = p, byrow = FALSE)
    pnorm_diff <- pt_wrap(x = mean_mat_left[, -null_spot], df = df, mean = 0, sd = var_mat) -
        pt_wrap(x = mean_mat_right[, -null_spot], df = df, mean = 0, sd = var_mat)

    pnorm_ratio <- pnorm_diff %*% diag(1 / (right_seq[-null_spot] - left_seq[-null_spot]))


    fkj_mat[, -null_spot] <- pnorm_ratio

    assertthat::are_equal(mean_mat_left[, null_spot], mean_mat_right[, null_spot])

    fkj_mat[, null_spot] <- dt_wrap(x = mean_mat_left[, null_spot],
                                    df = df, mean = 0,
                                    sd = sqrt(sig_diag))

    mat_lik <- fkj_mat %*% diag(pi_current)

    bottom_vec <- rowSums(mat_lik)

    ## Now combine -------------------------------------------------------------------------

    weights_vec <- top_vec / bottom_vec

    weighted_alpha <- diag(weights_vec) %*% alpha

    grad_final <- colSums(weighted_alpha)

    ## if (var_scale) {
    ##     augmented_grad <- c(rep(0, length = M), grad_final, 0)
    ## } else {
    ##     augmented_grad <- c(rep(0, length = M), grad_final)
    ## }

    return(grad_final)
}


#' Wrapper for dt with a non-zero mena and non-1 standard deviation.
#'
#' @param x A numeric. Where to evaluate the density function.
#' @param df A positive numeric. The degrees of freedom.
#' @param mean A numeric. The mean of the t. Defaults to 0.
#' @param sd A positive numeric. The standard deviation of the
#'     t. Defaults to 1.
dt_wrap <- function(x, df, mean = 0, sd = 1) {
    x_new <- (x - mean) / sd
    dval <- stats::dt(x_new, df = df) / sd
    return(dval)
}

#' Wrapper for pt with a non-zero mena and non-1 standard deviation.
#'
#' @param x A numeric. Where to evaluate the cdf.
#' @inheritParams dt_wrap
#'
pt_wrap <- function(x, df, mean = 0, sd = 1) {
    x_new <- (x - mean) / sd
    pval <- stats::pt(x_new, df = df)
    return(pval)
}
dcgerard/succotashr documentation built on May 15, 2019, 1:25 a.m.