R/locfdrFit.R

## locfdrFit():'
## arguments: for description of arguments, see locfdr() - exactly the same.
## return:  see locfdr() for many of the return elements, but this function has additional ones:
## --yt: heights of pink histogram bars that appear on the plots (i.e., heights of alt. density's histogram)
## --x: locations of pinkfl histogram bars that appear on the plots (locations of alt. density's histogram)
## --mlest.lo and mlest.hi: if the function outputs a warning message saying "please re-run with mlest parameters = ...", said parameters are returned in these variables
## --needsfix: should the function advise you to re-run with different mlest parameters, needsfix will be 1, otherwise needsfix = 0.
## --nulldens: y-values of estimated null distribution density
## --fulldens: y-values of estimated full (mixture) density



#'Modified version of Efron's locfdr
#'
#'Compute local false discovery rates, following the definitions and
#'description in references listed below. Exactly the same as \code{locfdr},
#'but returns extra information.
#'
#'Generally not used directly in the tornado package, but is a workhorse for
#'\code{getParams}.
#'
#'@param zz A vector of summary statistics, one for each case under
#'simultaneous consideration. The calculations assume a large number of cases,
#'say length(zz) exceeding 200. Results may be improved by transforming zz so
#'that its elements are theoretically distributed as N(0, 1) under the null
#'hypothesis. See the \code{locfdr} vignette for tips on creating \code{zz}.
#'@param bre Number of breaks in the discretization of the z-score axis, or a
#'vector of breakpoints fully describing the discretization. If length(zz) is
#'small, such as when the number of cases is less than about 1000, set
#'\code{bre} to a number lower than the default of 120. The tornado package
#'keeps this at its default.
#'@param df Degrees of freedom for fitting the estimated density f(z).  The
#'tornado package keeps this at its default.
#'@param pct Excluded tail proportions of zz's when fitting f(z). \code{pct=0}
#'includes full range of zz's. \code{pct} can also be a 2-vector, describing
#'the fitting range. The tornado package keeps this at its default.
#'@param pct0 Proportion of the zz distribution used in fitting the null density
#'f0(z) by central matching. If a 2-vector, e.g. \code{pct0=c(0.25,0.60)}, the
#'range \code{[pct0[1], pct0[2]]} is used. If a scalar, \code{[pct0, 1-pct0]}
#'is used. The tornado package keeps this at its default.
#'@param nulltype Type of null hypothesis assumed in estimating f0(z), for use
#'in the fdr calculations. 0 is the theoretical null N(0; 1), 1 is maximum
#'likelihood estimation, 2 is central matching estimation, 3 is a split normal
#'version of 2. The tornado package fixes this at 1.
#'@param type Type of fitting used for f; 0 is a natural spline, 1 is a
#'polynomial, in either case with degrees of freedom \code{df} [so total
#'degrees of freedom including the intercept is \code{df+1}.] The tornado
#'package fixes this at 0.
#'@param plot Plots desired. 0 gives no plots. 1 gives single plot showing the
#'histogram of zz and fitted densities f and p0 f0. 2 also gives plot of fdr,
#'and the right and left tail area Fdr curves. 3 gives instead the f1 cdf of
#'the estimated fdr curve; plot=4 gives all three plots. The tornado package
#'allows choices 0 and 1; equivalent to \code{plots = F} and \code{plots = T}
#'in \code{getParams}.
#'@param mult Optional scalar multiple (or vector of multiples) of the sample
#'size for calculation of the corresponding hypothetical Efdr value(s). This is
#'not used in the tornado package.
#'@param mlests Optional vector of initial values for (delta0, sigma0) in the
#'maximum likelihood iteration. The tornado package includes these values in an
#'updated run of \code{locfdrFit} if they are suggested via warning in the
#'first run.
#'@param main Main heading for the histogram plot when \code{plot}>0.
#'@param sw Determines the type of output desired. 2 gives a list consisting of
#'the last 5 values listed under Value below. 3 gives the square matrix of
#'dimension \code{bre}-1 representing the influence function of log(fdr). Any
#'other value of sw returns a list consisting of the first 5 (6 if \code{mult}
#'is supplied) values listed below. The tornado package fixes this at 0.
#'@param verbose if TRUE, various messages are printed onscreen during
#'\code{getParams}.
#'@return See the \code{locfdr} manual for returned values. \code{locfdrFit}
#'returns the following additional elements:
#'\item{yt }{bin heights}
#'\item{mlest.lo }{if a warning that \code{mlest} should be used in a re-run of \code{locfdrFit}, the suggested first element of \code{mlest}.}
#'\item{mlest.hi }{if a warning that \code{mlest} should be used in a re-run of \code{locfdrFit}, the suggested second element of \code{mlest}.}
#'\item{needsfix }{0 if no warning to fix the run with \code{mlest} parameters, 1 otherwise}
#'\item{nulldens }{a rough estimate of y-axis values for f0(x)}
#'\item{fulldens }{a rough estimate of y-axis values for f(x)}
#'@author Bradley Efron, slight modifications by Alyssa Frazee
#'@export
#'@seealso \code{getParams}
#'@references http://cran.r-project.org/web/packages/locfdr/locfdr.pdf
locfdrFit <- function (zz, bre = 120, df = 7, pct = 0, pct0 = 1/4, nulltype = 1, type = 0, plot = 1, mult, mlests, main = " ", sw = 0, verbose=T) {
	require(locfdr)
	
	locmle = locfdr:::locmle	
	loccov2 = locfdr:::loccov2
	loccov = locfdr:::loccov 
	mlest.lo <- mlest.hi <- yt <- x <- NULL
	needsfix <- 0
    call = match.call()
    if (length(bre) > 1) {
        lo <- min(bre)
        up <- max(bre)
        bre <- length(bre)
    }
    else {
        if (length(pct) > 1) {
            lo <- pct[1]
            up <- pct[2]
        }
        else {
            if (pct == 0) {
                lo <- min(zz)
                up <- max(zz)
            }
            if (pct < 0) {
                med = median(zz)
                ra = med + (1 - pct) * (range(zz) - med)
                lo = ra[1]
                up = ra[2]
            }
            if (pct > 0) {
                v <- quantile(zz, c(pct, 1 - pct))
                lo <- v[1]
                up <- v[2]
            }
        }
    }
    zzz <- pmax(pmin(zz, up), lo)
    breaks <- seq(lo, up, length = bre)
    zh <- hist(zzz, breaks = breaks, plot = F)
    x <- (breaks[-1] + breaks[-length(breaks)])/2
    yall <- y <- zh$counts
    K <- length(y)
    N <- length(zz)
    if (pct > 0) {
        y[1] <- min(y[1], 1)
        y[K] <- min(y[K], 1)
    }
    if (type == 0) {
        X <- cbind(1, ns(x, df = df))
        f <- glm(y ~ ns(x, df = df), poisson)$fit
    }
    else {
        X <- cbind(1, poly(x, df = df))
        f <- glm(y ~ poly(x, df = df), poisson)$fit
    }
    fulldens <- f
    l <- log(f)
    Fl <- cumsum(f)
    Fr <- cumsum(rev(f))
    D <- (y - f)/(f + 1)^0.5
    D <- sum(D[2:(K - 1)]^2)/(K - 2 - df)
    if (D > 1.5) 
        if(verbose){warning(paste("f(z) misfit = ", round(D, 1), ".  Rerun with increased df", 
            sep = ""))}
    if (nulltype == 3) {
        fp0 = matrix(NA, 6, 4)
        colnames(fp0) = c("delta", "sigleft", "p0", "sigright")
    }
    else {
        fp0 = matrix(NA, 6, 3)
        colnames(fp0) = c("delta", "sigma", "p0")
    }
    rownames(fp0) = c("thest", "theSD", "mlest", "mleSD", "cmest", 
        "cmeSD")
    fp0["thest", 1:2] = c(0, 1)
    fp0["theSD", 1:2] = 0
    imax <- seq(l)[l == max(l)][1]
    xmax <- x[imax]
    if (length(pct0) == 1) {
        pctup <- 1 - pct0
        pctlo <- pct0
    }
    else {
        pctlo <- pct0[1]
        pctup <- pct0[2]
    }
    lo0 <- quantile(zz, pctlo)
    hi0 <- quantile(zz, pctup)
    nx <- length(x)
    i0 <- (1:nx)[x > lo0 & x < hi0]
    x0 <- x[i0]
    y0 <- l[i0]
    if (nulltype == 3) {
        X00 <- cbind((x0 - xmax)^2, pmax(x0 - xmax, 0)^2)
    }
    else {
        X00 <- cbind(x0 - xmax, (x0 - xmax)^2)
    }
    lr <- lm(y0 ~ X00)
    co <- lr$coef
    if (nulltype == 3) {
        cmerror = I(is.na(co[3]) | is.na(co[2]))
        if (!cmerror) 
            cmerror = I(co[2] >= 0 | co[2] + co[3] >= 0)
    }
    else {
        cmerror = is.na(co[3])
        if (!cmerror) 
            cmerror = I(co[3] >= 0)
    }
    if (cmerror) {
        if (nulltype == 3) 
            stop("CM estimation failed.  Rerun with nulltype = 1 or 2.")
        else if (nulltype == 2) 
            stop("CM estimation failed.  Rerun with nulltype = 1.")
        else {
            X0 <- cbind(1, x - xmax, (x - xmax)^2)
            warning("CM estimation failed, middle of histogram non-normal")
        }
    }
    else {
        if (nulltype == 3) {
            X0 <- cbind(1, (x - xmax)^2, pmax(x - xmax, 0)^2)
            sigs <- 1/sqrt(-2 * (c(co[2], co[2] + co[3])))
            fp0["cmest", c(1, 2, 4)] <- c(xmax, sigs)
        }
        else {
            X0 <- cbind(1, x - xmax, (x - xmax)^2)
            xmaxx <- -co[2]/(2 * co[3]) + xmax
            sighat <- 1/sqrt(-2 * co[3])
            fp0["cmest", 1:2] <- c(xmaxx, sighat)
        }
        l0 <- as.vector(X0 %*% co)
        f0 <- exp(l0)
        p0 <- sum(f0)/sum(f)
        f0 <- f0/p0
        fp0["cmest", 3] <- p0
    }
    b = 4.3 * exp(-0.26 * log(N, 10))
    if (missing(mlests)) {
        med = median(zz)
        sc = diff(quantile(zz)[c(2, 4)])/(2 * qnorm(0.75))
        mlests = locmle(zz, xlim = c(med, b * sc))
        if (N > 5e+05) {
            if(verbose){warning("length(zz) > 500,000: For ML estimation, a wider interval than optimal was used.  To use the optimal interval, rerun with mlests = c(", 
                mlests[1], ", ", b * mlests[2], ").\n", sep = "")}
            mlest.lo = mlests[1]
            mlest.hi = b * mlests[2]
            needsfix = 1
            mlests = locmle(zz, xlim = c(med, sc))
        }
    }
    if (!is.na(mlests[1])) {
        if (N > 5e+05) 
            b = 1
        if (nulltype == 1) {
            Cov.in = list(x = x, X = X, f = f, sw = sw)
            ml.out = locmle(zz, xlim = c(mlests[1], b * mlests[2]), 
                d = mlests[1], s = mlests[2], Cov.in = Cov.in)
            mlests = ml.out$mle
        }
        else mlests = locmle(zz, xlim = c(mlests[1], b * mlests[2]), 
            d = mlests[1], s = mlests[2])
        fp0["mlest", 1:3] = mlests[1:3]
        fp0["mleSD", 1:3] = mlests[4:6]
    }
    if (sum(is.na(fp0[c(3, 5), 1:2])) == 0 & nulltype > 1) 
        if (abs(fp0["cmest", 1] - mlests[1]) > 0.05 | abs(log(fp0["cmest", 
            2]/mlests[2])) > 0.05) 
            warning("Discrepancy between central matching and maximum likelihood estimates.\nConsider rerunning with nulltype = 1")
    if (is.na(mlests[1])) {
        if (nulltype == 1) {
            if (is.na(fp0["cmest", 1])) 
                stop("CM and ML Estimation failed, middle of histogram non-normal")
            else stop("ML estimation failed.  Rerun with nulltype=2")
        }
        else warning("ML Estimation failed")
    }
    if (nulltype < 2) {
        delhat = xmax = xmaxx = mlests[1]
        sighat = mlests[2]
        p0 = mlests[3]
        f0 = dnorm(x, delhat, sighat)
        f0 = (sum(f) * f0)/sum(f0)
    }
    fdr = pmin((p0 * f0)/f, 1)
    f00 <- exp(-x^2/2)
    f00 <- (f00 * sum(f))/sum(f00)
    p0theo <- sum(f[i0])/sum(f00[i0])
    fp0["thest", 3] = p0theo
    fdr0 <- pmin((p0theo * f00)/f, 1)
    f0p <- p0 * f0
    if (nulltype == 0) 
        f0p <- p0theo * f00
    F0l <- cumsum(f0p)
    F0r <- cumsum(rev(f0p))
    Fdrl <- F0l/Fl
    Fdrr <- rev(F0r/Fr)
    Int <- (1 - fdr) * f * (fdr < 0.9)
    if (sum(x <= xmax & fdr == 1) > 0) 
        xxlo <- min(x[x <= xmax & fdr == 1])
    else xxlo = xmax
    if (sum(x >= xmax & fdr == 1) > 0) 
        xxhi <- max(x[x >= xmax & fdr == 1])
    else xxhi = xmax
    if (sum(x >= xxlo & x <= xxhi) > 0) 
        fdr[x >= xxlo & x <= xxhi] <- 1
    if (sum(x <= xmax & fdr0 == 1) > 0) 
        xxlo <- min(x[x <= xmax & fdr0 == 1])
    else xxlo = xmax
    if (sum(x >= xmax & fdr0 == 1) > 0) 
        xxhi <- max(x[x >= xmax & fdr0 == 1])
    else xxhi = xmax
    if (sum(x >= xxlo & x <= xxhi) > 0) 
        fdr0[x >= xxlo & x <= xxhi] <- 1
    if (nulltype == 1) {
        fdr[x >= mlests[1] - mlests[2] & x <= mlests[1] + mlests[2]] = 1
        fdr0[x >= mlests[1] - mlests[2] & x <= mlests[1] + mlests[2]] = 1
    }
    p1 <- sum((1 - fdr) * f)/N
    p1theo <- sum((1 - fdr0) * f)/N
    fall <- f + (yall - y)
    Efdr <- sum((1 - fdr) * fdr * fall)/sum((1 - fdr) * fall)
    Efdrtheo <- sum((1 - fdr0) * fdr0 * fall)/sum((1 - fdr0) * 
        fall)
    iup <- (1:K)[x >= xmax]
    ido <- (1:K)[x <= xmax]
    Eleft <- sum((1 - fdr[ido]) * fdr[ido] * fall[ido])/sum((1 - 
        fdr[ido]) * fall[ido])
    Eleft0 <- sum((1 - fdr0[ido]) * fdr0[ido] * fall[ido])/sum((1 - 
        fdr0[ido]) * fall[ido])
    Eright <- sum((1 - fdr[iup]) * fdr[iup] * fall[iup])/sum((1 - 
        fdr[iup]) * fall[iup])
    Eright0 <- sum((1 - fdr0[iup]) * fdr0[iup] * fall[iup])/sum((1 - 
        fdr0[iup]) * fall[iup])
    Efdr <- c(Efdr, Eleft, Eright, Efdrtheo, Eleft0, Eright0)
    Efdr[which(is.na(Efdr))] = 1
    names(Efdr) <- c("Efdr", "Eleft", "Eright", "Efdrtheo", "Eleft0", 
        "Eright0")
    if (nulltype == 0) 
        f1 <- (1 - fdr0) * fall
    else f1 <- (1 - fdr) * fall
    if (!missing(mult)) {
        mul = c(1, mult)
        EE = rep(0, length(mul))
        for (m in 1:length(EE)) {
            xe = sqrt(mul[m]) * x
            f1e = approx(xe, f1, x, rule = 2, ties = mean)$y
            f1e = (f1e * sum(f1))/sum(f1e)
            f0e = f0
            p0e = p0
            if (nulltype == 0) {
                f0e = f00
                p0e = p0theo
            }
            fdre = (p0e * f0e)/(p0e * f0e + f1e)
            EE[m] = sum(f1e * fdre)/sum(f1e)
        }
        EE = EE/EE[1]
        names(EE) = mul
    }
    Cov2.out = loccov2(X, X0, i0, f, fp0["cmest", ], N)
    Cov0.out = loccov2(X, matrix(1, length(x), 1), i0, f, fp0["thest", 
        ], N)
    if (sw == 3) {
        if (nulltype == 0) 
            Ilfdr = Cov0.out$Ilfdr
        else if (nulltype == 1) 
            Ilfdr = ml.out$Ilfdr
        else if (nulltype == 2) 
            Ilfdr = Cov2.out$Ilfdr
        else stop("With sw=3, nulltype must equal 0, 1, or 2.")
        return(Ilfdr)
    }
    if (nulltype == 0) 
        Cov = Cov0.out$Cov
    else if (nulltype == 1) 
        Cov = ml.out$Cov.lfdr
    else Cov = Cov2.out$Cov
    lfdrse <- diag(Cov)^0.5
    fp0["cmeSD", 1:3] = Cov2.out$stdev[c(2, 3, 1)]
    if (nulltype == 3) 
        fp0["cmeSD", 4] = fp0["cmeSD", 2]
    fp0["theSD", 3] = Cov0.out$stdev[1]
    if (sw == 2) {
        if (nulltype == 0) {
            pds = fp0["thest", c(3, 1, 2)]
            stdev = fp0["theSD", c(3, 1, 2)]
            pds. = t(Cov0.out$pds.)
        }
        else if (nulltype == 1) {
            pds = fp0["mlest", c(3, 1, 2)]
            stdev = fp0["mleSD", c(3, 1, 2)]
            pds. = t(ml.out$pds.)
        }
        else if (nulltype == 2) {
            pds = fp0["cmest", c(3, 1, 2)]
            stdev = fp0["cmeSD", c(3, 1, 2)]
            pds. = t(Cov2.out$pds.)
        }
        else stop("With sw=2, nulltype must equal 0, 1, or 2.")
        colnames(pds.) = names(pds) = c("p0", "delhat", "sighat")
        names(stdev) = c("sdp0", "sddelhat", "sdsighat")
        return(list(pds = pds, x = x, f = f, pds. = pds., stdev = stdev))
    }
    p1 <- seq(0.01, 0.99, 0.01)
    cdf1 <- rep(0, 99)
    fd <- fdr
    if (nulltype == 0) 
        fd <- fdr0
    for (i in 1:99) cdf1[i] <- sum(f1[fd <= p1[i]])
    cdf1 <- cbind(p1, cdf1/cdf1[99])
    mat <- cbind(x, fdr, Fdrl, Fdrr, f, f0, f00, fdr0, yall, 
        lfdrse, f1)
    namat <- c("x", "fdr", "Fdrleft", "Fdrright", "f", "f0", 
        "f0theo", "fdrtheo", "counts", "lfdrse", "p1f1")
    if (nulltype == 0) 
        namat[c(3, 4, 10)] <- c("Fdrltheo", "Fdrrtheo", "lfdrsetheo")
    dimnames(mat) <- list(NULL, namat)
    z.2 = rep(NA, 2)
    m = order(fd)[nx]
    if (fd[nx] < 0.2) {
        z.2[2] = approx(fd[m:nx], x[m:nx], 0.2, ties = mean)$y
    }
    if (fd[1] < 0.2) {
        z.2[1] = approx(fd[1:m], x[1:m], 0.2, ties = mean)$y
    }
    if(nulltype==0) nulldens <- p0theo*f00
    else nulldens <- p0*f0
    yt <- pmax(yall * (1 - fd), 0)
    if (plot > 0) {
        if (plot == 2 | plot == 3) 
            oldpar <- par(mfrow = c(1, 2), pty = "m")
        else if (plot == 4) 
            oldpar = par(mfrow = c(1, 3), pty = "m")
        hist(zzz, breaks = breaks, xlab = " ", main = main)
        yt <- pmax(yall * (1 - fd), 0)
        for (k in 1:K) lines(c(x[k], x[k]), c(0, yt[k]), lwd = 2, 
            col = 6)
        if (nulltype == 3) 
            title(xlab = paste("delta=", round(xmax, 3), "sigleft=", 
                round(sigs[1], 3), " sigright=", round(sigs[2], 
                  3), "p0=", round(fp0["cmest", 3], 3)))
        if (nulltype == 1 | nulltype == 2) 
            title(xlab = paste("MLE: delta:", round(mlests[1], 
                3), "sigma:", round(mlests[2], 3), "p0:", round(mlests[3], 
                3)), sub = paste("CME: delta:", round(fp0["cmest", 
                1], 3), "sigma:", round(fp0["cmest", 2], 3), 
                "p0:", round(fp0["cmest", 3], 3)))
        lines(x, f, lwd = 3, col = 3)
        if (nulltype == 0)
            lines(x, p0theo * f00, lwd = 2, lty = 2, col = 4)
        else lines(x, p0 * f0, lwd = 2, lty = 2, col = 4)
        if (!is.na(z.2[2])) 
            points(z.2[2], -0.5, pch = 24, col = "red", bg = "yellow")
        if (!is.na(z.2[1])) 
            points(z.2[1], -0.5, pch = 24, col = "red", bg = "yellow")
        if (nulltype == 1 | nulltype == 2) 
            Ef <- Efdr[1]
        else if (nulltype == 0) 
            Ef <- Efdr[4]
        if (plot == 2 | plot == 4) {
            if (nulltype == 0) 
                fdd <- fdr0
            else fdd = fdr
            matplot(x, cbind(fdd, Fdrl, Fdrr), type = "l", lwd = 3, 
                xlab = " ", ylim = c(0, 1.1), main = "fdr (solid); Fdr's (dashed)")
            title(xlab = paste("Efdr= ", round(Ef, 3)))
            abline(0, 0, lty = 3, col = 2)
            lines(c(0, 0), c(0, 1), lty = 3, col = 2)
        }
        if (plot == 3 | plot == 4) {
            if (sum(is.na(cdf1[, 2])) == nrow(cdf1)) 
                warning("cdf1 not available")
            else {
                plot(cdf1[, 1], cdf1[, 2], type = "l", lwd = 3, 
                  xlab = "fdr level", ylim = c(0, 1), ylab = "f1 proportion < fdr level", 
                  main = "f1 cdf of estimated fdr")
                title(sub = paste("Efdr= ", round(Ef, 3)))
                lines(c(0.2, 0.2), c(0, cdf1[20, 2]), col = 4, 
                  lty = 2)
                lines(c(0, 0.2), rep(cdf1[20, 2], 2), col = 4, 
                  lty = 2)
                text(0.05, cdf1[20, 2], round(cdf1[20, 2], 2))
                abline(0, 0, col = 2)
                lines(c(0, 0), c(0, 1), col = 2)
            }
        }
        if (plot > 1) 
            par(oldpar)
    }
    if (nulltype == 0) {
        ffdr <- approx(x, fdr0, zz, rule = 2, ties = "ordered")$y
    }
    else ffdr <- approx(x, fdr, zz, rule = 2, ties = "ordered")$y
    vl = list(fdr = ffdr, fp0 = fp0, Efdr = Efdr, cdf1 = cdf1, 
        mat = mat, z.2 = z.2)
    if (!missing(mult)) 
        vl$mult = EE
    vl$call = call
    vl$yt = yt
    vl$x = x
    vl$mlest.lo = mlest.lo
    vl$mlest.hi = mlest.hi
    vl$needsfix = needsfix
    vl$nulldens = nulldens
    vl$fulldens = fulldens
    vl
}
leekgroup/derfinder documentation built on May 20, 2019, 11:30 p.m.