# R/absCopyNumber.R
#
# Defines functions cluster.solution grid.search.alpha optimize.alpha grid.search.alpha.simple optimize.alpha.simple get_absCopyNumber run_fromLocal run_fromDF plot_absCopyNumber
#
# Documented in get_absCopyNumber plot_absCopyNumber run_fromDF run_fromLocal

# Collapse near-identical (alpha, tau) solutions from the grid search into
# clusters.
#
# x is assumed sorted by ascending mse (both callers sort before calling);
# rows are scanned in order and merged into the running cluster whenever
# |alpha - cluster alpha| < alpha.cut AND |tau.def - cluster tau| < tau.cut.
# Within a cluster, alpha and tau are combined as averages weighted by both
# the solution count and the reciprocal fitting error (better fits weigh
# more).
#
# x         : data frame with (at least) columns "alpha", "tau.def",
#             "mse", "count".
# alpha.cut : maximum alpha distance for two solutions to share a cluster.
# tau.cut   : maximum tau distance for two solutions to share a cluster.
#
# Returns a data frame with columns alpha, tau, tau.def, mse, count; the
# tau and tau.def columns intentionally carry the same clustered tau value,
# and alpha/tau/tau.def are rounded to 2 decimals.
cluster.solution <- function(x, alpha.cut, tau.cut) {
    cur.alpha <- x[1, "alpha"]
    cur.tau <- x[1, "tau.def"]
    cur.cnt <- x[1, "count"]
    # work with the reciprocal error so smaller-mse solutions get larger
    # weight; inverted back just before returning
    cur.mse.r <- 1.0 / x[1, "mse"]

    # accumulate finished clusters in a list: avoids O(n^2) rbind-in-loop
    # growth and the fragile column-name matching of rbind() against an
    # empty data.frame prototype
    rows <- list()

    # guard: the original 2:nrow(x) ran backwards (c(2, 1)) and indexed out
    # of bounds for a single-row input
    if (nrow(x) >= 2) {
        for (i in 2:nrow(x)) {
            next.alpha <- x[i, "alpha"]
            next.tau <- x[i, "tau.def"]
            next.cnt <- x[i, "count"]
            next.mse.r <- 1.0 / x[i, "mse"]
            if (abs(next.alpha - cur.alpha) < alpha.cut &&
                abs(next.tau - cur.tau) < tau.cut) {
                # same cluster: merge via count- and precision-weighted means
                cur.wt <- cur.cnt / (cur.cnt + next.cnt)
                next.wt <- next.cnt / (cur.cnt + next.cnt)
                denom <- cur.wt * cur.mse.r + next.wt * next.mse.r
                cur.alpha <-
                    (cur.alpha * cur.wt * cur.mse.r +
                         next.alpha * next.wt * next.mse.r) / denom
                cur.tau <-
                    (cur.tau * cur.wt * cur.mse.r +
                         next.tau * next.wt * next.mse.r) / denom
                cur.mse.r <- cur.mse.r * cur.wt + next.mse.r * next.wt
                cur.cnt <- cur.cnt + next.cnt
            } else {
                # close the current cluster and start a new one at row i
                rows[[length(rows) + 1L]] <-
                    data.frame(cur.alpha, cur.tau, cur.tau, cur.mse.r, cur.cnt)
                cur.alpha <- next.alpha
                cur.tau <- next.tau
                cur.cnt <- next.cnt
                cur.mse.r <- next.mse.r
            }
        }
    }
    # close the last open cluster
    rows[[length(rows) + 1L]] <-
        data.frame(cur.alpha, cur.tau, cur.tau, cur.mse.r, cur.cnt)

    clust <- do.call(rbind, rows)
    colnames(clust) <- c("alpha", "tau", "tau.def", "mse", "count")
    clust[, 1:3] <- round(clust[, 1:3], 2)
    # convert the accumulated reciprocal errors back to mse scale
    clust$mse <- 1.0 / clust$mse

    clust
}

#'@import stats graphics utils
# Grid search for tumor purity (alpha) and ploidy (tau) using both the
# segment copy-ratio data and the SNV allele-frequency data.
#
# For every (alpha0, tau0) start on a regular grid, optimize.alpha() is run
# to a local optimum; the local optima are then pooled, deduplicated,
# clustered (cluster.solution) and filtered, and the result table is ranked
# by fitting error.  The top-ranked solution is used to compute absolute
# copy numbers per segment and multiplicities per SNV.
#
# seg.data     : data frame with (at least) columns "chrom", "loc.start",
#                "loc.end", "eff.seg.len", "normalized.ratio".
# snv.data     : data frame with (at least) columns "chrom", "position",
#                "tumor_var_freq".
# alpha.min, alpha.max : allowed tumor-purity range (also the grid limits).
# tau.min, tau.max     : allowed tumor-ploidy range (also the grid limits).
# min.sol.freq : minimum number of grid starts that must reach a solution
#                for it to be kept; 0 means 5% of the total count.
# min.seg.len  : segments with eff.seg.len below this are dropped.
# qmax         : maximum absolute copy number assigned to any segment.
# lamda        : relative weight of the copy-ratio data vs the SNV data;
#                1.0 means copy-number data only.
# verbose      : print progress messages and a progress bar.
#
# Returns a list: searchRes (ranked solutions), absCN (per-segment absolute
# copy numbers for the best solution), absSNV (per-SNV multiplicities),
# orig.seg.dat / seg.dat (original and filtered segments) and
# orig.snv.dat / snv.dat (original and filtered SNVs).
grid.search.alpha <-
    function(seg.data,
             snv.data,
             alpha.min = 0.2,
             alpha.max = 1.0,
             tau.min = 1.5,
             tau.max = 5.0,
             min.sol.freq = 0,
             min.seg.len = 200,
             qmax = 7,
             lamda = 0.5,
             verbose = FALSE) {
        if (verbose) {
            cat("Reading arguments: ================================================\n")
            cat("min.seg.len =", min.seg.len, "\n")
            cat("qmax =", qmax, "\n")
            cat("lamda =", lamda, "\n")  # 1.0 means copy-number data only
            cat(
                "====================================================================\n"
            )
        }

        # keep an untouched copy of the input segments for the result list
        orig.seg.dat <- seg.data

        # filtering: drop rows with NAs, then short segments
        n.seg.1 <- nrow(seg.data)
        seg.data <- na.omit(seg.data)
        seg.data <-
            seg.data[seg.data[, "eff.seg.len"] >= min.seg.len,]
        n.seg.2 <- nrow(seg.data)
        if (verbose) {
            cat("Filtering segments with NAs and length less than eff.seg.len ...\n")
            cat("All segments: ", n.seg.1, "\n")
            cat("Retained segments: ", n.seg.2, "\n")
            cat(
                "====================================================================\n"
            )
        }

        # gf: per-segment genome fraction (spanned length, normalized to sum
        # to 1); passed to optimize.alpha() as the weight w used to average
        # integer copy numbers into the ploidy estimate
        gf <- seg.data[, "loc.end"] - seg.data[, "loc.start"] + 1
        gf <- gf / sum(gf)
        # eta is forwarded to optimize.alpha() but only referenced there in a
        # commented-out tau update, so it is effectively unused
        eta <- 1.0

        # observed normalized copy ratio per segment
        r <- seg.data[, "normalized.ratio"]

        # set cutoff for down-weighting outlier copy-ratio segments
        max.r.cutoff <- 3.0
        min.r.cutoff <- 1.0 / 3.0

        outlier.frac.1 <-
            length(which(r > max.r.cutoff)) / length(r)
        # outlier.gf.1 <- sum(gf[r > max.r.cutoff])
        if (verbose)
            cat(
                100 * outlier.frac.1,
                "% segments with copy ratio r>3.0 before rescaling\n"
            )

        outlier2.frac.1 <-
            length(which(r < min.r.cutoff)) / length(r)
        # outlier2.gf.1 <- sum(gf[r < min.r.cutoff])
        if (verbose) {
            cat(
                100 * outlier2.frac.1,
                "% segments with copy ratio r<0.33 before rescaling\n"
            )
            # cat("Filtering them...\n")
            cat(
                "====================================================================\n"
            )
        }


        # assign each SNV to the unique segment containing it; keep only
        # SNVs whose segment passed the copy-ratio cutoffs.
        # snp2seg[i]: row index (in seg.data) of the segment holding SNV i;
        # fb[i]: tumor variant allele frequency; het.ind: kept SNV rows
        snp2seg <- NULL
        fb <- NULL
        het.ind <- NULL
        for (i in 1:nrow(snv.data)) {
            ind <-
                which(seg.data[, "chrom"] == snv.data[i, "chrom"] &
                          seg.data[, "loc.start"] <= snv.data[i, "position"] &
                          seg.data[, "loc.end"] >= snv.data[i, "position"])
            if (length(ind) == 1) {
                if (r[ind] <= max.r.cutoff & r[ind] >= min.r.cutoff) {
                    snp2seg <- c(snp2seg, ind)
                    fb <- c(fb, snv.data[i, "tumor_var_freq"])
                    het.ind <- c(het.ind, i)
                }
            }
        }
        # n.snv <- length(het.ind)
        if (verbose) {
            cat("Assign SNP to each segment: ========================================\n")
            cat("# of SNVs used:", length(het.ind), "\n")
            cat(
                "====================================================================\n"
            )
        }



        # this entry point treats all input SNVs as somatic
        snv.type <- "somatic"

        # regression weights: outlier segments get zero weight
        wts <- rep(1, length(r))
        wts[r > max.r.cutoff] <- 0.0
        wts[r < min.r.cutoff] <- 0.0

        # split the total weight lamda : (1 - lamda) between the copy-ratio
        # rows and the SNV rows of the joint regression
        wts.cn <- wts / sum(wts) * lamda
        wts.het <- 1.0 / length(snp2seg) * (1.0 - lamda)

        # regular (alpha0, tau0) starting grid with step 0.05
        alpha.grid <-
            seq(from = alpha.min, to = alpha.max, by = 0.05)
        tau.grid <- seq(from = tau.min, to = tau.max, by = 0.05)
        search.grid <-
            expand.grid(alpha.grid = alpha.grid, tau.grid = tau.grid)
        n.grid <- nrow(search.grid)

        # one row per converged grid start
        search.res <-
            data.frame(
                alpha = numeric(),
                tau = numeric(),
                tau.def = numeric(),
                mse = numeric(),
                alpha0 = numeric(),
                tau0 = numeric()
            )

        if (verbose) {
            cat("Grid searching:\n")
            total <- n.grid
            # create progress bar
            pb <-
                txtProgressBar(
                    min = 0,
                    max = total,
                    style = 3,
                    width = 60,
                    char = ">"
                )
        }

        # run the local optimizer from every grid start; non-converged
        # starts return NULL and are skipped
        for (k in 1:n.grid) {
            if (verbose)
                setTxtProgressBar(pb, k)

            alpha0 <- search.grid[k, 1]
            tau0 <- search.grid[k, 2]

            a.res <-
                optimize.alpha(
                    alpha0,
                    alpha.min,
                    alpha.max,
                    tau0,
                    wts.cn,
                    r,
                    gf,
                    eta,
                    wts.het,
                    snp2seg,
                    fb,
                    qmax,
                    snv.type
                )
            if (!is.null(a.res)) {
                search.res <- rbind(search.res, a.res)
            }
        }

        if (verbose)
            close(pb)

        colnames(search.res) <-
            c("alpha", "tau", "tau.def", "mse", "alpha0", "tau0")

        # deduplicate: round to 2 decimals, then average tau.def/mse over
        # identical (alpha, tau) pairs and count how many starts hit each
        search.res[, c("alpha", "tau", "tau.def")] <-
            round(search.res[, c("alpha", "tau", "tau.def")], 2)
        tmp <-
            aggregate(search.res[, c("tau.def", "mse")], search.res[, c("alpha", "tau")], mean)
        tmp2 <-
            aggregate(search.res[, 1], search.res[, c("alpha", "tau")], length)

        if (verbose) {
            cat("Total", nrow(search.res), "results\n")
        }

        search.res <- data.frame(tmp, count = tmp2[, 3])
        if (verbose)
            cat(nrow(search.res), "unique results\n")

        # rank solutions by fitting error (best first)
        oo <- order(search.res$mse)
        search.res <- search.res[oo,]
        rownames(search.res) <- NULL


        # only cluster when result number more than 1
        if (nrow(search.res) > 1) {
            if (verbose)
                cat("Clustering results...\n")
            # do clustering
            search.res <-
                cluster.solution(search.res, alpha.cut = 0.10, tau.cut = 0.15)
        }

        if (verbose)
            cat("Filtering impossible solutions...\n")
        # filtering out impossible solutions: outside the allowed alpha/tau
        # ranges or supported by too few grid starts
        if (min.sol.freq == 0)
            min.sol.freq <- 0.05 * sum(search.res[, "count"])
        proper.ind <-
            which(
                search.res[, "alpha"] >= alpha.min &
                    search.res[, "alpha"] <= alpha.max &
                    search.res[, "tau.def"] >= tau.min &
                    search.res[, "tau.def"] <= tau.max &
                    search.res[, "count"] >= min.sol.freq
            )
        if (length(proper.ind) > 0) {
            search.res <- search.res[proper.ind, , drop = FALSE]
        }

        cat("Final solution number:", nrow(search.res), "\n")

        # always executed; kept as a block for readability
        if (1) {
            if (verbose)
                cat("Outputing the best result...\n")
            # output the best result (row 1 after ranking)
            # NOTE(review): min.ind is computed but not used below
            min.ind <- which(search.res$mse == search.res$mse[1])
            #cat("# of optimal parameters:",length(min.ind),"\n")

            alpha.opt <- search.res[1, "alpha"]
            tau.opt <- search.res[1, "tau.def"]

            # invert the mixture model to get integer copy numbers qs,
            # clamped to [0, qmax]
            tmp <-
                (r * (alpha.opt * tau.opt + (1.0 - alpha.opt) * 2) - (1.0 - alpha.opt) *
                     2) / alpha.opt
            qs <- round(tmp)
            qs[qs < 0] <- 0
            qs[qs > qmax] <- qmax
            # rhat: expected copy ratio implied by the integer copy numbers
            rhat <-
                (alpha.opt * qs + (1.0 - alpha.opt) * 2) / (alpha.opt * tau.opt + (1.0 -
                                                                                       alpha.opt) * 2)
            abs.cn <- data.frame(seg.data[, 1:4],
                                 r = r,
                                 rhat = rhat,
                                 CN = qs)

            # per-SNV multiplicity mg, clamped to [0, segment copy number]
            q.het <- qs[snp2seg]
            if (snv.type == "somatic") {
                tmp <-
                    ((alpha.opt * q.het + (1.0 - alpha.opt) * 2) * fb) / alpha.opt
            } else if (snv.type == "germline") {
                tmp <-
                    ((alpha.opt * q.het + (1.0 - alpha.opt) * 2) * fb - (1.0 - alpha.opt)) /
                    alpha.opt
            }
            mg <- round(tmp)
            mg[mg < 0] <- 0
            tmp.ind <- which(mg > q.het)
            if (length(tmp.ind) > 0) {
                mg[tmp.ind] <- q.het[tmp.ind]
            }
            abs.mg <-
                data.frame(snv.data[het.ind,], multiplicity = mg)

        }

        res.list <-
            list(
                searchRes = search.res,
                absCN = abs.cn,
                absSNV = abs.mg,
                orig.seg.dat = orig.seg.dat,
                seg.dat = seg.data,
                orig.snv.dat = snv.data,
                snv.dat = snv.data[het.ind,]
            )

        cat("Done.\n")

        res.list
    }

# Iteratively estimate tumor purity (alpha) from one grid start, using both
# segment copy ratios and SNV allele frequencies.
#
# Starting from (alpha0, tau0) the routine alternates between
#   1. assigning integer copy numbers qs to segments (and multiplicities mg
#      to SNVs) implied by the current alpha and tau, and
#   2. re-fitting alpha by weighted nonlinear least squares (nls, "port"
#      algorithm, bounded by [alpha.min, alpha.max]) with qs/mg held fixed,
# until the residual standard error changes by less than 1e-4 (relative)
# or 100 cycles have elapsed.
#
# alpha0               : starting purity value.
# alpha.min, alpha.max : box constraints on alpha for the nls fit.
# tau0                 : starting ploidy value.
# wts.cn               : regression weights for the copy-ratio rows.
# r                    : per-segment normalized copy ratios.
# w                    : per-segment weights (genome fractions) used to
#                        average qs into the ploidy estimate tau.
# eta                  : kept for interface compatibility; only referenced
#                        by the commented-out tau update below.
# wts.het              : regression weight for the SNV rows (recycled).
# snp2seg              : index of the segment containing each SNV.
# fb                   : tumor variant allele frequency of each SNV.
# qmax                 : maximum absolute copy number.
# snv.type             : "somatic" or "germline"; selects the
#                        allele-frequency model.
#
# Returns a named vector c(alpha, tau, tau.def, mse, alpha0, tau0) on
# convergence, or NULL when the fit fails or does not converge.
optimize.alpha <-
    function(alpha0,
             alpha.min = 0.2,
             alpha.max = 1.0,
             tau0,
             wts.cn,
             r,
             w,
             eta,
             wts.het,
             snp2seg,
             fb,
             qmax,
             snv.type) {
        # validate the SNV model up front; previously an unsupported value
        # surfaced later as an obscure "object 'tmp' not found" error
        snv.type <- match.arg(snv.type, c("somatic", "germline"))

        # initialization from the supplied grid point
        alpha <- alpha0
        tau <- tau0

        # integer copy number per segment implied by (alpha, tau),
        # clamped to [0, qmax]
        tmp <-
            (r * (alpha * tau + (1.0 - alpha) * 2) - (1.0 - alpha) * 2) / alpha
        qs <- round(tmp)
        qs[qs < 0] <- 0
        qs[qs > qmax] <- qmax
        # ploidy = weighted average copy number (w: genome fractions)
        #tau <- sum(qs*w*eta)/sum(w*eta)
        tau <- sum(qs * w) / sum(w)

        # multiplicity per SNV implied by (alpha, qs), clamped to
        # [0, segment copy number]
        q.het <- qs[snp2seg]
        if (snv.type == "somatic") {
            tmp <- ((alpha * q.het + (1.0 - alpha) * 2) * fb) / alpha
        } else if (snv.type == "germline") {
            tmp <-
                ((alpha * q.het + (1.0 - alpha) * 2) * fb - (1.0 - alpha)) / alpha
        }
        mg <- round(tmp)
        mg[mg < 0] <- 0
        tmp.ind <- which(mg > q.het)
        if (length(tmp.ind) > 0) {
            mg[tmp.ind] <- q.het[tmp.ind]
        }

        # one regression frame holding both data types:
        # ix = 1 marks copy-ratio rows, ix = 0 marks SNV rows;
        # lqs is the local (segment) copy number used in the SNV model
        cn.df <- data.frame(
            x = qs,
            y = r,
            wt = wts.cn,
            lqs = 2.0,
            ix = 1
        )
        het.df <-
            data.frame(
                x = mg,
                y = fb,
                wt = wts.het,
                lqs = qs[snp2seg],
                ix = 0
            )
        reg.df <- rbind(cn.df, het.df)

        # iteration: alternate nls fits of alpha with updates of qs, tau, mg
        opt.cycle <- 0
        fitEst <- NULL  # residual standard error per cycle

        repeat {
            opt.cycle <- opt.cycle + 1
            if (opt.cycle > 100) {
                # no convergence within the cycle budget
                res <- NULL
                break
            }

            reg.df[, "x"] <- c(qs, mg)
            reg.df[, "lqs"] <- c(rep(2.0, length(qs)), qs[snp2seg])

            if (snv.type == "somatic") {
                try.tmp <-
                    try(nls.res <-
                            nls(
                                y ~ ix * (alpha * x + (1.0 - alpha) * 2) / (alpha * tau + (1.0 - alpha) *
                                                                                2) + (1.0 - ix) * (alpha * x) / (alpha * lqs + (1.0 - alpha) * 2) ,
                                data = reg.df,
                                start = list(alpha = alpha),
                                weights = wt,
                                lower = alpha.min,
                                upper = alpha.max,
                                algorithm = "port"
                            ),
                        silent = TRUE)

            } else if (snv.type == "germline") {
                try.tmp <-
                    try(nls.res <-
                            nls(
                                y ~ ix * (alpha * x + (1.0 - alpha) * 2) / (alpha * tau + (1.0 - alpha) *
                                                                                2) + (1.0 - ix) * (alpha * x + (1.0 - alpha)) / (alpha * lqs + (1.0 - alpha) *
                                                                                                                                     2) ,
                                data = reg.df,
                                start = list(alpha = alpha),
                                weights = wt,
                                lower = alpha.min,
                                upper = alpha.max,
                                algorithm = "port"
                            ),
                        silent = TRUE)
            }

            # inherits() instead of class(x) == "try-error": the comparison
            # breaks when an object carries more than one class
            if (inherits(try.tmp, "try-error")) {
                res <- NULL
                break
            }

            if (nls.res$convInfo$isConv) {
                # accept the new alpha and re-derive qs, tau and mg
                alpha <- unname(coef(nls.res)[1])
                fitEst <- c(fitEst, summary(nls.res)$sigma)

                # update qs
                tmp <-
                    (r * (alpha * tau + (1.0 - alpha) * 2) - (1.0 - alpha) * 2) / alpha
                qs <- round(tmp)
                qs[qs < 0] <- 0
                qs[qs > qmax] <- qmax

                # update tau
                #tau <- sum(qs*w*eta)/sum(w*eta)
                tau <- sum(qs * w) / sum(w)

                # update mg
                q.het <- qs[snp2seg]
                if (snv.type == "somatic") {
                    tmp <- ((alpha * q.het + (1.0 - alpha) * 2) * fb) / alpha
                } else if (snv.type == "germline") {
                    tmp <-
                        ((alpha * q.het + (1.0 - alpha) * 2) * fb - (1.0 - alpha)) / alpha
                }
                mg <- round(tmp)
                mg[mg < 0] <- 0
                tmp.ind <- which(mg > q.het)
                if (length(tmp.ind) > 0) {
                    mg[tmp.ind] <- q.het[tmp.ind]
                }

                nL <- length(fitEst)
                if (nL > 1 &&
                    abs(fitEst[nL] - fitEst[(nL - 1)]) / abs(fitEst[(nL - 1)]) < 1e-4) {
                    # converged: relative change in sigma below 1e-4
                    tau.def <- sum(qs * w) / sum(w)
                    res <-
                        c(
                            alpha = alpha,
                            tau = tau,
                            tau.def = tau.def,
                            # NOTE: despite the name this is the residual
                            # standard error of the final fit, not a squared
                            # error; kept for interface compatibility
                            mse = fitEst[nL],
                            alpha0 = alpha0,
                            tau0 = tau0
                        )
                    break
                }

            } else {
                # nls ran but did not converge
                res <- NULL
                break
            }
        }

        res
    }

#'@import stats graphics utils
# Grid search for tumor purity (alpha) and ploidy (tau) using the segment
# copy-ratio data only (no SNV information).
#
# For every (alpha0, tau0) start on a regular grid,
# optimize.alpha.simple() is run to a local optimum; the local optima are
# pooled, deduplicated, clustered (cluster.solution) and filtered, and the
# result table is ranked by fitting error.  The top-ranked solution is then
# used to compute absolute copy numbers per segment.
#
# seg.data     : data frame with (at least) columns "chrom", "loc.start",
#                "loc.end", "eff.seg.len", "normalized.ratio".
# alpha.min, alpha.max : allowed tumor-purity range (also the grid limits).
# tau.min, tau.max     : allowed tumor-ploidy range (also the grid limits).
# min.sol.freq : minimum number of grid starts that must reach a solution
#                for it to be kept; 0 means 5% of the total count.
# min.seg.len  : segments with eff.seg.len below this are dropped.
# qmax         : maximum absolute copy number assigned to any segment.
# verbose      : print progress messages and a progress bar.
#
# Returns a list: searchRes (ranked solutions), absCN (per-segment absolute
# copy numbers for the best solution), and orig.seg.dat / seg.dat
# (original and filtered segments).
grid.search.alpha.simple <-
    function(seg.data,
             alpha.min = 0.2,
             alpha.max = 1.0,
             tau.min = 1.5,
             tau.max = 5.0,
             min.sol.freq = 0,
             min.seg.len = 200,
             qmax = 7,
             verbose = FALSE) {
        if (verbose) {
            cat("Reading arguments: ================================================\n")
            cat("min.seg.len =", min.seg.len, "\n")
            cat("qmax =", qmax, "\n")
            cat(
                "====================================================================\n"
            )
        }

        # keep an untouched copy of the input segments for the result list
        orig.seg.dat <- seg.data

        # filtering: drop rows with NAs, then short segments
        n.seg.1 <- nrow(seg.data)
        seg.data <- na.omit(seg.data)

        seg.data <-
            seg.data[seg.data[, "eff.seg.len"] >= min.seg.len,]
        n.seg.2 <- nrow(seg.data)
        if (verbose) {
            cat("Filtering segments with NAs and length less than eff.seg.len ...\n")
            cat("All segments: ", n.seg.1, "\n")
            cat("Retained segments: ", n.seg.2, "\n")
            cat(
                "====================================================================\n"
            )
        }

        gf <-
            seg.data[, "loc.end"] - seg.data[, "loc.start"] + 1  # vector of genome fraction: spaced length
        gf <- gf / sum(gf)
        # eta is forwarded to optimize.alpha.simple() but only referenced
        # there in a commented-out tau update, so it is effectively unused
        eta <- 1.0

        # observed normalized copy ratio per segment
        r <- seg.data[, "normalized.ratio"]

        # cutoffs for down-weighting outlier copy-ratio segments
        max.r.cutoff <- 3.0
        min.r.cutoff <- 1.0 / 3.0

        outlier.frac.1 <-
            length(which(r > max.r.cutoff)) / length(r)
        #outlier.gf.1 <- sum(gf[r > max.r.cutoff])
        if (verbose)
            cat(
                100 * outlier.frac.1,
                "% segments with copy ratio r>3.0 before rescaling\n"
            )

        outlier2.frac.1 <-
            length(which(r < min.r.cutoff)) / length(r)
        #outlier2.gf.1 <- sum(gf[r < min.r.cutoff])
        if (verbose) {
            cat(
                100 * outlier2.frac.1,
                "% segments with copy ratio r<0.33 before rescaling\n"
            )
            # cat("Filtering them...\n")
            cat(
                "====================================================================\n"
            )
        }


        # regression weights: outlier segments get zero weight
        wts <- rep(1, length(r))
        wts[r > max.r.cutoff] <- 0.0
        wts[r < min.r.cutoff] <- 0.0

        wts.cn <- wts / sum(wts)

        # regular (alpha0, tau0) starting grid with step 0.05
        alpha.grid <-
            seq(from = alpha.min, to = alpha.max, by = 0.05)
        tau.grid <- seq(from = tau.min, to = tau.max, by = 0.05)
        search.grid <-
            expand.grid(alpha.grid = alpha.grid, tau.grid = tau.grid)
        n.grid <- nrow(search.grid)

        # one row per converged grid start
        search.res <-
            data.frame(
                alpha = numeric(),
                tau = numeric(),
                tau.def = numeric(),
                mse = numeric(),
                alpha0 = numeric(),
                tau0 = numeric()
            )

        if (verbose) {
            cat("Grid searching:\n")
            total <- n.grid
            # create progress bar
            pb <-
                txtProgressBar(
                    min = 0,
                    max = total,
                    style = 3,
                    width = 60,
                    char = ">"
                )
        }

        # run the local optimizer from every grid start; non-converged
        # starts return NULL and are skipped
        for (k in 1:n.grid) {
            # update progress bar
            if (verbose)
                setTxtProgressBar(pb, k)

            alpha0 <- search.grid[k, 1]
            tau0 <- search.grid[k, 2]

            a.res <-
                optimize.alpha.simple(alpha0,
                                      alpha.min,
                                      alpha.max,
                                      tau0,
                                      wts.cn,
                                      r,
                                      gf,
                                      eta,
                                      qmax)
            if (!is.null(a.res)) {
                search.res <- rbind(search.res, a.res)
            }
        }

        if (verbose)
            close(pb)

        colnames(search.res) <-
            c("alpha", "tau", "tau.def", "mse", "alpha0", "tau0")

        # deduplicate: round to 2 decimals, then average tau.def/mse over
        # identical (alpha, tau) pairs and count how many starts hit each
        search.res[, c("alpha", "tau", "tau.def")] <-
            round(search.res[, c("alpha", "tau", "tau.def")], 2)
        tmp <-
            aggregate(search.res[, c("tau.def", "mse")], search.res[, c("alpha", "tau")], mean)
        tmp2 <-
            aggregate(search.res[, 1], search.res[, c("alpha", "tau")], length)

        if (verbose) {
            cat("Total", nrow(search.res), "results\n")
        }

        search.res <- data.frame(tmp, count = tmp2[, 3])

        if (verbose)
            cat(nrow(search.res), "unique results\n")

        # rank solutions by fitting error (best first)
        oo <- order(search.res$mse)
        search.res <- search.res[oo,]
        rownames(search.res) <- NULL


        # only cluster when result number more than 1
        if (nrow(search.res) > 1) {
            if (verbose)
                cat("Clustering results...\n")
            # do clustering
            search.res <-
                cluster.solution(search.res, alpha.cut = 0.10, tau.cut = 0.15)
        }

        if (verbose)
            cat("Filtering impossible solutions...\n")
        # filtering out impossible solutions: outside the allowed alpha/tau
        # ranges or supported by too few grid starts
        if (min.sol.freq == 0)
            min.sol.freq <- 0.05 * sum(search.res[, "count"])
        proper.ind <-
            which(
                search.res[, "alpha"] >= alpha.min &
                    search.res[, "alpha"] <= alpha.max &
                    search.res[, "tau.def"] >= tau.min &
                    search.res[, "tau.def"] <= tau.max &
                    search.res[, "count"] >= min.sol.freq
            )
        if (length(proper.ind) > 0) {
            search.res <- search.res[proper.ind, , drop = FALSE]
        }

        cat("Final solution number:", nrow(search.res), "\n")

        # always executed; kept as a block for readability
        if (1) {
            if (verbose)
                cat("Outputing the best result...\n")
            # output the best result (row 1 after ranking)
            # NOTE(review): min.ind is computed but not used below
            min.ind <- which(search.res$mse == search.res$mse[1])
            #cat("# of optimal parameters:",length(min.ind),"\n")

            alpha.opt <- search.res[1, "alpha"]
            tau.opt <- search.res[1, "tau.def"]

            # invert the mixture model to get integer copy numbers qs,
            # clamped to [0, qmax]
            tmp <-
                (r * (alpha.opt * tau.opt + (1.0 - alpha.opt) * 2) - (1.0 - alpha.opt) *
                     2) / alpha.opt
            qs <- round(tmp)
            qs[qs < 0] <- 0
            qs[qs > qmax] <- qmax
            # rhat: expected copy ratio implied by the integer copy numbers
            rhat <-
                (alpha.opt * qs + (1.0 - alpha.opt) * 2) / (alpha.opt * tau.opt + (1.0 -
                                                                                       alpha.opt) * 2)
            abs.cn <- data.frame(seg.data[, 1:4],
                                 r = r,
                                 rhat = rhat,
                                 CN = qs)

        }

        res.list <-
            list(
                searchRes = search.res,
                absCN = abs.cn,
                orig.seg.dat = orig.seg.dat,
                seg.dat = seg.data
            )

        cat("Done.\n")
        res.list
    }

#'@import stats graphics
# Iteratively estimate tumor purity (alpha) from one grid start, using the
# segment copy-ratio data only.
#
# Starting from (alpha0, tau0) the routine alternates between assigning
# integer copy numbers qs implied by the current alpha and tau, and
# re-fitting alpha by weighted nonlinear least squares (nls, "port"
# algorithm, bounded by [alpha.min, alpha.max]) with qs held fixed, until
# the residual standard error changes by less than 1e-4 (relative) or 100
# cycles have elapsed.
#
# alpha0               : starting purity value.
# alpha.min, alpha.max : box constraints on alpha for the nls fit.
# tau0                 : starting ploidy value.
# wts.cn               : per-segment regression weights.
# r                    : per-segment normalized copy ratios.
# w                    : per-segment weights (genome fractions) used to
#                        average qs into the ploidy estimate tau.
# eta                  : kept for interface compatibility; only referenced
#                        by the commented-out tau update below.
# qmax                 : maximum absolute copy number.
#
# Returns a named vector c(alpha, tau, tau.def, mse, alpha0, tau0) on
# convergence, or NULL when the fit fails or does not converge.
optimize.alpha.simple <-
    function(alpha0,
             alpha.min = 0.2,
             alpha.max = 1.0,
             tau0,
             wts.cn,
             r,
             w,
             eta,
             qmax) {
        # initialization from the supplied grid point
        alpha <- alpha0
        tau <- tau0

        # integer copy number per segment implied by (alpha, tau),
        # clamped to [0, qmax]
        tmp <-
            (r * (alpha * tau + (1.0 - alpha) * 2) - (1.0 - alpha) * 2) /
            alpha
        qs <- round(tmp)
        qs[qs < 0] <- 0
        qs[qs > qmax] <- qmax
        # ploidy = weighted average copy number (w: genome fractions)
        #tau <- sum(qs*w*eta)/sum(w*eta)
        tau <- sum(qs * w) / sum(w)

        reg.df <- data.frame(x = qs, y = r, wt = wts.cn)

        # iteration: alternate nls fits of alpha with updates of qs and tau
        opt.cycle <- 0
        fitEst <- NULL  # residual standard error per cycle

        repeat {
            opt.cycle <- opt.cycle + 1
            if (opt.cycle > 100) {
                # no convergence within the cycle budget
                res <- NULL
                break
            }

            reg.df[, "x"] <- qs

            try.tmp <-
                try(nls.res <-
                        nls(
                            y ~ (alpha * x + (1.0 - alpha) * 2) / (alpha * tau + (1.0 - alpha) * 2),
                            data = reg.df,
                            start = list(alpha = alpha),
                            weights = wt,
                            lower = alpha.min,
                            upper = alpha.max,
                            algorithm = "port"
                        ),
                    silent = TRUE)

            # inherits() instead of class(x) == "try-error": the comparison
            # breaks when an object carries more than one class
            if (inherits(try.tmp, "try-error")) {
                res <- NULL
                break
            }

            if (nls.res$convInfo$isConv) {
                # accept the new alpha and re-derive qs and tau
                alpha <- unname(coef(nls.res)[1])
                fitEst <- c(fitEst, summary(nls.res)$sigma)

                # update qs
                tmp <-
                    (r * (alpha * tau + (1.0 - alpha) * 2) - (1.0 - alpha) * 2) / alpha
                qs <- round(tmp)
                qs[qs < 0] <- 0
                qs[qs > qmax] <- qmax

                # update tau
                #tau <- sum(qs*w*eta)/sum(w*eta)
                tau <- sum(qs * w) / sum(w)

                nL <- length(fitEst)
                if (nL > 1 &&
                    abs(fitEst[nL] - fitEst[(nL - 1)]) / abs(fitEst[(nL - 1)]) < 1e-4) {
                    # converged: relative change in sigma below 1e-4
                    tau.def <- sum(qs * w) / sum(w)
                    res <-
                        c(
                            alpha = alpha,
                            tau = tau,
                            tau.def = tau.def,
                            # NOTE: despite the name this is the residual
                            # standard error of the final fit, not a squared
                            # error; kept for interface compatibility
                            mse = fitEst[nL],
                            alpha0 = alpha0,
                            tau0 = tau0
                        )
                    break
                }

            } else {
                # nls ran but did not converge
                res <- NULL
                break
            }
        }

        res
    }



#' Convert copy ratios into absolute copy numbers for a given purity/ploidy
#'
#' Given user-supplied tumor purity (\code{alpha}) and ploidy (\code{tau})
#' estimates -- typically taken from the result of \code{run_fromLocal} or
#' \code{run_fromDF} -- this function translates each segment's normalized
#' copy ratio into an integer absolute copy number.
#'
#' @param seg.data a data frame with five
#' columns: "chrom", "loc.start", "loc.end", "eff.seg.len", "normalized.ratio".
#' @param alpha tumor purity estimate
#' @param tau tumor ploidy estimate
#' @param qmax maximum allowed absolute copy
#' number for any segments
#' @return a \code{data.frame} consisting of the first four input columns
#' augmented with:
#' \item{r}{observed copy ratio}
#' \item{rhat}{expected copy ratio}
#' \item{CN}{absolute copy number, clamped to the range [0, qmax]}
#'
#' @author Lei Bao
#' @export
#'
get_absCopyNumber <- function(seg.data, alpha, tau, qmax = 7) {
    ratio <- seg.data[, "normalized.ratio"]
    # average "copies per cell" of the tumor/normal mixture and the
    # contribution of the diploid normal fraction
    mix.ploidy <- alpha * tau + (1.0 - alpha) * 2
    normal.part <- (1.0 - alpha) * 2
    # invert the mixture model r = (alpha * q + normal.part) / mix.ploidy,
    # then round to integers and clamp to the allowed range [0, qmax]
    cn <- pmin(pmax(round((ratio * mix.ploidy - normal.part) / alpha), 0), qmax)
    # expected copy ratio implied by the integer copy numbers
    fitted.ratio <- (alpha * cn + normal.part) / mix.ploidy
    data.frame(seg.data[, 1:4],
               r = ratio,
               rhat = fitted.ratio,
               CN = cn)
}




#' Run absCopyNumber from Local Files
#'
#' This wrapper function accepts data files and user specified parameters and runs
#' the absCNseq algorithm from paper "AbsCN-seq: a statistical method to estimate
#' tumor purity, ploidy and absolute copy numbers from next-generation sequencing data."
#'
#'
#' @param seg.fn The name of the file
#' containing the segmentation data. An example file called "example.cn.txt.gz"
#' can be found under the "inst/extdata" folder within the package.
#' @param snv.fn The name of the file
#' containing a set of somatic single nucleotide variants (SNVs). An example
#' file called "example.snv.txt.gz" can be found under the "inst/extdata" folder within
#' the package.
#' @param res.dir The output directory
#' @param sample.name  Sample name, used when store result to local machine.
#' @param platform  "WES" for whole exome sequencing, "WGS" for whole genome sequencing and
#' "MicroArray" for SNP or other gene chips can detect CNV.
#' @param alpha.min The minimum allowed
#' value for tumor purity. Default is 0.20. If you do have the pathologist
#' estimate, set it as the lower bound of the pathologist estimate is usually
#' preferred.
#' @param alpha.max The maximum allowed
#' value for tumor purity. Default is 1.0. If you do have the pathologist
#' estimate, set it as the upper bound of the pathologist estimate is usually
#' preferred.
#' @param tau.min The minimum allowed value for tumor ploidy, default is 0.5.
#' @param tau.max The maximum allowed value for tumor ploidy, default is 8.
#' @param min.sol.freq A solution
#' should appear at least this many times to be kept. Singleton solutions are
#' usually not trustable. By default (min.sol.freq=0), the program will only
#' retain solutions that cover at least 5 percent of the search space.
#' @param min.seg.len The minimum
#' length of a segment to be included in computation. The default value is 200
#' bp for WES, 3000 bp for WGS and 0 bp for MicroArray.
#' @param qmax Maximum allowed absolute copy number for any segments, default is 7.
#' @param lamda The relative weight of the
#' segment copy ratio data over the SNV data. Must be a value in (0.0,1.0]. Only used when SNV
#' file is provided. The default value is 0.5,
#' which give equal weights to copy-number-based ratio estimator and SNV-frequency-based estimator. If
#' \code{lamda} is 1, only use copy-number-based ratio model to minimized objective function.
#' @param verbose if \code{True}, print extra output
#' @return
#' \item{searchRes }{a data frame giving the solution set (purity and
#' ploidy pairs) ranked by their fitting errors }
#' \item{absCN }{a data frame
#' giving the absolute copy number estimates of tumor cells for the top first
#' solution (purity and ploidy pair)}
#' \item{absSNV }{a data frame giving the
#' absolute multiplicity estimates of SNVs for the top first solution (purity
#' and ploidy pair) }
#' \item{orig.seg.dat }{original copy ratio data read from
#' the input segmentation file }
#' \item{seg.dat }{filtered copy ratio data }
#' \item{orig.snv.dat }{original SNV data read from the input SNV file }
#' \item{snv.dat }{filtered SNV data }
#'
#' @author Lei Bao, Shixiang Wang
#' @importFrom readr read_tsv
#' @export
#' @references AbsCN-seq: a statistical method to estimate tumor purity,
#' ploidy and absolute copy numbers from next-generation sequencing data.
run_fromLocal <-
    function(seg.fn,
             snv.fn = NULL,
             res.dir = NULL,
             sample.name,
             platform = c("WES", "WGS", "MicroArray"),
             alpha.min = 0.2,
             alpha.max = 1.0,
             tau.min = 0.5,
             tau.max = 8.0,
             min.sol.freq = 0,
             min.seg.len = NULL,
             qmax = 7,
             lamda = 0.5,
             verbose = FALSE) {
        # Read the segmentation file. Use `&&` (scalar, short-circuiting):
        # with the elementwise `&`, a NULL seg.fn makes file.exists(NULL)
        # return logical(0) and the `if` aborts with an obscure
        # "argument is of length zero" error instead of the message below.
        if (!is.null(seg.fn) && file.exists(seg.fn)) {
            suppressMessages(seg.data <-
                                 as.data.frame(readr::read_tsv(seg.fn, progress = FALSE)))
        } else {
            stop("ERROR: The input segmentation file is not provided or does not exist.")
        }

        # match.arg() already stops on any value outside the declared choices,
        # so no trailing else/stop branch is needed here. Apply the
        # per-platform default minimum segment length only when the caller
        # did not supply one.
        platform <- match.arg(platform)
        if (is.null(min.seg.len)) {
            min.seg.len <- switch(platform,
                                  WES = 200,
                                  WGS = 3000,
                                  MicroArray = 0)
        }

        # Validate that the segmentation data contains every required column.
        seg.in.col <- colnames(seg.data)
        seg.req.col <-
            c("chrom",
              "loc.start",
              "loc.end",
              "eff.seg.len",
              "normalized.ratio")
        tmp <- setdiff(seg.req.col, seg.in.col)
        if (length(tmp) > 0) {
            stop(
                "ERROR: The input segmentation file must include all the named columns in order: chrom, loc.start, loc.end, eff.seg.len, normalized.ratio."
            )
        }

        if (is.null(snv.fn)) {
            # No SNV data: grid-search purity/ploidy with the
            # copy-ratio-only model.
            res.list <-
                grid.search.alpha.simple(
                    seg.data,
                    alpha.min,
                    alpha.max,
                    tau.min,
                    tau.max,
                    min.sol.freq,
                    min.seg.len,
                    qmax,
                    verbose
                )
        } else {
            if (file.exists(snv.fn)) {
                suppressMessages(snv.data <-
                                     as.data.frame(readr::read_tsv(snv.fn, progress = FALSE)))
            } else {
                stop("ERROR: The input SNV file does not exist.")
            }

            # Validate that the SNV data contains every required column.
            snv.in.col <- colnames(snv.data)
            snv.req.col <- c("chrom",  "position", "tumor_var_freq")
            tmp <- setdiff(snv.req.col, snv.in.col)
            if (length(tmp) > 0) {
                stop(
                    "ERROR: The input SNV file must include all the named columns in order: chrom, position, tumor_var_freq."
                )
            }

            # SNV data present: use the joint copy-ratio + SNV-frequency
            # model, weighting the two estimators by `lamda`.
            res.list <-
                grid.search.alpha(
                    seg.data,
                    snv.data,
                    alpha.min,
                    alpha.max,
                    tau.min,
                    tau.max,
                    min.sol.freq,
                    min.seg.len,
                    qmax,
                    lamda,
                    verbose
                )
        }

        # Optionally persist the full result list as <sample.name>.RData
        # under res.dir (created if necessary).
        if (!is.null(res.dir)) {
            dir.create(res.dir,
                       showWarnings = FALSE,
                       recursive = TRUE)
            res.file <-
                file.path(res.dir, paste(sample.name, ".RData", sep = ""))
            save(res.list, file = res.file)
        }

        res.list
    }


#' Run absCopyNumber from data.frame
#'
#' This wrapper function accepts data.frame and user specified parameters and runs
#' the absCNseq algorithm from paper "AbsCN-seq: a statistical method to estimate
#' tumor purity, ploidy and absolute copy numbers from next-generation sequencing data."
#'
#'
#' @param seg.df The data.frame
#' containing the segmentation data. An example file called "example.cn.txt.gz"
#' can be found under the "inst/extdata" folder within the package.
#' @param snv.df The data.frame
#' containing a set of somatic single nucleotide variants (SNVs). An example
#' file called "example.snv.txt.gz" can be found under the "inst/extdata" folder within
#' the package.
#' @param platform  "WES" for whole exome sequencing, "WGS" for whole genome sequencing and
#' "MicroArray" for SNP or other gene chips can detect CNV.
#' @param alpha.min The minimum allowed
#' value for tumor purity. Default is 0.20. If you do have the pathologist
#' estimate, set it as the lower bound of the pathologist estimate is usually
#' preferred.
#' @param alpha.max The maximum allowed
#' value for tumor purity. Default is 1.0. If you do have the pathologist
#' estimate, set it as the upper bound of the pathologist estimate is usually
#' preferred.
#' @param tau.min The minimum allowed value for tumor ploidy, default is 0.5.
#' @param tau.max The maximum allowed value for tumor ploidy, default is 8.
#' @param min.sol.freq A solution
#' should appear at least this many times to be kept. Singleton solutions are
#' usually not trustable. By default (min.sol.freq=0), the program will only
#' retain solutions that cover at least 5 percent of the search space.
#' @param min.seg.len The minimum
#' length of a segment to be included in computation. The default value is 200
#' bp for WES, 3000 bp for WGS and 0 bp for MicroArray.
#' @param qmax Maximum allowed absolute copy number for any segments, default is 7.
#' @param lamda The relative weight of the
#' segment copy ratio data over the SNV data. Must be a value in (0.0,1.0]. Only used when SNV
#' file is provided. The default value is 0.5,
#' which gives equal weights to the copy-number-based ratio estimator and the SNV-frequency-based estimator. If
#' \code{lamda} is 1, only the copy-number-based ratio model is used to minimize the objective function.
#' @param verbose if \code{TRUE}, print extra output
#' @return
#' \item{searchRes }{a data frame giving the solution set (purity and
#' ploidy pairs) ranked by their fitting errors }
#' \item{absCN }{a data frame
#' giving the absolute copy number estimates of tumor cells for the top first
#' solution (purity and ploidy pair)}
#' \item{absSNV }{a data frame giving the
#' absolute multiplicity estimates of SNVs for the top first solution (purity
#' and ploidy pair) }
#' \item{orig.seg.dat }{original copy ratio data read from
#' the input segmentation file }
#' \item{seg.dat }{filtered copy ratio data }
#' \item{orig.snv.dat }{original SNV data read from the input SNV file }
#' \item{snv.dat }{filtered SNV data }
#'
#' @author Lei Bao, Shixiang Wang
#' @export
#' @references AbsCN-seq: a statistical method to estimate tumor purity,
#' ploidy and absolute copy numbers from next-generation sequencing data.
run_fromDF <-
    function(seg.df,
             snv.df = NULL,
             platform = c("WES", "WGS", "MicroArray"),
             alpha.min = 0.2,
             alpha.max = 1.0,
             tau.min = 0.5,
             tau.max = 8.0,
             min.sol.freq = 0,
             min.seg.len = NULL,
             qmax = 7,
             lamda = 0.5,
             verbose = FALSE) {
        # match.arg() already stops on any value outside the declared choices,
        # so no trailing else/stop branch is needed here. Apply the
        # per-platform default minimum segment length only when the caller
        # did not supply one.
        platform <- match.arg(platform)
        if (is.null(min.seg.len)) {
            min.seg.len <- switch(platform,
                                  WES = 200,
                                  WGS = 3000,
                                  MicroArray = 0)
        }

        seg.data <- seg.df
        # Validate that the segmentation data contains every required column.
        seg.in.col <- colnames(seg.data)
        seg.req.col <-
            c("chrom",
              "loc.start",
              "loc.end",
              "eff.seg.len",
              "normalized.ratio")
        tmp <- setdiff(seg.req.col, seg.in.col)
        if (length(tmp) > 0) {
            stop(
                "ERROR: The input segmentation data.frame must include all the named columns in order: chrom, loc.start, loc.end, eff.seg.len, normalized.ratio."
            )
        }

        if (is.null(snv.df)) {
            # No SNV data: grid-search purity/ploidy with the
            # copy-ratio-only model.
            res.list <-
                grid.search.alpha.simple(
                    seg.data,
                    alpha.min,
                    alpha.max,
                    tau.min,
                    tau.max,
                    min.sol.freq,
                    min.seg.len,
                    qmax,
                    verbose
                )
        } else {
            snv.data <- snv.df
            # Validate that the SNV data contains every required column.
            snv.in.col <- colnames(snv.data)
            snv.req.col <- c("chrom",  "position", "tumor_var_freq")
            tmp <- setdiff(snv.req.col, snv.in.col)
            if (length(tmp) > 0) {
                stop(
                    "ERROR: The input SNV data.frame must include all the named columns in order: chrom, position, tumor_var_freq."
                )
            }

            # SNV data present: use the joint copy-ratio + SNV-frequency
            # model, weighting the two estimators by `lamda`.
            res.list <-
                grid.search.alpha(
                    seg.data,
                    snv.data,
                    alpha.min,
                    alpha.max,
                    tau.min,
                    tau.max,
                    min.sol.freq,
                    min.seg.len,
                    qmax,
                    lamda,
                    verbose
                )
        }
        res.list
    }


#' Plot estimated integer copy numbers for each chromosome.
#'
#' plot estimated integer copy numbers from absCNseq for each chromosome.
#'
#' @param seg.data  A result data frame
#' generated by the compute.absCN function. It has five mandatory columns:
#' "chrom", "loc.start", "loc.end", "r", "CN".
#' @param chromnum  which chromosome to be
#' plotted. Must be a number from 1 to 22
#' @param rawdata  (optional): raw copy
#' ratio data before segmentation. This data frame has four mandatory columns:
#' chromosome number, start position, end position, and normalized copy ratio
#' of tumor vs. normal sample on log2 scale (log2_ratio). The column names are
#' provided by the user. Default column names are taken from the VARSCAN
#' output.
#' @param chromvar The variable name of
#' the chromosome number column for the optional raw data.
#' @param loc.start The variable name of
#' the start position column for the optional raw data.
#' @param loc.end The variable name of the
#' end position column for the optional raw data.
#' @param normalized.ratio The
#' variable name of the log2_ratio column for the optional raw data.
#' @author Lei Bao
#' @export
plot_absCopyNumber <-
    function(seg.data,
             chromnum = 1,
             rawdata = NULL,
             chromvar = 'chrom',
             loc.start = 'chr_start',
             loc.end = 'chr_stop',
             normalized.ratio = 'log2_ratio') {
        # Validate that the result data frame contains every required column.
        seg.in.col <- colnames(seg.data)
        seg.req.col <- c("chrom", "loc.start", "loc.end", "r", "CN")
        tmp <- setdiff(seg.req.col, seg.in.col)
        if (length(tmp) > 0) {
            stop(
                "ERROR: The input dataset must include all the named columns in order: chrom, loc.start, loc.end, r, CN."
            )
        }

        if (!(chromnum %in% 1:22)) {
            stop("ERROR: chromosome number must be from 1 to 22.")
        }

        segs <- subset(seg.data, chrom == chromnum)

        # xpd = NA lets the legend be drawn outside the plot region (it sits
        # above ylim); restore graphics state on exit so this function does
        # not leak par() settings into the caller's session.
        opar <- par(xpd = NA, oma = c(2, 1, 3, 1))
        on.exit(par(opar), add = TRUE)

        # Shared plotting parameters: blue = raw copy number, red = absolute.
        lty <- c(1, 1)
        col <- c('blue', 'red')
        lwd <- c(4, 4)
        ylim <- c(0, 8)
        legtxt <- c('Raw', 'Absolute')

        if (!is.null(rawdata)) {
            # Map the user-supplied column names onto the canonical names
            # used below.
            rawdata$chrom <- rawdata[, chromvar]
            rawdata$chr_start <- rawdata[, loc.start]
            rawdata$chr_stop <- rawdata[, loc.end]
            rawdata$log2_ratio <- rawdata[, normalized.ratio]

            raw <- subset(rawdata, chrom == chromnum)
            # Raw bin-level points: copy number = 2 * 2^(log2 tumor/normal).
            plot(
                raw[, 'chr_start'],
                2 * 2 ^ (raw$log2_ratio),
                xlab = 'Position',
                sub = paste('chr', chromnum, sep = ''),
                ylab = 'Copy Number',
                col = 'green',
                cex = .1,
                axes = FALSE,
                xlim = c(min(raw[, 'chr_start'], na.rm = TRUE),
                         max(raw[, 'chr_stop'], na.rm = TRUE)),
                ylim = ylim
            )
        } else {
            # No raw data: set up the plot area from the segment levels
            # (2 * segment ratio r) instead.
            plot(
                segs[, "loc.start"],
                2 * segs[, "r"],
                xlab = 'Position',
                sub = paste('chr', chromnum, sep = ''),
                ylab = 'Copy Number',
                col = 'green',
                cex = .1,
                axes = FALSE,
                xlim = c(min(segs[, "loc.start"], na.rm = TRUE),
                         max(segs[, "loc.end"], na.rm = TRUE)),
                ylim = ylim
            )
        }
        box()
        axis(1)
        axis(2, las = 2)
        # Blue segments: raw (relative) copy number, 2 * segment ratio r.
        segments(
            segs[, "loc.start"],
            2 * segs[, "r"],
            segs[, "loc.end"],
            2 * segs[, "r"],
            col = col[1],
            lwd = lwd[1],
            lty = lty[1]
        )
        # Red segments: estimated absolute integer copy number (CN).
        segments(
            segs[, "loc.start"],
            segs[, "CN"],
            segs[, "loc.end"],
            segs[, "CN"],
            col = col[2],
            lwd = lwd[2],
            lty = lty[2]
        )

        # Legend placed at y = 10, above ylim = c(0, 8); it is visible only
        # because clipping is disabled (xpd = NA).
        legend(
            min(segs[, "loc.start"], na.rm = TRUE),
            10,
            legtxt,
            lty = lty,
            col = col,
            lwd = lwd,
            bty = 'n',
            ncol = 2,
            xjust = 0
        )
    }

# Hack global variable  ---------------------------------------------------
# Declare column names used via non-standard evaluation (e.g. `chrom` inside
# the subset() calls in plot_absCopyNumber; `wt` is presumably used in code
# outside this chunk -- TODO confirm) so R CMD check does not report them as
# undefined global variables. This registration has no runtime effect.
base::suppressMessages(utils::globalVariables(c("chrom", "wt")))
# ShixiangWang/absCopyNumber documentation built on May 28, 2019, 5:42 p.m.