R/HIBAG.R

Defines functions .onLoad .onAttach hlaSetKernelTarget print.hlaAttrBagObj plot.hlaAttrBagObj print.hlaAttrBagClass plot.hlaAttrBagClass hlaDistance hlaLDMatrix hlaGenoLD hlaOutOfBag summary.hlaAttrBagObj hlaModelFromObj hlaSubModelObj hlaCombineModelObj hlaModelToObj summary.hlaAttrBagClass hlaPredMerge hlaPredict predict.hlaAttrBagClass hlaClose hlaParallelAttrBagging .show_model_obj hlaAttrBagging .printMatching

Documented in hlaAttrBagging hlaClose hlaCombineModelObj hlaDistance hlaGenoLD hlaLDMatrix hlaModelFromObj hlaModelToObj hlaOutOfBag hlaParallelAttrBagging hlaPredict hlaPredMerge hlaSetKernelTarget hlaSubModelObj plot.hlaAttrBagClass plot.hlaAttrBagObj predict.hlaAttrBagClass print.hlaAttrBagClass print.hlaAttrBagObj summary.hlaAttrBagClass summary.hlaAttrBagObj

#######################################################################
#
# Package Name: HIBAG
# Description:
#   HIBAG -- HLA Genotype Imputation with Attribute Bagging
#
# HIBAG R package, HLA Genotype Imputation with Attribute Bagging
# Copyright (C) 2011-2020   Xiuwen Zheng (zhengx@u.washington.edu)
# All rights reserved.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
#


##########################################################################
#
# Attribute Bagging method -- HIBAG algorithm
#

.printMatching <- function(p)
{
    w <- summary(p)
    class(w) <- "table"
    w <- c(w[1L],
        "0.1% Qu." = quantile(p, 0.001, na.rm=TRUE, names=FALSE),
        "1% Qu."   = quantile(p, 0.01, na.rm=TRUE, names=FALSE),
        w[2L], w[3L], w[5L], w[6L],
        w[4L], "SD" = sd(p, na.rm=TRUE))
    print(w)
}


##########################################################################
# Fit a HIBAG model for imputing HLA genotypes
#

hlaAttrBagging <- function(hla, snp, nclassifier=100L,
    mtry=c("sqrt", "all", "one"), prune=TRUE, na.rm=TRUE, mono.rm=TRUE,
    nthread=1L, verbose=TRUE, verbose.detail=FALSE)
{
    # check
    stopifnot(inherits(hla, "hlaAlleleClass"))
    stopifnot(inherits(snp, "hlaSNPGenoClass"))
    stopifnot(is.numeric(nclassifier), length(nclassifier)==1L)
    stopifnot(is.character(mtry) | is.numeric(mtry), length(mtry)>0L)
    stopifnot(is.logical(prune), length(prune)==1L)
    stopifnot(is.logical(na.rm), length(na.rm)==1L)
    stopifnot(is.logical(mono.rm), length(mono.rm)==1L)
    stopifnot(is.numeric(nthread) | is.logical(nthread), length(nthread)==1L,
        !is.na(nthread))
    stopifnot(is.logical(verbose), length(verbose)==1L)
    stopifnot(is.logical(verbose.detail), length(verbose.detail)==1L)
    if (verbose.detail) verbose <- TRUE

    if (is.na(nclassifier)) nclassifier <- 0L
    with.in.call <- nclassifier==0L
    with.matching <- (nclassifier > 0L)
    if (!with.matching)
    {
        nclassifier <- -nclassifier
        if (nclassifier == 0L) nclassifier <- 1L
    }

    # get the common samples
    samp.id <- intersect(hla$value$sample.id, snp$sample.id)

    # hla types
    samp.flag <- match(samp.id, hla$value$sample.id)
    hla.allele1 <- hla$value$allele1[samp.flag]
    hla.allele2 <- hla$value$allele2[samp.flag]
    if (na.rm)
    {
        if (any(is.na(c(hla.allele1, hla.allele2))))
        {
            warning("There are missing HLA alleles, ",
                "and the corresponding samples have been removed.")
            flag <- is.na(hla.allele1) | is.na(hla.allele2)
            samp.id <- setdiff(samp.id, hla$value$sample.id[samp.flag[flag]])
            samp.flag <- match(samp.id, hla$value$sample.id)
            hla.allele1 <- hla$value$allele1[samp.flag]
            hla.allele2 <- hla$value$allele2[samp.flag]
        }
    } else {
        if (any(is.na(c(hla.allele1, hla.allele2))))
        {
            stop("There are missing HLA alleles!")
        }
    }

    # SNP genotypes
    samp.flag <- match(samp.id, snp$sample.id)
    snp.geno <- snp$genotype[, samp.flag]
    if (!is.integer(snp.geno))
        storage.mode(snp.geno) <- "integer"
    if (with.in.call)
    {
        # save the variable, until remove it later
        .packageEnv$snp.geno <- snp.geno
    }

    tmp.snp.id <- snp$snp.id
    tmp.snp.position <- snp$snp.position
    tmp.snp.allele <- snp$snp.allele

    # remove mono-SNPs
    if (mono.rm)
    {
        snpsel <- rowMeans(snp.geno, na.rm=TRUE)
        snpsel[!is.finite(snpsel)] <- 0
        snpsel <- (0 < snpsel) & (snpsel < 2)
        if (sum(!snpsel) > 0L)
        {
            snp.geno <- snp.geno[snpsel, ]
            if (verbose)
            {
                a <- sum(!snpsel)
                if (a > 0L)
                    cat(sprintf("Exclude %d monomorphic SNP%s\n", a, .plural(a)))
            }
            tmp.snp.id <- tmp.snp.id[snpsel]
            tmp.snp.position <- tmp.snp.position[snpsel]
            tmp.snp.allele <- tmp.snp.allele[snpsel]
        }
    }

    # check
    if (length(samp.id) <= 0L)
        stop("There is no common sample between 'hla' and 'snp'.")
    if (length(dim(snp.geno)[1L]) <= 0L)
        stop("There is no valid SNP markers.")


    ###################################################################
    # initialize ...

    n.snp <- dim(snp.geno)[1L]     # Num. of SNPs
    n.samp <- dim(snp.geno)[2L]    # Num. of samples
    HUA <- hlaUniqueAllele(c(hla.allele1, hla.allele2))
    H <- factor(match(c(hla.allele1, hla.allele2), HUA))
    levels(H) <- HUA
    n.hla <- nlevels(H)
    H1 <- as.integer(H[1L:n.samp]) - 1L
    H2 <- as.integer(H[(n.samp+1L):(2L*n.samp)]) - 1L

    # create an attribute bagging object (return an integer)
    ABmodel <- .Call(HIBAG_Training, n.snp, n.samp, snp.geno, n.hla, H1, H2)

    # number of variables randomly sampled as candidates at each split
    mtry <- mtry[1L]
    if (is.character(mtry))
    {
        if (mtry == "sqrt")
        {
            mtry <- ceiling(sqrt(n.snp))
        } else if (mtry == "all")
        {
            mtry <- n.snp
        } else if (mtry == "one")
        {
            mtry <- 1L
        } else {
            stop("Invalid mtry!")
        }
    } else if (is.numeric(mtry))
    {
        if (is.finite(mtry))
        {
            if ((0 < mtry) & (mtry < 1)) mtry <- n.snp*mtry
            mtry <- ceiling(mtry)
            if (mtry > n.snp) mtry <- n.snp
        } else {
            mtry <- ceiling(sqrt(n.snp))
        }
    } else {
        stop("Invalid mtry value!")
    }
    if (mtry <= 0) mtry <- 1L

    if (verbose)
    {
        cat(sprintf("Build a HIBAG model with %d individual classifier%s:\n",
            nclassifier, .plural(nclassifier)))
        cat("    # of SNPs randomly sampled as candidates for each selection: ",
            mtry, "\n", sep="")
        cat("    # of SNPs: ", n.snp, "\n", sep="")
        cat("    # of samples: ", n.samp, "\n", sep="")
        s <- ifelse(!grepl("^KIR", hla$locus), "HLA", "KIR")
        cat("    # of unique ", s, " alleles: ", n.hla, "\n", sep="")
        cat("CPU flags: ", .Call(HIBAG_Kernel_Version)[[2L]][1L], "\n", sep="")
    }

    # set the number of threads (initialized in HIBAG_NewClassifiers)
    if (isTRUE(nthread))
        nthread <- as.integer(defaultNumThreads())
    if (is.na(nthread) || (nthread<1L)) nthread <- 1L


    ###################################################################
    # training ...
    # add new individual classifers
    .Call(HIBAG_NewClassifiers, ABmodel, nclassifier, mtry, prune,
        nthread, verbose, verbose.detail, NULL)

    # output
    mod <- list(n.samp = n.samp, n.snp = n.snp, sample.id = samp.id,
        snp.id = tmp.snp.id, snp.position = tmp.snp.position,
        snp.allele = tmp.snp.allele,
        snp.allele.freq = 0.5*rowMeans(snp.geno, na.rm=TRUE),
        hla.locus = hla$locus, hla.allele = levels(H),
        hla.freq = prop.table(table(H)),
        assembly = as.character(snp$assembly)[1L],
        model = ABmodel,
        appendix = list())
    if (is.na(mod$assembly)) mod$assembly <- "unknown"
    if (with.in.call) mod$mtry <- mtry

    class(mod) <- "hlaAttrBagClass"


    ###################################################################
    # calculate matching proportion
    if (with.matching)
    {
        if (verbose)
            cat("Calculating matching proportion:\n")
        pd <- hlaPredict(mod, snp, cl=nthread, verbose=FALSE)
        mod$matching <- pd$value$matching
        if (verbose)
        {
            .printMatching(mod$matching)
            acc <- hlaCompareAllele(hla, pd, verbose=FALSE)$overall$acc.haplo
            cat(sprintf("Accuracy with training data: %.2f%%\n", acc*100))
            # out-of-bag accuracy
            mobj <- hlaModelToObj(mod)
            acc <- sapply(mobj$classifiers, function(x) x$outofbag.acc)
            cat(sprintf("Out-of-bag accuracy: %.2f%%\n", mean(acc)*100))
        }
    }

    # output
    mod
}


##########################################################################
# Fit a HIBAG model for imputing HLA genotypes in parallel
#

.show_model_obj <- function(mobj, autosave)
{
    z <- summary(mobj, show=FALSE)
    cat(ifelse(autosave, "==Saved==", " --"),
        paste0("#", length(mobj$classifier), ","), "avg oob acc:",
        sprintf("%0.2f%%, sd: %0.2f%%, min: %0.2f%%, max: %0.2f%%\n",
        z$info["accuracy", "Mean"], z$info["accuracy", "SD"],
        z$info["accuracy", "Min"], z$info["accuracy", "Max"]))
    invisible()
}

hlaParallelAttrBagging <- function(cl, hla, snp, auto.save="",
    nclassifier=100L, mtry=c("sqrt", "all", "one"), prune=TRUE, na.rm=TRUE,
    mono.rm=TRUE, stop.cluster=FALSE, verbose=TRUE, verbose.detail=FALSE)
{
    # check
    stopifnot(is.null(cl) | is.logical(cl) | is.numeric(cl) |
        inherits(cl, "cluster"))
    stopifnot(inherits(hla, "hlaAlleleClass"))
    stopifnot(inherits(snp, "hlaSNPGenoClass"))
    stopifnot(is.character(auto.save), length(auto.save)==1L, !is.na(auto.save))
    if (auto.save!="" && is.na(.fn_obj_check(auto.save)))
        stop("'auto.save' should be a .rda/.RData or .rds file name.")
    stopifnot(is.numeric(nclassifier), length(nclassifier)==1L, nclassifier>0L)
    stopifnot(is.character(mtry) | is.numeric(mtry), length(mtry)>0L)
    stopifnot(is.logical(prune), length(prune)==1L)
    stopifnot(is.logical(na.rm), length(na.rm)==1L)
    stopifnot(is.logical(mono.rm), length(mono.rm)==1L)
    stopifnot(is.logical(stop.cluster), length(stop.cluster)==1L)
    stopifnot(is.logical(verbose), length(verbose)==1L)
    stopifnot(is.logical(verbose.detail), length(verbose.detail)==1L)

    if (verbose)
    {
        cat("Building a HIBAG model:\n")
        cat(sprintf("    %d individual classifier%s\n", nclassifier,
            .plural(nclassifier)))
        if (!is.null(cl))
        {
            cat(sprintf("    run in parallel with %d compute node%s\n",
                length(cl), .plural(length(cl))))
        }
        if (auto.save != "")
            cat("    autosave to ", sQuote(auto.save), "\n", sep="")
    }

    if (inherits(cl, "cluster"))
    {
        if (!requireNamespace("parallel", quietly=TRUE))
            stop("The `parallel' package should be installed.")
        if (verbose)
            cat("[-] ", format(Sys.time(), "%Y-%m-%d %H:%M:%S"), "\n", sep="")
        # set random number for the cluster
        RNGkind("L'Ecuyer-CMRG")
        rand <- .Random.seed
        parallel::clusterSetRNGStream(cl)

        total <- 0L
        ans <- .DynamicClusterCall(cl,
            fun = function(job, hla, snp, mtry, prune, na.rm, mono.rm)
            {
                model <- hlaAttrBagging(hla=hla, snp=snp, nclassifier=-1L,
                    mtry=mtry, prune=prune, na.rm=na.rm, mono.rm=mono.rm,
                    verbose=FALSE, verbose.detail=FALSE)
                mobj <- hlaModelToObj(model)
                hlaClose(model)
                mobj
            },
            combine.fun = function(obj1, obj2)
            {
                if (is.null(obj1))
                    mobj <- obj2
                else if (is.null(obj2))
                    mobj <- obj1
                else
                    mobj <- hlaCombineModelObj(obj1, obj2)
                if (auto.save != "")
                    .fn_obj_save(auto.save, mobj)
                if (verbose & !is.null(mobj))
                    .show_model_obj(mobj, auto.save!="")
                mobj
            },
            msg.fn = function(job, obj)
            {
                if (verbose)
                {
                    z <- summary(obj, show=FALSE)
                    total <<- total + 1L
                    cat(sprintf(
                        "[%d] %s, worker%3d, # of SNPs: %g, # of haplo: %g, oob acc: %0.1f%%\n",
                        total, format(Sys.time(), "%Y-%m-%d %H:%M:%S"),
                        as.integer(job), z$info["num.snp", "Mean"],
                        z$info["num.haplo", "Mean"],
                        z$info["accuracy", "Mean"]))
                }
            },
            n=nclassifier, stop.cluster=stop.cluster,
            hla=hla, snp=snp, mtry=mtry, prune=prune,
            na.rm=na.rm, mono.rm=mono.rm
        )

        # the next random seed
        parallel::nextRNGStream(rand)
        parallel::nextRNGSubStream(rand)
        if (stop.cluster) cl <- NULL
        if (auto.save != "")
            ans <- .fn_obj_load(auto.save)
        mod <- hlaModelFromObj(ans)

    } else {
        # find the number of threads
        nthread <- 1L
        if (isTRUE(nthread))
            nthread <- as.integer(defaultNumThreads())
        if (is.numeric(cl)) nthread <- as.integer(cl)
        if (is.na(nthread) || (nthread<1L)) nthread <- 1L

        if (auto.save == "")
        {
            mod <- hlaAttrBagging(hla=hla, snp=snp, nclassifier=-nclassifier,
                mtry=mtry, prune=prune, na.rm=na.rm, mono.rm=mono.rm,
                nthread=nthread, verbose=verbose, verbose.detail=verbose.detail)
        } else {
            mod <- hlaAttrBagging(hla=hla, snp=snp, nclassifier=0L,
                mtry=mtry, prune=prune, na.rm=na.rm, mono.rm=mono.rm,
                nthread=nthread, verbose=verbose, verbose.detail=verbose.detail)
            mtry <- mod$mtry
            on.exit({ .packageEnv$snp.geno <- NULL }, add=TRUE)
            mobj <- hlaModelToObj(mod)
            .fn_obj_save(auto.save, mobj)
            if (verbose) .show_model_obj(mobj, TRUE)
            # add the remaining individual classifiers
            for (i in seq_len(nclassifier-1L))
            {
                .Call(HIBAG_NewClassifiers, mod$model, 1L, mtry, prune,
                    -nthread, verbose, verbose.detail, NULL)
                mobj <- hlaModelToObj(mod)
                .fn_obj_save(auto.save, mobj)
                if (verbose) .show_model_obj(mobj, TRUE)
            }
        }
    }

    # matching proportion
    if (verbose)
        cat("Calculating matching proportion:\n")
    if (is.null(cl)) cl <- FALSE
    pd <- hlaPredict(mod, snp, cl=cl, verbose=FALSE)
    mod$matching <- pd$value$matching
    mobj <- NULL
    if (auto.save != "")
    {
        mobj <- hlaModelToObj(mod)
        .fn_obj_save(auto.save, mobj)
    }
    if (verbose)
    {
        .printMatching(mod$matching)
        acc <- hlaCompareAllele(hla, pd, verbose=FALSE)$overall$acc.haplo
        cat(sprintf("Accuracy with training data: %.2f%%\n", acc*100))
        # out-of-bag accuracy
        if (is.null(mobj)) mobj <- hlaModelToObj(mod)
        acc <- sapply(mobj$classifiers, function(x) x$outofbag.acc)
        cat(sprintf("Out-of-bag accuracy: %.2f%%\n", mean(acc)*100))
    }

    # output
    if (auto.save != "") invisible() else mod
}


##########################################################################
# Close and dispose a HIBAG model
#

hlaClose <- function(model)
{
    stopifnot(inherits(model, "hlaAttrBagClass"))
    .Call(HIBAG_Close, model$model)
    invisible()
}


#######################################################################
# Predict HLA types using unphased SNP data
#

predict.hlaAttrBagClass <- function(object, snp, cl=FALSE,
    type=c("response", "prob", "response+prob"), vote=c("prob", "majority"),
    allele.check=TRUE, match.type=c("Position", "RefSNP+Position", "RefSNP"),
    same.strand=FALSE, verbose=TRUE, ...)
{
    stopifnot(inherits(object, "hlaAttrBagClass"))
    hlaPredict(object, snp, cl, type, vote, allele.check, match.type,
        same.strand, verbose)
}

hlaPredict <- function(object, snp, cl=FALSE,
    type=c("response", "prob", "response+prob"), vote=c("prob", "majority"),
    allele.check=TRUE, match.type=c("Position", "RefSNP+Position", "RefSNP"),
    same.strand=FALSE, verbose=TRUE)
{
    # check
    stopifnot(inherits(object, "hlaAttrBagClass"))
    stopifnot(is.logical(cl) | is.numeric(cl) | inherits(cl, "cluster"))
    stopifnot(is.logical(allele.check), length(allele.check)==1L)
    stopifnot(is.logical(same.strand), length(same.strand)==1L)
    stopifnot(is.logical(verbose), length(verbose)==1L)
    type <- match.arg(type)
    vote <- match.arg(vote)
    match.type <- match.arg(match.type)
    vote_method <- match(vote, c("prob", "majority"))

    if (inherits(cl, "cluster"))
    {
        if (!requireNamespace("parallel", quietly=TRUE))
            stop("The `parallel' package should be installed.")
        if (length(cl) <= 1L) cl <- FALSE
    }

    # if warning
    if (!is.null(object$appendix$warning))
        warning(object$appendix$warning, immediate.=TRUE)

    if (verbose)
    {
        # get the number of classifiers
        CNum <- .Call(HIBAG_GetNumClassifiers, object$model)
        cat("HIBAG model:\n",
            "    ", CNum, " individual classifier", .plural(CNum), "\n",
            "    ", length(object$snp.id), " SNPs\n",
            "    ", length(object$hla.allele), " unique HLA alleles\n", sep="")
        cat("Prediction:\n")
        if (vote_method == 1L)
            cat("    based on the averaged posterior probabilities\n")
        else
            cat("    by voting from all individual classifiers\n")
        if (inherits(cl, "cluster"))
        {
            cat("    run in parallel with ", length(cl), " compute node",
                .plural(length(cl)), "\n", sep="")
        }
    }

    if (!inherits(snp, "hlaSNPGenoClass"))
    {
        # it should be a vector or a matrix
        stopifnot(is.numeric(snp), is.vector(snp) | is.matrix(snp))
        if (is.vector(snp))
        {
            stopifnot(length(snp) == object$n.snp)
            snp <- matrix(snp, ncol=1L)
        } else {
            stopifnot(nrow(snp) == object$n.snp)
        }
        geno.sampid <- 1L:ncol(snp)
        assembly <- "auto-silent"

    } else {

        # a 'hlaSNPGenoClass' object

        ##################################################
        # check assembly first

        model.assembly <- as.character(object$assembly)[1L]
        if (is.na(model.assembly))
            model.assembly <- "unknown"
        geno.assembly <- as.character(snp$assembly)[1L]
        if (is.na(geno.assembly))
            geno.assembly <- "unknown"
        refstr <- sprintf("Model assembly: %s, SNP assembly: %s",
            model.assembly, geno.assembly)
        if (verbose)
            cat(refstr, "\n", sep="")

        if (model.assembly != geno.assembly)
        {
            if (any(c(model.assembly, geno.assembly) %in% "unknown"))
            {
                if (verbose)
                    message("The human genome references might not match!")
                if (geno.assembly == "unknown")
                    assembly <- model.assembly
                else
                    assembly <- geno.assembly
            } else {
                warning("The human genome references do not match! ",
                    refstr, ".", immediate.=TRUE)
                assembly <- model.assembly
            }
        } else {
            if (model.assembly != "unknown")
                assembly <- model.assembly
            else
                assembly <- "auto"
        }

        ##################################################

        geno.sampid <- snp$sample.id
        obj.id <- hlaSNPID(object, match.type)
        geno.id <- hlaSNPID(snp, match.type)

        # flag boolean
        flag <- FALSE
        if (length(obj.id) == length(geno.id))
        {
            if (all(obj.id == geno.id))
                flag <- TRUE
        }

        # check and switch A/B alleles
        if (flag)
        {
            if (allele.check)
            {
                snp <- hlaGenoSwitchStrand(snp, object,
                    match.type, same.strand, verbose)$genotype
            } else {
                snp <- snp$genotype
            }
        } else {

            # snp selection
            snp.sel <- match(obj.id, geno.id)
            snp.allele <- snp$snp.allele
            snp.allele[is.na(snp.allele)] <- ""

            # temporary variable
            tmp <- list(genotype = snp$genotype[snp.sel,],
                sample.id = snp$sample.id,
                snp.id = object$snp.id, snp.position = object$snp.position,
                snp.allele = snp.allele[snp.sel],
                assembly = snp$assembly)
            flag <- is.na(tmp$snp.allele)
            tmp$snp.allele[flag] <- object$snp.allele[
                match(tmp$snp.id[flag], object$snp.id)]
            if (is.vector(tmp$genotype))
                tmp$genotype <- matrix(tmp$genotype, ncol=1L)
            class(tmp) <- "hlaSNPGenoClass"

            # the total number of missing snp
            missing.cnt <- sum(flag)

            # verbose
            if (verbose)
            {
                if (missing.cnt > 0L)
                {
                    cat(sprintf("There %s %d missing SNP%s (%0.1f%%).\n",
                        ifelse(missing.cnt > 1L, "are", "is"),
                        missing.cnt, .plural(missing.cnt),
                        100*missing.cnt/length(obj.id)))
                }
            }

            # try alternative matching if possible
            if (match.type == "RefSNP+Position")
            {
                # match by RefSNP IDs
                s1 <- hlaSNPID(object, "RefSNP")
                s2 <- hlaSNPID(snp, "RefSNP")
                mcnt.id <- length(s1) - length(intersect(s1, s2))

                # match by positions
                s1 <- hlaSNPID(object, "Position")
                s2 <- hlaSNPID(snp, "Position")
                mcnt.pos <- length(s1) - length(intersect(s1, s2))

                if (((mcnt.id < missing.cnt) |
                    (mcnt.pos < missing.cnt)) & verbose)
                {
                    message(
                        "Hint:\n",
                        "The current SNP matching requires both RefSNP IDs ",
                        "and positions, and lower missing fraction(s) could ",
                        "be gained:\n",
                        sprintf("\t%0.1f%% by matching RefSNP IDs only, ",
                            100*mcnt.id/length(s1)),
                        "call 'hlaPredict(..., match.type=\"RefSNP\")'\n",
                        sprintf("\t%0.1f%% by matching positions only, ",
                            100*mcnt.pos/length(s1)),
                        "call 'hlaPredict(..., match.type=\"Position\")'\n",
                        "Any concern about SNP mismatching should be emailed ",
                        "to the genotyping platform provider.")

                    if (!is.null(object$appendix$platform))
                    {
                        message("The supported platform(s): ",
                            object$appendix$platform)
                    }
                }
            }

            if (missing.cnt == length(obj.id))
            {
                stop("There is no overlapping of SNPs!")
            } else if (missing.cnt > 0.5*length(obj.id))
            {
                warning("More than 50% of SNPs are missing!", immediate.=TRUE)
            }

            # switch
            if (allele.check)
            {
                snp <- hlaGenoSwitchStrand(tmp, object, match.type,
                    same.strand, verbose)$genotype
            } else {
                snp <- tmp$genotype
            }
        }
    }

    # check
    if (dim(snp)[1L] != object$n.snp)
    {
        stop("The number of SNPs is not valid, and ",
            "it maybe due to duplicated 'snp.id' or ",
            "incorrect dimension of genotype matrix.")
    }

    # initialize ...
    n.samp <- dim(snp)[2L]
    n.hla <- length(object$hla.allele)
    if (verbose)
    {
        cat(sprintf("# of samples: %d\n", n.samp))
        cat("CPU flags: ", .Call(HIBAG_Kernel_Version)[[2L]][1L], "\n", sep="")
    }

    # parallel units
    if (!inherits(cl, "cluster"))
    {
        # the number of threads
        nthread <- 1L
        if (isTRUE(cl)) nthread <- as.integer(defaultNumThreads())
        if (is.numeric(cl)) nthread <- cl[1L]
        if (is.na(nthread) || nthread<1L) nthread <- 1L

        # pointer to functions for an extensible component
        pm <- attr(cl, "proc_ptr")

        # predict HLA genotypes
        if (type %in% c("response", "response+prob"))
        {
            # the best-guess prediction
            if (type == "response")
            {
                rv <- .Call(HIBAG_Predict_Resp, object$model, as.integer(snp),
                    n.samp, vote_method, nthread, verbose, pm)
                names(rv) <- c("H1", "H2", "prob", "matching")
            } else {
                rv <- .Call(HIBAG_Predict_Resp_Prob, object$model,
                    as.integer(snp), n.samp, vote_method, nthread, verbose, pm)
                names(rv) <- c("H1", "H2", "prob", "matching", "postprob")
            }

            # output object
            res <- hlaAllele(geno.sampid,
                H1 = object$hla.allele[rv$H1 + 1L],
                H2 = object$hla.allele[rv$H2 + 1L],
                locus = object$hla.locus, prob = rv$prob,
                na.rm = FALSE, assembly = assembly)
            res$value$matching <- rv$matching
            if (!is.null(rv$postprob))
            {
                res$postprob <- rv$postprob
                colnames(res$postprob) <- geno.sampid
                m <- outer(object$hla.allele, object$hla.allele,
                    function(x, y) paste(x, y, sep="/"))
                rownames(res$postprob) <- m[lower.tri(m, diag=TRUE)]
            }
            NA.cnt <- sum(is.na(res$value$allele1) | is.na(res$value$allele2))

        } else {
            # all probabilities
            rv <- .Call(HIBAG_Predict_Resp_Prob, object$model,
                as.integer(snp), n.samp, vote_method, nthread, verbose, pm)
            names(rv) <- c("H1", "H2", "prob", "matching", "postprob")

            # output object
            res <- rv$postprob
            colnames(res) <- geno.sampid
            m <- outer(object$hla.allele, object$hla.allele,
                function(x, y) paste(x, y, sep="/"))
            rownames(res) <- m[lower.tri(m, diag=TRUE)]
            NA.cnt <- sum(colSums(res) <= 0L, na.rm=TRUE)
        }
    } else {

        # run in parallel
        rv <- parallel::clusterApply(cl=cl,
            parallel::splitIndices(n.samp, length(cl)),
            fun = function(i, mobj, snp, type, vote)
            {
                if (length(i) > 0L)
                {
                    library(HIBAG)
                    m <- hlaModelFromObj(mobj)
                    on.exit(hlaClose(m))
                    hlaPredict(m, snp[,i], type=type, vote=vote, verbose=FALSE)
                } else
                    NULL
            },
            mobj=hlaModelToObj(object), snp=snp, type=type, vote=vote
        )

        # merge the results
        if (type %in% c("response", "response+prob"))
        {
            res <- rv[[1L]]
            for (i in 2L:length(rv))
            {
                if (!is.null(rv[[i]]))
                    res <- hlaCombineAllele(res, rv[[i]])
            }
            res$value$sample.id <- geno.sampid
            if (!is.null(res$postprob))
                colnames(res$postprob) <- geno.sampid
            NA.cnt <- sum(is.na(res$value$allele1) | is.na(res$value$allele2))
        } else {
            res <- rv[[1L]]
            for (i in 2L:length(rv))
            {
                if (!is.null(rv[[i]]))
                    res <- cbind(res, rv[[i]])
            }
            colnames(res) <- geno.sampid
            NA.cnt <- sum(colSums(res) <= 0L, na.rm=TRUE)
        }
    } 

    if (NA.cnt > 0L)
    {
        warning("No prediction output for ", NA.cnt, " individual",
            .plural(NA.cnt), " (possibly due to missing SNPs).",
            immediate.=TRUE)
    }
    # return
    res
}


#######################################################################
# Merge predictions by voting
#

hlaPredMerge <- function(..., weight=NULL, equivalence=NULL)
{
    # check "..."
    pdlist <- list(...)
    if (length(pdlist) <= 0L)
        stop("No object is passed to 'hlaPredMerge'.")
    for (i in 1:length(pdlist))
    {
        if (!inherits(pdlist[[i]], "hlaAlleleClass"))
        {
            stop("The object(s) passed to 'hlaPredMerge' ",
                "should be 'hlaAlleleClass'.")
        }
        if (is.null(pdlist[[i]]$postprob))
        {
            stop("The object(s) passed to 'hlaPredMerge' should have ",
                "a field of 'postprob' returned from ",
                "'hlaPredict(..., type=\"response+prob\", vote=\"majority\")'.")
        }
    }

    # check equivalence
    stopifnot(is.null(equivalence) | is.data.frame(equivalence))
    if (is.data.frame(equivalence))
    {
        if (ncol(equivalence) != 2L)
        {
            stop("'equivalence' should have two columns: ",
                "the first for new equivalent alleles, and ",
                "the second for the alleles possibly existed ",
                "in the object(s) passed to 'hlaPredMerge'.")
        }
    }

    # check locus and sample.id
    samp.id <- pdlist[[1L]]$value$sample.id
    locus <- pdlist[[1L]]$locus
    for (i in 1L:length(pdlist))
    {
        if (!identical(samp.id, pdlist[[i]]$value$sample.id))
            stop("The sample IDs should be the same.")
        if (!identical(locus, pdlist[[i]]$locus))
            stop("The locus should be the same.")
    }

    # check weight
    if (!is.null(weight))
    {
        stopifnot(is.numeric(weight) & is.vector(weight))
        if (length(pdlist) != length(weight))
            stop("Invalid 'weight'.")
        weight <- abs(weight)
        weight <- weight / sum(weight)
    } else {
        weight <- rep(1/length(pdlist), length(pdlist))
    }

    #############################################################
    # replace function
    replace <- function(allele)
    {
        if (!is.null(equivalence))
        {
            i <- match(allele, equivalence[, 2L])
            flag <- !is.na(i)
            i <- i[flag]
            allele[flag] <- equivalence[i, 1L]
        }
        allele
    }


    # all different alleles
    hla.allele <- NULL
    for (i in 1L:length(pdlist))
    {
        h <- unique(unlist(strsplit(rownames(pdlist[[i]]$postprob), "/")))
        hla.allele <- unique(c(hla.allele, replace(h)))
    }
    hla.allele <- hlaUniqueAllele(hla.allele)
    n.hla <- length(hla.allele)
    n.samp <- length(samp.id)

    prob <- matrix(0.0, nrow=n.hla*(n.hla+1L)/2, ncol=n.samp)
    m <- outer(hla.allele, hla.allele, function(x, y) paste(x, y, sep="/"))
    m <- m[lower.tri(m, diag=TRUE)]

    # for-loop
    for (i in 1L:length(pdlist))
    {
        p <- pdlist[[i]]$postprob
        h <- replace(unlist(strsplit(rownames(p), "/")))

        h1 <- h[seq(1L, length(h), 2L)]; h2 <- h[seq(2L, length(h), 2L)]
        j1 <- match(paste(h1, h2, sep="/"), m)
        j2 <- match(paste(h2, h1, sep="/"), m)
        j1[is.na(j1)] <- j2[is.na(j1)]

        # check
        stopifnot(!any(is.na(j1)))

        p <- p * weight[i]
        for (j in seq_len(length(j1)))
            prob[j1[j], ] <- prob[j1[j], ] + p[j, ]
    }
    colnames(prob) <- samp.id
    rownames(prob) <- m

    pb <- apply(prob, 2L, max)
    pt <- unlist(strsplit(m[apply(prob, 2L, which.max)], "/"))
    assembly <- pdlist[[1L]]$assembly
    if (is.null(assembly)) assembly <- "auto"

    rv <- hlaAllele(samp.id,
        H1 = pt[seq(2L, length(pt), 2L)],
        H2 = pt[seq(1L, length(pt), 2L)],
        locus = locus,
        locus.pos.start = pdlist[[1L]]$pos.start,
        locus.pos.end = pdlist[[1L]]$pos.end,
        prob = pb, na.rm = FALSE,
        assembly = assembly)
    rv$postprob <- prob
    rv
}


#######################################################################
# Summarize the "hlaAttrBagClass" object
#

summary.hlaAttrBagClass <- function(object, show=TRUE, ...)
{
    obj <- hlaModelToObj(object)
    summary(obj, show=show)
}


#######################################################################
# Save the parameters in a model of attribute bagging
#

hlaModelToObj <- function(model)
{
    # check
    stopifnot(inherits(model, "hlaAttrBagClass"))

    # get a list of classifiers
    clr <- .Call(HIBAG_GetClassifierList, model$model, model$hla.allele)

    # output
    rv <- list(n.samp = model$n.samp, n.snp = model$n.snp,
        sample.id = model$sample.id, snp.id = model$snp.id,
        snp.position = model$snp.position, snp.allele = model$snp.allele,
        snp.allele.freq = model$snp.allele.freq,
        hla.locus = model$hla.locus,
        hla.allele = model$hla.allele, hla.freq = model$hla.freq,
        assembly = model$assembly,
        classifiers = clr,
        matching = model$matching,
        appendix = model$appendix)
    class(rv) <- "hlaAttrBagObj"
    rv
}


#######################################################################
# Combine two HIBAG models
#

hlaCombineModelObj <- function(obj1, obj2)
{
    # check
    stopifnot(inherits(obj1, "hlaAttrBagObj"))
    stopifnot(inherits(obj2, "hlaAttrBagObj"))
    stopifnot(identical(obj1$hla.locus, obj2$hla.locus))
    stopifnot(identical(obj1$snp.id, obj2$snp.id))
    stopifnot(identical(obj1$hla.allele, obj2$hla.allele))
    stopifnot(identical(obj1$assembly, obj2$assembly))

    samp.id <- unique(c(obj1$sample.id, obj2$sample.id))
    if (!is.null(obj1$appendix) | !is.null(obj2$appendix))
    {
        appendix <- list(
            platform =
                unique(c(obj1$appendix$platform, obj2$appendix$platform)),
            information =
                unique(c(obj1$appendix$information,
                    obj2$appendix$information)),
            warning =
                unique(c(obj1$appendix$warning, obj2$appendix$warning))
        )
    } else {
        appendix <- NULL
    }

    rv <- list(n.samp = length(samp.id), n.snp = obj1$n.snp,
        sample.id = samp.id, snp.id = obj1$snp.id,
        snp.position = obj1$snp.position, snp.allele = obj1$snp.allele,
        snp.allele.freq = (obj1$snp.allele.freq + obj2$snp.allele.freq)*0.5,
        hla.locus = obj1$hla.locus, hla.allele = obj1$hla.allele,
        hla.freq = (obj1$hla.freq + obj2$hla.freq)*0.5,
        assembly = obj1$assembly,
        classifiers = c(obj1$classifiers, obj2$classifiers),
        matching = c(obj1$matching, obj2$matching),
        appendix = appendix)
    if (identical(obj1$sample.id, obj2$sample.id) && !is.null(obj1$sample.id))
    {
        n1 <- length(obj1$classifiers)
        n2 <- length(obj2$classifiers)
        n <- n1 + n2
        rv$matching <- n1/n*obj1$matching + n2/n*obj2$matching
    }
    class(rv) <- "hlaAttrBagObj"
    rv
}


#######################################################################
# Get a HIBAG model with the top n individual classifiers
#

hlaSubModelObj <- function(obj, n)
{
    # check
    stopifnot(inherits(obj, "hlaAttrBagObj"))
    stopifnot(is.numeric(n), length(n)==1L)
    obj$classifiers <- obj$classifiers[1L:n]
    obj
}


#######################################################################
# Create a "hlaAttrBagClass" class from an R object
#

hlaModelFromObj <- function(obj)
{
    # check
    stopifnot(inherits(obj, "hlaAttrBagObj"))

    # create an attribute bagging object
    ABmodel <- .Call(HIBAG_New, obj$n.samp, obj$n.snp, length(obj$hla.allele))

    # add individual classifiers
    for (tree in obj$classifiers)
    {
        hla <- match(tree$haplos$hla, obj$hla.allele) - 1L
        if (any(is.na(hla)))
            stop("Invalid HLA alleles in the individual classifier.")
        if (is.null(tree$samp.num))
            snum <- rep.int(1L, obj$n.samp)
        else
            snum <- as.integer(tree$samp.num)

        # create a new classifier
        .Call(HIBAG_NewClassifierHaplo, ABmodel, as.integer(tree$snpidx - 1L),
            snum, as.double(tree$haplos$freq), hla,
            as.character(tree$haplos$haplo),
            tree$outofbag.acc)
    }

    # output
    rv <- list(n.samp = obj$n.samp, n.snp = obj$n.snp,
        sample.id = obj$sample.id, snp.id = obj$snp.id,
        snp.position = obj$snp.position, snp.allele = obj$snp.allele,
        snp.allele.freq = obj$snp.allele.freq,
        hla.locus = obj$hla.locus, hla.allele = obj$hla.allele,
        hla.freq = obj$hla.freq,
        assembly = as.character(obj$assembly)[1L],
        model = ABmodel,
        matching = obj$matching,
        appendix = obj$appendix)
    if (is.na(rv$assembly)) rv$assembly <- "unknown"

    class(rv) <- "hlaAttrBagClass"
    rv
}


#######################################################################
# Summarize the "hlaAttrBagObj" object
#

summary.hlaAttrBagObj <- function(object, show=TRUE, ...)
{
    # check
    stopifnot(inherits(object, "hlaAttrBagObj"))
    obj <- object

    if (show)
    {
        cat("Gene: ", .hla_gene_name_string(obj$hla.locus), "\n", sep="")
        cat("Training dataset:", obj$n.samp, "samples X",
            length(obj$snp.id), "SNPs\n")
        cat("    # of HLA alleles: ", length(obj$hla.allele), "\n", sep="")
    }

    # summarize ...
    snpset <- NULL
    outofbag.acc <- rep(NaN, length(obj$classifiers))
    numsnp <- rep(NA, length(obj$classifiers))
    numhaplo <- rep(NA, length(obj$classifiers))
    snp.hist <- rep(0, length(obj$snp.id))
    for (i in 1L:length(obj$classifiers))
    {
        outofbag.acc[i] <- obj$classifiers[[i]]$outofbag.acc
        numsnp[i] <- length(obj$classifiers[[i]]$snpidx)
        numhaplo[i] <- length(obj$classifiers[[i]]$haplos$hla)
        snp.hist[obj$classifiers[[i]]$snpidx] <-
            snp.hist[obj$classifiers[[i]]$snpidx] + 1L
        snpset <- unique(c(snpset, obj$classifiers[[i]]$snpidx))
    }
    snpset <- snpset[order(snpset)]
    outofbag.acc <- outofbag.acc * 100

    info <- data.frame(
        Mean = c(mean(numsnp), mean(numhaplo), mean(outofbag.acc)),
        SD = c(sd(numsnp), sd(numhaplo), sd(outofbag.acc)),
        Min = c(min(numsnp), min(numhaplo), min(outofbag.acc)),
        Max = c(max(numsnp), max(numhaplo), max(outofbag.acc))
    )
    rownames(info) <- c("num.snp", "num.haplo", "accuracy")

    if (show)
    {
        cat("    # of individual classifiers: ", length(obj$classifiers),
            "\n", sep="")
        cat("    total # of SNPs used: ", length(snpset), "\n", sep="")
        cat("    avg. # of SNPs in an individual classifier:",
            sprintf("%0.2f\n        (sd: %0.2f, min: %d, max: %d, median: %0.2f)\n",
            mean(numsnp), sd(numsnp), min(numsnp), max(numsnp),
            median(numsnp)))
        cat("    avg. # of haplotypes in an individual classifier:",
            sprintf("%0.2f\n        (sd: %0.2f, min: %d, max: %d, median: %0.2f)\n",
            mean(numhaplo), sd(numhaplo), min(numhaplo), max(numhaplo),
            median(numhaplo)))
        cat("    avg. out-of-bag accuracy:",
            sprintf("%0.2f%%\n        (sd: %0.2f%%, min: %0.2f%%, max: %0.2f%%, median: %0.2f%%)\n",
            mean(outofbag.acc), sd(outofbag.acc), min(outofbag.acc),
            max(outofbag.acc), median(outofbag.acc)))

        p <- obj$matching
        if (!is.null(p))
        {
            cat("Matching proportion:\n")
            .printMatching(p)
        }

        if (is.null(obj$assembly))
            cat("Genome assembly: unknown\n")
        else
            cat("Genome assembly: ", obj$assembly, "\n", sep="")

        if (!is.null(obj$appendix$platform))
            cat("Platform:", obj$appendix$platform, "\n")
        if (!is.null(obj$appendix$information))
            cat("Information:", obj$appendix$information, "\n")
        if (!is.null(obj$appendix$warning))
            message(obj$appendix$warning)
    }

    rv <- list(num.classifier = length(obj$classifiers),
        num.snp = length(snpset),
        snp.id = obj$snp.id, snp.position = obj$snp.position,
        snp.hist = snp.hist, info = info)
    invisible(rv)
}


##########################################################################
# Out-of-bag estimation of overall accuracy, per-allele sensitivity, etc
#

hlaOutOfBag <- function(model, hla, snp, call.threshold=NaN, verbose=TRUE)
{
    # check
    stopifnot(inherits(model, "hlaAttrBagObj") |
        inherits(model, "hlaAttrBagClass"))
    stopifnot(inherits(hla, "hlaAlleleClass"))
    stopifnot(inherits(snp, "hlaSNPGenoClass"))

    stopifnot(is.numeric(call.threshold), length(call.threshold)==1L)
    stopifnot(is.logical(verbose), length(verbose)==1L)


    ######################################################
    # initialize ...
    if (inherits(model, "hlaAttrBagClass"))
    {
        model <- hlaModelToObj(model)
        if (verbose)
            print(model)
    }

    # map samples
    if (is.null(model$sample.id))
        stop("There is no sample ID in the model.")
    samp.idx <- match(model$sample.id, snp$sample.id)
    if (any(is.na(samp.idx)))
        stop("Some of sample.id in the model do not exist in SNP genotypes.")
    hla.samp.idx <- match(model$sample.id, hla$value$sample.id)
    if (any(is.na(hla.samp.idx)))
        stop("Some of sample.id in the model do not exist in HLA types.")

    # map SNPs
    snp.idx <- match(model$snp.id, snp$snp.id)
    if (any(is.na(snp.idx)))
        stop("Some of snp.id in the model do not exist in SNP genotypes.")

    # genotypes and the number of classifiers
    geno <- snp$genotype[snp.idx, samp.idx]
    nclass <- length(model$classifiers)

    # the returned value
    ans <- NULL

    # the column names of details
    nm1 <- c("allele", "train.num", "train.freq")
    nm2 <- c("call.rate", "accuracy", "sensitivity", "specificity",
        "ppv", "npv")

    # for-loop
    for (i in 1L:nclass)
    {
        mx <- model
        mx$classifiers <- mx$classifiers[i]
        s <- mx$classifiers[[1L]]$samp.num
        if (is.null(s))
            stop("There is no bootstrap sample index.")

        tmp.model <- hlaModelFromObj(mx)
        tmp.geno <- geno[, s == 0L]
        v <- hlaPredict(tmp.model, tmp.geno, verbose=FALSE)
        hlaClose(tmp.model)
        v$value$sample.id <- mx$sample.id[s == 0L]
        pam <- hlaCompareAllele(hla, v, allele.limit=mx,
            call.threshold=call.threshold, verbose=FALSE)

        if (!is.null(ans))
        {
            # overall
            ans$overall <- ans$overall + pam$overall

            # confusion matrix
            ans$confusion <- ans$confusion + pam$confusion

            # details
            pam$detail <- pam$detail[, nm2]
            ans$n.detail <- ans$n.detail + !is.na(pam$detail)
            pam$detail[is.na(pam$detail)] <- 0
            ans$detail <- ans$detail + pam$detail
        } else {
            ans <- pam
            ans$detailhead <- ans$detail[, nm1]
            colnames(ans$detailhead) <- c("allele", "valid.num", "valid.freq")
            ans$detail <- ans$detail[, nm2]
            ans$n.detail <- !is.na(ans$detail)
            ans$detail[is.na(ans$detail)] <- 0
        }

        if (verbose)
        {
            cat(date(), sprintf(", passing the %d/%d classifiers.\n",
                i, nclass), sep="")
        }
    }

    # average
    ans$overall <- ans$overall / nclass
    ans$confusion <- ans$confusion / nclass
    ans$detail <- ans$detail / ans$n.detail

    # get miscall
    rv <- ans$confusion; diag(rv) <- 0
    m.max <- apply(rv, 2L, max); m.idx <- apply(rv, 2L, which.max)
    s <- rownames(ans$confusion)[m.idx]; s[m.max<=0] <- NA
    p <- m.max / apply(rv, 2L, sum)

    # output
    ans$detail <- cbind(ans$detailhead, ans$detail,
        miscall=s, miscall.prop=p, stringsAsFactors=FALSE)
    ans$detailhead <- NULL
    ans$n.detail <- NULL
    ans
}


##########################################################################
##########################################################################
#
# Linkage Disequilibrium
#

##########################################################################
# Calculate linkage disequilibrium between HLA locus and SNP markers
#

hlaGenoLD <- function(hla, geno)
{
    # check
    stopifnot(inherits(hla, "hlaAlleleClass"))
    if (inherits(geno, "hlaSNPGenoClass"))
    {
        stopifnot(dim(hla$value)[1L] == length(geno$sample.id))
        if (any(hla$value$sample.id != geno$sample.id))
        {
            hla <- hlaAlleleSubset(hla, samp.sel =
                match(geno$sample.id, hla$value$sample.id))
        }
        geno <- geno$genotype
    } else if (is.matrix(geno))
    {
        stopifnot(is.numeric(geno))
        stopifnot(dim(hla$value)[1L] == dim(geno)[2L])
    } else if (is.vector(geno))
    {
        stopifnot(is.numeric(geno))
        stopifnot(dim(hla$value)[1L] == length(geno))
        geno <- matrix(geno, ncol=1L)
    } else {
        stop("geno should be `hlaSNPGenoClass', a vector or a matrix.")
    }

    # HLA alleles indicators
    alleles <- unique(c(hla$value$allele1, hla$value$allele2))
    alleles <- alleles[order(alleles)]
    allele.mat <- matrix(0L,
        nrow=length(hla$value$allele1), ncol=length(alleles))
    for (i in 1L:length(alleles))
    {
        allele.mat[, i] <- (hla$value$allele1==alleles[i]) +
            (hla$value$allele2==alleles[i])
    }

    apply(geno, 1L,
        function(x, allele.mat) {
            suppressWarnings(
                mean(
                    cor(x, allele.mat, use="pairwise.complete.obs")^2,
                    na.rm=TRUE
                )
            )
        },
        allele.mat=allele.mat)
}


##########################################################################
# SNP LD visualization
#

hlaLDMatrix <- function(geno, loci=NULL, maf=0.01, assembly="auto",
    draw=TRUE, verbose=TRUE)
{
    # check
    stopifnot(inherits(geno, "hlaSNPGenoClass"))
    stopifnot(is.numeric(maf), length(maf)==1L)
    stopifnot(is.null(loci) | is.vector(loci))
    stopifnot(is.logical(draw), length(draw)==1L)
    stopifnot(is.logical(verbose), length(verbose)==1L)

    # maf filter
    if (is.na(maf)) maf <- 0
    if (maf > 0)
    {
        af <- rowMeans(geno$genotype, na.rm=TRUE) * 0.5
        af <- pmin(af, 1-af)
        xx <- af >= maf
        xx[is.na(xx)] <- FALSE
        if (sum(xx) < length(af))
        {
            if (verbose)
            {
                cat("MAF filter (>=", maf, "), excluding ",
                    length(af)-sum(xx), " SNP(s)\n", sep="")
            }
            geno <- hlaGenoSubset(geno, snp.sel=xx)
        }
    }

    if (!is.null(loci))
    {
        if (assembly=="auto") assembly <- geno$assembly
        assembly <- .hla_assembly(assembly)
        info <- hlaLociInfo(assembly)
        if (!all(loci %in% rownames(info)))
        {
            stop("'loci' should be one of ", paste(rownames(info),
                collapse=", "))
        }
        info <- info[loci, ]
        st <- sapply(info$start, function(p) {
            mean(c(which(p <= geno$snp.position)[1L],
            rev(which(p >= geno$snp.position))[1L])) })
        ed <- sapply(info$end, function(p) {
            mean(c(which(p <= geno$snp.position)[1L],
            rev(which(p >= geno$snp.position))[1L])) })
    }

    # calculate the LD matrix
    ld <- suppressWarnings(cor(t(geno$genotype), use="na.or.complete")^2)

    if (isTRUE(draw))
    {
        Var1 <- Var2 <- value <- NULL
        dat <- reshape2::melt(ld)
        p <- ggplot2::ggplot(dat, ggplot2::aes(x=Var2, y=Var1)) +
            ggplot2::geom_raster(ggplot2::aes(fill=value)) +
            ggplot2::scale_fill_gradient2(low="grey95", mid="orange", high="red", midpoint=0.5)
        rg <- range(geno$snp.position)
        p <- p + ggplot2::labs(
            x=sprintf("%d SNPs in total, MAF >= %g", length(geno$snp.id), maf),
            y="SNP index", title=expression("Linkage Disequilibrium r"^2))
        ii <- c(0, 0.25, 0.5, 0.75, 1)
        p <- p + ggplot2::scale_x_continuous(
            breaks=unname(quantile(1:length(geno$snp.position), ii)),
            labels=sprintf("%.3fMb", c(quantile(geno$snp.position, ii)/10^6)))
        for (i in seq_along(loci))
        {
            md <- (st[i]+ed[i])*0.5
            if (is.finite(md))
            {
                p <- p +
                    ggplot2::geom_vline(xintercept=st[i], size=0.25, colour="blue", alpha=0.5) +
                    ggplot2::geom_vline(xintercept=ed[i], size=0.25, colour="blue", alpha=0.5) +
                    ggplot2::geom_vline(xintercept=md, linetype=2, colour="blue") +
                    ggplot2::annotate("label", x=md,
                        y=length(geno$snp.position), vjust="inward",
                        label=ifelse(grepl("^KIR", loci[i]), loci[i],
                            paste("HLA", loci[i], sep="-")))
            }
        }
        return(p)
    } else {
        return(ld)
    }
}


##########################################################################
# Calculate the distances among different HLA alleles
#

hlaDistance <- function(model)
{
    # check
    if (inherits(model, "hlaAttrBagClass"))
        model <- hlaModelToObj(model)

    # the total number of classifiers
    n <- length(model$classifiers)
    lst <- vector("list", n)
    hla <- model$hla.allele
    num <- matrix(0L, nrow=length(hla), ncol=length(hla))
    for (i in seq_len(n))
    {
        z <- model$classifiers[[i]]$haplos
        m <- .Call(HIBAG_Distance, length(hla), match(z$hla, hla),
            z$freq, z$haplo)
        num <- num + !is.na(m)
        m[is.na(m)] <- 0
        lst[[i]] <- m
    }

    # output
    rv <- Reduce("+", lst) / num
    colnames(rv) <- rownames(rv) <- hla
    rv
}


##########################################################################
##########################################################################
#
# Visualization
#

##########################################################################
# Plot an attribute bagging model
#

plot.hlaAttrBagClass <- function(x, ...)
{
    obj <- hlaModelToObj(x)
    plot(obj, ...)
}

print.hlaAttrBagClass <- function(x, ...)
{
    obj <- hlaModelToObj(x)
    print(obj)
    invisible()
}



##########################################################################
# Plot an attribute bagging model
#

plot.hlaAttrBagObj <- function(x, snp.col="gray33", snp.pch=1, snp.sz=1,
    locus.col="blue", locus.lty=1L, locus.lty2=2L, addplot=NULL,
    assembly="auto", ...)
{
    # check
    stopifnot(inherits(x, "hlaAttrBagObj"))
    assembly <- match.arg(assembly)

    # the starting and ending positions of HLA locus
    if (assembly == "auto")
    {
        if (!is.null(x$assembly))
            assembly <- x$assembly
    }
    info <- hlaLociInfo(assembly)
    pos.start <- info[x$hla.locus, "start"] / 1000L
    pos.end <- info[x$hla.locus, "end"] / 1000L

    # summary of the attribute bagging model
    desp <- summary(x, show=FALSE)

    # draw
    dat <- data.frame(pos=x$snp.position/1000L, ht=desp$snp.hist)
    pos <- ht <- NULL

    if (is.null(addplot))
    {
        p <- ggplot2::ggplot(dat, ggplot2::aes(x=pos, y=ht)) +
            ggplot2::xlab("SNP Position (KB)") +
            ggplot2::ylab("Frequency of Use")
    } else {
        p <- addplot
    }
    p <- p +
        ggplot2::geom_vline(xintercept=pos.start, colour=locus.col,
            linetype=locus.lty, size=0.25, alpha=0.5) +
        ggplot2::geom_vline(xintercept=pos.end, colour=locus.col,
            linetype=locus.lty, size=0.25, alpha=0.5) +
        ggplot2::geom_vline(xintercept=(pos.start+pos.end)*0.5,
            linetype=locus.lty2, colour=locus.col) +
        ggplot2::annotate("label", x=(pos.start + pos.end)/2,
            y=max(dat$ht), vjust="inward",
            label=ifelse(grepl("^KIR", x$hla.locus), x$hla.locus,
                paste("HLA", x$hla.locus, sep="-")))
    if (is.null(addplot))
    {
        p <- p + ggplot2::geom_point(color=snp.col, shape=snp.pch, size=snp.sz)
    } else {
        p <- p + ggplot2::geom_point(ggplot2::aes(x=pos, y=ht), dat,
            color=snp.col, shape=snp.pch, size=snp.sz)
    }
    p
}

print.hlaAttrBagObj <- function(x, ...)
{
    summary(x)
    invisible()
}



#######################################################################
# Export stardard R library function(s)
#######################################################################

hlaSetKernelTarget <- function(cpu=c("max", "auto.avx2", "base",
    "sse2", "sse4", "avx", "avx2", "avx512f", "avx512bw"))
{
    cpu <- match.arg(cpu)
    .Call(HIBAG_Kernel_SetTarget, cpu)
    .Call(HIBAG_Kernel_Version)[[2L]]
}


.onAttach <- function(lib, pkg)
{
    # get version and CPU information
    info <- .Call(HIBAG_Kernel_Version)

    # information
    packageStartupMessage(
        "HIBAG (HLA Genotype Imputation with Attribute Bagging)")
    packageStartupMessage(
        sprintf("Kernel Version: v%d.%d (%s)", info[[1L]][1L], info[[1L]][2L],
        info[[2L]][1L]))
    if (is.na(info[[3L]]))
        packageStartupMessage("No Intel Threading Building Blocks (TBB)")
    TRUE
}

.hibag_data_list <- list(
	data.frame = "data.frame",
	clr_nm = c("samp.num", "haplos", "snpidx", "outofbag.acc"),
	clr_haplo_nm = c("freq", "hla", "haplo"))

.onLoad <- function(lib, pkg)
{
    .Call(HIBAG_Init, .hibag_data_list)
    TRUE
}

Try the HIBAG package in your browser

Any scripts or data that you put into this service are public.

HIBAG documentation built on March 24, 2021, 6 p.m.