R/Methods-GbsrGenotypeData_HMM.R

Defines functions .cleanEachChr .checkPread .runCycle .flipParam .bindErrors .calcErrors .calcMissmap .calcReadBias .sumUpBias .getBias .summarizeEst .minimizeRecombination .solveBorderConflict .halfJoint .hap2geno .parentPat2Geno .pat2hap .getBestSeq .getParams .getPedigree .getInitProb .transitionProb .getXoFreq .calcNextJnum .initJnum .getJnum .makePattern .getPossibleGeno .getPossibleHap .getValidPat .progenyPattern .initialPattern .makeGenoParents .makeGenoPat .orderParents .loadReadCounts .saveMR .saveADB .savePGeno .saveGeno .saveEDS .saveHap .initGDS .setThreads .checkScheme

#' @importFrom SeqArray seqOptimize
#' @importFrom gdsfmt put.attr.gdsn index.gdsn
#' @rdname estGeno
setMethod("estGeno",
          "GbsrGenotypeData",
          function(object,
                   recomb_rate,
                   error_rate,
                   call_threshold,
                   het_parent,
                   optim,
                   iter,
                   n_threads,
                   dummy_reads,
                   fix_bias,
                   fix_mismap) {

              object <- .checkScheme(object)

              parents <- slot(slot(object, "scheme"), "parents")
              n_parents <- length(unique(parents))
              parentless <- all(is.na(parents))
              if(parentless){
                  message("Run in the parentless mode...")
              }
              .setThreads(n_threads)

              message("Start cleaning...")

              .initGDS(object, het_parent, n_parents)
              chr <- getChromosome(object)
              chr_levels <- unique(chr)
              no_eds <- FALSE
              for(chr_i in chr_levels) {
                  message("\nNow cleaning chr ", chr_i, "...")
                  clean_out <- .cleanEachChr(object = object,
                                             chr_i = chr_i,
                                             error_rate = error_rate,
                                             recomb_rate = recomb_rate,
                                             call_threshold = call_threshold,
                                             het_parent = het_parent,
                                             optim = optim,
                                             iter = iter,
                                             fix_bias = fix_bias,
                                             fix_mismap = fix_mismap,
                                             parentless = parentless,
                                             dummy_reads = dummy_reads)
                  # ### Debug
                  # return(clean_out)
                  # ###
                  if(parentless){
                      sel <- list(mar = validMar(object, chr_i),
                                  sam = validSam(object))
                      clean_out$best_hap <- clean_out$best_hap[, -seq_len(2), ]
                      clean_out$best_geno <- clean_out$best_geno[, -seq_len(2), ]

                  } else {
                      sel <- list(mar = validMar(object, chr_i),
                                  sam = validSam(object, parents = TRUE))
                  }

                  .saveHap(object, clean_out, sel)
                  .saveGeno(object, clean_out, sel)
                  .savePGeno(object, clean_out, sel)
                  .saveADB(object, clean_out, sel)
                  .saveMR(object, clean_out, sel)
                  if(n_parents == 2 & !het_parent){
                      .saveEDS(object, clean_out, sel)
                  }
              }
              closeGDS(object, verbose = FALSE)
              seqOptimize(object$filename, "by.sample",
                          c("HAP", "CGT"), verbose = FALSE)
              object <- reopenGDS(object)
              return(object)
          })

################################################################################
# Validate the scheme object
.checkScheme <- function(object){
    scheme <- slot(object, "scheme")
    if(length(slot(scheme, "progenies")) == 0){
        stop("No scheme information.",
             "\n",
             "Build scheme information with ",
             "initScheme() and addScheme().",
             call. = FALSE)
    }

    if(length(slot(scheme, "samples")) == 0){
        id <- unlist(tail(slot(scheme, "progenies"), 1))
        if(length(id) == 1){
            message("Member IDs were not assigned to samples.",
                    "\n",
                    "Assign ", id, " to all samples as member ID.")
            object <- assignScheme(object, rep(id, nsam(object)))

        } else {
            stop("Member IDs were not assigned to samples.",
                 "\n",
                 "Assign member IDs to samples using assignScheme().",
                 call. = FALSE)
        }
    }
    return(object)
}

################################################################################
################################################################################
# Set the number of threads
.setThreads <- function(n_threads){
    max_threads <- defaultNumThreads()
    if (is.null(n_threads)) { n_threads <- max_threads / 2 }

    if (max_threads <= n_threads) {
        warning("You are going to use all threads",
                "of your computer for the calculation.")
        answer <- ""
        while (answer != "y") {
            answer <- readline("Are you sure to use all threads?(y/n)")
            if (answer == "n") { stop("Stopped by user.") }
        }
        n_threads <- max_threads
    }

    message("Set the number of threads: ", n_threads)
    setThreadOptions(numThreads = n_threads)
}

################################################################################
################################################################################
# Initialize output nodes in the GDS file
.initGDS <- function(object, het_parent, n_parents) {
    hap <- addfolder.gdsn(index.gdsn(object, "annotation/format"), "HAP",
                          replace = TRUE)
    add.gdsn(hap, "data", storage = "bit6", compress = "", replace = TRUE)

    if(n_parents == 2 & !het_parent){
        eds <- addfolder.gdsn(index.gdsn(object, "annotation/format"), "EDS",
                              replace = TRUE)
        add.gdsn(eds, "data", storage = "bit6", compress = "", replace = TRUE)
    }

    cgt <- addfolder.gdsn(index.gdsn(object, "annotation/format"), "CGT",
                          replace = TRUE)
    add.gdsn(cgt, "data", storage = "bit2", compress = "", replace = TRUE)

    pgt <- add.gdsn(index.gdsn(object, "annotation/info"), "PGT",
                    storage = "bit2", compress = "", replace = TRUE)

    adb <- add.gdsn(index.gdsn(object, "annotation/info"), "ADB",
                    storage = "single", compress = "", replace = TRUE)

    mr <- add.gdsn(index.gdsn(object, "annotation/info"), "MR",
                   storage = "single", compress = "", replace = TRUE)
}

################################################################################
################################################################################
# Save the ouput to the GDS file
#' @importFrom gdsfmt append.gdsn
.saveHap <- function(object, clean_out, sel) {
    output <- array(0, c(2, length(sel$sam), length(sel$mar)))
    rep_id <- getReplicates(object, parents = TRUE)
    id_hit <- match(rep_id, clean_out$mapping_id)
    output[, sel$sam, sel$mar] <- clean_out$best_hap[, id_hit,]

    hap_gdsn <- index.gdsn(object, "annotation/format/HAP/data")
    gdsn_dim <- objdesp.gdsn(hap_gdsn)$dim
    if (gdsn_dim[1] == 0) {
        add.gdsn(index.gdsn(object, "annotation/format/HAP"), "data", output,
                 "bit6", compress = "", replace = TRUE)
    } else {
        append.gdsn(hap_gdsn, output)
    }
}

.saveEDS <- function(object, clean_out, sel) {
    output <- array(NA, c(2, length(sel$sam), length(sel$mar)))
    rep_id <- getReplicates(object, parents = TRUE)
    id_hit <- match(rep_id, clean_out$mapping_id)
    output[, sel$sam, sel$mar] <- clean_out$best_hap[, id_hit,]

    output[output == 0] <- NA
    output <- apply(output - 1, c(2, 3), sum)
    output[is.na(output)] <- 8

    eds_gdsn <- index.gdsn(object, "annotation/format/EDS/data")
    gdsn_dim <- objdesp.gdsn(eds_gdsn)$dim
    if (gdsn_dim[1] == 0) {
        add.gdsn(index.gdsn(object, "annotation/format/EDS"), "data", output,
                 "bit6", compress = "", replace = TRUE)
    } else {
        append.gdsn(eds_gdsn, output)
    }
}

#' @importFrom gdsfmt append.gdsn
.saveGeno <- function(object, clean_out, sel) {
    output <- array(3, c(2, length(sel$sam), length(sel$mar)))
    rep_id <- getReplicates(object, parents = TRUE)
    id_hit <- match(rep_id, clean_out$mapping_id)
    output[, sel$sam, sel$mar] <- clean_out$best_geno[, id_hit,]

    out_gdsn <-index.gdsn(object, "annotation/format/CGT/data")
    gdsn_dim <- objdesp.gdsn(out_gdsn)$dim
    if (gdsn_dim[1] == 0) {
        add.gdsn(index.gdsn(object, "annotation/format/CGT"), "data", output,
                 "bit2", compress = "", replace = TRUE)
    } else {
        append.gdsn(out_gdsn, output)
    }
}

#' @importFrom gdsfmt append.gdsn
.savePGeno <- function(object, clean_out, sel) {
    output <- matrix(3, nrow(clean_out$p_geno), length(sel$mar))
    output[, sel$mar] <- clean_out$p_geno
    out_gdsn <- index.gdsn(object, "annotation/info/PGT")
    gdsn_dim <- objdesp.gdsn(out_gdsn)$dim
    if (gdsn_dim[1] == 0) {
        add.gdsn(index.gdsn(object, "annotation/info"), "PGT", output,
                 "bit2", compress = "", replace = TRUE)
    } else {
        append.gdsn(out_gdsn, output)
    }
}

#' @importFrom gdsfmt append.gdsn
.saveADB <- function(object, clean_out, sel) {
    output <- rep(-1, length(sel$mar))
    output[sel$mar] <- t(clean_out$bias)
    out_gdsn <- index.gdsn(object, "annotation/info/ADB")
    gdsn_dim <- objdesp.gdsn(out_gdsn)$dim
    if (gdsn_dim[1] == 0) {
        add.gdsn(index.gdsn(object, "annotation/info"), "ADB", output,
                 "single", compress = "", replace = TRUE)
    } else {
        append.gdsn(out_gdsn, output)
    }
}

#' @importFrom gdsfmt append.gdsn
.saveMR <- function(object, clean_out, sel) {
    output <- matrix(-1, nrow = ncol(clean_out$mismap), ncol = length(sel$mar))
    output[, sel$mar] <- t(clean_out$mismap)
    out_gdsn <- index.gdsn(object, "annotation/info/MR")
    gdsn_dim <- objdesp.gdsn(out_gdsn)$dim
    if (gdsn_dim[1] == 0) {
        add.gdsn(index.gdsn(object, "annotation/info"), "MR", output,
                 "single", compress = "", replace = TRUE)
    } else {
        append.gdsn(out_gdsn, output)
    }
}

################################################################################
################################################################################
# Prepare read count data
.loadReadCounts <- function(object, chr_i, parentless, dummy_reads) {
    if (exist.gdsn(object, "annotation/format/FAD")) {
        ad_node <- "filt"
    } else {
        ad_node <- "raw"
    }
    reads <- getRead(object, ad_node, FALSE, TRUE, chr_i)
    rep_id <- getReplicates(object, parents = FALSE)
    reads <- .pileupAD(ad = reads, rep_id = rep_id)

    if(parentless){
        ref_read <- matrix(c(dummy_reads, 0), nrow = 2, ncol = ncol(reads$ref))
        alt_read <- matrix(c(0, dummy_reads), nrow = 2, ncol = ncol(reads$ref))
        p_reads <- list(ref = ref_read, alt = alt_read)

    } else {
        p_reads <- getRead(object, ad_node, "only", TRUE, chr_i)
        rep_id <- getReplicates(object, parents = "only")
        p_reads <- .pileupAD(ad = p_reads, rep_id = rep_id)
    }

    p_reads <- .orderParents(object = object, p_reads = p_reads)

    return(list(p_ref = p_reads$ref,
                p_alt = p_reads$alt,
                ref = reads$ref,
                alt = reads$alt))
}

.orderParents <- function(object, p_reads){
    p_id <- getParents(object)
    rep_id <- getReplicates(object, parents = "only")
    sam_id <- getSamID(object, FALSE)[validSam(object, parents = "only")]
    id_hit <- match(sam_id, p_id$sampleID)
    p_id$rep_id[na.omit(id_hit)] <- rep_id[!is.na(id_hit)]
    member_id <- p_id$memberID[match(rownames(p_reads$ref),
                                     p_id$rep_id)]
    p_reads$ref <- p_reads$ref[order(member_id), ]
    p_reads$alt <- p_reads$alt[order(member_id), ]
    return(p_reads)
}

################################################################################
################################################################################
# Make genotype and haplotype pattern list
.makeGenoPat <- function(n_ploidy, alleles){
    out <- NULL
    for (i in seq_len(n_ploidy)) {
        out <- c(out, list(alleles))
    }
    out <- expand.grid(out)
    out <- t(apply(out, 1, sort))
    out <- rowSums(out[!duplicated(out),])
    return(out)
}

.makeGenoParents <- function(n_parents, alleles, het_parent){
    out <- NULL
    for (i in seq_len(n_parents * 2)) {
        out <- c(out, list(alleles))
    }
    out <- expand.grid(out, KEEP.OUT.ATTRS = FALSE)
    valid_pat <- apply(out, 1, function(x) length(unique(x)) > 1)
    out <- as.matrix(out[valid_pat,])
    if (!het_parent) {
        valid <- out[, c(TRUE, FALSE)] == out[, c(FALSE, TRUE)]
        valid <- apply(valid, 1, all)
        out <- out[valid, ]
    }
    attributes(out) <- list(dim = dim(out))
    return(out)
}

.initialPattern <- function(mt, xtype, het_parent, n_origin){
    if (het_parent) {
        out <- apply(matrix(seq_len(n_origin), 2), 2, paste, collapse="/")
        out <- lapply(seq_along(xtype[[1]]), function(i){
            gamet <- out[match(mt[[1]][, i], seq_len(0.5 * n_origin))]
            return(paste(gamet, collapse = "|"))
        })

    } else {
        gamet <- mt[[1]]
        gamet[gamet != 1] <- gamet[gamet != 1] * 2 - 1
        out <- lapply(seq_along(xtype[[1]]), function(i){
            return(paste(gamet[, i], collapse = "|"))
        })
    }
    return(out)
}

.progenyPattern <- function(gamet, mt, xtype, pg, het_parent){
    if(all(xtype == "pairing")){
        out <- vapply(X = seq_along(xtype),
                      FUN.VALUE = list(1),
                      FUN = function(i){
                          i_gamet <- gamet[match(mt[, i], pg)]
                          return(list(paste(i_gamet, collapse = "|")))
                      })
    } else if(all(xtype != "pairing")){
        out <- vapply(X = gamet,
                      FUN.VALUE = list(1),
                      FUN = function(x){
                          return(list(paste(x, x, sep = "|")))
                      })
    }
    return(out)
}

.getValidPat <- function(scheme, het_parent, n_origin) {
    Var1 <- Var2 <- NULL
    homo <- FALSE

    target_pedigree <- sort(unique(slot(scheme, "samples")))
    mt <- slot(scheme, "mating")
    xtype <- slot(scheme, "crosstype")
    pg <- slot(scheme, "progenies")

    pat <- NULL

    # Pairing of founders
    pat <- c(pat, list(.initialPattern(mt = mt,
                                       xtype = xtype,
                                       het_parent = het_parent,
                                       n_origin = n_origin)))

    # Progeny
    if(length(xtype) >= 2){
        for(i in seq_along(xtype)[-1]) {
            gamet <- lapply(pat[[i - 1]], sub, pattern = "\\|", replacement = "/")
            pat <- c(pat,
                     list(.progenyPattern(gamet = gamet,
                                          mt = mt[[i]],
                                          xtype = xtype[[i]],
                                          pg = pg[[i - 1]],
                                          het_parent = het_parent)))
        }
    }
    target_index <- which(unlist(pg) %in% target_pedigree)
    target_pat <- unlist(pat)[target_index]
    gamet <- lapply(target_pat, function(x) unlist(strsplit(x, split = "\\|")))
    out <- lapply(gamet, function(x){
        gamet1 <- as.numeric(unlist(strsplit(x[1], "/")))
        gamet2 <- as.numeric(unlist(strsplit(x[2], "/")))
        tmp <- expand.grid(gamet1, gamet2, stringsAsFactors = FALSE)
        tmp <- as.matrix(tmp)
        tmp <- rbind(tmp, tmp[, 2:1])
        tmp <- unique(tmp)
        tmp <- tmp[order(tmp[, 1], tmp[, 2]),]
        return(tmp)
    })

    names(out) <- target_pedigree
    return(out)
}

.getPossibleHap <- function(hap_progeny, geno_parents, geno_pat){
    hap_vec <- as.vector(t(hap_progeny))

    derived_geno <- apply(geno_parents, 1, function(x) {
        x <- x[hap_vec]
        x <- x[c(TRUE, FALSE)] + x[c(FALSE, TRUE)]
        return(x)
    })
    derived_geno <- as.vector(derived_geno)
    out <- rbind(derived_geno, derived_geno, derived_geno) == geno_pat
    out <- apply(out, 2, which)
    return(out)
}

.getPossibleGeno <- function(geno_parents, geno_pat){
    out <- apply(geno_parents, 1, function(x) {
        x <- x[c(TRUE, FALSE)] + x[c(FALSE, TRUE)]
        return(x)
    })
    out <- as.vector(out)
    out <- rbind(out, out, out) == geno_pat
    out <- apply(out, 2, which)

    return(out)
}

.makePattern <- function(n_parents,
                         n_ploidy,
                         n_alleles,
                         n_samples,
                         het_parent,
                         scheme) {
    alleles <- seq(0, length.out = n_alleles)

    geno_pat <- .makeGenoPat(n_ploidy, alleles)
    geno_parents <- .makeGenoParents(n_parents, alleles, het_parent)
    n_origin <- n_parents * 2^het_parent
    hap_progeny <- .getValidPat(scheme, het_parent, n_origin)

    possiblehap <- lapply(hap_progeny,
                          .getPossibleHap,
                          geno_parents = geno_parents,
                          geno_pat = geno_pat)

    possiblegeno <- .getPossibleGeno(geno_parents, geno_pat)

    n_p_pat <- nrow(geno_parents)
    n_hap_pat <- vapply(X = hap_progeny, FUN = nrow, FUN.VALUE = numeric(1))
    return(list(alleles = alleles,
                geno_pat = geno_pat,
                geno_parents = geno_parents,
                hap_progeny = hap_progeny,
                possiblehap = possiblehap,
                possiblegeno = possiblegeno,
                n_p_pat = n_p_pat,
                n_hap_pat = n_hap_pat,
                n_samples = n_samples))
}

################################################################################
################################################################################
# Prepare the transition provability matrix
.getJnum <- function(scheme, n_origin, het_parent, i) {
    xtype <- vapply(X = slot(scheme, "crosstype"),
                    FUN.VALUE = character(length = 1),
                    FUN = function(x){
                        return(x[1])
                    })
    n_x <- length(xtype)

    if(n_x > 1){
        for(j in seq(n_x, 2)){
            if(xtype[j] == "pairing"){
                j_mating <- slot(scheme, "mating")[[j]]
                i_mating <- j_mating[, i]
                prev_mating <- slot(scheme, "mating")[[j - 1]]
                prev_max <- max(prev_mating)
                j_matings <- prev_mating[, i_mating - prev_max]

                if(any(j_matings[, 1] %in% j_matings[, 2])){
                    xtype[j] <- "sibling"
                }
            }
        }
    }

    i_pairing <- grep("pairing", xtype)
    if (length(i_pairing) == 0) {
        n_pairing  <- 0
    } else {
        n_pairing <- max(i_pairing)
    }

    if (n_pairing == 0) {
        jnum <- .initJnum(n_origin)
    } else {
        next_crosstype <- xtype[n_pairing + 1]
        if(is.na(next_crosstype)){
            next_crosstype <- "selfing"
        }
        jnum <- .initJnum(n_origin, n_pairing + het_parent, next_crosstype)
    }

    s_gen <- n_pairing + 1
    n_gen <- length(xtype)
    if(s_gen <= n_gen){
        for (i in s_gen:n_gen) {
            jnum <- .calcNextJnum(xtype[i], jnum)
        }
    }
    return(jnum)
}

.initJnum <- function(n_origin, n_pairing, next_crosstype) {
    jnum <- data.frame(a12 = 0,
                       b12 = 1,
                       a123 = 0,
                       b123 = 0,
                       r = 0,
                       j1232 = 0,
                       k1232 = 0,
                       j1122 = 0,
                       k1122 = 0,
                       j1222 = 0)
    if (n_origin > 2) {
        jnum$b123 <- 1
    }
    if (!missing(n_pairing)) {
        jnum$a12 <- 1
        if (next_crosstype == "selfing") {
            jnum$j1232 <- jnum$r <- n_pairing - 1

        } else {
            r <- switch(n_pairing, "1" = 0, n_pairing - 2)
            jnum$j1232 <- jnum$k1232 <- jnum$r <- r

            if (n_origin > 2) {
                jnum$a123 <- 1
            } else {
                jnum$b12 <- 0.5
            }
            if (next_crosstype == "sibling") {
                while (n_pairing > 1) {
                    jnum <- .calcNextJnum("sibling", jnum)
                    n_pairing <- n_pairing - 1
                }
            }
        }
    }
    return(jnum)
}

.calcNextJnum <- function(crosstype, prob_df) {
    coal <- switch(crosstype,
                   "pairing" = c(0, 0),
                   "selfing" = c(1, 0),
                   "sibling" = c(0.5, 0))
    s <- coal[1]
    q <- coal[2]
    if (crosstype == "selfing") {
        next_a12 <- 0.5 * prob_df$a12
        next_b12 <- 0
        next_a123 <- 0
        next_b123 <- 0
        next_r <- prob_df$r + prob_df$a12
        next_j1232 <- 0.5 * prob_df$j1232
        next_k1232 <- 0
        next_j1122 <- 0.5 * prob_df$j1122 + 0.5 * prob_df$r
        next_k1122 <- 0
        next_j1222 <- 0.5 * (next_r - next_j1122 - next_j1232)

    } else {
        next_a12 <- prob_df$b12
        next_b12 <- 0.5 * s * prob_df$a12 + (1 - s) * prob_df$b12
        next_a123 <- s * prob_df$a123 + (1 - 2 * s) * prob_df$b123
        next_b123 <-
            1.5 * q * prob_df$a123 + (1 - s - 2 * q) * prob_df$b123
        next_r <- prob_df$r + prob_df$a12
        next_j1232 <- prob_df$k1232 + prob_df$a123
        next_k1232 <-
            0.5 * s * prob_df$j1232 + (1 - s) * (prob_df$k1232 + prob_df$a123)
        next_j1122 <- prob_df$k1122
        next_k1122 <-
            0.5 * s * (prob_df$r + prob_df$j1122) + (1 - s) * prob_df$k1122
        next_j1222 <- 0.5 * (next_r - next_j1122 - next_j1232)
    }
    return(data.frame(a12 = next_a12,
                      b12 = next_b12,
                      a123 = next_a123,
                      b123 = next_b123,
                      r = next_r,
                      j1232 = next_j1232,
                      k1232 = next_k1232,
                      j1122 = next_j1122,
                      k1122 = next_k1122,
                      j1222 = next_j1222))
}

.getXoFreq <- function(jnum, n_origin) {
    r00 <- jnum$j1232 / jnum$a12 / (n_origin - 2)
    r01 <- jnum$j1222 / jnum$a12
    r10 <- jnum$j1222 / (1 - jnum$a12) / (n_origin - 1)
    r11 <- jnum$j1122 / (1 - jnum$a12) / (n_origin - 1)
    return(data.frame(r00, r01, r10, r11))
}

.transitionProb <- function(pat, pos, recomb_rate,
                            scheme, het_parent){
    out <- lapply(seq_along(pat$hap_progeny), function(i){
        all_hap <- do.call("rbind", pat$hap_progeny)
        hap_progeny_i <- pat$hap_progeny[[i]]
        n_origin <- length(unique(as.vector(hap_progeny_i)))
        jnum <- .getJnum(scheme, n_origin, het_parent, i)
        jrate <- .getXoFreq(jnum, n_origin)
        invalid_joint <- apply(hap_progeny_i, 1, function(x) {
            apply(hap_progeny_i, 1, function(y) {
                check1 <- x[1] == x[2] & y[1] == y[2]
                check2 <- any(x == y)
                return(!check1 & !check2)
            })
        })

        ibd <- apply(hap_progeny_i,
                     1,
                     function(x) abs(length(unique(x)) - 2))
        ibd <- vapply(ibd, function(x) paste0(ibd, x), character(length(ibd)))
        ibd[ibd == "00"] <- jrate$r00
        ibd[ibd == "01"] <- jrate$r01
        ibd[ibd == "10"] <- jrate$r10
        ibd[ibd == "11"] <- jrate$r11
        ibd[invalid_joint] <- 0
        q_mat <- as.numeric(ibd)

        mar_dist <- diff(pos)
        rf <- mar_dist * 1e-6 * recomb_rate
        q_mat <- matrix(q_mat,
                        nrow(hap_progeny_i),
                        nrow(hap_progeny_i))
        diag(q_mat) <- NA
        diag(q_mat) <- -rowSums(q_mat, na.rm = TRUE)
        prob <- vapply(rf, function(x) expm.Higham08(q_mat * x),
                       numeric(length(q_mat)))
        prob_dim <- c(pat$n_hap_pat[i], pat$n_hap_pat[i], length(mar_dist))
        prob <- array(prob, prob_dim)
        return(prob)
    })
    return(out)
}

.getInitProb <- function(prob, n_pat, n_samples) {
    ev1 <- eigen(t(prob[, , 1]))$vectors[, 1]
    init <- ev1 / sum(ev1)
    return(init)
}

################################################################################
################################################################################
# Make parameter lists
.getPedigree <- function(object){
    out <- slot(object, "sample")$pedigree[validSam(object)]
    return(out)
}

.getParams <- function(object, chr_i, error_rate, recomb_rate,
                       call_threshold, het_parent, fix_bias, fix_mismap,
                       parentless, dummy_reads) {
    reads <- .loadReadCounts(object, chr_i, parentless, dummy_reads)
    if(parentless){
        n_parents <- 2
    } else {
        parents <- getParents(object)
        n_parents <- length(unique(parents$memberID))
    }
    n_samples <- length(unique(getReplicates(object = object)))
    n_alleles <- 2
    n_ploidy <- attributes(slot(object, "sample"))$ploidy
    pat <- .makePattern(n_parents, n_ploidy, n_alleles, n_samples,
                        het_parent, slot(object, "scheme"))

    pos <- getPosition(object, TRUE, chr_i)
    n_mar <- nmar(object, TRUE, chr_i)
    trans_prob <- .transitionProb(pat, pos, recomb_rate,
                                  slot(object, "scheme"), het_parent)
    init_prob <- lapply(trans_prob, .getInitProb,
                        n_pat = pat$n_p_pat, n_samples = n_samples)
    if(is.null(fix_mismap)){
        mismap <-  matrix(0.005, n_mar, 2)
        fix_mismap <- FALSE
    } else {
        mismap <- matrix(fix_mismap, n_mar, 2)
        fix_mismap <- TRUE
    }
    if(is.null(fix_bias)){
        bias <- rep(0.5, n_mar)
        fix_bias <- FALSE
    } else {
        bias <- rep(fix_bias, n_mar)
        fix_bias <- TRUE
    }

    return(list(n_parents = n_parents,
                n_samples = n_samples,
                parents_ids = rownames(reads$p_ref),
                sample_ids = rownames(reads$ref),
                n_alleles = n_alleles,
                n_ploidy = n_ploidy,
                n_mar = n_mar,
                n_pedigree = length(trans_prob),
                pedigree = .getPedigree(object),
                het_parent = het_parent,
                error_rate = c(1 - error_rate, error_rate),
                recomb_rate = recomb_rate,
                call_threshold = call_threshold,
                reads = reads,
                pat = pat,
                bias = bias,
                mismap = mismap,
                fix_mismap = fix_mismap,
                fix_bias = fix_bias,
                trans_prob = lapply(trans_prob, log10),
                init_prob = lapply(init_prob, log10),
                count = 0,
                p_geno_fix = -1,
                flip = FALSE))
}

################################################################################
################################################################################
# Execute the Viterbi algorithm
.getBestSeq <- function(param_list, outprob) {
    trans_prob <- unlist(param_list$trans_prob)
    init_prob <- unlist(param_list$init_prob)
    possiblehap <- unlist(param_list$pat$possiblehap)
    pedigree <- match(param_list$pedigree, names(param_list$pat$hap_progeny))

    out_list <- run_viterbi(p_ref = param_list$reads$p_ref,
                            p_alt = param_list$reads$p_alt,
                            ref = param_list$reads$ref,
                            alt = param_list$reads$alt,
                            eseq_in = param_list$error_rate,
                            bias = param_list$bias,
                            mismap = param_list$mismap,
                            trans_prob = trans_prob,
                            init_prob = init_prob,
                            n_pgeno = param_list$pat$n_p_pat,
                            n_hap = param_list$pat$n_hap_pat,
                            n_offspring = param_list$n_samples,
                            n_founder = param_list$n_parents,
                            n_marker = param_list$n_mar,
                            het = param_list$het_parent,
                            pedigree = pedigree - 1,
                            possiblehap = possiblehap - 1,
                            possiblegeno = param_list$pat$possiblegeno - 1,
                            p_geno_fix = param_list$p_geno_fix - 1,
                            ploidy = param_list$n_ploidy)

    if (outprob) {
        prob <- run_fb(ref = param_list$reads$ref,
                       alt = param_list$reads$alt,
                       eseq_in = param_list$error_rate,
                       bias = param_list$bias,
                       mismap = param_list$mismap,
                       possiblehap = possiblehap - 1,
                       trans_prob = trans_prob,
                       init_prob = init_prob,
                       n_pgeno = param_list$pat$n_p_pat,
                       n_hap = param_list$pat$n_hap_pat,
                       n_offspring = param_list$n_samples,
                       n_marker = param_list$n_mar,
                       pedigree = pedigree - 1,
                       p_geno = out_list$p_geno,
                       ploidy = param_list$n_ploidy)
        prob <- array(apply(prob, 3, function(x) return(t(x))),
                      dim = c(3, param_list$n_samples, param_list$n_mar))
        out_list$prob <- prob
    } else {
        out_list$prob <- NULL
    }
    out_list$best_seq <- out_list$best_seq + 1
    out_list$p_geno <- out_list$p_geno + 1
    return(out_list)
}

################################################################################
################################################################################
# Convert data format the Viterbi algorithm's output to the GBScleanR style
.pat2hap <- function(best_hap, param_list) {
    n_parents_chr <- (param_list$n_parents * param_list$n_ploidy)
    if (param_list$het_parent) {
        parent_hap <- rep(seq_len(n_parents_chr), each=param_list$n_mar)
    } else {
        parent_hap <- rep(seq(1, by=2, length.out=param_list$n_parents),
                          each=param_list$n_ploidy * param_list$n_mar)
    }
    parent_hap <- matrix(parent_hap, n_parents_chr, byrow=TRUE)
    sample_pat <- best_hap$best_seq
    pat_i <- match(param_list$pedigree,
                   names(param_list$pat$hap_progeny))
    sample_hap <- vapply(X = seq_len(param_list$n_samples),
                         FUN.VALUE = numeric(2 * param_list$n_mar),
                         FUN = function(i) {
                             hap_i <- param_list$pat$hap_progeny[[pat_i[i]]]
                             out <- c(hap_i[sample_pat[, i], 1],
                                      hap_i[sample_pat[, i], 2])
                             return(out)
                         })
    sample_hap <- matrix(sample_hap,
                         nrow = param_list$n_samples * 2,
                         ncol = param_list$n_mar,
                         byrow = TRUE)
    out_hap <- rbind(parent_hap, sample_hap)
    out_hap <- array(out_hap, c(param_list$n_ploidy,
                                param_list$n_parents + param_list$n_samples,
                                param_list$n_mar))
    return(out_hap)
}

.parentPat2Geno <- function(best_hap, param_list) {
    p_pat <- best_hap$p_geno
    p_geno <- t(param_list$pat$geno_parents[p_pat, ])
    return(p_geno)
}

.hap2geno <- function(hap, p_geno, param_list) {
    n <- param_list$n_ploidy * (param_list$n_samples + param_list$n_parents)
    out_geno <- vapply(seq_len(param_list$n_mar), FUN.VALUE = numeric(n), function(x) {
        return(p_geno[hap[,, x], x])
    })
    out_geno <- array(out_geno, c(param_list$n_ploidy,
                                  param_list$n_samples + param_list$n_parents,
                                  param_list$n_mar))

    return(out_geno)
}

.halfJoint <- function(best_pat_f, best_pat_r, param_list) {
    n_mar <- param_list$n_mar
    n_sample <- param_list$n_samples
    half <- round(n_mar / 2)
    first <- (n_mar:1)[seq_len(half)]
    latter <- (half + 1):n_mar

    best_seq <- rbind(best_pat_r$best_seq[first,],
                      best_pat_f$best_seq[latter,])
    p_geno <- c(best_pat_r$p_geno[first], best_pat_f$p_geno[latter])

    prob <- c(best_pat_r$prob[, , first],
              best_pat_f$prob[, , latter])
    prob <- array(prob, dim(best_pat_f$prob))
    prob <- apply(prob, 2, function(x) return(x))
    prob <- array(prob, c(3, n_mar, n_sample))
    out_list <- list(best_seq = best_seq,
                     p_geno = p_geno,
                     prob = prob)
    return(out_list)
}

.solveBorderConflict <- function(best_hap, param_list){
    half <- round(param_list$n_mar / 2)
    filp_i <- apply(best_hap, 2, function(x){
        return(any(x[, half] != x[, half + 1]))
    })
    half_m <- -seq(1, half)
    best_hap[, filp_i, half_m] <- best_hap[2:1, filp_i, half_m]
    return(best_hap)
}

.minimizeRecombination <- function(best_hap){
    for(i in seq_len(dim(best_hap)[2])){
        recomb <- apply(best_hap[, i,], 1, diff) != 0
        n_recomb <- rowSums(recomb)
        recomb_pos <- which(n_recomb == 1)
        for(j in seq_along(recomb_pos)){
            if(j == 1){
                prev_xo <- recomb[recomb_pos[j], ]
            } else {
                j_xo <- recomb[recomb_pos[j], ]
                if(all(j_xo == prev_xo)){
                    prev_xo <- j_xo[2:1]
                    flip_i <- -seq(1, recomb_pos[j])
                    best_hap[, i, flip_i] <- best_hap[2:1, i, flip_i]
                }
            }
        }
    }
    return(best_hap)
}

.summarizeEst <- function(best_hap, best_geno, pat_prob, param_list) {
    n_mar <- param_list$n_mar
    n_sample <- param_list$n_samples
    i_sample <- -seq_len(param_list$n_parents)
    sample_geno <- apply(best_geno[, i_sample, ], 2, colSums)
    sample_geno <- as.vector(sample_geno + 1)
    sample_geno <- sample_geno + seq(0, by=3, length.out=n_mar * n_sample)
    log10_th <- log10(param_list$call_threshold)
    geno_prob <- t(matrix(pat_prob[sample_geno], n_mar, n_sample) < log10_th)
    best_geno[1, i_sample, ][geno_prob] <- 3
    best_geno[2, i_sample, ][geno_prob] <- 3
    if (!param_list$het_parent) {
        best_hap[best_hap != 1] <- (best_hap[best_hap != 1] + 1) / 2
    }
    best_hap[1, i_sample, ][geno_prob] <- 0
    best_hap[2, i_sample, ][geno_prob] <- 0
    out_list <- list(best_hap = best_hap, best_geno = best_geno)
    return(out_list)
}

################################################################################
################################################################################
# Marker specific error rate estimation
.getBias <- function(best_seq, type, ref, alt) {
    if (type == 1) {
        est_het <- best_seq == 1
        ref[!est_het] <- NA
        ref <- colSums(ref, na.rm = TRUE)
        n_ref <- colSums(est_het, na.rm = TRUE)
        ref_prop <- ref / n_ref
        alt[!est_het] <- NA
        alt <- colSums(alt, na.rm = TRUE)
        n_alt <- colSums(est_het, na.rm = TRUE)
        alt_prop <- alt / n_alt
        bias <- ref_prop / (ref_prop + alt_prop)

    } else {
        est_ref <- best_seq == 0
        ref[!est_ref] <- NA
        ref <- colSums(ref, na.rm = TRUE)
        n_ref <- colSums(est_ref, na.rm = TRUE)
        ref_prop <- ref / n_ref

        est_alt <- best_seq == 2
        alt[!est_alt] <- NA
        alt <- colSums(alt, na.rm = TRUE)
        n_alt <- colSums(est_alt, na.rm = TRUE)
        alt_prop <- alt / n_alt
        bias <- ref_prop / (ref_prop + alt_prop)
    }
    return(list(bias = bias,
                ref = ref,
                alt = alt,
                n_ref = n_ref,
                n_alt = n_alt))
}

.sumUpBias <- function(bias1, bias2) {
    ref_prop <- (bias1$ref + bias2$ref) / (bias1$n_ref + bias2$n_ref * 2)
    alt_prop <- (bias1$alt + bias2$alt) / (bias1$n_alt + bias2$n_alt * 2)
    bias <- ref_prop / (ref_prop + alt_prop)
    return(bias)
}

.calcReadBias <- function(best_seq, param_list) {
    ref <- rbind(param_list$reads$p_ref, param_list$reads$ref)
    alt <- rbind(param_list$reads$p_alt, param_list$reads$alt)

    bias1 <- .getBias(best_seq, 1, ref, alt)
    bias2 <- .getBias(best_seq, 2, ref, alt)
    bias_cor <- cor(bias1$bias, bias2$bias, "pair")
    if (!is.na(bias_cor) & bias_cor > 0.7) {
        bias <- .sumUpBias(bias1, bias2)
    } else {
        bias <- bias1$bias
    }
    bias[is.na(bias)] <- 0.5
    return(bias)
}

.calcMissmap <- function(best_seq, param_list) {
    geno_call <- get_genocall(ref = param_list$reads$ref,
                              alt = param_list$reads$alt,
                              eseq_in = param_list$error_rate,
                              bias = param_list$bias,
                              mismap = param_list$mismap,
                              n_o = param_list$n_samples,
                              n_m = param_list$n_mar)
    missing <- param_list$reads$ref == 0 & param_list$reads$alt == 0
    geno_call[missing] <- FALSE

    i_samples <- -seq_len(param_list$n_parents)
    est <- best_seq[i_samples, ] == 0
    n_ref <- colSums(est, na.rm = TRUE)
    alt <- param_list$reads$alt > 0
    alt[!est] <- FALSE
    alt[!geno_call] <- FALSE
    alt_mis <- colSums(alt, na.rm = TRUE) / n_ref

    est <- best_seq[i_samples, ] == 2
    n_alt <- colSums(est, na.rm = TRUE)
    ref <- param_list$reads$ref > 0
    ref[!est] <- FALSE
    ref[!geno_call] <- FALSE
    ref_mis <- colSums(ref, na.rm = TRUE) / n_alt

    return(cbind(alt_mis, ref_mis))
}

.calcErrors <- function(best_seq, param_list) {
    best_seq <- apply(best_seq, 3, colSums)
    bias <- .calcReadBias(best_seq, param_list)
    mismap <- .calcMissmap(best_seq, param_list)
    return(list(bias = bias, mismap = mismap))
}

.bindErrors <- function(error_f, error_r) {
    nmar <- length(error_f$bias)
    bias <- colMeans(rbind(error_f$bias, error_r$bias), na.rm = TRUE)
    bias[is.na(bias)] <- 0.5

    mismap <- cbind(colMeans(rbind(error_f$mismap[, 1],
                                   error_r$mismap[, 1]), na.rm = TRUE),
                    colMeans(rbind(error_f$mismap[, 2],
                                   error_r$mismap[, 2]), na.rm = TRUE))
    mismap[is.na(mismap)] <- 0
    return(list(bias = bias, mismap = mismap))
}

################################################################################
################################################################################
# Flip parameter lists for the reverse-direction round
.flipParam <- function(param_list) {
    n_mar <- param_list$n_mar
    param_list$trans_prob <- lapply(param_list$trans_prob,
                                    function(x){
                                        return(x[, , (n_mar - 1):1])
                                    })
    param_list$reads$p_ref <- param_list$reads$p_ref[, n_mar:1]
    param_list$reads$p_alt <- param_list$reads$p_alt[, n_mar:1]
    param_list$reads$ref <- param_list$reads$ref[, n_mar:1]
    param_list$reads$alt <- param_list$reads$alt[, n_mar:1]
    param_list$bias <- param_list$bias[n_mar:1]
    param_list$mismap <- param_list$mismap[n_mar:1,]
    return(param_list)
}


################################################################################
################################################################################
# Run the iterative estimation
.runCycle <- function(param_list, outprob, outgeno) {
    param_list$count <- param_list$count + 1
    cycle <- paste0("Cycle ", param_list$count, ": ")
    message("\r", cycle)
    message("\r", "Forward round of genotype estimation ...")

    if (param_list$flip) {
        best_pat_r <- .getBestSeq(.flipParam(param_list), outprob)
        best_hap_r <- .pat2hap(best_pat_r, param_list)
        p_geno_r <- .parentPat2Geno(best_pat_r, param_list)
        best_geno_r <- .hap2geno(best_hap_r, p_geno_r, param_list)
        param_list$p_geno_fix <- best_pat_r$p_geno[param_list$n_mar]

    } else {
        best_pat_f <- .getBestSeq(param_list, outprob)
        best_hap_f <- .pat2hap(best_pat_f, param_list)
        p_geno_f <- .parentPat2Geno(best_pat_f, param_list)
        best_geno_f <- .hap2geno(best_hap_f, p_geno_f, param_list)
        param_list$p_geno_fix <- best_pat_f$p_geno[param_list$n_mar]
    }

    message("\r", "Backward round of genotype estimation  ...")
    if (param_list$flip) {
        best_pat_f <- .getBestSeq(param_list, outprob)
        best_hap_f <- .pat2hap(best_pat_f, param_list)
        p_geno_f <- .parentPat2Geno(best_pat_f, param_list)
        best_geno_f <- .hap2geno(best_hap_f, p_geno_f, param_list)
        param_list$p_geno_fix <- -1

    } else {
        best_pat_r <- .getBestSeq(.flipParam(param_list), outprob)
        best_hap_r <- .pat2hap(best_pat_r, param_list)
        p_geno_r <- .parentPat2Geno(best_pat_r, param_list)
        best_geno_r <- .hap2geno(best_hap_r, p_geno_r, param_list)
        param_list$p_geno_fix <- -1
    }

    # ### Debug
    # return(list(best_geno_f, best_geno_r, param_list))
    # ###

    if (!outprob) {
        message("\r",
                "Paramter optimization ...")
        error_f <- .calcErrors(best_geno_f, param_list)
        error_r <- .calcErrors(best_geno_r[,,param_list$n_mar:1], param_list)
        error <- .bindErrors(error_f, error_r)
        check <- error$bias > param_list$error_rate[1]
        error$bias[check] <- param_list$error_rate[1]
        check <- error$bias < param_list$error_rate[2]
        error$bias[check] <- param_list$error_rate[2]

        if(!param_list$fix_bias){
            param_list$bias <- error$bias
        }
        if(!param_list$fix_mismap){
            param_list$mismap <- error$mismap
        }
    }

    if (outgeno) {
        message("\r", "Summarizing output ...")
        best_pat <- .halfJoint(best_pat_f, best_pat_r, param_list)
        p_geno <- .parentPat2Geno(best_pat, param_list)
        best_hap <- .pat2hap(best_pat, param_list)
        best_hap <- .solveBorderConflict(best_hap, param_list)
        best_hap <- .minimizeRecombination(best_hap)
        best_geno <- .hap2geno(best_hap, p_geno, param_list)
        out_list <- .summarizeEst(best_hap, best_geno,
                                  best_pat$prob, param_list)
        out_list$p_geno <- p_geno
        out_list$bias <- param_list$bias
        out_list$mismap <- param_list$mismap
        out_list$mapping_id <- c(param_list$parents_ids, param_list$sample_ids)

        message("\r", "Done!")
        return(out_list)
    } else {
        return(param_list)
    }
}

################################################################################
################################################################################
# Check data to avoid no reads for founders at the first marker
.checkPread <- function(param_list) {
    param_list$flip <- FALSE
    p_read_s <- sum(param_list$reads$p_ref[, 1] == 0 &
                        param_list$reads$p_alt[, 1] == 0)
    p_read_e <- sum(param_list$reads$p_ref[, param_list$n_mar] == 0 &
                        param_list$reads$p_alt[, param_list$n_mar] == 0)
    if (p_read_s < p_read_e) {
        param_list$flip <- TRUE
    } else if (p_read_s == p_read_e) {
        p_read_s <- sum(param_list$reads$p_ref[, 1] +
                            param_list$reads$p_alt[, 1])
        p_read_e <- sum(param_list$reads$p_ref[, param_list$n_mar] +
                            param_list$reads$p_alt[, param_list$n_mar])
        if (p_read_s < p_read_e) {
            param_list$flip <- TRUE
        }
    }
    return(param_list)
}

################################################################################
################################################################################
# Run genotype estimation per chromosome
.cleanEachChr <- function(object,
                          chr_i,
                          error_rate,
                          recomb_rate,
                          call_threshold,
                          het_parent,
                          optim,
                          iter,
                          fix_bias,
                          fix_mismap,
                          parentless,
                          dummy_reads) {
    param_list <- .getParams(object, chr_i, error_rate, recomb_rate,
                             call_threshold, het_parent, fix_bias, fix_mismap,
                             parentless, dummy_reads)
    param_list <- .checkPread(param_list)

    # ### Debug
    # out <- .runCycle(param_list, FALSE, FALSE)
    # return(out)
    # ###

    if (iter == 1) { optim <- FALSE }

    if (optim) {
        param_list <- .runCycle(param_list, FALSE, FALSE)

        for (i in 2:iter) {
            if (i == iter) {
                out_list <- .runCycle(param_list, TRUE, TRUE)

            } else {
                param_list <- .runCycle(param_list, FALSE, FALSE)
            }
        }
    } else {
        out_list <- .runCycle(param_list, TRUE, TRUE)
    }
    return(out_list)
}
tomoyukif/GBScleanR documentation built on April 27, 2024, 9:06 a.m.