#' @export
format_haplotypes_dlp <- function(haplotypes, CNbins, hmmcopybinsize = 0.5e6) {
options("scipen" = 20)
haplotypes <- haplotypes %>%
data.table::as.data.table()
CNbins <- CNbins %>%
data.table::as.data.table()
bins <- CNbins[, c("chr", "start", "end")] %>%
unique(., by = c("chr", "start", "end")) %>%
.[, binid := paste(chr, start, end, sep = "_")] %>%
.$binid
message(paste0("Number of distinct bins in copy number data: ", length(bins)))
binshaps <- haplotypes[, c("chr", "start", "end")] %>%
unique(., by = c("chr", "start", "end")) %>%
.[, start := floor(start / hmmcopybinsize) * hmmcopybinsize + 1] %>%
.[, end := start + hmmcopybinsize - 1] %>%
.[, binid := paste(chr, start, end, sep = "_")] %>%
.$binid
message(paste0("Number of distinct bins in haplotype data: ", length(unique(binshaps))))
formatted_haplotypes <- haplotypes %>%
.[, allele_id := paste0("allele", allele_id)] %>%
data.table::dcast(., ... ~ allele_id, value.var = "readcount", fill = 0L) %>%
.[, start := floor(start / hmmcopybinsize) * hmmcopybinsize + 1] %>%
.[, end := start + hmmcopybinsize - 1] %>%
.[, lapply(.SD, sum), by = .(cell_id, chr, start, end, hap_label), .SDcols = c("allele1", "allele0")] %>%
.[, totalcounts := allele1 + allele0] %>%
.[, hbinid := paste(chr, start, end, sep = "_")] %>%
.[hbinid %in% bins] %>%
.[, hbinid := NULL]
binshaps2 <- formatted_haplotypes[, c("chr", "start", "end")] %>%
unique(., by = c("chr", "start", "end")) %>%
.[, start := floor(start / hmmcopybinsize) * hmmcopybinsize + 1] %>%
.[, end := start + hmmcopybinsize - 1] %>%
.[, binid := paste(chr, start, end, sep = "_")] %>%
.$binid
message(paste0("Number of distinct bins in formatted haplotype data: ", length(unique(binshaps2))))
return(as.data.frame(formatted_haplotypes))
}
#' @export
format_haplotypes <- function(haplotypes,
filtern = 0,
hmmcopybinsize = 0.5e6,
phased_haplotypes = NULL,
phasing_method = "distribution", ...) {
message("Phase haplotypes...")
if (is.null(phased_haplotypes)) {
if (phasing_method == "distribution") {
message("Phasing based on distribution across all cells")
phased_haplotypes <- phase_haplotypes(haplotypes)
} else {
phased_haplotypes <- computehaplotypecounts(haplotypes, ...)
}
}
message("Join phased haplotypes...")
haplotypes <- haplotypes[phased_haplotypes, on = .(chr, start, end, hap_label)]
message("Reorder haplotypes based on phase...")
haplotypes <- haplotypes %>%
.[, alleleA := ifelse(phase == "allele0", allele1, allele0)] %>%
.[, alleleB := ifelse(phase == "allele0", allele0, allele1)] %>%
.[totalcounts > filtern, BAF := alleleB / totalcounts] %>%
.[, c("allele1", "allele0", "phase") := NULL]
return(haplotypes)
}
#' @export
phase_haplotypes <- function(haplotypes) {
phased_haplotypes <- data.table::as.data.table(haplotypes) %>%
.[, lapply(.SD, sum), by = .(chr, start, end, hap_label), .SDcols = c("allele1", "allele0")] %>%
.[, phase := ifelse(allele0 < allele1, "allele0", "allele1")] %>%
.[, c("allele1", "allele0") := NULL]
return(phased_haplotypes)
}
#' @export
computehaplotypecounts <- function(haplotypes, ncells = 10, arm = FALSE) {
message(paste0("Phasing based on distribution across top ", ncells, " cells with highest imbalance"))
formatted_haplotypes <- haplotypes %>%
.[, R := fifelse(allele0 == 0 | allele1 == 0, 0, 1)] %>%
.[, R0 := fifelse(allele0 == 0, "allele0", "allele1")] %>%
.[, R1 := fifelse(allele1 == 0, "allele1", "allele0")]
if (arm == FALSE) {
perchr <- formatted_haplotypes %>%
.[, list(meanR = mean(R), meanR0 = Mode(R0), meanR1 = Mode(R1)),
by = c("cell_id", "chr")
] %>%
# .[, dominant := fifelse(meanR0 < meanR1, "R1", "R0")] %>%
setkey("meanR") %>%
.[, head(.SD, ncells), by = c("chr")]
} else {
formatted_haplotypes$arm <- coord_to_arm(formatted_haplotypes$chr, formatted_haplotypes$start)
perchr <- formatted_haplotypes %>%
.[, list(meanR = mean(R), meanR0 = Mode(R0), meanR1 = Mode(R1)),
by = c("cell_id", "chr", "arm")
] %>%
# .[, dominant := fifelse(meanR0 < meanR1, "R1", "R0")] %>%
setkey("meanR") %>%
.[, head(.SD, ncells), by = c("chr", "arm")]
}
if (arm == FALSE) {
limitedhaps <- perchr[formatted_haplotypes, on = c("cell_id", "chr"), nomatch = 0] %>%
.[, phase := ifelse(allele0 < allele1, "allele0", "allele1")]
phased_haplotypes <- limitedhaps %>%
dplyr::group_by(chr, start, end, hap_label) %>%
dplyr::summarise(phase = Mode(phase), meanR = mean(meanR)) %>%
dplyr::ungroup() %>%
data.table::as.data.table()
} else {
limitedhaps <- perchr[formatted_haplotypes, on = c("cell_id", "chr", "arm"), nomatch = 0] %>%
.[, phase := ifelse(allele0 < allele1, "allele0", "allele1")]
phased_haplotypes <- limitedhaps %>%
dplyr::group_by(chr, start, arm, end, hap_label) %>%
dplyr::summarise(phase = Mode(phase), meanR = mean(meanR)) %>%
dplyr::ungroup() %>%
data.table::as.data.table()
}
return(phased_haplotypes)
}
#' @export
combineBAFCN <- function(haplotypes,
CNbins,
filtern = 0,
phased_haplotypes = NULL,
minbins = 100,
minbinschr = 10,
phasing_method = "distribution", ...) {
message("Finding overlapping cell IDs between CN data and haplotype data...")
cellidoverlap <- intersect(CNbins$cell_id, haplotypes$cell_id)
message(paste0("Total number of cells in both CN and haplotypes: ", length(cellidoverlap)))
CNbins <- data.table::as.data.table(CNbins)
haplotypes <- data.table::as.data.table(haplotypes)
if (all(cellidoverlap %in% CNbins$cell_id)) {
message(paste0("Number of cells in CN data: ", length(unique(CNbins$cell_id))))
CNbins <- CNbins[cell_id %in% cellidoverlap]
}
if (all(cellidoverlap %in% haplotypes$cell_id)) {
message(paste0("Number of cells in haplotype data: ", length(unique(CNbins$cell_id))))
haplotypes <- haplotypes[cell_id %in% cellidoverlap]
}
message("Joining bins and haplotypes...")
haplotypes <- format_haplotypes(haplotypes,
phased_haplotypes = phased_haplotypes,
phasing_method = phasing_method, ...
)
haplotypes <- data.table::as.data.table(haplotypes)
CNbins <- CNbins[haplotypes, on = c("chr", "start", "end", "cell_id"), nomatch = 0]
CNBAF <- data.table::as.data.table(CNbins) %>%
.[totalcounts > filtern] %>%
.[, lapply(.SD, sum), by = .(chr, start, end, cell_id, state, copy), .SDcols = c("alleleA", "alleleB", "totalcounts")] %>%
.[, BAF := alleleB / totalcounts]
CNBAF <- CNBAF %>%
.[, n := .N, by = "cell_id"] %>%
.[n > minbins] %>%
.[, n := NULL] %>%
.[, n := .N, by = c("chr", "cell_id")] %>%
.[, n := ifelse(chr == "X", 100, n)] %>%
.[, ncell := min(n), by = c("cell_id")] %>%
#do not filter out if chr X below cutoff
.[ncell > minbinschr] %>%
.[, n := NULL] %>%
.[, ncell := NULL]
message(paste0("Total number of cells after removing cells with < ", minbins, " bins: ", length(unique(CNBAF$cell_id))))
return(as.data.frame(CNBAF))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.