# Segmentation functions
#' @title BAF_EM_VAF
#' @description Clusters integer read counts to model mixtures of
#' noise distributions, binomial vs beta-binomial distributions,
#' over-dispersion.
#' @param MAT A numeric matrix of alternate (AD) and reference (RD) read counts
#' in addition to pre-calculated total read depth DP = (AD + RD),
#' log-transformed binomial coefficient (LBC), log-transformed total
#' read depth (log_DP).
#' @param binom Boolean set to true by default to model the mixture distribution
#' assuming binomial distribution. Otherwise set to false to explore beta-binomial
#' and binomial models.
#' @param show_dots Boolean set to true by default to visualize
#' computational runtime.
#' @param clust_tVAF Boolean but set to null to indicate clustering
#' normal read counts represented by modeling a mixture of noise and/or
#' VAF cluster centered around 0.5. Otherwise when set to true, the function
#' models the mixture by noise, cluster at 0.5, clusters p and 1-p for copy-altered
#' genomic segments.
#' @export
#' @return A list of clustered parameter estimates and
#' posterior probabilities for inferring classification.
BAF_EM_VAF = function(MAT,binom=TRUE,show_dots=TRUE,clust_tVAF=NULL){
if(FALSE){
MAT = as.matrix(sub_data[,c("AD","RD","DP","LBC","log_DP")])
binom = binom
show_dots = TRUE
clust_tVAF = FALSE
}
# Constants
init_pi = matrix(c(
1,0,0,0, # pure noise
0,1,0,0, # pure VAF=0.5
1,1,0,0, # mix of noise and VAF=0.5
1,0,1,1, # mix of noise and VAF=mm and VAF=1-mm
0,1,1,1, # mix of VAF=0.5 and VAF=mm and VAF=1-mm
1,1,1,1), # mix of noise and VAF=0.5 and VAF=mm and VAF=1-mm
ncol=4,byrow=TRUE)
my_max_iter = 3e2
eps = 1e-3
vec_pp = c(1/20,seq(4)/10)
if( !is.null(clust_tVAF) ){
if( clust_tVAF == FALSE ){
vec_pp = 0.25
init_pi = init_pi[1:3,,drop=FALSE]
}
}
init_pi = init_pi / rowSums(init_pi)
# Optimize
# iPSI = ifelse(binom == TRUE,0,0.005)
iPSI_vec = c(0,0.005)
if( binom == TRUE ) iPSI_vec = 0
em_out = NULL
for(iPSI in iPSI_vec){
for(jj in seq(nrow(init_pi))){
for(mm0 in vec_pp){
tmp_em = Rcpp_BAF_clust(RD = MAT[,c("AD","RD"),drop = FALSE],DP = MAT[,"DP"],
log_DP = MAT[,"log_DP"],LBC = MAT[,"LBC"],props0 = init_pi[jj,],
mm0 = mm0,psi0 = iPSI,max_iter = my_max_iter,eps = eps,show = FALSE)
if( is.null(em_out) || ( tmp_em$converge == "yes" && tmp_em$BIC > em_out$BIC ) ){
# If clust_tVAF, there should be at least 2 distributions mixed together
if( is.null(clust_tVAF) ){
em_out = tmp_em
} else if( clust_tVAF == TRUE ){
noise_dist = 1 * ( tmp_em$props[1] > 0 )
half_dist = 1 * ( tmp_em$props[2] > 0 )
mm_dist = 1 * ( sum(tmp_em$props[3:4]) > 0 )
num_dist = noise_dist + half_dist + mm_dist
if( noise_dist == 1 || num_dist >= 2 ){
em_out = tmp_em
}
} else {
em_out = tmp_em
}
}
rm(tmp_em)
}}}
em_out
}
#' @title run_AF_SEG
#' @description This function performs normal and tumor variant
#' allele frequency segmentation. Rows with allele frequencies
#' near 0 and 1 should be excluded before segmentation.
#' @param DATA a R data.frame containing columns \code{RD},
#' \code{AD}, and \code{index} corresponding to reference depth,
#' alternate depth, and the ordering and grouping of \code{DATA}'s
#' rows.
#' @param num_divides A positive integer to specify the initial
#' number of equally spaced segments prior to segmentation. This is
#' used if \code{init_seg} is set to \code{NULL}.
#' @param init_seg A vector specifying the user's initial segmentation.
#' @param binom Boolean value set to \code{TRUE} to model read counts
#' with binomial distribution. Otherwise model read counts with
#' beta-binomial distribution.
#' @param clust_tVAF If \code{TRUE}, \code{DATA} is assumed to
#' originate from tumor read counts. Otherwise, \code{DATA} is
#' assumed to originate from normal read counts.
#' @param show_plot If \code{TRUE}, the function will generate
#' a sequence of plots to visualize the points and ongoing segmentation.
#' @param show_mess If \code{TRUE}, the function will output detailed
#' segmentation messages. By default \code{show_mess = FALSE}.
#' @param min_adj_seg An integer for the minimum number of indices
#' to constitute a segment.
#' @param min_nUniqIdx_pSeg An integer for the minimum number of unique
#' indices per segment.
#' @return List of segmentation results. Object \code{INFER} is a data.frame
#' of segment-specific metrics. Object \code{BIC} contains the
#' Bayesian Information Criterion for the overall segmentation.
#' Object \code{SEG} is a numeric vector representing the indices of
#' segment start and ends.
#' @export
run_AF_SEG = function(DATA,num_divides,init_seg = NULL,binom = TRUE,
clust_tVAF = TRUE,show_plot = FALSE,show_mess = FALSE,min_adj_seg = 20,
min_nUniqIdx_pSeg = 10){
if(FALSE){
DATA = input_data
num_divides = 40
init_seg = NULL
binom = binom
clust_tVAF = !TRUE
show_plot = TRUE
show_mess = show_mess
}
# Functions
# Plan: Write mini-functions for parts of full segmentation
get_min_adjacent = function(SEG){
# SEG = c(1,10,55,100)
dat = c()
begin = SEG[1]
for(ii in SEG[-1]){
finish = ii
dat = rbind(dat,smart_df(start = begin,end = finish))
begin = finish + 1
}
dat$length = dat$end - dat$start
min(dat$length)
}
get_START_END = function(SEG,SEG_INT){
# SEG_INT = takes values between 1 and length(SEG) - 1
# Get START and END
END = SEG[SEG_INT + 1]
if(SEG_INT == 1){
START = SEG[SEG_INT]
} else {
START = SEG[SEG_INT] + 1
}
# print(paste0("START(",START,") <= INDEX <= END(",END,")"))
c(START,END)
}
get_new_seg_BIC = function(DATA,SEG,input_HIST){
if(FALSE){
# DATA = input_data
# SEG = SEG + kk*jj*vec_shift
# SEG = c(1,14,24,283,308,334,385,411,487,513)
SEG = curr_seg
# SEG = new_SEG
# SEG = curr_seg[-ii]
# SEG = curr_seg + pos_neg*increment*e_vec
input_HIST = hist_LL
}
start_pos = SEG[1]
curr_LL = 0
curr_k = 0
curr_BIC = 0
for(ii in SEG[-1]){
# ii = SEG[-1][1]
end_pos = ii
# print(c(start_pos,end_pos))
# Code for BAF
tmp_index = which(input_HIST$start == start_pos & input_HIST$end == end_pos)
tmp_idx = which(DATA$index >= start_pos & DATA$index <= end_pos)
if( is.null(input_HIST) || length(tmp_index) == 0 ){
sub_data = DATA[tmp_idx,,drop = FALSE]
# rownames(sub_data) = NULL
tmp_int = curr_BAF_EM_VAF(MAT = as.matrix(sub_data[,
c("AD","RD","DP","LBC","log_DP"),drop = FALSE]))
input_HIST = rbind(input_HIST,smart_df(start = start_pos,
end = end_pos,LL = tmp_int$LL,ms = tmp_int$ms))
} else if( length(tmp_index) == 1 ){
tmp_int = list()
tmp_int$LL = input_HIST$LL[tmp_index]
tmp_int$ms = input_HIST$ms[tmp_index]
}
curr_LL = curr_LL + tmp_int$LL
curr_k = curr_k + tmp_int$ms
curr_npts = length(tmp_idx)
#curr_BIC = curr_BIC +
# 2 * tmp_int$LL - log(curr_npts) * tmp_int$ms
start_pos = end_pos + 1
}
BIC = 2 * curr_LL - log(nrow(DATA)) * curr_k
# BIC = curr_BIC
list(BIC = round(BIC,2),HIST = input_HIST)
}
find_new_DIV = function(DATA,SEG,SEG_INT,input_HIST){
if(FALSE){
DATA = input_data
SEG = curr_seg
# SEG_INT = sample(x = c(2,seq(2,length(SEG))),size = 1)
SEG_INT = global_SEG_INT
input_HIST = hist_LL
}
SE = get_START_END(SEG,SEG_INT)
# Check num unique indices
thres = 9 #9
min_num_idx_per_seg = min_adj_seg # minimum end - start index among segmentation
num_uniq_idx_per_seg = min_nUniqIdx_pSeg
uniq_indices = unique(DATA$index[which(DATA$index >= SE[1] + 1 & DATA$index <= SE[2] - 1)])
SEG_BIC = get_new_seg_BIC(DATA,SEG,input_HIST)$BIC
if( length(uniq_indices) >= num_uniq_idx_per_seg ){
new_INDICES = unique(as.numeric(round(quantile(uniq_indices,seq(thres-1)/thres))))
vec_BIC = c()
for(ii in new_INDICES){
# ii = new_INDICES[2]
new_SEG = unique(sort(c(SEG,ii)))
min_adjacent = get_min_adjacent(new_SEG)
if( min_adjacent >= min_num_idx_per_seg ){
tmp_list = get_new_seg_BIC(DATA,new_SEG,input_HIST)
vec_BIC = c(vec_BIC,tmp_list$BIC)
input_HIST = tmp_list$HIST
} else {
vec_BIC = c(vec_BIC,SEG_BIC)
}
}
max_index = which.max(vec_BIC)
out_SEG = SEG
if( vec_BIC[max_index] > SEG_BIC){
out_SEG = sort(c(SEG,new_INDICES[max_index]))
}
} else {
out_SEG = SEG
}
NEW = ifelse(length(SEG) < length(out_SEG),1,0)
list(SEG=out_SEG,HIST=input_HIST,NEW=NEW)
}
find_new_DEL = function(DATA,SEG,input_HIST){
curr_BIC = get_new_seg_BIC(DATA,SEG,input_HIST)$BIC
seg_DEL = 0
out_SEG_INT = NA
for(ii in seq(2,length(SEG)-1)){
tmp_list = get_new_seg_BIC(DATA,SEG[-ii],input_HIST)
input_HIST = tmp_list$HIST
if( tmp_list$BIC >= curr_BIC ){
SEG = SEG[-ii]
seg_DEL = 1
out_SEG_INT = ii - 1
break
}
}
list(SEG=SEG,DEL=seg_DEL,HIST=input_HIST,SEG_INT=out_SEG_INT)
}
find_new_ADJ = function(DATA,SEG,input_HIST,vec_INTS,show=TRUE,show_try=!TRUE){
if(FALSE){
# DATA = input_data
SEG = curr_seg
input_HIST = hist_LL
vec_INTS = vec_INTS
show = TRUE
}
all_SHIFT = 2^seq(0,4)
seg_ADJ = 0
for(ii in vec_INTS){
# ii = vec_INTS[1]
vec_shift = rep(0,length(SEG))
vec_shift[ii] = 1
seg_ADJ_2 = 0
while(TRUE){
# Get max distance for adjustment
seg_LEFT = SEG[ii]-SEG[ii-1]
seg_RIGHT = SEG[ii+1]-SEG[ii]
seg_SHIFT = min(abs(c(seg_LEFT,seg_RIGHT)))
if( seg_SHIFT < 5 ) break
all_SHIFT2 = rev(all_SHIFT[which(all_SHIFT < seg_SHIFT)])
tmp_list = get_new_seg_BIC(DATA,SEG,input_HIST)
tmp_BIC = tmp_list$BIC
input_HIST = tmp_list$HIST
for(jj in all_SHIFT2){
# jj = all_SHIFT2[3]
vec_BIC = c()
for(kk in c(-1,1)){
if(FALSE){
kk = -1
kk = 1 # something wrong here
SEG + kk*jj*vec_shift
}
if(show_try) print(SEG + kk*jj*vec_shift)
min_adjacent = get_min_adjacent(SEG + kk*jj*vec_shift)
if( min_adjacent >= 5 ){
tmp_list = get_new_seg_BIC(DATA,SEG + kk*jj*vec_shift,input_HIST)
input_HIST = tmp_list$HIST
vec_BIC = c(vec_BIC,tmp_list$BIC)
} else {
vec_BIC = c(vec_BIC,tmp_BIC)
}
}
if( max(vec_BIC) > tmp_BIC ){
tmp_BIC = max(vec_BIC)
SEG = SEG + c(-1,1)[which.max(vec_BIC)]*jj*vec_shift
# get_new_seg_BIC(DATA,SEG,input_HIST)$BIC
if(show) message("Adj found!",appendLF = FALSE)
# if(show) print(SEG)
seg_ADJ = 1
seg_ADJ_2 = 1
break
}
}
if( seg_ADJ_2 == 0 || (seg_ADJ_2 == 1 && jj == 1) ) break
}
}
list(SEG=SEG,seg_ADJ=seg_ADJ,HIST=input_HIST)
}
plot_SEG = function(DATA,SEG){
plot(DATA[,c("index","VAF")],pch=16,cex=DATA$pt_cex,col=DATA$pt_col,ylim=c(0,1))
abline(v=SEG,lwd=2,lty=2,col="red")
abline(h=0.5,lwd=1,lty=2)
}
infer_SEG = function(DATA,SEG,show_plot=TRUE){
if(FALSE){
DATA = DATA
SEG = curr_seg
show_plot = TRUE
}
if(show_plot) plot_SEG(DATA,SEG)
out_infer = c()
START = SEG[1]
seg = 1
# eps_thres = ifelse(clust_tVAF,1,0.5)
eps_thres = 1
for(ii in seq(2,length(SEG))){
# ii = 13
END = SEG[ii]
# print(c(START,END))
my_MAT = as.matrix(DATA[which(DATA$index >= START & DATA$index <= END),c("AD","RD","DP","LBC","log_DP")])
em_out = curr_BAF_EM_VAF(MAT = my_MAT)
# em_out[c("converge","BIC","ms","params")]
maf = NA
if(em_out$params[1] < eps_thres){
if( em_out$params[2] >= 0.5 ){
maf = 0.5
} else {
maf = em_out$params[3]
if( em_out$params[4] < 1 ){
maf = c(maf,1-maf)
}
}
if(show_plot) segments(x0=START,y0=maf,x1=END,col="magenta",lwd=4,lty=1)
}
out_infer = rbind(out_infer,smart_df(seg,START,END,t(em_out$params),ms=em_out$ms))
START = END + 1
seg = seg + 1
}
names(out_infer) = c("seg","start","end","eps","prob_half","mBAF","prop_mBAF","PSI","ms")
out_infer
}
# 0) Check certain data.frame names exist in DATA
req_cols = c("AD","RD","index")
if( !all(req_cols %in% names(DATA)) ){
miss_cols = req_cols[!(req_cols %in% names(DATA))]
stop(paste0("DATA missing fields: ",paste(miss_cols,collapse=", ")))
}
# Make auxiliary variables (DP,LBC,VAF,pt_cex)
DATA$DP = rowSums(DATA[,c("AD","RD")])
DATA$log_DP = log(DATA$DP)
DATA$LBC = lchoose(DATA$DP,DATA$AD)
DATA$VAF = DATA$AD / DATA$DP
DATA$pt_cex = 0.5*log10(DATA$DP)
DATA$pt_col = rgb(0,0,0,0.5)
# 1) Determine which VAFs are being clustered
curr_BAF_EM_VAF = function(...){
BAF_EM_VAF(...,binom = binom,show_dots = FALSE,clust_tVAF = clust_tVAF)
}
# 2) Initialization
hist_LL = c()
# DATA = truth_data$all_data
if( is.null(init_seg) ){
tmp_range = range(DATA$index)
curr_seg = as.numeric(round(quantile(seq(tmp_range[1],tmp_range[2]),
seq(0,num_divides)/num_divides)))
if( get_min_adjacent(curr_seg) < 5 ){
curr_seg = tmp_range
}
} else {
curr_seg = init_seg
}
curr_list = get_new_seg_BIC(DATA,curr_seg,hist_LL)
hist_LL = curr_list$HIST
hist_BIC = curr_list$BIC
# 3) Run Segmentation
global_SEG_INT = 1; iter = 0; prop_thres = 0
while( global_SEG_INT <= length(curr_seg) ){
if(show_plot){
plot_SEG(DATA,curr_seg)
abline(v=curr_seg[c(global_SEG_INT,global_SEG_INT+1)],col="darkgreen",lty=2,lwd=2)
}
if(show_mess) print(curr_seg)
if( !is.na(curr_seg[global_SEG_INT+1]) && curr_seg[global_SEG_INT+1] < max(curr_seg) ){
tmp_num = curr_seg[global_SEG_INT+1] / max(curr_seg)
while(TRUE){
if( tmp_num >= prop_thres ){
message(paste0(prop_thres*100,"% "),appendLF = FALSE)
prop_thres = prop_thres + 0.05
} else {
break
}
}
}
# ADD SEGMENT
seg_ADD = 0
if(TRUE){
if(show_mess) message("Try ADD: ",appendLF = FALSE)
list_DIV = find_new_DIV(DATA,curr_seg,global_SEG_INT,hist_LL)
hist_LL = list_DIV$HIST
curr_seg = list_DIV$SEG
seg_ADD = list_DIV$NEW
if( seg_ADD == 1 ){
if(show_mess) message("Added segment",appendLF = FALSE)
global_SEG_INT = max(c(1,global_SEG_INT-1))
}
if(show_mess) message("\n",appendLF = FALSE)
}
# REMOVE SEGMENT
seg_DEL = 0
if( seg_ADD == 0 && length(curr_seg) > 3 ){
if(show_mess) message("Try DEL: ",appendLF = FALSE)
list_DEL = find_new_DEL(DATA,curr_seg,hist_LL)
hist_LL = list_DEL$HIST
curr_seg = list_DEL$SEG
seg_DEL = list_DEL$DEL
if(seg_DEL == 1){
if(show_mess) message("Removed segment",appendLF = FALSE)
# global_SEG_INT = max(c(1,global_SEG_INT-2))
global_SEG_INT = max(c(1,min(c(global_SEG_INT-2,list_DEL$SEG_INT-2))))
# global_SEG_INT = 1
}
if(show_mess) message("\n",appendLF = FALSE)
}
# ADJUST SEGMENTS
seg_ADJ = 0
if( seg_ADD == 0 && seg_DEL == 0 && length(curr_seg) > 3 ){
if(show_mess) message("Try ADJ: ",appendLF = FALSE)
vec_INTS = seq(global_SEG_INT - 1,global_SEG_INT + 1)
vec_INTS = vec_INTS[which(vec_INTS > 1 & vec_INTS < length(curr_seg))]
list_ADJ = find_new_ADJ(DATA,curr_seg,hist_LL,vec_INTS,show=show_mess)
hist_LL = list_ADJ$HIST
curr_seg = list_ADJ$SEG
seg_ADJ = list_ADJ$seg_ADJ
if(seg_ADJ == 1){
global_SEG_INT = max(c(1,global_SEG_INT-2))
}
if(show_mess) message("\n",appendLF = FALSE)
}
# Move to next interval if no updates
if( seg_ADD == 0 && seg_DEL == 0 && seg_ADJ == 0 ){
global_SEG_INT = global_SEG_INT + 1
}
iter = iter + 1
}
message("100%\n",appendLF = FALSE)
out_INFER = infer_SEG(DATA = DATA,SEG = curr_seg,show_plot = show_plot)
out_BIC = get_new_seg_BIC(DATA,curr_seg,hist_LL)$BIC
list(INFER=out_INFER,BIC=out_BIC,SEG=curr_seg)
}
segment_nVAF = function(DATA,chr,binom = TRUE,
eps_thres = 0.5,psi_thres = 0.01,show_plot = FALSE,
show_mess = FALSE){
if(FALSE){
DATA = vcs; chr = chr; binom = binom
show_plot = FALSE; show_mess = FALSE
}
input_data = DATA[which( DATA$Chr %in% paste0("chr",chr)
& DATA$nLabel %in% c("noise","G=1") ),
c("Chr","Position","GeneName","TARGET","x_coor","mutID",
"nAD","nRD","nDP","nVAF"),drop = FALSE]
names(input_data) = c("Chr","Position","GeneName","TARGET",
"x_coor","mutID","AD","RD","DP","VAF")
input_data = input_data[order(input_data$Position),]
# input_data = input_data[which(input_data$Position >= 150e6),]
tmp_df = smart_df(Position = sort(unique(input_data$Position)))
tmp_df$index = seq(nrow(tmp_df))
input_data = smart_merge(input_data,tmp_df); rm(tmp_df)
input_data$pt_cex = 0.5*log10(input_data$DP)
message(paste0("\tchr",chr,": "),appendLF = FALSE)
out_AF_SEG = run_AF_SEG(
DATA = input_data,
num_divides = 5e1,
binom = binom,
clust_tVAF = !TRUE,
show_plot = show_plot,
show_mess = show_mess)
infer_out = out_AF_SEG$INFER
infer_out$start_x_coor = NA; infer_out$end_x_coor = NA
infer_out$Chr = paste0("chr",chr)
infer_out$Start_Position = NA; infer_out$End_Position = NA
infer_out$Genes = NA; input_data$pt_col = NA
infer_out$nVAF_mean = NA; infer_out$nVAF_sd = NA; infer_out$N = NA
for(ii in seq(nrow(infer_out))){
# ii = 8
tmp_index1 = which(input_data$index >= infer_out$start[ii]
& input_data$index <= infer_out$end[ii])
infer_out[ii,c("start_x_coor","end_x_coor")] = range(input_data$x_coor[tmp_index1])
infer_out[ii,c("Start_Position","End_Position")] = range(input_data$Position[tmp_index1])
tmp_index2 = which(input_data$Position >= infer_out$Start_Position[ii]
& input_data$Position <= infer_out$End_Position[ii])
tmp_index3 = which(input_data$Position >= infer_out$Start_Position[ii]
& input_data$Position <= infer_out$End_Position[ii]
& !is.na(input_data$GeneName)
& input_data$TARGET == "YES")
tmp_tab = smart_table(unique(input_data[tmp_index3,c("GeneName","mutID")])[,1])
if(length(tmp_tab) > 0){
infer_out$Genes[ii] = ifelse(infer_out$eps[ii] >= eps_thres,
paste(paste0(names(tmp_tab),"(",tmp_tab,")"),collapse=";"),
NA)
}
input_data$pt_col[tmp_index2] = ifelse(infer_out$eps[ii] >= eps_thres | infer_out$PSI[ii] >= psi_thres,
rgb(1,0,0,0.5),rgb(0,0,0,0.5))
infer_out[ii,c("nVAF_mean","nVAF_sd","N")] = c(mean(input_data$VAF[tmp_index1]),
sd(input_data$VAF[tmp_index1]),length(tmp_index1))
}
# infer_out
# Output annotated input_data
list(annoDATA = input_data,INFER = infer_out)
}
segment_tVAF = function(DATA,chr,binom=TRUE,show_plot=FALSE,show_mess=FALSE,BAF_thres){
input_dat_vars = c("Chr","Position","x_coor","mutID","Qscore",
"tAD","tRD","tDP","tVAF","GeneName","TARGET")
input_data = DATA[which(DATA$Chr == paste0("chr",chr)
& DATA$tVAF > BAF_thres & DATA$tVAF < 1 - BAF_thres
& DATA$Qscore >= 10
),input_dat_vars]
input_data = name_change(input_data,"tAD","AD")
input_data = name_change(input_data,"tRD","RD")
input_data = name_change(input_data,"tDP","DP")
input_data = name_change(input_data,"tVAF","VAF")
input_data$LBC = lchoose(input_data$DP,input_data$AD)
input_data = input_data[order(input_data$Position),]
input_data = smart_uniq_df(input_df = input_data,vars=c("Chr","Position","mutID"))
input_data = input_data[order(input_data$Position),]
input_data$pt_cex = 0.5*log10(input_data$DP)
input_data$index = seq(nrow(input_data))
# Run segmentation!!!
message(paste0("\tchr",chr,": "),appendLF = FALSE)
out_AF_SEG = run_AF_SEG(
DATA = input_data,
num_divides = 5e1,
binom = binom,
clust_tVAF = TRUE,
show_plot = show_plot,
show_mess = show_mess)
# After segmentation, infer
infer_out = out_AF_SEG$INFER
infer_out$start_x_coor = NA; infer_out$end_x_coor = NA
infer_out$Chr = paste0("chr",chr)
infer_out$Start_Position = NA; infer_out$End_Position = NA
infer_out$iBAF = NA; infer_out$GeneName = ""
eps_thres = 1
for(ii in seq(nrow(infer_out))){
# ii = 1
tmp_index = which(input_data$index >= infer_out$start[ii]
& input_data$index <= infer_out$end[ii])
infer_out[ii,c("start_x_coor","end_x_coor")] = range(input_data$x_coor[tmp_index])
infer_out[ii,c("Start_Position","End_Position")] = range(input_data$Position[tmp_index])
tmp_index3 = which(input_data$Position >= infer_out$Start_Position[ii]
& input_data$Position <= infer_out$End_Position[ii]
& !is.na(input_data$GeneName)
& input_data$TARGET == "YES")
tmp_tab = smart_table(unique(input_data[tmp_index3,c("GeneName","mutID")])[,1])
if( infer_out$eps[ii] == eps_thres ){
if( length(tmp_tab) > 0 ) infer_out$GeneName[ii] = paste(paste0(names(tmp_tab),"(",tmp_tab,")"),collapse=";")
} else if( infer_out$eps[ii] < eps_thres ){
if( infer_out$prob_half[ii] >= 0.5 ){
infer_out$iBAF[ii] = 0.5
} else {
infer_out$iBAF[ii] = min(c(infer_out$mBAF[ii],1-infer_out$mBAF[ii]))
}
}
}
list(annoDATA=input_data,AF_SEG=out_AF_SEG,INFER=infer_out)
}
UNMASC_nSEG = function(outdir,vcs,tumorID,hg,genome,binom=TRUE,eps_thres=0.5,psi_thres=0.01){
if(FALSE){
outdir = outdir; vcs = vcf[which(vcf$Chr == "chr1"),]
tumorID = tumorID; hg = hg; genome = genome
binom = binom
}
message("normal VAF segmentation ...\n",appendLF = FALSE)
anno_all_vcs_nVAF = c()
all_nVAF_segs = c()
nSEG_dir = file.path(outdir,"nSEG")
smart_mkdir(nSEG_dir)
for(chr in 1:22){
# chr = 1
if( length(which(vcs$Chr == paste0("chr",chr))) == 0 ) next
tmp_list = segment_nVAF(DATA = vcs,chr = chr,binom = binom,
eps_thres = eps_thres,psi_thres = psi_thres,show_plot = FALSE,
show_mess = FALSE)
png(file.path(nSEG_dir,paste0("nSEG_chr",chr,".png")),units='px',
height=1800,width=3000,res=250,type='cairo')
par(mfrow=c(2,1),mar=c(5,4,1,6),xpd=TRUE)
genome_plot(tmp_list$annoDATA,var="VAF",hg=hg,chr=chr,pch=16,
cex=tmp_list$annoDATA$pt_cex,how_color="pt_col",genome=genome)
legend("bottomright",inset = c(-0.1,0),legend = c(10^seq(3)),
pt.cex = 0.5*log10(10^seq(3)),pch=rep(16,3),title="DP")
plot(tmp_list$annoDATA[,c("index","VAF")],pch = 16,ylab = "nVAF",
cex = tmp_list$annoDATA$pt_cex,col = tmp_list$annoDATA$pt_col,bty = "n",
xaxs = "i",ylim = c(0,1))
legend("bottomright",inset = c(-0.1,0),legend = c("H2M","nonH2M"),
pt.cex = 1.5,pch = 15,col = c("red","black"))
par(mfrow=c(1,1),mar=c(5,4,4,2)+0.1,xpd=FALSE)
dev.off()
anno_all_vcs_nVAF = rbind(anno_all_vcs_nVAF,tmp_list$annoDATA)
all_nVAF_segs = rbind(all_nVAF_segs,tmp_list$INFER)
rm(tmp_list)
}
saveRDS(list(anno_all_vcs_nVAF=anno_all_vcs_nVAF,all_nVAF_segs=all_nVAF_segs),
file.path(outdir,"anno_nSEG.rds"))
png(file.path(nSEG_dir,"nSEG.png"),units='px',height=1500,width=3000,res=250,type='cairo')
par(mar=c(5,4,1,6),xpd=TRUE)
genome_plot(anno_all_vcs_nVAF,var="VAF",cex=anno_all_vcs_nVAF$pt_cex,pch=16,
hg=hg,how_color="pt_col",genome=genome,main=tumorID)
legend("bottomright",inset = c(-0.1,0),legend = c(10^seq(3)),
pt.cex = 0.5*log10(10^seq(3)),pch=rep(16,3),title="DP")
par(mar=c(5,4,4,2)+0.1,xpd=FALSE)
dev.off()
NULL
}
UNMASC_tSEG = function(outdir,tumorID,vcs,strand,gender,cut_BAF,hg,genome,
rd_thres,ad_thres,qscore_thres,binom=TRUE){
if(FALSE){
outdir = outdir; tumorID = tumorID
vcs = vcf; strand = strand; gender = gender
cut_BAF = cut_BAF; hg = hg; genome = genome
rd_thres = rd_thres; ad_thres = ad_thres
qscore_thres = qscore_thres; binom = binom
}
message("Subset strand to variants with depth and qscore thresholds and no strand bias ...\n",appendLF = FALSE)
vcs = vcs[which(vcs$tDP >= rd_thres & vcs$tAD >= ad_thres
& vcs$Qscore >= qscore_thres),]
mutIDs = unique(vcs$mutID)
strand = strand[which(strand$mutID %in% mutIDs),]
strand = strand[which(is.na(strand$Strand_bias_P)
| strand$Strand_bias_P > 0.05),]
message("Determine OXOG,FFPE,ARTI status ...\n",appendLF = FALSE)
num_uniq_vars = nrow(strand)
message(paste0("Number of unique variants = ",num_uniq_vars,"\n"),appendLF = FALSE)
strand$OXOG = NA
strand$FFPE = NA
strand$ARTI = NA
for(ii in seq(num_uniq_vars)){
# ii = 1
if(ii %% 5e2 == 0) message(".",appendLF = FALSE)
if(ii %% 4e4 == 0 || ii == num_uniq_vars ) message(paste0(ii,"\n"),appendLF = FALSE)
mutid_index = which(vcs$mutID == strand$mutID[ii])
tmp_tab = smart_table(vcs$OXOG[mutid_index])
tmp_value = paste0(names(tmp_tab),"(",as.numeric(tmp_tab),",",
round(as.numeric(tmp_tab/sum(tmp_tab))*100,2),"%)")
strand$OXOG[ii] = paste(tmp_value,collapse=";")
tmp_tab = smart_table(vcs$FFPE[mutid_index])
tmp_value = paste0(names(tmp_tab),"(",as.numeric(tmp_tab),",",
round(as.numeric(tmp_tab/sum(tmp_tab))*100,2),"%)")
strand$FFPE[ii] = paste(tmp_value,collapse=";")
tmp_tab = smart_table(vcs$ARTI[mutid_index])
tmp_value = paste0(names(tmp_tab),"(",as.numeric(tmp_tab),",",
round(as.numeric(tmp_tab/sum(tmp_tab))*100,2),"%)")
strand$ARTI[ii] = paste(tmp_value,collapse=";")
}
strand$OXOG_artifact = !sapply(strand$OXOG,artifact_filter,USE.NAMES=FALSE)
strand$FFPE_artifact = !sapply(strand$FFPE,artifact_filter,USE.NAMES=FALSE)
strand$ARTI_artifact = !sapply(strand$ARTI,artifact_filter,USE.NAMES=FALSE)
# Segment non-OXOG, non-FFPE, non-strand bias, non-ARTI variants
strand = strand[which(strand$OXOG_artifact == FALSE
& strand$FFPE_artifact == FALSE
& strand$ARTI_artifact == FALSE
),]
vcs = vcs[which(vcs$mutID %in% strand$mutID),]
message("tumor VAF segmentation on variants...\n",appendLF = FALSE)
anno_all_vcs_tVAF = c()
all_tVAF_segs = c()
tumor_chrs = 1:22
if( !is.na(gender) && gender == "FEMALE" ){
tumor_chrs = c(1:22,"X")
}
tSEG_dir = file.path(outdir,"tSEG")
smart_mkdir(tSEG_dir)
for(chr in tumor_chrs){
if( length(which(vcs$Chr == paste0("chr",chr))) == 0 ) next
if(FALSE){
vv = vcs[which(vcs$tVAF > cut_BAF & vcs$tVAF < 1-cut_BAF
# & vcs$nANNO == "mappable"
),]
dim(vv)
smart_table(vv$RefAlt,vv$tVAF < 0.2)
vv$RefAlt2 = vv$RefAlt
vv$RefAlt2[vv$RefAlt %in% c("A>C","T>G")] = "A>C"
vv$RefAlt2[vv$RefAlt %in% c("A>G","T>C")] = "A>G"
vv$RefAlt2[vv$RefAlt %in% c("A>T","T>A")] = "A>T"
vv$RefAlt2[vv$RefAlt %in% c("C>A","G>T")] = "C>A"
vv$RefAlt2[vv$RefAlt %in% c("C>G","G>C")] = "C>G"
smart_table(vv$RefAlt2,vv$tVAF < 0.2)
plot(vv[,c("x_coor","tVAF")],cex = 0.5*log10(vv$tDP),
col = factor(vv$RefAlt2),pch = 16,ylim = c(0,1))
vv[which(vv$mutID == 32),]
plot(vv[,c("x_coor","tVAF")],cex = 0.5*log10(vv$tDP),pch = 16,
col = factor(vv$ThsdG_AF > 0 | vv$EXAC_AF > 0),ylim = c(0,1))
}
# Run segmentation on all non-candidate tumor-only variant calls
tmp_list = segment_tVAF(DATA = vcs,
chr = chr,binom = binom,show_plot = FALSE,show_mess = FALSE,
BAF_thres = cut_BAF)
png(file.path(tSEG_dir,paste0("tSEG_chr",chr,".png")),units='px',
height=1800,width=3000,res=250,type='cairo')
par(mfrow=c(2,1),mar=c(5,4,1,6),xpd=TRUE)
# 1)
genome_plot(tmp_list$annoDATA,var="VAF",hg=hg,chr=chr,pch=16,
cex = tmp_list$annoDATA$pt_cex,how_color = rgb(0,0,0,0.5),
genome = genome)
legend("bottomright",inset = c(-0.1,0),legend = c(10^seq(3)),
pt.cex = 0.5*log10(10^seq(3)),pch=rep(16,3),title="DP")
apply(tmp_list$INFER[which(!is.na(tmp_list$INFER$iBAF)),c("iBAF","start_x_coor","end_x_coor")],1,
function(x) segments(x0=x[2],y0=c(x[1],1-x[1]),x1=x[3],lwd=3,col="magenta"))
# 2)
plot(tmp_list$annoDATA$VAF,pch = 16,ylab = "tVAF",cex = tmp_list$annoDATA$pt_cex,
col = rgb(0,0,0,0.5),ylim = c(0,1))
apply(tmp_list$INFER[which(!is.na(tmp_list$INFER$iBAF)),c("iBAF","start","end")],1,
function(x) segments(x0=x[2],y0=c(x[1],1-x[1]),x1=x[3],lwd=3,col="magenta"))
par(mfrow=c(1,1),mar=c(5,4,4,2)+0.1,xpd=FALSE)
dev.off()
anno_all_vcs_tVAF = rbind(anno_all_vcs_tVAF,tmp_list$annoDATA)
all_tVAF_segs = rbind(all_tVAF_segs,tmp_list$INFER)
rm(tmp_list)
}
saveRDS(list(anno_all_vcs_tVAF=anno_all_vcs_tVAF,all_tVAF_segs=all_tVAF_segs),
file.path(outdir,"anno_tSEG.rds"))
png(file.path(tSEG_dir,"tSEG.png"),units='px',height=1500,width=3000,res=250,type='cairo')
par(mar=c(5,4,1,6),xpd=TRUE)
genome_plot(anno_all_vcs_tVAF,var = "VAF",cex = anno_all_vcs_tVAF$pt_cex,
pch = 16,hg = hg,how_color = rgb(0,0,0,0.5),genome = genome,
main = tumorID)
legend("bottomright",inset = c(-0.1,0),legend = c(10^seq(3)),
pt.cex = 0.5*log10(10^seq(3)),pch=rep(16,3),title="DP")
apply(all_tVAF_segs[which(!is.na(all_tVAF_segs$iBAF)),c("iBAF","start_x_coor","end_x_coor")],1,
function(x) segments(x0=x[2],y0=c(x[1],1-x[1]),x1=x[3],lwd=3,col="magenta"))
par(mar=c(5,4,4,2)+0.1,xpd=FALSE)
dev.off()
NULL
}
anno_nSEG = function(DAT,nSEG,eps_thres=0.5,psi_thres=0.01){
DAT$seg = NA
for(ii in seq(nrow(nSEG))){
# ii = 1
idx = which(DAT$Chr == nSEG$Chr[ii]
& DAT$Position >= nSEG$Start_Position[ii]
& DAT$Position <= nSEG$End_Position[ii])
if( length(idx) > 0 ){
DAT$seg[idx] = nSEG$seg[ii]
}
rm(idx)
}
dim(DAT)
DAT = smart_merge(smart_rmcols(DAT,c("N_eps","N_PSI")),
nSEG[,c("seg","Chr","eps","PSI")],all.x = TRUE)
DAT = name_change(DAT,"eps","N_eps")
DAT = name_change(DAT,"PSI","N_PSI")
DAT = smart_rmcols(DAT,"seg")
DAT$nANNO = ifelse(DAT$N_eps >= eps_thres
| DAT$N_PSI >= psi_thres,"H2M","mappable")
DAT
}
###
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.