#' Analyzing function.
#'
#' This function analyzes the filtering results.
#'
#' @param msec Mutation filtering information.
#' @param mut_depth Mutation coverage data.
#' @param short_homology_search_length Small sequence for homology search.
#' @param min_homology_search The sequence length for homology
#' search.
#' @param threshold_p The largest p value of significant errors.
#' @param threshold_hairpin_ratio The smallest hairpin read ratio.
#' @param threshold_short_length Reads shorter than that are analyzed.
#' @param threshold_distant_homology The smallest rate of reads from other
#' regions.
#' @param threshold_soft_clip_ratio The rate of soft-clipped reads.
#' @param threshold_low_quality_rate The smallest rate of low quality bases.
#' @param homopolymer_length The smallest length of homopolymers.
#' @return msec
#' @importFrom dplyr %>%
#' @importFrom dplyr mutate
#' @importFrom dplyr case_when
#' @importFrom dplyr select
#' @importFrom BiocGenerics mapply
#' @examples
#' data(msec_summarized)
#' data(mut_depth_checked)
#' fun_analysis(msec = msec_summarized,
#' mut_depth = mut_depth_checked,
#' short_homology_search_length = 4,
#' min_homology_search = 40,
#' threshold_p = 10 ^ (-6),
#' threshold_hairpin_ratio = 0.50,
#' threshold_short_length = 0.75,
#' threshold_distant_homology = 0.15,
#' threshold_soft_clip_ratio = 0.50,
#' threshold_low_quality_rate = 0.1,
#' homopolymer_length = 15
#' )
#' @export
fun_analysis <- function(msec,
mut_depth,
short_homology_search_length,
min_homology_search ,
threshold_p,
threshold_hairpin_ratio,
threshold_short_length,
threshold_distant_homology,
threshold_soft_clip_ratio,
threshold_low_quality_rate,
homopolymer_length) {
if (dim(msec)[1] > 0) {
short_support_length_total <- NULL
half_length_total <- NULL
pre_support_length_total <- NULL
total_length_total <- NULL
post_support_length_total <- NULL
low_quality_base_rate_under_q18 <- NULL
pre_rep_status <- NULL
post_rep_status <- NULL
alt_length <- NULL
short_support_length_adj_sum <- NULL
half_length_adj_sum <- NULL
pre_support_length_adj_sum <- NULL
total_length_pre_adj_sum <- NULL
total_length_post_adj_sum <- NULL
post_support_length_adj_sum <- NULL
total_read <- NULL
prob_filter_1 <- NULL
prob_filter_3_pre <- NULL
prob_filter_3_post <- NULL
short_short_support <- NULL
short_short_support_sum <- NULL
flag_hairpin <- NULL
high_rate_q18 <- NULL
short_pre_support <- NULL
short_pre_support_sum <- NULL
short_post_support <- NULL
short_post_support_sum <- NULL
soft_clipped_read <- NULL
pre_farthest <- NULL
post_farthest <- NULL
distant_homology_rate <- NULL
not_long_repeat <- NULL
SimpleRepeat_TRF <- NULL
homopolymer_status <- NULL
soft_clipped_rate <- NULL
caution <- NULL
filter_1_mutation_intra_hairpin_loop <- NULL
filter_2_hairpin_structure <- NULL
filter_3_microhomology_induced_mutation <- NULL
filter_4_highly_homologous_region <- NULL
filter_5_soft_clipped_reads <- NULL
filter_6_simple_repeat <- NULL
filter_7_mutation_at_homopolymer <- NULL
filter_8_low_quality <- NULL
mut_type <- NULL
hairpin_length <- NULL
pre_minimum_length <- NULL
post_minimum_length<- NULL
indel_status <- NULL
indel_length <- NULL
penalty_pre <- NULL
penalty_post <- NULL
pre_minimum_length_adj <- NULL
half_length <- NULL
post_minimum_length_adj <- NULL
pre_support_length_adj <- NULL
post_support_length_adj <- NULL
shortest_support_length_adj <- NULL
minimum_length_1 <- NULL
minimum_length_2 <- NULL
minimum_length <- NULL
short_support_length_adj <- NULL
altered_length <- NULL
altered_length2 <- NULL
altered_length3 <- NULL
distant_homology <- NULL
low_quality_pre <- NULL
low_quality_post <- NULL
mut_depth_pre <- mut_depth[[1]]
mut_depth_post <- mut_depth[[2]]
mut_depth_short <- mut_depth[[3]]
msec <- msec %>% mutate(
short_support_length_adj = case_when(
is.na(short_support_length_adj) ~ 99,
short_support_length_adj > 99 ~ 99,
short_support_length_adj < 0 ~ 0,
TRUE ~ short_support_length_adj
),
shortest_support_length_adj = case_when(
is.na(shortest_support_length_adj) ~ 0,
shortest_support_length_adj > 99 ~ 99,
shortest_support_length_adj < 0 ~ 0,
TRUE ~ shortest_support_length_adj
),
pre_support_length_adj = case_when(
is.na(pre_support_length_adj) ~ 199,
pre_support_length_adj > 199 ~ 199,
pre_support_length_adj < 0 ~ 0,
TRUE ~ pre_support_length_adj
),
pre_minimum_length_adj = case_when(
is.na(pre_minimum_length_adj) ~ 0,
pre_minimum_length_adj > 199 ~ 199,
pre_minimum_length_adj < 0 ~ 0,
TRUE ~ pre_minimum_length_adj
),
post_support_length_adj = case_when(
is.na(post_support_length_adj) ~ 199,
post_support_length_adj > 199 ~ 199,
post_support_length_adj < 0 ~ 0,
TRUE ~ post_support_length_adj
),
post_minimum_length_adj = case_when(
is.na(post_minimum_length_adj) ~ 0,
post_minimum_length_adj > 199 ~ 199,
post_minimum_length_adj < 0 ~ 0,
TRUE ~ post_minimum_length_adj
),
half_length = case_when(
is.na(half_length) ~ 99,
half_length > 99 ~ 99,
half_length < 0 ~ 0,
TRUE ~ half_length
),
minimum_length = case_when(
is.na(minimum_length) ~ 0,
minimum_length > 33 ~ 33,
minimum_length < 0 ~ 0,
TRUE ~ minimum_length
),
minimum_length_1 = case_when(
is.na(minimum_length_1) ~ 0,
minimum_length_1 > 33 ~ 33,
minimum_length_1 < 0 ~ 0,
TRUE ~ minimum_length_1
),
minimum_length_2 = case_when(
is.na(minimum_length_2) ~ 0,
minimum_length_2 > 33 ~ 33,
minimum_length_2 < 0 ~ 0,
TRUE ~ minimum_length_2
),
altered_length = case_when(
is.na(altered_length) ~ 0,
altered_length > 33 ~ 33,
altered_length < 0 ~ 0,
TRUE ~ altered_length
),
altered_length2 = case_when(
is.na(altered_length2) ~ 0,
altered_length2 > 33 ~ 33,
altered_length2 < 0 ~ 0,
TRUE ~ altered_length2
),
penalty_pre = case_when(
is.na(penalty_pre) ~ 0,
TRUE ~ penalty_pre
),
penalty_post = case_when(
is.na(penalty_post) ~ 0,
TRUE ~ penalty_post
)
)
msec <- msec %>% mutate(
penalty_pre = case_when(
penalty_pre > read_length - 1 - altered_length + altered_length2 - minimum_length_2 ~
read_length - 1 - altered_length + altered_length2 - minimum_length_2,
penalty_pre < read_length - altered_length + altered_length2 - 198 ~
read_length - altered_length + altered_length2 - 198,
TRUE ~ penalty_pre
),
penalty_post = case_when(
penalty_post > read_length - altered_length - minimum_length_1 ~
read_length - altered_length - minimum_length_1,
penalty_post < read_length - altered_length - 197 ~
read_length - altered_length - 197,
TRUE ~ penalty_post
)
)
msec <- msec %>% mutate(
short_short_support =
(short_support_length_total <=
threshold_short_length * half_length_total),
short_pre_support =
(pre_support_length_total <=
threshold_short_length * total_length_total),
short_post_support =
(post_support_length_total <=
threshold_short_length * total_length_total),
high_rate_q18 =
ifelse((low_quality_base_rate_under_q18 < threshold_low_quality_rate),
TRUE, FALSE),
not_long_repeat =
ifelse((short_homology_search_length +
pre_rep_status +
post_rep_status +
alt_length < min_homology_search),
TRUE, FALSE)
)
msec$short_support_length_adj_sum <-
mapply(
function(x, y) {
return(mut_depth_short[x, y])
},
1:dim(msec)[1],
msec$short_support_length_adj + 2
) -
mapply(
function(x, y) {
return(mut_depth_short[x, y])
},
1:dim(msec)[1],
msec$shortest_support_length_adj + 1
)
msec$pre_support_length_adj_sum <-
mapply(
function(x, y) {
return(mut_depth_pre[x, y])
},
1:dim(msec)[1],
msec$pre_support_length_adj + 2
) -
mapply(
function(x, y) {
return(mut_depth_pre[x, y])
},
1:dim(msec)[1],
msec$pre_minimum_length_adj + 1
)
msec$post_support_length_adj_sum <-
mapply(
function(x, y) {
return(mut_depth_post[x, y])
},
1:dim(msec)[1],
msec$post_support_length_adj + 2
) -
mapply(
function(x, y) {
return(mut_depth_post[x, y])
},
1:dim(msec)[1],
msec$post_minimum_length_adj + 1
)
msec$half_length_adj_sum <-
mapply(
function(x, y) {
return(mut_depth_short[x, y])
},
1:dim(msec)[1],
msec$half_length + 2
) -
mapply(
function(x, y) {
return(mut_depth_short[x, y])
},
1:dim(msec)[1],
msec$minimum_length + 1
)
msec$total_length_pre_adj_sum <-
mapply(
function(x, y) {
return(mut_depth_pre[x, y])
},
1:dim(msec)[1],
msec$read_length + 2 -
msec$altered_length -
msec$penalty_post
) -
mapply(
function(x, y) {
return(mut_depth_pre[x, y])
},
1:dim(msec)[1],
msec$minimum_length_1 + 1
)
msec$total_length_post_adj_sum <-
mapply(
function(x, y) {
return(mut_depth_post[x, y])
},
1:dim(msec)[1],
msec$read_length + 1 -
msec$altered_length +
msec$altered_length2 -
msec$penalty_pre
) -
mapply(
function(x, y) {
return(mut_depth_post[x, y])
},
1:dim(msec)[1],
msec$minimum_length_2 + 1
)
msec <- msec %>% mutate(
short_short_support_sum =
(short_support_length_adj_sum <=
half_length_adj_sum * threshold_short_length),
short_pre_support_sum =
(pre_support_length_adj_sum <=
total_length_pre_adj_sum * threshold_short_length),
short_post_support_sum =
(post_support_length_adj_sum <=
total_length_post_adj_sum * threshold_short_length)
)
msec <- msec %>% mutate(
prob_filter_1 =
fun_zero(short_support_length_adj_sum,
half_length_adj_sum) ^ total_read,
prob_filter_3_pre =
fun_zero(pre_support_length_adj_sum,
total_length_pre_adj_sum) ^ total_read,
prob_filter_3_post =
fun_zero(post_support_length_adj_sum,
total_length_post_adj_sum) ^ total_read
)
msec <- msec %>% mutate(
prob_filter_1 = ifelse((prob_filter_1 > 1),
1, ifelse((prob_filter_1 < 0),
0, prob_filter_1)),
prob_filter_3_pre = ifelse((prob_filter_3_pre > 1),
1, ifelse((prob_filter_3_pre < 0),
0, prob_filter_3_pre)),
prob_filter_3_post = ifelse((prob_filter_3_post > 1),
1, ifelse((prob_filter_3_post < 0),
0, prob_filter_3_post))
)
msec <- msec %>% mutate(
filter_1_mutation_intra_hairpin_loop =
ifelse((short_short_support &
short_short_support_sum &
prob_filter_1 < threshold_p),
TRUE, FALSE),
filter_2_hairpin_structure =
ifelse((fun_zero(flag_hairpin, total_read) > threshold_hairpin_ratio),
TRUE, FALSE),
filter_3_microhomology_induced_mutation =
ifelse((high_rate_q18 &
((prob_filter_3_pre <= threshold_p &
short_pre_support &
short_pre_support_sum) |
(prob_filter_3_post <= threshold_p &
short_post_support &
short_post_support_sum))),
TRUE, FALSE),
filter_4_highly_homologous_region =
ifelse((distant_homology_rate >= threshold_distant_homology &
#(pre_farthest == pre_support_length | post_farthest == post_support_length) &
not_long_repeat),
TRUE, FALSE),
filter_5_soft_clipped_reads =
ifelse((soft_clipped_rate >= threshold_soft_clip_ratio),
TRUE, FALSE),
filter_6_simple_repeat =
ifelse((SimpleRepeat_TRF == "Y"),
TRUE, FALSE),
filter_7_mutation_at_homopolymer =
ifelse((homopolymer_status >= homopolymer_length),
TRUE, FALSE),
filter_8_low_quality =
ifelse((low_quality_base_rate_under_q18 >= threshold_low_quality_rate |
low_quality_pre >= threshold_low_quality_rate |
low_quality_post >= threshold_low_quality_rate),
TRUE, FALSE)
)
msec <- msec %>% mutate(
caution =
ifelse((distant_homology_rate >= threshold_distant_homology &
!not_long_repeat),
paste(caution,
"too repetitive to analyze homology,"),
caution)
)
msec <- msec %>% mutate(
caution =
ifelse((prob_filter_1 < threshold_p &
(!short_short_support |
!short_short_support_sum)),
paste(caution,
"filter 1: p is small, but supported enough long,"),
caution)
)
msec <- msec %>% mutate(
caution =
ifelse(((prob_filter_3_pre < threshold_p |
prob_filter_3_post < threshold_p) &
!filter_3_microhomology_induced_mutation &
high_rate_q18),
paste(caution,
"filter 3: p is small, but supported enough long,"),
caution)
)
msec <- msec %>% mutate(
caution =
ifelse(((prob_filter_3_pre < threshold_p |
prob_filter_3_post < threshold_p) &
!filter_3_microhomology_induced_mutation &
!high_rate_q18),
paste(caution,
"filter 3: p is small, but reads are low quality,"),
caution)
)
msec <- msec %>% mutate(
caution =
ifelse((distant_homology_rate >= threshold_distant_homology &
!filter_4_highly_homologous_region &
!not_long_repeat),
paste(caution,
"filter 4: sequence is too repetitive,"),
caution)
)
msec <- msec %>% mutate(
msec_filter_123 = ifelse(
filter_1_mutation_intra_hairpin_loop |
filter_2_hairpin_structure |
filter_3_microhomology_induced_mutation,
"Artifact suspicious", ""),
msec_filter_1234 = ifelse(
filter_1_mutation_intra_hairpin_loop |
filter_2_hairpin_structure |
filter_3_microhomology_induced_mutation |
filter_4_highly_homologous_region,
"Artifact suspicious", ""),
msec_filter_all = ifelse(
filter_1_mutation_intra_hairpin_loop |
filter_2_hairpin_structure |
filter_3_microhomology_induced_mutation |
filter_4_highly_homologous_region |
filter_5_soft_clipped_reads |
filter_6_simple_repeat |
filter_7_mutation_at_homopolymer |
filter_8_low_quality,
"Artifact suspicious", ""),
comment = caution
)
msec <- msec %>% select(-caution)
}
msec <- msec %>% select(-mut_type, -alt_length, -hairpin_length,
-pre_minimum_length, -post_minimum_length,
-pre_rep_status, -post_rep_status,
-homopolymer_status, -indel_status, -indel_length,
-penalty_pre,
-penalty_post, -caution, -pre_minimum_length_adj,
-half_length,
-post_minimum_length_adj, -pre_support_length_adj,
-post_support_length_adj,
-shortest_support_length_adj,
-minimum_length_1, -minimum_length_2, -minimum_length,
-short_support_length_adj, -altered_length,
-altered_length2, -altered_length3,
-distant_homology,
-short_support_length_total,
-pre_support_length_total,
-post_support_length_total, -half_length_total,
-total_length_total, -high_rate_q18,
-short_short_support, -short_pre_support,
-short_post_support,
-not_long_repeat,
-short_short_support_sum, -short_pre_support_sum,
-short_post_support_sum,
-short_support_length_adj_sum,
-pre_support_length_adj_sum,
-post_support_length_adj_sum,
-half_length_adj_sum, -total_length_pre_adj_sum,
-total_length_post_adj_sum
)
return(msec)
}
# The following block is used by usethis to automatically manage
# roxygen namespace tags. Modify with care!
## usethis namespace: start
## usethis namespace: end
NULL
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.