#' Separately align multiple pattern sequences to one subject
#'
#' This function is useful to visualise the alignment position of multiple patterns on one subject.
#' It uses Biostrings::pairwiseAlignment(), obtains the individual alignment boundaries and converts
#' the results to a ggplot object.
#' The function will fail if a gap is induced in the subject and at least two pattern alignments overlap at this gap.
#' Method = local-global avoids indels in subject. Local cuts subject and pattern to the best matching sequence of both.
#'
#' @param subject a named character or named DNAStringSet of one subject (only the DNAStringSet but not DNAString can hold a name)
#' @param patterns a named character vector or named DNAStringSet of patterns to align to the subject sequence
#' @param type the type of alignment passed to Biostrings::pairwiseAlignment; not every type may work well with this function (if there are overlapping ranges of the alignments to the subject for example)
#' @param order_patterns order pattern increasingly by alignment position (start)
#' @param max_mismatch only use patterns that have a maximum number of mismatches
#' with the subject
#' @param fix_subject_indels in case of overlapping indels and shared subject ranges, cut respective patterns to avoid indels
#' @param seq_type set sequence type to AA or NT if necessary; if NULL
#' it is attempted to guess the type
#' @param return_max_mismatch_info_only only return information on mismatches of patterns with the subject;
#' in this case no alignment is calculated
#' @param rm_indel_inducing_pattern
#' @param matches_to_subject_and_pattern
#' @param compare_seq_df_long_args
#' @param pairwiseAlignment_args
#' @param algnmt_plot_args
#' @param order_subject_ranges
#'
#' @return a list
#' @export
#'
#' @importFrom magrittr %>%
#'
#' @examples
#' \dontrun{
#' s <- stats::setNames("AAAACCCCTTTTGGGGAACCTTCC", "sub")
#' s <- Biostrings::DNAStringSet(s)
#' p <- stats::setNames(c("TTCC", "CCCC", "TTTT", "GGGG", "AAAA"), c("pat1", "pat2", "pat3", "pat4", "pat5"))
#' p <- Biostrings::DNAStringSet(p)
#' als <- igsc::MultiplePairwiseAlignmentsToOneSubject(subject = s, patterns = p)
#' als_ordered <- igsc::MultiplePairwiseAlignmentsToOneSubject(subject = s, patterns = p, order_patterns = T)
#' }
MultiplePairwiseAlignmentsToOneSubject <- function(subject,
patterns,
type = c("global-local", "global", "local", "overlap", "local-global"),
max_mismatch = NA,
order_patterns = F,
fix_subject_indels = F,
rm_indel_inducing_pattern = F,
seq_type = NULL,
return_max_mismatch_info_only = F,
matches_to_subject_and_pattern = list(c(T,T),c(F,T),c(F,F)),
compare_seq_df_long_args = list(seq_original = NULL,
match_symbol = ".",
change_pattern = T,
pattern_mismatch_as = "base",
change_ref = T,
ref_mismatch_as = "base",
insertion_as = "base"),
pairwiseAlignment_args = list(),
algnmt_plot_args = list(add_length_suffix = T, pattern_lim_size = 2, verbose = F),
order_subject_ranges = F) {
## if a pattern has no mismatches, or only a maximum number of them, then could be used to find multiple matches
# restrict that to type = "local" or type = "global-local"
#y <- joined_TCR_data[19,"values"]
#y <- "GCGGCCGCGCCACCATGGACTCCTGGACCCTCTGCTGTGTGTCCCTTTGCATCCTGGTAGCAAAGCACACAGATGCTGGAGTTATCCAGTCACCCCGGCACGAGGTGACAGAGATGGGACAAGAAGTGACTCTGAGATGTAAACCAATTTCAGGACACGACTACCTTTTCTGGTACAGACAGACCATGATGCGGGGACTGGAGTTGCTCATTTACTTTAACAACAACGTTCCGATAGATGATTCAGGGATGCCCGAGGATCGATTCTCAGCTAAGATGCCTAATGCATCATTCTCCACTCTGAAGATCCAGCCCTCAGAACCCAGGGACTCAGCTGTGTACTTCTGTGCCAGCCGGGGCGGGGACAATGAGCAGTTCTTCGGGCCAGGGACACGGCTCACCGTGCTAGAGGATCTGAGAAATGTGACTCCACCCAAGGTCTCCTTGTTTGAGCCATCAAAAGCAGAGATTGCAAACAAACAAAAGGCTACCCTCGTGTGCTTGGCCAGGGGCTTCTTCCCTGACCACGTGGAGCTGAGCTGGTGGGTGAATGGCAAGGAGGTCCACAGTGGGGTCTGCACGGACCCTCAGGCCTACAAGGAGAGCAATTATAGCTACTGCCTGAGCAGCCGCCTGAGGGTCTCTGCTACCTTCTGGCACAATCCTCGAAACCACTTCCGCTGCCAAGTGCAGTTCCATGGGCTTTCAGAGGAGGACAAGTGGCCAGAGGGCTCACCCAAACCTGTCACACAGAACATCAGTGCAGAGGCCTGGGGCCGAGCAGACTGTGGAATCACTTCAGCATCCTATCATCAGGGGGTTCTGTCTGCAACCATCCTCTATGAGATCCTACTGGGGAAGGCCACCCTATATGCTGTGCTGGTCAGTGGCCTGGTGCTGATGGCCATGGTCAAGAAAAAAAATTCCGGCAGCGGCGCCACCAACTTCAGCCTGCTGAAGCAGGCCGGCGACGTGGAAGAGAACCCCGGGCCCATGGAAACTCTCCTGGGAGTGTCTTTGGTGATTCTATGGCTTCAACTGGCTAGGGTGAACAGTCAACAGGGAGAAGAGGATCCTCAGGCCTTGAGCATCCAGGAGGGTGAAAATGCCACCATGAACTGCAGTTACAAAACTAGTATAAACAATTTACAGTGGTATAGACAAAATTCAGGTAGAGGCCTTGTCCACCTAATTTTAATACGTTCAAATGAAAGAGAGAAACACAGTGGAAGATTAAGAGTCACGCTTGACACTTCCAAGAAAAGCAGTTCCTTGTTGATCACGGCTTCCCGGGCAGCAGACACTGCTTCTTACTTCTGTGCTACGAAAGATGGCCAGAAGCTGCTCTTTGCAAGGGGAACCATGTTAAAGGTGGATCTTAACATCCAGAACCCAGAACCTGCTGTGTACCAGTTAAAAGATCCTCGGTCTCAGGACAGCACCCTCTGCCTGTTCACCGACTTTGACTCCCAAATCAATGTGCCGAAAACCATGGAATCTGGAACGTTCATCACTGACAAATGCGTGCTGGACATGAAAGCTATGGATTCCAAGAGCAATGGGGCCATTGCCTGGAGCAACCAGACAAGCTTCACCTGCCAAGATATCTTCAAAGAGACCAACGCCACCTACCCCAGTTCAGACGTTCCCTGTGATGCCACGTTGACTGAGAAAAGCTTTGAAACAGATATGAACCTAAACTTTCAAAACCTGTCAGTTATGGGACTCCGAATCCTCCTGCTGAAAGTAGCCGGATTTAACCTGCTCATGACGCTGAGGCTGTGGTCCAGTTGAGAATTC"
#tt <- Biostrings::matchPattern(subject = y, max.mismatch=2, pattern = Biostrings::DNAString(igsc::read_fasta(system.file("extdata", "TCR_cassette_elements.fasta", package = "igsc"))[["Kozak_sequence"]]))
#Biostrings::countPattern(subject = y, pattern = Biostrings::DNAString(igsc::read_fasta(system.file("extdata", "TCR_cassette_elements.fasta", package = "igsc"))[["Kozak_sequence"]]))
# have optionally pattern removed that have (i) mismatches, or (ii) do not align full length ?!
## obtain sequences of pattern and subject with gap correction etc differently?
#nchar(as.character(pa@subject))
#nchar(as.character(pa@pattern))
# how to use IRanges?
#IRanges::IRanges(subject.ranges[[1]])
# remove pattern that do not align full length?
if (!requireNamespace("Biostrings", quietly = T)) {
BiocManager::install("Biostrings")
}
' if (fix_indels) {
message("fix_indels does not work yet. Set to F.")
fix_indels <- F
}'
if (!is.list(matches_to_subject_and_pattern) || any(lengths(matches_to_subject_and_pattern) < 2) || any(!sapply(matches_to_subject_and_pattern, is.logical))) {
stop("matches_to_subject_and_pattern has to be a list of vectors of lengths two, containing T or F only.")
}
type <- match.arg(type, choices = c("global-local", "global", "local", "overlap", "local-global"))
#assigns: subject, patterns, seq_type, original_names, pattern_original_order, pattern_groups
prep_subject_and_patterns(subject = subject,
patterns = patterns,
seq_type = seq_type)
# check for non-DNA characters first
#assigns: pa, patterns, pattern_indel_inducing, subject_inds_indel
check_for_invalid_chars(subject = subject,
patterns = patterns,
max_mismatch = max_mismatch)
if (return_max_mismatch_info_only) {
return(list(patterns_invalid = patterns_invalid,
pattern_mismatching = pattern_mismatching_return))
}
# calculate all alignments
# fastDoCall does not work here, maybe due to method written in C
pa <- do.call(Biostrings::pairwiseAlignment, args = c(list(subject = subject, pattern = patterns, type = type),
pairwiseAlignment_args))
# make pa a list once and then iterate over list entries with purrr/furrr which is quicker!
# use multiple threads to speed up?!
# pal <- stats::setNames(as.list(pa), patnames(patterns)terns.names)
# pal <- stats::setNames(purrr::flatten(parallel::mclapply(split(c(1:length(pa)), ceiling(seq_along(c(1:length(pa)))/10)), function(x) as.list(pa[x]), mc.cores = parallel::detectCores()-1)), names(patterns))
#assigns: pa, patterns, pattern_indel_inducing, subject_inds_indel
check_for_indel_induction(pa = pa,
patterns = patterns,
max_mismatch = max_mismatch,
rm_indel_inducing_pattern = rm_indel_inducing_pattern)
#assigns: pa, patterns, subject_indels, indel_ranges
check_for_overlapping_indels(pa = pa,
patterns = patterns,
subject_inds_indel = subject_inds_indel,
fix_subject_indels = fix_subject_indels)
# subject.ranges are defined herein
# here also overlapping subject.ranges within groups are checked for
#assigns: subject.ranges, subject.ranges.unique, pa.unique, pa
make_pa_unique_and_order_and_rm_subset_alignments(pa = pa,
pattern_groups = pattern_groups)
df <- paste_subject_seq(subject = subject,
subject.ranges.unique = subject.ranges.unique,
pa.unique = pa.unique)
# pattern_order is defined here
#assigns: df, subject_indels, pattern_order
paste_patterns_to_subject(subject_indels = subject_indels,
subject.ranges = subject.ranges,
patterns = patterns,
pa = pa,
df = df,
pattern_original_order = pattern_original_order)
if (!is.null(pattern_groups)) {
algnmt_plot_args <- c(algnmt_plot_args, list(y_group_col = "pattern.group"))
}
# fix algnmt_plot_args: remove duplicate args
# write original names into alignments; when the object cycles through C-code (with altered names) certain symbols (maybe like asterisk (*)) may cause problems.
pa@pattern@unaligned@ranges@NAMES <- original_names[pa@pattern@unaligned@ranges@NAMES]
df2 <- prep_df_for_algnmt_plot(df = df,
subject_name = names(subject),
pattern_names = names(patterns),
original_names = original_names,
order_patterns = order_patterns,
subject.ranges = subject.ranges,
pattern_order = pattern_order,
pattern_groups = pattern_groups,
compare_seq_df_long_args)
plot <- Gmisc::fastDoCall(algnmt_plot, args = c(list(algnmt = df2,
algnmt_type = seq_type,
pairwiseAlignment = pa),
algnmt_plot_args))
# c(min(sapply(names(patterns), function(x) min(df[which(!is.na(df[,x,drop=T])),c("subject.position", x)][,"subject.position",drop=T])), na.rm = T), max(sapply(names(patterns), function(x) max(df[which(!is.na(df[,x,drop=T])),c("subject.position", x)][,"subject.position",drop=T])), na.rm = T)),
# c(min(sapply(subject.ranges, min), na.rm = T), max(sapply(subject.ranges, max), na.rm = T))
# alignment can be so bad, that these two information do not match
return(list(data = df2,
plot = plot,
subject_limits = c(min(sapply(subject.ranges, min), na.rm = T), max(sapply(subject.ranges, max), na.rm = T)),
data_wide = df,
pairwise_alignments = pa,
subject_ranges = if (order_subject_ranges) {subject.ranges[order(sapply(subject.ranges, min))]} else {subject.ranges},
pattern_invalid = patterns_invalid,
pattern_indel_inducing = pattern_indel_inducing,
pattern_mismatching = pattern_mismatching_return))
}
'data_plot <- purrr::map(setNames(matches_to_subject_and_pattern, sapply(matches_to_subject_and_pattern, function(x) paste(ifelse(x, "match", "base"), collapse = "_"))), function(y) {
df2 <- prep_df_for_algnmt_plot(df = df,
subject_name = names(subject),
pattern_names = names(patterns),
original_names = original_names,
order_patterns = order_patterns,
subject.ranges = subject.ranges,
pattern_order = pattern_order,
pattern_groups = pattern_groups,
compare_seq_df_long_args)
plot <- Gmisc::fastDoCall(algnmt_plot, args = c(list(algnmt = df2,
algnmt_type = seq_type,
pairwiseAlignment = pa),
algnmt_plot_args))
return(list(data = df2, plot = plot))
})
data = stats::setNames(sapply(data_plot, "[", "data"), nm = sapply(matches_to_subject_and_pattern, function(x) paste(ifelse(x, "match", "base"), collapse = "_"))),
plot = stats::setNames(sapply(data_plot, "[", "plot") , nm = sapply(matches_to_subject_and_pattern, function(x) paste(ifelse(x, "match", "base"), collapse = "_"))),
'
prep_df_for_algnmt_plot <- function(df,
subject_name,
pattern_names,
original_names,
order_patterns,
subject.ranges,
pattern_order,
pattern_groups,
compare_seq_df_long_args) {
# gap
# match
# mismatch
# insertion
' df <- compare_seq_previous(df = df,
subject_name = subject_name,
pattern_names = pattern_names,
matches_to_subject = matches_to_subject,
matches_to_pattern = matches_to_pattern)'
df <- Gmisc::fastDoCall(compare_seq_df_long, args = c(compare_seq_df_long_args, list(df_long =
df %>%
tidyr::pivot_longer(cols = dplyr::all_of(c(subject_name, pattern_names)), names_to = "seq.name", values_to = "seq"),
ref = subject_name,
pos_col = "position",
seq_col = "seq",
name_col = "seq.name")))
df <-
df %>%
#tidyr::pivot_longer(cols = dplyr::all_of(c(subject_name, pattern_names)), names_to = "seq.name", values_to = "seq") %>% # this is now down above
## here, original names are restored
dplyr::mutate(seq.name = original_names[seq.name]) %>%
## factor order with original names
dplyr::mutate(seq.name = factor(seq.name, levels = c(subject_name, unname(original_names[pattern_names])[ifelse(rep(order_patterns, length(subject.ranges)), order(purrr::map_int(subject.ranges, min)), order(pattern_order))])))
#unname(c(original_names[1], original_names[-1][ifelse(rep(order_patterns, length(subject.ranges)), order(purrr::map_int(subject.ranges, min)), pattern_order)]))
if (!is.null(pattern_groups)) {
pattern_groups <- c(stats::setNames("subject", subject_name), pattern_groups)
df$pattern.group <- pattern_groups[df$seq.name]
df$pattern.group <- factor(df$pattern.group, levels = unique(pattern_groups))
}
return(df)
}
prep_subject_and_patterns <- function(subject,
patterns,
seq_type) {
if (length(subject) > 1) {
stop("Please provide only one subject as DNAString, DNAStringSet or character.")
}
if (is.list(subject)) {
stop("Please provide one subject only.")
}
if (is.list(patterns)) {
if (all(!sapply(patterns, methods::is, class2 = "RNAStringSet")) && all(!sapply(patterns, methods::is, class2 = "DNAStringSet")) && all(!sapply(patterns, methods::is, class2 = "AAStringSet")) && all(!sapply(patterns, methods::is, class2 = "character"))) {
stop("patterns has to be a XStringSet or character vector.")
}
} else {
if (!methods::is(patterns, "RNAStringSet") && !methods::is(patterns, "DNAStringSet") && !methods::is(patterns, "AAStringSet") && !methods::is(patterns, "character")) {
stop("patterns has to be a XStringSet or character vector.")
}
}
if (!methods::is(subject, "RNAStringSet") && !methods::is(subject, "DNAStringSet") && !methods::is(subject, "AAStringSet") && !methods::is(subject, "character")) {
stop("subject has to be a XStringSet or character vector.")
}
# handle list of patterns
patterns_list <- NULL
pattern_groups <- NULL
if (is.list(patterns) && length(patterns) > 1 && !all(lengths(patterns) == 1)) {
patterns <- patterns[which(!sapply(patterns, is.null))]
patterns <- patterns[which(!sapply(sapply(patterns, is.na, simplify = F), all))]
patterns <- lapply(patterns, function(x) x[which(!is.na(x))])
patterns_list <- patterns
# will this be slow for many patterns? - is needed if list of DNAStringsSets - very hypothetical; maybe restrict what data types can be passed as patterns
patterns <- unlist(lapply(patterns_list, as.character))
if (is.null(names(patterns_list))) {
names(patterns_list) <- paste0("Group_", seq_along(patterns_list))
}
pattern_groups <- lapply(patterns_list, names)
pattern_groups <- stats::setNames(as.character(stack(pattern_groups)$ind), stack(pattern_groups)$values)
} else if (is.list(patterns) && length(patterns) > 1 && all(lengths(patterns) == 1)) {
# each list entry if length = 1
patterns <- unlist(patterns)
} else if (is.list(patterns)) {
patterns <- patterns[[1]]
if (is.null(patterns) || all(is.na(patterns))) {
stop("patterns is Null or NA.")
}
}
# make this a separate fun somewhen?
if (anyDuplicated(patterns)) {
if (is.null(patterns_list)) {
message("These pattern are duplicates: ", paste((unique(sapply(which(duplicated(patterns)), function(x) which(patterns[x] == patterns)))), collapse = ", "))
} else {
message("pattern at these indices are duplicates: ", paste(which(duplicated(patterns)), collapse = ", "))
# similar fun as above but cylce through list and return list indices on top
' tt<-sapply(which(duplicated(patterns)), function(x) {
unlist(purrr::discard(sapply(stats::setNames(seq_along(patterns_list), seq_along(patterns_list)), function(y) {
which(patterns[x] == patterns_list[[y]])
}), function(z) length(z) == 0))
}, simplify = F)
paste(tt)'
}
message("pattern at these indices are duplicates: ", paste(which(duplicated(patterns)), collapse = ", "))
}
## pull seqs from subject and patterns, then run guess_type
unique_letters <- unique(toupper(c(unlist(strsplit(as.character(subject), "")), unlist(strsplit(as.character(patterns), "")))))
if (is.null(seq_type)) {
seq_type <- guess_type(unique_letters)
} else {
seq_type <- match.arg(seq_type, c("NT", "AA"))
}
if (seq_type == "NT") {
if ("U" %in% unique_letters) {
if (!methods::is(patterns, "RNAStringSet")) {
patterns <- Biostrings::RNAStringSet(patterns)
}
if (!methods::is(subject, "RNAStringSet")) {
subject <- Biostrings::RNAStringSet(subject)
}
} else {
if (!methods::is(patterns, "DNAStringSet")) {
patterns <- Biostrings::DNAStringSet(patterns)
}
if (!methods::is(subject, "DNAStringSet")) {
subject <- Biostrings::DNAStringSet(subject)
}
}
} else if (seq_type == "AA") {
if (!methods::is(patterns, "AAStringSet")) {
patterns <- Biostrings::AAStringSet(patterns)
}
if (!methods::is(subject, "AAStringSet")) {
subject <- Biostrings::AAStringSet(subject)
}
}
if (is.null(names(subject))) {
names(subject) <- "subject"
}
if (is.null(names(patterns))) {
names(patterns) <- paste0("pattern_", seq(1,length(patterns)))
}
names(patterns) <- make.unique(names(patterns))
# TODO:
# do not allow certain names for subject or pattern?
# e.g. no pattern name identical to subject name
# or pattern name != "subject" if no subject name is given
# or seq or position or subject.position - check data frame from later below
## save original names for replacement later
### avoid using names from pa then
original_names <- c(names(subject), names(patterns))
names(subject) <- make.names(names(subject))
names(patterns) <- make.names(names(patterns))
names(original_names) <- c(names(subject), names(patterns))
# save original order in case filterings below shuffles it
pattern_original_order <- names(patterns)
# assigns in parent environment (https://stackoverflow.com/questions/10904124/global-and-local-variables-in-r?rq=1)
assign("subject", subject, envir = parent.frame())
assign("patterns", patterns, envir = parent.frame())
assign("seq_type", seq_type, envir = parent.frame())
assign("original_names", original_names, envir = parent.frame())
assign("pattern_original_order", pattern_original_order, envir = parent.frame())
assign("pattern_groups", pattern_groups, envir = parent.frame())
}
check_for_invalid_chars <- function(subject,
patterns,
max_mismatch) {
if (!is.na(max_mismatch) && max_mismatch < 0) {
message("max_mismatch has to be NA or >= 0. Is set to NA now.")
max_mismatch <- NA
}
patterns_invalid <- NULL
pattern_mismatching_return <- NULL
if (!is.na(max_mismatch) && methods::is(subject, "DNAStringSet") && methods::is(patterns, "DNAStringSet")) {
if (grepl("[^ACTGU]", subject)) {
message("subject contains non-DNA or non-RNA characters. To check for max_mismatch currenly only ACTGU are allowed. max_mismatch is now set to NA.")
max_mismatch <- NA
}
inds <- grepl("[^ACTGU]", patterns)
if (any(inds)) {
message(length(which(inds)), " patterns with non-DNA or non-RNA characters detected. Those patterns are removed in order to allow checking for max_mismatch. They are returned as patterns_invalid.")
patterns_invalid <- patterns[which(inds)]
patterns <- patterns[which(!inds)]
if (length(patterns) == 0) {
stop("No patterns left after filtering for ones with valid DNA/RNA characters only. Please fix the sequences or set max_mismatch = NA.")
}
}
####
####
pattern_mismatching <- purrr::map(stats::setNames(0:max_mismatch, 0:max_mismatch), function(x) {
## different lengths of patterns not allowed - split patterns by length; then have the names returned
## if too slow, think of other procedure
pattern_names <- purrr::map(unique(nchar(as.character(patterns))), function(y) {
patterns_temp <- patterns[which(nchar(as.character(patterns)) == y)]
inds <- Biostrings::vwhichPDict(subject = subject,
pdict = Biostrings::PDict(x = patterns_temp, max.mismatch = x),
max.mismatch = x)[[1]]
names(patterns_temp[inds])
})
return(unlist(pattern_names))
})
# last index of pattern_mismatching contains all patterns with the amount of mismatches at this index or less
message(length(pattern_mismatching[[length(pattern_mismatching)]]), " of ", length(patterns), " patterns found to have less or equal to ", max_mismatch, " mismatches with the subject.")
print_pattern_mismatching <- utils::stack(lengths(pattern_mismatching))
names(print_pattern_mismatching) <- c("n patterns", "mismatches")
print(print_pattern_mismatching)
pattern_mismatching_return <- purrr::map(stats::setNames(pattern_mismatching, paste0("max_mis_", names(pattern_mismatching))), function(x) patterns[x])
patterns <- patterns[pattern_mismatching[[length(pattern_mismatching)]]]
if (length(patterns) == 0) {
stop("No pattern left after filtering for max_mismatch.")
}
} else if (!is.na(max_mismatch)) {
message("!is.na(max_mismatch) can only be applied for DNA.")
max_mismatch <- NA
}
assign("patterns", patterns, envir = parent.frame())
assign("pattern_mismatching_return", pattern_mismatching_return, envir = parent.frame())
assign("patterns_invalid", patterns_invalid, envir = parent.frame())
assign("max_mismatch", max_mismatch, envir = parent.frame())
}
check_for_indel_induction <- function(pa,
patterns,
max_mismatch,
rm_indel_inducing_pattern) {
# caution: leading gaps are not considered as indels!
#indel_list <- 0
pattern_indel_inducing <- NULL
subject_inds_indel <- NULL
if (is.na(max_mismatch) || max_mismatch > 0) { # why this condition?
# based on pa, not pal!!
#indel_lists <- Biostrings::nindel(pa) # this does not discriminate pattern and subject
#indel_list <- stats::setNames(apply(cbind(indel_lists@insertion, indel_lists@deletion), 1, sum), names(patterns))
# loop over pa elements to create list of matrices, then tell which pattern had gap in subject or pattern
indel_mat_pattern <- lapply(seq_along(pa), function(z) as.matrix(pa[z]@pattern@indel@unlistData))
indel_mat_subject <- lapply(seq_along(pa), function(z) as.matrix(pa[z]@subject@indel@unlistData))
# this if FYI
pattern_inds_indel <- which(unlist(lapply(indel_mat_pattern, nrow)) > 0)
if (length(pattern_inds_indel) > 0) {
message(sum(pattern_inds_indel > 0), " patterns got indels. Indices: ", paste(pattern_inds_indel, collapse = ", "))
}
subject_inds_indel <- which(unlist(lapply(indel_mat_subject, nrow)) > 0)
subject_inds_indel_pass <- which(unlist(lapply(indel_mat_subject, nrow)) == 0)
if (any(subject_inds_indel > 0)) {
message(sum(subject_inds_indel > 0), " patterns caused indels in subject. Indices: ", paste(subject_inds_indel, collapse = ", "))
if (rm_indel_inducing_pattern) {
message("Those are removed as rm_indel_inducing_pattern = T. They are returned as pattern_indel_inducing.")
pattern_indel_inducing <- patterns[subject_inds_indel]
patterns <- patterns[subject_inds_indel_pass]
pa <- pa[subject_inds_indel_pass]
#subject_inds_indel <- subject_inds_indel[which(subject_inds_indel == 0)] # needed?
}
}
}
assign("pa", pa, envir = parent.frame())
assign("patterns", patterns, envir = parent.frame())
assign("pattern_indel_inducing", pattern_indel_inducing, envir = parent.frame())
assign("subject_inds_indel", subject_inds_indel, envir = parent.frame())
}
check_for_overlapping_indels <- function(pa,
patterns,
subject_inds_indel,
fix_subject_indels) {
if (length(patterns) > 1 && any(subject_inds_indel > 0)) { # min 2 pattern and min 1 indel in subject
# find out if any pattern alignment overlap with gaps from another pattern alignment. this would cause problem in the alignment.
subject_indels <- as.data.frame(pa@subject@range)
names(subject_indels) <- c("al_start", "al_end", "al_width")
subject_indels$group <- 1:nrow(subject_indels)
subject_indels$pattern_name <- pa@pattern@unaligned@ranges@NAMES
subject_indels <- dplyr::left_join(subject_indels, as.data.frame(pa@subject@indel)[,-2], by = "group")
subject_indels$start <- subject_indels$start + subject_indels$al_start - 1
subject_indels$end <- subject_indels$start + subject_indels$width - 1
subject_indels$corr_end <- NA
subject.ranges <- seq2(subject_indels$al_start, subject_indels$al_end) # this is the same as subject.ranges below; this is like seq2
indel_ranges <- seq2(subject_indels$start[!is.na(subject_indels$start)], subject_indels$end[!is.na(subject_indels$end)])
do_fix <- F
if (!fix_subject_indels) {
for (i in seq_along(subject.ranges)) {
if (any(subject.ranges[[i]] %in% unlist(indel_ranges[-i]))) {
## allow for indel at same position
# change order around %in% ?
if (!all(subject.ranges[[i]][which(subject.ranges[[i]] %in% unlist(indel_ranges[-i]))] %in% unlist(indel_ranges[-i]))) {
stop("Overlapping indel and subject alignment range found at index ", i, ". This cannot be handled yet, except for shortening respective sequences to just before the indel insertion.
To do so, set fix_subject_indels = T.")
}
}
}
}
if (fix_subject_indels) {
for (i in seq_along(subject.ranges)) {
for (j in seq_along(indel_ranges)) {
if (i != j) {
if (length(intersect(subject.ranges[[i]],indel_ranges[[j]])) > 0) {
do_fix <- T
subject_indels[j,"corr_end"] <- subject_indels[j,"start"] - 1
}
}
}
}
}
if (do_fix && fix_subject_indels) {
for (k in seq_along(patterns)) {
if (any(!is.na(subject_indels[which(ind$group == k), "corr_end"]))) {
message(names(patterns)[k], " is cut at position ", min(subject_indels[which(subject_indels$group == k), "corr_end"], na.rm = T), " to avoid indel overlap with another's pattern range on the subject. Experimental, yet.")
patterns[k] <- Biostrings::subseq(patterns[k], start = 1, end = min(subject_indels[which(subject_indels$group == k), "corr_end"], na.rm = T))
}
}
pa <- do.call(Biostrings::pairwiseAlignment, args = c(list(subject = subject, pattern = patterns, type = type),
pairwiseAlignment_args))
}
assign("pa", pa, envir = parent.frame())
assign("patterns", patterns, envir = parent.frame())
assign("subject_indels", subject_indels, envir = parent.frame())
assign("indel_ranges", indel_ranges, envir = parent.frame())
} else {
assign("subject_indels", NULL, envir = parent.frame())
}
}
make_pa_unique_and_order_and_rm_subset_alignments <- function(pa,
pattern_groups) {
#subject.ranges <- brathering::seq2(pa@subject@range@start, pa@subject@range@start+pa@subject@range@width-1)
#subject.ranges <- mapply("seq", pa@subject@range@start, pa@subject@range@start+pa@subject@range@width-1)
subject.ranges <- seq2(pa@subject@range@start, pa@subject@range@start+pa@subject@range@width-1)
names(subject.ranges) <- pa@pattern@unaligned@ranges@NAMES
#out <- IRanges::findOverlapPairs(pa@subject@range, pa@subject@range)
## check if subject ranges within groups are overlapping
if (!is.null(pattern_groups)) {
subject.ranges.split <- split(names(subject.ranges), pattern_groups[names(subject.ranges)])
overlap_subject_ranges <- unlist(lapply(subject.ranges.split, function(y) {
ranges_comb <- combn(y, 2, simplify = F)
if (any(unlist(lapply(ranges_comb, function(x) length(intersect(subject.ranges[[x[1]]], subject.ranges[[x[2]]])) > 1)))) {
return(T)
} else {
return(F)
}
}))
if (any(overlap_subject_ranges)) {
stop("These groups have patterns with overlapping alignment ranges in subject: ", paste(names(which(overlap_subject_ranges)), collapse = ", "))
}
}
non_dups <- which(!duplicated(subject.ranges))
subject.ranges.unique <- subject.ranges[non_dups]
pa.unique <- pa[non_dups]
if (length(subject.ranges.unique) > 1) {
is.subset <- purrr::map_lgl(seq_along(subject.ranges.unique), function(i) {
any(purrr::map_lgl(seq_along(subject.ranges.unique), function (j) {
if (i == j) {
return(F)
} else {
all(subject.ranges.unique[[i]] %in% subject.ranges.unique[[j]])
}
}))
})
} else {
is.subset <- rep(F, length(subject.ranges.unique))
}
not.subset <- which(!is.subset)
subject.ranges.unique <- subject.ranges.unique[not.subset]
pa.unique <- pa.unique[not.subset]
# order alignment and subject ranges increasingly
al_order <- order(purrr::map_int(subject.ranges.unique, min))
pa.unique <- pa.unique[al_order]
subject.ranges.unique <- subject.ranges.unique[al_order]
pa <- pa[order(purrr::map_int(subject.ranges, min))]
assign("subject.ranges", subject.ranges, envir = parent.frame()) # needed? - yes
assign("subject.ranges.unique", subject.ranges.unique, envir = parent.frame())
assign("pa.unique", pa.unique, envir = parent.frame())
assign("pa", pa, envir = parent.frame())
}
paste_subject_seq <- function(subject,
subject.ranges.unique,
pa.unique) {
# use this to concat the subject sequence
#nchar(as.character(pa.unique@subject))
#nchar(as.character(pa.unique@pattern))
#lengths(subject.ranges.unique)
#sapply(subject.ranges.unique, min)
#stringr::str_count(as.character(pa.unique@subject)[3], "-")
# paste together the complete subject
total.subject.seq <- stringr::str_sub(as.character(subject), 1, (min(subject.ranges.unique[[1]]) - 1))
for (i in seq_along(subject.ranges.unique)) {
# test if there is a gap between the i-1th and the ith alignment; if so, fill with original sequence
if (i != 1 && max(subject.ranges.unique[[i-1]])+1 < min(subject.ranges.unique[[i]])) {
# if yes use original subject
total.subject.seq <- paste0(total.subject.seq, substr(as.character(subject), max(subject.ranges.unique[[i-1]])+1, min(subject.ranges.unique[[i]])-1))
}
if (i < length(subject.ranges.unique)) {
r <- min(subject.ranges.unique[[i]])
temp_ref1 <- min(subject.ranges.unique[[i]])-r+1
temp_ref2 <- min(subject.ranges.unique[[i+1]])-r
# 2023-12-08: this if is new; in case the pattern causes gaps in subject, append as long as the total number of non-gap characters equals the needed subject length by this pattern; only needed if next pattern overlaps
gaps_found <- nchar(gsub("-", "", as.character(pa.unique@subject[i]))) != nchar(as.character(pa.unique@subject[i]))
if (gaps_found) {
next_overlap <- length(intersect(subject.ranges.unique[[i]], subject.ranges.unique[[i+1]])) > 0
if (next_overlap) {
diff <- (min(subject.ranges.unique[[i+1]])) - (min(subject.ranges.unique[[i]])+1)
len <- nchar(gsub("-", "", substr(pa.unique@subject[i], temp_ref1, temp_ref2)))
add <- 0
max_char <- nchar(as.character(pa.unique@subject[i]))
# still error here, maybe not valid for every type of alignment check how to make quicker
while(len < diff) {
add <- add + 1
if ((temp_ref2+add) > max_char) {
stop("error in extending pattern.")
}
len <- nchar(gsub("-", "", substr(pa.unique@subject[i], temp_ref1, temp_ref2+add)))
}
total.subject.seq <- paste0(total.subject.seq, substr(pa.unique@subject[i], temp_ref1, temp_ref2+add+1))
} else {
# nothing, then the gap will be filled in next iteration of i, above
}
} else {
total.subject.seq <- paste0(total.subject.seq, substr(pa.unique@subject[i], temp_ref1, temp_ref2))
}
}
if (i == length(subject.ranges.unique)) {
total.subject.seq <- paste0(total.subject.seq, pa.unique@subject[i])
}
}
## attach the remaining sequence from subject
total.subject.seq <- paste0(total.subject.seq, substr(as.character(subject), max(subject.ranges.unique[[length(subject.ranges.unique)]])+1, nchar(as.character(subject))))
## create data frame for plotting
df <- dplyr::mutate(data.frame(seq = stats::setNames(strsplit(total.subject.seq, ""), names(subject))), position = dplyr::row_number())
df[df[,names(subject)] != "-", "subject.position"] <- seq(1:nrow(df[df[,names(subject)] != "-", ]))
return(df)
}
' if (nchar(gsub("-", "", as.character(pa.unique@subject[i]))) != nchar(as.character(pa.unique@subject[i]))) {
diff <- (min(subject.ranges.unique[[i+1]])) - (min(subject.ranges.unique[[i]])+1)
len <- nchar(gsub("-", "", substr(pa.unique@subject[i], min(subject.ranges.unique[[i]])-r+1, min(subject.ranges.unique[[i+1]])-r)))
add <- 0
# still error here, maybe not valid for every type of alignment check how to make quicker
while(len < diff) {
add <- add + 1
len <- nchar(gsub("-", "", substr(pa.unique@subject[i], min(subject.ranges.unique[[i]])-r+1, min(subject.ranges.unique[[i+1]])-r+add)))
}
total.subject.seq <- paste0(total.subject.seq, substr(pa.unique@subject[i], min(subject.ranges.unique[[i]])-r+1, min(subject.ranges.unique[[i+1]])-r+add+1))
} else {
total.subject.seq <- paste0(total.subject.seq, substr(pa.unique@subject[i], min(subject.ranges.unique[[i]])-r+1, min(subject.ranges.unique[[i+1]])-r))
}'
paste_patterns_to_subject <- function(subject_indels,
subject.ranges,
patterns,
pa,
df,
pattern_original_order) {
# gaps only account for the next sequence, respectively, hence add 0 at beginning, and delete last index
# these lines assumed that indels are not overlapping with the subsequent pattern
#gaps <- c(0, Biostrings::nindel(pa)@insertion[,"WidthSum"])
#gap_corr <- purrr::accumulate(gaps[-length(gaps)], `+`)
# if indels are at same position has been checked above
# if subject_indel is at the same position as previous, then do not increment gap_corr at next index
# for overlapping patterns, one of which causes indels but the other not (the second aligns perfectly), gap_corr has to be provided position-wise, not pattern-wise
all_pattern <- stats::setNames(as.character(pa@pattern), pa@pattern@unaligned@ranges@NAMES)
pattern_seq <- stack(strsplit(all_pattern, ""))
names(pattern_seq) <- c("seq", "pattern")
# default for !is.null(subject_indels) and overlap == TRUE
# pattern_plot_pos is also needed below when is.null(subject_indels)
pattern_plot_pos <- seq2((pa@subject@range@start), (pa@subject@range@start + nchar(all_pattern) - 1))
pattern_plot_pos <- stack(stats::setNames(pattern_plot_pos, names(all_pattern)))
names(pattern_plot_pos) <- c("position", "pattern")
if (!is.null(subject_indels)) {
# replace NAs first
subject_indels$start <- ifelse(is.na(subject_indels$start), 0 , subject_indels$start)
subject_indels$end <- ifelse(is.na(subject_indels$end), 0 , subject_indels$end)
subject_indels$gap_insert <- ifelse(is.na(subject_indels$width), 0, subject_indels$width)
subject_indels <- dplyr::arrange(subject_indels, al_start, group) # why not ordered by default??
overlap <- purrr::map_lgl(seq_along(subject.ranges), function(i) {
any(purrr::map_lgl(seq_along(subject.ranges), function (j) {
if (i == j) {
return(F)
} else {
any(subject.ranges[[i]] %in% subject.ranges[[j]])
}
}))
})
# if any pattern overlap, use the more precise method of gap correction
if (any(overlap)) {
subject_indels <- subject_indels[which(!is.na(subject_indels$width)),]
## leave one pattern out, to only add gaps induced by any other pattern
gap_corr2_list <- purrr::map_dfr(stats::setNames(names(patterns), names(patterns)), function(x) {
temp <- subject_indels[which(subject_indels$pattern_name != x),,drop=F]
if (nrow(temp) > 0) {
gap_corr2 <- rep(0, length(1:temp[1,"start"])-1) # where to subtract 1, here and/or in loop below has to be tested
for (i in 2:nrow(temp)) {
gap_corr2 <- c(gap_corr2, rep(gap_corr2[length(gap_corr2)]+temp[i-1,"gap_insert"], temp[i,"start"]-length(gap_corr2)-1))
}
gap_corr2 <- c(gap_corr2, rep(gap_corr2[length(gap_corr2)]+temp[i,"gap_insert"], max(df$position)-length(gap_corr2))) # max(df$position) may be longer than necessary
names(gap_corr2) <- seq(1, length(gap_corr2))
gap_corr2 <- stack(gap_corr2)
names(gap_corr2) <- c("corr", "position")
gap_corr2$position <- as.numeric(gap_corr2$position)
} else {
# no gaps from other patterns to consider
gap_corr2 <- data.frame(corr = rep(0, max(df$position)), position = 1:max(df$position)) # max(df$position) may be longer than necessary
}
return(gap_corr2)
}, .id = "pattern")
gap_corr2_list <- split(gap_corr2_list, gap_corr2_list$pattern)
# pattern_plot_pos was initiated above
# join gap_corr2_list with coalesce join
for (i in seq_along(gap_corr2_list)) {
pattern_plot_pos <- coalesce_join(pattern_plot_pos, gap_corr2_list[[i]], by = c("position" = "position", "pattern" = "pattern"))
}
pattern_plot_pos$position <- pattern_plot_pos$position + pattern_plot_pos$corr
pattern_plot_pos <- pattern_plot_pos[,-which(names(pattern_plot_pos) == "corr")]
} else {
## no overlap: addition of gaps can happen after every pattern
## all previous gaps are summed and added after a pattern for all subsequent ones
## group by group = multiple indels by one pattern are grouped; gaps induced are summed because only the sum of gaps is relevant to shift patterns afterwards
subject_indels_grouped <-
subject_indels %>%
dplyr::group_by(group, al_start) %>%
dplyr::summarise(start = list(start), end = list(end), gap_insert = sum(gap_insert), .groups = "drop") %>%
dplyr::arrange(al_start)
for (i in 1:(nrow(subject_indels_grouped)-1)) {
subject_indels_grouped$gap_insert[i] <- ifelse((identical(subject_indels_grouped$start[i+1], subject_indels_grouped$start[i]) && identical(subject_indels_grouped$end[i+1], subject_indels_grouped$end[i])), 0, subject_indels_grouped$gap_insert[i])
}
gaps <- c(0, subject_indels_grouped$gap_insert)
gap_corr <- purrr::accumulate(gaps[-length(gaps)], `+`)
pattern_plot_pos <- seq2((pa@subject@range@start + gap_corr), (pa@subject@range@start+nchar(all_pattern) - 1 + gap_corr))
pattern_plot_pos = stack(stats::setNames(pattern_plot_pos, names(all_pattern)))
names(pattern_plot_pos) <- c("position", "pattern")
}
}
## common final lines
dfs <- cbind(pattern_seq[,"seq",drop=F], pattern_plot_pos)
dfs <- split(dfs, dfs$pattern)
dfs <- purrr::map(dfs, function(x) {
names(x)[1] <- as.character(x[1,"pattern",drop=T])
return(x[-which(names(x) == "pattern")])
})
dfs <- join_chunkwise(data_frame_list = dfs, join_by = "position")
# then left join with complete seq in index 1
df2 <- purrr::reduce(c(list(df), dfs), dplyr::left_join, by = "position")
## order patterns in data.frames below by factor order
pattern_order <- match(names(patterns), pattern_original_order)
assign("df", df2, envir = parent.frame())
assign("subject_indels", subject_indels, envir = parent.frame())
assign("pattern_order", pattern_order, envir = parent.frame())
}
seq2_default <- Vectorize(seq.default, vectorize.args = c("from", "to"))
seq2 <- function(from = 1, to = 1) {
x <- seq2_default(from = from, to = to)
# make sure always a list is returned
if (is.matrix(x)) {
x <- unname(as.list(as.data.frame(x)))
}
return(x)
}
join_chunkwise <- function(data_frame_list,
join_fun = dplyr::full_join,
join_by,
chunks = 10,
max_final_size = 20) {
# chunk wise joining can be much faster
join_fun <- match.fun(join_fun)
while (length(data_frame_list) > max_final_size) {
data_frame_list <- purrr::map(split(c(1:length(data_frame_list)), ceiling(seq_along(c(1:length(data_frame_list)))/chunks)),
function(x) purrr::reduce(data_frame_list[x], join_fun, by = join_by))
}
return(data_frame_list)
}
coalesce_join <- function(x, y,
by = NULL, suffix = c(".x", ".y"),
join = dplyr::left_join, ...) {
# copied originally from: https://alistaire.rbind.io/blog/coalescing-joins/
# ideas:
# https://stackoverflow.com/questions/33954292/merge-two-data-frame-and-replace-the-na-value-in-r#33954334
# https://community.rstudio.com/t/merging-2-dataframes-and-replacing-na-values/32123/2
# https://github.com/WinVector/rqdatatable
joined <- join(x, y, by = by, suffix = suffix, ...)
# names of desired output
cols <- union(names(x), names(y))
to_coalesce <- names(joined)[!names(joined) %in% cols]
if (length(to_coalesce) > 0) {
suffix_used <- suffix[ifelse(endsWith(to_coalesce, suffix[1]), 1, 2)]
# remove suffixes and deduplicate
to_coalesce <- unique(substr(to_coalesce, 1, nchar(to_coalesce) - nchar(suffix_used)))
coalesced <- purrr::map(to_coalesce, ~dplyr::coalesce(joined[[paste0(.x, suffix[1])]], joined[[paste0(.x, suffix[2])]]))
names(coalesced) <- to_coalesce
coalesced <- dplyr::bind_cols(coalesced)
# only remove cols when they have different names in x and y; because then the col name from y disappears; this is irrespective of left_join and right_join
# what about other way of specifying join columns?
cols2 <- if(is.null(names(by))) {
cols
} else {
by2 <- by[which(names(by) != by)]
cols[which(!cols %in% by2)]
}
return(dplyr::bind_cols(joined, coalesced)[cols2]) # modified from dplyr::bind_cols(joined, coalesced)[cols]; needed if by = c("xyz" = "abc") and "abc" becomes "xyz" in joined df
} else {
return(joined)
}
}
any_false <- function(x) {
if (all(is.na(x))) {
return(T)
} else if (any(!x[which(!is.na(x))])) {
return(F)
} else if (all(x[which(!is.na(x))])) {
return(T)
} else {
stop("Logical error.")
}
}
compare_seq_previous <- function(df,
subject_name,
pattern_names,
matches_to_subject,
matches_to_pattern) {
# previous procedure of comparing subject and pattern
# "-" in subject is a gap
## if is.na(subject.position) --> always gap in subject and always insertion in pattern
if (matches_to_pattern || matches_to_subject) {
df_original <- df
# is.na(subject.position) --> always gap in subject and always insertion in pattern
subject_gap <- which(is.na(df[,"subject.position"]))
for (i in pattern_names) {
df[subject_gap, i][which(!is.na(df[subject_gap, i]))] <- "insertion"
}
df[subject_gap, subject_name] <- "gap"
# "-" in any pattern --> gap in pattern, insertion in subject
pattern_gap <- apply(df[,pattern_names,drop=F], 1, function(x) which(x == "-"))
pattern_gap_rows <- which(lengths(pattern_gap) > 0)
df[pattern_gap_rows, subject_name] <- "insertion"
# write "gap" to patterns individually
for (i in pattern_names) {
df[which(lengths(apply(df[,i,drop=F], 1, function(x) which(x == "-"))) > 0), i] <- "gap"
}
# then find matches and mismatches
# pattern first
match_mismatch_list <- lapply(stats::setNames(pattern_names, pattern_names), function(x) df_original[,x] == df_original[,subject_name]) # subject seq in df (not df.original) may already contain "insertion"; for overlapping pattern where one receives an insertion and the other not, this is relevant
for (i in names(match_mismatch_list)) {
df[which(!df[,i] %in% c("gap", "insertion")),i] <- ifelse(match_mismatch_list[[i]][which(!df[,i] %in% c("gap", "insertion"))], "match", "mismatch")
}
# then subject
# find if any pattern has mismatch to subject
matches <- purrr::pmap_lgl(match_mismatch_list, function(...) any_false(unlist(list(...))))
df[intersect(which(!df[,subject_name] %in% c("gap", "insertion")), which(matches)),subject_name] <- "match"
df[intersect(which(!df[,subject_name] %in% c("gap", "insertion")), which(!matches)),subject_name] <- "mismatch"
if (!matches_to_subject) {
df[,subject_name] <- df_original[,subject_name]
}
if (!matches_to_pattern) {
for (i in pattern_names) {
df[,i] <- df_original[,i]
}
}
}
return(df)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.