#' @export
#' @importFrom dplyr "%>%" mutate group_by add_count ungroup pull if_else
#' @importFrom stringr str_c
make_clean_unique_names <- function(x) {
tibble(x = x) %>%
mutate(x = make_clean_names(x)) %>%
group_by(x) %>%
add_count() %>%
mutate(suffix = if_else(n>1, str_c('-', seq_along(x)), '')) %>%
ungroup() %>%
mutate(x = str_c(x, suffix)) %>%
pull(x)
}
#' @export
#' @importFrom dplyr "%>%"
#' @importFrom stringr str_remove str_replace str_replace_all
make_clean_names <- function(x) {
str_remove(x, '[^A-z0-9_\\-]+$') %>%
str_remove('^[^A-z0-9_\\-]+') %>%
str_replace_all('[^A-z0-9_\\-]+', '_') %>%
str_replace('^$', 'x')
}
#' @export
#' @importFrom ShortRead writeFastq yield FastqStreamer
#' @importFrom dplyr tibble bind_rows
#' @importFrom rlang is_string is_scalar_integerish
#' @importFrom future future plan multisession tweak value
splitFastqPairs <- function(fq1, fq2, out_dir, prefix = 'reads', suffix = '.fastq.gz', nreads=5e5){
stopifnot(is_string(fq1), file.exists(fq1),
is_string(fq2), file.exists(fq2),
is_string(out_dir), dir.exists(out_dir),
is_string(prefix), is_string(suffix),
is_scalar_integerish(nreads), nreads > 0)
f_fq1fns <- future({
HaplotypReportR::splitFastq(fqfn = fq1,
out_dir = out_dir,
prefix = prefix,
suffix = str_c('R1', suffix),
nreads = nreads)})
f_fq2fns <- future({
HaplotypReportR::splitFastq(fqfn = fq2,
out_dir = out_dir,
prefix = prefix,
suffix = str_c('R2', suffix),
nreads = nreads)})
fq1s <- value(f_fq1fns)
fq2s <- value(f_fq2fns)
if (length(fq1s) != length(fq2s)) {
warning("Fastq files were of different lengths")
}
n <- min(length(fq1s), length(fq2s))
tibble(i = seq_len(n),
fq1 = fq1s[seq_len(n)],
fq2 = fq2s[seq_len(n)])
}
#' @export
#' @importFrom ShortRead writeFastq yield FastqStreamer
#' @importFrom future future plan multisession value
splitFastq <- function(fqfn, out_dir, prefix, suffix, nreads){
fs <- FastqStreamer(fqfn, n = nreads)
i <- 1L
out_fns <- character()
f_write <- NULL
while(length(sr <- yield(fs)) > 0){
if (!is.null(f_write)) {
if (value(f_write) < 1) {
warning('writeFastq possibly failed')
}
}
if (length(sr) == 0)
break
fn <- file.path(out_dir, str_c(prefix, '_', i, '_', suffix))
f_write <- future(writeFastq(sr, fn))
out_fns <- c(out_fns, fn)
i <- i + 1L
}
close(fs)
if (!is.null(f_write)) {
if (value(f_write) < 1) {
warning('writeFastq possibly failed')
}
}
return(out_fns)
}
#' @export
#' @importFrom ShortRead writeFastq yield FastqStreamer
#' @importFrom dplyr tibble bind_rows
#' @importFrom rlang is_string is_scalar_integerish
concatFastq <- function(in_fqs, out_fq){
stopifnot(is.character(in_fqs),
all(file.exists(in_fqs)),
is_string(out_fq))
mode <- 'w'
for (fq in in_fqs) {
fs <- FastqStreamer(fq)
while(length(sr <- yield(fs)) > 0) {
writeFastq(sr, out_fq, mode = mode)
mode <- 'a'
}
close(fs)
}
invisible(out_fq)
}
#' @export
#' @importFrom ShortRead writeFastq yield FastqSampler
#' @importFrom rlang is_string is_scalar_integerish
sampleFastq <- function(in_fq, nreads, pattern = '.fastq.gz$', suffix = '_sub.fastq.gz', seed = 0L){
stopifnot(is_string(in_fq), file.exists(in_fq),
is_scalar_integerish(nreads))
out_fq = str_c(str_remove(in_fq, pattern), suffix)
if(file.exists(out_fq))
file.remove(out_fq)
set.seed(0L)
fs <- FastqSampler(in_fq, n = nreads)
sr <- yield(fs)
writeFastq(sr, out_fq)
close(fs)
return(out_fq)
}
#' @importFrom dplyr inner_join "%>%"
#' @importFrom stringr str_extract str_count
#' @importFrom Biostrings DNAString DNAStringSet pairwiseAlignment alignedSubject subseq
trim_marker_panel <- function(trimmed_ref_seq, marker_panel) {
trimmed_panel <-
names(trimmed_ref_seq) %>%
map_df(function(marker) {
panel_seqs <-
marker_panel %>%
filter(MarkerID == marker) %>%
with(magrittr::set_names(DNAStringSet(Sequence), Haplotype))
aln <- pairwiseAlignment(panel_seqs, trimmed_ref_seq[marker], type="global")
subject <- alignedSubject(aln) %>% as.character()
gaps_left <- str_extract(subject, '^-*') %>% str_count('-')
gaps_right <- str_extract(subject, '-*$') %>% str_count('-')
trimmed_panel <-
panel_seqs %>%
subseq(., gaps_left + 1, width(.) - gaps_right) %>%
unique()
tibble(MarkerID = marker,
Haplotype = names(trimmed_panel),
Sequence = as.character(trimmed_panel))
})
}
#' @importFrom dplyr inner_join "%>%" tibble
#' @importFrom Biostrings DNAString DNAStringSet pairwiseAlignment mismatchSummary
get_panel_snps <- function(trimmed_ref_seq, marker_panel) {
marker_ref <-
tibble(MarkerID = names(trimmed_ref_seq),
ReferenceSequence = as.character(trimmed_ref_seq))
inner_join(marker_ref, marker_panel, 'MarkerID') %>%
select(MarkerID, ReferenceSequence, Sequence) %>%
group_by(MarkerID, ReferenceSequence) %>%
summarise(Sequence = list(Sequence)) %>%
ungroup() %>%
mutate(snps = map2(ReferenceSequence, Sequence, function(ref, seq) {
aln <- pairwiseAlignment(DNAStringSet(seq), DNAString(ref), type="global")
mismatchSummary(aln)$subject %>% select(Pos = SubjectPosition, Ref = Subject, Alt = Pattern)
})) %>%
unnest(snps) %>%
select(MarkerID, Pos, Ref, Alt) %>%
mutate_if(is.factor, as.character) %>%
distinct() %>%
arrange_all()
}
format_int <- function(x, min_width = 1L) {
stopifnot(rlang::is_integerish(x))
width <- max(min_width, 1 + floor(log10(max(x))))
stringr::str_pad(x, pad = "0",side = 'left', width = width)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.