R/utility.R

Defines functions process_result_file get_snps_set read_sequences_from_fastq match_count reverse_complement scramble_sequence translate_position

Documented in get_snps_set match_count process_result_file read_sequences_from_fastq reverse_complement scramble_sequence translate_position

#Utility functions

#' \code{translate_position}
#' 
#' @description
#' \code{translate_position} translate the scambled position in the alignment
#' to the original position or vice versa
#' @param position the position to translate
#' @param positions_table the table containing the original and scrambled positions
#' @param to the direction to translate, either "original" or "scrambled"
#' @return the translated position
#' @export
translate_position <- function(position, positions_table, to = "original") {
    if (to == "original") {
        return(positions_table$original[match(position, positions_table$scrambled)])
    } else if (to == "scrambled") {
        return(positions_table$scrambled[match(position, positions_table$original)])
    } else {
        stop("to must be either original or scrambled")
    }
}

#' \code{scramble_sequence}
#' 
#' @description
#' \code{scramble_sequence} scramble the orthologous matrix based on a seed
#' @param seqc the sequence to scramble
#' @param seed the seed to use for scrambling
#' @return a named list, containing the scambled sequence and the new positions
#' @export 
scramble_sequence <- function(seqc, seed){
    set.seed(seed)

    result <- seqc
    if (inherits(seqc, "processed_seqs")) {
        ### use the seqc & update the excluded position
        seqc <- seqc$seqc
    }

    new_order <- sample(seq_len(length(seqc[[1]])), length(seqc[[1]]), replace = FALSE)
    new_seqc <- lapply(seqc, function(x) {
        return(x[new_order])
    })

    positions_table <- data.frame(original = seq_len(length(seqc[[1]])), scrambled = new_order)

    if (inherits(seqc, "processed_seqs")) {
        result$seqc <- new_seqc
        result$excluded_positions <- translate_position(result$excluded_positions, positions_table, to = "scrambled")
        result$positions_table <- positions_table
    } else {
        result <- list(new_seqc = new_seqc,
            positions_table=positions_table)
    }

    return(result)
}

#' \code{reverse_complement}
#'
#' @description
#' \code{reverse_complement} returns the reverse complement of the
#' given sequence
#' @param seq the sequence to reverse complement
#' @return reverse complemented sequence
#' @export
reverse_complement <- function(seq) {
    seq <- rev(strsplit(seq, split = "")[[1]])
    result <- lapply(seq, function(x) {
        if (toupper(x) == "A") {
            return("T")
        }
        if (toupper(x) == "C") {
            return("G")
        }
        if (toupper(x) == "T") {
            return("A")
        }
        if (toupper(x) == "G") {
            return("C")
        }
        return(x)
    })
    return(paste(result, collapse = ""))
}

#' \code{match_count}
#'
#' @description
#' \code{match_count} return the number of matches of the target string in the
#' given sequence
#' @param target the search target
#' @param search_from the sequence to search from
#' @return number of matches
#' @export
match_count <- function(target, search_from) {
    matches <- gregexpr(paste(target, collapse = ""),
        paste(search_from, collapse = ""))
    found <- attr(matches[[1]], "match.length")
    if (length(found) == 1) {
        if (found[1] == -1) {
           return(0)
        }
    }
    return(length(found))
}

#' \code{read_sequences_from_fastq}
#'
#' @description
#' \code{read_sequences_from_fastq} get the sequences
#' from a fastq file, it completely ignores the quality scores
#' @param fastq_file location of the fastq file
#' @param force_to_upper whether to transform sequences
#' to upper case, default to TRUE
#' @param max_n_reads maximum number of reads to read, default to -1 (all)
#' @param skip_n_reads number of reads to skip, default to 0
#' @param output_quality whether to output the quality scores, default to TRUE
#' @param bp BiocParallel backend to use for parallelization
#' @param quality_offset the quality offset to use, default to 33
#' @return will return a list of sequences, with qualities as attribute
#' @importFrom BiocParallel bplapply MulticoreParam
#' @export
read_sequences_from_fastq <- function(fastq_file, force_to_upper = TRUE, skip_n_reads = 0,
    max_n_reads = -1, output_quality = TRUE, quality_offset = 33, bp = MulticoreParam()) {

    if (max_n_reads != -1 && skip_n_reads != 0) {
        n_lines <- (max_n_reads * 4) + (skip_n_reads * 4)
    } else {
        n_lines <- (max_n_reads * 4)
    }

    lines <- readLines(fastq_file, n = n_lines)
    if (skip_n_reads != 0){
        if ((4 * skip_n_reads + 1) > length(lines)){
            warning("skip_n_reads is too large: No reads beyond that")
            return(NULL)
        }
        lines <- lines[seq(4 * skip_n_reads + 1, length(lines), 1)]
    }
    seqs_id <- lines[seq(1, length(lines), 4)]
    seqs <- lines[seq(2, length(lines), 4)]
    

    seqs_id <- unlist(bplapply(seqs_id, function(id) {
        id <- strsplit(id, split = "")[[1]]
        return(paste0(id[2:length(id)], collapse = ""))
    }, BPPARAM = bp))

    if (force_to_upper) {
        seqs <- unlist(bplapply(seqs, function(seq) {
            return(toupper(seq))
        }, BPPARAM = bp))
    }

    seqs <- as.list(seqs)
    names(seqs) <- seqs_id
    if (output_quality) {
        qualities_data <- lines[seq(4, length(lines), 4)]
        qualities <- bplapply(qualities_data, function(x) {
            return(utf8ToInt(x) - quality_offset)
        }, BPPARAM = bp)
        attr(seqs, "qualities") <- qualities
    }
    return(seqs)
}

#' \code{get_snps_set}
#'
#' @description
#' \code{get_snps_set} extract the SNP sets from the output of
#' `find_optimised_snps`.
#' @param results output from `find_optimised_snps`
#' @param as output format, either `data.frame` or `list`.
#' @importFrom utils tail
#' @return will return either 
#' 1. a dataframe containing SNPs_set (SNP position separated by ",") and score
#' 2. a list containing SNPs_set (SNP position as numeric vector) and score (attr of the list)
#' @export
get_snps_set <- function(results, as = "data.frame") {
    if (!as %in% c("data.frame", "list"))
        stop("as must be either data.frame or list")

    snp_sets <- sapply(results$results, function(x){
        return(
            tail(names(x), n = 1)
        )
    })
    score <- sapply(results$results, function(x){
        return(
            tail(x, n = 1)[[1]]
        )
    })
    if (as == "data.frame"){
        result <- data.frame(SNPs_set = snp_sets, score = score)
    } else {
        result <- unname(lapply(strsplit(snp_sets, split = ", "), as.numeric))
        attr(result, "score") <- unname(score)
    }
    return(result)
}

#' \code{process_result_file}
#'
#' @description
#' \code{process_result_file} extract the SNP sets from the saved output file.
#' @param result_filepath is the path of the saved output file.
#' @return will return a list containing SNPs_set (SNP position as numeric vector).
#' @export
process_result_file <- function(result_filepath) {
    f <- file(result_filepath, "r")
    preparse <- readLines(f)
    result <- list()
    cur <- 1
    is_result <- FALSE
    for (i in seq_len(length(preparse))) {
        if (length(grep("^Result", preparse[[i]])) >= 1) {
            is_result <- TRUE
        }
        if (is_result) {
            if (gsub("\\s+", "", preparse[[i + 1]]) == "") {
                result[[cur]] <- as.numeric(
                    strsplit(
                        gsub(
                            "\"", "",
                            strsplit(preparse[[i]], split = "\t")[[1]][1]
                        ),
                        split = ", "
                    )[[1]]
                )
                cur <- cur + 1
                is_result <- FALSE
            }
        }
    }
    close(f)
    final_selected <- (result)
    return(final_selected)
}
ludwigHoon/minSNPs documentation built on March 25, 2024, 11:54 a.m.