WIP/Long_read_Simulation/string_generation.R

### HELPERS
STEP_1 <- T

process_result_file <- function(result_filepath) {
    f <- file(result_filepath, "r")
    preparse <- readLines(f)
    result <- list()
    cur <- 1
    is_result <- FALSE
    for (i in seq_len(length(preparse))){
        if (length(grep("^Result", preparse[[i]])) >= 1){
            is_result <- TRUE
        }
        if (is_result){
            if (gsub("\\s+", "", preparse[[i + 1]]) == "") {
                result[[cur]] <- as.numeric(
                    strsplit(
                        gsub("\"", "",
                            strsplit(preparse[[i]], split = "\t")[[1]][1]),
                    split = ", ")[[1]])
                cur <- cur + 1
                is_result <- FALSE
            }
        }
    }
    close(f)
    final_selected <- (result)
    return(final_selected)
}

generate_kmers <- function(final_string, k){
    if (is.character(final_string) && length(final_string) == 1){
        final_string <- strsplit(final_string, split = "")[[1]]
    }
    kmer <- list()
    for (i in seq_len(length(final_string) - k + 1)){
        kmer[[i]] <- paste(final_string[i:(i+k-1)], collapse = "")
    }
    return(unlist(kmer))
}

process_variant_file <- function(variant_sites){
    result <- c()
    read_cc <- c()
    con <- file(variant_sites, "r")
    while(TRUE){
        data <- readLines(con, n = 3)
        if (length(data) == 0){
            break
        }
            positions <- strsplit(strsplit(data, split = "\n")[[2]], split = ", ")
            read_cc <- c(read_cc, strsplit(data, split = "\n")[[1]])
            result <- c(result, unlist(positions))
    }
    close(con)
    print(read_cc)
    result <- sort(as.numeric(unique(result)))
    return(result)
}

reverse_complement <- function(seq){
    seq <- rev(strsplit(seq, split = "")[[1]])
    result <- lapply(seq, function(x){
        if (toupper(x) == "A") {return("T")}
        if (toupper(x) == "C") {return("G")}
        if (toupper(x) == "T") {return("A")}
        if (toupper(x) == "G") {return("C")}
        return(x)
        })
    return(paste(result, collapse = ""))
}



### STEP 1
identify_overlaps <- function(selected_fasta, position_reference, prev, after, extend_length = T){
    snp_table <- data.frame(snp_id = c(), fasta_position = c(), genome_position = c(), string_start = c(), string_end = c(),
        stringsAsFactors = F)
    overlap_table <- data.frame(snp_id = c(), overlaps_genome_pos = c(), overlaps_fasta_pos = c(),
        overlaps_string_pos = c(), stringsAsFactors = F)

    selected_fasta <- c(selected_fasta)
    for (snp in selected_fasta){
        genome_pos <- position_reference[position_reference$fasta_position == snp, "genome_position"]
        string_start <- genome_pos - prev
        string_end <- genome_pos + after
        
        N_test <- 0
        snp_string_pos <- prev + 1
        # Check overlaps before
        overlaps_before <- position_reference[position_reference$genome_position >= (genome_pos - (N_test+prev)) &
            position_reference$genome_position < genome_pos, c("fasta_position", "genome_position")]
        if(extend_length){
            while((N_test + prev) - nrow(overlaps_before) < prev){
                N_test <- N_test + (nrow(overlaps_before) - N_test)
                overlaps_before <- position_reference[position_reference$genome_position >= (genome_pos - (N_test+prev)) &
                    position_reference$genome_position < genome_pos, c("fasta_position", "genome_position")]
            }
            string_start <- string_start - N_test
            snp_string_pos <- snp_string_pos + N_test
            overlaps_before$s_pos <- overlaps_before$genome_position - string_start
        }
        
        N_test <- 0
        # Check overlaps after
        overlaps_after <- position_reference[position_reference$genome_position > genome_pos &
            position_reference$genome_position <= (genome_pos + (N_test+after)), c("fasta_position", "genome_position")]
        if(extend_length){
            while((N_test + after) - nrow(overlaps_after) < after){
                N_test <- N_test + (nrow(overlaps_after) - N_test)
                overlaps_after <- position_reference[position_reference$genome_position > genome_pos &
                    position_reference$genome_position <= (genome_pos + (N_test+after)), c("fasta_position", "genome_position")]
            }
            string_end <- string_end + N_test
            overlaps_after$s_pos <- snp_string_pos + (overlaps_after$genome_position - genome_pos)
        }
        
        temp_overlap_table <- rbind(overlaps_before, overlaps_after)
        if (ncol(temp_overlap_table) >= 3){
            colnames(temp_overlap_table) <- c("overlaps_fasta_pos", "overlaps_genome_pos", "overlaps_string_pos")
        }
        temp_overlap_table$snp_id <- rep(snp, nrow(temp_overlap_table))

        snp_table <- rbind(snp_table, data.frame(
            snp_id = snp, fasta_position = as.character(paste(snp, collapse = ", ")),
                genome_position = genome_pos, string_start = string_start,
                string_end = string_end, snp_string_pos = as.character(paste(snp_string_pos, collapse = ", ")),
                stringsAsFactors = F))
        overlap_table <- rbind(overlap_table, temp_overlap_table)
    }
    return(list(snp_table = snp_table, overlap_table = overlap_table))
}

#data.frame(snp_id = c(), genome_position = c(), fasta_position = c(), string_start = c(),
#    string_end = c(), snp_string_pos = c(), stringsAsFactor = F)

#data.frame(snp_id = c(), overlaps_genome_pos = c(), overlaps_fasta_pos = c(),
#    overlaps_string_pos = c(), stringsAsFactor = F)
match_count <- function(target, search_from){
    matches <- gregexpr(paste(target, collapse=""), search_from)
    found <- 0
    if (length(matches[[1]]) == 1){
        if (matches[[1]][1] != -1){
            found <- 1
        }
    } else{
        found <- length(matches[[1]])
    }
    return(found)
}

### STEP 2
generate_search_string <- function(snp_table, overlap_table, orth_matrix, ref_seq, include_neighbour = F,
    bpparam = BiocParallel::MulticoreParam()){

    string_table <- data.frame(search_string = c(), snp_id = c(), snp_string = c(),
        n_match_genome = c(), n_match_genome_rev = c(), stringsAsFactors = F)
    if (class(ref_seq) == "list") {
        ref_seq <- ref_seq[[1]]
    }
    
    #pb = txtProgressBar(min = 0, max = nrow(snp_table), initial = 0, style = 3) 
    l_string_table <- BiocParallel::bplapply(seq_len(nrow(snp_table)), function(i){
        #setTxtProgressBar(pb,i)
        snp_id <- snp_table[i, "snp_id"]
        #tic(snp_id)
        ref_string <- ref_seq[c(snp_table[i, "string_start"]:snp_table[i, "string_end"])]
        fasta_positions <- as.numeric(strsplit(snp_table[i, "fasta_position"], split = ", ")[[1]])
        overlaps <- overlap_table[overlap_table$snp_id == snp_id,]

        if (include_neighbour){
            ##### TO-DO
            next
        } else {
            variants <- unlist(unique(minSNPs:::generate_pattern(orth_matrix, fasta_positions)))
            f_string <- lapply(variants, function(var){
                ref_string[as.numeric(strsplit(snp_table[i, "snp_string_pos"], split = ", ")[[1]])] <- var
                return(paste(ref_string, collapse = ""))
            })
            f_string <- lapply(f_string, function(string){
                string <- strsplit(string, split = "")[[1]]
                string[overlaps$overlaps_string_pos] <- "."
                return(paste(string, collapse = ""))
            })
        }
        rc_string <- lapply(f_string, reverse_complement)
        temp_string_table <- data.frame(search_string = c(unlist(f_string), unlist(rc_string)),
            snp_id = rep(snp_id, length(f_string)*2), snp_string = c(variants, variants), stringsAsFactors = F)
        temp_string_table$n_match_genome <- unlist(lapply(temp_string_table$search_string, match_count,
            search_from = paste(ref_seq, collapse = "")))
        temp_string_table$n_match_genome_rev <- unlist(lapply(temp_string_table$search_string, match_count,
            search_from = reverse_complement(paste(ref_seq, collapse = ""))))
        temp_string_table$fasta_position <- rep(snp_table[i, "fasta_position"], nrow(temp_string_table))
        #toc()
        return(temp_string_table)
    }, BPPARAM = bpparam)
    string_table <- do.call(rbind, l_string_table)
    #close(pb)
    return(string_table)
}

#data.frame(search_string = c(), snp_id = c(), fasta_position = c(), snp_string = c(),
#    n_match_genome = c(), n_match_genome_rev = c(), stringsAsFactor = F)


### STEP 3
summarise_result <- function(snp_table, overlap_table, search_string_table, excluded_position = NULL){
    new_snp_table <- snp_table
    snps <- unlist(unique(snp_table$snp_id))
    new_snp_table$multiple_match <- unlist(lapply(snps, function(id){
        subset <- snp_table[snp_table$snp_id == id, ]
        return((sum(subset$n_match_genome) > 1) | (sum(subset$n_match_genome_rev) > 1))
    }))
    new_snp_table$variant_count <- unlist(lapply(snps, function(id){
        subset <- search_string_table[search_string_table$snp_id == id,]
        return(nrow(subset)/2)
    }))
    new_snp_table$contain_excluded <- unlist(lapply(snps, function(id){
        subset <- overlap_table[overlap_table$snp_id == id, "overlaps_fasta_pos"]
        return(any(subset %in% excluded_position))
    }))
    return(new_snp_table)
}

### For gene
generate_search_string_gene <- function(gene_seq, ref_seq, k, id_prefix = "gene"){
    string_table <- data.frame(search_string = c(), string_id = c(), stringsAsFactors = FALSE)

    if (class(ref_seq) == "list") {
        ref_seq <- ref_seq[[1]]
    }

    search_strings <- BiocParallel::bplapply(gene_seq, function(gene, generate_kmers, k){
        kmers <- generate_kmers(final_string = gene, k = k)
        return(kmers)
    }, k=k, generate_kmers=generate_kmers)

    u_string <- unique(unlist(search_strings))
    rc_u_string <- unname(sapply(u_string, reverse_complement))

    string_table <- data.frame(search_string = c(u_string, rc_u_string),
        string_id = c(paste(id_prefix, seq_along(u_string), "1", sep = "_"),
            paste(id_prefix, seq_along(rc_u_string), "2", sep = "_")),
        stringsAsFactors = F)

    string_table$n_match_genome <- unlist(bplapply(string_table$search_string, match_count,
        search_from = paste(ref_seq, collapse = "")))
    string_table$n_match_genome_rev <- unlist(bplapply(string_table$search_string, match_count,
        search_from = reverse_complement(paste(ref_seq, collapse = ""))))

    return(string_table)
}



#### RUNS
library(minSNPs) 
library(BiocParallel) 

ref_matrix <- read.csv("ref_OUTF1.csv")
ort_matrix <- read_fasta("./balanced_mat_without_SRR798725_single_final.fasta")
excluded <- process_variant_file("variant_t10.csv")
result_filepath <- "evenhd_with_exc200x8_400.csv"
final_selected <- unique(unlist(process_result_file(result_filepath)))
ref_genome <- read_fasta("./Mu50.fasta")
meca <- read_fasta("mecA.fasta")
pvl <- read_fasta("pvl.fasta")

###
# final_selected <- seq_len(length(ref_matrix[[1]]))
# final_selected <- final_selected[!final_selected %in% excluded]
##
prev <- 7
after <- 7
k <- 15

if (STEP_1) {
RES_STEP_1 <- bplapply(final_selected, identify_overlaps,
    position_reference = ref_matrix, prev = prev, after = after)

snp_table <- do.call(rbind, lapply(RES_STEP_1, function(x){
    return(x$snp_table)}))
overlap_table <- do.call(rbind, lapply(RES_STEP_1, function(x){
    return(x$overlap_table)}))

write.csv(snp_table, "snp_table.csv", row.names = F)
write.csv(overlap_table, "overlap_table.csv", row.names = F)

RES_STEP_1[["snp_table"]] <- snp_table
RES_STEP_1[["overlap_table"]] <- overlap_table
}

search_string_table <- generate_search_string(RES_STEP_1[["snp_table"]],
                        RES_STEP_1[["overlap_table"]], ort_matrix, ref_genome, bpparam = MulticoreParam(workers = 6, progressbar = T))
write.csv(search_string_table, "search_string_table.csv", row.names = F)

summary <- summarise_result(RES_STEP_1[["snp_table"]], RES_STEP_1[["overlap_table"]],
                        search_string_table)
write.csv(summary, "summary.csv", row.names = F)

# For gene
meca_search_string_table <- generate_search_string_gene(meca, ref_genome, k, id_prefix = "mecA")
pvl_search_string_table <- generate_search_string_gene(pvl, ref_genome, k, id_prefix = "pvl")

gene_search_table <- rbind(meca_search_string_table, pvl_search_string_table)
write.csv(gene_search_table, "gene_search_table.csv", row.names = F)
#search_string_table <- generate_search_string(test_snp_table, test_overlap_table, ort_matrix, ref_genome, bpparam = Serial
ludwigHoon/minSNPs documentation built on March 25, 2024, 11:54 a.m.