library(minSNPs)
library(dplyr)
library(BiocParallel)
searches <- read.csv("new_30_set_searches.csv", stringsAsFactors = F)
seqs <- read_fasta("balanced_mat_without_SRR798725_single_final.fasta")
result <- data.frame(isolate=c(), number_fragment=c(), fragment=c(), result=c())
simulated_1 <- Sys.glob("./*DeepSimu")
simulated_1 <- gsub("_DeepSimu", "", gsub("./", "", simulated_1))
seed <- as.numeric(commandArgs(trailingOnly=TRUE)[[1]])
set.seed(seed)
infer_result <- function(data, searches, seqs){
s_strings <- unique(searches$search_string)
temp_result <- data.frame(fragment_id = c(), snp_id = c(),
extracted_snps = c(), match_count = c(),
stringsAsFactors = F)
for (i in seq_len(length(data))) {
c_count <- BiocParallel::bplapply(s_strings, function(string) {
match <- unlist(gregexpr(string, paste(data[i], collapse = "")))
return(match[which(match > 0)])
}, BPPARAM = BiocParallel::MulticoreParam(workers = 62))
s_m <- lapply(c_count, length)
matched_string <- s_strings[which(s_m > 0)]
match_count <- unlist(s_m[which(s_m > 0)])
match_indexes <- unlist(lapply(c_count[which(s_m > 0)], paste, collapse = ", "))
snp_id <- searches[match(matched_string, searches$search_string), "fasta_position"]
target_pos <- searches[match(matched_string, searches$search_string), "target_position"]
ext_snps <- searches[match(matched_string, searches$search_string), "snp_string"]
stopifnot(length(matched_string) == length(match_count))
stopifnot(length(matched_string) == length(match_indexes))
new_data <- data.frame(fragment_id = rep(i, length(matched_string)), snp_id = snp_id,
extracted_snps = ext_snps, match_count = match_count,
stringsAsFactors = F)
temp_result <- rbind(temp_result, new_data)
}
result_to_use <- data.frame(fragment_id = c(), snp_id = c(),
extracted_snps = c(), match_count = c(),
stringsAsFactors = F)
temp_result <- as.data.frame(temp_result %>%
group_by(snp_id, extracted_snps) %>%
summarise(match_count=sum(match_count), .groups="keep"))
# Check each SNP set
for (snp in unique(temp_result$snp_id)){
subset <- temp_result[temp_result$snp_id == snp,]
if (length(unique(subset$extracted_snps)) > 1){
highest_count <- max(subset$match_count)
if (length(unique(subset[subset$match_count == highest_count, "extracted_snps"])) > 1){
print("Found 2 match with highest/equal counts, discarding result")
next
} else {
result_to_use <- rbind(result_to_use, subset[subset$match_count == highest_count,])
}
} else{
result_to_use <- rbind(result_to_use, subset)
}
}
if (nrow(result_to_use) > 1){
snps_found <- as.numeric(strsplit(paste(result_to_use[, "snp_id"], collapse = ", "), ", ")[[1]])
out_snps <- paste(result_to_use[, "extracted_snps"], collapse = "")
pattern_ref <- minSNPs:::generate_pattern(seqs, snps_found)
possible_profiles <- lapply(names(pattern_ref)[which(pattern_ref == out_snps)], function(x) {
return(strsplit(x, split = "_")[[1]][1])
})
} else{
possible_profiles <- lapply(names(seqs), function(x) {
return(strsplit(x, split = "_")[[1]][1])
})
}
return(possible_profiles)
}
i <- 1
simulated <- simulated_1[1:50]
for (sim in simulated){
c_pipe <- pipe(
paste("sed -n '2~4p' ", sim, "_DeepSimu/fastq/pass/*.fastq", sep = "")
)
data <- readLines(
c_pipe
)
close(c_pipe)
print(paste("Running:", i, "/", length(simulated)))
pool <- seq_len(length(data))
sampled <- c()
for (n_frag in c(1, 4, rep(5, 9))){
pool_ss <- pool[which(!pool %in% sampled)]
sampled <- c(sampled, sample(pool_ss, n_frag))
temp_data <- data[sampled]
temp_result <- infer_result(temp_data, searches, seqs)
n_result <- data.frame(
isolate=sim,
number_fragment=length(sampled),
fragment=paste(sampled, collapse = ", "),
result=paste(temp_result, collapse = ", ")
)
result <- rbind(result, n_result)
}
i <- i + 1
}
write.csv(result, paste("r", seed, "30set_1_50_simulation_result.csv", sep="_"), row.names=F)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.