## Requirement: "tibble" + "stringr" + "DECIPHER" + the program OligoArrayAux installed in the correct directory (see AmplifyDNA help)
decipher_in_silico_pcr = function(complete_sequences, data_taxo, cols_to_keep,
accession_col_name = "ACCESSION", species_col_name = "SPECIES",
primer_data = NULL, primers_names_col_name = NULL, length_col_name= NULL,
temp_col_name= NULL, gene_col_name = "GENE", P_col_name= NULL, ions_col_name= NULL,
forward_col_name = "FORWARD", reverse_col_name = "REVERSE",
primers = NULL, primers_names = NULL, gene = NULL,
taqEfficiency = T, maxDistance, maxGaps,
max_length = NULL, annealingTemp = NULL, P, ions,
minEfficiency = 0.001, includePrimers, output_name,
batchSize = 100, program_location = "C:/cygwin64/bin", write_final = F){
fragments = list()
primer_name = list()
previous_wd = getwd()
parameters = list(max_length, annealingTemp, primers_names, gene)
parameters[sapply(parameters, is.null)] = NULL
data_col_names = c("length_col_name", "temp_col_name", "gene_col_name", "P_col_name", "ions_col_name",
"forward_col_name", "reverse_col_name")
accession_pos = which(colnames(data_taxo) == accession_col_name)
CalculateEfficiencyPCR_custom <- function(primer,
target,
temp,
P,
ions,
batchSize=1000,
taqEfficiency=TRUE,
maxDistance=0.4,
maxGaps=2,
processors=1) {
# error checking
if (is.character(primer))
primer <- toupper(primer)
if (is(primer, "DNAStringSet"))
primer <- strsplit(toString(primer), ", ", fixed=TRUE)[[1]]
if (is(target, "DNAStringSet"))
target <- strsplit(toString(target), ", ", fixed=TRUE)[[1]]
if (!is.character(primer))
stop("primer must be a character vector or DNAStringSet.")
if (!is.character(target))
stop("target must be a character vector or DNAStringSet.")
if (!is.numeric(ions))
stop("ions must be a numeric.")
if (ions < .01 || is.nan(ions))
stop("Sodium equivilent concentration must be at least 0.01M.")
if (!is.numeric(P))
stop("P must be a numeric.")
if (!(P > 0))
stop("P must be greater than zero.")
if (!is.numeric(batchSize))
stop("batchSize must be a numeric.")
if (floor(batchSize)!=batchSize)
stop("batchSize must be a whole number.")
if (batchSize <= 0)
stop("batchSize must be greater than zero.")
if (!is.numeric(temp))
stop("temp must be a numeric.")
if (!is.logical(taqEfficiency))
stop("taqEfficiency must be a logical.")
RT <- 0.0019871*(273.15 + temp) # [kcal/mol]
l <- length(primer)
if (l==0)
stop("No primer specified.")
if (l!=length(target))
stop("primer is not the same length as target.")
# align primer and target
seqs2 <- Biostrings::reverseComplement(Biostrings::DNAStringSet(target))
seqs2 <- unlist(strsplit(toString(seqs2), ", ", fixed=TRUE))
seqs2 <- paste("----", seqs2, "----", sep="")
p <- Biostrings::pairwiseAlignment(primer,
seqs2,
type="global-local",
gapOpen=-5,
gapExtension=-5)
t_START <- nchar(seqs2) - p@subject@range@start - p@subject@range@width - 2
t_END <- nchar(seqs2) - p@subject@range@start - 3
if (taqEfficiency) {
if (!is.numeric(maxDistance))
stop("maxDistance must be a numeric.")
if (maxDistance < 0)
stop("maxDistance must be greater or equal to zero.")
if (maxDistance > 1)
stop("maxDistance must be less than or equal to one.")
if (!is.numeric(maxGaps))
stop("maxGaps must be a numeric.")
if (maxGaps < 0)
stop("maxGaps must be at least zero.")
if (!is.null(processors) && !is.numeric(processors))
stop("processors must be a numeric.")
if (!is.null(processors) && floor(processors)!=processors)
stop("processors must be a whole number.")
if (!is.null(processors) && processors < 1)
stop("processors must be at least 1.")
if (is.null(processors)) {
processors <- parallel::detectCores()
} else {
processors <- as.integer(processors)
}
seqs1 <- unlist(strsplit(toString(Biostrings::pattern(p)), ", ", fixed=TRUE))
seqs2 <- unlist(strsplit(toString(Biostrings::subject(p)), ", ", fixed=TRUE))
seqs2 <- Biostrings::reverseComplement(Biostrings::DNAStringSet(seqs2))
seqs2 <- unlist(strsplit(toString(seqs2), ", ", fixed=TRUE))
# calculate elongation efficiency
eff_taq <- .Call("terminalMismatch", seqs1, seqs2, maxDistance, maxGaps, processors, PACKAGE="DECIPHER")
} else {
eff_taq <- numeric(l)
eff_taq[] <- 1
}
t <- which(eff_taq >= .001)
tl <- length(t)
if (tl == 0)
return(eff_taq)
# duplex formation
seqs <- paste(primer[t],
target[t],
sep=" ")
seq <- unique(seqs)
ls <- length(seq)
dG <- numeric(ls)
for (start in seq(1, ls, batchSize)) {
end <- ifelse(start + batchSize > ls, ls, start + batchSize)
dG[start:end] <- as.numeric(system(paste("hybrid-min -n DNA -t",
temp,
"-T",
temp,
"-N",
ions,
"-E -q",
paste(seq[start:end], collapse=" ")),
intern=TRUE))
}
dG1 <- numeric(l)
dG1[t] <- dG[match(seqs, seq)]
K1 <- exp(-dG1/RT)
eff <- P*K1/(1 + P*K1)*eff_taq
eff_hyb <- P*K1/(1 + P*K1) ##############
if (tl != l)
eff[-t] <- eff_taq[-t]
z <- which(eff >= .001)
l <- length(z)
if (l == 0)
return(eff)
primer <- primer[z]
target <- target[z]
K1 <- K1[z]
dG1 <- dG1[z]
eff_taq <- eff_taq[z]
t_START <- t_START[z]
t_END <- t_END[z]
# primer folding
seqs <- primer
seq <- unique(seqs)
ls <- length(seq)
dG <- numeric(ls)
for (start in seq(1, ls, batchSize)) {
end <- ifelse(start + batchSize > ls, ls, start + batchSize)
dG[start:end] <- as.numeric(system(paste("hybrid-ss-min -n DNA -t",
temp,
"-T",
temp,
"-N",
ions,
"-E -q",
paste(seq[start:end], collapse=" ")),
intern=TRUE))
}
dG2a <- numeric(l)
dG2a <- dG[match(seqs, seq)]
K2a <- exp(-dG2a/RT)
# primer-primer duplex
seqs <- paste(primer,
primer,
sep=" ")
seq <- unique(seqs)
ls <- length(seq)
dG <- numeric(ls)
for (start in seq(1, ls, batchSize)) {
end <- ifelse(start + batchSize > ls, ls, start + batchSize)
dG[start:end] <- as.numeric(system(paste("hybrid-min -n DNA -t",
temp,
"-T",
temp,
"-N",
ions,
"-E -q",
paste(seq[start:end], collapse=" ")),
intern=TRUE))
}
dG2b <- numeric(l)
dG2b <- dG[match(seqs, seq)]
K2b <- exp(-dG2b/RT)
# target folding
seqs <- substr(target, t_START, t_END)
seq <- unique(seqs)
ls <- length(seq)
dG <- numeric(ls)
for (start in seq(1, ls, batchSize)) {
end <- ifelse(start + batchSize > ls, ls, start + batchSize)
dG[start:end] <- as.numeric(system(paste("hybrid-ss-min -n DNA -t",
temp,
"-T",
temp,
"-N",
ions,
"-E -q",
paste(seq[start:end], collapse=" ")),
intern=TRUE))
}
dG3 <- numeric(l)
dG3 <- dG[match(seqs, seq)]
K3 <- exp(-dG3/RT)
K2eff <- numeric(l)
K2eff[] <- -1
w <- which(K2b==0)
K2eff[w] <- 0
w <- which(K2b*P/K2a < .01)
K2eff[w] <- K2a[w]
w <- which(8*K2b*P > (1 + K2a)^2)
K2eff[w] <- K2b[w]
w <- which(K2eff==-1)
K2eff[w] <- 4*K2b[w]*P/(-1 - K2a[w] + sqrt((1 + K2a[w])^2 - 8*K2b[w]*P)) - 1
w <- which(K2eff < 0)
K2eff[w] <- 0
Kov <- K1/((1 + K2eff)*(1 + K3))
eff[z] <- P*Kov/(1 + P*Kov)*eff_taq
eff_hyb[z] <- P*Kov/(1 + P*Kov) ##########
eff_taq[z] <- eff_taq ##########
return(list(eff, eff_taq, eff_hyb)) #########
}
AmplifyDNA_custom <- function(primers,
myDNAStringSet,
maxProductSize,
annealingTemp,
P,
ions=0.2,
includePrimers=TRUE,
minEfficiency=0.001,
...) {
# error checking
if (is.character(primers))
primers <- Biostrings::DNAStringSet(primers)
if (!is(primers, "DNAStringSet"))
stop("primers must be aDNAStringSet.")
if (any(primers==""))
stop("primers cannot contain 0 character elements.")
if (is.character(myDNAStringSet))
myDNAStringSet <- Biostrings::DNAStringSet(myDNAStringSet)
if (!is(myDNAStringSet, "DNAStringSet"))
stop("myDNAStringSet must be a DNAStringSet.")
if (length(myDNAStringSet)==0)
stop("myDNAStringSet is empty.")
if (!is.numeric(ions))
stop("ions must be a numeric.")
if (ions < .01 || is.nan(ions))
stop("Sodium equivilent concentration must be at least 0.01M.")
if (!is.numeric(P))
stop("P must be a numeric.")
if (!(P > 0))
stop("P must be greater than zero.")
if (!is.numeric(annealingTemp))
stop("annealingTemp must be a numeric.")
if (!is.numeric(maxProductSize))
stop("maxProductSize must be a numeric.")
if (maxProductSize <= 0)
stop("maxProductSize must be greater than zero")
maxProductSize <- as.integer(maxProductSize)
if (!is.logical(includePrimers))
stop("includePrimers must be a logical.")
a <- Biostrings::vcountPattern("-", myDNAStringSet)
if (any(a > 0))
stop("Gap characters ('-') must be removed before amplification.")
a <- Biostrings::vcountPattern("+", myDNAStringSet)
if (any(a > 0))
stop("Mask characters ('+') must be removed before amplification.")
a <- Biostrings::vcountPattern(".", myDNAStringSet)
if (any(a > 0))
stop("Unknown characters ('.') must be removed before amplification.")
primers <- DECIPHER::Disambiguate(primers)
l <- unlist(lapply(primers, length))
l <- rep(seq_along(primers), l)
primers <- unlist(primers)
ns <- names(myDNAStringSet)
w <- Biostrings::width(myDNAStringSet)
w <- cumsum(w)
w <- w[-length(w)]
starts <- c(1L,
w + seq_along(w)*maxProductSize + 1L)
myDNAStringSet <- unlist(myDNAStringSet)
if (length(w) > 0)
myDNAStringSet <- Biostrings::replaceAt(myDNAStringSet,
IRanges::IRanges(start=w + 1L,
width=0),
paste(rep("-", maxProductSize),
collapse=""))
f <- IRanges::IRanges()
fp <- integer()
for (i in 1:length(primers)) {
temp <- Biostrings::matchPattern(primers[[i]],
myDNAStringSet,
max.mismatch=ceiling(0.25*nchar(primers[[i]])),
with.indels=TRUE)
temp <- as(temp, "IRanges")
f <- c(f, temp)
fp <- c(fp, rep(i, length(temp)))
}
f <- IRanges::Views(myDNAStringSet, start(f), end(f))
if (length(f)==0)
return(Biostrings::DNAStringSet())
r <- IRanges::IRanges()
rp <- integer()
for (i in 1:length(primers)) {
temp <- Biostrings::matchPattern(Biostrings::reverseComplement(primers[[i]]),
myDNAStringSet,
max.mismatch=ceiling(.25*nchar(primers[[i]])),
with.indels=TRUE)
temp <- as(temp, "IRanges")
r <- c(r, temp)
rp <- c(rp, rep(i, length(temp)))
}
r <- IRanges::Views(myDNAStringSet, start(r), end(r))
if (length(r)==0)
return(Biostrings::DNAStringSet())
ends <- end(r)
o <- order(ends)
r <- r[o]
rp <- rp[o]
targets <- Biostrings::extractAt(myDNAStringSet,
at=IRanges::IRanges(start=start(f),
end=end(f)))
######
forward_score <- CalculateEfficiencyPCR_custom(primers[fp],
Biostrings::reverseComplement(targets),
annealingTemp, P, ions, ...)
fe <- forward_score[[1]]
######
fw <- which(fe >= minEfficiency)
######
fe_el_eff <- forward_score[[2]]
fe_hy_eff <- forward_score[[3]]
######
if (length(fw)==0)
return(Biostrings::DNAStringSet())
targets <- Biostrings::extractAt(myDNAStringSet,
at=IRanges::IRanges(start=start(r),
end=end(r)))
######
reverse_score <- CalculateEfficiencyPCR_custom(primers[rp],
targets,
annealingTemp, P, ions, ...)
re <- reverse_score[[1]]
######
rw <- which(re >= minEfficiency)
######
re_el_eff <- reverse_score[[2]]
re_hy_eff <- reverse_score[[3]]
######
if (length(rw)==0)
return(Biostrings::DNAStringSet())
sf <- start(f)[fw]
sr <- start(r)[rw]
ef <- end(f)[fw]
er <- end(r)[rw]
b <- e <- fs <- rs <- integer(1e6)
effs <- numeric(1e6)
######
el_eff <- numeric(1e6)
hy_eff <- numeric(1e6)
######
c <- 0L
for (i in 1:length(sf)) {
w <- .Call("boundedMatches",
sr,
as.integer(sf[i] + 1L),
as.integer(sf[i] + maxProductSize - 1L),
PACKAGE="DECIPHER")
if (length(w) > 0) {
r <- (c + 1L):(c + length(w))
c <- c + length(w)
if (includePrimers) {
b[r] <- ef[i] + 1L
e[r] <- sr[w] - 1L
} else {
b[r] <- sf[i]
e[r] <- er[w]
}
effs[r] <- sqrt(fe[fw[i]]*re[rw[w]])
######
el_eff[r] <- sqrt(fe_el_eff[fw[i]]*re_el_eff[rw[w]])
hy_eff[r] <- sqrt(fe_hy_eff[fw[i]]*re_hy_eff[rw[w]])
######
fs[r] <- fp[fw[i]]
rs[r] <- rp[rw[w]]
}
}
if (c > 0) {
length(b) <- length(e) <- length(effs) <- c
o <- order(effs, decreasing=TRUE)
index <- sapply(b[o],
function(x) {
tail(which(starts <= x), n=1)
})
if (!is.null(ns))
index <- ns[index]
if (includePrimers) {
extra <- ifelse(b > e, b - e - 1L, 0) # primers overlap
e <- ifelse(b > e, b - 1L, e) # primers overlap
v <- IRanges::Views(myDNAStringSet,
start=b[o],
end=e[o])
v <- as(v, "DNAStringSet")
v <- Biostrings::xscat(substring(primers[fs[o]],
1L,
Biostrings::width(primers[fs[o]]) - extra[o]),
v,
Biostrings::reverseComplement(primers[rs[o]]))
names(v) <- paste(round(effs[o], 3), " ", round(el_eff[o], 3), " ", round(hy_eff[o], 3), ##############
" (",
l[fs[o]],
" x ",
l[rs[o]],
") ",
index,
sep="")
} else {
v <- IRanges::Views(myDNAStringSet,
start=b[o],
end=e[o],
names=paste(round(effs[o], 3), " ", round(el_eff[o], 3), " ", round(hy_eff[o], 3), ##############
" (",
l[fs[o]],
" x ",
l[rs[o]],
") ",
index,
sep=""))
v <- as(v, "DNAStringSet")
}
w <- which(Biostrings::width(v) > maxProductSize)
if (length(w) > 0)
v <- v[-w]
} else {
v <- Biostrings::DNAStringSet()
}
return(v)
}
if(is.null(max_length) || is.null(primers) || is.null(annealingTemp)) {if(is.null(primer_data))
stop('Provide a table with primer informations containing columns with primers (default: "FORWARD" & "REVERSE"), with maximum lengths and annealing temperatures.')}
if(length(parameters) != 0) {if(any(sapply(parameters, length) > 1))
stop('The vectors "max_length", "primers_names", "annealingTemp" & "gene" must be of length 1.')}
if(!is.null(length_col_name)) {if(length(length_col_name) > 2 || is.na(length_col_name) || length(length_col_name) == 0)
stop('The number of column names specifying the maximum length must be 1 or 2.')}
if(!all(unlist(sapply(c(data_col_names), function(x) get(x))) %in% colnames(primer_data)) && !is.null(primer_data)){
missing_pos = which(!(unlist(sapply(c(data_col_names), function(x) get(x))) %in% colnames(primer_data)))
missing_col = unlist(sapply(c(data_col_names), function(x) get(x)))[missing_pos]
missing_col = unique(gsub('[[:digit:]]+', '', names(missing_col)))
stop(paste0('Some columns names ("', paste(missing_col, collapse = '", "'),
'") are not in the dataframe specified in the "primer_data" argument.'))
}
if(!any(colnames(data_taxo) == accession_col_name))
stop(paste('The table provided in the "data_taxo" argument must contain a column named', accession_col_name))
if(!is.null(primer_data)) nb_iterations = nrow(primer_data)
else nb_iterations = length(parameters[[1]])
for(i in 1:nb_iterations){
setwd(program_location)
start = Sys.time()
if(length(length_col_name) == 2 && is.null(max_length) && !is.null(primer_data)) {
max_length_pcr = c(primer_data[i,][[length_col_name[1]]], primer_data[i,][[length_col_name[2]]])
max_length_pcr = max(max_length_pcr[!is.na(max_length_pcr)]) * 2
}
else if(length(length_col_name) == 1 && is.null(max_length) && !is.null(primer_data))
max_length_pcr = primer_data[i,][[length_col_name]]
else max_length_pcr = max_length
if(is.null(primers) && !is.null(primer_data)) primers_pcr = Biostrings::DNAStringSet(c(primer_data[i,][[forward_col_name]],
primer_data[i,][[reverse_col_name]]))
else primers_pcr = primers
if(is.null(annealingTemp) && !is.null(primer_data)) annealingTemp_pcr = primer_data[i,][[temp_col_name]]
else annealingTemp_pcr = annealingTemp
if(length(P) == 0 && !is.null(primer_data)) P_pcr = primer_data[i,][[P_col_name]]
else P_pcr = P
if(length(ions) == 0 && !is.null(primer_data)) ions_pcr = primer_data[i,][[ions_col_name]]
else ions_pcr = ions
if(is.null(gene) && !is.null(primer_data)) gene_pcr = primer_data[i,][[gene_col_name]]
else gene_pcr = gene
if(is.null(primers_names) && !is.null(primer_data)) primers_names_pcr = primer_data[i,][[primers_names_col_name]]
else primers_names_pcr = primers_names
withCallingHandlers({
fragments[i] = AmplifyDNA_custom(primers = primers_pcr,
myDNAStringSet = complete_sequences,
maxDistance = maxDistance,
maxGaps = maxGaps,
P = P_pcr,
ions = ions_pcr,
maxProductSize = max_length_pcr,
annealingTemp = annealingTemp_pcr,
minEfficiency = minEfficiency,
includePrimers = includePrimers,
taqEfficiency = taqEfficiency,
processors = NULL,
batchSize = batchSize)
}, warning=function(w) {
if (startsWith(conditionMessage(w), "implicit list embedding of S4 objects is deprecated"))
invokeRestart("muffleWarning")
})
end = Sys.time()
duration = difftime(end, start)
cat(paste0("In silico PCR for ", primers_names_pcr, " DONE\n"))
cat(paste("Time taken:", round(duration[[1]], 2), units(duration), "\n"))
cat("------------------------------------\n")
cat("\n")
setwd(previous_wd)
if(length(fragments[i][[1]]) != 0){
accession = stringr::word(names(fragments[i][[1]]), -1)
taxo_eco = data_taxo[which(data_taxo[[accession_col_name]] %in% unlist(accession)) , c(cols_to_keep, accession_pos)]
accession_pos2 = which(colnames(taxo_eco) == accession_col_name)
species_pos = which(colnames(taxo_eco) == species_col_name)
taxo_eco = taxo_eco[match(unlist(accession), taxo_eco[[accession_col_name]]), -accession_pos2]
final = data.frame(PRIMER_SET = rep(primers_names_pcr, length(accession)),
GENE = rep(gene_pcr, length(accession)),
ACCESSION = accession,
SPECIES = taxo_eco[, species_pos],
LENGTH = Biostrings::width(fragments[i][[1]]),
AMPLIFICATION_EFFICIENCY = stringr::word(names(fragments[i][[1]]), 1),
ELONGATION_EFFICIENCY = stringr::word(names(fragments[i][[1]]), 2),
HYBRIDATION_EFFICIENCY = stringr::word(names(fragments[i][[1]]), 3),
PRIMERS_USED = unlist(regmatches(names(fragments[i][[1]]), gregexpr("(?<=\\().*?(?=\\))",
names(fragments[i][[1]]), perl=T))),
taxo_eco[, setdiff(c(1:ncol(taxo_eco)), species_pos)],
SEQUENCES = paste(fragments[i][[1]]))
write.csv(final, paste0(output_name, " - ", primers_names_pcr, ".csv"), row.names = F)
}
else{
final = data.frame(matrix(NA, nrow = 1, ncol = 10 + length(cols_to_keep) - 1))
final[1,1] = primers_names_pcr
final[1,2] = gene_pcr
taxo_eco_columns = data_taxo[, c(cols_to_keep, accession_pos)]
taxo_eco_columns = taxo_eco_columns[, -which(colnames(taxo_eco_columns) %in% c(species_col_name, accession_col_name))]
colnames(final) = c("PRIMER_SET", "GENE", "ACCESSION", "SPECIES", "LENGTH", "AMPLIFICATION_EFFICIENCY",
"ELONGATION_EFFICIENCY", "HYBRIDATION_EFFICIENCY", "PRIMERS_USED",
colnames(taxo_eco_columns), "SEQUENCES")
warning(paste(primers_names_pcr[i], "did not amplified any fragment!"))
}
if(i == 1) {
final_df = final
}
else {
final_df = rbind(final_df, final)
}
}
if(write_final){
write.csv(final_df, paste0(output_name, " - All primers.csv"), row.names = F)
}
return(tibble(final_df))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.