#' @title trimMitoAlignments
#'
#' @description Function for batch trimming a folder of alignments, with the various trimming functions available to select from
#'
#' @param alignment.dir path to a folder of sequence alignments in phylip format.
#'
#' @param alignment.format available input alignment formats: fasta or phylip
#'
#' @param output.dir contigs are added into existing alignment if algorithm is "add"
#'
#' @param output.format available output formats: phylip
#'
#' @param sample.similarity remove samples too divergent from consensus, values 0-1 for proportion similar sites
#'
#' @param TrimAl if a file name is provided, save.name will be used to save aligment to file as a fasta
#'
#' @param TrimAl.path path to a folder of sequence alignments in phylip format.
#'
#' @param trim.external give a save name if you wnat to save the summary to file.
#'
#' @param min.external.percent TRUE to supress mafft screen output
#'
#' @param trim.coverage path to a folder of sequence alignments in phylip format.
#'
#' @param min.coverage.percent contigs are added into existing alignment if algorithm is "add"
#'
#' @param trim.column algorithm to use: "add" add sequences with "add.contigs"; "localpair" for local pair align. All others available
#'
#' @param min.column.gap.percent TRUE applies the adjust sequence direction function of MAFFT
#'
#' @param alignment.assess if a file name is provided, save.name will be used to save aligment to file as a fasta
#'
#' @param min.sample.bp path to a folder of sequence alignments in phylip format.
#'
#' @param min.align.length give a save name if you wnat to save the summary to file.
#'
#' @param min.taxa.count TRUE to supress mafft screen output
#'
#' @param min.gap.percent if a file name is provided, save.name will be used to save aligment to file as a fasta
#'
#' @param overwrite TRUE to supress mafft screen output
#'
#' @return an alignment of provided sequences in DNAStringSet format. Also can save alignment as a file with save.name
#'
#' @examples
#'
#' your.tree = ape::read.tree(file = "file-path-to-tree.tre")
#' astral.data = astralPlane(astral.tree = your.tree,
#' outgroups = c("species_one", "species_two"),
#' tip.length = 1)
#'
#'
#' @export
trimMitoAlignments = function(alignment.dir = NULL,
alignment.format = "phylip",
output.dir = NULL,
output.format = "phylip",
TAPER = FALSE,
TAPER.path = NULL,
julia.path = NULL,
TrimAl = FALSE,
TrimAl.path = NULL,
trim.external = TRUE,
min.external.percent = 50,
trim.coverage = TRUE,
min.coverage.percent = 50,
trim.column = TRUE,
min.column.gap.percent = 100,
convert.ambiguous.sites = FALSE,
alignment.assess = TRUE,
min.coverage.bp = 0,
min.alignment.length = 0,
min.taxa.alignment = 0,
max.alignment.gap.percent = 0,
threads = 1,
memory = 1,
overwrite = FALSE) {
# #devtools::install_github("chutter/PhyloProcessR", upgrade = "never")
# library(PhyloProcessR)
# library(foreach)
#
#
# #Save directory
# #work.dir = "/Volumes/Rodents/Murinae/Trimming"
# #align.dir = "/Volumes/Rodents/Murinae/Trimming/genes-untrimmed"
# #feat.gene.names = "/Volumes/Rodents/Murinae/Selected_Transcripts/Mus_gene_metadata.csv"
# #work.dir = "/home/c111h652/scratch/Rodents/Trimming"
#
# work.dir = "/home/c111h652/scratch/Rodents/Trimming"
# align.dir = "/home/c111h652/scratch/Rodents/Trimming/Emily/genes_untrimmed"
#
# work.dir = "/Volumes/Rodents/Murinae/Trimming"
# align.dir = "/Volumes/Rodents/Murinae/Trimming/Emily/genes_untrimmed"
# out.name = "Emily"
#
#
# setwd(work.dir)
# alignment.dir = align.dir
# output.dir = paste0(out.name, "/genes_trimmed2")
# alignment.format = "phylip"
# output.format = "phylip"
# TAPER = TRUE
# TAPER.path = "/usr/local/bin"
# julia.path = "/Applications/Julia-1.6.app/Contents/Resources/julia/bin"
# #TAPER.path = "/home/c111h652/programs"
# #julia.path = "/programs/julia/bin"
# TrimAl = TRUE
# TrimAl.path = "/Users/chutter/miniconda3/bin"
# trim.external = TRUE
# min.external.percent = 50
# trim.coverage = TRUE
# min.coverage.percent = 35
# trim.column = TRUE
# min.column.gap.percent = 50
# convert.ambiguous.sites = FALSE
# alignment.assess = TRUE
# min.coverage.bp = 60
# min.alignment.length = 100
# min.taxa.alignment = 5
# max.alignment.gap.percent = 50
# threads = 2
# memory = 6
# overwrite = FALSE
# resume = TRUE
if (is.null(TrimAl.path) == FALSE){
b.string = unlist(strsplit(TrimAl.path, ""))
if (b.string[length(b.string)] != "/") {
TrimAl.path = paste0(append(b.string, "/"), collapse = "")
}#end if
} else { TrimAl.path = NULL }
if (alignment.dir == output.dir){ stop("You should not overwrite the original alignments.") }
# if (dir.exists(output.dir) == FALSE) { dir.create(output.dir) }
if (dir.exists(output.dir) == TRUE) {
if (overwrite == TRUE){
system(paste0("rm -r ", output.dir))
dir.create(output.dir)
}
} else { dir.create(output.dir) }
#Gathers alignments
align.files = list.files(alignment.dir)
if (length(align.files) == 0) { stop("alignment files could not be found.") }
#Skips files done already if resume = TRUE
if (overwrite == FALSE){
done.files = list.files(output.dir)
align.files = align.files[!gsub("\\..*", "", align.files) %in% gsub("\\..*", "", done.files)]
}
if (length(align.files) == 0) { stop("All alignments have already been completed and overwrite = FALSE.") }
#Data to collect
header.data = c("Alignment", "Pass", "startSamples", "trimalSamples",
"edgeSamples", "columnSamples", "covSamples",
"startLength", "trimalLength",
"edgeLength", "columnLength", "covLength",
"startBasepairs", "tapirBasepairs", "trimalBasepairs",
"edgeBasepairs", "columnBasepairs", "covBasepairs",
"startGaps", "trimalGaps",
"edgeGaps", "columnGaps", "covGaps",
"startPerGaps", "trimalPerGaps",
"edgePerGaps", "columnPerGaps", "covPerGaps")
save.data = data.table::data.table(matrix(as.double(0), nrow = length(align.files), ncol = length(header.data)))
data.table::setnames(save.data, header.data)
save.data[, Alignment:=as.character(Alignment)]
save.data[, Pass:=as.logical(Pass)]
#Sets up multiprocessing
#cl = parallel::makeCluster(threads, outfile = "")
#doParallel::registerDoParallel(cl)
#mem.cl = floor(memory/threads)
#Loops through each locus and does operations on them
#save.data = foreach::foreach(i=1:length(align.files), .combine = rbind, .packages = c("PhyloProcessR", "foreach", "Biostrings","Rsamtools", "ape", "stringr", "data.table")) %dopar% {
for (i in 1:length(align.files)){
print(paste0(align.files[i], " Starting..."))
#Load in alignments
if (alignment.format == "phylip"){
align = Biostrings::readAAMultipleAlignment(file = paste0(alignment.dir, "/", align.files[i]), format = "phylip")
# align = Biostrings::readDNAStringSet(file = paste0(alignment.dir, "/", align.files[i]), format = "phylip")
# align = readLines(paste0(alignment.dir, "/", align.files[i]))[-1]
# align = gsub(".*\\ ", "", align)
# char.count = nchar(align)
align = Biostrings::DNAStringSet(align)
save.name = gsub(".phy$", "", align.files[i])
save.name = gsub(".phylip$", "", save.name)
}#end phylip
if (alignment.format == "fasta"){
align = Biostrings::readDNAStringSet(paste0(alignment.dir, "/", align.files[i]) )
save.name = gsub(".fa$", "", align.files[i])
save.name = gsub(".fasta$", "", save.name)
}#end phylip
# Runs the functions
#######
#Step 1: Strip Ns
non.align = PhyloProcessR::replaceAlignmentCharacter(alignment = align,
char.find = "N",
char.replace = "-")
#Convert ambiguous sites
if (convert.ambiguous.sites == TRUE){
non.align = PhyloProcessR::convertAmbiguousConsensus(alignment = non.align)
}#end phylip
#Summarize all this, no functoin
data.table::set(save.data, i = as.integer(1), j = match("Alignment", header.data), value = save.name)
data.table::set(save.data, i = as.integer(1), j = match("startSamples", header.data), value = length(non.align))
data.table::set(save.data, i = as.integer(1), j = match("startLength", header.data), value = Biostrings::width(non.align)[1] )
gap.count = PhyloProcessR::countAlignmentGaps(non.align)
data.table::set(save.data, i = as.integer(1), j = match("startBasepairs", header.data), value = gap.count[2] - gap.count[1])
data.table::set(save.data, i = as.integer(1), j = match("startGaps", header.data), value = gap.count[1])
data.table::set(save.data, i = as.integer(1), j = match("startPerGaps", header.data), value = gap.count[3])
#Step 3. Trimal trimming
if (TrimAl == TRUE && length(non.align) != 0){
trimal.align = PhyloProcessR::trimTrimal(alignment = non.align,
trimal.path = TrimAl.path,
quiet = TRUE)
non.align = trimal.align
#Saves stat data
data.table::set(save.data, i = as.integer(1), j = match("trimalSamples", header.data), value = length(trimal.align))
data.table::set(save.data, i = as.integer(1), j = match("trimalLength", header.data), value = Biostrings::width(trimal.align)[1])
gap.count = PhyloProcessR::countAlignmentGaps(non.align)
data.table::set(save.data, i = as.integer(1), j = match("trimalBasepairs", header.data), value = gap.count[2] - gap.count[1])
data.table::set(save.data, i = as.integer(1), j = match("trimalGaps", header.data), value = gap.count[1])
data.table::set(save.data, i = as.integer(1), j = match("trimalPerGaps", header.data), value = gap.count[3])
}#end if
# Step 4. Edge trimming
if (trim.external == TRUE && length(non.align) != 0){
#external trimming function
edge.align = PhyloProcessR::trimExternal(alignment = non.align,
min.n.seq = ceiling(length(non.align) * (min.external.percent/100)),
codon.trim = F)
if (class(edge.align) == "numeric") { edge.align = DNAStringSet() }
non.align = edge.align
#Saves stat data
data.table::set(save.data, i = as.integer(1), j = match("edgeSamples", header.data), value = length(edge.align))
data.table::set(save.data, i = as.integer(1), j = match("edgeLength", header.data), value = Biostrings::width(edge.align)[1])
gap.count = PhyloProcessR::countAlignmentGaps(non.align)
data.table::set(save.data, i = as.integer(1), j = match("edgeBasepairs", header.data), value = gap.count[2] - gap.count[1])
data.table::set(save.data, i = as.integer(1), j = match("edgeGaps", header.data), value = gap.count[1])
data.table::set(save.data, i = as.integer(1), j = match("edgePerGaps", header.data), value = gap.count[3])
}#end trim external
if (trim.column == TRUE && length(non.align) != 0){
#Trim alignment colums
col.align = PhyloProcessR::trimAlignmentColumns(alignment = non.align,
min.gap.percent = min.column.gap.percent)
non.align = col.align
#Saves stat data
data.table::set(save.data, i = as.integer(1), j = match("columnSamples", header.data), value = length(col.align))
data.table::set(save.data, i = as.integer(1), j = match("columnLength", header.data), value = Biostrings::width(col.align)[1])
gap.count = PhyloProcessR::countAlignmentGaps(non.align)
data.table::set(save.data, i = as.integer(1), j = match("columnBasepairs", header.data), value = gap.count[2] - gap.count[1])
data.table::set(save.data, i = as.integer(1), j = match("columnGaps", header.data), value = gap.count[1])
data.table::set(save.data, i = as.integer(1), j = match("columnPerGaps", header.data), value = gap.count[3])
}#end trim column.
#Step 5. Evaluate and cut out each sample
if (trim.coverage == TRUE && length(non.align) != 0){
#sample coverage function
cov.align = PhyloProcessR::trimSampleCoverage(alignment = non.align,
min.coverage.percent = min.coverage.percent,
min.coverage.bp = min.coverage.bp,
relative.width = "sample")
non.align = cov.align
#Saves stat data
data.table::set(save.data, i = as.integer(1), j = match("covSamples", header.data), value = length(cov.align))
data.table::set(save.data, i = as.integer(1), j = match("covLength", header.data), value = Biostrings::width(cov.align)[1])
gap.count = PhyloProcessR::countAlignmentGaps(non.align)
data.table::set(save.data, i = as.integer(1), j = match("covBasepairs", header.data), value = gap.count[2] - gap.count[1])
data.table::set(save.data, i = as.integer(1), j = match("covGaps", header.data), value = gap.count[1])
data.table::set(save.data, i = as.integer(1), j = match("covPerGaps", header.data), value = gap.count[3])
}#end trim.external
#Step 6
if (alignment.assess == TRUE) {
#Assesses the alignment returning TRUE for pass and FALSE for fail
test.result = PhyloProcessR::alignmentAssess(alignment = non.align,
max.alignment.gap.percent = max.alignment.gap.percent,
min.taxa.alignment = min.taxa.alignment,
min.alignment.length = min.alignment.length)
data.table::set(save.data, i = as.integer(1), j = match("Pass", header.data), value = test.result)
if (test.result == FALSE){
print(paste0(align.files[i], " Failed and was discarded."))
} else {
write.temp = strsplit(as.character(non.align), "")
aligned.set = as.matrix(ape::as.DNAbin(write.temp) )
#readies for saving
PhyloProcessR::writePhylip(aligned.set, file= paste0(output.dir, "/", save.name, ".phy"), interleave = F)
}#end else test result
} else {
#If no alignment assessing is done, saves
write.temp = strsplit(as.character(non.align), "")
aligned.set = as.matrix(ape::as.DNAbin(write.temp) )
#readies for saving
PhyloProcessR::writePhylip(aligned.set, file= paste0(output.dir, "/", save.name, ".phy"), interleave = F)
}#end else
print(paste0(align.files[i], " Completed."))
#rm()
#gc()
}#end i loop
# parallel::stopCluster(cl)
#Print and save summary table
write.csv(save.data, file = paste0(output.dir, "_trimming-summary.csv"), row.names = F)
#Saves log file of things
if (file.exists(paste0(output.dir, ".log")) == TRUE){ system(paste0("rm ", output.dir, ".log")) }
fileConn = file(paste0(output.dir, ".log"), open = "w")
writeLines(paste0("Log file for ", output.dir), fileConn)
writeLines(paste0("\n"), fileConn)
writeLines(paste0("Overall trimming summary:"), fileConn)
writeLines(paste0("------------------------------------------------------------------"), fileConn)
writeLines(paste0(""), fileConn)
writeLines(paste0("Starting alignments: ", length(align.files)), fileConn)
writeLines(paste0("Trimmed alignments: ", length(save.data$Pass[save.data$Pass == TRUE])), fileConn)
writeLines(paste0("Discarded alignments: ", length(save.data$Pass[save.data$Pass == FALSE])), fileConn)
writeLines(paste0(""), fileConn)
writeLines(paste0("Mean samples removed per alignment: ",
mean(save.data$startSamples - save.data$columnSamples)), fileConn)
writeLines(paste0("Mean alignment length trimmed per alignment: ",
mean(save.data$startLength - save.data$columnLength)), fileConn)
writeLines(paste0("Mean basepairs trimmed per alignment: ",
mean(save.data$startBasepairs - save.data$columnBasepairs)), fileConn)
writeLines(paste0("Mean gaps trimmed per alignment: ",
mean(save.data$startGaps - save.data$columnGaps)), fileConn)
writeLines(paste0("Mean gap percent trimmed per alignment: ",
mean(save.data$startPerGaps - save.data$columnPerGaps)), fileConn)
writeLines(paste0(""), fileConn)
writeLines(paste0(""), fileConn)
writeLines(paste0("Individual trimming step summary:"), fileConn)
writeLines(paste0("------------------------------------------------------------------"), fileConn)
writeLines(paste0(""), fileConn)
writeLines(paste0("Starting alignments:"), fileConn)
writeLines(paste0("Mean samples: ",
mean(save.data$startSamples)), fileConn)
writeLines(paste0("Mean alignment length: ",
mean(save.data$startLength)), fileConn)
writeLines(paste0("Mean basepairs: ",
mean(save.data$startBasepairs)), fileConn)
writeLines(paste0("Mean gaps: ",
mean(save.data$startGaps)), fileConn)
writeLines(paste0("Mean gap percentage: ",
mean(save.data$startPerGaps)), fileConn)
writeLines(paste0(""), fileConn)
writeLines(paste0("hmmCleaner:"), fileConn)
writeLines(paste0("Mean samples removed: ",
mean(save.data$startSamples - save.data$hmmSamples)), fileConn)
writeLines(paste0("Mean alignment length reduction: ",
mean(save.data$startLength - save.data$hmmLength)), fileConn)
writeLines(paste0("Mean basepairs trimmed: ",
mean(save.data$startBasepairs - save.data$hmmBasepairs)), fileConn)
writeLines(paste0("Mean gap change: ",
mean(save.data$startGaps - save.data$hmmGaps)), fileConn)
writeLines(paste0("Mean gap percent change: ",
mean(save.data$startPerGaps - save.data$hmmPerGaps)), fileConn)
writeLines(paste0(""), fileConn)
writeLines(paste0("Trimal:"), fileConn)
writeLines(paste0("Mean samples removed: ",
mean(save.data$hmmSamples - save.data$trimalSamples)), fileConn)
writeLines(paste0("Mean alignment length reduction: ",
mean(save.data$hmmLength - save.data$trimalLength)), fileConn)
writeLines(paste0("Mean basepairs trimmed: ",
mean(save.data$hmmBasepairs - save.data$trimalBasepairs)), fileConn)
writeLines(paste0("Mean gap change: ",
mean(save.data$hmmGaps - save.data$trimalGaps)), fileConn)
writeLines(paste0("Mean gap percent change: ",
mean(save.data$hmmPerGaps - save.data$trimalPerGaps)), fileConn)
writeLines(paste0(""), fileConn)
writeLines(paste0("External Trimming:"), fileConn)
writeLines(paste0("Mean samples removed: ",
mean(save.data$trimalSamples - save.data$edgeSamples)), fileConn)
writeLines(paste0("Mean alignment length reduction: ",
mean(save.data$trimalLength - save.data$edgeLength)), fileConn)
writeLines(paste0("Mean basepairs trimmed: ",
mean(save.data$trimalBasepairs - save.data$edgeBasepairs)), fileConn)
writeLines(paste0("Mean gap change: ",
mean(save.data$trimalGaps - save.data$edgeGaps)), fileConn)
writeLines(paste0("Mean gap percent change: ",
mean(save.data$trimalPerGaps - save.data$edgePerGaps)), fileConn)
writeLines(paste0(""), fileConn)
writeLines(paste0("Sample Coverage Trimming:"), fileConn)
writeLines(paste0("Mean samples removed: ",
mean(save.data$edgeSamples - save.data$covSamples)), fileConn)
writeLines(paste0("Mean alignment length reduction: ",
mean(save.data$edgeLength - save.data$covLength)), fileConn)
writeLines(paste0("Mean basepairs trimmed: ",
mean(save.data$edgeBasepairs - save.data$covBasepairs)), fileConn)
writeLines(paste0("Mean gap change: ",
mean(save.data$edgeGaps - save.data$covGaps)), fileConn)
writeLines(paste0("Mean gap percent change: ",
mean(save.data$edgePerGaps - save.data$covPerGaps)), fileConn)
writeLines(paste0(""), fileConn)
writeLines(paste0("Column Coverage Trimming:"), fileConn)
writeLines(paste0("Mean samples removed: ",
mean(save.data$covSamples - save.data$columnSamples)), fileConn)
writeLines(paste0("Mean alignment length reduction: ",
mean(save.data$covLength - save.data$columnLength)), fileConn)
writeLines(paste0("Mean basepairs trimmed: ",
mean(save.data$covBasepairs - save.data$columnBasepairs)), fileConn)
writeLines(paste0("Mean gap change: ",
mean(save.data$covGaps - save.data$columnGaps)), fileConn)
writeLines(paste0("Mean gap percent change: ",
mean(save.data$covPerGaps - save.data$columnPerGaps)), fileConn)
close(fileConn)
} #end function
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.