#' The ``make.contigs`` command reads a forward fastq file and a reverse
#' fastq file and outputs new fasta and report files.
#'
#' @details The make.contigs command parameters are file, ffastq, rfastq,
#' ffasta, rfasta, fqfile, rqfile, findex, rindex, oligos, format, tdiffs,
#' bdiffs, pdiffs, align, match, mismatch, gapopen, gapextend, insert,
#' deltaq, maxee, allfiles and processors.
#'
#' Here we create a combo file for given input data to mothur: the 3 column
#' format is used for datasets where the sequences have already had the
#' barcodes and primers removed and been split into separate files.
#'
#' The first column is the group, the second is the forward fastq and
#' the third column contains the reverse fastq.
#'
#' @param algorithm [align] The align parameter allows you to specify the
#' alignment method to use. Your options are: gotoh and needleman.
#' The default is needleman.
#' @param score [match & mismatch & gapopen & gapextend] These parameters
#' are used while aligning the sequence read to determine the overlap.
#' The match parameter allows you to specify the bonus for having the
#' same base. The default is 1.0. The mistmatch parameter allows you to
#' specify the penalty for having different bases. The default is -1.0.
#' The gapopen parameter allows you to specify the penalty for opening a
#' gap in an alignment. The default is -2.0. The gapextend parameter
#' allows you to specify the penalty for extending a gap in an alignment.
#' The default is -1.0.
#'
#' @param insert The insert parameter allows you to set a quality scores
#' threshold. When we are merging the overlapping regions, in the case
#' where we are trying to decide whether to keep a base or remove it
#' because the base is compared to a gap in the other fragment, if the
#' base has a quality score below the threshold we eliminate it.
#' Default=20.
#'
const make.contigs = function(left, right,
algorithm = ["needleman", "gotoh"],
score = [1.0, -1.0, -2.0, -1.0],
insert = 20,
num_threads = 8) {
# write raw data sequence file inputs
# at here
cat(`16s\t${left}\t${right}`, file = "./16s.files");
# run mothur make.contigs command
runMothur(
command = "make.contigs",
argv = list(
file = "16s.files",
checkorient = "T",
align = algorithm[1],
match = score[1],
mismatch = score[2],
gapopen = score[3],
gapextend = score[4],
insert = insert,
processors = num_threads
),
log = "[1]make.contigs.txt"
);
# result content files should be appears in the
# output directory:
# -rw-r--r-- 1 root root 3256536 Dec 10 17:24 16s.contigs.groups
# -rw-r--r-- 1 root root 4341698 Dec 10 17:24 16s.contigs.report
# -rw-r--r-- 1 root root 49 Dec 10 17:28 16s.files
# -rw-r--r-- 1 root root 0 Dec 10 17:23 16s.scrap.contigs.fasta
# -rw-r--r-- 1 root root 0 Dec 10 17:23 16s.scrap.contigs.qual
# -rw-r--r-- 1 root root 31614742 Dec 10 17:24 16s.trim.contigs.fasta
# -rw-r--r-- 1 root root 86838453 Dec 10 17:24 16s.trim.contigs.qual
}
#' Contig file renames
#'
#' @param seqfile the source sequence file
#' @param contig.fasta the target renamed sequence file.
#'
#' @return the parameter value comes from the ``contig.fasta`` parameter.
#'
const write_contig = function(seqfile, contig.fasta = "contig.fasta") {
if (file.exists(contig.fasta)) {
print(`${contig.fasta} is already exists, going to remove it...`);
unlink(contig.fasta);
if (file.exists(contig.fasta)) {
print("unlink of target file is not working!");
} else {
print("yes!");
}
}
print(`${seqfile} => ${contig.fasta}...`);
seqfile
|> readLines()
|> writeLines(con = contig.fasta)
;
cat("done~!\n");
contig.fasta;
}
#' The ``summary.seqs`` command will summarize the quality of sequences in
#' an unaligned or aligned fasta-formatted sequence file.
#'
#' @param num_threads The processors option enables you to accelerate the
#' summary process by using multiple processors. Default processors=Autodetect
#' number of available processors and use all available.
#'
const summary.seqs = function(seqfile, count_table,
num_threads = 8,
logfile = "log.txt") {
runMothur(
command = "summary.seqs",
argv = list(fasta=seqfile,processors=num_threads),
log = logfile
);
}
#' RunAutoScreen
#'
#' @description The screen.seqs command enables you to keep sequences that
#' fulfill certain user defined criteria. Furthermore, it enables you to
#' cull those sequences not meeting the criteria from a names, group,
#' contigsreport, alignreport and summary file.
#'
const screen.seqs = function(contigs, num_threads = 8) {
using workdir as workdir(dirname(contigs)) {
const file as string = "[2]summary.seqs.txt";
const summary = list(min = 0, max = 0);
const groups = "16s.contigs.groups";
# sftp://xieguigang:biodeep123@192.168.0.207/home/xieguigang/16s/test/16s_result/[2]summary.seqs.txt
# sftp://xieguigang:biodeep123@192.168.0.207/home/xieguigang/16s/test/16s_result/contig.summary
summary.seqs(
seqfile = contigs,
count_table = NULL,
num_threads = num_threads,
logfile = file
);
print("sequence summary result:");
cat("\n");
cat(readText(file));
cat("\n\n");
# filter out all empty string line:
# argument is of length zero
for(line in readLines(file) |> which(l -> l != "")) {
const cols as string = unlist(strsplit(line, "\t", fixed = FALSE));
const header as string = cols[1];
if (header == "2.5%-tile:") {
summary$min = cols[4];
} else if (header == "97.5%-tile:") {
summary$max = cols[4];
}
}
print("Parsed result:");
str(summary);
runMothur(
command = "screen.seqs",
argv = list(
fasta = contigs,
group = groups,
maxambig = 0,
minlength = summary$min,
maxlength = summary$max
),
log = "[3]screen.seqs.txt"
);
# contig.good.fasta
# contig.bad.accnos
# 16s.contigs.good.groups
}
}
#' The unique.seqs command returns only the unique sequences found in
#' a fasta-formatted sequence file and a file that indicates those
#' sequences that are identical to the reference sequence. Often times
#' a collection of sequences will have a significant number of identical
#' sequences. It sucks up considerable processing time to have to align,
#' calculate distances, and cluster each of these sequences individually.
#'
const unique.seqs = function(contigs, logfile = "[4]unique.seqs.txt") {
using workdir as workdir(dirname(contigs)) {
runMothur(
command = "unique.seqs",
argv = list(fasta = contigs),
log = logfile
);
# contig.good.names
# contig.good.unique.fasta
}
}
#' The count.seqs / make.table command counts the number of sequences
#' represented by the representative sequence in a name file. If a group
#' file is given, it will also provide the group count breakdown.
#'
const count.seqs = function(names, groups) {
runMothur(
command = "count.seqs",
argv = list(name = names, group = groups),
log = "[5]count.seqs.txt"
);
# contig.good.count_table
}
#' The align.seqs command aligns a user-supplied fasta-formatted candidate
#' sequence file to a user-supplied fasta-formatted template alignment.
#' The general approach is to i) find the closest template for each candidate
#' using kmer searching, blastn, or suffix tree searching; ii) to make a
#' pairwise alignment between the candidate and de-gapped template sequences
#' using the Needleman-Wunsch, Gotoh, or blastn algorithms; and iii) to re-insert
#' gaps to the candidate and template pairwise alignments using the NAST
#' algorithm so that the candidate sequence alignment is compatible with the
#' original template alignment. We provide several alignment databases for
#' 16S and 18S rRNA gene sequences that are compatible with the greengenes or
#' silva alignments; however, custom alignments for any DNA sequence can be
#' used as a template and users are encouraged to share their alignment for
#' others to use. In general the alignment is very fast - we are able to align
#' over 186,000 full-length sequences to the SILVA alignment in less than 3 hrs
#' with a quality as good as the SINA aligner. Furthermore, this rate can be
#' accelerated using multiple processors. While the aligner doesn’t explicitly
#' take into account the secondary structure of the 16S rRNA gene, if the
#' template database is based on the secondary structure, then the resulting
#' alignment will at least be implicitly based on the secondary structure.
#'
const align.seqs = function(contigs, reference,
flip = "F",
num_threads = 8,
logfile = "[7]align.seqs.txt") {
runMothur(
command = "align.seqs",
argv = list(
candidate = contigs,
template = reference,
flip = flip,
processors = num_threads
),
log = logfile
);
}
#' filter.seqs removes columns from alignments based on a criteria defined by the
#' user. For example, alignments generated against reference alignments (e.g. from
#' RDP, SILVA, or greengenes) often have columns where every character is either a
#' ‘.’ or a ‘-‘. These columns are not included in calculating distances because
#' they have no information in them. By removing these columns, the calculation of
#' a large number of distances is accelerated. Also, people also like to mask their
#' sequences to remove variable regions using a soft or hard mask (e.g. Lane’s mask).
#' This type of masking is only encouraged for deep-level phylogenetic analysis, not
#' fine level analysis such as that needed with calculating OTUs.
#'
const filter.seqs = function(align, num_threads, logfile = "[8]filter.seqs.txt") {
runMothur(
command = "filter.seqs",
argv = list(
fasta = align,
processors = num_threads
),
log = logfile
);
}
#' create sequence distance matrix
#'
#' @details The dist.seqs command will calculate uncorrected pairwise
#' distances between aligned DNA sequences. This approach is better
#' than the commonly used DNADIST because the distances are not stored
#' in RAM, rather they are printed directly to a file.
#'
#' Furthermore, it is possible to ignore “large” distances that one
#' might not be interested in. The command will generate a column-formatted
#' distance matrix that is compatible with the column option in the
#' various cluster commands. The command is also able to generate a
#' phylip-formatted distance matrix.
#'
const dist.seqs as function(align_fasta,
calc = "onegap",
countends = "F",
cutoff = 0.03,
output = "lt",
num_threads = 32,
logfile = "[10]dist.seqs.txt" ) {
runMothur(
command = "dist.seqs",
argv = list(
fasta = align_fasta,
calc = calc,
countends = countends,
cutoff = cutoff,
output = output,
processors = num_threads
),
log = logfile
);
}
#' assign sequences to OTUs
#'
#' @description Once a distance matrix gets read into mothur, the cluster command can be
#' used to assign sequences to OTUs. Presently, mothur implements three clustering
#' methods:
#'
#' + OptiClust (opti): OTUs are assembled using metrics to determine the quality of
#' clustering (the default setting).
#' + Nearest neighbor (nearest): Each of the sequences within an OTU are at most X% distant
#' from the most similar sequence in the OTU.
#' + Furthest neighbor (furthest): All of the sequences within an OTU are at most X%
#' distant from all of the other sequences within the OTU.
#' + Average neighbor (average): This method is a middle ground between the other two
#' algorithms.
#' + AGC (agc): Abundance-based greedy clustering.
#' + DGC (dgc): Distance-based greedy clustering.
#' + Unique (unique): Creates a list file from a name or count file where every unique
#' sequence is assigned to it’s own OTU (i.e. Amplicon Sequence
#' Variant)
#'
#' If there is an algorithm that you would like to see implemented, please consider either
#' contributing to the mothur project or contacting the developers and we’ll see what we
#' can do. The opticlust algorithm is the default option.
#'
const cluster as function(dist,
method = "furthest",
cutoff = 0.03,
num_threads = 32,
logfile = "[11]cluster.txt") {
runMothur(
command = "cluster",
argv = list(phylip = dist,
method = method,
cutoff = cutoff,
processors = num_threads
),
log = logfile
);
}
#'
#'
#' @details ``bin.seqs`` prints out a fasta-formatted file where
#' sequences are ordered according to the OTU that they belong to.
#' Such an output may be helpful for generating primers specific
#' to an OTU or for classification of sequences.
#'
const bin.seqs = function(list, contigs, contig.names, logfile = "[12]bin.seqs.txt") {
runMothur(
command = "bin.seqs",
argv = list(list = list, fasta = contigs, name = contig.names),
log = logfile
);
}
#' Get the representative sequence
#'
#' @details While the ``bin.seqs`` command reports the OTU number for
#' all sequences, the ``get.oturep`` command generates a fasta-formatted
#' sequence file containing only a representative sequence for each OTU.
#' A .rep.fasta and .rep.name file or .rep.count_table file is generated
#' for each OTU definition.
#'
const get.oturep as function(dist, contig.unique.fasta, list,
label = 0.03,
logfile = "[13]get.oturep.txt") {
runMothur(
command = "get.oturep",
argv = list(
phylip = dist,
fasta = contig.unique.fasta,
list = list,
label = label
),
log = logfile
);
}
#' Do OTU annotation based on the reference database
#'
#' @details The classify.seqs command allows the user to use
#' several different methods to assign their sequences
#' to the taxonomy outline of their choice. Current
#' methods include the Wang approach, using a k-nearest
#' neighbor consensus and zap. Taxonomy outlines and
#' reference sequences can be obtained from the
#' taxonomy outline page. The command requires that
#' you provide a fasta-formatted input and database
#' sequence file and a taxonomy file for the reference
#' sequences.
#'
const classify.seqs = function(
fasta,
reference,
taxonomy,
method="knn",
numwanted=3,
processors=2,
logfile = "[14]classify.seqs.txt"
) {
runMothur(
command = "classify.seqs",
argv = list(
reference = reference,
fasta = fasta,
taxonomy = taxonomy,
method = method,
numwanted = numwanted,
processors = processors
),
log = logfile
);
}
#' taxonomy summary of the annotation
#'
#' @details The summary.tax command reads a taxonomy file and
#' an optional name and or group file, and summarizes
#' the taxonomy information.
#'
const summary.tax = function(taxonomy, group, logfile = "[15]summary.tax.txt") {
runMothur(
command = "summary.tax",
argv = list(
taxonomy = taxonomy,
group = group
),
log = logfile
);
}
#' count for groups
#'
#' @details The split.groups command reads a list, fasta, flow or
#' fastq file and count and generates a list, fasta, flow
#' or fastq files for each group in the count file.
#'
const split.groups = function(fasta, group, logfile = "[16]split.groups.txt") {
runMothur(
command = "split.groups",
argv = list(
fasta = fasta,
group = group
),
log = logfile
);
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.