# ARAST.R
# "Abbreviated RAST": Metagenome analysis pipeline in R inspired by MG-RAST
# (only up to RNA and protein genecalling steps)
# 20180310 v0 by Jeffrey M. Dick
# 20181117 v1 - Used in gradox paper (https://doi.org/10.3389/fmicb.2019.00120)
# Script is on Zenodo: https://doi.org/10.5281/zenodo.2314933
# 20211216 v2 - Used in geo16S paper (https://doi.org/10.1007/s00248-022-01988-9)
# Add techtype argument to ARAST(); script no longer has hardcoded list of Run IDs and techtypes
# 20221118 v3 - Used in microhum preprint (https://doi.org/10.1101/2023.02.12.528246)
# Remove human DNA using bowtie2 (step 5 added to script)
# 20231216 v4 - Used in microhum paper
# Save file with processing statistics
# USAGE EXAMPLES
# To process a batch of files:
# files <- dir("/home/ARAST/work/", ".fasta", full.names = TRUE)
# lapply(files, ARAST)
# To stop after the dereplication step and don't gzip the output:
# lapply(files, ARAST, mode = "DNA", run.gzip = FALSE)
# SYSTEM SOFTWARE: modified from MG-RAST's "Processing Information and Downloads" seen for ERX242607_sra_data.fasta
# skewer_0.2.2 (https://github.com/relipmoc/skewer)
# bowtie_v2.5.0 (https://github.com/BenLangmead/bowtie2) [used by autoskewer and for screening step]
# autoskewer (https://github.com/MG-RAST/autoskewer)
# ea-utils (https://github.com/ExpressionAnalysis/ea-utils) [fastq-mcf, used by filter_sequences]
# biopython (pip install biopython) [used by seq_length_stats.py]
# pipeline (https://github.com/MG-RAST/pipeline) [seq_length_stats.py, filter_sequences, dereplication.py, parallel_FragGeneScan.py]
# sortmerna (https://github.com/biocore/sortmerna)
# FragGeneScan_FGS1.18 (https://github.com/MG-RAST/FGS)
# LOCAL MODIFICATIONS to autoskewer.py (20231222):
# $ diff autoskewer.py.orig autoskewer.py
# 1c1
# < #!/usr/bin/env python
# ---
# > #!/usr/bin/env python2.7
# 133c133
# < skewoptions = "-k 5 -l 0 --quiet -t 8 -r .2 -m any"
# ---
# > skewoptions = "-k 5 --quiet -t 16 -r .2 -m any"
#
# Why?
# -l 0 (0 minimum read length allowed after trimming) causes the following downstream error in filter_sequences:
# fastq-mcf: fastq-mcf.cpp:184: void inbuffer::reset(): Assertion `max_buf > 0' failed.
# So, take away the -l option to let skewer use the default (-l 18)
# Also, use -t 16 to adjust the number of threads for skewer
# DATABASE:
# Download md5rna.clust (pipeline-master/CWL/Inputs/DBs/getpredata.sh) [used by sortmerna]
# curl "http://shock.metagenomics.anl.gov/node/c4c76c22-297b-4404-af5c-8cd98e580f2a?download" > md5rna.clust
# Set up index for sortmerna:
# indexdb --ref md5rna.clust,md5rna.clust.index
# Human genome index for bowtie2
# wget https://genome-idx.s3.amazonaws.com/bt/GRCh38_noalt_as.zip
ARAST <- function(inputfile, proc = parallel::detectCores(), mode = "protein", run.gzip = TRUE, techtype = "illumina", mem_MB = 6144, mem_GB = 16, cleanup = TRUE) {
# This function processes a single FASTA or FASTQ file
# 'proc' number of threads for bowtie and sortmerna
# 'techtype' can be sanger, 454, or illumina (sequencing technology needed for FragGeneScan)
# 'mem_MB' is used by arast_sortme_rna.pl
# NOTE: value is limited by free memory resources; tested values include 3941 for 8 GB RAM and 6144 for 16 GB RAM
# 'mem_GB' is used by dereplication.py
# 'mode' specifies processing mode:
# DNA: trimming, denoising, artifact removal, screening (host sequence removal)
# RNA: DNA steps plus RNA genecalling with sortmerna
# protein: DNA and RNA steps plus protein genecalling with FragGeneScan
# FGS: FragGeneScan only
## What steps to run
steps <- numeric()
if(mode == "DNA") steps <- c(0:5, 23:24)
if(mode == "RNA") steps <- c(0:6, 23:24)
if(mode == "protein") steps <- c(0:9, 23:24)
if(mode == "protein_no_screening") steps <- c(0:4, 6:9, 23:24)
if(mode == "FGS") steps <- c(0:1, 9, 23:24)
if(length(steps) == 0) stop("Invalid value for 'mode' argument")
if(0 %in% steps) {
print("######## 0. Set up environment ########")
# Get working directory and Run ID from file name
workdir <- dirname(inputfile)
Run <- strsplit(basename(inputfile), "_")[[1]][1]
# File type: fasta or fastq
if(grepl("\\.fasta$", inputfile)) filetype <- "fasta" else
if(grepl("\\.fna$", inputfile)) filetype <- "fasta" else
if(grepl("\\.fastq$", inputfile)) filetype <- "fastq" else {
print(paste("Not processing", inputfile, "which is not fasta or fastq"))
return()
}
print(paste0("Processing ", filetype, " file for ", Run, " (", techtype, " sequencing)"))
tmpdir <- file.path(workdir, "tmp")
if(!dir.exists(tmpdir)) dir.create(tmpdir)
# Keep list of temporary and output files
tmpfiles <- outfiles <- character()
} else stop("Step 0 is required")
if(1 %in% steps) {
print("######## 1. Initial sequence statistics ########")
# Initialize list of statistics
stats <- list(Run = Run)
# Read output from fastq-dump, if available
fastq_dump.logfile <- file.path(workdir, paste0(Run, "_fastq-dump.log"))
if(file.exists(fastq_dump.logfile)) {
fastq_dump.log <- readLines(fastq_dump.logfile)
# Find lines starting with "Read" and "Written"
iRead <- grep("^Read", fastq_dump.log)
iWritten <- grep("^Written", fastq_dump.log)
spots <- as.numeric(strsplit(fastq_dump.log[iRead], " ")[[1]][2])
passed_read_filter <- as.numeric(strsplit(fastq_dump.log[iWritten], " ")[[1]][2])
# List number of spots and passed reads (those written by fastq-dump --read-filter pass)
stats <- c(stats, spots = spots, passed_read_filter = passed_read_filter)
}
# Count number of sequences in input file
input_sequences <- count_sequences(inputfile, filetype)
stats <- c(stats, inputfile = basename(inputfile), input_sequences = input_sequences)
} else stop("Step 1 is required")
if(2 %in% steps) {
print("######## 2. Adapter Trimming ########")
scrubbedfile <- file.path(workdir, paste0(Run, "_scrubbed.", filetype))
autoskewer(inputfile, scrubbedfile, tmpdir, cleanup)
tmpfiles <- c(tmpfiles, scrubbedfile)
scrubbed_sequences <- count_sequences(scrubbedfile, filetype)
stats <- c(stats, scrubbed_sequences = scrubbed_sequences)
} else scrubbedfile <- inputfile
if(3 %in% steps) {
print("######## 3. Denoising and normalization ########")
passedfile <- file.path(workdir, paste0(Run, "_passed.fasta"))
removedfile <- file.path(workdir, paste0(Run, "_removed.fasta"))
preprocess(scrubbedfile, passedfile, removedfile, filetype)
tmpfiles <- c(tmpfiles, passedfile, removedfile)
passed_sequences <- count_sequences(passedfile, "fasta")
stats <- c(stats, passed_sequences = passed_sequences)
} else passedfile <- scrubbedfile
if(4 %in% steps) {
print("######## 4. Removal of sequencing artifacts ########")
derepfile <- file.path(workdir, paste0(Run, "_dereplicated.fasta"))
dereplication(passedfile, derepfile, tmpdir, mem_GB, cleanup)
# Mark this as a temporary file unless this is the final step
if(5 %in% steps) tmpfiles <- c(tmpfiles, derepfile) else outfiles <- c(outfiles, derepfile)
dereplicated_sequences <- count_sequences(derepfile, "fasta")
stats <- c(stats, dereplicated_sequences = dereplicated_sequences)
} else derepfile <- passedfile
if(5 %in% steps) {
print("######## 5. Host DNA contamination removal ########")
screenedfile <- file.path(workdir, paste0(Run, "_screened.fasta"))
bowtie2(derepfile, screenedfile, proc, tmpdir)
if(6 %in% steps) tmpfiles <- c(tmpfiles, screenedfile) else outfiles <- c(outfiles, screenedfile)
screened_sequences <- count_sequences(screenedfile, "fasta")
stats <- c(stats, screened_sequences = screened_sequences)
} else screenedfile <- derepfile
if(6 %in% steps) {
print("######## 6. RNA feature identification (rRNA genecalling) ########")
rna_nr = "/home/ARAST/REFDB/md5rna.clust"
index <- paste0(rna_nr, ".index")
rRNAfile <- file.path(workdir, paste0(Run, "_rRNA.fasta"))
not_rRNAfile <- file.path(workdir, paste0(Run, "_not-rRNA.fasta"))
sortmerna(screenedfile, rRNAfile, not_rRNAfile, proc, mem_MB, rna_nr, index)
outfiles <- c(outfiles, rRNAfile)
if(9 %in% steps) tmpfiles <- c(tmpfiles, not_rRNAfile) else outfiles <- c(outfiles, not_rRNAfile)
rRNA_sequences <- count_sequences(rRNAfile, "fasta")
not_rRNA_sequences <- count_sequences(not_rRNAfile, "fasta")
stats <- c(stats, rRNA_sequences = rRNA_sequences, not_rRNA_sequences = not_rRNA_sequences)
} else not_rRNAfile <- screenedfile
# NOTE: Steps labelled 00000000 are in the MG-RAST pipeline but are not implemented here
if(7 %in% steps) print("00000000 7. RNA clustering (not implemented) 00000000")
if(8 %in% steps) print("00000000 8. RNA similarity search (not implemented) 00000000")
if(9 %in% steps) {
print("######## 9. Identify putative protein coding features (genecalling) ########")
codingfile <- file.path(workdir, paste0(Run, "_coding"))
genecalling(not_rRNAfile, codingfile, techtype, tmpdir, proc, cleanup)
if(file.exists(paste0(codingfile, ".faa"))) {
fnafile <- paste0(codingfile, ".fna")
faafile <- paste0(codingfile, ".faa")
tmpfiles <- c(tmpfiles, fnafile)
outfiles <- c(outfiles, faafile)
coding_sequences <- count_sequences(faafile, "fasta")
stats <- c(stats, coding_sequences = coding_sequences)
# Get average protein length
cmd <- paste('grep -v "^>"', faafile, '| wc -c')
nbytes <- as.numeric(system(cmd, intern = TRUE))
# Subtract the number of newlines (one for each sequence) from the number of bytes and divide by number of sequences to get average sequence length
protein_length <- ( nbytes - coding_sequences ) / coding_sequences
stats <- c(stats, protein_length = protein_length)
} else stats <- c(stats, coding_sequences = 0, protein_length = NA)
}
if(23 %in% steps) {
print("######## 23. Summary statistics ########")
# Remove temporary files and compress output files
if(cleanup) system(paste("rm -f", paste(tmpfiles, collapse = " ")))
if(run.gzip & length(outfiles) > 0) system(paste("gzip -f", paste(outfiles, collapse = " ")))
# Save stats file
statsfile <- file.path(workdir, paste0(Run, "_stats.csv"))
write.csv(as.data.frame(stats), statsfile, row.names = FALSE, quote = FALSE)
}
if(24 %in% steps) print("######## 24. Completed ########")
}
# Run autoskewer.py 20180310
autoskewer <- function(file, scrubbedfile, tmpdir, cleanup) {
cmd <- paste("autoskewer.py -v -i", file, "-o", scrubbedfile, "-l ~/tmp/scrubbed.log -t", tmpdir)
system(cmd)
if(cleanup) {
# Clean up tmpdir
tmpfiles <- file.path(tmpdir, "*")
cmd <- paste("rm", tmpfiles)
system(cmd)
}
}
# Run filter_sequences 20180309
# Based on MG-RAST/pipeline/mgcmd/mgrast_preprocess.pl
preprocess <- function(file, passedfile, removedfile, filetype) {
# Print stats
cmd <- paste("seq_length_stats.py -i", file, "-t", filetype, "-f")
system(cmd)
if(filetype=="fasta") {
# Get stats
cmd <- paste("seq_length_stats.py -i", file, "-t", filetype, "-f | cut -f2")
out <- system(cmd, intern=TRUE)
# This gives bp_count, sequence_count, average_length, standard_deviation_length, length_min, length_max
out <- as.numeric(out)
# Translated from mgrast_preprocess.pl
mymean <- out[3]
mystdv <- out[4]
min_ln <- round(mymean - 2 * mystdv)
max_ln <- round(mymean + 2 * mystdv)
if(min_ln < 0) min_ln <- 0
cmd <- paste("filter_sequences -i", file, "-format fasta -o", passedfile, "-r", removedfile,
"-filter_ln -min_ln", min_ln, "-max_ln", max_ln, "-filter_ambig -max_ambig 5")
}
if(filetype=="fastq") {
cmd <- paste("filter_sequences -i", file, "-format fastq -o", passedfile, "-r", removedfile,
"-dynamic_trim -min_qual 15 -max_lqb 5")
}
system(cmd)
}
# Run dereplication.py 20180310
# Based on MG-RAST/pipeline/mgcmd/mgrast_dereplicate.pl
dereplication <- function(file, derepfile, tmpdir, mem_GB, cleanup) {
prefix_size <- 50
run_dir <- tmpdir
input <- file
out_prefix <- tmpdir
cmd <- paste("dereplication.py -l", prefix_size, "-m", mem_GB, "-d", run_dir, input, out_prefix)
system(cmd)
# Rename and clean up files
system(paste("mv", paste0(tmpdir, ".passed.fna"), derepfile))
if(cleanup) system(paste("rm", paste0(tmpdir, ".*")))
}
# Run sortmerna 20180310
# Based on MG-RAST/pipeline/mgcmd/mgrast_sortme_rna.pl
sortmerna <- function(file, rRNAfile, not_rRNAfile, proc, mem_MB, rna_nr, index) {
eval <- 0.1
ident <- 70 # cutoff value listed in the MG-RAST Processing Information page
fasta <- file
## NOTE: MG-RAST picks sequences using the BLAST ouput with an identity threshold;
## here we use sortmerna with command modified for FASTA output (--fastx),
## fast filtering (--num_alignments), and saving both aligned and rejected sequences;
## sortmerna doesn't have an identity threshold option
#ref <- paste0(rna_nr, ",", index)
#cmd <- paste("sortmerna -a", proc, "-m", mem, "-e", eval,
# "--fastx --num_alignments 1 --ref", ref, "--reads", fasta,
# "--aligned", rRNAfile, "--other", not_rRNAfile, "-v")
# Use a modified version of mgrast_sortme_rna.pl to save both RNA and non-RNA sequences 20180311
cmd <- paste("./arast_sortme_rna.pl -input", file, "-output", rRNAfile, "-notrna", not_rRNAfile,
"-rna_nr", rna_nr, "-index", index, "-proc", proc, "-mem", mem_MB, "-eval", eval, "-ident", ident)
#print(cmd)
system(cmd)
# Clean up files
WD <- setwd(dirname(file))
system("rm *.blast *.tab *.id *.sort")
setwd(WD)
}
# Run FragGeneScan 20180309
# Based on MG-RAST/pipeline/mgcmd/mgrast_genecalling.pl
genecalling <- function(file, codingfile, techtype = c("sanger", "454", "illumina"), tmpdir, proc, cleanup) {
out_prefix <- codingfile
# mgrast_genecalling.pl chooses training files with highest error rate for given techtype
if(techtype == "sanger") type <- "sanger_10"
if(techtype == "454") type <- "454_30"
if(techtype == "illumina") type <- "illumina_10"
cmd <- paste("parallel_FragGeneScan.py -v -p", proc, "-s 100 -t", type, "-d", tmpdir, file, out_prefix)
# If parallel_FragGeneScan.py gives python error (OSError: [Errno 2] No such file or directory),
# try this instead:
##cmd <- paste("run_FragGeneScan.pl -genome", file, "-out", codingfile, "-complete 0 -train", type)
system(cmd)
if(cleanup) {
# Clean up files
system(paste("rm", paste0(codingfile, ".out")))
##system(paste("rm", file.path(tmpdir, "*")))
}
}
# Run bowtie2 (host sequence removal) 20221117
# Based on MG-RAST/pipeline/mgcmd/mgrast_bowtie_screen.pl
bowtie2 <- function(file, screenedfile, proc, tmpdir) {
refdb <- "/home/ARAST/genome-idx/GRCh38_noalt_as/GRCh38_noalt_as"
# 'reorder' option outputs sequences in same order as input file
cmd <- paste("bowtie2 -f --reorder -p", proc, "--un", tmpdir, "-x", refdb, "-U", file, "> /dev/null")
print(cmd)
system(cmd)
# Rename output file
file.rename(file.path(tmpdir, "un-seqs"), screenedfile)
}
# Function to count number of sequences in FASTA or FASTQ file 20231217
count_sequences <- function(file, filetype) {
cmd <- paste("wc -l", file)
wc_output <- system(cmd, intern = TRUE)
nlines <- as.numeric(strsplit(wc_output, " ")[[1]][1])
if(filetype == "fasta") nseq <- nlines / 2
if(filetype == "fastq") nseq <- nlines / 4
nseq
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.