# chem16S/pipeline.R
# Merge and filter 16S sequences from SRA, subsample, find chimeras, and run RDP Classifier
# 20200909 Initial version by Jeffrey M. Dick
# 20210922 Add settings for orp16S datasets
# Add --skip-technical --clip options to fastq-dump
# Don't use fastq_filter --stripleft for 454 data
## SYSTEM REQUIREMENTS
# (version numbers for information only)
# R 4.0.0 https://www.r-project.org/
# fastq-dump 2.9.0 https://ncbi.github.io/sra-tools/
# vsearch 2.15.0 https://github.com/torognes/vsearch
# seqtk 1.3-r115 https://github.com/lh3/seqtk
# RDP Classifier 2.13 https://sourceforge.net/projects/rdp-classifier/
# Java openjdk version 1.8.0_252 (for RDP); GNU utils (cat, awk, grep)
## USAGE
# First, make sure the working directory is clean (especially no *.fa files)
# To filter one file (generates .fa files in workdir for subsampling)
# filter("SRR2059382")
# To filter multiple files:
# RUNID <- c("SRR2059381", "SRR2059380", "SRR2059379")
# lapply(RUNID, filter)
# To subsample and remove chimeras from the filtered files:
# subsample()
# findchimeras()
# To run RDP Classifier after chimera removal:
# classify(RUNID)
# To combine output of RDP Classifier into one file
# -- result is saved in RDP/<study>.tab in the current directory
# mkRDP(study, RUNID)
## STUDY SETTINGS
# Change the following line to setup the pipeline for one study
study <- "HMP12"
# Settings for all studies are stored here
file <- tempfile()
# Write spaces here (but don't save them) to make this easier to read
writeLines(con = file, text = gsub(" ", "", c(
# For 454 experiments, set forwardonly to NA
"study, forwardonly, trunclen",
## For geo16S paper 20210502
# Natural environment datasets
# For HCW+13 (Guerrero Negro), don't use filter(); start with findchimeras()
#"HCW+13, NA, NA", # GenBank JN427016-JN539989 10.1038/ismej.2012.79
"BGPF13, NA, NA", # BioProject PRJNA207095 10.3389/fmicb.2013.00330
"HLA+16, NA, NA", # PRJEB1245 10.3389/fmicb.2016.01883
"JHM+16, FALSE, 230", # PRJNA291280 10.1128/AEM.02699-15
"ZLM+16, TRUE, 250", # PRJNA294836 10.1128/AEM.03332-15
"MPB+17, TRUE, 450", # PRJEB15554 10.1038/ismej.2017.37
"XDZ+17, TRUE, 250", # PRJNA388250 10.1038/s41598-017-13608-5
"SVH+19, NA, NA", # PRJNA423140 10.1111/gbi.12316
# Shale gas datasets
"UKD+18.sediment, TRUE, 100", # PRJNA394724 10.1038/s41598-018-23679-7
"UKD+18.water, TRUE, 100",
"MMA+20, FALSE, 250", # PRJNA544240 10.1073/pnas.1911458117
"CHM+14, NA, NA", # PRJNA229085 10.1021/es501173p
"HRR+18, TRUE, 300", # PRJNA438710 10.1016/j.scitotenv.2018.06.067
"ZLF+19, FALSE, 290", # PRJNA407226 10.1021/acsearthspacechem.9b00037
# Stratified water body datasets
"GBL+15, FALSE, 250", # PRJNA263621 10.1038/ismej.2015.44
"BCA+21, FALSE, 450", # PRJNA395513 10.1111/1462-2920.14909
"MZG+20, FALSE, 450", # PRJEB27579 10.1038/s41396-019-0515-8
"HXZ+20, FALSE, 440", # PRJNA503500 10.1038/s41598-020-62411-2
# Datasets for comparing 16S estimated proteomes with metagenome/metatranscriptome 20211017
# For SMS+12 (Bison Pool), start with classify()
#"SMS+12, NA, NA",
"EH18, NA, NA",
"MKK+11, NA, NA",
"FLA+12, NA, NA",
"HMP12, TRUE, 400"
)))
# This reads and applies the settings
settings <- read.csv(file, as.is = TRUE)
istudy <- match(study, settings$study)
# Flag for 454 studies
is454 <- is.na(settings$forwardonly[istudy])
# Use forward reads only (for low-quality or missing reverse reads)
forwardonly <- settings$forwardonly[istudy]
# Value of fastq_trunclen
trunclen <- settings$trunclen[istudy]
## FILE LOCATIONS
# Working directory
workdir <- "~/tmp/chem16S"
# Output directory for FASTA files
FASTAdir <- file.path("/home/chem16S", study, "fasta")
# Reference database for chimera detection
refdb <- "/home/chem16S/SILVA/SILVA_138.1_SSURef_NR99_tax_silva.fasta.gz"
#refdb <- "/home/chem16S/chimera/gold.fa"
# RDP jar file
RDPjar <- "/opt/rdp_classifier_2.13/dist/classifier.jar"
# Output directory for RDP Classifier
RDPdir <- file.path("/home/chem16S", study, "RDP")
# RDP Classifier version (java -jar /opt/rdp_classifier_2.13/dist/classifier.jar version)
#Gene:16srrna Trainset No:18 Taxonomy Version:RDP 16S rRNA training setNo 18 07/2020
#RDP Classifier Version:RDP Naive Bayesian rRNA Classifier Version 2.11, September 2015
filter <- function(RUNID) {
message("============================================")
message(paste0("Merging and filtering ", RUNID, " [", study, "]"))
message("============================================")
# Change to working directory
olddir <- setwd(workdir)
on.exit(setwd(olddir))
# The output file from this function is a FASTA file with .fa suffix
outfile <- paste0(RUNID, ".fa")
if(study %in% c("MKK+11", "FLA+12")) {
# For these datasets, files are downloaded from MG-RAST 20220122
fqdump <- FALSE
} else {
# Generate input FASTQ files with fastq-dump
fqdump <- TRUE
cmd <- paste("fastq-dump --split-files --skip-technical --clip", RUNID)
print(cmd)
system(cmd)
}
if(study %in% c("MKK+11", "FLA+12")) {
# These are files from MG-RAST (.fasta suffix)
# Prefix MG-RAST ID to header so we can extract the reads in the subsample() and findchimeras() steps 20211004
fastafile <- paste0(RUNID, ".fasta")
lines <- readLines(fastafile)
ihead <- grep("^>", lines)
lines[ihead] <- paste0(">", RUNID, ";", gsub("^>", "", lines[ihead]))
writeLines(lines, outfile)
} else if(is454) {
# For 454 studies, fastq-dump --skip-technical puts biological sequences into file with highest-numbered suffix
# Use _3.fastq, _2.fastq or _1.fastq if they are present 20200921
infile <- paste0(RUNID, "_3.fastq")
if(!file.exists(infile)) infile <- paste0(RUNID, "_2.fastq")
if(!file.exists(infile)) infile <- paste0(RUNID, "_1.fastq")
# This is needed for files not generated by fastq-dump
if(!file.exists(infile)) infile <- paste0(RUNID, ".fastq")
truncqual <- 15
fastq_minlen <- 200
cmd <- paste("vsearch -fastq_filter", infile, "-fastq_minlen", fastq_minlen, "-fastq_maxlen 600 -fastq_truncqual", truncqual, "-fastaout", outfile)
print(cmd)
system(cmd)
} else if(forwardonly) {
# Use forward reads only
file.copy(paste0(RUNID, "_1.fastq"), "merged.fastq", overwrite = TRUE)
# For Human Microbiome Project, reads are in different file 20211216
if(study == "HMP12") file.copy(paste0(RUNID, "_4.fastq"), "merged.fastq", overwrite = TRUE)
nseq <- length(readLines("merged.fastq")) / 4
print(paste0("Using forward reads only (", nseq, " sequences)"))
} else {
# Check that we have same number of forward and reverse reads 20200917
# (issue with some datasets: fastq_dump rejects some reads because READLEN < 1)
ID1 <- sapply(strsplit(gsub("@", "", system(paste0("awk '(NR - 1) % 4 == 0' ", RUNID, "_1.fastq"), intern = TRUE)), " "), "[", 1)
ID2 <- sapply(strsplit(gsub("@", "", system(paste0("awk '(NR - 1) % 4 == 0' ", RUNID, "_2.fastq"), intern = TRUE)), " "), "[", 1)
# Forward reads that have matching reverse reads
forward_matches <- ID1[ID1 %in% ID2]
if(length(forward_matches) < length(ID1)) {
writeLines(forward_matches, "forward_matches.txt")
print(cmd <- paste0("seqtk subseq ", RUNID, "_1.fastq forward_matches.txt > new_1.fastq"))
system(cmd)
print(cmd <- paste0("mv new_1.fastq ", RUNID, "_1.fastq"))
system(cmd)
}
# Reverse reads that have matching forward reads
reverse_matches <- ID2[ID2 %in% ID1]
if(length(reverse_matches) < length(ID2)) {
writeLines(reverse_matches, "reverse_matches.txt")
print(cmd <- paste0("seqtk subseq ", RUNID, "_2.fastq reverse_matches.txt > new_2.fastq"))
system(cmd)
print(cmd <- paste0("mv new_2.fastq ", RUNID, "_2.fastq"))
system(cmd)
}
# Make sure forward and reverse reads have same IDs
ID1 <- sapply(strsplit(gsub("@", "", system(paste0("awk '(NR - 1) % 4 == 0' ", RUNID, "_1.fastq"), intern = TRUE)), " "), "[", 1)
ID2 <- sapply(strsplit(gsub("@", "", system(paste0("awk '(NR - 1) % 4 == 0' ", RUNID, "_2.fastq"), intern = TRUE)), " "), "[", 1)
if(!all(ID1 == ID2)) stop(paste0("sequence IDs in ", RUNID, "_1.fastq and ", RUNID, "_2.fastq don't match!"))
# Merge paired reads
print(cmd <- paste0("vsearch -fastq_mergepairs ", RUNID, "_1.fastq -reverse ", RUNID, "_2.fastq -fastqout merged.fastq"))
system(cmd)
}
if(!is454) {
# Remove short and high-error reads
# Use fastq_maxee_rate instead of fastq_maxee 20200920
maxee_rate <- 0.005
# Use default -fastq_qmax 20210922
qmax <- 41
cmd <- paste("vsearch -fastq_filter merged.fastq -fastq_trunclen", trunclen, "-fastq_qmax", qmax, "-fastq_maxee_rate", maxee_rate, "-fastaout", outfile)
print(cmd)
system(cmd)
}
# Clean up
if(fqdump) file.remove(Sys.glob("*.fastq"))
return()
}
subsample <- function(do.singletons = TRUE) {
message("=======================")
message(paste0("Subsampling [", study, "]"))
message("=======================")
# Change to working directory
olddir <- setwd(workdir)
on.exit(setwd(olddir))
if(do.singletons) {
# Make combined FASTA file of filtered sequences 20200914
print(cmd <- "cat *.fa > filtered.fasta")
system(cmd)
# Remove singletons 20200912
# 1. Dereplicate sequences, keeping those that appear exactly once
print(cmd <- "vsearch -derep_fulllength filtered.fasta -maxuniquesize 1 -output singletons.fasta")
system(cmd)
# 2. Get all sequence IDs
print(cmd <- 'grep ">" filtered.fasta | sed -e s/\\ .*//g | sed -e s/\\>//g > allIDs.txt')
system(cmd)
allIDs <- readLines("allIDs.txt")
# 3. Get singleton sequence IDs
print(cmd <- 'grep ">" singletons.fasta | sed -e s/\\ .*//g | sed -e s/\\>//g > singletonIDs.txt')
system(cmd)
singletonIDs <- readLines("singletonIDs.txt")
# 4. Write IDs of sequences that are not singletons
writeLines(setdiff(allIDs, singletonIDs), "not_singletons.txt")
# 5. Extract sequences that are not singletons
print(cmd <- "seqtk subseq filtered.fasta not_singletons.txt > not_singletons.fasta")
system(cmd)
singletxt <- paste0("Removed ", length(singletonIDs), " singletons from ", length(allIDs), " sequences (", round(length(singletonIDs) / length(allIDs) * 100, 1), "%)")
print(singletxt)
} else {
# Don't remove singletons, but combine filtered sequences for subsampling 20210821
print(cmd <- "cat *.fa > not_singletons.fasta")
system(cmd)
}
# Loop over samples
allRUNID <- gsub("\\.fa$", "", dir(pattern = "\\.fa$"))
for(thisRUNID in allRUNID) {
# Extract sequences for this sample
thisfile <- paste0(thisRUNID, ".fasta")
# https://stackoverflow.com/questions/26144692/printing-a-sequence-from-a-fasta-file
# "awk '/>" is used to match sample name at beginning of header line
# (this is needed for numeric or short sample names that can match other parts of the header)
print(cmd <- paste0("awk '/>", thisRUNID, "/{p++;print;next} /^>/{p=0} p' not_singletons.fasta > ", thisfile))
# For few datasets, don't use "awk '/>" because sample names are end of header of original FASTQ files, not beginning 20210501
#print(cmd <- paste0("awk '/", thisRUNID, "/{p++;print;next} /^>/{p=0} p' not_singletons.fasta > ", thisfile))
system(cmd)
# Subsample 10000 sequences - put output in .fa file in FASTAdir
print(cmd <- paste('grep "^>"', thisfile))
thisfile.headers <- system(cmd, intern = TRUE)
nout <- nseq <- length(thisfile.headers)
outfile <- file.path(FASTAdir, paste0(thisRUNID, ".fa"))
if(nseq > 10000) {
print(cmd <- paste("vsearch -fastx_subsample", thisfile, "-sample_size 10000 -randseed 1234 -fastaout", outfile))
system(cmd)
nout <- 10000
} else {
file.copy(thisfile, outfile, overwrite = TRUE)
}
subsamptxt <- paste("Subsampled", nout, "of", nseq, "sequences")
print(subsamptxt)
# Save messages about number of subsampled sequences
txtfile <- file.path(FASTAdir, paste0(thisRUNID, ".txt"))
writeLines(subsamptxt, txtfile)
}
# Clean up
file.remove(Sys.glob("*.fa"))
file.remove(Sys.glob("*.fasta"))
return()
}
# Find chimeras in combined FASTA file, then split result into samples 20200913
# (Saves time by masking and creating k-mer index only once)
findchimeras <- function() {
message("=========================")
message(paste0("Finding chimeras [", study, "]"))
message("=========================")
# Change to output directory
olddir <- setwd(FASTAdir)
on.exit(setwd(olddir))
# Make combined FASTA file
print(cmd <- "cat *.fa > combined.fasta")
system(cmd)
# Identify chimeras
print(cmd <- paste("vsearch -uchime_ref combined.fasta -nonchimeras nonchimeras.fasta -db", refdb))
system(cmd)
# Split result into one file for each sample
# https://stackoverflow.com/questions/26144692/printing-a-sequence-from-a-fasta-file
allRUNID <- gsub("\\.fa$", "", dir(pattern = "\\.fa$"))
for(thisRUNID in allRUNID) {
print(cmd <- paste0("awk '/>", thisRUNID, "/{p++;print;next} /^>/{p=0} p' nonchimeras.fasta > ", thisRUNID, ".fasta"))
# For few datasets, don't use "awk '/>" because sample names are end of header of original FASTQ files, not beginning 20210501
#print(cmd <- paste0("awk '/", thisRUNID, "/{p++;print;next} /^>/{p=0} p' nonchimeras.fasta > ", thisRUNID, ".fasta"))
system(cmd)
}
# Clean up
file.remove("combined.fasta")
file.remove("nonchimeras.fasta")
}
# Run RDP Classifier on one or multiple files 20200910
classify <- function(RUNID, conf = 0.8) {
# Make sure RDP output directory exists 20200915
if(!dir.exists(RDPdir)) stop(paste("directory", RDPdir, "does not exist"))
# Force evaluation of RUNID before changing the directory
allRUNID <- RUNID
# Change to working directory
olddir <- setwd(workdir)
on.exit(setwd(olddir))
# Loop over all RUNIDs
n <- length(allRUNID)
for(i in 1:n) {
thisRUNID <- allRUNID[i]
message("=======================================")
message(paste0("Classifying ", thisRUNID, " [", study, "] (", i, "/", n, ")"))
message("=======================================")
# Input FASTA file
FASTAfile <- file.path(FASTAdir, paste0(thisRUNID, ".fasta"))
# Copy FASTA file to here without .fasta suffix (so it doesn't get into sample name)
file.copy(FASTAfile, thisRUNID, overwrite = TRUE)
# Run classifier with -c (confidence score threshold) 20201007
print(cmd <- paste0("java -jar ", RDPjar, " classify -c ", conf, " -o ", thisRUNID, ".tab -h ", thisRUNID, ".txt ", thisRUNID))
system(cmd)
# Copy results (assignment count for each taxon) to output directories
results <- paste0(thisRUNID, ".txt")
resfile <- file.path(RDPdir, paste0(thisRUNID, ".txt"))
file.copy(results, resfile, overwrite = TRUE)
# Clean up
file.remove(Sys.glob(paste0("*", thisRUNID, "*")))
}
}
# Combine output files of RDP Classifier into a single CSV file for one study 20200903
# Use RDP Classifier merge-count command 20200910
# Get file list from RDPdir 20210817
mkRDP <- function() {
# Change to temporary directory
olddir <- setwd(tempdir())
on.exit(setwd(olddir))
# Remove any existing .txt files
file.remove(Sys.glob("*.txt"))
# Loop over runs
runs <- gsub(".txt", "", dir(RDPdir))
for(run in runs) {
# The RDP result file
RDPfile <- file.path(RDPdir, paste0(run, ".txt"))
print(RDPfile)
# Copy the RDP result file to here (temporary directory)
file.copy(RDPfile, ".")
}
# Now list all the RDP result files
files <- dir(pattern = "txt")
# Name of output file
outfile <- file.path(olddir, "RDP", paste0(study, ".tab"))
# Remove output file in case it exists
if(file.exists(outfile)) file.remove(outfile)
# RDP jar file
RDPjar <- "/opt/rdp_classifier_2.13/dist/classifier.jar"
# Run merge-count command
print(cmd <- paste("java -jar", RDPjar, "merge-count", outfile, paste(files, collapse = " ")))
system(cmd)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.