knitr::opts_chunk$set( collapse = TRUE, comment = "#>" ) if(dir.exists("~/zhulab/")) { knitr::opts_knit$set( root.dir = "~/zhulab/neonSoilMicrobeProcessing/" # Update as necessary. Should refer to the absolute filepath of the project root directory (e.g. .../neonSoilMicrobeProcessing) ) } if(dir.exists("/Users/lstanish/Github/")) { knitr::opts_knit$set( root.dir = "/Users/lstanish/Github/NEON_soil_microbe_processing/" # LFS directory file path. ) } if(dir.exists("/projectnb/dietzelab/zrwerbin/")) { knitr::opts_knit$set( root.dir = "/projectnb/dietzelab/zrwerbin/NEON_soil_microbe_processing/" # ZW directory file path. ) }
The goal of this vignette is to merge various NEON data products to link the outputs of the DADA2 pipeline (e.g. non-chimeric sequence table, taxonomic table) to environmental/sample data. We assume that you have already run the entire pipeline and have a sequence table and taxonomic table prepared.
First, load necessary packages, and parameters file.
packages = c("dplyr", "tibble", "phyloseq") ## Now load or install&load all package.check <- lapply( packages, FUN = function(x) { if (!require(x, character.only = TRUE)) { install.packages(x, dependencies = TRUE) library(x, character.only = TRUE) } } ) # Load parameters and utilities source("./R/utils.R") source("./code/params.R")
Next, we load in the outputs from dada2. This pulls the most recent output files from the outputs subdirectory.
df <- file.info(list.files(file.path(PRESET_OUTDIR,PRESET_OUTDIR_OUTPUTS), full.names = T)) %>% rownames_to_column('filename') %>% grepl("ITS") newest_outputs <- df[which.max(df$mtime),]$filename # Load combined sequence table and taxonomic table seqtab_orig <- readRDS(file.path(newest_outputs, "NEON_ITS_seqtab.rds")) taxa_orig <- readRDS(file.path(newest_outputs, "NEON_ITS_tax.rds"))
Now, we add sequence metadata. This is necessary for correcting the sample names.
# Find newest metadata file df <- file.info(list.files(file.path(PRESET_OUTDIR,PRESET_OUTDIR_SEQMETA), full.names = T)) %>% rownames_to_column('filename') %>% filter(grepl("soilMetadata_ITS", filename)) newest_metadata <- df[which.max(df$mtime),]$filename sampledata_full <- read.csv(newest_metadata) n_seq_samples <- nrow(seqtab_orig) n_sampledata <- nrow(sampledata_full) if (n_seq_samples > n_sampledata) { msg <- paste0("There appear to be more samples in your sequence table than in your metadata table! You will need to download additional metadata for this vignette to work. See vignette #1 for metadata downloading function.") message(msg) }
Fix up some columns in metadata.
# Subset to columns of interest sampledata <- sampledata_full %>% distinct(dnaSampleID, .keep_all = T) %>% select(sequencerRunID, dnaSampleID, dataQF.rawFiles, internalLabID.seq, siteID, collectDate, plotID, deprecatedVialID, geneticSampleID) # Fix date column sampledata$date_ymd <- as.Date(format(as.Date(sampledata$collectDate, format="%Y-%m-%d %H:%M:%S"), "%Y-%m-%d"))
This section downloads data associated with soil samples, such as temperature, moisture, and pH.
start <- substr(min(sampledata$date_ymd), 1, 7) end <- substr(max(sampledata$date_ymd), 1, 7) site <- unique(sampledata$siteID) soils_full <- downloadRawSoilData(startYrMo = start, endYrMo = end, outDir=file.path(PRESET_OUTDIR,PRESET_OUTDIR_SOIL), sites = site) soils <- soils_full %>% select(domainID, siteID, plotID, plotType, nlcdClass, collectDate, sampleTiming, standingWaterDepth, sampleID, horizon, soilTemp, litterDepth, sampleTopDepth, sampleBottomDepth, geneticSampleID, soilInCaClpH) sampledata <- merge(sampledata, soils, all.x=T, by = "geneticSampleID") # Add rownames to sample data table rownames(sampledata) <- sampledata$dnaSampleID
Large phyloseq objects can use a lot of storage. Here, we remove samples and taxa that didn't sequence well, to reduce the size of the output object.
# Remove low-abundance taxa (optional - this step is now in dada2 vignette) seqtab_orig <- seqtab_orig[,which(nchar(colnames(seqtab_orig)) > 50)] keep <- which(colSums(seqtab_orig) > 20) seqtab <- seqtab_orig[,keep] print(dim(seqtab)) # Remove low-quality samples (optional) keep <- which(rowSums(seqtab) > 1000) seqtab <- seqtab[keep,] print(dim(seqtab))
The phyloseq
R package uses row names to match up data. Let's get the row names consistent between the sequence table and sample data.
# Fix rownames on sequence table seq.internalLabID <- gsub("^run....._|_ITS|_.....$", "", rownames(seqtab)) seq.internalLabID <- gsub("-","_",seq.internalLabID) seq.sampleID <- sampledata[match(seq.internalLabID, sampledata$internalLabID),]$dnaSampleID rownames(seqtab) <- make.unique(as.character(seq.sampleID)) # Subset to samples present in both dataframes common_samples <- intersect(rownames(sampledata), rownames(seqtab)) sampledata <- sampledata[common_samples,] seqtab <- seqtab[common_samples,]
Now we put it all together!
# Check rowname agreement for phyloseq #identical(rownames(seqtab), rownames(sampledata)) # Combine into phyloseq object ps <- phyloseq(otu_table(seqtab, taxa_are_rows=FALSE), sample_data(sampledata), tax_table(taxa_orig)) # store the DNA sequences of our ASVs in the refseq slot of the phyloseq object, # and then rename our taxa to a short string dna <- Biostrings::DNAStringSet(taxa_names(ps)) names(dna) <- taxa_names(ps) ps <- merge_phyloseq(ps, dna) taxa_names(ps) <- paste0("ASV", seq(ntaxa(ps))) # Print phyloseq summary. ps
Don't forget to save!
saveRDS(ps, file.path(newest_outputs, "NEON_ITS_phyloseq_full.rds"))
If the saved object still takes up too much memory, consider removing or consolidating additional taxa, using glom functions. The speedyseq
package is also useful for manipulating large phyloseq datasets.
# Create a smaller subset # Remove taxa not seen more than 3 times in at least 10 of the samples. # This protects against an OTU with small mean & trivially large C.V. # (from phyloseq tutorial) ps_subset = filter_taxa(ps, function(x) sum(x > 3) > 10, TRUE) saveRDS(ps_subset, file.path(newest_outputs, "NEON_ITS_phyloseq_subset.rds"))
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.