In claraqin/neonMicrobe: Downloading, Pre-processing, and Assembling NEON Soil Microbe Marker Gene Sequence Data

knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)
if(dir.exists("~/zhulab/")) {
  knitr::opts_knit$set(
  root.dir = "~/zhulab/neonSoilMicrobeProcessing/" # Update as necessary. Should refer to the absolute filepath of the project root directory (e.g. .../neonSoilMicrobeProcessing)
  )
}
if(dir.exists("/Users/lstanish/Github/")) {
  knitr::opts_knit$set(
  root.dir = "/Users/lstanish/Github/NEON_soil_microbe_processing/" # LFS directory file path. 
  )
}
if(dir.exists("/projectnb/dietzelab/zrwerbin/")) {
  knitr::opts_knit$set(
  root.dir = "/projectnb/dietzelab/zrwerbin/NEON_soil_microbe_processing/" # ZW directory file path.
  )
}

The goal of this vignette is to merge various NEON data products to link the outputs of the DADA2 pipeline (e.g. non-chimeric sequence table, taxonomic table) to environmental/sample data. We assume that you have already run the entire pipeline and have a sequence table and taxonomic table prepared.

First, load necessary packages, and parameters file.

packages = c("dplyr", "tibble",
             "phyloseq")

## Now load or install&load all
package.check <- lapply(
  packages,
  FUN = function(x) {
    if (!require(x, character.only = TRUE)) {
      install.packages(x, dependencies = TRUE)
      library(x, character.only = TRUE)
    }
  }
)

# Load parameters and utilities
source("./R/utils.R")
source("./code/params.R")

Next, we load in the outputs from dada2. This pulls the most recent output files from the outputs subdirectory.

df <- file.info(list.files(file.path(PRESET_OUTDIR,PRESET_OUTDIR_OUTPUTS), full.names = T)) %>% 
    rownames_to_column('filename') %>% grepl("ITS")
newest_outputs <- df[which.max(df$mtime),]$filename
# Load combined sequence table and taxonomic table
seqtab_orig <- readRDS(file.path(newest_outputs, "NEON_ITS_seqtab.rds"))
taxa_orig <- readRDS(file.path(newest_outputs, "NEON_ITS_tax.rds"))

Now, we add sequence metadata. This is necessary for correcting the sample names.

# Find newest metadata file
df <- file.info(list.files(file.path(PRESET_OUTDIR,PRESET_OUTDIR_SEQMETA), full.names = T)) %>% 
    rownames_to_column('filename') %>%
    filter(grepl("soilMetadata_ITS", filename))
newest_metadata <- df[which.max(df$mtime),]$filename
sampledata_full <- read.csv(newest_metadata)

n_seq_samples <- nrow(seqtab_orig)
n_sampledata <- nrow(sampledata_full)
if (n_seq_samples > n_sampledata) {
    msg <- paste0("There appear to be more samples in your sequence table than in your metadata table! You will need to download additional metadata for this vignette to work. See vignette #1 for metadata downloading function.")
    message(msg)
}

Fix up some columns in metadata.

# Subset to columns of interest
sampledata <- sampledata_full %>% distinct(dnaSampleID, .keep_all = T) %>% 
    select(sequencerRunID, dnaSampleID, dataQF.rawFiles, internalLabID.seq,
                 siteID, collectDate, plotID, deprecatedVialID, geneticSampleID)
# Fix date column
sampledata$date_ymd <- as.Date(format(as.Date(sampledata$collectDate, format="%Y-%m-%d %H:%M:%S"), "%Y-%m-%d"))

This section downloads data associated with soil samples, such as temperature, moisture, and pH.

start <- substr(min(sampledata$date_ymd), 1, 7)
end <- substr(max(sampledata$date_ymd), 1, 7)
site <- unique(sampledata$siteID)

soils_full <- downloadRawSoilData(startYrMo = start, endYrMo = end,
                                                         outDir=file.path(PRESET_OUTDIR,PRESET_OUTDIR_SOIL), 
                                                         sites = site)
soils <- soils_full %>% select(domainID, siteID, plotID, plotType, nlcdClass, collectDate, 
                                                             sampleTiming, standingWaterDepth, sampleID, 
                                                    horizon, soilTemp, litterDepth, sampleTopDepth, sampleBottomDepth, 
                                                    geneticSampleID, soilInCaClpH)
sampledata <- merge(sampledata, soils, all.x=T, by = "geneticSampleID")

# Add rownames to sample data table 
rownames(sampledata) <- sampledata$dnaSampleID

Large phyloseq objects can use a lot of storage. Here, we remove samples and taxa that didn't sequence well, to reduce the size of the output object.

# Remove low-abundance taxa (optional - this step is now in dada2 vignette)
seqtab_orig <- seqtab_orig[,which(nchar(colnames(seqtab_orig)) > 50)]
keep <- which(colSums(seqtab_orig) > 20)
seqtab <- seqtab_orig[,keep]
print(dim(seqtab))

# Remove low-quality samples (optional)
keep <- which(rowSums(seqtab) > 1000)
seqtab <- seqtab[keep,]
print(dim(seqtab))

The phyloseq R package uses row names to match up data. Let's get the row names consistent between the sequence table and sample data.

# Fix rownames on sequence table
seq.internalLabID <- gsub("^run....._|_ITS|_.....$", "", rownames(seqtab)) 
seq.internalLabID <- gsub("-","_",seq.internalLabID)
seq.sampleID <- sampledata[match(seq.internalLabID, sampledata$internalLabID),]$dnaSampleID
rownames(seqtab) <- make.unique(as.character(seq.sampleID))

# Subset to samples present in both dataframes
common_samples <- intersect(rownames(sampledata), rownames(seqtab))
sampledata <- sampledata[common_samples,]
seqtab <- seqtab[common_samples,]

Now we put it all together!

# Check rowname agreement for phyloseq
#identical(rownames(seqtab), rownames(sampledata))

# Combine into phyloseq object
ps <- phyloseq(otu_table(seqtab, taxa_are_rows=FALSE),
               sample_data(sampledata),
               tax_table(taxa_orig))

# store the DNA sequences of our ASVs in the refseq slot of the phyloseq object,
# and then rename our taxa to a short string
dna <- Biostrings::DNAStringSet(taxa_names(ps))
names(dna) <- taxa_names(ps)
ps <- merge_phyloseq(ps, dna)
taxa_names(ps) <- paste0("ASV", seq(ntaxa(ps)))

# Print phyloseq summary.
ps

Don't forget to save!

saveRDS(ps, file.path(newest_outputs, "NEON_ITS_phyloseq_full.rds"))

If the saved object still takes up too much memory, consider removing or consolidating additional taxa, using glom functions. The speedyseq package is also useful for manipulating large phyloseq datasets.

# Create a smaller subset
# Remove taxa not seen more than 3 times in at least 10 of the samples. 
# This protects against an OTU with small mean & trivially large C.V. 
# (from phyloseq tutorial)
ps_subset = filter_taxa(ps, function(x) sum(x > 3) > 10, TRUE)
saveRDS(ps_subset, file.path(newest_outputs, "NEON_ITS_phyloseq_subset.rds"))

claraqin/neonMicrobe documentation built on April 11, 2024, 11:47 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

claraqin/neonMicrobe
Downloading, Pre-processing, and Assembling NEON Soil Microbe Marker Gene Sequence Data

In claraqin/neonMicrobe: Downloading, Pre-processing, and Assembling NEON Soil Microbe Marker Gene Sequence Data

R Package Documentation

Browse R Packages

We want your feedback!

claraqin/neonMicrobe Downloading, Pre-processing, and Assembling NEON Soil Microbe Marker Gene Sequence Data

In claraqin/neonMicrobe: Downloading, Pre-processing, and Assembling NEON Soil Microbe Marker Gene Sequence Data

R Package Documentation

Browse R Packages

We want your feedback!

claraqin/neonMicrobe
Downloading, Pre-processing, and Assembling NEON Soil Microbe Marker Gene Sequence Data