knitr::opts_chunk$set( collapse = TRUE, comment = "#>" ) knitr::opts_chunk$set(dev = "png")
#load packages library(RADstackshelpR) library(gridExtra)
```{bash, eval = FALSE} /home/path/to/stacks-2.41/process_radtags -p . -o . -b plate.1.barcodes.txt -e ndeI -r -c -q
## Step 2: Quality control - Once each sample has been demultiplexed into an individual file with the extension '.fastq.gz', it is now possible to assess the quality of each sequenced sample. Samples which receive very little sequencing will not contain enough reads to assemble shared loci, and should be dropped right away, so that they do not bias the downstream optimization of assembly parameters. I have written an [RMarkdown script](https://github.com/DevonDeRaad/RADstackshelpR/blob/master/inst/extdata/fastqcr.Rmd) that uses the R package [fastqcr](https://github.com/kassambara/fastqcr) to generate a report visualizing the quality and quantity of sequencing for each sample, and recommending a subset of samples to be immediately dropped before parameter optimization. The only modification necessary for this script is the path to the folder containing the input .fastq.gz files and the path to your desired output folder. An example report generated using this script can be seen [here](https://devonderaad.github.io/RADstackshelpR/articles/quality.control.vignette.html). ## Step 3: Now that we have run quality control, we have a reasonable set of samples for which to perform parameter optimization for de novo assembly. To begin, iterate over values of 'm' ranging from 3-7, while leaving all other parameters at default values. ###Bash code to execute this: ```{bash, eval = FALSE} #!/bin/sh # #SBATCH --job-name=denovo.rad.hippo # Job Name #SBATCH --nodes=1 # nodes #SBATCH --cpus-per-task=15 # CPU allocation per Task #SBATCH --partition=bi # Name of the Slurm partition used #SBATCH --chdir=/home/d669d153/scratch/diadema.dinops # Set working d$ #SBATCH --mem-per-cpu=1gb # memory requested #SBATCH --time=10000 files="908108_H_diadema_Gatokae 908150_H_dinops_Guadalcanal 908151_H_diadema_Guadalcanal 908152_H_diadema_Guadalcanal 908153a_H_dinops_Guadalcanal 908154_H_dinops_Guadalcanal 908155_H_dinops_Guadalcanal 908156_H_diadema_Guadalcanal 908208_H_diadema_Guadalcanal KVO150_H_diadema_Isabel KVO168_H_diadema_Isabel KVO169_H_diadema_Isabel KVO170_H_diadema_Isabel KVO171_H_diadema_Isabel KVO172_H_diadema_Isabel KVO242_H_dinops_Isabel KVO243_H_dinops_Isabel KVO244_H_dinops_Isabel KVO245_H_dinops_Isabel KVO246_H_dinops_Isabel KVO248_H_dinops_Isabel THL1156_H_demissus_Makira THL1172_H_dinops_Guadalcanal THL1173_H_dinops_Guadalcanal THL17193_H_diadema_Ngella THL17194_H_diadema_Ngell THL17195_H_diadema_Ngella THL17197_H_diadema_Ngella THL17198_H_diadema_Ngella THL17199_H_diadema_Ngell WD1705_H_diadema_E_New_Britain WD2047_H_diadema_Simbu_Prov WD2074_H_diadema_Gulf_Prov" # Build loci de novo in each sample for the single-end reads only. # -M — Maximum distance (in nucleotides) allowed between stacks (default 2). # -m — Minimum depth of coverage required to create a stack (default 3). #here, we will vary m from 3-7, and leave all other paramaters default for i in {3..7} do #create a directory to hold this unique iteration: mkdir stacks_m$i #run ustacks with m equal to the current iteration (3-7) for each sample id=1 for sample in $files do /home/d669d153/work/stacks-2.41/ustacks -f ${sample}.fq.gz -o stacks_m$i -i $id -m $i -p 15 let "id+=1" done ## Run cstacks to compile stacks between samples. Popmap is a file in working directory called 'pipeline_popmap.txt' /home/d669d153/work/stacks-2.41/cstacks -P stacks_m$i -M pipeline_popmap.txt -p 15 ## Run sstacks. Match all samples supplied in the population map against the catalog. /home/d669d153/work/stacks-2.41/sstacks -P stacks_m$i -M pipeline_popmap.txt -p 15 ## Run tsv2bam to transpose the data so it is stored by locus, instead of by sample. /home/d669d153/work/stacks-2.41/tsv2bam -P stacks_m$i -M pipeline_popmap.txt -t 15 ## Run gstacks: build a paired-end contig from the metapopulation data (if paired-reads provided), ## align reads per sample, call variant sites in the population, genotypes in each individual. /home/d669d153/work/stacks-2.41/gstacks -P stacks_m$i -M pipeline_popmap.txt -t 15 ## Run populations completely unfiltered and output unfiltered vcf, for input to the RADstackshelpR package /home/d669d153/work/stacks-2.41/populations -P stacks_m$i -M pipeline_popmap.txt --vcf -t 15 done
#optimize_m function will generate summary stats on your 5 iterative runs #input can be full path to each file or just the file name if your working directory contains the files m.out<-optimize_m(m3="/Users/devder/Desktop/hipposideros/m_3.vcf", m4="/Users/devder/Desktop/hipposideros/m_4.vcf", m5="/Users/devder/Desktop/hipposideros/m_5.vcf", m6="/Users/devder/Desktop/hipposideros/m_6.vcf", m7="/Users/devder/Desktop/hipposideros/m_7.vcf") #visualize the effect of varying m on the depth of each sample vis_depth(output = m.out) #visualize the effect of varying m on the number of SNPs retained vis_snps(output = m.out, stacks_param = "m") #visualize the effect of varying m on the number of loci retained vis_loci(output = m.out, stacks_param = "m") #3 is the optimal m value, and will be used next to optimize M
```{bash, eval = FALSE}
for i in {1..8} do
mkdir stacks_bigM$i
id=1 for sample in $files do /home/d669d153/work/stacks-2.41/ustacks -f ${sample}.fq.gz -o stacks_bigM$i -i $id -m 3 -M $i -p 15 let "id+=1" done /home/d669d153/work/stacks-2.41/cstacks -P stacks_bigM$i -M pipeline_popmap.txt -p 15 /home/d669d153/work/stacks-2.41/sstacks -P stacks_bigM$i -M pipeline_popmap.txt -p 15 /home/d669d153/work/stacks-2.41/tsv2bam -P stacks_bigM$i -M pipeline_popmap.txt -t 15 /home/d669d153/work/stacks-2.41/gstacks -P stacks_bigM$i -M pipeline_popmap.txt -t 15 /home/d669d153/work/stacks-2.41/populations -P stacks_bigM$i -M pipeline_popmap.txt --vcf -t 15 done
## Step 6: Visualize the output of these 8 runs and determine the optimal value for M. ```r #optimize M M.out<-optimize_bigM(M1="/Users/devder/Desktop/hipposideros/M1.vcf", M2="/Users/devder/Desktop/hipposideros/M2.vcf", M3="/Users/devder/Desktop/hipposideros/M3.vcf", M4="/Users/devder/Desktop/hipposideros/M4.vcf", M5="/Users/devder/Desktop/hipposideros/M5.vcf", M6="/Users/devder/Desktop/hipposideros/M6.vcf", M7="/Users/devder/Desktop/hipposideros/M7.vcf", M8="/Users/devder/Desktop/hipposideros/M8.vcf") #visualize the effect of varying M on the number of SNPs retained vis_snps(output = M.out, stacks_param = "M") #visualize the effect of varying M on the number of polymorphic loci retained vis_loci(output = M.out, stacks_param = "M")
```{bash, eval = FALSE}
for i in {1..3} do
mkdir stacks_n$i
id=1 for sample in $files do /home/d669d153/work/stacks-2.41/ustacks -f ${sample}.fq.gz -o stacks_n$i -i $id -m 3 -M 2 -p 15 let "id+=1" done /home/d669d153/work/stacks-2.41/cstacks -n $i -P stacks_n$i -M pipeline_popmap.txt -p 15 /home/d669d153/work/stacks-2.41/sstacks -P stacks_n$i -M pipeline_popmap.txt -p 15 /home/d669d153/work/stacks-2.41/tsv2bam -P stacks_n$i -M pipeline_popmap.txt -t 15 /home/d669d153/work/stacks-2.41/gstacks -P stacks_n$i -M pipeline_popmap.txt -t 15 /home/d669d153/work/stacks-2.41/populations -P stacks_n$i -M pipeline_popmap.txt --vcf -t 15 done
## Step 8: Visualize the output of these 3 runs and determine the optimal value for n. ```r #optimize n n.out<-optimize_n(nequalsMminus1="/Users/devder/Desktop/hipposideros/n1.vcf", nequalsM="/Users/devder/Desktop/hipposideros/n2.vcf", nequalsMplus1="/Users/devder/Desktop/hipposideros/n3.vcf") #visualize the effect of varying n on the number of SNPs retained vis_snps(output = n.out, stacks_param = "n") #visualize the effect of varying n on the number of polymorphic loci retained vis_loci(output = n.out, stacks_param = "n")
gl<-list() gl[[1]]<-vis_depth(output = m.out) gl[[2]]<-vis_snps(output = m.out, stacks_param = "m") gl[[3]]<-vis_loci(output = m.out, stacks_param = "m") gl[[4]]<-vis_snps(output = M.out, stacks_param = "M") gl[[5]]<-vis_loci(output = M.out, stacks_param = "M") gl[[6]]<-vis_snps(output = n.out, stacks_param = "n") gl[[7]]<-vis_loci(output = n.out, stacks_param = "n") grid.arrange( grobs = gl, widths = c(1,1,1,1,1,1), layout_matrix = rbind(c(1,1,2,2,3,3), c(4,4,4,5,5,5), c(6,6,6,7,7,7)) )
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.