# Read this package's name from its DESCRIPTION file and attach it.
# character.only = TRUE makes library() treat `pkg` as a variable
# holding the package name rather than a literal symbol.
pkg <- read.dcf("../DESCRIPTION", fields = "Package")[1]
library(pkg, character.only = TRUE)
library(EpiPrepare)
library(EpiPrepare)
library(dplyr)

# Root of the TIP-seq data directory on the cluster.
# NOTE: `<-` replaces the original `=` top-level assignments (R idiom).
root <- "/home/bms20/RDS/project/neurogenomics-lab/live/Data/tip_seq"
print(root)

# Directory containing the raw FASTQ files for this sequencing batch.
datadir <- file.path(
  root,
  "raw_data/H3K27ac_K562_cells/phase_1_05_jan_2022/X204SC21121860-Z01-F001/raw_data"
)
print(datadir)
# Optionally list the raw FASTQ files directly:
# files <- list.files(datadir, pattern = ".fastq.gz|.fq.gz",
#                     full.names = TRUE, recursive = TRUE)
# message(length(files)," files retrieved.")

# Build the design files describing every sample found under `root`.
# All batches are kept together and regeneration is forced.
design <- make_design_files(
  root = root,
  split_batches = FALSE,
  omit_processed = FALSE,
  force_new = TRUE
)
Create a conda environment specifically designed for EpiPrepare.
The yaml file specifying all the required packages for this environment is
included within the `echoconda` package, and the environment can automatically
be built as follows. If the environment already exists, building will be skipped.
# Build the "epiprepare" conda env from the yaml bundled with echoconda;
# skipped automatically if an env with that name already exists.
echoconda::yaml_to_env("epiprepare")
# Output directory for FastQC reports.
outdir <- here::here("EpiPrepare_results", "fastqc")

# FIX: `files` was never defined in this document (its definition was
# commented out earlier), so fastqc() would fail with an undefined
# variable. Gather the raw FASTQ paths here instead.
files <- list.files(datadir,
                    pattern = ".fastq.gz|.fq.gz",
                    full.names = TRUE,
                    recursive = TRUE)

# Run FastQC on all raw FASTQ files inside the "epiprepare" conda env.
fastqc_out <- fastqc(files = files,
                     outdir = outdir,
                     conda_env = "epiprepare",
                     threads = 10)

#### Individual samples ####
# Inspect the first sample's FastQC report.
qc <- fastqcr::qc_read(file = grep(".zip$", fastqc_out, value = TRUE)[1])
fastqcr::qc_plot(qc = qc,
                 modules = c("Summary",
                             "Basic Statistics",
                             "Sequence Duplication Levels"))

#### All samples ####
# Aggregate all reports and list failed QC modules per sample.
qc_agg <- fastqcr::qc_aggregate(qc.dir = outdir)
fastqcr::qc_fails(qc_agg)
A "bad gzip" error occurs if zipped files are used, so gunzip them first.
# Trim adapters / low-quality bases with Trim Galore for every sample
# listed in the design table, running inside the "epiprepare" conda env.
outdir <- here::here("EpiPrepare_results", "trimgalore")
trimgalore_out <- trimgalore(design = design,
                             outdir = outdir,
                             conda_env = "epiprepare")
Index and other genome files were obtained through the iGenomes resource (see the AWS-iGenomes GitHub repository).
`--end-to-end` is used instead of the `--local` bowtie2 setting. `--local`
is used to avoid remaining adapter sequence at the 3' ends of reads during mapping.

# Locate the bowtie2 executable inside the "epiprepare" conda env.
bowtie2_ex <- echoconda::find_packages(packages = "bowtie2",
                                       conda_env = "epiprepare")
# Align every sample in the design table with bowtie2.
bowtie2_out <- bowtie2(design = design,
                       outdir = here::here("EpiPrepare_results", "bowtie2"),
                       conda_env = "epiprepare")
### Linux command
datadir="/rds/general/project/neurogenomics-lab/live/GitRepos/CUT_n_TAG/processed_data/HKTCKBBXY"
ref="/rds/general/user/la420/home/hg19/index/genome"
cores=8
sample_name="..."

# Create the output directory tree (run once):
#mkdir -p ${datadir}/alignment/sam/bowtie2_summary
#mkdir -p ${datadir}/alignment/bam
#mkdir -p ${datadir}/alignment/bed
#mkdir -p ${datadir}/alignment/bedgraph

outdir="/rds/general/project/neurogenomics-lab/live/GitRepos/CUT_n_TAG/processed_data/HKTCKBBXY/alignment/sam"

module load bowtie2/2.2.9

# Align trimmed paired-end reads (local alignment, proper pairs only,
# fragment size 10-700 bp); the alignment summary is captured in a
# per-sample log under bowtie2_summary/.
bowtie2 \
    --local \
    --very-sensitive \
    --no-mixed \
    --no-discordant \
    --phred33 \
    -I 10 \
    -X 700 \
    -p ${cores} \
    -x ${ref} \
    -1 ${datadir}/data_trimmed/${sample_name}_val_1.fq.gz \
    -2 ${datadir}/data_trimmed/${sample_name}_val_2.fq.gz \
    -S ${outdir}/${sample_name}_trimmed_bowtie2.sam &> ${outdir}/bowtie2_summary/${sample_name}_trimmed_bowtie2.txt
### Linux command
module load picard/2.6.0

workdir="/rds/general/project/neurogenomics-lab/live/GitRepos/CUT_n_TAG/processed_data/HKTCKBBXY"
picard_command="java -Xmx12G -XX:ParallelGCThreads=5 -XX:GCTimeLimit=50 -jar /apps/picard/2.6.0/picard.jar"
sample_name="..."

# Create output directories (run once):
#mkdir -p $workdir/removeDuplicate
#mkdir -p $workdir/removeDuplicate/picard_summary

## Sort alignments by coordinate.
$picard_command SortSam \
    I=$workdir/alignment/sam/${sample_name}_trimmed_bowtie2.sam \
    O=$workdir/alignment/sam/${sample_name}_trimmed_bowtie2.sorted.sam \
    SORT_ORDER=coordinate

## Mark duplicates (reads kept, flagged only; metrics written out).
$picard_command MarkDuplicates \
    I=$workdir/alignment/sam/${sample_name}_trimmed_bowtie2.sorted.sam \
    O=$workdir/removeDuplicate/${sample_name}_trimmed_bowtie2.sorted.dupMarked.sam \
    METRICS_FILE=$workdir/removeDuplicate/picard_summary/${sample_name}_picard.dupMark.txt

## Remove duplicates entirely.
$picard_command MarkDuplicates \
    I=$workdir/alignment/sam/${sample_name}_trimmed_bowtie2.sorted.sam \
    O=$workdir/removeDuplicate/${sample_name}_trimmed_bowtie2.sorted.rmDup.sam \
    REMOVE_DUPLICATES=true \
    METRICS_FILE=$workdir/removeDuplicate/picard_summary/${sample_name}_picard.rmDup.txt
### Linux command
module load samtools/1.3.1

workdir="/rds/general/project/neurogenomics-lab/live/GitRepos/CUT_n_TAG/processed_data/HKTCKBBXY"
sample_name="..."

#mkdir -p $workdir/alignment/sam/fragmentLen

## Extract the 9th SAM column (template/fragment length) from mapped
## reads, take its absolute value, and tally each length. The count is
## halved because both mates of a pair report the same fragment length.
samtools view -F 0x04 $workdir/removeDuplicate/${sample_name}_trimmed_bowtie2.sorted.rmDup.sam \
    | awk -F'\t' 'function abs(x){return ((x < 0.0) ? -x : x)} {print abs($9)}' \
    | sort \
    | uniq -c \
    | awk -v OFS="\t" '{print $2, $1/2}' \
    > $workdir/alignment/sam/fragmentLen/${sample_name}_rmDup_fragmentLen.txt
### Linux command
module load samtools/1.3.1
module load bedtools/2.25.0

workdir="/rds/general/project/neurogenomics-lab/live/GitRepos/CUT_n_TAG/processed_data/HKTCKBBXY"
sample_name="..."

## Sort the deduplicated SAM by read name rather than coordinate
## (bedtools bamtobed -bedpe requires name-sorted input).
samtools sort -n \
    $workdir/removeDuplicate/${sample_name}_trimmed_bowtie2.sorted.rmDup.sam \
    -o $workdir/alignment/sam/${sample_name}_trimmed_bowtie2.n_sorted.rmDup.sam

## Filter to keep only the mapped read pairs, writing BAM.
samtools view -bS -F 0x04 \
    $workdir/alignment/sam/${sample_name}_trimmed_bowtie2.n_sorted.rmDup.sam \
    > $workdir/alignment/bam/${sample_name}_bowtie2_rmDup.mapped.bam

## Convert the pairs into BEDPE format.
bedtools bamtobed \
    -i $workdir/alignment/bam/${sample_name}_bowtie2_rmDup.mapped.bam \
    -bedpe \
    > $workdir/alignment/bed/${sample_name}_bowtie2_rmDup.bed

## Keep read pairs on the same chromosome with fragment length < 1000 bp.
awk '$1==$4 && $6-$2 < 1000 {print $0}' \
    $workdir/alignment/bed/${sample_name}_bowtie2_rmDup.bed \
    > $workdir/alignment/bed/${sample_name}_bowtie2_rmDup.clean.bed

## Extract chrom/start/end of each whole fragment and sort.
cut -f 1,2,6 $workdir/alignment/bed/${sample_name}_bowtie2_rmDup.clean.bed \
    | sort -k1,1 -k2,2n -k3,3n \
    > $workdir/alignment/bed/${sample_name}_bowtie2_rmDup.fragments.bed
This can be done from either FASTQ or BAM files.
``` {r, engine='bash', eval=FALSE}
module load samtools/1.3.1

bamdir="/rds/general/project/neurogenomics-lab/live/GitRepos/CUT_n_TAG/processed_data/HKTCKBBXY/alignment/bam"
sample_name="..."

# Fraction of reads to keep (.05 = 5%) and the random seed.
# samtools view -s takes a single SEED.FRACTION number, so the seed and
# the fraction are concatenated (e.g. seed=42, fraction=.05 -> 42.05).
read_proportion=".05"
seed="..."

# Subsample the mapped BAM.
# FIX: the original used {sample_name} without the leading "$" in both
# file paths, which expands to the literal string "{sample_name}"
# instead of the variable's value.
samtools view -bs ${seed}${read_proportion} \
    $bamdir/${sample_name}_bowtie2.mapped.bam \
    > $bamdir/${sample_name}_subsample.mapped.bam
# Make (normalized) bedgraph
# Optional, as this does not affect peak calling, only bedgraph values.
# Chrom sizes from:
#   http://hgdownload.cse.ucsc.edu/goldenpath/hg19/bigZips/hg19.chrom.sizes

### Linux command
module load bedtools/2.25.0
module load samtools/1.3.1   # FIX: samtools is used below but was never loaded

workdir="/rds/general/project/neurogenomics-lab/live/GitRepos/CUT_n_TAG/processed_data/HKTCKBBXY/alignment"
chromSize="/rds/general/user/la420/home/hg19/other/hg19.chrom.sizes"
sample_name="..."

# Total number of (paired-end) reads, i.e. 2x the number of fragments.
# FIX: $workdir already ends in "/alignment", so the original path
# "$workdir/alignment/sam/..." pointed at a non-existent
# ".../alignment/alignment/sam" directory; corrected to "$workdir/sam/...".
seqDepthDouble=`samtools view -F 0x04 $workdir/sam/${sample_name}_trimmed_bowtie2.sam | wc -l`
seqDepth=$((seqDepthDouble/2))

if [[ "$seqDepth" -gt "1" ]]; then
    # Scale coverage to reads-per-million.
    scale_factor=`echo "1000000 / $seqDepth" | bc -l`
    echo "Scaling factor for $sample_name is: $scale_factor"
    bedtools genomecov -bg -scale $scale_factor \
        -i $workdir/bed/${sample_name}_bowtie2_rmDup.fragments.bed \
        -g $chromSize \
        > $workdir/bedgraph/${sample_name}_norm_bowtie2_rmDup.fragments.bedgraph
fi
*No IgG control was available, so SEACR's "non" (non-normalized) option is used.
### Linux command
module load anaconda3/personal
source activate seacr_env

seacr="bash $HOME/anaconda3/envs/seacr_env/bin/SEACR_1.3.sh"
workdir="/rds/general/project/neurogenomics-lab/live/GitRepos/CUT_n_TAG/processed_data/HKTCKBBXY"
sample_name="..."

## Call peaks on the top 1% of regions by AUC. No IgG control is
## available, so the numeric-threshold form with "non" mode is used.
## NOTE(review): this reads the non-normalized bedgraph name; confirm it
## matches the bedgraph file actually produced upstream.
$seacr \
    $workdir/alignment/bedgraph/${sample_name}_bowtie2_rmDup.fragments.bedgraph \
    0.01 non stringent \
    $workdir/peakCalling/SEACR/${sample_name}_seacr_top0.01.peaks
Prepare bigwig file:
### Linux command
module load anaconda3/personal
source activate deepenv
module load samtools/1.3.1

bamCoverage="$HOME/anaconda3/envs/deepenv/bin/bamCoverage"
workdir="/rds/general/project/neurogenomics-lab/live/GitRepos/CUT_n_TAG/processed_data/HKTCKBBXY"
sample_name="Diagenode_S1"

#mkdir $workdir/alignment/bigwig

## Coordinate-sort and index the deduplicated BAM (both required by
## bamCoverage), then convert coverage to bigwig.
samtools sort \
    -o $workdir/alignment/bam/${sample_name}_rmDup.sorted.bam \
    $workdir/alignment/bam/${sample_name}_bowtie2_rmDup.mapped.bam
samtools index $workdir/alignment/bam/${sample_name}_rmDup.sorted.bam
$bamCoverage \
    -b $workdir/alignment/bam/${sample_name}_rmDup.sorted.bam \
    -o $workdir/alignment/bigwig/${sample_name}_rmDup_raw.bw
hg19 genes from: https://hgdownload.soe.ucsc.edu/goldenPath/hg19/bigZips/genes/hg19.ncbiRefSeq.gtf.gz
### Linux command
module load anaconda3/personal
source activate deepenv

workdir="/rds/general/project/neurogenomics-lab/live/GitRepos/CUT_n_TAG/processed_data/HKTCKBBXY"
plotHeatmap="$HOME/anaconda3/envs/deepenv/bin/plotHeatmap"
computeMatrix="$HOME/anaconda3/envs/deepenv/bin/computeMatrix"
sample_1="..."
sample_2="..."
cores=8

#mkdir -p $workdir/heatmap

## Score both samples' coverage over all hg19 transcription units
## (3 kb flanks, gene bodies scaled to 5 kb).
computeMatrix scale-regions \
    -S $workdir/alignment/bigwig/${sample_1}_rmDup_raw.bw \
       $workdir/alignment/bigwig/${sample_2}_rmDup_raw.bw \
    -R $HOME/hg19/other/hg19.ncbiRefSeq.gtf.gz \
    --beforeRegionStartLength 3000 \
    --regionBodyLength 5000 \
    --afterRegionStartLength 3000 \
    --skipZeros \
    -o $workdir/heatmap/matrix_gene.mat.gz \
    -p $cores

## FIX: the original read the matrix from
## $workdir/heatmap/hg19_gene/matrix_gene.mat.gz, which does not match
## the computeMatrix output path above; corrected to the actual output.
plotHeatmap \
    -m $workdir/heatmap/matrix_gene.mat.gz \
    -out $workdir/heatmap/heatmap_transcription_units.png \
    --sortUsing sum
### Linux command
module load anaconda3/personal
source activate deepenv

workdir="/rds/general/project/neurogenomics-lab/live/GitRepos/CUT_n_TAG/processed_data/HKTCKBBXY"
plotHeatmap="$HOME/anaconda3/envs/deepenv/bin/plotHeatmap"
computeMatrix="$HOME/anaconda3/envs/deepenv/bin/computeMatrix"
sample_name="..."
cores=8

## Convert SEACR's summit column ("chr:start-end", field 6) into a
## 3-column BED of summit regions.
awk '{split($6, summit, ":"); split(summit[2], region, "-"); print summit[1]"\t"region[1]"\t"region[2]}' \
    $workdir/peakCalling/SEACR/${sample_name}_seacr_top0.01.peaks.stringent.bed \
    > $workdir/peakCalling/SEACR/${sample_name}_seacr_top0.01.peaks.summitRegion.bed

## Coverage matrix centred on each peak summit (+/- 3 kb).
computeMatrix reference-point \
    -S $workdir/alignment/bigwig/${sample_name}_rmDup_raw.bw \
    -R $workdir/peakCalling/SEACR/${sample_name}_seacr_top0.01.peaks.summitRegion.bed \
    --skipZeros \
    -o $workdir/peakCalling/SEACR/${sample_name}_SEACR.mat.gz \
    -p $cores \
    -a 3000 -b 3000 \
    --referencePoint center

plotHeatmap \
    -m $workdir/peakCalling/SEACR/${sample_name}_SEACR.mat.gz \
    -out $workdir/peakCalling/SEACR/${sample_name}_SEACR_heatmap.png \
    --sortUsing sum \
    --startLabel "Peak Start" --endLabel "Peak End" \
    --xAxisLabel "" --regionsLabel "Peaks" \
    --samplesLabel "${sample_name}"
### Linux command
module load anaconda3/personal
source activate macs2_env

workdir="/rds/general/project/neurogenomics-lab/live/GitRepos/CUT_n_TAG/processed_data/HKTCKBBXY"
out_dir="${workdir}/peakCalling"

# FIX: the original mkdir lines referenced $outdir, which is never
# defined here (the variable is $out_dir).
#mkdir -p $out_dir
#mkdir -p $out_dir/MACS2

sample_name="..."

## MACS2 peak calling on the deduplicated, coordinate-sorted BAM
## (paired-end mode, human effective genome size, no shifting model,
## all duplicates kept because deduplication already happened upstream).
macs2 callpeak \
    -t $workdir/alignment/bam/${sample_name}_rmDup.sorted.bam \
    -q 0.05 \
    -n ${sample_name}_q0.05 \
    -f BAMPE \
    -g hs \
    --keep-dup all \
    --nomodel \
    --outdir $out_dir/MACS2
module load anaconda3/personal
source activate homerenv

workdir="/rds/general/project/neurogenomics-lab/live/GitRepos/CUT_n_TAG/processed_data/HKTCKBBXY"
peakdir="${workdir}/peakCalling"
outdir="${workdir}/motifs"
#mkdir -p $outdir

caller="..."
sample_name="..."
genome="..."

# Need a separate output folder per sample.
#mkdir -p $outdir/${sample_name}

## HOMER motif enrichment on the sample's peak file.
## FIX: the original passed $out_dir (undefined; the variable defined
## above is $outdir) and ignored the per-sample folder created above;
## results now go to $outdir/${sample_name}.
findMotifsGenome.pl \
    $peakdir/${caller}/${sample_name}_${caller}.bed \
    $genome \
    $outdir/${sample_name} \
    -size given
### Linux command
#PBS -l walltime=72:00:00
#PBS -l select=1:ncpus=32:mem=124gb

module load anaconda3/personal
source activate bedtools_env

abdir="/rds/general/user/la420/home/CUTnTAG/antibodies"
corrdir="${abdir}/correlation"
bamdir="${abdir}/bam/sorted_bam"
beddir="${corrdir}/ENCODE_peak_files"
ENCODE_bamdir="${corrdir}/ENCODE_peak_files/bam/sorted_bam/fixed"
old_ENCODE_bamdir="${corrdir}/ENCODE_peak_files/bam/sorted_bam"
outdir="${corrdir}/all_marks"
mergedbeddir="${abdir}/all_peaks/merged"
hg19_dir="/rds/general/user/la420/home/hg19/other"
#mark="H3K27ac_ENCFF044JNJ"
#mark="H3K27me3_ENCFF126QYP"

## Count reads from every antibody / ENCODE BAM within each 500 bp
## genome-wide window (input for cross-sample correlation analysis).
bedtools multicov -bams \
    $bamdir/Abcam-ab177178_rmDup.sorted.fixed2.bam \
    $bamdir/Abcam-ab4729_rmDup.sorted.fixed2.bam \
    $bamdir/ActiveMotif_rmDup.sorted.fixed2.bam \
    $bamdir/Diagenode_100x_rmDup.sorted.fixed2.bam \
    $bamdir/Diagenode_50x_rmDup.sorted.fixed2.bam \
    $bamdir/H3K27me3_rmDup.sorted.fixed2.bam \
    $bamdir/H3K27ac_SRR8383507_rmDup.sorted.fixed2.bam \
    $bamdir/H3K27ac_SRR8383508_rmDup.sorted.fixed2.bam \
    $bamdir/H3K27me3_SRR11074238_rmDup.sorted.fixed2.bam \
    $bamdir/H3K27me3_SRR11074239_rmDup.sorted.fixed2.bam \
    $bamdir/ActiveMotif_H3K27ac_rmDup.sorted.fixed2.bam \
    $bamdir/H3K27ac_SRR8581604_CnR_rmDup.sorted.fixed2.bam \
    $bamdir/H3K27me3_SRR907370_CnR_rmDup.sorted.fixed2.bam \
    $old_ENCODE_bamdir/H3K27ac_ENCFF384ZZM_sorted.bam \
    $old_ENCODE_bamdir/H3K27me3_ENCFF676ORH_sorted.bam \
    $ENCODE_bamdir/H3K4me1_ENCFF196QGZ_sorted.fixed.bam \
    $ENCODE_bamdir/H3K4me3_ENCFF915MJO_sorted.fixed.bam \
    $ENCODE_bamdir/H3K9ac_ENCFF204DNG_sorted.fixed.bam \
    $ENCODE_bamdir/H3K9me3_ENCFF805FLY_sorted.fixed.bam \
    $ENCODE_bamdir/H3K36me3_ENCFF190AQC_sorted.fixed.bam \
    $ENCODE_bamdir/DNase_ENCFF826DJP_sorted.fixed.bam \
    -bed $hg19_dir/hg19_500bp_windows.bed \
    > $outdir/hg19_500bp_overlaps.txt
## !!!! need to merge at sample level as well !!!!

# Path to the bulk TIP-seq sample metadata spreadsheet.
# NOTE: `<-` replaces the original `=` top-level assignment (R idiom).
metadata_path <- file.path(
  "/home/bms20/RDS/project/neurogenomics-lab",
  "live/Projects/TIPseq/reports/Bulk TIP-seq samples_V2.xlsx"
)
# meta <- readxl::read_excel(metadata_path, skip = 6)

# Map each sequencing batch in the design table to its sample-level
# metadata from the spreadsheet.
sample_dict <- make_sample_dict(
  batch_list = unique(design$batch),
  metadata_path = metadata_path
)
# Record R session details (R version, platform, loaded package
# versions) for reproducibility.
utils::sessionInfo()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.