knitr::opts_chunk$set(echo = TRUE)
The file Homo_sapiens.GRCh38.cdna.all.1.1.10M.fa.gz
was downloaded from the ARMOR repository at https://github.com/csoneson/ARMOR/blob/master/example_data/reference/Ensembl.GRCh38.93/Homo_sapiens.GRCh38.cdna.all.1.1.10M.fa.gz.
The equivalence classes (eq_classes.txt
) and transcript estimated counts (quant.sf
) were obtained by aligning the paired-ended reads from the ARMOR repository with STAR and checking the transcripts compatible with each alignment via salmon.
The reads were downloaded from https://github.com/csoneson/ARMOR/tree/master/example_data/FASTQ.
# Download the ARMOR github repository git clone https://github.com/csoneson/ARMOR.git # set the base_dir to the downloaded repo: base_dir="~/ARMOR" # input reads: fastq_files=$base_dir/example_data/FASTQ ################################################################################################################# # RUN STAR alignment and Salmon on the alignment obtained from STAR: ################################################################################################################# # fasta file, reference genome (DNA) fasta=$base_dir/example_data/reference/Ensembl.GRCh38.93/Homo_sapiens.GRCh38.dna.chromosome.1.1.10M.fa # gtf file gtf=$base_dir/example_data/reference/Ensembl.GRCh38.93/Homo_sapiens.GRCh38.93.1.1.10M.gtf # make directory for STAR output: mkdir $base_dir/STAR # folder where we will create a genome index: mkdir $base_dir/STAR/genome_index GDIR=$base_dir/STAR/genome_index # Generate Genome index: STAR --runMode genomeGenerate --runThreadN 4 --genomeDir $GDIR \ --genomeFastaFiles $fasta --sjdbGTFfile $gtf --sjdbOverhang 62 ls $GDIR # sjdbOverhang ideally should be the lenght of the reads -1 (our reads are 63 bps). # output directory mkdir $base_dir/STAR/alignment outDir=$base_dir/STAR/alignment # change directory to the output directory: cd $outDir # align reads with STAR: # --quantMode TranscriptomeSAM is essential to obtain the transcript alignments. STAR --runMode alignReads --runThreadN 4 --genomeDir $GDIR \ --readFilesIn <(zcat $fastq_files/SRR1039508_R1.fastq.gz) <(zcat $fastq_files/SRR1039508_R2.fastq.gz) \ --outFileNamePrefix sample1 --outSAMtype BAM SortedByCoordinate --quantMode TranscriptomeSAM STAR --runMode alignReads --runThreadN 4 --genomeDir $GDIR \ --readFilesIn <(zcat $fastq_files/SRR1039509_R1.fastq.gz) <(zcat $fastq_files/SRR1039509_R2.fastq.gz) \ --outFileNamePrefix sample2 --outSAMtype BAM SortedByCoordinate --quantMode TranscriptomeSAM STAR --runMode alignReads --runThreadN 4 --genomeDir $GDIR \ --readFilesIn <(zcat $fastq_files/SRR1039512_R1.fastq.gz) <(zcat $fastq_files/SRR1039512_R2.fastq.gz) \ --outFileNamePrefix sample3 --outSAMtype BAM SortedByCoordinate --quantMode TranscriptomeSAM STAR --runMode alignReads --runThreadN 4 --genomeDir $GDIR \ --readFilesIn <(zcat $fastq_files/SRR1039513_R1.fastq.gz) <(zcat $fastq_files/SRR1039513_R2.fastq.gz) \ --outFileNamePrefix sample4 --outSAMtype BAM SortedByCoordinate --quantMode TranscriptomeSAM # use gffread to build a reference transcriptome (fasta format) compatible with the DNA fasta and gtf files used for STAR: gffread -w cDNA.fa -g $fasta $gtf # crete a variable to point at the newly created transcriptome fasta file: cdna=$outDir/cDNA.fa # Use salmon on the transcript alignments to compute the equivalence classes: # --dumpEq is essential to obtain the equivalence classes from salmon. $salmon quant -t $cdna -l A -a sample1Aligned.toTranscriptome.out.bam -o sample1 -p 4 --dumpEq $salmon quant -t $cdna -l A -a sample2Aligned.toTranscriptome.out.bam -o sample2 -p 4 --dumpEq $salmon quant -t $cdna -l A -a sample3Aligned.toTranscriptome.out.bam -o sample3 -p 4 --dumpEq $salmon quant -t $cdna -l A -a sample4Aligned.toTranscriptome.out.bam -o sample4 -p 4 --dumpEq # In the output folders, the `quant.sf` files contain the estimated transcript level abundances, while the equivalence classes (and respective counts) are stored in the `aux_info/eq_classes.txt` files.
The equivalence classes (pseudoalignments.ec
) and respective counts counts (pseudoalignments.tsv
) were obtained by aligning the paired-ended reads from the ARMOR repository with kallisto.
Continue after running the code above.
# create a directory for kallisto mkdir $base_dir/kallisto # create a directory for the genome index mkdir $base_dir/kallisto/kallisto_index idx=$base_dir/kallisto/kallisto_index # create a directory for the output of the alignment mkdir $base_dir/kallisto/alignment out_kallisto_alignment=$base_dir/kallisto/alignment # create a directory for the output of the equivalence classes mkdir $base_dir/kallisto/pseudo out_kallisto_pseudo=$base_dir/kallisto/pseudo #Build kallisto index kallisto index -i $idx/transcripts.idxl $cdna # Align reads and quantify transcript abundance with kallisto: kallisto quant -i $idx/transcripts.idx -o $out_kallisto_alignment/sample1 --bias --threads 4 \ $fastq_files/SRR1039508_R1.fastq.gz $fastq_files/SRR1039508_R2.fastq.gz kallisto quant -i $idx/transcripts.idx -o $out_kallisto_alignment/sample2 --bias --threads 4 \ $fastq_files/SRR1039509_R1.fastq.gz $fastq_files/SRR1039509_R2.fastq.gz kallisto quant -i $idx/transcripts.idx -o $out_kallisto_alignment/sample3 --bias --threads 4 \ $fastq_files/SRR1039512_R1.fastq.gz $fastq_files/SRR1039512_R2.fastq.gz kallisto quant -i $idx/transcripts.idx -o $out_kallisto_alignment/sample4 --bias --threads 4 \ $fastq_files/SRR1039513_R1.fastq.gz $fastq_files/SRR1039513_R2.fastq.gz # In the output folders, the `abundance.tsv` files contain the estimated transcript level abundances. # Compute the equivalence classes and respective counts with kallisto: kallisto pseudo -i $idx/transcripts.idx -o $out_kallisto_pseudo/sample1 \ $fastq_files/SRR1039508_R1.fastq.gz $fastq_files/SRR1039508_R2.fastq.gz kallisto pseudo -i $idx/transcripts.idx -o $out_kallisto_pseudo/sample2 \ $fastq_files/SRR1039509_R1.fastq.gz $fastq_files/SRR1039509_R2.fastq.gz kallisto pseudo -i $idx/transcripts.idx -o $out_kallisto_pseudo/sample3 \ $fastq_files/SRR1039512_R1.fastq.gz $fastq_files/SRR1039512_R2.fastq.gz kallisto pseudo -i $idx/transcripts.idx -o $out_kallisto_pseudo/sample4 \ $fastq_files/SRR1039513_R1.fastq.gz $fastq_files/SRR1039513_R2.fastq.gz # In the output folders, the `pseudoalignments.ec` files contain the transcripts forming each equivalence class, while the `pseudoalignments.tsv` files contain the equivalence classes counts.
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.