knitr::opts_chunk$set( collapse = TRUE, comment = "#>", fig.path = "man/figures/README-", out.width = "100%", results = FALSE, message = FALSE, warning = FALSE )
(This is a draft)
lrRNAseqBenchmark
is as R
package that facilitates the validation of genetic variants called from long-read RNA-seq (lrRNA-seq) data by one or more variant callers, by comparing their VCF files with a ground-truth VCF file (commonly generated from high-coverage short-read data).
lrRNAseqBenchmark
constructs a big data.frame (master table) that contains information about the variants called by the methods to be validated and the variants in the ground truth. From this master table, many plots can be made by different functions available in the package (ggplot2
-base packages used internally), which allow the visial analyses including variant proximity to splice junctions, variants in homopolymers, and variants from allele specific expressed (ASE) genes.
To install this package just type in your R
devtools::install_github("vladimirsouza/lrRNAseqBenchmark@main")
For a full list and description of all functions available in this package, click here.
For less computational efforts, we use data only from chromosome 22.
Generate a master table to compare and validate the variant calls from Iso-Seq data when using variant caller DeepVariant alone and DeepVariant combined with SplitNCigarReads and flagCorrection. The ground-truth VCF was generated by GATK pipeline on high-coverage short-read DNA data. We treat these two pipelines as methods.
SplitNCigarReads
and flagCorrection
in ORIGINAL_BAM
.ORIGINAL_BAM
.SNCR+fC+DeepVariant_VCF
.### methods to validate # name of the methods to validate METHOD_NAMES <- c("dv", "sncr_fc_dv") # name of the dataset used with the methods to validate METHOD_DATASET_NAME <- "isoSeq_chr21_22" # VCF files METHOD_VCF_FILES <- c( "/home/vbarbo/master_table_example/data/deepvariant_chr21_22.vcf", "/home/vbarbo/master_table_example/data/sncr_fc_deepvariant_chr21_22.vcf" ) # BAM of the data METHOD_BAM_FILE <- "/home/vbarbo/master_table_example/data/isoseq_chr21_22.bam" ### ground-truth # name TRUTH_NAME <- "truth" # name of the dataset used to generate the ground-truth TRUTH_DATASET_NAME <- "shortRead_chr21_22" # VCF file TRUTH_VCF_FILE <- "/home/vbarbo/master_table_example/data/ground_truth_chr21_22.vcf" # BAM file TRUTH_BAM_FILE <- "/home/vbarbo/master_table_example/data/short_read_chr21_22.bam" ### variables # the maximum distance (number of bases) of a variant from a splice junction to be # consider near the splice junction MAX_DIST_FROM_SPLICE_SITE <- 20 # number of threads to use THREADS <- 10 GENOME_REF_FILE <- "/home/vbarbo/master_table_example/data/chr21_22.fa"
library(lrRNAseqBenchmark) library(GenomicAlignments) library(dplyr) library(sarlacc) library(snakecase) library(ggplot2) library(readr)
Get variant positions and which method could call them.
dat <- initiate_master_table( METHOD_VCF_FILES[1], METHOD_VCF_FILES[2], TRUTH_VCF_FILE, method_names=c(METHOD_NAMES, TRUTH_NAME) )
dat <- add_info_tag_from_vcf(dat, "qd_truth", "QD", TRUTH_VCF_FILE)
dat <- add_qual_from_vcf(dat, "dv", METHOD_VCF_FILES[1]) dat <- add_qual_from_vcf(dat, "sncrFcDv", METHOD_VCF_FILES[2])
method_bam <- readGAlignments(METHOD_BAM_FILE) splice_sites <- get_splice_sites_info(method_bam, THREADS) dat <- add_splice_site_info_to_master_table( dat, splice_sites, MAX_DIST_FROM_SPLICE_SITE )
Add iso-seq read coverage.
method_coverage <- coverage(METHOD_BAM_FILE) dat <- add_read_coverage_from_bam_to_master_table( dat, method_coverage, METHOD_DATASET_NAME )
Add short-read coverage (ground truth).
### this takes a long time to run truth_coverage <- coverage(TRUTH_BAM_FILE) dat <- add_read_coverage_from_bam_to_master_table( dat, truth_coverage, TRUTH_DATASET_NAME )
dat <- add_number_of_n_cigar_reads_to_master_table( dat, method_bam, METHOD_DATASET_NAME )
### method 1 dat <- add_method_vs_truth_comparison_to_master_table( dat, METHOD_NAMES[1], TRUTH_NAME ) ### method 2 dat <- add_method_vs_truth_comparison_to_master_table( dat, METHOD_NAMES[2], TRUTH_NAME )
### ground truth dat <- add_variant_type_to_master_table(dat, TRUTH_VCF_FILE, TRUTH_NAME) ### methods dat <- add_variant_type_to_master_table(dat, METHOD_VCF_FILES[1], METHOD_NAMES[1]) dat <- add_variant_type_to_master_table(dat, METHOD_VCF_FILES[2], METHOD_NAMES[2])
### truth dat <- is_indel_method(dat, TRUTH_NAME) ### dv dat <- is_indel_method(dat, TRUTH_NAME, "dv") ### dv_s dat <- is_indel_method(dat, TRUTH_NAME, "sncr_fc_dv")
### TRUTH_NAME dat <- add_variant_density_of_a_method(dat, 201, TRUTH_NAME) ### all methods dat <- add_variant_density_of_a_method(dat, 201, METHOD_NAMES)
dat <- add_format_tag_from_vcf(dat, TRUTH_NAME, "GT", TRUTH_VCF_FILE) dat <- add_format_tag_from_vcf(dat, "dv", "GT", METHOD_VCF_FILES[1]) dat <- add_format_tag_from_vcf(dat, "sncr_fc_dv", "GT", METHOD_VCF_FILES[2]) sel_cols <- paste0( "gt_", c(TRUTH_NAME, METHOD_NAMES) ) k <- is.na( dat[,sel_cols] ) dat[,sel_cols] [k] <- "0/0"
# get all homopolymers of the reference genome genome_ref <- readDNAStringSet(GENOME_REF_FILE) names(genome_ref) <- sub("(^chr[0-9]+|X|Y).*", "\\1", names(genome_ref)) homopolymers <- homopolymerFinder(genome_ref) dat <- add_homopolymer_length_when_indels(dat, homopolymers)
head(dat)
Sites to keep:
short-read variant density: x <= 3 (ignore positions)
ignore intronic regions (isoSeq_coverage > 0) (ignore positions)
DeepVariant QUAL: x >= 15 (remove method calling)
short-read coverare : 20 reads <= x <= 95% quantil (ignore positions)
# no filtering dat1 <- dat dim(dat1) shortread_cover_quantiles <- quantile(dat1$shortRead_chr21_22_coverage, probs=.95) # if a region contains high variant density, probably this is because of a bad alignmnet. # we filter out variants in 201bp regions with more than 3 variants in the ground truth k <- dat1$variantDensity_truth <= 3 dat1 <- dat1[k,] dim(dat1) # ignore intronic regions (iso-seq coverage equal to zero) dat1 <- filter(dat1, isoSeq_chr21_22_coverage>0) dim(dat1) # ignore regions based on short-read coverage quantiles: keep 20 reads <= x <= 95% percentile dat1 <- filter(dat1, shortRead_chr21_22_coverage >= 20 & shortRead_chr21_22_coverage <= shortread_cover_quantiles) dim(dat1) # Filter out DeepVariant calls by QUAL -- but keep the position # dv k <- which( dat1$qual_dv < 15 ) dat1$in_dv[k] <- 0 dat1 <- add_method_vs_truth_comparison_to_master_table( dat1, METHOD_NAMES[1], TRUTH_NAME, replace_column=TRUE ) # sncr_fc_dv k <- which( dat1$qual_sncrFcDv < 15 ) dat1$in_sncr_fc_dv[k] <- 0 dat1 <- add_method_vs_truth_comparison_to_master_table( dat1, METHOD_NAMES[2], TRUTH_NAME, replace_column=TRUE )
In this section we use functions available in package lrRNAseqBenchmark
to validate the used methods/pipelines.
Calculate precison measures according to different minimum Iso-Seq read coverage threshold.
# define different minimum coverage threshold min_coverage <- c(3, 15, 40, 100) output_method_names <- c("DeepVariant", "SNCR+fC+DeepVariant") experiment_names <- "Experiment 1" ### calculate precision dat_plot <- calculate_precision_recall_for_multi_master_tables( dat1, experiment_names = experiment_names, method_names = METHOD_NAMES, output_method_names = output_method_names, data_names = METHOD_DATASET_NAME, truth_names = TRUTH_NAME, coverage_thresholds = min_coverage, what = "snps_indels" ) ### plot the results ggplot(dat_plot, aes(recall, precision, colour=method)) + facet_grid(experiment~variant) + geom_point(aes(size=`coverage >= n`), alpha=.5) + geom_path(size=1.2, alpha=.5) + theme(legend.position="bottom", legend.box="vertical") + coord_fixed(ratio=1, xlim=c(0.24, 0.98), ylim=c(0.69, 0.96)) + xlab("Recall") + ylab("Precision") + labs(colour="Method", size="Minimum Iso-Seq read coverage") + scale_x_continuous(breaks=seq(.2, 1, .1), labels=seq(.2, 1, .1)) + guides(colour = guide_legend(order=1)) + NULL
Here, we consider a variant near to a splice junction when it's not further than 20 bases.
variant_type <- "snp" min_coverage <- 20 sj_proximity_snps <- splice_junction_analysis_table(dat1, experiment_names = experiment_names, truth_names = TRUTH_NAME, method_dataset_name = METHOD_DATASET_NAME, method_names = METHOD_NAMES, output_method_names = output_method_names, variant_type = variant_type, min_isoseq_coverage = min_coverage) ggplot( sj_proximity_snps$acc_sj, aes(x=is_near, y=Score, group=Measures, colour=Measures) ) + facet_grid(experiment~Method) + theme(strip.text = element_text(size = 14)) + geom_point(size=4, alpha=.5) + geom_line(size=1.2, alpha=.5) + xlab("SNP candidates are near a splice junction") + theme(text = element_text(size=18)) + geom_text( data=sj_proximity_snps$n_text, mapping= aes(x=x, y=y, label=label), size=5 ) + scale_y_continuous(breaks=seq(0, 1, .25)) + guides(colour = guide_legend(override.aes = list(size = 1, shape = 11))) + theme(legend.position="bottom") + NULL
### for higher variant quality, we also filter out variants in high-variant-density regions according the calls of the methods to be compared. dat2 <- filter(dat1, variantDensity_dv_sncrFcDv <= 3) dim(dat2) ### and filter out variants near splice junctions dat2 <- filter(dat2, is_near_ss==0) dim(dat2) ### for each pipeline, make a table that stores indels and the homopolymer length where they are inside. ### if a indel is outside homopolymers, the returned homopolymer length is equal to 1. ### keep only variants that are: ### * in Iso-Seq coverage >= 20 reads; ### * classified as heterozygous alternative; ### * indels. # indels called by DeepVaraint (alone) dat_hom_dv <- method_homopolymer_indels(input_table = dat2, first_method_name = TRUTH_NAME, second_method_name = METHOD_NAMES[1], vcf_first = TRUTH_VCF_FILE, vcf_second = METHOD_VCF_FILES[1], method_dataset_name = METHOD_DATASET_NAME, homopolymers = homopolymers, ref_fasta_seqs = genome_ref, min_isoseq_coverage = 20, genotyped_alt = "find") # indels called by SNCR+fC+DeepVaraint dat_hom_sncrFcDv <- method_homopolymer_indels(input_table = dat2, first_method_name = TRUTH_NAME, second_method_name = METHOD_NAMES[2], vcf_first = TRUTH_VCF_FILE, vcf_second = METHOD_VCF_FILES[2], method_dataset_name = METHOD_DATASET_NAME, homopolymers = homopolymers, ref_fasta_seqs = genome_ref, min_isoseq_coverage = 20, genotyped_alt = "find") ### get the data to make the plot hom_length_intervals <- c(1, 2, 8) interval_names <- c("non-hp", "2-5", ">=8") dat_homopolymer <-list(class_counts=NULL, dat_text=NULL) # DeepVaraint (alone), deletions k <- make_homopolymer_table_to_plot( input_hom_table = dat_hom_dv, variant_type = "deletion", method_name = METHOD_NAMES[1], truth_name = TRUTH_NAME, hom_length_intervals = hom_length_intervals, interval_names = interval_names, to_calculate = "pre_rec_f1", output_method_name = output_method_names[1] ) dat_homopolymer <- mapply(rbind, dat_homopolymer, k, SIMPLIFY=FALSE) # DeepVaraint (alone), insertions k <- make_homopolymer_table_to_plot( input_hom_table = dat_hom_dv, variant_type = "insertion", method_name = METHOD_NAMES[1], truth_name = TRUTH_NAME, hom_length_intervals = hom_length_intervals, interval_names = interval_names, to_calculate = "pre_rec_f1", output_method_name = output_method_names[1] ) dat_homopolymer <- mapply(rbind, dat_homopolymer, k, SIMPLIFY=FALSE) # DeepVaraint (alone), deletions k <- make_homopolymer_table_to_plot( input_hom_table = dat_hom_sncrFcDv, variant_type = "deletion", method_name = METHOD_NAMES[2], truth_name = TRUTH_NAME, hom_length_intervals = hom_length_intervals, interval_names = interval_names, to_calculate = "pre_rec_f1", output_method_name = output_method_names[2] ) dat_homopolymer <- mapply(rbind, dat_homopolymer, k, SIMPLIFY=FALSE) # DeepVaraint (alone), insertions k <- make_homopolymer_table_to_plot( input_hom_table = dat_hom_sncrFcDv, variant_type = "insertion", method_name = METHOD_NAMES[2], truth_name = TRUTH_NAME, hom_length_intervals = hom_length_intervals, interval_names = interval_names, to_calculate = "pre_rec_f1", output_method_name = output_method_names[2] ) dat_homopolymer <- mapply(rbind, dat_homopolymer, k, SIMPLIFY=FALSE) ggplot(dat_homopolymer$class_counts, aes(x=homopolymer_length_intervals, y=percent, group=Measures, colour=Measures)) + facet_grid(variant_type~method) + theme(strip.text = element_text(size = 14)) + geom_point(size=4, alpha=.5) + geom_line(size=1.2, alpha=.5) + # xlab("Variant within a homopolymer of length l") + # xlab("Homopolymer length") + # ylab("Score") + theme(text = element_text(size=20)) + geom_text( data=dat_homopolymer$dat_text, mapping= aes(x=x, y=y, label=label), size=5 ) + guides(colour = guide_legend(override.aes = list(size = 1, shape = 11))) + theme(legend.position="bottom") + NULL
(at the moment, there is not a ready-to-use function for this analysis)
To identify ASE genes, first we use GATK's ASEReadCounter function to get a table with read counts per allele. For the analysis presented in this README, this table can be downloaded here.
Once got the table of counts, the chi-squared test is used to identify ASE sites.
### inputs ALLELE_COUNTS_FILE <- "/home/vbarbo/master_table_example/ase_table_count/ase_read_count_tables/ase_read_count_all.table" ### load allele counts allele_counts <- read_tsv(ALLELE_COUNTS_FILE) ### add column of p-values for ase identification ase_chiSquare <- mapply(function(ref, alt){ suppressWarnings( tryCatch( chisq.test( c(ref, alt) ) $p.value, error=function(e){NA} ) ) }, allele_counts$refCount, allele_counts$altCount) ase_chiSquare_adj <- p.adjust(ase_chiSquare, method="BH") allele_counts <- mutate(allele_counts, ase_chiSquare_adj) ### add ase q-value to the master table allele_counts <- rename(allele_counts, "chrm"="contig", "pos"="position") allele_p <- subset(allele_counts, select=c("chrm", "pos", "refCount", "altCount", "totalCount", "otherBases", "ase_chiSquare_adj")) dat3 <- merge(dat1, allele_p, by=c("chrm", "pos"), all.x=TRUE, sort=FALSE) ### keep only variants that are SNPs and heterozygous (according to the ### ground truth), and have a p-value for ASE identification associated to ### them dat3 <- filter(dat3, variantType_truth=="snp" & gt_truth=="0/1") dat3 <- filter(dat3, !is.na(ase_chiSquare_adj)) ### keep only variants with reasonable read coverage in the ground truth and ### in the rna short read data dat3 <- filter(dat3, isoSeq_chr21_22_coverage>=10) dat3 <- filter(dat3, totalCount>=40) ### chi-square test to test whether the proportions of FNs between ASE and ### non-ASE variants are different # for DeepVariant (alone) k <- table(dat3$dv_classification, dat3$ase_chiSquare_adj<.05) k <- k[rownames(k) %in% c("FN", "TP"),] chisq.test(k)$p.value # for SNCR+fC+DeepVariant k <- table(dat3$sncr_fc_dv_classification, dat3$ase_chiSquare_adj<.05) k <- k[rownames(k) %in% c("FN", "TP"),] chisq.test(k)$p.value
Make a plot to compare the performance of the methods from ASE- and non-ASE sites.
### create the tables from which ggplot2 makes the plots calcFreq <- function(d){ d$Ratio <- 0 d$Ratio[1] <- d$Freq[1] / sum(d$Freq[1:2]) d$Ratio[2] <- d$Freq[2] / sum(d$Freq[1:2]) d$Ratio[3] <- d$Freq[3] / sum(d$Freq[3:4]) d$Ratio[4] <- d$Freq[4] / sum(d$Freq[3:4]) d } # DeepVariant (alone) dat_ase <- table(dat3$dv_classification, dat3$ase_chiSquare_adj<.05) dat_ase <- dat_ase[c("FN", "TP"),] dat_ase <- as.data.frame(dat_ase) names(dat_ase) <- c("Classification", "ASE_Identification", "Frequency") dat_ase$ASE_Identification <- ifelse( as.logical(dat_ase$ASE_Identification), "ASE", "Non-ASE" ) %>% factor dat_ase_dv <- calcFreq(dat_ase) # SNCR+flagCorrecion+DeepVariant dat_ase <- table(dat3$sncr_fc_dv_classification, dat3$ase_chiSquare_adj<.05) dat_ase <- dat_ase[c("FN", "TP"),] dat_ase <- as.data.frame(dat_ase) names(dat_ase) <- c("Classification", "ASE_Identification", "Frequency") dat_ase$ASE_Identification <- ifelse( as.logical(dat_ase$ASE_Identification), "ASE", "Non-ASE") %>% factor dat_ase_dvSFc <- calcFreq(dat_ase) ### put all tables together in a single table dat_ase_dv$method <- "DeepVariant" dat_ase_dvSFc$method <- "SNCR+fC+DeepVariant" dat_ase <- rbind(dat_ase_dv, dat_ase_dvSFc) k <- c("DeepVariant", "SNCR+fC+DeepVariant") dat_ase$method <- factor(dat_ase$method, levels=k, ordered=TRUE) ### draw the plot ggplot(dat_ase, aes(x=ASE_Identification, y=Ratio, fill=Classification)) + facet_grid(.~method) + geom_bar(stat="identity", position=position_dodge()) + geom_text(aes(label=round(Ratio,2)), vjust=-0.1, color="gray40", position = position_dodge(0.9), size=5)+ theme(text = element_text(size = 20)) + ylim(0,1) + xlab("Allele specific expression identification") + NULL
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.