Step 1: Compile a Raw Database from Input Files

transcript_filename <- "~/IsoPops_Data/all_data.fasta"
abundance_filename <- "~/IsoPops_Data/all_data.abundance.txt"
ORF_filename <- "~/IsoPops_Data/all_data.ORFs.fasta"
gff_filename <- "~/IsoPops_Data/all_data.gff"

rawDB <- compile_raw_db(transcript_filename, abundance_filename, gff_filename, ORF_filename)

Step 2: Filter Off-Target Transcripts and Generate Gene Information

# read 2-column file of IDs and gene names into a table
gene_ID_table <- read.table("~/IsoPops_Data/gene_IDs_and_names.tsv", header = T, stringsAsFactors = F)
# you could also create this table in R manually with the command:
# gene_ID_table <- data.frame(ID = list_of_IDs, Name = list_of_gene_names, stringsAsFactors = F)

# pass in gene table to finish Database processing
DB_all <- process_db(rawDB, gene_ID_table)
# placeholders for gene names used in code snippets below
example_gene1 <- gene_ID_table$Name[17]
example_gene2 <- gene_ID_table$Name[18]
example_gene_family <- gene_ID_table$Name[23:26]
example_gene_family2 <- gene_ID_table$Name[10:11]

Step 3: Optionally, Filter Database By Exon Count, Abundance, or Truncated Transcripts

DB_filter_4ex <- filter_db(DB_all, exon_min_count = 4, abund_cutoff = 1)
DB_filter_4ex_notrunc <- filter_truncations(DB_filter_4ex)
DB_filter_4ex_95perc <- filter_db(DB_filter_4ex_notrunc, 4, 0.95, recalc_abundances = F)

Saving and Loading Databases

This is recommended so that re-running the code is unnecessary, especially if your filtering steps took computational effort.

# save to file
saveRDS(DB_filter_4ex_95perc, "~/IsoPops_Data/DB_filter_4ex_95perc.db")
# later, read Database from saved file
database <- readRDS("~/IsoPops_Data/DB_filter_4ex_95perc.db")

Examples of Filtering + Saving GFFs

# write GffDB portion of a Database to a file (for IGV or other use)
write_gff_data(DB_filter_4ex, "~/IsoPops_Data/filter_4ex.gff")
write_gff_data(DB_filter_4ex_notrunc, "~/IsoPops_Data/filter_4ex_notrunc.gff")
write_gff_data(DB_filter_4ex_95perc, "~/IsoPops_Data/fullfilter.gff")

# filtering and saving can be combined into one action

# filter by exon count + abundance, then save GFF
write_gff_data(filter_db(DB_filter_4ex_notrunc, 4, 0.90), "~/IsoPops_Data/filter_4ex_notrunc_90.gff")
# filter by top X isoforms per gene, then save GFF
write_gff_data(filter_top_gff(DB_filter_4ex_95perc, 10), "~/IsoPops_Data/fullfilter_top10.gff")
write_gff_data(filter_top_gff(DB_filter_4ex_95perc, 25), "~/IsoPops_Data/fullfilter_top25.gff")

Example Plots

plot_N50_N75(DB_filter_4ex_95perc)
# you can choose to look at only a subset of genes, unique ORFs instead of transcripts, or other customizations
plot_N50_N75(DB_filter_4ex_95perc, use_ORFs = T)
plot_N50_N75(DB_filter_4ex_95perc, genes_to_include = example_gene_family, insert_title = "Gene Family Statistics")
plot_counts(DB_filter_4ex_95perc)
plot_counts(DB_filter_4ex_95perc, use_log = T)
plot_counts(DB_filter_4ex_95perc, genes_to_include = example_gene_family)
plot_counts(DB_filter_4ex_95perc, use_counts = "ORFs")
# using the 4+ exons, no-truncations filtered database (without 95% cutoff)
jellyfish_plot(DB_filter_4ex_notrunc)
jellyfish_plot(DB_filter_4ex_notrunc, use_ORFs = T)
plot_exon_dist(DB_filter_4ex_95perc)
plot_exon_dist(DB_filter_4ex_95perc, bin_width = 0.01)
plot_exon_dist(DB_filter_4ex_95perc, example_gene2)
plot_exon_dist(DB_filter_4ex_95perc, example_gene2, sum_dist = F)
plot_treemap(DB_filter_4ex_95perc)
plot_treemap(DB_filter_4ex_95perc, use_ORFs = T)
plot_Shannon_index(DB_filter_4ex_95perc)
plot_Shannon_index(DB_filter_4ex_95perc, use_ORFs = T)
plot_length_dist(DB_filter_4ex_95perc)
plot_length_dist(DB_filter_4ex_95perc, use_ORFs = T, horiz_spread = 0.7)

PCA of Isoform Sequences

# perform PCA on vectorizations of isoform sequences
counts <- get_kmer_counts(DB_filter_4ex_95perc, genes = c(example_gene_family))
pca <- kmer_PCA(DB_filter_4ex_95perc, counts)

# plot isoforms by their first 2 PCs...
plot_PCA(DB_filter_4ex_95perc, pca)
plot_PCA(DB_filter_4ex_95perc, pca, scale_by = "length")
plot_PCA(DB_filter_4ex_95perc, pca, scale_by = "abundance")
# ... or first 3 PCs
plot_3D_PCA(DB_filter_4ex_95perc, pca)
plot_3D_PCA(DB_filter_4ex_95perc, pca, scale_by = "length")
plot_3D_PCA(DB_filter_4ex_95perc, pca, scale_by = "abundance")

Hierarchical Clustering of Isoforms

# cluster isoforms (or ORFs) into dendrograms
# if you don't supply kmer_counts, they will be calculated with default settings
cluster_isoforms(DB_filter_4ex_95perc, genes = example_gene_family2)

# if you create kmer counts first, you can pass them into functions multiple times
# this saves time if your dataset is very large
counts <- get_kmer_counts(DB_filter_4ex_95perc, genes = example_gene_family2)

# evaluate clusterings at different #s of clusters or cut heights
cluster_isoforms(DB_filter_4ex_95perc, counts, num_clusters = 4)
cluster_isoforms(DB_filter_4ex_95perc, counts, cut_height = 0.0002)

PCA and Clustering of ORFs

# everything works with ORF peptide sequences, too
counts <- get_kmer_counts(DB_filter_4ex_95perc, genes = "Nrxn1", use_ORFs = T)
pca <- kmer_PCA(DB_filter_4ex_95perc, counts, use_ORFs = T)

# cluster ORFs
cluster_isoforms(DB_filter_4ex_95perc, counts, use_ORFs = T)

# plot ORFs by their first 2 or 3 PCs
plot_PCA(DB_filter_4ex_95perc, pca, use_ORFs = T)
plot_3D_PCA(DB_filter_4ex_95perc, pca, use_ORFs = T)

t-SNE

# can also visualize isoform populations using t-SNE
# note: this is the same counts data that PCA takes in
counts <- get_kmer_counts(DB_filter_4ex_95perc, genes = c(example_gene_family))

# In 2D:
tsne <- kmer_tSNE(DB_filter_4ex_95perc, counts, iterations = 5000, perplexity = 40, dims = 2, verbose = F)
plot_tSNE(DB_filter_4ex_95perc, tsne)
plot_tSNE(DB_filter_4ex_95perc, tsne, scale_by = "abundance")
# you can force 3D viewing for 2D t-SNE to utilize the plotly UI
plot_tSNE(DB_filter_4ex_95perc, tsne, force_3D = T)
plot_tSNE(DB_filter_4ex_95perc, tsne, scale_by = "abundance", force_3D = T)

# In 3D:
tsne3D <- kmer_tSNE(DB_filter_4ex_95perc, counts, perplexity = 10, dims = 3)
plot_tSNE(DB_filter_4ex_95perc, tsne3D, force_3D = T)

Splicing Heatmaps (like Sashimi Plots)

# look at the alternative splicing event frequencies across one gene
plot_splicing(DB_filter_4ex_95perc, example_gene2)

# for many-isoform genes, show only the top X most abundant isoforms
plot_splicing(DB_filter_4ex_95perc, example_gene2, max_isoforms = 20)

# zoom in on splicing sites of interest, using an interval of %s from left to right...
plot_splicing(DB_filter_4ex_95perc, example_gene2, max_isoforms = 10, zoom_in = c(0.9, 1))
# or using genomic coordinates
plot_splicing(DB_filter_4ex_95perc, example_gene2, max_isoforms = 10, zoom_in = c(64680000, 64694500))

Exon Correlation Heatmaps

# this requires inputting a file of named exon sequences
# shows exon co-splicing correlations
plot_exon_correlations(DB_filter_4ex_95perc, "~/IsoPops_Data/example_exons.tsv", example_gene2, exons_to_include = c("01", "02", "03", "04", "05", "06", "07", "15", "16", "17", "18", "19", "20", "21", "22", "23", "24", "25"))

# can also override weighting the correlation by transcript abundance
# optionally, draw the heatmap symmetrically (showing both triangles)
# you can also plot a histogram of correlations to see overall trends
plot_exon_correlations(DB_filter_4ex_95perc, "~/IsoPops_Data/example_exons.tsv", example_gene2, weighted = F, plot_hist = T, symmetric = T)


kellycochran/IsoDB documentation built on July 7, 2020, 2:17 p.m.