transcript_filename <- "~/IsoPops_Data/all_data.fasta" abundance_filename <- "~/IsoPops_Data/all_data.abundance.txt" ORF_filename <- "~/IsoPops_Data/all_data.ORFs.fasta" gff_filename <- "~/IsoPops_Data/all_data.gff" rawDB <- compile_raw_db(transcript_filename, abundance_filename, gff_filename, ORF_filename)
# read 2-column file of IDs and gene names into a table gene_ID_table <- read.table("~/IsoPops_Data/gene_IDs_and_names.tsv", header = T, stringsAsFactors = F) # you could also create this table in R manually with the command: # gene_ID_table <- data.frame(ID = list_of_IDs, Name = list_of_gene_names, stringsAsFactors = F) # pass in gene table to finish Database processing DB_all <- process_db(rawDB, gene_ID_table)
# placeholders for gene names used in code snippets below example_gene1 <- gene_ID_table$Name[17] example_gene2 <- gene_ID_table$Name[18] example_gene_family <- gene_ID_table$Name[23:26] example_gene_family2 <- gene_ID_table$Name[10:11]
DB_filter_4ex <- filter_db(DB_all, exon_min_count = 4, abund_cutoff = 1) DB_filter_4ex_notrunc <- filter_truncations(DB_filter_4ex) DB_filter_4ex_95perc <- filter_db(DB_filter_4ex_notrunc, 4, 0.95, recalc_abundances = F)
This is recommended so that re-running the code is unnecessary, especially if your filtering steps took computational effort.
# save to file saveRDS(DB_filter_4ex_95perc, "~/IsoPops_Data/DB_filter_4ex_95perc.db") # later, read Database from saved file database <- readRDS("~/IsoPops_Data/DB_filter_4ex_95perc.db")
# write GffDB portion of a Database to a file (for IGV or other use) write_gff_data(DB_filter_4ex, "~/IsoPops_Data/filter_4ex.gff") write_gff_data(DB_filter_4ex_notrunc, "~/IsoPops_Data/filter_4ex_notrunc.gff") write_gff_data(DB_filter_4ex_95perc, "~/IsoPops_Data/fullfilter.gff") # filtering and saving can be combined into one action # filter by exon count + abundance, then save GFF write_gff_data(filter_db(DB_filter_4ex_notrunc, 4, 0.90), "~/IsoPops_Data/filter_4ex_notrunc_90.gff") # filter by top X isoforms per gene, then save GFF write_gff_data(filter_top_gff(DB_filter_4ex_95perc, 10), "~/IsoPops_Data/fullfilter_top10.gff") write_gff_data(filter_top_gff(DB_filter_4ex_95perc, 25), "~/IsoPops_Data/fullfilter_top25.gff")
plot_N50_N75(DB_filter_4ex_95perc) # you can choose to look at only a subset of genes, unique ORFs instead of transcripts, or other customizations plot_N50_N75(DB_filter_4ex_95perc, use_ORFs = T) plot_N50_N75(DB_filter_4ex_95perc, genes_to_include = example_gene_family, insert_title = "Gene Family Statistics")
plot_counts(DB_filter_4ex_95perc) plot_counts(DB_filter_4ex_95perc, use_log = T) plot_counts(DB_filter_4ex_95perc, genes_to_include = example_gene_family) plot_counts(DB_filter_4ex_95perc, use_counts = "ORFs")
# using the 4+ exons, no-truncations filtered database (without 95% cutoff) jellyfish_plot(DB_filter_4ex_notrunc) jellyfish_plot(DB_filter_4ex_notrunc, use_ORFs = T)
plot_exon_dist(DB_filter_4ex_95perc) plot_exon_dist(DB_filter_4ex_95perc, bin_width = 0.01) plot_exon_dist(DB_filter_4ex_95perc, example_gene2) plot_exon_dist(DB_filter_4ex_95perc, example_gene2, sum_dist = F)
plot_treemap(DB_filter_4ex_95perc) plot_treemap(DB_filter_4ex_95perc, use_ORFs = T)
plot_Shannon_index(DB_filter_4ex_95perc) plot_Shannon_index(DB_filter_4ex_95perc, use_ORFs = T)
plot_length_dist(DB_filter_4ex_95perc) plot_length_dist(DB_filter_4ex_95perc, use_ORFs = T, horiz_spread = 0.7)
# perform PCA on vectorizations of isoform sequences counts <- get_kmer_counts(DB_filter_4ex_95perc, genes = c(example_gene_family)) pca <- kmer_PCA(DB_filter_4ex_95perc, counts) # plot isoforms by their first 2 PCs... plot_PCA(DB_filter_4ex_95perc, pca) plot_PCA(DB_filter_4ex_95perc, pca, scale_by = "length") plot_PCA(DB_filter_4ex_95perc, pca, scale_by = "abundance") # ... or first 3 PCs plot_3D_PCA(DB_filter_4ex_95perc, pca) plot_3D_PCA(DB_filter_4ex_95perc, pca, scale_by = "length") plot_3D_PCA(DB_filter_4ex_95perc, pca, scale_by = "abundance")
# cluster isoforms (or ORFs) into dendrograms # if you don't supply kmer_counts, they will be calculated with default settings cluster_isoforms(DB_filter_4ex_95perc, genes = example_gene_family2) # if you create kmer counts first, you can pass them into functions multiple times # this saves time if your dataset is very large counts <- get_kmer_counts(DB_filter_4ex_95perc, genes = example_gene_family2) # evaluate clusterings at different #s of clusters or cut heights cluster_isoforms(DB_filter_4ex_95perc, counts, num_clusters = 4) cluster_isoforms(DB_filter_4ex_95perc, counts, cut_height = 0.0002)
# everything works with ORF peptide sequences, too counts <- get_kmer_counts(DB_filter_4ex_95perc, genes = "Nrxn1", use_ORFs = T) pca <- kmer_PCA(DB_filter_4ex_95perc, counts, use_ORFs = T) # cluster ORFs cluster_isoforms(DB_filter_4ex_95perc, counts, use_ORFs = T) # plot ORFs by their first 2 or 3 PCs plot_PCA(DB_filter_4ex_95perc, pca, use_ORFs = T) plot_3D_PCA(DB_filter_4ex_95perc, pca, use_ORFs = T)
# can also visualize isoform populations using t-SNE # note: this is the same counts data that PCA takes in counts <- get_kmer_counts(DB_filter_4ex_95perc, genes = c(example_gene_family)) # In 2D: tsne <- kmer_tSNE(DB_filter_4ex_95perc, counts, iterations = 5000, perplexity = 40, dims = 2, verbose = F) plot_tSNE(DB_filter_4ex_95perc, tsne) plot_tSNE(DB_filter_4ex_95perc, tsne, scale_by = "abundance") # you can force 3D viewing for 2D t-SNE to utilize the plotly UI plot_tSNE(DB_filter_4ex_95perc, tsne, force_3D = T) plot_tSNE(DB_filter_4ex_95perc, tsne, scale_by = "abundance", force_3D = T) # In 3D: tsne3D <- kmer_tSNE(DB_filter_4ex_95perc, counts, perplexity = 10, dims = 3) plot_tSNE(DB_filter_4ex_95perc, tsne3D, force_3D = T)
# look at the alternative splicing event frequencies across one gene plot_splicing(DB_filter_4ex_95perc, example_gene2) # for many-isoform genes, show only the top X most abundant isoforms plot_splicing(DB_filter_4ex_95perc, example_gene2, max_isoforms = 20) # zoom in on splicing sites of interest, using an interval of %s from left to right... plot_splicing(DB_filter_4ex_95perc, example_gene2, max_isoforms = 10, zoom_in = c(0.9, 1)) # or using genomic coordinates plot_splicing(DB_filter_4ex_95perc, example_gene2, max_isoforms = 10, zoom_in = c(64680000, 64694500))
# this requires inputting a file of named exon sequences # shows exon co-splicing correlations plot_exon_correlations(DB_filter_4ex_95perc, "~/IsoPops_Data/example_exons.tsv", example_gene2, exons_to_include = c("01", "02", "03", "04", "05", "06", "07", "15", "16", "17", "18", "19", "20", "21", "22", "23", "24", "25")) # can also override weighting the correlation by transcript abundance # optionally, draw the heatmap symmetrically (showing both triangles) # you can also plot a histogram of correlations to see overall trends plot_exon_correlations(DB_filter_4ex_95perc, "~/IsoPops_Data/example_exons.tsv", example_gene2, weighted = F, plot_hist = T, symmetric = T)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.