knitr::opts_chunk$set( collapse = TRUE, comment = "#>" )
circIMPACT provides a strategy to analyze circular and linear transcriptome rely on data mining and statistical learning techniques. circIMPACT package offers a tidy pipeline, starting from the provided circRNA which expression classifies samples with high and low expression of it, a comprensive transcriptome and molecular pathway analysis are performed, including visualisation, normalization, classification, clustering, Differential Expression Analysis and Gene Set Enrichment Analysis. The package accepts data presented as a matrix of raw counts and allows the inclusion of variables that occur with the experimental setting. A series of functions enables data cleaning by filtering rows, data adjustment by identify and removing the unwonted source of variation and to select the best predictors for modeling the responce variable. A learning technique is applied to build a robust classification model, and also an unsupervised analysis is carried out to gain mechanistic insights and detect the molecular pathway associated with the expression levels of the selected circRNA. Finally, a Differential Expression Analysis identyfied deregulated mRNAs and a GSEA is performed for them. circIMPACT stands for "circRNA impacted genes and pathways".
circIMPACT package version: r packageVersion("circIMPACT")
circIMPACT provides an R interface to analyze a possible impact of circRNA expression profile in gene expression.
The toolset performs functional enrichment analysis and visualization of gene lists obtained comparing samples with low circRNA expression with those with high expression.
The main tools in circIMPACT are:
marker.detection
- detection of circRNAs that can stratified samples by their expression signifincantlygene.expression
- DEG analysis for each circRNA-markers defined by K-means algorithm or by circRNA expression.The input for any of the tools consist in count matrices: from the same samples we need the quantification of circular and linear RNAs.
devtools::install_github("AFBuratin/circIMPACT", force = TRUE)
library(circIMPACT) library(DESeq2) library(data.table) library(plyr) library(Rtsne) library(ggplot2) library(ggrepel) library(plotly) library(ComplexHeatmap) library(circlize) library(viridis) library(knitr) library(kableExtra) library(formattable) library(htmltools) library(sparkline) library(tidyverse) library(RColorBrewer) library(purrr) library(magrittr) library(randomForest) library(DaMiRseq) library(webshot) library(ggpubr) library(tidyr) library(factoextra) library(NbClust)
data("circularData") data("count.matrix") data("meta") data("coldata.df") ## define sample/condition colors intgroup.dt <- meta[, .(.N), by = .(sample, condition)][order(sample), .(sample), by = condition] samples.per.condition <- data.frame(intgroup.dt[, .N, by = .(condition)], row.names = "condition") n.conditions <- nrow(samples.per.condition) hues <- circIMPACT::gg_color_hue(n.conditions) for(i in 1:n.conditions){ n.hues <- samples.per.condition[i, "N"] + 2 col.hues <- colorRampPalette(colors = c(hues[i], "white"))(n.hues)[1:(n.hues-2)] intgroup.dt[condition == rownames(samples.per.condition)[i], `:=`(color = col.hues, hue = hues[i])] } ## create a deseq dataset object dds.circular <- suppressMessages(DESeqDataSetFromMatrix(countData = ceiling(circularData[, coldata.df$sample[order(coldata.df$condition)]]), colData = coldata.df[order(coldata.df$condition),], design = ~ condition)) dds.circular <- suppressMessages(estimateSizeFactors(dds.circular)) sf <- sizeFactors(dds.circular)
At first a simply filter of low count is applied to remove background noise.
data.filt <- circularData[rowSums(circularData >= 10) >= 2,] dds.filt.expr <- suppressMessages(DESeqDataSetFromMatrix(countData = ceiling(data.filt[,coldata.df$sample[order(coldata.df$condition)]]), colData = coldata.df[order(coldata.df$condition),], design = ~ condition)) dds.filt.expr <- suppressMessages(estimateSizeFactors(dds.filt.expr)) sf.filt <- sizeFactors(dds.filt.expr) circNormDeseq <- counts(dds.filt.expr, normalized = T)
marker.detection
To establish a possible impact of circRNA in gene expression using different molecular subtypes of human T-ALL we select a subset of circRNA defined as markers: at first by using k-means algorithm we separate samples in two main cluster defined by the circRNA expression, then only circRNA which DESeq adj. p-value<.1 and log fold-change>1.5 have been selected.
circIMPACT <- circIMPACT::marker.selection(dat = circNormDeseq, dds = dds.filt.expr, sf = sf.filt, p.cutoff = 0.05, lfc.cutoff = 1) circIMPACT_Kmeans <- circIMPACT::marker.selection(dat = circNormDeseq, dds = dds.filt.expr, sf = sf.filt, p.cutoff = 0.05, lfc.cutoff = NULL, method.d = "euclidean", method.c = "ward.D2", median = FALSE, choose.k = TRUE, index.m = "silhouette")
The result list
will include:
circMark <-"11:33286412-33287511" circIMPACT_Kmeans$group.df[circIMPACT_Kmeans$group.df$circ_id==circMark,] circMark_group.df <- circIMPACT$group.df[circIMPACT$group.df$circ_id==circMark,] circMark_group.df$counts <- merge(circMark_group.df, reshape2::melt(circNormDeseq[circMark,]), by.x = "sample_id", by.y = "row.names")[,"value"] mu <- ddply(circMark_group.df, "group", summarise, Mean=mean(counts), Median=median(counts), Variance=sd(counts)) p <- ggplot(circMark_group.df, aes(x=counts, color=group, fill=group)) + geom_density(alpha=0.3) + ylim(c(0,0.02)) + geom_vline(data=mu, aes(xintercept=Median, color=group), linetype="dashed") + geom_text(data=mu, aes(x=Median[group=="g1"] - 1, label=paste0("Median:", round(Median[group=="g1"], 2), "/ Variance:", round(Variance[group=="g1"], 2)), y=0.012), colour="black", angle=90, text=element_text(size=12)) + geom_text(data=mu, aes(x=Median[group=="g2"] - 1, label=paste0("Median:", round(Median[group=="g2"], 3), "/ Variance:", round(as.numeric(Variance[group=="g2"]), 2)), y=0.012), colour="black", angle=90, text=element_text(size=11)) + scale_fill_brewer(palette="Dark2") + scale_color_brewer(palette="Dark2") + labs(title=paste0("circHIPK3 (", circMark, ")", " counts density curve"), x = "Normalized read counts", y = "Density") + theme_classic() + theme(axis.text.x = element_text(size=18), axis.text.y = element_text(size=18), axis.title = element_text(size=20), legend.text = element_text(size=15), legend.title = element_text(size=15), plot.title = element_text(size = 18), plot.subtitle = element_text(size = 13, color = "darkslategrey")) p
Clustering analysis is performed using the previuos selected circRNA, defined as circRNA-markers. T-SNE algorithm is used to performe dimensionality reduction and k-means and hierarchical clustering methods are compared in order to identify two cluster of samples.
markers.circrnas <- circIMPACT$circ.targetIDS mat.filt.mark <- circularData[markers.circrnas, ] dds.filt.mark <- DESeqDataSetFromMatrix(countData = ceiling(mat.filt.mark[,coldata.df$sample]), colData = coldata.df, design = ~ 1) dds.filt.vst <- varianceStabilizingTransformation(dds.filt.mark, fitType = "local", blind = F) norm.counts.filt <- assay(dds.filt.vst)
dt <- norm.counts.filt
## Rtsne function may take some minutes to complete... set.seed(9) mydist <- dist(t(norm.counts.filt)) ## t-SNE representation # set a perplexity parameter consistent with the number of samples tsne_data <- Rtsne(mydist, pca = F, perplexity=1, max_iter=5000) ## getting the two dimension matrix d_tsne_1 = as.data.frame(tsne_data$Y) rownames(d_tsne_1) <- colnames(norm.counts.filt)
## keeping original data d_tsne_1_original=d_tsne_1 ## Creating k-means clustering model, and assigning the result to the data used to create the tsne fit_cluster_kmeans=kmeans(scale(d_tsne_1), 2) d_tsne_1_original$cl_kmeans = factor(fit_cluster_kmeans$cluster) ## Creating hierarchical cluster model, and assigning the result to the data used to create the tsne fit_cluster_hierarchical=hclust(dist(scale(d_tsne_1))) ## setting 2 clusters as output d_tsne_1_original$cl_hierarchical = factor(cutree(fit_cluster_hierarchical, k=2)) # Plotting the cluster models onto t-SNE output plot_cluster=function(data, var_cluster, palette) { ggplot(data, aes_string(x="V1", y="V2", color=var_cluster)) + geom_point(size=3) + guides(colour=guide_legend(override.aes=list(size=3))) + geom_text_repel(aes(label = rownames(data)), hjust = 0.5, vjust = -1) + xlab("") + ylab("") + ggtitle("") + theme_light(base_size=11) + theme(axis.text.x=element_blank(), axis.text.y=element_blank(), legend.direction = "horizontal", legend.position = "bottom", legend.box = "horizontal") + scale_colour_brewer(palette = palette) } plot_k=plot_cluster(d_tsne_1_original, "cl_kmeans", "Dark2") plot_h=plot_cluster(d_tsne_1_original, "cl_hierarchical", "Set1") ## and finally: putting the plots side by side with gridExtra lib... library(gridExtra) grid.arrange(plot_k, plot_h, ncol=2)
After Principal Component Analysis for each circRNAs previously selected a contribution in variance explanation of the two condition comparade will be computed in order to rank circRNA for further analysis. The results can be investigated with visual representation in the PCA plot, by using fviz_pca_biplot function in the package factoextra or with a table with measure of contribution, obtained by get_pca_var function, for each circRNAs.
pca <- prcomp(x = t(norm.counts.filt), center = T) d <- data.frame(pca$x[rownames(coldata.df), c("PC1", "PC2")], coldata.df) PC1.var <- summary(pca)$importance["Proportion of Variance", 1] PC2.var <- summary(pca)$importance["Proportion of Variance", 2] g1 <- ggplot(data = d, mapping = aes(x = PC1, y = PC2, shape = condition)) + geom_point(size = 5) + coord_fixed(ratio = 1) + xlab(paste0("PC1: ", percent(PC1.var))) + ylab(paste0("PC2: ", percent(PC2.var))) + scale_shape_manual("", values = c(8,16)) + theme_classic() + theme(legend.position = "bottom", legend.direction = "vertical", plot.title = element_text(hjust = .5), text = element_text(size=20), axis.text.x = element_text(size=20), axis.text.y = element_text(size=20), aspect.ratio = 1) circRNArank = circIMPACT::circ_contrib(matrix_pc = pca, K = 25) ggpubr::ggarrange(ggarrange(g1, circRNArank$plot, ncol = 2, labels = c("PCA", "")), nrow = 1)
circ_contrib function select the top K circRNAs ranked by their contirubution in variance explanation for PC1 and PC2.
library(factoextra) res = fviz_contrib(pca, choice ="var", axes = 1, top = 25) top25 = res$data$name[order(res$data$contrib, decreasing=T)][1:25] circAnnotation = read.csv("/media/Data/Li/circRNA_expression_per_sample.csv", header = T) circAnnotation$circCHR = sub(":.*", "", circAnnotation$circ_id) circAnnotation$circstart = gsub(".*:(.*)\\-.*", "\\1", circAnnotation$circ_id) circAnnotation$circend = sub(".*-", "\\1", circAnnotation$circ_id) circAnnotation$circ_id <- paste0(circAnnotation$circCHR, ":", as.numeric(circAnnotation$circstart)-1, "-", circAnnotation$circend) rownames(pca$rotation) = paste0("circ", circAnnotation$gene_names[match( rownames(norm.counts.filt), circAnnotation$circ_id)], "_", rownames(norm.counts.filt)) p = fviz_pca_biplot(pca, #select.ind = list(contrib = 5), select.var = list(contrib = 25), ggtheme = theme_minimal(), title = "CircRNAs PCA loadings", #habillage=coldata.df$condition, # Individuals geom.ind = "point", #fill.ind = coldata.df$condition, col.ind = "black", #pointshape = coldata.df$condition, pointsize = 3, palette = "jco", addEllipses = FALSE, repel = TRUE, # Variables # alpha.var ="contrib", col.var = "contrib", # gradient.cols = c("#C51B7D", "#E9A3C9", "#A1D76A", "#4D9221"), gradient.cols = c("darkviolet", "deeppink2", "deeppink4"), legend.title = list(shape = "Condition", color = "Contrib")) + #alpha = "Contrib")) + labs(subtitle = "Top 25 variables with highest contribution of the event to PC1 and PC2", caption = "") + theme(axis.text.x = element_text(size=18), axis.text.y = element_text(size=18), axis.title = element_text(size=20), legend.text = element_text(size=15), legend.title = element_text(size=15), plot.title = element_text(size = 18), plot.subtitle = element_text(size = 13, color = "gray28")) p
set.seed(201) dds.filt.mark <- estimateSizeFactors(dds.filt.mark) circNormDeseq <- counts(dds.filt.mark, normalized = T) base_mean = log2(rowMeans(circNormDeseq)+0.001) mat_scaled = t(apply(dt, 1, function(x) scale(x = x, center = T, scale = T))) colnames(mat_scaled) <- colnames(dt) cond = colData(dds.filt.expr)$condition ## choice of kmeans results as cluster of samples clus = d_tsne_1_original$cl_kmeans cond.colors <- unique(intgroup.dt$hue) names(cond.colors) <- unique(intgroup.dt$condition) ha = HeatmapAnnotation(df = data.frame(condition = cond, cluster = clus), col = list(condition = cond.colors), show_annotation_name = F, annotation_legend_param = list(condition = list(nrow = 2, direction = "horizontal"))) mat.dend <- as.dendrogram(fit_cluster_hierarchical) fit_cluster_kmeans$cluster ht <- Heatmap(mat_scaled, name = "expression", # km = 2, # column_km = 2, column_order = names(fit_cluster_kmeans$cluster[order(fit_cluster_kmeans$cluster)]), col = colorRamp2(c(-2, 0, 2), c("blue", "white", "red")), top_annotation = ha, # top_annotation_height = unit(4, "mm"), clustering_distance_columns = "euclidean", clustering_method_column = "complete", cluster_columns = F, clustering_distance_rows = "spearman",#"minkowski", clustering_method_rows = "ward.D2", cluster_rows = T, # row_dend_side = "right", # row_names_side = "left", show_row_names = T, show_column_names = F, width = unit(9, "cm"), show_row_dend = T, show_column_dend = T, # row_dend_reorder = TRUE, row_names_gp = gpar(fontsize = 5), heatmap_legend_param = list(direction = "horizontal")) + Heatmap(base_mean, name = "log2(base mean)", show_row_names = F, width = unit(2, "mm"), col = inferno(255), show_column_names = F, row_names_gp = gpar(fontsize = 5), heatmap_legend_param = list(direction = "horizontal")) draw(ht, heatmap_legend_side = "bottom", annotation_legend_side = "bottom")
Because of these analysis is highly time consuming for this tutorial have been reported the DEGs for only the circHIPK3.
## filter out genes low expressed min.count <- 20 min.col <- 5 filt.mat <- count.matrix[rowSums(count.matrix >= min.count) >= min.col, ]
#normalized circRNAs data circNormDeseq <- counts(dds.filt.expr, normalized = T) %>% as.data.frame() circNormDeseq$circ_id <- rownames(circNormDeseq) # library(doParallel) # no_cores <- detectCores() - 5 # registerDoParallel(cores=no_cores) # gene_mark <- foreach::foreach(i=1:25, .combine = rbind) %dopar% { # # results.temp <- data.frame(gene.expression(circ_idofinterest = top25[i], circRNAs = circNormDeseq, # linearRNAs = filt.mat, # colData = coldata.df, padj = 0.05, # group = circIMPACT$group.df[circIMPACT$group.df$circ_id%in%top25[i],], # covariates = NULL), circIMPACT = top25[i]) # } gene_mark_hipk3 <- data.frame(gene.expression(circ_idofinterest = "11:33286412-33287511", circRNAs = circNormDeseq, linearRNAs = filt.mat, colData = coldata.df, padj = 0.1, group = circIMPACT$group.df[circIMPACT$group.df$circ_id%in%"11:33286412-33287511",], covariates = NULL), circIMPACT = "11:33286412-33287511")
gene_mark_hip <- as.data.table(gene_mark_hipk3) gene_mark_tab = gene_mark_hip %>% dplyr::group_by(circIMPACT, n.degs) %>% dplyr::mutate(UP=sum(log2FoldChange>0), DW=sum(log2FoldChange<0)) %>% dplyr::select(circIMPACT, n.degs, UP, DW) gene_mark_hip %>% dplyr::rename("Gene" = "gene_id", "logFC" = "log2FoldChange") %>% arrange(padj) %>% select(circIMPACT, Gene, logFC) %>% head(20) %>% formattable::formattable(., align = c("c","c","c"), list( gene_id = formattable::formatter("span", style = ~ formattable::style(color = "grey", font.weight = "bold")), circIMPACT = formattable::formatter("span", style = ~ formattable::style(color = "grey", font.weight = "bold")), logFC = circIMPACT::color_tile3(digits = 3, n = 18, fun = "comma", palette = "PiYG"))) # knitr::kable(gene_mark %>% dplyr::group_by(circRNA_markers, n.degs) %>% # dplyr::summarise(DEGs = paste(sort(gene_id),collapse=", ")), # escape = F, align = "c", row.names = T, caption = "circRNA-DEGs assosiation") %>% kable_styling(c("striped"), full_width = T) gene_mark_hip[gene_mark_hip$gene_id=="HPSE",] # head(gene_mark_hip) # Make a basic volcano plot gene_mark_hipk3$expression = ifelse(gene_mark_hipk3$padj <= 0.01 & abs(gene_mark_hipk3$log2FoldChange) >= 1, ifelse(gene_mark_hipk3$log2FoldChange >= 1 ,'Up','Down'), 'Stable') p <- ggplot(data = gene_mark_hipk3, aes(x = log2FoldChange, y = -log10(padj), colour=expression, label = gene_id)) + geom_point(alpha=0.4, size=3.5) + geom_text_repel(aes(label=ifelse((abs(log2FoldChange) >= 5)&(padj<=0.01), gene_id, "")), color = "black", size = 6) + scale_color_manual("", guide = guide_legend(title.position = "top", ncol = 1, title.hjust = .5), values = setNames(c("blue", "grey","red"), nm = c("Down", "Stable","Up"))) + # scale_color_manual(values=c("blue", "grey","red"))+ xlim(c(-6.5, 6.5)) + geom_vline(xintercept=c(-1,1),lty=4,col="black",lwd=0.8) + geom_hline(yintercept = 2,lty=4,col="black",lwd=0.8) + labs(x="log2(fold change)", y="-log10 (adj.p-value)", title="") + theme_bw()+ theme(plot.title = element_text(hjust = 0.5), legend.position="right", legend.title = element_blank(), legend.text = element_text(size=20), text = element_text(size=20), axis.text.x = element_text(size=25), axis.text.y = element_text(size=25)) p #ggplotly(p)
my_data = data.frame(circHIPK3 = as.vector(t(circNormDeseq["11:33286412-33287511",-7])), HPSE = filt.mat["HPSE",]) ggscatter(my_data, x = "circHIPK3", y = "HPSE", add = "reg.line", conf.int = TRUE, cor.coef = TRUE, cor.method = "spearman", xlab = "circHIPK3 expression", ylab = "QKI gene expression")
my_data = data.frame(circHIPK3 = as.vector(t(circNormDeseq["11:33286412-33287511",-7])), HPSE = filt.mat["HPSE",]) my_data$sample_id = rownames(my_data) my_data$condition = meta$condition[match(my_data$sample_id, meta$sample)] ggplot(reshape2::melt(my_data, id.var = c("sample_id","condition")), aes(x=condition, y=value, group=condition)) + geom_boxplot(aes(fill=condition)) + facet_grid( variable ~ ., scales='free') + scale_fill_manual(values = c("#1B9E77","#D95F02")) + theme_bw() + xlab("") + ylab("Normalized Expression") + theme(plot.title = element_text(hjust = 0.5), legend.position="none", legend.title = element_blank(), legend.text = element_text(size=20), text = element_text(size=20), axis.text.x = element_text(size=20), axis.text.y = element_text(20), strip.text.x = element_text(size = 20))
library(doParallel) library(caret) library(dplyr) library(tidyr) library(data.table) # no_cores <- detectCores() - 6 # registerDoParallel(cores=no_cores) # gene.class <- foreach::foreach(i=1:25, .combine = list) %dopar% { # # results.temp <- gene_class(circ_idofinterest = top25[i], circRNAs = circNormDeseq, # linearRNAs = filt.mat, colData = coldata.df, # group = circIMPACT$group.df[circIMPACT$group.df$circ_id%in%top25[i],], # covariates = NULL, th.corr = 0.3) # } gene_class_hipk3 <- circIMPACT::gene.class(circ_idofinterest = "11:33286412-33287511", circRNAs = circNormDeseq, linearRNAs = filt.mat, colData = coldata.df, group = circIMPACT$group.df[circIMPACT$group.df$circ_id%in%"11:33286412-33287511",], covariates = NULL, th.corr = 0.3) #p #ggplotly(p)
VI <- importance(gene_class_hipk3$RF) VI.mat <- as.data.frame(VI) r <- rownames(VI) VI.mat$gene <- r VI.mat <- VI.mat[order(VI.mat$MeanDecreaseAccuracy, decreasing = T),] VI.mat <- as.data.table(VI.mat) # VI.mat VI.mat %>% mutate_at(1:4, round, 3) %>% head() %>% mutate_if(is.numeric, function(x) { cell_spec(x, bold = T, color = spec_color(x, end = 0.9), font_size = spec_font_size(x)) }) %>% kable(escape = F, align = "c", row.names = F, caption = "Table of selected genes used for classification of subgroups defined by circRNAs variation. For each class of response variable there is a OOB error rate of classification. In the 4th column there is the importance of the variable in the growing of the the random forest") %>% kable_styling(c("striped"), full_width = F)
library(randomForestExplainer) min_depth_frame <- min_depth_distribution(gene_class_hipk3$RF) importance_frame <- measure_importance(gene_class_hipk3$RF) plot_multi_way_importance(importance_frame, x_measure = "accuracy_decrease", y_measure = "gini_decrease", size_measure = "no_of_nodes", no_of_labels = 5)
library(enrichplot) library(DOSE) library(clusterProfiler) organism = "org.Hs.eg.db" library(organism, character.only = TRUE)
#subset gene symbol deregulated using the interesting circRNA marker as stratificator geneList <- gene_mark_hipk3$log2FoldChange names(geneList) <- gene_mark_hipk3$gene_id # order gene list by foldchange geneList = sort(geneList, decreasing = TRUE) # names(geneList) <- gene_mark$Gene[gene_mark$circIMPACT==markers.circrnas[5]] geneList <- geneList[abs(geneList)>=1.89] # library(gprofiler2) # gostres2 <- gost(query = names(geneList)[names(geneList)!="."], # organism = "hsapiens", ordered_query = TRUE, # multi_query = FALSE, significant = FALSE, exclude_iea = FALSE, # measure_underrepresentation = FALSE, evcodes = TRUE, # user_threshold = 0.05, correction_method = "g_SCS", # domain_scope = "annotated", custom_bg = NULL, # numeric_ns = "", sources = NULL) # # p <- gostplot(gostres2, capped = FALSE, interactive = TRUE) # p gse <- gseGO(geneList=geneList, ont ="ALL", keyType = "SYMBOL", nPerm = 10000, minGSSize = 3, maxGSSize = 800, pvalueCutoff = 0.05, verbose = TRUE, OrgDb = organism, pAdjustMethod = "none") df_gse <- gse@result df_gse = as.data.table(df_gse) dim(df_gse)
enrichplot::dotplot(gse, showCategory=10, split=".sign", title = "Enriched GO") + facet_grid(.~.sign)
upsetplot(gse, showCategory=10)
kable(head(df_gse[like(core_enrichment,"HPSE/"),]), escape = F, align = "c", row.names = F, caption = "Enrichment of GO including HPSE gene") %>% kable_styling(c("striped"), full_width = F)
original_gene_list <- geneList ids<-bitr(names(original_gene_list), fromType = "SYMBOL", toType = "ENTREZID", OrgDb=organism) de_ids = ids[!duplicated(ids[c("SYMBOL")]),] Eids<-bitr(names(original_gene_list), fromType = "SYMBOL", toType = "ENSEMBL", OrgDb=organism) # Create a new dataframe df2 which has only the genes which were successfully mapped using the bitr function above df <- gene_mark_hipk3 df$SYMBOL <- df$gene_id df <- df[which(df$SYMBOL%in%de_ids$SYMBOL),] # Create a new column with the corresponding ENTREZ IDs df$Y = de_ids$ENTREZID[match(df$SYMBOL,de_ids$SYMBOL)] # Create a vector of the gene unuiverse kegg_gene_list <- df$log2FoldChange # Name vector with ENTREZ ids names(kegg_gene_list) <- df$Y # omit any NA values kegg_gene_list<-na.omit(kegg_gene_list) # sort the list in decreasing order (required for clusterProfiler) kegg_gene_list = sort(kegg_gene_list, decreasing = TRUE) kegg_organism = "hsa" kk2 <- gseKEGG(geneList = kegg_gene_list, organism = kegg_organism, nPerm = 10000, minGSSize = 3, maxGSSize = 800, pvalueCutoff = 0.05, pAdjustMethod = "none", keyType = "kegg") #> preparing geneSet collections... #> GSEA analysis... #> leading edge analysis... #> done... gene <- names(kegg_gene_list) edox <- setReadable(kk2, 'org.Hs.eg.db', 'ENTREZID') df_kegg = as.data.table(edox@result)
dotplot(kk2, showCategory=10, split=".sign", title = "Enriched Pathways") + facet_grid(.~.sign)
upsetplot(kk2, showCategory = 10)
library(enrichplot) library(DOSE) library(clusterProfiler) organism = "org.Hs.eg.db" library(organism, character.only = TRUE)
#subset gene symbol deregulated using the interesting circRNA marker as stratificator # geneList <- gene_mark$log2FoldChange[gene_mark$circIMPACT==markers.circrnas[5]] geneList <- gene_mark_hipk3$log2FoldChange[gene_mark_hipk3$gene_id%in%rownames(gene_class_hipk3$VI)] names(geneList) <- gene_mark_hipk3$gene_id[gene_mark_hipk3$gene_id%in%rownames(gene_class_hipk3$VI)] # order gene list by foldchange geneList = sort(geneList, decreasing = TRUE) # names(geneList) <- gene_mark$Gene[gene_mark$circIMPACT==markers.circrnas[5]] gse <- gseGO(geneList=geneList, ont ="ALL", keyType = "SYMBOL", nPerm = 10000, minGSSize = 3, maxGSSize = 800, pvalueCutoff = 0.05, verbose = TRUE, OrgDb = organism, pAdjustMethod = "none") df_gse <- gse@result df_gse = as.data.table(df_gse)
enrichplot::dotplot(gse, showCategory=10, split=".sign", title = "Enriched GO") + facet_grid(.~.sign)
upsetplot(gse, showCategory=10)
kable(head(df_gse), escape = F, align = "c", row.names = F, caption = "Enrichment of GO including HPSE gene") %>% kable_styling(c("striped"), full_width = F)
original_gene_list <- geneList ids<-bitr(names(original_gene_list), fromType = "SYMBOL", toType = "ENTREZID", OrgDb=organism) de_ids = ids[!duplicated(ids[c("SYMBOL")]),] Eids<-bitr(names(original_gene_list), fromType = "SYMBOL", toType = "ENSEMBL", OrgDb=organism) df <- gene_mark_hipk3[gene_mark_hipk3$gene_id%in%rownames(gene_class_hipk3$VI),] df$SYMBOL <- df$gene_id df <- df[which(df$SYMBOL%in%de_ids$SYMBOL),] # Create a new column with the corresponding ENTREZ IDs df$Y = de_ids$ENTREZID[match(df$SYMBOL,de_ids$SYMBOL)] # Create a vector of the gene unuiverse kegg_gene_list <- df$log2FoldChange # Name vector with ENTREZ ids names(kegg_gene_list) <- df$Y # omit any NA values kegg_gene_list<-na.omit(kegg_gene_list) # sort the list in decreasing order (required for clusterProfiler) kegg_gene_list = sort(kegg_gene_list, decreasing = TRUE) kegg_organism = "hsa" kk2 <- gseKEGG(geneList = kegg_gene_list, organism = kegg_organism, nPerm = 10000, minGSSize = 3, maxGSSize = 800, pvalueCutoff = 0.1, pAdjustMethod = "none", keyType = "kegg") gene <- names(kegg_gene_list) edox <- setReadable(kk2, 'org.Hs.eg.db', 'ENTREZID') df_kegg = as.data.table(edox@result)
dotplot(kk2, showCategory=10, split=".sign", title = "Enriched Pathways") + facet_grid(.~.sign)
upsetplot(kk2, showCategory = 10)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.