time.string = gsub(":", "-", gsub(" ", "_", as.character(Sys.time()))) cat("This report is generated at:", time.string,"\n")
Load required library
library(JunctionSeq) library(GOSJ) library(Biobase) library(goseq) library(org.Mm.eg.db) library(VennDiagram) library(GSA) library(GO.db) library(ggplot2) library(gdata) library(popbio) #library(ReportingTools) #library(hgu95av2.db) #library(devtools) #library(rgl) library(qpcR) library(AnnotationDbi) library(ggbio)
#load("/media/H_driver/PJ/re_save_2016-07-05 17:55:10_PJ_jscs.RData") load("/media/H_driver/PJ/JunctionRe.RData")
#set up input file path input.dir.name="/media/H_driver/PJ/" QoRTs.dir="/home/aiminyan/QoRTs/QoRTsFullExampleData/QoRTsRelease/" gtf.file.dir="/media/aiminyan/DATA/mus_musculus/" gff.file.dir="/media/H_driver/2015/Nimer_Cheng/" canonical.tft.gmt.file.dir="/media/H_driver/Annotation/MsigDB/" go.file.dir="/media/H_driver/Annotation/GO_gene_mouse/" gene.annotation.file.dir="/media/H_driver/Annotation/mm10/" #load gene model data(gene.model) #set up QoRTs EXE QoRTs.exe=paste0(QoRTs.dir,"QoRTs.jar") #set up input files gtf.file=paste0(gtf.file.dir,"Mus_musculus.GRCm38.83.processed.sorted.gtf") decode.file="decoder.bySample.rtf" gene.count.file="/QC.geneCounts.formatted.for.DESeq.txt" canonical.path.gmt.file=paste0(canonical.tft.gmt.file.dir,"c2.cp.Mouse.v5.1.symbols.gmt") tft.gmt.file=paste0(canonical.tft.gmt.file.dir,"c3.tft.Mouse.v5.1.symbols.gmt") go.mouse.file=paste0(go.file.dir,"gene_GO.mgi") gene.annotation.file=paste0(gene.annotation.file.dir,"genes_table_02052016.csv") sj.exon.count.file="/QC.spliceJunctionAndExonCounts.forJunctionSeq.txt" gff.file=paste0(gff.file.dir,"Mus_musculus.GRCm38.83.JunctionSeq.flat.gff") junctionSeq.output.save="PJ_jscs.RData" junctionSeq.output.save.history="PJ_jscs.Rhistory"
#set up the path for output files output.dir.name="/media/H_driver/PJ/Results/" output.dir.name.new="/media/H_driver/PJ/Results/Feature_GeneWise_rMAT/" output.dir.name.rMAT_SE_based=paste0(output.dir.name,time.string,"_","Feature_GeneWise_rMAT_SE_based/") dir.create(output.dir.name.rMAT_SE_based) #set up output files for gene based gene.based.ad.by.GL.GO.output.file=paste0(output.dir.name,"geneGL_rm_non_coding.xls") gene.based.ad.by.GL.ca.path.output.file=paste0(output.dir.name,"geneGL_rm_non_coding_Cp.xls") gene.based.ad.by.GL.tft.output.file=paste0(output.dir.name,"geneGL_rm_non_coding_tft.xls") #set up output file for feature based feature.based.ad.by.SJ.GO.output.file=paste0(output.dir.name,"GO_use_all_gene.xls") feature.based.ad.by.SJ.ca.path.output.file=paste0(output.dir.name,"Canonical_path_use_all_gene.xls") feature.based.ad.by.SJ.tft.output.file=paste0(output.dir.name,"Transcription_factor_targets_use_all_gene.xls") Output_DE_gene_file_feature_gene_based=paste0(output.dir.name,"DE_genes_based_on_features_gene_based.xls") Output_DE_gene_file_feature_all_feature_based=paste0(output.dir.name,"DE_genes_based_on_features_all_feature_based.xls") Output_GO_use_DE_gene_use_FC_p_of_feature=paste0(output.dir.name,"GO_term_use_DE_gene_based_on_FC_p_feature.xls") redefined.feature.based.ad.by.SJ.ca.path.output.file=paste0(output.dir.name,"Ca_path_use_DE_gene_based_on_FC_p_feature.xls") redefined.feature.based.ad.by.SJ.tft.output.file=paste0(output.dir.name,"Tft_use_DE_gene_based_on_FC_p_feature.xls") Output_GO_use_DE_gene_use_FC_p_of_feature_058=paste0(output.dir.name,"GO_term_use_DE_gene_based_on_FC_p_feature_058.xls") Output_GO_BP_use_DE_gene_use_FC_p_of_feature_058=paste0(output.dir.name,"GO_term_BP_use_DE_gene_based_on_FC_p_feature_058.xls") Output_GO_BP_use_DE_gene_use_FC_p_of_feature_058_2=paste0(output.dir.name,"GO_term_BP_use_DE_gene_based_on_FC_p_feature_058_2.xls") Output_GO_BP_use_DE_gene_use_FC_p_of_feature_058_3=paste0(output.dir.name,"GO_term_BP_use_DE_gene_based_on_FC_p_feature_058_3.xls") Output_GO_BP_use_DE_gene_use_FC_p_of_feature_058_4=paste0(output.dir.name,"GO_term_BP_use_DE_gene_based_on_FC_p_feature_058_4.xls") redefined.feature.based.ad.by.SJ.ca.path.output.file_058=paste0(output.dir.name,"Ca_path_use_DE_gene_based_on_FC_p_feature_058.xls") redefined.feature.based.ad.by.SJ.tft.output.file_058=paste0(output.dir.name,"Tft_use_DE_gene_based_on_FC_p_feature_058.xls") outputfile_LabelGeneBasedFeature=paste0(output.dir.name,"Label_gene_by_features.xls") outputfile_DGE_FC_P_geneWise=paste0(output.dir.name,"DGE_overlap_check.tiff") outputfile_DGE_2_FC_P_geneWise=paste0(output.dir.name,"DGE_overlap_check_defined_by_2_FC.tiff") Output_GO_BP_use_DE_gene_use_FC_p_of_feature_058_2_rMAT=paste0(output.dir.name,"GO_term_BP_use_DE_gene_based_on_FC_p_feature_058_4_rMAT.xls") Output_GO_BP_feature.gene.rMAT=paste0(output.dir.name,"GO_term_BP_feature_gene_rMAT.xls") Output_GO_BP_gene=paste0(output.dir.name,"GO_term_BP_gene_only.xls") Output_GO_BP_feature=paste0(output.dir.name,"GO_term_BP_feature_only.xls") Output_GO_BP_feature_gene=paste0(output.dir.name,"GO_term_BP_feature_gene.xls") Output_GO_BP_feature_gene_redefined=paste0(output.dir.name,"GO_term_BP_feature_gene_redefined.xls") Output_GO_BP_feature.gene.rMAT.2.FC=paste0(output.dir.name,"GO_term_BP_feature_gene_rMAT_2FC.xls") output.dir.DE.name=paste0(output.dir.name,time.string,"_","DE/") dir.create(output.dir.DE.name) Output_DE_gene_rMAT_FDR=paste0(output.dir.DE.name,"DE_gene_rMAT_FDR.xls") Output_DE_gene_rMAT_p_inc=paste0(output.dir.DE.name,"GO_gene_rMAT_p_inc.xls")
4 BAM files(2 BAM files within each condition(WT vs KO)) are generated by aliging FASTQ files to the GRCm38/mm10 build of the Mus musculus genome with STAR aligner. Gene-wise counts were obtained with callQoRT function in GOSJ R package:
file.name=dir(input.dir.name,recursive = TRUE,pattern="sorted.bam") file.name.whole<-paste0(input.dir.name,file.name) file.name.selected<-file.name.whole[c(5,3,9)] file.name.selected.2<-as.list(file.name.selected) names(file.name.selected.2)=sapply(strsplit(file.name.selected,split="\\/"),"[[",6) cmd1=paste0("java -Xmx4000M -jar ",QoRTs.exe," QC --noGzipOutput") re.out<-lapply(file.name.selected.2,callQoRT,gtf_file=gtf.file,runing_cmd=cmd1)
Use gene based count, overall differential gene expression is analyzed with GetResults4GeneLevel function in GOSJ R package:
Re.PJ.gene<-GetResults4GeneLevel(input.dir.name,decode.file,gene.count.file)
Show the results for differentially expressed genes:
knitr::kable(head(Re.PJ.gene)) head(Re.PJ.gene) length(which(Re.PJ.gene$padj<0.05&abs(Re.PJ.gene$log2FoldChange)>0.5))
Collect all geneIDs from GO, canonical pathway and transcription factor target
gene.id.all<-CombineCpTftGoGeneID(canonical.path.gmt.file,tft.gmt.file,go.mouse.file,gene.annotation.file) index.cp.tft.co.gene<-match(gene.id.all,rownames(Re.PJ.gene)) index.cp.tft.co.gene.2<-index.cp.tft.co.gene[-which(is.na(index.cp.tft.co.gene))] Re.PJ.gene.rm.non.coding<-Re.PJ.gene[index.cp.tft.co.gene.2,]
Show the results for differentially expressed genes:
knitr::kable(head(Re.PJ.gene.rm.non.coding))
To perform GO and pathway analysis based on differentially expressed genes:
Gene.non.coding.GO<-GeneBasedAnalysis(Re.PJ.gene.rm.non.coding,gene.model,gene.based.ad.by.GL.GO.output.file) gene.2.cat.cp.mouse<-Gmt2GeneCat(canonical.path.gmt.file,gene.annotation.file) gene.2.cat.tft.mouse<-Gmt2GeneCat(tft.gmt.file,gene.annotation.file) RE.cp.gene<-OutputCatBasedPwf(Gene.non.coding.GO$pwf,gene_model=gene.model,gene_2_cat=gene.2.cat.cp.mouse, Output_file=gene.based.ad.by.GL.ca.path.output.file) RE.cft.gene<-OutputCatBasedPwf(Gene.non.coding.GO$pwf,gene_model=gene.model,gene_2_cat=gene.2.cat.tft.mouse, Output_file=gene.based.ad.by.GL.tft.output.file)
For differential exon and splicing usage analysis, the count file for exons and splicing junctions are used as input to get differential usage results with GetResultsFromJunctionSeq function in GOSJ R package:
#decoder <- read.table(paste0(input.dir.name,decode.file),header=TRUE,stringsAsFactors=FALSE) #countFiles <- paste0(input.dir.name,decoder$sample.ID,"/QC.spliceJunctionAndExonCounts.forJunctionSeq.txt") #design <- data.frame(condition = factor(decoder$group.ID)) #jscs.counts = readJunctionSeqCounts(countfiles = countFiles,samplenames = decoder$sample.ID,design = design,flat.gff.file = gff.file) Re.PJ<-GetResultsFromJunctionSeq(input.dir.name,decode.file,sj.exon.count.file,gff.file)
From the resutls of differential exons and splicing junctions usage(Re.PJ), we generate genewise results using makeGeneWiseTable function in GOSJ R package:
#load(paste0(input.dir.name,junctionSeq.output.save)) re.PJ.gene.based<-makeGeneWiseTable(Re.PJ,gene.list=unique(as.character(fData(Re.PJ)$geneID)))
To show some results of genewise resutls:
knitr::kable(head(pData(re.PJ.gene.based)))
DE.from.PJ.input.file="/media/H_driver/PJ/Results/SE.MATS.ReadsOnTargetAndJunctionCounts.Neg_WTvKO_c_001_0_20_inclusion_p_0_05_from_PJ.xlsx" DE.from.PJ<-read.xls(DE.from.PJ.input.file,header=T) dim(DE.from.PJ) length(unique(as.character(DE.from.PJ$geneSymbol)))
dir.name.rMAT="/media/H_driver/PJ/rMAT/pHamard_MATS-selected-re-run-exon-centric/pHamard_MATS-selected-re-run-exon-centric/Neg_WTvKO_c0.01/" input.file.pattern="*Neg_WTvKO_c0.01.txt" file.list<-paste0(dir.name.rMAT,dir(dir.name.rMAT)) print(file.list[19]) re.Data<-read.table(file.list[19],header = T) names(re.Data) DE.defined.by.p.IncLevelDifference<-re.Data[which(re.Data$PValue<0.05&abs(re.Data$IncLevelDifference)>0.20),] DE.defined.by.p.IncLevelDifference.geneSymbol<-unique(as.character(DE.defined.by.p.IncLevelDifference$geneSymbol)) length(DE.defined.by.p.IncLevelDifference.geneSymbol) #Match data set to DE list from PJ DE.from.PJ.2.H.driver<-re.Data[(which(as.character(re.Data$geneSymbol) %in% unique(as.character(DE.from.PJ$geneSymbol)))),c(1,2,3,19,20,23)] #dim(DE.from.PJ.2.H.driver) DE.from.PJ.2.H.driver.selected.events<-DE.from.PJ.2.H.driver[which(DE.from.PJ.2.H.driver$PValue<0.05&abs(DE.from.PJ.2.H.driver$IncLevelDifference)>=0.20),] dim(DE.from.PJ.2.H.driver.selected.events) length(unique(as.character(DE.from.PJ.2.H.driver.selected.events$geneSymbol))) setdiff(unique(as.character(DE.from.PJ.2.H.driver.selected.events$geneSymbol)),unique(as.character(DE.from.PJ$geneSymbol))) setdiff(DE.defined.by.p.IncLevelDifference.geneSymbol,unique(as.character(DE.from.PJ$geneSymbol))) knitr::kable(DE.from.PJ.2.H.driver.selected.events)
#load("/media/H_driver/PJ/JunctionRe.RData") De_defined_by_what="P_and_inclusion" re.rMAT<-ProcessOutputFilesFrom_rMATS(dir.name.rMAT,input.file.pattern,De_defined_by_what) Re.PJ.selected.feature.2.FC.p<-Select_DE_gene_basd_on_Feature(Re.PJ,re.PJ.gene.based,re.rMAT,"SE",1,0.05,gene.model,output.dir.DE.name)
DE.FDR.rMAT<-gene.model[which(gene.model$ensembl_gene_id %in% Re.PJ.selected.feature.2.FC.p$DGE.rMAT),] DE.FDR.rMAT write.table(DE.FDR.rMAT,file=Output_DE_gene_rMAT_FDR,row.names = FALSE,quote=FALSE,sep="\t")
DE.p.inc.rMAT<-gene.model[which(gene.model$ensembl_gene_id %in% Re.PJ.selected.feature.2.FC.p$DGE.p.IncLevelDifference),] DE.p.inc.rMAT write.table(DE.p.inc.rMAT,file=Output_DE_gene_rMAT_p_inc,row.names = FALSE,quote=FALSE,sep="\t")
cat("please find all results in: ", output.dir.name.rMAT_SE_based, "\n")
venn.plot <- venn.diagram( x = Re.PJ.selected.feature.2.FC.p[c(2,3,4)], filename = NULL, col = "black", lty = "dotted", lwd = 2, fill = c("red", "orange", "blue"), alpha = 0.50, label.col = c(rep("white",7)), cex = 1, fontfamily = "serif", fontface = "bold", cat.col = c("red", "orange", "blue"), cat.cex = 0.8, cat.fontfamily = "serif" ) grid.draw(venn.plot); #dev.off()
#output.dir.name="/media/H_driver/PJ/Results/" #output.dir.name.rMAT_SE_based=paste0(output.dir.name,"test_7_26_2016","_","Feature_GeneWise_rMAT_SE_based/") #dir.create(output.dir.name.rMAT_SE_based) #sink(paste0(output.dir.name.rMAT_SE_based,"Output_test.txt")) Re.combine.3.methods.rMAT.SE<-Combine3Re(Re.PJ, re.PJ.gene.based, re.rMAT,gene.model,output.dir.name.rMAT_SE_based) #save.image(file=paste0(output.dir.name.rMAT_SE_based,"re_save.RData")) #savehistory(file=paste0(output.dir.name.rMAT_SE_based,"re_save.Rhistory")) #sink()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.