Time for generating this report

time.string = gsub(":", "-", gsub(" ", "_", as.character(Sys.time())))
cat("This report is generated at:", time.string,"\n")

set up runing environment

Load required library

library(JunctionSeq)
library(GOSJ)
library(Biobase)
library(goseq)
library(org.Mm.eg.db)
library(VennDiagram)
library(GSA)
library(GO.db)
library(ggplot2)
library(gdata)
library(popbio)
#library(ReportingTools)
#library(hgu95av2.db)
#library(devtools)
#library(rgl)
library(qpcR)
library(AnnotationDbi)
library(ggbio)
#load("/media/H_driver/PJ/re_save_2016-07-05 17:55:10_PJ_jscs.RData")
load("/media/H_driver/PJ/JunctionRe.RData")
#set up input file path

input.dir.name="/media/H_driver/PJ/"
QoRTs.dir="/home/aiminyan/QoRTs/QoRTsFullExampleData/QoRTsRelease/"
gtf.file.dir="/media/aiminyan/DATA/mus_musculus/"
gff.file.dir="/media/H_driver/2015/Nimer_Cheng/"
canonical.tft.gmt.file.dir="/media/H_driver/Annotation/MsigDB/"
go.file.dir="/media/H_driver/Annotation/GO_gene_mouse/"
gene.annotation.file.dir="/media/H_driver/Annotation/mm10/"

#load gene model
data(gene.model)

#set up QoRTs EXE
QoRTs.exe=paste0(QoRTs.dir,"QoRTs.jar")

#set up input files
gtf.file=paste0(gtf.file.dir,"Mus_musculus.GRCm38.83.processed.sorted.gtf")
decode.file="decoder.bySample.rtf"
gene.count.file="/QC.geneCounts.formatted.for.DESeq.txt"

canonical.path.gmt.file=paste0(canonical.tft.gmt.file.dir,"c2.cp.Mouse.v5.1.symbols.gmt")
tft.gmt.file=paste0(canonical.tft.gmt.file.dir,"c3.tft.Mouse.v5.1.symbols.gmt")
go.mouse.file=paste0(go.file.dir,"gene_GO.mgi")
gene.annotation.file=paste0(gene.annotation.file.dir,"genes_table_02052016.csv")

sj.exon.count.file="/QC.spliceJunctionAndExonCounts.forJunctionSeq.txt"
gff.file=paste0(gff.file.dir,"Mus_musculus.GRCm38.83.JunctionSeq.flat.gff")
junctionSeq.output.save="PJ_jscs.RData"
junctionSeq.output.save.history="PJ_jscs.Rhistory"
#set up the path for output files
output.dir.name="/media/H_driver/PJ/Results/"


output.dir.name.new="/media/H_driver/PJ/Results/Feature_GeneWise_rMAT/"

output.dir.name.rMAT_SE_based=paste0(output.dir.name,time.string,"_","Feature_GeneWise_rMAT_SE_based/")
dir.create(output.dir.name.rMAT_SE_based)

#set up output files for gene based
gene.based.ad.by.GL.GO.output.file=paste0(output.dir.name,"geneGL_rm_non_coding.xls")
gene.based.ad.by.GL.ca.path.output.file=paste0(output.dir.name,"geneGL_rm_non_coding_Cp.xls")
gene.based.ad.by.GL.tft.output.file=paste0(output.dir.name,"geneGL_rm_non_coding_tft.xls")

#set up output file for feature based
feature.based.ad.by.SJ.GO.output.file=paste0(output.dir.name,"GO_use_all_gene.xls")
feature.based.ad.by.SJ.ca.path.output.file=paste0(output.dir.name,"Canonical_path_use_all_gene.xls")
feature.based.ad.by.SJ.tft.output.file=paste0(output.dir.name,"Transcription_factor_targets_use_all_gene.xls")
Output_DE_gene_file_feature_gene_based=paste0(output.dir.name,"DE_genes_based_on_features_gene_based.xls")
Output_DE_gene_file_feature_all_feature_based=paste0(output.dir.name,"DE_genes_based_on_features_all_feature_based.xls")

Output_GO_use_DE_gene_use_FC_p_of_feature=paste0(output.dir.name,"GO_term_use_DE_gene_based_on_FC_p_feature.xls")
redefined.feature.based.ad.by.SJ.ca.path.output.file=paste0(output.dir.name,"Ca_path_use_DE_gene_based_on_FC_p_feature.xls")
redefined.feature.based.ad.by.SJ.tft.output.file=paste0(output.dir.name,"Tft_use_DE_gene_based_on_FC_p_feature.xls")

Output_GO_use_DE_gene_use_FC_p_of_feature_058=paste0(output.dir.name,"GO_term_use_DE_gene_based_on_FC_p_feature_058.xls")

Output_GO_BP_use_DE_gene_use_FC_p_of_feature_058=paste0(output.dir.name,"GO_term_BP_use_DE_gene_based_on_FC_p_feature_058.xls")
Output_GO_BP_use_DE_gene_use_FC_p_of_feature_058_2=paste0(output.dir.name,"GO_term_BP_use_DE_gene_based_on_FC_p_feature_058_2.xls")
Output_GO_BP_use_DE_gene_use_FC_p_of_feature_058_3=paste0(output.dir.name,"GO_term_BP_use_DE_gene_based_on_FC_p_feature_058_3.xls")
Output_GO_BP_use_DE_gene_use_FC_p_of_feature_058_4=paste0(output.dir.name,"GO_term_BP_use_DE_gene_based_on_FC_p_feature_058_4.xls")

redefined.feature.based.ad.by.SJ.ca.path.output.file_058=paste0(output.dir.name,"Ca_path_use_DE_gene_based_on_FC_p_feature_058.xls")
redefined.feature.based.ad.by.SJ.tft.output.file_058=paste0(output.dir.name,"Tft_use_DE_gene_based_on_FC_p_feature_058.xls")
outputfile_LabelGeneBasedFeature=paste0(output.dir.name,"Label_gene_by_features.xls")
outputfile_DGE_FC_P_geneWise=paste0(output.dir.name,"DGE_overlap_check.tiff")

outputfile_DGE_2_FC_P_geneWise=paste0(output.dir.name,"DGE_overlap_check_defined_by_2_FC.tiff")


Output_GO_BP_use_DE_gene_use_FC_p_of_feature_058_2_rMAT=paste0(output.dir.name,"GO_term_BP_use_DE_gene_based_on_FC_p_feature_058_4_rMAT.xls")

Output_GO_BP_feature.gene.rMAT=paste0(output.dir.name,"GO_term_BP_feature_gene_rMAT.xls")
Output_GO_BP_gene=paste0(output.dir.name,"GO_term_BP_gene_only.xls")
Output_GO_BP_feature=paste0(output.dir.name,"GO_term_BP_feature_only.xls")
Output_GO_BP_feature_gene=paste0(output.dir.name,"GO_term_BP_feature_gene.xls")
Output_GO_BP_feature_gene_redefined=paste0(output.dir.name,"GO_term_BP_feature_gene_redefined.xls")

Output_GO_BP_feature.gene.rMAT.2.FC=paste0(output.dir.name,"GO_term_BP_feature_gene_rMAT_2FC.xls")

output.dir.DE.name=paste0(output.dir.name,time.string,"_","DE/")
dir.create(output.dir.DE.name)

Output_DE_gene_rMAT_FDR=paste0(output.dir.DE.name,"DE_gene_rMAT_FDR.xls")
Output_DE_gene_rMAT_p_inc=paste0(output.dir.DE.name,"GO_gene_rMAT_p_inc.xls")

Get counts for gene, exon and splicing junctions

4 BAM files(2 BAM files within each condition(WT vs KO)) are generated by aliging FASTQ files to the GRCm38/mm10 build of the Mus musculus genome with STAR aligner. Gene-wise counts were obtained with callQoRT function in GOSJ R package:

file.name=dir(input.dir.name,recursive = TRUE,pattern="sorted.bam")
file.name.whole<-paste0(input.dir.name,file.name)
file.name.selected<-file.name.whole[c(5,3,9)]
file.name.selected.2<-as.list(file.name.selected)
names(file.name.selected.2)=sapply(strsplit(file.name.selected,split="\\/"),"[[",6)
cmd1=paste0("java -Xmx4000M -jar ",QoRTs.exe," QC --noGzipOutput")

re.out<-lapply(file.name.selected.2,callQoRT,gtf_file=gtf.file,runing_cmd=cmd1)

Gene based analysis

Use gene based count, overall differential gene expression is analyzed with GetResults4GeneLevel function in GOSJ R package:

Re.PJ.gene<-GetResults4GeneLevel(input.dir.name,decode.file,gene.count.file)

Show the results for differentially expressed genes:

knitr::kable(head(Re.PJ.gene))

head(Re.PJ.gene)
length(which(Re.PJ.gene$padj<0.05&abs(Re.PJ.gene$log2FoldChange)>0.5))

Collect all geneIDs from GO, canonical pathway and transcription factor target

gene.id.all<-CombineCpTftGoGeneID(canonical.path.gmt.file,tft.gmt.file,go.mouse.file,gene.annotation.file)
index.cp.tft.co.gene<-match(gene.id.all,rownames(Re.PJ.gene))
index.cp.tft.co.gene.2<-index.cp.tft.co.gene[-which(is.na(index.cp.tft.co.gene))]
Re.PJ.gene.rm.non.coding<-Re.PJ.gene[index.cp.tft.co.gene.2,]

Show the results for differentially expressed genes:

knitr::kable(head(Re.PJ.gene.rm.non.coding))

To perform GO and pathway analysis based on differentially expressed genes:

Gene.non.coding.GO<-GeneBasedAnalysis(Re.PJ.gene.rm.non.coding,gene.model,gene.based.ad.by.GL.GO.output.file)

gene.2.cat.cp.mouse<-Gmt2GeneCat(canonical.path.gmt.file,gene.annotation.file)
gene.2.cat.tft.mouse<-Gmt2GeneCat(tft.gmt.file,gene.annotation.file)

RE.cp.gene<-OutputCatBasedPwf(Gene.non.coding.GO$pwf,gene_model=gene.model,gene_2_cat=gene.2.cat.cp.mouse,
                              Output_file=gene.based.ad.by.GL.ca.path.output.file)

RE.cft.gene<-OutputCatBasedPwf(Gene.non.coding.GO$pwf,gene_model=gene.model,gene_2_cat=gene.2.cat.tft.mouse,
                               Output_file=gene.based.ad.by.GL.tft.output.file)

Analysis based on exons and splicing junction counts

For differential exon and splicing usage analysis, the count file for exons and splicing junctions are used as input to get differential usage results with GetResultsFromJunctionSeq function in GOSJ R package:

#decoder <- read.table(paste0(input.dir.name,decode.file),header=TRUE,stringsAsFactors=FALSE)
#countFiles <- paste0(input.dir.name,decoder$sample.ID,"/QC.spliceJunctionAndExonCounts.forJunctionSeq.txt")
#design <- data.frame(condition = factor(decoder$group.ID))

#jscs.counts = readJunctionSeqCounts(countfiles = countFiles,samplenames = decoder$sample.ID,design = design,flat.gff.file = gff.file)


Re.PJ<-GetResultsFromJunctionSeq(input.dir.name,decode.file,sj.exon.count.file,gff.file)

From the resutls of differential exons and splicing junctions usage(Re.PJ), we generate genewise results using makeGeneWiseTable function in GOSJ R package:

#load(paste0(input.dir.name,junctionSeq.output.save))
re.PJ.gene.based<-makeGeneWiseTable(Re.PJ,gene.list=unique(as.character(fData(Re.PJ)$geneID)))

To show some results of genewise resutls:

knitr::kable(head(pData(re.PJ.gene.based)))

Compare DE from PJ with DE we got from PJ files

DE.from.PJ.input.file="/media/H_driver/PJ/Results/SE.MATS.ReadsOnTargetAndJunctionCounts.Neg_WTvKO_c_001_0_20_inclusion_p_0_05_from_PJ.xlsx"
DE.from.PJ<-read.xls(DE.from.PJ.input.file,header=T)
dim(DE.from.PJ)
length(unique(as.character(DE.from.PJ$geneSymbol)))
dir.name.rMAT="/media/H_driver/PJ/rMAT/pHamard_MATS-selected-re-run-exon-centric/pHamard_MATS-selected-re-run-exon-centric/Neg_WTvKO_c0.01/"
input.file.pattern="*Neg_WTvKO_c0.01.txt"

file.list<-paste0(dir.name.rMAT,dir(dir.name.rMAT))
print(file.list[19])

re.Data<-read.table(file.list[19],header = T)
names(re.Data)
DE.defined.by.p.IncLevelDifference<-re.Data[which(re.Data$PValue<0.05&abs(re.Data$IncLevelDifference)>0.20),]
DE.defined.by.p.IncLevelDifference.geneSymbol<-unique(as.character(DE.defined.by.p.IncLevelDifference$geneSymbol))
length(DE.defined.by.p.IncLevelDifference.geneSymbol)

#Match data set to DE list from PJ
DE.from.PJ.2.H.driver<-re.Data[(which(as.character(re.Data$geneSymbol) %in% unique(as.character(DE.from.PJ$geneSymbol)))),c(1,2,3,19,20,23)]

#dim(DE.from.PJ.2.H.driver)
DE.from.PJ.2.H.driver.selected.events<-DE.from.PJ.2.H.driver[which(DE.from.PJ.2.H.driver$PValue<0.05&abs(DE.from.PJ.2.H.driver$IncLevelDifference)>=0.20),]
dim(DE.from.PJ.2.H.driver.selected.events)

length(unique(as.character(DE.from.PJ.2.H.driver.selected.events$geneSymbol)))

setdiff(unique(as.character(DE.from.PJ.2.H.driver.selected.events$geneSymbol)),unique(as.character(DE.from.PJ$geneSymbol)))

setdiff(DE.defined.by.p.IncLevelDifference.geneSymbol,unique(as.character(DE.from.PJ$geneSymbol)))

knitr::kable(DE.from.PJ.2.H.driver.selected.events)

Generate DE list

#load("/media/H_driver/PJ/JunctionRe.RData")
De_defined_by_what="P_and_inclusion"
re.rMAT<-ProcessOutputFilesFrom_rMATS(dir.name.rMAT,input.file.pattern,De_defined_by_what)
Re.PJ.selected.feature.2.FC.p<-Select_DE_gene_basd_on_Feature(Re.PJ,re.PJ.gene.based,re.rMAT,"SE",1,0.05,gene.model,output.dir.DE.name)

DE defined by FDR from rMAT(FDR<0.05)

DE.FDR.rMAT<-gene.model[which(gene.model$ensembl_gene_id %in% Re.PJ.selected.feature.2.FC.p$DGE.rMAT),]
DE.FDR.rMAT
write.table(DE.FDR.rMAT,file=Output_DE_gene_rMAT_FDR,row.names = FALSE,quote=FALSE,sep="\t")

DE defined by p value and IncLevelDifference from rMAT(p value <0.05 and IncLevelDifference>0.2)

DE.p.inc.rMAT<-gene.model[which(gene.model$ensembl_gene_id %in% Re.PJ.selected.feature.2.FC.p$DGE.p.IncLevelDifference),]
DE.p.inc.rMAT
write.table(DE.p.inc.rMAT,file=Output_DE_gene_rMAT_p_inc,row.names = FALSE,quote=FALSE,sep="\t")

Overlap on DE between JunctionSeq, rMAT(p<0.05 and incl> 0.20)

cat("please find all results in: ", output.dir.name.rMAT_SE_based, "\n")
venn.plot <- venn.diagram(
    x = Re.PJ.selected.feature.2.FC.p[c(2,3,4)],
    filename = NULL,
    col = "black",
    lty = "dotted",
    lwd = 2,
    fill = c("red", "orange", "blue"),
    alpha = 0.50,
    label.col = c(rep("white",7)),
    cex = 1,
    fontfamily = "serif",
    fontface = "bold",
    cat.col = c("red", "orange", "blue"),
    cat.cex = 0.8,
    cat.fontfamily = "serif"
  )
grid.draw(venn.plot);
#dev.off()
#output.dir.name="/media/H_driver/PJ/Results/"
#output.dir.name.rMAT_SE_based=paste0(output.dir.name,"test_7_26_2016","_","Feature_GeneWise_rMAT_SE_based/")
#dir.create(output.dir.name.rMAT_SE_based)
#sink(paste0(output.dir.name.rMAT_SE_based,"Output_test.txt"))
Re.combine.3.methods.rMAT.SE<-Combine3Re(Re.PJ, re.PJ.gene.based, re.rMAT,gene.model,output.dir.name.rMAT_SE_based)
#save.image(file=paste0(output.dir.name.rMAT_SE_based,"re_save.RData"))
#savehistory(file=paste0(output.dir.name.rMAT_SE_based,"re_save.Rhistory"))
#sink()


aiminy/GOSJ documentation built on May 12, 2019, 3:38 a.m.