# Install Package: Ctrl + Shift + B
# devtools::document()
#' combines IRead output files into one file
#'
#' Imports:
#' purrr,
#' dplyr,
#' data.table
#'
#' @param files vector of filenames from files that should be uploaded
#' @return A data.frame with read counts where each row is an intron and each column is a gene, except the last column which contains corresponding symbols
#' @examples
#' myFiles <- list.files()
#' iread <- seekIRead(myFiles)
seekIRead <- function(files){
tables <- lapply(files, data.table::fread)
minifun <- function(input){
output <- purrr::reduce(input, dplyr::inner_join, "intron_id") %>% as.data.frame()
rownames(output) <- output$intron_id
output <- output[,-1]
colnames(output) <- gsub("_Align.*$","",gsub("^.*/","",files))
output <- cbind(symbol=seekX2Y(gsub("-.*","",gsub("^.*ENS","ENS", rownames(output))),"ENSEMBL","SYMBOL","human"), output)
output
}
counts <- minifun(lapply(tables, function(x) x[,c(1,2)]))
IR <- minifun(lapply(tables, function(x) x[,c(1,7)]))
for(i in 2:ncol(IR)){
IR[,i] <- as.numeric(ifelse(IR[,i] %in% "yes",1,0))
}
list(IR, counts)
}
#' combines IRFinder output files into one file
#'
#' Imports:
#' DESeq2,
#' ggplot2
#'
#' @param subfolders vector of strings, full paths to subfolder of IRFinder results for each sample. You can use the following code to get the subfolders: subfolders <- mainfolder %>% paste0(list.files(.))
#' @param conditions vector, has to contain the conditions (strings or numbers) of the samples (WT, KO, etc.) in the same order as the parameter subfolders
#' @param cond0 string or numeric, control condition, has to be the same as in the parameter conditions
#' @param cond1 string or numeric, test condition, has to be the same as in the parameter conditions
#' @param batches vector of integers, standing for batch numbers
#' @param dir string, either "dir" or "nondir"
#' @return A list of two data.frames. The second contains
#' @details the code is based around the DESeq2Constructor from the IRFinder pipeline
#' @examples
#' myFiles <- list.files()
#' iread <- seekIRfind(myFiles)
seekIRfind <- function(subfolders, conditions, cond0, cond1, batches=NA, dir="dir"){
#subfolder <- paste0(mainfolder,list.files(mainfolder))
files <- lapply(subfolders, function(x) list.files(x)[grep(paste0("IRFinder-IR-",dir), list.files(x))])
paths <- paste0(subfolders,"/",files)
experiment <- as.data.frame(data.frame(SampleNames=subfolders, Condition=conditions, Batch=as.character(batches)))
experiment$Condition=factor(experiment$Condition,levels=c(cond0,cond1))
rownames(experiment)=NULL
print(experiment)
#== DESeq2 ==#
source("~/Apps/IRFinder-1.2.3/bin/DESeq2Constructor.R")
if(is.na(batches[1])){batchCorrection <- "FALSE"}else{batchCorrection <- "TRUE"}
metaList <- switch(batchCorrection,
"FALSE"=DESeqDataSetFromIRFinder(filePaths=paths, designMatrix=experiment, designFormula=~Condition+Condition:IRFinder),
"TRUE"=DESeqDataSetFromIRFinder(filePaths=paths, designMatrix=experiment, designFormula=~Batch+Condition+Condition:IRFinder)
)
dds <- DESeq2::DESeq(metaList$DESeq2Object)
#DESeq2::resultsNames(dds)
res.diff = DESeq2::results(dds, contrast=as.list(paste0("Condition",c(cond1,cond0),".IRFinderIR"))) #diff.IR between PR957 and DMSO
res.diff <- as.data.frame(res.diff)
colnames(res.diff) <- c("baseMean","log2FC","lfcSE","stat","pval","padj")
res.diff$symbol <- gsub("/.*$","",row.names(res.diff))
#== absolute IR-perentage change ==# (why do we care?)
minideseq <- function(comparison){
res <- DESeq2::results(dds, name = comparison) #calculating the log2FC between IR and splice
IR_vs_Splice <- 2^res$log2FoldChange #making the log2FC a regular FC
IRratio <- IR_vs_Splice/(1+IR_vs_Splice) #calculating the percentage of IR to IR+splice (total reads regarding this intron)
IRratio
}
IRratio.Ctrl <- minideseq(paste0("Condition",c(cond0),".IRFinderIR")) # IR percentage in DMSO for each intron
IRratio.Treat <- minideseq(paste0("Condition",c(cond1),".IRFinderIR")) # IR percentage in PR957 for each intron
IR.change <- IRratio.Treat - IRratio.Ctrl # change in IR percentage !!!ABSOLUTE!!! in contrast to the relative change we calculated with DESeq
#== combine relative table with absolute changes ==#
res.diff$IRchange <- IR.change
res.diff <- res.diff[order(res.diff$pval),]
#==output==#
ggplot2::ggplot(subset(res.diff, !is.na(padj)), ggplot2::aes(y=-log10(padj), color=ifelse(padj<=0.05,"sign.","unsig."))) +
ggplot2::geom_point(ggplot2::aes(x=IRchange)) +
ggplot2::geom_point(ggplot2::aes(x=log2FC), shape=21,fill="magenta", color="black") +
ggplot2::geom_hline(yintercept = -log10(0.05), linetype="dashed") +
ggplot2::labs(color="change")
list(IR=res.diff, dds=dds)
}
#' calculates differential transcript expression, using the DRIMSeq package, following the Salmon pipeline
#'
#' Imports:
#' vst,
#'
#' @param files vector of strings, specifying the locations of the Salmon output files ("/folderpath/quant.sf")
#' @param samps data.frame containing two or three columns: sample_id, condition, library_layout (optional). sample_id must contain desired sample names in the same order as the files parameter. condition must contain identifiers (integers or strings) to put each sample in a certain group (e.g. wt & ko or 1 & 2). Optionally a "library_layout" column can be added to account for batch effects.
#' @param txdb a TxDb object, made by makeTxDbFromGFF("folderpath/file.gtf.gz") or by loading an existing one loadDb("/folderpath/file.sqlite")
#' @param Species a string, either "human" or "mouse". Is used to generate gene symbols from Ensembl IDs.
#' @param noncontrol of the condition column from samps, which identifier is the non-control (ko, etc.)?
#' @return A list with two data.frames and dmDSdata object. [["res"]] contains p values for the test if a gene has differentially regulated isoforms. [["res.txp"]] contains p values for the test if an isoform is differentially regulated. [["d"]] contains the dmDSdata object used to do graphs with plotProportions().
#' @examples
#' txdb <- GenomicFeatures::makeTxDbFromGFF("/folderpath/gencode.v32.annotation.gtf.gz", organism="Homo sapiens")
#' #OR
#' txdb <- AnnotationDbi::loadDb("/folderpath/gencode.v32.annotation.sqlite")
#' samps <- data.frame(sample_id=c(c1,c2,c3,t1,t2,t3), condition=c(1,1,1,2,2,2))
#' files <- c("/folderpath/c1/quant.sf", "/folderpath/c2/quant.sf", "/folderpath/c3/quant.sf", "/folderpath/t1/quant.sf", "/folderpath/t2/quant.sf", "/folderpath/t3/quant.sf")
#' myIso <- seekSalmon(files=files, samps=samps, txdb=txdb, Species="human")
#' plotProportions(myIso[["d"]], myIso[["res"]]$gene_id[which(res1$adj_pvalue <= 0.05)[1]], "condition")
seekSalmon <- function(files, samps, txdb, Species, noncontrol){
#==samples==#
samps$sample_id <- make.names(samps$sample_id ,unique=T)
names(files) <- samps$sample_id
samps$condition <- factor(samps$condition)
cts <- tximport::tximport(files, type="salmon", txOut=TRUE, countsFromAbundance="scaledTPM")
cts <- cts$counts[rowSums(cts$counts) > 0,]
#==transcript database==#
message("#==Loading transcript database==#")
txdf <- AnnotationDbi::select(txdb, keys(txdb, "GENEID"), "TXNAME", "GENEID")
translation <- {switch(Species,
"human"=AnnotationDbi::mapIds(org.Hs.eg.db::org.Hs.eg.db, gsub("\\..*$", "", txdf$GENEID), "SYMBOL","ENSEMBL", multiVals="first"),
"mouse"=AnnotationDbi::mapIds(org.Mm.eg.db::org.Mm.eg.db, gsub("\\..*$", "", txdf$GENEID), "SYMBOL","ENSEMBL", multiVals="first"))}
txdf <- data.frame(gene_id=translation, feature_id=txdf$TXNAME)
#txdf <- merge(txdf, cts, by.x="feature_id", by.y=0)
#txdf$ntx <- table(txdf$GENEID)[match(txdf$GENEID, names(table(txdf$GENEID)))]
#txdf <- txdf[match(rownames(cts), txdf$TXNAME),]
#==transcript counts==#
message("#==generating transcript counts==#")
counts <- na.omit(merge(txdf, cts, by.x="feature_id", by.y=0))
#==diff.Expr==#
message("#==Filtering==#")
n.small <- length(files)-2
d1 <- DRIMSeq::dmDSdata(counts, samples=samps)
d2 <- DRIMSeq::dmFilter(d1, min_samps_feature_expr=n.small, min_feature_expr=10,
min_samps_feature_prop=n.small, min_feature_prop=0.1,
min_samps_gene_expr=length(files), min_gene_expr=10)
if(ncol(samps)==2){
design <- model.matrix(~condition, data=DRIMSeq::samples(d2))
}else{
design <- model.matrix(~condition + library_layout, data=DRIMSeq::samples(d2))
}
set.seed(1)
message("#==Calculating differential transcript usage (DRIMSeq)==#")
d3 <- DRIMSeq::dmPrecision(d2, design=design)
d4 <- DRIMSeq::dmFit(d3, design=design)
d5 <- DRIMSeq::dmTest(d4, coef=paste0("condition", noncontrol)) #coeff removes column of that group to calculate the null design. control must be either 1 or 2
res <- DRIMSeq::results(d5)
res.txp <- DRIMSeq::results(d5, level="feature")
#==final polishing==#
message("#==Translating IDs to gene symbols==#")
res <- res[order(res$adj_pvalue),]
res.txp <- res.txp[order(res.txp$adj_pvalue),]
#==output==#
list(res=res, res.txp=res.txp, d=d5)
}
seekSalmonPlot <- function(seekSalmonOut3, gene, plot_type="boxplot1", group_variable="condition"){
isoPlot <- DRIMSeq::plotProportions(seekSalmonOut3, gene_id=gene, plot_type=plot_type, group_variable=group_variable,
plot_fit=F)
isoPlot <- ggplotify::as.ggplot(isoPlot)
isoPlot
}
# setwd(mainfolder)
# results = read.table("filePaths.txt")
# paths = as.vector(results$V1)
# experiment=data.frame()
# tables <- lapply(paths, function(x) {
# read.table(x, header=F, sep="\t",col.names=c("Chr","Start","End","Symbol_EnsemblID_StaticWarning","null","Strand","ExcludedBases","Coverage",
# "IntronDepth","Intron25Percentile","Intron50Percentile","Intron75Percentile",
# "ExonToIntronReadsLeft","ExonToIntronReadsRight","IntronDepthFirst50bp","IntronDepthLast50bp",
# "SplifeLeft","SpliceRight","SpliceExact","IRratio","Warnings"))
# })
# tables <- lapply(tables, function(x) cbind(x, intron_id=paste(x$Chr,x$Start,x$End,x$Symbol_EnsemblID_StaticWarning,sep="_")))
#
# #count matrix for intron reads
# counts <- mapply(function(x,y) `colnames<-`(x[,c("IntronDepth","intron_id")],c(y,"intron_id")), x=tables, y=list.files(mainfolder), SIMPLIFY=F)
# counts <- purrr::reduce(counts, inner_join, "intron_id")
# rownames(counts) <- make.names(counts$intron_id)
# counts <- counts[,-2]
#
# #count matrix for intronExon reads + intron reads (addition in last step)
# counts2 <- mapply(function(x,y) `colnames<-`(x[,c("SpliceExact","intron_id")],c(y,"intron_id")), x=tables, y=list.files(mainfolder), SIMPLIFY=F)
# counts2 <- purrr::reduce(counts2, inner_join, "intron_id")
# rownames(counts2) <- make.names(counts2$intron_id)
# counts2 <- counts2[,-2]
# counts2 <- counts2+counts
#
# #count matrix with both
# counts3 <- merge(counts, counts2, by=0)
# rownames(counts3) <-counts3$Row.names
# counts3 <- counts3[-1]
# counts3 <- round(counts3)
#
# #DESeq2 between counts1 & 2
# seekDEseq(counts3, rep(c(1,2), each=(ncol/2)))
#
#
#
# names(counts) <- subfolder
# counts <- purrr::reduce(counts, inner_join, "intron_id")
# rownames(counts) <- counts$intron_id
# counts <- counts[,-1]
# colnames(counts) <- gsub("_Align.*$","",gsub("^trim_","",files))
# counts$symbol <- seekX2Y(gsub("-.*","",gsub("^.*ENS","ENS", rownames(counts))),"ENSEMBL","SYMBOL","human")
# counts
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.