seeqR:

Documented in seekIRead seekIRfind seekSalmon

# Install Package: Ctrl + Shift + B

# devtools::document()


#' combines IRead output files into one file
#'
#' Imports:
#' purrr,
#' dplyr,
#' data.table
#'
#' @param files vector of filenames from files that should be uploaded
#' @return A data.frame with read counts where each row is an intron and each column is a gene, except the last column which contains corresponding symbols
#' @examples
#' myFiles <- list.files()
#' iread <- seekIRead(myFiles)
seekIRead <- function(files){
  tables <- lapply(files, data.table::fread)

  minifun <- function(input){
    output <- purrr::reduce(input, dplyr::inner_join, "intron_id") %>% as.data.frame()
    rownames(output) <- output$intron_id
    output <- output[,-1]
    colnames(output) <- gsub("_Align.*$","",gsub("^.*/","",files))
    output <- cbind(symbol=seekX2Y(gsub("-.*","",gsub("^.*ENS","ENS", rownames(output))),"ENSEMBL","SYMBOL","human"), output)
    output
  }
  counts <- minifun(lapply(tables, function(x) x[,c(1,2)]))
  IR <- minifun(lapply(tables, function(x) x[,c(1,7)]))
  for(i in 2:ncol(IR)){
    IR[,i] <- as.numeric(ifelse(IR[,i] %in% "yes",1,0))
  }

  list(IR, counts)
}

#' combines IRFinder output files into one file
#'
#' Imports:
#' DESeq2,
#' ggplot2
#'
#' @param subfolders vector of strings, full paths to subfolder of IRFinder results for each sample. You can use the following code to get the subfolders: subfolders <- mainfolder %>% paste0(list.files(.))
#' @param conditions vector, has to contain the conditions (strings or numbers) of the samples (WT, KO, etc.) in the same order as the parameter subfolders
#' @param cond0 string or numeric, control condition, has to be the same as in the parameter conditions
#' @param cond1 string or numeric, test condition, has to be the same as in the parameter conditions
#' @param batches vector of integers, standing for batch numbers
#' @param dir string, either "dir" or "nondir"
#' @return A list of two data.frames. The second contains
#' @details the code is based around the DESeq2Constructor from the IRFinder pipeline
#' @examples
#' myFiles <- list.files()
#' iread <- seekIRfind(myFiles)
seekIRfind <- function(subfolders, conditions, cond0, cond1, batches=NA, dir="dir"){
  #subfolder <- paste0(mainfolder,list.files(mainfolder))
  files <- lapply(subfolders, function(x) list.files(x)[grep(paste0("IRFinder-IR-",dir), list.files(x))])
  paths <- paste0(subfolders,"/",files)

  experiment <- as.data.frame(data.frame(SampleNames=subfolders, Condition=conditions, Batch=as.character(batches)))
  experiment$Condition=factor(experiment$Condition,levels=c(cond0,cond1))
  rownames(experiment)=NULL
  print(experiment)
  #== DESeq2 ==#
  source("~/Apps/IRFinder-1.2.3/bin/DESeq2Constructor.R")
  if(is.na(batches[1])){batchCorrection <- "FALSE"}else{batchCorrection <- "TRUE"}
  metaList <- switch(batchCorrection,
                     "FALSE"=DESeqDataSetFromIRFinder(filePaths=paths, designMatrix=experiment, designFormula=~Condition+Condition:IRFinder),
                     "TRUE"=DESeqDataSetFromIRFinder(filePaths=paths, designMatrix=experiment, designFormula=~Batch+Condition+Condition:IRFinder)
  )
  dds <- DESeq2::DESeq(metaList$DESeq2Object)
  #DESeq2::resultsNames(dds)
  res.diff = DESeq2::results(dds, contrast=as.list(paste0("Condition",c(cond1,cond0),".IRFinderIR"))) #diff.IR between PR957 and DMSO
  res.diff <- as.data.frame(res.diff)
  colnames(res.diff) <- c("baseMean","log2FC","lfcSE","stat","pval","padj")
  res.diff$symbol <- gsub("/.*$","",row.names(res.diff))

  #== absolute IR-perentage change ==# (why do we care?)
  minideseq <- function(comparison){
    res <- DESeq2::results(dds, name = comparison) #calculating the log2FC between IR and splice
    IR_vs_Splice <- 2^res$log2FoldChange #making the log2FC a regular FC
    IRratio <- IR_vs_Splice/(1+IR_vs_Splice) #calculating the percentage of IR to IR+splice (total reads regarding this intron)
    IRratio
  }
  IRratio.Ctrl <- minideseq(paste0("Condition",c(cond0),".IRFinderIR")) # IR percentage in DMSO  for each intron
  IRratio.Treat <- minideseq(paste0("Condition",c(cond1),".IRFinderIR")) # IR percentage in PR957 for each intron
  IR.change <- IRratio.Treat - IRratio.Ctrl # change in IR percentage !!!ABSOLUTE!!! in contrast to the relative change we calculated with DESeq

  #== combine relative table with absolute changes ==#
  res.diff$IRchange <- IR.change
  res.diff <- res.diff[order(res.diff$pval),]

  #==output==#
  ggplot2::ggplot(subset(res.diff, !is.na(padj)), ggplot2::aes(y=-log10(padj), color=ifelse(padj<=0.05,"sign.","unsig."))) +
    ggplot2::geom_point(ggplot2::aes(x=IRchange)) +
    ggplot2::geom_point(ggplot2::aes(x=log2FC), shape=21,fill="magenta", color="black") +
    ggplot2::geom_hline(yintercept = -log10(0.05), linetype="dashed") +
    ggplot2::labs(color="change")

  list(IR=res.diff, dds=dds)
}


#' calculates differential transcript expression, using the DRIMSeq package, following the Salmon pipeline
#'
#' Imports:
#' vst,
#'
#' @param files vector of strings, specifying the locations of the Salmon output files ("/folderpath/quant.sf")
#' @param samps data.frame containing two or three columns: sample_id, condition, library_layout (optional). sample_id must contain desired sample names in the same order as the files parameter. condition must contain identifiers (integers or strings) to put each sample in a certain group (e.g. wt & ko or 1 & 2). Optionally a "library_layout" column can be added to account for batch effects.
#' @param txdb a TxDb object, made by makeTxDbFromGFF("folderpath/file.gtf.gz") or by loading an existing one loadDb("/folderpath/file.sqlite")
#' @param Species a string, either "human" or "mouse". Is used to generate gene symbols from Ensembl IDs.
#' @param noncontrol of the condition column from samps, which identifier is the non-control (ko, etc.)?
#' @return A list with two data.frames and dmDSdata object. [["res"]] contains p values for the test if a gene has differentially regulated isoforms. [["res.txp"]] contains p values for the test if an isoform is differentially regulated. [["d"]] contains the dmDSdata object used to do graphs with plotProportions().
#' @examples
#' txdb <- GenomicFeatures::makeTxDbFromGFF("/folderpath/gencode.v32.annotation.gtf.gz", organism="Homo sapiens")
#' #OR
#' txdb <- AnnotationDbi::loadDb("/folderpath/gencode.v32.annotation.sqlite")
#' samps <- data.frame(sample_id=c(c1,c2,c3,t1,t2,t3), condition=c(1,1,1,2,2,2))
#' files <- c("/folderpath/c1/quant.sf", "/folderpath/c2/quant.sf", "/folderpath/c3/quant.sf", "/folderpath/t1/quant.sf", "/folderpath/t2/quant.sf", "/folderpath/t3/quant.sf")
#' myIso <- seekSalmon(files=files, samps=samps, txdb=txdb, Species="human")
#' plotProportions(myIso[["d"]], myIso[["res"]]$gene_id[which(res1$adj_pvalue <= 0.05)[1]], "condition")
seekSalmon <- function(files, samps, txdb, Species, noncontrol){
  #==samples==#
  samps$sample_id <- make.names(samps$sample_id ,unique=T)
  names(files) <- samps$sample_id
  samps$condition <- factor(samps$condition)
  cts <- tximport::tximport(files, type="salmon", txOut=TRUE, countsFromAbundance="scaledTPM")
  cts <- cts$counts[rowSums(cts$counts) > 0,]

  #==transcript database==#
  message("#==Loading transcript database==#")
  txdf <- AnnotationDbi::select(txdb, keys(txdb, "GENEID"), "TXNAME", "GENEID")

  translation <-  {switch(Species,
                          "human"=AnnotationDbi::mapIds(org.Hs.eg.db::org.Hs.eg.db, gsub("\\..*$", "", txdf$GENEID), "SYMBOL","ENSEMBL", multiVals="first"),
                          "mouse"=AnnotationDbi::mapIds(org.Mm.eg.db::org.Mm.eg.db, gsub("\\..*$", "", txdf$GENEID), "SYMBOL","ENSEMBL", multiVals="first"))}
  txdf <- data.frame(gene_id=translation, feature_id=txdf$TXNAME)
  #txdf <- merge(txdf, cts, by.x="feature_id", by.y=0)
  #txdf$ntx <- table(txdf$GENEID)[match(txdf$GENEID, names(table(txdf$GENEID)))]
  #txdf <- txdf[match(rownames(cts), txdf$TXNAME),]

  #==transcript counts==#
  message("#==generating transcript counts==#")
  counts <- na.omit(merge(txdf, cts, by.x="feature_id", by.y=0))

  #==diff.Expr==#
  message("#==Filtering==#")
  n.small <- length(files)-2
  d1 <- DRIMSeq::dmDSdata(counts, samples=samps)
  d2 <- DRIMSeq::dmFilter(d1, min_samps_feature_expr=n.small, min_feature_expr=10,
                          min_samps_feature_prop=n.small, min_feature_prop=0.1,
                          min_samps_gene_expr=length(files), min_gene_expr=10)
  if(ncol(samps)==2){
    design <- model.matrix(~condition, data=DRIMSeq::samples(d2))
  }else{
    design <- model.matrix(~condition + library_layout, data=DRIMSeq::samples(d2))
  }
  set.seed(1)
  message("#==Calculating differential transcript usage (DRIMSeq)==#")
  d3 <- DRIMSeq::dmPrecision(d2, design=design)
  d4 <- DRIMSeq::dmFit(d3, design=design)
  d5 <- DRIMSeq::dmTest(d4, coef=paste0("condition", noncontrol)) #coeff removes column of that group to calculate the null design. control must be either 1 or 2
  res <- DRIMSeq::results(d5)
  res.txp <- DRIMSeq::results(d5, level="feature")

  #==final polishing==#
  message("#==Translating IDs to gene symbols==#")
  res <- res[order(res$adj_pvalue),]
  res.txp <- res.txp[order(res.txp$adj_pvalue),]

  #==output==#
  list(res=res, res.txp=res.txp, d=d5)
}

seekSalmonPlot <- function(seekSalmonOut3, gene, plot_type="boxplot1", group_variable="condition"){
  isoPlot <- DRIMSeq::plotProportions(seekSalmonOut3, gene_id=gene, plot_type=plot_type, group_variable=group_variable,
                                      plot_fit=F)
  isoPlot <- ggplotify::as.ggplot(isoPlot)
  isoPlot
}

# setwd(mainfolder)
# results = read.table("filePaths.txt")
# paths = as.vector(results$V1)
# experiment=data.frame()
# tables <- lapply(paths, function(x) {
#   read.table(x, header=F, sep="\t",col.names=c("Chr","Start","End","Symbol_EnsemblID_StaticWarning","null","Strand","ExcludedBases","Coverage",
#                                                "IntronDepth","Intron25Percentile","Intron50Percentile","Intron75Percentile",
#                                                "ExonToIntronReadsLeft","ExonToIntronReadsRight","IntronDepthFirst50bp","IntronDepthLast50bp",
#                                                "SplifeLeft","SpliceRight","SpliceExact","IRratio","Warnings"))
# })
# tables <- lapply(tables, function(x) cbind(x, intron_id=paste(x$Chr,x$Start,x$End,x$Symbol_EnsemblID_StaticWarning,sep="_")))
#
# #count matrix for intron reads
# counts <- mapply(function(x,y) `colnames<-`(x[,c("IntronDepth","intron_id")],c(y,"intron_id")), x=tables, y=list.files(mainfolder), SIMPLIFY=F)
# counts <- purrr::reduce(counts, inner_join, "intron_id")
# rownames(counts) <- make.names(counts$intron_id)
# counts <- counts[,-2]
#
# #count matrix for intronExon reads + intron reads (addition in last step)
# counts2 <- mapply(function(x,y) `colnames<-`(x[,c("SpliceExact","intron_id")],c(y,"intron_id")), x=tables, y=list.files(mainfolder), SIMPLIFY=F)
# counts2 <- purrr::reduce(counts2, inner_join, "intron_id")
# rownames(counts2) <- make.names(counts2$intron_id)
# counts2 <- counts2[,-2]
# counts2 <- counts2+counts
#
# #count matrix with both
# counts3 <- merge(counts, counts2, by=0)
# rownames(counts3) <-counts3$Row.names
# counts3 <- counts3[-1]
# counts3 <- round(counts3)
#
# #DESeq2 between counts1 & 2
# seekDEseq(counts3, rep(c(1,2), each=(ncol/2)))
#
#
#
# names(counts) <- subfolder
# counts <- purrr::reduce(counts, inner_join, "intron_id")
# rownames(counts) <- counts$intron_id
# counts <- counts[,-1]
# colnames(counts) <- gsub("_Align.*$","",gsub("^trim_","",files))
# counts$symbol <- seekX2Y(gsub("-.*","",gsub("^.*ENS","ENS", rownames(counts))),"ENSEMBL","SYMBOL","human")
# counts