# # Load/install dependencies
# CRAN_packs <- c("magrittr", "parallel", "optparse", "plyr", "ggplot2", "grid", "gtools", "reshape2",
# "EnvStats", "RColorBrewer")
# BIOC_packs <- c("GenomicAlignments", "GenomicRanges", "rtracklayer")
# sapply(CRAN_packs, function(x) installLoad_CRAN(x))
# sapply(BIOC_packs, function(x) installLoad_BioC(x))
# # Functions for plotting.
# # Score RD: Makes an average of the relative differences of neighboring positions
# scoreRD <- function(CHR, d = 6){
# scoreRD <- rep(0, length(CHR)); names(scoreRD) <- names(CHR)
# ds <- NULL; us <- NULL
# for(P in (d+1):(length(CHR)-d-1)){
# evalPos <- CHR[P]
# leftFlank <- CHR[(P-d):(P-1)]
# rightFlank <- CHR[(P+1):(P+d)]
# if(sum(leftFlank == 0) > 3 | sum(rightFlank == 0) > 3){next()}
# us <- mean(((leftFlank - evalPos + 1) / (evalPos + 1))) # Up-stream Relative difference
# ds <- mean(((rightFlank - evalPos + 1) / (evalPos + 1))) # Downstream Relative difference
# scoreRD[P] <- mean(c(us, ds))
# }
# # scoreRD <- (scoreRD - min(scoreRD)) / (max(scoreRD) - min(scoreRD))
# return(scoreRD)
# }
#
# #Gives scoreA given a numeric vector containing read ends
# scoreA <- function(CHR, d = 6, minEnv = 15){
# scoreA <- rep(NA, length(CHR)); names(scoreA) <- names(CHR)
# for (P in (d+1):(length(CHR)-d)){
# evalPos <- CHR[P]
# leftFlank <- CHR[(P-d):(P-1)]
# rightFlank <- CHR[(P+1):(P+d)]
# if( min(c(rightFlank, leftFlank)) < minEnv ){next()}
# m1 <- mean(leftFlank)
# m2 <- mean(rightFlank)
# sd1 <- sd(leftFlank)
# sd2 <- sd(rightFlank)
# scoreA[P] <- 1 - (((2*evalPos) + 1) /
# ((0.5*abs(m1-sd1)) + evalPos + (0.5*abs(m2-sd2)) + 1))
# }
# scoreA[scoreA < 0] <- 0
# return(scoreA)
# }
#
#
# # ScoreMeth: Quantifies methylation as a percentage.Based on a weighted ratio to
# # the "d" positions flanking both sides (Birkedal et al., 2015)
# scoreMeth <- function(CHR, d = 6){
# scoreMeth <- rep(0, length(CHR)); names(scoreMeth) <- names(CHR)
# rfWeigth <- seq(1, 0.5, -0.5/(d-1))[1:d]; lfWeigth <- rev(seq(1,0.5,-0.5/(d-1))[1:d])
# srfw <- sum(rfWeigth); slfw <- sum(lfWeigth)
# for(P in (d+1):(length(CHR)-d-1)){
# evalPos <- CHR[P]
# leftFlank <- CHR[(P-d):(P-1)]
# rightFlank <- CHR[(P+1):(P+d)]
# if(sum(leftFlank == 0) > 3 | sum(rightFlank == 0) > 3){next()}
# if(evalPos >= 0 & (sum(rightFlank) > 0 | sum(leftFlank) > 0) ){
# scoreMeth[P] <- 1-(evalPos/(0.5*((sum(lfWeigth*leftFlank)/slfw) + sum(rfWeigth*rightFlank)/srfw)))
# }
# }
# scoreMeth[is.na(scoreMeth)] <- 0
# return(scoreMeth)
# }
#
# #ScoreZ - based on Z.score
# scoreZ <- function(CHR, d = 6){
# scoreZ <- rep(NA, length(CHR)); names(scoreZ) <- names(CHR)
# for (P in (d+1):(length(CHR)-d-1)){
# evalPos= CHR[P]; leftFlank = CHR[(P-d):(P-1)]; rightFlank = CHR[(P+1):(P+d)]
# meanNeigh <- mean(c(leftFlank, rightFlank))
# sdNeigh <- sd(c(leftFlank, rightFlank))
# scoreZ[P] <- (evalPos - meanNeigh) / sdNeigh
# }
# scoreZ <- scoreZ * -1
# return(scoreZ)
# }
#
# # Stops score
# scoreStop <- function(stopReads, readtReads, minCov = 50){
# res <- stopReads/readtReads
# autoFail <- readtReads < minCov
# res[autoFail] <- 0
# res <- res[-1]; res <- c(res, 0); names(res) <- names(stopReads) # Move one up-stream
# res[is.na(res)] <- 0
# res[is.infinite(res)] <- 0
# return(res)
# }
#
# # read coverage file and generate CHR object
# readCov <- function(file, chromosome){
# coverage = read.delim(file, header = F, stringsAsFactors = F)
# CHR = coverage[coverage[,1] == chromosome, 3]
# names(CHR) <- coverage[coverage[,1] == chromosome, 2]
# return(CHR)
# }
#
# # read ALLREADENDS zip file and generate CHR object
# zReadCov <- function(file, chromosome, rColumn){
# coverage = read.delim(gzfile(file), header = T, stringsAsFactors = F)
# CHR = coverage[coverage[,1] == chromosome, rColumn]
# names(CHR) <- coverage[coverage[,1] == chromosome, 2]
# return(CHR)
# }
#
# # Outputs table of confusion, precision, recall, AUC and MCC of predictor variable with binary classification
# modMetrics <- function(RESULTS, successes){
# evalRES <- matrix(NA, nrow = 1, ncol = 9)
# res <- RESULTS; zres <- scale(res)
# MCC <- matthewCorrCoeff(scores = res, successes = successes)
# selThr <- res[which.max(MCC)]; zThr <- zres[which.max(MCC)]
# calls <- as.numeric(res >= selThr)
# known <- rep(0, length(calls)); known[successes] <- 1
# TP <- sum(calls & known)
# FP <- sum(calls) - TP
# TN <- sum(!calls & !known)
# FN <- sum(!calls) - TN
# Prec <- TP/(TP+FP)
# Reca <- TP/(TP+FN)
# M <- (TP * TN - FP * FN) / sqrt((TP+FP)*(TP+FN)*(TN+FP)*(TN+FN))
# AUC <- signif(auc(roc(predictions = res, labels = as.factor(known))), 4)
# eval <- c(TP, TN, FP, FN, Prec, Reca, M, zThr, AUC)
# evalRES[1,] <- eval
# colnames(evalRES) <- c("TP", "TN", "FP", "FN", "Precision", "Recall", "MCC", "Z-threshold", "AUC")
# rownames(evalRES) <- names(RESULTS)
# return(evalRES)
# }
#
# # Matthew Correlation Coefficient
# matthewCorrCoeff <- function(scores, successes){
# MCC <- NULL
# known <- rep(0, length(scores)); known[successes] <- 1
# NAs <- which(is.na(scores))
# scores[is.na(scores)] <- 0
# for (i in 1:length(scores)){
# thr <- scores[i]
# calls <- as.numeric(scores >= thr)
# TP <- sum(calls & known)
# FP <- sum(calls) - TP
# TN <- sum(!calls & !known)
# FN <- sum(!calls) - TN
# MCC[i] <- (TP * TN - FP * FN) / sqrt((TP+FP)*(TP+FN)*(TN+FP)*(TN+FN))
# }
# MCC[is.nan(MCC)] <- 0
# MCC[is.infinite(MCC)] <- 0
# MCC[NAs] <- NA
# return(MCC)
# }
#
# # Paralelized version of Matthew Correlation coefficient calculation
# parMCC <- function(scores, successes, cluster){
# known <- rep(0, length(scores)); known[successes] <- 1
# NAs <- which(is.na(scores))
# scores[is.na(scores)] <- 0
# MCC <- parSapply(cluster, 1:length(scores), function(i){
# thr <- scores[i]
# calls <- as.numeric(scores >= thr)
# TP <- sum(calls & known)
# FP <- sum(calls) - TP
# TN <- sum(!calls & !known)
# FN <- sum(!calls) - TN
# return((TP * TN - FP * FN) / sqrt((TP+FP)*(TP+FN)*(TN+FP)*(TN+FN)))
# })
# MCC[is.nan(MCC)] <- 0
# MCC[is.infinite(MCC)] <- 0
# MCC[NAs] <- NA
# return(MCC)
# }
#
# # Binomial proportional confidence interval. For Alpha = 0.05 z = 1.96
# binError <- function(p, z, n){
# bErr <- z * sqrt((p*(1-p))/n)
# return(bErr)
# }
#
# # Scatter, density function, and ROC curve plots for assesing scores
# modEvalPlots <- function(RESULTS, successes, MCCthr = F, main = "", multiplot = T){
# if(multiplot == T) {par(mfrow=c(1,3))}
# res <- RESULTS
# MCC <- matthewCorrCoeff(res, successes)
# selThr <- res[which.max(MCC)]
# colorvector <- rep("black", length(res)); colorvector[successes] <- "red"
# par(pch = 16); barplot(res, col = colorvector, border = NA)
# abline(h = res[which.max(MCC)],col= "blue")
# plot(ecdf(res[-successes]), main = main, col = "black")
# lines(ecdf(res[successes]), col = "red")
# abline(v = res[which.max(MCC)], col = "blue")
# legend("bottomright", legend = paste("max.MCC =", signif(max(MCC), 4)))
# if(MCCthr){res <- as.numeric(res >= selThr)}
# known <- rep(0, length(res)); known[successes] <- 1
# areaUC <- signif(auc(roc(predictions = res, labels = as.factor(known))), 4)
# plot(roc(predictions = res, labels = as.factor(known)), col = "red")
# legend("bottomright", legend = paste("AUC =", areaUC))
# if(multiplot == T) {par(mfrow=c(1,1))}
# }
# modelEvalPlot <- modEvalPlots #deprecated function name
#
# #Calculate AUC for two vectors (scores, successes)
# modAUC <- function(scores, successes, main = ""){
# known <- as.numeric(successes)
# areaUC <- signif(auc(roc(predictions = scores, labels = as.factor(known))), 4)
# plot(roc(predictions = scores, labels = as.factor(known)), col = "red", main= main)
# legend("bottomright", legend = paste("AUC =", areaUC))
# }
#
# # Fit the fragmentation and signal score with logistic regression
# integrate_scores <- function(RES_A, RES_B, success){
# RESULTS <- list()
# for(i in 1:length(RES_A)){
# res1 <- RES_A[[i]] #Frag Score
# res2 <- RES_B[[i]] # Stop Score
# meth <- rep(0, 1, length(res1)); meth[success] <- 1
# df <- data.frame(METH = meth, FRAG_SCORE = res1, STOP_SCORE = res2)
# model <- glm(METH ~., family=binomial(link='logit'), data = df)
# RESULTS[[i]] <- predict(model, newdata= df ,type='response')
# }
# nick <- sapply(rRNA_rEnds, function(x) gsub(pattern = "_Aligned.out.sorted.allreadEnds.gz", replacement = "", x))
# names(RESULTS) <- nick
# return(RESULTS)
# }
#
# # Fit the fragmentation and signal score with logistic regression
# integrate_3scores <- function(RES_A, RES_B, RES_C, success){
# RESULTS <- list()
# for(i in 1:length(RES_A)){
# res1 <- RES_A[[i]] # Frag Score
# res2 <- RES_B[[i]] # Stop Score
# res3 <- RES_C[[i]] # Z-score
# meth <- rep(0, 1, length(res1)); meth[success] <- 1
# df <- data.frame(METH = meth, FRAG_SCORE = res1, STOP_SCORE = res2, Zscore = res3)
# model <- glm(METH ~., family=binomial(link='logit'), data = df)
# RESULTS[[i]] <- predict(model, newdata= df ,type='response')
# }
# nick <- sapply(rRNA_rEnds, function(x) gsub(pattern = "_Aligned.out.sorted.allreadEnds.gz", replacement = "", x))
# names(RESULTS) <- nick
# return(RESULTS)
# }
#
# # Get Coefficients of integrated model
# getCoef <- function(scoreFr, scoreSt, success){
# meth <- rep(0, 1, length(scoreFr)); meth[success] <- 1
# df <- data.frame(METH = meth, FRAG_SCORE = scoreFr, STOP_SCORE = scoreSt)
# model <- glm(METH ~., family=binomial(link='logit'), data = df)
# return(model$coefficients)
# }
#
# # Fit one score to the logistic regression
# fitOneScore <- function(res, success){
# meth <- rep(0, 1, length(res)); meth[success] <- 1
# df <- data.frame(METH = meth, SCORE = res)
# model <- glm(METH ~., family=binomial(link='logit'), data = df)
# RESULTS <- predict(model, newdata= df ,type='response')
# return(RESULTS)
# }
#
# # Write table with TAB separated files standards
# writeMyTable <- function(X, filename){
# write.table(X, file = filename, sep= "\t", col.names = NA, quote = F)
# }
#
# # Extracting the last n characters from a string in R
# substrRight <- function(x, n){
# substr(x, nchar(x)-n+1, nchar(x))
# }
#
# # Logistic Regression
# logisticReg <- function(interc, var1, cf1, var2, cf2){
# res <- 1/(1 + exp(-(var1*cf1 + var2*cf2 + interc))) # Logistic Regression Formula
# return(res)
# }
#
# # List files in work dir that match pattern pat
# listFilePatt <- function(pattern, path = "."){
# files <- list.files(path)[grep(pattern = pattern, x = list.files(path))]
# return(files)
# }
#
# # Object size
# oSize <- function(x){
# print(object.size(x), units = "auto")
# }
#
# # Define Chr, Start & End of a GENE based on UTR annotations
# defCoor <- function(gene){
# g <- gene
# chr <- geneAnnot[g,1]
# coor <- geneAnnot[g,2:3]
# strand <- geneAnnot[g, 6]
# U3 <- UTR3Annot[UTR3Annot[, "gene"] == g, 2:3]
# U5 <- UTR5Annot[UTR5Annot[, "gene"] == g, 2:3]
# start <- NULL; end <- NULL
# if (strand == "+"){
# if(nrow(U5) > 0){
# start <- U5[1]
# } else {
# start <- coor[1]
# }
# if(nrow(U3) > 0){
# end <- U3[2]
# } else {
# end <- coor[2]
# }
# } else if (strand == "-"){
# if(nrow(U5) > 0){
# start <- U5[2]
# } else {
# start <- coor[2]
# }
# if(nrow(U3) > 0){
# end <- U3[1]
# } else {
# end <- coor[1]
# }
# }
# start <- as.integer(start) ; end <- as.integer(end)
# return(c(chr, start, end, strand))
# }
#
# # Calculate adjusted coefficients
# adjCoeff <- function(F18s, F25s, S18s, S25s, R18s, R25s, medGoal, known){
# adjF <- mean(median(F18s)/medGoal, median(F25s)/medGoal)
# score.frag <- c(scoreRD(round(F18s/adjF)), scoreRD(round(F25s/adjF)))
# score.stop <- c(scoreStop(round(S18s/adjF), round(R18s/adjF)), scoreStop(round(S25s/adjF), round(R25s/adjF)))
# cf <- getCoef(score.frag, score.stop, known)
# res <- logisticReg(cf[1], score.frag, cf[2], score.stop, cf[3])
# maxMCC <- max(matthewCorrCoeff(res, known_RRNA_2Ome))
# modThresh <- res[which.max(matthewCorrCoeff(res, known))]
# modVals <- c(cf, maxMCC, modThresh); names(modVals)[4:5] <- c("maxMCC", "Thresh")
# return(modVals)
# }
#
# # Strand Specific coverage
# strSpCov <- function(covStrand, covOppStrand){
# cov <- as.numeric(covStrand[,3]) - as.numeric(covOppStrand[,3])
# cov[cov < 0] <- 0
# cov <- data.frame(chr= covStrand[,1], coor = covStrand[,2], cov)
# return(cov)
# }
#
# # Smooth y given x,y coordinates
# smoothLine <- function(x, y){
# sl <- smooth.spline(x, y, spar = 0.7)
# y <- sl$y; names(y) <- sl$x
# return(y)
# }
#
# # Turns a character matrix into a dataframe more "smartly"
# asDataFrame <- function(x){
# tmpCol <- colnames(x)
# out <- data.frame(lapply(split(x, col(x)), type.convert, as.is = TRUE),
# stringsAsFactors = FALSE)
# colnames(out) <- tmpCol
# return(out)
# }
#
# # Read rRNA data from rEnd files
# rRNAdata <- function(rEndsFile){
# F18s <- zReadCov(rEndsFile, "RDN18-1", "r3pos")
# F25s <- zReadCov(rEndsFile, "RDN25-1", "r3pos")
# S18s <- zReadCov(rEndsFile, "RDN18-1", "r5pos")
# S25s <- zReadCov(rEndsFile, "RDN25-1", "r5pos")
# R18s <- zReadCov(rEndsFile, "RDN18-1", "Fpos")
# R25s <- zReadCov(rEndsFile, "RDN25-1", "Fpos")
# return(list(F18s = F18s, F25s = F25s, S18s = S18s, S25s = S25s, R18s = R18s, R25s = R25s))
# }
#
# adjCoeff <- function(h_F18s, h_F25s, l_S18s, l_S25s, l_R18s, l_R25s, h_S18s, h_S25s,
# h_R18s, h_R25s, medGoal, known){
# adjF <- mean(median(h_F18s)/medGoal, median(h_F25s)/medGoal)
# score.frag <- c(scoreRD(round(h_F18s/adjF)), scoreRD(round(h_F25s/adjF)))
# score.stop <- c(scoreStop(round(l_S18s/adjF), round(l_R18s/adjF)), scoreStop(round(l_S25s/adjF), round(l_R25s/adjF))) -
# c(scoreStop(round(h_S18s/adjF), round(h_R18s/adjF)), scoreStop(round(h_S25s/adjF), round(h_R25s/adjF)))
# score.stop[score.stop < 0] <- 0
# cf <- getCoef(score.frag, score.stop, known)
# res <- logisticReg(cf[1], score.frag, cf[2], score.stop, cf[3])
# maxMCC <- max(matthewCorrCoeff(res, known_RRNA_2Ome))
# modThresh <- res[which.max(matthewCorrCoeff(res, known))]
# modVals <- c(cf, maxMCC, modThresh); names(modVals)[4:5] <- c("maxMCC", "Thresh")
# return(modVals)
# }
#
# # Results Dataframe
# resDF_Thresh <- function(lDATA, hDATA, stopThresh = 0, fragThresh = 0, fCHThresh = 0, minMedF = 0, evalGenes){
# stopRES <- list()
# for(eGene in evalGenes){
# frGeneL <- lDATA$DATA[[eGene]]$frags
# stGeneL <- lDATA$DATA[[eGene]]$stops
# rtGeneL <- lDATA$DATA[[eGene]]$readt
# frGeneH <- hDATA$DATA[[eGene]]$frags
# stGeneH <- hDATA$DATA[[eGene]]$stops
# rtGeneH <- hDATA$DATA[[eGene]]$readt
# frGeneT <- frGeneL + frGeneH
# medBool <- medNeighBool(frGeneT, minMedF)
# frScore <- scoreA(frGeneT)
# frScore2 <- scoreRD(frGeneT)
# zScore <- scoreZ(frGeneT)
# stScoreL <- scoreStop(stGeneL+1, rtGeneL+1)
# stScoreH <- scoreStop(stGeneH+1, rtGeneH+1)
# fCHstop <- stScoreL/stScoreH
# # Thresholds
# callsGene <- which(stScoreL >= stopThresh & frScore >= fragThresh & fCHstop >= fCHThresh & medBool)
# callsGene <- callsGene[callsGene > 6 & (callsGene <= length(rtGeneL) - 7)]
# chrGene <- geneAnnot[eGene,1]
# strandGene <- geneAnnot[eGene,6]
# if(strandGene == "+"){
# coorsGene <- geneAnnot[eGene,2]:geneAnnot[eGene,3]
# } else if(strandGene == "-"){
# coorsGene <- rev(geneAnnot[eGene,2]:geneAnnot[eGene,3])
# }
# bedGene <- list() # Defining yet another dummy variable...
# if(length(callsGene) > 0){
# for(l in 1:length(callsGene)){
# selRes <- callsGene[l]
# neighF <- frGeneT[(selRes-6):(selRes+6)]
# nS_l <- stGeneL[selRes+1] + 1 # +1 Position Upstream +1 PseudoCount
# nR_l <- rtGeneL[selRes+1] + 1
# nS_h <- stGeneH[selRes+1] + 1
# nR_h <- rtGeneH[selRes+1] + 1
# fisher.pVal <- fisher.test(rbind(c(nR_l,nS_l),c(nR_h,nS_h)), alternative ="l")$p.value
# FCH.stop <- (nS_l/nR_l)/(nS_h/nR_h)
# bedGene[[l]] <- c(chrGene,
# coorsGene[selRes],
# coorsGene[selRes],
# eGene,
# paste(chrGene, coorsGene[selRes], eGene, sep = "."),
# strandGene,
# frScore[selRes],
# frScore2[selRes],
# zScore[selRes],
# stScoreL[selRes],
# stScoreH[selRes],
# FCH.stop,
# fisher.pVal,
# nS_l,
# nR_l,
# nS_h,
# nR_h,
# paste(neighF, collapse = ","))
# }
# stopRES[[eGene]] <- t(sapply(bedGene, c))
# }
# }
# stopRES <- stopRES[!sapply(stopRES, is.null)]
# if(length(stopRES) == 0){
# stop("No results found with specified thresholds!")
# }
# RESMAT <- NULL
# for(m in 1:length(stopRES)){
# RESMAT <- rbind(RESMAT, stopRES[[m]])
# }
# RESMAT <- asDataFrame(RESMAT)
# colnames(RESMAT) <- c("chr", "start", "end", "gene", "id", "strand",
# "scoreA", "scoreRD", "scoreZ", "stopScoreL", "stopScoreH",
# "FCHstop", "p.Value", "stopsL", "readtL", "stopsH", "readtH",
# "fragEnvir")
# return(RESMAT)
# }
#
# # Parallel version of Results Data frame function
# parResDF_Thresh <- function(lDATA, hDATA, stopThresh = 0, fragThresh = 0, fCHThresh = 0, minMedF = 0, evalGenes, cluster, noTresh = F){
# stopRES <- parLapply(cluster, evalGenes, function(eGene){
# frGeneL <- lDATA$DATA[[eGene]]$frags
# stGeneL <- lDATA$DATA[[eGene]]$stops
# rtGeneL <- lDATA$DATA[[eGene]]$readt
# frGeneH <- hDATA$DATA[[eGene]]$frags
# stGeneH <- hDATA$DATA[[eGene]]$stops
# rtGeneH <- hDATA$DATA[[eGene]]$readt
# frGeneT <- frGeneL + frGeneH
# medBool <- medNeighBool(frGeneT, minMedF)
# frScore <- scoreA(frGeneT)
# frScore2 <- scoreRD(frGeneT)
# zScore <- scoreZ(frGeneT)
# stScoreL <- scoreStop(stGeneL+1, rtGeneL+1)
# stScoreH <- scoreStop(stGeneH+1, rtGeneH+1)
# fCHstop <- stScoreL/stScoreH
# # Thresholds
# if(noTresh == F){
# callsGene <- which(stScoreL >= stopThresh & frScore >= fragThresh & fCHstop >= fCHThresh & medBool)
# }else {callsGene <- 1:length(frGeneL)}
# callsGene <- callsGene[callsGene > 6 & (callsGene <= length(rtGeneL) - 7)]
# chrGene <- geneAnnot[eGene,1]
# strandGene <- geneAnnot[eGene,6]
# if(strandGene == "+"){
# coorsGene <- geneAnnot[eGene,2]:geneAnnot[eGene,3]
# } else if(strandGene == "-"){
# coorsGene <- rev(geneAnnot[eGene,2]:geneAnnot[eGene,3])
# }
# bedGene <- list() # Defining yet another dummy variable...
# if(length(callsGene) > 0){
# for(l in 1:length(callsGene)){
# selRes <- callsGene[l]
# neighF <- frGeneT[(selRes-6):(selRes+6)]
# nS_l <- stGeneL[selRes + 1] + 1 # +1 Position Upstream +1 PseudoCount
# nR_l <- rtGeneL[selRes + 1] + 1
# nS_h <- stGeneH[selRes + 1] + 1
# nR_h <- rtGeneH[selRes + 1] + 1
# fisher.pVal <- fisher.test(rbind(c(nR_l - nS_l,nS_l),c(nR_h - nS_h,nS_h)), alternative ="l")$p.value
# if(nR_l > 100 & nR_h > 100){
# FCH.stop <- (nS_l/nR_l)/(nS_h/nR_h)
# }else{FCH.stop <- NA}
# bedGene[[l]] <- c(chrGene,
# coorsGene[selRes],
# coorsGene[selRes],
# eGene,
# paste(chrGene, coorsGene[selRes], eGene, sep = "."),
# strandGene,
# frScore[selRes],
# frScore2[selRes],
# zScore[selRes],
# stScoreL[selRes],
# stScoreH[selRes],
# FCH.stop,
# fisher.pVal,
# nS_l,
# nR_l,
# nS_h,
# nR_h,
# paste(neighF, collapse = ","))
# }
# return(asDataFrame(t(sapply(bedGene, c))))
# }
# })
# stopRES <- stopRES[!sapply(stopRES, is.null)]
# if(length(stopRES) == 0){
# stop("No results found with specified thresholds!")
# }
# RESMAT <- rbindlist(stopRES)
# colnames(RESMAT) <- c("chr", "start", "end", "gene", "id", "strand",
# "scoreA", "scoreRD", "scoreZ", "stopScoreL", "stopScoreH",
# "FCHstop", "p.Value", "stopsL", "readtL", "stopsH", "readtH",
# "fragEnvir")
# return(RESMAT)
# }
#
#
# # Read rRNA data from rEnd files
# rRNAdata <- function(rEndsFile){
# F18s <- zReadCov(file.path(INPUTDIR, rEndsFile), "RDN18-1", "r3pos")
# F25s <- zReadCov(file.path(INPUTDIR, rEndsFile), "RDN25-1", "r3pos")
# S18s <- zReadCov(file.path(INPUTDIR, rEndsFile), "RDN18-1", "r5pos")
# S25s <- zReadCov(file.path(INPUTDIR, rEndsFile), "RDN25-1", "r5pos")
# R18s <- zReadCov(file.path(INPUTDIR, rEndsFile), "RDN18-1", "Fpos")
# R25s <- zReadCov(file.path(INPUTDIR, rEndsFile), "RDN25-1", "Fpos")
# return(list(F18s = F18s, F25s = F25s, S18s = S18s, S25s = S25s, R18s = R18s, R25s = R25s))
# }
#
# # Median of neighboring positions above threshold
# medNeighBool <- function(CHR, minMed, d = 6){
# medN <- rep(F, length(CHR)); names(scoreA) <- names(CHR)
# for (P in (d+1):(length(CHR)-d-1)){
# medN[P] <- median(c(CHR[(P-d):(P-1)], CHR[(P+1):(P+d)])) >= minMed
# }
# return(medN)
# }
#
# # Vector indicating methylation status for known rRNA sites
# knownStatus <- function(res){
# real <- NULL
# for (iter in 1:nrow(res)){
# real[iter] <- paste(res[iter,1:2], collapse = " ") %in% known2Om_coors
# }
# falPos <- NULL
# for (iter in 1:nrow(res)){
# falPos[iter] <- paste(res[iter,1:2], collapse = " ") %in% FP_2Om_coors
# }
# realChr <- as.character(real)
# realChr[real] <- "Known"
# realChr[falPos] <- "FalsePositives"
# realChr[realChr == "FALSE"] <- "Unknown"
# return(as.factor(realChr))
# }
#
# # Sample read Ends with c(P (fail),Q(success)) probabilities of sampling each read
# sampleREnds <- function(rEnds, prob){
# sampRE <- NULL
# for(j in 1:length(rEnds)){
# sampRE[j] <- sum(sample(0:1, rEnds[j],T, prob))
# }
# return(sampRE)
# }
#
# # Sample geneData to desired median depth
# sampDesMed <- function(geneData, desiredMedian){
# suc <- 1 / (median(geneData$frags)/desiredMedian)
# fai <- 1 - suc
# sFrags <- sampleREnds(geneData$frags, prob = c(fai,suc))
# sStops <- sampleREnds(geneData$stops, prob = c(fai,suc))
# sReadt <- as.integer(round(geneData$readt*suc))
# return(list(readt = sReadt, stops = sStops, frags= sFrags))
# }
#
# # Rank of sums of ranks in dataframe X columns (...yeah... bit complicated...)
# rankRanks <- function(x) {
# rRanks <- apply(x, 2, rank, ties.method = "average") %>%
# rowSums() %>% rank(ties.method = "average")
# return(nrow(x) - rRanks + 1)
# }
#
# # Filter Results based on MMC and known rRNA sites
# filtResMMC <- function(res, keep = F, doPlot = T){
# rRNAranks <- (max(res$rank) + 1 ) - res$rank[res$status == "Known" | res$status == "FalsePositives"]
# which2Ome <- which(res$status[res$status == "Known" | res$status == "FalsePositives"] == "Known")
# mcc <- matthewCorrCoeff(rRNAranks, which2Ome)
# lastTrue <- (max(res$rank) +1 ) - min(rRNAranks[which2Ome])
# thr <- rRNAranks[which.max(mcc)]
# thr <- (max(res$rank) +1 ) - thr
# if(doPlot == T){
# plot(res$rank,
# col = res$status,
# main= "Ranking -- Unknown + True/False rRNA 2'-O-meth. sites",
# ylab = "Rank in results")
# abline(c(thr,0), col = "gray50")
# }
# if(keep == F){
# # fRes <- res[res$rank <= thr & res$status == "Unknown",]
# fRes <- res[res$rank <= thr,]
# } else if (keep == T){
# fRes <- res
# fRes$passThr <- "Fail"
# fRes$passThr[res$rank <= thr] <- "Pass"
# fRes$passThr <- as.factor(fRes$passThr)
# # fRes <- fRes[fRes$rank <= lastTrue & fRes$status == "Unknown",]
# fRes <- fRes[fRes$rank <= lastTrue,]
# }
# fRes$rank <- rank(fRes$rank, ties.method = "random")
# fRes <- fRes[order(fRes$rank),]
# return(fRes)
# }
#
# # Generate Random Sites
# randResults <- function(desDepth, randCycles = 1){
# samRES <- NULL
# for (j in 1:randCycles){
# for(i in 1:length(desDepth)){
# sDATA_h <- list(a=sampDesMed(hDATA$DATA$`RDN18-1`, desDepth[i]),
# b=sampDesMed(hDATA$DATA$`RDN25-1`, desDepth[i]))
# sDATA_l <- list(a=sampDesMed(lDATA$DATA$`RDN18-1`, desDepth[i]),
# b=sampDesMed(lDATA$DATA$`RDN25-1`, desDepth[i]))
# names(sDATA_l) <- c("RDN18-1", "RDN25-1"); names(sDATA_h) <- c("RDN18-1", "RDN25-1")
# sDATA_l <- list(DATA = sDATA_l)
# sDATA_h <- list(DATA = sDATA_h)
# sRes <- resDF_Thresh(lDATA = sDATA_l,
# hDATA = sDATA_h,
# stopThresh = 0.01,
# fragThresh = 0.3,
# fCHThresh = 1,
# minMedF = 10,
# evalGenes = tail(g, 2))
# sRes$medFR <- sapply(1:nrow(sRes), function(x) median(as.numeric(unlist(strsplit(sRes$fragEnvir[x], ",")))[-7]))
# sRes$status <- knownStatus(sRes)
# sRes$log10pVal <- -log10(sRes$p.Value)
# samRES <- rbind(samRES, sRes)
# }
# }
# return(samRES)
# }
#
# # Calculate Threshold Parameters
# calcThr <- function(res){
# rRNAranks <- (max(res$rank) + 1 ) - res$rank[res$status == "Known" | res$status == "FalsePositives"]
# which2Ome <- which(res$status[res$status == "Known" | res$status == "FalsePositives"] == "Known")
# mcc <- matthewCorrCoeff(rRNAranks, which2Ome)
# lastTrue <- (max(res$rank) +1 ) - min(rRNAranks[which2Ome])
# thr <- rRNAranks[which.max(mcc)]
# thr <- (max(res$rank) +1 ) - thr
# modPar <- c(max(mcc), thr)
# names(modPar) <- c("maxMCC", "rankThr")
# return(modPar)
# }
#
# # Crate BED file with range of positions for sequence retrieval with BEDTOOLS
# bedRange <- function(bedDF, d){
# data.frame(bedDF[,1], as.integer(bedDF[,3]) - d -1,
# as.integer(bedDF[,3]) + d, bedDF[,5],
# bedDF[,4], bedDF[,6], stringsAsFactors = F)
# }
#
# # Function to perform t tests on groups G1 and G2 by score with an alternative hypothesis
# tTestScores <- function(resList, expG1, expG2, score, tTestAlternative){
# expG1 <- as.integer(expG1)
# expG2 <- as.integer(expG2)
# g1 <- sapply(expG1, function(x) resList[[x]][score])
# g1 <- matrix(unlist(g1), ncol = length(expG1), byrow = F)
# g2 <- sapply(expG2, function(x) resList[[x]][score])
# g2 <- matrix(unlist(g2), ncol = length(expG2), byrow = F)
# tt_score <- NULL
# for(i in 1:nrow(g1)){
# sc1 <- g1[i,]; sc2 <- g2[i,]; tRes <- NA
# if(sum(!is.na(sc1)) >= 2 & sum(!is.na(sc2)) >= 2 ){
# tRes <- try(t.test(sc1, sc2, alternative = tTestAlternative)$p.value, silent = T)
# }
# tt_score[i] <- tRes
# }
# return(tt_score)
# }
#
# # Processing readEnds from rEndsFileName and gene annotation
# rEndsProcess <- function(rEndsFileName, geneAnnot){
# rownames(geneAnnot) <- geneAnnot[,4]
# rEndsFile <- read.delim(gzfile(rEndsFileName), header = T, stringsAsFactors = F)
# covFpos <- rEndsFile[,c("X0", "X1", "Fpos")]
# covFneg <- rEndsFile[,c("X0", "X1", "Fneg")]
# r5pos <- rEndsFile[,c("X0", "X1", "r5pos")]
# r5neg <- rEndsFile[,c("X0", "X1", "r5neg")]
# r3pos <- rEndsFile[,c("X0", "X1", "r3pos")]
# r3neg <- rEndsFile[,c("X0", "X1", "r3neg")]
# GENES <- geneAnnot[,4]
# chrs <- unique(r5pos[,1])
# coors <- sapply(chrs, function(x) which(r5pos[,1] == x) %>% min()) - 1
# DATA <- lapply(GENES, function(gene){
# geneCoors <- c(geneAnnot[gene, 2], geneAnnot[gene, 3]) + coors[geneAnnot[gene, 1]]
# if (geneAnnot[gene, 6] == "+"){
# sta <- min(geneCoors)
# end <- max(geneCoors)
# readt <- covFpos[sta:end, 3] #Read-through
# stops <- r5pos[sta:end, 3] #Reverse-Stops
# frags <- r3pos[sta:end, 3] #Fragmentation
# } else if (geneAnnot[gene, 6] == "-"){
# sta <- max(geneCoors)
# end <- min(geneCoors)
# readt <- covFneg[sta:end, 3]
# stops <- r5neg[sta:end, 3]
# frags <- r3neg[sta:end, 3]
# }
# OUT <- list(readt, stops, frags)
# names(OUT) <- c("readt", "stops", "frags")
# return(OUT)
# }
# )
# names(DATA) <- GENES
# medRT <- sapply(DATA, function(x) {
# res <- median(x[[1]])
# return(res)
# }
# )
# medST <- sapply(DATA, function(x) {
# res <- median(x[[2]])
# return(res)
# }
# )
# medFR <- sapply(DATA, function(x) {
# res <- median(x[[3]])
# return(res)
# }
# )
# ALLDATA <- list(DATA = DATA, medRT = medRT, medST = medST, medFR = medFR)
# return(ALLDATA)
# }
#
# # Read a BED-6 file and transform base-0 coordinates to base-1
# readBED <- function(fileName, idAsRowNames = T){
# tmp <- read.delim(fileName, header = F, stringsAsFactors = F)
# if(ncol(tmp) == 6){
# tmp[,2] <- tmp[,2] + 1
# colnames(tmp) <- bed6_header
# }else if(ncol(tmp) == 12){
# tmp[,2] <- tmp[,2] + 1
# tmp[,7] <- tmp[,2] + 1
# colnames(tmp) <- bed12_header
# }else{stop("Number of columns in file is neither 6 or 12")}
# if(idAsRowNames){rownames(tmp) <- tmp$name}
# return(tmp)
# }
#
# # Function write bed file (Input = base1, output = base-0)
# writeBED <- function(bed, filename){
# if(ncol(bed) == 6){
# bed[,2] <- bed[,2] - 1
# }else if(ncol(bed) == 12){
# bed[,2] <- bed[,2] - 1
# bed[,7] <- bed[,7] - 1
# }else{
# stop("BED is neither BED-6 or BED-12 column length")
# }
# write.table(bed, filename, sep= "\t", col.names = F, row.names = F, quote = F)
# }
#
#
# # Crate BED file with range of positions for sequence retrieval with BEDTOOLS
# bedRange <- function(bedDF, d){
# cbind(as.character(bedDF[,1]), as.integer(bedDF[,3]) - d -1,
# as.integer(bedDF[,3]) + d,
# as.character(bedDF[,5]),
# as.character(bedDF[,4]),
# as.character(bedDF[,6]))
# }
#
# # Some logical functions
# great <- function(x, Than, orEqual = F, value = F){
# if(value){
# if(orEqual){x[x >= Than]}else if(!orEqual){x[x > Than]}
# }else if(!value){
# if(orEqual){x >= Than}else if(!orEqual){x > Than}
# }
# }
#
# less <- function(x, Than, orEqual = F, value = F){
# if(value){
# if(orEqual){x[x <= Than]}else if(!orEqual){x[x < Than]}
# }else if(!value){
# if(orEqual){x <= Than}else if(!orEqual){x < Than}
# }
# }
#
#
# equal <- function(x,y, value = F){
# if(value){x[x == y]}else if(!value){x == y}
# }
#
# # Function to go up along a vector until finding lower values, with a threshold of fails going up
# stepWiseUp <- function(x, failThr, start = 2){
# best <- start
# fails <- 0
# for(i in start:length(x)){
# if(fails > failThr){break()}
# if(x[i] > x[i-1]){
# best <- i
# fails <- 0
# }else{fails <- fails + 1}
# }
# return(best)
# }
#
# # Genomic to transcriptomic coordinates based on gene annotation (Bed Format - base1)
# genCoor_2_transCorr <- function(bedLikeRow, geneAnnot){
# tmpGA <- geneAnnot[geneAnnot$chr == as.character(bedLikeRow[1,1]) &
# geneAnnot$strand == bedLikeRow[1,6] &
# geneAnnot$start <= as.numeric(bedLikeRow[1,3]) &
# geneAnnot$end >= as.numeric(bedLikeRow[1,3]),]
# lGenes <- lapply(tmpGA$name, function(x) exonBlockGen(x, geneAnnot))
# names(lGenes) <- tmpGA$name
# tmpGA$relCoor <- sapply(lGenes, function(x) match(as.numeric(bedLikeRow[1,3]), x))
# return(na.omit(tmpGA[,c("name", "relCoor")]))
# }
#
# genCoor_2_transCorr_V2 <- function(bedLikeRow, geneAnnot){
# tmpGA <- geneAnnot[geneAnnot$chr == as.character(bedLikeRow[1]) &
# geneAnnot$strand == bedLikeRow[6] &
# geneAnnot$start <= as.numeric(bedLikeRow[3]) &
# geneAnnot$end >= as.numeric(bedLikeRow[3]),]
# lGenes <- lapply(tmpGA$name, function(x) exonBlockGen(x, geneAnnot))
# names(lGenes) <- tmpGA$name
# tmpGA$relCoor <- sapply(lGenes, function(x) match(as.numeric(bedLikeRow[3]), x))
# return(na.omit(tmpGA[,c("name", "relCoor")]))
# }
#
# # Transcriptomic to genomic coordinates - parallelized
# transCoor_2_genCoor <- function(cl, relCoors, geneAnnot){
# res <- parSapply(cl, 1:nrow(relCoors), function(i){
# id <- paste(relCoors[i, ], collapse = ".")
# gene <- relCoors[i,1]
# relPos <- relCoors[i,2] %>% as.numeric()
# chr <- geneAnnot[gene,]$chr
# strand <- geneAnnot[gene,]$strand
# if(strand == "+"){
# genCoor <- (geneAnnot[gene,]$start:geneAnnot[gene,]$end)[relPos]
# }else if(strand == "-"){
# genCoor <- (geneAnnot[gene,]$end:geneAnnot[gene,]$start)[relPos]
# }
# return(c(chr, genCoor, genCoor, id, 0, strand))
# }) %>% t()
# return(res)
# }
#
# # Function to change the range of numeric values from 0 to 1 - Feature Scaling
# range0_1 <- function(x){
# z <- (x - min(x, na.rm = T)) / (max(x, na.rm = T) - min(x, na.rm = T))
# return(z)
# }
#
# # Standardized Mean Difference (2 groups - unbiased)
# smd <- function(g1, g2){
# meanDiff <- mean(g1) - mean(g2)
# SDwithin <- sqrt(((length(g1)-1)*sd(g1)^2 + (length(g2)-1)*sd(g2)^2)/(length(g1)+length(g2)-2))
# smd <- meanDiff/SDwithin
# return(smd)
# }
#
# # Function for plotting a 2 axis line plot with baseR
# # Based on https://stackoverflow.com/questions/6142944/how-can-i-plot-with-2-different-y-axes#6143251
# secAxisPlot <- function(xAxis, yAxis, yAxis2, xlab = "", ylab = "", y2lab = "", main = "", secCol= 2){
# ## add extra space to right margin of plot within frame
# par(mar=c(5, 5, 4, 6) + 0.1)
# ## Plot first set of data and draw its axis
# plot(xAxis, yAxis, pch = 16, axes = FALSE, ylim = range(yAxis), xlab= "", ylab = "",
# type = "b", col = "black", main = main)
# axis(2, ylim = range(yAxis), col = "black", las=1) ## las=1 makes horizontal labels
# mtext(ylab, side = 2, line = 4)
# box()
# ## Allow a second plot on the same graph
# par(new = TRUE)
# ## Plot the second plot and put axis scale on right
# plot(xAxis, yAxis2, pch=15, xlab="", ylab = "", ylim = range(yAxis2),
# axes= FALSE, type="b", col= secCol)
# ## a little farther out (line=4) to make room for labels
# mtext(y2lab, side = 4, col = secCol, line=4)
# axis(4, ylim = range(yAxis2), col = secCol, col.axis= secCol, las=1)
# ## Draw the xAxis axis
# axis(1, pretty(range(xAxis), 10))
# mtext(xlab, side = 1, col = "black", line = 2.5)
# }
#
# #GC content
# GC_content <- function(sequence){
# tmp <- strsplit(sequence, "") %>% unlist()
# (tmp == "G" | tmp == "C") %>% sum() / length(tmp)
# }
#
# #Mode function by Nick https://stackoverflow.com/users/5222/nick
# Mode <- function(x) {
# ux <- unique(x)
# ux[which.max(tabulate(match(x, ux)))]
# }
#
# # Nucleotide Binary Matrix
# nucBinMat <- function(tmp){
# binMat <- matrix(0, nrow = length(tmp), ncol = 5)
# colnames(binMat) <- c("A", "C", "G", "T", "N")
# for(i in 1:length(tmp)){
# binMat[i, tmp[i]] <- 1
# }
# return(binMat[,1:4])
# }
#
# # t.Test and FCH contrast between two groups (matrix: columns = sample, rows = 'genes')
# tTest_lFCH <- function(g1Dat, g2Dat, na.rm = T){
# # NAs filter (min 2 samples by group)
# keep <- rowSums(!is.na(g1Dat)) >= 2 & rowSums(!is.na(g2Dat)) >=2 ; table(keep)
# g1Dat <- g1Dat[keep,]
# g2Dat <- g2Dat[keep,]
# g1_mu <- rowMeans(g1Dat, na.rm = na.rm)
# g2_mu <- rowMeans(g2Dat, na.rm = na.rm)
# pV <- sapply(1:nrow(g1Dat), function(i) {
# tTest <- try(t.test(g1Dat[i,], g2Dat[i,]), silent = T)
# if(class(tTest) == "try-error"){
# return(NA)
# }else return(tTest$p.value)
# })
# logFCH <- log2(g1_mu/g2_mu)
# meanDiff <- g1_mu - g2_mu
# tmp_RES <- data.frame(g1Mean = g1_mu, g2Mean= g2_mu, logFCH = logFCH, pVal = pV, meanDiff = meanDiff)
# return(tmp_RES)
# }
#
# #Merge two data frames by rownames and renames rows
# mergebyRowNames <- function(x, y, keepAll = F){
# tmp <- merge(x, y, by = "row.names", all = keepAll)
# rownames(tmp) <- tmp[,1]
# tmp <- tmp[,-1]
# return(tmp)
# }
#
# # Merge by rownames using dplyr
# fullJoinByRowNames <- function(x, y){
# x$rowNames <- rownames(x)
# y$rowNames <- rownames(y)
# tmpO <- dplyr::full_join(x, y, by = "rowNames")
# rownames(tmpO) <- tmpO$rowNames
# return(tmpO[, !colnames(tmpO) == "rowNames"])
# }
#
# leftJoinByRowNames <- function(x, y, keepAll = F){
# x$rowNames <- rownames(x)
# y$rowNames <- rownames(y)
# tmpO <- dplyr::left_join(x, y, by = "rowNames")
# rownames(tmpO) <- tmpO$rowNames
# return(tmpO[,!colnames(tmpO) == "rowNames"])
# }
#
#
# # Read a BEDPE file and transform base-0 coordinates to base-1
# readBEDPE <- function(fileName, idAsRowNames = T){
# tmp <- read.delim(fileName, header = F, stringsAsFactors = F)
# colnames(tmp) <- c("chr1", "start1", "end1", "chr2", "start2", "end2", "name", "score", "strand1", "strand2")
# tmp$start1 <- tmp$start1 + 1
# tmp$start2 <- tmp$start2 + 1
# if(idAsRowNames){rownames(tmp) <- tmp$name}
# return(tmp)
# }
#
# # Read BedPE file for getting read counts (strand specific)
# readBEDPE3 <- function(fileName){
# tmp <- read.delim(fileName, header = F, stringsAsFactors = F)
# colnames(tmp) <- c("chr", "start_r1", "end_r1", "start_r2", "end_r2", "strand")
# tmp$start_r1 <- tmp$start_r1 + 1
# tmp$start_r2 <- tmp$start_r2 + 1
# return(tmp)
# }
#
# # Read a custom BEDPE file and transform base-0 coordinates to base-1
# #readBEDPE2 <- function(fileName){
# # tmp <- read.delim(fileName, header = F, stringsAsFactors = F)
# # colnames(tmp) <- c("chr1", "start1", "end2", "times")
# # tmp$start1 <- tmp$start1 + 1
# # return(tmp)
# #}
#
# # Generate exon coordinates block
# exonBlockGen <- function(iGene, geneAnnot){
# iStart <- geneAnnot[iGene,]$start
# iEnd <- geneAnnot[iGene,]$end
# iStrand <- geneAnnot[iGene,]$strand
# tmpA <- geneAnnot[iGene,]$blockStarts %>% strsplit(split = ",") %>% unlist() %>% as.numeric()
# tmpB <- geneAnnot[iGene,]$blockSizes %>% strsplit(split = ",") %>% unlist() %>% as.numeric()
# iBlocks <- sapply(1:length(tmpA), function(i){
# c(tmpA[i],(tmpA[i] + tmpB[i] -1))
# }) %>% t
# iBlocks <- iStart + iBlocks
# if(iEnd %in% as.vector(iBlocks)){
# if(iStrand == "+"){
# sapply(1:nrow(iBlocks), function(k){
# iBlocks[k,1]:iBlocks[k,2]
# }) %>% unlist %>% as.numeric
# }else if(iStrand == "-"){
# sapply(1:nrow(iBlocks), function(k){
# iBlocks[k,1]:iBlocks[k,2]
# }) %>% unlist %>% rev %>% as.numeric
# }
# }else{stop(paste("Malformed exon structure at gene", iGene))}
# }
#
# # Function paired end BEDfile to transcriptome
# pairEndReadsToGenes <- function(parCluster, bedPEGzFile, gAnnot){
# inputFile <- readBEDPE3(gzfile(bedPEGzFile))
# commChrom <- intersect(unique(gAnnot$chr), unique(inputFile$chr))
# genPReads <- parLapply(parCluster, commChrom, function(iChr){
# tmpIF <- subset(inputFile, chr == iChr)
# tmpGA <- subset(gAnnot, chr == iChr)
# tmpO <- lapply(tmpGA$name, function(iGene){
# exBlock <- exonBlockGen(iGene, geneAnnot)
# if(geneAnnot[iGene, "strand"] == "+"){
# tmpDF <- subset(tmpIF, strand == "+" & start_r1 %in% exBlock & end_r2 %in% exBlock)
# tmpDF$start <- match(tmpDF$start_r1, exBlock)
# tmpDF$end <- match(tmpDF$end_r2, exBlock)
# tmpDF <- tmpDF[,c("chr", "start", "end")]
# }else if(geneAnnot[iGene, "strand"] == "-"){
# tmpDF <- subset(tmpIF, strand == "-" & start_r2 %in% exBlock & end_r1 %in% exBlock)
# tmpDF$start <- match(tmpDF$end_r1, exBlock)
# tmpDF$end <- match(tmpDF$start_r2, exBlock)
# tmpDF <- tmpDF[,c("chr", "start", "end")]
# }
# tmpDF <- ddply(tmpDF,.(chr, start, end), nrow)
# if(ncol(tmpDF) == 4){
# colnames(tmpDF)[4] <- "times"
# tmpDF$len <- tmpDF$end - tmpDF$start
# }
# rownames(tmpDF) <- NULL
# return(tmpDF)
# })
# names(tmpO) <- tmpGA$name
# return(tmpO)
# })
# # Flatten list
# tmp <- do.call(c, genPReads)
# genPReads <- tmp
# return(genPReads)
# }
#
# # RowMeans by column groups
# rowMeansColG <- function(DF, colGroups, na.rm = T){
# cG <- unique(colGroups)
# out <- sapply(cG, function(x){
# rowMeans(DF[,colGroups == x], na.rm = na.rm)
# }) %>% as.data.frame()
# colnames(out) <- cG
# rownames(out) <- rownames(DF)
# return(out)
# }
#
# # Extract countData from DATA object
# getCountData <- function(cluster, dat, GENES, geneAnnot, minInserts, maxInsLen){
# rawData <- parLapply(cluster, GENES, function(iGene) {
# iStrand <- geneAnnot[iGene, "strand"]
# tmpD <- dat[[iGene]]
# tmpD <- subset(tmpD, tmpD$len <= maxInsLen)
# if(nrow(tmpD) == 0){
# tmpRE <- list(rE5 = NA, rE3 = NA, cov = NA)
# }else if(sum(tmpD$times) < minInserts){
# tmpRE <- list(rE5 = NA, rE3 = NA, cov = NA)
# }else{
# tmpRE_5 <- tmpRE_3 <- tmpRE_c <- rep(0L, length(exonBlockGen(iGene, geneAnnot)))
# for(k in 1:nrow(tmpD)){
# tmpRE_5[tmpD[k,]$start] <- tmpRE_5[tmpD[k,]$start] + tmpD[k,]$times
# tmpRE_3[tmpD[k,]$end] <- tmpRE_3[tmpD[k,]$end] + tmpD[k,]$times
# tmpRE_c[tmpD[k,]$start:tmpD[k,]$end] <- tmpRE_c[tmpD[k,]$start:tmpD[k,]$end] + tmpD[k,]$times
# }
# tmpRE <- list(rE5 = tmpRE_5, rE3 = tmpRE_3, cov = tmpRE_c)
# }
# return(tmpRE)
# })
# return(rawData)
# }
#
#
# # Load/install CRAN packages
# installLoad_CRAN <- function(package){
# if (!require(package, character.only = T)) {
# install.packages(package, dependencies = TRUE,
# repos = "http://cran.us.r-project.org")
# library(package, character.only = T, quietly = T)
# }
# }
#
# # Load/install Bioconductor packages
# installLoad_BioC <- function(package){
# if (!require(package, character.only = T)) {
# if(version$minor %>% as.numeric() >= 3.5){
# if (!requireNamespace("BiocManager", quietly = TRUE)){
# installLoad_CRAN("BiocManager")
# }
# BiocManager::install(package)
# library(package, character.only = T, quietly = T)
# }
# if(version$minor %>% as.numeric() < 3.5){
# source("https://bioconductor.org/biocLite.R")
# biocLite(package, ask = F)
# library(package, character.only = T, quietly = T)
# }
# }
# }
#
# # Number of inserts in sample
# insertsNum <- function(pairEndReads){
# sapply(pairEndReads, function(x){
# sum(x$times)
# })
# }
#
# # Extract rawData from DATA object
# getRawData <- function(dat, GENES, geneAnnot, motifOme, maxL){
# rawData <- lapply(GENES, function(iGene) {
# iStrand <- geneAnnot[iGene, "strand"]
# tmpD <- dat[[iGene]]
# iCoors <- motifOme[[iGene]]
# tmpD$len <- abs(tmpD$start - tmpD$end)
# tmpD <- tmpD[tmpD[,"len"] <= maxL,]
# if(nrow(tmpD) == 0){
# tmpRE <- cbind(NA, NA, NA, NA, NA) %>% as.matrix()
# }else if(sum(tmpD$times) < minInserts){
# tmpRE <- cbind(NA, NA, NA, NA, NA) %>% as.matrix()
# }else{
# tmpRE_5 <- tmpRE_3 <- tmpRE_c <- rep(0L, geneAnnot[iGene,]$seqs %>% nchar)
# if(iStrand == "-"){
# for(k in 1:nrow(tmpD)){
# tmpRE_5[tmpD[k,]$end] <- tmpRE_5[tmpD[k,]$end] + tmpD[k,]$times
# tmpRE_3[tmpD[k,]$start] <- tmpRE_3[tmpD[k,]$start] + tmpD[k,]$times
# tmpRE_c[tmpD[k,]$end:tmpD[k,]$start] <- tmpRE_c[tmpD[k,]$end:tmpD[k,]$start] + tmpD[k,]$times
# }
# }else if(iStrand == "+"){
# for(k in 1:nrow(tmpD)){
# tmpRE_5[tmpD[k,]$start] <- tmpRE_5[tmpD[k,]$start] + tmpD[k,]$times
# tmpRE_3[tmpD[k,]$end] <- tmpRE_3[tmpD[k,]$end] + tmpD[k,]$times
# tmpRE_c[tmpD[k,]$start:tmpD[k,]$end] <- tmpRE_c[tmpD[k,]$start:tmpD[k,]$end] + tmpD[k,]$times
# }
# }else{stop("No valid strand specified in gene annotation")
# }
# rowN <- paste(iGene, iCoors, sep = "_")
# tmpRE <- cbind(rowN, tmpRE_5[iCoors], tmpRE_3[iCoors-1], tmpRE_c[iCoors], tmpRE_c[iCoors -1]) %>% as.matrix()
# colnames(tmpRE) <- c("coor", "rE5","rE3", "rT5", "rT3")
# }
# return(tmpRE)
# })
# rawData <- do.call("rbind", rawData) %>% na.omit() %>% asDataFrame()
# return(rawData)
# }
#
# # Get all the relative coordinates for all genes using a BED-like matrix row
# getRelPos <- function(BEDrow){
# tmp <- BEDrow
# iChr <- tmp$chr
# iStart <- tmp$start
# iStrand <- tmp$strand
# tmpA <- subset(geneAnnot, chr == iChr & strand == iStrand & start <= iStart & end >= iStart)
# tmpCoor <- sapply(tmpA$name, function(x){
# tmp <- which(exonBlocks[[x]] == iStart)
# if(!as.logical(length(tmp))){
# tmp <- NA
# }
# return(tmp)
# }) %>% unlist
# paste0(tmpA$name,"_", tmpCoor)[!is.na(tmpCoor)]
# }
#
# # From relative to genomic coordinate
# relToGenCoors <- function(relCoor, genAnnot = geneAnnot){
# splt <- unlist(strsplit(relCoor, split = "_"))
# gene <- splt[1] %>% as.character()
# genCoor <- exonBlockGen(gene, genAnnot)[as.numeric(splt[2])]
# return(c(genAnnot[gene,]$chr, genCoor, genCoor, relCoor, 0, genAnnot[gene,]$strand))
# }
#
# # Extract window of sequences from gene sequence vector
# extracSeq <- function(relCoor, window = 10, fullSeqs = txOmeSEQS){
# tmp <- unlist(strsplit(relCoor, split = "_"))
# iGene <- tmp[1]
# iCoor <- as.numeric(tmp[2])
# substr(fullSeqs[iGene], start = iCoor - window, stop = iCoor + window)
# }
#
# # raw cleavage efficiencies in motif (motifOme object required)
# rawClvEff_motif <- function(countDataFile, RTthr = 15, motifOme = motifOme){
# load(countDataFile)
# tmpSel <- sapply(countData, function(x) length(x$cov)) > 1
# countData <- countData[tmpSel]
# genesI <- intersect(names(countData), names(motifOme))
# rawClvEff <- lapply(genesI, function(geneI){
# tmp <- data.frame(coorNames = paste0(geneI,"_", motifOme[[geneI]]),
# rE5 = (countData[[geneI]]$rE5)[motifOme[[geneI]]],
# rT5 = (countData[[geneI]]$cov)[motifOme[[geneI]]],
# rE3 = (countData[[geneI]]$rE3)[motifOme[[geneI]] -1],
# rT3 = (countData[[geneI]]$cov)[motifOme[[geneI]] -1], stringsAsFactors = F)
# tmp$clvEff_5 <- tmp$rE5 / tmp$rT5
# tmp$clvEff_3 <- tmp$rE3 / tmp$rT3
# tmp$clvEff_5[tmp$rT5 < RTthr] <- NA
# tmp$clvEff_3[tmp$rT3 < RTthr] <- NA
# out <- tmp[(is.na(tmp[,c("clvEff_5", "clvEff_3")]) %>% rowSums()) < 2,]
# return(out)
# })
# rawClvEff <- do.call(rbind, rawClvEff)
# rawClvEff <- merge(allSites, rawClvEff)
# return(rawClvEff)
# }
#
# # Optimal distance analysis
# optiMotifDist <- function(rawCE, distLimit){
# TMP_both <- sapply(1:distLimit, function(thr){
# TMPdata <- cbind(rawCE$clvEff_5[rawCE$doS_motifDist >= thr & rawCE$upS_motifDist >= thr],
# rawCE$clvEff_3[rawCE$doS_motifDist >= thr & rawCE$upS_motifDist >= thr]) %>% na.omit
# c(cor(TMPdata[,1], TMPdata[,2]), nrow(TMPdata))
# })
#
# minBoD <- stepWiseUp(TMP_both[1,], failThr = 1, start = 5)
# TMP_up <- sapply(1:distLimit, function(thr){
# TMP_up <- cbind(rawCE$clvEff_5[rawCE$doS_motifDist >= minBoD & rawCE$upS_motifDist >= thr],
# rawCE$clvEff_3[rawCE$doS_motifDist >= minBoD & rawCE$upS_motifDist >= thr]) %>% na.omit
# c(cor(TMP_up[,1], TMP_up[,2]), nrow(TMP_up))
# })
# TMP_do <- sapply(1:distLimit, function(thr){
# TMP_do <- cbind(rawCE$clvEff_5[rawCE$doS_motifDist >= thr & rawCE$upS_motifDist >= minBoD],
# rawCE$clvEff_3[rawCE$doS_motifDist >= thr & rawCE$upS_motifDist >= minBoD]) %>% na.omit
# c(cor(TMP_do[,1], TMP_do[,2]), nrow(TMP_do))
# })
# minUpD <- stepWiseUp(TMP_up[1,], failThr = 1, start = 10)
# minDoD <- stepWiseUp(TMP_do[1,], failThr = 1, start = 10)
# tmp <- c(minBoD, minUpD, minDoD); names(tmp) <- c("minBoD", "minUpD", "minDoD")
# return(list(boD = TMP_both, upD = TMP_up, doD = TMP_do, minD = tmp))
# }
#
# # Plotting optimal distance analyisis
# plot_OptimMotifDist <- function(opMoDi, smpName){
# palette(brewer.pal(9, "Set1")) #RColorBrewer
# par(mfrow=c(1,3))
# distLimit <- ncol(opMoDi$boD)
# secAxisPlot(1:distLimit, opMoDi$boD[2,], opMoDi$boD[1,],
# xlab = "Distance to ACA",
# ylab = "n ACA sites eval",
# y2lab = "Pearson Correlation",
# main = "P. Corr ~ ACA dist.-BothS")
# abline(v = opMoDi$minD["minBoD"], col = "red")
# abline(h = opMoDi$boD[1, opMoDi$minD["minBoD"]], col = "red")
#
# secAxisPlot(1:distLimit, opMoDi$upD[2,], opMoDi$upD[1,],
# xlab = "Distance to ACA",
# ylab = "n ACA sites eval",
# y2lab = "Pearson Correlation",
# main = "P. Corr ~ ACA dist.-UpS")
# title(sub = smpName)
# abline(v = opMoDi$minD["minUpD"], col = "red")
# abline(h = opMoDi$upD[1, opMoDi$minD["minUpD"]], col = "red")
#
# secAxisPlot(1:distLimit, opMoDi$doD[2,], opMoDi$doD[1,],
# xlab = "Distance to ACA",
# ylab = "n ACA sites eval",
# y2lab = "Pearson Correlation",
# main = "P. Corr ~ ACA dist.-DownS")
# abline(v = opMoDi$minD["minDoD"], col = "red")
# abline(h = opMoDi$doD[1, opMoDi$minD["minDoD"]], col = "red")
# par(mfrow=c(1,1))
# }
#
# # Calculating average cleavage efficiency using rawClvEff_motif output and
# # downStream and upStream thresholds
# avgCleavEff <- function(rawClvEff, doS_thr, upS_thr){
# tmp <- rawClvEff
# tmp$clvEff_5[tmp$doS_motifDist < doS_thr] <- NA
# tmp$clvEff_3[tmp$upS_motifDist < upS_thr] <- NA
# tmp$avgClvEff <- rowMeans(tmp[,c("clvEff_5", "clvEff_3")], na.rm = T)
# tmp$avgClvEff[is.na(tmp$avgClvEff)] <- NA
# out <- rawClvEff
# out$avgClvEff <- tmp$avgClvEff
# return(out)
# }
#
# # Cleavage motif frequency (based on first 'N' nucleotides)
# clvNucFreq <- function(cluster, countFile, geneSeqs, nNucs){
# load(countFile)
# triNucs <- permutations(n = 4,r = nNucs, v = c("A", "T", "C", "G"),
# repeats.allowed = T) %>% apply(MARGIN = 1, function(x){paste(x, collapse = "")})
# iGenes <- intersect(names(countData)[sapply(countData, function(x) sum(x$rE3)>0)], names(geneSeqs))
# nucsFreq_5 <- parSapply(cluster, iGenes, function(iG){
# test <- sapply(which(countData[[iG]]$rE5 > 0), function(i){
# c(substring(text = geneSeqs[iG], first = i, last = i+nNucs-1), countData[[iG]]$rE5[i])
# }) %>% t %>% asDataFrame()
# if(is.null(names(test))){
# test <- data.frame(a= NA, b = NA)
# }
# names(test) <- c("nucs", "times")
# sapply(triNucs, function(x){
# sum(subset(test, nucs == x)$times)
# })
# }) %>% rowSums()
# nucsFreq_3 <- parSapply(cluster, iGenes, function(iG){
# test <- sapply(which(countData[[iG]]$rE3 > 0), function(i){
# c(substring(text = geneSeqs[iG], first = i+1, last = i+nNucs), countData[[iG]]$rE3[i])
# }) %>% t %>% asDataFrame()
# if(is.null(names(test))){
# test <- data.frame(a= NA, b = NA)
# }
# names(test) <- c("nucs", "times")
# sapply(triNucs, function(x){
# sum(subset(test, nucs == x)$times)
# })
# }) %>% rowSums()
# list(nucsFreq_5= nucsFreq_5, nucsFreq_3 = nucsFreq_3)
# }
#
# # Which positions are top N
# whichTopN <- function(x, N){
# names(x) <- 1:length(x)
# as.numeric(names(tail(sort(x), N)))
# }
# # Which positions are bottom N
# whichBottomN <- function(x, N){
# names(x) <- 1:length(x)
# as.numeric(names(head(sort(x), N)))
# }
# # Omit infinite values
# inf.omit <- function(x){
# x[!is.infinite(x)]
# }
#
# # Keep rows and column names when using normalize.quantiles()
# normalize.quantiles.keepNames <- function(x){
# x[is.na(x)] <- NA
# require(preprocessCore)
# rN <- rownames(x)
# cN <- colnames(x)
# tmpO <- data.frame(normalize.quantiles(as.matrix(x)))
# rownames(tmpO) <- rN; colnames(tmpO) <- cN
# return(tmpO)
# }
#
# # Clear temporary variables
# clearTmp <- function(){rm(list = grep("tmp", ls(), value = T)); gc(verbose = T)}
#
# # Get model data
# getModeldata <- function(geneAnnot, txOmeSEQS, relCoors){
# cl <- makeCluster(nCores, type ="FORK")
# tmpCoors <- parSapply(cl, relCoors, function(x) relToGenCoors(x, geneAnnot)) %>% t %>% asDataFrame()
# stopCluster(cl)
# seqData <- data.frame(stringsAsFactors = F, tmpCoors)
# colnames(seqData) <- c("chr", "start", "end", "id", "score", "strand")
# class(seqData$start) <- "numeric"
# class(seqData$end) <- "numeric"
# seqData <- seqData[!duplicated(seqData),]
# rownames(seqData) <- seqData$id
# all_relPos <- sapply(rownames(seqData), function(x) strsplit(x, split = "_")) %>% unlist %>% matrix(ncol = 2, byrow = T)
# wind <- 30
# cl <- makeCluster(nCores, type ="FORK")
# seqData$sequence <- parSapply(cl, 1:nrow(all_relPos), function(i){
# iGene <- all_relPos[i, 1]
# iPos <- all_relPos[i, 2] %>% as.numeric()
# return(txOmeSEQS[[iGene]] %>% substring(first = iPos - wind, last = iPos + wind))
# })
# stopCluster(cl)
# nick <- tempfile()
# writeLines(seqData$sequence, paste0(nick, ".seqs"))
# # Free energy estimation (ViennaRNA)
# comm2 <- paste0("/apps/RH7U2/gnu/ViennaRNA/2.3.1-nogsl/bin/RNAfold -i ", nick,
# ".seqs --noconv --noPS -T 30 > ", nick, ".fold.out.fold")
# system(comm2)
# seqFold <- readLines(paste0(nick, ".fold.out.fold"))
# tmp <- seqFold[1:length(seqFold) %% 2 == 0]
# sep <- grep(tmp[1] %>% strsplit(split = "") %>% unlist, pattern = " ")[1]
# cl <- makeCluster(nCores, type ="FORK")
# seqData$fEnergy <- parSapply(cl, 1:length(tmp), function(i) {
# substring(tmp[i], first = sep+1) %>%
# gsub(pattern = "\\(", replacement = "") %>%
# gsub(pattern = "\\)", replacement = "") %>%
# as.numeric()
# })
# stopCluster(cl)
# dummy <- file.remove(paste0(nick, ".fold.out.fold"))
#
# #GC content
# cl <- makeCluster(nCores, type ="FORK")
# seqData$GCcont <- parSapply(cl, seqData$sequence, GC_content)
# stopCluster(cl)
#
# # Relative position in gene
# cl <- makeCluster(nCores, type ="FORK")
# seqData$relPosRat <- parSapply(cl, 1:nrow(all_relPos), function(i){
# gene <- all_relPos[i, 1]
# relPos <- all_relPos[i, 2] %>% as.numeric()
# fRelPos <- relPos / length(exonBlockGen(gene, geneAnnot))
# return(fRelPos)
# })
# stopCluster(cl)
#
# # Nucleotide Matrix
# wind <- 6
# cl <- makeCluster(nCores, type ="FORK")
# tmpSeqs <- parSapply(cl, 1:nrow(all_relPos), function(i){
# iGene <- all_relPos[i, 1]
# iPos <- all_relPos[i, 2] %>% as.numeric()
# return(txOmeSEQS[[iGene]] %>% substring(first = iPos - wind, last = iPos + wind))
# })
# stopCluster(cl)
# selR <- sapply(tmpSeqs, nchar) %>% equal((wind*2)+1)
# tmpSeqs <- tmpSeqs[selR]
# seqData <- seqData[selR,]
# cl <- makeCluster(nCores, type ="FORK")
# nucMat <- parSapply(cl, tmpSeqs, function(x) {
# x %>% strsplit("") %>% unlist}) %>% t
# stopCluster(cl)
# rownames(nucMat) <- rownames(seqData)
# seqData$seqs <- tmpSeqs
# # Nucleotides binary matrix
# selNucPos <- c(3:wind,(wind+4):((wind*2)+1))
# selNucMat <- nucMat[, selNucPos]
# cl <- makeCluster(nCores, type = "FORK")
# nucBinMatrix <- parLapply(cl, 1:ncol(selNucMat), function(x) nucBinMat(selNucMat[,x])) %>% cbind.data.frame()
# stopCluster(cl)
# colnames(nucBinMatrix) <- sapply(paste("POS", selNucPos - wind - 1, sep = "_"),
# function(x) {paste(x, c("A", "C", "G", "T"), sep = "_")}) %>% as.vector()
# rownames(nucBinMatrix) <- rownames(seqData)
#
# # Putting everythin together
# modelDATA <- data.frame(seqData[, c("chr", "start", "end", "id", "score",
# "strand", "fEnergy", "GCcont","relPosRat", "seqs")],
# nucBinMatrix)
# rownames(modelDATA) <- rownames(nucBinMatrix)
# return(modelDATA)
# }
#
# # Merge lists elements by names
# mergeListsByNames <- function(list){
# keys <- unique(unlist(lapply(list, names)))
# setNames(do.call(mapply, c(FUN=c, lapply(list, `[`, keys))), keys)
# }
#
# # Extract random window of consecutive elements in vector
# extractRandWindow <- function(x, p){
# firstIndex = sample(seq(length(x) - p + 1), 1)
# x[firstIndex:(firstIndex + p -1)]
# }
#
#
# # get Coverage of range
# getCov_InRange <- function(covObject, gRange){
# covObject[[as.character(seqnames(gRange))]][(start(gRange)):(end(gRange))]
# }
#
# # Remove alignments that include Soft-clipping, Hard-clipping, deletions and insertions (S|D|I )
# cleanMapping_PairEnd <- function(pairedEnd_Alignments){
# tmpCig_f <- cigar(pairedEnd_Alignments@first)
# tmpCig_l <- cigar(pairedEnd_Alignments@last)
# pairedEnd_Alignments <- pairedEnd_Alignments[-union(grep(pattern = "D|I|S|H", tmpCig_f),
# grep(pattern = "D|I|S|H", tmpCig_l))]
# }
#
# # Getting sequence from genome
# getGenSeqInGenome <- function(gene, geneAnnot, genome){
# tmpA <- geneAnnot[gene,]$blockStarts %>% strsplit(split = ",") %>% unlist() %>% as.numeric()
# tmpB <- geneAnnot[gene,]$blockSizes %>% strsplit(split = ",") %>% unlist() %>% as.numeric()
# iBlocks <- (sapply(1:length(tmpA), function(i){
# c(tmpA[i],(tmpA[i] + tmpB[i] -1))
# }) %>% t) + geneAnnot[gene,]$start
# if(geneAnnot[gene,]$end %in% as.vector(iBlocks)){
# if(geneAnnot[gene,]$strand == "+"){
# sapply(1:nrow(iBlocks), function(k){
# str_sub(genome[[geneAnnot[gene,]$chr]], iBlocks[k, 1], iBlocks[k,2])
# }) %>% paste(collapse = "") %>% DNAString()
# }else if(geneAnnot[gene,]$strand == "-"){
# sapply(1:nrow(iBlocks), function(k){
# str_sub(genome[[geneAnnot[gene,]$chr]], iBlocks[k, 1], iBlocks[k,2])
# }) %>% paste(collapse = "") %>% DNAString() %>% reverseComplement()
# }
# }else{stop(paste("Malformed exon structure at gene", gene))}
# }
#
# # From transcriptomic range to genomic coordinates in BED format
# txRangeToBED <- function(Tgene, TrangeStart, TrangeEnd, geneAnnot){
# genCoor <- exonBlockGen(Tgene, geneAnnot)[TrangeStart:TrangeEnd]
# if(geneAnnot[Tgene,]$strand == "-"){
# genCoor <- rev(genCoor)
# }
# gStart <- genCoor[1]
# gEnd <- genCoor[length(genCoor)]
# blockCount <- sum(diff(genCoor) !=1) + 1
# if(sum(diff(genCoor) < 1) > 0){stop("Next exon position cannot be negative")}
# blockSizes <- paste(diff(which(c(2, diff(genCoor), 2) !=1)), collapse = ",")
# blockStarts <- paste(c(0, genCoor[which(diff(genCoor) != 1) +1] - genCoor[1]), collapse = ",")
# c(geneAnnot[Tgene,]$chr, gStart, gEnd, Tgene, 0,
# geneAnnot[Tgene,]$strand, gStart, gEnd, "0", blockCount,
# blockSizes, blockStarts)
# }
#
#
# # Shotcuts #####################################################################
# bed6_header <- c("chr", "start", "end", "name", "score", "strand")
# bed12_header <- c("chr", "start", "end", "name", "score", "strand",
# "thickStart", "thickEnd", "itemRgb", "blockCount", "blockSizes", "blockStarts")
#
#
# # GenomicAlignments functions #############################################################
# # Merging GAlignmentPairs
# mergeGAPairs <- function(GAPairsList){
# tmp_first <- lapply(1:length(GAPairsList), function(i){
# GAPairsList[[i]]@first
# }) %>% do.call(what = c)
# tmp_last <- lapply(1:length(GAPairsList), function(i){
# GAPairsList[[i]]@last
# }) %>% do.call(what = c)
# tmp_names <- lapply(1:length(GAPairsList), function(i){
# names(GAPairsList[[i]])
# }) %>% do.call(what = c)
# GAlignmentPairs(first = tmp_first, last = tmp_last, names = tmp_names)
# }
#
# # Extract Nucleotides from GAPairs object at single nucleotide position
# extract_nuc <- function(GAPairs, genomicPos){
# ov_f <- findOverlaps(genomicPos, GAPairs@first, ignore.strand = T)@to
# ov_l <- findOverlaps(genomicPos, GAPairs@last, ignore.strand = T)@to
# ov_l <- setdiff(ov_l, ov_f)
# if(length(ov_f) > 0){
# nucs_f <- str_sub(string = sequenceLayer(mcols(GAPairs@first[ov_f])$seq,
# mcols(GAPairs@first[ov_f])$cigar),
# start = start(genomicPos) - start(GAPairs@first[ov_f]) + 1,
# end = start(genomicPos) - start(GAPairs@first[ov_f]) + width(genomicPos))
# }else{nucs_f <- NULL}
# if(length(ov_l) > 0){
# nucs_l <- str_sub(string = sequenceLayer(mcols(GAPairs@last[ov_l])$seq,
# mcols(GAPairs@last[ov_l])$cigar),
# start = start(genomicPos) - start(GAPairs@last[ov_l]) + 1,
# end = start(genomicPos) - start(GAPairs@last[ov_l]) + width(genomicPos))
# }else{nucs_l <- NULL}
# if(as.logical(strand(genomicPos) == "-")){
# as.character(complement(DNAStringSet(c(nucs_f, nucs_l))))
# }else{c(nucs_f, nucs_l)}
# }
#
# # Read counts by range
# readCountsByRange <- function(BAM_file, targetRanges){
# flag0 <- scanBamFlag(isDuplicate = FALSE, # Base parameters
# isNotPassingQualityControls = FALSE,
# isPaired = T) #Remove optical duplicates and low quality reads
# chroms <- seqnames(targetRanges) %>% unique %>% as.character()
# lapply(chroms, function(iChr){
# param0 <- ScanBamParam(flag=flag0,
# which = targetRanges)
# rds <- readGAlignmentPairs(BAM_file, use.names = TRUE, param = param0)
# rds_pos <- rds[strand(rds@first) == "+"]
# rds_neg <- rds[strand(rds@first) == "-"]
# tmpRds_pos <- GRanges(seqnames = seqnames(rds_pos),
# ranges = IRanges(start = start(rds_pos@first), end = end(rds_pos@last)),
# strand = strand(rds_pos@first))
# tmpRds_neg <- GRanges(seqnames = seqnames(rds_neg),
# ranges = IRanges(start = start(rds_neg@last), end = end(rds_neg@first)),
# strand = strand(rds_neg@first))
# rdsExt <- c(tmpRds_pos, tmpRds_neg)
# tmpCoors <- targetRanges[seqnames(targetRanges) == iChr]
# tmpOver <- findOverlaps(rdsExt, tmpCoors)
# readSplit <- split(tmpOver@from, names(tmpCoors)[tmpOver@to])
# readSplit <- lapply(readSplit, unique) # Remove duplicated reads
# sapply(readSplit, length) # Counts by
# }) %>% unlist
# }
#
# # Counting reads by gene in gene annotation as ranges
# readCountsByGene <- function(BAM_file, geneAnnotRanges){
# flag0 <- scanBamFlag(isDuplicate = FALSE, # Base parameter
# isNotPassingQualityControls=FALSE,
# isPaired = T) #Remove optical duplicates and low quality reads
# chroms <- seqnames(geneAnnotRanges) %>% unique %>% as.character()
# lapply(chroms, function(iChr){
# param0 <- ScanBamParam(flag=flag0,
# which = GRanges(seqnames= iChr,
# ranges=IRanges(start=1,
# end= length(genome[[iChr]])),
# strand = "*"))
# rds <- readGAlignmentPairs(BAM_file, use.names = TRUE, param = param0)
# strand(rds@last) <- strand(invertStrand(rds@last))
# tmpGA <- geneAnnotRanges[seqnames(geneAnnotRanges) == iChr]
# fOver <- findOverlaps(rds@first, tmpGA, type = "within")
# lOver <- findOverlaps(rds@last, tmpGA, type = "within")
# readSplit_f <- split(fOver@from, tmpGA$name[fOver@to])
# readSplit_l <- split(lOver@from, tmpGA$name[lOver@to])
# readSplit <- mergeListsByNames(list(readSplit_f, readSplit_l))
# readSplit <- lapply(readSplit, unique) # Remove duplicated reads
# sapply(readSplit, length) # Gene Counts
# }) %>% unlist
# }
#
# # Counts list to dataframe function
# countsList_ToDataFrame <- function(countsList){
# tmpNames <- sapply(countsList, names) %>% unlist %>% unique %>% sort
# tmpDF <- lapply(1:length(countsList), function(i){
# tmp <- data.frame("x" = countsList[[i]][tmpNames], row.names = tmpNames)
# tmp[is.na(tmp)] <- 0
# tmp
# }) %>% do.call(what = cbind)
# colnames(tmpDF) <- names(countsList)
# tmpDF
# }
#
# # TPM normalization
# TPMnorm <- function(tmpDF){
# cS <- colSums(tmpDF)
# tmp <- sapply(1:ncol(tmpDF), function(i){
# ((tmpDF[,i]/cS[i]) * 1000000)
# })
# tmp
# }
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.