R/fimo_create_fimo_predictors.R

#' Creates a score list
#'
#' \code{makeScoreList} creates and gives column names to a list which will
#'   contain the minPval, maxScore, numOcc, and sumScore for Factorbook, 
#'   Jaspar, and Encode scores for the given eQTLs, as well as the Deepbind
#'   score.
#'
#' @return scoreList: a list which will contain the FactorBook, Jaspar, Encode,
#'   and Deepbind scores.
#' 
#' @name makeScoreList
makeScoreList <- function() {
  scoreList <- vector("list", 13)
  names(scoreList) <- c("fb_minPval", "fb_maxScore", "fb_numOcc", "fb_sumScore",
                        "en_minPval", "en_maxScore", "en_numOcc", "en_sumScore",
                        "ja_minPval", "ja_maxScore", "ja_numOcc", "ja_sumScore", 
                        "db_score")
  return(scoreList)
}

#' Load in FIMO summary data
#'
#' \code{loadFimoData} loads in the .Rdata files created by 
#'   fimo_run_fimo_and_summarize.R
#'
#' @param refOrAlt
#' @param functionName
#' @param fimoThreshold
#' @return Null. Loads 4 .Rdata files into the environment
#' 
#' @name loadFimoData
loadFimoData <- function(refOrAlt, functionName, fimoThreshold) {
  for (name in c("minPval", "maxScore", "sumScore", "numOcc")) {
    load(paste("Training_", refOrAlt, "_thres", fimoThreshold, 
               "_", functionName, "_", name, ".RData", sep = ""))
  }
}

#' Adds a score to the score list
#'
#' \code{addToScoreList} adds a FactorBook or Jaspar score to the score list.
#'   Specifically, minPval, maxScore, numOcc, or sumScore.
#'
#' @param trainingD: a table of eQTL causal SNPs
#' @param fimoS: a score for FIMO
#' @param scoreName: string, name indicating analysis type and type of score
#'   Ex. "fb_minPval"
#' @param scoreList: a vector containing all scores for FactorBook, ENCODE,
#'   Jaspar, and Deepbhind 
#'   
#' @name addToScoreList
addToScoreList <- function(trainingD, fimoS, scoreName, scoreList) {
  # split the fimo score rownames into ??
  oIndex <- as.numeric(apply(as.matrix(rownames(fimoS)), 1, function(x){strsplit(x, "_SNP_")[[1]][2]}))
  fimoS.ordered <- matrix(1, nrow(trainingD), ncol(fimoS))
  colnames(fimoS.ordered) <- colnames(fimoS)
  fimoS.ordered[oIndex, ] <- fimoS
  # add the ordered fimo score to the scoreName's column of the list of scores
  scoreList[[scoreName]] <- fimoS.ordered

  return(scoreList)
}


#' Adds a score to the score list
#'
#' \code{addAllToScoreList} adds the minPval, maxScore, numOcc, and sumScore to
#'   the scoreslist for either FactBook or Jaspar
#'
#' @param trainingD: a table of eQTL causal SNPs
#' @param fimoS: TYPE?, a score for FIMO
#' @param scoreName: list of strings, which are indicating analysis type and type of score
#'   Ex. "fb_minPval"
#' @param scoreList: a vector containing all scores for FactorBook, ENCODE,
#'   Jaspar, and Deepbhind 
#'
#' @name addAllToScoreList
addAllToScoreList <- function (trainingD, fimoS, scoreNames, scoreList) {
  # renaming the data which has been loaded in
  fimo.minPval <- out.1
  fimo.maxScore <- out.2
  fimo.numOcc <- out.3
  fimo.sumScore <- out.4
  
  # creating names to label each score with - specific to this analysis
  for (score in scoreNames) {
    scoreList <- addToScoreList(trainingD, fimoS, scoreName, scoreList)
  }
  return(scoreList)
}

#'
#'
#' \code{getCorrelation} performs a simple correlation between a fimo score and
#'    the eQTL data given.
#'
#' @param fimoS.ordered: the fimo scores
#' @param trainingD: a table of eQTL data
#' @name getCorrelation
getCorrelation <- function(fimoS.ordered, trainingD) {
  cor(fimoS.ordered, trainingD$C.A.log2FC)
}

#' Perform simple correlations
#' 
#' \code{createAllCorrelations} will take the scores for either FactBook,
#'   Jaspar, or ENCODE and will perform correlations for the minimum P-value,
#'   the maximum score, the sum of the scores, and the number of occurences.
#' 
#' @param scoreNames: list of strings, contains names of the analysis and the
#'    different scores. 
#'    Ex. c("fb_minPval", "fb_maxScore", "fb_numOcc", "fb_sumScore")
#' @param scoreList: list of scores
#' @return Null. Prints out.
#' @name createAllCorrelations
createAllCorrelations <- function(scoreList, scoreNames) {
  # Getting correlation for the minimum p-value
  cat("minPVal\n")
  fimoS.ordered <- scoreList[[scoreNames[1]]]
  fimoS.ordered.binary <- apply(fimoS.ordered, 2, function(x){res <- x; res[x>=0.05]<-0; res[x <0.05] <-1; return(res)}) #PWM match crude
  corRes.binary <- apply(fimoS.ordered.binary, 2, function(x){ifelse(length(unique(x)) > 1, wilcox.test(trainingD$C.A.log2FC~x)$p.value,1)})
  #order(corRes.binary, decreasing = FALSE)
  summary(-log10(corRes.binary))
  
  # Getting correlation for the maximum score
  cat("maxScore\n")
  fimoS.ordered <- scoreList[[scoreNames[2]]]
  corRes <- apply(fimoS.ordered, 2, getCorrelation(fimoS.ordered, trainingD))
  #order(abs(corRes), decreasing = TRUE)
  summary(abs(corRes))
  
  # Getting correlation for the sum of the scores
  cat("sumScore\n")
  fimoS.ordered <- scoreList[[scoreNames[3]]]
  corRes <- apply(fimoS.ordered, 2, getCorrelation(fimoS.ordered, trainingD))
  #order(abs(corRes), decreasing = TRUE)
  #sort(abs(corRes), decreasing = TRUE)[1:10]
  summary(abs(corRes))
  
  # Getting correlation for the number of occurences
  cat("numOcc\n")
  fimoS.ordered <- scoreList[[scoreNames[4]]]
  corRes <- apply(fimoS.ordered, 2, getCorrelation(fimoS.ordered, trainingD))
  #order(abs(corRes), decreasing = TRUE)
  #sort(abs(corRes), decreasing = TRUE)[1:10]
  summary(abs(corRes))
}

#' Exploratory analysis with FactorBook PWM scores
#'
#' \code{factorBookAnalysis} performs all analysis of the FactorBook PWM
#'
#' @param dataDir: directory of data
#' @param fimoDir: directory of FIMO files
#' @param refOrAlt: string of "ref" or "alt", used for naming
#' @param trainingD: training data table of eQTL data
#' @param fimoS
#' @param scoreList: list of scores
#' @name factorBookAnalysis
factorBookAnalysis <- function(dataDir, fimoDir, refOrAlt, trainingD, fimoS, scoreList) {
  # First load in the eQTL data
  setwd("/p/keles/CAGI2015/volumeB/Data/")
  trainingD <- read.table("4-eQTL-causal_SNPs_sample_v2.txt", header = T)
 
  # Load in the FIMO data from previous script
  #setwd("/Users/keles/Research_Projects/CAGI2015/Fimo/")
  setwd(fimoDir)
  loadFimoData(refOrAlt, "FactorBook", fimoThreshold)
  
  # Create a specific list of names for the 4 scores 
  # Ex. fb_minPval, etc.
  scoreNames <- paste("fb", scoreTypes, sep = "_")
  
  # Add all 4 scores to the output list
  scoreList <- addAllToScoreList(trainingD, fimoS, scoreNames, scoreList)
  
  # Create correlations for all 4 scores
  createAllCorrelations(scoreList, scoreNames)
  
}

#' Exploratory analysis with Deepbind scores
#'
#' \code{deepbindAnalysis} 
#'
#' @param deepBindLocation: string, location of deepbind
#' @name deepbindAnalysis
deepbindAnalysis <- function(deepBindLocation = "/p/keles/CAGI2015/volumeB/DeepBind/deepbind") {
  setwd("/p/keles/CAGI2015/volumeB/DeepBind/deepbind")
  db <- read.table("alt_SS.v1_deepbind.txt", header = TRUE)
  
  scoreList[["db_score"]] <- db
  
  ## Plot the correlations for the Deepbind data?
  setwd("/Users/keles/Research_Projects/CAGI2015/Data/After_Nov5")
  trainingD <- read.table("4-eQTL-causal_SNPs_sample_v2.txt", header = T)
  setwd("/Users/keles/Research_Projects/CAGI2015/DeepBind/")
  db <- read.table("Training_alt_3044_deepbind_927.txt", header = TRUE)
  corRes <- apply(db, 2, getCorrelation(fimoS.ordered, trainingD))
  index <- order(abs(corRes), decreasing = TRUE)[1:50]
 
  db.sub <- db[, index]
  colnames(db.sub) <- paste("db_", 1:50, sep = "")
  
  dd <- cbind(trainingD, db.sub)
  ddd <- dd
  ddd$grp <- paste(ddd$Regulatory_Hit, ddd$emVar_Hit, sep = "")
  ggplot(ddd, aes(x = db_2, y =C.A.log2FC)) + geom_smooth(method='lm',formula=y~x) + geom_point(aes(colour=grp)) + facet_wrap( ~ grp, nrow = 1)
  

}

#' Exploratory analysis with ENCODE PWM scores
#'
#' \code{deepbindAnalysis} performs all analysis of the Deepbind data
#'
#' @param fimoDir: location of the FIMO files
#' @param trainingD: training data table of eQTL data
#' @name encodeAnalysis
encodeAnalysis <- function(fimoDir, trainingD) {
  setwd("/p/keles/CAGI2015/volumeA/Data/After_Nov5")
  trainingD <- read.table("4-eQTL-causal_SNPs_sample_v2.txt", header = T)
  
  scoreNames <- paste("en", scoreTypes, sep = "_")
  
  setwd(fimoDir)
  loadFimoData(refOrAlt, "ENCODE", fimoThreshold)
  
  scoreList <- addAllToScoreList(trainingD, fimoS, scoreNames, scoreList)
  
  createAllCorrelations(scoreNames)
}

#' Exploratory analysis with Jaspar PWM scores
#'
#' \code{jasparAnalysis} 
#'
#' @param dataDir: string, location of data
#' @param fimoDir: string, location of FIMO files
#' @param trainingD: training data table of eQTL data
#' @return 
#' @name jasparAnalysis
jasparAnalysis <- function(dataDir, fimoDir, trainingD) {
  setwd("/p/keles/CAGI2015/volumeB/Data/")
  trainingD <- read.table("4-eQTL-causal_SNPs_sample_v2.txt", header = T)
  
  scoreNames <- paste("ja", scoreTypes, sep = "_")
  
  setwd(fimoDir)
  loadFimoData(refOrAlt, "Jaspar", fimoThreshold)
  
  createAllCorrelations(scoreList, scoreNames)
}


#' Output all scores
#' 
#' \code{outputAllScores} saves the FactorBook, Deepbind, ENCODE, and Jaspar
#'   scores.
#' 
#' @param scoreList: list of scores
#' @name outputAllScores
outputAllScores <- function(scoreList) {
  setwd("/p/keles/CAGI2015/volumeB/Predictors/")
  save(scoreList, file = "scoreList_Training_alt_thres0.001_pwm.RData")
}

#' Run all functions in here
#' 
#' 
#' 
#' @param trainingDLocation: string, location of the training D
#' @param dataDir: string, location of data
#' @param fimoDir: string, location of FIMO files
#' @param trainingDLocation: string, location of training eQTL data
#' @name create_fimo_predictors
create_fimo_predictors <- function(trainingDLocation = "/p/keles/CAGI2015/volumeA/Data/After_Nov5/4-eQTL-causal_SNPs_sample_v2.txt", 
                                   dataDir = "/p/keles/CAGI2015/volumeA/Data/After_Nov5",
                                   fimoDir = "/p/keles/CAGI2015/volumeB/Fimo") {
  trainingD <- read.table(trainingDLocation, header = T)
  scoreList <- makeScoreList()
  
  scoreTypes <- c("minPval", "maxScore", "numOcc", "sumScore")
  
  factorBookAnalysis(scoreList, trainingD)
  deepbindAnalysis()
  encodeAnalysis()
  jasparAnalysis()
  outputAllScores(scoreList)
}
alice-szeliga/keles-pipeline documentation built on May 12, 2019, 5:35 a.m.