TissueCiradianAnalysis: Analysis of oscillatory RNASeq data

#library(Matrix)
#library(dplyr)
#library(hash)

UnzipMatrices <- function(chunk.path, outdir){
  # Unzip matrix and save with extension ".full.Robj"
  load(chunk.path)  # des.mats.list
  outname <- basename(chunk.path)
  outname <- paste(strsplit(outname, ".Robj")[[1]], collapse = ".")
  outname <- paste0(outname, ".unzipped.Robj")
  out.path <- file.path(outdir, outname)
  
  des.mats.list <- lapply(des.mats.list, function(des.mats){
    des.mats$mat <- as.matrix(des.mats$mat)
    return(des.mats)
  })
  return(des.mats.list)
  save(des.mats.list, file = out.path)
}

ConcatenateFits <- function(genedir){
  source("scripts/functions/AppendListFunctions.R")
  start <- Sys.time()
  fits.long.list <- expandingList()
  for (gene in list.files(genedir)){
    fitdir <- file.path(genedir, gene)
    fits <- LoadFitsFromModels(fitdir)
    fits.top <- GetTopNModelsFromFits(fits, top.n)
    fits.long.gene <- ListToLong(fits.top, genename = gene, top.n = top.n, period = 24)
    fits.long.list$add(fits.long.gene)
    #   break
  }
  fits.long.list <- fits.long.list$as.list()
  fits.long <- do.call(rbind, fits.long.list)
  return(fits.long)
}

LoadFitsFromModels <- function(fitdir){
  fits.list <- expandingList()
  fit.files <- list.files(fitdir)
  for (i in seq(length(fit.files))){
    load(file.path(fitdir, fit.files[[i]]))  # fits
    fits.list$add(fits)
  }
  return(fits.list$as.list())
}

AddDf <- function(obj){
  # add arbitrary object to last column of data frame
  return(obj)
}

FitToMat <- function(fit, genename, weight, weight.raw, period = 24){
  # Turn fit into a long data frame
  model.name <- CoefToModelName(fit)
  # vector to long
  params <- CoefToParams(fit)
  dat.out <- data.frame(gene = genename, model = model.name, weight = weight, weight.raw = weight.raw)  # init
  dat.params <- dat.out %>%
    group_by(gene, model, weight, weight.raw) %>%
    do(param.list = AddDf(params))
  return(dat.params)
}

ListToLong <- function(fits.list, genename, top.n, period = 24){
  # form list of fits (sorted by best to worst) into a long dataframe
  # should be easily used to append or edit a data frame
  mats <- list()
  for (i in seq(top.n)){
    mats[[i]] <- FitToMat(fits.list[[i]]$fit, genename, fits.list[[i]]$weight.norm, fits.list[[i]]$weight.raw, period)
  }
  return(do.call(rbind, mats))
}

GetTopNModelsFromFits <- function(fits.list, top.n){
  fits.list <- NormalizeWeightsFromFits(fits.list)  # handles pesky fit.weights.sum  
  top.n.weights <- vector("numeric", length = top.n)
  top.n.tuples <- matrix(data = 0, nrow = top.n, ncol = 2)  # 2 because i and j indices
  for (i in seq(length(fits.list))){
    # if (is.null(fits.list[[i]])) next
    for (j in seq(length(fits.list[[i]]))){  # should be list of n.top or more
      weight.j <- fits.list[[i]][[j]]$weight
      min.indx <- which.min(top.n.weights)
      # if (length(weight.j) == 0) next
      if (is.null(fits.list[[i]][[j]])) next
      # print(weight.j)
      # print(top.n.weights[min.indx])
      # print(fits.list[[i]][[j]])
      if (top.n.weights[min.indx] < weight.j){
        # update weights and indcies
        top.n.weights[min.indx] <- weight.j
        # print(top.n.tuples[min.indx, c(1, 2)])
        top.n.tuples[min.indx, c(1, 2)] <- c(i, j)
        # print(top.n.tuples[min.indx, c(1, 2)])
      }
    }
  }
  # order top.n.tuples by top.n.weights
  top.n.tuples <- top.n.tuples[order(top.n.weights, decreasing = TRUE), ]
  
  # get the top.n tuples
  fits.top <- expandingList()
  # print(top.n.tuples)
  apply(top.n.tuples, MARGIN = 1, function(tup){
    # print(tup)
    fits.top$add(fits.list[[tup[1]]][[tup[2]]])
  })
  return(fits.top$as.list())
}

NormalizeWeightsFromFits <- function(fits.list){
  fit.weight.sum <- SumWeights(fits.list)
  for (i in seq(length(fits.list))){
    fits.list[[i]][["fit.weights.sum"]] <- NULL
    for (j in seq(length(fits.list[[i]]))){
      if (is.null(fits.list[[i]][[j]])) next
      fits.list[[i]][[j]][["weight.norm"]] <- fits.list[[i]][[j]][["weight"]] / fit.weight.sum
    }
  }
  return(fits.list)
}

SumWeights <- function(fits.list){
  # Given list of fits, divide each weight by fit.weights.sum
  fit.weight.sum <- 0
  for (fit in fits.list){
    if (!is.null(fit$fit.weights.sum)){
      fit.weight.sum <- fit.weight.sum + fit$fit.weights.sum
    }
  }
  return(fit.weight.sum)
}

GetNrhythFromModel <- function(model){
  # from model, return number of tissues that are rhythmic 
  # e.g.: Mus;Adr,Kidney,Liver;Aorta,BFAT,Heart,Lung = 8 rhythmic tissues
  # strategy: replace ";" with ',', then get length of ',' split string
  n.rhyth <- gsub(pattern = ";", replacement = ",", x = model)
  return(length(strsplit(n.rhyth, ",")[[1]]))
}

GetAvgAmpFromParams <- function(params, by.model=FALSE){
  # look at param names to get average amplitudes
  # option: by.model: go strictly by model (used if model was filtered by some amp criteria beforehand)
  # by.model is a model name e.g., "Kidney;Liver;Adr,Lung" or FALSE
  amps <- params[grepl("amp", names(params))]
  if (by.model){
    if (by.model == "") return(NA)  # non rhyth model
    grepstr = gsub(pattern = ";", replacement = "|", as.character(by.model))
    amps <- amps[grepl(grepstr, names(amps))]
  }
  if(length(amps) == 0) return(NA)  # non rhyth model
  
  # get number of tissues rhythmic for each rhythmic paramter (because tissues can
  # share paramters)
  n.tiss <- sapply(names(amps), function(tiss.id) length(strsplit(tiss.id, ",")[[1]]))
  amps.weighted <- sum(amps * n.tiss)
  # return weighted mean
  return(amps.weighted / sum(n.tiss))
}

GetSdPhaseFromParams <- function(params, by.model=FALSE){
  # look at param names to get average amplitudes
  # option: by.model: go strictly by model (used if model was filtered by some amp criteria beforehand)
  # by.model is a model name e.g., "Kidney;Liver;Adr,Lung" or FALSE
  phases <- params[grepl("phase", names(params))]
  if (by.model){
    if (by.model == "") return(NA)  # non rhyth model
    grepstr = gsub(pattern = ";", replacement = "|", as.character(by.model))
    phases <- phases[grepl(grepstr, names(phases))]
  }
  if(length(phases) == 0) return(NA)  # non rhyth model
  
  # get number of tissues rhythmic for each rhythmic paramter (because tissues can
  # share paramters)
  n.tiss <- sapply(names(phases), function(tiss.id) length(strsplit(tiss.id, ",")[[1]]))
  weights = n.tiss / sum(n.tiss)
  phases.avg = weighted.mean(phases, weights)
  # phases.weighted = sum(phases * n.tiss); phases.avg = phases.weighted / sum(n.tiss)  # equivalent but less readable
  
  phases.var =  sum(weights * (DiffPhase(phases, phases.avg)) ^ 2)  # denominator is not N-1 it is N
  return(sqrt(phases.var))
}

GetPhaseFromParams <- function(params, by.model=FALSE){
  # works only for n.params == 1
  phases <- params[grepl("phase", names(params))]
  if (by.model){
    if (by.model == "") return(NA)  # non rhyth model
    grepstr = gsub(pattern = ";", replacement = "|", as.character(by.model))
    phases <- phases[grepl(grepstr, names(phases))]
  }
  if(length(phases) == 0) return(NA)  # non rhyth model
  return(mean(phases))
}

DiffPhase <- function(phase1, phase2, period=24){
  # Get phase1 and phase2 difference, modulo period
  jdiff <- abs(diff(c(phase1, phase2)))
  return(min(period - jdiff, jdiff))
}

MaxDiffPhaseVector <- function(phases, period=24){
  phasediff.max <- 0
  for (phase.i in phases){
    for (phase.j in phases){
      jdiff <- DiffPhase(phase.j, phase.i, period)
      if (jdiff > phasediff.max){
        phasediff.max <- jdiff
      }
    }
  }
  return(phasediff.max)
}

GetMaxPhaseDiffFromParams <- function(params, by.model=FALSE, period=24){
  # Get maximum phase difference between tissues
  phases <- params[grepl("phase", names(params))]
  if (by.model != FALSE){
    if (by.model == "") return(NA)  # non rhyth model
    grepstr = gsub(pattern = ";", replacement = "|", as.character(by.model))
    phases <- phases[grepl(grepstr, names(phases))]
  }
  if(length(phases) == 0) return(NA)  # non rhyth model
  phasediff.max <- MaxDiffPhaseVector(phases, period)
  return(phasediff.max)
}

FilterModelByAmp <- function(model.name, params, amp.cutoff = 0.5){
  # http://stackoverflow.com/questions/10049402/calculating-weighted-mean-and-standard-deviation
  # Get amplitude of each tissue in model name.
  # model name: Kidney;Liver;Adr,Lung 
  # params: fit from nconds2: amp and phase are called tissue1.amp tissue1.phase ... 
  # return:
  #   model.filt: Kidney:Liver if Adr,Lung amp parameter is less than amp.cutoff.
  model.vec <- strsplit(as.character(model.name), split = ";")[[1]]
  amps <- params[grepl("amp", names(params))]
  # remove tissues from model with low amplitude
  amps.filt <- amps[which(amps >= amp.cutoff)]
  
  # create new model name based on filtering
  # from c("Mus.amp", "Adr,Kidney,Liver.amp" ... ) -> c("Mus", "Adr,Kidney,Liver" ... )
  models.filt <- sapply(names(amps.filt), function(name) strsplit(name, ".amp")[[1]])
  models.filt <- paste(models.filt, collapse = ";")
  return(as.factor(models.filt))
}


MakeDesMatFromModelName <- function(dat.gene, model.name, tissues, w = 2 * pi / 24){
  # Given model.name, return design matrix
  # model.name: Aorta,BFAT,Liver;Adr,BS,Cere,Heart,Kidney
  des.mat <- GetFlatModel(dat.gene)  # cbind to this to create matrix
  tiss.combos <- GetAllCombos(tissues)
  des.mat.sinhash <- GetSinCombos(dat.gene, w, tissues, tiss.combos)
  des.mat.coshash <- GetCosCombos(dat.gene, w, tissues, tiss.combos)
  
  model.name.vec <- strsplit(model.name, ";")[[1]]
  for (rhyth.tiss in model.name.vec){
    des.mat <- cbind(des.mat, AddRhythmicColumns(des.mat.sinhash, des.mat.coshash, rhyth.tiss))
  }
  return(des.mat)
}

LoadDesMatDatGeneRunFits <- function(dat.gene, mat.chunk, criterion = "BIC", normalize.weights = TRUE, top.n = 10, sparse = TRUE){
  # Run fits given dat.gene and chunk of mat, can be loaded from file 
  # track top.n
  # each element in list of mat.chunk has $mat, $rhyth.tiss etc. So use $mat for fitting
  
  # store my fits
  # to save memory, only store top n fits
  # fits <- expandingList()
  fits <- list()
  fit.count <- 0
  
  # track weights to normalize afterwards
  fit.weights.sum <- 0
  fit.weights <- vector(mode="numeric", length=top.n)
  
  # fit my model
  for (i in seq(length(mat.chunk))){
    des.mat <- mat.chunk[[i]]
    if (sparse){
      fit <- FitModel(dat.gene, as.matrix(des.mat$mat), get.criterion = criterion, condensed = TRUE)  # condensed will save some memory
    } else {
      fit <- FitModel(dat.gene, des.mat$mat, get.criterion = criterion, condensed = TRUE)  # condensed will save some memory
    }
    fit.weights.sum <- fit.weights.sum + fit$weight  # track even if it is a bad model, helps normalization
    fit.weights.lst <- UpdateFitWeights(fit$weight, fit.weights)
    if (!is.na(fit.weights.lst$i)){
      fits[[fit.weights.lst$i]] <- fit
      fit.weights <- fit.weights.lst$weights
    }
    fit.count <- fit.count + 1
  }
  
  if (normalize.weights){
    # print("Normalizing weights...")
    # print(dat.gene$gene[[1]])
    # fits.list <- NormalizeWeights(fits.list, cutoff)
    for (i in seq(length(fits))){
      fits[[i]]$weight.norm <- fits[[i]]$weight / fit.weights.sum
    }
  }
  fits$fit.weights.sum <- fit.weights.sum
  return(fits)
}

MakeDesMatChunks <- function(dat.gene, out.dir, tissues, n.rhyth.max, w = 2 * pi / 24, sparse = TRUE, chunks=10000, only.n.params = FALSE, count.mode=FALSE){
  # dat.gene: long format of gene expression, time and
  # conditions. Can include additional factors such as 
  # experiment.
  # to reduce computation, savee matrices into chunks (user defined, let's say 10000 models a chunk) so
  # if we run genome-wide, then we load the chunk and fit.
  #
  # only.n: store models only with exactly n parameters
  # count.mode: only care about counts, dont save anything
  
  if (missing(tissues)){
    tissues <- unique(as.character(dat.gene$tissue))
  }
  
  if (missing(n.rhyth.max)){
    n.rhyth.max <- length(tissues)
  } else if (n.rhyth.max < 1){
    print(n.rhyth.max)
    warning("N rhyth max cannot be less than 2")
  }
  
  tiss.combos <- GetAllCombos(tissues, ignore.full = FALSE)
  my_mat.queue <- new.queue()
  
  # BEGIN: init with flat model
  des.mat.flat <- GetFlatModel(dat.gene)
  
  if (sparse){
    # make sparse using Matrix package
    des.mat.flat <- Matrix(des.mat.flat)
  }
  
  # get rhythmic parameters which will be used for adding later: hash structure has fast lookup
  des.mat.sinhash <- GetSinCombos(dat.gene, w, tissues, tiss.combos)
  des.mat.coshash <- GetCosCombos(dat.gene, w, tissues, tiss.combos)
  
  rhyth.tiss <- list(character(0))  # needs to track shared and independent parameters, e.g.: c("Liver,Kidney", "Adr") no duplicates allowed
  # n.rhyth <- NRhythmicFromString(rhyth.tiss)  # number of independent rhythmic parameters perhaps? Do this later for speed?
  complement <- FilterCombos(tiss.combos, rhyth.tiss)  # given current matrix, you will know which tissues to iterate
  des.mat.list <- list(mat=des.mat.flat, rhyth.tiss=rhyth.tiss, complement = complement)
  # END: init with flat model
  
  # load up my queue
  enqueue(my_mat.queue, des.mat.list)
  
  # uncomment if you want to store all the matrices were used
  des.mats <- expandingList() 
  des.mats$add(des.mat.list)
  total.count <- 1  # mats total, to track how many models
  chunk.id <- 1  # increment every time we save a model
  
  # need to track models that we have done, so we eliminate "permutations" like c("Liver", "Kidney") and c("Kidney", "Liver) models
  # use hash for speed
  models.done <- hash()
  
  # generate matrix by adding combinations of columns and adding
  # those matrices into the queue
  while (! is.empty(my_mat.queue)) {
    des.mat.list <- dequeue(my_mat.queue)
    
    # check that this matrix is not already the maximum complexity (n.rhyth.max),
    # if it is already as complex as we want, then ignore it because 
    # we dont want to add another rhythmic column to this.
    #
    # strictly greater than should handle the case of "no rhytmic tissues"
    if (length(des.mat.list$rhyth.tiss) > n.rhyth.max){
      # print(paste("Skipping", des.mat.list$rhyth.tiss))
      next  # should work even in n.rhyth.max == length(tissues)
    }
    
    # determine tissue combinations that need to be added based on rhyth.tiss
    # e.g., no need to add Liver twice, they can't have two rhythmic paramters  
    for (tiss.comb in des.mat.list$complement){
      # add column for each tissue combination
      tiss.key <- paste(tiss.comb, collapse = ",")
      
      # append tiss.key to rhyth.tiss
      rhyth.tiss <- c(des.mat.list$rhyth.tiss, tiss.key)  # form list("Adr,Kidney", "Mus")
      
      # check if this tissue combination has been already submitted into queue (but in different permutation)
      # track models we have done globally
      modelname <- MakeModelName(rhyth.tiss)
      if (! is.null(models.done[[modelname]])){
        # this is a permutation of an already done combo, skip
        #       print(rhyth.tiss)
        #       print(paste('Skipping', modelname))
        next
      }
      
      col.new <- AddRhythmicColumns(des.mat.sinhash, des.mat.coshash, tiss.key)
      
      #     rhyth.tiss <- c(des.mat.list$rhyth.tiss, tiss.key)
      
      # further remove complement after having
      tiss.complement.new <- FilterCombos(des.mat.list$complement, tiss.comb)
      
      # make new matrix, put it into queue
      mat.new <- cbind(des.mat.list$mat, col.new)
      des.mat.list.new <- list(mat=mat.new, rhyth.tiss = rhyth.tiss, complement = tiss.complement.new)
      enqueue(my_mat.queue, des.mat.list.new)
      models.done[[modelname]] <- TRUE  # we dont want to redo permutations of same models
      total.count <- total.count + 1
      # add to list unless only.n is not FALSE
      if (only.n.params == FALSE){
        des.mats$add(des.mat.list.new)
      } else {
        # only add if only.n.params matches n.rhyth.params
        n.rhyth.params <- length(rhyth.tiss) - 1  # rhyth.tiss contains a flat model, remove 1 to consider only n.rhyth.params
        if (n.rhyth.params == only.n.params){
          des.mats$add(des.mat.list.new)
        }
      }
      if (total.count %% chunks == 0){
        if (count.mode == FALSE){
          SaveChunk(chunk.id, out.dir, des.mats)
        }
        # start a new des.mats
        des.mats <- expandingList()
        chunk.id <- chunk.id + 1
      }
    }  
  }
  # save last batch
  if (count.mode == FALSE){
    SaveChunk(chunk.id, out.dir, des.mats)
  }
  print(paste("Total models:", total.count))
  return(NULL)
}

SaveChunk <- function(chunk.id, out.dir, des.mats){
  fname <- paste0("chunk.", chunk.id, ".Robj")
  des.mats.list <- des.mats$as.list()
  save(des.mats.list, file = file.path(out.dir, fname))
  return(NULL)
}

LoadChunkRunNconds <- function(chunk.path, genename, tissues, n.rhyth.max, w, criterion, normalize.weights, cutoff, top.n, sparse){
  # given path to a dat.gene chunk, run nconds2
  if (missing(genename)){
    genename <- strsplit(basename(chunk.path), ".Robj")[[1]][[1]]
  }
  load(chunk.path)
  return(MakeDesMatRunFit(dat.gene, genename, tissues, n.rhyth.max, w, criterion, normalize.weights, cutoff, top.n, sparse))
}

ChunkDatGenesToFile <- function(dat.long, write.dir){
  # Split dat.long by gene, save each as an genename.Robj
  dat.long %>%
    group_by(gene) %>%
    do(SaveDatGenes(., write.dir))
  return(NULL)
}

SaveDatGenes <- function(dat.gene, write.dir){
  gene <- dat.gene$gene[[1]]
  fname <- paste0(gene, ".Robj")
  save(dat.gene, file = file.path(write.dir, fname))
  return(data.frame(NULL))
}

GetBestModel <- function(fits){
  fit.bestweight <- 0
  for (i in seq(length(fits))){
    if (length(fits[[i]]) == 1){
      # skip because one of the parameters is just gene name
      next
    }
    if (fits[[i]]$weight.norm > fit.bestweight){
      fit.best <- fits[[i]]
      fit.bestweight <- fits[[i]]$weight.norm
    }
  }
  return(fit.best)
}

MakeDesMatRunFitEnv <- function(env, genename, tissues, n.rhyth.max, w = 2 * pi / 24, criterion = "BIC", normalize.weights = FALSE, cutoff = 1e-3, top.n = 3, sparse = TRUE){
  # wrapper function to grab dat.gene from environment
  # genename is gene name found from ls(env)
  dat.gene <- get(genename, envir = env)
  return(MakeDesMatRunFit(dat.gene, genename, tissues, n.rhyth.max, w, criterion, normalize.weights, cutoff, top.n))
}

MakeDesMatRunFit <- function(dat.gene, gene, tissues, n.rhyth.max, w = 2 * pi / 24, criterion = "BIC", normalize.weights = FALSE, cutoff = 1e-3, top.n = 3, sparse = TRUE){
  # dat.gene: long format of gene expression, time and
  # conditions. Can include additional factors such as 
  # experiment.
  # To save memory, generate model and then run fit immediately afterwards.
  # optionally allow stopping after models reach a certain complexity
  
  if (missing(tissues)){
    tissues <- unique(as.character(dat.gene$tissue))
  }
  if (missing(gene)){
    gene <- as.character(dat.gene$gene[[1]])
  }
  
  if (missing(n.rhyth.max)){
    n.rhyth.max <- length(tissues)
  } else if (n.rhyth.max < 1){
    print(n.rhyth.max)
    warning("N rhyth max cannot be less than 2")
  }
  tiss.combos <- GetAllCombos(tissues, ignore.full = FALSE)
  my_mat.queue <- new.queue()
  
  # BEGIN: init with flat model
  des.mat.flat <- GetFlatModel(dat.gene)
  
  if (sparse){
    # make sparse using Matrix package
    des.mat.flat <- Matrix(des.mat.flat)
  }
  
  # get rhythmic parameters which will be used for adding later: hash structure has fast lookup
  des.mat.sinhash <- GetSinCombos(dat.gene, w, tissues, tiss.combos)
  des.mat.coshash <- GetCosCombos(dat.gene, w, tissues, tiss.combos)
  
  rhyth.tiss <- list(character(0))  # needs to track shared and independent parameters, e.g.: c("Liver,Kidney", "Adr") no duplicates allowed
  # n.rhyth <- NRhythmicFromString(rhyth.tiss)  # number of independent rhythmic parameters perhaps? Do this later for speed?
  complement <- FilterCombos(tiss.combos, rhyth.tiss)  # given current matrix, you will know which tissues to iterate
  des.mat.list <- list(mat=des.mat.flat, rhyth.tiss=rhyth.tiss, complement = complement)
  # END: init with flat model
  
  # load up my queue
  enqueue(my_mat.queue, des.mat.list)
  
  # uncomment if you want to store all the matrices were used
  # des.mats <- expandingList() 
  # des.mats$add(des.mat.list)
  
  # store my fits
  # to save memory, only store top n fits
  # fits <- expandingList()
  fits <- list()
  fit.count <- 0
  
  # track weights to normalize afterwards
  fit.weights.sum <- 0
  if (!is.null(top.n)){
    fit.weights <- vector(mode="numeric", length=top.n)
  }
  
  # need to track models that we have done, so we eliminate "permutations" like c("Liver", "Kidney") and c("Kidney", "Liver) models
  # use hash for speed
  models.done <- hash()
  
  # generate matrix by adding combinations of columns and adding
  # those matrices into the queue
  while (! is.empty(my_mat.queue)) {
    des.mat.list <- dequeue(my_mat.queue)
    
    # fit my model
    if (sparse){
      fit <- FitModel(dat.gene, as.matrix(des.mat.list$mat), get.criterion = criterion, condensed = TRUE)  # condensed will save some memory
    } else {
      fit <- FitModel(dat.gene, des.mat.list$mat, get.criterion = criterion, condensed = TRUE)  # condensed will save some memory
    }
    fit.count <- fit.count + 1
    fit.weights.sum <- fit.weights.sum + fit$weight  # track even if it is a bad model, helps normalization
    if (!is.null(top.n)){
      fit.weights.lst <- UpdateFitWeights(fit$weight, fit.weights)
      if (!is.na(fit.weights.lst$i)){
        fits[[fit.weights.lst$i]] <- fit
        fit.weights <- fit.weights.lst$weights
      }
    } else {
      fits[[fit.count]] <- fit
    }
    
    
    # check that this matrix is not already the maximum complexity (n.rhyth.max),
    # if it is already as complex as we want, then ignore it because 
    # we dont want to add another rhythmic column to this.
    #
    # strictly greater than should handle the case of "no rhytmic tissues"
    if (length(des.mat.list$rhyth.tiss) > n.rhyth.max){
      # print(paste("Skipping", des.mat.list$rhyth.tiss))
      next  # should work even in n.rhyth.max == length(tissues)
    }
    
    # determine tissue combinations that need to be added based on rhyth.tiss
    # e.g., no need to add Liver twice, they can't have two rhythmic paramters  
    for (tiss.comb in des.mat.list$complement){
      # add column for each tissue combination
      tiss.key <- paste(tiss.comb, collapse = ",")
      
      # append tiss.key to rhyth.tiss
      rhyth.tiss <- c(des.mat.list$rhyth.tiss, tiss.key)  # form list("Adr,Kidney", "Mus")
      
      # check if this tissue combination has been already submitted into queue (but in different permutation)
      # track models we have done globally
      modelname <- MakeModelName(rhyth.tiss)
      if (! is.null(models.done[[modelname]])){
        # this is a permutation of an already done combo, skip
        #       print(rhyth.tiss)
        #       print(paste('Skipping', modelname))
        next
      }
      
      col.new <- AddRhythmicColumns(des.mat.sinhash, des.mat.coshash, tiss.key)
      
      #     rhyth.tiss <- c(des.mat.list$rhyth.tiss, tiss.key)

      # further remove complement after having
      tiss.complement.new <- FilterCombos(des.mat.list$complement, tiss.comb)
      
      # make new matrix, put it into queue
      mat.new <- cbind(des.mat.list$mat, col.new)
      des.mat.list.new <- list(mat=mat.new, rhyth.tiss = rhyth.tiss, complement = tiss.complement.new)
      enqueue(my_mat.queue, des.mat.list.new) 
      models.done[[modelname]] <- TRUE  # we dont want to redo permutations of same models
      # des.mats$add(des.mat.list.new)
    }  
  }   
  # unpack my fits 
  # fits.list <- fits$as.list()
  # print(paste("Models fitted:", fit.count))
  fits.list <- fits
  
  if (normalize.weights){
    # print("Normalizing weights...")
    # print(dat.gene$gene[[1]])
    # fits.list <- NormalizeWeights(fits.list, cutoff)
    for (i in seq(length(fits.list))){
      fits.list[[i]]$weight.norm <- fits.list[[i]]$weight / fit.weights.sum
    }
  }
  fits.list$gene <- gene
  # print(fit.weights.sum)
  return(fits.list)
}

UpdateFitWeights <- function(weight, weights){
  # check min weights, if weight is larger than min(weights), replace min(weights) with weight
  which.min.i <- which.min(weights)
  min.weight <- weights[which.min.i]
  if (weight <= min.weight){
    return(list(weights = NA, i = NA))  # do nothing
  }
  else if (weight > min.weight){
    # replace it
    weights[which.min(weights)] <- weight
    return(list(weights = weights, i = which.min.i))
  }
}

NormalizeWeights <- function(fit.list, cutoff = 1e-3){
  # take list with $weight and 
  # add $weight.norm
  weight.sum <- sum(unlist(lapply(fit.list, function(fit){
    fit$weight
  })))
#   fit.list.norm <- Filter(function(fit){
#     fit$weight / weight.sum >= cutoff
#   }, fit.list)

  fit.list.norm <- lapply(fit.list, function(fit){
    fit$weight.norm <- fit$weight / weight.sum
    if (fit$weight.norm < cutoff){
      return(NULL)
    } else {
      return(list(fit = fit$fit, weight.norm = fit$weight.norm))
    }
  })
  return(fit.list.norm)
}

CoefToModelName <- function(coef){
  # Given set of coef, with sin and cos to designate rhythmic parameters, extract
  # a rhythmic model name e.g., Adr;Liver,Kidney;Mus means 3 rhythmic parameters, liver and kidney share 
  # rhythmic params
  rhyth.names <- names(coef[grepl(":sin", names(coef))])
  rhyth.names <- sapply(rhyth.names, function(jname) strsplit(jname, ":")[[1]][[1]])
  return(paste(rhyth.names, collapse=";"))
}

CoefToParams <- function(coef, period = 24){
  w = 2 * pi / 24
  flat.params <- coef[grepl("tissue", names(coef))]
  
  # Convert linear coefficients sin cos to amplitude and phase.
  rhyth.params.sin <- coef[grepl(":sin", names(coef))]
  rhyth.params.cos <- coef[grepl(":cos", names(coef))]
  rhyth.params.names <- names(coef)[grepl(":sin", names(coef))]
  
  if (length(rhyth.params.names) > 0){
    has.rhyth <- TRUE
  } else {
    has.rhyth <- FALSE
  }
  # TODO: check for case with no rhythmic param
  
  if (has.rhyth){
    # Liver:sin(w * time) -> Liver (can be Adr,Liver)
    rhyth.tiss <- sapply(rhyth.params.names, function(rhyth.param) strsplit(rhyth.param, ":")[[1]][[1]])
    
    # create amplitude and amp for each rhyth.tiss
    amps <- mapply(function(a, b) return(sqrt(a^2 + b^2)), rhyth.params.sin, rhyth.params.cos)
    phases <- mapply(function(a, b){
      phase.rad <- atan2(a, b)
      return((phase.rad / w) %% period)
    }, rhyth.params.sin, rhyth.params.cos)
    
    # name my new params
    if (length(amps) == 0 & length(phases) == 0) return(NA)
    names(amps) <- paste0(rhyth.tiss, ".amp")
    names(phases) <- paste0(rhyth.tiss, ".phase")
  } else {
    return(flat.params)
  }
  # return as numeric vector
  rhyth.params.out <- c(amps, phases)
  out.params <- c(flat.params, rhyth.params.out)
  return(out.params)
}

BICFromLmFit <- function(coefficients, residuals){
  # from vector of coefs and residuals, calculate BIC
  n <- length(residuals)
  RSS <- sum(residuals ^ 2)
  k <- length(coefficients)
  criterion <- n * log(RSS / n) + k * log(n)
  return(criterion)
}

AICFromLmFit <- function(coefficients, residuals){
  # from vector of coefs and residuals, calculate AIC
  n <- length(residuals)
  RSS <- sum(residuals ^ 2)
  k <- length(coefficients)
  criterion <- n * log(RSS / n) + 2 * k
  return(criterion)
}

FitModels <- function(dat.gene, my_mat, get.criterion = "BIC", normalize.weights = TRUE){
  # Fit many models with lm.fit() which is faster than lm()
  fits <- lapply(my_mat, function(mat) FitModel(dat.gene, mat, get.criterion))
  # calculate weights.sum
  weights.sum <- sum(unlist(lapply(fits, function(fit){
    return(fit$weight)
  })))
  
  # normalize weights so sum = 1
  if (normalize.weights){
    fits <- lapply(fits, function(fit){
      fit$weight.norm <- fit$weight / weight.sum
      return(fit)
    })
  }
  return(fits)
}

FitModel <- function(dat.gene, mat, weight.sum, get.criterion="BIC", condensed=FALSE){
  # Subroutine for fitting many models because
  # only one matrix, cannot normalize weights
  # 
  # weight.sum: track weight.sum 
  #   print(head(mat))
  #   print(dat.gene$exprs)
  fit <- lm.fit(y = dat.gene$exprs, x = mat)
  if (get.criterion == "BIC"){
    criterion <- BICFromLmFit(fit$coefficients, fit$residuals)
    weight <- exp(-0.5 * criterion)
    weight.raw <- criterion
  } else if (get.criterion == "AIC"){
    criterion <- AICFromLmFit(fit$coefficients, fit$residuals)
    weight <- exp(-0.5 * criterion)
    weight.raw <- criterion
  } else {
    rsquared <- GetRSquaredFromFits(dat.gene$exprs, fit$residuals)
    #     # sanity check
    #         rsquared.check <- summary(lm(dat.gene$exprs ~ mat))$r.squared
    #         print(paste("RSquared calculated: ", rsquared))
    #         print(paste("RSquared real: ", rsquared))
    N <- length(dat.gene$exprs)
    p <- length(fit$coefficients)
    bf <- tryCatch({
      bf <- GetBayesFactor(N, p, rsquared, method = get.criterion, plot.integrand = FALSE)  # output is log
    }, error = function(e) {
      print(as.character(dat.gene$gene[[1]]))
      print(e)
      bf <- NA
    })
    weight <- exp(bf)
    weight.raw <- -bf
  }
  if (condensed){
    return(list(fit = fit$coefficients, weight = weight, weight.raw = weight.raw, method = get.criterion))
  } else {
    return(list(fit = fit$coefficients, residuals = fit$residuals, weight = weight, method = get.criterion))
  }
}

GetBayesFactor <- function(N, p, rsquared, method = "zf", plot.integrand=FALSE){
  # Bayes Factor a la Liang et al 2008 mixture of g priors
  # http://www.tandfonline.com/doi/abs/10.1080/00273171.2012.734737
  # http://amstat.tandfonline.com/doi/abs/10.1198/016214507000001337#.V017rTdWeMM  
  # 
  # Taken from Richard Morey Bayes Factor package
  # github code: https://github.com/richarddmorey/BayesFactor/blob/a90b5929a8f44de1ef853b49de1d49b3c8095285/pkg/BayesFactor/R/regressionBF-utility.R
  #
  # Methods:
  # zf = "Zellner-Siow priors"
  # hyperg = "hyper-g priors"
  # zf_laplace = "Zellner-Siow priors approximated with Laplace"
  # hyperg_laplace = "hyper g approximated with Laplace
  # eb = "Empirical Bayes" use mode of g 
  
  if (method == "zf"){
    # get Marginal Likelihood of the model
    debug <- FALSE
    if (debug){
      g.mode <- ModeG(N, p, rsquared)
      gvec <- seq(0, 500000, length.out = 1000)
      bf.real <- exp(linearReg.R2stat(N=N, p=p, R2=rsquared, rscale = 1)[['bf']])  # needs BayesFactor package
      gvec.log <- seq(-50, 50, length.out = 1000)
      plot(gvec, ModelLikelihood(gvec, N=N, p=p, R2=rsquared, .log=TRUE, log.const=0, return.log=FALSE), type = "l", main=paste("log(g) vs transformed integrand. Mode:", signif(log(g.mode), digits = 2)))
      plot(gvec.log, ModelLikelihood2(gvec.log, N=N, p=p, R2=rsquared, shift = log(g.mode)), type = "l", main=paste("log(g) vs transformed integrand. Mode:", signif(log(g.mode), digits = 2)))
      # test my modellikelihood
      bf.old <- ModelLikelihood(g.mode, N=N, p=p, R2=rsquared, .log = TRUE, log.const = 0, return.log = FALSE)
      bf.bylog <- integrate(ModelLikelihood2, lower=-Inf, upper=Inf, N=N, p=p, R2=rsquared, log.const = 0, shift = log(g.mode))
      print(paste("BF old:", bf.old[[1]]))
      print(paste("BF real:", bf.real))
      print(paste("BF by log:", bf.bylog[[1]]))
    }
    # bf <- integrate(ModelLikelihood, lower=-Inf, upper=Inf, N=N, p=p, R2=rsquared, .log = TRUE, log.const = 0, return.log = FALSE)  # ModelLikelihood ratio with NULL model
    g.mode <- ModeG(N, p, R2 = rsquared)
    log.const <- ModelLikelihood2(tau = 0, N = N, p = p, R2 = rsquared, shift = log(g.mode), return.log=TRUE)
    # bf <- integrate(ModelLikelihood, lower=0, upper=Inf, N=N, p=p, R2=rsquared, .log = TRUE, log.const = 0, return.log = FALSE)  # ModelLikelihood ratio with NULL model
    bf <- integrate(ModelLikelihood2, lower=-Inf, upper=Inf, N=N, p=p, R2=rsquared, log.const = log.const, shift = log(g.mode))  # shift to centre
    if (bf[[4]] == "OK"){
      bf <- log(bf[[1]]) + log.const
    } else {
      warning(paste("Integration returned non-OK message", bf[[4]]))
      bf <- log(bf[[1]])
    }
  } else if (method == "zf_laplace"){
    warning("zf_laplace still buggy")
    # Approximate Marginal Likelihood of the model
    approx <- LaplaceApprox(N, p, rsquared)
    print(paste("mode:", approx$g.mode))
    bf <- approx$bf
  } else if (method == "hyperg"){
    a <- 3  # 2 to 4 is OK. 3 is recommended by Liang et al 2008
    bf <- log(((a - 2) / (p + a - 2)) * Re(hypergeo::hypergeo(A = (N - 1 / 2), B = 1, C = (p + a) / 2, z = rsquared)))
  } else if (method == "eb"){
    # proper way is to do EM algorithm, which I will not do.
    g.mode <- 1000
    bf <- log(ModelLikelihood(g = g.mode, N = N, p = p, R2 = rsquared, .log = TRUE, log.const = 0, return.log = FALSE))
  } else {
    # guess that user set method as "g"
    if (!is.numeric(method)){
    	# if not numeric, expect form g=1000
    	g <- as.numeric(strsplit(method, "=")[[1]][[2]])
    } else {
    	g <- method
    }
    # no prior needed (Equation 8 of Liang 2008)
    bf <- log(ModelLikelihood(g = g, N = N, p = p, R2 = rsquared, .log = TRUE, log.const = 0, return.log = FALSE, add.prior = FALSE))
  }
  if (plot.integrand){
    par(mar = c(5,5,2,5))
    g <- seq(0, 1000)
    cat(paste("N:", N, "\np:", p, "\nR2:", rsquared, "\n"))
    plot(g, ModelLikelihood(g, N = N, p = p, R2 = rsquared), type = "l", main = paste("N:", N, "\np:", p, "\nR2:", rsquared, "\n"))
    if (method == "zf_laplace"){
      abline(v = approx$g.mode, lty = 3)
      par(new = T)
      plot(g, SecondDerivG(g, N, p, rsquared), type = "l", lty = 2, axes=F, xlab=NA, ylab=NA)
      axis(side = 4)
      mtext(side = 4, line = 3, 'd2h')
    }
  }
  return(bf)
}

LaplaceApprox <- function(N, p, R2){
  # Approximate the marginal likelihood using LaPlace
  # Appendix A of Liang et al 2008
  # 
  ## Compute second derivative
  g.mode <- ModeG(N, p, R2)
  d2h <- SecondDerivG(g.mode, N, p, R2)  # Equation A.2
  print(paste("d2h:", d2h))
  sig <- (-1 * d2h) ^ -0.5
  print(paste("sig:", sig))
  bf <- sqrt(2 * pi)* sig * ModelLikelihood(g.mode, N, p, R2, .log = TRUE, log.const = 0, return.log = TRUE)  # Equation A.1
  return(list(bf = bf, g.mode = g.mode))
}

ModeG <- function(N, p, R2){
  # approximate with Laplace integral
  ### Compute approximation to posterior mode of g
  ### Liang et al Eq. A.3, assuming a=b=0
  g3 = -(1 - R2) * (p + 3) #* g^3 
  g2 = (N - p - 4 - 2 * (1 - R2)) #* g^2
  g1 = (N * (2 - R2) - 3) #*g
  g0 = N
  sol = polyroot(c(g0, g1, g2, g3))  # solutions to cubic equation
  ## Pick the real solution
  g.mode = Re(sol[which.min(Im(sol)^2)])
  if (g.mode <= 0){
    g.mode <- N / 20  # somewhere close to 0, can't do log of negs
  }
  return(g.mode)
}

SecondDerivG <- function(g, N, p, R2, a = 0, b = 0, verbose=FALSE){
  # Liang et al Eq. A. 3
  t1 <- (N - 1) * (1 - R2) ^ 2 / (1 + (g * (1 - R2))) ^ 2
  t2 <- -1 * (N - p - 1) / ((1 + g) ^ 2)
  t3 <- (3 - 2 * a) / (g ^ 2)
  t4 <- -1 * (2 * N / g ^ 3)
  d2h <- 0.5 * (t1 + t2 + t3 + t4)
  if (verbose){
    cat(paste("t1: ", t1, "\nt2", t2, "\nt3", t3, "\nt4", t4, "\n"))
  }
#   d2h <- 0.5 * ( 
#     (((N - 1)*(1 - R2)) / ((1 + g * (1 - R2)) ^ 2)) - 
#     ((N - p - 1) / ((1 + g) ^ 2)) + 
#     ((3 - 2 * a) / (g ^ 2)) - 
#     ((2 * N) / (g ^ 3))
#     )
  return(d2h)
}

ModelLikelihood <- function(g, N, p, R2, .log=FALSE, log.const=0, return.log = FALSE, method = "zf", add.prior=TRUE){
  if (!.log){
    if (add.prior){
      L <- (((1 + g) ^ ((N - p - 1) / 2)) * (1 + (1 - R2) * g) ^ -((N - 1) / 2)) * PriorG(N, g, .log = FALSE, method = "zf")
    } else {
      L <- (((1 + g) ^ ((N - p - 1) / 2)) * (1 + (1 - R2) * g) ^ -((N - 1) / 2))
    }
  } else {
    debug <- FALSE
    if (debug){
      # print((N - p -1) * log1pExp(g))
      # print(g + log(1 - R2))
      # print(.5 * ((N - p - 1 ) * log1pExp(g) - (N - 1) * log1pExp(g + log(1 - R2))))
      # a2 <- .5 * ((N - p - 1 ) * log1pExp(g) - (N - 1) * log1pExp(g + log(1 - R2)))  
      # a1 <- (((N - p - 1) / 2) * log(1 + g)) + (-(N - 1) / 2) * log(1 + (1 - R2) * g)
      a2 <- (- (N - 1)/2 * log1pExp(g + log(1 - R2)))
      a1 <-  (-(N - 1) / 2) * log(1 + (1 - R2) * g)
      print(paste("a from jake:", a1))
      print(paste("a from morey", a2))
      print(paste("a from jake + correction", a1 + log(g)))
    }
    # two lines should be equal
    # L <- (((N - p - 1) / 2) * log1pExp(log(g)) + (-(N - 1) / 2) * log1pExp(log((1 - R2) * g)) + PriorG(N, g, .log = TRUE, method = "zf"))
    if (add.prior){
      L <- (((N - p - 1) / 2) * log(1 + g)) + (-(N - 1) / 2) * log(1 + (1 - R2) * g) + PriorG(N, g, .log = TRUE, method = "zf")
    } else {
      L <- (((N - p - 1) / 2) * log(1 + g)) + (-(N - 1) / 2) * log(1 + (1 - R2) * g)
    }
    # exponentiate at the end
    if (!return.log){
      L <- exp(L)
    }
  }
  return(L)
}

ModelLikelihood2 <- function(tau, N, p, R2, log.const=0, shift=0, return.log=FALSE){
  # integrate from -Inf to Inf, function of log(g)
  # allow shift to move tau where peak is centre
  tau <- tau + shift
  a <- ((N - 1 - p) / 2) * log1pExp(tau) +  # log1pExp works when tau is large
   (-(N - 1) / 2) * log1pExp(tau + log(1 - R2))  # log1pExp(tau + log(1 - R2)) == log(1 + (1 - R2) * exp(tau))
  # a = .5 * ((N - p - 1 ) * log1pExp(tau) - 
  #             (N - 1) * log1pExp(tau + log(1 - R2)))
  L <- a + PriorG(N, tau, .log=TRUE, .logx = TRUE, method="zf") - log.const + tau
  if (!return.log){
    return(exp(L))
  } else {
    return(L)
  }
}

log1pExp <- Vectorize(function(x){
  # return log(1 + exp(x)), preventing Infs
  if (x > -log(.Machine$double.eps)){
    # log(1 + exp(x)) == x, x too large for computer
    return(x)
  } else{
    return(log(1 + exp(x)))
  }
}, "x")



PriorG <- function(N, g, .log=TRUE, .logx=FALSE, method = "zf"){
  # Evaluate prior distribution of g
  # uses Inv-Gamma(1/2, N/2)
  if (method == "zf"){
    shape <- 1/2
    scale <- N/2  # differs from BayesFactor implementation
    if (!.logx){
      prior.g <- InvGamma(g, shape, scale, .log = TRUE, .logx=.logx)
    } else {
      prior.g <- InvGamma(g, shape, scale, .log = TRUE, .logx=.logx)
    }
    # prior.g <- MCMCpack::dinvgamma(g, shape, scale)
  } else if (method == "hg"){
    a <- 3  # range from 2 to 4 is reasonable. Equation 16 of Liang et al 2008
    prior.g <- (a - 2)
  } else {
    warning("Unknown method")
  }
  return(prior.g)
}

InvGamma <- function(g, shape, scale, .log=FALSE, .logx=FALSE){
  const <- ((scale ^ shape) / gamma(shape))
  if (!.log){
    if (.logx) warning("Log x not coded for when .log=FALSE")
    d <- const * g ^ (-shape - 1) * exp(-scale / g)
  } else {
    if (!.logx){
      d <- log(const) + (-shape - 1) * log(g) + (-scale / g)
    } else {
      # g is tau
      d <- log(const) + (-shape - 1) * g + (-scale * exp(-g))
    }
  }
  return(d)
}

GetRSquaredFromFits <- function(y, residuals){
  TSS <- sum((y - mean(y)) ^ 2)
  RSS <- sum(residuals^2)
  return(1 - (RSS / TSS))
}

GetSelectionCriterion <- function(fits, model.selection = "BIC"){
  # given list of fits, calculate BIC weights
  # get coefficients and residuals, get model selection by BIC or AIC
  if (model.selection == "BIC"){
    # https://en.wikipedia.org/wiki/Bayesian_information_criterion#Definition
    # BIC = -2 log(RSS/n) + k * log(n)
    bics <- unlist(lapply(fits, function(fit){
      n <- length(fit$residuals)
      RSS <- sum(fit$residuals ^ 2)
      k <- length(fit$coefficients)
      criterion <- -2 * log(RSS / n) + k * log(n)
      bicweight <- exp(-0.5 * criterion)
      }))
    # normalize
    bics.weight <- bics / sum(bics)
  } else {
    warning("Only BIC is currently implemented.")
  }
}

MakeRhythmicDesignMatrices <- function(dat.gene, w = 2 * pi / 24, simplify=FALSE){
  # dat.gene: long format of gene expression, time and
  # conditions. Can include additional factors such as 
  # experiment.
  # simplify: return only design matrix rather than full matrix containing meta data (FALSE is debug mode)
  
  tissues <- unique(as.character(dat.gene$tissue))
  
  tiss.combos <- GetAllCombos(tissues, ignore.full = FALSE)
  my_mat.queue <- new.queue()
  
  # BEGIN: init with flat model
  des.mat.flat <- GetFlatModel(dat.gene)
  # get rhythmic parameters which will be used for adding later: hash structure has fast lookup
  des.mat.sinhash <- GetSinCombos(dat.gene, w, tissues, tiss.combos)
  des.mat.coshash <- GetCosCombos(dat.gene, w, tissues, tiss.combos)
  
  rhyth.tiss <- list(character(0))  # needs to track shared and independent parameters, e.g.: c("Liver,Kidney", "Adr") no duplicates allowed
  # n.rhyth <- NRhythmicFromString(rhyth.tiss)  # number of independent rhythmic parameters perhaps? 
  n.rhyth <- NRhythmicFromVector(rhyth.tiss)  # do that later? naw faster if we do it now
  complement <- FilterCombos(tiss.combos, rhyth.tiss)
  des.mat.list <- list(mat=des.mat.flat, rhyth.tiss=rhyth.tiss, n.rhyth=n.rhyth, complement = complement)
  # END: init with flat model
  
  # load up my queue
  enqueue(my_mat.queue, des.mat.list)
  
  n.mat.submitted <- 1
  des.mats <- expandingList() 
  des.mats$add(des.mat.list)
  
  # need to track models that we have done, so we eliminate "permutations" like c("Liver", "Kidney") and c("Kidney", "Liver) models
  # use hash for speed
  models.done <- hash()
  
  # generate matrix by adding combinations of columns and adding
  # those matrices into the queue
  while (! is.empty(my_mat.queue)) {
    des.mat.list <- dequeue(my_mat.queue)
    # determine tissue combinations that need to be added based on rhyth.tiss
    # e.g., no need to add Liver twice, they can't have two rhythmic paramters
    
    for (tiss.comb in des.mat.list$complement){
      # add column for each tissue combination
      tiss.key <- paste(tiss.comb, collapse = ",")
      
      # append tiss.key to rhyth.tiss
      rhyth.tiss <- c(des.mat.list$rhyth.tiss, tiss.key)  # form list("Adr,Kidney", "Mus")
      
      # check if this tissue combination has been already submitted into queue (but in different permutation)
      # track models we have done globally
      modelname <- MakeModelName(rhyth.tiss)
      if (! is.null(models.done[[modelname]])){
        # this is a permutation of an already done combo, skip
        #       print(rhyth.tiss)
        #       print(paste('Skipping', modelname))
        next
      }
      
      col.new <- AddRhythmicColumns(des.mat.sinhash, des.mat.coshash, tiss.key)
      
      #     rhyth.tiss <- c(des.mat.list$rhyth.tiss, tiss.key)
      
      
      # further remove complement after having
      tiss.complement.new <- FilterCombos(des.mat.list$complement, tiss.comb)
      
      # add meta data: makes finding models easier
      n.rhyth <- des.mat.list$n.rhyth + length(tiss.comb)
      
      # make new matrix, put it into queue
      mat.new <- cbind(des.mat.list$mat, col.new)
      des.mat.list.new <- list(mat=mat.new, rhyth.tiss = rhyth.tiss, n.rhyth=n.rhyth, complement = tiss.complement.new)
      enqueue(my_mat.queue, des.mat.list.new) 
      models.done[[modelname]] <- TRUE  # we dont want to redo permutations of same models
      n.mat.submitted <- n.mat.submitted + 1
      des.mats$add(des.mat.list.new)
    }  
  } 
  print(paste("Number of matrices generated:", n.mat.submitted))
  des.mats.list <- des.mats$as.list()
  if (simplify){
    des.mats.list <- lapply(des.mats.list, function(x) return(x$mat))
  }
  return(des.mats.list)
}

MakeModelName <- function(rhyth.tiss.lst, delim = ";"){
  # From a list of rhythmic tissues, generate a hash key to track models
  # therefore: Adr,Kidney;Liver means Adr,Kidney same param, Liver independent param
  lst <- sort(unlist(rhyth.tiss.lst))
  return(paste(lst, collapse = delim))
}

AddRhythmicColumns <- function(des.mat.sinhash, des.mat.coshash, tiss.key){
  # cbind rhythnic columns, rename to track which tissues share this parameter
  col.new <- cbind(des.mat.sinhash[[tiss.key]], des.mat.coshash[[tiss.key]])
  col.new.namebase <- paste(tiss.key, sep = ",")
  colnames(col.new) <- c(paste0(col.new.namebase, ":sin(w * time)"), paste0(col.new.namebase, ":cos(w * time)"))
  return(col.new)
}

GetSinCombos <- function(dat.gene, w, tissues, combos){
  des.mat.sin <- GetRhythModel.sin(dat.gene, w)
  des.mat.sin.hash <- MakeHash(des.mat.sin, dat.gene, tissues, combos)
  return(des.mat.sin.hash)
}

GetCosCombos <- function(dat.gene, w, tissues, combos){
  des.mat.cos <- GetRhythModel.cos(dat.gene, w)
  des.mat.cos.hash <- MakeHash(des.mat.cos, dat.gene, tissues, combos)
  return(des.mat.cos.hash)
}

NRhythmicFromString <- function(rhyth.tiss, jsep = ","){
  # How many tissues are rhythmic given a comma separated stringS
  # "" suggests 0
  if (length(rhyth.tiss) == 0) return(0)
  if (rhyth.tiss == ""){
    return(0) 
  } else {
    return(length(strsplit(rhyth.tiss, jsep)[[1]]))
  }
}

NRhythmicFromVector <- function(rhyth.tiss.vec, jsep = ","){
  # if c("Adr,Liver", "Kidney") output 3 rhythmic tisues
  n.rhyth <- 0
  for (tiss in rhyth.tiss.vec){
    n.rhyth <- n.rhyth + NRhythmicFromString(tiss)
  }
  return(n.rhyth)
}

FilterCombos <- function(combos, combos.sub){
  # Filter out combos given a combos.sub, filters
  # any combo that contains even one of the elements in combo.sub
  # always removes an empty set no matter what
  combos.subl <- sapply(combos, function(comb){
    if (length(comb) == 0) return(FALSE)  # empty set
    inrhyth <- TRUE  # say it is true, loop through conditions to set to false if it is a duplicate
    for (ci in comb){
      if (ci %in% combos.sub){
        inrhyth <- inrhyth * FALSE
      } 
    }
    return(as.logical(inrhyth))
  })
  return(combos[combos.subl])
}

GetRhythmicFormula <- function(exprs, time, intercepts, w = 2 * pi / 24, with.intercept = FALSE){
  # Get formula for full rhythmic model
  # tissue: genotype, or tissues
  # x: time
  # y: expression
  # experiment: column name of experiment (can be blank I think "")
  # with.intercept: if you want tissue and experiments to be your intercepts, keep it False
  
  sinterm <- paste0(tissue, " : ", "sin(", w, " * ", time, ")")
  costerm <- paste0(tissue, " : ", "cos(", w, " * ", time, ")")
  rhyth.terms <- paste(sinterm, costerm, sep = " + ")
  
  if (with.intercept){
    intercept <- "1"
  } else {
    intercept <- "0"
  }
  
  intercepts <- paste(intercepts, collapse = "+")
  
  form <- as.formula(paste0(exprs, "~", paste(intercept, tissue, experiment, rhyth.terms, sep = "+")))
  return(form)
}

GetFlatModel <- function(dat.gene){
  des.mat.flat <- tryCatch({
    model.matrix(exprs ~ 0 + tissue + tissue:experiment, dat.gene)
  }, error = function(e) {
    # error, try without experiment
    model.matrix(exprs ~ 0 + tissue, dat.gene)
  })
}

GetRhythModel <- function(dat.gene, w = 2 * pi / 24){
  des.mat.rhyth <- model.matrix(exprs ~ 0 + tissue:sin(w * time) + tissue:cos(w * time), dat.gene)
}

MakeHash <- function(des.mat.rhyth, dat.gene, tissues, combos){
  # return as hash object requires hash library
  if (missing(tissues)) tissues <- unique(as.character(dat.gene$tissue))
  # init with single tissues
  des.mat.hash <- hash()
  for (tiss.i in seq(tissues)){
    tiss <- tissues[tiss.i]
    des.mat.hash[[tiss]] <- des.mat.rhyth[, colnames(des.mat.rhyth)[grepl(tiss, colnames(des.mat.rhyth))]]
  }
  # add combos only if they are composed of >1 tissues
  for (comb in combos){
    if (length(comb) <= 1) next
    # init
    vec <- numeric(length = nrow(des.mat.rhyth))
    for (tiss in comb){
      vec <- vec + des.mat.hash[[tiss]]
    }
    # put combo into hash as comma separated string
    comb.key <- paste(comb, collapse = ",")
    des.mat.hash[[comb.key]] <- vec
  }
  return(des.mat.hash)
}


GetRhythModel.sin <- function(dat.gene, w = 2 * pi / 24){
  des.mat.rhyth <- model.matrix(exprs ~ 0 + tissue:sin(w * time), dat.gene)
  return(des.mat.rhyth)
}

GetRhythModel.cos <- function(dat.gene, w = 2 * pi / 24){
  des.mat.rhyth <- model.matrix(exprs ~ 0 + tissue:cos(w * time), dat.gene)
}

GetRhythmicFormula.Shared <- function(exprs, time, rhythmic.factor, intercepts, w = 2 * pi / 24, with.intercept = FALSE){
  # Get formula for full rhythmic model
  # exprs: expression col
  # time: time
  # rhythmic.factor: factor describing if rhythmic or not. 
  # with.intercept: if you want tissue and experiments to be your intercepts, keep it False
  
  # get rhythmic terms
  sinterm <- paste0(rhythmic.factor, " : ", "sin(", w, " * ", time, ")")
  costerm <- paste0(rhythmic.factor, " : ", "cos(", w, " * ", time, ")")
  rhyth.terms <- paste(sinterm, costerm, sep = " + ")
  
  if (with.intercept){
    intercept <- "1"
  } else {
    intercept <- "0"
  }
  
  intercepts <- paste(intercepts, collapse = "+")
  
  form <- as.formula(paste0(exprs, "~", paste(intercept, tissue, experiment, rhyth.terms, sep = "+")))
  return(form)
}

SubsetFullDesignMatrix <- function(des.mat, rhythmic.tissues){
  # Remake design matrix to include only "rhythmic.tissues" given the full des.mat from GetRhythmicFormula
  
  # keep intercept terms
  cols.int <- !grepl(":sin|:cos", colnames(des.mat))  # all terms without :sin or :cos are intercept terms
  
  if (length(rhythmic.tissues) == 0){
    # no rhytmic tissues
    return(des.mat[, cols.int])
  }
  
  # keep subset of rhythmic terms, depending on rhythmic tissues
  grep.str <- paste0(rhythmic.tissues, ":")  # greps sin and cos terms
  grep.str <- paste0(grep.str, collapse = "|")  # greps OR
  
  cols.rhyth <- grepl(grep.str, colnames(des.mat))
  # take intercept and subset of rhythmics as subset
  des.mat <- des.mat[, cols.int | cols.rhyth]
  return(des.mat)
}

GetDesignMatrices <- function(dat, formula){
  # Return all possible design matrices from dat
}

GetAllCombos <- function(tissues, ignore.full = TRUE){
  # get subsets of tissues possible
  # ignore.full: does not consider the full model as a combination
  tissue.combos.lst <- list()
  model.i <- 1
  
  if (ignore.full){
    max.rhyth <- length(tissues) - 1  # ignore full model
  } else {
    max.rhyth <- length(tissues)
  }
  
  for(i in 0:max.rhyth){
    tissues.combo <- combn(tissues, i)
    if (is.null(ncol(tissues.combo))){
      # only one choice, put that choice into list
      tissue.combos.lst[[model.i]] <- tissues.combo
      model.i <- model.i + 1
    }
    for(j in seq(ncol(tissues.combo))){
      tissue.combos.lst[[model.i]] <- tissues.combo[, j]
      model.i <- model.i + 1
    }
  }
  return(tissue.combos.lst)
}

BICWeight <- function(bic.vec){
  # Vector of BIC values, convert to weights "probability"
  # BIC = -2 log (BayesFactor)
  # therefore, BICw = exp(-0.5 * BIC) normalized across all BICs
  bic.vec.weight <- exp(-0.5 * bic.vec)
  bic.vec.weight.frac <- bic.vec.weight / sum(bic.vec.weight)
}

FitCombinations <- function(dat.gene, tiss.combos, N=3, n.cores=30){
  # Return dataframe of combinations, fit, BIC for each possible model (2^N)
  # tiss.combos: list of tissue combinations to run lienar model
  # N: return only a subset of BIC models (top 3 by default)
  tissues <- unique(dat.gene$tissue)
  gene <- dat.gene$gene[[1]]
  n.models <- 2 ^ length(tissues)
  
  fits.lst <- list()
  bics.lst <- vector(length = n.models)
  
  #   print(paste("Number of models to fit:", n.models))
  
  # BEGIN: FULL DESIGN MATRIX
  form <- GetRhythmicFormula("exprs", "time", c("tissue", "experiment"), with.intercept = FALSE)
  des.mat.full <- model.matrix(form, dat.gene)
  # END: FULL DESIGN MATRIX
  
  # BEGIN: INIT FITTING WITH FULL MODEL
  # init with fitting full model, then just update with new formula
  des.mat.sub <- des.mat.full  # rename to keep fit output names consistent
  # fit.full <- lm(exprs ~ 0 + des.mat.sub, data = dat.gene)
  # END: INIT FITTING
  
  # BEGIN: FIT DIFFERENT COMBOS
  
  # PARALLEL
  #   out.lst <- mclapply(tiss.combos, function(tiss.combo){
  #     des.mat.sub <- SubsetFullDesignMatrix(des.mat.full, tiss.combo)
  #     fit.long <- dat.gene %>% do(mod = lm(exprs ~ 0 + des.mat.sub, data = .)) %>%
  #       mutate(rhyth.tiss = paste0(tiss.combo, collapse = ","), 
  #              bic = BIC(mod[[1]]))
  #     return(fit.long)
  #   }, mc.cores = n.cores)
  
  # SERIAL
  #   out.lst <- lapply(tiss.combos, function(tiss.combo){
  #     des.mat.sub <- SubsetFullDesignMatrix(des.mat.full, tiss.combo)
  #     fit.long <- dat.gene %>% do(mod = lm(exprs ~ 0 + des.mat.sub, data = .)) %>%
  #       mutate(rhyth.tiss = paste0(tiss.combo, collapse = ","), 
  #              bic = BIC(mod[[1]]))
  #     return(fit.long)
  #   })
  
  out.lst <- lapply(tiss.combos, function(tiss.combo){
    des.mat.sub <- SubsetFullDesignMatrix(des.mat.full, tiss.combo)
    fit.long <- dat.gene %>% 
      do(mod = FitRhythmicDesMat(., des.mat.sub)) %>%
      mutate(rhyth.tiss = paste0(tiss.combo, collapse = ","))
    return(fit.long)
  })
  
  # END: FIT DIFFERENT COMBOS
  out.df <- do.call(rbind, out.lst)
  # out.df$bicweight <- BICWeight(out.df$bic)
  out.df$bicweight <- BICWeight(sapply(out.df$mod, function(x) x$bic))
  
  # get top 3
  out.df <- out.df[order(out.df$bicweight, decreasing = TRUE), ][1:N, ]
  out.df$gene <- gene
  # gc()  # too slow
  return(out.df)
}

FitRhythmicDesMat <- function(dat, des.mat){
  # Fit rhythmic with design matix
  mod <- lm(exprs ~ 0 + des.mat, data = dat)
  bic <- BIC(mod)
  return(list(fit = coef(mod), bic = bic))
}
jakeyeung/TissueCiradianAnalysis documentation built on Aug. 7, 2020, 7:58 p.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
jakeyeung/TissueCiradianAnalysis
Analysis of oscillatory RNASeq data

R/NcondsFunctions.R
In jakeyeung/TissueCiradianAnalysis: Analysis of oscillatory RNASeq data

R Package Documentation

Browse R Packages

We want your feedback!

jakeyeung/TissueCiradianAnalysis Analysis of oscillatory RNASeq data

R/NcondsFunctions.R In jakeyeung/TissueCiradianAnalysis: Analysis of oscillatory RNASeq data

R Package Documentation

Browse R Packages

We want your feedback!

jakeyeung/TissueCiradianAnalysis
Analysis of oscillatory RNASeq data

R/NcondsFunctions.R
In jakeyeung/TissueCiradianAnalysis: Analysis of oscillatory RNASeq data