TissueCiradianAnalysis: Analysis of oscillatory RNASeq data

GetGeneModelKeys <- function(dat, tiss){
  rhyth.tiss <- gsub(pattern = ";", replacement = ",", x = dat$model)
  rhyth.tiss <- strsplit(rhyth.tiss, ",")[[1]]
  rhyth.tiss <- sapply(rhyth.tiss, function(tiss) strsplit(tiss, "_")[[1]][[1]], USE.NAMES = FALSE)
  
  is.rhyth <- tiss %in% rhyth.tiss
  return(data.frame(tissue = tiss, is.rhyth = is.rhyth))
}

GetMaxDist <- function(proms, tiss.vec){
  dist.tiss <- as.matrix(dist(proms))
  max.dist.tiss <- max(dist.tiss)
  rowcol.i <- which(dist.tiss == max.dist.tiss, arr.ind = TRUE)[1, ]  # take first max, there wil be two
  tiss <- paste(tiss.vec[rowcol.i[1]], tiss.vec[rowcol.i[2]], sep = ";")
  return(data.frame(max.dist = max.dist.tiss, tissue = tiss))
}

GetPromoterUsageMaxDist <- function(dat, jvar = "tpm_norm.avg"){
  proms.full <- GetPromoterUsage(dat, jvar = jvar, do.svd = TRUE, append.tiss = TRUE, get.means = FALSE, get.entropy = FALSE)  
  tiss.vec <- proms.full[[1]]$tissue
  proms <- subset(proms.full$dat.mat.trans, select = -c(amp, tissue))
  dist.tiss <- as.matrix(dist(proms))
  max.dist.tiss <- max(dist.tiss)
  rowcol.i <- which(dist.tiss == max.dist.tiss, arr.ind = TRUE)[1, ]  # take first max, there wil be two
  tiss <- paste(tiss.vec[rowcol.i[1]], tiss.vec[rowcol.i[2]], sep = ";")
  return(data.frame(max.dist = max.dist.tiss, tissue = tiss))
}
# plot top hits
PlotPromoter <- function(tpm.afe.avg.sub, jtitle="Title"){
  proms.full <- GetPromoterUsage(tpm.afe.avg.sub, jvar = "tpm_norm.avg", get.means = FALSE, get.entropy = FALSE)
  proms <- subset(proms.full$dat.mat.trans, select = -c(amp, tissue))
  plot(proms[, 1], proms[, 2], main = jtitle, xlab = "Promoter usage (1st component)", ylab = "Promoter usage (2nd component)")
  par(cex.axis=1, cex.lab=2, cex.main=2, cex.sub=1)
  text(proms[, 1], proms[, 2], proms.full$dat.mat.trans$tissue)
}

add.alpha <- function(col, alpha=1){
  if(missing(col))
    stop("Please provide a vector of colours.")
  apply(sapply(col, col2rgb)/255, 2, 
        function(x) 
          rgb(x[1], x[2], x[3], alpha=alpha))  
}

PromoterSpacePlots.nostics <- function(tpm.gauss.sigs, jgene, jtitle, draw.ellipse = TRUE){
  proms <- tpm.gauss.sigs$proms
  n.proms <- ncol(proms)
  if (missing(jtitle)){
    jtitle <- paste(jgene, "n.proms", n.proms)
  }
  plot(proms[, 1], proms[, 2], main = jtitle, xlab = "Promoter usage (1st component)", ylab = "Promoter usage (2nd component)")
  par(cex.axis=1, cex.lab=2, cex.main=2, cex.sub=1)
  
  jcols <- vector(length = length(tpm.gauss.sigs$amp))
  jcols[which(tpm.gauss.sigs$amp == 1)] <- "blue"
  jcols[which(tpm.gauss.sigs$amp == 0)] <- "red"
  text(proms[, 1], proms[, 2], labels = tpm.gauss.sigs$tissue, col = jcols)
  # draw ellipse
  if (draw.ellipse){
    try(lines(ellipse(tpm.gauss.sigs$sig1$cov, level = 0.5, centre = tpm.gauss.sigs$sig1$center), type='l', col = "blue"), silent = T)
    try(lines(ellipse(tpm.gauss.sigs$sig2$cov, level = 0.5, centre = tpm.gauss.sigs$sig2$center), type='l', col = "red"), silent = T)
  }
}

PromoterSpacePlots <- function(tpm.afe.avg, jgene, jvar = "tpm_norm.avg", draw.ellipse = TRUE, use.weights = TRUE, transcript_id = "transcript_id"){
  tpm.test = tryCatch({
    tpm.test <- subset(tpm.afe.avg, gene_name == jgene & tissue != "WFAT")
  }, error = function(e) {
    tpm.test <- subset(tpm.afe.avg, gene == jgene)
  })
  test.svd <- GetPromoterUsage(tpm.test, jvar = jvar, do.svd = T, append.tiss = TRUE, get.means = TRUE, transcript_id = transcript_id)

  proms <- subset(test.svd$dat.mat.trans, select = -c(amp, tissue))
  amp <- test.svd$dat.mat.trans$amp
  weights1 <- (amp - min(amp)) / (max(amp) - min(amp))
  weights2 <- 1 - weights1
  
  # center1
  mu1 <- colSums(sweep(proms, MARGIN = 1, STATS = weights1, FUN = "*")) / sum(weights1)
  sig1 <- cov.wt(proms, wt = weights1)
  pi1 <- sum(weights1) / length(weights1)
  
  # center2
  mu2 <- colSums(sweep(proms, MARGIN = 1, STATS = weights2, FUN = "*")) / sum(weights2)
  sig2 <- cov.wt(proms, wt = weights2)
  pi2 <- sum(weights2) / length(weights2)
  
  plot(test.svd$dat.mat.trans[, 2], test.svd$dat.mat.trans[, 3], main = paste(jgene, "Amp range:", diff(range(amp))))
  if (use.weights){
    myColoursAlpha <- sapply(weights1, function(a) add.alpha(1, alpha=a))
    text(test.svd$dat.mat.trans[, 2], test.svd$dat.mat.trans[, 3], labels = test.svd$dat.mat.trans$tissue, col = myColoursAlpha)
  } else {
    jcols <- vector(length = length(amp))
    jcols[which(amp == 1)] <- "blue"
    jcols[which(amp == 0)] <- "red"
    text(test.svd$dat.mat.trans[, 2], test.svd$dat.mat.trans[, 3], labels = test.svd$dat.mat.trans$tissue, col = jcols)
  }
  points(mu1[1], mu1[2], pch = "*", col = "blue", cex = 5)
  points(mu2[1], mu2[2], pch = "*", col = "red", cex = 5)
  # draw ellipse
  if (draw.ellipse){
    try(lines(ellipse(sig1$cov, level = 0.5, centre = sig1$center), type='l', col = "blue"), silent = T)
    try(lines(ellipse(sig2$cov, level = 0.5, centre = sig2$center), type='l', col = "red"), silent = T)
  }
  
  #   dist1 <- FuzzyDistance(proms, mu1, amp)
  #   dist2 <- FuzzyDistance(proms, mu2, amp)
  #   print(paste("Intracluster score", sum(dist1, dist2)))
  #   print(paste("Interscore", sum((mu2 - mu1) ^ 2)))
  #   
  #   # Gaussian distribution
  #   intraprob1 <- sum(weights1 * apply(proms, 1, function(x) mvtnorm::dmvnorm(x, mean = mu1, sigma = sig1$cov))) / sum(weights1)
  #   intraprob2 <- sum(weights2 * apply(proms, 1, function(x) mvtnorm::dmvnorm(x, mu2, sigma = sig2$cov))) / sum(weights2)
  #   interprob1 <- sum(weights2 * apply(proms, 1, function(x) mvtnorm::dmvnorm(x, mu1, sig1$cov))) / sum(weights2)
  #   interprob2 <- sum(weights1 * apply(proms, 1, function(x) mvtnorm::dmvnorm(x, mu2, sig1$cov))) / sum(weights1)
}

HellingerDistance <- function(dat){
  # calculate distance betwene two rows of matrix
  if (nrow(dat) != 2){
    # print(paste("Number of columns:", ncol(dat)))
    warning("Hellinger Distance only computes distance between two rows.")
    # print("Dim:")
    # print(dim(dat))
    return(NA)
  }
  return(sqrt(sum(apply(dat, 2, function(colm) (sqrt(colm[1]) - sqrt(colm[2]))) ^ 2)))
}


CalculateDistance <- function(dat, dist.method = "hellinger", jvar = "tpm_norm.avg", transcript_id = "transcript", return.as.df=FALSE){
  # simpler than GaussianCenters
  # For Liver and Kidney, calculate distance (either euclidean or hellinger)
  if (length(unique(dat$tissue)) <= 1){
    ifelse(return.as.df, data.frame(NULL), NA)
  }
  proms <- GetPromoterUsage(dat, jvar = jvar, do.svd = FALSE, append.tiss = TRUE, get.means = FALSE, get.entropy = FALSE, transcript_id = transcript_id, get.prom.only = TRUE)
  if (nrow(proms) != 2){
    warning("Expecting only two rows in GetPromoterUsage() output")
  }
  if (dist.method == "euclidean"){
    proms.dist <- as.numeric(dist(as.matrix(proms), method = "euclidean"))
  } else if (dist.method == "hellinger"){
    proms.dist <- HellingerDistance(proms)
  }
  if (!return.as.df){
    return(proms.dist)
  } else {
    gene <- dat$gene[[1]]
    return(data.frame(gene = gene, proms.dist = proms.dist))
  }
}

CalculateGaussianCenters <- function(dat, jvar = "tpm_norm.avg", thres = 0.9, do.svd = TRUE, transcript_id = "transcript_id"){
  if (length(unique(dat$tissue)) <= 1){
    return(NA)
  }
  proms.full <- GetPromoterUsage(dat, jvar = jvar, do.svd = do.svd, append.tiss = TRUE, get.means = TRUE, get.entropy = FALSE, transcript_id = transcript_id)  
  proms <- subset(proms.full$dat.mat.trans, select = -c(amp, tissue))
  amp <- proms.full$dat.mat.trans$amp
  
  if (diff(range(amp)) == 0){
    return(NA)
  }
  weights1 <- (amp - min(amp)) / (max(amp) - min(amp))
  weights2 <- 1 - weights1
    
  # center1
  sig1 <- cov.wt(proms, wt = weights1)
  mu1 <- sig1$center
  pi1 <- sum(weights1) / length(weights1)
  
  # center2
  sig2 <- cov.wt(proms, wt = weights2)
  mu2 <- sig2$center
  pi2 <- sum(weights2) / length(weights2)
  
  return(list(mu1 = mu1, mu2 = mu2, sig1 = sig1, sig2 = sig2, proms = proms, amp = amp, tissue = as.character(proms.full$dat.mat.trans$tissue)))
}

CalculateGaussianDists <- function(dat){
  # sigs is output from CalculateGaussianCenters
  # calculate inter and intra cluster scores
  
  sigs <- dat$sigs[[1]]
  proms <- sigs$proms
  amp <- sigs$amp
  
  # center1
  sig1 <- sigs$sig1
  mu1 <- sig1$center
  weights1 <- sig1$wt
  #   pi1 <- sum(weights1) / length(weights1)
  
  # center2
  sig2 <- sigs$sig2
  mu2 <- sig2$center
  weights2 <- sig2$wt
  #   pi2 <- sum(weights2) / length(weights2)
  
  center.dists <- sum((mu2 - mu1) ^ 2)
  amp.range <- max(amp) - min(amp)
  intrascore1 <- sum(weights1 * apply(proms, 1, function(x) mvtnorm::dmvnorm(x, mean = mu1, sigma = sig1$cov))) / sum(weights1)
  intrascore2 <- sum(weights2 * apply(proms, 1, function(x) mvtnorm::dmvnorm(x, mu2, sigma = sig2$cov))) / sum(weights2)
  interscore1 <- sum(weights2 * apply(proms, 1, function(x) mvtnorm::dmvnorm(x, mu1, sig1$cov))) / sum(weights2)
  interscore2 <- sum(weights1 * apply(proms, 1, function(x) mvtnorm::dmvnorm(x, mu2, sig1$cov))) / sum(weights1)
  return(data.frame(center.dists=center.dists, amp.range = amp.range, intrascore1=intrascore1, intrascore2=intrascore2, interscore1=interscore1, interscore2=interscore2))
}

RunFuzzyDistance <- function(dat, jvar = "tpm_norm.avg", thres = 0.9, do.svd = TRUE){
  if (length(unique(dat$tissue)) <= 1){
    return(data.frame(amp.range = NA, intra.score = NA, inter.score = NA))
  }
  proms.full <- GetPromoterUsage(dat, jvar = jvar, do.svd = do.svd, append.tiss = TRUE, get.means = TRUE, get.entropy = FALSE)  
  proms <- subset(proms.full$dat.mat.trans, select = -c(amp, tissue))
  amp <- proms.full$dat.mat.trans$amp
  weights1 <- (amp - min(amp)) / (max(amp) - min(amp))
  weights2 <- 1 - weights1
  
  # center1
  mu1 <- colSums(sweep(proms, MARGIN = 1, STATS = weights1, FUN = "*")) / sum(weights1)
  
  # center2
  mu2 <- colSums(sweep(proms, MARGIN = 1, STATS = weights2, FUN = "*")) / sum(weights2)
  
  amp.range <- max(amp) - min(amp)
  dist1 <- FuzzyDistance(proms, mu1, amp)
  dist2 <- FuzzyDistance(proms, mu2, amp)
  intra.score <- sum(dist1, dist2)
  inter.score <- sum((mu2 - mu1) ^ 2)
  return(data.frame(amp.range = amp.range, intra.score = intra.score, inter.score = inter.score))
}

PromAmpScore <- function(prom.vec1, prom.vec2, amp1, amp2){
  euc.dist <- sqrt(sum((prom.vec1 - prom.vec2) ^ 2))
  amp.dist <- abs(amp1 - amp2)
  return((euc.dist * amp.dist))
}

# distance score
FuzzyDistance <- function(jpoints, jcluster, weights){
  # https://en.wikipedia.org/wiki/Fuzzy_clustering
  dist <- sweep(jpoints, MARGIN = 2, STATS = jcluster, FUN = "-")
  dist <- sweep(dist, MARGIN = 1, STATS = weights, FUN = "*")
  dist <- sum(apply(dist, MARGIN = 1, function(x) sum(x^2)))
  return(dist)
}


DoCanCor <- function(tpm.sub, xvar = "tpm_norm.avg", yvar = "amp"){
  # do canonical correlation of two matrices to find interesting
  # alternative promoter usage
  if (nrow(tpm.sub) == 0){
    return(data.frame(Cor = NA))
  }
  X <- dcast(data = tpm.sub, formula = transcript_id ~ tissue, value.var = xvar)
  rownames(X) <- X$transcript_id; X$transcript_id <- NULL
  X <- t(X)
  jtrans <- tpm.sub$transcript_id[[1]]
  Y <- data.frame(subset(tpm.sub, transcript_id == jtrans, select = c(yvar, "tissue")))
  rownames(Y) <- as.character(Y$tissue); Y$tissue <- NULL
  jcor <- cancor(X, Y, xcenter = T, ycenter = T)
  return(data.frame(Cor = jcor$cor))
}

KeepUpToThres <- function(vec, thres, min.dim = 2){
  # keep vector up to threshold return by index
  for (i in seq(length(vec))){
    if (vec[i] > thres){
      break
    }
  }
  if (i < min.dim){
    return(i:min.dim)
  }
  return(1:i)
}

GetPromoterUsage <- function(dat, jvar = "tpm_norm.avg", do.svd = TRUE, thres = 0.9, append.tiss = TRUE, get.means=TRUE, get.entropy=TRUE, 
                             transcript_id = "transcript_id", return.transcripts=FALSE, get.prom.only=FALSE){
  # get promoter usage
  if (!is.null(dat$mean)){
    jform <- paste0("tissue + amp + mean ~ ", transcript_id)
    # dat.mat <- dcast(dat, tissue + amp + mean~ transcript_id, value.var = jvar)
    dat.mat <- dcast(dat, as.formula(jform), value.var = jvar)
    dat.mat.prom <- subset(dat.mat, select = -c(tissue, amp, mean))
  } else {
    jform <- paste0("tissue + amp ~ ", transcript_id)
    # dat.mat <- dcast(dat, tissue + amp + mean~ transcript_id, value.var = jvar)
    dat.mat <- dcast(dat, as.formula(jform), value.var = jvar)
    dat.mat.prom <- subset(dat.mat, select = -c(tissue, amp))
  }
  if (do.svd){
    # reduce dim
    dat.mat.prom <- sweep(dat.mat.prom, MARGIN = 1, STATS = rowMeans(dat.mat.prom), FUN = "-")
    dat.mat.prom.s <- svd(dat.mat.prom)
    eigvals <- dat.mat.prom.s$d ^ 2 / sum(dat.mat.prom.s$d ^ 2)
    eigvals.cum <- cumsum(eigvals)
    # threshold for number of eigvals to keep
    keep <- KeepUpToThres(eigvals.cum, thres, min.dim = 2)
    if (length(dat.mat.prom.s$d) == 1){
      print("Unexpected here:")
      print(dat)
    }
    dat.mat.prom <- sweep(dat.mat.prom.s$u[, keep], MARGIN = 2, STATS = dat.mat.prom.s$d[keep], FUN = "*")
    dat.mat.trans <- data.frame(amp = dat.mat$amp, dat.mat.prom) 
    if (return.transcripts){
      dat.mat.v <- sweep(dat.mat.prom.s$v[, keep], MARGIN = 2, STATS = dat.mat.prom.s$d[keep], FUN = "*")
      # take most polarizing two transcripts
      transcripts <- colnames(subset(dat.mat, select = -c(tissue, amp)))
      dat.mat.v.mean <- apply(dat.mat.v, 1, function(row) sum(row ^ 2))
      
      n <- length(dat.mat.v.mean)
      tx1.i <- which.max(dat.mat.v.mean)[[1]]
      tx2.i <- which(dat.mat.v.mean == sort(dat.mat.v.mean, partial = n - 1)[n - 1])[[1]]
      
      dat.mat.v.keep <- dat.mat.v.mean[c(tx1.i, tx2.i)]
      tx <- transcripts[c(tx1.i, tx2.i)]
      
      dat.mat.trans <- cbind(dat.mat.trans, dat.mat.v.keep)
      dat.mat.trans$transcript <- tx
    }
  } else {
    dat.mat.trans <- data.frame(amp = dat.mat$amp, dat.mat.prom)
  }
  
  if (append.tiss){
    dat.mat.trans$tissue <- dat.mat$tissue
  }
  out <- list(dat.mat.trans = dat.mat.trans)
  if (get.means){
    # out$weights <- dat.mat$mean
    out$mean <- dat.mat$mean
  }
  if (get.entropy){
    dat.H <- apply(dat.mat.prom, 1, function(x) ShannonEntropy(x, normalize = FALSE))
    out$entropy <- dat.H
  }
  if (get.prom.only){
    return(subset(out$dat.mat.trans, select = c(-amp, -tissue)))
  }
  return(out)
}

CorrelateAmpPromMulti <- function(dat, jvar = "tpm_norm.avg", thres = 0.9, do.svd = TRUE, weighted = FALSE, eps = 1e-10){
  dat.mat.trans.lst <- GetPromoterUsage(dat, jvar = jvar, do.svd = do.svd, thres = thres, append.tiss = FALSE, get.means = TRUE, get.entropy = TRUE)
  dat.mat.trans <- dat.mat.trans.lst$dat.mat.trans
  weights <- dat.mat.trans.lst$weights
  entropy <- dat.mat.trans.lst$entropy
  if (weighted){
    # by shannon entropy
    weights <- 1 - entropy
    weights <- weights + eps  # if zero?
    fit.altprom <- lm(formula = amp ~ ., data = dat.mat.trans, weights = weights)
  } else {
    fit.altprom <- lm(formula = amp ~ ., data = dat.mat.trans)
  }
  # return just R-squared and best p-value
  rsqr <- summary(fit.altprom)$r.squared
  coefs <- summary(fit.altprom)$coefficients
  pval.best <- min(coefs[2:nrow(coefs), "Pr(>|t|)"])
  return(data.frame(rsqr = rsqr, pval.best = pval.best))
}

# Functions: kallisto -----------------------------------------------------

PlotDiagnostics <- function(dat.tpm, dat.arrayrnaseq, jgene, jtranscript){
  source('scripts/functions/PlotGeneAcrossTissues.R')
  # uses data objects from find_alternative_first_exons_kallisto.R OR filter_kallisto_for_start_sites.R
  # Plot diagnostic plots, given a gene and transcript
  # jgene <- "Sorbs1"; jtranscript="ENSMUST00000165469"  # Liver and BFAT looks rhythmic, but assigned as "not rhythmic"
  
  test.gene <- subset(dat.tpm, gene_name == jgene)
  test <- subset(test.gene, transcript_id == jtranscript)
  
  rhythmic.tissues <- paste(unique(test$tissue[which(test$is.rhythmic == TRUE)]), collapse = ",")
  notrhythmic.tissues <- paste(unique(test$tissue[which(test$is.rhythmic == FALSE)]), collapse = ",")
  
  print(PlotGeneAcrossTissues(subset(dat.arrayrnaseq, gene == jgene), 
                              jtitle = paste(jgene, "\nRhythmic Tissues:", rhythmic.tissues, "\nFlat Tissues:", notrhythmic.tissues)))
  
  print(PlotTpmAcrossTissues(test.gene, jtitle = paste(jgene, jtranscript), log2.transform=FALSE))
  print(PlotTpmAcrossTissues(test, jtitle = paste(jgene, jtranscript), log2.transform=TRUE))
  
  print(ggplot(test.gene, aes(y = tpm_normalized, x = tissue)) + 
          geom_boxplot() + 
          facet_wrap(~transcript_id) + 
          ggtitle(jgene) +
          theme(axis.text.x = element_text(angle = 90, hjust = 1)))
  
  print(ggplot(test, 
               aes(y = tpm_normalized, x = is.rhythmic)) + 
          geom_boxplot() + 
          ggtitle(paste(jgene, jtranscript)))
}

PlotDiagnostics2 <- function(dat.tpm, tpm.avg, dat.long, jgene, jtranscript){
  source('scripts/functions/PlotGeneAcrossTissues.R')
  # updated version plotting the linear trend
  dat.gene <- subset(dat.tpm, gene_name == jgene)
  dat.transcript <- subset(dat.gene, transcript_id == jtranscript)
  tpm.avg.sub <- subset(tpm.avg, transcript_id == jtranscript)
  
  print(PlotGeneAcrossTissues(subset(dat.long, gene == jgene), jtitle = paste(jgene, jtranscript, sep = "\n")))
  print(PlotTpmAcrossTissues(dat.gene, jtitle = paste("linear-scale", jgene, jtranscript, sep = "\n"), log2.transform=FALSE))
  print(PlotTpmAcrossTissues(dat.transcript, jtitle = paste("log-scale", jgene, jtranscript, sep = "\n"), log2.transform=TRUE))
  print(ggplot(tpm.avg.sub, aes(y = tpm_norm.avg, x = relamp, label = tissue)) + geom_point() + geom_text() + ggtitle(jgene) + geom_smooth(method = "lm"))
}

IsTissueSpecific2 <- function(jdf, pval.min = 1e-5, pval.max = 0.05, cutoff = 4.89){
  # given list of pvals, check if it contains pvals less than pval.min and greater than pval.max.
  # check if pval contains values less than pval.min (rhythmic) and greater than pval.max (flat)
  # if so, return TRUE, otherwise FALSE
  avg.exprs <- jdf$int.rnaseq
  pvals <- jdf$pval
  pvals.filt <- pvals[which(!is.nan(pvals) & avg.exprs > cutoff)]
  if (min(pvals.filt) < pval.min & max(pvals.filt) > pval.max){
    jdf$is.tissue.spec.circ <- rep(TRUE, length(pvals))
  } else {
    jdf$is.tissue.spec.circ<- rep(FALSE, length(pvals))
  }
  return(jdf)
}

IsRhythmic2 <- function(pval, avg.exprs, pval.min = 1e-5, cutoff = 6){
  # ask if gene is rhythmic, not rhythmic or NA (lowly expressed or 0)
  # check if pval is less than pval.min
  if (is.nan(pval) | avg.exprs < cutoff){
    return(NA)
  }
  if (pval < pval.min){
    return(TRUE)
  } else {
    return(FALSE)
  }
}

ModelRhythmicity <- function(dat, jformula=tpm_normalized ~ is.rhythmic){
  # check it contains both Rhythmic and NotRhythmic elements
  if (length(unique(dat$is.rhythmic)) != 2){
    return(data.frame(int = NA, coef = NA, pval = NA))
  }
  jfit <- lm(data = dat, formula = jformula)
  # int <- summary(jfit)$mat["(Intercept)", "Coef"]
  # jcoef <- summary(jfit)$mat["rhythmic.or.notRhythmic", "Coef"]
  int <- coef(jfit)[[1]]
  jcoef <- coef(jfit)[[2]]
  pval <- summary(jfit)$coefficients["is.rhythmicTRUE", "Pr(>|t|)"]
  return(data.frame(int = int, coef = jcoef, pval = pval))
}

CalculateFractionIsoformUsage <- function(tpm, pseudocount = 0){
  # given tpm of a sample across known isoforms, compute
  # fraction of isoform usage. 
  # 
  # Add pseudo count to increase robustness to lowly expressed genes
  tpm_normalized <- (tpm + pseudocount) / (sum(tpm + pseudocount))
}


# Functions: alternative first exons --------------------------------------


KsTestPosNeg <- function(jdf){
  # 
  x.neg <- jdf$sitecount[which(jdf$hit.or.not == "Neg")]
  x.pos <- jdf$sitecount[which(jdf$hit.or.not == "Pos")]
  jks <- ks.test(x.neg, x.pos)
  stat <- jks$statistic
  pval <- jks$p.value
  return(data.frame(statistic = stat, pval = pval))
}

FitPosNeg <- function(jdf){
  jfit <- lm(formula = sitecount ~ hit.or.not, data = jdf)
  summary(jfit)$coefficients["hit.or.notPos", "Pr(>|t|)"]
  pval <- summary(jfit)$coefficients["hit.or.notPos", "Pr(>|t|)"]
  coef <- summary(jfit)$coefficients["hit.or.notPos", "Estimate"]
  return(data.frame(coef = coef, pval = pval))
}

HitOrNot <- function(coef, pval, max.pval=1e-5, min.pval = 0.05){
  if (pval < max.pval){
    if (coef > 0){
      s = "Pos"
    } else if (coef < 0){
      s = "Neg"
    } else {
      warning("Coefficient is 0 but called a hit")
    }
  } else if (pval < min.pval){
    s = NA
  } else {
    s = "NotHit"
  }
  return(s)
}

FitDfToMatrix <- function(jdf, common.genes){
  dat.fitrhyth.filt <- data.frame(subset(jdf, gene %in% common.genes, select = c(gene, tissue, as.numeric(pval), amp)))  # faster
  rnames <- apply(dat.fitrhyth.filt, 1, function(x) paste0(x[2], '-', x[1]))
  rownames(dat.fitrhyth.filt) <- rnames; rm(rnames)
  dat.fitrhyth.filt <- subset(dat.fitrhyth.filt, select = c(pval, amp))
  dat.fitrhyth.filt <- data.matrix(dat.fitrhyth.filt)  # faster to work with matrices
  return(dat.fitrhyth.filt)
}

GetRhythmicOrNot <- function(x, fitdf){
  # Expect x to be a row from cov.normreads, with tissue and gene in 2nd and 3rd col
  tiss <- x[2]
  gene <- x[3]
  rname <- paste0(tiss, '-', gene)
  fitdf.sub = tryCatch({
    fitdf[rname, ]
  }, warning = function(w) {
    print("Warning")
    print(w)
  }, error = function(e) {
    # print(paste("Cannot access:", rname))
    return(NA)
  })  
  if (is.na(fitdf.sub[1])){
    return(NA)
  }
  pval <- fitdf.sub[1]
  amp <- fitdf.sub[2]
  annots <- RhythmicOrNot(pval, amp)
  return(annots)
}

RhythmicOrNot <- function(pval, amp, min.pval = 1e-5, max.pval = 0.05, max.amp = 0.5, min.amp = 0.1){
  if (pval < min.pval & amp > max.amp){
    return("Rhythmic")
  } else if (pval > max.pval & amp < min.amp){
    return("NotRhythmic")
  } else {
    return(NA)  # undecided
  }
}

FoldChangeRhyth <- function(jdf){
  # Calculate log2 fold change between "rhythmic" and "non rhythmic" genes 
}

cossim <- function(x, y){
  return(x %*% y / sqrt(x%*%x * y%*%y))
}

LoopCor <- function(m, show.which=FALSE, input.vec1=NA){
  imin <- NA
  jmin <- NA
  jcor.min <- 2  # init because pearson cor is between -1 and 1. Use a number outside of this range.
  for (i in 1:ncol(m)){
    if (!is.na(input.vec1)){
      i <- input.vec1
    }
    vec1 <- m[, i]
    for (j in i:ncol(m)){
      if (j == i){
        next  # dont need to compare between same vec
      }
      vec2 <- m[, j]
      jcor <- cossim(vec1, vec2)
      if (jcor < jcor.min){
        jcor.min <- jcor
        imin <- i
        jmin <- j
      }
    }
    if (!is.na(input.vec1)){
      break
    }
  }
  if (show.which){
    jtissues <- c(colnames(m)[imin], colnames(m)[jmin])
    print(jtissues)
    return(list(tissues = jtissues,
                min.cor = jcor.min))
  } 
  return(jcor.min)
}

GetMinCor <- function(df){
  m <- acast(data = df, transcript ~ tissue, value.var = "norm_reads")
  jcor.min <- LoopCor(m)
  return(data.frame(min.cor = jcor.min))
}

Normalize <- function(x, pseudocount = 1){
  if (pseudocount > 0){
    x <- x + pseudocount
  }
  x.norm <- x / sum(x)
  return(x.norm)
}

AvgAcrossTranscripts <- function(df){
  ddply(df, .(transcript, gene, tissue), summarise, mean_reads = mean(reads))
}

NormalizeReads <- function(df){
  # Normalize reads across transcripts
  ddply(df, .(gene), transform, norm_reads = Normalize(mean_reads))
}

GetGeneNameFromAnnot <- function(annot){
  # from gene_name=RP23-271O17.1;transcript_id=ENSMUST00000193812 retrieve RP23-27017.1
  gene.str <- strsplit(as.character(annot), ';')[[1]][[1]]
  gene.str <- strsplit(gene.str, '=')[[1]][[2]]
  return(gene.str)
}

GetTranscriptIDFromAnnot <- function(annot){
  # from gene_name=RP23-271O17.1;transcript_id=ENSMUST00000193812 retrieve ENSMUST...
  transcript.str <- strsplit(as.character(annot), ';')[[1]][[2]]
  transcript.str <- strsplit(transcript.str, '=')[[1]][[2]]
  return(transcript.str)
}

rowMax <- function(df){
  # Return vector of maximums from df
  return(apply(df, 1, max))
}

GetTissuesAFE <- function(x){
  # Get tissues from column names from Adr_CT22 format
  substr(x, "_")[[1]][[1]]
}

TissueMapping <- function(cov.to.rnaseq = TRUE){
  # Tissue names from coverage are slightly different from
  # tissue names from RNASeq data. Create the mapping between 
  # coverage to rnaseq tissue names
  list("Adr" = "Adr",
       "Aor" = "Aorta",
       "BFat" = "BFAT",
       "Bstm" = "BS",
       "Cer" = "Cere",
       "Hrt" = "Heart",
       "Hyp" = "Hypo",
       "Kid" = "Kidney",
       "Liv" = "Liver",
       "Lun" = "Lung",
       "Mus" = "Mus",
       "WFat" = "WFAT")
}

ConvertTissuesToMatchRnaSeq <- function(tissues){
  # make tissue names look like RNASeq column names using TissueMapping
  tissue.map <- TissueMapping()
  sapply(tissues, function(x){
    tissue.map[[x]]
  })
}

GetExprsAsVector <- function(dat, genes, tissuetime){
  # Given a vector of rownames and column names, extract
  # its corresponding element in the dat. Return as
  # a vector.
  if (length(genes) != length(tissuetime)) print("Genes and tissuetime is not same length")
  lookups <- mapply(function(x, y){
    return(dat[x, y])
  }, x = genes, y = tissuetime)
  return(lookups)
}

GetLocationFromAnnotation <- function(bed, gene_name, transcript_id){
  # Given bed, gene_name and transcript_id, return the chromo, start, end
  annot <- paste0("gene_name=", gene_name, ";transcript_id=", transcript_id)
  sub <- subset(bed, annotations == annot)
  # return as UCSC-style
  return(paste0(sub$chromosome, ":", sub$start, "-", sub$end))
}

SubsetBed <- function(bed, gene_name, transcript){
  # Subset bed based on grepping annotations from gene name
  if (missing(transcript)){
    return(bed[grepl(gene_name, bed$annotations), ])  
  } else if (missing(gene_name)){  
    return(bed[grepl(transcript, bed$annotations), ])  
  }
}

NormalizeBySum <- function(x){
  # Normalize a vector by its sum
  return(x / sum(x))
}

GetFirst <- function(x){
  return(x[1])
}

ShannonEntropy <- function(x.vec, normalize=FALSE){
  if (normalize){
    # should sum to 1
    x.vec <- x.vec / sum(x.vec)
  }
  entropy <- 0
  for (x in x.vec){
    entropy <- entropy + x * log2(1 / x)
  }
  entropy <- entropy / log2(length(x.vec))
  return(entropy)
}

MulitpleStarts <- function(df, min_dist = 500){
  # Check if df has multiple exons, ddply from bed
  if (nrow(df) <= 1){
    return(data.frame(MultiStart = FALSE))
  }
  dist <- diff(c(min(df$start), max(df$start)))
  if (dist >= min_dist){
    return(data.frame(MultiStart = TRUE))
  } else{
    return(data.frame(MultiStart = FALSE))
  }
}

SubsetMinPval <- function(jdf){
  # take row with minimum pval
  jdf <- jdf[order(jdf$pval), ]
  return(jdf[1, ])
}

FitRhythNonRhyth <- function(jdf, log2transf = FALSE){
  # check it contains both Rhythmic and NotRhythmic elements
  if (length(unique(jdf$rhythmic.or.not)) != 2){
    return(data.frame(int = NA, coef = NA, pval = NA))
  }
  if (log2transf){
    jform <- log2(norm_reads) ~ rhythmic.or.not
  } else {
    jform <- norm_reads ~ rhythmic.or.not
  }
  jfit <- lm(data = jdf, formula = jform)
  # int <- summary(jfit)$mat["(Intercept)", "Coef"]
  # jcoef <- summary(jfit)$mat["rhythmic.or.notRhythmic", "Coef"]
  int <- coef(jfit)[[1]]
  jcoef <- coef(jfit)[[2]]
  pval <- summary(jfit)$coefficients["rhythmic.or.notRhythmic", "Pr(>|t|)"]
  return(data.frame(int = int, coef = jcoef, pval = pval))
}

FitPromoterUsageToAmplitude <- function(dat){
  #   jweights <- 1 / sqrt(dat$tpm_norm.var)
  #   fit <- lm(formula = tpm_norm.avg ~ relamp, data = dat, weights = jweights)
  fit <- lm(formula = tpm_norm.avg ~ relamp, data = dat)
  int <- coef(fit)[["(Intercept)"]]
  relamp <- coef(fit)[["relamp"]]
  tpm_norm.range <- diff(range(dat$tpm_norm.avg))
  relamp.range <- diff(range(dat$relamp))
  # handle if relamp is "NA"
  if (is.na(fit$coefficients[["relamp"]])){
    return(data.frame(int = NA, relamp = NA, pval = NA, tpm_norm.range = NA, relamp.range = NA))
  }
  pval <- summary(fit)$coefficient["relamp", "Pr(>|t|)"]
  return(data.frame(int = int, relamp = relamp, pval = pval, tpm_norm.range = tpm_norm.range, relamp.range = relamp.range))
}
jakeyeung/TissueCiradianAnalysis documentation built on Aug. 7, 2020, 7:58 p.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
jakeyeung/TissueCiradianAnalysis
Analysis of oscillatory RNASeq data

R/AlternativeFirstExonsFunctions.R
In jakeyeung/TissueCiradianAnalysis: Analysis of oscillatory RNASeq data

R Package Documentation

Browse R Packages

We want your feedback!

jakeyeung/TissueCiradianAnalysis Analysis of oscillatory RNASeq data

R/AlternativeFirstExonsFunctions.R In jakeyeung/TissueCiradianAnalysis: Analysis of oscillatory RNASeq data

R Package Documentation

Browse R Packages

We want your feedback!

jakeyeung/TissueCiradianAnalysis
Analysis of oscillatory RNASeq data

R/AlternativeFirstExonsFunctions.R
In jakeyeung/TissueCiradianAnalysis: Analysis of oscillatory RNASeq data