TPMaccess: RNA data analysis

#' @include RNAClass.R
#' @title sepSpike
#' @name sepSpike
#' @rdname sepSpike-methods
#' @description
#' Separate out spikein from genes based upon rownames of tpm.value slot of tpm object.
#' @param obj A \code{tpm} object.
#' @param invert A \code{logical} object, indicating whether to keep spike-in only (TRUE) or genes only (FALSE).
#' @export sepSpike
setGeneric(name = 'sepSpike',
  def = function(obj, invert = FALSE) {
    standardGeneric("sepSpike")
  }
)

#' @rdname sepSpike-methods
setMethod(f = "sepSpike",
  signature = c("tpm", "logical"),
  def = function(obj, invert) {
    tpm.value <- obj@tpm.value[grep("spike", rownames(obj@tpm.value), ignore.case = TRUE, invert = invert),]
    return(tpm.value)
  }
)

#' @title rmLow
#' @name rmLow
#' @rdname rmLow-methods
#' @description
#' Filter of genes or transcripts with expression levels above given threshold in one or more samples.
#' @param obj A \code{tpm} object.
#' @param thresh A TPM threshold.
#' @export rmLow
setGeneric(name = "rmLow",
  def = function(obj, thresh) {
    standardGeneric("rmLow")
  }
)

#' @rdname rmLow-methods
setMethod(f = "rmLow",
  signature = c("tpm", "numeric"),
  definition = function(obj, thresh) {
    kpt.idx <- apply(obj@tpm.value, 1,
      function(vec)any(vec>thresh)
    )
    stopifnot(sum(kpt.idx) > 0)
    obj@tpm.value <- obj@tpm.value[kpt.idx, ]
    return(obj)
  }
)

#' @title rmNonVar
#' @name rmNonVar
#' @rdname rmNonVar-methods
#' @description
#' Filter genes or transcripts with expression variabilities above a given quantile.
#' @param obj A \code{tpm} object.
#' @param probs A probability threshold for removing invariably expressed genes.
#' @export rmNonVar
setGeneric(name = "rmNonVar",
  def = function(obj, probs = 0.1) {
    standardGeneric("rmNonVar")
  }
)

#' @rdname rmNonVar-methods
setMethod(f = "rmNonVar",
  signature = c("tpm", "numeric"),
  definition = function(obj, probs) {
    var <- apply(obj@tpm.value, 1, var)
    thresh <- quantile(var, probs = probs, names = FALSE)
    kpt.idx <- var >= thresh
    obj@tpm.value <- obj@tpm.value[kpt.idx, ]
    return(obj)
  }
)

#' @title lmAdjCovar
#' @name lmAdjCovar
#' @rdname lmAdjCovar-methods
#' @description
#' Linear regression to adjust for batch effects.
#' @param x A matrix of gene expression data with genes in the rows and samples in the cols.
#' @param covar A data.frame covariates.
#' @param add.mean A logical vector indicating whether to add group mean to the residuals
#' @export lmAdjCovar
setGeneric(name = "lmAdjCovar",
  def = function(x, covar, add.mean = TRUE) {
    standardGeneric("lmAdjCovar")
  }
)

#' @rdname lmAdjCovar-methods
setMethod(f = "lmAdjCovar",
  signature = c("matrix", "data.frame", "logical"),
  definition = function(x, covar, add.mean = TRUE){
    stopifnot(ncol(x) == nrow(covar))
    colnames(covar) <- paste0('V', 1:ncol(covar))
    x <- t(x)
    t(lm(x~., data = covar)$residuals) + if(add.mean){colMeans(x)}else{0}}
)

#' @title loessnorm
#' @name loessnorm
#' @rdname loessnorm-methods
#' @description
#' Loess normalization of gene or transcript expressions to expressions of spike-in.
#' @param obj A \code{tpm} object.
#' @param small A numeric vector indicating the adjustment to the TPM values before log2 transformation.
#' @export loessnorm
setGeneric(name="loessnorm",
  def = function(obj, small = 0.05) {
    standardGeneric("loessnorm")
  }
)

#' @rdname loessnorm-methods
setMethod(f = "loessnorm",
  signature = c("tpm", "ANY"),
  definition = function(obj, small = 0.05){
    tpm.norm <- normalize.loess(obj@tpm.value + small,
      subset = grep("spikein", row.names(tpm)))
    return(tpm.norm)
  }
)

#' @title rdcntnorm
#' @name rdcntnorm
#' @rdname rdcntnorm-methods
#' @description
#' Normalization of gene or transcript expressions to expressions of spike-in based upon read counts in a table.
#' @param obj A \code{tpm} object.
#' @param stats A data.frame with named columns of 'spikein' and 'mm10' read counts.
#' @export rdcntnorm
setGeneric(name="rdcntnorm",
  def = function(obj, stats) {
    standardGeneric("rdcntnorm")
  }
)

#' @rdname rdcntnorm-methods
setMethod(f = "rdcntnorm",
  signature = c("tpm", "data.frame"),
  definition = function(obj, stats) {
    frac <- stats$spikein/(stats$spikein + stats$mm10)
    normfac <- mean(frac)/frac
    tpm.norm <- data.frame(t(t(obj@tpm.value)*normfac))
    return(tpm.norm)
  }
)

#' @title distplot
#' @name distplot
#' @rdname distplot-methods
#' @description
#' tpm.value either normalised to spikein or not
#' tpm.value either spikein, non-spikein, or all
#' idx for selecting differnetially expressed genes.
#' @param obj A \code{tpm} object.
#' @param ylab A character string specify y-labl on the plot.
#' @param pdffout A character string specify pdf output file.
#' @param probs A probability specifying the top quantile to use a ceilling in plotting.
#' @export distplot
setGeneric(name="distplot",
  def = function(obj, ylab, pdffout, probs = 0.85) {
    standardGeneric("distplot")
  }
)

#' @rdname distplot-methods
setMethod(f = "distplot",
  signature = c("tpm", "character", "character", "numeric"),
  definition = function(obj, ylab, pdffout, probs) {
    dat <- data.table(obj@tpm.value)
    ldat <- melt(dat)
    max.y <- quantile(ldat$value, probs = probs)
    p1 <- ggplot(ldat, aes_(x = ~variable, y = ~value)) +
      geom_boxplot(aes_(fill = factor(~variable))) +
      scale_y_continuous(labels = comma) +
      labs(x = "", y = ylab) +
      theme(legend.title = element_blank(), legend.position="top") +
      coord_cartesian(ylim=c(0, max.y))
      pdf(pdffout)
      theme_set(theme_grey(base_size=15))
    multiplot(p1,cols=1)
    dev.off()
  }
)

#' @title corplot
#' @name corplot
#' @rdname corplot-methods
#' @description
#' Scatter plot of pairwise sample comparisons
#' @param obj A \code{tpm} object.
#' @param pdffout A character string specify pdf output file.
#' @export corplot
setGeneric(name = "corplot",
  def = function(obj, pdffout) {
    standardGeneric("corplot")
  }
)

#' @rdname corplot-methods
setMethod(f = "corplot",
  signature = c("tpm", "character"),
  definition = function(obj, pdffout) {
    tpm.value <- obj@tpm.value
    thresh <- floor(quantile(as.matrix(tpm.value), probs = 0.999))
    pm <- ggpairs(data.table(tpm.value))
    pm2 <- pm
    for(i in 2:pm$nrow) {
      for(j in 1:(i-1)) {
        pm2[i,j] <- pm[i,j] + coord_cartesian(xlim = c(0,thresh),ylim = c(0,thresh)) +
          scale_x_continuous(breaks = c(0, as.numeric(thresh))) +
          scale_y_continuous(breaks = c(0, as.numeric(thresh)))
      }
    }
    pdf(pdffout, height = 7 * (ncol(obj@tpm.value)/10), width = 7 *(ncol(obj@tpm.value)/10))
    print(pm2)
    dev.off()
  }
)

#' @title hireplot
#' @name hireplot
#' @rdname hireplot-methods
#' @description
#' Hierarchical clustering of sample correlation coefficients
#' @param obj A \code{tpm} object.
#' @param pdffout A character string specify pdf output file.
#' @export hireplot
setGeneric(name = "hireplot",
  def = function(obj, pdffout) {
    standardGeneric("hireplot")
  }
)

#' @rdname hireplot-methods
setMethod(f = "hireplot",
  signature = c("tpm"),
  definition = function(obj, pdffout) {
    corstats <- cor(obj@tpm.value, method="spearman")
    grps <- factor(obj@grps, levels = unique(obj@grps), ordered = T)
    tip.col <- if(length(levels(grps)) < 3) {
      brewer.pal(3, "Dark2")[1:2]
    } else if (length(levels(grps)) <= 8) {
      brewer.pal(length(levels(grps)), "Dark2")
    } else if (length(levels(grps)) > 8) {
      colorRampPalette(brewer.pal(8, "Dark2"))(length(levels(grps)))
    }
    tip.col <- tip.col[as.numeric(grps)]
    pdf(pdffout, height = 7 * (ncol(obj@tpm.value)/10), width = 7 *(ncol(obj@tpm.value)/10))
    plot(as.phylo(hclust(as.dist(1-corstats), method = "average")),
      cex = 2, label.offset = 0, tip.color = tip.col)
    dev.off()
  }
)

#' @title heatcorplot
#' @name heatcorplot
#' @rdname heatcorplot-methods
#' @description
#' Correlation coefficient heatmap of samples
#' @param obj A \code{tpm} object.
#' @param pdffout A character string specify pdf output file.
#' @export heatcorplot
# alternative code:
# corstats=cor(tpm.value[idx,],method="spearman")
# ana.col=data.frame(Group=grps)
# rownames(ana.col)=colnames(tpm.value)
# png(pngfout,width=3000,height=3000,res=300)
# pheatmap(corstats,colorRampPalette(rev(brewer.pal(n=7, name ="RdYlBu")))(100),breaks=seq(-1,1,length.out=101),annotation_col=ana.col)
# dev.off()
setGeneric(name = "heatcorplot",
  def = function(obj, pdffout) {
    standardGeneric("heatcorplot")
  }
)

#' @rdname heatcorplot-methods
setMethod(f = "heatcorplot",
  signature = "tpm",
  definition = function(obj, pdffout) {
    pm <- ggcorr(obj@tpm.value, method = c("pairwise","spearman"),
      label = TRUE, label_alpha = TRUE)
    pdf(pdffout)
    print(pm)
    dev.off()
  }
)

#' @title bplot
#' @name bplot
#' @rdname bplot-methods
#' @description
#' Boxplot of tpm.value slot of tpm object.
#' @param obj A \code{tpm} object.
#' @param title A title for boxplot.
#' @param pdffout A character string specify pdf output file.
#' @param probs A probability specifying the top quantile to use a ceilling in plotting.
#' @param ylab A string specifying the ylab in the plot.
#' @param isLog A logical value indicating whether TPM values are already log2 transformed.
#' @param small A numeric vector indicating the adjustment to the TPM values before log2 transformation.
#' @export bplot
setGeneric(name = "bplot",
  def = function(obj, title, pdffout, probs = 0.80,
    ylab = expression(paste(log[2], "(TPM)")),
    isLog = FALSE, small = 0.05) {
    standardGeneric("bplot")
  }
)

#' @rdname bplot-methods
setMethod(f = "bplot",
  signature = "tpm",
  definition = function(obj, title, pdffout, probs, ylab, isLog, small) {
    tpm.value <- obj@tpm.value
    if(!isLog) {
      tpm.value <- log2(tpm.value + small)
    }
    map.it <- obj@grps
    names(map.it) <- colnames(tpm.value)
    ldat <- melt(data.table(tpm.value))
    ldat$grps <- factor(ldat$variable, levels = colnames(tpm.value))
    levels(ldat$grps) <- obj@grps
    min.y <- min(ldat$value)
    max.y <- as.numeric(quantile(ldat$value, probs = probs))
    p1 <- ggplot(ldat, aes_(x = ~variable, y = ~value, fill = ~grps))+
      geom_boxplot()+
      coord_cartesian(ylim = c(min.y, max.y))+
      theme(legend.title = element_blank(), legend.position = "top") +
      labs(x = "", y = ylab) +
      ggtitle(title)
    pdf(pdffout, pointsize = 14)
    theme_set(theme_grey(base_size = 15))
    multiplot(p1, cols = 1)
    dev.off()
  }
)

#' @title PCAplot
#' @name PCAplot
#' @rdname PCAplot-methods
#' @description
#' PCA plot (PC1, PC2 and PC3) of samples
#' @param obj A \code{tpm} object.
#' @param pdffout A character string specify pdf output file.
#' @param fout A text output file of top N most variably expressed genes.
#' @param excl.col A numeric vector indicating columns to be excluded from the \code{tpm} slot of a \code{tpm} object.
#' @param ntop A numeric value indicating selection of the top N most variably expressed genes, or Inf (default).
#' @param isLog A logical value of whether the \code{tpm} data is already log2 transformed.
#' @param pca2d A logical value indicating whether 2D instead of 3D PCA.
#' @param small A numeric vector indicating the adjustment to the TPM values before log2 transformation.
#' @export PCAplot
setGeneric(name = "PCAplot",
  def = function(obj, pdffout, fout = NULL, excl.col = NULL,
    ntop = Inf, isLog = FALSE, pca2d = FALSE, small = 0.05) {
    standardGeneric("PCAplot")
  }
)

#' @rdname PCAplot-methods
setMethod(f = "PCAplot",
  signature = "tpm",
  definition = function(obj, pdffout, fout, excl.col, ntop, isLog, pca2d, small) {
    out <- function(dat, fout) {
      tmp <- data.frame(gene = rownames(dat), dat)
      write.table(tmp, file = fout, row.names = F,
        col.names = T, quote = F, sep = "\t")
    }
    if (is.null(excl.col)) {
      dat <- as.matrix(obj@tpm.value)
    } else {
      dat <- as.matrix(obj@tpm.value[, -excl.col])
    }
    if(ncol(dat) <= 3) {
      stop('The number of samples must be larger than 3\n')
    }
    ridx <- apply(dat, 1, function(vec)any(vec > 0))
    stopifnot(any(ridx))
    dat <- dat[ridx,]
    if(!isLog) {
      dat <- log2(dat + small)
    }
    dat <- dat[apply(dat, 1, var) > 0, ]
    if (ntop < nrow(dat)){
       vars <- apply(dat, 1, var)
       dat <- dat[order(vars, decreasing = TRUE), ]
       dat <- dat[1:ntop, ]
    }
    pca <- prcomp(t(dat), center=TRUE, scale=TRUE, retx=TRUE)
    ve <- pca$sdev^2/sum(pca$sdev^2) #variance explained by each PC
    ve13 <- sprintf("%.1f",ve[1:3]*100)
    x <- data.frame(PC1 = pca$x[,1], PC2 = pca$x[,2], PC3 = pca$x[,3])
    rownames(x) <- colnames(dat)
    if (!is.infinite(ntop) && !is.null(fout)) {
      out(dat, fout)
    }
    uniq.cols <- rainbow(length(unique(obj@grps)))
    cols <- uniq.cols[as.numeric(factor(obj@grps, levels = unique(obj@grps), ordered = TRUE))]
    x$colors <- cols
    pch <- as.numeric(factor(rownames(x)), ordered = TRUE)
    # 2D vs 3D PCA
    if (pca2d) {
      p1 <- ggplot(x) +
      geom_point(aes_(x = ~PC1, y = ~ PC2), size = 5, color = 'grey') +
      labs(x = paste0("PC1 (", round(ve[1], digits = 2), " of variance)"),
        y = paste0("PC2 (", round(ve[2], digits = 2), " of variance)")) +
      geom_label_repel(
        aes_(x = ~ PC1, y = ~ PC2, fill = obj@grps, label = rownames(x)),
        fontface = 'bold', color = 'white',
        box.padding = 0.35, point.padding = 0.5,
        segment.color = 'grey50') +
        theme(legend.title = element_blank(), panel.spacing = unit(2, "lines"), legend.position = "top")
      ggsave(filename = pdffout, plot = p1, height = max(7, 7 * (ncol(obj@tpm.value)/15)),
        width = max(7, 7 *(ncol(obj@tpm.value)/15)), units = "in")
    } else {
      pdf(pdffout, pointsize = 14, height = max(7, 7 * (ncol(obj@tpm.value)/30)),
        width = max(7, 7 *(ncol(obj@tpm.value)/30)))
      par(mar = c(1, 1, 1, 1))
      s3d <- scatterplot3d(x[ , 1:3], grid = FALSE, box = FALSE, mar = c(3, 3, 2, 2), pch = "")
      addgrids3d(x[, 1:3], grid = c("xy", "xz", "yz"))
      #s3d$points3d(x[, 1:3], pch=16, col=x$colors)
      text(s3d$xyz.convert(x[, 1:3] + 1), labels = 1:nrow(x), col = x$colors)
      legend(par('usr')[1] - 0.5,par('usr')[4] + 0.3,
        legend = paste0(1:nrow(x), ": ", rownames(x)),
        pch = 20, col = x$colors, xpd = TRUE,
        ncol = 3, cex = 0.7)
      dev.off()
    }
  }
)

#' @title MAchart
#' @name MAchart
#' @rdname MAchart-methods
#' @description
#' plot of log2FC over pvalue during differential analysis.
#' @param dd A \code{data.frame} object, with columns of 'P.Value', 'logFC', 'nlogpval' and 'DEG'.
#' @param pdffout A character string specify pdf output file.
#' @export MAchart
setGeneric(name = "MAchart",
  def = function(dd, pdffout) {
    standardGeneric("MAchart")
  }
)

#' @rdname MAchart-methods
setMethod(f = "MAchart",
  signature = c("data.frame", "character"),
  definition = function(dd, pdffout) {
    nd <- dd
    nd$nlogpval <- -log10(nd$P.Value)
    ylab <- "P"
    p1 <- ggplot(nd, aes_(x = ~logFC, y = ~nlogpval, color = ~DEG)) + geom_point() +
      labs(y = bquote(-log[10](.(ylab)))) +
      theme(legend.title = element_blank(), legend.position = "top")
    pdf(pdffout, pointsize = 14)
    theme_set(theme_grey(base_size = 15))
    multiplot(p1, cols = 1)
    dev.off()
  }
)

#' @title BICplot
#' @name BICplot
#' @rdname BICplot-methods
#' @description
#' Plot BIC values over cluster size.
#' @param g A numeric vector of cluster sizes.
#' @param BIC A numeric vector of BIC values.
#' @param pdffout A character string specify pdf output file.
#' @export BICplot
setGeneric(name = "BICplot",
  def = function(g, BIC, pdffout) {
    standardGeneric("BICplot")
  }
)

#' @rdname BICplot-methods
setMethod(f = "BICplot",
  signature = c("numeric", "numeric", "character"),
  definition = function(g, BIC, pdffout) {
    dd <- data.frame(g = g ,BIC = BIC)
    p1 <- ggplot(dd, aes_(x = g, y = BIC)) + geom_point(colour = "#FF9999") +
      xlab("Cluster Size")+ylab("BIC")
    pdf(pdffout, pointsize = 14)
    theme_set(theme_grey(base_size = 15))
    multiplot(p1, cols = 1)
    dev.off()
  }
)

#' @title diffHeatmap
#' @name diffHeatmap
#' @rdname diffHeatmap-methods
#' @description
#' heatmap during or after differential analysis
#' @param tpm.value A \code{tpm} matrix.
#' @param col.idx Numeric or logical indices specifying columns to keep of the \code{tpm} matrix.
#' @param row.idx Numeric or logical indices specifying rows to keep of the code{tpm} matrix.
#' @param pdffout A character string specify pdf output file.
#' @param cutreek A logical value indicating whether to perform clustering.
#' @param cut.alg A string value of selecting the clustering algorithm "pam","hclust" or "emmix".
#' @param rank.man A logical value indicating whether to perform manual ranking on the row.
#' @param log.it.already A logical value indicating whether the \code{tpm} matrix is already log2 transformed.
#' @param scale.it A logical value indicating whether to row standardize the \code{tpm} matrix.
#' @param cluster_columns_par A logical value indicating whether to cluster the columns of the \code{tpm} matrix.
#' @param cluster_rows_par A logical value indicating whether to cluster the rows of the \code{tpm} matrix.
#' @param show_column_dend_par A logical value indicating whether to show the column dendrogram.
#' @param show_row_dend_par A logical value indicating whether to show the row dendrogram.
#' @param small A numeric value indicating the adjustment to the TPM values before log2 transformation.
#' @param ... Additional arguments to be passed to methods.
#' @export diffHeatmap
setGeneric(name = "diffHeatmap",
  def = function(tpm.value, col.idx, row.idx, pdffout,
    cutreek = NULL, cut.alg = NULL, rank.man = FALSE, log.it.already = FALSE,
    scale.it = TRUE, cluster_columns_par = TRUE, cluster_rows_par = TRUE,
    show_column_dend_par = TRUE, show_row_dend_par = FALSE, small = 0.05, ...) {
    standardGeneric("diffHeatmap")
  }
)

#' @rdname diffHeatmap-methods
setMethod(f = "diffHeatmap",
  signature = c("matrix"),
  definition = function(tpm.value, col.idx, row.idx, pdffout, cutreek,
    cut.alg, rank.man, log.it.already, scale.it, cluster_columns_par,
    cluster_rows_par, show_column_dend_par, show_row_dend_par, small, ...) {
    if (!is.null(cut.alg)) {
      cut.alg <- match.arg(cut.alg, c("pam","hclust","emmix"))
    }
    tpm.value <- tpm.value[row.idx, col.idx]
    if (!log.it.already) {
      tpm.value <- log2(tpm.value+small)
    }
    if (scale.it) {
      norm <- t(scale(t(tpm.value)))
    } else {
      norm <- tpm.value
    }
    if (rank.man) {
      norm <- norm[order(norm[,1], -norm[,2], decreasing=T),];
      cluster_rows <- FALSE
    }
    if (!is.null(cutreek)) {
      if (cut.alg == "hclust") {
        corstats <- cor(t(norm),method="spearman");
        clusters <- cutree(hclust(as.dist(1-corstats), method = 'average'), k = cutreek)
      } else if (cut.alg == "emmix") {
        if (!is.null(clusters)) {
          clusters <- clusters
        }
      } else if (cut.alg == "pam") {
        clusters <- pam(norm,cutreek)$clustering
      }
      pm <- Heatmap(norm, cluster_columns = cluster_columns_par,
        cluster_rows = cluster_rows_par, show_row_names = FALSE,
        show_row_dend = show_row_dend_par, show_column_dend = show_column_dend_par,
        heatmap_legend_param = list(title = "", color_bar = "continuous"),
        clustering_distance_rows = "spearman", clustering_method_rows = "average",
        clustering_distance_columns = "spearman", clustering_method_columns = "average",
        split = paste0("Cluster", clusters))
    } else {
      pm <- Heatmap(norm, cluster_columns = cluster_columns_par,
        cluster_rows = cluster_rows_par, show_row_names = FALSE,
        show_row_dend = show_row_dend_par, show_column_dend = show_column_dend_par,
        heatmap_legend_param = list(title="", color_bar="continuous"),
        clustering_distance_rows = "spearman", clustering_method_rows = "average",
        clustering_distance_columns = "spearman", clustering_method_columns = "average")
    }
    pdf(pdffout, pointsize = 14)
    draw(pm)
    dev.off()
    if (exists("clusters")) {
      invisible(clusters)
    } else {
      invisible(NULL)
    }
  }
)

#' @title clusing
#' @name clusing
#' @rdname clusing-methods
#' @description
#' Determine the optimum number of k-mean clusters, and return cluster membership.
#' @param dat A \code{data.frame} of \code{tpm} values.
#' @param pdffout A character string specify the diagnosis pdf file of the optimum number of clusters.
setGeneric(name="clusing",
  def=function(dat, pdffout) {
    standardGeneric("clusing")
  }
)

#' @rdname clusing-methods
setMethod(f = "clusing",
  signature=c(dat = "data.frame", pdffout="character"),
  definition=function(dat, pdffout){
    if(!is.null(dirname(pdffout)) || dirname(pdffout)==".") dir.create(dirname(pdffout), showWarnings = FALSE)
    pdf(pdffout)
    opt <- Optimal_Clusters_KM(dat, max_clusters = min(10,ncol(dat)), plot_clusters=TRUE, criterion = 'distortion_fK', fK_threshold = 0.85, initializer= 'optimal_init', tol_optimal_init = 0.2)
    dev.off()
    km_mb <- MiniBatchKmeans(dat, clusters = opt, batch_size = 20, num_init = 5, max_iters = 100,
    init_fraction = 0.2, initializer = 'kmeans++', early_stop_iter = 10,verbose = F)
    pr_mb <- predict_MBatchKMeans(dat, km_mb$centroids)
    return(pr_mb)
  }
)


#' @title kHeat
#' @name kHeat
#' @rdname kHeat-methods
#' @description
#' K-mean clustering and heatmap
#' @param obj A \code{tpm} object.
#' @param pdffout A character string specify pdf output file.
#' @param k A numeric value specifying the number of clusters.
#' @param log2.it A logical value specifying whether to log2 transform the tpm value data.
#' @param scale.it A logical value specifying whether to row standardize the tpm value data.
#' @param small A numeric value indicating the adjustment to the TPM values before log2 transformation.
#' @export kHeat
setGeneric(name = "kHeat",
  def = function(obj, pdffout, k = NULL, log2.it = TRUE, scale.it = TRUE, small = 0.05) {
    standardGeneric("kHeat")
  }
)

#' @rdname kHeat-methods
setMethod(f = "kHeat",
  signature = c("tpm", "ANY"),
  definition = function(obj, pdffout, k, log2.it, scale.it, small) {
    mat <- obj@tpm.value
    if (log2.it) {
      mat <- log2(mat + small)
    }
    if (scale.it) {
      mat <- t(apply(mat, 1, scale))
    }
    colnames(mat) <- colnames(obj@tpm.value)
    if (is.null(k)) {
        message("clusing function when k is null.")
        cl <- clusing(data.frame(mat), pdffout = gsub(".pdf","_optK.pdf", pdffout))
        message("Heatmap function when k is null.")
    } else {
        message("heatmap function when k is ", k)
        set.seed(888)
        cl <- kmeans(mat, k)$cluster
    }
    ht_list <- Heatmap(mat, show_row_names = FALSE, show_column_names = TRUE, cluster_rows = TRUE,
      show_row_dend = FALSE,  cluster_columns = FALSE, show_column_dend = FALSE,
      heatmap_legend_param = list(title = "", color_bar = "continuous"),
      clustering_distance_rows = "spearman", clustering_method_rows = "average",
      clustering_distance_columns = "spearman", clustering_method_columns = "average",
      split = factor(cl), gap = unit(3, "mm"))
    pdf(pdffout)
    draw(ht_list)
    dev.off()
    message("running prepare function...")
    ht.obj <- prepare(ht_list)
    message("done with prepare function...")
    row_list <- row_order(ht_list)
    dd <- do.call("rbind", lapply(seq_along(row_list),
      function(i){data.frame(cluster = i, ori.idx = row_list[[i]], idx = seq_along(row_list[[i]]))}))
    dat <- data.frame(gene = rownames(mat)[dd$ori.idx], dd, mat = mat[dd$ori.idx,])
    dat <- dat[order(dat$ori.idx), grep("ori.idx", colnames(dat), invert = TRUE, fixed = TRUE)]
    colnames(dat)[-c(1:3)] <- colnames(mat)
    write.table(dat,
      file = gsub("pdf", "txt", pdffout),
      row.names = FALSE, col.names = TRUE,
      sep = "\t", quote = FALSE)
  }
)

#' @title limmaDiff
#' @name limmaDiff
#' @rdname limmaDiff-methods
#' @description
#' tpm.value either normalised to spikein or not
#' filtering of genes must be done before hand
#' @param obj A \code{tpm} object.
#' @param dout A character string specifying the output directory.
#' @param pat A character string specifying the prefix of file(s) without the directory name.
#' @param MA.it A logical value indicating whether to generate an MA plot.
#' @param HEAT.it A logical value specifying whether to draw a heatmap for each pairwise differnetially analysis.
#' @param GO.it A logical value specifying whether to do gene ontology analysis.
#' @param DiffOut.it A logical value specifying whether to write out differential analysis data to file.
#' @param logFCthresh A numeric value specifying the log2 fold change threshold for data to be called differential.
#' @param PValthresh A numeric value specifying the pvalue threshold for data to be called differential.
#' @param PADJthresh A numeric value specifying the adjusted pvalue threshold for data to be called differential, overriding the specification of pvalue threshold.
#' @param log2.it A logical value specifying whether to perform log2 transformation.
#' @param small A numeric value indicating the adjustment to the TPM values before log2 transformation.
#' @export limmaDiff
setGeneric(name = "limmaDiff",
  def = function(obj, dout, pat,
    MA.it = TRUE, HEAT.it = TRUE, GO.it = TRUE, DiffOut.it = TRUE,
    logFCthresh = 1, PValthresh = NULL, PADJthresh = NULL, log2.it = TRUE, small = 0.05) {
    standardGeneric("limmaDiff")
  }
)

#' @rdname limmaDiff-methods
setMethod(f = "limmaDiff",
  signature = c("tpm"),
  definition = function(obj, dout, pat, MA.it, HEAT.it, GO.it, DiffOut.it, logFCthresh, PValthresh, PADJthresh, log2.it, small) {
    stopifnot(ncol(obj@tpm.value)>0)
    if (!is.null(PADJthresh)) {
      signcol <- "adj.P.Val"
      signthresh <- PADJthresh
    } else if (!is.null(PValthresh)) {
      signcol <- "P.Value"
      signthresh <- PValthresh
    } else {
      stop("Error: either PADJthresh or PValthresh need to be specified in limmaDiff...exiting...")
    }
    grps <- factor(obj@grps)
    if (log2.it) {
      tpm.value <- log2(obj@tpm.value + small)
    } else {
      tpm.value <- obj@tpm.value
    }
    contrast <- apply(combn(levels(grps), 2), 2,paste0,collapse="-")
    design <- model.matrix(~ 0 + grps)
    colnames(design) <- sub('^grps', '', colnames(design))
    fit <- lmFit(tpm.value, design)
    contrast.matrix <- makeContrasts(contrasts = contrast, levels = design)
    fit <- contrasts.fit(fit, contrast.matrix)
    fit <- eBayes(fit)
    dat.list <- lapply(contrast, function(const){
      dd <- topTable(fit, adjust.method = "BH", number = Inf, coef = const, sort.by = 'none')
      dd <- dd[,c("logFC","P.Value","adj.P.Val")]
      query.population <- gsub(
        "[^\\|]+\\|([^\\|]+)\\|[^\\|]+(\\|[^\\|]+){0,1}",
        "\\1", rownames(dd))
      up.idx <- which(dd[,"logFC"] >= logFCthresh & dd[, signcol] <= signthresh)
      down.idx <- which(dd[,"logFC"] <= (-logFCthresh) & dd[, signcol] <= signthresh)
      stopifnot(length(up.idx) > 0 & length(down.idx) > 0)
      dd$DEG <- "NDiff"
      dd[up.idx, "DEG"] <- "Up"
      dd[down.idx, "DEG"] <- "Down"
      if (MA.it) {
        MAchart(dd, pdffout = file.path(dout, paste0(pat, "_", const, "_MA.pdf")))
      }
      if (HEAT.it) {
        sm1 <- gsub("([^\\-]+)\\-([^\\-]+)", "\\1", const)
        sm2 <- gsub("([^\\-]+)\\-([^\\-]+)", "\\2", const)
        diffHeatmap(tpm.value, col.idx = which(as.character(grps) %in% c(sm1, sm2)), row.idx = which(dd$DEG != "NDiff"),
		pdffout = file.path(dout, paste0(pat, "_", const, "_diffHeatmap.pdf")),
          cutreek = NULL, log.it.already = TRUE)
      }
      if (GO.it) {
        DEG <- data.frame(gene = query.population[c(up.idx, down.idx)],
          group = dd$DEG[c(up.idx, down.idx)], stringsAsFactors = FALSE)
        Res <- msigdb.gsea(DEG, query.population = query.population, background = 'query',
          genesets = c('C2.CP','C5.BP','C5.CC','C5.MF'), name.x='DEGs', name.go='MSigDB', species='mouse')
        write.table(Res, file = file.path(dout, paste0(pat, "_", const, "_GO.xls")), row.names = FALSE,
          col.names = TRUE, quote = FALSE, sep = "\t")
      }
      colnames(dd) <- paste(colnames(dd), const, sep="_")
      return(dd)
    })
    dat <- do.call("cbind",dat.list)
    rownames(dat) <- rownames(tpm.value)
    gid <- gsub("([^\\|]+)\\|([^\\|]+)\\|([^\\|]+)(\\|[^\\|]+){0,1}",
      "\\1", rownames(dat))
    gname <- gsub("([^\\|]+)\\|([^\\|]+)\\|([^\\|]+)(\\|[^\\|]+){0,1}",
      "\\2", rownames(dat))
    gtype <- gsub("([^\\|]+)\\|([^\\|]+)\\|([^\\|]+)(\\|[^\\|]+){0,1}",
      "\\3", rownames(dat))
    dat <- data.frame(gid = gid, gname = gname, gtype = gtype, dat, obj@tpm.value)
    if (DiffOut.it) {
      write.table(dat, file = file.path(dout, paste0(pat, "_DiffAna.xls")),
        row.names = FALSE, col.names = TRUE, quote = FALSE, sep = "\t")
    }
    return(dat)
  }
)

#' @title lineGraph
#' @name lineGraph
#' @rdname lineGraph-methods
#' @description
#' Line graphs of clustering
#' @param obj A \code{tpm} object.
#' @param col.idx A numeric or logical vector specifying columns to keep.
#' @param row.idx A numeric or logical vector specifying rows to keep.
#' @param clusters A numeric vector indicating cluster membership.
#' @param pdffout A character string specify pdf output file.
#' @param log.it.already A logical value specifying whether tpm values were already log2 transformed.
#' @param mean.it A logical value indicating whether to generate group values from individual samples.
#' @param small A numeric value indicating the adjustment to the TPM values before log2 transformation.
#' @export lineGraph
setGeneric(name = "lineGraph",
  def = function(obj, col.idx, row.idx, clusters, pdffout, log.it.already = FALSE, mean.it = FALSE, small = 0.05) {
    standardGeneric("lineGraph")
  }
)

#' @rdname lineGraph-methods
setMethod(f = "lineGraph",
  signature = c("tpm"),
  definition = function(obj, col.idx, row.idx, clusters, pdffout, log.it.already, mean.it, small) {
    tpm.value <- obj@tpm.value[row.idx, col.idx]
    if (!log.it.already) {
      tpm.value <- log2(tpm.value + small)
    }
    norm <- t(scale(t(tpm.value)))
    grps <- factor(obj@grps, levels = unique(obj@grps))
    norm.grp <- t(apply(norm, 1, function(vec)tapply(vec, grps, mean)))
    if (mean.it) {
      dat <- data.table(norm.grp, clusters = clusters)
      dat <- data.table(dat[, lapply(.SD, mean, na.rm = TRUE), by = clusters, .SDcols = levels(grps)], id="mean")
      setcolorder(dat, c(2:(ncol(dat)-1), 1, ncol(dat)))
    } else {
      dat <- data.table(norm.grp, clusters = clusters, id = rownames(norm.grp))
    }
    ldat <- melt(dat, id.vars = c(1:ncol(dat))[!1:ncol(dat) %in% 1:ncol(norm.grp)])
    ylab <- "TPM"
    p1 <- ggplot(data = ldat, aes_(x = ~variable, y = ~value, group = ~id, colour = "#FF9999" )) + geom_line() +
      geom_point() + facet_wrap(~ clusters) + labs(y = bquote(paste("Standardized ", log[2](.(ylab))))) + theme(legend.position="none")
    pdf(pdffout, pointsize = 14)
    theme_set(theme_grey(base_size = 15))
    multiplot(p1, cols = 1)
    dev.off()
  }
)