R/integrations.R
In conos: Clustering on Network of Samples

#' @importFrom rlang .data
#' @importFrom utils packageVersion
NULL

#' @keywords internal
extendMatrix <- function(mtx, col.names) {
  new.names <- setdiff(col.names, colnames(mtx))
  ext.mtx <- matrix(0, nrow=nrow(mtx), ncol=length(new.names))
  colnames(ext.mtx) <- new.names
  return(cbind(mtx, ext.mtx)[,col.names])
}

#' @keywords internal
mergeCountMatrices <- function(cms, transposed=FALSE) {
  if (!transposed) {
    cms %<>% lapply(Matrix::t)
  }

  gene.union <- lapply(cms, colnames) %>% Reduce(union, .)

  res <- lapply(cms, extendMatrix, gene.union) %>% Reduce(rbind, .)
  if (!transposed) {
    res %<>% Matrix::t()
  }
  return(res)
}

#' @keywords internal
checkSeuratV3 <- function() {
  if (!requireNamespace('Seurat', quietly = TRUE) || packageVersion('Seurat') < package_version(x = '3.0.0')) {
    stop("Use of Seurat v3-backed Conos objects requires Seurat v3.X installed")
  }
}

#' @keywords internal
seuratProcV2 <- function(count.matrix, vars.to.regress=NULL, verbose=TRUE, do.par=TRUE, n.pcs=100, cluster=TRUE, tsne=TRUE, umap=FALSE) {
  if (!requireNamespace("Seurat", quietly = TRUE)) {
    stop("Package \"Seurat\" is needed for this function to work. Please install it, as described here: <https://satijalab.org/seurat/articles/install.html>.", call. = FALSE)
  }
  if (verbose) {
    message("Running Seurat v2 workflow")
  }
  rownames(count.matrix) <- make.unique(rownames(count.matrix))
  max.n.pcs <- min(nrow(count.matrix) - 1, ncol(count.matrix) - 1, n.pcs)
  so <- Seurat::CreateSeuratObject(count.matrix, display.progress=verbose) %>%
    Seurat::NormalizeData(display.progress=verbose) %>%
    Seurat::ScaleData(vars.to.regress=vars.to.regress, display.progress=verbose, do.par=do.par) %>%
    Seurat::FindVariableFeatures(do.plot = FALSE, display.progress=verbose) %>%
    Seurat::RunPCA(pcs.compute=max.n.pcs, do.print=FALSE)
  if (cluster) {
    so <- Seurat::FindClusters(so, n.iter=500, n.start=10, dims.use=1:n.pcs, print.output = FALSE)
  }
  if (tsne) {
    so <- Seurat::RunTSNE(so, dims.use=1:n.pcs)
  }
  if (umap) {
    if (packageVersion('Seurat') < package_version(x = '2.3.1')) {
      warning("UMAP support in Seurat came in v2.3.1, please update to a newer version of Seurt to enable UMAP functionality", immediate. = TRUE)
    } else {
      so <- Seurat::RunUMAP(object = so, dims.use = 1:n.pcs)
    }
  }
  return(so)
}

#' @keywords internal
seuratProcV3 <- function(count.matrix, vars.to.regress=NULL, verbose=TRUE, n.pcs=100, cluster=TRUE, tsne=TRUE, umap=FALSE, ...) {
  if (!requireNamespace("Seurat", quietly = TRUE)) {
    stop("Package \"Seurat\" needed for this function to work. Please install it, as described here: <https://satijalab.org/seurat/articles/install.html>.", call. = FALSE)
  }
  if (verbose) {
    message("Running Seurat v3 workflow")
  }
  so <- Seurat::CreateSeuratObject(counts = count.matrix)
  so <- Seurat::NormalizeData(object = so, verbose = verbose)
  so <- Seurat::FindVariableFeatures(object = so, verbose = verbose)
  so <- Seurat::ScaleData(object = so, vars.to.regress = vars.to.regress, verbose = verbose)
  so <- Seurat::RunPCA(object = so, npcs = n.pcs, verbose = verbose)
  if (cluster) {
    so <- Seurat::FindNeighbors(object = so, dims = 1:n.pcs, verbose = verbose)
    so <- Seurat::FindClusters(object = so, n.iter = 500, n.start = 10, verbose = verbose)
  }
  if (tsne) {
    so <- Seurat::RunTSNE(object = so, dims = 1:n.pcs)
  }
  if (umap) {
    so <- Seurat::RunUMAP(object = so, dims = 1:n.pcs, verbose = verbose)
  }
  return(so)
}

#' Save Conos object on disk to read it from ScanPy
#'
#' @param con conos object
#' @param output.path path to a folder, where intermediate files will be saved
#' @param hdf5_filename name of HDF5 written with ScanPy files. Note: the rhdf5 package is required
#' @param metadata.df data.frame with additional metadata with rownames corresponding to cell ids, which should be passed to ScanPy (default=NULL)
#'     If NULL, only information about cell ids and origin dataset will be saved.
#' @param cm.norm boolean Whether to include the matrix of normalised counts (default=FALSE).
#' @param embedding boolean Whether to include the current conos embedding (default=TRUE).
#' @param pseudo.pca boolean Whether to produce an emulated PCA by embedding the graph to a space with `n.dims` dimensions and save it as a pseudoPCA (default=FALSE).
#' @param pca boolean Whether to include PCA of all the samples (not batch corrected) (default=FALSE).
#' @param n.dims numeric Number of dimensions for calculating PCA and/or pseudoPCA (default=100).
#' @param alignment.graph boolean Whether to include graph of connectivities and distances (default=TRUE).
#' @param verbose boolean Whether to use verbose mode (default=FALSE)
#' @seealso The rhdf5 package documentation here: <https://www.bioconductor.org/packages/release/bioc/html/rhdf5.html>
#' @return AnnData object for ScanPy, saved to disk
#' @export
saveConosForScanPy <- function(con, output.path, hdf5_filename, metadata.df=NULL, cm.norm=FALSE, pseudo.pca=FALSE, 
  pca=FALSE, n.dims=100, embedding=TRUE, alignment.graph=TRUE, verbose=FALSE) {
  if (!requireNamespace("rhdf5", quietly = TRUE)) {
    stop("The package \"rhdf5\" is required for saveConosForScanPy(). Please install.")
  }
  if (!requireNamespace("tibble", quietly = TRUE)) {
    stop("The package \"tibble\" is required for saveConosForScanPy(). Please install.")
  }
  if (!dir.exists(output.path)){
    stop("Path", output.path, "doesn't exist")
  }

  if (tools::file_ext(hdf5_filename) != "h5"){
    stop("File", hdf5_filename, "must have the file extension *.h5")
  }

  if (verbose) message("Merge raw count matrices...\t")
  raw.count.matrix.merged <- con$getJointCountMatrix(raw=TRUE)
  if (verbose) message("Done.\n")

  cell.ids <- rownames(raw.count.matrix.merged)
  gene.df <- data.frame(gene=colnames(raw.count.matrix.merged))

  if (!is.null(metadata.df)) {
    metadata.df %<>% .[cell.ids, , drop=FALSE] %>% dplyr::mutate(CellId=cell.ids)
  } else {
    metadata.df <- tibble::tibble(CellId=cell.ids)
  }
  metadata.df$Dataset <- as.character(con$getDatasetPerCell()[cell.ids])

  if (cm.norm) {
    if (verbose) message("Merge count matrices...\t\t")
    count.matrix.merged <- con$getJointCountMatrix(raw=FALSE)
    if (verbose) message("Done.\n")
  }

  if (embedding){
    if (verbose) message("Save the embedding...\t\t")
    if (length(con$embedding)>1) {
      embedding.df <- con$embedding[cell.ids,] %>% as.data.frame()
      if (verbose) message("Done.\n")
    } else {
      warning("\n No embedding found in the conos object. Skipping... \n")
      embedding <- FALSE
    }

  }

  # Create a batch-free embedding that can be used instead of PCA space
  if (pseudo.pca) {
    if (verbose) message("Create psudo-PCA space...\t")
    pseudopca.df <- con$embedGraph(target.dims=n.dims, method="largeVis", verbose=FALSE)[cell.ids, ] %>% as.data.frame()
    if (verbose) message("Done.\n")
  }

  if (pca){
    if (verbose) message("Save PCA space...\t\t")
    pca.df <- pcaFromConos(con$samples, ncomps=n.dims, n.odgenes=2000, verbose=FALSE) %>% as.data.frame()
    if (verbose) message("Done.\n")
  }

  if (alignment.graph){
    if (verbose) message("Save graph matrices...\t\t")
    if (!is.null(con$graph)) {
      graph.conn <- igraph::as_adjacency_matrix(con$graph, attr="weight")[cell.ids, cell.ids]
      graph.dist <- graph.conn
      graph.dist@x <- 1 - graph.dist@x
      if (verbose) message("Done.\n")
    } else {
      warning("\n No graph found in the conos object. Skipping... \n")
      alignment.graph <- FALSE
    }
  }

  if (verbose) message("Write data to disk...\t\t")
  ## create HDF5 file
  total_hdf5file_path = paste0(output.path, "/", hdf5_filename)
  rhdf5::h5createFile(total_hdf5file_path)
  ## raw.count.matrix.merged
  rhdf5::h5createGroup(total_hdf5file_path, "raw_count_matrix")
  rhdf5::h5write(raw.count.matrix.merged@x, total_hdf5file_path, "raw_count_matrix/data")
  rhdf5::h5write(dim(raw.count.matrix.merged), total_hdf5file_path, "raw_count_matrix/shape")
  rhdf5::h5write(raw.count.matrix.merged@i, total_hdf5file_path, "raw_count_matrix/indices")
  rhdf5::h5write(raw.count.matrix.merged@p, total_hdf5file_path, "raw_count_matrix/indptr")
  ## metadata
  rhdf5::h5createGroup(total_hdf5file_path, "metadata")
  rhdf5::h5write(metadata.df, total_hdf5file_path, "metadata/metadata.df")
  ## genes
  rhdf5::h5createGroup(total_hdf5file_path, "genes")
  rhdf5::h5write(gene.df, total_hdf5file_path, "genes/genes.df")
  ## count_matrix
  if (cm.norm) {
    rhdf5::h5createGroup(total_hdf5file_path, "count_matrix")
    rhdf5::h5write(count.matrix.merged@x, total_hdf5file_path, "count_matrix/data")
    rhdf5::h5write(dim(count.matrix.merged), total_hdf5file_path, "count_matrix/shape")
    rhdf5::h5write(count.matrix.merged@i, total_hdf5file_path, "count_matrix/indices")
    rhdf5::h5write(count.matrix.merged@p, total_hdf5file_path, "count_matrix/indptr")
  }
  if (embedding) {
    rhdf5::h5createGroup(total_hdf5file_path, "embedding")
    rhdf5::h5write(embedding.df, total_hdf5file_path, "embedding/embedding.df")
  }
  if (pseudo.pca) {
    rhdf5::h5createGroup(total_hdf5file_path, "pseudopca")
    rhdf5::h5write(pseudopca.df, total_hdf5file_path, "pseudopca/pseudopca.df")
  }
  if (pca) {
    rhdf5::h5createGroup(total_hdf5file_path, "pca")
    rhdf5::h5write(pca.df, total_hdf5file_path, "pca/pca.df")
  }
  if (alignment.graph) {
    ## graph_connectivities
    rhdf5::h5createGroup(total_hdf5file_path, "graph_connectivities")
    rhdf5::h5write(graph.conn@x, total_hdf5file_path, "graph_connectivities/data")  
    rhdf5::h5write(dim(graph.conn), total_hdf5file_path, "graph_connectivities/shape") 
    rhdf5::h5write(graph.conn@i, total_hdf5file_path, "graph_connectivities/indices") 
    rhdf5::h5write(graph.conn@p, total_hdf5file_path, "graph_connectivities/indptr") 
    ## graph_distances
    rhdf5::h5createGroup(total_hdf5file_path, "graph_distances")
    rhdf5::h5write(graph.dist@x, total_hdf5file_path, "graph_distances/data")  
    rhdf5::h5write(dim(graph.dist), total_hdf5file_path, "graph_distances/shape") 
    rhdf5::h5write(graph.dist@i, total_hdf5file_path, "graph_distances/indices") 
    rhdf5::h5write(graph.dist@p, total_hdf5file_path, "graph_distances/indptr") 
  }
  if (verbose) message("All Done!")
}

#' Create and preprocess a Seurat object
#'
#' @param count.matrix gene count matrix
#' @param vars.to.regress variables to regress with Seurat (default=NULL)
#' @param verbose boolean Verbose mode (default=TRUE)
#' @param do.par boolean Use parallel processing for regressing out variables faster (default=TRUE)
#' @param n.pcs numeric Number of principal components (default=100)
#' @param cluster boolean Whether to perform clustering (default=TRUE)
#' @param tsne boolean Whether to construct tSNE embedding (default=TRUE)
#' @param umap boolean Whether to construct UMAP embedding, works only for Seurat v2.3.1 or higher (default=FALSE)
#' @return Seurat object
#'
#' @export
basicSeuratProc <- function(count.matrix, vars.to.regress=NULL, verbose=TRUE, do.par=TRUE, n.pcs=100, cluster=TRUE, tsne=TRUE, umap=FALSE) {
  if (!requireNamespace("Seurat", quietly = TRUE)) {
    stop("Package \"Seurat\" is needed for this function to work. Please install it, as described here: <https://satijalab.org/seurat/articles/install.html>.", call. = FALSE)
  }
  proc.fxn <- ifelse(
    test = packageVersion('Seurat') < package_version(x = '3.0.0'),
    yes = seuratProcV2,
    no = seuratProcV3
  )
  so <- proc.fxn(
    count.matrix = count.matrix,
    vars.to.regress = vars.to.regress,
    verbose = verbose,
    do.par = do.par,
    n.pcs = n.pcs,
    cluster = cluster,
    tsne = tsne,
    umap = umap
  )
}

# Intersect genes and cells between all the velocity files and the conos object
#' @keywords internal
prepareVelocity <- function(cms.file, genes, cells) {
  exon.genes <- rownames(cms.file$exon)
  intron.genes <- rownames(cms.file$intron)
  spanning.genes <- rownames(cms.file$spanning)
  # Only common genes between the 3 files and conos object
  common.genes <- intersect(exon.genes, intron.genes) %>% intersect(spanning.genes) %>% intersect(genes)

  exon.cells <- colnames(cms.file$exon)
  intron.cells <- colnames(cms.file$intron)
  spanning.cells <- colnames(cms.file$spanning)
  # Only common cells between the 3 files and conos object
  common.cells <- intersect(exon.cells, intron.cells) %>% intersect(spanning.cells) %>% intersect(cells)

  cms.file$exon <- cms.file$exon[common.genes,common.cells]
  cms.file$intron <- cms.file$intron[common.genes,common.cells]
  cms.file$spanning <- cms.file$spanning[common.genes,common.cells]
  return(cms.file)
}

# Get PCA results for all the samples from the conos object
# This is a modification of the quickPlainPCA function
#' @keywords internal
pcaFromConos <- function(p2.list, data.type='counts', k=30, ncomps=100, n.odgenes=NULL, verbose=TRUE) {

  od.genes <- commonOverdispersedGenes(p2.list, n.odgenes, verbose = FALSE)
  if (length(od.genes)<5){
    return(NULL)
  }

  if(verbose) message('Calculating PCs for',length(p2.list),' datasets...\n')

  # Get scaled matrices from a list of pagoda2 objects
  sm <- scaledMatrices(p2.list, data.type=data.type, od.genes=od.genes, var.scale=TRUE)
  # Transpose the scaled matrices since we want to run PCA on cells and not genes (like in quickPlainPCA)
  sm <- lapply(sm, t)
  # Get the names of all the cells
  nms <- Reduce(union, lapply(sm, colnames))

  pcs <- lapply(sm, function(x) {
    cm <- Matrix::colMeans(x)
    ncomps <- min(c(nrow(cm)-1,ncol(cm)-1,ncomps))
    res <- irlba::irlba(x, nv=ncomps, nu=0, center=cm, right_only=FALSE, reorth=TRUE)
    res
  })

  pcj <- do.call(rbind,lapply(pcs,function(x) x$v))
  rownames(pcj) <- nms
  res <- pcj

  return(res)
}

#' Convert Conos object to Pagoda2 object
#'
#' @param con Conos object
#' @param n.pcs numeric Number of principal components (default=100)
#' @param n.odgenes numeric Number of overdispersed genes (default=2000)
#' @param verbose boolean Whether to give verbose output (default=TRUE)
#' @param ... parameters passed to Pagoda2$new()
#' @return pagoda2 object 
#'
#' @export
convertToPagoda2 <- function(con, n.pcs=100, n.odgenes=2000, verbose=TRUE, ...) {
  if (!requireNamespace('pagoda2', quietly=TRUE)) {
    stop("'pagoda2' must be installed to convert a Conos object to a Pagoda2 object. Please refer to <https://github.com/kharchenkolab/pagoda2>.")
  }
  p2 <- con$getJointCountMatrix(raw=TRUE) %>% Matrix::t() %>%
    pagoda2::Pagoda2$new(n.cores=con$n.cores, verbose=verbose, ...)

  if (n.pcs > 0) {
    if (verbose) message("Estimating PCA... ")
    p2$reductions$PCA <- pcaFromConos(con$samples, ncomps=n.pcs, n.odgenes=n.odgenes, verbose=FALSE)
    if (verbose) message("Done\n")
  }

  p2$graphs$conos <- con$graph

  if (!("uninitializedField" %in% class(con$embedding))) {
    p2$embeddings$PCA$conos <- con$embedding
  }

  if (length(con$clusters) > 0) {
    con.clusters <- con$clusters %>% setNames(paste0("conos_", names(.))) %>%
      lapply(`[[`, "groups") %>% lapply(as.factor)
    p2$clusters %<>% c(con.clusters)
  }

  p2$clusters$dataset <- con$getDatasetPerCell()

  return(p2)
}


#' Utility function to generate a pagoda2 app from a conos object
#' 
#' @param conos Conos object
#' @param cdl list Optional list of raw matrices (so that gene merging doesn't have to be redone) (default=NULL)
#' @param metadata list Optional list of (named) metadata factors (default=NULL)
#' @param filename string Name of the *.bin file to seralize for the pagoda2 application if save=TRUE (default='conos_app.bin')
#' @param save boolean Save serialized *bin file specified in filename (default=TRUE)
#' @param n.cores integer Number of cores (default=1)
#' @param n.odgenes numeric Number of top overdispersed genes to use (dfault=3e3). From pagoda2::basicP2proc().
#' @param nPcs numeric Number of PCs to use (default=100). From pagoda2::basicP2proc().
#' @param k numeric Default number of neighbors to use in kNN graph (default=30). From pagoda2::basicP2proc().
#' @param perplexity numeric Perplexity to use in generating tSNE and largeVis embeddings (default=50). From pagoda2::basicP2proc().
#' @param log.scale boolean Whether to use log scale normalization (default=TRUE). From pagoda2::basicP2proc().
#' @param trim numeric Number of cells to trim in winsorization (default=10). From pagoda2::basicP2proc().
#' @param keep.genes optional set of genes to keep from being filtered out (even at low counts) (default=NULL). From pagoda2::basicP2proc().
#' @param min.cells.per.gene numeric Minimal number of cells required for gene to be kept (unless listed in keep.genes) (default=0). From pagoda2::basicP2proc().
#' @param min.transcripts.per.cell numeric Minimumal number of molecules/reads for a cell to be admitted (default=100). From pagoda2::basicP2proc().
#' @param get.largevis boolean Whether to caluclate largeVis embedding (default=TRUE). From pagoda2::basicP2proc().
#' @param get.tsne boolean Whether to calculate tSNE embedding (default=TRUE). From pagoda2::basicP2proc().
#' @param make.geneknn boolean Whether pre-calculate gene kNN (for gene search) (default=TRUE). From pagoda2::basicP2proc().
#' @param go.env GO environment for the organism of interest (default=NULL)
#' @param cell.subset string Cells to subset with the conos embedding conos$embedding. If NULL, uses all cells via rownames(conos$embedding) (default=NULL)
#' @param max.cells numeric Limit to the cells that are included in the conos. If Inf, there is no limit (default=Inf)
#' @param additional.embeddings list Additional embeddings to add to conos for the pagoda2 app (default=NULL)
#' @param test.pathway.overdispersion boolean Find all IDs using GO category against either org.Hs.eg.db ('hs') or org.Mm.eg.db ('mm') (default=FALSE
#' @param organism string Organism of interest, either 'hs' (Homo sapiens) or 'mm' (Mus musculus, i.e. mouse) (default=NULL). Only used if test.pathway.overdispersion is TRUE. If NULL and test.pathway.overdispersion=TRUE, then 'hs' is used.
#' @param return.details boolean If TRUE, return list of p2 application, pagoda2 object, list of raw matrices, and cell names. If FALSE, simply return pagoda2 app object. (default=FALSE)
#' @return pagoda2 app object
#'
#' @export 
p2app4conos <- function(conos, cdl=NULL, metadata=NULL, filename='conos_app.bin', 
  save=TRUE, n.cores=1, n.odgenes=3e3, nPcs=100, k=30, perplexity=50, log.scale=TRUE, trim=10, 
  keep.genes=NULL, min.cells.per.gene=0, min.transcripts.per.cell=100, get.largevis=TRUE, 
  get.tsne=TRUE, make.geneknn=TRUE, go.env=NULL, cell.subset=NULL, max.cells=Inf, 
  additional.embeddings=NULL, test.pathway.overdispersion=FALSE, organism=NULL, return.details=FALSE) {
  
  if (!requireNamespace("pagoda2", quietly = TRUE)) {
    stop("You have to install the pagoda2 package to use p2app4conos(). Please refer to <https://github.com/kharchenkolab/pagoda2>.")
  }
  if (!requireNamespace("GO.db", quietly = TRUE)) {
    stop("Package \"GO.db\" is needed for this function to work. Please install it from Bioconductor.", call. = FALSE)
  }
  if (!requireNamespace("org.Hs.eg.db", quietly = TRUE)) {
    stop("Package \"org.Hs.eg.db\" is needed for this function to work. Please install it from Bioconductor.", call. = FALSE)
  }
  if (!requireNamespace("org.Mm.eg.db", quietly = TRUE)) {
    stop("Package \"org.Mm.eg.db\" is needed for this function to work. Please install it from Bioconductor.", call. = FALSE)
  }
  if (!requireNamespace("AnnotationDbi", quietly = TRUE)) {
    stop("Package \"AnnotationDbi\" is needed for this function to work. Please install it from Bioconductor.", call. = FALSE)
  }

  if (is.null(conos$embedding)){
    stop("The input Conos object must contain an embedding. Please generate with the method embedGraph().")
  }
  if (is.null(cdl)) {
    #cdl <- lapply(conos$samples,function(p) t(p$misc$rawCounts))
    cdl <- lapply(rawMatricesWithCommonGenes(conos),t)
  }
  samf <- lapply(conos$samples,function(x) rownames(x$counts))
  samf <- as.factor(setNames(rep(names(samf),unlist(lapply(samf,length))),unlist(samf)))

  if (!is.null(cell.subset)) {
    cell.subset <- intersect(cell.subset,rownames(conos$embedding))
  } else {
    cell.subset <- rownames(conos$embedding)  
  }
  
  samf <- droplevels(samf[names(samf) %in% cell.subset])
  cdl <- lapply(cdl, function(x) x[,colnames(x) %in% cell.subset,drop=FALSE])
  
  # limit to the cells that are included in the conos
  vc <- unlist(lapply(cdl,colnames))
  if (length(vc)>max.cells) { # subsample
    cat("subsampling",length(vc),"cells down to",max.cells,'...')
    vc <- sample(vc,max.cells)
    cat('done\n')
  } 
  cdl <- lapply(cdl,function(d) d[,colnames(d) %in% intersect(vc,names(samf))])
  cm <- do.call(cbind,cdl)

  cp2 <- pagoda2::basicP2proc(cm, n.cores=n.cores, n.odgenes=n.odgenes, nPcs=nPcs, k=k, perplexity=perplexity, log.scale=log.scale, min.cells.per.gene=min.cells.per.gene, min.transcripts.per.cell=min.transcripts.per.cell, get.largevis=get.largevis, get.tsne=get.tsne, make.geneknn=make.geneknn)
  
  if (test.pathway.overdispersion) {
    if (is.null(organism) || organism =='hs') {
      ids <- unlist(lapply(mget(colnames(cp2$counts),org.Hs.eg.db::org.Hs.egALIAS2EG,ifnotfound=NA),function(x) x[1]))
      rids <- names(ids)
      names(rids) <- ids
      # list all the ids per GO category
      go.env <- list2env(eapply(org.Hs.eg.db::org.Hs.egGO2ALLEGS, function(x) as.character(na.omit(rids[x]))))
    } else if (organism =='mm') { 
      # translate gene names to ids
      ids <- unlist(lapply(mget(colnames(cp2$counts),org.Mm.eg.db::org.Mm.egALIAS2EG,ifnotfound=NA),function(x) x[1]))
      # reverse map
      rids <- names(ids)
      names(rids) <- ids
      # list all the ids per GO category
      go.env <- list2env(eapply(org.Mm.eg.db::org.Mm.egGO2ALLEGS, function(x) as.character(na.omit(rids[x]))))

    } else { 
      stop("Unknown organism. Currently only 'hs' (human) and 'mm' (mouse) supported.")
    }
  }
    
  if (!is.null(go.env)) {
    #cp2$getHierarchicalDiffExpressionAspects(type='PCA',clusterName='community',z.threshold=3)
    # here we test for pathway overdispersion, but recursive diff expression, as shown above will work just as well
    cp2$testPathwayOverdispersion(go.env,verbose=TRUE,correlation.distance.threshold=0.95,recalculate.pca=FALSE,top.aspects=15)
    termDescriptions <- AnnotationDbi::Term(GO.db::GOTERM[names(go.env)]) # saves a good minute or so compared to individual lookups
    geneSets <- lapply(sn(names(go.env)),function(x) {
      list(properties=list(locked=TRUE,genesetname=x,shortdescription=as.character(termDescriptions[x])),genes=c(go.env[[x]]))
    })
  } else {
    hdea <- cp2$getHierarchicalDiffExpressionAspects(type='PCA',z.threshold=3)
    geneSets <- pagoda2::hierDiffToGenesets(hdea)
  }
  

  # add all kinds of embeddings
  # various joint embeddding versions
  #cp2$embeddings$Conos <- list("All"=t(conO$embedding),"cochlea"=t(conC$embedding),"DRG"=t(conD$embedding),"Merged"=t(conO$embedding),"Refined"=t(con2$embedding))
  cn <- rownames(cp2$counts) # cell names
  cp2$embeddings$Conos <- list("All"=conos$embedding[cn,])
  # hack: add placeholder spaces, so that old p2 version picks up the new embeddings
  cp2$reductions$Conos <- list() 

  cp2$embeddings$sample <- lapply(conos$samples,function(x) { em <- x$embeddings$PCA[[1]]; em[rownames(em) %in% cn,] }) # embeddings of the individual samples
  cp2$embeddings$sample <- cp2$embeddings$sample[!unlist(lapply(cp2$embeddings$sample,is.null))]
  if (length(cp2$embeddings$sample)>1) {
    cp2$reductions$sample <- list()
  } else {
    cp2$embeddings$sample <- NULL
    
  }
  
  if (!is.null(additional.embeddings)) {
    cp2$embeddings$Other <- lapply(additional.embeddings, function(em) { em[rownames(em) %in% cn,] })
    cp2$reductions$Other <- list()
  }
  
  # additional metadata with different factors .. you probably want to include something like sample or tissue (or patient type)
  metadata <- c(metadata,list(sample=samf))
  metadata <- lapply(metadata, function(d) d[cn])
  meta <- lapply(sn(names(metadata)),function(n) pagoda2::p2.metadata.from.factor(droplevels(as.factor(metadata[[n]])),displayname=n))
  
  p2app <- pagoda2::make.p2.app(cp2, dendrogramCellGroups = as.factor(conos$clusters[[1]]$groups[cn]), additionalMetadata = meta, geneSets = geneSets,innerOrder='odPCA')
  
  # Optional showing of app
  #show.app(p2app, name='newPagoda',browse=FALSE)
  # Save serialised web object, RDS app and session image
  if(save){
    p2app$serializeToStaticFast(binary.filename = filename,verbose=TRUE)
  } 
  if(return.details) {
    return(list(app=p2app,p2=cp2,cdl=cdl,cn=cn))
  } else {
    invisible(p2app)
  }
}



#' Filter genes by requiring minimum average expression within at least one of the provided cell clusters
#'
#' @param emat spliced (exonic) count matrix
#' @param clusters named cell factor defining clusters
#' @param min.max.cluster.average numeric Required minimum average expression count (no normalization is perfomed) (default=0.1)
#' @return filtered emat matrix
#' @keywords internal
filter.genes.by.cluster.expression <- function(emat, clusters, min.max.cluster.average=0.1) {
  if(!any(colnames(emat) %in% names(clusters))) stop("provided clusters do not cover any of the emat cells!")
  vc <- intersect(colnames(emat),names(clusters))
  cl.emax <- apply(do.call(cbind,tapply(vc,as.factor(clusters[vc]),function(ii) Matrix::rowMeans(emat[,ii]))),1,max)
  vi <- cl.emax>min.max.cluster.average;
  emat[vi,]
}



#' A slightly faster way of calculating column correlation matrix
#' @param mat matrix whose columns will be correlated
#' @param nthreads number of threads to use 
#' @return correlation matrix 
#' @keywords internal
armaCor <- function(mat,nthreads=1) {
  cd <- arma_mat_cor(mat);
  rownames(cd) <- colnames(cd) <- colnames(mat);
  return(cd)
}


#' RNA velocity analysis on samples integrated with conos
#' Create a list of objects to pass into gene.relative.velocity.estimates function from the velocyto.R package
#'
#' @param cms.list list of velocity files written out as cell.counts.matrices.rds files by running dropest with -V option
#' @param con conos object (after creating an embedding and running leiden clustering)
#' @param clustering name of clustering in the conos object to use (default=NULL). Either 'clustering' or 'groups' must be provided. 
#' @param groups set of clusters to use (default=NULL). Ignored if 'clustering' is not NULL. 
#' @param n.odgenes numeric Number of overdispersed genes to use for PCA (default=2000).
#' @param verbose boolean Whether to use verbose mode (default=TRUE)
#' @param min.max.cluster.average.emat Required minimum average expression count for emat, the spliced (exonic) count matrix (default=0.2). Note: no normalization is perfomed. See the parameter 'min.max.cluster.average' in the function 'filter.genes.by.cluster.expression.'
#' @param min.max.cluster.average.nmat Required minimum average expression count for nmat, the unspliced (nascent) count matrix (default=0.05). Note: no normalization is perfomed. See the parameter 'min.max.cluster.average' in the function 'filter.genes.by.cluster.expression.'
#' @param min.max.cluster.average.smat Required minimum average expression count for smat, the spanning read matrix (used in offset calculations) (default=0.01). Note: no normalization is perfomed. See the parameter 'min.max.cluster.average' in the function 'filter.genes.by.cluster.expression.'
#' @return List with cell distances, combined spliced expression matrix, combined unspliced expression matrix, combined matrix of spanning reads, cell colors for clusters and embedding (taken from conos)
#' @export
velocityInfoConos <- function(cms.list, con, clustering=NULL, groups=NULL, n.odgenes=2e3, verbose=TRUE, min.max.cluster.average.emat=0.2, min.max.cluster.average.nmat=0.05, min.max.cluster.average.smat=0.01) {

  groups <- parseCellGroups(con, clustering, groups)
  cell.colors <- fac2col(groups)

  if (!is.null(con$embedding)){
    emb <- con$embedding
  } else {
    stop("No embedding found in the conos object. Run 'con$embedGraph()' before running this function.")
  }

  if (verbose) message("Merging raw count matrices...\n")
  # Merge samples to get names of relevant cells and genes
  raw.count.matrix.merged <- con$getJointCountMatrix(raw=TRUE)

  if (verbose) message("Merging velocity files...\n")
  # Intersect genes and cells between the conos object and all the velocity files
  cms.list <- lapply(cms.list, prepareVelocity, genes=colnames(raw.count.matrix.merged), cells=rownames(raw.count.matrix.merged))
  # Keep only genes present in velocity files from all the samples
  common.genes <-  Reduce(intersect, lapply(cms.list, function(x) {rownames(x[[1]])}))
  cms.list <- lapply(cms.list, function(x) {lapply(x, function(y) {y[row.names(y) %in% common.genes,]} )} )

  # Merge velocity files from different samples
  emat <- do.call(cbind, lapply(cms.list, function(x) {x[[1]]}))  ## emat - spliced (exonic) count matrix
  nmat <- do.call(cbind, lapply(cms.list, function(x) {x[[2]]}))  ## nmat - unspliced (nascent) count matrix
  smat <- do.call(cbind, lapply(cms.list, function(x) {x[[3]]}))  ## smat - optional spanning read matrix (used in offset calculations)

  # Keep the order of cells consistent between velocity matrices and the embedding (not really sure whether it's necessary...)
  emat <- emat[,order(match(colnames(emat), rownames(emb)))]
  nmat <- nmat[,order(match(colnames(nmat), rownames(emb)))]
  smat <- smat[,order(match(colnames(smat), rownames(emb)))]

  if (verbose) message("Calculating cell distances...\n")
  # Get PCA results for all the samples from the conos object
  pcs <- pcaFromConos(con$samples, n.odgenes=n.odgenes)
  # Again, keep the order of cells consistent
  pcs <- pcs[order(match(rownames(pcs), rownames(emb))),]
  # Calculate the cell distances based on correlation
  cell.dist <- as.dist(1 - armaCor(t(pcs)))

  if (verbose) message("Filtering velocity...\n")
  emat %<>% filter.genes.by.cluster.expression(groups, min.max.cluster.average=min.max.cluster.average.emat)
  nmat %<>% filter.genes.by.cluster.expression(groups, min.max.cluster.average=min.max.cluster.average.nmat)
  smat %<>% filter.genes.by.cluster.expression(groups, min.max.cluster.average=min.max.cluster.average.smat)

  if (verbose) message("All Done!")
  return(list(cell.dist=cell.dist, emat=emat, nmat=nmat, smat=smat, cell.colors=cell.colors, emb=emb))
}