R/submatrix.R
In spatialHeatmap: spatialHeatmap

Documented in submatrix

#' Subset Target Assayed Items and Their Nearest Neighbors 
#'
#' Given a vector of target assayed items (gene, protein, metabolite, \emph{etc}), this function selects nearest neighbors for every target item independently, which share most similar abundance profiles with the targets. The selection is based on correlation or distance matrix computed by \code{\link[stats]{cor}} or \code{\link[stats]{dist}} from the "stats" package respectively. One of three alternative arguments \code{p}, \code{n}, \code{v} sets a cutoff for the selection.

#' @param data A \code{data.frame} or \code{SummarizedExperiment} object returned by the function \code{\link{filter_data}}, where the columns and rows of the data matrix are samples/conditions and assayed items (\emph{e.g.} genes, proteins) respectively. Since this function builds on coexpression analysis, variables of sample/condition should be at least 5. Otherwise, the results are not reliable. 
#' @param ann Applies to \code{data} argument of \code{SummarizedExperiment}. The column name corresponding to row item annotation in the \code{rowData} slot. Default is NULL.
#' @param ID A vector of target item identifiers.
#' @param p The proportion of top items with most similar expression profiles with the target items. Only items within this proportion are returned. Default is 0.3. It applies to each target item independently, and selected items of each target are returned together.
#' @param n An integer of top items with most similar expression profiles with the target items. Only items within this number are returned. Default is NULL. It applies to each target independently, and selected items of each target are returned together.
#' @param v A numeric of correlation (-1 to 1) or distance (>=0) threshold to select items sharing the most similar expression profiles with the target items. If \code{fun='cor'}, only items with correlation coefficient larger than \code{v} are returned. If \code{fun='dist'}, only items with distance less than \code{v} are returned. Default is NULL. It applies to each target independently, and selected items of each target are returned together.
#' @param fun The function to calculate similarity/distance measure, 'cor' or 'dist', corresponding to \code{\link[stats]{cor}} or \code{\link[stats]{dist}} from the "stats" package respectively. Default is 'cor'.
#' @param cor.absolute Logical, TRUE or FALSE. Use absolute values or not. Only applies to \code{fun='cor'}. Default is FALSE, meaning the correlation coefficient preserves the negative sign when selecting items. 
#' @param arg.cor A list of arguments passed to \code{\link[stats]{cor}} in the "stats" package. \cr Default is \code{list(method="pearson")}.
#' @param arg.dist A list of arguments passed to \code{\link[stats]{dist}} in the "stats" package. \cr Default is \code{list(method="euclidean")}.
#' @param dir The directory where the folder "customComputedData" is created automatically to save the subsetted matrix as a TSV-format file "sub_matrix.txt", which is ready to upload to the Shiny app launched by \code{\link{shiny_all}}. In the "sub_matrix.txt", the rows are assayed items and column names are in the syntax "sample__condition". This argument should be the same with the \code{dir} in \code{\link{adj_mod}} so that the files "adj.txt" and "mod.txt" generated by \code{\link{adj_mod}} are saved in the same folder "customComputedData". The default is NULL and no file is saved. This argument is used only when using the "customComputedData" in the Shiny app.

#' @return The subsetted matrix of target items and their nearest neighbors.

#' @examples

#' ## In the following examples, the 2 toy data come from an RNA-seq analysis on development of 7
#' ## chicken organs under 9 time points (Cardoso-Moreira et al. 2019). For conveninece, they are
#' ## included in this package. The complete raw count data are downloaded using the R package
#' ## ExpressionAtlas (Keays 2019) with the accession number "E-MTAB-6769". Toy data1 is used as
#' ## a "data frame" input to exemplify data of simple samples/conditions, while toy data2 as
#' ## "SummarizedExperiment" to illustrate data involving complex samples/conditions.   
#' 
#' ## Set up toy data.
#' 
#' # Access toy data1.
#' cnt.chk.simple <- system.file('extdata/shinyApp/example/count_chicken_simple.txt', 
#' package='spatialHeatmap')
#' df.chk <- read.table(cnt.chk.simple, header=TRUE, row.names=1, sep='\t', check.names=FALSE)
#' # Columns follow the namig scheme "sample__condition", where "sample" and "condition" stands
#' # for organs and time points respectively.
#' df.chk[1:3, ]
#'
#' # A column of gene annotation can be appended to the data frame, but is not required.  
#' ann <- paste0('ann', seq_len(nrow(df.chk))); ann[1:3]
#' df.chk <- cbind(df.chk, ann=ann)
#' df.chk[1:3, ]
#'
#' # Access toy data2. 
#' cnt.chk <- system.file('extdata/shinyApp/example/count_chicken.txt', package='spatialHeatmap')
#' count.chk <- read.table(cnt.chk, header=TRUE, row.names=1, sep='\t')
#' count.chk[1:3, 1:5]
#'
#' # A targets file describing samples and conditions is required for toy data2. It should be made
#' # based on the experiment design, which is accessible through the accession number
#' # "E-MTAB-6769" in the R package ExpressionAtlas. An example targets file is included in this
#' # package and accessed below. 

#' # Access the example targets file. 
#' tar.chk <- system.file('extdata/shinyApp/example/target_chicken.txt', package='spatialHeatmap')
#' target.chk <- read.table(tar.chk, header=TRUE, row.names=1, sep='\t')
#' # Every column in toy data2 corresponds with a row in targets file. 
#' target.chk[1:5, ]
#' # Store toy data2 in "SummarizedExperiment".
#' library(SummarizedExperiment)
#' se.chk <- SummarizedExperiment(assay=count.chk, colData=target.chk)
#' # The "rowData" slot can store a data frame of gene annotation, but not required.
#' rowData(se.chk) <- DataFrame(ann=ann)
#'
#' ## As conventions, raw sequencing count data should be normalized, aggregated, and filtered to
#' ## reduce noise.
#'
#' # Normalize count data.
#' # The normalizing function "calcNormFactors" (McCarthy et al. 2012) with default settings
#' # is used.  
#' df.nor.chk <- norm_data(data=df.chk, norm.fun='CNF', data.trans='log2')
#' se.nor.chk <- norm_data(data=se.chk, norm.fun='CNF', data.trans='log2')

#' # Aggregate count data.
#' # Aggregate "sample__condition" replicates in toy data1.
#' df.aggr.chk <- aggr_rep(data=df.nor.chk, aggr='mean')
#' df.aggr.chk[1:3, ]

#' # Aggregate "sample_condition" replicates in toy data2, where "sample" is "organism_part" and
#' # "condition" is "age". 
#' se.aggr.chk <- aggr_rep(data=se.nor.chk, sam.factor='organism_part', con.factor='age',
#' aggr='mean')
#' assay(se.aggr.chk)[1:3, 1:3]

#' # Filter out genes with low counts and low variance. Genes with counts over 5 (log2 unit) in at
#' # least 1% samples (pOA), and coefficient of variance (CV) between 0.2 and 100 are retained.
#' # Filter toy data1.
#' df.fil.chk <- filter_data(data=df.aggr.chk, pOA=c(0.01, 5), CV=c(0.2, 100), dir=NULL)
#' # Filter toy data2.
#' se.fil.chk <- filter_data(data=se.aggr.chk, sam.factor='organism_part', con.factor='age', 
#' pOA=c(0.01, 5), CV=c(0.2, 100), dir=NULL)
#'
#' ## Select nearest neighbors for target genes 'ENSGALG00000019846' and 'ENSGALG00000000112',
#' ## which are usually genes visualized in spatial heatmaps.
#' # Toy data1.
#' df.sub.mat <- submatrix(data=df.fil.chk, ID=c('ENSGALG00000019846', 'ENSGALG00000000112'),
#' p=0.1)
#' # Toy data2.
#' se.sub.mat <- submatrix(data=se.fil.chk, ann='ann', ID=c('ENSGALG00000019846', 
#' 'ENSGALG00000000112'), p=0.1) 
#'
#' # In the following, "df.sub.mat" and "se.sub.mat" is used in the same way, so only
#' # "se.sub.mat" illustrated.
#'
#' # The subsetted matrix is partially shown below.
#' se.sub.mat[c('ENSGALG00000019846', 'ENSGALG00000000112'), c(1:2, 63)]

#' @author Jianhai Zhang \email{zhang.jianhai@@hotmail.com; jzhan067@@ucr.edu} \cr Dr. Thomas Girke \email{thomas.girke@@ucr.edu}

#' @references 
#' Langfelder P and Horvath S, WGCNA: an R package for weighted correlation network analysis. BMC Bioinformatics 2008, 9:559 doi:10.1186/1471-2105-9-559 \cr Peter Langfelder, Steve Horvath (2012). Fast R Functions for Robust Correlations and Hierarchical Clustering. Journal of Statistical Software, 46(11), 1-17. URL http://www.jstatsoft.org/v46/i11/ \cr R Core Team (2018). R: A language and environment for statistical computing. R Foundation for Statistical Computing, Vienna, Austria. URL https://www.R-project.org/ \cr Peter Langfelder, Bin Zhang and with contributions from Steve Horvath (2016). dynamicTreeCut: Methods for Detection of Clusters in Hierarchical Clustering Dendrograms. R package version 1.63-1. https://CRAN.R-project.org/package=dynamicTreeCut \cr Martin Morgan, Valerie Obenchain, Jim Hester and Hervé Pagès (2018). SummarizedExperiment: SummarizedExperiment container. R package version 1.10.1 
#' \cr Keays, Maria. 2019. ExpressionAtlas: Download Datasets from EMBL-EBI Expression Atlas
#' \cr Love, Michael I., Wolfgang Huber, and Simon Anders. 2014. "Moderated Estimation of Fold Change and Dispersion for RNA-Seq Data with DESeq2." Genome Biology 15 (12): 550. doi:10.1186/s13059-014-0550-8
#' \cr Cardoso-Moreira, Margarida, Jean Halbert, Delphine Valloton, Britta Velten, Chunyan Chen, Yi Shao, Angélica Liechti, et al. 2019. “Gene Expression Across Mammalian Organ Development.” Nature 571 (7766): 505–9 

#' @export submatrix
#' @importFrom SummarizedExperiment assay rowData
#' @importFrom stats cor dist

submatrix <- function(data, ann=NULL, ID, p=0.3, n=NULL, v=NULL, fun='cor', cor.absolute=FALSE, arg.cor=list(method="pearson"), arg.dist=list(method="euclidean"), dir=NULL) {

  options(stringsAsFactors=FALSE)
  # Process data.
  if (is(data, 'data.frame')|is(data, 'matrix')|is(data, 'DFrame')) {
    dat.lis <- check_data(data=data); data <- dat.lis$dat; ann <- dat.lis$row.meta
  } else if (is(data, 'SummarizedExperiment')) { 

    ann <- rowData(data)[ann]; data <- as.data.frame(assay(data)) 
    if (any(duplicated(rownames(data)))) stop('Please use function \'aggr_rep\' to aggregate replicates!')

  } else { stop('Accepted data classes are "data.frame", "matrix", "DFrame", or "SummarizedExperiment", except that "spatial_hm" also accepts a "vector".') }

  if (nrow(data)<5) cat('Warning: variables of sample/condition are less than 5! \n')
  na <- NULL; len <- nrow(data)
  if (len>=50000) cat('More than 50,000 rows are detected in data. Computation may take a long time! \n')
  if (sum(c(!is.null(p), !is.null(n), !is.null(v)))>1) return('Please only use one of \'p\', \'n\', \'v\' as the threshold!')

  if (fun=='cor') {

    dat.t <- t(data)
    m <- do.call(cor, c(x=list(dat.t[, ID]), y=list(dat.t), arg.cor))
    if (length(ID)==1) rownames(m) <- ID; m <- t(m)
    if (cor.absolute==TRUE) { m1 <- m; m <- abs(m) }
    na <- sub_na(mat=m, ID=ID, p=p, n=n, v=v)
    if (cor.absolute==TRUE) m <- m1
    m <- t(m)

  } else if (fun=='dist') { 

    m <- data.frame(); for (i in ID) {
    
      dis <- NULL; for (j in rownames(data)) {

        # Distances are transformed to negative.
        dis0 <- -do.call(dist, c(x=list(data[c(i, j), ]), arg.dist))[1]
        dis <- c(dis, dis0)

      }; m <- rbind(m, dis)

    }; row.names(m) <- ID; colnames(m) <- rownames(data)
    if (!is.null(v)) v <- -v
    na <- sub_na(mat=t(m), ID=ID, p=p, n=n, v=v); m <- -m

  }
  if (!is.null(dir)) { 

    dir <- normalizePath(dir, winslash="/", mustWork=FALSE); if (!dir.exists(dir)) stop(paste0(dir, ' does not exist!'))
    dir <- paste0(dir, "/customComputedData/"); if (!dir.exists(dir)) dir.create(dir)
    write.table(data[na, ], paste0(dir, '/sub_matrix.txt'), col.names=TRUE, row.names=TRUE, sep='\t')

  }
  return(cbind(data[na, ], ann[na, , drop=FALSE]))

}