R/count.hits.R
In GRIN2: Genomic Random Interval (GRIN)

Documented in count.hits

#' Count Gene Lesion Hits
#'
#' @description
#' Computes the number of genomic lesions ("hits") affecting each gene by lesion category. It also calculates the number of unique subjects whose lesions overlap each gene by lesion type.
#'
#' @param ov.data A list of six `data.frame` objects generated by the `find.gene.lsn.overlaps()` function, containing gene-lesion overlap data and supporting indices.
#'
#' @details
#' This function summarizes the output of `find.gene.lsn.overlaps()` by generating two key matrices:
#'
#' - **nsubj.mtx**: For each gene, the number of *unique subjects* with at least one overlapping lesion of each type.
#' - **nhit.mtx**: For each gene, the *total number of overlapping lesions* (hits), regardless of subject redundancy, categorized by lesion type.
#'
#' For example, if the gene **NOTCH1** is affected by three separate mutations in the same subject, that subject will be counted once in `nsubj.mtx`, but all three hits will be counted in `nhit.mtx`.
#'
#' @return
#' A list containing the following components:
#' \item{lsn.data}{Original input lesion data.}
#' \item{lsn.index}{A `data.frame` indexing the rows in `gene.lsn.data` that correspond to each lesion.}
#' \item{gene.data}{Original input gene annotation data.}
#' \item{gene.index}{A `data.frame` indexing the rows in `gene.lsn.data` that correspond to each chromosome.}
#' \item{nhit.mtx}{A numeric `matrix` where rows correspond to genes (in the row order of `gene.data`) and columns to lesion types. Each value is the number of hits (lesions) of a certain type affecting the gene.}
#' \item{nsubj.mtx}{A numeric `matrix` with the same structure as `nhit.mtx`, but showing the number of unique subjects with at least one hit of each lesion type per gene.}
#' \item{gene.lsn.data}{A `data.frame` where each row represents a gene overlapped by a lesion. Includes gene name (`gene`) and subject ID (`ID`).}
#' \item{glp.data}{A `data.frame` ordered by gene and lesion start positions. The `cty` column encodes event boundaries: 1 = gene start, 2 = lesion start, 3 = lesion end, 4 = gene end.}
#'
#' @export
#'
#' @references
#' Pounds, S. et al. (2013). A genomic random interval model for statistical analysis of genomic lesion data.
#'
#' Cao, X., Elsayed, A. H., & Pounds, S. B. (2023). Statistical Methods Inspired by Challenges in Pediatric Cancer Multi-omics.
#'
#' @author
#' Abdelrahman Elsayed \email{abdelrahman.elsayed@stjude.org}, Stanley Pounds \email{stanley.pounds@stjude.org}
#'
#' @seealso \code{\link{prep.gene.lsn.data}}, \code{\link{find.gene.lsn.overlaps}}
#'
#' @examples
#' data(lesion_data)
#' data(hg38_gene_annotation)
#'
#' # Prepare gene and lesion data for GRIN analysis:
#' prep.gene.lsn <- prep.gene.lsn.data(lesion_data, hg38_gene_annotation)
#'
#' # Identify overlapping gene-lesion events:
#' gene.lsn.overlap <- find.gene.lsn.overlaps(prep.gene.lsn)
#'
#' # Count the number of subjects and lesions (hits) affecting each gene:
#' count.nsubj.nhits <- count.hits(gene.lsn.overlap)

count.hits=function(ov.data) # output results of find.gene.lsn.overlaps function
{
  # Summarizes gene-lesion overlaps into two gene-by-lesion-type count matrices:
  #   nhit.mtx  - total number of overlapping lesions (hits) per gene and type
  #   nsubj.mtx - number of distinct subjects (ID) with >= 1 hit per gene and type
  # The remaining components of ov.data are passed through to the result.
  lsn.data=ov.data$lsn.data
  lsn.index=ov.data$lsn.index
  gene.lsn.hits=ov.data$gene.lsn.hits
  gene.lsn.data=ov.data$gene.lsn.data
  gene.data=ov.data$gene.data
  gene.index=ov.data$gene.index

  # Dimensions shared by both count matrices: one row per row of gene.data,
  # one column per distinct lesion type found in lsn.index
  g=nrow(gene.data)
  lsn.types=sort(unique(lsn.index[,"lsn.type"]))
  k=length(lsn.types)

  # Local helper: cross-tabulate gene.row by lsn.type for a set of hits and
  # place the counts into a zero-initialized g x k matrix. The gene.row values
  # are used directly as row indices into that matrix.
  tabulate.by.type=function(hits)
  {
    cnt.mtx=matrix(0,g,k)
    colnames(cnt.mtx)=lsn.types

    # No overlaps at all: every count stays zero
    if (NROW(hits)==0) return(cnt.mtx)

    cnt.tbl=table(hits$gene.row,
                  hits$lsn.type)
    tbl.rows=as.numeric(rownames(cnt.tbl))

    # seq_len() (not 1:ncol) so that a table without columns is skipped
    # instead of iterating over c(1,0)
    for (i in seq_len(ncol(cnt.tbl)))
      cnt.mtx[tbl.rows,colnames(cnt.tbl)[i]]=cnt.tbl[,i]

    cnt.mtx
  }

  # Compute the number of hits matrix
  nhit.mtx=tabulate.by.type(gene.lsn.hits)

  # Compute the matrix of the number of subjects with a hit.
  # Keep only the first hit for each (gene, subject, lesion type) combination.
  # Subjects and lesion types are replaced by integer codes before pasting so
  # that the "_" separator stays unambiguous even when an ID or lesion type
  # itself contains "_" (e.g. ID "A_1"/type "x" versus ID "A"/type "1_x").
  subj.code=match(gene.lsn.hits$ID,unique(gene.lsn.hits$ID))
  type.code=match(gene.lsn.hits$lsn.type,unique(gene.lsn.hits$lsn.type))
  gene.subj.type=paste(gene.lsn.hits$gene.row,
                       subj.code,
                       type.code,
                       sep="_")
  dup.gene.subj.type=duplicated(gene.subj.type)

  subj.gene.hits=gene.lsn.hits[!dup.gene.subj.type,]

  nsubj.mtx=tabulate.by.type(subj.gene.hits)

  res=list(lsn.data=lsn.data,  # Input lesion data
           lsn.index=lsn.index, # data.frame that shows row start and row end for each lesion in the gene.lsn.data table
           gene.data=gene.data, # Input gene annotation data
           gene.index=gene.index, # data.frame that shows ordered row start and row end for each chromosome in the gene.lsn.data table
           nhit.mtx=nhit.mtx, # A data matrix with number of hits in each gene by lesion type
           nsubj.mtx=nsubj.mtx, # A data matrix with number of affected subjects by lesion type
           gene.lsn.data=gene.lsn.hits, # Each row represent a gene overlapped by a certain lesion. Column "gene" shows the overlapped gene and ID column has the patient ID
           glp.data=gene.lsn.data) # data.frame ordered by gene and lesions start position. Gene start position is coded as 1 in the cty column and gene end position is coded as 4. Lesion start position is coded as 2 in the cty column and lesion end position is coded as 3

  return(res)

}