# R/data.R

#' A Dataset Containing 3 Clusters of Normally Distributed Data
#'
#' A simulated dataset used in the `SWKM` vignette to illustrate sparse
#' weighted K-means clustering.
#'
#' @format A list with four items:
#' \describe{
#'   \item{data}{a 60 by 500 matrix: 60 observations (rows) with 500 features
#'   (columns). Only the first 50 features are cluster-specific.}
#'   \item{nCluster}{true number of clusters.}
#'   \item{true.label}{an integer vector of length 60 giving the cluster each
#'   observation should be assigned to.}
#'   \item{noisy.label}{an integer vector of length 10 giving the row indices
#'   of the noisy observations. The noisy observations have the same mean as
#'   the other observations in their cluster but a larger variance.}
#' }
#'
#' @source
#' This dataset is generated by the code shown in the example.
#'
#' @examples
#' \dontrun{
#' # Code used to generate the data matrix and the noisy-observation indices.
#' set.seed(1)
#' require(mvtnorm)
#' n <- 60  #sample size
#' p <- 500 #dimension of features
#' q <- 50  #dimension of cluster-specific features
#' mu <- 0.8
#' # Mean of the cluster-specific features in clusters 1, 2 and 3.
#' MU <- c(0,-mu,mu)
#' # Diagonal variance used for the noisy observations.
#' sigma0 <- 5
#' # Three blocks of n/3 rows each; only the first q features differ in mean.
#' data <- rbind(rmvnorm(n/3,rep(0,p)),rmvnorm(n/3,c(rep(-mu,q),rep(0,p-q))),
#'               rmvnorm(n/3,c(rep(mu,q),rep(0,p-q))))
#' # add noise to 10 random observations
#' noisy.lab <- sample(n,10)
#' for (k in 1:3){
#'   # Noisy indices that fall in the k-th block of n/3 rows.
#'   check <- (noisy.lab<n*k/3+1) & (noisy.lab>n/3*(k-1))
#'   temp.lab <- noisy.lab[check]
#'   num <- length(temp.lab)
#'   # Redraw those rows with the same cluster mean but covariance
#'   # diag(sigma0, p).
#'   if(any(check))
#'     data[temp.lab,] <- rmvnorm(num,c(rep(MU[k],q),rep(0,p-q)),sigma = diag(sigma0,p))
#' }
#' }
"NormalDisData"


#' A Dataset Simulating Single-Cell Epigenomic Data
#'
#' A simulated dataset used in the `SWKM` vignette to illustrate weighted
#' K-means clustering.
#'
#' @format A list with four items:
#' \describe{
#'   \item{data}{a 100 by 5000 matrix: 100 observations (rows) with 5000
#'   features (columns). Each element is a non-negative integer, which can be
#'   interpreted as a read count.}
#'   \item{nCluster}{true number of clusters.}
#'   \item{true.label}{an integer vector of length 100 giving the cluster each
#'   observation should be assigned to.}
#'   \item{noisy.label}{an integer vector of length 20 giving the positions of
#'   the noisy observations. 90\% of the features in the noisy observations
#'   are set to 0 to simulate missing values in real experiments.}
#' }
#'
#' @source
#' Jason D Buenrostro, \emph{et al.} (2015)
#' Single-cell chromatin accessibility reveals principles of regulatory variation.
#' \emph{Nature}, \bold{523(7561)}, 486-490.
#'
#' @examples
#' # Extract the data matrix and the two label vectors from the list.
#' data <- DMdata$data
#' true.label <- DMdata$true.label
#' noisy.label <- DMdata$noisy.label
"DMdata"
# Van1yu3/SWKM documentation built on Sept. 3, 2019, 7:50 a.m.