R/synthetic_datasets.R
In cocoreg: Extract Shared Variation in Collections of Data Sets Using Regression Models

Documented in create_syndata_mv create_syn_data_puvar create_syndata_pwl create_syn_data_toy create_syn_data_uds create_syn_data_uvar create_Z_linear dl_scale variation_shared_by

#Copyright (c) 2016 Jussi Korpela (Finnish Institute of Occupational Healt, jussi.korpela@ttl.fi) and Andreas Henelius (Finnish Institute of Occupational Healt, andreas.henelius@iki.fi)
#Permission is hereby granted, free of charge, to any person obtaining a copy
#of this software and associated documentation files (the "Software"), to deal
#in the Software without restriction, including without limitation the rights
#to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
#copies of the Software, and to permit persons to whom the Software is
#furnished to do so, subject to the following conditions:
#The above copyright notice and this permission notice shall be included in
#all copies or substantial portions of the Software.
#THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
#IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
#FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
#AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
#LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
#OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
#THE SOFTWARE.

## -----------------------------------------------------------------------------
## Functions to create synthetic datasets
## -----------------------------------------------------------------------------
#' Contains functions to create synthetic datasets with different properties.
#' The create_syn_data_*() functions follow the scheme:
#' "total variation = shared_by_all + shared_by_subset + noise"


## -----------------------------------------------------------------------------
## Datasets with linear relations among shared components
## -----------------------------------------------------------------------------

#' Create signals
#' 
#' @param N Number of observations in data as integer
#' @param decorrelate (optional) Should the variables be de-correlated?
#'
#' @return A [N,3] matrix of signals
#' 
#' @export
create_Z_linear <- function(N, decorrelate = T){
  nlf <- function(x){x^2}
  
  sig1 <- seq.int(N) #linear trend
  sig2 <- sin(2*seq.int(N)*(2*pi/N)) #slow sine
  sig3 <- sin(4*seq.int(N)*(2*pi/N)) #fast sine 
  
  Z <- matrix(c(sig1,sig2,sig3), nrow=N, ncol=3, byrow=F)
  
  if (decorrelate){
    pca <- prcomp(Z)
    Z <- pca$x   
  }

  Z <- scale(Z)
}


#' An illustrative synthetic data collection
#' 
#' @description 
#' Model: D_k = D_shared_by_all + D_shared_by_subset + D_unique,
#'
#' @param normalize (optional) Should the data be processed with dl_scale()? A boolean value. 
#' @param noisemf (optional) Multiplication factor for noise
#' @inheritParams create_Z_linear
#' 
#' @return A list with elements
#' \item{data}{Data collection as a list of data.frames}
#' \item{Z_all}{Signals shared by all datasets in the collection}
#' \item{Z_sub}{Signals not shared by all datasets}
#' \item{W_all}{Mixing weights for Z_all}
#' \item{W_sub}{Mixing weights for Z_sub}
#' \item{E}{Noise}
#' \item{var.coef}{Noise multiplication factor used}
#' 
#' @examples
#' \dontrun{
#' dc <- create_syn_data_toy()
#' ggplot_dflst(dc$data, ncol = 1)
#' }
#'
#' @export
create_syn_data_toy <- function(N = 100, 
                                 normalize = T,
                                 noisemf = 0.1 ){
  # hardcoded values:
  K = 4
  dk = 3
  max_var_arr = rep(noisemf, K)
  
  set.seed(10) # Affects weights drawn from runif(). Selected such that most 
  # weights differ from zero and hence the data has variation
  
  ## Load latent activities
  Z <- create_Z_linear(N, decorrelate = F)
  Z_all <- Z[,1:2]
  Z_sub <- Z[,3]

  W_all <- vector(mode = 'list', length = K)
  W_sub <- vector(mode = 'list', length = K)		
  for (k in 1:K) {
    W_all[k] <- list(matrix(runif(2*dk, -1, 1), nrow=2, ncol=dk))
    W_sub[] <- list( 2 * matrix(runif(dk, -1, 1), nrow=1, ncol=dk))
  }
  W_sub[4] <- list(matrix(0, nrow=1, ncol=dk)) #ds4 not part of subset
  
  ## Create datasets
  D <- vector(mode='list', length=K)
  E <- vector(mode='list', length=K)
  for (k in seq.int(K)){
    set.seed(2)
    E[k] <- list(max_var_arr[k] * matrix(rnorm(N*dk), nrow=N, ncol=dk)) #noise [N,D_k]
    D[k] <- list(Z_all %*% W_all[[k]] + Z_sub %*% W_sub[[k]] + E[[k]])
    D[[k]] <- as.data.frame(D[[k]])
    colnames(D[[k]]) <- paste(sprintf("x%d",k), as.character(seq.int(dk)), sep="_")
    rownames(D[[k]]) <- as.character(seq.int(N))
  }
  names(D) <- sprintf('DS%d', 1:K)
  
  #ggplot_dflst(D,ncol=1)
  D <- dl_scale(D, center = T, scale = F)
  
  ## Create output
  attr(D,'id') <- "syn"
  D <- validate_data(D)
  list("data" = D,
       "Z_all" = Z_all, "Z_sub"=Z_sub,
       "W_all" = W_all, "W_sub"=W_sub,
       "E"=E, "var.coef"=max_var_arr)
}


#' A data collection with one unrelated dataset
#' 
#' @return A list with elements
#' \item{data}{Data collection as a list of data.frames}
#' \item{Z_all}{Signals shared by all datasets in the collection}
#' \item{Z_sub}{Signals not shared by all datasets}
#' \item{W_all}{Mixing weights for Z_all}
#' \item{W_sub}{Mixing weights for Z_sub}
#' \item{E}{Noise}
#' \item{var.coef}{Noise multiplication factor used}
#' 
#' @export
create_syn_data_uds <- function(){
  dc <- create_syn_data_toy()
  d4 <- ncol(dc$data[[4]])
  
  dc$data[[4]][,1:d4] <- rnorm( prod(dim(dc$data[[4]])) )
  for (k in seq.int(length(dc$W_all))){
    dc$W_all[[k]] <- matrix(0, nrow=nrow(dc$W_all[[k]]),
                               ncol=ncol(dc$W_all[[k]]))
    dc$W_sub[[k]] <- matrix(0, nrow=nrow(dc$W_sub[[k]]),
                               ncol=ncol(dc$W_sub[[k]]))    
    
  }
  dc$E[[4]] <- as.matrix(dc$data[[4]])
  
  dc
}


#' A collection with unrelated variables
#'
#' @return A list with elements
#' \item{data}{Data collection as a list of data.frames}
#' \item{Z_all}{Signals shared by all datasets in the collection}
#' \item{Z_sub}{Signals not shared by all datasets}
#' \item{W_all}{Mixing weights for Z_all}
#' \item{W_sub}{Mixing weights for Z_sub}
#' \item{E}{Noise}
#' \item{var.coef}{Noise multiplication factor used}
#' 
#' @export
create_syn_data_uvar <- function(){
  dc <- create_syn_data_toy()
  N <- nrow(dc$data[[1]])
  
  dc$data[[4]][,2] <- 0.5*rnorm( nrow(dc$data[[4]]) )
  dc$W_all[[4]][,2] <- 0
  dc$W_sub[[4]][,2] <- 0
  dc$E[[4]][,2] <- dc$data[[4]][,1]
  
  dc
}


#' A data collection with variables that "become unrelated during measurement"
#'
#' @return A list with elements
#' \item{data}{Data collection as a list of data.frames}
#' \item{Z_all}{Signals shared by all datasets in the collection}
#' \item{Z_sub}{Signals not shared by all datasets}
#' \item{W_all}{Mixing weights for Z_all}
#' \item{W_sub}{Mixing weights for Z_sub}
#' \item{E}{Noise}
#' \item{var.coef}{Noise multiplication factor used}
#' 
#' @export
create_syn_data_puvar <- function(){
  dc <- create_syn_data_toy()
  N <- nrow(dc$data[[1]])
  
  dc$data[[4]][(N/2):N,1] <- 0.5*rnorm((N/2)+1)
  dc$data[[4]][(N/2):N,2] <- 0.5*rnorm((N/2)+1)
  dc$W_all[[4]][,] <- NaN
  dc$W_sub[[4]][,] <- NaN
  dc$E[[4]][,] <- NaN
  
  dc
}


## -----------------------------------------------------------------------------
## Datasets with non-linear relations among shared components
## -----------------------------------------------------------------------------

#' A non-linear data collection using piecewise linearity
#' 
#' @return A list with elements
#' \item{data}{Data collection as a list of data.frames}
#' \item{Z}{Signals used}
#' \item{W}{Mixing matrix used}
#' \item{Z_all}{Signals shared by all datasets in the collection}
#' \item{Z_sub}{Signals not shared by all datasets}
#' \item{W_all}{Mixing weights for Z_all}
#' \item{W_sub}{Mixing weights for Z_sub as a list of matrices, one matrix per dataset}
#' \item{E}{Noise}
#' \item{var.coef}{Noise multiplication factor used}
#' 
#' @export
create_syndata_pwl <- function(){
  
  f <- function(z) { pmax(-6*z+1,-(z-1/3)/8+1/3)  }
  z <- seq(from=0,to=1,length.out=100) 
  s1 <- 10*z     # dataset 1
  s2 <- 10*f(z) # dataset 2
  
  Z <- matrix(c(s1,s2), ncol=2, byrow=F)
  
  # same variable added twice to avoid univariate data related errors in some
  # external packages
  w1 <- matrix(c(1,1,
                 0,0), nrow=2, ncol=2, byrow=T) 
  w2 <- matrix(c(0,0,
                 1,1), nrow=2, ncol=2, byrow=T)
  W <- list(w1,w2)  
  
  set.seed(42)
  d <- create_syndata_mv(Z, W,  max_var_arr=rep(0.1,length(W)))
  # set above max_var_arr = rep(0.5,length(W)) and GFA goes nuts.
  
  d$Z_all <- Z
  d$Z_sub <- matrix(0, nrow=nrow(Z), ncol=ncol(Z))
  d$W_all <- W
  d$W_sub <- rep(list( matrix(0, nrow=nrow(W[[1]]), ncol=ncol(W[[1]])) ),
                 length(W)) #doesn't really matter since Z_sub=0
  
  d$data <- dl_scale(d$data, center=T, scale=F)

  attr(d$data,'id') <- "syn_pwl"
  d
}


## -----------------------------------------------------------------------------
## Helper functions
##--------------------------------------------------------------------------

#' Create multivariate synthetic data
#' 
#' @param Z [N,L] matrix, Latent factors, N observations, L factors
#' @param W a [1,K] list of [L,D_k] matrices or [L,D,K] array, Projections from latent factors to 
#'        data, D_k variables per dataset
#' @param max_var_arr (optional) [1,K] numeric, Relative maximum amplitude of noise
#'
#' @return A list with elements:
#' \item{data}{Data collection as a list of data.frames}
#' \item{Z}{Signals used}
#' \item{W}{Mixing matrix used}
#' \item{E}{Noise}
#' \item{var.coef}{Noise multiplication factor used}
#' 
#' Each dataset is a data.frame to gain compatibility with lm() and glm()
#'
#' @export
create_syndata_mv <- function(Z, W, max_var_arr = rep(1,length(W))) {
  if (is.array(W)){
    K <- dim(W)[3]
    Dk_arr <- rep(dim(W)[2], K)
  } else {
    #assume list
    K <- length(W)
    Dk_arr <- sapply(W, ncol)
  }
  
  n_points <- nrow(Z)
  L = ncol(Z)
  
  ## Generate datasets (views)
  out <- vector(mode = "list", length=K)
  error_var_vec <- NULL
  E_lst <- vector(mode = "list", length=K)
  for (i in seq.int(K)){
    E <- max_var_arr[i] * matrix(rnorm(n_points*Dk_arr[i]),
                                 nrow=n_points, ncol=Dk_arr[i]) #noise [N,D_m]
    E_lst[[i]] <- E
    
    if (is.array(W)){
      k_df <- Z %*% W[,,i] + E
    } else {
      k_df <- Z %*% W[[i]] + E  
    }
    
    k_df <- as.data.frame(k_df)
    colnames(k_df) <- paste(sprintf("x%d",i), as.character(seq.int(Dk_arr[i])),
                            sep="_")
    rownames(k_df) <- as.character(seq.int(n_points))
    out[[i]] <- k_df
  }
  names(out) <- sprintf("D%d",1:K)
  attr(out,'id') <- "syn_nl"
  list("data" = out, "Z" = Z, "W" = W, "E"=E_lst, "var.coef"=max_var_arr)
}


#' Run scale() on a list of data.frames
#' 
#' @param dl A list of data.frames
#' @param ... Additional arguments to be passed on to scale
#' 
#' @return A list of data.frames that have been processed using scale() and
#' converted back to data.frame
#' 
#' @export
dl_scale <- function(dl,...){
  dl <- lapply(dl, function(x){as.data.frame(scale(x,...))})
  dl
}


#' Return a specific variation component
#' 
#' @description 
#' Variation can be shared by:
#'  'all'     all datasets
#'  'subset'  a subset of the datasets (excluding variation already in 'all')
#'  'all_and_subset' a union of the above
#'  
#'  The returned data never contains noise (which is considered to be part
#'  of each datasets unique variation). The linear toy datasets do not contain 
#'  variation unique to a dataset other than pure noise.
#'  
#' @param dc    A data collection from one of the create_syn_data_*() functions
#' @param type  Type of variation to extract, allowed values c('all','subset','all_and_subset')
#' @param center (optional) Should the output data be centered?
#' @param scale (optional) Should the output data be scaled?
#' 
#' @return A list of data.frames containing the desired variation component
#' 
#' @examples
#' \dontrun{
#' dc <- create_syn_data_toy()
#' ldSharedByAll = variation_shared_by(dc, "all", center = F) 
#' ldSharedBySome = variation_shared_by(dc, "subset", center = F) 
#' ldNonUnique = variation_shared_by(dc, "all_and_subset", center = F) 
#' dNoise <- mapply(function(x,y){x-y}, x=dc$data, y=ldNonUnique, SIMPLIFY = F)
#' ggplot_dclst(list(observed = dc$data,
#'                 shared.by.all = ldSharedByAll,
#'                 shared.by.some = ldSharedBySome,
#'                 noise = dNoise),
#'             ylim = c(-3, 3))
#' }
#' 
#' @export
variation_shared_by <- function(dc, type, center = T, scale = F){
  
  mprod <- function(W,Z){as.data.frame(Z %*% W)}
  
  if (type=="all"){
    D <- lapply(dc$W_all, mprod, Z = dc$Z_all)
    
  } else if (type=="subset") {
    D <- lapply(dc$W_sub, mprod, Z = dc$Z_sub)
    
  } else if (type=="all_and_subset") {
    D1 <- lapply(dc$W_all, mprod, Z = dc$Z_all)
    D2 <- lapply(dc$W_sub, mprod, Z = dc$Z_sub)
    D <- mapply(function(x,y){x+y}, D1, D2, SIMPLIFY = F)
    
  } else {stop("variation_shared_by(): Unknown value for ''type''.")}
  
  dmeta <- get_dc_meta(dc$data)
  D <- apply_dc_meta(D, dmeta)
  
  D <- dl_scale(D, center=center, scale=scale)
  
  D
}

Any scripts or data that you put into this service are public.

cocoreg documentation built on May 2, 2019, 6:48 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

cocoreg
Extract Shared Variation in Collections of Data Sets Using Regression Models

R/synthetic_datasets.R
In cocoreg: Extract Shared Variation in Collections of Data Sets Using Regression Models

Defines functions create_Z_linear create_syn_data_toy create_syn_data_uds create_syn_data_uvar create_syn_data_puvar create_syndata_pwl create_syndata_mv dl_scale variation_shared_by

Documented in create_syndata_mv create_syn_data_puvar create_syndata_pwl create_syn_data_toy create_syn_data_uds create_syn_data_uvar create_Z_linear dl_scale variation_shared_by

Try the cocoreg package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

cocoreg Extract Shared Variation in Collections of Data Sets Using Regression Models

R/synthetic_datasets.R In cocoreg: Extract Shared Variation in Collections of Data Sets Using Regression Models

Defines functions create_Z_linear create_syn_data_toy create_syn_data_uds create_syn_data_uvar create_syn_data_puvar create_syndata_pwl create_syndata_mv dl_scale variation_shared_by

Documented in create_syndata_mv create_syn_data_puvar create_syndata_pwl create_syn_data_toy create_syn_data_uds create_syn_data_uvar create_Z_linear dl_scale variation_shared_by

Try the cocoreg package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

cocoreg
Extract Shared Variation in Collections of Data Sets Using Regression Models

R/synthetic_datasets.R
In cocoreg: Extract Shared Variation in Collections of Data Sets Using Regression Models