R/cocoreg.R
In cocoreg: Extract Shared Variation in Collections of Data Sets Using Regression Models

Documented in apply_dc_meta average_R2_dflst cocoreg cocoreg_by_path compose compose_all create_mappings dc_variability ds_variability generate_mapping_function generate_paths generate_paths_cyclic generate_paths_noncyclic get_dc_meta get_starting_dataset mapping_glmnet mapping_lmridge mapping_pcr mappings_R2_matrix matrix_variability pathify rename_variables row_suffle_variability svm_sigmoid validate_data vector_variability

#Copyright (c) 2016 Jussi Korpela (Finnish Institute of Occupational Healt, jussi.korpela@ttl.fi) and Andreas Henelius (Finnish Institute of Occupational Healt, andreas.henelius@iki.fi)
#Permission is hereby granted, free of charge, to any person obtaining a copy
#of this software and associated documentation files (the "Software"), to deal
#in the Software without restriction, including without limitation the rights
#to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
#copies of the Software, and to permit persons to whom the Software is
#furnished to do so, subject to the following conditions:
#The above copyright notice and this permission notice shall be included in
#all copies or substantial portions of the Software.
#THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
#IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
#FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
#AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
#LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
#OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
#THE SOFTWARE.

## -----------------------------------------------------------------------------
## The CoCoReg algorithm
## -----------------------------------------------------------------------------

#' Create a path
#'
#' @description 
#' Given a set of numbers and a starting point, create a path from
#' the starting point through each point. If cyclic = FALSE, the path
#' stops at p[length(p)] and if cyclic = TRUE the patch goes back to ind.
#'
#' @param ind [1,1] int, The starting point.
#' @param p [1,m] int, A set of numbers through which the path must go,
#'        excluding the starting point.
#' @param cyclic boolean, If TRUE, the path leads back to the starting point.
#'       If FALSE, the path stops at last element of p.
#'               
#' @return A list containing the path definition.
#'
#' @keywords internal
pathify <- function(ind, p, cyclic = FALSE) {
    if (cyclic)
        p  <- c(ind, rep(p, each = 2), ind)
    else
        p  <- c(p[1], rep(p[-1], each = 2), ind)

    p2 <- split(p, ceiling(seq_along(p) / 2))
    path_lst <- lapply(p2, function(i) paste(as.character(i), collapse = "-"))
    attr(path_lst, "startDS") <- p[1]
    path_lst
}


#' Generate all/some paths between points
#'
#' @param ind [1,2] int, The starting and ending point c(start, end).
#' @param n [1,1] int, Number of points in the whole set.
#' @param cyclic boolean, Should the path be cyclic (1-2-1) or noncyclic (2-1).
#' @param sample_paths boolean, If FALSE, all possible paths are generated.
#'       If true one path per ending point is selected.
#'
#' @return A list of lists containing the paths.
#'
#' @export
generate_paths <- function(ind, n, cyclic = FALSE, sample_paths = FALSE) {
    if (cyclic){
        paths <- generate_paths_cyclic(ind, n)
    } else {
        paths <- generate_paths_noncyclic(ind, n, sample_paths = sample_paths)
    }
}


#' Generate cyclic paths
#'
#'@description
#' From a set of n numbers, generate all possible paths starting from
#' and ending on a given number.
#'
#' @param ind [1,1] ind, The starting dataset (equals to ending dataset because of cycle).
#' @param n [1,1] ind, The number of datasets.
#'
#' @return A list of lists containing the paths.
#'
#' @importFrom combinat permn
#'
#' @export
generate_paths_cyclic <- function(ind, n) {
    if (n == 2) {
        if (ind == 1)
            res <- list(list("1-2", "2-1"))
            attr(res[[1]], "startDS") <- 1
        if (ind == 2)
            res <- list(list("2-1", "1-2"))
            attr(res[[1]], "startDS") <- 2
    }

    if (n > 2) {
        plist <- combinat::permn(setdiff(seq.int(n), ind))
        res   <- lapply(plist, function(i) pathify(ind, i, cyclic = TRUE))
    }

    res
}


#' Generate non-cyclic paths
#'
#' @description 
#' From a set of n numbers, generate all possible paths starting from
#' and ending on a given number.
#'
#' @param ind The starting dataset
#' @param n The number of datasets.
#' @param sample_paths boolean, If FALSE, all possible paths are generated.
#'       If TRUE one path per ending point is selected.
#'
#' @return A list of lists containing the paths.
#'
#' @importFrom combinat permn
#'
#' @export
generate_paths_noncyclic <- function(ind, n, sample_paths = FALSE) {
    if (n == 2) {
        if (ind == 1){
            res <- list(list("2-1"))
            attr(res[[1]], "startDS") <- 2
        }
        if (ind == 2){
            res <- list(list("1-2"))
            attr(res[[1]], "startDS") <- 1
        }
    }

    if (n > 2) {
      if (sample_paths & (n>3)){
        ## pick one path for each possible starting index
        plist <- lapply(setdiff(seq.int(n),ind), function(i) c(i, sample(setdiff(seq.int(n), c(i, ind)))))
        ## Note: sample(3) does not sample from vector c(3) but from 1:3!
        ## This is why sample_paths = TRUE has effect only for data collections
        ## with K > 3.
      } else {
        ## all paths for all possible starting indices
        plist <- combinat::permn(setdiff(seq.int(n), ind))  
      }
      res <- lapply(plist, function(i) pathify(ind, i, cyclic = FALSE))
    }
    res
}


#' Generate a mapping function between two datasets
#' 
#' @description 
#' Generate a mapping function between two datasets,
#' using some method, such as linear regression (lm),
#' or some classifier such as a random forest (randomForest).
#' 
#' Wraps the function as well as data into a single object.
#'
#' @param method A function to be used in the mapping. A function object.
#'
#' @return Returns a function object that does the mapping between two datasets i.e.
#' from dataset 1 to dataset 2.
#'
#' @export
generate_mapping_function <- function(method = lm) {
    function(data1, data2) {
        f_list <- lapply(seq.int(ncol(data2)),
                         function(i) method(data2[,i] ~ 0 + . , data = data1))
        function(x) {
            tmp           <- lapply(f_list, function(i) predict(i, newdata = x))
            tmp           <- data.frame(do.call(cbind, tmp))
            colnames(tmp) <- colnames(data2)
            tmp
        }
    }
}


## Create mapping functions:

#' Mapping stats::lm
#' 
#' @param data1 data.frame, Dataset 1, the independent variables
#' @param data2 data.frame, Dataset 2, the dependent variables
#' 
#' @export
mapping_lm  <- generate_mapping_function(method = stats::lm)

#' Mapping MASS::rlm
#' 
#' @param data1 data.frame, Dataset 1, the independent variables
#' @param data2 data.frame, Dataset 2, the dependent variables
#' 
#' @importFrom MASS rlm
#' @export
mapping_rlm  <- generate_mapping_function(method = MASS::rlm)

#' Mapping randomForest
#' 
#' @param data1 data.frame, Dataset 1, the independent variables
#' @param data2 data.frame, Dataset 2, the dependent variables
#' 
#' @export
mapping_rf  <- generate_mapping_function(method = randomForest::randomForest)

#' Mapping svm
#' 
#' @param data1 data.frame, Dataset 1, the independent variables
#' @param data2 data.frame, Dataset 2, the dependent variables
#' 
#' @importFrom e1071 svm
#' @export
mapping_svm <- generate_mapping_function(method = e1071::svm)

#' SVM using sigmoid kernel
#' 
#' @param ... Further arguments passed on to e1071::svm()
#' 
#' @importFrom e1071 svm
#' @export
svm_sigmoid <- function(...) {
  e1071::svm(..., kernel = "sigmoid", coef0 = 0)
}


#' Mapping svm using sigmoid
#' 
#' @param data1 data.frame, Dataset 1, the independent variables
#' @param data2 data.frame, Dataset 2, the dependent variables
#' 
#' @export
mapping_svm_sigmoid <- generate_mapping_function(method = svm_sigmoid)

#' Define a mapping function using MASS::lm.ridge
#' 
#' @param data1 data.frame, Dataset 1, the independent variables
#' @param data2 data.frame, Dataset 2, the dependent variables
#' 
#' @return Returns a function object that does the mapping between two datasets.
#' 
#' @importFrom MASS lm.ridge
#' @export
mapping_lmridge <- function(data1, data2) {
  f_list <- lapply(seq.int(ncol(data2)), function(i) MASS::lm.ridge(data2[,i] ~ . , data = data1))
  
  function(x) {
    tmp           <- lapply(f_list, function(model){
      matrix( scale(as.matrix(x), center = F, scale = model$scales) %*% matrix(model$coef) + model$ym
        , nrow=nrow(data2))} )
    tmp           <- data.frame(do.call(cbind, tmp))
    colnames(tmp) <- colnames(data2)
    tmp
  }
}



#' Define a mapping function using pls::pcr
#' 
#' @param data1 data.frame, Dataset 1, the independent variables
#' @param data2 data.frame, Dataset 2, the dependent variables
#' 
#' @return Returns a function object that does the mapping between two datasets.
#' 
#' @importFrom pls pcr
#' @export
mapping_pcr <- function(data1, data2) {
  f_list <- lapply(seq.int(ncol(data2)), function(i) pls::pcr(data2[,i] ~ . , data = data1))
  
  function(x) {
    tmp           <- lapply(f_list, function(model){
                matrix(predict(model, x)[,1,2], nrow=nrow(data2))} )
    tmp           <- data.frame(do.call(cbind, tmp))
    colnames(tmp) <- colnames(data2)
    tmp
  }
}


#' Define a mapping function using glmnet::glmnet
#'
#' @param data1 data.frame, Dataset 1, the independent variables
#' @param data2 data.frame, Dataset 2, the dependent variables
#' 
#' @return Returns a function object that does the mapping between two datasets.
#' 
#' @importFrom glmnet glmnet predict.glmnet
#' @export
mapping_glmnet <- function(data1, data2) {
  f_list <- lapply(seq.int(ncol(data2)), function(i) glmnet::glmnet(as.matrix(data1), data2[,i]) )
  
  function(x) {
    tmp           <- lapply(f_list, function(i) predict.glmnet(i, newx = as.matrix(x), s=49))
    tmp           <- data.frame(do.call(cbind, tmp))
    colnames(tmp) <- colnames(data2)
    tmp
  }
}


#' Generate all possible pairwise mappings between the given multivariate
#' datasets.
#' 
#' @description 
#' The following naming convention is used in the output:
#' '1-2' means '1' mapped to '2', i.e.,, '2' explained by '1'.
#'
#' @param data A list of data.frames (the datasets)
#' @param mapping_function (optional) Default is \code{mapping_lm}.
#'
#' @return A named list containing the pairwise mapping functions.
#'
#' @export
create_mappings <- function(data, mapping_function = mapping_lm) {
    n   <- length(data)
    out <- vector(mode = "list", length = ((n * n) - n))
    ind <- 1

    for (i in seq.int(n)) {
        for (j in seq.int(n)) {
            if (i != j) {
                out[[ind]]      <-  mapping_function(data[[i]], data[[j]])
                names(out)[ind] <- paste(i, j, sep = "-")
                ind             <- ind + 1
            }
        }
    }

    out
}


#' Calculate the composition formed by applying all functions
#' in the given path to a dataset.
#'
#' @param x A data frame or vector
#' @param path A list describing the path.
#' @param mappings A list containing the mapping functions described in the path.
#'                 Usually a list of all M*M-M available mappings between the M
#'                 data sets.
#'
#' @return A vector containing the result of the composition.
#'
#' @export
compose <- function(x, path, mappings) {
    for (i in path){
      #str(x)
      x <- mappings[[i]](x)
    }
        
    x
}


#' Helper function to get the starting dataset based on
#' a path.
#'
#' @param p A path.
#'
#' @return A number indicating the starting dataset
get_starting_dataset <- function(p) {
    as.numeric(unlist(strsplit(p[[1]], "-"))[1])
}


#' Calculate the average of the composition formed by applying all functions
#' in all possible paths to a dataset.
#'
#' @param x A list of data frames.
#' @param paths A list of list with paths.
#' @param mappings A list containing the mapping functions described in the path.
#'
#' @return A vector containing the average composition from all the paths.
#'
#' @importFrom abind abind
#'
#' @export
compose_all <- function(x, paths, mappings) {
    tmp <- vector(mode = "list", length = length(paths))
    for (i in seq.int(length(paths))){
        #cat(sprintf("\n\ncompose_all, round:%d",i))
        tmp[[i]] <- compose(x[[get_starting_dataset(paths[[i]])]], paths[[i]], mappings)
    }

    tmp <- lapply(tmp, as.matrix)
    tmp <- do.call(abind::abind, c(tmp, along = 3)) #catenated along the third dimension
    as.data.frame(apply(tmp, c(1, 2), mean)) # average out 3rd dimension
}


#' The Common Components by Regression (CoCoReg) algorithm
#' 
#' @description 
#' An algorithm that extracts common variation between datasets using regression.
#'
#' @param data   [1,K] list of data.frames.
#' @param cyclic boolean, Operation mode: cyclic or non-cyclic
#' @param mapping_function function, The function to use in mappings.
#'                          See mapping_lm() for an example.
#' @param sample_paths boolean, If FALSE all paths are computed. If TRUE a
#'        subset of paths is taken: one (random) path for each starting point.
#'        Currently implemented only for cyclic=F.
#' @param center_data boolean, Should the data be centered?
#' @param scale_data boolean, Should the data be scaled?
#'
#'
#' @return A list with elements:
#' \item{$data:}{[1,K] list of data.frames containing the joint information,
#'                organised identically to the input data.}
#' \item{$mappings:}{[1,K*K-K] list of functions, mappings between datasets}
#' \item{$paths:}{[(K-1)(K-2)!, K] list of lists, paths for each data set}
#' \item{$cyclic:}{input cyclic as is}
#' \item{$sample_paths:}{boolean, TRUE if paths have been sampled, FALSE otherwise.}
#' \item{$dataid:}{string, Dataset identifier string}
#' \item{$method:}{string, Analysis method identifier string}
#' \item{$wall_time_taken:}{[1,1] double, Time taken to run the analysis in seconds}
#'
#' @examples
#' dc <- create_syn_data_toy()
#' ccr <- cocoreg(dc$data)
#' 
#' ggplot_dflst(dc$data, ncol=1)
#' ggplot_dflst(ccr$data, ncol=1)
#' 
#' \dontrun{
#' ggplot_dclst(list(orig = dc$data, ccr = ccr$data)) 
#' ggplot_dclst(list(orig = dc$data, shared = ccr$data), legendMode = 'none')
#' ggplot_dclst(list(orig = dc$data, ccr = ccr$data), legendMode = 'all')
#' }
#'
#' @export
cocoreg <- function(data, cyclic = FALSE, mapping_function = mapping_lm,
                    sample_paths = FALSE, center_data = T, scale_data = F) {
  start_time <- Sys.time()
  
  ## Validate data and convert to internal format
  if ("id" %in% names(attributes(data)) ){
    dataid <- attr(data, "id")
  } else {
    dataid <- "NA"
  }
  data  <- validate_data(data)
  dmeta <- get_dc_meta(data)
  data  <- rename_variables(data)
  if (center_data | scale_data){
    data  <- lapply(data, function(el){df_scale(el, center=center_data, scale=scale_data)})  
  }
  
  ## Compute cocoreg
  mappings <- create_mappings(data, mapping_function = mapping_function)
  ccr_data <- lapply(seq.int(length(data)), function(i) compose_all(data, generate_paths(i, n = length(data), cyclic = cyclic, sample_paths=sample_paths), mappings))
  paths <- sapply(seq.int(length(data)), function(i)  generate_paths(i, n = length(data), cyclic = cyclic, sample_paths=sample_paths))
  #TODO: test if the use of vapply() would not leave out the dim attribute?
  if (length(data)==2){
    dim(paths) <- c(1, 2) #prevent from becoming dimensionless  
  }
  
  ## Convert back to original data format
  ccr_data <- apply_dc_meta(ccr_data, dmeta) 
  
  ## Return
  res_lst <- list(
    "data"         = ccr_data,
    "mappings"     = mappings,
    "paths"        = paths,
    "cyclic"       = cyclic,
    "sample_paths" = sample_paths,
    "dataid"       = dataid,
    "method"       = 'ccr')
  
  res_lst        <- c(res_lst, list("wall_time_taken" = as.numeric(difftime(Sys.time(), start_time, units = "sec"))))
  class(res_lst) <- "cocoreg" ## Needed for effective recursive traversing of results in a nested list
  res_lst
}




## -----------------------------------------------------------------------------
## Helper functions
## -----------------------------------------------------------------------------


#' Computes the R^2 (variance explained) between two lists of data.frames
#' 
#' @param df_orig_lst List of original data.frames
#' @param df_est_lst List of estimated data.frames
#' 
#' @return Returns a data.frame with R2 values, one value for each data set and variable. Molten/long format.
#' 
#' @importFrom reshape melt melt.data.frame
#' 
#' @export

## 
average_R2_dflst <- function(df_orig_lst, df_est_lst){
  df_var_explained <- function(df1, df2){
    as.data.frame(mapply(function(x,y){var_explained(x,y)$R2},
                         df1, df2, SIMPLIFY = F))
  }
  tmp <- mapply(df_var_explained, df_orig_lst, df_est_lst, SIMPLIFY = F)
  tmp <- reshape::melt(tmp)
  tmp$dsvar <- tmp$variable
  tmp$variable <- "R2"
  tmp$ds <- as.factor(tmp$L1)
  tmp$L1 <- NULL
  tmp
}


#' Validate a data collection for use with cocoreg
#' 
#' @description 
#' Check that the data collection has all the required properties.
#' 
#' @param df_list list of data.frames, The data collection to validate
#' 
#' @return A list of data.frames that conform to the requirements
#' 
#' @export
validate_data <- function(df_list){  
  
  ## Add dataset names if missing
  if (is.null(names(df_list))){
    warning('validate_data: datasets do not have names, creating names...')
    names(df_list) <- sprintf("DS%d", seq.int(length(df_list)))
  }
  
  ## Make sure the datasets are data.frames
  df_match <- vapply(df_list, is.data.frame, logical(1))
  if (!all(df_match)){
    warning('validate_data: some datasets are not data.frames, converting...')
    df_list <- lapply(df_list, as.data.frame)
  }
  
  #TODO: Maybe check that variable names follow the COCOREG convention and rename if necessary
  ## Rename variables
  #if (rename_vars){
  #  df_list <- rename_variables(df_list)
  #}
  
  df_list
}


#' Rename variables of a data collection
#' 
#' @description 
#' Rename variable in all datasets such that the data.frame list conforms to the 
#' requirements of CoCoReg.
#'
#' @param df_list  list of data.frames, The datasets to process
#'
#' @return A list of data.frames with changed variable names. Original dimension names are stored
#'          as an attribute.
#'
#' @export
rename_variables <- function(df_list){
  for (i in 1:length(df_list)){
    attr(df_list[[i]],'orig_dimnames') <- dimnames(df_list[[i]])
    ## CoCoReg dimension names
    tmp_colnames <- paste(sprintf("x%d",i), as.character(seq.int(ncol(df_list[[i]]))), sep="_")
    dimnames(df_list[[i]]) <- list((1:nrow(df_list[[i]])), tmp_colnames)
  }
  df_list
}


#' Extract important properties of data collection
#'
#' @param df_list  list of data.frames, The data collection to process
#' @param type string, If 'current' then data collection metadata is collected from the
#'        data collection itself. If 'original' then metadata is collected from special attributes.
#'
#' @return A list with elements:
#' \item{$dcnames:}{Dataset names}
#' \item{$dcdimnames:}{A list of dataset dimension names for each dataset}
#'
#' @export
get_dc_meta <- function(df_list, type = 'current'){
  
  if (type == 'current'){
    meta <- list( dcnames = names(df_list),
                  dcdimnames = lapply(df_list, dimnames) )
  } else if (type == 'original'){
    meta <- list( dcnames = names(df_list),
                  dcdimnames = lapply(df_list, function(el){attr(el, 'orig_dimnames')}) )
  } else {stop()}
  meta
}


#' Apply extracted properties of a data collection to a data collection (restore)
#' 
#' @param df_list  list of data.frames, The data collection to process
#' @param meta, list, Output of get_dc_meta()
#' 
#' @return A list of data.frames, the data collection with updated metadata
#' 
#' @export
apply_dc_meta <- function(df_list, meta){
  names(df_list) <- meta$dcnames
  for (ind in seq.int(length(df_list))){
    dimnames(df_list[[ind]]) <- meta$dcdimnames[[ind]]
  }
  df_list
}


#' Extract R2 values from a list of mappings using summary()
#'
#' @param mappings [M*M-M] list, Exhaustive list of mappings between the M datasets
#' @param n_datasets integer, Number of datasets i.e. M
#' @param aggfun function, A function to apply when aggregating R2 values over variables in
#'                a multivariate dataset
#'
#' @return A [M,M] matrix of R2 values stored such that the R2 value for mapping
#'          a->b is read from row a and column b.
#'
#' @examples
#' \dontrun{
#' ccr <- cocoreg(data_collection)
#' R2mat <- mappings_R2_matrix(ccr$mappings, length(ccr$data))
#' }
#'
#' @export
mappings_R2_matrix <- function(mappings, n_datasets, aggfun = mean){

    ## Compute values
    mappings_summaries <- lapply(mappings, function(el){summary(el)})

    ## Extract and average R2 values
    if ("r.squared" %in% names(summary(mappings[[1]]))) {
        ## univariate datasets (?)
        mappings_R2 <- lapply(mappings_summaries, function(el){el$r.squared})
    } else {
        ## multivariate datasets
        sbf_r2_fun <- function(summ) {
            aggfun(sapply(summ, function(el){el$r.squared}, simplify=T))
        }
        mappings_R2 <- lapply(mappings_summaries, sbf_r2_fun)
    }

    ## Create respective R2 matrix
    R2mat <- matrix(NA, nrow=n_datasets, ncol=n_datasets)
    for (i in 1:length(mappings_R2)) {
        i_inds <- as.integer(strsplit(names(mappings_R2)[[i]],"-")[[1]])
        R2mat[i_inds[1], i_inds[2]] <- mappings_R2[[i]]
    }

    R2mat
}


#' Compute D_joint for dataset i separately for all paths
#' Can be used to study path variability
#'
#' @param data_orig list of data.frames, Original data collection
#' @param ccr list, output of cocoreg(data_orig)
#' @param ds_ind integer, index of the dataset to process
#'
#' @return A list of data.frames, D_joint corresponding to each path that ends
#'         at 'ds_ind' i.e. paths defined by ccr$paths[,ds_ind].
#'         Dimensions of the matrices are the same as for the data.frames
#'         in data_orig.
#'
#' @examples
#' \dontrun{
#' ccr <- cocoreg(data_list)
#' jibp <- cocoreg_by_path(ccr, 1)
#' }
#' @export
cocoreg_by_path <- function(data_orig, ccr, ds_ind){
    data_orig <- validate_data(data_orig)
    data_orig <- rename_variables(data_orig)
    
    ##paths <- generate_paths(ds_ind, n=length(ccr$data), cyclic=ccr$cyclic)
    paths <- ccr$paths[,ds_ind]

    ## Compute path joint informations
    tmp <- vector(mode = "list", length = length(paths))
    for (i in seq.int(length(paths))){
        tmp[[i]] <- compose(data_orig[[ attr(paths[[i]],'startDS') ]],
                            paths[[i]], ccr$mappings)
    }
    names(tmp) <- vapply(paths, function(x){attr(x,'startDS')}, numeric(1))
    tmp
}


#' Compute "variance" of the vectors var()
#' 
#' @param vec_lst Data to process as a list of numeric vectors
#' @param mean_vec (optional) Desired mean vector as a numeric vector
#' 
#' @return Variance of data values around mean as a numeric matrix
#' 
#' @importFrom abind abind
#' @export
vector_variability <- function(vec_lst,
                               mean_vec = apply(abind::abind(vec_lst, along = 2), 1, mean) ){
    ## Variability around mean
    diff_mat <- abind::abind(lapply(vec_lst, function(x){x-mean_vec}), along=2)
    apply(diff_mat, 2, var)
}


#' Compute "variance" of the matrices using Frobenius norm.
#' Variance is by default computed with respect to the mean of the matrices.
#'
#' @param mat_lst A [1,M] list of [I,J] matrices from which variability should
#'        be computed
#' @param mean_mat A [I,J] matrix describing the mean observation for mat_lst.
#'
#' @return A list with elements
#' \item{fbnorm}{Frobenius norm values for each of the matrices}
#' \item{diff}{[I,J,M] matrix of differences mat_lst-mean_mat}
#'
#' @importFrom abind abind
#' @export
matrix_variability <- function(mat_lst,
                               mean_mat = apply(abind::abind(mat_lst, along = 3), c(1,2), mean) ){
    ## Variability around mean
    diff_mat <- abind::abind(lapply(mat_lst, function(x){x-mean_mat}), along=3)
    list(fbnorm=apply(diff_mat, 3, norm, type="F"),
         diff=diff_mat)
}


#' Determine the variability of matrices under row shuffling
#'
#' @param mat_lst A list of matrices
#' @param B integer, Number of times to sample (shuffle)
#'
#' @return [B,K] matrix, Frobenius norm vectors corresponding to row shuffling
#'
#' @export
row_suffle_variability <- function(mat_lst, B=50){
    nd_var = matrix(NA, nrow=B, ncol=length(mat_lst))
    for (i in 1:B){
        ## suffle
        mat_lst_suff <- lapply(mat_lst, function(x){apply(x,2,sample,size=nrow(x))})
        ## compute variability
        nd_var[i,] <- matrix_variability(mat_lst_suff)$fbnorm
    }
    nd_var
}


#' Compute ds_variability for all datasets in a data collection
#' 
#' @param data Unprocessed dataset as a list of data.frames
#' @param ccr Processed dataset as a list of data.frames, output of cocoreg()$data
#' 
#' @return Path variability as a data.frame
#' 
#' @export
dc_variability <- function(data, ccr){
  data <- validate_data(data)
  data <- rename_variables(data)
  x <- lapply(seq.int(length(data)),
              function(i){ds_variability(data, ccr, i)})
  names(x) <- names(data)
  x_df <- dflst2df(x, id_var_name="ds")
}


#' Compute variability_components for a dataset
#'
#' Note: might not work for cyclic ccr <TODO>
#' 
#' @param ds_ind Starting dataset of the set of paths to analyze as [1,1] integer
#' @inheritParams dc_variability
#' 
#' @return Path variability as a data.frame
#' 
#' @importFrom reshape melt melt.data.frame
ds_variability <- function(data, ccr, ds_ind){
    
    paths <- ccr$paths[,ds_ind]

    ## Compute path joint informations
    tmp <- vector(mode = "list", length = length(paths))
    for (i in seq.int(length(paths))){
        tmp[[i]] <- compose(data[[get_starting_dataset(paths[[i]])]],
                            paths[[i]], ccr$mappings)
    }
    names(tmp) <- vapply(paths, function(x){attr(x,'startDS')}, numeric(1))


    path_arr <- dflst2array(tmp, dim=3)
    myfun <- function(vec){
        variability_components(vec, grp=dimnames(path_arr)[[3]], fun=ss)
    }
    ss_arr <- apply(path_arr, c(1,2), myfun)
    ss_arr <- apply(ss_arr, c(1,3), mean) #average out time
    names(dimnames(ss_arr)) <- c("varcomp","variable")
    ss_df <- reshape::melt(ss_arr)
    ss_df
}
Any scripts or data that you put into this service are public.
cocoreg documentation built on May 2, 2019, 6:48 a.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
cocoreg
Extract Shared Variation in Collections of Data Sets Using Regression Models

R/cocoreg.R
In cocoreg: Extract Shared Variation in Collections of Data Sets Using Regression Models

Try the cocoreg package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

cocoreg Extract Shared Variation in Collections of Data Sets Using Regression Models

R/cocoreg.R In cocoreg: Extract Shared Variation in Collections of Data Sets Using Regression Models

Try the cocoreg package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

cocoreg
Extract Shared Variation in Collections of Data Sets Using Regression Models

R/cocoreg.R
In cocoreg: Extract Shared Variation in Collections of Data Sets Using Regression Models