#Copyright (c) 2016 Jussi Korpela (Finnish Institute of Occupational Healt, jussi.korpela@ttl.fi) and Andreas Henelius (Finnish Institute of Occupational Healt, andreas.henelius@iki.fi)
#Permission is hereby granted, free of charge, to any person obtaining a copy
#of this software and associated documentation files (the "Software"), to deal
#in the Software without restriction, including without limitation the rights
#to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
#copies of the Software, and to permit persons to whom the Software is
#furnished to do so, subject to the following conditions:
#The above copyright notice and this permission notice shall be included in
#all copies or substantial portions of the Software.
#THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
#IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
#FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
#AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
#LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
#OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
#THE SOFTWARE.
## -----------------------------------------------------------------------------
## This file contains common tools such as data creation and plotting
## -----------------------------------------------------------------------------
## -----------------------------------------------------------------------------
## Manipulation of lists of data.frames
## -----------------------------------------------------------------------------
# Manipulations that are independet for data.frames in the list can be done
# using lapply(). The more complicated ones are stored here.
#' Remove rows with NA values from a list of data.frames
#'
#' @description
#' Same rows removed from all data frames in the list.
#'
#' @param df_lst [1,m] list of data.frames, A list of data.frames to process
#'
#' @return [1,m] list of data.frames, Data without rows that contain NA,
#' same rows removed from all data.frames in the input list.
#'
#' @export
dl_remove_NA <- function(df_lst){
df <- abind(df_lst, along=2) ## collect all dfs in one giant one
is.complete <- complete.cases(df)
df_lst <- lapply(df_lst, function(el){el[is.complete,]})
df_lst
}
#' Catenate a list of data.frames vertically to a single data.frame
#'
#' @description
#' Assumes equal variables for all datasets!
#' Output has columns: <variables>, "dataset"
#' Preserves list element names in column "dataset".
#' For a more generic approach see dflst2dfmelt (uses reshape::melt(df_lst))
#'
#' @param df_lst [1,m] list of data.frames, A list of data.frames to process
#' @param id_var_name string, Column name for the dataset id variable
#'
#' @return A data.frame with elements of df_lst catenated vertically, An extra column with dataset id
#' is added.
#'
#' @export
dflst2df <- function(df_lst, id_var_name="dataset"){
if (is.null(names(df_lst))){
warning('dflst2df: df_lst entries not named. Making them up...')
names(df_lst) <- sprintf('dataset%d', 1:length(df_lst))
}
df_lst <- mapply(function(x,y){x[,id_var_name] <- as.factor(y)
#x[,id_var_name] <- as.factor(x[,id_var_name])
x}, df_lst, names(df_lst),
SIMPLIFY=F, USE.NAMES=F)
ggdf <- do.call(rbind, df_lst)
}
#' Combine a list of data.frames to a single molten data.frame
#'
#' @description
#' Output maximally "molten" with columns "dataset", "obs", "variable", "value"
#' Preserves list element names in column "dataset".
#'
#' @param df_lst [1,m] list of data.frames, A list of data.frames to process
#'
#' @return A data.frame with elements of df_lst combined using reshape::melt().
#'
#' Extra columns:
#'
#' \describe{
#' \item{dataset}{dataset name}
#' \item{obs}{running observation index (time)}
#' }
#'
#' @importFrom reshape melt.list melt.data.frame
#'
#' @export
dflst2dfmelt <- function(df_lst){
#df <- lapply(df_lst, reshape::melt.data.frame, id.vars = NULL)
#df <- reshape::melt.list(df, id.vars = c('variable','value'))
df <- reshape::melt(df_lst) #won't work with importFrom
names(df)[names(df) %in% "L1"] <- "dataset"
#names(df)[names(df) %in% "L2"] <- "datacollection"
df$obs <- rep(1:nrow(df_lst[[1]]), length(df_lst))
df
}
#' Catenate a list of data.frames to a matrix along dim
#'
#' @param df_lst [1,m] list of data.frames, A list of data.frames to process
#' @param dim [1,1] int, Dimension to apply over
#'
#' @return A matrix with elements of df_lst converted to matrix and catenated along dim.
#'
#' @importFrom abind abind
#' @export
dflst2array <- function(df_lst, dim=2){
df_lst <- dflst_dsnames2varnames(df_lst)
tmp <- lapply(df_lst, as.matrix)
tmp <- do.call(abind::abind, c(tmp, along = dim)) #catenated along dimension dim
}
#' Append dataset names to variable names of the respective dataset
#'
#' @param dflst [1,m] list of data.frames, A list of data.frames to process
#' @param sep string, Separator to use
#'
#' @return A [1,m] list of data.frames with modified variable names.
#'
#' @export
dflst_dsnames2varnames <- function(dflst, sep = "_"){
for (i in seq.int(length(dflst))){
names(dflst[[i]]) <- paste(names(dflst)[i], names(dflst[[i]]), sep=sep)
}
dflst
}
#' Catenate a set of data collections (lists of data.frames) into a single molted data.frame.
#'
#' @description
#' Can be used e.g. to prepare data for plotting with ggplot().
#'
#' @param ... Several lists of data.frames to catenate
#' @param id.vars [1,m] string, ID variables for reshape::melt
#'
#' @return A data.frame with elements of ... melted and catenated vertically into a single data.frame.
#'
#' Extra columns created:
#' \item{ds:}{dataset id within data collection}
#' \item{dc:}{data collection id}
#'
#' @examples
#' df_lst <- list(df1 = iris[,2:3], df2 = iris[2:3])
#' data_collections2ggdf(dc1 = df_lst, dc2 = df_lst)
#'
#' @importFrom reshape melt melt.data.frame
#'
#' @export
data_collections2ggdf <- function(..., id.vars=NULL){
input_lst <- list(...)
## melt all data.frames
if (!is.null(id.vars)){
input_lst <- lapply(input_lst, function(d){
lapply(d,function(d2){reshape::melt(d2, id.vars=id.vars)}) })
} else {
input_lst <- lapply(input_lst, function(d){ lapply(d,function(d2){reshape::melt(d2)}) })
}
## collapse data collections to data frames
input_lst <- lapply(input_lst, dflst2df, id_var_name="ds")
## collapse into data.frame
dflst2df(input_lst, id_var_name="dc")
}
#' Apply fun to the bottom level of a nested list structure
#'
#' @description
#' Used to batch process computation results that are stored into a nested list structure.
#' Analysis results are stored as lists but with class attribute changed. This signals that the
#' recursion into the list structure should end and fun should be applied instead. Can be used e.g.
#' to pick out results from a complex list structure.
#'
#' @param lst nested list, A nested list structure to process
#' @param fun function object, The function to apply at the bottom level
#' @param exclude_names string array, Names of list elements to skip at any level
#' @param ... Further parameters passed on to fun
#'
#' @return A list outputs generated when applying fun to the bottom level of input lst.
#' Bottom level is considered reached when something other than class == 'list' is
#' encountered.
#'
#' @export
traverse_nested_list <- function(lst, fun, exclude_names = NULL, ...){
if (class(lst)=="list"){
include_match <- !(names(lst) %in% exclude_names)
lapply(lst[include_match],
function(el){traverse_nested_list(el, fun=fun, exclude_names=exclude_names, ...)})
} else {
# The hack comes in here: results of analyses are lists but their class attribute has been
# changed to something else. Hence they end up here.
fun(lst, ...)
}
}
#' Apply PCA to the data after catenating data.frames horizontally
#'
#' @param df_lst [1,m] list of data.frames, A list of data.frames to process
#' @param center boolean, TRUE -> center data, FALSE -> do nothing
#' @param scale boolean, TRUE -> scale data, FALSE -> do nothing
#'
#' @return A list with elements:
#' \item{pcdf:}{data.frame, PCA components (prcomp()$x)}
#' \item{model:}{list, Output of prcomp()}
#'
#' @import stats
#' @export
dflst_pca <- function(df_lst, center = F, scale = F){
d <- dflst2array(df_lst, dim = 2)
fit <- stats::prcomp(d, center = center, scale = scale)
pcdf <- as.data.frame(fit$x)
list(pcdf = pcdf, model = fit)
}
#' Add a data.frame (dataset) to a list of data.frames (datasets)
#'
#' @param dflst [1,m] list of data.frames, A list of data.frames
#' @param df data.frame, Data frame to add
#' @param dsname string, Dataset name for the data.frame to add
#'
#' @return A list of data.frames, A new list of data.frames with one new dataset in the end
#'
#' @export
dflst_add_ds <- function(dflst, df, dsname){
df <- list(df)
names(df) <- dsname
dflst <- c(dflst, df)
dflst
}
## -----------------------------------------------------------------------------
## Manipulation of data.frames
## -----------------------------------------------------------------------------
#' Apply scale on a numeric data.frame
#'
#' @param df data.frame, A numeric data.frame to process
#' @param ... arguments to scale()
#'
#' @return data.frame, A scaled data.frame with attributes preserved
#'
#' @export
df_scale <- function(df,...){
att <- attributes(df)
df <- as.data.frame(scale(df,...))
attributes(df) <- att
df
}
#' Scales variables in data.frame dfx using ordinary least squares such
#'
#' @description
#' Scales variables in data.frame dfx using ordinary least squares such that the scaled
#' result explains as much of the variance in dfy as possible.
#' Scaling is done separately for each variable (i.e. no linear mixing of variables).
#' Assumes data.frames dfx and dfy to be of identical structure.
#' Intended use: to scale up cocoreg projections to account for the lost variance.
#'
#' @param dfx data.frame, Data frame to use as independent variable
#' @param dfy data.frame, Data frames to use as dependent variable
#'
#' @return data.frame, A rescaled version of dfx with dimnames from dfy.
#'
#' @examples
#' \dontrun{
#' dc <- create_syn_data_toy()
#' ccr <- cocoreg(dc$data)
#' dfLst <- mapply(df_scale_ols, ccr$data, dc$data , SIMPLIFY=F)
#' }
#'
#' @export
df_scale_ols <- function(dfx, dfy){
dfy_est <- as.data.frame(mapply(
function(x1,x2){
mapping_lm(as.data.frame(x1),as.data.frame(x2))(as.data.frame(x1))
}, dfx, dfy))
dimnames(dfy_est) <- dimnames(dfy)
dfy_est
}
## -----------------------------------------------------------------------------
## Statistics
## -----------------------------------------------------------------------------
#' Compute Euclidean norm of vector
#'
#' @description
#' Convenience function for use with e.g. lapply
#'
#' @param x [1,m] double, A vector of data
#'
#' @return [1,1] double, Euclidean norm of x
#'
#' @export
vecnorm <- function(x){
base::norm(as.matrix(x),'F')
}
#' Make vector of unit norm
#'
#' @param x [1,m] double, A vector of data
#'
#' @return [1,m] double, Same vector normalized to unit Euclidean norm
#'
#' @export
to_unit_vec <- function(x){
x / vecnorm(x)
}
#' Compute RMSE between vectors v1 and v2
#'
#' @param v1 [1,m] numeric, First data vector
#' @param v2 [1,m] numeric, Second data vector
#' @param relative boolean, If TRUE, relate the rmse value to the rmse of v1.
#' If FALSE, just compute RMSE between v1 and v2
#'
#' @return [1,1] double, RMSE value
#'
#' @export
rmse <- function(v1, v2, relative=F){
if (relative){
rmse(v1,v2)/rmse(v1, rep(0,length(v1)) )
} else {
sqrt( mean((v1-v2)^2, na.rm = TRUE) )
}
}
#' Compute RMSE between data.matrices dm1 and dm2
#'
#' @description
#'
#' A data.matrix has observations as rows and variables as columns
#'
#' @param dm1 [N,M] numeric, First data.matrix
#' @param dm2 [N,M] numeric, Second data.matrix
#'
#' @return [1,M] numeric, A vector of RMSE values, one per variable.
#'
#' @examples
#' \dontrun{
#' dm1 <- matrix(rep(1,6),nrow=2)
#' dm2 <- matrix(rep(3,6),nrow=2)
#' data_matrix_rmse(dm1, dm2)
#'
#' first = list(dm1, dm1)
#' second = list(dm2, dm2)
#' (tmp = mapply(data_matrix_rmse, first, second, SIMPLIFY=FALSE))
#' }
#'
#' @export
data_matrix_rmse <- function(dm1, dm2){
## average the rows of data.matrix, columns (variables) remain
sqrt( apply((dm1-dm2)^2, 2, mean , na.rm = TRUE) )
}
#' Standard error of mean
#'
#' @param x [1,M] numeric, data vector
#' @param na.rm procedure for NA's, passed on to sd(), default: na.rm = T
#'
#' @return [1,1] numeric, standard error of mean
#'
#' @export
se <- function(x, na.rm = T){
sd(x, na.rm=na.rm)/sqrt(length(x))
}
#' Get [min, max] of a list of numeric objects
#'
#' @param dataList [1,m] list of numeric objects
#'
#' @return [1,2] double, [min, max] of the input
#'
#' @export
get_range_datalist <- function(dataList){
## TODO: rename to dl_range()
dl_max <- max(as.numeric( lapply(dataList, function(x){max(x)}) ))
dl_min <- min(as.numeric( lapply(dataList, function(x){min(x)}) ))
c(dl_min, dl_max)
}
#' A rotation matrix
#'
#' @param angle_deg [1,1] numeric, Angle in degrees
#'
#' @return [2,2] matrix, Rotation matrix for making angle_deg 2D rotation
#'
#' @export
rotation_matrix <- function(angle_deg){
angle_rad = (angle_deg/180)*pi
matrix(c(cos(angle_rad), -sin(angle_rad),
sin(angle_rad), cos(angle_rad)), ncol=2 ,byrow=T)
}
#' Sum-of-squares values showing what portion of variance in dvec is explained
#' by dvec_est
#'
#' @description
#'
#' Computation as in:
#' http://en.wikipedia.org/wiki/Fraction_of_variance_unexplained
#'
#' ss_est becomes zero if dvec_est equals dvec_0=rep(mean(dvec),length(dvec)).
#' If dvec_est is better estimate than dvec_0, R2 is positive.
#' If dvec_est is worse than dvec_0, R2 is negative.
#'
#' @param dvec [1,m] numeric, data vector
#' @param dvec_est [1,m] numeric, data vector, an estimate of dvec
#'
#' @return A list with elements:
#' \item{ss_tot:}{Sum of squares in dvec}
#' \item{ss_est:}{Sum of squares in dvec_est}
#' \item{ss_err:}{Sum of squares of dvec - dvec_est}
#' \item{R2:}{Percentage of variance explained i.e. 1 - ss_err/ss_tot}
#'
#' @export
var_explained <- function(dvec, dvec_est){
ss_tot <- ss(dvec - mean(dvec))
ss_est <- ss(dvec_est - mean(dvec))
ss_err <- ss(dvec - dvec_est)
R2 = 1 - ss_err/ss_tot
list(ss_tot=ss_tot, ss_est=ss_est, ss_err=ss_err, R2=R2)
}
#' Compute total, within group and between group variability using fun
#'
#' @description
#' The function used the definition: gvar = tvar - wgvar
#'
#' @param vec [1,M] numeric, Data vector
#' @param grp [1,M] integer/character vector, Some grouping of vec
#' @param fun function, Function to use when quantifying the variability
#'
#' @return A list with elements:
#' \item{tvar:}{Total variability}
#' \item{bgvar:}{Between groups variability, tvar - sum(wgvar_*)}
#' \item{wgvar_<groupname>:}{Within group variability for each group}
#' \item{wg_rel:}{sum(wgvar)/tvar}
#' \item{bg_rel:}{bgvar/tvar}
#'
#' @examples
#' vec <- rnorm(10)
#' grp <- rep(c("a","b","c"), c(3,3,4))
#' variability_components(vec, grp, ss)
#'
#' @export
variability_components <- function(vec, grp, fun){
grpmean_vec <- tapply(vec, grp, mean)
tvar <- fun(vec - mean(vec))
wgvar <- vapply(seq.int(length(grpmean_vec)),
function(i){ fun(vec[grp %in% names(grpmean_vec)[i]] - grpmean_vec[i]) },
numeric(1))
names(wgvar) <- names(grpmean_vec)
bgvar <- tvar - sum(wgvar) ## this term cannot be computed directly as
## there might be "interactions" present
res <- c(tvar , bgvar , wgvar , sum(wgvar)/tvar, bgvar/tvar)
names(res) <- c("tvar", "bgvar", paste0("wgvar_",names(grpmean_vec)), "wg_rel" , "bg_rel")
res
}
#' Sum of squares
#'
#' @param x [1,m] numeric, A data vector
#'
#' @return Sum of squares of x
#'
#' @export
ss <- function(x){sum(x^2)}
## -----------------------------------------------------------------------------
## Data generation
## -----------------------------------------------------------------------------
#' Make 2D gauss data (maybe obsolete)
#'
#' @param n [1,1] int, Number of observations
#' @param var [1,2] numeric, Variances
#' @param angle_deg [1,1] numeric, Rotation angle
#' @param scale boolean, Scale data? T -> scale, F -> do not scale
#' @param seed [1,1] int, Random seed
#'
#' @return Matrix of 2D gaussian data
#'
#' @export
make_data_gauss_2d <- function(n, var, angle_deg, scale=T, seed=42){
R <- rotation_matrix(angle_deg) #rotation matrix
L = diag(var) #variances
set.seed(seed)
S = matrix(rnorm(n*2), ncol=n)
d = t(R %*% L %*% S)
if (scale){
d = scale(d)
} else { d }
}
#' Add notch-like gaussian snippets to an existing signal x
#'
#' @param x [1,N] numeric, Original data
#' @param pos [1,m] integer, Positions to add notches to
#' @param sd [1,1] numeric, (optional) Desired width of the Gaussian notch
#' @param amplitude [1,1] numeric, (optional) Desired amplitude for the notches
#'
#' @return [1,N] numeric, Modified signal with notches
#'
#' @export
add_notches <- function(x, pos, sd=0.01*length(x), amplitude=1){
N <- length(x)
mid <- floor(N/2)
nv <- dnorm(-mid:(mid-1), sd=sd)
if (length(nv) != N) {
# can only be one shorter: for uneven N
nv <- c(nv,0)
}
nv <- (amplitude/max(nv))*nv #scale peak height
# Add notches
for (p in pos){
if (p > mid){
by = p - mid -1
} else {
by = -(mid-p+1)
}
x <- x + cshift(nv, by=by)
}
x
}
## -----------------------------------------------------------------------------
## General utilities
## -----------------------------------------------------------------------------
#' Circularly shift vector elements
#'
#' @param x [1,N] numeric, A vector
#' @param by [1,1] integer, How many positions to shift.
#' by > 0 -> shift to right
#' by = 0 -> no shift
#' by < 0 -> shift to left
#'
#' @return [1,N] numeric, Circularly shifted signal
#'
#' @export
cshift <- function(x, by){
N <- length(x)
if (by > 0){
# to right
c(x[(N-by+1):N], x[1:(N-by)])
} else if (by==0){ x }
else {
# to left
c(x[abs(by)+1:(N-abs(by))], x[1:(abs(by))])
}
}
#' Replicate matrix to create a larger one
#'
#' @description
#' From:
#' http://haky-functions.blogspot.fi/2006/11/repmat-function-matlab.html (accessed 27.3.2015)
#'
#' @param X A [I,J] matrix or J element vector, Matrix used as such, vector coerced to a row
#' matrix with dim(X)=[1,J].
#' @param m [1,1] integer, Replication count vertically
#' @param n [1,1] integer, Replication count horizontally
#'
#' @return [m*I,n*J] matrix, Replicated data
#'
#' @export
repmat = function(X, m, n){
##R equivalent of repmat (matlab)
if (class(X) != "matrix"){
X = matrix(X, ncol=length(X))
}
mx = dim(X)[1]
nx = dim(X)[2]
matrix(t(matrix(X,mx,nx*n)),mx*m,nx*n,byrow=T)
}
## -----------------------------------------------------------------------------
## Wrappers
## -----------------------------------------------------------------------------
#' Run BGFA by Klami et. al. using data format conventions of this repo
#'
#' @param df_list [1,m] list of data.frames, Input data to GFA in COCOREG format
#' @param K [1,1] int, The (maximum) number of components; should be high enough to capture all of the components. This can be recognized by at least a few of the components being shut down
#' @param Nrep [1,1] int, Number of random initialization used for learning the model
#'
#' @return A list, The output of CCAGFA::GFA()
#'
#' @import CCAGFA
#' @export
wrapper_BGFA <- function(df_list, K=8, Nrep=2){
dmat_list <- lapply(df_list, as.matrix)
opts <- CCAGFA::getDefaultOpts()
## These options are from the examples in CCAGFA
opts$iter.crit <- 1e-6 # Need to make this more strict if
opts$lbfgs.factr <- 1e5 # A bit less strict convergence criterion,
opts$iter.max <- 1e3
# having a large sample size
# should be enough for our simple dataset
opts$verbose <- 1 # Looks a bit nicer
opts$low.mem <- FALSE
#model <- CCAGFA::GFAexperiment(dmat_list, K, opts, Nrep=Nrep)
model <- CCAGFA::GFA(dmat_list, K, opts)
}
#' Project BGFA components common to all datasets back to the original space
#'
#' @param model Output of CCAGFA::GFA()
#' @param threshold [1,1] double, GFA model trimming threshold
#'
#' @return A list of data.frames, Original data reconstructed using only
#' those latent components that are active in all datasets
#'
#' @import CCAGFA
#' @export
BGFA_joint_info <- function(model, threshold=0.001){
model.trim <- CCAGFA::GFAtrim(model, threshold=threshold) #0.001 GFAtrim default threshold
all_active_match <- colSums(model.trim$active)==nrow(model.trim$active)
lapply(model.trim$W, function(w){ as.data.frame(model.trim$Z[,all_active_match] %*%
t(w[,all_active_match])) })
}
#' Apply GFA using the same interface as cocoreg()
#'
#' @description
#' Note: if K is too high GFA() might not converge in a meaningful time
#' or the computation may mysteriously crash.
#'
#' @param df_list [1,m] list of data.frames, Input data to GFA in COCOREG format
#' @param K [1,1] int, (Maximum) number of GFA components
#' @param Nrep [1,1] int, Number of random initialization used for learning the model
#' @param threshold [1,1] double, GFA model trimming threshold
#'
#' @return A list with elements:
#' \item{$data:}{[1,m] list of data.frames, Original data reconstructed using only
#' those latent components that are active in all datasets}
#' \item{$model:}{a list, Non-trimmed output of CCAGFA::GFA()}
#' \item{$dataid:}{string, Dataset identifier string}
#' \item{$method:}{string, Analysis method identifier string}
#' \item{$wall_time_taken:}{[1,1] double, Time taken to run the analysis in seconds}
#'
#' @export
BGFA_cocoreg_interface <- function(df_list, K=8, Nrep=2, threshold=0.001){
start_time <- Sys.time()
if ('id' %in% names(attributes(df_list)) ){
dataid <- attr(df_list,'id') #get before its gone
} else {
dataid <- 'NA'
}
df_list <- validate_data(df_list)
dmeta <- get_dc_meta(df_list)
df_list <- lapply(df_list, function(el){df_scale(el, center=T, scale=F)})
## A quick-n-dirty fix for one special dataset that triggers a bug in GFA
if ( length(grep("syn", dataid)) == 1 ){
#synthetic data -> reduce K to be able to compute
model <- wrapper_BGFA(df_list, K=4, Nrep=Nrep)
} else {
model <- wrapper_BGFA(df_list, K=K, Nrep=Nrep)
}
data <- BGFA_joint_info(model, threshold=threshold)
## Convert back to original data format
data <- apply_dc_meta(data, dmeta)
res_lst <- list(data=data,
model=model,
dataid=dataid,
method='gfa')
res_lst <- c(res_lst, list(wall_time_taken = as.numeric(difftime(Sys.time(), start_time, units = "sec"))))
class(res_lst) <- "gfa" ## Needed for effective recursive traversing of results in a nested list
res_lst
}
#' PCA projection using cocoreg interface
#'
#' @param df_list [1,m] list of data.frames, Input data to GFA in COCOREG format
#' @param prc_th [1,1] double, Threshold in percentage of cumulative variance explained
#' PCA components are included until cumulative explained variance reaches prc_th.
#'
#' @return A list with elements:
#' \item{$data}{[1,m] list of data.frames, Original data reconstructed using only
#' those latent components that are active in all datasets}
#' \item{$dataid}{string, Dataset identifier string}
#' \item{$method}{string, Analysis method identifier string}
#' \item{$wall_time_taken}{[1,1] double, Time taken to run the analysis in seconds}
#'
#' @export
PCA_cocoreg_interface <- function(df_list, prc_th=0.9){
start_time <- Sys.time()
dataid <- attr(df_list,'id') #get before its gone
df_list <- validate_data(df_list)
dmeta <- get_dc_meta(df_list)
df_list <- lapply(df_list, function(el){df_scale(el, center=T, scale=F)})
df <- do.call(cbind, df_list)
varnames <- lapply(seq.int(length(df_list)),
function(i){
paste0(names(df_list[i]),".",colnames(df_list[[i]]))
})
fit <- stats::prcomp(df, center=T, scale=T) #SVD
##summary(fit) # print variance accounted for
pc_inds <- which(summary(fit)$importance[3,] < prc_th)
if (length(pc_inds)==0){
pc_inds = 1
} else {
pc_inds <- as.numeric(c(pc_inds, max(pc_inds)+1))
}
pcd <- as.matrix(df) %*% fit$rotation[,pc_inds]
pd <- pcd %*% t(fit$rotation[,pc_inds])
pd <- lapply(varnames, function(el){as.data.frame(pd[,el])})
## Convert back to original data format
pd <- apply_dc_meta(pd, dmeta)
res_lst <- list(data=pd,
dataid=dataid,
method='pca',
wall_time_taken = as.numeric(difftime(Sys.time(), start_time, units = "sec")) )
class(res_lst) <- "pca" ## Needed for effective recursive traversing of results in a nested list
res_lst
}
#' SCA projection using cocoreg interface
#'
#' @param df_list, [1,m] list of data.frames, Input data to GFA in COCOREG format
#' @param nfac, [1,1] int, see multiway::sca() for details
#' @param type, string, Type of analysis, see multiway::sca() for details
#' @param rotation, string, see multiway::sca() for details
#' @param nstart, [1,1] int, see multiway::sca() for details
#'
#' @return A list with elements:
#' \item{$data}{[1,m] list of data.frames, Original data reconstructed using only
#' those latent components that are active in all datasets}
#' \item{$model}{list, The output of multiway::sca()}
#' \item{$dataid}{string, Dataset identifier string}
#' \item{$method}{string, Analysis method identifier string}
#' \item{$wall_time_taken}{[1,1] double, Time taken to run the analysis in seconds}
#'
#' @importFrom multiway sca
#' @export
SCA_cocoreg_interface <- function(df_list,
nfac=1, type="sca-p",
rotation="none", nstart=10){
start_time <- Sys.time()
## Extract metadata
dataid <- attr(df_list,'id') #get before its gone
df_list <- validate_data(df_list)
dmeta <- get_dc_meta(df_list)
df_list <- lapply(df_list, function(el){df_scale(el, center=T, scale=F)})
varnames <- lapply(seq.int(length(df_list)),
function(i){
paste0(names(df_list[i]),".",colnames(df_list[[i]]))
})
## SCA
dm_list <- lapply(df_list, as.matrix)
scamod <- multiway::sca(dm_list, nfac=nfac, type=type, rotation=rotation, nstart=nstart)
# Projection onto original variables
sca_comp <- vector(length=length(df_list), mode="list")
for (i in seq.int(length(df_list))){
sca_comp[[i]] <- scamod$D[[i]] %*% t(scamod$B)
}
## Convert back to original data format
sca_comp <- lapply(sca_comp, as.data.frame)
sca_comp <- apply_dc_meta(sca_comp, dmeta)
res_lst <- list(data=sca_comp,
model=scamod,
dataid=dataid,
method='sca',
wall_time_taken = as.numeric(difftime(Sys.time(), start_time, units = "sec")) )
class(res_lst) <- "sca" ## Needed for effective recursive traversing of results in a nested list
res_lst
}
#' COCOREG style analysis using RGCCA projection
#'
#' @description
#' COCOREG interface used for both input and output.
#'
#' @param dflst, [1,m] list of data.frames, Input data to GFA in COCOREG format
#' @param tauArr, [1,m] double, See RGCCA::rgcca() for details
#'
#' @return A list with elements:
#' \item{$data}{[1,m] list of data.frames, Original data reconstructed using only
#' those latent components that are active in all datasets}
#' \item{$model}{list, The output RGCCA::rgcca()}
#' \item{$dataid}{string, Dataset identifier string}
#' \item{$method}{string, Analysis method identifier string}
#' \item{$wall_time_taken}{[1,1] double, Time taken to run the analysis in seconds}
#'
#' @import RGCCA
#' @export
RGCCA_cocoreg_interface <- function(dflst, tauArr = rep(0.5, length(dflst))) {
start_time <- Sys.time()
## Extract metadata
dataid <- attr(dflst,'id') #get before its gone
df_list <- validate_data(dflst)
dmeta <- get_dc_meta(dflst)
# Get the first RGCCA component
res.rgcca = RGCCA::rgcca( A = lapply(dflst, as.matrix),
ncomp = rep(1,length(dflst)),
tau = tauArr,
verbose = F)
rgccaProjs <- lapply(res.rgcca$Y, as.data.frame)
rgccaEstimate <- mapply(df_scale_ols, rgccaProjs, dflst , SIMPLIFY=F)
res_lst <- list(data = rgccaEstimate,
model = res.rgcca,
dataid = dataid,
method = 'rgcca',
wall_time_taken = as.numeric(difftime(Sys.time(), start_time, units = "sec")) )
class(res_lst) <- "rgcca" ## Needed for effective recursive traversing of results in a nested list
res_lst
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.