R/split_data_class.R

Defines functions split_data

Documented in split_data

#' @eval get_description('split_data')
#' @export split_data
#' @examples
#' M = split_data(p_train=0.75)
#'
split_data <- function(p_train, ...) {
    # Build the 'split_data' object via struct, forwarding p_train and any
    # additional named arguments supplied through `...` unchanged.
    struct::new_struct(
        'split_data',
        p_train = p_train,
        ...
    )
}


# S4 class definition for the 'split_data' model. It extends 'model' and
# declares one input parameter (p_train) and two outputs (training, testing),
# each stored as an 'entity' slot.
.split_data<-setClass(
    "split_data",
    contains = c('model'),
    slots=c(
        p_train='entity',
        training='entity',
        testing='entity'
    ),

    prototype=list(
        name = 'Split data',
        # Each fragment ends with a space so the sentences do not run together
        # once paste0 concatenates them.
        description = paste0('The data matrix is divided into two subsets. ',
        'A predefined proportion of the samples are randomly selected for a ',
        'training set, and the remaining samples are used for the test set.'),
        type = 'processing',
        predicted = 'testing',
        # slot names registered as input parameters and as outputs
        .params=c('p_train'),
        .outputs=c('training','testing'),

        # proportion of samples placed in the training set (default 0.75)
        p_train=entity(name = 'Proportion in training set',
            description = paste0('The proportion of samples selected for the ',
            'training set.'),
            value = 0.75,
            type='numeric'),

        # output: samples selected for training; initialised to an empty
        # DatasetExperiment
        training=entity(name = 'A DatasetExperiment of training data',
            description = 'A DatasetExperiment object containing samples selected for the training set.',
            type='DatasetExperiment',
            value=DatasetExperiment()
        ),
        # output: the remaining samples; initialised to an empty
        # DatasetExperiment
        testing=entity(name = 'A DatasetExperiment of data for testing',
            description = 'A DatasetExperiment object containing samples selected for the testing set.',
            type='DatasetExperiment',
            value=DatasetExperiment()
        )
    )
)

#' @export
#' @template model_apply
setMethod(f="model_apply",
    signature=c("split_data","DatasetExperiment"),
    definition=function(M,D) {
        # Randomly partition the rows (samples) of D into a training set and
        # a testing set, store both as outputs of M, and return M.
        opt=param_list(M)
        # number of samples
        nMax=nrow(D$data)
        # number in the training set
        n=floor(nMax*opt$p_train)
        # select a random subset of the data for training
        # sample.int(nMax, ...) draws from 1..nMax exactly as
        # sample(1:nMax, ...) does, but does not rely on 1:nMax, which is
        # c(1, 0) when there are no samples.
        in_training=sample.int(nMax,size = n, replace=FALSE,prob=NULL)
        # Rows left for testing, kept in their original order. Positive
        # indices are required here: when n is 0, in_training is empty and
        # negative indexing with an empty vector selects NO rows, which would
        # leave the testing set empty instead of holding every sample.
        in_testing=setdiff(seq_len(nMax),in_training)
        training=DatasetExperiment(data=D$data[in_training,,drop=FALSE],
            sample_meta=D$sample_meta[in_training,,drop=FALSE],
            variable_meta=D$variable_meta,
            name=c(D$name,'(Training set)'),
            description=c(D$description,'A subset of the data has been selected as a training set'))
        testing=DatasetExperiment(data=D$data[in_testing,,drop=FALSE],
            sample_meta=D$sample_meta[in_testing,,drop=FALSE],
            variable_meta=D$variable_meta,
            name=c(D$name,'(Testing set)'),
            description=c(D$description,'A subset of the data has been selected as a test set'))
        output_value(M,'training')=training
        output_value(M,'testing')=testing

        return(M)
    }
)
computational-metabolomics/structtoolbox documentation built on Feb. 9, 2024, 8:19 a.m.