R/columns.R
In maditr: Fast Data Aggregation, Modification, and Filtering with Pipes and 'data.table'

Documented in columns rows

#' Selects columns or rows from the data set
#'
#' - `columns`: select columns from dataset. There are four ways of column selection:
#' 1. Simply by column names
#' 2. By variable ranges, e. g. vs:carb. Alternatively, you can use '%to%'
#' instead of colon: 'vs %to% carb'.
#' 3. With regular expressions. Characters which start with '^' or end with '$'
#' considered as Perl-style regular expression patterns. For example, '^Petal'
#' returns all variables started with 'Petal'. 'Width$' returns all variables
#' which end with 'Width'. Pattern '^.' matches all variables and pattern
#' '^.*my_str' is equivalent to contains "my_str"'.
#' 4. By character variables with interpolated parts. Expression in the curly
#' brackets inside characters will be evaluated in the parent frame with
#' [text_expand]. For example, `a{1:3}` will be transformed to the names 'a1',
#' 'a2', 'a3'. 'cols' is just a shortcut for 'columns'. See examples.
#' ```
#' ```
#' - `rows`: select rows from dataset by logical conditions.
#'
#'
#' @param data data.table/data.frame
#' @param ... unquoted or quoted column names, regex selectors or variable
#'   ranges for 'columns' and logical conditions for 'rows'.
#'
#' @return data.frame/data.table
#' @export
#' @examples
#'
#' ## columns
#' mtcars %>%
#'     columns(vs:carb, cyl)
#' mtcars %>%
#'     columns(-am, -cyl)
#'
#' # regular expression pattern
#' columns(iris, "^Petal") %>% head() # variables which start from 'Petal'
#' columns(iris, "Width$") %>% head() # variables which end with 'Width'
#' # move Species variable to the front.
#' # pattern "^." matches all variables
#' columns(iris, Species, "^.") %>% head()
#' # pattern "^.*i" means "contains 'i'"
#' columns(iris, "^.*i") %>% head()
#' # numeric indexing - all variables except Species
#' columns(iris, 1:4) %>% head()
#'
#' # variable expansion
#' dims = c("Width", "Length")
#' columns(iris, "Petal.{dims}") %>% head()
#'
#' # rows
#'
#' mtcars %>%
#'     rows(am==0) %>%
#'     head()
#'
#' # select rows with compound condition
#' mtcars %>%
#'     rows(am==0 & mpg>mean(mpg))
#'
columns = function(data, ...){
    UseMethod("columns")
}


#' @export
columns.data.frame = function(data, ...){
    # if data is expression we want to calculate it only once
    data = force(data)
    data_names = colnames(data)
    parent_frame = parent.frame()
    var_indexes = select_columns(...,
                                 data_names = data_names,
                                 frame = parent_frame,
                                 type = "index"
                                 )

    if(is.data.table(data)){
        data[, var_indexes, with = FALSE]
    } else {
        data[, var_indexes, drop = FALSE]
    }

}

#' @rdname columns
#' @export
cols = columns


is_columns = function(expr){
    is.call(expr) && (identical(expr[[1]], quote(columns)) || identical(expr[[1]], quote(cols)) || identical(expr[[1]], quote(`%to%`)))
}

# here we find `columns` expression and replace it with data.table(...) or c(...)
replace_column_expr = function(expr, data_names, frame,
                               type = c("data.table", "index", "names"),
                               new = FALSE){
    if(missing(expr)) return(missing_arg())
    type = match.arg(type)
    if(is.call(expr)){
        if(new){
            curr_action = quote(create_columns)
        } else {
            curr_action = quote(select_columns)
        }
        if(is_columns(expr)){
            if(identical(expr[[1]], quote(`%to%`))){
                expr = as.call(list(curr_action, expr)) # standalone a %to% b, without 'columns'
            } else {
                # 'columns(...)'
                expr[[1]] = curr_action
            }
            expr = as.call(c(as.list(expr),
                             list(data_names = data_names,
                                  frame = frame,
                                  type = type)
                             )
                           )
            expr = eval(expr)

        } else {
            res = lapply(as.list(expr),
                         replace_column_expr,
                         data_names = data_names,
                         frame = frame,
                         type = type,
                         new = new
                         )
            expr = as.call(res)
        }
    }
    expr
}


select_columns = function(..., data_names, frame, type){
    var_list = substitute(list(...))
    curr_range_expander = create_range_expander(data_names, frame)
    var_list = substitute_symbols(var_list, list(
        ":" = curr_range_expander,
        "%to%" = curr_range_expander,
        "-" = create_unary_minus(data_names, frame)
    ))
    all_indexes = create_list_with_index(data_names)
    var_list = eval(var_list, all_indexes, frame)
    # here we processed values which didn't resolved to numeric column index
    var_indexes = expand_characters(var_list, data_names, frame)
    var_indexes = unique(unlist(var_indexes, recursive = TRUE, use.names = FALSE))
    switch(type,
           "data.table"  = {
               all_symbols = create_list_with_symbols(data_names)
               res = all_symbols[var_indexes]
               as.call(c(quote(data.table), res))
           },
           "index" = var_indexes,
           "names" = data_names[var_indexes]

    )
}

create_columns = function(..., data_names, frame, type){
    var_list = substitute(list(...))
    var_list = eval_expressions(var_list, frame)
    # unknown names
    # it is for unquoted names
    new_names = setdiff(all.vars(var_list), data_names)

    curr_range_expander = create_range_expander(data_names, frame, new = TRUE)
    var_list = substitute_symbols(var_list, list(
        ":" = curr_range_expander,
        "%to%" = curr_range_expander
    ))
    all_names = create_list_with_names(c(data_names, new_names))
    var_list = eval(var_list, all_names, frame)
    new_names = expand_characters(var_list, data_names, frame, new = TRUE)
    unique(unlist(new_names, recursive = TRUE, use.names = FALSE))
}

eval_expressions = function(var_list, frame){
    # eval expressions, symbols remain as is
    if(length(var_list)>1){
        var_list = as.list(var_list)
        var_list[-1] = lapply(var_list[-1], function(item){
            if(is.call(item) && !(identical(item[[1]], quote(`:`)) || identical(item[[1]], quote(`%to%`)))){
                item = eval(item, envir = frame)
            }
            item
        })
        var_list = as.call(var_list)
    }
    var_list
}

###
expand_characters = function(selected, data_names, frame, new = FALSE){
    # expand text and regex
    selected = lapply(selected, function(item){
        # browser()
        if(is.character(item)){
            item = s_regex_expand(item, data_names)
            item = s_text_expand(item, data_names, frame)
            if(!new){
                item = create_index_vec(item, df_names = data_names)
            }
        }
        item

    })
    selected
}

create_unary_minus = function(data_names, frame){
    force(data_names)
    force(frame)
    function(e1, e2){
        if(missing(e2)){
            if(is.character(e1)){
                var_indexes = expand_characters(e1, data_names, frame)
                var_indexes = unique(unlist(var_indexes, recursive = TRUE, use.names = FALSE))
                return(-var_indexes)
            } else {
                return(-e1)
            }
        }
        base::`-`(e1, e2)
    }
}

is_regex = function(txt){
    is.character(txt) && any(startsWith(txt, "^") | endsWith(txt, "$"))
}

s_regex_expand = function(expr, df_names){
    # TODO here we have possible unexpected behaviour - when expr is vec of several character
    # if one of them is regex all other will be considered as regex
    if(!is_regex(expr)) return(expr)
    res = lapply(expr, grep, x = df_names, perl = TRUE, value = TRUE)
    res = unlist(res, recursive = TRUE, use.names = FALSE)
    (length(res)==0) && stop(paste("'columns' - there are no variables which match regex(-s): ", paste(expr, collapse = ",")))
    unlist(res, recursive = TRUE, use.names = TRUE)
}

s_text_expand = function(expr, df_names, frame){
    res = eval(substitute(maditr::text_expand(item), list(item = expr)), envir = frame)
    unlist(res, recursive = TRUE, use.names = TRUE)
}

create_range_expander = function(df_names, frame, new = FALSE){
    force(df_names)
    function(from, to){   # , frame, new = FALSE
        from = substitute(from)
        to = substitute(to)
        if(is.call(from)) from = eval(from, frame)
        if(is.call(to)) to = eval(to, frame)
        if(is.symbol(from)) from = as.character(from)
        if(is.symbol(to)) to = as.character(to)
        if(is.numeric(from) && is.numeric(to)) { #
            if(new){
                res = df_names[from:to]
                !anyNA(res) || stop("'columns' range selection: positions ", from, ":", to, " out of range.")
                return(res)
            } else {
                return(from:to)
            }
        }
        first = match(from, df_names)[1]
        last = match(to, df_names)[1]
        if(is.na(first) && is.na(last) && new) return(
            create_name_sequence(from, to)
        )
        (!is.na(first) &&  !is.na(last)) || stop("'columns' range selection: variables '", from, "' or '", to, "' are not found.")
        (last>=first) || stop( "'columns' range selection: '",to, "' located before '",from,"'. Did you mean '",to," %to% ",from,"'?")
        if(new) df_names[first:last] else first:last
    }
}

create_index_vec = function(res, df_names){
    match(res, df_names)
}

create_list_with_index = function(data_names){
    res = as.list(seq_along(data_names))
    names(res) = data_names
    res
}

###

create_list_with_symbols = function(data_names){
    res = lapply(data_names, as.symbol)
    names(res) = data_names
    res
}

create_list_with_names = function(data_names){
    res = as.list(data_names)
    names(res) = data_names
    res
}





####



create_name_sequence = function(from, to){
    patt1 = gsub("^(.+?)([\\d]+)$", "\\1", from, perl = TRUE)
    patt2 = gsub("^(.+?)([\\d]+)$", "\\1", to, perl = TRUE)
    (patt1 == patt2) || stop("'columns range selection': start and end variables begin from different patterns: '", patt1, "', '", patt2,"'.")
    digits1 = gsub("^(.+?)([\\d]+)$", "\\2", from, perl = TRUE)
    digits2 = gsub("^(.+?)([\\d]+)$", "\\2", to, perl = TRUE)
    padding = 0
    if((substr(digits1,1,1)=="0" || substr(digits2,1,1)=="0") &&
       !(substr(digits1,1,1)=="0" && nchar(digits1)==1 && substr(digits2,1,1)!=0)){
        (nchar(digits1) == nchar(digits2)) ||
            stop("'columns' range selection: invalid use of the '%to%' convention. For zero-padded numbers numeric part of the names must be the same length but: '",
                 from, ", '", to, "'.")
        padding = nchar(digits1)
    }
    digits1 = as.numeric(digits1)
    digits2 = as.numeric(digits2)
    (digits1<=digits2) || stop("'columns' range selection: name of start variables greater than name of end variables: '", from,"' > '",to,"'.")
    all_digits = digits1:digits2
    if(padding>0) all_digits = formatC(all_digits, width = padding, format = "d", flag = "0")
    res = paste0(patt1, all_digits)
    unlist(res, recursive = TRUE, use.names = TRUE)
}




########################################################

#' @rdname columns
#' @export
rows = function(data, ...){
    UseMethod("rows")
}


#' @export
rows.data.frame = function(data, ...){
    curr_names = names(substitute(list(...)))
    if(!is.null(curr_names)){
        if(any(c("by", "keyby") %in% curr_names)){
            stop("'rows': you try to use 'by' or 'keyby'. Sorry, but grouped filtering is not yet supported.")
        }
        curr_names = curr_names[curr_names!=""][[1]]
        stop(sprintf("'rows': it seems you use '=' instead of '==': %s.", curr_names))
    }

    parent_frame = parent.frame()
    # if data is expression we want to calculate it only once
    data = force(data)
    # NULL is just a placeholder
    expr = substitute(
        NULL[Reduce(f = '&', list(...)),]
    )
    eval_in_parent_frame(data, expr, frame = parent_frame)

}

#' @export
rows.etable = function(data, ...){
    data_class = class(data)
    data = as.data.table(data)
    res = eval.parent(
        substitute(maditr::rows(data, ...))
    )
    setDF(res)
    class(res) = data_class
    res
}




#########
Any scripts or data that you put into this service are public.
maditr documentation built on April 2, 2022, 5:05 p.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
maditr
Fast Data Aggregation, Modification, and Filtering with Pipes and 'data.table'

R/columns.R
In maditr: Fast Data Aggregation, Modification, and Filtering with Pipes and 'data.table'

Documented in columns rows

Try the maditr package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

maditr Fast Data Aggregation, Modification, and Filtering with Pipes and 'data.table'

R/columns.R In maditr: Fast Data Aggregation, Modification, and Filtering with Pipes and 'data.table'

Documented in columns rows

Try the maditr package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

maditr
Fast Data Aggregation, Modification, and Filtering with Pipes and 'data.table'

R/columns.R
In maditr: Fast Data Aggregation, Modification, and Filtering with Pipes and 'data.table'