# R/count_if.R In expss: Tables, Labels and Some Useful Functions from Spreadsheets and 'SPSS' Statistics

#### Documented in apply_col_ifapply_row_ifcount_col_ifcount_ifcount_row_ifhasmax_col_ifmax_ifmax_row_ifmean_col_ifmean_ifmean_row_ifmedian_col_ifmedian_ifmedian_row_ifmin_col_ifmin_ifmin_row_ifsd_col_ifsd_ifsd_row_ifsum_col_ifsum_ifsum_row_if

#' Count/sum/average/other functions on values that meet a criterion
#'
#' These functions calculate count/sum/average/etc. on values that meet a
#' criterion that you specify. \code{apply_if_*} apply custom functions. There
#' are different flavors of these functions: \code{*_if} work on entire
#' dataset/matrix/vector, \code{*_row_if} works on each row and \code{*_col_if}
#' works on each column.
#'
#' @param criterion Vector with counted values or
#'   function. See details and examples.
#'
#' @param ... Data on which criterion will be applied. Vector, matrix,
#'   data.frame, list.
#'
#' @param x Data on which criterion will be applied. Vector, matrix,
#'   data.frame, list.
#'
#' @param data Data on which function will be applied. Doesn't applicable to
#'   \code{count_*_if} functions. If omitted then function will be applied on
#'   the ... argument.
#'
#' @param fun Custom function that will be applied based on criterion.
#'
#' @return
#' \code{*_if} return single value (vector of length 1).
#' \code{*_row_if} returns vector for each row of supplied arguments.
#' \code{*_col_if} returns vector for each column of supplied arguments.
#' \code{\%row_in\%}/\code{\%col_in\%} return logical vector - indicator of
#' presence of criterion in each row/column. \code{\%has\%} is an alias for
#' \code{\%row_in\%}.
#'
#' @details
#' Possible type for criterion argument:
#' \itemize{
#' \item{vector/single value}{ All values in \code{...} which equal to the
#' elements of vector in the criteria will be used as function \code{fun}
#' argument.}
#' \item{function}{ Values for which function gives TRUE will be used as
#' function \code{fun} argument. There are some special functions for
#' convenience (e. g. \code{gt(5)} is equivalent ">5" in spreadsheet) - see
#'
#' \code{count*} and \code{\%in*\%} never returns NA's. Other functions remove
#' NA's before calculations (as \code{na.rm = TRUE} in base R functions).
#'
#' Function criterion should return logical vector of same size and shape as its
#' argument. This function will be applied to each column of supplied data and
#' TRUE results will be used. There is asymmetrical behavior in \code{*_row_if}
#' and \code{*_col_if} for function criterion: in both cases function criterion
#' will be applied columnwise.
#'
#' @export
#' @examples
#' set.seed(123)
#' sheet1 = as.sheet(
#'        matrix(sample(c(1:10,NA), 30, replace = TRUE), 10)
#' )
#'
#' result  = let(sheet1,
#'              # count 8
#'              exact = count_row_if(8, V1, V2, V3),
#'              # count values greater than 8
#'              greater = count_row_if(gt(8), V1, V2, V3),
#'              # count integer values between 5 and 8, e. g. 5, 6, 7, 8
#'              integer_range = count_row_if(5:8, V1, V2, V3),
#'              # count values between 5 and 8
#'              range = count_row_if(5 %thru% 8, V1, V2, V3),
#'              # count NA
#'              na = count_row_if(is.na, V1, V2, V3),
#'              # count not-NA
#'              not_na = count_row_if(not_na, V1, V2, V3),
#'              # are there any 5 in each row?
#'              has_five = cbind(V1, V2, V3) %row_in% 5
#'          )
#' print(result)
#'
#' mean_row_if(6, sheet1$V1, data = sheet1) #' median_row_if(gt(2), sheet1$V1, sheet1$V2, sheet1$V3)
#' sd_row_if(5 %thru% 8, sheet1$V1, sheet1$V2, sheet1$V3) #' #' if_na(sheet1) = 5 # replace NA #' #' # custom apply #' apply_col_if(prod, gt(2), sheet1$V1, data = sheet1) # product of all elements by columns
#' apply_row_if(prod, gt(2), sheet1$V1, data = sheet1) # product of all elements by rows #' #' # Examples borrowed from Microsoft Excel help for COUNTIF #' sheet1 = text_to_columns( #' " #' a b #' apples 32 #' oranges 54 #' peaches 75 #' apples 86 #' " #' ) #' #' count_if("apples", sheet1$a) # 2
#'
#' count_if("apples", sheet1) # 2
#'
#' with(sheet1, count_if("apples", a, b)) # 2
#'
#' count_if(gt(55), sheet1$b) # greater than 55 = 2 #' #' count_if(ne(75), sheet1$b) # not equal 75 = 3
#'
#' count_if(ge(32), sheet1$b) # greater than or equal 32 = 4 #' #' count_if(gt(32) & lt(86), sheet1$b) # 2
#'
#' # count only integer values between 33 and 85
#' count_if(33:85, sheet1$b) # 2 #' #' # values with letters #' count_if(regex("^[A-z]+$"), sheet1) # 4
#'
#' # values that started on 'a'
#' count_if(regex("^a"), sheet1) # 2
#'
#' # count_row_if
#' count_row_if(regex("^a"), sheet1) # c(1,0,0,1)
#'
#' sheet1 %row_in% 'apples'  # c(TRUE,FALSE,FALSE,TRUE)
#'
#' # Some of Microsoft Excel examples for SUMIF/AVERAGEIF/etc
#' sheet1 = text_to_columns(
#'     "
#'     property_value commission data
#'     100000              7000  250000
#'     200000             14000
#'     300000             21000
#'     400000             28000
#'     "
#' )
#'
#' # Sum of commision for property value greater than 160000
#' with(sheet1, sum_if(gt(160000), property_value, data = commission)) # 63000
#'
#' # Sum of property value greater than 160000
#' with(sheet1, sum_if(gt(160000), property_value)) # 900000
#'
#' # Sum of commision for property value equals to 300000
#' with(sheet1, sum_if(300000, property_value, data = commission)) # 21000
#'
#' # Sum of commision for property value greater than first value of data
#' with(sheet1, sum_if(gt(data[1]), property_value, data = commission)) # 49000
#'
#' sheet1 = text_to_columns(
#'        "
#'          category     food sales
#'        Vegetables Tomatoes  2300
#'        Vegetables   Celery  5500
#'            Fruits  Oranges   800
#'              NA     Butter   400
#'        Vegetables  Carrots  4200
#'            Fruits   Apples  1200
#'        "
#'        )
#'
#' # Sum of sales for Fruits
#' with(sheet1, sum_if("Fruits", category, data = sales)) # 2000
#'
#' # Sum of sales for Vegetables
#' with(sheet1, sum_if("Vegetables", category, data = sales)) # 12000
#'
#' # Sum of sales for food which is ending on 'es'
#' with(sheet1, sum_if(perl("es\$"), food, data = sales)) # 4300
#'
#' # Sum of sales for empty category
#' with(sheet1, sum_if(NA, category, data = sales))  # 400
#'
#'
#' sheet1 = text_to_columns(
#'     "
#'     property_value commission data
#'     100000              7000  250000
#'     200000             14000
#'     300000             21000
#'     400000             28000
#'     "
#' )
#'
#' # Commision average for comission less than 23000
#' with(sheet1, mean_if(lt(23000), commission)) # 14000
#'
#'
#' # Property value average for property value less than 95000
#' with(sheet1, mean_if(lt(95000), property_value)) #  NaN
#'
#' # Commision average for property value greater than 250000
#' with(sheet1, mean_if(gt(250000), property_value, data = commission)) # 24500
#'
#'
#' sheet1 = text_to_columns(
#'     '
#'                 region  profits
#'                   East   45678
#'                   West   23789
#'                  North   -4789
#'     "South (New Office)"     0
#'                MidWest    9678
#'     ',
#'     quote = '"'
#' )
#'
#'
#' # Mean profits for 'west' regions
#' with(sheet1, mean_if(contains("West"), region, data = profits)) # 16733.5
#'
#'
#' # Mean profits for regions wich doesn't contain New Office
#' with(sheet1, mean_if(not(contains("New Office")), region, data = profits))  # 18589
#'
#'
#' sheet1 = text_to_columns(
#'     "
#'     89      1
#'     93      2
#'     96      2
#'     85      3
#'     91      1
#'     88      1
#'     "
#' )
#'
#' # Minimum grade for weight equals to 1
#' with(sheet1, min_if(1, weight, data = grade)) # 88
#'
#'
#' # Maximum grade for weight equals to 1
#' with(sheet1, max_if(1, weight, data = grade)) #91
#'
#'
#' # Example with offset
#' sheet1 = text_to_columns(
#'     "
#'        10    b
#'        11    a
#'       100    a
#'       111    b
#'         1    a
#'         1    a
#'     "
#' )
#'
#' with(sheet1, min_if("a", grade[2:5], data = weight[1:4])) # 10
#'
#'
count_if=function(criterion,...){
cond = build_condition_matrix(criterion, ..., logical_as_numeric = TRUE)
matrixStats::sum2(cond, na.rm=TRUE)
}

#' @export
#' @rdname count_if
count_row_if=function(criterion,...){
cond = build_condition_matrix(criterion, ..., logical_as_numeric = TRUE)
matrixStats::rowSums2(cond)
}

#' @export
#' @rdname count_if
count_col_if=function(criterion,...){
cond = build_condition_matrix(criterion, ..., logical_as_numeric = TRUE)
res = matrixStats::colSums2(cond)
names(res) = colnames(cond)
res
}

#' @export
#' @rdname count_if
has = function(x, criterion){
cond = build_condition_matrix(criterion, x)
matrixStats::rowAnys(cond)
}

#' @export
#' @rdname count_if
'%row_in%'= has
#' @export
#' @rdname count_if
'%has%' = has

#' @export
#' @rdname count_if
'%col_in%'=function(x, criterion){
cond = build_condition_matrix(criterion, x)
res = matrixStats::colAnys(cond)
names(res) = colnames(cond)
res
}

#' @export
#' @rdname count_if
sum_if=function(criterion, ..., data = NULL){
data = fun_if_helper(criterion = criterion, ..., data = data, logical_as_numeric = TRUE)
matrixStats::sum2(data, na.rm = TRUE)
}

#' @export
#' @rdname count_if
sum_row_if=function(criterion,..., data = NULL){
data = fun_if_helper(criterion = criterion, ..., data = data, logical_as_numeric = TRUE)
matrixStats::rowSums2(data, na.rm=TRUE)
}

#' @export
#' @rdname count_if
sum_col_if=function(criterion,..., data = NULL){
data = fun_if_helper(criterion = criterion, ..., data = data, logical_as_numeric = TRUE)
res = matrixStats::colSums2(data, na.rm=TRUE)
names(res) = colnames(data)
res
}

################################################

#' @export
#' @rdname count_if
mean_if=function(criterion, ..., data = NULL){
data = fun_if_helper(criterion = criterion, ..., data = data, logical_as_numeric = TRUE)
matrixStats::mean2(data, na.rm=TRUE)
}

#' @export
#' @rdname count_if
mean_row_if=function(criterion,..., data = NULL){
data = fun_if_helper(criterion = criterion, ..., data = data, logical_as_numeric = TRUE)
matrixStats::rowMeans2(data, na.rm=TRUE)
}

#' @export
#' @rdname count_if
mean_col_if=function(criterion,..., data = NULL){
data = fun_if_helper(criterion = criterion, ..., data = data, logical_as_numeric = TRUE)
res = matrixStats::colMeans2(data, na.rm=TRUE)
names(res) = colnames(data)
res
}

################################################

#' @export
#' @rdname count_if
sd_if=function(criterion, ..., data = NULL){
data = fun_if_helper(criterion = criterion, ..., data = data, logical_as_numeric = TRUE)
if(!is.numeric(data)) stop("sd_if: argument should be numeric.")
stats::sd(data, na.rm=TRUE)
}

#' @export
#' @rdname count_if
sd_row_if=function(criterion,..., data = NULL){
data = fun_if_helper(criterion = criterion, ..., data = data, logical_as_numeric = TRUE)
matrixStats::rowSds(data, na.rm=TRUE)
}

#' @export
#' @rdname count_if
sd_col_if=function(criterion,..., data = NULL){
data = fun_if_helper(criterion = criterion, ..., data = data, logical_as_numeric = TRUE)
res = matrixStats::colSds(data, na.rm=TRUE)
names(res) = colnames(data)
res
}

################################################

#' @export
#' @rdname count_if
median_if=function(criterion, ..., data = NULL){
data = fun_if_helper(criterion = criterion, ..., data = data, logical_as_numeric = TRUE)
if(!is.numeric(data)) stop("median_if: argument should be numeric.")
stats::median(data, na.rm=TRUE)
}

#' @export
#' @rdname count_if
median_row_if=function(criterion,..., data = NULL){
data = fun_if_helper(criterion = criterion, ..., data = data, logical_as_numeric = TRUE)
matrixStats::rowMedians(data, na.rm=TRUE)
}

#' @export
#' @rdname count_if
median_col_if=function(criterion,..., data = NULL){
data = fun_if_helper(criterion = criterion, ..., data = data, logical_as_numeric = TRUE)
res = matrixStats::colMedians(data, na.rm=TRUE)
names(res) = colnames(data)
res
}

###################################################

#' @export
#' @rdname count_if
max_if=function(criterion, ..., data = NULL){
data = fun_if_helper(criterion = criterion, ..., data = data, logical_as_numeric = TRUE)
res = suppressWarnings(max(data, na.rm=TRUE))
res[!is.finite(res)] = NA
res
}

#' @export
#' @rdname count_if
max_row_if=function(criterion,..., data = NULL){
data = fun_if_helper(criterion = criterion, ..., data = data, logical_as_numeric = TRUE)
res = matrixStats::rowMaxs(data, na.rm=TRUE)
res[!is.finite(res)] = NA
res
}

#' @export
#' @rdname count_if
max_col_if=function(criterion,..., data = NULL){
data = fun_if_helper(criterion = criterion, ..., data = data, logical_as_numeric = TRUE)
res = matrixStats::colMaxs(data, na.rm=TRUE)
res[!is.finite(res)] = NA
names(res) = colnames(data)
res
}

##########################################################

#' @export
#' @rdname count_if
min_if=function(criterion, ..., data = NULL){
data = fun_if_helper(criterion = criterion, ..., data = data, logical_as_numeric = TRUE)
res = suppressWarnings(min(data, na.rm=TRUE))
res[!is.finite(res)] = NA
res
}

#' @export
#' @rdname count_if
min_row_if=function(criterion,..., data = NULL){
data = fun_if_helper(criterion = criterion, ..., data = data, logical_as_numeric = TRUE)
res = matrixStats::rowMins(data, na.rm=TRUE)
res[!is.finite(res)] = NA
res
}

#' @export
#' @rdname count_if
min_col_if=function(criterion,..., data = NULL){
data = fun_if_helper(criterion = criterion, ..., data = data, logical_as_numeric = TRUE)
res = matrixStats::colMins(data, na.rm=TRUE)
res[!is.finite(res)] = NA
names(res) = colnames(data)
res
}

#' @export
#' @rdname count_if
apply_row_if=function(fun, criterion,..., data = NULL){
fun = match.fun(fun)
cond = build_condition_matrix(criterion, ...)
if (is.null(data)){
data = flat_list(
list(...),
flat_df = FALSE
)
data = do.call(cbind, data)
}
if(!is.matrix(data)) data = as.matrix(data)
stopifnot(
NROW(cond)==NROW(data),
NCOL(cond)==1 || NCOL(cond)==NCOL(data)
)

rows = seq_len(nrow(data))
res = lapply(rows, function(each_row){
filtered_row = data[each_row,][cond[each_row,]]
fun(filtered_row)
})

if(any(lengths(res) != 1)){
stop("'apply_row_if': incorrect result - function returns values with length greater than one.")
}
unlist(res, use.names = FALSE, recursive = TRUE)
}

#' @export
#' @rdname count_if
apply_col_if=function(fun, criterion,..., data = NULL){
fun = match.fun(fun)
cond = build_condition_matrix(criterion, ...)
if (is.null(data)){
data = flat_list(
list(...),
flat_df = FALSE
)
data = do.call(cbind, data)
}
if(!is.matrix(data)) data = as.matrix(data)
stopifnot(
NROW(cond)==NROW(data),
NCOL(cond)==1 || NCOL(cond)==NCOL(data)
)

cols = seq_len(ncol(data))
if(NCOL(cond) > 1){
res = lapply(cols, function(each_col){
filtered_row = data[, each_col][cond[,each_col]]
fun(filtered_row)
})

} else {
data = data[cond, ,drop = FALSE]
res = lapply(cols, function(each_col){
filtered_row = data[, each_col]
fun(filtered_row)
})
}
if(any(lengths(res) != 1)){
stop("'apply_col_if': incorrect result - function returns values with length greater than one.")
}
res = unlist(res, use.names = FALSE, recursive = TRUE)
names(res) = colnames(data)
res
}

#########################################################
fun_if_helper = function(criterion,..., data, logical_as_numeric = FALSE){
if(is.null(data)){
args = flat_list(
list(...),
flat_df = FALSE
)
data = do.call(cbind, args)
}
if(!is.matrix(data)) data = as.matrix(data)
if(logical_as_numeric && is.logical(data)) storage.mode(data) = "integer"
cond = build_condition_matrix(criterion, ...)
stopifnot(
NROW(cond)==NROW(data),
NCOL(cond)==1 || NCOL(cond)==NCOL(data)
)
data[!cond] = NA
data

}

build_condition_matrix = function(criterion, ..., logical_as_numeric = FALSE){
cond = flat_list(
list(...),
flat_df = FALSE
)

if(!inherits(criterion, "criterion")) criterion = as.criterion(criterion)
cond = apply_criterion(cond, criterion)
cond = do.call(cbind, cond)
if(!is.matrix(cond)) cond = as.matrix(cond)
if(logical_as_numeric && is.logical(cond)) storage.mode(cond) = "integer"
cond
}

# optimization after profiling
apply_criterion = function(obj, crit){
UseMethod("apply_criterion")
}

#' @export
apply_criterion.list = function(obj, crit){
lapply(obj, apply_criterion, crit)
}

#' @export
apply_criterion.data.frame = function(obj, crit){
obj[] = lapply(obj, apply_criterion, crit)
obj
}

#' @export
apply_criterion.default = function(obj, crit){
crit(obj)
}

#' @export
apply_criterion.matrix = function(obj, crit){
res = crit(obj)
res = matrix(res, nrow = nrow(obj), ncol = ncol(obj))
dimnames(res) = dimnames(obj)
res
}


## Try the expss package in your browser

Any scripts or data that you put into this service are public.

expss documentation built on July 26, 2023, 5:23 p.m.