#' Split data into bins (usually time bins), then calculate percentiles of data in each bin.
#'
#' Essentially the part of VPC that deals with observed data. Good for finding trends in large data sets.
#'
#' @author Rupert Austin
#' @param data data frame. Suitably filtered and containing the data to analyse.
#' @param bins numeric vector giving the cut off values for the time bins (binning doesn't have to based on time - see ind_var_col).
#' @param dv_col column name (default value 'DV') in \eqn{data} containing the experimental data (default value is 'DV')
#' @param ind_var_col column name (default value 'TIME') in \eqn{data} containing the variable to use for binning.
#' @param pct numeric vector of percentiles (default value c(10,50,90)) to calculate for the dv_col data within each bin.
#' @return A matrix object with one column for each bin, and the following rows of summary statistics for the dv_col data within each bin
#' (see sum_stats function documentation for descriptions):
#' \itemize{
#' \item min_ind_var_col
#' \item max_ind_var_col
#' \item n
#' \item min
#' \item max
#' \item median
#' \item arith_mean
#' \item arith_std_dev
#' \item arith_CV
#' \item geo_mean
#' \item geo_std_dev
#' \item geo_CV
#' \item requested percentiles (pct)
#' }
#' @note For a bin with limits (LOWER,UPPER), an observation will go into that bin if it is >=LOWER and <UPPER.
#' @export
################################################################################
################################################################################
# function to bin experimental data and summarize it for use in plots and tables
# of binned experimental statistics.
#
# For a bin with limits (LOWER,UPPER) an observation will go into that bin if it is >=LOWER and <UPPER.
#
# function inputs:
#
# data=suitably filtered data frame containing the data to analyse
# bins=vector giving the cut off values for the time bins (binning doesn't have to based on time - see ind_var_col)
# dv_col=column name in 'data' containing the experimental data (defualt value is 'DV')
# ind_var_col=column name in 'data' containing the variable to use for binning (default value is 'TIME')
# pct=vector of percentiles to calculate for the dv_col data within each bin (defualt value is c(10,50,90))
#
# function outputs:
#
# returns a matrix object with one column for each bin, and the following rows of summary statistics
# for the dv_col data within each bin:
#
# min_ind_var_col
# max_ind_var_col
# n
# min
# max
# median
# arith_mean
# arith_std_dev
# arith_CV
# geo_mean
# geo_std_dev
# geo_CV
# requested percentiles
#
################################################################################
################################################################################
bin_stats=function(data,bins,dv_col='DV',ind_var_col='TIME',pct=c(10,50,90)){
#.................................................................
# function for calculating summary statistics on a vector of data
s_stats=function(data){
n_tot=length(data)
data=data[is.na(data)==F]
n_val=length(data)
num_NA=n_tot-n_val
arith_mean=mean(data)
median=median(data)
min_val=min(data)
max_val=max(data)
stdev=sd(data)
arith_CV=stdev/arith_mean
# Geometric statistics. First check all data are positive
geo_mean=NA
geo_sd=NA
geo_CV=NA
if(all(data>0)){
ln_data=log(data)
geo_mean=exp(mean(ln_data))
geo_sd=exp(sd(ln_data))
ln_geo_sd2=(log(geo_sd))^2
geo_CV=(exp(ln_geo_sd2)-1)^0.5
}
res=c(n=n_val,n_NA=num_NA,min=min_val,max=max_val,median=median,arith_mean=arith_mean,std_dev=stdev,arith_CV=arith_CV,geo_mean=geo_mean,geo_std_dev=geo_sd,geo_CV=geo_CV)
return(res)
}
#...................................................................#
# check for NA within the data
if(any(is.na(data[,dv_col]))==T){
stop("There is at least one NA values in the DV column. You must first remove all NA values.")
}
if(any(is.na(data[,ind_var_col]))==T){
stop("There is at least one NA values in the independent variable column. You must first remove all NA values.")
}
num_bins=length(bins)-1
# assign data to bins
bi=rowSums(matrix(data[,ind_var_col]>=rep(bins,each=nrow(data)),nrow=nrow(data)))
# count number of data points in each bin
bin_nums=rep(0,num_bins)
for(i in 1:num_bins){
bin_nums[i]=sum(bi==i)
}
# build a matix for storing results
result=matrix(data=NA,nrow=12+length(pct),ncol=num_bins)
rownames(result)=c(paste0('min_',ind_var_col),paste0('max_',ind_var_col),'n','min','max','median','arith_mean','arith_std_dev','arith_CV','geo_mean','geo_std_dev','geo_CV',paste0('pct_',pct))
colnames(result)=paste0('bin_',1:num_bins)
# calculate results and tabulate
for(i in 1:num_bins){
if(bin_nums[i]>0){
ind_var_dat=data[bi==i,ind_var_col]
dv_dat=data[bi==i,dv_col]
result[1,i]=min(ind_var_dat)
result[2,i]=max(ind_var_dat)
result[3,i]=bin_nums[i]
q=s_stats(dv_dat)
result[4,i]=q[['min']]
result[5,i]=q[['max']]
result[6,i]=q[['median']]
result[7,i]=q[['arith_mean']]
result[8,i]=q[['std_dev']]
result[9,i]=q[['arith_CV']]
result[10,i]=q[['geo_mean']]
result[11,i]=q[['geo_std_dev']]
result[12,i]=q[['geo_CV']]
# get percentiles
q1=quantile(dv_dat,probs=pct/100)
for(k in 1:length(pct)){
result[12+k,i]=as.numeric(q1[k])
}
}else{
result[3,i]=0
}
}
# return the result
return(result)
}
################################################################################
############ END OF bin_stats FUNCTION #######################################
################################################################################
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.