R/utils.R

Defines functions is_num is_cat sort_df ks_stat rmse_stat smd_stat pfc_stat punch_holes

#' @importFrom dplyr bind_rows
#' @importFrom tibble tibble enframe 

# Removes [size] values 
# from [x] at random
punch_holes <- function(
		x, size, 
		seed){
	id <- 1:length(x)
	missing <- as.integer(is.na(x))
	id_obs <- id[missing==0]
	x_obs <- x[missing==0]
	set.seed(seed)
	draw <- sample(id_obs,size=size)
	x[id %in% draw] <- NA
	return(x)
	} 

# Metrics 
pfc_stat <- function(x,y){
	1-(sum(x==y)/length(x))
}

smd_stat <- function(x,y){
	if(!is.numeric(x)){
		x <- unclass(x)
	}
	if(!is.numeric(y)){
		y <- unclass(y)
	}
	abs(mean(x)-mean(y))/sqrt((var(x)+var(y))/2)
}

rmse_stat <- function(x,y){
	sqrt(mean((x-y)^2))
}

ks_stat <- function(x,y){
	if(!is.numeric(x)){
		x <- unclass(x)
	}
	if(!is.numeric(y)){
		y <- unclass(y)
	}
	as.numeric(
		suppressWarnings(ks.test(x,y)$statistic) )
}

# Sort children data.frame 
# by the same ID of a parent data.frame
sort_df <- function(children,parent,id){
	if( !(id %in% colnames(parent)) ) stop(paste(id, "not in parent."))
	if( !(id %in% colnames(children)) ) stop(paste(id, "not in children."))
	return(
		children[order(
			match(
				children[[id]],
			 parent[[id]]
			)
		),]
	)
}

# Value that occurs most often 
# (disregarding missings)
qmode <- function (x,na.rm=FALSE) {
	if(na.rm) x <- na.omit(x)
  ux <- unique(x)
  findmax <- which.max(tabulate(match(x, ux)))
  return(ux[findmax])
	}

is_cat <- function(x){
		is.character(x) | 
		is.factor(x) | 
		is.logical(x) | 
		is.ordered(x)	
	}

is_num <- function(x){
		!(is.character(x) | 
		is.factor(x) | 
		is.logical(x) | 
		is.ordered(x))	
	}
sumtxt/missEval documentation built on July 12, 2020, 12:07 a.m.