##### General data wrangling #####
#' Removes un-informative cols from a data.frame
#'
#' @param dat a data.frame
#' @param n_levels thresholded number of items to keep column
#'
#' @return a data.frame without un-informative columns
#' @export
#'
#' @examples
#' library(delutils)
#' dat <- data.frame(A=rep(1,4), B=seq(1,4), C=c('f1', 'f2', 'f1', 'f1'))
#' cleaned_dat <- clean_by_levels(dat)
clean_by_levels <- function(dat, n_levels=1){
if(! 'data.frame' %in% class(dat)){
emsg <- paste0('dat needs to be a data.frame not a ', class(dat))
stop(emsg)
}
stopifnot
if( !is.numeric(n_levels)){
if(!is.integer(n_levels)){
emsg <- paste0('n_levels needs to be an integer >= 1')
stop(emsg)
}
}
if( (n_levels %% 1 != 0) | (n_levels < 1)){
emsg <- paste0('n_levels needs to be an integer >= 1')
stop(emsg)
}
nitems_per_col <- sapply(dat, simplify = TRUE, function(x){
length(unique(x))
})
cols_to_keep <- which(nitems_per_col > n_levels)
cleaned_df <- dat[,cols_to_keep, drop=FALSE]
return(cleaned_df)
}
#' Convert Column Names to Lowercase, Replace Whitespace, and Remove Leading/Trailing Whitespace
#'
#' This function converts the column names of a data frame to lowercase,
#' replaces whitespace (including tabs) with underscores, and removes any leading or trailing whitespace.
#'
#' @param dat A data frame.
#' @return A data frame with cleaned column names.
#' @examples
#' # Example data frame
#' my_df <- data.frame("First Name\t" = c("John", "Jane", "Alice"),
#' "Last Name " = c("Doe", "Smith", "Johnson"),
#' "Age " = c(25, 30, 35))
#'
#' # Clean column names: convert to lowercase, replace whitespace, and remove leading/trailing whitespace
#' cleaned_df <- clean_colnames(my_df)
#' colnames(cleaned_df)
#' @importFrom stringr str_replace_all str_trim
#' @export
clean_colnames <- function(dat) {
if(!("data.frame" %in% class(dat))){
stop("df needs to be either a data.frame or tibble")
}
fixed_names <- colnames(dat) %>%
str_trim(side = "both") %>%
gsub("\\s+", "_", .) %>%
tolower()
colnames(dat) <-fixed_names
return(dat)
}
#' Finds rows in a data.frame which contain NA values
#'
#' @param df data.frame or tibble
#'
#' @return indexes of rows with NA values
#' @export
#'
#' @examples
#' library(delutils)
#' dat <- data.frame(A=c(NA,2:4), B=c(1:3, NA), C=1:4)
#' find_na_rows(dat)
#'
find_na_rows <- function(df){
if(!("data.frame" %in% class(df))){
stop("df needs to be either a data.frame or tibble")
}
weird_rows <-
apply(df, 2, function(x){
which(is.na(x))
}) %>% unlist
weird_rows <-
unique(weird_rows)
return(weird_rows)
}
#' Uniquefy by variance
#'
#' Uniquefies an matrix by the row variance. This
#' ranks variance for each as specifified in the
#' map data.frame. The row.names of the expression
#' matrix and map need to have at least some in common.
#' Function first identifies which row.names are shared
#' and then subsets.
#' Idea for using max var to de-duplicate probes came
#' form here: https://www.biostars.org/p/51756/#51875
#'
#'
#' @param mat expression matrix
#' @param map gene map
#' @param dup_col col with potential duplicates
#'
#' @return filtered expression matrix
#' @export
#'
#' @examples
uniquefy_by_variance <- function(mat, map, dup_col=NULL){
if(!any(class(mat) %in% c('data.frame','matrix'))){
stop('expr_mat needs to be either a data.frame or matrix of gene expression value.')
}
if(is.null(dup_col)){
dup_col <- names(map)[1]
warning(paste0('Using map ', dup_col, ' as dup_col'))
}
if(!any(row.names(mat) %in% row.names(map))){
stop('None of the row.names matched between mat and map.')
}
# Sorts and keeps only rows shared between map and mat
common_ids <- intersect(row.names(mat), row.names(map))
mat <- mat[common_ids,]
map <- map[common_ids,dup_col]
# Gets row vars and filters
rv_mat <- data.frame(row.names = common_ids,
group=map,
vals=matrixStats::rowVars(mat))
rv_mat <- uniquefy_base(rv_mat, 'group', 'vals')
filt_mat <- mat[rv_mat,]
return(filt_mat)
}
#' Return a unique data.frame by max value
#'
#' @param dat data.frame with 1st column as string and second as numeric
#' @param group_col string denoting group column (col with duplicates)
#' @param val_col string denoting value column
#'
#' @return unique data.frame
#' @export
#'
#' @examples
#' set.seed(42)
#' dat <- data.frame(group=c('a', 'a', 'a', 'b', 'c', 'c'), vals=runif(6))
#' dat$vals[6] <- -4
uniquefy_by_abs_max <- function(dat, group_col=NULL, val_col=NULL){
#TODO make this workhorse functions which
# and split the variance and other uniquefiers
# into compatible functions
if(is.null(group_col)){
group_col <- names(dat)[1]
}
if(is.null(val_col)){
val_col <- names(dat)[2]
}
temp_dat <- data.frame(dat)
temp_dat[val_col] <- abs(temp_dat[val_col])
rows_to_keep <- uniquefy_base(temp_dat, group_col, val_col)
dat <- dat[rows_to_keep,]
return(dat)
}
#' Base function which does the uniquefying
#'
#' This is meant to be an internal function
#' which is the commonly used across the
#' different uniquefiers. It works by
#' taking uniquefying based on largest value
#' and returning a logical vector
#'
#' @param dat data frame
#' @param group_col group_col string denoting group column (col with duplicates)
#' @param val_col string denoting value column
#'
#' @return logical vector
#' @keywords internal
#'
#' @examples
uniquefy_base <- function(dat, group_col, val_col){
ordered_dat <- dat[order(dat[group_col], -dat[val_col]),]
ordered_dat <- ordered_dat[!duplicated(ordered_dat[group_col]),]
logical_idxs <- row.names(dat) %in% row.names(ordered_dat)
return(logical_idxs)
}
#' rbinds a named list of data.frames
#'
#'
#' rbinds together data.frames in which the list name
#' is turned into a column and bound to the data.frame
#' becoming the first column of the data.frame
#'
#' @param df_list named list of data.frames
#' @param sum_col_name colname of new column bound to data.frame
#'
#' @return a single data.frame row bound data.frame
#' @export
#'
#' @examples
#' library(dplyr)
#' library(delutils)
#' l <- list(data.frame(A=seq(1,5)), data.frame(A=seq(6, 7))) %>%
#' setNames(., c('Cond1', 'Cond2'))
#' rbind_named_df_list(l)
#'
#' # Doesn't work because list contains matrix
#' \dontrun{
#' l <- list(data.frame(A=seq(1,5)),
#' data.frame(A=seq(6, 7)),
#' matrix(data=NA, ncol=1, nrow=5)) %>%
#' setNames(., c('Cond1', 'Cond2', 'Cond3'))
#' rbind_named_df_list(l)
#' }
rbind_named_df_list <- function(df_list, sum_col_name='col_from_list'){
# Makes sure list has names, everything is
# either a data.frame or NULL
nms <- names(df_list)
if(is.null(nms)){
stop('df_list needs to be a named list')
}
item_class_checker <-
vapply(df_list, function(x){
any(class(x) %in% c('data.frame', 'NULL'))
}, logical(1)) %>%
all(.)
if(! item_class_checker){
stop('df_list contains items which are not data.frame or tibble')
}
out_df <- list()
empty_dfs <- NULL
for(x in nms){
temp_pth <- df_list[[x]]
valid_conditions <- c(
nrow(temp_pth) >= 1, # more than 1 rows
!is.null(temp_pth) # df is not NULL
) %>% all
# Check valid conditions
if(valid_conditions){
temp_pth <- cbind(x,
temp_pth,
stringsAsFactors = FALSE)
row.names(temp_pth) <- NULL
names(temp_pth)[1] <- sum_col_name
out_df[[x]] <- temp_pth
} else {
empty_dfs <- c(empty_dfs, x)
next
}
}
if(!is.null(empty_dfs)){
warning_str <-
paste0(head(empty_dfs), collapse ='\n') %>%
paste0('Some items contained empty or otherwise disagreeable data.frames and were not included in the bound data.frame.\nThese include:\n', . )
warning(warning_str, sep='')
}
n_cols <- lapply(out_df, function(x){
colnames(x) %>%
length
}) %>% do.call(c, .)
if(length(unique(n_cols)) > 1){
suspect_dfs <- names(out_df)[which(n_cols != mode(n_cols))]
e_message <-
paste0(suspect_dfs, collapse='\n') %>%
paste0('data.frames have different number of columns. check:\n', . )
stop(e_message, sep='')
}
# TODO wrap this trycatch when data.frames have
# different columns, s.t., an error w/ offending
# items is spit out
out_df <- do.call(rbind, c(out_df, stringsAsFactors=FALSE))
row.names(out_df) <- NULL
return(out_df)
}
#' de_rbind_df
#'
#' @param df data.frame
#' @param col_to_split columnt to split data.frame by
#'
#' @return
#' @export
#'
#' @keywords internal
#'
#' @examples
de_rbind_df <- function(df, col_to_split=1){
e_message <- 'col_to_split needs to be one of a "character", "numeric", or "integer" which matches a column in the data.frame.'
if(!class(col_to_split) %in% c("character", "numeric", "integer")){
stop(e_message)
}
df_nms <- colnames(df)
df_idxs <- seq(1, ncol(df))
valid_col <- col_to_split %in% df_nms || col_to_split %in% df_idxs
if(!valid_col){
stop(e_message)
}
splits_vec <- df[, col_to_split]
df[,col_to_split] <- NULL
split_list <- split(df, splits_vec)
return(split_list)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.