#
# BEGIN_COPYRIGHT
#
# PARADIGM4 INC.
# This file is part of the Paradigm4 Enterprise SciDB distribution kit
# and may only be used with a valid Paradigm4 contract and in accord
# with the terms and conditions specified by that contract.
#
# Copyright (C) 2011 - 2017 Paradigm4 Inc.
# All Rights Reserved.
#
# END_COPYRIGHT
#
############################################################
# Helper functions for dataframe / text manipulation
############################################################
#' Auto-promote characters to types
#'
#' Custom function
#' - should not autopromote non-character columns for VARIANT data
#' - for columns that are interpreted as logical, should use strict checking for TRUE/True, or FALSE/False
#' ('T' is a valid character in genomic data)
#'
#' @param convert_logicals if TRUE, accept automatically converted logicals (see note above)
autoconvert_char = function(df1, convert_logicals = TRUE) {
if (nrow(df1) == 0) return(df1)
col_types = sapply(df1, class)
col_types_chars = names(col_types[which(col_types == 'character')])
col_types_nonchars = names(col_types[which(col_types != 'character')])
df_ac = df1[, col_types_chars] %>%
mutate_all(
funs(
type.convert(as.character(.), as.is = TRUE, numerals = "warn.loss")
)
)
col_types_ac = sapply(df_ac, class)
col_types_ac_logicals = names(col_types_ac[which(col_types_ac == 'logical')])
col_types_ac_nonlogicals = names(col_types_ac[which(col_types_ac != 'logical')])
if (convert_logicals) {
df_ac = cbind(as_tibble(df1)[, col_types_nonchars],
df_ac)
} else {
df_ac = cbind(as_tibble(df1)[, col_types_nonchars],
as_tibble(df1)[, col_types_ac_logicals],
as_tibble(df1)[, col_types_ac_nonlogicals])
}
df_ac[, colnames(df1)]
}
#' Compare with mandatory fields passed by user
#'
#' Rename remaining columns of dataframe as info_<column-name>
#' Take the info columns that are non-string and convert to string
prep_df_fields = function(df, mandatory_fields){
available_fields = colnames(df)
pos = which(!(available_fields %in% mandatory_fields))
colnames(df)[pos] = paste("info_", available_fields[pos], sep = "")
posNotChar = which((sapply(df, class) != "character") &
!(colnames(df) %in% mandatory_fields))
for (posi in posNotChar){
df[, posi] = paste(df[, posi])
}
df
}
#' @export
remove_duplicates = function(df_data){
df_data[which(!duplicated(df_data)), ]
}
df_rename_column = function(df, oldname, newname){
colnames(df)[grep(paste("^",oldname,"$",sep = ""), colnames(df))] = newname
df
}
## YAML related
strip_namespace = function(arrayname) sub("^.*[.]", "", arrayname)
## YAML related
get_namespace = function(arrayname) sub("[.].*$", "", arrayname)
#' @export
drop_na_columns = function(df){
if (nrow(df) > 0) {
if( "data.table" %in% class(df) |
(nrow(df) == 1 & ncol(df) == 1)) {
# Use a different method to remove NA columns if a data.table
# http://stackoverflow.com/questions/2643939/remove-columns-from-dataframe-where-all-values-are-na
base::Filter(function(x)
!all(is.na(x)),
df)
} else {
df[,colSums(is.na(df))<nrow(df)]
}
} else {
df
}
}
rename_column = function(x1, old_name, new_name){
colnames(x1)[colnames(x1) == old_name] = new_name
x1
}
# Helper function to convert dataframe columns from factors to characters
# Use with caution: Floats and integer values that are stored as factors
# will be forced into characters
convert_factors_to_char = function(dfx){
i <- sapply(dfx, is.factor)
dfx[i] <- lapply(dfx[i], as.character)
dfx
}
#' Pretty print a large vector of strings, integers etc.
#'
#' @param vec vector that is to be pretty printed
#' @param prettify_after prettify output if length of vector is longer than this limit
#' @export
pretty_print = function(vec, prettify_after = 7) {
prettify_after = ifelse(prettify_after >= 7, prettify_after, 7) # force parameter to have a minimum value of 7
ifelse(length(vec) <= prettify_after,
paste(vec, collapse = ", "),
paste(pretty_print(head(vec, ceiling((prettify_after-3)/2))),
"...(Total: ", length(vec), ")... ",
pretty_print(tail(vec, ceiling((prettify_after-3)/2))),
sep = ""))}
#' helper function to report matches between vectors
#'
#' @param source source vector for finding matches from
#' @param target target vector in which to find matches
#'
#' @return
#' list(match_res, source_matched_idx, source_unmatched_idx, target_matched_idx)
#' @export
find_matches_and_return_indices = function(source, target){
match_res = match(source, target)
match_idx = which(!is.na(match_res))
non_match_idx = which(is.na(match_res))
list(match_res = match_res,
source_matched_idx = match_idx,
source_unmatched_idx = non_match_idx,
target_matched_idx = match_res[match_idx])
}
#' names to list of numbers by uniqueness
#'
#' function to convert a list of names to
#' a numbered vector \code{1:N} where \code{N} is the number of unique names
#' (each unique name has a different number)
#'
#' @examples
#' names_to_numbered_vec_by_uniqueness(c('a', 'b', 'a', 'c')) # returns: 1, 2, 1, 3
#' names_to_numbered_vec_by_uniqueness(c(11, 35, 44, 11, 35, 66)) # returns: 1, 2, 3, 1, 2, 4
names_to_numbered_vec_by_uniqueness = function(names_vec) {
names_vec = as.character(names_vec)
lookup_idx = 1:length(unique(names_vec))
names(lookup_idx) = unique(names_vec)
lookup_idx[names_vec]
}
#' convert NA to blank
na_to_blank = function(terms) {
ifelse(is.na(terms), "", terms)
}
#' check before proceeding
#'
#' ask user for confirmation before proceeding with an action
#'
#' @param action a string describing the action that will be taken if user confirms
user_confirms_action = function(action) {
userResponse <- NA
while(is.na(userResponse) | nchar(userResponse) == 0) {
userResponse <- readline(
prompt = paste0("Do you want to continue with action: ", action, "? (yes/no): \n "))
switch(tolower(userResponse),
"yes" = {
cat("Proceeding with action: ", action, "\n")
proceed = TRUE
},
"no" = {
cat("Canceled action: ", action, "\n")
proceed = FALSE
},
{ message("Please respond with yes or no"); userResponse <- NA })
}
return(proceed)
}
convert_data_frame_to_matrix = function(expr_df) {
# Two types of matrices are formed:
# 1. Numeric (matrix of floating point numbers or integers) -- this is the general case
# 2. Character (matrix of geneset classification categorical calls) -- see test for `categorical` data
# in `tests/testthat/test99-api-loader.R`
stopifnot(all(c('feature_id', 'biosample_id', 'value') %in% colnames(expr_df)))
## Convert expr_df to data.table in-place
expr_dt <- setDT(expr_df)
## Get unique feature_ids and biosample_ids
ftr_vec = unique(expr_dt, by = c("feature_id"))$feature_id
bios_vec = unique(expr_dt, by = c("biosample_id"))$biosample_id
if (class(expr_df$value) == 'character') { # handle matrix of strings without optimization paths
matrix_value_class = 'character'
cat('Handling matrix of characters ... ')
run_time = system.time({
exprs = acast(expr_df, feature_id~biosample_id, value.var="value")
stopifnot( nrow(exprs) == length(ftr_vec))
stopifnot( ncol(exprs) == length(bios_vec))
})[[3]]
} else { # handle matrix of numbers with various optimization paths
## Assumptions about input data - For a particular biosample_id feature_ids will be unique and
## vice versa
## Create a variable to store the product between the ramge of feature_id values and
## biosample_id values. This variable helps to understand if expr_df is a continuous dense matrix
## or a discontinous dense/sparse matrix
prod_dim_range = ((max(ftr_vec) - min(ftr_vec) + 1) * (max(bios_vec) - min(bios_vec) + 1))
cat("Reshaping expr to Matrix ... ")
if(prod_dim_range == nrow(expr_df)) {
## This implies expr_df is a continuous dense matrix
cat("Handling continuous dense matrix case ... ")
run_time = system.time({
expr_dt[, `:=`(feature_id_idx = feature_id - min(feature_id) + 1,
biosample_id_idx = biosample_id - min(biosample_id) + 1)]
exprs <- Matrix::Matrix(nrow = length(ftr_vec), ncol = length(bios_vec), data = 0, sparse = F)
exprs[ as.matrix(expr_dt[, .(feature_id_idx, biosample_id_idx)]) ] <- expr_dt$value
# Set the row and column names to the id's first
rownames(exprs) = ftr_vec; colnames(exprs) = bios_vec
})[[3]]
} else if (prod_dim_range > nrow(expr_df)) {
## This implies expr_df is a discontinuos dense or a sparse matrix
## Order expr_dt per feature_id then biosample_id
expr_dt <- setorder(expr_dt, biosample_id, feature_id)
## Check number of feature_id per biosample
ftrLen_per_bios <- expr_dt[,.(lenU = .N), by = biosample_id]
if(length(unique(ftrLen_per_bios$lenU)) == 1) {
isSparseMatrix = FALSE # flag to check if matrix is sparse
## This implies all biosample_ids are repeated the same number of times i.e have the same
## number of feature_ids
if(length(ftr_vec) == (nrow(expr_dt)/length(bios_vec))) {
## This implies all biosample_ids have the same feature_ids. Hence, per this and previous
## condition expr_df is a dense matrix. Here we handle only discontinous dense as
## continuous dense is tracked and treated by the outter most if condition.
cat("Handling discontinuous dense matrix case ... ")
run_time = system.time({
expr_dt[, `:=`(feature_id_idx = rep(1:ftrLen_per_bios$lenU[[1]], length(bios_vec)),
biosample_id_idx = sort(rep(1:length(bios_vec), ftrLen_per_bios$lenU[[1]])))]
exprs <- Matrix::Matrix(nrow = length(ftr_vec), ncol = length(bios_vec), data = 0,
sparse = F)
exprs[ as.matrix(expr_dt[, .(feature_id_idx, biosample_id_idx)]) ] <- expr_dt$value
# Set the row and column names to the id's
rownames(exprs) = ftr_vec; colnames(exprs) = bios_vec
})[[3]]
} else if (length(ftr_vec) > (nrow(expr_dt)/length(bios_vec))) {
isSparseMatrix = TRUE # condition 1 for sparsity
} else {
# duplicate feature_id(s) for some or all biossample_id(s), goes against our assumptions
cat('Unknown case')
}
} else {
## This implies expr_df is a sparse matrix
isSparseMatrix = TRUE # condition 2 for sparsity
}
if (isSparseMatrix) {
cat('Handling sparse matrix ... ')
run_time = system.time({
exprs = acast(expr_df, feature_id~biosample_id, value.var="value")
stopifnot( nrow(exprs) == length(ftr_vec))
stopifnot( ncol(exprs) == length(bios_vec))
})[[3]]
}
} else {
stop("Expect product of lengths of features and vectors to be greater than or equal to ",
"number of rows of expression dataframe")
}
}
cat(paste("done (", round(run_time, 5), " sec)", "\n", sep = ""))
return(exprs)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.