## DATASET AND I/O FUNCTIONS
#these two functions are wrappers intended to make it easier to work with the megadataset
#' A wrapper for read.csv()
#'
#' This is a convenient wrapper for read.csv()
#' @param filename the file to be read
#' @export
read_mega = function(filename){
return(read.csv(filename,sep=";",row.names=1, #this loads the rownames
stringsAsFactors=FALSE)) #dont want factors
}
#' A wrapper for write.csv()
#'
#' This is a convenient wrapper for write.csv()
#' @param object the object to be written to a file
#' @param filename the name of the file you want to write to
#' @export
write_mega = function(object, filename){
datafile = cbind(ID=rownames(object), object) #adds an ID column with the rownames
return(write.table(datafile, file = filename, #filename
sep=";", #use tabs to separate
na="", #no data is empty cell
row.names = FALSE)) #add rownames
}
#' Write a nice sorted vector with names.
#'
#' This is a convenience function for outputting lists
#' @param df a data frame.
#' @param var a string with the name of the variable to write.
#' @param var a string with the desired filename to write to.
#' @export
output_sorted_var = function(df, var, filename) {
s = df[var] #subset
s2 = s[order(s[1], decreasing = T),,drop=F] #sort
s3 = data.frame(1:nrow(s2),
s2)
rownames(s3) = rownames(s2)
colnames(s3) = c("Rank", var)
write.csv(s3, file.name, fileEncoding = "UTF-8") #save
}
#' Write object to clipboard
#'
#' S3 generic function to write objects to the clipboard for easy use.
#'
#' See \code{\link{write_clipboard.data.frame}}, \code{\link{write_clipboard.model_summary}}.
#' @export
write_clipboard <- function(...) UseMethod("write_clipboard")
#' Write data frame to clipboard
#'
#' A wrapper function to \code{\link{write.table}} for writing to the clipboard for pasting in a spreadsheet.
#'
#' @param x (any object that works with write.table) Something to write to the clipboard.
#' @param digits (int scalar) A number of digits to round the data to.
#' @param clean_names (log scalar) Whether to clean the names. Default=T.
#' @param clean_what (chr vector) Which things to clean. Defaults to underscores and dots.
#' @param pad_digits (log scalar) Whether to pad zeros to the digits (for prettier tables; default = T).
#' @param print (log scalar) Whether to also print the output in R (default T).
#' @param .rownames (lgl scalar) Whether to write rownames. Default yes. These are written to a column in front called .rownames.
#' @param write_to_clipboard (lgl) Whether to write to the clipboard. Can be useful to disable in rare cases.
#' @param return_modified (lgl) Whether to return the modified input instead of the original. Useful if one wants to modify it further.
#' @param na (chr) How to write `NA`s.
#' @param capitalize_dimnames (lgl) Whether to capitalize the first letter in the dimnames.
#'
#' @export
#' @examples
#' iris[-5] %>% cor() %>% write_clipboard()
#' iris %>% head() %>% miss_add_random() %>% write_clipboard()
#' #does not pad integers
#' test_df = data.frame(int = 1:5, num = rnorm(5))
#' test_df %>% write_clipboard()
#' #removes list columns
#' test_df$lc = list(1:3)
#' test_df %>% write_clipboard()
write_clipboard.data.frame = function(x,
digits = 2,
clean_names = T,
clean_what = c("_", "\\."),
pad_digits = T,
print = T,
.rownames = T,
write_to_clipboard = interactive(),
return_modified = F,
capitalize_dimnames = T,
na = "") {
#save orig
x_orig = x
#change type
x = as.data.frame(x_orig)
#remove list columns
list_cols = map_lgl(x, is.list)
if (any(list_cols)) message(glue::glue("List columns were removed because they cannot be easily transformed to rectangular format. Colnames: {stringr::str_c(names(x[list_cols]), collapse = ', ')}"))
x = x[!list_cols]
#round
x = df_round(x, digits)
#format if desired
if (pad_digits) {
x = format(x, nsmall = digits, digits = digits)
}
#clean
if (clean_names) {
for (char in clean_what) {
if (is.data.frame(x) | is.matrix(x)) {
rownames(x) = stringr::str_replace_all(rownames(x), char, " ")
colnames(x) = stringr::str_replace_all(colnames(x), char, " ")
}
if (is.vector(x)) {
names(x) = stringr::str_replace_all(names(x), "_", " ")
}
}
}
#capitalization
if (capitalize_dimnames) {
rownames(x) = str_to_upper_initial(rownames(x))
colnames(x) = str_to_upper_initial(colnames(x))
}
#print
if (print) print(x)
#write to clipboard
if (write_to_clipboard) {
#add rownames if desired
if (.rownames) {
x_modded = cbind(".rownames" = rownames(x), x)
} else {
x_modded = x
}
#decide how to write
#windows is easy!
if (Sys.info()['sysname'] %in% c("Windows")) {
#just write as normal
write.table(x_modded, "clipboard", sep = "\t", na = na, row.names = F)
} else {
#for non-windows, try xclip approach
#https://stackoverflow.com/a/10960498/3980197
write.xclip = function(x) {
#if xclip not installed
if (!isTRUE(file.exists(Sys.which("xclip")[1L]))) {
stop("Cannot find xclip")
}
con <- pipe("xclip -selection c", "w")
on.exit(close(con))
write.table(x, con, sep = "\t", na = na, row.names = F)
}
tryCatch({
write.xclip(x_modded)
}, error = function(e) {
message("Could not write using xclip")
})
}
}
#return
if (return_modified) {
return(invisible(x))
} else {
return(invisible(x_orig))
}
}
#for matrix, use data frame function
#' Write matrix to clipboard
#'
#' Write a matrix to the clipboard with clean formating. Calls \code{\link{write_clipboard.data.frame}}.
#' @export
write_clipboard.matrix = write_clipboard.data.frame
#' Read data frame from clipboard
#'
#' A wrapper function to \code{\link{write.table}} for reading from the clipboard. Useful to quickly read data copied from a table or spreadsheet.
#' @param header (lgl) Whether to use the first row as headers (column names).
#' @param sep (chr) Which separator to use.
#' @param na.strings (chr) Which cell values to interpret as missing data.
#' @param check.names (lgl) Whether to convert illegal names to legal ones using [base::make.names()].
#' @param stringsAsFactors (lgl) Whether to automatically convert strings to factors (GOD PLEASE NO).
#' @param dec (chr) Which symbol to interpret as decimal separator.
#' @param ... Any other arguments passed to [utils::read.table()].
#' @export
#' @examples
#' iris[-5] %>% cor %>% write_clipboard
#' iris %>% head %>% miss_add_random %>% write_clipboard
read_clipboard = function(header = T,
sep = "\t",
na.strings = c("", "NA"),
check.names = T,
stringsAsFactors = F,
dec = ".",
...) {
#decide how to read
#windows is easy!
if (Sys.info()['sysname'] %in% c("Windows")) {
#just read as normal
read.table(file = con, sep = sep, header = header, check.names = check.names, na.strings = na.strings, stringsAsFactors = stringsAsFactors, dec = dec, ...)
} else {
#for non-windows, try xclip approach
#https://stackoverflow.com/a/10960498/3980197
read.xclip = function(x) {
#if xclip not installed
if (!isTRUE(file.exists(Sys.which("xclip")[1L]))) {
stop("Cannot find xclip")
}
con <- pipe("xclip -o -selection c", "r")
on.exit(close(con))
read.table(file = con, sep = sep, header = header, check.names = check.names, na.strings = na.strings, stringsAsFactors = stringsAsFactors, dec = dec, ...)
}
tryCatch({
read.xclip(x)
}, error = function(e) {
message(sprintf("error: %s", e$message))
})
}
}
#helper function for printing lists of heterogenous data frames
ldf_to_long_mat = function(x, rowname_headers = NULL, only_interesting_rownames = T, clean_colnames = T, clean_rownames = T) {
#rowname headers?
if (is.null(rowname_headers)) {
rowname_headers = rep(NA, length(x))
}
#convert rownames and colnames to explicit names
x_explicit_names = purrr::map2(x, rowname_headers, function(., rh) {
#boring rownames?
if (only_interesting_rownames && are_equal(rownames(.), as.character(1:nrow(.)))) {
y = rbind(colnames(.) %>% str_clean, as.matrix(.))
rownames(y) = NULL
colnames(y) = NULL
return(y)
}
#append rownames to new leftmost col
#and colnames to new top row, put NA in the corner
y = cbind(c(rh, rownames(.) %>% str_clean), rbind(colnames(.) %>% str_clean, as.matrix(.)))
rownames(y) = NULL
colnames(y) = NULL
y
})
#widest?
x_widths = purrr::map_int(x_explicit_names, ~ncol(.))
x_width_max = max(x_widths)
#add empty columns
x_cols_added = purrr::map(x_explicit_names, function(.) {
#need to add?
if (ncol(.) < x_width_max) {
delta_ncol = x_width_max - ncol(.)
with_added = cbind(., matrix(rep("", nrow(.) * delta_ncol), ncol = delta_ncol))
return(with_added)
}
#otherwise, good to go
.
})
#fill in empty lines and titles
x_with_empty_lines = purrr::map(seq_along(x_cols_added), function(i) {
#add empty
this = x_cols_added[[i]]
this_name = names(x)[i]
#add header?
if (has_names(x)) {
#add a header based on the name of the list
this = rbind(
c(this_name, rep("", x_width_max - 1)),
this
)
}
#add empty
this = rbind(
rep("", x_width_max),
this
)
this
})
#finally, rbind and cut top line
purrr::reduce(x_with_empty_lines, rbind) %>%
#remove the top empty line
`[`(-1, )
}
#' Write model summary to clipboard
#'
#' Restructures a model summary object to a matrix and writes it to the clipboard.
#' @export
#' @examples
#' lm(Sepal.Length ~ Petal.Length, data = iris) %>% MOD_summary(kfold = F) %>% write_clipboard
write_clipboard.model_summary = function(x, digits = 2) {
#rerestructure to a matrix suitable for clipboard
ldf_to_long_mat(list(
#coefs
coefs = x$coefs %>% df_round(digits = digits),
#model meta data
meta = x$meta,
#etas
etas = x$aov_etas %>% df_round(digits = digits)
), rowname_headers = c("Predictor", NA, "Predictor")) %>%
#call generic again
write_clipboard()
}
#' Stack data into n columns.
#'
#' Reshapes the data to an n column structure for easy use in documents. Pads empty lines between variables and can include their names as well. Outputs a character matrix.
#' @param data (data.frame, matrix, or something coercible into a matrix) The data to reshape.
#' @param columns (whole number scalar) How many columns to stack the data into.
#' @param pad_columns (logical) Whether to pad empty columns to the data if the data and column dimensions do fit divide into a whole number. Defaults to TRUE.
#' @param include_colnames (logical) Whether to include the column names in the output. Defaults to TRUE.
#' @param rownames_colnames (character scalar) If adding colnames in rows, which rownames should these be given? Defaults to "name".
#' @export
#' @examples
#' df = split_every_k(1:12, 2) %>% as.data.frame
#' stack_into_n_columns(df, 2)
#' stack_into_n_columns(df, 3)
#' stack_into_n_columns(df, 4)
stack_into_n_columns = function(data, columns, pad_columns = TRUE, include_colnames = TRUE, rowname_colnames = "name") {
#checks
data = as.matrix(data)
assertthat::assert_that(is.matrix(data))
assertthat::assert_that(is_whole_number(columns))
assertthat::assert_that(is.logical(pad_columns))
#already the case?
if (ncol(data) == columns) {
message("data already was in the desired number of columns")
return(data)
}
#column number
if (pad_columns) {
if (ncol(data) %% columns != 0) {
message(str_c("data isn't integer divisible into ", columns, " columns.", ncol(data), "/", columns, "=", ncol(data) %% columns), ". Padding empty columns.")
#pad cols
v_n_to_pad = columns - (ncol(data) %% columns)
data = cbind(data, matrix(NA, ncol = v_n_to_pad, nrow = nrow(data)))
}
} else {
if (ncol(data) %% columns != 0) {
stop(stringr::str_c("data isn't integer divisible into ", columns, " columns!", ncol(data), "/", columns, "=", ncol(data) %% columns))
}
}
#stack
l_indices = split_every_k(1:ncol(data), columns)
m = matrix(ncol = columns, nrow = 0)
for (indices in l_indices) {
#fetch colnames
v_names = colnames(data)[indices]
if (include_colnames) {
m = rbind(m, #the object so far
v_names, #the column names
data[, indices], #the data slice
matrix(NA, nrow=1, ncol=columns)) #empty row
} else {
m = rbind(m, #the object so far
data[, indices], #the data slice
matrix(NA, nrow=1, ncol=columns)) #empty row
}
}
#clean
m = m[-nrow(m), ] #remove last empty row
#rownames
if (is.null(rownames(data))) {
rownames(data) = 1:nrow(data)
}
if (include_colnames) {
rownames(m) = rep(c(rowname_colnames, rownames(data), NA), length.out = nrow(m))
} else {
rownames(m) = rep(c(rownames(data), NA), length.out = nrow(m))
}
#remove colnames
colnames(m) = NULL
return(m)
}
#' Split data into n columns.
#'
#' Reshapes the data to a wider structure for easy use in documents. Pads empty lines between and can include their rownames and colnames as well. Outputs a character matrix.
#' @param data (data.frame, matrix, or something coercible into a matrix) The data to reshape.
#' @param split_times (whole number scalar) How many times to split the rows.
#' @param pad_columns (logical) Whether to pad empty columns to the data if the data and column dimensions do fit divide into a whole number. Defaults to TRUE.
#' @param include_colnames (logical) Whether to include the column names in the output. Defaults to TRUE.
#' @param rownames_colnames (character scalar) If adding colnames in rows, which rownames should these be given? Defaults to "name".
#' @export
#' @examples
#' df = data.frame(small = letters[1:6], big = LETTERS[1:6], stringsAsFactors = F)
#' split_into_n_columns(df, 2) #ok
#' split_into_n_columns(df, 3) #ok
#' split_into_n_columns(df, 4) #stupid but no error!
split_into_n_columns = function(data, split_times, pad_rows = T, include_rownames = T, rownames_var = "name", include_colnames = T) {
#types
data = as.data.frame(data)
#convert factors to chr
data[] = lapply(data, FUN = function(col) {
if (is.factor(col)) return(as.character(col))
col
})
#check length
v_remainder = nrow(data) %% split_times
if (!pad_rows & (v_remainder != 0)) {
stop("It is not possible to reform the data without padding rows because the number of rows is not integer disible by the desired number of columns! " + nrow(data) + "/" + split_times + "=" + v_remainder)
}
#include rownames?
if (include_rownames) {
data = data.frame(rownames___ = rownames(data), data, stringsAsFactors = F)
colnames(data)[1] = rownames_var
}
#save colnames
v_colnames = colnames(data)
#make out data object
v_rows_out = (nrow(data) / split_times) %>% ceiling()
#fill in missing rows
if ((nrow(data) %% split_times) != 0) {
#how many rows to fill in?
v_to_fill = nrow(data) %% split_times
#make the extra block
data_extra = matrix(nrow = v_to_fill, ncol = ncol(data)) %>% as.data.frame(stringsAsFactors = F)
#copy over colnames
colnames(data_extra) = colnames(data)
#rbind them
data = rbind(data, data_extra)
}
#transform
data2 = df_to_ldf(data, by = rep(1:split_times, each = v_rows_out)) %>%
do.call(what = "cbind", args = .)
#names
colnames(data2) = NULL
#add colnames on top
if (include_colnames) {
v_colnames = rep(v_colnames, length.out = ncol(data2))
data2 = rbind(v_colnames, data2)
}
data2
}
#' Improved dataset merger function
#'
#' This function allows you to merge two data.frames by their overlapping rownames. About 15 times faster than the earlier version.
#' @param DF1 (data.frame) A data.frame to merge into.
#' @param DF2 (data.frame) A data.frame with the new data.
#' @param join (character scalar) Which data.frame to use cases from. Options: left, right, both.
#' @param overwrite_NA (lgl scalar) Whether to overwrite with NA values.
#' @param restore_factors (lgl scalar) Whether to recreate factors in the merged data.frame. Does not keep levels.
#' @export
#' @examples
#' merge_datasets(iris[1:4], iris[1:5]) #merge together two parts of iris
merge_datasets = function (DF1, DF2, join = "both", overwrite_NA = FALSE, restore_factors = FALSE){
#checks
if (!join %in% c("both", "left", "right")) stop("Invalid join parameter!")
#join setting decides how to combine
v_shared_rows = intersect(rownames(DF1), rownames(DF2))
if (join == "left") {
DF2 = DF2[v_shared_rows, , drop = FALSE] #subset to overlap with DF1
}
if (join == "right") {
DF1 = DF1[v_shared_rows, , drop = FALSE] #subset to overlap with DF2
}
#if nothing to join
if (nrow(DF1) == 0) {
message("Warning, nothing joined! No case in DF1 matches any in DF2!")
return(DF2)
}
if (nrow(DF2) == 0) {
message("Warning, nothing joined! No case in DF2 matches any in DF1!")
return(DF1)
}
#factors present?
if (any(c(purrr::map_lgl(DF1, is.factor), purrr::map_lgl(DF2, is.factor)))) {
if (!restore_factors) {
message("Factors were converted to characters.")
}
if (restore_factors) {
message("Factors were converted to characters and back to factors. However, the levels cannot be restored automatically.")
}
}
#new DF
v_rows = unique(c(rownames(DF1), rownames(DF2)))
v_cols = unique(c(colnames(DF1), colnames(DF2)))
DF3 = matrix(NA, nrow = length(v_rows), ncol = length(v_cols)) %>% as.data.frame()
rownames(DF3) = v_rows;colnames(DF3) = v_cols
#for each dataset
for (DF in list(DF1, DF2)) {
v_rows_to_add = rownames(DF)
#for each col
for (col in colnames(DF)) {
#dont overwrite NAs
if (!overwrite_NA) {
#all
v_rows_to_add = rownames(DF)
#subset
v_rows_to_add = v_rows_to_add[!is.na(DF[[col]])] #subset rows with non-NA
}
#write in
#is factor?
logi_nonfactor = !is.factor(DF[v_rows_to_add, col])
if (logi_nonfactor) {
DF3[v_rows_to_add, col] = DF[v_rows_to_add, col]
} else {
DF3[v_rows_to_add, col] = DF[v_rows_to_add, col] %>% as.character()
}
}
}
#restore factors
if (restore_factors) {
v_factors = purrr::map_lgl(colnames(DF3), function(col) {
#check if cols are factors or nulls in both datasets
if ((is.factor(DF1[[col]]) || is.null(DF1[[col]])) &&
(is.factor(DF2[[col]]) || is.null(DF2[[col]]))) return(TRUE)
#if they are, treat as factor
#otherwise, treat as non-factor
FALSE
})
#change cols to factors
for (col_i in v_factors %>% which) {
DF3[col_i] = as.factor(DF3[[col_i]])
}
}
return(DF3)
}
#' Merge multiple datasets at once, improved version.
#'
#' This is a wrapper for merge_datasets().
#' @param ... (data.frames) Two or more data.frames to merge.
#' @param join (character scalar) Which data.frame to use cases from. Defaults to "both". Can be: both, left, right.
#' @param overwrite_NA (logical scalar) Whether to overwrite with NA values. Default = FALSE.
#' @param restore_factors (logical scalar) Whether to recreate factors in the merged data.frame. Does not keep levels. Default = FALSE.
#' @export
#' @examples
#' merge_datasets_multi(iris[1:50, ], iris[51:100, ], iris[101:150, ]) #merge three-part iris
merge_datasets_multi = function(..., join = "both", overwrite_NA = FALSE, restore_factors = FALSE) {
#wrap with Reduce
Reduce(function(x, y) merge_datasets(x, y, join=join, overwrite_NA = overwrite_NA, restore_factors = restore_factors), list(...))
}
#' Write rvest/xml object
#'
#' Writes an rvest/xml object to disk for reuse. This is a wrapper around write_rds.
#' @param x (obj) Object to write to disk.
#' @param path (str) Where to save it.
#' @param ... Other parameters to write_rds.
#' @return Invisibly returns x for use in pipelines.
#' @export
write_rvest = function(x, file, ...) {
#convert to character
#is list?
if (inherits(x, "list")) {
x = purrr::map(x, function(.x) {
#to char
.x = as.character(.x)
#to UTF8
Encoding(.x) = "UTF-8"
.x
})
} else {
x = as.character(x)
Encoding(x) = "UTF-8"
}
#save
readr::write_rds(x, file = file, ...)
invisible(x)
}
#' Read rvest/xml object
#'
#' Reads an rvest/xml object from disk. This is a wrapper around read_rds.
#' @param path (str) Where to read from.
#' @param ... Other parameters to read_rds.
#' @return An XML object.
#' @export
read_rvest = function(path) {
#load from file
x = readr::read_rds(path)
#read
if (inherits(x, "list")) {
x = purrr::map(x, read_html)
} else {
x = xml2::read_html(x)
}
x
}
#' Read VCF file into data frame
#'
#' @param x path to VCF
#' @param var_id Which column(s) to use for IDs. Can be 'ID' or 'chrpos'
#' @param n_max Max lines to read (including comments)
#'
#' @return a data frame
#' @export
read_vcf = function(x, var_id = "ID", n_max = Inf) {
#read VCF, all cols forced as character
x2 = readr::read_tsv(x, comment = "##", col_types = cols(.default = col_character())) %>% df_legalize_names()
#drop unwanted columns
var_id = match.arg(var_id, c("ID", "chrpos"))
if (var_id == "ID") {
meta_cols = c("CHROM", "POS", "REF", "ALT", "QUAL", "FILTER", "INFO", "FORMAT")
x2 = x2[!names(x2) %in% meta_cols]
} else if (var_id == "chrpos") {
#make chrpos
x2$chrpos = str_c(x2$CHROM, ":" , x2$POS, sep = "")
meta_cols = c("CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO", "FORMAT")
x2 = x2[!names(x2) %in% meta_cols]
}
#ensure ids are unique
#warn if not unique
if (any(duplicated(x2[[var_id]]))) {
warning("There were duplicated IDs among the variants. These were made unique with `str_uniquify()` -- check your data!")
x2[[var_id]] %<>% str_uniquify()
}
#restructure to normal format
var_id_sym = rlang::sym(var_id)
x2_long = suppressWarnings(x2 %>% tidyr::gather(key = id, value = count, -!!var_id_sym))
#spread snps
x3 = x2_long %>% dplyr::select(id, !!var_id_sym, count) %>% tidyr::spread(key = !!var_id_sym, value = count)
#recode to counts of minor allele
for (i in seq_along(x3)) {
#skip first col
if (i == 1) next
#extract relevant part if needed
if (any(str_detect(x3[[i]], ":"))) {
x3[[i]] = x3[[i]] %>% str_match("^([^:]+):") %>% .[, 2]
}
#anything not biallelic?
non_biallelic = !x3[[i]] %in% c("./.", "0/0", "0/1", "1/0", "1/1")
x3[[i]][non_biallelic] = NA_character_
#replace
x3[[i]] = x3[[i]] %>% plyr::mapvalues(c("./.", "0/0", "0/1", "1/0", "1/1"), c(NA, 0, 1, 1, 2), warn_missing = F) %>%
as.numeric()
}
x3
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.