# Data structure operations.
# Data structures
# ----------------------------------------------------------
# The main data strucutre is an implicit data strucutre
# based on an R list. THe list has entries $meta_col for
# sample metadata and $meta_row for row metadata. All other
# list entries are primary data matrices matching the dimensions
# of the meta data.
# Test if the list is a valid bdmerge data list. Throws errors if invalid.
checkDataList = function(d) {
if (class(d) != "list") {
stop("ERROR: Invalid data type, not a list.")
}
if (!("meta_row" %in% names(d))) {
stop("ERROR: Invalid data type, list does not contain meta_row entry.")
}
if (!("meta_row" %in% names(d))) {
stop("ERROR: Invalid data type, list does not contain meta_row entry.")
}
if (!("meta_col" %in% names(d))) {
stop("ERROR: Invalid data type, list does not contain meta_col entry.")
}
for (entry in names(d)) {
if (entry == "meta_row" || entry == "meta_col") {
next
}
# Test dimensions of data matrix
if (nrow(d[[entry]]) != nrow(d$meta_row)) {
stop("ERROR: Invalid data type, dimension mismatch between meta_row and ", entry)
}
}
return(TRUE) # all tests passed
}
# Some data structure checks for the matched correlation structure generated by corMerge()
checkCorList = function(d) {
if (class(d) != "list") {
stop("ERROR: cor data type is not a list")
}
req_names = c("A", "B", "ATBT", "meta_rowA", "meta_colA", "meta_rowB", "meta_colB", "invoke")
if (any(! req_names %in% names(d))) {
stop("ERROR: invalid list names")
}
return(TRUE)
}
# Returns the column subset of a bdmerge data list.
# sample_num: a vector of the sample ids
# d is a bdmerge data list
colSubset = function(d, samples) {
if (class(samples) == "logical") {
samples = which(samples)
}
if (!all(samples <= nsamples(d))) {
stop("samples is out of bounds")
}
if (!all(samples > 0)) {
stop("samples is out of bounds")
}
out = list()
out$meta_row = d$meta_row
out$meta_col = d$meta_col[samples, , drop=FALSE]
out$meta_col = as.data.frame(out$meta_col) #
for (entry in names(d)) {
if (entry == "meta_row" || entry == "meta_col") next
out[[entry]] = d[[entry]][,samples]
}
checkDataList(out)
return(out)
}
# d is a dlist, exp_def a vector defining experimental id
colUnique = function(d, exp_def) {
experiments = apply(d$meta_col[exp_def], 1, paste, collapse=":")
unique_samples = match(unique(experiments), experiments) # first match only
# Remove all sample definitions with missing values
return(colSubset(d, unique_samples))
}
# Extract columns from matched list of data lists.
colSubsetMatchedCollection = function(dlists, samples) {
out = list(0)
for (i in 1:length(dlists)) {
out[[i]] = colSubset(dlists[[i]], samples)
}
return(out)
}
# Rename matrix columns. Returns modified dlist.
labelMatrixCols = function(d, col_name_def, dec=1) {
# Extract naming subset
name_frame = d$meta_col[col_name_def]
# Round numeric entries
name_type = sapply(name_frame, class)
for (i in 1:ncol(name_frame)) {
if (name_type[i] == "numeric") {
name_frame[,i] = round(name_frame[,i], dec)
}
}
# Construct labels
lables = apply(name_frame, 1, paste, collapse=":")
lables = gsub(" ", "", lables) # remove all white space
for (entry in names(d)) {
if (entry == "meta_col" | entry == "meta_row") next
colnames(d[[entry]]) = lables
}
return(d)
}
labelMatrixRows = function(d, row_name_def, dec=1) {
# Extract naming subset
name_frame = d$meta_row[row_name_def]
# Round numeric entries
name_type = sapply(name_frame, class)
for (i in 1:ncol(name_frame)) {
if (name_type[i] == "numeric") {
name_frame[,i] = round(name_frame[,i], dec)
}
}
lables = apply(name_frame, 1, paste, collapse=":")
lables = gsub(" ", "", lables) # remove white space
for (entry in names(d)) {
if (entry == "meta_col" | entry == "meta_row") next
rownames(d[[entry]]) = lables
}
return(d)
}
nsamples = function(d) {
tryCatch(checkDataList(d),
# on any error, callback
error=function(e) {
# Prints error and proceeds
message(e$message)
}
)
return(nrow(d$meta_col))
}
nfeatures = function(d) {
tryCatch(checkDataList(d),
# on any error, callback
error=function(e) {
# Prints error and proceeds
message(e$message)
}
)
return(nrow(d$meta_row))
}
nmatrices = function(d) {
tryCatch(checkDataList(d),
# on any error, callback
error=function(e) {
# Prints error and proceeds
message(e$message)
}
)
return(length(d) - 2)
}
# Counts the number of missing values per column of each dlist in a provided list. entry is the name of the data table.
missingValuesCol = function(dlists, entry) {
missing = lapply(dlists, function(dlist) {
col_missing = apply(dlist[[entry]], 2, function(col) {
return(sum(is.na(col)))
})
return(col_missing)
})
return(missing)
}
# Calculate if all entries of a dlist list colum are NAs.
# entry is a vector of matrix names in dlists. If single entry name is provided
# the same name is assumed for all.
allMissingValuesCol = function(dlists, entries) {
if (length(entries) == 1) {
entries = rep(entries, length(dlists)) # expand to match length of list
}
# make references to matrix
matrices = list()
for (i in 1:length(dlists)) {
matrices[[i]] = dlists[[i]][[entries[i]]]
}
missing = lapply(matrices, function(mat) {
col_missing = apply(mat, 2, function(col) {
return(all(is.na(col)))
})
return(col_missing)
})
return(missing)
}
# Operations
# --------------------------------------------------------------
# Merges data lists. Includes common features indicated by the row_id.
# Column meta data are expanded.
# WARNING: function may have issues if the provided ids are not unique...
mergeDataListsByRow = function(dlist1, dlist2, row_id="id") {
require("plyr") # todo: move dependency to NAMESPACE file
# Defensive check for valid data lists.
checkDataList(dlist1)
checkDataList(dlist2)
# Test if provided row_id is valid
if (!all(row_id %in% colnames(dlist1$meta_row))) {
stop("invalid row_id not found in meta_row: ", row_id)
}
if (!all(row_id %in% colnames(dlist2$meta_row))) {
stop("invalid row_id not found in meta_row: ", row_id)
}
if (any(table(dlist1$meta_row[[row_id]]) > 1)) {
warning("id not unique for dlist1")
}
if (any(table(dlist2$meta_row[[row_id]]) > 1)) {
warning("id not unique for dlist2")
}
# Output data lists structure
out = list()
# out$meta_row = merge(dlist1$meta_row, dlist2$meta_row, by=row_id) # default inner join, only includes common rows
out$meta_row = merge(dlist1$meta_row, dlist2$meta_row, all=TRUE) # outer join, all rows are included
# Expand column meta data (samples)
out$meta_col = rbind.fill(dlist1$meta_col, dlist2$meta_col)
for (entry in names(dlist1)) {
if (entry == "meta_row" | entry == "meta_col") next # excludes meta data
mat1 = matrix(NA, nrow=nrow(out$meta_row), ncol=nsamples(dlist1))
mat2 = matrix(NA, nrow=nrow(out$meta_row), ncol=nsamples(dlist2))
row_order1 = match(dlist1$meta_row[[row_id]], out$meta_row[[row_id]])
row_order2 = match(dlist2$meta_row[[row_id]], out$meta_row[[row_id]])
mat1[row_order1,] = dlist1[[entry]]
mat2[row_order2,] = dlist2[[entry]]
out[[entry]] = cbind(mat1, mat2)
}
checkDataList(out)
return(out)
}
mergeDataLists = function(dlist1, dlist2) {
# out$meta_col = merge(dlist1$meta_col, dlist2$meta_col, all.y=TRUE)
# out$meta_col = dlist1$meta_col[col_index1,]
# meta_col1 = dlist1$meta_col[col_index1,]
# names(meta_col1) = paste0(names(meta_col1), "1")
# meta_col2 = dlist2$meta_col[col_index2,]
# names(meta_col2) = paste0(names(meta_col2), "2")
# out$meta_col = cbind(meta_col1, meta_col2)
# out$meta_col = cbind(dlist1$meta_col, dlist2$meta_col)
out = list()
out$meta_col = rbind.fill(dlist1$meta_col, dlist2$meta_col)
# out$meta_row = rbind.fill(dlist1$meta_row, dlist2$meta_row)
out$meta_row = merge(dlist1$meta_row, dlist2$meta_row) # default inner join
for (entry in names(dlist1)) {
if (entry == "meta_row" | entry == "meta_col") next
out[[entry]] = cbind(
dlist1[[entry]],
dlist2[[entry]])
}
checkDataList(out)
return(out)
}
# Merges data types with matching column condition (such as experimental id).
# ARGUMENTS:
# match_conditions: list of vector of the meta data sample column names to use for the experimental matching id.
# matrix_selection: list of vectors of strings, names of matrices to use. if null, all entries are used
# mehod: methods for handling multiple entries with the same experimental id
# c("cyclical", "bootstrap", "average")
# cyclical: cyclical randomly selects
# bootstrap: samples with replacement from multiple members of group
# average: condenses multiples to their average. Not yet implemented.
# Uses cyclical sampling without replacement to fill in the lesser data type for each condition.
# First, matching index vectors are calculated, which are used to slice the two input data lists.
# Only column metadata is combined using cbind. Therefore, column names migth be replicated.
mergeDataListsByCol = function(dlists,
match_conditions, # list of vectors generating
matrix_selections=NULL, # all selected by default
method="cyclical")
{
require(plyr)
# CHECK INPUT
# -------------------------------------------------
if (length(match_conditions) == 1 & length(dlists) > 1) {
# repeat match_conditions
match_conditions = rep(list(match_conditions), length(dlists))
}
if (length(matrix_selections) == 1 & length(dlists) > 1) {
matrix_selections = rep(list(matrix_selections), length(dlists))
}
if (!length(match_conditions) == length(dlists)) {
stop("Data lists and match_conditions do not agree")
}
# Check and format
for (j in 1:length(dlists)) {
checkDataList(dlists[[j]])
# Test if provided row_id is valid
if (!all(match_conditions[[j]] %in% colnames(dlists[[j]]$meta_col))) {
stop("invalid col id not found in meta_col: ", match_conditions[[j]])
}
dlists[[j]]$meta_col = as.data.frame(dlists[[j]]$meta_col)
dlists[[j]]$meta_row = as.data.frame(dlists[[j]]$meta_row)
# Check match condition and ensure consistent string format
for (k in 1:length(match_conditions[[j]])) {
if (class(dlists[[j]]$meta_col[[match_conditions[[j]][[k]]]]) == "integer" |
class(dlists[[j]]$meta_col[[match_conditions[[j]][[k]]]]) == "numeric")
{
dlists[[j]]$meta_col[[match_conditions[[j]][k]]] = format(as.numeric(dlists[[j]]$meta_col[[match_conditions[[j]][k]]]), nsmall=1, trim=TRUE)
}
}
}
# MATCH DATA
# -------------------------------------------------------------------
# Construct group ids
sampleids = list()
for (j in 1:length(dlists)) {
# construct ids from multiple fields.
sampleids[[j]] = apply(
dlists[[j]]$meta_col[unlist(match_conditions[[j]])],
1, paste, collapse=":"
)
sampleids[[j]] = gsub(" ", "", sampleids[[j]]) # remove any whitespace
}
groups = unique(unlist(sampleids)) # get all groups from constructed sampleids
# Create matching local column index accross each of the dlists
col_index = list()
# Loop over experimental groups
for (i in 1:length(groups)) {
# Find the local columns for each data list that matches the experimental group.
cols = lapply(sampleids, function(ids) {
local_index = which(ids == groups[i])
if (length(local_index) == 0) {
return(NA)
} else {
return(local_index)
}
})
# Shuffle order such that the cyclical matching is not biased
cols = lapply(cols, function(local_index) {
return(local_index[sample(1:length(local_index))])
})
# The maximum number of members for the considered experimental group
max_members = max(sapply(cols, length))
# sample with replacement to maximum samples for other data types
if (method == "bootstrap") {
cols = lapply(cols, function(local_index) {
if (length(local_index) == max_members | length(local_index) == 0) {
return(local_index)
} else if (length(local_index) == 1) {
return(rep(local_index, max_members - length(local_index) + 1))
} else if (length(local_index) > 1) {
return(
sample(local_index, size=max_members, replace=TRUE)
)
} else {
stop("Invalid local_index") # internal error
}
})
} else if (method == "cyclical") {
# no bootstrap, fill up with NA's
cols = lapply(cols, function(local_index) {
if (length(local_index) == max_members) {
# maximum number of entries
return(local_index)
} else {
# less than maximum number entries
return(c(local_index, rep(NA, max_members - length(local_index)))) # fill up with NAs
}
})
} else if (method == "average") {
# TODO
stop("method not implemented")
} else {
stop("wrong method")
}
col_index[[i]] = do.call(cbind, cols) # create matrix of column indices
}
# Matrix of local (and matched) indicies. One column per dlist
col_index_mat = do.call(rbind, col_index)
# calculate number of entries in each experimental group
group_members = sapply(col_index, nrow)
if (!length(group_members) == length(groups)) {
stop("Internal, match group inconsistency")
}
if (!length(group_members) == length(col_index)) {
stop("Internal, match group and col index inconsistent.")
}
# create internal match group id
match_group = list()
for (k in 1:length(group_members)) {
match_group[[k]] = rep(k, nrow(col_index[[k]]))
}
match_group = unlist(match_group)
# COPY DATA TO RETURN STRUCTURE
# --------------------------------------------------------------
# As specified in col_index_mat. Slicing the input data lists using the calculated column indices.
out = list()
for (j in 1:length(dlists)) {
out[[j]] = list()
out[[j]]$meta_col = dlists[[j]]$meta_col[col_index_mat[,j], , drop=FALSE]
out[[j]]$meta_col$match_group = match_group
out[[j]]$meta_row = dlists[[j]]$meta_row
if (!is.null(matrix_selections)) {
# only return selected matrix
for (entry in matrix_selections[j]) {
out[[j]][[entry]] = as.matrix(dlists[[j]][[entry]])[,col_index_mat[,j], drop=FALSE]
}
} else {
# assume data lists have the exact same matrix names.
for (entry in names(dlists[[j]])) {
if (entry == "meta_row" | entry == "meta_col") next
out[[j]][[entry]] = as.matrix(dlists[[j]][[entry]])[,col_index_mat[,j], drop=FALSE]
# note that NA slicing (NA indices yields NA elements) only works for matrices
}
}
checkDataList(out[[j]])
}
return(out)
}
# mergeDataListsByColLegacy = function(dlist1, dlist2, match_condition,
# max_repeat=3, output_format="rjoin_matrix")
# {
# require(plyr)
# # # test enviroment
# # match_condition=c("cell_id", "pert_id")
# # max_repeat = 3
# # dlist1 = ttest
# # dlist2 = p100_ttest
# checkDataList(dlist1)
# checkDataList(dlist2)
# # Ensure that column meta data is a data frame.
# dlist1$meta_col = as.data.frame(dlist1$meta_col)
# dlist2$meta_col = as.data.frame(dlist2$meta_col)
# dlist1$meta_row = as.data.frame(dlist1$meta_row)
# dlist2$meta_row = as.data.frame(dlist2$meta_row)
# # Test if provided row_id is valid
# if (!all(match_condition %in% colnames(dlist1$meta_col))) {
# stop("invalid col id not found in meta_col: ", match_condition)
# }
# if (!all(match_condition %in% colnames(dlist2$meta_col))) {
# stop("invalid col id not found in meta_col: ", match_condition)
# }
# # Test that provided lists contains the same entreis
# if (!all(names(dlist1) == names(dlist2))) {
# stop("Data lists contain different entries.")
# }
# # Construct group ids
# sampleid1 = apply(dlist1$meta_col[match_condition], 1, paste, collapse=":")
# sampleid1 = gsub(" ", "", sampleid1) # remove all whitespace
# sampleid2 = apply(dlist2$meta_col[match_condition], 1, paste, collapse=":")
# sampleid2 = gsub(" ", "", sampleid2) # remove all whitespace
# groups = unique(c(sampleid1, sampleid2))
# col_index1 = list()
# col_index2 = list()
# for (i in 1:length(groups)) {
# cols1 = which(sampleid1 == groups[i])
# cols2 = which(sampleid2 == groups[i])
# if (length(cols1) == 0) {
# cols1 = NA
# }
# if (length(cols2) == 0) {
# cols2 = NA
# }
# if (length(cols1) == 1) {
# # only one data point of type 1
# col_index1[[i]] = rep(cols1, length(cols2))
# col_index2[[i]] = cols2
# } else if (length(cols2) == 1) {
# col_index1[[i]] = cols1
# col_index2[[i]] = rep(cols2, length(cols1))
# } else if (length(cols1) == length(cols2)) {
# # same number of data points, randomize order of matching
# col_index1[[i]] = cols1
# col_index2[[i]] = sample(cols2, replace=FALSE)
# } else if (length(cols1) > length(cols2)){
# col_index1[[i]] = cols1
# # fill in data of second type
# quotient = length(cols1) %/% length(cols2)
# remainder = length(cols1) %% length(cols2)
# cycle_index = as.vector(
# replicate(quotient, sample(cols2, replace=FALSE))
# )
# remainder_index = sample(cols2, size=remainder, replace=FALSE)
# col_index2[[i]] = c(cycle_index, remainder_index)
# } else {
# # length(cols2) > length(cols1)
# col_index2[[i]] = cols2
# quotient = length(cols2) %/% length(cols1)
# remainder = length(cols2) %% length(cols1)
# cycle_index = as.vector(
# replicate(quotient, sample(cols1, replace=FALSE))
# )
# remainder_index = sample(cols1, size=remainder, replace=FALSE)
# col_index1[[i]] = c(cycle_index, remainder_index)
# }
# }
# col_index1 = unlist(col_index1)
# col_index2 = unlist(col_index2)
# # Calculate the number of each data index that needs to be removed in order to satisfy the maximum
# # data repeatability criteria.
# # named vector of the number of data points to remove
# remove1 = table(col_index1) - max_repeat
# remove1 = remove1[remove1 > 0]
# remove2 = table(col_index2) - max_repeat
# remove2 = remove2[remove2 > 0]
# if (length(remove1) > 0) {
# for (i in 1:length(remove1)) {
# # print(i)
# index = as.numeric(names(remove1[i])) # data index
# nrem = remove1[i] # number of entries to remove (multiplicity)
# index_remove = sample(which(col_index1 == index), nrem)
# col_index1[index_remove] = NA
# }
# }
# if (length(remove2) > 0) {
# for (i in 1:length(remove2)) {
# # print(i)
# index = as.numeric(names(remove2[i])) # data index
# nrem = remove2[i] # number of entries to remove (multiplicity)
# index_remove = sample(which(col_index2 == index), nrem)
# col_index2[index_remove] = NA
# }
# }
# # Slice the input data lists using the calculated column indices.
# out = list()
# # out$meta_col = rbind.fill(dlist1$meta_col[col_index1,], dlist2$meta_col[col_index2,])
# # out$meta_col = merge(dlist1$meta_col[col_index1,], dlist2$meta_col[col_index2,], all.y=TRUE)
# # out$meta_col = dlist1$meta_col[col_index1,]
# # meta_col1 = dlist1$meta_col[col_index1,]
# # names(meta_col1) = paste0(names(meta_col1), "1")
# # meta_col2 = dlist2$meta_col[col_index2,]
# # names(meta_col2) = paste0(names(meta_col2), "2")
# # out$meta_col = cbind(meta_col1, meta_col2)
# if (output_format == "rjoin_matrix") {
# out$meta_col = cbind(dlist1$meta_col[col_index1,], dlist2$meta_col[col_index2,])
# out$meta_row = rbind.fill(dlist1$meta_row, dlist2$meta_row)
# for (entry in names(dlist1)) {
# if (entry == "meta_row" | entry == "meta_col") next
# out[[entry]] = rbind(
# dlist1[[entry]][,col_index1],
# dlist2[[entry]][,col_index2])
# }
# checkDataList(out)
# } else if (output_format == "matched_matrix") {
# out[[1]] = list()
# out[[2]] = list()
# out[[1]]$meta_col = dlist1$meta_col[col_index1,]
# out[[2]]$meta_col = dlist2$meta_col[col_index2,]
# out[[1]]$meta_row = dlist1$meta_row
# out[[2]]$meta_row = dlist2$meta_row
# for (entry in intersect(names(dlist1), names(dlist2))) {
# if (entry == "meta_col" || entry == "meta_row") next
# out[[1]][[entry]] = dlist1[[entry]][,col_index1]
# out[[2]][[entry]] = dlist2[[entry]][,col_index2]
# }
# checkDataList(out[[1]])
# checkDataList(out[[2]])
# }
# return(out)
# }
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.