R/Upset.R

make_comb_mat_from_matrix = function(x, mode, top_n_sets = Inf, min_set_size = -Inf, 
	universal_set = NULL, complement_size = NULL) {

	# check whether x is a binary matrix
	if(is.data.frame(x)) {
		lc = sapply(x, function(x) {
			if(is.numeric(x)) {
				all(x == 0 | x == 1)
			} else if(is.logical(x)) {
				TRUE
			} else {
				FALSE
			}
		})
	} else if(is.matrix(x)) {
		lc = apply(x, 2, function(x) {
			if(is.numeric(x)) {
				all(x == 0 | x == 1)
			} else if(is.logical(x)) {
				TRUE
			} else {
				FALSE
			}
		})
	}
	if(sum(lc) < 1) {
		stop_wrap("Can not find columns which are logical or only contain 0 or 1.")
	} else {
		x = x[, lc, drop = FALSE]
	}
	if(is.null(colnames(x))) {
		stop_wrap("The matrix or the data frame must have column names.")
	}

	x = as.matrix(x) + 0
	set_size = colSums(x)
	l = set_size >= min_set_size & rank(max(set_size) - set_size) <= top_n_sets

	set_size = set_size[l]
	x = x[, l, drop = FALSE]
	
	if(is.null(rownames(x))) {
		x_has_rownames = FALSE
		rownames(x) = as.character(seq_len(nrow(x)))
	} else {
		x_has_rownames = TRUE
	}

	if(!is.null(universal_set)) {
		if(!x_has_rownames) {
			stop_wrap("`x` should have row names when `universal_set` is set.")
		}
	}

	if(is.null(complement_size)) {
		k = sum(rowSums(x) == 0)
		if(k) {
			complement_size = k
			if(is.null(universal_set)) {
				universal_set = unique(rownames(x))
			}
		}
	}

	x = x[rowSums(x) > 0, , drop = FALSE]

	if(!is.null(universal_set)) {

    	x = x[intersect(rownames(x), universal_set), , drop = FALSE]
    	if(nrow(x) == 0) {
    		stop_wrap("There is no combination set left after intersecting to `universal_set`.")
    	}
    	complement_size = length(setdiff(universal_set, rownames(x)))
    }
	
	comb_mat = unique(x)
	rn = apply(comb_mat, 1, binaryToInt)
	tb = table(apply(x, 1, binaryToInt))
	comb_size = as.vector(tb[as.character(rn)])
	
	rownames(comb_mat) = NULL
	comb_mat = t(comb_mat)

	nc = ncol(comb_mat)
	comb_mat2 = matrix(nrow = nrow(comb_mat), ncol = nc*(nc-1)/2)
	rownames(comb_mat2) = rownames(comb_mat)
	if(mode == "intersect") {
		if(nc > 1) {
			ic = 0
			for(i in 1:(nc-1)) {
				for(j in (i+1):nc) {
					ic = ic + 1
					comb_mat2[, ic] = (comb_mat[, i] & comb_mat[, j]) + 0
				}
			}
		}
	} else if(mode == "union") {
		if(nc > 1) {
			ic = 0
			for(i in 1:(nc-1)) {
				for(j in (i+1):nc) {
					ic = ic + 1
					comb_mat2[, ic] = (comb_mat[, i] | comb_mat[, j]) + 0
				}
			}
		}
	}
	comb_mat2 = comb_mat2[, colSums(comb_mat2) > 0, drop = FALSE]

	if(mode %in% c("intersect", "union")) {
		comb_mat = unique(comb_mat2, MARGIN = 2)
		comb_size = apply(comb_mat, 2, function(query) {
			if(mode == "intersect") {
				l_subset = query == 1
				l = apply(x, 1, function(y) all(y[l_subset] == 1))
			} else if(mode == "union") {
				l_subset = query == 1
				l = apply(x, 1, function(y) {
					if(all(y[!l_subset] == 0)) {
						sum(y[l_subset]) > 0
					} else {
						FALSE
					}
				})
			}
			sum(l)
		})
	}

	#normalize comb_mat to 2^n - 1 columns
	n = nrow(comb_mat)
	if(ncol(comb_mat) < 2^n - 1) {
		full_comb_mat = matrix(0, nrow = n, ncol = 2^n - 1)
		j = 1
		for(k in seq_len(n)) {
			comb = combn(n, k)
	        for(i in 1:ncol(comb)) {
	            full_comb_mat[comb[, i], j] = TRUE
	            j = j + 1
	        }
		}
		ind = which(! apply(full_comb_mat, 2, binaryToInt) %in% apply(comb_mat, 2, binaryToInt))
		comb_mat = cbind(comb_mat, full_comb_mat[, ind, drop = FALSE])
		comb_size = c(comb_size, rep(0, length(ind)))
		
	}

	if(!is.null(complement_size)) {
		comb_mat = cbind(rep(0, nrow(comb_mat)), comb_mat)
		comb_size = c(complement_size, comb_size)
	}

	od = order(apply(comb_mat, 2, function(x) sum(x*seq_along(x))))
	comb_mat = comb_mat[, od, drop = FALSE]
	comb_size = comb_size[od]

	attr(comb_mat, "set_size") = set_size
	attr(comb_mat, "comb_size") = comb_size
	attr(comb_mat, "mode") = mode
	attr(comb_mat, "set_on_rows") = TRUE
	attr(comb_mat, "x") = x
	if(!x_has_rownames) universal_set = as.numeric(universal_set)
	attr(comb_mat, "universal_set") = universal_set
	class(comb_mat) = c("comb_mat", "matrix")
	return(comb_mat)

}

make_comb_mat_from_list = function(lt, mode, value_fun = length, top_n_sets = Inf, 
	min_set_size = -Inf, universal_set = NULL, complement_size = NULL) {

	n = length(lt)
    nm = names(lt)
    if(is.null(nm)) {
    	stop_wrap("The list must have names.")
    }

    if(inherits(lt[[1]], "GRanges")) {
    	union = getFromNamespace("union", ns = "BiocGenerics")
    	intersect = getFromNamespace("intersect", ns = "BiocGenerics")
    	setdiff = getFromNamespace("setdiff", ns = "BiocGenerics")
    } else if(inherits(lt[[1]], "IRanges")) {
    	union = getFromNamespace("union", ns = "BiocGenerics")
    	intersect = getFromNamespace("intersect", ns = "BiocGenerics")
    	setdiff = getFromNamespace("setdiff", ns = "BiocGenerics")
    }

    if(inherits(lt[[1]], "GRanges")) {
    	set_size = sapply(lt, function(x) {
	    	value_fun(union(x, x[NULL]))
	    })
    } else if(inherits(lt[[1]], "IRanges")) {
    	set_size = sapply(lt, function(x) {
	    	value_fun(union(x, x[NULL]))
	    })
    } else {
	    set_size = sapply(lt, function(x) {
	    	value_fun(union(x, NULL))
	    })
	}

	l = set_size >= min_set_size & rank(max(set_size) - set_size) <= top_n_sets

	set_size = set_size[l]
	lt = lt[l]
	n = length(lt)
    nm = names(lt)

    if(!is.null(universal_set)) {
    	lt = lapply(lt, function(x) intersect(x, universal_set))
    	complement_set = universal_set
    	for(i in seq_along(lt)) {
    		complement_set = setdiff(complement_set, lt[[i]])
    	}
    	complement_size = value_fun(complement_set)
    }
    
    comb_mat = matrix(FALSE, nrow = n, ncol = sum(choose(n, 1:n)))
    rownames(comb_mat) = nm
    j = 1
    for(k in 1:n) {
        comb = combn(n, k)
        for(i in 1:ncol(comb)) {
            comb_mat[comb[, i], j] = TRUE
            j = j + 1
        }
    }

    get_comb_size = function(lt, mode, do = rep(TRUE, length(lt)), value_fun = length) {
        set1_index = which(do)
        set2_index = which(!do)

        s = lt[[ set1_index[1] ]]
        if(mode == "distinct") {
	        for(i in set1_index[-1]) {
	            s = intersect(s, lt[[ i ]])
	        }

	        for(i in set2_index) {
	            s = setdiff(s, lt[[ i ]])
	        }
	    } else if(mode == "intersect") {
	    	for(i in set1_index[-1]) {
	            s = intersect(s, lt[[ i ]])
	        }
	    } else if(mode == "union") {
	    	for(i in set1_index[-1]) {
	            s = union(s, lt[[ i ]])
	        }
	    }
        value_fun(s)
    }

    comb_size = numeric(ncol(comb_mat))
    for(i in seq_len(ncol(comb_mat))) {
        comb_size[i] = get_comb_size(lt, mode = mode, comb_mat[, i], value_fun = value_fun)
    }

    comb_mat = comb_mat + 0

    if(!is.null(complement_size)) {
		comb_mat = cbind(rep(0, nrow(comb_mat)), comb_mat)
		comb_size = c(complement_size, comb_size)
	}

    attr(comb_mat, "set_size") = set_size
	attr(comb_mat, "comb_size") = comb_size
	attr(comb_mat, "mode") = mode
	attr(comb_mat, "set_on_rows") = TRUE
	attr(comb_mat, "lt") = lt
	attr(comb_mat, "universal_set") = universal_set
	class(comb_mat) = c("comb_mat", "matrix")
	return(comb_mat)
}

# == title
# Convert a List of Sets to a Binary Matrix
#
# == param
# -lt A list of vectors.
# -universal_set The universal set.
#
# == details
# It converts the list which have m sets to a binary matrix with n rows and m columns
# where n is the size of universal set.
#
# == example
# set.seed(123)
# lt = list(a = sample(letters, 5),
#           b = sample(letters, 10),
#           c = sample(letters, 15))
# list_to_matrix(lt)
# list_to_matrix(lt, universal_set = letters)
list_to_matrix = function(lt, universal_set = NULL) {
	if(!is.null(universal_set)) {
		lt = lapply(lt, function(x) intersect(x, universal_set))
	} else {
		universal_set = unique(unlist(lt))
	}

	mat = matrix(0, nrow = length(universal_set), ncol = length(lt))
	rownames(mat) = sort(universal_set)
	colnames(mat) = names(lt)
	for(i in seq_along(lt)) {
		mat[unique(lt[[i]]), i] = 1
	}
	return(mat)
}

# == title
# Make a Combination Matrix for UpSet Plot
#
# == param
# -... The input sets. If it is represented as a single variable, it should be a matrix/data frame
#     or a list. If it is multiple variables, it should be name-value pairs, see Input section for explanation.
# -mode The mode for forming the combination set, see Mode section.
# -top_n_sets Number of sets with largest size.
# -min_set_size Ths minimal set size that is used for generating the combination matrix.
# -remove_empty_comb_set Whether remove empty combination sets?
# -remove_complement_set Whether remove the complement set which has no intersection to any of the set.
# -universal_set The universal set. If it is set, the size of the complement set of all sets is also calculated.
#                It if is specified, ``complement_size`` is ignored.
# -complement_size The size for the complement of all sets. If it is specified, the combination
#                  set name will be like "00...".
# -value_fun For each combination set, how to calculate the size? If it is a scalar set, 
#      the length of the vector is the size of the set, while if it is a region-based set,
#      (i.e. ``GRanges`` or ``IRanges`` object), the sum of widths of regions in the set is
#      calculated as the size of the set.
#
# == Input
# To represent multiple sets, the variable can be represented as: 
#
# 1. A list of sets where each set is a vector, e.g.:
#
#     list(set1 = c("a", "b", "c"),
#          set2 = c("b", "c", "d", "e"),
#          ...)
#
# 2. A binary matrix/data frame where rows are elements and columns are sets, e.g.:
#
#       a b c
#     h 1 1 1
#     t 1 0 1
#     j 1 0 0
#     u 1 0 1
#     w 1 0 0
#     ...
#
# If the variable is a data frame, the binary columns (only contain 0 and 1) and the logical
# columns are only kept.
#
# The set can be genomic regions, then it can only be represented as a list of ``GRanges`` objects.
#
# == Mode
# E.g. for three sets (A, B, C), the UpSet approach splits the combination of selecting elements
# in the set or not in the set and calculates the sizes of the combination sets. For three sets,
# all possible combinations are:
#
#     A B C
#     1 1 1
#     1 1 0
#     1 0 1
#     0 1 1
#     1 0 0
#     0 1 0
#     0 0 1
# 
# A value of 1 means to select that set and 0 means not to select that set. E.g., "1 1 0"
# means to select set A, B while not set C. Note there is no "0 0 0", because the background 
# size is not of interest here. With the code of selecting and not selecting the sets, next
# we need to define how to calculate the size of that combination set. There are three modes:
#
# 1. ``distinct`` mode: 1 means in that set and 0 means not in that set, then "1 1 0" means a
# set of elements also in set A and B, while not in C (i.e. ``setdiff(intersect(A, B), C)``). Under
# this mode, the seven combination sets are the seven partitions in the Venn diagram and they
# are mutually exclusive.
#
# 2. ``intersect`` mode: 1 means in that set and 0 is not taken into account, then, "1 1 0" means
# a set of elements in set A and B, and they can also in C or not in C (i.e. ``intersect(A, B)``).
# Under this mode, the seven combination sets can overlap.
#
# 3. ``union`` mode: 1 means in that set and 0 is not taken into account. When there are multiple
# 1, the relationship is OR. Then, "1 1 0" means a set of elements in set A or B, and they can also in C or not in C (i.e. ``union(A, B)``).
# Under this mode, the seven combination sets can overlap.
#
# == value
# A matrix also in a class of ``comb_mat``.
#
# Following functions can be applied to it: `set_name`, `comb_name`, `set_size`, `comb_size`, `comb_degree`,
# `extract_comb` and `t.comb_mat`.
#
# == example
# set.seed(123)
# lt = list(a = sample(letters, 10),
#           b = sample(letters, 15),
#           c = sample(letters, 20))
# m = make_comb_mat(lt)
#
# mat = list_to_matrix(lt)
# mat
# m = make_comb_mat(mat)
#
# \dontrun{
# library(circlize)
# library(GenomicRanges)
# lt = lapply(1:4, function(i) generateRandomBed())
# lt = lapply(lt, function(df) GRanges(seqnames = df[, 1], 
#     ranges = IRanges(df[, 2], df[, 3])))
# names(lt) = letters[1:4]
# m = make_comb_mat(lt)
# }
make_comb_mat = function(..., mode = c("distinct", "intersect", "union"),
	top_n_sets = Inf, min_set_size = -Inf, remove_empty_comb_set = TRUE,
	remove_complement_set = FALSE, universal_set = NULL, 
	complement_size = NULL, value_fun = NULL) {

	lt = list(...)

	mode = match.arg(mode)[1]
	if(length(lt) == 1) {
		lt = lt[[1]]
		if(!is.null(dim(lt))) {
			m = make_comb_mat_from_matrix(lt, mode = mode, top_n_sets = top_n_sets, 
				min_set_size = min_set_size, universal_set = universal_set, complement_size = complement_size)
			if(remove_empty_comb_set) {
				m = m[comb_size(m) > 0]
			}
			if(remove_complement_set) {
				m = m[comb_degree(m) > 0]
			}
			return(m)
		}
	}

	if(is.null(value_fun)) {
		if(inherits(lt[[1]], "GRanges")) {
			value_fun = function(x) sum(as.numeric(getFromNamespace("width", ns = "BiocGenerics")(x)))
		} else if(inherits(lt[[1]], "IRanges")) {
			value_fun = function(x) sum(as.numeric(getFromNamespace("width", ns = "BiocGenerics")(x)))
		} else {
			value_fun = length
		}
	}
	m = make_comb_mat_from_list(lt, value_fun, mode = mode, top_n_sets = top_n_sets, min_set_size = min_set_size, 
		universal_set = universal_set, complement_size = complement_size)
	if(remove_empty_comb_set) {
		m = m[comb_size(m) > 0]
	}
	if(remove_complement_set) {
		m = m[comb_degree(m) > 0]
	}
	return(m)
}


binaryToInt = function(x) {
	sum(x * 2^(rev(seq_along(x)) - 1))
}

intToBinary = function(x, len) {
	rev(as.integer(intToBits(x))[1:len])
}

# == title
# Set Names
#
# == param
# -m A combination matrix returned by `make_comb_mat`.
#
# == value
# A vector of set names.
#
# == example
# set.seed(123)
# lt = list(a = sample(letters, 10),
#           b = sample(letters, 15),
#           c = sample(letters, 20))
# m = make_comb_mat(lt)
# set_name(m)
set_name = function(m) {
	set_on_rows = attr(m, "set_on_rows")
	if(set_on_rows) {
		rownames(m)
	} else {
		colnames(m)
	}
}

# == title
# Set Sizes
#
# == param
# -m A combination matrix returned by `make_comb_mat`.
#
# == value
# A vector of set sizes.
#
# == example
# set.seed(123)
# lt = list(a = sample(letters, 10),
#           b = sample(letters, 15),
#           c = sample(letters, 20))
# m = make_comb_mat(lt)
# set_size(m)
set_size = function(m) {
	structure(attr(m, "set_size"), names = set_name(m))
}

# == title
# Sizes of the Combination sets
#
# == param
# -m A combination matrix returned by `make_comb_mat`.
#
# == value
# A vector of sizes of the combination sets.
#
# == example
# set.seed(123)
# lt = list(a = sample(letters, 10),
#           b = sample(letters, 15),
#           c = sample(letters, 20))
# m = make_comb_mat(lt)
# comb_size(m)
comb_size = function(m) {
	structure(attr(m, "comb_size"), names = comb_name(m))
}

# == title
# Names of the Combination sets
#
# == param
# -m A combination matrix returned by `make_comb_mat`.
#
# == details
# The name of the combination sets are formatted as a string
# of binary bits. E.g. for three sets of "a", "b", "c", the combination
# set with name "101" corresponds to select set a, not select set b
# and select set c. The definition of "select" depends on the value of
# ``mode`` from `make_comb_mat`.
# 
# == value
# A vector of names of the combination sets.
#
# == example
# set.seed(123)
# lt = list(a = sample(letters, 10),
#           b = sample(letters, 15),
#           c = sample(letters, 20))
# m = make_comb_mat(lt)
# comb_name(m)
comb_name = function(m) {
	set_on_rows = attr(m, "set_on_rows")
	if(set_on_rows) {
		apply(m, 2, paste, collapse = "")
	} else {
		apply(m, 1, paste, collapse = "")
	}
}

# == title
# Degrees of the Combination sets
#
# == param
# -m A combination matrix returned by `make_comb_mat`.
#
# == details
# The degree for a combination set is the number of sets that are selected.
#
# == value
# A vector of degrees of the combination sets.
#
# == example
# set.seed(123)
# lt = list(a = sample(letters, 10),
#           b = sample(letters, 15),
#           c = sample(letters, 20))
# m = make_comb_mat(lt)
# comb_degree(m)
comb_degree = function(m) {
	set_on_rows = attr(m, "set_on_rows")
	if(set_on_rows) {
		d = colSums(m)
	} else {
		d = rowSums(m)
	}
	structure(d, names = comb_name(m))
}

# == title
# Extract Elements in a Combination set
#
# == param
# -m A combination matrix returned by `make_comb_mat`.
# -comb_name The valid combination set name should be from `comb_name`.
#
# == details
# It returns the combination set.
#
# == example
# set.seed(123)
# lt = list(a = sample(letters, 10),
#           b = sample(letters, 15),
#           c = sample(letters, 20))
# m = make_comb_mat(lt)
# extract_comb(m, "110")
extract_comb = function(m, comb_name) {

	all_comb_names = comb_name(m)
	if(!comb_name %in% all_comb_names) {
		stop_wrap(paste0("Cannot find a combination name:	", comb_name, ", valid combination name should be in `comb_name(m)`."))
	}

	query = as.numeric(strsplit(comb_name, "")[[1]])

	x = attr(m, "x")
	lt = attr(m, "lt")
	universal_set = attr(m, "universal_set")
	mode = attr(m, "mode")

	is_complement_set = function(comb_name) {
		grepl("^0+$", comb_name)
	}

	if(is.null(universal_set) && is_complement_set(comb_name)) {
		stop_wrap(qq("Cannot extract elements for the complement set '@{comb_name}' since universal set was not set."))
	}

	if(!is.null(x)) {
		if(is_complement_set(comb_name)) {
			if(is.numeric(universal_set)) {
				return(as.integer(as.numeric(setdiff(as.character(universal_set), rownames(x)))))
			} else {
				return(setdiff(universal_set, rownames(x)))
			}
		}

		if(mode == "distinct") {
			l = apply(x, 1, function(y) all(y == query))
		} else if(mode == "intersect") {
			l_subset = query == 1
			l = apply(x, 1, function(y) all(y[l_subset] == 1))
		} else if(mode == "union") {
			l_subset = query == 1
			l = apply(x, 1, function(y) {
				if(all(y[!l_subset] == 0)) {
					sum(y[l_subset]) > 0
				} else {
					FALSE
				}
			})
		}

		rn = rownames(x)
		if(is.null(universal_set)) {
			if(is.null(rn)) {
				return(seq_len(nrow(x))[l])
			} else {
				return(rn[l])
			}
		} else {
			if(is.numeric(universal_set)) {
				return(as.integer(as.numeric(rn[l])))
			} else {
				return(rn[l])
			}
		}
	}
	if(!is.null(lt)) {

		if(inherits(lt[[1]], "GRanges")) {
	    	union = getFromNamespace("union", ns = "BiocGenerics")
	    	intersect = getFromNamespace("intersect", ns = "BiocGenerics")
	    	setdiff = getFromNamespace("setdiff", ns = "BiocGenerics")
	    } else if(inherits(lt[[1]], "IRanges")) {
	    	union = getFromNamespace("union", ns = "BiocGenerics")
	    	intersect = getFromNamespace("intersect", ns = "BiocGenerics")
	    	setdiff = getFromNamespace("setdiff", ns = "BiocGenerics")
	    }

	    if(is_complement_set(comb_name)) {
			s = universal_set
			for(i in seq_along(lt)) {
				s = setdiff(s, lt[[i]])
			}
			return(s)
		}

		do_comb = function(lt, mode, do = rep(TRUE, length(lt))) {
	        set1_index = which(do)
	        set2_index = which(!do)

	        s = lt[[ set1_index[1] ]]
	        
	        if(mode == "distinct") {
		        for(i in set1_index[-1]) {
		            s = intersect(s, lt[[ i ]])
		        }

		        for(i in set2_index) {
		            s = setdiff(s, lt[[ i ]])
		        }
		    } else if(mode == "intersect") {
		    	for(i in set1_index[-1]) {
		            s = intersect(s, lt[[ i ]])
		        }
		    } else if(mode == "union") {
		    	for(i in set1_index[-1]) {
		            s = union(s, lt[[ i ]])
		        }
		    }
	        s
	    }

	    do = as.logical(as.numeric(strsplit(comb_name, "")[[1]]))
	    do_comb(lt, mode, do)
	}
}

# == title
# Transpost the Combination Matrix
#
# == param
# -x A combination matrix returned by `make_comb_mat`.
#
# == example
# set.seed(123)
# lt = list(a = sample(letters, 10),
#           b = sample(letters, 15),
#           c = sample(letters, 20))
# m = make_comb_mat(lt)
# t(m)
t.comb_mat = function(x) {
	x2 = t.default(x)
	attr(x2, "set_on_rows") = !attr(x, "set_on_rows")
	x2
}

# == title
# Subset the Combination Matrix
#
# == param
# -x A combination matrix returned by `make_comb_mat`.
# -i Indices on rows.
# -j Indices on columns.
# -drop It is always reset to ``FALSE`` internally.
#
# == details
# If sets are on rows of the combination matrix, the row indices correspond
# to sets and column indices correspond to combination sets, and if sets are
# on columns of the combination matrix, rows correspond to the combination sets.
#
# If the index is one-dimension, e.g. ``x[i]``, the index always corresponds to the combination sets.
#
# You should not subset by the sets. It will give you wrong combination set size. The subsetting
# on sets are only used internally.
#
# This subsetting method is mainly for subsetting combination sets, i.e., users
# can first use `comb_size` to get the size of each combination set, and filter them
# by the size.
#
# == example
# set.seed(123)
# lt = list(a = sample(letters, 10),
#           b = sample(letters, 15),
#           c = sample(letters, 20))
# m = make_comb_mat(lt)
# m2 = m[, comb_size(m) >= 3]
# comb_size(m2)
# m[comb_size(m) >= 3]
"[.comb_mat" = function(x, i, j, drop = FALSE) {
	set_size = attr(x, "set_size")
	comb_size = attr(x, "comb_size")
	set_on_rows = attr(x, "set_on_rows")
	mode = attr(x, "mode")

	class(x) = "matrix"
	if(set_on_rows) {
		if(nargs() == 2) {
			x2 = x[, i, drop = FALSE]
			comb_size = comb_size[i]
		} else if(missing(i)) {
			x2 = x[, j, drop = FALSE]
			comb_size = comb_size[j]
		} else if(missing(j)) {
			x2 = x[i, , drop = FALSE]
			set_size = set_size[i]
		} else {
			x2 = x[i, j, drop = FALSE]
			set_size = set_size[i]
			comb_size = comb_size[j]
		}
	} else {
		if(nargs() == 2) {
			x2 = x[i, , drop = FALSE]
			comb_size = comb_size[i]
		} else if(missing(i)) {
			x2 = x[, j, drop = FALSE]
			set_size = set_size[j]
		} else if(missing(j)) {
			x2 = x[i, , drop = FALSE]
			comb_size = comb_size[i]
		} else {
			x2 = x[i, j, drop = FALSE]
			comb_size = comb_size[i]
			set_size = set_size[j]
		}
	}

	attr(x2, "set_size") = set_size
	attr(x2, "comb_size") = comb_size
	attr(x2, "mode") = mode
	attr(x2, "set_on_rows") = set_on_rows
	attr(x2, "x") = attr(x, "x")
	attr(x2, "lt") = attr(x, "lt")
	attr(x2, "universal_set") = attr(x, "universal_set")
	class(x2) = c("comb_mat", "matrix")
	return(x2)
}

# == title
# Print the comb_mat Object
#
# == param
# -x A combination matrix returned by `make_comb_mat`.
# -... Other arguments
#
print.comb_mat = function(x, ...) {
	set_size = attr(x, "set_size")
	comb_size = attr(x, "comb_size")
	set_on_rows = attr(x, "set_on_rows")
	mode = attr(x, "mode")

	cat("A combination matrix with", length(set_size), "sets and", length(comb_size), "combinations.\n")
	cat("  ranges of combination set size: c(", min(comb_size), ", ", max(comb_size), ").\n", sep = "")
	cat("  mode for the combination size: ", mode, ".\n", sep = "")
	if(set_on_rows) {
		cat("  sets are on rows.\n")
	} else {
		cat("  sets are on columns\n")
	}
	if(!is.null(attr(x, "universal_set"))) {
		cat("  universal set is set.\n")
	}

	cat("
Utility functions that can be applied:
- set_name(): name of the sets.
- set_size(): size of the sets.
- comb_name(): name of the combination sets.
- comb_size(): size of the combination sets.
- comb_degree(): degree of the combination sets.
- extract_comb(): extract elements in the specific combination set.
- t(): transpose the combination matrix on the UpSet plot.
- '[': subset the combination matrix.
")

}

# == title
# Make the UpSet plot
#
# == param
# -m A combination matrix returned by `make_comb_mat`. The matrix can be transposed to switch
#    the position of sets and combination sets.
# -comb_col The color for the dots representing combination sets.
# -pt_size The point size for the dots representing combination sets.
# -lwd The line width for the combination sets.
# -bg_col Color for the background rectangles.
# -bg_pt_col Color for the dots representing the set is not selected.
# -set_order The order of sets.
# -comb_order The order of combination sets.
# -top_annotation A `HeatmapAnnotation` object on top of the combination matrix.
# -right_annotation A `HeatmapAnnotation` object on the right of the combination matrix.
# -row_names_side The side of row names.
# -... Other arguments passed to `Heatmap`.
#
# == details
# By default, the sets are on rows and combination sets are on columns. The positions of the
# two types of sets can be switched by transposing the matrix.
#
# When sets are on rows, the default top annotation is the barplot showing the size of each
# combination sets and the default right annotation is the barplot showing the size of the sets.
# The annotations are simply constructed by `HeatmapAnnotation` and `anno_barplot` with some
# parameters pre-set. Users can check the source code of `upset_top_annotation` and
# `upset_right_annotation` to find out how the annotations are defined.
#
# To change or to add annotations, users just need to define a new `HeatmapAnnotation` object.
# E.g. if we want to change the side of the axis and name on top annotation:
#
#     Upset(..., top_annotation = 
#         HeatmapAnnotation(
#            "Intersection size" = anno_barplot(
#                comb_size(m), 
#                border = FALSE, 
#                gp = gpar(fill = "black"), 
#                height = unit(2, "cm"),
#                axis_param = list(side = "right")
#            ), 
#            annotation_name_side = "right", 
#            annotation_name_rot = 0)
#     )
#
# To add more annotations on top, users just add it in `HeatmapAnnotation`:
#
#     Upset(..., top_annotation = 
#         HeatmapAnnotation(
#            "Intersection size" = anno_barplot(
#                comb_size(m), 
#                border = FALSE, 
#                gp = gpar(fill = "black"), 
#                height = unit(2, "cm"),
#                axis_param = list(side = "right")
#            ), 
#            "anno1" = anno_points(...),
#            "anno2" = some_vector, 
#            annotation_name_side = "right", 
#            annotation_name_rot = 0)
#     )
#
# And so is for the right annotations.
#
# `UpSet` returns a `Heatmap-class` object, which means, you can add it with other heatmaps and annotations
# by ``+`` or `\%v\%`.
#
# == example
# set.seed(123)
# lt = list(a = sample(letters, 10),
#           b = sample(letters, 15),
#           c = sample(letters, 20))
# m = make_comb_mat(lt)
# UpSet(m)
# UpSet(t(m))
# 
# m = make_comb_mat(lt, mode = "union")
# UpSet(m)
# UpSet(m, comb_col = c(rep(2, 3), rep(3, 3), 1))
#
#
# # compare two UpSet plots
# set.seed(123)
# lt1 = list(a = sample(letters, 10),
#           b = sample(letters, 15),
#           c = sample(letters, 20))
# m1 = make_comb_mat(lt1)
# set.seed(456)
# lt2 = list(a = sample(letters, 10),
#           b = sample(letters, 15),
#           c = sample(letters, 20))
# m2 = make_comb_mat(lt2)
#
# max1 = max(c(set_size(m1), set_size(m2)))
# max2 = max(c(comb_size(m1), comb_size(m2)))
#
# UpSet(m1, top_annotation = upset_top_annotation(m1, ylim = c(0, max2)),
#     right_annotation = upset_right_annotation(m1, ylim = c(0, max1)),
#     column_title = "UpSet1") +
# UpSet(m2, top_annotation = upset_top_annotation(m2, ylim = c(0, max2)),
#     right_annotation = upset_right_annotation(m2, ylim = c(0, max1)),
#     column_title = "UpSet2")
#
UpSet = function(m, 
	comb_col = "black",
	pt_size = unit(3, "mm"), lwd = 2, 
	bg_col = "#F0F0F0", bg_pt_col = "#CCCCCC",
	set_order = order(set_size(m), decreasing = TRUE), 
	comb_order = if(attr(m, "set_on_rows")) {
			order.comb_mat(m[set_order, ], decreasing = TRUE)
		} else {
			order.comb_mat(m[, set_order], decreasing = TRUE)
		},
	top_annotation = upset_top_annotation(m),
	right_annotation = upset_right_annotation(m),
	row_names_side = "left",
	...) {

	set_on_rows = attr(m, "set_on_rows")
	mode = attr(m, "mode")

	m2 = m
	
	class(m2) = "matrix"

	pt_size = pt_size
	lwd = lwd

	if(set_on_rows) {
		n_comb = ncol(m)
		if(length(comb_col == 1)) comb_col = rep(comb_col, n_comb)

		layer_fun = function(j, i, x, y, w, h, fill) {
			nr = round(1/as.numeric(h[1]))
			nc = round(1/as.numeric(w[1]))
			subm = matrix(pindex(m2, i, j), nrow = nr, byrow = FALSE)
			for(k in seq_len(nr)) {
				if(k %% 2) {
					grid.rect(y = k/nr, height = 1/nr, just = "top", gp = gpar(fill = bg_col[1], col = NA))
				} else {
					if(length(bg_col) > 1) {
						grid.rect(y = k/nr, height = 1/nr, just = "top", gp = gpar(fill = bg_col[2], col = NA))
					}
				}
			}
			grid.points(x, y, size = pt_size, pch = 16, gp = gpar(col = ifelse(pindex(m2, i, j), comb_col[j], bg_pt_col)))
			jj = unique(j)
			for(k in seq_len(nc)) {
		        if(sum(subm[, k]) >= 2) {
		            i_min = min(which(subm[, k] > 0))
		            i_max = max(which(subm[, k] > 0))
		            grid.lines(c(k - 0.5, k - 0.5)/nc, (nr - c(i_min, i_max) + 0.5)/nr, gp = gpar(col = comb_col[jj[k]], lwd = lwd))
		        }
		    }
		}

		# check top annotation
		# if it is specified by upset_top_annotation and gp(col) is not set
		ra = top_annotation
		if(length(ra) == 1) {
			ta_call = substitute(top_annotation)
			ta_call = as.list(ta_call)
			if(as.character(ta_call[[1]]) == "upset_top_annotation") {
				if(!"gp" %in% names(as.list(ta_call))) {
					ra@anno_list[[1]]@fun@var_env$gp$fill = comb_col
					ra@anno_list[[1]]@fun@var_env$gp$col = comb_col
				}
			}
		}

		ht = Heatmap(m2, cluster_rows = FALSE, cluster_columns = FALSE, rect_gp = gpar(type = "none"),
			layer_fun = layer_fun, show_heatmap_legend = FALSE,
			top_annotation = ra,
			right_annotation = right_annotation,
			row_names_side = row_names_side,
			row_order = set_order, column_order = comb_order, ...)
	} else {
		n_comb = nrow(m)
		if(length(comb_col == 1)) comb_col = rep(comb_col, n_comb)

		layer_fun = function(j, i, x, y, w, h, fill) {
			nr = round(1/as.numeric(h[1]))
			nc = round(1/as.numeric(w[1]))
			subm = matrix(pindex(m2, i, j), nrow = nr, byrow = FALSE)
			for(k in seq_len(nc)) {
				if(k %% 2) {
					grid.rect(x = k/nc, width = 1/nc, just = "right", gp = gpar(fill = "#F0F0F0", col = NA))
				}
			}
			grid.points(x, y, size = pt_size, pch = 16, gp = gpar(col = ifelse(pindex(m2, i, j), comb_col[i], "#CCCCCC")))
			ii = unique(i)
			for(k in seq_len(nr)) {
		        if(sum(subm[k, ]) >= 2) {
		            i_min = min(which(subm[k, ] > 0))
		            i_max = max(which(subm[k, ] > 0))
		            grid.lines((c(i_min, i_max) - 0.5)/nc, (nr - c(k ,k) + 0.5)/nr, gp = gpar(col = comb_col[ii[k]], lwd = lwd))
		        }
		    }
		}

		ra = right_annotation
		if(length(ra) == 1) {
			ta_call = substitute(top_annotation)
			ta_call = as.list(ta_call)
			if(as.character(ta_call[[1]]) == "upset_right_annotation") {
				if(!"gp" %in% names(as.list(ta_call))) {
					ra@anno_list[[1]]@fun@var_env$gp$fill = comb_col
					ra@anno_list[[1]]@fun@var_env$gp$col = comb_col
				}
			}
		}
		ht = Heatmap(m2, cluster_rows = FALSE, cluster_columns = FALSE, rect_gp = gpar(type = "none"),
			layer_fun = layer_fun, show_heatmap_legend = FALSE,
			top_annotation = top_annotation,
			right_annotation = ra,
			row_order = comb_order, column_order = set_order, ...)
	}
	ht
}

# == title
# Order of the Combination Sets
#
# == param
# -m A combination matrix returned by `make_comb_mat`.
# -on On sets or on combination sets?
# -decreasing Whether the ordering is applied decreasingly.
#
# == details
# It first sorts by the degree of the combination sets then
# by the combination matrix.
#
order.comb_mat = function(m, decreasing = TRUE, on = "comb_set") {
	if(on == "set") {
		return(order(set_size(m), decreasing = decreasing))
	} else {
		set_on_rows = attr(m, "set_on_rows")
		if(set_on_rows) {
			lt = list(comb_degree(m))
			lt = c(lt, as.list(as.data.frame(t(m))))
			lt$decreasing = decreasing
			do.call(order, lt)
		} else {
			lt = list(comb_degree(m))
			lt = c(lt, as.list(as.data.frame(m)))
			lt$decreasing = decreasing
			do.call(order, lt)
		}
	}
}

# == title
# Default UpSet Top Annotation
#
# == param
# -m A combination matrix which is as same as the one for `UpSet`.
# -gp Graphic parameters for bars.
# -height The height of the top annotation.
# -show_annotation_name Whether show annotation names?
# -annotation_name_gp Graphic parameters for anntation names.
# -annotation_name_offset Offset to the annotation name, a `grid::unit` object.
# -annotation_name_side Side of the annotation name.
# -annotation_name_rot Rotation of the annotation name, it can only take values in ``c(00, 90, 180, 270)``.
# -... Passed to `anno_barplot`.
#
# == details
# The default top annotation is actually barplot implemented by `anno_barplot`. For
# how to set the top annotation or bottom annotation in `UpSet`, please refer to `UpSet`.
#
upset_top_annotation = function(m, 
	gp = gpar(fill = "black"), 
	height = unit(ifelse(set_on_rows, 3, 2), "cm"),
	show_annotation_name = TRUE,
	annotation_name_gp = gpar(),
	annotation_name_offset = NULL,
	annotation_name_side = "left",
	annotation_name_rot = 0,
	...) {
	set_on_rows = attr(m, "set_on_rows")
	
	if(set_on_rows) {
		ha = HeatmapAnnotation("Intersection\nsize" = anno_barplot(comb_size(m), 
					border = FALSE, gp = gp, height = height, ...), 
				show_annotation_name = show_annotation_name,
				annotation_name_gp = annotation_name_gp,
				annotation_name_offset = annotation_name_offset,
				annotation_name_side = annotation_name_side,
				annotation_name_rot = annotation_name_rot)
	} else {
		ha = HeatmapAnnotation("Set\nsize" = anno_barplot(set_size(m), border = FALSE, 
					gp = gp, height = height, ...),
				show_annotation_name = show_annotation_name,
				annotation_name_gp = annotation_name_gp,
				annotation_name_offset = annotation_name_offset,
				annotation_name_side = annotation_name_side,
				annotation_name_rot = annotation_name_rot)
	}

	mode = attr(m, "mode")
	if(set_on_rows) {
		if(mode %in% c("distinct", "intersect")) {
			names(ha) = "Intersection\nsize"
		} else {
			names(ha) = "Union\nsize"
		}
	}
	return(ha)
}

# == title
# Default UpSet Right Annotation
#
# == param
# -m A combination matrix which is as same as the one for `UpSet`.
# -gp Graphic parameters for bars.
# -width Width of the right annotation.
# -show_annotation_name Whether show annotation names?
# -annotation_name_gp Graphic parameters for anntation names.
# -annotation_name_offset Offset to the annotation name, a `grid::unit` object.
# -annotation_name_side Side of the annotation name.
# -annotation_name_rot Rotation of the annotation name, it can only take values in ``c(00, 90, 180, 270)``.
# -... Passed to `anno_barplot`.
#
# == details
# The default right annotation is actually barplot implemented by `anno_barplot`. For
# how to set the right annotation or left annotation in `UpSet`, please refer to `UpSet`.
#
upset_right_annotation = function(m,
	gp = gpar(fill = "black"),  
	width = unit(ifelse(set_on_rows, 2, 3), "cm"),
	show_annotation_name = TRUE,
	annotation_name_gp = gpar(),
	annotation_name_offset = NULL,
	annotation_name_side = "bottom",
	annotation_name_rot = NULL,
	...) {
	set_on_rows = attr(m, "set_on_rows")

	if(set_on_rows) {
		ha = rowAnnotation("Set size" = anno_barplot(set_size(m), border = FALSE, 
					gp = gp, width = width, ...),
				show_annotation_name = show_annotation_name,
				annotation_name_gp = annotation_name_gp,
				annotation_name_offset = annotation_name_offset,
				annotation_name_side = annotation_name_side,
				annotation_name_rot = annotation_name_rot)
	} else {
		ha = rowAnnotation("Intersection\nsize" = anno_barplot(comb_size(m), 
					border = FALSE, gp = gp, width = width, ...),
				show_annotation_name = show_annotation_name,
				annotation_name_gp = annotation_name_gp,
				annotation_name_offset = annotation_name_offset,
				annotation_name_side = annotation_name_side,
				annotation_name_rot = annotation_name_rot)
	}

	mode = attr(m, "mode")
	if(!set_on_rows) {
		if(mode %in% c("distinct", "intersect")) {
			names(ha) = "Intersection\nsize"
		} else {
			names(ha) = "Union size"
		}
	}
	return(ha)
}

# == title
# Normalize a list of combination matrice
#
# == param
# -... If it is a single argument, the value should be a list of combination matrices.
#
# == details
# It normalizes a list of combination matrice to make them have same number and order of sets and combination sets.
#
# The sets (by `set_name`) from all combination matrice should be the same.
#
normalize_comb_mat = function(...) {
	lt = list(...)
	if(length(lt) == 1) {
		if(!is.list(lt)) {
			stop_wrap("If you only specify one argument, it must be a list of comb_mat objects.")
		}
		lt = lt[[1]]
	}

	n = length(lt)
	if(n == 1) {
		stop_wrap("There should be at least two combination matrices.")
	}

	set1 = set_name(lt[[1]])
	for(i in 2:n) {
		if(!setequal(set1, set_name(lt[[i]]))) {
			stop_wrap("The sets of all combination matrices should be identical.")
		}
	}
	all_set_name = set_name(lt[[1]])
	n_set = length(all_set_name)

	all_comb_size = lapply(lt, function(x) {
		set_on_rows = attr(x, "set_on_rows")
		if(set_on_rows) {
			code = apply(x, 2, binaryToInt)
		} else {
			code = apply(x, 1, binaryToInt)
		}
		structure(comb_size(x), names = code)
	})

	all_code = unique(unlist(lapply(all_comb_size, names)))
	comb_mat = do.call(cbind, lapply(as.numeric(all_code), intToBinary, len = n_set))
	rownames(comb_mat) = all_set_name

	all_comb_size = lapply(all_comb_size, function(x) {
		x2 = structure(rep(0, length(all_code)), names = all_code)
		x2[names(x)] = x
		x2
	})

	for(i in seq_along(lt)) {
		x = lt[[i]]
		set_on_rows = attr(x, "set_on_rows")
		attr = attributes(x)
		attr = attr[!names(attr) %in% c("dim", "dimname")]

		if(set_on_rows) {
			x2 = comb_mat
		} else {
			x2 = t(comb_mat)
		}
		for(nm in names(attr)) {
			attr(x2, nm) = attr[[nm]]
		}
		attr(x2, "set_size") = attr$set_size[all_set_name]
		attr(x2, "comb_size") = all_comb_size[[i]]
		
		lt[[i]] = x2
	}
	return(lt)
}

# == title
# str method
#
# == param
# -object A combination matrix returned by `make_comb_mat`.
# -... Other arguments.
#
str.comb_mat = function(object, ...) {
	class(object) = "matrix"
	cat(" A comb_mat class object\n")
	str(object)
}
zhongmicai/complexHeatmap documentation built on May 7, 2019, 6:11 a.m.