R/cluster.r

#' Data-clustering
#'
#' This package contains methods, which enables clustering in dataframes. Particularly useful for bio-mathematics, cognitive sciences, etc.
#'
#' \code{cd <- clusterby::clusterdataframe(tib)}
#' \code{cd$build(...)}
#' \code{cd$summarise(...)}
#' \code{cd$get('original', ...)}
#' \code{cd$get('clusters', summary=<lgl>, ...)}
#'
#' @param tib Tibble/Dataframe to be clustered. Method also possible with vectors.
#' @param by string vector. Specifies the column(s) for geometric data, according to which the clusters are to be built.
#' @param filter.by string vector. Defaults to \code{c()}. Specificies columns, by which data is to be preliminarily divided into groups, within which the clusters are to be built.
#' @param keep string vector. Defaults to \code{c()}. Specificies columns, which should be kept when using the $get.
#' @param near symmetric function in two arguments. This function operates pairs of entries in the columns with geometric data and returns \code{TRUE}/\code{FALSE} if entries are near. Defaults to a Manhattan metric.
#' @param min.dist a real number. Defaults to \code{0}. If the default manhattan metric is used for \code{near}, this is the minimum tolerated distance between geometric data.
#' @param max.dist a real number. Defaults to \code{Inf}. If the default manhattan metric is used for \code{near}, this is the maximum tolerated distance between geometric data.
#' @param strict boolean. Defaults to \code{FALSE}. If the default manhattan metric is used for \code{near}, this sets the proximity to be a strict \code{< dist} or else \code{<= dist}.
#' @param cluster.name string. Defaults to \code{'cluster'}. Running \code{tib \%>\% clusterby(...)} returns a data frame, which extends \code{tib} by 1 column with this name. This column tags the clusters by a unique index.
#' @param min.size a natural number. Defaults to \code{1}. If a cluster has fewer elements as this, it will not be viewed as a cluster.
#' @param max.size a natural number. Defaults to \code{Inf}, determining the maximum allowable size of a cluster.
#' @param split boolean. Defaults to \code{FALSE}. If set to \code{TRUE}, then the output will be group the tibble data by cluster (equivalent to performing \code{\%>\% group_by(...)}).
#' @param is.lexical boolean. Defaults to \code{TRUE} if \code{length(by)=1}, otherwise to \code{FALSE}. If set to \code{TRUE}, then the geometry is assumed to be linear and endowed with a simple difference-metric. This allows for faster computation.
#' @param no.overlaps boolean. Defaults to \code{FALSE}. If set to \code{TRUE} in combination with \code{is.lexical=TRUE}, then the clusters must occupy intervals that do not overlap.
#' @param summary boolean. Defaults to \code{FALSE}. If set to \code{TRUE} in combination with \code{is.lexical=TRUE} and **assuming** the user has presorted the data by the \code{by}-column, then a summary of the clusters as intervalls is provided. This makes most sense, if \code{no.overlaps=TRUE}. This produces the columns \code{filter.by, by, pstart, pend, nstart, nend, n} where \code{pstart}, \code{pend} describes the interval, \code{nstart}, \code{nend} provides the original indices in the input data, and \code{n} is the cluster size (number of points).
#' @param as.interval boolean. Defaults to \code{TRUE} if \code{is.lexical=TRUE} and \code{no.overlaps=TRUE}, otherwise defaults to \code{FALSE}. If \code{TRUE} and, then summaries provide information as interval end points. If \code{FALSE}, then summaries are provided as lists.
#'
#' @export clusterdataframe
#'
#' @examples cdf <- clusterby::clusterdataframe(gene);
#' @examples cdf$build(by='position', filter.by=c('gene','actve'), min.size=4, max.dist=400, strict=TRUE, is.lexical=TRUE, no.overlaps=TRUE);
#' @examples cdf <- clusterby::clusterdataframe(protein3d);
#' @examples cdf$build(by=c('x','y','z'), filter.by='celltype', max.dist=5.8e-7, cluster.name='segment');
#' @examples cdf <- clusterby::clusterdataframe(soil_data);
#' @examples cdf$build(by=c('x','y'), filter.by=c('density','substance'), max.dist=10e-3, cluster.name='clump');
#' @examples data <- cdf$get('clusters');
#' @examples tib <- cdf$get('clusters', keep=c('colour','age'));
#' @examples tib <- cdf$get('clusters', summary=FALSE);
#' @examples tib_summ <- cdf$get('clusters', summary=TRUE, as.interval=TRUE);
#' @examples tib_summ <- cdf$get('clusters', summary=TRUE, as.interval=FALSE);
#'
#' @keywords cluster clustering gene



clusterdataframe <- setRefClass('clusterdataframe',
	fields = list(
		is.clustered='logical',
		is.lexical='logical',
		index.name='character',
		cluster.name='character',
		cluster.loc='ANY',
		cluster.group.by='ANY',
		cluster.by='ANY',
		cols.keep='ANY',
		cols.keep.set='logical',
		data='ANY',
		data.cols='ANY'
	),
	methods = list(
		initialize = function(...) {
			INPUTVARS = list(...);
			if(length(INPUTVARS) >= 1) {
				.self$data <- INPUTVARS[[1]];
				.self$data.cols <- names(.self$data);
			} else {
				.self$data <- tibble::as_tibble(list());
				.self$data.cols <- c();
			}

			.self$index.name <- 'index';
			.self$cluster.name <- 'cluster';
			for(key in c(
				'cluster.loc',
				'cluster.by',
				'cluster.group.by',
				'cols.keep'
			)) .self[[key]] <- c();
			.self$cols.keep.set <- FALSE;
			.self$is.clustered <- FALSE;
			.self$is.lexical <- FALSE;
		},
		get = function(key, ...) {
			if(key %in% c(
				'is.clustered',
				'is.lexical',
				'index.name',
				'cluster.name',
				'cluster.loc',
				'cluster.by',
				'cluster.group.by',
				'cols.keep',
				'cols.keep.set'
			)) return(.self[[key]]);

			clustername <- .self$cluster.name;
			indexname <- .self$index.name;

			if(key == 'original') {
				tib <- .self$data;
				if(.self$is.clustered) return(tib %>% dplyr::select(-c(clustername, indexname)));
				return(tib);
			}

			if(key == 'clusters') {
				if(!.self$is.clustered) return(tibble::as_tibble(list()));

				INPUTVARS = list(...);
				summary <- INPUTVARS[['summary']];
				if(!is.logical(summary)) summary <- FALSE;

				by <- .self$cluster.by;
				indexname <- .self$index.name;
				keep <- INPUTVARS[['cols.keep']];
				if(is.character(keep)) .self$setkeep(keep);

				if(summary) {
					as_interval <- INPUTVARS[['as.interval']];
					with_pts <- INPUTVARS[['with.points']];
					sep <- INPUTVARS[['sep']];
					with_index <- INPUTVARS[['with.index']];
					with_keys <- INPUTVARS[['with.keys']];
					with_braces <- INPUTVARS[['with.braces']];
					if(!is.logical(as_interval)) as_interval <- .self$is.lexical;
					if(!is.logical(with_pts)) with_pts <- (length(by) == 1);
					if(!is.character(sep)) sep <- ';';
					if(!is.logical(with_index)) with_index <- TRUE;
					if(!is.logical(with_keys)) with_keys <- FALSE;
					if(!is.logical(with_braces)) with_braces <- FALSE;

					if(as_interval) {
						if(.self$is.lexical && with_pts) {
							if(with_index) {
								tib <- .self$summarise(
									'p.start' = list(col=by, method='lex:min', sep=sep, with.keys=with_keys, with.braces=with_braces),
									'p.end' = list(col=by, method='lex:max', sep=sep, with.keys=with_keys, with.braces=with_braces),
									'distance' = list(col=by, method='lex:range', mode='distance', sep=sep, with.keys=with_keys, with.braces=with_braces),
									'n.start' = list(col=indexname, method='min'),
									'n.end' = list(col=indexname, method='max'),
									'size' = list(col=indexname, method='length')
								);
							} else {
								tib <- .self$summarise(
									'p.start' = list(col=by, method='lex:min', sep=sep, with.keys=with_keys, with.braces=with_braces),
									'p.end' = list(col=by, method='lex:max', sep=sep, with.keys=with_keys, with.braces=with_braces),
									'distance' = list(col=by, method='lex:range', mode='distance', sep=sep, with.keys=with_keys, with.braces=with_braces)
								);
							}
						} else {
							tib <- .self$summarise(
								'n.start' = list(col=indexname, method='min'),
								'n.end' = list(col=indexname, method='max'),
								'size' = list(col=indexname, method='length')
							);
						}
					} else {
						if(with_pts) {
							if(with_index) {
								tib <- .self$summarise(
									'positions' = list(col=by, method='list:points', sep=sep, with.keys=with_keys, with.braces=with_braces),
									'index' = list(col=indexname, method='list', sep=sep, with.braces=with_braces),
									'size' = list(col=indexname, method='length')
								);
							} else {
								tib <- .self$summarise(
									'positions' = list(col=by, method='list:points', sep=sep, with.keys=with_keys, with.braces=with_braces)
								);
							}
						} else {
							tib <- .self$summarise(
								'index' = list(col=indexname, method='set', sep=sep, with.braces=with_braces),
								'size' = list(col=indexname, method='length')
							);
						}
					}

					return(tib);
				} else {
					tib <- .self$data;

					if(!.self$is.clustered) return(tib)

					cols <- .self$data.cols;
					by <- .self$cluster.by;
					group_by <- .self$cluster.group.by;
					keep <- .self$cols.keep;
					sel <- .self$cluster.loc;
					print(sel);

					if(!.self$cols.keep.set) keep <- cols;
					keep <- unique(c(clustername, group_by, by, keep));

					return(tib[sel, ] %>% dplyr::select(keep));
				}
			}

			return(NULL);
		},
		setkeep = function(keep) {
			cols <- .self$data.cols;
			keep <- keep[which(keep %in% cols)];
			.self$cols.keep <- keep;
			.self$cols.keep.set <- TRUE;
		},
		groupby = function(only_clusters=TRUE) {
			if(!.self$is.clustered) return(tibble::as_tibble(list()));
			tib <- .self$data;
			clustername <- .self$cluster.name;
			indexname <- .self$index.name;
			if(only_clusters) {
				sel <- .self$cluster.loc;
				tib <- tib[sel, ];
			}
			return(tib %>% group_by_at(clustername));
		},
		build = function(...) {
			obj <- .self$data %>% buildclusters____(...);
			for(key in c(
				'is.lexical',
				'index.name',
				'cluster.name',
				'cluster.loc',
				'cluster.by',
				'cluster.group.by',
				'data'
			)) .self[[key]] <- obj[[key]];
			.self$is.clustered <- TRUE;
		},
		summarise = function(...) {
			if(!.self$is.clustered) return(tibble::as_tibble(list()));

			INPUTVARS <- list(...);
			summcols <- names(INPUTVARS);

			clustername <- .self$cluster.name;
			group_by <- .self$cluster.group.by;
			keep <- .self$cols.keep;
			cols <- .self$data.cols;
			if(!.self$cols.keep.set) keep <- c();
			keep <- unique(c(group_by, keep));
			keep <- keep[which(!(keep %in% summcols))];

			instructions <- list();

			for(col in keep) instructions[[col]] <- list(
				'col'=col,
				'method'='pick'
			);

			for(col in summcols) {
				s <- INPUTVARS[[col]];
				instructions[[col]] <- s;
				if(is.list(s)) next;
				instructions[[col]] <- list(
					'col'=col,
					'method'=s
				);
			}

			method <- list();
			k <- 1;
			fnames <- c();
			colnames <- c();

			n <- 0;
			for(col in c(keep,summcols)) {
				if(col == clustername) next;

				s <- instructions[[col]];
				f <- (function(s, col) {
					f <- NULL;
					vars <- s[['col']];
					m <- s[['method']];
					d <- length(vars);

					mode <- s[['mode']];
					if(!is.character(mode)) mode <- '';
					sep <- ';';
					if('sep' %in% names(s)) sep <- s[['sep']];
					with_braces <- FALSE;
					if('with.braces' %in% names(s)) with_braces <- s[['with.braces']];
					with_keys <- FALSE;
					if('with.keys' %in% names(s)) with_keys <- s[['with.keys']];

					lbrace <- '';
					rbrace <- '';
					if(with_braces) {
						lbrace <- '[';
						rbrace <- ']';
					}

					llbrace <- '';
					rrbrace <- '';
					if(d > 1) {
						llbrace <- '[';
						rrbrace <- ']';
					}

					if(with_keys) {
						inner_to_json <- function(x) {return(jsonlite::toJSON(x));};
					} else {
						inner_to_json <- function(x) {return(paste0(llbrace,paste(x, collapse=sep),rrbrace));};
					}

					if(is.function(m)) {
						f <- m;
					} else if(is.character(m)) {
						opt <- m[1];
						if(opt == 'pick') {
							f <- function(x) {return(x[1]);};
						} else if(opt == 'set') {
							f <- function(x) {return(paste0(lbrace,paste(unique(x), collapse=sep),rbrace));};
						} else if(opt == 'list') {
							f <- function(x) {return(paste0(lbrace,paste(x, collapse=sep),rbrace));};
						} else if(opt == 'list:points') {
							if(with_keys) {
								f <- function(...) {
									pts <- list(...);
									nom <- sapply(c(1:length(pts)), function(i) {return(paste0('c',i));});
									names(pts) <- nom;
									pts <- lexsort____(tibble::as_tibble(pts), nom);
									names(pts) <- vars;
									return(jsonlite::toJSON(pts));
								};
							} else {
								f <- function(...) {
									pts <- list(...);
									nom <- sapply(c(1:length(pts)), function(i) {return(paste0('c',i));});
									names(pts) <- nom;
									pts <- lexsort____(tibble::as_tibble(pts), nom);
									n <- nrow(pts);
									return(paste0(lbrace, paste(sapply(c(1:n), function(i) {
											return(inner_to_json(pts[i, ]));
									}), collapse=sep), rbrace));
								};
							}
						} else if(opt == 'json') {
							f <- jsonlite::toJSON;
						} else if(opt == 'json:set') {
							f <- function(x) {return(jsonlite::toJSON(unique(x)));};
						} else if(opt == 'length') {
							f <- length;
						} else if(opt == 'min') {
							f <- function(x) {return(min(x, na.rm=TRUE));};
						} else if(opt == 'max') {
							f <- function(x) {return(max(x, na.rm=TRUE));};
						} else if(opt == 'range') {
							if(mode == 'distance') {
								f <- function(x) {
									a <- min(x, na.rm=TRUE);
									b <- max(x, na.rm=TRUE);
									return(b - a);
								};
							} else {
								f <- function(x) {
									a <- min(x, na.rm=TRUE);
									b <- max(x, na.rm=TRUE);
									return(paste0('[',a,';',b,']'));
								};
							}
						} else if(opt == 'lex:min') {
							f <- function(...) {
								pts <- list(...);
								nom <- sapply(c(1:length(pts)), function(i) {return(paste0('c',i));});
								names(pts) <- nom;
								pts <- lexsort____(tibble::as_tibble(pts), nom);
								names(pts) <- vars;
								pt <- pts[1, ];
								return(inner_to_json(pt));
							};
						} else if(opt == 'lex:max') {
							f <- function(...) {
								pts <- list(...);
								nom <- sapply(c(1:length(pts)), function(i) {return(paste0('c',i));});
								names(pts) <- nom;
								pts <- lexsort____(tibble::as_tibble(pts), nom);
								names(pts) <- vars;
								pt <- pts[nrow(pts), ];
								return(inner_to_json(pt));
							};
						} else if(opt == 'lex:range') {
							if(mode == 'distance') {
								f <- function(...) {
									pts <- list(...);
									nom <- sapply(c(1:length(pts)), function(i) {return(paste0('c',i));});
									names(pts) <- nom;
									pts <- lexsort____(tibble::as_tibble(pts), nom);
									names(pts) <- vars;
									a <- pts[1,];
									b <- pts[nrow(pts),];
									d <- b-a;
									return(inner_to_json(d));
								};
							} else {
								if(with_keys) {
									f <- function(...) {
										pts <- list(...);
										nom <- sapply(c(1:length(pts)), function(i) {return(paste0('c',i));});
										names(pts) <- nom;
										pts <- lexsort____(tibble::as_tibble(pts), nom);
										names(pts) <- vars;
										a <- pts[1,];
										a <- inner_to_json(a);
										b <- pts[nrow(pts),];
										b <- inner_to_json(b);
										return(paste0('[',a,',',b,']'));
									};
								} else {
									f <- function(...) {
										pts <- list(...);
										nom <- sapply(c(1:length(pts)), function(i) {return(paste0('c',i));});
										names(pts) <- nom;
										pts <- lexsort____(tibble::as_tibble(pts), nom);
										names(pts) <- vars;
										a <- pts[1,];
										b <- inner_to_json(a);
										b <- pts[nrow(pts),];
										b <- inner_to_json(b);
										return(paste0(lbrace,a,sep,b,rbrace));
									};
								}
							}
						} else if(opt == 'mean') {
							f <- function(x) {return(mean(x, na.rm=TRUE));};
						} else if(opt == 'var') {
							f <- function(x) {return(var(x, na.rm=TRUE));};
						} else if(opt == 'sd') {
							f <- function(x) {return(sd(x, na.rm=TRUE));};
						} else {
							# f <- function(x) {return(NA);};
						}
					}

					return(f);
				})(s, col);

				if(is.null(f)) next;

				method[[k]] <- f;
				vnames <- paste(s[['col']], collapse=',');
				fnames[k] <- paste0('method[[',k,']](',vnames,')');
				colnames[k] <- col;
				if(col == 'size') k0 <- k;
				k <- k + 1;
				n <- n + 1;
			}

			tib <- .self$groupby(TRUE);
			tib_summ <- list();
			tib_summ[[clustername]] <- c(NA);
			if(n > 0) {
				for(k in c(1:n)) {
					col <- colnames[k];
					args <- setNames(fnames[k], col);
					tib_summ_ <- tib %>% dplyr::summarise_(.dots=args);
					tib_summ[[col]] <- tib_summ_[[col]];
					if(k == n) tib_summ[[clustername]] <- tib_summ_[[clustername]];
				}
			}

			return(tibble::as_tibble(tib_summ));
		}
	)
);




buildclusters____ <- function(tib, ...) {
	INPUTVARS <- list(...);
	VARNAMES <- names(INPUTVARS);

	group_by <- INPUTVARS[['filter.by']];
	by <- INPUTVARS[['by']];
	clustername <- INPUTVARS[['cluster.name']];
	min_cluster_size <- INPUTVARS[['min.size']];
	max_cluster_size <- INPUTVARS[['max.size']];
	near <- INPUTVARS[['near']];
	d_min <- INPUTVARS[['min.dist']];
	d_max <- INPUTVARS[['max.dist']];
	strict <- INPUTVARS[['strict']];
	is_lexical <- INPUTVARS[['is.lexical']];
	no_overlaps <- INPUTVARS[['no.overlaps']];

	tib <- tibble::as_tibble(tib);
	cols <- names(tib);

	dim_by <- length(by);
	if(!is.vector(group_by)) group_by <- c();
	if(!is.logical(is_lexical)) is_lexical <- (dim_by == 1);
	if(!is.logical(no_overlaps)) no_overlaps <- FALSE;
	if(!is.numeric(min_cluster_size)) min_cluster_size <- 1;
	if(!is.numeric(max_cluster_size)) max_cluster_size <- Inf;
	if(!is.numeric(d_min)) d_min <- 0;
	if(!is.numeric(d_max)) d_max <- Inf;
	if(is_lexical) near <- 'lexical';
	if(!is.logical(strict)) strict <- FALSE;
	metric_lex <- function(x, y) {
		d <- abs(y-x);
		i <- min(which(d>0),dim_by);
		return(d[i])
	};
	if(!is.function(near)) {
		if(!is.character(near)) near <- 'Manhattan';
		if(near == 'lexical') {
			metric <- metric_lex;
		} else if(near == 'Euclidean') {
			metric <- function(x, y) {return(sqrt(sum((x-y)^2)))};
		} else {#if(near == 'Manhattan') {
			metric <- function(x, y) {return(max(abs(x-y)))};
		}
		if(strict) {
			near <- function(x, y) {d <- metric(x,y); return(d_min <= d && d < d_max);};
		} else {
			near <- function(x, y) {d <- metric(x,y); return(d_min <= d && d <= d_max);};
		}
	}


	## Erstellung von Spaltennamen (Klusterspalte + Pufferspalte):
	n <- nrow(tib);
	if(!is.character(clustername)) clustername <- uniquecolumnname____('cluster', cols);
	chunkname <- uniquecolumnname____('chunk', c(cols, clustername));
	indexname <- uniquecolumnname____('index', c(cols, clustername));


	## Definiere Typen von Spalten
	types <- list()
	types[[indexname]] <- 'integer';
	types[[clustername]] <- 'integer';


	## Indexnamen bevor Verarbeitung speichern.
	tib <- tib %>% tibble::add_column(!!(indexname):=c(1:n), !!(clustername):=rep(NA, n));
	## Indizes für Präsortierung hinzufügen.
	groupname <- uniquecolumnname____('group', c(cols, clustername));
	if(is_lexical) tib <- tib %>% lexsort____(group_by);
	tib <- tib %>% group_by_at(group_by) %>% nest(.key=!!(chunkname));
	Ng <- nrow(tib);
	tib <- tib %>% tibble::add_column(!!(groupname):=c(1:Ng));
	tib <- tib %>% unnest();


	## HAUPTMETHODE
	if(is_lexical) {
		presortcols <- c(groupname, by);
		## wenn Überschneidungen nicht zulässig sind, dann sortiere nur nach Position: Klusters von verschiedenen Typen können einander blockieren.
		if(no_overlaps) presortcols <- by;
		tib <- tib %>% lexsort____(presortcols);

		i0 <- 1;
		cl <- 1;
		while(i0 <= n) {
			## Iteriere bis Gruppe sich ändert, oder nächster Punkt zu weit weg vom Kluster liegt.
			g <- tib[i0, groupname][[1]];
			pt0 <- tib[i0, by];
			pt1 <- pt0;
			i1 <- i0 + 1;
			while(i1 <= n) {
				g_ <- tib[i1, groupname][[1]];
				p2 <- tib[i1, by];
				if(!(g == g_) || !near(pt1, p2)) break;
				pt1 <- p2;
				i1 <- i1 +  1;
			}
			i1 <- i1 - 1;

			## Fasse als Kluster zusammen.
			sz <- i1-i0+1;
			if(min_cluster_size <= sz && sz <= max_cluster_size) {
				dsel <- c(i0:i1);
				tib[dsel, clustername] <- cl;
				cl <- cl + 1;
			}
			i0 <- i1 + 1;
		}
	} else {
		tib <- tib %>% lexsort____(groupname);
		cl <- 1;
		for(g in c(1:Ng)) {
			ind <- which(tib[[groupname]] == g);
			chunk <- tib[ind, ];
			n <- nrow(chunk);
			if(n < min_cluster_size) next;

			## Graphen mit Kanten erstellen, wenn Punkte „nahe“ liegen.
			pts <- list(); for(j in c(1:n)) pts[[j]] <- chunk[j, by];
			edges <- lapply(c(1:n), function(j) {
				e <- c();
				if(j < n) {
					pt <- pts[[j]];
					bool <- lapply(pts, function(pt_) {
						return(near(pt, pt_));
					});
					e <- which(unlist(bool));
				}
				if(length(e) == 0) e <- c(NA);
				return(e);
			});

			## Erzeuge Kluster aus Kanten.
			clusters <- generateconnectedcomponents____(edges, min_cluster_size, max_cluster_size, cl);
			dsel <- which(!is.na(clusters));
			if(length(dsel) > 0) cl <- max(clusters[dsel]) + 1;
			tib[ind, clustername] <- clusters;
		}
	}


	## Reinige Typen.
	for(col in names(types)) {
		typ <- types[[col]];
		if(typ == 'integer') tib[ , col] <- as.integer(tib[[col]]);
	}

	## ursp. Reihenfolge wiederherstellen.
	tib <- tib %>% dplyr::arrange_(indexname) %>% dplyr::select(-c(groupname));
	sel <- which(!is.na(tib[[clustername]]));

	return(list(
		is.lexical=is_lexical,
		index.name=indexname,
		cluster.name=clustername,
		cluster.loc=sel,
		cluster.by=by,
		cluster.group.by=group_by,
		data=tib
	));
};


## LOKALE METHODEN
generateconnectedcomponents____ <- function(edges, min_sz, max_sz, key0) {
	n <- length(edges);
	classes <- rep(NA,n);
	if(n == 0) return(classes);

	key <- key0;
	ind <- c(1:n);
	while(length(ind) > 0) {
		i <- ind[1];
		ind <- ind[-1];
		nodes = c(i);
		children <- nodes;

		while(TRUE) {
			e <- edges[children];
			if(length(e) == 1) {
				grandchildren <- e[[1]];
			} else {
				grandchildren <- apply(cbind(edges[children]), 2, unlist)[,1];
			}
			if(length(grandchildren) == 0) break;
			grandchildren <- unique(grandchildren);
			children <- grandchildren[which(!is.na(grandchildren))];
			filt <- which(children %in% ind);
			if(length(filt) == 0) break;
			children <- children[filt];
			nodes <- c(nodes, children);
			ind <- ind[!(ind %in% children)];
		}

		if(length(nodes) < min_sz) next;
		nodes <- sort(nodes);
		while(length(nodes) >= max_sz) {
			classes[nodes[c(1:max_sz)]] <- key;
			key <- key + 1;
			nodes <- nodes[-c(1:max_sz)];
		}
		if(length(nodes) < min_sz) next;
		classes[nodes] <- key;
		key <- key + 1;
	}

	return(classes);
};

uniquecolumnname____ <- function(nom, cols) {
	col <- nom;
	while(col %in% cols) col <- paste0(col,'_');
	return(col);
};

lexsort____ <- function(tib, nom) {
	## data %>% arrange_(c(#,#,...,#)); funktioniert nicht.
	expr <- paste0("tib[with(tib, order(",paste(nom, collapse=','),")), ]");
	return(eval(parse(text=expr)));
};
RLogik/clusterby documentation built on May 5, 2019, 12:28 p.m.