R/leaders.R

Defines functions leaders.algorithm compute.clusters compute.leaders sort.matrix.rows d1 d1.cleader d2 d2.cleader d3 d3.cleader d4 d4.cleader d5 d5.cleader d6 d6.cleader d7 d7.cleader

Documented in compute.clusters compute.leaders d1 d1.cleader leaders.algorithm sort.matrix.rows

# setwd("Users/natasa/projekti/R_package/paket/")

leaders <- function (x, centers, err.measure = "d4", as.probs = FALSE,
					iter.max = 10, method = "rnd_cls", stabil=1e-6, penalty=1e6,echo=FALSE){
	#
	# basic function, compute clusters
	# data: x, number of clusters (or centers already): centers, error measure: err.measure
	# if x is probabilities: as.probs, 
	# number of iterations for method: iter.max
        # method: for determining centers (units, rnd_cls)
	# (only considered if centers means "number of centers)
        # echo: whether iterations are written
	#
    x <- data.matrix(x)
    if (any(is.na(x)))
        stop("Missing values not allowed.")
    if (!is.numeric(x)) 
        stop("x is not a numeric dataframe or matrix.")
    x2 <- if (!as.probs) 
              x/apply(x, 1, sum)    # transform rows to probabilities
          else x

    if (iter.max < 1) 
        stop("'iter.max' must be positive.")
		
	if (any(x == 0) && err.measure == "d7") 
	    warning("x contains zeros, error measure d7 could produce empty clusters.")
	
    if (missing(centers)) 
        stop("'centers' must be a number or a matrix.")
    if (length(centers) == 1) {
        k <- centers
        iter <- TRUE
        if (nrow(unique(x2)) < k || k < 1)
            stop("Number of clusters must be in {1,2, .., n}.")
    }
    else {
    	iter <- FALSE
    	iter.max <- 1
    	centers2 <- as.matrix(centers)
    	centers <- if (!as.probs)
    		centers2/apply(centers2,1,sum)
    			else centers2
        if (any(duplicated(centers))) 
            stop("Initial centers are not distinct.")
        if (!(ncol(x2) == ncol(centers))) 
            stop("Must have same number of columns in 'x' and 'centers'.")

        k <- nrow(centers)
        if (nrow(unique(x2)) < k) 
            stop("More cluster centers than data units.")
    }
	
    res <- NULL
	for (i in 1:iter.max){
		if (iter){
		    # compute initial "leaders"
                    centers <- compute.centers(x2, k, method=method,
                                   err.measure=err.measure, penalty=penalty)
		}
        # compute clustering
        curr_res <- leaders.algorithm(x2, centers, err.measure=err.measure,
        		stabil=stabil,penalty=penalty,echo=echo)
        if (is.null(res) || res$error > curr_res$error)
        	res <- curr_res
	}

    out <- list(clustering=res$clu_vector,error=res$error,centers=res$leaders)
    #class(out) <- c("leaders","partition")
    return(out)
}

leaders.algorithm <- function(x, centers, err.measure="d4",stabil=1e-6,
					penalty=1e6,echo=FALSE){
	#
	# algorithm to get k=dim(centers)[1] clusters
	# error measure between unit and center (leader): err.measure
	# when leaders stabilize: stabil
	#
	if (is.null(rownames(x))){
		temp <- 1:dim(x)[1]
		rownames(x) <- as.character(temp)	# names for each unit
	}
	centers <- sort.matrix.rows(centers)
	k <- dim(centers)[1]
	# compute initial clusters (according to err.measure)
	curr_result <- compute.clusters(x,centers,err.measure,penalty)
	iter = 0
	stabilize = FALSE
	while (!stabilize){
		# compute new leaders
		leaders <- compute.leaders(x,k,curr_result$clu_vector,err.measure,penalty)
		leaders <- sort.matrix.rows(leaders)
                if (is.na(max(centers - leaders))){
                  stop("consider less centers! (not all clusters have units)")
                }
		# compute clustering function and check if leaders stabilized
		if (max(centers - leaders) < stabil)
			stabilize = TRUE
		else{
			# compute new clusters
			curr_result <- compute.clusters(x,leaders,err.measure,penalty)
			centers <- leaders
			iter <- iter+1
                        if (echo) {
                          print(iter)
                          print(curr_result$error)
                        }
		}
	}
	return(list(clu_vector=curr_result$clu_vector,error=curr_result$error,leaders=leaders))
}

### SUBROUTINES TO BREAK THE CODE ###

compute.clusters <- function(x, centers, err.measure, penalty=1e6){
	#
	# compute clustering vector: names: units, values: cluster numbers (1:k)
	# k: number of centers
	# calls "err.measure" function for each unit
	#
	clu_vector <- vector(mode="numeric",length=dim(x)[1])
	names(clu_vector) <- rownames(x)
	error <- 0
	for (i in 1:dim(x)[1]){
		temp_e <- eval(call(err.measure,x[i,],centers,penalty))
		clu_vector[i] <- temp_e$clu
		error <- error + temp_e$error
	}
	return(list(clu_vector=clu_vector,error=error))
}

compute.leaders <- function(x, k, curr_clusters,err.measure,penalty=1e6){
	#
	# data: x, number of clusters: k, curr_clusters: units with current clusters
	# calls "error.measure.leader" function for each cluster
	#
	leaders <- matrix(nrow=k,ncol=dim(x)[2])
	for (i in 1:k){
		cluster_units <- x[which(curr_clusters == i),]
		if (!is.matrix(cluster_units))	# only one row
			cluster_units <- t(as.matrix(cluster_units))
		leaders[i,] <- eval(call(paste(err.measure,"cleader",sep="."),cluster_units,penalty))
	}
	return(leaders)	
}

sort.matrix.rows <- function(m){
	
	df <- as.data.frame(m)
	df_sorted <- df[do.call(order,df),]
	m_sorted <- as.matrix(df_sorted)
	dimnames(m_sorted) <- dimnames(m)
	return(m_sorted)	
}

### ERROR MEASURES ###

d1 <- function(unit, centers,penalty=1e6){
	#
	# compute error term for one unit
	# return number of center and error there
	#
	possible_d <- vector(mode="numeric",length=dim(centers)[1])
	for (i in 1:dim(centers)[1]){
		temp <- (unit-centers[i,])^2
		possible_d[i] <- sum(temp) 
	}
	clu <- which.min(possible_d)
	error <- possible_d[clu]
	return(list(error=error,clu=clu))
}

d1.cleader <- function(cluster_units,...){
	#
	# compute new leader for a cluster of units
	# return leader
	#
	leader <- apply(cluster_units,2,mean)
	return(leader)
}

d2 <- function(unit, centers,penalty=1e6){
	#
	# compute error term for one unit
	# return number of center and error there
	#
	possible_d <- vector(mode="numeric",length=dim(centers)[1])
	for (i in 1:dim(centers)[1]){
		temp <- abs(unit-centers[i,])
		possible_d[i] <- sum(temp) 
	}
	clu <- which.min(possible_d)
	error <- possible_d[clu]
	return(list(error=error,clu=clu))
}

d2.cleader <- function(cluster_units,...){
	#
	# compute new leader for a cluster of units
	# return leader
	#
	leader <- apply(cluster_units,2,median)
	return(leader)
}

d3 <- function(unit, centers,penalty=1e6){
	#
	# compute error term for one unit
	# return number of center and error there
	#
	possible_d <- vector(mode="numeric",length=dim(centers)[1])
	for (i in 1:dim(centers)[1]){
		temp <- ((unit-centers[i,])/centers[i,])^2
		# solving problems of zero values (unit or centers component)
		# both 0 -> NaN becomes 0
		temp[which(is.nan(temp))] <- 0
		# centers 0 -> turn to penalty (large number)
		temp[which(is.infinite(temp))] <- penalty
		possible_d[i] <- sum(temp) 
	}
	clu <- which.min(possible_d)
	error <- possible_d[clu]
	return(list(error=error,clu=clu))
}

d3.cleader <- function(cluster_units,...){
	#
	# compute new leader for a cluster of units
	# return leader
	#
	leader <- apply(cluster_units^2,2,mean)/(apply(cluster_units,2,mean))
	leader[which(is.nan(leader))] <- 0 # define 0/0 = 0
	return(leader)
}

d4 <- function(unit, centers,penalty=1e6){
	#
	# compute error term for one unit
	# return number of center and error there
	#
	possible_d <- vector(mode="numeric",length=dim(centers)[1])
	for (i in 1:dim(centers)[1]){
		temp <- (unit-centers[i,])^2/centers[i,]
		# solving problems of zero values (unit or centers component)
		# both 0 -> NaN becomes 0
		temp[which(is.nan(temp))] <- 0
		# centers 0 -> turn to penalty (large number)
		temp[which(is.infinite(temp))] <- penalty
		possible_d[i] <- sum(temp) 
	}
	clu <- which.min(possible_d)
	error <- possible_d[clu]
	return(list(error=error,clu=clu))
}

d4.cleader <- function(cluster_units,...){
	#
	# compute new leader for a cluster of units
	# return leader
	#
	leader <- sqrt(apply(cluster_units^2,2,mean))
	return(leader)
}

d5 <- function(unit, centers,penalty=1e6){
	#
	# compute error term for one unit
	# return number of center and error there
	#
	possible_d <- vector(mode="numeric",length=dim(centers)[1])
	for (i in 1:dim(centers)[1]){
		temp <- ((unit-centers[i,])/unit)^2
		# solving problems of zero values (unit or centers component)
		# both 0 -> NaN becomes 0
		temp[which(is.nan(temp))] <- 0
		# units 0 -> turn to penalty (large number)
		temp[which(is.infinite(temp))] <- penalty
		possible_d[i] <- sum(temp) 
	}
	clu <- which.min(possible_d)
	error <- possible_d[clu]
	return(list(error=error,clu=clu))
}

d5.cleader <- function(cluster_units,penalty=1e6){
	#
	# compute new leader for a cluster of units
	# return leader
	#
	#penalty <- 1e6
	leader <- vector(mode="numeric",length=dim(cluster_units)[2])
	x <- 1/cluster_units
	for (i in 1:dim(cluster_units)[2]){
		# compute every column (leader variable) separately
		infinite <- which(is.infinite(x[,i]))
		if (length(infinite) >=1) ## CHANGE (see d6.cleader)== dim(cluster_units)[1])
			leader[i] <- 0	# all zeros
		else{
			## x[infinite,i] <- penalty # zeros get penalty
			x2 <- x[,i]^2
			leader[i] <- sum(x[,i])/sum(x2)
		}	
	}
	return(leader)
}

d6 <- function(unit, centers,penalty=1e6){
	#
	# compute error term for one unit
	# return number of center and error there
	#
	possible_d <- vector(mode="numeric",length=dim(centers)[1])
	for (i in 1:dim(centers)[1]){
		temp <- (unit-centers[i,])^2/unit
		# solving problems of zero values (unit or centers component)
		# both 0 -> NaN becomes 0
		temp[which(is.nan(temp))] <- 0
		# units 0 -> turn to penalty (large number)
		temp[which(is.infinite(temp))] <- penalty
		possible_d[i] <- sum(temp) 
	}
	clu <- which.min(possible_d)
	error <- possible_d[clu]
	return(list(error=error,clu=clu))
}

d6.cleader <- function(cluster_units,penalty=1e6){
	#
	# compute new leader for a cluster of units
	# return leader
	#
	#penalty <- 1e6
	leader <- vector(mode="numeric",length=dim(cluster_units)[2])
	x <- 1/cluster_units
	for (i in 1:dim(cluster_units)[2]){
		# compute every column (leader variable) separately
		infinite <- which(is.infinite(x[,i]))
		if (length(infinite) >= 1) ## CHANGE (the case of 1 unit zero, leader should be 0)== dim(cluster_units)[1])
			leader[i] <- 0	# all zeros
		else{
			## x[infinite,i] <- penalty # zeros get penalty
			leader[i] <- 1/mean(x[,i])
		}	
	}
	return(leader)
}

d7 <- function(unit, centers,penalty=1e6){
	#
	# compute error term for one unit
	# return number of center and error there
	#
	possible_d <- vector(mode="numeric",length=dim(centers)[1])
	for (i in 1:dim(centers)[1]){
		temp <- (unit-centers[i,])^2/(unit*centers[i,])
		# solving problems of zero values (unit or centers component)
		# both 0 -> NaN becomes 0
		temp[which(is.nan(temp))] <- 0
		# units 0 -> turn to penalty (large number)
		temp[which(is.infinite(temp))] <- penalty
		possible_d[i] <- sum(temp) 
	}
	clu <- which.min(possible_d)
	error <- possible_d[clu]
	return(list(error=error,clu=clu))
}

d7.cleader <- function(cluster_units,penalty=1e6){
	#
	# compute new leader for a cluster of units
	# return leader
	#
	#penalty <- 1e6
	leader <- vector(mode="numeric",length=dim(cluster_units)[2])
	x <- 1/cluster_units
	for (i in 1:dim(cluster_units)[2]){
		# compute every column (leader variable) separately
		infinite <- which(is.infinite(x[,i]))
		if (length(infinite) >= 1) ## CHANGE (see d6.cleader) == dim(cluster_units)[1])
			leader[i] <- 0	# all zeros
		else{
			## x[infinite,i] <- penalty # zeros get penalty
			leader[i] <- sqrt(mean(cluster_units[,i])/mean(x[,i]))
		}	
	}
	return(leader)
}

# GLAVNI PROGRAM - TESTIRANJE
#x <- read.table("test.txt",sep=" ",row.names = 1)
#centers <- x[1:3,]
#centers <- 3
#tt <- system.time(c <- leaders(x, centers=centers)) # izpisujejo se iteracije in trenutna napaka
#print(tt)
#print(c)

#[1] 41 - iterations
#[1] 2313.275
#   user  system elapsed 
# 31.432   0.299  31.808 

#[1] 32 - iterations
#[1] 2301.801
#   user  system elapsed 
# 24.638   0.217  24.873 

Try the clustDDist package in your browser

Any scripts or data that you put into this service are public.

clustDDist documentation built on May 2, 2019, 6:47 p.m.