R/DSC_tNN_fast.R

Defines functions DSC_tNN_fast get_microclusters.DSC_tNN_fast get_microweights.DSC_tNN_fast get_macroclusters.DSC_tNN_fast get_macroweights.DSC_tNN_fast microToMacro.DSC_tNN_fast serialize serialize.DSC_tNN_fast plot.DSC_tNN_fast

tNN_fast <- setRefClass("tNN_fast",
    fields = list(
    ### parameters (micro-clustering)
    r			= "numeric",
    measure			= "character",
    lambda			= "numeric",
    decay_interval		= "integer",
    ### noise: min. weight for micro-clusters given as a 
    ### percentile of the total weight of the clustering (i.e.,
    ### noise% of the data points is considered noise)
    noise			= "numeric", 
    
    ### used internally
    distFun			= "ANY",
  	decay_factor		= "numeric",
  	debug			= "logical",
  	
		### data
		weights			= "numeric",
		total_weight		= "numeric",
		npoints			= "integer",
    rel = "ANY",
    flann = "ANY",
    nclusters = "integer",
    columns = "integer",
		
		### Macro-clustering
		macro			= "logical",	# do macro?
		### alpha: intersection factor (area of the intersection)
		alpha			= "numeric",
		### minweights: min. weight for macro-clusters 	
		minweight		= "numeric",
		### k: number of macro-clusters (alternative to )
		k			= "numeric",
    
    #used for serializing
    sflann = "ANY",
    srelation = "ANY"
    
		),


	methods = list(
		initialize = function(
			r		= 0.1,
			k		= 0,
			lambda		= 1e-3,
			decay_interval  = 1000L,
			minweight	= 0.1,
			noise		= 0.01,
			alpha 		= 0.25,
			measure		= "Euclidean",
			macro		= TRUE
			) {

		    r			<<- r
		    lambda		<<- lambda
		    decay_interval	<<- decay_interval
		    decay_factor	<<- 2^(-lambda*decay_interval)
		    minweight		<<- minweight
		    noise		<<- noise
		    alpha		<<- alpha
		    measure		<<- measure
		    macro		<<- macro
        rel <<- .Call("CreateRelations", PACKAGE="stream")
      
        if(is.null(k))
    	    k		<<- 0
		    else
			    k		<<- k

		    weights		<<- numeric()
		    total_weight	<<- 0
		    npoints		<<- 0L
        flann <<- NULL
		    nclusters <<- 0L

		    distFun		<<- pr_DB[[measure]]

		    .self
		  }

		),
	)



  tNN_fast$methods(cluster = function(newdata, debug = FALSE) {
      'Cluster new data.' ### online help
      newdata <- as.data.frame(newdata)
  
  	  if(debug) cat("Debug cluster for tNN_fast!\n")
      
      for(i in 1:nrow(newdata)) {
  	    npoints <<- npoints + 1L
        if(debug && !i%%100) cat("Processed",i,"points\n")
  
  		  ### decay and remove clusters
  		  if(decay_factor<1 && !npoints%%decay_interval) {
  		    #decrease weight for microclusters
  		    weights <<- weights * decay_factor
  		    total_weight <<- total_weight * decay_factor
          weight_remove <- .5
  		    remove <- which(weights <= weight_remove)
          
          if(length(remove)>0) {
  			    #remove microclusters
  			    removeKeys <- as.integer(names(weights)[remove])
  			    weights <<- weights[-remove]
            
            
            #TODO: set serialize variables to null
            #TODO: throw "you should serialize error"
  			    deserialize()
  			    .Call("RemovePoints",flann, removeKeys, PACKAGE="stream")
  			    .Call("DeleteNodes", rel, removeKeys, srelation, PACKAGE="stream")
          }
  
  		    ### decay and remove weak relations
  		    if(macro) {  
  		      deserialize()
            .Call("AgeRelations", rel, alpha, srelation, PACKAGE="stream")
          }
  		  }
  
    		### process new point
    		point <- newdata[i,]
    		
    		total_weight <<- total_weight +1
        
        if(length(weights)==0) {
  		    #create first micro-cluster
  		    weights <<- 1
          names(weights) <<- "0"
          columns <<- length(as.numeric(point))
          nclusters <<- nclusters + 1L
  		    flann <<- .Call("CreateCenters",as.numeric(point), PACKAGE="stream")
  		  } else {
  		    deserialize()
  		    inside <- .Call("RadiusSearch",flann,as.numeric(point),r^2,weights, PACKAGE="stream")
          inside <- inside[,1]

        if(length(inside)<1) { ### new cluster
  			  weights <<- c(weights, 1)
          names(weights) <<- c(names(weights[1:length(weights)-1]),nclusters)
  			  deserialize()
  			  .Call("AddPoint",flann,as.numeric(point),columns, PACKAGE="stream")
  			  if(debug) cat("  + Creating Cluster", nclusters, "\n")
  			  nclusters <<- nclusters + 1L;


		    }else{ ### update existing cluster

		  	  partialweight <- 1/length(inside) 
		  	  
          
			    weights[as.character(inside)] <<- weights[as.character(inside)] + partialweight

			    if(macro && length(inside)>1) {
            .Call("AddRelations", rel, inside, srelation, PACKAGE="stream")
          }
		    }
		  }	   
    }
  },
                   
                   
                   ###########################################################################
                   ### helpers
                   
                   serialize = function() {
                     sflann <<- as.matrix(.Call("GetAllPoints",flann,nclusters,columns, PACKAGE="stream"))
                     srelation <<- .Call("GetRelations",rel, PACKAGE="stream")
                   },
                   
                   deserialize = function() {
                     flann <<- .Call("DeserializeFlann",flann,sflann, PACKAGE="stream")
                     rel <<- .Call("DeserializeRelations",rel,srelation, PACKAGE="stream")
                     sflann <<- NULL
                     srelation <<- NULL
                   },
                   
                   # find strong MCs
                   strong_mcs = function(weak=FALSE) {
                     o <- order(weights, decreasing=FALSE)
                     
                     # first element represents weight of already deleted MCs!
                     cs <- cumsum(c(total_weight-sum(weights), weights[o]))
                     
                     if(weak)
                       as.integer(names(weights)[o[(cs < total_weight*noise)[-1]]])
                     else  
                       as.integer(names(weights)[o[(cs >= total_weight*noise)[-1]]])
                   },
                   
                   
                   ### FIXME: this is not exported yet
                   get_connectivity = function(matrix=FALSE) {
                     mc_weights <- weights
                     mcs <- as.integer(names(weights))
                     
                     deserialize()
                     relations <- .Call("GetRelations",rel,srelation, PACKAGE="stream")
                     
                     reltemp <- as.matrix(relations[,c(1,2)])
                     reltemp <-  matrix(match(reltemp, mcs), ncol=2) ### translate from names to index
                     val <- relations[,3]
                     
                     if(nrow(reltemp) <1) return(matrix(nrow=0, ncol=0))
                     
                     avg_weight <- apply(reltemp, MARGIN=1, FUN= function(x) mean(mc_weights[x]))
                     
                     ### similarity
                     ss <- val/avg_weight
                     ### create a distance
                     
                     ### unconnected is 2 times the largest distance
                     s <- matrix(0, ncol=length(mcs), nrow=length(mcs))
                     rownames(s) <- mcs
                     colnames(s) <- mcs
                     
                     for(i in 1:nrow(reltemp)) {
                       s[reltemp[i,1], reltemp[i,2]] <- ss[i]
                       s[reltemp[i,2], reltemp[i,1]] <- ss[i]
                     }
                     
                     strong <- .self$strong_mcs()
                     
                     s <- s[as.character(strong),as.character(strong)]
                     if(!matrix) s <- as.simil(s)
                     s
                   },
                   
                   get_membership_weights = function() {
                     s <- .self$get_connectivity()
                     
                     l <- list(description = "tNN_fast", RObj = .self)
                     class(l) <- c("DSC_tNN_fast", "DSC_Micro", "DSC_R", "DSC")
                     
                     
                     if(nrow(s)<2) assignment <- 1:nclusters(l, type="micro")
                     else if(alpha>0) { ### use alpha
                       s[s < alpha] <- 0
                       s[s>0] <- 1
                       d <- 1-s
                       assignment <- cutree(hclust(d, method="single"), h=.5)
                     }else{ ### use k
                       if(alpha<0) warning("You need to specify at leasy alpha or k!")
                       d <- 1/(1+s)
                       
                       ### FIXME: If k>number of connected components then components would
                       ###  be merged randomly! So we add for these the redular distance!
                       
                       d2 <- dist(get_centers(l, type="micro"), method=distFun) 
                       unconnected <- d==1
                       d[unconnected] <- d[unconnected] + d2[unconnected]
                       
                       assignment <- cutree(hclust(d, method="single"), k=k)
                     }
                     
                     ### aggregate macro-cluster weights
                     w <- get_weights(l, type="micro")
                     w <- aggregate(w, by=list(assignment), FUN=sum)$x
                     
                     ### deal with k and minweight (only if alpha is given!)
                     if(alpha>0) {
                       if(k>0 && length(w) > k) {
                         take <- order(w, decreasing=TRUE)[1:k]
                         w <- w[take]
                         assignment <- match(assignment, take)
                       }
                       if(minweight>0) {
                         take <- which(w>=(minweight*sum(w)))
                         w <- w[take]
                         assignment <- match(assignment, take)
                       }
                     }
                     
                     return(list(assignment=assignment, weight=w))
                   }
)
    
DSC_tNN_fast <- function(r = 0.1, k=0, alpha = 0, minweight = 0, lambda = 1e-3, 
                    decay_interval=1000L, noise = 0.01, measure = "Euclidean", macro = TRUE) {
  
  if(k==0 && alpha==0 && macro) {
    warning("You have to specify at least k or alpha! Using default alpha=.25 and minweight=0.1.")
    minweight <- 0.1
    alpha <- 0.25
  }
  
  tNN <- tNN_fast$new(r, k, lambda, as.integer(decay_interval), 
                 minweight, noise, alpha, measure, macro)
  l <- list(description = "tNN", RObj = tNN)
  class(l) <- c("DSC_tNN_fast", "DSC_Micro", "DSC_R", "DSC")
  l
}

get_microclusters.DSC_tNN_fast <- function(x) {
    ### we have to rename the micro-clusters
    x$RObj$deserialize()
    mc <- as.data.frame(.Call("GetAllPoints",x$RObj$flann,x$RObj$nclusters,x$RObj$columns, PACKAGE="stream"))
    if(nrow(mc)<1) return(data.frame())
    
    mc <- mc[as.character(x$RObj$strong_mcs()),]

    rownames(mc) <- NULL
    mc
}



get_microweights.DSC_tNN_fast <- function(x) {
    x$RObj$weights[as.character(x$RObj$strong_mcs())]
}



get_macroclusters.DSC_tNN_fast <- function(x) {
    if(!x$RObj$macro) stop("No macro-clusters available!")
    
    mw <-  x$RObj$get_membership_weights()
    assignment <- mw$assignment
    weights <- mw$weight
    uniqueassign <- na.omit(unique(assignment))
    
    if(length(uniqueassign) <1) return(data.frame())
    
    mcs <- get_centers(x, type="micro")
    mcw <- get_weights(x, type="micro")

    ### find weighted centroids
    as.data.frame(t(sapply(uniqueassign, FUN=function(i) {
		take <- which(assignment==i)
		colSums(mcs[take,]*mcw[take])/sum(mcw[take])	
	    })))
}

get_macroweights.DSC_tNN_fast <- function(x) {
    if(!x$RObj$macro) stop("No macro-clusters available!")
    x$RObj$get_membership_weights()$weight
}


microToMacro.DSC_tNN_fast <- function(x, micro=NULL) {
    if(is.null(micro)) micro <- 1:nclusters(x, type="micro")
    mw <- x$RObj$get_membership_weights()
   
    structure(mw$assignment[micro], names=micro)
}

serialize <- function(x){ UseMethod("serialize") }

serialize.DSC_tNN_fast <- function(x) {
  x$RObj$serialize()
}


### special plotting for DSC_tNN_fast
### FIXME: only show edges that really are used
plot.DSC_tNN_fast <- function(x, dsd = NULL, n = 1000,
	col_points="gray",
	col_clusters="red",
	weights=TRUE,
	scale=c(1,5),
	cex =1,
	pch=NULL,
	...,
	method="pairs",
	type=c("auto", "micro", "macro")) {
	
    NextMethod()


    if(x$RObj$macro && type %in% c("macro")
		&& (x$RObj$columns<=2 || method=="plot")) {

	    p <- get_centers(x, type="micro")

	    if(nrow(p)>0) {
		points(p, col="black")

		### add threshold circles
		for(i in 1:nrow(p)){
		    lines(ellipsePoints(x$RObj$r, x$RObj$r, 
				    loc=as.numeric(p[i,]), n=60),
			    col = "black", lty=3)
		}

		### add edges connecting macro-clusters
		s <- x$RObj$get_connectivity(matrix=TRUE)
		s[lower.tri(s)] <- NA
		edges <- which(s>0, arr.ind=TRUE)
		
		if(length(edges)>0) { # length instead of nrow (s can be empty!)
		    edges <- cbind(edges, 
			    w=apply(edges, MARGIN=1, FUN=function(ij) s[ij[1], ij[2]]))

		    #edges <- cbind(edges, stream:::map(edges[,3], range=c(1,5)))
		    edges <- cbind(edges, map(edges[,3], range=c(1,5)))

		    for(i in 1:nrow(edges)){
			lines(rbind(p[edges[i,1],],p[edges[i,2],]),
				col="black",lwd=edges[i,4])
		    }
		}

	    }
	}

    }

Try the streamtNNfast package in your browser

Any scripts or data that you put into this service are public.

streamtNNfast documentation built on May 2, 2019, 5:19 p.m.