R/reedgraph-bloom.R

###
###    Copyright (C) 2012 Martin J Reed              martin@reednet.org.uk
###    University of Essex, Colchester, UK
###
###    This program is free software; you can redistribute it and/or modify
###    it under the terms of the GNU General Public License as published by
###    the Free Software Foundation; either version 2 of the License, or
###    (at your option) any later version.
###
###    This program is distributed in the hope that it will be useful,
###    but WITHOUT ANY WARRANTY; without even the implied warranty of
###    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
###    GNU General Public License for more details.
###
###    You should have received a copy of the GNU General Public License along
###    with this program; if not, write to the Free Software Foundation, Inc.,
###    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

require("bitops")

reedgraphEnv <- new.env()
reedgraphEnv$BLOOMLENGTH <- 256
reedgraphEnv$BLOOMCHUNKS <- ceiling(reedgraphEnv$BLOOMLENGTH/32)
reedgraphEnv$ALLZEROS <- rep(0,reedgraphEnv$BLOOMCHUNKS)

## add a adjacency matrix to g for fast edge look up
## g an igraph
rg.add.adjacency.lookup <- function(g) {
    N <- length(V(g))
    edgeM <- matrix(nrow=N,ncol=N)
    for(i in 1:length(E(g))) {
        st <- ends(g,E(g)[i])
        edgeM[as.integer(st[1]),as.integer(st[2])] <- i
    }
    g$edgeM <- edgeM
    return(g)
}


## CAREFUL this library uses a fragile encoding of length.
## you must not change the Bloom length and then use variables
## that were generated using the old Bloom length
rg.set.bloomlength <- function(m) {
    unlockBinding("BLOOMLENGTH",reedgraphEnv)
    unlockBinding("BLOOMCHUNKS",reedgraphEnv)
    unlockBinding("ALLZEROS",reedgraphEnv)
  reedgraphEnv$BLOOMLENGTH <- m
  reedgraphEnv$BLOOMCHUNKS <- ceiling(reedgraphEnv$BLOOMLENGTH/32)
  reedgraphEnv$ALLZEROS <- rep(0,reedgraphEnv$BLOOMCHUNKS)
  
}

rg.prob.false.free <- function(m,alpha,beta) {
  return((1-0.618503^(m/alpha))^beta)
}

rg.exp.bloom.length <- function(alpha,beta) {
  notconverged <- TRUE
  Em <- (1-0.618503^(1/alpha))^beta
  m <- 2
  prod=1
  while(prod > 0) {
    prod=1
    for(i in 1:m-1) {
      prod <- prod * (1- (1-0.618503^(i/alpha))^beta)
    }
    Em <- Em + m * (1-0.618503^(m/alpha))^beta * prod
    m <- m + 1
  }
  cat("Expected is ",Em," converged at m=",m,"\n")
  return(Em)
}

rg.test.exp.bloom.length <- function(m,alpha,beta) {
  rg.set.bloomlength(m)
  k <- rg.bloom.optimum.k(m,alpha)
  rg.test.fp.averages()
  
}

## generate a random FID with n elements
rg.random.fid <-function(n,k=1) {
  fid <- reedgraphEnv$ALLZEROS
  for(i in 1:n) {
    fid <- bitOr(fid,rg.createLid(k))
    #print(rg.bloom.to.binary(fid))
  }
  return(fid)
}

## count the number of false positives for
## an FID against t off path (random) LIDs
rg.test.fp <- function(n,t,k=1) {
  count <- 0
  fid <- rg.random.fid(n,k)
  #print("fid=")
  #print(rg.bloom.to.binary(fid))
  #print("lids")
  for(i in 1:t) {
    lid <- rg.createLid(k)
    #print(rg.bloom.to.binary(lid))
    if(rg.test.member(fid,lid))
      count <- count +1
  }
  return(count)
}

## find the false potisive rate for
## n (random) elements coded in a Bloom filter
## and compared against t off path (random) LIDs
## work this out over a averages and give the
## mean false postive rate
rg.test.fp.averages <-function(n,t,k=1,a) {
  sum <- 0
  for(i in 1:a) {
    sum <- sum + rg.test.fp(n,t,k)
  }
  return(sum/(a*t))
}

### example
### rg.test.fp.averages(10,20,10,100)
rg.test.fp.range.k <- function(n,t,k=c(1),a) {
  FP <- c()
  for(kay in k) {
    FP <- c(FP,rg.test.fp.averages(n,t,kay,a))
  }
  results <- data.frame(k=k,FP=FP)
  return(results)
    
}

### Function to print binary pattern as a string
rg.bloom.to.binary <- function(val,len=reedgraphEnv$BLOOMLENGTH) {
    
    result <- c()
    for(i in 1:len) {
        ind <- (i-1) %/% 32  + 1
        mybit <- bitAnd(val[ind],1)
        result <- c(mybit,result)
        val[ind] <- bitShiftR(val[ind],1)
    }
    return(rev(result))
}

## Convert a string pattern like "0 0 1 0 .....0" to the internal bloom representation
rg.binary.to.bloom <- function(string,len=reedgraphEnv$BLOOMLENGTH) {
    bloomchunks <- ceiling(len/32)
    allzeros <- rep(0,bloomchunks)
    
    result <- allzeros
    for(i in len:1) {
        ind <- (i-1) %/% 32  + 1
        result[ind] <- bitShiftL(result[ind],1)
        result[ind] <- bitOr(result[ind],string[i])
    }
    return(result)
}

### combinations possible with
### k hash functions, m length
rg.bloom.comb <- function(k,m) {

  prod <- 1.0
  for(i in m:(m-k+1)) {
    print(prod)
    prod <- i * prod
  }
  return(prod)
}

### Bloom filter false postive rate as estimated according to Bose 2007
### input: k hash functions, m length, n items in filter
### this is a strict lower bound for k>=2
### returns vector c(lower,upper)
### lower is a strict lower bound originally published by Bloom
### as exact figure, but Bose shows it is a strict lower bound.
### upper is approximate upper bound by Bose
rg.bloom.false <- function(k,m,n) {
  ## original Bloom figure is lower bound
  lbound <- (1-(1-1/m)^(k*n))^k

  ## from Bose
  p <- 1-(1-1/m)^(k*n)
  ubound <- lbound * (1 + k/p * sqrt((log(m) - k * log(p))/m ))


  return(c(lbound,ubound))
}



### optimium k for Bloom filter
### Input:
### n is number of elements bloom filter,
### m is length of bloom filter.
### returns k
rg.bloom.optimum.k <- function(m,n) {
  return(log(2) * m /n)
}

rg.createLid <- function(k=7,pos=NULL) {
  lid <- rep(0,reedgraphEnv$BLOOMCHUNKS)
  ## produce number 0-255 (zero offset)
  if(is.null(pos)) {
      pos <- floor(runif(k) * reedgraphEnv$BLOOMLENGTH)
  } else
      pos <- pos - 1
  ## index in chunks (unit offset)
  ## note integer division here (not modulus, easy to miss / in %/%)
  ind <- pos %/% 32 +1
  ## set one bit in 0 ... 31
  val <- 2^((pos) %% 32)
  ## add to existing 
  for(i in 1:length(val)) {
    lid[ind[i]] <- bitOr(lid[ind[i]],val[i])
  }
  return(lid)
}

### Test if element is member of bloom filter
### input:
### fid - Bloom filter
### lid - element to test if member of fid
### this could be simply written in-line but this
### call does not take much longer than the messy line
### it includes.
### returns - TRUE or FALSE as expected
rg.test.member <- function(fid,lid) {
  identical(bitXor(bitAnd(fid,lid),lid) ,reedgraphEnv$ALLZEROS)
}

## assigns the LIDs randomly
rg.assign.lids.to.graph <- function(g,k=7) {
  for(e in 1:(length(E(g))) ) {
    ## have to assign a list to an attribute
    ## cannot assign a vector directly
    E(g)[e]$lid <- list(rg.createLid(k=k))
  }
  return(g)
}


### create fid along path, path is a sequence of node numbers
### as returned from   path <- get.shortest.paths(g,from,to)[[1]]
### graph - igraph
### value - vector representing fid
rg.create.fid <- function(graph,path) {
  fid <- reedgraphEnv$ALLZEROS
  for(i in 1:(length(path)-1)) {
      lid <- E(graph)[path[i] %->% path[i+1]]$lid[[1]]
    fid <- bitOr(fid,lid)
  }
  return(fid)
}

### Count false positives on a path for given fid
### input
### graph - igraph with lid edge attributes set
### path - vector of vertex pairs for path
### fid - Bloom filter made up from lids on path
### returns list
### fpcount - count of false postives
### tpcount - count of true positives
### tncount - count of true negatives
###   ther are no false negatives for LIPSIN
### graph - igraph
### pathtype:
###     single - vector of vertices 1 2 3 which make a unicast path (an old format)
###     pairs  - a matrix of path pairs  (1,2) (2,3) which can cope with multicast
###              it assumes the links have been stored missing duplicates in the tree
#### WARNING may be broken, igraph stores the lids as seperate objects not concatonated list
rg.bloom.false.positive.on.path <- function(graph,path,fid,pathtype="single") {
  if(!is.simple(graph)) {
    cat("Error in rg.bloom.false.positive, not a simple graph\n")
    return(NULL)
  }
  if(pathtype == "single") {
      #cat(path,"\n")
      #cat("FID   :",rg.bloom.to.binary(fid),"\n")
      edgeM <- graph$edgeM
      previous <- -1
      current <- path[1]
      fpcount <- 0
      tpcount <- 0
      tncount <- 0
      for(i in 1:length(path)) {
          current <- path[i]
          if(i != length(path))
              nxt <- path[i+1]
          else
              nxt <- -1
          n <- neighbors(graph,current)
          
          if(nxt==-1) {
              tests <- n[n!=previous]
          } else if(previous==-1) {
              tests <- n[n!=nxt]
          } else {
              tests <- n[n!=nxt & n!=previous]
          }
          for(t in tests) {
              #if(rg.test.member(fid,E(graph)[ current %->% t ]$lid[[1]])) {
              #print(c(current,t))
              #print(graph$edgeM[current,t])
              lid <- E(graph)[edgeM[current,t]]$lid[[1]]
              #cat(current,"->",t,":")
              #cat(rg.bloom.to.binary(lid),"\n")
              if(identical(bitXor(bitAnd(fid,lid),lid) ,reedgraphEnv$ALLZEROS)) {
              #if(rg.test.member(fid,E(graph)[edgeM[current,t]]$lid[[1]])) {
                  fpcount <- fpcount + 1
                  #cat("false positive:",current,"->",t,"\n")
              } else {
                  tncount <- tncount + 1
              }
          }
          previous <- current
          tpcount <- tpcount + 1
      }
      count <- list()
      count$fpcount <- fpcount
      count$tpcount <- tpcount
      count$tncount <- tncount
      return(count)
  } else if(pathtype == "pairs") {
      print("Warning pairs not implemented")
  }
}

### Returns the false positive count on each unicast path of length 1 ... diameter
### also returns true positive count and true negative count
rg.test.false.positives.all.paths <- function(graph) {
    
    fpcount <- c(0)
    tpcount <- c(0)
    tncount <- c(0)
    for(i in V(graph)) {
        paths <- get.shortest.paths(graph,i)
        for(path in paths$vpath) {
            nelements <- length(path)-1
            if(length(path)>1){
                fid <- rg.create.fid(graph,path)
                
                pcount <- rg.bloom.false.positive.on.path(graph,path,fid)
                if(pcount$fpcount>0)
                    #print(path)
                if(is.na(fpcount[nelements])) {fpcount[nelements] <- 0}
                if(is.na(tpcount[nelements])) {tpcount[nelements] <- 0}
                if(is.na(tncount[nelements])) {tncount[nelements] <- 0}
                fpcount[nelements] <- fpcount[nelements] + pcount$fpcount
                tpcount[nelements] <- tpcount[nelements] + pcount$tpcount
                tncount[nelements] <- tncount[nelements] + pcount$tncount
            }
        }
    }
    result <- list()
    
    result$fpcount <- fpcount
    result$tpcount <- tpcount
    result$tncount <- tncount
    return(result)
}

rg.test.false.posititives.loop <- function(graph,k=7) {
  rate <- c(0)
  count <- c(0)

  for(i in 1:50) {
    graph <- rg.assign.lids.to.graph(graph,k=k)
    result <- rg.test.false.positives.all.paths(graph)
    ## rate or count not long enough make them long
    ## enough and fill missing values with zero
    len <- length(result$rate)
    if(is.na(rate[len])) {
      rate[len] <- 0
      rate[is.na(rate)] <- 0
      count[len] <- 0
      count[is.na(count)] <- 0
    }
    rate <- rate + result$rate
    count <- count + result$count
  }
  count[is.na(count)] <- 0
  rate[is.na(rate)] <- 0
  range <- 1:length(count)
  bounds <- mapply(rg.bloom.false,k,reedgraphEnv$BLOOMLENGTH,range)
  ## calculate mean rate from all results.
  result <- data.frame(N=range,Count=count,Rate=rate/count,Lower=bounds[1,],
                       Upper=bounds[2,])
  return(result)
}
martinjreed/reedgraph documentation built on May 21, 2019, 12:39 p.m.