# Source file: inst/doc/klsh.R

## ---- echo=TRUE, message=FALSE, knitr::opts_chunk$set(cache=TRUE)-------------
library(blink)
library(plyr)
library(klsh)
# Load the RLdata500 example data set and preview its first rows.
data(RLdata500)
head(RLdata500)

# Keep every column except the 2nd and 4th, then preview the reduced data.
# NOTE(review): presumably these are the secondary first-/last-name
# components -- confirm against the RLdata500 documentation.
data.500 <- RLdata500[-c(2, 4)]
head(data.500)

## -----------------------------------------------------------------------------
# Fix the RNG seed so the blockings produced below are reproducible.
set.seed(1234)

# First KLSH run on the reduced data: p = 100, five blocks requested, k = 2.
klsh.blocks <- klsh(
  data.500,
  p = 100,
  num.blocks = 5,
  k = 2
)

## -----------------------------------------------------------------------------
# Evaluate the blocking against the known identities. Each call is left as a
# bare top-level expression so its value is auto-printed, in this order:
#   1. the full confusion summary,
#   2. the recall-only summary,
#   3. the reduction ratio.
confusion.from.blocking(
  klsh.blocks,
  true_ids = identity.RLdata500
)
confusion.from.blocking(
  klsh.blocks,
  true_ids = identity.RLdata500,
  recall.only = TRUE
)
reduction.ratio.from.blocking(klsh.blocks)

## -----------------------------------------------------------------------------
# KLSH with k = 4 over a grid of requested block counts.
#
# The grid is walked in the order 200, 100, 50, 25, 10, 5, 3 so that klsh() is
# called in exactly the same sequence as before; if klsh() draws random
# numbers, results under the seed set earlier in this script are unchanged.
# The list is deliberately left unnamed so that the summaries below stay
# unnamed vectors.
blockings_k4 <- lapply(
  c(200, 100, 50, 25, 10, 5, 3),
  function(n_blocks) {
    klsh(data.500, p = 100, num.blocks = n_blocks, k = 4)
  }
)

# Recall of each blocking with respect to the true record identities.
# vapply() enforces one numeric value per blocking, which is what the plotting
# code further down relies on (sapply() would silently change shape otherwise).
confusions_k4 <- vapply(
  blockings_k4,
  confusion.from.blocking,
  FUN.VALUE = numeric(1),
  recall.only = TRUE,
  true_ids = identity.RLdata500
)

# Reduction ratio of each blocking, one numeric value per blocking.
reduction.ratio.from.blocking_k4 <- vapply(
  blockings_k4,
  reduction.ratio.from.blocking,
  FUN.VALUE = numeric(1)
)

## -----------------------------------------------------------------------------
# KLSH with k = 3 over a grid of requested block counts.
#
# The grid is walked in the order 200, 100, 50, 25, 10, 5, 3 so that klsh() is
# called in exactly the same sequence as before; if klsh() draws random
# numbers, results under the seed set earlier in this script are unchanged.
# The list is deliberately left unnamed so that the summaries below stay
# unnamed vectors.
blockings_k3 <- lapply(
  c(200, 100, 50, 25, 10, 5, 3),
  function(n_blocks) {
    klsh(data.500, p = 100, num.blocks = n_blocks, k = 3)
  }
)

# Recall of each blocking with respect to the true record identities.
# vapply() enforces one numeric value per blocking, which is what the plotting
# code further down relies on (sapply() would silently change shape otherwise).
confusions_k3 <- vapply(
  blockings_k3,
  confusion.from.blocking,
  FUN.VALUE = numeric(1),
  recall.only = TRUE,
  true_ids = identity.RLdata500
)

# Reduction ratio of each blocking, one numeric value per blocking.
reduction.ratio.from.blocking_k3 <- vapply(
  blockings_k3,
  reduction.ratio.from.blocking,
  FUN.VALUE = numeric(1)
)

## -----------------------------------------------------------------------------
# KLSH with k = 2 over a grid of requested block counts.
#
# The grid is walked in the order 200, 100, 50, 25, 10, 5, 3 so that klsh() is
# called in exactly the same sequence as before; if klsh() draws random
# numbers, results under the seed set earlier in this script are unchanged.
# The list is deliberately left unnamed so that the summaries below stay
# unnamed vectors.
blockings_k2 <- lapply(
  c(200, 100, 50, 25, 10, 5, 3),
  function(n_blocks) {
    klsh(data.500, p = 100, num.blocks = n_blocks, k = 2)
  }
)

# Recall of each blocking with respect to the true record identities.
# vapply() enforces one numeric value per blocking, which is what the plotting
# code further down relies on (sapply() would silently change shape otherwise).
confusions_k2 <- vapply(
  blockings_k2,
  confusion.from.blocking,
  FUN.VALUE = numeric(1),
  recall.only = TRUE,
  true_ids = identity.RLdata500
)

# Reduction ratio of each blocking, one numeric value per blocking.
reduction.ratio.from.blocking_k2 <- vapply(
  blockings_k2,
  reduction.ratio.from.blocking,
  FUN.VALUE = numeric(1)
)

## -----------------------------------------------------------------------------
# KLSH with k = 1 over a grid of requested block counts.
#
# The grid is walked in the order 200, 100, 50, 25, 10, 5, 3 so that klsh() is
# called in exactly the same sequence as before; if klsh() draws random
# numbers, results under the seed set earlier in this script are unchanged.
# The list is deliberately left unnamed so that the summaries below stay
# unnamed vectors.
blockings_k1 <- lapply(
  c(200, 100, 50, 25, 10, 5, 3),
  function(n_blocks) {
    klsh(data.500, p = 100, num.blocks = n_blocks, k = 1)
  }
)

# Recall of each blocking with respect to the true record identities.
# vapply() enforces one numeric value per blocking, which is what the plotting
# code further down relies on (sapply() would silently change shape otherwise).
confusions_k1 <- vapply(
  blockings_k1,
  confusion.from.blocking,
  FUN.VALUE = numeric(1),
  recall.only = TRUE,
  true_ids = identity.RLdata500
)

# Reduction ratio of each blocking, one numeric value per blocking.
reduction.ratio.from.blocking_k1 <- vapply(
  blockings_k1,
  reduction.ratio.from.blocking,
  FUN.VALUE = numeric(1)
)

## ---- fig.show="hold", fig.cap="The recall versus the total number of blocks after running KLSH using k=1, 2, 3, 4.", fig.height = 4, fig.width = 5, fig.align = "center"----
library(ggplot2)

# Long-format table for plotting: one row per (k, blocking) combination.
#   k               - the k used for the KLSH run, stored as a string label
#   block_length    - number of elements in the blocking list (its block count)
#   recall          - recall computed for that blocking
#   reduction_ratio - reduction ratio computed for that blocking
# Rows are stacked in the order k = 4, 3, 2, 1.
plot_dat <- rbind(
  data.frame(
    k = "4",
    block_length = lengths(blockings_k4),
    recall = confusions_k4,
    reduction_ratio = reduction.ratio.from.blocking_k4
  ),
  data.frame(
    k = "3",
    block_length = lengths(blockings_k3),
    recall = confusions_k3,
    reduction_ratio = reduction.ratio.from.blocking_k3
  ),
  data.frame(
    k = "2",
    block_length = lengths(blockings_k2),
    recall = confusions_k2,
    reduction_ratio = reduction.ratio.from.blocking_k2
  ),
  data.frame(
    k = "1",
    block_length = lengths(blockings_k1),
    recall = confusions_k1,
    reduction_ratio = reduction.ratio.from.blocking_k1
  )
)

# Recall against the total number of blocks, one coloured line per k.
# The x/y/colour mappings are declared once on the plot and inherited by both
# layers; the line layer additionally groups by k so each k gets its own line.
# The y axis is restricted to [0.4, 1].
ggplot(plot_dat, aes(x = block_length, y = recall, colour = k)) +
  geom_point() +
  geom_line(aes(group = k)) +
  labs(x = "Total Number of Blocks", y = "Recall") +
  theme_bw(base_family = "serif") +
  ylim(0.4, 1)

## ---- fig.show="hold", fig.cap="The reduction ratio versus the total number of blocks after running KLSH using k=1, 2, 3, 4.", fig.height = 4, fig.width = 5, fig.align = "center"----
# Reduction ratio against the total number of blocks, one coloured line per k.
# The x/y/colour mappings are declared once on the plot and inherited by both
# layers; the line layer additionally groups by k so each k gets its own line.
ggplot(plot_dat, aes(x = block_length, y = reduction_ratio, colour = k)) +
  geom_point() +
  geom_line(aes(group = k)) +
  labs(x = "Total Number of Blocks", y = "Reduction Ratio") +
  theme_bw(base_family = "serif")

# Try the klsh package in your browser
#
# Any scripts or data that you put into this service are public.
#
# klsh documentation built on Jan. 13, 2021, 8:05 p.m.