# Functions for document and image retrieval
# By Tom Minka
# Functions for lab 1
strip_text <- function(txt) {
# remove apostrophes
txt <- gsub("'","",txt)
# convert to lowercase
txt <- tolower(txt)
# change other non-alphanumeric characters to spaces
# "man 7 regex" for info
txt <- gsub("[^a-z0-9]"," ",txt)
# change digits to #
txt <- gsub("[0-9]+","#",txt)
# split and make one array
txt <- unlist(strsplit(txt," "))
# remove empty words
txt <- txt[txt != ""]
txt
}
#' @export
read_doc <- function(fname, remove.header = TRUE) {
txt <- readLines(fname)
if(remove.header) {
i <- which(txt == "")
if(length(i) > 0) {
txt <- txt[-(1:i[1])]
}
}
strip_text(txt)
}
remove.singletons <- function(x) {
# remove infrequent words
count <- colSums(x>0)
x[,(count > 1)]
}
idf.weight <- function(x) {
# IDF weighting
doc.freq <- colSums(x>0)
doc.freq[doc.freq == 0] <- 1
w <- log(nrow(x)/doc.freq)
scale.cols(x,w)
}
idf.weight2 <- function(x) {
# Joachims' IDF
doc.freq <- colMeans(div_by_sum(x))
w <- sqrt(1/doc.freq)
scale.cols(x,w)
}
#' @export
div_by_sum <- function(x) {
scale.rows(x,1/(rowSums(x)+1e-16))
}
#' @export
div_by_euc_length <- function(x) {
scale.rows(x,1/sqrt(rowSums(x^2)+1e-16))
}
if(T) {
remove.singletons.ragged <- function(x) {
# x is a list of vectors (a ragged array)
col.names <- c()
for(i in 1:length(x)) {
col.names <- c(col.names, names(x[[i]]))
}
count <- table(col.names)
for(i in 1:length(x)) {
not.single <- (count[names(x[[i]])] > 1)
x[[i]] <- x[[i]][not.single]
}
x
}
standardize.ragged <- function(x) {
# x is a list of vectors (a ragged array)
# standardize each vector to have same length and ordering, by adding NAs
col.names <- c()
for(i in 1:length(x)) {
# this is faster than unique(c(col.names, names(x[[i]])))
col.names <- c(col.names, setdiff(names(x[[i]]),col.names))
}
for(i in 1:length(x)) {
#not.in <- setdiff(col.names,names(x[[i]]))
#x[[i]][not.in] <- NA
#is.na(x[[i]][not.in]) <- T
# this automatically returns NA for missing names
x[[i]] <- x[[i]][col.names]
names(x[[i]]) <- col.names
}
x
}
}
#' @export
mds <- function(d,y=NULL,labels=T,k=2,main="Multidimensional scaling",...) {
library(MASS)
for(iter in 1:2) {
px = try(as.data.frame(sammon(d,k=k,trace=F)$points), silent=TRUE)
if(!inherits(px,"try-error")) break
d = d + 1e-4*array(runif(prod(dim(d))),dim(d))
diag(d) = 0
}
if(inherits(px,"try-error")) stop(px)
if(k == 2) {
if(is.null(y)) {
if(is.null(labels))
plot(px,asp=1,main=main,...)
else
text.plot(px,labels,asp=1,main=main,...)
} else {
color.plot(px,y,labels=labels,asp=1,zlab=main,...)
}
} else if(k == 3) {
if(is.null(y)) {
vrml.plot3d(formula(px),px,labels=labels,...)
} else {
vrml.color.plot(px,y,labels=labels,...)
}
}
invisible(px)
}
# Functions for lab 2
as.oldpix <- function(pix) {
d = attr(pix,"size")
x = array(0,c(d,3))
x[,,1] = attr(pix,"red")
x[,,2] = attr(pix,"green")
x[,,3] = attr(pix,"blue")
x
}
flatten.pixmap <- function(x) {
if(inherits(x,"pixmapRGB")) x = as.oldpix(x)
len <- prod(dim(x)[1:2])
# convert pixmap to array with cols r,g,b
r <- array(0,c(len,3),list(NULL,c("red","green","blue")))
for(i in 1:3) {
r[,i] <- as.vector(x[,,i])
}
as.data.frame(r)
}
plot.pixels <- function(x,k=2,as.hsv=T) {
col <- rgb(x[[1]],x[[2]],x[[3]])
if(as.hsv) x = rgb2hsv(x)
if(k==3)
vrml.plot3d(formula(x),x,col=col,bg=grey(0.5))
else
plot3d(formula(x),x,color=col,bg=grey(0.5))
}
quantize.cube <- function(x,k) {
# quantize to integer
# x has three columns
# first col varies fastest
r <- 0
d <- 1
for(i in 1:ncol(x)) {
r <- r + as.integer(x[,i]*k/1.00001)*d
d <- d*k
}
r
}
closest <- function(d) {
# for each column, return the name of the row with smallest distance
if(dim(d)[1] == dim(d)[2] && all(diag(d)) == 0) {
diag(d) <- Inf
}
r <- rownames(d)[apply(d,2,which.min)]
names(r) <- colnames(d)
r
}
if(T) {
doc.page <- function(index.file,base="",cols=5) {
# make a web page with document names
# output is index.file with extension changed to html
base = path.expand(base)
tab <- read.table(index.file,header=T)
col <- 1
s <- strsplit(index.file,"\\.")
fname <- paste(s[[1]][1],"html",sep=".")
cat("wrote",fname,"\n")
con <- file(fname,"w")
cat("<html><body><table>\n",file=con)
for(i in 1:nrow(tab)) {
name <- as.character(tab[i,1])
fname <- as.character(tab[i,3])
fname = file.path(base,fname)
if(col == 1) cat("<tr align=center>\n",file=con)
cat("<td>",file=con)
cat("<a href=\"",fname,"\">",name,"</a>\n",sep="",file=con)
cat("</td>",file=con)
if(col == cols) { cat("</tr>\n",file=con); col <- 0 }
col <- col + 1
}
cat("</table></body></html>\n",file=con)
close(con)
}
image.page <- function(index.file,base="",cols=5) {
# make a web page with images
base = path.expand(base)
tab <- read.table(index.file,header=T)
col <- 1
s <- strsplit(index.file,"\\.")
fname <- paste(s[[1]][1],"html",sep=".")
cat("wrote",fname,"\n")
con <- file(fname,"w")
cat("<html><body><table>\n",file=con)
for(i in 1:nrow(tab)) {
name <- as.character(tab[i,1])
fname <- as.character(tab[i,2])
s <- strsplit(fname,"\\.")
fname <- paste(s[[1]][1],"jpg",sep=".")
if(nchar(base) > 0) fname = file.path(base,fname)
if(col == 1) cat("<tr align=center>\n",file=con)
cat("<td>",file=con)
cat("<img src=\"",fname,"\"><br>",name,"\n",sep="",file=con)
cat("</td>",file=con)
if(col == cols) { cat("</tr>\n",file=con); col <- 0 }
col <- col + 1
}
cat("</table></body></html>\n",file=con)
close(con)
}
read.images <- function(index.file,k=4) {
# read images and convert to matrix of color counts
library(pixmap)
tab <- read.table(index.file,header=T)
if(T) {
midpoints <- function(x) (x[-1]+x[-length(x)])/2
col <- midpoints(seq(0,1,len=k+1))
col <- as.matrix(expand.grid(col,col,col))
labels <- rgb2name(col2rgb(hsv(col[,1],col[,2],col[,3]))/255)
labels <- make.unique(labels)
#if(any(duplicated(labels))) stop("duplicate color names")
} else {
labels <- 0:(k^3-1)
}
imgs <- array(0,c(nrow(tab),k^3))
rownames(imgs) <- as.character(tab[,1])
for(i in 1:nrow(tab)) {
fname <- as.character(tab[i,2])
print(fname)
img <- flatten.pixmap(read.pnm(fname))
img <- rgb2hsv(img)
#img <- hsv2cone(img)
q <- quantize.cube(img,k)
x <- table(factor(q,levels=0:(k^3-1),labels=labels))
imgs[i,] <- x
colnames(imgs) <- names(x)
}
imgs
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.