# nearest neighbor prediction via "cosine" method:
# the user has already called formUserData() on the training set, and
# now wishes to predict a new case; the question is, what rating would
# the user newData give to item newItem?
# first, the cases in the training set for which a rating for newItem is
# available are found; then each such case is compared to newData, by
# computing the "cosine" between the training case and newData
# the "cosine" between two cases A and B is defined according to the
# items rated in common between them; the dot product (in the linear
# algebra sense) of the ratings of A and the ratings of B, among the
# items in common, then divided by the product of the norms of those two
# vectors
# we then find the prediction by averaging the ratings of newItem among
# the k training cases having the highest cosine with newData
# covariates (e.g. age, gender) and item type preferences (e.g.
# preferred movie genres) are allowed in distance computation for
# neighbors; the covariates are assumed here to be user covariates, as
# this is what findUsrItmData.R gives
# NOTE: the cosine, though standard, has certain problems; e.g., its
# scale-free nature means two users are very "close" even if, say, one
# is almost exactly double the other; other choices for distance measure
# will be added
# arguments:
# origData: training set, object of class 'usrData', a list of
# objects of class 'usrDatum' (see file findUsrItmData.R)
# newData: data point to be predicted, object of class 'usrDatum';
# just a single point here, but have predictUsrDataMany()
# later below
# newItem: ID of the item rating to be predicted for the
# user in newData
# wtcovs: weight to put on covariates; NULL if no covs
# wtcats: weight to put on item categories; NULL if no cats
# k: a vector of the numbers of nearest neigbhors used for
# predicting; one predicted rating will be calculated for
# each k
# value:
# predicted ratings for newData for the item newItem
predict.usrData <- function(origData,newData,newItem,
k,wtcovs=NULL,wtcats=NULL,leave1Out=TRUE)
{
# check that newData has the covs and cats if and only if the
# training data did
traincovs <- !is.null(origData[[1]]$cvrs)
newcovs <- !is.null(newData$cvrs)
if (!(traincovs + newcovs %in% c(0,2)))
stop('mismatch in having/not having covars, orig and new data')
traincats <- !is.null(origData$cats)
newcats <- !is.null(newData$cats)
if (!(traincats + newcats %in% c(0,2)))
stop('mismatch in having/not having cats, orig and new data')
# we first need to narrow origData down to the users who
# have rated newItem
# action of checkNewItem(): here oneUsr is one user record in
# origData; the function will look for a j such that element j in the
# items list for this user matches the item of interest, newItem; if
# such a j exists, then (j,rating) will be returned, otherwise
# (NA,NA); defined for use by sapply() below
checkNewItem <- function(oneUsr) {
whichOne <- which(oneUsr$itms == newItem)
if (length(whichOne) > 1) {
stop("same user/item pair encountered more than once")
}
if (length(whichOne) == 0) {
return(c(NA,NA))
}
else return(c(whichOne,oneUsr$ratings[whichOne]))
}
found <- as.matrix(sapply(origData,checkNewItem))
# description of 'found':
# found is of dimensions 2 x number of users in training set
# found[1,i] = j means origData[[i]]$itms[j] = newItem;
# found[1,i] = NA means newItem wasn't rated by user i
# found[2,i] = rating in the non-NA case
# we need to get rid of the users who didn't rate newItem
whoHasIt <- which(!is.na(found[1,]))
if (leave1Out && newData %in% whoHasIt) {
whoHasIt <- whoHasIt[-as.integer(newData$userID)]
}
if (is.null(whoHasIt) | length(whoHasIt) == 0)
return(NA) # no one rated this item
origDataRatedNI <- origData[whoHasIt]
# now origDataRatedNI only has the relevant users, the ones who
# have rated newItem, so select only those columns of the found matrix
found <- found[,whoHasIt,drop=FALSE]
# find the distance from newData to one user y of origData; defined for
# use in sapply() below
onecos <- function(y) cosDist(newData,y,wtcovs,wtcats)
cosines <- sapply(origDataRatedNI,onecos)
# the vector cosines contains the distances from newData to all the
# original data points who rated newItem
# action of findKnghbourRtng(): find the mean rating of newItem in
# origDataRatedNI, for ki (= k[i]) neighbors
#
# if ki > neighbours present in the dataset, then the
# number of neighbours is used
findKnghbourRtng <- function(ki){
ki <- min(ki, length(cosines))
# nearby is a vector containing the indices of the ki
# most similar neighbours
nearby <- order(cosines,decreasing=TRUE)[1:ki]
mean(as.numeric(found[2, nearby]))
}
sapply(k, findKnghbourRtng)
}
prdusrdata <- predict.usrData
# find cosine distance between x and y, objects
# of 'usrData' class
#
# only items rated in both x and y are used; if none
# exist, then return NaN
#
# wtcovs: weight to put on covariates; NULL if no covs
# wtcats: weight to put on item categories; NULL if no cats
cosDist <- function(x,y,wtcovs=NULL,wtcats=NULL)
{
# rated items in common
commItms <- intersect(x$itms,y$itms)
if (length(commItms)==0) return(NaN)
# where are those common items in x and y?
xwhere <- which(!is.na(match(x$itms,commItms)))
ywhere <- which(!is.na(match(y$itms,commItms)))
xvec <- x$ratings[xwhere]
yvec <- y$ratings[ywhere]
if (!is.null(wtcovs)) {
xvec <- c(xvec,wtcovs*x$cvrs)
yvec <- c(yvec,wtcovs*y$cvrs)
}
if (!is.null(wtcats)) {
xvec <- c(xvec,wtcats*x$cats)
yvec <- c(yvec,wtcats*y$cats)
}
xvec %*% yvec / (l2a(xvec) * l2a(yvec))
}
l2a <- function(x) sqrt(x %*% x)
testCos <- function()
{
rts <- rbind(c(1, 3, 5), c(4, 2, 2), c(4, 1, 2), c(5, 6,2),
c(1, 6, 5),c(1,2,1),c(1,1,5))
ud <- formUserData(rts)
print(predict(ud,ud[['r']],3,1)) # should print 5
}
# temporary solution to the problem of predict.usrData predicting only
# one data point at a time
# origData,k,wtcovs,wtcovs: see predict.usrData() above
# newDataMany: a vector of user IDs (must have an entry in origData)
# newItemMany: a vector of item IDs (must be in origData)
predictUsrDataMany <- function(origData,newDataMany,newItemMany,
k,wtcovs=NULL,wtcats=NULL)
{
nNew <- length(newDataMany)
preds <- vector(length = nNew)
for (i in 1:nNew) {
origDataEntry <- origData[[as.character(newDataMany[i])]]
preds[i] <- predict.usrData(origData,origDataEntry,newItemMany[i],k=k)
}
preds
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.