#' Generate basic pairwise comparisons for user accounts of interest
#'
#' Given a dataframe containing at least account hashes, IDs and marketplaces,
#' generate basic pairwise comparisons: edit distance between the IDs and
#' whether or not they are from the same marketplace.
#'
#' @param dataframeName name of dataframe that contains at least three columns:
#' user hashes of interest (named `vendor_hash`), alias (`id`) and marketplace
#' (`marketplace`).
#' @return dataframe with columns `hash1`, `hash2`, `order1` (row corresponding
#' to `hash1` in `dataframeName`), `order2`, `id1`, `id2`, `marketplace1`,
#' `marketplace2`, `idDist`, `sameMarket`
#' @export
getPairs <- function(dataframeName) {
allPairwise <- t(combn(dataframeName$vendor_hash, 2))
allPairwise <- data.frame(hash1 = allPairwise[, 1], hash2 = allPairwise[, 2], stringsAsFactors = FALSE)
allPairwise$order1 <- match(allPairwise$hash1, dataframeName$vendor_hash)
allPairwise$order2 <- match(allPairwise$hash2, dataframeName$vendor_hash)
info1 <- dataframeName[allPairwise$order1, c("id", "marketplace")]
info2 <- dataframeName[allPairwise$order2, c("id", "marketplace")]
allPairwise$id1 <- info1$id
allPairwise$id2 <- info2$id
allPairwise$marketplace1 <- info1$marketplace
allPairwise$marketplace2 <- info2$marketplace
allPairwise$idDist <- apply(allPairwise[, c("id1", "id2")], MARGIN = 1, FUN = function(x) adist(x[1], x[2]))
allPairwise$sameMarket <- ifelse(info1$marketplace == info2$marketplace, 1, 0)
return(allPairwise)
}
#' Generate pairwise comparisons from list items
#'
#' Given a list item indexed by vendor hashes (such as `titleTokens`, ...,
#' generated using the functions in `step2_anytime.R`), generate Jaccard
#' similarities / return whether or not their intersections are non-empty (used
#' for PGPs).
#'
#' @param pairsOfInterest dataframe with columns `hash1`, `hash2` referring to
#' vendor hashes for the comparison
#' @param listName e.g. `titleTokens`, `profileTokens`, `PGPlist`
#' @param functionName either `jaccardSimilarity` or `PGPmatch`. These are
#' currently the only two options.
#' @return vector the length of `nrow(pairsOfInterest)`, containing the
#' resulting similarity scores for each pair
#' @export
fromList <- function(pairsOfInterest, listName, functionName) {
list1 <- listName[pairsOfInterest$order1]
list2 <- listName[pairsOfInterest$order2]
out <- mapply(functionName, list1, list2, USE.NAMES = FALSE)
return(out)
}
#' Generate pairwise absolute differences from dataframes
#'
#' Given a dataframe where each row corresponds to a vendor hash (can be
#' generated using the functions in `step2_anytime.R`), generate absolute
#' differences between columns. Columns need to be numeric variables. There is
#' also the option to set the result to -1 if the variable being compared has
#' value 0 for both accounts in the pair. This is useful for variables such as
#' the number of tokens in the profiles -- if both have no profile information
#' the similarity should be missing (-1) instead of 0.
#'
#' @param pairsOfInterest dataframe with columns `hash1`, `hash2` referring to
#' vendor hashes for the comparison
#' @param dtfName dataframe containing the columns to be compared
#' @param varName vector of variable names that we want to compute differences
#' for (TODO: now only works if >1 col)
#' @param setToMinus1 logical vector, same length as `varName`
#' @return vector the length of `nrow(pairsOfInterest)`, containing the
#' resulting similarity scores for each pair
#' @export
absFromDataframe <- function(pairsOfInterest, dtfName, varName, setToMinus1) {
info1 <- dtfName[pairsOfInterest$order1, varName]
info2 <- dtfName[pairsOfInterest$order2, varName]
out <- abs(info1 - info2)
if (sum(setToMinus1) > 0) {
tmpCols <- which(setToMinus1 == 1)
putThisIn <- unlist(out[, tmpCols], use.names = FALSE)
putThisIn[which(info1[, tmpCols] == 0 | info2[, tmpCols] == 0)] <- -1
out[, tmpCols] <- putThisIn
}
names(out) <- paste0("diff_", varName)
return(out)
}
#' Generate pairwise similarities given binary vector of sales days
#'
#' Given sales columns generated by `infoFromFeedback()`, produce three pairwise
#' variables: 1. Hamming distance between the sales columns (number of days
#' where one account has sales but the other doesn't), 2. fraction of
#' overlapping sales days: number of days where both accounts have sales / union
#' of sales days, and 3. the number of sales days for account1 + number of sales
#' days for account2
#'
#' @param pairsOfInterest dataframe with columns `hash1`, `hash2` referring to
#' vendor hashes for the comparison
#' @param salesCols dataframe containing the columns representing binary
#' variables for whether account has sales or not (exclude vendor hashes from
#' this dataframe)
#' @return dataframe with three columns: diffSalesDates, salesOverlap,
#' totalSalesDays, as described above
#' @export
fromSalesDays <- function(pairsOfInterest, salesCols) {
info1 <- salesCols[pairsOfInterest$order1, ]
info2 <- salesCols[pairsOfInterest$order2, ]
out <- data.frame(diffSalesDates = apply(info1 != info2, MARGIN = 1, sum))
tmp <- info1 + info2
out$salesOverlap <- apply(tmp, MARGIN = 1, FUN = function(x) sum(x == 2) / sum(x > 0))
out$totalSalesDays <- apply(tmp, MARGIN = 1, sum) # 1 if one person had sales, 2 if both had sales
return(out)
}
# helper function: jaccard similarity (length of intersection/union)
jaccardSimilarity <- function(a, b) {
if (length(a) == 0 || length(b) == 0) {
jaccard <- -1
} else {
jaccard <- length(intersect(a, b)) / length(union(a, b))
}
return(jaccard)
}
# helper function: PGP match -- just checks if the intersection is non-empty
PGPmatch <- function(a, b) {
if (length(a) == 0 || length(b) == 0) {
match <- NA
} else {
match <- ifelse(length(intersect(a, b)) > 0, 1, 0)
}
return(match)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.