heisenbrgr: Linking Accounts on Anonymous Marketplaces

#' Generate basic pairwise comparisons for user accounts of interest
#'
#' Given a dataframe containing at least account hashes, IDs and marketplaces,
#' generate basic pairwise comparisons: edit distance between the IDs and
#' whether or not they are from the same marketplace.
#'
#' @param dataframeName name of dataframe that contains at least three columns:
#'   user hashes of interest (named `vendor_hash`), alias (`id`) and marketplace
#'   (`marketplace`).
#' @return dataframe with columns `hash1`, `hash2`, `order1` (row corresponding
#'   to `hash1` in `dataframeName`), `order2`, `id1`, `id2`, `marketplace1`,
#'   `marketplace2`, `idDist`, `sameMarket`
#' @export

getPairs <- function(dataframeName) {
    allPairwise <- t(combn(dataframeName$vendor_hash, 2))
    allPairwise <- data.frame(hash1 = allPairwise[, 1], hash2 = allPairwise[, 2], stringsAsFactors = FALSE)
    allPairwise$order1 <- match(allPairwise$hash1, dataframeName$vendor_hash)
    allPairwise$order2 <- match(allPairwise$hash2, dataframeName$vendor_hash)

    info1 <- dataframeName[allPairwise$order1, c("id", "marketplace")]
    info2 <- dataframeName[allPairwise$order2, c("id", "marketplace")]

    allPairwise$id1 <- info1$id
    allPairwise$id2 <- info2$id

    allPairwise$marketplace1 <- info1$marketplace
    allPairwise$marketplace2 <- info2$marketplace

    allPairwise$idDist <- apply(allPairwise[, c("id1", "id2")], MARGIN = 1, FUN = function(x) adist(x[1], x[2]))

    allPairwise$sameMarket <- ifelse(info1$marketplace == info2$marketplace, 1, 0)

    return(allPairwise)
}

#' Generate pairwise comparisons from list items
#'
#' Given a list item indexed by vendor hashes (such as `titleTokens`, ...,
#' generated using the functions in `step2_anytime.R`), generate Jaccard
#' similarities / return whether or not their intersections are non-empty (used
#' for PGPs).
#'
#' @param pairsOfInterest dataframe with columns `hash1`, `hash2` referring to
#'   vendor hashes for the comparison
#' @param listName e.g. `titleTokens`, `profileTokens`, `PGPlist`
#' @param functionName either `jaccardSimilarity` or `PGPmatch`. These are
#'   currently the only two options.
#' @return vector the length of `nrow(pairsOfInterest)`, containing the
#'   resulting similarity scores for each pair
#' @export

fromList <- function(pairsOfInterest, listName, functionName) {

    list1 <- listName[pairsOfInterest$order1]
    list2 <- listName[pairsOfInterest$order2]

    out <- mapply(functionName, list1, list2, USE.NAMES = FALSE)
    return(out)
}

#' Generate pairwise absolute differences from dataframes
#'
#' Given a dataframe where each row corresponds to a vendor hash (can be
#' generated using the functions in `step2_anytime.R`), generate absolute
#' differences between columns. Columns need to be numeric variables. There is
#' also the option to set the result to -1 if the variable being compared has
#' value 0 for both accounts in the pair. This is useful for variables such as
#' the number of tokens in the profiles -- if both have no profile information
#' the similarity should be missing (-1) instead of 0.
#'
#' @param pairsOfInterest dataframe with columns `hash1`, `hash2` referring to
#'   vendor hashes for the comparison
#' @param dtfName dataframe containing the columns to be compared
#' @param varName vector of variable names that we want to compute differences
#'   for (TODO: now only works if >1 col)
#' @param setToMinus1 logical vector, same length as `varName`
#' @return vector the length of `nrow(pairsOfInterest)`, containing the
#'   resulting similarity scores for each pair
#' @export

absFromDataframe <- function(pairsOfInterest, dtfName, varName, setToMinus1) {

    info1 <- dtfName[pairsOfInterest$order1, varName]
    info2 <- dtfName[pairsOfInterest$order2, varName]

    out <- abs(info1 - info2)

    if (sum(setToMinus1) > 0) {
        tmpCols <- which(setToMinus1 == 1)

        putThisIn <- unlist(out[, tmpCols], use.names = FALSE)
        putThisIn[which(info1[, tmpCols] == 0 | info2[, tmpCols] == 0)] <- -1

        out[, tmpCols] <- putThisIn
    }

    names(out) <- paste0("diff_", varName)

    return(out)
}

#' Generate pairwise similarities given binary vector of sales days
#'
#' Given sales columns generated by `infoFromFeedback()`, produce three pairwise
#' variables: 1. Hamming distance between the sales columns (number of days
#' where one account has sales but the other doesn't), 2. fraction of
#' overlapping sales days: number of days where both accounts have sales / union
#' of sales days, and 3. the number of sales days for account1 + number of sales
#' days for account2
#'
#' @param pairsOfInterest dataframe with columns `hash1`, `hash2` referring to
#'   vendor hashes for the comparison
#' @param salesCols dataframe containing the columns representing binary
#'   variables for whether account has sales or not (exclude vendor hashes from
#'   this dataframe)
#' @return dataframe with three columns: diffSalesDates, salesOverlap,
#'   totalSalesDays, as described above
#' @export

fromSalesDays <- function(pairsOfInterest, salesCols) {
    info1 <- salesCols[pairsOfInterest$order1, ]
    info2 <- salesCols[pairsOfInterest$order2, ]

    out <- data.frame(diffSalesDates = apply(info1 != info2, MARGIN = 1, sum))
    tmp <- info1 + info2
    out$salesOverlap <- apply(tmp, MARGIN = 1, FUN = function(x) sum(x == 2) / sum(x > 0))
    out$totalSalesDays <- apply(tmp, MARGIN = 1, sum) # 1 if one person had sales, 2 if both had sales
    return(out)
}

# helper function: jaccard similarity (length of intersection/union)
jaccardSimilarity <- function(a, b) {
    if (length(a) == 0 || length(b) == 0) {
        jaccard <- -1
    } else {
        jaccard <- length(intersect(a, b)) / length(union(a, b))
    }
    return(jaccard)
}

# helper function: PGP match -- just checks if the intersection is non-empty
PGPmatch <- function(a, b) {
    if (length(a) == 0 || length(b) == 0) {
        match <- NA
    } else {
        match <- ifelse(length(intersect(a, b)) > 0, 1, 0)
    }
    return(match)
}

xhtai/heisenbrgr documentation built on June 8, 2019, 9:30 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

xhtai/heisenbrgr
Linking Accounts on Anonymous Marketplaces

R/step3_generateComparisons.R
In xhtai/heisenbrgr: Linking Accounts on Anonymous Marketplaces

R Package Documentation

Browse R Packages

We want your feedback!

xhtai/heisenbrgr Linking Accounts on Anonymous Marketplaces

R/step3_generateComparisons.R In xhtai/heisenbrgr: Linking Accounts on Anonymous Marketplaces

R Package Documentation

Browse R Packages

We want your feedback!

xhtai/heisenbrgr
Linking Accounts on Anonymous Marketplaces

R/step3_generateComparisons.R
In xhtai/heisenbrgr: Linking Accounts on Anonymous Marketplaces