R/blockedSN.R

#' Blocked sorted neighbors generic.
#'
#' \code{blockedSN} returns a \code{\link{Blocks}} object created from a single
#' iteration of the blocked sorted neighbors algorithm.
#'
#' @param object \code{\link{Neighbors}} or \code{\link{Blocks}} object to perform BSN algorithm on.
#' @return An object of class \code{\link{Blocks}} containing the neighbors found
#' and keys used during the blocked sorted neighbors iteration. 
#'
#' @export
blockedSN <- function(object, blockVar, repSN, windowSN, keyLength) {
    UseMethod("blockedSN", object)
}

#' @rdname blockedSN
#' @export 
blockedSN.default <- function(object, blockVar, repSN, windowSN, keyLength) {
    print("Blocking allowable on Neighbors and Blocks objects only.")
    print("Please first initialize a Neighbors object.")
    return(NULL)
}

#' @rdname blockedSN
#' @param blockVar Integer or character value. The column on which to block.
#' @param repSN Integer value. How much iterations of sorted neighbors to perform?
#' @param windowSN Integer value. Size of sliding window to use during sorted neighbrs.
#' @param keyLength Numeric value or vector. How many keyVars to concatenate per sort key.
#' @export 
blockedSN.Neighbors <- function(object, blockVar, repSN, windowSN, keyLength) {
    keyVars <- object[["keyVars"]][, "keyVars"]
    keyVars <- keyVars[keyVars != blockVar]
    # generate pseudo random key lengths if none provided
    if (missing(keyLength)) {
        stats::runif(1)
        keyLength <- abs(.Random.seed[1:repSN])%%10
        idx <- keyLength > length(keyVars)
        keyLength[idx] <- length(keyVars)
    }
    if (length(keyLength != repSN)) {
        keyLength <- rep_len(keyLength, repSN)
    }
    # generate keys to sort on
    keys <- lapply(1:repSN, function(sn) {
        keyVars[sample(length(keyVars), keyLength)]
    })
    # get unique values to block on
    blockVec <- object[["rawData"]][, blockVar]
    iterval <- unique(blockVec)
    iter <- 1:length(iterval)
    # partition input data to separate blocks
    subdat <- lapply(iter, function(it) {
        object[["rawData"]][blockVec == iterval[it], ]
    })
    names(subdat) <- iter
    rez <- lapply(iter, function(it) {
        #print(it)
        if (nrow(subdat[[it]])==1) {return(NULL)}
        if (nrow(subdat[[it]])==2) {
            nei <- cbind(subdat[[it]][1,object[["ID"]]],subdat[[it]][2,object[["ID"]]])
            colnames(nei) <- c("sorted.ids", "")
            return(nei)
        }
        nei <- lapply(1:repSN, function(sn) {
            if (nrow(subdat[[it]]) > windowSN) {
                sortedNeighbors(subdat[[it]], keys[[sn]], windowSN = windowSN, ID = object[["ID"]])
            } else {
                sortedNeighbors(subdat[[it]], keys[[sn]], windowSN = nrow(subdat[[it]])-1, ID = object[["ID"]])
            }
        })
        do.call(rbind, nei)
    })
    rez <- do.call(rbind, rez)
    rez <- t(apply(rez, 1, sort))
    rez <- unique(rez)
    rez <- cbind(rez, rep(0, nrow(rez)))
    colnames(rez) <- c("ID1", "ID2", "background")
    # promote to Blocks object
    object <- Blocks(object, rez, list(blockVar = blockVar, sortKeys = keys))
    return(object)
}

#' @rdname blockedSN
#' @export 
blockedSN.Blocks <- function(object, blockVar, repSN, windowSN, keyLength) {
    keyVars <- object[["keyVars"]][, "keyVars"]
    keyVars <- keyVars[keyVars != blockVar]
    # generate pseudo random key lengths if none provided
    if (missing(keyLength)) {
        stats::runif(1)
        keyLength <- abs(.Random.seed[1:repSN])%%10
        idx <- keyLength > length(keyVars)
        keyLength[idx] <- length(keyVars)
    }
    if (length(keyLength != repSN)) {
        keyLength <- rep_len(keyLength, repSN)
    }
    # generate keys to sort on
    keys <- lapply(1:repSN, function(sn) {
        keyVars[sample(length(keyVars), keyLength)]
    })
    # get unique values to block on
    blockVec <- object[["rawData"]][, blockVar]
    iterval <- unique(blockVec)
    iter <- 1:length(iterval)
    # partition input data to separate blocks
    subdat <- lapply(iter, function(it) {
        object[["rawData"]][blockVec == iterval[it], ]
    })
    names(subdat) <- iter
    rez <- lapply(iter, function(it) {
        if (nrow(subdat[[it]])==1) {return(NULL)}
        if (nrow(subdat[[it]])==2) {
            nei <- cbind(subdat[[it]][1,object[["ID"]]],subdat[[it]][2,object[["ID"]]])
            colnames(nei) <- c("sorted.ids", "")
            return(nei)
        }
        nei <- lapply(1:repSN, function(sn) {
            if (nrow(subdat[[it]]) > windowSN) {
                sortedNeighbors(subdat[[it]], keys[[sn]], windowSN = windowSN, ID = object[["ID"]])
            } else {
                sortedNeighbors(subdat[[it]], keys[[sn]], windowSN = nrow(subdat[[it]])-1, ID = object[["ID"]])
            }
        })
        do.call(rbind, nei)
    })
    rez <- do.call(rbind, rez)
    rez <- t(apply(rez, 1, sort))
    rez <- unique(rez)
    rez <- cbind(rez, rep(0, nrow(rez)))
    colnames(rez) <- c("ID1", "ID2", "background")
    object <- set(object, "Neighbors", rez)
    object <- set(object, "keysUsed", list(blockVar = blockVar, sortKeys = keys))
    return(object)
}

#' @rdname blockedSN
#' @export 
blockedSN.Scores <- function(object, blockVar, repSN, windowSN, keyLength) {
    keyVars <- object[["keyVars"]][, "keyVars"]
    keyVars <- keyVars[keyVars != blockVar]
    # generate pseudo random key lengths if none provided
    if (missing(keyLength)) {
        stats::runif(1)
        keyLength <- abs(.Random.seed[1:repSN])%%10
        idx <- keyLength > length(keyVars)
        keyLength[idx] <- length(keyVars)
    }
    if (length(keyLength != repSN)) {
        keyLength <- rep_len(keyLength, repSN)
    }
    # generate keys to sort on
    keys <- lapply(1:repSN, function(sn) {
        keyVars[sample(length(keyVars), keyLength)]
    })
    # get unique values to block on
    blockVec <- object[["rawData"]][, blockVar]
    iterval <- unique(blockVec)
    iter <- 1:length(iterval)
    # partition input data to separate blocks
    subdat <- lapply(iter, function(it) {
        object[["rawData"]][blockVec == iterval[it], ]
    })
    names(subdat) <- iter
    rez <- lapply(iter, function(it) {
        if (nrow(subdat[[it]])==1) {return(NULL)}
        if (nrow(subdat[[it]])==2) {
            nei <- cbind(subdat[[it]][1,object[["ID"]]],subdat[[it]][2,object[["ID"]]])
            colnames(nei) <- c("sorted.ids", "")
            return(nei)
        }
        nei <- lapply(1:repSN, function(sn) {
            if (nrow(subdat[[it]]) > windowSN) {
                sortedNeighbors(subdat[[it]], keys[[sn]], windowSN = windowSN, ID = object[["ID"]])
            } else {
                sortedNeighbors(subdat[[it]], keys[[sn]], windowSN = nrow(subdat[[it]])-1, ID = object[["ID"]])
            }
        })
        do.call(rbind, nei)
    })
    rez <- do.call(rbind, rez)
    rez <- t(apply(rez, 1, sort))
    rez <- unique(rez)
    rez <- cbind(rez, rep(0, nrow(rez)), rep(NA, nrow(rez)))
    colnames(rez) <- c("ID1", "ID2", "background", "matchScore")
    object <- set(object, "Neighbors", rez)
    object <- set(object, "keysUsed", list(blockVar = blockVar, sortKeys = keys))
    return(object)
}
mPloenzke/bsnR documentation built on May 21, 2019, 9:18 a.m.