R/NB.R

## This is the master function for now.

NB <- function(DT, Prior, Likelihood_MV, Training, Explanatory, tolerance){
    ## Explanatory doesn't do anything so far
    ## Barcode is always the explanatory variable
    ## 'Training' refers to the prediction columns.

    for(i in seq_along(DT)){
        ## For each testing/training DT, add tolerance to Q1/Q3
        DT[[i]][, library.Q1.min := as.numeric(library.Q1-tolerance)]
        DT[[i]][, library.Q1.max := as.numeric(library.Q1+tolerance)]
        DT[[i]][, library.Q3.min := as.numeric(library.Q3-tolerance)]
        DT[[i]][, library.Q3.max := as.numeric(library.Q3+tolerance)]
        Likelihood_MV[[i]][, library.Q1.min := as.numeric(library.Q1-tolerance)]
        Likelihood_MV[[i]][, library.Q1.max := as.numeric(library.Q1+tolerance)]
        Likelihood_MV[[i]][, library.Q3.min := as.numeric(library.Q3-tolerance)]
        Likelihood_MV[[i]][, library.Q3.max := as.numeric(library.Q3+tolerance)]

        ## Rename 'Barcode' to actual barcode
        setnames(Likelihood_MV[[i]], "Barcode", "Training.Barcode", skip_absent=TRUE)

        ## Find all training DT rows that match testing rows based on Q1/Q3
        setkey(DT[[i]],
               library.Q1.min, library.Q1.max,
               library.Q3.min, library.Q3.max)
        setkey(Likelihood_MV[[i]],
               library.Q1.min, library.Q1.max,
               library.Q3.min, library.Q3.max)
        overlap <- foverlaps(DT[[i]], Likelihood_MV[[i]], type="any")

        ## Remove any duplicate columns starting with 'i'
        selectCol <- grep("^i.", colnames(overlap))
        overlap <- overlap[, !selectCol, with=F]
        overlap[, c("library.Q1.min", "library.Q1.max",
                    "library.Q3.min", "library.Q3.max") := NULL]

        ## Compute Gaussian likelihood
        overlap[, Likelihood := (
            (1/(sqrt(2*pi*Variance))) * exp( (-1/2) * ( (value-Mean)^2) / (Variance))
        )]
        overlap[, c("Mean", "Variance") := NULL]

        setnames(overlap, length(overlap), names(DT)[i])
        if(i==1){
            setnames(overlap, "value", "Retention.Time.value")
            DT_return <- overlap
        } else{
            overlap[, "value" := NULL]
            DT_return <- merge(DT_return, overlap, all=T)
        }
    }

    ## Merge 'Prior' with 'DT_return'. Only left join because we want to match
    ## priors with 'DT_return' entries
    DT_return <- merge(DT_return, Prior,
                       by = c("library.Q1", "library.Q3", "Barcode", "matrix"),
                       all.x = T)

    ## Remove extra rows from merged 'Prior'
    DT_return <- DT_return[is.na(run) == F]
    rm(Prior)

    ## Calculate posterior probability (minus normalizing denominator, the marginal likelihood)
    ## Converting likelihoods and prior to a matrix for row product
    mat_return <- as.matrix(DT_return)

    ## Find column names of training variables
    names <- which(colnames(mat_return) %in% Training)
    prior <- which(colnames(mat_return) %in% "prior")

    ## Calculate row product
    mat_return <- mat_return[, c(names, prior)]
    mode(mat_return) = "numeric"
    pp <- rowProds(mat_return, na.rm=TRUE)

    ## Add posterior probability to data table
    DT_return[, "posterior" := pp]
    rm(mat_return)

    ## Re-order columns with Barcode & in the front
    ## iGraph assumes edges are the first 2 columns
    extract <- grep("Barcode", colnames(DT_return))
    temp <- DT_return[, extract, with=FALSE]
    DT_return[, c(extract) := NULL]
    DT_return <- cbind(temp, DT_return)

    ## Maximum weighted bipartite matching for each run
    ## Only one possible testing (run) barcode can match a single training barcode
    ## NAs can be produced if an MRM peak doesn't match anything in the training library
    ## Get length of unique runs (not including NA)
    MWBMrange <- sum(!is.na(DT_return[, unique(run)]))
    DT_return2 <- vector(mode="list", length=MWBMrange)
    for(j in 1:MWBMrange){
        ## Making temporary data table copies to prevent overwriting via assign
        ## by reference
        tempDT2 <- DT_return[run == unique(run)[j]]
        tempDT <- copy(tempDT2)

        ## Create iGraph object for maximum weighted bipartite matching
        ## Vertices are 'Barcode' and 'Training.Barcode'
        ## Prefixes added to Barcode columns to distinguish them for MWBM
        tempDT[, Barcode := paste("Run.", Barcode, sep="")]
        tempDT[, Training.Barcode := paste("Lib.", Training.Barcode, sep="")]
        g <- graph.data.frame(tempDT, directed = T)

        ## Figure out which vertices should be FALSE (column 1) or TRUE (column 2)
        Vin  <- which(names(V(g)) %in% tempDT[,Barcode])
        Vin  <- cbind(Vin, FALSE)
        Vout <- which(!(1:length(V(g)) %in% Vin))
        Vout <- cbind(Vout, TRUE)
        Vall <- rbind(Vin, Vout)
        Vall <- Vall[order(Vall[,1]), 2]
        V(g)$type <- as.logical(Vall)

        ## Edge weights are posterior probabilities of classifier
        E(g)$weight <- as.numeric(tempDT[,posterior])
        MWBM <- max_bipartite_match(g)

        ## Only interested in matching 'Barcode' to 'Training.Barcode'
        myFilter <- tempDT[,Barcode]
        MWBM <- MWBM$matching[which(names(MWBM$matching) %in% myFilter)]
        MWBM <- data.table(Barcode = names(MWBM),
                           Training.Barcode = as.vector(MWBM))
        ## Remove prefixes in both Barcode/Training.Barcode columns
        MWBM[, Barcode := substring(Barcode, 5)]
        MWBM[, Training.Barcode := substring(Training.Barcode, 5)]
        ## Only keep row in 'DT_return' corresponding to MWBM
        setkey(DT_return, "Barcode", "Training.Barcode")
        setkey(MWBM, "Barcode", "Training.Barcode")
        DT_return2[[j]] <- DT_return[run == unique(run)[j]][MWBM, nomatch=0]
    }

    ## Convert list to a data table by rows
    DT_return2 <- do.call(rbind, DT_return2)

    ## Figure out which label to assign based on MWBM
    setkeyv(DT_return,  colnames(DT_return))
    setkeyv(DT_return2, colnames(DT_return2))
    DT_return3 <- DT_return[DT_return2]
    DT_return4 <- DT_return[!DT_return2]
    DT_return3[, "NoClass" := ifelse(is.na(posterior) == T | posterior == 0, TRUE, FALSE)]
    DT_return3[, "MWBM" := ifelse(is.na(NoClass) == F, TRUE, FALSE)]
    DT_return4[, "NoClass" := NA]
    DT_return4[, "MWBM" := F]
    DT_return <- rbind(DT_return3, DT_return4)
    rm(DT_return2, DT_return3, DT_return4, MWBM, myFilter, g, tempDT)

    ## "Correct" barcode assignment based on MWBM
    DT_return[, "Actual_choice" := ifelse(Barcode == Training.Barcode, TRUE, FALSE)]
    DT_correct <- DT_return[MWBM == T & Actual_choice == T & is.na(NoClass) == F]

    ## Incorrect barcode assignment
    DT_incorrect <- DT_return[MWBM == T & Actual_choice == F & NoClass == F]

    ## No assignment
    DT_noclass <- DT_return[NoClass == T | is.na(NoClass) == T]

    return(list(complete=DT_return,
                correct=DT_correct,
                incorrect=DT_incorrect,
                noclass=DT_noclass))
}
jchitpin/blistR documentation built on July 8, 2019, 6:29 p.m.