R/clonotype_table.R

clonotype_table <- function (libs, feats=c("V","pep","J"), data, filter=(data$unproductive | data$ambiguous), minscore=0, minqual=1, sample=FALSE) {

if ( missing (libs) )
	libs <- levels(data$lib)

if ( ! is.character(libs) )
        stop ("Include list of libraries as first argument.")

if ( ! length(libs) == length(unique(libs)) )
	stop ("Redundant list of libraries.")

if ( ! is.data.frame(data) )
	stop ("Input clonotypes must be in a data frame.")

if ( ! is.logical(data$unproductive) )
	stop ('Input missing "unproductive" column.')

if ( ! is.logical(data$ambiguous) )
	stop ('Input missing "ambiguous" column.')

# The following function counts, for a single library, the occurrences of
# segments, CDR3s or combinations of them, and return them as a simple data
# frame of tuples describing in which library, which combination was found how
# many times.  By default it discards the unproductive rearrangements.

feat.freq <- function (lib, filter) {
    if ( all(c('score', 'mapq') %in% colnames(data) ) ) {
      keep <- data$lib == lib & (! filter) & data$score >= minscore & data$mapq >= minqual
    } else {
      keep <- data$lib == lib & (! filter)
    }
    if (length(feats) == 1) {
        feat <- data[keep,feats]
    } else {
        feat <- apply(data[keep,feats], 1, paste, collapse=" ")
    }
    if (sample) {
        feat <- sample(feat, sample)
    }
    feat <- table(as.character(feat))
    data.frame(
        lib   =  lib,
        name  =  rownames(feat),
        count =  as.numeric(feat)
    )
}

# For each library, produce tables of tuples (see above), and append them to a
# data frame.

ff <- data.frame()
for (libname in libs) {
    ff <- rbind(
        ff,
        feat.freq(libname, filter)
    )
}

ff <- reshape(
    ff,
    direction="wide",
    idvar="name",
    timevar="lib"
)
rownames(ff) <- ff$name
colnames(ff) <- c("name",libs)
ff <- ff[, libs, drop=FALSE]
ff[is.na(ff)] <- 0
return(ff)
}
charles-plessy/clonotypeR.old documentation built on May 13, 2019, 3:31 p.m.