AIRR-seq)

Documented in cluster_germline fine_clustering

globalVariables('ngrp')

#' Cluster contigs by germline properties
#'
#' @param ccdb [ContigCellDB()]
#' @param segment_keys fields in `contig_tbl` that identify a cluster
#' @param cluster_pk name of cluster to be added to `cluster_tbl`
#'
#' @return [ContigCellDB()]
#' @export
#'
#' @examples
#' data(ccdb_ex)
#' ccdb_ex = cluster_germline(ccdb_ex)
#' ccdb_ex$cluster_tbl
cluster_germline = function(ccdb, segment_keys = c('v_gene', 'j_gene', 'chain'), cluster_pk = 'cluster_idx'){
    contig_tbl = ccdb$contig_tbl
    seg_types = contig_tbl %>% group_by(!!!syms(segment_keys)) %>% summarize() %>% ungroup()
    seg_types[[cluster_pk]] = seq_len(nrow(seg_types))
    cl_con_tbl = left_join_warn(seg_types, contig_tbl, by = segment_keys)
    cluster_tbl = as_tibble(unique(cl_con_tbl[union(cluster_pk, segment_keys)]))
    replace_cluster_tbl(ccdb, cluster_tbl, cl_con_tbl, cluster_pk = cluster_pk)
}

globalVariables(c('fc', 'd(medoid)', 'is_medoid', 'n_cluster'))
# attempt to detect if we need nest_legacy or regular nest
nest_pre1.0 = if('nest_legacy' %in% getNamespaceExports('tidyr')) tidyr::nest_legacy else tidyr::nest

# Also canonicalize..
#' Perform additional clustering of sequences within groups
#'
#' @param ccdb A [ContigCellDB()] object
#' @param sequence_key `character` naming column in `contig_tbl` with sequence
#' @param type 'AA' or 'DNA'
#' @param max_affinity `numeric` naming the maximal affinity for the sparse affinity matrix that is constructed.  Not currently used.
#' @param keep_clustering_details `logical` -- should output of `fine_cluster_seqs` be kept as a list column
#' @inheritDotParams fine_cluster_seqs -seqs -type -cluster_fun -cluster_method
#' @importFrom dplyr select bind_cols
#'
#' @example inst/examples/small_cluster_example.R
#' @return [ContigCellDB()] object with updated `contig_tbl` and `cluster_tbl`
#' @export
fine_clustering = function(ccdb, sequence_key, type, max_affinity = NULL, keep_clustering_details = FALSE, ...){
    contig_tbl = ccdb$contig_tbl
    message('Calculating intradistances on ', nrow(ccdb$cluster_tbl), ' clusters.')
    # run `fine_cluster_seqs` within each cluster_pk
    cluster_tbl = contig_tbl %>% group_by(!!!syms(ccdb$cluster_pk)) %>% summarize(fc = list(fine_cluster_seqs(!!sym(sequence_key), type = type, ...)), n_cluster = dplyr::n())
    cluster_tbl_orig = ccdb$cluster_tbl
    cluster_tbl = left_join_warn(cluster_tbl, cluster_tbl_orig, by = ccdb$cluster_pk, overwrite = TRUE)
    message('Summarizing')
    contig_by_cluster = contig_tbl[union(ccdb$contig_pk, ccdb$cluster_pk)]
    contig_by_cluster = nest_pre1.0(contig_by_cluster, !!!syms(ccdb$contig_pk))
    contig_by_cluster = left_join(cluster_tbl[ccdb$cluster_pk], contig_by_cluster, by=ccdb$cluster_pk)

    if(is.null(max_affinity)){
        max_max = max(purrr::map_dbl(cluster_tbl$fc, 'max_dist'))
    } else {
        max_max = max_affinity
    }
    affinities = purrr::map(cluster_tbl$fc, ~ max_max - .[['distance']])
    aff_matrix = Matrix::.bdiag(affinities)
    d_medoid = purrr::map2_dfr(cluster_tbl$fc, contig_by_cluster$data, function(.x, .y){
        is_medoid = seq_len(nrow(.y)) == .x$medoid
        #if(nrow(.y) != length(.x$distance)) browser()
        dplyr::bind_cols(.y, `d(medoid)` = .x$distance, is_medoid = is_medoid)
    })
    contig_tbl = left_join_warn(d_medoid, contig_tbl, by = ccdb$contig_pk, overwrite = TRUE)
    avg_distance = contig_tbl %>% group_by(!!!syms(ccdb$cluster_pk)) %>% summarize(avg_distance = mean(`d(medoid)`))
    cluster_tbl = avg_distance %>% left_join_warn(cluster_tbl, by = ccdb$cluster_pk, overwrite = TRUE)
    if(!keep_clustering_details) cluster_tbl = cluster_tbl %>% dplyr::select(-fc)
    replace_cluster_tbl(ccdb, cluster_tbl, contig_tbl)

}

#' @describeIn left_join_warn perform a `dplyr::right_join()`
#' @export
right_join_warn = function(...) left_join_warn(..., join = right_join)

#' Perform a `dplyr::left_join()` but check for non-key overlapping fields
#'
#' Perform a `dplyr` join, but either warn if the two tables
#' share non-key fields  If `overwrite = TRUE`, then shared columns will pull from `x` otherwise a suffix will be added to `y`.  To perform this check, `by` must be specified, and it is an error if it is not.
#' @inheritParams dplyr::left_join
#' @param overwrite `logical` -- should non-key fields in y be overwritten using x, or should a suffix (".y") be added
#' @param by `character` specifying columns in `x` and `y` to key on.
#' @param join function giving the type of join to perform, eg, left, right, inner, outer.
#' @param ... passed to joining function
#'
#' @return `data.frame` or `tibble`
#' @export
#'
#' @examples
#' left_join_warn(mtcars, mtcars, by  = 'mpg')
#' left_join_warn(mtcars, mtcars, by = 'mpg', overwrite = TRUE)
left_join_warn = function(x, y, by, overwrite = FALSE, join = left_join, ...){
    if(missing(by)) stop('by must be provided')
    nx = setdiff(names(x), by)
    ny = setdiff(names(y), by)
    if(length(inter <- intersect(nx, ny))>0){
        if(overwrite){
            warning("Overwriting fields ",
                    paste(inter, collapse = ', '),
                    ' in table ', deparse(substitute(y)))
            y = y[setdiff(names(y), inter)]
        } else{
            warning('Tables share fields ',
                    paste(inter, collapse = ', '),
                    ', which will gain a suffix in ',
                    deparse(substitute(y)))
        }
    }
    join(x  = x, y = y, by = by, suffix = c('', '.y'), ...)
}

has_fineclustering = function(ccdb){
    ('is_medoid' %in% names(ccdb$contig_tbl))
}

#' Find a canonical contig to represent a cluster
#'
#' @param ccdb [ContigCellDB()]
#' @param contig_filter_args an expression passed to [dplyr::filter()].
#' Unlike `filter`, multiple criteria must be `&` together, rather than using
#' commas to separate. These act on `ccdb$contig_tbl`
#' @param tie_break_keys (optional) `character` naming fields in `contig_tbl`
#' that are used sort the contig table in descending order.
#' Used to break ties if `contig_filter_args` does not return a unique contig
#' for each cluster
#' @param order The rank order of the contig, based on `tie_break_keys`
#' to return.  If `tie_break_keys` included an ordered factor (such as chain)
#' this could be used to return the second chain.
#' @param representative an optional field from `contig_tbl` that will be made
#' unique. Serve as a surrogate `cluster_pk`.
#' @param contig_fields Optional fields from `contig_tbl` that will be copied into
#' the `cluster_tbl` from the canonical contig.
#'
#' @inheritParams left_join_warn
#' @return [ContigCellDB()] with some number of clusters/contigs/cells but with "canonical" values copied into `cluster_tbl`
#' @export
#' @seealso [canonicalize_cell()] [left_join_warn()]
#' @example inst/examples/small_cluster_example.R
canonicalize_cluster = function(ccdb, contig_filter_args,
tie_break_keys = character(), order = 1, representative = ccdb$cluster_pk[1], contig_fields = c('cdr3', 'cdr3_nt', 'chain', 'v_gene', 'd_gene', 'j_gene'), overwrite = TRUE){
    if(missing(contig_filter_args) && 'is_medoid' %in% names(ccdb$contig_tbl)){
        message("Filtering `contig_tbl` by `is_medoid`, override by setting `contig_filter_args == TRUE`")
        contig_filter_args = quote(is_medoid)
    }
    if(!has_fineclustering(ccdb)){
        stop('Run `fine_clustering(ccdb)` first.')
    }
    req_contig_fields = unique(c(contig_fields, representative, tie_break_keys))
    if(length(missing_contig <- setdiff(req_contig_fields, names(ccdb$contig_tbl))) > 0) stop('`contig_tbl` is missing fields, ', paste(missing_contig, collapse = ', '), '.')
    sub_contig_tbl = filter(.data = ccdb$contig_tbl, !!rlang::enexpr(contig_filter_args))

    if(nrow(sub_contig_tbl) != nrow(ccdb$cluster_tbl)){
        message('Subset of `contig_tbl` has ', nrow(sub_contig_tbl) , ' rows for ',
                nrow(ccdb$cluster_tbl), ' clusters. Filling missing values and breaking ties ', appendLF = FALSE)
        if(length(tie_break_keys) == 0) message('arbitrarily by first contig.')
        else message('with ', paste(tie_break_keys, sep = ', '), '.')
    }
    # setup quosures to arrange the data
    arranging = purrr::map(tie_break_keys, ~ rlang::quo(desc(!!sym(.x))))

    # take first row of each cluster
    cluster_tbl = sub_contig_tbl %>% group_by(!!!syms(ccdb$cluster_pk)) %>% dplyr::arrange(!!!arranging)
    idx = cluster_tbl %>% dplyr::transmute(ngrp = dplyr::n(), idx = seq_along(ngrp))
    cluster_tbl = cluster_tbl[idx$idx==order,,drop = FALSE]
    cluster_tbl = cluster_tbl %>% dplyr::select(!!!syms(unique(c(ccdb$cluster_pk, contig_fields, representative))))

    # fill any missing clusters after the filtering
    cluster_tbl = cluster_tbl  %>% ungroup() %>% right_join_warn(ccdb$cluster_tbl, by = ccdb$cluster_pk, overwrite = overwrite)
    cluster_tbl = cluster_tbl %>% mutate(representative = make.unique(as.character(!!sym(representative))), representative = forcats::fct_reorder(representative, n_cluster))
    contig_tbl = left_join_warn(ccdb$contig_tbl, cluster_tbl %>% dplyr::select(!!!syms(ccdb$cluster_pk), representative), by = ccdb$cluster_pk)
    replace_cluster_tbl(ccdb, cluster_tbl, contig_tbl)
}

#' Calculate distances and perform hierarchical clustering on a set of sequences
#'
#' The distances between AA sequences is defined to be 1-score/max(score) times the median length of the input sequences.
#' The distances between nucleotide sequences is defined to be edit_distance/max(edit_distance) times the median length of input sequences.
#'
#' @param seqs character vector, DNAStringSet or AAStringSet
#' @param type character either `AA` or `DNA` specifying type of `seqs`
#' @param big_memory_brute attempt to cluster more than 4000 sequences?  Clustering is quadratic, so this will take a long time and might exhaust memory
#' @param method one of 'substitutionMatrix' or 'levenshtein'
#' @param substitution_matrix a character vector naming a substitution matrix available in Biostrings, or a  substitution matrix itself
#' @param cluster_fun `character`, one of "hclust" or "none", determining if distance matrices should also be clustered with `hclust`
#' @param cluster_method character passed to `hclust`
#'
#' @seealso [hclust()], [Biostrings::stringDist()]
#' @return `list`
#' @export
#' @import Biostrings
#' @examples
#' fasta_path = system.file('extdata', 'demo.fasta', package='CellaRepertorium')
#' aaseq = Biostrings::readAAStringSet(fasta_path)[1:100]
#' cls = fine_cluster_seqs(aaseq, cluster_fun = 'hclust')
#' plot(cls$cluster)
fine_cluster_seqs = function(seqs, type = 'AA', big_memory_brute = FALSE, method = 'levenshtein', substitution_matrix = 'BLOSUM100', cluster_fun = 'none', cluster_method = 'complete'){
    if(length(seqs) > 4000 & !big_memory_brute) stop("Won't try to cluster ", length(seqs), " sequences unless `big_memory_brute` = TRUE.  (Make sure you actually have lots of memory!)")
    type = match.arg(type, choices = c('AA', 'DNA'))
    cluster_fun = match.arg(cluster_fun, c('hclust', 'none'))
    method = match.arg(method, c('substitutionMatrix', 'levenshtein'))

    if(is.character(seqs)){
        if(type == 'AA'){
            ss = AAStringSet(seqs)
            } else {
            ss = DNAStringSet(seqs)
            }
    } else if(inherits(seqs, 'AAStringSet') || inherits(seqs, 'DNAStringSet')){
        ss = seqs
    } else{
        stop("Must be character, AAStringSet or DNAStringSet")
    }

    if(method == 'substitutionMatrix'){
        biostrings_data_avail = data(package = 'Biostrings')$results
        if(substitution_matrix %in% biostrings_data_avail[,'Item']){
            e = environment()
            data(list = substitution_matrix, envir = e, package = 'Biostrings')
            substitution_matrix = e[[substitution_matrix]]
        }
        sd = as.matrix(Biostrings::stringDist(ss, method = method, substitutionMatrix = substitution_matrix))
        # alignment score along diagonal
        pw = Biostrings::pairwiseAlignment(ss, ss, substitutionMatrix = substitution_matrix, scoreOnly = TRUE)
        diag(sd) = pw
        sd  = if(length(ss)>1) (diag(sd) - sd) else matrix(0, nrow = 1, ncol = 1)
    } else{ #levenshtein
        sd = as.matrix(Biostrings::stringDist(ss, method = 'levenshtein'))
        #if(length(ss)>1) sd = sd/max(sd)*median(Biostrings::nchar(ss))
    }

    if(length(seqs)>1 && cluster_fun != 'none'){
        hc = stats::hclust(as.dist(sd), method = cluster_method)
        hc$labels = names(ss)
    } else{
        hc = NULL
    }

    medoid = which.min(colMeans(sd))
    distance = sd[medoid,]
    list(cluster = hc, distance_mat = sd, distance = distance, medoid = medoid, max_dist = max(sd))
}

#' Calculate the entropy of a vector
#'
#' @param v categorical vector
#' @param pseudo_count number of pseudo counts to add on, to stabilize empty categories
#' @param na.action how to handle NA values
#'
#' @return the sample entropy
#' @export
#'
#' @examples
#' v2 = gl(2, 4)
#' v4 = gl(4, 4)
#' stopifnot(entropy(v2) < entropy(v4))
#' v_empty = v2[1:4] #empty level 2
#' stopifnot(is.finite(entropy(v_empty))) # pseudo_count
#'
#' np(v4, p = .2, pseudo_count = 0)
#' np(v4, p = .25, pseudo_count = 0)
#' np(v4, p = .25, pseudo_count = .0001)
#'
#' modal_category(v4)
#' modal_category(v4[-1])
#'
entropy = function(v, pseudo_count = length(v)/1000, na.action = na.fail){
    v = na.action(v)
    pr = table(v)/length(v) + pseudo_count
    e = -sum(log(pr)*pr)
    if(e<0) warning("Negative entropy; adjust pseudo_counts?")
    e
}


#' @export
#' @describeIn entropy The number of categories exceeding `p` proportion of the total
#' @param p proportion threshold
np = function(v, p = .05, pseudo_count = p/5, na.action = na.fail){
    v = na.action(v)
    if(length(v) == 0) return(0)
    pr = table(v)/length(v) + pseudo_count
    e = sum(pr>p)
    e
}

#' @export
#' @describeIn entropy The modal category of v.  Ties are broken by lexicographic order of the factor levels.
modal_category = function(v, na.action = na.fail){
    v = na.action(v)
    if(length(v)>0) names(sort(-table(v)))[1] else NA_character_
}

Any scripts or data that you put into this service are public.

CellaRepertorium documentation built on Nov. 8, 2020, 7:48 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

CellaRepertorium
Data structures, clustering and testing for single cell immune receptor repertoires (scRNAseq RepSeq/AIRR-seq)

R/clustering-methods.R
In CellaRepertorium: Data structures, clustering and testing for single cell immune receptor repertoires (scRNAseq RepSeq/AIRR-seq)

Defines functions fine_clustering cluster_germline

Documented in cluster_germline fine_clustering

Try the CellaRepertorium package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

CellaRepertorium Data structures, clustering and testing for single cell immune receptor repertoires (scRNAseq RepSeq/AIRR-seq)

R/clustering-methods.R In CellaRepertorium: Data structures, clustering and testing for single cell immune receptor repertoires (scRNAseq RepSeq/AIRR-seq)

Defines functions fine_clustering cluster_germline

Documented in cluster_germline fine_clustering

Try the CellaRepertorium package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

CellaRepertorium
Data structures, clustering and testing for single cell immune receptor repertoires (scRNAseq RepSeq/AIRR-seq)

R/clustering-methods.R
In CellaRepertorium: Data structures, clustering and testing for single cell immune receptor repertoires (scRNAseq RepSeq/AIRR-seq)