R/diversityAnalysis.R
In abseqR: Reporting and data analysis functionalities for Rep-Seq datasets of antibody libraries

Documented in .aminoAcidBar .aminoAcidPlot .calculateDInd .calculateDiversityEstimates .diversityAnalysis .generateAllSpectratypes .plotDiversityCurves .plotDuplication .plotRarefaction .plotRecapture .regionAnalysis .reportLBE

#' Duplication level plot
#'
#' @description bins singletons, doubletons, and higher order clonotypes
#' into a line plot
#'
#' @import ggplot2
#' @include util.R
#'
#' @param files list type. List of strings to _cdr_v_duplication.csv pathname
#' @param sampleNames vector type. Vector of strings each representing
#' sample names
#' @param regions vector type.
#'  Which regions to include in the plot. Default = c("CDR3", "V")
#'
#' @return ggplot2 object
.plotDuplication <-
    function(files, sampleNames, regions = c("CDR3", "V")) {
        nsamples <- length(files)

        if (nsamples != length(sampleNames)) {
            stop("Expected equal number of sample names and dataframes, got ",
                 length(sampleNames),
                 " samples and ",
                 nsamples, " dataframes.")
        }

        message("Creating duplication plot for samples ",
                paste(sampleNames, collapse = ", "))

        # read xticks and xlimits from first 2 row
        trimwsNoQuotes <- function(x) {
            gsub("'", "", trimws(x))
        }

        # trimwsNoQuotes strips whitespaces AND single quotes
        fp <- file(files[[1]], "r")
        xticks <-
            strtoi(unlist(lapply(strsplit(
                readLines(fp, n = 1), ","
            )[[1]],
            trimws)))
        xlabels <-
            unlist(lapply(strsplit(readLines(fp, n = 1), ",")[[1]],
                          trimwsNoQuotes))
        close(fp)

        # read files into dataframes
        dataframes <- lapply(files, read.csv, skip = 2)

        # pre-processing & cleanup
        for (i in seq_len(nsamples)) {
            df <- dataframes[[i]]
            df$sample  <- sampleNames[[i]]
            dataframes[[i]] <- df[df$region %in% regions,]
        }

        # combine!
        df.union <- do.call("rbind", dataframes)

        g <- ggplot(df.union, aes(x = x, y = y))

        if (nsamples == 1) {
            g <- g + geom_line(aes(linetype = region),
                               color = BLUEHEX,
                               size = 0.65) +
                guides(color = FALSE)
        } else {
            g <- g + geom_line(aes(linetype = region, color = sample),
                               size = 0.65)
        }

        g <- g +
            scale_linetype_manual(values = .getLineTypes(regions)) +
            scale_x_continuous(breaks = xticks, labels = xlabels) +
            labs(
                title = paste(
                    "Sequence duplication levels of",
                    paste(regions, collapse = ", "),
                    "in",
                    paste(sampleNames, collapse = ", ")
                ),
                x = "Duplication level",
                y = "Proportion of duplicated sequences"
            ) +
            theme_bw() +
            theme(axis.text.x = element_text(angle = 45, hjust = 1))
        return(g)
    }




#' Rarefaction plot
#'
#' @description Plots the number of unique clonotypes (on the y-axis)
#' drawn from a sample size on the x axis. The number of unique clonotypes
#' is averaged over 5 repeated rounds.
#'
#' @import ggplot2
#' @include util.R
#'
#' @param files list type. A list of files consisting of path to samples
#' @param sampleNames vector type. A vector of strings,
#'  each being the name of samples in files
#' @param regions vector type. A vector of strings,
#' regions to be included. Defaults to c("CDR3", "V")
#'
#' @return ggplot2 object
.plotRarefaction <-
    function(files, sampleNames, regions = c("CDR3", "V")) {
        nsamples <- length(files)
        # sanity check
        if (length(sampleNames) != nsamples) {
            stop("Expected equal number of sample names and dataframes, got ",
                 length(sampleNames),
                 " samples and ",
                 nsamples, " dataframes.")
        }

        message("Creating rarefaction plot for samples ",
                paste(sampleNames, collapse = ", "))

        # find the minimum xtick value from all the samples to plot as the
        # max xtick value on the actual graph (i.e. the graph is truncated to the
        # smallest maximum xtick value from the pool of samples)
        fp <- file(files[[1]], "r")
        xticks <-
            strtoi(unlist(lapply(strsplit(
                readLines(fp, n = 1), ","
            )[[1]],
            trimws)))
        close(fp)

        # if there are more
        if (nsamples > 1) {
            for (i in 2:nsamples) {
                fp <- file(files[[i]], "r")
                candidate <- strtoi(unlist(lapply(
                    strsplit(readLines(fp, n = 1), ",")[[1]], trimws
                )))
                close(fp)
                if (tail(candidate, n = 1) < tail(xticks, n = 1)) {
                    xticks <- candidate
                }
            }
        }

        # read files
        dataframes <- lapply(files, read.csv, skip = 1)

        # pre-processing & cleaning
        for (i in seq_len(nsamples)) {
            df <- dataframes[[i]]
            df$sample <- sampleNames[[i]]
            df <- df[df$region %in% regions,]
            dataframes[[i]] <- .summarySE(df,
                                          measurevar = 'y',
                                          groupvars = c('x', 'region', 'sample'))
        }

        # merge
        df.union <- do.call("rbind", dataframes)

        # make compound column of region . sample for geom_ribbon
        df.union$compound <- paste(df.union$sample, df.union$region)

        g <- ggplot(df.union, aes(x = x, y = y))

        if (nsamples == 1) {
            g <- g + geom_line(aes(linetype = region),
                               color = BLUEHEX,
                               size = 0.65) +
                guides(color = FALSE)
        } else {
            g <- g +
                geom_line(aes(linetype = region, color = sample), size = 0.65)
        }

        g <- g +
            scale_linetype_manual(values = .getLineTypes(regions)) +
            scale_x_continuous(breaks = xticks,
                                    limits = c(head(xticks, n = 1),
                                               tail(xticks, n = 1))) +
            geom_ribbon(
                aes(
                    ymin = y - ci,
                    ymax = y + ci,
                    fill = compound
                ),
                alpha = 0.1,
                show.legend = FALSE
            ) +
            labs(
                title = paste(
                    "Rarefaction of",
                    paste(regions, collapse = ", "),
                    "in",
                    paste(sampleNames, collapse = ", ")
                ),
                subtitle = "Mean number of deduplicated sequences with 95% confidence interval",
                x = 'Sample size',
                y = "Number of deduplicated sequences"
            ) +
            theme_bw() +
            theme(axis.text.x = element_text(angle = 90, hjust = 1))
        return(g)
    }



#' Plots capture-recapture
#'
#' @description Plots the percent of recapture clonotypes (on the y-axis)
#' drawn from a repeated (with replacement) sample size on the x axis. The
#' percentage of recaptured clonotypes is averaged over 5 recapture rounds.
#'
#' @import ggplot2
#' @include util.R
#'
#' @param files list type. List of _cdr_v_recapture.csv.gz files.
#' @param sampleNames vector type. A vector of strings each
#' representing the name of samples in files.
#' @param regions vector type. A vector of strings,
#' regions to be included in the plot. defaults to c("CDR3", "V")
#'
#' @return ggplot2 object
.plotRecapture <-
    function(files, sampleNames, regions = c("CDR3", "V")) {
        nsamples <- length(files)
        # sanity check
        if (nsamples != length(sampleNames)) {
            stop("Expected equal number of sample names and dataframes, got ",
                 length(sampleNames),
                 " samples and ",
                 nsamples, " dataframes.")
        }

        message("Creating recapture plot for samples ",
                paste(sampleNames, collapse = ", "))

        # find the minimum xtick value from all the samples to plot as the
        # max xtick value on the actual graph (i.e. the graph is truncated to the
        # smallest maximum xtick value from the pool of samples)
        fp <- file(files[[1]], "r")
        xticks <-
            strtoi(unlist(lapply(strsplit(
                readLines(fp, n = 1), ","
            )[[1]],
            trimws)))
        close(fp)

        # if there are more
        if (nsamples > 1) {
            for (i in 2:nsamples) {
                fp <- file(files[[i]], "r")
                candidate <- strtoi(unlist(lapply(
                    strsplit(readLines(fp, n = 1), ",")[[1]], trimws
                )))
                close(fp)
                if (tail(candidate, n = 1) < tail(xticks, n = 1)) {
                    xticks <- candidate
                }
            }
        }

        # read dataframes
        dataframes <- lapply(files, read.csv, skip = 1)

        # cleanup & pre-processing
        for (i in seq_len(nsamples)) {
            df <- dataframes[[i]]
            # append sample name to a new column named sample
            df$sample <- sampleNames[[i]]
            # only want selected regions - ignore others
            df <- df[df$region %in% regions, ]
            # get mean, sd, se, and ci
            dataframes[[i]] <- .summarySE(df,
                                          measurevar = 'y',
                                          groupvars = c("x", "region", "sample"))
        }

        df.union <- do.call("rbind", dataframes)

        # make compound column for geom_ribbon (region . sample)
        df.union$compound <- paste(df.union$region, df.union$sample)

        # plot!
        p <- ggplot(df.union, aes(x = x, y = y))

        if (nsamples == 1) {
            p <- p + geom_line(aes(linetype = region),
                               color = BLUEHEX,
                               size = 0.65) +
                guides(color = FALSE)
        } else {
            p <- p +
                geom_line(aes(linetype = region, color = sample), size = 0.65)
        }

        p <- p +
            scale_linetype_manual(values = .getLineTypes(regions)) +
            scale_x_continuous(breaks = xticks) +
            geom_ribbon(
                aes(
                    ymin = y - ci,
                    ymax = y + ci,
                    fill = compound
                ),
                alpha = 0.1,
                show.legend = FALSE
            ) +
            labs(
                title = paste(
                    "Percent recapture of",
                    paste(regions, collapse = ", "),
                    "in",
                    paste(sampleNames, collapse = ", ")
                ),
                subtitle = "Mean number of recaptured sequences with 95% confidence interval",
                x = "Sample size",
                y = "Percent Recapture"
            ) + theme_bw() +
            theme(axis.text.x = element_text(angle = 90, hjust = 1))

        return(p)
    }


#' Title Shows varying regions for a given clonotype defined by its CDR3
#'
#' @import reshape2
#' @import ggplot2
#'
#' @param path string type. Path to diversity folder
#' where <sampleName>_clonotype_diversity_region_analysis.csv.gz is located
#' @param sampleName string type
#' @param top int type. Top N number of clones to analyze
#'
#' @return ggplot2 object
.regionAnalysis <- function(path, sampleName, top = 15) {
    message("Starting clonotype region analysis for ", sampleName)

    df <- read.csv(
        paste0(
            path,
            sampleName,
            "_clonotype_diversity_region_analysis.csv.gz"
        ),
        stringsAsFactors = FALSE
    )

    headers <- c("fr1", "cdr1", "fr2", "cdr2", "fr3", "fr4")
    # sort the df with decreasing counts of CDR3 occurance
    df <- df[with(df, order(-count)),]

    # add new column to sum the "unique" regions
    df$sumcounts = rowSums(df[, headers])

    # grab only those whos V-domain will differ.
    #  This means sumcounts != 6 (> 6) where 6 = length(headers)
    df <- df[df$sumcounts > 6,]

    # grab top N
    df <- head(df, top)

    # add new row as reference of "unique regions"
    de <- data.frame("REFERENCE", Inf, 1, 1, 1, 1, 1, 1, 6)
    names(de) <- names(df)
    df <- rbind(df, de)


    # reshape to multi-row
    df.mel <- melt(df, measure.vars = headers)
    df.mel[, "value"] = df.mel[, "value"] / df.mel[, "sumcounts"]


    # plot!
    g <- ggplot(df.mel,
                aes(
                    x = cdr3,
                    y = value,
                    fill = variable,
                    label = sprintf("%0.2f%%",
                                    round(value * 100, digits = 2))
                )) +
        geom_bar(stat = 'identity') +
        theme(text = element_text(size = 10),
              axis.text.x = element_text(angle = 65, hjust = 1)) +
        #geom_text(position = position_stack(vjust = 0.5)) +
        #stat_summary(fun.y = sum, aes(label = sumcounts, group=cdr3),
        #             geom='text', vjust=-.2) +
        labs(
            title = paste(
                sampleName,
                "varying levels of FRs and CDRs of top",
                top,
                "CDR3 clonotype"
            ),
            subtitle = "Counts of unique regions for a given CDR3",
            x = "CDR3",
            y = "Proportion"
        ) +
        guides(fill = guide_legend(title = "Region")) +
        scale_x_discrete(limits = c("REFERENCE", head(df, -1)$cdr3))
    return(g)
}

#' Composition logo plot
#'
#' @description Plots 2 kinds: scaled and unscaled composition logos
#'
#' @param compositionDirectory string type.
#' @param outdir string type.
#' @param sampleName string type.
#' @param regions logical type. vector of FR/CDR regions to plot
#' @param .save logical type. save ggplot object
#'
#' @import stringr
#'
#' @return none
.aminoAcidPlot <- function(compositionDirectory,
                           outdir,
                           sampleName,
                           regions = c("FR1", "CDR1", "FR2", "CDR2", "FR3", "CDR3", "FR4"),
                           .save = TRUE) {
    for (region in regions) {
        dirName <- file.path(compositionDirectory, region)
        outputPath <- file.path(outdir, region)
        if (!dir.exists(outputPath)) {
            dir.create(outputPath)
        }

        summaryPlot <-
            file.path(dirName, paste0(sampleName, "_cumulative_logo.csv"))
        df <- read.csv(summaryPlot)
        g1 <- .aminoAcidBar(df, scale = FALSE, region)
        g2 <- .aminoAcidBar(df, scale = TRUE, region)
        fname <-
            file.path(outputPath,
                      paste0(sampleName, "_cumulative_logo.png"))
        fnameScaled <-
            file.path(outputPath,
                      paste0(sampleName, "_cumulative_logo_scaled.png"))
        ggsave(fname, plot = g1, width = V_WIDTH, height = V_HEIGHT)
        ggsave(fnameScaled, plot = g2, width = V_WIDTH, height = V_HEIGHT)
        .saveAs(.save, fname, g1)
        .saveAs(.save, fnameScaled, g2)

        germlineSpecific <-
            list.files(
                path = dirName,
                pattern = paste0(sampleName,
                                 "_.+_cumulative_logo\\.csv(\\.gz)?$"),
                full.names = TRUE
            )

        lapply(germlineSpecific, function(gLogoFile) {
            germName <- sub(
                "_cumulative_logo\\.csv(\\.gz)?$",
                "",
                stringr::str_extract(gLogoFile, "IG[HKL][VDJ].*")
            )
            df <- read.csv(gLogoFile)
            g1 <-
                .aminoAcidBar(df, scale = FALSE, region, germ = germName)
            g2 <-
                .aminoAcidBar(df, scale = TRUE, region, germ = germName)
            fname <-
                file.path(outputPath,
                          paste0(sampleName, "_", germName, "_cumulative_logo.png"))
            fnameScaled <-
                file.path(
                    outputPath,
                    paste0(
                        sampleName,
                        "_",
                        germName,
                        "_cumulative_logo_scaled.png"
                    )
                )
            ggsave(fname, plot = g1, width = V_WIDTH, height = V_HEIGHT)
            ggsave( fnameScaled, plot = g2, width = V_WIDTH, height = V_HEIGHT)
            .saveAs(.save, fname, g1)
            .saveAs(.save, fnameScaled, g2)
        })
    }
}

#' Plots amino acid composition logo
#'
#' @import ggplot2 stats
#'
#' @param df dataframe
#' @param scale logical. scale to proportion?
#' @param region string. which region is this
#' @param germ string. V germline family
#'
#' @return ggplot2 object
.aminoAcidBar <- function(df, scale, region, germ = "") {
    # oranges(G - T), greens(C - W), purples (N - H), reds (D, E), blues (K, R)
    group.colors <-
        c(
            G = "#e65c00",
            A = "#ff751a",
            S = "#ff8533",
            T = "#ff944d",
            C = "#003300",
            V = "#145214",
            I = "#006622",
            L = "#1f7a1f",
            P = "#009933",
            F = "#29a329",
            Y = "#00b33c",
            M = "#2eb82e",
            W = "#33cc33",
            N = "#330066",
            Q = "#4d0099",
            H = "#6600cc",
            D = "#990000",
            E = "#b30000",
            K = "#000099",
            R = "#0000cc"
        )
    df.agg <- aggregate(count ~ position, df, sum)

    # get the max counts for each position - then xlabel will contain
    # the amino acid character for that position - break ties on first occurance
    df.max <- merge(aggregate(count ~ position, df, max), df)
    df.max <- df.max[!duplicated(df.max[c(1, 2)]),]
    xlabels <-
        lapply(df.max[with(df.max, order(position)),]$aa, as.character)

    total <- max(df.agg$count)
    if (scale) {
        df$proportion <- df$count / total
        subs <- "Scaled to proportion"
    } else {
        df.tmp <- merge(df, df.agg, by = "position")
        df.tmp <- df.tmp[with(df.tmp, order(position)),]
        # if not scaled, divide within its own position rather than
        # overall (i.e. the max)
        df$proportion <- df.tmp$count.x / df.tmp$count.y
        subs <- ""
    }

    df$aa <-
        factor(
            df$aa,
            levels = c(
                "G",
                "A",
                "S",
                "T",
                "C",
                "V",
                "I",
                "L",
                "P",
                "F",
                "Y",
                "M",
                "W",
                "N",
                "Q",
                "H",
                "D",
                "E",
                "K",
                "R"
            )
        )
    g <- ggplot(df, aes(x = position, y = proportion)) +
        geom_bar(stat = "identity", aes(fill = aa)) +
        labs(
            title = paste0(germ, " ", region, " (", total, ")"),
            subtitle = subs,
            x = "amino acid",
            y = "proportion"
        ) +
        scale_x_continuous(breaks = df.agg$position, labels = xlabels) +
        scale_fill_manual(values = group.colors, drop = FALSE) +
        theme(legend.title = element_blank(),
              legend.text = element_text(size = 5))
    return(g)
}

##  dataframe is something like:
##  +--------------------------+
##  | Clonotype | Count | prop |
##  +--------------------------+
##  |           |       |      |
##  |           |       |      |
##  |           |       |      |
##  |  .......  |  ...  | .... |
##  +--------------------------+
#' Reports abundance-based (Lower bound) diversity estimates using the Vegan package
#'
#' @import vegan
#'
#' @param df clonotype dataframe. Vegan format:
#' +---------------------------+
#' | S.1| S.2| S.3 | S.4 | ... |   (each species should have its own column)
#' +---------------------------+
#' | v1 |v2  | v3  | ....      |   (each species' count values are placed in the corresponding column)
#' +---------------------------+
#'
#' @return dataframe with the format:
#' +----------------------------------------------------------------+
#' | S.obs | S.chao1 | se.chao1 | S.ACE | se.ACE | s.jack1 | s.jack2|
#' +----------------------------------------------------------------+
#' | v1    |  v2 ....                                               |
#' +----------------------------------------------------------------+
.reportLBE <- function(df) {
    f1.f2 <- unlist(lapply(seq_len(2), function(i) {
        sum(df[1,] == i)
    }))

    lbe <- estimateR(df)
    s.obs <- lbe[1]
    s.jack1 <- s.obs + f1.f2[1]
    s.jack2 <- s.obs + 2 * f1.f2[1] - f1.f2[2]

    lbe <-
        rbind(as.data.frame(lbe),
              S.jack1 = s.jack1,
              S.jack2 = s.jack2)

    df.out <- as.data.frame(t(lbe[, 1]))
    names(df.out) <- rownames(lbe)
    return(df.out)
}

#' Calculates the "standard" diversity indices
#'
#' @import vegan
#'
#' @param df clonotype dataframe. Vegan format:
#' +---------------------------+
#' | S.1| S.2| S.3 | S.4 | ... |   (each species should have its own column)
#' +---------------------------+
#' | v1 |v2  | v3  | ....      |   (each species' count values are placed in the corresponding column)
#' +---------------------------+
#'
#' @return dataframe with the column headers:
#' shannon , simpson.con , simpson.inv , simpson.gini , renyi.0 ,
#' renyi.1 , renyi.2 , renyi.Inf , hill.0 , hill.1 , hill.2 , hill.Inf
#'
#' renyi.0 => species richness
#' renyi.1 => shannon entropy
#' renyi.2 => inv.gini
#' renyi.Inf => min.entropy
#'
#' finally:
#'    hill_a = exp(renyi_a)
#'
.calculateDInd <- function(df) {
    renyi.scales = c(0, 1, 2, Inf)
    renyi <- renyi(df, scales = renyi.scales)
    hill <- exp(renyi)
    shannon <- vegan::diversity(df, index = "shannon")
    simpson <- vegan::diversity(df, index = "simpson")
    n.species <- ncol(df)

    as.data.frame(
        cbind(
            shannon = shannon,
            shannon.norm = shannon / log(n.species),
            simpson.gini = simpson,
            simpson.inv = vegan::diversity(df, index = "invsimpson"),
            simpson.con = 1 - simpson,
            renyi.0 = renyi['0'],
            renyi.1 = renyi['1'],
            renyi.2 = renyi['2'],
            renyi.Inf = renyi['Inf'],
            hill.0 = hill['0'],
            hill.1 = hill['1'],
            hill.2 = hill['2'],
            hill.Inf = hill['Inf']
        )
    )
}


#' Title Diversity analysis
#'
#' @import ggplot2
#' @include util.R
#' @include distributions.R
#'
#' @param diversityDirectories list type. List of directories to diversity dir
#' @param diversityOut string type. Output directory
#' @param sampleNames vector type. 1-1 with diversityDirectories
#' @param mashedNames string type. Prefix for output files using "mashed-up"
#' sample names
#' @param .save logical type. Save ggplot object?
#'
#' @return None
.diversityAnalysis <- function(diversityDirectories,
                               diversityOut,
                               sampleNames,
                               mashedNames,
                               .save = TRUE) {
    message("Starting diversity analysis on samples ",
            paste(sampleNames, collapse = ", "))

    # fr/cdr plots --------------------------------------------------------
    # plot duplication, rarefaction, recapture
    lapply(c("cdr", "cdr_v", "fr"), function(region) {
        .plotDiversityCurves(region, diversityDirectories,
                             sampleNames, mashedNames,
                             diversityOut, .save = .save)
    })

    # generate FR1-4, CDR1-3, CDR3 no outliers, and V spectratypes -------
    .generateAllSpectratypes(diversityDirectories,
                             diversityOut,
                             sampleNames,
                             mashedNames,
                             .save = .save)

    # generate composition logos -----------------------------------------
    # not applicable to multi-sample scenario
    if (length(sampleNames) == 1) {
        compDir <- file.path(diversityDirectories[[1]], "composition_logos")
        compOut <- file.path(diversityOut, "composition_logos")
        if (!file.exists(compOut)) {
            dir.create(compOut)
        }
        message("Plotting composition logos on samples ",
                paste(sampleNames, collapse = ", "))
        .aminoAcidPlot(compDir, compOut, sampleNames[1])
    }

    # we can plot region analysis if there's only one sample
    # this feature is disabled until the backend has been optimized
    #if (length(sampleNames) == 1) {
    #  # default = top 15
    #  g <- .regionAnalysis(diversityOut, sampleNames[1])
    #  ggsave(paste0(diversityOut, mashedNames, "_region_analysis.png"),
    #         plot = g, width = V_WIDTH, height = V_HEIGHT)
    #}

    # Calculate Lower Bound Estimate and diversity indices --------------------
    .calculateDiversityEstimates(diversityDirectories,
                                 diversityOut,
                                 sampleNames)
}

#' Plots rarefaction, recapture, and de-dup plots for specified \code{region}
#'
#' @import ggplot2
#' @include util.R
#' @include distributions.R
#'
#' @param region string type. One of: "cdr", "cdr_v", and "fr". "cdr" means
#' CDR1-3, "cdr_v" means CDR3 and V only, and finally "fr" means FR1-4.
#' @param diversityDirectories list type. List of directories to diversity dir
#' @param sampleNames vector type. 1-1 with diversityDirectories
#' @param mashedNames string type. Prefix for output files using "mashed-up"
#' @param diversityOut string type. Output directory sample names
#' @param .save logical type. Save ggplot object?
#'
#' @return Nothing
.plotDiversityCurves <- function(region,
                                 diversityDirectories,
                                 sampleNames,
                                 mashedNames,
                                 diversityOut,
                                 .save = TRUE) {
    if (region == "cdr") {
        includedRegions <- c("CDR1", "CDR2", "CDR3")
    } else if (region == "cdr_v") {
        includedRegions <- c("CDR3", "V")
    } else {
        includedRegions <- c("FR1", "FR2", "FR3")
    }

    plotTypes <- c("duplication" = .plotDuplication,
                   "rarefaction" = .plotRarefaction,
                   "recapture" = .plotRecapture)

    lapply(seq_along(plotTypes), function(i) {
        functor <- plotTypes[[i]]
        ptype <- names(plotTypes)[[i]]

        searchFiles <-
            .listFilesInOrder(path = diversityDirectories,
                              pattern =  paste0(".*_", region,
                                                "_", ptype, "\\.csv(\\.gz)?$"))
        if (length(searchFiles) > 0) {
            g <- functor(searchFiles, sampleNames, includedRegions)
            saveName <-
                file.path(diversityOut,
                          paste0(mashedNames, "_", region, "_", ptype, ".png"))
            ggsave(saveName,
                   plot = g,
                   width = V_WIDTH,
                   height = V_HEIGHT)
            .saveAs(.save, saveName, g)
        } else {
            warning("Could not find ", ptype, " files in ",
                    paste(diversityDirectories, collapse = ", "))
        }
    })
}



#' Generates all FR/CDR spectratypes
#'
#' @import ggplot2
#' @include util.R
#' @include distributions.R
#'
#' @param diversityDirectories list type. List of directories to diversity dir
#' @param diversityOut string type. Output directory
#' @param sampleNames vector type. 1-1 with diversityDirectories
#' @param mashedNames string type. Prefix for output files using "mashed-up"
#' sample names
#' @param .save logical type. Save ggplot object?
#'
#' @return Nothing
.generateAllSpectratypes <- function(diversityDirectories,
                                     diversityOut,
                                     sampleNames,
                                     mashedNames,
                                     .save = TRUE) {
    specOut <- file.path(diversityOut, "spectratypes")
    if (!file.exists(specOut)) {
        dir.create(specOut)
    }
    message("Plotting spectratypes on samples ",
            paste(sampleNames, collapse = ", "))

    regionTypes <- list("cdr" = seq_len(3), "fr" = seq_len(4))
    lapply(seq_along(regionTypes), function(i) {
        # grab the region (CDR/FR)
        region <- names(regionTypes)[[i]]
        # for CDR, 1-3, else 1-4
        lapply(regionTypes[[i]], function(j) {
            specFiles <-
                .listFilesInOrder(path = diversityDirectories,
                                  pattern = paste0(".*_",
                                                   region,
                                                   j,
                                                   "_spectratype\\.csv(\\.gz)?$"))
            if (length(specFiles) > 0) {
                g <- .plotSpectratype(lapply(specFiles, read.csv,
                                             stringsAsFactors = FALSE),
                                      sampleNames,
                                      paste0(toupper(region), j))
                saveName <- file.path(specOut,
                                      paste0(mashedNames,
                                             "_",
                                             region,
                                             j,
                                             "_spectratype.png"))
                ggsave(saveName,plot = g, width = V_WIDTH, height = V_HEIGHT)
                .saveAs(.save, saveName, g)
            } else {
                warning("Could not find ",
                        region, j, " spectratype files in ",
                        paste(diversityDirectories, collapse = ", "))
            }
        })
    })

    # special case, no outliers plot for CDR3 only
    specFiles <-
        .listFilesInOrder(path = diversityDirectories,
                          pattern = ".*_cdr3_spectratype_no_outliers\\.csv(\\.gz)?$")

    if (length(specFiles) > 0) {
        g <-
            .plotSpectratype(lapply(specFiles, read.csv, stringsAsFactors = FALSE),
                             sampleNames,
                             "CDR3")
        saveName <-
            file.path(specOut,
                      paste0(mashedNames, "_cdr3_spectratype_no_outliers.png"))
        ggsave(saveName, plot = g, width = V_WIDTH, height = V_HEIGHT)
        .saveAs(.save, saveName, g)
    } else {
        warning("Could not find CDR3 spectratype (no outlier) files in ",
                paste(diversityDirectories, collapse = ", "))
    }

    # entire V-domain
    specFiles <- .listFilesInOrder(path = diversityDirectories,
                                   pattern = ".*_v_spectratype\\.csv(\\.gz)?$")
    if (length(specFiles) > 0) {
        g <-
            .plotSpectratype(
                lapply(specFiles, read.csv, stringsAsFactors = FALSE),
                sampleNames,
                "V domain")
        saveName <-
            file.path(specOut, paste0(mashedNames, "_v_spectratype.png"))
        ggsave(saveName, plot = g, width = V_WIDTH, height = V_HEIGHT)
        .saveAs(.save, saveName, g)
    } else {
        warning("Cound not find V spectratype files in ",
                paste(diversityDirectories, collapse = ", "))
    }
}


#' Calculates Lower Bound Estimates for unseen species and Common Diversity
#' Indices from clonotype tables
#'
#' @description Employ common techniques to calculate LBE for unseen species
#' and commonly used diversity indices
#'
#' @import tools
#' @include util.R
#' @include distributions.R
#'
#' @param diversityDirectories list type. List of directories to diversity dir
#' @param diversityOut string type. Output directory
#' @param sampleNames vector type. 1-1 with diversityDirectories sample names
#'
#' @return None
.calculateDiversityEstimates <- function(diversityDirectories,
                                         diversityOut,
                                         sampleNames) {
    cdr3ClonesFile <- .listFilesInOrder(path = diversityDirectories,
                                        pattern = ".*_cdr3_clonotypes_.*_over\\.csv(\\.gz)?$")

    if (length(cdr3ClonesFile) != length(sampleNames)) {
        warning(paste(sampleNames, collapse = ", "),
                " is missing CDR3 clonotype counts file, ",
                "skipping LBE and IND analysis.")
        return()
    }

    # dataframes is in vegan input format, the clonotypes are
    # now column headers instead of column values
    dataframes <- lapply(cdr3ClonesFile, function(fname) {
        df <- read.csv(fname, stringsAsFactors = FALSE)
        d.trans <- as.data.frame(t(df[, "Count"]))
        return(d.trans)
    })

    lb.fname <- "lower_bound_estimate.tsv"
    ind.fname <- "diversity_indices.tsv"

    estimateTypes <- list(.reportLBE, .calculateDInd)
    names(estimateTypes) <- c(lb.fname, ind.fname)
    outputFiles <- c(file.path(diversityOut, lb.fname),
                     file.path(diversityOut, ind.fname))

    lapply(seq_along(estimateTypes), function(i) {
        if (!file.exists(outputFiles[[i]])) {
            fileName <- names(estimateTypes)[[i]]
            fileNameSansExt <- tools::file_path_sans_ext(fileName)
            functor <- estimateTypes[[i]]

            files <-
                .listFilesInOrder(path = diversityDirectories,
                                  pattern = paste0(fileNameSansExt,
                                                   "\\.tsv(\\.gz)?$"))

            if (length(files) != length(sampleNames)) {
                # if even one of the tsv file doesn't exist
                # (which means we haven't generated it, or if it was deleted,
                # we re-generate them)
                message("Calculating ",
                        sub("_", " ", fileNameSansExt, fixed = TRUE),
                        " for ",
                        paste(sampleNames, collapse = ", "))
                df.ests <- lapply(dataframes, functor)
            } else {
                # the rare occasion when all individual samples have already been
                # analyzed and the TSVs are all available, we only need to reload
                # them rather than re-computing the values
                message("Loading precomputed ",
                        sub("_", " ", fileNameSansExt, fixed = TRUE),
                        " from ",
                        paste(sampleNames, collapse = ", "))
                df.ests <- lapply(files, read.table, header = TRUE)
            }
            stopifnot(length(diversityDirectories) == length(sampleNames) &&
                          length(df.ests) == length(sampleNames))

            dfs <- do.call("rbind", Map(cbind, df.ests, sample = sampleNames))
            write.table(dfs, file = outputFiles[[i]],
                        sep = "\t", quote = FALSE,
                        row.names = FALSE)

        } else {
            message("Found ", names(estimateTypes)[[i]], ", skipping ...")
        }
    })
}
Any scripts or data that you put into this service are public.
abseqR documentation built on Nov. 8, 2020, 8:28 p.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
abseqR
Reporting and data analysis functionalities for Rep-Seq datasets of antibody libraries

R/diversityAnalysis.R
In abseqR: Reporting and data analysis functionalities for Rep-Seq datasets of antibody libraries

Defines functions .calculateDiversityEstimates .generateAllSpectratypes .plotDiversityCurves .diversityAnalysis .calculateDInd .reportLBE .aminoAcidBar .aminoAcidPlot .regionAnalysis .plotRecapture .plotRarefaction .plotDuplication

Documented in .aminoAcidBar .aminoAcidPlot .calculateDInd .calculateDiversityEstimates .diversityAnalysis .generateAllSpectratypes .plotDiversityCurves .plotDuplication .plotRarefaction .plotRecapture .regionAnalysis .reportLBE

Try the abseqR package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

abseqR Reporting and data analysis functionalities for Rep-Seq datasets of antibody libraries

R/diversityAnalysis.R In abseqR: Reporting and data analysis functionalities for Rep-Seq datasets of antibody libraries

Defines functions .calculateDiversityEstimates .generateAllSpectratypes .plotDiversityCurves .diversityAnalysis .calculateDInd .reportLBE .aminoAcidBar .aminoAcidPlot .regionAnalysis .plotRecapture .plotRarefaction .plotDuplication

Documented in .aminoAcidBar .aminoAcidPlot .calculateDInd .calculateDiversityEstimates .diversityAnalysis .generateAllSpectratypes .plotDiversityCurves .plotDuplication .plotRarefaction .plotRecapture .regionAnalysis .reportLBE

Try the abseqR package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

abseqR
Reporting and data analysis functionalities for Rep-Seq datasets of antibody libraries

R/diversityAnalysis.R
In abseqR: Reporting and data analysis functionalities for Rep-Seq datasets of antibody libraries