R/igc_refraction.r

Defines functions igc_refraction

Documented in igc_refraction

#' IGC Input and Rarefaction Curves/Basic Comparative
#'
#' @param mre adsf
#'
#' @return NULL asdf
#' @export
#'
#' @import dplyr
#' @import ggplot2
#' @import stringr
#' @import purrr
#' @import tidyr
#' @importFrom grDevices boxplot.stats cairo_pdf
#' @importFrom stats lm na.omit quantile reorder setNames
#' @importFrom ggforce facet_row
#'
#' @examples
igc_refraction <- function(mre) {

  dataTable <- mre$dataTable
  metadata_df <- mre$metadata_df
  categorical_vals <- mre$categorical_vals
  numeric_vals <- mre$numeric_vals
  longitudinal_vals <- mre$longitudinal_vals

  # Logg info
  IGCperc <- 0.02
  message("Starting IGC refraction")
  message("IGC percentage to use as a threshold: ", IGCperc)

  ## ===========================================================
  ## CATEGORIC PLOTS

  ## PREPORCESSING DATA
  prepro_1 <- dataTable %>%
    as_tibble() %>%
    select(SampleID, InitialReads, NumberMappedReads) %>%
    distinct_all() %>%
    arrange(InitialReads) %>%
    mutate(SampleID = factor(SampleID, levels = SampleID),
           InitialReads = InitialReads - NumberMappedReads) %>%
    gather("Step", "value", -SampleID) %>%
    inner_join(metadata_df, by = "SampleID")

  prepro_2 <- dataTable %>%
    as_tibble() %>%
    select(SampleID, InitialReads, NumberMappedReads) %>%
    distinct_all() %>%
    mutate(SampleID = factor(SampleID, levels = SampleID),
           NumberMappedReads = NumberMappedReads / InitialReads * 100,
           InitialReads = 100 - NumberMappedReads) %>%
    arrange(NumberMappedReads) %>%
    mutate(idx = 1:nrow(.)) %>%
    gather("Step", "value", -SampleID, -idx) %>%
    inner_join(metadata_df, by = "SampleID")

  ## STACKED BARPLOTS
  plot_list <- list()
  plot_list[['StackedBarplot1']] <- prepro_1 %>%
    select(SampleID, Step, value) %>%
    drop_na() %>%
    ggplot(aes(x = SampleID, y = value, fill = Step)) +
    geom_bar(stat = "Identity", size = 0.5, alpha = 0.75) +
    labs(x = "", y = "Million PE reads", title = "IGC Mapped Reads") +
    theme_minimal() +
    theme(axis.title.x = element_blank(),
          axis.text.x = element_text(angle = 90, hjust = 1),
          axis.text=element_text(size = 6)) +
    ggsave(filename = str_c("CatalogMapping/IGC/IGCMapping_StackedBarplot1.pdf"),
           device = cairo_pdf, width = 16, height = 10, dpi = 600)

  plot_list[['StackedBarplot1_by_vars']] <- categorical_vals %>%
    pull(CategoricalVariable) %>%
    purrr::set_names() %>%
    purrr::map(function(cat_var) {
      prepro_1 %>%
        select(SampleID, Step, value, cat_var) %>%
        na.omit() %>%
        ggplot(aes(x = SampleID, y = value, fill = Step)) +
        geom_bar(stat="Identity", size = 0.5, alpha = 0.75) +
        facet_row(ensym(cat_var), scales = 'free_x', space = 'free') +
        labs(x = "", y = "Million PE reads", title = "IGC Mapped Reads") +
        theme_minimal() +
        theme(axis.title.x = element_blank(),
              axis.text.x = element_text(angle = 90, hjust = 1),
              axis.text=element_text(size = 6)) +
        ggsave(filename = str_c("CatalogMapping/IGC/IGCMapping_StackedBarplot2_by_",cat_var,".pdf"),
               device = cairo_pdf, width = 16, height = 10, dpi = 600)
      })

  plot_list[['StackedBarplot2']] <- prepro_2 %>%
    select(SampleID, Step, value, idx) %>%
    drop_na() %>%
    ggplot(aes(x = reorder(SampleID, idx), y = value, fill = Step)) +
    geom_bar(stat = "Identity", size = 0.5, alpha = 0.75) +
    labs(x = "", y = "PE reads (%)", title = "% of IGC Mapped Reads") +
    theme_minimal() +
    theme(axis.title.x = element_blank(),
          axis.text.x = element_text(angle = 90, hjust = 1),
          axis.text=element_text(size = 6)) +
    ggsave(filename = str_c("CatalogMapping/IGC/IGCMapping_StackedBarplot3.pdf"),
           device = cairo_pdf, width = 16, height = 10, dpi = 600)

  plot_list[['StackedBarplot2_by_vars']] <- categorical_vals %>%
    pull(CategoricalVariable) %>%
    purrr::set_names() %>%
    purrr::map(function(cat_var) {
      prepro_2 %>%
        select(SampleID, Step, value, cat_var, idx) %>%
        na.omit() %>%
        ggplot(aes(x = reorder(SampleID, idx), y = value, fill = Step)) +
        geom_bar(stat="Identity", size = 0.5, alpha = 0.75) +
        facet_row(ensym(cat_var), scales = 'free_x', space = 'free') +
        labs(x = "", y = "PE reads (%)", title = "% of IGC Mapped Reads") +
        theme_minimal() +
        theme(axis.title.x = element_blank(),
              axis.text.x = element_text(angle = 90, hjust = 1),
              axis.text=element_text(size = 6)) +
        ggsave(filename = str_c("CatalogMapping/IGC/IGCMapping_StackedBarplot4_by_",cat_var,".pdf"),
               device = cairo_pdf, width = 16, height = 10, dpi = 600)
    })

  ## REFRACTION PLOTS
  plot_list[['Refraction']] <- dataTable %>%
    ggplot(aes(ReadCountReal, GeneNumber, colour = SampleID)) +
    geom_line(size = 0.5) +
    geom_point(size = 1) +
    ggsave(filename = str_c("CatalogMapping/IGC/IGCMapping_RarefactionPlot.pdf"),
           device = cairo_pdf, width = 16, height = 10, dpi = 600)

  plot_list[['Refraction_by_vars']] <- categorical_vals %>%
    pull(CategoricalVariable) %>%
    purrr::set_names() %>%
    purrr::map(function(cat_var) {
      dataTable %>%
        as_tibble() %>%
        select(SampleID, ReadCountReal, GeneNumber) %>%
        inner_join(metadata_df %>% select(SampleID, Arm), by = 'SampleID') %>%
        setNames(c("SampleID","ReadCountReal","GeneNumber","Category")) %>%
        ggplot(aes(ReadCountReal,GeneNumber, color = Category)) +
        labs(title = str_c("IGC Rarefaction Plots by", cat_var)) +
        geom_line(size = 0.5,aes(color = Category,group = SampleID)) +
        geom_point(size = 1) +
        ggsave(filename = str_c("CatalogMapping/IGC/IGCMapping_RarefactionPlotBy_",cat_var,".pdf"),
               device = cairo_pdf, width = 16, height = 10, dpi = 600)
    })

  plot_list[['Refraction_boxplot']] <- c("InitialReads","NumberMappedReads","ReadCountReal","GeneNumber") %>%
    purrr::set_names() %>%
    map(function(col) {
    categorical_vals %>%
      pull(CategoricalVariable) %>%
      purrr::set_names() %>%
      purrr::map(function(cat_var) {
        dataTable %>%
          select(SampleID, !!col) %>%
          inner_join(metadata_df %>% select(SampleID, !!cat_var), by = 'SampleID') %>%
          select(!!cat_var, !!col) %>%
          boxplot_single_var(., outdir = "CatalogMapping/IGC", palette_names = 'Set1',
                             filename = str_c("IGCMapping_Boxplot_",col,"_By_",cat_var,".pdf"))
      })
  })

  plot_list[['Refraction_boxplot_thres']] <- categorical_vals %>%
    pull(CategoricalVariable) %>%
    purrr::set_names() %>%
    purrr::map(function(cat_var) {
      thres <- quantile(dataTable$NumberMappedReads, probs = 0.05)
      dataTable %>%
        as_tibble() %>%
        dplyr::filter(ReadCountReal >= thres) %>%
        dplyr::filter(!duplicated(SampleID)) %>%
        inner_join(metadata_df %>% select(SampleID, !!cat_var), by = 'SampleID') %>%
        select(!!cat_var, "GeneNumber") %>%
        boxplot_single_var(., outdir = "CatalogMapping/IGC", palette_names = 'Set1',
                           title = str_c("Gene Count by ", cat_var, ' at ',thres, ' depth'),
                           filename = str_c("IGCGeneNumber_Boxplot_By_", cat_var, ".pdf"))
    })

  ## ===========================================================
  ## LONGITUDINAL PLOTS

  if(!is.null(longitudinal_vals)) {
    path <- "CatalogMapping/IGC/Longitudinal/"
    dir.create(path, showWarnings = F, recursive = T)

    plot_list[['Loitudinal_plots_1']] <- longitudinal_vals %>%
      pull(LongitudinalVariable) %>%
      purrr::set_names() %>%
      purrr::map(function(long_var) {
        thres <- quantile(dataTable$NumberMappedReads, probs = 0.05)
        link_var <- longitudinal_vals %>%
          dplyr::filter(LongitudinalVariable == long_var) %>%
          pull(LinkVariable)

        pp_1 <- dataTable %>%
          as_tibble() %>%
          dplyr::filter(ReadCountReal >= thres) %>%
          dplyr::filter(!duplicated(SampleID)) %>%
          inner_join(metadata_df, by = 'SampleID') %>%
          select(SampleID, !!link_var, !!long_var, GeneNumber) %>%
          setNames(c("SampleID","LinkVariable","TimeVariable","GeneRichness")) %>%
          mutate(TimeVariable = as.numeric(TimeVariable)) %>%
          ggplot(aes(TimeVariable, GeneRichness, group = LinkVariable, colour = LinkVariable)) +
          geom_line() +
          geom_point(size = 2) +
          geom_smooth(stat = "smooth", aes(group = NULL), method = 'loess') +
          labs(title = str_c("Gene Richness (IGC, ",thres," reads) vs time")) +
          ggsave(str_c(path, "/Longitudinal_IGC_",link_var,".pdf"),
                 device = cairo_pdf, width = 16, height = 10, dpi = 600)

        pp_2 <- categorical_vals %>%
          pull(CategoricalVariable) %>%
          purrr::set_names() %>%
          purrr::map(function(cat_var) {
            dataTable %>%
              as_tibble() %>%
              dplyr::filter(ReadCountReal >= thres) %>%
              dplyr::filter(!duplicated(SampleID)) %>%
              inner_join(metadata_df, by = 'SampleID') %>%
              select(SampleID, !!link_var, !!long_var, !!cat_var, GeneNumber) %>%
              setNames(c("SampleID","LinkVariable","TimeVariable","GroupVariable","GeneRichness")) %>%
              ggplot(aes(TimeVariable, GeneRichness, group = LinkVariable)) +
              geom_line(aes(color = GroupVariable)) +
              geom_point(size = 2) +
              geom_smooth(aes(group = GroupVariable, color = GroupVariable), method = 'loess') +
              labs(title = str_c("Gene Richness (IGC,",thres," reads) vs time for by ",cat_var)) +
              ggsave(str_c(path, "/Longitudinal_IGC_by_",cat_var, "_and_", link_var,".pdf"),
                     device = cairo_pdf, width = 16, height = 10, dpi = 600)
          })

        return(list(pp_1, pp_2))
      })


    plot_list[['Loitudinal_plots_2']] <- categorical_vals %>%
      pull(CategoricalVariable) %>%
      purrr::set_names() %>%
      purrr::map(function(cat_var) {
        metadata_df %>%
          pull(!!cat_var) %>%
          levels(.) %>%
          purrr::set_names() %>%
          purrr::map(function(lev) {
            longitudinal_vals %>%
              pull(LongitudinalVariable) %>%
              purrr::set_names() %>%
              purrr::map(function(long_var) {

                link_var <- longitudinal_vals %>%
                  dplyr::filter(LongitudinalVariable == long_var) %>%
                  pull(LinkVariable)
                thres <- quantile(dataTable$NumberMappedReads, probs = 0.05)
                pre <- dataTable %>%
                  as_tibble() %>%
                  dplyr::filter(ReadCountReal >= thres) %>%
                  dplyr::filter(!duplicated(SampleID)) %>%
                  inner_join(metadata_df, by = 'SampleID') %>%
                  dplyr::filter(!!sym(cat_var) == lev)

                if (nrow(pre) < 5) { return(NULL) }

                pp_1 <- pre %>%
                  select(SampleID, !!link_var, !!long_var, GeneNumber) %>%
                  setNames(c("SampleID","LinkVariable","TimeVariable","GeneRichness")) %>%
                  ggplot(aes(TimeVariable, GeneRichness, group = LinkVariable)) +
                  geom_line() +
                  geom_point(size = 2) +
                  geom_smooth(stat = "smooth", aes(group = NULL)) +
                  labs(title = str_c("Gene Richness (IGC,",thres," reads) vs time")) +
                  ggsave(str_c("CatalogMapping/IGC/Longitudinal/Longitudinal_IGC_",cat_var,"-",lev,"_",long_var,".pdf"),
                         device = cairo_pdf, width = 16, height = 10, dpi = 600)

                if (nrow(categorical_vals) >= 2) {
                  pp_2 <- categorical_vals %>%
                    pull(CategoricalVariable) %>%
                    purrr::set_names() %>%
                    purrr::map(function(cat_var2) {

                      if (cat_var == cat_var2) { return(NULL) }

                      pre_plot <- pre %>%
                        select(SampleID, !!link_var, !!long_var, !!cat_var2, GeneNumber) %>%
                        setNames(c("SampleID","LinkVariable","TimeVariable","GroupVariable","GeneRichness")) %>%
                        ggplot(aes(TimeVariable, GeneRichness, group = LinkVariable)) +
                        geom_line(aes(color = GroupVariable)) +
                        geom_point(size = 2) +
                        labs(title = str_c("Gene Richness (IGC,",thres," reads) vs time for by ", cat_var2)) +
                        ggsave(str_c("CatalogMapping/IGC/Longitudinal/Longitudinal_IGC_",cat_var,"-",
                                     lev,"_by_", cat_var2, "_and_", long_var,".pdf"),
                               device = cairo_pdf, width = 16, height = 10, dpi = 600)
                    })
                }
                if (nrow(categorical_vals) >= 2) {
                  return(list(pp_1, pp_2))
                } else {
                  return(pp_1)
                }
              })
          })
      })
  }

  ## ===========================================================
  ## NUMERICAL PLOTS

  if(!is.null(numeric_vals)) {
    path <- "CatalogMapping/IGC/Correlations"
    dir.create(path, recursive = T, showWarnings = F)

    plot_list[['Correlation_plots_1']] <- numeric_vals %>%
      pull(NumericalVariable) %>%
      purrr::set_names() %>%
      purrr::map(function(num_var) {

         thres <- quantile(dataTable$NumberMappedReads, probs = 0.05)

         prepro <- dataTable %>%
           as_tibble() %>%
           dplyr::filter(ReadCountReal >= thres) %>%
           dplyr::filter(!duplicated(SampleID)) %>%
           inner_join(metadata_df, by = "SampleID") %>%
           select(!!num_var, "GeneNumber") %>%
           setNames(c("Response","GeneNumber"))

          fit1 <- lm(Response ~ GeneNumber, data = prepro)

        ggplotRegression(fit1) +
          ggsave(str_c(path, "/IGC_vs_",num_var,"_corr.pdf"),
                 device = cairo_pdf, width = 16, height = 10, dpi = 600)
      })
  }

  plot_list[['Correlation_plots_2']] <- metadata_df %>%
    select(where(is.numeric)) %>%
    colnames() %>%
    purrr::set_names() %>%
    purrr::map(function(num_col) {

      thres <- quantile(dataTable$NumberMappedReads, probs = 0.05)

      prepro <- dataTable %>%
        as_tibble() %>%
        dplyr::filter(ReadCountReal >= thres) %>%
        dplyr::filter(!duplicated(SampleID)) %>%
        inner_join(metadata_df, by = "SampleID") %>%
        select(!!num_col, "GeneNumber") %>%
        setNames(c("Response","GeneNumber"))

      fit1 <- lm(Response ~ GeneNumber, data = prepro)

      ggplotRegression(fit1) +
        ggsave(str_c(path, "/IGC_vs_",num_col,"_corr.pdf"),
               device = cairo_pdf, width = 16, height = 10, dpi = 600)
    })

  ## ===========================================================
  ## SAVE DATA AND WORKSPACE

  dataTable %>%
    as_tibble() %>%
    dplyr::filter(ReadCountReal >= quantile(dataTable$NumberMappedReads, probs = 0.05)) %>%
    readr::write_csv(file = str_c("CatalogMapping/IGC_at_",
                                  quantile(dataTable$NumberMappedReads, probs = 0.05),".tsv"))
  save.image(file = "CatalogMapping/IGC/IGC_RAnalysis.RData", compress = 'xz')

}
xec-cm/metar documentation built on Oct. 13, 2020, 8:40 p.m.