R/metrics.R

Defines functions getHvgs grouped_boxplot pca_m3c tsne_batch capitalize simpleCap kbet

# kBET function
# Input: Takes in gene conts data table, metadata data table, a foi, output_directory, and output name
# Ouptut: returns a dataframe containing data regarding the kBET test

kbet <- function(gene_counts, annot, output_dir, output_name)
{
  # get the factor of interest (not currently used)
  #foi = annot[factor_of_interest]

  # additionall quantitative metrics
  #avedistVal = avedist(new_data, as.factor(batch) )#, as.factor(diagnosis))
  #pvcamVal = pvcam(new_data, batch, foi)
  #skewdivVal = skewdiv(new_data, as.factor(batch))
  #kldistVal = kldist(new_data, as.factor(batch))
  #sepscoreVal = sepscore(new_data, as.factor(batch) )
  # diffexprmVal =
  # cobraVal =

  data = gene_counts

  batch = annot$batch
  # Capitalize Function

  capitalize <- function(x) {
    substr(x, 1, 1) <- toupper(substr(x, 1, 1))
    x
  }

  # Generate the Plot title
  # plotTitle = paste(capitalize(unlist(strsplit(name, "_"))), collapse= " ")
  plotTitle = paste(output_name, " kBET Plot", sep="")

  file_path <- paste( file.path(output_dir, output_name), "_kbet_plot.png", sep="")
  png(file=file_path)
  batch.estimate <- kBET::kBET(data, batch, plot=FALSE)
  plot.data <- data.frame(class=rep(c('observed', 'expected'),
                                   each=length(batch.estimate$stats$kBET.observed)),
                          data =  c(batch.estimate$stats$kBET.observed,
                                    batch.estimate$stats$kBET.expected))

  print(ggplot2::ggplot(plot.data, ggplot2::aes(class, data)) + ggplot2::geom_boxplot() +
          ggplot2::labs(x='Test', y='Rejection rate',title=paste(plotTitle,'kBET test results',sep=' ')) +
          ggplot2::theme_bw() +
          ggplot2::scale_y_continuous(limits=c(0,1)))

  dev.off()

  results <- batch.estimate$summary

  # add extra quantitative measures to results
  #results['avedist'] = avedistVal
  #results['pvcam'] = pvcamVal
  #results['skewdiv'] = skewdivVal
  #results['kldist'] = kldistVal

  return( list("path" = file_path, "results" = results))

}

simpleCap <- function(x) {
  s <- strsplit(x, " ")[[1]]
  paste(toupper(substring(s, 1,1)), substring(s, 2),
        sep="", collapse=" ")
}

# Capitalize Function
capitalize <- function(x) {
  substr(x, 1, 1) <- toupper(substr(x, 1, 1))
  x
}

tsne_batch <- function(gene_counts, annot, output_dir, output_name)
{
  # output PCA
  file_path <- paste( file.path(output_dir, output_name), "_tsne_plot.png", sep="")
  png(file=file_path)

  #name = paste(sapply(paste(unlist(strsplit(output_name, " ")), sep=" "), simpleCap), collapse=" ")
  #plotTitle = paste(capitalize(unlist(strsplit(name, "_"))), collapse= " ")
  plotTitle <- paste( output_name, "T-SNE", sep=" ")
  invisible(capture.output(
    g <- M3C::tsne(t(gene_counts), labels=as.factor(annot$batch), legendtitle ="Batch", dotsize = 2) +
      ggplot2::ggtitle(plotTitle) +
      ggplot2::theme(plot.title = ggplot2::element_text(size=20, hjust = .5))
  ))
  #print(typeof(g))
  print(g)
  dev.off()

  return (list("path" = file_path, "plot" = g))
}

pca_m3c <- function(gene_counts, annot, output_dir, output_name, foi=NULL)
{

  file_name <- paste(file.path(output_dir, output_name), '.png', sep="")
  #png(file_name)

  # output PCA
  file_path <- paste( file.path(output_dir, output_name), "_pca_picture.png", sep="")
  png(file=file_path)

  name = paste(sapply(paste(unlist(strsplit(output_name, "[_]")), sep=" "), simpleCap), collapse=" ")
  plotTitle = paste(capitalize(unlist(strsplit(name, "_"))), collapse= " ")
  plotTitle <- paste( plotTitle, " PCA", sep="")

  g <- M3C::pca(gene_counts, labels=as.factor(annot$batch), legendtitle ="Batch", dotsize = 2) +
    ggplot2::ggtitle(plotTitle) +
    ggplot2::theme(plot.title = ggplot2::element_text(size=20, hjust = .5)) +
    ggplot2::scale_size_continuous(range = c(1, 2))
  print(g)
  dev.off()

  return (list("path" = file_path, "plot" = g))


}

grouped_boxplot <- function(gene_counts, annot, output_dir, dataset_name)
{
  # get list of samples for all batchs
  # for each batch variable
  # get list of samples for this batch
  # pull out their gene expression values
  # get the means and put it in a list
  # add it to a list
  # add batch x to names

  boxplot_data = NULL


  for (x in unique(annot$batch))
  {
    batchX = annot[annot$batch == x,]
    #print(batchX)

    rownames(batchX)

    gene_counts_subset <- t(gene_counts[rownames(batchX), ])

    #print(dim(gene_counts_subset))

    means = rowMeans(gene_counts_subset)

    mean_values <- as.vector(data.frame(means = rowMeans(gene_counts_subset))$means)

    batch_name = paste(c("batch_", x), collapse="", sep="")

    if(is.null(boxplot_data))
    {
      #print('first time')
      boxplot_data = data.frame("mean" = mean_values, "batch" = rep(x, each=length(mean_values)))
    }
    else
    {
      #print('not first time')
      new_data = data.frame("mean" = mean_values, "batch" = rep(x, each=length(mean_values)))
      #print(dim(new_data))

      boxplot_data = rbind(boxplot_data, new_data)
    }
    #print(dim(boxplot_data))

  }


  #print(boxplot)data
  file_name <- paste(dataset_name, '_comparative_boxplot.png', sep ="")
  output_file_name <- file.path(output_dir, file_name)
  png(output_file_name)


  g <- ggplot2::ggplot(boxplot_data, ggplot2::aes(x = factor(batch), y = mean, fill=factor(batch))) +
    ggplot2::geom_boxplot() +
    ggplot2::labs(title = "Comparative Boxplot", x = "Batch", y = "Mean Gene Expression", fill = "Batch") +
    ggplot2::theme(plot.title   = ggplot2::element_text(size=19, hjust = .5),
          axis.title.y = ggplot2::element_text(size = 15),
          axis.title.x = ggplot2::element_text(size = 15),
          legend.title = ggplot2::element_text(size=14),
          axis.text.x =  ggplot2::element_text(size=16),
          axis.text.y =  ggplot2::element_text(size=16))


  print(g)

  dev.off()

  return (list( "path" = output_file_name, "data" = boxplot_data))
}


getHvgs <- function(gene_counts, annot)
{
  hvgs <- NULL

  # let's iterate through all the batches
  for (batch_i in unique(annot$batch))
  {
    batch_counts <- t(gene_counts[rownames(annot[annot$batch == batch_i,]),]);
    batch_counts <- as.data.frame(cbind(batch_counts, 'var'=matrixStats::rowVars(batch_counts)))
    top_percent <- .10
    n_genes <- dim(batch_counts)[1] * top_percent
    top_varying <- rownames(head(batch_counts[order(batch_counts$var,decreasing=T),],n_genes))

    if(is.null(hvgs))
    {
      hvgs <- top_varying
    }
    else
    {
      hvgs <- Reduce(intersect, list(hvgs, top_varying))
    }

  }
  return (hvgs)
}
thekuhninator/beat documentation built on Oct. 11, 2021, 10:54 a.m.