Gx Pre-Processing with HT12ProcessoR

# save options
options.backup <- options()

# change line wrap to something more readable
options(width = 80)

# Set the global knitr options
knitr::opts_chunk$set(cache = F,
                      results = "hide",
                      echo = T,
                      include = T,
                      message = T,
                      warning = T,
                      fig.width = 9,
                      fig.height = 6)

# start time measurement
time.start <- Sys.time()

# load required packages
for (i in c(
  "HT12ProcessoR",
  "toolboxH",
  "lumi",
  "sva",
  "knitr",
  "here",
  "data.table",
  "evaluate",
  "plotly",
  "ggplot2",
  "pander"
)) {
  suppressPackageStartupMessages(library(i, character.only = TRUE))
}

# set some basic options
panderOptions('table.split.table', Inf)
options(stringsAsFactors=FALSE)

Preparations

This example shows a typical preprocessing of ILLUMINA HT12v4 data using the package HT12ProcessoR. This can be applied for a few samples up to thousands of samples. Details on the Illumina technology can be found here: http://support.illumina.com/content/dam/illumina-marketing/documents/products/datasheets/datasheet_gene_exp_analysis.pdf

This packages comes with an included example (a subset with 96 Chips from GEO accession: GSE65907). This data can be found in r paste0(path.package("HT12ProcessoR"), "/", "extdata"). All annotation files needed for the preprocessing are stored as an .RData object in the data/ folder of this package. For this vignette, please set a working directory (setwd("path/to/project"")), where all files created by this package will be stored.

Check dependencies

Firstly, package dependencies must be met. When installation of the package fails due to missing dependencies, please check whether you can get those dependencies via CRAN, e.g. install.packages() or Bioconductor. The package toolboxH contains convenience-functions by the package creator and can be downloaded here.

Prepare example files

# check the root of your project
# here::dr_here(show_reason = T) #  the here package makes the navigation in directories easier but works best from within R-Projects

# get the project root directory
# prepro.folder <- here() 
# setwd(/path/to/project)
prepro.folder <- getwd() # please choose a working directory

# create the object with the names of the input files ##
# IDs can be chosen freely, but using terms identifying the data files might be advisable 
my.fileset_id <- c('590-651', 'HSS106-HSS155')

# create the results-folder if necessary
dir.create(paste0(prepro.folder, "/tosend/"))

# also create an obj/ folder for intermediary results
dir.create(paste0(prepro.folder, "/obj/"))

# control probe data file
my.con_f <- c(system.file("extdata", "Run12_ControlProbeProfile_590-651.txt", package = "HT12ProcessoR"), # control probe measurements + ercc (spike-in -- class of control probes)
              system.file("extdata", "ControlProbeProfile_HSS106-HSS155.txt", package = "HT12ProcessoR")) #  second example (processing batch 1 & 2)

# gene probe file
my.nobkgd_f <- c(system.file("extdata", "Run12_nonorm_nobkgd_590-651.txt", package = "HT12ProcessoR"), # raw probe data of human gene expression 
                 system.file("extdata", "Einzelanalyse_nonorm_nobkgd_HSS106-HSS155.txt", package = "HT12ProcessoR"))

# sample file
my.sample_f <- c(system.file("extdata", "Run12_SamplesTable_590-651.txt", package = "HT12ProcessoR"), # sample annotation (each measured indiviuum)
                 system.file("extdata", "SamplesTable_HSS106-HSS155.txt", package = "HT12ProcessoR"))

# consolidate in input-file
input <- data.table(fileset_id = my.fileset_id, 
                    con_f = my.con_f,
                    nobkgd_f = my.nobkgd_f,
                    sample_f = my.sample_f)

# how should the input file be called?
input_filename <- paste0(prepro.folder, "/obj/01_file_overview.txt")

# save the input file for later use
write.table(x = input,
            file = input_filename,
            quote = F,
            col.names = T,
            row.names = F,
            sep = "\t")

Prepare parameter file

Next, customize project-specific variables in a parameter file. This file is used to provide most relevant parameters in a single place for the individual preprocessing functions. Parameters will also be explained later. An example is provided with this package

myparams <- read.delim(system.file("extdata", "input_parameter_010.txt", package = "HT12ProcessoR"), quote = "", sep = "\t")

Project-specific parameters that should be customized initially are:

# where is the input file we created located?
myparams[myparams$variable == "file_names_of_files_and_folders", "value"] <- input_filename

# where should the results be saved?
myparams[myparams$variable == "datafolder_results", "value"] <- paste0(prepro.folder, "/tosend/")

# where should the file be saved (overwrite the previously read-in file)
myparamfile <- paste0(prepro.folder, "/obj/input_parameter_010.txt")

# adjust for covariates T/F
myparams[myparams$variable == "ComBat_adjust_covariates", "value"] <- T

# what are the columns in chipsamples containing the covariate data called?
myparams[myparams$variable == "ComBat_covariates_to_adjust", "value"] <- paste("covar1", "covar2", "covar3", sep = ", ")

# save the file again with the above changes
write.table(myparams,
            myparamfile,
            quote = F,
            col.names = T,
            row.names = F,
            sep="\t")

Step 1 - Function checkExtractChipsamples: Check given expression files and extract individuals

This step does the following:

# execute function for the checks described above
prepro_ht12 <- checkExtractChipsamples(paramfile = myparamfile) # check the output of the function for details

# print object in HTML-report
pander(prepro_ht12$history)

# check the class of the object
class(prepro_ht12)

# another file preview
pander(ht(prepro_ht12$chipsamples, 1))

Add fake covariate data

# Add covariates to chipsamples
covar.id <- prepro_ht12$chipsamples$new_ID

# create some random covariates (just for proof of concept)
covars <- data.frame(id = covar.id,
           covar1 = sample(LETTERS[1:3], size = length(covar.id), replace = T),
           covar2 = rnorm(length(covar.id)),
           covar3 = factor(sample(c(0,1), size = length(covar.id), replace = T))
           )

# add covariates to ht12object
for(i in names(covars)[-1]){
  matched <- match(prepro_ht12$chipsamples$new_ID, covars$id)
  prepro_ht12$chipsamples[[i]] <- covars[matched, i]

  # set some NA
  prepro_ht12$chipsamples[[i]][sample(1:nrow(prepro_ht12$chipsamples), size = sample(1:5, 1))] <- NA
}

# what are the covariates calles?
covar.cols <- names(covars)[-1]

Step 2 - Designate individuals and subgroups for preprocessing and redefine batches

Exclude samples:

Designate subgroups:

Alternative treatment of subgoups:

# preview first and last lines
pander(ht(prepro_ht12$chipsamples, 2))

# define samples to be excluded, e.g. when different projects are processed on the same chip
exclude.this <- c("614", "638", "HSS 108", "HSS 112", "HSS 115", "HSS 125", "HSS 127", "HSS 133", "HSS 137", "HSS 138", "HSS 146", "HSS 151","HSS 155")

# Exclude Samples by modifying the $chipsamples element
prepro_ht12$chipsamples <- prepro_ht12$chipsamples[!prepro_ht12$chipsamples$old_ID  %in% exclude.this, ]

# Define Subgroups -  in this example, sampels are from Peripheral Blood Mononuclear Cells (PBMC)
prepro_ht12$chipsamples$subgroup <- "PBMC"
table(prepro_ht12$chipsamples$subgroup)

# show which IDs were processed in which batch
xtabs(~prepro_ht12$chipsamples$Sentrix.Barcode + prepro_ht12$chipsamples$processingbatch)

# do we have strange batches on top of regular batch structure
xtabs(~prepro_ht12$chipsamples$strangebatch + prepro_ht12$chipsamples$processingbatch)

## Quick Check for major differences based on Attributes from the provided ILLUMINA-sample-files
initial_pca <- calcInitialPCA(prepro_ht12)

# plot the results
pca.plot <- ggplot(initial_pca$scores, aes(PC1,
                                           PC2,
                                           pch = subgroup,
                                           col = processingbatch,
                                           size = Detected.Genes..0.01.,
                                           alpha = in_study,
                                           label = paste(old_ID, new_ID, sep = "\n"))) +
  geom_point() +
  scale_alpha_manual(values = c(0.7, 0.3))

# plot a static version
pca.plot

# interactive visualisation - check homogeinity etc.
pca.plot %>% ggplotly()

Step 3 - Function createExpressionSet(): Create an expression set and related information from data

In this step, the expression information that is scattered across many files is united in a single expression set object. This includes control probe information.

Among others, following slots will be created within the updated HT12prepro-object among others:

Additionally, integrity checks are carried out:

# create the ExpressionSet 
prepro_ht12 <- createExpressionSet(prepro_ht12, paramfile = myparamfile) # watch the function output

# explore file output
# show the command history
kable(prepro_ht12$history)

# preview expressionset including samples and controls
prepro_ht12$total_nobkgd_eset

# detection levels per probe
hh(lumi::detection(prepro_ht12$total_nobkgd_eset))

# expression lvl per probe
hh(lumi::exprs(prepro_ht12$total_nobkgd_eset))

Step 4 - Function filterLowExpressed: Filter badly expressed individuals & mark badly expressed genes

This step includes:

Changes to the HT12prepro-object, among others, are:

# execute step 4 by running this function
prepro_ht12 <- filterLowExpressed(prepro_ht12, paramfile = myparamfile)

# show command history
kable(prepro_ht12$history)

# preview function output
head(prepro_ht12$genesdetail)

# show excluded samples
setDT(prepro_ht12$chipsamples)
prepro_ht12$chipsamples[,.N,.(in_study, reason4exclusion)]

# and reset class of this object
setDF(prepro_ht12$chipsamples)

Step 5 - Functions transformNormalizeHT12object() and filterTechnicallyFailed() Transform & normalize data & filter samples for atypical control values

# Transform and normalize the data with this function
prepro_ht12 <- transformNormalizeHT12object(prepro_ht12, paramfile = myparamfile)

# execute the filter step described above with this function
prepro_ht12 <- filterTechnicallyFailed(prepro_ht12, paramfile = myparamfile)

# show command execution history
kable(prepro_ht12$history)

Step 6 - Functions filter4MinBatchsize(), transformNormalizeHT12object(), and removeBatchEffects(): Transform and normalize still valid samples and adjust for batch-effects

# filter for batch-size
prepro_ht12 <- filter4MinBatchsize(prepro_ht12, paramfile = myparamfile)

# repeat transformation and normalization
# This is only necessary if samples were removed in previous step
prepro_ht12 <- transformNormalizeHT12object(prepro_ht12, paramfile = myparamfile)

# execute batch- adjustment via sva::ComBat()
prepro_ht12 <- removeBatchEffects(ht12object = prepro_ht12, paramfile = myparamfile)

# review command history
kable(prepro_ht12$history)

Step 7 - Function checkBatchEffects(): Check for remaining batch effects

# check for remaining batch-effects
prepro_ht12 <- checkBatchEffects(prepro_ht12, paramfile = myparamfile)

# preview the newly created object
ht(prepro_ht12$genesdetail, 2)

# review command history
kable(prepro_ht12$history)

Step 8 - Function filterAtypicalExpressed(): Identify samples with atypical expression values

# filter for atypical expression levels
prepro_ht12 <- filterAtypicalExpressed(prepro_ht12, paramfile = myparamfile)

# preview newly created output
ht(prepro_ht12$chipsamples, 2)

# review command history
kable(prepro_ht12$history)

Step 9 - Function visualizePreprocessing() and comparePCBeforeAfterPrePro(): visualization of Quality Control measures

# create QC-plots
prepro_ht12 <- visualizePreprocessing(prepro_ht12, paramfile = myparamfile)

# PCA before/after preprocessing
prepro_ht12 <- comparePCBeforeAfterPrePro(prepro_ht12, paramfile = myparamfile)

# review command history
kable(prepro_ht12$history)

Step 10 - Function writeFilesTosend(): Write preprocessed data

The directory of the resulting preprocessed data was specified in step Preparations

Parameter file_renaming_samples_tosend can specify a filename where a column named oldname and newname is provided in order to rename sample. Column oldname must match the names of the samples in column new_ID in the $chipsamples data.frame included in the HT12 object. Can be empty if no renaming is required

## Set parameter to use provided renaming file
renaming_fn <- system.file("extdata", "renamingsamples.txt", package = "HT12ProcessoR")

# edit parameter file with location of renaming-file
myparams[ myparams$variable == "file_renaming_samples_tosend", "value"] <- renaming_fn

# write the parameter file
write.table(myparams,
            myparamfile,
            quote = F,
            col.names = T,
            row.names = F,
            sep="\t")

# write result files into the tosend/ folder
prepro_ht12 = writeFilesTosend(prepro_ht12, paramfile = myparamfile)

# preview first and last line of $chipsamples object 
ht(prepro_ht12$chipsamples, 1)

# review command history
kable(prepro_ht12$history)

It might be helpful, to save the HT12-object to preserve detailed information on preprocessing

# save an .RData object containing the final prepro_ht12 object
save(prepro_ht12, file = paste0(prepro.folder, "/obj/prepro_example1_HT12object.RData"))

Step 11 - Function createTextForMethods(): Information for Material and Methods section of a manuscript

This function creates an extended and a short version of the preprocessing done for section Material and Methods of a publication.

Please slightly rephrase text as most journals require unique phrasing even in materials & methods section

# create text-object
mm_text <- createTextForMethods(prepro_ht12, paramfile = myparamfile)

# preview created text as a long version
cat('### Extended Version')
cat(mm_text$extended_version)

# preview created text as a short version
cat('### Short Version')
cat(mm_text$short_version)

Session Info

# output script run time
total.time <- round(as.numeric(Sys.time() - time.start), 2)
message("This script ran for a total of ", total.time, " minutes.")

# output session info
sessionInfo()


holgerman/HT12ProcessoR documentation built on June 5, 2021, 9:18 a.m.