# save options options.backup <- options() # change line wrap to something more readable options(width = 80) # Set the global knitr options knitr::opts_chunk$set(cache = F, results = "hide", echo = T, include = T, message = T, warning = T, fig.width = 9, fig.height = 6)
# start time measurement time.start <- Sys.time() # load required packages for (i in c( "HT12ProcessoR", "toolboxH", "lumi", "sva", "knitr", "here", "data.table", "evaluate", "plotly", "ggplot2", "pander" )) { suppressPackageStartupMessages(library(i, character.only = TRUE)) } # set some basic options panderOptions('table.split.table', Inf) options(stringsAsFactors=FALSE)
This example shows a typical preprocessing of ILLUMINA HT12v4 data using the package HT12ProcessoR. This can be applied for a few samples up to thousands of samples. Details on the Illumina technology can be found here: http://support.illumina.com/content/dam/illumina-marketing/documents/products/datasheets/datasheet_gene_exp_analysis.pdf
This packages comes with an included example (a subset with 96 Chips from GEO accession: GSE65907). This data can be found in r paste0(path.package("HT12ProcessoR"), "/", "extdata")
. All annotation files needed for the preprocessing are stored as an .RData object in the data/ folder of this package. For this vignette, please set a working directory (setwd("path/to/project"")
), where all files created by this package will be stored.
Firstly, package dependencies must be met. When installation of the package fails due to missing dependencies, please check whether you can get those dependencies via CRAN, e.g. install.packages()
or Bioconductor. The package toolboxH
contains convenience-functions by the package creator and can be downloaded here.
Start with Data from Illuminas GenomeStudio. Data should be non-normalized, non-background-substracted. Illumina organises expression data in filesets consisting of three files, a sample file with gene probe data, a control files with control probe data and a sample file with individidual-related information
A tab-delimeted file should be created specifying location of these three files. Following columns are required:
fileset_id
: a short ID for a filesetcon_f
: full path and filename of the control probe centric data filenobkgd_f
: full path and filename of the gene probe centric data filesample_f
: full path and filename of the sample centric data fileThe file should be saved in the folder intended for preprocessing the data
# check the root of your project # here::dr_here(show_reason = T) # the here package makes the navigation in directories easier but works best from within R-Projects # get the project root directory # prepro.folder <- here() # setwd(/path/to/project) prepro.folder <- getwd() # please choose a working directory # create the object with the names of the input files ## # IDs can be chosen freely, but using terms identifying the data files might be advisable my.fileset_id <- c('590-651', 'HSS106-HSS155') # create the results-folder if necessary dir.create(paste0(prepro.folder, "/tosend/")) # also create an obj/ folder for intermediary results dir.create(paste0(prepro.folder, "/obj/")) # control probe data file my.con_f <- c(system.file("extdata", "Run12_ControlProbeProfile_590-651.txt", package = "HT12ProcessoR"), # control probe measurements + ercc (spike-in -- class of control probes) system.file("extdata", "ControlProbeProfile_HSS106-HSS155.txt", package = "HT12ProcessoR")) # second example (processing batch 1 & 2) # gene probe file my.nobkgd_f <- c(system.file("extdata", "Run12_nonorm_nobkgd_590-651.txt", package = "HT12ProcessoR"), # raw probe data of human gene expression system.file("extdata", "Einzelanalyse_nonorm_nobkgd_HSS106-HSS155.txt", package = "HT12ProcessoR")) # sample file my.sample_f <- c(system.file("extdata", "Run12_SamplesTable_590-651.txt", package = "HT12ProcessoR"), # sample annotation (each measured indiviuum) system.file("extdata", "SamplesTable_HSS106-HSS155.txt", package = "HT12ProcessoR")) # consolidate in input-file input <- data.table(fileset_id = my.fileset_id, con_f = my.con_f, nobkgd_f = my.nobkgd_f, sample_f = my.sample_f) # how should the input file be called? input_filename <- paste0(prepro.folder, "/obj/01_file_overview.txt") # save the input file for later use write.table(x = input, file = input_filename, quote = F, col.names = T, row.names = F, sep = "\t")
Next, customize project-specific variables in a parameter file. This file is used to provide most relevant parameters in a single place for the individual preprocessing functions. Parameters will also be explained later. An example is provided with this package
myparams <- read.delim(system.file("extdata", "input_parameter_010.txt", package = "HT12ProcessoR"), quote = "", sep = "\t")
Project-specific parameters that should be customized initially are:
file_names_of_files_and_folders
: Location of the files to preprocess; this is the input_filename
created in the previous stepdatafolder_results
: Directory, where the results of preprocessing can be stored# where is the input file we created located? myparams[myparams$variable == "file_names_of_files_and_folders", "value"] <- input_filename # where should the results be saved? myparams[myparams$variable == "datafolder_results", "value"] <- paste0(prepro.folder, "/tosend/") # where should the file be saved (overwrite the previously read-in file) myparamfile <- paste0(prepro.folder, "/obj/input_parameter_010.txt") # adjust for covariates T/F myparams[myparams$variable == "ComBat_adjust_covariates", "value"] <- T # what are the columns in chipsamples containing the covariate data called? myparams[myparams$variable == "ComBat_covariates_to_adjust", "value"] <- paste("covar1", "covar2", "covar3", sep = ", ") # save the file again with the above changes write.table(myparams, myparamfile, quote = F, col.names = T, row.names = F, sep="\t")
checkExtractChipsamples
: Check given expression files and extract individualsThis step does the following:
renameDublettes
$chipsamples
# execute function for the checks described above prepro_ht12 <- checkExtractChipsamples(paramfile = myparamfile) # check the output of the function for details # print object in HTML-report pander(prepro_ht12$history) # check the class of the object class(prepro_ht12) # another file preview pander(ht(prepro_ht12$chipsamples, 1))
Add fake covariate data
# Add covariates to chipsamples covar.id <- prepro_ht12$chipsamples$new_ID # create some random covariates (just for proof of concept) covars <- data.frame(id = covar.id, covar1 = sample(LETTERS[1:3], size = length(covar.id), replace = T), covar2 = rnorm(length(covar.id)), covar3 = factor(sample(c(0,1), size = length(covar.id), replace = T)) ) # add covariates to ht12object for(i in names(covars)[-1]){ matched <- match(prepro_ht12$chipsamples$new_ID, covars$id) prepro_ht12$chipsamples[[i]] <- covars[matched, i] # set some NA prepro_ht12$chipsamples[[i]][sample(1:nrow(prepro_ht12$chipsamples), size = sample(1:5, 1))] <- NA } # what are the covariates calles? covar.cols <- names(covars)[-1]
Exclude samples:
old_ID
in prepro_ht12$chipsamples
can be used, which reports the original ID used in the raw data from column Sample ID
of the sample-file from ILLUMINADesignate subgroups:
Alternative treatment of subgoups:
removeBatchEffects()
can be done by preserving the subgroup contrastSentrix.Barcode
, the Chip-ID.processingbatch
- The default is the column fileset_id
from the above file input_filename
strangebatch
- default here is 0
# preview first and last lines pander(ht(prepro_ht12$chipsamples, 2)) # define samples to be excluded, e.g. when different projects are processed on the same chip exclude.this <- c("614", "638", "HSS 108", "HSS 112", "HSS 115", "HSS 125", "HSS 127", "HSS 133", "HSS 137", "HSS 138", "HSS 146", "HSS 151","HSS 155") # Exclude Samples by modifying the $chipsamples element prepro_ht12$chipsamples <- prepro_ht12$chipsamples[!prepro_ht12$chipsamples$old_ID %in% exclude.this, ] # Define Subgroups - in this example, sampels are from Peripheral Blood Mononuclear Cells (PBMC) prepro_ht12$chipsamples$subgroup <- "PBMC" table(prepro_ht12$chipsamples$subgroup) # show which IDs were processed in which batch xtabs(~prepro_ht12$chipsamples$Sentrix.Barcode + prepro_ht12$chipsamples$processingbatch) # do we have strange batches on top of regular batch structure xtabs(~prepro_ht12$chipsamples$strangebatch + prepro_ht12$chipsamples$processingbatch) ## Quick Check for major differences based on Attributes from the provided ILLUMINA-sample-files initial_pca <- calcInitialPCA(prepro_ht12) # plot the results pca.plot <- ggplot(initial_pca$scores, aes(PC1, PC2, pch = subgroup, col = processingbatch, size = Detected.Genes..0.01., alpha = in_study, label = paste(old_ID, new_ID, sep = "\n"))) + geom_point() + scale_alpha_manual(values = c(0.7, 0.3)) # plot a static version pca.plot # interactive visualisation - check homogeinity etc. pca.plot %>% ggplotly()
createExpressionSet()
: Create an expression set and related information from dataIn this step, the expression information that is scattered across many files is united in a single expression set object. This includes control probe information.
Among others, following slots will be created within the updated HT12prepro-object among others:
$chipsamples
$rawdataWOcons_joined
$rawdataOnlycons_joined
$rawdataOnlycons_joined
$all_nobkgd_eset
$total_nobkgd_eset
$history
Additionally, integrity checks are carried out:
# create the ExpressionSet prepro_ht12 <- createExpressionSet(prepro_ht12, paramfile = myparamfile) # watch the function output # explore file output # show the command history kable(prepro_ht12$history) # preview expressionset including samples and controls prepro_ht12$total_nobkgd_eset # detection levels per probe hh(lumi::detection(prepro_ht12$total_nobkgd_eset)) # expression lvl per probe hh(lumi::exprs(prepro_ht12$total_nobkgd_eset))
filterLowExpressed
: Filter badly expressed individuals & mark badly expressed genesThis step includes:
filter1ind_expressedGenes
as threshold for the number of interquantile ranges of the number of detected probes to be tolerated. The definition of detection is related to a ILLUMIN-defined detection p-value of 0.01filter1probes_expressedProbes
as threshold, default is 0.05Changes to the HT12prepro-object, among others, are:
$chipsamples
is updated $genesdetail
$history
is updated# execute step 4 by running this function prepro_ht12 <- filterLowExpressed(prepro_ht12, paramfile = myparamfile) # show command history kable(prepro_ht12$history) # preview function output head(prepro_ht12$genesdetail) # show excluded samples setDT(prepro_ht12$chipsamples) prepro_ht12$chipsamples[,.N,.(in_study, reason4exclusion)] # and reset class of this object setDF(prepro_ht12$chipsamples)
transformNormalizeHT12object()
and filterTechnicallyFailed()
Transform & normalize data & filter samples for atypical control valuesnormalisation_method
, method rsn
is also available)This number can be changed via parameter filter2ind_atypischIlmnKontroll
Within the updated HT12prepro-object, among others, following elements are subject to change:
$chipsamples
is updated $all_nobkgd_eset_ql
$total_nobkgd_eset_ql
the slot with the history of the commands named $history
is updated
Several QC plots are shown displaying ILLUMINAS QC-parameter
Within this process, Illuminas QC parameters are recalculated using the transformed and normalized data
The file obj/s06_file_sampleattributes_in_combined_total_eset_expr_mahal.txt will be created, containing the sample related attributes of the current proccessing stage
# Transform and normalize the data with this function prepro_ht12 <- transformNormalizeHT12object(prepro_ht12, paramfile = myparamfile) # execute the filter step described above with this function prepro_ht12 <- filterTechnicallyFailed(prepro_ht12, paramfile = myparamfile) # show command execution history kable(prepro_ht12$history)
filter4MinBatchsize()
, transformNormalizeHT12object()
, and removeBatchEffects()
: Transform and normalize still valid samples and adjust for batch-effects$chipsamples
is updated $total_nobkgd_eset_ql_combat
ComBat()
from the R-package sva for details# filter for batch-size prepro_ht12 <- filter4MinBatchsize(prepro_ht12, paramfile = myparamfile) # repeat transformation and normalization # This is only necessary if samples were removed in previous step prepro_ht12 <- transformNormalizeHT12object(prepro_ht12, paramfile = myparamfile) # execute batch- adjustment via sva::ComBat() prepro_ht12 <- removeBatchEffects(ht12object = prepro_ht12, paramfile = myparamfile) # review command history kable(prepro_ht12$history)
checkBatchEffects()
: Check for remaining batch effectsSentrix.Barcode
, processingbatch
, and - if defined - strangebatch
QQ plots of association are shown
Within the updated HT12prepro-object among others,
$genesdetail
is updated $history
is updated# check for remaining batch-effects prepro_ht12 <- checkBatchEffects(prepro_ht12, paramfile = myparamfile) # preview the newly created object ht(prepro_ht12$genesdetail, 2) # review command history kable(prepro_ht12$history)
filterAtypicalExpressed()
: Identify samples with atypical expression valuesfilter2ind_atypischEuklid
!Note that sometimes it might be meaningful to repeat correction for batch association after excluding atypical samples identified in this step
Within the updated HT12prepro-object among others,
$chipsamples
is updated $history
is updated# filter for atypical expression levels prepro_ht12 <- filterAtypicalExpressed(prepro_ht12, paramfile = myparamfile) # preview newly created output ht(prepro_ht12$chipsamples, 2) # review command history kable(prepro_ht12$history)
visualizePreprocessing()
and comparePCBeforeAfterPrePro()
: visualization of Quality Control measuresPCA of gene expression after preprocessing using only expression probes that are expressed, not overinflated regarding batch affects and beeing specific to the human genome when considering a remapping approach according to Barbosa-Moralis et al. (2010)
Within the updated HT12prepro-object among others,
$dokuobjects_visualizePreprocessing
$history
is updated# create QC-plots prepro_ht12 <- visualizePreprocessing(prepro_ht12, paramfile = myparamfile) # PCA before/after preprocessing prepro_ht12 <- comparePCBeforeAfterPrePro(prepro_ht12, paramfile = myparamfile) # review command history kable(prepro_ht12$history)
writeFilesTosend()
: Write preprocessed dataThe directory of the resulting preprocessed data was specified in step Preparations
file_sampleannot_final.txt
: default is sampleannot_HT12v4.txt --> annotation of samplesfile_probeannot_final.txt
: default is probeannot_HT12v4.txt --> annotation of probesfile_annot_final.xlsx
: default is sampleUNDprobeannot_HT12v4.xlsx --> annotation of samples and probes as excel filefile_final_expression_set
: default is expressionset_preprocessed.Rdata --> Preprocessed data as expression setfile_final_expression_matrix
: default is expressionmatrix_preprocessed.Rdata --> Preprocessed data as R-Matrixfile_all_transcripts_good_incl_remapping_ok
: default is all_transcripts_good_incl_remapping_ok.txt --> IDs of probes that are considered expressed in all subgroups, have no remaining batch effect stronger than expected by chance, and are ok given a remapping approach of Barbosa-Moralis et al. (2010) -- Note that probes with a remaining batch effect might still be included in association analysis if no nominal significant correllation is found between the batch and the analysis variable of interestfile_final_expression_matrix_allProbes.txt
: default is expressionmatrix_preprocessed.txt --> Filename of expression matrix as tab delimited text file after finished preprocessingParameter file_renaming_samples_tosend
can specify a filename where a column named oldname
and newname
is provided in order to rename sample. Column oldname
must match the names of the samples in column new_ID
in the $chipsamples
data.frame included in the HT12 object. Can be empty if no renaming is required
## Set parameter to use provided renaming file renaming_fn <- system.file("extdata", "renamingsamples.txt", package = "HT12ProcessoR") # edit parameter file with location of renaming-file myparams[ myparams$variable == "file_renaming_samples_tosend", "value"] <- renaming_fn # write the parameter file write.table(myparams, myparamfile, quote = F, col.names = T, row.names = F, sep="\t") # write result files into the tosend/ folder prepro_ht12 = writeFilesTosend(prepro_ht12, paramfile = myparamfile) # preview first and last line of $chipsamples object ht(prepro_ht12$chipsamples, 1) # review command history kable(prepro_ht12$history)
It might be helpful, to save the HT12-object to preserve detailed information on preprocessing
# save an .RData object containing the final prepro_ht12 object save(prepro_ht12, file = paste0(prepro.folder, "/obj/prepro_example1_HT12object.RData"))
createTextForMethods()
: Information for Material and Methods section of a manuscriptThis function creates an extended and a short version of the preprocessing done for section Material and Methods of a publication.
Please slightly rephrase text as most journals require unique phrasing even in materials & methods section
# create text-object mm_text <- createTextForMethods(prepro_ht12, paramfile = myparamfile) # preview created text as a long version cat('### Extended Version') cat(mm_text$extended_version) # preview created text as a short version cat('### Short Version') cat(mm_text$short_version)
Session Info
# output script run time total.time <- round(as.numeric(Sys.time() - time.start), 2) message("This script ran for a total of ", total.time, " minutes.") # output session info sessionInfo()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.