# PepSAVIms_Introduction.R
# In PepSAVIms: PepSAVI-MS Data Analysis

## ----setup, cache=FALSE, include=FALSE--------------------------------------------------
# Attach knitr so that knit_theme, opts_chunk and opts_knit are in scope
library(knitr)

# Wrap printed output at 90 characters
options(width = 90)

# Use the default knitr theme
knit_theme$set("default")

# Resolve paths relative to the parent of the current working directory, and
# turn chunk caching off
opts_knit$set(root.dir = normalizePath(".."))
opts_chunk$set(cache = FALSE)

## ----reading-data-ms--------------------------------------------------------------------
# Attach the PepSAVIms package
library(PepSAVIms)

# The mass spectrometry data, mass_spec, is provided as a data.frame
is.data.frame(mass_spec)

# Dimensions of mass_spec: there are 30,799 mass-to-charge levels, and 38
# variables
dim(mass_spec)

# Column names of mass_spec.  The first four variables provide the m/z level,
# time of peak retention, mass, and charge state of each observation.  The
# remaining 34 variables are the mass spectrometric intensities for each
# compound across fractions 11 through 43, and fraction 47.
names(mass_spec)

## ----reading-data-bio-------------------------------------------------------------------
# Load the bioact data set into the workspace
data(bioact)

# bioact is a list with each element corresponding to bioactivity data
is.list(bioact)

# Names of the elements in bioact
names(bioact)

# Arbitrarily select one of the data sets (the ec element) for use in the
# examples that follow
EC <- bioact$ec

# EC is provided as a data.frame
is.data.frame(EC)

# Dimensions of EC: it contains data for 3 replicates and 44 fractions
dim(EC)

# The names of the fractions for which bioactivity observations were obtained
names(EC)

## ----consolidating-data-names-----------------------------------------------------------

# Consolidate the raw data with binMS.  The m/z, charge, mass and retention
# time information is identified by giving column names of mass_spec, and
# ms_inten is left as NULL.
# NOTE(review): "Reten" presumably matches the "Retention time (min)" column
# that is referenced by its full name elsewhere in this file -- confirm.
bin_out <- binMS(
  mass_spec       = mass_spec,
  mtoz            = "m/z",
  charge          = "Charge",
  mass            = "Mass",
  time_peak_reten = "Reten",
  ms_inten        = NULL,
  time_range      = c(14, 45),
  mass_range      = c(2000, 15000),
  charge_range    = c(2, 10),
  mtoz_diff       = 0.05,
  time_diff       = 60
)

## ----consolidating-data-all-------------------------------------------------------------

# Pull the retention time and mass columns out of mass_spec so that they can
# be handed to binMS as data vectors rather than as column names
time_vals <- mass_spec[, "Retention time (min)"]
mass_vals <- mass_spec[, "Mass"]

# Names identifying the intensity columns (fractions 11 through 43, and 47).
# The leading underscore is included so as to prevent any ambiguity between
# the fraction number and date.
inten_nm <- paste0("_", c(11:43, 47L))

# Repeat the consolidation using the alternate forms of input: mass and
# retention time as vectors, and the intensity columns named explicitly
bin_out_v2 <- binMS(
  mass_spec       = mass_spec,
  mtoz            = "m/z",
  charge          = "Charge",
  mass            = mass_vals,
  time_peak_reten = time_vals,
  ms_inten        = inten_nm,
  time_range      = c(14, 45),
  mass_range      = c(2000, 15000),
  charge_range    = c(2, 10),
  mtoz_diff       = 0.05,
  time_diff       = 60
)

# The result is the same whether the data is specified via column names or
# passed to the function directly
identical(bin_out, bin_out_v2)

## ----consolidating-data-summary---------------------------------------------------------
# Printing the object returned by binMS shows the size of the consolidated data
bin_out

# Show summary information describing the consolidation process
summary(bin_out)

## ----filtering-data-names---------------------------------------------------------------

# Filter the consolidated data, specifying the region of interest by column
# name
filter_out <- filterMS(
  msObj      = bin_out,
  region     = paste0("VO_", 17:25),
  border     = "all",
  bord_ratio = 0.01,
  min_inten  = 1000,
  max_chg    = 10
)

# Columns 7 through 15 are the ones corresponding to fractions 17 through 25
colnames(filter_out)[7:15]

# Filter again, this time specifying the region of interest by column index
filter_out_v2 <- filterMS(
  msObj      = bin_out,
  region     = 7:15,
  border     = "all",
  bord_ratio = 0.01,
  min_inten  = 1000,
  max_chg    = 10
)

# Both ways of specifying the region give equivalent objects
identical(filter_out, filter_out_v2)

## ----filtering-data-border--------------------------------------------------------------

# A single value sets the width of both the left and the right bordering
# region
filter_out_v3 <- filterMS(
  msObj      = bin_out,
  region     = paste0("VO_", 17:25),
  border     = 100,
  bord_ratio = 0.01,
  min_inten  = 1000,
  max_chg    = 10
)

# Two values set the left width and the right width of the bordering region
# separately
filter_out_v4 <- filterMS(
  msObj      = bin_out,
  region     = paste0("VO_", 17:25),
  border     = c(150, 200),
  bord_ratio = 0.01,
  min_inten  = 1000,
  max_chg    = 10
)

# We get the same result by specifying the left and right bordering regions as
# having widths 100 as by choosing "all"
identical(filter_out$msDatObj, filter_out_v3$msDatObj)

# We get the same result by specifying the left and right bordering regions as
# having widths 150 and 200 as by choosing "all"
identical(filter_out$msDatObj, filter_out_v4$msDatObj)

## ----filtering-data-summary-------------------------------------------------------------
# Printing the object returned by filterMS shows the size of the filtered data
filter_out

# Show summary information describing the filtering process
summary(filter_out)

## ----ranking-data-names-----------------------------------------------------------------
# Rank the candidate compounds, using fractions 21 through 24 as the region of
# interest in both the mass spectrometry data and the bioactivity data
rank_out <- rankEN(
  msObj      = filter_out,
  bioact     = EC,
  region_ms  = paste0("_", 21:24),
  region_bio = paste0("_", 21:24),
  lambda     = 0.001,
  pos_only   = TRUE,
  ncomp      = NULL
)

## ----ranking-data-summary---------------------------------------------------------------
# Printing the object returned by rankEN shows the dimensions of the data
rank_out

# Show the first 10 candidate compounds obtained by the procedure
summary(rank_out, 10)

## ----ranking-data-compounds-------------------------------------------------------------
# Extract the ranked candidates from the object returned by rankEN
ranked_candidates <- extract_ranked(rank_out)

# Return object is a data.frame
is.data.frame(ranked_candidates)

# Print the first few candidates; these should be the same as those shown by
# the summary function
head(ranked_candidates)

## ----extractMS-matrix-------------------------------------------------------------------

# Refactor the filtered data as a matrix
filter_matr <- extractMS(msObj = filter_out, type = "matrix")

# Return object is a matrix
is.matrix(filter_matr)

# The data has two extra columns, one each for the m/z and charge information
dim(filter_matr)

# Compare to the result of calling dim on the original msDat object
dim(filter_out)

# Print the first few rows and columns of the newly formed matrix.  The row
# names of the matrix are the concatenation of the mass-to-charge ratio and
# charge state, separated by a /.
filter_matr[1:5, 1:4]

## ----extractMS-matrix-export, eval=FALSE------------------------------------------------
#  # Save the data as a csv file.  Probably don't want to keep the row names as that
#  # information is contained in the first two columns of the data.
#  write.csv(filter_matr, file = "filtered_mass_spec.csv", row.names = FALSE)

## ----extractMS-msDat--------------------------------------------------------------------
# Pull the encapsulated msDat object out of the filterMS object
filter_msDat <- extractMS(msObj = filter_out, type = "msDat")

# For a subclass of msDat, extractMS has the effect of taking the msDatObj
# element, i.e. of performing the following command
filter_msDat_v2 <- filter_out$msDatObj

# The two routes give the same object
identical(filter_msDat, filter_msDat_v2)

# On an object that is strictly of class msDat, extractMS is effectively a
# no-op
filter_msDat_v3 <- extractMS(msObj = filter_msDat, type = "msDat")

# extractMS on a strictly msDat object returns the original object
identical(filter_msDat, filter_msDat_v3)

# Printing the extracted msDat object prints the intensity matrix (as opposed
# to the print function for binMS or filterMS objects).  Also compare this to
# the extracted matrix in the previous section: in this form the
# mass-to-charge and charge data is not exposed to the user.
filter_msDat[1:5, 1:2]

## ----msDat------------------------------------------------------------------------------

# Build an msDat object from the matrix that was produced by extractMS,
# identifying the m/z and charge columns by name
filter_out_v5 <- msDat(
  mass_spec = filter_matr,
  mtoz      = "mtoz",
  charge    = "charge",
  ms_inten  = NULL
)

# Confirm that the reconstructed msDat object is equal to the original.
# Attributes have to be ignored in the comparison because row names are not
# retained.
all.equal(filter_out_v5, filter_out$msDatObj, check.attributes = FALSE)


## ----msDat-API--------------------------------------------------------------------------

# Dimensions of the msDat object; nrow and ncol can also be used
dim(filter_msDat)

# Print the first few rows and columns
filter_msDat[1:5, 1:3]

# Replace the fraction names with something more concise
colnames(filter_msDat) <- paste0("frac", c(11:43, 47L))

# Print the first few rows and columns with the new fraction names
filter_msDat[1:5, 1:10]

# Suppose there are some m/z levels that we wish to remove: drop rows 2 and 4
filter_msDat <- filter_msDat[c(-2, -4), ]

# Print the first few rows and columns after removing rows 2 and 4
filter_msDat[1:5, 1:10]

# Suppose that there was an instrumentation error and that some values need to
# be changed: overwrite fractions 12 through 17 of the first row
filter_msDat[1, paste0("frac", 12:17)] <- c(55, 57, 62, 66, 71, 79)

# Print the first few rows and columns after changing some of the values in
# the first row
filter_msDat[1:5, 1:10]