inst/doc/Introduction.R

## ---- include = FALSE---------------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  out.width = "100%",
  error = TRUE
)
knitr::opts_knit$set(global.par = TRUE)

## ---- include=FALSE-------------------------------------------------------------------------------
set.seed(1)
options(width = 100)
par(cex.lab=1.5, cex.axis=1.5, cex.main=1.5, cex.sub=1.5)

## ----eval=FALSE-----------------------------------------------------------------------------------
#  if(!requireNamespace("BiocManager", quietly = TRUE))
#      install.packages("BiocManager")
#  BiocManager::install("proDA")

## ----eval=FALSE-----------------------------------------------------------------------------------
#  # install.packages("devtools")
#  devtools::install_github("const-ae/proDA")

## ----quickstart-----------------------------------------------------------------------------------
# Load the package
library(proDA)
# Generate some dataset with known structure
syn_dataset <- generate_synthetic_data(n_proteins = 100, n_conditions = 2)

# The abundance matrix
syn_dataset$Y[1:5, ]

# Assignment of the samples to the two conditions
syn_dataset$groups

# Fit the probabilistic dropout model
fit <- proDA(syn_dataset$Y, design = syn_dataset$groups)

# Identify which proteins differ between Condition 1 and 2
test_diff(fit, `Condition_1` - `Condition_2`, sort_by = "pval", n_max = 5)

## -------------------------------------------------------------------------------------------------
system.file("extdata/proteinGroups.txt", package = "proDA", mustWork = TRUE)

## -------------------------------------------------------------------------------------------------
# Load the table into memory
maxquant_protein_table <- read.delim(
    system.file("extdata/proteinGroups.txt", package = "proDA", mustWork = TRUE),
    stringsAsFactors = FALSE
)

## -------------------------------------------------------------------------------------------------
# I use a regular expression (regex) to select the intensity columns
intensity_colnames <- grep("^LFQ\\.intensity\\.", colnames(maxquant_protein_table), value=TRUE)
head(intensity_colnames)


# Create the intensity matrix
abundance_matrix <- as.matrix(maxquant_protein_table[, intensity_colnames])
# Adapt column and row maxquant_protein_table
colnames(abundance_matrix) <- sub("^LFQ\\.intensity\\.", "", intensity_colnames)
rownames(abundance_matrix) <- maxquant_protein_table$Protein.IDs
# Print some rows of the matrix with short names so they fit on the screen
abundance_matrix[46:48, 1:6]

## -------------------------------------------------------------------------------------------------
abundance_matrix[abundance_matrix == 0] <- NA
abundance_matrix <- log2(abundance_matrix)
abundance_matrix[46:48, 1:6]


## ----qc-mis_barplot,  out.width="80%", fig.height=4, fig.align="center"---------------------------

barplot(colSums(is.na(abundance_matrix)),
        ylab = "# missing values",
        xlab = "Sample 1 to 36")

## ----qc-raw_boxplot, out.width="80%", fig.height=4, fig.align="center"----------------------------
boxplot(abundance_matrix,
        ylab = "Intensity Distribution",
        xlab = "Sample 1 to 36")

## -------------------------------------------------------------------------------------------------
normalized_abundance_matrix <- median_normalization(abundance_matrix)

## -------------------------------------------------------------------------------------------------
da <- dist_approx(normalized_abundance_matrix)

## ----sample_dist, out.width="60%", fig.align="center"---------------------------------------------
# This chunk only works if pheatmap is installed
# install.packages("pheatmap")
sel <- c(1:3,  # CG1407
         7:9,  # CG59163
         22:24)# CG6618

plot_mat <- as.matrix(da$mean)[sel, sel]
# Remove diagonal elements, so that the colorscale is not distorted
plot_mat[diag(9) == 1] <- NA
# 95% conf interval is approx `sd * 1.96`
uncertainty <- matrix(paste0(" ± ",round(as.matrix(da$sd * 1.96)[sel, sel], 1)), nrow=9)
pheatmap::pheatmap(plot_mat, 
                   cluster_rows = FALSE, cluster_cols = FALSE,
                   display_numbers= uncertainty,
                   number_color = "black")

## -------------------------------------------------------------------------------------------------
# The best way to create this data.frame depends on the column naming scheme
sample_info_df <- data.frame(name = colnames(normalized_abundance_matrix),
                             stringsAsFactors = FALSE)
sample_info_df$condition <- substr(sample_info_df$name, 1, nchar(sample_info_df$name)  - 3)
sample_info_df$replicate <- as.numeric(
  substr(sample_info_df$name, nchar(sample_info_df$name)  - 1, 20)
)
sample_info_df

## -------------------------------------------------------------------------------------------------
fit <- proDA(normalized_abundance_matrix, design = ~ condition, 
             col_data = sample_info_df, reference_level = "S2R")
fit

## -------------------------------------------------------------------------------------------------
# Equivalent to feature_parameters(fit)
fit$feature_parameters

## ----protein_dist, out.width="60%", fig.align="center"--------------------------------------------
# This chunk only works if pheatmap is installed
# install.packages("pheatmap")
pheatmap::pheatmap(dist_approx(fit[1:20, 1:3], by_sample = FALSE)$mean)

## -------------------------------------------------------------------------------------------------
# Test which proteins differ between condition CG1407 and S2R
# S2R is the default contrast, because it was specified as the `reference_level`
test_res <- test_diff(fit, "conditionCG1407")
test_res

## -------------------------------------------------------------------------------------------------
sessionInfo()

Try the proDA package in your browser

Any scripts or data that you put into this service are public.

proDA documentation built on Nov. 8, 2020, 5:01 p.m.