inst/doc/package_data.R

## ----setup, message=FALSE, warning=FALSE, echo=FALSE--------------------------
reset_options_scipen <- getOption("scipen")
reset_options_digits <- getOption("digits")
options(scipen = 1, digits = 5)
library(dnapath)

## -----------------------------------------------------------------------------
pathway_list <- dnapath::get_reactome_pathways("human")

## -----------------------------------------------------------------------------
pathway_list[1:2]

## -----------------------------------------------------------------------------
# Obtain index of pathways containing "p53" or "pi3k" in their name.
index_pathways <- grep("((p53)|(pi3k))", names(pathway_list),
                       ignore.case = TRUE)

## -----------------------------------------------------------------------------
# Subset onto the "p53" and "pi3k" pathways.
cancer_pathways <- pathway_list[index_pathways]
cancer_pathways[1:3] # Print out the first three

## -----------------------------------------------------------------------------
# Make a vector of all entrezgene IDs that we are interested in.
genes_of_interest <- c(7157, 5290, 5291, 5293, 5294)
# Obtain index of pathways containing any genes of interest.
index_pathways <- which(sapply(pathway_list, function(pathway) {
  any(genes_of_interest %in% pathway)
}))

## -----------------------------------------------------------------------------
# Subset onto pathways containing p53 gene or pi3k family of genes.
cancer_pathways <- pathway_list[index_pathways]
cancer_pathways[1:3] # Print out the first three

## -----------------------------------------------------------------------------
# Obtain index of pathways containing "p53" in their name.
index_pathways <- grep("(p53)", names(pathway_list),ignore.case = TRUE)
p53_pathways <- pathway_list[index_pathways]

## ----message = FALSE----------------------------------------------------------
file_clinical <- paste0("http://linkedomics.org/data_download/TCGA-MESO/Human__",
                        "TCGA_MESO__MS__Clinical__Clinical__01_28_2016__BI__",
                        "Clinical__Firehose.tsi")
file_rnaseq <- paste0("http://linkedomics.org/data_download/TCGA-MESO/Human__",
                      "TCGA_MESO__UNC__RNAseq__HiSeq_RNA__01_28_2016__BI__Gene",
                      "__Firehose_RSEM_log2.cct.gz")

clinical <- readr::read_tsv(file_clinical)
rnaseq <- readr::read_tsv(file_rnaseq)

## -----------------------------------------------------------------------------
# First, we can glimpse at the raw data to see how it is structured.
clinical[1:5, 1:3]

## -----------------------------------------------------------------------------
# The columns contain samples, and rows contain different variables
# We want the transpose of this.
variable_names <- clinical$attrib_name # Store the variable names.

# Transpose the matrix so that columns correspond to variables.
clinical <- t(clinical[, -1]) # Don't include the "attrib_name" column as a row.
colnames(clinical) <- variable_names # Rename columns using variable names.

# Glimpse at the transposed dataset:
clinical[1:4, 1:4]

## -----------------------------------------------------------------------------
# Now that the rows correspond to subjects and columns to variables,
# the next step is to add a new column that contains the sample IDs. 

clinical <- cbind(ID = rownames(clinical), clinical)
rownames(clinical) <- NULL # Remove the row names.

# Glimpse at the data.
clinical[1:4, 1:4]

## -----------------------------------------------------------------------------
# For the example dataset, we are only interested in ID and tumor stage.
clinical <- clinical[, c("ID", "pathologic_stage")]
# The two groups that will be compared are stage 2 and stage 4.
# Subset rows onto those that have stage 2 or stage 4.
clinical <- clinical[clinical[, "pathologic_stage"] %in% c("stageii", "stageiv"), ]
clinical[1:5, ]

## -----------------------------------------------------------------------------
# We are left with only stage 2 and stage 4 samples:
table(clinical[, 2])

## -----------------------------------------------------------------------------
# First, we can glimps at the raw data to see how it is structured.
rnaseq[1:5, 1:5]

## -----------------------------------------------------------------------------
# As with the clinical data, we need to transpose the raw data so that
# samples are in the rows.
gene_symbols <- rnaseq$attrib_name # Store the gene names

rnaseq <- t(rnaseq[, -1])
colnames(rnaseq) <- gene_symbols

rnaseq[1:5, 1:5]

## -----------------------------------------------------------------------------
# Check if all 32 clinical samples contain gene expression data.
all(clinical[, "ID"] %in% rownames(rnaseq))

# Subset the rnaseq data onto those in the clinical data
rnaseq <- rnaseq[rownames(rnaseq) %in% clinical[, "ID"], ]

# Finally, we must make sure the two dataset are aligned.
# There are many ways to do this, here is one:
#   For each ID in the clinical dataset, find the corresponding row in rnaseq.
#   The rows are then reordered to match the IDs in clinical.
rnaseq <- rnaseq[sapply(clinical[, "ID"], 
                        function(ID) which(rownames(rnaseq) == ID)), ]

# Check that the IDs match:
all(rownames(rnaseq) == clinical[, "ID"])

rnaseq[1:5, 1:5]

## -----------------------------------------------------------------------------
# Convert gene symbols -> entrezgene IDs
gene_symbols <- colnames(rnaseq) # Extract the gene symbols to relabel.
gene_mat <- symbol_to_entrez(gene_symbols, "human",
                             dir_save = tempdir()) # Obtain mapping.
gene_mat[1:5, ]

## ----warning = FALSE----------------------------------------------------------
# The rename_genes() method is a multipurpose function that can be used to rename 
# the genes in a vector (the current case), a pathway list, or the 'dnapath_list'
# object that is returned from dnapath() after performing the differential
# network analysis.
gene_entrez <- rename_genes(gene_symbols, gene_mat) # Rename the symbols.
colnames(rnaseq) <- gene_entrez # Update the columns using the entrezgene IDs.
rnaseq[1:5, 1:5] # Columns now contain entrezgene IDs

## -----------------------------------------------------------------------------
# Subset the columns onto only those genes contained in 'p53_pathways'
index_genes <- which(colnames(rnaseq) %in% get_genes(p53_pathways))
rnaseq <- rnaseq[, index_genes]
dim(rnaseq) # 32 samples with 150 genes.

## -----------------------------------------------------------------------------
# Convert entrezgene IDs -> gene symbols
gene_entrez <- get_genes(p53_pathways) # Extract genes from pathway list.
gene_mat <- entrez_to_symbol(gene_entrez, "human", 
                             dir_save = tempdir()) # Obtain mapping.
gene_mat[1:5, ]

## -----------------------------------------------------------------------------
# Convert the entrezgene IDs into gene symbols in the pathway list.
new_pathway_list <- rename_genes(p53_pathways, gene_mat) 
new_pathway_list[1:2] # Print the first two pathways.

## -----------------------------------------------------------------------------
meso <- list(gene_expression = rnaseq,
             groups = clinical[, "pathologic_stage"])

## ----echo=FALSE---------------------------------------------------------------
options(scipen = reset_options_scipen, digits = reset_options_digits)

Try the dnapath package in your browser

Any scripts or data that you put into this service are public.

dnapath documentation built on May 9, 2022, 9:05 a.m.