testing the functions

#library(abcnet)
#library(dplyr)

Example 1: Using the TCGA Data

#Loading data
group_df=readr::read_tsv("data/PanImmune_FMx_ImmuneSubtypes.tsv")

## Read available gene expression data
dfe_in <- readr::read_tsv("data/GenExp_All_CKine_genes.tsv")
## Read file with available cell data
dfc_in <- readr::read_tsv("data/PanImmune_FMx.tsv")

## Read scaffold interaction file and node type file
node_type <- readr::read_tsv("data/network_node_labels.tsv")
scaffold <- readr::read_tsv("data/try_3a.tsv")

##Read file with cells and genes of interest
cois <- readr::read_lines("data/cells_of_interest.txt")
gois <- readr::read_lines("data/immunomodulator_genes.txt")

1.1. Filtering the scaffold to our cells and genes of interest

#Updating the list of genes of interest based on the gene expression data available
gois <- intersect(gois, colnames(dfe_in))

# Selecting the edges in the scaffold that have at least one gois, and a cois connected to a gois
filtered_scaffold <- abcnet::get_scaffold(scaffold, cois, gois)

#let's update the list of genes of interest with all genes that are present in the filtered scaffold
gois <- union(filtered_scaffold$From, filtered_scaffold$To) %>% intersect(colnames(dfe_in))

1.2. Organizing the expression data

The functions to compute the scores take the cell data and gene expression data in the tidy format. Each row of the measurement data should contain, at least, the following columns: - a sample ID - the node - the expression value - group of this sample ID

Our input data is not in this format right now. As a first step, let's organize the data.

group_col="Subtype_Immune_Model_Based"

#organizing the combination of participant and grouping variable
participants <- group_df$ParticipantBarcode
  groups <- group_df[[group_col]]
  group_of_participant <- groups ; names(group_of_participant) <- participants

#The cell data is in a specific column, so here we are selecting it  
dfc <- dfc_in %>% #filter(ParticipantBarcode %in% participants) %>%
    select(ParticipantBarcode,paste(cois,".Aggregate2",sep=""))

colnames(dfc) <- gsub(".Aggregate2","",colnames(dfc))

#Now, we include the group annotation for each sample
dfc <- dfc %>% 
  dplyr::mutate(Group=group_of_participant[ParticipantBarcode]) 

#Transforming each observation in one row (ie, making a long table)
dfclong <- dfc %>% tidyr::gather(Cell,Cell_Fraction,-c(ParticipantBarcode,Group))

#This step is optional - we are just adding some noise to the measurement
dfclong <- dfclong %>% dplyr::mutate(Cell_Fraction=Cell_Fraction+rnorm(mean=0, sd=0.0001,nrow(.)))

dfclong.generic <- dfclong %>% rename(Node=Cell,Value=Cell_Fraction)

tail(dfclong.generic)

The gene expression data requires a similar processing. After this, we can merge the data for cells and genes in one dataframe.

dfelong <- dfe_in %>% tidyr::gather(Gene,Expression,-ParticipantBarcode)
dfelong <- dfelong %>% dplyr::mutate(ExpLog2 = log2(Expression+1)+rnorm(mean=0, sd=0.0001,nrow(.))) %>%
  dplyr::select(ParticipantBarcode,Gene,Expression=ExpLog2) %>%
  dplyr::mutate(Group=group_of_participant[ParticipantBarcode])

dfelong.generic <- dfelong %>% rename(Node=Gene,Value=Expression)

dfn <- dplyr::bind_rows(dfelong.generic, dfclong.generic)

head(dfn)  

The cell and gene data is in the format to be used for computing the thresholds. - a sample ID: ParticipantBarcode - the node: Node - the expression value: Value - group of this sample ID: Group

1.3. Computing info for abundance scores for all nodes that are in the scaffold and that which we have data

nodes_scores <- abcnet::compute_abundance(dfn, node= "Node", ids = "ParticipantBarcode", exprv = "Value", group = "Group", cois = cois, gois = gois)

nodes_scores$Node[1]
nodes_scores$RatioPerGroup[[1]]

This is the ratio of samples in each group that have the expression of the gene A2M mapping to the upper two tertiles. The output dataframe of this function is formatted in a way that makes it easier to compute the scores of the edges, in section 1.4.

To better organize the table with the abundance scores for each node and group combination, we can creat a new object:

#abundance_scores <- nodes_scores %>% dplyr::select(Node, RatioPerGroup) %>% tidyr::unnest(c(RatioPerGroup))
abundance_scores <- abcnet::get_abundance_table(nodes_scores)
head(abundance_scores)

1.4. Computing the concordance scores for the edges in the filtered scaffold

edges_scores <- abcnet::compute_concordance(filtered_scaffold, nodes_scores)

head(edges_scores)

The abundance_scores and edges_scores objects are in the appropriate format to be exported as tsv files. These files can be used in network visualization softwares, such as Cytoscape.

readr::write_tsv(abundance_scores, paste("nodes_", Sys.Date(),".tsv", sep = ""))
readr::write_tsv(edges_scores, paste("edges_", Sys.Date(),".tsv", sep = ""))

Example 2: creating a random dataset to see if the functions work

#Creating 50 random patients IDs
SampleIDs <-  as.data.frame(do.call(paste0, replicate(5, sample(LETTERS, 100, TRUE), FALSE)))
colnames(SampleIDs) <- "SampleID"

#Creating 2 random groups for the SampleIDs
group <- c("G1", "G2")

SampleIDs <- SampleIDs %>% 
  mutate(Group = sample(group, nrow(.), replace = TRUE))

#Selecting 6 genes IDs to be used in the computation
Nodes <- c("BTLA","VTCN1","CANX","CCL5","HLA-B","TNFRSF14")

expr_data <- merge(SampleIDs, Nodes)
expr_data <- expr_data %>% 
  dplyr::mutate(value = rnorm(mean = 10, sd = 5, nrow(.)))

head(expr_data)

Computing the abundance scores

nodes_abundance <- abcnet::compute_abundance(expr_data, "y", "SampleID", "value", "Group", cois, gois)

nodes_abundance$Node[1]
nodes_abundance$RatioPerGroup[[1]]

#Organizing the abundance scores for exporting it
nodes_table <- abcnet::get_abundance_table(nodes_abundance)

Computing the edges scores

edges_concordance <- abcnet::compute_concordance(filtered_scaffold, nodes_abundance)

edges_concordance


heimannch/abcnet documentation built on Jan. 2, 2021, 5:06 p.m.