# Set the default knitr chunk option echo = TRUE for the chunks that follow.
knitr::opts_chunk$set(echo = TRUE)

Overview

This script reads data downloaded from the Allen Institute Cell Types Database (http://celltypes.brain-map.org/rnaseq) and converts it into a format suitable for use as comparison data for human fronto-insula.

Prior to running any code, download the data:

  1. If needed, set the working directory first: e.g., setwd("C:/Users/jeremym/Desktop/VEN_TEST")
  2. Create a subfolder in your current working directory called data.
  3. Download and unzip the following files in data.
    a. Human MTG: http://celltypes.brain-map.org/api/v2/well_known_file_download/694416044
    b. Mouse VISp: http://celltypes.brain-map.org/api/v2/well_known_file_download/694413985
    c. Mouse ALM: http://celltypes.brain-map.org/api/v2/well_known_file_download/694413179 NOTE: These links are accurate as of April 2019. If these data are moved in later releases, this code document will be updated accordingly - please post an Issue if needed. Similarly, note that the dates below may need to be updated.

Load the relevant libraries

# Attach the packages used by this script without printing their startup messages.
suppressPackageStartupMessages({
  library(VENcelltypes)  # Required: provides the clusterInfoMTG data set (auto_annotate() presumably comes from here too - confirm)
  library(feather)  # write_feather()
  library(matrixStats)  # rowMedians()
  library(dplyr)  # NOTE(review): no dplyr call is visible in this script - confirm it is still needed
  library(edgeR)  # cpm()
  library(data.table)  # fread()
})
# Ask R not to convert character columns to factors when data frames are built or read.
options(stringsAsFactors=FALSE)

Process the human MTG data

First we need to read the data into R.

# Exon and intron tables: fread() parses each CSV, then as.matrix() uses the
# first column as the row names.
exons <- as.matrix(
  fread("data/human_MTG_2018-06-14_exon-matrix.csv"),
  rownames = 1
)
introns <- as.matrix(
  fread("data/human_MTG_2018-06-14_intron-matrix.csv"),
  rownames = 1
)

# Gene and sample metadata tables, with the first column used as row names.
geneInfo <- read.csv(
  "data/human_MTG_2018-06-14_genes-rows.csv",
  row.names = 1
)
sampInfo <- read.csv(
  "data/human_MTG_2018-06-14_samples-columns.csv",
  row.names = 1
)

Second, convert the meta-data files into formats consistent with the rest of the analysis. Note that the MTG cluster colors (and other info) are stored as a data file in VENcelltypes.

# Flag the cells to keep: everything whose cluster is not "no class"
kp <- sampInfo$cluster != "no class"

# Build the annotation table from the retained cells only
anno <- auto_annotate(sampInfo[kp, ])
anno$sample_id <- anno$sample_name

# Overwrite cluster colors and ids with the values from clusterInfoMTG,
# looking up each cell's cluster label once and reusing the row index
data(clusterInfoMTG)
mtgRow <- match(anno$cluster_label, clusterInfoMTG$cluster_label)
anno$cluster_color <- clusterInfoMTG$cluster_color[mtgRow]
anno$cluster_id    <- clusterInfoMTG$cluster_id[mtgRow]

Next, convert the data into CPM(exons+introns) and format appropriately. For this data set we also precalculate the medians for convenience.

## Calculate CPM on the summed intron and exon counts
CPM <- cpm(introns + exons)
rownames(CPM) <- rownames(geneInfo)
colnames(CPM) <- sampInfo$sample_id
CPM <- CPM[, kp]  # Drop the "no class" cells, using the same mask as for anno

## Format appropriately: transpose so each row is a cell, then tag the rows
data <- as.data.frame(t(CPM))
data$sample_id <- anno$sample_id

## Calculate cluster medians in MTG data for comparison with FI
cl <- anno$cluster_id
names(cl) <- anno$sample_id
# tapply() hands each cluster's sample ids to the function; drop = FALSE keeps
# the subset a matrix even when a cluster holds a single cell, which
# rowMedians() needs. cbind then gives one column of gene medians per cluster.
medianExpr <- do.call(
  "cbind",
  tapply(names(cl), cl, function(x) rowMedians(CPM[, x, drop = FALSE]))
)
medianExpr <- as.data.frame(medianExpr)
medianExpr$gene <- rownames(geneInfo)

Finally, output the results to feather files in the MTG directory.

# Create the MTG directory only if it is missing, so re-running the script
# does not trigger an "already exists" warning
if (!dir.exists("MTG")) {
  dir.create("MTG")
}

# Write annotation file
write_feather(anno, "MTG/anno.feather")

# Write medians file
write_feather(medianExpr, "MTG/medians.feather")

# Write data file
write_feather(data, "MTG/data.feather")

Process the mouse VISp data

First we need to read the data into R.

# Exon and intron tables: fread() parses each CSV, then as.matrix() uses the
# first column as the row names.
exons <- as.matrix(
  fread("data/mouse_VISp_2018-06-14_exon-matrix.csv"),
  rownames = 1
)
introns <- as.matrix(
  fread("data/mouse_VISp_2018-06-14_intron-matrix.csv"),
  rownames = 1
)

# Gene and sample metadata tables, with the first column used as row names.
geneInfo <- read.csv(
  "data/mouse_VISp_2018-06-14_genes-rows.csv",
  row.names = 1
)
sampInfo <- read.csv(
  "data/mouse_VISp_2018-06-14_samples-columns.csv",
  row.names = 1
)

Second, convert the meta-data files into formats consistent with the rest of the analysis.

# Replace missing metadata values with 0 before building the annotation table
sampInfo[is.na(sampInfo)] <- 0
anno <- auto_annotate(sampInfo)
# Expose the sample name under the sample_id column as well
anno$sample_id <- anno$sample_name

Next, convert the data into CPM(exons+introns) and format appropriately.

## CPM on the summed intron and exon counts; rows are named from geneInfo
## and columns from the annotation sample ids
CPM <- cpm(introns + exons)
dimnames(CPM) <- list(rownames(geneInfo), anno$sample_id)

## Transpose so each row is a sample, then attach the sample ids as a column
data <- as.data.frame(t(CPM))
data$sample_id <- anno$sample_id

Finally, output the results to feather files in the VISp directory.

# Create the VISp directory only if it is missing, so re-running the script
# does not trigger an "already exists" warning
if (!dir.exists("VISp")) {
  dir.create("VISp")
}

# Write annotation file
write_feather(anno, "VISp/anno.feather")

# Write data file
write_feather(data, "VISp/data.feather")

Process the mouse ALM data

First we need to read the data into R. This step is slow.

# Exon and intron tables: fread() parses each CSV, then as.matrix() uses the
# first column as the row names.
exons <- as.matrix(
  fread("data/mouse_ALM_2018-06-14_exon-matrix.csv"),
  rownames = 1
)
introns <- as.matrix(
  fread("data/mouse_ALM_2018-06-14_intron-matrix.csv"),
  rownames = 1
)

# Gene and sample metadata tables, with the first column used as row names.
geneInfo <- read.csv(
  "data/mouse_ALM_2018-06-14_genes-rows.csv",
  row.names = 1
)
sampInfo <- read.csv(
  "data/mouse_ALM_2018-06-14_samples-columns.csv",
  row.names = 1
)

Second, convert the meta-data files into formats consistent with the rest of the analysis.

# Replace missing metadata values with 0 before building the annotation table
sampInfo[is.na(sampInfo)] <- 0
anno <- auto_annotate(sampInfo)
# Expose the sample name under the sample_id column as well
anno$sample_id <- anno$sample_name

Next, convert the data into CPM(exons+introns) and format appropriately.

## CPM on the summed intron and exon counts; rows are named from geneInfo
## and columns from the annotation sample ids
CPM <- cpm(introns + exons)
dimnames(CPM) <- list(rownames(geneInfo), anno$sample_id)

## Transpose so each row is a sample, then attach the sample ids as a column
data <- as.data.frame(t(CPM))
data$sample_id <- anno$sample_id

Finally, output the results to feather files in the ALM directory.

# Create the ALM directory only if it is missing, so re-running the script
# does not trigger an "already exists" warning
if (!dir.exists("ALM")) {
  dir.create("ALM")
}

# Write annotation file
write_feather(anno, "ALM/anno.feather")

# Write data file
write_feather(data, "ALM/data.feather")

Output session information.

# Print the R version and the loaded package versions for reproducibility.
sessionInfo()


AllenInstitute/L5_VEN documentation built on July 31, 2022, 6:32 p.m.