Annotations

Enriched expression in testis

Using FANTOM5 data from https://www.ebi.ac.uk/gxa/experiments/E-MTAB-3579/Downloads generate a list of genes with testis enriched expression for the purpose of annotating component gene loadings

curl 'https://www.ebi.ac.uk/gxa/experiments-content/E-MTAB-3579/resources/ExperimentDownloadSupplier.RnaSeqBaseline/tpms.tsv' -o ../data/gene_annotations/FANTOM5_MTAB_3579.tsv
# Which genes have testis specific expression?

fantom <- fread("../data/gene_annotations/FANTOM5_MTAB_3579.tsv")

# replace NA with 0
for (i in names(fantom))
  fantom[is.na(get(i)), (i):=0]

# select only adult expression
fantom <- fantom[,c(1:2,grep(", adult",colnames(fantom))), with=FALSE]

# remove aldult from name
setnames(fantom, make.names(gsub(", adult","",colnames(fantom))))

# check which names have changed
library(biomaRt)
ensembl <- useMart("ENSEMBL_MART_ENSEMBL", host = "www.ensembl.org", dataset = "mmusculus_gene_ensembl")
mapTab <- getBM(attributes = c("external_gene_name",'ensembl_gene_id'),
                filter = "ensembl_gene_id", values = fantom$Gene.ID, mart = ensembl, uniqueRows=TRUE)

mapTab <- data.table(mapTab)
setnames(mapTab, "ensembl_gene_id", "Gene.ID")
setkey(mapTab, Gene.ID)
setkey(fantom, Gene.ID)

fantom <- merge(fantom, mapTab, all.x=T)

fantom[is.na(external_gene_name)]

fantom[external_gene_name!=Gene.Name, c("Gene.Name", "external_gene_name")]

# calculate max expression in non testis and non epidiymis
max_nontestis <- apply(fantom[,-c("epididymis","testis","Gene.Name","external_gene_name","Gene.ID")], 1, max)
fantom_summary <- cbind(max_nontestis, fantom[,c("testis","Gene.Name"), with=FALSE])

# Calculate enrichment score
fantom_summary[,fold_difference := (testis+1) / (max_nontestis+1)]

# Display results
fantom_summary[fold_difference>10][order(-fold_difference)][1:100]
qplot(fantom_summary$fold_difference, geom="density", bw=0.05) + scale_x_log10(breaks=c(1,10,100,1000)) +geom_vline(xintercept=10) + ggtitle("Define enrichmennt threshold")

# Subset for annotation with Conrad Dataset
fantom_summary_subset <- fantom_summary[fantom_summary$Gene.Name %in% colnames(data)][,.(Gene.Name, fold_difference = signif(fold_difference,3))]
setkey(fantom_summary_subset, Gene.Name)

enriched_genes <- fantom_summary[fold_difference>10][Gene.Name %in% colnames(SDAresults$loadings[[1]])]$Gene.Name

NMF::aheatmap(t(SDAresults$loadings[[1]][-QC_fail_components,enriched_genes]), breaks=0, Colv = NA, cexRow = 0.7, layout = "_*", main="In Which Components Are Testis Enriched Genes")

The FANTOM5 data isn't very good (low number of replicates), so map genes to human orthologues and calculate enrichment in gtex instead.

# https://gtexportal.org/home/datasets
curl 'https://storage.googleapis.com/gtex_analysis_v7/rna_seq_data/GTEx_Analysis_2016-01-15_v7_RNASeQCv1.1.8_gene_median_tpm.gct.gz' -o ../data/gene_annotations/GTEx_median_expression.tsv.gz

# from mgi (http://www.informatics.jax.org/downloads/reports/index.html#homology) get mouse ID to HNCG ID pairing
curl http://www.informatics.jax.org/downloads/reports/HGNC_homologene.rpt -o ../data/gene_annotations/HGNC_homologene.tsv

# from HGNC get HGNC to ensembl ID/Symbol pairing
curl ftp://ftp.ebi.ac.uk/pub/databases/genenames/new/tsv/locus_types/gene_with_protein_product.txt -o ../data/gene_annotations/HGNC_prot_genes.tsv
gtex <- fread('zcat < ../data/gene_annotations/GTEx_median_expression.tsv.gz')
gtex[,ensembl_gene_id := substr(gene_id, 1, 15)]

# invalid file - need to add an extra tab and column name after the last one / skip first row
mgi <- fread('../data/gene_annotations/HGNC_homologene.tsv', skip=1)

# NB sometimes two hgnc for 1 mouse gene mgi[V2=="Lyzl1"] -> "HGNC:30502|HGNC:29613", choose the first arbitrarily
mgi$V21 <- unlist(lapply(strsplit(mgi$V21,"|",fixed = T), "[[", 1))
mgi <- mgi[,.(hgnc_id=V21,ensembl_mouse_id=V10)]


hgnc <- fread('../data/gene_annotations/HGNC_prot_genes.tsv')
hgnc <- hgnc[,c("ensembl_gene_id", "hgnc_id", "symbol")]

setkey(hgnc, hgnc_id)
setkey(mgi, hgnc_id)
mgi <- merge(hgnc,mgi)

setkey(gtex, ensembl_gene_id)
setkey(mgi, ensembl_gene_id)
gtex <- merge(mgi, gtex)

gtex[,ensembl_gene_id:=NULL]
gtex[,hgnc_id:=NULL]
gtex[,gene_id:=NULL]
gtex[,Description:=NULL]
gtex[,`Cells - EBV-transformed lymphocytes`:=NULL]
gtex[,`Cells - Transformed fibroblasts`:=NULL]
gtex[,Ovary:=NULL]

library(biomaRt)
ensembl <- useMart("ENSEMBL_MART_ENSEMBL", host = "www.ensembl.org", dataset = "mmusculus_gene_ensembl")
#data.table(listAttributes(ensembl))[grepl(pattern = "Mouse",x = description)]
mapTab <- getBM(attributes = c('ensembl_gene_id','external_gene_name'),
                filter = "external_gene_name", values = colnames(data), mart = ensembl, uniqueRows=TRUE)

# some on scaffold chromosomes or overlapping e.g. Map2k7 #[!duplicated(mapTab$external_gene_name)]
mapTab <- data.table(mapTab)[!duplicated(mapTab)]
setnames(mapTab, "ensembl_gene_id","ensembl_mouse_id")
setkey(mapTab, ensembl_mouse_id)
setkey(gtex, ensembl_mouse_id)

gtex <- merge(gtex, mapTab)
setnames(gtex, c("symbol","external_gene_name"), c("Human_Orthologue","Gene.Name"))

max_nontestis_gtex <- apply(gtex[,-c("Testis","Human_Orthologue","ensembl_mouse_id","Gene.Name")], 1, max)
gtex_summary <- cbind(max_nontestis_gtex, gtex[,c("Testis","Human_Orthologue","ensembl_mouse_id","Gene.Name"), with=FALSE])

gtex_summary[,fold_difference := (Testis+1) / (max_nontestis_gtex+1)]

setkey(gtex_summary, Gene.Name)

# Display results
gtex_summary[fold_difference>10][order(-fold_difference)][1:100]
qplot(gtex_summary$fold_difference, geom="density", bw=0.05) + scale_x_log10(breaks=c(1,10,100,1000)) +geom_vline(xintercept=10) + ggtitle("Define enrichmennt threshold")

enriched_genes_gtex <- gtex_summary[fold_difference>10][Gene.Name %in% colnames(SDAresults$loadings[[1]])]$Gene.Name

NMF::aheatmap(t(SDAresults$loadings[[1]][-QC_fail_components,enriched_genes_gtex]), breaks=0, Colv = NA, cexRow = 0.7, layout = "_*", main="In Which Components Are Testis Enriched Genes", scale="row")

# Compare Fantom vs gtex
enrich_merge <- merge(gtex_summary, fantom_summary_subset)
plot(log(enrich_merge$fold_difference.x), log(enrich_merge$fold_difference.y), main="Fantom vs Gtex")
abline(0,1)

# NB Ensembl homology isn't very good!
# mapTab <- getBM(attributes = c('ensembl_gene_id',
#                                'external_gene_name',
#                             "mmusculus_homolog_ensembl_gene",
#                             'mmusculus_homolog_associated_gene_name',
#                             'mmusculus_homolog_orthology_confidence',
#                             'mmusculus_homolog_orthology_type'),
#               filter = "ensembl_gene_id", values = gtex$ensembl_gene_id, mart = ensembl, uniqueRows=TRUE)

#mapTab <- data.table(mapTab)[mmusculus_homolog_orthology_type=="ortholog_one2one"]

Genes causing infertility in KO

Get MGI IDs

# ID_mapping <- readRDS("../data/count_matrices/V3_SDAmerged_mouse_V3_SDA_dimnames_ID_mapping.rds")
# 
# library(biomaRt)
# ensembl <- useMart("ENSEMBL_MART_ENSEMBL", host = "www.ensembl.org", dataset = "mmusculus_gene_ensembl")
# #data.table(listAttributes(ensembl))[grepl(pattern = "Mouse",x = description)]
# flagged_map <- getBM(attributes = c('mgi_id','ensembl_gene_id','external_gene_name'),
#               filter = "ensembl_gene_id", values = ID_mapping$ensembl_gene_id, mart = ensembl, uniqueRows=TRUE)
# 
# mgi_mapping <- data.table(flagged_map)[ID_mapping, on="ensembl_gene_id"]

#infertility_genes2 <- mgi_mapping[flagged, on="mgi_id"][!is.na(external_gene_name)]$external_gene_name

Load genes with infertility phenotype from MGI for the purposes of annotating component gene loadings

# (Infertility & abnormal gametogenesis http://www.informatics.jax.org/vocab/mp_ontology/MP:0001929) #MP0001925_OR_MP0001930.tsv (male infertility & abnormal meiosis)
# http://www.informatics.jax.org/marker/summary?phenotype=MP%3A0001924+OR+MP%3A0001929&mcv=6238161
curl 'http://www.informatics.jax.org/marker/report.txt?phenotype=MP%3A0001924+OR+MP%3A0001929&mcv=6238161' -o ../data/gene_annotations/MP0001924_OR_MP0001929.tsv
# Load Infertility Gene Annotations
# add underscores to strand_GRCm38  MGI_ID  Feature_Type, and add V10 to last column
flagged <- fread("../data/gene_annotations/MP0001924_OR_MP0001929.tsv")

names(flagged) <- make.names(names(flagged))
setnames(flagged, "MGI_ID","mgi_id")

library(biomaRt)
ensembl <- useMart("ENSEMBL_MART_ENSEMBL", host = "www.ensembl.org", dataset = "mmusculus_gene_ensembl")
#data.table(listAttributes(ensembl))[grepl(pattern = "Mouse",x = description)]
flagged_map <- getBM(attributes = c('mgi_id','external_gene_name'),
                filter = "mgi_id", values = flagged$mgi_id, mart = ensembl, uniqueRows=TRUE)
infertility_genes <- data.table(flagged_map)[external_gene_name %in% colnames(SDAresults$loadings[[1]])]$external_gene_name

# also same result to just do this but in future MGI symbol may not equal our symbols
#infertility_genes <- flagged$Symbol[flagged$Symbol %in% colnames(SDAresults$loadings[[1]])]
str(infertility_genes)

NMF::aheatmap(t(SDAresults$loadings[[1]][-QC_fail_components,infertility_genes]), breaks=0, Colv = NA, cexRow = 0.7,  main="In Which Components Are Infertility Genes", scale="row")
save(fantom_summary_subset, gtex_summary, infertility_genes, file = "../data/gene_annotations/gene_annotations.rds")
setnames(fantom_summary_subset, c("Gene.Name","fold_difference"),c("gene_symbol","fold_enrichment_fantom"))
setnames(gtex_summary, c("Gene.Name","fold_difference"),c("gene_symbol","fold_enrichment_gtex"))

bulk_expression <- merge(fantom_summary_subset, gtex_summary[,.(gene_symbol, fold_enrichment_gtex, Human_Orthologue)], all = T)
bulk_expression$Infertility_Gene <- bulk_expression$gene_symbol %in% infertility_genes

gene_annotations <- merge(bulk_expression, rna_locations, all = T)

saveRDS(gene_annotations, "../data/cache/gene_annotations.rds")


MyersGroup/testisAtlas documentation built on Nov. 29, 2020, 9:21 p.m.