Using FANTOM5 data from https://www.ebi.ac.uk/gxa/experiments/E-MTAB-3579/Downloads generate a list of genes with testis enriched expression for the purpose of annotating component gene loadings
curl 'https://www.ebi.ac.uk/gxa/experiments-content/E-MTAB-3579/resources/ExperimentDownloadSupplier.RnaSeqBaseline/tpms.tsv' -o ../data/gene_annotations/FANTOM5_MTAB_3579.tsv
# Which genes have testis specific expression? fantom <- fread("../data/gene_annotations/FANTOM5_MTAB_3579.tsv") # replace NA with 0 for (i in names(fantom)) fantom[is.na(get(i)), (i):=0] # select only adult expression fantom <- fantom[,c(1:2,grep(", adult",colnames(fantom))), with=FALSE] # remove aldult from name setnames(fantom, make.names(gsub(", adult","",colnames(fantom)))) # check which names have changed library(biomaRt) ensembl <- useMart("ENSEMBL_MART_ENSEMBL", host = "www.ensembl.org", dataset = "mmusculus_gene_ensembl") mapTab <- getBM(attributes = c("external_gene_name",'ensembl_gene_id'), filter = "ensembl_gene_id", values = fantom$Gene.ID, mart = ensembl, uniqueRows=TRUE) mapTab <- data.table(mapTab) setnames(mapTab, "ensembl_gene_id", "Gene.ID") setkey(mapTab, Gene.ID) setkey(fantom, Gene.ID) fantom <- merge(fantom, mapTab, all.x=T) fantom[is.na(external_gene_name)] fantom[external_gene_name!=Gene.Name, c("Gene.Name", "external_gene_name")] # calculate max expression in non testis and non epidiymis max_nontestis <- apply(fantom[,-c("epididymis","testis","Gene.Name","external_gene_name","Gene.ID")], 1, max) fantom_summary <- cbind(max_nontestis, fantom[,c("testis","Gene.Name"), with=FALSE]) # Calculate enrichment score fantom_summary[,fold_difference := (testis+1) / (max_nontestis+1)] # Display results fantom_summary[fold_difference>10][order(-fold_difference)][1:100] qplot(fantom_summary$fold_difference, geom="density", bw=0.05) + scale_x_log10(breaks=c(1,10,100,1000)) +geom_vline(xintercept=10) + ggtitle("Define enrichmennt threshold") # Subset for annotation with Conrad Dataset fantom_summary_subset <- fantom_summary[fantom_summary$Gene.Name %in% colnames(data)][,.(Gene.Name, fold_difference = signif(fold_difference,3))] setkey(fantom_summary_subset, Gene.Name) enriched_genes <- fantom_summary[fold_difference>10][Gene.Name %in% colnames(SDAresults$loadings[[1]])]$Gene.Name NMF::aheatmap(t(SDAresults$loadings[[1]][-QC_fail_components,enriched_genes]), breaks=0, Colv = NA, cexRow = 0.7, layout = "_*", main="In Which Components Are Testis Enriched Genes")
The FANTOM5 data isn't very good (low number of replicates), so map genes to human orthologues and calculate enrichment in gtex instead.
# https://gtexportal.org/home/datasets curl 'https://storage.googleapis.com/gtex_analysis_v7/rna_seq_data/GTEx_Analysis_2016-01-15_v7_RNASeQCv1.1.8_gene_median_tpm.gct.gz' -o ../data/gene_annotations/GTEx_median_expression.tsv.gz # from mgi (http://www.informatics.jax.org/downloads/reports/index.html#homology) get mouse ID to HNCG ID pairing curl http://www.informatics.jax.org/downloads/reports/HGNC_homologene.rpt -o ../data/gene_annotations/HGNC_homologene.tsv # from HGNC get HGNC to ensembl ID/Symbol pairing curl ftp://ftp.ebi.ac.uk/pub/databases/genenames/new/tsv/locus_types/gene_with_protein_product.txt -o ../data/gene_annotations/HGNC_prot_genes.tsv
gtex <- fread('zcat < ../data/gene_annotations/GTEx_median_expression.tsv.gz') gtex[,ensembl_gene_id := substr(gene_id, 1, 15)] # invalid file - need to add an extra tab and column name after the last one / skip first row mgi <- fread('../data/gene_annotations/HGNC_homologene.tsv', skip=1) # NB sometimes two hgnc for 1 mouse gene mgi[V2=="Lyzl1"] -> "HGNC:30502|HGNC:29613", choose the first arbitrarily mgi$V21 <- unlist(lapply(strsplit(mgi$V21,"|",fixed = T), "[[", 1)) mgi <- mgi[,.(hgnc_id=V21,ensembl_mouse_id=V10)] hgnc <- fread('../data/gene_annotations/HGNC_prot_genes.tsv') hgnc <- hgnc[,c("ensembl_gene_id", "hgnc_id", "symbol")] setkey(hgnc, hgnc_id) setkey(mgi, hgnc_id) mgi <- merge(hgnc,mgi) setkey(gtex, ensembl_gene_id) setkey(mgi, ensembl_gene_id) gtex <- merge(mgi, gtex) gtex[,ensembl_gene_id:=NULL] gtex[,hgnc_id:=NULL] gtex[,gene_id:=NULL] gtex[,Description:=NULL] gtex[,`Cells - EBV-transformed lymphocytes`:=NULL] gtex[,`Cells - Transformed fibroblasts`:=NULL] gtex[,Ovary:=NULL] library(biomaRt) ensembl <- useMart("ENSEMBL_MART_ENSEMBL", host = "www.ensembl.org", dataset = "mmusculus_gene_ensembl") #data.table(listAttributes(ensembl))[grepl(pattern = "Mouse",x = description)] mapTab <- getBM(attributes = c('ensembl_gene_id','external_gene_name'), filter = "external_gene_name", values = colnames(data), mart = ensembl, uniqueRows=TRUE) # some on scaffold chromosomes or overlapping e.g. Map2k7 #[!duplicated(mapTab$external_gene_name)] mapTab <- data.table(mapTab)[!duplicated(mapTab)] setnames(mapTab, "ensembl_gene_id","ensembl_mouse_id") setkey(mapTab, ensembl_mouse_id) setkey(gtex, ensembl_mouse_id) gtex <- merge(gtex, mapTab) setnames(gtex, c("symbol","external_gene_name"), c("Human_Orthologue","Gene.Name")) max_nontestis_gtex <- apply(gtex[,-c("Testis","Human_Orthologue","ensembl_mouse_id","Gene.Name")], 1, max) gtex_summary <- cbind(max_nontestis_gtex, gtex[,c("Testis","Human_Orthologue","ensembl_mouse_id","Gene.Name"), with=FALSE]) gtex_summary[,fold_difference := (Testis+1) / (max_nontestis_gtex+1)] setkey(gtex_summary, Gene.Name) # Display results gtex_summary[fold_difference>10][order(-fold_difference)][1:100] qplot(gtex_summary$fold_difference, geom="density", bw=0.05) + scale_x_log10(breaks=c(1,10,100,1000)) +geom_vline(xintercept=10) + ggtitle("Define enrichmennt threshold") enriched_genes_gtex <- gtex_summary[fold_difference>10][Gene.Name %in% colnames(SDAresults$loadings[[1]])]$Gene.Name NMF::aheatmap(t(SDAresults$loadings[[1]][-QC_fail_components,enriched_genes_gtex]), breaks=0, Colv = NA, cexRow = 0.7, layout = "_*", main="In Which Components Are Testis Enriched Genes", scale="row") # Compare Fantom vs gtex enrich_merge <- merge(gtex_summary, fantom_summary_subset) plot(log(enrich_merge$fold_difference.x), log(enrich_merge$fold_difference.y), main="Fantom vs Gtex") abline(0,1) # NB Ensembl homology isn't very good! # mapTab <- getBM(attributes = c('ensembl_gene_id', # 'external_gene_name', # "mmusculus_homolog_ensembl_gene", # 'mmusculus_homolog_associated_gene_name', # 'mmusculus_homolog_orthology_confidence', # 'mmusculus_homolog_orthology_type'), # filter = "ensembl_gene_id", values = gtex$ensembl_gene_id, mart = ensembl, uniqueRows=TRUE) #mapTab <- data.table(mapTab)[mmusculus_homolog_orthology_type=="ortholog_one2one"]
Get MGI IDs
# ID_mapping <- readRDS("../data/count_matrices/V3_SDAmerged_mouse_V3_SDA_dimnames_ID_mapping.rds") # # library(biomaRt) # ensembl <- useMart("ENSEMBL_MART_ENSEMBL", host = "www.ensembl.org", dataset = "mmusculus_gene_ensembl") # #data.table(listAttributes(ensembl))[grepl(pattern = "Mouse",x = description)] # flagged_map <- getBM(attributes = c('mgi_id','ensembl_gene_id','external_gene_name'), # filter = "ensembl_gene_id", values = ID_mapping$ensembl_gene_id, mart = ensembl, uniqueRows=TRUE) # # mgi_mapping <- data.table(flagged_map)[ID_mapping, on="ensembl_gene_id"] #infertility_genes2 <- mgi_mapping[flagged, on="mgi_id"][!is.na(external_gene_name)]$external_gene_name
Load genes with infertility phenotype from MGI for the purposes of annotating component gene loadings
# (Infertility & abnormal gametogenesis http://www.informatics.jax.org/vocab/mp_ontology/MP:0001929) #MP0001925_OR_MP0001930.tsv (male infertility & abnormal meiosis) # http://www.informatics.jax.org/marker/summary?phenotype=MP%3A0001924+OR+MP%3A0001929&mcv=6238161 curl 'http://www.informatics.jax.org/marker/report.txt?phenotype=MP%3A0001924+OR+MP%3A0001929&mcv=6238161' -o ../data/gene_annotations/MP0001924_OR_MP0001929.tsv
# Load Infertility Gene Annotations # add underscores to strand_GRCm38 MGI_ID Feature_Type, and add V10 to last column flagged <- fread("../data/gene_annotations/MP0001924_OR_MP0001929.tsv") names(flagged) <- make.names(names(flagged)) setnames(flagged, "MGI_ID","mgi_id") library(biomaRt) ensembl <- useMart("ENSEMBL_MART_ENSEMBL", host = "www.ensembl.org", dataset = "mmusculus_gene_ensembl") #data.table(listAttributes(ensembl))[grepl(pattern = "Mouse",x = description)] flagged_map <- getBM(attributes = c('mgi_id','external_gene_name'), filter = "mgi_id", values = flagged$mgi_id, mart = ensembl, uniqueRows=TRUE) infertility_genes <- data.table(flagged_map)[external_gene_name %in% colnames(SDAresults$loadings[[1]])]$external_gene_name # also same result to just do this but in future MGI symbol may not equal our symbols #infertility_genes <- flagged$Symbol[flagged$Symbol %in% colnames(SDAresults$loadings[[1]])] str(infertility_genes) NMF::aheatmap(t(SDAresults$loadings[[1]][-QC_fail_components,infertility_genes]), breaks=0, Colv = NA, cexRow = 0.7, main="In Which Components Are Infertility Genes", scale="row")
save(fantom_summary_subset, gtex_summary, infertility_genes, file = "../data/gene_annotations/gene_annotations.rds")
setnames(fantom_summary_subset, c("Gene.Name","fold_difference"),c("gene_symbol","fold_enrichment_fantom")) setnames(gtex_summary, c("Gene.Name","fold_difference"),c("gene_symbol","fold_enrichment_gtex")) bulk_expression <- merge(fantom_summary_subset, gtex_summary[,.(gene_symbol, fold_enrichment_gtex, Human_Orthologue)], all = T) bulk_expression$Infertility_Gene <- bulk_expression$gene_symbol %in% infertility_genes gene_annotations <- merge(bulk_expression, rna_locations, all = T) saveRDS(gene_annotations, "../data/cache/gene_annotations.rds")
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.