knitr::opts_chunk$set(collapse=TRUE, comment = "#>", fig.width=9, fig.height=6, eval=TRUE, echo=TRUE, results="verbatim", dpi=75)
url = "http://epimed.univ-grenoble-alpes.fr/database/query/genes?annotations=epimed_tsg2_restricted_1_testis_adult,epimed_tsg2_restricted_1_placenta_adult,epimed_tsg2_restricted_1_embryonic_stem_cells_embryonic,epimed_tsg2_restricted_1_brain_adult&taxid=9606" genes = read.csv2(url, header=TRUE, sep=";") head(genes[,1:4]) dim(genes) #' Update vector of genes using epimed API #' #' This function updates vector of genes using epimed API #' @param genes_to_position a vector of genes to position #' @param jobid the id of the job if job exists #' @param taxid the name of the taxonomy to refer (9606 for homo spaiens, 10090 for mus musculus) #' @param assembly the name of the assembly to refer #' @param WAIT a bolean indicating if call to service is synchrone or assynchrone #' @importFrom httr POST #' @importFrom jsonlite fromJSON #' @export epimed_api_position = function(genes_to_position, jobid, assembly="GRCh38", taxid=9606, WAIT=TRUE) { if (missing(jobid)) { url = "http://epimed.univ-grenoble-alpes.fr/database/query/jobid" jobid = jsonlite::fromJSON(url) print(paste0("create job ", jobid)) # url = "http://epimed.univ-grenoble-alpes.fr/database/query/genes/update" # body = list(jobid=jobid, symbols=paste(genes_to_update, collapse=", "), taxid=taxid) # response = httr::POST(url, body = body, encode = "form") url = "http://epimed.univ-grenoble-alpes.fr/database/query/genes/position" body=list(jobid=jobid, symbols=paste(genes_to_position, collapse=", "), assembly=assembly, positionType="unique", taxid=taxid) response = httr::POST(url, body = body, encode = "form") } url = paste0("http://epimed.univ-grenoble-alpes.fr/database/query/jobstatus?jobid=", jobid) job = jsonlite::fromJSON(url) while (job$status != "success" & WAIT) { print(paste0("job:", job$jobid, " ", job$status, " ", job$current, "/", job$total)) system("sleep 2") job = jsonlite::fromJSON(url) } print(paste0("job:", job$jobid, " ", job$status, " ", job$current, "/", job$total)) url = paste0("http://epimed.univ-grenoble-alpes.fr/database/query/jobs?format=bed&jobid=", jobid) foo = read.csv2(url, header=TRUE, sep=";", stringsAsFactors=FALSE) return(foo) } bed_genes = epimed_api_position(genes$entrez, jobid="20200120165632730R975") head(bed_genes) dim(bed_genes) ector_tissue_specific_genes = bed_genes[,1:7] ector_tissue_specific_genes = ector_tissue_specific_genes[!is.na(ector_tissue_specific_genes[,1]) & !is.na(ector_tissue_specific_genes[,2]) & !is.na(ector_tissue_specific_genes[,3]) & ector_tissue_specific_genes[,1] != "",] ector_tissue_specific_genes = ector_tissue_specific_genes[order(as.numeric(substr(ector_tissue_specific_genes[,1], 4,10))),] ector_tissue_specific_genes[,1] = factor(ector_tissue_specific_genes[,1], levels=unique(ector_tissue_specific_genes[,1])) ector_tissue_specific_genes = ector_tissue_specific_genes[order(ector_tissue_specific_genes[,1], ector_tissue_specific_genes[,2]),] ector_tissue_specific_genes = ector_tissue_specific_genes[!duplicated(ector_tissue_specific_genes$gene_symbol),] rownames(ector_tissue_specific_genes) = ector_tissue_specific_genes$gene_symbol
ector_tissue_specific_genes
is a dataframe describing tissue-restricted genes (r nrow(ector_tissue_specific_genes)
genes in total). More precisely, it contains the testis-, placenta-, ESC- and brain-restricted genes.
The rows correspond to genes while the columns correspond to their related informations.
exp_grp = epimedtools::get_tcga_exp_grp(tcga_project="TCGA-COAD") head(exp_grp[,1:4]) dim(exp_grp)
exp_grp
is a data matrix containing clinical information for r nrow(exp_grp)
samples. The columns represent these clinical characteristics and each row corresponds to one patient, whose ID is given in the first column (id_sample).
s = readRDS("~/projects/tcga_studies/study_TCGA-COAD_trscr.rds") d = s$data head(d[,1:4]) dim(d)
d
is a data matrix containing the RNAseq counts of r nrow(d)
genes for r ncol(d)
samples in total. It has been obtain from the TCGA study of colon adenocarcinoma (s$data
). The rownames and colnames represent respectively the gene symbols and sample ID.
Elimination of samples from exp_grp
missing in d
:
rownames(exp_grp) = exp_grp$id_sample tmp_idx = intersect(colnames(d), rownames(exp_grp)) e = exp_grp[tmp_idx,] head(e[,1:4]) dim(e) ector_clinicals = e[,c("tissue_status", "os_months", "os_censor")] idx_n = rownames(e)[e$tissue_status %in% "normal"] length(idx_n) idx_c = rownames(e)[e$tissue_status %in% "tumoral"] length(idx_c)
Since not all the samples from exp_grp
are found in d
, the first step is to eliminate in exp_grp
the clinical informations of samples which do not appear in d
.
tmp_idx
is a list of characters representing the samples ID found both in d
and exp_grp
.
e
represents the data matrix exp_grp
without the samples missing in d
, containing then the informations for r nrow(e)
samples (r length(idx_n)
normal and r length(idx_c)
tumorous samples).
Elimination of non tissue-restricted genes in d
:
genes = ector_tissue_specific_genes genes_idx = intersect(rownames(genes), rownames(d)) head(genes_idx) length(genes) d = d[genes_idx,] ector_tissue_specific_genes = ector_tissue_specific_genes[genes_idx,] head(d[,1:4]) dim(d) ector_transcriptome = d
genes_idx
is a list of characters representing the gene symbols found both in ector_tissue_specific_genes
and d
.
d
now contains the RNAseq counts for r nrow(d)
tissue-restricted genes, for each of the r ncol(d)
samples.
# d is matrix is.matrix(ector_transcriptome) # e is df is.data.frame(ector_clinicals) # genes is df is.data.frame(ector_tissue_specific_genes) # genes is bed inspired is.character(ector_tissue_specific_genes[,1]) is.numeric(ector_tissue_specific_genes[,2]) is.numeric(ector_tissue_specific_genes[,3]) sum(ector_tissue_specific_genes[,3] < ector_tissue_specific_genes[,2]) == 0 sum(!ector_tissue_specific_genes[,6] %in% c("+", "-", ".")) == 0 # genes is ordered by chr and position !is.unsorted(ector_tissue_specific_genes[,1]) sapply(unique(ector_tissue_specific_genes[,1]), function(chr) { !is.unsorted(ector_tissue_specific_genes[ector_tissue_specific_genes[,1] %in% chr,2]) }) # rownames(e) == colnames(d) sum(rownames(ector_clinicals) != colnames(ector_transcriptome)) == 0 # rownames(genes) == rownames(d) sum(rownames(ector_tissue_specific_genes) != rownames(ector_transcriptome)) == 0 dim(ector_transcriptome) dim(ector_clinicals) dim(ector_tissue_specific_genes) head(ector_transcriptome) [,1:6] head(ector_clinicals) head(ector_tissue_specific_genes) [,1:6] save(ector_transcriptome, file='~/projects/ector/data/ector_transcriptome.RData' , compress='xz') save(ector_clinicals, file='~/projects/ector/data/ector_clinicals.RData' , compress='xz') save(ector_tissue_specific_genes, file='~/projects/ector/data/ector_tissue_specific_genes.RData' , compress='xz')
sessionInfo()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.