In mpetrier/ector: Ectopic gene expression in cancer

knitr::opts_chunk$set(collapse=TRUE, comment = "#>", fig.width=9, fig.height=6, eval=TRUE, echo=TRUE, results="verbatim", dpi=75)

Restricted genes

url = "http://epimed.univ-grenoble-alpes.fr/database/query/genes?annotations=epimed_tsg2_restricted_1_testis_adult,epimed_tsg2_restricted_1_placenta_adult,epimed_tsg2_restricted_1_embryonic_stem_cells_embryonic,epimed_tsg2_restricted_1_brain_adult&taxid=9606"
genes = read.csv2(url, header=TRUE, sep=";")
head(genes[,1:4])
dim(genes)


#' Update vector of genes using epimed API
#'
#' This function updates vector of genes using epimed API
#' @param genes_to_position a vector of genes to position
#' @param jobid the id of the job if job exists
#' @param taxid the name of the taxonomy to refer (9606 for homo spaiens, 10090 for mus musculus)
#' @param assembly the name of the assembly to refer
#' @param WAIT a bolean indicating if call to service is synchrone or assynchrone
#' @importFrom httr POST
#' @importFrom jsonlite fromJSON
#' @export
epimed_api_position = function(genes_to_position, jobid, assembly="GRCh38", taxid=9606, WAIT=TRUE) {
  if (missing(jobid)) {
    url = "http://epimed.univ-grenoble-alpes.fr/database/query/jobid"
    jobid = jsonlite::fromJSON(url)    
    print(paste0("create job ", jobid))
    # url = "http://epimed.univ-grenoble-alpes.fr/database/query/genes/update"
    # body = list(jobid=jobid, symbols=paste(genes_to_update, collapse=", "), taxid=taxid)
    # response = httr::POST(url, body = body, encode = "form")

    url = "http://epimed.univ-grenoble-alpes.fr/database/query/genes/position"
    body=list(jobid=jobid, symbols=paste(genes_to_position, collapse=", "), assembly=assembly, positionType="unique", taxid=taxid)
    response = httr::POST(url, body = body, encode = "form")
  }

  url = paste0("http://epimed.univ-grenoble-alpes.fr/database/query/jobstatus?jobid=", jobid)
  job = jsonlite::fromJSON(url)
  while (job$status != "success" & WAIT) {
    print(paste0("job:", job$jobid, " ", job$status, " ", job$current, "/", job$total))
    system("sleep 2")
    job = jsonlite::fromJSON(url)
  }
  print(paste0("job:", job$jobid, " ", job$status, " ", job$current, "/", job$total))

  url = paste0("http://epimed.univ-grenoble-alpes.fr/database/query/jobs?format=bed&jobid=", jobid)
  foo = read.csv2(url, header=TRUE, sep=";", stringsAsFactors=FALSE)
  return(foo)
}

bed_genes = epimed_api_position(genes$entrez, jobid="20200120165632730R975")
head(bed_genes)
dim(bed_genes)
ector_tissue_specific_genes = bed_genes[,1:7]
ector_tissue_specific_genes = ector_tissue_specific_genes[!is.na(ector_tissue_specific_genes[,1]) & !is.na(ector_tissue_specific_genes[,2]) & !is.na(ector_tissue_specific_genes[,3]) & ector_tissue_specific_genes[,1] != "",]
ector_tissue_specific_genes = ector_tissue_specific_genes[order(as.numeric(substr(ector_tissue_specific_genes[,1], 4,10))),]
ector_tissue_specific_genes[,1] = factor(ector_tissue_specific_genes[,1], levels=unique(ector_tissue_specific_genes[,1]))
ector_tissue_specific_genes = ector_tissue_specific_genes[order(ector_tissue_specific_genes[,1], ector_tissue_specific_genes[,2]),]
ector_tissue_specific_genes  = ector_tissue_specific_genes[!duplicated(ector_tissue_specific_genes$gene_symbol),]
rownames(ector_tissue_specific_genes) = ector_tissue_specific_genes$gene_symbol

ector_tissue_specific_genes is a dataframe describing tissue-restricted genes (r nrow(ector_tissue_specific_genes) genes in total). More precisely, it contains the testis-, placenta-, ESC- and brain-restricted genes. The rows correspond to genes while the columns correspond to their related informations.

Clinicals

exp_grp = epimedtools::get_tcga_exp_grp(tcga_project="TCGA-COAD")
head(exp_grp[,1:4])
dim(exp_grp)

exp_grp is a data matrix containing clinical information for r nrow(exp_grp) samples. The columns represent these clinical characteristics and each row corresponds to one patient, whose ID is given in the first column (id_sample).

Transcriptomic data

s = readRDS("~/projects/tcga_studies/study_TCGA-COAD_trscr.rds")
d = s$data
head(d[,1:4])
dim(d)

d is a data matrix containing the RNAseq counts of r nrow(d) genes for r ncol(d) samples in total. It has been obtain from the TCGA study of colon adenocarcinoma (s$data). The rownames and colnames represent respectively the gene symbols and sample ID.

Data filtering

Elimination of samples from exp_grp missing in d:

rownames(exp_grp) = exp_grp$id_sample
tmp_idx = intersect(colnames(d), rownames(exp_grp))
e = exp_grp[tmp_idx,]                               
head(e[,1:4])
dim(e)
ector_clinicals = e[,c("tissue_status", "os_months", "os_censor")]

idx_n = rownames(e)[e$tissue_status %in% "normal"] 
length(idx_n)
idx_c = rownames(e)[e$tissue_status %in% "tumoral"]
length(idx_c)

Since not all the samples from exp_grp are found in d, the first step is to eliminate in exp_grp the clinical informations of samples which do not appear in d. tmp_idx is a list of characters representing the samples ID found both in d and exp_grp. e represents the data matrix exp_grp without the samples missing in d, containing then the informations for r nrow(e) samples (r length(idx_n) normal and r length(idx_c) tumorous samples).

Elimination of non tissue-restricted genes in d:

genes = ector_tissue_specific_genes
genes_idx = intersect(rownames(genes), rownames(d))
head(genes_idx)
length(genes)
d = d[genes_idx,]
ector_tissue_specific_genes = ector_tissue_specific_genes[genes_idx,]
head(d[,1:4])
dim(d)
ector_transcriptome = d

genes_idx is a list of characters representing the gene symbols found both in ector_tissue_specific_genes and d. d now contains the RNAseq counts for r nrow(d) tissue-restricted genes, for each of the r ncol(d) samples.

Checking data

# d is matrix
is.matrix(ector_transcriptome)
# e is df
is.data.frame(ector_clinicals)
# genes is df
is.data.frame(ector_tissue_specific_genes)

# genes is bed inspired
is.character(ector_tissue_specific_genes[,1])
is.numeric(ector_tissue_specific_genes[,2])
is.numeric(ector_tissue_specific_genes[,3])
sum(ector_tissue_specific_genes[,3] < ector_tissue_specific_genes[,2]) == 0 
sum(!ector_tissue_specific_genes[,6] %in% c("+", "-", ".")) == 0 
# genes is ordered by chr and position
!is.unsorted(ector_tissue_specific_genes[,1])
sapply(unique(ector_tissue_specific_genes[,1]), function(chr) {
  !is.unsorted(ector_tissue_specific_genes[ector_tissue_specific_genes[,1] %in% chr,2])  
})
# rownames(e) == colnames(d)
sum(rownames(ector_clinicals) != colnames(ector_transcriptome)) == 0
# rownames(genes) == rownames(d)
sum(rownames(ector_tissue_specific_genes) != rownames(ector_transcriptome)) == 0 


dim(ector_transcriptome)
dim(ector_clinicals)
dim(ector_tissue_specific_genes)

head(ector_transcriptome)         [,1:6]
head(ector_clinicals)             
head(ector_tissue_specific_genes) [,1:6]


save(ector_transcriptome, file='~/projects/ector/data/ector_transcriptome.RData' , compress='xz')
save(ector_clinicals, file='~/projects/ector/data/ector_clinicals.RData' , compress='xz')
save(ector_tissue_specific_genes, file='~/projects/ector/data/ector_tissue_specific_genes.RData' , compress='xz')