## -------------------------------------------------------------------------- ##
## Creating folder structure for the main reosurce -------------------------
## -------------------------------------------------------------------------- ##
dir.create("_ref") # for reference genomes, annotations, and so on
dir.create("_db") # for sql(ite) dbs, or tabular dbs retrieved
## -------------------------------------------------------------------------- ##
## Retrieving via SRAdb the whole dump of the SRA repository ---------------
## -------------------------------------------------------------------------- ##
library(SRAdb)
sqlfile <- '_db/SRAmetadb.sqlite'
if(!file.exists('_db/SRAmetadb.sqlite'))
sqlfile <<- getSRAdbFile()
# Then, create a connection for later queries. The standard DBI functionality as implemented
# in RSQLite function dbConnect makes the connection to the database. The dbDisconnect
# function disconnects the connection.
sra_con <- dbConnect(SQLite(),sqlfile)
# some examples on how this could be used:
rs <- getSRAinfo ( c("SRX000122"), sra_con, sraType = "sra" )
rs[1:3,]
getFASTQinfo( c("SRR000648","SRR000657"), sra_con, srcType = 'ftp' )
getSRAfile( c("SRR000648","SRR000657"), sra_con, fileType = 'fastq' )
# # Additionally, to obtain the SRAid correspondences as in https://www.biostars.org/p/244150/
# accessions_SRA_file <- "ftp://ftp.ncbi.nlm.nih.gov/sra/reports/Metadata/SRA_Accessions.tab"
# download.file(accessions_SRA_file,paste0("_db/SRA_Accessions_SNAPSHOT_",Sys.Date(),".tab"))
#
# # accessions_SRA <- readr::read_tsv(accessions_SRA_file)
#
# grep '^SRR' _db/SRA_Accessions_SNAPSHOT_2018-01-31.tab | grep 'GSM' > _db/SRA_Accessions_RUNSonly_SNAPSHOT_2018-01-31.tab
#
# # insert header as first line
# # Accession Submission Status Updated Published Received Type Center Visibility Alias Experiment Sample Study Loaded Spots Bases Md5sum BioSample BioProject ReplacedBy
# sed -i "1i\
# $(head -n 1 _db/SRA_Accessions_SNAPSHOT_2018-01-31.tab)" _db/SRA_Accessions_RUNSonly_SNAPSHOT_2018-01-31.tab
#
# mydf <- data.table::fread("_db/SRA_Accessions_RUNSonly_SNAPSHOT_2018-01-31.tab")
# # colnames(mydf) <- dbListFields(sraacc_con,name = "SRAaccs")
#
#
# # to be done once
# library(DBI)
# mySRAaccessiondb <- dbConnect(RSQLite::SQLite(), paste0("_db/SRA_Accessions_SNAPSHOT_",Sys.Date(),".sqlite"))
# dbWriteTable(mySRAaccessiondb, "SRAaccs", mydf)
#
# # to use it:
# sraacc_con <- dbConnect(SQLite(),paste0("_db/SRA_Accessions_SNAPSHOT_",Sys.Date(),".sqlite"))
# dbListTables(sraacc_con)
#
# dbListFields(sraacc_con,name = "SRAaccs")
#
# dbGetQuery(sraacc_con, 'SELECT * FROM SRAaccs WHERE "Alias" == :x',
# params = list(x = "GSM2474455"))
rs <- getSRA( search_terms = "platelets",out_types = c('run','study'), sra_con )
# retrieve info programmatically
unique(rs$study) ## as a starter
## -------------------------------------------------------------------------- ##
## Retrieve the whole refs and annotations ---------------------------------
## -------------------------------------------------------------------------- ##
# Retrieving for human and mouse all the relevant references, annotation, and so on
#' Title
#'
#' @param version_number
#' @param species
#' @param unzip_files
#' @param ref_dir
#' @param quiet_dl
#' @param just_check
#'
#' @return
#' @export
#'
#' @examples
retrieve_ensembl_refs <- function(version_number = 96,
species = c("human","mouse"),
# maybe parametrize species, and use different folder structure?
unzip_files = FALSE,
ref_dir = "_ref",
quiet_dl = FALSE,
just_check = TRUE) {
cur_species <- match.arg(species)
all_organisms <- setNames(c("Homo_sapiens","Mus_musculus"),c("human","mouse"))
all_builds <- setNames(c("GRCh38","GRCm38"),c("human","mouse"))
# all_prefixes <- setNames(c("Homo_sapiens.GRCh38","Mus_musculus.GRCm38"),c("human","mouse"))
cur_organism <- all_organisms[cur_species]
cur_build <- all_builds[cur_species]
if(just_check) {
if(cur_species == "human")
browseURL("https://www.ensembl.org/Homo_sapiens/Info/Annotation")
if(cur_species == "mouse")
browseURL("https://www.ensembl.org/Mus_musculus/Info/Annotation")
return(invisible())
}
message("Retrieving all required files from ENSEMBL version ", version_number)
this_dir <- file.path(ref_dir,paste0("Ensembl.",cur_build,".",version_number))
if(!dir.exists(this_dir)) {
dir.create(this_dir)
message(Sys.time(), " --- ", this_dir, " folder created...")
} else {
message(Sys.time(), " --- ", this_dir, " folder is already existing")
}
message("\nRetrieving references for ", paste(cur_organism, cur_build))
base_ftp <- paste0("ftp://ftp.ensembl.org/pub/release-",version_number)
url_genomesequence <- paste0(base_ftp,"/fasta/",tolower(cur_organism),"/dna/",cur_organism,".",cur_build,".dna.primary_assembly.fa.gz")
url_cdnasequence <- paste0(base_ftp,"/fasta/",tolower(cur_organism),"/cdna/",cur_organism,".",cur_build,".cdna.all.fa.gz")
url_codingseq <- paste0(base_ftp,"/fasta/",tolower(cur_organism),"/cds/",cur_organism,".",cur_build,".cds.all.fa.gz")
url_ncrna <- paste0(base_ftp,"/fasta/",tolower(cur_organism),"/ncrna/",cur_organism,".",cur_build,".ncrna.fa.gz")
url_annogtf <- paste0(base_ftp,"/gtf/",tolower(cur_organism),"/",cur_organism,".",cur_build,".",version_number,".gtf.gz")
url_annogff3 <- paste0(base_ftp,"/gff3/",tolower(cur_organism),"/",cur_organism,".",cur_build,".",version_number,".gff3.gz")
dest_genomesequence <- file.path(this_dir,paste0(cur_organism,".",cur_build,".dna.primary_assembly.fa.gz"))
dest_cdnasequence <- file.path(this_dir,paste0(cur_organism,".",cur_build,".cdna.all.fa.gz"))
dest_codingseq <- file.path(this_dir,paste0(cur_organism,".",cur_build,".cds.all.fa.gz"))
dest_ncrna <- file.path(this_dir,paste0(cur_organism,".",cur_build,".ncrna.fa.gz"))
dest_annogtf <- file.path(this_dir,paste0(cur_organism,".",cur_build,".",version_number,".gtf.gz"))
dest_annogff3 <- file.path(this_dir,paste0(cur_organism,".",cur_build,".",version_number,".gff3.gz"))
if(!file.exists(dest_genomesequence)){
message("Fetching genome sequence in fasta format")
download.file(url = url_genomesequence,destfile = dest_genomesequence, quiet = quiet_dl)
} else {
message("Already found genome sequence in fasta format")
}
if(!file.exists(dest_cdnasequence)){
message("Fetching cdna sequence in fasta format")
download.file(url = url_cdnasequence,destfile = dest_cdnasequence, quiet = quiet_dl)
} else {
message("Already found cdna sequence in fasta format")
}
if(!file.exists(dest_codingseq)){
message("Fetching coding sequences in fasta format")
download.file(url = url_codingseq,destfile = dest_codingseq, quiet = quiet_dl)
} else {
message("Already found coding sequences in fasta format")
}
if(!file.exists(dest_ncrna)){
message("Fetching noncoding rna sequences in fasta format")
download.file(url = url_ncrna,destfile = dest_ncrna, quiet = quiet_dl)
} else {
message("Already found noncoding rna sequences in fasta format")
}
if(!file.exists(dest_annogtf)){
message("Fetching gene annotation in gtf format")
download.file(url = url_annogtf,destfile = dest_annogtf, quiet = quiet_dl)
} else {
message("Already found gene annotation in gtf format")
}
if(!file.exists(dest_annogff3)){
message("Fetching gene annotation in gff3 format")
download.file(url = url_annogff3,destfile = dest_annogff3, quiet = quiet_dl)
} else {
message("Already found gene annotation in gff3 format")
}
if(unzip_files) {
# unzipping all files so that they are directly accessible for the program to generate indices and co.
message("\nUnzipping the downloaded resources... (this might take a while)")
message("Unzipping references in ", this_dir)
system(paste0("gunzip ", file.path(this_dir,"*gz")))
message("Done unzipping")
}
message("\n",Sys.time(), " --- Done retrieving all files for ENSEMBL version ", version_number, " for ", paste0(cur_organism, " - build ", cur_build))
return(invisible())
}
retrieve_ensembl_refs(version_number = 96, species = "mouse", just_check = FALSE)
retrieve_ensembl_refs(version_number = 96, species = "human", just_check = FALSE)
#' Title
#'
#' @param version_number
#' @param unzip_files
#' @param ref_dir
#' @param quiet_dl
#' @param just_check
#'
#' @return
#' @export
#'
#' @examples
retrieve_gencode_refs <- function(version_number = 30, # or "M21"
unzip_files = FALSE,
ref_dir = "_ref",
quiet_dl = FALSE,
just_check = TRUE){
## downloading the essential from the GENCODE project as well (https://www.gencodegenes.org/releases/current.html)
if(just_check) {
browseURL("https://www.gencodegenes.org")
return(invisible())
}
message("Retrieving all required files from GENCODE version ", version_number)
this_dir <- file.path(ref_dir,paste0("GENCODE_",version_number))
if(!dir.exists(file.path(ref_dir,paste0("GENCODE_",version_number)))) {
dir.create(this_dir)
message(Sys.time(), " --- ", this_dir, " folder created...")
} else {
message(Sys.time(), " --- ", this_dir, " folder is already existing")
}
gencode_organism <- ifelse(grepl("^M",version_number),"Gencode_mouse","Gencode_human")
message("\nRetrieving references for GENCODE version ",version_number)
message("Organism: ", gencode_organism)
gencode_fastaref <- ifelse(
grepl("^M",version_number),"GRCm38.p6.genome.fa.gz","GRCh38.p13.genome.fa.gz")
base_ftp <- "ftp://ftp.ebi.ac.uk/pub/databases/gencode/"
url_genomesequence <- paste0(base_ftp ,gencode_organism,"/release_",version_number,"/",gencode_fastaref)
url_cdnasequence <- paste0(base_ftp, gencode_organism,"/release_",version_number,"/gencode.v",version_number,".transcripts.fa.gz")
url_annogtf <- paste0(base_ftp, gencode_organism,"/release_",version_number,"/gencode.v",version_number,".annotation.gtf.gz")
url_annogff3 <- paste0(base_ftp,gencode_organism, "/release_",version_number,"/gencode.v",version_number,".annotation.gff3.gz")
dest_genomesequence <- file.path(this_dir,gencode_fastaref)
dest_cdnasequence <- file.path(this_dir,paste0("gencode.v",version_number,".transcripts.fa.gz"))
dest_annogtf <- file.path(this_dir,paste0("gencode.v",version_number,".annotation.gtf.gz"))
dest_annogff3 <- file.path(this_dir,paste0("gencode.v",version_number,".annotation.gff3.gz"))
if(!file.exists(dest_genomesequence)){
message("Fetching genome sequence in fasta format")
download.file(url = url_genomesequence,destfile = dest_genomesequence, quiet = quiet_dl)
} else {
message("Already found genome sequence in fasta format")
}
if(!file.exists(dest_cdnasequence)){
message("Fetching cdna sequence in fasta format")
download.file(url = url_cdnasequence,destfile = dest_cdnasequence, quiet = quiet_dl)
} else {
message("Already found cdna sequence in fasta format")
}
if(!file.exists(dest_annogtf)){
message("Fetching gene annotation in gtf format")
download.file(url = url_annogtf,destfile = dest_annogtf, quiet = quiet_dl)
} else {
message("Already found gene annotation in gtf format")
}
if(!file.exists(dest_annogff3)){
message("Fetching gene annotation in gff3 format")
download.file(url = url_annogff3,destfile = dest_annogff3, quiet = quiet_dl)
} else {
message("Already found gene annotation in gff3 format")
}
if(unzip_files) {
# unzipping all files so that they are directly accessible for the program to generate indices and co.
message("\nUnzipping the downloaded resources... (this might take a while)")
message("Unzipping references for ",gencode_organism)
system(paste0("gunzip ", file.path(this_dir,"*gz")))
message("Done unzipping")
}
message("\n",Sys.time(), " --- Done retrieving all files for GENCODE version ", version_number)
return(invisible())
}
# retrieve_gencode_refs(version_number = "30",just_check = FALSE)
# retrieve_gencode_refs(version_number = "M21",just_check = FALSE)
## -------------------------------------------------------------------------- ##
## Installing the binaries of the tools that are required ------------------
## -------------------------------------------------------------------------- ##
# nice solution: via bioconda!
conda install -c bioconda ucsc-gtftogenepred ucsc-genepredtobed
conda install -c bioconda ucsc-bedgraphtobigwig ucsc-bedtobigbed ucsc-bedgraphtobigwig ucsc-bigwiginfo ucsc-bigwigtobedgraph ucsc-bedtobigbed ucsc-bigbedtobed ucsc-bigwigtowig ucsc-bigwigsummary
conda install sra-tools
conda install parallel-fastqdump
conda install suppa
# Do something like an environment.yaml file to be read and fed directly to conda?
## -------------------------------------------------------------------------- ##
## Creating the relevant indices -------------------------------------------
## -------------------------------------------------------------------------- ##
# one fasta file for both coding and noncoding genes
cat _ref/ENSEMBL_release92/Homo_sapiens.GRCh38.cdna.all.fa \
_ref/ENSEMBL_release92/Homo_sapiens.GRCh38.ncrna.fa > \
_ref/ENSEMBL_release92/Homo_sapiens.GRCh38.cdna.ncrna.fa
cat _ref/ENSEMBL_release92/Mus_musculus.GRCm38.cdna.all.fa \
_ref/ENSEMBL_release92/Mus_musculus.GRCm38.ncrna.fa > \
_ref/ENSEMBL_release92/Mus_musculus.GRCm38.cdna.ncrna.fa
## STAR indices
mkdir _ref/index_STAR_hs_GRCh38.92
STAR --runThreadN 16 --runMode genomeGenerate --genomeDir _ref/index_STAR_hs_GRCh38.92 \
--genomeFastaFiles _ref/ENSEMBL_release92/Homo_sapiens.GRCh38.dna.primary_assembly.fa \
--sjdbGTFfile _ref/ENSEMBL_release92/Homo_sapiens.GRCh38.92.gtf --sjdbOverhang 100 # as default value, works for range of cases
mkdir _ref/index_STAR_mm_GRCm38.92
STAR --runThreadN 16 --runMode genomeGenerate --genomeDir _ref/index_STAR_mm_GRCm38.92 \
--genomeFastaFiles _ref/ENSEMBL_release92/Mus_musculus.GRCm38.dna.primary_assembly.fa \
--sjdbGTFfile _ref/ENSEMBL_release92/Mus_musculus.GRCm38.92.gtf --sjdbOverhang 100 # as default value, works for range of cases
# salmon indices - this was done with version 0.9.1
salmon index -t _ref/ENSEMBL_release92/Homo_sapiens.GRCh38.cdna.ncrna.fa -i _ref/index_salmon_hs_GRCh38.92_cdna.ncrna.sidx --type quasi -k 31
salmon index -t _ref/ENSEMBL_release92/Mus_musculus.GRCm38.cdna.ncrna.fa -i _ref/index_salmon_mm_GRCm38.92_cdna.ncrna.sidx --type quasi -k 31
# also with cdna only...
salmon index -t _ref/ENSEMBL_release92/Homo_sapiens.GRCh38.cdna.all.fa -i _ref/index_salmon_hs_GRCh38.92_cdna.sidx --type quasi -k 31
salmon index -t _ref/ENSEMBL_release92/Mus_musculus.GRCm38.cdna.all.fa -i _ref/index_salmon_mm_GRCm38.92_cdna.sidx --type quasi -k 31
# and why not, also with gencode transcripts
salmon index -t _ref/GENCODE_human_release28/gencode.v28.transcripts.fa -i _ref/index_salmon_hs_gencode_28.sidx --type quasi -k 31
salmon index -t _ref/GENCODE_mouse_release17/gencode.vM17.transcripts.fa -i _ref/index_salmon_mm_gencode_17.sidx --type quasi -k 31
## salmon indices with recent version - according to the algorithm change...
salmon index -t _ref/ENSEMBL_release92/Homo_sapiens.GRCh38.cdna.ncrna.fa -i _ref/index_salmon_v0.11.2_hs_GRCh38.92_cdna.ncrna.sidx --type quasi -k 31
salmon index -t _ref/ENSEMBL_release92/Mus_musculus.GRCm38.cdna.ncrna.fa -i _ref/index_salmon_v0.11.2_mm_GRCm38.92_cdna.ncrna.sidx --type quasi -k 31
# also with cdna only...
salmon index -t _ref/ENSEMBL_release92/Homo_sapiens.GRCh38.cdna.all.fa -i _ref/index_salmon_v0.11.2_hs_GRCh38.92_cdna.sidx --type quasi -k 31
salmon index -t _ref/ENSEMBL_release92/Mus_musculus.GRCm38.cdna.all.fa -i _ref/index_salmon_v0.11.2_mm_GRCm38.92_cdna.sidx --type quasi -k 31
# and why not, also with gencode transcripts
salmon index -t _ref/GENCODE_human_release28/gencode.v28.transcripts.fa -i _ref/index_salmon_v0.11.2_hs_gencode_28.sidx --type quasi -k 31
salmon index -t _ref/GENCODE_mouse_release18/gencode.vM18.transcripts.fa -i _ref/index_salmon_v0.11.2_mm_gencode_18.sidx --type quasi -k 31
# kallisto indices
kallisto index --index=_ref/index_kallisto_hs_GRCh38.92_cdna.ncrna.kidx -k 31 _ref/ENSEMBL_release92/Homo_sapiens.GRCh38.cdna.ncrna.fa
kallisto index --index=_ref/index_kallisto_mm_GRCm38.92_cdna.ncrna.kidx -k 31 _ref/ENSEMBL_release92/Mus_musculus.GRCm38.cdna.ncrna.fa
## -------------------------------------------------------------------------- ##
## Converting the reference and annotation files to formats used by other tools ----------
## -------------------------------------------------------------------------- ##
# converting fasta to 2bit format
## require installation of kent tools faToTwoBit
# wget http://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64/faToTwoBit
chmod +x faToTwoBit
./faToTwoBit _ref _ref/ENSEMBL_release92/Homo_sapiens.GRCh38.dna.primary_assembly.fa _ref/ENSEMBL_release92/Homo_sapiens.GRCh38.dna.primary_assembly.2bit
./faToTwoBit _ref _ref/ENSEMBL_release92/Mus_musculus.GRCm38.dna.primary_assembly.fa _ref/ENSEMBL_release92/Mus_musculus.GRCm38.dna.primary_assembly.2bit
# gtf to genePred formats, as well as to refFlat, and also to bed
gtfToGenePred -genePredExt -geneNameAsName2 _ref/ENSEMBL_release92/Homo_sapiens.GRCh38.92.gtf refFlat.tmp.txt
paste <(cut -f 12 refFlat.tmp.txt) <(cut -f 1-10 refFlat.tmp.txt) > _ref/ENSEMBL_release92/Homo_sapiens.GRCh38.92.refFlat.txt
rm refFlat.tmp.txt
gtfToGenePred -genePredExt -geneNameAsName2 _ref/ENSEMBL_release92/Mus_musculus.GRCm38.92.gtf refFlat.tmp.txt
paste <(cut -f 12 refFlat.tmp.txt) <(cut -f 1-10 refFlat.tmp.txt) > _ref/ENSEMBL_release92/Mus_musculus.GRCm38.92.refFlat.txt
rm refFlat.tmp.txt
gtfToGenePred _ref/ENSEMBL_release92/Homo_sapiens.GRCh38.92.gtf _ref/ENSEMBL_release92/Homo_sapiens.GRCh38.92.genePred
gtfToGenePred _ref/ENSEMBL_release92/Mus_musculus.GRCm38.92.gtf _ref/ENSEMBL_release92/Mus_musculus.GRCm38.92.genePred
genePredToBed _ref/ENSEMBL_release92/Homo_sapiens.GRCh38.92.genePred _ref/ENSEMBL_release92/Homo_sapiens.GRCh38.92.bed
genePredToBed _ref/ENSEMBL_release92/Mus_musculus.GRCm38.92.genePred _ref/ENSEMBL_release92/Mus_musculus.GRCm38.92.bed
## for SUPPA - generate events files
# generate local AS events
mkdir _ref/index_suppa_hs_GRCh38.92
suppa.py generateEvents -i _ref/ENSEMBL_release92/Homo_sapiens.GRCh38.92.gtf \
-o suppa_hs_GRCh38.92_events -f ioe -e SE SS MX RI FL
#Put all the ioe events in the same file:
awk '
FNR==1 && NR!=1 { while (/^<header>/) getline; }
1 {print}
' suppa_hs_GRCh38.92_events*.ioe > suppa_hs_GRCh38.92_events.ioe
# moving all the files to _ref location (cannot generate them directly, raises error?)
mv suppa_hs_GRCh38.92_events* _ref/index_suppa_hs_GRCh38.92/
mkdir _ref/index_suppa_mm_GRCm38.92
suppa.py generateEvents -i _ref/ENSEMBL_release92/Mus_musculus.GRCm38.92.gtf \
-o suppa_mm_GRCm38.92_events -f ioe -e SE SS MX RI FL
awk '
FNR==1 && NR!=1 { while (/^<header>/) getline; }
1 {print}
' suppa_mm_GRCm38.92_events*.ioe > suppa_mm_GRCm38.92_events.ioe
mv suppa_mm_GRCm38.92_events* _ref/index_suppa_mm_GRCm38.92/
# generate the transcript "events"
suppa.py generateEvents -i _ref/ENSEMBL_release92/Homo_sapiens.GRCh38.92.gtf \
-o _ref/index_suppa_hs_GRCh38.92/suppa_hs_GRCh38.92_txevents -f ioi
suppa.py generateEvents -i _ref/ENSEMBL_release92/Mus_musculus.GRCm38.92.gtf \
-o _ref/index_suppa_mm_GRCm38.92/suppa_mm_GRCm38.92_txevents -f ioi
## -------------------------------------------------------------------------- ##
## Retrieving the annotations from AnnotationHub and making them pkgs ------
## -------------------------------------------------------------------------- ##
library(AnnotationHub)
## Load the annotation resource.
ah <- AnnotationHub()
## Query for all available EnsDb databases
query(ah, c("92", "EnsDb","Homo"))
query(ah, c("92", "EnsDb","Mus"))
edb_v92_human <- ah[["AH60977"]]
edb_v92_mouse <- ah[["AH60992"]]
makeEnsembldbPackage(ensdb = dbfile(dbconn(edb_v92_human)),
version = "0.0.0.9000",
maintainer = "Federico Marini <marinif@uni-mainz.de>",
author = "F Marini",
destDir = "_ref")
makeEnsembldbPackage(ensdb = dbfile(dbconn(edb_v92_mouse)),
version = "0.0.0.9000",
maintainer = "Federico Marini <marinif@uni-mainz.de>",
author = "F Marini",
destDir = "_ref")
# these should be then built/installed as usual if one wants to avoid using the (still very handy) AnnotationHub
# for version 96 of Ensembl!
library(AnnotationHub)
## Load the annotation resource.
ah <- AnnotationHub()
## Query for all available EnsDb databases
query(ah, c("96", "EnsDb","Homo"))
query(ah, c("96", "EnsDb","Musculus"))
edb_v96_human <- ah[["AH69187"]]
edb_v96_mouse <- ah[["AH69210"]]
makeEnsembldbPackage(ensdb = dbfile(dbconn(edb_v96_human)),
version = "0.0.0.9000",
maintainer = "Federico Marini <marinif@uni-mainz.de>",
author = "F Marini",
destDir = "_ref")
makeEnsembldbPackage(ensdb = dbfile(dbconn(edb_v96_mouse)),
version = "0.0.0.9000",
maintainer = "Federico Marini <marinif@uni-mainz.de>",
author = "F Marini",
destDir = "_ref")
## -------------------------------------------------------------------------- ##
## Generating tx2gene mappings and GRanges objects ------
## -------------------------------------------------------------------------- ##
# First, the tx2gene objects
# ercc <- readDNAStringSet(ercc_fa)
# ercc <- data.frame(tx = names(ercc), gene = names(ercc), stringsAsFactors = FALSE)
# using the excellent code from https://github.com/markrobinsonuzh/conquer/blob/master/01_build_index.R
library("Biostrings")
## Human
cdna <- readDNAStringSet("_ref/ENSEMBL_release92/Homo_sapiens.GRCh38.cdna.all.fa")
ncrna <- readDNAStringSet("_ref/ENSEMBL_release92/Homo_sapiens.GRCh38.ncrna.fa")
cdna_df <- data.frame(t(sapply(as.character(names(cdna)), function(nm) {
a <- strsplit(nm, " ")[[1]]
tx <- a[1]
gene <- gsub("^gene:", "", a[grep("^gene:", a)])
gene_symbol <- gsub("^gene_symbol:", "", a[grep("^gene_symbol:", a)])
c(tx = ifelse(length(tx) != 0, tx, NA),
gene = ifelse(length(gene) != 0, gene, NA),
gene_symbol = ifelse(length(gene_symbol) != 0, gene_symbol, NA))
})), stringsAsFactors = FALSE)
ncrna_df <- data.frame(t(sapply(as.character(names(ncrna)), function(nm) {
a <- strsplit(nm, " ")[[1]]
tx <- a[1]
gene <- gsub("^gene:", "", a[grep("^gene:", a)])
gene_symbol <- gsub("^gene_symbol:", "", a[grep("^gene_symbol:", a)])
c(tx = ifelse(length(tx) != 0, tx, NA),
gene = ifelse(length(gene) != 0, gene, NA),
gene_symbol = ifelse(length(gene_symbol) != 0, gene_symbol, NA))
})), stringsAsFactors = FALSE)
txgenemap <- rbind(cdna_df, ncrna_df) # , ercc)
colnames(txgenemap) <- c("TXNAME","GENEID","GENENAME")
saveRDS(txgenemap,
file = "_ref/tx2gene_Homo_sapiens.GRCh38.92.cdna.ncrna.txgenemap.rds")
## Mouse
cdna <- readDNAStringSet("_ref/ENSEMBL_release92/Mus_musculus.GRCm38.cdna.all.fa")
ncrna <- readDNAStringSet("_ref/ENSEMBL_release92/Mus_musculus.GRCm38.ncrna.fa")
cdna_df <- data.frame(t(sapply(as.character(names(cdna)), function(nm) {
a <- strsplit(nm, " ")[[1]]
tx <- a[1]
gene <- gsub("^gene:", "", a[grep("^gene:", a)])
gene_symbol <- gsub("^gene_symbol:", "", a[grep("^gene_symbol:", a)])
c(tx = ifelse(length(tx) != 0, tx, NA),
gene = ifelse(length(gene) != 0, gene, NA),
gene_symbol = ifelse(length(gene_symbol) != 0, gene_symbol, NA))
})), stringsAsFactors = FALSE)
ncrna_df <- data.frame(t(sapply(as.character(names(ncrna)), function(nm) {
a <- strsplit(nm, " ")[[1]]
tx <- a[1]
gene <- gsub("^gene:", "", a[grep("^gene:", a)])
gene_symbol <- gsub("^gene_symbol:", "", a[grep("^gene_symbol:", a)])
c(tx = ifelse(length(tx) != 0, tx, NA),
gene = ifelse(length(gene) != 0, gene, NA),
gene_symbol = ifelse(length(gene_symbol) != 0, gene_symbol, NA))
})), stringsAsFactors = FALSE)
txgenemap <- rbind(cdna_df, ncrna_df) # , ercc)
colnames(txgenemap) <- c("TXNAME","GENEID","GENENAME")
saveRDS(txgenemap,
file = "_ref/tx2gene_Mus_musculus.GRCm38.92.cdna.ncrna.txgenemap.rds")
# Probably even more elegant, creating TxDb objects
library(GenomicFeatures)
gtf <- "_ref/GENCODE_human_release28/gencode.v28.annotation.gtf"
txdb.filename <- "_ref/TxDb_gencode_human_v28_annotation.sqlite"
txdb <- makeTxDbFromGFF(gtf)
saveDb(txdb, txdb.filename)
gtf <- "_ref/GENCODE_mouse_release17/gencode.vM17.annotation.gtf"
txdb.filename <- "_ref/TxDb_gencode_mouse_v17_annotation.sqlite"
txdb <- makeTxDbFromGFF(gtf)
saveDb(txdb, txdb.filename)
gtf <- "_ref/GENCODE_mouse_release18/gencode.vM18.annotation.gtf"
txdb.filename <- "_ref/TxDb_gencode_mouse_v18_annotation.sqlite"
txdb <- makeTxDbFromGFF(gtf)
saveDb(txdb, txdb.filename)
gtf <- "_ref/ENSEMBL_release92/Homo_sapiens.GRCh38.92.gtf"
txdb.filename <- "_ref/TxDb_ensembl_human_v92_annotation.sqlite"
txdb <- makeTxDbFromGFF(gtf)
saveDb(txdb, txdb.filename)
gtf <- "_ref/ENSEMBL_release92/Mus_musculus.GRCm38.92.gtf"
txdb.filename <- "_ref/TxDb_ensembl_mouse_v92_annotation.sqlite"
txdb <- makeTxDbFromGFF(gtf)
saveDb(txdb, txdb.filename)
# Now, the GRanges objects
library(GenomicRanges)
library(dplyr)
## Human
cdna <- readDNAStringSet("_ref/ENSEMBL_release92/Homo_sapiens.GRCh38.cdna.all.fa")
ncrna <- readDNAStringSet("_ref/ENSEMBL_release92/Homo_sapiens.GRCh38.ncrna.fa")
nm <- c(names(cdna), names(ncrna))
info <- data.frame(t(sapply(nm, function(w) {
w <- strsplit(w, " ")[[1]]
transcript <- w[1]
gene <- gsub("gene:", "", w[grep("^gene:", w)])
position <- gsub("chromosome:", "", w[grep("^chromosome:", w)])
if (length(position) == 0) position <- gsub("scaffold:", "", w[grep("^scaffold:", w)])
symbol <- gsub("gene_symbol:", "", w[grep("^gene_symbol", w)])
c(transcript = transcript,
gene = gene,
genome = strsplit(position, ":")[[1]][1],
chromosome = strsplit(position, ":")[[1]][2],
start = strsplit(position, ":")[[1]][3],
end = strsplit(position, ":")[[1]][4],
strand = ifelse(strsplit(position, ":")[[1]][5] == "1", "+", "-"),
symbol = symbol
)
})), stringsAsFactors = FALSE)
rownames(info) <- NULL
info$start <- as.numeric(info$start)
info$end <- as.numeric(info$end)
txgr <- GRanges(seqnames = info$chromosome,
ranges = IRanges(start = info$start, end = info$end),
strand = info$strand)
mcols(txgr) <- info[, c("transcript", "gene", "genome", "symbol")]
# erccgtf <- import(ercc_gtf, format = "gtf")
# ercctx <- erccgtf
# mcols(ercctx) <- data.frame(transcript = ercctx$gene_id,
# gene = ercctx$gene_id,
# genome = ercctx$source,
# symbol = ercctx$gene_id)
#
# txgr <- suppressWarnings(c(txgr, ercctx))
names(txgr) <- txgr$transcript
geneinfo <- info %>% group_by(gene) %>%
summarise(genome = unique(genome),
chromosome = unique(chromosome),
start = min(start),
end = max(end),
strand = unique(strand),
symbol = unique(symbol)) %>%
ungroup()
ggr <- GRanges(seqnames = geneinfo$chromosome,
ranges = IRanges(start = geneinfo$start, end = geneinfo$end),
strand = geneinfo$strand)
mcols(ggr) <- geneinfo[, c("gene", "genome", "symbol")]
# erccgene <- erccgtf
# mcols(erccgene) <- data.frame(gene = erccgene$gene_id,
# genome = erccgene$source,
# symbol = erccgene$gene_id)
# ggr <- suppressWarnings(c(ggr, erccgene))
# names(ggr) <- ggr$gene
gene_granges <- ggr
tx_granges <- txgr
saveRDS(list(gene_granges = gene_granges, tx_granges = tx_granges),
file = "_ref/granges_Homo_sapiens.GRCh38.92.cdna.ncrna.granges.gene_tx.rds")
## Mouse
cdna <- readDNAStringSet("_ref/ENSEMBL_release92/Mus_musculus.GRCm38.cdna.all.fa")
ncrna <- readDNAStringSet("_ref/ENSEMBL_release92/Mus_musculus.GRCm38.ncrna.fa")
nm <- c(names(cdna), names(ncrna))
info <- data.frame(t(sapply(nm, function(w) {
w <- strsplit(w, " ")[[1]]
transcript <- w[1]
gene <- gsub("gene:", "", w[grep("^gene:", w)])
position <- gsub("chromosome:", "", w[grep("^chromosome:", w)])
if (length(position) == 0) position <- gsub("scaffold:", "", w[grep("^scaffold:", w)])
symbol <- gsub("gene_symbol:", "", w[grep("^gene_symbol", w)])
c(transcript = transcript,
gene = gene,
genome = strsplit(position, ":")[[1]][1],
chromosome = strsplit(position, ":")[[1]][2],
start = strsplit(position, ":")[[1]][3],
end = strsplit(position, ":")[[1]][4],
strand = ifelse(strsplit(position, ":")[[1]][5] == "1", "+", "-"),
symbol = symbol
)
})), stringsAsFactors = FALSE)
rownames(info) <- NULL
info$start <- as.numeric(info$start)
info$end <- as.numeric(info$end)
txgr <- GRanges(seqnames = info$chromosome,
ranges = IRanges(start = info$start, end = info$end),
strand = info$strand)
mcols(txgr) <- info[, c("transcript", "gene", "genome", "symbol")]
# erccgtf <- import(ercc_gtf, format = "gtf")
# ercctx <- erccgtf
# mcols(ercctx) <- data.frame(transcript = ercctx$gene_id,
# gene = ercctx$gene_id,
# genome = ercctx$source,
# symbol = ercctx$gene_id)
#
# txgr <- suppressWarnings(c(txgr, ercctx))
names(txgr) <- txgr$transcript
geneinfo <- info %>% group_by(gene) %>%
summarise(genome = unique(genome),
chromosome = unique(chromosome),
start = min(start),
end = max(end),
strand = unique(strand),
symbol = unique(symbol)) %>%
ungroup()
ggr <- GRanges(seqnames = geneinfo$chromosome,
ranges = IRanges(start = geneinfo$start, end = geneinfo$end),
strand = geneinfo$strand)
mcols(ggr) <- geneinfo[, c("gene", "genome", "symbol")]
# erccgene <- erccgtf
# mcols(erccgene) <- data.frame(gene = erccgene$gene_id,
# genome = erccgene$source,
# symbol = erccgene$gene_id)
# ggr <- suppressWarnings(c(ggr, erccgene))
# names(ggr) <- ggr$gene
gene_granges <- ggr
tx_granges <- txgr
saveRDS(list(gene_granges = gene_granges, tx_granges = tx_granges),
file = "_ref/granges_Mus_musculus.GRCm38.92.cdna.ncrna.granges.gene_tx.rds")
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.