## code to prepare `transcript_region` dataset goes here
download.file(
"ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_33/gencode.v33.annotation.gtf.gz",
"data-raw/gencode.v33.hg38.annotation.gtf.gz"
)
download.file(
"ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_33/GRCh37_mapping/gencode.v33lift37.annotation.gtf.gz",
"data-raw/gencode.v33.hg19.annotation.gtf.gz"
)
library(IRanges)
## hg38
gtf_hg38 <- data.table::fread("data-raw/gencode.v33.hg38.annotation.gtf.gz", skip = 5, sep = "\t", header = FALSE)
table(gtf_hg38$V7)
table(gtf_hg38$V1)
extract_col <- function(x, name) {
library(magrittr)
stringr::str_extract(x, paste0(name, " ([^;]+);")) %>%
stringr::str_remove(paste0(name, " ")) %>%
stringr::str_remove_all("\"") %>%
stringr::str_remove(";")
}
gtf_hg38[, gene_type := extract_col(V9, "gene_type")]
## Keep only protein coding region
hg38 <- gtf_hg38[V3 == "transcript" & gene_type == "protein_coding", .(V1, V4, V5, V7)]
hg38 <- hg38[, data.table::as.data.table(reduce(IRanges(V4, V5))), by = .(V7, V1)]
colnames(hg38)[1:2] <- c("strand", "chrom")
hg38$width <- NULL
## hg19
gtf_hg19 <- data.table::fread("data-raw/gencode.v33.hg19.annotation.gtf.gz", skip = 5, sep = "\t", header = FALSE)
gtf_hg19[, gene_type := extract_col(V9, "gene_type")]
## Keep only protein coding region
hg19 <- gtf_hg19[V3 == "transcript" & gene_type == "protein_coding", .(V1, V4, V5, V7)]
hg19 <- hg19[, data.table::as.data.table(reduce(IRanges(V4, V5))), by = .(V7, V1)]
colnames(hg19)[1:2] <- c("strand", "chrom")
hg19$width <- NULL
## Save to package
transcript.hg38 <- hg38
usethis::use_data(transcript.hg38, overwrite = TRUE)
transcript.hg19 <- hg19
usethis::use_data(transcript.hg19, overwrite = TRUE)
# Gene location -----------------------------------------------------------
## hg38
gtf_hg38 <- data.table::fread("data-raw/gencode.v33.hg38.annotation.gtf.gz", skip = 5, sep = "\t", header = FALSE)
gtf_hg38[, `:=`(
gene_name = extract_col(V9, "gene_name"),
gene_id = extract_col(V9, "gene_id"),
gene_type = extract_col(V9, "gene_type")
)]
gene_hg38 <- gtf_hg38[V3 == "gene", .(V1, V4, V5, V7, gene_name, gene_id, gene_type)]
colnames(gene_hg38)[1:4] <- c("chrom", "start", "end", "strand")
## hg19
gtf_hg19 <- data.table::fread("data-raw/gencode.v33.hg19.annotation.gtf.gz", skip = 5, sep = "\t", header = FALSE)
gtf_hg19[, `:=`(
gene_name = extract_col(V9, "gene_name"),
gene_id = extract_col(V9, "gene_id"),
gene_type = extract_col(V9, "gene_type")
)]
gene_hg19 <- gtf_hg19[V3 == "gene", .(V1, V4, V5, V7, gene_name, gene_id, gene_type)]
colnames(gene_hg19)[1:4] <- c("chrom", "start", "end", "strand")
## Save to extdata
saveRDS(gene_hg19, file = "inst/extdata/human_hg19_gene_info.rds")
saveRDS(gene_hg38, file = "inst/extdata/human_hg38_gene_info.rds")
# fwrite(gene_hg19, file = "inst/extdata/human_hg19_gene_info.tsv.gz")
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.