#' Register features from GTF file
#'
#' @param featureset_name Exact featureset name at which to register features
#' @param gtf_filepath Path to GTF file
#' @param register_transcripts whether to register_transcripts (TRUE) or not (FALSE)
#'
#' @return list containing feature id-s of registered genes and transcripts (if selected)
register_features_from_gtf_file = function(featureset_name, gtf_filepath,
register_transcripts,
con = NULL) {
con = revealgenomics:::use_ghEnv_if_null(con = con)
stopifnot(is.logical(register_transcripts))
# Find the matching featureset
fs = get_featuresets(con = con)
fset = fs[fs$name == featureset_name, ]
stopifnot(nrow(fset) == 1)
cat("Found matching featureset_id:", fset$featureset_id, "for featureset:", featureset_name, "\n")
featureset_id = fset$featureset_id
cat("Reading GTF file:", gtf_filepath, "\n")
# feature_df = read.delim(file = gtf_filepath,
# header = F,
# stringsAsFactors = F,
# skip = nSkip)
#
# colnames(feature_df) = c('seqname', 'source', 'feature', 'start', 'end',
# 'score', 'strand', 'frame', 'attribute')
#
feature_df = ape::read.gff(file = gtf_filepath, GFF3 = FALSE)
# attribute column is of the form
# "gene_id ENSG00000223972; gene_name DDX11L1; gene_source ensembl_havana; gene_biotype pseudogene;"
# we create a function to extract out different substrings
extract_info = function(vec, str_to_search) {
sep_match_loc_all = gregexpr(pattern = ";", text = vec)
match_loc = regexpr(pattern = str_to_search, text = vec)
sep_match_loc = sapply(
1:length(match_loc),
function(idx) {
ifelse (match_loc[idx] == -1,
0,
sep_match_loc_all[[idx]][min(which( (sep_match_loc_all[[idx]] - match_loc[idx]) > 0))]
)
}
)
extract = substr(vec,
match_loc + attr(match_loc, "match.length") + 1,
sep_match_loc - 1)
extract[extract == ""] = NA
extract
}
# We then use the above function on segments of the dataframe at a time
attribute_col = as.character(feature_df$attribute)
cat("Splitting the attribute column to extract gene level info.\nShowing one entry of the column as reference\n\t",
attribute_col[1], "\n")
# Add new columns by parsing the `attribute` column
for (elem in c('gene_id', 'transcript_id', 'gene_name',
'gene_biotype', 'transcript_name')) {
feature_df[, elem] = NA
cat("\n\n======\nElem:", elem, "\n======\n")
N = nrow(feature_df)
step = 500000 # step size used to process `attribute` column
starts = seq(1, N, step)
ends = seq(step, step+N, step)
ends[length(ends)] = N
ends
if (length(starts) != length(ends)) stop("Need to cover this case")
for (idx in 1:length(starts)) {
cat("\nRange:", idx, "of total", length(starts), "\n\tTiming: ")
t1 = proc.time()
subrange = starts[idx]:ends[idx]
feature_df[subrange, elem] = extract_info(vec = attribute_col[subrange],
str_to_search = elem)
cat((proc.time()-t1)[[3]], "seconds")
}
}
cat("Formulating feature annotation dataframe from dataframe parsed from GTF\n")
feature_ann_df = feature_df[, c('gene_id', 'transcript_id', 'gene_name',
'gene_biotype', 'transcript_name')]
feature_ann_df = unique(feature_ann_df)
#### GENES ####
cat("Registering genes\n")
ftr_df1 = unique(feature_ann_df[, c('gene_id', 'gene_name', 'gene_biotype')])
dim(ftr_df1)
colnames(ftr_df1) = c('name', 'gene_symbol', 'gene_biotype')
if (any(is.na(ftr_df1$name))) stop("Unexpected")
ftr_df1$feature_type = 'gene'
ftr_df1$chromosome = '...'
ftr_df1$start = '...'
ftr_df1$end = '...'
ftr_df1$featureset_id = featureset_id
ftr_df1$source = 'ensembl_gene_id'
ftr_df1_id = register_feature(df = ftr_df1, register_gene_synonyms = TRUE, con = con)
#### TRANSCRIPT ####
if (register_transcripts) {
cat("Registering transcripts\n")
table(feature_ann_df$gene_biotype)
ftr_df2 = unique(feature_ann_df[, c('transcript_id', 'gene_name', 'transcript_name')])
stopifnot(nrow(ftr_df2) == nrow(feature_ann_df))
colnames(ftr_df2) = c('name', 'gene_symbol', 'transcript_name')
ftr_df2$feature_type = 'transcript'
ftr_df2$chromosome = '...'
ftr_df2$start = '...'
ftr_df2$end = '...'
ftr_df2$featureset_id = fset$featureset_id
ftr_df2$source = 'ensembl_gene_id'
ftr_df2_id = register_feature(df = ftr_df2, register_gene_synonyms = FALSE)
return(list(
gene_features_id = ftr_df1_id,
transcripts_features_id = ftr_df2_id))
} else {
return(list(
gene_features_id = ftr_df1_id,
transcripts_features_id = NULL))
}
}
#' Register features from GTF/GFF3 file
#'
#' @param featureset_name Exact featureset name at which to register features
#' @param gxf_filepath Path to GTF/GFF3 file
#' @param register_transcripts whether to register_transcripts (TRUE) or not (FALSE)
#' @param gxf_format one of \code{'GFF3', 'GTF'}
#'
#' @return list containing feature id-s of registered genes and transcripts (if selected)
#'
#' @export
register_features_from_gxf_file = function(featureset_name, gxf_filepath,
register_transcripts,
gxf_format = c('GFF3', 'GTF'),
con = NULL) {
gxf_format = match.arg(gxf_format)
if (gxf_format == 'GTF') {
# Using old GTF function here, ideally should begin using processing pipeline for GFF3
# with modifications for format
register_features_from_gtf_file(featureset_name = featureset_name,
gtf_filepath = gxf_filepath,
register_transcripts = register_transcripts,
con = con)
} else if (gxf_format == 'GFF3') {
cfg = yaml.load_file(system.file('extdata/config_gff_gtf_loader.yaml', package = 'revealgenomics'))
con = revealgenomics:::use_ghEnv_if_null(con = con)
stopifnot(is.logical(register_transcripts))
# Find the matching featureset
fs = get_featuresets(con = con)
fset = fs[fs$name == featureset_name, ]
stopifnot(nrow(fset) == 1)
cat("Found matching featureset_id:", fset$featureset_id, "for featureset:", featureset_name, "\n")
featureset_id = fset$featureset_id
cat("Reading GFF3 file:", gxf_filepath, "\n")
#### BEGIN: Break attr column into tags ####
# Adapted frmo https://cran.r-project.org/web/packages/rmonad/vignettes/gff-processing.html ####
read_gff <- function(file){
readr::read_tsv(
file,
col_names = c(
"seqid",
"source",
"type",
"start",
"stop",
"score",
"strand",
"phase",
"attr"
),
na = ".",
comment = "#",
col_types = "ccciidcic"
)
}
#### Read ####
g <- read_gff(gxf_filepath)
message("GFF QC and breaking attr column into tags")
for(col in c("seqid", "type", "start", "stop")){
if(any(is.na(g[[col]]))){
stop("GFFError: Column '", col, "' may not have missing values")
}
}
#### Synonyms of type ####
gene_synonyms <- 'SO:0000704'
mRNA_synonyms <- c('messenger_RNA', 'messenger RNA', 'SO:0000234')
CDS_synonyms <- c('coding_sequence', 'coding sequence', 'SO:0000316')
exon_synonyms <- 'SO:0000147'
g$type <- ifelse(g$type %in% gene_synonyms, 'gene', g$type)
g$type <- ifelse(g$type %in% mRNA_synonyms, 'mRNA', g$type)
g$type <- ifelse(g$type %in% CDS_synonyms, 'CDS', g$type)
g$type <- ifelse(g$type %in% exon_synonyms, 'exon', g$type)
#### attribute column ####
tags <- c("ID", "Parent")
df1 = tibble(
attr = stringr::str_split(g$attr, ";"),
order = 1:nrow(g)
)
df2 = df1 %>%
dplyr::mutate(ntags = sapply(attr, length)) %>%
tidyr::unnest(attr) %>%
dplyr::mutate(attr = ifelse(grepl('=', attr), attr, paste(".U", attr, sep="="))) %>%
tidyr::separate_(
col = "attr",
into = c("tag", "value"),
sep = "=",
extra = "merge"
)
dfx = df2 %>%
{
if(nrow(.) > 0){
tidyr::spread(., key="tag", value="value")
} else {
.$tag = NULL
.$value = NULL
.
}
}
#### END: Break attr column into tags ####
matches = sapply(cfg, function(elem) {all(c(elem$gene_identifier,
elem$transcript_identifier,
names(elem$column_name_replace)) %in%
c(colnames(dfx), colnames(g)))})
gxf_format = names(matches)[matches][1]
if (length(gxf_format) == 1 & !is.na(gxf_format)) {
message("Found matching format in API library: ", gxf_format)
} else {
stop("Did not find a matching GFF3/GTF format")
}
# Copy of original data.frame for merging
g$order = 1:nrow(g)
# Common ETL for gene and transcript
run_common_etl = function(dfx1, feature_type = c('gene', 'transcript'),
featureset_id, cfgx, gtf_df,
parent_to_gene_symbol_mapping_df = NULL) {
ftr_colnm = cfgx[[paste0(feature_type, '_identifier')]]
feature_type = match.arg(feature_type)
if (cfgx$ID_column_redundant & 'ID' %in% colnames(dfx1)) {
dfx1$ID = NULL
}
dfx2 = merge(
gtf_df[, colnames(gtf_df)[!(colnames(gtf_df) == 'attr')]], dfx1[order(dfx1$order), ],
by = 'order'
)
# Prepare for loading into scidb
dfx2$order = NULL
dfx2$ntags = NULL
dfx2 = plyr::rename(dfx2,
unlist(cfgx$column_name_replace))
if (feature_type == 'gene') {
gene_identifier = cfgx$gene_identifier
colnames(dfx2)[colnames(dfx2) == gene_identifier] = 'name'
if (cfgx$gene_mapping_to_gene_symbol == 'GeneDF.Name_RenameTo_GeneDF.gene_symbol') {
dfx2 = plyr::rename(dfx2,
c('Name' = 'gene_symbol'))
}
} else if (feature_type == 'transcript') {
transcript_identifier = cfgx$transcript_identifier
colnames(dfx2)[colnames(dfx2) == transcript_identifier] = 'name'
if (cfgx$transcript_mapping_to_gene_symbol$mapper == 'TranscriptDF.Parent_MapTo_GeneDF.name_MapTo_GeneDF.gene_symbol' &
identical(cfgx$transcript_mapping_to_gene_symbol$method, "gsub") &
identical(cfgx$transcript_mapping_to_gene_symbol$column_to_search_in, 'Parent')) {
# gene symbol information needs to be retrieved separately
stopifnot(sum(grepl(cfgx$transcript_mapping_to_gene_symbol$string_to_search,
dfx2$Parent)) == nrow(dfx2))
dfx2$Parent = gsub(cfgx$transcript_mapping_to_gene_symbol$string_to_search,
cfgx$transcript_mapping_to_gene_symbol$string_to_replace_with, dfx2$Parent)
m1 = find_matches_and_return_indices(dfx2$Parent, parent_to_gene_symbol_mapping_df$name)
stopifnot(length(m1$source_unmatched_idx) == 0)
dfx2$gene_symbol = parent_to_gene_symbol_mapping_df$gene_symbol[m1$target_matched_idx]
} else if (cfgx$transcript_mapping_to_gene_symbol$mapper == 'TranscriptDF.gene_RenameTo_TranscriptDF.gene_symbol') {
dfx2 = plyr::rename(dfx2, c('gene' = 'gene_symbol'))
} else {
stop("Case not covered yet")
}
}
if (length(cfgx$cols_to_drop_before_loading) > 0) {
message("Dropping column(s): ", pretty_print(cfgx$cols_to_drop_before_loading))
dfx2[, cfgx$cols_to_drop_before_loading] = rep(NULL, length(cfgx$cols_to_drop_before_loading))
}
message("Assigning scidb required ID-s / converting to scidb required types")
dfx2$featureset_id = featureset_id
dfx2$feature_type = feature_type
dfx2$start = as.character(dfx2$start)
dfx2$end = as.character(dfx2$end)
return(dfx2)
}
tryCatch({
gene_identifier = cfg[[gxf_format]]$gene_identifier
message("=== Trying to extract gene information based on `", gene_identifier, "` tag ",
"and using `is.na(Parent)` check ...")
df_gene = drop_na_columns(
dfx[which(is.na(dfx[, 'Parent', drop = TRUE]) &
!is.na(dfx[, gene_identifier, drop = TRUE])
), ])
if (nrow(df_gene) == 0) stop()
if (length(unique(df_gene[, gene_identifier, drop=TRUE])) != nrow(df_gene)) {
message("Some level of redundancy exists")
message("Retaining unique rows by identifier: ", gene_identifier)
df_gene = drop_na_columns(
df_gene[which(!duplicated(df_gene[, gene_identifier, drop = TRUE])), ])
}
message("Num. rows: ", nrow(df_gene))
df_gene2 = run_common_etl(dfx1 = df_gene, feature_type = 'gene', featureset_id = featureset_id,
cfgx = cfg[[gxf_format]], gtf_df = g)
ftr_idx = register_feature(df1 = df_gene2, con = con)
message("=== Succeeded")
}, error = function(e) {
stop("=== Failed formulation / loading of genes from GTF. Cannot proceed further. \n Error message:\n", e)
})
transcript_idx = NULL
if (register_transcripts) {
tryCatch({
transcript_identifier = cfg[[gxf_format]]$transcript_identifier
message("=== Trying to extract transcript information based on `", transcript_identifier, "` tag...")
df_transcript = drop_na_columns(dfx[which(!is.na(dfx[, transcript_identifier, drop = TRUE])), ])
if (nrow(df_transcript) == 0) stop("No transcripts found based on `", transcript_identifier, "` tag...")
if (length(unique(df_transcript$transcript_id)) != nrow(df_transcript)) {
message("Some level of redundancy exists. Retaining rows that have non-NA `Name` column")
df_transcript = df_transcript[which(!is.na(df_transcript$Name)), ]
if (length(unique(df_transcript$transcript_id)) != nrow(df_transcript)) {
message("Some level of redundancy exists. Retaining by unique transcript ID-s only")
df_transcript = drop_na_columns(
df_transcript[which(!duplicated(df_transcript[, transcript_identifier, drop = TRUE])), ])
}
}
message("Num. rows: ", nrow(df_transcript))
df_transcript2 = run_common_etl(dfx1 = df_transcript, feature_type = 'transcript', featureset_id = featureset_id,
cfgx = cfg[[gxf_format]], gtf_df = g,
parent_to_gene_symbol_mapping_df = df_gene2[, c('name', 'gene_symbol')])
transcript_idx = register_feature(df1 = df_transcript2, con = con)
message("=== Succeeded")
}, error = function(e) {
message("=== Failed loading of transcripts. Error message:\n", e)
})
}
return(list(
gene_features_id = ftr_idx,
transcripts_features_id = transcript_idx))
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.