R/functions-gtf.R

Defines functions register_features_from_gtf_file register_features_from_gxf_file

Documented in register_features_from_gtf_file register_features_from_gxf_file

#' Register features from GTF file
#' 
#' @param featureset_name Exact featureset name at which to register features
#' @param gtf_filepath Path to GTF file
#' @param register_transcripts whether to register_transcripts (TRUE) or not (FALSE)
#' 
#' @return list containing feature id-s of registered genes and transcripts (if selected)
register_features_from_gtf_file = function(featureset_name, gtf_filepath, 
                                           register_transcripts, 
                                           con = NULL) {
  
  con = revealgenomics:::use_ghEnv_if_null(con = con)
  
  stopifnot(is.logical(register_transcripts))
  
  # Find the matching featureset
  fs = get_featuresets(con = con)
  fset = fs[fs$name == featureset_name, ]
  stopifnot(nrow(fset) == 1)
  cat("Found matching featureset_id:", fset$featureset_id, "for featureset:", featureset_name, "\n")
  featureset_id = fset$featureset_id
  
  cat("Reading GTF file:", gtf_filepath, "\n")
  # feature_df = read.delim(file = gtf_filepath, 
  #                         header = F, 
  #                         stringsAsFactors = F, 
  #                         skip = nSkip)
  # 
  # colnames(feature_df) = c('seqname', 'source', 'feature', 'start', 'end', 
  #                          'score', 'strand', 'frame', 'attribute')
  # 
  feature_df = ape::read.gff(file = gtf_filepath, GFF3 = FALSE)
  
  # attribute column is of the form
  #    "gene_id ENSG00000223972; gene_name DDX11L1; gene_source ensembl_havana; gene_biotype pseudogene;"
  # we create a function to extract out different substrings
  extract_info = function(vec, str_to_search) {
    sep_match_loc_all = gregexpr(pattern = ";", text = vec)
    match_loc = regexpr(pattern = str_to_search, text = vec)
    sep_match_loc = sapply(
      1:length(match_loc), 
      function(idx) {
        ifelse (match_loc[idx] == -1, 
                0, 
                sep_match_loc_all[[idx]][min(which( (sep_match_loc_all[[idx]] - match_loc[idx])  > 0))]
        )
      }
    )
    extract = substr(vec, 
                     match_loc + attr(match_loc, "match.length") + 1, 
                     sep_match_loc - 1)
    extract[extract == ""] = NA
    extract
  }
  
  # We then use the above function on segments of the dataframe at a time
  attribute_col = as.character(feature_df$attribute)
  cat("Splitting the attribute column to extract gene level info.\nShowing one entry of the column as reference\n\t", 
      attribute_col[1], "\n")
  # Add new columns by parsing the `attribute` column
  for (elem in c('gene_id', 'transcript_id', 'gene_name',
                 'gene_biotype', 'transcript_name')) {
    feature_df[, elem] = NA
    cat("\n\n======\nElem:", elem, "\n======\n")
    
    N = nrow(feature_df)
    step = 500000 # step size used to process `attribute` column
    starts = seq(1, N, step)
    ends   = seq(step, step+N, step)
    ends[length(ends)] = N
    ends
    
    if (length(starts) != length(ends)) stop("Need to cover this case")
    
    for (idx in 1:length(starts)) {
      cat("\nRange:", idx, "of total", length(starts), "\n\tTiming: ")
      t1 = proc.time()
      subrange = starts[idx]:ends[idx]
      feature_df[subrange, elem] = extract_info(vec = attribute_col[subrange], 
                                                str_to_search = elem)
      cat((proc.time()-t1)[[3]], "seconds")
    }
  }
  
  cat("Formulating feature annotation dataframe from dataframe parsed from GTF\n")
  feature_ann_df = feature_df[, c('gene_id', 'transcript_id', 'gene_name',
                                  'gene_biotype', 'transcript_name')]
  feature_ann_df = unique(feature_ann_df)
  
  #### GENES ####
  cat("Registering genes\n")
  ftr_df1 = unique(feature_ann_df[, c('gene_id', 'gene_name', 'gene_biotype')])
  dim(ftr_df1)
  colnames(ftr_df1) = c('name', 'gene_symbol', 'gene_biotype')
  if (any(is.na(ftr_df1$name))) stop("Unexpected")
  ftr_df1$feature_type = 'gene'
  ftr_df1$chromosome = '...'
  ftr_df1$start = '...'
  ftr_df1$end = '...'
  ftr_df1$featureset_id = featureset_id
  ftr_df1$source = 'ensembl_gene_id'
  ftr_df1_id = register_feature(df = ftr_df1, register_gene_synonyms = TRUE, con = con)
  
  #### TRANSCRIPT ####
  if (register_transcripts) {
    cat("Registering transcripts\n")
    table(feature_ann_df$gene_biotype)
    ftr_df2 = unique(feature_ann_df[, c('transcript_id', 'gene_name', 'transcript_name')])
    stopifnot(nrow(ftr_df2) == nrow(feature_ann_df))
    colnames(ftr_df2) = c('name', 'gene_symbol', 'transcript_name')
    ftr_df2$feature_type = 'transcript'
    ftr_df2$chromosome = '...'
    ftr_df2$start = '...'
    ftr_df2$end = '...'
    ftr_df2$featureset_id = fset$featureset_id
    ftr_df2$source = 'ensembl_gene_id'
    ftr_df2_id = register_feature(df = ftr_df2, register_gene_synonyms = FALSE)
    
    return(list(
      gene_features_id = ftr_df1_id, 
      transcripts_features_id = ftr_df2_id))
  } else {
    return(list(
      gene_features_id = ftr_df1_id, 
      transcripts_features_id = NULL))
  }
}

#' Register features from GTF/GFF3 file
#' 
#' @param featureset_name Exact featureset name at which to register features
#' @param gxf_filepath Path to GTF/GFF3 file
#' @param register_transcripts whether to register_transcripts (TRUE) or not (FALSE)
#' @param gxf_format one of \code{'GFF3', 'GTF'}
#' 
#' @return list containing feature id-s of registered genes and transcripts (if selected)
#' 
#' @export
register_features_from_gxf_file = function(featureset_name, gxf_filepath, 
                                           register_transcripts, 
                                           gxf_format = c('GFF3', 'GTF'), 
                                           con = NULL) {
  gxf_format = match.arg(gxf_format)
  if (gxf_format == 'GTF') {
    # Using old GTF function here, ideally should begin using processing pipeline for GFF3 
    # with modifications for format 
    register_features_from_gtf_file(featureset_name = featureset_name, 
                                    gtf_filepath = gxf_filepath, 
                                    register_transcripts = register_transcripts, 
                                    con = con)
  } else if (gxf_format == 'GFF3') {
    cfg = yaml.load_file(system.file('extdata/config_gff_gtf_loader.yaml', package = 'revealgenomics'))
    con = revealgenomics:::use_ghEnv_if_null(con = con)
    
    stopifnot(is.logical(register_transcripts))
    
    # Find the matching featureset
    fs = get_featuresets(con = con)
    fset = fs[fs$name == featureset_name, ]
    stopifnot(nrow(fset) == 1)
    cat("Found matching featureset_id:", fset$featureset_id, "for featureset:", featureset_name, "\n")
    featureset_id = fset$featureset_id
    
    cat("Reading GFF3 file:", gxf_filepath, "\n")
    
    #### BEGIN: Break attr column into tags #### 
    # Adapted frmo https://cran.r-project.org/web/packages/rmonad/vignettes/gff-processing.html ####
    read_gff <- function(file){
      readr::read_tsv(
        file,
        col_names = c(
          "seqid",
          "source",
          "type",
          "start",
          "stop",
          "score",
          "strand",
          "phase",
          "attr"
        ),
        na        = ".",
        comment   = "#",
        col_types = "ccciidcic"
      )
    }
    
    #### Read ####
    g <- read_gff(gxf_filepath)
    message("GFF QC and breaking attr column into tags")
    for(col in c("seqid", "type", "start", "stop")){
      if(any(is.na(g[[col]]))){
        stop("GFFError: Column '", col, "' may not have missing values")
      }
    }
    
    #### Synonyms of type ####
    gene_synonyms <- 'SO:0000704'
    mRNA_synonyms <- c('messenger_RNA', 'messenger RNA', 'SO:0000234')
    CDS_synonyms  <- c('coding_sequence', 'coding sequence', 'SO:0000316')
    exon_synonyms <- 'SO:0000147'
    
    g$type <- ifelse(g$type %in% gene_synonyms, 'gene', g$type)
    g$type <- ifelse(g$type %in% mRNA_synonyms, 'mRNA', g$type)
    g$type <- ifelse(g$type %in% CDS_synonyms,  'CDS',  g$type)
    g$type <- ifelse(g$type %in% exon_synonyms, 'exon', g$type)
    
    #### attribute column ####
    tags <- c("ID", "Parent")
    
    df1 = tibble(
      attr  = stringr::str_split(g$attr, ";"),
      order = 1:nrow(g)
    ) 
    df2 = df1 %>%
      dplyr::mutate(ntags = sapply(attr, length)) %>%
      tidyr::unnest(attr) %>%
      dplyr::mutate(attr = ifelse(grepl('=', attr), attr, paste(".U", attr, sep="="))) %>%
      tidyr::separate_(
        col   = "attr",
        into  = c("tag", "value"),
        sep   = "=",
        extra = "merge"
      )
    dfx = df2 %>%
    {
      if(nrow(.) > 0){
        tidyr::spread(., key="tag", value="value")
      } else {
        .$tag   = NULL
        .$value = NULL
        .
      }
    }
    #### END: Break attr column into tags #### 
    matches = sapply(cfg, function(elem) {all(c(elem$gene_identifier, 
                                                elem$transcript_identifier, 
                                                names(elem$column_name_replace)) %in% 
                                                c(colnames(dfx), colnames(g)))})
    gxf_format = names(matches)[matches][1]
    
    if (length(gxf_format) == 1 & !is.na(gxf_format)) {
      message("Found matching format in API library: ", gxf_format)
    } else {
      stop("Did not find a matching GFF3/GTF format")
    }
    
    # Copy of original data.frame for merging
    g$order = 1:nrow(g)
    
    # Common ETL for gene and transcript
    run_common_etl = function(dfx1, feature_type = c('gene', 'transcript'), 
                              featureset_id, cfgx, gtf_df, 
                              parent_to_gene_symbol_mapping_df = NULL) {
      ftr_colnm = cfgx[[paste0(feature_type, '_identifier')]]
      feature_type = match.arg(feature_type)
      if (cfgx$ID_column_redundant & 'ID' %in% colnames(dfx1)) {
        dfx1$ID = NULL
      } 
      
      dfx2 = merge(
        gtf_df[, colnames(gtf_df)[!(colnames(gtf_df) == 'attr')]], dfx1[order(dfx1$order), ], 
        by = 'order'
      )
      
      # Prepare for loading into scidb
      dfx2$order = NULL
      dfx2$ntags = NULL
      dfx2 = plyr::rename(dfx2, 
                          unlist(cfgx$column_name_replace))
      if (feature_type == 'gene') {
        gene_identifier = cfgx$gene_identifier
        colnames(dfx2)[colnames(dfx2) == gene_identifier] = 'name'
        if (cfgx$gene_mapping_to_gene_symbol == 'GeneDF.Name_RenameTo_GeneDF.gene_symbol') {
          dfx2 = plyr::rename(dfx2, 
                              c('Name' = 'gene_symbol'))
        }
      } else if (feature_type == 'transcript') {
        transcript_identifier = cfgx$transcript_identifier
        colnames(dfx2)[colnames(dfx2) == transcript_identifier] = 'name'
        if (cfgx$transcript_mapping_to_gene_symbol$mapper == 'TranscriptDF.Parent_MapTo_GeneDF.name_MapTo_GeneDF.gene_symbol' & 
            identical(cfgx$transcript_mapping_to_gene_symbol$method, "gsub") & 
            identical(cfgx$transcript_mapping_to_gene_symbol$column_to_search_in, 'Parent')) {
          # gene symbol information needs to be retrieved separately
          stopifnot(sum(grepl(cfgx$transcript_mapping_to_gene_symbol$string_to_search, 
                              dfx2$Parent)) == nrow(dfx2))
          dfx2$Parent = gsub(cfgx$transcript_mapping_to_gene_symbol$string_to_search, 
                             cfgx$transcript_mapping_to_gene_symbol$string_to_replace_with, dfx2$Parent)
          
          m1 = find_matches_and_return_indices(dfx2$Parent, parent_to_gene_symbol_mapping_df$name)
          stopifnot(length(m1$source_unmatched_idx) == 0)
          
          dfx2$gene_symbol = parent_to_gene_symbol_mapping_df$gene_symbol[m1$target_matched_idx]
        } else if (cfgx$transcript_mapping_to_gene_symbol$mapper == 'TranscriptDF.gene_RenameTo_TranscriptDF.gene_symbol') {
          dfx2 = plyr::rename(dfx2, c('gene' = 'gene_symbol'))
        } else {
          stop("Case not covered yet")
        }
      }
      
      if (length(cfgx$cols_to_drop_before_loading) > 0) {
        message("Dropping column(s): ", pretty_print(cfgx$cols_to_drop_before_loading))
        dfx2[, cfgx$cols_to_drop_before_loading] = rep(NULL, length(cfgx$cols_to_drop_before_loading))
      }
      
      message("Assigning scidb required ID-s / converting to scidb required types")
      dfx2$featureset_id = featureset_id
      dfx2$feature_type = feature_type
      dfx2$start = as.character(dfx2$start)
      dfx2$end = as.character(dfx2$end)
      
      return(dfx2)
    }
    
    tryCatch({
      gene_identifier = cfg[[gxf_format]]$gene_identifier
      message("=== Trying to extract gene information based on `", gene_identifier, "` tag ", 
              "and using `is.na(Parent)` check ...")
      df_gene = drop_na_columns(
        dfx[which(is.na(dfx[, 'Parent', drop = TRUE]) & 
                    !is.na(dfx[, gene_identifier, drop = TRUE])
        ), ])
      if (nrow(df_gene) == 0) stop()
      
      if (length(unique(df_gene[, gene_identifier, drop=TRUE])) != nrow(df_gene)) {
        message("Some level of redundancy exists")
        
        message("Retaining unique rows by identifier: ", gene_identifier)
        
        df_gene = drop_na_columns(
          df_gene[which(!duplicated(df_gene[, gene_identifier, drop = TRUE])), ])
        
      }
      
      message("Num. rows: ", nrow(df_gene))
      df_gene2 = run_common_etl(dfx1 = df_gene, feature_type = 'gene', featureset_id = featureset_id, 
                                cfgx = cfg[[gxf_format]], gtf_df = g)
      
      ftr_idx = register_feature(df1 = df_gene2, con = con)
      
      message("=== Succeeded")
    }, error = function(e) {
      stop("=== Failed formulation / loading of genes from GTF. Cannot proceed further. \n Error message:\n", e)
    })
    transcript_idx = NULL
    if (register_transcripts) {
      tryCatch({
        transcript_identifier = cfg[[gxf_format]]$transcript_identifier
        message("=== Trying to extract transcript information based on `", transcript_identifier, "` tag...")
        df_transcript = drop_na_columns(dfx[which(!is.na(dfx[, transcript_identifier, drop = TRUE])), ])
        if (nrow(df_transcript) == 0) stop("No transcripts found based on `", transcript_identifier, "` tag...")
        
        if (length(unique(df_transcript$transcript_id)) != nrow(df_transcript)) {
          message("Some level of redundancy exists. Retaining rows that have non-NA `Name` column")
          
          df_transcript = df_transcript[which(!is.na(df_transcript$Name)), ]
          if (length(unique(df_transcript$transcript_id)) != nrow(df_transcript)) {
            message("Some level of redundancy exists. Retaining by unique transcript ID-s only")
            df_transcript = drop_na_columns(
              df_transcript[which(!duplicated(df_transcript[, transcript_identifier, drop = TRUE])), ])
          }
        }
        message("Num. rows: ", nrow(df_transcript))
        
        df_transcript2 = run_common_etl(dfx1 = df_transcript, feature_type = 'transcript', featureset_id = featureset_id,
                                        cfgx = cfg[[gxf_format]], gtf_df = g,
                                        parent_to_gene_symbol_mapping_df = df_gene2[, c('name', 'gene_symbol')])
        
        transcript_idx = register_feature(df1 = df_transcript2, con = con)
        
        message("=== Succeeded")
      }, error = function(e) {
        message("=== Failed loading of transcripts. Error message:\n", e)
      })
    }
    
    return(list(
      gene_features_id = ftr_idx, 
      transcripts_features_id = transcript_idx))
  }
}
Paradigm4/revealgenomics documentation built on April 7, 2020, 2:01 a.m.