R/annot_processGeneAnnotations.R

Defines functions reduceDataFrameGRanges GRangesListToDataFrame extractExonsStartEnd

reduceDataFrameGRanges <- function(df){
  #Convert data frame into a a Granges object and reduce overlapping exons, then convert back to df
  
  gr = GRanges(seqnames = df$seqnames, ranges = IRanges(start = df$start, end = df$end), strand = df$strand)
  #Reduce exons
  gr = IRanges::reduce(gr)
  #Convert back to data.frame
  gr = GenomicRanges::as.data.frame(gr)
  return(gr)
}

GRangesListToDataFrame <- function(gr_list, joint_id_df){
  #Convert GRangesList into a data frame of exon coordinates
  # gr_list - GRangesList with tr_ids as names
  # joint_id_df - data.frame(with rownames) containing three columns: gene_id, transcript_id and joint_id
  
  names(gr_list) = joint_id_df[names(gr_list),]$joint_id
  exons_df = plyr::ldply(gr_list, GenomicRanges::as.data.frame, .id = "joint_id") %>%
    dplyr::mutate(joint_id = as.vector(joint_id)) %>%
    tidyr::separate(joint_id, into = c("gene_id", "transcript_id"), sep = ":")
  return(exons_df)
}

extractExonsStartEnd <- function(gr_list, gene_tx_map){
  #Extract Exon start and end coorinates per gene from a GrangesList object of exons by transcript id.
  
  #Count the number of transcripts per gene
  transcript_counts = dplyr::group_by(gene_tx_map, gene_id) %>% 
    dplyr::summarize(transcript_count = length(transcript_id))
  
  #Extract genes with only one transcipt
  single_transcript_genes = dplyr::filter(transcript_counts, transcript_count == 1)
  print(dim(single_transcript_genes))
  single_tx_annot = dplyr::semi_join(gene_tx_map, single_transcript_genes, by = "gene_id")
  
  #Extract genes with more than one transcript
  multi_transcript_genes = dplyr::filter(transcript_counts, transcript_count > 1)
  print(dim(multi_transcript_genes))
  multi_tx_annot = dplyr::semi_join(gene_tx_map, multi_transcript_genes, by = "gene_id")
  
  #Construct joint id for multi tx genes
  joint_id_df = dplyr::mutate(gene_tx_map, joint_id = paste(gene_id, transcript_id, sep = ":")) %>% as.data.frame()
  rownames(joint_id_df) = joint_id_df$transcript_id
  
  #Extract exon coordinates for single tx genes
  single_tx_exons = exons[single_tx_annot$transcript_id]
  single_tx_exons_df = GRangesListToDataFrame(single_tx_exons, joint_id_df)
  single_tx_exons_coords = dplyr::group_by(single_tx_exons_df, gene_id) %>% 
    arrange(start) %>% 
    summarize(starts = paste(start, collapse = ","), ends = paste(end, collapse = ","))
  
  #Extract joint exon coordinates for multi tx genes
  multi_tx_exons = exons[multi_tx_annot$transcript_id]
  multi_tx_exons_df = GRangesListToDataFrame(multi_tx_exons, joint_id_df)
  multi_tx_exons_reduced_df = dlply(multi_tx_exons_df, .(gene_id), seqUtils::reduceDataFrameGRanges) %>%
    ldply(.id = "gene_id")
  multi_tx_exons_coords = dplyr::group_by(multi_tx_exons_reduced_df, gene_id) %>% 
    arrange(start) %>% 
    summarize(starts = paste(start, collapse = ","), ends = paste(end, collapse = ","))
  
  #bind results together
  result = rbind(single_tx_exons_coords, multi_tx_exons_coords)
  return(result)
  
}

constructIntronExonDf <- function(gene_id, gene_data, intron_gap = 10, type = "intron"){
  #Return data frames with intron and exon coordinates and names
  # exon_starts - comma-separated string of exon start coords.
  # exon_ends - comma-separated string of exon end coords.
  
  #extract gene data
  gene_record = gene_data[gene_data$gene_id == gene_id,]
  exon_starts = gene_record$exon_starts
  exon_ends = gene_record$exon_ends
  seqname = gene_record$chr
  strand = gene_record$strand
  
  #Extract cooridnates from string
  exon_start_coords = as.numeric(unlist(strsplit(exon_starts, ",")))
  exon_end_coords = as.numeric(unlist(strsplit(exon_ends, ",")))

  #Construct intron data frame
  if (type == "intron"){
    if(length(exon_start_coords) > 1){ #Only genes with more than one exon can have introns
      intron_df = data_frame(start = exon_end_coords[1:length(exon_end_coords)-1] + intron_gap +1,
                             end = exon_start_coords[2:length(exon_start_coords)] - intron_gap -1,
                             strand = strand,
                             seqnames = seqname,
                             Parent = gene_id,
                             type = "intron",
                             gene_id = gene_id)
      intron_df$ID = paste(gene_id, "intron", seq(1,nrow(intron_df)), sep = "_")
      return(intron_df)
    } else{
      return(NULL)
    } 
  } else if (type == "exon"){   #Construct exon data.frame
    exon_df = data_frame(start = exon_start_coords, 
                         end = exon_end_coords,
                         strand = strand,
                         seqnames = seqname,
                         Parent = gene_id,
                         type = "exon",
                         gene_id = gene_id)
    exon_df$ID = paste(gene_id, "exon", seq(1,nrow(exon_df)), sep = "_")
    return(exon_df)
  }
}

#' Convert a data frame into a GRanges object
#' 
#' Seqnames, strand, start and end columns are used as corresponding elements 
#' in the GRanges object. Remaining columns are added into the elementMetadata data frame.
#'
#' @param df Input data frame (required columns: seqnames, start, end, strand)
#'
#' @return GRanges object construct from df.
#' @export
dataFrameToGRanges <- function(df){
  #Convert a data.frame into a GRanges object
  
  gr = GenomicRanges::GRanges(seqnames = df$seqnames, 
               ranges = IRanges::IRanges(start = df$start, end = df$end),
               strand = df$strand)
  
  #Add metadata
  meta = dplyr::select(df, -start, -end, -strand, -seqnames)
  GenomicRanges::elementMetadata(gr) = meta
  
  return(gr)
}
kauralasoo/seqUtils documentation built on May 20, 2019, 7:42 a.m.