R/histoneSig.R

Defines functions protected_grextend data_from_overlaps per_chr_valley_plots parse_signalSet build_feature_table base_features_from_signalSet overlap_baseline plotSignal positions_from_signalSet grextend obtain_extended_parsing_regions parse_regions_with_multiple_overlaps parse_regions_without_multiple_overlaps filter_signalSet np_signals_from_bigwig signal_parser signalSet dna_one_hot check_and_load_refgenome granges_to_continuous granges_chr_filter remove_consecutive check_consecutive lowpass_filter import.np

# Place in required: rtracklayer, GenomicRanges, gtools, dplyr, tidyverse, quantmod
### Include function for creating Homo.sapiens.hg38 database from UCSC track

## HELPER FUNCTIONS

import.np <- function(np_file){
  ## Declare columns to accomodate narrowPeak format
  extraCols_narrowPeak <- c(signalValue = "numeric", pValue = "numeric",
                            qValue = "numeric", peak = "integer")
  ## Read in calling import with extra parameters
  return(import(np_file,format="bed",extraCols=extraCols_narrowPeak))
}

## base filtering function; imported from github (reference pending)
lowpass_filter <- function(x,n){stats::filter(x,rep(1/n,n), sides=1)}

check_consecutive <- function(x){
  any(rle(diff(x))$values == 1)
}

remove_consecutive <- function(x){
  ## Should look for an in-place solution
  split_by_consecutive_intervals <- split(x, cumsum(c(1, diff(x) != 1)))
  consecutive_intervals <- split_by_consecutive_intervals[rapply(split_by_consecutive_intervals,length,how="list") > 1]
  consecutive_intervals <- lapply(consecutive_intervals,function(x) min(x) + (max(x) - min(x) +1)/2)
  return(unlist(modifyList(split_by_consecutive_intervals,consecutive_intervals),use.names=FALSE))

}

## Function to filter and order genomic ranges by 23 canonical chromosomes
granges_chr_filter <- function(granges_object){
  granges_object <- granges_object[grep("[chr]+([0-9]{1,2}$|X|Y)",seqnames(granges_object)),]
  seqlevels(granges_object) <- unique(as.character(seqnames(granges_object)))
  granges_object <- sortSeqlevels(granges_object)
  granges_object <- sort(granges_object)
}


####### Same as function from quantmod, only difference
####### Is the sum is +1 instead of +2

findvalleysone <- function (x, thresh = 0)
{
  pks <- which(diff(sign(diff(x, na.pad = FALSE)), na.pad = FALSE) >
                 0) + 1
  if (!missing(thresh)) {
    if (sign(thresh) > 0)
      thresh <- -thresh
    pks[x[pks - 1] - coredata(x[pks]) < thresh]
  }
  else pks
}

findpeaksone <- function (x, thresh = 0)
{
  pks <- which(diff(sign(diff(x, na.pad = FALSE)), na.pad = FALSE) <
                 0) + 1
  if (!missing(thresh)) {
    if (sign(thresh) < 0)
      thresh <- -thresh
    pks[x[pks - 1] - coredata(x[pks]) > thresh]
  }
  else pks
}

#Obtain position from ranges

granges_to_continuous <- function(x){
  #Where x is a GenomicRanges object with an associated signal value
  #(parsed from bigwig)
  score_vector <- rep(x$score,width(x))
  return(score_vector)
}

check_and_load_refgenome <- function(refgenome){
  if(refgenome %in% BSgenome::available.genomes()){
    refgenome
  }else{
    stop("Provided reference genome not available.
         Consult available options with BSgenome::available.genomes()")}
  require(suppressPackageStartupMessages(refgenome),character.only = TRUE) # Load supplied genome
  genome <- eval(parse(text=refgenome)) ## Include in variable that will be used for further calls
  return(genome)
}

dna_one_hot <- function(x){

  # x is a basepair vector ex: 'A','C','G','T', etc
  ## Probably a horrible implementation, but it's doing the job
  ### HOW DO R FORMULA OBJECTS EVEN WORK?
  bases <- c('A','C','G','T')
  ## Build the encoder
  encoder = as.data.frame(bases)
  names(encoder) <- 'bases'
  dmy <- caret::dummyVars(" ~ bases", data=encoder)
 ## Build one hot matrix from char vector
  x <- as.data.frame(x)
  names(x) <- 'bases'
  one_hot_matrix <- predict(dmy,x)
  colnames(one_hot_matrix) <- bases
  return(one_hot_matrix)

}

### SIGNALSET CLASS

signalSet <- function(){

  signal <- list(signal = vector(),
                 chromosome = character(),
                 start = numeric(),
                 end = numeric(),
                 width = numeric() ,
                 distance_to_nearest_upstream_peak = numeric(),
                 distance_to_nearest_downstream_peak = numeric())

  class(signal) <- "signalSet"

  signal
}

signal_parser <- function(np_object, bw_object){
  overlaps <- findOverlaps(bw_object,np_object)
  values <- queryHits(overlaps)
  by_vector <- factor(subjectHits(overlaps))
  groups <- split(values,by_vector)
  signals <- vector(mode="list", length=length(groups))
  signals <- lapply(groups, function(x) granges_to_continuous(bw_object[x]))
  return(signals)
}


np_signals_from_bigwig <- function(np_object, bw_object){
  
  ##Make object that will be transformed to signal given specified peakfile, bigwig pairs
  ## Preallocate list to be populated with variable size signals
  n_peaks <- length(np_object)
  signals <- vector(mode="list", length=n_peaks)
  signals <- lapply(1:n_peaks,function(x) signalSet())
  parsed_signals <- signal_parser(np_object,bw_object)
  ## Calculate overlap vectors
  widths <- width(np_object)
  starts <- start(np_object)
  ends<- end(np_object)
  
  ## Determine where neighboring ranges start and end
  next_left <- ends[follow(np_object)]
  next_right <- starts[precede(np_object)]
  ## Determine distance
  distance_to_left <- starts - next_left
  distance_to_right <- next_right - ends
  ## Fix NAs at beginning of chromosomes
  distance_to_left[is.na(distance_to_left)] <- 0
  distance_to_right[is.na(distance_to_right)] <-0
  
  ## Fill rest of values in signalSet object
  for (i in (1:n_peaks)){
    signals[[i]]$signal <- parsed_signals[[i]]
    signals[[i]]$chromosome <- as.character(seqnames(np_object[i]))
    signals[[i]]$start <- starts[i]
    signals[[i]]$end <- ends[i]
    signals[[i]]$width <- widths[i]
    signals[[i]]$distance_to_nearest_upstream_peak <- distance_to_right[i]
    signals[[i]]$distance_to_nearest_downstream_peak <- distance_to_left[i]
  }
  
  return(signals)
  
}


filter_signalSet <- function(signalSet, window_size, filter_function, ...){
  ## Use window_size for equal sized lowpass, fractional for proportional filter
  ## optional arguments for function
  ## TODO: add unused protection for dots
  ## BREAKS IF ALL SIGNALS IN THE SIGNALSET ARE THE SAME SIZE
  ## Doesn't work with singles
  ## Possibly fixed with the simplified = FALSE call of mapply (How to introduce it with variable function arguments present?)
  if(missing(...) == FALSE){
    dots <- list(...)
  for(i in 1:length(dots)) {
    assign(x = names(dots)[i], value = dots[[i]])
  }
    if('fractional' %in% names(dots)){
      window_size = ((unlist(lapply(signalSet,'[[', 'width'))) / fractional)
    }
  }
  ## Defaults to a lowpass filter
  if(missing(filter_function)) filter_function <- lowpass_filter else filter_function

  if(length(window_size) > 1){
  filtered_signals <- mapply(filter_function, x = lapply(signalSet,'[[', 'signal'), n = window_size)
  }else if(length(window_size) == 1){
  filtered_signals <- lapply(lapply(signalSet,'[[', 'signal'), filter_function, n = window_size)
  }
  else{stop("n must be a positive numeric integer or vector")}
  ## switch TS object back to numeric signal
  filtered_signals <- lapply(filtered_signals, as.numeric)
  # Replace NAs from timeseries with first real value in new, filtered vector
  ## find first non NA value in filter to subset later
  first_real_index <- lapply(filtered_signals,function(x) which(!is.na(x)))
  first_value_index <- lapply(first_real_index,min)
  values <- Map('[', filtered_signals, first_value_index)
  ##
  filtered_signals <- mapply(function(x,v) replace(x,is.na(x),v), x=filtered_signals ,v=values)
  for (i in 1:length(signalSet)){
  ## assign back into list
    ## Is there a way to do this with an apply or map?
    signalSet[[i]]$signal <- filtered_signals[[i]]
  }

  return(signalSet)
}

## Where query is the file to obtain regions with multiple overlaps from
## and target is the file that can overlap multiple times with query

parse_regions_without_multiple_overlaps <- function(query, target){
  overlap_vector <- countOverlaps(query,target)
  query$num_overlapping_ranges <- overlap_vector
  query[countOverlaps(query,target) <= 1,]
}


## This is used for informative purposes only, so as to subset negative examples
parse_regions_with_multiple_overlaps <- function(query, target){
  overlap_vector <- countOverlaps(query,target)
  query$num_overlapping_ranges <- overlap_vector
  query[countOverlaps(query,target) > 1,]
}


############ TEST FURTHER FOR ROBUSTNESS
############ AND GENERAL SENSE
### Probably rename into "join overlaps" or something

obtain_extended_parsing_regions <- function(query,target){

  #Given a query and a target, returns an object with target values
  #that joins regions that share overlap regions in the corresponding query

  ## Subset target's zeros out
  target$num_overlaps <- countOverlaps(target,query)
  target_length <- length(target)
  zeros <- target[target$num_overlaps == 0]
  #Obtain ranges with at least one overlap
  target_overlap_ranges <- target[target$num_overlaps > 0]

  which <- findOverlaps(target_overlap_ranges,query)
  ## Factor
  factored_indices <- factor(to(which))

  # parsing_index allows us to see which were the query's overlapping ranges with the target
  parsing_index <- from(which)
  target_ranges <- target_overlap_ranges[parsing_index,]
  ## longer than a threshold
  iterator <- data.frame(index = unlist(lapply(split(factored_indices,factored_indices),seq_along)),
                         target_start = start(target_ranges),
                         target_end = end(target_ranges),
                         width = width(target_ranges),
                         query_overlap_range = factored_indices,
                         parsing_index = parsing_index,
                         chr = seqnames(target_ranges))

  ## probably not the most efficient way
  pre_ranges <- split(iterator, f = iterator$query_overlap_range)
  new_start <-  sapply(pre_ranges,function(x) x[1,2])
  new_end <- sapply(pre_ranges,function(x) x[length(x[,3]),3])
  chr <- sapply(pre_ranges,function(x) x[1,7])

  ## change chr value to dynamically adjust according to chr experiment that's provided
  new_multiples <- GRanges(seqnames = chr, ranges = IRanges(start = new_start,end = new_end))
  elementMetadata(zeros) <- NULL
  new_object <- c(zeros,new_multiples)
  new_object$intervalwidth <- width(new_object)
  new_object$num_overlaps <- countOverlaps(new_object,query)

  return(unique(sort(new_object)))
  ## further optimization: remove ranges shared between
  ## created range object and within-range-object-singles (if any)

}

grextend <- function(x, upstream=0, downstream=0){

  ## Obtained from a biostars post

  if (any(strand(x) == "*"))
    warning("'*' ranges were treated as '+'")
  on_plus <- strand(x) == "+" | strand(x) == "*"
  new_start <- start(x) - ifelse(on_plus, upstream, downstream)
  new_end <- end(x) + ifelse(on_plus, downstream, upstream)
  ranges(x) <- IRanges(new_start, new_end)
  trim(x)
}

positions_from_signalSet <- function(signalSet, points, returns = "positions"){
  ##### Breaks on non-nested lists (working as intended?)
  ##### Search for default methods on S3 classes
  #### create methods for both signalSet and signalSet
  ##(or don't, learn to circumvent the list wrapper in signalSets
  force(points)
  if(points == "valleys"){
    x <- lapply(lapply(signalSet,'[[', 1), function(x){findvalleysone(x)})
  }else if(points == "peaks"){
    x <- lapply(lapply(signalSet,'[[', 1), function(x){findpeaksone(x)})
  }else{
    stop("points argument must be specified as either peaks or valleys")
  }
  ##apply findValleys to return alleys in sets
  ## Obtain respective starting positions, subtract 1 since these will be
  ## added to the valley's index
  if (returns == "indices"){
    return(x)
  }else{
    ### subtracting 1 from starts to report valleys correctly after sum
    starts <- unlist(lapply(signalSet,'[[', 3))-1
    return(Map(`+`, x, starts))
  }

}


## Base plotting function
plotSignal <- function(x, highlight="none", ...){
  ### Receives a single signalSet
  ### plotting function to aid in the illustration of signalsets
  ### will show maxima and/or minima as specified with lines crossing each index point
  ### ideally, it will also show areas.
  signal = x[[1]]$signal
  ## add, if NOT list just take as is

  dt = data.table(position = seq_along(signal), signal = signal)
  setkey(dt,position)

  p = ggplot(data = dt , aes(x = position, y= signal)) +
    labs(x="Position",y="Signal Value") +
    geom_line()

  if(highlight == "valleys"){

    valleys <- data.table(position = unlist(positions_from_signalSet(x, "valleys", "indices")))
    dt[valleys, on = "position", valley_exists := i.position]
    p = p + geom_point(data= dt[dt$valley_exists>0], color="red") +
      scale_color_discrete(name = "Point", labels = "Valleys")

  } else if(highlight == "peaks") {

    peaks <- data.table(position = unlist(positions_from_signalSet(x, "peaks", "indices")))
    dt[peaks, on = "position", peak_exists := i.position]
    p = p + geom_point(data= dt[dt$peak_exists>0], color="blue")

  } else if(highlight =='both') {

    valleys <- data.table(position = unlist(positions_from_signalSet(x, "valleys", "indices")))
    peaks <- data.table(position = unlist(positions_from_signalSet(x, "peaks", "indices")))
    dt[valleys, on = "position", valley_exists := i.position]
    dt[peaks, on = "position", peak_exists := i.position]
    p = p + geom_point(data= dt[dt$valley_exists>0], color="red") +
      geom_point(data= dt[dt$peak_exists>0], color="blue")

  }

  return(p)
}

## Function useful for determining baseline percentage of overlaps
## Between a given query and a target
overlap_baseline <- function(query, target, return_unique = "FALSE"){
  ## Warning: has no protection for cases where query and target have different
  ## Seqnames; use function with this taken into consideration.
  ## Currently, user must remove unwanted chromosomes from granges object and make
  ## Sure they correspond in both datasets

  overlaps = countOverlaps(query,target)
  if(return_unique == "TRUE") overlaps[overlaps!=0] <- 1 else overlaps

  query_ranges_by_chr <- tibble(chr = as.vector(seqnames(query)), overlaps = overlaps) %>%
    group_by(chr) %>%
    summarize_all(sum) %>%
    add_row(., chr = "total", overlaps = sum(.$overlaps))


  target_ranges_by_chr <- tibble(chr = as.vector(seqnames(target)), count = rep(1,length(target))) %>%
    group_by(chr) %>%
    summarize_all(sum) %>%
    add_row(., chr = "total", count = sum(.$count))

  overlap_tibble <- tibble(chr = target_ranges_by_chr$chr,
                           overlap_percentage = (unlist(query_ranges_by_chr$overlaps) /
                                                   unlist(target_ranges_by_chr$count)) *100)
  return(overlap_tibble)

}


## Still prototyping with it, use at own risk
## Width calculation is off.

base_features_from_signalSet <- function(x, section = "interval", returns = "indices",
                                             wraptoGRanges = "FALSE", unwrap = "FALSE",
                                             max_area_filter = "FALSE"){

  set_peaks <- positions_from_signalSet(x, "peaks", "indices")
  set_valleys <- positions_from_signalSet(x, "valleys", "indices")
  ## Rationale: even if there's n valleys, no area can be calculated without
  ## Accompanying peaks; therefore, these observations are dropped.
  no_peaks <- unlist(lapply(set_peaks,function(x) any(length(x)==0)))
  no_valleys <- unlist(lapply(set_valleys,function(x) any(length(x)==0)))
  ## Join into single logic vector for removal of observations with 0
  ## On peaks or valleys
  to_remove <- Reduce(`|`, list(no_peaks,no_valleys))
  ## subset valleys and peaks without 0 and continue
  set_valleys <- set_valleys[!to_remove]
  set_peaks <- set_peaks[!to_remove]
  x <- x[!to_remove]
  ## Needs optimizing, implement an apply function that can obtain
  ## All metadata in signalset that's not the base signal
  set_upstream <- lapply(x, '[[', 'distance_to_nearest_upstream_peak')
  set_downstream <- lapply(x, '[[', 'distance_to_nearest_downstream_peak')
  set_chr <-lapply(x, '[[', 'chromosome')
  set_start <-lapply(x, '[[', 'start')
  set_maximums <- sapply(lapply(x,'[[','signal'),max)

  signalset_length <- length(x)
  base_feature_list <- vector(mode ="list",length = signalset_length)

  for(i in 1:signalset_length){
    ## Parse bw signal values associated to np interval
    signal <- x[[i]]$signal
    ## Access each observation
    peaks <- set_peaks[[i]]
    valleys <- set_valleys[[i]]

    ## Remove consecutive ocurrences (is this still necessary?)
    peaks <- if(check_consecutive(peaks)==TRUE) remove_consecutive(peaks) else peaks
    valleys <- if(check_consecutive(valleys)==TRUE) remove_consecutive(valleys) else valleys

    sorted_index_vector <- sort(c(peaks,valleys))
    rep_sorted_index_vector <- c(sorted_index_vector[1],
                                 rep(sorted_index_vector[2:(length(sorted_index_vector)-1)],times=1,each=2),
                                 sorted_index_vector[length(unlist(sorted_index_vector))])

    start <- rep_sorted_index_vector[seq(1,length(rep_sorted_index_vector),by=2)]
    end <- rep_sorted_index_vector[seq(2,length(rep_sorted_index_vector),by=2)]
    extension <- diff(sorted_index_vector)

    base_features <- data.frame(start = start, end = end)

    ## Calculate and Assign area base_features
    signal_area_length <- signal[rep_sorted_index_vector]
    base_features$chr <- set_chr[[i]]
    base_features$extension <- extension
    base_features$height <- abs(zoo::rollapply(signal_area_length, 2, by=2, diff, partial = TRUE, align="left"))
    base_features$area <- (base_features$height * base_features$extension)/2


    ## Add metadata
    ## Needs optimizing as well, same as above
    base_features$signal_maximum <- set_maximums[i]
    base_features$bps_to_next_peak <- set_upstream[[i]]
    base_features$bps_to_previous_peak <- set_downstream[[i]]

    if(returns =="positions"){
      base_features$start <- base_features$start + (set_start[[i]]-1)
      base_features$end <- base_features$end + (set_start[[i]]-1)
    }

    if(section == "valley"){

      area_for_subset <- base_features$area
      extension_for_subset <- base_features$extension
      height_for_subset <- base_features$height
      matches <- match(rep_sorted_index_vector,valleys)
      groups <- matches
      ## Keep all valley values, discard peaks
      matches[is.na(matches)==FALSE] <- 1
      match_index <- matches
      match_index[!is.na(matches)] <- cumsum(match_index[!is.na(matches)])
      ## Assign 0, then subset as logical to keep valley values from area vector
      ## Missing assertion to make valley vector and scores equal
      ## calculate valley areas, extensions and heights on observations that contain
      ## a valley between two peaks
      valley_area <- as.vector(tapply(area_for_subset[match_index],groups,sum,na.rm=TRUE)) ## Area
      valley_extension <-  as.vector(tapply(extension_for_subset[match_index],groups,sum,na.rm=TRUE)) ## Extension
      valley_height <-  as.vector(tapply(height_for_subset[match_index],groups,sum,na.rm=TRUE)) ## Height

      if(returns == "positions") valleys <- valleys + (set_start[[i]]-1) else valleys

      base_feature_list[[i]] <- data.frame(valley = valleys,
                                           chr = rep(set_chr[[i]],length(valleys)),
                                           extension = valley_extension,
                                           height = valley_height,
                                           area = valley_area,
                                           signal_maximum = set_maximums[i],
                                           bps_to_next_peak = rep(set_upstream[[i]], length(valleys)),
                                           bps_to_previous_peak = rep(set_downstream[[i]], length(valleys)))

    } else {base_feature_list[[i]] <- base_features}
    ## Remove valleys with area = 0
    base_feature_list[[i]] <- base_feature_list[[i]][base_feature_list[[i]]$area!=0,]

    if(max_area_filter == "TRUE"){
      base_feature_list[[i]] <- base_feature_list[[i]][which.max(base_feature_list[[i]]$area),]
    }
  }

  if(wraptoGRanges == "TRUE"){
    ## Simplify valley dataframe names before wrapping
    ## as GRanges object
    if(section == "valley"){
      base_feature_list <- lapply(base_feature_list, function(x){
        names(x)[names(x) == "valley"] <- "start"
        x$end <- x$start
        return(x)
      })
    }

    base_feature_list <- lapply(base_feature_list, function(x){
      makeGRangesFromDataFrame(x, seqnames.field = "chr",
                               start.field = "start",
                               end.field = "end",
                               keep.extra.columns=TRUE)
    })

  }

  if(wraptoGRanges == "TRUE" & unwrap == "TRUE"){
  base_feature_list <- suppressWarnings(sort(do.call(c, unlist(base_feature_list,recursive=FALSE))))
  } else if(unwrap == "TRUE"){
  base_feature_list <- bind_rows(base_feature_list)
  } else {base_feature_list}

  return(base_feature_list)


}

build_feature_table <- function(x, metadata_as_features = FALSE, include_sequence = FALSE,  refgenome){
  ### To do: Generalize; merge this function with signal_matrix_from_signalSet to make it able to receive
  ## signalSets or Granges, whichever is supplied.

  ## This check must go, should be a feature of the experimental parsing function
  ### features from signalSet doesn't give return a signalset, therefore this check
  ## Returns an error on non granges base_feature_list

  if(class(x) != "GRanges" & class(x) != "list"){
    stop('Must provide a valid GRanges or signalSet list')
  } else if(class(x) == "list"){
    if(all(sapply(x, class) == 'signalSet') == "FALSE"){
      stop("List provided has at least one none signalSet object")}
  }

  if(include_sequence == TRUE){
    genome <- check_and_load_refgenome(refgenome)
  }

  ## Is there a better way to stop doing this at the beginning of each signalset associated function?
  ## A signalset accessor of sorts?

  if(class(x) == 'list'){

    starts <- sapply(signalSet,'[[','start')
    ends <- sapply(signalSet,'[[','end')
    chrs <- sapply(signalSet,'[[','chromosome')
    widths <- sapply(signalSet, '[[', 'width')
    chromosome <- unlist(mapply(rep,chrs,widths),use.names=FALSE)
    master_index <- unlist(mapply(`:`, starts, ends), use.names=FALSE)
    master_table <- data.table(master_index = master_index, chromosome = chromosome, signal = unlist(sapply(signalSet,'[[','signal')))
  } else if(class(x) == "GRanges"){
    ## Build an index to parse the aggregate sequence of all GRanges objects.
    master_grange <- x ## Copying x in case metadata has to be called later (which it does)
    elementMetadata(master_grange) <- NULL
    master_granges_seqnames <- rep(as.vector(seqnames(master_grange)),width(master_grange))  ## obtain chrnames for index
    master_index <- unlist(mapply(`:`, start(master_grange), end(master_grange)), use.names=FALSE)   ## Create main bp vector
    # Store as a matrix to add further sections, depending on user supplied preferences
    master_table <- data.table(master_index, chromosome=master_granges_seqnames)}

  if(include_sequence == TRUE){
    ### to do: add option for custom genomes
    master_index_seqs <- as.character(getSeq(genome, master_grange))   ## Parse before deconstructing vector
    master_index_seqs <- unlist(strsplit(paste(master_index_seqs,collapse=""),"")) ## Split and join into per basepair sequence vector

    ## one hot encoding of previously obtained sequences
    master_seq_matrix <- dna_one_hot(master_index_seqs)
    master_table <- data.table(master_table,master_seq_matrix) ## Join into index + seq matrix
  }

  if(metadata_as_features == TRUE){
    metadata_features <- as.data.table(elementMetadata(x))
    metadata_features <- metadata_features[rep(seq_len(nrow(metadata_features)), width(x)),]
    master_table <- data.table(master_table,metadata_features)
  }

  return(master_table)
}


###### Experimental | Trying to build a parser to stop:
## 1) Is it a signalSet list or Signalset? discrepancy
## 2) stop the sapply line every single time we need an
## Attribute from our signalSet
### Todo: make to_parse able to receive multiple arguments.

parse_signalSet <- function(x, to_parse){

  ## Check if we're working with a full list of signalSet
  if(class(x) != "list" & class(x) != "signalSet"){
    stop('Must provide a valid signalSet list or signalSet to parse')
  } else if (all(sapply(x, class) == 'signalSet') == "FALSE"){
    stop("List provided has at least one none signalSet object")
  }

  force(to_parse)

  if(is.list(signalSet) == TRUE){
    parser <- function(to_parse) sapply(x,'[[', to_parse)
  } else
    parser <- function(to_parse) x$to_parse

  return(parser(to_parse))
}


per_chr_valley_plots <- function(x, separator='experiment', feature_to_plot = NULL ,
                                 condition=NULL, object_to_return="plot"){
  ## needs a bit more attention, currently designed
  ## to work with granges but a feature table approach should be in order
  ## To do: add argument to make something other than a segmented boxplot?
  if(class(x) != "GRanges" & class(x) != "data.frame"){
    stop('Must provide a valid GRanges, data.frame, or data.table')
  }

  if(!is.null(feature_to_plot)){
    if(!is.vector(feature_to_plot) & class(feature_to_plot) != "character"){
      stop('Must provid a valid character vector with the column names of the features that are to be plotted')
    }
  }

  if(class(x) == "GRanges"){
    features <- as.data.table(elementMetadata(x))
    chrnames <- as.character(seqnames(x))
    ## 'positions' currently works since it's understood a valley object
    ## has the same start and end with length 1; this will
    ## undoubtedly break with a different sized input
    positions <- start(ranges(x))
    chr_data <- data.table(chrnames,positions,features) # separator subset features[,separator,with=FALSE]
  }else if(class(x) =="data.table"){
    chr_data <- data.table(x) ## for performance ?
  }
  ## To do: add a check between features_to_plot and
  ## names(data) to make sure something'll get plotted
  plot_names <- unique(chr_data$chrnames)
  n <- length(plot_names)
  ncol <- floor(sqrt(n))
  plot_list <- vector(mode = "list", length = n)

  for(i in 1:n){
    plotdata <- chr_data[chr_data[,chrnames == plot_names[i]]]

    plot_list[[i]] <- ggplot(data = plotdata, aes(x = chrnames,
                                                  y = eval(parse(text = feature_to_plot)),
                                                  fill = eval(parse(text = separator)))) +
      geom_boxplot() +
      xlab(plot_names[i]) +
      ylab(feature_to_plot) +
      guides(fill=guide_legend(title=separator)) +
      scale_y_log10() ## These transformations should be passed in as options to the function

  }
  names(plot_list) <- plot_names
  if(object_to_return == "plotlist"){
    return(plot_list)
  } else {
  return(do.call("grid.arrange", c(plot_list,ncol=ncol)))
  dev.off()
  }

}

data_from_overlaps <- function(query,target){
  #determines various overlap statistics, from a query (ChIP)
  #and a target (ATAC).
  
  ### percent overlap calculation partially taken from: 
  ### https://support.bioconductor.org/p/72656/
  hits <- findOverlaps(query, target)
  ## Which chip hit group does it correspond to?
  target_overlaps  <- subjectHits(hits)
  target_overlap_groupref <- rep(seq_along(as.numeric(table(subjectHits(hits)))),times=as.numeric(table(subjectHits(hits))))
  query_overlaps <- queryHits(hits)
  query_overlap_groupref <- rep(seq_along(as.numeric(table(queryHits(hits)))),times=as.numeric(table(queryHits(hits))))
  #determine widths
  query_widths <- width(query)
  target_widths <- width(target)
  #
  query_widths_ref <- query_widths[target_overlap_groupref]
  target_widths_ref <- target_widths[target_overlaps]
  
  coverable_target_bases <- as.vector(tapply(target_widths_ref,query_overlap_groupref,sum))
  covered_target_bases <-  as.vector(tapply(width(pintersect(query[queryHits(hits)], target[subjectHits(hits)])),
                                            query_overlap_groupref,sum))
  
  targets_overlapped <- as.numeric(table(query_overlap_groupref))
  
  # Doesnt make much sense on current iteration since there can be plenty of targets
  ## Decide on a weight   target_bases_covered <- 
  covered_base_ratio <- covered_target_bases/coverable_target_bases
  data <- data.frame(covered_target_bases,coverable_target_bases,covered_base_ratio, targets_overlapped)
  return(data)
  
}

protected_grextend <- function(x, upstream=0, downstream=0){
  
  ## Currently only supports granges that are on the same strand
  ## Calculate starts and ends of range
  starts <- start(x)
  ends <- end(x)
  ## Determine where neighboring ranges start and end
  next_left <- ends[follow(x)]
  next_right <- starts[precede(x)]
  ## Determine distance
  distance_to_left<- starts - next_left
  distance_to_right <- next_right - ends
  ## Fix NAs at beginning of chromosomes
  distance_to_left[is.na(distance_to_left)] <- 0
  distance_to_right[is.na(distance_to_right)] <-0

  new_start <-  starts  - ifelse(distance_to_left > upstream , upstream, floor(distance_to_left/2))
  new_end <-  ends + ifelse(distance_to_right > downstream, downstream, floor(distance_to_right/2))
  ranges(x) <- IRanges(new_start, new_end)
  trim(x)

}
semibah/histoneSig documentation built on Sept. 18, 2020, 3:58 p.m.