R/regional_quantification.R

Defines functions compute_call_count_matrices compute_aggr_met_rate compute_region_met_matrix convert_to_granges get_promoters read_region_annot

library(dplyr)
library(data.table)
library(GenomicRanges)

read_region_annot <- function(region_annot_file, format_file, type_filtering =  'none')
{
    df_region_annot = read.table(region_annot_file, sep ='\t')
    df_format = read.table(format_file, sep ='\t')
    colnames(df_region_annot) = as.character(df_format$V2)
    head(df_region_annot)
    dim(df_region_annot)
    #df_region_annot %>% distinct(region_name, .keep_all = TRUE)

    df_region_annot <- dplyr::distinct(df_region_annot, region_name, .keep_all = TRUE)
    dim(df_region_annot)
    head(df_region_annot)
    length(unique(df_region_annot$region_name))


    if(type_filtering != 'none')
    {
      df_region_annot  = df_region_annot[df_region_annot$region_type == type_filtering, ]
    }

    rownames(df_region_annot) =  df_region_annot$region_name

    head(df_region_annot)

    return(df_region_annot)
}

get_promoters <- function(df_gene_annot)
{

    head(df_gene_annot)
    df_promoters = df_gene_annot
    head(df_promoters)
    attach(df_gene_annot)
    df_promoters$start[strand == '+'] = df_gene_annot$start[strand == '+'] - 2000
    df_promoters$end[strand == '+'] = df_gene_annot$start[strand == '+'] + 500

    df_promoters$end[strand == '-'] = df_gene_annot$end[strand == '-'] + 2000
    df_promoters$start[strand == '-'] = df_gene_annot$end[strand == '-'] - 500
    detach(df_gene_annot)
    head(df_promoters)

    df_promoters$start[df_promoters$start < 0] = 0

    return(df_promoters)
}

convert_to_granges <- function(df_region)
{

    gr_region <- with(df_region, GRanges(chrom,
                                           IRanges(start+1, end),
                                           strand = '*',
                                           region_name,
                                           region_type))

    return(gr_region)
}


compute_region_met_matrix <- function(  list_call_count_matrices,
                                       min_call_count_threshold = 10)
{

  full_met_call_count_matrix = list_call_count_matrices$full_met_call_count_matrix
  full_total_call_count_matrix = list_call_count_matrices$full_total_call_count_matrix

  met_rate_matrix = full_met_call_count_matrix / full_total_call_count_matrix

  met_rate_matrix[full_total_call_count_matrix < min_call_count_threshold] = NA

  return(met_rate_matrix)

}


compute_aggr_met_rate <- function( list_call_count_matrices )
{

  aggr_met_counts = rowSums(list_call_count_matrices$full_met_call_count_matrix, na.rm = T)
  aggr_total_counts = rowSums(list_call_count_matrices$full_total_call_count_matrix, na.rm = T)

  head(aggr_met_counts)
  head(aggr_total_counts)

  aggr_rate = aggr_met_counts / aggr_total_counts

  head(aggr_rate)

  return(aggr_rate)

}



compute_call_count_matrices <- function(  df_region,
                               methylation_calls_dir,
                               methylation_type = 'CpG',
                               exclude_cells = c()
                              )
{


  gr_region = convert_to_granges(df_region)
  setwd(methylation_calls_dir)
  pattern  = paste0(methylation_type, '_calls.*organism.cov.*')
  cov_files = list.files(methylation_calls_dir, pattern)



  result_list = list()

  cl <- makeCluster(num_cores, outfile="", type = 'SOCK')
  registerDoSNOW(cl)

  #for(i in  1:length(cov_files))
  #result_list <- foreach(i=1:length(cov_files)) %dopar%
  met_hits_list <- foreach(i=1:length(cov_files)) %dopar%
  {
    library(data.table)
    library(GenomicRanges)

    print(i)
    cov_file = cov_files[i]
    cell_id = gsub('.organism.cov.gz', '', cov_file)
    cell_id = gsub(paste0(methylation_type,'_calls.'), '', cell_id)

    dt_cov = fread(paste0(methylation_calls_dir, cov_file) )
    colnames(dt_cov) = c('chrom', 'start', 'end', 'met_rate', 'met', 'demet')
    #dt_cov$chr = paste0('chr', dt_cov$chr)
    head(dt_cov)
    unique(dt_cov$chr)

    gr_cov <- with(dt_cov, GRanges(chrom, IRanges(start+1, end), strand = '*', met_rate, met, demet)  )
    gr_cov

    #dt_inter = data.table(intersect_bed(gr_region, gr_cov))
    #dim(dt_inter)
    #head(dt_inter)

    hits_obj <- findOverlaps(gr_region, gr_cov)
    class(hits_obj)
    da = as.data.frame(gr_region[queryHits(hits_obj)])
    db = as.data.frame(gr_cov[subjectHits(hits_obj)])
    dt_inter <- data.table(cbind(da, db))


    quant_cols = c('met', 'demet')

    dt_aggr <- dt_inter[, lapply(.SD, sum), by = .(region_name), .SDcols = quant_cols  ]
    length(unique(dt_aggr$region_name))
    head(dt_aggr)
    dim(dt_aggr)

    dt_aggr$call_count = dt_aggr$met + dt_aggr$demet



    #result_list[[cell_id]] = met_rate_vector

    df_aggr = data.frame(dt_aggr)

    head(df_aggr)

    rownames(df_aggr) = df_aggr$region_name

    df_aggr_x = base::merge(df_region, df_aggr, by.x = 'region_name', by.y = 'region_name', all.x = T)

    rownames(df_aggr_x) = df_aggr_x$region_name

    df_aggr_x
  }

  cell_ids = cov_files
  cell_ids = gsub('.organism.cov.gz', '', cell_ids)
  cell_ids = gsub(paste0(methylation_type,'_calls.'), '', cell_ids)
  names(met_hits_list) = cell_ids


  cell_ids = names(met_hits_list)

  include_cells = setdiff(cell_ids, exclude_cells)
  met_hits_list = met_hits_list[include_cells]


  met_call_count_list = lapply(met_hits_list, '[[', 'met')
  demet_call_count_list = lapply(met_hits_list, '[[', 'demet')
  total_call_count_list = lapply(met_hits_list, '[[', 'call_count')

  head(lapply(met_call_count_list, length))
  head(met_call_count_list[[1]])
  length(met_call_count_list[[1]])

  class(met_call_count_list[[1]])

  full_met_call_count_list = lapply(met_call_count_list, '[', df_region$region_name)
  full_total_call_count_list = lapply(total_call_count_list, '[', df_region$region_name)

  full_met_call_count_matrix = do.call('cbind', full_met_call_count_list)
  full_total_call_count_matrix = do.call('cbind', full_total_call_count_list)

  rownames(full_met_call_count_matrix) = df_region$region_name
  rownames(full_total_call_count_matrix) = df_region$region_name

  full_met_call_count_matrix[1:5, 1:5]
  full_total_call_count_matrix[1:5, 1:5]

  list_call_count_matrices = list('full_met_call_count_matrix' = full_met_call_count_matrix,
                     'full_total_call_count_matrix' = full_total_call_count_matrix
                     )

  return(list_call_count_matrices)


}



impute_nas <- function(met_mat, max_ratio_of_na_cells = 0.50)
{
  count_na_by_region = rowSums(is.na(met_mat))
  head(count_na_by_region)
  ratio_na_by_region = count_na_by_region / ncol(met_mat)
  head(ratio_na_by_region)
  sum(ratio_na_by_region > max_ratio_of_na_cells)
  use_regions = ratio_na_by_region <= max_ratio_of_na_cells
  met_mat_filtered = met_mat[use_regions, ]
  dim(met_mat_filtered)
  met_mat_filtered[1:5, 1:5]

  met_mat_imputed = met_mat_filtered
  na_index <- which(is.na(met_mat_imputed), arr.ind=TRUE)
  met_mat_imputed[na_index] <- rowMeans(met_mat_imputed, na.rm=TRUE)[na_index[,1]]

  return(met_mat_imputed)
}
yasin-uzun/SINBAD.0.2 documentation built on Dec. 23, 2021, 7:16 p.m.