R/meth_analysis.R

Defines functions dmTable dmDistance plot_meth_profile from_big_to_profile compute_gene_meth_profile interpolate_gene compute_pop_dm_table compute_indiv_dm_table get_probe_names

Documented in compute_gene_meth_profile compute_indiv_dm_table compute_pop_dm_table dmDistance dmTable from_big_to_profile get_probe_names interpolate_gene plot_meth_profile

#' get_probe_names
#'
#' This function extracts probe names of a given gene from platform
#' @param gene A vector describing the gene (line of a bed file).
#' @param pf_meth A data frame describing CpG positions.
#' @param up_str   An integer specifying up stream size (in bp).
#' @param dwn_str  An integer specifying down stream size (in bp).
#' @param pf_chr_colname string matching the name of the column in the platform that contain the chromosome on which we find a probes.
#' @param pf_pos_colname string matching the name of the column in the platform that contain the position information of probes.
#' @return A vector of probe names
#' @export
get_probe_names = function(
  gene                                ,
  pf_meth                       , 
  pf_chr_colname="Chromosome"         ,
  pf_pos_colname="Start"              ,
  up_str=5000                         , 
  dwn_str=5000                        
 ) {  
   if (substr(gene[[1]], 1, 3) != "chr") {
     gene[[1]] = paste0("chr",gene[[1]])
   }
    
   # get gene properties
   chr =            gene[[1]]
   strand =         gene[[6]]
   gene_name =      gene[[4]]
   beg = as.numeric(gene[[2]])
   end = as.numeric(gene[[3]])

  if (nrow(pf_meth) == 0) {
    warning(paste0("No probes for gene ", gene[[4]],"(",gene[[5]],")."))
    return(NULL) 
  }
   
  if (substr(pf_meth[1, pf_chr_colname], 1, 3) != "chr") {
    pf_meth[,pf_chr_colname] = paste0("chr",pf_meth[,pf_chr_colname])
  }
  
  # get meth infos
  if (strand == "-") {
    off_set_beg = dwn_str
    off_set_end = up_str
    tss = end
  } else {
    off_set_beg = up_str
    off_set_end = dwn_str
    tss = beg
  }

  ## Compute probes associated with the gene 
  probe_idx =   rownames(pf_meth)[
    !is.na(pf_meth[[pf_pos_colname]]) & !is.na(pf_meth[[pf_chr_colname]]) &
    pf_meth[[pf_chr_colname]] == chr &
    pf_meth[[pf_pos_colname]] >= tss-up_str &
    pf_meth[[pf_pos_colname]] < tss+dwn_str
  ]    

  if (length(probe_idx) == 0) {
    warning(paste0("No probes for gene ", gene[[4]],"(",gene[[5]],")."))
    return(NULL)
  } else {
    return(probe_idx)    
  }
}

# Authors: Magali Richard, UGA
# magali.richard@univ-grenoble-alpes.fr
#
#---------------------------------------------
#'Compute differential mehylation table for each probe according to individual-level analysis.
#'
#'Generate differential methylation data table from beta values. According to a reference individual matrix which indicates which samples should be considere as reference for a given probe, the function computes the methylation difference between each tested sample and the mean of reference samples.
#'
#'@param gene_list A \code{gene_list} bedfile containing the genes to screen for differential methylation.
#'@param exp_grp A \code{exp_grp} dataframe that contains metadatas on individuals and samples.
#'@param data_meth A \code{meth} matrix that contains methylation information (beta values). Columns correspond to indivuals, row correspond to probes.
#'@param filter_indiv A vector of individual names to be screened for differential methylation. Optionnal (set on "no_filter" by default).
#'@param indiv_filtering_matrix A matrix of filter to define differentially expressed gene for each pathological sample for each probe. Rownames should be contained in rownames(gene_list) and colnames should contain \code{filter_indiv}. Reference individuals should be set to "0" and individuals to be tested should be set to "1".
#'@param pf_meth A data frame describing CpG positions.
#'@param pf_chr_colname String matching the name of the column in the platform that contain the chromosome on which we find a probes.
#'@param pf_pos_colname String matching the name of the column in the platform that contain the position information of probes.
#'@param updwn_str   An integer specifying up and down stream size (in bp). By default set on 5000pb. 
#'@param slide The maximum width slide allowed when comparing two curves. By default set on 0.
#'@param apply_func A function to be used as/instead of R \code{base::apply}. By default set on \code{base::apply}.
#'@param contrast A vector containing the constrast to use to estimate the compute the methylation profiles. Required if you are in the "pop" mode.
#'@param reference A string indicating what sample to considere as reference. Should be "normal" or "no_de" for all samples without differential expression (normal + patho).
#'@param probe_idx A list of probes associated with each gene
#'
#'
#'@return A matrix of differential methylation values based on single individual analysis. 
#'
#'@export

compute_indiv_dm_table <- function(gene_list, exp_grp, data_meth, filter_indiv = "no_filter", indiv_filtering_matrix , pf_meth, pf_pos_colname, pf_chr_colname, updwn_str = 5000, slide= 0, apply_func = apply, contrast=c("tissue_status","patho","normal"), reference ="normal", probe_idx) {
  
  if (filter_indiv[1] == "no_filter") {
    print("all individuals for which trscr, cnv and meth sample exist will be used in the process")
    filter_indiv = rownames(exp_grp[exp_grp$trscr == 1 & exp_grp$cnv == 1 & exp_grp$meth == 1, ])
  }
  
  if (length(which(filter_indiv %in% colnames(data_meth))) != length(filter_indiv) ) {
    stop("ERROR, filter_indiv does not fit to indivuals present in the data_meth matrix")
  } 
  
  if (length(which(filter_indiv %in% colnames(indiv_filtering_matrix))) != length(filter_indiv)) {
    stop("ERROR, filter_indiv does not fit to indivuals present in the indiv_filtering_matrix")
  }
  
  indiv_filtering_matrix = indiv_filtering_matrix[rownames(gene_list), ]
  
  if (length(which(rownames(indiv_filtering_matrix) %in% rownames(gene_list))) != dim(indiv_filtering_matrix)[1]) {
    stop("ERROR, indiv_filtering_matrix probes does not correspond to the probes present in the data_meth matrix")
  }
  
  if (length(which(rownames(gene_list) %in% names(probe_idx))) != length(rownames(gene_list))) {
    stop("ERROR, probe_idx does not contain all the genes of the gene_list")
  }
  
  #define reference sample
  tmp_exp_grp = exp_grp[filter_indiv,]
  reference_samples = rownames(tmp_exp_grp)[!is.na(tmp_exp_grp[[contrast[1]]]) & tmp_exp_grp[[contrast[1]]] == contrast[3]]
  
  #get probes associated with genes
  #probe_idx = apply_func(gene_list, 1, get_probe_names, pf_meth = pf_meth, pf_chr_colname = pf_chr_colname, pf_pos_colname = pf_pos_colname, up_str=updwn_str+slide, dwn_str=updwn_str+slide)
  idx = sapply(probe_idx, length) > 0
  probes = probe_idx[idx]
  probes = unique(unlist(probe_idx))
  names(probes) = NULL
  
  if (reference == "normal") {
    mean_normal= rowMeans(data_meth[probes, reference_samples])
    diff_meth_data = data_meth[probes, filter_indiv] - mean_normal 
  
  } else if (reference == "no_de") {
    
    #generate a matrix of reference individuals for each probe of interest
    ctrl_matrix_gene = (indiv_filtering_matrix[, filter_indiv]  == 0) + 0
    ctrl_matrix_probe = data_meth[probes, filter_indiv] * 0
    for (gene in rownames(gene_list)) {
      for (probe in probe_idx[[gene]]){
        ctrl_matrix_probe[probe, ] =  ctrl_matrix_gene[gene, ]
      }
    }
  
    #calculate differential methylation
    meth_value = data_meth[probes, filter_indiv]
    meth_value[is.na(meth_value)] <- 0
    ctrl_matrix_probe[is.na(ctrl_matrix_probe)] <- 0
    nb_indiv_ctrl_by_probe = rowSums(ctrl_matrix_probe)
    mean_ctrl= (colSums(t(meth_value) * t(ctrl_matrix_probe))) / nb_indiv_ctrl_by_probe
    mean_normal= rowMeans(data_meth[probes, reference_samples])
    diff_meth_data = data_meth[probes, filter_indiv] - mean(mean_ctrl, mean_normal)
    return(diff_meth_data)
    
  } else {
    warning("reference parameter is incorrect")
    return(NULL)
  }
}

# Authors: Magali Richard, UGA
# magali.richard@univ-grenoble-alpes.fr
#
#---------------------------------------------
#'Compute differential mehylation table for each probe according to population-level analysis (tested samples VS control).
#'
#'Generate differential methylation data table from beta values. It basically extract all tumorous samples and controls. Then it computes the difference between each tumorous and the mean of control.
#'
#'@param gene_list A \code{gene_list} bedfile containing the genes to screen for differential methylation.
#'@param exp_grp A \code{exp_grp} dataframe that contains metadatas on individuals and samples.
#'@param data_meth A \code{meth} matrix that contains methylation information (beta values). Columns correspond to indivuals, row correspond to probes.
#'@param filter_indiv A vector of individual names to be screened for differential expression. Optionnal (set on "no_filter" by default).
#'@param contrast A vector containing the constrast to use to estimate the differential methylation. By default: c("tissue_status","patho","normal")
#'@param pf_meth A data frame describing CpG positions.
#'@param pf_chr_colname String matching the name of the column in the platform that contain the chromosome on which we find a probes.
#'@param pf_pos_colname String matching the name of the column in the platform that contain the position information of probes.
#'@param updwn_str   An integer specifying up and down stream size (in bp). By default set on 5000pb. 
#'@param slide The maximum width slide allowed when comparing two curves. By default set on 0.
#'@param apply_func A function to be used as/instead of R \code{base::apply}. By default set on \code{base::apply}.
#'
#'@return A matrix of differential methylation values based on population analysis. 
#'
#'@export 
#'
compute_pop_dm_table = function(gene_list, exp_grp, data_meth, filter_indiv = "no_filter", contrast=c("tissue_status","patho","normal"), pf_meth, pf_pos_colname, pf_chr_colname, updwn_str = 5000, slide= 0, apply_func = apply){
  
  if (filter_indiv[1] == "no_filter") {
    print("all individuals will be used in the analysis")
    filter_indiv = colnames(data_meth)
  }
  if (length(which(filter_indiv %in% colnames(data_meth))) != length(filter_indiv) ) {
    stop("ERROR, filter_indiv does not fit to indivuals present in the data matrix")
  } 
  
  #get probes associated with genes
  probe_idx = apply_func(gene_list, 1, get_probe_names, pf_meth = pf_meth, pf_chr_colname = pf_chr_colname, pf_pos_colname = pf_pos_colname, up_str=updwn_str+slide, dwn_str=updwn_str+slide)
  idx = sapply(probe_idx, length) > 0
  probe_idx = probe_idx[idx]
  probe_idx = unique(unlist(probe_idx))
  names(probe_idx) = NULL
  
  data = data_meth[probe_idx,  filter_indiv] #data matrix
  tmp_exp_grp = exp_grp[filter_indiv, ]
 
  #tested_samples = rownames(exp_grp)[!is.na(exp_grp[[contrast[1]]]) & exp_grp[[contrast[1]]] == contrast[2]]
  reference_samples = rownames(tmp_exp_grp)[!is.na(tmp_exp_grp[[contrast[1]]]) & tmp_exp_grp[[contrast[1]]] == contrast[3]]
  
  mean_ctrl = rowMeans(data[, reference_samples], na.rm = TRUE)

  diff_meth_data = data - mean_ctrl
  
  return(diff_meth_data)
}



# Authors: Florent Chuffart, INSERM
# florent.chuffart@univ-grenoble-alpes.fr
#
#---------------------------------------------
#'interpolate_gene
#'
#'Build a interpolated signal of differential methylation value for a gene.
#'
#'
#' @param vec A numeric vector specifying differential methylation signal.
#' @param xf coordinates of interpolation
#' @param meth_probe_pos is a vector of probes position on the chromosome. 
#' @param tss the transcription start site on the same chromosome.
#' @param updwn_str is the width of the window on the chromosome in bp where the function will fetch probes position and differential methylation value, default is 5000.
#' @param slide is the maximum width slide you'll alow when comparing two curve, default is 0.
#'@export
interpolate_gene = function(vec, meth_probe_pos, xf, tss, updwn_str, slide) {
  if (sum(!is.na(vec))==0) {
    return(rep(NA, length(xf)))
  }
  xp         <- meth_probe_pos[order(meth_probe_pos)]
  yp         <- vec[order(meth_probe_pos)]
  idx = !is.na(yp)
  xp = xp[idx]
  yp = yp[idx]
  # xp_orig = xp
  # yp_orig = yp
  #concatenate missing bp to x axis (in order to have 10000 bp windows everytime)
  xpA = xpA = c()
  if (tss - updwn_str - slide < min(xp)) {
    xpA = tss - updwn_str - slide
  }
  if (tss + updwn_str + slide > max(xp)) {
    xpB = tss + updwn_str + slide
  }
  # xpA <- seq(from = tss - updwn_str - slide, to = min(xp)-1 )
  # xpB <- seq(from = max(xp)+1, to = tss + updwn_str + slide)
  xp  <- c(xpA, xp, xpB)
  #add fictiv value to added bp in y axis
  yp  <- c( rep(0, length(xpA)), yp, rep(0, length(xpB)) )
  ######
  yf         <- signal::pchip(xp, yp, xf)
  # yf_orig = yf#         <- signal::pchip(xp_orig, yp_orig, xf)
  # layout(matrix(1:2, 1), respect=TRUE)
  # plot(xf, yf, ylim=range(c(yf_orig, yf)))
  # points(xp_orig, yp_orig, pch=16, col=2)
  # plot(xf, yf_orig, ylim=range(c(yf_orig, yf)))
  # points(xp_orig, yp_orig, pch=16, col=2)
  
  
  return(yf)
}

# Authors: Florent Chuffart, INSERM and Magali Richard, UGA
# florent.chuffart@univ-grenoble-alpes.fr
# magali.richard@univ-grenoble-alpes.fr
#
#---------------------------------------------
#
#'compute_gene_meth_profile 
#'
#'Generate  differential methylation profile for a gene around the transcription start side 
#'
#' @param data_meth A matrix of row methylation data with TCGA sample ids as column names and probes ids as row names.
#' @param exp_grp A \code{exp_grp} dataframe that contains metadatas on individuals and samples.
#' @param pf_meth a data.frame with metadata types as columns names and probes ids as row names.
#' @param gene A list that describe gene in bed format.
#' @param type_of_analysis A string indicating if you want to make a population or an individual analysis. Could either be "pop" or "indiv". If not specifyed, all samples are take.
#' @param contrast A vector containing the constrast to use to estimate the compute the methylation profiles. Required if you are in the "pop" mode.
#' @param indiv_filtering_matrix A matrix of filter to define reference and tested individual for each probe. Required if you are in the "indiv" mode.
#' @param updwn_str is the width of the window on the chromosome in bp where the function will fetch probes position and differential methylation value
#' @param slide is the maximum width slide you'll alow when comparing two curve
#' @param wig_size is resolution at which the function interpolate the probes signal
#' @param mask_wide is mask wide of o probe
#' @param pf_chr_colname string matching the name of the column in the platform that contain the chromosome information of probes
#' @param pf_pos_colname string matching the name of the column in the platform that contain the position information of probes
#' @param apply_func Function that will be used for apply.
#' @param min_DE_samples A minimum of differentially expressed sample to consider a given gene as suitable for further analysis. Required if you are in the "indiv" mode.
#'@export

compute_gene_meth_profile = function(gene, exp_grp, data_meth, pf_meth,
  type_of_analysis, contrast=c("tissue_status","patho","normal"), 
  indiv_filtering_matrix = NULL, 
  pf_pos_colname, pf_chr_colname, updwn_str, slide, wig_size, mask_wide, apply_func=apply, 
  min_DE_samples=5
) {

  probe_idx = get_probe_names(gene   , 
    pf_meth=pf_meth      , 
    pf_pos_colname=pf_pos_colname    ,   
    pf_chr_colname=pf_chr_colname    , 
    up_str=updwn_str+slide           , 
    dwn_str=updwn_str+slide 
  )
  
  if (length(probe_idx) == 0) {
    warning(paste0("No probes for gene ", gene[[4]],"(",gene[[5]],")."))
    return(NULL)
  } else {
    
    # Fch, 30 march, 2018 by default we choose to take all samples.
    if (missing(type_of_analysis)) { 
      sample_idx = colnames(data_meth)
    } else {
      if (type_of_analysis == "pop") {
        print(paste("For gene ", gene[[4]], ", the analysis is performed at the population level", sep=""))
        filter_indiv = colnames(data_meth) #select all indivual present in data_meth matrix
        tmp_exp_grp = exp_grp[filter_indiv, ] #filter exp_grp accordingly
        sample_idx = rownames(tmp_exp_grp)[!is.na(tmp_exp_grp[[contrast[1]]]) & tmp_exp_grp[[contrast[1]]] == contrast[2]] #select individuals to test
      
      } else if (type_of_analysis == "indiv") {
        
        print(paste("For gene ", gene[[4]], ", the analysis is performed at the individual level", sep=""))
        filter_indiv = colnames(data_meth)
        tmp_de_vec = indiv_filtering_matrix[gene[[4]],filter_indiv ] 
        sample_idx = names(tmp_de_vec)[tmp_de_vec == 1] #select individual to test according to filtering matrix
        
        if (length(sample_idx) <= min_DE_samples) {
            warning(paste0("Less than ",  min_DE_samples, " DE samples for gene ", gene[[4]],"(",gene[[5]],")."))
           return(NULL)
          
        } 
      } else {
            warning("type_of_analysis parameter is incorrect")
            return(NULL)
      }
    }
  




          
    data         = data_meth[probe_idx,sample_idx]
    meth_probe_pos   = pf_meth[probe_idx, pf_pos_colname]
    strand       = gene[[6]]
    tss          = ifelse (strand == "+", as.numeric(gene[[2]]), as.numeric(gene[[3]]))

    # xf = seq(tss - updwn_str - slide, tss + updwn_str + slide, by = wig_size)
    # bins = cbind(rev(rev(xf)[-1]), xf[-1])
    # xf = xf[-1] - wig_size/2
    # meth_probe_bin = sapply(meth_probe_pos, function(p){
    #   d = abs(p - xf)
    #   min(d)
    #   which(d == min(d))[1]
    # })
    #
    # ret = sapply(1:nrow(bins), function(bin) {
    #   tmp_probe_idx = probe_idx[meth_probe_bin == bin]
    #   if (length(tmp_probe_idx) > 1) {
    #     m = apply(data[tmp_probe_idx,], 2, mean)
    #   } else if (length(tmp_probe_idx) == 0) {
    #     m = data[tmp_probe_idx,]
    #   } else {
    #     m = rep(NA, ncol(data))
    #   }
    #   m
    # })
    # return(ret)

    xf = seq(tss - updwn_str - slide, tss + updwn_str + slide, by = wig_size)
    xf = xf[-1] - wig_size/2

    if  (length(probe_idx) == 1) {
      data = t(data)
    }

    big = apply_func(
      data , 2,
      # vec = data[,5]
      interpolate_gene      ,
      meth_probe_pos=meth_probe_pos   ,
      xf=xf                 ,
      tss=tss               ,
      updwn_str=updwn_str               ,
      slide=slide
    )      

    mask = as.numeric(sapply(xf, function(x) {
      min(abs(x - meth_probe_pos)) <= mask_wide
    }))

    profile = from_big_to_profile(big, xf, mask)

    if (strand == "-") {
      profile = profile[nrow(profile):1,]
    }
          
    return(profile)
  }
}


# Authors: Florent Chuffart, INSERM
# florent.chuffart@univ-grenoble-alpes.fr
#
#---------------------------------------------
#' from_big_to_profile
#'
#' Transform a matrix of interpolated methylome signal of samples to a methyl;ome profile
#'
#' @param xf coordinates of interpolation
#' @param big matrix of interpolated methylome signal of samples 
#' @param mask is mask of o probe
#' @importFrom stats var
#'@export
from_big_to_profile = function(big, xf, mask) {
  m = apply(big, 1, mean, na.rm=TRUE)
  v = apply(big, 1, var, na.rm=TRUE)
  profile = data.frame(x=xf, y=m, var=v, mask=mask)
  profile = cbind(profile, big)
  return(profile)
}


# Authors: Florent Chuffart, INSERM
# florent.chuffart@univ-grenoble-alpes.fr
#
#---------------------------------------------
#
#' plot_meth_profile
#'
#' Plot methylome profile for a gene.
#'
#' @param meth_profile a list of dmProfile
#' @param alpha.f a numeric specifying transparency of convolved signal.
#' @param ylim plot function parameter.
#' @param ADD logical, add to current plot device
#' @param ... args pass to plot
#' @importFrom graphics lines
#' @importFrom graphics plot
#' @importFrom graphics matplot
#' @importFrom grDevices adjustcolor
#'
#'@export
plot_meth_profile = function(meth_profile, alpha.f, ylim=c(-1,1), ADD=FALSE, ...){
  if (!ADD) {
    plot(meth_profile$x, meth_profile$y, ylim=ylim, type="l", ...)    
  }
  lines(meth_profile$x, meth_profile$mask, col=2)
  lines(meth_profile$x, meth_profile$y + 2* sqrt(meth_profile$var), lty=2)
  lines(meth_profile$x, meth_profile$y - 2* sqrt(meth_profile$var), lty=2)  
  if (missing(alpha.f)) {
    alpha.f=2/(ncol(meth_profile)-5)
  }
  matplot(meth_profile$x, meth_profile[,5:ncol(meth_profile)], col=adjustcolor(1, alpha.f=alpha.f), type="l", lty=1, lwd=3, add=TRUE)
}






# Authors: Paul Terzian, UGA
#
#---------------------------------------------
#
#'dmDistance
#'
#'Perform either a frechet distance from the kmlShape package or the eucliddean distance modified from this package (default) between two dmProfile. Return a distance matrix.
#'[warning]Carefully use the frechet distance as it can be heavy computing when dealing with large set of profile. Complexity of the profile also weight on the memory usage. 
#'
#'@param profiles a list of dmProfile 
#'@param frechet a boolean specify if frechet distance will be computed.
#'
#'
#'@export
dmDistance <- function(profiles, frechet = FALSE){
  m  = sapply(profiles, "[[","y")
  v  = sapply(profiles, "[[","var")
  k  = sapply(profiles, "[[","mask")

  d = matrix(0, nrow=ncol(m), ncol=ncol(m) )
  for (i in 1:(ncol(m)-1)) {
    print(i)
    for (j in (i+1):ncol(m)) {
      euc = (m[,i] - m[,j])^2 / (v[,i] + v[,j])
      idx = k[,i] & k[,j] & !is.na(euc)
      # res = sum(euc * k[,i] * k[,j], na.rm=TRUE) / sum(k[,i] * k[,j], na.rm=TRUE)
      # res = sum(euc[idx] * k[idx,i] * k[idx,j]) / sum(k[idx,i] * k[idx,j])
      res = sum(euc[idx] * k[idx,i] * k[idx,j]) / sum(idx)
      res = sqrt(res)
        if (sum(idx)==0) {
          res = NA
        }
      d[i,j] = res
      d[j,i] = res
    }
  }
  
  return(d)
}


# Authors: Paul Terzian, UGA
#
#---------------------------------------------
#
# #'dmDistance2
# #'
# #'Perform either a frechet distance from the kmlShape package or the eucliddean distance modified from this package (default) between two dmProfile. Return a distance matrix.
# #'[warning]Carefully use the frechet distance as it can be heavy computing when dealing with large set of profile. Complexity of the profile also weight on the memory usage.
# #'
# #'@param profiles a list of dmProfile
# #'@param frechet a boolean specify if frechet distance will be computed.
# #'
# #'
# #'@export
# dmDistance2 <- function(profiles, frechet = FALSE){
#   m  = sapply(profiles, "[[","y")
#   v  = sapply(profiles, "[[","var")
#   k  = sapply(profiles, "[[","mask")
#
#   d = sapply(1:ncol(m), function(i) {
#     print(i)
#     sapply(1:ncol(m), function(j) {
#       if (i>=j) {
#         return(0)
#       } else {
#         euc = (m[,i] - m[,j])^2 / (v[,i] + v[,j])
#         idx = k[,i] & k[,j] & !is.na(euc)
#         # res = sum(euc * k[,i] * k[,j], na.rm=TRUE) / sum(k[,i] * k[,j], na.rm=TRUE)
#         # res = sum(euc[idx] * k[idx,i] * k[idx,j]) / sum(k[idx,i] * k[idx,j])
#         res = sum(euc[idx] * k[idx,i] * k[idx,j]) / sum(idx)
#         res = sqrt(res)
#         # d[i,j] = res
#         # d[j,i] = res
#         if (sum(idx)==0) {
#           res = Inf
#         }
#         return(res)
#       }
#     })
#   })
#   d = d+t(d)
#
#   return(d)
# }




# Authors: Paul Terzian, UGA
#
#---------------------------------------------
#
#'dmTable
#'
#'Generate differential methylation data table from beta values. It basically extract all tumorous samples and controls. Then it computes the difference between each tumorous and the mean of control.
#'
#'@param data A matrix of row methylation data with TCGA sample ids as column names and probes ids as row names.
#'@param tested_samples a vector of ids corresmasking to tumoral samples.
#'@param reference_samples a vector of ids corresmasking to control samples.
#'
#'
#'@export
dmTable <- function(data, tested_samples, reference_samples) {
  meanControl <- rowMeans(data[, reference_samples], na.rm = TRUE)
  data    <- data[, tested_samples]
  AllDM = data - meanControl  
  return(AllDM)
}







#
# #'dmDistance_translocate
# #'
# #'Produce a list of two matrix : The distance matrix from a list of dmProfile. Each profile is translocated N times against another, we then keep the min(distance) in the matrix. The second matrix indicates which translocation returned the min(distance)
# #'
# #'@param dmprofileList a list of dmProfile
# #'@param updwn_str is the width of the window on the chromosome in bp where the function will fetch probes position and differential methylation value, default is 5000.
# #'@param slide is the maximum width slide you'll alow when comparing two curve, default is 0.
# #'@param by.interp is resolution at which the function interpolate the probes signal, default is 20.
# #'
# #'@export
# dmDistance_translocate <- function(dmprofileList, updwn_str=5000, slide=500, by.interp = 20){
#
#   #transform bp in bins of bp.length = by.interp
#   bwin   <- updwn_str / by.interp
#   bslide <- slide / by.interp
#
#   #create empty matrix
#   m  <- matrix(rep(NA, length(dmprofileList)*length(dmprofileList)), nrow = length(dmprofileList), ncol = length(dmprofileList))
#   mK <- matrix(rep(NA, length(dmprofileList)*length(dmprofileList)), nrow = length(dmprofileList), ncol = length(dmprofileList))
#
#   for(i in 1:(length(dmprofileList))){
#
#     m1 <- dmprofileList[[i]]
#
#     listmi <- translocateProfile(m1, bslide, bwin)
#
#     # return(listmi)
#     # for(j in (1+i):length(dmprofileList)){
#     for(j in 1:length(dmprofileList)){
#
#       m2 <- dmprofileList[[j]]
#       m2 <- m2[(1+bslide):(bwin*2+bslide), ]
#
#       #str(listmi)
#       distvect <- sapply(listmi, eucli_dist, m2=m2)
#       distvect <- sqrt(distvect)
#
#
#       if(all(is.nan(distvect))){
#         m[i,j]    <- NaN
#         mK[i,j]   <- NaN
#       }else{
#         #distvect[k+slide+1]  <- sqrt(eucli.dist(m1, m2))
#         m[i,j]    <- min(distvect, na.rm = TRUE)
#         distvect[is.nan(distvect)] <- max(distvect, na.rm = TRUE)
#         mK[i,j]   <- which.min(distvect) - bslide - 1
#         # print(which.min(distvect))
#       }
#
#     }
#   }
#   # dbmatrix <- list(dist=m, slide=mK)
#   dbmatrix <-list(m = m, mK = mK)
#   return(dbmatrix)
# }








# #'clustdmProfile
# #'
# #'Plot a dendrogram from a distance matrix, select cluster by clicks on the graphical window, click finish to get a list of geneNames in each cluster selected.
# #'
# #'Carefull, the function has been generating errors with less than 20 genes in the distance matrix.
# #'
# #'@param mat a distance Matrix
# #'@param fill_NA boolean specifying if NA should be imputed, need to be kept TRUE for now if there are NA in the matrix
# #'@param geneLabels A vector of genes names or id, can be extracted from a list of dmProfile with getProfileId
# #'
# #'@return A list of 2 object, the hclust result and a list containing the name of genes for each vector
# #'
# #'@export
# clustdmProfile <- function(mat, fill_NA = TRUE, geneLabels){
#
#   if(fill_NA){
#     for(i in 1:ncol(mat)){
#       mat[is.na(mat[,i]), i] <- mean(c(mean(mat[,i], na.rm = TRUE),
#                                        mean(mat[i,], na.rm = TRUE)))
#     }
#   }
#
#
#   colnames(mat) <- geneLabels
#   rownames(mat) <- geneLabels
#
#
#   hclust_result <- stats::hclust(stats::as.dist(mat), method = "complete")
#
#   graphics::plot(hclust_result)
#
#   print("Plot ready for cluster selection...")
#
#   list_clust <- graphics::identify(hclust_result)
#
#   print("Selection over")
#
#   clust_res <- list(hclust_result = hclust_result, genes_clust = list_clust)
#'
#   return(clust_res)
#
# }




#
# #'translocateProfile
# #'
# #'Produce a list of length bslide + 1 where each element is a dmProfile dataframe translocated around the tss
# #'
# #'@param m1 is a dmProfile dataframe to translocate using bslide
# #'@param bslide is the slide value divided by the width of each bin in dmProfile (= by.interp)
# #'@param bwin is the window parameter divided by the width of each bin in dmProfile (= by.interp)
# #'
# #'@export
# translocateProfile <- function(m1, bslide, bwin){
#
#   listmi <- list()
#
#   for (k in -bslide:bslide){
#
#     listmi[[k+bslide+1]]  <- m1[(1+bslide+k):(bslide+2*bwin+k), ]
#
#   }
#
#   return(listmi)
# }


#' BACKUP COMPUTE INDIV DM TABLE
#' 
#' # Authors: Magali Richard, UGA
#' # magali.richard@univ-grenoble-alpes.fr
#' #
#' #---------------------------------------------
#' #'Compute differential mehylation table for each probe according to individual-level analysis.
#' #'
#' #'Generate differential methylation data table from beta values. According to a reference individual matrix which indicates which samples should be considere as reference for a given probe, the function computes the methylation difference between each tested sample and the mean of reference samples.
#' #'
#' #'@param gene_list A \code{gene_list} bedfile containing the genes to screen for differential methylation.
#' #'@param exp_grp A \code{exp_grp} dataframe that contains metadatas on individuals and samples.
#' #'@param data_meth A \code{meth} matrix that contains methylation information (beta values). Columns correspond to indivuals, row correspond to probes.
#' #'@param filter_indiv A vector of individual names to be screened for differential methylation. Optionnal (set on "no_filter" by default).
#' #'@param indiv_filtering_matrix A matrix of filter to define differentially expressed gene for each pathological sample for each probe. Rownames should be contained in rownames(gene_list) and colnames should contain \code{filter_indiv}. Reference individuals should be set to "0" and individuals to be tested should be set to "1".
#' #'@param pf_meth A data frame describing CpG positions.
#' #'@param pf_chr_colname String matching the name of the column in the platform that contain the chromosome on which we find a probes.
#' #'@param pf_pos_colname String matching the name of the column in the platform that contain the position information of probes.
#' #'@param updwn_str   An integer specifying up and down stream size (in bp). By default set on 5000pb. 
#' #'@param slide The maximum width slide allowed when comparing two curves. By default set on 0.
#' #'@param apply_func A function to be used as/instead of R \code{base::apply}. By default set on \code{base::apply}.
#' #'
#' #'@return A matrix of differential methylation values based on single individual analysis. 
#' #'
#' #'@export
#' 
#' compute_indiv_dm_table <- function(gene_list, exp_grp, data_meth, filter_indiv = "no_filter", indiv_filtering_matrix , pf_meth, pf_pos_colname, pf_chr_colname, updwn_str = 5000, slide= 0, apply_func = apply) {
#'   
#'   if (filter_indiv[1] == "no_filter") {
#'     print("all individuals for which trscr, cnv and meth sample exist will be used in the process")
#'     filter_indiv = rownames(exp_grp[exp_grp$trscr == 1 & exp_grp$cnv == 1 & exp_grp$meth == 1, ])
#'   }
#'   
#'   if (length(which(filter_indiv %in% colnames(data_meth))) != length(filter_indiv) ) {
#'     stop("ERROR, filter_indiv does not fit to indivuals present in the data_meth matrix")
#'   } 
#'   
#'   if (length(which(filter_indiv %in% colnames(indiv_filtering_matrix))) != length(filter_indiv)) {
#'     stop("ERROR, filter_indiv does not fit to indivuals present in the indiv_filtering_matrix")
#'   }
#'   
#'   indiv_filtering_matrix = indiv_filtering_matrix[rownames(gene_list), ]
#'   
#'   if (length(which(rownames(indiv_filtering_matrix) %in% rownames(gene_list))) != dim(indiv_filtering_matrix)[1]) {
#'     stop("ERROR, indiv_filtering_matrix probes does not correspond to the probes present in the data_meth matrix")
#'   }
#'   
#'   #get probes associated with genes
#'   probe_idx = apply_func(gene_list, 1, get_probe_names, pf_meth = pf_meth, pf_chr_colname = pf_chr_colname, pf_pos_colname = pf_pos_colname, up_str=updwn_str+slide, dwn_str=updwn_str+slide)
#'   idx = sapply(probe_idx, length) > 0
#'   probes = probe_idx[idx]
#'   probes = unique(unlist(probe_idx))
#'   names(probes) = NULL
#'   
#'   #generate a matrix of reference individuals for each probe of interest
#'   ctrl_matrix_gene = (indiv_filtering_matrix[, filter_indiv]  == 0) + 0
#'   ctrl_matrix_probe = data_meth[probes, filter_indiv] * 0
#'   for (gene in rownames(gene_list)) {
#'     for (probe in probe_idx[[gene]]){
#'       ctrl_matrix_probe[probe, ] =  ctrl_matrix_gene[gene, ]
#'     }
#'   }
#'   
#'   #calculate differential methylation
#'   meth_value = data_meth[probes, filter_indiv]
#'   meth_value[is.na(meth_value)] <- 0
#'   ctrl_matrix_probe[is.na(ctrl_matrix_probe)] <- 0
#'   nb_indiv_ctrl_by_probe = rowSums(ctrl_matrix_probe)
#'   mean_ctrl= (colSums(t(meth_value) * t(ctrl_matrix_probe))) / nb_indiv_ctrl_by_probe
#'   diff_meth_data = data_meth[probes, filter_indiv] - mean_ctrl  
#'   return(diff_meth_data)
#' }
magrichard/dmprocr documentation built on July 21, 2023, 11:01 p.m.