R/diffSeqLogoSupport.R
In DiffLogo: DiffLogo: A comparative visualisation of biooligomer motifs

Documented in baseDistributionPwm calculatePvalue diffLogoTableConfiguration enrichDiffLogoObjectWithPvalues enrichDiffLogoTableWithPvalues pwmDivergence

##' Default configuration list for diffLogoTable
##'
##' @title Configuration object for diffLogoTable
##' @param alphabet used alphabet of type Alphabet
##' @param stackHeight function for the height of a stack at position i
##' @param baseDistribution function for the heights of the individual bases
##' @param uniformYaxis if TRUE each DiffLogo is plotted with the same scaling of the y-axis
##' @param sparse if TRUE margins are reduced and tickmarks are removed from the logo
##' @param showSequenceLogosTop if TRUE the classical sequence logos are drawn above each column of the table
##' @param enableClustering if TRUE the motifs are reordered, so that similar motifs have a small vertical and horizontal distance in the table
##' @param treeHeight the height of the plotted cluster tree above the columns of the table; set equal to zero to omit the cluster tree
##' @param margin the space reseverved for labels
##' @param ratio the ratio of the plot; this is needed to determine the margin sizes correctly
##' @param align_pwms if True, will align and extend pwms in each cell of diffLogoTable independently.
##' @param multiple_align_pwms if True, will align and extend pwms in the diffLogoTable jointly.
##' @param unaligned_penalty is a function for localPwmAlignment.
##' @param try_reverse_complement if True, alignment will try reverse complement pwms
##' @param length_normalization if True, divergence between pwms is divided by length of pwms.
##' @param numberOfPermutations number of permutations for the permutation test for the calculation of p-values
##' @return list of parameters
##' @export
##' @author Lando Andrey
##' @examples
##' diffLogoTableConfiguration(DNA)
diffLogoTableConfiguration = function(
		alphabet,
		stackHeight=shannonDivergence,
		baseDistribution=normalizedDifferenceOfProbabilities,
		uniformYaxis=TRUE,
		sparse=TRUE,
		showSequenceLogosTop=TRUE,
		enableClustering=TRUE,
		treeHeight=0.5,
		margin=0.02,
		ratio=1,
		align_pwms=FALSE,
		multiple_align_pwms=TRUE,
		unaligned_penalty=divergencePenaltyForUnaligned,
		try_reverse_complement=TRUE,
		length_normalization=FALSE,
		numberOfPermutations=100) {
    #if (!alphabet$supportReverseComplement) {
    #   try_reverse_complement = F;
    #}
    return(
      list(
    		uniformYaxis=uniformYaxis,
    		sparse=sparse,
    		showSequenceLogosTop=showSequenceLogosTop,
    		enableClustering=enableClustering,
    		treeHeight=treeHeight,
    		margin=margin,
    		ratio=ratio,
    		stackHeight=stackHeight,
    		baseDistribution=baseDistribution,
    		multiple_align_pwms=multiple_align_pwms,
    		align_pwms=align_pwms,
    		unaligned_penalty=unaligned_penalty,
    		try_reverse_complement=try_reverse_complement && alphabet$supportReverseComplement,
    		length_normalization=length_normalization,
    		numberOfPermutations=numberOfPermutations)
    )
}

getMarginsDiffLogo = function(sparse) {
	if(sparse) {
		return(c(0.3,1.2,0.1,0.1));
	} else {
		return(c(1,1.5,0.1,0.1));
	}
}

getMarginsSeqLogo = function(sparse) {
	if(sparse) {
		return (c(0.3,1.2,0.0,0.1));
	} else {
		return (c(1,1.5,0.1,0.1));
	}
}

extractNames = function(PWMs) {
	names = names(PWMs);
    if (is.null(names)) {
        names = 1:length(PWMs)
    }
    return (names);
}

revCompPwm = function (pwm) {
    if(nrow(pwm) != 4)
      stop("Can only reverse complement DNA and RNA alphabet.")
    result = pwm[nrow(pwm):1, ncol(pwm):1]
    rownames(result) = rownames(pwm)
    return(result)
}


##' Counts PWM divergence as sum of divergencies of their columns.
##'
##' @title PWM divergence
##' @param pwm_left is a PWM representation in type of matrix
##' @param pwm_right is a PWM representation in type of matrix. The result is symmetric on pwm_left and pwm_right
##' @param divergence is a Divergence function on columns.
##' @return float - sum of divergences
##' @export
pwmDivergence = function(pwm_left, pwm_right, divergence=shannonDivergence) {
    stopifnot(ncol(pwm_left) == ncol(pwm_right));
    return(sum(sapply(1:ncol(pwm_left), function(i) {
        divergence(pwm_left[,i], pwm_right[,i])$height
    })));
}

##' Generates a PWM consisting of only the uniform distribution or the given base_distribution (if defined).
##' 
##' @title Generates a PWM
##' @param pwm_length the number of positions
##' @param alphabet_length the alphabet size
##' @param base_distribution optional base distribution for each PWM position
##' @return a PWM
baseDistributionPwm = function(pwm_length, alphabet_length, base_distribution=NULL) {
    if (is.null(base_distribution)) {
       base_distribution = rep(1.0/alphabet_length, each=alphabet_length)
    }
    return(matrix(rep(base_distribution, each=length, pwm_length), nrow=alphabet_length))
}

##' Enriches a matrix of difflogo objects with p-values which quantifies the probability that two PWM-positions are from the same distribution
##'
##' @title Enriches a matrix of difflogo objects with p-values
##' @param diffLogoObjMatrix matrix of difflogo objects
##' @param sampleSizes number of sequences behind the pwms behind the given difflogo objects
##' @param stackHeight function for the calculation of a divergence measure for two probability vectors
##' @param numberOfPermutations the number of permutations to perform for the calculation of stackHeights
##' @return matrix of difflogo objects enriched with p-values
##' @export
##' @author Martin Nettling
##' @examples
##' motif_folder= "extdata/pwm"
##' motif_names = c("HepG2","MCF7","HUVEC","ProgFib")
##' motifs = list()
##' for (name in motif_names) {
##'   fileName = paste(motif_folder,"/",name,".pwm",sep="")
##'   file = system.file(fileName, package = "DiffLogo")
##'   motifs[[name]] = getPwmFromPwmFile(file)
##' }
##' sampleSizes <- c(100, 150, 200, 250)
##' names(sampleSizes) <- motif_names
##' 
##' diffLogoTableObj = prepareDiffLogoTable(motifs);
##' diffLogoTableObj$diffLogoObjMatrix = enrichDiffLogoTableWithPvalues(diffLogoTableObj$diffLogoObjMatrix, sampleSizes)
enrichDiffLogoTableWithPvalues <- function(diffLogoObjMatrix, sampleSizes, stackHeight=shannonDivergence, numberOfPermutations = 100 ) {
  if(is.null(sampleSizes))
    stop("No sample sizes given!")
  
  motifs = names(diffLogoObjMatrix);
  dim = length(motifs);
  for ( i in 1:dim) {
    for ( k in 1:dim) {
      motif_i = motifs[i];
      motif_k = motifs[k];
      if(!is.null(diffLogoObjMatrix[[motif_i]][[motif_k]])) {
        diffLogoObjMatrix[[motif_i]][[motif_k]] = enrichDiffLogoObjectWithPvalues(
                                                          diffLogoObjMatrix[[motif_i]][[motif_k]],
                                                          sampleSizes[[motif_i]],
                                                          sampleSizes[[motif_k]]
                                                    );
      }
    }
  }
  return(diffLogoObjMatrix);
}

##' Enriches a difflogo object with p-values which quantifies the probability that two PWM-positions are from the same distribution
##'
##' @title Enriches a difflogo object with p-values
##' @param diffLogoObj matrix of difflogo objects
##' @param n1 the number of sequences behind the first pwm behind the given difflogo object
##' @param n2 the number of sequences behind the second pwm behind the given difflogo object
##' @param stackHeight function for the calculation of a divergence measure for two probability vectors
##' @param numberOfPermutations the number of permutations to perform for the calculation of stackHeights
##' @return enriched difflogo object
##' @export
##' @examples 
##' motif_folder= "extdata/pwm"
##' motif_names = c("HepG2","MCF7","HUVEC","ProgFib")
##' motifs = list()
##' for (name in motif_names) {
##'   fileName = paste(motif_folder,"/",name,".pwm",sep="")
##'   file = system.file(fileName, package = "DiffLogo")
##'   motifs[[name]] = getPwmFromPwmFile(file)
##' }
##' 
##' pwm1 = motifs[[motif_names[[1]]]]
##' pwm2 = motifs[[motif_names[[2]]]]
##' n1 <- 100
##' n2 <- 100
##' diffLogoObj = createDiffLogoObject(pwm1 = pwm1, pwm2 = pwm2)
##' diffLogoObj = enrichDiffLogoObjectWithPvalues(diffLogoObj, n1, n2)
enrichDiffLogoObjectWithPvalues <- function(diffLogoObj, n1, n2, stackHeight=shannonDivergence, numberOfPermutations = 100) {
    pwm1 = diffLogoObj$pwm1
    pwm2 = diffLogoObj$pwm2
    npos = ncol(diffLogoObj$pwm1);
    pvals = rep(1,npos);
    for (j in (diffLogoObj$unaligned_from_left+1):(npos - diffLogoObj$unaligned_from_right)) {
      pvals[j] = calculatePvalue(p1 = pwm1[,j], p2 = pwm2[,j], n1 = n1, n2 = n2, stackHeight = stackHeight, numberOfPermutations = numberOfPermutations);
    }
    diffLogoObj$pvals = pvals;
    return(diffLogoObj);
} 


##' Calculates the p-value for the null-hypothesis that two given probability vectors p1, p2 calculated from n1/n2 observations arise from the same distribution
##'
##' @title p-value that two PWM-positions are from the same distribution
##' @param p1 first probability vector with one probability for each symbol of the alphabet
##' @param p2 second probability vector with one probability for each symbol of the alphabet
##' @param n1 number of observations for the calculation of p1
##' @param n2 number of observations for the calculation of p2
##' @param stackHeight function for the calculation of a divergence measure for two probability vectors
##' @param numberOfPermutations the number of permutations to perform for the calculation of stackHeights
##' @param plotGammaDistributionFit if TRUE the fit of a gamma distribution to the sampled stackHeights is plotted
##' @return a numeric p-value
##' @export
##' @importFrom stats var dgamma
##' @importFrom graphics plot lines legend
##' @author Hendrik Treutler
##' @examples
##' p1 <- c(0.2, 0.3, 0.1, 0.4)
##' p2 <- c(0.2, 0.1, 0.3, 0.4)
##' n1 <- 100
##' n2 <- 200
##' numberOfPermutations = 100
##' plotGammaDistributionFit = TRUE
##' 
##' pValue <- calculatePvalue(p1 = p1, p2 = p2, n1 = n1, n2 = n2, stackHeight = shannonDivergence, numberOfPermutations = numberOfPermutations, plotGammaDistributionFit = plotGammaDistributionFit)
calculatePvalue <- function(p1, p2, n1, n2, stackHeight=shannonDivergence, numberOfPermutations = 100, plotGammaDistributionFit = FALSE){
  ############################################################################################
  ## preconditions
  preconditionVectorSameSize(p1, p2)
  preconditionBaseDistribution(p1)
  preconditionBaseDistribution(p2)
  preconditionProbabilityVector(p1)
  preconditionProbabilityVector(p2)

  if(all(p1==p2)) {
    return(1);
  }

  if(any(is.na(n1), is.na(n2), is.null(n1), is.null(n2), !is.numeric(n1), !is.numeric(n2), n1<=0, n2<=0))
    stop("Given counts are corrupt!")
  
  ############################################################################################
  ## divergence to test
  height = stackHeight(p1 = p1, p2 = p2)
  preconditionStackHeight(height)
  observedDivergence = height$height
  
  ############################################################################################
  ## parameters
  alphabet <- 1:length(p1)
  p = (p1 + p2) / 2
  n = n1 + n2
  a = as.integer(round(p * n))
  # the rounding can cause that sum(a) != n
  n = sum(a);

  ############################################################################################
  ## permutations
  multiplier <- 1:length(alphabet)
  seed <- (sum((1 / (p1+1e-3)) * multiplier) * sum((1 / (p2+1e-3)) * multiplier)) %% 2147483647
  set.seed(seed)
  
  symbols <- unlist(sapply(X = alphabet, FUN = function(x){rep(x = x, times = a[[x]])}))
  classes <- c(rep(x = TRUE, times = n1), rep(x = FALSE, times = n - n1))

  divergences <- vector(mode = "numeric", length = numberOfPermutations)
  for(idx in 1:numberOfPermutations){
    newOrder = classes[sample(n)];
    symbols2 = symbols[newOrder]

    a1 = unlist(lapply(X = alphabet, FUN = function(x){sum(symbols2 == x)}))
    a2 = a - a1
    divergences[[idx]] = stackHeight(p1 = a1 / n1, p2 = a2 / n2)$height
  }
  maximumDivergence = max(max(divergences) * 2, observedDivergence * 2)
  
  ############################################################################################
  ## fit gamma distribution
  med.gam <- mean(divergences)
  var.gam <- var(divergences)
  rate <- med.gam/var.gam
  alpha <- ((med.gam)^2)/var.gam
  scale <- 1/rate
  
  gammaDistX <- seq(from=0, to=maximumDivergence, length.out = 10000)

  gammaDistY <- dgamma(x = gammaDistX, rate = rate, shape = alpha)
  if(gammaDistY[1] == Inf) {
    gammaDistY[1] = 0;
  }
  gammaDistY = gammaDistY / sum(gammaDistY)  
  gammaDistYcum = cumsum(gammaDistY)
  
  ############################################################################################
  ## plot
  if(plotGammaDistributionFit){
    cumSumValuesX <- sort(divergences)
    cumSumValuesY <- (1:numberOfPermutations) / numberOfPermutations
    
    plot(NA, xlim = c(0,maximumDivergence), ylim = c(0, 1), ylab = "p", xlab = "x", main = paste("p = (", paste(p, collapse = ","), "), n1 = ", n1, ", n2 = ", n2, sep = ""))
    lines(x = cumSumValuesX, y = cumSumValuesY, col = "blue")
    lines(x = gammaDistX, y = gammaDistYcum, col = "green")
    legend(x = max(divergences)/2, y = 0.3, legend = c("Empiric", paste("Gamma, k=", round(rate, digits = 2), ", theta=", round(alpha, digits = 2), sep = "")), lty = c(1, 1), col = c("blue", "green"))
  }
  
  ############################################################################################
  ## p-value
  pValue <- 1 - gammaDistYcum[[min(which(gammaDistX > observedDivergence))]]
  
  return(pValue)
}

Any scripts or data that you put into this service are public.

DiffLogo documentation built on Nov. 8, 2020, 6:09 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

DiffLogo
DiffLogo: A comparative visualisation of biooligomer motifs

R/diffSeqLogoSupport.R
In DiffLogo: DiffLogo: A comparative visualisation of biooligomer motifs

Defines functions calculatePvalue enrichDiffLogoObjectWithPvalues enrichDiffLogoTableWithPvalues baseDistributionPwm pwmDivergence extractNames getMarginsSeqLogo getMarginsDiffLogo diffLogoTableConfiguration

Documented in baseDistributionPwm calculatePvalue diffLogoTableConfiguration enrichDiffLogoObjectWithPvalues enrichDiffLogoTableWithPvalues pwmDivergence

Try the DiffLogo package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

DiffLogo DiffLogo: A comparative visualisation of biooligomer motifs

R/diffSeqLogoSupport.R In DiffLogo: DiffLogo: A comparative visualisation of biooligomer motifs

Defines functions calculatePvalue enrichDiffLogoObjectWithPvalues enrichDiffLogoTableWithPvalues baseDistributionPwm pwmDivergence extractNames getMarginsSeqLogo getMarginsDiffLogo diffLogoTableConfiguration

Documented in baseDistributionPwm calculatePvalue diffLogoTableConfiguration enrichDiffLogoObjectWithPvalues enrichDiffLogoTableWithPvalues pwmDivergence

Try the DiffLogo package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

DiffLogo
DiffLogo: A comparative visualisation of biooligomer motifs

R/diffSeqLogoSupport.R
In DiffLogo: DiffLogo: A comparative visualisation of biooligomer motifs