enrichMiR: enrichMiR: miRNA target enrichment analysis

Documented in dround getHumanMirExp getMouseMirExp matchMirExpr recapitalizeGenes recapitalizeMiRs

.plMergeList <- function(ll,...){
  if(!is.null(names(ll))) for(i in 1:length(ll)){
    colnames(ll[[i]]) <- paste(names(ll)[i],colnames(ll[[i]]),sep=".")
  }
  while(length(ll)>1){
    x <- merge(ll[[1]],ll[[2]],by="row.names",...)
    row.names(x) <- x[,1]
    ll[[1]] <- x[,-1]
    ll[[2]] <- NULL
  }
  ll[[1]]
}

#' recapitalizeGenes
#'
#' A utility function to reformat gene names.
#'
#' @param x A character vector or an array (in which case the function will be 
#' applied to row.names)
#' @param gformat The gene format, either 'human' (all caps) or 'mouse' (first 
#' letter capitalized)
#' 
#' @return An object of the same type and dimensions as `x`.
#' 
#' @importFrom tools toTitleCase
#' @export
recapitalizeGenes <- function(x, gformat="mouse"){
  gformat <- match.arg(gformat, choices=c("human","mouse"))
  if(is(x,'data.frame') | is(x,'matrix')){
    row.names(x) <- recapitalizeGenes(row.names(x), gformat)
    return(x)
  }
  switch(gformat,
    mouse=tools::toTitleCase(tolower(x)),
    human=sapply(x,toupper)
  )
}

#' recapitalizeMiRs
#'
#' A utility function to reformat miRNA names so that there are no capital 
#' letters except the 'miR-'.
#'
#' @param x A character vector or an array (in which case the function will be 
#' applied to row.names)
#' 
#' @return An object of the same type and dimensions as `x`.
#'
#' @export
recapitalizeMiRs <- function(x){
  if(is(x,'data.frame') | is(x,'matrix')){
    if(is.null(row.names(x)))
      stop( "Given an array without row.names... please apply to the column ",
            "containing the miRNA names.")
    row.names(x) <- recapitalizeMiRs(row.names(x))
    return(x)
  }
  x <- gsub("mir-","miR-",tolower(x))
}

.cleanMiRname <- function(x){ paste(strsplit(x,"-",fixed=T)[[1]][-1],collapse="-") }


.dea2binary <- function( dea, th=0.05, th.alfc=0, min.at.th=20, alt.top=50, 
                         restrictSign=NULL, verbose=TRUE ){
  dea <- .homogenizeDEA(dea)
  if(!is.null(restrictSign)){
    if(!(restrictSign %in% c(-1,1)))
      stop("`restrictSign`, if given, should be -1 or 1.")
    dea$FDR[sign(dea$logFC)!=restrictSign] <- 1
  }
  dea <- dea[order(dea$FDR),]
  if("PValue" %in% colnames(dea))
    dea <- dea[order(dea$FDR, dea$PValue),]
  if(sum(bi <- (dea$FDR<=th & abs(dea$logFC)>=th.alfc),na.rm=TRUE) < min.at.th){
    if(verbose) message("Insufficient genes passing the defined FDR; will use",
                        "the top ", alt.top, " genes.")
    x <- rep(c(TRUE,FALSE), c(alt.top,nrow(dea)-alt.top))
  }else{
    x <- bi
  }
  names(x) <- row.names(dea)
  x
}


.dea2sig <- function( dea, field=NULL ){
  if(is.null(field)){
    dea <- .homogenizeDEA(dea)
    if("PValue" %in% colnames(dea)){
      x <- sign(dea$logFC)*-log10(dea$PValue)
    }else{
      x <- sign(dea$logFC)*-log10(dea$FDR)
    }
  }else{
    dea <- dea[!is.na(dea[[field]]),]
    x <- dea[[field]]
  }
  names(x) <- row.names(dea)
  x
}

.homogenizeDEA <- function(x, keepTop=TRUE){
  if(is(x,"data.table")){
    if(any(duplicated(x[[1]]))){
      if(keepTop){
        x <- x[order(x[[head(grep("padj|adj\\.P\\.Val|q_value|qval|FDR", colnames(x)),1)]]),]
        x <- x[!duplicated(x[[1]]),]
      }else{
        x <- aggregate(x[,-1,drop=FALSE], by=list(gene=x[[1]]), FUN=mean)
      }
    }
    x <- x[!is.na(x[[1]]),]
    x <- data.frame(row.names=x[[1]], as.data.frame(x[,-1,drop=FALSE]))
  }
  x <- as.data.frame(x)
  if(length(w <- grep("^ENS",row.names(x)))>0)
    row.names(x)[w] <- gsub("\\..*","",row.names(x)[w])

  colnames(x) <- gsub("log2FoldChange|log2Fold|log2FC|log2\\(fold_change\\)|log2\\.fold_change\\.",
                      "logFC", colnames(x))
  
  abf <- colnames(df)[which(colnames(df) %in% c("meanExpr", "AveExpr", 
                                                "baseMean", "logCPM"))]
  if (length(abf) == 1) {
    x$meanExpr <- x[, abf]
    if (abf == "baseMean") 
      x$meanExpr <- log(x$meanExpr + 1)
  }else if(all(c("value_1","value_2") %in% colnames(x))){ # cufflinks
    x$meanExpr <- log(1+x$value_1+x$value_2)
  }
  colnames(x) <- gsub("P\\.Value|pvalue|p_value|pval", "PValue", colnames(x))
  colnames(x) <- gsub("padj|adj\\.P\\.Val|q_value|qval", "FDR", colnames(x))
  if (!("FDR" %in% colnames(x))) 
    x$FDR <- p.adjust(x$PValue, method = "fdr")
  f <- grep("^logFC$",colnames(x),value=TRUE)
  if(length(f)==0) f <- grep("logFC",colnames(x),value=TRUE)
  if(length(f)==0) warning("No logFC found.")
  if(length(f)>1){
    message("Using ",f[1])
    x[["logFC"]] <- x[[f[1]]]
  }
  x <- x[!is.na(x$logFC) & !is.na(x$FDR),]
  x
}


#' dround
#'
#' Trim to a certain number of digits (similar to `format(...,digits=digits)`, 
#' except that the output remains numeric)
#'
#' @param x A vector of numeric values, or a data.frame or similar
#' @param digits The number of digits to keep
#' @param roundGreaterThan1 Whether to trim also numbers greater than 1 (default FALSE)
#'
#' @return A object of same dimensions as `x`
#' @export
#'
#' @examples
#' dround( c(0.00002345, 554356, 12.56) )
dround <- function(x, digits=3, roundGreaterThan1=FALSE){
  if(!is.null(dim(x))){
    for(i in seq_len(ncol(x))){
      if(!is.integer(x[,i]) && is.numeric(x[,i])){
        tryCatch({
          x[,i] <- dround(as.numeric(x[,i]), digits, roundGreaterThan1)
        }, error=function(e) warning(e))
      }
    }
    return(x)
  }  
  if(roundGreaterThan1){
    w <- 1:length(x)
  }else{
    w <- which(abs(x)<1)
  }
  if(length(w)==0) return(x)
  e <- ceiling(-log10(abs(x[w])))
  y <- x
  y[w] <- round(10^e*x[w],digits-1)/10^e
  y[x==0] <- 0
  y
}



.agDF <- function(df, new.rn, 
                  match.names=(!is.null(names(new.rn)) && length(df)!=length(new.rn))){
  if(match.names){
    names(a) <- a <- row.names(df)
    a[names(new.rn)] <- as.character(new.rn)
    new.rn <- a[row.names(df)]
  }
  if(is.null(names(new.rn))) return(df)
  tt <- split(names(new.rn),new.rn)
  tt <- data.frame(row.names=names(tt),
                   members=sapply(tt, FUN=function(x) paste(sort(x),collapse=";")))
  if(ncol(df)==0) return(data.frame(row.names=unique(new.rn)))
  if(!any(duplicated(new.rn))){
    row.names(df) <- new.rn
    df <- merge(df,tt,by = 0, all.x = TRUE)
    row.names(df) <- df$Row.names
    return(df[,-c(1),drop=FALSE])
  }
  df <- aggregate(df, by=list(RN=new.rn), FUN=function(x){
    if(is.factor(x)) x <- as.character(x)
    if(length(x)==1) return(x)
    if(is.numeric(x)) return(x[which.max(abs(x))])
    paste(sort(x), collapse=";")
  })
  row.names(df) <- df[,1]
  df <- merge(df,tt,by = 0, all.x = TRUE)
  row.names(df) <- df$Row.names
  df[,-c(1,2),drop=FALSE]
}

.filterTranscripts <- function(x, minProp=0.9, minLogCPM=1){
  if(!all(c("transcript","gene","logCPM") %in% colnames(x))) stop("Malformed input.")
  gs <- rowsum(exp(x$logCPM), x$gene)
  x[ (exp(x$logCPM)/gs[x$gene,1]) > minProp & x$logCPM>minLogCPM, ]
}

# triggers an error if the sets are not formatted correctly, and return a 
# logical indicating whether the sets are in matrix format
.checkSets <- function(sets, requiredColumns=c(), matrixAlternative=FALSE){
  if( !isFALSE(matrixAlternative) && 
      (is.matrix(sets) || is(sets,"sparseMatrix")) ){
    matrixAlternative <- match.arg(matrixAlternative, c("logical","numeric"))
    if(is(sets,"sparseMatrix")){
      if(matrixAlternative=="logical") stopifnot(is(sets,"lgCMatrix"))
      if(matrixAlternative=="numeric") stopifnot(is(sets,"dgCMatrix"))
    }else{
      if(matrixAlternative=="logical") stopifnot(is.logical(sets))
      if(matrixAlternative=="numeric") stopifnot(is.numeric(sets))
    }
    return(TRUE)
  }
  stopifnot(is.data.frame(sets) || is(sets, "DFrame"))
  stopifnot(c("feature","set",requiredColumns) %in% colnames(sets))
  FALSE
}

.is.matrix <- function(x){
  is.matrix(x) || is(x,"sparseMatrix") || is(x,"DelayedArray")
}

#' @importFrom S4Vectors DataFrame metadata metadata<- 
.list2DF <- function(sets){
  if(!is.null(dim(sets)) && !all(c("feature","set") %in% colnames(sets)))
    stop("Malformed `sets`.")
  if(is(sets, "DataFrame")) return(sets)
  if(is.data.frame(sets)){
    at <- attributes(sets)
    sets <- DataFrame(sets)
    for(f in intersect(at, c("sets.properties","feature.synonyms")))
      metadata(sets)[[f]] <- at[[f]]
    return(sets)
  }
  if(is.null(names(sets))) stop("The sets should be named!")
  if(is.list(sets[[1]])){
    if(all(names(sets[[1]]) %in% c("tfmode","likelihood"))){
      # regulon object
      sets <- lapply(sets, FUN=function(x){
        data.frame(feature=names(x[[1]]), score=x$tfmode*x$likelihood)
      })
      return(cbind(set=rep(names(sets),sapply(sets,nrow)),
                   do.call(rbind, sets)))
    }else{
      stop("`sets` has an unknown format")
    }
  }
  y <- DataFrame( set=factor(rep(sets, lengths(sets))) )
  if(!is.null(names(sets[[1]])) && is.numeric(sets[[1]])){
    y$feature <- unlist(lapply(sets, names))
    y$score <- unlist(lapply(sets, as.numeric))
  }else{
    y$feature <- unlist(lapply(sets, as.character))
  }
  y$feature <- as.factor(y$feature)
  y
}




#' getHumanMirExp
#'
#' Get the human miRNA expression in a given tissue or celltype from the 
#' `microRNAome` package 
#' (\href{http://genome.cshlp.org/content/27/10/1769}{McCall et al., 2017}).
#'
#' @param x Desired tissue or celltype
#'
#' @return If `x` is NULL, returns a vector of the possible values. If `x` is 
#' given and matches a tissue/celltype of the dataset, returns the average miRNA
#' expression as logCPM, i.e. log(1+counts per million).
#' @export
#'
#' @examples
#' head(getHumanMirExp("thyroid"))
getHumanMirExp <- function(x=NULL){
  library(SummarizedExperiment)
  data("microRNAome", package="microRNAome")
  if(is.null(x)) return(sort(unique(microRNAome$cell_tissue)))
  if(!(x %in% microRNAome$cell_tissue)) return(NULL)
  x <- assay(microRNAome)[,microRNAome$cell_tissue==x,drop=FALSE]
  row.names(x) <- gsub("/.+","",(row.names(x)))
  x <- rowSums(x)
  x <- sort(log1p(10^6 * x/sum(x)),decreasing=TRUE)
  x[!duplicated(names(x))]
}

#' getMouseMirExp
#'
#' Get the mouse miRNA expression in a given tissue or celltype, based on the
#' GSE119661 dataset 
#' (\href{https://doi.org/10.1093/nar/gkaa323}{Kern et al., 2020}) 
#' supplemented with some celltypes from the GSE30286 dataset 
#' (\href{https://doi.org/10.1016/j.neuron.2011.11.010}{He et al., 2012}).
#'
#' @param x Desired tissue or celltype
#'
#' @return If `x` is NULL, returns a vector of the possible values. If `x` is 
#' given and matches a tissue/celltype of the dataset, returns the average miRNA
#' expression as logCPM, i.e. log(1+counts per million).
#' @export
#'
#' @examples
#' head(getMouseMirExp("Muscle"))
getMouseMirExp <- function(x=NULL){
  data("miRNAexpMouse", package="enrichMiR")
  if(is.null(x)) return(sort(colnames(miRNAexpMouse)))
  if(!(x %in% colnames(miRNAexpMouse))) return(NULL)
  x <- sort(miRNAexpMouse[,x], decreasing=TRUE)
  x[!duplicated(names(x))]
}

.addBestType <- function(x){
  stopifnot(!is.null(dim(x)))
  if(!is.null(x$best_stype)) return(x)
  if(length(w <- grep("[6-8]mer", colnames(x)))<2) return(x)
  x$best_stype <- apply(as.matrix(x[,w]), 1, FUN=function(y){
    if(length(i <- which(as.numeric(y)>0))==0) return(0)
    head(i,1)
  })
  x$best_stype <- factor(x$best_stype, levels=c(0L, rev(seq_along(w))), 
                         labels=c("no site",rev(gsub("Sites_","",colnames(x)[w]))))
  levels(x$best_stype) <- gsub("_","-",levels(x$best_stype))
  x
}

#' @importFrom data.table as.data.table setkey
.aggregateByFamilies <- function(ts, propTolerated=0.05){
  md <- metadata(ts)
  fam <- md$families
  if(is.null(md$families)) return(ts)
  prop <- sum(levels(ts$set) %in% md$families)
  if(prop>=(1-propTolerated)) return(ts)
  if(length(missng <- setdiff(levels(ts$set), names(md$families)))>0){
    warning("Some sets had no indicated family!")
    md$families <- c(
      setNames(as.character(md$families),names(md$families)),
      setNames(missng,missng))
  }
  levels(ts$set) <- as.character(md$families[levels(ts$set)])
  dt <- as.data.table(as.data.frame(ts))
  rm(ts)
  dt <- setkey(dt, set, feature)
  dt <- unique(dt[,c("set","feature","sites")], by=c("set","feature"))
  ts <- DataFrame(dt)
  rm(dt)
  metadata(ts) <- md
  ts
}



#' matchMirExpr
#' 
#' Tries to match input miRNA expression with given names, eventually adding the
#' -3p/-5p.
#'
#' @param me A named vector of miRNA expression, or data.frame with an 
#' 'expression' column
#' @param setsNames An optional vector of known sets names, or an actual target
#' annotation.
#' @param restrict Logical indicating whether to restrict `me` to elements 
#' included in `setNames` (after re-formating)
#'
#' @return A data.frame of miRNA expressions.
matchMirExpr <- function(me, setsNames=NULL, restrict=TRUE){
  if(is.vector(me)) me <- data.frame(row.names=names(me), expression=me)
  row.names(me) <- gsub("mir","miR", row.names(me))
  if(!any(grepl("miR-",row.names(me)))) return(me)
  w <- grep("-5p$|-3p$", row.names(me), invert=TRUE)
  if(length(w)==0) return(me)
  me2 <- data.frame(row.names=paste0(rep(row.names(me)[w],each=2),
                                     c("-5p","-3p")),
                    expression=rep(me$expression[w],each=2))
  me <- rbind(me,me2[setdiff(row.names(me2),row.names(me)),,drop=FALSE])
  if(!is.null(setsNames)){
    if(!is.vector(setsNames)){
      if(is(setsNames, "DFrame")){
        fams <- metadata(setsNames)$families
        setsNames <- as.character(unique(setsNames$set))
        if(!is.null(fams)) setsNames <- c(setsNames, unlist(fams), names(fams))
      }else if(is.list(setsNames)){
        setsNames <- names(setsNames)
      }else if(is.matrix(setsNames)){
        setsNames <- colnames(setsNames)
      }
    }
    nn <- setNames(gsub("-[1-9]-","-",row.names(me)), row.names(me))
    nn <- nn[nn %in% setsNames & !(nn %in% row.names(me)) & !duplicated(nn)]
    if(length(nn)>0){
      me2 <- data.frame(row.names=nn, expression=me[names(nn),"expression"])
      me <- rbind(me,me2)
    }
    if(restrict) me <- me[row.names(me) %in% setsNames,,drop=FALSE]
  }
  me
}

ETHZ-INS/enrichMiR documentation built on July 20, 2024, 12:03 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

ETHZ-INS/enrichMiR
enrichMiR: miRNA target enrichment analysis

R/misc.R
In ETHZ-INS/enrichMiR: enrichMiR: miRNA target enrichment analysis

Defines functions matchMirExpr .aggregateByFamilies .addBestType getMouseMirExp getHumanMirExp .list2DF .is.matrix .checkSets .filterTranscripts .agDF dround .homogenizeDEA .dea2sig .dea2binary .cleanMiRname recapitalizeMiRs recapitalizeGenes .plMergeList

Documented in dround getHumanMirExp getMouseMirExp matchMirExpr recapitalizeGenes recapitalizeMiRs

R Package Documentation

Browse R Packages

We want your feedback!

ETHZ-INS/enrichMiR enrichMiR: miRNA target enrichment analysis

R/misc.R In ETHZ-INS/enrichMiR: enrichMiR: miRNA target enrichment analysis

Defines functions matchMirExpr .aggregateByFamilies .addBestType getMouseMirExp getHumanMirExp .list2DF .is.matrix .checkSets .filterTranscripts .agDF dround .homogenizeDEA .dea2sig .dea2binary .cleanMiRname recapitalizeMiRs recapitalizeGenes .plMergeList

Documented in dround getHumanMirExp getMouseMirExp matchMirExpr recapitalizeGenes recapitalizeMiRs

R Package Documentation

Browse R Packages

We want your feedback!

ETHZ-INS/enrichMiR
enrichMiR: miRNA target enrichment analysis

R/misc.R
In ETHZ-INS/enrichMiR: enrichMiR: miRNA target enrichment analysis