R/customKegg.R

Defines functions customKegg

#'
#' data es un dataframe donde la primera columna son loa symbol y la segunda
#' los entrez id

customKegg <- function(data, universe = NULL, restrict.universe = FALSE,
    species = "Hs", species.KEGG = "hsa", convert = FALSE, gene.pathway = NULL,
    pathway.names = NULL, prior.prob = NULL, covariate = NULL,
    plot = FALSE, ...) {
    if( !is.data.frame(data)){
        stop("de should be a data.frame with firt column as symbol Id and second
             column as entrez Id.")
    }
    if( dim(data)[2] != 2){
        stop("de should be a data.frame with firt column as symbol
        Id and second column as entrez Id.")
    }
    names(data) <- c("SYMBOL","ENTREZID")
    de <- data
    de <- as.vector(de[,2])
    de <- list(DE = de)
    nsets <- length(de)
    if (!all(vapply(de, is.vector, TRUE))) {
        stop("components of de should be vectors")
    }
    for (s in 1:nsets) de[[s]] <- unique(as.character(de[[s]]))
    names(de) <- trimWhiteSpace(names(de))
    NAME <- names(de)
    i <- which(NAME == "" | is.na(NAME))
    NAME[i] <- paste0("DE", i)
    names(de) <- makeUnique(NAME)
    if (is.null(species.KEGG)) {
        species <- match.arg(species, c("Ag", "At", "Bt", "Ce",
            "Dm", "Dr", "EcK12", "EcSakai", "Gg", "Hs", "Mm",
            "Mmu", "Pf", "Pt", "Rn", "Ss", "Xl"))
        species.KEGG <- switch(species, Ag = "aga", At = "ath",
            Bt = "bta", Ce = "cel", Cf = "cfa", Dm = "dme", Dr = "dre",
            EcK12 = "eco", EcSakai = "ecs", Gg = "gga", Hs = "hsa",
            Mm = "mmu", Mmu = "mcc", Pf = "pfa", Pt = "ptr",
            Rn = "rno", Ss = "ssc", Xl = "xla")
    }
    if (is.null(gene.pathway)) {
        GeneID.PathID <- getGeneKEGGLinks(species.KEGG, convert = convert)
    } else {
        GeneID.PathID <- gene.pathway
        d <- dim(GeneID.PathID)
        if (is.null(d)) {
            stop("gene.pathway must be data.frame or matrix")
        }
        if (d[2] < 2) {
            stop("gene.pathway must have at least 2 columns")
        }
        isna <- rowSums(is.na(GeneID.PathID[, 1:2])) > 0.5
        GeneID.PathID <- GeneID.PathID[!isna, ]
        ID.ID <- paste(GeneID.PathID[, 1], GeneID.PathID[, 2],
            sep = ".")
        if (anyDuplicated(ID.ID)) {
            d <- duplicated(ID.ID)
            GeneID.PathID <- GeneID.PathID[!d, ]
        }
    }
    if (is.null(pathway.names)) {
        PathID.PathName <- getKEGGPathwayNames(species.KEGG,
            remove.qualifier = TRUE)
    } else {
        PathID.PathName <- pathway.names
        d <- dim(PathID.PathName)
        if (is.null(d)) {
            stop("pathway.names must be data.frame or matrix")
        }
        if (d[2] < 2) {
            stop("pathway.names must have at least 2 columns")
        }
        isna <- rowSums(is.na(PathID.PathName[, 1:2])) > 0.5
        PathID.PathName <- PathID.PathName[!isna, ]
    }
    if (is.null(universe)) {
        universe <- unique(GeneID.PathID[, 1])
        prior.prob <- covariate <- NULL
    } else {
        universe <- as.character(universe)
        lu <- length(universe)
        if (!lu) {
            stop("No genes in universe")
        }
        if (!is.null(prior.prob) && length(prior.prob) != lu) {
            stop("universe and prior.prob must have same length")
        }
        if (!is.null(covariate) && length(covariate) != lu) {
            stop("universe and covariate must have same length")
        }
        if (restrict.universe) {
            i <- universe %in% GeneID.PathID[, 1]
            universe <- universe[i]
            if (!is.null(prior.prob)) {
                prior.prob <- prior.prob[i]
            }
            if (!is.null(covariate)) {
                covariate <- covariate[i]
            }
        }
    }
    if (anyDuplicated(universe)) {
        d <- duplicated(universe)
        if (!is.null(covariate)) {
            covariate <- rowsum(covariate, group = universe,
                reorder = FALSE)
            n <- rowsum(rep_len(1L, length(universe)), group = universe,
                reorder = FALSE)
            covariate <- covariate/n
        }
        if (!is.null(prior.prob)) {
            prior.prob <- rowsum(prior.prob, group = universe,
                reorder = FALSE)
            n <- rowsum(rep_len(1L, length(universe)), group = universe,
                reorder = FALSE)
            prior.prob <- prior.prob/n
        }
        universe <- universe[!d]
    }
    NGenes <- length(universe)
    if (NGenes < 1L) {
        stop("No annotated genes found in universe")
    }
    for (s in 1:nsets) de[[s]] <- de[[s]][de[[s]] %in% universe]
    i <- GeneID.PathID[, 1] %in% universe
    if (sum(i) == 0L) {
        stop("Pathways do not overlap with universe")
    }
    GeneID.PathID <- GeneID.PathID[i, ]
    if (!is.null(covariate)) {
        if (!is.null(prior.prob)) {
            message("prior.prob being recomputed from covariate")
        }
        covariate <- as.numeric(covariate)
        isDE <- (universe %in% unlist(de))
        o <- order(covariate)
        prior.prob <- covariate
        span <- approx(x = c(20, 200), y = c(1, 0.5), xout = sum(isDE),
            rule = 2)$y
        prior.prob[o] <- tricubeMovingAverage(isDE[o], span = span)
        if (plot) {
            barcodeplot(covariate, index = isDE, worm = TRUE,
                span.worm = span, main = "DE status vs covariate")
        }
    }
    if (is.null(prior.prob)) {
        X <- matrix(1, nrow(GeneID.PathID), nsets + 1)
        colnames(X) <- c("N", names(de))
    } else {
        names(prior.prob) <- universe
        X <- matrix(1, nrow(GeneID.PathID), nsets + 2)
        X[, nsets + 2] <- prior.prob[GeneID.PathID[, 1]]
        colnames(X) <- c("N", names(de), "PP")
    }
    for (s in 1:nsets) {
        X[, s + 1] <- (GeneID.PathID[, 1] %in% de[[s]])
    }
    S <- rowsum(X, group = GeneID.PathID[, 2], reorder = FALSE)
    PValue <- matrix(0, nrow = nrow(S), ncol = nsets)
    colnames(PValue) <- paste("P", names(de), sep = ".")
    nde <- lengths(de, use.names = FALSE)
    if (!is.null(prior.prob)) {
        SumPP <- sum(prior.prob)
        M2 <- NGenes - S[, "N"]
        Odds <- S[, "PP"]/(SumPP - S[, "PP"]) * M2/S[, "N"]
        if (!requireNamespace("BiasedUrn", quietly = TRUE)) {
            stop("BiasedUrn package required but is not installed (or can't be loaded)")
        }
        for (j in seq_len(nsets)) {
            for (i in seq_len(nrow(S))) {
                PValue[i, j] <- BiasedUrn::pWNCHypergeo(S[i,
                  1L + j], S[i, "N"], M2[i], nde[j], Odds[i],
                  lower.tail = FALSE) + BiasedUrn::dWNCHypergeo(S[i,
                  1L + j], S[i, "N"], M2[i], nde[j], Odds[i])
            }
        }
        S <- S[, -ncol(S)]
    } else {
        for (j in seq_len(nsets)) {
            PValue[, j] <- phyper(S[, 1L + j] - 0.5, nde[j],
                NGenes - nde[j], S[, "N"], lower.tail = FALSE)
        }
    }

    g <- rownames(S)
    m <- match(g, PathID.PathName[, 1])
    path <- PathID.PathName[m, 2]
    ##
    kk <- GeneID.PathID
    data <- data.frame(ENTREZID = data[,2])
    data$ENTREZID <- as.character(data$ENTREZID)
    kkb <- inner_join(kk, data, by = c(GeneID = "ENTREZID"))
    kkb <- kkb[!duplicated(kkb), ]
    # kkb <- kkb[ which( !is.na( kkb$SYMBOL ) ), ]
    kkk <- kkb %>% group_by(PathwayID) %>% summarize(genes = paste(GeneID,
        collapse = ","))
    ##

    Results <- data.frame(Pathway = path, S, PValue, stringsAsFactors = FALSE)
    rownames(Results) <- g
    Results$pathID <- rownames(Results)
    resultado <- left_join(Results, kkk, by = c(pathID = "PathwayID"))
    resultado <- resultado[which(resultado$P.DE < 0.05), ]
    return(resultado)
}
fpsanz/enrichGene documentation built on Feb. 20, 2020, 3:20 a.m.