inst/aoas_simulations/common.R

##
## export  OPENBLAS_NUM_THREADS=1
MC_CORES <- as.numeric(Sys.getenv('SLURM_CPUS_ON_NODE'))
if(is.na(MC_CORES)) MC_CORES <- 4

options(mc.cores=MC_CORES)

precenter <- function(samp){
    apply(samp, 2, function(x){
        xI <- abs(x)>0
        x[xI] <- scale(x[xI], scale=FALSE)
        x
    })
}

library(devtools)
library(MAST)
library(parallel)
library(Matrix)
library(data.table)
library(RColorBrewer)
library(Mus.musculus)
load_all('../../')

process_aracne <- function(mat, n=50){
    if(any(is.na(mat))) warning(sum(is.na(mat)), " NAs found in aracne estimate")
    mat[is.na(mat)] <- 0
    path <- seq(from=min(mat), to=max(mat), length.out=n)
    adjMat <- lapply(path, function(x) Matrix::Matrix((mat>x)*1,sparse=TRUE))
    list(adjMat=adjMat, lambda=path, BIC=rep(NA, length(path)))
}

fitSomeModels <- function(samp, models=c('hurdle', 'logistic', 'gaussian', 'gaussianRaw', 'aracne'), fixed=NULL, parallel=TRUE, control=list(debug=1, maxrounds=1000, tol=1e-3), lambda.min.ratio=.2, nlambda=100, checkpointDir=NULL, keepNodePaths=TRUE){
    samp1c <- t(assay(samp))
    sampCenter <- precenter(t(assay(samp)))
    out <- setNames(as.list(models), models)
    if('hurdle' %in% models){
        hCNgo <- fitHurdle(sampCenter, parallel=parallel, control=control, makeModelArgs=list(scale=FALSE, fixed=fixed, center=TRUE, conditionalCenter=TRUE), lambda.min.ratio=lambda.min.ratio, nlambda=nlambda, checkpointDir=checkpointDir, keepNodePaths=keepNodePaths)
        mCNgo <- interpolateEdges(hCNgo)
        out$hurdle <- mCNgo
        out$hurdle$paths <- attr(hCNgo, 'nodePaths')
    }


    if('logistic' %in% models){
        logfit <- autoGLM(samp1c, fixed=fixed, lambda.min.ratio=lambda.min.ratio, nlambda=nlambda, parallel=parallel)
        out$logistic <- interpolateEdges(logfit)
    }

    if('gaussian' %in% models){        
        hugefit <- autoGLM(sampCenter, fixed=fixed, family='gaussian', nlambda=floor(nlambda*1.2), lambda.min.ratio=lambda.min.ratio/2, parallel=parallel)
        hugemati <- interpolateEdges(hugefit)
        out$gaussian <- hugemati
    }

    if('gaussianRaw' %in% models){        
        hugefitRaw <- autoGLM(samp1c, fixed=fixed, family='gaussian', nlambda=floor(nlambda*1.2), lambda.min.ratio=lambda.min.ratio/2, parallel=parallel)
        hugemati <- interpolateEdges(hugefitRaw)
        out$gaussianRaw <- hugemati
    }

    if('aracne' %in% models){
        resids <- if(!is.null(fixed)) resid(lm(samp1c ~ fixed)) else samp1c
        hugefitRaw <- process_aracne(netbenchmark::aracne.wrap(resids))
        hugemati <- interpolateEdges(hugefitRaw)
        out$aracne <- hugemati
    }
    
    structure(out, colnames=colnames(samp1c), rownames=rownames(samp1c))
}


loadLausanne <- function(freq=.05){
    raw <- readRDS('data_private/AllCells.rds')
    rexprs <- raw$exprs
    colnames(rexprs) <- raw$fdata$shortName
    row.names(raw$fdata) <- raw$fdata$primerid <- raw$fdata$shortname
    fl <- FromMatrix("FluidigmAssay", exprsArray = t(rexprs), cData = raw$cdata, fData = raw$fdata)
    frq1 <- freq(subset(fl, ncells==1))
    fl <<- fl[frq1>freq,]
    samp10 <<- subset(fl, ncells==10)
    samp1 <<- subset(fl, ncells==1)
}

loadMait <- function(){
tsca_mait <- thresholdSCRNACountMatrix(2^exprs(sca_mait)-1, nbins=20)
sca_mait <- addlayer(sca_mait, 'et')
layer(sca_mait) <- 'et'
exprs(sca_mait) <- tsca_mait$counts_threshold
cData(sca_mait) <- data.frame(cData(sca_mait), ngeneson=cData(sca_mait)$nGeneOn)
colnames(sca_mait) <- make.unique(fData(sca_mait)$symbolid)
sca_mait
}

namedrbindlist <- function(li, nm=NULL, fill){
    if(!is.null(nm)) names(li) <- nm
    li <- lapply(names(li), function(linm){
               x <- li[[linm]]
               x <- data.table(x)
               x[,L1:=linm]
               x
               })
    rbindlist(li, fill=fill)

}

simplify_components <- function(select, min.components, gadj) {
  clu <- components(gadj)
  goodclu <- clu$membership[select]
  badclu = which(clu$csize<min.components)
  badclu <- setdiff(badclu, goodclu)
  
  badv = clu$membership %in% badclu
  gadj <- delete.vertices(gadj, badv)
  gadj
}


genNetwork <- function(hfit, nedge=40, min.components=3,min.degree.label=2,vnames, select = numeric(), plot=TRUE, ...){
    ##interp <- length(hfit[[2]])-findInterval(nedge, rev(unlist(hfit[[2]])))
    interp <- which.min(abs(nedge-hfit$trueEdges))

    thisMat <- hfit[[1]][[interp]]
    if(!missing(vnames)) dimnames(thisMat) <- list(vnames, vnames)
    gadj <- graph.adjacency(abs(thisMat)>0, mode='max')
    sizes <- Matrix::summary(thisMat)
    sizes <- subset(sizes, i>j)[,'x']
    E(gadj)$width <- (abs(sizes)/max(abs(sizes)))^(1/4)
    E(gadj)$sign <- sign(sizes)
    gadj = simplify_components(select, min.components, gadj)
    
   
    #gadj <- igraph::simplify(gadj, edge.attr.comb='mean')
    V(gadj)$size <- igraph::degree(gadj)/sqrt(max(igraph::degree(gadj)))

    Npal <- 50
    pal <- colorRampPalette(brewer.pal(9, 'BrBG'))(Npal)
    wt <- E(gadj)$width*E(gadj)$sign
    minNeg <- min(wt[wt<0], -1e-3)*1.05
    maxPos <- max(wt[wt>0], 1e-3)*1.05
    wtkey <- c(seq(minNeg, 0, length.out=Npal/2), seq(0, maxPos, length.out=Npal/2))
    maxAll <- max(-minNeg, maxPos)
    wtkey <- seq(-maxAll, maxAll, length.out=Npal)
    paltable <- data.table(color=pal,wtkey=wtkey)
    setkey(paltable, wtkey)
    E(gadj)$color <- paltable[list(wt),color,roll=TRUE]
    V(gadj)$frame.color='blue'
    V(gadj)$label <- ifelse(igraph::degree(gadj)>min.degree.label, V(gadj)$name, '')
    V(gadj)$label.color='black'
    
    if(plot) plotNetwork(gadj, ...)
    
    structure(gadj, scale=paltable)
}

color.bar <- function(col, val, nticks=11) {
    min <- min(val)
    max <- max(val)
    scale = (length(col)-1)/(max-min)
    #dev.new(width=1.75, height=5)
    plot(c(0,2), c(min,max), type='n', bty='n', xaxt='n', xlab='', yaxt='n', ylab='', bg='white')
    ticks=sort(c(0, seq(min, max, len=nticks-1)))
    #labels <- findInterval(ticks, val)
    axis(2, round(ticks, 1), las=1)
    for (i in 1:(length(col)-1)) {
     y = (i-1)/scale + min
     rect(0,y,2,y+1/scale, col=col[i], border=NA)
    }
}


matplotNetwork <- function(plotitems, nedge=70, min.components=0, min.degree.label=-1, matplot=TRUE,  lay = igraph::layout_with_fr(g1, niter = 50), ...){
    g1 <- genNetwork(plotitems[[1]], nedge=nedge, min.components=min.components,  plot=F, ...)
    ni <- length(plotitems)
    if(matplot){
        par(mfrow=c(ceiling(sqrt(ni)), floor(sqrt(ni))), oma=c(0, 0,0,0)+.1, mar=c(0, 0, 3, 3)+.1)
    } else{
        par(oma=c(0,0, 0,0)+.1, mar=c(0, 0, 3, 0)+.1)
    }
    lapply(seq_along(plotitems), function(i){
        gn <- genNetwork(plotitems[[i]], nedge=nedge, min.components=min.components, min.degree.label=min.degree.label, plot=F, ...)
        plotNetwork(gn, layout=lay,  main=simpleCap(names(plotitems)[i]), ...)
        sum(degree(gn))/2
    })
}


plotNetwork <- function(gadj,  layout=layout_with_kk,  width.scale=2, colorbar=FALSE, colorbarx, colorbary, ...){
   
    
    plot(gadj,				#the graph to be plotted
         layout=layout,	# the layout method. see the igraph documentation for details
         vertex.label.dist=0.1,			#puts the name labels slightly off the dots
         #vertex.frame.color='blue', 		#the color of the border of the dots 
         vertex.label.font=1,			#the font of the name labels
         #vertex.label=,		#specifies the lables of the vertices. in this case the 'name' attribute is used
         vertex.label.cex=1,			#specifies the size of the font of the labels. can also be made to vary,
         edge.width=E(gadj)$width*width.scale,
         #edge.color=E(gadj)$color,
         ...)

    if(colorbar){
        scale <- attr(gadj, 'scale')
        if(missing(colorbarx)){
            colorbarx <- 1.2
            colorbary <- .9
            }
        #fields::colorbar.plot(x=colorbarx, y=colorbary, strip=scale[,wtkey], col=scale[,color], strip.width=.02, strip.length=.5)
        Hmisc::subplot(color.bar(scale[,color], scale[,wtkey]), colorbarx, colorbary, size=c(.1, 1.5))
        
        }

}

    
goCriteria <- c('GO:0003677', 'GO:0006355', #transcription factors
                'GO:0006351', #promoters
                'GO:0016070', #mRNA metabolic process
                'GO:0001190', #TF binding
                'GO:0017053') #transcription repressor
                


## Change a phrase to Capital letters
## Not vectorized.
simpleCap <- function(x) {
    x <- tolower(x)
    s <- strsplit(x, " ")[[1]]
    paste(toupper(substring(s, 1, 1)), substring(s, 2),
          sep = "", collapse = " ")
     }

## Select a graph from the interpolation object
getNedgeInterp <- function(ginterp, n){
    gidx <- which.min(abs(ginterp$trueEdges-n))
    ginterp$edgeInterp[[gidx]]
}

## Return a data.table crosswalk of vertex numbers and names
getGraphAlias <- function(graph){
    alias <-  sapply(row.names(graph), simpleCap)
    desc <- data.table(ALIAS=alias)
    desc$i <- seq_along(desc$ALIAS)
    desc
}

##' Join vertex indices, index-ALIAS crosswalk and various databases
##'
##' Merge the godb with a list of vertex indices.  Multiple GOIDs can be assigned to each ALIAS
##' (it is a bipartite matching)
##' @param idx data.table
##' @param idxcol for joining idx and alias
##' @param alias idxcol-ALIAS table
##' @param godb ALIAS-GOID table.  Object of flavor Orgdb
##' @param evidence evidence codes to subset godb
##' @param otherdb (optional) data.table with other ALIAS-GOID records. Joined and unioned.
##' @return data.table of outer join
mergeWithAliasAndGo <- function(idx, idxcol, alias, go, evidence=c('EXP', 'IDA', 'PIP', 'IMP', 'IGI', 'ISS', 'ISO', 'ISA', 'ISM', 'IGC', 'IKR', 'IRD', 'IBA', 'IBD', 'MAR')){
    idx <- merge(idx, alias, by=idxcol)
    go <- go[!is.na(GOID) & EVIDENCE %in% evidence] #excludes some computational evidences.  see http://geneontology.org/page/guide-go-evidence-codes
    setkey(go, ALIAS, GOID)
    go <- unique(go)
    setkey(go, NULL)
    idxgo <- merge(idx, go, by='ALIAS', allow.cartesian=TRUE)
    idxgo[,N_GOID:=.N, keyby=GOID]
    idxgo
}

logodds <- function(tab){
    tab[1,1]*tab[2,2]/(tab[1,2]*tab[2,1])
    }


## Also check degree and clique completion
## Need a general way to merge GO with graph
## graphstatlike -> alias -> go -> stat
edgeGoAnno <- function(ginterp, n, graph, goAlias, goTerm, background, annotate=TRUE, nulldist, ...){
    ## get edge set
    if(missing(graph)){
        graph <- getNedgeInterp(ginterp, n)
    }
    eset <- Matrix::summary(graph)
    setDT(eset)
    
    ## Number of edges in complete graph vs sampled graph
    ncomplete <- nrow(graph)^2-nrow(graph)
    nedge <- nrow(eset)

    
    ## join vertex i to GO
    alias <- getGraphAlias(graph)
    eseti <- mergeWithAliasAndGo(eset, 'i', alias, goAlias, ...)

    ## merge again  vertex j
    setnames(eseti, c('EVIDENCE', 'GOID', 'N_GOID', 'ALIAS', 'ONTOLOGY'), c('EVIDENCE.i', 'GOID.i', 'N_GOID.i', 'ALIAS.i', 'ONTOLOGY.i'))
    setnames(alias, 'i', 'j')
    esetij <- mergeWithAliasAndGo(eseti, 'j', alias, goAlias, ...)
    setnames(esetij, c('EVIDENCE', 'GOID', 'N_GOID', 'ALIAS', 'ONTOLOGY'), c('EVIDENCE.j', 'GOID.j', 'N_GOID.j', 'ALIAS.j', 'ONTOLOGY.j'))

    ## Count go pairs
    gocount <- esetij[,.(N_GOID.ij=.N),keyby=list(GOID.i, GOID.j)]
    
    ## compare to this sample to number of edges in complete graph
    ## calculate tail prob.
    gcbg <- background[gocount]
    gcbg[,phyper:=phyper(N_GOID.ij, Nij, ncomplete-Nij, nedge, lower.tail=FALSE) ,keyby=Nij] #+ dhyper(N_GOID.ij, Nij, ncomplete-Nij, nedge)
    setkey(gcbg, phyper)

    if(annotate){
        gcbgSig <- gcbg[1:nrow(nulldist)][,rank:=1:nrow(nulldist)]
        gcbgSig[,pperm:=mean(c(1, nulldist[rank,]<phyper)), keyby=rank]
        gcbgSig[,phyperAdj:=p.adjust(phyper, 'BY', n=nrow(background))]

        ## join to go terms
        setkey(goTerm, GOID)
        goTerm <- goTerm[unique(gcbgSig$GOID.i)]
        gcbgSig <- merge(gcbgSig, goTerm[,.(GOID, TERM)], by.x='GOID.i', by.y='GOID', all.x=TRUE)
        gcbgSig <- merge(gcbgSig, goTerm[,.(GOID, TERM)], by.x='GOID.j', by.y='GOID', all.x=TRUE)
        setnames(gcbgSig, c('TERM.x', 'TERM.y'), c('TERM.i', 'TERM.j'))

        ## get rid of duplicates (both pairs were concerned before)
        gcbgSig[, GOIDmin:=pmin(as.character(GOID.i), as.character(GOID.j))]
        gcbgSig[, GOIDmax:=pmax(as.character(GOID.i), as.character(GOID.j))]
        setkey(gcbgSig, GOIDmin, GOIDmax)
        gcbgSig <- unique(gcbgSig)
        gcbgSig[,':='(GOIDmin=NULL, GOIDmax=NULL)]

        gcbgSig[,':='(TERM.i=ifelse(is.na(TERM.i), GOID.i, TERM.i),
             TERM.j=ifelse(is.na(TERM.j), GOID.j, TERM.j))]
        gcbgSig[,':='(TERM.i=stringr::str_wrap(TERM.i, width=35),
                      TERM.j=stringr::str_wrap(TERM.j, width=35))]
        
        setkey(gcbgSig, phyperAdj)
        return(list(significant_sets = gcbgSig, go_definition = goTerm, significant_genes = esetij[GOID.i%in% goTerm$GOID]))
    } else{
        return(sets = gcbg)
    }
}

plotgenesee <- function(x, godb, goTerm,network, goids, additionalGenes=NULL, plot = TRUE){
    L1 <- x$L1[1]
    geneseeG <- igraph::graph.edgelist(as.matrix(x[,.(GOID.i, GOID.j)]), directed=FALSE)
    E(geneseeG)$weights <- sqrt(-log10(x$phyper)-6)
    E(geneseeG)$group <- x$L1
    if(missing(goids)){
        # take top 9 GO categories
        goids <- unique(x[Nij<7e4][1:9, c(GOID.i, GOID.j)])
    } else{
        goids <- unique(goids[GOID  %in% x$GOID.j |  GOID %in% x$GOID.i])
    }
    ## assign colors
    ncategories <- length(unique(goids$category))
    catColor <- goids[,color:=brewer.pal(ncategories, 'Set2')[.GRP],keyby=category]
    goTerm <- catColor[goTerm,,on="GOID"]
    goTerm[is.na(category), color:='white']
    goTerm[is.na(category), category:='Uncategorized']
    
    ## join genesee GOIDs to godb (and the genes)
    setkey(godb, GOID)
    sgo <- godb[goids[,.(GOID)],,on='GOID']
    sgo <- goTerm[sgo,,on='GOID']
     if(!'category' %in% names(sgo)){
        sgo[,category:=GOID]
    }
    sgo[,V:=toupper(ALIAS)]
    #sgo[,N:=.N, key=GOID]
    #sgo <- sgo[N<300,]

    ## top 10 terms no equal to unassociated set
    gselect <- unique(c(sgo$V, additionalGenes))
    networkG <- genNetwork(network, 1400, plot=F, select=gselect, min.components = -1)
    V(networkG)$size <- 3
    E(networkG)$width <- 1
    ## select vertices in graph
    setkey(sgo, V)
    sgoGraph <- sgo[names(V(networkG)),,nomatch=NA, mult='first']
    sgoGraph[is.na(color), color:='white']
    ## ## add colors and plot
    ## setkey(sgoGraph, category)
    
    ## paltable <- unique(sgoGraph[, .(category, TERM)])
    ## paltable <- paltable[sgoGraph,,on='category']
    ## #paltable <- paltable[catColor,,on='GOID']
    ## #paltable[is.na(category), ':='(color='white', category='Uncategorized')]
    ## setkey(paltable, V)
    ## paltable <- promoteWarning(paltable[names(V(networkG)),,mult='first'], 'not a multiple of replacement length')
    paltable <- sgoGraph
    V(networkG)$color <- paltable[,color]
    V(networkG)$label <- paltable[,ifelse(!is.na(category) | V %in% additionalGenes, V, '')]
    V(networkG)$label.color <- paltable[,ifelse(V %in% additionalGenes, 'red', 'black')]
    sub_networkG = simplify_components(gselect, min.components = 3, networkG)
    sub_networkG <- delete.vertices(sub_networkG, igraph::degree(sub_networkG)<1)
    
    plotNetwork(sub_networkG, layout=layout_with_fr, vertex.label.cex=1)
    setkey(goTerm, color)
    leg <- unique(goTerm, by='category')
    legend('bottomright', fill=leg[,color], legend=leg[,category])

    setkey(goTerm, GOID)
    geneseeGtable <- goTerm[names(V(geneseeG)),,nomatch=NA, mult='first']
    V(geneseeG)$label <- ifelse(is.na(geneseeGtable$TERM), '', stringr::str_wrap(geneseeGtable$TERM, 40))
    geneseeG <- igraph::simplify(geneseeG, remove.multiple = TRUE, remove.loops = FALSE)
    plot(geneseeG, vertex.size=3, vertex.label.dist=.2, edge.width=E(geneseeG)$weights, layout=layout_with_kk, vertex.color=geneseeGtable$color, vertex.label.cex=1)
    legend('bottomright', fill=leg[,color], legend=leg[,category])
    invisible(networkG)

}
amcdavid/HurdleNormal documentation built on May 14, 2022, 11:12 p.m.