# Jake Yeung
# Date of Creation: 2020-01-15
# File: ~/projects/scchic-functions/R/GlmFunctions.R
# Mostly stolen from Will Townes
# https://github.com/willtownes/scrna2019/blob/master/util/functions.R
Norm<-function(v){sqrt(sum(v^2))}
ColNorms<-function(x){
#compute the L2 norms of columns of a matrix
apply(x,2,Norm)
}
l2norms.frac <- function(factors){
l2norms <- ColNorms(factors)
l2norms.frac <- l2norms / sum(l2norms)
return(l2norms.frac)
}
# l2norms.lst <- funlapply(factors, function(jmark){
# jfactors <-glmpca.factors[[jmark]]
# l2norms <- colNorms(jfactors)
# l2norms.frac <- l2norms / sum(l2norms)
# return(signif(l2norms.frac, digits = 2) * 100)
# })
InitGLMPCAfromLDA <- function(count.mat, tm.result, dat.var.merge, covar.cname = "ncuts.var", bins.keep = 100, do.log = FALSE, svd.on.Yinit = TRUE, use.orig.sz = TRUE){
ntopics <- ncol(tm.result$topics)
Y <- count.mat
if (use.orig.sz){
size.factor <- colSums(Y)
} else {
size.factor <- NULL
}
# print("Size factor:")
# print(head(size.factor))
assertthat::assert_that(all(rownames(Y) == colnames(tm.result$terms)))
# keep variable bins
if (bins.keep > 0){
print(paste("Keeping only top bins bins.keep:", bins.keep))
bins.high.i <- as.data.frame(apply(tm.result$terms, MARGIN = 1, function(jcol) order(jcol, decreasing = TRUE)[1:bins.keep])) %>%
unlist()
bins.high <- unique(rownames(Y)[bins.high.i])
} else {
print(paste("Keeping all bins because bins.keep=", bins.keep))
bins.high <- colnames(tm.result$terms)
}
Y.filt <- Y[bins.high, ]
var.vec <- dat.var.merge[[covar.cname]]
names(var.vec) <- dat.var.merge$cell
X <- data.frame(covariate = var.vec, cell = names(var.vec)) # columns of 1s are implicit
X.reorder <- X[match(colnames(Y.filt), X$cell), ]
X.mat <- matrix(data = X.reorder$covariate, ncol = 1, byrow = TRUE, dimnames = list(X.reorder$cell, covar.cname))
# factors (U) is c by k
# loadings (V) is g by k
if (do.log){
topics.mat <- log2(tm.result$topics)
terms.mat <- log2(tm.result$terms[, bins.high])
} else {
topics.mat <- tm.result$topics
terms.mat <- tm.result$terms[, bins.high]
}
if (svd.on.Yinit){
# GLM loglink function for multinom is log( p / (1 - p) )
# V %*% t(U) on init matrix gives an estimate of p
# after estimate p / (1 - p), THEN remove mean and finally do SVD to get factors and loadings estimate
p <- t(terms.mat) %*% t(topics.mat)
logodds <- log(p / (1 - p))
# remove mean and SVD
logodds.centered <- t(scale(t(logodds), center = TRUE, scale = FALSE))
# logodds.centered.check <- sweep(logodds, MARGIN = 1, STATS = rowMeans(logodds), FUN = "-")
logodds.pca <- prcomp(t(logodds.centered), center = FALSE, scale. = FALSE, rank. = ntopics)
U.init <- logodds.pca$x # cells by k
V.init <- logodds.pca$rotation # genes by k, no need to transpose
} else {
U.init <- scale(topics.mat, center = TRUE, scale = FALSE) # c by k
V.init <- scale(terms.mat, center = TRUE, scale = FALSE) # k by g
V.init <- t(V.init) # now its g by k
}
return(list(Y.filt = Y.filt, U.init = U.init, V.init = V.init, X.mat = X.mat, size.factor = size.factor, ntopics = ntopics))
}
##### Deviance functions #####
poisson_deviance<-function(x,mu,sz){
#assumes log link and size factor sz on the same scale as x (not logged)
#stopifnot(all(x>=0 & sz>0))
2*sum(x*log(x/(sz*mu)),na.rm=TRUE)-2*sum(x-sz*mu)
}
multinomial_deviance<-function(x,p){
-2*sum(x*log(p))
}
binomial_deviance<-function(x,p,n){
term1<-sum(x*log(x/(n*p)), na.rm=TRUE)
nx<-n-x
term2<-sum(nx*log(nx/(n*(1-p))), na.rm=TRUE)
2*(term1+term2)
}
gof_func<-function(x,sz,mod=c("binomial","multinomial","poisson","geometric")){
#Let n=colSums(original matrix where x is a row)
#if binomial, assumes sz=n, required! So sz>0 for whole vector
#if poisson, assumes sz=n/geometric_mean(n), so again all of sz>0
#if geometric, assumes sz=log(n/geometric_mean(n)) which helps numerical stability. Here sz can be <>0
#note sum(x)/sum(sz) is the (scalar) MLE for "mu" in Poisson and "p" in Binomial
mod<-match.arg(mod)
fit<-list(deviance=0,df.residual=length(x)-1,converged=TRUE)
if(mod=="multinomial"){
fit$deviance<-multinomial_deviance(x,sum(x)/sum(sz))
} else if(mod=="binomial"){
fit$deviance<-binomial_deviance(x,sum(x)/sum(sz),sz)
} else if(mod=="poisson"){
fit$deviance<-poisson_deviance(x,sum(x)/sum(sz),sz)
} else if(mod=="geometric"){
if(any(x>0)) {
fit<-glm(x~offset(sz),family=MASS::negative.binomial(theta=1))
}
} else { stop("invalid model") }
if(fit$converged){
dev<-fit$deviance
df<-fit$df.residual #length(x)-1
pval<-pchisq(dev,df,lower.tail=FALSE)
res<-c(dev,pval)
} else {
res<-rep(NA,2)
}
names(res)<-c("deviance","pval")
res
}
compute_size_factors<-function(m,mod=c("binomial","multinomial","poisson","geometric")){
#given matrix m with samples in the columns
#compute size factors suitable for the discrete model in 'mod'
mod<-match.arg(mod)
sz<-Matrix::colSums(m) #base case, multinomial or binomial
if(mod %in% c("multinomial","binomial")){ return(sz) }
sz<-log(sz)
sz<-sz - mean(sz) #make geometric mean of sz be 1 for poisson, geometric
if(mod=="poisson"){ return(exp(sz)) }
sz #geometric, use log scale size factors
}
##### Null Residuals functions #####
poisson_deviance_residuals<-function(x,xhat){
#x,xhat assumed to be same dimension
#sz<-exp(offsets)
#xhat<-mu*sz
term1<-x*log(x/xhat)
term1[is.nan(term1)]<-0 #0*log(0)=0
s2<-2*(term1-(x-xhat))
sign(x-xhat)*sqrt(abs(s2))
}
binomial_deviance_residuals<-function(X,p,n){
#X a matrix, n is vector of length ncol(X)
#if p is matrix, must have same dims as X
#if p is vector, its length must match nrow(X)
if(length(p)==nrow(X)){
mu<-outer(p,n)
} else if(!is.null(dim(p)) && dim(p)==dim(X)){
mu<-t(t(p)*n)
} else { stop("dimensions of p and X must match!") }
term1<-X*log(X/mu)
# term1[is.nan(term1)]<-0 #0*log(0)=0
term1[is.na(term1)]<-0 #0*log(0)=0
nx<- t(n-t(X))
term2<-nx*log(nx/outer(1-p,n))
term2<-nx*log(nx/outer(1-p,n))
# term2[is.nan(term2)]<-0
sign(X-mu)*sqrt(2*(term1+term2))
}
# multinomial_deviance_residuals<-function(X,p,n){
# #not clear if this actually makes sense. Don't use this function!
# #X a matrix, n is vector of length ncol(X)
# #if p is matrix, must have same dims as X
# #if p is vector, its length must match nrow(X)
# if(length(p)==nrow(X)){
# mu<-outer(p,n)
# } else if(!is.null(dim(p)) && dim(p)==dim(X)){
# mu<-t(t(p)*n)
# } else { stop("dimensions of p and X must match!") }
# sign(X-mu)*sqrt(-2*X*log(p))
# }
null_residuals<-function(m,mod=c("binomial","multinomial","poisson","geometric"),type=c("deviance","pearson")){
mod<-match.arg(mod)
type<-match.arg(type)
sz<-compute_size_factors(m,mod)
if(mod %in% c("multinomial","binomial")) {
phat<-Matrix::rowSums(m)/sum(sz)
if(type=="pearson"){ #pearson resids same for multinomial as binomial
mhat<-outer(phat,sz)
return((m-mhat)/sqrt(mhat*(1-phat)))
} else { #deviance residuals
if(mod=="multinomial"){
stop("multinomial deviance residuals not implemented yet")
#return(multinomial_deviance_residuals(m,phat,sz))
} else { #binomial
return(binomial_deviance_residuals(m,phat,sz))
}
}
} else if(mod=="poisson"){
mhat<-outer(Matrix::rowSums(m)/sum(sz), sz) #first argument is "lambda hat" (MLE)
if(type=="deviance"){
return(poisson_deviance_residuals(m,mhat))
} else { #pearson residuals
return((m-mhat)/sqrt(mhat))
}
} else { #geometric
gfunc<-function(g){
fit<-glm(m[g,]~offset(sz),family=MASS::negative.binomial(theta=1))
residuals(fit,type=type)
}
return(t(vapply(1:nrow(m),gfunc,rep(0.0,ncol(m)))))
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.