classtpx: MAP Estimation using topic models in a semi-supervised set up where there are few samples with known topic proportions or known to belong to some cluster exclusively and there are samples for which no such information are available.

## Tools for manipulation of text count matrices ##

## converting count to frequency matrix
class.normalizetpx <- function(x, byrow=TRUE){
    if(byrow){ s <- slam::row_sums(x)
               s[s==0] <- 1
               return( x/s ) }
    else{
      s <- slam::col_sums(x)
      s[s==0] <- 1
      return(t(t(x)/s)) }
}

## converting a count/freq matrix to tfidf
stm_tfidf <- function(x){
  idf <- log( nrow(x) ) - log(slam::col_sums(x>0) + 1) 
  t( t(x) * idf )
}
    
## Dirichlet RNG
rdir <- function(n, alpha)
{
    x <- matrix(rgamma(length(alpha)*n,alpha),nrow=n,byrow=TRUE)
    return(t(x/rowSums(x))) }

class.voom_generator <- function(counts_class, 
                           class_labs,
                           doshrink=TRUE)
{
  voom_class <- voom2(counts_class);
  mean_voom_features <- apply(voom_class, 2, mean);
  voom_class_adj <- voom_class - rep.row(mean_voom_features, dim(voom_class)[1])
  model_mat <- model.matrix(~as.factor(class_labs)-1) 
  
  beta_class <- matrix(0, dim(voom_class)[2], dim(model_mat)[2]);
  sebeta_class <- matrix(0, dim(voom_class)[2], dim(model_mat)[2])
  for(k in 1:dim(model_mat)[2]){
    model_mat_temp <- cbind(model_mat[,k]);
    limma.obj <- limma::lmFit(t(voom_class_adj), model_mat_temp)
    limma.obj <- limma::eBayes(limma.obj)
    #   mean_genes_limma <- apply(limma.obj$coefficients, 1, mean)
    beta_class[,k] <- as.matrix(limma.obj$coefficients[,1]);
    sebeta_class[,k] <- limma.obj$sigma*(as.matrix(limma.obj$stdev.unscaled[,1]));
  }
  
  voom_mean_class <- t(mean_voom_features + cbind.data.frame(beta_class));
  
  if(doshrink==FALSE){
    ll <- list("voom_class"=voom_class,
               "voom_mean_class"=voom_mean_class)
    return(ll)
  }
  else{
        ash_beta_class <- do.call(cbind, lapply(1:length(unique(class_labs)), 
                                          function(l) 
                                          {
                                            if(length(which(class_labs==l))==1){
                                              return(beta_class[,l])
                                            }else{
                                              return(suppressWarnings(ashr::ash(beta_class[,l], sebeta_class[,l], 
                                                                                mixcompdist="normal")$PosteriorMean))
                                            }
                                          }));
  
      voom_shrunk_class_2 <- matrix(0, dim(counts_class)[1], dim(counts_class)[2])
  
      for(i in 1:length(unique(class_labs))){
          voom_shrunk_class_2[which(class_labs==unique(class_labs)[i]),] <- 
              voom_class[which(class_labs==unique(class_labs)[i]),] - rep.row(as.vector(beta_class[,i]), length(which(class_labs==unique(class_labs)[i]))) + rep.row(as.vector(ash_beta_class[,i]), length(which(class_labs==unique(class_labs)[i])));
        }
  
      voom_shrunk_mean_class <- t(mean_voom_features + cbind.data.frame(beta_class));
  
        ll <- list("voom_class"=voom_class,
                    "voom_shrunk_class"=voom_shrunk_class_2,
                    "voom_mean_class" = voom_mean_class,
                    "voom_shrunk_mean_class"=voom_shrunk_mean_class)
        return(ll)
  }
}

class.nonmodel_clust <- function(counts,
                                 known_samples,
                                 class_labs,
                                 method="svm",
                                 doshrink=FALSE)
{
  counts_class <- counts[known_samples,];
  out <- class.voom_generator(counts_class, class_labs, doshrink=doshrink)
  if(method=="svm"){
      if(doshrink==FALSE){
          data <- out$voom_class;
          mean_model <- out$voom_mean_class;
          sigma_model <- do.call(cbind, lapply(1:dim(data)[2], function(l) tapply(data[,l], class_labs, sd)))
    
          pooled_voom <- voom2(counts)
          test_voom <- pooled_voom[-(known_samples),];
    
          loglik_test <- matrix(0, dim(test_voom)[1], length(unique(class_labs)))
    
          for(k in 1:length(unique(class_labs))){
              loglik_test[,k] <- rowSums(t(apply(test_voom, 1, function(x) dnorm(x, mean_model[k,], sigma_model[k,], log=TRUE))))
          }
          return(loglik_test)
      }else{
            data <- out$voom_shrunk_class;
            mean_model <- out$voom_shrunk_mean_class;
            sigma_model <- do.call(cbind, lapply(1:dim(data)[2], function(l) tapply(data[,l], class_labs, sd)))
    
            pooled_voom <- voom2(counts)
            test_voom <- pooled_voom[-(known_samples),];
    
            loglik_test <- matrix(0, dim(test_voom)[1], length(unique(class_labs)))
    
            for(k in 1:length(unique(class_labs))){
                loglik_test[,k] <- rowSums(t(apply(test_voom, 1, function(x) dnorm(x, mean_model[k,], sigma_model[k,], log=TRUE))))
            }
          return(loglik_test)
      }
  }
}


class.model_clust <- function(counts, 
                              known_samples, 
                              class_labs,
                              dist= c("normal", "poisson", "negbinom"),
                              libadj=TRUE,
                              shrink=FALSE)
{
  counts_class <- counts[known_samples,]
  test_data <- counts[-(known_samples),];
  out <- class.voom_generator(counts_class, class_labs)
      if(dist=="normal"){
          if(shrink==FALSE){          
            data <- out$voom_class;
            mean_model <- out$voom_mean_class;
          }else{
            data <- out$voom_shrunk_class;
            mean_model <- out$voom_shrunk_mean_class;
          }
          sigma_model <- do.call(cbind, lapply(1:dim(data)[2], function(l) tapply(data[,l], class_labs, sd)))
    
          pooled_voom <- voom2(counts)
          test_voom <- pooled_voom[-(known_samples),];
    
          loglik_test <- matrix(0, dim(test_voom)[1], length(unique(class_labs)))
    
          for(k in 1:length(unique(class_labs))){
            loglik_test[,k] <- rowSums(t(apply(test_voom, 1, function(x) dnorm(x, mean_model[k,], sigma_model[k,], log=TRUE))))
            }
            return(loglik_test)
          }
  
      if(dist=="poisson"){
              lib_size_1 <- rowSums(counts[known_samples,]);
              lib_size <- rep(mean(lib_size_1), length(lib_size_1));
              if(shrink==FALSE){
                  if(libadj){
                       counts_shrunk_matrix <- (2^{out$voom_class - 6*log(10, base=2)})*(rep.col(lib_size+1, dim(out$voom_shrunk_mean_class)[2])) - 0.5;
                       counts_shrunk_matrix[counts_shrunk_matrix < 0]=1e-08;
                    }
                  if(!libadj){
                       counts_shrunk_matrix <- (2^{out$voom_class - 6*log(10, base=2)})*(rep.col(lib_size_1+1, dim(out$voom_shrunk_mean_class)[2])) - 0.5;
                       counts_shrunk_matrix[counts_shrunk_matrix < 0]=1e-08;
                  }
              }else{
                  if(libadj){
                       counts_shrunk_matrix <- (2^{out$voom_shrunk_class - 6*log(10, base=2)})*(rep.col(lib_size+1, dim(out$voom_shrunk_mean_class)[2])) - 0.5;
                       counts_shrunk_matrix[counts_shrunk_matrix < 0]=1e-08;
                  }
                  if(!libadj){
                       counts_shrunk_matrix <- (2^{out$voom_shrunk_class - 6*log(10, base=2)})*(rep.col(lib_size_1+1, dim(out$voom_shrunk_mean_class)[2])) - 0.5;
                       counts_shrunk_matrix[counts_shrunk_matrix < 0]=1e-08;
                  }
               }
              df <- data.frame(class_labs, counts_shrunk_matrix)
              model_lambda <- apply(counts_shrunk_matrix, 2, function(x) return (tapply(x, class_labs, mean)));
              cat("Loglik calc")
              loglik_test <- matrix(0, dim(test_data)[1], length(unique(class_labs)))
              for(k in 1:length(unique(class_labs))){
                loglik_test[,k] <- rowSums(t(apply(test_data, 1, function(x) dpois(x, as.numeric(model_lambda[k,]), log=TRUE))))
                }
                return(loglik_test)
            }
  
      if(dist=="negbinom"){
              lib_size_1 <- rowSums(counts[known_samples,]);
              lib_size <- rep(mean(lib_size_1), length(lib_size_1));
              if(shrink==FALSE){
                if(libadj){
                  counts_shrunk_matrix <- (2^{out$voom_class - 6*log(10, base=2)})*(rep.col(lib_size+1, dim(out$voom_shrunk_mean_class)[2])) - 0.5;
                  counts_shrunk_matrix[counts_shrunk_matrix < 0]=1e-08;
                }
                if(!libadj){
                  counts_shrunk_matrix <- (2^{out$voom_class - 6*log(10, base=2)})*(rep.col(lib_size_1+1, dim(out$voom_shrunk_mean_class)[2])) - 0.5;
                  counts_shrunk_matrix[counts_shrunk_matrix < 0]=1e-08;
                }
              }else{
                if(libadj){
                  counts_shrunk_matrix <- (2^{out$voom_shrunk_class - 6*log(10, base=2)})*(rep.col(lib_size+1, dim(out$voom_shrunk_mean_class)[2])) - 0.5;
                  counts_shrunk_matrix[counts_shrunk_matrix < 0]=1e-08;
                }
                if(!libadj){
                  counts_shrunk_matrix <- (2^{out$voom_shrunk_class - 6*log(10, base=2)})*(rep.col(lib_size_1+1, dim(out$voom_shrunk_mean_class)[2])) - 0.5;
                  counts_shrunk_matrix[counts_shrunk_matrix < 0]=1e-08;
                }
              }

              out_mean <- apply(counts_shrunk_matrix, 2, function(x) return (tapply(x, class_labs, mean)));
              out_var <- apply(counts_shrunk_matrix, 2, function(x) return (tapply(x, class_labs, var)));
    
              negben_p <- (out_mean/out_var);
              negben_p[negben_p > 1 ]=0.9999
              negben_p[negben_p < 0] = 0.0001
    
              negben_r <- ceiling(((negben_p)/(1-negben_p))*out_mean)
              
              cat("Loglik calc")
              loglik_test <- matrix(0, dim(test_data)[1], length(unique(class_labs)))
              for(k in 1:length(unique(class_labs))){
                    loglik_test[,k] <- rowSums(t(apply(test_data, 1, function(x) dnbinom(x, as.numeric(negben_r[k,]), as.numeric(negben_p[k,]), log=TRUE))))
              }
            return(loglik_test)
        }
}


chunk <- function(x,n) split(x, factor(sort(rank(x)%%n)))

rep.row<-function(x,n){
  matrix(rep(x,each=n),nrow=n)
}

rep.col<-function(x,n){
  matrix(rep(x,each=n), ncol=n, byrow=TRUE)
}

voom2 <- function(counts){
  libsize.mat <- rep.col(rowSums(counts), dim(counts)[2]);
  voom.out <- log((counts+0.5), base=2) - log((libsize.mat+1), base=2)+ 6* log(10, base=2);
  return(voom.out)
}

kkdey/classtpx documentation built on May 20, 2019, 10:34 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

kkdey/classtpx
MAP Estimation using topic models in a semi-supervised set up where there are few samples with known topic proportions or known to belong to some cluster exclusively and there are samples for which no such information are available.

R/class_count.R
In kkdey/classtpx: MAP Estimation using topic models in a semi-supervised set up where there are few samples with known topic proportions or known to belong to some cluster exclusively and there are samples for which no such information are available.

R Package Documentation

Browse R Packages

We want your feedback!

kkdey/classtpx MAP Estimation using topic models in a semi-supervised set up where there are few samples with known topic proportions or known to belong to some cluster exclusively and there are samples for which no such information are available.

R/class_count.R In kkdey/classtpx: MAP Estimation using topic models in a semi-supervised set up where there are few samples with known topic proportions or known to belong to some cluster exclusively and there are samples for which no such information are available.

R Package Documentation

Browse R Packages

We want your feedback!

kkdey/classtpx
MAP Estimation using topic models in a semi-supervised set up where there are few samples with known topic proportions or known to belong to some cluster exclusively and there are samples for which no such information are available.

R/class_count.R
In kkdey/classtpx: MAP Estimation using topic models in a semi-supervised set up where there are few samples with known topic proportions or known to belong to some cluster exclusively and there are samples for which no such information are available.