R/hetset_analysis.R

# hetset analysis tools =======================================================

# summarize set of selected features ------------------------------------------
evaluate_set <- function(H, verbose = T){
  if (is.null(H@metadata$slf)){
    stop("No features selected: try scan_hetset(H)")
  }
  old_opt <- options()
  options(digits = 2)
  rep.sum <- data.frame("Feature" = H@metadata$slf,"sqHell.rest" = NA,"delta" = NA,"expl.dist" = NA,"rel.importance" = NA)
  for (i in 1:nrow(rep.sum)){
    H_r <- H
    H_r@metadata$slf <- H@metadata$slf[-i]
    rep.sum$sqHell.rest[i] <- calculate_dist(estimate_densities(H_r))@metadata$sqHell
    rep.sum$delta[i] <- H@metadata$sqHell - rep.sum$sqHell.rest[i]
    rep.sum$expl.dist[i] <- rep.sum$delta[i] / H@metadata$sqHell
  }
  CumDelta <- sum(rep.sum$delta)
  rep.sum$rel.importance <- rep.sum$delta / CumDelta
  rep.sum <- rep.sum[order(-rep.sum$rel.importance),]
  if(verbose){
    print(rep.sum)
  }
  options(old_opt)
  rep.sum
}

# show subpopulations
show_partition <- function(H,cutoff = 0.99,inner = 0.95){
  slf <- H@metadata$slf
  N_acc <- 10^5
  mu_A <- H@metadata$prm.A$mean
  Sigma_A <- H@metadata$prm.A$cov
  mu_B <- H@metadata$prm.B$mean
  Sigma_B <- H@metadata$prm.B$cov

  # increase robustness for Mahalanobis distance (Sigma^-1)
  diag(Sigma_A) <- diag(Sigma_A) + 10^-3 * diag(H@metadata$prm.full$cov)
  diag(Sigma_B) <- diag(Sigma_B) + 10^-3 * diag(H@metadata$prm.full$cov)

  rmah_a <- mahalanobis(
    x = mvtnorm::rmvnorm(n = N_acc,mean = mu_A,sigma = Sigma_A),
    center = mu_A,
    cov = Sigma_A)
  rmah_b <- mahalanobis(
    x = mvtnorm::rmvnorm(n = N_acc,mean = mu_B,sigma = Sigma_B),
    center = mu_B,
    cov = Sigma_B)
  xmah_a <- mahalanobis(
    x = t(assay(H)[slf,]),
    center = mu_A,
    cov = Sigma_A)
  xmah_b <- mahalanobis(
    x = t(assay(H)[slf,]),
    center = mu_B,
    cov = Sigma_B)

  rep.sum <- data.frame("ID" = colnames(H),"group" = NA,"dist.A" = NA,"dist.B" = NA,"rel" = NA)
  for (i in 1:dim(H)[2]){
    rep.sum$"dist.A"[i] <- mean(rmah_a < xmah_a[i])
    rep.sum$"dist.B"[i] <- mean(rmah_b < xmah_b[i])
    if (min(rep.sum[i,3:4]) < inner){
      d_A <- mvtnorm::dmvnorm(x = assay(H)[slf,i],mean = mu_A,sigma = Sigma_A)
      d_B <- mvtnorm::dmvnorm(x = assay(H)[slf,i],mean = mu_B,sigma = Sigma_B)
      crel <- max(d_A,d_B)/(d_A+d_B)
      rep.sum$"rel"[i] <- crel
      if(crel >= cutoff){
        rep.sum$"group"[i] <- c("A","B")[1+(d_B > d_A)]
      }
    }
  }
  if (is.null(H$ID)){
    H$ID <- colnames(H)
  }
  output <- merge.data.frame(x = rep.sum,y = SummarizedExperiment::colData(H),by = "ID")

  output
}

# fisher.test for independence of fitted subpopulations and clinical factors --
test_independence <- function(H,feature,S){
  type <- typeof(S)
  t_out <- NULL
  switch(type,
         "double" = {
           eval(parse(text =
                        paste0("t_out <- fisher.test(table(H$",
                               feature,">=",S,",H$prt))")))
         },
         "character" = {
           eval(parse(text =
                        paste0("t_out <- fisher.test(table(H$",
                               feature,"== S,H$prt))")))
         },
         "logical" = {
           eval(parse(text =
                        paste0("t_out <- fisher.test(table(H$",
                               feature,"==",S,",H$prt))")))
         })
  t_out
}

# present categorical dependencies --------------------------------------------
show_dependencies <- function(H,dep_list,S_list,type = "print"){
  N_dep <- length(dep_list)
  L_dep <- data.frame("var" = as.character(dep_list),
                      "S" = NA,
                      "OR" = NA,
                      "OR.lo" = NA,
                      "OR.up" = NA)
  for (i in 1:N_dep){
    aus <- test_independence(H = H,feature = dep_list[i],S = S_list[[i]])
    L_dep[i,2:5] <- c(S_list[[i]],
                   round(aus$estimate,digits = 2),
                   round(aus$conf.int[1:2],digits = 2))
  }
  switch(type,
         "print" = {
           print(L_dep)
         },
         "plot" = {
           l_sig <- sum(!is.na(H@metadata$slf))
           plot_type <- as.character(min(3,l_sig))
           par(new=T)
           par(mar = c(0,0,0,0),mfrow=c(1,1))
           switch(plot_type,
                  '1' = {
                    layout(mat = matrix(c(0,0,0,0,1,0),ncol = 2),
                           widths = c(4/5,1/5),
                           heights = c(0.25,0.17,0.58))
                    plot(x = c(0,1),y = c(0,1),xaxt='n',yaxt='n',type = 'n',ann=F,axes=F)
                    n_tests <- dim(L_dep)[1]
                    text(0.5,0.95,labels = "odds ratios",cex = 2)
                    for (j in 1: n_tests){
                      text(x = 0.1,y = 0.9-0.8*j/n_tests,cex = 1.5,
                           labels = substring(text = L_dep[j,1],
                                              first = 1,
                                              last = min(3,nchar(as.character(L_dep[j,1])))))
                      ausgabe <- paste0(L_dep[j,3]," in [",L_dep[j,4],"; ",L_dep[j,5],"]")
                      text(x = 0.6,y = 0.9-0.8*j/n_tests,labels = ausgabe,cex = 1.5)
                    }
                  },
                  '2' = {
                    layout(mat = matrix(c(0,0,1,0),ncol = 2),
                           widths = c(3/4,1/4),
                           heights = c(1/4,3/4))
                    par(mar = c(1,1,1,1))
                    plot(x = c(0,1),y = c(0,1),xaxt='n',yaxt='n',type = 'n',ann=F,axes=F)
                    n_tests <- dim(L_dep)[1]
                    text(0.5,0.95,labels = "odds ratios",cex = 1.5)
                    for (j in 1: n_tests){
                      text(x = 0.1,y = 0.9-0.8*j/n_tests,cex = 1.25,
                           labels = substring(text = L_dep[j,1],
                                              first = 1,
                                              last = min(3,nchar(as.character(L_dep[j,1])))))
                      ausgabe <- paste0(L_dep[j,3]," in [",L_dep[j,4],"; ",L_dep[j,5],"]")
                      text(x = 0.6,y = 0.9-0.8*j/n_tests,labels = ausgabe,cex = 1.25)
                    }
                  },
                  '3' = {
                    n_field <- ceiling(sqrt(l_sig/2*(l_sig-1)+1))
                    layout(mat = matrix(c(0,0,0,1),2),
                           widths = c((n_field-1)/n_field,1/n_field),
                           heights = c((n_field-1)/n_field,1/n_field))
                    plot(x = c(0,1),y = c(0,1),xaxt='n',yaxt='n',type = 'n')
                    n_tests <- dim(L_dep)[1]
                    text(0.5,0.95,labels = "odds ratios")
                    for (j in 1: n_tests){
                      text(x = 0.1,y = 0.9-0.8*j/n_tests,
                           labels = substring(text = L_dep[j,1],
                                              first = 1,
                                              last = min(3,nchar(as.character(L_dep[j,1])))))
                      ausgabe <- paste0(L_dep[j,3]," in [",L_dep[j,4],"; ",L_dep[j,5],"]")
                      text(x = 0.6,y = 0.9-0.8*j/n_tests,labels = ausgabe,cex = 0.5+4/l_sig)
                    }
                  })
           par(mfrow = c(1,1),mar = c(5,4,4,2)+0.1,mgp = c(3,1,0))
         })
}

# make categorical factors ----------------------------------------------------
make_partition_by_logical <- function(H,log_vec){
  if (dim(H)[2]!=length(log_vec)){
    stop(" length of log_vec must equal the number of samples in H /n")
  }
  H$prt <- NA
  H$prt <- factor(x = c("A","B")[(!log_vec)+1],levels = c("A","B"))
  H
}

# subset  ---------------------------------------------------------------------
subset_hetset <- function(H,prt = NULL,
                          keep.samples = NULL,remove.samples = NULL,
                          keep.features = NULL,remove.features = NULL){

  old_signature <- H@metadata$slf
  old_sample_size <- ncol(H)
  # reduce to subpopulation
  if (!is.null(prt)){
    if (length(prt) != 1){
      stop("prt must equal one of the subpopulations 'A' or 'B' ")
    } else {
      if (prt %in% c("A","B")){
        keep.samples <- (H$prt == prt)
        H$prt <- NA
      } else {
        stop("prt must equal one of the subpopulations 'A' or 'B' ")
      }
    }
  }

  keep.select <- rep(T,ncol(H))
  if (!is.null(keep.samples)){
    if (is.logical(keep.samples)){
      keep.select <- keep.samples
    } else {
      keep.select[!(colnames(H) %in% keep.samples)] <- F
    }
  }
  if (!is.null(remove.samples)){
    if (is.logical(remove.samples)){
      keep.select[remove.samples] <- F
    } else {
      keep.select[(colnames(H) %in% remove.samples)] <- F
    }
  }

  keep.subset <- rep(T,nrow(H))
  if (!is.null(keep.features)){
    if (is.logical(keep.features)){
      keep.subset <- keep.features
    } else {
      keep.subset[!(H@NAMES %in% keep.features)] <- F
    }
  }
  if (!is.null(remove.features)){
    if (is.logical(remove.features)){
      keep.subset[remove.features] <- F
    } else {
      keep.subset[(H@NAMES %in% remove.features)] <- F
    }
  }

  H <- SummarizedExperiment::subset(H,
        subset = keep.subset,
        select = keep.select)

  if ((sum(!(old_signature %in% H@NAMES)) > 0) | (ncol(H) < old_sample_size)){
    nochdrin <- (old_signature %in% H@NAMES)
    H@metadata$slf <- H@metadata$slf[nochdrin]
    if (sum(nochdrin) == 0){
      H@metadata$prm.full <- list("mean" = NA,"cov" = NA)
      H@metadata$prm.A <- list("mean" = NA,"cov" = NA)
      H@metadata$prm.B <- list("mean" = NA,"cov" = NA)
      H@metadata$prp.A <- 0.5
      H@metadata$sqHell <- 0
    } else {
      H@metadata$prm.full$mean <- H@metadata$prm.full$mean[nochdrin]
      H@metadata$prm.A$mean <- H@metadata$prm.A$mean[nochdrin]
      H@metadata$prm.B$mean <- H@metadata$prm.B$mean[nochdrin]
      if (length(old_signature)>1){
        H@metadata$prm.full$cov <- H@metadata$prm.full$cov[nochdrin,nochdrin]
        H@metadata$prm.A$cov <- H@metadata$prm.A$cov[nochdrin,nochdrin]
        H@metadata$prm.B$cov <- H@metadata$prm.B$cov[nochdrin,nochdrin]
      }
      H@metadata$sqHell <- calculate_dist(H)@metadata$sqHell
    }
  }

  H
}
ZytoHMGU/hetset documentation built on June 6, 2019, 2:16 p.m.