R/asdf.R

Defines functions GetMostvar CreateSampleGroups GetUniqSamp Panelize PanelizeOld

#' Turn a count matrix into a panel list.
#'
#' @param count.mat count matrix
#' @param annotation annotation file
#' @param cat.col column number specifying sample identity in annotation
#' @return A panel, i.e. a list of count matrices for each sample.
PanelizeOld <- function(count.mat, annotation, cat.col) {
  samples <- unique(annotation[,cat.col])
  # might need to sort annot and countmat if order of cellids differ between annotation and countmat
  sample.inds <- samples %>% lapply(function(cat, annotation, cat.col){return(annotation[,cat.col]==cat)}, annotation, cat.col)
  panel <- sample.inds %>% lapply(function(inds, count.mat){return(count.mat[, inds])}, count.mat)
  names(panel) = samples
  return(panel)
}

Panelize <- function(a.matrix, cell.ids, sample.ids) { # replace old panelize with this
  inds <- split(cell.ids, sample.ids)
  submats <- inds %>% lapply(function(x){a.matrix[x,]})
  return(submats)
}

#' Get unique samples for a category in annotation
#'
#' @param cat category
#' @param annotation annotation file
#' @param sample.col the number of the sample column
#' @param cat.col the number of the category column
#' @return unique samples for a category
GetUniqSamp <- function(cat,annotation, sample.col, cat.col) {
  samples <- unique(annotation[annotation[ ,cat.col]==cat,][,sample.col])
  return(as.character(as.matrix(samples)))
}

CreateSampleGroups <- function(annotation, cat.col, sample.col) {
  sample.cats <- as.matrix(unique(annotation[ ,cat.col]))
  sample.groups <- sample.cats %>% lapply(GetUniqSamp, annotation, sample.col, cat.col)
  names(sample.groups) <- sample.cats
  return(sample.groups)
}

GetMostvar <- function(de.object,nr) {
  mostvar.names <- de.object[!is.na(de.object)] %>%
    lapply(function(df) rownames(df$res)[order(df$res$pvalue)][1:nr]) %>%
    Reduce(union, .)
}

GetClusterIDs <- function(con.object, str) {
  # will prolly change this later since very specific for clusters
  cluster.cellid <- con.object[['clusters']][[str]][['result']] %$% split(names, membership)
  return(cluster.cellid)
}

RbindPanel <- function(con.panel) {
  count.list <- con.panel$samples %>% lapply(function(x){return(x$counts)}) #%>% Reduce(rbind,.)
  return(do.call(rbind, count.list))
  #return(count.list)
}

RbindRaw <- function(con.object){
  count.list <- con.object$samples %>% lapply(function(x){return(x$misc$rawCounts)})
  return(do.call(rbind, count.list))
}

GetColMeans <- function(cluster.array) {
  if (dim(cluster.array) %>%  is.null) {
    meaned <- cluster.array
  } else {
    meaned <- cluster.array %>% apply(2, mean)
  }
  return(meaned)
}
CategoryExper <- function(cellids, rbound, genes) {
  ReturnExps <- function(cellids) {
    if (length(cellids) < 2) {
      gene.exps <- rbound[cellids,] %>% as.matrix %>% t %>% as.data.frame
    } else {
      gene.exps <- rbound[cellids,]
    }
    return(gene.exps)
  }
  #cluster.exps <- cellids %>% lapply(function(x){return(rbound[x,])})
  cluster.exps <- cellids %>% lapply(ReturnExps)
  cluster.exps <- cluster.exps %>% lapply(function(x){return(x[,genes])})
  cluster.exps <- as.data.frame(lapply(cluster.exps,GetColMeans)) # gives genes*cells, might wanna change in future
  return(cluster.exps)
}

GetClusterExp <- function(con.panel, genes, cluster.str) {
  cell.ids <- GetClusterIDs(con.panel, cluster.str)
  rbound.panel <- RbindPanel(con.panel)
  cluster.exps <- CategoryExper(cell.ids, rbound.panel, genes)
  colnames(cluster.exps) <- names(cell.ids)
  return(cluster.exps)
}

GetSubtypeIDs <- function(annotation, sub.col, id.col) {
  annotation <- as.data.frame(annotation)
  split.annot <- annotation %$% split(annotation[,id.col],annotation[,sub.col])
  return(split.annot)
}

GetSubtypeExp <- function(con.panel, genes, annotation, sub.col, id.col) {
  cell.ids <- GetSubtypeIDs(annotation, sub.col, id.col)
  rbound.panel <- RbindPanel(con.panel)
  subtype.exps <- CategoryExper(cell.ids, rbound.panel, genes)
  colnames(subtype.exps) <- names(cell.ids)
  return(subtype.exps)
}

GetIntersect <- function(list.of.vecs) {
  return(list.of.vecs %>% lapply(function(x){x}) %>% Reduce(intersect, .))
}

FilterAnnot <- function(con.object, annotation, cellid.col) {
  count.names <- con.object$samples %>% lapply(function(x){return(rownames(x$counts))}) %>% unlist
  filt.inds <- as.data.frame(annotation)[,cellid.col] %in% count.names
  return(annotation[filt.inds,])
}

# input: annotation corresponding to one of the states, splits annotation into list of subtypes containing list of samples
SplitCells <- function(annotation, samp.col, sub.col) {
  sub.split <- split(annotation, annotation[, sub.col], drop=T)
  samp.split <- sub.split %>% lapply(function(x){split(x, x[, samp.col], drop=T)})
}

SelectCellProbs <- function(annotation, rbound.panel, cellid.col, nr.cell, genes, pseudo.count, norm.var=FALSE, vec=FALSE){
  if (nr.cell > dim(annotation)[1]) {
    inds <- sample(1:dim(annotation)[1], nr.cell, replace=TRUE)
  } else {
    inds <- sample(1:dim(annotation)[1], nr.cell, replace=FALSE)
  }
  annot.sampled <- annotation[inds,]
  exps <- rbound.panel[annot.sampled[, cellid.col], genes]
  if (norm.var) {
    sds <- apply(exps, 2, sd)
    sds <- sds+1e-8
    exps <- sweep(exps, 2, sds, '/')
  }
  if (vec){
    exps.list <- 1:nr.cell %>% lapply(function(i){return(exps[i,])})
    names(exps.list) <- rownames(exps)
    return(exps.list)
  } else {
    probs <- exps/Matrix::rowSums(exps)
    probs <- probs+pseudo.count
    probs <- probs/Matrix::rowSums(probs)
    probs.list <- 1:nr.cell %>% lapply(function(i){return(probs[i,])})
    names(probs.list) <- rownames(probs)
    return(probs.list)
  }
}

SelectCellProbsAggregated <- function(annotation, genes, rbound.panel, cellid.col, nr.cell, pseudo.count){
  if (nr.cell > dim(annotation)[1]) {
    inds <- sample(1:dim(annotation)[1], nr.cell, replace=TRUE)
  } else {
    inds <- sample(1:dim(annotation)[1], nr.cell, replace=FALSE)
  }
  annot.sampled <- annotation[inds,]
  exps <- rbound.panel[annot.sampled[, cellid.col], genes]
  prob.dist <- exps %>% GetColMeans
  prob.dist <- prob.dist/sum(prob.dist)
  prob.dist <- prob.dist+pseudo.count
  prob.dist <- prob.dist/sum(prob.dist)
  return(prob.dist)
}

IndividualCellProbs <- function(annotation, rbound.panel, cellid.col, sub.col, nr.cell, genes,
                                pseudo.count=0, norm.var=FALSE, vec=FALSE){
  sub.split <- split(annotation, annotation[, sub.col], drop=T)
  cell.distributions <- sub.split %>% lapply(SelectCellProbs, rbound.panel, cellid.col,
                                             nr.cell, genes, pseudo.count, norm.var, vec)
}


GetSampProbs <- function(list.of.samp, rbound.panel, genes, cellid.col, pseudo.count){
  exps.list <- list.of.samp %>% lapply(function(x){rbound.panel[x[,cellid.col], genes]})
  exps.list <- exps.list %>% lapply(GetColMeans)
  exps.list <- exps.list %>% lapply(function(x){
    x<-x/sum(x)
    x<-x+pseudo.count}) # missing the first 'normalization' step
  probs.list <- exps.list %>% lapply(function(vector){return(vector/sum(vector))})
}

GetAllProbs <- function(annotation, rbound.panel, samp.col, sub.col, cellid.col, genes, pseudo.count){
  first.split <- SplitCells(annotation, samp.col, sub.col)
  prob.dist <- first.split %>% lapply(GetSampProbs, rbound.panel, genes, cellid.col, pseudo.count)
}

GetSubMatricesOld <- function(list.of.cats, rbound.panel, genes, cellid.col, avg=F) { # OLD
  exps.list <- list.of.cats %>% lapply(function(x){rbound.panel[x[,cellid.col], genes]})
  if (avg){
    return(exps.list %>% lapply(GetColMeans))
  } else {
    return(exps.list)
  }
}

GetSubMatrices <- function(subtype.vector, cellid.vector, condition.vector, count.matrix, genes, avg=T) { # OLD
  sub.cell.df <- dplyr::bind_cols(subtype.vector=subtype.vector, cellid.vector=cellid.vector)
  condition.split <- split(sub.cell.df, condition.vector)
  subtype.splits <- condition.split %>% lapply(function(x){split(x, x$subtype.vector)})
  obtainSubMats <- function(subtype.cellid.annot, count.matrix, genes, avg){
    sub.cond.vals <- count.matrix[subtype.cellid.annot$cellid.vector, genes]
    if (avg) {
      sub.cond.vals <- sub.cond.vals %>% Matrix::colMeans()
    }
    return(sub.cond.vals)
  }
  sub.mats <- subtype.splits %>% lapply(function(x){x %>% lapply(obtainSubMats, count.matrix, genes, avg)})
  return(sub.mats)
}

GetSubSampMats <- function(annotation, rbound.panel, samp.col, sub.col, cellid.col, genes, pseudo.count=0){
  first.split <- SplitCells(annotation, samp.col, sub.col)
  GetSampMats <- function(list.of.samp, rbound.panel, genes, cellid.col, pseudo.count){
    exps.list <- list.of.samp %>% lapply(function(x){rbound.panel[x[,cellid.col], genes]})
    exps.list <- exps.list %>% lapply(GetColMeans)
  }
  prob.dist <- first.split %>% lapply(GetSampMats, rbound.panel, genes, cellid.col, pseudo.count)
}
ObtainSubSampleMats <- function(annotation.list, rbound.panel, samp.col, sub.col, cellid.col, genes, pseudo.count=0){
  prob.list <- annotation.list %>% lapply(GetSubSampMats, rbound.panel, samp.col, sub.col, cellid.col, genes, pseudo.count)
  return(prob.list)
}


ObtainProbabilities <- function(annotation.list, rbound.panel, samp.col, sub.col, cellid.col, genes, pseudo.count=0){
  #rbound.panel <- RbindPanel(con.object)
  prob.list <- annotation.list %>% lapply(GetAllProbs, rbound.panel, samp.col, sub.col, cellid.col, genes, pseudo.count)
  return(prob.list)
}

CalculateJSD <- function(x, a.list) {
  a.list %>% lapply(JensenShannon,x)
}

CalculateAllJSD <- function(list1, list2) {
  alldists <- list1 %>% lapply(CalculateJSD, list2)
  return(unlist(alldists))
}

CalculateAllCor <- function(list1, list2) {
  CalculateCor <- function(x, a.list) {
    a.list %>% lapply(function(a,b){return(1-cor(a,b))},x)
  }
  alldists<- list1 %>% lapply(CalculateCor, list2)
  return(unlist(alldists))
}

CalculateAllBhat <- function(list1, list2) {
  CalculateBhat <- function(x, a.list) {
    a.list %>% lapply(Bhattacharyya,x)
  }
  alldists <- list1 %>% lapply(CalculateBhat, list2)
  return(unlist(alldists))
}

KLD <- function(P,Q) {
  divergence <- sum(P*log(P/Q))
  return(divergence)
}

JensenShannon <- function(P,Q){
  M <- (P+Q)/2
  kbl.pm <- sum(P*log(P/M))
  kbl.qm <- sum(Q*log(Q/M))
  return((kbl.pm+kbl.qm)/2)
}

Bhattacharyya <- function(x,y){
  -log(sum(sqrt(x*y)))
}

CalculateBhat <- function(x, a.list) {
  a.list %>% lapply(Bhattacharyya,x)
}

CalculateAllBhat <- function(list1, list2) {
  alldists <- list1 %>% lapply(CalculateBhat, list2)
  return(unlist(alldists))
}

KSMatrix <- function(X,Y, pseudo.count=0){
  X <- X+pseudo.count
  Y <- Y+pseudo.count
  a_range <- 1:dim(X)[2]
  ks_stats <- a_range %>% sapply(function(i){ks.test(X[,i], Y[,i])$statistic})
}


#' Wrapper for SelectCellProbs that rearranges input order for convenience
#'
#' @param sub.annot annotation subset
#' @param some.genes some genes
#' @param rbound.panel matrix of all expression values post-pagoda'ing
#' @param cellid.col cellid column number
#' @param nr.cell number of cells
#' @param pseudo.count pseudo probability to add
#' @return A number of cells turned into probability distributions (of the genes)
ObtainDistributions <- function(sub.annot, some.genes, rbound.panel, cellid.col, nr.cell, pseudo.count){
  return(SelectCellProbs(sub.annot, rbound.panel, cellid.col, nr.cell, some.genes, pseudo.count))
}

IndividualCellProbsList <- function(annotation, rbound.panel, cellid.col, sub.col, nr.cell, genes.list, pseudo.count=0){
  sub.split <- split(annotation, annotation[, sub.col], drop=T)
  cell.distributions <- mapply(ObtainDistributions, sub.split, genes.list, MoreArgs=list(rbound.panel, cellid.col, nr.cell, pseudo.count), SIMPLIFY=F)
  return(cell.distributions)
}

ProbsToJSD <- function(state.split, rbound.panel, cellid.col, sub.col, nr.cell, genes.list, pseudo.count){
  list.of.lists <- state.split %>% lapply(IndividualCellProbsList, rbound.panel, cellid.col, sub.col,
                                          nr.cell, genes.list, pseudo.count)
  all.sc.dists.de <- Map(CalculateAllJSD, list.of.lists$healthy, list.of.lists$epilepsy)
  return(all.sc.dists.de %>% as_tibble)
} # specific for this data but saves some lines

IndividualCellProbsAgg <- function(annotation, rbound.panel, cellid.col, sub.col, nr.cell, genes.list, pseudo.count=0){
  sub.split <- split(annotation, annotation[, sub.col], drop=T)
  cell.distributions <- mapply(SelectCellProbsAggregated, sub.split, genes.list, MoreArgs=list(rbound.panel, cellid.col, nr.cell, pseudo.count), SIMPLIFY=F)
  return(cell.distributions)
}

FractionalChanges <- function(annotation, patient.col, subtype.col, condition.col){
  sub.split <- split(annotation, annotation[, subtype.col])
  sub.split <- sub.split %>% lapply(function(x){x <- mutate(x, pat.cond = paste(x[, patient.col], x[, condition.col], sep='-'))})
  CountPatConds <- function(annotation) {
    patcond.col <- dim(annotation)[2]
    combs.df <- table(annotation[, patcond.col]) %>% as.data.frame
    colnames(combs.df)[1] <- 'pat.cond'
    colnames(combs.df)[2] <- 'count'
    combs.df <- combs.df %>% mutate(patient = gsub("-.*","", pat.cond))
    combs.df <- combs.df %>% mutate(condition = gsub(".*-","", pat.cond))
    combs.df <- combs.df %>% mutate(subtype = annotation[, subtype.col][1:nrow(combs.df)])
  }
  sub.split.counts <- sub.split %>% lapply(CountPatConds)
  sub.split.df <- do.call(rbind, sub.split.counts)
  rownames(sub.split.df) <- NULL
  patconds.split <- split(sub.split.df, sub.split.df$pat.cond)
  NormalizeCounts <- function(x){
    x$count <- x$count/sum(x$count)
    return(x)
  }
  patconds.split <- patconds.split %>% lapply(NormalizeCounts)
  freq.df <- do.call(rbind, patconds.split)
  freq.df <- freq.df %>% mutate(freq=count, count=NULL)
  return(freq.df)
}

GetSubMatsList <- function(list.of.annots, rbound.panel, list.of.genes, cellid.col) { # OLD
  GetMat <- function(sub.annot, genes, rbound.panel, cellid.col) {
    cellids <- sub.annot[, cellid.col]
    return(rbound.panel[cellids, genes])
  }
  exps.list <- mapply(GetMat, list.of.annots, list.of.genes, MoreArgs=list(rbound.panel, cellid.col))
  return(exps.list)
}

CalculateJBLD <- function(cov1, cov2) {
  JBLD <- log(det((cov1+cov2)/2)) -0.5*log(det(cov1%*%cov2))
  return(JBLD)
}

AddPseudo <- function(matrix, pseudo.count=1e-4){
  matrix <- as.matrix(matrix)
  return(matrix+pseudo.count)
}

countzerocols <- function(mat){
  mat <- as.matrix(mat)
  n.zerocol <- sum((mat %>% apply(2, sum))==0)
  return(n.zerocol)
}

removezerocols <- function(mat) {
  mat <- as.matrix(mat)
  mat <- mat[, !apply(mat, 2, sum)==0]
  return(mat)
}

CMD <- function(cor1, cor2) {
  vec1 <- as.vector(cor1)
  vec2 <- as.vector(cor2)
  l2norm <- function(x){
    return(sqrt(sum(x^2)))
  }
  cmd <- 1 - (vec1 %*% vec2)/(l2norm(vec1)*l2norm(vec2))
  return(cmd)
}

ComputeCor <- function(mat1, mat2) { # makes sure both matrices have the same columns
  commoncols <- intersect(colnames(mat1), colnames(mat2))
  mat1 <- mat1[, commoncols]
  mat2 <- mat2[, commoncols]
  return(list(mat1, mat2) %>% lapply(cor))
}

PadGenesRows <- function(a.matrix, gene.vector) {
  N <- gene.vector %>% length
  Nold <- dim(a.matrix)[1]
  M <- dim(a.matrix)[2]
  new.mat <- matrix(0L, nrow = N, ncol = M)
  new.mat[1:Nold, 1:M] <- a.matrix %>% as.vector
  missing.genes<- setdiff(gene.vector, rownames(a.matrix))
  name.vector <- c(rownames(a.matrix), missing.genes)
  colnames(new.mat) <- colnames(a.matrix); rownames(new.mat) <- name.vector
  return(new.mat[gene.vector,])
}

isNeg <- function(x){
  if (x<0){return ('Neg')} else {return('Pos')}
}

GetSubMats <- function(count.matrix, cellid.vector, subtype.vector, condition.vector, genes=NULL, avg=T, sum=F,
                       normalize=F, pseudo.prob=0, verbose=T){
  bound.annot <- dplyr::bind_cols(cellid=cellid.vector, subtype=subtype.vector, condition=condition.vector)
  check.subs <- bound.annot %$% split(condition, subtype) %>% lapply(unique)
  has.all <- check.subs[which(check.subs %>%
                                lapply(function(x){length(x)>length(condition.vector %>% unique)-1}) %>% unlist)] %>% names
  missing.some <- setdiff(bound.annot$subtype %>% unique, has.all)
  bound.annot <- bound.annot[bound.annot$subtype %in% has.all, ]
  if (verbose){
    if(!purrr::is_empty(missing.some)){
      message("Subtypes ", paste0(missing.some), " not in all conditions, skipping.")
    }
  }
  condition.split <- bound.annot %>% split(bound.annot$condition, drop=T)
  sub.cms.split <- condition.split %>% lapply(function(x){
    sub.split <- x %>% split(x$subtype, drop=T)
    if (!is.null(genes)){
      cms <- sub.split %>% lapply(function(x){count.matrix[x$cellid, genes]})
    } else {
      cms <- sub.split %>% lapply(function(x){count.matrix[x$cellid, ]})
    }
    cms <- cms %>% lapply(function(x){if(is.null(dim(x))){as.matrix(x)} else{x}})
    if (avg){
      values <- cms %>% lapply(Matrix::colMeans)
    } else if (sum) {
      values <- cms %>% lapply(Matrix::colMeans)
    } else {
      values <- cms
    }
    if (normalize){
      values <- values %>% lapply(function(x){
        x <- x/sum(x)
        if(pseudo.prob>0){x<-x+pseudo.prob; x<-x/sum(x)}
        return(x)
      })
    }
    return(values)
  })
}

PerturbedGeneCorrelations <- function(cms, cellid.vec, celltype.vec, condition.vec, perturb.reps=5, do.pca=F){
  annotation <- dplyr::bind_cols(cellid=cellid.vec, celltype=celltype.vec, condition=condition.vec)
  fcs <- c(0.5, 2, 1)
  fc.vec <- rep(fcs, ncol(cms[[1]])*c(0.15,0.15,0.7))
  fc.vec.randomizations <- replicate(perturb.reps, list(sample(fc.vec)))
  PerturbCounts <- function(cm, fc.vector){
    cm %>% apply(1, function(cell.vec){cell.vec*fc.vector}) %>% Matrix::t()
  }
  dfs <- fc.vec.randomizations %>% lapply(function(fc.vec.iteration){
    repped.fc <- rep(list(fc.vec.iteration), length(cms)) # to have the same lfc multiplier for all subtypes
    cms.pseudo.diseased <- Map(PerturbCounts, cms, repped.fc)
    annot.pseudo <- annotation
    annot.pseudo$cellid %<>% sapply(function(x){paste0(x,"+pseudo")}) %>% unname
    annot.pseudo$condition <- rep('pseudo', nrow(annot.pseudo))
    cm.pseudo.bound <- do.call(rbind, cms.pseudo.diseased)
    new.rownames <- rownames(cm.pseudo.bound) %>% sapply(function(x){paste0(x,"+pseudo")}) %>% unname
    rownames(cm.pseudo.bound) <- new.rownames
    new.annot <- rbind(annotation, annot.pseudo)
    new.cm <- rbind(do.call(rbind, cms), cm.pseudo.bound)
    if (do.pca) {
      pca.pseudo <- prcomp_irlba(new.cm, n=100)
      pca.cm.pseudo <- pca.pseudo$x
      rownames(pca.cm.pseudo) <- rownames(new.cm)
      new.cm <- pca.cm.pseudo
    }
    vecs.sub.pseudo <- GetSubMats(new.cm, new.annot$cellid, new.annot$celltype, new.annot$condition)

    cor.res.pseudo <- Map(function(x,y){1-cor(x,y)}, vecs.sub.pseudo[[1]], vecs.sub.pseudo[[2]])

    plot.df <- cor.res.pseudo %>% as.data.frame %>% gather(key=celltype, value=correlation.distance)
    plot.df$celltype <- gsub('\\.', '/', plot.df$celltype) # annoying fucking dot that gets put in instead of the slash
    plot.df %<>% mutate(ncell=table(annotation$celltype)[plot.df$celltype] %>% as.numeric())
    return(plot.df)
  })
  return(dplyr::bind_rows(dfs))
}
githubz0r/SecretUtils documentation built on May 15, 2021, 10:29 p.m.