R/CoRe.R

Defines functions CoRe.VisCFness CoRe.FiPer CoRe.CF_Benchmark CoRe.PanCancer_ADaM CoRe.CS_ADaM CoRe.extract_tissueType_SubMatrix CoRe.download_DepMatrix CoRe.download_AnnotationModel CoRe.download_BinaryDepMatrix CoRe.coreFitnessGenes CoRe.tradeoffEO_TPR CoRe.truePositiveRate CoRe.empiricalOdds CoRe.randomisedepMat CoRe.generateNullModel CoRe.panessprofile CoRe.AssembleFPs CoRe.ADaM rearrangeMatrix findBestInClass HeuristicMutExSorting CoRe.scale_to_essentials CoRe.Binarize_Matrix FDR_Threshold

Documented in CoRe.ADaM CoRe.AssembleFPs CoRe.Binarize_Matrix CoRe.CF_Benchmark CoRe.coreFitnessGenes CoRe.CS_ADaM CoRe.download_AnnotationModel CoRe.download_BinaryDepMatrix CoRe.download_DepMatrix CoRe.empiricalOdds CoRe.extract_tissueType_SubMatrix CoRe.FiPer CoRe.generateNullModel CoRe.PanCancer_ADaM CoRe.panessprofile CoRe.randomisedepMat CoRe.scale_to_essentials CoRe.tradeoffEO_TPR CoRe.truePositiveRate CoRe.VisCFness

## Compute threshold, at a certain % of FDR, given a genome-wide essentiality profile of a cell line
## based on two reference sets of essential and non-essential genes
FDR_Threshold<-function(FCsprofile,
                        ess_genes,
                        noness_genes,
                        FDRth=0.05){

  FCsprofile<-FCsprofile[intersect(c(ess_genes,noness_genes),names(FCsprofile))]

  predictions<-FCsprofile
  observations<-is.element(names(FCsprofile),ess_genes)+0
  names(observations)<-names(predictions)

  RES<-roc(observations,predictions,direction = '>',quiet = TRUE)
  COORS<-coords(RES,'all',ret = c('threshold','ppv'),transpose = TRUE)

  FDRpercTh<-max(COORS['threshold',which(COORS['ppv',]>=(1-FDRth))])
  threshold<-COORS['threshold',min(which(COORS['threshold',]<=FDRpercTh))]

  return(threshold)
}

## Binarize matrix of logFCs/Bayesian Factors to be used as input for ADaM
CoRe.Binarize_Matrix<-function(depMat,
                               method=c('fdr', 'thr'),
                               ess_genes=NULL,
                               noness_genes=NULL,
                               scaled=FALSE,
                               Bayes_Factor=FALSE,
                               FDRth=0.05,
                               threshold=NULL){

  method = match.arg(method)

  if (method == 'fdr'){
    if (is.null(ess_genes) || is.null(noness_genes)){
      stop('ess_genes or noness_genes vectors are NULL: please specify\n
           character vector of gene names for both parameters when choosing\n
           the "fdr" method')
    }
  }

  if (Bayes_Factor){
    if (scaled){
      warning('Cannot apply scaling when Bayes_Factor is TRUE')
    }

    if (length(threshold) > 0){
      stop('Cannot apply "thr" method when Bayes_Factor is TRUE')
    }

    sig_thrs <- sapply(1:ncol(depMat),function(x){
      FC <- -depMat[,x]
      names(FC) <- rownames(depMat)
      -FDR_Threshold(FC,ess_genes,noness_genes,FDRth=FDRth)
    })

    bdep <- matrix(0, ncol = ncol(depMat), nrow = nrow(depMat), dimnames = dimnames(depMat))

    for (i in 1:ncol(depMat)){
      bdep[which(depMat[,i] < sig_thrs[i]),i] <- 1
    }

  } else {
    if (scaled){
      depMat<-CoRe.scale_to_essentials(depMat, ess_genes, noness_genes)
    }

    if (method == 'fdr'){
      sig_thrs <- sapply(1:ncol(depMat),function(x){
        FC <- depMat[,x]
        names(FC) <- rownames(depMat)
        FDR_Threshold(FC,ess_genes,noness_genes,FDRth=FDRth)
      })
    } else {
      if (is.null(threshold)){
        stop('Threshold is NULL: please specify a value\n
             when choosing the "thr" method')
      }
      sig_thrs <- rep(threshold, ncol(depMat))
    }

    bdep <- matrix(0, ncol = ncol(depMat), nrow = nrow(depMat), dimnames = dimnames(depMat))

    for (i in 1:ncol(depMat)){
      bdep[which(depMat[,i] < sig_thrs[i]),i] <- 1
    }
  }

  return(bdep)
}

## Apply CERES scaling on cell line fold change scores using two reference sets of essential and non-essential
## genes
CoRe.scale_to_essentials <- function(ge_fit,ess_genes,noness_genes){
  essential_indices <- which(row.names(ge_fit) %in% ess_genes)
  nonessential_indices <- which(row.names(ge_fit) %in% noness_genes)

  scaled_ge_fit <- ge_fit %>%

    apply(2, function(x){
      (x - median(x[nonessential_indices], na.rm=T)) %>%
        divide_by(median(x[nonessential_indices], na.rm=T) - median(x[essential_indices], na.rm=T))
    })

  return(scaled_ge_fit)
}

## This function implements an heuristic algorithm that takes in input a sparse binary matrix and sorts its rows
## and column in a way that the patterns of non null entries have a minimal overlap across rows.
HeuristicMutExSorting<-function(mutPatterns){

  mutPatterns<-sign(mutPatterns)

  ngenes<-nrow(mutPatterns)
  nsamples<-ncol(mutPatterns)

  if (is.null(rownames(mutPatterns))){
    rownames(mutPatterns) <- 1:ngenes
  }

  if (is.null(colnames(mutPatterns))){
    colnames(mutPatterns) <- 1:nsamples
  }

  if (nrow(mutPatterns)>1 & ncol(mutPatterns)>1){

    RowNull<-names(which(rowSums(mutPatterns)==0))
    RowNonNull<-which(rowSums(mutPatterns)>0)

    ColNull<-names(which(colSums(mutPatterns)==0))
    ColNonNull<-which(colSums(mutPatterns)>0)

    mutPatterns<-matrix(c(mutPatterns[RowNonNull,ColNonNull]),
                        length(RowNonNull),length(ColNonNull),
                        dimnames=list(rownames(mutPatterns)[RowNonNull],colnames(mutPatterns)[ColNonNull]))

    if (nrow(mutPatterns)>1 & ncol(mutPatterns)>1){

      coveredGenes<-NA
      uncoveredGenes<-rownames(mutPatterns)

      coveredSamples<-NA
      uncoveredSamples<-colnames(mutPatterns)
      BS<-NA

      while(length(uncoveredGenes)>0 & length(uncoveredSamples)>0){

        patterns<-matrix(c(mutPatterns[uncoveredGenes,uncoveredSamples]),
                         nrow = length(uncoveredGenes),
                         ncol = length(uncoveredSamples),
                         dimnames = list(uncoveredGenes,uncoveredSamples))

        if(length(uncoveredGenes)>1){
          bestInClass<-findBestInClass(patterns)
        }else{
          bestInClass<-uncoveredGenes
        }

        if(is.na(BS[1])){
          BS<-bestInClass
        }else{
          BS<-c(BS,bestInClass)
        }

        if(is.na(coveredGenes[1])){
          coveredGenes<-bestInClass
        }else{
          coveredGenes<-c(coveredGenes,bestInClass)
        }

        uncoveredGenes<-setdiff(uncoveredGenes,coveredGenes)
        toCheck<-matrix(c(patterns[bestInClass,uncoveredSamples]),nrow = 1,ncol=ncol(patterns),dimnames = list(bestInClass,uncoveredSamples))

        if (length(coveredGenes)==1){
          coveredSamples<-names(which(colSums(toCheck)>0))
        }else{
          coveredSamples<-c(coveredSamples,names(which(colSums(toCheck)>0)))
        }

        uncoveredSamples<-setdiff(uncoveredSamples,coveredSamples)

      }

      BS<-c(BS,uncoveredGenes)

      CID<-rearrangeMatrix(mutPatterns,BS)

      FINALMAT<-mutPatterns[BS,CID]

      nullCol<-matrix(0,nrow(FINALMAT),length(ColNull),
                      dimnames = list(rownames(FINALMAT),ColNull))

      FINALMAT<-cbind(FINALMAT,nullCol)

      nullRow<-matrix(0,length(RowNull),ncol(FINALMAT),
                      dimnames = list(RowNull,colnames(FINALMAT)))

      FINALMAT<-rbind(FINALMAT,nullRow)

      return(FINALMAT)

    } else {
      stop('Matrix must have at least 2 non-null rows and 2 non-null columns')
    }
  } else {
    stop('Matrix must have at least 2 rows and 2 columns')
  }
}

## This function finds the gene (i.e. row) with the highest exclusive coverage. The exclusive coverage for a gene g
## is defined as the number of uncovered samples in which this gene is mutated minus the number of samples in which
## at least another uncovered gene is mutated.
findBestInClass<-function(patterns){

  if(nrow(patterns)==1){
    return(rownames(patterns))
  }

  if(ncol(patterns)==1){
    return(rownames(patterns)[1])
  }

  exclCov<-colSums(t(2*patterns)-colSums(patterns))
  names(exclCov)<-rownames(patterns)

  return(names(sort(exclCov,decreasing=TRUE))[1])
}

## Rearrange Binary Matrix columns in order to minimise row-wise entry overlap based on exclusive coverage.
rearrangeMatrix<-function(patterns,
                          GENES){

  remainingSamples<-colnames(patterns)

  toAdd<-NULL

  DD<-t(t(2*patterns)-colSums(patterns))
  colnames(DD) <- remainingSamples

  for (g in GENES){
    cols <- remainingSamples[order(DD[g,remainingSamples],decreasing = TRUE)]
    toAdd<-c(toAdd,names(which(patterns[g,cols]>0)))
    remainingSamples<-setdiff(remainingSamples,toAdd)

    if(length(remainingSamples)==0){
      break
    }
  }

  toAdd<-c(toAdd,remainingSamples)

  return(toAdd)
}

## Exported
## Documentation Revised
CoRe.ADaM<-function(depMat,display=TRUE,
                    main_suffix='fitness genes in at least 1 cell line',
                    xlab='n. dependent cell lines',
                    ntrials=1000,verbose=TRUE,TruePositives){

  if(verbose){
    print('- Profiling of number of fitness genes across fixed numbers of cell lines and its cumulative sums')}
  pprofile<-CoRe.panessprofile(depMat=depMat,display = display,xlab = xlab,main_suffix = main_suffix)
  if(verbose){print('+ Done!')
    print('- Null modeling numbers of fitness genes across numbers of cell lines and their cumulative sums')}
  nullmodel<-CoRe.generateNullModel(depMat=depMat,ntrials = ntrials,verbose = verbose,display = display)
  if(verbose){print('+ Done!')
    print('- Computing empirical odds of numbers of fitness genes per number of cell lines')}
  EO<-CoRe.empiricalOdds(observedCumSum = pprofile$CUMsums,simulatedCumSum =nullmodel$nullCumSUM)
  if(verbose){print('+ Done')
    print('- Profiling true positive rates')}
  TPR<-CoRe.truePositiveRate(depMat,TruePositives)
  if(verbose){print('- Done!')
    print('+ Calculating ADaM threshold (min. n. of dependent cell lines for core fitness genes)')}
  crossoverpoint<-CoRe.tradeoffEO_TPR(EO,TPR$TPR,test_set_name = 'curated BAGEL essential',display = display)
  if(verbose){print(paste('ADaM threshold =',crossoverpoint,'(out of',ncol(depMat),'cell lines)'))
    print('- Done!')
    print('+ Estimating set of core fitness genes')}
  coreFitnessGenes<-CoRe.coreFitnessGenes(depMat,crossoverpoint)
  if(verbose){print('- Done!')}
  return (coreFitnessGenes)
}

## Assemble expression based false positives
CoRe.AssembleFPs<-function(URL='https://ndownloader.figshare.com/files/26261476'){
  options(timeout=1000)

  dir.create(tmp <- tempfile())
  dir.create(file.path(tmp, 'mydir'))
  print('Downloading zipped CCLE expression data from DepMap portal')
  download.file(URL,file.path(tmp, 'mydir','CCLE_expression.csv'))
  print('...done')

  print('Reading Expression data matrix...')
  X <- read.csv(file.path(tmp,'mydir','CCLE_expression.csv'),
                stringsAsFactors = FALSE,
                header=TRUE,
                row.names = 1)

  gnames<-rownames(X)
  clnames<-colnames(X)

  numdata<-as.matrix(X)

  numdata<-log2(numdata+1)
  numdata<-t(numdata)
  print('Done')
  print('Selecting overall lowly expressed genes...')

  LowlyExpr<-CoRe.FiPer(depMat = numdata,percentile = 0.9,display = FALSE)$cfgenes

  LowlyExpr<-strsplit(LowlyExpr,'[..]')

  LowlyExpr<-sort(unlist(lapply(LowlyExpr,function(x){x[1][1]})))

  print('Done')
  return(LowlyExpr)

}

CoRe.panessprofile<-function(depMat,display=TRUE,
                             main_suffix='fitness genes in at least 1 cell line',
                             xlab='n. dependent cell lines'){

    depMat<-depMat[which(rowSums(depMat)>0),]
    panessprof<-rep(0,ncol(depMat))
    names(panessprof)<-as.character(1:ncol(depMat))
    paness<-summary(as.factor(rowSums(depMat)),maxsum = length(unique(as.factor(rowSums(depMat)))))
    panessprof[as.character(names(paness))]<-paness

    CUMsums<-rev(cumsum(rev(panessprof)))

    names(CUMsums)<-paste('>=',names(CUMsums),sep='')

    if(display){
        par(mfrow=c(2,1))
        par(mar=c(6,4,4,1))

        main=paste(nrow(depMat),main_suffix)
        barplot(panessprof,ylab='n.genes',xlab=xlab,cex.axis = 0.8,cex.names = 0.8,
                las=2,main=main)

        barplot(CUMsums,ylab='n.genes',xlab=xlab,cex.axis = 0.8,cex.names = 0.6,
                las=2,main='Cumulative sums')
    }
    return(list(panessprof=panessprof,CUMsums=CUMsums))
}

CoRe.generateNullModel<-function(depMat,ntrials=1000,display=TRUE,verbose=TRUE){

    set.seed(100812)
    depMat<-depMat[which(rowSums(depMat)>0),]
    nullProf<-matrix(NA,ntrials,ncol(depMat),dimnames = list(1:ntrials,1:ncol(depMat)))
    nullCumSUM<-matrix(NA,ntrials,ncol(depMat),dimnames = list(1:ntrials,paste('≥',1:ncol(depMat),sep='')))
    if(verbose){
      print('Generating null model...')
      pb <- txtProgressBar(min=1,max=ntrials,style=3)
    }
    for (i in 1:ntrials){
      if(verbose){setTxtProgressBar(pb, i)}
        rMat<-
            CoRe.randomisedepMat(depMat)
        Ret<-
            CoRe.panessprofile(rMat,display = FALSE)
        nullProf[i,]<-Ret$panessprof
        nullCumSUM[i,]<-Ret$CUMsums
    }
    if(verbose){
      Sys.sleep(1)
      close(pb)
      print('')
      print('Done')
    }

    if (display){

        par(mfrow=c(2,1))
        main=c(paste(ntrials,' randomised essentiality profiles of\n',nrow(depMat),' genes across ',ncol(depMat),' cell lines',
                     sep=''))
        boxplot(nullProf,las=2,xlab='n. cell lines',ylab='fitness genes in n cell lines',main=main)

        colnames(nullCumSUM)<-paste('>=',1:ncol(nullCumSUM))
        boxplot(log10(nullCumSUM+1),las=2,main='Cumulative sums',xlab='n. cell lines',
                ylab='log10 [number of fitness genes + 1]',
                cex.axis=0.8)
    }

    return(list(nullProf=nullProf,nullCumSUM=nullCumSUM))
}

CoRe.randomisedepMat<-function(depMat){
    rmat<-apply(depMat,2,sample)
}

CoRe.empiricalOdds<-function(observedCumSum,simulatedCumSum){
  nsamples<-length(observedCumSum)
  ntrials<-nrow(simulatedCumSum)
  odds<-rep(NA,1,nsamples)
  names(odds)<-paste('≥',1:nsamples,sep='')
  for (i in 1:nsamples){
    PDF<-density(simulatedCumSum[,i])
    odds[i]<- log10(observedCumSum[i]/mean(simulatedCumSum[,i]))
  }
  return(odds)
}

CoRe.truePositiveRate<-function(depMat,essentialGeneSet){
  nsamples<-ncol(depMat)

  essentialGeneSet<-intersect(essentialGeneSet,rownames(depMat))

  TPR<-rep(NA,1,nsamples)
  names(TPR)<-paste('≥',1:nsamples,sep='')

  ncells<-rowSums(depMat)

  TP<-rep(NA,1,nsamples)
  names(TP)<-paste('≥',1:nsamples,sep='')

  P<-rep(NA,1,nsamples)
  names(P)<-paste('≥',1:nsamples,sep='')

  for (i in nsamples:1){
    positiveSet<-names(which(ncells>=i))
    P[i]<-length(positiveSet)
    truepositives<-intersect(positiveSet,essentialGeneSet)
    TP[i]<-length(truepositives)
    TPR[i]<-TP[i]/length(essentialGeneSet)
  }

  return(list(P=P,TP=TP,TPR=TPR))
}

CoRe.tradeoffEO_TPR<-function(EO,TPR,test_set_name,display=TRUE){
  x<-EO
  x[x==Inf]<-max(x[x<Inf])
  x<-(x-min(x))/(max(x)-min(x))

  y<-TPR
  y<-(y-min(y))/(max(y)-min(y))

  orEO<-EO
  orEO[orEO==Inf]<-max(orEO[orEO<Inf])
  orTPR<-TPR

  EO<-x
  TPR<-y
  point<-min(which(!y>x))

  if(display){
    CCOL<-'red'
    par(mar=c(4,4,4,4))
    par(mfrow=c(1,1))
    MAIN<-c('log10 (obs/Expct) n.genes [red, left]',
            paste('% covered ',test_set_name,' [blue, right]',sep=''))
    plot(EO,type='l',xlab='genes depleted in >= # cell lines',ylab='',axes=FALSE,lwd=4,main=MAIN,col=CCOL,cex.main=0.8,
         xlim=c(0,length(EO)))
    axis(2,at = seq(0,1,0.2),format(seq(min(orEO),max(orEO),(max(orEO)-min(orEO))/5),digits=2))
    axis(1)
    par(new=TRUE)
    plot(TPR,type='l',xlab='',ylab='',axes=FALSE,lwd=4,col='blue',ylim=c(0,1),xlim=c(0,length(EO)))
    axis(4,at = seq(0,1,0.2),format(seq(min(orTPR),max(orTPR),(max(orTPR)-min(orTPR))/5),digits=2))



    abline(v=point)
    abline(h=y[point],lty=2)

    points(point,y[point],pch=16,cex=2)

    legend('top',paste(format(100*orTPR[point],digits=2),'% covered',sep=''),bg = NULL,bty = 'n')
  }

  return(point)
}

CoRe.coreFitnessGenes<-function(depMat,crossoverpoint){
  coreFitnessGenes<-rownames(depMat)[rowSums(depMat)>=crossoverpoint]
  return (coreFitnessGenes)
}

## not Documented

## Downloading Binary Dependency Matrix (introduced in Behan 2019) from Project Score
CoRe.download_BinaryDepMatrix<-function(URL='https://cog.sanger.ac.uk/cmp/download/binaryDepScores.tsv.zip'){
  temp <- tempfile()
  download.file(URL,temp)
  X <- read.table(unz(temp,'binaryDepScores.tsv'),stringsAsFactors = FALSE,sep='\t',header=TRUE,row.names = 1)
  colnames(X)<-str_replace_all(colnames(X),'[.]','-')
  colnames(X)<-unlist(lapply(colnames(X),function(x){
    if(str_sub(x,1,1)=='X'){
      x<-str_replace(x,'X','')
    }else{
      x
    }
  }))

  return(X)
}

## Downloading Cell Passport models annotation file
CoRe.download_AnnotationModel<-function(URL='https://cog.sanger.ac.uk/cmp/download/model_list_latest.csv.gz'){
  X <- read_csv(URL)
  return(X)
}

## Downloading and scaling Quantitative Dependency Matrix (introduced in Behan 2019) from Project Score
CoRe.download_DepMatrix<-function(URL='https://cog.sanger.ac.uk/cmp/download/essentiality_matrices.zip',
                                  scaled=FALSE,
                                  ess=NULL,
                                  noness=NULL){

  dir.create(tmp <- tempfile())
  dir.create(file.path(tmp, 'mydir'))
  print('Downloading zipped essentiality matrices from Project Score...')
  download.file(URL,file.path(tmp, 'mydir','essentiality_matrices.zip'))
  print('...done')

  dir(file.path(tmp,'mydir'))

  print('Uncompressing zipped essentiality...')
  unzip(file.path(tmp,'mydir','essentiality_matrices.zip'),exdir = file.path(tmp,'mydir'))
  print('...done')

  print('Reading CRISPRcleanR corrected essentiality logFCs...')
  X <- read.table(file.path(tmp,'mydir','EssentialityMatrices','01_corrected_logFCs.tsv'),
                  stringsAsFactors = FALSE,
                  sep='\t',
                  header=TRUE,
                  row.names = 1)

  colnames(X)<-str_replace_all(colnames(X),'[.]','-')
  colnames(X)<-unlist(lapply(colnames(X),function(x){
    if(str_sub(x,1,1)=='X'){
      x<-str_replace(x,'X','')
    }else{
      x
    }
  }))
  print('...done')

  if(scaled){
    X<-CoRe.scale_to_essentials(X,ess,noness)
  }

  return(X)
}

## Extracting Dependency SubMatrix for a given tissue or cancer type, among those included
## in the latest model annotation file on the cell model passports (cite Donny's paper and website URL)
CoRe.extract_tissueType_SubMatrix<-function(fullDepMat,tissue_type='Non-Small Cell Lung Carcinoma'){
  cmp<-read_csv('https://cog.sanger.ac.uk/cmp/download/model_list_latest.csv.gz')
  cls<-cmp$model_name[which(cmp$tissue==tissue_type | cmp$cancer_type==tissue_type)]
  cls<-intersect(cls,colnames(fullDepMat))
  return(fullDepMat[,cls])
}

## Execute ADaM on tissue or cancer type specifc dependency submatrix
CoRe.CS_ADaM<-function(pancan_depMat,
                       tissue_ctype = 'Non-Small Cell Lung Carcinoma',
                       clannotation = NULL,
                       display=TRUE,
                       main_suffix='fitness genes in at least 1 cell line',
                       xlab='n. dependent cell lines',
                       ntrials=1000,verbose=TRUE,TruePositives){

  cls<-clannotation$model_name[clannotation$tissue==tissue_ctype | clannotation$cancer_type==tissue_ctype]
  cls<-intersect(colnames(pancan_depMat),cls)
  cs_depmat<-pancan_depMat[,cls]

  return(CoRe.ADaM(cs_depmat,display=display,
                   main_suffix = main_suffix,
                   xlab=xlab,
                   ntrials=ntrials,
                   verbose=verbose,TruePositives = TruePositives))
}

## Execute ADaM tissue by tissue then at the pancancer level to compute pancancer core fitness genes
CoRe.PanCancer_ADaM<-function(pancan_depMat,
                              tissues_ctypes,
                              clannotation = NULL,
                              display=TRUE,
                              ntrials=1000,verbose=TRUE,TruePositives){


  systematic_CS_ADaM_res<-lapply(tissues_ctypes,function(x){
      if(verbose){
        print(paste('Running ADaM for',x))
      }
      CoRe.CS_ADaM(pancan_depMat,tissue_ctype = x,
                   clannotation,display = display,
                   ntrials = ntrials,
                   verbose=verbose,TruePositives = TruePositives)
    })

  all_TS_CF_genes<-sort(unique(unlist(systematic_CS_ADaM_res)))

  TS_CF_matrix<-
    do.call(cbind,lapply(systematic_CS_ADaM_res,function(x){is.element(all_TS_CF_genes,x)+0}))

  rownames(TS_CF_matrix)<-all_TS_CF_genes
  colnames(TS_CF_matrix)<-tissues_ctypes

  CoRe.ADaM(depMat = TS_CF_matrix, display = display,
            main_suffix = 'genes predicted to be core fitness for at least 1 tissue/cancer-type',
            xlab = 'n. tissue/cancer-type',verbose = FALSE,TruePositives = TruePositives)


}

## Computes recall and other ROC indicators for identified core fitness genes
## with respect to pre-defined signatures of essential genes
CoRe.CF_Benchmark<-function(testedGenes,background,priorKnownSignatures,falsePositives,displayBar=TRUE){
  priorKnownSignatures<-lapply(priorKnownSignatures,function(x){intersect(x,background)})
  falsePositives<-intersect(falsePositives,background)

  memb<-do.call(rbind,lapply(priorKnownSignatures,function(x){
    is.element(testedGenes,x)+0
  }))

  colnames(memb)<-testedGenes

  totals<-unlist(lapply(priorKnownSignatures,function(x){
      length(intersect(x,background))
  }))

  memb<-HeuristicMutExSorting(memb)

  TPRs<-rowSums(memb)/totals[rownames(memb)]

  x<-rowSums(memb)
  N<-length(background)
  n<-length(testedGenes)
  k<-totals

  ps<-phyper(x-1,n,N-n,k,lower.tail=FALSE)[rownames(memb)]

  if(displayBar){
    pheatmap(memb,cluster_rows = FALSE,cluster_cols = FALSE,col=c('white','blue'),show_colnames = FALSE,
             main=paste(length(testedGenes),'core fitness genes'),legend = FALSE,
             width = 5,height = 3)

    par(mfrow=c(1,2))
    par(mar=c(5,1,0,1))
    barplot(100*rev(TPRs),horiz = TRUE,names.arg = NA,xlab='% covered genes',border=FALSE)
    abline(v=0)
    abline(v=c(20,40,60,80,100),lty=2,col='gray',lwd=2)
    ps[ps==0]<-min(ps[ps>0]/10)
    barplot(rev(-log10(ps)),horiz = TRUE,names.arg = NA,xlab='-log10 pval',border=FALSE,xlim=c(1,200),log = 'x')
    abline(v=1)
    abline(v=seq(10,200,20),lty=2,col='gray',lwd=2)
  }


  FPR<-length(intersect(falsePositives,testedGenes))/length(falsePositives)
  PPV<-length(intersect(testedGenes,unlist(priorKnownSignatures)))/length(testedGenes)

  TPRs<-TPRs[order(names(TPRs))]
  ps<-ps[order(names(ps))]

  return(list(TPRs=data.frame(Recall=TPRs,EnrichPval=ps),PPV=PPV,FPR=FPR))
}

## Calculate the Core Fitness genes using the  90th-percentile least dependent cell line from
## Quantitative knockout screen dependency matrix.
CoRe.FiPer<-function(depMat,display=TRUE,percentile=0.9,method='AUC'){

  depMat<-as.matrix(depMat)

  rankCL<-t(apply(depMat,1,function(x){sx<-order(x)
                                       x<-match(1:length(x),sx)}))

  rownames(rankCL)<- rownames(depMat)
  colnames(rankCL)<- colnames(depMat)

  rankG<-apply(depMat,2,function(x){sx<-order(x)
                                    x<-match(1:length(x),sx)})

  rownames(rankG)<- rownames(depMat)
  colnames(rankG)<- colnames(depMat)

  CLnumber<-ncol(depMat)
  threshold = as.integer(CLnumber*percentile)

  nG<-nrow(rankG)
  nCL<-ncol(rankG)

  if(method=='fixed'){
    LeastDependentdf<-do.call(rbind,lapply(1:nG,function(x){rankG[x,match(threshold,rankCL[x,])]}))
    Label = 'Gene rank in 90th perc. least dep cell line'
  }

  if(method=='average'){
    LeastDependentdf<-do.call(rbind,lapply(1:nG,function(x){mean(rankG[x,names(which(rankCL[x,]>=threshold))])}))
    Label = 'Gene average rank in ≥ 90th perc. of least dep cell lines'
  }

  if(method=='slope'){
    LeastDependentdf<-do.call(rbind,lapply(1:nG,function(x){
      a <- rankG[x,colnames(rankCL)[order(rankCL[x,])]]
      b<-as.data.frame(a)
      p<-lm(a ~ seq(1:nCL) , data=b)
      coef(p)[2]
    }))
    Label = 'Slope of gene ranks across ranked dep cell lines'
  }

  if(method=='AUC'){
    LeastDependentdf<-do.call(rbind,lapply(1:nG,function(x){
      a <- rankG[x,colnames(rankCL)[order(rankCL[x,])]]
      sum(a)
    }))
    Label = 'AUC of gene ranks across ranked dep cell lines'
  }

  rownames(LeastDependentdf)<-rownames(rankG)
  doR <- density(LeastDependentdf, bw = 'nrd0')

  if (display){
    par(mfrow=c(2,1))
    hist(LeastDependentdf,breaks = 100,xlab='Rank',main=Label)
    plot(doR,main='Gaussian Kernel Density Estimation',xlab='Rank')

  }

  localmin <- which(diff(-1*sign(diff(doR$y)))==-2)[1]+1
  myranks<- doR$x
  rankthreshold <- as.integer(myranks[localmin])+1

  if(display){
    abline(v=rankthreshold,col='red')
    legend('topleft',legend='Discriminative Threshold',cex=0.7,col='red',lty=1)
  }

  cfgenes <- rownames(LeastDependentdf)[which(LeastDependentdf < rankthreshold[1])]

  return(list(cfgenes=cfgenes,geneRanks=LeastDependentdf,LocalMinRank=rankthreshold[1]))
}

CoRe.VisCFness<-function(depMat,gene,percentile=0.9,posControl='RPL8',negControl='MAP2K1',method='fixed'){
  gg<-gene
  depMat<-as.matrix(depMat)

  rankCL<-t(apply(depMat,1,function(x){
      sx<-order(x)
      x<-match(1:length(x),sx)
      }))

  rownames(rankCL)<- rownames(depMat)
  colnames(rankCL)<- colnames(depMat)

  rankG<-apply(depMat,2,function(x){
    sx<-order(x)
    x<-match(1:length(x),sx)})

  rownames(rankG)<- rownames(depMat)
  colnames(rankG)<- colnames(depMat)

  nG<-nrow(rankG)
  nCL<-ncol(rankG)

  par(mfrow=c(1,2))

  plot(rankG[negControl,names(sort(rankCL[negControl,]))],
       col=rgb(150,0,0,alpha = 180,maxColorValue = 255),
       pch=16,ylim=c(0,nrow(depMat)),
       xlab='cell line dependency ranks',ylab='gene dependency rank in x dependant cell line')
  points(rankG[posControl,names(sort(rankCL[posControl,]))],pch=16,
         col=rgb(0,200,100,alpha = 180,maxColorValue = 255))

  points(rankG[gg,names(sort(rankCL[gg,]))],pch=16,
         col=rgb(0,0,100,alpha = 180,maxColorValue = 255))

  threshold = as.integer(nCL*percentile)

  if(method=='fixed'){
    LeastDependentdf<-do.call(rbind,lapply(1:nG,function(x){rankG[x,match(threshold,rankCL[x,])]}))
    Label = 'Gene rank in 90th perc. least dep cell line'
  }

  if(method=='average'){
    LeastDependentdf<-do.call(rbind,lapply(1:nG,function(x){mean(rankG[x,names(which(rankCL[x,]>=threshold))])}))
    Label = 'Gene average rank in ≥ 90th perc. of least dep cell lines'
  }

  if(method=='slope'){
    LeastDependentdf<-do.call(rbind,lapply(1:nG,function(x){
      a <- rankG[x,colnames(rankCL)[order(rankCL[x,])]]
      b<-as.data.frame(a)
      p<-lm(a ~ seq(1:nCL) , data=b)
      coef(p)[2]
    }))
    Label = 'Slope of gene ranks across ranked dep cell lines'
  }

  if(method=='AUC'){
    LeastDependentdf<-do.call(rbind,lapply(1:nG,function(x){
      a <- rankG[x,colnames(rankCL)[order(rankCL[x,])]]
      sum(a)
    }))
    Label = 'AUC of gene ranks across ranked dep cell lines'
  }

  cc<-c(rgb(0,200,100,alpha = 180,maxColorValue = 255),
        rgb(150,0,0,alpha = 180,maxColorValue = 255),
        rgb(0,0,100,alpha = 180,maxColorValue = 255))
  legend('topleft',legend=c(posControl,negControl,gg),col=cc,pch=16)

  names(LeastDependentdf)<-rownames(rankG)

  hist(LeastDependentdf,main=Label)
  abline(v = LeastDependentdf[negControl],lwd=4,col=rgb(150,0,0,alpha = 180,maxColorValue = 255))
  abline(v = LeastDependentdf[posControl],lwd=4,col=rgb(0,200,100,alpha = 180,maxColorValue = 255))
  abline(v = LeastDependentdf[gg],lwd=4,col=rgb(0,0,100,alpha = 180,maxColorValue = 255))

  doR <- density(LeastDependentdf, bw = 'nrd0')

  localmin <- which(diff(-1*sign(diff(doR$y)))==-2)[1]+1
  myranks<- doR$x
  rankthreshold <- as.integer(myranks[localmin])+1

  abline(v = rankthreshold,
         lwd=3,
         col='darkgrey',
         lty=2)

}
DepMap-Analytics/CoRe documentation built on July 6, 2022, 8:01 a.m.