inst/paper_code/match_it.R

readinRnaFileMapping <- function(){
  meta <- '/mnt/work1/users/pughlab/projects/cancer_cell_lines/rnaseq_dat/data/GDSC/fileList1357.txt'
  meta.gdsc <- '/mnt/work1/users/pughlab/projects/cancer_cell_lines/rnaseq_dat/data/GDSC/E-MTAB-3983.sdrf.txt'
  meta.ccle <- '/mnt/work1/users/pughlab/projects/cancer_cell_lines/rnaseq_dat/data/CCLE/CCLE_meta.txt'
  meta.gcsi <- '/mnt/work1/users/pughlab/projects/cancer_cell_lines/rnaseq_dat/data/GNE/gcsi_meta.tsv'
  pattern="[-\\(\\)\\.\\,\\_\\/ ]"
  
  meta <- read.table(meta, sep="\t", header=F, fill=TRUE)
  meta.gdsc <- read.table(meta.gdsc, sep="\t", header=T, fill=TRUE)
  meta.ccle <- read.table(meta.ccle, sep=",", header=T, fill=TRUE)
  meta.gcsi <- read.table(meta.gcsi, sep="\t", header=T, fill=TRUE)
  
  meta.gdsc$simpleid = toupper(gsub(pattern, "", meta.gdsc$Source.Name))
  meta.ccle$simpleid = toupper(gsub(pattern, "", meta.ccle$Cell_Line))
  meta.gcsi$simpleid = toupper(gsub(pattern, "", meta.gcsi$Cell_line))
  ov = sort(intersect(meta.gdsc$simpleid, meta.ccle$simpleid))  ## 61 from non simple.id, 79 simple
  gdsc.o = sort(setdiff(meta.gdsc$simpleid, meta.ccle$simpleid))
  ccle.o = sort(setdiff(meta.ccle$simpleid, meta.gdsc$simpleid))
  
  # Merge by EGAF(meta) to EGAR (meta.gdsc) and cell-name by EGAN id
  all.meta <- merge(meta, meta.gdsc, by.x='V2', by.y='Comment.EGA_SAMPLE.', all=TRUE)
  all.meta <- merge(all.meta, meta.ccle, by='simpleid', all=TRUE)
  all.meta <- all.meta[,c('V1','Source.Name', 'V4', 'Comment.EGA_RUN.', 'Run', 
                          'Cell_Line', 'simpleid')]
  
  meta.df$simpleid <- toupper(gsub(pattern, "", meta.df$ID))
  meta.df[grep("^T-T$", meta.df$ID),]$simpleid <- 'T-T'
  all.meta.df <- merge(all.meta, meta.df, by="simpleid", all=TRUE)
  
  colnames(all.meta.df)[1:8] <- c("simpleid", "V1", "GDSC_ID", "EGAF", "EGAR", "SRR", "CCLE_ID", "ID")
  ## Adding gCSI
  meta.gcsi <- meta.gcsi[,c(2, 3,10),drop=FALSE]
  colnames(meta.gcsi) <- c("gCSI_cellid", "gCSI_RNA", "simpleid")
  all.meta.df <- merge(all.meta.df, meta.gcsi, by='simpleid', all=TRUE)
  write.table(all.meta.df, file="~/rna_meta_df.csv", sep=",", col.names=TRUE, row.names=FALSE, quote=FALSE)
  
  return(all.meta.df)
}


###########################
#### Preliminary steps ####
###########################
## Run before executing any of the following 3 analyses
benchmarkCCLid <- function(bench){
  library(VariantAnnotation)
  library(CCLid)
  require(DNAcopy)
  
  ## Set dataset colors
  dataset.cols <- setNames(RColorBrewer::brewer.pal(6, "Dark2"),
                           c("GDSC", "CCLE", "gCSI", "CGP", "Pfizer"))
  
  refdir="/mnt/work1/users/home2/quever/git/CCLid-web/extdata"
  PDIR <- "/mnt/work1/users/pughlab/projects/cancer_cell_lines/CCL_paper/CCLid/CCLid"
  ref.dat <- CCLid::loadRef(refdir, 'baf', bin.size=5e5, just.var=TRUE)
  
  var.dat <- ref.dat$var
  ref.mat <- ref.dat$ref
}

############################
#### F1 Score Min.Snps  ####
############################
## This process is meant to calculate the minimum
## number of SNPS needed by calculating the F1 score
## for a series of different number of SNPs
minimumSnps <- function(){
  dataset <- 'GDSC'
  vcf.dir <- file.path('/mnt/work1/users/pughlab/projects/cancer_cell_lines/rnaseq_dat/vcfs',
                       dataset)
  all.vcfs <- list.files(vcf.dir, pattern="vcf.gz$")
  rna.meta.df <- readinRnaFileMapping()
  num.snps.to.test <- seq(100,10,by=-10)
  seed <- 1234
  
  set.seed(seed)
  # s.range <- sort(sample(1:length(all.vcfs), size=100))
  # f1.scores.by.snps <- lapply(seq(40, 10, by=-2), function(num.snps){
  vcf.f1.scores <- mclapply(all.vcfs, function(vcf){
    vcf.map <- mapVcf2Affy(file.path(vcf.dir, vcf))
    
    cat(paste0(vcf, "(", match(vcf, all.vcfs), "/", length(all.vcfs), "): "))
    f1.scores <- sapply(num.snps.to.test, function(num.snps){
      cat(paste0(num.snps, "."))
      idx <- sample(1:length(ref.dat$var), size=max(num.snps.to.test)*10, replace = FALSE)
      vcf.map.var <- mapVariantFeat(vcf.map, ref.dat$var[idx])
      vcf.map.var$BAF <- vcf.map.var$BAF[1:num.snps,]
      vcf.map.var$GT <- vcf.map.var$GT[1:num.snps,]
      
      vaf.to.map <- vcf.map.var
      
      ## Overlap the two VCFs to form a single matrix to combine
      ov.idx <- overlapPos(comp = vaf.to.map$BAF,
                           ref=ref.dat$ref, mapping = 'probeset')
      x.mat <- cbind(vaf.to.map$BAF$BAF[ov.idx$comp], 
                     ref.dat$ref[ov.idx$ref,])
      rna.idx <- grep(gsub(".snpOut.*", "", vcf), rna.meta.df$EGAF)
      colnames(x.mat)[1] <- paste0("RNA_", rna.meta.df[rna.idx,]$ID)
      if(class(ref.dat$ref) == 'big.matrix'){
        x.mat[,-1] <- x.mat[,-1] / 100
        
      }
      # if(rm.gne){
      #   gne.idx <- c(grep("^GNE_", colnames(x.mat)), grep("^Unk[0-9]", colnames(x.mat)))
      #   x.mat <- x.mat[,-gne.idx]
      # }
      
      ## Calculate distance between samples
      x.dist <- similarityMatrix(x.mat, 'euclidean')
      D.vals <- lapply(list("baf"=x.dist), splitConcordanceVals, meta.df=meta.df)
      balanced <- balanceGrps(D.vals)
      
      ## Train model
      models <- trainLogit(balanced, predictors=c('baf'))
      x.vals <- lapply(list("baf"=x.dist), splitConcordanceVals, meta.df=NULL)
      pred <- assemblePredDat(x.vals, known.class=FALSE)
      pred <- mkPredictions(pred, models)
      x.pred <- split(pred, pred$Var2)[[colnames(x.mat)[1]]]
      x.pred <- x.pred[order(x.pred$q),]
      
      rna.idx <- grep(gsub(".snpOut.*", "", vcf), rna.meta.df$EGAF)
      match.idx <- factor(grepl(paste0("_?", rna.meta.df[rna.idx,]$ID, "$"), x.pred$Var1), levels=c(TRUE,FALSE))
      c.tbl <- sapply(split(x.pred, match.idx), function(i) table(i$baf.p.fit))
      
      if(all(dim(c.tbl) == c(2,2))){
        c.tbl <- c.tbl[c("M", "NM"), c('TRUE', 'FALSE')]
        
        precision <- c.tbl[1,1] / (c.tbl[1,1] + c.tbl[1,2])
        recall <- c.tbl[1,1] / (c.tbl[1,1] + c.tbl[2,1])
        f1.score <- 2 * ((precision * recall) / (precision + recall))
        if(is.nan(f1.score)) f1.score <- 0
      } else {
        f1.score <- NULL
      }
      return(f1.score)
    })
    cat("\n")
    
    gc()
    return(f1.scores)
  }, mc.cores = 4)
  save(vcf.f1.scores, file=file.path(PDIR, "match_it", paste0("vcf_f1.", seed, "_", dataset, ".rda")))
  load(file.path(PDIR, "match_it", paste0("vcf_f1.", seed, "_", dataset, ".rda")))
  
  pdf(file.path(vcf.dir, paste0("f1score_", dataset, ".pdf")), height = 4, width=5)
  f1.mat <- do.call(rbind, lapply(vcf.f1.scores, unlist))
  colnames(f1.mat) <- num.snps.to.test
  dataset <- 'GDSC'
  boxplot(f1.mat, col="#0000ff22", las=1, outline=FALSE, 
          ylab='F1-Score', xlab='Num. of SNPs', main=dataset)
  rm.idx <- apply(f1.mat, 2, function(i) i >= median(i))
  f1.mat[rm.idx] <- NA
  colnames(f1.mat) <- rev(colnames(f1.mat))
  melt.f1 <- reshape::melt(f1.mat)
  
  beeswarm::beeswarm(value ~ X2, data = melt.f1, method = 'swarm', corral = 'gutter',
                     cex=0.8, pch = 16, add=TRUE, col=scales::alpha("black", 0.6))
  dev.off()
  scp.path <- "scp quever@192.168.198.99:"
  cat(paste0(scp.path, file.path(vcf.dir, paste0("f1score_", dataset, ".pdf")), ' .\n'))
  
}



##########################
#### Match All RNAseq ####
##########################
## Matches all the RNAseq data from CCLE and GDSC2
expressThis <- function(){
  dataset <- 'GNE' #GDSC
  ds.pattern = '(CCLE)_|(GDSC)_|(GDSC2)_|(CCLE2)_|(GNE)_|(GNE2)_'
  
  vcf.dir <- file.path('/mnt/work1/users/pughlab/projects/cancer_cell_lines/rnaseq_dat/vcfs',
                       dataset)
  all.vcfs <- list.files(vcf.dir, pattern="vcf.gz$")
  rna.meta.df <- readinRnaFileMapping()

  rna.identity <- mclapply(all.vcfs, function(vcf, num.snps=200, top.hits=4){
    vcf.map <- mapVcf2Affy(file.path(vcf.dir, vcf))
    
    cat(paste0(vcf, "(", match(vcf, all.vcfs), "/", length(all.vcfs), "): "))
    idx <- sample(1:length(ref.dat$var), size=max(num.snps)*10, replace = FALSE)
    vcf.map.var <- mapVariantFeat(vcf.map, ref.dat$var[idx])
    if(nrow(vcf.map.var$GT) < num.snps) num.snps <- nrow(vcf.map.var$GT)
    vcf.map.var$BAF <- vcf.map.var$BAF[1:num.snps,]
    vcf.map.var$GT <- vcf.map.var$GT[1:num.snps,]
    vaf.to.map <- vcf.map.var
    
    ## Overlap the two VCFs to form a single matrix to combine
    ov.idx <- overlapPos(comp = vaf.to.map$BAF,
                         ref=ref.dat$ref, mapping = 'probeset')
    x.mat <- cbind(vaf.to.map$BAF$BAF[ov.idx$comp], 
                   ref.dat$ref[ov.idx$ref,])
    if(class(ref.dat$ref) == 'big.matrix'){
      x.mat[,-1] <- x.mat[,-1]/100
      colnames(x.mat)[1] <- gsub(".snpOut.*", "", vcf)

    }
    # if(rm.gne){
    #   gne.idx <- c(grep("^GNE_", colnames(x.mat)), grep("^Unk[0-9]", colnames(x.mat)))
    #   x.mat <- x.mat[,-gne.idx]
    # }
    
    ## Calculate distance between samples
    x.dist <- similarityMatrix(x.mat, 'euclidean')
    D.vals <- lapply(list("baf"=x.dist), splitConcordanceVals, meta.df=meta.df)
    balanced <- balanceGrps(D.vals)
    
    ## Train model
    models <- trainLogit(balanced, predictors=c('baf'))
    x.vals <- lapply(list("baf"=x.dist), splitConcordanceVals, meta.df=NULL)
    pred <- assemblePredDat(x.vals, known.class=FALSE)
    pred <- mkPredictions(pred, models)
    x.pred <- split(pred, pred$Var2)[[colnames(x.mat)[1]]]
    x.pred <- x.pred[order(x.pred$q),]
    
    if(length(grep("^M$", x.pred$baf.p.fit)) < top.hits){
      return(x.pred[1:top.hits,])
    } else {
      return(x.pred[grep("^M$", x.pred$baf.p.fit), ])
    }
  }, mc.cores = 6)
  names(rna.identity) <- gsub(".snpOut.*", "", all.vcfs)
  err.idx <- sapply(rna.identity, function(i) class(i) == 'try-error')
  if(any(err.idx)) rna.identity <- rna.identity[-which(err.idx)]
  save(rna.identity, file=file.path(PDIR, "match_it", paste0(dataset, "_rnaID.rda")))
  load(file.path(PDIR, "match_it", paste0(dataset, "_rnaID.rda")))
  
  ## Annonate the CVCL IDs
  rna.cvcl <- mclapply(names(rna.identity), function(id, dataset){
    print(id)
    rna.id <- rna.identity[[id]]
    file.prefix <- switch(dataset,
                          "GDSC"="EGAF",
                          "CCLE"="SRR",
                          "GNE"="gCSI_RNA")
    if(any(grepl(paste0("^", id, "$"), rna.meta.df[[file.prefix]]))){
      id.x <- rna.meta.df[grep(paste0("^", id, "$"), as.character(rna.meta.df[[file.prefix]])),]$ID[1]
      if(is.na(id.x) & dataset == 'GNE'){
        require(dplyr)
        gcsi.id <- rna.meta.df[grep(paste0("^", id, "$"), as.character(rna.meta.df[[file.prefix]])),]$gCSI_cellid
        rna.id$Var2 <- gsub("^Caco", "CACO", gcsi.id) %>%
          gsub(" ", "-", .) %>%
          gsub("A4/", "A4-", .) %>%
          gsub("^CI-", "Ci-", .) %>%
          gsub("OCI-LY-", "OCI-Ly", .) %>%
          gsub("RAJI", "Raji", .) %>%
          gsub("RAMOS", "Ramos", .) %>%
          gsub("^RI-1", "Ri-1", .) %>%
          gsub("^SC-1", "Sc-1", .) %>%
          gsub("^OVCA(-)?", "OvCA", .) %>%
          gsub("^928-mel", "928-MEL", .) %>%
          gsub("^SNU-1", "NCI-SNU-1", .) %>%
          gsub("^CACO-2", "CACO2", .) %>%
          gsub("^Okajima", "OKAJIMA", .) %>%
          gsub("-Paca-", "-PaCa-", .) %>%
          gsub("PANC-1", "Panc-1", .) %>%
          gsub("^HUP-T4", "HuP-T4", .) %>%
          gsub("^LS-174T", "LS174T", .) %>%
          gsub("^786-O", "786-0", .) %>%
          gsub("^SET-2", "Set-2", .) %>%
          gsub("SW-527", "SW527", .)
      } else {
        rna.id$Var2 <-  id.x
      }
      
      rna.id$clA <- as.character(gsub(ds.pattern, "", rna.id$Var1))
      rna.id$clB <- as.character(gsub(ds.pattern, "", rna.id$Var2))
      rna.id$cvclA <- meta.df[match(rna.id$clA, meta.df$ID),]$CVCL
      rna.id$cvclB <- meta.df[match(rna.id$clB, meta.df$ID),]$CVCL
      rna.id$g.truth <- with(rna.id, clA == clB)
      rna.id[rna.id == 'character(0)'] <- 'CVCL_X482'
      rna.id[is.na(rna.id)] <- 'CVCL_X482'
      if(any(grepl("\\\t$", rna.id$cvclA))) rna.id$cvclA <- gsub("\\\t$", "", rna.id$cvclA)
      if(any(grepl("\\\t$", rna.id$cvclB))) rna.id$cvclB <- gsub("\\\t$", "", rna.id$cvclB)
      rna.id$cellosaurus <- CCLid:::checkAgainst(rna.id) #CCLid:::
      rna.id[order(rna.id$cellosaurus, decreasing=TRUE),]
    }  else {
      rna.id$Var2 <-  id
      rna.id
    }
  }, dataset=dataset, mc.cores = 10)
  names(rna.cvcl)  <- names(rna.identity)
  x <- sapply(rna.cvcl, function(i) any(grepl('g.truth', colnames(i))))
  no.match.idx <- which(sapply(rna.cvcl, function(i) !any(i$g.truth)))
  match.idx <- which(sapply(rna.cvcl, function(i) any(i$g.truth)))
  
  ## Descriptive stats of the number of "matching" and "nonmatching" based on CVCL
  print(paste0("Match: ", length(match.idx),  " / ", length(rna.cvcl)))
  print(paste0("No-match: ", length(no.match.idx),  " / ", length(rna.cvcl)))
  
  ## Concatenate everything and output csv
  rna.m <- plyr::rbind.fill(lapply(rna.cvcl[match.idx], function(i){
    i[which(i$baf.p.fit == 'M'),]
  }))
  rna.nm <- plyr::rbind.fill(rna.cvcl[no.match.idx])
  rna.mnm <- plyr::rbind.fill(rna.m, rna.nm)
  
  table(sapply(rna.cvcl[no.match.idx], function(i) i[1,'baf.p.fit'])) # GDSC-M:14  NM:19 || CCLE-M:13  NM:44 
  table(sapply(rna.cvcl[match.idx], function(i) any(i$g.truth))) # GDSC-M:14  NM:19 || CCLE-M:13  NM:44 
  
  cat(paste0("grep \"", names(rna.cvcl[no.match.idx]), "\" *\n"), "\n")
  
  write.table(rna.mnm, file.path(PDIR, "match_it", paste0(dataset, "_rnaID.csv")),
              sep=",", quote=FALSE, row.names=FALSE, col.names=TRUE)
  scp.path <- "scp quever@192.168.198.99:"
  cat(paste0(scp.path, file.path(PDIR, "match_it", paste0(dataset, "_rnaID.csv")), ' .\n'))
}
  
######################################################
#### Genotype concordance between every cell line ####
######################################################
## This process is meant to compare the genotypes using
## BAF between every cell line in all datasets used
snpsCellIdentity <- function(){
  vcf.map.var <- mapVariantFeat(ref.mat, var.dat)
  x.mat <- as.matrix(vcf.map.var)
  storage.mode(x.mat) <- 'numeric'
  ds.pattern = '(CCLE)_|(GDSC)_|(GDSC2)_|(CCLE2)_|(GNE)_|(GNE2)_'
  #x.mat[is.na(x.mat)] <- median(x.mat, na.rm=TRUE)
  
  x.dist <- similarityMatrix(x.mat, 'euclidean')
  D.vals <- lapply(list("baf"=x.dist), splitConcordanceVals, meta.df=meta.df)
  balanced <- balanceGrps(D.vals)
  
  models <- trainLogit(balanced, predictors=c('baf'))
  x.vals <- lapply(list("baf"=x.dist), splitConcordanceVals, meta.df=NULL)
  pred <- assemblePredDat(x.vals, known.class=FALSE)
  pred <- mkPredictions(pred, models)
  
  ## Format extra columns on prediction matrix
  pred$Var1 <- as.character(pred$Var1)
  pred$Var2 <- as.character(pred$Var2)
  pred$clA <- as.character(gsub(ds.pattern, "", pred$Var1))
  pred$clB <- as.character(gsub(ds.pattern, "", pred$Var2))
  pred$cvclA <- meta.df[match(pred$clA, meta.df$ID),]$CVCL
  pred$cvclB <- meta.df[match(pred$clB, meta.df$ID),]$CVCL
  pred$g.truth <- with(pred, clA == clB)
  pred$g.truth <- c("NM", "M")[as.integer(pred$g.truth) + 1]
  pred$g.truth <- factor(pred$g.truth, levels=c("M", "NM"))
  pred$baf.p.fit <- factor(pred$baf.p.fit, levels=c("M", "NM"))
  datasets <-c('CCLE', 'GDSC', 'GNE')
  pred$ds <- with(pred, paste(Var1, Var2, sep=";")) %>% 
    gsub("_.*;", ";", .) %>% gsub("_.*", "", .)
  
  ## Split based on datasets
  comb <- cbind(combn(datasets, m = 2),
                matrix(rep(datasets, 2), ncol=3, byrow = TRUE))
  cmb.pred <- apply(comb, 2, function(ds){
    id1 <- paste(ds[1], ds[2], sep=";")
    id2 <- paste(ds[2], ds[1], sep=";")
    idx <- as.logical((pred$ds == id1) + (pred$ds == id2))
    split(pred, f=idx)[['TRUE']]
  })
  names(cmb.pred) <- apply(comb, 2, paste, collapse="-")

  ## Plot the confusion matrix between any two datasets
  pdf(file.path(PDIR, "match_it", "gne-gdsc-ccle_conc.pdf"), height = 12)
  par(mfrow=c(6,3), mar=c(3, 2, 3, 2))
  P.m.nm <- lapply(names(cmb.pred), function(pid){
    p <- cmb.pred[[pid]]
    conf.m <- table(p$baf.p.fit, p$g.truth)
    fourfoldplot(conf.m, space = 0.2)
    # conf.m[2,2] <- sum(p$g.truth=='M')
    conf.m[2,2] <- 600
    fourfoldplot(conf.m, space = 0.2, conf.level = 0, std='ind.max', 
                 main=pid, col=c("#ca0020", "#404040"))
    
    p.m.nm <- CCLid:::splitToMNM(p) #CCLid:::
    p.m.nm$cellosaurus <- CCLid:::checkAgainst(p.m.nm) #CCLid:::
    err.pcl <- CCLid:::genErrBp(p.m.nm) #CCLid:::
    p.m.nm <- cbind(p.m.nm, err.pcl)
    p.m.nm <- p.m.nm[order(err.pcl[,1], err.pcl[,2]),]
    
    print(table(err.pcl))
    barplot(t(table(err.pcl)), horiz=TRUE, las=1, xlim=c(0,80), 
            col=c("FALSE"="#f4a582", "TRUE"="#b2182b"), border=NA)
    return(p.m.nm)
  })
  names(P.m.nm) <- names(cmb.pred)
  dev.off()
  cat("rl ", file.path(PDIR, "match_it", "gne-gdsc-ccle_conc.pdf\n"))

  ## Write otu unknown cases for manual curation
  lapply(names(P.m.nm), function(Pid){
    P <- P.m.nm[[Pid]]
    write.table(P[,-c(6:10)], file=file.path(PDIR, "match_it", paste0("X-", Pid, ".tsv")),
                quote = FALSE, sep = "\t", row.names = FALSE, col.names=TRUE)
  })

  lapply(c("CCLE", "GDSC", "GNE"), function(ds.id){
    ds.cl.ids <- sapply(names(P.m.nm), function(Pid){
      P <- P.m.nm[[Pid]]
      row.idx <- which(P$err=='X' & (!as.logical(P$pcl)))
      cl.ids <- c(P[row.idx,]$Var1, P[row.idx,]$Var2)
      unique(cl.ids[grep(ds.id, cl.ids)])
    })
    cat(paste(unique(sort(unlist(ds.cl.ids))), collapse="\n"))
  })
  
  lapply(names(P.m.nm), function(Pid){
    P <- P.m.nm[[Pid]]
    write.table(P[,-c(6:10)], file=file.path(PDIR, "match_it", paste0("X-", Pid, ".tsv")),
                quote = FALSE, sep = "\t", row.names = FALSE, col.names=TRUE)
  })
}
quevedor2/CCLid documentation built on July 15, 2020, 4:05 a.m.