In darachm/rheocyto: Some FlowCytometry Analysis Tools

"I hear and I forget. I see and I remember. I copy this file and modify it and I have a reproducible record of my analyses."

Confucious

This vignette sets out an example of how to grab some FCS files (envisioned as being generated by the Accuri) into R, gate on it a bit, then make a data.frame suitable for plotting with ggplot. It has a summary of results and quality control steps.

You should be familiar with your flow cytometer

This is designed to work with an example dataset of testing sytoxgreen staining for dna content flow cytometery of budding yeast. Download it, this vignette expects it. There's no sample sheet, and the filenames are sorta cryptic. The README has the codings though.

First, we read in the FCS files and turn them into a giant data.frame.

if (exists("oneBigFlowSet")) { rm(oneBigFlowSet) }

datadir <- "exampleDatasetAccuriSeveralFCS/"
for ( fz in dir(datadir) ) {

  print(paste("Reading", datadir))

  tmp <- exprs(read.FCS(paste0(datadir,fz),
    transformation="linearize"))

  if (exists("oneBigFlowSet")) {
    oneBigFlowSet <- rbind2(oneBigFlowSet,tmp)
  } else {
    oneBigFlowSet <- tmp
  }

}

require(flowCore)
require(rheocyto)
require(ggplot2)

if (length(dir("/home/zed/lab/data/mine/15/dec15/151215_exp160r3/"))) {
    directories <- c(
        exp160r3="/home/zed/lab/data/mine/15/dec15/151215_exp160r3/")
}

datz <- list()
for (expr in names(directories)) {
    dirz <- directories[expr]
    for (fz in dir(dirz)[grep("fcs",dir(dirz))]) {
        if (grepl("(water)|(decon)|(h2o)",fz)) {next;}
        print(paste0("doing ",dirz,"",fz))
        datz[[paste0(expr, warning=F,"_",sub(".fcs","",fz))]] <- 
            data.frame(exprs(read.FCS(paste0(dirz,fz),
                transformation=FALSE)))
    }
}

if (exists("lrgdf")) {rm(lrgdf)}
for (fz in names(datz)) {
    if (grepl("(water)|(decon)|(h2o)",fz)) {next;}
    if (!exists("lrgdf")) {
        lrgdf <- data.frame(datz[[as.character(fz)]],
                conc=  sub(".+\\s(\\S+)uM_\\d+$","\\1",fz),
                pktime=sub(".+\\s\\S+uM_(\\d+)$","\\1",fz))
    } else {
        lrgdf <- rbind(lrgdf,data.frame(datz[[as.character(fz)]],
                conc=  sub(".+\\s(\\S+)uM_\\d+$","\\1",fz),
                pktime=sub(".+\\s\\S+uM_(\\d+)$","\\1",fz)))
    }
}

dim(lrgdf)

First, exploratory plots. Gate on FSC and SSC.

ggplot(lrgdf)+theme_bw()+
    aes(x=log(FSC.A),y=log(FSC.H))+facet_grid(conc~pktime)+
    geom_point(size=.5,alpha=.01)

subdf <- subset(lrgdf,log(FSC.A)>11.0)

ggplot(subdf)+theme_bw()+
    aes(x=log(FSC.A),y=log(FL1.A))+
    facet_grid(conc~pktime)+
    geom_point(size=1,alpha=0.01)

ggplot(subdf)+theme_bw()+
    aes(x=log(FSC.H),y=log(FL1.H))+
    facet_grid(conc~pktime)+
    geom_point(size=1,alpha=0.01)

subdf <- subset(lrgdf,log(FL1.H)>12.0)

ggplot(subdf)+theme_bw()+aes(x=log(FL1.A),color=conc)+
    facet_wrap(~pktime)+
    stat_bin(aes(y = ..density..),binwidth=0.02,
        geom="line",position="identity",cex=.5)+
    stat_bin(aes(y = ..density..),binwidth=0.02,
        geom="point",position="identity",cex=2)+
    scale_color_discrete("uM sytox green")+
    xlim(13.5,15.5)

ggplot(subdf)+theme_bw()+aes(x=log(FL1.H),color=conc)+
    facet_wrap(~pktime)+
    stat_bin(aes(y = ..density..),binwidth=0.02,
        geom="line",position="identity",cex=.5)+
    stat_bin(aes(y = ..density..),binwidth=0.02,
        geom="point",position="identity",cex=2)+
    scale_color_discrete("uM sytox green")+
    xlim(13.5,15.5)

ggplot(melt(subset(subdf,conc==1),id.vars="pktime",
        measure.vars=c("FL1.A","FL1.H")))+
    theme_bw()+aes(x=log(value),color=variable)+
    facet_wrap(~pktime)+
    stat_bin(aes(y = ..density..),binwidth=0.02,
        geom="line",position="identity",cex=.5)+
    stat_bin(aes(y = ..density..),binwidth=0.02,
        geom="point",position="identity",cex=2)+
    scale_color_discrete("uM sytox green")+
    xlim(13.5,15.5)
ggplot(melt(subset(subdf,conc==1),id.vars="pktime",
        measure.vars=c("FL1.A","FL1.H")))+
    theme_bw()+aes(x=log(value),color=pktime)+
    facet_wrap(~variable)+
    stat_bin(aes(y = ..density..),binwidth=0.02,
        geom="line",position="identity",cex=.5)+
    stat_bin(aes(y = ..density..),binwidth=0.02,
        geom="point",position="identity",cex=2)+
    scale_color_discrete("uM sytox green")+
    xlim(13.5,15.5)

Ok, so use a 60 min digest, then do FL1.H for the modeling.

For expected results:

g <- ggplot(melt(subset(subdf,pktime==60&conc==1),
        measure.vars=c("FL1.A","FL1.H"),id.vars=c("FSC.A")))+
    theme_bw()+
    aes(x=log(FSC.A),y=log(value))+
    facet_grid(~variable)+
    xlim(11,13.5)+ylim(13.5,15)+
    geom_point(size=3,alpha=0.01)
g
ggsave("/home/zed/lab/outz/figz/sytoxgreenRepresentativeFSCvsFL1.png",
    g,width=6,height=6)

g <- ggplot(melt(subset(subdf,pktime==60&conc==1),
        measure.vars=c("FL1.A","FL1.H"),id.vars=c("FSC.A")))+
    theme_bw()+aes(x=log(value))+
    facet_wrap(~variable)+
    stat_bin(aes(y = ..density..),binwidth=0.01,
        geom="line",position="identity",cex=.5)+
    stat_bin(aes(y = ..density..),binwidth=0.01,
        geom="point",position="identity",cex=2)+
    xlim(13.5,15.5)
g
ggsave("/home/zed/lab/outz/figz/sytoxgreenRepresentativeFL1AvsH.png",
    g,width=6,height=6)

source("src/requireInstall.R")
requireInstall("reshape2")
requireInstall("ggplot2")
requireInstall("flowCore",isBioconductor=T)
requireInstall("flowViz",isBioconductor=T)
requireInstall("Hmisc",isBioconductor=T)
requireInstall("gridExtra")
requireInstall("flowStats",isBioconductor=T)
requireInstall("flowWorkspace",isBioconductor=T)
requireInstall("ggcyto", isBioconductor=T)

Step 2: Read in .fcs files

Read in all data for analysis. Data should be in individual directories that contain .fcs files and a corresponding sample sheet with a generic format. FCS file names need to match the SampleSheet.txt.

Want to make your SampleSheet.txt? Use a GUI spreadsheet thingy, like excel or googledocs or localc or edit() in R. The format is:

| FileName | Ploidy | Dye | ... | |:---|:---|:---|:---| |A01.fcs|1.5ploid|acridineOrange|...| |A02|2.5ploid|acridineOrange|...| |A03.fcs|1.5ploid|sytoselect|...|

Where the only mandatory variable name is FileName, then all the others are whatever you're using.

An abitrary number of directories can be used, put the paths in the dataDirectoryz vector. Here we use relative becuase it should be reproducible between users. You should use ABSOLUTE for your real work. Don't have this folder? Download it (look up).

Now you have oneBigFlowSet. Great. For gating, we'll turn that into a data.frame:

source("src/fs2df.R")
datar <- fs2df(oneBigFlowSet)

Step 3: gate

So I can't actually figure out how to make rmarkdown play with interactive chunks. So you should see an error right about....

gateFilename <- paste0("gatesFor",digest::digest(oneBigFlowSet),".RData")
load(gateFilename)

Okay, so now go to your terminal and run this:

source("src/gator.R")
gatez <- list()
gatez[["singlet"]] <- gator2d(datar,ehx="FSC.A",why="FSC.H",
  gateName="singlet") 
gatez[["cells"]] <- gator2d(datar,ehx="FSC.A",why="SSC.A", 
  gateName="cells") 
gatez[["gfpPos"]] <- gator2d(datar,ehx="FSC.A",why="FL1.A",
  gateName="gfpPositive") 
gatez[["gfpNeg"]] <- gator2d(datar,ehx="FSC.A",why="FL1.A", 
  gateName="gfpDepressing") 
save(file=gateFilename,list=c("gatez"))

If you screw it up, delete the RData file of the gates, and re-run.

how they look?

#source("src/gateCheck.R")
#gateCheck()
#make a function so we can re-call it after starting to apply these
for (gateName in names(gatez)) {
  print(gateName)
  ingate  <- data.frame(
    fs2df(Subset(oneBigFlowSet,gatez[[gateName]])),thisGate=T)
  outgate <- data.frame(
    fs2df(Subset(oneBigFlowSet,!gatez[[gateName]])),thisGate=F)
  shuffle <- rbind(ingate,outgate)
  shuffle <- shuffle[sample(1:nrow(shuffle)),]
  names(shuffle)[which(names(shuffle)=="thisGate")] <- gateName
  print(
    ggplot(shuffle)+
      facet_wrap(~name)+
      aes_string(x=colnames(gatez[[gateName]]@boundaries)[1],
                 y=colnames(gatez[[gateName]]@boundaries)[2],
                 col=gateName)+
      geom_point(size=0.2,alpha=0.5)+
      scale_y_log10()+
      scale_x_log10()
  )
}

Subset the data by applying sequential gates

"When you have doublets, do not fear to abandon them."

Confucious

rm(filteredDatar)
for (gateName in c("singlet","cells")) {
  print(gateName)
  if (exists("filteredDatar")) {
    filteredDatar  <- Subset(filteredDatar,gatez[[gateName]])
  } else {
    filteredDatar  <- Subset(oneBigFlowSet,gatez[[gateName]])
  }
}
#gateCheck() call would go here

Buidling a better datastructure, for later use

source("src/splitFSbyGate.R")
gatedFilteredDatar <- filteredDatar
for (gateName in setdiff(names(gatez),c("singlet","cells"))) {
  gatedFilteredDatar <- splitFSbyGate(gatedFilteredDatar,
    gatez[[gateName]])
}
gfdatar <- fs2df(gatedFilteredDatar)

Step 4: Plotz

Into usable form:

head(gfdatar)
mfdatar <- melt(gfdatar,
  id.vars=c("Strain","ul500nMDye","Ploidy","Media","name","gfpPos","gfpNeg"),
  measure.vars=c("FSC.A","FSC.H","FL1.A","FL1.H","FL2.A","FL2.H"))
head(mfdatar)

summary statistics

Many of the gatedFilteredDatar things will be NA's, that's ok. It's because there's not many members in those particular gate/flowframes.

fsApply(filteredDatar,function(x){
  data.frame(t(each_col(x,summary)))})
fsApply(gatedFilteredDatar,function(x){
  data.frame(t(each_col(x,summary)))})

for each sample plot a histogram of the normalized data, raw FSC and raw GFP per row

ggplot(mfdatar)+
  facet_grid(name~variable,scales="free")+
  aes(x=log10(value))+
  geom_histogram(bins=50)

extract data from flowFrames to plot histograms of values and record summary statistics

aggregate(data=mfdatar,value~variable+name,FUN=mean)
aggregate(data=mfdatar,value~variable+name,FUN=median)

Specific plotz

ggplot(subset(mfdatar,variable%in%c("FL2.A","FL2.H")))+
  aes(x=log10(value))+facet_grid(name~variable)+
  geom_histogram()+
  geom_vline(xintercept=4,col="cyan")

ggplot(subset(gfdatar))+
  aes(x=log10(FL2.A/FSC.A))+
  facet_grid(name~.)+
  geom_histogram()+
  geom_vline(xintercept=0,col="cyan")

ggplot(subset(gfdatar))+
  aes(x=log10(FL1.A/FL2.A))+
  facet_grid(name~.)+
  geom_histogram()+
  geom_vline(xintercept=0,col="cyan")

ggplot(subset(gfdatar))+
  aes(x=log10(FSC.H/FL2.A))+
  facet_grid(name~.)+
  geom_histogram()+
  geom_vline(xintercept=0,col="cyan")

ggplot(subset(mfdatar,variable%in%c("FSC.A","FL1.A","FL2.A")))+
  aes(x=name,y=(value))+facet_grid(~variable,scales="free")+
  geom_violin()+scale_y_log10()+
  theme(axis.text.x=element_text(angle=90))

ggplot(subset(mfdatar,variable%in%c("FSC.A","FL1.A","FL2.A")))+
  aes(x=name,y=(value))+facet_grid(~variable,scales="free")+
  geom_boxplot()+scale_y_log10()+
  theme(axis.text.x=element_text(angle=90))

Comparison of FL1.A to 0nM

aggregate(data=gfdatar,FL1.A~ul500nMDye,FUN=median)
medianFL1Anodye <- median(subset(gfdatar,ul500nMDye==0)$FL1.A)
ggplot(gfdatar)+
  aes(x=ul500nMDye,y=FL1.A/medianFL1Anodye,group=name)+
  geom_boxplot()+scale_y_log10()+
  theme(axis.text.x=element_text(angle=90))

Comparison of FSC.A to 0nM

aggregate(data=gfdatar,FSC.A~ul500nMDye,FUN=median)
medianFSCAnodye <- median(subset(gfdatar,ul500nMDye==0)$FSC.A)
ggplot(gfdatar)+
  aes(x=ul500nMDye,y=FSC.A/medianFSCAnodye,group=name)+
  geom_boxplot()+scale_y_log10()+
  theme(axis.text.x=element_text(angle=90))

Population composition

t(apply(
  data.frame(
    aggregate(data=gfdatar,cbind(gfpPos,gfpNeg)~ul500nMDye,
      FUN=function(x){sum(x==T)}),
    aggregate(data=gfdatar,name~ul500nMDye,FUN=length)
   ),1,function(x){return(c(x[1],x[2:3]/x[5]))}))

Step 5: Quality control

“Great Scientist calculates in terms of the Control of the Quality of the Apparatus, not in terms of earning a living. Agriculture is inspired by the fear of hunger; study, by an interest in salary. Great Scientist is concerned about the Apparatus; not about poverty. Yet.”

Confucious

Scroll up to see how the gates did. Below is an example of ggplot().

ggplot(gfdatar)+
  facet_wrap(~ul500nMDye)+
  aes(x=FSC.A,y=FL1.A,col=gfpNeg)+
  geom_point(alpha=0.5,size=0.2)+
  scale_y_log10()+scale_x_log10()

Data transformation for visualization

“Data can make the Transformation great; it isn’t Transformation which makes the Data great.”

Confucious

In order to look at QC plots the data is transformed using the logicle transform, which is a log transform for high values that transitions to a linear transformation near zero values. This is simply for visualization purposes.

the parameters w,t, and m define the transformation parameters Below, dataset 1 tranformation applied to every channel except width and time.

lgcl <- logicleTransform(w = 0.5, t= 10000, m=4.5)
dataLGCLTransform <- transform(gatedFilteredDatar,'FSC.A' = lgcl(`FSC.A`), 'SSC.A' =lgcl(`SSC.A`), 'FL1.A' = lgcl(`FL1.A`), 'FL2.A' = lgcl(`FL2.A`), 'FL3.A' = lgcl(`FL3.A`), 'FL4.A' = lgcl(`FL4.A`),'FSC.H' = lgcl(`FSC.H`),'SSC.H' = lgcl(`SSC.H`),'FL1.H' = lgcl(`FL1.H`),'FL2.H' = lgcl(`FL2.H`),'FL3.H' = lgcl(`FL3.H`),'FL4.H' = lgcl(`FL4.H`))

Effect of time

"Study the past if you would define the future."

Confucious

ggplot(gfdatar)+
  aes(x=Time,y=FL1.A,col=factor(ul500nMDye))+
  geom_point(size=0.2,alpha=0.5)+
  stat_smooth()+scale_y_log10()

Two more plots

ggplot(gfdatar[sample(1:nrow(gfdatar)),])+
  aes(x=FSC.A,y=FL1.A,col=factor(ul500nMDye))+
  geom_point(size=0.2,alpha=0.5)+
  scale_y_log10()+scale_x_log10()

ggplot(gfdatar[sample(1:nrow(gfdatar)),])+
  aes(x=FSC.A,y=SSC.A,col=factor(ul500nMDye))+
  geom_point(size=0.2,alpha=0.5)+
  scale_y_log10()+scale_x_log10()