Summary

This script prepares cell-level data and metadata for the Pilot MEP LINCs Analysis Pipeline.

In the code, the variable ss determines which staining set (SS1, SS2 or SS3) to merge and the variable cellLine determines the cell line. All .txt data files in the "./RawData" folder will be merged with the well (xlsx) and log (XML) data from the "./Metadata" folder.

#Author: Mark Dane, copyright 8/2015
library("limma")#read GAL file and strsplit2
library("MEMA")#merge, annotate and normalize functions
library("data.table")#fast file reads, data merges and subsetting
library("parallel")#use multiple cores for faster processing

#Select a staining set
ss <- "SS2"
densityThresh <- 0.4
outerThresh <- 0.5

#Filter out debris based on nuclear area
nuclearAreaThresh <- 1000

The merging assumes that the actual, physical B row wells (B01-B04) have been printed upside-down. That is, rotated 180 degrees resulting in the spot 1, 1 being in the lower right corner instead of the upper left corner. The metadata is matched to the actual printed orientation.

#Read in the spot metadata from the gal file
smd <- readSpotMetadata(system.file("extdata/Metadata","20150515_LI8X001_v1.gal",package="MEMA"))
#Relabel the column Name to ECMpAnnotID
setnames(smd, "Name", "ECMpAnnotID")

#Make a short name for display of ECMpAnnotID
#Remove the first underscore and all text after it
smd$ECMp <- gsub("_.*","",smd$ECMpAnnotID)
#Replace any dashes with the word blank
smd$ECMp <- gsub("-","blank",smd$ECMp)

#Add the print order and deposition number to the metadata
ldf <- readLogData(system.file("extdata/Metadata","20150512-112336.xml",
                                    package="MEMA"))
spotMetadata <- merge(smd,ldf, all=TRUE)
setkey(spotMetadata,Spot)
#Make a rotated version of the spot metadata to match the print orientation
spotMetadata180 <- rotateMetadata(spotMetadata)
ARowMetadata <- data.table(spotMetadata,Well=rep(c("A01", "A02","A03","A04"),each=nrow(spotMetadata)))
BRowMetadata <- data.table(spotMetadata180,Well=rep(c("B01", "B02","B03","B04"),each=nrow(spotMetadata180)))

The well metadata describes the cell line, ligands and staining endpoints that are all added on a per well basis. There is one mutlisheet .xlsx file for each plate. Each filename is the plate's barcode.

The raw data files are stored in a "RawData" folder inside a folder for each staining set.There is a main raw data file for each well. Staining sets with cytoplasmic data include raw data files with the word "Cyto" inplace of the word "Main".

The raw data from all wells in all plates in the dataset are read in and merged with their spot and well metadata. The number of nuclei at each spot are counted. Then all intensity values are normalized through dividing them by the median intensity value of the control well in the same plate. Next, the data is filtered to remove objects with a nuclear area less than r nuclearAreaThresh pixels.

#The next steps are to bring in the well metadata, the print order and the ScanR data

cellDataFiles <- dir(system.file("extdata/RawData",package="MEMA"),
                     full.names = FALSE)
splits <- strsplit2(strsplit2(cellDataFiles,split = "_")[,1],"/")
barcodes <- unique(splits[,ncol(splits)])
expDTList <- mclapply(barcodes, function(barcode){
  plateDataFiles <- grep(barcode,cellDataFiles,value = TRUE)
  wells <- unique(strsplit2(split = "_",plateDataFiles)[,2])
  wellDataList <- lapply(wells,function(well){
    #browser()
    wellDataFiles <- grep(well,plateDataFiles,value = TRUE)
    mainDataFile <- grep("main",wellDataFiles,value=TRUE,ignore.case = TRUE)
    cytoDataFile <- grep("Cyto",wellDataFiles,value=TRUE,ignore.case = TRUE)

    #Read in and merge the main and cyto data for each well
    mainDT <- convertColumnNames(fread(system.file("extdata/RawData", mainDataFile, package="MEMA"),stringsAsFactors=FALSE))
    #Add the stain location for each wavelength in the 'main' file
    waveLengths <- grep("Intensity",colnames(mainDT), value=TRUE)
    for(waveLength in waveLengths){
      mainDT <- mainDT[,paste0("Location",waveLength) := "Nucleus", with=FALSE]
    }

    if(length(cytoDataFile)) {
      cytoDT <- convertColumnNames(fread(system.file("extdata/RawData", cytoDataFile, package="MEMA"),stringsAsFactors=FALSE))
      #Add the stain location for each wavelength in the 'Cyto' file
      waveLengths <- grep("Intensity",colnames(cytoDT), value=TRUE)
      for(waveLength in waveLengths){
        cytoDT <- cytoDT[,paste0("Location",waveLength) := "Cytoplasm", with=FALSE]
      }
      #Merge the cyto data to the main data using  ParentObjectIDMO
      # in the cyto file and ObjectID in the main file
      setkey(mainDT,key="ObjectID")
      setkey(cytoDT,key="ParentObjectIDMO")
      DT <- mainDT[cytoDT]
    } else {
      DT <- mainDT
    }

    #clean up the column names
    deleteNames <- colnames(DT)[colnames(DT) %in% c("Position","ParentObjectIDWell","ParentTraceID","i.ObjectID","i.ParentTraceID", "i.Area")]
    DT <- DT[,deleteNames :=NULL, with = FALSE]

    setnames(DT,"Well","Spot")
    #Add the well name as a parameter
    DT$Well <- well
    #Merge the data with its metadata based on the row it's in
    m <- regexpr("[[:alpha:]]",well)
    row <- regmatches(well,m)
    setkey(DT,Spot)
    DT <- switch(row,A = merge(DT,spotMetadata,all=TRUE),B = merge(DT,spotMetadata180,all=TRUE))
    #Add the well name again to fill in NA values
    DT$Well <- well
    return(DT)
  })

  #Create the cell data.table with spot metadata for the plate 
  pcDT <- rbindlist(wellDataList, fill = TRUE)
  #Read the well metadata from a multi-sheet Excel file
  wellMetadata <- data.table(readMetadata(system.file(paste0("extdata/Metadata/",barcode,".xlsx"),package="MEMA")), key="Well")
  #Create a ligand display name by removing the first underscore and all trailing text
  wellMetadata$Ligand <- gsub("_.*","",wellMetadata$LigandAnnotID)
  #merge well metadata with the data and spot metadata
  #browser()
  pcDT <- merge(pcDT,wellMetadata,by = "Well")
  pcDT <- pcDT[,Barcode := barcode]
  #Count the cells at each spot
  pcDT<-pcDT[,SpotCellCount := length(ObjectID),by="Barcode,Well,Spot"]

  #Add a total DAPI column
  pcDT$TotalIntensityDAPI <- pcDT$Area*pcDT$MeanIntensityDAPI
  #If there is a highSerum well in the plate, use it for normalization
  if(sum(pcDT$Ligand=="HighSerum")){
    intensityNames <- grep("^MeanIntensity|^TotalIntensity",colnames(pcDT), value=TRUE)
    for(intensityName in intensityNames){
      #Median normalize to the plate's control well for each channel's value
      pcDT <- pcDT[,paste0(intensityName,"MedNorm") := normWellsWithinPlate(.SD, value=intensityName, baseECM = ".*",baseGF = "HighSerum"), by="Barcode"]
    }

    pcDT <- pcDT[,SpotCellCountMedNorm := normWellsWithinPlate(.SD, value="SpotCellCount", baseECM = ".*",baseGF = "HighSerum"), by="Barcode"]
  }
  pcDT <- pcDT[pcDT$Area >nuclearAreaThresh,]
  return(pcDT)
})
cDT <- rbindlist(expDTList, fill = TRUE)

After merging the metadata with the cell-level data, several types of derived parameters are added. These include:

The origin of coordinate system is placed at the median X and Y of each spot and the local cartesian and polar coordinates are added to the dataset.

Each spot is divided into wedge-shaped bins and the wedge bin value for each cell is added to the dataset. The density around each cell is calculated from the number of nuclear centers within a radius around each nuclei. The Density value is thresholded to classify each cell as Sparse or not.The distance from the local origin is used to classify each cell as an OuterCell or not. The Sparse, OutCell and Wedge classifications are used to classify each cell as a Perimeter cell or not.

For staining set 2, each cell is classified as EdU+ or EdU-. The threshold for EdU+ is based on kmeans threshold of the mean EdU intensity from the control well of each plate.

The intensity values are normalized at each spot so that spot-level variations can be analyzed.

#Add the local cartesian and polar coordinates, Wedge, Density and classifications for Sparse, OuterCell and Perimeter
setkey(cDT,Barcode,Well,Spot,ObjectID)
cDT <- merge(cDT,setkey(positionParms(cDT),Barcode,Well,Spot,ObjectID))

#Add a spot level normalizations on the intensity values
cDT <- cDT[,MeanIntensityAlexa488SpotNorm := MeanIntensityAlexa488/median(MeanIntensityAlexa488, na.rm=TRUE), by="Barcode,Well,Spot"]
cDT <- cDT[,MeanIntensityAlexa555SpotNorm := MeanIntensityAlexa555/median(MeanIntensityAlexa555, na.rm=TRUE), by="Barcode,Well,Spot"]
cDT <- cDT[,MeanIntensityAlexa647SpotNorm := MeanIntensityAlexa647/median(MeanIntensityAlexa647, na.rm=TRUE), by="Barcode,Well,Spot"]

#Create staining set specific derived parameters
if(ss %in% c("SS1", "SS3")){

} else if (ss == "SS2"){

  cDT <- cDT[,EdUPositive := kmeansCluster(.SD, value="MeanIntensityAlexa647MedNorm", ctrlLigand = "HighSerum"), by="Barcode"]
  #Calculate the EdU Positive Percent at each spot
  cDT <- cDT[,EdUPositiveProportion := sum(EdUPositive)/length(ObjectID),by="Barcode,Well,Spot"]
} else stop("Invalid ss parameter")

# Eliminate Variations in the Endpoint metadata
endpointNames <- grep("End",colnames(cDT), value=TRUE)
endpointWL <- regmatches(endpointNames,regexpr("[[:digit:]]{3}|DAPI",endpointNames))
setnames(cDT,endpointNames,paste0("Endpoint",endpointWL))

The cell level raw data and metadata is saved as Level 1 data. The plate and spot normalized values and the metadata is saved as Level 2 data.

#Select the columns and order for saving in the cell-level annotated data file
coreLevel1DataColumns <- c("X","Y","Z","XLocal","YLocal","RadialPosition","Theta","TotalIntensityDAPI","MeanIntensityDAPI","MeanIntensityAlexa488","MeanIntensityAlexa555","MeanIntensityAlexa647","Area","ElongationFactor")

coreLevel2DataColumns<- c("X","Y","Z","XLocal","YLocal","RadialPosition","Theta","SpotCellCount","TotalIntensityDAPIMedNorm","MeanIntensityDAPIMedNorm","MeanIntensityAlexa488MedNorm","MeanIntensityAlexa555MedNorm","MeanIntensityAlexa647MedNorm","SpotCellCountMedNorm","MeanIntensityAlexa488SpotNorm","MeanIntensityAlexa555SpotNorm","MeanIntensityAlexa647SpotNorm","Density","Wedge","Sparse","OuterCell","Perimeter")

coreMetadataColumns <- c("Barcode","Well","Spot","ArrayRow","ArrayColumn","Block","Row","Column","ObjectID","LocationMeanIntensityDAPI","LocationMeanIntensityAlexa488","LocationMeanIntensityAlexa555","LocationMeanIntensityAlexa647","CellLine","EndpointDAPI","Endpoint488","Endpoint555","Endpoint647","LigandAnnotID","Ligand","ECMpAnnotID","ECMp","PrintOrder")


if (ss == "SS1") {
  cDTLevel1 <- cDT[,c(coreMetadataColumns, coreLevel1DataColumns), with=FALSE]
  cDTLevel2 <- cDT[,c(coreMetadataColumns, coreLevel2DataColumns), with=FALSE]

} else if(ss=="SS2"){
  cDTLevel1 <- cDT[,c(coreMetadataColumns, coreLevel1DataColumns), with=FALSE]
  cDTLevel2 <- cDT[,c(coreMetadataColumns, coreLevel2DataColumns,"EdUPositive"), with=FALSE]

}
write.table(format(cDTLevel1, digits=4), paste0(unique(cDT$CellLine),"_",ss,"_","Level1.txt"), sep = "\t",row.names = FALSE, quote=FALSE)
write.table(format(cDTLevel2, digits=4), paste0(unique(cDT$CellLine),"_",ss,"_","Level2.txt"), sep = "\t",row.names = FALSE, quote=FALSE)

The cell-level data is median summarized to the spot level and coefficients of variations on the replicates are calculated. A loess model of the spot cell count is created and median normalized. The spot level data, CVs and metadata are saved as Level 3 data.

#Summarize cell data to spot level (sl) by taking the medians of the parameters
parameterNames<-grep(pattern="(^Total|^Mean|Elongation|Area|SpotCellCount|EdUPositiveProportion|RadialPosition|Population|Density|Z|Barcode|^Spot$|^Well$)",x=names(cDT),value=TRUE)

#Remove any spot-normalized parameters
parameterNames <- grep("SpotNorm",parameterNames,value=TRUE,invert=TRUE)

cDTParameters<-cDT[,parameterNames,with=FALSE]
slDT<-cDTParameters[,lapply(.SD,numericMedian),keyby="Barcode,Well,Spot"]

#Merge back in the spot and well metadata
metadataNames <- grep("(Row|Column|PrintOrder|Block|^ID$|Array|CellLine|Ligand|Endpoint|ECMp|Location|Barcode|^Well$|^Spot$)", x=colnames(cDT), value=TRUE)
mDT <- cDT[,metadataNames,keyby="Barcode,Well,Spot", with=FALSE]
slDT <- mDT[slDT, mult="first"]

#Calculate CVs for each set of replicates in the ScanR data
cvNames<-grep(pattern="(^Total|^Mean|Elongation|Area|SpotCellCount|EdUPositiveProportion|RadialPosition|Population|Density|Z|Barcode|^Well$|ECMpAnnotID)",x=colnames(slDT),value=TRUE)
cvParameters<-slDT[,cvNames,with=FALSE]
cv<-cvParameters[,lapply(.SD,CV),by="Barcode,Well,ECMpAnnotID"]
data.table::setnames(cv,colnames(cv), paste0("CV",colnames(cv)))
data.table::setkey(cv,CVWell,CVECMpAnnotID, CVBarcode)
data.table::setkey(slDT,Well,ECMpAnnotID,Barcode)
slDT <- slDT[cv]

#Add a count of replicates
slDT <- slDT[,ReplicateCount := .N,by="LigandAnnotID,ECMpAnnotID"]

#Add the loess model of the SpotCellCount on a per well basis
  suppressWarnings(slDT <- slDT[,LoessSCC := loessModel(.SD,value="SpotCellCount",span=.1), by="Barcode,Well"])

#Add well level QA Scores
lthresh <- 0.6
  slDT <- slDT[,QAScore := calcQAScore(.SD,threshold=lthresh,maxNrSpot = max(cDT$ArrayRow)*max(cDT$ArrayColumn),value="LoessSCC"),by="Barcode,Well"]
write.table(format(slDT, digits = 4), paste0(unique(slDT$CellLine),"_",ss,"_","Level3.txt"),
            sep = "\t",row.names = FALSE, quote=FALSE)

The spot level data is median summarized to the replicate level is stored as Level 4 data and metadata.

  #Summarize spot level data to MEP level by taking the medians of the parameters
  mepNames<-grep(pattern="(^Total|^Mean|Elongation|^Area|Z|^SpotCellCount|Loess|RadialPosition|EdUPositiveProportion|Population|Density|LigandAnnotID|ECMpAnnotID)",x=names(slDT),value=TRUE)

  mepKeep<-slDT[,mepNames,with=FALSE]
  mepDT<-mepKeep[,lapply(.SD,numericMedian),keyby="LigandAnnotID,ECMpAnnotID"]

  #Merge back in the replicate metadata
  mDT <- slDT[,list(Well,CellLine,Ligand,Endpoint488,Endpoint555,Endpoint647,EndpointDAPI,ReplicateCount,ECMp),keyby="LigandAnnotID,ECMpAnnotID"]
  mepDT <- mDT[mepDT, mult="first"]
write.table(format(mepDT, digits = 4), paste0(unique(slDT$CellLine),"_",ss,"_","Level4.txt"),
            sep = "\t",row.names = FALSE, quote=FALSE)


kdaily/MEMA documentation built on May 20, 2019, 8:28 a.m.