QC.R
In celltrackR: Motion Trajectory Analysis

## ----setup, include=FALSE---------------------------------------------------------------------------------------------
knitr::opts_chunk$set(dpi=72)

## ----pack, warning = FALSE, message = FALSE---------------------------------------------------------------------------
library( celltrackR )
library( ggplot2 )

## ---- echo = FALSE----------------------------------------------------------------------------------------------------
# Save current par() settings
oldpar <- par( no.readonly =TRUE )

## ---------------------------------------------------------------------------------------------------------------------
str( TCells, list.len = 1 )

## ---------------------------------------------------------------------------------------------------------------------
load( system.file("extdata", "TCellsRaw.rda", package="celltrackR" ) )
load( system.file("extdata", "BCellsRaw.rda", package="celltrackR" ) )

## ----Tdata------------------------------------------------------------------------------------------------------------
str( TCellsRaw, list.len = 1 )

## ---------------------------------------------------------------------------------------------------------------------
head( TCellsRaw[[1]] )

## ---------------------------------------------------------------------------------------------------------------------
# for reproducibility
set.seed(2021)

# sample names, sort them in numeric order, and use them to subset the tracks
Bsample <- sample( names(BCellsRaw),50)
Bsample <- Bsample[ order(as.integer(Bsample)) ]
BCellsRaw <- BCellsRaw[ Bsample ]

# same for the T cells, but now we include a few specific IDs we'll need for the
# analysis below.
Tsample <- sample( names(TCellsRaw),44)
Tsample <- unique( c( Tsample, "5","9658","83","6080","7832","8352" ) )
Tsample <- Tsample[ order(as.integer(Tsample)) ]
TCellsRaw <- TCellsRaw[ Tsample ]

## ----tracklength-check------------------------------------------------------------------------------------------------
# Each track has a coordinate matrix with one row per coordinate;
# The number of steps is the number of rows minus one.
track.lengths <- sapply( TCellsRaw, nrow ) - 1
hist( track.lengths, xlab = "Track length (#steps)", breaks = seq(0,40 ) )
summary( track.lengths )

## ----max-tracklength--------------------------------------------------------------------------------------------------
# This is the number of coordinates, so the number of steps is one less.
maxTrackLength( TCellsRaw )

## ----filter-short-----------------------------------------------------------------------------------------------------
# nrow() of a track is always the number of steps plus one.
# For steps >= min.steps, we can substitute nrow > min.steps:
filter.criterion <- function( x, min.steps ){
  nrow(x) > min.steps
}

TCells.filtered <- filterTracks( filter.criterion, TCellsRaw, min.steps = 11 )
# Or shorthand: filterTracks( function(x) nrow(x) > min.steps, TCellsRaw )

## ----check-filtered---------------------------------------------------------------------------------------------------
# Find lengths of the filtered dataset
track.lengths.filtered <- sapply( TCells.filtered, nrow ) - 1

# Histograms of track lengths before and after
c( "min length before filter" = min( track.lengths ),
   "min length after filter" = min( track.lengths.filtered ) )

# Check how many tracks are left:
length( TCells.filtered )

## ---- fig.width=7, fig.height = 3.5-----------------------------------------------------------------------------------
# Drift is 0.05 micron/sec in each dimension
drift.speed <- c( 0.05, 0.05, 0.05 )
add.drift <- function( x, drift.vector )
{
  # separate timepoints and coordinates
  tvec <- x[,1]
  coords <- x[,-1]
  
  # compute movement due to drift.
  drift.matrix <- matrix( rep( drift.vector, nrow(coords) ),
                          ncol = ncol(coords), byrow= TRUE )
  drift.matrix <- drift.matrix * tvec
  
  # Add drift to coordinates
  x[,-1] <- coords + drift.matrix
  return(x)
}

# Create data with drift
TCells.drift <- as.tracks( lapply( TCellsRaw, add.drift, drift.speed ) )

# Plot tracks and 'rose plot' with overlayed starting point
par(mfrow=c(1,2) )
plot(TCells.drift, main = "drift", cex = 0 )
plot( normalizeTracks(TCells.drift), main = "drift (rose plot)", cex = 0, col = "gray" )
abline( h = 0 )
abline( v = 0 )


## ---- fig.width = 4, fig.height = 3-----------------------------------------------------------------------------------
hotellingsTest( TCellsRaw, col = "gray" )

## ---- fig.width = 6, fig.height = 6-----------------------------------------------------------------------------------
hotellingsTest( BCellsRaw, col = "gray" )

## ---- fig.width = 4, fig.height = 3.5---------------------------------------------------------------------------------
# Compute autocovariance, normalize so the first point lies at 1
Tacov <- aggregate( TCellsRaw, overallDot )
Tacov$value <- Tacov$value / Tacov$value[1]

# The same for B cells:
Bacov <- aggregate( BCellsRaw, overallDot )
Bacov$value <- Bacov$value / Bacov$value[1]

# Compare autocovariances:
plot( Tacov )
points( Bacov, col = "red" )

## ---- fig.width = 6, fig.height = 6-----------------------------------------------------------------------------------
hotellingsTest( BCellsRaw, col = "gray", step.spacing = 10 )

## ---- fig.width = 4, fig.height = 4-----------------------------------------------------------------------------------
hotellingsTest( TCells.drift, plot = TRUE, col = "gray", step.spacing = 10 )

## ---- fig.width = 6, fig.height = 6-----------------------------------------------------------------------------------
hotellingsTest( TCellsRaw, dim = c("x","y","z"), step.spacing = 10 )
hotellingsTest( TCells.drift, dim = c("x","y","z"), step.spacing = 10 )

## ---- echo = FALSE, fig.width=7---------------------------------------------------------------------------------------
# sp <- seq(0,10)
# 
# htest.nodrift <- lapply( sp, function(x) hotellingsTest( TCells, 
#                                                  dim = c("x","y","z"),
#                                                  step.spacing = x ))
# htest.drift <- lapply( sp, function(x) hotellingsTest( TCells.drift, 
#                                                  dim = c("x","y","z"),
#                                                  step.spacing = x ))
# 
# pval.nodrift <- sapply( htest.nodrift, function(x) x$p.value )
# pval.drift <- sapply( htest.drift, function(x) x$p.value )
# stat.nodrift <- sapply( htest.nodrift, function(x) unname(x$statistic) )
# stat.drift <- sapply( htest.drift, function(x) unname(x$statistic) )
# 
# d.drift <- data.frame( step.spacing = sp,
#                        exp = "drift",
#                        pval = pval.drift,
#                        Tstatistic = stat.drift )
# d.nodrift <- data.frame( step.spacing = sp,
#                        exp = "no drift",
#                        pval = pval.nodrift,
#                        Tstatistic = stat.nodrift )
# d <- rbind( d.drift, d.nodrift )
# 
# p1 <- ggplot( d, aes( x = step.spacing, y = pval, color = exp ) ) +
#   geom_line() +
#   geom_hline( yintercept = 0.05, color = "red" ) +
#   scale_color_manual( values = c( "drift" = "black", "no drift" = "gray")  ) +
#   scale_y_log10() +
#   theme_classic()
# 
# p2 <- ggplot( d, aes( x = step.spacing, y = Tstatistic, color = exp ) ) +
#   geom_line() +
#   scale_y_continuous( limits = c(0,NA) ) +
#   scale_color_manual( values = c( "drift" = "black", "no drift" = "gray")  ) +
#   theme_classic()
# 
# gridExtra::grid.arrange(p1,p2,ncol=2)

## ----cellPairs, warning = FALSE, message = FALSE, fig.width=6, fig.heigth = 2.5---------------------------------------
# compute for both original as drift data
df.drift <- analyzeCellPairs( TCells.drift )
df.norm <- analyzeCellPairs( TCellsRaw )

# Plot
p.norm <- ggplot( df.norm, aes( x = dist, y = angle ) ) +
  geom_point( color = "gray70", size = 0.5 ) +
  stat_smooth( span = 1, fill = "blue" )+
  labs( x = "distance between cell pairs",
        y = "angle between cell pairs",
        title = "original data") +
  geom_hline( yintercept = 90, color = "red" ) +
  theme_classic()

p.drift <- ggplot( df.drift, aes( x = dist, y = angle ) ) +
  geom_point( color = "gray70", size = 0.5 ) +
  stat_smooth( span = 1, fill = "blue" )+
  labs( x = "distance between cell pairs",
        y = "angle between cell pairs",
        title = "data with drift") +
  geom_hline( yintercept = 90, color = "red" ) +
  theme_classic()

gridExtra::grid.arrange( p.norm, p.drift, ncol = 2 )

## ----stepPairs, warning = FALSE, message = FALSE, fig.width=6, fig.heigth = 2.5---------------------------------------
# compute for both original as drift data.
df.drift <- analyzeStepPairs( TCells.drift, filter.steps = function(x) displacement(x)>2  )
df.norm <- analyzeStepPairs( TCellsRaw, filter.steps = function(x) displacement(x)>2  )

# Plot
p.norm <- ggplot( df.norm, aes( x = dist, y = angle ) ) +
  geom_point( color = "gray70", size = 0.5 ) +
  stat_smooth( span = 0.75, fill="blue" )+
  labs( x = "distance between step pairs",
        y = "angle between step pairs",
        title = "original data") +
  geom_hline( yintercept = 90, color = "red" ) +
  theme_classic()

p.drift <- ggplot( df.drift, aes( x = dist, y = angle ) ) +
  geom_point( color = "gray70", size = 0.5 ) +
  stat_smooth( span = 0.75,  fill="blue")+
  labs( x = "distance between step pairs",
        y = "angle between step pairs",
        title = "data with drift") +
  geom_hline( yintercept = 90, color = "red" ) +
  theme_classic()

gridExtra::grid.arrange( p.norm, p.drift, ncol = 2 )

## ---------------------------------------------------------------------------------------------------------------------
# Get steps and find their displacement vectors
steps.drift <- subtracks( TCells.drift, 1 )
step.disp <- t( sapply( steps.drift, displacementVector ) )

# Get the mean
mean.displacement <- colMeans( step.disp )

# Divide this by the mean timestep to get a drift speed
drift.speed <- mean.displacement/timeStep( TCells.drift )
drift.speed

## ---- fig.width = 6, fig.height = 3.5---------------------------------------------------------------------------------
correct.drift <- function( x, drift.vector )
{
  # separate timepoints and coordinates
  tvec <- x[,1]
  coords <- x[,-1]
  
  # compute movement due to drift.
  drift.matrix <- matrix( rep( drift.vector, nrow(coords) ),
                          ncol = ncol(coords), byrow= TRUE )
  drift.matrix <- drift.matrix * tvec
  
  # Add drift to coordinates
  x[,-1] <- coords - drift.matrix
  return(x)
}

# Create data with drift
TCells.corrected <- as.tracks( lapply( TCells.drift, correct.drift, drift.speed ) )

# Compare (zoom in on part of the field to see better)
par( mfrow = c(1,2) )
plot( TCellsRaw, col = "blue", main = "uncorrected", cex = 0, pch.start = NA, xlim = c(200,400), ylim = c(0,200) )
plot( TCells.drift, col = "red", add = TRUE, cex = 0, pch.start = NA )
plot( TCellsRaw, col = "blue", main = "corrected", cex = 0, pch.start = NA, xlim = c(200,400), ylim = c(0,200)  )
plot( TCells.corrected, col = "red", add = TRUE, cex = 0, pch.start = NA  )


## ---------------------------------------------------------------------------------------------------------------------
# Take the track with id "5"
dup.track <- TCellsRaw[["5"]]

# Add some noise to coordinates
dup.track[,"x"] <- dup.track[,"x"] + rnorm( nrow(dup.track), sd = 0.5 )
dup.track[,"y"] <- dup.track[,"y"] + rnorm( nrow(dup.track), sd = 0.5 )
dup.track[,"z"] <- dup.track[,"z"] + rnorm( nrow(dup.track), sd = 0.5 )

# Wrap the track in a tracks object and add it to the TCell data with
# a unique id number
dup.track <- wrapTrack( dup.track )
new.id <- max( as.numeric( names( TCellsRaw ) ) ) + 1
names(dup.track) <- as.character( new.id )
TCells.dup <- c( TCellsRaw, dup.track )

## ---- warning = FALSE, fig.width = 3, fig.height = 2.5----------------------------------------------------------------
df <- analyzeCellPairs( TCells.dup )

# label cellpairs that have both angle and distance below threshold
angle.thresh <- 5 # in degrees
dist.thresh <- 10 # this should be the expected cell radius
df$id <- paste0( df$cell1,"-",df$cell2 )
df$id[ !(df$angle < angle.thresh & df$dist < dist.thresh) ] <- "" 

# Plot; zoom in on the region with small angles and distances
ggplot( df, aes( x = dist, y = angle ) ) +
  geom_point( color = "gray40" ) +
  geom_text( aes( label = id ), color = "red", hjust = -0.1 ) +
  labs( x = "distance between cell pairs",
        y = "angle between cell pairs" ) +
  coord_cartesian( xlim=c(0,30), ylim=c(0,20) ) +
  geom_hline( yintercept = angle.thresh, col = "blue",lty=2 ) +
  geom_vline( xintercept = dist.thresh, col = "blue", lty=2) +
  theme_classic()


## ---- fig.width = 7, fig.height = 2.5---------------------------------------------------------------------------------
par( mfrow=c(1,3))
plot( TCells.dup[c("5","9659") ] )
plot( TCells.dup[c("83","6080") ] )
plot( TCells.dup[c("7832","8352") ] )

## ---------------------------------------------------------------------------------------------------------------------
corrected.dup <- TCells.dup[ names( TCells.dup ) != "9659" ]

## ---------------------------------------------------------------------------------------------------------------------
tracks <- TCellsRaw
bb <- boundingBox( tracks )
bb

## ---- fig.width = 3, fig.height = 2-----------------------------------------------------------------------------------
# Define points:
lower1 <- c( bb["min","x"], bb["min","y"], bb["min","z"] )
lower2 <- c( bb["max","x"], bb["min","y"], bb["min","z"] )
lower3 <- c( bb["max","x"], bb["max","y"], bb["min","z"] )
zsize <- bb["max","z"] - bb["min","z"]

# Compute angles and distances of steps to this plane.
single.steps <- subtracks( tracks, 1 )
angles <- sapply( single.steps, angleToPlane, p1 = lower1, 
                  p2 = lower2, p3 = lower3 )
distances <- sapply( single.steps, distanceToPlane, p1 = lower1,
                     p2 = lower2, p3 = lower3 )
df <- data.frame( angles = angles,
                  distances = distances )

# Plot
ggplot( df, aes( x = distances,
                 y = angles ) ) +
  geom_point( color = "gray70", size = 0.5 ) +
  stat_smooth( method = "loess", span = 1, fill="blue" ) +
  geom_hline( yintercept = 32.7, color = "red" ) +
  scale_x_continuous( limits=c(0,zsize ), expand = c(0,0) ) +
  theme_classic()

## ---- fig.width = 4, fig.height = 3-----------------------------------------------------------------------------------
par( mar = c(5, 4, 0.5, 2) + 0.1 )
plot( TCellsRaw, dims = c("x","z" ) )

## ---------------------------------------------------------------------------------------------------------------------
boundingBox( TCellsRaw )

## ----get-steps--------------------------------------------------------------------------------------------------------
# Extract all subtracks of length 1 (that is, all "steps")
single.steps <- subtracks( TCellsRaw, 1 )

# The output is a new tracks object with a unique track for each step
# in the data (no longer grouped by the original cell they came from):
str( single.steps, list.len = 3 )

## ----check-avdt-------------------------------------------------------------------------------------------------------
median.dt <- timeStep( TCellsRaw )
median.dt

## ----check-dt---------------------------------------------------------------------------------------------------------
step.dt <- sapply( single.steps, duration )
str(step.dt)

## ----check-dt-hist----------------------------------------------------------------------------------------------------
dt.diff.perc <- (step.dt - median.dt) * 100 / median.dt
hist( dt.diff.perc, xlab = "dt (percentage difference from median)" )

## ---------------------------------------------------------------------------------------------------------------------
range( step.dt )
unique( step.dt )

## ---------------------------------------------------------------------------------------------------------------------
sum( step.dt == 48 )

## ---------------------------------------------------------------------------------------------------------------------
# This function randomly removes coordinates from a track dataset with probability "prob"
remove.points <- function( track, prob=0.1 ){
  
    tlength <- nrow( track )
    remove.rows <- sample( c(TRUE,FALSE), tlength, replace=TRUE,
                           prob = c(prob, (1-prob) ) )
    track <- track[!remove.rows,]
  return(track)
}

# Apply function to dataset to randomly remove coordinates in the data
TCells.gap <- as.tracks( lapply( TCellsRaw, remove.points ) )

## ---------------------------------------------------------------------------------------------------------------------
# duration of the individual steps
steps.gap <- subtracks( TCells.gap, 1 )

T1.step.disp <- sapply( single.steps, displacement )
T1.gap.disp <- sapply( steps.gap, displacement )

lapply( list( original = T1.step.disp, gaps = T1.gap.disp ), summary )

## ---------------------------------------------------------------------------------------------------------------------
T1.norm.disp <- sapply( single.steps, normalizeToDuration( displacement ) )
T1.norm.gap.disp <- sapply( steps.gap, normalizeToDuration( displacement ) )
lapply( list( original = T1.norm.disp, gaps = T1.norm.gap.disp ), summary )

## ---- fig.width=6-----------------------------------------------------------------------------------------------------

# Repair gaps by splitting or interpolation, the number of tracks is 
# different after each fix
split.gap <- repairGaps( TCells.gap, how = "split" )
interpolate.gap <- repairGaps( TCells.gap, how = "interpolate" )

c( "after splitting" = length( split.gap),
   "after interpolation" = length( interpolate.gap ) )

## ---------------------------------------------------------------------------------------------------------------------
T2 <- subsample( TCellsRaw, k = 2 )

## ---------------------------------------------------------------------------------------------------------------------
# displacement
T1.steps <- subtracks( TCellsRaw, 1 )
T1.disp <- sapply( T1.steps, displacement )

T2.steps <- subtracks( T2, 1 )
T2.disp <- sapply( T2.steps, displacement )

lapply( list( T1 = T1.disp, T2 = T2.disp ), summary )


## ---------------------------------------------------------------------------------------------------------------------
# interpolate both datasets at the time resolution of the neutrophils
dt <- timeStep( TCellsRaw )
interpolate.dt <- function( x, dt, how = "spline" ){
  trange <- range( timePoints( wrapTrack( x ) ) )
  tvec <- seq( trange[1], trange[2], by = dt )
  x <- interpolateTrack( x, tvec, how = how )
  return(x)
}

T1.corrected <- as.tracks( lapply( TCellsRaw, interpolate.dt, dt = dt ) )
T2.corrected <- as.tracks( lapply( T2, interpolate.dt, dt = dt ) )

# Check the effect on the displacement statistics:
T1.corr.steps <- subtracks( T1.corrected, 1 )
T1.corr.disp <- sapply( T1.corr.steps, displacement )

T2.corr.steps <- subtracks( T2.corrected, 1 )
T2.corr.disp <- sapply( T2.corr.steps, displacement )

lapply( list( T1 = T1.disp, T2 = T2.disp, 
              T1.corr = T1.corr.disp, T2.corr = T2.corr.disp ), 
        summary )

## ---- echo = FALSE----------------------------------------------------------------------------------------------------
# Reset par() settings
par(oldpar)

## ---- dpi = 100-------------------------------------------------------------------------------------------------------
load( system.file("extdata", "TCellsRaw.rda", package="celltrackR" ) )

par( mar = c(2, 2, 1, 1) + 0.1 )
plot( TCellsRaw )

## ---------------------------------------------------------------------------------------------------------------------
cellSpeeds <- sapply( TCellsRaw, speed )
hist( cellSpeeds, breaks = 30 )

## ---------------------------------------------------------------------------------------------------------------------
cutoff <- 0.1 # 0.1 micron/sec = 6 micron/min
loSpeed <- filterTracks( function(t) speed(t) < cutoff, TCellsRaw )
hiSpeed <- filterTracks( function(t) speed(t) >= cutoff, TCellsRaw )

## ---- fig.width = 6, fig.height = 3.5---------------------------------------------------------------------------------
par(mfrow=c(1,2))
plot( loSpeed, main = "Below cutoff" )
plot( hiSpeed, main = "Above cutoff" )

## ----bic-nonmotile----------------------------------------------------------------------------------------------------
bicNonMotile <- function( track, sigma ){
  
  # we'll use only x and y coordinates since we saw earlier that the z-dimension was
  # not so reliable
  allPoints <- track[,c("x","y")]
  
  # Compute the log likelihood under a multivariate gaussian.
  # For each point, we get the density under the Gaussian distribution 
  # (using dmvnorm from the mvtnorm package).
  # The product of these densities is then the likelihood; but since we need the 
  # log likelihood, we can also first log-transform and then sum:
  Lpoints <- mvtnorm::dmvnorm( allPoints, 
                      mean = colMeans(allPoints), # for a Gaussian around the mean position
                      sigma = sigma*diag(2), # sd of the Gaussian (which we should choose)
                      log = TRUE )
  logL <- sum( Lpoints )
  
  # BIC = k log n - 2 logL; here k = 3 ( mean x, mean y, sigma )
  return( 3*log(nrow(allPoints)) - 2*logL )
}

## ---------------------------------------------------------------------------------------------------------------------
# the BIC for a given cutoff m
bicAtCutoff <- function( track, m, sigma ){
  
  # we'll use only x and y coordinates since we saw earlier that the z-dimension was
  # not so reliable
  allPoints <- track[,c("x","y")]
  
  # Split into two coordinate sets based on the cutoff m:
  firstCoords <- allPoints[1:m, , drop = FALSE]
  lastCoords <- allPoints[(m+1):nrow(allPoints), , drop = FALSE ]
  
  # Compute log likelihood under two separate Gaussians:
  Lpoints1 <- mvtnorm::dmvnorm( firstCoords, 
    mean = colMeans(firstCoords), 
    sigma = sigma*diag(2), 
    log = TRUE )
  Lpoints2 <- mvtnorm::dmvnorm( lastCoords, 
    mean = colMeans(lastCoords), 
    sigma = sigma*diag(2), 
    log = TRUE )
  logL <- sum( Lpoints1 ) + sum( Lpoints2 )
  
  # BIC = k log n - 2 logL; here k = 6 ( 2*mean x, 2*mean y, sigma, and m )
  return( 6*log(nrow(allPoints)) - 2*logL )
}

# We'll try all possible cutoffs m, and choose best model (minimal BIC)
# to compare to our non-motile "null hypothesis":
bicMotile <- function( track, sigma ){
  
  # cutoff anywhere from after the first two coordinates to 
  # before the last two (we want at least 2 points in each Gaussian,
  # to prevent fitting of a single point)
  cutoffOptions <- 2:(nrow(track)-2)
  
 	min( sapply( cutoffOptions, function(m) bicAtCutoff(track,m,sigma) ) )
}

## ---- fig.width = 6, fig.height = 3.5---------------------------------------------------------------------------------
deltaBIC <- function( x, sigma ){
  b1 <- bicNonMotile( x, sigma )
  b2 <- bicMotile( x, sigma )
  d <- b1 - b2
  d
}

dBIC <- sapply( TCellsRaw, deltaBIC, 7 )
motile <- TCellsRaw[ dBIC >= 6 ]
nonMotile <- TCellsRaw[ dBIC < 6 ]


# Plot again for comparison:
par(mfrow=c(1,2))
plot( motile, main = "Motile" )
plot( nonMotile, main = "Non-Motile" )

## ---- fig.width = 4, fig.height = 3-----------------------------------------------------------------------------------
plot( dBIC, cellSpeeds, xlim = c(0,20) )
abline( v = 6, col = "red" ) # delta BIC cutoff
abline( h = max(cellSpeeds), col = "blue", lty = 2 ) # max speed of all cells