In alexholcombe/mixRSVP: Mixture Model Temporal Errors In RSVP Data

knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

Read in some data, and rename some of the fields

library(mixRSVP)
data <- backwards2_E1 #A dataset provided as an example with the package
#.mat file been preprocessed into melted long dataframe
numItemsInStream <- length( data$letterSeq[1,] )  
data$letterSeq <- NULL
library(dplyr)

#To use dplyr operations, each column must be a 1d atomic vector or a list. So, can't have array fields like letterSeq
data$letterSeq<- NULL

#Give conditions better names than 1 and 2
names(data)[names(data) == 'target'] <- 'stream'
data <- data %>% mutate( stream =ifelse(stream==1, "Left","Right") )
#mutate condition to Orientation
names(data)[names(data) == 'condition'] <- 'orientation'
data <- data %>% mutate( orientation =ifelse(orientation==1, "Canonical","Inverted") )

Plot the histogram and fit for one subjectconditionstream. This is basically a demo of plot_hist_with_fit.

yellow = guessing component
light blue = Gaussian component
green = sum of the guessing and Gaussian components. In other words, the histogram heights predicted by the model
dark blue = continuous Gaussian. This helps get a sense of the effect of discretising the Gaussian. For instance, it's possible for the Gaussian peak to rise high above the bars and still fit the discrete bins, suggesting undesirably high estimates of the efficacy (likely accompanied by an undesirably low precision)

For goodness of fit, lower neg log likelihood means better fit.

dCB<- dplyr::filter(data,subject=="BS",orientation=="Inverted",stream=="Left")  
minSPE<- -17; maxSPE<- 17
plotContinuousGaussian<-TRUE; annotateIt<-TRUE
g<- plot_hist_with_fit(dCB,minSPE,maxSPE,dCB$targetSP,numItemsInStream,plotContinuousGaussian,annotateIt, FALSE)
library(ggplot2)
g + annotate("text", x = 12, y = 25, label = "CB, Inverted, Left stream")

Fit mixture model to whole dataset (actually a subset so that vignette execution doesn't take too long).

condtnVariableNames <- c("subject","orientation", "stream") # 

#there are twenty-some subjects, but analysing all would make the vignette far too long to build
df<- data %>% dplyr::filter(subject>="AE",subject<="AG") #Includes one or two who fail the likelihood ratio test, for illustration

#Check whether already have parameter estimates or instead need to do it
calculate<-FALSE
paste('calculate=',calculate)
if (!exists("estimates"))  { 
  calculate<-TRUE
} else if (length(estimates)!=nrow(df)) {
  calculate<-TRUE
}

if (calculate) {
  estimates<- df %>%  
    group_by_(.dots = condtnVariableNames) %>%  #.dots needed when you have a variable containing multiple factor names
    do(  analyzeOneConditionDF(.,numItemsInStream,parameterBounds(), nReplicates=3)  )
}
head(estimates)

Plot data with fits.

I think you can't put it all into a single plot by calling plot_hist_with_fit multiple times because it returns a plot object, then would have to use grid to combine the plots.

So, the method here is to calculate the fitted curves separately and then add them on. There should be a more integrated way to do that but then it might be harder to probe when fits go wrong?

First calculate the fitted curves.

#want fig.height of 10 per subject

library(dplyr)

#Add R parameter estimates to dataframe. That way calc_curves_dataframe won't have to refit the data.
dg<- merge(df,estimates)

curves<- dg %>% group_by_at(.vars = condtnVariableNames) %>% 
  do(calc_curves_dataframe(.,minSPE,maxSPE,numItemsInStream))

Now calculate the number of observations in each condition, which is used for scaling the pseudo-continuous (fine-grained) Gaussian. Then calculate that Gaussian curve.

#Calc numObservations to each condition. This is needed only for scaling the fine-grained Gaussian
#Calc the number of observations for each condition, because gaussianScaledforData needs to know.
dfGroups<- dg %>% group_by_at(.vars = condtnVariableNames) %>% summarise(nPerCond = n())
#add nPerCond back to parameter estimates
estimates<- merge(estimates,dfGroups)


grain<-.05
gaussFine<- estimates %>% group_by_at(.vars = condtnVariableNames) %>% do(
  gaussian_scaled_from_df(.,minSPE,maxSPE,grain) )

Plot everything

g=ggplot(dg, aes(x=SPE)) + facet_grid(subject+orientation~stream) #,  scales="free_y")
g<-g+geom_histogram(binwidth=1,color="grey90") + xlim(minSPE,maxSPE)
g<-g+ geom_text(x=12, y= 33, aes(label = subject)) #inset subject name/number. Unfortunately it overwrites itself a million times
g<-g +theme_apa() #+theme(panel.grid.minor=element_blank(),panel.grid.major=element_blank())# hide all gridlines.
#g<-g+ theme(line=element_blank(), panel.border = element_blank())
sz=.8
#Plot the underlying Gaussian , not just the discretized Gaussian. But it's way too tall. I don't know if this is 
#a scaling problem or what actually is going on.
#g<-g + geom_line(data=gaussFine,aes(x=x,y=gaussianFreq),color="darkblue",size=1.2)

g<-g+ geom_point(data=curves,aes(x=x,y=combinedFitFreq),color="chartreuse3",size=sz*2.5)
g<-g+ geom_line(data=curves,aes(x=x,y=guessingFreq),color="yellow",size=sz)
#Discretized Gaussian
g<-g+ geom_line(data=curves,aes(x=x,y=gaussianFreq),color="lightblue",size=sz)


numGroups<- nrow(dfGroups) # length(table(df$orientation,df$subject,df$stream)) #try nrow(dfGroups)
fontSz = 3 #100/numGroups
#mixSig - whether mixture model statistically significantly better than guessing
curves <- dplyr::mutate(curves, mixSig = ifelse(pLRtest <= .05, TRUE, FALSE)) #annotate_fit uses this to color the p-value
g<- annotate_fit(g,curves) #assumes curvesDf includes efficacy,latency,precision
#Somehow the which mixSig (TRUE or FALSE) is red and which green is flipped relative to plot_hist_with_fit even though
#identical commands are used. I haven't been able to work out why.
g<- g + scale_color_manual(values=c("red","forestgreen"))  

show(g)