biophysr: Analysis tools for muscle biophysics

Documented in hidden_markov_analysis

#' HMM Analysis
#'
#' @return
#' @export
#'
#' @examples
hidden_markov_analysis <- function(parent_dir, mv2nm, nm2pn, overlay_color ){

## START OF SCRIPT ##

## LOAD PACKAGES ##

writeLines("Loading Packages")
library(biophysr)
suppressPackageStartupMessages(library(tidyverse))
library(gtools)
suppressPackageStartupMessages(library(depmixS4))
suppressPackageStartupMessages(library(gridExtra))
library(readr)

## SET WD ##

setwd(parent_dir)

observation_folders <- list.files(pattern = "obs")
grouped4r_files <- list.files(pattern = "grouped4r.txt", recursive = TRUE)
directions <- list.files(pattern = "directions.csv")

read_directions <- suppressMessages(read_csv(directions)) %>%
  mutate(folder = observation_folders,
         grouped_file = grouped4r_files) %>%
  filter(include == "yes")

read_directions$baseline_start_sec <- read_directions$baseline_start_sec*5000
read_directions$baseline_stop_sec <- read_directions$baseline_stop_sec*5000

hmm_initial_parameters <- c(0.99, 0.01,       #Initial state probabilities
                            0.99, 0.01, #transition probs s1 to s1/s2. These are guesses knowing they are stable states
                            0.01, 0.99) #transition probs s2 to s1/s2. Again a guess


## LOAD IN DATA ##
report <- vector("list")
error_file <- file("error_log.txt", open =  "a")
#loop will start here
for(folder in seq_along(read_directions$folder)){
  tryCatch({

report[[r]] <- report[[r]] <- paste0("failed_to_initialize!_", read_directions$folder[[r]])

writeLines(paste("Analyzing", read_directions$condition[[folder]], read_directions$folder[[folder]]))

dir.create(path = paste0(read_directions$folder[[folder]], "/", "results"))


#Load data and convert mV to nm
dat <- read.delim(read_directions$grouped_file[[folder]], header = FALSE) %>%
  mutate(nm_converted = V1*mv2nm) %>%
  dplyr::pull(nm_converted)


#PROCESS DATA
#detrends data by either performing a piecewise linear detrend or simply removing baseline mean from all points (i.e. constant detrend)
#both of these will center the mean around 0. It just depends if there needs to be long linear drift corrected or not

processed_data <- if(read_directions$detrend[[folder]] == "yes"){

  break_pts <- seq(25000, length(dat), by = 25000)

  pracma::detrend(dat, tt = "linear", bp = break_pts)

} else if(read_directions$detrend[[folder]] == "no"){

  get_mean <- mean(dat[read_directions$baseline_start_sec[[folder]] : read_directions$baseline_stop_sec[[folder]]])
  dat - get_mean

}

## RUNNING MEAN & VAR ##
writeLines("Calculating Running Mean")

run_mean <- running(processed_data, fun = mean, width = 150, by = 75)

writeLines("Calculating Running Variance")

run_var <- running(processed_data, fun = var, width = 150, by = 75)


running_table <- data.frame(run_mean = run_mean,
                            run_var = run_var)

## HMM ##
writeLines("Fitting HMM")
report[[r]] <- report[[r]] <- paste0("failed_HMM!_", read_directions$folder[[r]])

seed <- floor(runif(1, 0, 1e6))

hmm <- depmix(list(run_var~1,
                   run_mean~1),
                   data = running_table,
                   nstates = 2,
                   family = list(gaussian(),
                                 gaussian()))

sd_run_mean <- sd(run_mean)

mean_run_var <- mean(run_var)
sd_run_var <- sd(run_var)


estimate_hmm_gaussians <- c(mean_run_var, sd_run_var, 0, sd_run_mean,
                            mean_run_var/2, sd_run_var/2, 7, sd_run_mean)

hmm <- setpars(hmm, c(hmm_initial_parameters, estimate_hmm_gaussians))


set.seed(seed)

hmm_fit <- fit(hmm)

hmm_posterior <- posterior(hmm_fit)

#make sure HMM starts in state 2 this will reset seed and try to refit 10 times

hmm_repeat <- 0

while(hmm_repeat < 10){

  if(hmm_posterior$state[[1]] == 1){
    writeLines("HMM starts in state 1")
    hmm_repeat <- 11

  } else if(hmm_posterior$state[[1]] == 2){
    writeLines(paste("Refitting HMM", hmm_repeat))

    seed <- floor(runif(1, 0, 1e6))

    set.seed(seed)

    hmm_fit <- fit(hmm)

    hmm_posterior <- posterior(hmm_fit)

    hmm_repeat <- hmm_repeat + 1
  }
}

report[[r]] <- report[[r]] <- paste0("error-HMM_starts_in_state_2!_", read_directions$folder[[r]])

if(hmm_posterior$state[[1]] == 2){
  writeLines(c("Skipping",
               read_directions$folder[[folder]],
               "HMM starts in State 2."), error_file);
  next
}

writeLines("Saving HMM Summary")
sink(paste0(read_directions$folder[[folder]],
                   "/results/",
                   read_directions$condition[[folder]],
                   "_",
                   read_directions$folder[[folder]],
                   "_", "hmm_summary.txt"))

summary(hmm_fit)

paste("set.seed =", seed)
sink()



## Calculate conversion between window length and data points
raw_length <- length(processed_data)
num_windows <- length(run_var)


## COUNT EVENTS ##
writeLines("Counting Events")

counter <- matrix(table(paste0(head(hmm_posterior$state,-1),tail(hmm_posterior$state,-1))), nrow = 1)


dimnames(counter) <- list(c("#_transitions"),
                               c("1-1", "1-2", "2-1", "2-2"))

count_events <- as.data.frame(counter, row.names = "#_transitions", make.names = TRUE) %>%
  mutate(condition = paste0( read_directions$condition[[folder]]))


write_csv(count_events, paste0(read_directions$folder[[folder]],
                               "/results/",
                               read_directions$condition[[folder]],
                               "_",
                               read_directions$folder[[folder]], "_event_count.csv"))


#save running mean, var, & state for ensemble averaging & dygraph
hmm_identified_events <- data.frame(run_mean = run_mean,
                                    run_var = run_var,
                                    state = hmm_posterior$state)

write_csv(hmm_identified_events, paste0(read_directions$folder[[folder]],
                                        "/results/",
                                        read_directions$condition[[folder]],
                                        "_",
                                        read_directions$folder[[folder]],
                                        "_data4_ensemble_average.csv"))


report[[r]] <- report[[r]] <- paste0("error_measureing_events!_", read_directions$folder[[r]])

## MEASURE EVENTS ##
writeLines("Measuring Events")
#setup
conversion <- length(processed_data)/length(run_mean)

#convert running mean object to tibble
run_mean_tibble <- tibble::enframe(run_mean) %>%
  mutate(index = time(run_mean))

#finds lengths of events in number of running windows
run_length_encoding <- rle(hmm_posterior$state)

#converting to a tibble
rle_object <- as_tibble(do.call("cbind", run_length_encoding))

#make a copy of data for time on analysis
rle_object_4_duration <- rle_object %>%
  dplyr::filter(values == 2)

#make a copy of data for time off analysis
time_offs <- rle_object %>%
  dplyr::filter(values == 1) %>%
  tail(-1) %>% #this gets rid of the first state 1 when that begins with the start of the trace recording
  head(-1) %>% #this gets rid of the last state 1 that only ends because we stopped collecting
  mutate(off_length_5kHz = lengths*conversion,
         time_off_ms = (off_length_5kHz/5000)*1000)


#calculates the events durations
on_off_times <- rle_object_4_duration %>%
  dplyr::mutate(n_event = 1:nrow(rle_object_4_duration),
                length_5kHz = lengths*conversion,
                time_on_ms = (length_5kHz/5000)*1000,
                time_off_ms = c(NA, time_offs$time_off_ms)) %>%
  dplyr::select(n_event,values, everything()) %>%
  dplyr::rename("num_windows" = lengths,
                "hmm_state" = values)


#calculate event displacement

#If the rle_object's last row is in state 1, get rid of that last row. This needs to end in state 2 to capture the end of the last event
rle_object_4_step_sizes <- if(tail(rle_object, 1)$values == 1){
  slice(rle_object, -length(rle_object$values))
} else {
  rle_object
}
#Calculate the cumulative sum of the run length encoder
#And splits the tibble into two seperate tables to isolate state 1 info from state 2

split_data <- rle_object_4_step_sizes %>%
  dplyr::mutate(cumsum = cumsum(lengths)) %>%
  dplyr::group_by(values) %>%
  split(rle_object_4_step_sizes$values)

#data is recmombined in a state_1 column and a state_2
#the values in these columns represent the last data point (in window lengths) in either state 1 or state 2
#So the range of values between the end of state 1 (or start of state 2) and the end of state 2 is the event duration
regroup_data <- bind_cols(state_1_end = split_data[[1]]$cumsum, state_2_end = split_data[[2]]$cumsum)

#loop over regrouped data to find the mean of the events displacements
step_sizes <- vector("list", length = nrow(regroup_data)) #allocate space for output storage of loop
peak_nm_index <- vector()
for(i in seq_along(1:nrow(regroup_data))){
  win_values <- c(run_mean_tibble$value[(regroup_data$state_1_end[i]+1) : (regroup_data$state_2_end[i])])
  step_sizes[[i]] <- max(win_values)

  win_values_t <- run_mean_tibble[(regroup_data$state_1_end[i]+1) : (regroup_data$state_2_end[i]),]
  peak_nm_index[i] <- win_values_t$index[which.max(win_values_t$value)]


}

#do opposite to get means of state 1 to subtract from s2 means.
# need to subtract first s1 value and last s2 value of the cumsum to align properly
#don't need the end point of the first s1 because we don't know when the last event ended because we only have knowledge
#of events we observe when data collection starts

minus1 <- split_data[[1]]$cumsum[-1]
minus2 <- split_data[[2]]$cumsum[-length(split_data[[2]]$cumsum)]


s1_regroup_data <- bind_cols(state_2_end = minus2, state_1_end = minus1)

#loop over s1_regrouped data to find the mean of state 1
state_1_avg <- vector("list", length = nrow(regroup_data)) #allocate space for output storage of loop
state_1_avg[[1]] <- mean(run_mean_tibble$value[1:regroup_data$state_1_end[1]]) #get everage of first state 1
for(i in seq_along(1:nrow(s1_regroup_data))){
  state_1_avg[[i+1]] <- mean(run_mean_tibble$value[(s1_regroup_data$state_2_end[i]+1) : (s1_regroup_data$state_1_end[i])])
}


calculate_mean_differences <- tibble(avg_s1 = unlist(state_1_avg),
                                     avg_s2 = unlist(step_sizes),
                                     diff = avg_s2 - avg_s1)
## DIRECTION CORRECTION ##

positive_events <- sum(calculate_mean_differences$diff > 0)
negative_events <- sum(calculate_mean_differences$diff < 0)

#if there are more negative step sizes than positive, actin filament assumed backward and all events flipped (multipled by -1)
#also raw trace, state 1 averages, and step sizes flipped for hmm overlay
direction_correction <- if(negative_events > positive_events){
  calculate_mean_differences$diff * -1
} else {
  calculate_mean_differences$diff
}


flip_raw <- if(negative_events > positive_events){
  processed_data * -1
} else {
  processed_data
}

flip_state_1_avg <- if(negative_events > positive_events){
  unlist(state_1_avg) * -1
} else {
  unlist(state_1_avg)
}

flip_step_sizes <- if(negative_events > positive_events){
  unlist(step_sizes) * -1
} else {
  unlist(step_sizes)
}


#add step sizes and forces to the on_off_times table
measured_events <- on_off_times %>%
  dplyr::mutate(displacement_nm = direction_correction,
                condition = paste0(read_directions$condition[[folder]]),
                force = displacement_nm*nm2pn) %>%
  dplyr::select(condition, time_on_ms, time_off_ms, displacement_nm, force)


## SAVE OUTPUT ##

write_csv(measured_events, paste0(read_directions$folder[[folder]],
                                  "/results/",
                                  read_directions$condition[[folder]],
                                  "_",
                                  read_directions$folder[[folder]],
                                  "_hmm_measured_events.csv"))




#makae hmm overlay for ggplot and dygraph
dp2plot <- 10*5000

s1_avg_4plot <- tibble(state_order = seq(from = 1, length.out = length(state_1_avg), by = 2),
                       avg = flip_state_1_avg)



s2_avg_4plot <- tibble(state_order = seq(from = 2, length.out = length(step_sizes), by = 2),
                       avg = flip_step_sizes)


hmm_overlay <- bind_rows(s1_avg_4plot, s2_avg_4plot) %>%
  arrange(state_order)

overlay <- vector("list")
for(i in seq_along(1:nrow(hmm_overlay))){
  overlay[[i]] <- rep(hmm_overlay$avg[i],
                      (round(conversion)*rle_object$lengths[-length(rle_object$lengths)][i]))
}

overlay <- unlist(overlay)


#save data for dygraph

dygraph_master_list <- list(raw_data = processed_data,
                            run_mean = overlay,
                            final_events = measured_events,
                            periods = regroup_data %>%
                              mutate(state_2_start = (state_1_end + 1) * conversion,
                                     state_2_stop = state_2_end * conversion) %>%
                              dplyr::select(state_2_start, state_2_stop),
                            peak_nm_index = peak_nm_index * conversion,
                            parent_dir = parent_dir)

save("dygraph_master_list", file = paste0(read_directions$folder[[folder]],
                                          "/results/",
                                          read_directions$condition[[folder]],
                                          "_",
                                          read_directions$folder[[folder]],
                                          "_dygraph_data.RData"))
#make dygraph .R file
writeLines(c(
  "#+ echo=FALSE",
  paste0("parent_dir <- ","'", parent_dir, "'"),
  paste0("obs <- ", "'", read_directions$folder[[folder]], "'"),
  paste0("run_mean_color <- ", "'",overlay_color, "'"),
  "
#+ echo=FALSE, fig.width = 10, fig.height = 4
  setwd(parent_dir)

  library(tidyverse)
  library(dygraphs)
  library(rmarkdown)


  directions <- list.files(pattern = 'directions.csv')
  observation_folders <- list.files(pattern = 'obs')

  read_directions_dy <- suppressMessages(read_csv(directions)) %>%
  mutate(folder = observation_folders)%>%
  filter(include == 'yes')

  setwd(paste0(parent_dir, '/', obs, '/results'))
  dg_dat <- list.files(pattern = 'dygraph_data.RData')
  load(dg_dat)

  d <- data.frame(index = (1:length(dygraph_master_list$run_mean)/5000),
                  raw = dygraph_master_list$raw_data[1:length(dygraph_master_list$run_mean)],
                  run = dygraph_master_list$run_mean
                  )

  events <- dygraph_master_list$final_events
  peak_nm_index = dygraph_master_list$peak_nm_index/5000

  periods_df <- data.frame(start = dygraph_master_list$periods$state_2_start/5000,
                           stop = dygraph_master_list$periods$state_2_stop/5000)


  add_shades <- function(x, periods, ...){
    for(p in 1:nrow(periods)){
      x <- dyShading(x, from = periods$start[[p]], to = periods$stop[[p]], ...)
    }
    x
  }

   add_labels <- function(x, events, ...){
    for(event in 1:length(peak_nm_index)){
      x <- dyEvent(x, peak_nm_index[[event]], paste(round(events$time_on_ms[[event]], digits = 1), 'ms,', round(events$force[[event]], digits = 2), 'pN'), ...)
    }
    x
  }

  dygraph(d) %>%
    dySeries('raw', color = '#242424', strokeWidth = 2) %>%
    dySeries('run', color = run_mean_color,  strokeWidth = 2) %>%
    dyRangeSelector() %>%
    add_shades(periods_df, color = 'lightpink') %>%
    add_labels(events, labelLoc = 'bottom') %>%
    dyAxis('x', label = 'seconds', drawGrid = FALSE) %>%
    dyAxis('y', label = 'nm') %>%
    dyUnzoom()

  "),

  paste0(read_directions$folder[[folder]],
         "/results/",
         read_directions$condition[[folder]],
         "_",
         read_directions$folder[[folder]],
         "_dygraph.R")

)

report[[r]] <- paste0("failed_to_plot!_", read_directions$folder[[r]])

## PLOT OVERLAY ##
writeLines("Plotting Hmm Overlay...")



if(length(processed_data)/5000 > 41){

plot1 <- ggplot()+
  geom_line(aes(x = (1:dp2plot)/5000, y = flip_raw[1:dp2plot]))+
  geom_line(aes(x = (1:dp2plot)/5000, y = overlay[1:dp2plot]), color = overlay_color,  size = 0.75)+
  xlab("")+
  ylab("nm")+
  ggtitle(paste0(read_directions$condition[[folder]],
                 "_",
                 read_directions$folder[[folder]]))+
  theme_bw()

plot2 <- ggplot()+
  geom_line(aes(x = (dp2plot:(2*dp2plot))/5000, y = flip_raw[dp2plot:(2*dp2plot)]))+
  geom_line(aes(x = (dp2plot:(2*dp2plot))/5000, y = overlay[dp2plot:(2*dp2plot)]), color = overlay_color,  size = 0.75)+
  xlab("")+
  ylab("nm")+
  theme_bw()

plot3 <- ggplot()+
  geom_line(aes(x = ((2*dp2plot):(3*dp2plot))/5000, y = flip_raw[(2*dp2plot):(3*dp2plot)]))+
  geom_line(aes(x = ((2*dp2plot):(3*dp2plot))/5000, y = overlay[(2*dp2plot):(3*dp2plot)]), color = overlay_color,  size = 0.75)+
  xlab("")+
  ylab("nm")+
  theme_bw()

plot4 <- ggplot()+
  geom_line(aes(x = ((3*dp2plot):(4*dp2plot))/5000, y = flip_raw[(3*dp2plot):(4*dp2plot)]))+
  geom_line(aes(x = ((3*dp2plot):(4*dp2plot))/5000, y = overlay[(3*dp2plot):(4*dp2plot)]), color = overlay_color, size = 0.75)+
  xlab("Seconds")+
  ylab("nm")+
  theme_bw()


overlay_plots  <- arrangeGrob(plot1, plot2, plot3, plot4, ncol = 1)

ggsave(paste0(read_directions$folder[[folder]],
              "/results/",
              read_directions$condition[[folder]],
              "_",
              read_directions$folder[[folder]], "_hmm_overlay_plot.png"), overlay_plots, width = 12)



} else if(length(processed_data)/5000 < 41 & length(processed_data)/5000 > 21){


  plot1 <- ggplot()+
    geom_line(aes(x = (1:dp2plot)/5000, y = flip_raw[1:dp2plot]))+
    geom_line(aes(x = (1:dp2plot)/5000, y = overlay[1:dp2plot]), color = overlay_color,  size = 0.75)+
    xlab("")+
    ylab("nm")+
    ggtitle(paste0(read_directions$condition[[folder]],
                   "_",
                   read_directions$folder[[folder]]))+
    theme_bw()

  plot2 <- ggplot()+
    geom_line(aes(x = (dp2plot:(2*dp2plot))/5000, y = flip_raw[dp2plot:(2*dp2plot)]))+
    geom_line(aes(x = (dp2plot:(2*dp2plot))/5000, y = overlay[dp2plot:(2*dp2plot)]), color = overlay_color,  size = 0.75)+
    xlab("Seconds")+
    ylab("nm")+
    theme_bw()

  overlay_plots  <- arrangeGrob(plot1, plot2, ncol = 1)

  ggsave(paste0(read_directions$folder[[folder]],
                "/results/",
                read_directions$condition[[folder]],
                "_",
                read_directions$folder[[folder]], "_hmm_overlay_plot.png"), overlay_plots, width = 12)
} else {

  plot1 <- ggplot()+
    geom_line(aes(x = (1:25000)/5000, y = flip_raw[1:dp2plot]))+
    geom_line(aes(x = (1:25000)/5000, y = overlay[1:dp2plot]), color = overlay_color,  size = 0.75)+
    xlab("")+
    ylab("nm")+
    ggtitle(paste0(read_directions$condition[[folder]],
                   "_",
                   read_directions$folder[[folder]]))+
    theme_bw()

  ggsave(paste0(read_directions$folder[[folder]],
                "/results/",
                read_directions$condition[[folder]],
                "_",
                read_directions$folder[[folder]], "_hmm_overlay_plot.png"), plot1, width = 12)

}


  }, error=function(e){
    writeLines(paste0("Analysis error in ",
           read_directions$folder[[r]],
           "with error: ",
           as.character(e)), error_file)
    cat("ERROR :",conditionMessage(e), "\n")
    })

}


#render to HTML
writeLines("Render to HTML")
for(r in 1:nrow(read_directions)){
  tryCatch({

    report[[r]] <- paste0("failed_to_render_dygraph!_", read_directions$folder[[r]])

    rmarkdown::render(paste0(parent_dir,
                             "/",
                             read_directions$folder[[r]],
                             "/results/",
                             read_directions$condition[[r]],
                             "_",
                             read_directions$folder[[r]],
                             "_dygraph.R"))

    report[[r]] <- paste0("success!_", read_directions$folder[[r]])

  }, error=function(e){
    writeLines(paste0("Render to HTML failed on ",
                      read_directions$folder[[r]],
                      " with error: ",
                      as.character(e)), error_file)
    cat("ERROR :",conditionMessage(e), "\n")
    })

}

export_directions <- suppressMessages(read_csv(directions)) %>%
  mutate(folder = observation_folders,
         grouped_file = grouped4r_files)

success_report <- tibble(analysis_complete = unlist(report)) %>%
  separate(analysis_complete, c("report", "folder"), sep = "!_") %>%
  full_join(export_directions) %>%
  replace_na(list(report = "user_excluded")) %>%
  arrange(folder) %>%
  select(-starts_with("grouped_file"))

write_csv(success_report, "directions.csv")



#---------
close(error_file)



  writeLines("Done")

}