```r
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)
```

In this article, I present how to run the real data analysis on a compute cluster, from preparing the data to fetching and visualizing the results.

## Steps

Follow these steps:

  1. Create a folder named Analysis. Within it, put your raw data in the subfolder ./Data/JA. The raw data is assumed to be paired .txt files named UNIQUEIDDoubleSound.txt and UNIQUEIDDoubleSound_spiketimes.txt, storing the experimental conditions and the spike-time records, respectively.

  2. Run the R code in the Preparation section to create a filenames.txt file in the folder Analysis listing all the paired file names, and to split those names into n_part parts (I use 3 parts here), producing filenames_NUMBER.txt files in Analysis, each containing one part of the paired file names.

  3. Create an R file named prml_analysis.R within the folder Analysis, following the section R Script (Parallel Version).

  4. Create a batch script named prml_analysis.sh within the folder Analysis, following the section Batch Script.

  5. Create a folder others within Analysis to store the output and error messages from the R and batch scripts.

  6. Upload the folder Analysis to the compute cluster following the section Upload Files (a sketch of the expected local folder layout appears after this list).

  7. Download the Singularity image to the folder Analysis as shown in the section Singularity file. Note that once it is downloaded, you only need to copy it into any new analysis folder the next time you use it.

  8. Run the analysis on the compute cluster by following the section Run in Compute Cluster, where you can also check the status of your jobs.

  9. After all the jobs finish, fetch the results to your local folder and visualize them. The section Fetch Results and Visualizations shows how to gather the results and do basic data cleaning before visualization.

  10. See the section Notes for additional remarks.

  11. If you want to apply prml_tests_f() instead of prml_tests(), you only need to replace the R script with the one shown in the section Another R Script Example.
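
For reference, after steps 1-5 your local Analysis folder should contain roughly the following. This is only a sketch: the number of filenames_NUMBER.txt parts depends on n_part, and PATH_TO_ANALYSIS is a placeholder for your own path.

```r
# Sketch of the expected local layout of `Analysis` before uploading:
#   Data/JA/                               paired raw .txt files
#   filenames.txt                          all paired file names
#   filenames_1.txt ... filenames_3.txt    one part each (n_part = 3 here)
#   prml_analysis.R                        R script (Parallel Version)
#   prml_analysis.sh                       batch script
#   others/                                SLURM .out/.err files will land here
list.files("PATH_TO_ANALYSIS", recursive = TRUE)
```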

## Preparation

Make sure you have all the paired raw data files in the folder ./Data/JA. The following code first creates a filenames.txt file listing all the paired file names, then splits it into n_part parts.

library("dplyr")
library("purrr")
library("stringr")

# Create a `filenames.txt` file, extracting all the paired file names from folder `./Data/JA`. 

filenames <- list.files(path = "./Data/JA",pattern = "DoubleSound\\.txt")%>%
  str_split(.,".txt",simplify = TRUE)%>%
  .[,1]
write(filenames,file = "filenames.txt")

#split the file names into `n_part` parts.

#fnames -- the file name stems listed in filenames.txt
fnames <- scan("filenames.txt", "a")
n_part <- 3
flist <- split(fnames, seq_along(fnames)%%n_part+1)
map(1:length(flist),~write(flist[[.x]],file = paste0("filenames_",.x,".txt")))
```
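
As a quick optional sanity check (assuming you just ran the code above in the Analysis folder), you can read the parts back and confirm that nothing was lost in the split:

```r
# Read the split files back and confirm they cover all file names exactly once.
part_files <- list.files(pattern = "^filenames_[0-9]+\\.txt$")
parts <- unlist(lapply(part_files, scan, what = "a", quiet = TRUE))
length(parts) == length(fnames)      # TRUE if nothing was lost
setequal(parts, fnames)              # TRUE if the same names are covered
```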

## R Script (Parallel Version)

Write an R script named prml_analysis.R.

Relative to the Real Data Analysis article, I parallelize the nested for loop in prml.from.fname over all possible triplets using n_parallel worker processes (parallel::mclapply()). I also split filenames.txt into parts named filenames_NUMBER.txt (indexed by the file_number argument) so that each part can run as a separate array job on the compute cluster.
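
As a toy illustration of this pattern (the numbers below are made up and not part of the analysis), the nested loop over alternative frequencies and positions becomes one parallel::mclapply() call over the rows of an expand.grid() table, with mc.cores playing the role of n_parallel:

```r
library(parallel)

alt.freq <- c(500, 1100)          # hypothetical alternative frequencies
alt.pos  <- c(-24, -6, 6, 24)     # the four speaker positions used below
df <- expand.grid(alt.freq, alt.pos)

# Each row of `df` is one candidate triplet; mclapply() distributes the rows
# over mc.cores worker processes.
res <- mclapply(seq_len(nrow(df)),
                function(x) c(df[x, 1], df[x, 2], df[x, 1] * df[x, 2]),
                mc.cores = 2)
length(res)   # one result per (frequency, position) pair
```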

The prml_analysis.R script takes two command-line arguments: n_parallel (the number of cores used by mclapply()) and file_number (the index of the filenames_NUMBER.txt part to process).

```r
{
  args = commandArgs(trailingOnly = TRUE)
  stopifnot(length(args) == 2)

  n_parallel = as.integer(args[1])
  file_number = args[2]

  suppressMessages(library("statmod"))
  suppressMessages(library("dplyr"))
  suppressMessages(library("purrr"))
  suppressMessages(library("prml"))



  # prml.from.tri

  prml.from.tri <- function(trials,
                            spiketimes,
                            frq = c(1100, 742),
                            pos = c(24,-6),
                            on.reward = TRUE,
                            start.time = 0,
                            end.time = 600,
                            match.level = FALSE,
                            AB.eqlevel = FALSE,
                            go.by.soff = FALSE,
                            remove.zeros = FALSE,
                            ...) {
    attach(trials)
    attach(spiketimes)

    timestamps <- split(TIMES, TRIAL2)
    ntrials <- length(timestamps)
    trial.id <- as.numeric(names(timestamps))

    ix1 <- TASKID == 8 & A_FREQ == frq[1] & XA == pos[1]
    ix2 <- TASKID == 8 & A_FREQ == frq[2] & XA == pos[2]
    ix3 <-
      TASKID == 12 &
      (A_FREQ == frq[1] &
         B_FREQ == frq[2] &
         XA == pos[1] &
         XB == pos[2]) |
      (A_FREQ == frq[2] & B_FREQ == frq[1] & XA == pos[2] & XB == pos[1])

    if (on.reward) {
      ix1 <- ix1 & REWARD == 1
      ix2 <- ix2 & REWARD == 1
      ix3 <- ix3 & REWARD == 1
    }

    blev <- sort(unique(B_LEVEL[ix3]))
    targ.lev <- blev[blev > 0]
    lev <- "*"
    if (match.level) {
      if (length(targ.lev) > 1) {
        targ.lev <- max(targ.lev)
        warning("Multiple single sound levels, choosing largest one")
      }
      ix1 <- ix1 & A_LEVEL == targ.lev
      ix2 <- ix2 & A_LEVEL == targ.lev
      lev <- as.character(targ.lev)
    }

    if (AB.eqlevel)
      ix3 <- ix3 & (A_LEVEL == B_LEVEL)

    sing1 <- trials[ix1, 1]
    sing2 <- trials[ix2, 1]
    duplx <- trials[ix3, 1]
    success <- REWARD[ix3]

    if (go.by.soff)
      end.time <- min(SOFF[ix1 | ix2 | ix3])

    spike.counter <- function(jj) {
      jj1 <- match(jj, trial.id)
      spks <- timestamps[[jj1]]
      return(sum(spks > start.time & spks < end.time))
    }

    Acounts <- sapply(sing1, spike.counter)
    Bcounts <- sapply(sing2, spike.counter)
    ABcounts <- sapply(duplx, spike.counter)

    detach(trials)
    detach(spiketimes)

    if ((mean(Acounts) == 0) | (mean(Bcounts) == 0) |
        (mean(ABcounts) == 0)) {
      stop("All the spike counts are 0 under a specific condition.")
    }

    if ((length(Acounts) == 1) |
        (length(Bcounts) == 1) | (length(ABcounts) == 1)) {
      stop("Only 1 available trial under a specific condition.")
    }

    s1 <- paste(frq[1], "Hz ", pos[1], "deg ", lev, "Db ", sep = "")
    s2 <- paste(frq[2], "Hz ", pos[2], "deg ", lev, "Db ", sep = "")
    dp <- paste("Duplex: ", lev, "Db ", sep = "")

    res = prml_tests(
      xA = Acounts,
      xB = Bcounts,
      xAB = ABcounts,
      labels = c(s1, s2, dp),
      e = 0,
      remove.zeros = remove.zeros
    )
    # By default:
    #remove.zeros = FALSE
    #mu_l="min",mu_u="max",gamma.pars = c(0.5, 2e-10),
    #n_gq = 20, n_per = 100, alpha = 0.5

    return(res)
  }

  # prml.from.fname

  prml.from.fname <-
    function(fname,
             data.path = "Data",
             on.reward = TRUE,
             match.level = FALSE,
             AB.eqlevel = FALSE,
             outfile = "",
             start = 0,
             end = 600,
             remove.zeros = FALSE,
             mccores = 4) {
      infile1 <- paste(data.path, "/JA/", fname, ".txt", sep = "")

      trials <-
        read.table(
          infile1,
          col.names = c(
            "TRIAL",
            "TASKID",
            "A_FREQ",
            "B_FREQ",
            "XA",
            "XB",
            "REWARD",
            "A_LEVEL",
            "B_LEVEL"
          )
        )

      infile2 <-
        paste(data.path, "/JA/", fname, "_spiketimes.txt", sep = "")

      spiketimes <-
        read.table(infile2, col.names = c("TRIAL2", "TIMES"))

      FREQS <- unique(c(trials$A_FREQ, trials$B_FREQ))
      alt.freq <- sort(FREQS[FREQS > 0 & FREQS != 742])
      alt.pos <- c(-24,-6, 6, 24)

      df <- expand.grid(alt.freq, alt.pos)
      res <- parallel::mclapply(1:nrow(df),
                                function(x) {
                                  try({
                                    lbf <-
                                      prml.from.tri(
                                        trials,
                                        spiketimes,
                                        c(df[x, 1], 742),
                                        c(df[x, 2],-144 / df[x, 2]),
                                        on.reward,
                                        start,
                                        end,
                                        match.level,
                                        AB.eqlevel,
                                        go.by.soff = FALSE,
                                        remove.zeros = remove.zeros
                                      ) %>% unlist(.)

                                    c(df[x, 1], df[x, 2], lbf)
                                  })
                                },
                                mc.cores = mccores)
      lapply(res, function(x) {
        cat(fname, x, "\n", file = outfile, append = TRUE)
      })
    }

  set.seed(123)
  fnames <- scan(paste0("filenames_", file_number, ".txt"), "a") 

  # for loop across all the file.
  start = Sys.time()

  for (file.name in fnames) {
    #"Data" is the folder you store all the .txt files
    #"result.txt" is the folder you store the result
    try(prml.from.fname(
      file.name,
      data.path = "Data",
      on.reward = TRUE,
      match.level = FALSE,
      AB.eqlevel = FALSE,
      outfile = paste0("result_", file_number, ".txt"),
      start = 0,
      end = 600,
      remove.zeros = FALSE,
      mccores = n_parallel
    ))
  }
  end = Sys.time()

  cat("Run time: ", difftime(end, start, units="secs"), "s\n", sep="")
}
```
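
Before submitting to the cluster, you may want a quick interactive test on a single recording. A minimal sketch, assuming the prml package is installed locally, you have run the function definitions above in your session, and filenames_1.txt is present; result_test.txt is just a hypothetical output file name:

```r
# Interactive dry run on the first file of part 1, using 2 cores.
suppressMessages(library("dplyr"))
suppressMessages(library("prml"))

fnames <- scan("filenames_1.txt", "a")
prml.from.fname(fnames[1],
                data.path = "Data",
                outfile   = "result_test.txt",
                mccores   = 2)
readLines("result_test.txt")
```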

## Batch Script

Create a .sh file named prml_analysis.sh.

If you create the file from the command line, use vim to create and edit it; press Esc to leave insert mode, then press Shift+ZZ to save and exit.

```{bash,eval=FALSE,echo=TRUE}
cd PATH_TO_ANALYSIS
vim prml_analysis.sh
```

If you use RStudio to create the file:

File -> New File -> Text File -> Save -> Change the name to `prml_analysis.sh`.

Copy the following into the .sh file. Note:

- `-c8` should match the `n_parallel` argument of the R script, i.e., the first number after `prml_analysis.R` in the `singularity run` line (8 here).
- `--array=1-3` should match `n_part` from the `Preparation` section, i.e., the number of `filenames_NUMBER.txt` files you created (see the check after the script below).


```{bash,eval=FALSE,echo=TRUE}
#!/bin/bash
#SBATCH --partition=common
#SBATCH --output=others/filenames_%a.out
#SBATCH --error=others/filenames_%a.err
#SBATCH -c8
#SBATCH --array=1-3
singularity run -B $(pwd):/work prml.simg prml_analysis.R 8 $SLURM_ARRAY_TASK_ID
```
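
If you are not sure how many parts you created (and hence what range to put in `--array=`), a quick check from R, run in the Analysis folder, is:

```r
# Count the filenames_NUMBER.txt parts; use 1-<this number> in --array=
length(list.files(pattern = "^filenames_[0-9]+\\.txt$"))
```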

## Upload Files

  1. Open a new terminal (terminal 1).

  2. Log in to the compute cluster.

  3. Create a folder named Analysis.

```{bash,eval=FALSE,echo=TRUE}
ssh ACCOUNT@dcc-slogin.oit.duke.edu
mkdir Analysis
```

  4. Open a second terminal (terminal 2).

- Copy the path to the local folder `Analysis`.
- Go to the folder `Analysis`.
- Copy all the files within the local folder `Analysis` to the folder `Analysis` on the compute cluster.

```{bash,eval=FALSE,echo=TRUE}
cd PATH_TO_ANALYSIS
rsync -av * ACCOUNT@dcc-slogin.oit.duke.edu:./Analysis
```

  5. Check your files.

In terminal 1, go to the folder Analysis to see the list of files.

```{bash,eval=FALSE,echo=TRUE}
cd Analysis/
ls
```

## Singularity file

I built a Docker image providing the environment needed to run the `prml_analysis.R` script. For details on the Singularity file, please refer to the article [Singularity Environment Building](https://yunranchen.github.io/prml/articles/singularity.html).

Here you only need to pull it from Docker Hub:

- Log in to the compute cluster.
- Go to the folder named `Analysis`.
- Download the Singularity image: copy the following line and run it on the compute cluster.

```{bash,eval=FALSE,echo=TRUE}
singularity pull docker://yunran/prml
```

## Run in Compute Cluster

Preparation: make sure your folder Analysis contains prml_analysis.R, prml_analysis.sh, prml.simg, the filenames_NUMBER.txt files, and the folders others and Data.

- Submit the jobs.

```{bash,eval=FALSE,echo=TRUE}
sbatch prml_analysis.sh
```

- Check the status

```{bash,eval=FALSE,echo=TRUE}
squeue -u ACCOUNT
```

## Fetch Results and Visualizations

Assume you create a folder named Results to store all the results of the PRML analysis.

```{bash,eval=FALSE,echo=TRUE}
cd PATH_TO_RESULTS
```

- Fetch the results from the compute cluster back to your local folder `Results`.

```{bash,eval=FALSE,echo=TRUE}
rsync -av ACCOUNT@dcc-slogin.oit.duke.edu:./Analysis/result_*.txt PATH_TO_RESULTS
```

```{bash,eval=FALSE,echo=TRUE}
ls
```

- R code to clean the data, store it in a `.RData` file, and visualize it:

```r
library("tidyverse")

filelist = list.files(pattern = ".*.txt")
raw = map_df(filelist, function(x) {
  read.table(
    x,
    header = FALSE,
    sep = " ",
    stringsAsFactors = FALSE,
    fill = TRUE
  ) %>%
    filter(!V2 %in% c("Error", ""))
}) %>%
  mutate(e = 0) %>% #plug in the value of e you set in prml_tests()
  select(-V15)
names(raw) = c(
  "CellId",
  "AltFreq",
  "AltPos",
  "SepBF",
  "PrMix",
  "PrInt",
  "PrOut",
  "PrSing",
  "WinModels",
  "Pval1",
  "Pval2",
  "SampSizeA",
  "SampSizeB",
  "SampSizeAB",
  "e"
)
raw = raw %>% mutate(
  SepBF = SepBF %>% as.numeric(),
  PrMix = PrMix %>% as.numeric(),
  PrInt = PrInt %>% as.numeric(),
  PrOut = PrOut %>% as.numeric(),
  PrSing = PrSing %>% as.numeric(),
  WinModels = WinModels %>% as.numeric(),
  Pval1 = Pval1 %>% as.numeric(),
  Pval2 = Pval2 %>% as.numeric(),
  SampSizeA = SampSizeA %>% as.numeric(),
  SampSizeB = SampSizeB %>% as.numeric(),
  SampSizeAB = SampSizeAB %>% as.numeric()
)
model.names = c("Mixture", "Intermediate", "Outside", "Single")
raw = raw %>% mutate(cid = paste(CellId, AltFreq, AltPos, sep = "."))
#filter out data that has errors
cids = raw %>% filter(!WinModels %in% c(1:4)) %>% pull(cid)
rawf = raw %>% filter(!cid %in% cids) %>%
  mutate(WinModel = model.names[WinModels],
         WinPr = pmax(PrMix, PrInt, PrOut, PrSing))

#triplets not filtered by PRML filter
tri_unfilter=rawf%>%
  filter(SampSizeA>=5,SampSizeB>=5,SampSizeAB>=5,SepBF>=3)%>% #well-separated and enough sample size
  mutate(WinProb=cut(WinPr, breaks=c(0,0.25, 0.5, 0.95, 1.01),include.lowest = TRUE)) #cut the winning probabilities to four categories


#triplets filtered by PRML filter
tri_filter=tri_unfilter %>% 
  mutate(invPval1=1/Pval1,invPval2=1/Pval2)%>%
  filter(invPval1<=20,invPval2<=20)%>% #filter by PRML filter.
  dplyr::select(-invPval1,-invPval2)

tri=rbind(tri_unfilter,tri_filter)%>%
  mutate(group=rep(c("unfiltered","filtered"),
                   c(nrow(tri_unfilter),nrow(tri_filter))))%>%
  mutate(WinModel=factor(WinModel,levels = c("Outside","Single","Intermediate","Mixture")))

#save as a .RData
save(tri,file = "tri_filter_unfilter.RData")
library("ggplot2")

ggplot(data = tri,mapping = aes(x = WinModel,color=WinProb,fill=WinProb))+
  geom_bar(stat="count",position = "stack")+
  facet_grid(~group)+
  geom_text(stat='count', aes(label=..count..), position = position_stack(vjust = 0.5),size=3,color="black")+
  scale_x_discrete(drop=FALSE) +
  scale_colour_grey(start = 0.9,end=0.3,drop=FALSE) + #if you use grey color
  scale_fill_grey(start = 0.9,end=0.3,drop=FALSE) +
  theme_bw()
ggsave(filename = "plot.pdf",width = 8,height = 4)
```
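
Later you can reload the cleaned table without repeating the cleaning step, for example:

```r
# Reload the cleaned triplet table saved above; this restores the object `tri`.
load("tri_filter_unfilter.RData")
dplyr::glimpse(tri)
table(tri$group, tri$WinModel)
```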

## Notes

## Another R Script Example

This R script applies prml_tests_f() instead of prml_tests(). Besides result_NUMBER.txt, it also writes mix_result_NUMBER.txt and int_out_result_NUMBER.txt, which are used for the per-triplet plots in the section below.

```r
{
  args = commandArgs(trailingOnly = TRUE)
  stopifnot(length(args) == 2)

  n_parallel = as.integer(args[1])
  file_number = args[2]

  suppressMessages(library("statmod"))
  suppressMessages(library("dplyr"))
  suppressMessages(library("purrr"))
  suppressMessages(library("prml"))

  prml.from.tri.f <- function(trials,
                              spiketimes,
                              frq = c(1100, 742),
                              pos = c(24,-6),
                              on.reward = TRUE,
                              start.time = 0,
                              end.time = 600,
                              match.level = FALSE,
                              AB.eqlevel = FALSE,
                              go.by.soff = FALSE,
                              remove.zeros = FALSE,
                              ...) {
    attach(trials)
    attach(spiketimes)

    timestamps <- split(TIMES, TRIAL2)
    ntrials <- length(timestamps)
    trial.id <-
      as.numeric(names(timestamps)) ## same as unique(TRIAL2)

    ix1 <- TASKID == 8 & A_FREQ == frq[1] & XA == pos[1]
    ix2 <- TASKID == 8 & A_FREQ == frq[2] & XA == pos[2]
    ix3 <-
      TASKID == 12 &
      (A_FREQ == frq[1] &
         B_FREQ == frq[2] &
         XA == pos[1] &
         XB == pos[2]) |
      (A_FREQ == frq[2] & B_FREQ == frq[1] & XA == pos[2] & XB == pos[1])

    if (on.reward) {
      ix1 <- ix1 & REWARD == 1
      ix2 <- ix2 & REWARD == 1
      ix3 <- ix3 & REWARD == 1
    }

    blev <- sort(unique(B_LEVEL[ix3]))
    targ.lev <- blev[blev > 0]
    lev <- "*"
    if (match.level) {
      if (length(targ.lev) > 1) {
        targ.lev <- max(targ.lev)
        warning("Multiple single sound levels, choosing largest one")
      }
      ix1 <- ix1 & A_LEVEL == targ.lev
      ix2 <- ix2 & A_LEVEL == targ.lev
      lev <- as.character(targ.lev)
    }

    if (AB.eqlevel)
      ix3 <- ix3 & (A_LEVEL == B_LEVEL)

    sing1 <- trials[ix1, 1]
    sing2 <- trials[ix2, 1]
    duplx <- trials[ix3, 1]
    success <- REWARD[ix3]

    if (go.by.soff)
      end.time <- min(SOFF[ix1 | ix2 | ix3])

    spike.counter <- function(jj) {
      jj1 <- match(jj, trial.id)
      spks <- timestamps[[jj1]]
      return(sum(spks > start.time & spks < end.time))
    }

    Acounts <- sapply(sing1, spike.counter)
    Bcounts <- sapply(sing2, spike.counter)
    ABcounts <- sapply(duplx, spike.counter)

    detach(trials)
    detach(spiketimes)

    if ((mean(Acounts) == 0) | (mean(Bcounts) == 0) |
        (mean(ABcounts) == 0)) {
      stop("All the spike counts are 0 under a specific condition.")
    }

    if ((length(Acounts) == 1) |
        (length(Bcounts) == 1) | (length(ABcounts) == 1)) {
      stop("Only 1 available trial under a specific condition.")
    }

    s1 <- paste(frq[1], "Hz ", pos[1], "deg ", lev, "Db ", sep = "")
    s2 <- paste(frq[2], "Hz ", pos[2], "deg ", lev, "Db ", sep = "")
    dp <- paste("Duplex: ", lev, "Db ", sep = "")
    res = prml_tests_f(
      xA = Acounts,
      xB = Bcounts,
      xAB = ABcounts,
      labels = c(s1, s2, dp),
      e = 0,
      remove.zeros = remove.zeros
    )
    # By default:
    #remove.zeros = FALSE
    #mu_l="min",mu_u="max",gamma.pars = c(0.5, 2e-10),
    #n_gq = 20, n_mu = 100, n_per = 100, alpha = 0.5

    return(res)
  }


  prml.from.fname.f <-
    function(fname,
             data.path = "Data",
             on.reward = TRUE,
             match.level = FALSE,
             AB.eqlevel = FALSE,
             outfile = "",
             start = 0,
             end = 600,
             remove.zeros = FALSE,
             mccores = 4) {
      infile1 <- paste(data.path, "/JA/", fname, ".txt", sep = "")

      trials <-
        read.table(
          infile1,
          col.names = c(
            "TRIAL",
            "TASKID",
            "A_FREQ",
            "B_FREQ",
            "XA",
            "XB",
            "REWARD",
            "A_LEVEL",
            "B_LEVEL"
          )
        )

      infile2 <-
        paste(data.path, "/JA/", fname, "_spiketimes.txt", sep = "")
      spiketimes <-
        read.table(infile2, col.names = c("TRIAL2", "TIMES"))

      FREQS <- unique(c(trials$A_FREQ, trials$B_FREQ))
      alt.freq <- sort(FREQS[FREQS > 0 & FREQS != 742])
      alt.pos <- c(-24,-6, 6, 24)

      df <- expand.grid(alt.freq, alt.pos)
      res <- parallel::mclapply(1:nrow(df),
                                function(x) {
                                  try({
                                    lbf <-
                                      prml.from.tri.f(
                                        trials,
                                        spiketimes,
                                        c(df[x, 1], 742),
                                        c(df[x, 2],-144 / df[x, 2]),
                                        on.reward,
                                        start,
                                        end,
                                        match.level,
                                        AB.eqlevel,
                                        go.by.soff = FALSE,
                                        remove.zeros = remove.zeros
                                      )
                                    cla <- c(df[x, 1], df[x, 2], unlist(lbf$out1))
                                    mix <- c(df[x, 1], df[x, 2], unlist(lbf$out2$mix_pf))
                                    int <- c(df[x, 1], df[x, 2], unlist(lbf$out2$int_pf))
                                    outA <- c(df[x, 1], df[x, 2], unlist(lbf$out2$outA_pf))
                                    outB <- c(df[x, 1], df[x, 2], unlist(lbf$out2$outB_pf))
                                    list(cla, mix, int, outA, outB)
                                  })
                                },
                                mc.cores = mccores)
      lapply(res, function(x) {
        try({
          cat(fname, x[[1]], "\n", file = outfile, append = TRUE)
          cat("mix",
              fname,
              x[[2]],
              "\n",
              file = paste0("mix_", outfile),
              append = TRUE)
          cat(
            "int",
            fname,
            x[[3]],
            "\n",
            file = paste0("int_out_", outfile),
            append = TRUE
          )
          cat(
            "outA",
            fname,
            x[[4]],
            "\n",
            file = paste0("int_out_", outfile),
            append = TRUE
          )
          cat(
            "outB",
            fname,
            x[[5]],
            "\n",
            file = paste0("int_out_", outfile),
            append = TRUE
          )
        })
      })

    }

  set.seed(123)
  fnames <- scan(paste0("filenames_", file_number, ".txt"), "a")

  start <- Sys.time()

  for (file.name in fnames) {
    #"Data" is the folder you store all the .txt files
    #"result.txt" is the folder you store the result
    try(prml.from.fname.f(
      file.name,
      data.path = "Data",
      on.reward = TRUE,
      match.level = FALSE,
      AB.eqlevel = FALSE,
      outfile = paste0("result_", file_number, ".txt"),
      start = 0,
      end = 600,
      remove.zeros = FALSE,
      mccores = n_parallel
    ))
  }

  end <- Sys.time()

  cat("Run time: ", difftime(end, start, units = "secs"), "s\n", sep = "")
}
```

## Fetch Results and Visualizations

In your local folder Results, fetch the results from the compute cluster.

```{bash,eval=FALSE,echo=TRUE}
rsync -av ACCOUNT@dcc-slogin.oit.duke.edu:./Analysis/*result_*.txt PATH_TO_RESULTS
```

Data cleaning and visualization:

- Obtain the file lists for all the results.

```r
library("tidyverse")

filelist_cla = list.files(pattern = "^result_*")
filelist_mix = list.files(pattern = "mix_result_*")
filelist_intout = list.files(pattern = "int_out_result_*")
raw = map_dfr(filelist_cla, function(x) {
  read.table(
    x,
    header = FALSE,
    sep = " ",
    stringsAsFactors = FALSE,
    fill = TRUE
  ) %>%
    filter(!V2 %in% c("Error", ""))
}) %>%
  mutate(e = 0) %>% #plug in the value of e you set in prml_tests()
  select(-V15)
names(raw) = c(
  "CellId",
  "AltFreq",
  "AltPos",
  "SepBF",
  "PrMix",
  "PrInt",
  "PrOut",
  "PrSing",
  "WinModels",
  "Pval1",
  "Pval2",
  "SampSizeA",
  "SampSizeB",
  "SampSizeAB",
  "e"
)
raw = raw %>% mutate(
  SepBF = SepBF %>% as.numeric(),
  PrMix = PrMix %>% as.numeric(),
  PrInt = PrInt %>% as.numeric(),
  PrOut = PrOut %>% as.numeric(),
  PrSing = PrSing %>% as.numeric(),
  WinModels = WinModels %>% as.numeric(),
  Pval1 = Pval1 %>% as.numeric(),
  Pval2 = Pval2 %>% as.numeric(),
  SampSizeA = SampSizeA %>% as.numeric(),
  SampSizeB = SampSizeB %>% as.numeric(),
  SampSizeAB = SampSizeAB %>% as.numeric()
)
model.names = c("Mixture", "Intermediate", "Outside", "Single")
raw = raw %>% mutate(cid = paste(CellId, AltFreq, AltPos, sep = "."))
#filter out data that has errors
cids = raw %>% filter(!WinModels %in% c(1:4)) %>% pull(cid)
rawf = raw %>% filter(!cid %in% cids) %>%
  mutate(WinModel = model.names[WinModels],
         WinPr = pmax(PrMix, PrInt, PrOut, PrSing))

#triplets not filtered by PRML filter
tri_unfilter=rawf%>%
  filter(SampSizeA>=5,SampSizeB>=5,SampSizeAB>=5,SepBF>=3)%>% #well-separated and enough sample size
  mutate(WinProb=cut(WinPr, breaks=c(0,0.25, 0.5, 0.95, 1.01),include.lowest = TRUE)) #cut the winning probabilities to four categories


#triplets filtered by PRML filter
tri_filter=tri_unfilter %>% 
  mutate(invPval1=1/Pval1,invPval2=1/Pval2)%>%
  filter(invPval1<=20,invPval2<=20)%>% #filter by PRML filter.
  dplyr::select(-invPval1,-invPval2)

tri=rbind(tri_unfilter,tri_filter)%>%
  mutate(group=rep(c("unfiltered","filtered"),
                   c(nrow(tri_unfilter),nrow(tri_filter))))%>%
  mutate(WinModel=factor(WinModel,levels = c("Outside","Single","Intermediate","Mixture")))

#save as a .RData
save(tri,file = "tri_filter_unfilter.RData")
library("ggplot2")

ggplot(data = tri,mapping = aes(x = WinModel,color=WinProb,fill=WinProb))+
  geom_bar(stat="count",position = "stack")+
  facet_grid(~group)+
  geom_text(stat='count', aes(label=..count..), position = position_stack(vjust = 0.5),size=3,color="black")+
  scale_x_discrete(drop=FALSE) +
  scale_colour_grey(start = 0.9,end=0.3,drop=FALSE) + #if you use grey color
  scale_fill_grey(start = 0.9,end=0.3,drop=FALSE) +
  theme_bw()
ggsave(filename = "plot.pdf",width = 8,height = 4)
e_=0 # the value of e 


filelist_intout = list.files(pattern = "int_out_result_*")
int = map_dfr(filelist_intout, function(x) {
  read.table(
    x,
    header = FALSE,
    sep = " ",
    stringsAsFactors = FALSE,
    fill = TRUE
  )}) %>%
  mutate(e=e_) %>%
  mutate(cid=paste(V2,V3,V4,sep = "."))

mix = map_dfr(filelist_mix, function(x) {
  read.table(
    x,
    header = FALSE,
    sep = " ",
    stringsAsFactors = FALSE,
    fill = TRUE
  )}) %>%
  mutate(e=e_,V5=e_+(1-2*e_)*V5,V6=e_+(1-2*e_)*V6)%>%
  mutate(cid=paste(V2,V3,V4,sep = ".")) 
# Here I draw the filtered triplets.

dfres=tri%>%
  filter(group=="filtered")

ids=dfres%>%pull(cid)

pdf(height = 3, width = 5, file = "plot_pdf.pdf")
for (i in ids){
  try({
    int0=int%>%filter(cid==i,V1=="int")%>%.[,5:104]%>%t()
    df1=data.frame(ind=seq(e_,1-e_,length.out = 100),y=int0,e=rep(e_,100),model=rep("int",100))
    mix0=mix%>%filter(cid==i)%>%.[,c(5,6)]%>%t()
    df2=data.frame(ind=c(0,1),y=mix0,e=rep(0,2))
    p=ggplot(data = df1,mapping = aes(x=ind,y=y))+
      geom_line()+
      #facet_grid(.~e)+
      geom_point(data=df2,mapping = aes(x=ind,y=y))+
      geom_hline(yintercept = e_,linetype="dashed")+
      geom_hline(yintercept = 1-e_,linetype="dashed")+
      geom_vline(xintercept = e_,linetype="dashed")+
      geom_vline(xintercept = 1-e_,linetype="dashed")+
      ggtitle(paste0(i,";"),
              paste0(dfres%>%filter(cid==i)%>%pull(WinModel),
                     ";WinProb:",
                     dfres%>%filter(cid==i)%>%pull(WinPr)%>%round(.,4),
                     ";SizeA:",
                     dfres%>%filter(cid==i)%>%pull(SampSizeA),
                     ";SizeB:",
                     dfres%>%filter(cid==i)%>%pull(SampSizeB),
                     ";SizeAB:",
                     dfres%>%filter(cid==i)%>%pull(SampSizeAB)
              ))+
      theme_bw()+
      xlab("")
    print(p)
  })
}
dev.off()
```

