```r
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)
```

In this article, I present how to run the real data analysis on a compute cluster, from preparing the data to fetching and visualizing the results.

## Steps

Follow these steps:

  1. Create a folder named Analysis. Within it, put your raw data in the subfolder ./Data/JA. The raw data is assumed to be paired .txt files named UNIQUEIDDoubleSound.txt and UNIQUEIDDoubleSound_spiketimes.txt, storing the experimental conditions and the spike-time records, respectively.

  2. Run the R code in the Preparation section to create a filenames.txt file in the folder Analysis listing all the paired file names, and to split those names into n_part parts (I use 3 parts here), producing filenames_NUMBER.txt files in Analysis, each containing one part of the paired file names.

  3. Create an R file named prml_analysis.R within the folder Analysis, following the section R Script (Parallel Version).

  4. Create a batch script named prml_analysis.sh within the folder Analysis, following the section Batch Script.

  5. Create a folder others within Analysis to store the output and error messages from the R and batch scripts.

  6. Upload the folder Analysis to the compute cluster following the section Upload Files (a sketch of the expected local folder layout appears after this list).

  7. Download the Singularity image to the folder Analysis as shown in the section Singularity file. Note that once it is downloaded, you only need to copy it into any new analysis folder the next time you use it.

  8. Run the analysis on the compute cluster by following the section Run in Compute Cluster, where you can also check the status of your jobs.

  9. After all the jobs finish, fetch the results to your local folder and visualize them. The section Fetch Results and Visualizations shows how to gather the results and do basic data cleaning before visualization.

  10. See the section Notes for additional remarks.

  11. If you want to apply prml_tests_f() instead of prml_tests(), you only need to replace the R script with the one shown in the section Another R Script Example.
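
For reference, after steps 1-5 your local Analysis folder should contain roughly the following. This is only a sketch: the number of filenames_NUMBER.txt parts depends on n_part, and PATH_TO_ANALYSIS is a placeholder for your own path.

```r
# Sketch of the expected local layout of `Analysis` before uploading:
#   Data/JA/                               paired raw .txt files
#   filenames.txt                          all paired file names
#   filenames_1.txt ... filenames_3.txt    one part each (n_part = 3 here)
#   prml_analysis.R                        R script (Parallel Version)
#   prml_analysis.sh                       batch script
#   others/                                SLURM .out/.err files will land here
list.files("PATH_TO_ANALYSIS", recursive = TRUE)
```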

## Preparation

Make sure you have all the paired raw data files in the folder ./Data/JA. The following code first creates a filenames.txt file listing all the paired file names, then splits it into n_part parts.

library("dplyr")
library("purrr")
library("stringr")

# Create a `filenames.txt` file, extracting all the paired file names from folder `./Data/JA`. 

filenames <- list.files(path = "./Data/JA",pattern = "DoubleSound\\.txt")%>%
  str_split(.,".txt",simplify = TRUE)%>%
  .[,1]
write(filenames,file = "filenames.txt")

#split the file names into `n_part` parts.

#fnames -- the file name stems listed in filenames.txt
fnames <- scan("filenames.txt", "a")
n_part <- 3
flist <- split(fnames, seq_along(fnames)%%n_part+1)
map(1:length(flist),~write(flist[[.x]],file = paste0("filenames_",.x,".txt")))
```
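
As a quick optional sanity check (assuming you just ran the code above in the Analysis folder), you can read the parts back and confirm that nothing was lost in the split:

```r
# Read the split files back and confirm they cover all file names exactly once.
part_files <- list.files(pattern = "^filenames_[0-9]+\\.txt$")
parts <- unlist(lapply(part_files, scan, what = "a", quiet = TRUE))
length(parts) == length(fnames)      # TRUE if nothing was lost
setequal(parts, fnames)              # TRUE if the same names are covered
```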

## R Script (Parallel Version)

Write an R script named prml_analysis.R.

Relative to the Real Data Analysis article, I parallelize the nested for loop in prml.from.fname over all possible triplets using n_parallel worker processes (parallel::mclapply()). I also split filenames.txt into parts named filenames_NUMBER.txt (indexed by the file_number argument) so that each part can run as a separate array job on the compute cluster.
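
As a toy illustration of this pattern (the numbers below are made up and not part of the analysis), the nested loop over alternative frequencies and positions becomes one parallel::mclapply() call over the rows of an expand.grid() table, with mc.cores playing the role of n_parallel:

```r
library(parallel)

alt.freq <- c(500, 1100)          # hypothetical alternative frequencies
alt.pos  <- c(-24, -6, 6, 24)     # the four speaker positions used below
df <- expand.grid(alt.freq, alt.pos)

# Each row of `df` is one candidate triplet; mclapply() distributes the rows
# over mc.cores worker processes.
res <- mclapply(seq_len(nrow(df)),
                function(x) c(df[x, 1], df[x, 2], df[x, 1] * df[x, 2]),
                mc.cores = 2)
length(res)   # one result per (frequency, position) pair
```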

The prml_analysis.R script takes two command-line arguments: n_parallel (the number of cores used by mclapply()) and file_number (the index of the filenames_NUMBER.txt part to process).

```r
{
  args = commandArgs(trailingOnly = TRUE)
  stopifnot(length(args) == 2)

  n_parallel = as.integer(args[1])
  file_number = args[2]

  suppressMessages(library("statmod"))
  suppressMessages(library("dplyr"))
  suppressMessages(library("purrr"))
  suppressMessages(library("prml"))



  # prml.from.tri

  prml.from.tri <- function(trials,
                            spiketimes,
                            frq = c(1100, 742),
                            pos = c(24,-6),
                            on.reward = TRUE,
                            start.time = 0,
                            end.time = 600,
                            match.level = FALSE,
                            AB.eqlevel = FALSE,
                            go.by.soff = FALSE,
                            remove.zeros = FALSE,
                            ...) {
    attach(trials)
    attach(spiketimes)

    timestamps <- split(TIMES, TRIAL2)
    ntrials <- length(timestamps)
    trial.id <- as.numeric(names(timestamps))

    ix1 <- TASKID == 8 & A_FREQ == frq[1] & XA == pos[1]
    ix2 <- TASKID == 8 & A_FREQ == frq[2] & XA == pos[2]
    ix3 <-
      TASKID == 12 &
      (A_FREQ == frq[1] &
         B_FREQ == frq[2] &
         XA == pos[1] &
         XB == pos[2]) |
      (A_FREQ == frq[2] & B_FREQ == frq[1] & XA == pos[2] & XB == pos[1])

    if (on.reward) {
      ix1 <- ix1 & REWARD == 1
      ix2 <- ix2 & REWARD == 1
      ix3 <- ix3 & REWARD == 1
    }

    blev <- sort(unique(B_LEVEL[ix3]))
    targ.lev <- blev[blev > 0]
    lev <- "*"
    if (match.level) {
      if (length(targ.lev) > 1) {
        targ.lev <- max(targ.lev)
        warning("Multiple single sound levels, choosing largest one")
      }
      ix1 <- ix1 & A_LEVEL == targ.lev
      ix2 <- ix2 & A_LEVEL == targ.lev
      lev <- as.character(targ.lev)
    }

    if (AB.eqlevel)
      ix3 <- ix3 & (A_LEVEL == B_LEVEL)

    sing1 <- trials[ix1, 1]
    sing2 <- trials[ix2, 1]
    duplx <- trials[ix3, 1]
    success <- REWARD[ix3]

    if (go.by.soff)
      end.time <- min(SOFF[ix1 | ix2 | ix3])

    spike.counter <- function(jj) {
      jj1 <- match(jj, trial.id)
      spks <- timestamps[[jj1]]
      return(sum(spks > start.time & spks < end.time))
    }

    Acounts <- sapply(sing1, spike.counter)
    Bcounts <- sapply(sing2, spike.counter)
    ABcounts <- sapply(duplx, spike.counter)

    detach(trials)
    detach(spiketimes)

    if ((mean(Acounts) == 0) | (mean(Bcounts) == 0) |
        (mean(ABcounts) == 0)) {
      stop("All the spike counts are 0 under a specific condition.")
    }

    if ((length(Acounts) == 1) |
        (length(Bcounts) == 1) | (length(ABcounts) == 1)) {
      stop("Only 1 available trial under a specific condition.")
    }

    s1 <- paste(frq[1], "Hz ", pos[1], "deg ", lev, "Db ", sep = "")
    s2 <- paste(frq[2], "Hz ", pos[2], "deg ", lev, "Db ", sep = "")
    dp <- paste("Duplex: ", lev, "Db ", sep = "")

    res = prml_tests(
      xA = Acounts,
      xB = Bcounts,
      xAB = ABcounts,
      labels = c(s1, s2, dp),
      e = 0,
      remove.zeros = remove.zeros
    )
    # By default:
    #remove.zeros = FALSE
    #mu_l="min",mu_u="max",gamma.pars = c(0.5, 2e-10),
    #n_gq = 20, n_per = 100, alpha = 0.5

    return(res)
  }

  # prml.from.fname

  prml.from.fname <-
    function(fname,
             data.path = "Data",
             on.reward = TRUE,
             match.level = FALSE,
             AB.eqlevel = FALSE,
             outfile = "",
             start = 0,
             end = 600,
             remove.zeros = FALSE,
             mccores = 4) {
      infile1 <- paste(data.path, "/JA/", fname, ".txt", sep = "")

      trials <-
        read.table(
          infile1,
          col.names = c(
            "TRIAL",
            "TASKID",
            "A_FREQ",
            "B_FREQ",
            "XA",
            "XB",
            "REWARD",
            "A_LEVEL",
            "B_LEVEL"
          )
        )

      infile2 <-
        paste(data.path, "/JA/", fname, "_spiketimes.txt", sep = "")

      spiketimes <-
        read.table(infile2, col.names = c("TRIAL2", "TIMES"))

      FREQS <- unique(c(trials$A_FREQ, trials$B_FREQ))
      alt.freq <- sort(FREQS[FREQS > 0 & FREQS != 742])
      alt.pos <- c(-24,-6, 6, 24)

      df <- expand.grid(alt.freq, alt.pos)
      res <- parallel::mclapply(1:nrow(df),
                                function(x) {
                                  try({
                                    lbf <-
                                      prml.from.tri(
                                        trials,
                                        spiketimes,
                                        c(df[x, 1], 742),
                                        c(df[x, 2],-144 / df[x, 2]),
                                        on.reward,
                                        start,
                                        end,
                                        match.level,
                                        AB.eqlevel,
                                        go.by.soff = FALSE,
                                        remove.zeros = remove.zeros
                                      ) %>% unlist(.)

                                    c(df[x, 1], df[x, 2], lbf)
                                  })
                                },
                                mc.cores = mccores)
      lapply(res, function(x) {
        cat(fname, x, "\n", file = outfile, append = TRUE)
      })
    }

  set.seed(123)
  fnames <- scan(paste0("filenames_", file_number, ".txt"), "a") 

  # for loop across all the file.
  start = Sys.time()

  for (file.name in fnames) {
    #"Data" is the folder you store all the .txt files
    #"result.txt" is the folder you store the result
    try(prml.from.fname(
      file.name,
      data.path = "Data",
      on.reward = TRUE,
      match.level = FALSE,
      AB.eqlevel = FALSE,
      outfile = paste0("result_", file_number, ".txt"),
      start = 0,
      end = 600,
      remove.zeros = FALSE,
      mccores = n_parallel
    ))
  }
  end = Sys.time()

  cat("Run time: ", difftime(end, start, units="secs"), "s\n", sep="")
}
```
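
Before submitting to the cluster, you may want a quick interactive test on a single recording. A minimal sketch, assuming the prml package is installed locally, you have run the function definitions above in your session, and filenames_1.txt is present; result_test.txt is just a hypothetical output file name:

```r
# Interactive dry run on the first file of part 1, using 2 cores.
suppressMessages(library("dplyr"))
suppressMessages(library("prml"))

fnames <- scan("filenames_1.txt", "a")
prml.from.fname(fnames[1],
                data.path = "Data",
                outfile   = "result_test.txt",
                mccores   = 2)
readLines("result_test.txt")
```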

## Batch Script

Create a .sh file named prml_analysis.sh.

If you create the file from the command line, use vim to create and edit it; press Esc to leave insert mode, then press Shift+ZZ to save and exit.

```{bash,eval=FALSE,echo=TRUE}
cd PATH_TO_ANALYSIS
vim prml_analysis.sh
```

If you use RStudio to create the file:

File -> New File -> Text File -> Save -> Change the name to `prml_analysis.sh`.

Copy the following into the .sh file. Note:

- `-c8` should match the `n_parallel` argument of the R script, i.e., the first number after `prml_analysis.R` in the `singularity run` line (8 here).
- `--array=1-3` should match `n_part` from the `Preparation` section, i.e., the number of `filenames_NUMBER.txt` files you created (see the check after the script below).


```{bash,eval=FALSE,echo=TRUE}
#!/bin/bash
#SBATCH --partition=common
#SBATCH --output=others/filenames_%a.out
#SBATCH --error=others/filenames_%a.err
#SBATCH -c8
#SBATCH --array=1-3
singularity run -B $(pwd):/work prml.simg prml_analysis.R 8 $SLURM_ARRAY_TASK_ID
```
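
If you are not sure how many parts you created (and hence what range to put in `--array=`), a quick check from R, run in the Analysis folder, is:

```r
# Count the filenames_NUMBER.txt parts; use 1-<this number> in --array=
length(list.files(pattern = "^filenames_[0-9]+\\.txt$"))
```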

## Upload Files

  1. Open a new terminal (terminal 1).

  2. Log in to the compute cluster.

  3. Create a folder named Analysis.

```{bash,eval=FALSE,echo=TRUE}
ssh ACCOUNT@dcc-slogin.oit.duke.edu
mkdir Analysis
```

  4. Open a second terminal (terminal 2).

- Copy the path to the local folder `Analysis`.
- Go to the folder `Analysis`.
- Copy all the files within the local folder `Analysis` to the folder `Analysis` on the compute cluster.

```{bash,eval=FALSE,echo=TRUE}
cd PATH_TO_ANALYSIS
rsync -av * ACCOUNT@dcc-slogin.oit.duke.edu:./Analysis
```

  5. Check your files.

In terminal 1, go to the folder Analysis to see the list of files.

```{bash,eval=FALSE,echo=TRUE}
cd Analysis/
ls
```

## Singularity file

I built a Docker image providing the environment needed to run the `prml_analysis.R` script. For details on the Singularity file, please refer to the article [Singularity Environment Building](https://yunranchen.github.io/prml/articles/singularity.html).

Here you only need to pull it from Docker Hub:

- Log in to the compute cluster.
- Go to the folder named `Analysis`.
- Download the Singularity image: copy the following line and run it on the compute cluster.

```{bash,eval=FALSE,echo=TRUE}
singularity pull docker://yunran/prml
```

## Run in Compute Cluster

Preparation: make sure your folder Analysis contains prml_analysis.R, prml_analysis.sh, prml.simg, the filenames_NUMBER.txt files, and the folders others and Data.

- Submit the jobs.

```{bash,eval=FALSE,echo=TRUE}
sbatch prml_analysis.sh
```

- Check the status

```{bash,eval=FALSE,echo=TRUE}
squeue -u ACCOUNT
```

## Fetch Results and Visualizations

Assume you create a folder named Results to store all the results of the PRML analysis.

```{bash,eval=FALSE,echo=TRUE}
cd PATH_TO_RESULTS
```

- Fetch the results from the compute cluster back to your local folder `Results`.

```{bash,eval=FALSE,echo=TRUE}
rsync -av ACCOUNT@dcc-slogin.oit.duke.edu:./Analysis/result_*.txt PATH_TO_RESULTS
```

```{bash,eval=FALSE,echo=TRUE}
ls
```

- R code to clean the data, store it in a `.RData` file, and visualize it:

```r
library("tidyverse")

filelist = list.files(pattern = ".*.txt")
raw = map_df(filelist, function(x) {
  read.table(
    x,
    header = FALSE,
    sep = " ",
    stringsAsFactors = FALSE,
    fill = TRUE
  ) %>%
    filter(!V2 %in% c("Error", ""))
}) %>%
  mutate(e = 0) %>% #plug in the value of e you set in prml_tests()
  select(-V15)
names(raw) = c(
  "CellId",
  "AltFreq",
  "AltPos",
  "SepBF",
  "PrMix",
  "PrInt",
  "PrOut",
  "PrSing",
  "WinModels",
  "Pval1",
  "Pval2",
  "SampSizeA",
  "SampSizeB",
  "SampSizeAB",
  "e"
)
raw = raw %>% mutate(
  SepBF = SepBF %>% as.numeric(),
  PrMix = PrMix %>% as.numeric(),
  PrInt = PrInt %>% as.numeric(),
  PrOut = PrOut %>% as.numeric(),
  PrSing = PrSing %>% as.numeric(),
  WinModels = WinModels %>% as.numeric(),
  Pval1 = Pval1 %>% as.numeric(),
  Pval2 = Pval2 %>% as.numeric(),
  SampSizeA = SampSizeA %>% as.numeric(),
  SampSizeB = SampSizeB %>% as.numeric(),
  SampSizeAB = SampSizeAB %>% as.numeric()
)
model.names = c("Mixture", "Intermediate", "Outside", "Single")
raw = raw %>% mutate(cid = paste(CellId, AltFreq, AltPos, sep = "."))
#filter out data that has errors
cids = raw %>% filter(!WinModels %in% c(1:4)) %>% pull(cid)
rawf = raw %>% filter(!cid %in% cids) %>%
  mutate(WinModel = model.names[WinModels],
         WinPr = pmax(PrMix, PrInt, PrOut, PrSing))

#triplets not filtered by PRML filter
tri_unfilter=rawf%>%
  filter(SampSizeA>=5,SampSizeB>=5,SampSizeAB>=5,SepBF>=3)%>% #well-separated and enough sample size
  mutate(WinProb=cut(WinPr, breaks=c(0,0.25, 0.5, 0.95, 1.01),include.lowest = TRUE)) #cut the winning probabilities to four categories


#triplets filtered by PRML filter
tri_filter=tri_unfilter %>% 
  mutate(invPval1=1/Pval1,invPval2=1/Pval2)%>%
  filter(invPval1<=20,invPval2<=20)%>% #filter by PRML filter.
  dplyr::select(-invPval1,-invPval2)

tri=rbind(tri_unfilter,tri_filter)%>%
  mutate(group=rep(c("unfiltered","filtered"),
                   c(nrow(tri_unfilter),nrow(tri_filter))))%>%
  mutate(WinModel=factor(WinModel,levels = c("Outside","Single","Intermediate","Mixture")))

#save as a .RData
save(tri,file = "tri_filter_unfilter.RData")
library("ggplot2")

ggplot(data = tri,mapping = aes(x = WinModel,color=WinProb,fill=WinProb))+
  geom_bar(stat="count",position = "stack")+
  facet_grid(~group)+
  geom_text(stat='count', aes(label=..count..), position = position_stack(vjust = 0.5),size=3,color="black")+
  scale_x_discrete(drop=FALSE) +
  scale_colour_grey(start = 0.9,end=0.3,drop=FALSE) + #if you use grey color
  scale_fill_grey(start = 0.9,end=0.3,drop=FALSE) +
  theme_bw()
ggsave(filename = "plot.pdf",width = 8,height = 4)
```
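
Later you can reload the cleaned table without repeating the cleaning step, for example:

```r
# Reload the cleaned triplet table saved above; this restores the object `tri`.
load("tri_filter_unfilter.RData")
dplyr::glimpse(tri)
table(tri$group, tri$WinModel)
```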

## Notes

## Another R Script Example

This R script applies prml_tests_f() instead of prml_tests(). Besides result_NUMBER.txt, it also writes mix_result_NUMBER.txt and int_out_result_NUMBER.txt, which are used for the per-triplet plots in the section below.

```r
{
  args = commandArgs(trailingOnly = TRUE)
  stopifnot(length(args) == 2)

  n_parallel = as.integer(args[1])
  file_number = args[2]

  suppressMessages(library("statmod"))
  suppressMessages(library("dplyr"))
  suppressMessages(library("purrr"))
  suppressMessages(library("prml"))

  prml.from.tri.f <- function(trials,
                              spiketimes,
                              frq = c(1100, 742),
                              pos = c(24,-6),
                              on.reward = TRUE,
                              start.time = 0,
                              end.time = 600,
                              match.level = FALSE,
                              AB.eqlevel = FALSE,
                              go.by.soff = FALSE,
                              remove.zeros = FALSE,
                              ...) {
    attach(trials)
    attach(spiketimes)

    timestamps <- split(TIMES, TRIAL2)
    ntrials <- length(timestamps)
    trial.id <-
      as.numeric(names(timestamps)) ## same as unique(TRIAL2)

    ix1 <- TASKID == 8 & A_FREQ == frq[1] & XA == pos[1]
    ix2 <- TASKID == 8 & A_FREQ == frq[2] & XA == pos[2]
    ix3 <-
      TASKID == 12 &
      (A_FREQ == frq[1] &
         B_FREQ == frq[2] &
         XA == pos[1] &
         XB == pos[2]) |
      (A_FREQ == frq[2] & B_FREQ == frq[1] & XA == pos[2] & XB == pos[1])

    if (on.reward) {
      ix1 <- ix1 & REWARD == 1
      ix2 <- ix2 & REWARD == 1
      ix3 <- ix3 & REWARD == 1
    }

    blev <- sort(unique(B_LEVEL[ix3]))
    targ.lev <- blev[blev > 0]
    lev <- "*"
    if (match.level) {
      if (length(targ.lev) > 1) {
        targ.lev <- max(targ.lev)
        warning("Multiple single sound levels, choosing largest one")
      }
      ix1 <- ix1 & A_LEVEL == targ.lev
      ix2 <- ix2 & A_LEVEL == targ.lev
      lev <- as.character(targ.lev)
    }

    if (AB.eqlevel)
      ix3 <- ix3 & (A_LEVEL == B_LEVEL)

    sing1 <- trials[ix1, 1]
    sing2 <- trials[ix2, 1]
    duplx <- trials[ix3, 1]
    success <- REWARD[ix3]

    if (go.by.soff)
      end.time <- min(SOFF[ix1 | ix2 | ix3])

    spike.counter <- function(jj) {
      jj1 <- match(jj, trial.id)
      spks <- timestamps[[jj1]]
      return(sum(spks > start.time & spks < end.time))
    }

    Acounts <- sapply(sing1, spike.counter)
    Bcounts <- sapply(sing2, spike.counter)
    ABcounts <- sapply(duplx, spike.counter)

    detach(trials)
    detach(spiketimes)

    if ((mean(Acounts) == 0) | (mean(Bcounts) == 0) |
        (mean(ABcounts) == 0)) {
      stop("All the spike counts are 0 under a specific condition.")
    }

    if ((length(Acounts) == 1) |
        (length(Bcounts) == 1) | (length(ABcounts) == 1)) {
      stop("Only 1 available trial under a specific condition.")
    }

    s1 <- paste(frq[1], "Hz ", pos[1], "deg ", lev, "Db ", sep = "")
    s2 <- paste(frq[2], "Hz ", pos[2], "deg ", lev, "Db ", sep = "")
    dp <- paste("Duplex: ", lev, "Db ", sep = "")
    res = prml_tests_f(
      xA = Acounts,
      xB = Bcounts,
      xAB = ABcounts,
      labels = c(s1, s2, dp),
      e = 0,
      remove.zeros = remove.zeros
    )
    # By default:
    #remove.zeros = FALSE
    #mu_l="min",mu_u="max",gamma.pars = c(0.5, 2e-10),
    #n_gq = 20, n_mu = 100, n_per = 100, alpha = 0.5

    return(res)
  }


  prml.from.fname.f <-
    function(fname,
             data.path = "Data",
             on.reward = TRUE,
             match.level = FALSE,
             AB.eqlevel = FALSE,
             outfile = "",
             start = 0,
             end = 600,
             remove.zeros = FALSE,
             mccores = 4) {
      infile1 <- paste(data.path, "/JA/", fname, ".txt", sep = "")

      trials <-
        read.table(
          infile1,
          col.names = c(
            "TRIAL",
            "TASKID",
            "A_FREQ",
            "B_FREQ",
            "XA",
            "XB",
            "REWARD",
            "A_LEVEL",
            "B_LEVEL"
          )
        )

      infile2 <-
        paste(data.path, "/JA/", fname, "_spiketimes.txt", sep = "")
      spiketimes <-
        read.table(infile2, col.names = c("TRIAL2", "TIMES"))

      FREQS <- unique(c(trials$A_FREQ, trials$B_FREQ))
      alt.freq <- sort(FREQS[FREQS > 0 & FREQS != 742])
      alt.pos <- c(-24,-6, 6, 24)

      df <- expand.grid(alt.freq, alt.pos)
      res <- parallel::mclapply(1:nrow(df),
                                function(x) {
                                  try({
                                    lbf <-
                                      prml.from.tri.f(
                                        trials,
                                        spiketimes,
                                        c(df[x, 1], 742),
                                        c(df[x, 2],-144 / df[x, 2]),
                                        on.reward,
                                        start,
                                        end,
                                        match.level,
                                        AB.eqlevel,
                                        go.by.soff = FALSE,
                                        remove.zeros = remove.zeros
                                      )
                                    cla <- c(df[x, 1], df[x, 2], unlist(lbf$out1))
                                    mix <- c(df[x, 1], df[x, 2], unlist(lbf$out2$mix_pf))
                                    int <- c(df[x, 1], df[x, 2], unlist(lbf$out2$int_pf))
                                    outA <- c(df[x, 1], df[x, 2], unlist(lbf$out2$outA_pf))
                                    outB <- c(df[x, 1], df[x, 2], unlist(lbf$out2$outB_pf))
                                    list(cla, mix, int, outA, outB)
                                  })
                                },
                                mc.cores = mccores)
      lapply(res, function(x) {
        try({
          cat(fname, x[[1]], "\n", file = outfile, append = TRUE)
          cat("mix",
              fname,
              x[[2]],
              "\n",
              file = paste0("mix_", outfile),
              append = TRUE)
          cat(
            "int",
            fname,
            x[[3]],
            "\n",
            file = paste0("int_out_", outfile),
            append = TRUE
          )
          cat(
            "outA",
            fname,
            x[[4]],
            "\n",
            file = paste0("int_out_", outfile),
            append = TRUE
          )
          cat(
            "outB",
            fname,
            x[[5]],
            "\n",
            file = paste0("int_out_", outfile),
            append = TRUE
          )
        })
      })

    }

  set.seed(123)
  fnames <- scan(paste0("filenames_", file_number, ".txt"), "a")

  start <- Sys.time()

  for (file.name in fnames) {
    #"Data" is the folder you store all the .txt files
    #"result.txt" is the folder you store the result
    try(prml.from.fname.f(
      file.name,
      data.path = "Data",
      on.reward = TRUE,
      match.level = FALSE,
      AB.eqlevel = FALSE,
      outfile = paste0("result_", file_number, ".txt"),
      start = 0,
      end = 600,
      remove.zeros = FALSE,
      mccores = n_parallel
    ))
  }

  end <- Sys.time()

  cat("Run time: ", difftime(end, start, units = "secs"), "s\n", sep = "")
}
```

## Fetch Results and Visualizations

In your local folder Results, fetch the results from the compute cluster.

```{bash,eval=FALSE,echo=TRUE}
rsync -av ACCOUNT@dcc-slogin.oit.duke.edu:./Analysis/*result_*.txt PATH_TO_RESULTS
```

Data cleaning and visualization:

- Obtain the file lists for all the results.

```r
library("tidyverse")

filelist_cla = list.files(pattern = "^result_*")
filelist_mix = list.files(pattern = "mix_result_*")
filelist_intout = list.files(pattern = "int_out_result_*")
raw = map_dfr(filelist_cla, function(x) {
  read.table(
    x,
    header = FALSE,
    sep = " ",
    stringsAsFactors = FALSE,
    fill = TRUE
  ) %>%
    filter(!V2 %in% c("Error", ""))
}) %>%
  mutate(e = 0) %>% #plug in the value of e you set in prml_tests()
  select(-V15)
names(raw) = c(
  "CellId",
  "AltFreq",
  "AltPos",
  "SepBF",
  "PrMix",
  "PrInt",
  "PrOut",
  "PrSing",
  "WinModels",
  "Pval1",
  "Pval2",
  "SampSizeA",
  "SampSizeB",
  "SampSizeAB",
  "e"
)
raw = raw %>% mutate(
  SepBF = SepBF %>% as.numeric(),
  PrMix = PrMix %>% as.numeric(),
  PrInt = PrInt %>% as.numeric(),
  PrOut = PrOut %>% as.numeric(),
  PrSing = PrSing %>% as.numeric(),
  WinModels = WinModels %>% as.numeric(),
  Pval1 = Pval1 %>% as.numeric(),
  Pval2 = Pval2 %>% as.numeric(),
  SampSizeA = SampSizeA %>% as.numeric(),
  SampSizeB = SampSizeB %>% as.numeric(),
  SampSizeAB = SampSizeAB %>% as.numeric()
)
model.names = c("Mixture", "Intermediate", "Outside", "Single")
raw = raw %>% mutate(cid = paste(CellId, AltFreq, AltPos, sep = "."))
#filter out data that has errors
cids = raw %>% filter(!WinModels %in% c(1:4)) %>% pull(cid)
rawf = raw %>% filter(!cid %in% cids) %>%
  mutate(WinModel = model.names[WinModels],
         WinPr = pmax(PrMix, PrInt, PrOut, PrSing))

#triplets not filtered by PRML filter
tri_unfilter=rawf%>%
  filter(SampSizeA>=5,SampSizeB>=5,SampSizeAB>=5,SepBF>=3)%>% #well-separated and enough sample size
  mutate(WinProb=cut(WinPr, breaks=c(0,0.25, 0.5, 0.95, 1.01),include.lowest = TRUE)) #cut the winning probabilities to four categories


#triplets filtered by PRML filter
tri_filter=tri_unfilter %>% 
  mutate(invPval1=1/Pval1,invPval2=1/Pval2)%>%
  filter(invPval1<=20,invPval2<=20)%>% #filter by PRML filter.
  dplyr::select(-invPval1,-invPval2)

tri=rbind(tri_unfilter,tri_filter)%>%
  mutate(group=rep(c("unfiltered","filtered"),
                   c(nrow(tri_unfilter),nrow(tri_filter))))%>%
  mutate(WinModel=factor(WinModel,levels = c("Outside","Single","Intermediate","Mixture")))

#save as a .RData
save(tri,file = "tri_filter_unfilter.RData")
library("ggplot2")

ggplot(data = tri,mapping = aes(x = WinModel,color=WinProb,fill=WinProb))+
  geom_bar(stat="count",position = "stack")+
  facet_grid(~group)+
  geom_text(stat='count', aes(label=..count..), position = position_stack(vjust = 0.5),size=3,color="black")+
  scale_x_discrete(drop=FALSE) +
  scale_colour_grey(start = 0.9,end=0.3,drop=FALSE) + #if you use grey color
  scale_fill_grey(start = 0.9,end=0.3,drop=FALSE) +
  theme_bw()
ggsave(filename = "plot.pdf",width = 8,height = 4)
e_=0 # the value of e 


filelist_intout = list.files(pattern = "int_out_result_*")
int = map_dfr(filelist_intout, function(x) {
  read.table(
    x,
    header = FALSE,
    sep = " ",
    stringsAsFactors = FALSE,
    fill = TRUE
  )}) %>%
  mutate(e=e_) %>%
  mutate(cid=paste(V2,V3,V4,sep = "."))

mix = map_dfr(filelist_mix, function(x) {
  read.table(
    x,
    header = FALSE,
    sep = " ",
    stringsAsFactors = FALSE,
    fill = TRUE
  )}) %>%
  mutate(e=e_,V5=e_+(1-2*e_)*V5,V6=e_+(1-2*e_)*V6)%>%
  mutate(cid=paste(V2,V3,V4,sep = ".")) 
# Here I draw the filtered triplets.

dfres=tri%>%
  filter(group=="filtered")

ids=dfres%>%pull(cid)

pdf(height = 3, width = 5, file = "plot_pdf.pdf")
for (i in ids){
  try({
    int0=int%>%filter(cid==i,V1=="int")%>%.[,5:104]%>%t()
    df1=data.frame(ind=seq(e_,1-e_,length.out = 100),y=int0,e=rep(e_,100),model=rep("int",100))
    mix0=mix%>%filter(cid==i)%>%.[,c(5,6)]%>%t()
    df2=data.frame(ind=c(0,1),y=mix0,e=rep(0,2))
    p=ggplot(data = df1,mapping = aes(x=ind,y=y))+
      geom_line()+
      #facet_grid(.~e)+
      geom_point(data=df2,mapping = aes(x=ind,y=y))+
      geom_hline(yintercept = e_,linetype="dashed")+
      geom_hline(yintercept = 1-e_,linetype="dashed")+
      geom_vline(xintercept = e_,linetype="dashed")+
      geom_vline(xintercept = 1-e_,linetype="dashed")+
      ggtitle(paste0(i,";"),
              paste0(dfres%>%filter(cid==i)%>%pull(WinModel),
                     ";WinProb:",
                     dfres%>%filter(cid==i)%>%pull(WinPr)%>%round(.,4),
                     ";SizeA:",
                     dfres%>%filter(cid==i)%>%pull(SampSizeA),
                     ";SizeB:",
                     dfres%>%filter(cid==i)%>%pull(SampSizeB),
                     ";SizeAB:",
                     dfres%>%filter(cid==i)%>%pull(SampSizeAB)
              ))+
      theme_bw()+
      xlab("")
    print(p)
  })
}
dev.off()
```

