R/parseSimulationOutput.R

Defines functions parseSimulationOutput easySplit extractLine

Documented in parseSimulationOutput

# To extract the next line from the file
extractLine <- function(in_file) {
  readLines(in_file, n=1)
}

easySplit <- function(line, splitter) {
  strsplit(line, splitter)[[1]]
}

#' Input data from CPP output into a list of data frames
#' @param filename The filename of the output generated by the simulation
#' @return A list containing
#' \describe{
#' \item{params}{the parameters used to run the simulation}
#' \item{sequences}{a data frame of the sequences over time and their similarity
#' to the initial sequence
#' \itemize{
#' \item step - timestep in the simulation
#' \item realTime - time since the start of the simulation (in millions of years)
#' \item sequenceId - the unique ID of the sequence (to track it over time);
#'   initial sequence has sequenceId 0 to (numInitialCopies-1)
#' \item parentMain - the unique ID of the sequence this burst from;
#'                (-1 if nothing)
#' \item parentOther - the unique ID of the sequence its parent recombined with;
#'                 (-1 if nothing)
#' \item distanceToInitial - the distance to the initial sequence
#' \item isActive - whether or not the sequence is capable of transposition
#' }}
#' \item{pairwise}{a data frame of the pairwise similarity of sequences over time
#' \itemize{
#' \item step - timestep in the simulation
#' \item realTime - time since the start of the simulation (in millions of years)
#' \item sequenceId1 - an ID of a sequence present at the time
#' \item sequenceId2 - an ID of a different sequence present at the time; not all pairs
#'                 are given - that is, for sequences a and b, either (a, b) or (b, a)
#'                 is present as a row but not both
#' \item distancePairwise - the distance between the two sequences
#' }}
#' \item{familyRepresentatives}{a data frame of the family representatives
#'     for the retrotransposons over time
#' \itemize{
#' \item step - timestep in the simulation
#' \item realTime - time since the start of the simulation (in millions of years)
#' \item familyId - the unique ID of the family representative (to track it over time)
#' \item creationTime - time this family representative was created (in millions of years)
#' \item sequenceId - a sequence belonging to that family (sequences belonging to a
#'                family are listed as separate rows, and all sequences belonging
#'                to that family are listed)
#' }}
#' \item{familyPairwise}{a data frame of the pairwise similarity of the
#'     family representatives for the retrotransposons over time
#' \itemize{
#' \item step - timestep in the simulation
#' \item realTime - time since the start of the simulation (in millions of years)
#' \item familyId1 - an ID of a family representative present at the time
#' \item familyId2 - an ID of a different family representative present at the time;
#'               not all pairs are given - that is, for sequences a and b,
#'               either (a, b) or (b, a) is present as a row but not both
#' \item distancePairwise - the distance between the two family representatives
#' }}
#' }
#' @examples
#' \dontrun{
#' data <- parseSimulationOutput('simulationOutput.out')
#' }
#' @export
parseSimulationOutput <- function(filename)
{
  # TODO: Return a good error message if the file cannot be parsed
  data <- list()
  data$params <- list()
  data$sequences <- data.frame()
  data$pairwise <- data.frame()
  data$familyRepresentatives <- data.frame()
  data$familyPairwise <- data.frame()

  con = file(filename, "r")

  # basic variables
  t <- -1
  realTime <- -1
  tags <- NULL

  while (TRUE)
  {
    nextLine <- extractLine(con)

    if (length(nextLine) == 0) {
      break # EOF
    }
    if (nextLine == "Param<")
    {
      while(TRUE) {
        nextLine <- extractLine(con)
        if(nextLine == ">Param") { break }

        splitLine <- easySplit(nextLine, ":")
        param <- splitLine[1]
        value <- splitLine[2]
        if (param %in% c('SequenceParams_initialSequence',
                         'MutationParams_model',
                         'OutputParams_outputFileName'
                         )) {
          data$params[[param]] <- value
        }
        else if (param %in% c('SeedParams_toSeed')) {
          data$params[[param]] <- as.logical(value)
        }
        else {
          data$params[[param]] <- as.numeric(value)
        }
      }
    }
    else if (nextLine == "Init<")
    {
      timestep <- as.numeric(trimws(extractLine(con), "left", whitespace = "@"))
      numSeq <- as.numeric(trimws(extractLine(con), "left", whitespace = "!"))

      seqIds <- rep(NA_integer_, numSeq)
      parMains <- rep(NA_integer_, numSeq)
      parOthers <- rep(NA_integer_, numSeq)
      initDists <- rep(NA_integer_, numSeq)
      actives <- rep(NA_integer_, numSeq)
      i <- 1
      while (i <= numSeq) {
        nextLine <- extractLine(con)
        splitLine <- easySplit(nextLine, ":")
        seqIds[i] <- splitLine[1]
        parMains[i] <- splitLine[2]
        parOthers[i] <- splitLine[3]
        initDists[i] <- splitLine[4]
        actives[i] <- splitLine[5]
        i <- i+1
      }
      steps <- rep(timestep, numSeq)
      realTimes = steps * data$params$SimulationParams_timePerStep
      data$sequences <- rbind(data$sequences,
        data.frame(step = steps, realTime = realTimes,
                   sequenceId = as.numeric(seqIds),
                   parentMain = as.numeric(parMains),
                   parentOther = as.numeric(parOthers),
                   distanceToInitial = as.numeric(initDists),
                   isActive = as.logical(actives)
        )
      )
      extractLine(con)
    }
    else if (nextLine == "Pair<")
    {
      timestep <- as.numeric(trimws(extractLine(con), "left", whitespace = "@"))
      numSeq <- as.numeric(trimws(extractLine(con), "left", whitespace = "!"))

      seqId1s <- rep(NA_integer_, numSeq*(numSeq-1)/2)
      seqId2s <- rep(NA_integer_, numSeq*(numSeq-1)/2)
      pairDists <- rep(NA_integer_, numSeq*(numSeq-1)/2)

      i <- 1
      while(TRUE) {
        nextLine <- extractLine(con)
        if(nextLine == ">Pair") {
          if (i > 1) {
            steps <- rep(timestep, i-1)
            realTimes = steps * data$params$SimulationParams_timePerStep
            data$pairwise <- rbind(data$pairwise,
              data.frame(step = steps, realTime = realTimes,
                         sequenceId1 = as.numeric(seqId1s[1:(i-1)]),
                         sequenceId2 = as.numeric(seqId2s[1:(i-1)]),
                         distancePairwise = as.numeric(pairDists[1:(i-1)]))
            )
          }
          break
        }
        splitLine <- easySplit(nextLine, ":")
        seqId1s[i] <- splitLine[1]
        seqId2s[i] <- splitLine[2]
        pairDists[i] <- splitLine[3]
        i <- i+1
      }
    }
    else if (nextLine == "FamTags<")
    {
      timestep <- as.numeric(trimws(extractLine(con), "left", whitespace = "@"))
      numFam <- as.numeric(trimws(extractLine(con), "left", whitespace = "!"))
      numSeq <- as.numeric(trimws(extractLine(con), "left", whitespace = "!"))

      famIds <- rep(NA_integer_, numSeq*numFam)
      seqId2s <- rep(NA_integer_, numSeq*numFam)
      pairDists <- rep(NA_integer_, numSeq*numFam)

      i <- 1
      while (i <= numFam) {
        nextLine <- extractLine(con)
        splitLine <- easySplit(nextLine, ":")
        famId <- splitLine[1]
        creationId <- splitLine[2]
        seqIds <- easySplit(splitLine[3], ",")
        steps <- rep(timestep, length(seqIds))
        data$familyRepresentatives <- rbind(data$familyRepresentatives,
          data.frame(step = steps, realTime = steps*data$params$SimulationParams_timePerStep,
                     familyId = rep(as.numeric(famId), length(seqIds)),
                     creationTime = rep(as.numeric(creationId), length(seqIds)),
                     sequenceId = as.numeric(seqIds)
          )
        )
        i <- i+1
      }
    }
    else if (nextLine == "FamDist<")
    {
      timestep <- as.numeric(trimws(extractLine(con), "left", whitespace = "@"))
      numFam <- as.numeric(trimws(extractLine(con), "left", whitespace = "!"))

      famId1s <- rep(NA_integer_, numFam*(numFam-1)/2)
      famId2s <- rep(NA_integer_, numFam*(numFam-1)/2)
      pairDists <- rep(NA_integer_, numFam*(numFam-1)/2)

      i <- 1
      while(TRUE) {
        nextLine <- extractLine(con)
        if(nextLine == ">FamDist") {
          if (i > 1) {
            steps <- rep(timestep, i-1)
            realTimes = steps * data$params$SimulationParams_timePerStep
            data$familyPairwise <- rbind(data$familyPairwise,
              data.frame(step = steps, realTime = realTimes,
                         familyId1 = as.numeric(famId1s[1:(i-1)]),
                         familyId2 = as.numeric(famId2s[1:(i-1)]),
                         distancePairwise = as.numeric(pairDists[1:(i-1)]))
            )
          }
          break
        }
        splitLine <- easySplit(nextLine, ":")
        famId1s[i] <- splitLine[1]
        famId2s[i] <- splitLine[2]
        pairDists[i] <- splitLine[3]
        i <- i+1
      }

    }
  }

  close(con)
  return(data)
}

Try the retrocombinator package in your browser

Any scripts or data that you put into this service are public.

retrocombinator documentation built on Nov. 12, 2021, 1:07 a.m.