# Compilation and Call Overhead

# Set the knitr chunk options `collapse` and `comment` for this document.
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)
# Register Rtinycc's internal `rtinycc_engine` under the knitr engine name
# "rtinycc".
knitr::knit_engines$set(rtinycc = Rtinycc:::rtinycc_engine)
# Local aliases for the Rtinycc functions used by the helpers below.
tcc_bind <- Rtinycc::tcc_bind
tcc_compile <- Rtinycc::tcc_compile
tcc_ffi <- Rtinycc::tcc_ffi
tcc_source <- Rtinycc::tcc_source
# Flags for the optional packages and for the "profmem" capability of this
# R build; they are combined later to decide whether benchmarks can run.
has_callme <- requireNamespace("callme", quietly = TRUE)
has_bench <- requireNamespace("bench", quietly = TRUE)
has_profmem <- isTRUE(capabilities("profmem"))

```{css, echo=FALSE}
.rtinycc { background-color: #E3F2FD; }
pre.rtinycc span { background-color: #E3F2FD; }
```

This article measures two different costs:

- compilation latency for a tiny module
- call overhead once the code is already compiled

The comparison target is the [`callme`](https://cran.r-project.org/package=callme)
package, which builds ordinary `.Call()` entry points with `R CMD SHLIB`. That
means it goes through the platform compiler toolchain (`gcc`/`clang` on the
usual Unix-like targets), so we should expect stronger optimization than TinyCC
for steady-state machine code. That does not make the comparison useless, but it
does mean the runtime results combine two effects:

- direct `.Call()` entry points and direct R C API allocation in `callme`
- better backend optimization from the system compiler

The point is not that the two packages expose identical APIs. They do not.
Instead, the comparison asks a narrower question:

- how much compile-time latency does in-memory TinyCC avoid?
- what is the extra runtime cost of Rtinycc's generated wrapper layer?
- how much does an extra copy matter when Rtinycc has to convert a returned C
  buffer into an R vector?

## Three Minimal Cases

We use three small workloads:

- `noop()`: takes nothing, returns nothing
- `fill_rand(out, n)`: fills a caller-provided numeric buffer in place
- `rand_unif(n)`: generates `n` random doubles

The `fill_rand()` case is the fairer array-oriented comparison:

- `Rtinycc` receives a `numeric_array`, so the wrapper borrows the backing
  `REAL()` storage of the R vector directly
- `callme` takes an R numeric vector and writes into `REAL(vec)` directly

The `rand_unif()` case intentionally stresses the extra copy path:

- `callme` allocates the final R vector directly with the R C API
- `Rtinycc` returns a heap-allocated `double*`, and the generated wrapper copies
  that buffer into a fresh R numeric vector before freeing the original C
  allocation

```r
#include <R.h>
#include <Rinternals.h>
#include <Rmath.h>
#include <stdlib.h>

/* Takes no arguments, does nothing, and returns nothing. */
void noop(void) {}

/*
 * Write n uniform random draws (from unif_rand()) into the caller-provided
 * buffer `out`, starting at out[0].
 *
 * Raises an R error when n is negative. The buffer length is not checked
 * here; the caller must supply at least n writable doubles.
 */
void fill_rand(double* out, int n) {
  if (n < 0) {
    Rf_error("n must be non-negative");
  }

  double *cursor = out;

  /* unif_rand() calls are bracketed by GetRNGstate()/PutRNGstate(). */
  GetRNGstate();
  for (int remaining = n; remaining > 0; --remaining) {
    *cursor++ = unif_rand();
  }
  PutRNGstate();
}

/*
 * Return a freshly malloc()ed buffer holding n uniform random draws from
 * unif_rand(). Ownership of the buffer passes to the caller, which must
 * release it with free().
 *
 * Raises an R error when n is negative or when the allocation fails.
 * For n == 0 a one-element buffer is returned and the RNG state is not
 * touched.
 */
double* rand_unif(int n) {
  if (n < 0) {
    Rf_error("n must be non-negative");
  }

  /* Always request at least one element so that n == 0 still yields a
     valid pointer, and check the result on every path: previously the
     n == 0 branch returned malloc()'s result unchecked, so a failed
     allocation was handed back as NULL instead of raising an error. */
  size_t count = (n > 0) ? (size_t) n : (size_t) 1;
  double *out = (double*) malloc(sizeof(double) * count);
  if (!out) {
    Rf_error("malloc failed");
  }
  if (n == 0) {
    return out;
  }

  GetRNGstate();
  for (int i = 0; i < n; ++i) {
    out[i] = unif_rand();
  }
  PutRNGstate();
  return out;
}
#include <R.h>
#include <Rinternals.h>
#include <Rmath.h>

/* Takes no arguments and returns R_NilValue without doing any work. */
SEXP noop(void) {
  return R_NilValue;
}

/*
 * Overwrite the first n elements of the numeric vector `out_` with uniform
 * random draws from unif_rand(), writing through REAL(out_), and return
 * `out_` itself.
 *
 * Raises an R error when n is negative, when `out_` is not a REALSXP, or
 * when `out_` has fewer than n elements (checked in that order).
 */
SEXP fill_rand(SEXP out_, SEXP n_) {
  const int count = asInteger(n_);

  if (count < 0) {
    Rf_error("n must be non-negative");
  }
  if (TYPEOF(out_) != REALSXP) {
    Rf_error("out must be a numeric vector");
  }
  if (XLENGTH(out_) < count) {
    Rf_error("out is shorter than n");
  }

  double *cursor = REAL(out_);

  /* unif_rand() calls are bracketed by GetRNGstate()/PutRNGstate(). */
  GetRNGstate();
  for (int remaining = count; remaining > 0; --remaining) {
    *cursor++ = unif_rand();
  }
  PutRNGstate();

  return out_;
}

/*
 * Allocate a numeric vector of length n with allocVector() and fill it with
 * uniform random draws from unif_rand().
 *
 * Raises an R error when n is negative. The result is protected while it is
 * being filled and unprotected just before it is returned.
 */
SEXP rand_unif(SEXP n_) {
  const int count = asInteger(n_);
  if (count < 0) {
    Rf_error("n must be non-negative");
  }

  SEXP result = PROTECT(allocVector(REALSXP, count));
  double *cursor = REAL(result);

  /* unif_rand() calls are bracketed by GetRNGstate()/PutRNGstate(). */
  GetRNGstate();
  for (int remaining = count; remaining > 0; --remaining) {
    *cursor++ = unif_rand();
  }
  PutRNGstate();

  UNPROTECT(1);
  return result;
}
# Build the Rtinycc module used in the comparisons: start an FFI recipe, attach
# the C source held in `rtinycc_code`, bind the three entry points (`noop`,
# `fill_rand`, `rand_unif`), and compile. Returns the value of `tcc_compile()`.
build_rtinycc_module <- function() {
  recipe <- tcc_source(tcc_ffi(), rtinycc_code)
  bound <- tcc_bind(
    recipe,
    noop = list(args = list(), returns = "void"),
    fill_rand = list(args = list("numeric_array", "i32"), returns = "void"),
    rand_unif = list(
      args = list("i32"),
      returns = list(type = "numeric_array", length_arg = 1, free = TRUE)
    )
  )
  tcc_compile(bound)
}

# Compile `callme_code` with callme::compile() and return the result, tagged
# with a "dll_paths" attribute: the file paths of the DLLs whose names start
# with "callme_" and that were not loaded before the compile call.
build_callme_module <- function() {
  already_loaded <- names(getLoadedDLLs())
  module <- callme::compile(callme_code, env = NULL, verbosity = 0)

  loaded_now <- getLoadedDLLs()
  fresh <- setdiff(names(loaded_now), already_loaded)
  fresh <- fresh[startsWith(fresh, "callme_")]

  dll_path_of <- function(info) info[["path"]]
  fresh_paths <- vapply(loaded_now[fresh], dll_path_of, character(1))

  attr(module, "dll_paths") <- unname(fresh_paths)
  module
}

# Best-effort unloading of the DLLs at `dll_paths`. Duplicates are dropped and
# the remaining paths are visited in reverse order. Entries that are not
# non-empty strings naming an existing file are skipped, and failures from
# dyn.unload() are suppressed. Always returns NULL invisibly.
unload_callme_dlls <- function(dll_paths) {
  dll_paths <- rev(unique(dll_paths))
  if (length(dll_paths) == 0L) {
    return(invisible(NULL))
  }

  for (candidate in dll_paths) {
    unloadable <- is.character(candidate) &&
      nzchar(candidate) &&
      file.exists(candidate)
    if (!unloadable) {
      next
    }
    try(dyn.unload(candidate), silent = TRUE)
  }

  invisible(NULL)
}

# Build a callme module once and immediately tear it down: remember its DLL
# paths, drop the module object, run the garbage collector, then unload the
# DLLs. Returns NULL invisibly.
build_and_dispose_callme_module <- function() {
  module <- build_callme_module()
  loaded_paths <- attr(module, "dll_paths", exact = TRUE)

  rm(module)
  gc()

  unload_callme_dlls(loaded_paths)
  invisible(NULL)
}

# Decide whether the callme comparisons can run, and record a human-readable
# explanation in `callme_runtime_reason`.
callme_runtime_reason <- NULL
can_run_callme <- FALSE

if (!has_callme) {
  callme_runtime_reason <- "`callme` is not installed."
} else if (.Platform$OS.type == "windows") {
  callme_runtime_reason <- paste(
    "`callme` comparisons are skipped on Windows during vignette builds",
    "because the helper DLL compilation step is not reliable in CI."
  )
} else {
  # Probe: try one full build/unload cycle. The result is NULL on success and
  # the condition object on failure.
  callme_probe <- tryCatch(
    {
      build_and_dispose_callme_module()
      NULL
    },
    error = identity
  )

  if (inherits(callme_probe, "error")) {
    callme_runtime_reason <- paste(
      "`callme` comparisons were skipped because runtime compilation failed:",
      conditionMessage(callme_probe)
    )
  } else {
    can_run_callme <- TRUE
  }
}

# Benchmarks additionally need `bench` and the profmem capability.
can_run_benchmarks <- can_run_callme && has_bench && has_profmem

# Only fill in a reason here if none was recorded above, i.e. the callme
# probe itself succeeded.
if (is.null(callme_runtime_reason) && !has_bench) {
  callme_runtime_reason <- "`bench` is not installed."
} else if (is.null(callme_runtime_reason) && !has_profmem) {
  callme_runtime_reason <- paste(
    "`bench` runtime comparisons are skipped because memory profiling",
    "is not available in this R build."
  )
} else if (is.null(callme_runtime_reason)) {
  callme_runtime_reason <- "Executable comparisons are enabled."
}

# Build one Rtinycc module and one callme module, call
# `fun(<Rtinycc module>, <callme module>)`, and return its value. On exit
# (normal or via error in `fun`) both module objects are removed, the garbage
# collector is run, and the callme DLLs recorded at build time are unloaded.
with_benchmark_modules <- function(fun) {
  tinycc_module <- build_rtinycc_module()
  callme_module <- build_callme_module()
  callme_dlls <- attr(callme_module, "dll_paths", exact = TRUE)

  on.exit(
    {
      rm(tinycc_module, callme_module)
      gc()
      unload_callme_dlls(callme_dlls)
    },
    add = TRUE
  )

  fun(tinycc_module, callme_module)
}

# Evaluate `expr` (unevaluated, in the caller's frame) `times` times and return
# the median elapsed wall-clock time in seconds as reported by proc.time().
# gc() is run before each timed evaluation.
median_elapsed <- function(expr, times = 3L) {
  code <- substitute(expr)
  caller <- parent.frame()

  # One timed evaluation of the captured expression.
  time_once <- function() {
    gc()
    started <- proc.time()[["elapsed"]]
    eval(code, envir = caller)
    proc.time()[["elapsed"]] - started
  }

  stats::median(replicate(times, time_once()))
}

# Call the zero-argument function `fun` once per element of seq_len(n),
# discarding its results. Returns NULL invisibly.
run_noop <- function(fun, n) {
  for (call_index in seq_len(n)) {
    fun()
  }
  invisible(NULL)
}

# Call `fun(n)` once per element of seq_len(reps), discarding each result.
# Returns NULL invisibly.
run_rand <- function(fun, n, reps) {
  for (rep_index in seq_len(reps)) {
    invisible(fun(n))
  }
  invisible(NULL)
}

# For each element of seq_len(reps), allocate a fresh numeric(n) buffer and
# call `fun(buffer, n)`, discarding the result. Returns NULL invisibly.
run_fill <- function(fun, n, reps) {
  for (rep_index in seq_len(reps)) {
    out <- numeric(n)
    invisible(fun(out, n))
  }
  invisible(NULL)
}

# Recipe with the same source and bindings that build_rtinycc_module() uses,
# but without the final tcc_compile() step, so its fields can be inspected.
rtinycc_recipe <- tcc_ffi() |>
  tcc_source(rtinycc_code) |>
  tcc_bind(
    noop = list(args = list(), returns = "void"),
    fill_rand = list(args = list("numeric_array", "i32"), returns = "void"),
    rand_unif = list(
      args = list("i32"),
      returns = list(type = "numeric_array", length_arg = 1, free = TRUE)
    )
  )

# Pass the recipe's fields to Rtinycc's internal generate_ffi_code() and keep
# the result for display later in the article.
# NOTE(review): `:::` reaches into a non-exported function whose argument list
# may change between Rtinycc versions -- confirm against the installed version.
generated_code <- Rtinycc:::generate_ffi_code(
  symbols = rtinycc_recipe$symbols,
  headers = rtinycc_recipe$headers,
  c_code = rtinycc_recipe$c_code,
  is_external = FALSE,
  structs = rtinycc_recipe$structs,
  unions = rtinycc_recipe$unions,
  enums = rtinycc_recipe$enums,
  globals = rtinycc_recipe$globals,
  container_of = rtinycc_recipe$container_of,
  field_addr = rtinycc_recipe$field_addr,
  struct_raw_access = rtinycc_recipe$struct_raw_access,
  introspect = rtinycc_recipe$introspect
)

## Availability

# Print whether the optional `callme` package could be loaded.
has_callme

If `callme`, `bench`, or R memory profiling is unavailable, or if the current build environment cannot compile the temporary `callme` helper DLL, the executable comparisons below are skipped.

# Print the remaining availability flags: the `bench` package, the profmem
# capability, whether the callme probe succeeded, and the combined flag that
# requires all three.
has_bench
has_profmem
can_run_callme
can_run_benchmarks

Current comparison status:

# Print the explanation recorded while probing for callme/bench/profmem.
callme_runtime_reason

## Compilation Latency

This measures module build time, not call time.

# Median wall-clock build time of each implementation over three builds.
# The Rtinycc module is timed first, then the callme build/unload cycle.
compile_times <- data.frame(
  implementation = c("Rtinycc", "callme"),
  seconds = c(
    median_elapsed(build_rtinycc_module(), times = 3L),
    median_elapsed(build_and_dispose_callme_module(), times = 3L)
  )
)

# Same timings in milliseconds, rounded to one decimal place.
compile_times[["milliseconds"]] <- round(compile_times[["seconds"]] * 1000, 1)

compile_times

The expected pattern is:

## Generated Wrapper Code

The generated code makes the extra return-path work explicit. In particular, the `rand_unif()` wrapper allocates an R vector, `memcpy()`s the native `double*` buffer into it, then `free()`s the original buffer. In contrast, `fill_rand()` uses the borrowed `numeric_array` input path.

# Hand the generated wrapper source to Rtinycc's internal rtinycc_c_block()
# helper for display.
Rtinycc:::rtinycc_c_block(generated_code)

## `noop()` Call Overhead

This is the smallest useful call path. It approximates the lower bound on call overhead above a plain `.Call()` entry point.

# Benchmark the empty call: each timed expression invokes `noop` 1000 times
# through run_noop(). Both modules are built and torn down by
# with_benchmark_modules().
noop_bench <- with_benchmark_modules(function(rt_mod, cm_mod) {
  n_noop <- 1000L

  bench::mark(
    Rtinycc = run_noop(rt_mod$noop, n_noop),
    callme = run_noop(cm_mod$noop, n_noop),
    iterations = 20,
    # run_noop() returns NULL for both implementations, so result checking
    # can stay enabled here.
    check = TRUE,
    memory = has_profmem,
    filter_gc = FALSE
  )
})

noop_bench

Interpretation:

## `fill_rand(out, n)` And Zero-Copy Arrays

This is the fairer vector comparison because both implementations fill an existing R numeric vector instead of returning a newly allocated result.

# Benchmark the in-place fill: each timed expression runs 100 calls of
# `fill_rand` on a freshly allocated numeric(4096) buffer via run_fill().
fill_bench_n4096 <- with_benchmark_modules(function(rt_mod, cm_mod) {
  bench::mark(
    Rtinycc = run_fill(rt_mod$fill_rand, 4096L, 100L),
    callme = run_fill(cm_mod$fill_rand, 4096L, 100L),
    iterations = 20,
    check = FALSE,
    memory = has_profmem,
    filter_gc = FALSE
  )
})

fill_bench_n4096

Interpretation:

## `rand_unif(n)` And Copy Cost

Here the implementation work is still small, but the return path differs:

We time both a tiny and a larger return size.

# Benchmark the allocating return path at two sizes, reusing one pair of
# modules for both runs: 1000 calls returning 1 value, and 100 calls
# returning 4096 values. Returns both bench results in a named list.
rand_results <- with_benchmark_modules(function(rt_mod, cm_mod) {
  # Tiny result: dominated by per-call overhead rather than element count.
  rand_bench_n1 <- bench::mark(
    Rtinycc = run_rand(rt_mod$rand_unif, 1L, 1000L),
    callme = run_rand(cm_mod$rand_unif, 1L, 1000L),
    iterations = 20,
    check = FALSE,
    memory = has_profmem,
    filter_gc = FALSE
  )

  # Larger result: 4096 doubles per call.
  rand_bench_n4096 <- bench::mark(
    Rtinycc = run_rand(rt_mod$rand_unif, 4096L, 100L),
    callme = run_rand(cm_mod$rand_unif, 4096L, 100L),
    iterations = 20,
    check = FALSE,
    memory = has_profmem,
    filter_gc = FALSE
  )

  list(rand_bench_n1 = rand_bench_n1, rand_bench_n4096 = rand_bench_n4096)
})

rand_results$rand_bench_n1
rand_results$rand_bench_n4096

The usual pattern is:

## What These Numbers Mean

The benchmark gives a reasonable mental model:

So the package is usually strongest when:

It is less ideal when:



Try the Rtinycc package in your browser

Any scripts or data that you put into this service are public.

Rtinycc documentation built on April 28, 2026, 1:07 a.m.