tests/testthat/test-nn-transformer.R

test_that("Results are equal to Pytorch", {
  # results vary noticeably across platforms for unknown reasons, so this
  # exact-value comparison only runs on macOS
  skip_on_os(c("windows", "linux", "solaris"))
  
  with_torch_manual_seed(seed = 123, {
    layer <- nn_transformer_encoder_layer(d_model = 8, nhead = 2, dim_feedforward = 16)
    input <- torch_randn(5, 3, 8)  # (seq_len=5, batch=3, features=8)
    output <- layer(input)  
  })

  expected <- rbind(
    c(2.0008, -0.7164,  1.0093,  0.3160, -0.6471, -0.0106, -0.8325, -1.1195),
    c(1.0405, -0.3428,  1.3047,  0.5384, -1.5694, -0.7368, -1.0591,  0.8246),
    c(0.0510,  0.4845,  0.2344,  0.6781, -1.2878, -0.0163,  1.5982, -1.7421)
  )

  expect_equal_to_r(output[1,,], expected, tolerance = 1e-4)
})
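
# The reference values above are assumed to come from an equivalent PyTorch run,
# e.g. the hypothetical snippet below (the values only match if the seeded RNG
# stream is shared between R torch and PyTorch):
#   import torch
#   torch.manual_seed(123)
#   layer = torch.nn.TransformerEncoderLayer(d_model=8, nhead=2, dim_feedforward=16)
#   print(layer(torch.randn(5, 3, 8))[0])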

test_that("TransformerEncoderLayer produces output of correct shape", {
  layer <- nn_transformer_encoder_layer(d_model = 8, nhead = 2, dim_feedforward = 16)
  input <- torch_randn(5, 3, 8)  # (seq_len=5, batch=3, features=8)
  output <- layer(input)
  expect_true(torch:::is_torch_tensor(output))
  expect_no_error(as_array(output$to(device = "cpu")))
  expect_equal(dim(output), dim(input))  # output shape should match input shape
})

test_that("TransformerEncoderLayer batch_first works", {
  layer_bf <- nn_transformer_encoder_layer(d_model = 8, nhead = 2, batch_first = TRUE)
  input_bf <- torch_randn(3, 5, 8)  # (batch=3, seq_len=5, features=8)
  output_bf <- layer_bf(input_bf)
  expect_equal(dim(output_bf), dim(input_bf))
})

test_that("TransformerEncoderLayer accepts mask and is_causal", {
  d_model <- 4; seq_len <- 4; batch <- 1
  layer <- nn_transformer_encoder_layer(d_model = d_model, nhead = 1, dropout = 0)  # no dropout for deterministic result
  layer$eval()  # ensure dropout is off
  x <- torch_randn(seq_len, batch, d_model)
  # Create an explicit causal mask (upper triangular True matrix)
  causal_mask <- torch_ones(c(seq_len, seq_len), dtype = torch_bool())$triu(diagonal = 1)
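  # With seq_len = 4, triu(diagonal = 1) gives a strictly upper-triangular mask:
  #   FALSE  TRUE  TRUE  TRUE
  #   FALSE FALSE  TRUE  TRUE
  #   FALSE FALSE FALSE  TRUE
  #   FALSE FALSE FALSE FALSE
  # so position i cannot attend to any position j > i.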
  # Run with explicit mask vs using is_causal flag
  out_mask <- layer(x, src_mask = causal_mask)
  out_flag <- layer(x, is_causal = TRUE)
  expect_equal(out_mask, out_flag)  # should be identical
})

test_that("TransformerEncoderLayer key_padding_mask works as expected", {
  layer <- nn_transformer_encoder_layer(d_model = 6, nhead = 2, dropout = 0)
  layer$eval()
  seq_len <- 3; batch <- 2
  x <- torch_randn(seq_len, batch, 6)
  # key_padding_mask: mask out the last position of sequence for the second batch element
  pad_mask <- torch_tensor(matrix(c(0, 0, 0,   # no padding in batch 1
                                    0, 0, 1),  # third position masked in batch 2
                                  nrow = batch, ncol = seq_len, byrow = TRUE), dtype = torch_bool())
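  # Following the PyTorch convention, the padding mask has shape (batch, seq_len)
  # and TRUE marks positions that are ignored as attention keys.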
  out_nomask <- layer(x)
  out_masked <- layer(x, src_key_padding_mask = pad_mask)
  # The output at the masked position should be the same in both runs, since a masked
  # position does not attend to anything and reaches the output only through the
  # residual connections.
  expect_equal(out_nomask[ ,2, ][3, ], out_masked[ ,2, ][3, ])
  # (Here we compare the last time-step of batch 2 from both outputs.)
})

test_that("TransformerEncoder (stack of layers) produces correct output and uses each layer", {
  # Create a base layer and an encoder with multiple layers
  base_layer <- nn_transformer_encoder_layer(d_model = 8, nhead = 2, dropout = 0)
  model <- nn_transformer_encoder(base_layer, num_layers = 3)
  model$eval()
  x <- torch_randn(4, 2, 8)  # (seq_len=4, batch=2, features=8)
  # Output from the encoder
  out_model <- model(x)
  expect_equal(dim(out_model), dim(x))
  # Manually apply the cloned layers sequentially and compare
  manual_out <- x
  for (i in 1:3) {
    manual_out <- model$layers[[i]](manual_out)
  }
  if (!is.null(model$norm)) {
    manual_out <- model$norm(manual_out)
  }
  expect_equal(out_model, manual_out)
})

test_that("TransformerEncoder results are identical tom python", {
  # for some reason results vary a lot accross platforms 
  skip_on_os(c("windows", "linux", "solaris")) 
  
  with_torch_manual_seed(seed = 123, {
    base_layer <- nn_transformer_encoder_layer(d_model = 8, nhead = 2, dim_feedforward = 16)
    model <- nn_transformer_encoder(base_layer, num_layers = 3)
    x <- torch_randn(4, 2, 8)
    out_model <- model(x)
  })

  expected <- rbind(
    c(2.2889, -0.3724,  0.1357,  0.3804, -0.5985,  0.1374, -1.2492, -0.7224),
    c(1.9246, -0.1313,  0.9192,  0.2264, -1.1253, -0.1943, -1.4291, -0.1902)
  )

  expect_equal_to_r(out_model[1,,], expected, tolerance = 1e-4)
})

test_that("TransformerEncoder supports different norm and preserves results", {
  # Use a final normalization in TransformerEncoder
  layer <- nn_transformer_encoder_layer(d_model = 5, nhead = 1, dropout = 0)
  final_norm <- nn_layer_norm(normalized_shape = 5)
  model <- nn_transformer_encoder(layer, num_layers = 2, norm = final_norm)
  model$eval()
  x <- torch_randn(6, 1, 5)
  out <- model(x)
  # The output with final norm should equal manually normalizing the output of encoder without norm
  model_no_norm <- nn_transformer_encoder(layer, num_layers = 2)  # same layers but without final norm
  model_no_norm$eval()
  out_no_norm <- model_no_norm(x)
  manual_norm <- final_norm(out_no_norm)
  expect_equal(out, manual_norm)
})

test_that("Different activation functions produce similar results when appropriate", {
  torch_manual_seed(42)
  layer_relu_str <- nn_transformer_encoder_layer(d_model = 4, nhead = 1, dim_feedforward = 8, 
                                                 activation = "relu", dropout = 0)
  torch_manual_seed(42)
  layer_relu_fun <- nn_transformer_encoder_layer(d_model = 4, nhead = 1, dim_feedforward = 8, 
                                                 activation = nnf_relu, dropout = 0)
  layer_relu_str$eval(); layer_relu_fun$eval()
  x <- torch_randn(5, 1, 4)
  out_str <- layer_relu_str(x)
  out_fun <- layer_relu_fun(x)
  expect_equal(out_str, out_fun)  # relu string vs function should match
  # Similarly for GELU
  torch_manual_seed(123)
  layer_gelu_str <- nn_transformer_encoder_layer(d_model = 4, nhead = 1, dim_feedforward = 8, 
                                                 activation = "gelu", dropout = 0)
  torch_manual_seed(123)
  layer_gelu_fun <- nn_transformer_encoder_layer(d_model = 4, nhead = 1, dim_feedforward = 8, 
                                                 activation = nnf_gelu, dropout = 0)
  layer_gelu_str$eval(); layer_gelu_fun$eval()
  out_str2 <- layer_gelu_str(x)
  out_fun2 <- layer_gelu_fun(x)
  expect_equal(out_str2, out_fun2)
})

test_that("Modules are serializable and gradients flow", {
  layer <- nn_transformer_encoder_layer(d_model = 3, nhead = 1)
  model <- nn_transformer_encoder(layer, num_layers = 2)
  # Serialize and deserialize
  tmp <- tempfile(fileext = ".pt")
  torch_save(model, tmp)
  model2 <- torch_load(tmp)
  # Check that loaded model outputs the same as original
  x <- torch_randn(2, 1, 3, requires_grad = TRUE)
  model$eval(); model2$eval()
  expect_equal(model(x), model2(x))
  # Gradient flow: do a backward pass
  model$train()
  out <- model(x)
  loss <- out$sum()
  loss$backward()
  # Check that at least one parameter has non-null gradient
  grads <- lapply(model$parameters, function(p) p$grad)
  has_grad <- any(sapply(grads, function(g) { !is_undefined_tensor(g) && g$numel() > 0 }))
  expect_true(has_grad)
})


test_that("TransformerEncoderLayer GPU test", {
  if(cuda_is_available()) {
  layer_bf <- nn_transformer_encoder_layer(d_model = 8, nhead = 2, batch_first = TRUE)
  input_bf <- torch_randn(3, 5, 8, device = "cuda")  # (batch=3, seq_len=5, features=8)
  layer_bf$to(device = "cuda")
  output_bf <- layer_bf(input_bf)
  expect_equal(dim(output_bf), dim(input_bf))
  }
})
