facenet_torchscript_urls <- list(
ONet = c("https://torch-cdn.mlverse.org/models/vision/v2/models/facenet_onet.pth", "4833e21e3a610b4064b5a3d20683a55f", "2 MB"),
PNet = c("https://torch-cdn.mlverse.org/models/vision/v2/models/facenet_pnet.pth", "7f59c98ccf07c4ed51caf68fde86373e", "30 KB"),
RNet = c("https://torch-cdn.mlverse.org/models/vision/v2/models/facenet_rnet.pth", "c19b2f0df8f448455dd7ddbb47dcfa19", "400 KB")
)
#' MTCNN Face Detection Networks
#'
#' These models implement the three-stage Multi-task Cascaded Convolutional Networks (MTCNN)
#' architecture from the paper
#' [Joint Face Detection and Alignment using Multi-task Cascaded Convolutional Networks](https://arxiv.org/abs/1604.02878).
#'
#' MTCNN detects faces and facial landmarks in an image through a coarse-to-fine pipeline:
#' - **PNet** (Proposal Network): Generates candidate face bounding boxes at multiple scales.
#' - **RNet** (Refine Network): Refines candidate boxes, rejecting false positives.
#' - **ONet** (Output Network): Produces final bounding boxes and 5-point facial landmarks.
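#'
#' As a minimal sketch (mirroring what `model_mtcnn()` does internally), the three
#' stages can also be run individually on an image tensor `x` of shape `[1, 3, H, W]`:
#' ```
#' proposals <- model_facenet_pnet(pretrained = TRUE)(x)
#' refined   <- model_facenet_rnet(pretrained = TRUE)(nnf_interpolate(x, size = c(24, 24)))
#' final     <- model_facenet_onet(pretrained = TRUE)(nnf_interpolate(x, size = c(48, 48)))
#' ```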
#'
#' ## Model Variants
#' ```
#' | Model | Input Size | Parameters | File Size | Outputs                       | Notes                            |
#' |-------|------------|------------|-----------|-------------------------------|----------------------------------|
#' | PNet  | ≥ 12×12    | ~3k        | 30 KB     | 2-class face prob + bbox reg  | Fully conv, sliding-window stage |
#' | RNet  | 24×24      | ~30k       | 400 KB    | 2-class face prob + bbox reg  | Dense layers, higher recall      |
#' | ONet  | 48×48      | ~100k      | 2 MB      | 2-class prob + bbox + 5-point | Landmark detection stage         |
#' ```
#'
#' ## Inception-ResNet-v1
#' Inception-ResNet-v1 is a convolutional neural network architecture combining Inception modules
#' with residual connections, designed for face recognition. The pretrained models achieve high
#' accuracy on standard face verification benchmarks such as LFW (Labeled Faces in the Wild).
#'
#' ## Model Variants and Performance (LFW accuracy)
#' ```
#' | Weights | LFW Accuracy | File Size |
#' |----------------|--------------|-----------|
#' | CASIA-Webface | 99.05% | 111 MB |
#' | VGGFace2 | 99.65% | 107 MB |
#' ```
#'
#' - The CASIA-Webface pretrained weights provide strong baseline accuracy.
#' - The VGGFace2 pretrained weights achieve higher accuracy, benefiting from a larger, more diverse dataset.
#'
#' @examples
#' \dontrun{
#' # Example usage of PNet
#' model_pnet <- model_facenet_pnet(pretrained = TRUE)
#' model_pnet$eval()
#' input_pnet <- torch_randn(1, 3, 224, 224)
#' output_pnet <- model_pnet(input_pnet)
#' output_pnet
#'
#' # Example usage of RNet
#' model_rnet <- model_facenet_rnet(pretrained = TRUE)
#' model_rnet$eval()
#' input_rnet <- torch_randn(1, 3, 24, 24)
#' output_rnet <- model_rnet(input_rnet)
#' output_rnet
#'
#' # Example usage of ONet
#' model_onet <- model_facenet_onet(pretrained = TRUE)
#' model_onet$eval()
#' input_onet <- torch_randn(1, 3, 48, 48)
#' output_onet <- model_onet(input_onet)
#' output_onet
#'
#' # Example usage of MTCNN
#' mtcnn <- model_mtcnn(pretrained = TRUE)
#' mtcnn$eval()
#' image_tensor <- torch_randn(c(1, 3, 224, 224))
#' out <- mtcnn(image_tensor)
#' out
#'
#' # Load an image from the web
#' wmc <- "https://upload.wikimedia.org/wikipedia/commons/"
#' url <- "b/b4/Catherine_Bell_200101233d_hr_%28cropped%29.jpg"
#' img <- base_loader(paste0(wmc,url))
#'
#' # Convert to torch tensor [C, H, W] normalized
#' input <- transform_to_tensor(img) # [C, H, W]
#' batch <- input$unsqueeze(1) # [1, C, H, W]
#'
#' # Load pretrained model
#' model <- model_facenet_inception_resnet_v1(pretrained = "vggface2")
#' model$eval()
#' output <- model(batch)
#' output
#'
#' # Example usage of Inception-ResNet-v1 with CASIA-Webface Weights
#' model <- model_facenet_inception_resnet_v1(pretrained = "casia-webface")
#' model$eval()
#' output <- model(batch)
#' output
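#'
#' # Sketch: face verification by comparing two embeddings (here the image and
#' # its horizontal flip); a small distance suggests the same identity
#' emb1 <- output
#' emb2 <- model(torch_flip(batch, dims = 4))
#' torch_norm(emb1 - emb2)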
#' }
#'
#' @importFrom torch nn_module nn_conv2d nn_prelu nn_max_pool2d nn_softmax nn_linear nn_batch_norm2d nn_batch_norm1d
#' @importFrom torch nn_relu nn_dropout nn_adaptive_avg_pool2d nn_sequential torch_randn torch_cat
#' @importFrom torch nnf_interpolate nnf_relu nnf_normalize load_state_dict
#'
#' @inheritParams model_mobilenet_v2
#' @param classify Logical, whether to include the classification head. Default is FALSE.
#' @param num_classes Integer, number of output classes for classification. Default is 10.
#' @param dropout_prob Numeric, dropout probability applied before classification. Default is 0.6.
#'
#' @family object_detection_model
#' @rdname model_facenet
#' @name model_facenet
NULL
#' @describeIn model_facenet PNet (Proposal Network) — small fully-convolutional network for candidate face box generation.
#' @export
model_facenet_pnet <- nn_module(
classname = "PNet",
  initialize = function(pretrained = TRUE, progress = FALSE, ...) {
self$conv1 <- nn_conv2d(3, 10, kernel_size=3)
self$prelu1 <- nn_prelu(10)
self$pool1 <- nn_max_pool2d(2, 2, ceil_mode=TRUE)
self$conv2 <- nn_conv2d(10, 16, kernel_size=3)
self$prelu2 <- nn_prelu(16)
self$conv3 <- nn_conv2d(16, 32, kernel_size=3)
self$prelu3 <- nn_prelu(32)
self$conv4_1 <- nn_conv2d(32, 2, kernel_size=1)
self$softmax4_1 <- nn_softmax(dim=1)
self$conv4_2 <- nn_conv2d(32, 4, kernel_size=1)
self$training <- FALSE
if (pretrained) {
archive <- download_and_cache(facenet_torchscript_urls$PNet[1], prefix = "facenet")
      if (tools::md5sum(archive) != facenet_torchscript_urls$PNet[2]) {
        runtime_error("Corrupt file! Delete the file at {archive} and try again.")
      }
state_dict <- load_state_dict(archive)
self$load_state_dict(state_dict)
}
},
forward = function(x) {
x <- self$conv1(x)
x <- self$prelu1(x)
x <- self$pool1(x)
x <- self$conv2(x)
x <- self$prelu2(x)
x <- self$conv3(x)
x <- self$prelu3(x)
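    # two output heads: face / non-face probabilities (cls) and bounding-box regression (boxes)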
a <- self$conv4_1(x)
a <- self$softmax4_1(a)
b <- self$conv4_2(x)
list(boxes = b, cls = a)
}
)
#' @describeIn model_facenet RNet (Refine Network) — medium CNN with dense layers for refining and rejecting false positives.
#' @export
model_facenet_rnet <- nn_module(
classname = "RNet",
  initialize = function(pretrained = TRUE, progress = FALSE, ...) {
self$conv1 <- nn_conv2d(3, 28, kernel_size=3)
self$prelu1 <- nn_prelu(28)
self$pool1 <- nn_max_pool2d(3, 2, ceil_mode=TRUE)
self$conv2 <- nn_conv2d(28, 48, kernel_size=3)
self$prelu2 <- nn_prelu(48)
self$pool2 <- nn_max_pool2d(3, 2, ceil_mode=TRUE)
self$conv3 <- nn_conv2d(48, 64, kernel_size=2)
self$prelu3 <- nn_prelu(64)
self$dense4 <- nn_linear(576, 128)
self$prelu4 <- nn_prelu(128)
self$dense5_1 <- nn_linear(128, 2)
self$softmax5_1 <- nn_softmax(dim=1)
self$dense5_2 <- nn_linear(128, 4)
self$training <- FALSE
if (pretrained) {
archive <- download_and_cache(facenet_torchscript_urls$RNet[1], prefix = "facenet")
      if (tools::md5sum(archive) != facenet_torchscript_urls$RNet[2]) {
        runtime_error("Corrupt file! Delete the file at {archive} and try again.")
      }
state_dict <- load_state_dict(archive)
self$load_state_dict(state_dict)
}
},
forward = function(x) {
x <- self$conv1(x)
x <- self$prelu1(x)
x <- self$pool1(x)
x <- self$conv2(x)
x <- self$prelu2(x)
x <- self$pool2(x)
x <- self$conv3(x)
x <- self$prelu3(x)
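    # permute to [N, W, H, C] so flattening matches the original Caffe weight layout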
x <- x$permute(c(1,4,3,2))$contiguous()
x <- self$dense4(x$view(c(x$size(1), -1)))
x <- self$prelu4(x)
a <- self$dense5_1(x)
a <- self$softmax5_1(a)
b <- self$dense5_2(x)
list(boxes = b, cls = a)
}
)
#' @describeIn model_facenet ONet (Output Network) — deeper CNN that outputs final bounding boxes and 5 facial landmark points.
#' @export
model_facenet_onet <- nn_module(
classname = "ONet",
  initialize = function(pretrained = TRUE, progress = FALSE, ...) {
self$conv1 <- nn_conv2d(3, 32, kernel_size=3)
self$prelu1 <- nn_prelu(32)
self$pool1 <- nn_max_pool2d(3, 2, ceil_mode=TRUE)
self$conv2 <- nn_conv2d(32, 64, kernel_size=3)
self$prelu2 <- nn_prelu(64)
self$pool2 <- nn_max_pool2d(3, 2, ceil_mode=TRUE)
self$conv3 <- nn_conv2d(64, 64, kernel_size=3)
self$prelu3 <- nn_prelu(64)
self$pool3 <- nn_max_pool2d(2, 2, ceil_mode=TRUE)
self$conv4 <- nn_conv2d(64, 128, kernel_size=2)
self$prelu4 <- nn_prelu(128)
self$dense5 <- nn_linear(1152, 256)
self$prelu5 <- nn_prelu(256)
self$dense6_1 <- nn_linear(256, 2)
self$softmax6_1 <- nn_softmax(dim=1)
self$dense6_2 <- nn_linear(256, 4)
self$dense6_3 <- nn_linear(256, 10)
self$training <- FALSE
if (pretrained) {
archive <- download_and_cache(facenet_torchscript_urls$ONet[1], prefix = "facenet")
      if (tools::md5sum(archive) != facenet_torchscript_urls$ONet[2]) {
        runtime_error("Corrupt file! Delete the file at {archive} and try again.")
      }
state_dict <- load_state_dict(archive)
self$load_state_dict(state_dict)
}
},
forward = function(x) {
x <- self$conv1(x)
x <- self$prelu1(x)
x <- self$pool1(x)
x <- self$conv2(x)
x <- self$prelu2(x)
x <- self$pool2(x)
x <- self$conv3(x)
x <- self$prelu3(x)
x <- self$pool3(x)
x <- self$conv4(x)
x <- self$prelu4(x)
x <- x$permute(c(1,4,3,2))$contiguous()
x <- self$dense5(x$view(c(x$size(1), -1)))
x <- self$prelu5(x)
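    # three output heads: class probabilities, box regression, and 5-point landmarks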
    a <- self$dense6_1(x)
    a <- self$softmax6_1(a)
    b <- self$dense6_2(x)
    lm <- self$dense6_3(x)
    list(boxes = b, landmarks = lm, cls = a)
}
)
#' @describeIn model_facenet MTCNN (Multi-task Cascaded Convolutional Networks) — face detection and alignment using a cascade of three neural networks
#'
#' @return `model_mtcnn()` returns a named list with three elements:
#' \itemize{
#' \item \code{boxes}: A tensor of shape \code{(N, 4)} with bounding box coordinates \code{[x1, y1, x2, y2]}.
#' \item \code{landmarks}: A tensor of shape \code{(N, 10)} with (x, y) coordinates of 5 facial landmarks:
#' left eye, right eye, nose, left mouth corner, right mouth corner.
#' \item \code{cls}: A tensor of shape \code{(N, 2)} with face classification probabilities
#' (face / non-face). The \code{cls} head has two classes:
#' \itemize{
#' \item \code{1}: Non-face probability (background)
#' \item \code{2}: Face probability — use this value for thresholding detections
#' }
#' }
#' (Here, \code{N} is the number of detected faces in the input image.)
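#'
#' A minimal sketch of thresholding detections on the face probability
#' (the 0.9 cutoff is an arbitrary choice, not part of the model):
#' ```
#' out <- model_mtcnn(pretrained = TRUE)(image_tensor)
#' keep <- out$cls[, 2] > 0.9
#' boxes <- out$boxes[keep, ]
#' ```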
#' @family object_detection_model
#' @export
model_mtcnn <- nn_module(
classname = "MTCNN",
initialize = function(
pretrained = TRUE,
progress = TRUE,
...
) {
    self$pnet <- model_facenet_pnet(pretrained = pretrained, ...)
    self$rnet <- model_facenet_rnet(pretrained = pretrained, ...)
    self$onet <- model_facenet_onet(pretrained = pretrained, ...)
},
forward = function(x) {
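    # stage 1: PNet produces coarse face proposals from the input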
pnet_out <- self$pnet(x)
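    # stage 2: resize to RNet's 24x24 input and refine the candidates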
x_rnet <- nnf_interpolate(x, size = c(24, 24), mode = "bilinear", align_corners = FALSE)
rnet_out <- self$rnet(x_rnet)
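    # stage 3: resize to ONet's 48x48 input for final boxes and landmarks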
x_onet <- nnf_interpolate(x_rnet, size = c(48, 48), mode = "bilinear", align_corners = FALSE)
onet_out <- self$onet(x_onet)
list(boxes = onet_out$boxes, landmarks = onet_out$landmarks, cls = onet_out$cls)
}
)
load_inception_weights <- function(model, name) {
  if (name == "vggface2") {
    url <- "https://torch-cdn.mlverse.org/models/vision/v2/models/vggface2.pth"
    md5 <- "c446a04f0b22763858226717ba1f7410"
  } else if (name == "casia-webface") {
    url <- "https://torch-cdn.mlverse.org/models/vision/v2/models/casia-webface.pth"
    md5 <- "ff4aff482f6c1941784abba5131bae20"
  } else {
    runtime_error("Unknown pretrained weights '{name}'. Use 'vggface2' or 'casia-webface'.")
  }
  archive <- download_and_cache(url, prefix = "facenet")
  if (tools::md5sum(archive) != md5) {
    runtime_error("Corrupt file! Delete the file at {archive} and try again.")
  }
state_dict <- torch::load_state_dict(archive)
model$load_state_dict(state_dict)
model
}
BasicConv2d <- nn_module(
"BasicConv2d",
initialize = function(in_channels, out_channels, kernel_size, stride, padding = 0) {
self$conv <- nn_conv2d(in_channels, out_channels, kernel_size, stride, padding, bias = FALSE)
self$bn <- nn_batch_norm2d(out_channels, eps = 0.001, momentum = 0.1)
},
forward = function(x) {
x %>% self$conv() %>% self$bn() %>% nnf_relu(inplace = TRUE)
}
)
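# Inception-ResNet-A block: parallel branches are concatenated, projected back to
# 256 channels, and added to the input as a scaled residual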
Block35 <- nn_module(
"Block35",
initialize = function(scale = 1.0) {
self$scale <- scale
self$branch0 <- BasicConv2d(256, 32, kernel_size = 1, stride = 1)
self$branch1 <- nn_sequential(
BasicConv2d(256, 32, kernel_size = 1, stride = 1),
BasicConv2d(32, 32, kernel_size = 3, stride = 1, padding = 1)
)
    self$branch2 <- nn_sequential(
      BasicConv2d(256, 32, kernel_size = 1, stride = 1),
      BasicConv2d(32, 32, kernel_size = 3, stride = 1, padding = 1),
      BasicConv2d(32, 32, kernel_size = 3, stride = 1, padding = 1)
    )
    # branch widths follow the reference Inception-ResNet-v1: 32 + 32 + 32 = 96 channels
    self$conv2d <- nn_conv2d(96, 256, kernel_size = 1, stride = 1)
},
forward = function(x) {
branch0 <- self$branch0(x)
branch1 <- self$branch1(x)
branch2 <- self$branch2(x)
    mixed <- torch_cat(list(branch0, branch1, branch2), dim = 2)
    up <- self$conv2d(mixed)
    # ReLU is applied to the residual sum, not to `up` alone
    nnf_relu(x + self$scale * up, inplace = TRUE)
}
)
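# Inception-ResNet-B block on 896-channel maps, using factorized 1x7 / 7x1 convolutions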
Block17 <- nn_module(
initialize = function(scale = 1.0) {
self$scale <- scale
self$branch0 <- BasicConv2d(896, 128, kernel_size = 1, stride = 1)
self$branch1 <- nn_sequential(
BasicConv2d(896, 128, kernel_size = 1, stride = 1),
BasicConv2d(128, 128, kernel_size = c(1,7), stride = 1, padding = c(0,3)),
BasicConv2d(128, 128, kernel_size = c(7,1), stride = 1, padding = c(3,0))
)
self$conv2d <- nn_conv2d(256, 896, kernel_size = 1, stride = 1)
self$relu <- nn_relu(inplace = FALSE)
},
forward = function(x) {
x0 <- self$branch0(x)
x1 <- self$branch1(x)
out <- torch_cat(list(x0, x1), dim = 2)
out <- self$conv2d(out)
out <- out * self$scale + x
out %>% self$relu()
}
)
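# Inception-ResNet-C block on 1792-channel maps; noReLU = TRUE is used for the final residual block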
Block8 <- nn_module(
initialize = function(scale = 1.0, noReLU = FALSE) {
self$scale <- scale
self$noReLU <- noReLU
self$branch0 <- BasicConv2d(1792, 192, kernel_size = 1, stride = 1)
self$branch1 <- nn_sequential(
BasicConv2d(1792, 192, kernel_size = 1, stride = 1),
BasicConv2d(192, 192, kernel_size = c(1,3), stride = 1, padding = c(0,1)),
BasicConv2d(192, 192, kernel_size = c(3,1), stride = 1, padding = c(1,0))
)
self$conv2d <- nn_conv2d(384, 1792, kernel_size = 1, stride = 1)
if (!noReLU) {
self$relu <- nn_relu(inplace = FALSE)
}
},
forward = function(x) {
x0 <- self$branch0(x)
x1 <- self$branch1(x)
out <- torch_cat(list(x0, x1), dim = 2)
out <- self$conv2d(out)
out <- out * self$scale + x
if (!self$noReLU) {
out <- self$relu(out)
}
out
}
)
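# reduction block between the A and B stages: halves the spatial size and
# widens 256 -> 896 channels (384 + 256 + 256)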
Mixed_6a <- nn_module(
initialize = function() {
self$branch0 <- BasicConv2d(256, 384, kernel_size = 3, stride = 2, padding = 0)
self$branch1 <- nn_sequential(
BasicConv2d(256, 192, kernel_size = 1, stride = 1, padding = 0),
      BasicConv2d(192, 192, kernel_size = 3, stride = 1, padding = 1),
      BasicConv2d(192, 256, kernel_size = 3, stride = 2, padding = 0)
)
self$branch2 <- nn_max_pool2d(kernel_size = 3, stride = 2)
},
forward = function(x) {
x0 <- self$branch0(x)
x1 <- self$branch1(x)
x2 <- self$branch2(x)
torch_cat(list(x0, x1, x2), dim = 2)
}
)
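# reduction block between the B and C stages: halves the spatial size and
# widens 896 -> 1792 channels (384 + 256 + 256 + 896)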
Mixed_7a <- nn_module(
initialize = function() {
self$branch0 <- nn_sequential(
BasicConv2d(896, 256, kernel_size = 1, stride = 1),
BasicConv2d(256, 384, kernel_size = 3, stride = 2)
)
self$branch1 <- nn_sequential(
BasicConv2d(896, 256, kernel_size = 1, stride = 1),
BasicConv2d(256, 256, kernel_size = 3, stride = 2)
)
self$branch2 <- nn_sequential(
BasicConv2d(896, 256, kernel_size = 1, stride = 1),
BasicConv2d(256, 256, kernel_size = 3, stride = 1, padding = 1),
BasicConv2d(256, 256, kernel_size = 3, stride = 2)
)
self$branch3 <- nn_max_pool2d(kernel_size = 3, stride = 2)
},
forward = function(x) {
x0 <- self$branch0(x)
x1 <- self$branch1(x)
x2 <- self$branch2(x)
x3 <- self$branch3(x)
torch_cat(list(x0, x1, x2, x3), dim = 2)
}
)
#' @describeIn model_facenet Inception-ResNet-v1 — high-accuracy face recognition model combining Inception modules with residual connections, pretrained on VGGFace2 and CASIA-Webface datasets
#'
#' @return
#' `model_facenet_inception_resnet_v1()` returns a tensor output depending on the \code{classify} argument:
#' \itemize{
#' \item When \code{classify = FALSE} (default):
#' A tensor of shape \code{(N, 512)}, where each row is a normalized embedding
#' vector (L2 norm = 1).
#' These 512-dimensional FaceNet embeddings can be compared using cosine
#' similarity or Euclidean distance for face verification and clustering.
#'
#' \item When \code{classify = TRUE}:
#' A tensor of shape \code{(N, num_classes)} containing class logits.
#' }
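#'
#' A minimal sketch of comparing two embeddings `emb1` and `emb2` (assumed to be
#' single rows returned by the model); because the embeddings are L2-normalized,
#' cosine similarity reduces to a dot product:
#' ```
#' sim  <- torch_sum(emb1 * emb2)   # cosine similarity
#' dist <- torch_norm(emb1 - emb2)  # Euclidean distance
#' ```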
#'
#' @family classification_model
#' @export
model_facenet_inception_resnet_v1 <- nn_module(
initialize = function(
pretrained = NULL,
classify = FALSE,
num_classes = 10,
dropout_prob = 0.6,
...
) {
if (!is.null(pretrained)) {
if (pretrained == "vggface2") {
tmp_classes <- 8631
} else if (pretrained == "casia-webface") {
tmp_classes <- 10575
} else {
pretrained <- NULL
}
}
self$conv2d_1a <- BasicConv2d(3, 32, kernel_size = 3, stride = 2)
self$conv2d_2a <- BasicConv2d(32, 32, kernel_size = 3, stride = 1)
self$conv2d_2b <- BasicConv2d(32, 64, kernel_size = 3, stride = 1, padding = 1)
self$maxpool_3a <- nn_max_pool2d(kernel_size = 3, stride = 2)
self$conv2d_3b <- BasicConv2d(64, 80, kernel_size = 1, stride = 1)
self$conv2d_4a <- BasicConv2d(80, 192, kernel_size = 3, stride = 1)
    self$conv2d_4b <- BasicConv2d(192, 256, kernel_size = 3, stride = 2)
    # repeat_1: 5 Inception-ResNet-A blocks, matching the reference Inception-ResNet-v1
    self$repeat_1 <- nn_sequential(!!!lapply(1:5, function(i) Block35(0.17)))
    self$mixed_6a <- Mixed_6a()
self$repeat_2 <- nn_sequential(!!!lapply(1:10, function(i) Block17(0.10)))
self$mixed_7a <- Mixed_7a()
self$repeat_3 <- nn_sequential(!!!lapply(1:5, function(i) Block8(0.20)))
self$block8 <- Block8(noReLU = TRUE)
self$avgpool_1a <- nn_adaptive_avg_pool2d(output_size = 1)
self$dropout <- nn_dropout(p = dropout_prob)
self$last_linear <- nn_linear(1792, 512, bias = FALSE)
self$last_bn <- nn_batch_norm1d(512, eps = 0.001, momentum = 0.1, affine = TRUE)
self$classify <- classify
if (!is.null(pretrained)) {
self$logits <- nn_linear(512, tmp_classes)
load_inception_weights(self, pretrained)
}
if (classify && !is.null(num_classes)) {
self$logits <- nn_linear(512, num_classes)
}
},
forward = function(x) {
x <- self$conv2d_1a(x)
x <- self$conv2d_2a(x)
x <- self$conv2d_2b(x)
x <- self$maxpool_3a(x)
x <- self$conv2d_3b(x)
x <- self$conv2d_4a(x)
    x <- self$conv2d_4b(x)
    x <- self$repeat_1(x)
    x <- self$mixed_6a(x)
x <- self$repeat_2(x)
x <- self$mixed_7a(x)
x <- self$repeat_3(x)
x <- self$block8(x)
x <- self$avgpool_1a(x)
x <- self$dropout(x)
x <- self$last_linear(x$view(c(x$shape[1], -1)))
x <- self$last_bn(x)
if (self$classify) {
x <- self$logits(x)
} else {
x <- nnf_normalize(x, p = 2, dim = 2)
}
x
}
)