.. DO NOT EDIT.
.. THIS FILE WAS AUTOMATICALLY GENERATED BY SPHINX-GALLERY.
.. TO MAKE CHANGES, EDIT THE SOURCE PYTHON FILE:
.. "tutorials/_rendered_examples/dynamo/vgg16_fp8_ptq.py"
.. LINE NUMBERS ARE GIVEN BELOW.

.. only:: html

    .. note::
        :class: sphx-glr-download-link-note

        :ref:`Go to the end <sphx_glr_download_tutorials__rendered_examples_dynamo_vgg16_fp8_ptq.py>`
        to download the full example code

.. rst-class:: sphx-glr-example-title

.. _sphx_glr_tutorials__rendered_examples_dynamo_vgg16_fp8_ptq.py:


.. _vgg16_fp8_ptq:

Deploy Quantized Models using Torch-TensorRT
======================================================

Here we demonstrate how to deploy a model quantized to FP8 using the Dynamo frontend of Torch-TensorRT.

.. GENERATED FROM PYTHON SOURCE LINES 11-13

Imports and Model Definition
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

.. GENERATED FROM PYTHON SOURCE LINES 13-120

.. code-block:: python


    import argparse

    import modelopt.torch.quantization as mtq
    import torch
    import torch.nn as nn
    import torch.nn.functional as F
    import torch_tensorrt as torchtrt
    import torchvision.datasets as datasets
    import torchvision.transforms as transforms
    from modelopt.torch.quantization.utils import export_torch_mode


    class VGG(nn.Module):
        """VGG-style CNN: a conv/batch-norm/ReLU feature stack plus an MLP classifier.

        Args:
            layer_spec: Iterable describing the feature extractor. The string
                ``"pool"`` adds a 2x2, stride-2 max-pool; any other entry is used
                as the output-channel count of a 3x3 conv + batch norm + ReLU.
            num_classes: Output width of the last linear layer.
            init_weights: If true, re-initialize parameters with
                ``_initialize_weights`` after construction.
        """

        def __init__(self, layer_spec, num_classes=1000, init_weights=False):
            super().__init__()

            feature_layers = []
            channels_in = 3  # the first convolution is built for 3 input channels
            for entry in layer_spec:
                if entry == "pool":
                    feature_layers.append(nn.MaxPool2d(kernel_size=2, stride=2))
                    continue
                feature_layers.extend(
                    (
                        nn.Conv2d(channels_in, entry, kernel_size=3, padding=1),
                        nn.BatchNorm2d(entry),
                        nn.ReLU(),
                    )
                )
                # The next convolution consumes what this one produced.
                channels_in = entry

            self.features = nn.Sequential(*feature_layers)
            # Collapse the spatial dimensions to 1x1 so the classifier sees one
            # value per channel.
            self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
            # The first linear layer takes 512 inputs, so the last convolution in
            # layer_spec has to produce 512 channels.
            self.classifier = nn.Sequential(
                nn.Linear(512 * 1 * 1, 4096),
                nn.ReLU(),
                nn.Dropout(),
                nn.Linear(4096, 4096),
                nn.ReLU(),
                nn.Dropout(),
                nn.Linear(4096, num_classes),
            )
            if init_weights:
                self._initialize_weights()

        def _initialize_weights(self):
            """Re-initialize every conv, batch-norm and linear submodule in place."""
            for module in self.modules():
                if isinstance(module, nn.Conv2d):
                    nn.init.kaiming_normal_(
                        module.weight, mode="fan_out", nonlinearity="relu"
                    )
                    if module.bias is not None:
                        nn.init.zeros_(module.bias)
                elif isinstance(module, nn.BatchNorm2d):
                    nn.init.ones_(module.weight)
                    nn.init.zeros_(module.bias)
                elif isinstance(module, nn.Linear):
                    nn.init.normal_(module.weight, 0, 0.01)
                    nn.init.zeros_(module.bias)

        def forward(self, x):
            """Return classifier outputs for the input batch ``x``."""
            feature_maps = self.features(x)
            pooled = self.avgpool(feature_maps)
            # Keep dim 0 and flatten everything after it.
            flat = torch.flatten(pooled, 1)
            return self.classifier(flat)


    def vgg16(num_classes=1000, init_weights=False):
        """Build a ``VGG`` with the 13-convolution, 5-pool layer configuration.

        Args:
            num_classes: Forwarded to ``VGG`` as the classifier output width.
            init_weights: Forwarded to ``VGG``; enables its custom initialization.

        Returns:
            The constructed ``VGG`` module.
        """
        # (output channels, number of conv layers) per stage; every stage is
        # followed by a "pool" marker.
        stages = ((64, 2), (128, 2), (256, 3), (512, 3), (512, 3))

        layer_spec = []
        for width, repeats in stages:
            layer_spec.extend([width] * repeats)
            layer_spec.append("pool")

        return VGG(layer_spec, num_classes, init_weights)


    # Command-line interface: checkpoint path (mandatory) and the batch size used
    # by the data loaders.
    PARSER = argparse.ArgumentParser(
        description="Load pre-trained VGG model and then tune with FP8 and PTQ. For having a pre-trained VGG model, please refer to https://github.com/pytorch/TensorRT/tree/main/examples/int8/training/vgg16"
    )
    PARSER.add_argument(
        "--ckpt",
        required=True,
        type=str,
        help="Path to the pre-trained checkpoint",
    )
    PARSER.add_argument(
        "--batch-size",
        type=int,
        default=128,
        help="Batch size for tuning the model with PTQ and FP8",
    )

    args = PARSER.parse_args()

    # Build the 10-class VGG16 (framework-default initialization) directly on the GPU.
    model = vgg16(num_classes=10, init_weights=False).cuda()


.. GENERATED FROM PYTHON SOURCE LINES 121-123

Load the pre-trained model weights
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

.. GENERATED FROM PYTHON SOURCE LINES 123-140

.. code-block:: python


    # NOTE(review): torch.load unpickles the file, so only pass checkpoints that
    # come from a trusted source.
    # Load onto the CPU first so the checkpoint does not depend on the GPU layout
    # it was saved from; load_state_dict copies the values into the model's
    # parameters.
    ckpt = torch.load(args.ckpt, map_location="cpu")
    weights = ckpt["model_state_dict"]

    # Some checkpoints (e.g. ones saved from a DataParallel-style wrapper) prefix
    # every key with `module.`. Decide per key whether the prefix is present
    # instead of guessing from the number of GPUs on the current machine, which
    # says nothing about how the checkpoint was produced.
    prefix = "module."
    weights = {
        (k[len(prefix) :] if k.startswith(prefix) else k): v
        for k, v in weights.items()
    }

    model.load_state_dict(weights)
    # Don't forget to set the model to evaluation mode!
    model.eval()


.. GENERATED FROM PYTHON SOURCE LINES 141-143

Load training dataset and define loss function for PTQ
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

.. GENERATED FROM PYTHON SOURCE LINES 143-170

.. code-block:: python


    # Augmentation + normalization applied to every training image.
    # NOTE(review): the Normalize constants are presumably per-channel CIFAR-10
    # mean/std values - confirm against the training recipe.
    train_transform = transforms.Compose(
        [
            transforms.RandomCrop(32, padding=4),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
        ]
    )

    # CIFAR-10 training split, downloaded into ./data when it is not there yet.
    training_dataset = datasets.CIFAR10(
        root="./data", train=True, download=True, transform=train_transform
    )

    # Shuffled loader; drop_last discards a trailing batch that is smaller than
    # batch_size.
    training_dataloader = torch.utils.data.DataLoader(
        dataset=training_dataset,
        batch_size=args.batch_size,
        shuffle=True,
        num_workers=2,
        drop_last=True,
    )

    # Pull one batch of images up front; the labels are not needed here.
    data = iter(training_dataloader)
    images, _ = next(data)

    # Loss used when reporting calibration and test metrics.
    crit = nn.CrossEntropyLoss()


.. GENERATED FROM PYTHON SOURCE LINES 171-173

Define Calibration Loop for quantization
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

.. GENERATED FROM PYTHON SOURCE LINES 173-191

.. code-block:: python



    def calibrate_loop(model):
        """Run ``model`` over the whole training loader and print loss/accuracy.

        Used as the ``forward_loop`` callback of ``mtq.quantize``: the forward
        passes are what matter; the printed metrics are informational only.

        Args:
            model: Module to run; it is called on CUDA batches from
                ``training_dataloader``.
        """
        # calibrate over the training dataset
        total = 0
        correct = 0
        loss = 0.0
        # Only forward passes are needed, so skip autograd bookkeeping entirely.
        with torch.no_grad():
            for data, labels in training_dataloader:
                data, labels = data.cuda(), labels.cuda(non_blocking=True)
                out = model(data)
                num_samples = labels.size(0)
                # crit returns the mean loss of the batch; weight it by the batch
                # size so that dividing by `total` below gives a per-sample mean.
                # .item() keeps a plain float instead of accumulating tensors.
                loss += crit(out, labels).item() * num_samples
                preds = torch.max(out, 1)[1]
                total += num_samples
                correct += (preds == labels).sum().item()

        print("PTQ Loss: {:.5f} Acc: {:.2f}%".format(loss / total, 100 * correct / total))



.. GENERATED FROM PYTHON SOURCE LINES 192-194

Tune the pre-trained model with FP8 and PTQ
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

.. GENERATED FROM PYTHON SOURCE LINES 194-200

.. code-block:: python


    # Use the FP8 configuration that ships with ModelOpt's quantization module.
    quant_cfg = mtq.FP8_DEFAULT_CFG
    # PTQ with in-place replacement to quantized modules; calibrate_loop is
    # passed as the forward loop that runs the training data through the model.
    mtq.quantize(model, quant_cfg, forward_loop=calibrate_loop)
    # model has FP8 qdq nodes at this point


.. GENERATED FROM PYTHON SOURCE LINES 201-203

Inference
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

.. GENERATED FROM PYTHON SOURCE LINES 203-261

.. code-block:: python


    # Load the testing dataset
    testing_dataset = datasets.CIFAR10(
        root="./data",
        train=False,
        download=True,
        transform=transforms.Compose(
            [
                transforms.ToTensor(),
                transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
            ]
        ),
    )

    testing_dataloader = torch.utils.data.DataLoader(
        testing_dataset,
        batch_size=args.batch_size,
        shuffle=False,
        num_workers=2,
        drop_last=True,
    )  # set drop_last=True to drop the last incomplete batch for static shape `torchtrt.dynamo.compile()`

    with torch.no_grad():
        with export_torch_mode():
            # Compile the model with Torch-TensorRT Dynamo backend
            # The batch fetched earlier from the training loader serves as the
            # example input for export and compilation.
            input_tensor = images.cuda()
            exp_program = torch.export.export(model, (input_tensor,))
            trt_model = torchtrt.dynamo.compile(
                exp_program,
                inputs=[input_tensor],
                enabled_precisions={torch.float8_e4m3fn},
                min_block_size=1,
                debug=False,
            )
            # You can also use torch compile path to compile the model with Torch-TensorRT:
            # trt_model = torch.compile(model, backend="tensorrt")

            # Inference compiled Torch-TensorRT model over the testing dataset
            total = 0
            correct = 0
            loss = 0.0
            class_probs = []
            class_preds = []
            for data, labels in testing_dataloader:
                data, labels = data.cuda(), labels.cuda(non_blocking=True)
                out = trt_model(data)
                num_samples = labels.size(0)
                # crit returns the mean loss of the batch; weight it by the batch
                # size so that `loss / total` is a true per-sample average.
                loss += crit(out, labels).item() * num_samples
                preds = torch.max(out, 1)[1]
                # One batched softmax over the class dimension replaces the
                # per-row Python loop; the resulting probabilities are the same.
                class_probs.append(F.softmax(out, dim=1))
                class_preds.append(preds)
                total += num_samples
                correct += (preds == labels).sum().item()

            test_probs = torch.cat(class_probs)
            test_preds = torch.cat(class_preds)
            test_loss = loss / total
            test_acc = correct / total
            print("Test Loss: {:.5f} Test Acc: {:.2f}%".format(test_loss, 100 * test_acc))


.. rst-class:: sphx-glr-timing

   **Total running time of the script:** ( 0 minutes  0.000 seconds)


.. _sphx_glr_download_tutorials__rendered_examples_dynamo_vgg16_fp8_ptq.py:

.. only:: html

  .. container:: sphx-glr-footer sphx-glr-footer-example




    .. container:: sphx-glr-download sphx-glr-download-python

      :download:`Download Python source code: vgg16_fp8_ptq.py <vgg16_fp8_ptq.py>`

    .. container:: sphx-glr-download sphx-glr-download-jupyter

      :download:`Download Jupyter notebook: vgg16_fp8_ptq.ipynb <vgg16_fp8_ptq.ipynb>`


.. only:: html

 .. rst-class:: sphx-glr-signature

    `Gallery generated by Sphinx-Gallery <https://sphinx-gallery.github.io>`_