Source code for torchtext.models.roberta.bundler
import logging
import re
from dataclasses import dataclass
from typing import Any, Callable, Dict, Optional, Union
from urllib.parse import urljoin

import torch
from torch.nn import Module

import torchtext.transforms as T
from torchtext import _TEXT_BUCKET
from torchtext._download_hooks import load_state_dict_from_url

from .model import RobertaEncoderConf, RobertaModel

logger = logging.getLogger(__name__)
def _is_head_available_in_checkpoint(checkpoint, head_state_dict):
# ensure all keys are present
return all(key in checkpoint.keys() for key in head_state_dict.keys())
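# Note (illustrative, not part of the original module): head parameters appear in the model
# state_dict under a "head." prefix (see the regex in build_model below). This helper returns
# True only when every such "head.*" key is already present in the checkpoint; otherwise
# build_model splices in the user-provided head weights.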
@dataclass
class RobertaModelBundle:
"""RobertaModelBundle(_params: torchtext.models.RobertaEncoderParams, _path: Optional[str] = None, _head: Optional[torch.nn.Module] = None, transform: Optional[Callable] = None)
Example - Pretrained base xlmr encoder
>>> import torch, torchtext
>>> from torchtext.functional import to_tensor
>>> xlmr_base = torchtext.models.XLMR_BASE_ENCODER
>>> model = xlmr_base.get_model()
>>> transform = xlmr_base.transform()
>>> input_batch = ["Hello world", "How are you!"]
>>> model_input = to_tensor(transform(input_batch), padding_value=1)
>>> output = model(model_input)
>>> output.shape
torch.Size([2, 6, 768])
Example - Pretrained large xlmr encoder attached to an un-initialized classification head
>>> import torch, torchtext
>>> from torchtext.models import RobertaClassificationHead
>>> from torchtext.functional import to_tensor
>>> xlmr_large = torchtext.models.XLMR_LARGE_ENCODER
>>> classifier_head = torchtext.models.RobertaClassificationHead(num_classes=2, input_dim=1024)
>>> model = xlmr_large.get_model(head=classifier_head)
>>> transform = xlmr_large.transform()
>>> input_batch = ["Hello world", "How are you!"]
>>> model_input = to_tensor(transform(input_batch), padding_value=1)
>>> output = model(model_input)
>>> output.shape
torch.Size([2, 2])
Example - User-specified configuration and checkpoint
>>> from torchtext.models import RobertaEncoderConf, RobertaModelBundle, RobertaClassificationHead
>>> model_weights_path = "https://download.pytorch.org/models/text/xlmr.base.encoder.pt"
>>> encoder_conf = RobertaEncoderConf(vocab_size=250002)
>>> classifier_head = RobertaClassificationHead(num_classes=2, input_dim=768)
>>> model = RobertaModelBundle.build_model(encoder_conf=encoder_conf, head=classifier_head, checkpoint=model_weights_path)
"""
_encoder_conf: RobertaEncoderConf
_path: Optional[str] = None
_head: Optional[Module] = None
transform: Optional[Callable] = None
def get_model(self,
*,
head: Optional[Module] = None,
load_weights: bool = True,
freeze_encoder: bool = False,
dl_kwargs: Optional[Dict[str, Any]] = None) -> RobertaModel:
r"""get_model(head: Optional[torch.nn.Module] = None, load_weights: bool = True, freeze_encoder: bool = False, *, dl_kwargs=None) -> torctext.models.RobertaModel
Args:
head (nn.Module): A module to be attached to the encoder to perform a specific task. If provided, it will replace the default member head. (Default: ``None``)
load_weights (bool): Indicates whether or not to load weights if available. (Default: ``True``)
freeze_encoder (bool): Indicates whether or not to freeze the encoder weights. (Default: ``False``)
dl_kwargs (dictionary of keyword arguments): Passed to :func:`torch.hub.load_state_dict_from_url`. (Default: ``None``)
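Example - Attaching a custom classification head (a minimal sketch; it assumes the pre-trained weights can be downloaded, and the head dimensions shown are only illustrative):
>>> import torchtext
>>> from torchtext.models import RobertaClassificationHead
>>> xlmr_base = torchtext.models.XLMR_BASE_ENCODER
>>> classifier_head = RobertaClassificationHead(num_classes=2, input_dim=768)
>>> model = xlmr_base.get_model(head=classifier_head, freeze_encoder=True)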
"""
if load_weights:
assert self._path is not None, "load_weights cannot be True. The pre-trained model weights are not available for the current object"
if freeze_encoder:
if not load_weights or not self._path:
logger.warning("The encoder is not loaded with pre-trained weights. Setting freeze_encoder to True will hinder the encoder from learning appropriate weights.")
if head is not None:
input_head = head
if self._head is not None:
logger.info("A custom head module was provided, discarding the default head module.")
else:
input_head = self._head
return RobertaModelBundle.build_model(encoder_conf=self._encoder_conf,
head=input_head,
freeze_encoder=freeze_encoder,
checkpoint=self._path if load_weights else None,
override_checkpoint_head=True,
strict=True,
dl_kwargs=dl_kwargs)
@classmethod
def build_model(
cls,
encoder_conf: RobertaEncoderConf,
*,
head: Optional[Module] = None,
freeze_encoder: bool = False,
checkpoint: Optional[Union[str, Dict[str, torch.Tensor]]] = None,
override_checkpoint_head: bool = False,
strict: bool = True,
dl_kwargs: Optional[Dict[str, Any]] = None,
) -> RobertaModel:
"""Class builder method
Args:
encoder_conf (RobertaEncoderConf): An instance of class RobertaEncoderConf that defines the encoder configuration
head (nn.Module): A module to be attached to the encoder to perform a specific task. (Default: ``None``)
freeze_encoder (bool): Indicates whether to freeze the encoder weights. (Default: ``False``)
checkpoint (str or Dict[str, torch.Tensor]): Path to a model checkpoint or the model state_dict itself. The state_dict may contain partial weights, i.e., only the encoder weights. (Default: ``None``)
override_checkpoint_head (bool): Override the checkpoint's head state dict (if present) with provided head state dict. (Default: ``False``)
strict (bool): Passed to :meth:`torch.nn.Module.load_state_dict`. (Default: ``True``)
dl_kwargs (dictionary of keyword arguments): Passed to :func:`torch.hub.load_state_dict_from_url`. (Default: ``None``)
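Example - Building from an in-memory state_dict (a minimal sketch; the checkpoint path below is a placeholder, and the head dimensions are illustrative):
>>> import torch
>>> from torchtext.models import RobertaEncoderConf, RobertaModelBundle, RobertaClassificationHead
>>> encoder_conf = RobertaEncoderConf(vocab_size=250002)
>>> classifier_head = RobertaClassificationHead(num_classes=2, input_dim=768)
>>> state_dict = torch.load("/path/to/local/encoder_checkpoint.pt")  # hypothetical local file holding a Dict[str, torch.Tensor]
>>> model = RobertaModelBundle.build_model(encoder_conf=encoder_conf, head=classifier_head, checkpoint=state_dict)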
"""
model = RobertaModel(encoder_conf, head, freeze_encoder)
if checkpoint is not None:
if torch.jit.isinstance(checkpoint, Dict[str, torch.Tensor]):
state_dict = checkpoint
elif isinstance(checkpoint, str):
dl_kwargs = {} if dl_kwargs is None else dl_kwargs
state_dict = load_state_dict_from_url(checkpoint, **dl_kwargs)
else:
raise TypeError("checkpoint must be of type `str` or `Dict[str, torch.Tensor]` but got {}".format(type(checkpoint)))
if head is not None:
regex = re.compile(r"^head\.")
head_state_dict = {k: v for k, v in model.state_dict().items() if regex.findall(k)}
# If the checkpoint does not contain the head's state_dict (or the caller requested an override), augment the checkpoint with the user-provided head state_dict
if not _is_head_available_in_checkpoint(state_dict, head_state_dict) or override_checkpoint_head:
state_dict.update(head_state_dict)
model.load_state_dict(state_dict, strict=strict)
return model
@property
def encoderConf(self) -> RobertaEncoderConf:
return self._encoder_conf
XLMR_BASE_ENCODER = RobertaModelBundle(
_path=urljoin(_TEXT_BUCKET, "xlmr.base.encoder.pt"),
_encoder_conf=RobertaEncoderConf(vocab_size=250002),
transform=lambda: T.Sequential(
T.SentencePieceTokenizer(urljoin(_TEXT_BUCKET, "xlmr.sentencepiece.bpe.model")),
T.VocabTransform(load_state_dict_from_url(urljoin(_TEXT_BUCKET, "xlmr.vocab.pt"))),
T.Truncate(254),
T.AddToken(token=0, begin=True),
T.AddToken(token=2, begin=False),
)
)
XLMR_BASE_ENCODER.__doc__ = (
'''
XLM-R Encoder with Base configuration
The XLM-RoBERTa model was proposed in `Unsupervised Cross-lingual Representation Learning
at Scale <https://arxiv.org/abs/1911.02116>`__. It is a large multi-lingual language model,
trained on 2.5TB of filtered CommonCrawl data and based on the RoBERTa model architecture.
Originally published by the authors of XLM-RoBERTa under MIT License
and redistributed with the same license.
[`License <https://github.com/pytorch/fairseq/blob/main/LICENSE>`__,
`Source <https://github.com/pytorch/fairseq/tree/main/examples/xlmr#pre-trained-models>`__]
Please refer to :class:`torchtext.models.RobertaModelBundle` for the usage.
'''
)
XLMR_LARGE_ENCODER = RobertaModelBundle(
_path=urljoin(_TEXT_BUCKET, "xlmr.large.encoder.pt"),
_encoder_conf=RobertaEncoderConf(vocab_size=250002, embedding_dim=1024, ffn_dimension=4096, num_attention_heads=16, num_encoder_layers=24),
transform=lambda: T.Sequential(
T.SentencePieceTokenizer(urljoin(_TEXT_BUCKET, "xlmr.sentencepiece.bpe.model")),
T.VocabTransform(load_state_dict_from_url(urljoin(_TEXT_BUCKET, "xlmr.vocab.pt"))),
T.Truncate(510),
T.AddToken(token=0, begin=True),
T.AddToken(token=2, begin=False),
)
)
XLMR_LARGE_ENCODER.__doc__ = (
'''
XLM-R Encoder with Large configuration
The XLM-RoBERTa model was proposed in `Unsupervised Cross-lingual Representation Learning
at Scale <https://arxiv.org/abs/1911.02116>`__. It is a large multi-lingual language model,
trained on 2.5TB of filtered CommonCrawl data and based on the RoBERTa model architecture.
Originally published by the authors of XLM-RoBERTa under MIT License
and redistributed with the same license.
[`License <https://github.com/pytorch/fairseq/blob/main/LICENSE>`__,
`Source <https://github.com/pytorch/fairseq/tree/main/examples/xlmr#pre-trained-models>`__]
Please refer to :class:`torchtext.models.RobertaModelBundle` for the usage.
'''
)
ROBERTA_BASE_ENCODER = RobertaModelBundle(
_path=urljoin(_TEXT_BUCKET, "roberta.base.encoder.pt"),
_encoder_conf=RobertaEncoderConf(vocab_size=50265),
transform=lambda: T.Sequential(
T.GPT2BPETokenizer(
encoder_json_path=urljoin(_TEXT_BUCKET, "gpt2_bpe_encoder.json"),
vocab_bpe_path=urljoin(_TEXT_BUCKET, "gpt2_bpe_vocab.bpe"),
),
T.VocabTransform(
load_state_dict_from_url(urljoin(_TEXT_BUCKET, "roberta.vocab.pt"))
),
T.Truncate(254),
T.AddToken(token=0, begin=True),
T.AddToken(token=2, begin=False),
),
)
ROBERTA_BASE_ENCODER.__doc__ = (
'''
RoBERTa Encoder with Base configuration
RoBERTa iterates on BERT's pretraining procedure, including training the model longer,
with bigger batches over more data; removing the next sentence prediction objective;
training on longer sequences; and dynamically changing the masking pattern applied
to the training data.
The RoBERTa model was pretrained on the combination of five datasets: BookCorpus,
English Wikipedia, CC-News, OpenWebText, and STORIES. Together these datasets
contain over 160GB of text.
Originally published by the authors of RoBERTa under MIT License
and redistributed with the same license.
[`License <https://github.com/pytorch/fairseq/blob/main/LICENSE>`__,
`Source <https://github.com/pytorch/fairseq/tree/main/examples/roberta#pre-trained-models>`__]
Please refer to :class:`torchtext.models.RobertaModelBundle` for the usage.
'''
)
ROBERTA_LARGE_ENCODER = RobertaModelBundle(
_path=urljoin(_TEXT_BUCKET, "roberta.large.encoder.pt"),
_encoder_conf=RobertaEncoderConf(
vocab_size=50265,
embedding_dim=1024,
ffn_dimension=4096,
num_attention_heads=16,
num_encoder_layers=24,
),
transform=lambda: T.Sequential(
T.GPT2BPETokenizer(
encoder_json_path=urljoin(_TEXT_BUCKET, "gpt2_bpe_encoder.json"),
vocab_bpe_path=urljoin(_TEXT_BUCKET, "gpt2_bpe_vocab.bpe"),
),
T.VocabTransform(
load_state_dict_from_url(urljoin(_TEXT_BUCKET, "roberta.vocab.pt"))
),
T.Truncate(510),
T.AddToken(token=0, begin=True),
T.AddToken(token=2, begin=False),
),
)
ROBERTA_LARGE_ENCODER.__doc__ = (
'''
RoBERTa Encoder with Large configuration
RoBERTa iterates on BERT's pretraining procedure, including training the model longer,
with bigger batches over more data; removing the next sentence prediction objective;
training on longer sequences; and dynamically changing the masking pattern applied
to the training data.
The RoBERTa model was pretrained on the combination of five datasets: BookCorpus,
English Wikipedia, CC-News, OpenWebText, and STORIES. Together these datasets
contain over 160GB of text.
Originally published by the authors of RoBERTa under MIT License
and redistributed with the same license.
[`License <https://github.com/pytorch/fairseq/blob/main/LICENSE>`__,
`Source <https://github.com/pytorch/fairseq/tree/main/examples/roberta#pre-trained-models>`__]
Please refer to :class:`torchtext.models.RobertaModelBundle` for the usage.
'''
)