# Source code for torchtext.transforms

import json
from copy import deepcopy
from functools import lru_cache
from typing import Any, List, Optional, Union

import torch
import torchtext  # noqa: F401
from torch import Tensor
from torch.nn import Module
from torchtext._torchtext import (
    CLIPEncoder as CLIPEncoderPyBind,
    GPT2BPEEncoder as GPT2BPEEncoderPyBind,
    BERTEncoder as BERTEncoderPyBind,
)
from torchtext.data.functional import load_sp_model
from torchtext.utils import get_asset_local_path
from torchtext.vocab import Vocab

from . import functional as F

__all__ = [
    "SentencePieceTokenizer",
    "VocabTransform",
    "ToTensor",
    "LabelToIndex",
    "Truncate",
    "AddToken",
    "PadTransform",
    "StrToIntTransform",
    "GPT2BPETokenizer",
    "Sequential",
]


class SentencePieceTokenizer(Module):
    """Tokenize text with a pre-trained SentencePiece model.

    Additional details: https://github.com/google/sentencepiece

    :param sp_model_path: Path to pre-trained sentencepiece model
    :type sp_model_path: str

    Example
        >>> from torchtext.transforms import SentencePieceTokenizer
        >>> transform = SentencePieceTokenizer("spm_model")
        >>> transform(["hello world", "attention is all you need!"])
    """

    def __init__(self, sp_model_path: str):
        super().__init__()
        # Resolve the (possibly remote) model path to a local file, then load it.
        self.sp_model = load_sp_model(get_asset_local_path(sp_model_path))

    def forward(self, input: Any) -> Any:
        """
        :param input: Input sentence or list of sentences on which to apply tokenizer.
        :type input: Union[str, List[str]]
        :return: tokenized text
        :rtype: Union[List[str], List[List(str)]]
        """
        # Single sentence -> one token list; batch -> list of token lists.
        if torch.jit.isinstance(input, str):
            return self.sp_model.EncodeAsPieces(input)
        elif torch.jit.isinstance(input, List[str]):
            tokenized: List[List[str]] = []
            for sentence in input:
                tokenized.append(self.sp_model.EncodeAsPieces(sentence))
            return tokenized
        else:
            raise TypeError("Input type not supported")
class VocabTransform(Module):
    r"""Convert an input batch of tokens into the corresponding token ids.

    :param vocab: an instance of :class:`torchtext.vocab.Vocab` class.

    Example:
        >>> import torch
        >>> from torchtext.vocab import vocab
        >>> from torchtext.transforms import VocabTransform
        >>> from collections import OrderedDict
        >>> vocab_obj = vocab(OrderedDict([('a', 1), ('b', 1), ('c', 1)]))
        >>> vocab_transform = VocabTransform(vocab_obj)
        >>> output = vocab_transform([['a','b'],['a','b','c']])
        >>> jit_vocab_transform = torch.jit.script(vocab_transform)
    """

    def __init__(self, vocab: Vocab):
        super().__init__()
        assert isinstance(vocab, Vocab)
        self.vocab = vocab

    def forward(self, input: Any) -> Any:
        """
        :param input: Input batch of token to convert to correspnding token ids
        :type input: Union[List[str], List[List[str]]]
        :return: Converted input into corresponding token ids
        :rtype: Union[List[int], List[List[int]]]
        """
        # Nested batch -> look up each inner token sequence; flat list -> single lookup.
        if torch.jit.isinstance(input, List[List[str]]):
            ids: List[List[int]] = []
            for token_seq in input:
                ids.append(self.vocab.lookup_indices(token_seq))
            return ids
        elif torch.jit.isinstance(input, List[str]):
            return self.vocab.lookup_indices(input)
        else:
            raise TypeError("Input type not supported")
class ToTensor(Module):
    r"""Convert input token-id sequences to a :class:`torch.Tensor`.

    :param padding_value: Pad value to make each input in the batch of length equal to the longest sequence in the batch.
    :type padding_value: Optional[int]
    :param dtype: :class:`torch.dtype` of output tensor
    :type dtype: :class:`torch.dtype`
    """

    def __init__(self, padding_value: Optional[int] = None, dtype: torch.dtype = torch.long) -> None:
        super().__init__()
        self.dtype = dtype
        self.padding_value = padding_value

    def forward(self, input: Any) -> Tensor:
        """
        :param input: Sequence or batch of token ids
        :type input: Union[List[int], List[List[int]]]
        :rtype: Tensor
        """
        # Delegate conversion (and any padding) to the functional implementation.
        result = F.to_tensor(input, padding_value=self.padding_value, dtype=self.dtype)
        return result
class LabelToIndex(Module):
    r"""Transform labels from string names to ids.

    :param label_names: a list of unique label names
    :type label_names: Optional[List[str]]
    :param label_path: a path to file containing unique label names containing 1 label per line.
        Note that either label_names or label_path should be supplied but not both.
    :type label_path: Optional[str]
    """

    def __init__(
        self,
        label_names: Optional[List[str]] = None,
        label_path: Optional[str] = None,
        sort_names=False,
    ):
        assert label_names or label_path, "label_names or label_path is required"
        assert not (label_names and label_path), "label_names and label_path are mutually exclusive"
        super().__init__()

        # Read one label per line when a file path is given; blank lines are skipped.
        if label_path:
            with open(label_path, "r") as f:
                label_names = [line.strip() for line in f if line.strip()]
        if sort_names:
            label_names = sorted(label_names)

        # Wrap the pybind Vocab so lookups are TorchScript-compatible.
        self._label_vocab = Vocab(torch.classes.torchtext.Vocab(label_names, None))
        self._label_names = self._label_vocab.get_itos()

    def forward(self, input: Any) -> Any:
        """
        :param input: Input labels to convert to corresponding ids
        :type input: Union[str, List[str]]
        :rtype: Union[int, List[int]]
        """
        # Single label -> single id; list of labels -> list of ids.
        if torch.jit.isinstance(input, str):
            return self._label_vocab.__getitem__(input)
        elif torch.jit.isinstance(input, List[str]):
            return self._label_vocab.lookup_indices(input)
        else:
            raise TypeError("Input type not supported")

    @property
    def label_names(self) -> List[str]:
        """Label names in vocabulary (index-to-string) order."""
        return self._label_names
class Truncate(Module):
    r"""Truncate an input sequence (or batch of sequences) to a maximum length.

    :param max_seq_len: The maximum allowable length for input sequence
    :type max_seq_len: int
    """

    def __init__(self, max_seq_len: int) -> None:
        super().__init__()
        self.max_seq_len = max_seq_len

    def forward(self, input: Any) -> Any:
        """
        :param input: Input sequence or batch of sequence to be truncated
        :type input: Union[List[Union[str, int]], List[List[Union[str, int]]]]
        :return: Truncated sequence
        :rtype: Union[List[Union[str, int]], List[List[Union[str, int]]]]
        """
        # Delegate to the functional implementation.
        truncated = F.truncate(input, self.max_seq_len)
        return truncated
class AddToken(Module):
    """Prepend or append a token to a sequence (or to each sequence of a batch).

    :param token: The token to be added
    :type token: Union[int, str]
    :param begin: Whether to insert token at start or end or sequence, defaults to True
    :type begin: bool, optional
    """

    def __init__(self, token: Union[int, str], begin: bool = True) -> None:
        super().__init__()
        self.begin = begin
        self.token = token

    def forward(self, input: Any) -> Any:
        """
        :param input: Input sequence or batch
        :type input: Union[List[Union[str, int]], List[List[Union[str, int]]]]
        """
        # Delegate to the functional implementation.
        augmented = F.add_token(input, self.token, self.begin)
        return augmented
class PadTransform(Module):
    """Pad tensor to a fixed length with given padding value.

    Inputs already at or beyond ``max_length`` are returned unchanged (no
    truncation is performed).

    :param max_length: Maximum length to pad to
    :type max_length: int
    :param pad_value: Value to pad the tensor with
    :type pad_value: int
    """
    # NOTE: the docstring previously declared ``:type pad_value: bool``, which
    # contradicted both the annotation and the float() conversion below; fixed to int.

    def __init__(self, max_length: int, pad_value: int):
        super().__init__()
        self.max_length = max_length
        # Normalize once to float; used as the fill value for torch.nn.functional.pad.
        self.pad_value = float(pad_value)

    def forward(self, x: Tensor) -> Tensor:
        """
        :param x: The tensor to pad
        :type x: Tensor
        :return: Tensor padded up to max_length with pad_value
        :rtype: Tensor
        """
        max_encoded_length = x.size(-1)
        if max_encoded_length < self.max_length:
            pad_amount = self.max_length - max_encoded_length
            # Pad only on the right side of the last dimension.
            x = torch.nn.functional.pad(x, (0, pad_amount), value=self.pad_value)
        return x
class StrToIntTransform(Module):
    """Convert string tokens to integers (either single sequence or batch)."""

    def __init__(self):
        super().__init__()

    def forward(self, input: Any) -> Any:
        """
        :param input: sequence or batch of string tokens to convert
        :type input: Union[List[str], List[List[str]]]
        :return: sequence or batch converted into corresponding token ids
        :rtype: Union[List[int], List[List[int]]]
        """
        # Delegate to the functional implementation.
        converted = F.str_to_int(input)
        return converted
class GPT2BPETokenizer(Module):
    """
    Transform for GPT-2 BPE Tokenizer.

    Reimplements openai GPT-2 BPE in TorchScript. Original openai implementation
    https://github.com/openai/gpt-2/blob/master/src/encoder.py

    :param encoder_json_path: Path to GPT-2 BPE encoder json file.
    :type encoder_json_path: str
    :param vocab_bpe_path: Path to bpe vocab file.
    :type vocab_bpe_path: str
    :param return_tokens: Indicate whether to return split tokens. If False, it will return encoded token IDs as strings (default: False)
    :type return_tokens: bool
    """

    # Properties cannot be compiled by TorchScript; excluded from scripting.
    __jit_unused_properties__ = ["is_jitable"]
    # NOTE: "_seperator" (sic) is kept misspelled — it mirrors the pybind/C++
    # attribute name ("seperator_") used in __prepare_scriptable__ below.
    _seperator: torch.jit.Final[str]

    def __init__(self, encoder_json_path: str, vocab_bpe_path: str, return_tokens: bool = False):
        super().__init__()
        # \u0001 is used as a joiner inside merge-rank keys; it cannot appear in BPE tokens.
        self._seperator = "\u0001"
        # load bpe encoder and bpe decoder
        with open(get_asset_local_path(encoder_json_path), "r", encoding="utf-8") as f:
            bpe_encoder = json.load(f)
        # load bpe vocab
        with open(get_asset_local_path(vocab_bpe_path), "r", encoding="utf-8") as f:
            bpe_vocab = f.read()
        # [1:-1] skips the "#version" header line and the trailing empty line.
        bpe_merge_ranks = {
            self._seperator.join(merge_pair.split()): i for i, merge_pair in enumerate(bpe_vocab.split("\n")[1:-1])
        }
        # Caching is enabled in Eager mode
        self.bpe = GPT2BPEEncoderPyBind(bpe_encoder, bpe_merge_ranks, self._seperator, bytes_to_unicode(), True)
        self._return_tokens = return_tokens

    @property
    def is_jitable(self):
        # True once __prepare_scriptable__ has swapped in the TorchBind encoder.
        return isinstance(self.bpe, torch._C.ScriptObject)

    @torch.jit.export
    def _encode(self, text: str) -> List[str]:
        """Encode text into a list of tokens IDs

        Args:
            text: An input text string.

        Returns:
            A list of bpe token ids represents each bpe tokens

        For example: "awesome,awe"
            --> bpe --> bpe tokens: ["aw", "esome"], [","], ["aw", "e"]
            --> bpe encode --> bpe token ids: [707, 5927, 11, 707, 68]
        """
        bpe_token_ids: List[int] = self.bpe.encode(text)
        bpe_tokens: List[str] = []

        # IDs are stringified so the return type matches _tokenize (List[str]).
        for bpe_token_id in bpe_token_ids:
            bpe_tokens.append(str(bpe_token_id))

        return bpe_tokens

    @torch.jit.export
    def _tokenize(self, text: str) -> List[str]:
        """Tokenize text into a list of tokens

        Args:
            text: An input text string.

        Returns:
            A list of bpe token ids represents each bpe tokens

        For example: "awesome,awe"
            --> bpe --> bpe tokens: ["aw", "esome"], [","], ["aw", "e"]
        """
        return self.bpe.tokenize(text)

    def forward(self, input: Any) -> Any:
        """
        :param input: Input sentence or list of sentences on which to apply tokenizer.
        :type input: Union[str, List[str]]
        :return: tokenized text
        :rtype: Union[List[str], List[List(str)]]
        """
        # _return_tokens selects sub-word strings (_tokenize) vs stringified ids (_encode).
        if torch.jit.isinstance(input, List[str]):
            tokens: List[List[str]] = []
            for text in input:
                if self._return_tokens:
                    tokens.append(self._tokenize(text))
                else:
                    tokens.append(self._encode(text))
            return tokens
        elif torch.jit.isinstance(input, str):
            if self._return_tokens:
                return self._tokenize(input)
            else:
                return self._encode(input)
        else:
            raise TypeError("Input type not supported")

    def __prepare_scriptable__(self):
        r"""Return a JITable tokenizer."""
        if not self.is_jitable:
            tokenizer_copy = deepcopy(self)
            # Disable caching in script mode
            tokenizer_copy.bpe = torch.classes.torchtext.GPT2BPEEncoder(
                self.bpe.bpe_encoder_, self.bpe.bpe_merge_ranks_, self.bpe.seperator_, self.bpe.byte_encoder_, False
            )
            return tokenizer_copy
        return self
class CLIPTokenizer(Module):
    """
    Transform for CLIP Tokenizer. Based on Byte-Level BPE.

    Reimplements CLIP Tokenizer in TorchScript. Original implementation:
    https://github.com/mlfoundations/open_clip/blob/main/src/clip/tokenizer.py

    This tokenizer has been trained to treat spaces like parts of the tokens
    (a bit like sentencepiece) so a word will be encoded differently whether it
    is at the beginning of the sentence (without space) or not.

    The below code snippet shows how to use the CLIP tokenizer with encoder and merges file
    taken from the original paper implementation.

    Example
        >>> from torchtext.transforms import CLIPTokenizer
        >>> MERGES_FILE = "http://download.pytorch.org/models/text/clip_merges.bpe"
        >>> ENCODER_FILE = "http://download.pytorch.org/models/text/clip_encoder.json"
        >>> tokenizer = CLIPTokenizer(merges_path=MERGES_FILE, encoder_json_path=ENCODER_FILE)
        >>> tokenizer("the quick brown fox jumped over the lazy dog")

    :param merges_path: Path to bpe merges file.
    :type merges_path: str
    :param encoder_json_path: Optional, path to BPE encoder json file. When specified, this is used
        to infer num_merges.
    :type encoder_json_path: str
    :param num_merges: Optional, number of merges to read from the bpe merges file.
    :type num_merges: int
    :param return_tokens: Indicate whether to return split tokens. If False, it will return encoded token IDs as strings (default: False)
    :type return_tokens: bool
    """

    # Properties cannot be compiled by TorchScript; excluded from scripting.
    __jit_unused_properties__ = ["is_jitable"]
    # NOTE: "_seperator" (sic) is kept misspelled — it mirrors the pybind/C++
    # attribute name ("seperator_") used in __prepare_scriptable__ below.
    _seperator: torch.jit.Final[str]

    def __init__(
        self,
        merges_path: str,
        encoder_json_path: Optional[str] = None,
        num_merges: Optional[int] = None,
        return_tokens: bool = False,
    ):
        super().__init__()
        # \u0001 is used as a joiner inside merge-rank keys; it cannot appear in BPE tokens.
        self._seperator = "\u0001"
        # load bpe merges ([1:] skips the header line of the merges file)
        with open(get_asset_local_path(merges_path), "r", encoding="utf-8") as f:
            bpe_merges = f.read().split("\n")[1:]

        if encoder_json_path:
            # load bpe encoder
            with open(get_asset_local_path(encoder_json_path), "r", encoding="utf-8") as f:
                bpe_encoder = json.load(f)
            # 256 * 2 for each byte. For each byte we have ['a', 'a</w>']
            # Additional 2 tokens for bos and eos
            num_merges = len(bpe_encoder) - (256 * 2 + 2)
            bpe_merge_ranks = {
                self._seperator.join(merge_pair.split()): i for i, merge_pair in enumerate(bpe_merges[:num_merges])
            }
        else:
            # No encoder json: build the encoder vocabulary ourselves from the
            # byte alphabet, its end-of-word variants, the merges, and bos/eos.
            num_merges = num_merges or len(bpe_merges)
            bpe_merge_ranks = {
                self._seperator.join(merge_pair.split()): i for i, merge_pair in enumerate(bpe_merges[:num_merges])
            }
            bpe_vocab = list(bytes_to_unicode().values())
            bpe_vocab = bpe_vocab + [v + "</w>" for v in bpe_vocab]
            bpe_vocab.extend(["".join(merge_pair.split()) for merge_pair in bpe_merges[:num_merges]])
            bpe_vocab.extend(["<|startoftext|>", "<|endoftext|>"])
            bpe_encoder = {v: i for i, v in enumerate(bpe_vocab)}

        # Caching is enabled in Eager mode
        self.bpe = CLIPEncoderPyBind(bpe_encoder, bpe_merge_ranks, self._seperator, bytes_to_unicode(), True)
        self._return_tokens = return_tokens

    @property
    def is_jitable(self):
        # True once __prepare_scriptable__ has swapped in the TorchBind encoder.
        return isinstance(self.bpe, torch._C.ScriptObject)

    @torch.jit.export
    def _encode(self, text: str) -> List[str]:
        """Encode text into a list of tokens IDs

        Args:
            text: An input text string.

        Returns:
            A list of bpe token ids represents each bpe tokens

        For example: "awesome,awe"
            --> bpe --> bpe tokens: ["aw", "esome"], [","], ["aw", "e"]
            --> bpe encode --> bpe token ids: [707, 5927, 11, 707, 68]
        """
        # CLIP normalizes input: lowercase and strip before BPE.
        text = text.lower().strip()
        bpe_token_ids: List[int] = self.bpe.encode(text)
        bpe_tokens: List[str] = []

        # IDs are stringified so the return type matches _tokenize (List[str]).
        for bpe_token_id in bpe_token_ids:
            bpe_tokens.append(str(bpe_token_id))

        return bpe_tokens

    @torch.jit.export
    def _tokenize(self, text: str) -> List[str]:
        """Tokenize text into a list of tokens

        Args:
            text: An input text string.

        Returns:
            A list of bpe token ids represents each bpe tokens

        For example: "awesome,awe"
            --> bpe --> bpe tokens: ["aw", "esome"], [","], ["aw", "e"]
        """
        # CLIP normalizes input: lowercase and strip before BPE.
        text = text.lower().strip()
        return self.bpe.tokenize(text)

    def forward(self, input: Any) -> Any:
        """
        :param input: Input sentence or list of sentences on which to apply tokenizer.
        :type input: Union[str, List[str]]
        :return: tokenized text
        :rtype: Union[List[str], List[List(str)]]
        """
        # _return_tokens selects sub-word strings (_tokenize) vs stringified ids (_encode).
        if torch.jit.isinstance(input, List[str]):
            tokens: List[List[str]] = []
            for text in input:
                if self._return_tokens:
                    tokens.append(self._tokenize(text))
                else:
                    tokens.append(self._encode(text))
            return tokens
        elif torch.jit.isinstance(input, str):
            if self._return_tokens:
                return self._tokenize(input)
            else:
                return self._encode(input)
        else:
            raise TypeError("Input type not supported")

    def __prepare_scriptable__(self):
        r"""Return a JITable tokenizer."""
        if not self.is_jitable:
            tokenizer_copy = deepcopy(self)
            # Disable caching in script mode
            tokenizer_copy.bpe = torch.classes.torchtext.CLIPEncoder(
                self.bpe.bpe_encoder_, self.bpe.bpe_merge_ranks_, self.bpe.seperator_, self.bpe.byte_encoder_, False
            )
            return tokenizer_copy
        return self
class BERTTokenizer(Module):
    """
    Transform for BERT Tokenizer.

    Based on WordPiece algorithm introduced in paper:
    https://static.googleusercontent.com/media/research.google.com/ja//pubs/archive/37842.pdf

    The backend kernel implementation is taken and modified from https://github.com/LieluoboAi/radish.
    See PR https://github.com/pytorch/text/pull/1707 summary for more details.

    The below code snippet shows how to use the BERT tokenizer using the pre-trained vocab files.

    Example
        >>> from torchtext.transforms import BERTTokenizer
        >>> VOCAB_FILE = "https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt"
        >>> tokenizer = BERTTokenizer(vocab_path=VOCAB_FILE, do_lower_case=True, return_tokens=True)
        >>> tokenizer("Hello World, How are you!") # single sentence input
        >>> tokenizer(["Hello World","How are you!"]) # batch input

    :param vocab_path: Path to pre-trained vocabulary file. The path can be either local or URL.
    :type vocab_path: str
    :param do_lower_case: Indicate whether to do lower case. (default: True)
    :type do_lower_case: Optional[bool]
    :param strip_accents: Indicate whether to strip accents. (default: None)
    :type strip_accents: Optional[bool]
    :param return_tokens: Indicate whether to return tokens. If false, returns corresponding token IDs as strings (default: False)
    :type return_tokens: bool
    """

    # Properties cannot be compiled by TorchScript; excluded from scripting.
    __jit_unused_properties__ = ["is_jitable"]

    def __init__(
        self, vocab_path: str, do_lower_case: bool = True, strip_accents: Optional[bool] = None, return_tokens=False
    ) -> None:
        super().__init__()
        self.bert_model = BERTEncoderPyBind(get_asset_local_path(vocab_path), do_lower_case, strip_accents)
        self._return_tokens = return_tokens
        # Constructor args are kept so __prepare_scriptable__ can rebuild the
        # TorchBind encoder with the same configuration.
        self._vocab_path = vocab_path
        self._do_lower_case = do_lower_case
        self._strip_accents = strip_accents

    @property
    def is_jitable(self):
        # True once __prepare_scriptable__ has swapped in the TorchBind encoder.
        return isinstance(self.bert_model, torch._C.ScriptObject)

    @torch.jit.export
    def _encode(self, text: str) -> List[str]:
        """Encode text into a list of tokens IDs

        Args:
            text: An input text string.

        Returns:
            A list of token ids represents each sub-word

        For example:
            --> "Hello world!" --> token ids: [707, 5927, 11, 707, 68]
        """
        token_ids: List[int] = self.bert_model.encode(text.strip())
        # IDs are stringified so the return type matches _tokenize (List[str]).
        tokens_ids_str: List[str] = [str(token_id) for token_id in token_ids]
        return tokens_ids_str

    @torch.jit.export
    def _batch_encode(self, text: List[str]) -> List[List[str]]:
        """Batch version of _encode i.e operate on list of str"""
        token_ids: List[List[int]] = self.bert_model.batch_encode([t.strip() for t in text])
        tokens_ids_str: List[List[str]] = [[str(t) for t in token_id] for token_id in token_ids]
        return tokens_ids_str

    @torch.jit.export
    def _tokenize(self, text: str) -> List[str]:
        """Tokenize text into a list of tokens

        Args:
            text: An input text string.

        Returns:
            A list of tokens (sub-words)

        For example:
            --> "Hello World!": ["Hello", "World", "!"]
        """
        return self.bert_model.tokenize(text.strip())

    @torch.jit.export
    def _batch_tokenize(self, text: List[str]) -> List[List[str]]:
        """Batch version of _tokenize i.e operate on list of str"""
        return self.bert_model.batch_tokenize([t.strip() for t in text])

    def forward(self, input: Any) -> Any:
        """
        :param input: Input sentence or list of sentences on which to apply tokenizer.
        :type input: Union[str, List[str]]
        :return: tokenized text
        :rtype: Union[List[str], List[List(str)]]
        """
        # _return_tokens selects sub-word strings vs stringified ids; batches use
        # the dedicated batch kernels.
        if torch.jit.isinstance(input, List[str]):
            tokens: List[List[str]] = []
            if self._return_tokens:
                tokens = self._batch_tokenize(input)
            else:
                tokens = self._batch_encode(input)
            return tokens
        elif torch.jit.isinstance(input, str):
            if self._return_tokens:
                return self._tokenize(input)
            else:
                return self._encode(input)
        else:
            raise TypeError("Input type not supported")

    def __prepare_scriptable__(self):
        """Return a JITable tokenizer (TorchBind encoder replaces the pybind one)."""
        if not self.is_jitable:
            tokenizer_copy = deepcopy(self)
            tokenizer_copy.bert_model = torch.classes.torchtext.BERTEncoder(
                self._vocab_path, self._do_lower_case, self._strip_accents
            )
            return tokenizer_copy
        return self
@lru_cache() def bytes_to_unicode(): """ Original Source: https://github.com/openai/gpt-2/blob/master/src/encoder.py#L9 Returns list of utf-8 byte and a corresponding list of unicode strings. The reversible bpe codes work on unicode strings. This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. This is a signficant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup tables between utf-8 bytes and unicode strings. And avoids mapping to whitespace/control characters the bpe code barfs on. """ bs = list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1)) cs = bs[:] n = 0 for b in range(2 ** 8): if b not in bs: bs.append(b) cs.append(2 ** 8 + n) n += 1 cs = [chr(n) for n in cs] return dict(zip(bs, cs))
class Sequential(torch.nn.Sequential):
    r"""A container that applies a sequence of text transforms in order."""

    def forward(self, input: Any) -> Any:
        """
        :param input: Input sequence or batch. The input type must be supported by the first transform in the sequence.
        :type input: `Any`
        """
        # Thread the value through each child transform in registration order.
        out = input
        for transform in self:
            out = transform(out)
        return out

# Docs: Access comprehensive developer documentation for PyTorch (View Docs)
# Tutorials: Get in-depth tutorials for beginners and advanced developers (View Tutorials)
# Resources: Find development resources and get your questions answered (View Resources)