Shortcuts

Source code for torchtext.vocab.vocab_factory

from collections import Counter, OrderedDict
from typing import Dict, Iterable, List, Optional

from torchtext._torchtext import Vocab as VocabPybind

from .vocab import Vocab


def vocab(
    ordered_dict: Dict, min_freq: int = 1, specials: Optional[List[str]] = None, special_first: bool = True
) -> Vocab:
    r"""Factory method for creating a vocab object which maps tokens to indices.

    Note that the ordering in which key value pairs were inserted in the `ordered_dict` will be respected
    when building the vocab. Therefore if sorting by token frequency is important to the user,
    the `ordered_dict` should be created in a way to reflect this.

    The `ordered_dict` passed in is only read; it is never modified by this function.

    Args:
        ordered_dict: Ordered Dictionary mapping tokens to their corresponding occurrence frequencies.
        min_freq: The minimum frequency needed to include a token in the vocabulary.
            Special tokens are always included, regardless of this threshold.
        specials: Special symbols to add. The order of supplied tokens will be preserved.
        special_first: Indicates whether to insert symbols at the beginning or at the end.

    Returns:
        torchtext.vocab.Vocab: A `Vocab` object

    Examples:
        >>> from torchtext.vocab import vocab
        >>> from collections import Counter, OrderedDict
        >>> counter = Counter(["a", "a", "b", "b", "b"])
        >>> sorted_by_freq_tuples = sorted(counter.items(), key=lambda x: x[1], reverse=True)
        >>> ordered_dict = OrderedDict(sorted_by_freq_tuples)
        >>> v1 = vocab(ordered_dict)
        >>> print(v1['a']) #prints 1
        >>> print(v1['out of vocab']) #raise RuntimeError since default index is not set
        >>> tokens = ['e', 'd', 'c', 'b', 'a']
        >>> #adding <unk> token and default index
        >>> unk_token = '<unk>'
        >>> default_index = -1
        >>> v2 = vocab(OrderedDict([(token, 1) for token in tokens]), specials=[unk_token])
        >>> v2.set_default_index(default_index)
        >>> print(v2['<unk>']) #prints 0
        >>> print(v2['out of vocab']) #prints -1
        >>> #make default index same as index of unk_token
        >>> v2.set_default_index(v2[unk_token])
        >>> v2['out of vocab'] is v2[unk_token] #prints True
    """
    specials = specials or []

    # Specials are positioned explicitly below, so they are skipped while
    # scanning the dictionary. Filtering (rather than popping) leaves the
    # caller's `ordered_dict` untouched.
    special_tokens = set(specials)
    regular_tokens = [
        token
        for token, freq in ordered_dict.items()
        if token not in special_tokens and freq >= min_freq
    ]

    if special_first:
        tokens = [*specials, *regular_tokens]
    else:
        tokens = [*regular_tokens, *specials]

    return Vocab(VocabPybind(tokens, None))


def build_vocab_from_iterator(
    iterator: Iterable,
    min_freq: int = 1,
    specials: Optional[List[str]] = None,
    special_first: bool = True,
    max_tokens: Optional[int] = None,
) -> Vocab:
    """
    Count the tokens produced by an iterator and turn the counts into a Vocab.

    Args:
        iterator: Iterator used to build Vocab. Must yield list or iterator of tokens.
        min_freq: The minimum frequency needed to include a token in the vocabulary.
        specials: Special symbols to add. The order of supplied tokens will be preserved.
        special_first: Indicates whether to insert symbols at the beginning or at the end.
        max_tokens: If provided, creates the vocab from the `max_tokens - len(specials)` most frequent tokens.

    Returns:
        torchtext.vocab.Vocab: A `Vocab` object

    Examples:
        >>> #generating vocab from text file
        >>> import io
        >>> from torchtext.vocab import build_vocab_from_iterator
        >>> def yield_tokens(file_path):
        >>>     with io.open(file_path, encoding = 'utf-8') as f:
        >>>         for line in f:
        >>>             yield line.strip().split()
        >>> vocab = build_vocab_from_iterator(yield_tokens(file_path), specials=["<unk>"])
    """
    frequencies = Counter()
    for token_batch in iterator:
        frequencies.update(token_batch)

    specials = specials or []

    # Rank tokens: highest count first, ties resolved by comparing the tokens themselves.
    ranked = sorted(frequencies.items(), key=lambda pair: (-pair[1], pair[0]))

    if max_tokens is not None:
        assert len(specials) < max_tokens, "len(specials) >= max_tokens, so the vocab will be entirely special tokens."
        # Keep only as many counted tokens as fit next to the specials.
        ranked = ranked[: max_tokens - len(specials)]

    return vocab(OrderedDict(ranked), min_freq=min_freq, specials=specials, special_first=special_first)

Docs

Access comprehensive developer documentation for PyTorch

View Docs

Tutorials

Get in-depth tutorials for beginners and advanced developers

View Tutorials

Resources

Find development resources and get your questions answered

View Resources