Shortcuts

Source code for torchtext.vocab.vocab_factory

from .vocab import Vocab
from typing import Dict, Iterable, Optional, List
from collections import Counter, OrderedDict
from torchtext._torchtext import (
    Vocab as VocabPybind,
)


def vocab(ordered_dict: Dict, min_freq: int = 1) -> Vocab:
    r"""Factory method for creating a vocab object which maps tokens to indices.

    Tokens are taken in the order in which key/value pairs were inserted
    into ``ordered_dict``; that order is respected when the vocab is built.
    If sorting by token frequency matters to the caller, ``ordered_dict``
    must already be constructed in that order.

    Args:
        ordered_dict: Ordered dictionary mapping tokens to their
            corresponding occurrence frequencies.
        min_freq: The minimum frequency needed to include a token in the
            vocabulary. Tokens whose frequency is below this are dropped.

    Returns:
        torchtext.vocab.Vocab: A `Vocab` object

    Examples:
        >>> from torchtext.vocab import vocab
        >>> from collections import Counter, OrderedDict
        >>> counter = Counter(["a", "a", "b", "b", "b"])
        >>> sorted_by_freq_tuples = sorted(counter.items(), key=lambda x: x[1], reverse=True)
        >>> ordered_dict = OrderedDict(sorted_by_freq_tuples)
        >>> v1 = vocab(ordered_dict)
        >>> print(v1['a']) #prints 1
        >>> print(v1['out of vocab']) #raise RuntimeError since default index is not set
        >>> tokens = ['e', 'd', 'c', 'b', 'a']
        >>> v2 = vocab(OrderedDict([(token, 1) for token in tokens]))
        >>> #adding <unk> token and default index
        >>> unk_token = '<unk>'
        >>> default_index = -1
        >>> if unk_token not in v2: v2.insert_token(unk_token, 0)
        >>> v2.set_default_index(default_index)
        >>> print(v2['<unk>']) #prints 0
        >>> print(v2['out of vocab']) #prints -1
        >>> #make default index same as index of unk_token
        >>> v2.set_default_index(v2[unk_token])
        >>> v2['out of vocab'] is v2[unk_token] #prints True
    """
    # Keep only sufficiently frequent tokens, preserving the mapping's
    # iteration order.
    kept_tokens = [
        token for token, count in ordered_dict.items() if count >= min_freq
    ]
    # The second argument is passed as None; presumably this means "no
    # default index" (see the out-of-vocab example above) -- confirm against
    # the VocabPybind binding.
    return Vocab(VocabPybind(kept_tokens, None))
def build_vocab_from_iterator(
    iterator: Iterable,
    min_freq: int = 1,
    specials: Optional[List[str]] = None,
    special_first: bool = True,
) -> Vocab:
    """
    Build a Vocab from an iterator.

    Token frequencies are counted over everything the iterator yields.
    Regular tokens are ordered by descending frequency, with ties broken by
    ascending token order. Special symbols are excluded from the counts and
    then re-inserted with a frequency of ``min_freq`` so they are never
    filtered out.

    Args:
        iterator: Iterator used to build Vocab. Must yield list or iterator
            of tokens.
        min_freq: The minimum frequency needed to include a token in the
            vocabulary.
        specials: Special symbols to add. The order of supplied tokens will
            be preserved.
        special_first: Indicates whether to insert symbols at the beginning
            or at the end.

    Returns:
        torchtext.vocab.Vocab: A `Vocab` object

    Examples:
        >>> #generating vocab from text file
        >>> import io
        >>> from torchtext.vocab import build_vocab_from_iterator
        >>> def yield_tokens(file_path):
        >>>     with io.open(file_path, encoding = 'utf-8') as f:
        >>>         for line in f:
        >>>             yield line.strip().split()
        >>> vocab = build_vocab_from_iterator(yield_tokens(file_path), specials=["<unk>"])
    """
    frequencies = Counter()
    for token_batch in iterator:
        frequencies.update(token_batch)

    if specials is not None:
        # Counter's `del` does not raise for absent keys, so specials that
        # never occurred in the data are harmless here.
        for special in specials:
            del frequencies[special]

    # Two stable passes: alphabetical first, then descending frequency, so
    # equally frequent tokens stay in alphabetical order.
    ranked_pairs = list(frequencies.items())
    ranked_pairs.sort(key=lambda pair: pair[0])
    ranked_pairs.sort(key=lambda pair: pair[1], reverse=True)
    token_freqs = OrderedDict(ranked_pairs)

    if specials is not None:
        # When prepending, walk the specials backwards: each one is pushed to
        # the very front, so the last one pushed ends up first.
        insertion_order = specials[::-1] if special_first else specials
        for special in insertion_order:
            token_freqs[special] = min_freq
            token_freqs.move_to_end(special, last=not special_first)

    return vocab(token_freqs, min_freq=min_freq)

Docs

Access comprehensive developer documentation for PyTorch

View Docs

Tutorials

Get in-depth tutorials for beginners and advanced developers

View Tutorials

Resources

Find development resources and get your questions answered

View Resources