Source code for torchtext.datasets.unsupervised_learning

import os

import torch

from torchtext.data.functional import custom_replace, simple_space_split
from torchtext.utils import download_from_url, extract_archive
from torchtext.vocab import build_vocab_from_iterator


# Normalization rules for the raw enwik9 XML dump, in the spirit of the Large
# Text Compression Benchmark's wikifil.pl: strip wiki/HTML markup, unescape
# HTML entities, lowercase A-Z, spell out digits, and collapse any remaining
# non-letter runs into single spaces.
_patterns = [(r'<.*>', ''),
             (r'&amp;', '&'),
             (r'&lt;', '<'),
             (r'&gt;', '>'),
             (r'<ref[^<]*<\/ref>', ''),
             (r'<[^>]*>', ''),
             (r'\[http:[^] ]*', '['),
             (r'\|thumb', ''),
             (r'\|left', ''),
             (r'\|right', ''),
             (r'\|\d+px', ''),
             (r'\[\[image:[^\[\]]*\|', ''),
             (r'\[\[category:([^|\]]*)[^]]*\]\]', r'[[\1]]'),  # keep category name (re.sub group 1, not Perl's $1)
             (r'\[\[[a-z\-]*:[^\]]*\]\]', ''),
             (r'\[\[[^\|\]]*\|', '[['),
             (r'\{\{[^\}]*\}\}', ''),
             (r'\{[^\}]*\}', ''),
             (r'\[', ''),
             (r'\]', ''),
             (r'&[^;]*;', ' '),
             (r'A', 'a'), (r'B', 'b'), (r'C', 'c'),
             (r'D', 'd'), (r'E', 'e'), (r'F', 'f'),
             (r'G', 'g'), (r'H', 'h'), (r'I', 'i'),
             (r'J', 'j'), (r'K', 'k'), (r'L', 'l'),
             (r'M', 'm'), (r'N', 'n'), (r'O', 'o'),
             (r'P', 'p'), (r'Q', 'q'), (r'R', 'r'),
             (r'S', 's'), (r'T', 't'), (r'U', 'u'),
             (r'V', 'v'), (r'W', 'w'), (r'X', 'x'),
             (r'Y', 'y'), (r'Z', 'z'),
             (r'0', ' zero '), (r'1', ' one '), (r'2', ' two '),
             (r'3', ' three '), (r'4', ' four '), (r'5', ' five '),
             (r'6', ' six '), (r'7', ' seven '), (r'8', ' eight '),
             (r'9', ' nine '),
             (r'[^a-z\n]+', ' '),
             (r'\n ', ''),
             (r'\s+', ' '),
             (r'\n\s*\n', r'\n')
             ]
enwik9_norm_transform = custom_replace(_patterns)
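# A quick, hedged illustration of the transform on a made-up input string
# (not taken from the dataset); the expected output follows from the
# patterns above:
#
#     >>> list(enwik9_norm_transform(['Hello, World 2!']))
#     ['hello world two ']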


def generate_offsets(filename):
    # Record the byte offset of the start of every line so that callers can
    # later seek() directly to any line without re-reading the whole file.
    offsets = []
    with open(filename) as f:
        offsets.append(f.tell())
        while f.readline():
            offsets.append(f.tell())
    return offsets


def read_lines_from_iterator(data_path, offsets, begin_line, num_lines):
    # Seek straight to `begin_line` using the precomputed offsets, then
    # lazily yield the next `num_lines` lines.
    with open(data_path) as f:
        f.seek(offsets[begin_line])
        for _ in range(num_lines):
            yield f.readline()
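# Together, generate_offsets and read_lines_from_iterator give cheap random
# access into the normalized file. A hedged sketch, assuming a preprocessed
# file already exists at the illustrative path below:
#
#     >>> offsets = generate_offsets('.data/norm_enwik9')
#     >>> for line in read_lines_from_iterator('.data/norm_enwik9',
#     ...                                      offsets, begin_line=100,
#     ...                                      num_lines=5):
#     ...     print(line, end='')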


def preprocess_raw_enwik9(input_filename, output_filename):
    # Normalize the raw dump line by line, dropping lines that are empty
    # after cleaning and trimming the single leading space the patterns
    # can leave behind.
    with open(input_filename, 'r') as f1, open(output_filename, 'w') as f2:
        for line in f1:
            line = list(enwik9_norm_transform([line]))[0]
            if line != ' ' and line != '':
                if line[0] == ' ':
                    line = line[1:]
                f2.write(line + '\n')
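# Sketch of the preprocessing step in isolation, assuming the raw dump has
# already been extracted (both paths below are illustrative):
#
#     >>> preprocess_raw_enwik9('.data/enwik9', '.data/norm_enwik9')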


class EnWik9(torch.utils.data.Dataset):
    r"""The first 10^9 bytes of enwiki-20060303-pages-articles.xml.
    It is part of the Large Text Compression Benchmark project.
    """

    def __init__(self, begin_line=0, num_lines=6348957, root='.data'):
        """Initiate EnWik9 dataset.

        Arguments:
            begin_line: the line number to start reading from. Default: 0
            num_lines: the number of lines to be loaded. Default: 6348957
            root: Directory where the datasets are saved. Default: ".data"

        Examples:
            >>> from torchtext.datasets import EnWik9
            >>> enwik9 = EnWik9(num_lines=20000)
            >>> vocab = enwik9.get_vocab()
        """
        super(EnWik9, self).__init__()

        processed_file = os.path.join(root, 'norm_enwik9')
        if not os.path.exists(processed_file):
            # Download, unzip and normalize the raw dump on first use.
            url = 'http://mattmahoney.net/dc/enwik9.zip'
            dataset_zip = download_from_url(
                url, path=os.path.join(root, 'enwik9.zip'), root=root)
            extracted_file = extract_archive(dataset_zip)
            raw_file = extracted_file[0]
            preprocess_raw_enwik9(raw_file, processed_file)

        # Load the requested slice of lines and split them into tokens.
        offsets = generate_offsets(processed_file)
        read_lines = read_lines_from_iterator(
            processed_file, offsets, begin_line, num_lines)

        self._data = []
        for item in simple_space_split(read_lines):
            self._data += item

        self._vocab = None

    def __getitem__(self, i):
        return self._data[i]

    def __len__(self):
        return len(self._data)

    def __iter__(self):
        for x in self._data:
            yield x

    def get_vocab(self):
        if self._vocab is None:
            self._vocab = build_vocab_from_iterator([self._data])
        return self._vocab
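# Minimal end-to-end sketch. Note the first instantiation downloads the
# ~1 GB enwik9 archive and preprocesses it under `root`, so the first run
# is slow; `num_lines=20000` is an arbitrary small slice:
#
#     >>> dataset = EnWik9(num_lines=20000, root='.data')
#     >>> len(dataset)                 # number of whitespace-split tokens
#     >>> vocab = dataset.get_vocab()  # legacy Vocab with stoi/itos lookups
#     >>> vocab.stoi['the']            # integer id of a common token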
