-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfineweb.py
84 lines (75 loc) · 3.78 KB
/
fineweb.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
"""
FineWeb-Edu dataset (for srs pretraining)
https://huggingface.co/datasets/HuggingFaceFW/fineweb-edu
Downloads and tokenizes the data and saves data shards to disk.
Run simply as:
$ python fineweb.py
Will save shards to the local directory "edu_fineweb10B".
"""
import os
import multiprocessing as mp
import numpy as np
import tiktoken
from datasets import load_dataset # pip install datasets
from tqdm import tqdm # pip install tqdm
# ------------------------------------------
local_dir = "edu_fineweb10B"
remote_name = "sample-10BT"
shard_size = int(1e8) # 100M tokens per shard, total of 100 shards
# create the cache the local directory if it doesn't exist yet
DATA_CACHE_DIR = os.path.join(os.path.dirname(__file__), local_dir)
os.makedirs(DATA_CACHE_DIR, exist_ok=True)
# download the dataset
# 返回的值是一个Dataset对象,可以像dataframe那样访问数据
fw = load_dataset("HuggingFaceFW/fineweb-edu", name=remote_name, split="train")
# init the tokenizer
enc = tiktoken.get_encoding("gpt2")
eot = enc._special_tokens['<|endoftext|>'] # end of text token
def tokenize(doc):
# tokenizes a single document and returns a numpy array of uint16 tokens
tokens = [eot] # the special <|endoftext|> token delimits all documents
tokens.extend(enc.encode_ordinary(doc["text"])) # 在每一个文档开头加上<|endoftext|>
tokens_np = np.array(tokens)
assert (0 <= tokens_np).all() and (tokens_np < 2**16).all(), "token dictionary too large for uint16"
tokens_np_uint16 = tokens_np.astype(np.uint16)
return tokens_np_uint16
def write_datafile(filename, tokens_np):
np.save(filename, tokens_np)
# tokenize all documents and write output shards, each of shard_size tokens (last shard has remainder)
nprocs = max(1, os.cpu_count()//2) # 使用总核心数量的一半
with mp.Pool(nprocs) as pool:
shard_index = 0
# preallocate buffer to hold current shard
all_tokens_np = np.empty((shard_size,), dtype=np.uint16)
token_count = 0 # 当前分片中已存储的token数量
progress_bar = None # 初始化进度条
# 每个进程在每次调用中会处理 16 条数据,每条数据都应用tokenize函数
for tokens in pool.imap(tokenize, fw, chunksize=16):
# is there enough space in the current shard for the new tokens?
if token_count + len(tokens) < shard_size:
# simply append tokens to current shard
all_tokens_np[token_count:token_count+len(tokens)] = tokens
token_count += len(tokens)
# update progress bar
if progress_bar is None:
progress_bar = tqdm(total=shard_size, unit="tokens", desc=f"Shard {shard_index}") # 当前分片的进度条
progress_bar.update(len(tokens))
else:
# write the current shard and start a new one
split = "val" if shard_index == 0 else "train"
filename = os.path.join(DATA_CACHE_DIR, f"edufineweb_{split}_{shard_index:06d}")
# split the document into whatever fits in this shard; the remainder goes to next one
remainder = shard_size - token_count
progress_bar.update(remainder)
all_tokens_np[token_count:token_count+remainder] = tokens[:remainder]
write_datafile(filename, all_tokens_np)
shard_index += 1
progress_bar = None # 初始化进度条
# populate the next shard with the leftovers of the current doc
all_tokens_np[0:len(tokens)-remainder] = tokens[remainder:]
token_count = len(tokens)-remainder
# write any remaining tokens as the last shard
if token_count != 0:
split = "val" if shard_index == 0 else "train"
filename = os.path.join(DATA_CACHE_DIR, f"edufineweb_{split}_{shard_index:06d}")
write_datafile(filename, all_tokens_np[:token_count])