-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathstorycloze.py
117 lines (98 loc) · 5.56 KB
/
storycloze.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
'''
https://huggingface.co/datasets/LSDSem/story_cloze
This test requires a system to choose the correct ending to a four-sentence story.
Languages: English
需要手动下载数据集
'''
import os
import tiktoken
import torch
import pandas as pd
from tqdm import tqdm
from torch.nn import functional as F
from transformers import GPT2LMHeadModel
class StoryClozeEvaluator:
def iterate_examples(self, path):
'''数据据迭代器'''
data = pd.read_csv(path)
for i in range(len(data)):
example = {}
example["InputSentence1"] = data.iloc[i, :]["InputSentence1"]
example["InputSentence2"] = data.iloc[i, :]["InputSentence2"]
example["InputSentence3"] = data.iloc[i, :]["InputSentence3"]
example["InputSentence4"] = data.iloc[i, :]["InputSentence4"]
example["RandomFifthSentenceQuiz1"] = data.iloc[i, :]["RandomFifthSentenceQuiz1"]
example["RandomFifthSentenceQuiz2"] = data.iloc[i, :]["RandomFifthSentenceQuiz2"]
example["AnswerRightEnding"] = data.iloc[i, :]["AnswerRightEnding"]
yield example
def render_example(self, example, tokenizer):
'''将数据处理为模型所需要的输入形式'''
label = example["AnswerRightEnding"]
input_sentence = [example["InputSentence1"], example["InputSentence2"], example["InputSentence3"], example["InputSentence4"]]
input_sentence = " ".join(input_sentence) # 用空格将四个句子拼接起来
input_tokens = tokenizer.encode(input_sentence)
tok_rows = []
mask_rows = []
for end in [example["RandomFifthSentenceQuiz1"], example["RandomFifthSentenceQuiz2"]]:
end_tokens = tokenizer.encode(" " + end)
tok_rows.append(input_tokens + end_tokens)
mask_rows.append([0] * len(input_tokens) + [1] * len(end_tokens))
max_len = max([len(x) for x in tok_rows])
tokens = torch.zeros((len(tok_rows), max_len), dtype=torch.long)
mask = torch.zeros((len(tok_rows), max_len), dtype=torch.long)
for i, (tok_row, mask_row) in enumerate(zip(tok_rows, mask_rows)):
tokens[i, :len(tok_row)] = torch.tensor(tok_row)
mask[i, :len(mask_row)] = torch.tensor(mask_row)
return tokens, mask, label
def evaluate_in_training(self, tokens, mask, logits):
'''在训练的过程中评估'''
# contiguous 方法在 PyTorch 中用于确保张量在内存中的存储是连续的
shift_logits = (logits[..., :-1, :]).contiguous() # 去掉最后一个时间步的 logits
shift_tokens = (tokens[..., 1:]).contiguous() # 去掉第一个时间步的 tokens
shift_mask = (mask[..., 1:]).contiguous()
# 将 shift_logits 和 shift_tokens 展平,使得每个位置的 logits 和 tokens 对应
flat_shift_logits = shift_logits.view(-1, shift_logits.size(-1))
flat_shift_tokens = shift_tokens.view(-1)
shift_losses = F.cross_entropy(flat_shift_logits, flat_shift_tokens, reduction='none') # 计算所有位置的交叉熵损失,不进行任何归约(reduction='none'),返回每个位置的损失值
shift_losses = shift_losses.view(tokens.size(0), -1) # 损失值恢复为原来的形状
# now get the average loss just for the completion region (where mask == 1), in each row
masked_shift_losses = shift_losses * shift_mask # 将损失值与 mask 相乘,只保留掩码位置的损失
# sum and divide by the number of 1s in the mask
sum_loss = masked_shift_losses.sum(dim=1)
avg_loss = sum_loss / shift_mask.sum(dim=1)
# 得到了每个选项的loss,取得最小损失的选项就是最佳答案
pred = sum_loss.argmin().item() # 找到损失最小的索引
pred_norm = avg_loss.argmin().item() # 找到损失最小的索引
return pred, pred_norm
@torch.no_grad()
def evaluate(self, model, device, tokenizer, data_path):
'''训练结束得到权重文件后,对结果评估'''
num_correct_norm = 0
num_correct = 0
num_total = 0
for example in tqdm(self.iterate_examples(data_path)):
tokens, mask, label = self.render_example(example, tokenizer)
tokens, mask = tokens.to(device), mask.to(device)
label = label - 1 # label只有1和2,使其从0开始
logits = model(tokens).logits
pred, pred_norm = self.evaluate_in_training(tokens, mask, logits)
num_total += 1
num_correct += int(pred == label)
num_correct_norm += int(pred_norm == label)
print(f"{num_total} acc_norm: {num_correct_norm}/{num_total}={num_correct_norm/num_total:.4f}")
if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser()
parser.add_argument("-m", "--model_path", type=str, default="./model/gpt2", help="the model path to use")
parser.add_argument("-d", "--device", type=str, default="cuda", help="the device to use")
args = parser.parse_args()
DATA_CACHE_DIR = os.path.join(os.path.dirname(__file__), "StoryCloze")
PATH = os.path.join(DATA_CACHE_DIR, "cloze_test_val__winter2018-cloze_test_ALL_val - 1 - 1.csv")
enc = tiktoken.get_encoding("gpt2")
torch.set_float32_matmul_precision('high') # use tf32
model = GPT2LMHeadModel.from_pretrained(args.model_path)
model.to(args.device)
model.eval()
# model = torch.compile(model) # optionally torch compile the model
storycloze_evaluator = StoryClozeEvaluator()
storycloze_evaluator.evaluate(model, args.device, enc, PATH)