-
Notifications
You must be signed in to change notification settings - Fork 0
/
training_nli.py
153 lines (120 loc) · 5.31 KB
/
training_nli.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
"""
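The system trains BERT (or any other transformer model like RoBERTa, DistilBERT etc.) on the SNLI + MultiNLI (AllNLI) dataset
with a softmax loss function. Every 1000 training steps, the model is evaluated on the
STS benchmark dataset.

NOTE: in this copy of the script the training section is commented out; only the final
step runs, which loads an already saved model and evaluates it.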
Usage:
python training_nli.py
OR
python training_nli.py pretrained_transformer_model_name
"""
from torch.utils.data import DataLoader
import math
from sentence_transformers import models, losses
from sentence_transformers import LoggingHandler, SentenceTransformer, util, InputExample
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
import logging
from datetime import datetime
import sys
import os
import gzip
import csv
from tqdm import tqdm
# Logging setup: INFO-level records, each prefixed with a timestamp, are
# handed to the LoggingHandler imported from sentence_transformers so that
# debug information is printed while the script runs.
logging.basicConfig(
    level=logging.INFO,
    handlers=[LoggingHandler()],
    format="%(asctime)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)
# Local tab-separated data files. The variable names are inherited from the
# AllNLI / STSbenchmark example; judging by the file names they now point at
# processed en-es train and dev splits.
nli_dataset_path = "data/processed/train_en_es.tsv"
sts_dataset_path = "data/processed/dev_en_es.tsv"

# Automatic download of the public archives is disabled:
# if not os.path.exists(nli_dataset_path):
#     util.http_get("https://sbert.net/datasets/AllNLI.tsv.gz", nli_dataset_path)
# if not os.path.exists(sts_dataset_path):
#     util.http_get("https://sbert.net/datasets/stsbenchmark.tsv.gz", sts_dataset_path)

# Any huggingface/transformers pre-trained model name may be passed as the
# first command-line argument (for example bert-base-uncased, roberta-base,
# xlm-roberta-base); without an argument the default below is used.
model_name = "bert-base-uncased"
if len(sys.argv) > 1:
    model_name = sys.argv[1]

train_batch_size = 16

# Timestamped output directory. "/" in the model name is replaced with "-"
# so the name contributes a single path component.
model_save_path = "".join(
    (
        "output/training_nli_",
        model_name.replace("/", "-"),
        "-",
        datetime.now().strftime("%Y-%m-%d_%H-%M-%S"),
    )
)
# Token-level encoder: a Huggingface/transformers model (like BERT, RoBERTa,
# XLNet, XLM-R) that maps tokens to embeddings.
word_embedding_model = models.Transformer(model_name)

# Mean pooling over the token embeddings gives one fixed-size sentence
# vector; CLS-token and max pooling are switched off.
# NOTE(review): both modules are only consumed by the SentenceTransformer(...)
# construction, which is currently commented out further down.
pooling_model = models.Pooling(
    word_embedding_model.get_word_embedding_dimension(),
    pooling_mode_max_tokens=False,
    pooling_mode_cls_token=False,
    pooling_mode_mean_tokens=True,
)
# model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
# # Read the AllNLI.tsv.gz file and create the training dataset
# logging.info("Read AllNLI train dataset")
# Integer class id for each NLI label string, numbered in the order listed.
label2int = {
    label_name: class_id
    for class_id, label_name in enumerate(("contradiction", "entailment", "neutral"))
}
# train_samples = []
# with open(nli_dataset_path) as f:
# #test_dict = json.load(f)
# l = f.readline()
# #pdb.set_trace()
# for item in tqdm(f):
# premise, hypothesis, label, _ = item.strip().split("\t")
# label_id = label2int[label]
# train_samples.append(InputExample(texts=[premise, hypothesis], label=label_id))
# train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=train_batch_size)
# train_loss = losses.SoftmaxLoss(
# model=model, sentence_embedding_dimension=model.get_sentence_embedding_dimension(), num_labels=len(label2int)
# )
# # Read STSbenchmark dataset and use it as development set
# logging.info("Read STSbenchmark dev dataset")
# dev_samples = []
# # with gzip.open(sts_dataset_path, "rt", encoding="utf8") as fIn:
# # reader = csv.DictReader(fIn, delimiter="\t", quoting=csv.QUOTE_NONE)
# # for row in reader:
# # if row["split"] == "dev":
# # score = float(row["score"]) / 5.0 # Normalize score to range 0 ... 1
# # dev_samples.append(InputExample(texts=[row["sentence1"], row["sentence2"]], label=score))
# with open(sts_dataset_path) as f:
# #test_dict = json.load(f)
# l = f.readline()
# #pdb.set_trace()
# for item in tqdm(f):
# premise, hypothesis, label, _ = item.strip().split("\t")
# label_id = label2int[label]
# dev_samples.append(InputExample(texts=[premise, hypothesis], label=label_id))
# dev_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
# dev_samples, batch_size=train_batch_size, name="sts-dev"
# )
# # Configure the training
# num_epochs = 1
# warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1) # 10% of train data for warm-up
# logging.info("Warmup-steps: {}".format(warmup_steps))
# # Train the model
# model.fit(
# train_objectives=[(train_dataloader, train_loss)],
# evaluator=dev_evaluator,
# epochs=num_epochs,
# evaluation_steps=1000,
# warmup_steps=warmup_steps,
# output_path=model_save_path,
# )
##############################################################################
#
# Load the stored model and evaluate its performance on the test file
#
##############################################################################


def _read_nli_examples(path):
    """Read a tab-separated NLI file into a list of InputExample objects.

    The first line is treated as a header and skipped. Every following line
    must split into exactly four tab-separated columns: premise, hypothesis,
    label, and one trailing column that is ignored.

    Args:
        path: location of the TSV file to read.

    Returns:
        A list with one InputExample per data line, whose label is the
        integer looked up in ``label2int``.

    Raises:
        ValueError: if a line does not split into exactly four columns.
        KeyError: if a label string is not a key of ``label2int``.
    """
    examples = []
    # Explicit UTF-8 so non-ASCII text is decoded the same way on every
    # platform instead of depending on the locale's default encoding.
    with open(path, encoding="utf8") as f:
        next(f, None)  # skip the header row; harmless on an empty file
        for line in tqdm(f):
            premise, hypothesis, label, _ = line.strip().split("\t")
            examples.append(InputExample(texts=[premise, hypothesis], label=label2int[label]))
    return examples


# Read the test split here, not sts_dataset_path (the dev file): the samples
# feed the evaluator named "sts-test" below.
# NOTE(review): assumes the test file has the same 4-column layout as the
# dev file - confirm.
test_dataset_path = "data/processed/test_aym.tsv"
test_samples = _read_nli_examples(test_dataset_path)

# Evaluate a previously saved checkpoint; this replaces any earlier value of
# model_save_path.
model_save_path = "output/training_nli_bert-base-multilingual-cased-2024-01-11_16-04-54/"
model = SentenceTransformer(model_save_path)

# NOTE(review): EmbeddingSimilarityEvaluator compares embedding similarity
# against the example labels, but the labels here are categorical class ids
# (0/1/2 from label2int), not graded similarity scores - confirm this is the
# metric actually wanted.
test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
    test_samples, batch_size=train_batch_size, name="sts-test"
)
# output_path points at the checkpoint directory loaded above.
test_evaluator(model, output_path=model_save_path)