-
Notifications
You must be signed in to change notification settings - Fork 9
/
test_tm2tb.py
157 lines (129 loc) · 5.12 KB
/
test_tm2tb.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
"""TM2TB unit tests"""
import json
from tm2tb import TermExtractor
from tm2tb import BitermExtractor
from tm2tb import BitextReader
from app import app
# Switch the app's testing flag on before any test client is created.
app.testing = True

# Input fixtures: a single JSON object with the English ('en') and
# Spanish ('es') sentences used by the sentence-level tests.
with open('data/test_sentences.json', 'r', encoding='utf8') as fr:
    sentences = json.load(fr)
EN_SENTENCE = sentences['en']
ES_SENTENCE = sentences['es']

# Expected results: one JSON document per line. Iterating the file and
# skipping blank lines keeps the final record even when the file has no
# trailing newline, and avoids a decode error on stray empty lines.
with open('data/test_results.jsonl', 'r', encoding='utf8') as fr:
    RESULTS = [json.loads(line) for line in fr if line.strip()]
def test_api():
    """Post the English/Spanish sentence pair to the app and compare the reply with RESULTS[0]."""
    payload = {
        "src_text": EN_SENTENCE,
        "tgt_text": ES_SENTENCE,
        "src_lang": "en",
        "tgt_lang": "es",
        "similarity_min": 0.8,
    }
    request_headers = {"Content-Type": "application/json"}
    with app.test_client() as client:
        # NOTE(review): the payload is serialized with json.dumps and then
        # passed through the `json` argument; presumably the endpoint expects
        # a JSON-encoded string - confirm against the app.
        # NOTE(review): no URL is passed to post(); presumably the client's
        # default path is the intended endpoint - confirm.
        response = client.post(
            headers=request_headers,
            json=json.dumps(payload),
        )
        assert json.loads(response.text) == RESULTS[0]
def test_en_sentence():
    """Check the first ten terms extracted from the English sentence against RESULTS[1]."""
    top_terms = TermExtractor(EN_SENTENCE).extract_terms()[:10]
    # Index labels are converted to str before comparing with the stored
    # expectation (presumably because it was loaded from JSON - confirm).
    top_terms.index = top_terms.index.map(str)
    assert top_terms.to_dict() == RESULTS[1]
def test_en_sentence_lang_code():
    """Check English sentence extraction when lang='en' is passed explicitly.

    The first ten results are compared with RESULTS[2].
    """
    top_terms = TermExtractor(EN_SENTENCE, lang='en').extract_terms()[:10]
    # Index labels are stringified before the comparison.
    top_terms.index = top_terms.index.map(str)
    assert top_terms.to_dict() == RESULTS[2]
def test_es_sentence():
    """Check the first ten terms extracted from the Spanish sentence against RESULTS[3]."""
    top_terms = TermExtractor(ES_SENTENCE).extract_terms()[:10]
    # Index labels are stringified before the comparison.
    top_terms.index = top_terms.index.map(str)
    assert top_terms.to_dict() == RESULTS[3]
def test_bilingual_sentences():
    """Check biterm extraction on the English/Spanish sentence pair.

    No language codes are supplied; the first ten results are compared
    with RESULTS[4].
    """
    sentence_pair = (EN_SENTENCE, ES_SENTENCE)
    top_biterms = BitermExtractor(sentence_pair).extract_terms()[:10]
    # Index labels are stringified before the comparison.
    top_biterms.index = top_biterms.index.map(str)
    assert top_biterms.to_dict() == RESULTS[4]
def test_bilingual_sentences_lang_codes():
    """Check biterm extraction on the sentence pair with explicit language codes.

    src_lang='en' and tgt_lang='es' are passed to the extractor; the first
    ten results are compared with RESULTS[5].
    """
    sentence_pair = (EN_SENTENCE, ES_SENTENCE)
    top_biterms = BitermExtractor(
        sentence_pair,
        src_lang='en',
        tgt_lang='es',
    ).extract_terms()[:10]
    # Index labels are stringified before the comparison.
    top_biterms.index = top_biterms.index.map(str)
    assert top_biterms.to_dict() == RESULTS[5]
def test_bilingual_csv():
    """Read the English/Spanish .csv bitext and check its first ten biterms against RESULTS[6]."""
    reader = BitextReader('data/test_bitext_en_es.csv')
    top_biterms = BitermExtractor(reader.read_bitext()).extract_terms()[:10]
    # Index labels are stringified before the comparison.
    top_biterms.index = top_biterms.index.map(str)
    assert top_biterms.to_dict() == RESULTS[6]
def test_bilingual_xlsx():
    """Read the English/Spanish .xlsx bitext and check its first ten biterms against RESULTS[6]."""
    reader = BitextReader('data/test_bitext_en_es.xlsx')
    top_biterms = BitermExtractor(reader.read_bitext()).extract_terms()[:10]
    # Index labels are stringified before the comparison.
    top_biterms.index = top_biterms.index.map(str)
    assert top_biterms.to_dict() == RESULTS[6]
def test_bilingual_mqxliff():
    """Read the English/Spanish .mqxliff bitext and check its first ten biterms against RESULTS[6]."""
    reader = BitextReader('data/test_bitext_en_es.mqxliff')
    top_biterms = BitermExtractor(reader.read_bitext()).extract_terms()[:10]
    # Index labels are stringified before the comparison.
    top_biterms.index = top_biterms.index.map(str)
    assert top_biterms.to_dict() == RESULTS[6]
def test_bilingual_mxliff():
    """Read the English/Spanish .mxliff bitext and check its first ten biterms against RESULTS[6]."""
    reader = BitextReader('data/test_bitext_en_es.mxliff')
    top_biterms = BitermExtractor(reader.read_bitext()).extract_terms()[:10]
    # Index labels are stringified before the comparison.
    top_biterms.index = top_biterms.index.map(str)
    assert top_biterms.to_dict() == RESULTS[6]
def test_bilingual_tmx():
    """Read the English/Spanish .tmx bitext and check its first ten biterms against RESULTS[6]."""
    reader = BitextReader('data/test_bitext_en_es.tmx')
    top_biterms = BitermExtractor(reader.read_bitext()).extract_terms()[:10]
    # Index labels are stringified before the comparison.
    top_biterms.index = top_biterms.index.map(str)
    assert top_biterms.to_dict() == RESULTS[6]
def test_en_text():
    """Check monolingual extraction on the English text file against RESULTS[7].

    The file is split on newlines and the resulting list of lines is
    handed to the extractor; only the first ten results are compared.
    """
    with open('data/test_text_en.txt', 'r', encoding='utf8') as handle:
        lines = handle.read().split('\n')
    top_terms = TermExtractor(lines).extract_terms()[:10]
    # Index labels are stringified before the comparison.
    top_terms.index = top_terms.index.map(str)
    assert top_terms.to_dict() == RESULTS[7]
def test_es_text():
    """Check monolingual extraction on the Spanish text file against RESULTS[8].

    The file is split on newlines and the resulting list of lines is
    handed to the extractor; only the first ten results are compared.
    """
    with open('data/test_text_es.txt', 'r', encoding='utf8') as handle:
        lines = handle.read().split('\n')
    top_terms = TermExtractor(lines).extract_terms()[:10]
    # Index labels are stringified before the comparison.
    top_terms.index = top_terms.index.map(str)
    assert top_terms.to_dict() == RESULTS[8]