From 948dd47a50e8754f47cd6bc48f34c6225b30417e Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Sun, 29 Sep 2024 15:59:49 +0300 Subject: [PATCH 1/7] add datasets --- mteb/tasks/Reranking/__init__.py | 1 + .../Reranking/multilingual/ESCIReranking.py | 86 +++++++++++ mteb/tasks/Retrieval/__init__.py | 2 + mteb/tasks/Retrieval/jpn/JaqketRetrieval.py | 46 ++++++ .../Retrieval/multilingual/MrTidyRetrieval.py | 137 ++++++++++++++++++ 5 files changed, 272 insertions(+) create mode 100644 mteb/tasks/Reranking/multilingual/ESCIReranking.py create mode 100644 mteb/tasks/Retrieval/jpn/JaqketRetrieval.py create mode 100644 mteb/tasks/Retrieval/multilingual/MrTidyRetrieval.py diff --git a/mteb/tasks/Reranking/__init__.py b/mteb/tasks/Reranking/__init__.py index f96985d458..a4b302a17f 100644 --- a/mteb/tasks/Reranking/__init__.py +++ b/mteb/tasks/Reranking/__init__.py @@ -8,6 +8,7 @@ from .fra.AlloprofReranking import * from .fra.SyntecReranking import * from .jpn.MMarcoReranking import * +from .multilingual.ESCIReranking import * from .multilingual.MIRACLReranking import * from .multilingual.WikipediaRerankingMultilingual import * from .rus.RuBQReranking import * diff --git a/mteb/tasks/Reranking/multilingual/ESCIReranking.py b/mteb/tasks/Reranking/multilingual/ESCIReranking.py new file mode 100644 index 0000000000..39b7aa17c6 --- /dev/null +++ b/mteb/tasks/Reranking/multilingual/ESCIReranking.py @@ -0,0 +1,86 @@ +from __future__ import annotations + +import logging + +from mteb.abstasks.AbsTaskReranking import AbsTaskReranking +from mteb.abstasks.MultilingualTask import MultilingualTask +from mteb.abstasks.TaskMetadata import TaskMetadata + +logger = logging.getLogger(__name__) + +_EVAL_SPLIT = "test" +_LANGUAGES = { + "us": ["eng-Latn"], + "es": ["spa-Latn"], + "jp": ["jpn-Jpan"], +} + +_CITATION = """@article{reddy2022shopping, + title={Shopping Queries Dataset: A Large-Scale {ESCI} Benchmark for Improving Product Search}, + author={Chandan K. Reddy and Lluís Màrquez and Fran Valero and Nikhil Rao and Hugo Zaragoza and Sambaran Bandyopadhyay and Arnab Biswas and Anlu Xing and Karthik Subbian}, + year={2022}, + eprint={2206.06588}, + archivePrefix={arXiv} +}""" + + +class ESCIReranking(MultilingualTask, AbsTaskReranking): + metadata = TaskMetadata( + name="ESCIReranking", + description="", + reference="https://github.com/amazon-science/esci-data/", + dataset={ + "path": "mteb/esci", + "revision": "237f74be0503482b4e8bc1b83778c7a87ea93fd8", + }, + type="Reranking", + category="s2p", + modalities=["text"], + eval_splits=[_EVAL_SPLIT], + eval_langs=_LANGUAGES, + main_score="NDCG@10", + date=("2022-06-14", "2022-06-14"), + domains=["Written"], + task_subtypes=[], + license="apache-2.0", + annotations_creators="derived", + dialect=[], + sample_creation="created", + bibtex_citation=_CITATION, + descriptive_stats={ + "test": { + "num_samples": 29285, + "num_positive": 29285, + "num_negative": 29285, + "avg_query_len": 19.691890046098685, + "avg_positive_len": 9.268089465596722, + "avg_negative_len": 1.5105002561038074, + "hf_subset_descriptive_stats": { + "us": { + "num_samples": 21296, + "num_positive": 21296, + "num_negative": 21296, + "avg_query_len": 21.440833959429, + "avg_positive_len": 8.892515026296017, + "avg_negative_len": 1.1956705484598047, + }, + "es": { + "num_samples": 3703, + "num_positive": 3703, + "num_negative": 3703, + "avg_query_len": 20.681609505806104, + "avg_positive_len": 10.561706724277613, + "avg_negative_len": 2.749932487172563, + }, + "jp": { + "num_samples": 4286, + "num_positive": 4286, + "num_negative": 4286, + "avg_query_len": 10.146756882874476, + "avg_positive_len": 10.016565562295847, + "avg_negative_len": 2.003966402239851, + }, + }, + } + }, + ) diff --git a/mteb/tasks/Retrieval/__init__.py b/mteb/tasks/Retrieval/__init__.py index 3975cd9bd3..a25eec33b4 100644 --- a/mteb/tasks/Retrieval/__init__.py +++ b/mteb/tasks/Retrieval/__init__.py @@ -94,6 +94,7 @@ from .fra.SyntecRetrieval import * from .hun.HunSum2 import * from .jpn.JaGovFaqsRetrieval import * +from .jpn.JaqketRetrieval import * from .jpn.JaQuADRetrieval import * from .jpn.NLPJournalAbsIntroRetrieval import * from .jpn.NLPJournalTitleAbsRetrieval import * @@ -107,6 +108,7 @@ from .multilingual.MintakaRetrieval import * from .multilingual.MIRACLRetrieval import * from .multilingual.MLQARetrieval import * +from .multilingual.MrTidyRetrieval import * from .multilingual.MultiLongDocRetrieval import * from .multilingual.NeuCLIR2022Retrieval import * from .multilingual.NeuCLIR2023Retrieval import * diff --git a/mteb/tasks/Retrieval/jpn/JaqketRetrieval.py b/mteb/tasks/Retrieval/jpn/JaqketRetrieval.py new file mode 100644 index 0000000000..6c3d6a86ca --- /dev/null +++ b/mteb/tasks/Retrieval/jpn/JaqketRetrieval.py @@ -0,0 +1,46 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskRetrieval import AbsTaskRetrieval +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class JaqketRetrieval(AbsTaskRetrieval): + metadata = TaskMetadata( + name="JaqketRetrieval", + dataset={ + "path": "mteb/jaqket", + "revision": "3a5b92dad489a61e664c05ed2175bc9220230199", + }, + description="JAQKET (JApanese Questions on Knowledge of EnTities) is a QA dataset that is created based on quiz questions.", + reference="https://github.com/kumapo/JAQKET-dataset", + type="Retrieval", + category="s2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["jpn-Jpan"], + main_score="ndcg_at_10", + date=("2023-10-09", "2023-10-09"), + domains=["Encyclopaedic", "Non-fiction", "Written"], + task_subtypes=["Question answering"], + license="cc-by-sa-4.0", + annotations_creators="human-annotated", + dialect=None, + sample_creation="found", + bibtex_citation="""@InProceedings{Kurihara_nlp2020, +author = "鈴木正敏 and 鈴木潤 and 松田耕史 and ⻄田京介 and 井之上直也", +title = "JAQKET: クイズを題材にした日本語 QA データセットの構築", +booktitle = "言語処理学会第26回年次大会", +year = "2020", +url = "https://www.anlp.jp/proceedings/annual_meeting/2020/pdf_dir/P2-24.pdf" +note= "in Japanese" +}""", + descriptive_stats={ + "test": { + "average_document_length": 3747.995228882333, + "average_query_length": 50.70611835506519, + "num_documents": 114229, + "num_queries": 997, + "average_relevant_docs_per_query": 1.0, + } + }, + ) diff --git a/mteb/tasks/Retrieval/multilingual/MrTidyRetrieval.py b/mteb/tasks/Retrieval/multilingual/MrTidyRetrieval.py new file mode 100644 index 0000000000..afb76f2e48 --- /dev/null +++ b/mteb/tasks/Retrieval/multilingual/MrTidyRetrieval.py @@ -0,0 +1,137 @@ +from __future__ import annotations + +import logging + +import datasets + +from mteb.abstasks.AbsTaskRetrieval import AbsTaskRetrieval +from mteb.abstasks.MultilingualTask import MultilingualTask +from mteb.abstasks.TaskMetadata import TaskMetadata + +_EVAL_LANGS = { + "bengali": ["ben-Beng"], + "english": ["eng-Latn"], + "finnish": ["fin-Latn"], + "russian": ["rus-Cyrl"], + "korean": ["kor-Kore"], + "japanese": ["jpn-Jpan"], + "telugu": ["tel-Telu"], + "thai": ["tha-Thai"], + "swahili": ["swa-Latn"], + "arabic": ["ara-Arab"], + "indonesian": ["ind-Latn"], +} +_EVAL_SPLIT = "test" + +logger = logging.getLogger(__name__) + + +def _load_code_search_code_retrieval( + path: str, langs: list, splits: str, cache_dir: str = None, revision: str = None +): + corpus = {lang: {split: {} for split in splits} for lang in langs} + queries = {lang: {split: {} for split in splits} for lang in langs} + relevant_docs = {lang: {split: {} for split in splits} for lang in langs} + + split = _EVAL_SPLIT + + for lang in langs: + qrels_data = datasets.load_dataset( + path, + name=f"{lang}-qrels", + cache_dir=cache_dir, + revision=revision, + trust_remote_code=True, + )[split] + + for row in qrels_data: + query_id = row["query-id"] + doc_id = row["corpus-id"] + score = row["score"] + if query_id not in relevant_docs[lang][split]: + relevant_docs[lang][split][query_id] = {} + relevant_docs[lang][split][query_id][doc_id] = score + + corpus_data = datasets.load_dataset( + path, + name=f"{lang}-corpus", + cache_dir=cache_dir, + revision=revision, + trust_remote_code=True, + )["train"] + + for row in corpus_data: + doc_id = row["_id"] + doc_title = row["title"] + doc_text = row["text"] + corpus[lang][split][doc_id] = {"title": doc_title, "text": doc_text} + + queries_data = datasets.load_dataset( + path, + name=f"{lang}-queries", + cache_dir=cache_dir, + revision=revision, + trust_remote_code=True, + )[split] + + for row in queries_data: + query_id = row["_id"] + query_text = row["text"] + queries[lang][split][query_id] = query_text + + queries = queries + logger.info("Loaded %d %s Queries.", len(queries), split.upper()) + + return corpus, queries, relevant_docs + + +class MrTidyRetrieval(MultilingualTask, AbsTaskRetrieval): + metadata = TaskMetadata( + name="MrTidyRetrieval", + description="Mr. TyDi is a multi-lingual benchmark dataset built on TyDi, covering eleven typologically diverse languages. It is designed for monolingual retrieval, specifically to evaluate ranking with learned dense representations.", + reference="https://huggingface.co/datasets/castorini/mr-tydi", + dataset={ + "path": "mteb/mrtidy", + "revision": "fc24a3ce8f09746410daee3d5cd823ff7a0675b7", + }, + type="Retrieval", + category="s2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=_EVAL_LANGS, + main_score="map", + date=("2023-11-01", "2024-05-15"), + domains=["Encyclopaedic", "Written"], + task_subtypes=[], + license="cc-by-sa-3.0", + annotations_creators="human-annotated", + dialect=[], + sample_creation="found", + bibtex_citation="""@article{mrtydi, + title={{Mr. TyDi}: A Multi-lingual Benchmark for Dense Retrieval}, + author={Xinyu Zhang and Xueguang Ma and Peng Shi and Jimmy Lin}, + year={2021}, + journal={arXiv:2108.08787}, + }""", + descriptive_stats={}, + ) + + def load_data(self, **kwargs): + if self.data_loaded: + return + + self.corpus, self.queries, self.relevant_docs = ( + _load_code_search_code_retrieval( + path=self.metadata_dict["dataset"]["path"], + langs=self.hf_subsets, + splits=self.metadata_dict["eval_splits"], + cache_dir=kwargs.get("cache_dir", None), + revision=self.metadata_dict["dataset"]["revision"], + ) + ) + + self.data_loaded = True + + +if __name__ == "__main__": + print(MrTidyRetrieval().calculate_metadata_metrics()) From ce5c7462f8a9a27d3b4bc8eb21035424e60e42e3 Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Sun, 29 Sep 2024 17:30:30 +0300 Subject: [PATCH 2/7] fix metrics --- .../Reranking/multilingual/ESCIReranking.py | 2 +- .../Retrieval/multilingual/MrTidyRetrieval.py | 22 +++++++------------ 2 files changed, 9 insertions(+), 15 deletions(-) diff --git a/mteb/tasks/Reranking/multilingual/ESCIReranking.py b/mteb/tasks/Reranking/multilingual/ESCIReranking.py index 39b7aa17c6..c3597c2fdf 100644 --- a/mteb/tasks/Reranking/multilingual/ESCIReranking.py +++ b/mteb/tasks/Reranking/multilingual/ESCIReranking.py @@ -38,7 +38,7 @@ class ESCIReranking(MultilingualTask, AbsTaskReranking): modalities=["text"], eval_splits=[_EVAL_SPLIT], eval_langs=_LANGUAGES, - main_score="NDCG@10", + main_score="map", date=("2022-06-14", "2022-06-14"), domains=["Written"], task_subtypes=[], diff --git a/mteb/tasks/Retrieval/multilingual/MrTidyRetrieval.py b/mteb/tasks/Retrieval/multilingual/MrTidyRetrieval.py index afb76f2e48..f7bf5f9dc8 100644 --- a/mteb/tasks/Retrieval/multilingual/MrTidyRetrieval.py +++ b/mteb/tasks/Retrieval/multilingual/MrTidyRetrieval.py @@ -26,7 +26,7 @@ logger = logging.getLogger(__name__) -def _load_code_search_code_retrieval( +def _load_data_retrieval( path: str, langs: list, splits: str, cache_dir: str = None, revision: str = None ): corpus = {lang: {split: {} for split in splits} for lang in langs} @@ -99,7 +99,7 @@ class MrTidyRetrieval(MultilingualTask, AbsTaskRetrieval): modalities=["text"], eval_splits=["test"], eval_langs=_EVAL_LANGS, - main_score="map", + main_score="ndcg_at_10", date=("2023-11-01", "2024-05-15"), domains=["Encyclopaedic", "Written"], task_subtypes=[], @@ -120,18 +120,12 @@ def load_data(self, **kwargs): if self.data_loaded: return - self.corpus, self.queries, self.relevant_docs = ( - _load_code_search_code_retrieval( - path=self.metadata_dict["dataset"]["path"], - langs=self.hf_subsets, - splits=self.metadata_dict["eval_splits"], - cache_dir=kwargs.get("cache_dir", None), - revision=self.metadata_dict["dataset"]["revision"], - ) + self.corpus, self.queries, self.relevant_docs = _load_data_retrieval( + path=self.metadata_dict["dataset"]["path"], + langs=self.hf_subsets, + splits=self.metadata_dict["eval_splits"], + cache_dir=kwargs.get("cache_dir", None), + revision=self.metadata_dict["dataset"]["revision"], ) self.data_loaded = True - - -if __name__ == "__main__": - print(MrTidyRetrieval().calculate_metadata_metrics()) From 456df8214a8168a644e7278e6de1a5f8d7183355 Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Sun, 29 Sep 2024 18:17:10 +0300 Subject: [PATCH 3/7] add Touche2020v3 --- .../Retrieval/eng/Touche2020Retrieval.py | 44 ++++++++++++++++++- 1 file changed, 42 insertions(+), 2 deletions(-) diff --git a/mteb/tasks/Retrieval/eng/Touche2020Retrieval.py b/mteb/tasks/Retrieval/eng/Touche2020Retrieval.py index 2c9dc8df41..7ef2c1b7d9 100644 --- a/mteb/tasks/Retrieval/eng/Touche2020Retrieval.py +++ b/mteb/tasks/Retrieval/eng/Touche2020Retrieval.py @@ -1,9 +1,8 @@ from __future__ import annotations +from mteb.abstasks.AbsTaskRetrieval import AbsTaskRetrieval from mteb.abstasks.TaskMetadata import TaskMetadata -from ....abstasks.AbsTaskRetrieval import AbsTaskRetrieval - class Touche2020(AbsTaskRetrieval): metadata = TaskMetadata( @@ -57,3 +56,44 @@ class Touche2020(AbsTaskRetrieval): }, }, ) + + +class Touche2020v3(AbsTaskRetrieval): + metadata = TaskMetadata( + name="Touche2020v3", + description="Touché Task 1: Argument Retrieval for Controversial Questions", + reference="https://github.com/castorini/touche-error-analysis", + dataset={ + "path": "mteb/webis-touche2020-v3", + "revision": "431886eaecc48f067a3975b70d0949ea2862463c", + }, + type="Retrieval", + category="s2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="ndcg_at_10", + date=None, + domains=None, + task_subtypes=None, + license="cc-by-sa-4.0", + annotations_creators=None, + dialect=None, + sample_creation=None, + bibtex_citation="""@INPROCEEDINGS{Thakur_etal_SIGIR2024, + author = "Nandan Thakur and Luiz Bonifacio and Maik {Fr\"{o}be} and Alexander Bondarenko and Ehsan Kamalloo and Martin Potthast and Matthias Hagen and Jimmy Lin", + title = "Systematic Evaluation of Neural Retrieval Models on the {Touch\'{e}} 2020 Argument Retrieval Subset of {BEIR}", + booktitle = "Proceedings of the 47th International ACM SIGIR Conference on Research and Development in Information Retrieval", + year = 2024, + address_ = "Washington, D.C." +}""", + descriptive_stats={ + "test": { + "average_document_length": 2096.391812518931, + "average_query_length": 43.42857142857143, + "num_documents": 303732, + "num_queries": 49, + "average_relevant_docs_per_query": 34.93877551020408, + } + }, + ) From 38c0b56bb317ab1b894f47bd523619645421e3a8 Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Sun, 29 Sep 2024 22:45:33 +0300 Subject: [PATCH 4/7] fix metadata --- .../Retrieval/eng/Touche2020Retrieval.py | 26 +++++++++---------- mteb/tasks/Retrieval/jpn/JaqketRetrieval.py | 2 +- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/mteb/tasks/Retrieval/eng/Touche2020Retrieval.py b/mteb/tasks/Retrieval/eng/Touche2020Retrieval.py index 7ef2c1b7d9..a1277406f1 100644 --- a/mteb/tasks/Retrieval/eng/Touche2020Retrieval.py +++ b/mteb/tasks/Retrieval/eng/Touche2020Retrieval.py @@ -19,13 +19,13 @@ class Touche2020(AbsTaskRetrieval): eval_splits=["test"], eval_langs=["eng-Latn"], main_score="ndcg_at_10", - date=None, - domains=None, - task_subtypes=None, - license=None, - annotations_creators=None, - dialect=None, - sample_creation=None, + date=("2020-09-23", "2020-09-23"), + domains=["Academic"], + task_subtypes=["Question answering"], + license="cc-by-sa-4.0", + annotations_creators="human-annotated", + dialect=[], + sample_creation="found", bibtex_citation="""@dataset{potthast_2022_6862281, author = {Potthast, Martin and Gienapp, Lukas and @@ -73,13 +73,13 @@ class Touche2020v3(AbsTaskRetrieval): eval_splits=["test"], eval_langs=["eng-Latn"], main_score="ndcg_at_10", - date=None, - domains=None, - task_subtypes=None, + date=("2020-09-23", "2020-09-23"), + domains=["Academic"], + task_subtypes=["Question answering"], license="cc-by-sa-4.0", - annotations_creators=None, - dialect=None, - sample_creation=None, + annotations_creators="human-annotated", + dialect=[], + sample_creation="found", bibtex_citation="""@INPROCEEDINGS{Thakur_etal_SIGIR2024, author = "Nandan Thakur and Luiz Bonifacio and Maik {Fr\"{o}be} and Alexander Bondarenko and Ehsan Kamalloo and Martin Potthast and Matthias Hagen and Jimmy Lin", title = "Systematic Evaluation of Neural Retrieval Models on the {Touch\'{e}} 2020 Argument Retrieval Subset of {BEIR}", diff --git a/mteb/tasks/Retrieval/jpn/JaqketRetrieval.py b/mteb/tasks/Retrieval/jpn/JaqketRetrieval.py index 6c3d6a86ca..0af7d06772 100644 --- a/mteb/tasks/Retrieval/jpn/JaqketRetrieval.py +++ b/mteb/tasks/Retrieval/jpn/JaqketRetrieval.py @@ -24,7 +24,7 @@ class JaqketRetrieval(AbsTaskRetrieval): task_subtypes=["Question answering"], license="cc-by-sa-4.0", annotations_creators="human-annotated", - dialect=None, + dialect=[], sample_creation="found", bibtex_citation="""@InProceedings{Kurihara_nlp2020, author = "鈴木正敏 and 鈴木潤 and 松田耕史 and ⻄田京介 and 井之上直也", From fbe737edb0750a9596ec5793266563a6d8a62fa3 Mon Sep 17 00:00:00 2001 From: Roman Solomatin Date: Thu, 3 Oct 2024 14:15:02 +0300 Subject: [PATCH 5/7] Apply suggestions from code review Co-authored-by: Kenneth Enevoldsen --- mteb/tasks/Retrieval/eng/Touche2020Retrieval.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mteb/tasks/Retrieval/eng/Touche2020Retrieval.py b/mteb/tasks/Retrieval/eng/Touche2020Retrieval.py index a1277406f1..045ac083ab 100644 --- a/mteb/tasks/Retrieval/eng/Touche2020Retrieval.py +++ b/mteb/tasks/Retrieval/eng/Touche2020Retrieval.py @@ -58,9 +58,9 @@ class Touche2020(AbsTaskRetrieval): ) -class Touche2020v3(AbsTaskRetrieval): +class Touche2020v3Retrieval(AbsTaskRetrieval): metadata = TaskMetadata( - name="Touche2020v3", + name="Touche2020v3Retrieval", description="Touché Task 1: Argument Retrieval for Controversial Questions", reference="https://github.com/castorini/touche-error-analysis", dataset={ From 1a56a469a1467ae3f9faad0b47fcf0a38e0204bf Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Thu, 3 Oct 2024 14:22:00 +0300 Subject: [PATCH 6/7] upd name and supress --- mteb/tasks/Retrieval/eng/Touche2020Retrieval.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mteb/tasks/Retrieval/eng/Touche2020Retrieval.py b/mteb/tasks/Retrieval/eng/Touche2020Retrieval.py index 045ac083ab..01b955b19d 100644 --- a/mteb/tasks/Retrieval/eng/Touche2020Retrieval.py +++ b/mteb/tasks/Retrieval/eng/Touche2020Retrieval.py @@ -5,6 +5,8 @@ class Touche2020(AbsTaskRetrieval): + superseded_by = "Touche2020Retrieval.v3" + metadata = TaskMetadata( name="Touche2020", description="Touché Task 1: Argument Retrieval for Controversial Questions", @@ -60,7 +62,7 @@ class Touche2020(AbsTaskRetrieval): class Touche2020v3Retrieval(AbsTaskRetrieval): metadata = TaskMetadata( - name="Touche2020v3Retrieval", + name="Touche2020Retrieval.v3", description="Touché Task 1: Argument Retrieval for Controversial Questions", reference="https://github.com/castorini/touche-error-analysis", dataset={ From 049c914a58f21987a0c37122822b7e8ee9f4fc6b Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Thu, 3 Oct 2024 14:33:33 +0300 Subject: [PATCH 7/7] add benchmark class --- mteb/benchmarks/benchmarks.py | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/mteb/benchmarks/benchmarks.py b/mteb/benchmarks/benchmarks.py index ccb266aacb..c40766045c 100644 --- a/mteb/benchmarks/benchmarks.py +++ b/mteb/benchmarks/benchmarks.py @@ -687,3 +687,37 @@ def __getitem__(self, index): reference=None, citation=None, ) + +MTEB_JPN = Benchmark( + name="MTEB(jpn)", + tasks=get_tasks( + languages=["jpn"], + tasks=[ + # clustering + "LivedoorNewsClustering.v2", + "MewsC16JaClustering", + # classification + "AmazonReviewsClassification", + "AmazonCounterfactualClassification", + "MassiveIntentClassification", + "MassiveScenarioClassification", + # STS + "JSTS", + "JSICK", + # pair classification + "PawsXPairClassification", + # retrieval + "JaqketRetrieval", + "MrTidyRetrieval", + "JaGovFaqsRetrieval", + "NLPJournalTitleAbsRetrieval", + "NLPJournalAbsIntroRetrieval", + "NLPJournalTitleIntroRetrieval", + # reranking + "ESCIReranking", + ], + ), + description="Main Japanese benchmarks from MTEB", + reference="https://github.com/sbintuitions/JMTEB", + citation=None, +)