fix: Add Touche2020v3 and JMTEB #1262

Merged: 9 commits, Oct 3, 2024
34 changes: 34 additions & 0 deletions mteb/benchmarks/benchmarks.py
@@ -687,3 +687,37 @@ def __getitem__(self, index):
    reference=None,
    citation=None,
)

MTEB_JPN = Benchmark(
    name="MTEB(jpn)",
    tasks=get_tasks(
        languages=["jpn"],
        tasks=[
            # clustering
            "LivedoorNewsClustering.v2",
            "MewsC16JaClustering",
            # classification
            "AmazonReviewsClassification",
            "AmazonCounterfactualClassification",
            "MassiveIntentClassification",
            "MassiveScenarioClassification",
            # STS
            "JSTS",
            "JSICK",
            # pair classification
            "PawsXPairClassification",
            # retrieval
            "JaqketRetrieval",
            "MrTidyRetrieval",
            "JaGovFaqsRetrieval",
            "NLPJournalTitleAbsRetrieval",
            "NLPJournalAbsIntroRetrieval",
            "NLPJournalTitleIntroRetrieval",
            # reranking
            "ESCIReranking",
        ],
    ),
    description="Main Japanese benchmarks from MTEB",
    reference="https://github.com/sbintuitions/JMTEB",
    citation=None,
)
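
For context, a minimal sketch of running the new benchmark end to end. This assumes mteb's get_benchmark helper; the model name is illustrative only and not part of this PR:

import mteb
from sentence_transformers import SentenceTransformer

# Resolve the newly registered benchmark by name.
benchmark = mteb.get_benchmark("MTEB(jpn)")

# Any SentenceTransformers-compatible encoder works; this one is illustrative.
model = SentenceTransformer("intfloat/multilingual-e5-small")

evaluation = mteb.MTEB(tasks=benchmark.tasks)
evaluation.run(model, output_folder="results/mteb-jpn")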
1 change: 1 addition & 0 deletions mteb/tasks/Reranking/__init__.py
@@ -8,6 +8,7 @@
from .fra.AlloprofReranking import *
from .fra.SyntecReranking import *
from .jpn.MMarcoReranking import *
from .multilingual.ESCIReranking import *
from .multilingual.MIRACLReranking import *
from .multilingual.WikipediaRerankingMultilingual import *
from .rus.RuBQReranking import *
86 changes: 86 additions & 0 deletions mteb/tasks/Reranking/multilingual/ESCIReranking.py
@@ -0,0 +1,86 @@
from __future__ import annotations

import logging

from mteb.abstasks.AbsTaskReranking import AbsTaskReranking
from mteb.abstasks.MultilingualTask import MultilingualTask
from mteb.abstasks.TaskMetadata import TaskMetadata

logger = logging.getLogger(__name__)

_EVAL_SPLIT = "test"
_LANGUAGES = {
    "us": ["eng-Latn"],
    "es": ["spa-Latn"],
    "jp": ["jpn-Jpan"],
}

_CITATION = """@article{reddy2022shopping,
  title={Shopping Queries Dataset: A Large-Scale {ESCI} Benchmark for Improving Product Search},
  author={Chandan K. Reddy and Lluís Màrquez and Fran Valero and Nikhil Rao and Hugo Zaragoza and Sambaran Bandyopadhyay and Arnab Biswas and Anlu Xing and Karthik Subbian},
  year={2022},
  eprint={2206.06588},
  archivePrefix={arXiv}
}"""


class ESCIReranking(MultilingualTask, AbsTaskReranking):
    metadata = TaskMetadata(
        name="ESCIReranking",
        description="",
        reference="https://github.com/amazon-science/esci-data/",
        dataset={
            "path": "mteb/esci",
            "revision": "237f74be0503482b4e8bc1b83778c7a87ea93fd8",
        },
        type="Reranking",
        category="s2p",
        modalities=["text"],
        eval_splits=[_EVAL_SPLIT],
        eval_langs=_LANGUAGES,
        main_score="map",
        date=("2022-06-14", "2022-06-14"),
        domains=["Written"],
        task_subtypes=[],
        license="apache-2.0",
        annotations_creators="derived",
        dialect=[],
        sample_creation="created",
        bibtex_citation=_CITATION,
        descriptive_stats={
            "test": {
                "num_samples": 29285,
                "num_positive": 29285,
                "num_negative": 29285,
                "avg_query_len": 19.691890046098685,
                "avg_positive_len": 9.268089465596722,
                "avg_negative_len": 1.5105002561038074,
                "hf_subset_descriptive_stats": {
                    "us": {
                        "num_samples": 21296,
                        "num_positive": 21296,
                        "num_negative": 21296,
                        "avg_query_len": 21.440833959429,
                        "avg_positive_len": 8.892515026296017,
                        "avg_negative_len": 1.1956705484598047,
                    },
                    "es": {
                        "num_samples": 3703,
                        "num_positive": 3703,
                        "num_negative": 3703,
                        "avg_query_len": 20.681609505806104,
                        "avg_positive_len": 10.561706724277613,
                        "avg_negative_len": 2.749932487172563,
                    },
                    "jp": {
                        "num_samples": 4286,
                        "num_positive": 4286,
                        "num_negative": 4286,
                        "avg_query_len": 10.146756882874476,
                        "avg_positive_len": 10.016565562295847,
                        "avg_negative_len": 2.003966402239851,
                    },
                },
            }
        },
    )
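
A quick way to check that the task registers correctly once the import above lands; a sketch, with the printed values following from the metadata:

import mteb

# Resolve the new reranking task by name through the task registry.
task = mteb.get_tasks(tasks=["ESCIReranking"])[0]
print(task.metadata.main_score)          # map
print(sorted(task.metadata.eval_langs))  # ['es', 'jp', 'us']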
2 changes: 2 additions & 0 deletions mteb/tasks/Retrieval/__init__.py
@@ -94,6 +94,7 @@
from .fra.SyntecRetrieval import *
from .hun.HunSum2 import *
from .jpn.JaGovFaqsRetrieval import *
from .jpn.JaqketRetrieval import *
from .jpn.JaQuADRetrieval import *
from .jpn.NLPJournalAbsIntroRetrieval import *
from .jpn.NLPJournalTitleAbsRetrieval import *
@@ -107,6 +108,7 @@
from .multilingual.MintakaRetrieval import *
from .multilingual.MIRACLRetrieval import *
from .multilingual.MLQARetrieval import *
from .multilingual.MrTidyRetrieval import *
from .multilingual.MultiLongDocRetrieval import *
from .multilingual.NeuCLIR2022Retrieval import *
from .multilingual.NeuCLIR2023Retrieval import *
60 changes: 51 additions & 9 deletions mteb/tasks/Retrieval/eng/Touche2020Retrieval.py
@@ -1,11 +1,12 @@
 from __future__ import annotations
 
+from mteb.abstasks.AbsTaskRetrieval import AbsTaskRetrieval
 from mteb.abstasks.TaskMetadata import TaskMetadata
 
-from ....abstasks.AbsTaskRetrieval import AbsTaskRetrieval
-
 
 class Touche2020(AbsTaskRetrieval):
+    superseded_by = "Touche2020Retrieval.v3"
+
     metadata = TaskMetadata(
         name="Touche2020",
         description="Touché Task 1: Argument Retrieval for Controversial Questions",
@@ -20,13 +21,13 @@ class Touche2020(AbsTaskRetrieval):
         eval_splits=["test"],
         eval_langs=["eng-Latn"],
         main_score="ndcg_at_10",
-        date=None,
-        domains=None,
-        task_subtypes=None,
-        license=None,
-        annotations_creators=None,
-        dialect=None,
-        sample_creation=None,
+        date=("2020-09-23", "2020-09-23"),
+        domains=["Academic"],
+        task_subtypes=["Question answering"],
+        license="cc-by-sa-4.0",
+        annotations_creators="human-annotated",
+        dialect=[],
+        sample_creation="found",
         bibtex_citation="""@dataset{potthast_2022_6862281,
           author = {Potthast, Martin and
                     Gienapp, Lukas and
@@ -57,3 +58,44 @@
             },
         },
     )
+
+
+class Touche2020v3Retrieval(AbsTaskRetrieval):
+    metadata = TaskMetadata(
+        name="Touche2020Retrieval.v3",
+        description="Touché Task 1: Argument Retrieval for Controversial Questions",
+        reference="https://github.com/castorini/touche-error-analysis",
+        dataset={
+            "path": "mteb/webis-touche2020-v3",
+            "revision": "431886eaecc48f067a3975b70d0949ea2862463c",
+        },
+        type="Retrieval",
+        category="s2p",
+        modalities=["text"],
+        eval_splits=["test"],
+        eval_langs=["eng-Latn"],
+        main_score="ndcg_at_10",
+        date=("2020-09-23", "2020-09-23"),
+        domains=["Academic"],
+        task_subtypes=["Question answering"],
+        license="cc-by-sa-4.0",
+        annotations_creators="human-annotated",
+        dialect=[],
+        sample_creation="found",
+        bibtex_citation="""@INPROCEEDINGS{Thakur_etal_SIGIR2024,
+  author = "Nandan Thakur and Luiz Bonifacio and Maik {Fr\"{o}be} and Alexander Bondarenko and Ehsan Kamalloo and Martin Potthast and Matthias Hagen and Jimmy Lin",
+  title = "Systematic Evaluation of Neural Retrieval Models on the {Touch\'{e}} 2020 Argument Retrieval Subset of {BEIR}",
+  booktitle = "Proceedings of the 47th International ACM SIGIR Conference on Research and Development in Information Retrieval",
+  year = 2024,
+  address = "Washington, D.C."
+}""",
+        descriptive_stats={
+            "test": {
+                "average_document_length": 2096.391812518931,
+                "average_query_length": 43.42857142857143,
+                "num_documents": 303732,
+                "num_queries": 49,
+                "average_relevant_docs_per_query": 34.93877551020408,
+            }
+        },
+    )
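
To show how the supersession is wired up, a sketch; superseded_by is advisory metadata, and the old task stays runnable under its original name. This assumes mteb's get_task helper:

import mteb

# The legacy task now points at its replacement.
old_task = mteb.get_task("Touche2020")
print(old_task.superseded_by)  # Touche2020Retrieval.v3

# The v3 task points at the revised corpus.
new_task = mteb.get_task("Touche2020Retrieval.v3")
print(new_task.metadata.dataset["path"])  # mteb/webis-touche2020-v3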
46 changes: 46 additions & 0 deletions mteb/tasks/Retrieval/jpn/JaqketRetrieval.py
@@ -0,0 +1,46 @@
from __future__ import annotations

from mteb.abstasks.AbsTaskRetrieval import AbsTaskRetrieval
from mteb.abstasks.TaskMetadata import TaskMetadata


class JaqketRetrieval(AbsTaskRetrieval):
    metadata = TaskMetadata(
        name="JaqketRetrieval",
        dataset={
            "path": "mteb/jaqket",
            "revision": "3a5b92dad489a61e664c05ed2175bc9220230199",
        },
        description="JAQKET (JApanese Questions on Knowledge of EnTities) is a QA dataset built from Japanese quiz questions.",
        reference="https://github.com/kumapo/JAQKET-dataset",
        type="Retrieval",
        category="s2p",
        modalities=["text"],
        eval_splits=["test"],
        eval_langs=["jpn-Jpan"],
        main_score="ndcg_at_10",
        date=("2023-10-09", "2023-10-09"),
        domains=["Encyclopaedic", "Non-fiction", "Written"],
        task_subtypes=["Question answering"],
        license="cc-by-sa-4.0",
        annotations_creators="human-annotated",
        dialect=[],
        sample_creation="found",
        bibtex_citation="""@InProceedings{Kurihara_nlp2020,
  author = "鈴木正敏 and 鈴木潤 and 松田耕史 and ⻄田京介 and 井之上直也",
  title = "JAQKET: クイズを題材にした日本語 QA データセットの構築",
  booktitle = "言語処理学会第26回年次大会",
  year = "2020",
  url = "https://www.anlp.jp/proceedings/annual_meeting/2020/pdf_dir/P2-24.pdf",
  note = "in Japanese"
}""",
        descriptive_stats={
            "test": {
                "average_document_length": 3747.995228882333,
                "average_query_length": 50.70611835506519,
                "num_documents": 114229,
                "num_queries": 997,
                "average_relevant_docs_per_query": 1.0,
            }
        },
    )
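
Finally, a sketch of running the new Japanese retrieval task on its own; the model choice is illustrative, and the main score is ndcg_at_10 per the metadata above:

import mteb
from sentence_transformers import SentenceTransformer

# Resolve just this task and evaluate a single encoder on it.
tasks = mteb.get_tasks(tasks=["JaqketRetrieval"])
model = SentenceTransformer("intfloat/multilingual-e5-small")

evaluation = mteb.MTEB(tasks=tasks)
evaluation.run(model, output_folder="results/jaqket")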