-
Notifications
You must be signed in to change notification settings - Fork 247
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge branch 'mieb' into add-classification-tasks
- Loading branch information
Showing
26 changed files
with
987 additions
and
0 deletions.
There are no files selected for viewing
203 changes: 203 additions & 0 deletions
203
mteb/abstasks/Image/AbsTaskImageMultilabelClassification.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,203 @@ | ||
from __future__ import annotations | ||
|
||
import itertools | ||
import logging | ||
from collections import defaultdict | ||
from typing import Any | ||
|
||
import numpy as np | ||
from sklearn.base import ClassifierMixin, clone | ||
from sklearn.metrics import f1_score, label_ranking_average_precision_score | ||
from sklearn.model_selection import train_test_split | ||
from sklearn.neighbors import KNeighborsClassifier | ||
from sklearn.preprocessing import MultiLabelBinarizer | ||
|
||
from mteb.abstasks import AbsTask | ||
from mteb.encoder_interface import Encoder | ||
from mteb.load_results.mteb_results import HFSubset, ScoresDict | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
|
||
def evaluate_classifier(
    embeddings_train: np.ndarray,
    y_train: np.ndarray,
    embeddings_test: np.ndarray,
    y_test: np.ndarray,
    classifier: ClassifierMixin,
):
    """Fit a fresh clone of *classifier* on the train embeddings and score it.

    Returns a dict with subset accuracy, macro-F1 and label-ranking average
    precision (LRAP), all computed on the test embeddings.
    Note: LRAP is computed from the hard binary predictions, not from
    probability scores.
    """
    # Clone so repeated experiments never share fitted state.
    estimator = clone(classifier)
    estimator.fit(embeddings_train, y_train)
    predictions = estimator.predict(embeddings_test)
    return {
        "accuracy": estimator.score(embeddings_test, y_test),
        "f1": f1_score(y_test, predictions, average="macro"),
        "lrap": label_ranking_average_precision_score(y_test, predictions),
    }
|
||
|
||
class AbsTaskImageMultilabelClassification(AbsTask):
    """Abstract class for image multilabel classification tasks.

    A classifier is bootstrapped on frozen image embeddings over several
    experiments and the averaged scores are reported.

    self.load_data() must generate a huggingface dataset with a split matching
    self.metadata_dict["eval_splits"], and assign it to self.dataset. It must
    contain the following columns:
        image: list[PIL.Image]
        labels: list[Hashable]
    """

    # Dataset column names; concrete tasks may override.
    image_column_name: str = "image"
    label_column_name: str = "labels"

    # Shared template estimator; it is cloned before every fit, so the
    # class-level instance is never mutated.
    classifier = KNeighborsClassifier(n_neighbors=5)

    def __init__(
        self,
        n_experiments: int | None = None,
        samples_per_label: int | None = None,
        batch_size: int = 32,
        **kwargs,
    ):
        """Initialize the task.

        Args:
            n_experiments: Number of bootstrap experiments (default 10, or a
                class-level override if the subclass defines one).
            samples_per_label: Undersampling budget per label for each
                experiment (default 8, or a class-level override).
            batch_size: Batch size passed on for embedding computation.
            **kwargs: Forwarded to AbsTask.__init__.
        """
        super().__init__(**kwargs)
        self.batch_size = batch_size

        # Bootstrap parameters: explicit args win; otherwise fall back to a
        # class attribute if the subclass set one, else the defaults 10 / 8.
        self.n_experiments = n_experiments or getattr(self, "n_experiments", 10)
        self.samples_per_label = samples_per_label or getattr(
            self, "samples_per_label", 8
        )
        # Run metadata validation by instantiating addressing the attribute
        # This is quite hacky. Ideally, this would be done in the constructor of
        # each concrete task, but then we have to duplicate the __init__ method's
        # interface.
        if hasattr(self, "metadata"):
            self.metadata

    def _add_main_score(self, scores: ScoresDict) -> None:
        """Copy the metric named by metadata.main_score into scores["main_score"]."""
        scores["main_score"] = scores[self.metadata.main_score]

    def evaluate(
        self,
        model: Encoder,
        eval_split: str = "test",
        train_split: str = "train",
        *,
        encode_kwargs: dict[str, Any] | None = None,
        **kwargs: Any,
    ) -> dict[HFSubset, ScoresDict]:
        """Evaluate *model* on every hf_subset of the task.

        Args:
            model: Encoder providing get_image_embeddings.
            eval_split: Name of the split used for testing.
            train_split: Name of the split used for classifier training.
            encode_kwargs: Extra keyword arguments for the encoder.
            **kwargs: Forwarded to _evaluate_subset.

        Returns:
            Mapping from hf_subset name to its averaged scores.
        """
        # Fix: default was a shared mutable ``{}`` (mutable-default pitfall);
        # ``None`` sentinel keeps behavior identical for all callers.
        if encode_kwargs is None:
            encode_kwargs = {}
        if not self.data_loaded:
            self.load_data()

        scores = {}
        hf_subsets = list(self.dataset) if self.is_multilingual else ["default"]

        for hf_subset in hf_subsets:
            logger.info(
                f"\nTask: {self.metadata.name}, split: {eval_split}, subset: {hf_subset}. Running..."
            )

            # Monolingual datasets may not be keyed by subset at all.
            if hf_subset not in self.dataset and hf_subset == "default":
                ds = self.dataset
            else:
                ds = self.dataset[hf_subset]
            scores[hf_subset] = self._evaluate_subset(
                model,
                ds,
                eval_split,
                train_split,
                encode_kwargs=encode_kwargs,
                **kwargs,
            )
            self._add_main_score(scores[hf_subset])

        return scores

    def _evaluate_subset(
        self,
        model: Encoder,
        dataset,
        eval_split: str = "test",
        train_split: str = "train",
        *,
        encode_kwargs: dict[str, Any] | None = None,
        **kwargs: Any,
    ) -> ScoresDict:
        """Run the bootstrap experiments for one subset and average the scores."""
        # Fix: same mutable-default-argument repair as in evaluate().
        if encode_kwargs is None:
            encode_kwargs = {}
        train_split = dataset[train_split]
        eval_split = dataset[eval_split]
        params = {
            "classifier_type": type(self.classifier).__name__,
            "classifier_params": self.classifier.get_params(),
            "batch_size": self.batch_size,
        }
        params.update(kwargs)

        scores = []
        # Bootstrap sample indices from training set for each experiment
        train_samples = []
        for _ in range(self.n_experiments):
            sample_indices, _ = self._undersample_data_indices(
                train_split[self.label_column_name], self.samples_per_label, None
            )
            train_samples.append(sample_indices)
        # Encode all unique images at the indices once, then share the
        # embeddings across experiments.
        unique_train_indices = list(set(itertools.chain.from_iterable(train_samples)))
        unique_train_images = train_split.select(unique_train_indices)[
            self.image_column_name
        ]

        _unique_train_embeddings = model.get_image_embeddings(
            unique_train_images,
            **encode_kwargs,
        )
        unique_train_embeddings = dict(
            zip(unique_train_indices, _unique_train_embeddings)
        )
        test_images = eval_split[self.image_column_name]
        binarizer = MultiLabelBinarizer()
        y_test = binarizer.fit_transform(eval_split[self.label_column_name])
        # Stratified subsampling of test set to 2000 examples.
        try:
            if len(test_images) > 2000:
                test_images, _, y_test, _ = train_test_split(
                    test_images, y_test, stratify=y_test, train_size=2000
                )
        except ValueError:
            logger.warning("Couldn't subsample, continuing with the entire test set.")

        X_test = model.get_image_embeddings(test_images, **encode_kwargs)
        for i_experiment, sample_indices in enumerate(train_samples):
            logger.info(
                "=" * 10
                + f" Experiment {i_experiment+1}/{self.n_experiments} "
                + "=" * 10
            )
            X_train = np.stack([unique_train_embeddings[idx] for idx in sample_indices])
            y_train = train_split.select(sample_indices)[self.label_column_name]
            y_train = binarizer.transform(y_train)
            scores_exp = evaluate_classifier(
                X_train, y_train, X_test, y_test, self.classifier
            )
            scores.append(scores_exp)

        # Average each metric over the experiments; keep per-experiment
        # scores for inspection.
        avg_scores: dict[str, Any] = {
            k: np.mean([s[k] for s in scores]) for k in scores[0].keys()
        }
        avg_scores["scores_per_experiment"] = scores

        return avg_scores

    def _undersample_data_indices(self, y, samples_per_label, idxs=None):
        """Undersample data to have samples_per_label samples of each label.

        Args:
            y: Sequence of label collections (one collection per example).
            samples_per_label: Budget of examples per individual label.
            idxs: Optional pre-selected candidate indices; defaults to all.

        Returns:
            Tuple of (selected sample indices, shuffled candidate indices).
            Note: uses the global numpy RNG, so results vary between runs
            unless the caller seeds np.random.
        """
        sample_indices = []
        if idxs is None:
            idxs = np.arange(len(y))
        np.random.shuffle(idxs)
        label_counter = defaultdict(int)
        for i in idxs:
            # Keep the example if it contributes to at least one
            # under-represented label; count every one of its labels.
            if any((label_counter[label] < samples_per_label) for label in y[i]):
                sample_indices.append(i)
                for label in y[i]:
                    label_counter[label] += 1
        return sample_indices, idxs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
44 changes: 44 additions & 0 deletions
44
mteb/tasks/Image/ImageClassification/eng/DTDClassification.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
from __future__ import annotations | ||
|
||
from mteb.abstasks.TaskMetadata import TaskMetadata | ||
|
||
from .....abstasks import AbsTaskImageClassification | ||
|
||
|
||
class DTDClassification(AbsTaskImageClassification):
    """Image classification task on the Describable Textures Dataset (47 texture categories)."""

    metadata = TaskMetadata(
        name="DTD",
        description="Describable Textures Dataset in 47 categories.",
        reference="https://www.robots.ox.ac.uk/~vgg/data/dtd/",
        dataset={
            "path": "tanganke/dtd",
            "revision": "d2afa97d9f335b1a6b3b09c637aef667f98f966e",
        },
        type="Classification",
        category="i2t",
        eval_splits=["test"],
        eval_langs=["eng-Latn"],
        main_score="accuracy",
        date=(
            "2014-01-01",
            "2014-03-01",
        ),  # Estimated range for the collection of the texture images — TODO confirm
        domains=["Encyclopaedic"],
        task_subtypes=["Textures recognition"],
        license="Not specified",
        socioeconomic_status="mixed",
        annotations_creators="derived",
        dialect=[],
        modalities=["image"],
        sample_creation="created",
        bibtex_citation="""@InProceedings{cimpoi14describing,
Author = {M. Cimpoi and S. Maji and I. Kokkinos and S. Mohamed and and A. Vedaldi},
Title = {Describing Textures in the Wild},
Booktitle = {Proceedings of the {IEEE} Conf. on Computer Vision and Pattern Recognition ({CVPR})},
Year = {2014}}
""",
        descriptive_stats={
            "n_samples": {"test": 1880},
            "avg_character_length": {"test": 456},
        },
    )
49 changes: 49 additions & 0 deletions
49
mteb/tasks/Image/ImageClassification/eng/RESISC45Classification.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
from __future__ import annotations | ||
|
||
from mteb.abstasks.TaskMetadata import TaskMetadata | ||
|
||
from .....abstasks import AbsTaskImageClassification | ||
|
||
|
||
class RESISC45Classification(AbsTaskImageClassification):
    """Image classification task on the NWPU RESISC45 remote-sensing scene dataset."""

    metadata = TaskMetadata(
        name="RESISC45",
        description="Remote Sensing Image Scene Classification by Northwestern Polytechnical University (NWPU).",
        reference="https://ieeexplore.ieee.org/abstract/document/7891544",
        dataset={
            "path": "timm/resisc45",
            "revision": "fe12fc5f1b7606543b0355eda392f1ddc54625c6",
        },
        type="Classification",
        category="i2t",
        eval_splits=["test"],
        eval_langs=["eng-Latn"],
        main_score="accuracy",
        date=(
            "2017-01-01",
            "2017-03-01",
        ),  # Estimated range for the collection of the scene images — TODO confirm
        domains=["Encyclopaedic"],
        task_subtypes=["Object recognition"],
        license="Not specified",
        socioeconomic_status="mixed",
        annotations_creators="derived",
        dialect=[],
        modalities=["image"],
        sample_creation="created",
        bibtex_citation="""@ARTICLE{7891544,
author={Cheng, Gong and Han, Junwei and Lu, Xiaoqiang},
journal={Proceedings of the IEEE},
title={Remote Sensing Image Scene Classification: Benchmark and State of the Art},
year={2017},
volume={105},
number={10},
pages={1865-1883},
keywords={Remote sensing;Benchmark testing;Spatial resolution;Social network services;Satellites;Image analysis;Machine learning;Unsupervised learning;Classification;Benchmark data set;deep learning;handcrafted features;remote sensing image;scene classification;unsupervised feature learning},
doi={10.1109/JPROC.2017.2675998}}
""",
        descriptive_stats={
            "n_samples": {"test": 6300},
            "avg_character_length": {"test": 256},
        },
    )
54 changes: 54 additions & 0 deletions
54
mteb/tasks/Image/ImageClassification/eng/STL10Classification.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,54 @@ | ||
from __future__ import annotations | ||
|
||
from mteb.abstasks.TaskMetadata import TaskMetadata | ||
|
||
from .....abstasks import AbsTaskImageClassification | ||
|
||
|
||
class STL10Classification(AbsTaskImageClassification):
    """Image classification task on STL-10: 96x96 natural images in 10 classes."""

    metadata = TaskMetadata(
        name="STL10",
        description="Classifying 96x96 images from 10 classes.",
        reference="https://cs.stanford.edu/~acoates/stl10/",
        dataset={
            "path": "tanganke/stl10",
            "revision": "49ae7f94508f7feae62baf836db284306eab0b0f",
        },
        type="Classification",
        category="i2t",
        eval_splits=["test"],
        eval_langs=["eng-Latn"],
        main_score="accuracy",
        date=(
            "2011-01-01",
            "2011-04-01",
        ),  # Estimated range for the collection of the images — TODO confirm
        domains=["Encyclopaedic"],
        task_subtypes=["Object recognition"],
        license="Not specified",
        socioeconomic_status="mixed",
        annotations_creators="derived",
        dialect=[],
        modalities=["image"],
        sample_creation="created",
        bibtex_citation="""@InProceedings{pmlr-v15-coates11a,
title = {An Analysis of Single-Layer Networks in Unsupervised Feature Learning},
author = {Coates, Adam and Ng, Andrew and Lee, Honglak},
booktitle = {Proceedings of the Fourteenth International Conference on Artificial Intelligence and Statistics},
pages = {215--223},
year = {2011},
editor = {Gordon, Geoffrey and Dunson, David and Dudík, Miroslav},
volume = {15},
series = {Proceedings of Machine Learning Research},
address = {Fort Lauderdale, FL, USA},
month = {11--13 Apr},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v15/coates11a/coates11a.pdf},
url = {https://proceedings.mlr.press/v15/coates11a.html},
}
""",
        descriptive_stats={
            "n_samples": {"test": 8000},
            "avg_character_length": {"test": 431.4},
        },
    )
Oops, something went wrong.