From 5bedefb59f41da3c36b077a8337a11012c85eadb Mon Sep 17 00:00:00 2001 From: Ilya Matiach Date: Mon, 26 Jun 2023 17:09:00 -0400 Subject: [PATCH] add responsibleai-vision package to responsible-ai-toolbox (#2135) --- responsibleai_vision/README.md | 16 + responsibleai_vision/requirements-automl.txt | 2 + responsibleai_vision/requirements-dev.txt | 30 + responsibleai_vision/requirements.txt | 8 + .../responsibleai_vision/__init__.py | 14 + .../responsibleai_vision/common/__init__.py | 4 + .../responsibleai_vision/common/constants.py | 101 ++ .../responsibleai_vision/common/interfaces.py | 11 + .../responsibleai_vision/managers/__init__.py | 4 + .../managers/error_analysis_manager.py | 322 +++++ .../managers/explainer_manager.py | 657 ++++++++++ .../rai_vision_insights/__init__.py | 9 + .../rai_vision_insights.py | 1149 +++++++++++++++++ .../responsibleai_vision/utils/__init__.py | 4 + .../utils/feature_extractors.py | 59 + .../utils/image_reader.py | 84 ++ .../responsibleai_vision/utils/image_utils.py | 140 ++ .../responsibleai_vision/version.py | 8 + responsibleai_vision/setup.py | 44 + .../tests/common_vision_utils.py | 771 +++++++++++ .../tests/rai_vision_insights_validator.py | 84 ++ .../tests/test_image_utils.py | 48 + .../test_rai_vision_automl_images_insights.py | 131 ++ .../tests/test_rai_vision_insights.py | 265 ++++ ...vision_insights_save_and_load_scenarios.py | 80 ++ 25 files changed, 4045 insertions(+) create mode 100644 responsibleai_vision/README.md create mode 100644 responsibleai_vision/requirements-automl.txt create mode 100644 responsibleai_vision/requirements-dev.txt create mode 100644 responsibleai_vision/requirements.txt create mode 100644 responsibleai_vision/responsibleai_vision/__init__.py create mode 100644 responsibleai_vision/responsibleai_vision/common/__init__.py create mode 100644 responsibleai_vision/responsibleai_vision/common/constants.py create mode 100644 responsibleai_vision/responsibleai_vision/common/interfaces.py create mode 100644 responsibleai_vision/responsibleai_vision/managers/__init__.py create mode 100644 responsibleai_vision/responsibleai_vision/managers/error_analysis_manager.py create mode 100644 responsibleai_vision/responsibleai_vision/managers/explainer_manager.py create mode 100644 responsibleai_vision/responsibleai_vision/rai_vision_insights/__init__.py create mode 100644 responsibleai_vision/responsibleai_vision/rai_vision_insights/rai_vision_insights.py create mode 100644 responsibleai_vision/responsibleai_vision/utils/__init__.py create mode 100644 responsibleai_vision/responsibleai_vision/utils/feature_extractors.py create mode 100644 responsibleai_vision/responsibleai_vision/utils/image_reader.py create mode 100644 responsibleai_vision/responsibleai_vision/utils/image_utils.py create mode 100644 responsibleai_vision/responsibleai_vision/version.py create mode 100644 responsibleai_vision/setup.py create mode 100644 responsibleai_vision/tests/common_vision_utils.py create mode 100644 responsibleai_vision/tests/rai_vision_insights_validator.py create mode 100644 responsibleai_vision/tests/test_image_utils.py create mode 100644 responsibleai_vision/tests/test_rai_vision_automl_images_insights.py create mode 100644 responsibleai_vision/tests/test_rai_vision_insights.py create mode 100644 responsibleai_vision/tests/test_rai_vision_insights_save_and_load_scenarios.py diff --git a/responsibleai_vision/README.md b/responsibleai_vision/README.md new file mode 100644 index 0000000000..23ae27002d --- /dev/null +++ 
b/responsibleai_vision/README.md @@ -0,0 +1,16 @@ +# Responsible AI Vision SDK for Python + +### This package has been tested with Python 3.6, 3.7, 3.8 and 3.9 + +The Responsible AI Vision SDK enables users to analyze their computer vision machine learning models in a single API: analyze errors, explain the most important features, and understand the underlying data. + +Highlights of the package include: + +- `explainer.add()` explains the model + +### Supported scenarios, models and datasets + +The Responsible AI Vision SDK currently supports multiclass classification models on image data. + +The open source code for the visualization dashboard can be found here: +https://github.com/microsoft/responsible-ai-widgets diff --git a/responsibleai_vision/requirements-automl.txt b/responsibleai_vision/requirements-automl.txt new file mode 100644 index 0000000000..08e6113223 --- /dev/null +++ b/responsibleai_vision/requirements-automl.txt @@ -0,0 +1,2 @@ +opencv-python==4.3.0.36 +azureml-automl-dnn-vision>=1.47.0 \ No newline at end of file diff --git a/responsibleai_vision/requirements-dev.txt b/responsibleai_vision/requirements-dev.txt new file mode 100644 index 0000000000..d197396917 --- /dev/null +++ b/responsibleai_vision/requirements-dev.txt @@ -0,0 +1,30 @@ +# Requirements for development + +pytest==7.0.1 +pytest-cov +pytest-mock==3.6.1 +requests==2.25.1 + +requirements-parser==0.2.0 + +wheel + +# Required for notebook tests +nbformat +papermill +scrapbook +jupyter +nbval + +docutils<0.18 +sphinx==3.1.1 +sphinx-gallery==0.8.1 +pydata-sphinx-theme==0.3.0 + +transformers +datasets +tensorflow<2.11.0 +opencv-python + +fastai +mlflow \ No newline at end of file diff --git a/responsibleai_vision/requirements.txt b/responsibleai_vision/requirements.txt new file mode 100644 index 0000000000..7d91feadee --- /dev/null +++ b/responsibleai_vision/requirements.txt @@ -0,0 +1,8 @@ +numpy>=1.17.2 +pandas>=0.25.1,<2.0.0 # TODO: remove ceiling on version. +scikit-learn>=0.22.1 +scipy>=1.4.1 +semver~=2.13.0 +responsibleai>=0.27.0 +torchmetrics +vision_explanation_methods \ No newline at end of file diff --git a/responsibleai_vision/responsibleai_vision/__init__.py b/responsibleai_vision/responsibleai_vision/__init__.py new file mode 100644 index 0000000000..4c746e393c --- /dev/null +++ b/responsibleai_vision/responsibleai_vision/__init__.py @@ -0,0 +1,14 @@ +# Copyright (c) Microsoft Corporation +# Licensed under the MIT License. + +"""Responsible AI Vision SDK package.""" + +from responsibleai_vision.common.constants import ModelTask +from responsibleai_vision.rai_vision_insights import RAIVisionInsights + +from .version import name, version + +__name__ = name +__version__ = version + +__all__ = ['ModelTask', 'RAIVisionInsights'] diff --git a/responsibleai_vision/responsibleai_vision/common/__init__.py b/responsibleai_vision/responsibleai_vision/common/__init__.py new file mode 100644 index 0000000000..3c353e3c4b --- /dev/null +++ b/responsibleai_vision/responsibleai_vision/common/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) Microsoft Corporation +# Licensed under the MIT License.
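As a companion to the README above, here is a minimal usage sketch of the single-API flow it describes. The classifier `clf`, the image paths, and the class names are placeholder assumptions for illustration (a real run needs actual image files and a fitted model); only the `responsibleai_vision` imports come from this package.

```python
import pandas as pd
from responsibleai_vision import ModelTask, RAIVisionInsights

# Hypothetical test frame: a column of local image paths plus a label column.
test = pd.DataFrame({
    'image': ['images/cat1.png', 'images/dog1.png'],
    'label': ['cat', 'dog'],
})

# `clf` stands in for any model exposing predict/predict_proba.
rai_insights = RAIVisionInsights(
    model=clf,
    test=test,
    target_column='label',
    task_type=ModelTask.IMAGE_CLASSIFICATION,
    classes=['cat', 'dog'])
rai_insights.explainer.add()    # queue the explainer (the README highlight)
rai_insights.compute()          # run error analysis and explanations
data = rai_insights.get_data()  # payload consumed by the dashboard
```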
+ +"""Common infrastructure, constants and utilities.""" diff --git a/responsibleai_vision/responsibleai_vision/common/constants.py b/responsibleai_vision/responsibleai_vision/common/constants.py new file mode 100644 index 0000000000..be47546df5 --- /dev/null +++ b/responsibleai_vision/responsibleai_vision/common/constants.py @@ -0,0 +1,101 @@ +# Copyright (c) Microsoft Corporation +# Licensed under the MIT License. + +from enum import Enum + + +class ModelTask(str, Enum): + """Provide model task constants. + + Can be 'image_classification', 'object_detection' or 'unknown'. + """ + + IMAGE_CLASSIFICATION = 'image_classification' + MULTILABEL_IMAGE_CLASSIFICATION = 'multilabel_image_classification' + OBJECT_DETECTION = 'object_detection' + UNKNOWN = 'unknown' + + +class ImageColumns(str, Enum): + """Provide constants related to the input image dataframe columns. + + Can be 'image_url', 'image' or 'label'. + """ + + IMAGE_URL = 'image_url' + IMAGE = 'image' + LABEL = 'label' + IMAGE_DETAILS = 'image_details' + + +class ExplainabilityLiterals: + """Parameters for explainability method names.""" + + MODEL_EXPLAINABILITY = 'model_explainability' + XAI_PARAMETERS = 'xai_parameters' + XAI_ALGORITHM = 'xai_algorithm' + SHAP_METHOD_NAME = 'shap' + XRAI_METHOD_NAME = 'xrai' + INTEGRATEDGRADIENTS_METHOD_NAME = 'integrated_gradients' + GUIDEDGRADCAM_METHOD_NAME = 'guided_gradcam' + GUIDEDBACKPROP_METHOD_NAME = 'guided_backprop' + CONFIDENCE_SCORE_THRESHOLD_MULTILABEL = ( + 'confidence_score_threshold_multilabel' + ) + N_STEPS = "n_steps" + APPROXIMATION_METHOD = "approximation_method" + XRAI_FAST = "xrai_fast" + XAI_ARGS_GROUP = [ + XAI_ALGORITHM, + N_STEPS, + APPROXIMATION_METHOD, + XRAI_FAST, + CONFIDENCE_SCORE_THRESHOLD_MULTILABEL, + ] + SHAP = 'shap' + + +class ExplainabilityDefaults: + """DEFAULT values for explainability parameters.""" + + MODEL_EXPLAINABILITY = False + XAI_ALGORITHM = ExplainabilityLiterals.GUIDEDGRADCAM_METHOD_NAME + OUTPUT_VISUALIZATIONS = True + OUTPUT_ATTRIBUTIONS = False + CONFIDENCE_SCORE_THRESHOLD_MULTILABEL = 0.5 + DEFAULT_MAX_EVALS = 100 + DEFAULT_MASK_RES = 4 + DEFAULT_NUM_MASKS = 50 + + +class XAIPredictionLiterals: + """Strings that will be keys in the output json during prediction.""" + + VISUALIZATIONS_KEY_NAME = 'visualizations' + ATTRIBUTIONS_KEY_NAME = 'attributions' + + +class MLFlowSchemaLiterals: + """MLFlow model signature related schema""" + + INPUT_IMAGE_KEY = 'image_base64' + INPUT_COLUMN_IMAGE = 'image' + INPUT_IMAGE_SIZE = 'image_size' + + +class CommonTags: + """Common constants""" + + IMAGE_DECODE_UTF_FORMAT = 'utf-8' + + +class AutoMLImagesModelIdentifier: + """AutoML model object types""" + + AUTOML_IMAGE_CLASSIFICATION_MODEL = ( + "WrappedMlflowAutomlImagesClassificationModel'>" + ) + + AUTOML_OBJECT_DETECTION_MODEL = ( + "WrappedMlflowAutomlObjectDetectionModel'>" + ) diff --git a/responsibleai_vision/responsibleai_vision/common/interfaces.py b/responsibleai_vision/responsibleai_vision/common/interfaces.py new file mode 100644 index 0000000000..491d2d3bd4 --- /dev/null +++ b/responsibleai_vision/responsibleai_vision/common/interfaces.py @@ -0,0 +1,11 @@ +# Copyright (c) Microsoft Corporation +# Licensed under the MIT License. 
+ +from typing import List + + +class VisionExplanationData: + classNames: List[str] + images: List[str] + predictedY: List[str] + trueY: List[str] diff --git a/responsibleai_vision/responsibleai_vision/managers/__init__.py b/responsibleai_vision/responsibleai_vision/managers/__init__.py new file mode 100644 index 0000000000..87a836b247 --- /dev/null +++ b/responsibleai_vision/responsibleai_vision/managers/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) Microsoft Corporation +# Licensed under the MIT License. + +"""Contains all of the managers.""" diff --git a/responsibleai_vision/responsibleai_vision/managers/error_analysis_manager.py b/responsibleai_vision/responsibleai_vision/managers/error_analysis_manager.py new file mode 100644 index 0000000000..bd9992cdf6 --- /dev/null +++ b/responsibleai_vision/responsibleai_vision/managers/error_analysis_manager.py @@ -0,0 +1,322 @@ +# Copyright (c) Microsoft Corporation +# Licensed under the MIT License. + +"""Defines the Error Analysis Manager class.""" + +import json +from typing import Any, List, Optional + +import jsonschema +import numpy as np +import pandas as pd +from ml_wrappers import wrap_model + +from erroranalysis._internal.error_analyzer import ModelAnalyzer +from erroranalysis._internal.error_report import as_error_report +from responsibleai._tools.shared.state_directory_management import \ + DirectoryManager +from responsibleai.managers.error_analysis_manager import \ + ErrorAnalysisManager as BaseErrorAnalysisManager +from responsibleai.managers.error_analysis_manager import as_error_config +from responsibleai_vision.common.constants import (MLFlowSchemaLiterals, + ModelTask) +from responsibleai_vision.utils.image_reader import ( + get_base64_string_from_path, is_automl_image_model) +from responsibleai_vision.utils.image_utils import get_images + +LABELS = 'labels' + + +def _concat_labels_column(dataset, target_column, classes): + """Concatenate labels column for multilabel models. + + :param dataset: The dataset including the label column. + :type dataset: pandas.DataFrame + :param target_column: The list of label columns in multilabel task. + :type target_column: list[str] + :param classes: The list of labels in multilabel task. + :type classes: list + :return: The labels column concatenated. + :rtype: list + """ + labels = [] + for _, row in dataset[target_column].iterrows(): + row_idxs = range(len(row)) + pred_classes = [classes[i] for i in row_idxs if row[i]] + labels.append(','.join(pred_classes)) + return labels + + +class WrappedIndexPredictorModel: + """Wraps model that uses index to retrieve image data for making + predictions.""" + + def __init__(self, model, dataset, image_mode, transformations, + task_type, classes=None): + """Initialize the WrappedIndexPredictorModel. + + :param model: The model to wrap. + :type model: object + :param dataset: The dataset to use for making predictions. + :type dataset: pandas.DataFrame + :param image_mode: The mode to open the image in. + See pillow documentation for all modes: + https://pillow.readthedocs.io/en/stable/handbook/concepts.html + :type image_mode: str + :param transformations: The transformations to apply to the image. + :type transformations: object + :param task_type: The task to run. + :type task_type: str + :param classes: The classes for the model. 
+ :type classes: list + """ + self.model = model + self.dataset = dataset + self.classes = classes + self.image_mode = image_mode + self.transformations = transformations + self.task_type = task_type + if task_type == ModelTask.OBJECT_DETECTION: + return + if is_automl_image_model(self.model): + test = np.array( + self.dataset.iloc[:, 0].tolist() + ) + test = pd.DataFrame( + data=[ + get_base64_string_from_path(img_path) for img_path in test + ], + columns=[MLFlowSchemaLiterals.INPUT_COLUMN_IMAGE], + ) + else: + test = get_images(self.dataset, self.image_mode, + self.transformations) + self.predictions = self.model.predict(test) + if task_type == ModelTask.MULTILABEL_IMAGE_CLASSIFICATION: + predictions_joined = [] + for row in self.predictions: + # get all labels where prediction is 1 + pred_labels = [i for i in range(len(row)) if row[i]] + if self.classes is not None: + pred_labels = [self.classes[i] for i in pred_labels] + else: + pred_labels = [str(i) for i in pred_labels] + # concatenate all predicted labels into a single string + predictions_joined.append(','.join(pred_labels)) + self.predictions = np.array(predictions_joined) + self.predict_proba = self.model.predict_proba(test) + + def predict(self, X): + """Predict the class labels for the provided data. + + :param X: Data to predict the labels for. + :type X: pandas.DataFrame + :return: Predicted class labels. + :rtype: list + """ + index = X.index + predictions = self.predictions[index] + if self.task_type == ModelTask.MULTILABEL_IMAGE_CLASSIFICATION: + return predictions + if self.classes is not None: + predictions = [self.classes[y] for y in predictions] + return predictions + + def predict_proba(self, X): + """Predict the class probabilities for the provided data. + + :param X: Data to predict the probabilities for. + :type X: pandas.DataFrame + :return: Predicted class probabilities. + :rtype: list[list] + """ + index = X.index + pred_proba = self.predict_proba[index] + return pred_proba + + +class ErrorAnalysisManager(BaseErrorAnalysisManager): + + """Defines a wrapper class of Error Analysis for vision scenario.""" + + def __init__(self, model: Any, dataset: pd.DataFrame, + ext_dataset: pd.DataFrame, target_column: str, + task_type: str, + image_mode: str, transformations: Any, + classes: Optional[List] = None, + categorical_features: Optional[List[str]] = None): + """Creates an ErrorAnalysisManager object. + + :param model: The model to analyze errors on. + A model that implements sklearn.predict or sklearn.predict_proba + or function that accepts a 2d ndarray. + :type model: object + :param dataset: The dataset including the label column. + :type dataset: pandas.DataFrame + :param ext_dataset: The dataset of extracted features including the + label column. + :type ext_dataset: pandas.DataFrame + :param target_column: The name of the label column. + :type target_column: str + :param task_type: The task to run. + :type task_type: str + :param image_mode: The mode to open the image in. + See pillow documentation for all modes: + https://pillow.readthedocs.io/en/stable/handbook/concepts.html + :type image_mode: str + :param transformations: The transformations to apply to the image. + :type transformations: object + :param classes: Class names as a list of strings. + The order of the class names should match that of the model + output. Only required if analyzing a classifier. + :type classes: list + :param categorical_features: The categorical feature names. 
+ :type categorical_features: list[str] + """ + index_classes = classes + is_od = task_type == ModelTask.OBJECT_DETECTION + if isinstance(target_column, list) and not is_od: + # create copy of dataset as we will make modifications to it + dataset = dataset.copy() + index_classes = target_column + labels = _concat_labels_column(dataset, target_column, + index_classes) + dataset[LABELS] = labels + ext_dataset[LABELS] = dataset[LABELS] + dataset.drop(columns=target_column, inplace=True) + ext_dataset.drop(columns=target_column, inplace=True) + target_column = LABELS + index_predictor = ErrorAnalysisManager._create_index_predictor( + model, dataset, target_column, task_type, image_mode, + transformations, index_classes) + super(ErrorAnalysisManager, self).__init__( + index_predictor, ext_dataset, target_column, + classes, categorical_features) + + def compute(self, **kwargs): + """Compute the error analysis data. + + :param kwargs: The keyword arguments to pass to the compute method. + Note that this method does not take any arguments currently. + :type kwargs: dict + """ + super(ErrorAnalysisManager, self).compute() + + @staticmethod + def _create_index_predictor(model, dataset, target_column, task_type, + image_mode, transformations, classes=None): + """Creates a wrapped predictor that uses index to retrieve text data. + + :param model: The model to analyze errors on. + A model that implements sklearn.predict or sklearn.predict_proba + or function that accepts a 2d ndarray. + :type model: object + :param dataset: The dataset including the label column. + :type dataset: pandas.DataFrame + :target_column: The name of the label column. + :type target_column: str + :param task_type: The task to run. + :type task_type: str + :param image_mode: The mode to open the image in. + See pillow documentation for all modes: + https://pillow.readthedocs.io/en/stable/handbook/concepts.html + :type image_mode: str + :param transformations: The transformations to apply to the image. + :type transformations: Any + :param classes: Class names as a list of strings. + The order of the class names should match that of the model + output. + :type classes: list + :return: A wrapped predictor that uses index to retrieve text data. + :rtype: WrappedIndexPredictorModel + """ + dataset = dataset.drop(columns=[target_column]) + index_predictor = WrappedIndexPredictorModel( + model, dataset, image_mode, transformations, task_type, classes) + return index_predictor + + @staticmethod + def _load(path, rai_insights): + """Load the ErrorAnalysisManager from the given path. + + :param path: The directory path to load the ErrorAnalysisManager from. + :type path: str + :param rai_insights: The loaded parent RAIInsights. + :type rai_insights: RAIInsights + :return: The ErrorAnalysisManager manager after loading. 
+ :rtype: ErrorAnalysisManager + """ + # create the ErrorAnalysisManager without any properties using + # the __new__ function, similar to pickle + inst = ErrorAnalysisManager.__new__(ErrorAnalysisManager) + + ea_config_list = [] + ea_report_list = [] + all_ea_dirs = DirectoryManager.list_sub_directories(path) + for ea_dir in all_ea_dirs: + directory_manager = DirectoryManager( + parent_directory_path=path, + sub_directory_name=ea_dir) + + config_path = (directory_manager.get_config_directory() / + 'config.json') + with open(config_path, 'r') as file: + ea_config = json.load(file, object_hook=as_error_config) + ea_config_list.append(ea_config) + + report_path = (directory_manager.get_data_directory() / + 'report.json') + with open(report_path, 'r') as file: + ea_report = json.load(file, object_hook=as_error_report) + # Validate the serialized output against schema + schema = ErrorAnalysisManager._get_error_analysis_schema() + jsonschema.validate( + json.loads(ea_report.to_json()), schema) + ea_report_list.append(ea_report) + + inst.__dict__['_ea_report_list'] = ea_report_list + inst.__dict__['_ea_config_list'] = ea_config_list + + feature_metadata = rai_insights._feature_metadata + categorical_features = feature_metadata.categorical_features + inst.__dict__['_categorical_features'] = categorical_features + target_column = rai_insights.target_column + true_y = rai_insights._ext_test_df[target_column] + if isinstance(target_column, list): + dropped_cols = target_column + else: + dropped_cols = [target_column] + dataset = rai_insights._ext_test_df.drop(columns=dropped_cols) + inst.__dict__['_dataset'] = dataset + feature_names = list(dataset.columns) + inst.__dict__['_feature_names'] = feature_names + task_type = rai_insights.task_type + wrapped_model = wrap_model(rai_insights.model, dataset, + rai_insights.task_type, + classes=rai_insights._classes) + inst.__dict__['_task_type'] = task_type + index_classes = rai_insights._classes + is_od = task_type == ModelTask.OBJECT_DETECTION + index_dataset = rai_insights.test + if isinstance(target_column, list) and not is_od: + # create copy of dataset as we will make modifications to it + index_dataset = index_dataset.copy() + index_classes = target_column + labels = _concat_labels_column(index_dataset, target_column, + index_classes) + index_dataset.drop(columns=target_column, inplace=True) + index_dataset[LABELS] = labels + target_column = LABELS + true_y = index_dataset[target_column] + inst.__dict__['_true_y'] = true_y + index_predictor = ErrorAnalysisManager._create_index_predictor( + wrapped_model, index_dataset, target_column, + task_type, rai_insights.image_mode, + rai_insights._transformations, + rai_insights._classes) + inst.__dict__['_analyzer'] = ModelAnalyzer(index_predictor, + dataset, + true_y, + feature_names, + categorical_features) + return inst diff --git a/responsibleai_vision/responsibleai_vision/managers/explainer_manager.py b/responsibleai_vision/responsibleai_vision/managers/explainer_manager.py new file mode 100644 index 0000000000..cf3ec53cd4 --- /dev/null +++ b/responsibleai_vision/responsibleai_vision/managers/explainer_manager.py @@ -0,0 +1,657 @@ +# Copyright (c) Microsoft Corporation +# Licensed under the MIT License. 
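To make the multilabel handling in `_concat_labels_column` (defined in the error analysis manager above) concrete, a worked example with made-up class names; the column values are the usual 0/1 indicator encoding:

```python
import pandas as pd
from responsibleai_vision.managers.error_analysis_manager import \
    _concat_labels_column

# Hypothetical multilabel frame: one indicator column per class.
dataset = pd.DataFrame({'cat': [1, 0], 'dog': [1, 1], 'fox': [0, 0]})
target_column = ['cat', 'dog', 'fox']

# Positive classes per row are joined into one comma-separated string,
# collapsing the multilabel target into a single labels column:
labels = _concat_labels_column(dataset, target_column, target_column)
assert labels == ['cat,dog', 'dog']
```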
+ +"""Defines the Explainer Manager class.""" + +import base64 +import io +import json +import pickle +import warnings +from pathlib import Path +from typing import Any, List, Optional + +import cv2 +import matplotlib.pyplot as pl +import numpy as np +import pandas as pd +import shap +from ml_wrappers import wrap_model +from ml_wrappers.model.image_model_wrapper import (MLflowDRiseWrapper, + PytorchDRiseWrapper) +from PIL import Image, ImageDraw, ImageFont +from shap.plots import colors +from shap.utils._legacy import kmeans +from vision_explanation_methods.DRISE_runner import get_drise_saliency_map + +from responsibleai._interfaces import ModelExplanationData +from responsibleai._internal.constants import ExplainerManagerKeys as Keys +from responsibleai._internal.constants import (ListProperties, ManagerNames, + Metadata) +from responsibleai._tools.shared.state_directory_management import \ + DirectoryManager +from responsibleai.exceptions import UserConfigValidationException +from responsibleai.managers.base_manager import BaseManager +from responsibleai_vision.common.constants import (CommonTags, + ExplainabilityDefaults, + ExplainabilityLiterals, + MLFlowSchemaLiterals, + ModelTask, + XAIPredictionLiterals) +from responsibleai_vision.utils.image_reader import ( + get_base64_string_from_path, get_image_from_path, is_automl_image_model) + +IS_RUN = 'is_run' +IS_ADDED = 'is_added' +CLASSES = 'classes' +U_EVALUATION_EXAMPLES = '_evaluation_examples' +FEATURES = 'features' +META_JSON = Metadata.META_JSON +MODEL = Metadata.MODEL +EXPLANATION = '_explanation' +TASK_TYPE = 'task_type' +_MAX_EVALS = '_max_evals' +_NUM_MASKS = '_num_masks' +_MASK_RES = '_mask_res' +DEFAULT_MAX_EVALS = ExplainabilityDefaults.DEFAULT_MAX_EVALS +DEFAULT_MASK_RES = ExplainabilityDefaults.DEFAULT_MASK_RES +DEFAULT_NUM_MASKS = ExplainabilityDefaults.DEFAULT_NUM_MASKS + + +class ExplainerManager(BaseManager): + + """Defines the ExplainerManager for explaining an image-based model.""" + + def __init__(self, model: Any, + evaluation_examples: pd.DataFrame, + target_column: str, + task_type: str, + classes: Optional[List] = None, + image_mode: str = None, + max_evals: Optional[int] = DEFAULT_MAX_EVALS, + num_masks: Optional[int] = DEFAULT_NUM_MASKS, + mask_res: Optional[int] = DEFAULT_MASK_RES): + """Creates an ExplainerManager object. + + :param model: The model to explain. + A model that implements sklearn.predict or sklearn.predict_proba + or function that accepts a 2d ndarray. + :type model: object + :param evaluation_examples: A matrix of feature vector + examples (# examples x # features) on which to explain the + model's output, with an additional label column. + :type evaluation_examples: pandas.DataFrame + :param target_column: The name of the label column. + :type target_column: str + :param task_type: The task to run. + :type task_type: str + :param classes: Class names as a list of strings. + The order of the class names should match that of the model + output. Only required if explaining classifier. + :type classes: list + :param image_mode: The mode to open the image in. + See pillow documentation for all modes: + https://pillow.readthedocs.io/en/stable/handbook/concepts.html + :type image_mode: str + :param max_evals: The maximum number of evaluations to run. + Used by shap hierarchical image explainer. + If not specified defaults to 100. + :type max_evals: int + :param num_masks: The number of masks to use for the + DRISE image explainer for object detection. + If not specified defaults to 50. 
+ :type num_masks: int + :param mask_res: The resolution of the masks to use for the + DRISE image explainer for object detection. + If not specified defaults to 4. + :type mask_res: int + """ + self._image_mode = image_mode + if task_type == ModelTask.OBJECT_DETECTION: + if is_automl_image_model(model): + self._model = MLflowDRiseWrapper(model._model, classes) + else: + self._model = PytorchDRiseWrapper(model._model, len(classes)) + else: + self._model = model + self._target_column = target_column + if not isinstance(target_column, list): + target_column = [target_column] + self._evaluation_examples = \ + evaluation_examples.drop(columns=target_column) + self._is_run = False + self._is_added = False + self._features = list(self._evaluation_examples.columns) + self._classes = classes + self._explanation = None + self._task_type = task_type + self._max_evals = max_evals + self._num_masks = num_masks + self._mask_res = mask_res + + def add(self): + """Add an explainer to be computed later.""" + if self._model is None: + raise UserConfigValidationException( + 'Model is required for model explanations') + + if self._is_added: + warnings.warn(("DUPLICATE-EXPLAINER-CONFIG: Ignoring. " + "Explanation has already been added, " + "currently limited to one explainer type."), + UserWarning) + return + self._is_added = True + + def compute(self, **kwargs): + """Creates an explanation by running the explainer on the model.""" + if not self._is_added: + return + if self._is_run: + return + self._explanation = [] + if self._is_classification_task: + for i in range(len(self._evaluation_examples)): + self._explanation.append( + self.compute_single_explanation(i, **kwargs) + ) + elif self._is_object_detection_task: + for i in range(len(self._evaluation_examples)): + self._explanation.append( + self.compute_single_explanation(i, object_index=None) + ) + else: + raise ValueError('Unknown task type: {}'.format(self._task_type)) + + self._is_run = True + + def compute_single_explanation(self, + index, + max_evals=None, + object_index=0, + **kwargs): + """Creates an explanation for a single image in the dataset. + + :param index: The index of the image to create the explanation for + :type index: int + :param max_evals: The maximum number of evaluations + :type max_evals: int + :param object_index: The index of the object within the image we are + looking to create the explanation for. Note that saliency maps are + created one object per image. The default value for this is 0 to + ensure a modular development process (so this function won't fail + without an updated frontend). This parameter is only for the object + detection scenario using the DRISE functionality. + :type object_index: Optional[int] + :return: The explanation for the image, which is a saliency map. + For object detection, this can be a list of saliency maps if + object index is not specified.
+ :rtype: str or list[str] + """ + if max_evals is None: + # if not specified use global max_evals value + max_evals = self._max_evals + self.automl_image_model = is_automl_image_model(self._model) + if self.automl_image_model: + # get xai algorithm name + xai_algo_name = kwargs.get( + ExplainabilityLiterals.XAI_ALGORITHM, + ExplainabilityDefaults.XAI_ALGORITHM + ) + else: + xai_algo_name = ExplainabilityLiterals.SHAP + + if not self._is_added: + self.add() + if index < 0 or index > len(self._evaluation_examples) - 1: + raise ValueError('Index out of range') + if self._explanation is not None and index < len(self._explanation): + if self._task_type == ModelTask.OBJECT_DETECTION: + return self._explanation[index][object_index] + else: + return self._explanation[index] + if self._is_classification_task: + ex = self._evaluation_examples + image = ex.iloc[index:index + 1, 0].values[0] + if isinstance(image, str): + if not self.automl_image_model: + image = get_image_from_path(image, self._image_mode) + if xai_algo_name == ExplainabilityLiterals.SHAP: + if not self.automl_image_model: + explanation = self.get_shap_explanations(image, max_evals) + return self.image(explanation, 0) + else: + raise ValueError( + '{} is not supported for the model type: {}'.format( + xai_algo_name, type(self._model) + ) + ) + else: + if self.automl_image_model: + visualization = self.get_automl_explanations( + image, xai_algo_name, **kwargs + ) + return visualization + else: + raise ValueError( + '{} is not supported for the model type: {}'.format( + xai_algo_name, type(self._model) + ) + ) + if self._is_object_detection_task: + ex = self._evaluation_examples + img = ex.iloc[index:index + 1, 0].values[0] + try: + if (type(self._model) is not MLflowDRiseWrapper and + type(self._model) is not PytorchDRiseWrapper): + if is_automl_image_model(self._model): + self._model = MLflowDRiseWrapper(self._model._model, + self._classes) + else: + self._model = PytorchDRiseWrapper(self._model._model, + len(self._classes)) + + # calling DRISE to generate saliency maps for all objects + mask_res_tuple = (self._mask_res, self._mask_res) + fl, _, _, = get_drise_saliency_map(img, + self._model, + len(self._classes), + savename=str(index), + nummasks=self._num_masks, + maskres=mask_res_tuple, + max_figures=5000) + if object_index is None: + return fl + b64_string = fl[object_index] + except BaseException: + if object_index is None: + return [self._get_fail_str()] + b64_string = self._get_fail_str() + return b64_string + else: + raise ValueError('Unknown task type: {}'.format(self._task_type)) + + def get_shap_explanations(self, image, max_evals): + """Generates an explanation using shap method. + + :param image: Input image in numpy format + :type image: numpy.ndarray + :param max_evals: Max evaluations needed for shap explainer + :type max_evals: int + :return: The computed explanation + :rtype: numpy.array + """ + masker = shap.maskers.Image('inpaint_telea', image.shape) + explainer = shap.Explainer(self._model.predict_proba, + masker, + output_names=self._classes) + image_shape = list(image.shape) + image_shape.insert(0, -1) + image = image.reshape(tuple(image_shape)) + exp = shap.Explanation + explanation = explainer(image, max_evals=max_evals, + outputs=exp.argsort.flip[:4]) + return explanation + + def get_automl_explanations(self, image, xai_algo_name, **kwargs): + """Generates an explanation using automl images XAI methods. 
+ + :param image: Input image path + :type image: str + :param xai_algo_name: Input xai algorithm name + :type xai_algo_name: str + :return: The computed explanation + :rtype: base64 string + """ + model_explainability = True + xai_parameters = { + ExplainabilityLiterals.XAI_ALGORITHM: xai_algo_name, + XAIPredictionLiterals.VISUALIZATIONS_KEY_NAME: True, + XAIPredictionLiterals.ATTRIBUTIONS_KEY_NAME: False, + } + xai_parameters.update(kwargs) + image_df = pd.DataFrame( + data=[ + json.dumps( + { + MLFlowSchemaLiterals.INPUT_IMAGE_KEY: + get_base64_string_from_path(image), + + ExplainabilityLiterals.MODEL_EXPLAINABILITY: + model_explainability, + + ExplainabilityLiterals.XAI_PARAMETERS: xai_parameters, + } + ) + ], + columns=[MLFlowSchemaLiterals.INPUT_COLUMN_IMAGE], + ) + + response_df = self._model._mlflow_predict(image_df) + visualization, _ = response_df.loc[ + 0, + [ + XAIPredictionLiterals.VISUALIZATIONS_KEY_NAME, + XAIPredictionLiterals.ATTRIBUTIONS_KEY_NAME, + ], + ].values + + return visualization + + def image(self, explanation, index): + """ Plots SHAP values for image inputs. + + :param explanation: Computed explanation + :type explanation: numpy.array + :param index: Index value + :type index: int + :return: The computed explanation + :rtype: base64 string + """ + width = 20 + aspect = 0.2 + hspace = 0.2 + labelpad = None + + shap_exp = explanation[index] + shap_values = shap_exp.values + shape_len = range(shap_values.shape[-1]) + if len(shap_exp.output_dims) == 1: + shap_values = [shap_values[..., i] for i in shape_len] + elif len(shap_exp.output_dims) == 0: + shap_values = shap_exp.values + else: + raise Exception('Number of outputs needs to have support added') + + pixel_values = shap_exp.data + labels = shap_exp.output_names + + multi_output = True + if not isinstance(shap_values, list): + multi_output = False + shap_values = [shap_values] + + if len(shap_values[0].shape) == 3: + shap_values = [v.reshape(1, *v.shape) for v in shap_values] + pixel_values = pixel_values.reshape(1, *pixel_values.shape) + + # make sure labels + if labels is not None: + labels = np.array(labels) + if (labels.shape[0] != shap_values[0].shape[0] and + labels.shape[0] == len(shap_values)): + labels = np.tile(np.array([labels]), shap_values[0].shape[0]) + assert labels.shape[0] == shap_values[0].shape[0], \ + "Labels must have same row count as shap_values arrays" + if multi_output: + assert labels.shape[1] == len(shap_values), \ + "Labels must have a column for each output in shap_values" + else: + assert len(labels.shape) == 1, \ + "Labels must be a vector for single output shap_values" + + label_kwargs = {} if labelpad is None else {'pad': labelpad} + + # plot our explanations + x = pixel_values + fig_size = np.array([3 * (len(shap_values) + 1), + 2.5 * (x.shape[0] + 1)]) + if fig_size[0] > width: + fig_size *= width / fig_size[0] + fig, axes = pl.subplots(nrows=x.shape[0], + ncols=len(shap_values) + 1, + figsize=fig_size) + if len(axes.shape) == 1: + axes = axes.reshape(1, axes.size) + for row in range(x.shape[0]): + x_curr = x[row].copy() + + # make sure we have a 2D array for grayscale + if len(x_curr.shape) == 3 and x_curr.shape[2] == 1: + x_curr = x_curr.reshape(x_curr.shape[:2]) + if x_curr.max() > 1: + try: + x_curr /= 255. + except Exception: + # In-place divide can fail for certain types + x_curr = x_curr / 255. 
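+ # The weights 0.2989/0.5870/0.1140 below are the ITU-R BT.601 luma + # coefficients, the standard rgb-to-gray conversion.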
+ + # get a grayscale version of the image + if len(x_curr.shape) == 3 and x_curr.shape[2] == 3: + x_curr_gray = (0.2989 * x_curr[:, :, 0] + + 0.5870 * x_curr[:, :, 1] + + 0.1140 * x_curr[:, :, 2]) # rgb to gray + x_curr_disp = x_curr + elif len(x_curr.shape) == 3: + x_curr_gray = x_curr.mean(2) + + # for non-RGB multi-channel data + flat_vals = x_curr.reshape([x_curr.shape[0] * x_curr.shape[1], + x_curr.shape[2]]).T + flat_vals = (flat_vals.T - flat_vals.mean(1)).T + means = kmeans(flat_vals, 3, round_values=False) + means = means.data.T.reshape([x_curr.shape[0], + x_curr.shape[1], 3]) + x_curr_disp = ((means - np.percentile(means, 0.5, (0, 1))) / + (np.percentile(means, 99.5, (0, 1)) - + np.percentile(means, 1, (0, 1)))) + x_curr_disp[x_curr_disp > 1] = 1 + x_curr_disp[x_curr_disp < 0] = 0 + else: + x_curr_gray = x_curr + x_curr_disp = x_curr + + axes[row, 0].imshow(x_curr_disp, cmap=pl.get_cmap('gray')) + axes[row, 0].axis('off') + s_vals = shap_values + s_range = range(len(s_vals)) + if len(shap_values[0][row].shape) == 2: + abs_vals = np.stack([np.abs(s_vals[i]) for i in s_range], 0) + else: + abs_vals = np.stack([np.abs(s_vals[i].sum(-1)) + for i in s_range], 0) + abs_vals = abs_vals.flatten() + max_val = np.nanpercentile(abs_vals, 99.9) + for i in s_range: + if labels is not None: + axes[row, i + 1].set_title(labels[row, i], **label_kwargs) + sv = (s_vals[i][row] + if len(s_vals[i][row].shape) == 2 + else s_vals[i][row].sum(-1)) + axes[row, i + 1].imshow(x_curr_gray, cmap=pl.get_cmap('gray'), + alpha=0.15, + extent=(-1, sv.shape[1], + sv.shape[0], -1)) + im = axes[row, i + 1].imshow(sv, + cmap=colors.red_transparent_blue, + vmin=-max_val, + vmax=max_val) + axes[row, i + 1].axis('off') + if hspace == 'auto': + fig.tight_layout() + else: + fig.subplots_adjust(hspace=hspace) + cb = fig.colorbar(im, + ax=np.ravel(axes).tolist(), + label='SHAP value', + orientation='horizontal', + aspect=fig_size[0] / aspect) + cb.outline.set_visible(False) + s = io.BytesIO() + pl.savefig(s, format='jpg') + s.seek(0) + b64 = base64.b64encode(s.read()) + b64 = b64.decode(CommonTags.IMAGE_DECODE_UTF_FORMAT) + pl.clf() + return b64 + + def get(self): + """Get the computed explanation. + + Must be called after add and compute methods. + + :return: The computed explanations. + :rtype: + list[interpret_community.explanation.explanation.BaseExplanation] + """ + if self._explanation: + return [self._explanation] + else: + return [] + + def list(self): + """List information about the ExplainerManager. + + :return: A dictionary of properties. + :rtype: dict + """ + props = {ListProperties.MANAGER_TYPE: self.name} + if self._explanation: + props[Keys.IS_COMPUTED] = True + else: + props[Keys.IS_COMPUTED] = False + return props + + def get_data(self): + """Get explanation data + + :return: A array of ModelExplanationData. + :rtype: List[ModelExplanationData] + """ + return [self._get_interpret(i) for i in self.get()] + + def _get_interpret(self, explanation): + interpretation = ModelExplanationData() + return interpretation + + @property + def name(self): + """Get the name of the explainer manager. + + :return: The name of the explainer manager. + :rtype: str + """ + return ManagerNames.EXPLAINER + + @property + def _is_multilabel_task(self): + """Check if the task is a multilabel classification task. + + :return: True if the task is a multilabel classification task. 
+ :rtype: bool + """ + return self._task_type == ModelTask.MULTILABEL_IMAGE_CLASSIFICATION + + @property + def _is_classification_task(self): + """Check if the task is a classification task. + + :return: True if the task is a classification task. + :rtype: bool + """ + is_onelabel_task = self._task_type == ModelTask.IMAGE_CLASSIFICATION + is_multilabel_task = self._is_multilabel_task + return is_onelabel_task or is_multilabel_task + + @property + def _is_object_detection_task(self): + """Check if the task is an object detection task. + + :return: True if the task is an object detection task. + :rtype: bool + """ + return self._task_type == ModelTask.OBJECT_DETECTION + + def _get_fail_str(self): + fail = Image.new('RGB', (100, 100)) + draw = ImageDraw.Draw(fail) + font = ImageFont.load_default() + text = "saliency map could not be created" + textwidth, textheight = draw.textsize(text, font) + x = (fail.width - textwidth) // 2 + y = (fail.height - textheight) // 2 + draw.text((x, y), text, fill="white", font=font) + fail.show() + fail.save('fail.jpg') + image = get_image_from_path("fail.jpg", "RGB") + jpg_img = cv2.imencode('.jpg', image) + return base64.b64encode(jpg_img[1]).decode('utf-8') + + def _save(self, path): + """Save the ExplainerManager to the given path. + + :param path: The directory path to save the ExplainerManager to. + :type path: str + """ + top_dir = Path(path) + top_dir.mkdir(parents=True, exist_ok=True) + if self._is_added: + directory_manager = DirectoryManager(parent_directory_path=path) + data_directory = directory_manager.create_data_directory() + + # save the explanation + if self._explanation: + with open(data_directory / ManagerNames.EXPLAINER, 'wb') as f: + pickle.dump(self._explanation, f, protocol=4) + + meta = {IS_RUN: self._is_run, + IS_ADDED: self._is_added} + with open(data_directory / META_JSON, 'w') as file: + json.dump(meta, file) + + @staticmethod + def _load(path, rai_insights): + """Load the ExplainerManager from the given path. + + :param path: The directory path to load the ExplainerManager from. + :type path: str + :param rai_insights: The loaded parent RAIInsights. + :type rai_insights: RAIInsights + :return: The ExplainerManager manager after loading. 
+ :rtype: ExplainerManager + """ + # create the ExplainerManager without any properties using the __new__ + # function, similar to pickle + inst = ExplainerManager.__new__(ExplainerManager) + + all_cf_dirs = DirectoryManager.list_sub_directories(path) + if len(all_cf_dirs) != 0: + directory_manager = DirectoryManager( + parent_directory_path=path, + sub_directory_name=all_cf_dirs[0]) + data_directory = directory_manager.get_data_directory() + + with open(data_directory / META_JSON, 'r') as meta_file: + meta = meta_file.read() + meta = json.loads(meta) + inst.__dict__['_' + IS_RUN] = meta[IS_RUN] + inst.__dict__['_' + IS_ADDED] = meta[IS_ADDED] + + inst.__dict__[EXPLANATION] = None + explanation_path = data_directory / ManagerNames.EXPLAINER + if explanation_path.exists(): + with open(explanation_path, 'rb') as f: + explanation = pickle.load(f) + inst.__dict__[EXPLANATION] = explanation + else: + inst.__dict__['_' + IS_RUN] = False + inst.__dict__['_' + IS_ADDED] = False + inst.__dict__[EXPLANATION] = None + + wrapped_model = wrap_model(rai_insights.model, rai_insights.test, + rai_insights.task_type, + classes=rai_insights._classes) + inst.__dict__['_' + MODEL] = wrapped_model + inst.__dict__['_' + CLASSES] = rai_insights._classes + inst.__dict__[_MAX_EVALS] = rai_insights.max_evals + inst.__dict__[_NUM_MASKS] = rai_insights.num_masks + inst.__dict__[_MASK_RES] = rai_insights.mask_res + target_column = rai_insights.target_column + if not isinstance(target_column, list): + target_column = [target_column] + test = rai_insights.test.drop(columns=target_column) + inst.__dict__[U_EVALUATION_EXAMPLES] = test + inst.__dict__['_' + FEATURES] = list(test.columns) + inst.__dict__['_' + TASK_TYPE] = rai_insights.task_type + + return inst diff --git a/responsibleai_vision/responsibleai_vision/rai_vision_insights/__init__.py b/responsibleai_vision/responsibleai_vision/rai_vision_insights/__init__.py new file mode 100644 index 0000000000..2a69f32304 --- /dev/null +++ b/responsibleai_vision/responsibleai_vision/rai_vision_insights/__init__.py @@ -0,0 +1,9 @@ +# Copyright (c) Microsoft Corporation +# Licensed under the MIT License. + +"""Implementation of RAI Vision Insights API.""" + +from responsibleai_vision.rai_vision_insights.rai_vision_insights import \ + RAIVisionInsights + +__all__ = ['RAIVisionInsights'] diff --git a/responsibleai_vision/responsibleai_vision/rai_vision_insights/rai_vision_insights.py b/responsibleai_vision/responsibleai_vision/rai_vision_insights/rai_vision_insights.py new file mode 100644 index 0000000000..d53eb8d7ee --- /dev/null +++ b/responsibleai_vision/responsibleai_vision/rai_vision_insights/rai_vision_insights.py @@ -0,0 +1,1149 @@ +# Copyright (c) Microsoft Corporation +# Licensed under the MIT License. 
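Both managers above restore themselves in `_load` without invoking `__init__`: they allocate with `__new__` and rehydrate `__dict__` from the serialized state, mirroring how pickle reconstructs objects. A stripped-down sketch of the pattern follows; the `Manager` class and its fields are illustrative only, not part of the package:

```python
import json
from pathlib import Path


class Manager:
    """Illustrative stand-in for the explainer/error-analysis managers."""

    def __init__(self, model):
        self._model = model
        self._is_run = False

    def _save(self, path):
        data_dir = Path(path)
        data_dir.mkdir(parents=True, exist_ok=True)
        with open(data_dir / 'meta.json', 'w') as f:
            json.dump({'is_run': self._is_run}, f)

    @staticmethod
    def _load(path, model):
        # Allocate without running __init__, similar to pickle, then
        # restore state directly onto the instance __dict__.
        inst = Manager.__new__(Manager)
        with open(Path(path) / 'meta.json') as f:
            meta = json.load(f)
        inst.__dict__['_is_run'] = meta['is_run']
        # Non-serializable members (the wrapped model) are reattached
        # by the caller rather than persisted to disk.
        inst.__dict__['_model'] = model
        return inst
```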
+ +"""Defines the RAIVisionInsights class.""" + +import base64 +import io +import json +import os +import pickle +import shutil +import warnings +from enum import Enum +from pathlib import Path +from typing import Any, Optional + +import matplotlib.pyplot as pl +import numpy as np +import pandas as pd +import torch +from ml_wrappers import wrap_model +from torchmetrics.detection.mean_ap import MeanAveragePrecision + +from erroranalysis._internal.cohort_filter import FilterDataWithCohortFilters +from raiutils.data_processing import convert_to_list +from raiutils.models.model_utils import SKLearn +from responsibleai._interfaces import Dataset, RAIInsightsData +from responsibleai._internal.constants import (ManagerNames, Metadata, + SerializationAttributes) +from responsibleai.exceptions import UserConfigValidationException +from responsibleai.feature_metadata import FeatureMetadata +from responsibleai.rai_insights.rai_base_insights import RAIBaseInsights +from responsibleai.serialization_utilities import serialize_json_safe +from responsibleai_vision.common.constants import (CommonTags, + ExplainabilityDefaults, + ImageColumns, + MLFlowSchemaLiterals, + ModelTask) +from responsibleai_vision.managers.error_analysis_manager import \ + ErrorAnalysisManager +from responsibleai_vision.managers.explainer_manager import ExplainerManager +from responsibleai_vision.utils.feature_extractors import extract_features +from responsibleai_vision.utils.image_reader import ( + get_base64_string_from_path, get_image_from_path, is_automl_image_model) +from responsibleai_vision.utils.image_utils import ( + convert_images, get_images, transform_object_detection_labels) + +IMAGE = ImageColumns.IMAGE.value +IMAGE_URL = ImageColumns.IMAGE_URL.value +DEFAULT_MAX_EVALS = ExplainabilityDefaults.DEFAULT_MAX_EVALS +DEFAULT_NUM_MASKS = ExplainabilityDefaults.DEFAULT_NUM_MASKS +DEFAULT_MASK_RES = ExplainabilityDefaults.DEFAULT_MASK_RES +_IMAGE_MODE = 'image_mode' +_IMAGE_DOWNLOADER = 'image_downloader' +_IMAGE_WIDTH = 'image_width' +_MAX_EVALS = 'max_evals' +_NUM_MASKS = 'num_masks' +_MASK_RES = 'mask_res' +_PREDICTIONS = 'predictions' +_TEST = 'test' +_TARGET_COLUMN = 'target_column' +_TASK_TYPE = 'task_type' +_CLASSES = 'classes' +_META_JSON = Metadata.META_JSON +_JSON_EXTENSION = '.json' +_PREDICT = 'predict' +_PREDICT_PROBA = 'predict_proba' +_EXT_TEST = '_ext_test' +_EXT_FEATURES = '_ext_features' +_MODEL = Metadata.MODEL +_MODEL_PKL = _MODEL + '.pkl' +_SERIALIZER = 'serializer' +_TRANSFORMATIONS = 'transformations' +_MLTABLE_DIR = 'mltables' +_MLTABLE_METADATA_FILENAME = 'metadata.json' +_TEST_MLTABLE_PATH = 'test_mltable_path' +_FEATURE_METADATA = Metadata.FEATURE_METADATA +_IDENTITY_FEATURE_NAME = 'identity_feature_name' +_DATETIME_FEATURES = 'datetime_features' +_TIME_SERIES_ID_FEATURES = 'time_series_id_features' +_CATEGORICAL_FEATURES = 'categorical_features' +_DROPPED_FEATURES = 'dropped_features' + + +def reshape_image(image): + """Reshape image to have one extra dimension for rows. + + :param image: Image to reshape. + :type image: numpy.ndarray + :return: Reshaped image. + :rtype: numpy.ndarray + """ + image_shape_len = len(image.shape) + if image_shape_len != 2 and image_shape_len != 3: + raise ValueError('Image must have 2 or 3 dimensions') + return np.expand_dims(image, axis=0) + + +class RAIVisionInsights(RAIBaseInsights): + """Defines the top-level RAIVisionInsights API. + + Use RAIVisionInsights to assess vision machine learning models in a + single API. 
+ """ + + def __init__(self, model: Any, + test: pd.DataFrame, + target_column: str, task_type: str, + classes: Optional[np.ndarray] = None, + serializer: Optional[Any] = None, + maximum_rows_for_test: int = 5000, + image_mode: str = "RGB", + test_data_path: Optional[str] = None, + transformations: Optional[Any] = None, + image_downloader: Optional[Any] = None, + feature_metadata: Optional[FeatureMetadata] = None, + image_width: Optional[float] = None, + max_evals: Optional[int] = DEFAULT_MAX_EVALS, + num_masks: Optional[int] = DEFAULT_NUM_MASKS, + mask_res: Optional[int] = DEFAULT_MASK_RES): + """Creates an RAIVisionInsights object. + + :param model: The model to compute RAI insights for. + A model that implements sklearn.predict or sklearn.predict_proba + or function that accepts a 2d ndarray. + :type model: object + :param test: The test dataframe including the label column. + :type test: pd.DataFrame + :param target_column: The name of the label column or list of columns. + This is a list of columns for multilabel models. + :type target_column: str or list[str] + :param task_type: The task to run. + :type task_type: str + :param classes: The class labels in the dataset. + :type classes: numpy.ndarray + :param serializer: Picklable custom serializer with save and load + methods for custom model serialization. + The save method writes the model to file given a parent directory. + The load method returns the deserialized model from the same + parent directory. + :type serializer: object + :param maximum_rows_for_test: Limit on size of test data + (for performance reasons) + :type maximum_rows_for_test: int + :param image_mode: The mode to open the image in. + See pillow documentation for all modes: + https://pillow.readthedocs.io/en/stable/handbook/concepts.html + :type image_mode: str + :param test_data_path: The path to the test data. + :type test_data_path: str + :param transformations: The transformations to apply to the image. + This must be a callable or a string column name with + transformed images. + :type transformations: object + :param image_downloader: The image downloader to use to download + images from a URL. + :type image_downloader: object + :param feature_metadata: Feature metadata for the dataset + to identify different kinds of features. + :type feature_metadata: Optional[FeatureMetadata] + :param image_width: The width to resize the image to. + The size is in inches. Note larger resolutions in + dashboard can cause slowness and memory errors. + If not specified does not resize images. + :type image_width: float + :param max_evals: The maximum number of evaluations to run. + Used by shap hierarchical image explainer. + If not specified defaults to 100. + :type max_evals: int + :param num_masks: The number of masks to use for the + DRISE image explainer for object detection. + If not specified defaults to 50. + :type num_masks: int + :param mask_res: The resolution of the masks to use for the + DRISE image explainer for object detection. + If not specified defaults to 4. 
+ :type mask_res: int + """ + # drop index as this can cause issues later like when copying + # target column below from test dataset to _ext_test_df + test = test.reset_index(drop=True) + if feature_metadata is None: + # initialize to avoid having to keep checking if it is None + feature_metadata = FeatureMetadata() + self._feature_metadata = feature_metadata + self.image_mode = image_mode + self.image_width = image_width + if max_evals is None: + max_evals = DEFAULT_MAX_EVALS + elif max_evals < 1: + raise ValueError('max_evals must be greater than 0') + if num_masks is None: + num_masks = DEFAULT_NUM_MASKS + elif num_masks < 1: + raise ValueError('num_masks must be greater than 0') + if mask_res is None: + mask_res = DEFAULT_MASK_RES + elif mask_res < 1: + raise ValueError('mask_res must be greater than 0') + self.max_evals = max_evals + self.num_masks = num_masks + self.mask_res = mask_res + self.test_mltable_path = test_data_path + self._transformations = transformations + self._image_downloader = image_downloader + sample = test.iloc[0:2] + sample = get_images(sample, self.image_mode, self._transformations) + self._wrapped_model = wrap_model( + model, sample, task_type, classes=classes) + + # adding this field to use in _get_single_image and _save_predictions + self._task_type = task_type + + self.automl_image_model = is_automl_image_model(self._wrapped_model) + + self._validate_rai_insights_input_parameters( + model=self._wrapped_model, test=test, + target_column=target_column, task_type=task_type, + classes=classes, + serializer=serializer, + maximum_rows_for_test=maximum_rows_for_test) + self._classes = RAIVisionInsights._get_classes( + task_type=task_type, + test=test, + target_column=target_column, + classes=classes + ) + self.predict_output = None + if task_type == ModelTask.OBJECT_DETECTION: + test = transform_object_detection_labels( + test, target_column, self._classes) + super(RAIVisionInsights, self).__init__( + model, None, test, target_column, task_type, + serializer) + + ext_test, ext_features = extract_features( + self.test, self.target_column, self.task_type, + self.image_mode, + self._feature_metadata.dropped_features) + self._ext_test = ext_test + self._ext_features = ext_features + + self._ext_test_df = pd.DataFrame(ext_test, columns=ext_features) + self._ext_test_df[target_column] = test[target_column] + self._initialize_managers() + + def _initialize_managers(self): + """Initializes the managers. + + Initializes the explainer manager. 
+ """ + self._explainer_manager = ExplainerManager( + self._wrapped_model, self.test, + self.target_column, + self.task_type, + self._classes, + self.image_mode, + self.max_evals, + self.num_masks, + self.mask_res) + self._error_analysis_manager = ErrorAnalysisManager( + self._wrapped_model, self.test, self._ext_test_df, + self.target_column, + self.task_type, + self.image_mode, + self._transformations, + self._classes, + self._feature_metadata.categorical_features) + self._managers = [self._explainer_manager, + self._error_analysis_manager] + + def compute(self, **kwargs): + """Calls compute on each of the managers.""" + for manager in self._managers: + manager.compute(**kwargs) + + @staticmethod + def _get_classes(task_type, test, target_column, classes): + if task_type == ModelTask.IMAGE_CLASSIFICATION: + if classes is None: + classes = test[target_column].unique() + # sort the classes after calling unique in numeric case + classes.sort() + return classes + else: + return classes + elif task_type == ModelTask.MULTILABEL_IMAGE_CLASSIFICATION: + if classes is None: + return target_column + else: + return classes + elif task_type == ModelTask.OBJECT_DETECTION: + return classes + else: + return classes + + def _validate_rai_insights_input_parameters( + self, model: Any, test: pd.DataFrame, + target_column: str, task_type: str, + classes: np.ndarray, + serializer, + maximum_rows_for_test: int): + """Validate the inputs for the RAIVisionInsights constructor. + + :param model: The model to compute RAI insights for. + A model that implements sklearn.predict or sklearn.predict_proba + or function that accepts a 2d ndarray. + :type model: object + :param test: The test dataset including the label column. + :type test: pandas.DataFrame + :param target_column: The name of the label column. + :type target_column: str + :param task_type: The task to run, can be `classification` or + `regression`. + :type task_type: str + :param classes: The class labels in the dataset. + :type classes: numpy.ndarray + :param serializer: Picklable custom serializer with save and load + methods defined for model that is not serializable. The save + method returns a dictionary state and load method returns the + model. + :type serializer: object + :param maximum_rows_for_test: Limit on size of test data + (for performance reasons) + :type maximum_rows_for_test: int + """ + valid_tasks = [ + ModelTask.IMAGE_CLASSIFICATION.value, + ModelTask.MULTILABEL_IMAGE_CLASSIFICATION.value, + ModelTask.OBJECT_DETECTION.value + ] + + if task_type not in valid_tasks: + message = (f"Unsupported task type '{task_type}'. " + f"Should be one of {valid_tasks}") + raise UserConfigValidationException(message) + + if model is None: + warnings.warn( + 'INVALID-MODEL-WARNING: No valid model is supplied. ' + 'Explanations will not work') + + if serializer is not None: + if not hasattr(serializer, 'save'): + raise UserConfigValidationException( + 'The serializer does not implement save()') + + if not hasattr(serializer, 'load'): + raise UserConfigValidationException( + 'The serializer does not implement load()') + + try: + pickle.dumps(serializer) + except Exception: + raise UserConfigValidationException( + 'The serializer should be serializable via pickle') + + test_is_pd = isinstance(test, pd.DataFrame) + if not test_is_pd: + raise UserConfigValidationException( + "Unsupported data type for test dataset. " + "Expecting pandas DataFrame." 
+ ) + + if test.shape[0] > maximum_rows_for_test: + msg_fmt = 'The test data has {0} rows, ' +\ + 'but limit is set to {1} rows. ' +\ + 'Please resample the test data or ' +\ + 'adjust maximum_rows_for_test' + raise UserConfigValidationException( + msg_fmt.format( + test.shape[0], maximum_rows_for_test) + ) + + if task_type == ModelTask.MULTILABEL_IMAGE_CLASSIFICATION.value: + if not isinstance(target_column, list): + raise UserConfigValidationException( + 'The target_column should be a list for multilabel ' + 'classification') + # check all target columns are present in test dataset + target_columns_set = set(target_column) + if not target_columns_set.issubset(set(test.columns)): + raise UserConfigValidationException( + 'The list of target_column(s) should be in test data') + else: + if target_column not in list(test.columns): + raise UserConfigValidationException( + 'Target name {0} not present in test data'.format( + target_column) + ) + + if model is not None: + # Pick one row from test data + test_img = self._get_single_image(test, target_column) + # Call the model + try: + model.predict(test_img) + except Exception: + raise UserConfigValidationException( + 'The model passed cannot be used for' + ' getting predictions via predict()' + ) + + def _get_single_image(self, dataset, target_column): + """Get a single image from the test data. + + Used for calling predict on the dataset. + + :param dataset: The dataset to get the image from. + :type dataset: pandas.DataFrame + :param target_column: The name of the label column. + :type target_column: str + :return: A single image from the test data + :rtype: numpy.ndarray + """ + # Pick one row from dataset + if not isinstance(target_column, list): + target_column = [target_column] + img = dataset.drop( + target_column, axis=1).iloc[0][0] + if isinstance(img, str): + if self.automl_image_model: + if self._task_type == ModelTask.OBJECT_DETECTION: + img_data, img_size = get_base64_string_from_path( + img, return_image_size=True) + img = pd.DataFrame( + data=[[img_data, img_size]], + columns=[ + MLFlowSchemaLiterals.INPUT_COLUMN_IMAGE, + MLFlowSchemaLiterals.INPUT_IMAGE_SIZE], + ) + else: + img = pd.DataFrame( + data=[get_base64_string_from_path(img)], + columns=[MLFlowSchemaLiterals.INPUT_COLUMN_IMAGE], + ) + return img + else: + img = get_image_from_path(img, self.image_mode) + # apply a transformation if the image is an RGBA image + if img[0][0].size == 4: + row, col, ch = img.shape + if ch == 4: + rgb = np.zeros((row, col, 3), dtype='float32') + r, g, b = img[:, :, 0], img[:, :, 1], img[:, :, 2] + a = np.asarray(img[:, :, 3], dtype='float32') / 255.0 + + rgb[:, :, 0] = r * a + (1.0 - a) * 255.0 + rgb[:, :, 1] = g * a + (1.0 - a) * 255.0 + rgb[:, :, 2] = b * a + (1.0 - a) * 255.0 + img = rgb + return reshape_image(img) + + def get_filtered_test_data(self, filters, composite_filters, + include_original_columns_only=False, + use_entire_test_data=False): + """Get the filtered test data based on cohort filters. + + :param filters: The filters to apply. + :type filters: list[Filter] + :param composite_filters: The composite filters to apply. + :type composite_filters: list[CompositeFilter] + :param include_original_columns_only: Whether to return the original + data columns. + :type include_original_columns_only: bool + :param use_entire_test_data: Whether to use entire test set for + filtering the data based on cohort. + :type use_entire_test_data: bool + :return: The filtered test data. 
+ :rtype: pandas.DataFrame + """ + model_analyzer = self._error_analysis_manager._analyzer + dataset = model_analyzer.dataset + model = model_analyzer.model + if self.predict_output is None: + # Cache predictions of the model + self.predict_output = model_analyzer.model.predict(dataset) + pred_y = self.predict_output + true_y = model_analyzer.true_y + categorical_features = model_analyzer.categorical_features + categories = model_analyzer.categories + classes = model_analyzer.classes + model_task = model_analyzer.model_task + + filter_data_with_cohort = FilterDataWithCohortFilters( + model=model, + dataset=dataset, + features=dataset.columns, + categorical_features=categorical_features, + categories=categories, + true_y=true_y, + pred_y=pred_y, + model_task=model_task, + classes=classes) + + return filter_data_with_cohort.filter_data_from_cohort( + filters=filters, + composite_filters=composite_filters, + include_original_columns_only=include_original_columns_only) + + @property + def error_analysis(self) -> ErrorAnalysisManager: + """Get the error analysis manager. + :return: The error analysis manager. + :rtype: ErrorAnalysisManager + """ + return self._error_analysis_manager + + @property + def explainer(self) -> ExplainerManager: + """Get the explainer manager. + :return: The explainer manager. + :rtype: ExplainerManager + """ + return self._explainer_manager + + def get_data(self): + """Get all data as RAIInsightsData object + + :return: Model Analysis Data + :rtype: RAIInsightsData + """ + data = RAIInsightsData() + dataset = self._get_dataset() + data.dataset = dataset + data.errorAnalysisData = self.error_analysis.get_data() + return data + + def _get_dataset(self): + dashboard_dataset = Dataset() + tasktype = self.task_type + classification_tasks = [ModelTask.IMAGE_CLASSIFICATION, + ModelTask.MULTILABEL_IMAGE_CLASSIFICATION, + ModelTask.OBJECT_DETECTION] + is_classification_task = self.task_type in classification_tasks + if isinstance(self.task_type, Enum): + tasktype = self.task_type.value + dashboard_dataset.task_type = tasktype + categorical_features = self._feature_metadata.categorical_features + if categorical_features is None: + categorical_features = [] + dashboard_dataset.categorical_features = categorical_features + dashboard_dataset.class_names = convert_to_list( + self._classes) + + if is_classification_task: + if self.automl_image_model: + dataset = np.array(self.test.drop( + [self.target_column], axis=1).iloc[:, 0].tolist()) + + if tasktype == ModelTask.OBJECT_DETECTION.value: + dataset = pd.DataFrame( + data=[[x for x in get_base64_string_from_path( + img_path, return_image_size=True)] for + img_path in dataset], + columns=[ + MLFlowSchemaLiterals.INPUT_COLUMN_IMAGE, + MLFlowSchemaLiterals.INPUT_IMAGE_SIZE], + ) + else: + dataset = pd.DataFrame( + data=[ + get_base64_string_from_path(img_path) + for img_path in dataset + ], + columns=[MLFlowSchemaLiterals.INPUT_COLUMN_IMAGE], + ) + else: + dataset = get_images(self.test, self.image_mode, + self._transformations) + else: + raise ValueError('Unknown task type: {}'.format(self.task_type)) + predicted_y = None + if dataset is not None and self._wrapped_model is not None: + try: + predicted_y = self._wrapped_model.predict(dataset) + except Exception as ex: + msg = ('Model does not support predict method for given ' + 'dataset type') + raise ValueError(msg) from ex + try: + predicted_y = convert_to_list(predicted_y) + except Exception as ex: + raise ValueError( + 'Model prediction output of unsupported type,') from ex + if 
predicted_y is not None:
+            if is_classification_task:
+                predicted_y = self._convert_labels(
+                    predicted_y, dashboard_dataset.class_names)
+            dashboard_dataset.predicted_y = predicted_y
+            if tasktype == ModelTask.OBJECT_DETECTION.value:
+                dashboard_dataset.object_detection_predicted_y = predicted_y
+        row_length = len(dataset)
+
+        dashboard_dataset.features = self._ext_test
+
+        true_y = self.test[self.target_column]
+        if true_y is not None and len(true_y) == row_length:
+            true_y = convert_to_list(true_y)
+            if is_classification_task:
+                true_y = self._convert_labels(
+                    true_y, dashboard_dataset.class_names)
+            dashboard_dataset.true_y = true_y
+            if tasktype == ModelTask.OBJECT_DETECTION.value:
+                dashboard_dataset.object_detection_true_y = true_y
+
+        dashboard_dataset.feature_names = self._ext_features
+        dashboard_dataset.target_column = self.target_column
+
+        column_names = list(self.test.columns)
+        if IMAGE in column_names:
+            images = self.test[IMAGE]
+        elif IMAGE_URL in column_names:
+            images = self.test[IMAGE_URL]
+        else:
+            raise ValueError('No image column found in test data')
+        encoded_images = []
+        image_dimensions = []
+
+        for image in images:
+            if isinstance(image, str):
+                image = get_image_from_path(image, self.image_mode)
+            s = io.BytesIO()
+            # imshow only accepts floats in the range [0, 1]
+            try:
+                image /= 255
+            except Exception:
+                # in-place divide can fail for certain types
+                image = image / 255
+            axes = pl.gca()
+            axes.get_xaxis().set_visible(False)
+            axes.get_yaxis().set_visible(False)
+
+            pl.imshow(image)
+            # resize the image as an optimization
+            size = pl.gcf().get_size_inches()
+            curr_width = size[0]
+            curr_height = size[1]
+            image_dimensions.append([image.shape[1], image.shape[0]])
+            new_width = self.image_width
+            if new_width is not None:
+                factor = new_width / curr_width
+                pl.gcf().set_size_inches((new_width, curr_height * factor))
+            pl.savefig(s, format='jpg', bbox_inches='tight', pad_inches=0.)
+            pl.clf()
+            s.seek(0)
+            b64_encoded = base64.b64encode(s.read())
+            b64 = b64_encoded.decode(CommonTags.IMAGE_DECODE_UTF_FORMAT)
+            encoded_images.append(b64)
+
+        # passed to the frontend to draw bounding boxes at the correct scale
+        dashboard_dataset.imageDimensions = image_dimensions
+
+        if len(encoded_images) > 0:
+            dashboard_dataset.images = encoded_images
+
+        if tasktype == ModelTask.OBJECT_DETECTION.value:
+            d = dashboard_dataset
+            dashboard_dataset.object_detection_predicted_y = d.predicted_y
+            dashboard_dataset.object_detection_true_y = d.true_y
+            dashboard_dataset.predicted_y = self._format_od_labels(
+                dashboard_dataset.predicted_y,
+                class_names=dashboard_dataset.class_names
+            )
+
+            dashboard_dataset.true_y = self._format_od_labels(
+                dashboard_dataset.true_y,
+                class_names=dashboard_dataset.class_names
+            )
+
+        return dashboard_dataset
+
+    def _format_od_labels(self, y, class_names):
+        """Converts object detection labels to a multilabel image
+        classification representation, following the UI format defined
+        in fridgeMultilabel.ts.
+
+        :param y: Target array
+        :type y: list
+        :param class_names: The class labels in the dataset.
+        :type class_names: list
+        :return: Formatted list of targets
+        :rtype: list
+        """
+        formatted_labels = []
+
+        for image in y:
+            object_labels_lst = [0] * len(class_names)
+            for detection in image:
+                # count the occurrences of each class in the image
+                # (class ids are 1-indexed, see classes_to_dict)
+                object_labels_lst[int(detection[0] - 1)] += 1
+            formatted_labels.append(object_labels_lst)
+
+        return formatted_labels
+
+    def _convert_images(self, dataset):
+        """Converts the images to the format required by the model.
+
+        If the images are referenced by file path or URL, they are loaded
+        and converted to numpy arrays. If the images are already numpy
+        arrays, they are returned as is.
+
+        :param dataset: The dataset to convert.
+        :type dataset: numpy.ndarray
+        :return: The converted dataset.
+        :rtype: numpy.ndarray
+        """
+        return convert_images(dataset, self.image_mode)
+
+    def _convert_images_base64_df(self, dataset: pd.DataFrame) -> pd.DataFrame:
+        """Converts the images to the format required by the model.
+
+        If the images are referenced by file path, they are encoded as
+        base64 strings. Images that are not path strings are returned
+        as is.
+
+        :param dataset: The dataset to convert.
+        :type dataset: pandas.DataFrame
+        :return: The base64 converted dataset.
+        :rtype: pandas.DataFrame
+        """
+        if len(dataset) > 0 and isinstance(dataset[0], str):
+            dataset.loc[:, ImageColumns.IMAGE.value] = dataset.loc[
+                :, ImageColumns.IMAGE.value
+            ].map(lambda x: get_base64_string_from_path(x))
+        return dataset
+
+    def save(self, path):
+        """Save the RAIVisionInsights to the given path.
+
+        In addition to the usual data, saves the extracted features.
+
+        :param path: The directory path to save the RAIVisionInsights to.
+        :type path: str
+        """
+        super(RAIVisionInsights, self).save(path)
+        # Save extracted features data
+        self._save_ext_data(path)
+        self._save_transformations(path)
+        self._save_image_downloader(path)
+
+    def _save_ext_data(self, path):
+        """Save the copy of the raw data and its related metadata.
+
+        :param path: The directory path to save the RAIVisionInsights to.
+        :type path: str
+        """
+        data_directory = Path(path) / SerializationAttributes.DATA_DIRECTORY
+        ext_path = data_directory / (_EXT_TEST + _JSON_EXTENSION)
+        ext_features_path = data_directory / (_EXT_FEATURES + _JSON_EXTENSION)
+        self._save_list_data(ext_path, self._ext_test)
+        self._save_list_data(ext_features_path, self._ext_features)
+
+        if self._image_downloader:
+            mltable_directory = data_directory / _MLTABLE_DIR
+            os.makedirs(mltable_directory, exist_ok=True)
+            mltable_data_dict = {}
+            if self.test_mltable_path:
+                mltable_dir = self.test_mltable_path.split('/')[-1]
+                mltable_data_dict[_TEST_MLTABLE_PATH] = mltable_dir
+                test_dir = mltable_directory / mltable_dir
+                shutil.copytree(
+                    Path(self.test_mltable_path), test_dir
+                )
+            if mltable_data_dict:
+                dict_path = mltable_directory / _MLTABLE_METADATA_FILENAME
+                with open(dict_path, 'w') as file:
+                    json.dump(
+                        mltable_data_dict, file, default=serialize_json_safe)
+
+    def _save_transformations(self, path):
+        """Save the transformations to the given path using pickle.
+
+        :param path: The directory path to save the transformations to.
+        :type path: str
+        """
+        if self._transformations is not None:
+            transformations_path = Path(path) / _TRANSFORMATIONS
+            with open(transformations_path, 'wb') as f:
+                pickle.dump(self._transformations, f)
+
+    def _save_image_downloader(self, path):
+        """Save the image downloader to the given path using pickle.
+
+        :param path: The directory path to save the image downloader to.
+ :type path: str + """ + if self._image_downloader is not None: + image_downloader_path = Path(path) / _IMAGE_DOWNLOADER + with open(image_downloader_path, 'wb') as f: + pickle.dump(self._image_downloader, f) + + def _save_list_data(self, data_path, data): + """Save the list data to the given path. + + :param data_path: The path to save the data to. + :type data_path: str + :param data: The data to save. + :type data: list + """ + with open(data_path, 'w') as file: + json.dump(data, file, default=serialize_json_safe) + + def _convert_labels(self, labels, class_names, unique_labels=None): + """Convert labels to indexes if possible. + + :param labels: Labels to convert. + :type labels: list or numpy.ndarray + :param class_names: List of class names. + :type class_names: list + :param unique_labels: List of unique labels. + :type unique_labels: list + :return: Converted labels. + :rtype: list + """ + if self.task_type == ModelTask.OBJECT_DETECTION: + return labels + + unique_labels = unique_labels or np.unique(labels).tolist() + if isinstance(labels[0], list): + return [self._convert_labels( + li, class_names, unique_labels) for li in labels] + is_boolean = all(isinstance(y, (bool)) for y in unique_labels) + if is_boolean: + labels_arr = np.array(labels) + labels = labels_arr.astype(float).tolist() + if class_names is not None: + num_types = (int, float) + is_numeric = all(isinstance(y, num_types) for y in unique_labels) + if not is_numeric: + labels = [class_names.index(y) for y in labels] + return labels + + def _save_predictions(self, path): + """Save the predict() and predict_proba() output. + + :param path: The directory path to save the RAIVisionInsights to. + :type path: str + """ + prediction_output_path = Path(path) / _PREDICTIONS + prediction_output_path.mkdir(parents=True, exist_ok=True) + + if self.model is None: + return + + if self.automl_image_model: + test = np.array( + self.test.drop([self.target_column], axis=1) + .iloc[:, 0] + .tolist() + ) + if self._task_type == ModelTask.OBJECT_DETECTION.value: + test = pd.DataFrame( + data=[[x for x in get_base64_string_from_path( + img_path, return_image_size=True)] for + img_path in test], + columns=[ + MLFlowSchemaLiterals.INPUT_COLUMN_IMAGE, + MLFlowSchemaLiterals.INPUT_IMAGE_SIZE], + ) + else: + test = pd.DataFrame( + data=[ + get_base64_string_from_path(img_path) for img_path in + test + ], + columns=[MLFlowSchemaLiterals.INPUT_COLUMN_IMAGE], + ) + else: + test = get_images( + self.test, self.image_mode, self._transformations + ) + + predict_output = self._wrapped_model.predict(test) + if type(predict_output) != list: + predict_output = predict_output.tolist() + + self._write_to_file( + prediction_output_path / (_PREDICT + _JSON_EXTENSION), + json.dumps(predict_output)) + + if hasattr(self.model, SKLearn.PREDICT_PROBA): + predict_proba_output = self.model.predict_proba(test) + if type(predict_proba_output) != list: + predict_proba_output = predict_proba_output.tolist() + + self._write_to_file( + prediction_output_path / (_PREDICT_PROBA + _JSON_EXTENSION), + json.dumps(predict_proba_output)) + + def _save_metadata(self, path): + """Save the metadata like target column, categorical features, + task type and the classes (if any). + + :param path: The directory path to save the RAIVisionInsights to. 
+ :type path: str + """ + top_dir = Path(path) + classes = convert_to_list(self._classes) + feature_metadata_dict = self._feature_metadata.to_dict() + meta = { + _TARGET_COLUMN: self.target_column, + _TASK_TYPE: self.task_type, + _CLASSES: classes, + _IMAGE_MODE: self.image_mode, + _FEATURE_METADATA: feature_metadata_dict, + _IMAGE_WIDTH: self.image_width, + _MAX_EVALS: self.max_evals, + _NUM_MASKS: self.num_masks, + _MASK_RES: self.mask_res + } + with open(top_dir / _META_JSON, 'w') as file: + json.dump(meta, file) + + @staticmethod + def _load_metadata(inst, path): + """Load the metadata. + + :param inst: RAIVisionInsights object instance. + :type inst: RAIVisionInsights + :param path: The directory path to metadata location. + :type path: str + """ + top_dir = Path(path) + with open(top_dir / _META_JSON, 'r') as meta_file: + meta = meta_file.read() + meta = json.loads(meta) + inst.__dict__[_TARGET_COLUMN] = meta[_TARGET_COLUMN] + inst.__dict__[_TASK_TYPE] = meta[_TASK_TYPE] + inst.__dict__[_IMAGE_MODE] = meta[_IMAGE_MODE] + if _IMAGE_WIDTH in meta: + inst.__dict__[_IMAGE_WIDTH] = meta[_IMAGE_WIDTH] + else: + inst.__dict__[_IMAGE_WIDTH] = None + params = [_MAX_EVALS, _NUM_MASKS, _MASK_RES] + defaults = [DEFAULT_MAX_EVALS, DEFAULT_NUM_MASKS, DEFAULT_MASK_RES] + for param, default in zip(params, defaults): + if param in meta: + inst.__dict__[param] = meta[param] + else: + inst.__dict__[param] = default + classes = meta[_CLASSES] + + inst.__dict__['_' + _CLASSES] = RAIVisionInsights._get_classes( + task_type=meta[_TASK_TYPE], + test=inst.__dict__[_TEST], + target_column=meta[_TARGET_COLUMN], + classes=classes + ) + + if (Metadata.FEATURE_METADATA not in meta or + meta[Metadata.FEATURE_METADATA] is None): + inst.__dict__['_' + Metadata.FEATURE_METADATA] = FeatureMetadata() + else: + inst.__dict__['_' + Metadata.FEATURE_METADATA] = FeatureMetadata( + identity_feature_name=meta[Metadata.FEATURE_METADATA][ + _IDENTITY_FEATURE_NAME], + datetime_features=meta[Metadata.FEATURE_METADATA][ + _DATETIME_FEATURES], + time_series_id_features=meta[Metadata.FEATURE_METADATA][ + _TIME_SERIES_ID_FEATURES], + categorical_features=meta[Metadata.FEATURE_METADATA][ + _CATEGORICAL_FEATURES], + dropped_features=meta[Metadata.FEATURE_METADATA][ + _DROPPED_FEATURES]) + + # load the image downloader as part of metadata + RAIVisionInsights._load_image_downloader(inst, path) + # load the transformations as part of metadata + RAIVisionInsights._load_transformations(inst, path) + # load the extracted features as part of metadata + RAIVisionInsights._load_ext_data(inst, path) + + @staticmethod + def _load_ext_data(inst, path): + """Load the extracted features data. + + :param inst: RAIVisionInsights object instance. + :type inst: RAIVisionInsights + :param path: The directory path to extracted data location. 
+ :type path: str + """ + top_dir = Path(path) + data_path = top_dir / SerializationAttributes.DATA_DIRECTORY + json_test_path = data_path / (_EXT_TEST + _JSON_EXTENSION) + with open(json_test_path, 'r') as file: + inst._ext_test = json.loads(file.read()) + json_features_path = data_path / (_EXT_FEATURES + _JSON_EXTENSION) + with open(json_features_path, 'r') as file: + inst._ext_features = json.loads(file.read()) + inst._ext_test_df = pd.DataFrame( + inst._ext_test, columns=inst._ext_features) + target_column = inst.target_column + test = inst.test + inst._ext_test_df[target_column] = test[target_column] + + inst.test_mltable_path = None + mltable_directory = data_path / _MLTABLE_DIR + if inst._image_downloader and len(os.listdir(mltable_directory)) > 0: + mltable_dict_path = mltable_directory / _MLTABLE_METADATA_FILENAME + mltable_dict = {} + with open(mltable_dict_path, 'r') as file: + mltable_dict = json.load(file) + + if mltable_dict.get(_TEST_MLTABLE_PATH, ''): + inst.test_mltable_path = str(mltable_directory / mltable_dict[ + _TEST_MLTABLE_PATH]) + test_dataset = inst._image_downloader(inst.test_mltable_path) + inst.test = test_dataset._images_df + + @staticmethod + def _load_transformations(inst, path): + """Load the transformations from pickle file. + + :param inst: RAIVisionInsights object instance. + :type inst: RAIVisionInsights + :param path: The directory path to transformations location. + :type path: str + """ + top_dir = Path(path) + transformations_file = top_dir / _TRANSFORMATIONS + if transformations_file.exists(): + with open(transformations_file, 'rb') as file: + inst._transformations = pickle.load(file) + else: + inst._transformations = None + + @staticmethod + def _load_image_downloader(inst, path): + """Load the image downloader from pickle file. + + :param inst: RAIVisionInsights object instance. + :type inst: RAIVisionInsights + :param path: The directory path to image downloader location. + :type path: str + """ + top_dir = Path(path) + image_downloader_file = top_dir / _IMAGE_DOWNLOADER + if image_downloader_file.exists(): + with open(image_downloader_file, 'rb') as file: + inst._image_downloader = pickle.load(file) + else: + inst._image_downloader = None + + @staticmethod + def load(path): + """Load the RAIVisionInsights from the given path. + + :param path: The directory path to load the RAIVisionInsights from. + :type path: str + :return: The RAIVisionInsights object after loading. 
+        :rtype: RAIVisionInsights
+        """
+        # create the RAIVisionInsights without any properties using the
+        # __new__ function, similar to pickle
+        inst = RAIVisionInsights.__new__(RAIVisionInsights)
+
+        manager_map = {
+            ManagerNames.EXPLAINER: ExplainerManager,
+            ManagerNames.ERROR_ANALYSIS: ErrorAnalysisManager,
+        }
+
+        # load current state
+        RAIBaseInsights._load(
+            path, inst, manager_map, RAIVisionInsights._load_metadata)
+        inst._wrapped_model = wrap_model(inst.model, inst.test,
+                                         inst.task_type,
+                                         classes=inst._classes)
+        inst.automl_image_model = is_automl_image_model(inst._wrapped_model)
+        inst.predict_output = None
+        return inst
+
+    def compute_object_detection_metrics(
+            self,
+            selection_indexes,
+            aggregate_method,
+            class_name,
+            iou_thresh):
+        """Compute mean average precision metrics for the given cohorts.
+
+        :param selection_indexes: The row indexes of each cohort.
+        :type selection_indexes: list[list[int]]
+        :param aggregate_method: The averaging method passed to
+            MeanAveragePrecision.
+        :type aggregate_method: str
+        :param class_name: The class to report AP and AR for.
+        :type class_name: str
+        :param iou_thresh: The IoU threshold, as a percentage.
+        :type iou_thresh: int
+        :return: A list of [mAP, AP, AR] per cohort, or [-1, -1, -1] when
+            the class does not occur in the cohort.
+        :rtype: list[list[float]]
+        """
+        dashboard_dataset = self.get_data().dataset
+        true_y = dashboard_dataset.object_detection_true_y
+        predicted_y = dashboard_dataset.object_detection_predicted_y
+
+        iou_thresh = [iou_thresh / 100.0]
+        all_cohort_metrics = []
+        for cohort_indices in selection_indexes:
+            metric_OD = MeanAveragePrecision(
+                class_metrics=True,
+                iou_thresholds=iou_thresh,
+                average=aggregate_method)
+            true_y_cohort = [true_y[cohort_index] for cohort_index
+                             in cohort_indices]
+            predicted_y_cohort = [predicted_y[cohort_index] for cohort_index
+                                  in cohort_indices]
+
+            pred_boxes, pred_labels, pred_scores = [], [], []
+            for pred_image in predicted_y_cohort:
+                for pred_object in pred_image:
+                    pred_labels.append(int(pred_object[0]))
+                    pred_boxes.append(pred_object[1:5])
+                    pred_scores.append(pred_object[-1])
+
+            gt_boxes, gt_labels = [], []
+            for gt_image in true_y_cohort:
+                for gt_object in gt_image:
+                    gt_labels.append(int(gt_object[0]))
+                    gt_boxes.append(gt_object[1:5])
+            # creating the list of dictionaries for pred and gt
+            cohort_pred = [
+                dict(
+                    boxes=torch.tensor(pred_boxes),
+                    scores=torch.tensor(pred_scores),
+                    labels=torch.tensor(pred_labels),
+                )
+            ]
+            cohort_gt = [
+                dict(
+                    boxes=torch.tensor(gt_boxes),
+                    labels=torch.tensor(gt_labels),
+                )
+            ]
+
+            # the cohort may not contain every class, so collect only the
+            # classes present in the predictions or the ground truth
+            classes = list(self._classes)
+            cohort_classes = list(set([classes[i - 1]
+                                       for i in pred_labels + gt_labels]))
+            cohort_classes.sort(key=lambda cname: classes.index(cname))
+            # catch the case where the class is not in the cohort
+            try:
+                index = cohort_classes.index(class_name)
+            except ValueError:
+                all_cohort_metrics.append([-1, -1, -1])
+            else:
+                metric_OD.update(cohort_pred, cohort_gt)
+                object_detection_values = metric_OD.compute()
+                mAP = round(object_detection_values['map'].item(), 2)
+                AP = round(
+                    object_detection_values['map_per_class'][index].item(), 2)
+                AR = round(
+                    object_detection_values['mar_100_per_class'][index].item(),
+                    2)
+                all_cohort_metrics.append([mAP, AP, AR])
+
+        return all_cohort_metrics
diff --git a/responsibleai_vision/responsibleai_vision/utils/__init__.py b/responsibleai_vision/responsibleai_vision/utils/__init__.py
new file mode 100644
index 0000000000..ca91131b2d
--- /dev/null
+++ b/responsibleai_vision/responsibleai_vision/utils/__init__.py
@@ -0,0 +1,4 @@
+# Copyright (c) Microsoft Corporation
+# Licensed under the MIT License.
+ +"""Contains utilities for RAI image data processing.""" diff --git a/responsibleai_vision/responsibleai_vision/utils/feature_extractors.py b/responsibleai_vision/responsibleai_vision/utils/feature_extractors.py new file mode 100644 index 0000000000..da414e4c9c --- /dev/null +++ b/responsibleai_vision/responsibleai_vision/utils/feature_extractors.py @@ -0,0 +1,59 @@ +# Copyright (c) Microsoft Corporation +# Licensed under the MIT License. + +"""Defines the feature extractors.""" + +from typing import List, Optional + +import pandas as pd +from tqdm import tqdm + +from responsibleai_vision.utils.image_reader import get_image_from_path + + +def extract_features(image_dataset: pd.DataFrame, + target_column: str, task_type: str, + image_mode: str = None, + dropped_features: Optional[List[str]] = None): + '''Extract tabular data features from the image dataset. + + :param image_dataset: A pandas dataframe containing the image data. + :type image_dataset: pandas.DataFrame + :param target_column: The name of the label column or list of columns. + This is a list of columns for multilabel models. + :type target_column: str or list[str] + :param task_type: The type of task to be performed. + :type task_type: str + :param image_mode: The mode to open the image in. + See pillow documentation for all modes: + https://pillow.readthedocs.io/en/stable/handbook/concepts.html + :type image_mode: str + :param dropped_features: The list of features to drop from the dataset. + :type dropped_features: list[str] + :return: The list of extracted features and the feature names. + :rtype: list, list + ''' + results = [] + feature_names = ["mean_pixel_value"] + column_names = image_dataset.columns + has_dropped_features = dropped_features is not None + start_meta_index = 2 + if isinstance(target_column, list): + start_meta_index = len(target_column) + 1 + for j in range(start_meta_index, image_dataset.shape[1]): + if has_dropped_features and column_names[j] in dropped_features: + continue + feature_names.append(column_names[j]) + for i in tqdm(range(image_dataset.shape[0])): + image = image_dataset.iloc[i][0] + if isinstance(image, str): + image = get_image_from_path(image, image_mode) + mean_pixel_value = image.mean() + row_feature_values = [mean_pixel_value] + # append all features other than target column and label + for j in range(start_meta_index, image_dataset.shape[1]): + if has_dropped_features and column_names[j] in dropped_features: + continue + row_feature_values.append(image_dataset.iloc[i][j]) + results.append(row_feature_values) + return results, feature_names diff --git a/responsibleai_vision/responsibleai_vision/utils/image_reader.py b/responsibleai_vision/responsibleai_vision/utils/image_reader.py new file mode 100644 index 0000000000..d551f207a0 --- /dev/null +++ b/responsibleai_vision/responsibleai_vision/utils/image_reader.py @@ -0,0 +1,84 @@ +# Copyright (c) Microsoft Corporation +# Licensed under the MIT License. + +"""Utilities for reading images.""" + +import base64 +from io import BytesIO +from typing import Any, Tuple, Union + +import requests +from numpy import asarray +from PIL import Image + +from responsibleai_vision.common.constants import (AutoMLImagesModelIdentifier, + CommonTags) + + +def get_image_from_path(image_path, image_mode): + """Get image from path. + + :param image_path: The path to the image. + :type image_path: str + :param image_mode: The mode to open the image in. 
+        See pillow documentation for all modes:
+        https://pillow.readthedocs.io/en/stable/handbook/concepts.html
+    :type image_mode: str
+    :return: The image as a numpy array.
+    :rtype: numpy.ndarray
+    """
+    image_open_pointer = image_path
+    if image_path.startswith("http://") or image_path.startswith("https://"):
+        response = requests.get(image_path)
+        image_open_pointer = BytesIO(response.content)
+    with Image.open(image_open_pointer) as im:
+        if image_mode is not None:
+            im = im.convert(image_mode)
+        image_array = asarray(im)
+    return image_array
+
+
+def get_base64_string_from_path(img_path: str,
+                                return_image_size: bool = False) \
+        -> Union[str, Tuple[str, Tuple[int, int]]]:
+    """Load an image and convert it to a base64-encoded string.
+
+    :param img_path: image path
+    :type img_path: str
+    :param return_image_size: true if image size should also be returned
+    :type return_image_size: bool
+    :return: base64-encoded image OR base64-encoded image and image size
+    :rtype: Union[str, Tuple[str, Tuple[int, int]]]
+    """
+    try:
+        img = Image.open(img_path)
+    except Exception as e:
+        # the path may be a remote URL that PIL cannot open directly, so
+        # fall back to downloading it to a local file first
+        print('could not open image path directly:', str(e))
+        import urllib.request
+        urllib.request.urlretrieve(img_path, "tempfile")
+        img = Image.open("tempfile")
+    imgio = BytesIO()
+    img.save(imgio, img.format)
+    img_str = base64.b64encode(imgio.getvalue())
+    if return_image_size:
+        return img_str.decode(CommonTags.IMAGE_DECODE_UTF_FORMAT), img.size
+    return img_str.decode(CommonTags.IMAGE_DECODE_UTF_FORMAT)
+
+
+def is_automl_image_model(model: Any) -> bool:
+    """Check whether the model is an AutoML images MLflow model.
+
+    :param model: Model object
+    :type model: supported model types
+    :return: True if AutoML image model type, else False
+    :rtype: bool
+    """
+    model_type = str(type(model))
+    automl_image_model = model_type.endswith(
+        AutoMLImagesModelIdentifier.AUTOML_IMAGE_CLASSIFICATION_MODEL
+    ) or model_type.endswith(
+        AutoMLImagesModelIdentifier.AUTOML_OBJECT_DETECTION_MODEL
+    )
+    return automl_image_model
diff --git a/responsibleai_vision/responsibleai_vision/utils/image_utils.py b/responsibleai_vision/responsibleai_vision/utils/image_utils.py
new file mode 100644
index 0000000000..24dcb1fb62
--- /dev/null
+++ b/responsibleai_vision/responsibleai_vision/utils/image_utils.py
@@ -0,0 +1,140 @@
+# Copyright (c) Microsoft Corporation
+# Licensed under the MIT License.
+
+"""Contains image handling utilities."""
+
+import numpy as np
+
+from responsibleai_vision.common.constants import ImageColumns
+from responsibleai_vision.utils.image_reader import get_image_from_path
+
+IMAGE = ImageColumns.IMAGE.value
+IMAGE_URL = ImageColumns.IMAGE_URL.value
+IMAGE_DETAILS = 'image_details'
+LABEL = 'label'
+WIDTH = 'width'
+HEIGHT = 'height'
+TOP_X = 'topX'
+TOP_Y = 'topY'
+BOTTOM_X = 'bottomX'
+BOTTOM_Y = 'bottomY'
+IS_CROWD = 'isCrowd'
+
+
+def convert_images(dataset, image_mode):
+    """Converts the images to the format required by the model.
+
+    If the images are referenced by file path or URL, they are loaded
+    and converted to numpy arrays. If the images are already numpy
+    arrays, they are returned as is.
+
+    :param dataset: The dataset to convert.
+    :type dataset: numpy.ndarray
+    :param image_mode: The mode to open the image in.
+        See pillow documentation for all modes:
+        https://pillow.readthedocs.io/en/stable/handbook/concepts.html
+    :type image_mode: str
+    :return: The converted dataset.
+ :rtype: numpy.ndarray + """ + if len(dataset) > 0 and isinstance(dataset[0], str): + try: + dataset = np.array([get_image_from_path( + x, image_mode) for x in dataset]) + except ValueError: + # if images of different sizes, try to convert one by one + jagged = np.empty(len(dataset), dtype=object) + for i, x in enumerate(dataset): + jagged[i] = get_image_from_path(x, image_mode) + dataset = jagged + return dataset + + +def get_images(dataset, image_mode, transformations=None): + """Get the images from the dataset. + + If transformations are provided as a callable, the images + are transformed. If transformations are provided as a string, + the images are retrieved from that column name in the test dataset. + + :param dataset: The dataset to get the images from. + :type dataset: numpy.ndarray + :param image_mode: The mode to open the image in. + See pillow documentation for all modes: + https://pillow.readthedocs.io/en/stable/handbook/concepts.html + :type image_mode: str + :param transformations: The transformations to apply to the images. + :type transformations: torchvision.transforms + :return: The images. + :rtype: numpy.ndarray + """ + column_names = dataset.columns + is_transformations_str = isinstance(transformations, str) + if is_transformations_str: + images = dataset[transformations] + else: + if IMAGE in column_names: + images = dataset[IMAGE] + elif IMAGE_URL in column_names: + images = dataset[IMAGE_URL] + else: + raise ValueError('No image column found in test data') + + images = np.array(images.tolist()) + converted_images = convert_images(images, image_mode) + + if not is_transformations_str and transformations is not None: + converted_images = transformations(converted_images) + + return converted_images + + +def classes_to_dict(classes): + """Converts the classes to a dictionary. + + :param classes: The classes. + :type classes: list + :return: The classes as a dictionary. + :rtype: dict + """ + return {classes[i]: i + 1 for i in range(len(classes))} + + +def transform_object_detection_labels(test, target_column, classes): + """Transforms the object detection labels to one common format. + + :param test: The test dataset. + :type test: pandas.DataFrame + :param target_column: The column containing the labels. + :type target_column: str + """ + label_dict = classes_to_dict(classes) + for i in range(len(test)): + object_labels = test[target_column][i] + image_details = None + if IMAGE_DETAILS in test: + image_details = test[IMAGE_DETAILS][i] + if len(object_labels) > 0 and isinstance(object_labels[0], dict): + if image_details: + width = image_details[WIDTH] + height = image_details[HEIGHT] + image_labels = [] + for label in object_labels: + class_name = label[LABEL] + class_id = label_dict[class_name] + + xmin = label[TOP_X] * width + ymin = label[TOP_Y] * height + + xmax = label[BOTTOM_X] * width + ymax = label[BOTTOM_Y] * height + + image_labels.append([class_id, int(xmin), int(ymin), + int(xmax), int(ymax), + int(label[IS_CROWD])]) + test[target_column][i] = image_labels + else: + invalid_msg = 'Invalid label format for conversion: ' + err = invalid_msg + 'Image details and label must be present' + raise ValueError(err) + return test diff --git a/responsibleai_vision/responsibleai_vision/version.py b/responsibleai_vision/responsibleai_vision/version.py new file mode 100644 index 0000000000..a284aebd1b --- /dev/null +++ b/responsibleai_vision/responsibleai_vision/version.py @@ -0,0 +1,8 @@ +# Copyright (c) Microsoft Corporation +# Licensed under the MIT License. 
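+
+"""Version constants for the responsibleai-vision package."""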
+ +name = 'responsibleai_vision' +_major = '0' +_minor = '2' +_patch = '2' +version = '{}.{}.{}'.format(_major, _minor, _patch) diff --git a/responsibleai_vision/setup.py b/responsibleai_vision/setup.py new file mode 100644 index 0000000000..2a3f9a6d16 --- /dev/null +++ b/responsibleai_vision/setup.py @@ -0,0 +1,44 @@ +# Copyright (c) Microsoft Corporation +# Licensed under the MIT License. + +import setuptools + +# Version will be read from version.py +version = '' +name = 'responsibleai-vision' +# Fetch Version +with open('responsibleai_vision/version.py') as f: + code = compile(f.read(), f.name, 'exec') + exec(code) + +# Fetch ReadMe +with open('README.md', 'r') as fh: + long_description = fh.read() + +# Use requirements.txt to set the install_requires +with open('requirements.txt') as f: + install_requires = [line.strip() for line in f] + +setuptools.setup( + name=name, # noqa: F821 + version=version, # noqa: F821 + author="Roman Lutz, Ilya Matiach, Ke Xu", + author_email="raiwidgets-maintain@microsoft.com", + description="SDK API to assess image " + "Machine Learning models.", + long_description=long_description, + long_description_content_type="text/markdown", + url="https://github.com/microsoft/responsible-ai-toolbox", + packages=setuptools.find_packages(), + python_requires='>=3.6', + install_requires=install_requires, + classifiers=[ + "Programming Language :: Python :: 3.6", + "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "License :: OSI Approved :: MIT License", + "Operating System :: OS Independent", + "Development Status :: 3 - Alpha" + ] +) diff --git a/responsibleai_vision/tests/common_vision_utils.py b/responsibleai_vision/tests/common_vision_utils.py new file mode 100644 index 0000000000..a3bf5fe8a2 --- /dev/null +++ b/responsibleai_vision/tests/common_vision_utils.py @@ -0,0 +1,771 @@ +# Copyright (c) Microsoft Corporation +# Licensed under the MIT License. 
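+
+"""Common utilities for loading test data and training test models."""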
+ +import copy +import json +import os +import sys +import time +import xml.etree.ElementTree as ET +from enum import Enum +from zipfile import ZipFile + +import numpy as np +import pandas as pd +import shap +import torch +import torch.nn as nn +import torch.optim as optim +import torchvision.transforms as transforms +from datasets import load_dataset +from fastai.data.transforms import Normalize +from fastai.learner import load_learner +from fastai.losses import BCEWithLogitsLossFlat +from fastai.metrics import accuracy, accuracy_multi +from fastai.vision import models as fastai_models +from fastai.vision.augment import Resize +from fastai.vision.data import ImageDataLoaders, imagenet_stats +from fastai.vision.learner import vision_learner +from PIL import Image +from sklearn.metrics import f1_score +from tensorflow.keras.applications.resnet50 import ResNet50, preprocess_input +from tensorflow.keras.models import load_model +from torch.optim import lr_scheduler +from torch.utils.data import DataLoader, Dataset +from torchvision import models as torchvision_models +from torchvision.models.detection.faster_rcnn import FastRCNNPredictor + +from raiutils.common.retries import retry_function +from responsibleai_vision.common.constants import ImageColumns + +try: + from urllib import urlretrieve +except ImportError: + from urllib.request import urlretrieve + + +CAN = 'can' +CARTON = 'carton' +MILK_BOTTLE = 'milk_bottle' +WATER_BOTTLE = 'water_bottle' +FRIDGE_MULTILABEL_TARGETS = [CAN, CARTON, MILK_BOTTLE, WATER_BOTTLE] +EPOCHS = 10 +LEARNING_RATE = 1e-4 +IM_SIZE = 300 +BATCH_SIZE = 16 +FRIDGE_MODEL_NAME = 'fridge_model' +FRIDGE_MODEL_WINDOWS_NAME = 'fridge_model_windows' +MULTILABEL_FRIDGE_MODEL_NAME = 'multilabel_fridge_model' +MULTILABEL_FRIDGE_MODEL_WINDOWS_NAME = 'multilabel_fridge_model_windows' +WIN = 'win' +IMAGE = ImageColumns.IMAGE.value +LABEL = ImageColumns.LABEL.value +TRAIN = 'train' +VAL = 'val' + + +def load_imagenet_dataset(): + """Loads the imagenet dataset. + + :return: The imagenet dataset. + :rtype: pandas.DataFrame + """ + X, y = shap.datasets.imagenet50() + # load just the first 10 images + X = X[:10] + y = y[:10] + data = pd.DataFrame( + columns=[IMAGE, LABEL], + index=range(X.shape[0])) + classes = load_imagenet_labels() + for i in range(X.shape[0]): + data.iloc[i, 0] = X[i] + if (y[i] >= 1000): + y[i] = np.random.randint(1000) + data.iloc[i, 1] = classes[int(y[i])] + return data + + +def load_flowers_dataset(upscale=False): + """Loads the flowers dataset. + + :return: The flowers dataset. + :rtype: pandas.DataFrame + """ + storage_url = 'https://publictestdatasets.blob.core.windows.net/' + container_path = 'computervision/' + if upscale: + container_path += 'upscaleFlowers/' + else: + container_path += 'smallFlowers/' + image1 = storage_url + container_path + 'image_00001.jpg' + image2 = storage_url + container_path + 'image_00002.jpg' + images = [image1, image2] + data = pd.DataFrame(columns=[ImageColumns.IMAGE.value, + ImageColumns.LABEL.value]) + for image_path in images: + data = data.append({ImageColumns.IMAGE.value: image_path, + ImageColumns.LABEL.value: 'pink primrose'}, + ignore_index=True) + return data + + +class DummyFlowersClassifier(): + def __init__(self): + """Dummy classifier for testing purposes. + """ + pass + + def __call__(self, X): + """Predicts the labels for the images. + + :param X: The images to predict the labels for. + :type X: numpy.ndarray + :return: The predicted labels. 
+ :rtype: numpy.ndarray + """ + shape = X.shape + return np.array(['pink primrose'] * shape[0]) + + def predict(self, X): + """Predicts the labels for the images. + + :param X: The images to predict the labels for. + :type X: numpy.ndarray + :return: The predicted labels. + :rtype: numpy.ndarray + """ + shape = X.shape + return np.array(['pink primrose'] * shape[0]) + + def predict_proba(self, X): + """Predicts the probabilities for the images. + + :param X: The images to predict the probabilities for. + :type X: numpy.ndarray + :return: The predicted probabilities. + :rtype: numpy.ndarray + """ + shape = X.shape + probs = [[0, 1]] * shape[0] + return np.array(probs) + + +def create_dummy_model(df): + """Creates a dummy model for testing purposes. + + :param df: dataframe with image paths and labels + :type df: pandas.DataFrame + :return: dummy model + :rtype: DummyFlowersClassifier + """ + return DummyFlowersClassifier() + + +def retrieve_unzip_file(download_url, data_file): + urlretrieve(download_url, filename=data_file) + # extract files + with ZipFile(data_file, "r") as zipfile: + zipfile.extractall(path="./data") + # delete zip file + os.remove(data_file) + + +def load_fridge_dataset(): + # create data folder if it doesnt exist. + os.makedirs("data", exist_ok=True) + + # download data + download_url = ("https://cvbp-secondary.z19.web.core.windows.net/" + + "datasets/image_classification/fridgeObjects.zip") + data_file = "./data/fridgeObjects.zip" + retrieve_unzip_file(download_url, data_file) + # get all file names into a pandas dataframe with the labels + data = pd.DataFrame(columns=[IMAGE, LABEL]) + for folder in os.listdir("./data/fridgeObjects"): + for file in os.listdir("./data/fridgeObjects/" + folder): + image_path = "./data/fridgeObjects/" + folder + "/" + file + data = data.append({IMAGE: image_path, LABEL: folder}, + ignore_index=True) + return data + + +def load_fridge_object_detection_dataset_labels(automl_format=False): + + src_images = "./data/odFridgeObjects/" + + # Path to the annotations + annotations_folder = os.path.join(src_images, "annotations") + + labels = [] + label_dict = {'can': 1, 'carton': 2, 'milk_bottle': 3, 'water_bottle': 4} + + # Read each annotation + for filename in os.listdir(annotations_folder): + if filename.endswith(".xml"): + root = ET.parse(os.path.join(annotations_folder, + filename)).getroot() + + # Use if need to normalize bounding box coordinates + if automl_format: + width = int(root.find("size/width").text) + height = int(root.find("size/height").text) + + image_labels = [] + for object in root.findall("object"): + name = object.find("name").text + xmin = object.find("bndbox/xmin").text + ymin = object.find("bndbox/ymin").text + xmax = object.find("bndbox/xmax").text + ymax = object.find("bndbox/ymax").text + isCrowd = int(object.find("difficult").text) + if not automl_format: + image_labels.append([ + # label + label_dict[name], + # topX. To normalize, divide by width. + float(xmin), + # topY. To normalize, divide by height. + float(ymin), + # bottomX. To normalize, divide by width + float(xmax), + # bottomY. 
To normalize, divide by height + float(ymax), + int(isCrowd) + ]) + else: + image_labels.append({ + 'label': name, + 'topX': float(xmin) / width, + 'topY': float(ymin) / height, + 'bottomX': float(xmax) / width, + 'bottomY': float(ymax) / height, + 'isCrowd': int(isCrowd) + }) + labels.append(image_labels) + + return labels + + +def load_image_details(): + src_images = "./data/odFridgeObjects/" + + # Path to the annotations + annotations_folder = os.path.join(src_images, "annotations") + + image_details = [] + + # Read each annotation + for filename in os.listdir(annotations_folder): + if filename.endswith(".xml"): + root = ET.parse(os.path.join(annotations_folder, + filename)).getroot() + width = int(root.find("size/width").text) + height = int(root.find("size/height").text) + image_details.append({ + 'width': width, + 'height': height + }) + return image_details + + +def load_fridge_object_detection_dataset(automl_format=False): + # create data folder if it doesnt exist. + os.makedirs("data", exist_ok=True) + + # download data + download_url = ("https://cvbp-secondary.z19.web.core.windows.net/" + + "datasets/object_detection/odFridgeObjects.zip") + data_file = "./odFridgeObjects.zip" + retrieve_unzip_file(download_url, data_file) + + labels = load_fridge_object_detection_dataset_labels(automl_format) + if automl_format: + image_details = load_image_details() + data = pd.DataFrame(columns=[ImageColumns.IMAGE.value, + ImageColumns.IMAGE_DETAILS.value, + ImageColumns.LABEL.value]) + else: + data = pd.DataFrame(columns=[ImageColumns.IMAGE.value, + ImageColumns.LABEL.value]) + for i, file in enumerate(os.listdir("./data/odFridgeObjects/" + "images")): + image_path = "./data/odFridgeObjects/" + "images" + "/" + file + if automl_format: + row = { + ImageColumns.IMAGE.value: image_path, + ImageColumns.IMAGE_DETAILS.value: image_details[i], + ImageColumns.LABEL.value: labels[i] + } + else: + row = { + ImageColumns.IMAGE.value: image_path, + ImageColumns.LABEL.value: labels[i] + } + data = data.append(row, ignore_index=True) + return data + + +class ImageTransformEnum(Enum): + ''' + Possible modifications to images + ''' + RESIZE = "resize" + GRAYSCALE = "grayscale" + PNG = "png" + OPACITY = "opacity" + BLACKOUT = "blackout" + + +class ImageTypes(Enum): + ''' + Possible modifications to images + ''' + JPEG = ".jpg" + PNG = ".png" + + +class ImageTransforms(object): + def __init__(self, data: pd.DataFrame): + self.data = data + + def apply_transformation(self, + path: str, + transform: ImageTransformEnum, + size=None): + ''' + Transforms pd.DataFrame of images and labels into modified images. + The types of modifications are listed in the ImageTransformEnum + The schema of this Dataframe should be two columns: str of image path, + str of image labels. 
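+
+        A minimal illustrative call, assuming fridge_data is a dataframe
+        in the schema above::
+
+            transforms = ImageTransforms(fridge_data)
+            resized = transforms.apply_transformation(
+                './data/resized', ImageTransformEnum.RESIZE,
+                size=(300, 300))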
+ ''' + os.makedirs(path, exist_ok=True) + for i, img_path in (self.data[ImageColumns.IMAGE.value]).iteritems(): + image = Image.open(img_path) + if transform == ImageTransformEnum.RESIZE: + image = image.resize(size) + elif transform == ImageTransformEnum.GRAYSCALE: + image = image.convert('L') + elif transform == ImageTransformEnum.OPACITY: + image_array = np.array(image) + alpha_channel = np.ones((image_array.shape[0], + image_array.shape[1], + 1), dtype='uint8') * 255 + img_alpha = np.concatenate((image_array, alpha_channel), + axis=2) + image = Image.fromarray(img_alpha) + print(image) + elif transform == ImageTransformEnum.BLACKOUT: + mask = np.zeros_like(image) + image = Image.fromarray(mask) + is_png_transform = transform == ImageTransformEnum.PNG + is_opacity_transform = transform == ImageTransformEnum.OPACITY + if is_png_transform or is_opacity_transform: + new_path = path + "/" + str(i) + str(ImageTypes.PNG.value) + image.save(new_path, 'PNG', quality=1000) + else: + new_path = path + "/" + str(i) + str(ImageTypes.JPEG.value) + image.save(new_path, 'JPEG', quality=1000) + self.data[i] = new_path + return self.data + + +def load_multilabel_fridge_dataset(): + # create data folder if it doesnt exist. + os.makedirs("data", exist_ok=True) + + # download data + download_url = ("https://cvbp-secondary.z19.web.core.windows.net/" + + "datasets/image_classification/" + + "multilabelFridgeObjects.zip") + folder_path = './data/multilabelFridgeObjects' + data_file = folder_path + '.zip' + retrieve_unzip_file(download_url, data_file) + + data = pd.read_csv(folder_path + '/labels.csv') + data.rename(columns={'filename': IMAGE, + 'labels': LABEL}, inplace=True) + image_col = data[IMAGE] + for i in range(len(image_col)): + image_col[i] = folder_path + '/images/' + image_col[i] + return data + + +def convert_images_to_numpy(dataset): + images = [] + for i in range(len(dataset)): + images.append(np.array(dataset.iloc[i, 0])) + dataset[IMAGE] = images + return dataset + + +def get_pd_mnist_data(dataset): + data = pd.DataFrame({IMAGE: dataset[IMAGE], + LABEL: dataset[LABEL]}) + return convert_images_to_numpy(data) + + +def load_mnist_dataset(): + dataset = load_dataset("mnist") + train_data = get_pd_mnist_data(dataset[TRAIN]) + test_data = get_pd_mnist_data(dataset['test']) + return train_data, test_data + + +def load_imagenet_labels(): + # getting ImageNet 1000 class names + url = "https://s3.amazonaws.com/deep-learning-models/" + \ + "image-models/imagenet_class_index.json" + with open(shap.datasets.cache(url)) as file: + class_names = [v[1] for v in json.load(file).values()] + return class_names + + +class ImageClassificationPipelineSerializer(object): + def save(self, model, path): + model_path = self._get_model_path(path) + model.save(model_path) + + def load(self, path): + model_path = self._get_model_path(path) + return ResNetPipeline.load(model_path) + + def _get_model_path(self, path): + return os.path.join(path, 'image-classification-model') + + +class DummyFlowersPipelineSerializer(object): + def save(self, model, path): + pass + + def load(self, path): + return DummyFlowersClassifier() + + +class ResNetPipeline(object): + def __init__(self): + self.model = ResNet50(weights='imagenet') + + def __call__(self, X): + tmp = X.copy() + preprocess_input(tmp) + return self.model(tmp) + + def save(self, path): + self.model.save(path) + + @staticmethod + def load(path): + model = load_model(path) + inst = ResNetPipeline.__new__(ResNetPipeline) + inst.model = model + return inst + + +def 
create_image_classification_pipeline(): + return ResNetPipeline() + + +def train_fastai_image_classifier(df): + """Trains a fastai multiclass image classifier. + + :param df: dataframe with image paths and labels + :type df: pandas.DataFrame + :return: fastai vision learner + :rtype: fastai.vision.learner + """ + data = ImageDataLoaders.from_df( + df, valid_pct=0.2, seed=10, bs=BATCH_SIZE, + batch_tfms=[Resize(IM_SIZE), Normalize.from_stats(*imagenet_stats)]) + model = vision_learner(data, fastai_models.resnet18, metrics=[accuracy]) + model.unfreeze() + model.fit(EPOCHS, LEARNING_RATE) + return model + + +def train_fastai_image_multilabel(df): + """Trains fastai image classifier for multilabel classification + + :param df: dataframe with image paths and labels + :type df: pandas.DataFrame + :return: trained fastai model + :rtype: fastai.vision.learner.Learner + """ + data = ImageDataLoaders.from_df( + df, valid_pct=0.2, seed=10, label_delim=' ', bs=BATCH_SIZE, + batch_tfms=[Resize(IM_SIZE), Normalize.from_stats(*imagenet_stats)]) + model = vision_learner(data, fastai_models.resnet18, + metrics=[accuracy_multi], + loss_func=BCEWithLogitsLossFlat()) + model.unfreeze() + model.fit(EPOCHS, LEARNING_RATE) + return model + + +class FetchModel(object): + def __init__(self, multilabel=False): + self.multilabel = multilabel + + def fetch(self): + if sys.platform.startswith(WIN): + if self.multilabel: + model_name = MULTILABEL_FRIDGE_MODEL_WINDOWS_NAME + else: + model_name = FRIDGE_MODEL_WINDOWS_NAME + else: + if self.multilabel: + model_name = MULTILABEL_FRIDGE_MODEL_NAME + else: + model_name = FRIDGE_MODEL_NAME + url = ('https://publictestdatasets.blob.core.windows.net/models/' + + model_name) + saved_model_name = FRIDGE_MODEL_NAME + if self.multilabel: + saved_model_name = MULTILABEL_FRIDGE_MODEL_NAME + urlretrieve(url, saved_model_name) + + +def retrieve_or_train_fridge_model(df, force_train=False, + multilabel=False): + """Retrieves or trains fastai image classifier + + :param df: dataframe with image paths and labels + :type df: pandas.DataFrame + :param force_train: whether to force training of model + :type force_train: bool + :param multilabel: whether to train multilabel classifier + :type multilabel: bool + """ + model_name = FRIDGE_MODEL_NAME + if multilabel: + model_name = MULTILABEL_FRIDGE_MODEL_NAME + if force_train: + if multilabel: + model = train_fastai_image_multilabel(df) + else: + model = train_fastai_image_classifier(df) + # Save model to disk + model.export(model_name) + else: + fetcher = FetchModel(multilabel) + action_name = "Fridge model download" + err_msg = "Failed to download model" + max_retries = 4 + retry_delay = 60 + retry_function(fetcher.fetch, action_name, err_msg, + max_retries=max_retries, + retry_delay=retry_delay) + model = load_learner(model_name) + return model + + +def get_object_detection_model(num_classes=5): + """Loads a general pretrained FasterRCNN model. + + :param num_classes: Number of classes + :type num_classes: int + """ + # load an instance segmentation model pre-trained on COCO + model = torchvision_models.detection.fasterrcnn_resnet50_fpn(pretrained=True) # noqa: E501 + in_features = model.roi_heads.box_predictor.cls_score.in_features + # replace the pre-trained head with a new one + model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes) + + return model + + +def download_object_detection_assets(filepath, force=False): + """Downloads the fine-tuned recycling model from url. 
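+    The download is skipped when the file already exists at the given
+    filepath, unless force is set to True.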
+ + :param filepath: Path to model file + :type filepath: str + """ + if force or not os.path.exists(filepath): + blob_storage_url = "https://publictestdatasets.blob.core.windows.net/" + models = "models/" + rcnn_url = blob_storage_url + models + "fastrcnn.pt" + urlretrieve(rcnn_url, os.path.join(filepath)) + else: + print('Found' + filepath) + + return filepath + + +def retrieve_fridge_object_detection_model(): + """Retrieves the recycling model fine-tuned on fridge. + """ + device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + + num_classes = 5 + model = get_object_detection_model(num_classes) + + # To use general torchvision pretrained model, + # comment above and uncomment below + # model = detection.fasterrcnn_resnet50_fpn(pretrained=True) + + model.to(device) + + return model + + +def gridify_fridge_multilabel_labels(data): + """Converts multilabel fridge labels to one-hot encoded labels + + :param data: dataframe with image paths and labels + :type data: pandas.DataFrame + :return: dataframe with one-hot encoded labels + :rtype: pandas.DataFrame + """ + data_len = len(data) + can = np.zeros(data_len) + carton = np.zeros(data_len) + milk_bottle = np.zeros(data_len) + water_bottle = np.zeros(data_len) + for i in range(len(data)): + labels = data.iloc[i]['label'] + labels = set(labels.split(' ')) + if CAN in labels: + can[i] = 1 + if CARTON in labels: + carton[i] = 1 + if MILK_BOTTLE in labels: + milk_bottle[i] = 1 + if WATER_BOTTLE in labels: + water_bottle[i] = 1 + data[CAN] = can + data[CARTON] = carton + data[MILK_BOTTLE] = milk_bottle + data[WATER_BOTTLE] = water_bottle + data.drop(columns=ImageColumns.LABEL.value, inplace=True) + return data + + +def train_model(model, dataloaders, criterion, optimizer, scheduler, + dataset_sizes, num_epochs=25): + since = time.time() + + best_model_wts = copy.deepcopy(model.state_dict()) + best_acc = 0.0 + + for epoch in range(num_epochs): + print(f'Epoch {epoch}/{num_epochs - 1}') + print('-' * 10) + + preds_list = [] + labels_list = [] + + # Each epoch has a training and validation phase + for phase in [TRAIN, VAL]: + if phase == TRAIN: + model.train() # Set model to training mode + else: + model.eval() # Set model to evaluate mode + + running_loss = 0.0 + running_corrects = 0 + + # Iterate over data. 
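+            # each batch yields a stacked image tensor and a label tensor;
+            # gradients are only tracked during the training phase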
+ for inputs, labels in dataloaders[phase]: + # zero the parameter gradients + optimizer.zero_grad() + + # forward + # track history if only in train + with torch.set_grad_enabled(phase == TRAIN): + outputs = model(inputs) + _, preds = torch.max(outputs, 1) + loss = criterion(outputs, labels) + + # backward + optimize only if in training phase + if phase == TRAIN: + loss.backward() + optimizer.step() + + # statistics + running_loss += loss.item() * inputs.size(0) + running_corrects += torch.sum(preds == labels.data) + + preds_list.extend(preds.tolist()) + labels_list.extend(labels.data.tolist()) + if phase == TRAIN: + scheduler.step() + + epoch_loss = running_loss / dataset_sizes[phase] + epoch_acc = running_corrects.double() / dataset_sizes[phase] + f1_results = f1_score(labels_list, preds_list, average='micro') + print(f'{phase} F1: {f1_results:.4f}') + print(f'{phase} Loss: {epoch_loss:.4f} Acc: {epoch_acc:.4f}') + + # deep copy the model + if phase == VAL and epoch_acc > best_acc: + best_acc = epoch_acc + best_model_wts = copy.deepcopy(model.state_dict()) + + print() + + time_elapsed = time.time() - since + mins = time_elapsed // 60 + secs = time_elapsed % 60 + print(f'Training complete in {mins:.0f}m {secs:.0f}s') + print(f'Best val Acc: {best_acc:4f}') + + # load best model weights + model.load_state_dict(best_model_wts) + return model + + +class ImageDataset(Dataset): + def __init__(self, data, transform=None): + self.data = data + self.transform = transform + + def __len__(self): + return len(self.data) + + def __getitem__(self, idx): + image_data = self.data[IMAGE][idx] + if self.transform is not None: + image_data = self.transform(image_data) + return image_data, self.data[LABEL][idx] + + +def create_pytorch_vision_model(train_data, test_data, num_classes=10): + model_ft = torchvision_models.resnet18(pretrained=False) + num_ftrs = model_ft.fc.in_features + model_ft.conv1 = nn.Conv2d(1, 64, kernel_size=(7, 7), stride=(2, 2), + padding=(3, 3), bias=False) + model_ft.fc = nn.Linear(num_ftrs, num_classes) + + criterion = nn.CrossEntropyLoss() + + # Observe that all parameters are being optimized + optimizer_ft = optim.SGD(model_ft.parameters(), lr=0.001, momentum=0.9) + + # Decay LR by a factor of 0.1 every 7 epochs + exp_lr_scheduler = lr_scheduler.StepLR( + optimizer_ft, step_size=7, gamma=0.1) + + stacked_data = np.stack(train_data[IMAGE]) + + mean = stacked_data.mean() + std = stacked_data.std() + + # set transformation option + transform = transforms.Compose([ + transforms.ToPILImage(), + transforms.RandomAffine(degrees=30), + transforms.RandomPerspective(), + transforms.ToTensor(), + transforms.Normalize(mean, std)]) + + batch_size = 128 + dataset_sizes = {TRAIN: len(train_data), VAL: len(test_data)} + train_data = ImageDataset(train_data, transform=transform) + test_data = ImageDataset(test_data, transform=transform) + dataloaders = {TRAIN: DataLoader(train_data, batch_size=batch_size, + shuffle=True, num_workers=4), + VAL: DataLoader(test_data, batch_size=batch_size, + shuffle=False, num_workers=4)} + + model_ft = train_model(model_ft, dataloaders, criterion, optimizer_ft, + exp_lr_scheduler, dataset_sizes, num_epochs=1) + return model_ft diff --git a/responsibleai_vision/tests/rai_vision_insights_validator.py b/responsibleai_vision/tests/rai_vision_insights_validator.py new file mode 100644 index 0000000000..a098802224 --- /dev/null +++ b/responsibleai_vision/tests/rai_vision_insights_validator.py @@ -0,0 +1,84 @@ +# Copyright (c) Microsoft Corporation +# Licensed under the 
MIT License. + +import os +from pathlib import Path +from tempfile import TemporaryDirectory + +import pandas as pd + +from responsibleai._internal.constants import ManagerNames +from responsibleai_vision import RAIVisionInsights + +TRAIN_JSON = 'train.json' +TEST_JSON = 'test.json' +DATA = 'data' + + +def validate_rai_vision_insights( + rai_vision_insights, + test_data, + target_column, + task_type +): + pd.testing.assert_frame_equal(rai_vision_insights.test, test_data) + assert rai_vision_insights.target_column == target_column + assert rai_vision_insights.task_type == task_type + + +def run_and_validate_serialization( + pred, + test, + task_type, + class_names, + label, + serializer, + image_width=None +): + """Run and validate serialization. + + :param pred: Model to use for insights + :type pred: object + :param test: Test data to use for insights + :type test: pandas.DataFrame + :param task_type: Task type of model + :type task_type: ModelTask + :param class_names: Class names for model + :type class_names: list[str] + :param label: Label column name + :type label: str + :param serializer: Serializer to use + :type serializer: object + :param image_width: Image width in inches + :type image_width: int + """ + rai_insights = RAIVisionInsights( + pred, test, label, + task_type=task_type, + classes=class_names, + serializer=serializer, + image_width=image_width) + + with TemporaryDirectory() as tmpdir: + save_1 = Path(tmpdir) / "first_save" + save_2 = Path(tmpdir) / "second_save" + + # Save it + rai_insights.save(save_1) + assert len(os.listdir(save_1 / ManagerNames.EXPLAINER)) == 0 + assert not os.path.exists(save_1 / DATA / TRAIN_JSON) + assert os.path.exists(save_1 / DATA / TEST_JSON) + + # Load + rai_2 = RAIVisionInsights.load(save_1) + + # Validate + validate_rai_vision_insights( + rai_2, test, + label, task_type) + + # Save again + rai_2.save(save_2) + assert len(os.listdir(save_2 / ManagerNames.EXPLAINER)) == 0 + assert not os.path.exists(save_2 / DATA / TRAIN_JSON) + assert os.path.exists(save_2 / DATA / TEST_JSON) diff --git a/responsibleai_vision/tests/test_image_utils.py b/responsibleai_vision/tests/test_image_utils.py new file mode 100644 index 0000000000..f2f20f8510 --- /dev/null +++ b/responsibleai_vision/tests/test_image_utils.py @@ -0,0 +1,48 @@ +# Copyright (c) Microsoft Corporation +# Licensed under the MIT License. 
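For reference, the round trip that `run_and_validate_serialization` above exercises boils down to the following sketch. The `model`, `test_df`, `class_names`, and `'label'` column name are placeholders for whatever the caller supplies; this is an illustration of the pattern, not part of the test suite:

```python
# Minimal sketch of the save/load round trip exercised by
# run_and_validate_serialization. `model`, `test_df`, `class_names` and
# the 'label' column name are assumed inputs, not values from this suite.
from pathlib import Path
from tempfile import TemporaryDirectory

from responsibleai_vision import ModelTask, RAIVisionInsights


def save_load_round_trip(model, test_df, class_names):
    insights = RAIVisionInsights(
        model, test_df, 'label',
        task_type=ModelTask.IMAGE_CLASSIFICATION,
        classes=class_names)
    with TemporaryDirectory() as tmpdir:
        path = Path(tmpdir) / 'insights'
        # save serializes the test data (data/test.json) and the managers
        insights.save(path)
        # load reconstructs an equivalent insights object from disk
        reloaded = RAIVisionInsights.load(path)
        assert reloaded.task_type == insights.task_type
        return reloaded
```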
+
+from math import isclose
+
+import numpy as np
+from common_vision_utils import load_fridge_object_detection_dataset
+
+from responsibleai_vision.common.constants import ImageColumns
+from responsibleai_vision.utils.image_utils import (
+    BOTTOM_X, BOTTOM_Y, HEIGHT, IS_CROWD, TOP_X, TOP_Y, WIDTH, classes_to_dict,
+    transform_object_detection_labels)
+
+LABEL = ImageColumns.LABEL.value
+IMAGE_DETAILS = ImageColumns.IMAGE_DETAILS.value
+TOL = 1
+
+
+class TestImageUtils(object):
+    def test_transform_object_detection_labels(self):
+        data = load_fridge_object_detection_dataset(automl_format=True)
+        class_names = np.array(['can', 'carton',
+                                'milk_bottle', 'water_bottle'])
+        data_transformed = transform_object_detection_labels(
+            data.copy(), LABEL, class_names)
+        label_dict = classes_to_dict(class_names)
+        for i in range(len(data)):
+            original_label = data[LABEL][i]
+            label = data_transformed[LABEL][i]
+            image_details = data[IMAGE_DETAILS][i]
+            width = image_details[WIDTH]
+            height = image_details[HEIGHT]
+            assert isinstance(label, list)
+            for j in range(len(label)):
+                label_j = label[j]
+                assert len(label_j) == 6
+                o_label_j = original_label[j]
+                assert label_j[0] == label_dict[o_label_j[LABEL]]
+                # use isclose to allow off-by-one differences from rounding
+                assert isclose(label_j[1], o_label_j[TOP_X] * width,
+                               abs_tol=TOL)
+                assert isclose(label_j[2], o_label_j[TOP_Y] * height,
+                               abs_tol=TOL)
+                assert isclose(label_j[3], o_label_j[BOTTOM_X] * width,
+                               abs_tol=TOL)
+                assert isclose(label_j[4], o_label_j[BOTTOM_Y] * height,
+                               abs_tol=TOL)
+                assert label_j[5] == o_label_j[IS_CROWD]
diff --git a/responsibleai_vision/tests/test_rai_vision_automl_images_insights.py b/responsibleai_vision/tests/test_rai_vision_automl_images_insights.py
new file mode 100644
index 0000000000..0cc1a238bc
--- /dev/null
+++ b/responsibleai_vision/tests/test_rai_vision_automl_images_insights.py
@@ -0,0 +1,131 @@
+# Copyright (c) Microsoft Corporation
+# Licensed under the MIT License.
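The transformation tested above maps AutoML-style annotations (class name plus normalized corner coordinates) to `[class_index, xmin, ymin, xmax, ymax, is_crowd]` in absolute pixels. An illustrative re-implementation of that mapping is sketched below; the real logic lives in `responsibleai_vision.utils.image_utils`, and the dictionary keys and the `{'can': 1}` class-to-index mapping here are assumptions for the example:

```python
# Illustrative sketch of the label transform checked above, not the
# library implementation. AutoML-format boxes store a class name and
# fractional corner coordinates; the transformed format is
# [class_index, xmin, ymin, xmax, ymax, is_crowd] in absolute pixels.
def to_absolute_labels(annotations, width, height, class_to_index):
    transformed = []
    for box in annotations:
        transformed.append([
            class_to_index[box['label']],
            round(box['topX'] * width),
            round(box['topY'] * height),
            round(box['bottomX'] * width),
            round(box['bottomY'] * height),
            int(box['isCrowd']),
        ])
    return transformed


# a 'can' box covering the left half of a 640x480 image, with a
# hypothetical class-to-index mapping:
assert to_absolute_labels(
    [{'label': 'can', 'topX': 0.0, 'topY': 0.0,
      'bottomX': 0.5, 'bottomY': 1.0, 'isCrowd': 0}],
    640, 480, {'can': 1}) == [[1, 0, 0, 320, 480, 0]]
```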
+
+import copy
+import json
+import os
+import sys
+import tempfile
+
+import pytest
+import torch
+from common_vision_utils import load_fridge_dataset
+from test_rai_vision_insights import run_rai_insights
+
+from responsibleai_vision import ModelTask
+from responsibleai_vision.common.constants import ImageColumns
+
+try:
+    import azureml.automl.core.shared.constants as shared_constants
+    import mlflow
+    from azureml.automl.dnn.vision.classification.common.constants import \
+        ModelNames
+    from azureml.automl.dnn.vision.classification.models import ModelFactory
+    from azureml.automl.dnn.vision.common.mlflow.mlflow_model_wrapper import \
+        MLFlowImagesModelWrapper
+    from azureml.automl.dnn.vision.common.model_export_utils import (
+        _get_mlflow_signature, _get_scoring_method)
+except Exception:
+    # the azureml-automl-dnn-vision dependencies are optional; the tests
+    # below are skipped on unsupported platforms and python versions
+    pass
+
+
+def get_automl_images_mlflow_model(class_names):
+    model_name = ModelNames.SERESNEXT
+    multilabel = False
+    with tempfile.TemporaryDirectory() as tmp_output_dir:
+        task_type = shared_constants.Tasks.IMAGE_CLASSIFICATION
+        number_of_classes = len(class_names)
+        model_wrapper = ModelFactory().get_model_wrapper(
+            model_name,
+            number_of_classes,
+            multilabel=multilabel,
+            device='cpu',
+            distributed=False,
+            local_rank=0,
+        )
+        model_wrapper.labels = class_names
+        # mock for MLflow model generation
+        model_file = os.path.join(tmp_output_dir, 'model.pt')
+        torch.save(
+            {
+                'model_name': model_name,
+                'number_of_classes': number_of_classes,
+                'model_state': copy.deepcopy(model_wrapper.state_dict()),
+                'specs': {
+                    'multilabel': model_wrapper.multilabel,
+                    'model_settings': model_wrapper.model_settings,
+                    'labels': model_wrapper.labels,
+                },
+            },
+            model_file,
+        )
+        settings_file = os.path.join(
+            tmp_output_dir,
+            shared_constants.MLFlowLiterals.MODEL_SETTINGS_FILENAME,
+        )
+        remote_path = os.path.join(tmp_output_dir, 'outputs')
+
+        with open(settings_file, 'w') as f:
+            json.dump({}, f)
+
+        conda_env = {
+            'channels': ['conda-forge', 'pytorch'],
+            'dependencies': [
+                'python=3.7',
+                'numpy==1.21.6',
+                'pytorch==1.7.1',
+                'torchvision==0.12.0',
+                {'pip': ['azureml-automl-dnn-vision']},
+            ],
+            'name': 'azureml-automl-dnn-vision-env',
+        }
+
+        mlflow_model_wrapper = MLFlowImagesModelWrapper(
+            model_settings={},
+            task_type=task_type,
+            scoring_method=_get_scoring_method(task_type),
+        )
+        mlflow.pyfunc.save_model(
+            path=remote_path,
+            python_model=mlflow_model_wrapper,
+            artifacts={'model': model_file, 'settings': settings_file},
+            conda_env=conda_env,
+            signature=_get_mlflow_signature(task_type),
+        )
+        return mlflow.pyfunc.load_model(remote_path)
+
+
+class TestRAIVisionInsightsAutoMLImages(object):
+    # Skip for older versions of python
+    # as azureml-automl-dnn-vision works with '>=3.7,<3.8'
+    @pytest.mark.skipif(
+        sys.version_info < (3, 7),
+        reason='azureml-automl-dnn-vision not supported for older versions',
+    )
+    @pytest.mark.skipif(
+        sys.version_info > (3, 8),
+        reason='azureml-automl-dnn-vision not supported for newer versions',
+    )
+    @pytest.mark.skipif(
+        sys.platform.startswith("darwin"),
+        reason='azureml-automl-dnn-vision fails to install on macos',
+    )
+    def test_rai_insights_automl_image_classification_fridge(self):
+        data = load_fridge_dataset()
+        task_type = ModelTask.IMAGE_CLASSIFICATION
+        class_names = data[ImageColumns.LABEL.value].unique().tolist()
+        try:
+            model = get_automl_images_mlflow_model(class_names)
+        except Exception as exp:
+            print(
+                'Failed to retrieve or load automl'
+                ' images mlflow model: {}'.format(exp)
+            )
+            # re-raise so the test fails with the root cause instead of a
+            # NameError on the undefined 'model' below
+            raise
+        run_rai_insights(
+            model,
+            data[:3],
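+            # a small slice of the dataset keeps this automl test lightweight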
ImageColumns.LABEL, + task_type, + class_names, + test_explainer=True, # enabled as gradcam is faster + ) diff --git a/responsibleai_vision/tests/test_rai_vision_insights.py b/responsibleai_vision/tests/test_rai_vision_insights.py new file mode 100644 index 0000000000..a99173c107 --- /dev/null +++ b/responsibleai_vision/tests/test_rai_vision_insights.py @@ -0,0 +1,265 @@ +# Copyright (c) Microsoft Corporation +# Licensed under the MIT License. + +import sys + +import numpy as np +import PIL +import pytest +from common_vision_utils import (FRIDGE_MULTILABEL_TARGETS, ImageTransformEnum, + ImageTransforms, create_dummy_model, + create_image_classification_pipeline, + create_pytorch_vision_model, + gridify_fridge_multilabel_labels, + load_flowers_dataset, load_fridge_dataset, + load_fridge_object_detection_dataset, + load_imagenet_dataset, load_imagenet_labels, + load_mnist_dataset, + load_multilabel_fridge_dataset, + retrieve_fridge_object_detection_model, + retrieve_or_train_fridge_model) +from rai_vision_insights_validator import validate_rai_vision_insights + +from responsibleai.feature_metadata import FeatureMetadata +from responsibleai_vision import ModelTask, RAIVisionInsights +from responsibleai_vision.common.constants import (ExplainabilityDefaults, + ImageColumns) + +DEFAULT_MAX_EVALS = ExplainabilityDefaults.DEFAULT_MAX_EVALS +DEFAULT_NUM_MASKS = ExplainabilityDefaults.DEFAULT_NUM_MASKS +DEFAULT_MASK_RES = ExplainabilityDefaults.DEFAULT_MASK_RES + + +class TestRAIVisionInsights(object): + + def test_rai_insights_image_classification_imagenet(self): + data = load_imagenet_dataset() + pred = create_image_classification_pipeline() + task_type = ModelTask.IMAGE_CLASSIFICATION + class_names = load_imagenet_labels() + run_rai_insights(pred, data[:3], ImageColumns.LABEL, + task_type, class_names, image_mode='RGB') + + @pytest.mark.parametrize('max_evals', [None, 10, 200]) + def test_rai_insights_image_classification_max_evals(self, max_evals): + data = load_imagenet_dataset() + pred = create_image_classification_pipeline() + task_type = ModelTask.IMAGE_CLASSIFICATION + class_names = load_imagenet_labels() + # run on a single image to avoid running out of memory on + # test machines + run_rai_insights(pred, data[:1], ImageColumns.LABEL, + task_type, class_names, image_mode='RGB', + test_explainer=True, max_evals=max_evals) + + @pytest.mark.parametrize('max_evals', [-100, -1, 0]) + def test_rai_insights_invalid_max_evals(self, max_evals): + data = load_imagenet_dataset() + pred = create_image_classification_pipeline() + task_type = ModelTask.IMAGE_CLASSIFICATION + class_names = load_imagenet_labels() + with pytest.raises(ValueError, + match="max_evals must be greater than 0"): + run_rai_insights(pred, data[:1], ImageColumns.LABEL, + task_type, class_names, image_mode='RGB', + test_explainer=True, max_evals=max_evals) + + def test_rai_insights_image_classification_fridge(self): + data = load_fridge_dataset() + try: + model = retrieve_or_train_fridge_model(data) + except Exception as e: + print("Failed to retrieve or load Fastai model, force training") + print("Inner exception message on retrieving model: {}".format(e)) + model = retrieve_or_train_fridge_model(data, force_train=True) + task_type = ModelTask.IMAGE_CLASSIFICATION + class_names = data[ImageColumns.LABEL.value].unique() + run_rai_insights(model, data[:3], ImageColumns.LABEL, + task_type, class_names, test_error_analysis=True) + + def test_rai_insights_image_classification_mnist(self): + train_data, test_data = load_mnist_dataset() 
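+        # create_pytorch_vision_model (defined in common_vision_utils)
+        # adapts resnet18 to single-channel mnist input and trains it for
+        # a single epoch, favoring test speed over model accuracy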
+ model = create_pytorch_vision_model(train_data, test_data) + task_type = ModelTask.IMAGE_CLASSIFICATION + class_names = train_data[ImageColumns.LABEL.value].unique() + run_rai_insights( + model, test_data[:3], ImageColumns.LABEL, + task_type, class_names) + + def test_rai_insights_multilabel_image_classification_fridge(self): + data = load_multilabel_fridge_dataset() + try: + model = retrieve_or_train_fridge_model(data, multilabel=True) + except Exception as e: + print("Failed to retrieve or load Fastai model, force training") + print("Inner exception message on retrieving model: {}".format(e)) + model = retrieve_or_train_fridge_model( + data, force_train=True, multilabel=True) + data = gridify_fridge_multilabel_labels(data) + task_type = ModelTask.MULTILABEL_IMAGE_CLASSIFICATION + run_rai_insights(model, data[:3], FRIDGE_MULTILABEL_TARGETS, + task_type, test_error_analysis=True) + + @pytest.mark.skip("This test seems to fail due to issues in the \ + MacOS/Linux versions of the build/PR gate.") + @pytest.mark.parametrize('num_masks', [None, 25, DEFAULT_NUM_MASKS]) + @pytest.mark.parametrize('mask_res', [None, DEFAULT_MASK_RES, 8]) + def test_rai_insights_object_detection_fridge(self, num_masks, mask_res): + data = load_fridge_object_detection_dataset() + model = retrieve_fridge_object_detection_model() + task_type = ModelTask.OBJECT_DETECTION + class_names = np.array(['can', 'carton', + 'milk_bottle', 'water_bottle']) + run_rai_insights(model, data[:2], ImageColumns.LABEL, + task_type, class_names, + num_masks=num_masks, mask_res=mask_res) + + @pytest.mark.parametrize('num_masks', [-100, -1, 0]) + def test_rai_insights_invalid_num_masks(self, num_masks): + data = load_fridge_object_detection_dataset() + model = retrieve_fridge_object_detection_model() + task_type = ModelTask.OBJECT_DETECTION + class_names = np.array(['can', 'carton', + 'milk_bottle', 'water_bottle']) + with pytest.raises(ValueError, + match="num_masks must be greater than 0"): + run_rai_insights(model, data[:1], ImageColumns.LABEL, + task_type, class_names, num_masks=num_masks) + + @pytest.mark.parametrize('mask_res', [-100, -1, 0]) + def test_rai_insights_invalid_mask_res(self, mask_res): + data = load_fridge_object_detection_dataset() + model = retrieve_fridge_object_detection_model() + task_type = ModelTask.OBJECT_DETECTION + class_names = np.array(['can', 'carton', + 'milk_bottle', 'water_bottle']) + with pytest.raises(ValueError, + match="mask_res must be greater than 0"): + run_rai_insights(model, data[:1], ImageColumns.LABEL, + task_type, class_names, mask_res=mask_res) + + @pytest.mark.skip("This test fails in the build due to \ + incompatibility between fastai and pytorch \ + 2.0.0. 
TODO: a fix may be to pin pytorch <2.0.0 \
                      in the build until fastai updates.")
+    def test_rai_insights_object_detection_fridge_label_format(self):
+        data = load_fridge_object_detection_dataset()
+        model = retrieve_fridge_object_detection_model()
+        task_type = ModelTask.OBJECT_DETECTION
+        class_names = np.array(['can', 'carton',
+                                'milk_bottle', 'water_bottle'])
+
+        rai_insights = RAIVisionInsights(model, data[:3],
+                                         ImageColumns.LABEL,
+                                         task_type=task_type,
+                                         classes=class_names)
+        y = [
+            [
+                [1, 100, 200, 300, 400, 0.95],
+                [2, 100, 200, 300, 400, 0.95],
+                [1, 100, 200, 300, 400, 0.95]
+            ],
+            [
+                [1, 100, 200, 300, 400, 0.95],
+                [2, 100, 200, 300, 400, 0.95],
+            ]
+        ]
+        result = [
+            [2, 1, 0, 0],
+            [1, 1, 0, 0]
+        ]
+        assert rai_insights._format_od_labels(y, class_names) == result
+
+    @pytest.mark.skipif(sys.platform == 'darwin',
+                        reason='torch version downgrade on macos')
+    @pytest.mark.parametrize("path, transform, size", [
+        ("./data/odFridgeObjects/img_transforms_large",
+         ImageTransformEnum.RESIZE,
+         (1000, 1000)),
+        ("./data/odFridgeObjects/img_transforms_gray",
+         ImageTransformEnum.GRAYSCALE,
+         None),
+        ("./data/odFridgeObjects/img_transforms_opacity",
+         ImageTransformEnum.OPACITY,
+         None),
+        ("./data/odFridgeObjects/img_transforms_blackout",
+         ImageTransformEnum.BLACKOUT,
+         None),
+        ("./data/odFridgeObjects/img_transforms_png",
+         ImageTransformEnum.PNG,
+         None),
+    ])
+    def test_rai_insights_object_detection_fridge_image_transforms(self,
+                                                                   path,
+                                                                   transform,
+                                                                   size):
+        data = load_fridge_object_detection_dataset()[:10]
+        data = ImageTransforms(data).apply_transformation(path,
+                                                          transform,
+                                                          size)
+        model = retrieve_fridge_object_detection_model()
+        task_type = ModelTask.OBJECT_DETECTION
+        class_names = np.array(['can', 'carton',
+                                'milk_bottle', 'water_bottle'])
+        dropped_features = [i for i in range(0, 10)]
+        run_rai_insights(model, data[:3], ImageColumns.LABEL,
+                         task_type, class_names,
+                         dropped_features=dropped_features)
+
+    @pytest.mark.parametrize(
+        'upscale',
+        [
+            pytest.param(
+                True,
+                marks=pytest.mark.skip(
+                    'Insufficient memory on test machines to load images')),
+            False
+        ])
+    def test_jagged_image_sizes(self, upscale):
+        if upscale:
+            PIL.Image.MAX_IMAGE_PIXELS = None
+        data = load_flowers_dataset(upscale=upscale)
+        model = create_dummy_model(data)
+        test_data = data
+        class_names = data[ImageColumns.LABEL.value].unique()
+        task_type = ModelTask.IMAGE_CLASSIFICATION
+        run_rai_insights(model, test_data, ImageColumns.LABEL,
+                         task_type, class_names, upscale=upscale)
+
+
+def run_rai_insights(model, test_data, target_column,
+                     task_type, classes=None, test_explainer=False,
+                     test_error_analysis=False,
+                     image_mode=None, dropped_features=None,
+                     upscale=False, max_evals=DEFAULT_MAX_EVALS,
+                     num_masks=DEFAULT_NUM_MASKS,
+                     mask_res=DEFAULT_MASK_RES):
+    feature_metadata = None
+    if dropped_features:
+        feature_metadata = FeatureMetadata(dropped_features=dropped_features)
+    image_width = None
+    if upscale:
+        image_width = 2
+    rai_insights = RAIVisionInsights(model, test_data,
+                                     target_column,
+                                     task_type=task_type,
+                                     classes=classes,
+                                     image_mode=image_mode,
+                                     feature_metadata=feature_metadata,
+                                     image_width=image_width,
+                                     max_evals=max_evals,
+                                     num_masks=num_masks,
+                                     mask_res=mask_res)
+    # Note: computing these in every test seems too resource-intensive
+    # TODO: enable more broadly when we get beefier test machines
+    if test_explainer:
+        rai_insights.explainer.add()
+    if test_error_analysis:
+        rai_insights.error_analysis.add()
+    if test_explainer or test_error_analysis:
+        rai_insights.compute()
+    # get_data builds the payload consumed by the dashboard; it should
+    # succeed for every configuration exercised above
+    rai_insights.get_data()
+    # Validate
+    validate_rai_vision_insights(
+        rai_insights, test_data,
+        target_column, task_type)
diff --git a/responsibleai_vision/tests/test_rai_vision_insights_save_and_load_scenarios.py b/responsibleai_vision/tests/test_rai_vision_insights_save_and_load_scenarios.py
new file mode 100644
index 0000000000..e1bd6881c6
--- /dev/null
+++ b/responsibleai_vision/tests/test_rai_vision_insights_save_and_load_scenarios.py
@@ -0,0 +1,80 @@
+# Copyright (c) Microsoft Corporation
+# Licensed under the MIT License.
+
+import shutil
+from pathlib import Path
+from tempfile import TemporaryDirectory
+
+import PIL
+import pytest
+from common_vision_utils import (DummyFlowersPipelineSerializer,
+                                 ImageClassificationPipelineSerializer,
+                                 create_dummy_model,
+                                 create_image_classification_pipeline,
+                                 load_flowers_dataset, load_imagenet_dataset,
+                                 load_imagenet_labels)
+from rai_vision_insights_validator import run_and_validate_serialization
+
+from responsibleai_vision import ModelTask, RAIVisionInsights
+from responsibleai_vision.common.constants import ImageColumns
+
+
+class TestRAIVisionInsightsSaveAndLoadScenarios(object):
+
+    def test_rai_insights_empty_save_load_save(self):
+        data = load_imagenet_dataset()
+        pred = create_image_classification_pipeline()
+        task_type = ModelTask.IMAGE_CLASSIFICATION
+        class_names = load_imagenet_labels()
+        test = data[:3]
+        label = ImageColumns.LABEL
+        serializer = ImageClassificationPipelineSerializer()
+
+        run_and_validate_serialization(
+            pred, test, task_type, class_names, label, serializer)
+
+    @pytest.mark.skip("Insufficient memory on test machines to load images")
+    def test_rai_insights_large_images_save_load_save(self):
+        PIL.Image.MAX_IMAGE_PIXELS = None
+        data = load_flowers_dataset(upscale=True)
+        model = create_dummy_model(data)
+        test = data
+        class_names = data[ImageColumns.LABEL.value].unique()
+        task_type = ModelTask.IMAGE_CLASSIFICATION
+        label = ImageColumns.LABEL
+        serializer = DummyFlowersPipelineSerializer()
+
+        image_width = 2
+        run_and_validate_serialization(
+            model, test, task_type, class_names, label, serializer,
+            image_width)
+
+    def test_loading_rai_insights_without_model_file(self):
+        data = load_imagenet_dataset()
+        pred = create_image_classification_pipeline()
+        task_type = ModelTask.IMAGE_CLASSIFICATION
+        class_names = load_imagenet_labels()
+        test = data[:3]
+        label = ImageColumns.LABEL
+        serializer = ImageClassificationPipelineSerializer()
+
+        rai_insights = RAIVisionInsights(
+            pred, test, label,
+            task_type=task_type,
+            classes=class_names,
+            serializer=serializer)
+
+        with TemporaryDirectory() as tmpdir:
+            assert rai_insights.model is not None
+            save_path = Path(tmpdir) / "rai_insights"
+            rai_insights.save(save_path)
+
+            # Remove the serialized model directory to cause an exception
+            # while loading the model.
+            model_name = 'image-classification-model'
+            model_dir_path = Path(tmpdir) / "rai_insights" / model_name
+            shutil.rmtree(model_dir_path)
+            match_msg = 'No file or directory found'
+            with pytest.raises(OSError, match=match_msg):
+                RAIVisionInsights.load(save_path)
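Taken together, the tests above cover the typical RAIVisionInsights workflow: construct the insights object, add components, compute, fetch the dashboard payload, and save for later reloading. As a summary, a minimal end-to-end sketch; `model`, `test_df`, and `class_names` are assumed to exist, and the `'label'` column name and save path are placeholders:

```python
# Minimal end-to-end sketch of the workflow covered by the tests above.
# `model`, `test_df` and `class_names` are assumed inputs, and the
# 'label' column name and save_path are placeholders.
from responsibleai_vision import ModelTask, RAIVisionInsights


def analyze(model, test_df, class_names, save_path):
    insights = RAIVisionInsights(
        model, test_df, 'label',
        task_type=ModelTask.IMAGE_CLASSIFICATION,
        classes=class_names)
    insights.explainer.add()       # model explanations
    insights.error_analysis.add()  # error analysis
    insights.compute()             # run the added components
    data = insights.get_data()     # payload consumed by the dashboard
    insights.save(save_path)       # reload later via RAIVisionInsights.load
    return data
```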