Add methods and constants for genai metrics (#2524)
* Added info about required packages

* Update responsibleaidashboard-question-answering-model-debugging.ipynb

* show example prediction

* Update responsibleaidashboard-question-answering-model-debugging.ipynb

* add methods and constants for genai task type

Signed-off-by: Kartik Choudhary <[email protected]>

* add missing files for genai metrics

Signed-off-by: Kartik Choudhary <[email protected]>

* update copyright information

Signed-off-by: Kartik Choudhary <[email protected]>

---------

Signed-off-by: Kartik Choudhary <[email protected]>
kartik727 authored Feb 1, 2024
1 parent 5036435 commit c8ff9a2
Showing 13 changed files with 233 additions and 7 deletions.
14 changes: 14 additions & 0 deletions raiwidgets/raiwidgets/responsibleai_dashboard.py
@@ -122,6 +122,15 @@ def get_question_answering_metrics():
methods=["POST"]
)

def get_generative_text_metrics():
data = request.get_json(force=True)
return jsonify(self.input.get_generative_text_metrics(data))
self.add_url_rule(
get_generative_text_metrics,
'/get_generative_text_metrics',
methods=["POST"]
)

        if hasattr(self._service, 'socketio'):
            @self._service.socketio.on('handle_object_detection_json')
            def handle_object_detection_json(od_json):
@@ -132,3 +141,8 @@ def handle_object_detection_json(od_json):
            def handle_question_answering_json(qa_json):
                qa_data = json.loads(qa_json['data'])
                return self.input.get_question_answering_metrics(qa_data)

            @self._service.socketio.on('handle_generative_text_json')
            def handle_generative_text_json(gt_json):
                gt_data = json.loads(gt_json['data'])
                return self.input.get_generative_text_metrics(gt_data)
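For orientation, a minimal sketch of how a client could exercise the new route once the dashboard service is running; the host, port and payload values are illustrative assumptions, not part of this commit:

import requests

# The endpoint expects [selection_indexes, generative_text_cache],
# mirroring compute_genai_metrics added later in this commit.
payload = [
    [[0, 1, 2]],  # selection_indexes: a single cohort of dataset rows 0-2
    []            # generative_text_cache (unused by the current implementation)
]
resp = requests.post(
    'http://localhost:5000/get_generative_text_metrics',  # assumed service URL
    json=payload)
print(resp.json())  # dict keyed by WidgetRequestResponseConstants.data on success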
33 changes: 32 additions & 1 deletion raiwidgets/raiwidgets/responsibleai_dashboard_input.py
@@ -171,7 +171,7 @@ def _prepare_filtered_error_analysis_data(self, features, filters,

    def debug_ml(self, data):
        try:
            features = data[0]
            features = data[0]  # TODO: Remove prompt feature
            filters = data[1]
            composite_filters = data[2]
            max_depth = data[3]
@@ -484,3 +484,34 @@ def get_question_answering_metrics(self, post_data):
"inner error: {}".format(e_str),
WidgetRequestResponseConstants.data: []
}

def get_generative_text_metrics(self, post_data):
"""Flask endpoint function to get Model Overview metrics
for the Generative Text scenario.
:param post_data: List of inputs in the order
[true_y, predicted_y, aggregate_method, class_name, iou_threshold].
:type post_data: List
:return: JSON/dict data response
:rtype: Dict[str, List]
"""
try:
selection_indexes = post_data[0]
generative_text_cache = post_data[1]
exp = self._analysis.compute_genai_metrics(
selection_indexes,
generative_text_cache
)
return {
WidgetRequestResponseConstants.data: exp
}
except Exception as e:
print(e)
traceback.print_exc()
e_str = _format_exception(e)
return {
WidgetRequestResponseConstants.error:
EXP_VIZ_ERR_MSG.format(e_str),
WidgetRequestResponseConstants.data: []
}
8 changes: 8 additions & 0 deletions responsibleai_text/responsibleai_text/common/constants.py
@@ -18,6 +18,8 @@ class ModelTask(str, Enum):
    QUESTION_ANSWERING = 'question_answering'
    ENTAILMENT = 'entailment'
    SUMMARIZATIONS = 'summarizations'
    GENERATIVE_TEXT = 'generative_text'
    GENERATIVE_TEXT_CHAT = 'generative_text_chat'
    UNKNOWN = 'unknown'


@@ -34,3 +36,9 @@ class QuestionAnsweringFields(object):
QUESTION = "question"
CONTEXT = "context"
ANSWERS = "answers"


class GenerativeTextFields(object):
PROMPT = "prompt"
SYS_PROMPT = "sys_prompt"
RESPONSE = "response"
@@ -12,6 +12,7 @@
import pandas as pd
from ml_wrappers import wrap_model

from erroranalysis._internal.constants import ModelTask as ErrorAnalysisTask
from erroranalysis._internal.error_analyzer import ModelAnalyzer
from erroranalysis._internal.error_report import as_error_report
from responsibleai._tools.shared.state_directory_management import \
@@ -22,6 +23,7 @@
from responsibleai.managers.error_analysis_manager import as_error_config
from responsibleai_text.common.constants import ModelTask
from responsibleai_text.utils.feature_extractors import get_text_columns
from responsibleai_text.utils.genai_metrics.metrics import get_genai_metric

LABELS = 'labels'

@@ -83,6 +85,14 @@ def __init__(self, model, dataset, is_multilabel, task_type, classes=None):
            self.predictions = self.model.predict(
                self.dataset.loc[:, ['context', 'questions']])
            self.predictions = np.array(self.predictions)
        elif self.task_type == ModelTask.GENERATIVE_TEXT:
            # TODO: Decide the final metric for error analysis
            coherence = get_genai_metric(
                'coherence',
                predictions=self.model.predict(self.dataset),
                references=dataset['prompt'],
                wrapper_model=self.model)
            self.predictions = np.array(coherence['scores'])
        else:
            raise ValueError("Unknown task type: {}".format(self.task_type))

@@ -193,9 +203,17 @@ def __init__(self, model: Any, dataset: pd.DataFrame,
            task_type, index_classes)
        if categorical_features is None:
            categorical_features = []
        if task_type == ModelTask.GENERATIVE_TEXT:
            sup_task_type = ErrorAnalysisTask.REGRESSION
            ext_dataset = ext_dataset.copy()
            del ext_dataset['prompt']
            ext_dataset['target_score'] = 5
            target_column = 'target_score'
        else:
            sup_task_type = ErrorAnalysisTask.CLASSIFICATION
        super(ErrorAnalysisManager, self).__init__(
            index_predictor, ext_dataset, target_column,
            classes, categorical_features)
            classes, categorical_features, model_task=sup_task_type)

@staticmethod
def _create_index_predictor(model, dataset, target_column,
@@ -30,6 +30,8 @@
from responsibleai_text.managers.explainer_manager import ExplainerManager
from responsibleai_text.utils.feature_extractors import (extract_features,
                                                          get_text_columns)
from responsibleai_text.utils.genai_metrics.metrics import \
    get_genai_metric_mean

module_logger = logging.getLogger(__name__)
module_logger.setLevel(logging.INFO)
@@ -116,7 +118,8 @@ def __init__(self, model: Any, test: pd.DataFrame,
                 serializer: Optional[Any] = None,
                 maximum_rows_for_test: int = 5000,
                 feature_metadata: Optional[FeatureMetadata] = None,
                 text_column: Optional[Union[str, List]] = None):
                 text_column: Optional[Union[str, List]] = None,
                 eval_model: Any = None):
"""Creates an RAITextInsights object.
:param model: The model to compute RAI insights for.
@@ -148,6 +151,10 @@
            If not provided, and there is additional feature metadata, then
            an exception will be raised.
        :type text_column: str or list[str]
        :param eval_model: The model to use for evaluation with AI-assisted
            metrics. If not provided, then the model passed in the model
            parameter will be used.
        :type eval_model: object
        """
        # drop index as this can cause issues later like when copying
        # target column below from test dataset to _ext_test_df
@@ -160,6 +167,10 @@
        self._text_column = text_column
        self._feature_metadata = feature_metadata
        self._wrapped_model = wrap_model(model, test, task_type)
        if eval_model is None:
            self._eval_model = self._wrapped_model
        else:
            self._eval_model = wrap_model(eval_model, test, task_type)
        self._validate_rai_insights_input_parameters(
            model=self._wrapped_model, test=test,
            target_column=target_column, task_type=task_type,
@@ -269,7 +280,9 @@ def _validate_model(self, model: Any, test: pd.DataFrame,
                target_column, axis=1)
        small_test_data = get_text_columns(small_test_data, text_column)
        small_test_data = small_test_data.iloc[0]
        if task_type != ModelTask.QUESTION_ANSWERING:
        if task_type not in [
                ModelTask.QUESTION_ANSWERING,
                ModelTask.GENERATIVE_TEXT]:
            small_test_data = small_test_data.tolist()
        # Call the model
        try:
@@ -319,7 +332,8 @@ def _validate_rai_insights_input_parameters(
            ModelTask.SENTIMENT_ANALYSIS.value,
            ModelTask.QUESTION_ANSWERING.value,
            ModelTask.ENTAILMENT.value,
            ModelTask.SUMMARIZATIONS.value
            ModelTask.SUMMARIZATIONS.value,
            ModelTask.GENERATIVE_TEXT.value,
        ]

        if task_type not in valid_tasks:
@@ -362,6 +376,10 @@ def _validate_rai_insights_input_parameters(
            if not target_columns_set.issubset(set(test.columns)):
                raise UserConfigValidationException(
                    'The list of target_column(s) should be in test data')
        elif (task_type == ModelTask.GENERATIVE_TEXT.value and
                target_column is None):
            # target column is optional for generative text
            pass
        else:
            if target_column not in list(test.columns):
                raise UserConfigValidationException(
@@ -514,6 +532,11 @@ def _get_test_text_data(self, is_classification_task):
            dataset = self.test.drop(target_column, axis=1)
        elif self.task_type == ModelTask.QUESTION_ANSWERING:
            dataset = self.test.drop([self.target_column], axis=1)
        elif self.task_type == ModelTask.GENERATIVE_TEXT:
            if self.target_column is None:
                dataset = self.test.copy()
            else:
                dataset = self.test.drop([self.target_column], axis=1)
        else:
            raise ValueError("Unknown task type: {}".format(self.task_type))
        dataset = get_text_columns(dataset, self._text_column)
@@ -853,3 +876,71 @@ def compute_question_answering_metrics(
            except ValueError:
                all_cohort_metrics.append([0, 0, 0, 0, 0, 0])
        return all_cohort_metrics

    def compute_genai_metrics(
            self,
            selection_indexes,
            genai_cache
    ):
        dashboard_dataset = self.get_data().dataset
        prompt_idx = dashboard_dataset.feature_names.index('prompt')
        prompts = [feat[prompt_idx] for feat in dashboard_dataset.features]
        true_y = dashboard_dataset.true_y
        predicted_y = dashboard_dataset.predicted_y

        all_cohort_metrics = []
        for cohort_indices in selection_indexes:
            cohort_metrics = dict()

            if true_y is None:
                true_y_cohort = None
            else:
                true_y_cohort = [true_y[cohort_index] for cohort_index
                                 in cohort_indices]
            predicted_y_cohort = [predicted_y[cohort_index] for cohort_index
                                  in cohort_indices]
            prompts_cohort = [prompts[cohort_index] for cohort_index
                              in cohort_indices]
            try:
                if true_y_cohort is not None:
                    exact_match = evaluate.load('exact_match')
                    cohort_metrics['exact_match'] = exact_match.compute(
                        predictions=predicted_y_cohort,
                        references=true_y_cohort)

                cohort_metrics['coherence'] = get_genai_metric_mean(
                    'coherence',
                    predictions=predicted_y_cohort,
                    references=prompts_cohort,
                    wrapper_model=self._eval_model)

                if true_y_cohort is not None:
                    cohort_metrics['equivalence'] = get_genai_metric_mean(
                        'equivalence',
                        predictions=predicted_y_cohort,
                        references=prompts_cohort,
                        answers=true_y_cohort,
                        wrapper_model=self._eval_model)

                cohort_metrics['fluency'] = get_genai_metric_mean(
                    'fluency',
                    predictions=predicted_y_cohort,
                    references=prompts_cohort,
                    wrapper_model=self._eval_model)

                cohort_metrics['groundedness'] = get_genai_metric_mean(
                    'groundedness',
                    predictions=predicted_y_cohort,
                    references=prompts_cohort,
                    wrapper_model=self._eval_model)

                cohort_metrics['relevance'] = get_genai_metric_mean(
                    'relevance',
                    predictions=predicted_y_cohort,
                    references=prompts_cohort,
                    wrapper_model=self._eval_model)

                all_cohort_metrics.append(cohort_metrics)
            except ValueError:
                all_cohort_metrics.append({})
        return all_cohort_metrics
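Tying the pieces together, a hedged end-to-end sketch of how the new task type, the eval_model argument and compute_genai_metrics might be used. The model objects, the DataFrame contents, the top-level import path and the compute() call are assumptions for illustration, not taken from this diff:

import pandas as pd

from responsibleai_text import RAITextInsights
from responsibleai_text.common.constants import ModelTask

# Placeholder models: anything accepted by ml_wrappers' wrap_model.
generation_model = ...  # model under analysis
judge_model = ...       # model used to answer the AI-assisted metric prompts

test = pd.DataFrame({'prompt': ['What is the capital of France?']})

rai_insights = RAITextInsights(
    model=generation_model,
    test=test,
    target_column=None,                   # optional for generative text
    task_type=ModelTask.GENERATIVE_TEXT,
    eval_model=judge_model)               # falls back to `model` when omitted
rai_insights.compute()

# One cohort holding row 0; the cache argument is accepted but not used
# by the implementation shown above.
per_cohort = rai_insights.compute_genai_metrics(
    selection_indexes=[[0]], genai_cache=None)
print(per_cohort[0])  # e.g. {'coherence': ..., 'fluency': ..., 'groundedness': ..., 'relevance': ...}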
@@ -12,7 +12,8 @@
from tqdm import tqdm

from nlp_feature_extractors import attribute_extractors as exts
from responsibleai_text.common.constants import (ModelTask,
from responsibleai_text.common.constants import (GenerativeTextFields,
                                                  ModelTask,
                                                  QuestionAnsweringFields)

nlp = None
@@ -60,6 +61,9 @@ def extract_features(text_dataset: pd.DataFrame,
        feature_names.append(prefix + "maximum_parse_tree_depth")
        feature_names.append("question_type")
        feature_names.append("context_overlap")
    elif task_type == ModelTask.GENERATIVE_TEXT:
        start_meta_index = 0
        feature_names = base_feature_names
    else:
        raise ValueError("Unknown task type: {}".format(task_type))
    # copy over the metadata column names
@@ -96,6 +100,19 @@ def extract_features(text_dataset: pd.DataFrame,
            context_overlap = get_context_overlap(context=context,
                                                  question=question)
            extracted_features.append(context_overlap)
            # append all other metadata features
            append_metadata_values(start_meta_index, text_dataset, i,
                                   extracted_features, has_dropped_features,
                                   dropped_features, column_names)
            results.append(extracted_features)
    elif task_type == ModelTask.GENERATIVE_TEXT:
        for i, row in tqdm(text_features.iterrows(),
                           desc='feature extraction'):
            extracted_features = []
            add_extracted_features_for_sentence(
                row[GenerativeTextFields.PROMPT], extracted_features,
                task_type)

            # append all other metadata features
            append_metadata_values(start_meta_index, text_dataset, i,
                                   extracted_features, has_dropped_features,
@@ -0,0 +1,4 @@
# Copyright (c) Microsoft Corporation
# Licensed under the MIT License.

"""Contains the GenAI metrics."""
@@ -0,0 +1,4 @@
# Copyright (c) Microsoft Corporation
# Licensed under the MIT License.

"""Contains the implementation of various metrics for GenAI."""
@@ -24,7 +24,7 @@ def _compute_metric(template, logger, wrapper_model, **kwargs):
    templated_ques = format_str(template, **kwargs)

    inp = pd.DataFrame({
        'questions': templated_ques,
        'prompt': templated_ques,
        'sys_prompt': _SYS_PROMPT})

    responses = wrapper_model.predict(inp)
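For context, this change means the judge model now receives its input frame keyed by 'prompt' and 'sys_prompt' rather than 'questions'. A small sketch of that input, with an invented templated question and a stand-in system prompt:

import pandas as pd

templated_ques = [
    'Rate the coherence of the RESPONSE to the QUESTION from 1 to 5. ...']  # invented template text
inp = pd.DataFrame({
    'prompt': templated_ques,
    'sys_prompt': 'You are an impartial judge.'})  # stand-in for _SYS_PROMPT
print(list(inp.columns))  # ['prompt', 'sys_prompt']: what wrapper_model.predict(inp) now receives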
@@ -48,6 +48,15 @@
This rating value should always be an integer between 1 and 5. So the rating \
produced should be 1 or 2 or 3 or 4 or 5.
Some examples of valid responses are:
1
2
5
Some examples of invalid responses are:
1/5
1.5
3.0
5 stars
QUESTION:
{question}
@@ -47,6 +47,15 @@
This rating value should always be an integer between 1 and 5. So the rating \
produced should be 1 or 2 or 3 or 4 or 5.
Some examples of valid responses are:
1
2
5
Some examples of invalid responses are:
1/5
1.5
3.0
5 stars
QUESTION:
{question}