Add methods and constants for genai metrics (#2524)
* Added info about required packages

* Update responsibleaidashboard-question-answering-model-debugging.ipynb

* show example prediction

* Update responsibleaidashboard-question-answering-model-debugging.ipynb

* add methods and constants for genai task type

Signed-off-by: Kartik Choudhary <[email protected]>

* add missing files for genai metrics

Signed-off-by: Kartik Choudhary <[email protected]>

* update copyright information

Signed-off-by: Kartik Choudhary <[email protected]>

---------

Signed-off-by: Kartik Choudhary <[email protected]>
kartik727 authored Feb 1, 2024
1 parent 5036435 commit c8ff9a2
Showing 13 changed files with 233 additions and 7 deletions.
14 changes: 14 additions & 0 deletions raiwidgets/raiwidgets/responsibleai_dashboard.py
@@ -122,6 +122,15 @@ def get_question_answering_metrics():
methods=["POST"]
)

def get_generative_text_metrics():
data = request.get_json(force=True)
return jsonify(self.input.get_generative_text_metrics(data))
self.add_url_rule(
get_generative_text_metrics,
'/get_generative_text_metrics',
methods=["POST"]
)

        if hasattr(self._service, 'socketio'):
            @self._service.socketio.on('handle_object_detection_json')
            def handle_object_detection_json(od_json):
@@ -132,3 +141,8 @@ def handle_object_detection_json(od_json):
            def handle_question_answering_json(qa_json):
                qa_data = json.loads(qa_json['data'])
                return self.input.get_question_answering_metrics(qa_data)

            @self._service.socketio.on('handle_generative_text_json')
            def handle_generative_text_json(gt_json):
                gt_data = json.loads(gt_json['data'])
                return self.input.get_generative_text_metrics(gt_data)
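For orientation, a minimal sketch of how a client could exercise the new route once the dashboard service is running; the host, port and payload values are illustrative assumptions, not part of this commit:

import requests

# The endpoint expects [selection_indexes, generative_text_cache],
# mirroring compute_genai_metrics added later in this commit.
payload = [
    [[0, 1, 2]],  # selection_indexes: a single cohort of dataset rows 0-2
    []            # generative_text_cache (unused by the current implementation)
]
resp = requests.post(
    'http://localhost:5000/get_generative_text_metrics',  # assumed service URL
    json=payload)
print(resp.json())  # dict keyed by WidgetRequestResponseConstants.data on success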
33 changes: 32 additions & 1 deletion raiwidgets/raiwidgets/responsibleai_dashboard_input.py
@@ -171,7 +171,7 @@ def _prepare_filtered_error_analysis_data(self, features, filters,

    def debug_ml(self, data):
        try:
            features = data[0]
            features = data[0]  # TODO: Remove prompt feature
            filters = data[1]
            composite_filters = data[2]
            max_depth = data[3]
@@ -484,3 +484,34 @@ def get_question_answering_metrics(self, post_data):
"inner error: {}".format(e_str),
WidgetRequestResponseConstants.data: []
}

def get_generative_text_metrics(self, post_data):
"""Flask endpoint function to get Model Overview metrics
for the Generative Text scenario.
:param post_data: List of inputs in the order
[true_y, predicted_y, aggregate_method, class_name, iou_threshold].
:type post_data: List
:return: JSON/dict data response
:rtype: Dict[str, List]
"""
try:
selection_indexes = post_data[0]
generative_text_cache = post_data[1]
exp = self._analysis.compute_genai_metrics(
selection_indexes,
generative_text_cache
)
return {
WidgetRequestResponseConstants.data: exp
}
except Exception as e:
print(e)
traceback.print_exc()
e_str = _format_exception(e)
return {
WidgetRequestResponseConstants.error:
EXP_VIZ_ERR_MSG.format(e_str),
WidgetRequestResponseConstants.data: []
}
8 changes: 8 additions & 0 deletions responsibleai_text/responsibleai_text/common/constants.py
@@ -18,6 +18,8 @@ class ModelTask(str, Enum):
    QUESTION_ANSWERING = 'question_answering'
    ENTAILMENT = 'entailment'
    SUMMARIZATIONS = 'summarizations'
    GENERATIVE_TEXT = 'generative_text'
    GENERATIVE_TEXT_CHAT = 'generative_text_chat'
    UNKNOWN = 'unknown'


@@ -34,3 +36,9 @@ class QuestionAnsweringFields(object):
QUESTION = "question"
CONTEXT = "context"
ANSWERS = "answers"


class GenerativeTextFields(object):
PROMPT = "prompt"
SYS_PROMPT = "sys_prompt"
RESPONSE = "response"
@@ -12,6 +12,7 @@
import pandas as pd
from ml_wrappers import wrap_model

from erroranalysis._internal.constants import ModelTask as ErrorAnalysisTask
from erroranalysis._internal.error_analyzer import ModelAnalyzer
from erroranalysis._internal.error_report import as_error_report
from responsibleai._tools.shared.state_directory_management import \
@@ -22,6 +23,7 @@
from responsibleai.managers.error_analysis_manager import as_error_config
from responsibleai_text.common.constants import ModelTask
from responsibleai_text.utils.feature_extractors import get_text_columns
from responsibleai_text.utils.genai_metrics.metrics import get_genai_metric

LABELS = 'labels'

@@ -83,6 +85,14 @@ def __init__(self, model, dataset, is_multilabel, task_type, classes=None):
            self.predictions = self.model.predict(
                self.dataset.loc[:, ['context', 'questions']])
            self.predictions = np.array(self.predictions)
        elif self.task_type == ModelTask.GENERATIVE_TEXT:
            # TODO: Decide the final metric for error analysis
            coherence = get_genai_metric(
                'coherence',
                predictions=self.model.predict(self.dataset),
                references=dataset['prompt'],
                wrapper_model=self.model)
            self.predictions = np.array(coherence['scores'])
        else:
            raise ValueError("Unknown task type: {}".format(self.task_type))

@@ -193,9 +203,17 @@ def __init__(self, model: Any, dataset: pd.DataFrame,
            task_type, index_classes)
        if categorical_features is None:
            categorical_features = []
        if task_type == ModelTask.GENERATIVE_TEXT:
            sup_task_type = ErrorAnalysisTask.REGRESSION
            ext_dataset = ext_dataset.copy()
            del ext_dataset['prompt']
            ext_dataset['target_score'] = 5
            target_column = 'target_score'
        else:
            sup_task_type = ErrorAnalysisTask.CLASSIFICATION
        super(ErrorAnalysisManager, self).__init__(
            index_predictor, ext_dataset, target_column,
            classes, categorical_features)
            classes, categorical_features, model_task=sup_task_type)

@staticmethod
def _create_index_predictor(model, dataset, target_column,
@@ -30,6 +30,8 @@
from responsibleai_text.managers.explainer_manager import ExplainerManager
from responsibleai_text.utils.feature_extractors import (extract_features,
                                                          get_text_columns)
from responsibleai_text.utils.genai_metrics.metrics import \
    get_genai_metric_mean

module_logger = logging.getLogger(__name__)
module_logger.setLevel(logging.INFO)
@@ -116,7 +118,8 @@ def __init__(self, model: Any, test: pd.DataFrame,
                 serializer: Optional[Any] = None,
                 maximum_rows_for_test: int = 5000,
                 feature_metadata: Optional[FeatureMetadata] = None,
                 text_column: Optional[Union[str, List]] = None):
                 text_column: Optional[Union[str, List]] = None,
                 eval_model: Any = None):
"""Creates an RAITextInsights object.
:param model: The model to compute RAI insights for.
@@ -148,6 +151,10 @@
            If not provided, and there is additional feature metadata, then
            an exception will be raised.
        :type text_column: str or list[str]
        :param eval_model: The model to use for evaluation with AI-assisted
            metrics. If not provided, then the model passed in the model
            parameter will be used.
        :type eval_model: object
        """
        # drop index as this can cause issues later like when copying
        # target column below from test dataset to _ext_test_df
@@ -160,6 +167,10 @@
        self._text_column = text_column
        self._feature_metadata = feature_metadata
        self._wrapped_model = wrap_model(model, test, task_type)
        if eval_model is None:
            self._eval_model = self._wrapped_model
        else:
            self._eval_model = wrap_model(eval_model, test, task_type)
        self._validate_rai_insights_input_parameters(
            model=self._wrapped_model, test=test,
            target_column=target_column, task_type=task_type,
@@ -269,7 +280,9 @@ def _validate_model(self, model: Any, test: pd.DataFrame,
                target_column, axis=1)
        small_test_data = get_text_columns(small_test_data, text_column)
        small_test_data = small_test_data.iloc[0]
        if task_type != ModelTask.QUESTION_ANSWERING:
        if task_type not in [
                ModelTask.QUESTION_ANSWERING,
                ModelTask.GENERATIVE_TEXT]:
            small_test_data = small_test_data.tolist()
        # Call the model
        try:
@@ -319,7 +332,8 @@ def _validate_rai_insights_input_parameters(
            ModelTask.SENTIMENT_ANALYSIS.value,
            ModelTask.QUESTION_ANSWERING.value,
            ModelTask.ENTAILMENT.value,
            ModelTask.SUMMARIZATIONS.value
            ModelTask.SUMMARIZATIONS.value,
            ModelTask.GENERATIVE_TEXT.value,
        ]

        if task_type not in valid_tasks:
@@ -362,6 +376,10 @@ def _validate_rai_insights_input_parameters(
            if not target_columns_set.issubset(set(test.columns)):
                raise UserConfigValidationException(
                    'The list of target_column(s) should be in test data')
        elif (task_type == ModelTask.GENERATIVE_TEXT.value and
                target_column is None):
            # target column is optional for generative text
            pass
        else:
            if target_column not in list(test.columns):
                raise UserConfigValidationException(
@@ -514,6 +532,11 @@ def _get_test_text_data(self, is_classification_task):
            dataset = self.test.drop(target_column, axis=1)
        elif self.task_type == ModelTask.QUESTION_ANSWERING:
            dataset = self.test.drop([self.target_column], axis=1)
        elif self.task_type == ModelTask.GENERATIVE_TEXT:
            if self.target_column is None:
                dataset = self.test.copy()
            else:
                dataset = self.test.drop([self.target_column], axis=1)
        else:
            raise ValueError("Unknown task type: {}".format(self.task_type))
        dataset = get_text_columns(dataset, self._text_column)
@@ -853,3 +876,71 @@ def compute_question_answering_metrics(
            except ValueError:
                all_cohort_metrics.append([0, 0, 0, 0, 0, 0])
        return all_cohort_metrics

    def compute_genai_metrics(
            self,
            selection_indexes,
            genai_cache
    ):
        dashboard_dataset = self.get_data().dataset
        prompt_idx = dashboard_dataset.feature_names.index('prompt')
        prompts = [feat[prompt_idx] for feat in dashboard_dataset.features]
        true_y = dashboard_dataset.true_y
        predicted_y = dashboard_dataset.predicted_y

        all_cohort_metrics = []
        for cohort_indices in selection_indexes:
            cohort_metrics = dict()

            if true_y is None:
                true_y_cohort = None
            else:
                true_y_cohort = [true_y[cohort_index] for cohort_index
                                 in cohort_indices]
            predicted_y_cohort = [predicted_y[cohort_index] for cohort_index
                                  in cohort_indices]
            prompts_cohort = [prompts[cohort_index] for cohort_index
                              in cohort_indices]
            try:
                if true_y_cohort is not None:
                    exact_match = evaluate.load('exact_match')
                    cohort_metrics['exact_match'] = exact_match.compute(
                        predictions=predicted_y_cohort,
                        references=true_y_cohort)

                cohort_metrics['coherence'] = get_genai_metric_mean(
                    'coherence',
                    predictions=predicted_y_cohort,
                    references=prompts_cohort,
                    wrapper_model=self._eval_model)

                if true_y_cohort is not None:
                    cohort_metrics['equivalence'] = get_genai_metric_mean(
                        'equivalence',
                        predictions=predicted_y_cohort,
                        references=prompts_cohort,
                        answers=true_y_cohort,
                        wrapper_model=self._eval_model)

                cohort_metrics['fluency'] = get_genai_metric_mean(
                    'fluency',
                    predictions=predicted_y_cohort,
                    references=prompts_cohort,
                    wrapper_model=self._eval_model)

                cohort_metrics['groundedness'] = get_genai_metric_mean(
                    'groundedness',
                    predictions=predicted_y_cohort,
                    references=prompts_cohort,
                    wrapper_model=self._eval_model)

                cohort_metrics['relevance'] = get_genai_metric_mean(
                    'relevance',
                    predictions=predicted_y_cohort,
                    references=prompts_cohort,
                    wrapper_model=self._eval_model)

                all_cohort_metrics.append(cohort_metrics)
            except ValueError:
                all_cohort_metrics.append({})
        return all_cohort_metrics
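Tying the pieces together, a hedged end-to-end sketch of how the new task type, the eval_model argument and compute_genai_metrics might be used. The model objects, the DataFrame contents, the top-level import path and the compute() call are assumptions for illustration, not taken from this diff:

import pandas as pd

from responsibleai_text import RAITextInsights
from responsibleai_text.common.constants import ModelTask

# Placeholder models: anything accepted by ml_wrappers' wrap_model.
generation_model = ...  # model under analysis
judge_model = ...       # model used to answer the AI-assisted metric prompts

test = pd.DataFrame({'prompt': ['What is the capital of France?']})

rai_insights = RAITextInsights(
    model=generation_model,
    test=test,
    target_column=None,                   # optional for generative text
    task_type=ModelTask.GENERATIVE_TEXT,
    eval_model=judge_model)               # falls back to `model` when omitted
rai_insights.compute()

# One cohort holding row 0; the cache argument is accepted but not used
# by the implementation shown above.
per_cohort = rai_insights.compute_genai_metrics(
    selection_indexes=[[0]], genai_cache=None)
print(per_cohort[0])  # e.g. {'coherence': ..., 'fluency': ..., 'groundedness': ..., 'relevance': ...}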
@@ -12,7 +12,8 @@
from tqdm import tqdm

from nlp_feature_extractors import attribute_extractors as exts
from responsibleai_text.common.constants import (ModelTask,
from responsibleai_text.common.constants import (GenerativeTextFields,
                                                  ModelTask,
                                                  QuestionAnsweringFields)

nlp = None
@@ -60,6 +61,9 @@ def extract_features(text_dataset: pd.DataFrame,
        feature_names.append(prefix + "maximum_parse_tree_depth")
        feature_names.append("question_type")
        feature_names.append("context_overlap")
    elif task_type == ModelTask.GENERATIVE_TEXT:
        start_meta_index = 0
        feature_names = base_feature_names
    else:
        raise ValueError("Unknown task type: {}".format(task_type))
    # copy over the metadata column names
@@ -96,6 +100,19 @@ def extract_features(text_dataset: pd.DataFrame,
            context_overlap = get_context_overlap(context=context,
                                                  question=question)
            extracted_features.append(context_overlap)
            # append all other metadata features
            append_metadata_values(start_meta_index, text_dataset, i,
                                   extracted_features, has_dropped_features,
                                   dropped_features, column_names)
            results.append(extracted_features)
    elif task_type == ModelTask.GENERATIVE_TEXT:
        for i, row in tqdm(text_features.iterrows(),
                           desc='feature extraction'):
            extracted_features = []
            add_extracted_features_for_sentence(
                row[GenerativeTextFields.PROMPT], extracted_features,
                task_type)

            # append all other metadata features
            append_metadata_values(start_meta_index, text_dataset, i,
                                   extracted_features, has_dropped_features,
@@ -0,0 +1,4 @@
# Copyright (c) Microsoft Corporation
# Licensed under the MIT License.

"""Contains the GenAI metrics."""
@@ -0,0 +1,4 @@
# Copyright (c) Microsoft Corporation
# Licensed under the MIT License.

"""Contains the implementation of various metrics for GenAI."""
@@ -24,7 +24,7 @@ def _compute_metric(template, logger, wrapper_model, **kwargs):
    templated_ques = format_str(template, **kwargs)

    inp = pd.DataFrame({
        'questions': templated_ques,
        'prompt': templated_ques,
        'sys_prompt': _SYS_PROMPT})

    responses = wrapper_model.predict(inp)
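For context, this change means the judge model now receives its input frame keyed by 'prompt' and 'sys_prompt' rather than 'questions'. A small sketch of that input, with an invented templated question and a stand-in system prompt:

import pandas as pd

templated_ques = [
    'Rate the coherence of the RESPONSE to the QUESTION from 1 to 5. ...']  # invented template text
inp = pd.DataFrame({
    'prompt': templated_ques,
    'sys_prompt': 'You are an impartial judge.'})  # stand-in for _SYS_PROMPT
print(list(inp.columns))  # ['prompt', 'sys_prompt']: what wrapper_model.predict(inp) now receives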
@@ -48,6 +48,15 @@
This rating value should always be an integer between 1 and 5. So the rating \
produced should be 1 or 2 or 3 or 4 or 5.
Some examples of valid responses are:
1
2
5
Some examples of invalid responses are:
1/5
1.5
3.0
5 stars
QUESTION:
{question}
@@ -47,6 +47,15 @@
This rating value should always be an integer between 1 and 5. So the rating \
produced should be 1 or 2 or 3 or 4 or 5.
Some examples of valid responses are:
1
2
5
Some examples of invalid responses are:
1/5
1.5
3.0
5 stars
QUESTION:
{question}