From f8c6ebaa15f7acca962784f217ede1c4447f51dc Mon Sep 17 00:00:00 2001 From: jsonbailey Date: Wed, 11 Mar 2026 17:40:06 -0500 Subject: [PATCH 1/3] fix: Remove evaluation metric key from schema which failed on some LLMs --- .../sdk/server-ai/src/ldai/judge/__init__.py | 37 ++------ .../ldai/judge/evaluation_schema_builder.py | 90 +++++++----------- packages/sdk/server-ai/tests/test_judge.py | 92 ++++++++++++++----- 3 files changed, 108 insertions(+), 111 deletions(-) diff --git a/packages/sdk/server-ai/src/ldai/judge/__init__.py b/packages/sdk/server-ai/src/ldai/judge/__init__.py index 0ca402a..2557e4d 100644 --- a/packages/sdk/server-ai/src/ldai/judge/__init__.py +++ b/packages/sdk/server-ai/src/ldai/judge/__init__.py @@ -37,7 +37,7 @@ def __init__( self._ai_config = ai_config self._ai_config_tracker = ai_config_tracker self._ai_provider = ai_provider - self._evaluation_response_structure = EvaluationSchemaBuilder.build(ai_config.evaluation_metric_key) + self._evaluation_response_structure = EvaluationSchemaBuilder.build() async def evaluate( self, @@ -77,10 +77,9 @@ async def evaluate( ) success = response.metrics.success - evals = self._parse_evaluation_response(response.data) - if self._ai_config.evaluation_metric_key not in evals: + if not evals: log.warn('Judge evaluation did not return the expected evaluation') success = False @@ -175,47 +174,27 @@ def _interpolate_message(self, content: str, variables: Dict[str, str]) -> str: def _parse_evaluation_response(self, data: Dict[str, Any]) -> Dict[str, EvalScore]: """ - Parses the structured evaluation response from the AI provider. - - :param data: The structured response data - :return: Dictionary of evaluation scores keyed by metric key + Parses the structured evaluation response. Expects {"evaluation": {"score": n, "reasoning": "..."}}. """ results: Dict[str, EvalScore] = {} - - if not data.get('evaluations') or not isinstance(data['evaluations'], dict): - log.warn('Invalid response: missing or invalid evaluations object') - return results - - evaluations = data['evaluations'] - metric_key = self._ai_config.evaluation_metric_key if not metric_key: log.warn('Evaluation metric key is missing') return results - evaluation = evaluations.get(metric_key) - - if not evaluation or not isinstance(evaluation, dict): - log.warn(f'Missing evaluation for metric key: {metric_key}') + evaluation = data.get('evaluation') if isinstance(data, dict) else None + if not isinstance(evaluation, dict): + log.warn('Invalid response: missing or invalid evaluation') return results score = evaluation.get('score') reasoning = evaluation.get('reasoning') - if not isinstance(score, (int, float)) or score < 0 or score > 1: - log.warn( - f'Invalid score evaluated for {metric_key}: {score}. ' - 'Score must be a number between 0 and 1 inclusive' - ) + log.warn(f'Invalid score: {score}. Score must be a number between 0 and 1 inclusive') return results - if not isinstance(reasoning, str): - log.warn( - f'Invalid reasoning evaluated for {metric_key}: {reasoning}. 
' - 'Reasoning must be a string' - ) + log.warn('Invalid reasoning: must be a string') return results results[metric_key] = EvalScore(score=float(score), reasoning=reasoning) - return results diff --git a/packages/sdk/server-ai/src/ldai/judge/evaluation_schema_builder.py b/packages/sdk/server-ai/src/ldai/judge/evaluation_schema_builder.py index c69e0af..3616ac4 100644 --- a/packages/sdk/server-ai/src/ldai/judge/evaluation_schema_builder.py +++ b/packages/sdk/server-ai/src/ldai/judge/evaluation_schema_builder.py @@ -1,79 +1,51 @@ -"""Internal class for building dynamic evaluation response schemas.""" +"""Internal class for building evaluation response schemas.""" -from typing import Any, Dict, Optional +from typing import Any, Dict class EvaluationSchemaBuilder: """ - Internal class for building dynamic evaluation response schemas. + Internal class for building evaluation response schemas. Not exported - only used internally by Judge. + Schema is a fixed shape: one "evaluation" object with score and reasoning. + The judge config's evaluation_metric_key is only used when keying the result, + not in the schema. """ @staticmethod - def build(evaluation_metric_key: Optional[str]) -> Optional[Dict[str, Any]]: + def build() -> Dict[str, Any]: """ - Build an evaluation response schema from evaluation metric key. + Build the evaluation response schema. No parameters; the schema is + always the same. The judge keys the parsed result by its config's + evaluation_metric_key. - :param evaluation_metric_key: Evaluation metric key, or None if not available - :return: Schema dictionary for structured output, or None if evaluation_metric_key is None - """ - if not evaluation_metric_key: - return None + In practice the model returns JSON like: + {"evaluation": {"score": 0.85, "reasoning": "The response is accurate."}} + :return: Schema dictionary for structured output + """ return { 'title': 'EvaluationResponse', - 'description': f"Response containing evaluation results for {evaluation_metric_key} metric", + 'description': 'Response containing an evaluation (score and reasoning).', 'type': 'object', 'properties': { - 'evaluations': { + 'evaluation': { 'type': 'object', - 'description': ( - f"Object containing evaluation results for " - f"{evaluation_metric_key} metric" - ), - 'properties': EvaluationSchemaBuilder._build_key_properties(evaluation_metric_key), - 'required': [evaluation_metric_key], - 'additionalProperties': False, - }, - }, - 'required': ['evaluations'], - 'additionalProperties': False, - } - - @staticmethod - def _build_key_properties(evaluation_metric_key: str) -> Dict[str, Any]: - """ - Build properties for a single evaluation metric key. - - :param evaluation_metric_key: Evaluation metric key - :return: Dictionary of properties for the key - """ - return { - evaluation_metric_key: EvaluationSchemaBuilder._build_key_schema(evaluation_metric_key) - } - - @staticmethod - def _build_key_schema(key: str) -> Dict[str, Any]: - """ - Build schema for a single evaluation metric key. 
- - :param key: Evaluation metric key - :return: Schema dictionary for the key - """ - return { - 'type': 'object', - 'properties': { - 'score': { - 'type': 'number', - 'minimum': 0, - 'maximum': 1, - 'description': f'Score between 0.0 and 1.0 for {key}', - }, - 'reasoning': { - 'type': 'string', - 'description': f'Reasoning behind the score for {key}', + 'description': 'The evaluation result.', + 'properties': { + 'score': { + 'type': 'number', + 'minimum': 0, + 'maximum': 1, + 'description': 'Score between 0.0 and 1.0.', + }, + 'reasoning': { + 'type': 'string', + 'description': 'Reasoning behind the score.', + }, + }, + 'required': ['score', 'reasoning'], }, }, - 'required': ['score', 'reasoning'], - 'additionalProperties': False, + 'required': ['evaluation'], } diff --git a/packages/sdk/server-ai/tests/test_judge.py b/packages/sdk/server-ai/tests/test_judge.py index d386b92..b1a1cba 100644 --- a/packages/sdk/server-ai/tests/test_judge.py +++ b/packages/sdk/server-ai/tests/test_judge.py @@ -109,7 +109,10 @@ def test_judge_initializes_with_evaluation_metric_key( assert judge._ai_config == judge_config_with_key assert judge._evaluation_response_structure is not None assert judge._evaluation_response_structure['title'] == 'EvaluationResponse' - assert '$ld:ai:judge:relevance' in judge._evaluation_response_structure['properties']['evaluations']['required'] + assert judge._evaluation_response_structure['required'] == ['evaluation'] + eval_schema = judge._evaluation_response_structure['properties']['evaluation'] + assert eval_schema['required'] == ['score', 'reasoning'] + assert 'score' in eval_schema['properties'] and 'reasoning' in eval_schema['properties'] def test_judge_initializes_without_evaluation_metric_key( self, judge_config_without_key: AIJudgeConfig, tracker: LDAIConfigTracker, mock_ai_provider @@ -180,6 +183,58 @@ async def test_evaluate_success_with_valid_response( assert result.evals['$ld:ai:judge:relevance'].score == 0.85 assert 'relevant' in result.evals['$ld:ai:judge:relevance'].reasoning.lower() + @pytest.mark.asyncio + async def test_evaluate_success_with_evaluation_response_shape( + self, judge_config_with_key: AIJudgeConfig, tracker: LDAIConfigTracker, mock_ai_provider + ): + """Evaluate should accept shape { evaluation: { score, reasoning } } and key by metric.""" + mock_response = StructuredResponse( + data={ + 'evaluation': { + 'score': 0.9, + 'reasoning': 'The response is accurate and complete.', + } + }, + raw_response='{"evaluation": {"score": 0.9, "reasoning": "..."}}', + metrics=LDAIMetrics(success=True), + ) + mock_ai_provider.invoke_structured_model.return_value = mock_response + tracker.track_metrics_of = AsyncMock(return_value=mock_response) + + judge = Judge(judge_config_with_key, tracker, mock_ai_provider) + result = await judge.evaluate("What is feature flagging?", "Feature flagging is...") + + assert result is not None + assert result.success is True + assert '$ld:ai:judge:relevance' in result.evals + assert result.evals['$ld:ai:judge:relevance'].score == 0.9 + assert 'accurate' in result.evals['$ld:ai:judge:relevance'].reasoning.lower() + + @pytest.mark.asyncio + async def test_evaluate_success_with_evaluations_backward_compat( + self, judge_config_with_key: AIJudgeConfig, tracker: LDAIConfigTracker, mock_ai_provider + ): + """Evaluate should accept legacy shape { evaluations: { score, reasoning } }.""" + mock_response = StructuredResponse( + data={ + 'evaluations': { + 'score': 0.7, + 'reasoning': 'Partially correct.', + } + }, + 
raw_response='{"evaluations": {"score": 0.7, "reasoning": "..."}}', + metrics=LDAIMetrics(success=True), + ) + mock_ai_provider.invoke_structured_model.return_value = mock_response + tracker.track_metrics_of = AsyncMock(return_value=mock_response) + + judge = Judge(judge_config_with_key, tracker, mock_ai_provider) + result = await judge.evaluate("input", "output") + + assert result is not None + assert result.success is True + assert result.evals['$ld:ai:judge:relevance'].score == 0.7 + @pytest.mark.asyncio async def test_evaluate_handles_missing_evaluation_in_response( self, judge_config_with_key: AIJudgeConfig, tracker: LDAIConfigTracker, mock_ai_provider @@ -345,30 +400,21 @@ class TestEvaluationSchemaBuilder: """Tests for EvaluationSchemaBuilder.""" def test_build_creates_correct_schema(self): - """Schema builder should create correct schema structure.""" - schema = EvaluationSchemaBuilder.build('$ld:ai:judge:relevance') - + """Schema builder should create fixed schema (evaluation with score + reasoning, no key param).""" + schema = EvaluationSchemaBuilder.build() + assert schema['title'] == 'EvaluationResponse' assert schema['type'] == 'object' - assert 'evaluations' in schema['properties'] - assert '$ld:ai:judge:relevance' in schema['properties']['evaluations']['required'] - assert '$ld:ai:judge:relevance' in schema['properties']['evaluations']['properties'] - - metric_schema = schema['properties']['evaluations']['properties']['$ld:ai:judge:relevance'] - assert metric_schema['type'] == 'object' - assert 'score' in metric_schema['properties'] - assert 'reasoning' in metric_schema['properties'] - assert metric_schema['properties']['score']['type'] == 'number' - assert metric_schema['properties']['score']['minimum'] == 0 - assert metric_schema['properties']['score']['maximum'] == 1 - - def test_build_key_properties_creates_single_key(self): - """_build_key_properties should create properties for a single key.""" - properties = EvaluationSchemaBuilder._build_key_properties('$ld:ai:judge:relevance') - - assert '$ld:ai:judge:relevance' in properties - assert len(properties) == 1 - assert properties['$ld:ai:judge:relevance']['type'] == 'object' + assert schema['required'] == ['evaluation'] + assert 'evaluation' in schema['properties'] + eval_schema = schema['properties']['evaluation'] + assert eval_schema['type'] == 'object' + assert eval_schema['required'] == ['score', 'reasoning'] + assert 'score' in eval_schema['properties'] + assert 'reasoning' in eval_schema['properties'] + assert eval_schema['properties']['score']['type'] == 'number' + assert eval_schema['properties']['score']['minimum'] == 0 + assert eval_schema['properties']['score']['maximum'] == 1 class TestJudgeConfigSerialization: From 49f5e2e8cf3929691464828801388c2d1ac74470 Mon Sep 17 00:00:00 2001 From: jsonbailey Date: Wed, 11 Mar 2026 17:43:36 -0500 Subject: [PATCH 2/3] additional properties is required for openai schemas --- .../sdk/server-ai/src/ldai/judge/evaluation_schema_builder.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/packages/sdk/server-ai/src/ldai/judge/evaluation_schema_builder.py b/packages/sdk/server-ai/src/ldai/judge/evaluation_schema_builder.py index 3616ac4..d19bd48 100644 --- a/packages/sdk/server-ai/src/ldai/judge/evaluation_schema_builder.py +++ b/packages/sdk/server-ai/src/ldai/judge/evaluation_schema_builder.py @@ -45,7 +45,9 @@ def build() -> Dict[str, Any]: }, }, 'required': ['score', 'reasoning'], + 'additionalProperties': False, }, }, 'required': ['evaluation'], + 'additionalProperties': 
False, } From 916df2ae1d1a3fd6bdfcb55c604d77358bfcc01d Mon Sep 17 00:00:00 2001 From: jsonbailey Date: Wed, 11 Mar 2026 17:50:09 -0500 Subject: [PATCH 3/3] fix tests --- packages/sdk/server-ai/tests/test_judge.py | 83 +++++----------------- 1 file changed, 17 insertions(+), 66 deletions(-) diff --git a/packages/sdk/server-ai/tests/test_judge.py b/packages/sdk/server-ai/tests/test_judge.py index b1a1cba..9ac3c64 100644 --- a/packages/sdk/server-ai/tests/test_judge.py +++ b/packages/sdk/server-ai/tests/test_judge.py @@ -114,15 +114,6 @@ def test_judge_initializes_with_evaluation_metric_key( assert eval_schema['required'] == ['score', 'reasoning'] assert 'score' in eval_schema['properties'] and 'reasoning' in eval_schema['properties'] - def test_judge_initializes_without_evaluation_metric_key( - self, judge_config_without_key: AIJudgeConfig, tracker: LDAIConfigTracker, mock_ai_provider - ): - """Judge should initialize but have None for evaluation_response_structure.""" - judge = Judge(judge_config_without_key, tracker, mock_ai_provider) - - assert judge._ai_config == judge_config_without_key - assert judge._evaluation_response_structure is None - class TestJudgeEvaluate: """Tests for Judge.evaluate() method.""" @@ -158,14 +149,12 @@ async def test_evaluate_success_with_valid_response( """Evaluate should return JudgeResponse with valid evaluation.""" mock_response = StructuredResponse( data={ - 'evaluations': { - '$ld:ai:judge:relevance': { - 'score': 0.85, - 'reasoning': 'The response is highly relevant to the input.' - } + 'evaluation': { + 'score': 0.85, + 'reasoning': 'The response is highly relevant to the input.' } }, - raw_response='{"evaluations": {...}}', + raw_response='{"evaluation": {"score": 0.85, "reasoning": "..."}}', metrics=LDAIMetrics(success=True) ) @@ -210,46 +199,14 @@ async def test_evaluate_success_with_evaluation_response_shape( assert result.evals['$ld:ai:judge:relevance'].score == 0.9 assert 'accurate' in result.evals['$ld:ai:judge:relevance'].reasoning.lower() - @pytest.mark.asyncio - async def test_evaluate_success_with_evaluations_backward_compat( - self, judge_config_with_key: AIJudgeConfig, tracker: LDAIConfigTracker, mock_ai_provider - ): - """Evaluate should accept legacy shape { evaluations: { score, reasoning } }.""" - mock_response = StructuredResponse( - data={ - 'evaluations': { - 'score': 0.7, - 'reasoning': 'Partially correct.', - } - }, - raw_response='{"evaluations": {"score": 0.7, "reasoning": "..."}}', - metrics=LDAIMetrics(success=True), - ) - mock_ai_provider.invoke_structured_model.return_value = mock_response - tracker.track_metrics_of = AsyncMock(return_value=mock_response) - - judge = Judge(judge_config_with_key, tracker, mock_ai_provider) - result = await judge.evaluate("input", "output") - - assert result is not None - assert result.success is True - assert result.evals['$ld:ai:judge:relevance'].score == 0.7 - @pytest.mark.asyncio async def test_evaluate_handles_missing_evaluation_in_response( self, judge_config_with_key: AIJudgeConfig, tracker: LDAIConfigTracker, mock_ai_provider ): """Evaluate should handle missing evaluation in response.""" mock_response = StructuredResponse( - data={ - 'evaluations': { - 'wrong-key': { - 'score': 0.5, - 'reasoning': 'Some reasoning' - } - } - }, - raw_response='{"evaluations": {...}}', + data={}, + raw_response='{}', metrics=LDAIMetrics(success=True) ) @@ -271,14 +228,12 @@ async def test_evaluate_handles_invalid_score( """Evaluate should handle invalid score values.""" mock_response = 
StructuredResponse( data={ - 'evaluations': { - '$ld:ai:judge:relevance': { - 'score': 1.5, - 'reasoning': 'Some reasoning' - } + 'evaluation': { + 'score': 1.5, + 'reasoning': 'Some reasoning' } }, - raw_response='{"evaluations": {...}}', + raw_response='{"evaluation": {"score": 1.5, "reasoning": "..."}}', metrics=LDAIMetrics(success=True) ) @@ -300,13 +255,11 @@ async def test_evaluate_handles_missing_reasoning( """Evaluate should handle missing reasoning.""" mock_response = StructuredResponse( data={ - 'evaluations': { - '$ld:ai:judge:relevance': { - 'score': 0.8, - } + 'evaluation': { + 'score': 0.8, } }, - raw_response='{"evaluations": {...}}', + raw_response='{"evaluation": {"score": 0.8}}', metrics=LDAIMetrics(success=True) ) @@ -364,14 +317,12 @@ async def test_evaluate_messages_calls_evaluate( mock_response = StructuredResponse( data={ - 'evaluations': { - '$ld:ai:judge:relevance': { - 'score': 0.9, - 'reasoning': 'Very relevant' - } + 'evaluation': { + 'score': 0.9, + 'reasoning': 'Very relevant' } }, - raw_response='{"evaluations": {...}}', + raw_response='{"evaluation": {"score": 0.9, "reasoning": "..."}}', metrics=LDAIMetrics(success=True) )
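
Illustrative sketch (not part of the patch): across these three commits the judge's
response schema becomes a fixed {"evaluation": {"score", "reasoning"}} object with
additionalProperties disallowed at both levels (as OpenAI structured outputs require),
and the config's evaluation_metric_key is used only to key the parsed result. A minimal
round trip under a few assumptions (the ldai.judge.evaluation_schema_builder import path
implied by the src layout, the '$ld:ai:judge:relevance' key taken from the tests, and a
plain dict standing in for EvalScore) looks roughly like this:

    from ldai.judge.evaluation_schema_builder import EvaluationSchemaBuilder

    # The schema is now a fixed shape; no metric key is embedded in it.
    schema = EvaluationSchemaBuilder.build()
    assert schema['required'] == ['evaluation']

    # A structured model response in the shape the judge now expects.
    data = {'evaluation': {'score': 0.85, 'reasoning': 'Relevant and accurate.'}}

    # Key the parsed score by the config's evaluation_metric_key
    # (mirrors Judge._parse_evaluation_response; the dict stands in for EvalScore).
    metric_key = '$ld:ai:judge:relevance'
    evaluation = data.get('evaluation') if isinstance(data, dict) else None
    evals = {}
    if isinstance(evaluation, dict):
        score = evaluation.get('score')
        reasoning = evaluation.get('reasoning')
        if isinstance(score, (int, float)) and 0 <= score <= 1 and isinstance(reasoning, str):
            evals[metric_key] = {'score': float(score), 'reasoning': reasoning}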