Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 8 additions & 29 deletions packages/sdk/server-ai/src/ldai/judge/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ def __init__(
self._ai_config = ai_config
self._ai_config_tracker = ai_config_tracker
self._ai_provider = ai_provider
self._evaluation_response_structure = EvaluationSchemaBuilder.build(ai_config.evaluation_metric_key)
self._evaluation_response_structure = EvaluationSchemaBuilder.build()

async def evaluate(
self,
Expand Down Expand Up @@ -77,10 +77,9 @@ async def evaluate(
)

success = response.metrics.success

evals = self._parse_evaluation_response(response.data)

if self._ai_config.evaluation_metric_key not in evals:
if not evals:
log.warn('Judge evaluation did not return the expected evaluation')
success = False

Expand Down Expand Up @@ -175,47 +174,27 @@ def _interpolate_message(self, content: str, variables: Dict[str, str]) -> str:

def _parse_evaluation_response(self, data: Dict[str, Any]) -> Dict[str, EvalScore]:
    """
    Parses the structured evaluation response. Expects {"evaluation": {"score": n, "reasoning": "..."}}.

    :param data: The structured response data returned by the AI provider
    :return: Dictionary holding the single evaluation score keyed by the judge
        config's evaluation metric key; empty when the response is malformed
        or no metric key is configured
    """
    results: Dict[str, EvalScore] = {}

    # The response has a fixed shape and does not carry the metric key, so the
    # key used for the result comes from the judge config.
    metric_key = self._ai_config.evaluation_metric_key
    if not metric_key:
        log.warn('Evaluation metric key is missing')
        return results

    evaluation = data.get('evaluation') if isinstance(data, dict) else None
    if not isinstance(evaluation, dict):
        log.warn('Invalid response: missing or invalid evaluation')
        return results

    score = evaluation.get('score')
    reasoning = evaluation.get('reasoning')

    # bool is a subclass of int, so it is excluded explicitly; the chained
    # comparison also rejects NaN, which compares false against both bounds.
    is_number = isinstance(score, (int, float)) and not isinstance(score, bool)
    if not is_number or not 0 <= score <= 1:
        log.warn(f'Invalid score: {score}. Score must be a number between 0 and 1 inclusive')
        return results

    if not isinstance(reasoning, str):
        log.warn('Invalid reasoning: must be a string')
        return results

    results[metric_key] = EvalScore(score=float(score), reasoning=reasoning)

    return results
Original file line number Diff line number Diff line change
@@ -1,79 +1,53 @@
"""Internal class for building dynamic evaluation response schemas."""
"""Internal class for building evaluation response schemas."""

from typing import Any, Dict, Optional
from typing import Any, Dict


class EvaluationSchemaBuilder:
    """
    Internal class for building evaluation response schemas.

    Not exported - only used internally by Judge.
    Schema is a fixed shape: one "evaluation" object with score and reasoning.
    The judge config's evaluation_metric_key is only used when keying the result,
    not in the schema.
    """

    @staticmethod
    def build() -> Dict[str, Any]:
        """
        Build the evaluation response schema.

        No parameters; the schema is always the same. The judge keys the
        parsed result by its config's evaluation_metric_key.

        In practice the model returns JSON like:
            {"evaluation": {"score": 0.85, "reasoning": "The response is accurate."}}

        :return: Schema dictionary for structured output (a new dictionary on
            every call, so callers may mutate it freely)
        """
        return {
            'title': 'EvaluationResponse',
            'description': 'Response containing an evaluation (score and reasoning).',
            'type': 'object',
            'properties': {
                'evaluation': {
                    'type': 'object',
                    'description': 'The evaluation result.',
                    'properties': {
                        'score': {
                            'type': 'number',
                            'minimum': 0,
                            'maximum': 1,
                            'description': 'Score between 0.0 and 1.0.',
                        },
                        'reasoning': {
                            'type': 'string',
                            'description': 'Reasoning behind the score.',
                        },
                    },
                    'required': ['score', 'reasoning'],
                    'additionalProperties': False,
                },
            },
            'required': ['evaluation'],
            'additionalProperties': False,
        }
123 changes: 60 additions & 63 deletions packages/sdk/server-ai/tests/test_judge.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,16 +109,10 @@ def test_judge_initializes_with_evaluation_metric_key(
assert judge._ai_config == judge_config_with_key
assert judge._evaluation_response_structure is not None
assert judge._evaluation_response_structure['title'] == 'EvaluationResponse'
assert '$ld:ai:judge:relevance' in judge._evaluation_response_structure['properties']['evaluations']['required']

def test_judge_initializes_without_evaluation_metric_key(
self, judge_config_without_key: AIJudgeConfig, tracker: LDAIConfigTracker, mock_ai_provider
):
"""Judge should initialize but have None for evaluation_response_structure."""
judge = Judge(judge_config_without_key, tracker, mock_ai_provider)

assert judge._ai_config == judge_config_without_key
assert judge._evaluation_response_structure is None
assert judge._evaluation_response_structure['required'] == ['evaluation']
eval_schema = judge._evaluation_response_structure['properties']['evaluation']
assert eval_schema['required'] == ['score', 'reasoning']
assert 'score' in eval_schema['properties'] and 'reasoning' in eval_schema['properties']


class TestJudgeEvaluate:
Expand Down Expand Up @@ -155,14 +149,12 @@ async def test_evaluate_success_with_valid_response(
"""Evaluate should return JudgeResponse with valid evaluation."""
mock_response = StructuredResponse(
data={
'evaluations': {
'$ld:ai:judge:relevance': {
'score': 0.85,
'reasoning': 'The response is highly relevant to the input.'
}
'evaluation': {
'score': 0.85,
'reasoning': 'The response is highly relevant to the input.'
}
},
raw_response='{"evaluations": {...}}',
raw_response='{"evaluation": {"score": 0.85, "reasoning": "..."}}',
metrics=LDAIMetrics(success=True)
)

Expand All @@ -181,20 +173,40 @@ async def test_evaluate_success_with_valid_response(
assert 'relevant' in result.evals['$ld:ai:judge:relevance'].reasoning.lower()

@pytest.mark.asyncio
async def test_evaluate_success_with_evaluation_response_shape(
    self, judge_config_with_key: AIJudgeConfig, tracker: LDAIConfigTracker, mock_ai_provider
):
    """Evaluate should accept shape { evaluation: { score, reasoning } } and key by metric."""
    mock_response = StructuredResponse(
        data={
            'evaluation': {
                'score': 0.9,
                'reasoning': 'The response is accurate and complete.',
            }
        },
        raw_response='{"evaluation": {"score": 0.9, "reasoning": "..."}}',
        metrics=LDAIMetrics(success=True),
    )
    # Both the provider call and the tracker wrapper hand back the canned
    # response, so the judge parses exactly the payload defined above.
    mock_ai_provider.invoke_structured_model.return_value = mock_response
    tracker.track_metrics_of = AsyncMock(return_value=mock_response)

    judge = Judge(judge_config_with_key, tracker, mock_ai_provider)
    result = await judge.evaluate("What is feature flagging?", "Feature flagging is...")

    assert result is not None
    assert result.success is True
    # The response carries no metric key; the result is keyed from the config.
    assert '$ld:ai:judge:relevance' in result.evals
    assert result.evals['$ld:ai:judge:relevance'].score == 0.9
    assert 'accurate' in result.evals['$ld:ai:judge:relevance'].reasoning.lower()

@pytest.mark.asyncio
async def test_evaluate_handles_missing_evaluation_in_response(
self, judge_config_with_key: AIJudgeConfig, tracker: LDAIConfigTracker, mock_ai_provider
):
"""Evaluate should handle missing evaluation in response."""
mock_response = StructuredResponse(
data={},
raw_response='{}',
metrics=LDAIMetrics(success=True)
)

Expand All @@ -216,14 +228,12 @@ async def test_evaluate_handles_invalid_score(
"""Evaluate should handle invalid score values."""
mock_response = StructuredResponse(
data={
'evaluations': {
'$ld:ai:judge:relevance': {
'score': 1.5,
'reasoning': 'Some reasoning'
}
'evaluation': {
'score': 1.5,
'reasoning': 'Some reasoning'
}
},
raw_response='{"evaluations": {...}}',
raw_response='{"evaluation": {"score": 1.5, "reasoning": "..."}}',
metrics=LDAIMetrics(success=True)
)

Expand All @@ -245,13 +255,11 @@ async def test_evaluate_handles_missing_reasoning(
"""Evaluate should handle missing reasoning."""
mock_response = StructuredResponse(
data={
'evaluations': {
'$ld:ai:judge:relevance': {
'score': 0.8,
}
'evaluation': {
'score': 0.8,
}
},
raw_response='{"evaluations": {...}}',
raw_response='{"evaluation": {"score": 0.8}}',
metrics=LDAIMetrics(success=True)
)

Expand Down Expand Up @@ -309,14 +317,12 @@ async def test_evaluate_messages_calls_evaluate(

mock_response = StructuredResponse(
data={
'evaluations': {
'$ld:ai:judge:relevance': {
'score': 0.9,
'reasoning': 'Very relevant'
}
'evaluation': {
'score': 0.9,
'reasoning': 'Very relevant'
}
},
raw_response='{"evaluations": {...}}',
raw_response='{"evaluation": {"score": 0.9, "reasoning": "..."}}',
metrics=LDAIMetrics(success=True)
)

Expand Down Expand Up @@ -345,30 +351,21 @@ class TestEvaluationSchemaBuilder:
"""Tests for EvaluationSchemaBuilder."""

def test_build_creates_correct_schema(self):
    """Schema builder should create fixed schema (evaluation with score + reasoning, no key param)."""
    schema = EvaluationSchemaBuilder.build()

    # Top level: a single required "evaluation" object.
    assert schema['title'] == 'EvaluationResponse'
    assert schema['type'] == 'object'
    assert schema['required'] == ['evaluation']
    assert 'evaluation' in schema['properties']

    # Nested evaluation object: score and reasoning are both required.
    eval_schema = schema['properties']['evaluation']
    assert eval_schema['type'] == 'object'
    assert eval_schema['required'] == ['score', 'reasoning']
    assert 'score' in eval_schema['properties']
    assert 'reasoning' in eval_schema['properties']

    # Score is a number constrained to the inclusive range [0, 1].
    assert eval_schema['properties']['score']['type'] == 'number'
    assert eval_schema['properties']['score']['minimum'] == 0
    assert eval_schema['properties']['score']['maximum'] == 1


class TestJudgeConfigSerialization:
Expand Down
Loading