From ea83798e634124a355a3bf26a021ad517818786a Mon Sep 17 00:00:00 2001 From: everythingfades Date: Sun, 22 Feb 2026 23:28:53 +0000 Subject: [PATCH] fix: let the output be structured even if the input is invalid --- evaluation_function/evaluation.py | 14 ++++++++++---- evaluation_function/schemas/result.py | 19 ++++--------------- 2 files changed, 14 insertions(+), 19 deletions(-) diff --git a/evaluation_function/evaluation.py b/evaluation_function/evaluation.py index d0d9230..371d0d7 100755 --- a/evaluation_function/evaluation.py +++ b/evaluation_function/evaluation.py @@ -3,7 +3,7 @@ from evaluation_function.schemas.params import Params from .schemas import FSA, FSAFrontend -from .schemas.result import Result +from .schemas.result import FSAFeedback, Result from .correction import analyze_fsa_correction import json @@ -52,12 +52,18 @@ def evaluation_function( ) except Exception as e: - # Always return LFResult with raw payload for debugging + result: Result = Result( + is_correct=False, + feedback=f"Error during evaluation: {str(e)}", + fsa_feedback=FSAFeedback( + summary=f"Error during evaluation: {str(e)}", + errors=[] + ) + ) return LFResult( is_correct=False, feedback_items=[( "error", - f"Invalid FSA format: {str(e)}\n\n" - f"response: {response}\nanswer: {answer}\nparams: {params}" + result.fsa_feedback.model_dump_json() )] ) diff --git a/evaluation_function/schemas/result.py b/evaluation_function/schemas/result.py index 183c6a1..1370a08 100644 --- a/evaluation_function/schemas/result.py +++ b/evaluation_function/schemas/result.py @@ -212,11 +212,13 @@ class LanguageComparison(BaseModel): description="True if student FSA accepts the same language as expected" ) + # not used, for extension counterexample: Optional[str] = Field( default=None, description="A string where student FSA differs from expected (if languages not equivalent)" ) + # not used, for extension counterexample_type: Optional[Literal["should_accept", "should_reject"]] = Field( default=None,
description="Whether the counterexample should be accepted or rejected" @@ -255,6 +257,7 @@ class FSAFeedback(BaseModel): description="Language equivalence comparison with counterexample if applicable" ) + # not used test_results: List[TestResult] = Field( default_factory=list, description="Results of individual test cases" @@ -277,7 +280,6 @@ class Result(BaseModel): { "is_correct": false, "feedback": "Your FSA rejects 'ab' but it should accept it.", - "score": 0.75, "fsa_feedback": { "summary": "Language mismatch - incorrect behavior on some inputs", "errors": [ @@ -324,20 +326,7 @@ class Result(BaseModel): description="Human-readable feedback message for the student" ) - score: Optional[float] = Field( - default=None, - ge=0.0, - le=1.0, - description="Normalized score (0.0-1.0) for partial credit, null if not using partial credit" - ) - fsa_feedback: Optional[FSAFeedback] = Field( default=None, description="Detailed structured feedback with element highlighting for UI" - ) - - # this is dev only - input_data: Optional[FSA] = Field( - default=None, - description="The parsed FSA input data (for development/debugging purposes only)" - ) + ) \ No newline at end of file