Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 10 additions & 4 deletions evaluation_function/evaluation.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

from evaluation_function.schemas.params import Params
from .schemas import FSA, FSAFrontend
from .schemas.result import Result
from .schemas.result import FSAFeedback, Result
from .correction import analyze_fsa_correction
import json

Expand Down Expand Up @@ -52,12 +52,18 @@ def evaluation_function(
)

except Exception as e:
# Always return LFResult with raw payload for debugging
result: Result = Result(
is_correct=False,
feedback=f"Error during evaluation: {str(e)}",
fsa_feedback=FSAFeedback(
summary=f"Error during evaluation: {str(e)}",
errors=[]
)
)
return LFResult(
is_correct=False,
feedback_items=[(
"error",
f"Invalid FSA format: {str(e)}\n\n"
f"response: {response}\nanswer: {answer}\nparams: {params}"
result.fsa_feedback.model_dump_json()
)]
)
19 changes: 4 additions & 15 deletions evaluation_function/schemas/result.py
Original file line number Diff line number Diff line change
Expand Up @@ -212,11 +212,13 @@ class LanguageComparison(BaseModel):
description="True if student FSA accepts the same language as expected"
)

# Not currently used; reserved for a future extension.
counterexample: Optional[str] = Field(
default=None,
description="A string where student FSA differs from expected (if languages not equivalent)"
)

# Not currently used; reserved for a future extension.
counterexample_type: Optional[Literal["should_accept", "should_reject"]] = Field(
default=None,
description="Whether the counterexample should be accepted or rejected"
Expand Down Expand Up @@ -255,6 +257,7 @@ class FSAFeedback(BaseModel):
description="Language equivalence comparison with counterexample if applicable"
)

# not used
test_results: List[TestResult] = Field(
default_factory=list,
description="Results of individual test cases"
Expand All @@ -277,7 +280,6 @@ class Result(BaseModel):
{
"is_correct": false,
"feedback": "Your FSA rejects 'ab' but it should accept it.",
"score": 0.75,
"fsa_feedback": {
"summary": "Language mismatch - incorrect behavior on some inputs",
"errors": [
Expand Down Expand Up @@ -324,20 +326,7 @@ class Result(BaseModel):
description="Human-readable feedback message for the student"
)

score: Optional[float] = Field(
default=None,
ge=0.0,
le=1.0,
description="Normalized score (0.0-1.0) for partial credit, null if not using partial credit"
)

fsa_feedback: Optional[FSAFeedback] = Field(
default=None,
description="Detailed structured feedback with element highlighting for UI"
)

# Development/debugging only; not intended for production use.
input_data: Optional[FSA] = Field(
default=None,
description="The parsed FSA input data (for development/debugging purposes only)"
)
)