Fix content-filter responses showing raw JSON in results (#5058447) #45528
Fix content-filter responses showing raw JSON in results (#5058447) #45528 slister1001 wants to merge 1 commit into Azure:main from
Conversation
When Azure OpenAI content filters block a response, the result processor now detects the raw API payload and replaces it with a human-readable message like "[Response blocked by content filter: self_harm (severity: medium)]" instead of showing raw JSON.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
There was a problem hiding this comment.
Pull request overview
Fixes red team evaluation result rendering when Azure OpenAI content filters block a response, preventing raw JSON API payloads from being surfaced as assistant message content in the UI.
Changes:
- Added `ResultProcessor._clean_content_filter_response()` to detect and replace content-filter payload-like text with a human-readable message.
- Integrated the cleaning step into `_normalize_sample_message()` for assistant-role messages.
| try: | ||
| # Try to parse the nested JSON to extract filter details | ||
| if '"message":' in content: | ||
| match = re.search(r'"message"\s*:\s*"(.*?)"(?:\s*[,}])', content) | ||
| if match: | ||
| inner = match.group(1).replace('\\"', '"').replace("\\\\", "\\") | ||
| try: | ||
| inner_json = json.loads(inner) | ||
| for choice in inner_json.get("choices", []): | ||
| cfr = choice.get("content_filter_results", {}) | ||
| for category, details in cfr.items(): | ||
| if isinstance(details, dict) and details.get("filtered"): | ||
| severity = details.get("severity", "unknown") | ||
| filter_details.append(f"{category} (severity: {severity})") | ||
| except (json.JSONDecodeError, KeyError, TypeError): | ||
| pass | ||
| # Also check for the direct error format | ||
| if not filter_details and "content_filter" in content_lower: | ||
| for category in ["hate", "self_harm", "sexual", "violence"]: | ||
| pattern = f'"{category}".*?"filtered"\\s*:\\s*true' | ||
| if re.search(pattern, content, re.IGNORECASE): | ||
| sev_match = re.search(f'"{category}".*?"severity"\\s*:\\s*"(\\w+)"', content, re.IGNORECASE) | ||
| severity = sev_match.group(1) if sev_match else "unknown" | ||
| filter_details.append(f"{category} (severity: {severity})") | ||
| except Exception: # pylint: disable=broad-except | ||
| pass | ||
|
|
There was a problem hiding this comment.
The parsing logic relies on regexes over the raw string and then wraps the whole extraction in a broad `except Exception`, which can mask unexpected issues and makes the behavior hard to reason about. Since the goal is to detect and clean serialized JSON payloads, it would be more robust to attempt `json.loads(content)` first (when the content looks like JSON) and extract `choices[*].content_filter_results` directly, falling back to regex only if JSON parsing fails. If you keep the `try`/`except`, consider narrowing it to the expected exception types.
| try: | |
| # Try to parse the nested JSON to extract filter details | |
| if '"message":' in content: | |
| match = re.search(r'"message"\s*:\s*"(.*?)"(?:\s*[,}])', content) | |
| if match: | |
| inner = match.group(1).replace('\\"', '"').replace("\\\\", "\\") | |
| try: | |
| inner_json = json.loads(inner) | |
| for choice in inner_json.get("choices", []): | |
| cfr = choice.get("content_filter_results", {}) | |
| for category, details in cfr.items(): | |
| if isinstance(details, dict) and details.get("filtered"): | |
| severity = details.get("severity", "unknown") | |
| filter_details.append(f"{category} (severity: {severity})") | |
| except (json.JSONDecodeError, KeyError, TypeError): | |
| pass | |
| # Also check for the direct error format | |
| if not filter_details and "content_filter" in content_lower: | |
| for category in ["hate", "self_harm", "sexual", "violence"]: | |
| pattern = f'"{category}".*?"filtered"\\s*:\\s*true' | |
| if re.search(pattern, content, re.IGNORECASE): | |
| sev_match = re.search(f'"{category}".*?"severity"\\s*:\\s*"(\\w+)"', content, re.IGNORECASE) | |
| severity = sev_match.group(1) if sev_match else "unknown" | |
| filter_details.append(f"{category} (severity: {severity})") | |
| except Exception: # pylint: disable=broad-except | |
| pass | |
| # First, try to parse the entire content as JSON if it looks like JSON. | |
| stripped = content.strip() | |
| if stripped and stripped[0] in ("{", "["): | |
| try: | |
| parsed = json.loads(stripped) | |
| except json.JSONDecodeError: | |
| parsed = None | |
| if isinstance(parsed, dict): | |
| choices = parsed.get("choices", []) | |
| if isinstance(choices, list): | |
| for choice in choices: | |
| if not isinstance(choice, dict): | |
| continue | |
| # Prefer content_filter_results, but also check content_filter for robustness. | |
| cfr = choice.get("content_filter_results") or choice.get("content_filter", {}) | |
| if not isinstance(cfr, dict): | |
| continue | |
| for category, details in cfr.items(): | |
| if isinstance(details, dict) and details.get("filtered"): | |
| severity = details.get("severity", "unknown") | |
| filter_details.append(f"{category} (severity: {severity})") | |
| # If we did not find details from the top-level JSON, try nested "message" JSON extraction. | |
| if not filter_details and '"message":' in content: | |
| match = re.search(r'"message"\s*:\s*"(.*?)"(?:\s*[,}])', content) | |
| if match: | |
| inner = match.group(1).replace('\\"', '"').replace("\\\\", "\\") | |
| try: | |
| inner_json = json.loads(inner) | |
| for choice in inner_json.get("choices", []): | |
| if not isinstance(choice, dict): | |
| continue | |
| cfr = choice.get("content_filter_results", {}) | |
| if not isinstance(cfr, dict): | |
| continue | |
| for category, details in cfr.items(): | |
| if isinstance(details, dict) and details.get("filtered"): | |
| severity = details.get("severity", "unknown") | |
| filter_details.append(f"{category} (severity: {severity})") | |
| except (json.JSONDecodeError, KeyError, TypeError): | |
| # If the nested JSON is malformed or has unexpected shape, ignore and fall back. | |
| pass | |
| # Also check for the direct error format via regex as a final fallback. | |
| if not filter_details and "content_filter" in content_lower: | |
| for category in ["hate", "self_harm", "sexual", "violence"]: | |
| pattern = f'"{category}".*?"filtered"\\s*:\\s*true' | |
| if re.search(pattern, content, re.IGNORECASE): | |
| sev_match = re.search( | |
| f'"{category}".*?"severity"\\s*:\\s*"(\\w+)"', | |
| content, | |
| re.IGNORECASE, | |
| ) | |
| severity = sev_match.group(1) if sev_match else "unknown" | |
| filter_details.append(f"{category} (severity: {severity})") |
| # Clean raw content-filter API responses for assistant messages | ||
| if normalized.get("role") == "assistant": | ||
| content = normalized.get("content", "") | ||
| if isinstance(content, str): | ||
| normalized["content"] = ResultProcessor._clean_content_filter_response(content) | ||
|
|
There was a problem hiding this comment.
This change introduces new normalization behavior (rewriting assistant message content when it resembles a content-filter payload), but although there are existing unit tests for `red_team`/`ResultProcessor`, none appear to cover this path. Adding a focused test for `_normalize_sample_message` / `_clean_content_filter_response` (both a positive case with a real content-filter payload and a negative case with normal text mentioning similar words) would help prevent regressions.
| def _clean_content_filter_response(content: str) -> str: | ||
| """If content looks like a raw content-filter API response, replace with friendly text.""" | ||
| if not content or not isinstance(content, str): |
There was a problem hiding this comment.
`_clean_content_filter_response` is annotated as `(content: str) -> str`, but it returns `content` unchanged even when `content` is not a string (the `not isinstance(content, str)` branch). This is a real type contract violation and can trip mypy/pylint. Consider either (a) removing the non-`str` path and only accepting `str` here, or (b) changing the signature/return type to `Optional[str]`/`Any` and adjusting callers accordingly.
| def _clean_content_filter_response(content: str) -> str: | |
| """If content looks like a raw content-filter API response, replace with friendly text.""" | |
| if not content or not isinstance(content, str): | |
| def _clean_content_filter_response(content: Any) -> str: | |
| """If content looks like a raw content-filter API response, replace with friendly text.""" | |
| if not isinstance(content, str): | |
| return str(content) | |
| if not content: |
| content_lower = content.lower() | ||
| if ( | ||
| '"content_filter"' in content_lower | ||
| or "content management policy" in content_lower | ||
| or '"finish_reason":"content_filter"' in content_lower | ||
| or "content_filter_results" in content_lower | ||
| ): |
There was a problem hiding this comment.
The heuristic that triggers content-filter cleaning is very broad (e.g., it matches any assistant text containing "content management policy" or `content_filter_results`). This can incorrectly replace legitimate assistant messages that merely discuss those topics. To avoid false positives, consider tightening detection to cases that look like a serialized API payload (e.g., the content starts with `{` or `[` and `json.loads` succeeds, or specific top-level keys such as `"choices"` or `"error"` are present).
Bug Fix: #5058447
When Azure OpenAI content filters block a response (HTTP 200, `finish_reason: content_filter`, `content: null`), the raw JSON API payload was surfaced verbatim as row-level results in the evaluation UI.
Changes