Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ dependencies = [
"python-dotenv>=1.0.1",
"pypdf>=6.6.2",
"reportlab>=4.4.4",
"python-docx>=1.1.0",
Comment on lines 51 to +54
Copy link

Copilot AI Feb 16, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Keep dependencies roughly sorted for maintainability; python-docx is currently placed after reportlab (R) but should be grouped with the other py* / python-* dependencies (near python-dotenv / pypdf).

Suggested change
"python-dotenv>=1.0.1",
"pypdf>=6.6.2",
"reportlab>=4.4.4",
"python-docx>=1.1.0",
"pypdf>=6.6.2",
"python-docx>=1.1.0",
"python-dotenv>=1.0.1",
"reportlab>=4.4.4",

Copilot uses AI. Check for mistakes.
"segno>=1.6.6",
"scipy>=1.15.3",
"SQLAlchemy>=2.0.41",
Expand Down
2 changes: 2 additions & 0 deletions pyrit/prompt_converter/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,7 @@
from pyrit.prompt_converter.unicode_sub_converter import UnicodeSubstitutionConverter
from pyrit.prompt_converter.url_converter import UrlConverter
from pyrit.prompt_converter.variation_converter import VariationConverter
from pyrit.prompt_converter.word_doc_converter import WordDoc_Converter
from pyrit.prompt_converter.zalgo_converter import ZalgoConverter
Comment on lines +100 to 101
Copy link

Copilot AI Feb 16, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

WordDoc_Converter export/import name likely needs to change if the converter class is renamed to match existing converter naming conventions (no underscores in class names).

Copilot uses AI. Check for mistakes.
from pyrit.prompt_converter.zero_width_converter import ZeroWidthConverter

Expand Down Expand Up @@ -177,6 +178,7 @@
"UnicodeSubstitutionConverter",
"UrlConverter",
"VariationConverter",
"WordDoc_Converter",
"VariationSelectorSmugglerConverter",
"WordIndexSelectionStrategy",
"WordKeywordSelectionStrategy",
Expand Down
258 changes: 258 additions & 0 deletions pyrit/prompt_converter/word_doc_converter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,258 @@
from __future__ import annotations

# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

from dataclasses import dataclass
import ast
import hashlib
from io import BytesIO
from pathlib import Path
from typing import Any, Dict, Optional

from docx import Document # type: ignore[import-untyped]

from pyrit.common.logger import logger
from pyrit.identifiers import ConverterIdentifier
from pyrit.models import PromptDataType, SeedPrompt, data_serializer_factory
from pyrit.models.data_type_serializer import DataTypeSerializer
from pyrit.prompt_converter.prompt_converter import ConverterResult, PromptConverter


@dataclass
class _WordDocInjectionConfig:
"""Configuration for how to inject content into a Word document."""

existing_docx: Optional[Path]
placeholder: str


class WordDoc_Converter(PromptConverter):
Copy link

Copilot AI Feb 16, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The class name WordDoc_Converter is inconsistent with the naming pattern used by other converters (e.g., PDFConverter, UrlConverter) and will surface in discovery APIs (e.g., get_converter_modalities). Consider renaming to WordDocConverter and updating the corresponding import/export in pyrit.prompt_converter.__init__.

Suggested change
class WordDoc_Converter(PromptConverter):
class WordDocConverter(PromptConverter):

Copilot uses AI. Check for mistakes.
"""Convert a text prompt into a Word (.docx) document.

This converter supports two main modes:

1. **New document generation**
If no existing document is provided, the converter creates a simple `.docx`
containing the rendered prompt content in a single paragraph.

2. **Placeholder-based injection into an existing document**
If an ``existing_docx`` is provided, the converter searches for a literal
placeholder string (for example ``{{INJECTION_PLACEHOLDER}}``) in the
document's paragraphs. When the placeholder is found fully inside a single
run, it is replaced with the rendered prompt content while preserving the
rest of the paragraph and its formatting.

.. important::
Placeholders must be fully contained within a single run. If a
placeholder spans multiple runs (for example due to mixed formatting),
this converter will not replace it. This limitation is intentional to
avoid collapsing mixed formatting or rewriting complex run structures.

Security note:
This converter does **not** render Jinja2 templates from arbitrary
``.docx`` content. Templating is handled via ``SeedPrompt`` (if provided),
and only the already-rendered text is injected into the document. This
avoids executing untrusted Jinja2 templates from document bodies.
"""
Comment on lines +31 to +57
Copy link

Copilot AI Feb 16, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

PR description mentions updated docs and added unit tests for this converter, but the changeset here only adds the converter, an __init__ export, and a dependency. If docs/tests are expected, they appear to be missing from this PR.

Copilot uses AI. Check for mistakes.

SUPPORTED_INPUT_TYPES = ("text",)
SUPPORTED_OUTPUT_TYPES = ("binary_path",)

def __init__(
    self,
    *,
    prompt_template: Optional[SeedPrompt] = None,
    existing_docx: Optional[Path] = None,
    placeholder: str = "{{INJECTION_PLACEHOLDER}}",
) -> None:
    """Set up the converter and validate its injection settings.

    Args:
        prompt_template: Optional ``SeedPrompt`` used to render the final
            content. When it is set, the ``prompt`` string passed to
            ``convert_async`` is parsed as a Python dict literal whose keys
            are handed to the template as rendering parameters.
        existing_docx: Optional path to an existing `.docx` file. When given,
            ``placeholder`` is looked up in the document's paragraphs and
            replaced with the rendered content; otherwise a brand-new
            document is produced.
        placeholder: Literal marker text to look for in the existing
            document. It has to sit entirely inside one run to be replaced.

    Raises:
        ValueError: If ``placeholder`` is empty.
        FileNotFoundError: If ``existing_docx`` is given but is not a file.
    """
    super().__init__()

    # Validation order matters: an empty placeholder is reported before a missing file.
    if not placeholder:
        raise ValueError("Placeholder must be a non-empty string.")

    docx_was_given = existing_docx is not None
    if docx_was_given and not existing_docx.is_file():
        raise FileNotFoundError(f"Word document not found at: {existing_docx}")

    injection_config = _WordDocInjectionConfig(existing_docx=existing_docx, placeholder=placeholder)
    self._prompt_template = prompt_template
    self._injection_config = injection_config

def _build_identifier(self) -> ConverterIdentifier:
"""Build identifier with template and document parameters."""
template_hash: Optional[str] = None
if self._prompt_template:
template_hash = hashlib.sha256(str(self._prompt_template.value).encode("utf-8")).hexdigest()[:16]

existing_docx_path = None
if self._injection_config.existing_docx:
existing_docx_path = str(self._injection_config.existing_docx)

return self._create_identifier(
converter_specific_params={
"prompt_template_hash": template_hash,
"existing_docx_path": existing_docx_path,
"placeholder": self._injection_config.placeholder,
}
)

async def convert_async(self, *, prompt: str, input_type: PromptDataType = "text") -> ConverterResult:
    """Turn the given prompt into a Word (.docx) file.

    The content is produced by ``_prepare_content``: rendered through the
    configured ``SeedPrompt`` template when one exists, otherwise the raw
    ``prompt`` string. That content is then either injected into the
    configured ``existing_docx`` (replacing the placeholder) or written as
    the single paragraph of a newly created document.

    Args:
        prompt: The prompt or dynamic data used to produce the content.
        input_type: The type of input data. Must be ``"text"``.

    Returns:
        ConverterResult: ``output_text`` holds the serializer's value for the
            saved `.docx`, with ``output_type="binary_path"``.

    Raises:
        ValueError: If the input type is not supported.
    """
    if not self.input_supported(input_type):
        raise ValueError("Input type not supported")

    content = self._prepare_content(prompt)

    # Pick the document builder once, then run it on the prepared content.
    if self._injection_config.existing_docx:
        build_document = self._inject_into_existing_docx
    else:
        build_document = self._generate_new_docx

    saved = await self._serialize_docx(build_document(content))
    return ConverterResult(output_text=saved.value, output_type="binary_path")
Comment on lines +120 to +155
Copy link

Copilot AI Feb 16, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This PR introduces a new converter with non-trivial behavior (new document generation plus single-run placeholder injection into an existing document), but there are no corresponding unit tests under tests/unit/converter/ (the repo has extensive converter test coverage, e.g., test_pdf_converter.py). Add unit tests for the main success and failure paths to prevent regressions.

Copilot uses AI. Check for mistakes.

def _prepare_content(self, prompt: str) -> str:
"""Prepare the content to be injected or written to the document.

If a ``SeedPrompt`` template is provided, the ``prompt`` is parsed (if
necessary) as a dictionary and used to render the template. Otherwise,
the raw prompt string is used.
"""
if self._prompt_template:
logger.debug(f"Preparing Word content with template: {self._prompt_template.value}")
try:
dynamic_data: Dict[str, Any]
if isinstance(prompt, str):
dynamic_data = ast.literal_eval(prompt)
else:
dynamic_data = prompt # type: ignore[assignment]

if not isinstance(dynamic_data, dict):
raise ValueError("Prompt must be a dictionary-compatible object after parsing.")

rendered_content = self._prompt_template.render_template_value(**dynamic_data)
logger.debug("Rendered Word template content successfully.")
return rendered_content
except (ValueError, SyntaxError, KeyError) as exc:
logger.error("Error rendering Word template content: %s", exc)
raise ValueError(f"Failed to render the prompt for Word document: {exc}") from exc

if isinstance(prompt, str):
logger.debug("No template provided for Word document. Using raw prompt content.")
return prompt

raise ValueError("Prompt must be a string when no template is provided.")

def _generate_new_docx(self, content: str) -> bytes:
    """Build a fresh `.docx` whose only added paragraph holds ``content`` and return its bytes."""
    fresh_doc = Document()
    fresh_doc.add_paragraph(content)

    # Save into an in-memory stream so nothing touches the file system here.
    with BytesIO() as stream:
        fresh_doc.save(stream)
        return stream.getvalue()

def _inject_into_existing_docx(self, content: str) -> bytes:
"""Inject content into an existing document by replacing the placeholder.

The placeholder must appear fully inside a single run; if it only exists
across multiple runs, it will not be replaced.
"""
assert self._injection_config.existing_docx is not None
Copy link

Copilot AI Feb 16, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Avoid using assert for required runtime checks. In optimized runs (python -O) assertions are stripped, which could allow existing_docx to be None here and lead to a less clear failure. Prefer an explicit check with a clear exception (or rely on the earlier constructor validation and remove the assertion entirely).

Suggested change
assert self._injection_config.existing_docx is not None
if self._injection_config.existing_docx is None:
raise ValueError(
"Cannot inject into an existing Word document because no 'existing_docx' "
"path was provided in the injection configuration."
)

Copilot uses AI. Check for mistakes.
document = Document(self._injection_config.existing_docx)

placeholder = self._injection_config.placeholder
replaced_any = False

for paragraph in document.paragraphs:
if placeholder not in paragraph.text:
continue

if self._replace_placeholder_in_paragraph(paragraph, placeholder, content):
replaced_any = True

if not replaced_any:
logger.warning(
"No placeholder '%s' found in document '%s' or placeholder spanned multiple runs.",
placeholder,
self._injection_config.existing_docx,
)

buffer = BytesIO()
document.save(buffer)
buffer.seek(0)
return buffer.getvalue()

@staticmethod
def _replace_placeholder_in_paragraph(paragraph: Any, placeholder: str, content: str) -> bool:
"""Replace a placeholder inside a single run of a paragraph.

This function searches all runs of a paragraph and performs a string
replacement in the first run whose text contains the placeholder. It
does not modify other runs, which helps preserve existing formatting.

Returns:
bool: True if a replacement was made, False otherwise.
"""
for run in paragraph.runs:
if placeholder in run.text:
run.text = run.text.replace(placeholder, content)
return True
return False

async def _serialize_docx(self, docx_bytes: bytes) -> DataTypeSerializer:
    """Save the document bytes through a ``binary_path`` data serializer and return that serializer."""
    docx_serializer = data_serializer_factory(
        category="prompt-memory-entries",
        data_type="binary_path",
        extension="docx",
    )
    await docx_serializer.save_data(docx_bytes)
    return docx_serializer
Comment on lines +247 to +257
Copy link

Copilot AI Feb 16, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

_serialize_docx is an async method but its name does not end with _async. Rename it (and its call site) to follow the project convention for async method naming.

Copilot generated this review using guidance from repository custom instructions.