diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..1f29c4f
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,41 @@
+# Developer task runner: every target below is a command, not a file, hence .PHONY.
+.PHONY: dev run download install lint format check test smoke clean help
+
+help: ## Show this help
+	@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf " \033[36m%-15s\033[0m %s\n", $$1, $$2}'
+
+install: ## Install dependencies (including dev)
+	uv sync
+
+download: ## Download model files
+	bash scripts/download.sh
+
+dev: ## Start dev server with auto-reload
+	uv run uvicorn slm_server.app:app --reload --host 0.0.0.0 --port 8000
+
+run: ## Start server via start.sh
+	bash scripts/start.sh
+
+lint: ## Run ruff linter
+	uv run ruff check slm_server/
+
+format: ## Run ruff formatter
+	uv run ruff format slm_server/
+
+# Runs lint first (prerequisite), then verifies formatting without rewriting files.
+check: lint ## Run linter + formatter check
+	uv run ruff format --check slm_server/
+
+smoke: ## Smoke-test the running server APIs with curl
+	bash scripts/smoke.sh
+
+test: ## Run tests with coverage
+	uv run pytest tests/ -v --cov=slm_server --cov-report=term-missing
+
+# -prune stops find from descending into a directory it is about to delete and
+# skips .git/.venv entirely, so no error suppression (2>/dev/null || true) is
+# needed and a genuine failure (e.g. permissions) still fails the target.
+clean: ## Remove caches and build artifacts
+	rm -rf .pytest_cache .ruff_cache .coverage htmlcov build dist *.egg-info
+	find . \( -name .git -o -name .venv \) -prune -o \
+		-type d -name __pycache__ -prune -exec rm -rf {} +
diff --git a/deploy/helm/values.yaml b/deploy/helm/values.yaml index 7f13693..16f83da 100644 --- a/deploy/helm/values.yaml +++ b/deploy/helm/values.yaml @@ -79,14 +79,35 @@ env: {} # Resource requests and limits for the container. # See https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ -# Tuned for Qwen3-0.6B-Q4_K_M (484 MB) + n_ctx=8192 KV cache (~448 MB) on 1-CPU / 1 GB VPS nodes.
+#
+# Memory budget breakdown (target node: 1-CPU / 2 GB VPS):
+#   Chat LLM  – Qwen3-0.6B-Q4_K_M.gguf        ~484 MB     (4-bit quantised)
+#   Embedding – all-MiniLM-L6-v2 quint8 ONNX  ~23 MB      (uint8 AVX2 quantised)
+#   KV cache  – n_ctx=2048                    ~50-80 MB
+#   Runtime   – Python, FastAPI, onnxruntime  ~50-100 MB
+#   -------------------------------------------------------
+#   Sum: ~610-690 MB   Request: 550 Mi (~577 MB)   Hard limit: 1 Gi
+#
+# Why these models:
+#   - Qwen3-0.6B-Q4_K_M is the smallest instruction-tuned LLM that still
+#     supports function calling (chatml format) at usable quality.
+#   - all-MiniLM-L6-v2 (384-dim, 6-layer) is purpose-trained for sentence
+#     embeddings via mean pooling, ranking well on STS benchmarks for its
+#     size. The quint8 AVX2 variant keeps the file at 23 MB vs 90 MB fp32.
+#
+# Why the limit is reasonable:
+#   - The worker node (active-nerd-2) has 2 GiB total RAM shared with the OS
+#     and other pods. The 550 Mi request leaves them headroom; the 1 Gi hard
+#     limit makes a KV-cache burst OOM-kill this pod instead of starving the node.
+#   - MAX_CONCURRENCY=1 ensures only one inference runs at a time, so peak
+#     memory is predictable (no concurrent KV-cache allocations).
resources: limits: - cpu: 1 + cpu: 900m memory: 1Gi requests: - cpu: 200m - memory: 600Mi + cpu: 50m + memory: 550Mi # Readiness and liveness probes configuration probes: diff --git a/pyproject.toml b/pyproject.toml index 7c93f92..0aae74b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,6 +18,8 @@ dependencies = [ "prometheus-client>=0.22.1", "prometheus-fastapi-instrumentator>=7.1.0", "psutil>=6.1.0", + "onnxruntime>=1.17.0", + "tokenizers>=0.21.0", ] [tool.ruff.lint]
diff --git a/scripts/download.sh b/scripts/download.sh
index 02339ed..d1d4a8f 100755
--- a/scripts/download.sh
+++ b/scripts/download.sh
@@ -36,4 +36,43 @@ for file in "${FILES_TO_DOWNLOAD[@]}"; do
     fi
 done
 
+# --- Embedding model: all-MiniLM-L6-v2 (ONNX, quantized UINT8 for AVX2) ---
+EMBEDDING_REPO_URL="https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2"
+EMBEDDING_MODEL_DIR="$MODEL_DIR/all-MiniLM-L6-v2"
+
+mkdir -p "$EMBEDDING_MODEL_DIR/onnx"
+
+# Paths are relative to the repository root and mirrored under EMBEDDING_MODEL_DIR.
+EMBEDDING_FILES=(
+    "onnx/model_quint8_avx2.onnx"
+    "tokenizer.json"
+)
+
+echo "Downloading all-MiniLM-L6-v2 ONNX embedding model..."
+
+for file in "${EMBEDDING_FILES[@]}"; do
+    dest="$EMBEDDING_MODEL_DIR/$file"
+    url="$EMBEDDING_REPO_URL/resolve/main/$file"
+    if [ -s "$dest" ]; then
+        echo "$file already exists, skipping download."
+        continue
+    fi
+
+    echo "Downloading $file..."
+    # Fetch into a temporary name and rename only on success: wget -O / curl -o
+    # create the output file even when the transfer fails, and a leftover
+    # partial file would be skipped as "already exists" on the next run.
+    tmp="$dest.part"
+    if ! wget -O "$tmp" "$url"; then
+        echo "Failed to download $file with wget, trying curl..."
+        # -f makes curl fail on HTTP errors instead of saving the error page.
+        if ! curl -fL -o "$tmp" "$url"; then
+            rm -f "$tmp"
+            echo "Failed to download $file" >&2
+            exit 1
+        fi
+    fi
+    mv -f "$tmp" "$dest"
+done
+
 echo "Download process complete!
Files are in $MODEL_DIR"
diff --git a/scripts/smoke.sh b/scripts/smoke.sh
new file mode 100755
index 0000000..23094fc
--- /dev/null
+++ b/scripts/smoke.sh
@@ -0,0 +1,58 @@
+#!/bin/bash
+#
+# Smoke-test a running server: call each public endpoint once and stop at the
+# first failing request. Override the target with BASE_URL=http://host:port.
+
+# pipefail makes `curl ... | python3 -m json.tool` fail when curl itself fails,
+# instead of depending on json.tool rejecting the resulting empty input.
+set -eo pipefail
+
+BASE_URL="${BASE_URL:-http://localhost:8000}"
+
+# POST JSON body $2 to API path $1 and print the response body.
+# Any further arguments are passed through to curl as extra options.
+post_json() {
+    local path="$1" body="$2"
+    shift 2
+    curl -sf "$@" "$BASE_URL$path" \
+        -H "Content-Type: application/json" \
+        -d "$body"
+}
+
+echo "=== Health check ==="
+curl -sf "$BASE_URL/health"
+echo
+
+echo "=== List models ==="
+curl -sf "$BASE_URL/api/v1/models" | python3 -m json.tool
+echo
+
+echo "=== Chat completion ==="
+post_json /api/v1/chat/completions '{
+    "messages": [{"role": "user", "content": "Say hello in one sentence."}],
+    "max_tokens": 64
+  }' | python3 -m json.tool
+echo
+
+echo "=== Chat completion (streaming) ==="
+# -N turns off curl's output buffering so streamed chunks appear as they arrive.
+post_json /api/v1/chat/completions '{
+    "messages": [{"role": "user", "content": "What is 2+2?"}],
+    "max_tokens": 32,
+    "stream": true
+  }' -N
+echo
+
+echo "=== Embeddings (single) ==="
+post_json /api/v1/embeddings '{
+    "input": "Hello world"
+  }' | python3 -m json.tool
+echo
+
+echo "=== Embeddings (batch) ==="
+post_json /api/v1/embeddings '{
+    "input": ["The cat sat on the mat.", "A dog played in the park."]
+  }' | python3 -m json.tool
+echo
+
+echo "All smoke tests passed."
diff --git a/slm_server/app.py b/slm_server/app.py index d35e337..c3ebb29 100644 --- a/slm_server/app.py +++ b/slm_server/app.py @@ -10,11 +10,14 @@ from llama_cpp import CreateChatCompletionStreamResponse, Llama from slm_server.config import Settings, get_settings +from slm_server.embedding import OnnxEmbeddingModel from slm_server.logging import setup_logging from slm_server.metrics import setup_metrics from slm_server.model import ( ChatCompletionRequest, + EmbeddingData, EmbeddingRequest, + EmbeddingResponse, ModelInfo, ModelListResponse, ) @@ -62,13 +65,21 @@ def get_llm(settings: Annotated[Settings, Depends(get_settings)]) -> Llama: seed=settings.seed, chat_format=CHAT_FORMAT, logits_all=False, - embedding=True, - use_mlock=True, # Use mlock to prevent memory swapping - use_mmap=True, # Use memory-mapped files for faster access + embedding=False, + use_mlock=True, + use_mmap=True, ) return get_llm._instance +def get_embedding_model( + settings: Annotated[Settings, Depends(get_settings)], +) -> OnnxEmbeddingModel: + if not hasattr(get_embedding_model, "_instance"): + get_embedding_model._instance = OnnxEmbeddingModel(settings.embedding) + return get_embedding_model._instance + + def get_app() -> FastAPI: # Get settings when creating app. 
settings = get_settings() @@ -176,41 +187,53 @@ async def create_chat_completion( @app.post("/api/v1/embeddings") async def create_embeddings( req: EmbeddingRequest, - llm: Annotated[Llama, Depends(get_llm)], + emb_model: Annotated[OnnxEmbeddingModel, Depends(get_embedding_model)], _: Annotated[None, Depends(lock_llm_semaphor)], __: Annotated[None, Depends(raise_as_http_exception)], ): - """Create embeddings for the given input text(s).""" + """Create embeddings using the dedicated ONNX embedding model.""" with slm_embedding_span(req) as span: - # Use llama-cpp-python's create_embedding method directly - embedding_result = await asyncio.to_thread( - llm.create_embedding, - **req.model_dump(), + inputs = req.input if isinstance(req.input, list) else [req.input] + vectors = await asyncio.to_thread(emb_model.encode, inputs, True) + result = EmbeddingResponse( + data=[ + EmbeddingData(embedding=vec.tolist(), index=i) + for i, vec in enumerate(vectors) + ], + model=emb_model.model_id, ) - # Convert llama-cpp response using model_validate like chat completion - set_attribute_response_embedding(span, embedding_result) - return embedding_result + set_attribute_response_embedding(span, result) + return result @app.get("/api/v1/models", response_model=ModelListResponse) async def list_models( settings: Annotated[Settings, Depends(get_settings)], ) -> ModelListResponse: - """List available models (OpenAI-compatible). 
Returns the single loaded model.""" - model_id = Path(settings.model_path).stem + """List available models (OpenAI-compatible).""" + chat_model_id = Path(settings.model_path).stem try: - created = int(Path(settings.model_path).stat().st_mtime) + chat_created = int(Path(settings.model_path).stat().st_mtime) except (OSError, ValueError): - created = 0 + chat_created = 0 + + try: + emb_created = int(Path(settings.embedding.onnx_path).stat().st_mtime) + except (OSError, ValueError): + emb_created = 0 + return ModelListResponse( - object="list", data=[ ModelInfo( - id=model_id, - object="model", - created=created, + id=chat_model_id, + created=chat_created, owned_by=settings.model_owner, - ) + ), + ModelInfo( + id=settings.embedding.model_id, + created=emb_created, + owned_by="sentence-transformers", + ), ], ) diff --git a/slm_server/config.py b/slm_server/config.py index 7948da3..f0ad876 100644 --- a/slm_server/config.py +++ b/slm_server/config.py @@ -16,6 +16,34 @@ MODEL_PATH_DEFAULT = str(MODELS_DIR / "Qwen3-0.6B-Q4_K_M.gguf") MODEL_OWNER_DEFAULT = "second-state" +EMBEDDING_TOKENIZER_PATH_DEFAULT = str( + MODELS_DIR / "all-MiniLM-L6-v2" / "tokenizer.json" +) +EMBEDDING_ONNX_PATH_DEFAULT = str( + MODELS_DIR / "all-MiniLM-L6-v2" / "onnx" / "model_quint8_avx2.onnx" +) + + +class EmbeddingSettings(BaseModel): + model_id: str = Field( + "all-MiniLM-L6-v2", + description="Model identifier returned in API responses.", + ) + tokenizer_path: str = Field( + EMBEDDING_TOKENIZER_PATH_DEFAULT, + description="Full path to the tokenizer.json file.", + ) + onnx_path: str = Field( + EMBEDDING_ONNX_PATH_DEFAULT, + description="Full path to the ONNX model file.", + ) + max_length: int = Field( + 256, + description="Maximum token sequence length for the tokenizer. 
" + "all-MiniLM-L6-v2 was trained with 256; increase only if " + "swapping to a model that supports longer sequences.", + ) + class LoggingSettings(BaseModel): verbose: bool = Field(True, description="If logging to stdout by cpp llama") @@ -75,6 +103,7 @@ class Settings(BaseSettings): 1, description="Seconds to wait if undergoing another inference." ) + embedding: EmbeddingSettings = Field(default_factory=EmbeddingSettings) logging: LoggingSettings = Field(default_factory=LoggingSettings) metrics: MetricsSettings = Field(default_factory=MetricsSettings) tracing: TraceSettings = Field(default_factory=TraceSettings)
diff --git a/slm_server/embedding.py b/slm_server/embedding.py
new file mode 100644
index 0000000..fd995ab
--- /dev/null
+++ b/slm_server/embedding.py
@@ -0,0 +1,131 @@
+from __future__ import annotations
+
+import logging
+import time
+from typing import TYPE_CHECKING
+
+import numpy as np
+from onnxruntime import InferenceSession
+from tokenizers import Tokenizer
+
+if TYPE_CHECKING:
+    from slm_server.config import EmbeddingSettings
+
+logger = logging.getLogger(__name__)
+
+ONNX_PROVIDERS: list[str] = ["CPUExecutionProvider"]
+KEY_INPUT_IDS: str = "input_ids"
+KEY_ATTENTION_MASK: str = "attention_mask"
+KEY_TOKEN_TYPE_IDS: str = "token_type_ids"
+
+# Hidden size of all-MiniLM-L6-v2; used only when the ONNX graph does not
+# declare a static size for the last output axis.
+DEFAULT_EMBEDDING_DIM: int = 384
+# Texts per forward pass. The token-level output is (batch, seq_len, hidden);
+# with 4-byte floats assumed, 2048 inputs x 256 tokens x 384 dims in one pass
+# would need ~805 MB for that tensor alone.
+DEFAULT_BATCH_SIZE: int = 32
+
+
+class OnnxEmbeddingModel:
+    """Lightweight ONNX-based sentence embedding model.
+
+    Replicates the sentence-transformers all-MiniLM-L6-v2 pipeline:
+    tokenize -> BERT forward pass -> mean pooling -> L2 normalize.
+    Uses onnxruntime + tokenizers directly (no PyTorch dependency).
+    """
+
+    def __init__(self, settings: EmbeddingSettings):
+        """Load the tokenizer and the ONNX session named by ``settings``.
+
+        Args:
+            settings: Supplies ``model_id``, ``tokenizer_path``, ``onnx_path``
+                and ``max_length`` (the truncation length).
+        """
+        start = time.monotonic()
+
+        self.model_id = settings.model_id
+
+        self.tokenizer = Tokenizer.from_file(settings.tokenizer_path)
+        self.tokenizer.enable_truncation(max_length=settings.max_length)
+        self.tokenizer.enable_padding(length=None)
+
+        self.session = InferenceSession(settings.onnx_path, providers=ONNX_PROVIDERS)
+
+        # The last axis of the first output is the hidden size when the graph
+        # declares it as an int; otherwise fall back to the MiniLM default.
+        out_shape = self.session.get_outputs()[0].shape
+        hidden = out_shape[-1] if out_shape else None
+        self.embedding_dim: int = (
+            hidden if isinstance(hidden, int) else DEFAULT_EMBEDDING_DIM
+        )
+
+        elapsed_ms = (time.monotonic() - start) * 1000
+        logger.info(
+            "Loaded embedding model %s in %.1fms", settings.onnx_path, elapsed_ms
+        )
+
+    def encode(
+        self,
+        texts: list[str],
+        normalize: bool = True,
+        batch_size: int = DEFAULT_BATCH_SIZE,
+    ) -> np.ndarray:
+        """Encode texts into dense sentence vectors.
+
+        Args:
+            texts: Strings to embed; may be empty.
+            normalize: If true, scale every vector to unit L2 norm.
+            batch_size: Maximum number of texts per ONNX forward pass, which
+                bounds peak memory regardless of ``len(texts)``.
+
+        Returns:
+            Array with one row per input, in input order, and
+            ``embedding_dim`` columns (384 for all-MiniLM-L6-v2).
+
+        Raises:
+            ValueError: If ``batch_size`` is smaller than 1.
+        """
+        if batch_size < 1:
+            raise ValueError("batch_size must be >= 1")
+        if not texts:
+            return np.empty((0, self.embedding_dim), dtype=np.float32)
+
+        embeddings = np.concatenate(
+            [
+                self._encode_batch(texts[start : start + batch_size])
+                for start in range(0, len(texts), batch_size)
+            ],
+            axis=0,
+        )
+
+        if normalize:
+            norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
+            embeddings = embeddings / np.clip(norms, a_min=1e-9, a_max=None)
+
+        return embeddings
+
+    def _encode_batch(self, texts: list[str]) -> np.ndarray:
+        """Tokenize one batch, run the model and mean-pool its token vectors."""
+        encodings = self.tokenizer.encode_batch(texts)
+
+        input_ids = np.array([e.ids for e in encodings], dtype=np.int64)
+        attention_mask = np.array([e.attention_mask for e in encodings], dtype=np.int64)
+        token_type_ids = np.zeros_like(input_ids)
+
+        outputs = self.session.run(
+            None,
+            {
+                KEY_INPUT_IDS: input_ids,
+                KEY_ATTENTION_MASK: attention_mask,
+                KEY_TOKEN_TYPE_IDS: token_type_ids,
+            },
+        )
+
+        token_embeddings = outputs[0]  # (batch, seq_len, hidden_dim)
+
+        # Mean pooling: average token embeddings weighted by attention mask
+        mask_expanded = attention_mask[:, :, np.newaxis].astype(np.float32)
+        sum_embeddings = np.sum(token_embeddings * mask_expanded, axis=1)
+        sum_mask = np.clip(mask_expanded.sum(axis=1), a_min=1e-9, a_max=None)
+        return sum_embeddings / sum_mask
diff --git a/slm_server/model.py b/slm_server/model.py index b2707d6..9168c37 100644 --- a/slm_server/model.py +++ b/slm_server/model.py @@ -6,7 +6,7 @@ ChatCompletionTool, ChatCompletionToolChoiceOption, ) -from pydantic import BaseModel, Field +from pydantic import BaseModel, Field, conlist class
ChatCompletionRequest(BaseModel): @@ -83,11 +83,25 @@ class ChatCompletionRequest(BaseModel): # Embeddings API Models +# OpenAI allows up to 2048 inputs per request. +MAX_EMBEDDING_INPUTS = 2048 + + class EmbeddingRequest(BaseModel): - input: str | list[str] - model: str | None = Field( - default=None, description="Model name, not important for our server" - ) + input: str | conlist(str, max_length=MAX_EMBEDDING_INPUTS) + model: str | None = Field(default=None, description="Model identifier") + + +class EmbeddingData(BaseModel): + object: str = Field(default="embedding") + embedding: list[float] + index: int + + +class EmbeddingResponse(BaseModel): + object: str = Field(default="list") + data: list[EmbeddingData] + model: str # OpenAI-compatible list models API diff --git a/slm_server/utils/constants.py b/slm_server/utils/constants.py index 6506887..14247c1 100644 --- a/slm_server/utils/constants.py +++ b/slm_server/utils/constants.py @@ -1,5 +1,6 @@ # Constants for span naming and attributes MODEL_NAME = "llama-cpp" +EMBEDDING_MODEL_NAME = "onnx-minilm" SPAN_PREFIX = "slm" # Span names diff --git a/slm_server/utils/spans.py b/slm_server/utils/spans.py index 5449a48..9cbf34e 100644 --- a/slm_server/utils/spans.py +++ b/slm_server/utils/spans.py @@ -7,9 +7,7 @@ from llama_cpp.llama_types import ( CreateChatCompletionResponse as ChatCompletionResponse, ) -from llama_cpp.llama_types import ( - CreateEmbeddingResponse as EmbeddingResponse, -) +from slm_server.model import EmbeddingResponse from opentelemetry import trace from opentelemetry.sdk.trace import Span from opentelemetry.trace import Status, StatusCode @@ -34,6 +32,7 @@ ATTR_STREAMING, ATTR_TEMPERATURE, ATTR_TOTAL_TOKENS, + EMBEDDING_MODEL_NAME, EVENT_ATTR_CHUNK_CONTENT_SIZE, EVENT_ATTR_CHUNK_SIZE, EVENT_CHUNK_GENERATED, @@ -133,24 +132,10 @@ def set_atrribute_response_stream( span.set_attribute(ATTR_CHUNK_COUNT, current_chunk_count + 1) -def set_attribute_response_embedding(span: Span, response: 
EmbeddingResponse | dict): +def set_attribute_response_embedding(span: Span, response: EmbeddingResponse): """Set embedding response attributes automatically.""" - if isinstance(response, dict): - # Handle dict response - usage = response.get("usage") - if usage: - span.set_attribute(ATTR_PROMPT_TOKENS, usage.get("prompt_tokens", 0)) - span.set_attribute(ATTR_TOTAL_TOKENS, usage.get("total_tokens", 0)) - data = response.get("data") - if data: - span.set_attribute(ATTR_OUTPUT_COUNT, len(data)) - else: - # Handle object response (original code) - if response.usage: - span.set_attribute(ATTR_PROMPT_TOKENS, response.usage.prompt_tokens) - span.set_attribute(ATTR_TOTAL_TOKENS, response.usage.total_tokens) - if response.data: - span.set_attribute(ATTR_OUTPUT_COUNT, len(response.data)) + if response.data: + span.set_attribute(ATTR_OUTPUT_COUNT, len(response.data)) def set_attribute_cancelled(span: Span, reason: str = "client disconnected"): @@ -204,7 +189,7 @@ def slm_span(req: ChatCompletionRequest, is_streaming: bool): @contextmanager -def slm_embedding_span(req: EmbeddingRequest): +def slm_embedding_span(req: EmbeddingRequest, model_name: str = EMBEDDING_MODEL_NAME): """Create SLM span for embedding requests.""" span_name = SPAN_EMBEDDING @@ -216,7 +201,7 @@ def slm_embedding_span(req: EmbeddingRequest): input_content_length = len(req.input) initial_attributes = { - ATTR_MODEL: MODEL_NAME, + ATTR_MODEL: model_name, ATTR_INPUT_COUNT: input_count, ATTR_INPUT_CONTENT_LENGTH: input_content_length, } diff --git a/tests/test_app.py b/tests/test_app.py index d06d3a3..91ab31a 100644 --- a/tests/test_app.py +++ b/tests/test_app.py @@ -8,12 +8,24 @@ from opentelemetry.sdk.trace.export.in_memory_span_exporter import InMemorySpanExporter from opentelemetry.trace import set_tracer_provider -from slm_server.app import DETAIL_SEM_TIMEOUT, app, get_llm, get_settings +import numpy as np + +from slm_server.app import ( + DETAIL_SEM_TIMEOUT, + app, + get_embedding_model, + get_llm, + 
get_settings, +) from slm_server.config import Settings # Create a mock Llama instance mock_llama = MagicMock() +# Create a mock embedding model +mock_embedding_model = MagicMock() +mock_embedding_model.model_id = "all-MiniLM-L6-v2" + # Set up OpenTelemetry for tests tracer_provider = TracerProvider() memory_exporter = InMemorySpanExporter() @@ -26,7 +38,12 @@ def override_get_llm(): return mock_llama +def override_get_embedding_model(): + return mock_embedding_model + + app.dependency_overrides[get_llm] = override_get_llm +app.dependency_overrides[get_embedding_model] = override_get_embedding_model # Use TestClient with lifespan context to ensure metrics endpoint is created client = TestClient(app) @@ -36,8 +53,9 @@ def override_get_llm(): def reset_mock(): """Reset the mock before each test.""" mock_llama.reset_mock() - mock_llama.create_chat_completion.side_effect = None # Clear any side effects - mock_llama.create_embedding.side_effect = None # Clear any side effects for embedding + mock_llama.create_chat_completion.side_effect = None + mock_embedding_model.reset_mock() + mock_embedding_model.encode.side_effect = None # Patch the tracer in utils.py to use our test tracer local_tracer = tracer_provider.get_tracer(__name__) @@ -489,21 +507,7 @@ def test_streaming_call_with_empty_chunks(): def test_embeddings_endpoint_string_input(): """Tests the embeddings endpoint with string input.""" - mock_llama.create_embedding.return_value = { - "object": "list", - "data": [ - { - "object": "embedding", - "embedding": [0.1, -0.2, 0.3, -0.4, 0.5], - "index": 0 - } - ], - "model": "test-model", - "usage": { - "prompt_tokens": 5, - "total_tokens": 5 - } - } + mock_embedding_model.encode.return_value = np.array([[0.1, -0.2, 0.3, -0.4, 0.5]]) response = client.post( "/api/v1/embeddings", @@ -518,39 +522,17 @@ def test_embeddings_endpoint_string_input(): assert response_data["data"][0]["object"] == "embedding" assert response_data["data"][0]["embedding"] == [0.1, -0.2, 0.3, -0.4, 
0.5] assert response_data["data"][0]["index"] == 0 - assert response_data["model"] == "test-model" - assert response_data["usage"]["prompt_tokens"] == 5 - assert response_data["usage"]["total_tokens"] == 5 - - # Verify the LLM was called correctly - mock_llama.create_embedding.assert_called_once_with( - input="Hello world", - model="test-model" - ) + assert response_data["model"] == "all-MiniLM-L6-v2" + + mock_embedding_model.encode.assert_called_once_with(["Hello world"], True) def test_embeddings_endpoint_list_input(): """Tests the embeddings endpoint with list input.""" - mock_llama.create_embedding.return_value = { - "object": "list", - "data": [ - { - "object": "embedding", - "embedding": [0.1, 0.2, 0.3], - "index": 0 - }, - { - "object": "embedding", - "embedding": [0.4, 0.5, 0.6], - "index": 1 - } - ], - "model": "test-model", - "usage": { - "prompt_tokens": 10, - "total_tokens": 10 - } - } + mock_embedding_model.encode.return_value = np.array([ + [0.1, 0.2, 0.3], + [0.4, 0.5, 0.6], + ]) response = client.post( "/api/v1/embeddings", @@ -564,32 +546,15 @@ def test_embeddings_endpoint_list_input(): assert len(response_data["data"]) == 2 assert response_data["data"][0]["embedding"] == [0.1, 0.2, 0.3] assert response_data["data"][1]["embedding"] == [0.4, 0.5, 0.6] - assert response_data["usage"]["prompt_tokens"] == 10 - # Verify the LLM was called correctly - mock_llama.create_embedding.assert_called_once_with( - input=["First text", "Second text"], - model="test-model" + mock_embedding_model.encode.assert_called_once_with( + ["First text", "Second text"], True ) def test_embeddings_endpoint_default_model(): - """Tests the embeddings endpoint with default model.""" - mock_llama.create_embedding.return_value = { - "object": "list", - "data": [ - { - "object": "embedding", - "embedding": [0.1, 0.2], - "index": 0 - } - ], - "model": "Qwen3-0.6B-GGUF", - "usage": { - "prompt_tokens": 3, - "total_tokens": 3 - } - } + """Tests the embeddings endpoint with default 
model (from settings).""" + mock_embedding_model.encode.return_value = np.array([[0.1, 0.2]]) response = client.post( "/api/v1/embeddings", @@ -599,18 +564,13 @@ def test_embeddings_endpoint_default_model(): assert response.status_code == 200 response_data = response.json() - assert response_data["model"] == "Qwen3-0.6B-GGUF" - - # Verify default model was used - mock_llama.create_embedding.assert_called_once_with( - input="Test", - model=None # Default model is None - ) + # When no model is specified, the settings model_id is used + assert response_data["model"] == "all-MiniLM-L6-v2" def test_embeddings_endpoint_error(): """Tests the embeddings endpoint error handling.""" - mock_llama.create_embedding.side_effect = Exception("Embedding failed") + mock_embedding_model.encode.side_effect = Exception("Embedding failed") response = client.post( "/api/v1/embeddings", @@ -623,21 +583,7 @@ def test_embeddings_endpoint_error(): def test_embeddings_endpoint_empty_input(): """Tests the embeddings endpoint with empty input.""" - mock_llama.create_embedding.return_value = { - "object": "list", - "data": [ - { - "object": "embedding", - "embedding": [0.0, 0.0], - "index": 0 - } - ], - "model": "test-model", - "usage": { - "prompt_tokens": 0, - "total_tokens": 0 - } - } + mock_embedding_model.encode.return_value = np.array([[0.0, 0.0]]) response = client.post( "/api/v1/embeddings", @@ -648,32 +594,27 @@ def test_embeddings_endpoint_empty_input(): response_data = response.json() assert len(response_data["data"]) == 1 - assert response_data["usage"]["prompt_tokens"] == 0 - # Verify empty string was passed through - mock_llama.create_embedding.assert_called_once_with( - input="", - model="test-model" + mock_embedding_model.encode.assert_called_once_with([""], True) + + +def test_embeddings_endpoint_rejects_too_many_inputs(): + """Tests the embeddings endpoint rejects input list exceeding max items.""" + from slm_server.model import MAX_EMBEDDING_INPUTS + + too_many = ["text"] * 
(MAX_EMBEDDING_INPUTS + 1) + response = client.post( + "/api/v1/embeddings", + json={"input": too_many, "model": "test-model"}, ) + assert response.status_code == 422 def test_embeddings_endpoint_with_tracing_integration(): """Integration test for embeddings endpoint with complete tracing flow.""" - mock_llama.create_embedding.return_value = { - "object": "list", - "data": [ - { - "object": "embedding", - "embedding": [0.1, -0.2, 0.3, -0.4, 0.5, 0.6, -0.7, 0.8], - "index": 0 - } - ], - "model": "test-model", - "usage": { - "prompt_tokens": 8, - "total_tokens": 8 - } - } + mock_embedding_model.encode.return_value = np.array( + [[0.1, -0.2, 0.3, -0.4, 0.5, 0.6, -0.7, 0.8]] + ) response = client.post( "/api/v1/embeddings", @@ -686,19 +627,12 @@ def test_embeddings_endpoint_with_tracing_integration(): assert response.status_code == 200 response_data = response.json() - # Verify response structure assert response_data["object"] == "list" assert len(response_data["data"]) == 1 assert len(response_data["data"][0]["embedding"]) == 8 - assert response_data["usage"]["prompt_tokens"] == 8 - assert response_data["usage"]["total_tokens"] == 8 - - # Verify the LLM was called with correct parameters - mock_llama.create_embedding.assert_called_once() - call_args = mock_llama.create_embedding.call_args + assert response_data["model"] == "all-MiniLM-L6-v2" - assert call_args[1]["input"] == "This is a test sentence for creating embeddings." 
- assert call_args[1]["model"] == "test-model" + mock_embedding_model.encode.assert_called_once() def test_request_validation_and_defaults(): @@ -734,18 +668,24 @@ def test_request_validation_and_defaults(): def test_list_models_structure(): - """GET /api/v1/models returns OpenAI-compatible list with one model.""" + """GET /api/v1/models returns OpenAI-compatible list with chat and embedding models.""" response = client.get("/api/v1/models") assert response.status_code == 200 data = response.json() assert data["object"] == "list" assert isinstance(data["data"], list) - assert len(data["data"]) == 1 - model = data["data"][0] - assert model["object"] == "model" - assert "id" in model and isinstance(model["id"], str) - assert "created" in model and isinstance(model["created"], int) - assert model["owned_by"] == "second-state" + assert len(data["data"]) == 2 + + chat_model = data["data"][0] + assert chat_model["object"] == "model" + assert "id" in chat_model and isinstance(chat_model["id"], str) + assert "created" in chat_model and isinstance(chat_model["created"], int) + assert chat_model["owned_by"] == "second-state" + + emb_model = data["data"][1] + assert emb_model["object"] == "model" + assert emb_model["id"] == "all-MiniLM-L6-v2" + assert emb_model["owned_by"] == "sentence-transformers" def test_list_models_with_overridden_settings(): @@ -764,12 +704,12 @@ def override_settings(): assert response.status_code == 200 data = response.json() assert data["object"] == "list" - assert len(data["data"]) == 1 - model = data["data"][0] - assert model["id"] == "SomeModel" - assert model["object"] == "model" - assert model["owned_by"] == "custom-org" - assert model["created"] == 0 # file does not exist + assert len(data["data"]) == 2 + chat_model = data["data"][0] + assert chat_model["id"] == "SomeModel" + assert chat_model["object"] == "model" + assert chat_model["owned_by"] == "custom-org" + assert chat_model["created"] == 0 # file does not exist finally: 
app.dependency_overrides.pop(get_settings, None) @@ -788,10 +728,10 @@ def override_settings(): try: response = client.get("/api/v1/models") assert response.status_code == 200 - model = response.json()["data"][0] - assert model["id"] == "RealModel" - assert model["created"] > 0 - assert model["created"] == int(model_file.stat().st_mtime) + chat_model = response.json()["data"][0] + assert chat_model["id"] == "RealModel" + assert chat_model["created"] > 0 + assert chat_model["created"] == int(model_file.stat().st_mtime) finally: app.dependency_overrides.pop(get_settings, None) diff --git a/tests/test_embedding.py b/tests/test_embedding.py index 88cec97..6f66fdd 100644 --- a/tests/test_embedding.py +++ b/tests/test_embedding.py @@ -8,19 +8,13 @@ from opentelemetry.sdk.trace.export.in_memory_span_exporter import InMemorySpanExporter from opentelemetry.trace import StatusCode -from llama_cpp.llama_types import ( - CreateEmbeddingResponse as EmbeddingResponse, - EmbeddingData, - EmbeddingUsage, -) -from slm_server.model import EmbeddingRequest +from slm_server.model import EmbeddingData, EmbeddingRequest, EmbeddingResponse from slm_server.utils import ( ATTR_INPUT_COUNT, ATTR_INPUT_CONTENT_LENGTH, ATTR_MODEL, ATTR_OUTPUT_COUNT, - ATTR_PROMPT_TOKENS, - ATTR_TOTAL_TOKENS, + EMBEDDING_MODEL_NAME, SPAN_EMBEDDING, set_attribute_response_embedding, slm_embedding_span, @@ -30,249 +24,162 @@ @pytest.fixture def setup_tracing(): """Set up tracing with in-memory span exporter for testing.""" - # Create a tracer provider with in-memory exporter tracer_provider = TracerProvider() memory_exporter = InMemorySpanExporter() span_processor = SimpleSpanProcessor(memory_exporter) tracer_provider.add_span_processor(span_processor) - - # Don't override global tracer provider - use local one + local_tracer = tracer_provider.get_tracer(__name__) - + yield memory_exporter, local_tracer - - # Clean up + memory_exporter.clear() class TestSetAttributeResponseEmbedding: """Tests for 
set_attribute_response_embedding function.""" - - def test_sets_embedding_attributes_correctly(self): - """Test that embedding response attributes are set correctly on span.""" + + def test_sets_output_count(self): mock_span = Mock() - - # Create embedding response with usage and data response = EmbeddingResponse( - object="list", data=[ - EmbeddingData( - object="embedding", - embedding=[0.1, 0.2, -0.3, 0.4, -0.5], - index=0 - ), - EmbeddingData( - object="embedding", - embedding=[0.6, -0.7, 0.8, -0.9, 1.0], - index=1 - ) + EmbeddingData(embedding=[0.1, 0.2], index=0), + EmbeddingData(embedding=[0.3, 0.4], index=1), ], model="test-model", - usage=EmbeddingUsage(prompt_tokens=15, total_tokens=15) ) - set_attribute_response_embedding(mock_span, response) - - # Verify attributes were set - mock_span.set_attribute.assert_any_call(ATTR_PROMPT_TOKENS, 15) - mock_span.set_attribute.assert_any_call(ATTR_TOTAL_TOKENS, 15) - mock_span.set_attribute.assert_any_call(ATTR_OUTPUT_COUNT, 2) # 2 embeddings - + mock_span.set_attribute.assert_any_call(ATTR_OUTPUT_COUNT, 2) + def test_handles_single_embedding(self): - """Test handling of single embedding response.""" mock_span = Mock() - response = EmbeddingResponse( - object="list", - data=[ - EmbeddingData( - object="embedding", - embedding=[0.1, 0.2, 0.3], - index=0 - ) - ], + data=[EmbeddingData(embedding=[0.1, 0.2, 0.3], index=0)], model="test-model", - usage=EmbeddingUsage(prompt_tokens=5, total_tokens=5) ) - set_attribute_response_embedding(mock_span, response) - - # Should set output count to 1 mock_span.set_attribute.assert_any_call(ATTR_OUTPUT_COUNT, 1) - + def test_handles_empty_data(self): - """Test handling of empty embedding data.""" mock_span = Mock() - - response = EmbeddingResponse( - object="list", - data=[], - model="test-model", - usage=EmbeddingUsage(prompt_tokens=0, total_tokens=0) - ) - + response = EmbeddingResponse(data=[], model="test-model") set_attribute_response_embedding(mock_span, response) - - # 
Should still set usage attributes but not output count since data is empty - mock_span.set_attribute.assert_any_call(ATTR_PROMPT_TOKENS, 0) - mock_span.set_attribute.assert_any_call(ATTR_TOTAL_TOKENS, 0) - # Verify output count was NOT set since data is empty - output_count_calls = [call for call in mock_span.set_attribute.call_args_list - if call[0][0] == ATTR_OUTPUT_COUNT] + output_count_calls = [ + call + for call in mock_span.set_attribute.call_args_list + if call[0][0] == ATTR_OUTPUT_COUNT + ] assert len(output_count_calls) == 0 - - def test_handles_usage_properly(self): - """Test that usage attributes are set when present.""" - mock_span = Mock() - - response = EmbeddingResponse( - object="list", - data=[ - EmbeddingData( - object="embedding", - embedding=[0.1, 0.2], - index=0 - ) - ], - model="test-model", - usage=EmbeddingUsage(prompt_tokens=5, total_tokens=5) - ) - - set_attribute_response_embedding(mock_span, response) - - # Should set both usage and output count attributes - mock_span.set_attribute.assert_any_call(ATTR_OUTPUT_COUNT, 1) - mock_span.set_attribute.assert_any_call(ATTR_PROMPT_TOKENS, 5) - mock_span.set_attribute.assert_any_call(ATTR_TOTAL_TOKENS, 5) class TestSlmEmbeddingSpan: """Tests for slm_embedding_span context manager.""" - + def test_sets_initial_attributes_string_input(self, setup_tracing): - """Test that initial attributes are set correctly for string input.""" memory_exporter, local_tracer = setup_tracing - + request = EmbeddingRequest( - input="Hello world, this is a test input.", - model="test-model" + input="Hello world, this is a test input.", model="test-model" ) - - # Patch the global tracer with our local one - with patch('slm_server.utils.spans.tracer', local_tracer): + + with patch("slm_server.utils.spans.tracer", local_tracer): with slm_embedding_span(request) as span: pass - - # Get the finished span + spans = memory_exporter.get_finished_spans() assert len(spans) == 1 - + span = spans[0] attrs = span.attributes - + 
assert span.name == SPAN_EMBEDDING - assert attrs[ATTR_MODEL] == "llama-cpp" + assert attrs[ATTR_MODEL] == EMBEDDING_MODEL_NAME assert attrs[ATTR_INPUT_COUNT] == 1 assert attrs[ATTR_INPUT_CONTENT_LENGTH] > 0 - + def test_sets_initial_attributes_list_input(self, setup_tracing): - """Test that initial attributes are set correctly for list input.""" memory_exporter, local_tracer = setup_tracing - + request = EmbeddingRequest( - input=["First text", "Second text", "Third text"], - model="test-model" + input=["First text", "Second text", "Third text"], model="test-model" ) - - # Patch the global tracer with our local one - with patch('slm_server.utils.spans.tracer', local_tracer): + + with patch("slm_server.utils.spans.tracer", local_tracer): with slm_embedding_span(request) as span: pass - - # Get the finished span + spans = memory_exporter.get_finished_spans() assert len(spans) == 1 - + span = spans[0] attrs = span.attributes - + assert attrs[ATTR_INPUT_COUNT] == 3 assert attrs[ATTR_INPUT_CONTENT_LENGTH] > 0 - + def test_handles_empty_string_input(self, setup_tracing): - """Test handling of empty string input.""" memory_exporter, local_tracer = setup_tracing - - request = EmbeddingRequest( - input="", - model="test-model" - ) - - with patch('slm_server.utils.spans.tracer', local_tracer): + + request = EmbeddingRequest(input="", model="test-model") + + with patch("slm_server.utils.spans.tracer", local_tracer): with slm_embedding_span(request) as span: pass - + spans = memory_exporter.get_finished_spans() span = spans[0] attrs = span.attributes - + assert attrs[ATTR_INPUT_COUNT] == 1 assert attrs[ATTR_INPUT_CONTENT_LENGTH] == 0 - + def test_handles_empty_list_input(self, setup_tracing): - """Test handling of empty list input.""" memory_exporter, local_tracer = setup_tracing - - request = EmbeddingRequest( - input=[], - model="test-model" - ) - - with patch('slm_server.utils.spans.tracer', local_tracer): + + request = EmbeddingRequest(input=[], model="test-model") + + 
with patch("slm_server.utils.spans.tracer", local_tracer): with slm_embedding_span(request): pass - + spans = memory_exporter.get_finished_spans() span = spans[0] attrs = span.attributes - + assert attrs[ATTR_INPUT_COUNT] == 0 assert attrs[ATTR_INPUT_CONTENT_LENGTH] == 0 - + def test_handles_list_with_empty_strings(self, setup_tracing): - """Test handling of list containing empty strings.""" memory_exporter, local_tracer = setup_tracing - + request = EmbeddingRequest( - input=["Hello", "", "World", ""], - model="test-model" + input=["Hello", "", "World", ""], model="test-model" ) - - with patch('slm_server.utils.spans.tracer', local_tracer): + + with patch("slm_server.utils.spans.tracer", local_tracer): with slm_embedding_span(request) as span: pass - + spans = memory_exporter.get_finished_spans() span = spans[0] attrs = span.attributes - + assert attrs[ATTR_INPUT_COUNT] == 4 - assert attrs[ATTR_INPUT_CONTENT_LENGTH] == 10 # len("Hello") + len("World") = 5 + 5 - + assert attrs[ATTR_INPUT_CONTENT_LENGTH] == 10 + def test_handles_exceptions(self, setup_tracing): - """Test exception handling in embedding span context.""" memory_exporter, local_tracer = setup_tracing - + request = EmbeddingRequest(input="test", model="test-model") - + with pytest.raises(ValueError): - with patch('slm_server.utils.spans.tracer', local_tracer): + with patch("slm_server.utils.spans.tracer", local_tracer): with slm_embedding_span(request) as span: raise ValueError("test embedding error") - + spans = memory_exporter.get_finished_spans() span = spans[0] - + assert span.status.status_code == StatusCode.ERROR assert "test embedding error" in span.status.description assert span.attributes["slm.force_sample"] is True @@ -280,191 +187,123 @@ def test_handles_exceptions(self, setup_tracing): class TestEmbeddingModelValidation: """Tests for embedding model validation.""" - + def test_embedding_request_string_input(self): - """Test EmbeddingRequest with string input.""" - request = EmbeddingRequest( 
- input="Test input text", - model="test-model" - ) - + request = EmbeddingRequest(input="Test input text", model="test-model") assert request.input == "Test input text" assert request.model == "test-model" - + def test_embedding_request_list_input(self): - """Test EmbeddingRequest with list input.""" request = EmbeddingRequest( - input=["First", "Second", "Third"], - model="test-model" + input=["First", "Second", "Third"], model="test-model" ) - assert request.input == ["First", "Second", "Third"] assert request.model == "test-model" - + def test_embedding_request_default_model(self): - """Test EmbeddingRequest with default model.""" request = EmbeddingRequest(input="Test") - - assert request.model is None # Default is None as model is not important for server - + assert request.model is None + def test_embedding_response_creation(self): - """Test EmbeddingResponse creation.""" response = EmbeddingResponse( - object="list", - data=[ - EmbeddingData( - object="embedding", - embedding=[1.0, 2.0, 3.0], - index=0 - ) - ], + data=[EmbeddingData(embedding=[1.0, 2.0, 3.0], index=0)], model="test-model", - usage=EmbeddingUsage(prompt_tokens=10, total_tokens=10) ) - - assert response["object"] == "list" - assert len(response["data"]) == 1 - assert response["data"][0]["embedding"] == [1.0, 2.0, 3.0] - assert response["data"][0]["index"] == 0 - assert response["model"] == "test-model" - assert response["usage"]["prompt_tokens"] == 10 - assert response["usage"]["total_tokens"] == 10 - + assert response.object == "list" + assert len(response.data) == 1 + assert response.data[0].embedding == [1.0, 2.0, 3.0] + assert response.data[0].index == 0 + assert response.model == "test-model" + def test_embedding_data_defaults(self): - """Test EmbeddingData with explicit object field.""" - data = EmbeddingData( - object="embedding", - embedding=[0.1, 0.2, 0.3], - index=0 - ) - - assert data["object"] == "embedding" - assert data["embedding"] == [0.1, 0.2, 0.3] - assert data["index"] == 0 
+ data = EmbeddingData(embedding=[0.1, 0.2, 0.3], index=0) + assert data.object == "embedding" + assert data.embedding == [0.1, 0.2, 0.3] + assert data.index == 0 class TestIntegrationEmbeddingFlow: """Integration test for complete embedding flow.""" - + def test_complete_embedding_flow_string_input(self, setup_tracing): - """Test complete flow of embedding request with string input.""" memory_exporter, local_tracer = setup_tracing - + request = EmbeddingRequest( - input="This is a test sentence for embedding.", - model="test-model" + input="This is a test sentence for embedding.", model="test-model" ) - - # Patch the global tracer with our local one - with patch('slm_server.utils.spans.tracer', local_tracer): + + with patch("slm_server.utils.spans.tracer", local_tracer): with slm_embedding_span(request) as span: - # Simulate processing embedding response = EmbeddingResponse( - object="list", data=[ EmbeddingData( - object="embedding", embedding=[0.1, -0.2, 0.3, -0.4, 0.5, -0.6, 0.7, -0.8], - index=0 + index=0, ) ], model="test-model", - usage=EmbeddingUsage(prompt_tokens=8, total_tokens=8) ) - set_attribute_response_embedding(span, response) - - # Get finished span and verify + spans = memory_exporter.get_finished_spans() assert len(spans) == 1 - + finished_span = spans[0] - - # Verify span attributes + assert finished_span.name == SPAN_EMBEDDING - assert finished_span.attributes[ATTR_MODEL] == "llama-cpp" + assert finished_span.attributes[ATTR_MODEL] == EMBEDDING_MODEL_NAME assert finished_span.attributes[ATTR_INPUT_COUNT] == 1 assert finished_span.attributes[ATTR_INPUT_CONTENT_LENGTH] > 0 assert finished_span.attributes[ATTR_OUTPUT_COUNT] == 1 - assert finished_span.attributes[ATTR_PROMPT_TOKENS] == 8 - assert finished_span.attributes[ATTR_TOTAL_TOKENS] == 8 - + def test_complete_embedding_flow_list_input(self, setup_tracing): - """Test complete flow of embedding request with list input.""" memory_exporter, local_tracer = setup_tracing - + request = 
EmbeddingRequest( input=["First sentence.", "Second sentence.", "Third sentence."], - model="test-model" + model="test-model", ) - - # Patch the global tracer with our local one - with patch('slm_server.utils.spans.tracer', local_tracer): + + with patch("slm_server.utils.spans.tracer", local_tracer): with slm_embedding_span(request) as span: - # Simulate processing multiple embeddings response = EmbeddingResponse( - object="list", data=[ - EmbeddingData( - object="embedding", - embedding=[0.1, 0.2, 0.3], - index=0 - ), - EmbeddingData( - object="embedding", - embedding=[0.4, 0.5, 0.6], - index=1 - ), - EmbeddingData( - object="embedding", - embedding=[0.7, 0.8, 0.9], - index=2 - ) + EmbeddingData(embedding=[0.1, 0.2, 0.3], index=0), + EmbeddingData(embedding=[0.4, 0.5, 0.6], index=1), + EmbeddingData(embedding=[0.7, 0.8, 0.9], index=2), ], model="test-model", - usage=EmbeddingUsage(prompt_tokens=12, total_tokens=12) ) - set_attribute_response_embedding(span, response) - - # Get finished span and verify + spans = memory_exporter.get_finished_spans() assert len(spans) == 1 - + finished_span = spans[0] - - # Verify span attributes + assert finished_span.attributes[ATTR_INPUT_COUNT] == 3 assert finished_span.attributes[ATTR_INPUT_CONTENT_LENGTH] > 0 assert finished_span.attributes[ATTR_OUTPUT_COUNT] == 3 - assert finished_span.attributes[ATTR_PROMPT_TOKENS] == 12 - assert finished_span.attributes[ATTR_TOTAL_TOKENS] == 12 - + def test_embedding_flow_with_error(self, setup_tracing): - """Test embedding flow with error handling.""" memory_exporter, local_tracer = setup_tracing - + request = EmbeddingRequest( - input="This will cause an error.", - model="test-model" + input="This will cause an error.", model="test-model" ) - + with pytest.raises(RuntimeError): - with patch('slm_server.utils.spans.tracer', local_tracer): + with patch("slm_server.utils.spans.tracer", local_tracer): with slm_embedding_span(request) as span: raise RuntimeError("Embedding processing failed") - 
- # Get finished span and verify error handling + spans = memory_exporter.get_finished_spans() assert len(spans) == 1 - + finished_span = spans[0] - - # Verify error status + assert finished_span.status.status_code == StatusCode.ERROR assert "Embedding processing failed" in finished_span.status.description assert finished_span.attributes["slm.force_sample"] is True - - # Initial attributes should still be set assert finished_span.attributes[ATTR_INPUT_COUNT] == 1 - assert finished_span.attributes[ATTR_INPUT_CONTENT_LENGTH] == 25 \ No newline at end of file + assert finished_span.attributes[ATTR_INPUT_CONTENT_LENGTH] == 25 diff --git a/uv.lock b/uv.lock index 5ed8bd2..11e917b 100644 --- a/uv.lock +++ b/uv.lock @@ -2,6 +2,15 @@ version = 1 revision = 2 requires-python = ">=3.13" +[[package]] +name = "annotated-doc" +version = "0.0.4" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/57/ba/046ceea27344560984e26a590f90bc7f4a75b06701f653222458922b558c/annotated_doc-0.0.4.tar.gz", hash = "sha256:fbcda96e87e9c92ad167c2e53839e57503ecfda18804ea28102353485033faa4", size = 7288, upload_time = "2025-11-10T22:07:42.062Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/1e/d3/26bf1008eb3d2daa8ef4cacc7f3bfdc11818d111f7e2d0201bc6e3b49d45/annotated_doc-0.0.4-py3-none-any.whl", hash = "sha256:571ac1dc6991c450b25a9c2d84a3705e2ae7a53467b5d111c24fa8baabbed320", size = 5303, upload_time = "2025-11-10T22:07:40.673Z" }, +] + [[package]] name = "annotated-types" version = "0.7.0" @@ -170,6 +179,32 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e5/47/d63c60f59a59467fda0f93f46335c9d18526d7071f025cb5b89d5353ea42/fastapi-0.116.1-py3-none-any.whl", hash = "sha256:c46ac7c312df840f0c9e220f7964bada936781bc4e2e6eb71f1c4d7553786565", size = 95631, upload_time = "2025-07-11T16:22:30.485Z" }, ] +[[package]] +name = "filelock" +version = "3.24.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = 
"https://files.pythonhosted.org/packages/73/92/a8e2479937ff39185d20dd6a851c1a63e55849e447a55e798cc2e1f49c65/filelock-3.24.3.tar.gz", hash = "sha256:011a5644dc937c22699943ebbfc46e969cdde3e171470a6e40b9533e5a72affa", size = 37935, upload_time = "2026-02-19T00:48:20.543Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/9c/0f/5d0c71a1aefeb08efff26272149e07ab922b64f46c63363756224bd6872e/filelock-3.24.3-py3-none-any.whl", hash = "sha256:426e9a4660391f7f8a810d71b0555bce9008b0a1cc342ab1f6947d37639e002d", size = 24331, upload_time = "2026-02-19T00:48:18.465Z" }, +] + +[[package]] +name = "flatbuffers" +version = "25.12.19" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e8/2d/d2a548598be01649e2d46231d151a6c56d10b964d94043a335ae56ea2d92/flatbuffers-25.12.19-py2.py3-none-any.whl", hash = "sha256:7634f50c427838bb021c2d66a3d1168e9d199b0607e6329399f04846d42e20b4", size = 26661, upload_time = "2025-12-19T23:16:13.622Z" }, +] + +[[package]] +name = "fsspec" +version = "2026.2.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/51/7c/f60c259dcbf4f0c47cc4ddb8f7720d2dcdc8888c8e5ad84c73ea4531cc5b/fsspec-2026.2.0.tar.gz", hash = "sha256:6544e34b16869f5aacd5b90bdf1a71acb37792ea3ddf6125ee69a22a53fb8bff", size = 313441, upload_time = "2026-02-05T21:50:53.743Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e6/ab/fb21f4c939bb440104cc2b396d3be1d9b7a9fd3c6c2a53d98c45b3d7c954/fsspec-2026.2.0-py3-none-any.whl", hash = "sha256:98de475b5cb3bd66bedd5c4679e87b4fdfe1a3bf4d707b151b3c07e58c9a2437", size = 202505, upload_time = "2026-02-05T21:50:51.819Z" }, +] + [[package]] name = "googleapis-common-protos" version = "1.70.0" @@ -233,6 +268,35 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/04/4b/29cac41a4d98d144bf5f6d33995617b185d14b22401f75ca86f384e87ff1/h11-0.16.0-py3-none-any.whl", hash = 
"sha256:63cf8bbe7522de3bf65932fda1d9c2772064ffb3dae62d55932da54b31cb6c86", size = 37515, upload_time = "2025-04-24T03:35:24.344Z" }, ] +[[package]] +name = "hf-xet" +version = "1.2.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/5e/6e/0f11bacf08a67f7fb5ee09740f2ca54163863b07b70d579356e9222ce5d8/hf_xet-1.2.0.tar.gz", hash = "sha256:a8c27070ca547293b6890c4bf389f713f80e8c478631432962bb7f4bc0bd7d7f", size = 506020, upload_time = "2025-10-24T19:04:32.129Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/9e/a5/85ef910a0aa034a2abcfadc360ab5ac6f6bc4e9112349bd40ca97551cff0/hf_xet-1.2.0-cp313-cp313t-macosx_10_12_x86_64.whl", hash = "sha256:ceeefcd1b7aed4956ae8499e2199607765fbd1c60510752003b6cc0b8413b649", size = 2861870, upload_time = "2025-10-24T19:04:11.422Z" }, + { url = "https://files.pythonhosted.org/packages/ea/40/e2e0a7eb9a51fe8828ba2d47fe22a7e74914ea8a0db68a18c3aa7449c767/hf_xet-1.2.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:b70218dd548e9840224df5638fdc94bd033552963cfa97f9170829381179c813", size = 2717584, upload_time = "2025-10-24T19:04:09.586Z" }, + { url = "https://files.pythonhosted.org/packages/a5/7d/daf7f8bc4594fdd59a8a596f9e3886133fdc68e675292218a5e4c1b7e834/hf_xet-1.2.0-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7d40b18769bb9a8bc82a9ede575ce1a44c75eb80e7375a01d76259089529b5dc", size = 3315004, upload_time = "2025-10-24T19:04:00.314Z" }, + { url = "https://files.pythonhosted.org/packages/b1/ba/45ea2f605fbf6d81c8b21e4d970b168b18a53515923010c312c06cd83164/hf_xet-1.2.0-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:cd3a6027d59cfb60177c12d6424e31f4b5ff13d8e3a1247b3a584bf8977e6df5", size = 3222636, upload_time = "2025-10-24T19:03:58.111Z" }, + { url = "https://files.pythonhosted.org/packages/4a/1d/04513e3cab8f29ab8c109d309ddd21a2705afab9d52f2ba1151e0c14f086/hf_xet-1.2.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = 
"sha256:6de1fc44f58f6dd937956c8d304d8c2dea264c80680bcfa61ca4a15e7b76780f", size = 3408448, upload_time = "2025-10-24T19:04:20.951Z" }, + { url = "https://files.pythonhosted.org/packages/f0/7c/60a2756d7feec7387db3a1176c632357632fbe7849fce576c5559d4520c7/hf_xet-1.2.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:f182f264ed2acd566c514e45da9f2119110e48a87a327ca271027904c70c5832", size = 3503401, upload_time = "2025-10-24T19:04:22.549Z" }, + { url = "https://files.pythonhosted.org/packages/4e/64/48fffbd67fb418ab07451e4ce641a70de1c40c10a13e25325e24858ebe5a/hf_xet-1.2.0-cp313-cp313t-win_amd64.whl", hash = "sha256:293a7a3787e5c95d7be1857358a9130694a9c6021de3f27fa233f37267174382", size = 2900866, upload_time = "2025-10-24T19:04:33.461Z" }, + { url = "https://files.pythonhosted.org/packages/e2/51/f7e2caae42f80af886db414d4e9885fac959330509089f97cccb339c6b87/hf_xet-1.2.0-cp314-cp314t-macosx_10_12_x86_64.whl", hash = "sha256:10bfab528b968c70e062607f663e21e34e2bba349e8038db546646875495179e", size = 2861861, upload_time = "2025-10-24T19:04:19.01Z" }, + { url = "https://files.pythonhosted.org/packages/6e/1d/a641a88b69994f9371bd347f1dd35e5d1e2e2460a2e350c8d5165fc62005/hf_xet-1.2.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:2a212e842647b02eb6a911187dc878e79c4aa0aa397e88dd3b26761676e8c1f8", size = 2717699, upload_time = "2025-10-24T19:04:17.306Z" }, + { url = "https://files.pythonhosted.org/packages/df/e0/e5e9bba7d15f0318955f7ec3f4af13f92e773fbb368c0b8008a5acbcb12f/hf_xet-1.2.0-cp314-cp314t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:30e06daccb3a7d4c065f34fc26c14c74f4653069bb2b194e7f18f17cbe9939c0", size = 3314885, upload_time = "2025-10-24T19:04:07.642Z" }, + { url = "https://files.pythonhosted.org/packages/21/90/b7fe5ff6f2b7b8cbdf1bd56145f863c90a5807d9758a549bf3d916aa4dec/hf_xet-1.2.0-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:29c8fc913a529ec0a91867ce3d119ac1aac966e098cf49501800c870328cc090", size = 3221550, upload_time = 
"2025-10-24T19:04:05.55Z" }, + { url = "https://files.pythonhosted.org/packages/6f/cb/73f276f0a7ce46cc6a6ec7d6c7d61cbfe5f2e107123d9bbd0193c355f106/hf_xet-1.2.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:66e159cbfcfbb29f920db2c09ed8b660eb894640d284f102ada929b6e3dc410a", size = 3408010, upload_time = "2025-10-24T19:04:28.598Z" }, + { url = "https://files.pythonhosted.org/packages/b8/1e/d642a12caa78171f4be64f7cd9c40e3ca5279d055d0873188a58c0f5fbb9/hf_xet-1.2.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:9c91d5ae931510107f148874e9e2de8a16052b6f1b3ca3c1b12f15ccb491390f", size = 3503264, upload_time = "2025-10-24T19:04:30.397Z" }, + { url = "https://files.pythonhosted.org/packages/17/b5/33764714923fa1ff922770f7ed18c2daae034d21ae6e10dbf4347c854154/hf_xet-1.2.0-cp314-cp314t-win_amd64.whl", hash = "sha256:210d577732b519ac6ede149d2f2f34049d44e8622bf14eb3d63bbcd2d4b332dc", size = 2901071, upload_time = "2025-10-24T19:04:37.463Z" }, + { url = "https://files.pythonhosted.org/packages/96/2d/22338486473df5923a9ab7107d375dbef9173c338ebef5098ef593d2b560/hf_xet-1.2.0-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:46740d4ac024a7ca9b22bebf77460ff43332868b661186a8e46c227fdae01848", size = 2866099, upload_time = "2025-10-24T19:04:15.366Z" }, + { url = "https://files.pythonhosted.org/packages/7f/8c/c5becfa53234299bc2210ba314eaaae36c2875e0045809b82e40a9544f0c/hf_xet-1.2.0-cp37-abi3-macosx_11_0_arm64.whl", hash = "sha256:27df617a076420d8845bea087f59303da8be17ed7ec0cd7ee3b9b9f579dff0e4", size = 2722178, upload_time = "2025-10-24T19:04:13.695Z" }, + { url = "https://files.pythonhosted.org/packages/9a/92/cf3ab0b652b082e66876d08da57fcc6fa2f0e6c70dfbbafbd470bb73eb47/hf_xet-1.2.0-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3651fd5bfe0281951b988c0facbe726aa5e347b103a675f49a3fa8144c7968fd", size = 3320214, upload_time = "2025-10-24T19:04:03.596Z" }, + { url = 
"https://files.pythonhosted.org/packages/46/92/3f7ec4a1b6a65bf45b059b6d4a5d38988f63e193056de2f420137e3c3244/hf_xet-1.2.0-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:d06fa97c8562fb3ee7a378dd9b51e343bc5bc8190254202c9771029152f5e08c", size = 3229054, upload_time = "2025-10-24T19:04:01.949Z" }, + { url = "https://files.pythonhosted.org/packages/0b/dd/7ac658d54b9fb7999a0ccb07ad863b413cbaf5cf172f48ebcd9497ec7263/hf_xet-1.2.0-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:4c1428c9ae73ec0939410ec73023c4f842927f39db09b063b9482dac5a3bb737", size = 3413812, upload_time = "2025-10-24T19:04:24.585Z" }, + { url = "https://files.pythonhosted.org/packages/92/68/89ac4e5b12a9ff6286a12174c8538a5930e2ed662091dd2572bbe0a18c8a/hf_xet-1.2.0-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:a55558084c16b09b5ed32ab9ed38421e2d87cf3f1f89815764d1177081b99865", size = 3508920, upload_time = "2025-10-24T19:04:26.927Z" }, + { url = "https://files.pythonhosted.org/packages/cb/44/870d44b30e1dcfb6a65932e3e1506c103a8a5aea9103c337e7a53180322c/hf_xet-1.2.0-cp37-abi3-win_amd64.whl", hash = "sha256:e6584a52253f72c9f52f9e549d5895ca7a471608495c4ecaa6cc73dba2b24d69", size = 2905735, upload_time = "2025-10-24T19:04:35.928Z" }, +] + [[package]] name = "httpcore" version = "1.0.9" @@ -261,6 +325,27 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/2a/39/e50c7c3a983047577ee07d2a9e53faf5a69493943ec3f6a384bdc792deb2/httpx-0.28.1-py3-none-any.whl", hash = "sha256:d909fcccc110f8c7faf814ca82a9a4d816bc5a6dbfea25d6591d6985b8ba59ad", size = 73517, upload_time = "2024-12-06T15:37:21.509Z" }, ] +[[package]] +name = "huggingface-hub" +version = "1.4.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "filelock" }, + { name = "fsspec" }, + { name = "hf-xet", marker = "platform_machine == 'AMD64' or platform_machine == 'aarch64' or platform_machine == 'amd64' or platform_machine == 'arm64' or platform_machine == 'x86_64'" }, + { name = "httpx" }, + { name = 
"packaging" }, + { name = "pyyaml" }, + { name = "shellingham" }, + { name = "tqdm" }, + { name = "typer-slim" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/c4/fc/eb9bc06130e8bbda6a616e1b80a7aa127681c448d6b49806f61db2670b61/huggingface_hub-1.4.1.tar.gz", hash = "sha256:b41131ec35e631e7383ab26d6146b8d8972abc8b6309b963b306fbcca87f5ed5", size = 642156, upload_time = "2026-02-06T09:20:03.013Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d5/ae/2f6d96b4e6c5478d87d606a1934b5d436c4a2bce6bb7c6fdece891c128e3/huggingface_hub-1.4.1-py3-none-any.whl", hash = "sha256:9931d075fb7a79af5abc487106414ec5fba2c0ae86104c0c62fd6cae38873d18", size = 553326, upload_time = "2026-02-06T09:20:00.728Z" }, +] + [[package]] name = "idna" version = "3.10" @@ -452,6 +537,18 @@ dependencies = [ ] sdist = { url = "https://files.pythonhosted.org/packages/e2/3d/a426f9777301569a17f3c3bf4ecc3755120531c008e4601450eec13c09ac/llama_cpp_python-0.3.13.tar.gz", hash = "sha256:307ce2abf62c7cf574234b8c633978cf92eb1c4b3cfe6babef889d812c298d84", size = 50059668, upload_time = "2025-07-15T11:43:59.734Z" } +[[package]] +name = "markdown-it-py" +version = "4.0.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "mdurl" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/5b/f5/4ec618ed16cc4f8fb3b701563655a69816155e79e24a17b651541804721d/markdown_it_py-4.0.0.tar.gz", hash = "sha256:cb0a2b4aa34f932c007117b194e945bd74e0ec24133ceb5bac59009cda1cb9f3", size = 73070, upload_time = "2025-08-11T12:57:52.854Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/94/54/e7d793b573f298e1c9013b8c4dade17d481164aa517d1d7148619c2cedbf/markdown_it_py-4.0.0-py3-none-any.whl", hash = "sha256:87327c59b172c5011896038353a81343b6754500a08cd7a4973bb48c6d578147", size = 87321, upload_time = "2025-08-11T12:57:51.923Z" }, +] + [[package]] name = "markupsafe" version = "3.0.2" @@ -480,6 +577,24 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/4f/65/6079a46068dfceaeabb5dcad6d674f5f5c61a6fa5673746f42a9f4c233b3/MarkupSafe-3.0.2-cp313-cp313t-win_amd64.whl", hash = "sha256:e444a31f8db13eb18ada366ab3cf45fd4b31e4db1236a4448f68778c1d1a5a2f", size = 15739, upload_time = "2024-10-18T15:21:42.784Z" }, ] +[[package]] +name = "mdurl" +version = "0.1.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/d6/54/cfe61301667036ec958cb99bd3efefba235e65cdeb9c84d24a8293ba1d90/mdurl-0.1.2.tar.gz", hash = "sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba", size = 8729, upload_time = "2022-08-14T12:40:10.846Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b3/38/89ba8ad64ae25be8de66a6d463314cf1eb366222074cfda9ee839c56a4b4/mdurl-0.1.2-py3-none-any.whl", hash = "sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8", size = 9979, upload_time = "2022-08-14T12:40:09.779Z" }, +] + +[[package]] +name = "mpmath" +version = "1.3.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/e0/47/dd32fa426cc72114383ac549964eecb20ecfd886d1e5ccf5340b55b02f57/mpmath-1.3.0.tar.gz", hash = "sha256:7a28eb2a9774d00c7bc92411c19a89209d5da7c4c9a9e227be8330a23a25b91f", size = 508106, upload_time = "2023-03-07T16:47:11.061Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/43/e3/7d92a15f894aa0c9c4b49b8ee9ac9850d6e63b03c9c32c0367a13ae62209/mpmath-1.3.0-py3-none-any.whl", hash = "sha256:a0b2b9fe80bbcd81a6647ff13108738cfb482d481d826cc0e02f5b35e5c88d2c", size = 536198, upload_time = "2023-03-07T16:47:09.197Z" }, +] + [[package]] name = "numpy" version = "2.3.1" @@ -510,6 +625,34 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d4/ca/af82bf0fad4c3e573c6930ed743b5308492ff19917c7caaf2f9b6f9e2e98/numpy-2.3.1-cp313-cp313t-win_arm64.whl", hash = "sha256:eccb9a159db9aed60800187bc47a6d3451553f0e1b08b068d8b277ddfbb9b244", size 
= 10260376, upload_time = "2025-06-21T12:24:56.884Z" }, ] +[[package]] +name = "onnxruntime" +version = "1.24.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "flatbuffers" }, + { name = "numpy" }, + { name = "packaging" }, + { name = "protobuf" }, + { name = "sympy" }, +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/9c/98/8f5b9ae63f7f6dd5fb2d192454b915ec966a421fdd0effeeef5be7f7221f/onnxruntime-1.24.2-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:038ebcd8363c3835ea83eed66129e1d11d8219438892dfb7dc7656c4d4dfa1f9", size = 17217884, upload_time = "2026-02-19T17:13:36.193Z" }, + { url = "https://files.pythonhosted.org/packages/55/e6/dc4dc59565c93506c45017c0dd3f536f6d1b7bc97047821af13fba2e3def/onnxruntime-1.24.2-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8235cc11e118ad749c497ba93288c04073eccd8cc6cc508c8a7988ae36ab52d8", size = 15026995, upload_time = "2026-02-19T17:13:25.029Z" }, + { url = "https://files.pythonhosted.org/packages/ac/62/6f2851cf3237a91bc04cdb35434293a623d4f6369f79836929600da574ba/onnxruntime-1.24.2-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e92b46cc6d8be4286436a05382a881c88d85a2ae1ea9cfe5e6fab89f2c3e89cc", size = 17106308, upload_time = "2026-02-19T17:14:09.817Z" }, + { url = "https://files.pythonhosted.org/packages/62/5a/1e2b874daf24f26e98af14281fdbdd6ae1ed548ba471c01ea2a3084c55bb/onnxruntime-1.24.2-cp313-cp313-win_amd64.whl", hash = "sha256:1fd824ee4f6fb811bc47ffec2b25f129f31a087214ca91c8b4f6fda32962b78f", size = 12506095, upload_time = "2026-02-19T17:15:02.434Z" }, + { url = "https://files.pythonhosted.org/packages/2d/6f/8fac5eecb94f861d56a43ede3c2ebcdce60132952d3b72003f3e3d91483c/onnxruntime-1.24.2-cp313-cp313-win_arm64.whl", hash = "sha256:d8cf0acbf90771fff012c33eb2749e8aca2a8b4c66c672f30ee77c140a6fba5b", size = 12168564, upload_time = "2026-02-19T17:14:52.28Z" }, + { url = 
"https://files.pythonhosted.org/packages/35/e4/7dfed3f445f7289a0abff709d012439c6c901915390704dd918e5f47aad3/onnxruntime-1.24.2-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e22fb5d9ac51b61f50cca155ce2927576cc2c42501ede6c0df23a1aeb070bdd5", size = 15036844, upload_time = "2026-02-19T17:13:27.928Z" }, + { url = "https://files.pythonhosted.org/packages/90/45/9d52397e30b0d8c1692afcec5184ca9372ff4d6b0f6039bba9ad479a2563/onnxruntime-1.24.2-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2956f5220e7be8b09482ae5726caabf78eb549142cdb28523191a38e57fb6119", size = 17117779, upload_time = "2026-02-19T17:14:13.862Z" }, + { url = "https://files.pythonhosted.org/packages/a9/c8/2321cd06ddbb4321326df365ccb8345cdb4e05643f539729f3943c706e97/onnxruntime-1.24.2-cp314-cp314-macosx_14_0_arm64.whl", hash = "sha256:487e3fdedc24bc93f2acdf47c622de49b3999fb5754e7cfa466e5533a0215051", size = 17219405, upload_time = "2026-02-19T17:13:39.925Z" }, + { url = "https://files.pythonhosted.org/packages/ad/ff/a2cdf95d2647f2a5076eb3fc49ae662e375c4eb5c7b6b675f910f96c8e15/onnxruntime-1.24.2-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9c33398bd6ab1a6b7de9410af7360cd8b6312bc0c4848ddb738456c13dfbec4b", size = 15027713, upload_time = "2026-02-19T17:13:30.693Z" }, + { url = "https://files.pythonhosted.org/packages/0d/74/a1913b3a0fc2f27fe1751e9545745a3f35fd7833e3438a4208b4e215778f/onnxruntime-1.24.2-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2658b3ce6cb33bdeddfcd74c6da509510310717611220cf2106e6c401febabe5", size = 17106108, upload_time = "2026-02-19T17:14:16.619Z" }, + { url = "https://files.pythonhosted.org/packages/0a/bd/fca80d282bca9848b2c8e101c764432dd61a0e9d2377d1c8b3bab13235d0/onnxruntime-1.24.2-cp314-cp314-win_amd64.whl", hash = "sha256:45b4f68ffec95b2cc0dc96b2b413f69ace9a80a0e5400023c5ac61f73a7a3fdf", size = 12808967, upload_time = "2026-02-19T17:15:05.1Z" }, + { url = 
"https://files.pythonhosted.org/packages/6d/eb/6b154dd61cac410cacf27a9f53bbf49f4dbfe5b3982f3f5b0247c7bf7b78/onnxruntime-1.24.2-cp314-cp314-win_arm64.whl", hash = "sha256:6c501aaaaa674e689aaac501e26eb96aba908ebc067fe761fbcbed868bd694a6", size = 12491892, upload_time = "2026-02-19T17:14:54.584Z" }, + { url = "https://files.pythonhosted.org/packages/6f/84/14e5e804836476d3ef6ac07afe3ed6bdf01b69f8ef3ce6ae82c6c80b6d62/onnxruntime-1.24.2-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5360d3fd9c08ce17fff757759ce4b152852be14d597130f41174d8271f954630", size = 15036834, upload_time = "2026-02-19T17:13:33.65Z" }, + { url = "https://files.pythonhosted.org/packages/3a/27/ecdd3ae7d49d9f54820ededce2d88ddc3333b9ac9bb5f1d0d6aa3148c686/onnxruntime-1.24.2-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:05a2792b5ef9278a89415a1f39d0a22192a872168257100503a5157165a38e7b", size = 17117770, upload_time = "2026-02-19T17:14:20.048Z" }, +] + [[package]] name = "openai" version = "1.97.1" @@ -998,6 +1141,19 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/3f/51/d4db610ef29373b879047326cbf6fa98b6c1969d6f6dc423279de2b1be2c/requests_toolbelt-1.0.0-py2.py3-none-any.whl", hash = "sha256:cccfdd665f0a24fcf4726e690f65639d272bb0637b9b92dfd91a5568ccf6bd06", size = 54481, upload_time = "2023-05-01T04:11:28.427Z" }, ] +[[package]] +name = "rich" +version = "14.3.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "markdown-it-py" }, + { name = "pygments" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/b3/c6/f3b320c27991c46f43ee9d856302c70dc2d0fb2dba4842ff739d5f46b393/rich-14.3.3.tar.gz", hash = "sha256:b8daa0b9e4eef54dd8cf7c86c03713f53241884e814f4e2f5fb342fe520f639b", size = 230582, upload_time = "2026-02-19T17:23:12.474Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/14/25/b208c5683343959b670dc001595f2f3737e051da617f66c31f7c4fa93abc/rich-14.3.3-py3-none-any.whl", 
hash = "sha256:793431c1f8619afa7d3b52b2cdec859562b950ea0d4b6b505397612db8d5362d", size = 310458, upload_time = "2026-02-19T17:23:13.732Z" }, +] + [[package]] name = "ruff" version = "0.12.3" @@ -1023,6 +1179,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e0/30/f3eaf6563c637b6e66238ed6535f6775480db973c836336e4122161986fc/ruff-0.12.3-py3-none-win_arm64.whl", hash = "sha256:5f9c7c9c8f84c2d7f27e93674d27136fbf489720251544c4da7fb3d742e011b1", size = 10805855, upload_time = "2025-07-11T13:21:13.547Z" }, ] +[[package]] +name = "shellingham" +version = "1.5.4" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/58/15/8b3609fd3830ef7b27b655beb4b4e9c62313a4e8da8c676e142cc210d58e/shellingham-1.5.4.tar.gz", hash = "sha256:8dbca0739d487e5bd35ab3ca4b36e11c4078f3a234bfce294b0a0291363404de", size = 10310, upload_time = "2023-10-24T04:13:40.426Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e0/f9/0595336914c5619e5f28a1fb793285925a8cd4b432c9da0a987836c7f822/shellingham-1.5.4-py2.py3-none-any.whl", hash = "sha256:7ecfff8f2fd72616f7481040475a65b2bf8af90a56c89140852d1120324e8686", size = 9755, upload_time = "2023-10-24T04:13:38.866Z" }, +] + [[package]] name = "slm-server" version = "1.0.0" @@ -1030,6 +1195,7 @@ source = { editable = "." 
} dependencies = [ { name = "fastapi" }, { name = "llama-cpp-python" }, + { name = "onnxruntime" }, { name = "opentelemetry-api" }, { name = "opentelemetry-exporter-otlp" }, { name = "opentelemetry-exporter-prometheus" }, @@ -1040,6 +1206,7 @@ dependencies = [ { name = "prometheus-fastapi-instrumentator" }, { name = "psutil" }, { name = "pydantic-settings" }, + { name = "tokenizers" }, { name = "uvicorn" }, ] @@ -1058,6 +1225,7 @@ dev = [ requires-dist = [ { name = "fastapi", specifier = ">=0.116.1" }, { name = "llama-cpp-python", specifier = ">=0.3.13" }, + { name = "onnxruntime", specifier = ">=1.17.0" }, { name = "opentelemetry-api", specifier = ">=1.35.0" }, { name = "opentelemetry-exporter-otlp", specifier = ">=1.35.0" }, { name = "opentelemetry-exporter-prometheus", specifier = ">=0.49b0" }, @@ -1068,6 +1236,7 @@ requires-dist = [ { name = "prometheus-fastapi-instrumentator", specifier = ">=7.1.0" }, { name = "psutil", specifier = ">=6.1.0" }, { name = "pydantic-settings", specifier = ">=2.10.1" }, + { name = "tokenizers", specifier = ">=0.21.0" }, { name = "uvicorn", specifier = ">=0.35.0" }, ] @@ -1124,6 +1293,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/82/95/38ef0cd7fa11eaba6a99b3c4f5ac948d8bc6ff199aabd327a29cc000840c/starlette-0.47.1-py3-none-any.whl", hash = "sha256:5e11c9f5c7c3f24959edbf2dffdc01bba860228acf657129467d8a7468591527", size = 72747, upload_time = "2025-06-21T04:03:15.705Z" }, ] +[[package]] +name = "sympy" +version = "1.14.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "mpmath" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/83/d3/803453b36afefb7c2bb238361cd4ae6125a569b4db67cd9e79846ba2d68c/sympy-1.14.0.tar.gz", hash = "sha256:d3d3fe8df1e5a0b42f0e7bdf50541697dbe7d23746e894990c030e2b05e72517", size = 7793921, upload_time = "2025-04-27T18:05:01.611Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/a2/09/77d55d46fd61b4a135c444fc97158ef34a095e5681d0a6c10b75bf356191/sympy-1.14.0-py3-none-any.whl", hash = "sha256:e091cc3e99d2141a0ba2847328f5479b05d94a6635cb96148ccb3f34671bd8f5", size = 6299353, upload_time = "2025-04-27T18:04:59.103Z" }, +] + [[package]] name = "tenacity" version = "9.1.2" @@ -1151,6 +1332,32 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/de/a8/8f499c179ec900783ffe133e9aab10044481679bb9aad78436d239eee716/tiktoken-0.9.0-cp313-cp313-win_amd64.whl", hash = "sha256:5ea0edb6f83dc56d794723286215918c1cde03712cbbafa0348b33448faf5b95", size = 894669, upload_time = "2025-02-14T06:02:47.341Z" }, ] +[[package]] +name = "tokenizers" +version = "0.22.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "huggingface-hub" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/73/6f/f80cfef4a312e1fb34baf7d85c72d4411afde10978d4657f8cdd811d3ccc/tokenizers-0.22.2.tar.gz", hash = "sha256:473b83b915e547aa366d1eee11806deaf419e17be16310ac0a14077f1e28f917", size = 372115, upload_time = "2026-01-05T10:45:15.988Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/92/97/5dbfabf04c7e348e655e907ed27913e03db0923abb5dfdd120d7b25630e1/tokenizers-0.22.2-cp39-abi3-macosx_10_12_x86_64.whl", hash = "sha256:544dd704ae7238755d790de45ba8da072e9af3eea688f698b137915ae959281c", size = 3100275, upload_time = "2026-01-05T10:41:02.158Z" }, + { url = "https://files.pythonhosted.org/packages/2e/47/174dca0502ef88b28f1c9e06b73ce33500eedfac7a7692108aec220464e7/tokenizers-0.22.2-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:1e418a55456beedca4621dbab65a318981467a2b188e982a23e117f115ce5001", size = 2981472, upload_time = "2026-01-05T10:41:00.276Z" }, + { url = "https://files.pythonhosted.org/packages/d6/84/7990e799f1309a8b87af6b948f31edaa12a3ed22d11b352eaf4f4b2e5753/tokenizers-0.22.2-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:2249487018adec45d6e3554c71d46eb39fa8ea67156c640f7513eb26f318cec7", size = 3290736, upload_time = "2026-01-05T10:40:32.165Z" }, + { url = "https://files.pythonhosted.org/packages/78/59/09d0d9ba94dcd5f4f1368d4858d24546b4bdc0231c2354aa31d6199f0399/tokenizers-0.22.2-cp39-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:25b85325d0815e86e0bac263506dd114578953b7b53d7de09a6485e4a160a7dd", size = 3168835, upload_time = "2026-01-05T10:40:38.847Z" }, + { url = "https://files.pythonhosted.org/packages/47/50/b3ebb4243e7160bda8d34b731e54dd8ab8b133e50775872e7a434e524c28/tokenizers-0.22.2-cp39-abi3-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:bfb88f22a209ff7b40a576d5324bf8286b519d7358663db21d6246fb17eea2d5", size = 3521673, upload_time = "2026-01-05T10:40:56.614Z" }, + { url = "https://files.pythonhosted.org/packages/e0/fa/89f4cb9e08df770b57adb96f8cbb7e22695a4cb6c2bd5f0c4f0ebcf33b66/tokenizers-0.22.2-cp39-abi3-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1c774b1276f71e1ef716e5486f21e76333464f47bece56bbd554485982a9e03e", size = 3724818, upload_time = "2026-01-05T10:40:44.507Z" }, + { url = "https://files.pythonhosted.org/packages/64/04/ca2363f0bfbe3b3d36e95bf67e56a4c88c8e3362b658e616d1ac185d47f2/tokenizers-0.22.2-cp39-abi3-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:df6c4265b289083bf710dff49bc51ef252f9d5be33a45ee2bed151114a56207b", size = 3379195, upload_time = "2026-01-05T10:40:51.139Z" }, + { url = "https://files.pythonhosted.org/packages/2e/76/932be4b50ef6ccedf9d3c6639b056a967a86258c6d9200643f01269211ca/tokenizers-0.22.2-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:369cc9fc8cc10cb24143873a0d95438bb8ee257bb80c71989e3ee290e8d72c67", size = 3274982, upload_time = "2026-01-05T10:40:58.331Z" }, + { url = "https://files.pythonhosted.org/packages/1d/28/5f9f5a4cc211b69e89420980e483831bcc29dade307955cc9dc858a40f01/tokenizers-0.22.2-cp39-abi3-musllinux_1_2_aarch64.whl", hash = 
"sha256:29c30b83d8dcd061078b05ae0cb94d3c710555fbb44861139f9f83dcca3dc3e4", size = 9478245, upload_time = "2026-01-05T10:41:04.053Z" }, + { url = "https://files.pythonhosted.org/packages/6c/fb/66e2da4704d6aadebf8cb39f1d6d1957df667ab24cff2326b77cda0dcb85/tokenizers-0.22.2-cp39-abi3-musllinux_1_2_armv7l.whl", hash = "sha256:37ae80a28c1d3265bb1f22464c856bd23c02a05bb211e56d0c5301a435be6c1a", size = 9560069, upload_time = "2026-01-05T10:45:10.673Z" }, + { url = "https://files.pythonhosted.org/packages/16/04/fed398b05caa87ce9b1a1bb5166645e38196081b225059a6edaff6440fac/tokenizers-0.22.2-cp39-abi3-musllinux_1_2_i686.whl", hash = "sha256:791135ee325f2336f498590eb2f11dc5c295232f288e75c99a36c5dbce63088a", size = 9899263, upload_time = "2026-01-05T10:45:12.559Z" }, + { url = "https://files.pythonhosted.org/packages/05/a1/d62dfe7376beaaf1394917e0f8e93ee5f67fea8fcf4107501db35996586b/tokenizers-0.22.2-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:38337540fbbddff8e999d59970f3c6f35a82de10053206a7562f1ea02d046fa5", size = 10033429, upload_time = "2026-01-05T10:45:14.333Z" }, + { url = "https://files.pythonhosted.org/packages/fd/18/a545c4ea42af3df6effd7d13d250ba77a0a86fb20393143bbb9a92e434d4/tokenizers-0.22.2-cp39-abi3-win32.whl", hash = "sha256:a6bf3f88c554a2b653af81f3204491c818ae2ac6fbc09e76ef4773351292bc92", size = 2502363, upload_time = "2026-01-05T10:45:20.593Z" }, + { url = "https://files.pythonhosted.org/packages/65/71/0670843133a43d43070abeb1949abfdef12a86d490bea9cd9e18e37c5ff7/tokenizers-0.22.2-cp39-abi3-win_amd64.whl", hash = "sha256:c9ea31edff2968b44a88f97d784c2f16dc0729b8b143ed004699ebca91f05c48", size = 2747786, upload_time = "2026-01-05T10:45:18.411Z" }, + { url = "https://files.pythonhosted.org/packages/72/f4/0de46cfa12cdcbcd464cc59fde36912af405696f687e53a091fb432f694c/tokenizers-0.22.2-cp39-abi3-win_arm64.whl", hash = "sha256:9ce725d22864a1e965217204946f830c37876eee3b2ba6fc6255e8e903d5fcbc", size = 2612133, upload_time = "2026-01-05T10:45:17.232Z" }, +] + 
[[package]] name = "tqdm" version = "4.67.1" @@ -1163,6 +1370,33 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d0/30/dc54f88dd4a2b5dc8a0279bdd7270e735851848b762aeb1c1184ed1f6b14/tqdm-4.67.1-py3-none-any.whl", hash = "sha256:26445eca388f82e72884e0d580d5464cd801a3ea01e63e5601bdff9ba6a48de2", size = 78540, upload_time = "2024-11-24T20:12:19.698Z" }, ] +[[package]] +name = "typer" +version = "0.24.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "annotated-doc" }, + { name = "click" }, + { name = "rich" }, + { name = "shellingham" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/f5/24/cb09efec5cc954f7f9b930bf8279447d24618bb6758d4f6adf2574c41780/typer-0.24.1.tar.gz", hash = "sha256:e39b4732d65fbdcde189ae76cf7cd48aeae72919dea1fdfc16593be016256b45", size = 118613, upload_time = "2026-02-21T16:54:40.609Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/4a/91/48db081e7a63bb37284f9fbcefda7c44c277b18b0e13fbc36ea2335b71e6/typer-0.24.1-py3-none-any.whl", hash = "sha256:112c1f0ce578bfb4cab9ffdabc68f031416ebcc216536611ba21f04e9aa84c9e", size = 56085, upload_time = "2026-02-21T16:54:41.616Z" }, +] + +[[package]] +name = "typer-slim" +version = "0.24.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "typer" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/a7/a7/e6aecc4b4eb59598829a3b5076a93aff291b4fdaa2ded25efc4e1f4d219c/typer_slim-0.24.0.tar.gz", hash = "sha256:f0ed36127183f52ae6ced2ecb2521789995992c521a46083bfcdbb652d22ad34", size = 4776, upload_time = "2026-02-16T22:08:51.2Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a7/24/5480c20380dfd18cf33d14784096dca45a24eae6102e91d49a718d3b6855/typer_slim-0.24.0-py3-none-any.whl", hash = "sha256:d5d7ee1ee2834d5020c7c616ed5e0d0f29b9a4b1dd283bdebae198ec09778d0e", size = 3394, upload_time = "2026-02-16T22:08:49.92Z" }, +] + [[package]] name = "typing-extensions" version = "4.14.1"