Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 35 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# Developer task runner. Run `make help` to list targets; the `## ` trailer
# on each target line is parsed by the help recipe below.
.PHONY: dev run download install lint format check test smoke clean help

help: ## Show this help
	@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf " \033[36m%-15s\033[0m %s\n", $$1, $$2}'

install: ## Install dependencies (including dev)
	uv sync

download: ## Download model files
	bash scripts/download.sh

dev: ## Start dev server with auto-reload
	uv run uvicorn slm_server.app:app --reload --host 0.0.0.0 --port 8000

run: ## Start server via start.sh
	bash scripts/start.sh

lint: ## Run ruff linter
	uv run ruff check slm_server/

format: ## Run ruff formatter
	uv run ruff format slm_server/

# Depends on lint, then verifies formatting without rewriting files.
check: lint ## Run linter + formatter check
	uv run ruff format --check slm_server/

smoke: ## Smoke-test the running server APIs with curl
	bash scripts/smoke.sh

test: ## Run tests with coverage
	uv run pytest tests/ -v --cov=slm_server --cov-report=term-missing

clean: ## Remove caches and build artifacts
	rm -rf __pycache__ .pytest_cache .ruff_cache .coverage htmlcov build dist *.egg-info
	# find may exit non-zero while deleting dirs it is traversing; that is benign.
	find . -type d -name __pycache__ -exec rm -rf {} + 2>/dev/null || true
29 changes: 25 additions & 4 deletions deploy/helm/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -79,14 +79,35 @@ env: {}

# Resource requests and limits for the container.
# See https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/
# Tuned for Qwen3-0.6B-Q4_K_M (484 MB) + n_ctx=8192 KV cache (~448 MB) on 1-CPU / 1 GB VPS nodes.
#
# Memory budget breakdown (target node: 1-CPU / 2 GB VPS):
# Chat LLM – Qwen3-0.6B-Q4_K_M.gguf ~484 MB (4-bit quantised)
# Embedding – all-MiniLM-L6-v2 quint8 ONNX ~23 MB (uint8 AVX2 quantised)
# KV cache – n_ctx=2048 ~50-80 MB
# Runtime – Python, FastAPI, onnxruntime ~50-100 MB
# -------------------------------------------------------
# Total request: 550 Mi Hard limit: 1 Gi
#
# Why these models:
# - Qwen3-0.6B-Q4_K_M is the smallest instruction-tuned LLM that still
# supports function calling (chatml format) at usable quality.
# - all-MiniLM-L6-v2 (384-dim, 6-layer) is purpose-trained for sentence
# embeddings via mean pooling, ranking well on STS benchmarks for its
# size. The quint8 AVX2 variant keeps the file at 23 MB vs 90 MB fp32.
#
# Why the limit is reasonable:
# - The worker node (active-nerd-2) has 2 GiB total RAM shared with the
# OS and other pods. 550 Mi request leaves headroom; the 1 Gi hard
# limit prevents OOM-kill from bursty KV-cache growth.
# - MAX_CONCURRENCY=1 ensures only one inference runs at a time, so peak
# memory is predictable (no concurrent KV-cache allocations).
resources:
limits:
cpu: 1
cpu: 900m
memory: 1Gi
requests:
cpu: 200m
memory: 600Mi
cpu: 50m
memory: 550Mi

# Readiness and liveness probes configuration
probes:
Expand Down
2 changes: 2 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@ dependencies = [
"prometheus-client>=0.22.1",
"prometheus-fastapi-instrumentator>=7.1.0",
"psutil>=6.1.0",
"onnxruntime>=1.17.0",
"tokenizers>=0.21.0",
]

[tool.ruff.lint]
Expand Down
26 changes: 26 additions & 0 deletions scripts/download.sh
Original file line number Diff line number Diff line change
Expand Up @@ -36,4 +36,30 @@ for file in "${FILES_TO_DOWNLOAD[@]}"; do
fi
done

# --- Embedding model: all-MiniLM-L6-v2 (ONNX, quantized UINT8 for AVX2) ---
EMBEDDING_REPO_URL="https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2"
EMBEDDING_MODEL_DIR="$MODEL_DIR/all-MiniLM-L6-v2"

mkdir -p "$EMBEDDING_MODEL_DIR/onnx"

EMBEDDING_FILES=(
  "onnx/model_quint8_avx2.onnx"
  "tokenizer.json"
)

echo "Downloading all-MiniLM-L6-v2 ONNX embedding model..."

for file in "${EMBEDDING_FILES[@]}"; do
  dest="$EMBEDDING_MODEL_DIR/$file"
  url="$EMBEDDING_REPO_URL/resolve/main/$file"
  if [ -f "$dest" ]; then
    echo "$file already exists, skipping download."
    continue
  fi
  echo "Downloading $file..."
  # Download to a temp path first: `wget -O` creates the output file even on
  # failure, and a partial/empty file at $dest would be skipped as "already
  # exists" on the next run. Only move into place after a successful transfer.
  # curl needs -f so HTTP errors (e.g. 404 pages) fail instead of being saved.
  tmp="$dest.part"
  if wget -O "$tmp" "$url" || {
    echo "Failed to download $file with wget, trying curl..."
    curl -fL -o "$tmp" "$url"
  }; then
    mv "$tmp" "$dest"
  else
    rm -f "$tmp"
    echo "ERROR: failed to download $file" >&2
    exit 1
  fi
done

echo "Download process complete! Files are in $MODEL_DIR"
50 changes: 50 additions & 0 deletions scripts/smoke.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
#!/bin/bash
#
# Smoke-test a running slm-server instance over HTTP.
# Usage: [BASE_URL=http://host:port] bash scripts/smoke.sh
#
# -e: abort on the first failed command; -u: error on unset variables;
# -o pipefail: without it, a failing `curl` piped into `python3 -m json.tool`
# would be masked by the pipeline's last command succeeding.
set -euo pipefail

BASE_URL="${BASE_URL:-http://localhost:8000}"

echo "=== Health check ==="
curl -sf "$BASE_URL/health"
echo

echo "=== List models ==="
curl -sf "$BASE_URL/api/v1/models" | python3 -m json.tool
echo

echo "=== Chat completion ==="
curl -sf "$BASE_URL/api/v1/chat/completions" \
  -H "Content-Type: application/json" \
  -d '{
    "messages": [{"role": "user", "content": "Say hello in one sentence."}],
    "max_tokens": 64
  }' | python3 -m json.tool
echo

echo "=== Chat completion (streaming) ==="
# -N disables curl's output buffering so server-sent chunks appear as they arrive.
curl -sfN "$BASE_URL/api/v1/chat/completions" \
  -H "Content-Type: application/json" \
  -d '{
    "messages": [{"role": "user", "content": "What is 2+2?"}],
    "max_tokens": 32,
    "stream": true
  }'
echo

echo "=== Embeddings (single) ==="
curl -sf "$BASE_URL/api/v1/embeddings" \
  -H "Content-Type: application/json" \
  -d '{
    "input": "Hello world"
  }' | python3 -m json.tool
echo

echo "=== Embeddings (batch) ==="
curl -sf "$BASE_URL/api/v1/embeddings" \
  -H "Content-Type: application/json" \
  -d '{
    "input": ["The cat sat on the mat.", "A dog played in the park."]
  }' | python3 -m json.tool
echo

echo "All smoke tests passed."
65 changes: 44 additions & 21 deletions slm_server/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,14 @@
from llama_cpp import CreateChatCompletionStreamResponse, Llama

from slm_server.config import Settings, get_settings
from slm_server.embedding import OnnxEmbeddingModel
from slm_server.logging import setup_logging
from slm_server.metrics import setup_metrics
from slm_server.model import (
ChatCompletionRequest,
EmbeddingData,
EmbeddingRequest,
EmbeddingResponse,
ModelInfo,
ModelListResponse,
)
Expand Down Expand Up @@ -62,13 +65,21 @@ def get_llm(settings: Annotated[Settings, Depends(get_settings)]) -> Llama:
seed=settings.seed,
chat_format=CHAT_FORMAT,
logits_all=False,
embedding=True,
use_mlock=True, # Use mlock to prevent memory swapping
use_mmap=True, # Use memory-mapped files for faster access
embedding=False,
use_mlock=True,
use_mmap=True,
)
return get_llm._instance


def get_embedding_model(
    settings: Annotated[Settings, Depends(get_settings)],
) -> OnnxEmbeddingModel:
    """FastAPI dependency returning the process-wide embedding model.

    The OnnxEmbeddingModel is constructed lazily on first use from
    ``settings.embedding`` and cached as a function attribute, mirroring
    the singleton pattern used by ``get_llm``.
    """
    # Lazy singleton: build once, reuse for every subsequent request.
    # NOTE(review): not guarded by a lock — two concurrent first requests
    # could construct the model twice; confirm this is acceptable under the
    # server's concurrency settings.
    if not hasattr(get_embedding_model, "_instance"):
        get_embedding_model._instance = OnnxEmbeddingModel(settings.embedding)
    return get_embedding_model._instance


def get_app() -> FastAPI:
# Get settings when creating app.
settings = get_settings()
Expand Down Expand Up @@ -176,41 +187,53 @@ async def create_chat_completion(
@app.post("/api/v1/embeddings")
async def create_embeddings(
    req: EmbeddingRequest,
    emb_model: Annotated[OnnxEmbeddingModel, Depends(get_embedding_model)],
    _: Annotated[None, Depends(lock_llm_semaphor)],
    __: Annotated[None, Depends(raise_as_http_exception)],
):
    """Create embeddings using the dedicated ONNX embedding model.

    Accepts an OpenAI-style request whose ``input`` is a single string or a
    list of strings, and returns an OpenAI-compatible EmbeddingResponse with
    one vector per input, indexed in request order.
    """
    with slm_embedding_span(req) as span:
        # Normalize the OpenAI-style input (str | list[str]) to a list.
        inputs = req.input if isinstance(req.input, list) else [req.input]
        # Encoding is CPU-bound; run it off the event loop. The trailing
        # positional True is forwarded to OnnxEmbeddingModel.encode —
        # NOTE(review): presumably a normalize flag; confirm the parameter
        # name in slm_server.embedding and pass it as a keyword for clarity.
        vectors = await asyncio.to_thread(emb_model.encode, inputs, True)
        result = EmbeddingResponse(
            data=[
                EmbeddingData(embedding=vec.tolist(), index=i)
                for i, vec in enumerate(vectors)
            ],
            model=emb_model.model_id,
        )
        set_attribute_response_embedding(span, result)
        return result


@app.get("/api/v1/models", response_model=ModelListResponse)
async def list_models(
    settings: Annotated[Settings, Depends(get_settings)],
) -> ModelListResponse:
    """List available models (OpenAI-compatible).

    Returns two entries: the loaded chat LLM and the ONNX embedding model.
    The ``created`` field is the model file's mtime, or 0 if unavailable.
    """

    def _mtime_or_zero(path: str) -> int:
        # Shared helper for both model entries; 0 signals "file missing".
        try:
            return int(Path(path).stat().st_mtime)
        except (OSError, ValueError):
            return 0

    return ModelListResponse(
        object="list",
        data=[
            ModelInfo(
                id=Path(settings.model_path).stem,
                created=_mtime_or_zero(settings.model_path),
                owned_by=settings.model_owner,
            ),
            ModelInfo(
                id=settings.embedding.model_id,
                created=_mtime_or_zero(settings.embedding.onnx_path),
                owned_by="sentence-transformers",
            ),
        ],
    )

Expand Down
29 changes: 29 additions & 0 deletions slm_server/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,34 @@
MODEL_PATH_DEFAULT = str(MODELS_DIR / "Qwen3-0.6B-Q4_K_M.gguf")
MODEL_OWNER_DEFAULT = "second-state"

# Default on-disk locations of the all-MiniLM-L6-v2 embedding assets,
# matching the directory layout produced by scripts/download.sh.
EMBEDDING_TOKENIZER_PATH_DEFAULT = str(
    MODELS_DIR / "all-MiniLM-L6-v2" / "tokenizer.json"
)
EMBEDDING_ONNX_PATH_DEFAULT = str(
    MODELS_DIR / "all-MiniLM-L6-v2" / "onnx" / "model_quint8_avx2.onnx"
)


class EmbeddingSettings(BaseModel):
    """Configuration for the ONNX sentence-embedding model.

    Consumed by OnnxEmbeddingModel; Field descriptions double as
    configuration documentation.
    """

    # Identifier reported in /api/v1/models and embedding responses.
    model_id: str = Field(
        "all-MiniLM-L6-v2",
        description="Model identifier returned in API responses.",
    )
    tokenizer_path: str = Field(
        EMBEDDING_TOKENIZER_PATH_DEFAULT,
        description="Full path to the tokenizer.json file.",
    )
    onnx_path: str = Field(
        EMBEDDING_ONNX_PATH_DEFAULT,
        description="Full path to the ONNX model file.",
    )
    max_length: int = Field(
        256,
        description="Maximum token sequence length for the tokenizer. "
        "all-MiniLM-L6-v2 was trained with 256; increase only if "
        "swapping to a model that supports longer sequences.",
    )


class LoggingSettings(BaseModel):
verbose: bool = Field(True, description="If logging to stdout by cpp llama")
Expand Down Expand Up @@ -75,6 +103,7 @@ class Settings(BaseSettings):
1, description="Seconds to wait if undergoing another inference."
)

embedding: EmbeddingSettings = Field(default_factory=EmbeddingSettings)
logging: LoggingSettings = Field(default_factory=LoggingSettings)
metrics: MetricsSettings = Field(default_factory=MetricsSettings)
tracing: TraceSettings = Field(default_factory=TraceSettings)
Expand Down
Loading