Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 35 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# Developer task runner. Run `make help` to list targets; the `## ` trailer
# on each target line is parsed by the help recipe below.
.PHONY: dev run download install lint format check test smoke clean help

help: ## Show this help
	@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf " \033[36m%-15s\033[0m %s\n", $$1, $$2}'

install: ## Install dependencies (including dev)
	uv sync

download: ## Download model files
	bash scripts/download.sh

dev: ## Start dev server with auto-reload
	uv run uvicorn slm_server.app:app --reload --host 0.0.0.0 --port 8000

run: ## Start server via start.sh
	bash scripts/start.sh

lint: ## Run ruff linter
	uv run ruff check slm_server/

format: ## Run ruff formatter
	uv run ruff format slm_server/

# Depends on lint, then verifies formatting without rewriting files.
check: lint ## Run linter + formatter check
	uv run ruff format --check slm_server/

smoke: ## Smoke-test the running server APIs with curl
	bash scripts/smoke.sh

test: ## Run tests with coverage
	uv run pytest tests/ -v --cov=slm_server --cov-report=term-missing

clean: ## Remove caches and build artifacts
	rm -rf __pycache__ .pytest_cache .ruff_cache .coverage htmlcov build dist *.egg-info
	# find may exit non-zero while deleting dirs it is traversing; that is benign.
	find . -type d -name __pycache__ -exec rm -rf {} + 2>/dev/null || true
29 changes: 25 additions & 4 deletions deploy/helm/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -79,14 +79,35 @@ env: {}

# Resource requests and limits for the container.
# See https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/
# Tuned for Qwen3-0.6B-Q4_K_M (484 MB) + n_ctx=8192 KV cache (~448 MB) on 1-CPU / 1 GB VPS nodes.
#
# Memory budget breakdown (target node: 1-CPU / 2 GB VPS):
# Chat LLM – Qwen3-0.6B-Q4_K_M.gguf ~484 MB (4-bit quantised)
# Embedding – all-MiniLM-L6-v2 quint8 ONNX ~23 MB (uint8 AVX2 quantised)
# KV cache – n_ctx=2048 ~50-80 MB
# Runtime – Python, FastAPI, onnxruntime ~50-100 MB
# -------------------------------------------------------
# Total request: 550 Mi Hard limit: 1 Gi
#
# Why these models:
# - Qwen3-0.6B-Q4_K_M is the smallest instruction-tuned LLM that still
# supports function calling (chatml format) at usable quality.
# - all-MiniLM-L6-v2 (384-dim, 6-layer) is purpose-trained for sentence
# embeddings via mean pooling, ranking well on STS benchmarks for its
# size. The quint8 AVX2 variant keeps the file at 23 MB vs 90 MB fp32.
#
# Why the limit is reasonable:
# - The worker node (active-nerd-2) has 2 GiB total RAM shared with the
# OS and other pods. 550 Mi request leaves headroom; the 1 Gi hard
# limit prevents OOM-kill from bursty KV-cache growth.
# - MAX_CONCURRENCY=1 ensures only one inference runs at a time, so peak
# memory is predictable (no concurrent KV-cache allocations).
resources:
limits:
cpu: 1
cpu: 900m
memory: 1Gi
requests:
cpu: 200m
memory: 600Mi
cpu: 50m
memory: 550Mi

# Readiness and liveness probes configuration
probes:
Expand Down
2 changes: 2 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@ dependencies = [
"prometheus-client>=0.22.1",
"prometheus-fastapi-instrumentator>=7.1.0",
"psutil>=6.1.0",
"onnxruntime>=1.17.0",
"tokenizers>=0.21.0",
]

[tool.ruff.lint]
Expand Down
26 changes: 26 additions & 0 deletions scripts/download.sh
Original file line number Diff line number Diff line change
Expand Up @@ -36,4 +36,30 @@ for file in "${FILES_TO_DOWNLOAD[@]}"; do
fi
done

# --- Embedding model: all-MiniLM-L6-v2 (ONNX, quantized UINT8 for AVX2) ---
EMBEDDING_REPO_URL="https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2"
EMBEDDING_MODEL_DIR="$MODEL_DIR/all-MiniLM-L6-v2"

mkdir -p "$EMBEDDING_MODEL_DIR/onnx"

EMBEDDING_FILES=(
  "onnx/model_quint8_avx2.onnx"
  "tokenizer.json"
)

echo "Downloading all-MiniLM-L6-v2 ONNX embedding model..."

for file in "${EMBEDDING_FILES[@]}"; do
  dest="$EMBEDDING_MODEL_DIR/$file"
  url="$EMBEDDING_REPO_URL/resolve/main/$file"
  if [ -f "$dest" ]; then
    echo "$file already exists, skipping download."
    continue
  fi
  echo "Downloading $file..."
  # Download to a temp path first: `wget -O` creates the output file even on
  # failure, and a partial/empty file at $dest would be skipped as "already
  # exists" on the next run. Only move into place after a successful transfer.
  # curl needs -f so HTTP errors (e.g. 404 pages) fail instead of being saved.
  tmp="$dest.part"
  if wget -O "$tmp" "$url" || {
    echo "Failed to download $file with wget, trying curl..."
    curl -fL -o "$tmp" "$url"
  }; then
    mv "$tmp" "$dest"
  else
    rm -f "$tmp"
    echo "ERROR: failed to download $file" >&2
    exit 1
  fi
done

echo "Download process complete! Files are in $MODEL_DIR"
50 changes: 50 additions & 0 deletions scripts/smoke.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
#!/bin/bash
#
# Smoke-test a running slm-server instance over HTTP.
# Usage: [BASE_URL=http://host:port] bash scripts/smoke.sh
#
# -e: abort on the first failed command; -u: error on unset variables;
# -o pipefail: without it, a failing `curl` piped into `python3 -m json.tool`
# would be masked by the pipeline's last command succeeding.
set -euo pipefail

BASE_URL="${BASE_URL:-http://localhost:8000}"

echo "=== Health check ==="
curl -sf "$BASE_URL/health"
echo

echo "=== List models ==="
curl -sf "$BASE_URL/api/v1/models" | python3 -m json.tool
echo

echo "=== Chat completion ==="
curl -sf "$BASE_URL/api/v1/chat/completions" \
  -H "Content-Type: application/json" \
  -d '{
    "messages": [{"role": "user", "content": "Say hello in one sentence."}],
    "max_tokens": 64
  }' | python3 -m json.tool
echo

echo "=== Chat completion (streaming) ==="
# -N disables curl's output buffering so server-sent chunks appear as they arrive.
curl -sfN "$BASE_URL/api/v1/chat/completions" \
  -H "Content-Type: application/json" \
  -d '{
    "messages": [{"role": "user", "content": "What is 2+2?"}],
    "max_tokens": 32,
    "stream": true
  }'
echo

echo "=== Embeddings (single) ==="
curl -sf "$BASE_URL/api/v1/embeddings" \
  -H "Content-Type: application/json" \
  -d '{
    "input": "Hello world"
  }' | python3 -m json.tool
echo

echo "=== Embeddings (batch) ==="
curl -sf "$BASE_URL/api/v1/embeddings" \
  -H "Content-Type: application/json" \
  -d '{
    "input": ["The cat sat on the mat.", "A dog played in the park."]
  }' | python3 -m json.tool
echo

echo "All smoke tests passed."
65 changes: 44 additions & 21 deletions slm_server/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,14 @@
from llama_cpp import CreateChatCompletionStreamResponse, Llama

from slm_server.config import Settings, get_settings
from slm_server.embedding import OnnxEmbeddingModel
from slm_server.logging import setup_logging
from slm_server.metrics import setup_metrics
from slm_server.model import (
ChatCompletionRequest,
EmbeddingData,
EmbeddingRequest,
EmbeddingResponse,
ModelInfo,
ModelListResponse,
)
Expand Down Expand Up @@ -62,13 +65,21 @@ def get_llm(settings: Annotated[Settings, Depends(get_settings)]) -> Llama:
seed=settings.seed,
chat_format=CHAT_FORMAT,
logits_all=False,
embedding=True,
use_mlock=True, # Use mlock to prevent memory swapping
use_mmap=True, # Use memory-mapped files for faster access
embedding=False,
use_mlock=True,
use_mmap=True,
)
return get_llm._instance


def get_embedding_model(
    settings: Annotated[Settings, Depends(get_settings)],
) -> OnnxEmbeddingModel:
    """FastAPI dependency returning the process-wide embedding model.

    The OnnxEmbeddingModel is constructed lazily on first use from
    ``settings.embedding`` and cached as a function attribute, mirroring
    the singleton pattern used by ``get_llm``.
    """
    # Lazy singleton: build once, reuse for every subsequent request.
    # NOTE(review): not guarded by a lock — two concurrent first requests
    # could construct the model twice; confirm this is acceptable under the
    # server's concurrency settings.
    if not hasattr(get_embedding_model, "_instance"):
        get_embedding_model._instance = OnnxEmbeddingModel(settings.embedding)
    return get_embedding_model._instance


def get_app() -> FastAPI:
# Get settings when creating app.
settings = get_settings()
Expand Down Expand Up @@ -176,41 +187,53 @@ async def create_chat_completion(
@app.post("/api/v1/embeddings")
async def create_embeddings(
    req: EmbeddingRequest,
    emb_model: Annotated[OnnxEmbeddingModel, Depends(get_embedding_model)],
    _: Annotated[None, Depends(lock_llm_semaphor)],
    __: Annotated[None, Depends(raise_as_http_exception)],
):
    """Create embeddings using the dedicated ONNX embedding model.

    Accepts an OpenAI-style request whose ``input`` is a single string or a
    list of strings, and returns an OpenAI-compatible EmbeddingResponse with
    one vector per input, indexed in request order.
    """
    with slm_embedding_span(req) as span:
        # Normalize the OpenAI-style input (str | list[str]) to a list.
        inputs = req.input if isinstance(req.input, list) else [req.input]
        # Encoding is CPU-bound; run it off the event loop. The trailing
        # positional True is forwarded to OnnxEmbeddingModel.encode —
        # NOTE(review): presumably a normalize flag; confirm the parameter
        # name in slm_server.embedding and pass it as a keyword for clarity.
        vectors = await asyncio.to_thread(emb_model.encode, inputs, True)
        result = EmbeddingResponse(
            data=[
                EmbeddingData(embedding=vec.tolist(), index=i)
                for i, vec in enumerate(vectors)
            ],
            model=emb_model.model_id,
        )
        set_attribute_response_embedding(span, result)
        return result


@app.get("/api/v1/models", response_model=ModelListResponse)
async def list_models(
    settings: Annotated[Settings, Depends(get_settings)],
) -> ModelListResponse:
    """List available models (OpenAI-compatible).

    Returns two entries: the loaded chat LLM and the ONNX embedding model.
    The ``created`` field is the model file's mtime, or 0 if unavailable.
    """

    def _mtime_or_zero(path: str) -> int:
        # Shared helper for both model entries; 0 signals "file missing".
        try:
            return int(Path(path).stat().st_mtime)
        except (OSError, ValueError):
            return 0

    return ModelListResponse(
        object="list",
        data=[
            ModelInfo(
                id=Path(settings.model_path).stem,
                created=_mtime_or_zero(settings.model_path),
                owned_by=settings.model_owner,
            ),
            ModelInfo(
                id=settings.embedding.model_id,
                created=_mtime_or_zero(settings.embedding.onnx_path),
                owned_by="sentence-transformers",
            ),
        ],
    )

Expand Down
29 changes: 29 additions & 0 deletions slm_server/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,34 @@
MODEL_PATH_DEFAULT = str(MODELS_DIR / "Qwen3-0.6B-Q4_K_M.gguf")
MODEL_OWNER_DEFAULT = "second-state"

# Default on-disk locations of the all-MiniLM-L6-v2 embedding assets,
# matching the directory layout produced by scripts/download.sh.
EMBEDDING_TOKENIZER_PATH_DEFAULT = str(
    MODELS_DIR / "all-MiniLM-L6-v2" / "tokenizer.json"
)
EMBEDDING_ONNX_PATH_DEFAULT = str(
    MODELS_DIR / "all-MiniLM-L6-v2" / "onnx" / "model_quint8_avx2.onnx"
)


class EmbeddingSettings(BaseModel):
    """Configuration for the ONNX sentence-embedding model.

    Consumed by OnnxEmbeddingModel; Field descriptions double as
    configuration documentation.
    """

    # Identifier reported in /api/v1/models and embedding responses.
    model_id: str = Field(
        "all-MiniLM-L6-v2",
        description="Model identifier returned in API responses.",
    )
    tokenizer_path: str = Field(
        EMBEDDING_TOKENIZER_PATH_DEFAULT,
        description="Full path to the tokenizer.json file.",
    )
    onnx_path: str = Field(
        EMBEDDING_ONNX_PATH_DEFAULT,
        description="Full path to the ONNX model file.",
    )
    max_length: int = Field(
        256,
        description="Maximum token sequence length for the tokenizer. "
        "all-MiniLM-L6-v2 was trained with 256; increase only if "
        "swapping to a model that supports longer sequences.",
    )


class LoggingSettings(BaseModel):
verbose: bool = Field(True, description="If logging to stdout by cpp llama")
Expand Down Expand Up @@ -75,6 +103,7 @@ class Settings(BaseSettings):
1, description="Seconds to wait if undergoing another inference."
)

embedding: EmbeddingSettings = Field(default_factory=EmbeddingSettings)
logging: LoggingSettings = Field(default_factory=LoggingSettings)
metrics: MetricsSettings = Field(default_factory=MetricsSettings)
tracing: TraceSettings = Field(default_factory=TraceSettings)
Expand Down
Loading