initial: ollama-compatible facade for claude -p

2026-04-26 14:49:44 +10:00
commit a3be103232
4 changed files with 290 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,4 @@
 venv/
 __pycache__/
 *.pyc
 config.yaml.bak.*
--- a/app.py
+++ b/app.py
@@ -0,0 +1,258 @@
 """
 claude-code-proxy: Ollama-compatible HTTP facade for `claude -p`.
 Exposes a subset of the Ollama API on http://127.0.0.1:11435 and translates
 each request into a `claude -p` subprocess invocation. This lets external
 tools that already speak Ollama (Open WebUI, AnythingLLM, n8n nodes, etc.)
 talk to Claude Code instead of a local Ollama instance.
 Endpoints:
    GET  /                  health check
    GET  /api/version       Ollama version stub
    GET  /api/tags          list "models" (so clients can validate)
    POST /api/show          model details stub
    POST /api/generate      single-shot prompt -> response
    POST /api/chat          multi-message conversation -> response
 Both /api/generate and /api/chat honour the `stream` flag in the request
 body (Ollama default is True). When true, responses are emitted as
 NDJSON chunks; when false, a single JSON object is returned.
 Environment variables:
    CLAUDE_BIN                  path to claude CLI (default: "claude")
    CLAUDE_PROXY_CONCURRENCY    max concurrent claude subprocesses (default: 3)
    CLAUDE_PROXY_MODEL          name advertised in /api/tags (default: "claude-code")
    CLAUDE_PROXY_TIMEOUT        per-request timeout in seconds (default: 300)
    CLAUDE_CODE_OAUTH_TOKEN     long-lived auth token, inherited by claude subprocess
 """
 from __future__ import annotations
 import asyncio
 import json
 import logging
 import os
 import time
 from datetime import datetime, timezone
 from typing import Any, AsyncIterator
 from fastapi import FastAPI, Request
 from fastapi.responses import JSONResponse, StreamingResponse
 # --- Configuration ----------------------------------------------------------
 # Path (or bare name resolved via PATH) of the claude CLI binary.
 CLAUDE_BIN = os.getenv("CLAUDE_BIN", "claude")
 # Upper bound on simultaneously running claude subprocesses.
 CONCURRENCY = int(os.getenv("CLAUDE_PROXY_CONCURRENCY", "3"))
 # Model name reported to clients and used when a request names none.
 DEFAULT_MODEL = os.getenv("CLAUDE_PROXY_MODEL", "claude-code")
 # Seconds to wait for a single claude invocation before giving up.
 TIMEOUT_SECONDS = int(os.getenv("CLAUDE_PROXY_TIMEOUT", "300"))
 logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s %(message)s",
 )
 log = logging.getLogger("claude-proxy")
 app = FastAPI(title="claude-code-proxy")
 # Shared by every request handler to enforce the CONCURRENCY cap.
 _semaphore = asyncio.Semaphore(CONCURRENCY)
 # --- Helpers ----------------------------------------------------------------
 def _now_iso() -> str:
    """Return the current UTC time as an ISO-8601 string ending in ``Z``."""
    utc_now = datetime.now(timezone.utc)
    stamp = utc_now.isoformat()
    # isoformat() spells the UTC offset as "+00:00"; emit "Z" instead.
    return stamp.replace("+00:00", "Z")
 async def _run_claude(prompt: str) -> str:
    """Run `claude -p <prompt>` and return stdout as a string.

    The call holds the module-level semaphore for its whole duration, so the
    number of live subprocesses never exceeds the configured concurrency.
    The wait for the subprocess is bounded by TIMEOUT_SECONDS.

    Args:
        prompt: Text passed as the argument following ``-p``.

    Returns:
        The subprocess's stdout decoded as UTF-8 (undecodable bytes replaced).

    Raises:
        RuntimeError: If the subprocess times out or exits with a non-zero
            status (the message carries up to 1000 characters of stderr).
    """
    async with _semaphore:
        log.info("claude -p invoked (prompt %d chars)", len(prompt))
        proc = await asyncio.create_subprocess_exec(
            CLAUDE_BIN, "-p", prompt,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
            env=os.environ.copy(),
        )
        try:
            stdout, stderr = await asyncio.wait_for(
                proc.communicate(), timeout=TIMEOUT_SECONDS
            )
        except asyncio.TimeoutError:
            raise RuntimeError(f"claude -p timed out after {TIMEOUT_SECONDS}s")
        finally:
            # Reap the child on every exit path, not just on timeout: a
            # cancelled request or any other error while waiting would
            # otherwise leave the subprocess running.
            if proc.returncode is None:
                try:
                    proc.kill()
                except ProcessLookupError:
                    # The process exited between the check and the kill.
                    pass
                await proc.wait()
        if proc.returncode != 0:
            err = stderr.decode("utf-8", errors="replace")[:1000]
            raise RuntimeError(f"claude -p exited {proc.returncode}: {err}")
        return stdout.decode("utf-8", errors="replace")
 def _build_prompt_from_messages(messages: list[dict]) -> str:
    """Flatten OpenAI/Ollama-style messages into a single prompt string.

    System messages are gathered into a leading ``[System]`` section. Every
    other message becomes a ``User: ...`` line (role ``user``, or no role) or
    an ``Assistant: ...`` line (any other role). A trailing ``Assistant:``
    cue is always appended.

    Args:
        messages: Message dicts with optional ``role`` and ``content`` keys.
            ``None`` is treated as an empty list.

    Returns:
        The flattened prompt. A missing or ``None`` content contributes an
        empty string; non-string content is rendered with ``str()``.
    """
    def _text(message: dict) -> str:
        # Normalise content so absent/null values never raise or print "None".
        content = message.get("content")
        if content is None:
            return ""
        return content if isinstance(content, str) else str(content)

    system_parts: list[str] = []
    convo: list[str] = []
    for m in messages or []:
        role = m.get("role", "user")
        if role == "system":
            system_parts.append(_text(m))
            continue
        prefix = "User" if role == "user" else "Assistant"
        convo.append(f"{prefix}: {_text(m)}")
    # Cue for the reply that is about to be generated.
    convo.append("Assistant:")
    body = "\n\n".join(convo)
    if system_parts:
        return "[System]\n" + "\n\n".join(system_parts) + "\n\n" + body
    return body
 # --- Streaming generators ---------------------------------------------------
 async def _stream_generate(base: dict, text: str) -> AsyncIterator[bytes]:
    """Emit Ollama-style NDJSON for /api/generate: incremental chunks then done.

    The already-complete ``text`` is cut into 64-character slices. Each slice
    is sent as one ``done: false`` frame, followed by a final ``done: true``
    frame with an empty ``response``.

    Args:
        base: Fields copied into every frame.
        text: The full response text to chunk.

    Yields:
        UTF-8 encoded JSON documents, one per line.

    Note:
        ``total_duration`` (nanoseconds) covers only the time spent emitting
        frames, because the clock starts when this generator starts.
    """
    chunk_size = 64
    # Monotonic clock: unaffected by wall-clock adjustments, so the reported
    # duration can never be negative.
    started_ns = time.monotonic_ns()
    for i in range(0, len(text), chunk_size):
        frame = {**base, "response": text[i:i + chunk_size], "done": False}
        yield (json.dumps(frame) + "\n").encode("utf-8")
        # Hand control back to the event loop between frames.
        await asyncio.sleep(0)
    final = {
        **base,
        "response": "",
        "done": True,
        "done_reason": "stop",
        "total_duration": time.monotonic_ns() - started_ns,
    }
    yield (json.dumps(final) + "\n").encode("utf-8")
 async def _stream_chat(base: dict, text: str) -> AsyncIterator[bytes]:
    """Emit Ollama-style NDJSON for /api/chat: each frame carries a message.

    The already-complete ``text`` is cut into 64-character slices. Each slice
    is sent as the content of an assistant message in a ``done: false``
    frame, followed by a final ``done: true`` frame with empty content.

    Args:
        base: Fields copied into every frame.
        text: The full assistant reply to chunk.

    Yields:
        UTF-8 encoded JSON documents, one per line.

    Note:
        ``total_duration`` (nanoseconds) covers only the time spent emitting
        frames, because the clock starts when this generator starts.
    """
    chunk_size = 64
    # Monotonic clock: unaffected by wall-clock adjustments, so the reported
    # duration can never be negative.
    started_ns = time.monotonic_ns()
    for i in range(0, len(text), chunk_size):
        frame = {
            **base,
            "message": {"role": "assistant", "content": text[i:i + chunk_size]},
            "done": False,
        }
        yield (json.dumps(frame) + "\n").encode("utf-8")
        # Hand control back to the event loop between frames.
        await asyncio.sleep(0)
    final = {
        **base,
        "message": {"role": "assistant", "content": ""},
        "done": True,
        "done_reason": "stop",
        "total_duration": time.monotonic_ns() - started_ns,
    }
    yield (json.dumps(final) + "\n").encode("utf-8")
 # --- Routes -----------------------------------------------------------------
@app.get("/")
 async def root() -> dict:
    return {"status": "ok", "service": "claude-code-proxy"}
@app.get("/api/version")
 async def version() -> dict:
    return {"version": "0.1.0-claude-proxy"}
@app.get("/api/tags")
 async def tags() -> dict:
    """Ollama-style model list. Many clients hit this to verify the endpoint."""
    return {
        "models": [{
            "name": DEFAULT_MODEL,
            "model": DEFAULT_MODEL,
            "modified_at": _now_iso(),
            "size": 0,
            "digest": "sha256:claude-code",
            "details": {
                "parent_model": "",
                "format": "claude",
                "family": "claude",
                "families": ["claude"],
                "parameter_size": "unknown",
                "quantization_level": "none",
            },
        }]
    }
@app.post("/api/show")
 async def show(req: Request) -> dict:
    body = await req.json()
    name = body.get("name", DEFAULT_MODEL)
    return {
        "modelfile": f"FROM {name}",
        "parameters": "",
        "template": "",
        "details": {
            "format": "claude",
            "family": "claude",
            "parameter_size": "unknown",
            "quantization_level": "none",
        },
    }
@app.post("/api/generate")
 async def generate(req: Request) -> Any:
    body = await req.json()
    model = body.get("model", DEFAULT_MODEL)
    prompt = body.get("prompt", "")
    system = body.get("system")
    stream = bool(body.get("stream", True))
    full_prompt = f"[System]\n{system}\n\n{prompt}" if system else prompt
    started = time.time()
    try:
        text = await _run_claude(full_prompt)
    except Exception as e:
        log.exception("claude invocation failed")
        return JSONResponse({"error": str(e)}, status_code=500)
    base = {"model": model, "created_at": _now_iso()}
    if stream:
        return StreamingResponse(
            _stream_generate(base, text),
            media_type="application/x-ndjson",
        )
    return {
        **base,
        "response": text,
        "done": True,
        "done_reason": "stop",
        "total_duration": int((time.time() - started) * 1e9),
    }
@app.post("/api/chat")
 async def chat(req: Request) -> Any:
    body = await req.json()
    model = body.get("model", DEFAULT_MODEL)
    messages = body.get("messages", [])
    stream = bool(body.get("stream", True))
    prompt = _build_prompt_from_messages(messages)
    started = time.time()
    try:
        text = await _run_claude(prompt)
    except Exception as e:
        log.exception("claude invocation failed")
        return JSONResponse({"error": str(e)}, status_code=500)
    base = {"model": model, "created_at": _now_iso()}
    if stream:
        return StreamingResponse(
            _stream_chat(base, text),
            media_type="application/x-ndjson",
        )
    return {
        **base,
        "message": {"role": "assistant", "content": text},
        "done": True,
        "done_reason": "stop",
        "total_duration": int((time.time() - started) * 1e9),
    }
--- a/claude-code-proxy.service
+++ b/claude-code-proxy.service
@@ -0,0 +1,25 @@
 [Unit]
 Description=claude-code-proxy: Ollama-compatible HTTP facade for `claude -p`
 Documentation=file:///home/help4bis/claude-proxy/README.md
 After=network-online.target
 Wants=network-online.target
 [Service]
 Type=exec
 User=help4bis
 Group=help4bis
 WorkingDirectory=/home/help4bis/claude-proxy
 Environment=HOME=/home/help4bis
 Environment=PATH=/home/help4bis/.local/bin:/home/help4bis/claude-proxy/venv/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
 Environment=CLAUDE_PROXY_CONCURRENCY=3
 ExecStart=/home/help4bis/claude-proxy/venv/bin/uvicorn app:app --host 127.0.0.1 --port 11435 --workers 1 --log-level info
 Restart=always
 RestartSec=5s
 # Hardening — kept minimal because the claude CLI needs free access to $HOME
 # for session state and auth cache (~/.claude, ~/.local/state/claude, ~/.cache/claude),
 # and the $HOME path is an /mnt bind mount, which doesn't play well with ProtectHome.
 NoNewPrivileges=true
 [Install]
 WantedBy=multi-user.target
--- a/requirements.txt
+++ b/requirements.txt
@@ -0,0 +1,3 @@
 fastapi>=0.115,<0.116
 uvicorn[standard]>=0.34,<0.35
 pydantic>=2.10,<3