diff --git a/docker-compose.yml b/docker-compose.yml
index c0dcf3a..1b320ef 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -46,26 +46,28 @@ services:
       resources:
         limits:
           cpus: '2.0'
-          memory: 8G
+          memory: 6G
         reservations:
           cpus: '1.0'
           memory: 4G
     environment:
       - OLLAMA_HOST=0.0.0.0
       - OLLAMA_ORIGINS=*
-      - OLLAMA_NUM_PARALLEL=2
+      - OLLAMA_NUM_PARALLEL=1
       - OLLAMA_NUM_GPU=0
       - OLLAMA_KEEP_ALIVE=86400
       - OLLAMA_MODEL_TIMEOUT=0
       - OLLAMA_MAX_LOADED_MODELS=1
+      - OLLAMA_FLASH_ATTENTION=0
+      - OLLAMA_LLM_LIBRARY=auto
     healthcheck:
       test: ["CMD", "ollama", "list"]
       interval: 30s
       timeout: 10s
       retries: 5
       start_period: 120s
-    # Предзагружаем модель при запуске контейнера
-    entrypoint: ["/bin/sh", "-c", "ollama serve & sleep 15 && ollama run qwen2.5:7b 'test' && tail -f /dev/null"]
+    # Предзагружаем модель при запуске контейнера с keepalive
+    entrypoint: ["/bin/sh", "-c", "ollama serve & sleep 15 && ollama run --keepalive 24h qwen2.5:7b 'test' && tail -f /dev/null"]
   vector-search:
     build:
       context: ./vector-search
diff --git a/vector-search/app.py b/vector-search/app.py
index 8d52827..eb235dc 100644
--- a/vector-search/app.py
+++ b/vector-search/app.py
@@ -51,7 +51,7 @@ def get_embedding(text: str) -> list:
     resp = requests.post(f"{OLLAMA_BASE_URL}/api/embeddings", json={
         "model": EMBED_MODEL,
         "prompt": text
-    }, timeout=30)
+    }, timeout=300)
     print(f"[DEBUG] Ollama response status: {resp.status_code}")
     if not resp.ok: