From 888873f6304f29a6bf598553ed445b754bf31de1 Mon Sep 17 00:00:00 2001
From: Alex <info@hb3-accelerator.com>
Date: Thu, 4 Sep 2025 11:27:07 +0300
Subject: [PATCH] Tune Ollama limits and raise embedding timeout to 300s
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 docker-compose.yml   | 10 ++++++----
 vector-search/app.py |  2 +-
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/docker-compose.yml b/docker-compose.yml
index c0dcf3a..1b320ef 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -46,26 +46,28 @@ services:
       resources:
         limits:
           cpus: '2.0'
-          memory: 8G
+          memory: 6G
         reservations:
           cpus: '1.0'
           memory: 4G
     environment:
       - OLLAMA_HOST=0.0.0.0
       - OLLAMA_ORIGINS=*
-      - OLLAMA_NUM_PARALLEL=2
+      - OLLAMA_NUM_PARALLEL=1
       - OLLAMA_NUM_GPU=0
       - OLLAMA_KEEP_ALIVE=86400
       - OLLAMA_MODEL_TIMEOUT=0
       - OLLAMA_MAX_LOADED_MODELS=1
+      - OLLAMA_FLASH_ATTENTION=0
+      - OLLAMA_LLM_LIBRARY=auto
     healthcheck:
       test: ["CMD", "ollama", "list"]
       interval: 30s
       timeout: 10s
       retries: 5
       start_period: 120s
-    # Предзагружаем модель при запуске контейнера
-    entrypoint: ["/bin/sh", "-c", "ollama serve & sleep 15 && ollama run qwen2.5:7b 'test' && tail -f /dev/null"]
+    # Preload the model at container startup and keep it loaded for 24h (--keepalive)
+    entrypoint: ["/bin/sh", "-c", "ollama serve & sleep 15 && ollama run --keepalive 24h qwen2.5:7b 'test' && tail -f /dev/null"]
   vector-search:
     build:
       context: ./vector-search
diff --git a/vector-search/app.py b/vector-search/app.py
index 8d52827..eb235dc 100644
--- a/vector-search/app.py
+++ b/vector-search/app.py
@@ -51,7 +51,7 @@ def get_embedding(text: str) -> list:
         resp = requests.post(f"{OLLAMA_BASE_URL}/api/embeddings", json={
             "model": EMBED_MODEL,
             "prompt": text
-        }, timeout=30)
+        }, timeout=300)
         print(f"[DEBUG] Ollama response status: {resp.status_code}")
         
         if not resp.ok: