chouchouvs committed on
Commit
781b76e
·
verified ·
1 Parent(s): b268d68

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +366 -267
main.py CHANGED
@@ -1,6 +1,15 @@
1
  # -*- coding: utf-8 -*-
2
- from __future__ import annotations
 
 
 
 
 
 
 
 
3
 
 
4
  import os
5
  import io
6
  import json
@@ -8,8 +17,7 @@ import time
8
  import tarfile
9
  import logging
10
  import hashlib
11
- from typing import Dict, Any, List, Tuple, Optional
12
- from concurrent.futures import ThreadPoolExecutor
13
 
14
  import numpy as np
15
  import faiss
@@ -18,151 +26,132 @@ from fastapi.middleware.cors import CORSMiddleware
18
  from fastapi.responses import JSONResponse, StreamingResponse
19
  from pydantic import BaseModel
20
 
21
- import gradio as gr
22
-
23
- # =============================================================================
24
- # LOGGING
25
- # =============================================================================
26
- LOG = logging.getLogger("remote-indexer-async")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
  if not LOG.handlers:
28
  h = logging.StreamHandler()
29
- h.setFormatter(logging.Formatter("%(asctime)s - %(levelname)s - %(message)s"))
30
  LOG.addHandler(h)
31
  LOG.setLevel(logging.INFO)
32
 
33
- DBG = logging.getLogger("remote-indexer-async.debug")
34
- if not DBG.handlers:
35
- hd = logging.StreamHandler()
36
- hd.setFormatter(logging.Formatter("[DEBUG] %(asctime)s - %(message)s"))
37
- DBG.addHandler(hd)
38
- DBG.setLevel(logging.DEBUG)
39
-
40
- # =============================================================================
41
- # CONFIG (via ENV)
42
- # =============================================================================
43
- PORT = int(os.getenv("PORT", "7860"))
44
- DATA_ROOT = os.getenv("DATA_ROOT", "/tmp/data") # stockage interne du Space (volatile en Free)
45
- os.makedirs(DATA_ROOT, exist_ok=True)
46
-
47
- # Provider d'embeddings:
48
- # - "dummy" : vecteurs aléatoires déterministes (très rapide)
49
- # - "st" : Sentence-Transformers (CPU-friendly)
50
- # - "hf" : Transformers pur (AutoModel/AutoTokenizer)
51
- EMB_PROVIDER = os.getenv("EMB_PROVIDER", "dummy").strip().lower()
52
- EMB_MODEL = os.getenv("EMB_MODEL", "sentence-transformers/all-mpnet-base-v2").strip()
53
- EMB_BATCH = int(os.getenv("EMB_BATCH", "32"))
54
- EMB_DIM = int(os.getenv("EMB_DIM", "128")) # utilisé pour dummy
55
-
56
- # Taille du pool de workers (asynchrone)
57
- MAX_WORKERS = int(os.getenv("MAX_WORKERS", "1"))
58
-
59
- # =============================================================================
60
- # CACHE DIRECTORIES (évite PermissionError: '/.cache')
61
- # =============================================================================
62
- def _setup_cache_dirs() -> Dict[str, str]:
63
- os.environ.setdefault("HOME", "/home/user")
64
-
65
- CACHE_ROOT = os.getenv("CACHE_ROOT", "/tmp/.cache").rstrip("/")
66
- paths = {
67
- "root": CACHE_ROOT,
68
- "hf_home": f"{CACHE_ROOT}/huggingface",
69
- "hf_hub": f"{CACHE_ROOT}/huggingface/hub",
70
- "hf_tf": f"{CACHE_ROOT}/huggingface/transformers",
71
- "torch": f"{CACHE_ROOT}/torch",
72
- "st": f"{CACHE_ROOT}/sentence-transformers",
73
- "mpl": f"{CACHE_ROOT}/matplotlib",
74
- }
75
- for p in paths.values():
76
- try:
77
- os.makedirs(p, exist_ok=True)
78
- except Exception as e:
79
- LOG.warning("Impossible de créer %s : %s", p, e)
80
-
81
- os.environ["HF_HOME"] = paths["hf_home"]
82
- os.environ["HF_HUB_CACHE"] = paths["hf_hub"]
83
- os.environ["TRANSFORMERS_CACHE"] = paths["hf_tf"]
84
- os.environ["TORCH_HOME"] = paths["torch"]
85
- os.environ["SENTENCE_TRANSFORMERS_HOME"] = paths["st"]
86
- os.environ["MPLCONFIGDIR"] = paths["mpl"]
87
- os.environ.setdefault("HF_HUB_DISABLE_SYMLINKS_WARNING", "1")
88
- os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
89
-
90
- LOG.info("Caches configurés: %s", json.dumps(paths, indent=2))
91
- return paths
92
-
93
- CACHE_PATHS = _setup_cache_dirs()
94
-
95
- # Cache global lazy (pour les modèles)
96
- _ST_MODEL = None
97
- _HF_TOKENIZER = None
98
- _HF_MODEL = None
99
-
100
- # =============================================================================
101
- # JOB STATE
102
- # =============================================================================
103
- class JobState(BaseModel):
104
- job_id: str
105
- project_id: str
106
- stage: str = "pending" # pending -> chunking -> embedding -> indexing -> done/failed
107
- total_files: int = 0
108
- total_chunks: int = 0
109
- embedded: int = 0
110
- indexed: int = 0
111
- errors: List[str] = []
112
- messages: List[str] = []
113
- started_at: float = time.time()
114
- finished_at: Optional[float] = None
115
 
116
- JOBS: Dict[str, JobState] = {}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
117
 
118
- def _now() -> str:
119
- return time.strftime("%H:%M:%S")
120
 
121
- def _proj_dirs(project_id: str) -> Tuple[str, str, str]:
122
- base = os.path.join(DATA_ROOT, project_id)
123
- ds_dir = os.path.join(base, "dataset")
124
- fx_dir = os.path.join(base, "faiss")
125
- os.makedirs(ds_dir, exist_ok=True)
126
- os.makedirs(fx_dir, exist_ok=True)
127
- return base, ds_dir, fx_dir
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
128
 
129
- def _add_msg(st: JobState, msg: str):
130
- st.messages.append(f"[{_now()}] {msg}")
131
- LOG.info("[%s] %s", st.job_id, msg)
132
- DBG.debug("[%s] %s", st.job_id, msg)
133
-
134
- def _set_stage(st: JobState, stage: str):
135
- st.stage = stage
136
- _add_msg(st, f"stage={stage}")
137
-
138
- # =============================================================================
139
- # UTILS
140
- # =============================================================================
141
- def _chunk_text(text: str, size: int = 200, overlap: int = 20) -> List[str]:
142
- text = (text or "").replace("\r\n", "\n")
143
- tokens = list(text)
144
- if size <= 0:
145
- return [text] if text else []
146
- if overlap < 0:
147
- overlap = 0
148
- chunks = []
149
- i = 0
150
- while i < len(tokens):
151
- j = min(i + size, len(tokens))
152
- chunk = "".join(tokens[i:j]).strip()
153
- if chunk:
154
- chunks.append(chunk)
155
- if j == len(tokens):
156
- break
157
- i = j - overlap if (j - overlap) > i else j
158
- return chunks
159
-
160
- def _l2_normalize(x: np.ndarray) -> np.ndarray:
161
- n = np.linalg.norm(x, axis=1, keepdims=True) + 1e-12
162
- return x / n
163
-
164
- # ----------------------- PROVIDER: DUMMY --------------------------------------
165
  def _emb_dummy(texts: List[str], dim: int = EMB_DIM) -> np.ndarray:
 
166
  vecs = np.zeros((len(texts), dim), dtype="float32")
167
  for i, t in enumerate(texts):
168
  h = hashlib.sha1((t or "").encode("utf-8")).digest()
@@ -171,15 +160,16 @@ def _emb_dummy(texts: List[str], dim: int = EMB_DIM) -> np.ndarray:
171
  vecs[i] = v / (np.linalg.norm(v) + 1e-9)
172
  return vecs
173
 
174
- # ----------------- PROVIDER: Sentence-Transformers ----------------------------
175
  def _get_st_model():
176
  global _ST_MODEL
177
  if _ST_MODEL is None:
178
  from sentence_transformers import SentenceTransformer
179
- _ST_MODEL = SentenceTransformer(EMB_MODEL, cache_folder=CACHE_PATHS["st"])
180
- LOG.info("[st] modèle chargé: %s (cache=%s)", EMB_MODEL, CACHE_PATHS["st"])
181
  return _ST_MODEL
182
 
 
183
  def _emb_st(texts: List[str]) -> np.ndarray:
184
  model = _get_st_model()
185
  vecs = model.encode(
@@ -191,55 +181,72 @@ def _emb_st(texts: List[str]) -> np.ndarray:
191
  ).astype("float32")
192
  return vecs
193
 
194
- # ----------------------- PROVIDER: Transformers (HF) --------------------------
195
  def _get_hf_model():
196
  global _HF_TOKENIZER, _HF_MODEL
197
  if _HF_MODEL is None or _HF_TOKENIZER is None:
198
  from transformers import AutoTokenizer, AutoModel
199
- _HF_TOKENIZER = AutoTokenizer.from_pretrained(EMB_MODEL, cache_dir=CACHE_PATHS["hf_tf"])
200
- _HF_MODEL = AutoModel.from_pretrained(EMB_MODEL, cache_dir=CACHE_PATHS["hf_tf"])
201
  _HF_MODEL.eval()
202
- LOG.info("[hf] modèle chargé: %s (cache=%s)", EMB_MODEL, CACHE_PATHS["hf_tf"])
203
  return _HF_TOKENIZER, _HF_MODEL
204
 
205
- def _mean_pool(last_hidden_state: "np.ndarray", attention_mask: "np.ndarray") -> "np.ndarray":
 
206
  mask = attention_mask[..., None].astype(last_hidden_state.dtype)
207
  summed = (last_hidden_state * mask).sum(axis=1)
208
  counts = mask.sum(axis=1).clip(min=1e-9)
209
  return summed / counts
210
 
 
211
  def _emb_hf(texts: List[str]) -> np.ndarray:
212
  import torch
213
  tok, mod = _get_hf_model()
214
- all_vecs = []
215
  bs = max(1, EMB_BATCH)
216
  with torch.no_grad():
217
  for i in range(0, len(texts), bs):
218
- batch = texts[i:i+bs]
219
  enc = tok(batch, padding=True, truncation=True, return_tensors="pt")
220
  out = mod(**enc)
221
  last = out.last_hidden_state # (b, t, h)
222
  pooled = _mean_pool(last.numpy(), enc["attention_mask"].numpy())
223
  all_vecs.append(pooled.astype("float32"))
224
- vecs = np.concatenate(all_vecs, axis=0)
225
- return _l2_normalize(vecs)
 
 
 
 
 
 
 
 
226
 
227
- # ---------------------------- DATASET / FAISS ---------------------------------
228
- def _save_dataset(ds_dir: str, rows: List[Dict[str, Any]]):
 
 
 
 
229
  os.makedirs(ds_dir, exist_ok=True)
230
  data_path = os.path.join(ds_dir, "data.jsonl")
231
  with open(data_path, "w", encoding="utf-8") as f:
232
  for r in rows:
 
 
233
  f.write(json.dumps(r, ensure_ascii=False) + "\n")
234
  meta = {"format": "jsonl", "columns": ["path", "text", "chunk_id"], "count": len(rows)}
235
  with open(os.path.join(ds_dir, "meta.json"), "w", encoding="utf-8") as f:
236
  json.dump(meta, f, ensure_ascii=False, indent=2)
237
 
 
238
  def _load_dataset(ds_dir: str) -> List[Dict[str, Any]]:
239
  data_path = os.path.join(ds_dir, "data.jsonl")
240
  if not os.path.isfile(data_path):
241
  return []
242
- out = []
243
  with open(data_path, "r", encoding="utf-8") as f:
244
  for line in f:
245
  try:
@@ -248,155 +255,234 @@ def _load_dataset(ds_dir: str) -> List[Dict[str, Any]]:
248
  continue
249
  return out
250
 
251
- def _save_faiss(fx_dir: str, xb: np.ndarray, meta: Dict[str, Any]):
 
 
252
  os.makedirs(fx_dir, exist_ok=True)
253
  idx_path = os.path.join(fx_dir, "emb.faiss")
254
- index = faiss.IndexFlatIP(xb.shape[1]) # cosine ~ inner product si embeddings normalisés
255
- index.add(xb)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
256
  faiss.write_index(index, idx_path)
 
 
257
  with open(os.path.join(fx_dir, "meta.json"), "w", encoding="utf-8") as f:
258
  json.dump(meta, f, ensure_ascii=False, indent=2)
259
 
 
260
  def _load_faiss(fx_dir: str) -> faiss.Index:
 
261
  idx_path = os.path.join(fx_dir, "emb.faiss")
262
  if not os.path.isfile(idx_path):
263
- raise FileNotFoundError(f"FAISS index introuvable: {idx_path}")
264
- return faiss.read_index(idx_path)
 
 
265
 
266
  def _tar_dir_to_bytes(dir_path: str) -> bytes:
 
267
  bio = io.BytesIO()
268
- with tarfile.open(fileobj=bio, mode="w:gz") as tar:
269
  tar.add(dir_path, arcname=os.path.basename(dir_path))
270
  bio.seek(0)
271
  return bio.read()
272
 
273
- # =============================================================================
274
- # WORKER POOL (asynchrone)
275
- # =============================================================================
276
- EXECUTOR = ThreadPoolExecutor(max_workers=max(1, MAX_WORKERS))
277
- LOG.info("ThreadPoolExecutor initialisé : max_workers=%s", MAX_WORKERS)
278
 
279
- def _do_index_job(st: JobState, files: List[Dict[str, str]], chunk_size: int, overlap: int, batch_size: int, store_text: bool) -> None:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
280
  """
281
- Tâche lourde lancée dans un worker thread.
282
- Met à jour l'état 'st' tout au long du pipeline.
 
 
 
 
283
  """
284
  try:
285
  base, ds_dir, fx_dir = _proj_dirs(st.project_id)
286
 
287
- # 1) Chunking
288
- _set_stage(st, "chunking")
 
289
  rows: List[Dict[str, Any]] = []
290
  st.total_files = len(files)
291
- for it in files:
292
- path = (it.get("path") or "unknown").strip()
293
- txt = it.get("text") or ""
294
- chks = _chunk_text(txt, size=int(chunk_size), overlap=int(overlap))
295
- _add_msg(st, f"{path}: len(text)={len(txt)} chunks={len(chks)}")
296
- for ci, ck in enumerate(chks):
297
- rows.append({"path": path, "text": ck, "chunk_id": ci})
 
298
  st.total_chunks = len(rows)
299
- _add_msg(st, f"Total chunks = {st.total_chunks}")
300
 
301
- # 2) Embedding
302
- _set_stage(st, "embedding")
 
303
  texts = [r["text"] for r in rows]
304
  if EMB_PROVIDER == "dummy":
305
  xb = _emb_dummy(texts, dim=EMB_DIM)
306
- dim = xb.shape[1]
307
  elif EMB_PROVIDER == "st":
308
  xb = _emb_st(texts)
309
- dim = xb.shape[1]
310
  else:
311
  xb = _emb_hf(texts)
312
- dim = xb.shape[1]
313
 
314
- st.embedded = xb.shape[0]
315
- _add_msg(st, f"Embeddings {st.embedded}/{st.total_chunks}")
316
- _add_msg(st, f"Embeddings dim={dim}")
317
-
318
- # 3) Sauvegarde dataset (texte)
319
- _save_dataset(ds_dir, rows)
320
- _add_msg(st, f"Dataset (sans index) sauvegardé dans {ds_dir}")
321
 
322
- # 4) FAISS
323
- _set_stage(st, "indexing")
324
- faiss_meta = {
325
- "dim": int(dim),
 
 
 
 
 
 
 
 
 
326
  "count": int(xb.shape[0]),
327
  "provider": EMB_PROVIDER,
328
- "model": EMB_MODEL if EMB_PROVIDER != "dummy" else None
329
  }
330
- _save_faiss(fx_dir, xb, meta=faiss_meta)
331
  st.indexed = int(xb.shape[0])
332
- _add_msg(st, f"FAISS écrit sur {os.path.join(fx_dir, 'emb.faiss')}")
333
- _add_msg(st, f"OK — dataset+index prêts (projet={st.project_id})")
334
 
335
- _set_stage(st, "done")
 
 
 
336
  st.finished_at = time.time()
337
  except Exception as e:
338
- LOG.exception("Job %s failed", st.job_id)
339
  st.errors.append(str(e))
340
- _add_msg(st, f"❌ Exception: {e}")
341
  st.stage = "failed"
342
  st.finished_at = time.time()
343
 
344
- def _submit_job(project_id: str, files: List[Dict[str, str]], chunk_size: int, overlap: int, batch_size: int, store_text: bool) -> str:
 
 
 
 
 
 
 
 
345
  job_id = hashlib.sha1(f"{project_id}{time.time()}".encode()).hexdigest()[:12]
346
  st = JobState(job_id=job_id, project_id=project_id, stage="pending", messages=[])
347
  JOBS[job_id] = st
348
- _add_msg(st, f"Job {job_id} créé pour project {project_id}")
349
- _add_msg(st, f"Index start project={project_id} files={len(files)} chunk_size={chunk_size} overlap={overlap} batch_size={batch_size} store_text={store_text} provider={EMB_PROVIDER} model={EMB_MODEL if EMB_PROVIDER!='dummy' else '-'}")
350
 
351
- # Soumission au pool (retour immédiat)
352
- EXECUTOR.submit(_do_index_job, st, files, chunk_size, overlap, batch_size, store_text)
353
- _set_stage(st, "queued")
 
 
 
 
 
 
 
 
 
354
  return job_id
355
 
356
- # =============================================================================
357
- # FASTAPI
358
- # =============================================================================
 
359
  fastapi_app = FastAPI(title="remote-indexer-async", version="3.0.0")
360
  fastapi_app.add_middleware(
361
  CORSMiddleware,
362
- allow_origins=["*"], allow_credentials=True, allow_methods=["*"], allow_headers=["*"],
 
 
 
363
  )
364
 
 
365
  class FileItem(BaseModel):
366
  path: str
367
  text: str
368
 
 
369
  class IndexRequest(BaseModel):
370
  project_id: str
371
  files: List[FileItem]
372
  chunk_size: int = 200
373
  overlap: int = 20
374
  batch_size: int = 32
375
- store_text: bool = True
 
376
 
377
  @fastapi_app.get("/health")
378
  def health():
379
- info = {
380
  "ok": True,
381
  "service": "remote-indexer-async",
382
  "provider": EMB_PROVIDER,
383
  "model": EMB_MODEL if EMB_PROVIDER != "dummy" else None,
384
  "cache_root": os.getenv("CACHE_ROOT", "/tmp/.cache"),
385
  "workers": MAX_WORKERS,
386
- "data_root": DATA_ROOT,
 
 
387
  }
388
- return info
389
 
390
- @fastapi_app.get("/")
391
- def root_redirect():
392
- return {"ok": True, "service": "remote-indexer-async", "ui": "/ui"}
393
 
394
  @fastapi_app.post("/index")
395
  def index(req: IndexRequest):
396
- """
397
- ASYNCHRONE : retourne immédiatement un job_id.
398
- Le traitement est effectué en arrière-plan par le pool de threads.
399
- """
400
  try:
401
  files = [fi.model_dump() for fi in req.files]
402
  job_id = _submit_job(
@@ -409,9 +495,10 @@ def index(req: IndexRequest):
409
  )
410
  return {"job_id": job_id}
411
  except Exception as e:
412
- LOG.exception("index failed (submit)")
413
  raise HTTPException(status_code=500, detail=str(e))
414
 
 
415
  @fastapi_app.get("/status/{job_id}")
416
  def status(job_id: str):
417
  st = JOBS.get(job_id)
@@ -419,26 +506,26 @@ def status(job_id: str):
419
  raise HTTPException(status_code=404, detail="job inconnu")
420
  return JSONResponse(st.model_dump())
421
 
 
422
  class SearchRequest(BaseModel):
423
  project_id: str
424
  query: str
425
  k: int = 5
426
 
 
427
  @fastapi_app.post("/search")
428
  def search(req: SearchRequest):
429
  base, ds_dir, fx_dir = _proj_dirs(req.project_id)
430
 
431
- # Si l'index n'existe pas encore, on répond 409 (conflit / pas prêt)
432
- idx_path = os.path.join(fx_dir, "emb.faiss")
433
- ds_path = os.path.join(ds_dir, "data.jsonl")
434
- if not (os.path.isfile(idx_path) and os.path.isfile(ds_path)):
435
  raise HTTPException(status_code=409, detail="Index non prêt (reviens plus tard)")
436
 
437
  rows = _load_dataset(ds_dir)
438
  if not rows:
439
  raise HTTPException(status_code=404, detail="dataset introuvable")
440
 
441
- # Embedding de la requête avec le MÊME provider
442
  if EMB_PROVIDER == "dummy":
443
  q = _emb_dummy([req.query], dim=EMB_DIM)[0:1, :]
444
  elif EMB_PROVIDER == "st":
@@ -446,10 +533,13 @@ def search(req: SearchRequest):
446
  else:
447
  q = _emb_hf([req.query])[0:1, :]
448
 
449
- # FAISS
450
  index = _load_faiss(fx_dir)
451
  if index.d != q.shape[1]:
452
- raise HTTPException(status_code=500, detail=f"dim incompatibles: index.d={index.d} vs query={q.shape[1]}")
 
 
 
453
  scores, ids = index.search(q, int(max(1, req.k)))
454
  ids = ids[0].tolist()
455
  scores = scores[0].tolist()
@@ -462,72 +552,81 @@ def search(req: SearchRequest):
462
  out.append({"path": r.get("path"), "text": r.get("text"), "score": float(sc)})
463
  return {"results": out}
464
 
465
- # ----------- ARTIFACTS EXPORT -----------
 
 
 
466
  @fastapi_app.get("/artifacts/{project_id}/dataset")
467
  def download_dataset(project_id: str):
468
- base, ds_dir, _ = _proj_dirs(project_id)
469
  if not os.path.isdir(ds_dir):
470
  raise HTTPException(status_code=404, detail="Dataset introuvable")
471
  buf = _tar_dir_to_bytes(ds_dir)
472
- headers = {"Content-Disposition": f'attachment; filename="{project_id}_dataset.tgz"'}
473
- return StreamingResponse(io.BytesIO(buf), media_type="application/gzip", headers=headers)
 
474
 
475
  @fastapi_app.get("/artifacts/{project_id}/faiss")
476
  def download_faiss(project_id: str):
477
- base, _, fx_dir = _proj_dirs(project_id)
478
  if not os.path.isdir(fx_dir):
479
  raise HTTPException(status_code=404, detail="FAISS introuvable")
480
  buf = _tar_dir_to_bytes(fx_dir)
481
- headers = {"Content-Disposition": f'attachment; filename="{project_id}_faiss.tgz"'}
482
- return StreamingResponse(io.BytesIO(buf), media_type="application/gzip", headers=headers)
 
483
 
484
- # =============================================================================
485
- # GRADIO UI (facultatif de test)
486
- # =============================================================================
487
  def _ui_index(project_id: str, sample_text: str):
488
  files = [{"path": "sample.txt", "text": sample_text}]
489
- from pydantic import ValidationError
490
  try:
491
  req = IndexRequest(project_id=project_id, files=[FileItem(**f) for f in files])
492
- except ValidationError as e:
493
- return f"Erreur: {e}"
494
  try:
495
  res = index(req)
496
- return f"Job lancé: {res['job_id']}"
497
  except Exception as e:
498
- return f"Erreur index: {e}"
 
499
 
500
  def _ui_search(project_id: str, query: str, k: int):
501
  try:
502
  res = search(SearchRequest(project_id=project_id, query=query, k=int(k)))
503
  return json.dumps(res, ensure_ascii=False, indent=2)
504
  except Exception as e:
505
- return f"Erreur search: {e}"
506
-
507
- with gr.Blocks(title="Remote Indexer (Async FAISS)", analytics_enabled=False) as ui:
508
- gr.Markdown("## Remote Indexer — **Async** (API: `/index`, `/status/{job}`, `/search`, `/artifacts/...`).")
509
- gr.Markdown(f"**Provider**: `{EMB_PROVIDER}` — **Model**: `{EMB_MODEL if EMB_PROVIDER!='dummy' else '-'}` — **Cache**: `{os.getenv('CACHE_ROOT', '/tmp/.cache')}` — **Workers**: `{MAX_WORKERS}`")
510
- with gr.Tab("Index"):
511
- pid = gr.Textbox(label="Project ID", value="DEEPWEB")
512
- sample = gr.Textbox(label="Texte d’exemple", value="Alpha bravo charlie delta echo foxtrot.", lines=4)
513
- btn = gr.Button("Lancer index (sample)")
514
- out = gr.Textbox(label="Résultat")
515
- btn.click(_ui_index, inputs=[pid, sample], outputs=[out])
516
-
517
- with gr.Tab("Search"):
518
- pid2 = gr.Textbox(label="Project ID", value="DEEPWEB")
 
519
  q = gr.Textbox(label="Query", value="alpha")
520
- k = gr.Slider(1, 20, value=5, step=1, label="k")
521
- btn2 = gr.Button("Rechercher")
522
- out2 = gr.Code(label="Résultats")
523
- btn2.click(_ui_search, inputs=[pid2, q, k], outputs=[out2])
524
 
525
  fastapi_app = gr.mount_gradio_app(fastapi_app, ui, path="/ui")
526
 
527
- # =============================================================================
528
- # MAIN
529
- # =============================================================================
 
530
  if __name__ == "__main__":
531
  import uvicorn
532
- LOG.info("Démarrage Uvicorn sur 0.0.0.0:%s (UI_PATH=/ui) — async index", PORT)
533
- uvicorn.run(fastapi_app, host="0.0.0.0", port=PORT)
 
 
 
1
  # -*- coding: utf-8 -*-
2
+ """
3
+ Version optimisée du module FAISS :
4
+ - Réduction de la dimension des vecteurs (EMB_DIM, configurable)
5
+ - Index quantisé **IVF‑PQ** (faible empreinte disque)
6
+ - Chargement *on‑disk* (mmap) pour limiter la RAM
7
+ - Option `store_text` : ne pas persister le texte brut dans le dataset
8
+ - Compression gzip des artefacts exportés
9
+ - Paramètres contrôlables via variables d’environnement
10
+ """
11
 
12
+ from __future__ import annotations
13
  import os
14
  import io
15
  import json
 
17
  import tarfile
18
  import logging
19
  import hashlib
20
+ from typing import List, Dict, Any, Tuple, Optional
 
21
 
22
  import numpy as np
23
  import faiss
 
26
  from fastapi.responses import JSONResponse, StreamingResponse
27
  from pydantic import BaseModel
28
 
29
+ # --------------------------------------------------------------------------- #
30
+ # CONFIGURATION (variables d’environnement – modifiable à la volée)
31
+ # --------------------------------------------------------------------------- #
32
+ EMB_PROVIDER = os.getenv("EMB_PROVIDER", "dummy").strip().lower()
33
+ EMB_MODEL = os.getenv("EMB_MODEL", "sentence-transformers/all-mpnet-base-v2").strip()
34
+ EMB_BATCH = int(os.getenv("EMB_BATCH", "32"))
35
+ EMB_DIM = int(os.getenv("EMB_DIM", "64")) # ← dimension réduite (ex. 64)
36
+
37
+ # FAISS quantisation
38
+ FAISS_TYPE = os.getenv("FAISS_TYPE", "IVF_PQ").upper() # FLAT ou IVF_PQ
39
+ FAISS_NLIST = int(os.getenv("FAISS_NLIST", "100")) # nb de centroides (IVF)
40
+ FAISS_M = int(os.getenv("FAISS_M", "8")) # sous‑vecteurs (PQ)
41
+ FAISS_NBITS = int(os.getenv("FAISS_NBITS", "8")) # bits / sous‑vecteur
42
+
43
+ # Stockage du texte brut dans le dataset ? (False → économise disque)
44
+ STORE_TEXT = os.getenv("STORE_TEXT", "false").lower() in ("1", "true", "yes")
45
+
46
+ # --------------------------------------------------------------------------- #
47
+ # LOGGING
48
+ # --------------------------------------------------------------------------- #
49
+ LOG = logging.getLogger("appli_v1")
50
  if not LOG.handlers:
51
  h = logging.StreamHandler()
52
+ h.setFormatter(logging.Formatter("[%(levelname)s] %(asctime)s - %(message)s", "%H:%M:%S"))
53
  LOG.addHandler(h)
54
  LOG.setLevel(logging.INFO)
55
 
56
+ # --------------------------------------------------------------------------- #
57
+ # UTILITAIRES
58
+ # --------------------------------------------------------------------------- #
59
def list_repo_files(repo_dir: str, top_k: int = 500) -> List[str]:
    """
    Return the repository's text files, honouring .gitignore
    (via Git when available, otherwise an os.walk fallback).
    """
    if not os.path.isdir(repo_dir):
        return []

    collected: List[str] = []
    try:
        from git import Repo
        repo = Repo(repo_dir)

        # tracked files, then untracked-but-not-ignored files
        collected.extend(repo.git.ls_files().splitlines())
        collected.extend(
            repo.git.ls_files(others=True, exclude_standard=True).splitlines()
        )

        # keep only paths outside .git/ with no hidden component
        def _visible(path: str) -> bool:
            if path.startswith('.git/'):
                return False
            return not any(part.startswith('.') for part in path.split(os.sep))

        collected = sorted({p for p in collected if _visible(p)})[:top_k]
    except Exception as e:
        LOG.debug("Git indisponible / pas un dépôt → fallback os.walk : %s", e)
        for root, _, names in os.walk(repo_dir):
            for name in sorted(names):
                if name.startswith('.'):
                    continue
                rel = os.path.relpath(os.path.join(root, name), repo_dir)
                if rel.startswith('.git') or any(p.startswith('.') for p in rel.split(os.sep)):
                    continue
                collected.append(rel)
                if len(collected) >= top_k:
                    break
            if len(collected) >= top_k:
                break
        collected = sorted(set(collected))

    return collected
103
+
104
+
105
def read_file_safe(file_path: str) -> str:
    """Read a file as UTF-8, silently dropping undecodable bytes."""
    try:
        with open(file_path, "r", encoding="utf-8", errors="ignore") as fh:
            content = fh.read()
    except Exception as e:
        LOG.error("Erreur lecture %s : %s", file_path, e)
        return f"# Erreur lecture : {e}"
    return content
113
 
 
 
114
 
115
def write_file_safe(file_path: str, content: str) -> str:
    """Write *content* to *file_path* as UTF-8, creating parent directories.

    Never raises: returns a human-readable status string (✅ on success,
    ❌ with the error on failure).
    """
    try:
        parent = os.path.dirname(file_path)
        # Bug fix: os.makedirs("") raises FileNotFoundError, so a bare
        # filename (no directory part) previously always failed.
        if parent:
            os.makedirs(parent, exist_ok=True)
        with open(file_path, "w", encoding="utf-8") as f:
            f.write(content)
        return f"✅ Fichier sauvegardé : {os.path.basename(file_path)}"
    except Exception as e:
        LOG.error("Erreur écriture %s : %s", file_path, e)
        return f"❌ Erreur sauvegarde : {e}"
125
+
126
+
127
+ # --------------------------------------------------------------------------- #
128
+ # FAKE / DUMMY FAISS (pour compatibilité)
129
+ # --------------------------------------------------------------------------- #
130
class DummyFAISS:
    """No-op stand-in for a FAISS index — carries no state or behavior."""
133
+
134
+
135
def create_faiss_index(*_, **__) -> DummyFAISS:
    """Accept and ignore any arguments; local FAISS is disabled here."""
    # Always warn so callers notice the remote-client fallback is in effect.
    LOG.warning("FAISS désactivé – utilisation du client distant")
    return DummyFAISS()
138
+
139
+
140
def search_faiss_index(*_, **__) -> List[Any]:
    """Accept and ignore any arguments; always yield an empty result list."""
    # Always warn so callers notice the remote-client fallback is in effect.
    LOG.warning("FAISS désactivé – utilisation du client distant")
    return []
143
+
144
+
145
+ # --------------------------------------------------------------------------- #
146
+ # EMBEDDING PROVIDERS
147
+ # --------------------------------------------------------------------------- #
148
+ _ST_MODEL: Optional[Any] = None
149
+ _HF_TOKENIZER: Optional[Any] = None
150
+ _HF_MODEL: Optional[Any] = None
151
+
152
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
153
  def _emb_dummy(texts: List[str], dim: int = EMB_DIM) -> np.ndarray:
154
+ """Vecteurs aléatoires déterministes (SHA‑1 → seed)."""
155
  vecs = np.zeros((len(texts), dim), dtype="float32")
156
  for i, t in enumerate(texts):
157
  h = hashlib.sha1((t or "").encode("utf-8")).digest()
 
160
  vecs[i] = v / (np.linalg.norm(v) + 1e-9)
161
  return vecs
162
 
163
+
164
def _get_st_model():
    """Lazily build and cache the Sentence-Transformers encoder (singleton)."""
    global _ST_MODEL
    if _ST_MODEL is not None:
        return _ST_MODEL
    # Import deferred so the dependency is only needed when provider == "st".
    from sentence_transformers import SentenceTransformer
    _ST_MODEL = SentenceTransformer(EMB_MODEL, cache_folder=os.getenv("HF_HOME", "/tmp/.cache/huggingface"))
    LOG.info("[st] modèle chargé: %s", EMB_MODEL)
    return _ST_MODEL
171
 
172
+
173
  def _emb_st(texts: List[str]) -> np.ndarray:
174
  model = _get_st_model()
175
  vecs = model.encode(
 
181
  ).astype("float32")
182
  return vecs
183
 
184
+
185
def _get_hf_model():
    """Lazily load the HF tokenizer/model pair once and cache them globally."""
    global _HF_TOKENIZER, _HF_MODEL
    if _HF_TOKENIZER is None or _HF_MODEL is None:
        # Import deferred so the dependency is only needed when provider == "hf".
        from transformers import AutoTokenizer, AutoModel
        cache = os.getenv("HF_HOME", "/tmp/.cache/huggingface")
        _HF_TOKENIZER = AutoTokenizer.from_pretrained(EMB_MODEL, cache_dir=cache)
        _HF_MODEL = AutoModel.from_pretrained(EMB_MODEL, cache_dir=cache)
        _HF_MODEL.eval()
        LOG.info("[hf] modèle chargé: %s", EMB_MODEL)
    return _HF_TOKENIZER, _HF_MODEL
194
 
195
+
196
+ def _mean_pool(last_hidden_state: np.ndarray, attention_mask: np.ndarray) -> np.ndarray:
197
  mask = attention_mask[..., None].astype(last_hidden_state.dtype)
198
  summed = (last_hidden_state * mask).sum(axis=1)
199
  counts = mask.sum(axis=1).clip(min=1e-9)
200
  return summed / counts
201
 
202
+
203
def _emb_hf(texts: List[str]) -> np.ndarray:
    """Embed *texts* with the HF model using masked mean pooling (no grad)."""
    import torch
    tokenizer, model = _get_hf_model()
    step = max(1, EMB_BATCH)
    chunks: List[np.ndarray] = []
    with torch.no_grad():
        for start in range(0, len(texts), step):
            enc = tokenizer(texts[start:start + step], padding=True, truncation=True, return_tensors="pt")
            hidden = model(**enc).last_hidden_state  # (batch, tokens, hidden)
            pooled = _mean_pool(hidden.numpy(), enc["attention_mask"].numpy())
            chunks.append(pooled.astype("float32"))
    return np.concatenate(chunks, axis=0)
217
+
218
+
219
def _reduce_dim(vectors: np.ndarray, target_dim: int = EMB_DIM) -> np.ndarray:
    """Reduce embedding dimensionality via a PCA fit on *vectors*.

    Returns the input unchanged when reduction is impossible:
    - target_dim >= current dimension, or
    - fewer samples than target_dim (PCA cannot yield that many components).

    NOTE(review): the PCA is re-fit on every call, so vectors reduced in a
    separate call (e.g. a search query) live in a different basis than the
    corpus — confirm callers persist and reuse one projection if they mix them.
    """
    if target_dim >= vectors.shape[1]:
        return vectors
    # Bug fix: PCA requires n_components <= n_samples; previously this
    # crashed on small corpora instead of degrading gracefully.
    if vectors.shape[0] < target_dim:
        return vectors
    from sklearn.decomposition import PCA
    pca = PCA(n_components=target_dim, random_state=0)
    return pca.fit_transform(vectors).astype("float32")
226
 
227
+
228
+ # --------------------------------------------------------------------------- #
229
+ # DATASET / FAISS I/O
230
+ # --------------------------------------------------------------------------- #
231
def _save_dataset(ds_dir: str, rows: List[Dict[str, Any]], store_text: bool = STORE_TEXT) -> None:
    """Persist *rows* as JSONL plus a meta.json descriptor.

    When store_text is False the "text" field is stripped from each row
    before writing (saves disk space).
    """
    os.makedirs(ds_dir, exist_ok=True)
    data_path = os.path.join(ds_dir, "data.jsonl")
    with open(data_path, "w", encoding="utf-8") as f:
        for r in rows:
            if not store_text:
                # Rebind (not mutate) so the caller's dicts stay intact.
                r = {k: v for k, v in r.items() if k != "text"}
            f.write(json.dumps(r, ensure_ascii=False) + "\n")
    # Bug fix: advertise only the columns actually written — meta.json
    # previously claimed a "text" column even when it was stripped.
    columns = ["path", "text", "chunk_id"] if store_text else ["path", "chunk_id"]
    meta = {"format": "jsonl", "columns": columns, "count": len(rows)}
    with open(os.path.join(ds_dir, "meta.json"), "w", encoding="utf-8") as f:
        json.dump(meta, f, ensure_ascii=False, indent=2)
243
 
244
+
245
  def _load_dataset(ds_dir: str) -> List[Dict[str, Any]]:
246
  data_path = os.path.join(ds_dir, "data.jsonl")
247
  if not os.path.isfile(data_path):
248
  return []
249
+ out: List[Dict[str, Any]] = []
250
  with open(data_path, "r", encoding="utf-8") as f:
251
  for line in f:
252
  try:
 
255
  continue
256
  return out
257
 
258
+
259
def _save_faiss(fx_dir: str, xb: np.ndarray, meta: Dict[str, Any]) -> None:
    """Write a FAISS index (IVF-PQ when FAISS_TYPE says so, else flat IP) plus meta.json.

    Inner-product search ≈ cosine similarity — assumes callers pass
    L2-normalised vectors (TODO confirm for every provider path).
    """
    os.makedirs(fx_dir, exist_ok=True)
    idx_path = os.path.join(fx_dir, "emb.faiss")

    if FAISS_TYPE == "IVF_PQ":
        dim = xb.shape[1]
        # Bug fix: clamp nlist — FAISS needs at least one training point
        # per centroid, so nlist must not exceed the number of vectors.
        nlist = max(1, min(FAISS_NLIST, xb.shape[0]))
        quantizer = faiss.IndexFlatIP(dim)  # coarse quantizer (inner product)
        index = faiss.IndexIVFPQ(quantizer, dim, nlist, FAISS_M, FAISS_NBITS)

        # Train on a sub-sample (at most 10k vectors) for speed.
        # Bug fix: the rng.choice(...) call was missing its closing
        # parenthesis — this function previously did not even parse.
        rng = np.random.default_rng(0)
        sample = rng.choice(xb.shape[0], min(10_000, xb.shape[0]), replace=False)
        index.train(xb[sample])

        index.add(xb)
        meta.update({
            "index_type": "IVF_PQ",
            "nlist": nlist,
            "m": FAISS_M,
            "nbits": FAISS_NBITS,
        })
    else:  # FLAT (fallback)
        index = faiss.IndexFlatIP(xb.shape[1])
        index.add(xb)
        meta.update({"index_type": "FLAT"})

    faiss.write_index(index, idx_path)

    # meta.json records the index layout so readers know how to query it.
    with open(os.path.join(fx_dir, "meta.json"), "w", encoding="utf-8") as f:
        json.dump(meta, f, ensure_ascii=False, indent=2)
291
 
292
+
293
def _load_faiss(fx_dir: str) -> faiss.Index:
    """Open the on-disk FAISS index memory-mapped to keep RAM usage low."""
    index_file = os.path.join(fx_dir, "emb.faiss")
    if not os.path.isfile(index_file):
        raise FileNotFoundError(f"FAISS index introuvable: {index_file}")
    # IO_FLAG_MMAP pages the index in on demand instead of loading it whole.
    return faiss.read_index(index_file, faiss.IO_FLAG_MMAP)
300
+
301
 
302
  def _tar_dir_to_bytes(dir_path: str) -> bytes:
303
+ """Archive gzip du répertoire (compression maximale)."""
304
  bio = io.BytesIO()
305
+ with tarfile.open(fileobj=bio, mode="w:gz", compresslevel=9) as tar:
306
  tar.add(dir_path, arcname=os.path.basename(dir_path))
307
  bio.seek(0)
308
  return bio.read()
309
 
 
 
 
 
 
310
 
311
# --------------------------------------------------------------------------- #
# WORKER POOL (asynchronous job execution)
# --------------------------------------------------------------------------- #
# NOTE(review): ThreadPoolExecutor is already imported at the top of the file;
# this re-import is redundant but harmless.
from concurrent.futures import ThreadPoolExecutor

# Number of indexing jobs allowed to run concurrently (clamped to >= 1).
MAX_WORKERS = max(1, int(os.getenv("MAX_WORKERS", "1")))
# Shared process-wide executor; jobs are submitted by _submit_job().
EXECUTOR = ThreadPoolExecutor(max_workers=MAX_WORKERS)
LOG.info("ThreadPoolExecutor initialisé : max_workers=%s", MAX_WORKERS)
319
+
320
+
321
+ def _proj_dirs(project_id: str) -> Tuple[str, str, str]:
322
+ base = os.path.join(os.getenv("DATA_ROOT", "/tmp/data"), project_id)
323
+ ds_dir = os.path.join(base, "dataset")
324
+ fx_dir = os.path.join(base, "faiss")
325
+ os.makedirs(ds_dir, exist_ok=True)
326
+ os.makedirs(fx_dir, exist_ok=True)
327
+ return base, ds_dir, fx_dir
328
+
329
+
330
def _do_index_job(
    st: "JobState",
    files: List[Dict[str, str]],
    chunk_size: int,
    overlap: int,
    batch_size: int,
    store_text: bool,
) -> None:
    """
    Run the full indexing pipeline for one job, updating *st* as it goes:
      1. chunk every input file,
      2. embed the chunks (dummy / sentence-transformers / HF per EMB_PROVIDER),
      3. PCA-reduce to EMB_DIM when the model dimension differs,
      4. persist the dataset (text optional),
      5. build and persist the FAISS index.
    On any exception the job is marked "failed" and the error recorded.

    NOTE(review): batch_size is accepted but never used in this body — the
    embedding helpers may batch internally; confirm before removing it.
    """
    try:
        base, ds_dir, fx_dir = _proj_dirs(st.project_id)

        # ------------------------------------------------------------------- #
        # 1. Chunking
        # ------------------------------------------------------------------- #
        rows: List[Dict[str, Any]] = []
        st.total_files = len(files)

        for f in files:
            # Missing/empty path falls back to "unknown" so rows stay valid.
            path = (f.get("path") or "unknown").strip()
            txt = f.get("text") or ""
            chunks = _chunk_text(txt, size=chunk_size, overlap=overlap)
            for i, ck in enumerate(chunks):
                rows.append({"path": path, "text": ck, "chunk_id": i})

        st.total_chunks = len(rows)
        LOG.info("Chunking terminé : %d chunks", st.total_chunks)

        # ------------------------------------------------------------------- #
        # 2. Embedding (provider chosen once at module level)
        # ------------------------------------------------------------------- #
        texts = [r["text"] for r in rows]
        if EMB_PROVIDER == "dummy":
            xb = _emb_dummy(texts, dim=EMB_DIM)
        elif EMB_PROVIDER == "st":
            xb = _emb_st(texts)
        else:
            xb = _emb_hf(texts)

        # ------------------------------------------------------------------- #
        # 3. Dimension reduction (only when the model dim differs from EMB_DIM)
        # ------------------------------------------------------------------- #
        if xb.shape[1] != EMB_DIM:
            xb = _reduce_dim(xb, target_dim=EMB_DIM)

        st.embedded = xb.shape[0]
        LOG.info("Embedding terminé : %d vecteurs (dim=%d)", st.embedded, xb.shape[1])

        # ------------------------------------------------------------------- #
        # 4. Persist the dataset (JSONL; text kept only when store_text)
        # ------------------------------------------------------------------- #
        _save_dataset(ds_dir, rows, store_text=store_text)

        # ------------------------------------------------------------------- #
        # 5. FAISS index + metadata
        # ------------------------------------------------------------------- #
        meta = {
            "dim": int(xb.shape[1]),
            "count": int(xb.shape[0]),
            "provider": EMB_PROVIDER,
            "model": EMB_MODEL if EMB_PROVIDER != "dummy" else None,
        }
        _save_faiss(fx_dir, xb, meta)
        st.indexed = int(xb.shape[0])
        LOG.info("FAISS (%s) écrit : %s", FAISS_TYPE, os.path.join(fx_dir, "emb.faiss"))

        # ------------------------------------------------------------------- #
        # Finalisation: mark the job done and timestamp it.
        # ------------------------------------------------------------------- #
        st.stage = "done"
        st.finished_at = time.time()
    except Exception as e:
        # Any failure marks the job "failed" with the error message recorded,
        # so /status reports it instead of leaving the job stuck.
        LOG.exception("Job %s échoué", st.job_id)
        st.errors.append(str(e))
        st.stage = "failed"
        st.finished_at = time.time()
414
+
415
def _submit_job(
    project_id: str,
    files: List[Dict[str, str]],
    chunk_size: int,
    overlap: int,
    batch_size: int,
    store_text: bool,
) -> str:
    """Register a JobState and schedule the indexing pipeline on the executor.

    Args:
        project_id: logical project the artifacts belong to.
        files: list of ``{"path", "text"}`` dicts to index.
        chunk_size / overlap / batch_size / store_text: forwarded verbatim
            to the worker (_do_index_job).

    Returns:
        The 12-character job id, usable with ``/status/{job_id}``.
    """
    job_id = hashlib.sha1(f"{project_id}{time.time()}".encode()).hexdigest()[:12]
    st = JobState(job_id=job_id, project_id=project_id, stage="pending", messages=[])
    JOBS[job_id] = st

    LOG.info("Job %s créé %d fichiers", job_id, len(files))

    # BUGFIX: mark the job as queued *before* handing it to the executor.
    # Setting the stage after submit() raced with the worker thread, which may
    # already have moved the job to "done"/"failed" only for this thread to
    # overwrite the stage back to "queued".
    st.stage = "queued"
    EXECUTOR.submit(
        _do_index_job,
        st,
        files,
        chunk_size,
        overlap,
        batch_size,
        store_text,
    )
    return job_id
440
 
441
+
442
# --------------------------------------------------------------------------- #
# FASTAPI
# --------------------------------------------------------------------------- #
fastapi_app = FastAPI(title="remote-indexer-async", version="3.0.0")
fastapi_app.add_middleware(
    CORSMiddleware,
    # NOTE(review): browsers ignore a wildcard origin on credentialed
    # requests — confirm whether allow_credentials=True is actually needed
    # alongside allow_origins=["*"].
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
453
 
454
+
455
class FileItem(BaseModel):
    """One source file to index: a logical path plus its full text content."""

    # Logical path of the file; stored as the "path" field of each chunk.
    path: str
    # Raw content; chunked before embedding (see _do_index_job).
    text: str
458
 
459
+
460
class IndexRequest(BaseModel):
    """Payload of POST /index: a project's files plus chunking options."""

    project_id: str
    files: List[FileItem]
    # Chunking parameters forwarded to the worker.
    chunk_size: int = 200
    overlap: int = 20
    # Embedding batch size forwarded to the worker.
    batch_size: int = 32
    # Whether chunk text is kept in the saved dataset; default comes from the
    # module-level STORE_TEXT setting.
    store_text: bool = STORE_TEXT
468
 
469
@fastapi_app.get("/health")
def health():
    """Liveness/config probe: report provider, model and index settings."""
    return {
        "ok": True,
        "service": "remote-indexer-async",
        "provider": EMB_PROVIDER,
        # Model name is only meaningful for real embedding providers.
        "model": EMB_MODEL if EMB_PROVIDER != "dummy" else None,
        "cache_root": os.getenv("CACHE_ROOT", "/tmp/.cache"),
        "workers": MAX_WORKERS,
        "data_root": os.getenv("DATA_ROOT", "/tmp/data"),
        "faiss_type": FAISS_TYPE,
        "emb_dim": EMB_DIM,
    }
 
482
 
 
 
 
483
 
484
  @fastapi_app.post("/index")
485
  def index(req: IndexRequest):
 
 
 
 
486
  try:
487
  files = [fi.model_dump() for fi in req.files]
488
  job_id = _submit_job(
 
495
  )
496
  return {"job_id": job_id}
497
  except Exception as e:
498
+ LOG.exception("Erreur soumission index")
499
  raise HTTPException(status_code=500, detail=str(e))
500
 
501
+
502
  @fastapi_app.get("/status/{job_id}")
503
  def status(job_id: str):
504
  st = JOBS.get(job_id)
 
506
  raise HTTPException(status_code=404, detail="job inconnu")
507
  return JSONResponse(st.model_dump())
508
 
509
+
510
class SearchRequest(BaseModel):
    """Payload of POST /search: query a project's FAISS index."""

    project_id: str
    query: str
    # Number of nearest chunks to return.
    k: int = 5
515
+
516
  @fastapi_app.post("/search")
517
  def search(req: SearchRequest):
518
  base, ds_dir, fx_dir = _proj_dirs(req.project_id)
519
 
520
+ # Vérifier la présence de l'index
521
+ if not (os.path.isfile(os.path.join(fx_dir, "emb.faiss")) and os.path.isfile(os.path.join(ds_dir, "data.jsonl"))):
 
 
522
  raise HTTPException(status_code=409, detail="Index non prêt (reviens plus tard)")
523
 
524
  rows = _load_dataset(ds_dir)
525
  if not rows:
526
  raise HTTPException(status_code=404, detail="dataset introuvable")
527
 
528
+ # Embedding de la requête (même provider)
529
  if EMB_PROVIDER == "dummy":
530
  q = _emb_dummy([req.query], dim=EMB_DIM)[0:1, :]
531
  elif EMB_PROVIDER == "st":
 
533
  else:
534
  q = _emb_hf([req.query])[0:1, :]
535
 
536
+ # Recherche FAISS (mmap)
537
  index = _load_faiss(fx_dir)
538
  if index.d != q.shape[1]:
539
+ raise HTTPException(
540
+ status_code=500,
541
+ detail=f"dim incompatibles : index.d={index.d} vs query={q.shape[1]}",
542
+ )
543
  scores, ids = index.search(q, int(max(1, req.k)))
544
  ids = ids[0].tolist()
545
  scores = scores[0].tolist()
 
552
  out.append({"path": r.get("path"), "text": r.get("text"), "score": float(sc)})
553
  return {"results": out}
554
 
555
+
556
+ # --------------------------------------------------------------------------- #
557
+ # ARTIFACTS EXPORT (gzip)
558
+ # --------------------------------------------------------------------------- #
559
@fastapi_app.get("/artifacts/{project_id}/dataset")
def download_dataset(project_id: str):
    """Stream the project's dataset directory as a gzip tarball."""
    _, dataset_dir, _ = _proj_dirs(project_id)
    if not os.path.isdir(dataset_dir):
        raise HTTPException(status_code=404, detail="Dataset introuvable")
    archive = _tar_dir_to_bytes(dataset_dir)
    headers = {"Content-Disposition": f'attachment; filename="{project_id}_dataset.tgz"'}
    return StreamingResponse(io.BytesIO(archive), media_type="application/gzip", headers=headers)
567
+
568
 
569
@fastapi_app.get("/artifacts/{project_id}/faiss")
def download_faiss(project_id: str):
    """Stream the project's FAISS index directory as a gzip tarball."""
    _, _, faiss_dir = _proj_dirs(project_id)
    if not os.path.isdir(faiss_dir):
        raise HTTPException(status_code=404, detail="FAISS introuvable")
    archive = _tar_dir_to_bytes(faiss_dir)
    headers = {"Content-Disposition": f'attachment; filename="{project_id}_faiss.tgz"'}
    return StreamingResponse(io.BytesIO(archive), media_type="application/gzip", headers=headers)
577
+
578
 
579
+ # --------------------------------------------------------------------------- #
580
+ # GRADIO UI (facultatif simple test)
581
+ # --------------------------------------------------------------------------- #
582
def _ui_index(project_id: str, sample_text: str):
    """Gradio callback: launch a one-file sample indexing job."""
    sample_files = [{"path": "sample.txt", "text": sample_text}]
    try:
        request = IndexRequest(
            project_id=project_id,
            files=[FileItem(**item) for item in sample_files],
        )
    except Exception as e:
        return f"Erreur validation : {e}"
    try:
        response = index(request)
    except Exception as e:
        return f"Erreur index: {e}"
    return f"Job lancé: {response['job_id']}"
593
+
594
 
595
def _ui_search(project_id: str, query: str, k: int):
    """Gradio callback: run a search and pretty-print the JSON results."""
    try:
        payload = SearchRequest(project_id=project_id, query=query, k=int(k))
        results = search(payload)
        return json.dumps(results, ensure_ascii=False, indent=2)
    except Exception as e:
        return f"Erreur recherche : {e}"
601
+
602
+
603
import gradio as gr

# Minimal test UI mounted at /ui: one panel to launch a sample indexing job,
# one panel to query the resulting index.
with gr.Blocks(title="Remote Indexer (Async – Optimisé)", analytics_enabled=False) as ui:
    gr.Markdown("## Remote Indexer – Optimisé (FAISS quantisé, mmap, texte optionnel)")
    with gr.Row():
        pid = gr.Textbox(label="Project ID", value="DEMO")
        txt = gr.Textbox(label="Texte d’exemple", lines=4, value="Alpha bravo charlie delta echo foxtrot.")
    btn_idx = gr.Button("Lancer index (sample)")
    out_idx = gr.Textbox(label="Résultat")
    btn_idx.click(_ui_index, inputs=[pid, txt], outputs=[out_idx])

    with gr.Row():
        q = gr.Textbox(label="Query", value="alpha")
        k = gr.Slider(1, 20, value=5, step=1, label="Top‑K")
    btn_q = gr.Button("Rechercher")
    out_q = gr.Code(label="Résultats")
    btn_q.click(_ui_search, inputs=[pid, q, k], outputs=[out_q])

# Mount the Gradio app onto the FastAPI application under /ui.
fastapi_app = gr.mount_gradio_app(fastapi_app, ui, path="/ui")
622
 
623
+
624
# --------------------------------------------------------------------------- #
# MAIN
# --------------------------------------------------------------------------- #
if __name__ == "__main__":
    import uvicorn

    # PORT is also read at module import time in the config section; re-read
    # here so a direct `python main.py` honours the current environment.
    PORT = int(os.getenv("PORT", "7860"))
    LOG.info("Démarrage Uvicorn – port %s – UI à /ui", PORT)
    uvicorn.run(fastapi_app, host="0.0.0.0", port=PORT)