iljung1106 committed

Commit c61411c · Parent(s): 07f1b5a

Disabled loading CUDA on main process
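The net effect of the change is that importing the training module (or starting the web UI) no longer initializes CUDA in the main process, which ZeroGPU Spaces forbid. A minimal smoke test for that property, not part of this commit and assuming a CUDA build of PyTorch, could be:

    import torch

    import train_style_ddp  # previously probed CUDA (is_available / is_bf16_supported) at import time

    # torch.cuda.is_initialized() reports whether a CUDA context was created in this
    # process; on a ZeroGPU Space the main process must keep this False.
    assert not torch.cuda.is_initialized(), "CUDA was initialized at import time"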
app/model_io.py CHANGED
@@ -100,7 +100,10 @@ def embed_triview(
         views[k] = vb
         masks[k] = torch.ones(1, dtype=torch.bool, device=lm.device)
 
-    with torch.no_grad(), torch.amp.autocast("cuda", dtype=getattr(__import__("train_style_ddp"), "amp_dtype", torch.float16), enabled=(lm.device.type == "cuda")):
+    # Use lazy dtype detection to avoid CUDA init at import time (ZeroGPU compatibility)
+    import train_style_ddp as _ts
+    _dtype = _ts._get_amp_dtype() if hasattr(_ts, "_get_amp_dtype") else torch.float16
+    with torch.no_grad(), torch.amp.autocast("cuda", dtype=_dtype, enabled=(lm.device.type == "cuda")):
         z, _, _ = lm.model(views, masks)
         z = torch.nn.functional.normalize(z.float(), dim=1)
     return z.squeeze(0).detach().cpu()
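The old one-liner read the autocast dtype from the eagerly computed module-level amp_dtype; the new code asks the lazy helper and falls back to float16 when an older copy of train_style_ddp lacks it. Note that on a CPU device (the forced path on Spaces, see webui_gradio.py below) the context is entered with enabled=False, so the dtype is never applied and CUDA is never touched. A small standalone sketch of that behavior, independent of this repo:

    import torch

    x = torch.randn(2, 4)          # CPU tensor
    lin = torch.nn.Linear(4, 3)    # CPU module

    # With enabled=False the "cuda" autocast context is a no-op, so this runs on a
    # CPU-only machine (or a ZeroGPU main process) without initializing CUDA.
    with torch.no_grad(), torch.amp.autocast("cuda", dtype=torch.float16, enabled=False):
        y = lin(x)

    print(y.dtype)  # torch.float32 -- nothing was autocast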
scripts/train_style_ddp.py CHANGED
@@ -74,10 +74,24 @@ torch.backends.cudnn.benchmark = True
 if hasattr(torch, "set_float32_matmul_precision"):
     torch.set_float32_matmul_precision("high")
 
-if torch.cuda.is_available() and torch.cuda.is_bf16_supported():
-    amp_dtype = torch.bfloat16
-else:
-    amp_dtype = torch.float16
+# Lazy amp_dtype detection to avoid CUDA init at import time (required for HF Spaces ZeroGPU)
+_amp_dtype_cache = None
+
+def _get_amp_dtype():
+    global _amp_dtype_cache
+    if _amp_dtype_cache is None:
+        try:
+            if torch.cuda.is_available() and torch.cuda.is_bf16_supported():
+                _amp_dtype_cache = torch.bfloat16
+            else:
+                _amp_dtype_cache = torch.float16
+        except Exception:
+            _amp_dtype_cache = torch.float16
+    return _amp_dtype_cache
+
+# For backwards compatibility, amp_dtype is accessed via property-like usage
+# but we keep a module-level name that can be imported (defaults to float16, updated on first GPU use)
+amp_dtype = torch.float16  # safe default; actual dtype picked at runtime via _get_amp_dtype()
 
 # --- PIL safety/verbosity tweaks ---
 ImageFile.LOAD_TRUNCATED_IMAGES = True
@@ -1056,7 +1070,7 @@ def ddp_train_worker(rank: int, world_size: int):
             }
             masks = {k: v.to(device, non_blocking=True) for k,v in batch["masks"].items()}
 
-            with torch.amp.autocast('cuda', dtype=amp_dtype):
+            with torch.amp.autocast('cuda', dtype=_get_amp_dtype()):
                 z_fused, z_views_dict, W = model(views, masks)
 
             Z_all, Y_all, G_all = [], [], []
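One caveat with the backwards-compatibility shim above: the module-level amp_dtype is now a static torch.float16 default and, at least in the hunks shown here, nothing ever updates it, so code that still imports the name will not see bfloat16 on capable GPUs. Callers that care about the actual dtype should go through _get_amp_dtype(), as the training loop now does. A hedged usage sketch:

    import torch
    import train_style_ddp as ts

    # Static default kept only so old imports keep working.
    print(ts.amp_dtype)          # torch.float16, regardless of hardware

    # Probes CUDA the first time it is called and caches the result,
    # so the import itself stays CUDA-free.
    print(ts._get_amp_dtype())   # torch.bfloat16 on bf16-capable GPUs, else torch.float16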
train_style_ddp.py CHANGED
@@ -74,10 +74,24 @@ torch.backends.cudnn.benchmark = True
 if hasattr(torch, "set_float32_matmul_precision"):
     torch.set_float32_matmul_precision("high")
 
-if torch.cuda.is_available() and torch.cuda.is_bf16_supported():
-    amp_dtype = torch.bfloat16
-else:
-    amp_dtype = torch.float16
+# Lazy amp_dtype detection to avoid CUDA init at import time (required for HF Spaces ZeroGPU)
+_amp_dtype_cache = None
+
+def _get_amp_dtype():
+    global _amp_dtype_cache
+    if _amp_dtype_cache is None:
+        try:
+            if torch.cuda.is_available() and torch.cuda.is_bf16_supported():
+                _amp_dtype_cache = torch.bfloat16
+            else:
+                _amp_dtype_cache = torch.float16
+        except Exception:
+            _amp_dtype_cache = torch.float16
+    return _amp_dtype_cache
+
+# For backwards compatibility, amp_dtype is accessed via property-like usage
+# but we keep a module-level name that can be imported (defaults to float16, updated on first GPU use)
+amp_dtype = torch.float16  # safe default; actual dtype picked at runtime via _get_amp_dtype()
 
 # --- PIL safety/verbosity tweaks ---
 ImageFile.LOAD_TRUNCATED_IMAGES = True
@@ -1056,7 +1070,7 @@ def ddp_train_worker(rank: int, world_size: int):
             }
             masks = {k: v.to(device, non_blocking=True) for k,v in batch["masks"].items()}
 
-            with torch.amp.autocast('cuda', dtype=amp_dtype):
+            with torch.amp.autocast('cuda', dtype=_get_amp_dtype()):
                 z_fused, z_views_dict, W = model(views, masks)
 
             Z_all, Y_all, G_all = [], [], []
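The repository-root copy of the file receives the identical change as scripts/train_style_ddp.py above. The explicit global cache is one way to write the probe-once pattern; a hypothetical equivalent using functools.lru_cache (not what the commit uses) would be:

    import functools

    import torch


    @functools.lru_cache(maxsize=1)
    def get_amp_dtype() -> torch.dtype:
        """Pick the autocast dtype the first time it is needed, never at import time."""
        try:
            if torch.cuda.is_available() and torch.cuda.is_bf16_supported():
                return torch.bfloat16
        except Exception:
            pass
        return torch.float16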
webui_gradio.py CHANGED
@@ -15,6 +15,9 @@ try:
 except Exception:  # noqa: BLE001
     spaces = None
 
+# Detect if running on HF Spaces (ZeroGPU requires special handling)
+_ON_SPACES = bool(os.getenv("SPACE_ID") or os.getenv("HF_SPACE"))
+
 def _patch_fastapi_starlette_middleware_unpack() -> None:
     """
     Work around FastAPI/Starlette version mismatches where Starlette's Middleware
@@ -230,6 +233,11 @@ def load_all(ckpt_path: str, proto_path: str, device: str) -> str:
         return "❌ No checkpoint selected."
     if not proto_path:
         return "❌ No prototype DB selected."
+
+    # Force CPU on HF Spaces (ZeroGPU doesn't allow CUDA init in main process)
+    if _ON_SPACES:
+        device = "cpu"
+
     try:
         lm = load_style_model(ckpt_path, device=device)
         db = load_prototype_db(proto_path, try_dataset_dir=str(ROOT / "dataset"))
@@ -459,12 +467,12 @@ if __name__ == "__main__":
     _patch_fastapi_starlette_middleware_unpack()
 
     try:
-        _launch_compat(demo, server_name=args.host, server_port=args.port, show_api=False, share=args.share)
+        _launch_compat(demo, server_name=args.host, server_port=args.port, show_api=False, share=args.share, ssr_mode=False)
     except ValueError as e:
         # Some environments block localhost checks; fall back to share link.
         msg = str(e)
         if "localhost is not accessible" in msg and not args.share:
-            _launch_compat(demo, server_name=args.host, server_port=args.port, show_api=False, share=True)
+            _launch_compat(demo, server_name=args.host, server_port=args.port, show_api=False, share=True, ssr_mode=False)
         else:
             raise
 
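For completeness: on ZeroGPU the main process stays CUDA-free (hence the forced device = "cpu" above), and GPU work is normally delegated to functions wrapped with the spaces.GPU decorator that this file already imports defensively at the top. A minimal sketch of that pattern, with an illustrative function name not taken from this repo:

    try:
        import spaces  # provided on HF Spaces; exposes the ZeroGPU decorator
    except Exception:  # noqa: BLE001
        spaces = None


    def embed_on_gpu(image):
        # Inside a spaces.GPU-wrapped call a GPU is attached, so moving tensors or the
        # model to CUDA here is allowed even though the main process never initialized it.
        ...


    if spaces is not None:
        embed_on_gpu = spaces.GPU(embed_on_gpu)

The added ssr_mode=False simply opts out of Gradio's server-side rendering at launch time, a launch() option available in recent Gradio releases.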