iljung1106 committed · commit 93d1be8 · 1 parent: 5570c3c
Grad CAM to XGrad CAM

Files changed:
- app/visualization.py (+17 -10)
- webui_gradio.py (+8 -8)
app/visualization.py
CHANGED
@@ -71,14 +71,17 @@ def _get_branch_weights(encoder, x: torch.Tensor) -> Dict[str, float]:
     }
 
 
-def _compute_gradcam(
+def _compute_xgradcam(
     encoder,
     x: torch.Tensor,
     target_layer_name: str = "b3",
 ) -> np.ndarray:
     """
-    Compute Grad-CAM heatmap for a ViewEncoder.
-
+    Compute XGrad-CAM heatmap for a ViewEncoder.
+    XGrad-CAM is an improved variant that uses element-wise gradient-activation
+    products normalized by activation sums, providing better localization.
+
+    Reference: Axiom-based Grad-CAM (Fu et al., BMVC 2020)
     Returns a heatmap as numpy array [H, W] normalized to [0, 1].
     """
     # Storage for activations and gradients
@@ -86,10 +89,10 @@ def _compute_gradcam(
     gradients = {}
 
     def forward_hook(module, input, output):
-        activations["value"] = output.detach()
+        activations["value"] = output.detach().clone()
 
     def backward_hook(module, grad_input, grad_output):
-        gradients["value"] = grad_output[0].detach()
+        gradients["value"] = grad_output[0].detach().clone()
 
     # Get the target layer
     target_layer = getattr(encoder, target_layer_name, None)
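Note on the hook change: detach() alone returns a view that shares storage with the live tensor, so a later in-place op could silently alter the captured value; detach().clone() takes a real copy. A minimal sketch of the same capture pattern on a toy layer (not the repo's ViewEncoder):

import torch
import torch.nn as nn

# Toy stand-in for encoder.b3; `store` mirrors the activations/gradients dicts.
layer = nn.Conv2d(3, 8, kernel_size=3, padding=1)
store = {}

fh = layer.register_forward_hook(
    lambda mod, inp, out: store.update(acts=out.detach().clone())
)
bh = layer.register_full_backward_hook(
    lambda mod, gin, gout: store.update(grads=gout[0].detach().clone())
)

layer(torch.randn(1, 3, 32, 32)).norm().backward()  # any scalar objective works
fh.remove(); bh.remove()                            # drop hooks once captured
print(store["acts"].shape, store["grads"].shape)    # both torch.Size([1, 8, 32, 32])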
@@ -124,8 +127,12 @@ def _compute_gradcam(
     if acts is None or grads is None:
         return np.zeros((x.shape[2], x.shape[3]), dtype=np.float32)
 
-    # Grad-CAM weights: global average pooling of the gradients
-    weights = grads.mean(dim=(2, 3), keepdim=True)  # [B, C, 1, 1]
+    # XGrad-CAM: weights = sum(grads * acts, spatial) / (sum(acts, spatial) + eps)
+    # This normalizes by the activation magnitude, improving localization
+    grad_act_product = grads * acts  # [B, C, H, W]
+    sum_grad_act = grad_act_product.sum(dim=(2, 3), keepdim=True)  # [B, C, 1, 1]
+    sum_acts = acts.sum(dim=(2, 3), keepdim=True) + 1e-7  # [B, C, 1, 1]
+    weights = sum_grad_act / sum_acts  # [B, C, 1, 1]
 
     # Weighted combination of activations
     cam = (weights * acts).sum(dim=1, keepdim=True)  # [B, 1, H, W]
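Taken in isolation, this hunk swaps Grad-CAM's channel weights (a global average pool of the gradients) for XGrad-CAM's activation-normalized weights; everything downstream of weights is untouched. A standalone sketch on dummy tensors, assuming the usual ReLU and min-max steps behind the docstring's "normalized to [0, 1]" (those lines sit outside this hunk):

import torch

B, C, H, W = 1, 4, 7, 7
acts = torch.rand(B, C, H, W)    # captured activations A
grads = torch.randn(B, C, H, W)  # captured gradients dL/dA

# Grad-CAM (old): w_c = mean over (h, w) of dL/dA_c
w_old = grads.mean(dim=(2, 3), keepdim=True)                 # [B, C, 1, 1]

# XGrad-CAM (new): w_c = sum(dL/dA_c * A_c) / (sum(A_c) + eps)
w_new = (grads * acts).sum(dim=(2, 3), keepdim=True) / (
    acts.sum(dim=(2, 3), keepdim=True) + 1e-7
)                                                            # [B, C, 1, 1]

# Identical downstream combination for both variants
cam = torch.relu((w_new * acts).sum(dim=1, keepdim=True))    # [B, 1, H, W]
cam = (cam - cam.min()) / (cam.max() - cam.min() + 1e-8)     # -> [0, 1]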
@@ -242,7 +249,7 @@ def analyze_views(
 
         # Grad-CAM
         try:
-            heatmap = _compute_gradcam(enc, x.clone(), target_layer_name="b3")
+            heatmap = _compute_xgradcam(enc, x.clone(), target_layer_name="b3")
             if original_images.get(k) is not None:
                 gradcam_heatmaps[k] = _overlay_heatmap(original_images[k], heatmap, alpha=0.5)
             else:
@@ -263,11 +270,11 @@ def analyze_views(
 
 def format_view_weights_html(analysis: ViewAnalysis) -> str:
     """Format view weights as clean HTML with styled progress bars."""
-    # View labels with descriptions
+    # View labels with descriptions
     view_info = {
         "whole": ("Whole Image", "#4CAF50"),  # green
         "face": ("Face", "#2196F3"),  # blue
-        "eyes": ("Eye", "#FF9800"),  # orange
+        "eyes": ("Eyes", "#FF9800"),  # orange
     }
 
     html_parts = ['<div style="font-family: sans-serif; padding: 10px;">']
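_overlay_heatmap itself is not part of this diff. For orientation, a hypothetical blend in the same spirit, colorizing the [0, 1] heatmap and alpha-compositing it over the source image at alpha=0.5; the repo's actual implementation may differ:

import numpy as np
from PIL import Image
from matplotlib import cm

def overlay_heatmap(img: Image.Image, heatmap: np.ndarray, alpha: float = 0.5) -> Image.Image:
    # heatmap: [H, W] float array in [0, 1]; cm.jet returns RGBA floats in [0, 1]
    colored = (cm.jet(heatmap)[:, :, :3] * 255).astype(np.uint8)
    hm = Image.fromarray(colored).resize(img.size, Image.BILINEAR)
    return Image.blend(img.convert("RGB"), hm, alpha)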
webui_gradio.py
CHANGED
@@ -298,7 +298,7 @@ def classify_and_analyze(
         return ("❌ Provide a whole image.",) + empty_result[1:]
 
     try:
-        # Extract face and eye
+        # Extract face and eyes
         face_pil = None
         eye_pil = None
         if ex is not None:
@@ -319,7 +319,7 @@ def classify_and_analyze(
         preds = topk_predictions_unique_labels(db, z, topk=int(topk))
         rows = [[name, float(score)] for (name, score) in preds]
 
-        # Analysis (Grad-CAM + view weights)
+        # Analysis (XGrad-CAM + view weights)
         views = {"whole": wt, "face": ft, "eyes": et}
         original_images = {"whole": w, "face": face_pil, "eyes": eye_pil}
         analysis = analyze_views(lm.model, views, original_images, lm.device)
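topk_predictions_unique_labels is defined elsewhere in the repo; only its call site is visible here. A hedged sketch of what a unique-label top-k typically does (the flat names/protos layout below is an assumption, not the repo's db format): scan cosine similarities in descending order, keeping each artist's best-scoring prototype until topk distinct names are collected.

import numpy as np

def topk_unique(names: list, protos: np.ndarray, z: np.ndarray, topk: int = 5):
    # names[i] labels prototype protos[i]; z is the query embedding [D]
    sims = protos @ z / (np.linalg.norm(protos, axis=1) * np.linalg.norm(z) + 1e-8)
    out, seen = [], set()
    for i in np.argsort(-sims):              # descending cosine similarity
        if names[i] not in seen:
            seen.add(names[i])
            out.append((names[i], float(sims[i])))
            if len(out) == topk:
                break
    return out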
@@ -546,19 +546,19 @@ def build_ui() -> gr.Blocks:
             gr.Markdown("### 🎯 Classification Results")
             table = gr.Dataframe(headers=["Artist", "Similarity"], datatype=["str", "number"], interactive=False)
 
-            # Grad-CAM heatmaps
-            gr.Markdown("### 🔥 Grad-CAM Attention Maps")
+            # XGrad-CAM heatmaps
+            gr.Markdown("### 🔥 XGrad-CAM Attention Maps")
             gr.Markdown("*Where the model focused in each view:*")
             with gr.Row():
                 gcam_whole = gr.Image(label="Whole Image", type="pil")
                 gcam_face = gr.Image(label="Face", type="pil")
-                gcam_eye = gr.Image(label="Eye", type="pil")
+                gcam_eye = gr.Image(label="Eyes", type="pil")
 
             # Extracted views
             gr.Markdown("### 🖼️ Auto-Extracted Views")
             with gr.Row():
                 face_prev = gr.Image(label="Detected Face", type="pil")
-                eye_prev = gr.Image(label="Detected Eye", type="pil")
+                eye_prev = gr.Image(label="Detected Eyes", type="pil")
 
         run_btn.click(
             classify_and_analyze,
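The run_btn.click(...) call continues beyond this hunk with inputs/outputs lists; in Gradio, the callback's return tuple maps positionally onto the outputs components. A self-contained toy of the same wiring pattern (component and callback names here are illustrative, not the repo's):

import gradio as gr

def fake_analyze(img, topk):
    # stand-in for classify_and_analyze: one return value per output component
    return f"ok (topk={int(topk)})", img

with gr.Blocks() as demo:
    whole_in = gr.Image(type="pil")
    topk_in = gr.Slider(1, 10, value=5, step=1)
    run_btn = gr.Button("Analyze")
    status = gr.Markdown()
    gcam_whole = gr.Image(type="pil")
    run_btn.click(fake_analyze, inputs=[whole_in, topk_in], outputs=[status, gcam_whole])

demo.launch()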
@@ -571,8 +571,8 @@ def build_ui() -> gr.Blocks:
             "### ⚠️ Temporary Prototypes Only\n"
             "Add prototypes using random triplet combinations and K-means clustering (same as eval process).\n"
             "1. Upload multiple whole images\n"
-            "2. Face and eye are auto-extracted from each\n"
-            "3. Random triplets (whole + face + eye) are created\n"
+            "2. Face and eyes are auto-extracted from each\n"
+            "3. Random triplets (whole + face + eyes) are created\n"
             "4. K-means clustering creates K prototype centers\n\n"
             "**These prototypes are session-only** — lost when the Space restarts."
         )
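Steps 3 and 4 of this note compress the prototype-building pipeline into two lines. A hedged sketch of the K-means step (build_prototypes is a hypothetical helper; the repo's eval pipeline defines the real one):

import numpy as np
from sklearn.cluster import KMeans

def build_prototypes(embeddings: np.ndarray, k: int = 4) -> np.ndarray:
    # embeddings: [N, D] triplet embeddings for one artist
    km = KMeans(n_clusters=min(k, len(embeddings)), n_init=10).fit(embeddings)
    centers = km.cluster_centers_                                    # [k, D]
    return centers / np.linalg.norm(centers, axis=1, keepdims=True)  # unit-norm prototypes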