Spaces:

prithivMLmods
/

Super-OCRs-Demo

Running on Zero

App Files Files Community

prithivMLmods committed 16 days ago

Commit

7dc9ea8

verified ·

1 Parent(s): d8886fd

update app [.]

Browse files

Files changed (1) hide show

app.py +73 -28

app.py CHANGED Viewed

@@ -1,4 +1,5 @@
 import os
 import random
 import uuid
 import json
@@ -16,17 +17,7 @@ import torch
 import numpy as np
 from PIL import Image, ImageDraw, ImageOps
 import requests
-# Import spaces if available, otherwise mock it
-try:
-    import spaces
-except ImportError:
-    class spaces:
-        @staticmethod
-        def GPU(func):
-            def wrapper(*args, **kwargs):
-                return func(*args, **kwargs)
-            return wrapper
 from transformers import (
     AutoModel,
@@ -35,7 +26,8 @@ from transformers import (
     AutoProcessor,
     TextIteratorStreamer,
     HunYuanVLForConditionalGeneration,
-    Qwen2_5_VLForConditionalGeneration,
 )
 from gradio.themes import Soft
 from gradio.themes.utils import colors, fonts, sizes
@@ -52,7 +44,6 @@ if torch.cuda.is_available():
     print("current device:", torch.cuda.current_device())
     print("device name:", torch.cuda.get_device_name(torch.cuda.current_device()))
-# --- Theme Definition ---
 colors.steel_blue = colors.Color(
     name="steel_blue",
     c50="#EBF3F8",
@@ -148,7 +139,7 @@ print(f"Loading {MODEL_HUNYUAN}...")
 processor_hy = AutoProcessor.from_pretrained(MODEL_HUNYUAN, use_fast=False)
 model_hy = HunYuanVLForConditionalGeneration.from_pretrained(
     MODEL_HUNYUAN,
-    attn_implementation="eager", # Use eager to avoid SDPA issues if old torch
     torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
     device_map="auto"
 ).eval()
@@ -161,12 +152,40 @@ model_x = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     MODEL_ID_X,
     trust_remote_code=True,
     torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
-    device_map="auto" # or .to(device)
 ).eval()
-print("✅ All models loaded successfully.")
-# --- Helper Functions ---
 def clean_repeated_substrings(text):
     """Clean repeated substrings in text (for Hunyuan)"""
@@ -193,8 +212,6 @@ def find_result_image(path):
                 print(f"Error opening result image: {e}")
     return None
-# --- Main Inference Logic ---
 @spaces.GPU
 def run_model(
     model_choice,
@@ -359,7 +376,6 @@ def run_model(
             }
         ]
-        # Prepare inputs for Qwen2.5-VL based architecture
         text = processor_x.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
         inputs = processor_x(
@@ -388,7 +404,33 @@ def run_model(
             buffer += new_text.replace("<|im_end|>", "")
             yield buffer, None
-# --- Gradio UI ---
 image_examples = [
     ["examples/1.jpg"],
@@ -398,13 +440,19 @@ image_examples = [
 with gr.Blocks(css=css, theme=steel_blue_theme) as demo:
     gr.Markdown("# **Super-OCRs-Demo**", elem_id="main-title")
-    gr.Markdown("Compare DeepSeek-OCR, Dots.OCR, HunyuanOCR, and Nanonets-OCR2-3B in one space.")
     with gr.Row():
         with gr.Column(scale=1):
             # Global Inputs
             model_choice = gr.Dropdown(
-                choices=["HunyuanOCR", "DeepSeek-OCR-Latest-BF16.I64", "Dots.OCR-Latest-BF16", "Nanonets-OCR2-3B"],
                 label="Select Model",
                 value="DeepSeek-OCR-Latest-BF16.I64"
             )
@@ -414,7 +462,7 @@ with gr.Blocks(css=css, theme=steel_blue_theme) as demo:
             with gr.Group(visible=True) as ds_group:
                 ds_model_size = gr.Dropdown(
                     choices=["Tiny", "Small", "Base", "Large", "Gundam (Recommended)"],
-                    value="Large", label="DeepSeek Resolution"
                 )
                 ds_task_type = gr.Dropdown(
                     choices=["Free OCR", "Convert to Markdown", "Parse Figure", "Locate Object by Reference"],
@@ -422,9 +470,8 @@ with gr.Blocks(css=css, theme=steel_blue_theme) as demo:
                 )
                 ds_ref_text = gr.Textbox(label="Reference Text (for 'Locate' task only)", placeholder="e.g., the title, red car...", visible=False)
-            # General Prompt (for Dots/Hunyuan/Nanonets)
             with gr.Group(visible=False) as prompt_group:
-                custom_prompt = gr.Textbox(label="Custom Query / Prompt", placeholder="Extract text...", lines=2, value="OCR the content precisely")
             with gr.Accordion("Advanced Settings", open=False):
                 max_new_tokens = gr.Slider(minimum=128, maximum=8192, value=2048, step=128, label="Max New Tokens")
@@ -440,8 +487,6 @@ with gr.Blocks(css=css, theme=steel_blue_theme) as demo:
             output_text = gr.Textbox(label="Recognized Text / Markdown", lines=15, show_copy_button=True)
             output_image = gr.Image(label="Visual Grounding Result (DeepSeek Only)", type="pil")
-    # --- UI Event Logic ---
     def update_visibility(model):
         """Return visibility updates for the two model-specific input groups.

         The first returned group is visible only when the DeepSeek model is
         selected; the second is visible for every other selection.
         """
         deepseek_selected = model == "DeepSeek-OCR-Latest-BF16.I64"
         ds_update = gr.Group(visible=deepseek_selected)
         prompt_update = gr.Group(visible=not deepseek_selected)
         return ds_update, prompt_update

 import os
+import sys
 import random
 import uuid
 import json
 import numpy as np
 from PIL import Image, ImageDraw, ImageOps
 import requests
+from huggingface_hub import snapshot_download
 from transformers import (
     AutoModel,
     AutoProcessor,
     TextIteratorStreamer,
     HunYuanVLForConditionalGeneration,
+    Qwen2_5_VLForConditionalGeneration,
+    GenerationConfig
 )
 from gradio.themes import Soft
 from gradio.themes.utils import colors, fonts, sizes
     print("current device:", torch.cuda.current_device())
     print("device name:", torch.cuda.get_device_name(torch.cuda.current_device()))
 colors.steel_blue = colors.Color(
     name="steel_blue",
     c50="#EBF3F8",
 processor_hy = AutoProcessor.from_pretrained(MODEL_HUNYUAN, use_fast=False)
 model_hy = HunYuanVLForConditionalGeneration.from_pretrained(
     MODEL_HUNYUAN,
+    attn_implementation="eager",
     torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
     device_map="auto"
 ).eval()
     MODEL_ID_X,
     trust_remote_code=True,
     torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
+    device_map="auto"
 ).eval()
+# 5. NVIDIA-Nemotron-Parse-v1.1
+print("Downloading NVIDIA-Nemotron snapshot to ensure all scripts are present...")
+try:
+    NEMO_DIR = snapshot_download(repo_id="nvidia/NVIDIA-Nemotron-Parse-v1.1")
+    print(f"Model downloaded to: {NEMO_DIR}")
+    sys.path.append(NEMO_DIR)
+    # Import postprocessing from the downloaded directory
+    # Note: Using try/except in case imports fail, though usually required for this model
+    try:
+        from postprocessing import extract_classes_bboxes, transform_bbox_to_original, postprocess_text
+    except ImportError:
+        print("Warning: Could not import Nemotron postprocessing scripts. Fallback to raw decode.")
+    MODEL_NEMO = "nvidia/NVIDIA-Nemotron-Parse-v1.1"
+    print(f"Loading {MODEL_NEMO}...")
+    processor_nemo = AutoProcessor.from_pretrained(NEMO_DIR, trust_remote_code=True)
+    model_nemo = AutoModel.from_pretrained(
+        NEMO_DIR,
+        trust_remote_code=True,
+        torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32
+    ).to(device).eval()
+    # Load generation config
+    gen_config_nemo = GenerationConfig.from_pretrained(NEMO_DIR, trust_remote_code=True)
+    NEMO_AVAILABLE = True
+except Exception as e:
+    print(f"Error loading NVIDIA-Nemotron: {e}")
+    NEMO_AVAILABLE = False
+print("✅ All models loaded successfully.")
 def clean_repeated_substrings(text):
     """Clean repeated substrings in text (for Hunyuan)"""
                 print(f"Error opening result image: {e}")
     return None
 @spaces.GPU
 def run_model(
     model_choice,
             }
         ]
         text = processor_x.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
         inputs = processor_x(
             buffer += new_text.replace("<|im_end|>", "")
             yield buffer, None
+    # === NVIDIA-Nemotron-Parse-v1.1 Logic ===
+    elif model_choice == "NVIDIA-Nemotron-Parse-v1.1":
+        if not NEMO_AVAILABLE:
+            yield "Nemotron model failed to load. Check logs.", None
+            return
+        # Default Prompt for Nemotron markdown extraction
+        task_prompt = "</s><s><predict_bbox><predict_classes><output_markdown>"
+        # If user provides a custom prompt, we might want to use it,
+        # but Nemotron is highly specialized. Let's stick to the default strict prompt
+        # unless we want to support just raw text. For this demo, we use the standard full pipeline.
+        inputs = processor_nemo(images=[image], text=task_prompt, return_tensors="pt").to(model_nemo.device)
+        with torch.no_grad():
+            outputs = model_nemo.generate(
+                **inputs,
+                generation_config=gen_config_nemo,
+                max_new_tokens=max_new_tokens
+            )
+        generated_text = processor_nemo.batch_decode(outputs, skip_special_tokens=True)[0]
+        # The output might contain the prompt or special tokens depending on exact decoding
+        # The prompt used </s><s> which usually gets stripped by skip_special_tokens=True
+        yield generated_text, None
 image_examples = [
     ["examples/1.jpg"],
 with gr.Blocks(css=css, theme=steel_blue_theme) as demo:
     gr.Markdown("# **Super-OCRs-Demo**", elem_id="main-title")
+    gr.Markdown("Compare DeepSeek-OCR, Dots.OCR, HunyuanOCR, Nanonets-OCR2-3B, and NVIDIA-Nemotron-Parse-v1.1")
     with gr.Row():
         with gr.Column(scale=1):
             # Global Inputs
             model_choice = gr.Dropdown(
+                choices=[
+                    "DeepSeek-OCR-Latest-BF16.I64",
+                    "Dots.OCR-Latest-BF16",
+                    "HunyuanOCR",
+                    "Nanonets-OCR2-3B",
+                    "NVIDIA-Nemotron-Parse-v1.1"
+                ],
                 label="Select Model",
                 value="DeepSeek-OCR-Latest-BF16.I64"
             )
             with gr.Group(visible=True) as ds_group:
                 ds_model_size = gr.Dropdown(
                     choices=["Tiny", "Small", "Base", "Large", "Gundam (Recommended)"],
+                    value="Gundam (Recommended)", label="DeepSeek Resolution"
                 )
                 ds_task_type = gr.Dropdown(
                     choices=["Free OCR", "Convert to Markdown", "Parse Figure", "Locate Object by Reference"],
                 )
                 ds_ref_text = gr.Textbox(label="Reference Text (for 'Locate' task only)", placeholder="e.g., the title, red car...", visible=False)
             with gr.Group(visible=False) as prompt_group:
+                custom_prompt = gr.Textbox(label="Custom Query / Prompt", placeholder="Extract text...", lines=2, value="Convert to Markdown precisely.")
             with gr.Accordion("Advanced Settings", open=False):
                 max_new_tokens = gr.Slider(minimum=128, maximum=8192, value=2048, step=128, label="Max New Tokens")
             output_text = gr.Textbox(label="Recognized Text / Markdown", lines=15, show_copy_button=True)
             output_image = gr.Image(label="Visual Grounding Result (DeepSeek Only)", type="pil")
     def update_visibility(model):
         """Return visibility updates for the two model-specific input groups.

         Args:
             model: The model name currently selected in the dropdown.

         Returns:
             A pair of ``gr.Group`` updates. The first is visible only for the
             DeepSeek model. The second (presumably the custom prompt group;
             confirm against the event wiring) is visible only for models that
             take a free-text prompt.
         """
         is_ds = (model == "DeepSeek-OCR-Latest-BF16.I64")
         # Nemotron always runs with its fixed task prompt and never reads the
         # custom prompt, so showing the prompt box for it would be a dead control.
         uses_prompt = (not is_ds) and model != "NVIDIA-Nemotron-Parse-v1.1"
         return gr.Group(visible=is_ds), gr.Group(visible=uses_prompt)