Spaces:

prithivMLmods
/

Super-OCRs-Demo

Running on Zero

App Files Files Community

prithivMLmods committed 18 days ago

Commit

ac4e192

verified ·

1 Parent(s): 589ac46

update app

Browse files

Files changed (1) hide show

app.py +70 -8

app.py CHANGED Viewed

@@ -8,7 +8,6 @@ import re
 import tempfile
 import ast
 import html
-import spaces
 from threading import Thread
 from typing import Iterable, Optional
@@ -18,6 +17,17 @@ import numpy as np
 from PIL import Image, ImageDraw, ImageOps
 import requests
 from transformers import (
     AutoModel,
     AutoModelForCausalLM,
@@ -25,6 +35,7 @@ from transformers import (
     AutoProcessor,
     TextIteratorStreamer,
     HunYuanVLForConditionalGeneration,
 )
 from gradio.themes import Soft
 from gradio.themes.utils import colors, fonts, sizes
@@ -133,6 +144,17 @@ model_hy = HunYuanVLForConditionalGeneration.from_pretrained(
     device_map="auto"
 ).eval()
 print("✅ All models loaded successfully.")
 # --- Helper Functions ---
@@ -289,7 +311,7 @@ def run_model(
             {
                 "role": "user",
                 "content": [
-                    {"type": "image", "image": image}, # The processor handles PIL images in list if passed correctly
                     {"type": "text", "text": query},
                 ],
             }
@@ -305,7 +327,7 @@ def run_model(
             generated_ids = model_hy.generate(
                 **inputs,
                 max_new_tokens=max_new_tokens,
-                do_sample=False # Hunyuan OCR usually preferred greedy/beam
             )
         input_len = inputs.input_ids.shape[1]
@@ -315,6 +337,48 @@ def run_model(
         final_text = clean_repeated_substrings(output_text)
         yield final_text, None
 # --- Gradio UI ---
 image_examples = [
@@ -325,13 +389,13 @@ image_examples = [
 with gr.Blocks(css=css, theme=steel_blue_theme) as demo:
     gr.Markdown("# **Super-OCRs-Demo**", elem_id="main-title")
-    gr.Markdown("Compare **DeepSeek-OCR**, **Dots.OCR**, and **HunyuanOCR** in one space.")
     with gr.Row():
         with gr.Column(scale=1):
             # Global Inputs
             model_choice = gr.Radio(
-                choices=["HunyuanOCR", "DeepSeek-OCR-Latest-BF16.I64", "Dots.OCR-Latest-BF16"],
                 label="Select Model",
                 value="DeepSeek-OCR-Latest-BF16.I64"
             )
@@ -339,7 +403,6 @@ with gr.Blocks(css=css, theme=steel_blue_theme) as demo:
             # DeepSeek Specific Options
             with gr.Group(visible=True) as ds_group:
-                #gr.Markdown("### DeepSeek Settings")
                 ds_model_size = gr.Dropdown(
                     choices=["Tiny", "Small", "Base", "Large", "Gundam (Recommended)"],
                     value="Gundam (Recommended)", label="DeepSeek Resolution"
@@ -350,7 +413,7 @@ with gr.Blocks(css=css, theme=steel_blue_theme) as demo:
                 )
                 ds_ref_text = gr.Textbox(label="Reference Text (for 'Locate' task only)", placeholder="e.g., the title, red car...", visible=False)
-            # General Prompt (for Dots/Hunyuan)
             with gr.Group(visible=False) as prompt_group:
                 custom_prompt = gr.Textbox(label="Custom Query / Prompt", placeholder="Extract text...", lines=2)
@@ -365,7 +428,6 @@ with gr.Blocks(css=css, theme=steel_blue_theme) as demo:
             gr.Examples(examples=image_examples, inputs=image_input)
         with gr.Column(scale=2):
-            #gr.Markdown("## Output", elem_id="output-title")
             output_text = gr.Textbox(label="Recognized Text / Markdown", lines=15, show_copy_button=True)
             output_image = gr.Image(label="Visual Grounding Result (DeepSeek Only)", type="pil")

 import tempfile
 import ast
 import html
 from threading import Thread
 from typing import Iterable, Optional
 from PIL import Image, ImageDraw, ImageOps
 import requests
+# Import spaces if available; otherwise fall back to a no-op stand-in so the
+# app can still start where the `spaces` package is not installed.
+try:
+    import spaces
+except ImportError:
+    class spaces:
+        """Stand-in for the `spaces` package exposing only a no-op `GPU`."""
+        @staticmethod
+        def GPU(func=None, **_options):
+            """Return the decorated function unchanged.
+            Usable both bare (`@spaces.GPU`) and parameterised
+            (`@spaces.GPU(duration=...)`); any keyword options are ignored.
+            The function itself is returned instead of a wrapper, so its
+            name and docstring are kept and generator functions stay
+            detectable through `inspect.isgeneratorfunction`.
+            """
+            if func is None:
+                # Called with options only: hand back an identity decorator.
+                return lambda fn: fn
+            return func
 from transformers import (
     AutoModel,
     AutoModelForCausalLM,
     AutoProcessor,
     TextIteratorStreamer,
     HunYuanVLForConditionalGeneration,
+    Qwen2_5_VLForConditionalGeneration,
 )
 from gradio.themes import Soft
 from gradio.themes.utils import colors, fonts, sizes
     device_map="auto"
 ).eval()
+# 4. Nanonets-OCR2-3B: load its processor and model at module level; the model uses bfloat16 when CUDA is available and float32 otherwise, and is put in eval mode.
+MODEL_ID_X = "nanonets/Nanonets-OCR2-3B"
+print(f"Loading {MODEL_ID_X}...")
+processor_x = AutoProcessor.from_pretrained(MODEL_ID_X, trust_remote_code=True)
+model_x = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+    MODEL_ID_X,
+    trust_remote_code=True,
+    torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
+    device_map="auto" # placement is left to device_map; the alternative is to drop it and call .to(device)
+).eval()
 print("✅ All models loaded successfully.")
 # --- Helper Functions ---
             {
                 "role": "user",
                 "content": [
+                    {"type": "image", "image": image},
                     {"type": "text", "text": query},
                 ],
             }
             generated_ids = model_hy.generate(
                 **inputs,
                 max_new_tokens=max_new_tokens,
+                do_sample=False
             )
         input_len = inputs.input_ids.shape[1]
         final_text = clean_repeated_substrings(output_text)
         yield final_text, None
+    # === Nanonets-OCR2-3B Logic ===
+    elif model_choice == "Nanonets-OCR2-3B":
+        query = custom_prompt if custom_prompt else "Extract the text from this image."
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "image", "image": image},
+                    {"type": "text", "text": query},
+                ],
+            }
+        ]
+        # Prepare inputs for Qwen2.5-VL based architecture
+        text = processor_x.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+        inputs = processor_x(
+            text=[text],
+            images=[image],
+            padding=True,
+            return_tensors="pt",
+        ).to(model_x.device)
+        streamer = TextIteratorStreamer(processor_x, skip_prompt=True, skip_special_tokens=True)
+        generation_kwargs = {
+            **inputs,
+            "streamer": streamer,
+            "max_new_tokens": max_new_tokens,
+            "do_sample": True,
+            "temperature": temperature,
+            "top_p": top_p,
+            "top_k": int(top_k),
+        }
+        thread = Thread(target=model_x.generate, kwargs=generation_kwargs)
+        thread.start()
+        buffer = ""
+        for new_text in streamer:
+            buffer += new_text.replace("<|im_end|>", "")
+            yield buffer, None
 # --- Gradio UI ---
 image_examples = [
 with gr.Blocks(css=css, theme=steel_blue_theme) as demo:
     gr.Markdown("# **Super-OCRs-Demo**", elem_id="main-title")
+    gr.Markdown("Compare **DeepSeek-OCR**, **Dots.OCR**, **HunyuanOCR**, and **Nanonets-OCR2-3B** in one space.")
     with gr.Row():
         with gr.Column(scale=1):
             # Global Inputs
             model_choice = gr.Radio(
+                choices=["HunyuanOCR", "DeepSeek-OCR-Latest-BF16.I64", "Dots.OCR-Latest-BF16", "Nanonets-OCR2-3B"],
                 label="Select Model",
                 value="DeepSeek-OCR-Latest-BF16.I64"
             )
             # DeepSeek Specific Options
             with gr.Group(visible=True) as ds_group:
                 ds_model_size = gr.Dropdown(
                     choices=["Tiny", "Small", "Base", "Large", "Gundam (Recommended)"],
                     value="Gundam (Recommended)", label="DeepSeek Resolution"
                 )
                 ds_ref_text = gr.Textbox(label="Reference Text (for 'Locate' task only)", placeholder="e.g., the title, red car...", visible=False)
+            # General Prompt (for Dots/Hunyuan/Nanonets)
             with gr.Group(visible=False) as prompt_group:
                 custom_prompt = gr.Textbox(label="Custom Query / Prompt", placeholder="Extract text...", lines=2)
             gr.Examples(examples=image_examples, inputs=image_input)
         with gr.Column(scale=2):
             output_text = gr.Textbox(label="Recognized Text / Markdown", lines=15, show_copy_button=True)
             output_image = gr.Image(label="Visual Grounding Result (DeepSeek Only)", type="pil")