"""Docling structured-extraction demo for Hugging Face Spaces.

Extracts structured data from documents (PDF/images) using a JSON template.
When an Ollama server is reachable on localhost, ALL VLM inference (picture
descriptions and template extraction) is offloaded to it; otherwise the
local (Zero) GPU models are used.
"""

import base64
import io
import json
import logging
import re
import urllib.error
import urllib.request
from pathlib import Path

import gradio as gr
import spaces  # Hugging Face Spaces Zero GPU support
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
    PdfPipelineOptions,
    granite_picture_description,
)
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.document_extractor import DocumentExtractor
from PIL import Image

# Try new preset-based API first (docling >= 2.72), fall back to legacy.
try:
    from docling.datamodel.pipeline_options import PictureDescriptionVlmEngineOptions
    from docling.datamodel.vlm_engine_options import ApiVlmEngineOptions, VlmEngineType

    _HAS_VLM_ENGINE = True
except ImportError:
    _HAS_VLM_ENGINE = False
    try:
        from docling.datamodel.pipeline_options import PictureDescriptionApiOptions
    except ImportError:
        # Neither remote-VLM API surface is importable.  Bind a sentinel so
        # get_converter_with_vision() can fall back to the local model instead
        # of raising NameError when Ollama happens to be reachable.
        PictureDescriptionApiOptions = None

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

OLLAMA_BASE_URL = "http://127.0.0.1:11434"
OLLAMA_API_URL = f"{OLLAMA_BASE_URL}/v1/chat/completions"
OLLAMA_MODEL = "ibm/granite3.3-vision:2b"

# File extensions treated as raster images; anything else is assumed PDF.
_IMAGE_EXTS = (".jpg", ".jpeg", ".png", ".gif", ".bmp", ".tiff", ".webp")

# Single prompt used for every picture-description request (was duplicated
# three times across the VLM configuration branches).
_DESCRIPTION_PROMPT = "Describe the image in as much detail as possible."


# Initialize the extractor (will be moved to GPU when decorated function is called)
def get_extractor():
    """Initialize extractor - called within GPU context"""
    return DocumentExtractor(allowed_formats=[InputFormat.IMAGE, InputFormat.PDF])


def is_ollama_available(url: str = OLLAMA_BASE_URL, timeout: int = 3) -> bool:
    """Check if Ollama is running and reachable on localhost."""
    try:
        req = urllib.request.Request(url, method="GET")
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            return resp.status == 200
    except (urllib.error.URLError, OSError):
        return False


# Check Ollama availability at startup
if is_ollama_available():
    logger.info(
        "Ollama is running on %s — remote VLM will be used for ALL VLM inference "
        "(picture descriptions AND template extraction)",
        OLLAMA_BASE_URL,
    )
else:
    logger.info(
        "Ollama not found on %s — will use local GPU for VLM inference",
        OLLAMA_BASE_URL,
    )


def _render_pdf_pages(pdf_source) -> list:
    """Render every page of a PDF (path or file-like) to RGB PIL images.

    Uses pypdfium2 (a docling dependency).  scale=2.0 gives enough detail for
    VLM extraction.  The PdfDocument is closed in all cases; the returned PIL
    images are independent copies.
    """
    import pypdfium2 as pdfium

    pdf = pdfium.PdfDocument(pdf_source)
    try:
        return [
            pdf[page_idx].render(scale=2.0).to_pil().convert("RGB")
            for page_idx in range(len(pdf))
        ]
    finally:
        pdf.close()


def get_document_images(source: str) -> list:
    """Convert a document source (file path or URL) to a list of PIL images.

    Handles image files directly and renders PDF pages via pypdfium2
    (a docling dependency). For URLs, downloads the file first.
    """
    if source.startswith(("http://", "https://")):
        # Download the file
        req = urllib.request.Request(source)
        with urllib.request.urlopen(req, timeout=60) as resp:
            tmp_data = resp.read()
            content_type = resp.headers.get("Content-Type", "")
        # Determine type from URL extension or Content-Type header
        lower_url = source.lower()
        if lower_url.endswith(_IMAGE_EXTS) or "image" in content_type:
            return [Image.open(io.BytesIO(tmp_data)).convert("RGB")]
        # Assume PDF
        return _render_pdf_pages(io.BytesIO(tmp_data))

    source_path = Path(source)
    if source_path.suffix.lower() in _IMAGE_EXTS:
        return [Image.open(source_path).convert("RGB")]
    # Assume PDF
    return _render_pdf_pages(str(source_path))


def _image_to_base64(img: Image.Image) -> str:
    """Encode a PIL Image as a base64 PNG string (no ``data:`` URL prefix —
    the caller prepends ``data:image/png;base64,`` itself)."""
    buf = io.BytesIO()
    img.save(buf, format="PNG")
    return base64.b64encode(buf.getvalue()).decode("utf-8")


def _parse_vlm_json(content: str) -> dict:
    """Best-effort parse of a VLM reply.

    Tries raw JSON first, then JSON inside a fenced ```json``` block, and
    finally wraps the unparseable text as {"raw_response": ...}.
    """
    try:
        return json.loads(content)
    except json.JSONDecodeError:
        pass
    # Try to extract JSON from a markdown code block
    match = re.search(r"```(?:json)?\s*\n?(.*?)\n?```", content, re.DOTALL)
    if match:
        try:
            return json.loads(match.group(1))
        except json.JSONDecodeError:
            pass
    return {"raw_response": content}


def extract_with_ollama(source: str, template: dict) -> dict:
    """Extract structured data by sending document images + template to Ollama.

    Uses Ollama's OpenAI-compatible /v1/chat/completions endpoint with vision
    support. This offloads ALL VLM inference to Ollama so the local GPU is
    not needed.

    Returns a dict in the same shape as the DocumentExtractor output:
    {"pages": [{"page_no": int, "extracted_data": ..., "raw_text": str,
    "errors": []}]}
    """
    images = get_document_images(source)
    logger.info(
        "Extracting with Ollama (%s): %d page(s), template keys: %s",
        OLLAMA_MODEL,
        len(images),
        list(template.keys()),
    )
    template_str = json.dumps(template, indent=2)
    prompt = (
        "Extract the following structured information from this document image. "
        "Return ONLY valid JSON matching this exact template structure:\n"
        f"{template_str}\n\n"
        "Rules:\n"
        "- Fill in the actual values found in the document\n"
        "- Use null for fields not found in the document\n"
        "- Return ONLY the JSON object, no explanation or markdown fences"
    )

    output = {"pages": []}
    for page_no, img in enumerate(images, start=1):
        img_b64 = _image_to_base64(img)
        payload = json.dumps(
            {
                "model": OLLAMA_MODEL,
                "messages": [
                    {
                        "role": "user",
                        "content": [
                            {
                                "type": "image_url",
                                "image_url": {
                                    "url": f"data:image/png;base64,{img_b64}",
                                },
                            },
                            {
                                "type": "text",
                                "text": prompt,
                            },
                        ],
                    }
                ],
                "temperature": 0.0,
                "max_tokens": 4096,
            }
        ).encode("utf-8")
        req = urllib.request.Request(
            OLLAMA_API_URL,
            data=payload,
            headers={"Content-Type": "application/json"},
            method="POST",
        )
        try:
            with urllib.request.urlopen(req, timeout=120) as resp:
                response_data = json.loads(resp.read().decode("utf-8"))
            content = response_data["choices"][0]["message"]["content"]
            output["pages"].append(
                {
                    "page_no": page_no,
                    "extracted_data": _parse_vlm_json(content),
                    "raw_text": content,
                    "errors": [],
                }
            )
            logger.info("Page %d extracted successfully via Ollama", page_no)
        except Exception as e:
            # A failed page is recorded, not fatal — remaining pages continue.
            logger.error("Ollama extraction failed for page %d: %s", page_no, e)
            output["pages"].append(
                {
                    "page_no": page_no,
                    "extracted_data": None,
                    "raw_text": "",
                    "errors": [str(e)],
                }
            )
    return output


def get_converter_with_vision():
    """Initialize converter with vision.

    Checks if Ollama is running on localhost:11434. If available, uses the
    remote Ollama VLM for picture descriptions (no local GPU required).
    Otherwise falls back to the local integrated granite_picture_description.
    """
    pipeline_options = PdfPipelineOptions()
    pipeline_options.do_picture_description = True
    pipeline_options.images_scale = 2.0
    pipeline_options.generate_picture_images = True

    remote_options = None
    if is_ollama_available():
        logger.info("Ollama detected on localhost:11434 — using remote VLM")
        if _HAS_VLM_ENGINE:
            # New preset-based API (docling >= 2.72)
            remote_options = PictureDescriptionVlmEngineOptions.from_preset(
                "granite_vision",
                engine_options=ApiVlmEngineOptions(
                    runtime_type=VlmEngineType.API_OLLAMA,
                    timeout=90,
                ),
            )
            remote_options.prompt = _DESCRIPTION_PROMPT
        elif PictureDescriptionApiOptions is not None:
            # Legacy API — use the module-level endpoint/model constants
            # (previously hard-coded duplicates of OLLAMA_API_URL/OLLAMA_MODEL)
            remote_options = PictureDescriptionApiOptions(
                url=OLLAMA_API_URL,
                params={"model": OLLAMA_MODEL},
                prompt=_DESCRIPTION_PROMPT,
                timeout=90,
            )
        else:
            # Neither remote API class could be imported; previously this path
            # raised NameError.  Fall back to the local model instead.
            logger.warning(
                "No remote picture-description API available in this docling "
                "version — falling back to local VLM"
            )

    if remote_options is not None:
        pipeline_options.picture_description_options = remote_options
        pipeline_options.enable_remote_services = True
    else:
        logger.info("Ollama not available — falling back to local VLM")
        pipeline_options.picture_description_options = granite_picture_description
        pipeline_options.picture_description_options.prompt = _DESCRIPTION_PROMPT

    return DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(
                pipeline_options=pipeline_options,
            )
        }
    )


def _convert_full_document(source: str) -> str:
    """Full-document conversion path (no template): Markdown + picture
    descriptions, returned as a JSON string."""
    converter = get_converter_with_vision()
    try:
        result = converter.convert(source)
        doc = result.document

        # Create a simplified output with Markdown and picture descriptions
        simplified_output = {
            "markdown": doc.export_to_markdown(),
            "pictures": [],
        }

        # Extract picture descriptions if available
        if hasattr(doc, "pictures"):
            for i, pic in enumerate(doc.pictures):
                descriptions = [
                    ann.text
                    for ann in getattr(pic, "annotations", [])
                    if hasattr(ann, "text")
                ]
                if descriptions:
                    simplified_output["pictures"].append(
                        {"index": i, "descriptions": descriptions}
                    )

        return json.dumps(simplified_output, indent=2)
    except Exception as e:
        return json.dumps({"error": f"Conversion failed: {str(e)}"}, indent=2)


@spaces.GPU(duration=60)  # Allocate GPU for up to 60 seconds
def process_extraction(file_input, url_input, template_json):
    """
    Process document extraction with the provided template.
    Uses Hugging Face Spaces Zero GPU feature.

    Args:
        file_input: Uploaded file (PDF or image)
        url_input: URL to a document
        template_json: JSON string defining the extraction template

    Returns:
        JSON string with extracted data
    """
    try:
        # Determine the source (uploaded file wins over URL)
        if file_input is not None:
            source = file_input.name
        elif url_input and url_input.strip():
            source = url_input.strip()
        else:
            return json.dumps(
                {"error": "Please provide either a file or a URL"}, indent=2
            )

        # If no template is provided, use the converter with vision
        if not template_json or not template_json.strip():
            return _convert_full_document(source)

        # Parse the template JSON
        try:
            template = json.loads(template_json)
        except json.JSONDecodeError as e:
            return json.dumps({"error": f"Invalid JSON template: {str(e)}"}, indent=2)

        # Use Ollama for extraction when available (no local GPU needed for VLM)
        if is_ollama_available():
            logger.info("Using Ollama for template extraction (remote VLM)")
            return json.dumps(extract_with_ollama(source, template), indent=2)

        # Fall back to local DocumentExtractor (uses local GPU)
        logger.info("Using local DocumentExtractor (local GPU)")
        extractor = get_extractor()
        result = extractor.extract(source=source, template=template)

        # Format the output in the same page-wise shape as extract_with_ollama
        output = {
            "pages": [
                {
                    "page_no": page.page_no,
                    "extracted_data": page.extracted_data,
                    "raw_text": page.raw_text,
                    "errors": page.errors if page.errors else [],
                }
                for page in result.pages
            ]
        }
        return json.dumps(output, indent=2)
    except Exception as e:
        return json.dumps({"error": f"Extraction failed: {str(e)}"}, indent=2)


# Default template example
default_template = json.dumps(
    {"bill_no": "string", "total": "float", "date": "string"}, indent=2
)

# Create Gradio interface
with gr.Blocks(title="Docling Structured Extraction") as demo:
    gr.Markdown(
        """
        # 📄 Docling Structured Extraction Demo

        Extract structured data from documents (PDF/Images) using AI-powered extraction.

        **Note:** This feature is currently in beta.

        ### How to use:
        1. Upload a file OR provide a URL to a document
        2. Define your extraction template in JSON format (or leave empty for
           full document conversion with picture descriptions)
        3. Click "Extract" to get structured data or full document JSON

        🚀 **Powered by Hugging Face Spaces Zero GPU**
        """
    )

    with gr.Row():
        with gr.Column():
            gr.Markdown("### Input Source")
            file_input = gr.File(label="Upload File (PDF or Image)")
            url_input = gr.Textbox(
                label="Or Enter Document URL",
                placeholder="https://example.com/document.pdf",
                lines=1,
            )

            gr.Markdown("### Extraction Template")
            gr.Markdown(
                """
                Define the structure of data you want to extract.
                Use JSON format with field names and types:
                - `"string"` for text fields
                - `"float"` for numbers with decimals
                - `"int"` for whole numbers
                """
            )
            template_input = gr.Code(
                label="JSON Template",
                value=default_template,
                language="json",
                lines=15,
            )

            extract_btn = gr.Button("Extract", variant="primary", size="lg")

        with gr.Column():
            gr.Markdown("### Extracted Data")
            output_json = gr.Code(label="Result (JSON)", language="json", lines=25)

    # Examples section
    gr.Markdown("### Examples")
    gr.Examples(
        examples=[
            [
                None,
                "https://upload.wikimedia.org/wikipedia/commons/9/9f/Swiss_QR-Bill_example.jpg",
                json.dumps({"bill_no": "string", "total": "float"}, indent=2),
            ],
            [
                None,
                "https://upload.wikimedia.org/wikipedia/commons/9/9f/Swiss_QR-Bill_example.jpg",
                json.dumps(
                    {
                        "bill_no": "string",
                        "total": "float",
                        "sender_name": "string",
                        "receiver_name": "string",
                        "postal_code": "string",
                    },
                    indent=2,
                ),
            ],
        ],
        inputs=[file_input, url_input, template_input],
        label="Try these examples",
    )

    # Connect the extraction function
    extract_btn.click(
        fn=process_extraction,
        inputs=[file_input, url_input, template_input],
        outputs=output_json,
    )

# Launch the app
if __name__ == "__main__":
    demo.launch()