Image-Text-to-Text
Transformers
Safetensors
English
Chinese
multilingual
qwen3_vl
image-to-text
ocr
document-parse
layout
table
formula
conversational
Eval Results
Instructions to use winninghealth/WiNGPT-DocLoom with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use winninghealth/WiNGPT-DocLoom with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("image-text-to-text", model="winninghealth/WiNGPT-DocLoom")
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/p-blog/candy.JPG"},
            {"type": "text", "text": "What animal is on the candy?"}
        ]
    },
]
pipe(text=messages)
```

```python
# Load model directly
from transformers import AutoProcessor, AutoModelForImageTextToText

processor = AutoProcessor.from_pretrained("winninghealth/WiNGPT-DocLoom")
model = AutoModelForImageTextToText.from_pretrained("winninghealth/WiNGPT-DocLoom")
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/p-blog/candy.JPG"},
            {"type": "text", "text": "What animal is on the candy?"}
        ]
    },
]
inputs = processor.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

outputs = model.generate(**inputs, max_new_tokens=40)
print(processor.decode(outputs[0][inputs["input_ids"].shape[-1]:]))
```
- Notebooks
- Google Colab
- Kaggle
- Local Apps Settings
- vLLM
How to use winninghealth/WiNGPT-DocLoom with vLLM:
Install from pip and serve model
```bash
# Install vLLM from pip:
pip install vllm

# Start the vLLM server:
vllm serve "winninghealth/WiNGPT-DocLoom"

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "winninghealth/WiNGPT-DocLoom",
        "messages": [
            {
                "role": "user",
                "content": [
                    { "type": "text", "text": "Describe this image in one sentence." },
                    { "type": "image_url", "image_url": { "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg" } }
                ]
            }
        ]
    }'
```
Use Docker
docker model run hf.co/winninghealth/WiNGPT-DocLoom
- SGLang
How to use winninghealth/WiNGPT-DocLoom with SGLang:
Install from pip and serve model
```bash
# Install SGLang from pip:
pip install sglang

# Start the SGLang server:
python3 -m sglang.launch_server \
    --model-path "winninghealth/WiNGPT-DocLoom" \
    --host 0.0.0.0 \
    --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "winninghealth/WiNGPT-DocLoom",
        "messages": [
            {
                "role": "user",
                "content": [
                    { "type": "text", "text": "Describe this image in one sentence." },
                    { "type": "image_url", "image_url": { "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg" } }
                ]
            }
        ]
    }'
```
Use Docker images
```bash
docker run --gpus all \
    --shm-size 32g \
    -p 30000:30000 \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HF_TOKEN=<secret>" \
    --ipc=host \
    lmsysorg/sglang:latest \
    python3 -m sglang.launch_server \
        --model-path "winninghealth/WiNGPT-DocLoom" \
        --host 0.0.0.0 \
        --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "winninghealth/WiNGPT-DocLoom",
        "messages": [
            {
                "role": "user",
                "content": [
                    { "type": "text", "text": "Describe this image in one sentence." },
                    { "type": "image_url", "image_url": { "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg" } }
                ]
            }
        ]
    }'
```
- Docker Model Runner
How to use winninghealth/WiNGPT-DocLoom with Docker Model Runner:
docker model run hf.co/winninghealth/WiNGPT-DocLoom
import base64
import sys
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path
from typing import Union

import fitz
import openai
import requests
# OpenAI-compatible client. The API key and base URL are placeholders that
# must be edited to point at the server hosting the model.
client = openai.OpenAI(api_key="sk-", base_url="http://ip:port/v1")

# Model name sent with every chat-completion request.
# NOTE(review): the serving examples elsewhere on this page use
# "winninghealth/WiNGPT-DocLoom"; confirm this matches the name the server
# actually registers.
model = "winninghealth/DocLoom"

# Instruction sent alongside each page image.
# NOTE(review): the sentence ending in "markdown syntax " is followed directly
# by a newline, so the syntax example it announces appears to be missing --
# confirm against the prompt the model was trained with before changing it.
build_no_anchoring_v4_yaml_prompt = (
    "Attached is one page of a document that you must process. "
    "Just return the plain text representation of this document as if you were reading it naturally. "
    "Convert equations to LateX and tables to HTML.\n"
    "If there are any figures or charts, label them with the following markdown syntax \n"
    "Return your output as markdown."
)
def render_pdf_to_base64png(doc, page_num, target_longest_image_dim: int = 2048):
    """Render one page of an open document to a base64-encoded PNG string.

    Args:
        doc: Open PyMuPDF document; it is indexed with 0-based page indexes.
        page_num: 1-based number of the page to render.
        target_longest_image_dim: Target length of the page's longer side in
            the rendered image; the page is scaled uniformly to reach it.

    Returns:
        The PNG bytes of the rendered page, base64-encoded as a UTF-8 string.
    """
    pdf_page = doc[page_num - 1]  # convert the 1-based page number to an index
    bounds = pdf_page.rect

    # One uniform scale factor keeps the aspect ratio while bringing the
    # longer side to the requested size.
    scale = target_longest_image_dim / max(bounds.width, bounds.height)

    pixmap = pdf_page.get_pixmap(matrix=fitz.Matrix(scale, scale))
    png_bytes = pixmap.tobytes("png")
    return base64.b64encode(png_bytes).decode("utf-8")
def get_image_base64_from_url(image_url, timeout=30):
    """Download an image over HTTP and return its bytes base64-encoded.

    Args:
        image_url: URL fetched with an HTTP GET request.
        timeout: Seconds to wait for the server before giving up. Without an
            explicit timeout ``requests`` waits indefinitely, so a stalled
            server would hang the caller.

    Returns:
        The response body, base64-encoded as a UTF-8 string.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    response = requests.get(image_url, timeout=timeout)
    response.raise_for_status()
    return base64.b64encode(response.content).decode("utf-8")
def ocr_page_with_nanonets_s(img_base64):
    """Send one page image to the chat-completions endpoint and return its text.

    Args:
        img_base64: Base64-encoded PNG of the page; it is embedded in the
            request as a ``data:image/png;base64,...`` URL.

    Returns:
        The message content of the first choice in the completion response.
    """
    image_part = {
        "type": "image_url",
        "image_url": {"url": f"data:image/png;base64,{img_base64}"},
    }
    prompt_part = {
        "type": "text",
        "text": build_no_anchoring_v4_yaml_prompt,
    }
    # A single user turn carrying the image first, then the instruction text.
    user_message = {"role": "user", "content": [image_part, prompt_part]}

    completion = client.chat.completions.create(
        model=model,
        messages=[user_message],
        temperature=0.0,
        max_tokens=15000,  # max 16192
    )
    return completion.choices[0].message.content
# PyMuPDF documents that it does not support use from multiple threads, yet
# process_page is run by a thread pool against one shared document. This lock
# lets only one thread at a time touch the document; the slow OCR request
# still runs concurrently.
_RENDER_LOCK = threading.Lock()


def process_page(doc, page_num, page_count):
    """Render one page and run OCR on it.

    Args:
        doc: Open PyMuPDF document shared by all workers.
        page_num: 1-based number of the page to process.
        page_count: Total number of pages; accepted for interface
            compatibility and not used here.

    Returns:
        A ``(page_num, content)`` tuple, where ``content`` is whatever
        ``ocr_page_with_nanonets_s`` returned for the page.
    """
    # Serialize only the rendering step; holding the lock during the network
    # call would remove all concurrency.
    with _RENDER_LOCK:
        img_base64 = render_pdf_to_base64png(doc, page_num, target_longest_image_dim=1288)
    content = ocr_page_with_nanonets_s(img_base64)
    return page_num, content
# Process all pages concurrently and save to markdown
if len(sys.argv) < 2:
    print("Usage: python DocLoom_test.py <pdf_file_path>")
    sys.exit(1)

file_path = sys.argv[1]
# Swap only the final extension. A plain str.replace(".pdf", ".md") would
# rewrite every ".pdf" in the path and would leave the path unchanged for
# names such as "scan.PDF", making the output overwrite the input file.
output_path = str(Path(file_path).with_suffix(".md"))

# Open PDF once for all operations
doc = fitz.open(file_path)
try:
    page_count = len(doc)
    print(f"Total pages: {page_count}")
    print("Starting OCR processing...\n")

    completed_pages = 0
    page_contents = {}

    with ThreadPoolExecutor(max_workers=8) as executor:
        futures = {
            executor.submit(process_page, doc, page_num, page_count): page_num
            for page_num in range(1, page_count + 1)
        }

        for future in as_completed(futures):
            # result() re-raises any exception from the worker, aborting the run.
            page_num, content = future.result()
            page_contents[page_num] = content
            completed_pages += 1

            # Display progress
            progress = (completed_pages / page_count) * 100
            print(f"Progress: {completed_pages}/{page_count} pages ({progress:.1f}%)")
finally:
    # Release the document even when a page fails.
    doc.close()

# Write only after every page succeeded, so a failed run does not leave a
# truncated output file behind. Pages are written in page order with no
# separator; a page whose completion had no content (None) is written as an
# empty string instead of crashing the write.
with open(output_path, "w", encoding="utf-8") as f:
    f.writelines(page_contents[i] or "" for i in range(1, page_count + 1))

print(f"\nDone! Output saved to: {output_path}")