Instructions to use winninghealth/WiNGPT-DocLoom with libraries, inference providers, notebooks, and local apps. Follow these links to get started.

Libraries

How to use winninghealth/WiNGPT-DocLoom with Transformers:

# Use a pipeline as a high-level helper
from transformers import pipeline

# Build an "image-text-to-text" pipeline backed by the WiNGPT-DocLoom checkpoint.
pipe = pipeline("image-text-to-text", model="winninghealth/WiNGPT-DocLoom")
# One user turn whose content combines an image (referenced by URL) with a text prompt.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/p-blog/candy.JPG"},
            {"type": "text", "text": "What animal is on the candy?"}
        ]
    },
]
# Run the pipeline on the chat messages. The return value is discarded here;
# assign or print it to see the generated reply.
pipe(text=messages)

# Load model directly
from transformers import AutoProcessor, AutoModelForImageTextToText

# The processor and the model are loaded from the same checkpoint name.
processor = AutoProcessor.from_pretrained("winninghealth/WiNGPT-DocLoom")
model = AutoModelForImageTextToText.from_pretrained("winninghealth/WiNGPT-DocLoom")
# One user turn whose content combines an image (referenced by URL) with a text prompt.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/p-blog/candy.JPG"},
            {"type": "text", "text": "What animal is on the candy?"}
        ]
    },
]
# Apply the chat template with a generation prompt appended, tokenize, and
# return a dict of PyTorch tensors ("pt") moved to the model's device.
inputs = processor.apply_chat_template(
	messages,
	add_generation_prompt=True,
	tokenize=True,
	return_dict=True,
	return_tensors="pt",
).to(model.device)

# Generate at most 40 new tokens.
outputs = model.generate(**inputs, max_new_tokens=40)
# Slice off the first input_ids-length tokens (the prompt) so that only the
# newly generated part of the first sequence is decoded and printed.
print(processor.decode(outputs[0][inputs["input_ids"].shape[-1]:]))

Notebooks
Google Colab
Kaggle
Local Apps Settings

vLLM

How to use winninghealth/WiNGPT-DocLoom with vLLM:

Install from pip and serve model

# Install vLLM from pip:
pip install vllm
# Start the vLLM server:
# (no --port is passed here; the curl call below targets localhost:8000)
vllm serve "winninghealth/WiNGPT-DocLoom"
# Call the server using curl (OpenAI-compatible API):
# The request body is one user message made of a text part followed by an image_url part.
curl -X POST "http://localhost:8000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "winninghealth/WiNGPT-DocLoom",
		"messages": [
			{
				"role": "user",
				"content": [
					{
						"type": "text",
						"text": "Describe this image in one sentence."
					},
					{
						"type": "image_url",
						"image_url": {
							"url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
						}
					}
				]
			}
		]
	}'

Use Docker

# Run the Hugging Face-hosted model (hf.co/ prefix) with the `docker model run` command.
docker model run hf.co/winninghealth/WiNGPT-DocLoom

SGLang

How to use winninghealth/WiNGPT-DocLoom with SGLang:

Install from pip and serve model

# Install SGLang from pip:
pip install sglang
# Start the SGLang server:
# (listens on 0.0.0.0, port 30000 — the same port the curl call below targets)
python3 -m sglang.launch_server \
    --model-path "winninghealth/WiNGPT-DocLoom" \
    --host 0.0.0.0 \
    --port 30000
# Call the server using curl (OpenAI-compatible API):
# The request body is one user message made of a text part followed by an image_url part.
curl -X POST "http://localhost:30000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "winninghealth/WiNGPT-DocLoom",
		"messages": [
			{
				"role": "user",
				"content": [
					{
						"type": "text",
						"text": "Describe this image in one sentence."
					},
					{
						"type": "image_url",
						"image_url": {
							"url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
						}
					}
				]
			}
		]
	}'

Use Docker images

# Start the SGLang server inside the lmsysorg/sglang:latest image:
#   -p 30000:30000  publishes the port that the curl call below targets
#   -v ...          mounts the host's ~/.cache/huggingface at /root/.cache/huggingface in the container
#   HF_TOKEN        "<secret>" is a placeholder; substitute your own Hugging Face token
docker run --gpus all \
    --shm-size 32g \
    -p 30000:30000 \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HF_TOKEN=<secret>" \
    --ipc=host \
    lmsysorg/sglang:latest \
    python3 -m sglang.launch_server \
        --model-path "winninghealth/WiNGPT-DocLoom" \
        --host 0.0.0.0 \
        --port 30000
# Call the server using curl (OpenAI-compatible API):
# The request body is one user message made of a text part followed by an image_url part.
curl -X POST "http://localhost:30000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "winninghealth/WiNGPT-DocLoom",
		"messages": [
			{
				"role": "user",
				"content": [
					{
						"type": "text",
						"text": "Describe this image in one sentence."
					},
					{
						"type": "image_url",
						"image_url": {
							"url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
						}
					}
				]
			}
		]
	}'

Docker Model Runner
How to use winninghealth/WiNGPT-DocLoom with Docker Model Runner:
```
# Run the Hugging Face-hosted model (hf.co/ prefix) with Docker Model Runner.
docker model run hf.co/winninghealth/WiNGPT-DocLoom
```

WiNGPT-DocLoom / DocLoom_test.py

winninghealth

Upload 13 files

489bb91 verified 5 months ago

raw

history blame contribute delete

3.69 kB

	import base64
	import os
	import sys
	import threading
	from concurrent.futures import ThreadPoolExecutor, as_completed
	from typing import Union

	import fitz
	import openai
	import requests


	# OpenAI-compatible client shared by every OCR request in this script.
	# "sk-" and "http://ip:port/v1" are placeholders: fill in the real key and
	# the address of the server before running.
	client = openai.OpenAI(api_key="sk-", base_url="http://ip:port/v1")

	# Model name sent with each chat-completion request.
	# NOTE(review): this presumably has to match the name the serving endpoint
	# exposes for the model — confirm against the server configuration.
	model = "winninghealth/DocLoom"

	# Prompt sent alongside every page image. It is a runtime string passed to
	# the model verbatim, so its wording (including spelling) is left untouched.
	build_no_anchoring_v4_yaml_prompt = "Attached is one page of a document that you must process. Just return the plain text representation of this document as if you were reading it naturally. Convert equations to LateX and tables to HTML.\nIf there are any figures or charts, label them with the following markdown syntax ![Alt text describing the contents of the figure](page_startx_starty_width_height.png)\nReturn your output as markdown."


	def render_pdf_to_base64png(doc, page_num, target_longest_image_dim: int = 2048):
	    """Render one PDF page to PNG and return it as a base64 string.

	    Args:
	        doc: An open document that supports ``doc[index]`` page lookup
	            (the script passes the result of ``fitz.open``).
	        page_num: 1-based page number to render.
	        target_longest_image_dim: Desired size, in pixels, of the longer
	            side of the rendered image; the page is scaled uniformly.

	    Returns:
	        The PNG bytes of the rendered page, base64-encoded and decoded to a
	        UTF-8 ``str``.

	    Raises:
	        ValueError: If ``page_num`` is smaller than 1.
	    """
	    # Without this guard, page_num <= 0 becomes a negative index, which is
	    # resolved from the end of the document and silently renders the wrong
	    # page instead of failing.
	    if page_num < 1:
	        raise ValueError(f"page_num must be >= 1 (1-based), got {page_num}")

	    page = doc[page_num - 1]  # PyMuPDF uses 0-based indexing
	    rect = page.rect
	    longest_dim = max(rect.width, rect.height)

	    # Uniform zoom factor that maps the longer page side onto the target size.
	    zoom = target_longest_image_dim / longest_dim

	    # Render the page to a pixmap at that zoom, then serialize it as PNG.
	    pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom))
	    img_bytes = pix.tobytes("png")

	    return base64.b64encode(img_bytes).decode("utf-8")


	def get_image_base64_from_url(image_url, timeout=30):
	    """Download an image and return its content as a base64 string.

	    Args:
	        image_url: URL of the image to fetch with an HTTP GET request.
	        timeout: Seconds to wait for the server before giving up. It is
	            passed straight to ``requests.get``; without a timeout the
	            request can block indefinitely on an unresponsive host.

	    Returns:
	        The response body, base64-encoded and decoded to a UTF-8 ``str``.

	    Raises:
	        requests.HTTPError: If the server answers with an error status
	            (raised by ``raise_for_status``).
	        requests.Timeout: If the server does not respond within ``timeout``.
	    """
	    response = requests.get(image_url, timeout=timeout)
	    response.raise_for_status()
	    return base64.b64encode(response.content).decode("utf-8")


	def ocr_page_with_nanonets_s(img_base64):
	    """Send one base64-encoded PNG page to the chat endpoint and return the reply text.

	    The request goes through the module-level ``client`` using the
	    module-level ``model`` name. The user message carries the page image as
	    a data URL, followed by the module-level OCR prompt.

	    Args:
	        img_base64: Base64 text of a PNG image.

	    Returns:
	        The ``content`` of the first choice's message in the response.
	    """
	    image_part = {
	        "type": "image_url",
	        "image_url": {"url": f"data:image/png;base64,{img_base64}"},
	    }
	    prompt_part = {
	        "type": "text",
	        "text": build_no_anchoring_v4_yaml_prompt,
	    }
	    user_message = {"role": "user", "content": [image_part, prompt_part]}

	    completion = client.chat.completions.create(
	        model=model,
	        messages=[user_message],
	        temperature=0.0,
	        max_tokens=15000,  # original note: max 16192
	    )
	    first_choice = completion.choices[0]
	    return first_choice.message.content


	# PyMuPDF is documented as not supporting multithreaded use, yet this
	# function runs on pool threads that all share one document. The lock makes
	# sure only one thread touches the document at a time.
	_RENDER_LOCK = threading.Lock()


	def process_page(doc, page_num, page_count):
	    """Render one page and OCR it, returning the page number with its text.

	    Args:
	        doc: The open PDF document shared by all workers.
	        page_num: 1-based number of the page to process.
	        page_count: Total number of pages. Unused here; kept so existing
	            callers that pass it keep working.

	    Returns:
	        A ``(page_num, content)`` tuple, where ``content`` is whatever the
	        OCR request returned for that page.
	    """
	    # Only the rendering step is serialized; the OCR request below runs
	    # outside the lock, so requests for different pages still overlap.
	    with _RENDER_LOCK:
	        img_base64 = render_pdf_to_base64png(doc, page_num, target_longest_image_dim=1288)
	    content = ocr_page_with_nanonets_s(img_base64)
	    return page_num, content


	# Process all pages concurrently and save the combined text as markdown.
	if len(sys.argv) < 2:
	    print("Usage: python DocLoom_test.py <pdf_file_path>")
	    sys.exit(1)

	file_path = sys.argv[1]
	# Swap only the final extension. A plain str.replace(".pdf", ".md") would
	# also rewrite ".pdf" inside directory names, and would leave the path
	# unchanged for names such as "scan.PDF", so the output would overwrite
	# the input file.
	output_path = os.path.splitext(file_path)[0] + ".md"
	if output_path == file_path:
	    print(f"Refusing to overwrite the input file: {file_path}")
	    sys.exit(1)

	# Open PDF once for all operations
	doc = fitz.open(file_path)
	try:
	    page_count = len(doc)

	    print(f"Total pages: {page_count}")
	    print("Starting OCR processing...\n")

	    completed_pages = 0
	    page_contents = {}

	    with ThreadPoolExecutor(max_workers=8) as executor:
	        futures = {
	            executor.submit(process_page, doc, page_num, page_count): page_num
	            for page_num in range(1, page_count + 1)
	        }

	        # Pages finish in arbitrary order; collect them by page number.
	        for future in as_completed(futures):
	            page_num, content = future.result()
	            page_contents[page_num] = content
	            completed_pages += 1

	            # Display progress
	            progress = (completed_pages / page_count) * 100
	            print(f"Progress: {completed_pages}/{page_count} pages ({progress:.1f}%)")
	finally:
	    # Close the document even when a page fails.
	    doc.close()

	# The output file is created only after every page succeeded, so a failed
	# run does not leave an empty or truncated markdown file behind.
	with open(output_path, "w", encoding="utf-8") as f:
	    # Write pages in page-number order, with no separator between them.
	    for i in range(1, page_count + 1):
	        # A response without text content yields None; write nothing for
	        # that page instead of crashing in f.write.
	        f.write(page_contents[i] or "")

	print(f"\nDone! Output saved to: {output_path}")