"""OCR a PDF page by page with a vision chat model and save the text as Markdown.

Usage:
    python DocLoom_test.py <pdf_file>

Each page is rendered to a PNG, sent to the chat-completions endpoint together
with a fixed transcription prompt, and the per-page answers are concatenated
in page order into ``<pdf_file stem>.md`` next to the input file.
"""

import base64
import sys
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path
from typing import Union

import fitz
import openai
import requests

# NOTE(review): api_key and base_url look like placeholders -- fill in real
# values before running.
client = openai.OpenAI(api_key="sk-", base_url="http://ip:port/v1")
model = "winninghealth/DocLoom"

# Prompt sent with every page image (value is identical to the original
# single-line literal; it is only split here for readability).
build_no_anchoring_v4_yaml_prompt = (
    "Attached is one page of a document that you must process. "
    "Just return the plain text representation of this document as if you were reading it naturally. "
    "Convert equations to LateX and tables to HTML.\n"
    "If there are any figures or charts, label them with the following markdown syntax "
    "![Alt text describing the contents of the figure](page_startx_starty_width_height.png)\n"
    "Return your output as markdown."
)

# PyMuPDF documents that it does not support multithreaded use, so every
# access to the shared document made by worker threads goes through this lock.
_RENDER_LOCK = threading.Lock()


def render_pdf_to_base64png(doc, page_num, target_longest_image_dim: int = 2048):
    """Render one page of ``doc`` to PNG and return it base64-encoded.

    Args:
        doc: An open PyMuPDF document.
        page_num: 1-based page number.
        target_longest_image_dim: Desired pixel length of the page's longer
            side; the page is scaled uniformly to reach it.

    Returns:
        The PNG bytes as a base64 ``str``.
    """
    page = doc[page_num - 1]  # PyMuPDF uses 0-based indexing
    rect = page.rect
    longest_dim = max(rect.width, rect.height)

    # Uniform zoom so that the longer side matches the requested dimension.
    zoom = target_longest_image_dim / longest_dim
    pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom))

    return base64.b64encode(pix.tobytes("png")).decode("utf-8")


def get_image_base64_from_url(image_url, timeout: float = 30.0):
    """Download ``image_url`` and return the response body base64-encoded.

    Args:
        image_url: URL to fetch.
        timeout: Seconds passed to ``requests.get`` so a stalled server
            cannot block the caller forever.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    response = requests.get(image_url, timeout=timeout)
    response.raise_for_status()
    return base64.b64encode(response.content).decode("utf-8")


def ocr_page_with_nanonets_s(img_base64):
    """Send a base64 PNG plus the transcription prompt to the model.

    Args:
        img_base64: Base64-encoded PNG image of one page.

    Returns:
        The text of the first completion choice, or ``""`` when the reply
        carries no content (so callers can always write the result).
    """
    response = client.chat.completions.create(
        model=model,
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/png;base64,{img_base64}"},
                    },
                    {
                        "type": "text",
                        "text": build_no_anchoring_v4_yaml_prompt,
                    },
                ],
            }
        ],
        temperature=0.0,
        max_tokens=15000,  # max 16192
    )
    return response.choices[0].message.content or ""


def process_page(doc, page_num, page_count):
    """Render and OCR a single page.

    Args:
        doc: An open PyMuPDF document shared between worker threads.
        page_num: 1-based page number to process.
        page_count: Total number of pages (unused; kept for compatibility
            with existing callers).

    Returns:
        A ``(page_num, content)`` tuple.
    """
    # Only the rendering touches the shared document, so only that part is
    # serialised; the OCR request below still runs concurrently.
    with _RENDER_LOCK:
        img_base64 = render_pdf_to_base64png(doc, page_num, target_longest_image_dim=1288)
    content = ocr_page_with_nanonets_s(img_base64)
    return page_num, content


def main() -> None:
    """Process all pages concurrently and save the result to Markdown."""
    if len(sys.argv) < 2:
        print("Usage: python DocLoom_test.py <pdf_file>")
        sys.exit(1)

    file_path = sys.argv[1]
    # Swap only the final suffix. A plain str.replace(".pdf", ".md") returns
    # the input path unchanged for names such as "scan.PDF", which would make
    # the script overwrite the source document.
    output_path = Path(file_path).with_suffix(".md")

    # Open PDF once for all operations
    doc = fitz.open(file_path)
    try:
        page_count = len(doc)
        print(f"Total pages: {page_count}")
        print("Starting OCR processing...\n")

        page_contents = {}
        with ThreadPoolExecutor(max_workers=8) as executor:
            futures = {
                executor.submit(process_page, doc, page_num, page_count): page_num
                for page_num in range(1, page_count + 1)
            }
            for completed_pages, future in enumerate(as_completed(futures), start=1):
                # result() re-raises any worker exception, aborting the run.
                page_num, content = future.result()
                page_contents[page_num] = content

                # Display progress
                progress = (completed_pages / page_count) * 100
                print(f"Progress: {completed_pages}/{page_count} pages ({progress:.1f}%)")
    finally:
        # Reached only after the executor has shut down, so no worker is
        # still using the document.
        doc.close()

    # Write only after every page succeeded so a failed run does not leave a
    # truncated output file. Pages are concatenated in page order with no
    # separator between them.
    with open(output_path, "w", encoding="utf-8") as f:
        f.writelines(page_contents[i] for i in range(1, page_count + 1))

    print(f"\nDone! Output saved to: {output_path}")


if __name__ == "__main__":
    main()