"""Module dealing specifically with loading files into Document objects.
Contains the `load_file` function to load text, PDF, and markdown files.
Uses Docling for advanced PDF parsing with OCR support for scanned PDFs.
Falls back to PyMuPDF if Docling is not available.
Supports multimodal document loading with automatic image extraction from PDFs.
## For testing:
- Run this file from `server` folder as:
- `python -m llm_system.utils.loader`
"""
import os
from typing import List, Optional, Dict, Any
from pathlib import Path
from datetime import datetime
from dataclasses import dataclass, field
import uuid
from langchain_core.documents import Document
from langchain_community.document_loaders import TextLoader, PyMuPDFLoader
from langchain_community.document_loaders import UnstructuredMarkdownLoader
import fitz # PyMuPDF
from PIL import Image
from logger import get_logger
log = get_logger(name="doc_loader")
# Try to import Docling for advanced PDF parsing
try:
from docling.document_converter import DocumentConverter
DOCLING_AVAILABLE = True
log.info("βœ… Docling library available - will use for PDF parsing with OCR support")
except ImportError:
DOCLING_AVAILABLE = False
log.warning("⚠️ Docling library not available - will fallback to PyMuPDF for PDFs")
# Import config for multimodal settings
try:
from llm_system.config import EXTRACT_IMAGES_FROM_PDF, IMAGE_OUTPUT_DIR
except ImportError:
# Fallback defaults if config not available
EXTRACT_IMAGES_FROM_PDF = True
IMAGE_OUTPUT_DIR = "server/user_uploads/extracted_images"
@dataclass
class ImageContent:
"""Represents an image extracted from a document.
Attributes:
image_id: Unique identifier for the image
image_path: Path to where the image is stored on disk
description: Text description of the image (optional)
page_number: Page number where image was found
position: Position on page (e.g., "top", "center", "bottom")
metadata: Additional metadata (size, format, source PDF, etc.)
"""
image_id: str
image_path: Path
description: str = ""
page_number: int = 0
position: str = ""
metadata: Dict[str, Any] = field(default_factory=dict)
def __post_init__(self):
"""Ensure image_path is a Path object."""
if isinstance(self.image_path, str):
self.image_path = Path(self.image_path)
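# Illustrative ImageContent usage (values below are hypothetical, not taken from a
# real PDF); note that __post_init__ coerces a str path into a pathlib.Path:
#   img = ImageContent(
#       image_id="img_000_00_deadbeef",
#       image_path="server/user_uploads/extracted_images/u1/report/img_000_00_deadbeef.png",
#       page_number=1,
#       position="middle",
#       metadata={"source_pdf": "report", "format": "PNG"},
#   )
#   assert isinstance(img.image_path, Path)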
def extract_images_from_pdf(pdf_path: str, output_dir: Optional[str] = None, user_id: str = "") -> List[ImageContent]:
"""Extract images from a PDF file and save them to disk.
Attempts to use Docling's advanced image extraction first,
falls back to PyMuPDF for faster extraction.
Args:
pdf_path: Path to the PDF file
output_dir: Directory to save extracted images (default: IMAGE_OUTPUT_DIR)
user_id: User ID for organizing images
Returns:
List of ImageContent objects with paths and metadata
"""
if not EXTRACT_IMAGES_FROM_PDF:
log.debug("Image extraction disabled in config")
return []
if output_dir is None:
output_dir = IMAGE_OUTPUT_DIR
images = []
pdf_name = Path(pdf_path).stem
try:
# Sanitize directory name (remove special characters)
pdf_name_safe = "".join(c if c.isalnum() or c in ('-', '_') else '_' for c in pdf_name)
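        # e.g. "my report (v2)" -> "my_report__v2_" (only alphanumerics, '-' and '_' are kept)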
# Create user-specific output directory
user_image_dir = Path(output_dir) / user_id / pdf_name_safe
user_image_dir.mkdir(parents=True, exist_ok=True)
log.info(f"Created image output directory: {user_image_dir}")
# Try Docling first for advanced image understanding
docling_images = []
if DOCLING_AVAILABLE:
log.info(f"πŸ” Attempting to extract images using Docling...")
try:
converter = DocumentConverter()
docling_doc = converter.convert(pdf_path)
doc = docling_doc.document
# Docling stores images in various ways depending on PDF structure
# Try to access pictures from the document
if hasattr(doc, 'body') and hasattr(doc.body, 'blocks'):
log.debug(f"Scanning {len(doc.body.blocks)} Docling blocks for pictures...")
for block_idx, block in enumerate(doc.body.blocks):
block_type = type(block).__name__
log.debug(f"Block {block_idx}: {block_type}")
# Check for picture blocks
if 'Picture' in block_type:
try:
# Docling picture blocks may have image data
if hasattr(block, 'image') and block.image is not None:
image_id = f"img_docling_{block_idx:03d}_{uuid.uuid4().hex[:8]}"
image_filename = f"{image_id}.png"
image_path = user_image_dir / image_filename
# Save the image
block.image.save(str(image_path), format='PNG')
log.info(f"βœ… Extracted image via Docling: {image_path}")
# Get page number
page_num = 0
if hasattr(block, 'page_number'):
page_num = block.page_number
# Create ImageContent
image_content = ImageContent(
image_id=image_id,
image_path=image_path,
page_number=page_num + 1,
position="middle",
metadata={
"source_pdf": pdf_name,
"extracted_at": datetime.now().isoformat(),
"format": "PNG",
"extractor": "docling",
"size": (block.image.width, block.image.height) if hasattr(block.image, 'width') else (0, 0),
}
)
docling_images.append(image_content)
except Exception as e:
log.debug(f"Could not extract Docling picture block {block_idx}: {e}")
continue
if docling_images:
log.info(f"βœ… Docling extracted {len(docling_images)} images")
images.extend(docling_images)
return images
else:
log.debug("Docling found no extractable picture blocks, falling back to PyMuPDF")
except Exception as e:
log.warning(f"⚠️ Docling image extraction failed: {e}, falling back to PyMuPDF")
# Fallback to PyMuPDF for faster extraction
log.info(f"πŸ“• Extracting images using PyMuPDF...")
pdf_document = fitz.open(pdf_path)
log.info(f"Opened PDF with {pdf_document.page_count} pages")
for page_num in range(pdf_document.page_count):
page = pdf_document[page_num]
image_list = page.get_images(full=True)
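            # Each entry returned by get_images() is a tuple describing one image;
            # item [0] is the xref of the image object inside the PDF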
if not image_list:
log.debug(f"No images found on page {page_num}")
continue
log.info(f"Found {len(image_list)} images on page {page_num}")
for img_index, img in enumerate(image_list):
try:
xref = img[0]
pix = fitz.Pixmap(pdf_document, xref)
                    # CMYK (and other >3-component) pixmaps cannot be written as PNG directly,
                    # so convert them to RGB first; grayscale/RGB pixmaps are saved as-is
                    if pix.n - pix.alpha >= 4:
                        pix = fitz.Pixmap(fitz.csRGB, pix)
# Generate image filename
image_id = f"img_{page_num:03d}_{img_index:02d}_{uuid.uuid4().hex[:8]}"
image_filename = f"{image_id}.png"
image_path = user_image_dir / image_filename
# Save image
pix.save(str(image_path))
log.info(f"βœ… Saved image: {image_path}")
# Create ImageContent object
image_content = ImageContent(
image_id=image_id,
image_path=image_path,
page_number=page_num + 1, # 1-indexed for humans
position="middle", # Can be enhanced with actual position
metadata={
"source_pdf": pdf_name,
"extracted_at": datetime.now().isoformat(),
"format": "PNG",
"extractor": "pymupdf",
"size": (pix.width, pix.height),
}
)
images.append(image_content)
except Exception as e:
log.warning(f"Failed to extract image {img_index} on page {page_num}: {e}")
continue
pdf_document.close()
log.info(f"βœ… Extracted {len(images)} images from PDF")
except Exception as e:
log.error(f"❌ Error extracting images from PDF: {e}")
import traceback
log.error(traceback.format_exc())
return images
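# Example usage of extract_images_from_pdf (hypothetical input path; extracted PNGs
# land under IMAGE_OUTPUT_DIR/<user_id>/<sanitized pdf name>/):
#   imgs = extract_images_from_pdf("user_uploads/test_user/report.pdf", user_id="test_user")
#   for img in imgs:
#       print(img.page_number, img.image_path)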
def load_file(user_id: str, file_path: str) -> tuple[bool, List[Document], str]:
"""Load a file and return its content as a list of Document objects. Usually one document per page.
For PDFs, automatically extracts images and attaches them to metadata.
Args:
user_id (str): The ID of the user who is loading the file.
file_path (str): The absolute path to the file to be loaded.
Returns:
tuple[bool, List[Document], str]: A tuple containing:
- bool: True if the file was loaded successfully, False otherwise.
- List[Document]: A list of Document objects containing the file's content.
- str: Message indicating the result of the loading operation.
"""
log.info(f"πŸ” load_file() starting - file_path: {file_path}, user_id: {user_id}")
file_extension = file_path.split('.')[-1].lower()
log.info(f"πŸ“‹ File extension detected: {file_extension}")
if file_extension not in ['txt', 'pdf', "md"]:
log.error(f"❌ Unsupported file type: {file_extension}.")
return False, [], f"Unsupported file type: {file_extension}. Supported types are: txt, pdf."
    if file_path.endswith('.txt'):
        log.info("📄 Loading as TXT file")
        loader = TextLoader(file_path, encoding='utf-8')
    elif file_path.endswith('.md'):
        log.info("📝 Loading as Markdown file")
        loader = UnstructuredMarkdownLoader(file_path)
else:
# Use Docling for PDFs if available (better OCR support for scanned PDFs)
file_content = None
use_docling = DOCLING_AVAILABLE
if use_docling:
log.info(f"πŸ“• Loading PDF using Docling (with OCR support for scanned PDFs)")
try:
converter = DocumentConverter()
docling_doc = converter.convert(file_path)
# Convert Docling output to LangChain Documents
# Docling preserves structure better than PyMuPDF
markdown_text = docling_doc.document.export_to_markdown()
# Create a single document with all content
file_content = [
Document(
page_content=markdown_text,
metadata={
"source": os.path.basename(file_path),
"file_path": file_path,
"loader": "docling"
}
)
]
log.info(f"βœ… Docling successfully parsed PDF: {len(markdown_text)} chars extracted")
except Exception as e:
log.warning(f"⚠️ Docling parsing failed: {e}, falling back to PyMuPDF")
file_content = None
use_docling = False
if not use_docling:
# Fallback to PyMuPDF if Docling not available or failed
log.info(f"πŸ“• Loading as PDF file using PyMuPDFLoader")
loader = PyMuPDFLoader(file_path, extract_images=False)
# Load the file and return the documents
if file_content is None:
# If we didn't get content from Docling, use the loader (PyMuPDF, TextLoader, etc.)
log.info(f"⏳ Executing loader.load()...")
try:
file_content = loader.load()
log.info(f"βœ… loader.load() completed, got {len(file_content)} pages/documents")
except Exception as e:
log.error(f"❌ loader.load() failed with exception: {e}")
import traceback
log.error(f"Traceback: {traceback.format_exc()}")
return False, [], f"Error loading file: {e}"
# Extract images from PDF if applicable
extracted_images = []
if file_path.endswith('.pdf'):
log.info(f"πŸ–ΌοΈ Extracting images from PDF...")
extracted_images = extract_images_from_pdf(file_path, user_id=user_id)
log.info(f"Found {len(extracted_images)} images")
# Add user metadata to each doc and attach images
for doc in file_content:
doc.metadata['user_id'] = user_id
# Attach extracted images to the document
if extracted_images:
# Convert ImageContent objects to serializable format
doc.metadata['images'] = [
{
'image_id': img.image_id,
'image_path': str(img.image_path),
'page_number': img.page_number,
'position': img.position,
'metadata': img.metadata
}
for img in extracted_images
]
log.info(f"Attached {len(extracted_images)} images to document metadata")
        # The retrieved docs are exposed to the UI,
        # so hide the full server file path if it is present:
if 'file_path' in doc.metadata:
doc.metadata['file_path'] = os.path.basename(doc.metadata['file_path'])
if 'source' in doc.metadata:
# If it is not local file, keep source as is:
if "www." in doc.metadata['source'] or "http" in doc.metadata['source']:
continue
# If it is local file, keep only the file name:
else:
doc.metadata['source'] = os.path.basename(doc.metadata['source'])
if not file_content:
log.error(f"No content found in the file: {file_path}")
return True, [], f"No content found in the file: {file_path}"
log.info(f"Loaded {len(file_content)} documents from {file_path} for user {user_id} (with {len(extracted_images)} images).")
return True, file_content, f"Loaded {len(file_content)} documents with {len(extracted_images)} images."
if __name__ == "__main__":
# Example usage
print(os.getcwd())
try:
status, docs, message = load_file(
user_id="test_user",
file_path="/Users/neetikasaxena/Documents/sanchit/sample_code/chat-with-your-data/test_data/resume_sanchit_imo_health.pdf"
# file_path="../../../GenAI/Data/speech.txt"
# file_path="../../../GenAI/Data/speech.md"
)
print(status)
print(message)
print(len(docs))
for ind, doc in enumerate(docs[:3]):
print("\n")
print(repr(doc))
except Exception as e:
print(f"Error loading file: {e}")