Create pymupdf_test_WIP.py
Browse files- pymupdf_test_WIP.py +428 -0
pymupdf_test_WIP.py
ADDED
|
@@ -0,0 +1,428 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ── Standard library ────────────────────────────────────────────────────────
|
| 2 |
+
import os
|
| 3 |
+
import sys
|
| 4 |
+
import json
|
| 5 |
+
import math
|
| 6 |
+
import queue
|
| 7 |
+
import shutil
|
| 8 |
+
import logging
|
| 9 |
+
import tempfile
|
| 10 |
+
import threading
|
| 11 |
+
import subprocess
|
| 12 |
+
import multiprocessing
|
| 13 |
+
from pathlib import Path
|
| 14 |
+
from multiprocessing import Pool
|
| 15 |
+
|
| 16 |
+
# ── Third-party ─────────────────────────────────────────────────────────────
|
| 17 |
+
import fitz # PyMuPDF
|
| 18 |
+
import tkinter as tk
|
| 19 |
+
from tkinter import filedialog, messagebox
|
| 20 |
+
from joblib import cpu_count, Parallel, delayed
|
| 21 |
+
|
| 22 |
+
# ── Parser configuration ────────────────────────────────────────────────────
|
| 23 |
+
# Page count up to which a document is processed serially; documents with
# more pages are handed to a multiprocessing pool.
PARALLEL_THRESHOLD = 16

# Tolerance, in points, used when snapping nearly identical ruling positions.
LINE_TOLERANCE = 1

# Minimum rectangle area, in square points; smaller rectangles are ignored.
MIN_RECT_AREA = 1e4
|
| 26 |
+
|
| 27 |
+
# ── pdfplumber-style clustering helpers ─────────────────────────────────────
|
| 28 |
+
def cluster_list(xs, tol):
    """Group values into clusters of neighbours lying at most `tol` apart.

    The values are sorted first. A value joins the current cluster when its
    distance to the previous (sorted) value is <= `tol`; otherwise it starts
    a new cluster. Returns a list of lists in ascending order; an empty input
    yields an empty list.
    """
    clusters = []
    for value in sorted(xs):
        # Compare against the last member of the most recent cluster, so a
        # chain of small gaps keeps extending the same cluster.
        if clusters and value - clusters[-1][-1] <= tol:
            clusters[-1].append(value)
        else:
            clusters.append([value])
    return clusters
|
| 42 |
+
|
| 43 |
+
def make_cluster_dict(vals, tol):
    """Map every distinct value in `vals` to the index of its cluster.

    Clusters are built by `cluster_list` with tolerance `tol` and numbered
    0, 1, 2, ... in ascending order of their values.
    """
    distinct = sorted(set(vals))
    return {
        value: cluster_id
        for cluster_id, members in enumerate(cluster_list(distinct, tol))
        for value in members
    }
|
| 51 |
+
|
| 52 |
+
# ── Utility funcs ───────────────────────────────────────────────────────────
|
| 53 |
+
def clean_cell_text(text):
    """Normalise the text of one cell.

    Joins words hyphenated across a line break ("-\\n" is removed), then
    collapses every run of whitespace, including newlines, to a single space
    and trims both ends. Anything that is not a `str` yields "".
    """
    if isinstance(text, str):
        dehyphenated = text.replace("-\n", "")
        # split() without arguments breaks on any whitespace run, so the
        # remaining newlines turn into single spaces here as well.
        return " ".join(dehyphenated.split())
    return ""
|
| 58 |
+
|
| 59 |
+
def safe_join(row):
    """Return the cells of `row` as cleaned strings.

    `None` cells become ""; every other cell is converted with `str()` and
    passed through `clean_cell_text`.
    """
    cleaned = []
    for cell in row:
        cleaned.append("" if cell is None else clean_cell_text(str(cell)))
    return cleaned
|
| 61 |
+
|
| 62 |
+
def clamp_bbox(bbox, page_rect):
    """Clip a bounding box so that it lies within `page_rect`.

    `bbox` unpacks to (x0, y0, x1, y1); `page_rect` exposes the attributes
    x0, y0, x1 and y1. Returns the clipped coordinates as a plain tuple.
    """
    left, top, right, bottom = bbox

    def _clip(value, low, high):
        # Pull `value` into the closed interval [low, high].
        return max(low, min(value, high))

    return (
        _clip(left, page_rect.x0, page_rect.x1),
        _clip(top, page_rect.y0, page_rect.y1),
        _clip(right, page_rect.x0, page_rect.x1),
        _clip(bottom, page_rect.y0, page_rect.y1),
    )
|
| 69 |
+
|
| 70 |
+
# ── Improved table detection with snapping ──────────────────────────────────
|
| 71 |
+
def detect_table_bboxes(page: fitz.Page, tol=LINE_TOLERANCE):
    """
    Detect table rectangles on `page` from its ruling lines.

    Steps:
      1. Collect stroked vector paths whose bounding rectangle is very thin
         (under 2 pt high -> horizontal ruling, under 2 pt wide -> vertical).
      2. Snap the ruling positions into clusters using tolerance `tol`.
      3. Form a grid from the averaged row and column positions.
      4. Keep every grid rectangle whose area is at least MIN_RECT_AREA.

    Args:
        page: the PyMuPDF page to inspect.
        tol: snapping tolerance in points.

    Returns:
        A list of fitz.Rect. Empty when the page has no horizontal or no
        vertical rulings.
    """
    horiz_raw, vert_raw = [], []
    for d in page.get_drawings():
        # PyMuPDF reports the path type as a string: "s" (stroke), "f" (fill)
        # or "fs" (both). Comparing it with the integer 1 rejected every path.
        if "s" not in str(d.get("type", "")):
            continue
        # The path's bounding box is stored under "rect" (there is no "bbox").
        x0, y0, x1, y1 = d["rect"]
        if abs(y1 - y0) < 2:  # horizontal line
            horiz_raw.append(((y0 + y1) / 2, x0, x1))
        elif abs(x1 - x0) < 2:  # vertical line
            vert_raw.append(((x0 + x1) / 2, y0, y1))

    if not horiz_raw or not vert_raw:
        return []

    row_map = make_cluster_dict([y for y, _, _ in horiz_raw], tol)
    col_map = make_cluster_dict([x for x, _, _ in vert_raw], tol)

    # Collect the raw positions belonging to each cluster id.
    rows = {}
    for y, _, _ in horiz_raw:
        rows.setdefault(row_map[y], []).append(y)
    cols = {}
    for x, _, _ in vert_raw:
        cols.setdefault(col_map[x], []).append(x)

    # One averaged coordinate per cluster, in ascending order.
    row_pos = sorted(sum(v) / len(v) for v in rows.values())
    col_pos = sorted(sum(v) / len(v) for v in cols.values())

    # Every pair of neighbouring rows/columns spans one grid rectangle.
    rects = []
    for r0, r1 in zip(row_pos, row_pos[1:]):
        for c0, c1 in zip(col_pos, col_pos[1:]):
            rect = fitz.Rect(c0, r0, c1, r1)
            if rect.get_area() >= MIN_RECT_AREA:
                rects.append(rect)

    # Remove duplicates / contained rects
    unique = []
    for rect in rects:
        if not any(u.contains(rect) or rect.contains(u) for u in unique):
            unique.append(rect)

    return unique
|
| 124 |
+
|
| 125 |
+
# ── Table extraction (simple text grouping) ─────────────────────────────────
|
| 126 |
+
def extract_table(page: fitz.Page, table_rect: fitz.Rect):
    """Turn the words inside `table_rect` into a list of row dicts.

    A word counts as inside when its first two fields (used here as its
    x/y position) fall within the rectangle. Words are sorted by y then x and
    grouped into text lines: a new line starts when a word's y differs by more
    than 5 from the y of the line's first word. The first line is the header.

    Returns:
        One dict per remaining line, mapping the cleaned header line text to
        the cleaned line text. Empty when no words fall inside the rectangle.
    """
    inside = []
    for word in page.get_text("words"):
        if not (table_rect.x0 <= word[0] <= table_rect.x1):
            continue
        if not (table_rect.y0 <= word[1] <= table_rect.y1):
            continue
        inside.append(word)
    inside.sort(key=lambda word: (word[1], word[0]))  # top-to-bottom, then left-to-right

    # Group the sorted words into lines.
    grouped = []
    anchor_y = None
    for word in inside:
        if anchor_y is None or abs(word[1] - anchor_y) > 5:
            grouped.append([word])
            anchor_y = word[1]
        else:
            grouped[-1].append(word)

    if not grouped:
        return []

    # word[4] is used as the word's text.
    line_texts = [" ".join(word[4] for word in line) for line in grouped]
    header_text, *body_texts = line_texts
    headers = safe_join([header_text])
    return [dict(zip(headers, safe_join([text]))) for text in body_texts]
|
| 155 |
+
|
| 156 |
+
# ── Per-page worker ─────────────────────────────────────────────────────────
|
| 157 |
+
def process_page(args):
    """Extract the text and tables of one page.

    Args:
        args: a `(page_number, pdf_path)` tuple; `page_number` is zero-based.
            The document is opened inside this function.

    Returns:
        `(page_number, text)`. The text starts with a "Page N" heading,
        followed by the words lying outside all detected table rectangles
        (grouped into lines) and one JSON dump per non-empty table. If
        anything fails, the text is an "[ERROR] ..." message instead.
    """
    page_number, pdf_path = args
    try:
        with fitz.open(pdf_path) as doc:
            page = doc.load_page(page_number)
            page_rect = page.rect

            # Detect tables and serialise every non-empty one.
            table_rects = detect_table_bboxes(page)
            table_jsons = []
            for rect in table_rects:
                tbl = extract_table(page, rect)
                if tbl:
                    table_jsons.append(json.dumps(tbl, indent=1, ensure_ascii=False))

            # Words outside tables
            tbl_boxes = [clamp_bbox(rect, page_rect) for rect in table_rects]

            def _in_any_table(word):
                for bx0, by0, bx1, by1 in tbl_boxes:
                    if bx0 <= word[0] <= bx1 and by0 <= word[1] <= by1:
                        return True
                return False

            outside = sorted(
                (w for w in page.get_text("words") if not _in_any_table(w)),
                key=lambda w: (w[1], w[0]),
            )

            # Group into lines: a new line starts when a word's y differs by
            # more than 10 from the y of the current line's first word.
            text_lines = []
            anchor_y = None
            for w in outside:
                if anchor_y is None or abs(w[1] - anchor_y) > 10:
                    text_lines.append([w[4]])
                    anchor_y = w[1]
                else:
                    text_lines[-1].append(w[4])
            text = "".join(" ".join(tokens) + "\n" for tokens in text_lines)

            parts = [f"Page {page_number + 1}\n", text.strip() + "\n"]
            parts.extend(
                f'"table {idx}":\n{tbl}\n' for idx, tbl in enumerate(table_jsons, 1)
            )
            return page_number, "".join(parts)

    except fitz.FileDataError as e:
        return page_number, f"[ERROR] Page {page_number+1} ({pdf_path}): encrypted / unreadable β {e}"
    except Exception as e:
        return page_number, f"[ERROR] Page {page_number+1} ({pdf_path}): {e}"
|
| 202 |
+
|
| 203 |
+
# ── Document-level processing ───────────────────────────────────────────────
|
| 204 |
+
def process_pdf(pdf_path):
    """Extract one PDF and write the result next to it as `<name>.txt`.

    Documents with at most PARALLEL_THRESHOLD pages are processed serially,
    larger ones through `run_parallel`.

    Args:
        pdf_path: path of the PDF file to process.

    Returns:
        A status message that is always a string: "[INFO] Processing
        complete: <output path>" on success, otherwise an "[ERROR] ..." or
        "[INFO] ... interrupted" message. Callers print this value, so the
        success case must not fall through and return None.
    """
    try:
        if not os.path.exists(pdf_path):
            return f"[ERROR] File not found: {pdf_path}"

        print(f"[INFO] Starting processing: {pdf_path}")
        try:
            with fitz.open(pdf_path) as doc:
                num_pages = doc.page_count
        except fitz.FileDataError as e:
            return f"[ERROR] Cannot open PDF: {pdf_path} β {e}"
        except Exception as e:
            return f"[ERROR] General error opening PDF: {pdf_path} β {e}"

        pages = [(i, pdf_path) for i in range(num_pages)]
        if num_pages <= PARALLEL_THRESHOLD:
            results = run_serial(pages)
        else:
            results = run_parallel(pages)

        # Restore page order before concatenating the per-page texts.
        results.sort(key=lambda x: x[0])
        final_output = "\n".join(t for _, t in results)

        base = os.path.splitext(os.path.basename(pdf_path))[0]
        out_dir = os.path.dirname(pdf_path)
        out_path = os.path.join(out_dir, f"{base}.txt")
        with open(out_path, "w", encoding="utf-8", errors="ignore") as f:
            f.write(final_output)
        # Returned rather than printed: every caller prints the return value,
        # and the previous implicit None showed up as a stray "None" line.
        return f"[INFO] Processing complete: {out_path}"
    except (EOFError, BrokenPipeError, KeyboardInterrupt):
        return "[INFO] Processing interrupted by user."
    except Exception as e:
        return f"[ERROR] Unexpected error with '{pdf_path}': {e}"
|
| 234 |
+
|
| 235 |
+
def run_serial(pages):
    """Process every `(page_number, pdf_path)` item in order in this process.

    Returns the list of `process_page` results in input order.
    """
    results = []
    for page_args in pages:
        results.append(process_page(page_args))
    return results
|
| 236 |
+
|
| 237 |
+
def run_parallel(pages):
    """Process the `(page_number, pdf_path)` items in a multiprocessing pool.

    The pool size is the CPU count minus two (at least one), capped at the
    number of pages. Returns the `process_page` results in input order.
    """
    spare = cpu_count() - 2
    cores = min(spare if spare > 1 else 1, len(pages))
    print(f"Starting parallel processing with {cores} coresβ¦")
    with Pool(cores) as pool:
        return pool.map(process_page, pages)
|
| 242 |
+
|
| 243 |
+
# ── Batch CLI entrypoint ────────────────────────────────────────────────────
|
| 244 |
+
def process_pdfs_main():
    """Command-line entry point: process every PDF path given in `sys.argv`.

    Phase 1 runs all documents with at most PARALLEL_THRESHOLD pages
    concurrently, one document per joblib job. Phase 2 then handles the larger
    documents one at a time. Missing or unopenable files are reported and
    skipped. Each status message is printed.
    """
    pdfs = sys.argv[1:]
    if not pdfs:
        print("No PDF files provided.")
        return

    # Sort the inputs into the two phases by page count.
    small, large = [], []
    for p in pdfs:
        if not os.path.exists(p):
            print(f"File not found: {p}")
            continue
        try:
            with fitz.open(p) as doc:
                bucket = large if doc.page_count > PARALLEL_THRESHOLD else small
                bucket.append(p)
        except fitz.FileDataError:
            print(f"[ERROR] Password-protected PDF skipped: {p}")
        except Exception as e:
            print(f"[ERROR] Error opening {p}: {e}")

    if small:
        cores = min(max(1, cpu_count() - 2), len(small))
        print(f"\n[Phase 1] Parallel processing of {len(small)} small PDFs with {cores} cores β¦")
        jobs = (delayed(process_pdf)(p) for p in small)
        for r in Parallel(n_jobs=cores)(jobs):
            print(r)

    for p in large:
        print(f"\n[Phase 2] Processing large PDF: {os.path.basename(p)}")
        print(process_pdf(p))
|
| 272 |
+
|
| 273 |
+
# ── Tkinter GUI ─────────────────────────────────────────────────────────────
|
| 274 |
+
class FileManager:
    """Tkinter front end for choosing PDFs and running the parser on them.

    The parser runs as a child process (this same script with the file paths
    as arguments). Its output is streamed into the progress box by a worker
    thread, which hands all widget updates to the Tk event loop via `after()`.
    """

    def __init__(self, master):
        """Build all widgets inside `master` (the Tk root window)."""
        self.master = master
        master.title("Parser-Sevenof9 β PyMuPDF")

        self.files = []            # selected PDF paths, parallel to the listbox rows
        self.last_selected = None  # anchor row for shift-click range selection
        tk.Label(master, text="Selected PDF files:").pack(pady=5)

        list_frame = tk.Frame(master)
        list_frame.pack(pady=5)
        sb_list = tk.Scrollbar(list_frame)
        self.listbox = tk.Listbox(list_frame, selectmode=tk.MULTIPLE, width=80, height=6,
                                  yscrollcommand=sb_list.set)
        sb_list.config(command=self.listbox.yview)
        self.listbox.pack(side=tk.LEFT, fill=tk.BOTH, expand=True)
        sb_list.pack(side=tk.RIGHT, fill=tk.Y)
        self.listbox.bind("<<ListboxSelect>>", self.show_text)
        self.listbox.bind("<Button-1>", self.on_click)
        self.listbox.bind("<Shift-Button-1>", self.on_shift_click)

        self.ctx = tk.Menu(master, tearoff=0)
        self.ctx.add_command(label="Remove selected", command=self.remove_file)
        self.listbox.bind("<Button-3>", self._show_context_menu)

        btn_frame = tk.Frame(master)
        btn_frame.pack(pady=10)
        tk.Button(btn_frame, text="Add Folder", command=self.add_folder).pack(side=tk.LEFT, padx=5)
        tk.Button(btn_frame, text="Select Files", command=self.add_file).pack(side=tk.LEFT, padx=5)
        tk.Button(btn_frame, text="Remove Selected", command=self.remove_file).pack(side=tk.LEFT, padx=5)
        tk.Button(btn_frame, text="Remove All", command=self.remove_all).pack(side=tk.LEFT, padx=5)
        tk.Button(master, text="Stop", command=self.stop_parser).pack(pady=5)
        tk.Button(master, text="Start Parser", command=self.start_parser).pack(pady=10)

        tx_frame = tk.Frame(master)
        tx_frame.pack(padx=10, pady=5)
        sb_text = tk.Scrollbar(tx_frame)
        self.text = tk.Text(tx_frame, height=15, width=100, wrap=tk.WORD, yscrollcommand=sb_text.set)
        sb_text.config(command=self.text.yview)
        self.text.pack(side=tk.LEFT, fill=tk.BOTH, expand=True)
        sb_text.pack(side=tk.RIGHT, fill=tk.Y)

        tk.Label(master, text="Progress:").pack()
        prog_frame = tk.Frame(master)
        prog_frame.pack(padx=10, pady=5)
        sb_prog = tk.Scrollbar(prog_frame)
        self.prog = tk.Text(prog_frame, height=8, width=100, state=tk.DISABLED, yscrollcommand=sb_prog.set)
        sb_prog.config(command=self.prog.yview)
        self.prog.pack(side=tk.LEFT, fill=tk.BOTH, expand=True)
        sb_prog.pack(side=tk.RIGHT, fill=tk.Y)

        self.parser_proc = None    # running parser child process, if any
        self.parser_thread = None  # worker thread streaming the child's output

    # ── Listbox helpers ─────────────────────────────────────────────────────
    def _show_context_menu(self, e):
        """Open the context menu at the pointer, but only if a row is selected."""
        if self.listbox.curselection():
            self.ctx.tk_popup(e.x_root, e.y_root)

    def on_click(self, e):
        """Plain click: select only the clicked row and show its text file."""
        idx = self.listbox.nearest(e.y)
        self.listbox.selection_clear(0, tk.END)
        self.listbox.selection_set(idx)
        self.last_selected = idx
        self.show_text(None)
        return "break"  # suppress the listbox's default click handling

    def on_shift_click(self, e):
        """Shift-click: select the range between the anchor row and this row."""
        idx = self.listbox.nearest(e.y)
        if self.last_selected is None:
            self.last_selected = idx
        lo, hi = sorted((self.last_selected, idx))
        self.listbox.selection_clear(0, tk.END)
        for i in range(lo, hi + 1):
            self.listbox.selection_set(i)
        return "break"

    # ── File ops ────────────────────────────────────────────────────────────
    def _add_path(self, path):
        """Append `path` to the file list and the listbox unless already present."""
        if path not in self.files:
            self.files.append(path)
            self.listbox.insert(tk.END, path)

    def add_folder(self):
        """Ask for a folder and add every *.pdf found in it, recursively."""
        folder = filedialog.askdirectory(title="Select Folder")
        if not folder:
            return
        for root, _, fs in os.walk(folder):
            for f in fs:
                if f.lower().endswith(".pdf"):
                    self._add_path(os.path.join(root, f))

    def add_file(self):
        """Ask for one or more PDF files and add them."""
        for p in filedialog.askopenfilenames(title="Select PDF Files", filetypes=[("PDF Files", "*.pdf")]):
            self._add_path(p)

    def remove_file(self):
        """Remove the selected rows from the listbox and the file list."""
        sel = self.listbox.curselection()
        if not sel:
            messagebox.showwarning("Notice", "Please select an entry to remove.")
            return
        # Delete from the bottom up so the remaining indices stay valid.
        for idx in reversed(sel):
            self.listbox.delete(idx)
            del self.files[idx]
        self.text.delete("1.0", tk.END)

    def remove_all(self):
        """Clear the listbox, the file list and the text preview."""
        self.listbox.delete(0, tk.END)
        self.files.clear()
        self.text.delete("1.0", tk.END)

    # ── Parser control ──────────────────────────────────────────────────────
    def start_parser(self):
        """Start the parser in a worker thread, unless one is already running."""
        if not self.files:
            messagebox.showinfo("No Files", "Please select at least one file.")
            return
        # A second click while a run is active would spawn a second child and
        # overwrite parser_proc, leaving the first one impossible to stop.
        worker = getattr(self, "parser_thread", None)
        if worker is not None and worker.is_alive():
            self.append_prog("Parser is already running.\n")
            return
        self.prog.config(state=tk.NORMAL)
        self.prog.delete("1.0", tk.END)
        self.prog.insert(tk.END, "Starting parserβ¦\n")
        self.prog.config(state=tk.DISABLED)
        self.parser_thread = threading.Thread(target=self.run_parser)
        self.parser_thread.start()

    def stop_parser(self):
        """Terminate the running parser process, if there is one."""
        # Take one snapshot: the worker thread resets self.parser_proc to None
        # when it finishes, which could happen between a check and a use.
        proc = self.parser_proc
        if proc is not None and proc.poll() is None:
            proc.terminate()
            self.append_prog("Parser process was stopped.\n")
        else:
            self.append_prog("No active parser process to stop.\n")

    def run_parser(self):
        """Worker-thread body: run the parser child and stream its output."""
        try:
            proc = subprocess.Popen(
                [sys.executable, __file__] + self.files,
                stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
                text=True, encoding="utf-8", errors="ignore", bufsize=4096
            )
            self.parser_proc = proc
            for line in proc.stdout:
                self.append_prog(line)
            proc.stdout.close()
            proc.wait()
            if proc.returncode == 0:
                self.append_prog("\nParser finished successfully.\n")
                self.shell_msg("Parser Done", "The parser was executed successfully.")
            else:
                self.append_prog("\nError while running the parser.\n")
                self.shell_msg("Error", "Error while running the parser.")
        except Exception as e:
            self.append_prog(f"Error: {e}\n")
            self.shell_msg("Error", f"Execution error:\n{e}")
        finally:
            self.parser_proc = None

    # ── GUI helpers ─────────────────────────────────────────────────────────
    def append_prog(self, txt):
        """Schedule `txt` to be appended to the progress box by the event loop."""
        self.prog.after(0, lambda: self._ins(txt))

    def _ins(self, txt):
        """Append `txt` to the read-only progress box and scroll to the end."""
        self.prog.config(state=tk.NORMAL)
        self.prog.insert(tk.END, txt)
        self.prog.see(tk.END)
        self.prog.config(state=tk.DISABLED)

    def shell_msg(self, title, msg):
        """Schedule an info message box to be shown by the event loop."""
        self.master.after(0, lambda: messagebox.showinfo(title, msg))

    def show_text(self, _):
        """Show the `.txt` file belonging to the first selected PDF, if any."""
        sel = self.listbox.curselection()
        if not sel:
            return
        path = self.files[sel[0]]
        txt = os.path.splitext(path)[0] + ".txt"
        self.text.delete("1.0", tk.END)
        if os.path.exists(txt):
            try:
                with open(txt, "r", encoding="utf-8", errors="ignore") as f:
                    self.text.insert(tk.END, f.read())
            except Exception as e:
                self.text.insert(tk.END, f"Error loading text file:\n{e}")
        else:
            self.text.insert(tk.END, "[No corresponding .txt file found]")
|
| 421 |
+
|
| 422 |
+
# ── Main guard ──────────────────────────────────────────────────────────────
|
| 423 |
+
if __name__ == "__main__":
    multiprocessing.freeze_support()
    # With file arguments, run as the batch CLI (this is how the GUI launches
    # its parser child process); without arguments, open the GUI.
    cli_args = sys.argv[1:]
    if cli_args:
        process_pdfs_main()
    else:
        root = tk.Tk()
        FileManager(root)
        root.mainloop()
|