{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import torch\n", "from transformers import GPT2Tokenizer, GPT2LMHeadModel\n", "from sklearn.metrics import accuracy_score, recall_score\n", "import numpy as np\n", "from datasets import load_dataset\n", "from PIL import Image, ImageEnhance\n", "import os\n", "import cv2\n", "from sklearn.preprocessing import LabelEncoder\n", "import json\n", "import csv\n", "import re\n", "import pandas as pd" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def prepare_dataset(ocr_dir, csv_dir, output_file):\n", " with open(output_file, 'w', encoding='utf-8') as jsonl_file:\n", " for filename in os.listdir(ocr_dir):\n", " if filename.endswith('.txt'):\n", " ocr_path = os.path.join(ocr_dir, filename)\n", " csv_path = os.path.join(csv_dir, filename)#.replace('.txt', '.csv'))\n", " print(csv_path)\n", " # if not os.path.exists(csv_path):\n", " # print(f\"Warning: Corresponding CSV file not found for {ocr_path}\")\n", " # continue\n", " \n", " with open(ocr_path, 'r', encoding='utf-8') as ocr_file:\n", " ocr_text = ocr_file.read()\n", " \n", " with open(csv_path, 'r', encoding='utf-8') as csv_file:\n", " csv_text = csv_file.read()\n", " \n", " json_object = {\n", " \"prompt\": ocr_text,\n", " \"completion\": csv_text\n", " }\n", " jsonl_file.write(json.dumps(json_object) + '\\n')\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Usage\n", "ocr_dir = os.getcwd() + '/../data/processed/annotations'\n", "csv_dir = os.getcwd() + '/../data/processed/hand_labeled_tables/hand_labeled_tables'\n", "output_file = 'dataset.jsonl'\n", "prepare_dataset(ocr_dir, csv_dir, output_file)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Load pre-trained GPT model and tokenizer\n", "model_name = 'gpt2'\n", "tokenizer = GPT2Tokenizer.from_pretrained(model_name)\n", "model = GPT2LMHeadModel.from_pretrained(model_name)\n", "\n", "# Ensure the model is in evaluation mode\n", "model.eval()\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def preprocess_text(text):\n", " # Basic cleaning for OCR text\n", " text = re.sub(r'\\s+', ' ', text) # Remove extra whitespace\n", " text = re.sub(r'[^a-zA-Z0-9\\s,.:()%+-]', '', text) # Remove most special characters, but keep some relevant ones\n", " return text.strip()\n", "\n", "def calculate_loss(model, tokenizer, prompt, true_completion):\n", " # Combine prompt and completion for full context\n", " full_text = f\"{prompt} {true_completion}\"\n", " inputs = tokenizer.encode(full_text, return_tensors='pt', truncation=True, max_length=512)\n", " \n", " # Calculate loss\n", " with torch.no_grad():\n", " outputs = model(inputs, labels=inputs)\n", " \n", " return outputs.loss.item()\n", "\n", "def evaluate_json_dataset(json_file, model, tokenizer):\n", " with open(json_file, 'r') as f:\n", " dataset = [json.loads(line) for line in f]\n", " \n", " losses = []\n", " \n", " for item in dataset:\n", " prompt = preprocess_text(item['prompt'])\n", " completion = preprocess_text(item['completion'])\n", " \n", " loss = calculate_loss(model, tokenizer, prompt, completion)\n", " losses.append(loss)\n", " \n", " average_loss = np.mean(losses)\n", " \n", " return average_loss" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "average_loss = 
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Average cross-entropy loss of the pre-trained GPT-2 model on the prepared dataset\n", "average_loss = evaluate_json_dataset('dataset.jsonl', model, tokenizer)\n", "print(f\"Average cross-entropy loss: {average_loss:.4f}\")" ] } ], "metadata": { "kernelspec": { "display_name": "term_project", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.19" } }, "nbformat": 4, "nbformat_minor": 2 }