Upload folder using huggingface_hub
- notebooks/model.ipynb +2 -2
- notebooks/naive.ipynb +155 -0
- notebooks/svm.ipynb +0 -0
notebooks/model.ipynb
CHANGED

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:65ea1ca0239919445b4377838a0e614ddf2afb5648287551618d12cf8d46fbfa
+size 18438
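The change to model.ipynb is only a Git LFS pointer update: the pointer file records the new object hash and size, while the notebook contents live in LFS storage (the old hash and size are elided in this render). As a minimal sketch, with a hypothetical repo id standing in for the real one, the resolved file could be fetched with huggingface_hub rather than by reading the pointer:

# Sketch (assumes a hypothetical repo id "user/repo"): download the
# LFS-backed notebook itself rather than the pointer shown in the diff.
from huggingface_hub import hf_hub_download

local_path = hf_hub_download(
    repo_id="user/repo",               # hypothetical; substitute the actual repo
    filename="notebooks/model.ipynb",  # path within the repo, as in this commit
)
print(local_path)  # local cache path of the resolved ~18 KB file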
notebooks/naive.ipynb
ADDED

@@ -0,0 +1,155 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import torch\n",
+    "from transformers import GPT2Tokenizer, GPT2LMHeadModel\n",
+    "from sklearn.metrics import accuracy_score, recall_score\n",
+    "import numpy as np\n",
+    "from datasets import load_dataset\n",
+    "from PIL import Image, ImageEnhance\n",
+    "import os\n",
+    "import cv2\n",
+    "from sklearn.preprocessing import LabelEncoder\n",
+    "import json\n",
+    "import csv\n",
+    "import re\n",
+    "import pandas as pd"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def prepare_dataset(ocr_dir, csv_dir, output_file):\n",
+    "    with open(output_file, 'w', encoding='utf-8') as jsonl_file:\n",
+    "        for filename in os.listdir(ocr_dir):\n",
+    "            if filename.endswith('.txt'):\n",
+    "                ocr_path = os.path.join(ocr_dir, filename)\n",
+    "                csv_path = os.path.join(csv_dir, filename)#.replace('.txt', '.csv'))\n",
+    "                print(csv_path)\n",
+    "                # if not os.path.exists(csv_path):\n",
+    "                #     print(f\"Warning: Corresponding CSV file not found for {ocr_path}\")\n",
+    "                #     continue\n",
+    "\n",
+    "                with open(ocr_path, 'r', encoding='utf-8') as ocr_file:\n",
+    "                    ocr_text = ocr_file.read()\n",
+    "\n",
+    "                with open(csv_path, 'r', encoding='utf-8') as csv_file:\n",
+    "                    csv_text = csv_file.read()\n",
+    "\n",
+    "                json_object = {\n",
+    "                    \"prompt\": ocr_text,\n",
+    "                    \"completion\": csv_text\n",
+    "                }\n",
+    "                jsonl_file.write(json.dumps(json_object) + '\\n')\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Usage\n",
+    "ocr_dir = os.getcwd() + '/../data/processed/annotations'\n",
+    "csv_dir = os.getcwd() + '/../data/processed/hand_labeled_tables/hand_labeled_tables'\n",
+    "output_file = 'dataset.jsonl'\n",
+    "prepare_dataset(ocr_dir, csv_dir, output_file)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Load pre-trained GPT model and tokenizer\n",
+    "model_name = 'gpt2'\n",
+    "tokenizer = GPT2Tokenizer.from_pretrained(model_name)\n",
+    "model = GPT2LMHeadModel.from_pretrained(model_name)\n",
+    "\n",
+    "# Ensure the model is in evaluation mode\n",
+    "model.eval()\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def preprocess_text(text):\n",
+    "    # Basic cleaning for OCR text\n",
+    "    text = re.sub(r'\\s+', ' ', text)  # Remove extra whitespace\n",
+    "    text = re.sub(r'[^a-zA-Z0-9\\s,.:()%+-]', '', text)  # Remove most special characters, but keep some relevant ones\n",
+    "    return text.strip()\n",
+    "\n",
+    "def calculate_loss(model, tokenizer, prompt, true_completion):\n",
+    "    # Combine prompt and completion for full context\n",
+    "    full_text = f\"{prompt} {true_completion}\"\n",
+    "    inputs = tokenizer.encode(full_text, return_tensors='pt', truncation=True, max_length=512)\n",
+    "\n",
+    "    # Calculate loss\n",
+    "    with torch.no_grad():\n",
+    "        outputs = model(inputs, labels=inputs)\n",
+    "\n",
+    "    return outputs.loss.item()\n",
+    "\n",
+    "def evaluate_json_dataset(json_file, model, tokenizer):\n",
+    "    with open(json_file, 'r') as f:\n",
+    "        dataset = [json.loads(line) for line in f]\n",
+    "\n",
+    "    losses = []\n",
+    "\n",
+    "    for item in dataset:\n",
+    "        prompt = preprocess_text(item['prompt'])\n",
+    "        completion = preprocess_text(item['completion'])\n",
+    "\n",
+    "        loss = calculate_loss(model, tokenizer, prompt, completion)\n",
+    "        losses.append(loss)\n",
+    "\n",
+    "    average_loss = np.mean(losses)\n",
+    "\n",
+    "    return average_loss"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "average_loss = evaluate_json_dataset('dataset.jsonl', model, tokenizer)\n",
+    "print(f\"cross-entropy loss: {average_loss:.4f}\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "term_project",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.19"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
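A note on the baseline added in naive.ipynb: calculate_loss scores the concatenation of prompt and completion, so the reported cross-entropy also reflects how predictable the OCR prompt itself is to GPT-2. A minimal sketch of a variant that scores only the completion tokens, assuming the same GPT-2 setup as the notebook, masks the prompt positions with the ignore index -100 (the helper completion_loss is hypothetical, not part of the commit; the split at the BPE boundary is approximate):

import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')
model.eval()

def completion_loss(prompt, completion, max_length=512):
    # Tokenize the prompt alone to estimate how many leading positions to mask.
    prompt_len = len(tokenizer.encode(prompt))
    input_ids = tokenizer.encode(f"{prompt} {completion}", return_tensors='pt',
                                 truncation=True, max_length=max_length)
    labels = input_ids.clone()
    # -100 is ignored by GPT2LMHeadModel's built-in cross-entropy loss;
    # the slice is clamped in case truncation cut into the prompt.
    labels[0, :min(prompt_len, input_ids.size(1))] = -100
    with torch.no_grad():
        outputs = model(input_ids, labels=labels)
    return outputs.loss.item()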
notebooks/svm.ipynb
CHANGED

The diff for this file is too large to render; see the raw diff.