Spaces:
Sleeping
Sleeping
Upload 14 files
Browse files- .gitattributes +6 -36
- .gitignore +2 -0
- Dockerfile +20 -0
- README.md +250 -5
- api_log.txt +20 -0
- app.py +213 -0
- baseline_analysis.py +55 -0
- baseline_translate.py +51 -0
- debug_load.py +26 -0
- fast_api.py +214 -0
- interactive_translate.py +74 -0
- requirements.txt +92 -0
- test_analysis.py +84 -0
- test_translation.py +71 -0
.gitattributes
CHANGED
|
@@ -1,38 +1,8 @@
|
|
| 1 |
-
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
-
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
-
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
-
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
-
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
-
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
-
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
-
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
-
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
-
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
-
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
-
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
-
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
-
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
-
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
-
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
-
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
-
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
-
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
-
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
-
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
-
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
-
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
-
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
-
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
-
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
-
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
-
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
-
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
-
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
-
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
-
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
-
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
-
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
-
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
data/processed/nepali.en filter=lfs diff=lfs merge=lfs -text
|
| 37 |
data/processed/nepali.ne filter=lfs diff=lfs merge=lfs -text
|
| 38 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
data/processed/nepali.en filter=lfs diff=lfs merge=lfs -text
|
| 2 |
data/processed/nepali.ne filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.png filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.jpeg filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ico filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.en filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.ne filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
.venv/
|
| 2 |
+
models/
|
Dockerfile
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Use an official Python runtime as a parent image
|
| 2 |
+
FROM python:3.10-slim
|
| 3 |
+
|
| 4 |
+
# Set the working directory in the container
|
| 5 |
+
WORKDIR /code
|
| 6 |
+
|
| 7 |
+
# Copy the requirements file into the container at /code
|
| 8 |
+
COPY ./requirements.txt /code/requirements.txt
|
| 9 |
+
|
| 10 |
+
# Install any needed packages specified in requirements.txt
|
| 11 |
+
RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
|
| 12 |
+
|
| 13 |
+
# Copy the rest of the application's code
|
| 14 |
+
COPY . /code/
|
| 15 |
+
|
| 16 |
+
# Expose the port the app runs on
|
| 17 |
+
EXPOSE 7860
|
| 18 |
+
|
| 19 |
+
# Command to run the application
|
| 20 |
+
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
|
README.md
CHANGED
|
@@ -1,10 +1,255 @@
|
|
| 1 |
---
|
| 2 |
-
title: Translate
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
sdk: docker
|
|
|
|
| 7 |
pinned: false
|
| 8 |
---
|
|
|
|
| 9 |
|
| 10 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
+
title: Translate
|
| 3 |
+
emoji: 🌐
|
| 4 |
+
colorFrom: blue
|
| 5 |
+
colorTo: indigo
|
| 6 |
sdk: docker
|
| 7 |
+
app_file: app.py
|
| 8 |
pinned: false
|
| 9 |
---
|
| 10 |
+
# Saksi Translation: Nepali-English Machine Translation
|
| 11 |
|
| 12 |
+
This project provides a machine translation solution for translating Nepali and Sinhala text into English. It builds on the NLLB (No Language Left Behind) model from Meta AI, which is fine-tuned on a custom dataset for improved performance. The project includes a complete workflow from data acquisition to model deployment, featuring a REST API for easy integration.
|
| 13 |
+
|
| 14 |
+
## Table of Contents
|
| 15 |
+
|
| 16 |
+
- [Features](#features)
|
| 17 |
+
- [Workflow](#workflow)
|
| 18 |
+
- [Tech Stack](#tech-stack)
|
| 19 |
+
- [Model Details](#model-details)
|
| 20 |
+
- [API Endpoints](#api-endpoints)
|
| 21 |
+
- [Getting Started](#getting-started)
|
| 22 |
+
- [Usage](#usage)
|
| 23 |
+
- [Project Structure](#project-structure)
|
| 24 |
+
- [Future Improvements](#future-improvements)
|
| 25 |
+
|
| 26 |
+
## Features
|
| 27 |
+
|
| 28 |
+
- **High-Quality Translation:** Utilizes a fine-tuned NLLB model for accurate translations.
|
| 29 |
+
- **Support for Multiple Languages:** Currently supports Nepali-to-English and Sinhala-to-English translation.
|
| 30 |
+
- **REST API:** Exposes the translation model through a high-performance FastAPI application.
|
| 31 |
+
- **Interactive Frontend:** A simple and intuitive web interface for easy translation.
|
| 32 |
+
- **Batch Translation:** Supports translating multiple texts in a single request.
|
| 33 |
+
- **PDF Translation:** Supports translating text directly from PDF files.
|
| 34 |
+
- **Scalable and Reproducible:** Built with a modular structure, with MLflow used for experiment tracking.
|
| 35 |
+
|
| 36 |
+
## Workflow
|
| 37 |
+
|
| 38 |
+
The project follows a standard machine learning workflow for building and deploying a translation model:
|
| 39 |
+
|
| 40 |
+
1. **Data Acquisition:** The process begins with collecting parallel text data (Nepali/Sinhala and English). The `scripts/fetch_parallel_data.py` script is used to download data from various online sources. The quality and quantity of this data are crucial for the model's performance.
|
| 41 |
+
|
| 42 |
+
2. **Data Cleaning and Preprocessing:** Raw data from the web is often noisy and requires cleaning. The `scripts/clean_text_data.py` script performs several preprocessing steps:
|
| 43 |
+
* **HTML Tag Removal:** Strips out HTML tags and other web artifacts.
|
| 44 |
+
* **Unicode Normalization:** Normalizes Unicode characters to ensure consistency.
|
| 45 |
+
* **Sentence Filtering:** Removes sentences that are too long or too short, since such sentences can negatively impact training.
|
| 46 |
+
* **Corpus Alignment:** Ensures a one-to-one correspondence between source and target sentences.
|
| 47 |
+
|
| 48 |
+
3. **Model Fine-tuning:** The core of the project is fine-tuning a pre-trained NLLB model on our custom parallel dataset. The `src/train.py` script, which leverages the Hugging Face `Trainer` API, handles this process. This script manages the entire training loop, including:
|
| 49 |
+
* Loading the pre-trained NLLB model and tokenizer.
|
| 50 |
+
* Creating a PyTorch Dataset from the preprocessed data.
|
| 51 |
+
* Configuring training arguments like learning rate, batch size, and number of epochs.
|
| 52 |
+
* Executing the training loop and saving the fine-tuned model checkpoints.
|
| 53 |
+
|
| 54 |
+
4. **Model Evaluation:** After training, the model's performance is evaluated using the `src/evaluation.py` script. This script calculates the **BLEU (Bilingual Evaluation Understudy)** score, a widely accepted metric for machine translation quality. It works by comparing the model's translations of a test set with a set of high-quality reference translations.
|
| 55 |
+
|
| 56 |
+
5. **Inference and Deployment:** Once the model is trained and evaluated, it's ready for use.
|
| 57 |
+
* `interactive_translate.py`: A command-line script for quick, interactive translation tests.
|
| 58 |
+
* `fast_api.py`: A production-ready REST API built with FastAPI that serves the translation model. This allows other applications to easily consume the translation service.
|
| 59 |
+
|
| 60 |
+
## Tech Stack
|
| 61 |
+
|
| 62 |
+
The technologies used in this project were chosen to create a robust, efficient, and maintainable machine translation pipeline:
|
| 63 |
+
|
| 64 |
+
- **Python:** The primary language for the project, offering a rich ecosystem of libraries and frameworks for machine learning.
|
| 65 |
+
- **PyTorch:** A flexible and powerful deep learning framework that provides fine-grained control over the model training process.
|
| 66 |
+
- **Hugging Face Transformers:** The backbone of the project, providing easy access to pre-trained models like NLLB and a standardized interface for training and inference.
|
| 67 |
+
- **Hugging Face Datasets:** Simplifies the process of loading and preprocessing large datasets, with efficient data loading and manipulation capabilities.
|
| 68 |
+
- **FastAPI:** A modern, high-performance web framework for building APIs with Python. It's used to serve the translation model as a REST API.
|
| 69 |
+
- **Uvicorn:** A lightning-fast ASGI server, used to run the FastAPI application.
|
| 70 |
+
- **MLflow:** Used for experiment tracking to ensure reproducibility. It logs training parameters, metrics, and model artifacts, which is crucial for managing machine learning projects.
|
| 71 |
+
|
| 72 |
+
## Model Details
|
| 73 |
+
|
| 74 |
+
- **Base Model:** The project uses the `facebook/nllb-200-distilled-600M` model, a distilled version of the NLLB-200 model. This model is designed to be efficient while still providing high-quality translations for a large number of languages.
|
| 75 |
+
- **Fine-tuning:** The base model is fine-tuned on a custom dataset of Nepali-English and Sinhala-English parallel text to improve its performance on these specific language pairs.
|
| 76 |
+
- **Tokenizer:** The `NllbTokenizer` is used for tokenizing the text. It is a SentencePiece-based tokenizer designed specifically for the NLLB model.
|
| 77 |
+
|
| 78 |
+
## API Endpoints
|
| 79 |
+
|
| 80 |
+
The FastAPI application provides the following endpoints:
|
| 81 |
+
|
| 82 |
+
- **`GET /`**: Returns the frontend HTML page.
|
| 83 |
+
- **`GET /languages`**: Returns a list of supported languages.
|
| 84 |
+
- **`POST /translate`**: Translates a single text.
|
| 85 |
+
- **Request Body:**
|
| 86 |
+
```json
|
| 87 |
+
{
|
| 88 |
+
"text": "string",
|
| 89 |
+
"source_language": "string"
|
| 90 |
+
}
|
| 91 |
+
```
|
| 92 |
+
- **Response Body:**
|
| 93 |
+
```json
|
| 94 |
+
{
|
| 95 |
+
"original_text": "string",
|
| 96 |
+
"translated_text": "string",
|
| 97 |
+
"source_language": "string"
|
| 98 |
+
}
|
| 99 |
+
```
|
| 100 |
+
- **`POST /batch-translate`**: Translates a batch of texts.
|
| 101 |
+
- **Request Body:**
|
| 102 |
+
```json
|
| 103 |
+
{
|
| 104 |
+
"texts": [
|
| 105 |
+
"string"
|
| 106 |
+
],
|
| 107 |
+
"source_language": "string"
|
| 108 |
+
}
|
| 109 |
+
```
|
| 110 |
+
- **Response Body:**
|
| 111 |
+
```json
|
| 112 |
+
{
|
| 113 |
+
"original_texts": [
|
| 114 |
+
"string"
|
| 115 |
+
],
|
| 116 |
+
"translated_texts": [
|
| 117 |
+
"string"
|
| 118 |
+
],
|
| 119 |
+
"source_language": "string"
|
| 120 |
+
}
|
| 121 |
+
```
|
| 122 |
+
- **`POST /translate-pdf`**: Translates a PDF file.
|
| 123 |
+
- **Request:** `source_language: str` (query parameter), `file: UploadFile` (multipart form upload; the content type must be `application/pdf`)
|
| 124 |
+
- **Response Body:**
|
| 125 |
+
```json
|
| 126 |
+
{
|
| 127 |
+
"filename": "string",
|
| 128 |
+
"translated_text": "string",
|
| 129 |
+
"source_language": "string"
|
| 130 |
+
}
|
| 131 |
+
```
|
| 132 |
+
|
| 133 |
+
## Getting Started
|
| 134 |
+
|
| 135 |
+
### Prerequisites
|
| 136 |
+
|
| 137 |
+
- **Python 3.10 or higher:** Ensure you have a recent version of Python installed.
|
| 138 |
+
- **Git and Git LFS:** Git is required to clone the repository, and Git LFS is required to handle large model files.
|
| 139 |
+
- **(Optional) NVIDIA GPU with CUDA:** A GPU is highly recommended for training the model.
|
| 140 |
+
|
| 141 |
+
### Installation
|
| 142 |
+
|
| 143 |
+
1. **Clone the repository:**
|
| 144 |
+
```bash
|
| 145 |
+
git clone <repository-url>
|
| 146 |
+
cd saksi_translation
|
| 147 |
+
```
|
| 148 |
+
|
| 149 |
+
2. **Create and activate a virtual environment:**
|
| 150 |
+
```bash
|
| 151 |
+
python -m venv .venv
|
| 152 |
+
# On Windows
|
| 153 |
+
.venv\Scripts\activate
|
| 154 |
+
# On macOS/Linux
|
| 155 |
+
source .venv/bin/activate
|
| 156 |
+
```
|
| 157 |
+
|
| 158 |
+
3. **Install dependencies:**
|
| 159 |
+
```bash
|
| 160 |
+
pip install -r requirements.txt
|
| 161 |
+
```
|
| 162 |
+
|
| 163 |
+
## Usage
|
| 164 |
+
|
| 165 |
+
### Data Preparation
|
| 166 |
+
|
| 167 |
+
- **Fetch Parallel Data:**
|
| 168 |
+
```bash
|
| 169 |
+
python scripts/fetch_parallel_data.py --output_dir data/raw
|
| 170 |
+
```
|
| 171 |
+
|
| 172 |
+
- **Clean Text Data:**
|
| 173 |
+
```bash
|
| 174 |
+
python scripts/clean_text_data.py --input_dir data/raw --output_dir data/processed
|
| 175 |
+
```
|
| 176 |
+
|
| 177 |
+
### Training
|
| 178 |
+
|
| 179 |
+
- **Start Training:**
|
| 180 |
+
```bash
|
| 181 |
+
python src/train.py \
|
| 182 |
+
--model_name "facebook/nllb-200-distilled-600M" \
|
| 183 |
+
--dataset_path "data/processed" \
|
| 184 |
+
--output_dir "models/nllb-finetuned-nepali-en" \
|
| 185 |
+
--learning_rate 2e-5 \
|
| 186 |
+
--per_device_train_batch_size 8 \
|
| 187 |
+
--num_train_epochs 3
|
| 188 |
+
```
|
| 189 |
+
|
| 190 |
+
### Evaluation
|
| 191 |
+
|
| 192 |
+
- **Evaluate the Model:**
|
| 193 |
+
```bash
|
| 194 |
+
python src/evaluation.py \
|
| 195 |
+
--model_path "models/nllb-finetuned-nepali-en" \
|
| 196 |
+
--test_data_path "data/test_sets/test.en" \
|
| 197 |
+
--reference_data_path "data/test_sets/test.ne"
|
| 198 |
+
```
|
| 199 |
+
|
| 200 |
+
### Interactive Translation
|
| 201 |
+
|
| 202 |
+
- **Run the interactive script:**
|
| 203 |
+
```bash
|
| 204 |
+
python interactive_translate.py
|
| 205 |
+
```
|
| 206 |
+
|
| 207 |
+
### API
|
| 208 |
+
|
| 209 |
+
- **Run the API:**
|
| 210 |
+
```bash
|
| 211 |
+
uvicorn fast_api:app --reload
|
| 212 |
+
```
|
| 213 |
+
Open your browser and navigate to `http://127.0.0.1:8000` to use the web interface.
|
| 214 |
+
|
| 215 |
+
## Project Structure
|
| 216 |
+
|
| 217 |
+
```
|
| 218 |
+
saksi_translation/
|
| 219 |
+
├── .gitignore
|
| 220 |
+
├── fast_api.py # FastAPI application
|
| 221 |
+
├── interactive_translate.py # Interactive translation script
|
| 222 |
+
├── README.md # Project documentation
|
| 223 |
+
├── requirements.txt # Python dependencies
|
| 224 |
+
├── test_translation.py # Script for testing the translation model
|
| 225 |
+
├── frontend/
|
| 226 |
+
│ ├── index.html # Frontend HTML
|
| 227 |
+
│ ├── script.js # Frontend JavaScript
|
| 228 |
+
│ └── styles.css # Frontend CSS
|
| 229 |
+
├── data/
|
| 230 |
+
│ ├── processed/ # Processed data for training
|
| 231 |
+
│ ├── raw/ # Raw data downloaded from the web
|
| 232 |
+
│ └── test_sets/ # Test sets for evaluation
|
| 233 |
+
├── mlruns/ # MLflow experiment tracking data
|
| 234 |
+
├── models/
|
| 235 |
+
│ └── nllb-finetuned-nepali-en/ # Fine-tuned model
|
| 236 |
+
├── notebooks/ # Jupyter notebooks for experimentation
|
| 237 |
+
├── scripts/
|
| 238 |
+
│ ├── clean_text_data.py
|
| 239 |
+
│ ├── create_test_set.py
|
| 240 |
+
│ ├── download_model.py
|
| 241 |
+
│ ├── fetch_parallel_data.py
|
| 242 |
+
│ └── scrape_bbc_nepali.py
|
| 243 |
+
└── src/
|
| 244 |
+
├── __init__.py
|
| 245 |
+
├── evaluation.py # Script for evaluating the model
|
| 246 |
+
├── train.py # Script for training the model
|
| 247 |
+
└── translate.py # Script for translating text
|
| 248 |
+
```
|
| 249 |
+
|
| 250 |
+
## Future Improvements
|
| 251 |
+
|
| 252 |
+
- **Support for more languages:** The project can be extended to support more languages by adding more parallel data and fine-tuning the model on it.
|
| 253 |
+
- **Improved Model:** The model can be improved by using a larger version of the NLLB model or by fine-tuning it on a larger and cleaner dataset.
|
| 254 |
+
- **Advanced Frontend:** The frontend can be improved by adding features like translation history, user accounts, and more advanced styling.
|
| 255 |
+
- **Containerization:** The application is containerized using Docker (see the included `Dockerfile`); this setup can be built upon for easier deployment and scaling.
|
api_log.txt
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Loading models on CPU...
|
| 2 |
+
Traceback (most recent call last):
|
| 3 |
+
File "D:\SIH\saksi_translation\api.py", line 14, in <module>
|
| 4 |
+
"nepali": AutoModelForSeq2SeqLM.from_pretrained("models/nllb-finetuned-nepali-en").to(DEVICE),
|
| 5 |
+
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 6 |
+
File "C:\Users\dynos\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.13_qbz5n2kfra8p0\LocalCache\local-packages\Python313\site-packages\transformers\models\auto\auto_factory.py", line 549, in from_pretrained
|
| 7 |
+
config, kwargs = AutoConfig.from_pretrained(
|
| 8 |
+
~~~~~~~~~~~~~~~~~~~~~~~~~~^
|
| 9 |
+
pretrained_model_name_or_path,
|
| 10 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 11 |
+
...<4 lines>...
|
| 12 |
+
**kwargs,
|
| 13 |
+
^^^^^^^^^
|
| 14 |
+
)
|
| 15 |
+
^
|
| 16 |
+
File "C:\Users\dynos\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.13_qbz5n2kfra8p0\LocalCache\local-packages\Python313\site-packages\transformers\models\auto\configuration_auto.py", line 1329, in from_pretrained
|
| 17 |
+
raise ValueError(
|
| 18 |
+
...<3 lines>...
|
| 19 |
+
)
|
| 20 |
+
ValueError: Unrecognized model in models/nllb-finetuned-nepali-en. Should have a `model_type` key in its config.json, or contain one of the following strings in its name: aimv2, aimv2_vision_model, albert, align, altclip, apertus, arcee, aria, aria_text, audio-spectrogram-transformer, autoformer, aya_vision, bamba, bark, bart, beit, bert, bert-generation, big_bird, bigbird_pegasus, biogpt, bit, bitnet, blenderbot, blenderbot-small, blip, blip-2, blip_2_qformer, bloom, bridgetower, bros, camembert, canine, chameleon, chinese_clip, chinese_clip_vision_model, clap, clip, clip_text_model, clip_vision_model, clipseg, clvp, code_llama, codegen, cohere, cohere2, cohere2_vision, colpali, colqwen2, conditional_detr, convbert, convnext, convnextv2, cpmant, csm, ctrl, cvt, d_fine, dab-detr, dac, data2vec-audio, data2vec-text, data2vec-vision, dbrx, deberta, deberta-v2, decision_transformer, deepseek_v2, deepseek_v3, deepseek_vl, deepseek_vl_hybrid, deformable_detr, deit, depth_anything, depth_pro, deta, detr, dia, diffllama, dinat, dinov2, dinov2_with_registers, dinov3_convnext, dinov3_vit, distilbert, doge, donut-swin, dots1, dpr, dpt, efficientformer, efficientloftr, efficientnet, electra, emu3, encodec, encoder-decoder, eomt, ernie, ernie4_5, ernie4_5_moe, ernie_m, esm, evolla, exaone4, falcon, falcon_h1, falcon_mamba, fastspeech2_conformer, fastspeech2_conformer_with_hifigan, flaubert, flava, florence2, fnet, focalnet, fsmt, funnel, fuyu, gemma, gemma2, gemma3, gemma3_text, gemma3n, gemma3n_audio, gemma3n_text, gemma3n_vision, git, glm, glm4, glm4_moe, glm4v, glm4v_moe, glm4v_moe_text, glm4v_text, glpn, got_ocr2, gpt-sw3, gpt2, gpt_bigcode, gpt_neo, gpt_neox, gpt_neox_japanese, gpt_oss, gptj, gptsan-japanese, granite, granite_speech, granitemoe, granitemoehybrid, granitemoeshared, granitevision, graphormer, grounding-dino, groupvit, helium, hgnet_v2, hiera, hubert, hunyuan_v1_dense, hunyuan_v1_moe, ibert, idefics, idefics2, idefics3, idefics3_vision, ijepa, imagegpt, 
informer, instructblip, instructblipvideo, internvl, internvl_vision, jamba, janus, jetmoe, jukebox, kosmos-2, kosmos-2.5, kyutai_speech_to_text, layoutlm, layoutlmv2, layoutlmv3, led, levit, lfm2, lightglue, lilt, llama, llama4, llama4_text, llava, llava_next, llava_next_video, llava_onevision, longformer, longt5, luke, lxmert, m2m_100, mamba, mamba2, marian, markuplm, mask2former, maskformer, maskformer-swin, mbart, mctct, mega, megatron-bert, metaclip_2, mgp-str, mimi, minimax, mistral, mistral3, mixtral, mlcd, mllama, mm-grounding-dino, mobilebert, mobilenet_v1, mobilenet_v2, mobilevit, mobilevitv2, modernbert, modernbert-decoder, moonshine, moshi, mpnet, mpt, mra, mt5, musicgen, musicgen_melody, mvp, nat, nemotron, nezha, nllb-moe, nougat, nystromformer, olmo, olmo2, olmoe, omdet-turbo, oneformer, open-llama, openai-gpt, opt, ovis2, owlv2, owlvit, paligemma, patchtsmixer, patchtst, pegasus, pegasus_x, perceiver, perception_encoder, perception_lm, persimmon, phi, phi3, phi4_multimodal, phimoe, pix2struct, pixtral, plbart, poolformer, pop2piano, prompt_depth_anything, prophetnet, pvt, pvt_v2, qdqbert, qwen2, qwen2_5_omni, qwen2_5_vl, qwen2_5_vl_text, qwen2_audio, qwen2_audio_encoder, qwen2_moe, qwen2_vl, qwen2_vl_text, qwen3, qwen3_moe, rag, realm, recurrent_gemma, reformer, regnet, rembert, resnet, retribert, roberta, roberta-prelayernorm, roc_bert, roformer, rt_detr, rt_detr_resnet, rt_detr_v2, rwkv, sam, sam2, sam2_hiera_det_model, sam2_video, sam2_vision_model, sam_hq, sam_hq_vision_model, sam_vision_model, seamless_m4t, seamless_m4t_v2, seed_oss, segformer, seggpt, sew, sew-d, shieldgemma2, siglip, siglip2, siglip_vision_model, smollm3, smolvlm, smolvlm_vision, speech-encoder-decoder, speech_to_text, speech_to_text_2, speecht5, splinter, squeezebert, stablelm, starcoder2, superglue, superpoint, swiftformer, swin, swin2sr, swinv2, switch_transformers, t5, t5gemma, table-transformer, tapas, textnet, time_series_transformer, timesfm, timesformer, 
timm_backbone, timm_wrapper, trajectory_transformer, transfo-xl, trocr, tvlt, tvp, udop, umt5, unispeech, unispeech-sat, univnet, upernet, van, video_llava, videomae, vilt, vipllava, vision-encoder-decoder, vision-text-dual-encoder, visual_bert, vit, vit_hybrid, vit_mae, vit_msn, vitdet, vitmatte, vitpose, vitpose_backbone, vits, vivit, vjepa2, voxtral, voxtral_encoder, wav2vec2, wav2vec2-bert, wav2vec2-conformer, wavlm, whisper, xclip, xcodec, xglm, xlm, xlm-prophetnet, xlm-roberta, xlm-roberta-xl, xlnet, xlstm, xmod, yolos, yoso, zamba, zamba2, zoedepth
|
app.py
ADDED
|
@@ -0,0 +1,213 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
A FastAPI application for serving the translation model, inspired by interactive_translate.py.
|
| 3 |
+
"""
|
| 4 |
+
import torch
|
| 5 |
+
from transformers import M2M100ForConditionalGeneration, NllbTokenizer
|
| 6 |
+
from fastapi import FastAPI, HTTPException, UploadFile, File
|
| 7 |
+
from fastapi.staticfiles import StaticFiles
|
| 8 |
+
from fastapi.responses import FileResponse
|
| 9 |
+
from pydantic import BaseModel
|
| 10 |
+
import logging
|
| 11 |
+
from typing import List
|
| 12 |
+
import fitz # PyMuPDF
|
| 13 |
+
import shutil
|
| 14 |
+
import os
|
| 15 |
+
|
| 16 |
+
# --- 1. App Configuration ---
|
| 17 |
+
logging.basicConfig(level=logging.INFO)
|
| 18 |
+
logger = logging.getLogger(__name__)
|
| 19 |
+
|
| 20 |
+
app = FastAPI(
|
| 21 |
+
title="Saksi Translation API",
|
| 22 |
+
description="A simple API for translating text and PDFs to English.",
|
| 23 |
+
version="2.0",
|
| 24 |
+
)
|
| 25 |
+
|
| 26 |
+
app.mount("/frontend", StaticFiles(directory="frontend"), name="frontend")
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
# --- 2. Global Variables ---
|
| 30 |
+
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
|
| 31 |
+
SUPPORTED_LANGUAGES = {
|
| 32 |
+
"nepali": "nep_Npan",
|
| 33 |
+
"sinhala": "sin_Sinh",
|
| 34 |
+
}
|
| 35 |
+
MODEL_PATH = "facebook/nllb-200-distilled-600M"
|
| 36 |
+
model = None
|
| 37 |
+
tokenizer = None
|
| 38 |
+
|
| 39 |
+
# --- 3. Pydantic Models ---
|
| 40 |
+
class TranslationRequest(BaseModel):
|
| 41 |
+
text: str
|
| 42 |
+
source_language: str
|
| 43 |
+
|
| 44 |
+
class TranslationResponse(BaseModel):
|
| 45 |
+
original_text: str
|
| 46 |
+
translated_text: str
|
| 47 |
+
source_language: str
|
| 48 |
+
|
| 49 |
+
class BatchTranslationRequest(BaseModel):
|
| 50 |
+
texts: List[str]
|
| 51 |
+
source_language: str
|
| 52 |
+
|
| 53 |
+
class BatchTranslationResponse(BaseModel):
|
| 54 |
+
original_texts: List[str]
|
| 55 |
+
translated_texts: List[str]
|
| 56 |
+
source_language: str
|
| 57 |
+
|
| 58 |
+
class PdfTranslationResponse(BaseModel):
|
| 59 |
+
filename: str
|
| 60 |
+
translated_text: str
|
| 61 |
+
source_language: str
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
# --- 4. Helper Functions ---
|
| 65 |
+
def load_model_and_tokenizer(model_path):
|
| 66 |
+
"""Loads the model and tokenizer from the given path."""
|
| 67 |
+
global model, tokenizer
|
| 68 |
+
logger.info(f"Loading model on {DEVICE.upper()}...")
|
| 69 |
+
try:
|
| 70 |
+
model = M2M100ForConditionalGeneration.from_pretrained(model_path).to(DEVICE)
|
| 71 |
+
tokenizer = NllbTokenizer.from_pretrained(model_path)
|
| 72 |
+
logger.info("Model and tokenizer loaded successfully!")
|
| 73 |
+
except Exception as e:
|
| 74 |
+
logger.error(f"Error loading model: {e}")
|
| 75 |
+
# In a real app, you might want to exit or handle this more gracefully
|
| 76 |
+
raise
|
| 77 |
+
|
| 78 |
+
def translate_text(text: str, src_lang: str) -> str:
|
| 79 |
+
"""
|
| 80 |
+
Translates a single string of text to English.
|
| 81 |
+
"""
|
| 82 |
+
if src_lang not in SUPPORTED_LANGUAGES:
|
| 83 |
+
raise ValueError(f"Language '{src_lang}' not supported.")
|
| 84 |
+
|
| 85 |
+
tokenizer.src_lang = SUPPORTED_LANGUAGES[src_lang]
|
| 86 |
+
inputs = tokenizer(text, return_tensors="pt").to(DEVICE)
|
| 87 |
+
|
| 88 |
+
generated_tokens = model.generate(
|
| 89 |
+
**inputs,
|
| 90 |
+
forced_bos_token_id=tokenizer.convert_tokens_to_ids("eng_Latn"),
|
| 91 |
+
max_length=128,
|
| 92 |
+
)
|
| 93 |
+
|
| 94 |
+
return tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
|
| 95 |
+
|
| 96 |
+
def batch_translate_text(texts: List[str], src_lang: str) -> List[str]:
|
| 97 |
+
"""
|
| 98 |
+
Translates a batch of texts to English.
|
| 99 |
+
"""
|
| 100 |
+
if src_lang not in SUPPORTED_LANGUAGES:
|
| 101 |
+
raise ValueError(f"Language '{src_lang}' not supported.")
|
| 102 |
+
|
| 103 |
+
tokenizer.src_lang = SUPPORTED_LANGUAGES[src_lang]
|
| 104 |
+
# We use padding=True to handle batches of different lengths
|
| 105 |
+
inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=512).to(DEVICE)
|
| 106 |
+
|
| 107 |
+
generated_tokens = model.generate(
|
| 108 |
+
**inputs,
|
| 109 |
+
forced_bos_token_id=tokenizer.convert_tokens_to_ids("eng_Latn"),
|
| 110 |
+
max_length=512, # Allow for longer generated sequences in batches
|
| 111 |
+
)
|
| 112 |
+
|
| 113 |
+
return tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
|
| 114 |
+
|
| 115 |
+
# --- 5. API Events ---
|
| 116 |
+
@app.on_event("startup")
|
| 117 |
+
async def startup_event():
|
| 118 |
+
"""Load the model at startup."""
|
| 119 |
+
load_model_and_tokenizer(MODEL_PATH)
|
| 120 |
+
|
| 121 |
+
# --- 6. API Endpoints ---
|
| 122 |
+
@app.get("/")
|
| 123 |
+
async def root():
|
| 124 |
+
"""Returns the frontend."""
|
| 125 |
+
return FileResponse('frontend/index.html')
|
| 126 |
+
|
| 127 |
+
@app.get("/languages")
|
| 128 |
+
def get_supported_languages():
|
| 129 |
+
"""Returns a list of supported languages."""
|
| 130 |
+
return {"supported_languages": list(SUPPORTED_LANGUAGES.keys())}
|
| 131 |
+
|
| 132 |
+
@app.post("/translate", response_model=TranslationResponse)
|
| 133 |
+
async def translate(request: TranslationRequest):
|
| 134 |
+
"""Translates a single text from a source language to English."""
|
| 135 |
+
try:
|
| 136 |
+
translated_text = translate_text(request.text, request.source_language)
|
| 137 |
+
return TranslationResponse(
|
| 138 |
+
original_text=request.text,
|
| 139 |
+
translated_text=translated_text,
|
| 140 |
+
source_language=request.source_language,
|
| 141 |
+
)
|
| 142 |
+
except ValueError as e:
|
| 143 |
+
raise HTTPException(status_code=400, detail=str(e))
|
| 144 |
+
except Exception as e:
|
| 145 |
+
raise HTTPException(status_code=500, detail=f"An unexpected error occurred: {e}")
|
| 146 |
+
|
| 147 |
+
@app.post("/batch-translate", response_model=BatchTranslationResponse)
async def batch_translate(request: BatchTranslationRequest):
    """Translate a batch of texts from a source language to English.

    Args:
        request: Payload carrying ``texts`` and ``source_language``.

    Returns:
        A ``BatchTranslationResponse`` echoing the original texts and language
        together with the translations.

    Raises:
        HTTPException: 400 when a ``ValueError`` is raised while translating,
            500 for any other failure.
    """
    try:
        translated_texts = batch_translate_text(request.texts, request.source_language)
        return BatchTranslationResponse(
            original_texts=request.texts,
            translated_texts=translated_texts,
            source_language=request.source_language,
        )
    except ValueError as e:
        raise HTTPException(status_code=400, detail=str(e)) from e
    except Exception as e:
        # Keep the traceback in the server log; the client only sees the message.
        logger.exception("Unexpected error while translating batch")
        raise HTTPException(status_code=500, detail=f"An unexpected error occurred: {e}") from e
|
| 161 |
+
|
| 162 |
+
@app.post("/translate-pdf", response_model=PdfTranslationResponse)
async def translate_pdf(source_language: str, file: UploadFile = File(...)):
    """Translate the text of an uploaded PDF from a source language to English.

    Args:
        source_language: Key of ``SUPPORTED_LANGUAGES`` naming the PDF's language.
        file: The uploaded PDF.

    Returns:
        A ``PdfTranslationResponse`` with the uploaded filename, the translated
        text (one output line per non-empty extracted line) and the language.

    Raises:
        HTTPException: 400 for a non-PDF upload, an unsupported language or a
            PDF without extractable text; 500 for any other failure.
    """
    import tempfile  # local import: only this endpoint needs it

    if file.content_type != "application/pdf":
        raise HTTPException(status_code=400, detail="Invalid file type. Please upload a PDF.")

    # Reject unknown languages before touching the disk, so the client gets a
    # 400 instead of the generic 500 produced by the catch-all below.
    if source_language not in SUPPORTED_LANGUAGES:
        raise HTTPException(status_code=400, detail=f"Language '{source_language}' not supported.")

    # Never build the path from the client-supplied filename: it may contain
    # path separators and is not unique across concurrent uploads.
    temp_file = tempfile.NamedTemporaryFile(suffix=".pdf", delete=False)
    temp_pdf_path = temp_file.name

    try:
        with temp_file:
            shutil.copyfileobj(file.file, temp_file)

        # Extract text from the PDF, closing the document even if a page fails.
        doc = fitz.open(temp_pdf_path)
        try:
            extracted_text = "".join(page.get_text() for page in doc)
        finally:
            doc.close()

        if not extracted_text.strip():
            raise HTTPException(status_code=400, detail="Could not extract any text from the PDF.")

        # Split text into non-empty lines so each model input stays short.
        text_chunks = [p.strip() for p in extracted_text.split('\n') if p.strip()]

        # Translate a bounded number of lines per call so a long document
        # cannot be sent to the model as one enormous batch.
        batch_size = 16
        translated_chunks = []
        for start in range(0, len(text_chunks), batch_size):
            translated_chunks.extend(
                batch_translate_text(text_chunks[start:start + batch_size], source_language)
            )

        # Join the translated chunks back together
        final_translation = "\n".join(translated_chunks)

        return PdfTranslationResponse(
            filename=file.filename or "",
            translated_text=final_translation,
            source_language=source_language,
        )
    except HTTPException:
        # Deliberate HTTP errors (e.g. the 400 above) must not be rewrapped as 500.
        raise
    except Exception as e:
        logger.error(f"Error processing PDF: {e}")
        raise HTTPException(status_code=500, detail=f"An error occurred while processing the PDF: {e}") from e
    finally:
        # Clean up the temporary file
        if os.path.exists(temp_pdf_path):
            os.remove(temp_pdf_path)
|
| 205 |
+
|
| 206 |
+
|
| 207 |
+
# --- 7. Example Usage (for running with uvicorn) ---
|
| 208 |
+
# To run this API, use the following command in your terminal:
|
| 209 |
+
# uvicorn fast_api:app --reload
|
| 210 |
+
|
| 211 |
+
if __name__ == "__main__":
|
| 212 |
+
import uvicorn
|
| 213 |
+
uvicorn.run(app, host="0.0.0.0", port=8000)
|
baseline_analysis.py
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# baseline_analysis.py
|
| 2 |
+
|
| 3 |
+
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
|
| 4 |
+
import torch
|
| 5 |
+
|
| 6 |
+
# Define the model we want to use. We'll use a distilled (smaller, faster)
# version of NLLB-200 for this quick test.
model_name = "facebook/nllb-200-distilled-600M"

# Load the pre-trained tokenizer and model from Hugging Face.
# This might take a minute to download the first time.
print(f"Loading model: {model_name}")
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
print("Model loaded successfully!")

# Sentences we want to translate.
sinhala_sentences = [
    "ඩෝසන් මිස් දුරකථනයෙන් ඩෝසන් මිස් කවුද සර්",
    "කවුද ඩෝසන් නැතුව ඉන්නේ ඔව් සර්",
    "ඔබ එය උත්සාහ කරන්න සර්",
    "කොහොමද වැඩේ හරිද ඔව් සර්ට ස්තුතියි",
    "ඔව්, හරි, ස්තුතියි රත්තරං",
]

print("\n--- Starting Translation ---")

# Source and target languages are the same for every sentence, so configure
# the tokenizer and resolve the target-language token id once, not per loop.
tokenizer.src_lang = "sin_Sinh"
target_lang = "eng_Latn"
target_lang_id = tokenizer.convert_tokens_to_ids(target_lang)

# Loop through each sentence and translate it.
for sentence in sinhala_sentences:
    # 1. Convert the text into a format the model understands (input IDs).
    inputs = tokenizer(sentence, return_tensors="pt")

    # 2. Generate the translation, forcing English as the output language.
    translated_tokens = model.generate(
        **inputs,
        forced_bos_token_id=target_lang_id,
        max_length=50,  # Set a max length for the output
    )

    # 3. Convert the model's output tokens back into readable text.
    translation = tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]

    # 4. Display the results
    print(f"\nOriginal (si): {sentence}")
    print(f"Translation (en): {translation}")

print("\n--- Translation Complete ---")
|
baseline_translate.py
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# baseline_translate.py
|
| 2 |
+
|
| 3 |
+
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
|
| 4 |
+
import torch
|
| 5 |
+
|
| 6 |
+
# Define the model we want to use. We'll use a distilled (smaller, faster)
# version of NLLB-200 for this quick test.
model_name = "facebook/nllb-200-distilled-600M"

# Load the pre-trained tokenizer and model from Hugging Face.
# This might take a minute to download the first time.
print(f"Loading model: {model_name}")
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
print("Model loaded successfully!")

# Sentences we want to translate, keyed by their FLORES-200 language code.
# The base NLLB checkpoint knows Nepali as "npi_Deva"; an unknown code would be
# mapped to the unknown token and silently degrade the translation.
sentences_to_translate = {
    "npi_Deva": "नेपालको राजधानी काठमाडौं हो।",  # Nepali: "The capital of Nepal is Kathmandu."
    "sin_Sinh": "ශ්රී ලංකාවේ අගනුවර කොළඹ වේ.",  # Sinhala: "The capital of Sri Lanka is Colombo."
}

print("\n--- Starting Translation ---")

# The target language never changes, so resolve its token id once.
# convert_tokens_to_ids is used because the older `lang_code_to_id` mapping is
# not available on current transformers releases.
english_token_id = tokenizer.convert_tokens_to_ids("eng_Latn")

# Loop through each sentence and translate it.
for lang_code, text in sentences_to_translate.items():
    # 1. Tell the tokenizer what the source language is, then encode the text.
    tokenizer.src_lang = lang_code
    inputs = tokenizer(text, return_tensors="pt")

    # 2. Generate the translation, forcing English as the output language.
    translated_tokens = model.generate(
        **inputs,
        forced_bos_token_id=english_token_id,
        max_length=50,  # Set a max length for the output
    )

    # 3. Convert the model's output tokens back into readable text.
    translation = tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]

    # 4. Display the results
    print(f"\nOriginal ({lang_code}): {text}")
    print(f"Translation (eng_Latn): {translation}")

print("\n--- Translation Complete ---")
|
debug_load.py
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# debug_load.py
|
| 2 |
+
|
| 3 |
+
import torch
|
| 4 |
+
from transformers import AutoTokenizer, M2M100ForConditionalGeneration
|
| 5 |
+
|
| 6 |
+
# --- Configuration ---
import os  # needed only to locate the model directory next to this script

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# Resolve the model directory relative to this file instead of a
# machine-specific absolute path, so the script runs from any checkout.
nepali_model_path = os.path.join(
    os.path.dirname(os.path.abspath(__file__)), "models", "nllb-finetuned-nepali-en"
)

# --- Tokenizer Loading ---
print("Loading Nepali tokenizer...")
try:
    nepali_tokenizer = AutoTokenizer.from_pretrained(nepali_model_path)
    print("Nepali tokenizer loaded successfully.")
    print(nepali_tokenizer)
except Exception as e:
    # Report and continue: the model load below is still worth attempting.
    print(f"Error loading Nepali tokenizer: {e}")

# --- Model Loading ---
print("\nLoading Nepali model...")
try:
    nepali_model = M2M100ForConditionalGeneration.from_pretrained(nepali_model_path).to(DEVICE)
    print("Nepali model loaded successfully.")
    print(nepali_model)
except Exception as e:
    print(f"Error loading Nepali model: {e}")
|
fast_api.py
ADDED
|
@@ -0,0 +1,214 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
A FastAPI application for serving the translation model, inspired by interactive_translate.py.
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
import torch
|
| 6 |
+
from transformers import M2M100ForConditionalGeneration, NllbTokenizer
|
| 7 |
+
from fastapi import FastAPI, HTTPException, UploadFile, File
|
| 8 |
+
from fastapi.staticfiles import StaticFiles
|
| 9 |
+
from fastapi.responses import FileResponse
|
| 10 |
+
from pydantic import BaseModel
|
| 11 |
+
import logging
|
| 12 |
+
from typing import List
|
| 13 |
+
import fitz # PyMuPDF
|
| 14 |
+
import shutil
|
| 15 |
+
import os
|
| 16 |
+
|
| 17 |
+
# --- 1. App Configuration ---
|
| 18 |
+
logging.basicConfig(level=logging.INFO)
|
| 19 |
+
logger = logging.getLogger(__name__)
|
| 20 |
+
|
| 21 |
+
app = FastAPI(
|
| 22 |
+
title="Saksi Translation API",
|
| 23 |
+
description="A simple API for translating text and PDFs to English.",
|
| 24 |
+
version="2.0",
|
| 25 |
+
)
|
| 26 |
+
|
| 27 |
+
app.mount("/frontend", StaticFiles(directory="frontend"), name="frontend")
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
# --- 2. Global Variables ---
|
| 31 |
+
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
|
| 32 |
+
SUPPORTED_LANGUAGES = {
|
| 33 |
+
"nepali": "nep_Npan",
|
| 34 |
+
"sinhala": "sin_Sinh",
|
| 35 |
+
}
|
| 36 |
+
MODEL_PATH = "models/nllb-finetuned-nepali-en"
|
| 37 |
+
model = None
|
| 38 |
+
tokenizer = None
|
| 39 |
+
|
| 40 |
+
# --- 3. Pydantic Models ---
|
| 41 |
+
class TranslationRequest(BaseModel):
    # Request body for POST /translate.
    text: str  # text to translate
    source_language: str  # expected to be a key of SUPPORTED_LANGUAGES
|
| 44 |
+
|
| 45 |
+
class TranslationResponse(BaseModel):
    # Response body for POST /translate.
    original_text: str  # the text as received
    translated_text: str  # English translation
    source_language: str  # language name echoed from the request
|
| 49 |
+
|
| 50 |
+
class BatchTranslationRequest(BaseModel):
    # Request body for POST /batch-translate.
    texts: List[str]  # texts to translate, all in the same source language
    source_language: str  # expected to be a key of SUPPORTED_LANGUAGES
|
| 53 |
+
|
| 54 |
+
class BatchTranslationResponse(BaseModel):
    # Response body for POST /batch-translate.
    original_texts: List[str]  # the texts as received
    translated_texts: List[str]  # English translations
    source_language: str  # language name echoed from the request
|
| 58 |
+
|
| 59 |
+
class PdfTranslationResponse(BaseModel):
    # Response body for POST /translate-pdf.
    filename: str  # name of the uploaded file
    translated_text: str  # English translation of the extracted text
    source_language: str  # language name echoed from the request
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
# --- 4. Helper Functions ---
|
| 66 |
+
def load_model_and_tokenizer(model_path):
    """Load the model and tokenizer into the module-level globals.

    Args:
        model_path: Location passed to both ``from_pretrained`` calls.

    Raises:
        Exception: Any loading error is logged and then re-raised unchanged.
    """
    global model, tokenizer
    logger.info(f"Loading model on {DEVICE.upper()}...")
    try:
        loaded_model = M2M100ForConditionalGeneration.from_pretrained(model_path)
        model = loaded_model.to(DEVICE)
        tokenizer = NllbTokenizer.from_pretrained(model_path)
        logger.info("Model and tokenizer loaded successfully!")
    except Exception as e:
        logger.error(f"Error loading model: {e}")
        # In a real app, you might want to exit or handle this more gracefully
        raise
|
| 78 |
+
|
| 79 |
+
def translate_text(text: str, src_lang: str) -> str:
    """Translate a single string of text to English.

    Args:
        text: Text to translate.
        src_lang: Key of ``SUPPORTED_LANGUAGES`` naming the source language.

    Returns:
        The decoded English translation.

    Raises:
        ValueError: If ``src_lang`` is not a key of ``SUPPORTED_LANGUAGES``.
    """
    if src_lang not in SUPPORTED_LANGUAGES:
        raise ValueError(f"Language '{src_lang}' not supported.")

    tokenizer.src_lang = SUPPORTED_LANGUAGES[src_lang]
    # Cap the input length the same way batch_translate_text does, so an
    # arbitrarily long request body cannot be fed to the model unbounded.
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512).to(DEVICE)

    generated_tokens = model.generate(
        **inputs,
        # Force English as the first generated token.
        forced_bos_token_id=tokenizer.convert_tokens_to_ids("eng_Latn"),
        max_length=128,
    )

    return tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
|
| 96 |
+
|
| 97 |
+
def batch_translate_text(texts: List[str], src_lang: str) -> List[str]:
    """Translate a batch of texts to English.

    Args:
        texts: Texts to translate, all in the same source language.
        src_lang: Key of ``SUPPORTED_LANGUAGES`` naming the source language.

    Returns:
        One decoded English translation per input text, in input order; an
        empty list when ``texts`` is empty.

    Raises:
        ValueError: If ``src_lang`` is not a key of ``SUPPORTED_LANGUAGES``.
    """
    if src_lang not in SUPPORTED_LANGUAGES:
        raise ValueError(f"Language '{src_lang}' not supported.")

    # Nothing to translate: skip the tokenizer and model entirely.
    if not texts:
        return []

    tokenizer.src_lang = SUPPORTED_LANGUAGES[src_lang]
    # We use padding=True to handle batches of different lengths
    inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=512).to(DEVICE)

    generated_tokens = model.generate(
        **inputs,
        # Force English as the first generated token.
        forced_bos_token_id=tokenizer.convert_tokens_to_ids("eng_Latn"),
        max_length=512,  # Allow for longer generated sequences in batches
    )

    return tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
|
| 115 |
+
|
| 116 |
+
# --- 5. API Events ---
|
| 117 |
+
@app.on_event("startup")
async def startup_event():
    """Startup hook: load the model and tokenizer from ``MODEL_PATH``."""
    load_model_and_tokenizer(model_path=MODEL_PATH)
|
| 121 |
+
|
| 122 |
+
# --- 6. API Endpoints ---
|
| 123 |
+
@app.get("/")
async def root():
    """Serve the frontend's index page for the site root."""
    index_page = 'frontend/index.html'
    return FileResponse(index_page)
|
| 127 |
+
|
| 128 |
+
@app.get("/languages")
def get_supported_languages():
    """Report the language names accepted as ``source_language``.

    Returns:
        A dict with one key, ``"supported_languages"``, holding the keys of
        ``SUPPORTED_LANGUAGES`` as a list.
    """
    language_names = list(SUPPORTED_LANGUAGES)
    return {"supported_languages": language_names}
|
| 132 |
+
|
| 133 |
+
@app.post("/translate", response_model=TranslationResponse)
async def translate(request: TranslationRequest):
    """Translate a single text from a source language to English.

    Args:
        request: Payload carrying ``text`` and ``source_language``.

    Returns:
        A ``TranslationResponse`` echoing the original text and language
        together with the translation.

    Raises:
        HTTPException: 400 when a ``ValueError`` is raised while translating
            (e.g. an unsupported language), 500 for any other failure.
    """
    try:
        translated_text = translate_text(request.text, request.source_language)
        return TranslationResponse(
            original_text=request.text,
            translated_text=translated_text,
            source_language=request.source_language,
        )
    except ValueError as e:
        raise HTTPException(status_code=400, detail=str(e)) from e
    except Exception as e:
        # Keep the traceback in the server log; the client only sees the message.
        logger.exception("Unexpected error while translating text")
        raise HTTPException(status_code=500, detail=f"An unexpected error occurred: {e}") from e
|
| 147 |
+
|
| 148 |
+
@app.post("/batch-translate", response_model=BatchTranslationResponse)
async def batch_translate(request: BatchTranslationRequest):
    """Translate a batch of texts from a source language to English.

    Args:
        request: Payload carrying ``texts`` and ``source_language``.

    Returns:
        A ``BatchTranslationResponse`` echoing the original texts and language
        together with the translations.

    Raises:
        HTTPException: 400 when a ``ValueError`` is raised while translating
            (e.g. an unsupported language), 500 for any other failure.
    """
    try:
        translated_texts = batch_translate_text(request.texts, request.source_language)
        return BatchTranslationResponse(
            original_texts=request.texts,
            translated_texts=translated_texts,
            source_language=request.source_language,
        )
    except ValueError as e:
        raise HTTPException(status_code=400, detail=str(e)) from e
    except Exception as e:
        # Keep the traceback in the server log; the client only sees the message.
        logger.exception("Unexpected error while translating batch")
        raise HTTPException(status_code=500, detail=f"An unexpected error occurred: {e}") from e
|
| 162 |
+
|
| 163 |
+
@app.post("/translate-pdf", response_model=PdfTranslationResponse)
async def translate_pdf(source_language: str, file: UploadFile = File(...)):
    """Translate the text of an uploaded PDF from a source language to English.

    Args:
        source_language: Key of ``SUPPORTED_LANGUAGES`` naming the PDF's language.
        file: The uploaded PDF.

    Returns:
        A ``PdfTranslationResponse`` with the uploaded filename, the translated
        text (one output line per non-empty extracted line) and the language.

    Raises:
        HTTPException: 400 for a non-PDF upload, an unsupported language or a
            PDF without extractable text; 500 for any other failure.
    """
    import tempfile  # local import: only this endpoint needs it

    if file.content_type != "application/pdf":
        raise HTTPException(status_code=400, detail="Invalid file type. Please upload a PDF.")

    # Reject unknown languages before touching the disk, so the client gets a
    # 400 instead of the generic 500 produced by the catch-all below.
    if source_language not in SUPPORTED_LANGUAGES:
        raise HTTPException(status_code=400, detail=f"Language '{source_language}' not supported.")

    # Never build the path from the client-supplied filename: it may contain
    # path separators and is not unique across concurrent uploads.
    temp_file = tempfile.NamedTemporaryFile(suffix=".pdf", delete=False)
    temp_pdf_path = temp_file.name

    try:
        with temp_file:
            shutil.copyfileobj(file.file, temp_file)

        # Extract text from the PDF, closing the document even if a page fails.
        doc = fitz.open(temp_pdf_path)
        try:
            extracted_text = "".join(page.get_text() for page in doc)
        finally:
            doc.close()

        if not extracted_text.strip():
            raise HTTPException(status_code=400, detail="Could not extract any text from the PDF.")

        # Split text into non-empty lines so each model input stays short.
        text_chunks = [p.strip() for p in extracted_text.split('\n') if p.strip()]

        # Translate a bounded number of lines per call so a long document
        # cannot be sent to the model as one enormous batch.
        batch_size = 16
        translated_chunks = []
        for start in range(0, len(text_chunks), batch_size):
            translated_chunks.extend(
                batch_translate_text(text_chunks[start:start + batch_size], source_language)
            )

        # Join the translated chunks back together
        final_translation = "\n".join(translated_chunks)

        return PdfTranslationResponse(
            filename=file.filename or "",
            translated_text=final_translation,
            source_language=source_language,
        )
    except HTTPException:
        # Deliberate HTTP errors (e.g. the 400 above) must not be rewrapped as 500.
        raise
    except Exception as e:
        logger.error(f"Error processing PDF: {e}")
        raise HTTPException(status_code=500, detail=f"An error occurred while processing the PDF: {e}") from e
    finally:
        # Clean up the temporary file
        if os.path.exists(temp_pdf_path):
            os.remove(temp_pdf_path)
|
| 206 |
+
|
| 207 |
+
|
| 208 |
+
# --- 7. Example Usage (for running with uvicorn) ---
|
| 209 |
+
# To run this API, use the following command in your terminal:
|
| 210 |
+
# uvicorn fast_api:app --reload
|
| 211 |
+
|
| 212 |
+
if __name__ == "__main__":
|
| 213 |
+
import uvicorn
|
| 214 |
+
uvicorn.run(app, host="0.0.0.0", port=8000)
|
interactive_translate.py
ADDED
|
@@ -0,0 +1,74 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
An interactive script to translate text to English using a fine-tuned NLLB model.
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
import torch
|
| 6 |
+
from transformers import M2M100ForConditionalGeneration, NllbTokenizer
|
| 7 |
+
|
| 8 |
+
# --- 1. Configuration ---
|
| 9 |
+
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
|
| 10 |
+
SUPPORTED_LANGUAGES = {
|
| 11 |
+
"nepali": "nep_Npan",
|
| 12 |
+
"sinhala": "sin_Sinh",
|
| 13 |
+
}
|
| 14 |
+
|
| 15 |
+
# --- 2. Load Model and Tokenizer ---
|
| 16 |
+
def load_model_and_tokenizer(model_path):
    """Load the model and tokenizer found at ``model_path``.

    Returns:
        ``(model, tokenizer)`` on success, or ``(None, None)`` when loading
        fails; the failure is printed rather than raised.
    """
    print(f"Loading model on {DEVICE.upper()}...")
    try:
        loaded_model = M2M100ForConditionalGeneration.from_pretrained(model_path)
        loaded_model = loaded_model.to(DEVICE)
        loaded_tokenizer = NllbTokenizer.from_pretrained(model_path)
        print("Model and tokenizer loaded successfully!")
    except Exception as e:
        print(f"Error loading model: {e}")
        return None, None
    return loaded_model, loaded_tokenizer
|
| 27 |
+
|
| 28 |
+
# --- 3. Translation Function ---
|
| 29 |
+
def translate_text(model, tokenizer, text: str, src_lang: str) -> str:
    """Translate one string of text to English.

    Args:
        model: Model whose ``generate`` method produces the translation.
        tokenizer: Tokenizer used to encode ``text`` and decode the output.
        text: Text to translate.
        src_lang: Key of ``SUPPORTED_LANGUAGES`` naming the source language.

    Returns:
        The decoded translation, or a "not supported" message when
        ``src_lang`` is not a key of ``SUPPORTED_LANGUAGES``.
    """
    language_code = SUPPORTED_LANGUAGES.get(src_lang) if src_lang in SUPPORTED_LANGUAGES else None
    if src_lang not in SUPPORTED_LANGUAGES:
        return f"Language '{src_lang}' not supported."

    tokenizer.src_lang = language_code
    encoded = tokenizer(text, return_tensors="pt").to(DEVICE)

    # Force English as the first generated token.
    english_id = tokenizer.convert_tokens_to_ids("eng_Latn")
    output_ids = model.generate(**encoded, forced_bos_token_id=english_id, max_length=128)

    decoded = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    return decoded[0]
|
| 46 |
+
|
| 47 |
+
# --- 4. Interactive Translation Loop ---
|
| 48 |
+
if __name__ == "__main__":
    # Select the source language. Whitespace is stripped so " nepali " works.
    lang_choice = input(f"Choose a language ({list(SUPPORTED_LANGUAGES.keys())}): ").strip().lower()
    if lang_choice not in SUPPORTED_LANGUAGES:
        print("Invalid language choice.")
        # SystemExit is used instead of the site-provided exit() helper,
        # which is meant for the interactive interpreter.
        raise SystemExit

    # For now, we assume a single model path. This can be extended.
    model_path = "models/nllb-finetuned-nepali-en"
    model, tokenizer = load_model_and_tokenizer(model_path)

    if model and tokenizer:
        print(f"\n--- Interactive Translation ({lang_choice.capitalize()}) ---")
        print(f"Enter a {lang_choice} sentence to translate to English.")
        print("Type 'exit' to quit.\n")

        while True:
            try:
                text_to_translate = input(f"{lang_choice.capitalize()}: ")
            except (EOFError, KeyboardInterrupt):
                # Ctrl-D / Ctrl-C ends the session cleanly instead of a traceback.
                print()
                break

            if text_to_translate.lower() == "exit":
                break

            if not text_to_translate.strip():
                print("Please enter some text to translate.")
                continue

            english_translation = translate_text(model, tokenizer, text_to_translate, lang_choice)
            print(f"English: {english_translation}\n")
|
requirements.txt
ADDED
|
@@ -0,0 +1,92 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
accelerate==1.10.1
|
| 2 |
+
aiohappyeyeballs==2.6.1
|
| 3 |
+
aiohttp==3.12.15
|
| 4 |
+
aiosignal==1.4.0
|
| 5 |
+
annotated-types==0.7.0
|
| 6 |
+
anyio==4.11.0
|
| 7 |
+
attrs==25.3.0
|
| 8 |
+
beautifulsoup4==4.14.2
|
| 9 |
+
certifi==2025.10.5
|
| 10 |
+
charset-normalizer==3.4.3
|
| 11 |
+
click==8.3.0
|
| 12 |
+
colorama==0.4.6
|
| 13 |
+
datasets==4.1.1
|
| 14 |
+
dill==0.4.0
|
| 15 |
+
dnspython==2.8.0
|
| 16 |
+
email-validator==2.3.0
|
| 17 |
+
evaluate==0.4.6
|
| 18 |
+
fastapi==0.118.0
|
| 19 |
+
fastapi-cli==0.0.13
|
| 20 |
+
fastapi-cloud-cli==0.3.0
|
| 21 |
+
filelock==3.19.1
|
| 22 |
+
frozenlist==1.7.0
|
| 23 |
+
fsspec==2025.9.0
|
| 24 |
+
h11==0.16.0
|
| 25 |
+
httpcore==1.0.9
|
| 26 |
+
httptools==0.6.4
|
| 27 |
+
httpx==0.28.1
|
| 28 |
+
huggingface-hub==0.35.3
|
| 29 |
+
idna==3.10
|
| 30 |
+
itsdangerous==2.2.0
|
| 31 |
+
Jinja2==3.1.6
|
| 32 |
+
langdetect==1.0.9
|
| 33 |
+
lxml==6.0.2
|
| 34 |
+
markdown-it-py==4.0.0
|
| 35 |
+
MarkupSafe==3.0.3
|
| 36 |
+
mdurl==0.1.2
|
| 37 |
+
mpmath==1.3.0
|
| 38 |
+
multidict==6.6.4
|
| 39 |
+
multiprocess==0.70.16
|
| 40 |
+
networkx==3.5
|
| 41 |
+
numpy==2.3.3
|
| 42 |
+
orjson==3.11.3
|
| 43 |
+
packaging==25.0
|
| 44 |
+
pandas==2.3.3
|
| 45 |
+
portalocker==3.2.0
|
| 46 |
+
propcache==0.4.0
|
| 47 |
+
protobuf==6.32.1
|
| 48 |
+
psutil==7.1.0
|
| 49 |
+
pyarrow==21.0.0
|
| 50 |
+
pydantic==2.11.10
|
| 51 |
+
pydantic-extra-types==2.10.5
|
| 52 |
+
pydantic-settings==2.11.0
|
| 53 |
+
pydantic_core==2.33.2
|
| 54 |
+
Pygments==2.19.2
|
| 55 |
+
PyMuPDF==1.26.4
|
| 56 |
+
python-dateutil==2.9.0.post0
|
| 57 |
+
python-dotenv==1.1.1
|
| 58 |
+
python-multipart==0.0.20
|
| 59 |
+
pytz==2025.2
|
| 60 |
+
PyYAML==6.0.3
|
| 61 |
+
regex==2025.9.18
|
| 62 |
+
requests==2.32.5
|
| 63 |
+
rich==14.1.0
|
| 64 |
+
rich-toolkit==0.15.1
|
| 65 |
+
rignore==0.7.0
|
| 66 |
+
sacrebleu==2.5.1
|
| 67 |
+
safetensors==0.6.2
|
| 68 |
+
sentencepiece==0.2.1
|
| 69 |
+
sentry-sdk==2.39.0
|
| 70 |
+
setuptools==80.9.0
|
| 71 |
+
shellingham==1.5.4
|
| 72 |
+
six==1.17.0
|
| 73 |
+
sniffio==1.3.1
|
| 74 |
+
soupsieve==2.8
|
| 75 |
+
starlette==0.48.0
|
| 76 |
+
sympy==1.14.0
|
| 77 |
+
tabulate==0.9.0
|
| 78 |
+
tokenizers==0.22.1
|
| 79 |
+
torch==2.8.0
|
| 80 |
+
tqdm==4.67.1
|
| 81 |
+
transformers==4.57.0
|
| 82 |
+
typer==0.19.2
|
| 83 |
+
typing-inspection==0.4.2
|
| 84 |
+
typing_extensions==4.15.0
|
| 85 |
+
tzdata==2025.2
|
| 86 |
+
ujson==5.11.0
|
| 87 |
+
urllib3==2.5.0
|
| 88 |
+
uvicorn==0.37.0
|
| 89 |
+
watchfiles==1.1.0
|
| 90 |
+
websockets==15.0.1
|
| 91 |
+
xxhash==3.6.0
|
| 92 |
+
yarl==1.20.1
|
test_analysis.py
ADDED
|
@@ -0,0 +1,84 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import sys
|
| 3 |
+
import codecs
|
| 4 |
+
import torch
|
| 5 |
+
from transformers import M2M100ForConditionalGeneration, NllbTokenizerFast
|
| 6 |
+
|
| 7 |
+
def translate_text(text, model, tokenizer, src_lang, target_lang="eng_Latn"):
    """Translate a single text string.

    Args:
        text: Text to translate.
        model: Model whose ``generate`` method produces the translation.
        tokenizer: Tokenizer used to encode ``text`` and decode the output.
        src_lang: Language code assigned to ``tokenizer.src_lang``.
        target_lang: Language token forced as the first generated token.

    Returns:
        The decoded translation, or a string starting with
        "An error occurred during translation:" if anything fails. Errors are
        reported in the return value rather than raised.
    """
    try:
        tokenizer.src_lang = src_lang
        # Put the inputs on the model's device; main() may have moved the
        # model to CUDA, and CPU inputs would then make every call fail.
        inputs = tokenizer(text, return_tensors="pt").to(model.device)
        generated_tokens = model.generate(
            **inputs,
            forced_bos_token_id=tokenizer.convert_tokens_to_ids(target_lang),
            max_length=512,
        )
        translated_text = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
        return translated_text
    except Exception as e:
        return f"An error occurred during translation: {e}"
|
| 23 |
+
|
| 24 |
+
def main():
    """Load the fine-tuned Nepali model and print test translations.

    Translates a fixed list of Nepali sentences and then a fixed list of
    Sinhala sentences with the same model, printing each original next to its
    translation. Returns early, after printing the error, if loading fails.
    """
    # Reconfigure stdout to handle UTF-8 encoding
    sys.stdout = codecs.getwriter('utf-8')(sys.stdout.buffer)

    # --- Configuration ---
    script_dir = os.path.dirname(os.path.abspath(__file__))
    nepali_model_path = os.path.join(script_dir, "models", "nllb-finetuned-nepali-en")

    # --- Model Loading ---
    print("Loading Nepali model and tokenizer...")
    try:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        nepali_model = M2M100ForConditionalGeneration.from_pretrained(nepali_model_path).to(device)
        nepali_tokenizer = NllbTokenizerFast.from_pretrained(nepali_model_path)
        print("Nepali model and tokenizer loaded successfully.")
    except Exception as e:
        print(f"Error loading Nepali model or tokenizer: {e}")
        return

    # --- Nepali Translation ---
    # Every entry ends with a comma: a missing one makes Python silently
    # concatenate two adjacent string literals into a single sentence.
    nepali_sentences = [
        "जडान बिन्दु थप्नुहोस्",
        "स्टिकी नोट आयात पूरा भयो",
        "मोनोस्पेस १२",
        "पानी जेट पम्पमा दुईवटा भित्रिने र एउटा बाहिरिने पाइप हुन्छन् र एक भित्र अर्को सिद्धान्त अनुरूप दुईवटा पाइप हुन्छन् । पानीको प्रविष्टिमा एउटा पानी जेटले केही ठूलो पाइपमा पूरा चापले टुटीबाट बाहिर फाल्दछ । यस्तो तरिकाले पानी जेटले वायू वा तरललाई दोस्रो प्रविष्टिबाट टाढा पुर्याउदछ । ड्रिफ्टिङ तरलमा ऋणात्मक चापको कारणले यस्तो हुन्छ । त्यसैले यो हाइड्रोडायनमिक विरोधाभाषको एउटा अनुप्रयोग हो । यसले ड्रिफ्टिङ तरल नजिकका वस्तु टाढा फाल्नुको साटोमा सोस्ने कुरा बताउदछ ।",
        "वस्तुको परिवर्तन बचत गर्नुहोस् ।",
        "तिमीलाई कस्तो छ",
        "तिमी को हौ",
        "कति बज्यो",
    ]

    print("\n--- Nepali to English Translation Analysis ---")
    for sentence in nepali_sentences:
        print(f"\nOriginal (ne): {sentence}")
        translated_text = translate_text(sentence, nepali_model, nepali_tokenizer, src_lang="nep_Npan")
        print(f"Translated (en): {translated_text}")

    # --- Sinhala Translation ---
    # NOTE: No fine-tuned model for Sinhala was found, so the Nepali
    # fine-tuned model and tokenizer loaded above are reused here.
    print("\n\n--- Sinhala to English Translation Analysis ---")

    sinhala_sentences = [
        "ඩෝසන්මිස් දුරකථනයෙන් ඩෝසන්මිස් කවුද සර්",
        "කවුද ඩෝසන් නැතුව ඉන්නේ ඔව් සර්",
        "ඔබ එය උත්සාහ කරන්න සර්",
        "කොහොමද වැඩේ හරිද ඔව් සර්ට ස්තුතියි",
        "ඔව්, හරි, ස්තුතියි රත්තරං",
    ]

    for sentence in sinhala_sentences:
        print(f"\nOriginal (si): {sentence}")
        translated_text = translate_text(sentence, nepali_model, nepali_tokenizer, src_lang="sin_Sinh")
        print(f"Translated (en): {translated_text}")
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
if __name__ == "__main__":
|
| 84 |
+
main()
|
test_translation.py
ADDED
|
@@ -0,0 +1,71 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
import os
|
| 3 |
+
import sys
|
| 4 |
+
import codecs
|
| 5 |
+
import torch
|
| 6 |
+
from transformers import M2M100ForConditionalGeneration, NllbTokenizerFast
|
| 7 |
+
|
| 8 |
+
def translate_text(text, model, tokenizer, src_lang="nep_Npi", target_lang="eng_Latn"):
    """
    Translates a single text string.

    Args:
        text: The source text to translate.
        model: A seq2seq model exposing ``generate(**inputs, ...)``.
        tokenizer: The tokenizer paired with ``model``; its ``src_lang``
            attribute is overwritten with ``src_lang`` on every call.
        src_lang: Language tag assigned to ``tokenizer.src_lang``.
        target_lang: Language tag whose token id is forced as the first
            generated token.

    Returns:
        The decoded translation, or a string starting with
        "An error occurred during translation:" if any step raised.
    """
    # NOTE(review): NLLB-200 checkpoints normally tag Nepali as "npi_Deva";
    # "nep_Npi" may not exist in the vocabulary. Confirm which tag the
    # fine-tuned checkpoint was trained with before changing the default.
    try:
        tokenizer.src_lang = src_lang
        inputs = tokenizer(text, return_tensors="pt")

        # The caller may have moved the model to a GPU; the encoded inputs must
        # live on the same device or generate() fails with a device mismatch.
        device = getattr(model, "device", None)
        if device is not None:
            inputs = inputs.to(device)

        # Look up only the one id that is needed instead of materialising the
        # whole vocabulary dict via ``tokenizer.vocab`` on every call.
        target_id = tokenizer.convert_tokens_to_ids(target_lang)
        if target_id is None or target_id == getattr(tokenizer, "unk_token_id", None):
            # Keep the original failure mode for an unknown target tag.
            raise KeyError(target_lang)

        generated_tokens = model.generate(
            **inputs,
            forced_bos_token_id=target_id,
            max_length=512,
        )
        return tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
    except Exception as e:
        # Deliberate best-effort: callers print the returned string directly.
        return f"An error occurred during translation: {e}"
|
| 24 |
+
|
| 25 |
+
def main():
    """
    Main function to load the model and run a test translation.

    The model and tokenizer are loaded from ``models/nllb-finetuned-nepali-en``
    next to this script. If loading fails, the error is printed and the
    function returns without translating. Otherwise each sample sentence is
    printed together with the result of ``translate_text``.
    """
    # Make stdout emit UTF-8 so the Devanagari samples can be printed.
    # reconfigure() changes the encoding of the existing text stream in place,
    # leaving its buffering behaviour alone; the codecs wrapper is kept only as
    # a fallback for stream objects that lack reconfigure().
    if hasattr(sys.stdout, "reconfigure"):
        sys.stdout.reconfigure(encoding="utf-8")
    elif hasattr(sys.stdout, "buffer"):
        sys.stdout = codecs.getwriter("utf-8")(sys.stdout.buffer)

    # --- Configuration ---
    # Resolve the model directory relative to this file so the script works
    # regardless of the current working directory.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    model_path = os.path.join(script_dir, "models", "nllb-finetuned-nepali-en")

    # --- Model Loading ---
    print("Loading model and tokenizer...")
    try:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model = M2M100ForConditionalGeneration.from_pretrained(model_path).to(device)
        tokenizer = NllbTokenizerFast.from_pretrained(model_path)
        print("Model and tokenizer loaded successfully.")
    except Exception as e:
        # Top-level boundary of a script: report and stop.
        print(f"Error loading model or tokenizer: {e}")
        return

    # --- Translation ---
    sentences_to_translate = [
        "मेरो नाम जेमिनी हो।",
        "आज मौसम कस्तो छ?",
        "मलाई नेपाली खाना मन पर्छ।",
        "तपाईंलाई कस्तो छ?",
        "वस्तुको परिवर्तन बचत गर्नुहोस् ।",
        "तिमीलाई कस्तो छ",
        "तिमी को हौ",
        "कति बज्यो",
        "बाटो कहाँ छ",
        "फिल्मले सामान्यतया सकारात्मक समीक्षा प्राप्त गर्यो, हिन्दी डब संस्करणमा अत्यन्तै राम्रो प्रदर्शन गर्यो",
        "इङ्गल्याण्डमा भएको गन्तव्य विवाहको पृष्ठभूमिमा सेट गरिएको, कथाले विवाह योजनाकार जगजिन्दर जोगिन्दर र धर्मपुत्र उत्तराधिकारी आलिया अरोरा बीचको विचित्र प्रेमकथालाई पछ्याउँछ, किनकि उनीहरू विचित्र परिवारहरू, व्यक्तिगत आघातहरू र व्यवस्थित विवाहको बेतुकापनहरू पार गर्छन्।",
        "साई रा नरसिंह रेड्डीको वास्तविक कथा रायलसीमा क्षेत्रका एक भारतीय स्वतन्त्रता सेनानी उय्यालवाडा नरसिंह रेड्डीमा केन्द्रित छ जसले १८४६ मा ब्रिटिश इस्ट इन्डिया कम्पनी विरुद्ध पहिलो सामूहिक विद्रोहको नेतृत्व गरेका थिए, सिपाही विद्रोहको एक दशक अघि। एक पोलिगर (एक सामन्ती सरदार), रेड्डी र उनका अनुयायीहरूले कृषि प्रणालीमा शोषणकारी परिवर्तनहरू विरुद्ध विद्रोह गरे, जसमा उनीहरूको पुर्खाको जग्गा कब्जा र कम्पनीद्वारा अनुचित कर लगाउने समावेश थियो। प्रारम्भिक विजय पछि, उनलाई पछि १८४७ मा पक्राउ गरियो र फाँसी दिएर मृत्युदण्ड दिइयो, उनको शरीर डर जग्गाउन प्रदर्शन गरियो।",
    ]

    for sentence in sentences_to_translate:
        print(f"\nOriginal text (Nepali): '{sentence}'")
        translated_text = translate_text(sentence, model, tokenizer)
        print(f"Translated text (English): '{translated_text}'")
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
# Call main() only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|