Commit f43f7c7
1 Parent(s): 8cd6bf6

universal humanizer

- app.py +44 -0
- universal_humanizer.py +0 -16
app.py
CHANGED
@@ -4,6 +4,50 @@
 import gradio as gr
 import time
 import os
+import nltk
+
+def ensure_nltk_resources():
+    """Ensure minimal NLTK data for tokenizing and lemmatization."""
+    resources = {
+        'punkt': 'tokenizers/punkt',
+        'wordnet': 'corpora/wordnet',
+        'omw-1.4': 'corpora/omw-1.4'
+    }
+    for name, path in resources.items():
+        try:
+            nltk.data.find(path)
+            print(f"✅ Resource already present: {name}")
+        except LookupError:
+            print(f"📥 Downloading {name} …")
+            try:
+                nltk.download(name, quiet=True)
+                print(f"✅ Downloaded {name}")
+            except Exception as e:
+                print(f"❌ Failed to download {name}: {e}")
+
+def test_nltk_setup():
+    """Test basic tokenization & lemmatization to verify setup."""
+    from nltk.tokenize import word_tokenize, sent_tokenize
+    from nltk.stem import WordNetLemmatizer
+
+    text = "This is a test. Testing tokenization and lemmatization."
+    # Test sentence splitting
+    sentences = sent_tokenize(text)
+    print(f"Sentence tokenize works: {len(sentences)} sentences: {sentences}")
+    # Test word tokenization
+    words = word_tokenize(text)
+    print(f"Word tokenize works: {len(words)} words: {words}")
+    # Test lemmatization
+    lemmatizer = WordNetLemmatizer()
+    lem = [lemmatizer.lemmatize(w) for w in words]
+    print(f"Lemmatization works: {lem}")
+
+# Run once at app startup, before the humanizer is imported.
+print("🔄 Ensuring NLTK minimal resources …")
+ensure_nltk_resources()
+print("🧪 Testing NLTK setup …")
+test_nltk_setup()
+
 
 # Import our universal humanizer
 from universal_humanizer import UniversalAITextHumanizer
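Aside, not part of the commit: NLTK 3.8.2 and later split the Punkt sentence model into a separate punkt_tab resource, so sent_tokenize can still raise LookupError on those versions even after punkt is installed. A minimal sketch of the same check-then-download pattern extended to cover that case, assuming such an NLTK version is deployed:

import nltk

# Hypothetical extension of the resources mapping above; the
# 'punkt_tab' entry is an assumption about newer NLTK releases.
resources = {
    'punkt': 'tokenizers/punkt',
    'punkt_tab': 'tokenizers/punkt_tab',
    'wordnet': 'corpora/wordnet',
    'omw-1.4': 'corpora/omw-1.4',
}
for name, path in resources.items():
    try:
        nltk.data.find(path)             # already installed?
    except LookupError:
        nltk.download(name, quiet=True)  # fetch the missing resource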
universal_humanizer.py
CHANGED
@@ -7,22 +7,6 @@ import time
 from collections import Counter
 import statistics
 
-# Download required NLTK data
-def ensure_nltk_data():
-    try:
-        nltk.data.find('tokenizers/punkt')
-    except LookupError:
-        nltk.download('punkt', quiet=True)
-    try:
-        nltk.data.find('corpora/wordnet')
-    except LookupError:
-        nltk.download('wordnet', quiet=True)
-    try:
-        nltk.data.find('corpora/omw-1.4')
-    except LookupError:
-        nltk.download('omw-1.4', quiet=True)
-
-ensure_nltk_data()
 from nltk.tokenize import sent_tokenize, word_tokenize
 from nltk.corpus import wordnet
 
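With the duplicate bootstrap removed, universal_humanizer.py now assumes the NLTK data is already on disk: the imports themselves succeed, but the first sent_tokenize or word_tokenize call raises LookupError if app.py's ensure_nltk_resources() has not run first. A minimal sketch of that ordering assumption; the fallback download is illustrative, not in the commit:

from nltk.tokenize import sent_tokenize

try:
    print(sent_tokenize("Bootstrap first. Then tokenize."))
except LookupError:
    # Reached only when ensure_nltk_resources() has not run yet.
    import nltk
    nltk.download('punkt', quiet=True)
    print(sent_tokenize("Bootstrap first. Then tokenize."))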