"""
Test: Upgraded flan-t5-base Model for Crossword Clue Generation

Compare flan-t5-base performance against the previous flan-t5-small results.
"""
| |
|
# Standard-library imports, alphabetized.
import logging
import sys
from pathlib import Path

# Make sibling modules (e.g. llm_clue_generator) importable when this
# script is run directly from its own directory.
sys.path.insert(0, str(Path(__file__).parent))
| |
|
# Import the project clue generator; degrade gracefully when it is absent
# so the script can still print a clear diagnostic instead of crashing.
# GENERATOR_AVAILABLE gates the whole test run in test_flan_t5_base().
try:
    from llm_clue_generator import LLMClueGenerator
    GENERATOR_AVAILABLE = True
except ImportError as e:
    # Fix: the failure marker was mojibake-corrupted ("β"); restored emoji.
    print(f"❌ Import error: {e}")
    GENERATOR_AVAILABLE = False
| |
|
| | |
# Module-wide logging: timestamped records, INFO level and above.
LOG_FORMAT = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT)
logger = logging.getLogger(__name__)
| |
|
| |
|
def test_flan_t5_base() -> None:
    """Exercise flan-t5-base on word/topic pairs that flan-t5-small handled poorly.

    For each test case, generates candidate clues in several prompt styles,
    scores the best candidate with simple heuristics (answer leakage, known
    nonsense phrases, descriptiveness, definitional vocabulary), and prints
    an aggregate quality report comparing against flan-t5-small.
    """
    if not GENERATOR_AVAILABLE:
        print("❌ Cannot run test - LLM generator not available")
        return

    print("🧪 Testing Upgraded flan-t5-base Model")
    print("=" * 60)

    # Model load/download happens in initialize(), so failures are caught below.
    print("🚀 Initializing flan-t5-base clue generator...")
    generator = LLMClueGenerator()

    try:
        generator.initialize()
        # Fix: this message was split across two physical lines in the file
        # (a syntax error inside an f-string); re-joined onto one line.
        print(f"✅ Generator initialized successfully with {generator.model_name}")
        print("📊 Model size: ~1GB (vs ~80MB for flan-t5-small)")
    except Exception as e:
        print(f"❌ Failed to initialize generator: {e}")
        return

    # (word, topic) pairs, roughly ordered: common words first, then
    # domain-specific terms, then words flan-t5-small failed on outright.
    test_cases = [
        ("CAT", "animals"),
        ("KITTY", "animals"),
        ("MEAL", "food"),
        ("HUNGER", "food"),
        ("SONG", "music"),
        ("GUITAR", "music"),
        ("BATSMAN", "cricket"),
        ("SWIMMING", "sports"),
        ("AIRPORT", "transportation"),
        ("DATABASE", "technology"),
        ("SCIENTIST", "science"),
        ("PIZZA", "food"),
        ("MOUNTAIN", "geography"),
    ]

    print(f"\n🎯 Testing {len(test_cases)} word-topic combinations with flan-t5-base")
    print("=" * 60)

    # Quality tallies.
    excellent_clues = 0
    good_clues = 0
    poor_clues = 0
    failed_clues = 0

    # (word, topic, clue) triples that clearly beat the flan-t5-small output.
    major_improvements = []

    for word, topic in test_cases:
        print(f"\n🔍 Testing: '{word}' + '{topic}'")
        print("-" * 40)

        try:
            # Try several prompt styles; keep any non-trivial generation.
            styles = ["definition", "description", "category", "function", "context"]
            candidates = []

            for style in styles:
                clue = generator.generate_clue(
                    word=word,
                    topic=topic,
                    clue_style=style,
                    difficulty="medium",
                )
                # Discard empty or too-short generations.
                if clue and len(clue) > 5:
                    candidates.append((style, clue))

            if candidates:
                print(f"Generated {len(candidates)} candidates:")
                for i, (style, clue) in enumerate(candidates, 1):
                    print(f"  {i}. [{style}] {clue}")

                # First successful style wins (styles are ordered by preference).
                best_style, best_clue = candidates[0]
                print(f"\n🏆 Best clue [{best_style}]: {best_clue}")

                word_lower = word.lower()
                clue_lower = best_clue.lower()

                # A clue that leaks the answer is unusable.
                contains_word = word_lower in clue_lower

                # Nonsense fragments previously observed in flan-t5-small output.
                old_nonsense = any(bad in clue_lower for bad in [
                    "trick and treating", "gritting your teeth", "jack nixt",
                    "fender", "tryon", "nicolas", "occurrence", "sludge",
                ])

                # Heuristic: multi-word, reasonably long, and clean.
                is_descriptive = (
                    len(best_clue.split()) >= 2 and
                    len(best_clue) >= 8 and
                    not contains_word and
                    not old_nonsense
                )

                # Heuristic: uses definitional vocabulary without leaking the word.
                is_definitional = (
                    any(def_word in clue_lower for def_word in [
                        "player", "sport", "instrument", "device", "system", "food",
                        "language", "place", "animal", "creature", "location",
                    ]) and not contains_word
                )

                if contains_word:
                    print("❌ Quality: POOR (contains target word)")
                    poor_clues += 1
                elif old_nonsense:
                    print("❌ Quality: POOR (nonsensical)")
                    poor_clues += 1
                elif is_definitional:
                    # Fix: re-joined a message that was split mid-string.
                    print("✅ Quality: EXCELLENT (definitional)")
                    excellent_clues += 1
                    major_improvements.append((word, topic, best_clue))
                elif is_descriptive:
                    print("✅ Quality: GOOD (descriptive)")
                    good_clues += 1
                    major_improvements.append((word, topic, best_clue))
                else:
                    print("📝 Quality: ACCEPTABLE")
                    good_clues += 1
            else:
                print("❌ No valid clues generated")
                failed_clues += 1

        except Exception as e:
            print(f"❌ Error generating clue: {e}")
            failed_clues += 1

    # Aggregate report. test_cases is a non-empty literal, so the rate
    # divisions below cannot divide by zero.
    total_tests = len(test_cases)
    print("\n" + "=" * 60)
    print("📊 FLAN-T5-BASE RESULTS")
    print("=" * 60)
    print(f"Total tests: {total_tests}")
    print(f"Excellent clues: {excellent_clues}")
    print(f"Good clues: {good_clues}")
    print(f"Poor clues: {poor_clues}")
    print(f"Failed clues: {failed_clues}")
    print(f"Success rate: {((excellent_clues + good_clues) / total_tests) * 100:.1f}%")
    print(f"Excellence rate: {(excellent_clues / total_tests) * 100:.1f}%")

    if major_improvements:
        print("\n🌟 MAJOR IMPROVEMENTS OVER FLAN-T5-SMALL:")
        print("-" * 60)
        # Show at most five highlights.
        for word, topic, clue in major_improvements[:5]:
            print(f"  {word} + {topic}: \"{clue}\"")

    # Verdict thresholds: >=40% excellent, >=60% usable, >=30% usable.
    if excellent_clues >= total_tests * 0.4:
        print("🎉 MAJOR SUCCESS! flan-t5-base produces excellent clues!")
        print("🚀 Ready for production use - significant upgrade from flan-t5-small")
    elif (excellent_clues + good_clues) >= total_tests * 0.6:
        print("👍 Good improvement! Much better than flan-t5-small")
        print("✅ Suitable for production with semantic fallback")
    elif (excellent_clues + good_clues) >= total_tests * 0.3:
        print("⚠️ Some improvement over flan-t5-small, but still limited")
    else:
        print("❌ Still struggling - may need even larger model or external knowledge")
| |
|
| |
|
def main():
    """Entry point: run the flan-t5-base upgrade comparison test."""
    test_flan_t5_base()


if __name__ == "__main__":
    main()