Yassine Mhirsi
committed on
Commit · 94c2a9a
1 Parent(s): 7218dd0
refactor: Simplify topic extraction logic in TopicService by removing the Pydantic schema, enhancing JSON response handling, and adding fuzzy matching for improved topic validation.
- services/topic_service.py +54 -23
services/topic_service.py
CHANGED
@@ -1,10 +1,10 @@
 """Service for topic extraction from text using LangChain Groq"""
 
 import logging
+import json
 from typing import Optional, List
 from langchain_core.messages import HumanMessage, SystemMessage
 from langchain_groq import ChatGroq
-from pydantic import BaseModel, Field
 from langsmith import traceable
 
 from config import GROQ_API_KEY
@@ -57,11 +57,6 @@ PREDEFINED_TOPICS = [
 ]
 
 
-class TopicOutput(BaseModel):
-    """Pydantic schema for topic extraction output"""
-    topic: str = Field(..., description="The selected topic from the predefined list that most closely matches the input text")
-
-
 class TopicService:
     """Service for extracting topics from text arguments by matching to predefined topics"""
 
@@ -72,7 +67,7 @@ class TopicService:
         self.predefined_topics = PREDEFINED_TOPICS
 
     def initialize(self, model_name: Optional[str] = None):
-        """Initialize the Groq LLM
+        """Initialize the Groq LLM"""
         if self.initialized:
             logger.info("Topic service already initialized")
             return
@@ -86,15 +81,13 @@
         try:
             logger.info(f"Initializing topic extraction service with model: {self.model_name}")
 
-            llm = ChatGroq(
+            self.llm = ChatGroq(
                 model=self.model_name,
                 api_key=GROQ_API_KEY,
                 temperature=0.0,
                 max_tokens=512,
             )
 
-            # Bind structured output directly to the model
-            self.llm = llm.with_structured_output(TopicOutput)
             self.initialized = True
 
             logger.info("✓ Topic extraction service initialized successfully")
@@ -111,6 +104,8 @@
 
 IMPORTANT: You MUST return EXACTLY one of the predefined topics below. Do not create new topics or modify the wording.
 
+Return your response as a JSON object with a single "topic" field containing the exact topic text from the list.
+
 Predefined Topics:
 {topics_list}
 
@@ -118,17 +113,17 @@ Instructions:
 1. Analyze the user's input text carefully
 2. Identify the main theme, subject, or argument being discussed
 3. Find the topic from the predefined list that is MOST SIMILAR to the input text
-4. Return the EXACT topic text as it appears in the list above
+4. Return a JSON object with the EXACT topic text as it appears in the list above
 
 Examples:
 - Input: "I think we need to make assisted suicide illegal and punishable by law."
-  Output: "Assisted suicide should be a criminal offence"
+  Output: {{"topic": "Assisted suicide should be a criminal offence"}}
 
 - Input: "Student debt is crushing young people. The government should help pay for college."
-  Output: "We should subsidize student loans"
+  Output: {{"topic": "We should subsidize student loans"}}
 
 - Input: "Marijuana should be legal for adults to use recreationally."
-  Output: "We should legalize cannabis"
+  Output: {{"topic": "We should legalize cannabis"}}
 """
 
     @traceable(name="extract_topic")
@@ -162,7 +157,25 @@ Examples:
             ]
         )
 
-        selected_topic = result.topic.strip()
+        # Extract content from the response
+        response_content = result.content.strip()
+
+        # Try to parse as JSON first
+        try:
+            parsed_response = json.loads(response_content)
+            selected_topic = parsed_response.get("topic", "").strip()
+        except json.JSONDecodeError:
+            # If not JSON, try to extract topic from plain text
+            # Look for the topic in the response text
+            selected_topic = response_content.strip()
+            # Remove quotes if present
+            if selected_topic.startswith('"') and selected_topic.endswith('"'):
+                selected_topic = selected_topic[1:-1]
+            elif selected_topic.startswith("'") and selected_topic.endswith("'"):
+                selected_topic = selected_topic[1:-1]
+
+        if not selected_topic:
+            raise ValueError("No topic found in LLM response")
 
         # Validate that the returned topic is in the predefined list
        if selected_topic not in self.predefined_topics:
@@ -178,14 +191,32 @@ Examples:
                     logger.info(f"Found case-insensitive match: '{selected_topic}'")
                     break
             else:
-                # If still no match, log error and raise
-                logger.error(
-                    f"Could not match returned topic '{selected_topic}' to any predefined topic. "
-                    f"Available topics: {self.predefined_topics[:3]}..."
-                )
-                raise ValueError(
-                    f"Returned topic '{selected_topic}' is not in the predefined topics list"
-                )
+                # If still no match, try fuzzy matching by checking if the topic contains key words
+                # This is a fallback for when the LLM returns something close but not exact
+                best_match = None
+                best_match_score = 0
+                selected_words = set(selected_topic_lower.split())
+
+                for predefined_topic in self.predefined_topics:
+                    predefined_words = set(predefined_topic.lower().split())
+                    # Calculate word overlap
+                    overlap = len(selected_words & predefined_words)
+                    if overlap > best_match_score and overlap >= 2:  # At least 2 words must match
+                        best_match_score = overlap
+                        best_match = predefined_topic
+
+                if best_match:
+                    logger.info(f"Found fuzzy match: '{selected_topic}' -> '{best_match}'")
+                    selected_topic = best_match
+                else:
+                    # If still no match, log error and raise
+                    logger.error(
+                        f"Could not match returned topic '{selected_topic}' to any predefined topic. "
+                        f"Available topics: {self.predefined_topics[:3]}..."
+                    )
+                    raise ValueError(
+                        f"Returned topic '{selected_topic}' is not in the predefined topics list"
+                    )
 
         return selected_topic
 
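The JSON-with-plain-text-fallback parsing introduced above can be exercised on its own. A minimal sketch, assuming only the standard library; the parse_topic_reply helper and the sample replies are illustrative and not part of the commit:

import json

def parse_topic_reply(raw: str) -> str:
    """Parse a model reply that should be {"topic": "..."} but may be plain text."""
    content = raw.strip()
    try:
        # Preferred path: the model followed the prompt and returned a JSON object
        topic = json.loads(content).get("topic", "").strip()
    except json.JSONDecodeError:
        # Fallback: treat the whole reply as the topic, dropping surrounding quotes
        topic = content
        if topic.startswith('"') and topic.endswith('"'):
            topic = topic[1:-1]
        elif topic.startswith("'") and topic.endswith("'"):
            topic = topic[1:-1]
    if not topic:
        raise ValueError("No topic found in LLM response")
    return topic

print(parse_topic_reply('{"topic": "We should legalize cannabis"}'))  # JSON reply
print(parse_topic_reply("'We should legalize cannabis'"))             # bare quoted reply

Note that a reply that is valid JSON but not an object (for example a bare double-quoted string) falls outside both branches and still raises, in this sketch as in the service code.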
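The fuzzy fallback added in the last hunk is a word-overlap vote. A standalone sketch of the same idea, using a made-up two-topic list rather than the service's full PREDEFINED_TOPICS:

from typing import List, Optional

def fuzzy_match(candidate: str, topics: List[str]) -> Optional[str]:
    """Return the topic sharing the most words with candidate (at least 2), else None."""
    candidate_words = set(candidate.lower().split())
    best_match, best_score = None, 0
    for topic in topics:
        overlap = len(candidate_words & set(topic.lower().split()))
        if overlap > best_score and overlap >= 2:
            best_match, best_score = topic, overlap
    return best_match

topics = ["We should legalize cannabis", "We should subsidize student loans"]
# Shares "cannabis" and "should" with the first topic, only "should" with the second
print(fuzzy_match("Cannabis should be legal", topics))  # -> We should legalize cannabis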