# osint-llm / app.py
# Author: Tom
# Commit 6466c00: "Add complete RAG-powered OSINT investigation assistant"
# (The lines "raw", "history blame", "8.58 kB" were Hugging Face Hub page
# chrome captured during extraction, not source code — preserved here as
# comments so the module remains valid Python.)
"""
OSINT Investigation Assistant - Gradio App
A RAG-powered assistant that helps investigators develop methodologies
for OSINT investigations using a database of 344+ OSINT tools.
"""
import os
import gradio as gr
from dotenv import load_dotenv
from src.rag_pipeline import create_pipeline
# Load environment variables (HF_TOKEN, SUPABASE_CONNECTION_STRING, and the
# optional LLM_MODEL / LLM_TEMPERATURE overrides) from a local .env file.
load_dotenv()
# Initialize the RAG pipeline once at import time so every Gradio handler
# below shares a single instance. If initialization fails, the error is
# printed and re-raised so the app fails fast instead of serving a broken UI.
print("Initializing OSINT Investigation Pipeline...")
try:
    pipeline = create_pipeline(
        retrieval_k=5,  # number of tool documents retrieved per query
        model=os.getenv("LLM_MODEL", "meta-llama/Llama-3.1-8B-Instruct"),
        temperature=float(os.getenv("LLM_TEMPERATURE", "0.7"))
    )
    print("βœ“ Pipeline initialized successfully")
except Exception as e:
    print(f"βœ— Error initializing pipeline: {e}")
    raise  # fail fast: the app is unusable without the pipeline
def investigate(message: str, history: list) -> str:
    """
    Answer a single investigation query without streaming.

    Args:
        message: User's investigation query
        history: Chat history (list of [user_msg, bot_msg] pairs); not read
            here, but required by the Gradio chat callback signature.

    Returns:
        The generated investigation methodology, or a human-readable error
        message if generation fails.
    """
    try:
        # Delegate straight to the shared pipeline; non-streaming mode.
        return pipeline.generate_methodology(message, stream=False)
    except Exception as e:
        # Surface the failure in-chat rather than crashing the UI.
        return f"Error generating response: {str(e)}\n\nPlease check your environment variables (HF_TOKEN, SUPABASE_CONNECTION_STRING) and try again."
def investigate_stream(message: str, history: list):
    """
    Streaming counterpart of investigate().

    Args:
        message: User's investigation query
        history: Chat history; not read here, required by the Gradio
            callback signature.

    Yields:
        The accumulated response text so far — Gradio replaces the displayed
        message with each successive yield.
    """
    try:
        pieces = []
        for chunk in pipeline.generate_methodology(message, stream=True):
            pieces.append(chunk)
            # Yield the full text-so-far, not just the new chunk.
            yield "".join(pieces)
    except Exception as e:
        # Surface the failure in-chat rather than crashing the UI.
        yield f"Error generating response: {str(e)}\n\nPlease check your environment variables (HF_TOKEN, SUPABASE_CONNECTION_STRING) and try again."
def get_tool_recommendations(query: str, k: int = 5) -> str:
    """
    Get tool recommendations for a query.

    Args:
        query: Investigation query
        k: Number of tools to recommend. Gradio Slider callbacks may pass
            this as a float, so it is coerced to int before use.

    Returns:
        Markdown-formatted tool recommendations, or an error message if
        retrieval fails.
    """
    try:
        # int(k): Gradio sliders can deliver floats; the retriever expects int.
        tools = pipeline.get_tool_recommendations(query, k=int(k))
        if not tools:
            return "No relevant tools found."

        # Build the markdown in a list and join once — avoids quadratic
        # string concatenation and keeps each field on its own line.
        parts = [f"## Top {len(tools)} Recommended Tools\n\n"]
        for i, tool in enumerate(tools, 1):
            # .get() with defaults: a record missing a field should degrade
            # gracefully instead of raising KeyError.
            parts.append(f"### {i}. {tool.get('name', 'Unknown')}\n")
            parts.append(f"- **Category**: {tool.get('category', 'N/A')}\n")
            parts.append(f"- **Cost**: {tool.get('cost', 'N/A')}\n")
            parts.append(f"- **URL**: {tool.get('url', 'N/A')}\n")
            parts.append(f"- **Description**: {tool.get('description', 'N/A')}\n")
            # 'details' is optional and sometimes stored as the literal 'N/A'.
            details = tool.get('details')
            if details and details != 'N/A':
                parts.append(f"- **Details**: {details}\n")
            parts.append("\n")
        return "".join(parts)
    except Exception as e:
        return f"Error retrieving tools: {str(e)}"
# Custom CSS for better appearance
custom_css = """
.gradio-container {
max-width: 900px !important;
}
#component-0 {
max-width: 900px;
}
"""
# Create Gradio interface: a streaming chat front-and-center, plus a direct
# tool-search tab and an informational "About" tab. The api_name arguments
# additionally expose each callback as an HTTP endpoint (/call/<name>).
with gr.Blocks(
    title="OSINT Investigation Assistant",
    theme=gr.themes.Soft(),
    css=custom_css
) as demo:
    # Header shown above the chat interface.
    gr.Markdown("""
    # πŸ” OSINT Investigation Assistant
    Ask me how to investigate anything using open-source intelligence methods.
    I'll provide you with a structured methodology and recommend specific OSINT tools
    from a database of 344+ tools.
    **Examples:**
    - "How do I investigate a suspicious domain?"
    - "What tools can I use to verify an image's authenticity?"
    - "How can I trace the origin of a social media account?"
    """)
    # Main chat interface — streams responses via investigate_stream.
    chatbot = gr.ChatInterface(
        fn=investigate_stream,
        type="messages",
        examples=[
            "How do I investigate a suspicious domain?",
            "What tools can I use to verify an image's authenticity?",
            "How can I trace the origin of a social media account?",
            "What's the best way to archive web content for investigation?",
            "How do I geolocate an image from social media?"
        ],
        cache_examples=False,  # examples hit the live pipeline; don't pre-run them
        title="Chat Interface",
        description="Ask your investigation questions here",
        api_name="investigate"  # This creates the /call/investigate API endpoint
    )
    # Additional tab for direct tool search (bypasses the LLM, retrieval only).
    with gr.Tab("Tool Search"):
        gr.Markdown("### Search for OSINT Tools")
        with gr.Row():
            tool_query = gr.Textbox(
                label="Search Query",
                placeholder="e.g., social media analysis, image verification, domain investigation",
                lines=2
            )
            tool_count = gr.Slider(
                minimum=1,
                maximum=20,
                value=5,
                step=1,
                label="Number of Tools"
            )
        tool_search_btn = gr.Button("Search Tools", variant="primary")
        tool_output = gr.Markdown(label="Recommended Tools")
        # Wire the button to the retrieval helper.
        tool_search_btn.click(
            fn=get_tool_recommendations,
            inputs=[tool_query, tool_count],
            outputs=tool_output,
            api_name="search_tools"  # This creates the /call/search_tools API endpoint
        )
    # Information tab — static documentation, no callbacks.
    with gr.Tab("About"):
        gr.Markdown("""
        ## About This Assistant
        This OSINT Investigation Assistant helps researchers and investigators develop
        structured methodologies for open-source intelligence investigations.
        ### Features
        - 🎯 **Structured Methodologies**: Get step-by-step investigation plans
        - πŸ› οΈ **Tool Recommendations**: Access a database of 344+ OSINT tools
        - πŸ” **Context-Aware**: Tools are recommended based on your specific needs
        - πŸš€ **API Access**: Use this app via API for integration with other tools
        ### Technology Stack
        - **Vector Database**: Supabase with PGVector (344 OSINT tools)
        - **LLM**: Hugging Face Inference Providers (Llama 3.1)
        - **RAG Framework**: LangChain for retrieval-augmented generation
        - **UI/API**: Gradio with automatic API generation
        ### API Usage
        This app automatically exposes API endpoints. You can access them using:
        **Python Client:**
        ```python
        from gradio_client import Client
        client = Client("your-space-url")
        result = client.predict("How do I investigate a domain?", api_name="/investigate")
        print(result)
        ```
        **cURL:**
        ```bash
        curl -X POST "https://your-space.hf.space/call/investigate" \\
        -H "Content-Type: application/json" \\
        -d '{"data": ["How do I investigate a domain?"]}'
        ```
        View the full API documentation at the bottom of this page (click "Use via API").
        ### Environment Variables Required
        - `SUPABASE_CONNECTION_STRING`: PostgreSQL connection string for Supabase
        - `HF_TOKEN`: Hugging Face API token for Inference Providers
        - `LLM_MODEL` (optional): Model to use (default: meta-llama/Llama-3.1-8B-Instruct)
        - `LLM_TEMPERATURE` (optional): Temperature for generation (default: 0.7)
        ### Data Source
        The tool recommendations are based on the Bellingcat OSINT Toolkit and other
        curated sources, with 344+ tools across categories including:
        - Social Media Investigation
        - Image and Video Analysis
        - Domain and Network Investigation
        - Geolocation
        - Archiving and Preservation
        - And more...
        ---
        Built with ❀️ for the OSINT community
        """)
# Launch configuration
if __name__ == "__main__":
    # Warn (but do not abort) when required configuration is absent — the
    # handlers surface a descriptive error at request time instead.
    required_vars = ["SUPABASE_CONNECTION_STRING", "HF_TOKEN"]
    missing_vars = []
    for var in required_vars:
        if not os.getenv(var):
            missing_vars.append(var)
    if missing_vars:
        print(f"⚠️ Warning: Missing environment variables: {', '.join(missing_vars)}")
        print("Please set these in your .env file or as environment variables")
    # Launch the app
    # Set mcp_server=True to enable MCP protocol for agent integration
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        show_api=True,  # Show API documentation
        share=False
    )