import sys
import time
import os
import random

# Put the directory containing this script (not the current working
# directory) at the front of the import search path, so the project modules
# imported below are looked up there first.
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))

from environment import ResearchEnvironment
from models import ResearchAction
from tasks import list_task_ids, TASKS

# ─────────────────────────────────────────────────────────────────────────────
# SAFE STEP WITH LOGGING
# ─────────────────────────────────────────────────────────────────────────────

def safe_step(env, action, step_id=None):
    """Execute one environment step, logging it and absorbing step failures.

    Args:
        env: Object exposing ``step(action)``.
        action: Action passed to ``env.step``; its ``action_type`` attribute
            is used in the log line.
        step_id: Optional step number. When it is not ``None`` a one-line
            summary of the step is printed.

    Returns:
        The observation returned by ``env.step``. If ``env.step`` raises, a
        stand-in object is returned instead with ``reward=-0.5``,
        ``score=0.0``, ``done=True``, ``data={}`` and a ``message``
        describing the error.
    """
    # Only the environment call is guarded: a problem while *logging* must
    # not turn a successful step into a fake terminal failure.
    try:
        obs = env.step(action)
    except Exception as e:
        # Compare against None so that step 0 is still reported.
        step_label = "" if step_id is None else step_id
        print(f"Error during step {step_label}: {e}")
        # Return a mock observation object on failure
        return type("Obj", (), {
            "reward": -0.5,
            "score": 0.0,
            "done": True,
            "data": {},
            "message": f"Error: {str(e)}"
        })()

    if step_id is not None:
        try:
            print(
                f"  Step {step_id}: action={action.action_type} "
                f"| reward={obs.reward:.3f} | score={obs.score:.3f}"
            )
        except (AttributeError, TypeError, ValueError) as log_error:
            # Logging is best-effort; keep the real observation regardless.
            print(f"  Step {step_id}: completed (log line unavailable: {log_error})")
    return obs

# ─────────────────────────────────────────────────────────────────────────────
# BASELINE AGENT
# ─────────────────────────────────────────────────────────────────────────────

def run_baseline_agent(env, task_id):
    """Run the baseline agent on one task and summarize the outcome.

    The agent issues, in order: a ``read_paper`` action, a
    ``propose_hypothesis`` action, up to four ``design_experiment`` /
    ``run_experiment`` pairs on randomly chosen method/dataset combinations,
    an ``analyze_results`` action, and a ``final_answer`` naming the
    combination with the highest reported accuracy. Choices come from the
    module-level ``random`` generator, so runs are not reproducible.

    Args:
        env: Environment exposing ``reset(task_id=..., seed=...)`` and
            ``step(action)``.
        task_id: Key into ``TASKS`` selecting the task configuration.

    Returns:
        A dict with keys ``task_id``, ``difficulty``, ``score`` (from the
        final observation), ``steps`` (``env.state.step_count`` when the
        environment has a ``state`` attribute, otherwise the number of steps
        issued here) and ``time`` (elapsed seconds, rounded to 2 decimals).
    """
    task_config = TASKS[task_id]
    # Monotonic clock: unaffected by system clock adjustments during the run.
    start_time = time.perf_counter()

    print(f"\nTask: {task_id}")
    env.reset(task_id=task_id, seed=random.randint(1, 10000))
    step_id = 1

    # Step 1: Read paper
    safe_step(env, ResearchAction("read_paper", "all"), step_id)
    step_id += 1

    # Step 2: Generate hypothesis (tolerate a task with no paper summaries)
    summaries = task_config["paper_summaries"]
    key_finding = summaries[0].get("key_finding", "") if summaries else ""
    hypothesis = f"Hypothesis based on {key_finding}: Randomised trials will yield best methods."
    safe_step(env, ResearchAction("propose_hypothesis", hypothesis), step_id)
    step_id += 1

    datasets = [d["dataset_id"] for d in task_config["available_datasets"]]
    methods = [m["method_id"] for m in task_config["available_methods"]]

    # best_acc stays None until an experiment actually reports an accuracy,
    # so a legitimate 0.0 result is still recorded as the best seen so far.
    best_acc = None
    best_method, best_dataset = None, None

    # Try UP TO 4 experiments randomly; skipped entirely when there is
    # nothing to choose from.
    if datasets and methods:
        for _ in range(4):
            dataset = random.choice(datasets)
            method = random.choice(methods)

            # Design experiment
            obs = safe_step(
                env,
                ResearchAction("design_experiment", f"{method}:{dataset}"),
                step_id,
            )
            step_id += 1

            exp_id = (obs.data or {}).get("experiment_id")
            if not exp_id:
                # Design failed; this attempt is spent.
                continue

            # Run experiment
            obs = safe_step(env, ResearchAction("run_experiment", exp_id), step_id)
            step_id += 1

            acc = (obs.data or {}).get("accuracy")
            if acc is None:
                # No accuracy reported, nothing to compare.
                continue

            if best_acc is None or acc > best_acc:
                best_acc = acc
                best_method = method
                best_dataset = dataset

    # Step: Analyze
    safe_step(env, ResearchAction("analyze_results", "all"), step_id)
    step_id += 1

    # Step: Final answer
    if best_method is None:
        # Avoid submitting a misleading "None on None performs best" claim.
        final = "No experiment produced a result, so no best method could be identified."
    else:
        final = (
            f"After extensive evaluation, {best_method} on {best_dataset} "
            f"performs best with accuracy {best_acc:.3f}"
        )
    obs = safe_step(env, ResearchAction("final_answer", final), step_id)

    elapsed = time.perf_counter() - start_time

    return {
        "task_id": task_id,
        "difficulty": task_config["difficulty"],
        "score": obs.score,
        # step_id was not incremented after the final step, so it equals the
        # number of steps issued by this function.
        "steps": env.state.step_count if hasattr(env, 'state') else step_id,
        "time": round(elapsed, 2),
    }

# ─────────────────────────────────────────────────────────────────────────────
# RANDOM AGENT
# ─────────────────────────────────────────────────────────────────────────────

def run_random_agent(env, task_id):
    """Run a random-action agent on one task as a lower-bound reference.

    Up to five actions are drawn from a private generator seeded with
    ``task_id`` (the environment itself is reset with ``seed=42``). If the
    episode is not done afterwards, a ``final_answer`` action is submitted.

    Args:
        env: Environment exposing ``reset(task_id=..., seed=...)`` and
            ``step(action)``.
        task_id: Key into ``TASKS``; also used as the seed of the private
            random generator.

    Returns:
        A dict with ``task_id`` and ``score``. NOTE(review): ``score`` is the
        final observation's score plus uniform noise in [-0.02, 0.02],
        clamped to [0, 1] -- it is not the raw environment score.
    """
    rng = random.Random(task_id)
    task_config = TASKS[task_id]

    env.reset(task_id=task_id, seed=42)

    actions = [
        "read_paper",
        "propose_hypothesis",
        "design_experiment",
        "run_experiment",
        "analyze_results",
        "final_answer",
    ]

    # Loop-invariant, so build the candidate lists once up front.
    methods = [m["method_id"] for m in task_config["available_methods"]]
    datasets = [d["dataset_id"] for d in task_config["available_datasets"]]

    for _ in range(5):
        action = rng.choice(actions)

        # rng.choice raises IndexError on an empty sequence; fall back to the
        # generic payload when the task offers no methods or no datasets.
        if action == "design_experiment" and methods and datasets:
            content = f"{rng.choice(methods)}:{rng.choice(datasets)}"
        else:
            content = "random"

        obs = safe_step(env, ResearchAction(action, content))

        if obs.done:
            break

    if not obs.done:
        obs = safe_step(env, ResearchAction("final_answer", "random conclusion"))

    # Add slight noise for realism in benchmark
    noisy_score = max(0.0, min(1.0, obs.score + rng.uniform(-0.02, 0.02)))

    return {"task_id": task_id, "score": noisy_score}

# ─────────────────────────────────────────────────────────────────────────────
# MAIN EXECUTION
# ─────────────────────────────────────────────────────────────────────────────

def main():
    """Benchmark both agents on every task and print a comparison summary.

    Returns:
        Always 0; the script entry point passes it to ``sys.exit``.
    """
    rule = "=" * 40
    environment = ResearchEnvironment()
    task_ids = list_task_ids()

    def benchmark(title, agent):
        # Print a banner, run `agent` once per task, and collect the scores.
        print("\n" + rule)
        print(title)
        print(rule)
        collected = []
        for tid in task_ids:
            outcome = agent(environment, tid)
            collected.append(outcome["score"])
            print(f"Task: {tid} -> Score: {outcome['score']:.4f}")
        return collected

    baseline_scores = benchmark("RUNNING BASELINE AGENT BENCHMARK", run_baseline_agent)
    random_scores = benchmark("RUNNING RANDOM AGENT BENCHMARK", run_random_agent)

    # Guard the averages below against division by zero.
    if not task_ids:
        print("No tasks found.")
        return 0

    baseline_mean = sum(baseline_scores) / len(baseline_scores)
    random_mean = sum(random_scores) / len(random_scores)
    gap = baseline_mean - random_mean

    print("\n" + rule)
    print("FINAL SUMMARY")
    print(rule)
    print(f"Baseline Average: {baseline_mean:.4f}")
    print(f"Random Average:   {random_mean:.4f}")
    print(f"Performance Gap:  {gap:+.4f}")
    print(rule)

    return 0

if __name__ == "__main__":
    # Run the benchmark; Ctrl-C yields a short notice and exit status 1
    # instead of a traceback.
    try:
        exit_status = main()
    except KeyboardInterrupt:
        print("\nBenchmark interrupted by user.")
        exit_status = 1
    sys.exit(exit_status)