Spaces:

Apollo-LMMs
/

TimeScope

Sleeping

App Files Files Community

TimeScope / app.py

orrzohar

Upload 5 files

4dca8ec verified 5 months ago

raw

history blame contribute delete

9.31 kB

	"""
	Gradio demo – visualise benchmark accuracy curves.

	Required CSV files (place in the same folder as app.py):

	├── aggregated_accuracy.csv
	├── qa_accuracy.csv
	├── ocr_accuracy.csv
	└── temporal_accuracy.csv

	Each file has the columns

	Model,<context‑length‑1>,<context‑length‑2>,…

	where the context‑length headers are strings such as `30min`, `60min`, `120min`, …

	No further cleaning / renaming is done apart from the cosmetic model-name
	replacements defined in `RENAME` (e.g. “gpt4.1” → “ChatGPT 4.1”).
	"""

	from pathlib import Path

	import pandas as pd
	import plotly.graph_objects as go
	import gradio as gr
	import math

	# --------------------------------------------------------------------- #
	# Config #
	# --------------------------------------------------------------------- #

	# Benchmark key -> CSV file holding that benchmark's accuracy table.
	FILES: dict[str, str] = {
	    "aggregated": "aggregated_accuracy.csv",
	    "qa": "qa_accuracy.csv",
	    "ocr": "ocr_accuracy.csv",
	    "temporal": "temporal_accuracy.csv",
	}

	# Benchmark key -> human-readable label shown in the UI.
	DISPLAY_LABELS: dict[str, str] = {
	    "aggregated": "Aggregated",
	    "qa": "QA",
	    "ocr": "OCR",
	    "temporal": "Temporal",
	}

	# Models pre-selected per benchmark, written with their display names
	# (i.e. after RENAME has been applied). A benchmark without an entry here
	# falls back to the first six available models.
	DEFAULT_MODELS: dict[str, list[str]] = {
	    "aggregated": [
	        "Gemini 2.5 Pro",
	        "ChatGPT 4.1",
	        "Qwen2.5-VL-7B",
	        "InternVL2.5-8B",
	        "LLaMA-3.2-11B-Vision",
	    ],
	}

	# Regex pattern -> replacement text applied to the "Model" column on load.
	RENAME: dict[str, str] = {
	    r"gpt4\.1": "ChatGPT 4.1",
	    r"Gemini\s2\.5\spro": "Gemini 2.5 Pro",
	    r"LLaMA-3\.2B-11B": "LLaMA-3.2-11B-Vision",
	}

	# --------------------------------------------------------------------- #
	# Data loading #
	# --------------------------------------------------------------------- #

	def _read_csv(path: str | Path) -> pd.DataFrame:
	    """Load one benchmark CSV and normalise its ``Model`` column.

	    Args:
	        path: Location of the CSV file. The file must contain a ``Model``
	            column.

	    Returns:
	        The table as parsed by ``pd.read_csv``, with the ``Model`` column
	        passed through the regex substitutions in ``RENAME`` and cast to
	        ``str``.

	    Raises:
	        KeyError: If the CSV has no ``Model`` column.
	    """
	    df = pd.read_csv(path)
	    # RENAME maps regex patterns to replacement text; regex=True makes
	    # pandas treat the dict keys as patterns rather than literal values.
	    df["Model"] = df["Model"].replace(RENAME, regex=True).astype(str)
	    return df

	# Every benchmark table, loaded once at import time and keyed by benchmark name.
	dfs: dict[str, pd.DataFrame] = {
	    bench_name: _read_csv(csv_path) for bench_name, csv_path in FILES.items()
	}

	# --------------------------------------------------------------------- #
	# Colour palette and model metadata #
	# --------------------------------------------------------------------- #

	import plotly.express as px

	# Colour-blind-safe qualitative palette shipped with Plotly Express.
	SAFE_PALETTE = px.colors.qualitative.Safe

	# Sorted union of the model names found in every benchmark table. Sorting
	# makes each model's position in this list deterministic, so anything
	# derived from that position is stable across benchmarks and runs.
	ALL_MODELS: list[str] = sorted(
	    set().union(*(frame["Model"].unique() for frame in dfs.values()))
	)

	# Plotly marker symbol names.
	MARKER_SYMBOLS = [
	    "circle", "square", "triangle-up", "diamond", "cross",
	    "triangle-down", "x", "triangle-right", "triangle-left", "pentagon",
	]

	# Every column of the aggregated table except the model-name column.
	TIME_COLS = [col for col in dfs["aggregated"].columns if col.lower() != "model"]


	def _pretty_time(label: str) -> str:
	    """Return a compact display form of a ``<N>min`` duration label.

	    Durations of at least 60 minutes are shown in hours: ``"120min"``
	    becomes ``"2hr"`` and ``"90min"`` becomes ``"1.5hr"``. Shorter
	    durations (``"30min"``) and labels that are not an integer followed by
	    ``"min"`` (``"admin"``, ``"2hr"``) are returned unchanged.

	    Args:
	        label: Column header to prettify.

	    Returns:
	        The hour-based label, or ``label`` itself when no conversion applies.
	    """
	    if not label.endswith("min"):
	        return label
	    try:
	        minutes = int(label[:-3])
	    except ValueError:
	        # Ends in "min" but the prefix is not an integer: not a duration.
	        return label
	    if minutes < 60:
	        return label
	    hours = minutes / 60
	    return f"{hours:.0f}hr" if hours.is_integer() else f"{hours:.1f}hr"


	# Raw duration column header -> label shown on the chart's x axis.
	TIME_LABELS: dict[str, str] = {
	    column: _pretty_time(column) for column in TIME_COLS
	}

	# --------------------------------------------------------------------- #
	# Plotting #
	# --------------------------------------------------------------------- #

	def render_chart(
	    benchmark: str,
	    models: list[str],
	    log_scale: bool,
	) -> go.Figure:
	    """Build the accuracy-versus-video-duration line chart for one benchmark.

	    Args:
	        benchmark: Benchmark name. It is lower-cased before being used as a
	            key into ``dfs``, so the display label ``"QA"`` selects
	            ``dfs["qa"]``.
	        models: Display names of the models to plot. Names with no row in
	            the benchmark table are skipped; ``None`` is treated as empty.
	        log_scale: When true the Y axis is logarithmic, starting at the
	            decade at or below the smallest plotted value and ending at
	            100. When false the axis is linear from 0 to 100.

	    Returns:
	        A Plotly figure with one ``lines+markers`` trace per plotted model.

	    Raises:
	        KeyError: If the lower-cased benchmark is not a key of ``dfs``.
	    """
	    bench_key = benchmark.lower()
	    df = dfs[bench_key]
	    fig = go.Figure()

	    palette = SAFE_PALETTE
	    # Every trace shares the same x categories, so build them once.
	    x_labels = [TIME_LABELS[c] for c in TIME_COLS]

	    # Smallest plotted value across the selected models (for log scaling).
	    min_y_val = None

	    for idx, m in enumerate(models or []):
	        row = df.loc[df["Model"] == m]
	        if row.empty:
	            continue
	        raw = row[TIME_COLS].values.flatten()
	        # Zero and missing (NaN) cells both become None so Plotly draws a
	        # gap. NaN has to be removed here: left in, it can win the min()
	        # below and make floor(log10(nan)) raise in log mode.
	        y = [None if (pd.isna(val) or val == 0) else val for val in raw]

	        y_non_none = [val for val in y if val is not None]
	        if y_non_none:
	            cur_min = min(y_non_none)
	            if min_y_val is None or cur_min < min_y_val:
	                min_y_val = cur_min

	        # Colour and marker follow the model's position in ALL_MODELS so a
	        # model looks the same on every benchmark; the loop index is only
	        # a fallback for names missing from that list.
	        model_idx = ALL_MODELS.index(m) if m in ALL_MODELS else idx
	        color = palette[model_idx % len(palette)]
	        symbol = MARKER_SYMBOLS[model_idx % len(MARKER_SYMBOLS)]
	        fig.add_trace(
	            go.Scatter(
	                x=x_labels,
	                y=y,
	                mode="lines+markers",
	                name=m,
	                line=dict(width=3, color=color),
	                marker=dict(size=6, color=color, symbol=symbol),
	                connectgaps=False,
	            )
	        )

	    if log_scale:
	        # Fall back to 0.1 when nothing plottable (or nothing positive) exists.
	        if min_y_val is None or min_y_val <= 0:
	            min_y_val = 0.1
	        # For a log axis Plotly takes the range as log10 exponents; the
	        # upper bound 2 means 10^2 = 100. Cap the lower exponent at 1 so
	        # the range never collapses when every value is >= 100.
	        lower_exp = min(math.floor(math.log10(min_y_val)), 1)
	        yaxis_range = [lower_exp, 2]
	        yaxis_type = "log"
	    else:
	        yaxis_range = [0, 100]
	        yaxis_type = "linear"

	    fig.update_layout(
	        title=f"{DISPLAY_LABELS.get(bench_key, bench_key.capitalize())} Accuracy Over Time",
	        xaxis_title="Video Duration",
	        yaxis_title="Accuracy (%)",
	        yaxis_type=yaxis_type,
	        yaxis_range=yaxis_range,
	        legend_title="Model",
	        legend=dict(
	            orientation="h",
	            y=-0.25,
	            x=0.5,
	            xanchor="center",
	            tracegroupgap=8,
	            itemwidth=60,
	        ),
	        margin=dict(t=40, r=20, b=80, l=60),
	        template="plotly_dark",
	        font=dict(family="Inter,Helvetica,Arial,sans-serif", size=14),
	        title_font=dict(size=20, family="Inter,Helvetica,Arial,sans-serif", color="white"),
	        xaxis=dict(gridcolor="rgba(255,255,255,0.15)"),
	        yaxis=dict(gridcolor="rgba(255,255,255,0.15)"),
	        hoverlabel=dict(bgcolor="#1e1e1e", font_color="#eeeeee", bordercolor="#888"),
	    )
	    return fig


	# --------------------------------------------------------------------- #
	# UI #
	# --------------------------------------------------------------------- #

	# Custom stylesheet handed to gr.Blocks(css=CSS) when the UI is built.
	# ".scrollbox" is the class attached to the model checklist via
	# elem_classes, capping its height and making it scroll.
	# NOTE(review): no component in this file sets elem_id="controls", so the
	# "#controls" rules look unused — confirm before removing.
	CSS = """
	#controls {
	padding: 8px 12px;
	}
	.scrollbox {
	max-height: 300px;
	overflow-y: auto;
	}
	body, .gradio-container {
	font-family: 'Inter', 'Helvetica', sans-serif;
	}
	.gradio-container h1, .gradio-container h2 {
	font-weight: 600;
	}

	#controls, .scrollbox {
	background: rgba(255,255,255,0.02);
	border-radius: 6px;
	}

	input[type="checkbox"]:checked {
	accent-color: #FF715E;
	}
	"""

	def available_models(bench: str) -> list[str]:
	    """Return the sorted, de-duplicated model names of benchmark *bench*.

	    *bench* must be a key of ``dfs`` (e.g. ``"qa"``).
	    """
	    model_column = dfs[bench]["Model"]
	    distinct_names = model_column.unique()
	    return sorted(distinct_names)


	def default_models(bench: str) -> list[str]:
	    """Pick the models that start out ticked for benchmark *bench*.

	    The names configured in ``DEFAULT_MODELS`` are used, restricted to those
	    actually available for the benchmark. When none remain (or none were
	    configured), the first six available models are returned instead.
	    """
	    selectable = available_models(bench)
	    preferred = [
	        name for name in DEFAULT_MODELS.get(bench, []) if name in selectable
	    ]
	    # An empty list is falsy, so this falls through to the first-six default.
	    return preferred or selectable[:6]


	# Build the Gradio UI: a title, a benchmark dropdown plus log-scale toggle,
	# the accuracy plot, and a checklist of models to plot.
	with gr.Blocks(theme=gr.themes.Base(), css=CSS) as demo:
	gr.Markdown(
	"""
	# 📈 TimeScope

	How long can your video model keep up?
	"""
	)

	# ---- top controls row ---- #
	with gr.Row():
	# Choices are the display labels; the callbacks lower-case the selected
	# label to recover the benchmark key.
	benchmark_dd = gr.Dropdown(
	label="Type",
	choices=list(DISPLAY_LABELS.values()),
	value=DISPLAY_LABELS["aggregated"],
	scale=1,
	)
	log_cb = gr.Checkbox(
	label="Log-scale Y-axis",
	value=False,
	scale=1,
	)

	# ---- models list and plot ---- #
	# Initial figure: aggregated benchmark, default models, linear Y axis.
	plot_out = gr.Plot(
	render_chart("Aggregated", default_models("aggregated"), False)
	)

	# The "scrollbox" class is styled in CSS to cap the list height and scroll.
	models_cb = gr.CheckboxGroup(
	label="Models",
	choices=available_models("aggregated"),
	value=default_models("aggregated"),
	interactive=True,
	elem_classes=["scrollbox"],
	)

	# ---- dynamic callbacks ---- #
	# Switching benchmark replaces the checklist's choices and selection with
	# that benchmark's available and default models.
	def _update_models(bench: str):
	bench_key = bench.lower()
	opts = available_models(bench_key)
	defaults = default_models(bench_key)
	# Use generic gr.update for compatibility across Gradio versions
	return gr.update(choices=opts, value=defaults)

	benchmark_dd.change(
	fn=_update_models,
	inputs=benchmark_dd,
	outputs=models_cb,
	queue=False,
	)

	# Re-render the plot whenever the benchmark, model selection or log-scale
	# toggle changes.
	# NOTE(review): benchmark_dd now has two change handlers (this one and
	# _update_models). Their relative order is not established here, so
	# render_chart may receive the previous benchmark's model list; it skips
	# names that have no row in the selected table.
	for ctrl in (benchmark_dd, models_cb, log_cb):
	ctrl.change(
	fn=render_chart,
	inputs=[benchmark_dd, models_cb, log_cb],
	outputs=plot_out,
	queue=False,
	)

	# Make legend interaction clearer: click to toggle traces
	# NOTE(review): no code follows the comment above — it appears to be a
	# leftover or an unimplemented TODO.

	# Starts the app as soon as this module is executed (there is no
	# __main__ guard).
	# NOTE(review): share=True asks Gradio for a public share link; presumably
	# unnecessary when the app is hosted on Hugging Face Spaces — confirm.
	demo.launch(share=True)