import logging
import random

import gradio as gr
from datasets import load_dataset

# UI label shown in the dropdown -> dataset identifier passed to `load_dataset`.
DATASETS = {
    "Main Dataset": "sumuks/fineweb-10BT-annotated",
    "Ablation Dataset": "sumuks/fineweb-10BT-annotated-ablation-1",
}

# Split requested from `load_dataset` for whichever dataset is selected.
SPLIT = "train"

# Names of the row fields read when rendering an example.
ID_COLUMN = "id"
TEXT_COLUMN = "text"
SCORE_COLUMN = "score"
SUMMARY_COLUMN = "summary"
JUSTIFICATION_COLUMN = "justification"
THINKING_COLUMN = "thinking"
MODEL_COLUMN = "annotation_model"
DATE_COLUMN = "annotation_date"

# Mutable module-level state shared by the UI callbacks.
current_dataset = None  # loaded dataset object, or None when nothing is loaded
dataset_name = None  # identifier of the most recent load attempt
seen_ids = set()  # document ids already served by random sampling
def load_selected_dataset(selected_dataset):
    """Load the dataset chosen in the UI and make it the active one.

    On a valid selection this rebinds the module-level ``dataset_name``,
    resets ``seen_ids`` to an empty set, and stores the loaded dataset in
    ``current_dataset`` (or ``None`` if loading fails).

    Args:
        selected_dataset: A key of ``DATASETS`` (the dropdown label).

    Returns:
        A status string: "✅ ..." with the number of loaded examples on
        success, or "❌ ..." explaining why nothing was loaded.
    """
    global current_dataset, dataset_name, seen_ids

    # Reject unknown/empty selections with a status message instead of an
    # unhandled KeyError; previously loaded state is left untouched.
    try:
        repo_id = DATASETS[selected_dataset]
    except (KeyError, TypeError):
        return f"❌ Unknown dataset selection: {selected_dataset!r}"

    dataset_name = repo_id
    seen_ids = set()

    try:
        current_dataset = load_dataset(dataset_name, split=SPLIT)
        return f"✅ Loaded {len(current_dataset)} examples from {dataset_name}"
    except Exception as e:
        # UI boundary: the failure is reported in the status area, and the
        # traceback is kept in the server log for debugging.
        logging.getLogger(__name__).exception(
            "Failed to load dataset %s", dataset_name
        )
        current_dataset = None
        return f"❌ Failed to load {dataset_name}: {str(e)}"
def _format_scored_example(item, show_details):
    """Render one dataset row (accessed via ``.get``) as a Markdown snippet."""
    example_id = item.get(ID_COLUMN, "Unknown")
    parts = [f"**Document ID:** {example_id}\n\n"]

    if show_details:
        labelled_columns = (
            ("Summary", SUMMARY_COLUMN),
            ("Justification", JUSTIFICATION_COLUMN),
            ("Thinking Process", THINKING_COLUMN),
        )
        for label, column in labelled_columns:
            value = item.get(column, "")
            if value:
                parts.append(f"**{label}:** {value}\n\n")

        # The date is only shown alongside a non-empty model name.
        model = item.get(MODEL_COLUMN, "")
        if model:
            date = item.get(DATE_COLUMN, "")
            parts.append(f"**Model:** {model} | **Date:** {date}\n\n")

    text = item.get(TEXT_COLUMN, "")
    parts.append(f"**Text:**\n{text}\n\n---\n")
    return "".join(parts)


def get_examples_by_score(score: int, n_examples: int = 5, show_details: bool = False):
    """Return Markdown for a random sample of examples with the given score.

    Args:
        score: Value the ``SCORE_COLUMN`` field must equal.
        n_examples: Maximum number of examples to show. Coerced to ``int``
            and clamped to ``[0, number of matching rows]``.
        show_details: When true, include summary, justification, thinking
            and model/date fields that are non-empty.

    Returns:
        The rendered examples joined by newlines, or a short message when no
        dataset is loaded or no row has the requested score.
    """
    if current_dataset is None:
        return "Please select and load a dataset first."

    subset = current_dataset.filter(lambda x: x.get(SCORE_COLUMN) == score)
    total_matching = len(subset)
    if total_matching == 0:
        return "No examples found for this score."

    # The UI slider may hand over a float, and `random.sample` needs a
    # non-negative integer count; coerce and clamp before sampling.
    n = max(0, min(total_matching, int(n_examples)))
    random_indices = random.sample(range(total_matching), n)

    return "\n".join(
        _format_scored_example(subset[idx], show_details) for idx in random_indices
    )
def get_random_unseen_example(show_details: bool = False):
    """Return Markdown for a random example not yet recorded in ``seen_ids``.

    The chosen document id is added to the module-level ``seen_ids``. Once
    every id has been served, ``seen_ids`` is cleared and sampling starts
    over from the full dataset.

    Args:
        show_details: When true, include summary, justification, thinking
            and model/date fields that are non-empty.

    Returns:
        The rendered example, or a short message when no dataset is loaded
        or the dataset has no rows.
    """
    if current_dataset is None:
        return "Please select and load a dataset first."

    # Read the id column once and keep it as a plain list so positions can be
    # looked up directly, with no second scan to locate the row.
    # NOTE(review): list() also guards against column access returning a
    # non-list sequence in some `datasets` releases - confirm.
    ids = list(current_dataset[ID_COLUMN])
    if not ids:
        return "No examples available in dataset."

    unseen_positions = [
        pos for pos, doc_id in enumerate(ids) if doc_id not in seen_ids
    ]
    if not unseen_positions:
        # Everything has been shown already: begin a fresh pass.
        seen_ids.clear()
        unseen_positions = range(len(ids))

    item_idx = random.choice(unseen_positions)
    random_id = ids[item_idx]
    seen_ids.add(random_id)
    item = current_dataset[item_idx]

    score = item.get(SCORE_COLUMN, "N/A")
    parts = [f"**Document ID:** {random_id} | **Score:** {score}\n\n"]

    if show_details:
        labelled_columns = (
            ("Summary", SUMMARY_COLUMN),
            ("Justification", JUSTIFICATION_COLUMN),
            ("Thinking Process", THINKING_COLUMN),
        )
        for label, column in labelled_columns:
            value = item.get(column, "")
            if value:
                parts.append(f"**{label}:** {value}\n\n")

        # The date is only shown alongside a non-empty model name.
        model = item.get(MODEL_COLUMN, "")
        if model:
            date = item.get(DATE_COLUMN, "")
            parts.append(f"**Model:** {model} | **Date:** {date}\n\n")

    text = item.get(TEXT_COLUMN, "")
    parts.append(f"**Text:**\n{text}")
    return "".join(parts)
def _add_score_tab(score, details_toggle):
    """Create the tab that browses examples with one specific score."""
    with gr.Tab(f"⭐ Score {score}"):
        gr.Markdown(f"Browse examples with quality score {score}")
        with gr.Row():
            count_slider = gr.Slider(
                minimum=1,
                maximum=20,
                value=3,
                step=1,
                label="Number of examples",
            )
            show_btn = gr.Button(f"Show Score {score} Examples", variant="secondary")

        results_md = gr.Markdown("")

        # `s=score` freezes the tab's score at definition time.
        show_btn.click(
            fn=lambda n, details, s=score: get_examples_by_score(s, n, details),
            inputs=[count_slider, details_toggle],
            outputs=results_md,
        )


def build_interface():
    """Assemble the Gradio Blocks app and return it without launching.

    Layout: dataset dropdown + load button, a status line, a global
    "show details" checkbox, one random-sampling tab, and one tab per score
    in ``range(6)``. The main dataset is also loaded on page load.

    NOTE(review): the nesting of components inside rows/tabs was
    reconstructed from a source whose indentation was lost - confirm the
    layout against the running app.
    """
    with gr.Blocks(theme="default", title="Dataset Inspector") as demo:
        gr.Markdown("# 📊 Expert Content Classification Dataset Inspector")

        with gr.Row():
            with gr.Column(scale=2):
                dataset_dropdown = gr.Dropdown(
                    choices=list(DATASETS),
                    label="Select Dataset",
                    value="Main Dataset",
                )
            with gr.Column(scale=1):
                load_btn = gr.Button("Load Dataset", variant="primary")

        status_display = gr.Markdown("")

        with gr.Row():
            show_details_global = gr.Checkbox(
                label="Show annotation details (summary, justification, thinking)",
                value=False,
            )

        with gr.Tabs():
            with gr.Tab("🎲 Random Sampling"):
                gr.Markdown("Sample random examples you haven't seen before")
                with gr.Row():
                    sample_btn = gr.Button(
                        "Get Random Example", variant="secondary", size="lg"
                    )
                random_output = gr.Markdown("")

            for score in range(6):
                _add_score_tab(score, show_details_global)

        # Event wiring for the controls that live outside the score tabs.
        load_btn.click(
            fn=load_selected_dataset,
            inputs=dataset_dropdown,
            outputs=status_display,
        )

        sample_btn.click(
            fn=get_random_unseen_example,
            inputs=show_details_global,
            outputs=random_output,
        )

        # Load the default dataset as soon as the page opens.
        demo.load(
            fn=lambda: load_selected_dataset("Main Dataset"),
            outputs=status_display,
        )

    return demo
# Build and serve the app only when this file is run as a script; importing
# the module just defines the functions and constants.
if __name__ == "__main__":
    demo = build_interface()
    demo.launch()