import gradio as gr
import pandas as pd

###########################################
#                Load Data                #
###########################################

llm_judge_filename = "llm_judge_results.jsonl"
response_generation_filename = "report_generation.jsonl"
response_generation_w_docs_filename = "report_generation_w_docs.jsonl"


def load_filename_into_df(filename):
    df = pd.read_json(filename, lines=True)
    return df

color_map = {
    "Closed-source Instruct": "#4492F7",
    "Open-weight Instruct": "#0856f1",
    "Closed-source Reasoning": "#fac05d",
    "Open-weight Reasoning": "#f59c03",
}

CAPTION_V2 = """**ProfBench**: Over 7,000 brand-new, expert-authored response–criterion pairs spanning 80 professional tasks in PhD STEM (Chemistry, Physics) and MBA Services (Finance, Consulting) domains. \n
ProfBench is a high-quality, text-only dataset that represents the complex reasoning tasks faced by professionals in fields like finance and chemistry. We're not talking about simple Q&A or retrieval-based tasks. We're talking about multi-page assignments that require deep domain knowledge and reasoning. Can AI generate comprehensive reports by applying the nuanced reasoning that a PhD-level physicist/chemist or an MBA-level consultant/financier would have? \n
[Blog](https://huggingface.co/blog/nvidia/profbench) | [Paper](https://arxiv.org/abs/2510.18941) | [Data](https://huggingface.co/datasets/nvidia/ProfBench) | [Code](https://github.com/NVlabs/ProfBench) | [Nemo Evaluator SDK](https://github.com/NVIDIA-NeMo/Evaluator)\n
Want to see your favorite models added? Run the evaluation with the [Nemo Evaluator SDK for scalable evaluation](https://github.com/NVIDIA-NeMo/Evaluator) or the [ProfBench code for quick evaluation](https://github.com/NVlabs/ProfBench), then send us the scores or ping zhilinw/viviennez [at] nvidia.com to run it for you!"""

def color_model_type_column(df, color_map):
    """
    Apply color to the 'Category' column of the DataFrame based on a given color mapping.

    Parameters:
        df (pd.DataFrame): The DataFrame containing the 'Category' column.
        color_map (dict): A dictionary mapping model categories to colors.

    Returns:
        pandas.io.formats.style.Styler: The styled DataFrame.
    """

    # Apply a background color based on the model category
    def apply_color(val):
        color = color_map.get(val, "")  # No background color if the category is not in color_map
        return f"background-color: {color}" if color else ""

    # Number formats for the score, token-count and cost columns
    format_dict = {col: "{:.1f}" for col in df.columns if col not in ["Model", "Category", "Input Tokens", "Output Tokens", "Cost"]}
    format_dict["Response Characters"] = "{:d}"
    format_dict["Input Tokens"] = "{:d}"
    format_dict["Output Tokens"] = "{:d}"
    format_dict[""] = "{:d}"  # rank column inserted by regex_table
    format_dict["Cost"] = "{:.2f}"
    return df.style.map(apply_color, subset=["Category"]).format(format_dict, na_rep="")

def regex_table(dataframe, regex, filter_button, style=True):
    """
    Takes a comma-separated list of regexes and returns only the rows whose 'Model'
    matches, restricted to the model categories selected in the checkbox group.
    """
    # Split the regex string on commas and trim whitespace around each pattern
    regex_list = [x.strip() for x in regex.split(",")]
    # Join the patterns into a single regex, with '|' acting as OR
    combined_regex = "|".join(regex_list)

    # Drop rows belonging to any category that is not selected
    if isinstance(filter_button, (list, str)):
        if "Open-weight" not in filter_button:
            dataframe = dataframe[~dataframe["Category"].str.contains("Open-weight", case=False, na=False)]
        if "Closed-source" not in filter_button:
            dataframe = dataframe[~dataframe["Category"].str.contains("Closed-source", case=False, na=False)]
        if "Reasoning" not in filter_button:
            dataframe = dataframe[~dataframe["Category"].str.contains("Reasoning", case=False, na=False)]
        if "Instruct" not in filter_button:
            dataframe = dataframe[~dataframe["Category"].str.contains("Instruct", case=False, na=False)]

    # Keep only rows whose model name matches the search regex
    data = dataframe[dataframe["Model"].str.contains(combined_regex, case=False, na=False)]
    data = data.sort_values(by="Overall", ascending=False)
    data.reset_index(drop=True, inplace=True)
    # Add an unnamed rank column at the front
    data.insert(0, "", range(1, 1 + len(data)))
    if style:
        # Apply the category colors and number formats
        data = color_model_type_column(data, color_map)
    return data

# Using a string for a predefined color
theme = gr.themes.Default(primary_hue="blue")

#############################################
#                Gradio App                 #
#############################################

with gr.Blocks(theme=theme) as app:
    with gr.Row():
        with gr.Column(scale=6):
            gr.Markdown(CAPTION_V2)
    # One outer tab per leaderboard: Report Generation, LLM Judge, Report Generation w Docs
    with gr.Tabs(elem_id="outer-tabs", elem_classes="tabs-big") as tabs_big:
        with gr.TabItem("Report Generation"):
            with gr.Row():
                with gr.Column(scale=7):
                    gr.Markdown(
                        "Report Generation Leaderboard: LLMs generate reports from the prompt alone, and the reports are then evaluated by the gpt-oss-120b (mixed) judge on the lite dataset (160 samples). \nEvaluation and cost estimation last performed on 11 Dec 2025."
                    )
            with gr.Tabs(elem_id="inner-tabs", elem_classes="tabs-small") as tabs:
                with gr.TabItem("Leaderboard"):
                    with gr.Row():
                        search_1 = gr.Textbox(
                            label="Model Search (delimit with , )",
                            placeholder="Model Search (delimit with , )",
                            show_label=False,
                            scale=8,
                        )
                        model_types_1 = gr.CheckboxGroup(
                            ["Open-weight", "Closed-source", "Reasoning", "Instruct"],
                            value=["Open-weight", "Closed-source", "Reasoning", "Instruct"],
                            show_label=False,
                            scale=8,
                        )
                    with gr.Row():
                        col_types_response_generation = ["number"] + ["markdown"] + ["str"] + ["number"] * 12
                        df_response_generation = load_filename_into_df(response_generation_filename)
                        rewardbench_table_hidden = gr.Dataframe(
                            df_response_generation.values,
                            datatype=col_types_response_generation,
                            headers=df_response_generation.columns.tolist(),
                            visible=False,
                        )
                        rewardbench_table = gr.Dataframe(
                            regex_table(
                                df_response_generation.copy(),
                                "",
                                ["Open-weight", "Closed-source", "Reasoning", "Instruct"],
                            ),
                            datatype=col_types_response_generation,
                            headers=df_response_generation.columns.tolist(),
                            elem_id="response_generation_dataframe",
                            row_count=(25, "dynamic"),
                        )
        with gr.TabItem("LLM Judge"):
            with gr.Row():
                gr.Markdown(
                    "LLM Judge Leaderboard: LLM judges are evaluated on whether they can accurately predict the human-labelled criterion fulfilment across 3 different models (o3, Grok4, R1-0528). We consider not only macro-F1 across 3486 samples but also whether an LLM judge displays bias towards/against any model, using a Bias Index. The Overall score is calculated as Overall F1 - Bias Index. \nEvaluation and cost estimation last performed on 20 Sep 2025."
                )
            with gr.Tabs(elem_id="inner-tabs", elem_classes="tabs-small") as tabs:
                with gr.TabItem("Leaderboard"):
                    with gr.Row():
                        search_1_v1 = gr.Textbox(
                            label="Model Search (delimit with , )",
                            placeholder="Model Search (delimit with , )",
                            show_label=False,
                        )
                        model_types_1_v1 = gr.CheckboxGroup(
                            ["Open-weight", "Closed-source", "Reasoning", "Instruct"],
                            value=["Open-weight", "Closed-source", "Reasoning", "Instruct"],
                            label="Model Types",
                            show_label=False,
                            # info="Which model types to include.",
                        )
                    with gr.Row():
                        col_types_llm_judge = ["number"] + ["markdown"] + ["str"] + ["number"] * 16
                        df_llm_judge = load_filename_into_df(llm_judge_filename)
                        rewardbench_table_hidden_v1 = gr.Dataframe(
                            df_llm_judge.values,
                            datatype=col_types_llm_judge,
                            headers=df_llm_judge.columns.tolist(),
                            visible=False,
                        )
                        rewardbench_table_v1 = gr.Dataframe(
                            regex_table(
                                df_llm_judge.copy(),
                                "",
                                ["Open-weight", "Closed-source", "Reasoning", "Instruct"],
                            ),
                            datatype=col_types_llm_judge,
                            headers=df_llm_judge.columns.tolist(),
                            elem_id="llm_judge_dataframe",
                            row_count=(25, "dynamic"),
                        )
        with gr.TabItem("Report Generation w Docs"):
            with gr.Row():
                with gr.Column(scale=7):
                    gr.Markdown(
                        "Report Generation Leaderboard with Grounding Documents: LLMs generate reports with the human-curated reference documents as context. Results below are based on the full dataset and gpt-oss-120b (mixed) as judge. \nEvaluation and cost estimation last performed on 20 Sep 2025."
                    )
            with gr.Tabs(elem_id="inner-tabs", elem_classes="tabs-small") as tabs:
                with gr.TabItem("Leaderboard"):
                    with gr.Row():
                        search_1_v2 = gr.Textbox(
                            label="Model Search (delimit with , )",
                            placeholder="Model Search (delimit with , )",
                            show_label=False,
                            scale=8,
                        )
                        model_types_1_v2 = gr.CheckboxGroup(
                            ["Open-weight", "Closed-source", "Reasoning", "Instruct"],
                            value=["Open-weight", "Closed-source", "Reasoning", "Instruct"],
                            show_label=False,
                            scale=8,
                        )
                    with gr.Row():
                        col_types_response_generation = ["number"] + ["markdown"] + ["str"] + ["number"] * 12
                        df_response_generation_w_docs = load_filename_into_df(response_generation_w_docs_filename)
                        rewardbench_table_hidden_v2 = gr.Dataframe(
                            df_response_generation_w_docs.values,
                            datatype=col_types_response_generation,
                            headers=df_response_generation_w_docs.columns.tolist(),
                            visible=False,
                        )
                        rewardbench_table_v2 = gr.Dataframe(
                            regex_table(
                                df_response_generation_w_docs.copy(),
                                "",
                                ["Open-weight", "Closed-source", "Reasoning", "Instruct"],
                            ),
                            datatype=col_types_response_generation,
                            headers=df_response_generation_w_docs.columns.tolist(),
                            elem_id="response_generation_w_docs_dataframe",
                            row_count=(25, "dynamic"),
                        )
    # Re-filter each leaderboard table whenever its search box or category checkboxes change
    search_1.change(regex_table, inputs=[rewardbench_table_hidden, search_1, model_types_1], outputs=rewardbench_table)
    search_1_v1.change(
        regex_table, inputs=[rewardbench_table_hidden_v1, search_1_v1, model_types_1_v1], outputs=rewardbench_table_v1
    )
    search_1_v2.change(
        regex_table, inputs=[rewardbench_table_hidden_v2, search_1_v2, model_types_1_v2], outputs=rewardbench_table_v2
    )
    model_types_1.change(
        regex_table, inputs=[rewardbench_table_hidden, search_1, model_types_1], outputs=rewardbench_table
    )
    model_types_1_v1.change(
        regex_table, inputs=[rewardbench_table_hidden_v1, search_1_v1, model_types_1_v1], outputs=rewardbench_table_v1
    )
    model_types_1_v2.change(
        regex_table, inputs=[rewardbench_table_hidden_v2, search_1_v2, model_types_1_v2], outputs=rewardbench_table_v2
    )
    with gr.Row():
        with gr.Accordion("📚 Frequently Asked Questions", open=False):
            faq_box = gr.Textbox(
                value=r"""1. How is the cost calculated? We use the per-token cost from https://openrouter.ai/models multiplied by the total input/output tokens in each evaluation.
2. How can I run the Report Generation Leaderboard with Grounding Documents? This benchmark cannot be run externally at the moment because we are unable to release the required grounding documents. We are working on it.""",
                lines=2,
                label="FAQ",
                elem_id="faq_box",
            )
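            # A minimal sketch (not used by the app) of the cost estimate described in the FAQ
            # above: per-million-token prices from https://openrouter.ai/models multiplied by
            # the total input/output tokens of an evaluation run. The function name and price
            # arguments are illustrative assumptions, not part of the leaderboard pipeline.
            def _estimate_cost_usd(input_tokens, output_tokens, price_in_per_mtok, price_out_per_mtok):
                # Prices are quoted in USD per million tokens
                return (input_tokens / 1e6) * price_in_per_mtok + (output_tokens / 1e6) * price_out_per_mtok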
    with gr.Row():
        with gr.Accordion("📚 Understand the Metrics", open=False):
            metrics_box = gr.Textbox(
                value=r"""Response Generation (w Docs): We first generate the response. Then we grade the response against the human-annotated rubrics. Finally, we calculate the proportion of rubrics satisfied by each response, weighted by their criterion weight, to derive a score for each response.
LLM Judge: We calculate the macro-F1 of the LLM-judge-predicted criterion fulfilment against the human-labelled criterion fulfilment to get Overall F1. We then calculate the bias for each model by taking the mean of the predicted fulfilment minus the mean of the human-labelled fulfilment. We calculate the Bias Index as max(bias) - min(bias) across models. Overall is calculated as Overall F1 - Bias Index.""",
                lines=4,
                label="Metrics",
                elem_id="metrics_box",
            )
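            # Minimal sketches (not used by the app) of the metrics described above; the
            # function names, argument layout, and the sklearn dependency are illustrative
            # assumptions rather than the official scoring code.
            #
            # Response Generation: weighted proportion of rubric criteria satisfied by a response.
            def _sketch_response_score(fulfilled, weights):
                # `fulfilled`: 0/1 judge verdict per criterion; `weights`: the criterion weights
                return sum(f * w for f, w in zip(fulfilled, weights)) / sum(weights)

            # LLM Judge: Overall = macro-F1 of predicted vs. human-labelled fulfilment,
            # minus the Bias Index (max minus min of per-model bias).
            def _sketch_llm_judge_overall(pred, human):
                # `pred`/`human`: dicts mapping each graded model (e.g. "o3") to lists of 0/1 labels
                from sklearn.metrics import f1_score  # assumed available; only needed for this sketch

                models = list(human)
                human_all = [y for m in models for y in human[m]]
                pred_all = [y for m in models for y in pred[m]]
                overall_f1 = f1_score(human_all, pred_all, average="macro")
                bias = {m: sum(pred[m]) / len(pred[m]) - sum(human[m]) / len(human[m]) for m in models}
                bias_index = max(bias.values()) - min(bias.values())
                return overall_f1 - bias_index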
    with gr.Row():
        with gr.Accordion("📚 Citation and Credits", open=False):
            citation_button = gr.Textbox(
                value=r"""@misc{wang2025profbenchmultidomainrubricsrequiring,
      title={ProfBench: Multi-Domain Rubrics requiring Professional Knowledge to Answer and Judge},
      author={Zhilin Wang and Jaehun Jung and Ximing Lu and Shizhe Diao and Ellie Evans and Jiaqi Zeng and Pavlo Molchanov and Yejin Choi and Jan Kautz and Yi Dong},
      year={2025},
      eprint={2510.18941},
      archivePrefix={arXiv},
      primaryClass={cs.CL},
      url={https://arxiv.org/abs/2510.18941},
}""",
                lines=10,
                label="If you find the results helpful, please cite the following.",
                elem_id="citation-button",
                show_copy_button=True,
            )
    gr.Textbox("Leaderboard adapted from allenai/reward-bench", label="Leaderboard credits")

app.launch()  # previously chained .queue() before launch; it does not appear to be necessary