| bootstrapping for stddev: perplexity | |
| { | |
| "results": { | |
| "arc_challenge": { | |
| "acc,none": 0.1757679180887372, | |
| "acc_stderr,none": 0.011122850863120485, | |
| "acc_norm,none": 0.21843003412969283, | |
| "acc_norm_stderr,none": 0.012074291605700975 | |
| }, | |
| "arc_easy": { | |
| "acc,none": 0.37542087542087543, | |
| "acc_stderr,none": 0.00993621852711428, | |
| "acc_norm,none": 0.3522727272727273, | |
| "acc_norm_stderr,none": 0.009801753933112785 | |
| }, | |
| "boolq": { | |
| "acc,none": 0.5886850152905199, | |
| "acc_stderr,none": 0.00860639542630921 | |
| }, | |
| "hellaswag": { | |
| "acc,none": 0.2666799442342163, | |
| "acc_stderr,none": 0.004413198640053973, | |
| "acc_norm,none": 0.27384983071101376, | |
| "acc_norm_stderr,none": 0.004450214826707207 | |
| }, | |
| "lambada_openai": { | |
| "perplexity,none": 130.96389727138103, | |
| "perplexity_stderr,none": 5.501211486155379, | |
| "acc,none": 0.22705220260042694, | |
| "acc_stderr,none": 0.005836466732850104 | |
| }, | |
| "openbookqa": { | |
| "acc,none": 0.126, | |
| "acc_stderr,none": 0.014855617750787541, | |
| "acc_norm,none": 0.254, | |
| "acc_norm_stderr,none": 0.01948659680164337 | |
| }, | |
| "piqa": { | |
| "acc,none": 0.5984766050054406, | |
| "acc_stderr,none": 0.011437324373397844, | |
| "acc_norm,none": 0.5919477693144722, | |
| "acc_norm_stderr,none": 0.011466872778651264 | |
| }, | |
| "sciq": { | |
| "acc,none": 0.64, | |
| "acc_stderr,none": 0.015186527932040117, | |
| "acc_norm,none": 0.564, | |
| "acc_norm_stderr,none": 0.015689173023144064 | |
| }, | |
| "wikitext": { | |
| "word_perplexity,none": 112.6458354552029, | |
| "byte_perplexity,none": 2.1788907860475493, | |
| "bits_per_byte,none": 1.1235938851292067 | |
| }, | |
| "winogrande": { | |
| "acc,none": 0.5295974743488555, | |
| "acc_stderr,none": 0.014027843827840085 | |
| } | |
| }, | |
| "configs": { | |
| "arc_challenge": { | |
| "task": "arc_challenge", | |
| "group": [ | |
| "ai2_arc", | |
| "multiple_choice" | |
| ], | |
| "dataset_path": "ai2_arc", | |
| "dataset_name": "ARC-Challenge", | |
| "training_split": "train", | |
| "validation_split": "validation", | |
| "test_split": "test", | |
| "doc_to_text": "Question: {{question}}\nAnswer:", | |
| "doc_to_target": "{{choices.label.index(answerKey)}}", | |
| "doc_to_choice": "{{choices.text}}", | |
| "description": "", | |
| "target_delimiter": " ", | |
| "fewshot_delimiter": "\n\n", | |
| "num_fewshot": 0, | |
| "metric_list": [ | |
| { | |
| "metric": "acc", | |
| "aggregation": "mean", | |
| "higher_is_better": true | |
| }, | |
| { | |
| "metric": "acc_norm", | |
| "aggregation": "mean", | |
| "higher_is_better": true | |
| } | |
| ], | |
| "output_type": "multiple_choice", | |
| "repeats": 1, | |
| "should_decontaminate": true, | |
| "doc_to_decontamination_query": "Question: {{question}}\nAnswer:" | |
| }, | |
| "arc_easy": { | |
| "task": "arc_easy", | |
| "group": [ | |
| "ai2_arc", | |
| "multiple_choice" | |
| ], | |
| "dataset_path": "ai2_arc", | |
| "dataset_name": "ARC-Easy", | |
| "training_split": "train", | |
| "validation_split": "validation", | |
| "test_split": "test", | |
| "doc_to_text": "Question: {{question}}\nAnswer:", | |
| "doc_to_target": "{{choices.label.index(answerKey)}}", | |
| "doc_to_choice": "{{choices.text}}", | |
| "description": "", | |
| "target_delimiter": " ", | |
| "fewshot_delimiter": "\n\n", | |
| "num_fewshot": 0, | |
| "metric_list": [ | |
| { | |
| "metric": "acc", | |
| "aggregation": "mean", | |
| "higher_is_better": true | |
| }, | |
| { | |
| "metric": "acc_norm", | |
| "aggregation": "mean", | |
| "higher_is_better": true | |
| } | |
| ], | |
| "output_type": "multiple_choice", | |
| "repeats": 1, | |
| "should_decontaminate": true, | |
| "doc_to_decontamination_query": "Question: {{question}}\nAnswer:" | |
| }, | |
| "boolq": { | |
| "task": "boolq", | |
| "group": [ | |
| "super-glue-lm-eval-v1" | |
| ], | |
| "dataset_path": "super_glue", | |
| "dataset_name": "boolq", | |
| "training_split": "train", | |
| "validation_split": "validation", | |
| "doc_to_text": "{{passage}}\nQuestion: {{question}}?\nAnswer:", | |
| "doc_to_target": "label", | |
| "doc_to_choice": [ | |
| "no", | |
| "yes" | |
| ], | |
| "description": "", | |
| "target_delimiter": " ", | |
| "fewshot_delimiter": "\n\n", | |
| "num_fewshot": 0, | |
| "metric_list": [ | |
| { | |
| "metric": "acc" | |
| } | |
| ], | |
| "output_type": "multiple_choice", | |
| "repeats": 1, | |
| "should_decontaminate": true, | |
| "doc_to_decontamination_query": "passage" | |
| }, | |
| "hellaswag": { | |
| "task": "hellaswag", | |
| "group": [ | |
| "multiple_choice" | |
| ], | |
| "dataset_path": "hellaswag", | |
| "training_split": "train", | |
| "validation_split": "validation", | |
| "doc_to_text": "{% set text = activity_label ~ ': ' ~ ctx_a ~ ' ' ~ ctx_b.capitalize() %}{{text|trim|replace(' [title]', '. ')|regex_replace('\\[.*?\\]', '')|replace(' ', ' ')}}", | |
| "doc_to_target": "{{label}}", | |
| "doc_to_choice": "{{endings|map('trim')|map('replace', ' [title]', '. ')|map('regex_replace', '\\[.*?\\]', '')|map('replace', ' ', ' ')|list}}", | |
| "description": "", | |
| "target_delimiter": " ", | |
| "fewshot_delimiter": "\n\n", | |
| "num_fewshot": 0, | |
| "metric_list": [ | |
| { | |
| "metric": "acc", | |
| "aggregation": "mean", | |
| "higher_is_better": true | |
| }, | |
| { | |
| "metric": "acc_norm", | |
| "aggregation": "mean", | |
| "higher_is_better": true | |
| } | |
| ], | |
| "output_type": "multiple_choice", | |
| "repeats": 1, | |
| "should_decontaminate": false | |
| }, | |
| "lambada_openai": { | |
| "task": "lambada_openai", | |
| "group": [ | |
| "lambada", | |
| "loglikelihood", | |
| "perplexity" | |
| ], | |
| "dataset_path": "EleutherAI/lambada_openai", | |
| "dataset_name": "default", | |
| "test_split": "test", | |
| "template_aliases": "", | |
| "doc_to_text": "{{text.split(' ')[:-1]|join(' ')}}", | |
| "doc_to_target": "{{' '+text.split(' ')[-1]}}", | |
| "description": "", | |
| "target_delimiter": " ", | |
| "fewshot_delimiter": "\n\n", | |
| "num_fewshot": 0, | |
| "metric_list": [ | |
| { | |
| "metric": "perplexity", | |
| "aggregation": "perplexity", | |
| "higher_is_better": false | |
| }, | |
| { | |
| "metric": "acc", | |
| "aggregation": "mean", | |
| "higher_is_better": true | |
| } | |
| ], | |
| "output_type": "loglikelihood", | |
| "repeats": 1, | |
| "should_decontaminate": true, | |
| "doc_to_decontamination_query": "{{text}}" | |
| }, | |
| "openbookqa": { | |
| "task": "openbookqa", | |
| "group": [ | |
| "multiple_choice" | |
| ], | |
| "dataset_path": "openbookqa", | |
| "dataset_name": "main", | |
| "training_split": "train", | |
| "validation_split": "validation", | |
| "test_split": "test", | |
| "doc_to_text": "question_stem", | |
| "doc_to_target": "{{choices.label.index(answerKey.lstrip())}}", | |
| "doc_to_choice": "{{choices.text}}", | |
| "description": "", | |
| "target_delimiter": " ", | |
| "fewshot_delimiter": "\n\n", | |
| "num_fewshot": 0, | |
| "metric_list": [ | |
| { | |
| "metric": "acc", | |
| "aggregation": "mean", | |
| "higher_is_better": true | |
| }, | |
| { | |
| "metric": "acc_norm", | |
| "aggregation": "mean", | |
| "higher_is_better": true | |
| } | |
| ], | |
| "output_type": "multiple_choice", | |
| "repeats": 1, | |
| "should_decontaminate": true, | |
| "doc_to_decontamination_query": "question_stem" | |
| }, | |
| "piqa": { | |
| "task": "piqa", | |
| "group": [ | |
| "multiple_choice" | |
| ], | |
| "dataset_path": "piqa", | |
| "training_split": "train", | |
| "validation_split": "validation", | |
| "doc_to_text": "Question: {{goal}}\nAnswer:", | |
| "doc_to_target": "label", | |
| "doc_to_choice": "{{[sol1, sol2]}}", | |
| "description": "", | |
| "target_delimiter": " ", | |
| "fewshot_delimiter": "\n\n", | |
| "num_fewshot": 0, | |
| "metric_list": [ | |
| { | |
| "metric": "acc", | |
| "aggregation": "mean", | |
| "higher_is_better": true | |
| }, | |
| { | |
| "metric": "acc_norm", | |
| "aggregation": "mean", | |
| "higher_is_better": true | |
| } | |
| ], | |
| "output_type": "multiple_choice", | |
| "repeats": 1, | |
| "should_decontaminate": true, | |
| "doc_to_decontamination_query": "goal" | |
| }, | |
| "sciq": { | |
| "task": "sciq", | |
| "group": [ | |
| "multiple_choice" | |
| ], | |
| "dataset_path": "sciq", | |
| "training_split": "train", | |
| "validation_split": "validation", | |
| "test_split": "test", | |
| "doc_to_text": "{{support.lstrip()}}\nQuestion: {{question}}\nAnswer:", | |
| "doc_to_target": 3, | |
| "doc_to_choice": "{{[distractor1, distractor2, distractor3, correct_answer]}}", | |
| "description": "", | |
| "target_delimiter": " ", | |
| "fewshot_delimiter": "\n\n", | |
| "num_fewshot": 0, | |
| "metric_list": [ | |
| { | |
| "metric": "acc", | |
| "aggregation": "mean", | |
| "higher_is_better": true | |
| }, | |
| { | |
| "metric": "acc_norm", | |
| "aggregation": "mean", | |
| "higher_is_better": true | |
| } | |
| ], | |
| "output_type": "multiple_choice", | |
| "repeats": 1, | |
| "should_decontaminate": true, | |
| "doc_to_decontamination_query": "{{support}} {{question}}" | |
| }, | |
| "wikitext": { | |
| "task": "wikitext", | |
| "group": [ | |
| "perplexity", | |
| "loglikelihood_rolling" | |
| ], | |
| "dataset_path": "EleutherAI/wikitext_document_level", | |
| "dataset_name": "wikitext-2-raw-v1", | |
| "training_split": "train", | |
| "validation_split": "validation", | |
| "test_split": "test", | |
| "template_aliases": "", | |
| "doc_to_text": "", | |
| "doc_to_target": "<function wikitext_detokenizer at 0x7f8221504040>", | |
| "description": "", | |
| "target_delimiter": " ", | |
| "fewshot_delimiter": "\n\n", | |
| "num_fewshot": 0, | |
| "metric_list": [ | |
| { | |
| "metric": "word_perplexity" | |
| }, | |
| { | |
| "metric": "byte_perplexity" | |
| }, | |
| { | |
| "metric": "bits_per_byte" | |
| } | |
| ], | |
| "output_type": "loglikelihood_rolling", | |
| "repeats": 1, | |
| "should_decontaminate": true, | |
| "doc_to_decontamination_query": "{{page}}" | |
| }, | |
| "winogrande": { | |
| "task": "winogrande", | |
| "dataset_path": "winogrande", | |
| "dataset_name": "winogrande_xl", | |
| "training_split": "train", | |
| "validation_split": "validation", | |
| "doc_to_text": "<function doc_to_text at 0x7f82214d6ef0>", | |
| "doc_to_target": "<function doc_to_target at 0x7f82214d7370>", | |
| "doc_to_choice": "<function doc_to_choice at 0x7f82214d75b0>", | |
| "description": "", | |
| "target_delimiter": " ", | |
| "fewshot_delimiter": "\n\n", | |
| "num_fewshot": 0, | |
| "metric_list": [ | |
| { | |
| "metric": "acc", | |
| "aggregation": "mean", | |
| "higher_is_better": true | |
| } | |
| ], | |
| "output_type": "multiple_choice", | |
| "repeats": 1, | |
| "should_decontaminate": false | |
| } | |
| }, | |
| "versions": { | |
| "arc_challenge": "Yaml", | |
| "arc_easy": "Yaml", | |
| "boolq": "Yaml", | |
| "hellaswag": "Yaml", | |
| "lambada_openai": "Yaml", | |
| "openbookqa": "Yaml", | |
| "piqa": "Yaml", | |
| "sciq": "Yaml", | |
| "wikitext": "Yaml", | |
| "winogrande": "Yaml" | |
| }, | |
| "config": { | |
| "model": "hf", | |
| "model_args": "pretrained=EleutherAI/pythia-70m", | |
| "num_fewshot": 0, | |
| "batch_size": 16, | |
| "batch_sizes": [], | |
| "device": "cuda:0", | |
| "use_cache": null, | |
| "limit": null, | |
| "bootstrap_iters": 100000 | |
| }, | |
| "git_hash": "4e44f0a" | |
| } | |
| hf (pretrained=EleutherAI/pythia-70m), limit: None, num_fewshot: 0, batch_size: 16 | |
| | Task |Version|Filter| Metric | Value | |Stderr| | |
| |--------------|-------|------|---------------|-------:|---|-----:| | |
| |arc_challenge |Yaml |none |acc | 0.1758|± |0.0111| | |
| | | |none |acc_norm | 0.2184|± |0.0121| | |
| |arc_easy |Yaml |none |acc | 0.3754|± |0.0099| | |
| | | |none |acc_norm | 0.3523|± |0.0098| | |
| |boolq |Yaml |none |acc | 0.5887|± |0.0086| | |
| |hellaswag |Yaml |none |acc | 0.2667|± |0.0044| | |
| | | |none |acc_norm | 0.2738|± |0.0045| | |
| |lambada_openai|Yaml |none |perplexity |130.9639|± |5.5012| | |
| | | |none |acc | 0.2271|± |0.0058| | |
| |openbookqa |Yaml |none |acc | 0.1260|± |0.0149| | |
| | | |none |acc_norm | 0.2540|± |0.0195| | |
| |piqa |Yaml |none |acc | 0.5985|± |0.0114| | |
| | | |none |acc_norm | 0.5919|± |0.0115| | |
| |sciq |Yaml |none |acc | 0.6400|± |0.0152| | |
| | | |none |acc_norm | 0.5640|± |0.0157| | |
| |wikitext |Yaml |none |word_perplexity|112.6458| | | | |
| | | |none |byte_perplexity| 2.1789| | | | |
| | | |none |bits_per_byte | 1.1236| | | | |
| |winogrande |Yaml |none |acc | 0.5296|± |0.0140| | |