Upload 8 files
- config.json +31 -0
- log_bs32_lr3e-05_20221118_060236_793692.txt +639 -0
- pytorch_model.bin +3 -0
- result.txt +19 -0
- special_tokens_map.json +1 -0
- tokenizer.json +0 -0
- tokenizer_config.json +1 -0
- vocab.txt +0 -0
config.json
ADDED
@@ -0,0 +1,31 @@
+{
+  "_name_or_path": "/home.local/jianwei/workspace/archive/SparseOptimizer/output/Layer_7_12_Hid_160_768_Head_10_12_IMRatio_3.5",
+  "architectures": [
+    "BertForSequenceClassification"
+  ],
+  "attention_probs_dropout_prob": 0.1,
+  "classifier_dropout": null,
+  "embedding_size": 160,
+  "finetuning_task": "rte",
+  "gradient_checkpointing": false,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 160,
+  "initializer_range": 0.02,
+  "intermediate_size": 560,
+  "layer_norm_eps": 1e-12,
+  "max_position_embeddings": 512,
+  "model_type": "bert",
+  "num_attention_heads": 10,
+  "num_hidden_layers": 7,
+  "output_intermediate": true,
+  "output_past": true,
+  "pad_token_id": 0,
+  "position_embedding_type": "absolute",
+  "problem_type": "single_label_classification",
+  "torch_dtype": "float32",
+  "transformers_version": "4.17.0",
+  "type_vocab_size": 2,
+  "use_cache": true,
+  "vocab_size": 30522
+}
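
Note: the config above describes a compact 7-layer BERT (hidden size 160, 10 attention heads, intermediate size 560) fine-tuned for RTE. As a minimal sketch, it can be loaded with stock transformers once the files sit in a local directory; "./model" is a placeholder, and "output_intermediate" is a nonstandard key that transformers simply carries along as an extra config attribute.

    from transformers import AutoConfig, AutoModelForSequenceClassification

    model_dir = "./model"  # placeholder: directory holding config.json and pytorch_model.bin

    config = AutoConfig.from_pretrained(model_dir)
    print(config.num_hidden_layers, config.hidden_size, config.num_attention_heads)
    # -> 7 160 10, the compact architecture described by this config

    # RTE is binary single-label classification, hence num_labels=2.
    model = AutoModelForSequenceClassification.from_pretrained(model_dir, num_labels=2)
    model.eval()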
log_bs32_lr3e-05_20221118_060236_793692.txt
ADDED
@@ -0,0 +1,639 @@
+------------> log file ==runs2/rte/1/log_bs32_lr3e-05_20221118_060236_793692.txt
+Namespace(aug_train=False, data_dir='/home.local/jianwei/datasets/nlp/glue_data/RTE', do_eval=False, early_stop=True, early_stop_metric='accuracy', eval_step=120, gradient_accumulation_steps=1, learning_rate=3e-05, local_rank=0, lr_scheduler_type=<SchedulerType.CONSTANT_WITH_WARMUP: 'constant_with_warmup'>, max_length=128, max_train_steps=None, model_name_or_path='/home.local/jianwei/workspace/archive/SparseOptimizer/output/Layer_7_12_Hid_160_768_Head_10_12_IMRatio_3.5', num_train_epochs=30, num_warmup_steps=0, output_dir='runs2/rte/1', pad_to_max_length=False, per_device_eval_batch_size=32, per_device_train_batch_size=32, print_step=5, save_last=False, seed=None, task_name='rte', train_file=None, use_slow_tokenizer=False, validation_file=None, weight_decay=0.0)
+Distributed environment: NO
+Num processes: 1
+Process index: 0
+Local process index: 0
+Device: cuda
+Mixed precision type: fp16
+
+Sample 595 of the training set: (tensor([ 101, 11929, 1010, 5553, 1012, 2570, 1006, 8418, 25311, 13860,
+3388, 1007, 1011, 1011, 2019, 18410, 2140, 6187, 24887, 2080,
+11183, 1010, 1037, 2280, 3539, 2704, 1010, 2180, 5978, 1005,
+1055, 4883, 2602, 2006, 4465, 1012, 102, 2047, 5077, 3539,
+2704, 2003, 2700, 1012, 102, 0, 0, 0, 0, 0,
+0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+0, 0, 0, 0, 0, 0, 0, 0]), tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0,
+0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+0, 0, 0, 0, 0, 0, 0, 0]), tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0,
+0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+0, 0, 0, 0, 0, 0, 0, 0]), tensor(1)).
+Sample 2375 of the training set: (tensor([ 101, 1996, 5611, 2390, 2749, 3344, 2041, 1010, 2006, 5095,
+1010, 1037, 6923, 2510, 3169, 2046, 1996, 2225, 2924, 2237,
+1997, 15419, 2378, 1998, 2049, 13141, 3409, 1010, 2334, 9302,
+4216, 2056, 1012, 102, 1996, 5611, 2390, 3344, 2041, 1037,
+6923, 3169, 1999, 15419, 2378, 1012, 102, 0, 0, 0,
+0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+0, 0, 0, 0, 0, 0, 0, 0]), tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
+0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+0, 0, 0, 0, 0, 0, 0, 0]), tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
+0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+0, 0, 0, 0, 0, 0, 0, 0]), tensor(0)).
+Sample 149 of the training set: (tensor([ 101, 2048, 9767, 8461, 2379, 2019, 5499, 2082, 1999, 4501,
+2730, 2809, 2111, 1998, 5229, 4413, 2500, 7483, 1999, 1996,
+6745, 8293, 1997, 4808, 13940, 1996, 2670, 3417, 1997, 15381,
+1012, 102, 2809, 2111, 8461, 2048, 9767, 2379, 2019, 5499,
+2082, 1999, 4501, 1012, 102, 0, 0, 0, 0, 0,
+0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+0, 0, 0, 0, 0, 0, 0, 0]), tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0,
+0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+0, 0, 0, 0, 0, 0, 0, 0]), tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0,
+0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+0, 0, 0, 0, 0, 0, 0, 0]), tensor(1)).
+***** Running training *****
+Num examples = 2490
+Num Epochs = 30
+Instantaneous batch size per device = 32
+Total train batch size (w. parallel, distributed & accumulation) = 32
+Gradient Accumulation steps = 1
+Total optimization steps = 2340
+000005/002340, loss: 0.694824, avg_loss: 0.691177
+000010/002340, loss: 0.707565, avg_loss: 0.693715
+000015/002340, loss: 0.699615, avg_loss: 0.693022
+000020/002340, loss: 0.699615, avg_loss: 0.693939
+000025/002340, loss: 0.699310, avg_loss: 0.694436
+000030/002340, loss: 0.698532, avg_loss: 0.694941
+000035/002340, loss: 0.686935, avg_loss: 0.694372
+000040/002340, loss: 0.696411, avg_loss: 0.694273
+000045/002340, loss: 0.692871, avg_loss: 0.693708
+000050/002340, loss: 0.687256, avg_loss: 0.693756
+000055/002340, loss: 0.701004, avg_loss: 0.693827
+000060/002340, loss: 0.691040, avg_loss: 0.693579
+000065/002340, loss: 0.689056, avg_loss: 0.693324
+000070/002340, loss: 0.696518, avg_loss: 0.693440
+000075/002340, loss: 0.696930, avg_loss: 0.693460
+000080/002340, loss: 0.693802, avg_loss: 0.693340
+000085/002340, loss: 0.688171, avg_loss: 0.693318
+000090/002340, loss: 0.698029, avg_loss: 0.693154
+000095/002340, loss: 0.689453, avg_loss: 0.692949
+000100/002340, loss: 0.690857, avg_loss: 0.692921
+000105/002340, loss: 0.689819, avg_loss: 0.692827
+000110/002340, loss: 0.682220, avg_loss: 0.692768
+000115/002340, loss: 0.700806, avg_loss: 0.692803
+000120/002340, loss: 0.701385, avg_loss: 0.692652
+***** Running dev evaluation *****
+Num examples = 277
+Instantaneous batch size per device = 32
+epoch 1, step 120/2340: {'accuracy': 0.5523465703971119}
+000125/002340, loss: 0.693527, avg_loss: 0.692706
+000130/002340, loss: 0.689957, avg_loss: 0.692658
+000135/002340, loss: 0.685425, avg_loss: 0.692536
+000140/002340, loss: 0.690201, avg_loss: 0.692434
+000145/002340, loss: 0.686600, avg_loss: 0.692396
+000150/002340, loss: 0.678986, avg_loss: 0.692177
+000155/002340, loss: 0.679138, avg_loss: 0.691975
+000160/002340, loss: 0.694275, avg_loss: 0.691769
+000165/002340, loss: 0.692368, avg_loss: 0.691443
+000170/002340, loss: 0.680664, avg_loss: 0.691252
+000175/002340, loss: 0.666016, avg_loss: 0.690698
+000180/002340, loss: 0.671844, avg_loss: 0.690296
+000185/002340, loss: 0.651184, avg_loss: 0.689748
+000190/002340, loss: 0.659752, avg_loss: 0.688919
+000195/002340, loss: 0.662926, avg_loss: 0.688697
+000200/002340, loss: 0.643776, avg_loss: 0.688136
+000205/002340, loss: 0.693794, avg_loss: 0.687406
+000210/002340, loss: 0.716675, avg_loss: 0.686937
+000215/002340, loss: 0.665474, avg_loss: 0.686136
+000220/002340, loss: 0.625298, avg_loss: 0.685308
+000225/002340, loss: 0.656639, avg_loss: 0.685019
+000230/002340, loss: 0.673508, avg_loss: 0.684550
+000235/002340, loss: 0.575394, avg_loss: 0.682954
+000240/002340, loss: 0.615173, avg_loss: 0.681390
+***** Running dev evaluation *****
+Num examples = 277
+Instantaneous batch size per device = 32
+epoch 3, step 240/2340: {'accuracy': 0.5884476534296029}
+000245/002340, loss: 0.566116, avg_loss: 0.679216
+000250/002340, loss: 0.662231, avg_loss: 0.677990
+000255/002340, loss: 0.742844, avg_loss: 0.677457
+000260/002340, loss: 0.744896, avg_loss: 0.677289
+000265/002340, loss: 0.524788, avg_loss: 0.675974
+000270/002340, loss: 0.573128, avg_loss: 0.674871
+000275/002340, loss: 0.698616, avg_loss: 0.674028
+000280/002340, loss: 0.661125, avg_loss: 0.672997
+000285/002340, loss: 0.577705, avg_loss: 0.671527
+000290/002340, loss: 0.529144, avg_loss: 0.669498
+000295/002340, loss: 0.548820, avg_loss: 0.668429
+000300/002340, loss: 0.533775, avg_loss: 0.667589
+000305/002340, loss: 0.724682, avg_loss: 0.666549
+000310/002340, loss: 0.618702, avg_loss: 0.667052
+000315/002340, loss: 0.600662, avg_loss: 0.666212
+000320/002340, loss: 0.560127, avg_loss: 0.665015
+000325/002340, loss: 0.667423, avg_loss: 0.663344
+000330/002340, loss: 0.520096, avg_loss: 0.661692
+000335/002340, loss: 0.589901, avg_loss: 0.659812
+000340/002340, loss: 0.718616, avg_loss: 0.658405
+000345/002340, loss: 0.523731, avg_loss: 0.657693
+000350/002340, loss: 0.597912, avg_loss: 0.656364
+000355/002340, loss: 0.510841, avg_loss: 0.654704
+000360/002340, loss: 0.598392, avg_loss: 0.652629
+***** Running dev evaluation *****
+Num examples = 277
+Instantaneous batch size per device = 32
+epoch 4, step 360/2340: {'accuracy': 0.6137184115523465}
+000365/002340, loss: 0.509396, avg_loss: 0.650652
+000370/002340, loss: 0.625957, avg_loss: 0.649372
+000375/002340, loss: 0.632420, avg_loss: 0.648425
+000380/002340, loss: 0.562641, avg_loss: 0.647222
+000385/002340, loss: 0.649609, avg_loss: 0.645501
+000390/002340, loss: 0.361694, avg_loss: 0.643182
+000395/002340, loss: 0.425430, avg_loss: 0.642246
+000400/002340, loss: 0.577938, avg_loss: 0.640067
+000405/002340, loss: 0.554668, avg_loss: 0.638333
+000410/002340, loss: 0.505466, avg_loss: 0.636457
+000415/002340, loss: 0.531124, avg_loss: 0.634969
+000420/002340, loss: 0.425911, avg_loss: 0.633147
+000425/002340, loss: 0.532368, avg_loss: 0.632082
+000430/002340, loss: 0.569756, avg_loss: 0.630961
+000435/002340, loss: 0.451645, avg_loss: 0.629107
+000440/002340, loss: 0.459530, avg_loss: 0.627486
+000445/002340, loss: 0.380501, avg_loss: 0.625123
+000450/002340, loss: 0.565880, avg_loss: 0.624122
+000455/002340, loss: 0.422201, avg_loss: 0.621911
+000460/002340, loss: 0.671333, avg_loss: 0.620993
+000465/002340, loss: 0.427799, avg_loss: 0.618575
+000470/002340, loss: 0.301590, avg_loss: 0.616753
+000475/002340, loss: 0.517204, avg_loss: 0.614735
+000480/002340, loss: 0.473822, avg_loss: 0.612666
+***** Running dev evaluation *****
+Num examples = 277
+Instantaneous batch size per device = 32
+epoch 6, step 480/2340: {'accuracy': 0.6209386281588448}
+000485/002340, loss: 0.235840, avg_loss: 0.610187
+000490/002340, loss: 0.535803, avg_loss: 0.608769
+000495/002340, loss: 0.447842, avg_loss: 0.606833
+000500/002340, loss: 0.359915, avg_loss: 0.604468
+000505/002340, loss: 0.473944, avg_loss: 0.601928
+000510/002340, loss: 0.487707, avg_loss: 0.600405
+000515/002340, loss: 0.280029, avg_loss: 0.599008
+000520/002340, loss: 0.509848, avg_loss: 0.597484
+000525/002340, loss: 0.646320, avg_loss: 0.596454
+000530/002340, loss: 0.350674, avg_loss: 0.594710
+000535/002340, loss: 0.480106, avg_loss: 0.593436
+000540/002340, loss: 0.560251, avg_loss: 0.593214
+000545/002340, loss: 0.387239, avg_loss: 0.591432
+000550/002340, loss: 0.277430, avg_loss: 0.589320
+000555/002340, loss: 0.280695, avg_loss: 0.587417
+000560/002340, loss: 0.330351, avg_loss: 0.585310
+000565/002340, loss: 0.391579, avg_loss: 0.583662
+000570/002340, loss: 0.280355, avg_loss: 0.582107
+000575/002340, loss: 0.359081, avg_loss: 0.580171
+000580/002340, loss: 0.367201, avg_loss: 0.578450
+000585/002340, loss: 0.430851, avg_loss: 0.577231
+000590/002340, loss: 0.331879, avg_loss: 0.575557
+000595/002340, loss: 0.333700, avg_loss: 0.573829
+000600/002340, loss: 0.309275, avg_loss: 0.571686
+***** Running dev evaluation *****
+Num examples = 277
+Instantaneous batch size per device = 32
+epoch 7, step 600/2340: {'accuracy': 0.6425992779783394}
+000605/002340, loss: 0.461454, avg_loss: 0.570168
+000610/002340, loss: 0.434152, avg_loss: 0.568408
+000615/002340, loss: 0.565701, avg_loss: 0.567013
+000620/002340, loss: 0.281487, avg_loss: 0.564378
+000625/002340, loss: 0.183996, avg_loss: 0.562576
+000630/002340, loss: 0.308249, avg_loss: 0.560548
+000635/002340, loss: 0.492087, avg_loss: 0.558905
+000640/002340, loss: 0.276144, avg_loss: 0.556907
+000645/002340, loss: 0.379016, avg_loss: 0.555011
+000650/002340, loss: 0.257240, avg_loss: 0.553119
+000655/002340, loss: 0.260510, avg_loss: 0.550735
+000660/002340, loss: 0.482807, avg_loss: 0.549067
+000665/002340, loss: 0.313425, avg_loss: 0.547653
+000670/002340, loss: 0.244961, avg_loss: 0.545744
+000675/002340, loss: 0.386663, avg_loss: 0.544380
+000680/002340, loss: 0.137331, avg_loss: 0.541812
+000685/002340, loss: 0.301256, avg_loss: 0.539778
+000690/002340, loss: 0.284186, avg_loss: 0.537928
+000695/002340, loss: 0.521972, avg_loss: 0.536261
+000700/002340, loss: 0.718600, avg_loss: 0.535717
+000705/002340, loss: 0.237306, avg_loss: 0.534266
+000710/002340, loss: 0.164028, avg_loss: 0.532027
+000715/002340, loss: 0.235560, avg_loss: 0.530920
+000720/002340, loss: 0.224425, avg_loss: 0.529428
+***** Running dev evaluation *****
+Num examples = 277
+Instantaneous batch size per device = 32
+epoch 9, step 720/2340: {'accuracy': 0.6462093862815884}
+000725/002340, loss: 0.250054, avg_loss: 0.527996
+000730/002340, loss: 0.213790, avg_loss: 0.526521
+000735/002340, loss: 0.339844, avg_loss: 0.525346
+000740/002340, loss: 0.192316, avg_loss: 0.523399
+000745/002340, loss: 0.322181, avg_loss: 0.521820
+000750/002340, loss: 0.114270, avg_loss: 0.519722
+000755/002340, loss: 0.242498, avg_loss: 0.517846
+000760/002340, loss: 0.234197, avg_loss: 0.515497
+000765/002340, loss: 0.332447, avg_loss: 0.513969
+000770/002340, loss: 0.163693, avg_loss: 0.512496
+000775/002340, loss: 0.260910, avg_loss: 0.511088
+000780/002340, loss: 0.236919, avg_loss: 0.509495
+000785/002340, loss: 0.151022, avg_loss: 0.507580
+000790/002340, loss: 0.489914, avg_loss: 0.506298
+000795/002340, loss: 0.175525, avg_loss: 0.504419
+000800/002340, loss: 0.274471, avg_loss: 0.502310
+000805/002340, loss: 0.308759, avg_loss: 0.500468
+000810/002340, loss: 0.227170, avg_loss: 0.498888
+000815/002340, loss: 0.112951, avg_loss: 0.496910
+000820/002340, loss: 0.168542, avg_loss: 0.495333
+000825/002340, loss: 0.163078, avg_loss: 0.493526
+000830/002340, loss: 0.208418, avg_loss: 0.492144
+000835/002340, loss: 0.204179, avg_loss: 0.490463
+000840/002340, loss: 0.262290, avg_loss: 0.488488
+***** Running dev evaluation *****
+Num examples = 277
+Instantaneous batch size per device = 32
+epoch 10, step 840/2340: {'accuracy': 0.6245487364620939}
+000845/002340, loss: 0.166388, avg_loss: 0.486870
+000850/002340, loss: 0.221429, avg_loss: 0.485510
+000855/002340, loss: 0.376082, avg_loss: 0.484030
+000860/002340, loss: 0.083231, avg_loss: 0.482307
+000865/002340, loss: 0.161541, avg_loss: 0.480355
+000870/002340, loss: 0.180701, avg_loss: 0.478405
+000875/002340, loss: 0.175531, avg_loss: 0.476498
+000880/002340, loss: 0.148172, avg_loss: 0.475174
+000885/002340, loss: 0.110148, avg_loss: 0.473676
+000890/002340, loss: 0.177225, avg_loss: 0.472175
+000895/002340, loss: 0.051785, avg_loss: 0.470479
+000900/002340, loss: 0.239419, avg_loss: 0.469122
+000905/002340, loss: 0.294643, avg_loss: 0.467460
+000910/002340, loss: 0.372546, avg_loss: 0.466119
+000915/002340, loss: 0.160401, avg_loss: 0.464562
+000920/002340, loss: 0.389829, avg_loss: 0.463444
+000925/002340, loss: 0.461596, avg_loss: 0.462050
+000930/002340, loss: 0.169349, avg_loss: 0.460443
+000935/002340, loss: 0.274192, avg_loss: 0.459206
+000940/002340, loss: 0.245536, avg_loss: 0.457409
+000945/002340, loss: 0.124900, avg_loss: 0.455669
+000950/002340, loss: 0.258810, avg_loss: 0.453951
+000955/002340, loss: 0.328007, avg_loss: 0.452289
+000960/002340, loss: 0.243825, avg_loss: 0.450600
+***** Running dev evaluation *****
+Num examples = 277
+Instantaneous batch size per device = 32
+epoch 12, step 960/2340: {'accuracy': 0.6389891696750902}
+000965/002340, loss: 0.201036, avg_loss: 0.449321
+000970/002340, loss: 0.091728, avg_loss: 0.447797
+000975/002340, loss: 0.182425, avg_loss: 0.446324
+000980/002340, loss: 0.159452, avg_loss: 0.444909
+000985/002340, loss: 0.142912, avg_loss: 0.443522
+000990/002340, loss: 0.304327, avg_loss: 0.442004
+000995/002340, loss: 0.117483, avg_loss: 0.440452
+001000/002340, loss: 0.156437, avg_loss: 0.438837
+001005/002340, loss: 0.032182, avg_loss: 0.437682
+001010/002340, loss: 0.063084, avg_loss: 0.436744
+001015/002340, loss: 0.258552, avg_loss: 0.435504
+001020/002340, loss: 0.091414, avg_loss: 0.434340
+001025/002340, loss: 0.100409, avg_loss: 0.432843
+001030/002340, loss: 0.064708, avg_loss: 0.431516
+001035/002340, loss: 0.459350, avg_loss: 0.430340
+001040/002340, loss: 0.195770, avg_loss: 0.428896
+001045/002340, loss: 0.101108, avg_loss: 0.427430
+001050/002340, loss: 0.162723, avg_loss: 0.425868
+001055/002340, loss: 0.170199, avg_loss: 0.424800
+001060/002340, loss: 0.066082, avg_loss: 0.423415
+001065/002340, loss: 0.139599, avg_loss: 0.422219
+001070/002340, loss: 0.089475, avg_loss: 0.420665
+001075/002340, loss: 0.115157, avg_loss: 0.419250
+001080/002340, loss: 0.085939, avg_loss: 0.417821
+***** Running dev evaluation *****
+Num examples = 277
+Instantaneous batch size per device = 32
+epoch 13, step 1080/2340: {'accuracy': 0.6173285198555957}
+001085/002340, loss: 0.138964, avg_loss: 0.416740
+001090/002340, loss: 0.385725, avg_loss: 0.415552
+001095/002340, loss: 0.173466, avg_loss: 0.414612
+001100/002340, loss: 0.101382, avg_loss: 0.413397
+001105/002340, loss: 0.098917, avg_loss: 0.412091
+001110/002340, loss: 0.088198, avg_loss: 0.410518
+001115/002340, loss: 0.039977, avg_loss: 0.409207
+001120/002340, loss: 0.126413, avg_loss: 0.407805
+001125/002340, loss: 0.154641, avg_loss: 0.406540
+001130/002340, loss: 0.221717, avg_loss: 0.405238
+001135/002340, loss: 0.155590, avg_loss: 0.403870
+001140/002340, loss: 0.072533, avg_loss: 0.402521
+001145/002340, loss: 0.148947, avg_loss: 0.401401
+001150/002340, loss: 0.202878, avg_loss: 0.400165
+001155/002340, loss: 0.054971, avg_loss: 0.399305
+001160/002340, loss: 0.058926, avg_loss: 0.398088
+001165/002340, loss: 0.187665, avg_loss: 0.396901
+001170/002340, loss: 0.091442, avg_loss: 0.395624
+001175/002340, loss: 0.339817, avg_loss: 0.394529
+001180/002340, loss: 0.029183, avg_loss: 0.393430
+001185/002340, loss: 0.052091, avg_loss: 0.392348
+001190/002340, loss: 0.175309, avg_loss: 0.391464
+001195/002340, loss: 0.269615, avg_loss: 0.390438
+001200/002340, loss: 0.042982, avg_loss: 0.389416
+***** Running dev evaluation *****
+Num examples = 277
+Instantaneous batch size per device = 32
+epoch 15, step 1200/2340: {'accuracy': 0.6353790613718412}
+001205/002340, loss: 0.029362, avg_loss: 0.388045
+001210/002340, loss: 0.106356, avg_loss: 0.386842
+001215/002340, loss: 0.055282, avg_loss: 0.385720
+001220/002340, loss: 0.025587, avg_loss: 0.384474
+001225/002340, loss: 0.017830, avg_loss: 0.383314
+001230/002340, loss: 0.156192, avg_loss: 0.382166
+001235/002340, loss: 0.017268, avg_loss: 0.381167
+001240/002340, loss: 0.015908, avg_loss: 0.379919
+001245/002340, loss: 0.024442, avg_loss: 0.378661
+001250/002340, loss: 0.016508, avg_loss: 0.377585
+001255/002340, loss: 0.021355, avg_loss: 0.376479
+001260/002340, loss: 0.024076, avg_loss: 0.375165
+001265/002340, loss: 0.202033, avg_loss: 0.374116
+001270/002340, loss: 0.027793, avg_loss: 0.372882
+001275/002340, loss: 0.027369, avg_loss: 0.372247
+001280/002340, loss: 0.021813, avg_loss: 0.371052
+001285/002340, loss: 0.021163, avg_loss: 0.370046
+001290/002340, loss: 0.046603, avg_loss: 0.369336
+001295/002340, loss: 0.076338, avg_loss: 0.368328
+001300/002340, loss: 0.183380, avg_loss: 0.367225
+001305/002340, loss: 0.169317, avg_loss: 0.366140
+001310/002340, loss: 0.020987, avg_loss: 0.365018
+001315/002340, loss: 0.169484, avg_loss: 0.364127
+001320/002340, loss: 0.044023, avg_loss: 0.363106
+***** Running dev evaluation *****
+Num examples = 277
+Instantaneous batch size per device = 32
+epoch 16, step 1320/2340: {'accuracy': 0.6462093862815884}
+001325/002340, loss: 0.146640, avg_loss: 0.361943
+001330/002340, loss: 0.053370, avg_loss: 0.360778
+001335/002340, loss: 0.024849, avg_loss: 0.359785
+001340/002340, loss: 0.040356, avg_loss: 0.358545
+001345/002340, loss: 0.216520, avg_loss: 0.357564
+001350/002340, loss: 0.020188, avg_loss: 0.356442
+001355/002340, loss: 0.050854, avg_loss: 0.355434
+001360/002340, loss: 0.013922, avg_loss: 0.354336
+001365/002340, loss: 0.034302, avg_loss: 0.353537
+001370/002340, loss: 0.083984, avg_loss: 0.352530
+001375/002340, loss: 0.044313, avg_loss: 0.351671
+001380/002340, loss: 0.197178, avg_loss: 0.350656
+001385/002340, loss: 0.087372, avg_loss: 0.349721
+001390/002340, loss: 0.122292, avg_loss: 0.348657
+001395/002340, loss: 0.161705, avg_loss: 0.347780
+001400/002340, loss: 0.014310, avg_loss: 0.346943
+001405/002340, loss: 0.096345, avg_loss: 0.345930
+001410/002340, loss: 0.142292, avg_loss: 0.345120
+001415/002340, loss: 0.016984, avg_loss: 0.344193
+001420/002340, loss: 0.014843, avg_loss: 0.343171
+001425/002340, loss: 0.054250, avg_loss: 0.342329
+001430/002340, loss: 0.049341, avg_loss: 0.341417
+001435/002340, loss: 0.033567, avg_loss: 0.340340
+001440/002340, loss: 0.108241, avg_loss: 0.339508
+***** Running dev evaluation *****
+Num examples = 277
+Instantaneous batch size per device = 32
+epoch 18, step 1440/2340: {'accuracy': 0.6137184115523465}
+001445/002340, loss: 0.148780, avg_loss: 0.338643
+001450/002340, loss: 0.121979, avg_loss: 0.337871
+001455/002340, loss: 0.015762, avg_loss: 0.337010
+001460/002340, loss: 0.197943, avg_loss: 0.336178
+001465/002340, loss: 0.019593, avg_loss: 0.335371
+001470/002340, loss: 0.129545, avg_loss: 0.334404
+001475/002340, loss: 0.015238, avg_loss: 0.333483
+001480/002340, loss: 0.016869, avg_loss: 0.332625
+001485/002340, loss: 0.011418, avg_loss: 0.331565
+001490/002340, loss: 0.338315, avg_loss: 0.330893
+001495/002340, loss: 0.288740, avg_loss: 0.330484
+001500/002340, loss: 0.148870, avg_loss: 0.329575
+001505/002340, loss: 0.013757, avg_loss: 0.328768
+001510/002340, loss: 0.016786, avg_loss: 0.327894
+001515/002340, loss: 0.013239, avg_loss: 0.326989
+001520/002340, loss: 0.024581, avg_loss: 0.326006
+001525/002340, loss: 0.017539, avg_loss: 0.325226
+001530/002340, loss: 0.067678, avg_loss: 0.324287
+001535/002340, loss: 0.024253, avg_loss: 0.323389
+001540/002340, loss: 0.077925, avg_loss: 0.322495
+001545/002340, loss: 0.024680, avg_loss: 0.321567
+001550/002340, loss: 0.012920, avg_loss: 0.320824
+001555/002340, loss: 0.023837, avg_loss: 0.320000
+001560/002340, loss: 0.221982, avg_loss: 0.319304
+***** Running dev evaluation *****
+Num examples = 277
+Instantaneous batch size per device = 32
+epoch 19, step 1560/2340: {'accuracy': 0.6137184115523465}
+001565/002340, loss: 0.013699, avg_loss: 0.318449
+001570/002340, loss: 0.011844, avg_loss: 0.317610
+001575/002340, loss: 0.012580, avg_loss: 0.316855
+001580/002340, loss: 0.037540, avg_loss: 0.316005
+001585/002340, loss: 0.019229, avg_loss: 0.315232
+001590/002340, loss: 0.048232, avg_loss: 0.314477
+001595/002340, loss: 0.141452, avg_loss: 0.313963
+001600/002340, loss: 0.015298, avg_loss: 0.313133
+001605/002340, loss: 0.013662, avg_loss: 0.312229
+001610/002340, loss: 0.160849, avg_loss: 0.311404
+001615/002340, loss: 0.012301, avg_loss: 0.310524
+001620/002340, loss: 0.063877, avg_loss: 0.309759
+001625/002340, loss: 0.032892, avg_loss: 0.309026
+001630/002340, loss: 0.177563, avg_loss: 0.308279
+001635/002340, loss: 0.157313, avg_loss: 0.307644
+001640/002340, loss: 0.130090, avg_loss: 0.306819
+001645/002340, loss: 0.021889, avg_loss: 0.306081
+001650/002340, loss: 0.152882, avg_loss: 0.305300
+001655/002340, loss: 0.009122, avg_loss: 0.304627
+001660/002340, loss: 0.015140, avg_loss: 0.303849
+001665/002340, loss: 0.164985, avg_loss: 0.303089
+001670/002340, loss: 0.008990, avg_loss: 0.302396
+001675/002340, loss: 0.010757, avg_loss: 0.301671
+001680/002340, loss: 0.009137, avg_loss: 0.300904
+***** Running dev evaluation *****
+Num examples = 277
+Instantaneous batch size per device = 32
+epoch 21, step 1680/2340: {'accuracy': 0.6173285198555957}
+001685/002340, loss: 0.053387, avg_loss: 0.300194
+001690/002340, loss: 0.022511, avg_loss: 0.299502
+001695/002340, loss: 0.105420, avg_loss: 0.298722
+001700/002340, loss: 0.013549, avg_loss: 0.297988
+001705/002340, loss: 0.073981, avg_loss: 0.297318
+001710/002340, loss: 0.014491, avg_loss: 0.296600
+001715/002340, loss: 0.154422, avg_loss: 0.295955
+001720/002340, loss: 0.163267, avg_loss: 0.295310
+001725/002340, loss: 0.136114, avg_loss: 0.294759
+001730/002340, loss: 0.015310, avg_loss: 0.294064
+001735/002340, loss: 0.087005, avg_loss: 0.293422
+001740/002340, loss: 0.020296, avg_loss: 0.292756
+001745/002340, loss: 0.018787, avg_loss: 0.292135
+001750/002340, loss: 0.034191, avg_loss: 0.291526
+001755/002340, loss: 0.045470, avg_loss: 0.290987
+001760/002340, loss: 0.014372, avg_loss: 0.290662
+001765/002340, loss: 0.015767, avg_loss: 0.289942
+001770/002340, loss: 0.039629, avg_loss: 0.289302
+001775/002340, loss: 0.016410, avg_loss: 0.288527
+001780/002340, loss: 0.038289, avg_loss: 0.287933
+001785/002340, loss: 0.017720, avg_loss: 0.287493
+001790/002340, loss: 0.033570, avg_loss: 0.286735
+001795/002340, loss: 0.012522, avg_loss: 0.286079
+001800/002340, loss: 0.053891, avg_loss: 0.285344
+***** Running dev evaluation *****
+Num examples = 277
+Instantaneous batch size per device = 32
+epoch 23, step 1800/2340: {'accuracy': 0.6245487364620939}
+001805/002340, loss: 0.126177, avg_loss: 0.284716
+001810/002340, loss: 0.011923, avg_loss: 0.284070
+001815/002340, loss: 0.142181, avg_loss: 0.283613
+001820/002340, loss: 0.010828, avg_loss: 0.282998
+001825/002340, loss: 0.025087, avg_loss: 0.282492
+001830/002340, loss: 0.273915, avg_loss: 0.281916
+001835/002340, loss: 0.016827, avg_loss: 0.281382
+001840/002340, loss: 0.010785, avg_loss: 0.280767
+001845/002340, loss: 0.015339, avg_loss: 0.280337
+001850/002340, loss: 0.020906, avg_loss: 0.279696
+001855/002340, loss: 0.165239, avg_loss: 0.279069
+001860/002340, loss: 0.053642, avg_loss: 0.278450
+001865/002340, loss: 0.133574, avg_loss: 0.277862
+001870/002340, loss: 0.097644, avg_loss: 0.277226
+001875/002340, loss: 0.059441, avg_loss: 0.276570
+001880/002340, loss: 0.016699, avg_loss: 0.275948
+001885/002340, loss: 0.146401, avg_loss: 0.275488
+001890/002340, loss: 0.011636, avg_loss: 0.274799
+001895/002340, loss: 0.018686, avg_loss: 0.274214
+001900/002340, loss: 0.026965, avg_loss: 0.273611
+001905/002340, loss: 0.013933, avg_loss: 0.272935
+001910/002340, loss: 0.125580, avg_loss: 0.272318
+001915/002340, loss: 0.129783, avg_loss: 0.271802
+001920/002340, loss: 0.116678, avg_loss: 0.271278
+***** Running dev evaluation *****
+Num examples = 277
+Instantaneous batch size per device = 32
+epoch 24, step 1920/2340: {'accuracy': 0.6173285198555957}
+001925/002340, loss: 0.254784, avg_loss: 0.270806
+001930/002340, loss: 0.157526, avg_loss: 0.270238
+001935/002340, loss: 0.031608, avg_loss: 0.269644
+001940/002340, loss: 0.009236, avg_loss: 0.269169
+001945/002340, loss: 0.009980, avg_loss: 0.268799
+001950/002340, loss: 0.033835, avg_loss: 0.268168
+001955/002340, loss: 0.051771, avg_loss: 0.267547
+001960/002340, loss: 0.142184, avg_loss: 0.267055
+001965/002340, loss: 0.046325, avg_loss: 0.266676
+001970/002340, loss: 0.041966, avg_loss: 0.266192
+001975/002340, loss: 0.020202, avg_loss: 0.265597
+001980/002340, loss: 0.125195, avg_loss: 0.265071
+001985/002340, loss: 0.019307, avg_loss: 0.264558
+001990/002340, loss: 0.011511, avg_loss: 0.263954
+001995/002340, loss: 0.092994, avg_loss: 0.263384
+002000/002340, loss: 0.098703, avg_loss: 0.262809
+002005/002340, loss: 0.017836, avg_loss: 0.262371
+002010/002340, loss: 0.047947, avg_loss: 0.261831
+002015/002340, loss: 0.157151, avg_loss: 0.261291
+002020/002340, loss: 0.063095, avg_loss: 0.260695
+002025/002340, loss: 0.239691, avg_loss: 0.260198
+002030/002340, loss: 0.008953, avg_loss: 0.259652
+002035/002340, loss: 0.008303, avg_loss: 0.259056
+002040/002340, loss: 0.133496, avg_loss: 0.258505
+***** Running dev evaluation *****
+Num examples = 277
+Instantaneous batch size per device = 32
+epoch 26, step 2040/2340: {'accuracy': 0.6173285198555957}
+002045/002340, loss: 0.070495, avg_loss: 0.258069
+002050/002340, loss: 0.082666, avg_loss: 0.257558
+002055/002340, loss: 0.036117, avg_loss: 0.257011
+002060/002340, loss: 0.018446, avg_loss: 0.256447
+002065/002340, loss: 0.019938, avg_loss: 0.255982
+002070/002340, loss: 0.010070, avg_loss: 0.255545
+002075/002340, loss: 0.010592, avg_loss: 0.254990
+002080/002340, loss: 0.047749, avg_loss: 0.254418
+002085/002340, loss: 0.157273, avg_loss: 0.253991
+002090/002340, loss: 0.012268, avg_loss: 0.253488
+002095/002340, loss: 0.010397, avg_loss: 0.252964
+002100/002340, loss: 0.152166, avg_loss: 0.252516
+002105/002340, loss: 0.149034, avg_loss: 0.252077
+002110/002340, loss: 0.022406, avg_loss: 0.251554
+002115/002340, loss: 0.050635, avg_loss: 0.251001
+002120/002340, loss: 0.101384, avg_loss: 0.250624
+002125/002340, loss: 0.019535, avg_loss: 0.250064
+002130/002340, loss: 0.017638, avg_loss: 0.249509
+002135/002340, loss: 0.007454, avg_loss: 0.249097
+002140/002340, loss: 0.170886, avg_loss: 0.248638
+002145/002340, loss: 0.008658, avg_loss: 0.248148
+002150/002340, loss: 0.018784, avg_loss: 0.247731
+002155/002340, loss: 0.006945, avg_loss: 0.247294
+002160/002340, loss: 0.149141, avg_loss: 0.246973
+***** Running dev evaluation *****
+Num examples = 277
+Instantaneous batch size per device = 32
+epoch 27, step 2160/2340: {'accuracy': 0.6173285198555957}
+002165/002340, loss: 0.070260, avg_loss: 0.246627
+002170/002340, loss: 0.018735, avg_loss: 0.246110
+002175/002340, loss: 0.011750, avg_loss: 0.245641
+002180/002340, loss: 0.024557, avg_loss: 0.245194
+002185/002340, loss: 0.022439, avg_loss: 0.244675
+002190/002340, loss: 0.009183, avg_loss: 0.244218
+002195/002340, loss: 0.147473, avg_loss: 0.243797
+002200/002340, loss: 0.008439, avg_loss: 0.243311
+002205/002340, loss: 0.009392, avg_loss: 0.242842
+002210/002340, loss: 0.007260, avg_loss: 0.242363
+002215/002340, loss: 0.006505, avg_loss: 0.241869
+002220/002340, loss: 0.036663, avg_loss: 0.241415
+002225/002340, loss: 0.010591, avg_loss: 0.240936
+002230/002340, loss: 0.008057, avg_loss: 0.240418
+002235/002340, loss: 0.005135, avg_loss: 0.240005
+002240/002340, loss: 0.009763, avg_loss: 0.239661
+002245/002340, loss: 0.009173, avg_loss: 0.239206
+002250/002340, loss: 0.015700, avg_loss: 0.238819
+002255/002340, loss: 0.021340, avg_loss: 0.238346
+002260/002340, loss: 0.060185, avg_loss: 0.237882
+002265/002340, loss: 0.038913, avg_loss: 0.237484
+002270/002340, loss: 0.016376, avg_loss: 0.237112
+002275/002340, loss: 0.010828, avg_loss: 0.236714
+002280/002340, loss: 0.129731, avg_loss: 0.236370
+***** Running dev evaluation *****
+Num examples = 277
+Instantaneous batch size per device = 32
+epoch 29, step 2280/2340: {'accuracy': 0.6064981949458483}
+002285/002340, loss: 0.044581, avg_loss: 0.235897
+002290/002340, loss: 0.008923, avg_loss: 0.235524
+002295/002340, loss: 0.011697, avg_loss: 0.235179
+002300/002340, loss: 0.020234, avg_loss: 0.234708
+002305/002340, loss: 0.024606, avg_loss: 0.234225
+002310/002340, loss: 0.007431, avg_loss: 0.233798
+002315/002340, loss: 0.006717, avg_loss: 0.233382
+002320/002340, loss: 0.017990, avg_loss: 0.232940
+002325/002340, loss: 0.145197, avg_loss: 0.232597
+002330/002340, loss: 0.013951, avg_loss: 0.232139
+002335/002340, loss: 0.014238, avg_loss: 0.231719
+002340/002340, loss: 0.019154, avg_loss: 0.231268
+***** Running train evaluation *****
+Num examples = 2490
+Instantaneous batch size per device = 32
+Train Dataset Result: {'accuracy': 0.9955823293172691}
+***** Running dev evaluation *****
+Num examples = 277
+Instantaneous batch size per device = 32
+Dev Dataset Result: {'accuracy': 0.6101083032490975}
+DEV Best Result: accuracy, 0.6462093862815884
+Training time 0:02:36
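
Note: the header numbers in this log are internally consistent: 2490 training examples at a per-device batch size of 32 give ceil(2490/32) = 78 updates per epoch, 78 x 30 epochs = 2340 total optimization steps, and with eval_step = 120 there are 19 periodic dev evaluations (steps 120 through 2280), matching the 19 lines of result.txt below. A quick re-derivation in plain Python:

    import math

    num_examples, batch_size, epochs, eval_step = 2490, 32, 30, 120

    steps_per_epoch = math.ceil(num_examples / batch_size)  # 78
    total_steps = steps_per_epoch * epochs                  # 2340, as logged
    num_evals = total_steps // eval_step                    # 19 dev evaluations
    print(steps_per_epoch, total_steps, num_evals)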
pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:58c4433dc0148c6dcbb383b9e233378c256de46436f4b7c33785bfe5dc3da8f7
+size 34299149
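
Note: pytorch_model.bin is stored via Git LFS, so the diff shows only the pointer file (spec version, sha256 object id, and size: 34299149 bytes, about 34 MB, consistent with the small 7-layer model). A minimal sketch of fetching the real weights with huggingface_hub; the repo id is a placeholder for whichever repository this commit belongs to:

    from huggingface_hub import hf_hub_download

    # Placeholder repo id: substitute the repository this commit was pushed to.
    local_path = hf_hub_download(repo_id="user/repo", filename="pytorch_model.bin")
    print(local_path)  # cached local path; the file should be 34299149 bytes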
result.txt
ADDED
@@ -0,0 +1,19 @@
+{'accuracy': 0.5523465703971119}
+{'accuracy': 0.5884476534296029}
+{'accuracy': 0.6137184115523465}
+{'accuracy': 0.6209386281588448}
+{'accuracy': 0.6425992779783394}
+{'accuracy': 0.6462093862815884}
+{'accuracy': 0.6245487364620939}
+{'accuracy': 0.6389891696750902}
+{'accuracy': 0.6173285198555957}
+{'accuracy': 0.6353790613718412}
+{'accuracy': 0.6462093862815884}
+{'accuracy': 0.6137184115523465}
+{'accuracy': 0.6137184115523465}
+{'accuracy': 0.6173285198555957}
+{'accuracy': 0.6245487364620939}
+{'accuracy': 0.6173285198555957}
+{'accuracy': 0.6173285198555957}
+{'accuracy': 0.6173285198555957}
+{'accuracy': 0.6064981949458483}
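
Note: result.txt holds one dev-accuracy dict per periodic evaluation, in the order they appear in the training log; the maximum, 0.6462..., is what the log reports as the DEV Best Result (reached at steps 720 and 1320). Each line is a Python-literal dict rather than JSON, so a small parsing sketch:

    import ast

    with open("result.txt") as f:
        accuracies = [ast.literal_eval(line)["accuracy"] for line in f if line.strip()]

    print(len(accuracies), max(accuracies))  # 19 evaluations; best = 0.6462093862815884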
special_tokens_map.json
ADDED
@@ -0,0 +1 @@
+{"unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]"}
tokenizer.json
ADDED
The diff for this file is too large to render. See raw diff.
tokenizer_config.json
ADDED
@@ -0,0 +1 @@
+{"do_lower_case": true, "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "tokenize_chinese_chars": true, "strip_accents": null, "do_basic_tokenize": true, "model_max_length": 512, "name_or_path": "/home.local/jianwei/workspace/archive/SparseOptimizer/output/Layer_7_12_Hid_160_768_Head_10_12_IMRatio_3.5", "never_split": null, "special_tokens_map_file": "/home.local/jianwei/.cache/huggingface/transformers/b680d52711d2451bbd6c6b1700365d6d731977c1357ae86bd7227f61145d3be2.dd8bd9bfd3664b530ea4e645105f557769387b3da9f79bdb55ed556bdd80611d", "tokenizer_class": "BertTokenizer"}
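
Note: this describes a standard lowercasing WordPiece BertTokenizer with a 512-token model max length (the training run itself truncated to max_length=128). A minimal sketch of encoding an RTE-style premise/hypothesis pair the way the logged samples were built; "./model" is again a placeholder directory holding the tokenizer files, and the two sentences are invented:

    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("./model")  # placeholder local dir

    # RTE is a sentence-pair task: [CLS] premise [SEP] hypothesis [SEP]
    enc = tokenizer(
        "Two engineers were kidnapped near an elementary school.",  # invented premise
        "Two engineers were kidnapped.",                            # invented hypothesis
        truncation=True,
        max_length=128,  # matches max_length=128 in the training arguments
    )
    print(enc["input_ids"])
    print(enc["token_type_ids"])  # 0 over the premise span, 1 over the hypothesis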
vocab.txt
ADDED
The diff for this file is too large to render. See raw diff.