yangheng committed on
Commit
853c27d
·
verified ·
1 Parent(s): d404b33

Upload 8 files

Browse files
__init__.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ # file: __init__.py
3
+ # time: 13:48 04/06/2024
4
+ # author: YANG, HENG <hy345@exeter.ac.uk> (杨恒)
5
+ # github: https://github.com/yangheng95
6
+ # huggingface: https://huggingface.co/yangheng
7
+ # google scholar: https://scholar.google.com/citations?user=NPq5a_0AAAAJ&hl=en
8
+ # Copyright (C) 2019-2024. All Rights Reserved.
config.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "../../SpliceBERT-paper/SpliceBERT-1024nt.tmp",
3
+ "architectures": [
4
+ "BertForMaskedLM"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "classifier_dropout": null,
8
+ "hidden_act": "gelu",
9
+ "hidden_dropout_prob": 0.1,
10
+ "hidden_size": 512,
11
+ "initializer_range": 0.02,
12
+ "intermediate_size": 2048,
13
+ "layer_norm_eps": 1e-12,
14
+ "max_position_embeddings": 1026,
15
+ "model_type": "bert",
16
+ "num_attention_heads": 16,
17
+ "num_hidden_layers": 6,
18
+ "output_hidden_states": true,
19
+ "pad_token_id": 0,
20
+ "position_embedding_type": "absolute",
21
+ "torch_dtype": "float32",
22
+ "transformers_version": "4.24.0",
23
+ "type_vocab_size": 2,
24
+ "use_cache": true,
25
+ "vocab_size": 10
26
+ }
omnigenome_wrapper.py ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ # file: omnigenome_wrapper.py
3
+ # time: 00:57 27/04/2024
4
+ # author: YANG, HENG <hy345@exeter.ac.uk> (杨恒)
5
+ # github: https://github.com/yangheng95
6
+ # huggingface: https://huggingface.co/yangheng
7
+ # google scholar: https://scholar.google.com/citations?user=NPq5a_0AAAAJ&hl=en
8
+ # Copyright (C) 2019-2024. All Rights Reserved.
9
+
10
+ import warnings
11
+
12
+ from omnigenbench import OmniTokenizer
13
+
14
+
15
class Tokenizer(OmniTokenizer):
    """Character-level (one token per nucleotide) tokenizer wrapper.

    Every character of an input sequence becomes one token, which is mapped
    to an id through ``base_tokenizer.convert_tokens_to_ids``.  A leading
    BOS (or CLS) id and a trailing EOS (or SEP) id are added to every
    sequence before the batch is padded with ``base_tokenizer.pad``.

    NOTE(review): ``self.u2t``, ``self.t2u``, ``self.add_whitespace``,
    ``self.max_length``, ``self.metadata`` and ``self.base_tokenizer`` are
    read here but never assigned in this class; they are presumably set by
    ``OmniTokenizer.__init__`` -- confirm against the omnigenbench package.
    """

    def __init__(self, base_tokenizer=None, u2t=True, add_whitespace=False, **kwargs):
        """Forward all arguments to ``OmniTokenizer`` and record the class name.

        Args:
            base_tokenizer: Underlying tokenizer object; forwarded unchanged.
            u2t: Forwarded to the parent; when true, ``__call__`` rewrites
                ``U`` to ``T``.
            add_whitespace: Forwarded to the parent; when true, ``__call__``
                joins the characters of each sequence with single spaces.
            **kwargs: Extra options forwarded to the parent constructor.
        """
        super().__init__(
            base_tokenizer, u2t=u2t, add_whitespace=add_whitespace, **kwargs
        )
        self.metadata["tokenizer_name"] = self.__class__.__name__

    def __call__(self, sequence, **kwargs):
        """Encode one sequence or a batch of sequences into padded tensors.

        Args:
            sequence: A single string, or an iterable of strings (a batch).
            **kwargs: Recognised keys are ``max_length`` (total length limit
                including the two special tokens; defaults to
                ``self.max_length``), ``padding`` (defaults to
                ``"max_length"``) and ``return_attention_mask`` (defaults to
                ``True``).

        Returns:
            Whatever ``base_tokenizer.pad`` returns for the ``input_ids`` /
            ``attention_mask`` batch with ``return_tensors="pt"``.

        Raises:
            ValueError: If ``sequence`` is an empty batch.
        """
        # Keep batch items separate: joining them would silently merge a
        # batch into one long sequence.
        sequences = [sequence] if isinstance(sequence, str) else list(sequence)
        if not sequences:
            raise ValueError("No sequence was provided to tokenize.")

        # Upper-case BEFORE replacing so lowercase "u"/"t" are converted too.
        if self.u2t:
            sequences = [seq.upper().replace("U", "T") for seq in sequences]
        if self.t2u:
            sequences = [seq.upper().replace("T", "U") for seq in sequences]
        if self.add_whitespace:
            # NOTE(review): tokenize() is character-level, so the inserted
            # spaces become tokens themselves -- confirm this is intended.
            sequences = [" ".join(seq) for seq in sequences]

        limit = kwargs.get("max_length")
        if limit is None:
            limit = self.max_length
        # Reserve two positions for the BOS/CLS and EOS/SEP ids, and truncate
        # every sequence individually (not the list of sequences).
        body_budget = max(limit - 2, 0)
        token_lists = [tokens[:body_budget] for tokens in self.tokenize(sequences)]

        base = self.base_tokenizer
        bos_id = base.bos_token_id if base.bos_token_id is not None else base.cls_token_id
        eos_id = base.eos_token_id if base.eos_token_id is not None else base.sep_token_id
        unk_id = base.unk_token_id

        input_ids = []
        for index, tokens in enumerate(token_lists):
            ids = [bos_id] + base.convert_tokens_to_ids(tokens) + [eos_id]
            unk_ratio = ids.count(unk_id) / len(ids)
            if unk_ratio > 0.1:
                warnings.warn(
                    f"Unknown tokens are more than "
                    f"{unk_ratio:.1%} in the {index}-th sequence, "
                    f"please check the tokenization process."
                )
            input_ids.append(ids)

        encoded = {
            "input_ids": input_ids,
            "attention_mask": [[1] * len(ids) for ids in input_ids],
        }
        # After truncation no sequence exceeds the limit, so the batch is
        # padded up to its own longest member.
        longest = max(len(ids) for ids in input_ids)
        return base.pad(
            encoded,
            padding=kwargs.get("padding", "max_length"),
            max_length=longest,
            return_attention_mask=kwargs.get("return_attention_mask", True),
            return_tensors="pt",
        )

    def tokenize(self, sequence, **kwargs):
        """Split each sequence into a list of its individual characters.

        Args:
            sequence: A single string or an iterable of sequences.
            **kwargs: Accepted for interface compatibility; unused.

        Returns:
            A list with one list of single-character tokens per sequence.
        """
        sequences = [sequence] if isinstance(sequence, str) else sequence
        return [list(seq) for seq in sequences]

    def encode(self, sequence, **kwargs):
        """Delegate to ``base_tokenizer.encode`` unchanged."""
        return self.base_tokenizer.encode(sequence, **kwargs)

    def decode(self, sequence, **kwargs):
        """Delegate to ``base_tokenizer.decode`` unchanged."""
        return self.base_tokenizer.decode(sequence, **kwargs)

    def encode_plus(self, sequence, **kwargs):
        """Delegate to ``base_tokenizer.encode_plus`` unchanged."""
        return self.base_tokenizer.encode_plus(sequence, **kwargs)
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2ad91428c318e6c49233154073ca7a35f5f7899c9f4be3444775bae3dba0149d
3
+ size 78883755
special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": "[CLS]",
3
+ "mask_token": "[MASK]",
4
+ "pad_token": "[PAD]",
5
+ "sep_token": "[SEP]",
6
+ "unk_token": "[UNK]"
7
+ }
tokenizer.json ADDED
@@ -0,0 +1,160 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "version": "1.0",
3
+ "truncation": null,
4
+ "padding": null,
5
+ "added_tokens": [
6
+ {
7
+ "id": 0,
8
+ "content": "[PAD]",
9
+ "single_word": false,
10
+ "lstrip": false,
11
+ "rstrip": false,
12
+ "normalized": false,
13
+ "special": true
14
+ },
15
+ {
16
+ "id": 1,
17
+ "content": "[UNK]",
18
+ "single_word": false,
19
+ "lstrip": false,
20
+ "rstrip": false,
21
+ "normalized": false,
22
+ "special": true
23
+ },
24
+ {
25
+ "id": 2,
26
+ "content": "[CLS]",
27
+ "single_word": false,
28
+ "lstrip": false,
29
+ "rstrip": false,
30
+ "normalized": false,
31
+ "special": true
32
+ },
33
+ {
34
+ "id": 3,
35
+ "content": "[SEP]",
36
+ "single_word": false,
37
+ "lstrip": false,
38
+ "rstrip": false,
39
+ "normalized": false,
40
+ "special": true
41
+ },
42
+ {
43
+ "id": 4,
44
+ "content": "[MASK]",
45
+ "single_word": false,
46
+ "lstrip": false,
47
+ "rstrip": false,
48
+ "normalized": false,
49
+ "special": true
50
+ }
51
+ ],
52
+ "normalizer": {
53
+ "type": "BertNormalizer",
54
+ "clean_text": true,
55
+ "handle_chinese_chars": true,
56
+ "strip_accents": null,
57
+ "lowercase": false
58
+ },
59
+ "pre_tokenizer": {
60
+ "type": "BertPreTokenizer"
61
+ },
62
+ "post_processor": {
63
+ "type": "TemplateProcessing",
64
+ "single": [
65
+ {
66
+ "SpecialToken": {
67
+ "id": "[CLS]",
68
+ "type_id": 0
69
+ }
70
+ },
71
+ {
72
+ "Sequence": {
73
+ "id": "A",
74
+ "type_id": 0
75
+ }
76
+ },
77
+ {
78
+ "SpecialToken": {
79
+ "id": "[SEP]",
80
+ "type_id": 0
81
+ }
82
+ }
83
+ ],
84
+ "pair": [
85
+ {
86
+ "SpecialToken": {
87
+ "id": "[CLS]",
88
+ "type_id": 0
89
+ }
90
+ },
91
+ {
92
+ "Sequence": {
93
+ "id": "A",
94
+ "type_id": 0
95
+ }
96
+ },
97
+ {
98
+ "SpecialToken": {
99
+ "id": "[SEP]",
100
+ "type_id": 0
101
+ }
102
+ },
103
+ {
104
+ "Sequence": {
105
+ "id": "B",
106
+ "type_id": 1
107
+ }
108
+ },
109
+ {
110
+ "SpecialToken": {
111
+ "id": "[SEP]",
112
+ "type_id": 1
113
+ }
114
+ }
115
+ ],
116
+ "special_tokens": {
117
+ "[CLS]": {
118
+ "id": "[CLS]",
119
+ "ids": [
120
+ 2
121
+ ],
122
+ "tokens": [
123
+ "[CLS]"
124
+ ]
125
+ },
126
+ "[SEP]": {
127
+ "id": "[SEP]",
128
+ "ids": [
129
+ 3
130
+ ],
131
+ "tokens": [
132
+ "[SEP]"
133
+ ]
134
+ }
135
+ }
136
+ },
137
+ "decoder": {
138
+ "type": "WordPiece",
139
+ "prefix": "##",
140
+ "cleanup": true
141
+ },
142
+ "model": {
143
+ "type": "WordPiece",
144
+ "unk_token": "[UNK]",
145
+ "continuing_subword_prefix": "##",
146
+ "max_input_chars_per_word": 100,
147
+ "vocab": {
148
+ "[PAD]": 0,
149
+ "[UNK]": 1,
150
+ "[CLS]": 2,
151
+ "[SEP]": 3,
152
+ "[MASK]": 4,
153
+ "N": 5,
154
+ "A": 6,
155
+ "C": 7,
156
+ "G": 8,
157
+ "T": 9
158
+ }
159
+ }
160
+ }
tokenizer_config.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": "[CLS]",
3
+ "do_basic_tokenize": true,
4
+ "do_lower_case": false,
5
+ "mask_token": "[MASK]",
6
+ "name_or_path": "../../SpliceBERT-paper/SpliceBERT-1024nt.tmp/",
7
+ "never_split": null,
8
+ "pad_token": "[PAD]",
9
+ "sep_token": "[SEP]",
10
+ "special_tokens_map_file": "../../SpliceBERT-paper/SpliceBERT/special_tokens_map.json",
11
+ "strip_accents": null,
12
+ "tokenize_chinese_chars": true,
13
+ "tokenizer_class": "BertTokenizer",
14
+ "unk_token": "[UNK]"
15
+ }
vocab.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ [PAD]
2
+ [UNK]
3
+ [CLS]
4
+ [SEP]
5
+ [MASK]
6
+ N
7
+ A
8
+ C
9
+ G
10
+ T