Upload 8 files

Browse files

Files changed (8) hide show

__init__.py +8 -0
config.json +26 -0
omnigenome_wrapper.py +90 -0
pytorch_model.bin +3 -0
special_tokens_map.json +7 -0
tokenizer.json +160 -0
tokenizer_config.json +15 -0
vocab.txt +10 -0

__init__.py ADDED Viewed

	@@ -0,0 +1,8 @@

+# -*- coding: utf-8 -*-
+# file: __init__.py
+# time: 13:48 04/06/2024
+# author: YANG, HENG <hy345@exeter.ac.uk> (杨恒)
+# github: https://github.com/yangheng95
+# huggingface: https://huggingface.co/yangheng
+# google scholar: https://scholar.google.com/citations?user=NPq5a_0AAAAJ&hl=en
+# Copyright (C) 2019-2024. All Rights Reserved.

config.json ADDED Viewed

	@@ -0,0 +1,26 @@

+{
+  "_name_or_path": "../../SpliceBERT-paper/SpliceBERT-1024nt.tmp",
+  "architectures": [
+    "BertForMaskedLM"
+  ],
+  "attention_probs_dropout_prob": 0.1,
+  "classifier_dropout": null,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 512,
+  "initializer_range": 0.02,
+  "intermediate_size": 2048,
+  "layer_norm_eps": 1e-12,
+  "max_position_embeddings": 1026,
+  "model_type": "bert",
+  "num_attention_heads": 16,
+  "num_hidden_layers": 6,
+  "output_hidden_states": true,
+  "pad_token_id": 0,
+  "position_embedding_type": "absolute",
+  "torch_dtype": "float32",
+  "transformers_version": "4.24.0",
+  "type_vocab_size": 2,
+  "use_cache": true,
+  "vocab_size": 10
+}

omnigenome_wrapper.py ADDED Viewed

	@@ -0,0 +1,90 @@

+# -*- coding: utf-8 -*-
+# file: omnigenome_wrapper.py
+# time: 00:57 27/04/2024
+# author: YANG, HENG <hy345@exeter.ac.uk> (杨恒)
+# github: https://github.com/yangheng95
+# huggingface: https://huggingface.co/yangheng
+# google scholar: https://scholar.google.com/citations?user=NPq5a_0AAAAJ&hl=en
+# Copyright (C) 2019-2024. All Rights Reserved.
+import warnings
+from omnigenbench import OmniTokenizer
+class Tokenizer(OmniTokenizer):
+    def __init__(self, base_tokenizer=None, u2t=True, add_whitespace=False, **kwargs):
+        super(Tokenizer, self).__init__(
+            base_tokenizer, u2t=u2t, add_whitespace=add_whitespace, **kwargs
+        )
+        self.metadata["tokenizer_name"] = self.__class__.__name__
+    def __call__(self, sequence, **kwargs):
+        if self.u2t:
+            sequence = "".join([seq.replace("U", "T").upper() for seq in sequence])
+        if self.t2u:
+            sequence = "".join([seq.replace("T", "U").upper() for seq in sequence])
+        if self.add_whitespace:
+            sequence = " ".join(list(sequence))
+        sequence_tokens = self.tokenize(sequence)[
+            : kwargs.get("max_length", self.max_length) - 2
+        ]
+        tokenized_inputs = {
+            "input_ids": [],
+            "attention_mask": [],
+        }
+        bos_id = (
+            self.base_tokenizer.bos_token_id
+            if self.base_tokenizer.bos_token_id is not None
+            else self.base_tokenizer.cls_token_id
+        )
+        eos_id = (
+            self.base_tokenizer.eos_token_id
+            if self.base_tokenizer.eos_token_id is not None
+            else self.base_tokenizer.sep_token_id
+        )
+        for tokens in sequence_tokens:
+            tokenized_inputs["input_ids"].append(
+                [bos_id] + self.base_tokenizer.convert_tokens_to_ids(tokens) + [eos_id]
+            )
+            tokenized_inputs["attention_mask"].append(
+                [1] * len(tokenized_inputs["input_ids"][-1])
+            )
+        for i, ids in enumerate(tokenized_inputs["input_ids"]):
+            if ids.count(self.base_tokenizer.unk_token_id) / len(ids) > 0.1:
+                warnings.warn(
+                    f"Unknown tokens are more than "
+                    f"{ids.count(self.base_tokenizer.unk_token_id) / len(ids)}% in the {i}-th sequence, "
+                    f"please check the tokenization process."
+                )
+        max_length = max(len(ids) for ids in tokenized_inputs["input_ids"])
+        tokenized_inputs = self.base_tokenizer.pad(
+            tokenized_inputs,
+            padding=kwargs.get("padding", "max_length"),
+            max_length=min(max_length, kwargs.get("max_length", 512)),
+            return_attention_mask=kwargs.get("return_attention_mask", True),
+            return_tensors="pt",
+        )
+        return tokenized_inputs
+    def tokenize(self, sequence, **kwargs):
+        if isinstance(sequence, str):
+            sequences = [sequence]
+        else:
+            sequences = sequence
+        sequence_tokens = []
+        for i in range(len(sequences)):
+            sequence_tokens.append(list(sequences[i]))
+        return sequence_tokens
+    def encode(self, sequence, **kwargs):
+        return self.base_tokenizer.encode(sequence, **kwargs)
+    def decode(self, sequence, **kwargs):
+        return self.base_tokenizer.decode(sequence, **kwargs)
+    def encode_plus(self, sequence, **kwargs):
+        return self.base_tokenizer.encode_plus(sequence, **kwargs)

pytorch_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2ad91428c318e6c49233154073ca7a35f5f7899c9f4be3444775bae3dba0149d
+size 78883755

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "cls_token": "[CLS]",
+  "mask_token": "[MASK]",
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "unk_token": "[UNK]"
+}

tokenizer.json ADDED Viewed

	@@ -0,0 +1,160 @@

+{
+  "version": "1.0",
+  "truncation": null,
+  "padding": null,
+  "added_tokens": [
+    {
+      "id": 0,
+      "content": "[PAD]",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 1,
+      "content": "[UNK]",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 2,
+      "content": "[CLS]",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 3,
+      "content": "[SEP]",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 4,
+      "content": "[MASK]",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    }
+  ],
+  "normalizer": {
+    "type": "BertNormalizer",
+    "clean_text": true,
+    "handle_chinese_chars": true,
+    "strip_accents": null,
+    "lowercase": false
+  },
+  "pre_tokenizer": {
+    "type": "BertPreTokenizer"
+  },
+  "post_processor": {
+    "type": "TemplateProcessing",
+    "single": [
+      {
+        "SpecialToken": {
+          "id": "[CLS]",
+          "type_id": 0
+        }
+      },
+      {
+        "Sequence": {
+          "id": "A",
+          "type_id": 0
+        }
+      },
+      {
+        "SpecialToken": {
+          "id": "[SEP]",
+          "type_id": 0
+        }
+      }
+    ],
+    "pair": [
+      {
+        "SpecialToken": {
+          "id": "[CLS]",
+          "type_id": 0
+        }
+      },
+      {
+        "Sequence": {
+          "id": "A",
+          "type_id": 0
+        }
+      },
+      {
+        "SpecialToken": {
+          "id": "[SEP]",
+          "type_id": 0
+        }
+      },
+      {
+        "Sequence": {
+          "id": "B",
+          "type_id": 1
+        }
+      },
+      {
+        "SpecialToken": {
+          "id": "[SEP]",
+          "type_id": 1
+        }
+      }
+    ],
+    "special_tokens": {
+      "[CLS]": {
+        "id": "[CLS]",
+        "ids": [
+          2
+        ],
+        "tokens": [
+          "[CLS]"
+        ]
+      },
+      "[SEP]": {
+        "id": "[SEP]",
+        "ids": [
+          3
+        ],
+        "tokens": [
+          "[SEP]"
+        ]
+      }
+    }
+  },
+  "decoder": {
+    "type": "WordPiece",
+    "prefix": "##",
+    "cleanup": true
+  },
+  "model": {
+    "type": "WordPiece",
+    "unk_token": "[UNK]",
+    "continuing_subword_prefix": "##",
+    "max_input_chars_per_word": 100,
+    "vocab": {
+      "[PAD]": 0,
+      "[UNK]": 1,
+      "[CLS]": 2,
+      "[SEP]": 3,
+      "[MASK]": 4,
+      "N": 5,
+      "A": 6,
+      "C": 7,
+      "G": 8,
+      "T": 9
+    }
+  }
+}

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,15 @@

+{
+  "cls_token": "[CLS]",
+  "do_basic_tokenize": true,
+  "do_lower_case": false,
+  "mask_token": "[MASK]",
+  "name_or_path": "../../SpliceBERT-paper/SpliceBERT-1024nt.tmp/",
+  "never_split": null,
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "special_tokens_map_file": "../../SpliceBERT-paper/SpliceBERT/special_tokens_map.json",
+  "strip_accents": null,
+  "tokenize_chinese_chars": true,
+  "tokenizer_class": "BertTokenizer",
+  "unk_token": "[UNK]"
+}

vocab.txt ADDED Viewed

	@@ -0,0 +1,10 @@

+[PAD]
+[UNK]
+[CLS]
+[SEP]
+[MASK]
+N
+A
+C
+G
+T