Upload folder using huggingface_hub

Browse files

Files changed (12) hide show

README.md +73 -0
attention.py +114 -0
config.json +41 -0
configuration_falcon_ocr.py +65 -0
model.safetensors +3 -0
model_args.json +41 -0
modeling_falcon_ocr.py +586 -0
processing_falcon_ocr.py +423 -0
rope.py +127 -0
special_tokens_map.json +390 -0
tokenizer.json +0 -0
tokenizer_config.json +0 -0

README.md ADDED Viewed

	@@ -0,0 +1,73 @@

+---
+license: apache-2.0
+pipeline_tag: image-to-text
+library_name: transformers
+tags:
+- falcon
+- ocr
+- vision-language
+- document-understanding
+---
+# Falcon OCR
+Dense early-fusion vision-language model for **document OCR**. Given a document image, it extracts text, tables, formulas, and other elements as plain text.
+## Installation
+```bash
+pip install transformers torch einops
+```
+Requires **PyTorch 2.5+** (FlexAttention).
+## Quick Start
+```python
+import torch
+from transformers import AutoModelForCausalLM
+from PIL import Image
+model = AutoModelForCausalLM.from_pretrained(
+    "tiiuae/Falcon-OCR",
+    trust_remote_code=True,
+    dtype=torch.bfloat16,
+    device_map="cuda",
+)
+image = Image.open("document.png")
+texts = model.generate(image)
+print(texts[0])
+```
+> The first `generate()` call is slower (~10-15 s) because `torch.compile` builds optimized kernels. Subsequent calls are much faster.
+## Categories
+By default, category is `"plain"` (general text extraction). You can specify a category to use a task-specific prompt:
+```python
+texts = model.generate(image, category="table")
+texts = model.generate(image, category="formula")
+```
+Available categories: `plain`, `text`, `table`, `formula`, `caption`, `footnote`, `list-item`, `page-footer`, `page-header`, `section-header`, `title`.
+## API Reference
+### `model.generate(images, *, category="plain", **kwargs)`
+| Parameter | Type | Default | Description |
+|---|---|---|---|
+| `images` | `PIL.Image`, `str`, `Path`, or `list` | required | Single image or list of images (each a PIL image, file path, or URL) |
+| `category` | `str` or `list[str]` | `"plain"` | OCR category (one per image or broadcast) |
+| `max_new_tokens` | `int` | `4096` | Maximum generation steps |
+| `min_dimension` | `int` | `64` | Minimum image side after resize |
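+| `max_dimension` | `int` | `1024` | Maximum image side after resize |
+| `temperature` | `float` | `0.0` | Sampling temperature (`0.0` = greedy decoding) |
+| `top_k` | `int` or `None` | `None` | Top-k sampling (`None` = disabled) |
+| `compile` | `bool` | `True` | Whether to `torch.compile` the model on the first call |
+| `seed` | `int` or `None` | `42` | Random seed for reproducibility (`None` = non-deterministic) |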
+**Returns:** `list[str]` — one extracted text string per image.
+## Citation

attention.py ADDED Viewed

	@@ -0,0 +1,114 @@

+import torch
+from torch import Tensor as T
+from torch.nn.attention.flex_attention import (
+    _mask_mod_signature,
+    create_block_mask,
+    flex_attention,
+)
+# ---------------------------------------------------------------------------
+# Two compiled variants of flex_attention
+# ---------------------------------------------------------------------------
+# compiled_flex_attn_decode:  fullgraph=True, static shapes.
+#           Used for decode steps (S_q == 1) where shapes are fixed and
+#           the call will be captured inside a CUDA graph.  fullgraph=True
+#           avoids graph breaks that would corrupt the capture.
+#
+# compiled_flex_attn_prefill: dynamic=True, symbolic shapes.
+#           Used for prefill steps (S_q > 1) where the sequence length
+#           varies per image.  dynamic=True lets one compiled graph handle
+#           all lengths without recompilation.  Prefill is never inside a
+#           CUDA graph, so symbolic shape guards are fine.
+#
+# Both wrap the same `flex_attention` function; only the compile options differ.
+compiled_flex_attn_decode = torch.compile(flex_attention, fullgraph=True)
+compiled_flex_attn_prefill = torch.compile(flex_attention, dynamic=True)
+def offset_mask_mod(mask_mod: _mask_mod_signature, offset: int):
+    """Get a mask mod function with an offset applied to the query positions."""
+    def _mask_mod(b, h, q, kv):
+        return mask_mod(b, h, q + offset, kv)
+    return _mask_mod
+def get_causal_mask_mod() -> _mask_mod_signature:
+    """Causal mask that prevents attention to future tokens."""
+    def _causal_mask(b: T, h: T, q_idx: T, kv_idx: T) -> T:
+        return q_idx >= kv_idx
+    return _causal_mask
+def get_document_mask_mod(batch: T, eos_id: int) -> _mask_mod_signature:
+    """Creates a document mask that prevents attention across document boundaries.
+    Args:
+        batch: Token-id tensor with shape [b, s] (it is compared against
+            ``eos_id`` and indexed as ``[batch_index, position]`` below)
+        eos_id: End-of-sequence token ID that marks document boundaries
+    Returns:
+        A mask modifier function that implements document-level masking.
+    """
+    # Flag every EOS position; the last position of each row is always flagged too.
+    eos_mask = batch == eos_id
+    eos_mask[:, -1] = True
+    # Running count of flagged positions up to and including each position.
+    cumulative_mask = torch.cumsum(torch.where(eos_mask, 1, 0), dim=1)
+    # Shift the count right by one so that an EOS token keeps the id of the
+    # document it terminates; position 0 always gets document id 0.
+    sequence_indices = torch.zeros_like(cumulative_mask, dtype=torch.int32)
+    sequence_indices[:, 1:] = cumulative_mask[:, :-1]
+    def document_mask(b: T, h: T, q_idx: T, kv_idx: T) -> T:
+        # Attention is allowed only between positions sharing a document id.
+        return sequence_indices[b, q_idx] == sequence_indices[b, kv_idx]
+    return document_mask
+def get_non_left_pad_mask_mod(batch: T, pad_id: int) -> _mask_mod_signature:
+    """Prevent model from attending to the left-padded token required for correct batch inference."""
+    non_pad_mask_id = torch.cumsum(batch != pad_id, dim=1)
+    # Left-most pad tokens have cumulative id == 0.
+    def mask_mod(b, h, q_idx, kv_idx):
+        return non_pad_mask_id[b, kv_idx] > 0
+    return mask_mod
+def get_image_prefix_mask_mod(
+    batch: T, soi_id: int, eoi_id: int
+) -> _mask_mod_signature:
+    """Build a mask mod that lets tokens of the same image attend to each other.
+
+    Args:
+        batch: Token-id tensor with shape [b, s] (it is compared against the
+            ids below and indexed as ``[batch_index, position]``)
+        soi_id: Token ID that opens an image span
+        eoi_id: Token ID that closes an image span
+    Returns:
+        A mask modifier that is True only when the query and key positions both
+        lie inside the same image span, independent of their relative order.
+    """
+    soi_mask = batch == soi_id
+    eoi_mask = batch == eoi_id
+    acc_soi_mask = torch.cumsum(soi_mask, dim=1)
+    acc_eoi_mask = torch.cumsum(eoi_mask, dim=1)
+    # Select every token from a soi_id up to, but not including, the matching eoi_id
+    img_mask = (acc_soi_mask - acc_eoi_mask) > 0
+    # Create a tensor that assigns each token to its image number
+    # Each image starts with SOI token, so we can use acc_soi_mask to track image numbers
+    # (tokens outside any image span get index 0)
+    img_indices = acc_soi_mask * img_mask
+    def image_prefix_mask_mod(b, h, q_idx, kv_idx):
+        # Check if both tokens are image tokens and belong to the same image
+        is_img_tokens = img_mask[b, q_idx] & img_mask[b, kv_idx]
+        is_same_image = img_indices[b, q_idx] == img_indices[b, kv_idx]
+        return is_img_tokens & is_same_image
+    return image_prefix_mask_mod
+# Compiled version of `create_block_mask`, used by `create_attention_mask`.
+# Note: can't use mode = 'reduce-overhead' here because it uses internal CUDA
+# graph trees on private streams, causing manual capture to record empty graphs.
+_compiled_create_block_mask = torch.compile(
+    create_block_mask, dynamic=True
+)
+@torch.inference_mode()
+def create_attention_mask(*args, **kwargs):
+    """
+    Build a block mask by forwarding all positional and keyword arguments,
+    unchanged, to the compiled ``create_block_mask`` and returning its result.
+
+    NOTE: We compile this for performance/memory reasons in large masks. To reduce
+    recompiles due to grad_mode flips, we always run mask creation under inference_mode.
+    """
+    return _compiled_create_block_mask(*args, **kwargs)

config.json ADDED Viewed

	@@ -0,0 +1,41 @@

+{
+  "architectures": [
+    "FalconOCRForCausalLM"
+  ],
+  "auto_map": {
+    "AutoConfig": "configuration_falcon_ocr.FalconOCRConfig",
+    "AutoModelForCausalLM": "modeling_falcon_ocr.FalconOCRForCausalLM"
+  },
+  "model_type": "falcon_ocr",
+  "torch_dtype": "bfloat16",
+  "dim": 768,
+  "n_layers": 22,
+  "n_heads": 16,
+  "head_dim": 64,
+  "n_kv_heads": 8,
+  "vocab_size": 65536,
+  "ffn_dim": 2304,
+  "norm_eps": 1e-05,
+  "max_seq_len": 8192,
+  "rope_theta": 10000,
+  "channel_size": 3,
+  "spatial_patch_size": 16,
+  "temporal_patch_size": 1,
+  "img_id": 227,
+  "eos_id": 11,
+  "image_cls_token_id": 244,
+  "image_mask_token_id": 243,
+  "image_reg_1_token_id": 245,
+  "image_reg_2_token_id": 246,
+  "image_reg_3_token_id": 247,
+  "image_reg_4_token_id": 248,
+  "img_start_id": 229,
+  "img_end_id": 230,
+  "img_row_sep_id": 228,
+  "vid_start_id": 231,
+  "vid_end_id": 232,
+  "frame_sep_id": 233
+}

configuration_falcon_ocr.py ADDED Viewed

	@@ -0,0 +1,65 @@

+from transformers import PretrainedConfig
+class FalconOCRConfig(PretrainedConfig):
+    """Configuration for the Falcon OCR model.
+
+    Every constructor argument is stored on the instance under the same name;
+    any remaining keyword arguments are forwarded to ``PretrainedConfig``.
+
+    Args:
+        dim: Model hidden size.
+        n_layers: Number of transformer blocks.
+        n_heads: Number of attention (query) heads.
+        head_dim: Size of each attention head.
+        n_kv_heads: Number of key/value heads.
+        vocab_size: Size of the token vocabulary.
+        ffn_dim: Hidden size of the feed-forward layers.
+        norm_eps: Epsilon used by RMS normalization.
+        max_seq_len: Maximum supported sequence length.
+        rope_theta: Base used when precomputing rotary frequencies.
+        channel_size: Number of image channels per pixel.
+        spatial_patch_size: Side length, in pixels, of a square image patch.
+        temporal_patch_size: Number of frames grouped into one patch.
+        img_id: Token ID of the image-patch placeholder token.
+        eos_id: End-of-sequence token ID.
+        image_cls_token_id: Token ID of the image CLS token.
+        image_mask_token_id: Token ID of the image mask token.
+        image_reg_1_token_id: Token ID of image register token 1.
+        image_reg_2_token_id: Token ID of image register token 2.
+        image_reg_3_token_id: Token ID of image register token 3.
+        image_reg_4_token_id: Token ID of image register token 4.
+        img_start_id: Token ID marking the start of an image.
+        img_end_id: Token ID marking the end of an image.
+        img_row_sep_id: Token ID of the image row separator
+            (presumably separates rows of patches - TODO confirm).
+        vid_start_id: Token ID marking the start of a video.
+        vid_end_id: Token ID marking the end of a video.
+        frame_sep_id: Token ID of the frame separator.
+        **kwargs: Forwarded to ``PretrainedConfig.__init__``.
+    """
+    model_type = "falcon_ocr"
+    def __init__(
+        self,
+        dim: int = 768,
+        n_layers: int = 22,
+        n_heads: int = 16,
+        head_dim: int = 64,
+        n_kv_heads: int = 8,
+        vocab_size: int = 65536,
+        ffn_dim: int = 2304,
+        norm_eps: float = 1e-5,
+        max_seq_len: int = 8192,
+        rope_theta: int = 10000,
+        channel_size: int = 3,
+        spatial_patch_size: int = 16,
+        temporal_patch_size: int = 1,
+        img_id: int = 227,
+        eos_id: int = 11,
+        image_cls_token_id: int = 244,
+        image_mask_token_id: int = 243,
+        image_reg_1_token_id: int = 245,
+        image_reg_2_token_id: int = 246,
+        image_reg_3_token_id: int = 247,
+        image_reg_4_token_id: int = 248,
+        img_start_id: int = 229,
+        img_end_id: int = 230,
+        img_row_sep_id: int = 228,
+        vid_start_id: int = 231,
+        vid_end_id: int = 232,
+        frame_sep_id: int = 233,
+        **kwargs,
+    ):
+        # Transformer dimensions.
+        self.dim = dim
+        self.n_layers = n_layers
+        self.n_heads = n_heads
+        self.head_dim = head_dim
+        self.n_kv_heads = n_kv_heads
+        self.vocab_size = vocab_size
+        self.ffn_dim = ffn_dim
+        self.norm_eps = norm_eps
+        self.max_seq_len = max_seq_len
+        self.rope_theta = rope_theta
+        # Image patching.
+        self.channel_size = channel_size
+        self.spatial_patch_size = spatial_patch_size
+        self.temporal_patch_size = temporal_patch_size
+        # Special token ids.
+        self.img_id = img_id
+        self.eos_id = eos_id
+        self.image_cls_token_id = image_cls_token_id
+        self.image_mask_token_id = image_mask_token_id
+        self.image_reg_1_token_id = image_reg_1_token_id
+        self.image_reg_2_token_id = image_reg_2_token_id
+        self.image_reg_3_token_id = image_reg_3_token_id
+        self.image_reg_4_token_id = image_reg_4_token_id
+        self.img_start_id = img_start_id
+        self.img_end_id = img_end_id
+        self.img_row_sep_id = img_row_sep_id
+        self.vid_start_id = vid_start_id
+        self.vid_end_id = vid_end_id
+        self.frame_sep_id = frame_sep_id
+        # The base class is initialised last, after all model fields are set.
+        super().__init__(**kwargs)

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:aa99dfc0a738bb1499d40e24d90ced8311515c2f09516a9dad23662f91a2e63e
+size 1079796208

model_args.json ADDED Viewed

	@@ -0,0 +1,41 @@

+{
+  "channel_size": 3,
+  "coord_dec_dim": 8192,
+  "coord_enc_dim": 512,
+  "coord_out_dim": 2048,
+  "coord_token_id": 240,
+  "dim": 768,
+  "do_segmentation": false,
+  "eos_id": 11,
+  "ffn_dim": 2304,
+  "frame_sep_id": 233,
+  "head_dim": 64,
+  "image_cls_token_id": 244,
+  "image_mask_token_id": 243,
+  "image_reg_1_token_id": 245,
+  "image_reg_2_token_id": 246,
+  "image_reg_3_token_id": 247,
+  "image_reg_4_token_id": 248,
+  "img_end_id": 230,
+  "img_id": 227,
+  "img_row_sep_id": 228,
+  "img_start_id": 229,
+  "max_seq_len": 8192,
+  "n_heads": 16,
+  "n_kv_heads": 8,
+  "n_layers": 22,
+  "norm_eps": 1e-05,
+  "num_segm_layers": 3,
+  "rope_theta": 10000,
+  "seg_token_id": 262,
+  "segm_out_dim": 256,
+  "size_dec_dim": 8192,
+  "size_enc_dim": 512,
+  "size_out_dim": 2048,
+  "size_token_id": 241,
+  "spatial_patch_size": 16,
+  "temporal_patch_size": 1,
+  "vid_end_id": 232,
+  "vid_start_id": 231,
+  "vocab_size": 65536
+}

modeling_falcon_ocr.py ADDED Viewed

	@@ -0,0 +1,586 @@

+import math
+from pathlib import Path
+import einops as E
+import torch
+import torch.nn.functional as F
+import triton
+import triton.language as tl
+from PIL import Image
+from torch import Tensor as T
+from torch import nn
+from torch.nn.attention.flex_attention import (
+    AuxRequest,
+    BlockMask,
+    and_masks,
+    or_masks,
+)
+from transformers import AutoTokenizer, PreTrainedModel
+from .attention import (
+    compiled_flex_attn_decode,
+    compiled_flex_attn_prefill,
+    create_attention_mask,
+    get_causal_mask_mod,
+    get_document_mask_mod,
+    get_image_prefix_mask_mod,
+    get_non_left_pad_mask_mod,
+    offset_mask_mod,
+)
+from .configuration_falcon_ocr import FalconOCRConfig
+from .processing_falcon_ocr import load_image, process_batch
+from .rope import (
+    apply_3d_rotary_emb,
+    apply_golden_freqs_cis_to_visual_pos,
+    precompute_freqs_cis,
+)
+# Maps an OCR category name to the instruction text placed in the prompt.
+# "plain" and "text" share the same instruction; `generate` looks categories up
+# case-insensitively and falls back to the "plain" entry for unknown names.
+CATEGORY_PROMPTS = {
+    "plain": "Extract the text content from this image.",
+    "formula": "Extract the formula content from this image.",
+    "table": "Extract the table content from this image.",
+    "text": "Extract the text content from this image.",
+    "caption": "Extract the caption content from this image.",
+    "footnote": "Extract the footnote content from this image.",
+    "list-item": "Extract the list-item content from this image.",
+    "page-footer": "Extract the page-footer content from this image.",
+    "page-header": "Extract the page-header content from this image.",
+    "section-header": "Extract the section-header content from this image.",
+    "title": "Extract the title content from this image.",
+}
+# ---------------------------------------------------------------------------
+# Sub-modules: Attention
+# ---------------------------------------------------------------------------
+def repeat_kv(x: torch.Tensor, n_rep: int) -> torch.Tensor:
+    B, S, H, D = x.shape
+    if n_rep == 1:
+        return x
+    return torch.unsqueeze(x, dim=3).expand(B, S, H, n_rep, D).reshape(B, S, H * n_rep, D)
+class Attention(nn.Module):
+    def __init__(self, config: FalconOCRConfig, layer_id: int):
+        super().__init__()
+        self.layer_id = layer_id
+        self.n_kv_heads = config.n_kv_heads or config.n_heads
+        self.n_rep = config.n_heads // self.n_kv_heads
+        self.head_dim = config.head_dim or config.dim // config.n_heads
+        self.q_dim = config.n_heads * self.head_dim
+        self.kv_dim = self.n_kv_heads * self.head_dim
+        self.wq = nn.Linear(config.dim, self.q_dim, bias=False)
+        self.wk = nn.Linear(config.dim, self.kv_dim, bias=False)
+        self.wv = nn.Linear(config.dim, self.kv_dim, bias=False)
+        self.wo = nn.Linear(config.n_heads * self.head_dim, config.dim, bias=False)
+        self.sinks = nn.Parameter(torch.empty((config.n_heads,)))
+    def _fuse_weights(self):
+        wqkv_weight = torch.cat([self.wq.weight.data, self.wk.weight.data, self.wv.weight.data], dim=0)
+        self.register_buffer("_wqkv_weight", wqkv_weight)
+        del self.wq, self.wk, self.wv
+    def _pre_attention_qkv(self, x) -> tuple[T, T, T]:
+        qkv = F.linear(F.rms_norm(x, (x.size(-1),)), self._wqkv_weight)
+        xq, xk, xv = qkv.split([self.q_dim, self.kv_dim, self.kv_dim], dim=-1)
+        xq = E.rearrange(xq, "b s (h d) -> b s h d", d=self.head_dim)
+        xk = E.rearrange(xk, "b s (h d) -> b s h d", d=self.head_dim)
+        xv = E.rearrange(xv, "b s (h d) -> b s h d", d=self.head_dim)
+        xq = F.rms_norm(xq, (xq.size(-1),))
+        xk = F.rms_norm(xk, (xk.size(-1),))
+        xk = repeat_kv(xk, n_rep=self.n_rep)
+        xv = repeat_kv(xv, n_rep=self.n_rep)
+        return xq, xk, xv
+    def _post_attention(self, output: T, lse: T) -> T:
+        sinks_BHS = self.sinks.view(1, -1, 1)
+        sink_scale = torch.sigmoid(lse - sinks_BHS)
+        output = (output * sink_scale.unsqueeze(-1)).to(output.dtype)
+        output = output.permute(0, 2, 1, 3).contiguous().flatten(2)
+        return self.wo(output)
+    def compile_attention(self, *, dynamic: bool = True, mode: str = "default"):
+        self._pre_attention_qkv = torch.compile(self._pre_attention_qkv, dynamic=dynamic, mode=mode)
+        self._post_attention = torch.compile(self._post_attention, dynamic=dynamic, mode=mode)
+    def forward(
+        self, x: T, attention_masks: BlockMask, freqs_cis: T,
+        freqs_cis_2d: T | None = None, pos_hw: T | None = None,
+        kv_cache=None, input_pos=None, batch_idx=None,
+        flex_attn_kernel_options=None,
+    ):
+        xq, xk, xv = self._pre_attention_qkv(x)
+        xq, xk = apply_3d_rotary_emb(xq, xk, freqs_cis, freqs_cis_2d, pos_hw)
+        xq = E.rearrange(xq, "b s h d -> b h s d")
+        xk = E.rearrange(xk, "b s h d -> b h s d")
+        xv = E.rearrange(xv, "b s h d -> b h s d")
+        xk, xv = kv_cache.insert_kv(self.layer_id, xk, xv, input_pos=input_pos, batch_idx=batch_idx)
+        flex_fn = compiled_flex_attn_decode if xq.shape[2] == 1 else compiled_flex_attn_prefill
+        output, aux_output = flex_fn(xq, xk, xv, block_mask=attention_masks, return_aux=AuxRequest(lse=True))
+        return self._post_attention(output, aux_output.lse)
+# ---------------------------------------------------------------------------
+# Sub-modules: FeedForward
+# ---------------------------------------------------------------------------
+# Triton kernel computing, for every output element (row, col):
+#   out[row, col] = relu(packed[row, 2 * col]) ** 2 * packed[row, 2 * col + 1]
+# The packed input interleaves "gate" values (even columns) with "up" values
+# (odd columns). Each program handles BLOCK_SIZE consecutive elements of the
+# flattened n_rows * n_cols output index space.
+@triton.jit
+def _squared_relu_gate_kernel(
+    packed_ptr, out_ptr, n_rows, n_cols,
+    in_row_stride, in_col_stride, out_row_stride, out_col_stride,
+    BLOCK_SIZE: tl.constexpr,
+):
+    pid = tl.program_id(0)
+    n_elements = n_rows * n_cols
+    offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
+    # Guard the tail block: offsets past the last element are neither read nor written.
+    mask = offsets < n_elements
+    # Recover the 2-D output coordinates from the flat offset.
+    rows = offsets // n_cols
+    cols = offsets % n_cols
+    # Gate lives at input column 2 * col, up at 2 * col + 1.
+    gate_idx = rows * in_row_stride + (2 * cols) * in_col_stride
+    up_idx = rows * in_row_stride + (2 * cols + 1) * in_col_stride
+    out_idx = rows * out_row_stride + cols * out_col_stride
+    gate = tl.load(packed_ptr + gate_idx, mask=mask)
+    up = tl.load(packed_ptr + up_idx, mask=mask)
+    # ReLU on the gate, then square it and multiply by the up value.
+    gate = tl.where(gate > 0, gate, 0.0)
+    out = gate * gate * up
+    tl.store(out_ptr + out_idx, out, mask=mask)
+def squared_relu_gate(packed: T, hidden_dim: int) -> T:
+    packed_2d = packed.flatten(0, -2)
+    n_rows = packed_2d.shape[0]
+    n_cols = hidden_dim
+    out_2d = torch.empty((n_rows, n_cols), device=packed.device, dtype=packed.dtype)
+    n = n_rows * n_cols
+    grid = lambda meta: (triton.cdiv(n, meta["BLOCK_SIZE"]),)
+    _squared_relu_gate_kernel[grid](
+        packed_2d, out_2d, n_rows, n_cols,
+        packed_2d.stride(0), packed_2d.stride(1),
+        out_2d.stride(0), out_2d.stride(1),
+        BLOCK_SIZE=1024,
+    )
+    return out_2d.view(*packed.shape[:-1], hidden_dim)
+class FeedForward(nn.Module):
+    def __init__(self, dim: int, hidden_dim: int):
+        super().__init__()
+        self.w1 = nn.Linear(dim, hidden_dim, bias=False)
+        self.w2 = nn.Linear(hidden_dim, dim, bias=False)
+        self.w3 = nn.Linear(dim, hidden_dim, bias=False)
+        self.hidden_dim = hidden_dim
+    def _fuse_weights(self):
+        if hasattr(self, "_w13_weight"):
+            return
+        w1_weight_fused = self.w1.weight.data * math.sqrt(2.0)
+        w13_weight = torch.empty(
+            (2 * self.hidden_dim, self.w1.weight.shape[1]),
+            device=w1_weight_fused.device, dtype=w1_weight_fused.dtype,
+        )
+        w13_weight[0::2] = w1_weight_fused
+        w13_weight[1::2] = self.w3.weight.data
+        self.register_buffer("_w13_weight", w13_weight)
+        del self.w1, self.w3
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = F.rms_norm(x, (x.size(-1),))
+        w13_out = F.linear(x, self._w13_weight)
+        return self.w2(squared_relu_gate(w13_out, self.hidden_dim))
+# ---------------------------------------------------------------------------
+# Sub-modules: TransformerBlock
+# ---------------------------------------------------------------------------
+class TransformerBlock(nn.Module):
+    def __init__(self, layer_id: int, config: FalconOCRConfig):
+        super().__init__()
+        self.attention = Attention(config, layer_id)
+        self.feed_forward = FeedForward(config.dim, config.ffn_dim)
+    def compile(self, *, dynamic: bool = True, mode: str = "default"):
+        self.feed_forward = torch.compile(self.feed_forward, dynamic=dynamic, mode=mode)
+        self.attention.compile_attention(dynamic=dynamic, mode=mode)
+        return self
+    def forward(
+        self, x: T, freqs_cis: T, freqs_cis_2d: T | None = None,
+        pos_hw: T | None = None, attention_masks=None, kv_cache=None,
+        input_pos=None, batch_idx=None, flex_attn_kernel_options=None,
+    ):
+        B, S, D = x.shape
+        x = x + self.attention(
+            x, freqs_cis=freqs_cis, freqs_cis_2d=freqs_cis_2d, pos_hw=pos_hw,
+            attention_masks=attention_masks, kv_cache=kv_cache,
+            input_pos=input_pos, batch_idx=batch_idx,
+            flex_attn_kernel_options=flex_attn_kernel_options,
+        )
+        out = x + self.feed_forward(x)
+        return out.reshape(B, S, D)
+# ---------------------------------------------------------------------------
+# KV Cache
+# ---------------------------------------------------------------------------
+class KVCache:
+    def __init__(self, max_batch_size, max_seq_length, n_heads, head_dim, num_layers):
+        self.kv_shape = (num_layers, 2, max_batch_size, n_heads, max_seq_length, head_dim)
+        self.kv_cache = None
+        self.pos = 0
+        self.pos_t: T | None = None
+    def reset(self):
+        self.pos = 0
+        self.pos_t = None
+    def get_pos(self):
+        return self.pos
+    def set_pos_t(self, pos_t):
+        self.pos_t = pos_t
+    def increment_and_get_pos_t(self):
+        assert self.pos_t is not None
+        self.pos_t += 1
+        return self.pos_t
+    def insert_kv(self, layer_id: int, k: T, v: T, **kwargs):
+        del kwargs
+        assert self.pos_t is not None
+        if self.kv_cache is None:
+            self.kv_cache = torch.empty(self.kv_shape, dtype=k.dtype, device=k.device)
+        B, H, T_add, D = k.size()
+        t0, t1 = self.pos, self.pos + T_add
+        self.kv_cache[layer_id, 0, :, :, t0:t1] = k
+        self.kv_cache[layer_id, 1, :, :, t0:t1] = v
+        key_view = self.kv_cache[layer_id, 0, :, :, :t1]
+        value_view = self.kv_cache[layer_id, 1, :, :, :t1]
+        if layer_id == self.kv_cache.size(0) - 1:
+            self.pos = t1
+        return key_view, value_view
+# ---------------------------------------------------------------------------
+# Sampling
+# ---------------------------------------------------------------------------
+@torch.inference_mode()
+def sample_next_token(logits, rng, temperature=0.0, top_k=None):
+    assert temperature >= 0.0
+    if temperature == 0.0:
+        return torch.argmax(logits, dim=-1, keepdim=True)
+    if top_k is not None:
+        k = min(top_k, logits.size(-1))
+        vals, idx = torch.topk(logits, k, dim=-1)
+        vals = vals / temperature
+        probs = F.softmax(vals, dim=-1)
+        choice = torch.multinomial(probs, num_samples=1, generator=rng)
+        return idx.gather(1, choice)
+    logits = logits / temperature
+    probs = F.softmax(logits, dim=-1)
+    return torch.multinomial(probs, num_samples=1, generator=rng)
+# ---------------------------------------------------------------------------
+# Main Model
+# ---------------------------------------------------------------------------
+class FalconOCRForCausalLM(PreTrainedModel):
+    config_class = FalconOCRConfig
+    _no_split_modules = ["TransformerBlock"]
+    def __init__(self, config: FalconOCRConfig):
+        super().__init__(config)
+        img_in_dim = config.temporal_patch_size * config.spatial_patch_size ** 2 * config.channel_size
+        self.img_projector = nn.Linear(img_in_dim, config.dim, bias=False)
+        self.tok_embeddings = nn.Embedding(config.vocab_size, config.dim)
+        self.layers = nn.ModuleDict()
+        for layer_id in range(config.n_layers):
+            self.layers[str(layer_id)] = TransformerBlock(layer_id, config)
+        self.norm = nn.RMSNorm(config.dim, eps=config.norm_eps)
+        self.output = nn.Linear(config.dim, config.vocab_size, bias=False)
+        rope_dim = config.head_dim // 2
+        freqs_cis = precompute_freqs_cis(rope_dim, config.max_seq_len, config.rope_theta)
+        freqs_cis_golden = torch.empty((config.n_heads, rope_dim // 2, 2), dtype=torch.float)
+        self.register_buffer("freqs_cis", freqs_cis, persistent=False)
+        self.register_buffer("freqs_cis_golden", freqs_cis_golden, persistent=True)
+        self._weights_fused = False
+        self._is_compiled = False
+        self.post_init()
+    # -- Weight management ---------------------------------------------------
+    def _fuse_weights(self):
+        if self._weights_fused:
+            return
+        device = self.tok_embeddings.weight.device
+        c = self.config
+        rope_dim = c.head_dim // 2
+        freqs_cis = precompute_freqs_cis(rope_dim, c.max_seq_len, c.rope_theta).to(device)
+        self.register_buffer("freqs_cis", freqs_cis, persistent=False)
+        if self.freqs_cis_golden.device != device:
+            self.freqs_cis_golden = self.freqs_cis_golden.to(device)
+        for layer in self.layers.values():
+            layer.attention._fuse_weights()
+            layer.feed_forward._fuse_weights()
+        self._weights_fused = True
+    def compile_model(self):
+        if self._is_compiled:
+            return
+        torch._inductor.config.triton.cudagraphs = False
+        for layer in self.layers.values():
+            layer.compile(dynamic=True, mode="default")
+        self._is_compiled = True
+    # -- Tokenizer -----------------------------------------------------------
+    def _get_tokenizer(self):
+        if not hasattr(self, "_tokenizer"):
+            import os
+            path = self.config._name_or_path
+            is_local = os.path.exists(path)
+            self._tokenizer = AutoTokenizer.from_pretrained(path, local_files_only=is_local, trust_remote_code=True)
+            for token_name, token in self._tokenizer.special_tokens_map.items():
+                if isinstance(token, str):
+                    setattr(self._tokenizer, token_name, token)
+                    setattr(
+                        self._tokenizer, token_name + "_id",
+                        self._tokenizer.convert_tokens_to_ids(token),
+                    )
+        return self._tokenizer
+    # -- Attention mask ------------------------------------------------------
+    def get_attention_mask(self, input_batch: T, max_len: int | None = None):
+        B, S = input_batch.size()
+        c = self.config
+        block_causal_mask_mod = and_masks(
+            get_causal_mask_mod(),
+            get_document_mask_mod(input_batch, c.eos_id),
+            get_non_left_pad_mask_mod(input_batch, self._pad_token_id),
+        )
+        image_prefix_mask_mod = get_image_prefix_mask_mod(
+            batch=input_batch, soi_id=c.image_cls_token_id, eoi_id=c.img_end_id,
+        )
+        mask_mod = or_masks(image_prefix_mask_mod, block_causal_mask_mod)
+        max_len = max_len or S
+        return create_attention_mask(mask_mod, B, None, max_len, max_len)
+    # -- Embedding helpers ---------------------------------------------------
+    def _scatter_img_tokens_with_projector(self, h_BSD, pixel_patches_NLC, pixel_masks_NTHW, tokens_BS):
+        """Replace the embeddings at image-token positions with projected patches.
+
+        Args:
+            h_BSD: Token embeddings, unpacked below as (B, S, D).
+            pixel_patches_NLC: Flattened pixel patches, rearranged below as "n p c".
+            pixel_masks_NTHW: Per-pixel mask, reduced below over patch-sized
+                windows of its "n t h w" axes.
+            tokens_BS: Token ids; positions equal to ``config.img_id`` receive
+                the projected patch features.
+        Returns:
+            A tensor like ``h_BSD`` with the image positions overwritten.
+        """
+        B, S, D = h_BSD.shape
+        # A patch counts as valid when any pixel inside it is set in the mask.
+        pixel_patch_mask = E.reduce(
+            pixel_masks_NTHW,
+            "n (t pt) (h ph) (w pw) -> (n t h w)",
+            reduction="any",
+            pt=self.config.temporal_patch_size,
+            ph=self.config.spatial_patch_size,
+            pw=self.config.spatial_patch_size,
+        )
+        # Flatten all patches of all images so they line up with the flat mask.
+        pixel_patches_flat = E.rearrange(pixel_patches_NLC, "n p c -> (n p) c")
+        valid_patches = pixel_patches_flat[pixel_patch_mask]
+        valid_feats = self.img_projector(valid_patches)
+        # Broadcast the image-token mask over the hidden dimension.
+        img_mask_h_BSD = E.repeat(tokens_BS == self.config.img_id, "b s -> b s d", d=D)
+        # The number of projected values must match the number of slots to fill.
+        assert valid_feats.numel() == img_mask_h_BSD.sum()
+        return torch.masked_scatter(h_BSD, img_mask_h_BSD, valid_feats)
+    # -- Core forward --------------------------------------------------------
+    def forward(
+        self,
+        tokens: T,
+        attention_mask: BlockMask,
+        kv_cache,
+        rope_pos_t: T | None = None,
+        rope_pos_hw: T | None = None,
+        pixel_values: T | None = None,
+        pixel_mask: T | None = None,
+    ):
+        """Run the model on ``tokens`` and return logits for every position.
+
+        A call with more than one token per row is treated as prefill and needs
+        ``rope_pos_t`` and ``rope_pos_hw``; a call with exactly one token per
+        row is treated as a decode step and takes its position from ``kv_cache``.
+
+        Args:
+            tokens: Token ids, unpacked below as (B, S).
+            attention_mask: Block mask covering the full (padded) sequence.
+            kv_cache: Cache object; its write offset selects the current position.
+            rope_pos_t: Temporal rotary positions (prefill only).
+            rope_pos_hw: Spatial rotary positions (prefill only).
+            pixel_values: Optional image pixels to project into the sequence.
+            pixel_mask: Pixel validity mask; required with ``pixel_values``.
+        Returns:
+            The output of the final projection applied to the normalised hidden states.
+        """
+        B, S = tokens.size()
+        c = self.config
+        block_mask = attention_mask
+        T_pos = kv_cache.get_pos()
+        is_prefill = S != 1
+        if is_prefill:
+            assert rope_pos_t is not None and rope_pos_hw is not None
+            # Slice the positions for the tokens being processed in this call.
+            pos_t = rope_pos_t[:, T_pos:T_pos + S].long()
+            # Remember the last temporal position; decode steps increment it.
+            # NOTE(review): this is a view of pos_t, and the cache increments it
+            # in place - confirm that writing through to rope_pos_t is intended.
+            kv_cache.pos_t = pos_t[:, -1:]
+            freqs_cis = self.freqs_cis[pos_t]
+            rope_pos_hw = rope_pos_hw[:, T_pos:T_pos + S]
+            freqs_cis_golden = apply_golden_freqs_cis_to_visual_pos(self.freqs_cis_golden, rope_pos_hw)
+            block_mask.seq_lengths = (S, S)
+        else:
+            pos_t = kv_cache.increment_and_get_pos_t()
+            freqs_cis = self.freqs_cis[pos_t]
+            freqs_cis_golden = None
+            # Keep only the query-block row that contains the current position.
+            block_idx = T_pos // block_mask.BLOCK_SIZE[0]
+            block_mask = block_mask[:, :, block_idx]
+            block_mask.seq_lengths = (S, T_pos + S)
+            # The single query sits at absolute position T_pos, so shift the mask mod.
+            block_mask.mask_mod = offset_mask_mod(attention_mask.mask_mod, offset=T_pos)
+        h_BSD = self.tok_embeddings(tokens)
+        if pixel_values is not None:
+            assert pixel_mask is not None
+            pixel_values = pixel_values.to(self.dtype)
+            pixel_mask = pixel_mask.to(self.dtype)
+            # Cut the pixels into patches and flatten each patch into one vector.
+            pixel_patches_NLC = E.rearrange(
+                pixel_values,
+                "n (t pt) (h ph) (w pw) c -> n (t h w) (pt ph pw c)",
+                pt=c.temporal_patch_size, ph=c.spatial_patch_size, pw=c.spatial_patch_size,
+            )
+            h_BSD = self._scatter_img_tokens_with_projector(h_BSD, pixel_patches_NLC, pixel_mask, tokens)
+        for layer in self.layers.values():
+            h_BSD = layer(
+                h_BSD, freqs_cis=freqs_cis, freqs_cis_2d=freqs_cis_golden,
+                pos_hw=rope_pos_hw, attention_masks=block_mask, kv_cache=kv_cache,
+            )
+        h_BSD = self.norm(h_BSD)
+        logits_BSV = self.output(h_BSD)
+        return logits_BSV
+    # -- Main API: generate --------------------------------------------------
+    @torch.inference_mode()
+    def generate(
+        self,
+        images,
+        *,
+        category: str | list[str] = "plain",
+        max_new_tokens: int = 4096,
+        temperature: float = 0.0,
+        top_k: int | None = None,
+        min_dimension: int = 64,
+        max_dimension: int = 1024,
+        compile: bool = True,
+        seed: int | None = 42,
+    ) -> list[str]:
+        """
+        Extract text from document images.
+        Args:
+            images: Single PIL Image (or path/URL) or list of them.
+            category: OCR category — one of "plain", "text", "table", "formula",
+                "caption", "footnote", "list-item", "page-footer", "page-header",
+                "section-header", "title". Can be a single string (applied to all
+                images) or a list (one per image).
+            max_new_tokens: Maximum generation steps.
+            temperature: Sampling temperature (0.0 = greedy).
+            top_k: Top-k sampling (None = disabled).
+            min_dimension: Min image side after resize.
+            max_dimension: Max image side after resize.
+            compile: Whether to torch.compile on first call.
+            seed: Random seed for reproducibility (None = non-deterministic).
+        Returns:
+            List of extracted text strings, one per image.
+        """
+        self._fuse_weights()
+        if compile:
+            self.compile_model()
+        if isinstance(images, (str, Path, Image.Image)):
+            images = [images]
+        if isinstance(category, str):
+            category = [category] * len(images)
+        assert len(images) == len(category), "Must provide one category per image"
+        device = self.device
+        tokenizer = self._get_tokenizer()
+        self._pad_token_id = tokenizer.convert_tokens_to_ids("<|pad|>")
+        stop_token_ids = [self.config.eos_id, tokenizer.convert_tokens_to_ids("<|end_of_query|>")]
+        image_prompt_pairs = []
+        for img, cat in zip(images, category):
+            instruction = CATEGORY_PROMPTS.get(cat.strip().lower(), CATEGORY_PROMPTS["plain"])
+            prompt = f"<|image|>{instruction}\n<|OCR_PLAIN|>"
+            image_prompt_pairs.append((img, prompt))
+        batch_inputs = process_batch(
+            tokenizer, self.config, image_prompt_pairs,
+            max_length=4096, min_dimension=min_dimension, max_dimension=max_dimension,
+        )
+        batch_inputs = {k: (v.to(device) if torch.is_tensor(v) else v) for k, v in batch_inputs.items()}
+        tokens = batch_inputs["tokens"]
+        B, L = tokens.size()
+        block_size = 128
+        S = (L + max_new_tokens + block_size - 1) // block_size * block_size
+        assert S <= self.config.max_seq_len
+        rng = torch.Generator(device).manual_seed(seed) if seed is not None else None
+        kv_cache = KVCache(
+            max_batch_size=B, max_seq_length=S, n_heads=self.config.n_heads,
+            head_dim=self.config.head_dim, num_layers=self.config.n_layers,
+        )
+        padded_tokens = torch.full((B, S), self._pad_token_id, dtype=tokens.dtype, device=device)
+        padded_tokens[:, :L] = tokens
+        attention_mask = self.get_attention_mask(padded_tokens, max_len=S)
+        # Prefill
+        logits_BSV = self.forward(
+            tokens=tokens, rope_pos_t=batch_inputs["pos_t"], rope_pos_hw=batch_inputs["pos_hw"],
+            attention_mask=attention_mask, kv_cache=kv_cache,
+            pixel_values=batch_inputs["pixel_values"], pixel_mask=batch_inputs["pixel_mask"],
+        )
+        stop_ids = torch.tensor(stop_token_ids).to(device)
+        should_stop_B = torch.full((B,), False, dtype=torch.bool, device=device)
+        generated_ids: list[list[int]] = [[] for _ in range(B)]
+        # Decode loop
+        while not torch.all(should_stop_B) and (pos := kv_cache.get_pos()) < S:
+            tokens_B1 = sample_next_token(logits_BSV[:, -1], rng, temperature, top_k)
+            if torch.any(should_stop_B):
+                tokens_B1 = tokens_B1.clone()
+                tokens_B1[should_stop_B, :] = self._pad_token_id
+            padded_tokens[:, pos] = tokens_B1[:, -1]
+            for b in range(B):
+                if not should_stop_B[b]:
+                    generated_ids[b].append(tokens_B1[b, 0].item())
+            logits_BSV = self.forward(
+                tokens=tokens_B1, attention_mask=attention_mask, kv_cache=kv_cache,
+            )
+            hit_stop_B = torch.isin(tokens_B1, stop_ids).any(dim=-1)
+            should_stop_B = should_stop_B.logical_or(hit_stop_B)
+        # Decode tokens to text
+        results = []
+        for b in range(B):
+            text = tokenizer.decode(generated_ids[b], skip_special_tokens=False)
+            text = (
+                text
+                .replace("<|end_of_query|>", "")
+                .replace("<|end_of_text|>", "")
+                .strip()
+            )
+            results.append(text)
+        return results

processing_falcon_ocr.py ADDED Viewed

	@@ -0,0 +1,423 @@

+import io
+import math
+import einops as E
+import numpy as np
+import requests
+import torch
+from PIL import Image
+from transformers.image_processing_utils import BaseImageProcessor
+from transformers.image_transforms import convert_to_rgb, resize
+from transformers.image_utils import (
+    ImageInput,
+    get_image_size,
+    infer_channel_dimension_format,
+    to_numpy_array,
+    valid_images,
+    validate_preprocess_arguments,
+)
+IMAGE_MEAN = [0.5, 0.5, 0.5]
+IMAGE_STD = [0.5, 0.5, 0.5]
+def load_image(image):
+    if image is None:
+        return None
+    if isinstance(image, Image.Image):
+        return image
+    if isinstance(image, str):
+        if image.startswith(("http://", "https://")):
+            response = requests.get(image, timeout=10)
+            response.raise_for_status()
+            return Image.open(io.BytesIO(response.content))
+        if image.endswith(".npy"):
+            img_array = io.BytesIO(np.load(image))
+            return Image.open(img_array)
+        return Image.open(image)
+    if isinstance(image, np.bytes_):
+        return Image.open(io.BytesIO(image))
+    if isinstance(image, np.ndarray):
+        return Image.fromarray(image)
+    raise TypeError(f"Unknown image format {image}")
+def load_images(images_input, min_dimension: int, max_dimension: int):
+    images = []
+    if images_input is not None:
+        for inp in images_input:
+            img = load_image(inp)
+            img = resize_image_if_necessary(img, min_dimension, max_dimension)
+            images.append(img)
+    return images
+def resize_image_if_necessary(
+    image,
+    shortest_dimension=224,
+    longest_dimension=896,
+):
+    original_width, original_height = image.size
+    aspect_ratio = original_width / original_height
+    if (
+        shortest_dimension <= original_width <= longest_dimension
+        and shortest_dimension <= original_height <= longest_dimension
+    ):
+        return image
+    is_vertical_image = original_width < original_height
+    if original_width < shortest_dimension or original_height < shortest_dimension:
+        if is_vertical_image:
+            new_width = shortest_dimension
+            new_height = int(new_width / aspect_ratio)
+        else:
+            new_height = shortest_dimension
+            new_width = int(new_height * aspect_ratio)
+    else:
+        if is_vertical_image:
+            new_width = longest_dimension
+            new_height = int(new_width / aspect_ratio)
+        else:
+            new_height = longest_dimension
+            new_width = int(new_height * aspect_ratio)
+    if new_width > longest_dimension:
+        new_width = longest_dimension
+        new_height = int(new_width / aspect_ratio)
+    if new_height > longest_dimension:
+        new_height = longest_dimension
+        new_width = int(new_height * aspect_ratio)
+    resized_image = image.resize((new_width, new_height))
+    return resized_image
+def smart_resize(
+    image,
+    factor: int,
+    resample,
+    input_data_format,
+    min_pixels: int = 56 * 56,
+    max_pixels: int = 14 * 14 * 4 * 1280,
+):
+    height, width = get_image_size(image, channel_dim=input_data_format)
+    if height < factor or width < factor:
+        raise ValueError(f"{height=} or {width=} must be larger than {factor=}")
+    if max(height, width) / min(height, width) > 200:
+        raise ValueError(
+            f"absolute aspect ratio must be smaller than 200, got {max(height, width) / min(height, width)}"
+        )
+    h_bar = round(height / factor) * factor
+    w_bar = round(width / factor) * factor
+    if h_bar * w_bar > max_pixels:
+        beta = np.sqrt((height * width) / max_pixels)
+        h_bar = math.floor(height / beta / factor) * factor
+        w_bar = math.floor(width / beta / factor) * factor
+    elif h_bar * w_bar < min_pixels:
+        beta = np.sqrt(min_pixels / (height * width))
+        h_bar = math.ceil(height * beta / factor) * factor
+        w_bar = math.ceil(width * beta / factor) * factor
+    image = resize(
+        image,
+        size=(h_bar, w_bar),
+        resample=resample,
+        input_data_format=input_data_format,
+    )
+    return image
+class ImageProcessor(BaseImageProcessor):
+    def __init__(
+        self,
+        patch_size,
+        merge_size,
+        do_resize: bool = True,
+        resample: Image.Resampling = Image.Resampling.BICUBIC,
+        do_rescale: bool = True,
+        rescale_factor: float = 1 / 255,
+        do_normalize: bool = True,
+        image_mean: float | list[float] | None = None,
+        image_std: float | list[float] | None = None,
+        do_convert_rgb: bool = True,
+        min_pixels: int = 56 * 56,
+        max_pixels: int = 28 * 28 * 1280,
+        **kwargs,
+    ) -> None:
+        super().__init__(**kwargs)
+        self.do_resize = do_resize
+        self.resample = resample
+        self.do_rescale = do_rescale
+        self.rescale_factor = rescale_factor
+        self.do_normalize = do_normalize
+        self.image_mean = image_mean or IMAGE_MEAN
+        self.image_std = image_std or IMAGE_STD
+        self.min_pixels = min_pixels
+        self.max_pixels = max_pixels
+        self.patch_size = patch_size
+        self.merge_size = merge_size
+        self.size = {"min_pixels": min_pixels, "max_pixels": max_pixels}
+        self.do_convert_rgb = do_convert_rgb
+        validate_preprocess_arguments(
+            rescale_factor=self.rescale_factor,
+            do_normalize=self.do_normalize,
+            image_mean=self.image_mean,
+            image_std=self.image_std,
+            do_resize=self.do_resize,
+            size=self.size,
+            resample=self.resample,
+        )
+    def _preprocess(self, image: ImageInput, do_rescale=None, do_normalize=None):
+        if self.do_convert_rgb:
+            image = convert_to_rgb(image)
+        image = to_numpy_array(image)
+        input_data_format = infer_channel_dimension_format(image)
+        if self.do_resize:
+            image = smart_resize(
+                image,
+                factor=self.patch_size * self.merge_size,
+                resample=self.resample,
+                input_data_format=input_data_format,
+                min_pixels=self.min_pixels,
+                max_pixels=self.max_pixels,
+            )
+        if do_rescale or self.do_rescale:
+            image = self.rescale(image, scale=self.rescale_factor, input_data_format=input_data_format)
+        if do_normalize or self.do_normalize:
+            image = self.normalize(
+                image=image, mean=self.image_mean, std=self.image_std,
+                input_data_format=input_data_format,
+            )
+        return image
+    def preprocess(self, images: list[ImageInput] | None, do_rescale=None, do_normalize=None, **kwargs):
+        del kwargs
+        if images is None:
+            return []
+        images = [item for item in images if item is not None]
+        if not valid_images(images):
+            raise ValueError(
+                "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
+                "torch.Tensor, tf.Tensor or jax.ndarray."
+            )
+        pixel_values = []
+        for image in images:
+            processed_image = self._preprocess(image, do_rescale, do_normalize)
+            processed_image = processed_image[None, ...]
+            pixel_values.append(processed_image)
+        return pixel_values
+    def batch_images_with_mask(self, pixel_values, max_image_height, max_image_width):
+        if pixel_values is None:
+            return None
+        pixel_values = [item for item in pixel_values if item is not None and len(item) != 0]
+        if len(pixel_values) == 0:
+            return None
+        pixel_values = [torch.from_numpy(img) for img in pixel_values]
+        max_temporal = max(img.shape[0] for img in pixel_values)
+        def pad_image_and_mask(img):
+            time_steps, height, width, channels = img.shape
+            if channels != 3:
+                raise ValueError(f"Expected 3-channel RGB images, got {channels} channels.")
+            padding = (0, 0, 0, max_image_width - width, 0, max_image_height - height, 0, max_temporal - time_steps)
+            padded_image = torch.nn.functional.pad(img, padding)
+            mask = torch.zeros((max_temporal, max_image_height, max_image_width), dtype=torch.long)
+            mask[:time_steps, :height, :width] = 1
+            return padded_image, mask
+        padded_pixel_values, padding_masks = zip(*[pad_image_and_mask(img) for img in pixel_values])
+        padded_pixel_values = torch.stack(list(padded_pixel_values))
+        padding_masks = torch.stack(list(padding_masks))
+        return {"pixel_values": padded_pixel_values, "padding_mask": padding_masks}
+# ---------------------------------------------------------------------------
+# Positional encoding helpers
+# ---------------------------------------------------------------------------
+def _compute_image_spatial_positions(
+    pixel_mask_THW: torch.Tensor,
+    spatial_patch_size: int,
+    temporal_patch_size: int = 1,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    mask_thw = E.reduce(
+        pixel_mask_THW,
+        "(t tp) (h hp) (w wp) -> t h w",
+        reduction="any",
+        tp=temporal_patch_size,
+        hp=spatial_patch_size,
+        wp=spatial_patch_size,
+    )
+    width = E.reduce(mask_thw.sum(dim=-1).int(), "t h -> ", reduction="max")
+    height = E.reduce(mask_thw.sum(dim=-2).int(), "t w -> ", reduction="max")
+    xlim = torch.sqrt(width / height)
+    ylim = torch.sqrt(height / width)
+    xpos = torch.linspace(-xlim, xlim, int(width))
+    ypos = torch.linspace(-ylim, ylim, int(height))
+    wpos, hpos = torch.meshgrid(xpos, ypos, indexing="xy")
+    return hpos.flatten(), wpos.flatten()
+def _get_image_token_masks(tokens, config):
+    spatial_mask = tokens == config.img_id
+    no_increase_mask = (
+        spatial_mask
+        | (tokens == config.image_reg_1_token_id)
+        | (tokens == config.image_reg_2_token_id)
+        | (tokens == config.image_reg_3_token_id)
+        | (tokens == config.image_reg_4_token_id)
+        | (tokens == config.img_end_id)
+    )
+    return spatial_mask, no_increase_mask
+def get_pos_thw(
+    tokens: torch.Tensor,
+    pixel_masks_NTHW: torch.Tensor,
+    config,
+    spatial_patch_size: int,
+    temporal_patch_size: int = 1,
+    pad_token_id: int = None,
+):
+    """
+    Build temporal and spatial position ids for a batch of token sequences.
+    Args:
+        tokens: 2-D tensor of token ids (batch, sequence).
+        pixel_masks_NTHW: 4-D pixel validity masks, one per image.
+        config: Object providing the image-related token ids.
+        spatial_patch_size: Patch edge length used to reduce the pixel masks.
+        temporal_patch_size: Temporal patch length used in the same reduction.
+        pad_token_id: Id of the padding token; required despite the ``None`` default.
+    Returns:
+        ``(tpos, hw_pos)``: a long tensor of temporal positions shaped like
+        ``tokens``, and a float tensor with a trailing axis of size 2 holding
+        (h, w) positions, NaN wherever the token is not an image patch token.
+    """
+    assert pad_token_id is not None
+    assert tokens.ndim == 2
+    assert pixel_masks_NTHW.ndim == 4
+    spatial_img_token_mask_BS, no_increase_idx_img_token_mask_BS = _get_image_token_masks(tokens, config)
+    # Per-image (h, w) coordinates, concatenated in image order.
+    hpos_parts, wpos_parts = [], []
+    for i in range(pixel_masks_NTHW.shape[0]):
+        h, w = _compute_image_spatial_positions(pixel_masks_NTHW[i], spatial_patch_size, temporal_patch_size)
+        hpos_parts.append(h)
+        wpos_parts.append(w)
+    hpos_N = torch.cat(hpos_parts) if hpos_parts else torch.empty(0)
+    wpos_N = torch.cat(wpos_parts) if wpos_parts else torch.empty(0)
+    # The scatter below needs exactly one coordinate per image patch token.
+    expected_tokens = spatial_img_token_mask_BS.sum().item()
+    actual_tokens = hpos_N.numel()
+    assert actual_tokens == expected_tokens, (
+        f"Mismatch between spatial image tokens ({expected_tokens}) and generated positions ({actual_tokens})."
+    )
+    # NaN marks "no spatial position"; rope.py uses isnan to find image tokens.
+    hpos_BS = torch.full_like(tokens, fill_value=torch.nan, dtype=torch.float, device=tokens.device)
+    wpos_BS = torch.full_like(tokens, fill_value=torch.nan, dtype=torch.float, device=tokens.device)
+    hpos_BS = hpos_BS.masked_scatter_(spatial_img_token_mask_BS, hpos_N)
+    wpos_BS = wpos_BS.masked_scatter_(spatial_img_token_mask_BS, wpos_N)
+    # Temporal index: each token adds 1, except patch/register/end-of-image
+    # tokens, which add 0 and so share the index of the token before them.
+    tpos_BS = torch.ones_like(tokens, dtype=torch.float, device=tokens.device)
+    tpos_BS[no_increase_idx_img_token_mask_BS] = 0
+    tpos_BS = torch.cumsum(tpos_BS, dim=1) - 1
+    # NOTE(review): pad tokens still count in the cumsum before being zeroed, so
+    # with left padding the first real token starts at the pad count — confirm intended.
+    tpos_BS[tokens == pad_token_id] = 0
+    hw_pos_BS2 = torch.stack([hpos_BS, wpos_BS], dim=-1)
+    return tpos_BS.long(), hw_pos_BS2
+def calculate_image_tokens(image, patch_size, merge_size):
+    height, width = get_image_size(image)
+    return int((height * width) / (patch_size * patch_size * merge_size * merge_size))
+def tokenize_inputs(prompt, images, tokenizer, config, patch_size, merge_size, max_length):
+    """
+    Tokenize ``prompt`` and expand each image placeholder into its token block.
+    The prompt is split on the image token string. Between consecutive text
+    chunks an image block is inserted: the image cls token, the four register
+    tokens, one ``config.img_id`` per image patch, then the image end token.
+    An image block is skipped (together with its image) when adding it would
+    bring the sequence to ``max_length`` or beyond.
+    Returns:
+        ``(input_ids, selected_images)``: a ``torch.LongTensor`` of token ids
+        and the images whose blocks were actually inserted.
+    """
+    img_reg_ids = [
+        config.image_reg_1_token_id,
+        config.image_reg_2_token_id,
+        config.image_reg_3_token_id,
+        config.image_reg_4_token_id,
+    ]
+    if images is not None and len(images) > 0:
+        image_token_counts = [calculate_image_tokens(image, patch_size, merge_size) for image in images]
+    else:
+        image_token_counts = []
+    image_token = tokenizer.convert_ids_to_tokens(config.img_id)
+    prompt_chunks = [tokenizer.encode(chunk) for chunk in prompt.split(image_token)]
+    def insert_separator(X, sep):
+        # Interleave X and sep element-wise, then drop the trailing separator.
+        return [ele for sublist in zip(X, sep) for ele in sublist][:-1]
+    input_ids = []
+    offset = 0
+    bos_id = getattr(tokenizer, "bos_token_id", None)
+    # If the tokenizer prepends BOS to every chunk, keep it once up front and
+    # strip it from each text chunk via `offset` below.
+    if len(prompt_chunks) > 0 and len(prompt_chunks[0]) > 0 and bos_id is not None and prompt_chunks[0][0] == bos_id:
+        offset = 1
+        input_ids.append(prompt_chunks[0][0])
+    separators = []
+    for count in image_token_counts:
+        tokens = [config.img_id] * count
+        image_block = [config.image_cls_token_id, *img_reg_ids, *tokens, config.img_end_id]
+        separators.append(image_block)
+    # With N images and N + 1 chunks, zip() would drop the last chunk; a dummy
+    # extra separator keeps it, and insert_separator's [:-1] removes the dummy.
+    if len(separators) != 0 and len(separators) != len(prompt_chunks):
+        separators.append(separators[-1])
+    selected_images = []
+    if len(separators) == 0:
+        # NOTE(review): this path replaces input_ids, so `offset` is not applied
+        # and a leading BOS stays inside prompt_chunks[0] — confirm intended.
+        input_ids = prompt_chunks[0]
+    else:
+        for index, x in enumerate(insert_separator(prompt_chunks, separators)):
+            if index % 2 != 0:
+                # Odd positions are image blocks; include only if they fit.
+                if (len(input_ids) + len(x)) < max_length:
+                    input_ids.extend(x)
+                    selected_images.append(images[index // 2])
+            elif index % 2 == 0:
+                # Even positions are text chunks (without their BOS, if any).
+                input_ids.extend(x[offset:])
+    input_ids = torch.LongTensor(input_ids)
+    return input_ids, selected_images
+def process_batch(
+    tokenizer,
+    config,
+    image_prompt_pairs,
+    max_length,
+    min_dimension,
+    max_dimension,
+    patch_size=16,
+    merge_size=1,
+):
+    """
+    Process a batch of images with text prompts.
+    Uses LEFT PADDING for proper batch generation with causal models.
+    """
+    all_input_ids = []
+    all_selected_images = []
+    processor_local = ImageProcessor(patch_size, merge_size)
+    for img_input, prompt in image_prompt_pairs:
+        img = load_image(img_input)
+        if img is not None:
+            img = resize_image_if_necessary(img, min_dimension, max_dimension)
+        images = processor_local.preprocess(images=[img] if img else [])
+        input_ids, selected_images = tokenize_inputs(
+            prompt, images, tokenizer, config, patch_size, merge_size, max_length,
+        )
+        all_input_ids.append(input_ids)
+        all_selected_images.extend(selected_images)
+    pad_token_id = tokenizer.convert_tokens_to_ids("<|pad|>")
+    padded_input_ids = torch.nn.utils.rnn.pad_sequence(
+        all_input_ids, batch_first=True, padding_value=pad_token_id, padding_side="left",
+    )
+    processed = processor_local.batch_images_with_mask(all_selected_images, max_dimension, max_dimension)
+    assert processed is not None
+    pos_t, pos_hw = get_pos_thw(
+        padded_input_ids, processed["padding_mask"], config, patch_size, pad_token_id=pad_token_id,
+    )
+    return {
+        "tokens": padded_input_ids,
+        "pixel_values": processed["pixel_values"],
+        "pixel_mask": processed["padding_mask"],
+        "pos_t": pos_t,
+        "pos_hw": pos_hw,
+        "pad_token_id": pad_token_id,
+    }

rope.py ADDED Viewed

	@@ -0,0 +1,127 @@

+import einops as E
+import torch
+def precompute_freqs_cis(dim: int, end: int, theta: float = 10000.0) -> torch.Tensor:
+    """
+    Precompute the frequency tensor for complex exponentials (cis) with given dimensions.
+    This function calculates a frequency tensor with complex exponentials using the given dimension 'dim'
+    and the end index 'end'. The 'theta' parameter scales the frequencies.
+    The returned tensor contains complex values in complex64 data type.
+    Args:
+        dim (int): Dimension of the frequency tensor.
+        end (int): End index for precomputing frequencies.
+        theta (float, optional): Scaling factor for frequency computation. Defaults to 10000.0.
+    Returns:
+        torch.Tensor: Precomputed frequency tensor with complex exponentials.
+    """
+    freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim))
+    t = torch.arange(end, device=freqs.device)
+    freqs = torch.outer(t, freqs).float()
+    freqs_cis = torch.polar(torch.ones_like(freqs), freqs)  # complex64
+    return freqs_cis  # [S, D//2]
+def apply_rotary_emb(
+    xq: torch.Tensor,
+    xk: torch.Tensor,
+    freqs_cis: torch.Tensor,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    """1D rotary embedding"""
+    xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))
+    xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))
+    assert freqs_cis.ndim == 3, (
+        "Freqs_cis must be indexed by position ids already and has shape (B,S,D)"
+    )
+    freqs_cis = E.rearrange(freqs_cis, "b s d -> b s 1 d")
+    xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)
+    xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)
+    return xq_out.type_as(xq), xk_out.type_as(xk)
+###### 2D golden rope
+"""
+Dimension key:
+    B: batch size
+    S: number of tokens per sample, Seqlen
+    T: Number of selected Tokens
+    P: pos_dim
+    h: n_heads
+    d: head_dim
+    F: num_freqs == head_dim // 2
+"""
+def apply_golden_freqs_cis_to_visual_pos(freqs_hFP, pos_BSP) -> torch.Tensor:
+    """
+    Compute the 2D rotary phases for the image tokens of a batch.
+    This function is applied once per input batch, and the cached
+    freqs_cis is passed through to all layers.
+    Image tokens are those whose position entries in ``pos_BSP`` are all non-NaN.
+    Returns a complex tensor of shape (N, h, F), N being the number of image tokens.
+    Written without boolean-mask indexing (integer index gather instead).
+    NOTE(review): torch.nonzero still yields a data-dependent length N —
+    confirm the Torch-Inductor safety claim.
+    """
+    # 1. Boolean mask → integer indices (no unbacked shapes)
+    img_mask_BS = E.reduce(~torch.isnan(pos_BSP), 'b s p -> b s', reduction='all')
+    idx_b, idx_s = torch.nonzero(img_mask_BS, as_tuple=True)   # each shape: (N,)
+    # 2. Gather the positional tensor for those tokens
+    pos_tP = pos_BSP[idx_b, idx_s].float() # (N, p)
+    # 3. Project positions onto the frequency table → angles θ
+    theta_thF = torch.einsum("tp,hfp->thf", pos_tP, freqs_hFP.float())  # (t, h, f)
+    # 4. Convert to complex numbers on the unit circle
+    freqs_cis_thF = torch.polar(torch.ones_like(theta_thF), theta_thF)
+    return freqs_cis_thF
+def apply_golden_rotary_emb(input_BShd, freqs_cis_thF, pos_BSP) -> torch.Tensor:
+    """
+    Rotates *only* the image tokens in `input_BShd`; all other tokens are
+    returned unchanged. Image tokens are those whose entries in `pos_BSP` are
+    all non-NaN, and `freqs_cis_thF` must hold one row per such token, in the
+    same order. The input is not modified; a clone is returned.
+    Uses integer index gather/scatter instead of boolean-mask indexing.
+    NOTE(review): torch.nonzero still yields a data-dependent length N —
+    confirm the Torch-Inductor safety claim.
+    """
+    img_mask_BS = E.reduce(~torch.isnan(pos_BSP), 'b s p -> b s', reduction='all')
+    idx_b, idx_s = torch.nonzero(img_mask_BS, as_tuple=True)  # (N,)
+    # Gather the image tokens and work in float32.
+    input_thd = input_BShd[idx_b, idx_s].float()  # (N, h, d)
+    # Even/odd channels act as real/imaginary parts of F complex numbers.
+    x_even = input_thd[..., 0::2]  # (N, h, F)
+    x_odd = input_thd[..., 1::2]   # (N, h, F)
+    cos_thF = freqs_cis_thF.real
+    sin_thF = freqs_cis_thF.imag
+    # (a + ib) * (c + id) = (ac - bd) + i(ad + bc)
+    rot_even = x_even * cos_thF - x_odd * sin_thF
+    rot_odd = x_even * sin_thF + x_odd * cos_thF
+    # Re-interleave the rotated parts into the original channel layout.
+    output_real = torch.empty_like(input_thd)
+    output_real[..., 0::2] = rot_even
+    output_real[..., 1::2] = rot_odd
+    output_real = output_real.type_as(input_BShd)
+    # Write the rotated tokens into a copy so the caller's tensor is untouched.
+    output_BShd = input_BShd.clone()
+    output_BShd[idx_b, idx_s] = output_real
+    return output_BShd
+def apply_3d_rotary_emb(
+    xq: torch.Tensor,  # (B, S, H, D)
+    xk: torch.Tensor,  # (B, S, H, D)
+    freqs_cis: torch.Tensor,
+    freqs_cis_2d: torch.Tensor | None,
+    pos_hw: torch.Tensor | None,  # (B,S,3)
+) -> tuple[torch.Tensor, torch.Tensor]:
+    xq_t, xq_hw = xq.chunk(chunks=2, dim=-1)
+    xk_t, xk_hw = xk.chunk(chunks=2, dim=-1)
+    B, S, H, D = xq.shape
+    xq_t, xk_t = apply_rotary_emb(xq_t, xk_t, freqs_cis)
+    if freqs_cis_2d is not None and pos_hw is not None:
+        xq_hw = apply_golden_rotary_emb(xq_hw, freqs_cis_2d, pos_hw)
+        xk_hw = apply_golden_rotary_emb(xk_hw, freqs_cis_2d, pos_hw)
+    xq_out = torch.concat([xq_t, xq_hw], dim=-1).type_as(xq)
+    xk_out = torch.concat([xk_t, xk_hw], dim=-1).type_as(xk)
+    return xq_out, xk_out

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,390 @@

+{
+  "additional_special_tokens": [
+    "<|pad|>",
+    ">>ABSTRACT<<",
+    ">>INTRODUCTION<<",
+    ">>SUMMARY<<",
+    ">>COMMENT<<",
+    ">>ANSWER<<",
+    ">>QUESTION<<",
+    ">>DOMAIN<<",
+    ">>PREFIX<<",
+    ">>SUFFIX<<",
+    ">>MIDDLE<<",
+    "<|finetune_right_pad_id|>",
+    "<|start_header_id|>",
+    "<|end_header_id|>",
+    "<|eom_id|>",
+    "<|eot_id|>",
+    "<|begin_of_text|>",
+    ">>TITLE<<",
+    "<tool_response>",
+    "</tool_response>",
+    "<tool_call>",
+    "</tool_call>",
+    "<schema>",
+    "</schema>",
+    "<scratch_pad>",
+    "</scratch_pad>",
+    "<thinking>",
+    "</thinking>",
+    "<explanation>",
+    "</explanation>",
+    "<file_sep>",
+    "<repo_name>",
+    "<tr>",
+    "</tr>",
+    "<|image|>",
+    "<|image_row_sep|>",
+    "<|start_of_image|>",
+    "<|end_of_image|>",
+    "<|start_of_video|>",
+    "<|end_of_video|>",
+    "<|frame_sep|>",
+    "<|start_of_turn|>",
+    "<|end_of_turn|>",
+    "<|start_of_diffusion_query|>",
+    "<|end_of_diffusion_query|>",
+    "<|diffusion_query|>",
+    "<|object|>",
+    "<|coord|>",
+    "<|size|>",
+    "<|perceive|>",
+    "<|image_mask_token|>",
+    "<|image_cls|>",
+    "<|image_reg_1|>",
+    "<|image_reg_2|>",
+    "<|image_reg_3|>",
+    "<|image_reg_4|>",
+    "<|image_reg_5|>",
+    "<|image_reg_6|>",
+    "<|image_reg_7|>",
+    "<|image_reg_8|>",
+    "<|DET|>",
+    "<|POINTING|>",
+    "<|OCR_GROUNDING|>",
+    "<|OCR_DOC_PARSER|>",
+    "<|OCR_PLAIN|>",
+    "<|REF_SEG|>",
+    "<|POINT_REF_SEG|>",
+    "<|CAPTION|>",
+    "<|DETAILED_CAPTION|>",
+    "<|seg|>",
+    "<|end_of_query|>",
+    "<|start_of_query|>",
+    "<|task_sep|>",
+    "<|QA|>",
+    "<|LAYOUT_DETECTION|>",
+    "<|category_sep|>",
+    "<td>",
+    "</td>",
+    "<th>",
+    "</th>",
+    ">>UNUSED_261<<",
+    ">>UNUSED_262<<",
+    ">>UNUSED_263<<",
+    ">>UNUSED_264<<",
+    ">>UNUSED_265<<",
+    ">>UNUSED_266<<",
+    ">>UNUSED_267<<",
+    ">>UNUSED_268<<",
+    ">>UNUSED_269<<",
+    ">>UNUSED_270<<",
+    ">>UNUSED_271<<",
+    ">>UNUSED_272<<",
+    ">>UNUSED_273<<",
+    ">>UNUSED_274<<",
+    ">>UNUSED_275<<",
+    ">>UNUSED_276<<",
+    ">>UNUSED_277<<",
+    ">>UNUSED_278<<",
+    ">>UNUSED_279<<",
+    ">>UNUSED_280<<",
+    ">>UNUSED_281<<",
+    ">>UNUSED_282<<",
+    ">>UNUSED_283<<",
+    ">>UNUSED_284<<",
+    ">>UNUSED_285<<",
+    ">>UNUSED_286<<",
+    ">>UNUSED_287<<",
+    ">>UNUSED_288<<",
+    ">>UNUSED_289<<",
+    ">>UNUSED_290<<",
+    ">>UNUSED_291<<",
+    ">>UNUSED_292<<",
+    ">>UNUSED_293<<",
+    ">>UNUSED_294<<",
+    ">>UNUSED_295<<",
+    ">>UNUSED_296<<",
+    ">>UNUSED_297<<",
+    ">>UNUSED_298<<",
+    ">>UNUSED_299<<",
+    ">>UNUSED_300<<",
+    ">>UNUSED_301<<",
+    ">>UNUSED_302<<",
+    ">>UNUSED_303<<",
+    ">>UNUSED_304<<",
+    ">>UNUSED_305<<",
+    ">>UNUSED_306<<",
+    ">>UNUSED_307<<",
+    ">>UNUSED_308<<",
+    ">>UNUSED_309<<",
+    ">>UNUSED_310<<",
+    ">>UNUSED_311<<",
+    ">>UNUSED_312<<",
+    ">>UNUSED_313<<",
+    ">>UNUSED_314<<",
+    ">>UNUSED_315<<",
+    ">>UNUSED_316<<",
+    ">>UNUSED_317<<",
+    ">>UNUSED_318<<",
+    ">>UNUSED_319<<",
+    ">>UNUSED_320<<",
+    ">>UNUSED_321<<",
+    ">>UNUSED_322<<",
+    ">>UNUSED_323<<",
+    ">>UNUSED_324<<",
+    ">>UNUSED_325<<",
+    ">>UNUSED_326<<",
+    ">>UNUSED_327<<",
+    ">>UNUSED_328<<",
+    ">>UNUSED_329<<",
+    ">>UNUSED_330<<",
+    ">>UNUSED_331<<",
+    ">>UNUSED_332<<",
+    ">>UNUSED_333<<",
+    ">>UNUSED_334<<",
+    ">>UNUSED_335<<",
+    ">>UNUSED_336<<",
+    ">>UNUSED_337<<",
+    ">>UNUSED_338<<",
+    ">>UNUSED_339<<",
+    ">>UNUSED_340<<",
+    ">>UNUSED_341<<",
+    ">>UNUSED_342<<",
+    ">>UNUSED_343<<",
+    ">>UNUSED_344<<",
+    ">>UNUSED_345<<",
+    ">>UNUSED_346<<",
+    ">>UNUSED_347<<",
+    ">>UNUSED_348<<",
+    ">>UNUSED_349<<",
+    ">>UNUSED_350<<",
+    ">>UNUSED_351<<",
+    ">>UNUSED_352<<",
+    ">>UNUSED_353<<",
+    ">>UNUSED_354<<",
+    ">>UNUSED_355<<",
+    ">>UNUSED_356<<",
+    ">>UNUSED_357<<",
+    ">>UNUSED_358<<",
+    ">>UNUSED_359<<",
+    ">>UNUSED_360<<",
+    ">>UNUSED_361<<",
+    ">>UNUSED_362<<",
+    ">>UNUSED_363<<",
+    ">>UNUSED_364<<",
+    ">>UNUSED_365<<",
+    ">>UNUSED_366<<",
+    ">>UNUSED_367<<",
+    ">>UNUSED_368<<",
+    ">>UNUSED_369<<",
+    ">>UNUSED_370<<",
+    ">>UNUSED_371<<",
+    ">>UNUSED_372<<",
+    ">>UNUSED_373<<",
+    ">>UNUSED_374<<",
+    ">>UNUSED_375<<",
+    ">>UNUSED_376<<",
+    ">>UNUSED_377<<",
+    ">>UNUSED_378<<",
+    ">>UNUSED_379<<",
+    ">>UNUSED_380<<",
+    ">>UNUSED_381<<",
+    ">>UNUSED_382<<",
+    ">>UNUSED_383<<",
+    ">>UNUSED_384<<",
+    ">>UNUSED_385<<",
+    ">>UNUSED_386<<",
+    ">>UNUSED_387<<",
+    ">>UNUSED_388<<",
+    ">>UNUSED_389<<",
+    ">>UNUSED_390<<",
+    ">>UNUSED_391<<",
+    ">>UNUSED_392<<",
+    ">>UNUSED_393<<",
+    ">>UNUSED_394<<",
+    ">>UNUSED_395<<",
+    ">>UNUSED_396<<",
+    ">>UNUSED_397<<",
+    ">>UNUSED_398<<",
+    ">>UNUSED_399<<",
+    ">>UNUSED_400<<",
+    ">>UNUSED_401<<",
+    ">>UNUSED_402<<",
+    ">>UNUSED_403<<",
+    ">>UNUSED_404<<",
+    ">>UNUSED_405<<",
+    ">>UNUSED_406<<",
+    ">>UNUSED_407<<",
+    ">>UNUSED_408<<",
+    ">>UNUSED_409<<",
+    ">>UNUSED_410<<",
+    ">>UNUSED_411<<",
+    ">>UNUSED_412<<",
+    ">>UNUSED_413<<",
+    ">>UNUSED_414<<",
+    ">>UNUSED_415<<",
+    ">>UNUSED_416<<",
+    ">>UNUSED_417<<",
+    ">>UNUSED_418<<",
+    ">>UNUSED_419<<",
+    ">>UNUSED_420<<",
+    ">>UNUSED_421<<",
+    ">>UNUSED_422<<",
+    ">>UNUSED_423<<",
+    ">>UNUSED_424<<",
+    ">>UNUSED_425<<",
+    ">>UNUSED_426<<",
+    ">>UNUSED_427<<",
+    ">>UNUSED_428<<",
+    ">>UNUSED_429<<",
+    ">>UNUSED_430<<",
+    ">>UNUSED_431<<",
+    ">>UNUSED_432<<",
+    ">>UNUSED_433<<",
+    ">>UNUSED_434<<",
+    ">>UNUSED_435<<",
+    ">>UNUSED_436<<",
+    ">>UNUSED_437<<",
+    ">>UNUSED_438<<",
+    ">>UNUSED_439<<",
+    ">>UNUSED_440<<",
+    ">>UNUSED_441<<",
+    ">>UNUSED_442<<",
+    ">>UNUSED_443<<",
+    ">>UNUSED_444<<",
+    ">>UNUSED_445<<",
+    ">>UNUSED_446<<",
+    ">>UNUSED_447<<",
+    ">>UNUSED_448<<",
+    ">>UNUSED_449<<",
+    ">>UNUSED_450<<",
+    ">>UNUSED_451<<",
+    ">>UNUSED_452<<",
+    ">>UNUSED_453<<",
+    ">>UNUSED_454<<",
+    ">>UNUSED_455<<",
+    ">>UNUSED_456<<",
+    ">>UNUSED_457<<",
+    ">>UNUSED_458<<",
+    ">>UNUSED_459<<",
+    ">>UNUSED_460<<",
+    ">>UNUSED_461<<",
+    ">>UNUSED_462<<",
+    ">>UNUSED_463<<",
+    ">>UNUSED_464<<",
+    ">>UNUSED_465<<",
+    ">>UNUSED_466<<",
+    ">>UNUSED_467<<",
+    ">>UNUSED_468<<",
+    ">>UNUSED_469<<",
+    ">>UNUSED_470<<",
+    ">>UNUSED_471<<",
+    ">>UNUSED_472<<",
+    ">>UNUSED_473<<",
+    ">>UNUSED_474<<",
+    ">>UNUSED_475<<",
+    ">>UNUSED_476<<",
+    ">>UNUSED_477<<",
+    ">>UNUSED_478<<",
+    ">>UNUSED_479<<",
+    ">>UNUSED_480<<",
+    ">>UNUSED_481<<",
+    ">>UNUSED_482<<",
+    ">>UNUSED_483<<",
+    ">>UNUSED_484<<",
+    ">>UNUSED_485<<",
+    ">>UNUSED_486<<",
+    ">>UNUSED_487<<",
+    ">>UNUSED_488<<",
+    ">>UNUSED_489<<",
+    ">>UNUSED_490<<",
+    ">>UNUSED_491<<",
+    ">>UNUSED_492<<",
+    ">>UNUSED_493<<",
+    ">>UNUSED_494<<",
+    ">>UNUSED_495<<",
+    ">>UNUSED_496<<",
+    ">>UNUSED_497<<",
+    ">>UNUSED_498<<",
+    ">>UNUSED_499<<",
+    ">>UNUSED_500<<",
+    ">>UNUSED_501<<",
+    ">>UNUSED_502<<",
+    ">>UNUSED_503<<",
+    ">>UNUSED_504<<",
+    ">>UNUSED_505<<",
+    ">>UNUSED_506<<",
+    ">>UNUSED_507<<",
+    ">>UNUSED_508<<",
+    ">>UNUSED_509<<",
+    ">>UNUSED_510<<",
+    ">>UNUSED_511<<"
+  ],
+  "eos_token": {
+    "content": "<|end_of_text|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "image_token": "<|image|>",
+  "image_cls_token": "<|image_cls|>",
+  "image_reg_1_token": "<|image_reg_1|>",
+  "image_reg_2_token": "<|image_reg_2|>",
+  "image_reg_3_token": "<|image_reg_3|>",
+  "image_reg_4_token": "<|image_reg_4|>",
+  "image_reg_5_token": "<|image_reg_5|>",
+  "image_reg_6_token": "<|image_reg_6|>",
+  "image_reg_7_token": "<|image_reg_7|>",
+  "image_reg_8_token": "<|image_reg_8|>",
+  "image_row_sep_token": "<|image_row_sep|>",
+  "start_of_image_token": "<|start_of_image|>",
+  "end_of_image_token": "<|end_of_image|>",
+  "start_of_video_token": "<|start_of_video|>",
+  "end_of_video_token": "<|end_of_video|>",
+  "frame_sep_token": "<|frame_sep|>",
+  "start_of_turn_token": "<|start_of_turn|>",
+  "end_of_turn_token": "<|end_of_turn|>",
+  "start_of_diffusion_query_token": "<|start_of_diffusion_query|>",
+  "end_of_diffusion_query_token": "<|end_of_diffusion_query|>",
+  "diffusion_query_token": "<|diffusion_query|>",
+  "object_token": "<|object|>",
+  "coord_token": "<|coord|>",
+  "size_token": "<|size|>",
+  "perceive_token": "<|perceive|>",
+  "image_mask_token": "<|image_mask_token|>",
+  "det_token": "<|DET|>",
+  "pointing_token": "<|POINTING|>",
+  "ocr_grounding_token": "<|OCR_GROUNDING|>",
+  "ocr_doc_parser_token": "<|OCR_DOC_PARSER|>",
+  "ocr_plain_token": "<|OCR_PLAIN|>",
+  "ref_seg_token": "<|REF_SEG|>",
+  "point_ref_seg_token": "<|POINT_REF_SEG|>",
+  "caption_token": "<|CAPTION|>",
+  "detailed_caption_token": "<|DETAILED_CAPTION|>",
+  "seg_token": "<|seg|>",
+  "start_of_query_token": "<|start_of_query|>",
+  "end_of_query_token": "<|end_of_query|>",
+  "task_sep_token": "<|task_sep|>",
+  "qa_token": "<|QA|>",
+  "layout_detection_token": "<|LAYOUT_DETECTION|>",
+  "category_sep_token": "<|category_sep|>",
+  "table_row_start_token": "<tr>",
+  "table_row_end_token": "</tr>",
+  "table_data_start_token": "<td>",
+  "table_data_end_token": "</td>",
+  "table_header_start_token": "<th>",
+  "table_header_end_token": "</th>"
+}

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer_config.json ADDED Viewed

The diff for this file is too large to render. See raw diff