Instructions to use magicslabnu/OutEffHop-opt-350m with libraries, inference providers, notebooks, and local apps. Follow these links to get started.

Libraries

How to use magicslabnu/OutEffHop-opt-350m with Transformers:

# Use a pipeline as a high-level helper
from transformers import pipeline

# This repository ships its own modeling_opt.py, which is why trust_remote_code=True is passed.
# NOTE(review): that flag lets Transformers run Python code from the model repo — inspect the code before enabling it.
pipe = pipeline("text-generation", model="magicslabnu/OutEffHop-opt-350m", trust_remote_code=True)

# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

# This repository ships its own modeling_opt.py, which is why trust_remote_code=True is passed.
# NOTE(review): that flag lets Transformers run Python code from the model repo — inspect the code before enabling it.
tokenizer = AutoTokenizer.from_pretrained("magicslabnu/OutEffHop-opt-350m", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained("magicslabnu/OutEffHop-opt-350m", trust_remote_code=True)

Notebooks
Google Colab
Kaggle
Local Apps Settings

vLLM

How to use magicslabnu/OutEffHop-opt-350m with vLLM:

Install from pip and serve model

# Install vLLM from pip:
pip install vllm
# Start the vLLM server:
vllm serve "magicslabnu/OutEffHop-opt-350m"
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "magicslabnu/OutEffHop-opt-350m",
		"prompt": "Once upon a time,",
		"max_tokens": 512,
		"temperature": 0.5
	}'

Use Docker

docker model run hf.co/magicslabnu/OutEffHop-opt-350m

SGLang

How to use magicslabnu/OutEffHop-opt-350m with SGLang:

Install from pip and serve model

# Install SGLang from pip:
pip install sglang
# Start the SGLang server:
python3 -m sglang.launch_server \
    --model-path "magicslabnu/OutEffHop-opt-350m" \
    --host 0.0.0.0 \
    --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "magicslabnu/OutEffHop-opt-350m",
		"prompt": "Once upon a time,",
		"max_tokens": 512,
		"temperature": 0.5
	}'

Use Docker images

docker run --gpus all \
    --shm-size 32g \
    -p 30000:30000 \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HF_TOKEN=<secret>" \
    --ipc=host \
    lmsysorg/sglang:latest \
    python3 -m sglang.launch_server \
        --model-path "magicslabnu/OutEffHop-opt-350m" \
        --host 0.0.0.0 \
        --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "magicslabnu/OutEffHop-opt-350m",
		"prompt": "Once upon a time,",
		"max_tokens": 512,
		"temperature": 0.5
	}'

Docker Model Runner
How to use magicslabnu/OutEffHop-opt-350m with Docker Model Runner:
```
docker model run hf.co/magicslabnu/OutEffHop-opt-350m
```

robinzixuan committed on Jun 15, 2024

Commit

ab3b316

verified ·

1 Parent(s): 3590b7f

Upload modeling_opt.py

Browse files

Files changed (1) hide show

modeling_opt.py +23 -9

modeling_opt.py CHANGED Viewed

@@ -17,32 +17,37 @@
 from typing import List, Optional, Tuple, Union
 import torch
 import torch.nn.functional as F
 import torch.utils.checkpoint
 from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
-from ...activations import ACT2FN
-from ...modeling_attn_mask_utils import _prepare_4d_causal_attention_mask
-from ...modeling_outputs import (
     BaseModelOutputWithPast,
     CausalLMOutputWithPast,
     QuestionAnsweringModelOutput,
     SequenceClassifierOutputWithPast,
 )
-from ...modeling_utils import PreTrainedModel
-from ...utils import (
     add_code_sample_docstrings,
     add_start_docstrings,
     add_start_docstrings_to_model_forward,
     is_flash_attn_2_available,
     is_flash_attn_greater_or_equal_2_10,
     logging,
     replace_return_docstrings,
 )
 from .configuration_opt import OPTConfig
 if is_flash_attn_2_available():
     from flash_attn import flash_attn_func, flash_attn_varlen_func
     from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input  # noqa
@@ -128,6 +133,16 @@ def softmax_1(input: torch.Tensor, dim=-1, dtype=torch.float32) -> torch.Tensor:
     output = softmax_n_shifted_zeros(input, 1, dim=dim)
     return output if dtype is None else output.type(dtype=dtype)
 class OPTAttention(nn.Module):
     """Multi-headed attention from 'Attention Is All You Need' paper"""
@@ -147,7 +162,7 @@ class OPTAttention(nn.Module):
         self.head_dim = self.embed_dim // self.num_heads
         self.is_causal = True
         if (self.head_dim * self.num_heads) != self.embed_dim:
             raise ValueError(
                 f"embed_dim must be divisible by num_heads (got `embed_dim`: {
@@ -251,10 +266,10 @@ class OPTAttention(nn.Module):
         # upcast to fp32 if the weights are in fp16. Please see https://github.com/huggingface/transformers/pull/17437
         if attn_weights.dtype == torch.float16:
-            attn_weights = softmax_1(
                 attn_weights, dim=-1, dtype=torch.float32).to(torch.float16)
         else:
-            attn_weights = softmax_1(attn_weights, dim=-1)
         if layer_head_mask is not None:
             if layer_head_mask.size() != (self.num_heads,):
@@ -306,7 +321,6 @@ class OPTAttention(nn.Module):
 class OptFlashAttention2(OPTAttention):
     """
     OPT flash attention module. This module inherits from `OPTAttention` as the weights of the module stays untouched.

 from typing import List, Optional, Tuple, Union
 import torch
 import torch.nn.functional as F
 import torch.utils.checkpoint
 from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+from transformers.activations import ACT2FN
+from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask
+from transformers.modeling_outputs import (
     BaseModelOutputWithPast,
     CausalLMOutputWithPast,
     QuestionAnsweringModelOutput,
     SequenceClassifierOutputWithPast,
 )
+from transformers.modeling_utils import PreTrainedModel
+from transformers.utils import (
     add_code_sample_docstrings,
     add_start_docstrings,
     add_start_docstrings_to_model_forward,
     is_flash_attn_2_available,
     is_flash_attn_greater_or_equal_2_10,
     logging,
     replace_return_docstrings,
 )
 from .configuration_opt import OPTConfig
 if is_flash_attn_2_available():
     from flash_attn import flash_attn_func, flash_attn_varlen_func
     from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input  # noqa
     output = softmax_n_shifted_zeros(input, 1, dim=dim)
     return output if dtype is None else output.type(dtype=dtype)
+def clipped_softmax(data, dim=1, eta=1.1, gamma=-0.1, **kw):
+    """Apply softmax, stretch the result onto [gamma, eta], then clip to [0, 1].
+
+    Args:
+        data: Input tensor passed to ``torch.nn.functional.softmax``.
+        dim: Dimension the softmax is computed over. Defaults to 1.
+        eta: Upper end of the stretched range (default 1.1).
+        gamma: Lower end of the stretched range (default -0.1).
+        **kw: Extra keyword arguments forwarded unchanged to
+            ``torch.nn.functional.softmax``.
+
+    Returns:
+        The stretched softmax output with every element clipped to [0, 1].
+    """
+    sm_out = torch.nn.functional.softmax(data, dim=dim, **kw)
+    # Affine map: a softmax value of 0 becomes gamma and a value of 1 becomes eta.
+    stretched_out = sm_out * (eta - gamma) + gamma
+    # With the defaults, values stretched below 0 or above 1 are pinned to exactly 0 or 1.
+    return torch.clip(stretched_out, 0, 1)
+def clipped_softmax1(data, dim=1, eta=1.1, gamma=-0.1, **kw):
+    """Apply ``softmax_1``, stretch the result onto [gamma, eta], then clip to [0, 1].
+
+    Same stretch-and-clip steps as ``clipped_softmax``, but built on this
+    file's ``softmax_1`` instead of ``torch.nn.functional.softmax``.
+
+    Args:
+        data: Input tensor passed to ``softmax_1``.
+        dim: Dimension passed to ``softmax_1``. Defaults to 1; ``OPTAttention``
+            calls this function through ``self.softmax_fn`` with ``dim=-1``.
+        eta: Upper end of the stretched range (default 1.1).
+        gamma: Lower end of the stretched range (default -0.1).
+        **kw: Extra keyword arguments forwarded unchanged to ``softmax_1``
+            (``OPTAttention`` passes ``dtype`` this way in its fp16 branch).
+
+    Returns:
+        The stretched ``softmax_1`` output with every element clipped to [0, 1].
+    """
+    # softmax_1 defaults to dtype=torch.float32, so its result is cast to float32 unless dtype is overridden via **kw.
+    sm_out = softmax_1(data, dim=dim, **kw)
+    # Affine map: a softmax_1 value of 0 becomes gamma and a value of 1 becomes eta.
+    stretched_out = sm_out * (eta - gamma) + gamma
+    # With the defaults, values stretched below 0 or above 1 are pinned to exactly 0 or 1.
+    return torch.clip(stretched_out, 0, 1)
 class OPTAttention(nn.Module):
     """Multi-headed attention from 'Attention Is All You Need' paper"""
         self.head_dim = self.embed_dim // self.num_heads
         self.is_causal = True
+        self.softmax_fn = clipped_softmax1
         if (self.head_dim * self.num_heads) != self.embed_dim:
             raise ValueError(
                 f"embed_dim must be divisible by num_heads (got `embed_dim`: {
         # upcast to fp32 if the weights are in fp16. Please see https://github.com/huggingface/transformers/pull/17437
         if attn_weights.dtype == torch.float16:
+            attn_weights = self.softmax_fn(
                 attn_weights, dim=-1, dtype=torch.float32).to(torch.float16)
         else:
+            attn_weights = self.softmax_fn(attn_weights, dim=-1)
         if layer_head_mask is not None:
             if layer_head_mask.size() != (self.num_heads,):
 class OptFlashAttention2(OPTAttention):
     """
     OPT flash attention module. This module inherits from `OPTAttention` as the weights of the module stays untouched.