from fastapi import FastAPI, Request
from huggingface_hub import InferenceClient
import os

app = FastAPI()

# Read the Hugging Face access token from the environment rather than hard-coding it.
HF_TOKEN = os.getenv("HF_TOKEN")

# Client for the hosted Hugging Face Inference API, pinned to an instruct-tuned model.
client = InferenceClient(token=HF_TOKEN, model="meta-llama/Llama-3.2-3B-Instruct")
@app.get("/") |
|
|
def root(): |
|
|
return {"message": "Gemma 3 API on CPU"} |
@app.post("/generate") |
|
|
async def generate(request: Request): |
|
|
body = await request.json() |
|
|
prompt = body.get("prompt", "") |
|
|
messages = [ |
|
|
{"role": "user", "content": prompt} |
|
|
] |
|
|
out = client.chat_completion(messages) |
|
|
print(out) |
|
|
response = out.choices[0].message.content |
|
|
return {"response": response} |