File size: 5,030 Bytes
b8674d2 abfe06e b8674d2 386fd01 8b2e894 d4d9ecd 70fee5e b8674d2 70fee5e b8674d2 ca5dc61 b8674d2 1277204 ca5dc61 b8674d2 ca5dc61 abfe06e 386fd01 abfe06e ca5dc61 b8674d2 ca5dc61 e171067 abfe06e b455788 20d00c9 b8674d2 b455788 502bdcb 20d00c9 b8674d2 20d00c9 b8674d2 0619122 5e7b159 b8674d2 386fd01 0619122 abfe06e 386fd01 965563c 386fd01 5e7b159 b8674d2 386fd01 b8674d2 386fd01 1277204 20d00c9 386fd01 20d00c9 281b438 b8674d2 281b438 386fd01 b8674d2 abfe06e 20d00c9 b8674d2 20d00c9 386fd01 b8674d2 502bdcb b8674d2 386fd01 5f2e5ba 386fd01 ea76d00 386fd01 1277204 b8674d2 386fd01 b8674d2 386fd01 5f2e5ba 386fd01 5f2e5ba ea76d00 386fd01 ea76d00 386fd01 b8674d2 5f2e5ba 386fd01 110d1f2 386fd01 b8674d2 386fd01 110d1f2 386fd01 110d1f2 386fd01 12f41ff |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 |
import os
import gradio as gr
import datetime, re, requests
from transformers import pipeline
from sentence_transformers import SentenceTransformer, util
from concurrent.futures import ThreadPoolExecutor
# ---------------------------
# Environment-safe settings
# ---------------------------
# Set before any tokenizer is used: disables HF tokenizers parallelism,
# which avoids fork-related warnings once we spin up our own thread pool.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# ---------------------------
# Load Models (SAFE MODE)
# ---------------------------
# Claim Extraction (FORCE slow tokenizer)
# Zero-shot NLI model used to label each sentence (see extract_claims).
claim_model_name = "MoritzLaurer/DeBERTa-v3-base-mnli"
claim_classifier = pipeline(
    "zero-shot-classification",
    model=claim_model_name,
    tokenizer=claim_model_name,
    device=-1,  # CPU only
    use_fast=False  # 🔥 CRITICAL FIX: force the slow tokenizer (presumably the fast DeBERTa-v3 tokenizer fails to load here — TODO confirm)
)
# Candidate labels handed to the zero-shot classifier.
claim_labels = ["factual claim", "opinion", "personal anecdote", "other"]

# AI Text Detection
ai_detect_model_name = "roberta-base-openai-detector"
ai_detector = pipeline(
    "text-classification",
    model=ai_detect_model_name,
    device=-1  # CPU only
)

# Semantic Model (EmbeddingGemma)
# NOTE(review): `use_auth_token` is deprecated in newer sentence-transformers
# releases in favour of `token` — verify against the pinned version.
SEM_MODEL_NAME = "google/embeddinggemma-300m"
HF_TOKEN = os.getenv("HF_TOKEN")  # may be None; presumably needed for gated model access
sem_model = SentenceTransformer(
    SEM_MODEL_NAME,
    use_auth_token=HF_TOKEN
)

# ---------------------------
# Google Search Config
# ---------------------------
# Credentials for the Google Custom Search JSON API (None when unset).
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
GOOGLE_CX = os.getenv("GOOGLE_CX")
# Daily-quota bookkeeping for the Google Custom Search API.
google_quota = {"count": 0, "date": datetime.date.today()}
GOOGLE_DAILY_LIMIT = 100


def check_google_quota():
    """Reset the Google request counter when the calendar day rolls over."""
    global google_quota
    current_day = datetime.date.today()
    if google_quota["date"] != current_day:
        # New day: start counting requests from zero again.
        google_quota = {"count": 0, "date": current_day}
# ---------------------------
# Text Split Helper
# ---------------------------
def safe_split_text(text):
    """Split *text* into sentence-like chunks on '.' or ';'.

    A period flanked by digits on both sides (e.g. "3.14") is not treated
    as a split point, and chunks of 10 characters or fewer (after
    stripping) are discarded.
    """
    delimiter = r'(?<!\d)[.](?!\d)|;'
    stripped = (chunk.strip() for chunk in re.split(delimiter, text))
    return [piece for piece in stripped if len(piece) > 10]
# ---------------------------
# Claim Extraction
# ---------------------------
def extract_claims(text, max_claims=20):
    """Label each sentence of *text* via the zero-shot claim classifier.

    Returns at most *max_claims* dicts of the form
    {"text": sentence, "label": top_label, "score": rounded_confidence}.
    """
    def _label_sentence(sentence):
        # One zero-shot pass per sentence; keep only the top label.
        prediction = claim_classifier(sentence, claim_labels)
        return {
            "text": sentence,
            "label": prediction["labels"][0],
            "score": round(prediction["scores"][0], 3),
        }

    sentences = safe_split_text(text)
    # Classify sentences concurrently with a small, bounded thread pool.
    with ThreadPoolExecutor(max_workers=4) as pool:
        labelled = list(pool.map(_label_sentence, sentences))
    return labelled[:max_claims]
# ---------------------------
# AI Detection
# ---------------------------
def detect_ai(texts):
    """Run the AI-text detector on one string or a list of strings.

    Returns one dict per input: {"text": ..., "label": "AI-generated" or
    "Human", "score": rounded classifier confidence}.
    """
    batch = [texts] if isinstance(texts, str) else texts
    verdicts = []
    for snippet in batch:
        top = ai_detector(snippet)[0]
        # The detector's positive class may be reported as "Fake" or
        # "AI-generated" depending on the model's label names.
        is_ai = top["label"].lower() in ("fake", "ai-generated")
        verdicts.append({
            "text": snippet,
            "label": "AI-generated" if is_ai else "Human",
            "score": round(top["score"], 3),
        })
    return verdicts
# ---------------------------
# Keyword + Semantic Fact Check
# ---------------------------
def fetch_google_search_semantic(claim, k=3):
    """Fact-check *claim* against Google Custom Search results.

    Returns {"keyword": [...], "semantic": [...]}: the first *k* raw
    "title: snippet" strings, plus the *k* snippets most cosine-similar
    to the claim under the sentence-embedding model.  Both lists are
    empty when the daily quota is exhausted or the HTTP request fails.
    """
    check_google_quota()
    global google_quota
    if google_quota["count"] >= GOOGLE_DAILY_LIMIT:
        return {"keyword": [], "semantic": []}
    try:
        # Let requests build and escape the query string, and bound the
        # call with a timeout so one slow search can't hang the request.
        resp = requests.get(
            "https://www.googleapis.com/customsearch/v1",
            params={"q": claim, "key": GOOGLE_API_KEY, "cx": GOOGLE_CX, "num": 10},
            timeout=10,
        )
        # Count the request as soon as it was made — it is spent against
        # the daily quota even if parsing below fails.
        google_quota["count"] += 1
        resp.raise_for_status()
        payload = resp.json()
    except requests.RequestException:
        # Network/HTTP/JSON failure: degrade gracefully instead of
        # crashing the whole predict() pipeline.
        return {"keyword": [], "semantic": []}
    items = payload.get("items", [])
    # .get() guards against result items missing a title or snippet.
    snippets = [f"{i.get('title', '')}: {i.get('snippet', '')}" for i in items]
    keyword_results = snippets[:k]
    if not snippets:
        return {"keyword": keyword_results, "semantic": []}
    # Rank snippets by cosine similarity to the claim embedding.
    q_emb = sem_model.encode(claim, normalize_embeddings=True)
    s_emb = sem_model.encode(snippets, normalize_embeddings=True)
    sims = util.cos_sim(q_emb, s_emb)[0]
    top_idx = sims.argsort(descending=True)[:k]
    semantic_results = [snippets[i] for i in top_idx]
    return {
        "keyword": keyword_results,
        "semantic": semantic_results
    }
# ---------------------------
# Predict
# ---------------------------
def predict(text=""):
    """Full analysis pipeline for the UI.

    Runs AI-text detection and per-sentence fact-checking on the whole
    input, then extracts claims and reports their detection and
    fact-check results.  Returns a JSON-serializable dict, or
    {"error": ...} for blank input.
    """
    if not text.strip():
        return {"error": "No input provided"}
    full_ai = detect_ai(text)
    sentences = safe_split_text(text)
    full_fc = {s: fetch_google_search_semantic(s) for s in sentences}
    claims = extract_claims(text)
    claim_ai = detect_ai([c["text"] for c in claims])
    # Claims come from the same sentence split as full_fc, so reuse the
    # results already fetched above instead of spending Google quota on
    # a second identical search per claim.
    claim_fc = {
        c["text"]: full_fc[c["text"]] if c["text"] in full_fc
        else fetch_google_search_semantic(c["text"])
        for c in claims
    }
    return {
        "full_text": {
            "input": text,
            "ai_detection": full_ai,
            "fact_checking": full_fc
        },
        "claims": claims,
        "claims_ai_detection": claim_ai,
        "claims_fact_checking": claim_fc,
        "google_quota_used": google_quota["count"]
    }
# ---------------------------
# UI
# ---------------------------
# Minimal Gradio front-end: one textbox in, the raw analysis JSON out.
with gr.Blocks() as demo:
    gr.Markdown("## EduShield AI Backend – Keyword + Semantic Fact Check")
    inp = gr.Textbox(lines=8, label="Input Text")
    btn = gr.Button("Run Analysis")
    out = gr.JSON()
    btn.click(predict, inp, out)

if __name__ == "__main__":
    # Bind to all interfaces so the app is reachable inside a container/Space.
    demo.launch(server_name="0.0.0.0")
|