# File size: 10,655 bytes
# Commits seen in the file viewer's blame gutter: 1ddd3c2, 333f8d9
# main.py
import json
import time
from typing import Dict, List, Optional
from urllib.parse import urlencode, urlparse, parse_qs
from bs4 import BeautifulSoup
from curl_cffi.requests import Session
from fastapi import FastAPI, HTTPException, Query, Request
from pydantic import BaseModel, Field
# 1. Pydantic Models for API Responses (Unchanged)
# Response schema for a single web result (used as the /search response model).
class BingSearchResult(BaseModel):
    # Target address of the result link.
    url: str = Field(
        ...,
        description="The URL of the search result.",
    )
    # Visible text of the result heading.
    title: str = Field(
        ...,
        description="The title of the search result.",
    )
    # Snippet text shown beneath the heading.
    description: str = Field(
        ...,
        description="A brief description or snippet from the result page.",
    )
# Response schema for a single image result (used as the /images response model).
class BingImageResult(BaseModel):
    # Caption associated with the image.
    title: str = Field(
        ...,
        description="The title or caption of the image.",
    )
    # Address of the full-size image file.
    image: str = Field(
        ...,
        description="The direct URL to the full-resolution image.",
    )
    # Address of the preview-size image.
    thumbnail: str = Field(
        ...,
        description="The URL to the thumbnail of the image.",
    )
    # Page on which the image appears.
    url: str = Field(
        ...,
        description="The URL of the webpage where the image was found.",
    )
    # Origin of the image as reported by Bing.
    source: str = Field(
        ...,
        description="The source domain of the image.",
    )
# Response schema for a single news result (used as the /news response model).
class BingNewsResult(BaseModel):
    # Headline of the article.
    title: str = Field(
        ...,
        description="The title of the news article.",
    )
    # Link to the article itself.
    url: str = Field(
        ...,
        description="The URL to the full news article.",
    )
    # Short excerpt; may be empty when the card has no snippet.
    description: str = Field(
        ...,
        description="A snippet from the news article.",
    )
    # Publisher name; may be empty when the card has no source element.
    source: str = Field(
        ...,
        description="The publisher or source of the news article.",
    )
# 2. FastAPI Application Setup (Unchanged)
# Application object on which the middleware and all routes below register.
app = FastAPI(
    version="9.0.0-complete",
    title="Definitive Fast Bing Search API",
    description="Returns correct, non-localized search results from Bing using advanced techniques.",
)
# 3. Middleware to Add Custom Headers (Unchanged)
@app.middleware("http")
async def add_custom_headers(request: Request, call_next):
    """Attach timing and branding headers to every HTTP response.

    Args:
        request: The incoming request, forwarded unchanged to ``call_next``.
        call_next: Callable that runs the rest of the request pipeline and
            returns the response.

    Returns:
        The downstream response with ``X-Process-Time`` and ``X-Powered-By``
        headers added.
    """
    # perf_counter() is monotonic, so the measured duration cannot be skewed
    # (or go negative) when the system clock is adjusted mid-request.
    start_time = time.perf_counter()
    response = await call_next(request)
    process_time = time.perf_counter() - start_time
    response.headers["X-Process-Time"] = f"{process_time:.4f} seconds"
    response.headers["X-Powered-By"] = "NiansuhAI"
    return response
# 4. The Definitive Bing Search Class
class BingSearch:
    """The definitive Bing search scraper that counters aggressive localization.

    One HTTP session is created in ``__init__`` and reused for every request.
    The HTML searches (``text``, ``images``, ``news``) first store the
    requested market in the ``SRCHHPGUSR`` cookie; ``suggestions`` passes the
    market as the ``mkt`` query parameter instead.
    """

    # Public safe-search levels mapped to the values sent to Bing.
    _SAFESEARCH_LEVELS = {"on": "Strict", "moderate": "Moderate", "off": "Off"}
    # Step used for the ``first`` paging offset in ``text``.
    _RESULTS_PER_PAGE = 10

    def __init__(
        self,
        proxies: Optional[Dict[str, str]] = None,
        timeout: int = 15,
        impersonate: str = "chrome110",
        verify: bool = False,
    ):
        """Create the shared session and set browser-like default headers.

        Args:
            proxies: Optional proxy mapping handed to the session; an empty
                mapping is used when omitted.
            timeout: Timeout handed to the session (presumably seconds —
                confirm against the curl_cffi documentation).
            impersonate: Browser profile name handed to the session.
            verify: Whether TLS certificates are verified. The default of
                ``False`` keeps the previously hard-coded behaviour.
                SECURITY: with verification off, responses can be tampered
                with in transit; pass ``True`` unless an intercepting proxy
                makes that impossible.
        """
        self.session = Session(
            proxies=proxies or {},
            timeout=timeout,
            impersonate=impersonate,
            verify=verify,
        )
        self._base_url = "https://www.bing.com"
        self.session.headers.update({
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.9',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36'
        })

    @staticmethod
    def _resolve_safesearch(safesearch: Optional[str]) -> str:
        """Translate a safe-search level to Bing's value, defaulting to Moderate."""
        level = (safesearch or "").lower()
        return BingSearch._SAFESEARCH_LEVELS.get(level, "Moderate")

    def _update_session_for_region(self, region: str = "en-US"):
        """THE CRUCIAL FIX: Sets a cookie that explicitly tells Bing our preferred market."""
        self.session.cookies.set("SRCHHPGUSR", f"SRCHLANG=en&MKT={region}", domain=".bing.com")

    def text(
        self, keywords: str, max_results: int, region: str, safesearch: str
    ) -> "List[BingSearchResult]":
        """Collect web results page by page until ``max_results`` are found.

        A fetch or parse failure is printed and ends paging; whatever was
        collected so far is still returned (best effort, never raises for
        network errors).

        Args:
            keywords: The search query.
            max_results: Upper bound on the number of results returned.
            region: Market code stored in the session cookie, e.g. ``en-US``.
            safesearch: ``on``, ``moderate`` or ``off`` (case-insensitive);
                any other value is treated as ``moderate``.

        Returns:
            Up to ``max_results`` results, without duplicate URLs.
        """
        self._update_session_for_region(region)
        safe = self._resolve_safesearch(safesearch)

        fetched_results = []
        seen_urls = set()
        page = 1
        while len(fetched_results) < max_results:
            params = {
                "q": keywords,
                "first": (page - 1) * self._RESULTS_PER_PAGE + 1,
                "safeSearch": safe,
            }
            try:
                resp = self.session.get(self._base_url + "/search", params=params)
                resp.raise_for_status()
                soup = BeautifulSoup(resp.text, "html.parser")
            except Exception as e:
                # Best effort: keep the results gathered from earlier pages.
                print(f"Error fetching text search page: {e}")
                break

            result_blocks = soup.select("li.b_algo")
            if not result_blocks:
                break

            found_new_url = False
            for result in result_blocks:
                link_tag = result.select_one("h2 a")
                href = link_tag.get('href') if link_tag is not None else None
                if not href or href in seen_urls:
                    continue
                seen_urls.add(href)
                found_new_url = True

                desc_tag = result.select_one(".b_caption p")
                if desc_tag is None:
                    # Results without a snippet are skipped, as before.
                    continue
                fetched_results.append(BingSearchResult(
                    url=href,
                    title=link_tag.get_text(strip=True),
                    description=desc_tag.get_text(strip=True),
                ))
                if len(fetched_results) >= max_results:
                    break

            # A page that contributes no unseen URL means the result set is
            # exhausted (or the same page is being served again); stopping
            # here prevents paging without bound.
            if not found_new_url:
                break
            page += 1
        return fetched_results[:max_results]

    def images(
        self, keywords: str, max_results: int, region: str, safesearch: str
    ) -> "List[BingImageResult]":
        """Fetch one page of image results.

        Args:
            keywords: The image search query.
            max_results: Upper bound on the number of results returned.
            region: Market code stored in the session cookie, e.g. ``en-US``.
            safesearch: ``on``, ``moderate`` or ``off`` (case-insensitive);
                any other value is treated as ``moderate``.

        Returns:
            Up to ``max_results`` image results; entries whose metadata
            cannot be parsed or validated are skipped.

        Raises:
            RuntimeError: If the page cannot be fetched or parsed. The
                original error is attached as the cause.
        """
        self._update_session_for_region(region)
        safe = self._resolve_safesearch(safesearch)
        params = {"q": keywords, "safeSearch": safe, "form": "HDRSC2"}
        try:
            resp = self.session.get(f"{self._base_url}/images/search", params=params)
            resp.raise_for_status()
            soup = BeautifulSoup(resp.text, "html.parser")
        except Exception as e:
            raise RuntimeError(f"Bing image search failed: {e}") from e

        results = []
        for item in soup.select("a.iusc"):
            if len(results) >= max_results:
                break
            # Each anchor carries its metadata as JSON in the "m" attribute.
            try:
                m_data = json.loads(item.get("m", "{}"))
            except (TypeError, ValueError):
                continue
            if not isinstance(m_data, dict) or not m_data.get("murl"):
                continue
            try:
                results.append(BingImageResult(
                    title=m_data.get("t", ""),
                    image=m_data.get("murl"),
                    thumbnail=m_data.get("turl", ""),
                    url=m_data.get("purl", ""),
                    source=m_data.get("surl", ""),
                ))
            except (TypeError, ValueError):
                # Field values of the wrong type fail model validation;
                # skip that entry instead of failing the whole search.
                continue
        return results

    def news(
        self, keywords: str, max_results: int, region: str, safesearch: str
    ) -> "List[BingNewsResult]":
        """Fetch one page of news results.

        Args:
            keywords: The news search query.
            max_results: Upper bound on the number of results returned.
            region: Market code stored in the session cookie, e.g. ``en-US``.
            safesearch: ``on``, ``moderate`` or ``off`` (case-insensitive);
                any other value is treated as ``moderate``.

        Returns:
            Up to ``max_results`` news results. ``description`` and
            ``source`` are empty strings when the card lacks them.

        Raises:
            RuntimeError: If the page cannot be fetched or parsed. The
                original error is attached as the cause.
        """
        self._update_session_for_region(region)
        safe = self._resolve_safesearch(safesearch)
        params = {"q": keywords, "safeSearch": safe, "form": "QBNH"}
        try:
            resp = self.session.get(f"{self._base_url}/news/search", params=params)
            resp.raise_for_status()
            soup = BeautifulSoup(resp.text, "html.parser")
        except Exception as e:
            raise RuntimeError(f"Bing news search failed: {e}") from e

        results = []
        for item in soup.select("div.news-card"):
            if len(results) >= max_results:
                break
            a_tag = item.find("a", class_="title")
            if a_tag is None or not a_tag.get('href'):
                continue
            snippet = item.find("div", class_="snippet")
            source = item.find("div", class_="source")
            results.append(BingNewsResult(
                title=a_tag.get_text(strip=True),
                url=a_tag['href'],
                description=snippet.get_text(strip=True) if snippet else "",
                source=source.get_text(strip=True) if source else "",
            ))
        return results

    def suggestions(self, query: str, region: str = "en-US") -> List[str]:
        """Return query suggestions, or an empty list on any failure.

        Args:
            query: The partial query to complete.
            region: Market code sent as the ``mkt`` parameter.

        Returns:
            The second element of the JSON array in the response, or ``[]``
            when the request fails or the payload has another shape.
        """
        # The suggestions endpoint is an API and correctly uses the 'mkt' parameter.
        params = {"query": query, "mkt": region}
        url = f"https://api.bing.com/osjson.aspx?{urlencode(params)}"
        try:
            resp = self.session.get(url)
            resp.raise_for_status()
            data = resp.json()
        except Exception:
            # Deliberate best effort: a failed lookup yields no suggestions.
            return []
        if isinstance(data, list) and len(data) > 1:
            return data[1]
        return []
# 5. API Endpoints
# IMPORTANT: For guaranteed results from a specific country (e.g., en-US),
# you MUST use a proxy server from that country.
#
# Example proxy setup:
# proxies = {
#     "http": "http://USERNAME:PASSWORD@HOST:PORT",
#     "https": "http://USERNAME:PASSWORD@HOST:PORT",
# }
# bing = BingSearch(proxies=proxies)
# Scraper instance shared by all endpoints below.
# Without a proxy, results may still be localized.
bing = BingSearch(proxies=None)
# GET /search: run a web search through the shared ``bing`` scraper and return
# its results. Any exception from the scraper is reported as HTTP 500 with the
# exception text as ``detail``.
# NOTE(review): ``bing.text`` appears to do synchronous network I/O while this
# handler is ``async def`` — confirm that blocking the event loop is acceptable.
@app.get("/search", response_model=List[BingSearchResult], summary="Perform a Bing text search")
async def text_search(
    keywords: str = Query(..., description="The search query."),
    max_results: int = Query(10, ge=1, le=50, description="Maximum number of results."),
    region: str = Query("en-US", description="Market to search in (e.g., 'en-US'). A proxy is recommended."),
    safesearch: str = Query("moderate", description="Safe search level: 'on', 'moderate', or 'off'.")
):
    try:
        return bing.text(keywords, max_results, region, safesearch)
    except Exception as exc:
        # Chain explicitly so server-side tracebacks keep the root cause.
        raise HTTPException(status_code=500, detail=str(exc)) from exc
# GET /images: run an image search through the shared ``bing`` scraper and
# return its results. Any exception from the scraper is reported as HTTP 500
# with the exception text as ``detail``.
# NOTE(review): ``bing.images`` appears to do synchronous network I/O while
# this handler is ``async def`` — confirm that blocking the event loop is
# acceptable.
@app.get("/images", response_model=List[BingImageResult], summary="Perform a Bing image search")
async def image_search(
    keywords: str = Query(..., description="The image search query."),
    max_results: int = Query(10, ge=1, le=50, description="Maximum number of image results."),
    region: str = Query("en-US", description="Market to search in (e.g., 'en-US'). A proxy is recommended."),
    safesearch: str = Query("moderate", description="Safe search level: 'on', 'moderate', or 'off'.")
):
    try:
        return bing.images(keywords, max_results, region, safesearch)
    except Exception as exc:
        # Chain explicitly so server-side tracebacks keep the root cause.
        raise HTTPException(status_code=500, detail=str(exc)) from exc
# GET /news: run a news search through the shared ``bing`` scraper and return
# its results. Any exception from the scraper is reported as HTTP 500 with the
# exception text as ``detail``.
# NOTE(review): ``bing.news`` appears to do synchronous network I/O while this
# handler is ``async def`` — confirm that blocking the event loop is acceptable.
@app.get("/news", response_model=List[BingNewsResult], summary="Perform a Bing news search")
async def news_search(
    keywords: str = Query(..., description="The news search query."),
    max_results: int = Query(10, ge=1, le=50, description="Maximum number of news results."),
    region: str = Query("en-US", description="Market to search in (e.g., 'en-US'). A proxy is recommended."),
    safesearch: str = Query("moderate", description="Safe search level: 'on', 'moderate', or 'off'.")
):
    try:
        return bing.news(keywords, max_results, region, safesearch)
    except Exception as exc:
        # Chain explicitly so server-side tracebacks keep the root cause.
        raise HTTPException(status_code=500, detail=str(exc)) from exc
# GET /suggestions: return query completions from the shared ``bing`` scraper.
# Any exception that reaches this handler is reported as HTTP 500 with the
# exception text as ``detail``.
# NOTE(review): ``bing.suggestions`` appears to do synchronous network I/O
# while this handler is ``async def`` — confirm that blocking the event loop
# is acceptable.
@app.get("/suggestions", response_model=List[str], summary="Get Bing search suggestions")
async def get_suggestions(
    query: str = Query(..., description="The query to get suggestions for."),
    region: str = Query("en-US", description="Market for suggestions (e.g., 'en-US').")
):
    try:
        return bing.suggestions(query, region)
    except Exception as exc:
        # Chain explicitly so server-side tracebacks keep the root cause.
        raise HTTPException(status_code=500, detail=str(exc)) from exc
if __name__ == "__main__":
    # Script entry point: serve the app with uvicorn on port 8000.
    # uvicorn is imported here so that importing this module as a library
    # does not require it.
    import uvicorn

    # NOTE(review): host "0.0.0.0" listens on every network interface —
    # confirm that this exposure is intended outside local development.
    uvicorn.run(app, host="0.0.0.0", port=8000)