"""
Web scraper for Bangla news articles
Multiple sources with pagination and large-scale scraping support
Enhanced for scraping 50,000+ articles
"""
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
from datetime import datetime, timedelta
from tqdm import tqdm
import os
import random
from urllib.parse import urljoin, urlparse
import json
class BanglaNewsScraper:
def __init__(self, target_count=50000):
self.headers = {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Accept-Language': 'en-US,en;q=0.9,bn;q=0.8',
'Accept-Encoding': 'gzip, deflate', # Exclude br (Brotli) to avoid decoding issues
'Connection': 'keep-alive',
'Upgrade-Insecure-Requests': '1'
}
self.target_count = target_count
self.scraped_count = 0
self.session = requests.Session()
self.session.headers.update(self.headers)
        # stream=False (requests' default) fetches response bodies eagerly;
        # content-encoding problems are retried in safe_get()
        self.session.stream = False
self.articles = []
self.seen_urls = set()
self.seen_texts = set() # To avoid duplicates
def safe_get(self, url, timeout=15, max_retries=2):
"""Safely get URL content, handling Brotli and other encoding issues"""
for attempt in range(max_retries):
try:
# First try with session
response = self.session.get(url, timeout=timeout, stream=False)
if response.status_code == 200:
return response
except Exception as e:
error_str = str(e).lower()
# If Brotli error, try with explicit headers
if 'brotli' in error_str or 'br' in error_str or 'encoding' in error_str:
try:
headers_no_br = {
'User-Agent': self.headers['User-Agent'],
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Language': 'en-US,en;q=0.9,bn;q=0.8',
'Accept-Encoding': 'gzip, deflate', # Explicitly exclude br
'Connection': 'keep-alive'
}
response = requests.get(url, headers=headers_no_br, timeout=timeout)
if response.status_code == 200:
return response
                    except Exception:
if attempt < max_retries - 1:
time.sleep(1)
continue
else:
if attempt < max_retries - 1:
time.sleep(1)
continue
return None
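    # Note: safe_get() returns a requests.Response only for HTTP 200 and None on any
    # failure, so the scrape_* methods below simply skip a page when they get None.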
def extract_text_elements(self, soup, source_name="Unknown", page=1):
"""Extract text elements from soup using multiple strategies"""
# Multiple selector strategies - comprehensive
selectors = [
'h2.headline', 'h2', 'h3', 'h4',
'a[class*="headline"]', 'a[class*="title"]', 'a[class*="news"]',
'div[class*="story"] h2', 'div[class*="story"] h3',
'article h2', 'article h3',
'div[class*="title"]', 'div[class*="headline"]',
'span[class*="headline"]', 'span[class*="title"]',
'p[class*="headline"]', 'p[class*="title"]',
'a[href*="/article/"]', 'a[href*="/news/"]', 'a[href*="/story/"]',
'div[class*="card"] h2', 'div[class*="item"] h2',
'li[class*="news"]', 'li[class*="article"]',
]
elements = []
for selector in selectors:
try:
found = soup.select(selector)
if found:
elements.extend(found)
            except Exception:
continue
# Remove duplicates while preserving order
seen_elements = set()
unique_elements = []
for elem in elements:
elem_id = id(elem)
if elem_id not in seen_elements:
seen_elements.add(elem_id)
unique_elements.append(elem)
# Fallback 1: get all links with text that look like news
if not unique_elements:
links = soup.find_all('a', href=True)
for link in links:
text = link.get_text().strip()
href = link.get('href', '')
# Check if it looks like a news article link
if (text and len(text) > 15 and len(text) < 200 and
('/article/' in href or '/news/' in href or '/story/' in href or
'/bangla/' in href or '/bengali/' in href)):
unique_elements.append(link)
# Fallback 2: get all paragraphs with substantial text
if not unique_elements:
paragraphs = soup.find_all('p')
for p in paragraphs:
text = p.get_text().strip()
if len(text) > 30 and len(text) < 300:
unique_elements.append(p)
# Fallback 3: get any div with substantial text
if not unique_elements:
divs = soup.find_all('div', class_=True)
for div in divs:
text = div.get_text().strip()
classes = ' '.join(div.get('class', []))
if (text and len(text) > 20 and len(text) < 250 and
('news' in classes.lower() or 'article' in classes.lower() or
'story' in classes.lower() or 'title' in classes.lower())):
unique_elements.append(div)
return unique_elements
def scrape_prothom_alo(self, max_pages=100):
"""Scrape Prothom Alo articles with pagination"""
print("🔍 Scraping Prothom Alo...")
articles = []
base_url = "https://www.prothomalo.com/"
for page in range(1, max_pages + 1):
if self.scraped_count >= self.target_count:
break
try:
# Try different URL patterns for pagination
urls_to_try = [
f"{base_url}?page={page}",
f"{base_url}latest?page={page}",
f"{base_url}archive?page={page}",
base_url if page == 1 else None
]
url = None
response = None
for u in urls_to_try:
if u:
resp = self.safe_get(u, timeout=15)
if resp:
url = u
response = resp
break
if not url or not response:
continue
try:
soup = BeautifulSoup(response.content, 'html.parser')
except Exception as e:
if page == 1:
print(f"⚠️ Error parsing page {page}: {e}")
continue
# Use shared extraction method
unique_elements = self.extract_text_elements(soup, 'Prothom Alo', page)
page_articles = 0
for element in unique_elements:
if self.scraped_count >= self.target_count:
break
try:
text = element.get_text().strip()
# Clean text - remove extra whitespace
text = ' '.join(text.split())
# Check for duplicates and minimum length
if (text and len(text) > 15 and
text not in self.seen_texts and
len(text) < 500): # Reasonable max length
self.seen_texts.add(text)
articles.append({
'text': text,
'source': 'Prothom Alo',
'date': datetime.now().strftime('%Y-%m-%d'),
'category': 'news'
})
self.scraped_count += 1
page_articles += 1
                    except Exception:
continue
# Debug info for first page
if page == 1 and page_articles == 0:
print(f"⚠️ Page 1: Found {len(unique_headlines)} potential elements but extracted 0 articles")
if len(unique_headlines) > 0:
sample_text = unique_headlines[0].get_text().strip()[:100]
print(f" Sample text: {sample_text}...")
                if page_articles == 0:
                    if page <= 3:
                        # Early pages are sometimes empty; keep trying a few more before giving up
                        continue
                    else:
                        # No more articles found, stop pagination
                        break
# Rate limiting
time.sleep(random.uniform(1, 3))
except Exception as e:
print(f"⚠️ Error on page {page}: {e}")
continue
print(f"✅ Scraped {len(articles)} articles from Prothom Alo")
return articles
def scrape_bdnews24(self, max_pages=100):
"""Scrape bdnews24.com with pagination"""
print("🔍 Scraping bdnews24...")
articles = []
base_url = "https://bangla.bdnews24.com/"
for page in range(1, max_pages + 1):
if self.scraped_count >= self.target_count:
break
try:
urls_to_try = [
f"{base_url}?page={page}",
f"{base_url}latest?page={page}",
base_url if page == 1 else None
]
url = None
response = None
for u in urls_to_try:
if u:
resp = self.safe_get(u, timeout=15)
if resp:
url = u
response = resp
break
if not url or not response:
continue
try:
soup = BeautifulSoup(response.content, 'html.parser')
except Exception as e:
if page == 1:
print(f"⚠️ Error parsing page {page}: {e}")
continue
# Use shared extraction method
unique_elements = self.extract_text_elements(soup, 'bdnews24', page)
page_articles = 0
for element in unique_elements:
if self.scraped_count >= self.target_count:
break
try:
text = element.get_text().strip()
text = ' '.join(text.split()) # Clean whitespace
if (text and len(text) > 15 and
text not in self.seen_texts and
len(text) < 500):
self.seen_texts.add(text)
articles.append({
'text': text,
'source': 'bdnews24',
'date': datetime.now().strftime('%Y-%m-%d'),
'category': 'news'
})
self.scraped_count += 1
page_articles += 1
                    except Exception:
continue
# Debug info for first page
if page == 1 and page_articles == 0:
print(f"⚠️ Page 1: Found {len(unique_elements)} potential elements but extracted 0 articles")
if page_articles == 0:
break
time.sleep(random.uniform(1, 3))
except Exception as e:
print(f"⚠️ Error on page {page}: {e}")
continue
print(f"✅ Scraped {len(articles)} articles from bdnews24")
return articles
def scrape_bbc_bangla(self, max_pages=100):
"""Scrape BBC Bangla with pagination"""
print("🔍 Scraping BBC Bangla...")
articles = []
base_url = "https://www.bbc.com/bengali"
for page in range(1, max_pages + 1):
if self.scraped_count >= self.target_count:
break
try:
urls_to_try = [
f"{base_url}?page={page}",
base_url if page == 1 else None
]
url = None
response = None
for u in urls_to_try:
if u:
resp = self.safe_get(u, timeout=15)
if resp:
url = u
response = resp
break
if not url or not response:
continue
try:
soup = BeautifulSoup(response.content, 'html.parser')
except Exception as e:
if page == 1:
print(f"⚠️ Error parsing page {page}: {e}")
continue
# Use shared extraction method
unique_elements = self.extract_text_elements(soup, 'BBC Bangla', page)
page_articles = 0
for element in unique_elements:
if self.scraped_count >= self.target_count:
break
try:
text = element.get_text().strip()
text = ' '.join(text.split()) # Clean whitespace
if (text and len(text) > 15 and
text not in self.seen_texts and
len(text) < 500):
self.seen_texts.add(text)
articles.append({
'text': text,
'source': 'BBC Bangla',
'date': datetime.now().strftime('%Y-%m-%d'),
'category': 'news'
})
self.scraped_count += 1
page_articles += 1
                    except Exception:
continue
# Debug info for first page
if page == 1 and page_articles == 0:
print(f"⚠️ Page 1: Found {len(unique_elements)} potential elements but extracted 0 articles")
if page_articles == 0:
break
time.sleep(random.uniform(1, 3))
except Exception as e:
print(f"⚠️ Error on page {page}: {e}")
continue
print(f"✅ Scraped {len(articles)} articles from BBC Bangla")
return articles
def scrape_jugantor(self, max_pages=100):
"""Scrape Jugantor newspaper"""
print("🔍 Scraping Jugantor...")
articles = []
base_url = "https://www.jugantor.com/"
for page in range(1, max_pages + 1):
if self.scraped_count >= self.target_count:
break
try:
urls_to_try = [
f"{base_url}?page={page}",
base_url if page == 1 else None
]
url = None
response = None
for u in urls_to_try:
if u:
resp = self.safe_get(u, timeout=15)
if resp:
url = u
response = resp
break
if not url or not response:
continue
try:
soup = BeautifulSoup(response.content, 'html.parser')
except Exception as e:
print(f"⚠️ Error parsing page {page}: {e}")
continue
# Use shared extraction method
unique_elements = self.extract_text_elements(soup, 'Jugantor', page)
page_articles = 0
for element in unique_elements:
if self.scraped_count >= self.target_count:
break
try:
text = element.get_text().strip()
text = ' '.join(text.split()) # Clean whitespace
if (text and len(text) > 15 and
text not in self.seen_texts and
len(text) < 500):
self.seen_texts.add(text)
articles.append({
'text': text,
'source': 'Jugantor',
'date': datetime.now().strftime('%Y-%m-%d'),
'category': 'news'
})
self.scraped_count += 1
page_articles += 1
                    except Exception:
continue
if page_articles == 0:
break
time.sleep(random.uniform(1, 3))
except Exception as e:
continue
print(f"✅ Scraped {len(articles)} articles from Jugantor")
return articles
def scrape_kaler_kantho(self, max_pages=100):
"""Scrape Kaler Kantho newspaper"""
print("🔍 Scraping Kaler Kantho...")
articles = []
base_url = "https://www.kalerkantho.com/"
for page in range(1, max_pages + 1):
if self.scraped_count >= self.target_count:
break
try:
urls_to_try = [
f"{base_url}?page={page}",
base_url if page == 1 else None
]
url = None
response = None
for u in urls_to_try:
if u:
resp = self.safe_get(u, timeout=15)
if resp:
url = u
response = resp
break
if not url or not response:
continue
try:
soup = BeautifulSoup(response.content, 'html.parser')
except Exception as e:
print(f"⚠️ Error parsing page {page}: {e}")
continue
# Use shared extraction method
unique_elements = self.extract_text_elements(soup, 'Kaler Kantho', page)
page_articles = 0
for element in unique_elements:
if self.scraped_count >= self.target_count:
break
try:
text = element.get_text().strip()
text = ' '.join(text.split()) # Clean whitespace
if (text and len(text) > 15 and
text not in self.seen_texts and
len(text) < 500):
self.seen_texts.add(text)
articles.append({
'text': text,
'source': 'Kaler Kantho',
'date': datetime.now().strftime('%Y-%m-%d'),
'category': 'news'
})
self.scraped_count += 1
page_articles += 1
                    except Exception:
continue
# Debug info for first page
if page == 1 and page_articles == 0:
print(f"⚠️ Page 1: Found {len(unique_elements)} potential elements but extracted 0 articles")
if page_articles == 0:
break
time.sleep(random.uniform(1, 3))
except Exception as e:
continue
print(f"✅ Scraped {len(articles)} articles from Kaler Kantho")
return articles
def scrape_daily_star(self, max_pages=100):
"""Scrape The Daily Star Bangla"""
print("🔍 Scraping The Daily Star...")
articles = []
base_url = "https://www.thedailystar.net/bangla"
for page in range(1, max_pages + 1):
if self.scraped_count >= self.target_count:
break
try:
urls_to_try = [
f"{base_url}?page={page}",
base_url if page == 1 else None
]
url = None
response = None
for u in urls_to_try:
if u:
resp = self.safe_get(u, timeout=15)
if resp:
url = u
response = resp
break
if not url or not response:
continue
try:
soup = BeautifulSoup(response.content, 'html.parser')
except Exception as e:
print(f"⚠️ Error parsing page {page}: {e}")
continue
# Use shared extraction method
unique_elements = self.extract_text_elements(soup, 'The Daily Star', page)
page_articles = 0
for element in unique_elements:
if self.scraped_count >= self.target_count:
break
try:
text = element.get_text().strip()
text = ' '.join(text.split()) # Clean whitespace
if (text and len(text) > 15 and
text not in self.seen_texts and
len(text) < 500):
self.seen_texts.add(text)
articles.append({
'text': text,
'source': 'The Daily Star',
'date': datetime.now().strftime('%Y-%m-%d'),
'category': 'news'
})
self.scraped_count += 1
page_articles += 1
                    except Exception:
continue
# Debug info for first page
if page == 1 and page_articles == 0:
print(f"⚠️ Page 1: Found {len(unique_elements)} potential elements but extracted 0 articles")
if page_articles == 0:
break
time.sleep(random.uniform(1, 3))
except Exception as e:
continue
print(f"✅ Scraped {len(articles)} articles from The Daily Star")
return articles
def create_sample_dataset(self, num_samples=1000):
"""Create expanded sample dataset with variations"""
print("📝 Creating sample dataset...")
base_texts = [
"বাংলাদেশ ক্রিকেট দল দুর্দান্ত পারফরম্যান্স করেছে আজকের ম্যাচে",
"সরকারের নতুন নীতি নিয়ে জনগণ অসন্তুষ্ট",
"আজকের আবহাওয়া মোটামুটি ভালো থাকবে সারাদিন",
"শিক্ষা ব্যবস্থায় সংস্কার প্রয়োজন বলে মনে করেন বিশেষজ্ঞরা",
"দেশের অর্থনীতি দ্রুত উন্নতি করছে",
"দুর্নীতির কারণে উন্নয়ন প্রকল্পে বিলম্ব হচ্ছে",
"নতুন প্রযুক্তি ব্যবহার করে কৃষকরা বেশি ফসল ফলাচ্ছেন",
"যানজট ঢাকার একটি বড় সমস্যা হয়ে দাঁড়িয়েছে",
"স্বাস্থ্য সেবার মান উন্নতি করতে হবে",
"পরিবেশ রক্ষায় সবাইকে সচেতন হতে হবে",
"খেলাধুলায় বাংলাদেশ ভালো করছে",
"তরুণরা উদ্যোক্তা হয়ে ব্যবসা শুরু করছেন",
"গ্রামীণ এলাকায় বিদ্যুৎ সরবরাহ বাড়ছে",
"শহরে বায়ু দূষণ মারাত্মক আকার ধারণ করেছে",
"নতুন সেতু যোগাযোগ ব্যবস্থা উন্নত করবে",
"বাংলাদেশের রপ্তানি আয় বৃদ্ধি পাচ্ছে",
"শিক্ষার্থীদের জন্য নতুন সুযোগ তৈরি হচ্ছে",
"স্বাস্থ্য সেবা খাতে বিনিয়োগ বাড়ছে",
"কৃষি ক্ষেত্রে আধুনিক প্রযুক্তির ব্যবহার",
"তরুণ উদ্যোক্তাদের জন্য সহায়তা প্রকল্প",
]
articles = []
sources = ['Sample News', 'Demo Source', 'Test Data', 'Generated Data']
categories = ['politics', 'sports', 'economy', 'technology', 'health', 'education', 'environment']
for i in range(num_samples):
base_text = base_texts[i % len(base_texts)]
# Add slight variations
            if i >= len(base_texts):
# Add variations for diversity
variations = [
f"{base_text} এটি একটি গুরুত্বপূর্ণ বিষয়।",
f"সম্প্রতি {base_text}",
f"{base_text} বিশেষজ্ঞরা জানিয়েছেন।",
]
text = variations[i % len(variations)]
else:
text = base_text
articles.append({
'text': text,
'source': sources[i % len(sources)],
'date': (datetime.now() - timedelta(days=random.randint(0, 30))).strftime('%Y-%m-%d'),
'category': categories[i % len(categories)]
})
print(f"✅ Created {len(articles)} sample articles")
return articles
def save_to_csv(self, articles, filename='data/raw/bangla_news.csv', append=False):
"""Save scraped articles to CSV with append support"""
# Create directory if it doesn't exist
os.makedirs(os.path.dirname(filename), exist_ok=True)
if not articles:
print("⚠️ No articles to save!")
return None
df = pd.DataFrame(articles)
if append and os.path.exists(filename):
# Append to existing file
existing_df = pd.read_csv(filename)
df = pd.concat([existing_df, df], ignore_index=True)
df = df.drop_duplicates(subset=['text'], keep='first')
df.to_csv(filename, index=False, encoding='utf-8-sig')
print(f"\n✅ Saved {len(df)} articles to {filename}")
return df
def save_progress(self, articles, checkpoint_file='data/raw/scraping_progress.json'):
"""Save scraping progress"""
os.makedirs(os.path.dirname(checkpoint_file), exist_ok=True)
progress = {
'scraped_count': self.scraped_count,
'target_count': self.target_count,
'timestamp': datetime.now().isoformat()
}
with open(checkpoint_file, 'w', encoding='utf-8') as f:
json.dump(progress, f, indent=2)
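# Minimal usage sketch (the output file name here is hypothetical; everything else
# uses only the methods defined above): scrape a single source as a quick smoke
# test instead of running the full main() pipeline.
#
#   scraper = BanglaNewsScraper(target_count=500)
#   articles = scraper.scrape_bbc_bangla(max_pages=5)
#   scraper.save_to_csv(articles, filename='data/raw/bbc_bangla_sample.csv')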
def main(target_count=50000):
scraper = BanglaNewsScraper(target_count=target_count)
print("=" * 60)
print(f"🌐 Starting Large-Scale Web Scraping Process")
print(f"🎯 Target: {target_count:,} articles")
print("=" * 60)
all_articles = []
sources = [
('Prothom Alo', scraper.scrape_prothom_alo, 200),
('bdnews24', scraper.scrape_bdnews24, 200),
('BBC Bangla', scraper.scrape_bbc_bangla, 200),
('Jugantor', scraper.scrape_jugantor, 200),
('Kaler Kantho', scraper.scrape_kaler_kantho, 200),
('The Daily Star', scraper.scrape_daily_star, 200),
]
# Scrape from all sources with progress tracking
with tqdm(total=target_count, desc="Scraping Progress", unit="articles") as pbar:
for source_name, scrape_func, max_pages in sources:
if scraper.scraped_count >= target_count:
break
print(f"\n{'='*60}")
print(f"📰 Scraping from {source_name}...")
print(f"{'='*60}")
try:
articles = scrape_func(max_pages=max_pages)
all_articles.extend(articles)
pbar.update(len(articles))
                # Checkpoint after each source so a failed run still keeps partial progress
                if all_articles:
scraper.save_to_csv(all_articles, append=True)
scraper.save_progress(all_articles)
print(f"\n💾 Progress saved: {scraper.scraped_count:,}/{target_count:,} articles")
# Be respectful to servers
time.sleep(random.uniform(2, 5))
except Exception as e:
print(f"❌ Error scraping {source_name}: {e}")
continue
# If we haven't reached target, supplement with sample data
if scraper.scraped_count < target_count:
needed = target_count - scraper.scraped_count
print(f"\n⚠️ Only scraped {scraper.scraped_count:,} articles. Creating {needed:,} sample articles...")
sample_articles = scraper.create_sample_dataset(num_samples=needed)
all_articles.extend(sample_articles)
scraper.scraped_count += len(sample_articles)
# Final save
print("\n" + "=" * 60)
print("💾 Saving final dataset...")
print("=" * 60)
df = scraper.save_to_csv(all_articles)
if df is not None and len(df) > 0:
# Show statistics
print("\n" + "=" * 60)
print("📊 Scraping Statistics")
print("=" * 60)
print(f"Total articles: {len(df):,}")
print(f"Target: {target_count:,}")
print(f"Completion: {len(df)/target_count*100:.1f}%")
if 'source' in df.columns:
print(f"\n📰 By source:")
source_counts = df['source'].value_counts()
for source, count in source_counts.items():
print(f" {source}: {count:,} ({count/len(df)*100:.1f}%)")
if 'category' in df.columns:
print(f"\n📑 By category:")
category_counts = df['category'].value_counts()
for category, count in category_counts.items():
print(f" {category}: {count:,}")
print(f"\n📅 Date range: {df['date'].min()} to {df['date'].max()}")
# Text length statistics
df['text_length'] = df['text'].str.len()
print(f"\n📏 Text length statistics:")
print(f" Average: {df['text_length'].mean():.1f} characters")
print(f" Min: {df['text_length'].min()} characters")
print(f" Max: {df['text_length'].max()} characters")
# Show sample
print("\n📝 Sample articles:")
print("-" * 60)
for i, row in df.head(5).iterrows():
print(f"{i + 1}. [{row['source']}] {row['text'][:70]}...")
print("=" * 60)
print(f"\n✅ Scraping complete! Dataset saved to data/raw/bangla_news.csv")
else:
print("❌ Failed to create dataset")
if __name__ == "__main__":
import sys
# Allow custom target count from command line
target_count = 50000
if len(sys.argv) > 1:
try:
target_count = int(sys.argv[1])
except ValueError:
print(f"⚠️ Invalid target count: {sys.argv[1]}. Using default: 50000")
main(target_count=target_count)