# Source: Hugging Face Spaces - rakib7 / sentiment (status: Sleeping)
# File size: 34,136 Bytes

"""
Web scraper for Bangla news articles
Multiple sources with pagination and large-scale scraping support
Enhanced for scraping 50,000+ articles
"""

import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
from datetime import datetime, timedelta
from tqdm import tqdm
import os
import random
from urllib.parse import urljoin, urlparse
import json


class BanglaNewsScraper:
    """Best-effort scraper for short Bangla text snippets (mostly headlines).

    Every ``scrape_*`` method walks the paginated listing pages of one news
    site and returns a list of article dicts with the keys ``text``,
    ``source``, ``date`` and ``category``.

    All sources share one instance-wide budget: scraping stops as soon as
    ``scraped_count`` reaches ``target_count``, and a text already collected
    from any source (tracked in ``seen_texts``) is never emitted twice.
    """

    # A snippet is kept only when MIN_TEXT_LEN < len(text) < MAX_TEXT_LEN.
    MIN_TEXT_LEN = 15
    MAX_TEXT_LEN = 500

    def __init__(self, target_count=50000):
        """Create a scraper that stops after ``target_count`` articles."""
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.9,bn;q=0.8',
            'Accept-Encoding': 'gzip, deflate',  # Exclude br (Brotli) to avoid decoding issues
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1'
        }
        self.target_count = target_count
        self.scraped_count = 0
        self.session = requests.Session()
        self.session.headers.update(self.headers)
        # stream=False: the response body is downloaded immediately instead
        # of lazily (this does not affect decompression).
        self.session.stream = False
        self.articles = []
        self.seen_urls = set()
        self.seen_texts = set()  # To avoid duplicates

    def safe_get(self, url, timeout=15, max_retries=2):
        """Fetch ``url`` and return the response, or ``None`` on failure.

        Args:
            url: Address to fetch.
            timeout: Per-request timeout in seconds.
            max_retries: Total number of attempts made with the session.

        Returns:
            The response object when a request answered with HTTP 200,
            otherwise ``None``. Errors are swallowed on purpose: callers
            treat ``None`` as "page unavailable".
        """
        for attempt in range(max_retries):
            try:
                response = self.session.get(url, timeout=timeout, stream=False)
                if response.status_code == 200:
                    return response
            except Exception as exc:
                error_str = str(exc).lower()
                # Only content-encoding problems get the session-less retry.
                # (A bare 'br' substring test would match nearly any message,
                # e.g. "broken pipe", so it is intentionally not used.)
                if 'brotli' in error_str or 'encoding' in error_str:
                    headers_no_br = {
                        'User-Agent': self.headers['User-Agent'],
                        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                        'Accept-Language': 'en-US,en;q=0.9,bn;q=0.8',
                        'Accept-Encoding': 'gzip, deflate',  # Explicitly exclude br
                        'Connection': 'keep-alive'
                    }
                    try:
                        response = requests.get(url, headers=headers_no_br, timeout=timeout)
                        if response.status_code == 200:
                            return response
                    except Exception:
                        # Best effort: fall through to the normal retry path.
                        pass
                # Short pause before the next attempt (not after the last one).
                if attempt < max_retries - 1:
                    time.sleep(1)
        return None

    def extract_text_elements(self, soup, source_name="Unknown", page=1):
        """Collect candidate headline elements from a parsed page.

        CSS selectors are tried first; if none of them matches anything,
        three progressively looser fallbacks are used (news-like links,
        medium-length paragraphs, news-like ``div`` elements).

        Args:
            soup: Parsed document offering ``select`` and ``find_all``.
            source_name: Unused; kept for interface compatibility.
            page: Unused; kept for interface compatibility.

        Returns:
            List of elements, duplicates removed, in discovery order.
        """
        # Multiple selector strategies - comprehensive
        selectors = [
            'h2.headline', 'h2', 'h3', 'h4',
            'a[class*="headline"]', 'a[class*="title"]', 'a[class*="news"]',
            'div[class*="story"] h2', 'div[class*="story"] h3',
            'article h2', 'article h3',
            'div[class*="title"]', 'div[class*="headline"]',
            'span[class*="headline"]', 'span[class*="title"]',
            'p[class*="headline"]', 'p[class*="title"]',
            'a[href*="/article/"]', 'a[href*="/news/"]', 'a[href*="/story/"]',
            'div[class*="card"] h2', 'div[class*="item"] h2',
            'li[class*="news"]', 'li[class*="article"]',
        ]

        elements = []
        for selector in selectors:
            try:
                elements.extend(soup.select(selector))
            except Exception:
                # A selector the parser rejects must not abort the others.
                continue

        # Several selectors can return the very same node; de-duplicate by
        # object identity while preserving order.
        seen_ids = set()
        unique_elements = []
        for elem in elements:
            if id(elem) not in seen_ids:
                seen_ids.add(id(elem))
                unique_elements.append(elem)

        # Fallback 1: links whose target looks like a news article
        if not unique_elements:
            href_markers = ('/article/', '/news/', '/story/', '/bangla/', '/bengali/')
            for link in soup.find_all('a', href=True):
                text = link.get_text().strip()
                href = link.get('href', '')
                if 15 < len(text) < 200 and any(m in href for m in href_markers):
                    unique_elements.append(link)

        # Fallback 2: paragraphs with substantial text
        if not unique_elements:
            for paragraph in soup.find_all('p'):
                if 30 < len(paragraph.get_text().strip()) < 300:
                    unique_elements.append(paragraph)

        # Fallback 3: divs whose class names hint at news content
        if not unique_elements:
            class_markers = ('news', 'article', 'story', 'title')
            for div in soup.find_all('div', class_=True):
                text = div.get_text().strip()
                classes = ' '.join(div.get('class', [])).lower()
                if 20 < len(text) < 250 and any(m in classes for m in class_markers):
                    unique_elements.append(div)

        return unique_elements

    def _fetch_listing_page(self, base_url, url_suffixes, page):
        """Return the first successful response among the page's candidate URLs."""
        candidates = [f"{base_url}{suffix}?page={page}" for suffix in url_suffixes]
        if page == 1:
            # The plain landing page doubles as "page 1".
            candidates.append(base_url)
        for candidate in candidates:
            response = self.safe_get(candidate, timeout=15)
            if response is not None:
                return response
        return None

    def _collect_articles(self, elements, source_name):
        """Turn elements into new article dicts, honouring dedup and budget."""
        collected = []
        for element in elements:
            if self.scraped_count >= self.target_count:
                break
            try:
                # Collapse every run of whitespace into a single space.
                text = ' '.join(element.get_text().split())
            except Exception:
                continue
            if not (self.MIN_TEXT_LEN < len(text) < self.MAX_TEXT_LEN):
                continue
            if text in self.seen_texts:
                continue
            self.seen_texts.add(text)
            collected.append({
                'text': text,
                'source': source_name,
                'date': datetime.now().strftime('%Y-%m-%d'),
                'category': 'news'
            })
            self.scraped_count += 1
        return collected

    def _scrape_source(self, source_name, base_url, url_suffixes, max_pages,
                       empty_pages_allowed=0):
        """Shared pagination loop behind every public ``scrape_*`` method.

        Args:
            source_name: Label stored in each article's ``source`` field.
            base_url: Landing page of the site.
            url_suffixes: Path fragments appended to ``base_url`` (each
                followed by ``?page=N``), tried in order for every page.
            max_pages: Highest page number to request.
            empty_pages_allowed: Pagination continues past a page that
                yielded nothing while ``page <= empty_pages_allowed``;
                after that the first empty page ends the loop.

        Returns:
            List of article dicts collected from this source.
        """
        print(f"🔍 Scraping {source_name}...")
        articles = []

        for page in range(1, max_pages + 1):
            if self.scraped_count >= self.target_count:
                break

            try:
                response = self._fetch_listing_page(base_url, url_suffixes, page)
                if response is None:
                    continue

                try:
                    soup = BeautifulSoup(response.content, 'html.parser')
                except Exception as e:
                    print(f"⚠️ Error parsing page {page}: {e}")
                    continue

                elements = self.extract_text_elements(soup, source_name, page)
                page_articles = self._collect_articles(elements, source_name)
            except Exception as e:
                print(f"⚠️ Error on page {page}: {e}")
                continue

            articles.extend(page_articles)

            if not page_articles:
                # Debug info for first page
                if page == 1:
                    print(f"⚠️ Page 1: Found {len(elements)} potential elements but extracted 0 articles")
                    if elements:
                        sample_text = elements[0].get_text().strip()[:100]
                        print(f"   Sample text: {sample_text}...")
                if page > empty_pages_allowed:
                    # No more articles found, stop pagination
                    break

            # Rate limiting
            time.sleep(random.uniform(1, 3))

        print(f"✅ Scraped {len(articles)} articles from {source_name}")
        return articles

    def scrape_prothom_alo(self, max_pages=100):
        """Scrape Prothom Alo articles with pagination"""
        # The first three pages may be empty without ending the crawl.
        return self._scrape_source(
            'Prothom Alo', "https://www.prothomalo.com/",
            ('', 'latest', 'archive'), max_pages, empty_pages_allowed=3)

    def scrape_bdnews24(self, max_pages=100):
        """Scrape bdnews24.com with pagination"""
        return self._scrape_source(
            'bdnews24', "https://bangla.bdnews24.com/",
            ('', 'latest'), max_pages)

    def scrape_bbc_bangla(self, max_pages=100):
        """Scrape BBC Bangla with pagination"""
        return self._scrape_source(
            'BBC Bangla', "https://www.bbc.com/bengali", ('',), max_pages)

    def scrape_jugantor(self, max_pages=100):
        """Scrape Jugantor newspaper"""
        return self._scrape_source(
            'Jugantor', "https://www.jugantor.com/", ('',), max_pages)

    def scrape_kaler_kantho(self, max_pages=100):
        """Scrape Kaler Kantho newspaper"""
        return self._scrape_source(
            'Kaler Kantho', "https://www.kalerkantho.com/", ('',), max_pages)

    def scrape_daily_star(self, max_pages=100):
        """Scrape The Daily Star Bangla"""
        return self._scrape_source(
            'The Daily Star', "https://www.thedailystar.net/bangla", ('',), max_pages)

    def create_sample_dataset(self, num_samples=1000):
        """Create a synthetic dataset by cycling through fixed base sentences.

        The first pass over the base sentences uses them verbatim; every
        later item wraps its base sentence in one of three variations.

        Args:
            num_samples: Number of article dicts to generate.

        Returns:
            List of article dicts with a random date within the last 30 days.
        """
        print("📝 Creating sample dataset...")

        base_texts = [
            "বাংলাদেশ ক্রিকেট দল দুর্দান্ত পারফরম্যান্স করেছে আজকের ম্যাচে",
            "সরকারের নতুন নীতি নিয়ে জনগণ অসন্তুষ্ট",
            "আজকের আবহাওয়া মোটামুটি ভালো থাকবে সারাদিন",
            "শিক্ষা ব্যবস্থায় সংস্কার প্রয়োজন বলে মনে করেন বিশেষজ্ঞরা",
            "দেশের অর্থনীতি দ্রুত উন্নতি করছে",
            "দুর্নীতির কারণে উন্নয়ন প্রকল্পে বিলম্ব হচ্ছে",
            "নতুন প্রযুক্তি ব্যবহার করে কৃষকরা বেশি ফসল ফলাচ্ছেন",
            "যানজট ঢাকার একটি বড় সমস্যা হয়ে দাঁড়িয়েছে",
            "স্বাস্থ্য সেবার মান উন্নতি করতে হবে",
            "পরিবেশ রক্ষায় সবাইকে সচেতন হতে হবে",
            "খেলাধুলায় বাংলাদেশ ভালো করছে",
            "তরুণরা উদ্যোক্তা হয়ে ব্যবসা শুরু করছেন",
            "গ্রামীণ এলাকায় বিদ্যুৎ সরবরাহ বাড়ছে",
            "শহরে বায়ু দূষণ মারাত্মক আকার ধারণ করেছে",
            "নতুন সেতু যোগাযোগ ব্যবস্থা উন্নত করবে",
            "বাংলাদেশের রপ্তানি আয় বৃদ্ধি পাচ্ছে",
            "শিক্ষার্থীদের জন্য নতুন সুযোগ তৈরি হচ্ছে",
            "স্বাস্থ্য সেবা খাতে বিনিয়োগ বাড়ছে",
            "কৃষি ক্ষেত্রে আধুনিক প্রযুক্তির ব্যবহার",
            "তরুণ উদ্যোক্তাদের জন্য সহায়তা প্রকল্প",
        ]

        articles = []
        sources = ['Sample News', 'Demo Source', 'Test Data', 'Generated Data']
        categories = ['politics', 'sports', 'economy', 'technology', 'health', 'education', 'environment']

        for i in range(num_samples):
            base_text = base_texts[i % len(base_texts)]
            # ">=" (not ">"): the first item of the second cycle must already
            # be a variation, otherwise it repeats item 0 verbatim.
            if i >= len(base_texts):
                variations = [
                    f"{base_text} এটি একটি গুরুত্বপূর্ণ বিষয়।",
                    f"সম্প্রতি {base_text}",
                    f"{base_text} বিশেষজ্ঞরা জানিয়েছেন।",
                ]
                text = variations[i % len(variations)]
            else:
                text = base_text

            articles.append({
                'text': text,
                'source': sources[i % len(sources)],
                'date': (datetime.now() - timedelta(days=random.randint(0, 30))).strftime('%Y-%m-%d'),
                'category': categories[i % len(categories)]
            })

        print(f"✅ Created {len(articles)} sample articles")
        return articles

    def save_to_csv(self, articles, filename='data/raw/bangla_news.csv', append=False):
        """Write ``articles`` to ``filename`` as CSV.

        Args:
            articles: List of article dicts.
            filename: Destination path; missing parent folders are created.
            append: When true and the file exists, merge with its rows and
                drop rows whose ``text`` duplicates an earlier one.

        Returns:
            The DataFrame that was written, or ``None`` when ``articles``
            is empty.
        """
        # A bare filename has an empty dirname, which os.makedirs rejects.
        target_dir = os.path.dirname(filename)
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)

        if not articles:
            print("⚠️ No articles to save!")
            return None

        df = pd.DataFrame(articles)

        if append and os.path.exists(filename):
            # Append to existing file
            existing_df = pd.read_csv(filename)
            df = pd.concat([existing_df, df], ignore_index=True)
            df = df.drop_duplicates(subset=['text'], keep='first')

        df.to_csv(filename, index=False, encoding='utf-8-sig')
        print(f"\n✅ Saved {len(df)} articles to {filename}")
        return df

    def save_progress(self, articles, checkpoint_file='data/raw/scraping_progress.json'):
        """Write the current counters and a timestamp to a JSON checkpoint.

        Args:
            articles: Unused; kept for interface compatibility.
            checkpoint_file: Destination path; missing parent folders are
                created.
        """
        # A bare filename has an empty dirname, which os.makedirs rejects.
        target_dir = os.path.dirname(checkpoint_file)
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)
        progress = {
            'scraped_count': self.scraped_count,
            'target_count': self.target_count,
            'timestamp': datetime.now().isoformat()
        }
        with open(checkpoint_file, 'w', encoding='utf-8') as f:
            json.dump(progress, f, indent=2)


def main(target_count=50000):
    """Run the whole scraping pipeline and print summary statistics.

    Each configured source is scraped in turn until ``target_count``
    articles have been collected. A checkpoint (CSV plus progress JSON) is
    written after every source that produced articles. If the target is
    still not reached, the remainder is filled with generated sample
    articles before the final CSV is written.

    Args:
        target_count: Total number of articles the final dataset should hold.
    """
    scraper = BanglaNewsScraper(target_count=target_count)

    print("=" * 60)
    print("🌐 Starting Large-Scale Web Scraping Process")
    print(f"🎯 Target: {target_count:,} articles")
    print("=" * 60)

    all_articles = []
    sources = [
        ('Prothom Alo', scraper.scrape_prothom_alo, 200),
        ('bdnews24', scraper.scrape_bdnews24, 200),
        ('BBC Bangla', scraper.scrape_bbc_bangla, 200),
        ('Jugantor', scraper.scrape_jugantor, 200),
        ('Kaler Kantho', scraper.scrape_kaler_kantho, 200),
        ('The Daily Star', scraper.scrape_daily_star, 200),
    ]

    # Scrape from all sources with progress tracking
    with tqdm(total=target_count, desc="Scraping Progress", unit="articles") as pbar:
        for source_name, scrape_func, max_pages in sources:
            if scraper.scraped_count >= target_count:
                break

            print(f"\n{'='*60}")
            print(f"📰 Scraping from {source_name}...")
            print(f"{'='*60}")

            try:
                articles = scrape_func(max_pages=max_pages)
                all_articles.extend(articles)
                pbar.update(len(articles))

                # Checkpoint after every source that yielded something.
                # (Testing "len % 1000 == 0" would skip almost every real
                # count and would trigger on an empty list.)
                if articles:
                    scraper.save_to_csv(all_articles, append=True)
                    scraper.save_progress(all_articles)
                    print(f"\n💾 Progress saved: {scraper.scraped_count:,}/{target_count:,} articles")

                # Be respectful to servers
                time.sleep(random.uniform(2, 5))

            except Exception as e:
                # One failing source must not abort the remaining ones.
                print(f"❌ Error scraping {source_name}: {e}")
                continue

    # If we haven't reached target, supplement with sample data
    if scraper.scraped_count < target_count:
        needed = target_count - scraper.scraped_count
        print(f"\n⚠️ Only scraped {scraper.scraped_count:,} articles. Creating {needed:,} sample articles...")
        sample_articles = scraper.create_sample_dataset(num_samples=needed)
        all_articles.extend(sample_articles)
        scraper.scraped_count += len(sample_articles)

    # Final save (overwrites any checkpoint with the complete list)
    print("\n" + "=" * 60)
    print("💾 Saving final dataset...")
    print("=" * 60)
    df = scraper.save_to_csv(all_articles)

    if df is None or len(df) == 0:
        print("❌ Failed to create dataset")
        return

    # Show statistics
    print("\n" + "=" * 60)
    print("📊 Scraping Statistics")
    print("=" * 60)
    print(f"Total articles: {len(df):,}")
    print(f"Target: {target_count:,}")
    print(f"Completion: {len(df)/target_count*100:.1f}%")

    if 'source' in df.columns:
        print("\n📰 By source:")
        for source, count in df['source'].value_counts().items():
            print(f"  {source}: {count:,} ({count/len(df)*100:.1f}%)")

    if 'category' in df.columns:
        print("\n📑 By category:")
        for category, count in df['category'].value_counts().items():
            print(f"  {category}: {count:,}")

    print(f"\n📅 Date range: {df['date'].min()} to {df['date'].max()}")

    # Text length statistics
    df['text_length'] = df['text'].str.len()
    print("\n📏 Text length statistics:")
    print(f"  Average: {df['text_length'].mean():.1f} characters")
    print(f"  Min: {df['text_length'].min()} characters")
    print(f"  Max: {df['text_length'].max()} characters")

    # Show sample
    print("\n📝 Sample articles:")
    print("-" * 60)
    for i, row in df.head(5).iterrows():
        print(f"{i + 1}. [{row['source']}] {row['text'][:70]}...")
    print("=" * 60)

    print("\n✅ Scraping complete! Dataset saved to data/raw/bangla_news.csv")


if __name__ == "__main__":
    import sys

    # The optional first command-line argument overrides the default
    # article target; anything that is not an integer falls back to it.
    target_count = 50000
    cli_args = sys.argv[1:]
    if cli_args:
        try:
            target_count = int(cli_args[0])
        except ValueError:
            print(f"⚠️ Invalid target count: {sys.argv[1]}. Using default: 50000")

    main(target_count=target_count)