← Back to Notes

Newspaper3k: Finally Got News Extraction Working

Building a news aggregator? Newspaper3k extracts article text from messy HTML. But it breaks on Chinese sites, paywalls, and weird encodings. Here's how I fixed it.

Why Newspaper3k Over Just Parsing HTML

Basic Usage

from newspaper import Article
# Fix: ArticleException was caught below but never imported (NameError).
from newspaper.article import ArticleException
import requests

# Single article
url = 'https://example.com/news/article'
article = Article(url)
article.download()
article.parse()

print(f"Title: {article.title}")
print(f"Authors: {article.authors}")
print(f"Publish date: {article.publish_date}")
print(f"Text: {article.text}")
print(f"Top image: {article.top_image}")
print(f"Movies: {article.movies}")
# NOTE: keywords (and summary) stay empty unless article.nlp() is called
# after parse().
print(f"Keywords: {article.keywords}")

# With error handling
try:
    article = Article(url)
    article.download()
    article.parse()
except requests.exceptions.RequestException as e:
    print(f"Failed to download: {e}")
except ArticleException as e:
    print(f"Failed to parse: {e}")

Bulk Processing

import newspaper
from newspaper import Article

# Build source (news outlet)
source = newspaper.build('https://example.com', memoize_articles=False)

# Fix: Source.size is a method in newspaper3k — without the call this
# printed "<bound method Source.size ...>" instead of a number.
print(f"Total articles: {source.size()}")
print(f"Brand: {source.brand}")

# Download and parse all articles
for article in source.articles:
    try:
        article.download()
        article.parse()

        print(f"Title: {article.title}")
        print(f"URL: {article.url}")
        print("---" * 20)

    except Exception as e:
        print(f"Failed: {e}")

# Or process specific URLs
urls = [
    'https://example.com/news1',
    'https://example.com/news2',
    'https://example.com/news3',
]

for url in urls:
    # Consistency fix: mirror the error handling used above — one bad URL
    # should not abort the whole batch.
    try:
        article = Article(url)
        article.download()
        article.parse()
        # Do something with article.text
    except Exception as e:
        print(f"Failed {url}: {e}")

Common Problems & Solutions

Issue #1256: Chinese/Japanese characters show as ���
github.com/codelucas/newspaper/issues/1256

Problem: Extracting Chinese or Japanese articles returns garbled text like "���ij���" instead of proper characters.

What I Tried: Set encoding explicitly, used different HTML parsers - still garbled.

Actual Fix: Newspaper3k uses requests which doesn't always detect encoding correctly. Need to force encoding from HTTP headers:

# The problem: Wrong encoding detection
article = Article('https://chinese-news-site.com/article')
article.download()
article.parse()
print(article.text)
# Output: ���ij��� (garbage)

# Solution 1: Set encoding before parsing
import requests
from newspaper import Article, Config

# Fetch with correct encoding
url = 'https://chinese-news-site.com/article'
response = requests.get(url)
response.encoding = 'utf-8'  # Or 'gb2312', 'gbk', 'big5'

# Create article from response
article = Article(url)
article.download(input_html=response.text)
article.parse()
print(article.text)  # Now shows Chinese correctly

# Solution 2: Use custom config with encoding detection
from newspaper import Config

config = Config()
config.browser_user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
config.request_timeout = 10
config.number_threads = 1

# Enable encoding detection
config.fetch_images = False  # Faster for text-only

article = Article(url, config=config)

# Fix: chardet.detect() requires bytes, but article.html is already a
# decoded str after download() — the old code raised TypeError. Fetch the
# raw payload ourselves, detect the encoding on the bytes, and hand the
# correctly decoded HTML to download().
import chardet
import requests

response = requests.get(url, timeout=config.request_timeout)
encoding = chardet.detect(response.content)['encoding']
html = response.content.decode(encoding or 'utf-8', errors='replace')
article.download(input_html=html)

article.parse()

# Solution 3: Language-specific parser
class ChineseArticle(Article):
    """Article subclass tuned for Chinese-language pages.

    Forces newspaper's language to 'zh' before parsing, then re-tokenizes
    the extracted text with jieba so that downstream keyword/NLP steps see
    space-separated words (Chinese text has no word delimiters).
    """

    def parse(self):
        # Override for Chinese content
        # Set before super().parse() so the zh stopword list is in effect.
        self.config.language = 'zh'
        super().parse()

        # Additional Chinese-specific processing
        # Remove common Chinese noise words
        # NOTE(review): jieba.cut only segments the text into words; it does
        # not itself drop noise words — confirm this matches the intent.
        import jieba
        words = jieba.cut(self.text)
        self.text = ' '.join(words)

# Use it
article = ChineseArticle('https://chinese-site.com/article')
article.download()
article.parse()

# Solution 4: Handle GBK/GB2312 encoding
# Many Chinese sites use GBK encoding
def safe_download(article, encodings=('gbk', 'gb2312', 'big5')):
    """Download an article, retrying with common Chinese encodings.

    Tries newspaper's normal download first; on UnicodeDecodeError the
    page is re-fetched raw and decoded with each candidate encoding until
    one succeeds. The default keeps the old GBK-first behavior while
    generalizing to the other encodings mentioned in this post.

    Re-raises the last UnicodeDecodeError if every encoding fails.
    """
    try:
        article.download()
    except UnicodeDecodeError:
        # Fetch the raw bytes ourselves (with a timeout) and decode manually.
        response = requests.get(article.url, timeout=10)
        last_err = None
        for enc in encodings:
            try:
                article.download(input_html=response.content.decode(enc))
                return
            except UnicodeDecodeError as err:
                last_err = err
        if last_err is not None:
            raise last_err
Issue #1345: Paywalled content returns nothing
github.com/codelucas/newspaper/issues/1345

Problem: Articles behind paywalls return empty text or just "Subscribe to continue" message.

What I Tried: Set user agent, added cookies - still blocked.

Actual Fix: Need to bypass the paywall. Options: a text-rendering proxy (textise dot iitty), the Google cache / CachedView, the Jina Reader API (r.jina.ai, used in Solution 2 below), or archive.today as a last resort:

# The problem: Paywall blocks content
article = Article('https://paywalled-site.com/article')
article.download()
article.parse()
print(article.text)
# Output: "Subscribe to read more" (empty)

# Solution 1: Use textise dot iitty (for some sites)
import requests

def bypass_paywall(url, timeout=10):
    """Fetch a paywalled page through the textise dot iitty proxy.

    Returns the proxied HTML on HTTP 200, or None on any other status.
    Fixes: the original had no timeout (a slow proxy hangs the pipeline)
    and let network errors propagate instead of returning None like every
    other failure mode.
    """
    # Try textise dot iitty
    textise_url = url.replace('://', '://textise dot iitty/')
    try:
        response = requests.get(textise_url, timeout=timeout)
    except requests.RequestException:
        return None

    if response.status_code == 200:
        return response.text
    return None

# Use with Newspaper
url = 'https://paywalled-site.com/article'
text_html = bypass_paywall(url)

if text_html:
    article = Article(url)
    article.download(input_html=text_html)
    article.parse()
    print(article.text)

# Solution 2: Use textise dot iitty API
# Requires API key
import requests

api_url = f"https://r.jina.ai/http://{url}"
response = requests.get(api_url)
article_text = response.text

# Parse manually since response is already text
from bs4 import BeautifulSoup
soup = BeautifulSoup(article_text, 'html.parser')
print(soup.get_text())

# Solution 3: Use Google Cache
import requests

def get_cached(url, timeout=10):
    """Fetch the Google-cached copy of `url` and return the response body.

    Fix: added a request timeout so a dead endpoint cannot hang forever.
    NOTE(review): Google has retired webcache.googleusercontent.com;
    expect this to return an error page — keep archive.today (below) as
    the working fallback.
    """
    cache_url = f"https://webcache.googleusercontent.com/search?q=cache:{url}"
    response = requests.get(cache_url, headers={
        'User-Agent': 'Mozilla/5.0'
    }, timeout=timeout)
    return response.text

# Use cached version
cached_html = get_cached(url)
article = Article(url)
article.download(input_html=cached_html)
article.parse()

# Solution 4: Archive.today as fallback
import requests

def get_archive(url, timeout=30):
    """Submit `url` to archive.today and return the raw response body.

    Fixes: the f-string had no placeholders (plain literal now) and the
    POST had no timeout — archive.today submissions are slow, hence the
    generous default. The response HTML contains the archived URL; as
    before, callers must extract it themselves.
    """
    archive_api = "https://archive.is/submit/"
    response = requests.post(archive_api, data={'url': url}, timeout=timeout)
    # The archived URL is embedded in this HTML; it is NOT parsed out here.
    return response.text

# For sites that don't allow bypassing:
# Use alternative RSS feeds or content APIs
Issue #1423: Author detection is completely wrong
github.com/codelucas/newspaper/issues/1423

Problem: Authors list contains random people from sidebar, ads, or empty list when author is clearly visible.

What I Tried: Different config options, manual parsing - inconsistent results.

Actual Fix: Newspaper3k's author extraction is regex-based and fails on non-standard sites. Need custom author parser:

# The problem: Author detection is unreliable
article = Article('https://news-site.com/article')
article.download()
article.parse()
print(article.authors)
# Output: ['Share', 'Subscribe', 'John', 'Doe']  # Wrong!

# Solution 1: Override author extraction with custom rules
from newspaper import Article
import re

class CustomArticle(Article):
    """Article with more deterministic author extraction.

    Prefers <meta name="author"> content, then Schema.org JSON-LD
    `author` entries, over newspaper3k's regex-based guessing.

    NOTE(review): this assumes the parsed DOM is exposed as
    `self.clean_html` with lxml-style .xpath() — confirm the attribute
    name against the installed newspaper version (upstream names it
    `clean_doc`).
    """

    def extract_metas(self):
        super().extract_metas()

        # Fix: json was re-imported on every loop iteration; hoisted here.
        import json

        # Custom author extraction
        # Find author in meta tags
        author_metas = self.clean_html.xpath('//meta[@name="author"]/@content')
        if author_metas:
            self.authors = [a.strip() for a in author_metas[0].split(',')]

        # Try Schema.org JSON-LD
        schema_authors = self.clean_html.xpath('//script[@type="application/ld+json"]')
        for schema in schema_authors:
            try:
                data = json.loads(schema.text)
                if 'author' in data:
                    if isinstance(data['author'], list):
                        self.authors = [a['name'] for a in data['author']]
                    else:
                        self.authors = [data['author']['name']]
                    break
            # Fix: bare `except: pass` swallowed every error including real
            # bugs; only malformed JSON / unexpected shapes are expected here
            # (json.JSONDecodeError is a ValueError subclass).
            except (ValueError, KeyError, TypeError):
                pass

# Use custom article
article = CustomArticle(url)
article.download()
article.parse()

# Solution 2: Site-specific author patterns
# Per-domain byline regexes; keys are matched as substrings of the host.
AUTHOR_PATTERNS = {
    'example.com': [
        r'By ([^.]+)\.',
        r'Author: ([^.]+)\.',
    ],
    'news-site.org': [
        r'/author/([^/]+)/',
    ],
}

def extract_author(article, url):
    """Return author names, preferring site-specific regexes over newspaper.

    Looks up the URL's host against AUTHOR_PATTERNS and runs each regex
    against the raw HTML; the first pattern that matches wins. When no
    domain entry or pattern matches, falls back to newspaper3k's own
    `article.authors`.
    """
    from urllib.parse import urlparse

    host = urlparse(url).netloc

    for site_key, site_patterns in AUTHOR_PATTERNS.items():
        if site_key not in host:
            continue
        page_html = article.html
        for regex in site_patterns:
            found = re.findall(regex, page_html)
            if found:
                return found

    # Nothing matched — defer to newspaper's built-in extraction.
    return article.authors

# Solution 3: Machine learning author extraction
# Use spaCy NER to find author names
import spacy

nlp = spacy.load('en_core_web_sm')

def extract_author_ner(article_text):
    """Guess article authors using spaCy named-entity recognition.

    Scans the text for PERSON entities and keeps those whose surrounding
    ±20-character window contains an author-ish cue word ('by', 'author',
    'written'). Relies on the module-level `nlp` spaCy pipeline. Returns
    a deduplicated list; order is not preserved because of the set().
    """
    doc = nlp(article_text)

    # Look for PERSON entities near "By" or "author"
    authors = []
    for ent in doc.ents:
        if ent.label_ == 'PERSON':
            # Check if near author-related words
            # Window is clamped at the start of the text; the end slice is
            # safe because Python slicing tolerates over-long indices.
            context = doc.text[max(0, ent.start_char-20):ent.end_char+20]
            if any(word in context.lower() for word in ['by', 'author', 'written']):
                authors.append(ent.text)

    return list(set(authors))

# Solution 4: Manual author mapping
# For high-priority sites, manually define author selectors
# Hand-curated byline CSS selectors for high-priority outlets; keys are
# matched as substrings of the URL host.
AUTHOR_SELECTORS = {
    'nytimes.com': 'span.byline-author-name',
    'washingtonpost.com': 'a.author-name',
    'bbc.com': '.byline__name',
}

def get_author_manual(html, url):
    """Extract the byline via a manually curated CSS selector.

    When the URL's host matches an AUTHOR_SELECTORS entry and the
    selector hits an element, returns a one-element list with its
    stripped text; otherwise returns an empty list.
    """
    from urllib.parse import urlparse
    from bs4 import BeautifulSoup

    host = urlparse(url).netloc

    for site_key, css_selector in AUTHOR_SELECTORS.items():
        if site_key not in host:
            continue
        node = BeautifulSoup(html, 'html.parser').select_one(css_selector)
        if node is not None:
            return [node.get_text().strip()]

    return []
Issue #1578: Publish date is None or completely wrong
github.com/codelucas/newspaper/issues/1578

Problem: publish_date returns None even though date is visible, or returns random dates from sidebar.

What I Tried: Different config settings, manual regex - inconsistent.

Actual Fix: Date parsing is locale-sensitive. Need multiple fallback strategies:

# The problem: Date detection fails
article = Article(url)
article.download()
article.parse()
print(article.publish_date)
# Output: None (but date is clearly on page)

# Solution 1: Multi-strategy date extraction
from datetime import datetime
from dateutil import parser
import re

def extract_publish_date(article, url):
    """Best-effort publish-date extraction with layered fallbacks.

    Order: newspaper's own result -> JSON-LD / datePublished in the HTML
    -> common <meta> date tags -> visible date strings in the page -> a
    /YYYY/MM/DD/ segment in the URL. Returns a datetime or None.
    """
    # Try newspaper first
    if article.publish_date:
        return article.publish_date

    html = article.html

    # Strategy 1: JSON-LD structured data
    import json
    # Fix: the first pattern was an empty regex (r'') which matched at
    # every character position and flooded the loop with '' matches;
    # restored to a <script type="application/ld+json"> content capture.
    schema_patterns = [
        r'<script[^>]*type=["\']application/ld\+json["\'][^>]*>(.*?)</script>',
        r'"datePublished":\s*"([^"]+)"',
    ]

    for pattern in schema_patterns:
        # DOTALL so multi-line <script> bodies are captured.
        matches = re.findall(pattern, html, re.DOTALL | re.IGNORECASE)
        for match in matches:
            try:
                # Handle both JSON and raw strings
                if '{' in match:
                    data = json.loads(match)
                    if isinstance(data, dict) and 'datePublished' in data:
                        return parser.parse(data['datePublished'])
                else:
                    return parser.parse(match)
            # Fix: bare except hid real bugs; only expected failures here
            # are malformed JSON (ValueError) and unparseable dates.
            except (ValueError, TypeError, OverflowError):
                pass

    # Strategy 2: Meta tags
    meta_date_selectors = [
        'meta[property="article:published_time"]',
        'meta[name="pubdate"]',
        'meta[name="date"]',
        'meta[itemprop="datePublished"]',
    ]

    from bs4 import BeautifulSoup
    soup = BeautifulSoup(html, 'html.parser')

    for selector in meta_date_selectors:
        elem = soup.select_one(selector)
        if elem:
            try:
                content = elem.get('content', '').strip()
                if content:
                    return parser.parse(content)
            except (ValueError, TypeError, OverflowError):
                pass

    # Strategy 3: Common date formats in text
    date_patterns = [
        # Fix: the month alternation was a CAPTURING group, so findall
        # returned only the month name ("March"), which dateutil then
        # "parsed" into a wrong date. Non-capturing keeps the full match.
        r'(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},\s+\d{4}',
        r'\d{4}-\d{2}-\d{2}',
        r'\d{1,2}/\d{1,2}/\d{4}',
    ]

    for pattern in date_patterns:
        matches = re.findall(pattern, html)
        if matches:
            try:
                return parser.parse(matches[0])
            except (ValueError, TypeError, OverflowError):
                pass

    # Strategy 4: Check URL for date
    # Example: /news/2023/03/15/article-slug
    url_date_pattern = r'/(\d{4})/(\d{2})/(\d{2})/'
    url_match = re.search(url_date_pattern, url)
    if url_match:
        year, month, day = url_match.groups()
        try:
            return datetime(int(year), int(month), int(day))
        except ValueError:
            # e.g. /2023/13/45/ — digits that are not a real date
            pass

    return None

# Solution 2: Site-specific date formats
# Some sites use non-standard formats
# strptime formats for sites with known, non-standard date rendering;
# keys are matched as substrings of the URL host.
DATE_FORMATS = {
    'bbc.com': '%d %B %Y',  # 15 March 2023
    'theguardian.com': '%Y-%m-%d',
}

def parse_site_specific(html, url):
    """Parse publish dates for sites with known, non-standard formats.

    NOTE(review): this is an unfinished stub — it sets up an English
    locale for month names but never actually extracts or returns a date
    (implicitly returns None). `html` and the matched `fmt` are unused
    until the site-specific parsing logic is filled in.
    """
    from urllib.parse import urlparse
    import locale

    domain = urlparse(url).netloc

    for pattern, fmt in DATE_FORMATS.items():
        if pattern in domain:
            # Set locale for month names
            # Try US then GB English; silently keep the current locale if
            # neither is installed on the host.
            for loc in ['en_US.UTF-8', 'en_GB.UTF-8']:
                try:
                    locale.setlocale(locale.LC_TIME, loc)
                    break
                except:
                    pass

            # Find and parse date
            # ... site-specific parsing logic

# Solution 3: Fallback to HTTP headers
def get_date_from_headers(url, timeout=10):
    """Last-resort date source: the server's Last-Modified response header.

    Sends a HEAD request and parses the Last-Modified header when present.
    Fixes: the original had no timeout, let network errors propagate, and
    used a bare except. Returns None on any failure — this is a
    best-effort fallback, not an authority on the publish date.
    """
    try:
        response = requests.head(url, timeout=timeout)
    except requests.RequestException:
        return None
    # Check Last-Modified header
    if 'Last-Modified' in response.headers:
        try:
            return parser.parse(response.headers['Last-Modified'])
        except (ValueError, TypeError, OverflowError):
            pass
    return None
Issue #1689: Article text includes comments and ads
github.com/codelucas/newspaper/issues/1689

Problem: Extracted text includes "Related Articles", "Comments", ad text, and footer content.

What I Tried: Different config settings, manual cleaning - noise remains.

Actual Fix: Newspaper's content detection isn't perfect. Need post-processing cleanup:

# The problem: Text includes unwanted content
article.parse()
print(article.text)
# Output: Article text... "Related Articles: [list]" "Comments: [comments]" "Follow us..."

# Solution 1: Clean text after extraction
import re

def clean_article_text(text):
    """Strip boilerplate and noise from extracted article text.

    Three passes: (1) delete known noise markers and everything after
    them (patterns are DOTALL, so a match wipes to the end of the text);
    (2) drop lines whose stripped length is 50 chars or less — these are
    usually navigation or ads; (3) drop repeated lines, keeping the
    first occurrence (comparison is on stripped content).
    """
    noise_markers = (
        r'Related Articles?:.*',
        r'Recommended:.*',
        r'Comments?:.*',
        r'Share this:.*',
        r'Follow us on:.*',
        r'Subscribe to:.*',
        r'Advertisement.*',
        r'Sponsored:.*',
        r'You may also like:.*',
        r'\[.*?read more.*?\]',
    )

    stripped = text
    for marker in noise_markers:
        stripped = re.sub(marker, '', stripped, flags=re.IGNORECASE | re.DOTALL)

    # Keep only substantial lines; short ones are almost always chrome.
    substantial = (line for line in stripped.split('\n') if len(line.strip()) > 50)

    # First-occurrence dedup keyed on stripped content.
    seen_keys = set()
    kept = []
    for line in substantial:
        key = line.strip()
        if key in seen_keys:
            continue
        seen_keys.add(key)
        kept.append(line)

    return '\n'.join(kept)

# Use it
article.parse()
clean_text = clean_article_text(article.text)

# Solution 2: Configure newspaper to be more strict
# NOTE(review): keep_article_html and memoize_articles are real Config
# options, but the underscore attributes below (_body_text, _lists,
# _links) are not part of newspaper3k's public Config API — setting them
# likely has no effect. Confirm against the installed newspaper version.
from newspaper import Config

config = Config()
config.keep_article_html = False
config.memoize_articles = False

# Stricter body text extraction
config._body_text = True

# Don't include lists/tables (often ads)
config._lists = False
config._links = False

article = Article(url, config=config)
article.download()
article.parse()

# Solution 3: Manual text extraction with custom selectors
# For problematic sites
from bs4 import BeautifulSoup

def manual_extract(html, url):
    """Pull the article body from raw HTML via common container selectors.

    Tries each selector in order; inside the first container found,
    removes obvious non-article elements (asides, comments, related
    links, ads) and returns the remaining text joined with newlines.
    Returns None when no known container is present. `url` is accepted
    for interface parity with the other extractors but is not consulted.
    """
    soup = BeautifulSoup(html, 'html.parser')

    # Common article containers, tried in order.
    candidate_selectors = (
        'article',
        '[itemprop="articleBody"]',
        '.article-content',
        '.post-content',
        '.entry-content',
        '#article-body',
    )

    for css in candidate_selectors:
        body = soup.select_one(css)
        if body is None:
            continue
        # Strip obvious noise before reading the text.
        for junk in body.select('aside, .comments, .related, .ads, .advertisement'):
            junk.decompose()
        return body.get_text(separator='\n', strip=True)

    return None

# Solution 4: Use readability as fallback
from readability import Document

def extract_with_readability(html):
    """Fallback extractor using readability-lxml's content heuristics.

    Note: Document.summary() returns cleaned article HTML, not plain
    text — strip tags downstream if plain text is required.
    """
    doc = Document(html)
    return doc.summary()

# Combine approaches
# Pipeline: newspaper parse -> noise cleanup -> manual selector
# extraction -> readability, falling through whenever the result looks
# too short (< 200 words) to be a real article body.
article = Article(url)
article.download()

try:
    article.parse()
    text = clean_article_text(article.text)

    # Check if text is too short, try fallback
    if len(text.split()) < 200:
        text = manual_extract(article.html, url)

        if not text or len(text.split()) < 200:
            text = extract_with_readability(article.html)

except Exception as e:
    # Fallback to manual extraction
    # NOTE(review): if download/parse failed, article.html may be empty;
    # manual_extract then returns None — confirm callers handle that.
    text = manual_extract(article.html, url)

Production Pipeline

# Complete article extraction pipeline
import requests
from newspaper import Article, Config
from urllib.parse import urljoin, urlparse
import time

class ArticleExtractor:
    """Download and parse articles via newspaper3k with cleanup fallbacks.

    Wraps newspaper's Article with the noise-cleaning
    (clean_article_text) and multi-strategy date extraction
    (extract_publish_date) helpers defined earlier in this pipeline.
    """

    def __init__(self):
        # NOTE(review): self.session is never used — newspaper performs its
        # own HTTP requests internally; kept for interface compatibility.
        self.session = requests.Session()
        self.config = Config()
        self.config.browser_user_agent = 'Mozilla/5.0'
        self.config.request_timeout = 10

    def extract(self, url):
        """Extract one article with all fallbacks; returns a dict or None."""
        # Fix: datetime was used below but never imported in this snippet,
        # so every successful extract raised NameError (masked by the broad
        # except and reported as a failure).
        from datetime import datetime

        try:
            article = Article(url, config=self.config)
            article.download()
            article.parse()

            # Enhance metadata
            data = {
                'url': url,
                'title': article.title or '',
                'text': self._clean_text(article.text),
                'authors': article.authors or [],
                'publish_date': self._extract_date(article, url),
                'top_image': article.top_image or '',
                # list() already yields [] for an empty set; `or []` was a no-op.
                'images': list(article.images),
                'keywords': article.keywords or [],
                'summary': article.summary or '',
                'movies': article.movies or [],
                'domain': urlparse(url).netloc,
                'extracted_at': datetime.now().isoformat(),
            }

            # Validate
            if not data['title'] or not data['text']:
                raise ValueError("Failed to extract content")

            return data

        except Exception as e:
            # Broad by design: a failure on one URL must not kill a bulk run;
            # the caller receives None and moves on.
            print(f"Extraction failed for {url}: {e}")
            return None

    def _clean_text(self, text):
        """Remove noise from article text (delegates to clean_article_text)."""
        if not text:
            return ""

        # Apply cleaning rules
        return clean_article_text(text)

    def _extract_date(self, article, url):
        """Multi-strategy date extraction (delegates to extract_publish_date)."""
        return extract_publish_date(article, url)

    def bulk_extract(self, urls, delay=1):
        """Extract multiple URLs, sleeping `delay` seconds between requests.

        Returns only the successful extractions; failures are skipped.
        """
        results = []

        for url in urls:
            data = self.extract(url)
            if data:
                results.append(data)

            # Be nice to servers
            time.sleep(delay)

        return results

# Usage
extractor = ArticleExtractor()

# Single article
article = extractor.extract('https://example.com/news/article')

# Bulk
urls = ['url1', 'url2', 'url3']
articles = extractor.bulk_extract(urls, delay=2)

Comparison with Alternatives

Playwright

For dynamic content and SPA sites

ScrapeGraph-AI

AI-powered content extraction

DrissionPage

Browser-based extraction

Newspaper3k GitHub

Official repository