Newspaper3k: Finally Got News Extraction Working
Building a news aggregator? Newspaper3k extracts article text from messy HTML. But it breaks on Chinese sites, paywalls, and weird encodings. Here's how I fixed it.
Why Newspaper3k Over Just Parsing HTML
- Removes cruft: Ads, navigation, comments - gets just the article
- Multi-language: Works on Chinese, Arabic, Cyrillic (after some fixes)
- Metadata: Authors, publish date, images, top image
- The bad: Struggles with paywalls, lazy-loaded content, SPA sites
- Works for: 90% of traditional news sites. Use Playwright for the rest
Basic Usage
# Basic single-article extraction with newspaper3k.
# ArticleException must be imported explicitly or the except clause below
# raises NameError instead of catching the parse failure.
from newspaper import Article, ArticleException
import requests

# Single article
url = 'https://example.com/news/article'
article = Article(url)
article.download()   # fetch the HTML
article.parse()      # extract text + metadata

print(f"Title: {article.title}")
print(f"Authors: {article.authors}")
print(f"Publish date: {article.publish_date}")
print(f"Text: {article.text}")
print(f"Top image: {article.top_image}")
print(f"Movies: {article.movies}")
# keywords (and summary) stay empty until nlp() has run
article.nlp()
print(f"Keywords: {article.keywords}")

# With error handling: RequestException covers network failures,
# ArticleException covers download/parse failures inside newspaper.
try:
    article = Article(url)
    article.download()
    article.parse()
except requests.exceptions.RequestException as e:
    print(f"Failed to download: {e}")
except ArticleException as e:
    print(f"Failed to parse: {e}")
Bulk Processing
# Bulk processing: crawl a whole outlet, then individual URL lists.
import newspaper
from newspaper import Article

# Build a Source object (discovers article URLs from the homepage/feeds)
source = newspaper.build('https://example.com', memoize_articles=False)
# size() is a method on Source, not an attribute -- without the call this
# printed a bound-method repr instead of the article count.
print(f"Total articles: {source.size()}")
print(f"Brand: {source.brand}")

# Download and parse all articles; individual failures are common, so
# isolate each article in its own try block.
for article in source.articles:
    try:
        article.download()
        article.parse()
        print(f"Title: {article.title}")
        print(f"URL: {article.url}")
        print("---" * 20)
    except Exception as e:
        print(f"Failed: {e}")

# Or process specific URLs -- same per-URL error isolation as above,
# so one bad URL doesn't abort the whole batch.
urls = [
    'https://example.com/news1',
    'https://example.com/news2',
    'https://example.com/news3',
]
for url in urls:
    try:
        article = Article(url)
        article.download()
        article.parse()
        # Do something with article.text
    except Exception as e:
        print(f"Failed: {e}")
Common Problems & Solutions
Problem: Extracting Chinese or Japanese articles returns garbled text like "���ij���" instead of proper characters.
What I Tried: Set encoding explicitly, used different HTML parsers - still garbled.
Actual Fix: Newspaper3k uses requests which doesn't always detect encoding correctly. Need to force encoding from HTTP headers:
# The problem: Wrong encoding detection
article = Article('https://chinese-news-site.com/article')
article.download()
article.parse()
print(article.text)
# Output: ���ij��� (garbage)

# Solution 1: Fetch yourself, force the encoding, hand the HTML to newspaper
import requests
from newspaper import Article, Config

url = 'https://chinese-news-site.com/article'
response = requests.get(url)
response.encoding = 'utf-8'  # Or 'gb2312', 'gbk', 'big5'

# Create article from the correctly-decoded response
article = Article(url)
article.download(input_html=response.text)
article.parse()
print(article.text)  # Now shows Chinese correctly

# Solution 2: Use custom config plus chardet-based encoding detection
import chardet

config = Config()
config.browser_user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
config.request_timeout = 10
config.number_threads = 1
config.fetch_images = False  # Faster for text-only

# chardet.detect() needs raw BYTES. article.html is already a decoded str
# after download(), so detecting on it (or re-encoding it) is too late --
# detect on the undecoded response body instead, then decode once, correctly.
resp = requests.get(url, timeout=10)
detected = chardet.detect(resp.content)['encoding']
if detected:
    resp.encoding = detected
article = Article(url, config=config)
article.download(input_html=resp.text)
article.parse()
# Solution 3: Language-specific parser
class ChineseArticle(Article):
    """Article subclass that forces the Chinese parser and segments the text."""

    def parse(self):
        # Force newspaper's Chinese language handling before delegating
        self.config.language = 'zh'
        super().parse()
        # Chinese has no word boundaries; jieba splits the text into
        # space-separated tokens for downstream processing.
        import jieba
        segmented = jieba.cut(self.text)
        self.text = ' '.join(segmented)

# Use it
article = ChineseArticle('https://chinese-site.com/article')
article.download()
article.parse()
# Solution 4: Handle GBK/GB2312 encoding
# Many Chinese sites use GBK encoding
def safe_download(article, timeout=10):
    """Download `article`, retrying with GBK decoding on UnicodeDecodeError.

    Many mainland-China sites serve GBK/GB2312 rather than UTF-8; when the
    default decode blows up, re-fetch the page ourselves, force GBK, and
    feed the decoded HTML back to newspaper.

    The `timeout` keyword (new, defaulted) keeps the retry request from
    hanging forever on a slow host.
    """
    try:
        article.download()
    except UnicodeDecodeError:
        response = requests.get(article.url, timeout=timeout)
        response.encoding = 'gbk'
        article.download(input_html=response.text)
Problem: Articles behind paywalls return empty text or just "Subscribe to continue" message.
What I Tried: Set user agent, added cookies - still blocked.
Actual Fix: Need a paywall-free copy of the page. Options, in the order shown below: a text-extraction proxy, the Jina AI Reader API, Google's cached copy, or Archive.today:
# The problem: Paywall blocks content
article = Article('https://paywalled-site.com/article')
article.download()
article.parse()
print(article.text)
# Output: "Subscribe to read more" (empty)

# Solution 1: Use textise dot iitty (for some sites)
import requests

def bypass_paywall(url):
    """Fetch the text-proxy version of a page; None when the proxy fails."""
    textise_url = url.replace('://', '://textise dot iitty/')
    response = requests.get(textise_url)
    return response.text if response.status_code == 200 else None

# Use with Newspaper
url = 'https://paywalled-site.com/article'
text_html = bypass_paywall(url)
if text_html:
    article = Article(url)
    article.download(input_html=text_html)
    article.parse()
    print(article.text)
# Solution 2: Use the Jina AI Reader endpoint
# Requires API key
import requests

api_url = f"https://r.jina.ai/http://{url}"
response = requests.get(api_url)
article_text = response.text

# The reader already returns extracted content, so skip newspaper and
# strip the remaining markup directly.
from bs4 import BeautifulSoup
soup = BeautifulSoup(article_text, 'html.parser')
print(soup.get_text())
# Solution 3: Use Google Cache
# NOTE(review): Google retired its public web cache in 2024 -- treat this
# as a legacy fallback and verify the endpoint still responds.
import requests

def get_cached(url, timeout=10):
    """Fetch Google's cached copy of `url` and return the response body.

    `timeout` (new, defaulted) prevents hanging on a dead endpoint.
    """
    cache_url = f"https://webcache.googleusercontent.com/search?q=cache:{url}"
    response = requests.get(cache_url, headers={
        'User-Agent': 'Mozilla/5.0'
    }, timeout=timeout)
    return response.text

# Use cached version
cached_html = get_cached(url)
article = Article(url)
article.download(input_html=cached_html)
article.parse()
# Solution 4: Archive.today as fallback
import requests

def get_archive(url, timeout=30):
    """Submit `url` to archive.is and return the response body.

    NOTE(review): this returns the HTML of the *submission* page, not the
    archived article itself -- the archived URL still has to be extracted
    from it by the caller. `timeout` (new, defaulted) bounds the request;
    archiving can be slow.
    """
    archive_api = "https://archive.is/submit/"
    response = requests.post(archive_api, data={'url': url}, timeout=timeout)
    # Get archived URL from response
    return response.text

# For sites that don't allow bypassing:
# Use alternative RSS feeds or content APIs
Problem: Authors list contains random people from sidebar, ads, or empty list when author is clearly visible.
What I Tried: Different config options, manual parsing - inconsistent results.
Actual Fix: Newspaper3k's author extraction is regex-based and fails on non-standard sites. Need custom author parser:
# The problem: Author detection is unreliable
article = Article('https://news-site.com/article')
article.download()
article.parse()
print(article.authors)
# Output: ['Share', 'Subscribe', 'John', 'Doe'] # Wrong!

# Solution 1: Override author extraction with custom rules
from newspaper import Article
import re
import json

class CustomArticle(Article):
    """Article subclass with meta-tag / JSON-LD based author extraction.

    NOTE(review): this relies on newspaper internals (`extract_metas`,
    `clean_html`) -- confirm those names against the installed newspaper3k
    version before deploying.
    """

    def extract_metas(self):
        super().extract_metas()
        # Prefer an explicit <meta name="author"> tag when present
        author_metas = self.clean_html.xpath('//meta[@name="author"]/@content')
        if author_metas:
            self.authors = [a.strip() for a in author_metas[0].split(',')]
        # Fall back to Schema.org JSON-LD. The "author" field may be a
        # list, a dict, or a plain string -- handle all three instead of
        # letting a TypeError fall into a silent except.
        for schema in self.clean_html.xpath('//script[@type="application/ld+json"]'):
            try:
                data = json.loads(schema.text)
            except (ValueError, TypeError):
                continue  # malformed or empty JSON-LD block
            if not isinstance(data, dict) or 'author' not in data:
                continue
            author = data['author']
            if isinstance(author, list):
                self.authors = [a['name'] if isinstance(a, dict) else str(a)
                                for a in author]
            elif isinstance(author, dict):
                self.authors = [author['name']]
            else:
                self.authors = [str(author)]
            break

# Use custom article
article = CustomArticle(url)
article.download()
article.parse()
# Solution 2: Site-specific author patterns
import re
from urllib.parse import urlparse

AUTHOR_PATTERNS = {
    'example.com': [
        r'By ([^.]+)\.',
        r'Author: ([^.]+)\.',
    ],
    'news-site.org': [
        r'/author/([^/]+)/',
    ],
}

def extract_author(article, url):
    """Return author names for `article`, trying per-domain regexes first.

    Falls back to newspaper's own `article.authors` when no configured
    pattern matches the page HTML.
    """
    domain = urlparse(url).netloc
    for known_domain, patterns in AUTHOR_PATTERNS.items():
        # Exact host or subdomain match. A plain substring test would also
        # match unrelated hosts like 'notexample.com'.
        if domain == known_domain or domain.endswith('.' + known_domain):
            html = article.html
            for pattern in patterns:
                matches = re.findall(pattern, html)
                if matches:
                    return matches
    # Fallback to newspaper
    return article.authors
# Solution 3: Machine learning author extraction
# Use spaCy NER to find author names
import spacy

nlp = spacy.load('en_core_web_sm')

def extract_author_ner(article_text):
    """Return PERSON entities that appear near byline words.

    A name counts as an author when 'by', 'author', or 'written' occurs
    within ~20 characters of the entity.
    """
    doc = nlp(article_text)
    authors = []
    for ent in doc.ents:
        if ent.label_ != 'PERSON':
            continue
        # Inspect a small window around the entity for byline cues
        context = doc.text[max(0, ent.start_char - 20):ent.end_char + 20]
        if any(word in context.lower() for word in ['by', 'author', 'written']):
            authors.append(ent.text)
    # Dedupe while keeping first-seen order; list(set(...)) returned the
    # names in a nondeterministic order.
    return list(dict.fromkeys(authors))
# Solution 4: Manual author mapping
# For high-priority sites, manually define author CSS selectors
AUTHOR_SELECTORS = {
    'nytimes.com': 'span.byline-author-name',
    'washingtonpost.com': 'a.author-name',
    'bbc.com': '.byline__name',
}

def get_author_manual(html, url):
    """Return [author] using a site-specific CSS selector, [] if unknown."""
    from urllib.parse import urlparse
    from bs4 import BeautifulSoup
    domain = urlparse(url).netloc
    for known_domain, selector in AUTHOR_SELECTORS.items():
        # Exact host or subdomain match, not a loose substring test that
        # would also hit lookalike domains.
        if domain == known_domain or domain.endswith('.' + known_domain):
            soup = BeautifulSoup(html, 'html.parser')
            elem = soup.select_one(selector)
            if elem:
                return [elem.get_text().strip()]
    return []
Problem: publish_date returns None even though date is visible, or returns random dates from sidebar.
What I Tried: Different config settings, manual regex - inconsistent.
Actual Fix: Date parsing is locale-sensitive. Need multiple fallback strategies:
# The problem: Date detection fails
article = Article(url)
article.download()
article.parse()
detected_date = article.publish_date
print(detected_date)
# Output: None (but date is clearly on page)
# Solution 1: Multi-strategy date extraction
from datetime import datetime
from dateutil import parser
import json
import re

def extract_publish_date(article, url):
    """Best-effort publish-date extraction with layered fallbacks.

    Order: newspaper's own result -> JSON-LD / inline "datePublished" ->
    meta tags -> visible date strings in the HTML -> a date embedded in
    the URL path. Returns a datetime, or None when nothing parses.
    """
    # Try newspaper first
    if article.publish_date:
        return article.publish_date
    html = article.html

    # Strategy 1: JSON-LD structured data.
    # (The original first pattern here was an empty regex r'', which
    # matched at every character position and flooded the loop.)
    schema_patterns = [
        r'<script[^>]*type=["\']application/ld\+json["\'][^>]*>(.*?)</script>',
        r'"datePublished":\s*"([^"]+)"',
    ]
    for pattern in schema_patterns:
        for match in re.findall(pattern, html, flags=re.DOTALL | re.IGNORECASE):
            try:
                # Handle both whole JSON-LD payloads and raw date strings
                if '{' in match:
                    data = json.loads(match)
                    if isinstance(data, dict) and 'datePublished' in data:
                        return parser.parse(data['datePublished'])
                else:
                    return parser.parse(match)
            except (ValueError, OverflowError, TypeError):
                continue

    # Strategy 2: Meta tags
    meta_date_selectors = [
        'meta[property="article:published_time"]',
        'meta[name="pubdate"]',
        'meta[name="date"]',
        'meta[itemprop="datePublished"]',
    ]
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(html, 'html.parser')
    for selector in meta_date_selectors:
        elem = soup.select_one(selector)
        if elem is None:
            continue
        content = elem.get('content', '').strip()
        if content:
            try:
                return parser.parse(content)
            except (ValueError, OverflowError):
                pass

    # Strategy 3: Common date formats in the page text.
    # The month alternation must be NON-capturing: with a capture group,
    # re.findall returns only the month name ("January"), which dateutil
    # happily parses into a bogus date.
    date_patterns = [
        r'(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},\s+\d{4}',
        r'\d{4}-\d{2}-\d{2}',
        r'\d{1,2}/\d{1,2}/\d{4}',
    ]
    for pattern in date_patterns:
        matches = re.findall(pattern, html)
        if matches:
            try:
                return parser.parse(matches[0])
            except (ValueError, OverflowError):
                pass

    # Strategy 4: Check URL for date
    # Example: /news/2023/03/15/article-slug
    url_match = re.search(r'/(\d{4})/(\d{2})/(\d{2})/', url)
    if url_match:
        year, month, day = url_match.groups()
        try:
            return datetime(int(year), int(month), int(day))
        except ValueError:
            pass
    return None
# Solution 2: Site-specific date formats
# Some sites use non-standard formats
DATE_FORMATS = {
    'bbc.com': '%d %B %Y',  # 15 March 2023
    'theguardian.com': '%Y-%m-%d',
}

def parse_site_specific(html, url):
    """Stub for per-site strptime-based date parsing.

    Currently only forces an English locale (needed because %B month
    names are locale-sensitive) and returns None -- the actual per-site
    scraping of the date string still has to be implemented.
    """
    from urllib.parse import urlparse
    import locale
    domain = urlparse(url).netloc
    for known_domain, fmt in DATE_FORMATS.items():
        if known_domain in domain:
            # Set an English locale so strptime understands month names;
            # only locale.Error is expected here (was a bare except).
            for loc in ['en_US.UTF-8', 'en_GB.UTF-8']:
                try:
                    locale.setlocale(locale.LC_TIME, loc)
                    break
                except locale.Error:
                    continue
            # TODO: locate the date string in `html` and parse it with
            # datetime.strptime(date_string, fmt)
    return None
# Solution 3: Fallback to HTTP headers
def get_date_from_headers(url, timeout=10):
    """Return the Last-Modified header as a datetime, or None.

    Weak signal: Last-Modified often reflects CDN or deploy time rather
    than the publish date -- use only as a last resort. `timeout` (new,
    defaulted) bounds the HEAD request.
    """
    response = requests.head(url, timeout=timeout)
    if 'Last-Modified' in response.headers:
        try:
            return parser.parse(response.headers['Last-Modified'])
        except (ValueError, OverflowError):
            pass
    return None
Problem: Extracted text includes "Related Articles", "Comments", ad text, and footer content.
What I Tried: Different config settings, manual cleaning - noise remains.
Actual Fix: Newspaper's content detection isn't perfect. Need post-processing cleanup:
# The problem: extracted text still carries boilerplate along with the article
article.parse()
body_text = article.text
print(body_text)
# Output: Article text... "Related Articles: [list]" "Comments: [comments]" "Follow us..."
# Solution 1: Clean text after extraction
import re

def clean_article_text(text):
    """Strip boilerplate from extracted article text.

    Three passes: delete known noise markers (and everything after them,
    since re.DOTALL lets .* run to the end), drop short lines that are
    usually navigation or ads, and remove repeated lines.
    """
    noise_markers = [
        r'Related Articles?:.*',
        r'Recommended:.*',
        r'Comments?:.*',
        r'Share this:.*',
        r'Follow us on:.*',
        r'Subscribe to:.*',
        r'Advertisement.*',
        r'Sponsored:.*',
        r'You may also like:.*',
        r'\[.*?read more.*?\]',
    ]
    result = text
    for marker in noise_markers:
        result = re.sub(marker, '', result, flags=re.IGNORECASE | re.DOTALL)
    # Keep only substantial lines (> 50 chars once stripped)
    substantial = [ln for ln in result.split('\n') if len(ln.strip()) > 50]
    # Drop repeated lines, preserving the first occurrence
    emitted = set()
    kept = []
    for ln in substantial:
        key = ln.strip()
        if key not in emitted:
            kept.append(ln)
            emitted.add(key)
    return '\n'.join(kept)
# Apply the cleanup after parsing
article.parse()
cleaned = clean_article_text(article.text)
# Solution 2: Configure newspaper to be more strict
from newspaper import Config
config = Config()
config.keep_article_html = False
config.memoize_articles = False
# Stricter body text extraction
# NOTE(review): `_body_text`, `_lists` and `_links` are underscore-prefixed
# names that do not appear in newspaper3k's documented Config API; setting
# them looks like a no-op. Verify against the installed newspaper version
# before relying on these three lines.
config._body_text = True
# Don't include lists/tables (often ads)
config._lists = False
config._links = False
article = Article(url, config=config)
article.download()
article.parse()
# Solution 3: Manual text extraction with custom selectors
# For problematic sites
from bs4 import BeautifulSoup

def manual_extract(html, url):
    """Extract article text from common container elements.

    Tries well-known article containers in order, strips sidebars,
    comments and ad blocks from the first match, and returns its text.
    Returns None when no container is found.
    """
    soup = BeautifulSoup(html, 'html.parser')
    candidate_selectors = (
        'article',
        '[itemprop="articleBody"]',
        '.article-content',
        '.post-content',
        '.entry-content',
        '#article-body',
    )
    for css in candidate_selectors:
        node = soup.select_one(css)
        if node is None:
            continue
        # Drop non-article children before reading the text
        for junk in node.select('aside, .comments, .related, .ads, .advertisement'):
            junk.decompose()
        return node.get_text(separator='\n', strip=True)
    return None
# Solution 4: Use readability as fallback
from readability import Document

def extract_with_readability(html):
    """Return readability-lxml's cleaned article HTML for `html`."""
    return Document(html).summary()
# Combine approaches: newspaper first, then manual selectors, then readability
article = Article(url)
article.download()
try:
    article.parse()
    text = clean_article_text(article.text)
    # Fewer than ~200 words usually means extraction grabbed the wrong node
    if len(text.split()) < 200:
        text = manual_extract(article.html, url)
        if not text or len(text.split()) < 200:
            text = extract_with_readability(article.html)
except Exception:
    # Parsing failed outright -- go straight to manual extraction
    text = manual_extract(article.html, url)
Production Pipeline
# Complete article extraction pipeline
import requests
from newspaper import Article, Config
from urllib.parse import urljoin, urlparse
import time
class ArticleExtractor:
    """Production pipeline: newspaper3k extraction plus the cleanup and
    date-extraction fallbacks defined earlier (clean_article_text,
    extract_publish_date)."""

    def __init__(self):
        self.session = requests.Session()
        self.config = Config()
        self.config.browser_user_agent = 'Mozilla/5.0'
        self.config.request_timeout = 10

    def extract(self, url):
        """Extract one article with all fallbacks.

        Returns a metadata dict, or None on any failure.
        """
        # Imported locally: this section's imports did not include
        # datetime, so `datetime.now()` below raised NameError -- which the
        # broad except swallowed, making extract() always return None.
        from datetime import datetime
        try:
            article = Article(url, config=self.config)
            article.download()
            article.parse()
            # Enhance metadata
            data = {
                'url': url,
                'title': article.title or '',
                'text': self._clean_text(article.text),
                'authors': article.authors or [],
                'publish_date': self._extract_date(article, url),
                'top_image': article.top_image or '',
                'images': list(article.images),
                'keywords': article.keywords or [],
                'summary': article.summary or '',
                'movies': article.movies or [],
                'domain': urlparse(url).netloc,
                'extracted_at': datetime.now().isoformat(),
            }
            # An article without a title or body is useless downstream
            if not data['title'] or not data['text']:
                raise ValueError("Failed to extract content")
            return data
        except Exception as e:
            print(f"Extraction failed for {url}: {e}")
            return None

    def _clean_text(self, text):
        """Remove noise from article text via clean_article_text."""
        if not text:
            return ""
        return clean_article_text(text)

    def _extract_date(self, article, url):
        """Multi-strategy date extraction via extract_publish_date."""
        return extract_publish_date(article, url)

    def bulk_extract(self, urls, delay=1):
        """Extract multiple URLs, sleeping `delay` seconds between requests."""
        results = []
        for url in urls:
            data = self.extract(url)
            if data:
                results.append(data)
            # Be nice to servers
            time.sleep(delay)
        return results
# Usage
extractor = ArticleExtractor()

# Single article
article = extractor.extract('https://example.com/news/article')

# Bulk, with a 2-second pause between requests
batch = ['url1', 'url2', 'url3']
articles = extractor.bulk_extract(batch, delay=2)
Comparison with Alternatives
- Playwright (browser-based extraction): for dynamic content and SPA sites
- Jina AI Reader: AI-powered content extraction via a simple URL prefix
- Newspaper3k on GitHub: official repository and issue tracker