Final part: combining all the pieces into a working CLI application.
CLI with argparse
import argparse
import sys
import logging
from core.spider import DoubanSpider
from core.ai_engine import AIEngine
from core.database import Database
from analysis.charts import ChartGenerator
from utils.logger import get_logger
logger = get_logger(__name__)
def main():
    """Main entry point for the spider"""
    args = _build_arg_parser().parse_args()

    # One-shot maintenance commands short-circuit the normal pipeline.
    if args.info:
        return show_database_info()
    if args.clear:
        return clear_database()

    return run_pipeline(args)


def _build_arg_parser():
    """Construct the command-line parser for the spider CLI."""
    parser = argparse.ArgumentParser(
        description='Douban AI Spider - Intelligent Movie Data Crawler',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  %(prog)s               # Run full pipeline
  %(prog)s --test        # Test mode (1 page)
  %(prog)s --skip-scrape # Use existing database
  %(prog)s --skip-ai     # Skip AI parsing
  %(prog)s --pages 5     # Scrape 5 pages only
""",
    )

    # Action options
    parser.add_argument('--skip-scrape', action='store_true',
                        help='Skip web scraping (use existing database)')
    parser.add_argument('--skip-ai', action='store_true',
                        help='Skip AI parsing (store raw data only)')
    parser.add_argument('--skip-charts', action='store_true',
                        help='Skip chart generation')

    # Configuration options
    parser.add_argument('--pages', type=int, default=10,
                        help='Number of pages to scrape (default: 10)')
    parser.add_argument('--test', action='store_true',
                        help='Run in test mode (scrape only 1 page)')

    # Special commands
    parser.add_argument('--info', action='store_true',
                        help='Show database information')
    parser.add_argument('--clear', action='store_true',
                        help='Clear all data from database')

    return parser
Pipeline execution
def run_pipeline(args):
    """Execute the complete spider pipeline.

    Steps (each individually skippable via CLI flags):
      1. Scrape raw movie data from Douban.
      2. Parse/enrich it with the AI engine.
      3. Persist the results to the database.
      4. Render charts from everything stored so far.

    Args:
        args: parsed argparse namespace (skip_scrape, skip_ai,
              skip_charts, pages, test).

    Returns:
        int: process exit code -- 0 on success, 1 on a fatal error.
    """
    print("=" * 60)
    print(" Douban AI Spider - Starting")
    print("=" * 60)
    print()

    processed_movies = []

    # Step 1: Web Scraping
    if args.skip_scrape:
        logger.info("Skipping web scraping")
        logger.info("Using existing database")
    else:
        logger.info("Step 1: Web scraping...")
        # Test mode caps the crawl at a single page regardless of --pages.
        pages_to_scrape = 1 if args.test else args.pages
        try:
            with DoubanSpider() as spider:
                spider.total_pages = pages_to_scrape
                raw_movies = spider.fetch_all_pages()
            if not raw_movies:
                logger.error("No movies scraped. Exiting.")
                return 1
            logger.info(f"Scraped {len(raw_movies)} movies")
            # Step 2: AI parsing (degrades gracefully on AI failure)
            processed_movies = _parse_movies(raw_movies, args.skip_ai)
        except Exception as e:
            logger.error(f"Scraping failed: {e}")
            return 1

    # Step 3: Database Storage.
    # BUGFIX: the original ran the insert unconditionally, so --skip-scrape
    # wrote an empty batch ("Stored 0/0"). Only store freshly scraped data.
    if processed_movies:
        logger.info("Step 3: Storing in database...")
        try:
            with Database() as db:
                success_count = db.insert_movies_batch(processed_movies)
            logger.info(f"Stored {success_count}/{len(processed_movies)} movies")
        except Exception as e:
            logger.error(f"Database error: {e}")
            return 1

    # Step 4: Data Visualization (non-fatal on failure)
    if not args.skip_charts:
        _generate_charts()

    print()
    print("=" * 60)
    print(" ✓ Completed successfully!")
    print("=" * 60)
    print()
    return 0


def _parse_movies(raw_movies, skip_ai):
    """Step 2: enrich scraped movies via the AI engine.

    Falls back to default (empty) structured fields when the AI engine
    fails to initialise, so the raw data is still stored.
    """
    if skip_ai:
        logger.info("Skipping AI parsing")
        return raw_movies

    logger.info("Step 2: AI parsing...")
    try:
        with AIEngine() as ai:
            parsed = ai.parse_movie_batch(raw_movies)
        # Clean up intermediate fields the database does not need.
        for movie in parsed:
            movie.pop('info_text', None)
            movie.pop('cover_url', None)
        return parsed
    except ValueError as e:
        logger.error(f"AI Engine failed: {e}")
        logger.info("Continuing without AI...")
        for movie in raw_movies:
            # Fresh dict literal per movie so the list defaults are not
            # shared between records.
            movie.update({
                'director': None,
                'actors': [],
                'year': None,
                'country': None,
                'genres': []
            })
        return raw_movies


def _generate_charts():
    """Step 4: render charts from every movie currently in the database.

    Failures are logged but never abort the pipeline.
    """
    logger.info("Step 4: Generating charts...")
    try:
        with Database() as db:
            movies = db.get_all_movies()
        if movies:
            generator = ChartGenerator()
            results = generator.generate_all_charts(movies)
            logger.info("Charts generated:")
            for chart_type, path in results.items():
                if path:
                    logger.info(f" ✓ {chart_type}: {path}")
    except Exception as e:
        logger.error(f"Chart generation failed: {e}")
Logging setup
# utils/logger.py
import logging
import sys
from pathlib import Path
def get_logger(name: str) -> logging.Logger:
    """Get a configured logger instance.

    Attaches an INFO-level console handler (stdout) and a DEBUG-level
    file handler (logs/spider.log). Idempotent: repeated calls with the
    same name return the already-configured logger without stacking
    duplicate handlers.

    Args:
        name: Logger name, normally the calling module's ``__name__``.

    Returns:
        logging.Logger: the configured logger.
    """
    logger = logging.getLogger(name)
    if logger.handlers:
        # Already configured on a previous call -- reuse as-is.
        return logger

    logger.setLevel(logging.INFO)
    # BUGFIX: without this, records also propagate to the root logger, so
    # any root configuration (e.g. logging.basicConfig elsewhere) would
    # emit every message a second time.
    logger.propagate = False

    # Both handlers share one format, so build the formatter once.
    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    )

    # Console handler
    console_handler = logging.StreamHandler(sys.stdout)
    console_handler.setLevel(logging.INFO)
    console_handler.setFormatter(formatter)

    # File handler (captures DEBUG and above for post-mortem debugging)
    log_path = Path("logs/spider.log")
    log_path.parent.mkdir(parents=True, exist_ok=True)
    file_handler = logging.FileHandler(log_path, encoding='utf-8')
    file_handler.setLevel(logging.DEBUG)
    file_handler.setFormatter(formatter)

    logger.addHandler(console_handler)
    logger.addHandler(file_handler)
    return logger
Usage examples
# Run full pipeline
python main.py
# Test mode (1 page only)
python main.py --test
# Skip scraping, use existing data
python main.py --skip-scrape
# Skip AI parsing (faster)
python main.py --skip-ai
# Custom number of pages
python main.py --pages 5
# View database info
python main.py --info
# Clear database
python main.py --clear
Common issues
403 Forbidden
Happened when scraping too fast. Fixed by:
- Increase delay in config.py
- Check if IP is blocked
- Try again later
API Rate Limit
Hit limits with DeepSeek. Fixed by:
- Check API quota
- Add caching to avoid duplicate calls
- Reduce batch size
Chinese characters not displaying
In charts mostly. Fixed by:
- Install fonts: apt-get install fonts-wqy-zenhei
- Configure matplotlib font settings
Memory issues
Browser instances consumed excessive memory during long multi-page runs. Fixed by restarting the browser periodically.
Possible improvements
- Proxy support: Rotating proxies for large-scale scraping
- Incremental updates: Only fetch new data since last run
- Web dashboard: Flask/FastAPI UI
- Export options: CSV, Excel, JSON export
- Docker: Containerize for deployment
Legal note
Important: Respect robots.txt, implement rate limiting, and use scraped data responsibly. This project is for educational purposes only.
Summary
Built a complete AI-powered web scraper with:
- HTTP client with anti-scraping measures
- LLM-based data parsing
- SQLite database
- Data visualization
- CLI interface
- Error handling
Full source code: github.com/stars1324/python-ai-spider