Data Visualization with Matplotlib

Creating charts from movie data

After storing the data

The data is now in the database. Let's visualize it to reveal trends and patterns that aren't obvious from the raw numbers.

Setup

from collections import Counter
from typing import Dict

import matplotlib
import matplotlib.pyplot as plt

# Set style
# Apply the style sheet FIRST. plt.style.use() overwrites every rcParam the
# sheet defines, and the seaborn sheets define their own 'font.sans-serif'
# list, so font overrides made before this call would be silently discarded.
plt.style.use('seaborn-v0_8-darkgrid')

# Configure for Chinese character support (must come after the style sheet).
# Fonts are tried in order; the first one installed on the machine is used.
matplotlib.rcParams['font.sans-serif'] = ['Arial Unicode MS', 'SimHei']
# Draw minus signs as ASCII '-' instead of U+2212, which CJK fonts often lack.
matplotlib.rcParams['axes.unicode_minus'] = False

# Chart configuration
FIGURE_SIZE = (12, 8)  # figure width and height, in inches
DPI = 300              # resolution used when saving figures

Chinese characters were an issue at first: I had to configure a font that contains CJK glyphs, otherwise they rendered as empty squares.

Year distribution

A line chart showing how many movies from each year made it into the Top 250:

from pathlib import Path

def create_year_distribution_chart(movies, output_dir: str):
    """Plot the number of movies per release year as a filled line chart.

    Args:
        movies: Iterable of dict-like records. Each record's ``'year'`` value
            is counted; records whose year is missing or falsy are skipped.
        output_dir: Directory that receives ``year_distribution.png``.

    Returns:
        The path of the written PNG file, as a string.
    """
    year_counts = Counter(m.get('year') for m in movies if m.get('year'))
    sorted_years = sorted(year_counts.items())

    years_list = [year for year, _ in sorted_years]
    counts_list = [count for _, count in sorted_years]

    output_path = Path(output_dir) / 'year_distribution.png'

    fig, ax = plt.subplots(figsize=FIGURE_SIZE)
    try:
        ax.plot(years_list, counts_list, marker='o', linewidth=2, markersize=6)
        ax.fill_between(years_list, counts_list, alpha=0.3)

        ax.set_xlabel('Release Year', fontsize=12, fontweight='bold')
        ax.set_ylabel('Number of Movies', fontsize=12, fontweight='bold')
        ax.set_title('Douban Top 250: Movie Distribution by Year', fontsize=14, fontweight='bold')
        ax.grid(True, alpha=0.3)

        plt.xticks(rotation=45, ha='right')
        plt.tight_layout()

        plt.savefig(output_path, dpi=DPI, bbox_inches='tight')
    finally:
        # pyplot keeps every open figure alive in a global registry, so close
        # this one even when drawing or saving fails (e.g. missing directory).
        plt.close(fig)

    return str(output_path)

Top directors

Horizontal bar chart:

def create_top_directors_chart(movies, output_dir: str, top_n: int = 10):
    """Plot the most frequent directors as a horizontal bar chart.

    Args:
        movies: Iterable of dict-like records. Each record's ``'director'``
            value is counted; records without a truthy director are skipped.
        output_dir: Directory that receives ``top_directors.png``.
        top_n: Maximum number of directors to show.

    Returns:
        The path of the written PNG file, as a string.
    """
    director_counts = Counter(
        m.get('director') for m in movies if m.get('director')
    )
    top_directors = director_counts.most_common(top_n)

    director_names = [name for name, _ in top_directors]
    movie_counts = [count for _, count in top_directors]

    # Color gradient. A colormap called with *integers* indexes its lookup
    # table directly, so range(n) would pick n almost identical colours from
    # the dark end. Floats in [0, 1] sample the whole map instead.
    n_bars = len(director_names)
    colors = plt.cm.viridis([i / max(n_bars - 1, 1) for i in range(n_bars)])

    output_path = Path(output_dir) / 'top_directors.png'

    fig, ax = plt.subplots(figsize=FIGURE_SIZE)
    try:
        bars = ax.barh(director_names, movie_counts, color=colors)

        # Add value labels just past the end of each bar
        for bar in bars:
            width = bar.get_width()
            ax.text(width + 0.1, bar.get_y() + bar.get_height() / 2,
                    f'{int(width)}', ha='left', va='center', fontsize=10)

        ax.set_xlabel('Number of Movies in Top 250', fontsize=12, fontweight='bold')
        ax.set_ylabel('Director', fontsize=12, fontweight='bold')
        ax.set_title(f'Top {top_n} Directors in Douban Top 250', fontsize=14, fontweight='bold')
        ax.grid(axis='x', alpha=0.3)

        # barh draws the first entry at the bottom; flip the axis so the
        # director with the most movies ends up on top.
        ax.invert_yaxis()
        plt.tight_layout()

        plt.savefig(output_path, dpi=DPI, bbox_inches='tight')
    finally:
        # Release the figure even if drawing or saving raised.
        plt.close(fig)

    return str(output_path)

Genre distribution

def create_genre_distribution_chart(movies, output_dir: str):
    """Plot how often each genre occurs as a pie chart.

    Args:
        movies: Iterable of dict-like records. Each record's ``'genres'``
            value is expected to be a list (a movie can have several genres);
            any other value is ignored.
        output_dir: Directory that receives ``genre_distribution.png``.

    Returns:
        The path of the written PNG file, as a string.
    """
    # Extract all genres (movies can have multiple)
    genre_counts = Counter()
    for movie in movies:
        movie_genres = movie.get('genres', [])
        if isinstance(movie_genres, list):
            genre_counts.update(movie_genres)

    genres = list(genre_counts.keys())
    counts = list(genre_counts.values())

    # Set3 is a qualitative map with a fixed number of colours. Integer
    # indices past the end are all clipped to a single colour, so wrap around
    # to keep neighbouring wedges distinguishable when there are many genres.
    palette = plt.cm.Set3
    colors = palette([i % palette.N for i in range(len(genres))])
    # Pull the large slices (more than 30 occurrences) slightly outwards.
    explode = [0.05 if count > 30 else 0 for count in counts]

    output_path = Path(output_dir) / 'genre_distribution.png'

    fig, ax = plt.subplots(figsize=FIGURE_SIZE)
    try:
        wedges, texts, autotexts = ax.pie(
            counts,
            labels=genres,
            autopct='%1.1f%%',
            startangle=90,
            colors=colors,
            explode=explode,
            shadow=True
        )

        # Enhance text
        for text in texts:
            text.set_fontsize(11)
        for autotext in autotexts:
            autotext.set_color('white')
            autotext.set_fontsize(10)
            autotext.set_fontweight('bold')

        ax.set_title('Genre Distribution in Douban Top 250', fontsize=14, fontweight='bold')
        ax.legend(wedges, [f'{g}: {c}' for g, c in zip(genres, counts)],
                  title="Genre (Count)",
                  loc="center left",
                  bbox_to_anchor=(1, 0, 0.5, 1))

        plt.tight_layout()

        plt.savefig(output_path, dpi=DPI, bbox_inches='tight')
    finally:
        # Release the figure even if drawing or saving raised.
        plt.close(fig)

    return str(output_path)

Chart generator

class ChartGenerator:
    """Render all charts for a movie data set into one output directory."""

    def __init__(self, output_dir: str = "analysis/output"):
        """Store the output directory and create it if it does not exist.

        Args:
            output_dir: Directory for the generated PNG files. Missing parent
                directories are created as well.
        """
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)

    def generate_all_charts(self, movies) -> Dict[str, str]:
        """Generate all charts and return file paths.

        Args:
            movies: Iterable of dict-like movie records.

        Returns:
            A mapping from chart name to the value returned by the
            corresponding ``create_*_chart`` function.

        Note:
            ``create_rating_distribution_chart`` is not defined in this
            section; it must be provided elsewhere in the module.
        """
        # Every chart function iterates the records itself, so a one-shot
        # iterable (e.g. a generator) would be exhausted after the first chart.
        movies = list(movies)
        # The chart functions declare ``output_dir`` as ``str``.
        target_dir = str(self.output_dir)

        return {
            'year_distribution': create_year_distribution_chart(movies, target_dir),
            'top_directors': create_top_directors_chart(movies, target_dir),
            'genre_distribution': create_genre_distribution_chart(movies, target_dir),
            'rating_distribution': create_rating_distribution_chart(movies, target_dir),
        }

What I learned

  • High DPI (300) makes charts look professional
  • Chinese fonts need special handling
  • Tight layout prevents label cutoff
  • Color gradients make charts more readable
  • Saving as PNG is more portable than PDF