Beautiful Soup 4: Parsing Tables That Actually Works
I had to extract data from 200+ HTML tables on the web. Some had missing closing tags, others used colspan/rowspan weirdly, and a few were nested tables. Here's the BS4 setup that handled them all.
Why Table Parsing Is Hard
HTML tables on the real web are messy. The documentation examples always show clean, well-formed tables. Real-world tables have:
- Missing </td> and </tr> closing tags
- colspan and rowspan attributes that shift alignment
- Nested tables (tables inside <td> elements)
- Tables inside <div> or other containers
- Headers not using <th> tags
The naive soup.find_all('table') approach breaks on all of these.
Problem
When parsing tables with colspan or rowspan, the data wouldn't align with the correct columns. A cell spanning 3 columns would cause my parser to skip columns entirely.
Result: Column headers didn't match data rows.
What I Tried
Attempt 1: Ignored colspan/rowspan and parsed row by row - Data ended up in wrong columns
Attempt 2: Used pandas.read_html() - Failed on malformed HTML with missing tags
Attempt 3: Manually tracked cell positions with a counter - Complex and error-prone
Actual Fix
The solution is to build a grid representation of the table, tracking which cells are occupied by spanning cells. Here's a robust implementation:
from bs4 import BeautifulSoup
from typing import List, Dict, Any
import requests
def parse_table_with_span(table) -> List[Dict[str, str]]:
    """
    Parse an HTML table into row dictionaries, respecting colspan and rowspan.

    A grid of occupied positions is maintained so that cells spanning
    multiple rows or columns keep later cells aligned with the correct
    columns.

    Args:
        table: BeautifulSoup <table> element.

    Returns:
        List of dictionaries, one per data row, keyed by header text.
        Columns covered by a spanned header (beyond its first column) or
        not covered by any header are keyed ``Column_<n>``.
    """
    rows = table.find_all('tr')
    if not rows:
        return []

    # First pass: the widest row (accounting for colspan) fixes the grid width.
    max_cols = 0
    for row in rows:
        cells = row.find_all(['td', 'th'])
        max_cols = max(max_cols, sum(int(c.get('colspan', 1)) for c in cells))

    # grid[r][c] is True when that position is covered by some cell,
    # either directly or via colspan/rowspan from an earlier cell.
    grid = [[False] * max_cols for _ in range(len(rows))]

    result = []  # one dict per <tr>, keyed 'col_<start column>'
    for row_idx, row in enumerate(rows):
        row_data = {}
        col_idx = 0
        for cell in row.find_all(['td', 'th']):
            # Skip positions already claimed by a rowspan from above.
            while col_idx < max_cols and grid[row_idx][col_idx]:
                col_idx += 1
            if col_idx >= max_cols:
                break

            colspan = int(cell.get('colspan', 1))
            rowspan = int(cell.get('rowspan', 1))
            row_data[f'col_{col_idx}'] = cell.get_text(strip=True)

            # Claim every grid position this cell covers.
            for r in range(row_idx, row_idx + rowspan):
                for c in range(col_idx, col_idx + colspan):
                    if r < len(grid) and c < max_cols:
                        grid[r][c] = True
            col_idx += colspan
        result.append(row_data)

    # Headers come from the first row that contains <th> tags.
    # BUG FIX: the original re-walked the header cells against the (by now
    # fully marked) occupancy grid, so its column tracking always skipped to
    # the end. Instead, reuse the column positions already computed above:
    # the header text is attached to the first column its cell covers.
    headers = {}
    header_idx = None
    for idx, row in enumerate(rows):
        if row.find('th'):
            header_idx = idx
            headers = {
                int(key.split('_')[1]): text
                for key, text in result[idx].items()
            }
            break

    # Map numbered columns to header names.
    # BUG FIX: the original always skipped result[0], which dropped a real
    # data row whenever the header row was not first (or absent entirely).
    formatted_result = []
    for idx, row_data in enumerate(result):
        if idx == header_idx:
            continue  # don't emit the header row as data
        formatted_row = {}
        for col_key, value in row_data.items():
            col_num = int(col_key.split('_')[1])
            formatted_row[headers.get(col_num, f'Column_{col_num}')] = value
        formatted_result.append(formatted_row)
    return formatted_result
# Usage
# NOTE: the HTML markup below was reconstructed — the original snippet had
# lost its tags, so soup.find('table') returned None and the call crashed.
html = """
<table>
  <tr><th colspan="2">Name &amp; Type</th><th>Value</th></tr>
  <tr><td>Item 1</td><td>Type A</td><td>100</td></tr>
</table>
"""
soup = BeautifulSoup(html, 'html.parser')
table = soup.find('table')
data = parse_table_with_span(table)
print(data)
# Output: [{'Name & Type': 'Item 1', 'Column_1': 'Type A', 'Value': '100'}]
Problem
Many websites have malformed HTML with missing </td> or </tr> tags. BeautifulSoup would parse these incorrectly, merging cells or rows together.
What I Tried
Attempt 1: Used default 'html.parser' - Failed on many malformed tables
Attempt 2: Switched to 'lxml' parser - Better but still had issues with badly nested tags
Attempt 3: Pre-processed HTML with regex - Too brittle, broke valid HTML
Actual Fix
Use 'html5lib' parser for malformed HTML. It's more forgiving and handles missing tags better than the default parsers.
# Install html5lib first
# pip install html5lib
from bs4 import BeautifulSoup
import requests
def parse_malformed_table(url: str) -> List[Dict]:
    """
    Fetch a page and parse every table it contains, tolerating malformed HTML.

    Args:
        url: Page to fetch.

    Returns:
        One entry per successfully parsed table, each a list of row dicts.

    Raises:
        requests.RequestException: on network failure or non-2xx status.
    """
    response = requests.get(url, timeout=30)  # don't hang forever
    response.raise_for_status()
    response.encoding = response.apparent_encoding
    # BUG FIX: the original set response.encoding but then passed
    # response.content (raw bytes), so the detected encoding was never
    # applied. response.text decodes with the encoding set above.
    # html5lib is the most forgiving parser for missing/mismatched tags.
    soup = BeautifulSoup(response.text, 'html5lib')

    results = []
    for table in soup.find_all('table'):
        # Parse with our span-aware function (defined above).
        data = parse_table_with_span(table)
        if data:
            results.append(data)
    return results
# Alternative: Use lxml with fallback
def parse_with_fallback(content: bytes):
    """
    Parse *content* with the first parser that yields at least one table.

    Preference order: html5lib (most forgiving), then lxml, then the
    built-in html.parser. Returns the table elements found, or an empty
    list when every parser fails or none finds a table.
    """
    for candidate in ('html5lib', 'lxml', 'html.parser'):
        try:
            document = BeautifulSoup(content, candidate)
            found = document.find_all('table')
            if found:
                return found
        except Exception as exc:
            # A parser may be uninstalled or choke on the input; move on.
            print(f"Parser {candidate} failed: {exc}")
            continue
    return []
Problem
When a table cell contains another table inside it, find_all('td') would return cells from both the parent and nested tables. This caused data corruption.
What I Tried
Attempt 1: Used find_all('td', recursive=False) - Worked but missed nested data I needed
Attempt 2: Removed nested tables before parsing - Lost data
Attempt 3: Traversed DOM tree manually - Complex and slow
Actual Fix
The solution is to work with direct children only (recursive=False) and explicitly handle nested tables as separate entities:
from bs4 import BeautifulSoup
def parse_table_with_nesting(table) -> Dict:
    """
    Parse a table, handling nested tables as separate entities.

    Args:
        table: BeautifulSoup <table> element.

    Returns:
        Dict with 'main_table' (list of rows, each a list of cell dicts)
        and 'nested_tables' (recursively parsed tables found inside cells).
    """
    result = {
        'main_table': [],
        'nested_tables': []
    }

    # BUG FIX: rows are usually wrapped in <tbody> (html5lib inserts one
    # even when the source omits it), so find_all('tr', recursive=False)
    # directly on <table> found zero rows. Collect rows from the table
    # itself AND from its direct row-group children.
    rows = table.find_all('tr', recursive=False)
    for section in table.find_all(['thead', 'tbody', 'tfoot'],
                                  recursive=False):
        rows.extend(section.find_all('tr', recursive=False))

    for row in rows:
        # Direct children only, so nested-table cells aren't double-counted.
        cells = row.find_all(['td', 'th'], recursive=False)
        row_data = []
        for cell in cells:
            nested_table = cell.find('table')
            if nested_table:
                # Parse the nested table separately and record where its
                # data landed so callers can cross-reference it.
                nested_data = parse_table_with_nesting(nested_table)
                result['nested_tables'].append(nested_data)
                row_data.append({
                    'has_nested_table': True,
                    # NOTE: get_text also includes the nested table's text.
                    'text': cell.get_text(strip=True),
                    'nested_table_index': len(result['nested_tables']) - 1
                })
            else:
                # Regular cell
                row_data.append({
                    'has_nested_table': False,
                    'text': cell.get_text(strip=True)
                })
        result['main_table'].append(row_data)
    return result
# Alternative: Extract nested tables first
def extract_nested_tables(soup):
    """
    Replace every nested table with a placeholder <div> and return the
    parsed contents of the tables that were removed.

    Returns:
        Dict mapping placeholder id -> parsed rows of that nested table.
    """
    next_id = 0
    extracted = {}

    # find_all returns a snapshot list, so mutating the tree while
    # iterating is safe here.
    for candidate in soup.find_all('table'):
        # A table counts as "nested" when some ancestor is itself a table.
        if candidate.find_parent('table') is None:
            continue

        # Swap the nested table for a tagged placeholder.
        marker = soup.new_tag('div')
        marker['data-nested-table-id'] = next_id
        marker.string = f'[Nested Table {next_id}]'
        candidate.replace_with(marker)

        # The detached tag keeps its children, so it can still be parsed.
        extracted[next_id] = parse_table_with_span(candidate)
        next_id += 1
    return extracted
What I Learned
- Lesson 1: Use 'html5lib' parser for malformed HTML - it's much more forgiving than 'html.parser' or 'lxml'.
- Lesson 2: Always account for colspan and rowspan - they're common in real-world tables.
- Lesson 3: Nested tables are separate entities - don't try to flatten them into the parent.
- Lesson 4: Use recursive=False when you only want direct children.
- Overall: A grid-based approach that tracks occupied cells handles spanning correctly.
Production Setup That Works
Here's my complete table parser that handles all the edge cases:
# table_parser.py - Production table parser
from bs4 import BeautifulSoup
from typing import List, Dict, Optional, Union
import requests
import logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class TableParser:
    """
    Robust HTML table parser that handles:
    - Malformed HTML (missing tags)
    - colspan and rowspan
    - Nested tables
    - Multiple tables per page
    """

    def __init__(self, parser: str = 'html5lib'):
        """
        Args:
            parser: BeautifulSoup parser to use (html5lib, lxml, html.parser)
        """
        self.parser = parser

    def parse_from_url(self, url: str, timeout: int = 30) -> List[List[Dict]]:
        """
        Parse all tables from a URL.

        Args:
            url: URL to parse
            timeout: Request timeout in seconds

        Returns:
            List of tables, each table a list of row dicts
            (empty list on any fetch error).
        """
        try:
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()
            response.encoding = response.apparent_encoding
            # BUG FIX: pass the decoded text so apparent_encoding actually
            # applies; passing response.content (bytes) bypassed the
            # encoding set on the line above.
            return self.parse_from_html(response.text)
        except Exception as e:
            logger.error(f"Error fetching {url}: {e}")
            return []

    def parse_from_html(self, html_content: Union[str, bytes]) -> List[List[Dict]]:
        """
        Parse all tables from HTML content.

        Args:
            html_content: Raw HTML content

        Returns:
            List of tables, each table a list of row dicts
        """
        soup = BeautifulSoup(html_content, self.parser)
        tables = soup.find_all('table')
        logger.info(f"Found {len(tables)} tables")

        results = []
        for idx, table in enumerate(tables):
            try:
                table_data = self._parse_single_table(table)
                if table_data:
                    results.append(table_data)
                    logger.info(f"Parsed table {idx + 1}: {len(table_data)} rows")
            except Exception as e:
                # Keep going: one broken table shouldn't abort the page.
                logger.warning(f"Failed to parse table {idx + 1}: {e}")
                continue
        return results

    def _parse_single_table(self, table) -> List[Dict]:
        """
        Parse a single table with colspan/rowspan support.

        Args:
            table: BeautifulSoup table element

        Returns:
            List of row dictionaries (header rows excluded)
        """
        rows = self._direct_rows(table)
        if not rows:
            return []

        max_cols = self._calculate_max_columns(rows)
        # grid[r][c] is True when that position is covered by a cell span.
        grid = [[False for _ in range(max_cols)] for _ in range(len(rows))]

        headers = self._extract_headers(rows, grid, max_cols)

        data_rows = []
        for row_idx, row in enumerate(rows):
            if self._is_header_row(row):
                continue  # header rows were consumed by _extract_headers
            row_data = self._parse_row(row, row_idx, grid, max_cols, headers)
            if row_data:
                data_rows.append(row_data)
        return data_rows

    @staticmethod
    def _direct_rows(table) -> list:
        """
        Collect this table's own rows — including rows wrapped in
        <thead>/<tbody>/<tfoot> — while excluding rows of nested tables.

        BUG FIX: html5lib (this class's default parser) always inserts a
        <tbody>, so the original table.find_all('tr', recursive=False)
        found zero rows for most documents.
        """
        rows = table.find_all('tr', recursive=False)
        for section in table.find_all(['thead', 'tbody', 'tfoot'],
                                      recursive=False):
            rows.extend(section.find_all('tr', recursive=False))
        return rows

    def _calculate_max_columns(self, rows: list) -> int:
        """Calculate the maximum number of columns (colspan-aware)."""
        max_cols = 0
        for row in rows:
            cells = row.find_all(['td', 'th'], recursive=False)
            col_count = sum(int(cell.get('colspan', 1)) for cell in cells)
            max_cols = max(max_cols, col_count)
        return max_cols

    def _is_header_row(self, row) -> bool:
        """
        Check whether the row is a header row (has a direct <th> child).

        recursive=False so a <th> inside a nested table does not turn the
        outer row into a header row.
        """
        return row.find('th', recursive=False) is not None

    def _extract_headers(self, rows: list, grid: list, max_cols: int) -> List[str]:
        """
        Extract column headers from the first header row, marking the grid
        positions covered by header cells so rowspanned header cells are
        skipped when data rows are parsed.
        """
        headers: List[str] = []
        for row_idx, row in enumerate(rows):
            if not self._is_header_row(row):
                continue
            col_idx = 0
            for cell in row.find_all(['th', 'td'], recursive=False):
                # Skip positions claimed by an earlier span.
                # BUG FIX: consult the header row's own grid row, not row 0.
                while col_idx < max_cols and grid[row_idx][col_idx]:
                    col_idx += 1
                colspan = int(cell.get('colspan', 1))
                rowspan = int(cell.get('rowspan', 1))
                header_text = cell.get_text(strip=True)
                # One header entry per spanned column; suffix the extras so
                # a spanned header doesn't produce duplicate dict keys that
                # silently overwrite cell data (BUG FIX).
                for offset in range(colspan):
                    headers.append(header_text if offset == 0
                                   else f"{header_text}_{offset}")
                # BUG FIX: mark occupancy relative to the header row's
                # index (the original always started at grid row 0).
                for r in range(row_idx, row_idx + rowspan):
                    for c in range(col_idx, col_idx + colspan):
                        if r < len(grid) and c < max_cols:
                            grid[r][c] = True
                col_idx += colspan
            break  # only the first header row defines the column names
        return headers

    def _parse_row(self, row, row_idx: int, grid: list, max_cols: int,
                   headers: List[str]) -> Optional[Dict]:
        """Parse a single data row into a header-keyed dict."""
        cells = row.find_all(['td', 'th'], recursive=False)
        row_data = {}
        col_idx = 0
        for cell in cells:
            # Advance past positions claimed by rowspans from earlier rows.
            while col_idx < max_cols and grid[row_idx][col_idx]:
                col_idx += 1
            if col_idx >= max_cols:
                break

            cell_text = cell.get_text(strip=True)
            # Nested tables are not flattened into the parent row.
            nested_table = cell.find('table')
            if nested_table:
                cell_text = "[Nested table with data]"

            # Store under the covering header, or a positional fallback.
            if col_idx < len(headers):
                column_name = headers[col_idx]
            else:
                column_name = f"Column_{col_idx}"
            row_data[column_name] = cell_text

            # Claim every grid position this cell covers.
            colspan = int(cell.get('colspan', 1))
            rowspan = int(cell.get('rowspan', 1))
            for r in range(row_idx, row_idx + rowspan):
                for c in range(col_idx, col_idx + colspan):
                    if r < len(grid) and c < max_cols:
                        grid[r][c] = True
            col_idx += colspan
        return row_data if row_data else None

    def to_csv(self, tables: List[List[Dict]], filename: str):
        """
        Export parsed tables to a single CSV file, separated by markers.

        Args:
            tables: Parsed tables
            filename: Output filename
        """
        import csv
        with open(filename, 'w', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            for table_idx, table in enumerate(tables):
                if not table:
                    continue
                # Separator between consecutive tables.
                if table_idx > 0:
                    writer.writerow([])
                    writer.writerow([f'=== Table {table_idx + 1} ==='])
                    writer.writerow([])
                # BUG FIX: rows can be ragged; use the union of all row
                # keys (first-seen order), not only the first row's keys.
                headers: List[str] = []
                for row in table:
                    for key in row:
                        if key not in headers:
                            headers.append(key)
                writer.writerow(headers)
                for row in table:
                    writer.writerow([row.get(h, '') for h in headers])
        # BUG FIX: the original logged a hard-coded "(unknown)" here.
        logger.info(f"Exported to {filename}")
# Usage example
if __name__ == "__main__":
    # html5lib tolerates the malformed HTML common on real pages.
    parser = TableParser(parser='html5lib')

    # Parse from URL
    tables = parser.parse_from_url('https://example.com/page-with-tables')

    # Export to CSV
    parser.to_csv(tables, 'output.csv')

    # Or work with data directly
    for table in tables:
        for row in table:
            print(row)
Monitoring & Debugging
Common Issues to Watch For
- Empty tables: Parser found table but extracted no rows - check table structure
- Misaligned columns: colspan/rowspan not handled correctly - use grid-based parsing
- Missing headers: First row has no <th> tags - auto-detect headers
- Duplicate data: Nested tables included in parent - use recursive=False
Debug Helper
def debug_table_structure(table_element):
    """
    Print a table's row/cell/span layout for debugging.

    Useful for diagnosing misaligned columns before running the full parser.

    Args:
        table_element: BeautifulSoup <table> element.
    """
    # BUG FIX: include rows wrapped in <thead>/<tbody>/<tfoot> — html5lib
    # inserts <tbody> automatically, so recursive=False on <table> alone
    # reported "0 rows" for perfectly good tables.
    rows = table_element.find_all('tr', recursive=False)
    for section in table_element.find_all(['thead', 'tbody', 'tfoot'],
                                          recursive=False):
        rows.extend(section.find_all('tr', recursive=False))

    print(f"Table has {len(rows)} rows")
    for row_idx, row in enumerate(rows):
        cells = row.find_all(['td', 'th'], recursive=False)
        print(f"  Row {row_idx}: {len(cells)} cells")
        for cell_idx, cell in enumerate(cells):
            colspan = cell.get('colspan', '1')
            rowspan = cell.get('rowspan', '1')
            text = cell.get_text(strip=True)[:30]  # truncate long cells
            print(f"    Cell {cell_idx}: colspan={colspan}, rowspan={rowspan}, text='{text}'")
Related Resources