Reddit Fetch: Finally Built a Quality Comment Corpus
I needed high-quality Reddit comments for LLM fine-tuning. PRAW (Python Reddit API Wrapper) seemed perfect, but the rate limits and data quality issues made it harder than expected. Here's what actually worked.
Why Reddit for LLM Training?
Reddit is one of the best sources for conversational AI training data:
- Natural conversations: Real discussions, not synthetic data
- Diverse topics: From programming to cooking to philosophy
- Voting signals: Upvotes/downvotes indicate quality
- Context preserved: Thread structure and parent-child relationships
- Large corpus: Billions of comments across hundreds of thousands of active subreddits
But raw Reddit data needs significant cleaning before it's useful for training.
Problem
When scraping comments from large subreddits, PRAW would hit rate limits after a few hundred requests. The built-in rate limiting wasn't aggressive enough.
Error: prawcore.exceptions.ResponseException: 429 Too Many Requests
What I Tried
Attempt 1: Increased num_requests in PRAW config - Still hit limits
Attempt 2: Added manual time.sleep() between calls - Helped but wasteful
Attempt 3: Used multiple accounts with rotation - Worked but complex to manage
Actual Fix
Configure PRAW's rate limits more conservatively and add exponential backoff:
import praw
import time
from typing import List
import logging

# Configure root logging once at import so the progress messages emitted by
# the fetch helpers below (every 100 comments) are actually visible.
logging.basicConfig(level=logging.INFO)
# Module-level logger shared by all functions in this script.
logger = logging.getLogger(__name__)
def create_reddit_client(
    client_id: str,
    client_secret: str,
    user_agent: str
) -> praw.Reddit:
    """
    Create a PRAW Reddit client configured for conservative rate limiting.

    Args:
        client_id: Reddit API client ID (from https://www.reddit.com/prefs/apps)
        client_secret: Reddit API client secret
        user_agent: Descriptive user agent; Reddit rejects generic ones

    Returns:
        Configured praw.Reddit instance (read-only use in this script)
    """
    reddit = praw.Reddit(
        client_id=client_id,
        client_secret=client_secret,
        user_agent=user_agent,
        # ratelimit_seconds: how long PRAW is willing to wait out a
        # rate-limit response before raising, per the PRAW config docs.
        ratelimit_seconds=600,  # 10 minutes between bursts
        timeout=30,
        # NOTE(review): `num_requests` is not a documented PRAW configuration
        # option -- PRAW appears to accept unknown settings silently, so this
        # may have no throttling effect at all. Verify against the PRAW
        # configuration docs before relying on it.
        num_requests=40
    )
    return reddit
async def fetch_comments_safe(
    subreddit_name: str,
    limit: int = 1000,
    min_score: int = 10
) -> List[dict]:
    """
    Fetch comments from a subreddit's recent-comment stream with aggressive
    rate limiting.

    Args:
        subreddit_name: Name of subreddit (without r/)
        limit: Maximum comments to fetch
        min_score: Minimum upvote score threshold

    Returns:
        List of comment dictionaries (id, body, score, author, created_utc,
        subreddit, parent_id, link_id). On a rate-limit or API error the
        partial list collected so far is returned.
    """
    # Local imports: asyncio for non-blocking sleeps; prawcore ships with
    # praw and defines the exception caught below. (Bug fix: the original
    # referenced `prawcore` without importing it anywhere, so the except
    # clause itself raised NameError.)
    import asyncio
    import prawcore

    # NOTE(review): hard-coded placeholder credentials -- load from
    # environment variables or a config file in real use.
    reddit = create_reddit_client(
        client_id="YOUR_CLIENT_ID",
        client_secret="YOUR_CLIENT_SECRET",
        user_agent="LLM_Training_Bot/1.0"
    )
    comments_data = []
    try:
        subreddit = reddit.subreddit(subreddit_name)
        # Iterate the subreddit's newest comments, paginated by PRAW.
        for comment in subreddit.comments(limit=limit):
            # Filter out low-scoring comments early.
            if comment.score < min_score:
                continue
            comments_data.append({
                'id': comment.id,
                'body': comment.body,
                'score': comment.score,
                # Deleted accounts come back as author=None.
                'author': str(comment.author) if comment.author else '[deleted]',
                'created_utc': comment.created_utc,
                'subreddit': subreddit_name,
                'parent_id': comment.parent_id,
                'link_id': comment.link_id
            })
            if len(comments_data) % 100 == 0:
                logger.info(f"Fetched {len(comments_data)} comments...")
            # Bug fix: time.sleep() blocks the whole event loop inside a
            # coroutine; asyncio.sleep() pauses only this task.
            await asyncio.sleep(2)
        logger.info(f"Total comments fetched: {len(comments_data)}")
        return comments_data
    except prawcore.exceptions.ResponseException as e:
        # Bug fix: ResponseException carries the HTTP status on
        # e.response.status_code (it wraps a requests Response); the original
        # `e.status` attribute does not exist.
        if e.response.status_code == 429:
            logger.error("Rate limited! Waiting 10 minutes...")
            await asyncio.sleep(600)
        else:
            logger.error(f"API error: {e}")
    # Return whatever was collected before the error.
    return comments_data
# Usage
if __name__ == '__main__':
    import asyncio

    # fetch_comments_safe is a coroutine, so it must be driven by an event
    # loop; asyncio.run() creates one, runs it to completion, and tears down.
    fetched = asyncio.run(
        fetch_comments_safe(subreddit_name='Python', limit=1000, min_score=10)
    )
    print(f"Fetched {len(fetched)} comments")
Problem
A significant portion of comments were [deleted] or [removed]. This polluted the corpus and wasted processing time.
What I Tried
Attempt 1: Filtered out [deleted] comments - Still slow, had to fetch them first
Attempt 2: Increased min_score threshold - Helped but missed good low-score comments
Attempt 3: Only scraped from "hot" threads - Better but biased
Actual Fix
Filter aggressively and focus on high-quality threads:
import re
from typing import List, Dict
import praw
from typing import Optional


def clean_comment_body(body: str) -> Optional[str]:
    """
    Clean a raw Reddit comment body for LLM training.

    Args:
        body: Raw comment text (may be None or empty)

    Returns:
        Cleaned single-line text, or None when the comment is deleted/removed,
        empty, or shorter than 20 characters after cleaning. (Bug fix: the
        return annotation previously claimed `str` although None is returned
        on every rejection path.)
    """
    if not body or body in ('[deleted]', '[removed]'):
        return None
    # Strip URLs -- links carry no conversational training signal.
    body = re.sub(r'http\S+', '', body)
    # Strip Reddit markup at line starts: quote markers ('>') and list
    # bullets ('-').
    body = re.sub(r'^\s*>\s*', '', body, flags=re.MULTILINE)
    body = re.sub(r'^\s*-\s*', '', body, flags=re.MULTILINE)
    # Collapse every whitespace run (including newlines) to a single space.
    body = ' '.join(body.split())
    # Very short comments are rarely useful training data.
    if len(body) < 20:
        return None
    return body.strip()
def fetch_quality_comments(
    reddit: praw.Reddit,
    subreddit_name: str,
    max_comments: int = 1000
) -> List[Dict]:
    """
    Fetch only high-quality comments from a subreddit.

    Focus on:
    - High-scoring comments (score >= 5, replies >= 3)
    - Active discussions (top posts of the past month)
    - Substantial content (cleaned via clean_comment_body)

    Args:
        reddit: Configured praw.Reddit instance
        subreddit_name: Subreddit name (without r/)
        max_comments: Stop once this many records are collected

    Returns:
        List of comment dictionaries; replies carry 'depth': 1, top-level
        comments 'depth': 0.
    """
    comments_data = []
    subreddit = reddit.subreddit(subreddit_name)
    # Top posts of the past month tend to have healthier comment sections.
    for submission in subreddit.top(time_filter='month', limit=50):
        # limit=0 *removes* unresolved MoreComments placeholders rather than
        # loading them -- cheap, but only already-fetched comments are seen.
        # (The original comment claimed this "loads all comments"; it does not.)
        submission.comments.replace_more(limit=0)
        for comment in submission.comments:
            # Skip deleted/removed comments.
            if not hasattr(comment, 'body') or comment.body in ['[deleted]', '[removed]']:
                continue
            # Skip low-quality comments.
            if comment.score < 5:
                continue
            cleaned_body = clean_comment_body(comment.body)
            if not cleaned_body:
                continue
            comment_data = {
                'id': comment.id,
                'body': cleaned_body,
                'score': comment.score,
                'subreddit': subreddit_name,
                'thread_title': submission.title,
                'thread_score': submission.score,
                'created_utc': comment.created_utc,
                # Top-level comments have a 't3_' (submission) parent, so in
                # practice this is always 0 here.
                'depth': 0 if not comment.parent_id.startswith('t1_') else 1
            }
            comments_data.append(comment_data)
            # Also fetch direct replies.
            if len(comment.replies) > 0:
                for reply in comment.replies:
                    if hasattr(reply, 'body'):
                        cleaned_reply = clean_comment_body(reply.body)
                        if cleaned_reply and reply.score >= 3:
                            comments_data.append({
                                'id': reply.id,
                                'body': cleaned_reply,
                                'score': reply.score,
                                'subreddit': subreddit_name,
                                'thread_title': submission.title,
                                'thread_score': submission.score,
                                'created_utc': reply.created_utc,
                                # Bug fix: replies to depth-0 comments sit one
                                # level down, i.e. depth 1 (was wrongly 2,
                                # contradicting the function's own comments).
                                'depth': 1
                            })
                    # Bug fix: enforce the cap inside the reply loop too, so a
                    # long reply chain cannot overshoot max_comments.
                    if len(comments_data) >= max_comments:
                        break
            # Check limit after each top-level comment.
            if len(comments_data) >= max_comments:
                break
        if len(comments_data) >= max_comments:
            break
    logger.info(f"Fetched {len(comments_data)} quality comments")
    return comments_data
Problem
I needed to fetch more than the ~1000 comments that PRAW's standard iterator provides. Old comments beyond the limit were inaccessible.
What I Tried
Attempt 1: Increased limit parameter - Still capped at ~1000
Attempt 2: Used the Pushshift API (now shut down) - Access was revoked in 2023
Attempt 3: Scrolled through "new" - Worked but very slow
Actual Fix
Use multiple strategies: focus on recent content and use specific time filters:
def fetch_comments_by_timeframe(
    reddit: praw.Reddit,
    subreddit_name: str,
    days_back: int = 7
) -> List[Dict]:
    """
    Collect cleaned comments no older than *days_back* days from a subreddit.

    Args:
        reddit: Configured praw.Reddit instance
        subreddit_name: Subreddit name (without r/)
        days_back: How many days back to include

    Returns:
        List of comment dictionaries (id, body, score, created_utc)
    """
    import time as time_module

    cutoff = time_module.time() - days_back * 86400
    collected: List[Dict] = []
    # /new is ordered newest-first, so the first submission older than the
    # cutoff means every later one is older too -- safe to stop there.
    for submission in reddit.subreddit(subreddit_name).new(limit=None):
        if submission.created_utc < cutoff:
            break  # Past our timeframe
        # Drop unresolved MoreComments placeholders instead of loading them.
        submission.comments.replace_more(limit=0)
        for comment in submission.comments:
            # Skip deleted comments and anything older than the window.
            if not hasattr(comment, 'body'):
                continue
            if comment.created_utc < cutoff:
                continue
            text = clean_comment_body(comment.body)
            if not text or comment.score < 1:
                continue
            collected.append({
                'id': comment.id,
                'body': text,
                'score': comment.score,
                'created_utc': comment.created_utc
            })
    return collected
# Alternative: Use multiple subreddits for diversity
def fetch_diverse_corpus(
    reddit: praw.Reddit,
    subreddits: List[str],
    comments_per_sub: int = 500
) -> List[Dict]:
    """
    Build one combined corpus from several subreddits.

    Args:
        reddit: Configured praw.Reddit instance
        subreddits: Subreddit names to pull from
        comments_per_sub: Quality-comment budget per subreddit

    Returns:
        Combined comment corpus; subreddits that error out are skipped.
    """
    corpus: List[Dict] = []
    for name in subreddits:
        logger.info(f"Fetching from r/{name}...")
        try:
            batch = fetch_quality_comments(
                reddit,
                name,
                max_comments=comments_per_sub
            )
        except Exception as e:
            # One bad subreddit (private, banned, transient error) should not
            # kill the whole run -- log it and move on.
            logger.error(f"Error fetching from r/{name}: {e}")
            continue
        corpus.extend(batch)
        logger.info(f"Got {len(batch)} from r/{name}")
    logger.info(f"Total corpus size: {len(corpus)} comments")
    return corpus
What I Learned
- Lesson 1: Reddit's API rate limits are strict - use conservative settings.
- Lesson 2: Quality over quantity - filter by score and clean aggressively.
- Lesson 3: Thread context matters - fetch from submission comments, not generic stream.
- Lesson 4: Diverse subreddits = better corpus - don't scrape from just one.
- Overall: With proper filtering, Reddit data is excellent for LLM fine-tuning on conversational tasks.
Preparing Data for LLM Training
JSONL Format for Training
import json
from typing import List, Dict
from datetime import datetime
def save_for_training(
    comments: List[Dict],
    output_file: str,
    format_type: str = 'jsonl'
):
    """
    Write the comment corpus to disk in a training-ready format.

    Args:
        comments: Comment dictionaries; each needs 'body', 'score',
            'subreddit', and 'created_utc'
        output_file: Destination file path
        format_type: 'jsonl' (one sample per line) or 'json' (single array)
    """
    with open(output_file, 'w', encoding='utf-8') as fh:
        if format_type != 'jsonl':
            # Single pretty-printed JSON array of the raw comment dicts.
            json.dump(comments, fh, ensure_ascii=False, indent=2)
        else:
            # One JSON object per line: just the text plus a small metadata
            # record -- the shape most training pipelines expect.
            for item in comments:
                record = {
                    'text': item['body'],
                    'meta': {
                        'score': item['score'],
                        'subreddit': item['subreddit'],
                        'timestamp': item['created_utc']
                    }
                }
                fh.write(json.dumps(record, ensure_ascii=False) + '\n')
    print(f"Saved {len(comments)} comments to {output_file}")
# Usage
if __name__ == '__main__':
    reddit = create_reddit_client(
        client_id="YOUR_ID",
        client_secret="YOUR_SECRET",
        user_agent="LLM_Trainer/1.0"
    )
    # Pull a mixed programming corpus and persist it as JSONL for training.
    corpus = fetch_diverse_corpus(
        reddit,
        ['Python', 'learnprogramming', 'AskProgramming', 'coding'],
        comments_per_sub=500
    )
    save_for_training(corpus, 'reddit_corpus.jsonl', format_type='jsonl')
Dataset Statistics
def analyze_corpus(corpus_file: str):
    """
    Print summary statistics for a JSONL corpus.

    Args:
        corpus_file: Path to a JSONL file whose lines look like
            {'text': ..., 'meta': {'subreddit': ..., 'score': ...}}
    """
    import json
    from collections import Counter

    total_comments = 0
    total_length = 0
    subreddit_counts = Counter()
    score_dist = Counter()
    with open(corpus_file, 'r', encoding='utf-8') as f:
        for line in f:
            # Robustness: tolerate blank lines (e.g. trailing newline) that
            # would otherwise crash json.loads.
            if not line.strip():
                continue
            data = json.loads(line)
            total_comments += 1
            total_length += len(data['text'])
            subreddit_counts[data['meta']['subreddit']] += 1
            # Bucket scores into coarse bands for the distribution print.
            score = data['meta']['score']
            if score < 10:
                score_dist['1-9'] += 1
            elif score < 50:
                score_dist['10-49'] += 1
            elif score < 100:
                score_dist['50-99'] += 1
            else:
                score_dist['100+'] += 1
    # Bug fix: an empty corpus previously raised ZeroDivisionError when
    # computing the average length.
    if total_comments == 0:
        print("Corpus is empty")
        return
    print(f"Total comments: {total_comments}")
    print(f"Avg length: {total_length / total_comments:.1f} chars")
    print(f"\nSubreddits: {dict(subreddit_counts.most_common(10))}")
    print(f"\nScore distribution: {dict(score_dist)}")
# Run analysis on the previously saved corpus.
# NOTE(review): runs at import time and raises FileNotFoundError if
# reddit_corpus.jsonl does not exist -- consider a __main__ guard.
analyze_corpus('reddit_corpus.jsonl')
Production Setup That Works
# reddit_corpus_builder.py - Production corpus builder
import json
import logging
import re
import time
from datetime import datetime, timedelta
from typing import Dict, List, Optional

import praw
# Timestamped log lines so long corpus-building runs can be audited.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
# Module-level logger used throughout the corpus builder.
logger = logging.getLogger(__name__)
class RedditCorpusBuilder:
    """
    Build a high-quality Reddit comment corpus for LLM training.

    Wraps a conservatively rate-limited PRAW client and provides helpers to
    fetch, clean, and persist comments as a JSONL training file.
    """

    def __init__(
        self,
        client_id: str,
        client_secret: str,
        user_agent: str
    ):
        # ratelimit_seconds=600: let PRAW wait out rate-limit responses for
        # up to 10 minutes instead of raising immediately.
        # NOTE(review): `num_requests` is not a documented PRAW setting --
        # verify it actually throttles anything.
        self.reddit = praw.Reddit(
            client_id=client_id,
            client_secret=client_secret,
            user_agent=user_agent,
            ratelimit_seconds=600,
            num_requests=40
        )

    def clean_text(self, text: str) -> Optional[str]:
        """
        Normalize a raw comment body for training.

        Returns the cleaned text, or None when the comment is deleted,
        removed, empty, or under 20 characters after cleaning. (Bug fix: the
        annotation previously claimed `str` despite the None returns.)
        """
        if not text or text in ('[deleted]', '[removed]'):
            return None
        # Remove URLs -- no conversational signal.
        text = re.sub(r'http\S+', '', text)
        # Remove Reddit quote ('>') and list ('-') markers at line starts.
        text = re.sub(r'^\s*>\s*', '', text, flags=re.MULTILINE)
        text = re.sub(r'^\s*-\s*', '', text, flags=re.MULTILINE)
        # Collapse all whitespace runs (including newlines) to single spaces.
        text = ' '.join(text.split())
        # Enforce minimum length.
        if len(text) < 20:
            return None
        return text.strip()

    def fetch_from_subreddit(
        self,
        subreddit_name: str,
        max_comments: int = 1000,
        days_back: int = 30,
        min_score: int = 5
    ) -> List[Dict]:
        """
        Fetch cleaned top-level comments from a single subreddit.

        Args:
            subreddit_name: Subreddit name (without r/)
            max_comments: Stop once this many comments are collected
            days_back: Ignore submissions older than this many days
            min_score: Minimum comment score to keep

        Returns:
            List of comment dicts; empty list on any fetch error.
        """
        logger.info(f"Fetching from r/{subreddit_name}...")
        comments = []
        # Bug fix: the old cutoff used datetime.utcnow().timestamp(), which
        # interprets a *naive* UTC datetime in local time and skews the cutoff
        # by the machine's UTC offset. time.time() is already a UTC epoch
        # timestamp, directly comparable to created_utc.
        cutoff_time = time.time() - days_back * 86400
        try:
            subreddit = self.reddit.subreddit(subreddit_name)
            # Top posts of the month tend to have the best comment quality.
            for submission in subreddit.top(time_filter='month', limit=100):
                if submission.created_utc < cutoff_time:
                    continue
                # Drop unresolved MoreComments placeholders without loading.
                submission.comments.replace_more(limit=0)
                for comment in submission.comments:
                    if not hasattr(comment, 'body'):
                        continue
                    if comment.score < min_score:
                        continue
                    cleaned = self.clean_text(comment.body)
                    if not cleaned:
                        continue
                    comments.append({
                        'text': cleaned,
                        'score': comment.score,
                        'subreddit': subreddit_name,
                        'created_utc': comment.created_utc,
                        'thread_title': submission.title
                    })
                    if len(comments) >= max_comments:
                        break
                # Bug fix: also stop the submission loop once the cap is hit;
                # previously only the inner loop broke, so later submissions
                # kept being fetched and appended past max_comments.
                if len(comments) >= max_comments:
                    break
            logger.info(f"Got {len(comments)} comments from r/{subreddit_name}")
            return comments
        except Exception as e:
            # Best-effort per subreddit: log and return nothing rather than
            # aborting a multi-subreddit build.
            logger.error(f"Error fetching r/{subreddit_name}: {e}")
            return []

    def build_corpus(
        self,
        subreddits: List[str],
        output_file: str = 'reddit_corpus.jsonl',
        comments_per_sub: int = 500
    ):
        """
        Fetch from each subreddit and write the combined corpus as JSONL.

        Args:
            subreddits: Subreddit names to pull from
            output_file: Destination JSONL path
            comments_per_sub: Comment budget per subreddit
        """
        all_comments = []
        for sub in subreddits:
            comments = self.fetch_from_subreddit(
                sub,
                max_comments=comments_per_sub
            )
            all_comments.extend(comments)
            # Be polite to the API between subreddits.
            time.sleep(10)
        # Save to JSONL: one {'text', 'meta'} record per line.
        logger.info(f"Saving {len(all_comments)} comments to {output_file}")
        with open(output_file, 'w', encoding='utf-8') as f:
            for comment in all_comments:
                sample = {
                    'text': comment['text'],
                    'meta': {
                        'score': comment['score'],
                        'subreddit': comment['subreddit'],
                        'timestamp': comment['created_utc']
                    }
                }
                f.write(json.dumps(sample, ensure_ascii=False) + '\n')
        logger.info(f"Corpus saved successfully")
# Usage
if __name__ == '__main__':
    builder = RedditCorpusBuilder(
        client_id="YOUR_CLIENT_ID",
        client_secret="YOUR_CLIENT_SECRET",
        user_agent="LLM_Corpus_Builder/1.0"
    )
    # Programming-focused subreddits give a coding-conversation corpus.
    builder.build_corpus(
        subreddits=[
            'Python',
            'learnprogramming',
            'AskProgramming',
            'coding',
            'programmerhumor'
        ],
        output_file='programming_reddit_corpus.jsonl',
        comments_per_sub=500
    )
Monitoring & Debugging
Reddit API Console
Create Reddit app at: https://www.reddit.com/prefs/apps
Common Issues
- 429 errors: Reduce requests, add longer delays
- Redirect errors: Check user agent format (must be descriptive)
- Empty results: Subreddit may be private or banned
- Many deleted comments: Use score threshold and focus on active threads
Quality Metrics
def check_corpus_quality(corpus_file: str):
    """
    Spot-check a JSONL corpus: print the sample count and the first five
    records' text, score, and subreddit.

    Args:
        corpus_file: Path to a JSONL file of {'text': ..., 'meta': {...}} lines
    """
    import json

    total = 0
    preview = []  # first five parsed records, printed after the total
    # Improvement: stream the file instead of f.readlines() so a large corpus
    # does not need to fit in memory just to be counted; also skip blank
    # lines, which would crash json.loads.
    with open(corpus_file, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                continue
            total += 1
            if len(preview) < 5:
                preview.append(json.loads(line))
    print(f"Total samples: {total}")
    # Sample check
    for i, data in enumerate(preview):
        print(f"\nSample {i+1}:")
        print(f"  Text: {data['text'][:100]}...")
        print(f"  Score: {data['meta']['score']}")
        print(f"  Subreddit: {data['meta']['subreddit']}")
Related Resources
⚠️ Legal & Ethical Note
Reddit data is user-generated content. Respect Reddit's Terms of Service and API usage policies. Don't redistribute raw comment data without proper attribution. This article is for educational purposes.