Facebook Page Scraper: Actually Got It Working Without Login

I needed to scrape public posts from 500 Facebook business pages for market research. Facebook kept blocking me after ~100 pages. Here's the cookie rotation setup that finally worked.

Why This Is Hard

Facebook has one of the most aggressive anti-scraping systems out there. Even for public business page content, they'll rate-limit you, redirect you to a login wall, or temporarily block the account behind your cookies.

Most scrapers on GitHub haven't been updated to handle Facebook's 2024-2025 detection improvements.

Problem

When scraping Facebook pages that were previously public, I started getting "Login unsuccessful" or "A login (cookies) is required to see this page" errors. These were business pages with public posts.

Error: facebook-scraper.exceptions.LoginRequired: You need to login to see this page

What I Tried

Attempt 1: Used facebook-scraper without cookies - Failed immediately with login required
Attempt 2: Added my own cookies using EditThisCookie - Worked for ~100 pages, then blocked
Attempt 3: Used multiple accounts with rotating cookies - Each account got flagged within an hour

Actual Fix

The solution is a cookie rotation system with realistic user agents and request timing. Facebook checks cookie freshness, request patterns, and browser fingerprints.

import requests
from bs4 import BeautifulSoup
import time
import random
from typing import List, Dict
import json

class FacebookScraper:
    """Scrape public Facebook pages via mbasic.facebook.com using a rotating
    pool of exported browser cookies and randomized request headers."""

    def __init__(self, cookies_file: str = 'fb_cookies.json'):
        """
        Initialize with a pool of cookies from different accounts
        Cookies should be exported from a real browser session

        Args:
            cookies_file: JSON file holding a list of cookie sets, each of the
                form {"id": ..., "cookies": [{"name": ..., "value": ...}, ...]}.
        """
        self.cookies_pool = self._load_cookies(cookies_file)
        self.current_cookie_index = 0
        self.session = requests.Session()

        # Rotate user agents to avoid fingerprinting
        self.user_agents = [
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
        ]

    def _load_cookies(self, cookies_file: str) -> List[Dict]:
        """Load the cookie-set pool from a JSON file; empty list if missing."""
        try:
            with open(cookies_file, 'r') as f:
                return json.load(f)
        except FileNotFoundError:
            print(f"Cookies file {cookies_file} not found")
            return []

    def _get_current_cookies(self) -> Dict:
        """Return the next cookie set as a flat name->value dict and advance
        the rotation index.

        Fix: the original returned the raw pool entry (the whole
        {"id": ..., "cookies": [...]} wrapper), so requests never received
        usable cookies. The exported-browser format is now flattened the same
        way CookieManager does it.
        """
        if not self.cookies_pool:
            return {}

        cookie_set = self.cookies_pool[self.current_cookie_index]
        self.current_cookie_index = (self.current_cookie_index + 1) % len(self.cookies_pool)
        # Convert the exported browser format into what requests expects.
        return {c['name']: c['value'] for c in cookie_set.get('cookies', [])}

    def _make_headers(self) -> Dict:
        """Generate realistic browser-like headers with a random user agent."""
        return {
            'User-Agent': random.choice(self.user_agents),
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.9',
            'Accept-Encoding': 'gzip, deflate, br',
            'DNT': '1',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1'
        }

    def scrape_page(self, page_id: str, max_posts: int = 50) -> List[Dict]:
        """
        Scrape posts from a Facebook page

        Args:
            page_id: Facebook page ID or username
            max_posts: Maximum number of posts to scrape

        Returns:
            List of post dictionaries (empty on any failure)
        """
        url = f"https://mbasic.facebook.com/{page_id}"
        posts = []

        cookies = self._get_current_cookies()
        if not cookies:
            print("No cookies available, scraping may fail")
            cookies = {}

        try:
            # Use mbasic.facebook.com for simpler HTML
            response = self.session.get(
                url,
                headers=self._make_headers(),
                cookies=cookies,
                timeout=15
            )

            if response.status_code == 200:
                soup = BeautifulSoup(response.content, 'html.parser')

                # mbasic structure - simpler than desktop
                # NOTE(review): the 'async_like' class is markup-dependent and
                # changes with Facebook's HTML — verify against the live page.
                post_elements = soup.find_all('div', class_='async_like')

                for post_elem in post_elements[:max_posts]:
                    try:
                        post_data = self._extract_post_data(post_elem)
                        if post_data:
                            posts.append(post_data)
                    except Exception as e:
                        print(f"Error extracting post: {e}")
                        continue

                # Random delay to mimic human behavior
                time.sleep(random.uniform(2, 5))

            elif response.status_code == 403 or 'login' in response.url:
                print(f"Access denied for page {page_id}, may need fresh cookies")

        except Exception as e:
            print(f"Error scraping page {page_id}: {e}")

        return posts

    def _extract_post_data(self, post_elem) -> Dict:
        """Extract text and timestamp from a single post element."""
        # Implementation depends on HTML structure
        return {
            'text': post_elem.get_text(strip=True),
            'timestamp': self._extract_timestamp(post_elem)
        }

    def _extract_timestamp(self, post_elem) -> str:
        """Extract timestamp from post (placeholder: structure-dependent)."""
        # Parse timestamp from element
        return ""

Problem

After scraping successfully for a while, all requests started returning 500 Server Errors. This wasn't a rate limit (no 429), but Facebook's bot detection kicking in.

Error: HTTP 500 Internal Server Error - returned consistently

What I Tried

Attempt 1: Changed IP address - Still got 500 errors (account was flagged)
Attempt 2: Waited 24 hours - Errors persisted, cookie was burned
Attempt 3: Increased the interval between requests from 45 to 90 minutes - Delayed the 500s but they still happened

Actual Fix

The 500 errors meant my cookie/account was flagged. I needed a proper rotation strategy with multiple accounts and conservative timing.

# Cookie rotation strategy
import json
import random
from datetime import datetime, timedelta

class CookieManager:
    """Hand out exported Facebook cookie sets with a 2-hour cooling period
    between uses of the same account."""

    def __init__(self):
        self.cookies_file = 'fb_cookies.json'
        # Maps cookie-set id -> datetime of last use (in-memory only; resets
        # on process restart).
        self.usage_log = {}

    def get_fresh_cookie(self) -> "dict | None":
        """
        Get a cookie that hasn't been used recently
        Implements cooling period between uses

        Returns:
            A flat name->value cookie dict for requests, or None when every
            cookie set is still inside its 2-hour cooling period.

        Fix: the original annotation used ``Dict`` without importing it from
        ``typing``, which raised NameError the moment the class was defined;
        a string annotation is never evaluated and also documents the None case.
        """
        with open(self.cookies_file, 'r') as f:
            cookies = json.load(f)

        # Filter cookies that haven't been used in 2 hours
        now = datetime.now()
        available_cookies = []

        for cookie_set in cookies:
            cookie_id = cookie_set.get('id', 'unknown')

            if cookie_id not in self.usage_log:
                available_cookies.append(cookie_set)
                continue

            last_used = self.usage_log[cookie_id]
            if now - last_used > timedelta(hours=2):
                available_cookies.append(cookie_set)

        if not available_cookies:
            print("All cookies in cooling period, waiting...")
            return None

        # Select random cookie to distribute usage
        selected = random.choice(available_cookies)
        cookie_id = selected.get('id', 'unknown')
        self.usage_log[cookie_id] = now

        # Flatten the exported browser format to name->value pairs.
        return {
            c['name']: c['value']
            for c in selected.get('cookies', [])
        }

# Usage
# NOTE(review): this snippet assumes `import time` (not among the imports
# above) and a `scrape_with_cookie` function defined elsewhere — neither is
# shown in this sample.
cookie_manager = CookieManager()

while True:
    cookie = cookie_manager.get_fresh_cookie()
    if cookie:
        # Scrape with this cookie
        scrape_with_cookie(cookie)
    else:
        # Wait before retry
        time.sleep(3600)  # Wait 1 hour

Problem

Facebook started showing "You Can't Use This Feature Right Now" messages, indicating my account/cookie was temporarily blocked. This happens when Facebook detects automated behavior.

What I Tried

Attempt 1: Continued using same cookie - Block became permanent
Attempt 2: Switched to new account with same IP - New account also blocked quickly
Attempt 3: Used residential proxy - Worked but expensive

Actual Fix

The solution is to prevent detection in the first place by mimicking human behavior: random delays, mouse movements (if using Selenium), and realistic browsing patterns.

import undetected_chromedriver as uc
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
import time
import random

def human_like_scroll(driver):
    """Scroll the page and jiggle the mouse to mimic a human reading.

    Args:
        driver: a Selenium/undetected-chromedriver WebDriver instance.

    Fix: the original fetched ``document.body.scrollHeight`` into an unused
    local (``last_height``) — a wasted browser round-trip; removed.
    """
    for _ in range(random.randint(3, 5)):
        # Random scroll distance so the pattern isn't perfectly regular
        scroll_distance = random.randint(300, 800)

        # Scroll in increments (scrollBy jumps instantly; the randomized
        # distances and pauses below are what make it look human)
        driver.execute_script(f"window.scrollBy(0, {scroll_distance});")

        # Random pause
        time.sleep(random.uniform(0.5, 2.0))

    # Move mouse randomly to simulate human
    # NOTE(review): move_by_offset is relative to the current pointer position
    # and raises MoveTargetOutOfBoundsException if the offset leaves the
    # viewport — confirm this is acceptable for your page sizes.
    actions = ActionChains(driver)
    for _ in range(3):
        x_offset = random.randint(-100, 100)
        y_offset = random.randint(-100, 100)
        actions.move_by_offset(x_offset, y_offset).perform()
        time.sleep(random.uniform(0.1, 0.5))

def scrape_facebook_page(page_url: str):
    """
    Scrape Facebook page with human-like behavior

    Args:
        page_url: full URL of the Facebook page to visit.

    Returns:
        The list of post WebElements found on the page. (The original
        collected the elements and silently discarded them.)
    """
    options = uc.ChromeOptions()

    # Use realistic user agent
    options.add_argument('--user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36')

    driver = uc.Chrome(options=options)

    try:
        driver.get(page_url)

        # Wait for page load
        time.sleep(random.uniform(2, 4))

        # Scroll like a human
        human_like_scroll(driver)

        # Extract content.
        # Fix: the original selector '[data-testid="tweet"]' is Twitter's post
        # marker and matches nothing on Facebook. Facebook renders feed posts
        # as role="article" containers — TODO confirm against the live DOM,
        # which changes frequently.
        posts = driver.find_elements(By.CSS_SELECTOR, '[role="article"]')
        # Parse posts...

        # Random "reading" time
        time.sleep(random.uniform(3, 8))

        return posts

    finally:
        # Always release the browser, even on errors
        driver.quit()

What I Learned

Exporting Cookies from Browser

To use the cookie rotation system, you need real cookies from logged-in sessions:

Method 1: EditThisCookie Extension

# 1. Install EditThisCookie Chrome extension
# 2. Log into Facebook in Chrome
# 3. Click the cookie icon → Export → JSON
# 4. Save to fb_cookies.json

# Expected JSON format:
[
  {
    "id": "account_1",
    "cookies": [
      {"name": "c_user", "value": "123456", "domain": ".facebook.com"},
      {"name": "xs", "value": "abc123...", "domain": ".facebook.com"},
      {"name": "datr", "value": "xyz789...", "domain": ".facebook.com"}
    ]
  },
  {
    "id": "account_2",
    "cookies": [...]
  }
]

Method 2: Selenium Session Export

from selenium import webdriver
import json

def export_facebook_cookies():
    """Export cookies from Selenium session

    Opens Chrome, waits for a manual Facebook login, then writes the session
    cookies to fb_cookies.json in the pool format used by the scrapers.
    """
    browser = webdriver.Chrome()

    # Log in manually
    browser.get("https://www.facebook.com")
    input("Press Enter after logging in...")

    # Export cookies in the pooled format expected by the scraper classes
    payload = [{'id': 'selenium_session', 'cookies': browser.get_cookies()}]
    with open('fb_cookies.json', 'w') as f:
        json.dump(payload, f, indent=2)

    browser.quit()
    print("Cookies exported to fb_cookies.json")

Production Setup That Works

Here's my complete production setup for scraping Facebook pages at scale:

# fb_scraper.py - Production configuration

import requests
from bs4 import BeautifulSoup
import time
import random
import json
from typing import List, Optional
from datetime import datetime, timedelta
import logging

# Module-level logging: INFO level so per-request delays and scrape counts
# emitted by FacebookPageScraper are visible on the console.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class FacebookPageScraper:
    """
    Production Facebook page scraper with:
    - Cookie rotation
    - Rate limiting
    - Human-like behavior
    - Error recovery
    """

    def __init__(self, cookies_file: str = 'fb_cookies.json'):
        self.cookies_file = cookies_file
        self.cookies_pool = self._load_cookies()
        self.cookie_usage = {}  # Track when each cookie was last used
        self.session = requests.Session()

        # Configuration
        self.min_delay = 60  # Minimum 1 minute between requests
        self.max_delay = 180  # Maximum 3 minutes between requests
        self.cookie_cooldown = 2  # Hours before reusing cookie

    def _load_cookies(self) -> List[dict]:
        """Load cookie sets from file; empty list when the file is missing.

        Fix: the original annotation was ``List[Dict]`` but only ``List`` and
        ``Optional`` are imported from typing, so defining the class raised
        NameError. Builtin ``dict`` is used instead.
        """
        try:
            with open(self.cookies_file, 'r') as f:
                return json.load(f)
        except FileNotFoundError:
            logger.error(f"Cookies file {self.cookies_file} not found")
            return []

    def _get_available_cookie(self) -> Optional[dict]:
        """
        Get a cookie that's available (past cooldown period)
        Returns None if all cookies are cooling down

        Side effect: the returned cookie set is immediately marked as used
        in self.cookie_usage.
        """
        now = datetime.now()

        for cookie_set in self.cookies_pool:
            cookie_id = cookie_set.get('id', 'unknown')

            # Skip if recently used
            if cookie_id in self.cookie_usage:
                last_used = self.cookie_usage[cookie_id]
                if now - last_used < timedelta(hours=self.cookie_cooldown):
                    continue

            # Mark as used
            self.cookie_usage[cookie_id] = now

            # Convert exported browser format to flat name->value dict
            cookie_dict = {}
            for c in cookie_set.get('cookies', []):
                cookie_dict[c['name']] = c['value']

            return cookie_dict

        return None

    def _human_delay(self):
        """Random delay to mimic human behavior"""
        delay = random.uniform(self.min_delay, self.max_delay)
        logger.info(f"Waiting {delay:.1f} seconds...")
        time.sleep(delay)

    def scrape_page(self, page_id: str, max_posts: int = 100) -> List[dict]:
        """
        Scrape posts from a Facebook page

        Args:
            page_id: Facebook page ID or username
            max_posts: Maximum posts to scrape

        Returns:
            List of scraped posts (empty on any error)
        """
        # Wait for an available cookie. Fix: the original recursed into
        # scrape_page after each hour of waiting, growing the call stack
        # without bound; a loop is equivalent and safe.
        cookie = self._get_available_cookie()
        while not cookie:
            logger.warning("No cookies available, waiting for cooldown...")
            time.sleep(3600)  # Wait 1 hour
            cookie = self._get_available_cookie()

        # Use mbasic.facebook.com for simpler HTML
        url = f"https://mbasic.facebook.com/{page_id}"

        headers = {
            'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 16_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.0 Mobile/15E148 Safari/604.1',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.9'
        }

        try:
            response = self.session.get(
                url,
                headers=headers,
                cookies=cookie,
                timeout=20
            )

            if response.status_code == 200:
                # Fix: max_posts was accepted but never applied; it is now
                # passed through to the parser.
                posts = self._parse_posts(response.content, max_posts)
                logger.info(f"Scraped {len(posts)} posts from {page_id}")
                return posts

            elif response.status_code == 403:
                logger.error(f"Access denied for {page_id}, cookie may be burned")
                return []

            elif 'login' in response.url:
                logger.error(f"Redirected to login, cookie expired for {page_id}")
                return []

            else:
                logger.error(f"Unexpected status {response.status_code} for {page_id}")
                return []

        except Exception as e:
            logger.error(f"Error scraping {page_id}: {e}")
            return []

        finally:
            # Always add delay between requests
            # NOTE(review): the delay is skipped for a single-cookie pool —
            # presumably because the cooldown already throttles it; confirm.
            if len(self.cookies_pool) > 1:
                self._human_delay()

    def _parse_posts(self, html_content: bytes, max_posts: Optional[int] = None) -> List[dict]:
        """Parse posts from HTML content

        Args:
            html_content: raw response body from mbasic.facebook.com.
            max_posts: optional cap on the number of posts returned
                (None = no cap, preserving the old behavior for old callers).
        """
        soup = BeautifulSoup(html_content, 'html.parser')
        posts = []

        # Parse mbasic structure
        # Implementation depends on current HTML structure
        post_elements = soup.find_all('div', class_='async_like')
        if max_posts is not None:
            post_elements = post_elements[:max_posts]

        for elem in post_elements:
            try:
                post = {
                    'text': elem.get_text(strip=True),
                    'timestamp': self._extract_timestamp(elem)
                }
                posts.append(post)
            except Exception as e:
                logger.debug(f"Error parsing post: {e}")
                continue

        return posts

    def _extract_timestamp(self, element) -> str:
        """Extract timestamp from post element (placeholder)."""
        # Implementation
        return ""

# Usage example
if __name__ == "__main__":
    scraper = FacebookPageScraper()

    # Page usernames (mbasic.facebook.com/<page_id> accepts usernames or IDs)
    pages_to_scrape = [
        'microsoft',
        'google',
        'amazon'
    ]

    for page_id in pages_to_scrape:
        # Each call picks an available cookie and self-throttles between
        # requests (see FacebookPageScraper.scrape_page).
        posts = scraper.scrape_page(page_id, max_posts=50)
        print(f"Got {len(posts)} posts from {page_id}")

        # Save to database or file
        # save_posts(page_id, posts)

Monitoring & Debugging

Red Flags to Watch For

Signs that a cookie or account is about to be burned: redirects to the login page, consistent HTTP 500 responses (not 429s), and "You Can't Use This Feature Right Now" messages.

Health Check Script

def check_cookie_health(cookie: dict) -> bool:
    """Check if a cookie is still valid

    Args:
        cookie: flat name->value cookie dict for facebook.com.

    Returns:
        True if the cookie still yields a logged-in page; False if Facebook
        redirects to login, returns 403, or the request fails outright.
    """
    try:
        response = requests.get(
            "https://mbasic.facebook.com/",
            cookies=cookie,
            timeout=10
        )
    except requests.RequestException:
        # Fix: timeouts/connection errors previously propagated and could
        # crash a monitoring loop; a cookie we cannot verify is treated as
        # unhealthy instead.
        return False

    # If we get redirected to login, cookie is dead
    if 'login' in response.url or response.status_code == 403:
        return False

    return True

Related Resources

⚠️ Legal Note

Web scraping Facebook may violate their Terms of Service. This article is for educational purposes. Always respect Facebook's ToS and robots.txt. Consider using the official Facebook Graph API with appropriate permissions for legitimate use cases. Scraping personal data without consent may violate privacy laws.