Understanding Dynamic Content
1. Types of Dynamic Loading
- AJAX requests
- Infinite scroll
- Lazy loading
- WebSocket updates
- React/Vue.js state changes
Solution Approaches
1. Using Selenium
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
class DynamicScraper:
    """Selenium-based scraper for pages that append content while scrolling.

    The instance owns a Chrome WebDriver. Release it with ``close()`` or use
    the scraper as a context manager::

        with DynamicScraper() as scraper:
            texts = scraper.scrape_infinite_scroll(url)
    """

    def __init__(self):
        self.driver = webdriver.Chrome()
        # Explicit-wait helper bound to the driver (timeout argument 10).
        # Not used by scrape_infinite_scroll; kept for callers that need to
        # wait on a specific element.
        self.wait = WebDriverWait(self.driver, 10)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def close(self):
        """Quit the browser and end the WebDriver session."""
        self.driver.quit()

    def scrape_infinite_scroll(self, url, scroll_pause=2, max_scrolls=None):
        """Scroll ``url`` to the bottom repeatedly, then collect item texts.

        Args:
            url: Page to open.
            scroll_pause: Seconds to sleep after each scroll so newly
                requested content has time to render.
            max_scrolls: Optional upper bound on the number of scrolls.
                ``None`` (the default) scrolls until the page height stops
                growing, which never terminates on a truly endless feed.

        Returns:
            A list with the ``.text`` of every element matching the CSS
            selector ``.content-item`` once scrolling has finished.
        """
        # Local import: the snippet calls time.sleep but never imports time.
        import time

        self.driver.get(url)
        last_height = self.driver.execute_script(
            'return document.body.scrollHeight'
        )

        scrolls = 0
        while max_scrolls is None or scrolls < max_scrolls:
            # Jump to the current bottom to trigger the next batch.
            self.driver.execute_script(
                'window.scrollTo(0, document.body.scrollHeight);'
            )
            scrolls += 1

            # Give the page time to fetch and render the new content.
            time.sleep(scroll_pause)

            new_height = self.driver.execute_script(
                'return document.body.scrollHeight'
            )
            # An unchanged height means nothing new was appended.
            if new_height == last_height:
                break
            last_height = new_height

        elements = self.driver.find_elements(By.CSS_SELECTOR, '.content-item')
        return [elem.text for elem in elements]
2. Using Playwright
from playwright.sync_api import sync_playwright
class ModernScraper:
    """Playwright-based scraper for single-page applications (async API).

    ``scrape_spa`` is a coroutine, so the browser is driven through
    Playwright's *async* API. Objects from the sync API are not awaitable
    and the sync API cannot be used inside a running asyncio event loop.
    The browser is therefore launched lazily on first use, from inside the
    event loop, rather than in ``__init__``.
    """

    # In-page extraction script. Optional chaining keeps one item that lacks
    # a title or description from aborting the whole evaluation.
    _EXTRACT_ITEMS_JS = '''
        () => {
            const items = document.querySelectorAll('.item');
            return Array.from(items).map(item => ({
                title: item.querySelector('.title')?.innerText ?? null,
                description: item.querySelector('.desc')?.innerText ?? null
            }));
        }
    '''

    def __init__(self):
        # Populated by _ensure_browser() on the first scrape.
        self.playwright = None
        self.browser = None

    async def _ensure_browser(self):
        """Start Playwright and launch Chromium once; return the browser."""
        if self.browser is None:
            # Local import so this snippet is self-contained.
            from playwright.async_api import async_playwright

            self.playwright = await async_playwright().start()
            self.browser = await self.playwright.chromium.launch()
        return self.browser

    async def close(self):
        """Close the browser and stop the Playwright driver, if started."""
        if self.browser is not None:
            await self.browser.close()
            self.browser = None
        if self.playwright is not None:
            await self.playwright.stop()
            self.playwright = None

    async def scrape_spa(self, url):
        """Load ``url``, wait for its dynamic content, and extract items.

        Args:
            url: Address of the page to scrape.

        Returns:
            A list of ``{'title': ..., 'description': ...}`` dicts, one per
            element matching ``.item``. A field is ``None`` when the item has
            no ``.title`` / ``.desc`` child.
        """
        browser = await self._ensure_browser()
        page = await browser.new_page()
        try:
            # Navigate and wait until the network has gone idle.
            await page.goto(url, wait_until='networkidle')
            # Wait for the container that signals rendered content.
            await page.wait_for_selector('.dynamic-content')
            return await page.evaluate(self._EXTRACT_ITEMS_JS)
        finally:
            # Always release the tab, even when a wait times out.
            await page.close()
3. Intercepting AJAX Requests
from mitmproxy import ctx
class AjaxInterceptor:
    """Proxy hook object that tags requests as AJAX and records JSON replies.

    Attributes:
        data: Decoded JSON bodies, in arrival order, of every response whose
            request URL contains the substring ``'api'``.
    """

    def __init__(self):
        self.data = []

    def request(self, flow):
        """Add the ``X-Requested-With: XMLHttpRequest`` header to the request."""
        flow.request.headers['X-Requested-With'] = 'XMLHttpRequest'

    def response(self, flow):
        """Store the decoded JSON body of API responses; ignore everything else.

        Responses with no body, or with a body that is not valid JSON, are
        skipped silently. Capture is best-effort and must not break the
        proxied traffic.
        """
        # Local import: the snippet uses json but never imports it.
        import json

        if 'api' not in flow.request.pretty_url:
            return

        response = flow.response
        body = response.content if response is not None else None
        if not body:
            # Nothing to decode (missing or empty body).
            return

        try:
            self.data.append(json.loads(body))
        except ValueError:
            # ValueError covers json.JSONDecodeError and the
            # UnicodeDecodeError raised for undecodable bytes.
            pass
# Usage with Selenium: send Chrome's traffic through the intercepting proxy.
# Single source of truth for the proxy address, so the mapping and the Chrome
# flag cannot drift apart.
PROXY_ADDRESS = 'localhost:8080'

# NOTE(review): `proxy` is not referenced by the Selenium setup below;
# presumably it is meant for an HTTP client's proxies mapping. Confirm.
proxy = {
    'http': f'http://{PROXY_ADDRESS}',
    'https': f'http://{PROXY_ADDRESS}',
}

options = webdriver.ChromeOptions()
options.add_argument(f'--proxy-server={PROXY_ADDRESS}')
driver = webdriver.Chrome(options=options)
Best Practices
1. Handling Loading States
class LoadingHandler:
    """Helper that blocks until a page has finished loading its content."""

    def wait_for_load(self, page):
        """Wait for network idle, spinner removal, and the content marker.

        Args:
            page: Page object whose wait methods are called synchronously
                (assumed to be a Playwright sync-API ``Page``; confirm
                against the caller).

        Raises:
            Whatever ``page.wait_for_selector('.content-loaded')`` raises if
            the content marker never appears. Only the spinner check is
            best-effort.
        """
        # Playwright's TimeoutError derives from playwright's own Error
        # class, not from the builtin TimeoutError, so it must be imported
        # explicitly for the handler below to catch it.
        from playwright.sync_api import TimeoutError as PlaywrightTimeoutError

        # Wait for network idle.
        page.wait_for_load_state('networkidle')

        # Check loading indicators.
        try:
            page.wait_for_selector('.loading-spinner', state='hidden')
        except (PlaywrightTimeoutError, TimeoutError):
            # Deliberate best effort: a spinner that never hides must not
            # stop us from checking for the real content below.
            pass

        # Ensure content is ready.
        page.wait_for_selector('.content-loaded')
2. Error Recovery
class ResilientScraper:
    """Extraction and retry helpers that degrade gracefully on failure."""

    def safe_extract(self, page, selector, timeout=5000):
        """Return the text content of ``selector``, or ``None`` on timeout.

        Args:
            page: Page object queried synchronously (assumed to be a
                Playwright sync-API ``Page``; confirm against the caller).
            selector: Selector passed to ``page.wait_for_selector``.
            timeout: Maximum wait, forwarded unchanged to
                ``wait_for_selector`` (milliseconds in Playwright).

        Returns:
            The element's ``text_content()``. ``None`` if the wait timed out
            (a warning is logged) or no element handle was returned.
        """
        # Local imports keep the snippet self-contained. Playwright's
        # TimeoutError is not a subclass of the builtin TimeoutError, so the
        # builtin alone would never catch a selector timeout.
        import logging
        from playwright.sync_api import TimeoutError as PlaywrightTimeoutError

        try:
            element = page.wait_for_selector(selector, timeout=timeout)
        except (PlaywrightTimeoutError, TimeoutError):
            logging.getLogger(__name__).warning(
                'Element %s not found', selector
            )
            return None

        if element is None:
            return None
        return element.text_content()

    async def retry_action(self, action, max_retries=3):
        """Await ``action()`` with exponential backoff between failures.

        Args:
            action: Zero-argument callable returning an awaitable.
            max_retries: Total number of attempts. The delay after failed
                attempt ``k`` (0-based) is ``2 ** k`` seconds.

        Returns:
            The result of the first successful attempt. ``None`` if
            ``max_retries`` is zero or negative, because no attempt is made.

        Raises:
            The exception raised by the final attempt, unchanged.
        """
        # Local import: the snippet uses asyncio but never imports it.
        import asyncio

        for attempt in range(max_retries):
            try:
                return await action()
            except Exception:
                # Out of attempts: surface the original error and traceback.
                if attempt == max_retries - 1:
                    raise
                await asyncio.sleep(2 ** attempt)
Remember: Dynamic content scraping requires patience and proper waiting mechanisms. Always respect the website's resources and implement appropriate delays.
