What are Headless Browsers?
Headless browsers are web browsers without a graphical user interface that can be controlled programmatically. They're essential for web automation, testing, and scraping JavaScript-heavy websites.
Popular Options
1. Chrome Headless
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
class HeadlessChrome:
    """Thin wrapper around a Selenium-driven Chrome running without a UI.

    The instance owns a real browser process. Call ``close()`` when done,
    or use the object as a context manager so the process is shut down
    even if scraping raises::

        with HeadlessChrome() as browser:
            html = browser.get_page_content(url)
    """

    def __init__(self):
        """Start a headless Chrome session and store it on ``self.driver``."""
        options = Options()
        options.add_argument('--headless=new')  # New headless mode
        # The next two flags are the usual workarounds for running Chrome
        # inside containers / CI: no sandbox, and no reliance on a small
        # /dev/shm.
        options.add_argument('--no-sandbox')
        options.add_argument('--disable-dev-shm-usage')
        self.driver = webdriver.Chrome(options=options)

    def __enter__(self):
        """Return the wrapper itself for use in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Shut the browser down on leaving the ``with`` block.

        Returns ``False`` so an exception raised inside the block propagates.
        """
        self.close()
        return False

    def get_page_content(self, url):
        """Navigate to ``url`` and return the page source as a string."""
        self.driver.get(url)
        return self.driver.page_source

    def close(self):
        """Quit the browser; safe to call more than once."""
        driver = getattr(self, 'driver', None)
        if driver is not None:
            driver.quit()
            # Drop the reference so a second close() is a no-op.
            self.driver = None
2. Playwright
from playwright.sync_api import sync_playwright
class PlaywrightBrowser:
    """Headless Chromium controlled through Playwright's synchronous API.

    Everything here uses ``playwright.sync_api``, so the methods are plain
    blocking calls and must not be awaited.
    """

    def __init__(self):
        """Start the Playwright driver and launch a headless Chromium."""
        self.playwright = sync_playwright().start()
        self.browser = self.playwright.chromium.launch(headless=True)

    def scrape_spa(self, url, selector='.dynamic-content'):
        """Return the rendered HTML of a JavaScript-driven page.

        Args:
            url: Address of the page to load.
            selector: CSS selector that must be present before the HTML is
                captured. Defaults to ``'.dynamic-content'``.

        Returns:
            The page's HTML as a string, taken after ``selector`` appeared.
        """
        page = self.browser.new_page()
        try:
            page.goto(url, wait_until='networkidle')
            # Wait for dynamic content
            page.wait_for_selector(selector)
            return page.content()
        finally:
            # Close the tab even if navigation or the wait fails, so pages
            # do not pile up across calls.
            page.close()

    def close(self):
        """Close the browser and stop the Playwright driver."""
        self.browser.close()
        self.playwright.stop()
Use Cases
1. JavaScript Rendering
class DynamicContentScraper:
    """Scrape content that only appears after the page's JavaScript runs."""

    def __init__(self):
        """Create the headless Chrome wrapper used for all requests."""
        self.browser = HeadlessChrome()

    def get_rendered_content(self, url, timeout=10):
        """Load ``url`` and extract data once the dynamic element exists.

        Args:
            url: Address of the page to load.
            timeout: Seconds to wait for an element with class
                ``dynamic-data`` to appear. Defaults to 10.

        Returns:
            A dict with the page ``'title'`` and the text ``'content'`` of
            the ``dynamic-data`` element.

        Raises:
            selenium.common.exceptions.TimeoutException: If the element
                does not appear within ``timeout`` seconds.
        """
        # Imported here so this snippet is runnable on its own; these names
        # are not imported anywhere else in the file.
        from selenium.webdriver.common.by import By
        from selenium.webdriver.support import expected_conditions as EC
        from selenium.webdriver.support.ui import WebDriverWait

        driver = self.browser.driver
        driver.get(url)

        # Wait for specific elements; until() hands back the located
        # element, so no second lookup is needed.
        element = WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located((By.CLASS_NAME, 'dynamic-data'))
        )

        # Extract data after rendering
        return {
            'title': driver.title,
            'content': element.text,
        }
2. Performance Testing
class PerformanceTester:
    """Collect basic page-load timings with a Playwright-driven browser."""

    def __init__(self):
        """Create the Playwright wrapper used to open pages."""
        self.browser = PlaywrightBrowser()

    def measure_load_time(self, url):
        """Load ``url`` and return its navigation timings in milliseconds.

        The wrapper uses Playwright's synchronous API, so this is a plain
        blocking call.

        Args:
            url: Address of the page to measure.

        Returns:
            A dict with ``'loadTime'`` (navigation start to end of the load
            event) and ``'domReady'`` (navigation start to end of
            DOMContentLoaded).
        """
        # The Playwright Browser object lives on the wrapper's ``browser``
        # attribute; the wrapper itself has no new_page().
        page = self.browser.browser.new_page()
        try:
            # Navigate first: without this the timings describe a blank page.
            page.goto(url, wait_until='load')
            # Measure performance metrics
            performance = page.evaluate("""
                () => {
                    const timing = window.performance.timing;
                    return {
                        loadTime: timing.loadEventEnd - timing.navigationStart,
                        domReady: timing.domContentLoadedEventEnd - timing.navigationStart
                    }
                }
            """)
            return performance
        finally:
            # Always release the tab, even when navigation fails.
            page.close()
Remember: headless browsers are powerful tools, but they use considerably more CPU and memory than simple HTTP requests.
