What are Headless Browsers?
Headless browsers are web browsers without a graphical user interface (no on-screen window) that you control programmatically - through a script rather than a mouse and keyboard. Because they run the full browser engine, they can execute JavaScript and build the page exactly as a user would see it. That makes them essential for web automation, automated testing, and scraping JavaScript-heavy websites where the content only appears after scripts run.
Popular Options
1. Chrome Headless
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
class HeadlessChrome:
    """Thin wrapper around a Selenium-driven Chrome running in headless mode.

    The underlying WebDriver is exposed as ``self.driver``. Each instance owns
    a real Chrome process, so call :meth:`close` when finished or use the
    instance as a context manager::

        with HeadlessChrome() as chrome:
            html = chrome.get_page_content("https://example.com")
    """

    def __init__(self):
        """Start Chrome headless with flags commonly needed in containers/CI."""
        options = Options()
        options.add_argument('--headless=new')  # New headless mode
        # Turns off Chrome's sandbox. Commonly required when Chrome runs as
        # root inside a container; remove it if your environment allows.
        options.add_argument('--no-sandbox')
        # Avoid /dev/shm for shared memory; it is often too small in Docker
        # and a full /dev/shm makes Chrome crash on heavier pages.
        options.add_argument('--disable-dev-shm-usage')
        self.driver = webdriver.Chrome(options=options)

    def __enter__(self):
        """Return the wrapper itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Shut the browser down on leaving the ``with`` block."""
        self.close()
        return False  # never swallow the caller's exception

    def get_page_content(self, url):
        """Load *url* and return the page's HTML as serialized by the browser.

        Args:
            url: Address to navigate to.

        Returns:
            The value of ``driver.page_source`` after navigation returns.
        """
        self.driver.get(url)
        return self.driver.page_source

    def close(self):
        """Quit the browser and release its process.

        Safe to call more than once; after the first call ``self.driver`` is
        ``None``.
        """
        driver = getattr(self, 'driver', None)
        self.driver = None
        if driver is not None:
            driver.quit()
2. Playwright
from playwright.sync_api import sync_playwright
class PlaywrightBrowser:
    """Headless Chromium controlled through Playwright's synchronous API.

    The class is built on ``playwright.sync_api``, so every method is a plain
    blocking call. Sync API objects are not awaitable; do not ``await`` these
    methods or call them from inside a running asyncio event loop.

    The launched browser is exposed as ``self.browser`` and the Playwright
    driver as ``self.playwright``. Call :meth:`close` when finished, or use
    the instance as a context manager.
    """

    def __init__(self):
        """Start the Playwright driver and launch headless Chromium."""
        self.playwright = sync_playwright().start()
        try:
            self.browser = self.playwright.chromium.launch(headless=True)
        except Exception:
            # Do not leave the driver process running if launch fails.
            self.playwright.stop()
            raise

    def __enter__(self):
        """Return the wrapper itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the browser and stop the driver on leaving the block."""
        self.close()
        return False  # never swallow the caller's exception

    def scrape_spa(self, url, selector='.dynamic-content'):
        """Render a JavaScript-driven page and return its final HTML.

        Args:
            url: Address of the page to load.
            selector: CSS selector of an element that only exists once the
                dynamic content has rendered. Defaults to the selector the
                original example waited for.

        Returns:
            The page's HTML after the network went idle and *selector*
            appeared.

        Raises:
            playwright.sync_api.TimeoutError: If navigation or the wait for
                *selector* exceeds Playwright's timeout.
        """
        page = self.browser.new_page()
        try:
            page.goto(url, wait_until='networkidle')
            # Wait for dynamic content
            page.wait_for_selector(selector)
            return page.content()
        finally:
            # Each page holds a tab's worth of resources; always release it.
            page.close()

    def close(self):
        """Close the browser and stop the Playwright driver."""
        try:
            self.browser.close()
        finally:
            self.playwright.stop()
Use Cases
1. JavaScript Rendering
class DynamicContentScraper:
    """Scrape content that only appears after a page's JavaScript has run."""

    def __init__(self):
        """Start the headless Chrome instance used for rendering."""
        self.browser = HeadlessChrome()

    def get_rendered_content(self, url, class_name='dynamic-data', timeout=10):
        """Load *url*, wait for the dynamic element, and extract its text.

        Args:
            url: Address of the page to load.
            class_name: CSS class of the element that the page's scripts
                insert. Defaults to the class the original example used.
            timeout: Maximum number of seconds to wait for the element.

        Returns:
            A dict with the page ``'title'`` and the element's text under
            ``'content'``.

        Raises:
            selenium.common.exceptions.TimeoutException: If no element with
                *class_name* is present within *timeout* seconds.
        """
        # Imported here so this snippet is self-contained: the wait helpers
        # are not part of the imports at the top of the file.
        from selenium.webdriver.common.by import By
        from selenium.webdriver.support import expected_conditions as EC
        from selenium.webdriver.support.ui import WebDriverWait

        driver = self.browser.driver
        driver.get(url)

        # Wait for specific elements. ``until`` returns the located element,
        # so there is no need to look it up a second time afterwards.
        element = WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located((By.CLASS_NAME, class_name))
        )

        # Extract data after rendering
        return {
            'title': driver.title,
            'content': element.text,
        }
2. Performance Testing
class PerformanceTester:
    """Measure page-load timings with a headless Playwright browser."""

    def __init__(self):
        """Start the Playwright-backed browser used for measurements."""
        self.browser = PlaywrightBrowser()

    def measure_load_time(self, url):
        """Load *url* and return its navigation timings in milliseconds.

        This is a blocking call: the wrapped browser uses Playwright's
        synchronous API, whose objects cannot be awaited.

        Args:
            url: Address of the page to measure.

        Returns:
            A dict with ``'loadTime'`` (navigation start to end of the load
            event) and ``'domReady'`` (navigation start to end of
            DOMContentLoaded), both in milliseconds.
        """
        # The wrapper exposes the launched browser as ``.browser``; pages are
        # created from that object, not from the wrapper itself.
        page = self.browser.browser.new_page()
        try:
            # Timings only exist for a page that has actually been loaded.
            page.goto(url)
            # loadEventEnd stays 0 until the load handlers have finished;
            # reading it earlier would yield a negative loadTime.
            page.wait_for_function(
                "() => window.performance.timing.loadEventEnd > 0"
            )
            # Measure performance metrics.
            # NOTE: performance.timing is the legacy Navigation Timing API;
            # it is kept here so the returned values stay integer
            # millisecond differences.
            performance = page.evaluate("""
                () => {
                    const timing = window.performance.timing;
                    return {
                        loadTime: timing.loadEventEnd - timing.navigationStart,
                        domReady: timing.domContentLoadedEventEnd - timing.navigationStart
                    }
                }
            """)
            return performance
        finally:
            # Each page holds a tab's worth of resources; always release it.
            page.close()
Remember: headless browsers are powerful, but they run a full browser engine, so they use far more CPU and memory than a simple HTTP request. Reach for them only when you actually need JavaScript or interaction.
