Python Web Scraping

How to extract data from websites using Selenium Python? (2026 Guide)

How to extract data from websites using Selenium Python? (2026 Guide) — conceptual illustration
On this page

How to extract data from websites using Selenium Python? (2026 Guide).

Quick facts

What it is: Browser automation via WebDriver
Best for: JS-rendered pages & interactions
Locators: CSS selectors, XPath
Key skill: Explicit waits over sleep()
Lighter alt: Playwright (modern API)

Quick Setup Guide

import time

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

class ModernSeleniumScraper:
    """Chrome WebDriver wrapper usable as a context manager.

    Entering the ``with`` block yields the scraper itself; leaving it
    quits the browser, whether or not an exception was raised.

    Attributes:
        driver: The Chrome WebDriver created in ``__init__``.
        wait: A ``WebDriverWait`` bound to ``driver`` with a 10 second timeout.
    """

    def __init__(self, headless=True):
        """Start Chrome and prepare the shared explicit wait.

        Args:
            headless: When true, pass ``--headless`` to Chrome.
        """
        # Flags are applied in this order: optional --headless first,
        # then the two flags that are always set.
        chrome_flags = ['--no-sandbox', '--disable-dev-shm-usage']
        if headless:
            chrome_flags.insert(0, '--headless')

        chrome_options = webdriver.ChromeOptions()
        for flag in chrome_flags:
            chrome_options.add_argument(flag)

        self.driver = webdriver.Chrome(options=chrome_options)
        self.wait = WebDriverWait(self.driver, timeout=10)

    def __enter__(self):
        """Return this scraper as the ``with`` target."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Quit the browser; exceptions from the block are not suppressed."""
        self.driver.quit()

Essential Features

1. Finding Elements Smartly

def find_element_safely(self, by, value, timeout=10):
    """Wait for an element to be present in the DOM and return it.

    Args:
        by: Locator strategy (for example ``By.ID`` or ``By.CSS_SELECTOR``).
        value: Locator value matching the chosen strategy.
        timeout: Maximum number of seconds to wait for the element.

    Returns:
        The located element, or None if it did not appear within ``timeout``.
    """
    # Build the wait per call so the caller's timeout is actually honoured;
    # the shared self.wait is fixed at construction time.
    wait = WebDriverWait(self.driver, timeout)
    try:
        return wait.until(EC.presence_of_element_located((by, value)))
    except TimeoutException:
        print(f'Element {value} not found within {timeout} seconds')
        return None

# Usage examples: find_element_safely is a scraper method, so it must be
# called on a scraper instance (the with-block also closes the browser).
with ModernSeleniumScraper() as scraper:
    button = scraper.find_element_safely(By.ID, 'submit-button')
    heading = scraper.find_element_safely(By.CSS_SELECTOR, 'h1.title')
    link = scraper.find_element_safely(By.XPATH, '//a[contains(text(), "Next")]')

2. Handling Dynamic Content

def wait_for_dynamic_content(self, selector, timeout=10):
    """Wait until the element matching a CSS selector is clickable.

    Args:
        selector: CSS selector of the element to wait for.
        timeout: Maximum number of seconds to wait.

    Returns:
        The clickable element, or None if the wait timed out.
    """
    # Use a per-call wait so the timeout argument takes effect instead of
    # always falling back to the fixed timeout of self.wait.
    wait = WebDriverWait(self.driver, timeout)
    try:
        return wait.until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, selector))
        )
    except TimeoutException:
        print(f'Dynamic content not loaded: {selector}')
        return None

# Handle infinite scroll
def scroll_to_bottom(self, pause=2, max_scrolls=None):
    """Scroll down repeatedly until the page height stops growing.

    Args:
        pause: Seconds to sleep after each scroll so new content can load.
        max_scrolls: Optional upper bound on the number of scrolls. None
            (the default) keeps scrolling until the height is stable, which
            never terminates on a feed that always loads more content.

    Returns:
        None.
    """
    height_script = 'return document.body.scrollHeight'
    last_height = self.driver.execute_script(height_script)
    scrolls = 0
    while max_scrolls is None or scrolls < max_scrolls:
        self.driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
        scrolls += 1
        time.sleep(pause)  # Allow content to load

        new_height = self.driver.execute_script(height_script)
        if new_height == last_height:
            # Height unchanged after a scroll: nothing more was loaded.
            break
        last_height = new_height

3. Real-World Example: E-commerce Scraper

class EcommerceScraper(ModernSeleniumScraper):
    """Scraper that extracts product details from a product page.

    All CSS selectors used here are placeholders; adjust them to the
    markup of the site being scraped.
    """

    def scrape_product_page(self, url):
        """Load a product page and collect its fields into a dict.

        Args:
            url: Address of the product page.

        Returns:
            A dict with the keys 'title', 'price', 'description', 'rating',
            'reviews' and 'url', or None if any step raised an exception.
        """
        try:
            self.driver.get(url)

            # Wait for main content
            self.wait_for_dynamic_content('.product-container')

            return {
                'title': self.get_text('h1.product-title'),
                'price': self.get_price('.product-price'),
                'description': self.get_text('.product-description'),
                'rating': self.get_rating('.product-rating'),
                'reviews': self.get_reviews('.review-section'),
                'url': url
            }
        except Exception as e:
            # Per-page boundary: report the failure and let the caller
            # carry on with other URLs.
            print(f'Error scraping {url}: {e}')
            return None

    def get_text(self, selector):
        """Return the stripped text of the matching element, or None."""
        element = self.find_element_safely(By.CSS_SELECTOR, selector)
        return element.text.strip() if element else None

    def get_price(self, selector):
        """Return the price as a float, or None if missing or unparsable.

        A leading dollar sign and thousands separators are removed before
        conversion, so '$1,299.00' becomes 1299.0.
        """
        price_elem = self.find_element_safely(By.CSS_SELECTOR, selector)
        if not price_elem:
            return None
        price_text = price_elem.text.strip().replace('$', '').replace(',', '')
        try:
            return float(price_text)
        except ValueError:
            return None

    def get_rating(self, selector):
        """Return the first number found in the rating text, or None.

        Handles text such as '4.5 out of 5' or '4.5/5' by scanning the
        whitespace- and slash-separated tokens for one that parses as a
        float.
        """
        rating_text = self.get_text(selector)
        if not rating_text:
            return None
        for token in rating_text.replace('/', ' ').split():
            try:
                return float(token)
            except ValueError:
                continue
        return None

    def get_reviews(self, selector, item_selector='.review'):
        """Return the non-empty review texts found inside a section.

        Args:
            selector: CSS selector of the element containing the reviews.
            item_selector: CSS selector of one review inside that element.
                The default is a placeholder; set it to match the site.

        Returns:
            A list of stripped review strings; empty if the section is
            missing or holds no matching items.
        """
        section = self.find_element_safely(By.CSS_SELECTOR, selector)
        if section is None:
            return []
        texts = (
            item.text.strip()
            for item in section.find_elements(By.CSS_SELECTOR, item_selector)
        )
        return [text for text in texts if text]

Best Practices

1. Error Handling

  • Always use try-except blocks
  • Implement timeouts
  • Handle stale elements
  • Log errors properly

2. Performance Optimization

  • Use headless mode when possible
  • Implement element caching
  • Minimize page loads
  • Clean up resources

3. Anti-Detection Measures

def configure_stealth_options(self):
    """Build Chrome options intended to make automation less conspicuous.

    Returns:
        A new ``ChromeOptions`` object carrying the flags and experimental
        options below. It is only returned, not applied to a driver.
    """
    stealth = webdriver.ChromeOptions()

    # Command-line switches, added in order.
    for switch in (
        '--disable-blink-features=AutomationControlled',
        '--disable-infobars',
    ):
        stealth.add_argument(switch)

    # Experimental options, added in order.
    experimental = (
        ('excludeSwitches', ['enable-automation']),
        ('useAutomationExtension', False),
    )
    for option_name, option_value in experimental:
        stealth.add_experimental_option(option_name, option_value)

    return stealth

4. Data Validation

def validate_extracted_data(self, data):
    """Check that scraped data contains every required field.

    Args:
        data: Mapping of scraped fields, or None when scraping failed.

    Returns:
        True if 'title', 'price' and 'description' are all present and
        non-empty; False otherwise, including when ``data`` is None or empty.
    """
    if not data:
        # A failed scrape yields None; treat it as invalid, not as a crash.
        return False

    def is_present(value):
        # A numeric zero (e.g. a free product's price) is real data, so only
        # non-numeric values are judged by truthiness.
        if isinstance(value, (int, float)) and not isinstance(value, bool):
            return True
        return bool(value)

    required_fields = ['title', 'price', 'description']
    return all(is_present(data.get(field)) for field in required_fields)

Common Challenges & Solutions

1. Handling Popups

def handle_popup(self):
    """Close a popup if one shows up; do nothing otherwise.

    Waits on the shared ``self.wait`` for an element with class 'popup',
    then clicks the 'close-button' element inside it. This is best-effort:
    a missing popup or a popup without a close button is not an error.
    """
    try:
        popup = self.wait.until(
            EC.presence_of_element_located((By.CLASS_NAME, 'popup'))
        )
        close_button = popup.find_element(By.CLASS_NAME, 'close-button')
        close_button.click()
    except TimeoutException:
        pass  # No popup found
    except NoSuchElementException:
        # The popup exists but offers nothing to click; report it rather
        # than letting an optional cleanup step abort the scrape.
        print('Popup found but it has no close button')

2. Managing Sessions

def login(self, username, password, login_url='https://example.com/login'):
    """Log in through the site's login form.

    Args:
        username: Value typed into the element with id 'username'.
        password: Value typed into the element with id 'password'.
        login_url: Address of the login page.

    Returns:
        The element matching '.dashboard' once it is clickable, or None if
        the form could not be found or the dashboard never appeared.
    """
    self.driver.get(login_url)

    username_field = self.find_element_safely(By.ID, 'username')
    password_field = self.find_element_safely(By.ID, 'password')
    if username_field is None or password_field is None:
        # find_element_safely returns None on timeout; stop here instead of
        # failing with AttributeError on None.send_keys.
        print(f'Login form not found at {login_url}')
        return None

    username_field.send_keys(username)
    password_field.send_keys(password)

    submit = self.find_element_safely(By.ID, 'login-button')
    if submit is None:
        print(f'Login button not found at {login_url}')
        return None
    submit.click()

    return self.wait_for_dynamic_content('.dashboard')

Advanced Topics

1. Parallel Scraping

from concurrent.futures import ThreadPoolExecutor

def scrape_single_page(url):
    """Scrape one product page using a browser session of its own.

    A fresh EcommerceScraper is started per call so that worker threads do
    not share a WebDriver session, and the browser is closed afterwards.
    """
    with EcommerceScraper() as scraper:
        return scraper.scrape_product_page(url)


def scrape_multiple_pages(urls, max_workers=4, scrape_fn=None):
    """Scrape several URLs concurrently on a thread pool.

    Args:
        urls: Iterable of page addresses.
        max_workers: Maximum number of worker threads.
        scrape_fn: Callable taking one URL and returning its result.
            Defaults to ``scrape_single_page``.

    Returns:
        A list of results in the same order as ``urls``.

    Raises:
        Whatever ``scrape_fn`` raises; the first failing URL (in input
        order) propagates its exception.
    """
    if scrape_fn is None:
        scrape_fn = scrape_single_page
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # map() submits every URL up front and yields results in input order.
        return list(executor.map(scrape_fn, urls))

2. Custom Wait Conditions

from selenium.webdriver.support.wait import WebDriverWait

def wait_for_text_change(self, element, original_text):
    """Block until the element's text is no longer ``original_text``.

    Args:
        element: Object whose ``text`` attribute is polled.
        original_text: The text the element is expected to move away from.

    Returns:
        None. Whatever ``self.wait.until`` raises on timeout propagates.
    """
    # The wait passes the driver to the condition; it is not needed here
    # because the element is captured by the closure.
    self.wait.until(lambda _driver: element.text != original_text)

Remember to always respect websites' terms of service and implement proper delays between requests to avoid overwhelming servers.

Related terms

Concept map

How "How to extract data from websites using Selenium Python? (2026 Guide)" connects

The terms most directly tied to this one. Hover a node to see its neighbours, click to preview, drag to rearrange.

0 terms · 0 connections
You are here · Python Web Scraping
Building map…

Frequently asked questions

Why is my Selenium script not finding elements?

Usually a timing issue — the element has not rendered yet. Use explicit waits (WebDriverWait + expected_conditions) instead of time.sleep, which is brittle and slow.

Is Selenium detectable as a bot?

Yes. Default WebDriver exposes navigator.webdriver and other signals. Protected sites flag it quickly; stealth-patched browsers or a scraping API are more durable.

Should I use Selenium or Playwright?

Playwright has a cleaner async API, auto-waiting, and better defaults. Selenium remains fine for existing projects and the widest language support.

Last updated: 2026-05-28