Quick Setup Guide
import time

from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
class ModernSeleniumScraper:
    """Scraper base class that owns a Chrome WebDriver and a shared explicit wait.

    Instances can be used as context managers; leaving the ``with`` block
    calls ``driver.quit()``.
    """

    def __init__(self, headless=True, timeout=10):
        """Start Chrome and build the shared ``WebDriverWait``.

        Args:
            headless: When true, ``--headless`` is passed to Chrome.
            timeout: Seconds the shared ``self.wait`` waits before it raises
                ``TimeoutException``. Defaults to the previous fixed value.
        """
        options = webdriver.ChromeOptions()
        if headless:
            options.add_argument('--headless')
        # NOTE(review): these two flags are commonly needed when Chrome runs
        # inside containers/CI; they are applied regardless of ``headless``.
        options.add_argument('--no-sandbox')
        options.add_argument('--disable-dev-shm-usage')
        self.driver = webdriver.Chrome(options=options)
        self.timeout = timeout
        self.wait = WebDriverWait(self.driver, timeout=timeout)

    def __enter__(self):
        """Return the scraper itself so it can be bound with ``as``."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Quit the browser; exceptions from the ``with`` body are not suppressed."""
        self.driver.quit()
Essential Features
1. Finding Elements Smartly
def find_element_safely(self, by, value, timeout=10):
    """Wait for an element to be present in the DOM and return it.

    Args:
        by: Locator strategy (a ``By`` constant).
        value: Locator value matching ``by``.
        timeout: Maximum number of seconds to wait for the element.

    Returns:
        The located element, or ``None`` if it did not appear in time.
    """
    # Build the wait from ``timeout``: the shared ``self.wait`` has a fixed
    # duration, so using it would silently ignore this argument.
    waiter = WebDriverWait(self.driver, timeout)
    try:
        return waiter.until(EC.presence_of_element_located((by, value)))
    except TimeoutException:
        print(f'Element {value} not found within {timeout} seconds')
        return None
# Usage examples:
# find_element_safely is an instance method, so it must be called on a scraper.
# The context manager quits the browser when the block ends.
# NOTE(review): load a page with scraper.driver.get(url) first; on a blank
# page each lookup simply times out and yields None.
with ModernSeleniumScraper() as scraper:
    button = scraper.find_element_safely(By.ID, 'submit-button')
    heading = scraper.find_element_safely(By.CSS_SELECTOR, 'h1.title')
    link = scraper.find_element_safely(By.XPATH, '//a[contains(text(), "Next")]')
2. Handling Dynamic Content
def wait_for_dynamic_content(self, selector, timeout=10):
    """Wait until the element matching a CSS selector is clickable.

    Args:
        selector: CSS selector of the element to wait for.
        timeout: Maximum number of seconds to wait.

    Returns:
        The clickable element, or ``None`` if the wait timed out.
    """
    # Honour ``timeout`` with a dedicated wait; the shared ``self.wait`` has
    # a fixed duration and would ignore the argument.
    waiter = WebDriverWait(self.driver, timeout)
    try:
        # Wait for element to be clickable
        return waiter.until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, selector))
        )
    except TimeoutException:
        print(f'Dynamic content not loaded: {selector}')
        return None
# Handle infinite scroll
def scroll_to_bottom(self, pause=2, max_scrolls=None):
    """Scroll to the page bottom repeatedly until the page stops growing.

    After each scroll the document height is re-read; the loop ends when the
    height is unchanged or when ``max_scrolls`` scrolls have been performed.

    Args:
        pause: Seconds to sleep after each scroll so new content can load.
        max_scrolls: Upper bound on the number of scrolls. ``None`` keeps
            the original unbounded behaviour, which never returns on a page
            that grows forever.
    """
    last_height = self.driver.execute_script('return document.body.scrollHeight')
    scrolls = 0
    while max_scrolls is None or scrolls < max_scrolls:
        self.driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
        scrolls += 1
        time.sleep(pause)  # Allow content to load
        new_height = self.driver.execute_script('return document.body.scrollHeight')
        if new_height == last_height:
            # Height did not change after scrolling: nothing more was loaded.
            break
        last_height = new_height
3. Real-World Example: E-commerce Scraper
class EcommerceScraper(ModernSeleniumScraper):
    """Scraper that extracts the main fields of a product page."""

    def scrape_product_page(self, url):
        """Load ``url`` and collect the product fields into a dict.

        Returns:
            A dict with keys ``title``, ``price``, ``description``,
            ``rating``, ``reviews`` and ``url``, or ``None`` if any step
            raised an exception.
        """
        try:
            self.driver.get(url)
            # Wait for main content
            self.wait_for_dynamic_content('.product-container')
            return {
                'title': self.get_text('h1.product-title'),
                'price': self.get_price('.product-price'),
                'description': self.get_text('.product-description'),
                'rating': self.get_rating('.product-rating'),
                'reviews': self.get_reviews('.review-section'),
                'url': url
            }
        except Exception as e:
            # Deliberately broad: one failing page is reported, not fatal.
            print(f'Error scraping {url}: {e}')
            return None

    def get_text(self, selector):
        """Return the stripped text of the element at ``selector``, or ``None``."""
        element = self.find_element_safely(By.CSS_SELECTOR, selector)
        return element.text.strip() if element else None

    def get_price(self, selector):
        """Return the price at ``selector`` as a float, or ``None``.

        A ``$`` sign and thousands-separator commas are removed before
        conversion; text that still is not a number yields ``None``.
        """
        price_elem = self.find_element_safely(By.CSS_SELECTOR, selector)
        if not price_elem:
            return None
        price_text = price_elem.text.strip().replace('$', '').replace(',', '')
        try:
            return float(price_text)
        except ValueError:
            return None

    def get_rating(self, selector):
        """Return the first number in the rating element's text, or ``None``.

        The text is split on whitespace and ``/``, so both ``"4.5 out of 5"``
        and ``"4.5/5"`` give ``4.5``.
        """
        rating_elem = self.find_element_safely(By.CSS_SELECTOR, selector)
        if not rating_elem:
            return None
        for token in rating_elem.text.replace('/', ' ').split():
            try:
                return float(token)
            except ValueError:
                continue
        return None

    def get_reviews(self, selector, item_selector='.review'):
        """Return the non-empty review texts found inside ``selector``.

        Args:
            selector: CSS selector of the element containing the reviews.
            item_selector: CSS selector of one review inside that element.
                NOTE(review): ``.review`` is an assumed default; adjust it
                to the markup of the site being scraped.

        Returns:
            A list of stripped review strings; empty when the container or
            the individual reviews are not found.
        """
        section = self.find_element_safely(By.CSS_SELECTOR, selector)
        if not section:
            return []
        texts = (item.text.strip()
                 for item in section.find_elements(By.CSS_SELECTOR, item_selector))
        return [text for text in texts if text]
Best Practices
1. Error Handling
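- Wrap Selenium calls that can fail in try-except blocks and catch specific exceptions (e.g. TimeoutException) rather than everything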
- Implement timeouts
- Handle stale elements (re-locate the element and retry when StaleElementReferenceException is raised)
- Log errors properly
2. Performance Optimization
- Use headless mode when possible
- Implement element caching
- Minimize page loads
- Clean up resources
3. Anti-Detection Measures
def configure_stealth_options(self):
    """Build and return a new ``ChromeOptions`` with the anti-detection settings.

    Two command-line arguments and two experimental options are set, in the
    order listed below. The returned object is not applied to any driver here.
    """
    stealth = webdriver.ChromeOptions()

    arguments = (
        '--disable-blink-features=AutomationControlled',
        '--disable-infobars',
    )
    for flag in arguments:
        stealth.add_argument(flag)

    experimental = (
        ('excludeSwitches', ['enable-automation']),
        ('useAutomationExtension', False),
    )
    for option_name, option_value in experimental:
        stealth.add_experimental_option(option_name, option_value)

    return stealth
4. Data Validation
def validate_extracted_data(self, data):
    """Return True when ``data`` has a usable title, price and description.

    A field is usable when it is not ``None`` and, for strings, not blank.
    Numeric zero counts as present, so a price of ``0.0`` is valid.

    Args:
        data: Mapping of scraped fields, or ``None`` when scraping failed.

    Returns:
        ``False`` for ``None``/empty ``data`` or any missing required field,
        otherwise ``True``.
    """
    if not data:
        # A failed scrape yields None; treat it as invalid instead of
        # raising AttributeError on ``.get``.
        return False

    def _present(value):
        # Presence check: plain truthiness would reject a 0.0 price.
        if value is None:
            return False
        if isinstance(value, str):
            return bool(value.strip())
        return True

    required_fields = ('title', 'price', 'description')
    return all(_present(data.get(field)) for field in required_fields)
Common Challenges & Solutions
1. Handling Popups
def handle_popup(self, timeout=None):
    """Close a popup if one appears.

    Waits for an element with class ``popup``, then clicks the first
    descendant with class ``close-button``.

    Args:
        timeout: Seconds to wait for the popup. ``None`` uses the shared
            ``self.wait``; pass a small value to avoid a long block on
            pages that usually have no popup.

    Returns:
        ``True`` if a close button was clicked, ``False`` if no popup
        appeared or it had no close button.
    """
    waiter = self.wait if timeout is None else WebDriverWait(self.driver, timeout)
    try:
        popup = waiter.until(
            EC.presence_of_element_located((By.CLASS_NAME, 'popup'))
        )
    except TimeoutException:
        return False  # No popup found

    # find_elements returns an empty list instead of raising when the popup
    # has no close button, so this stays best-effort.
    close_buttons = popup.find_elements(By.CLASS_NAME, 'close-button')
    if not close_buttons:
        return False
    close_buttons[0].click()
    return True
2. Managing Sessions
def login(self, username, password, login_url='https://example.com/login'):
    """Fill in and submit the login form, then wait for the dashboard.

    Args:
        username: Text typed into the element with id ``username``.
        password: Text typed into the element with id ``password``.
        login_url: Address of the login page.

    Returns:
        The element matching ``.dashboard`` once it is clickable, or
        ``None`` if a form element was not found or the dashboard never
        appeared.
    """
    self.driver.get(login_url)
    username_field = self.find_element_safely(By.ID, 'username')
    password_field = self.find_element_safely(By.ID, 'password')
    # find_element_safely returns None on timeout; stop here rather than
    # fail with AttributeError on ``None.send_keys``.
    if username_field is None or password_field is None:
        return None
    username_field.send_keys(username)
    password_field.send_keys(password)
    submit = self.find_element_safely(By.ID, 'login-button')
    if submit is None:
        return None
    submit.click()
    return self.wait_for_dynamic_content('.dashboard')
Advanced Topics
1. Parallel Scraping
from concurrent.futures import ThreadPoolExecutor
def scrape_multiple_pages(urls, max_workers=4):
    """Run ``scrape_single_page`` for every URL on a thread pool.

    All URLs are submitted up front; results are then collected in the same
    order as ``urls``. An exception raised by a worker propagates from the
    corresponding ``result()`` call.

    Args:
        urls: Iterable of URLs, each passed to ``scrape_single_page``.
        max_workers: Size of the thread pool.

    Returns:
        A list with one result per URL, in input order.
    """
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        pending = [pool.submit(scrape_single_page, page_url) for page_url in urls]
        return [task.result() for task in pending]
2. Custom Wait Conditions
from selenium.webdriver.support.wait import WebDriverWait
def wait_for_text_change(self, element, original_text):
    """Block until ``element.text`` is no longer equal to ``original_text``.

    The check is handed to ``self.wait.until`` as a custom wait condition;
    whatever ``until`` raises when the condition is never met propagates
    to the caller. Nothing is returned.
    """
    # The condition receives the driver but only inspects the element.
    self.wait.until(lambda _driver: element.text != original_text)
Remember to always respect websites' terms of service and implement proper delays between requests to avoid overwhelming servers.
