My Python script uses Selenium to scrape links from a single URL. It handles most URLs without issue, but on some it appears to get stuck or run into memory problems that I'm having a difficult time resolving. I'm not sure how to change the script so that it continues past these URLs and reduces memory/CPU usage.
The expected outcome is that the script locates each 'href' attribute, adds the associated link to a list, and returns that list.
```python
from selenium.webdriver.common.by import By
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import sys

# Scrape single URL
def web_scraper(url):
    url_bool = True
    count_issue = 0

    while url_bool:
        try:
            url_list = []
            userAgent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.6478.114 Safari/537.36'
            options = Options()
            options.add_argument('--headless')
            options.add_argument('--no-sandbox')
            options.add_argument('--disable-dev-shm-usage')
            options.add_argument(f"--user-agent={userAgent}")
            options.add_argument('--disable-gpu')  # Is this needed anymore?
            driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
            driver.get(url)
            all_links = driver.find_elements(By.TAG_NAME, 'a')
            grab_count = 0

            # Grab LINKs from scrape
            for item in all_links:
                try:
                    if (item.get_attribute('href')).startswith('https://'):
                        grab_count = grab_count + 1
                        if item.get_attribute('href') not in url_list:
                            url_list.append(item.get_attribute('href'))
                            #print(item.get_attribute('href'))
                except:
                    pass

            driver.quit()
            print('Grab URL Count: ' + str(grab_count))
            print('Grab Non-duplicate URL count: ' + str(len(url_list)))
            url_bool = False

        # Test for error
        except Exception as e:
            driver.quit()
            f = open("web.log", "a")
            f.write(str(e))
            f.close()
            count_issue = count_issue + 1
            if count_issue == 3:
                url_bool = False
            print("\nDriver (get) error: " + str(e))
            time.sleep(5)
            driver.quit()

    return(url_list)
```
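One thing I noticed while reading back over the error path: if `webdriver.Chrome(...)` itself raises, `driver` is unbound when the except block calls `driver.quit()`, and when `driver.get(url)` fails, `quit()` runs twice. Here is a sketch of how I could restructure the function with `try`/`finally` so the driver is torn down exactly once per attempt (`web_scraper_safe` and `max_attempts` are just names I made up; the imports are the same as above):

```python
def web_scraper_safe(url, max_attempts=3):
    """Sketch: same scrape, but exactly one driver.quit() per attempt."""
    for attempt in range(max_attempts):
        driver = None
        try:
            options = Options()
            options.add_argument('--headless')
            options.add_argument('--no-sandbox')
            options.add_argument('--disable-dev-shm-usage')
            driver = webdriver.Chrome(
                service=Service(ChromeDriverManager().install()),
                options=options,
            )
            driver.get(url)
            hrefs = [a.get_attribute('href')
                     for a in driver.find_elements(By.TAG_NAME, 'a')]
            # dict.fromkeys() de-duplicates while preserving order
            return [h for h in dict.fromkeys(hrefs)
                    if h and h.startswith('https://')]
        except Exception as e:
            with open('web.log', 'a') as f:
                f.write(str(e) + '\n')
            time.sleep(5)
        finally:
            if driver is not None:
                driver.quit()  # runs on success and on failure, exactly once
    return []
```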
I believe this code is where the script halts:
```python
for item in all_links:
    try:
        if (item.get_attribute('href')).startswith('https://'):
            grab_count = grab_count + 1
            if item.get_attribute('href') not in url_list:
                url_list.append(item.get_attribute('href'))
                #print(item.get_attribute('href'))
    except:
        pass
```
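If I understand Selenium correctly, each `item.get_attribute('href')` call is a separate round-trip to ChromeDriver, and this loop makes up to three of them per element, so a page with thousands of anchors multiplies the work, and the bare `except: pass` hides any stale-element errors. Here is a sketch of the same loop with a single round-trip per element (same variables as above):

```python
# Sketch: one get_attribute() round-trip per element instead of up to three
url_list = []
grab_count = 0
for item in all_links:
    try:
        href = item.get_attribute('href')  # single call, result reused below
    except Exception:
        continue  # skip stale/detached elements instead of silently passing
    if href and href.startswith('https://'):  # href can be None
        grab_count += 1
        if href not in url_list:
            url_list.append(href)

# Alternative I'm considering: pull every href in one JavaScript call,
# avoiding per-element round-trips entirely:
# hrefs = driver.execute_script(
#     "return Array.from(document.querySelectorAll('a[href]')).map(a => a.href);")
```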
The script is currently hung up on one particular URL, and a second URL causes the same problem.
While the script is stuck, the server shows high CPU, memory, and disk I/O usage.
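Would setting explicit timeouts help here? As far as I can tell, `driver.get()` blocks until the page finishes loading by default, so a page that never settles would hang the script and keep Chrome burning CPU and memory. A sketch of what I mean, using Selenium 4's page load strategy and page load timeout (the 30-second value is arbitrary, just for illustration):

```python
from selenium.common.exceptions import TimeoutException

# Sketch: bound the blocking call so a pathological page raises
# instead of hanging forever
options.page_load_strategy = 'eager'  # get() returns at DOMContentLoaded
driver = webdriver.Chrome(
    service=Service(ChromeDriverManager().install()), options=options)
driver.set_page_load_timeout(30)  # get() raises TimeoutException after 30s

try:
    driver.get(url)
except TimeoutException:
    driver.quit()
    # log and retry/skip here, same as the existing error handling
```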
```
chromedriver -v
Google Chrome 126.0.6478.114
```