This works very well on Windows plaftform but fails to scrape and trigger bot messages on ubuntu server.
from selenium import webdriverfrom selenium.webdriver.common.by import Byfrom selenium.webdriver.support.ui import WebDriverWaitfrom selenium.webdriver.support import expected_conditions as ECfrom selenium.webdriver.chrome.service import Servicefrom selenium.webdriver.chrome.options import Optionsfrom bs4 import BeautifulSoupimport timeimport requestsimport osfrom sys import platformimport loggingfrom PyPDF2 import PdfReaderurl = 'https://www.nseindia.com/companies-listing/corporate-filings-announcements'bot_token = 'PjME_E'bot_chatID = '17HDGH2'if platform.startswith('win'): logging.basicConfig(format='%(asctime)s - %(name)s - %(threadName)s - %(levelname)s - %(message)s', level=logging.INFO, handlers=[logging.FileHandler("nse_announcements.log"), logging.StreamHandler()])else: logging.basicConfig(format='%(asctime)s - %(name)s - %(threadName)s - %(levelname)s - %(message)s', level=logging.INFO, handlers=[logging.FileHandler("/root/nse_announcements_bot/nse_announcements.log"), logging.StreamHandler()])if platform.startswith('win'): # Define the paths for ChromeDriver and Chrome for Windows service_path = os.path.join(os.getcwd(), "chromedriver-win64", "chromedriver.exe") chrome_dev_path = os.path.join(os.getcwd(), "chrome-win64", "chrome.exe") pdf_link_file_path = 'announcements_pdf_links.txt' temp_pdf = 'temp.pdf'else: service_path = os.path.join("/root/nse_announcements_bot/chromedriver-linux64", "chromedriver") chrome_dev_path = os.path.join( "/root/nse_announcements_bot/chrome-linux64", "chrome") pdf_link_file_path = '/root/nse_announcements_bot/announcements_pdf_links.txt' temp_pdf = '/root/nse_announcements_bot/temp.pdf' logging.info(f"Using Linux paths: {service_path, chrome_dev_path}")requests.get(f'https://api.telegram.org/bot{bot_token}/sendMessage', params={'chat_id': bot_chatID, 'text': "Hello! I'm now active and ready to fetch NSE announcements for you."})chrome_options = Options()chrome_options.add_argument("--disable-gpu")chrome_options.add_argument('--headless')chrome_options.add_argument("--no-sandbox")chrome_options.add_argument("--disable-webgl")chrome_options.add_argument("--disable-infobars")chrome_options.add_argument("--disable-popup-blocking")chrome_options.add_argument("--disable-notifications")chrome_options.add_argument("--window-size=1920,1080")chrome_options.binary_location = chrome_dev_pathchrome_options.add_argument('log-level=3')chrome_options.page_load_strategy = 'normal'user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"chrome_options.add_argument(f'user-agent={user_agent}')def clean_text(text): return text.replace('&', '').replace('<', '').replace('>', '').replace('"', '').replace("'", "")def send_telegram_message(data): message = f"<b>Symbol:</b> {data['Symbol']}\n" \ f"<b>Company Name:</b> {data['Company Name']}\n" \ f"<b>Broadcast Date/Time:</b> {data['Broadcast Date/Time']}\n" \ f"<b>Subject:</b> {data['Subject']}\n" \ f"<b>PDF Link:</b> {data['PDF Link']}" url = f'https://api.telegram.org/bot{bot_token}/sendMessage' params = {'chat_id': bot_chatID,'parse_mode': 'HTML','text': message } max_attempts = 3 # Maximum number of attempts to send the message for attempt in range(max_attempts): response = requests.get(url, params=params) if response.status_code == 200: # HTTP OK response_data = response.json() if response_data.get("ok"): return True return Falseexisting_links = set()# Load existing links from file if it existstry: with open(pdf_link_file_path, 'r') as file: existing_links = set(line.strip() for line in file)except FileNotFoundError: passavoidable_keywords = ['74(5)', '7(3)', '40(9)', '40(10)', '39(3)', 'Newspaper Publication', 'Analysts/Institutional Investor', 'Analyst/ Institutional', 'Share Based Employee Benefits and Sweat Equity','SEBI (Depositories and Participants) Regulations, 2018', 'Registrars and Share Transfer','newspaper','advertisement', 'Depository Participants', 'investor presentation']avoidable_subjects = ['Analysts/Institutional Investor', 'Loss of Share Certificates']while True: try: with webdriver.Chrome(service=Service(service_path), options=chrome_options) as driver: wait = WebDriverWait(driver, 20) driver.get(url) time.sleep(20) wait.until(EC.presence_of_element_located((By.ID, "CFanncEquityTable"))) html_source = driver.page_source soup = BeautifulSoup(html_source, 'html.parser') table = soup.find('table', {'id': 'CFanncEquityTable'}) if table: logging.info('table found..') if not table or table.find_all('a', href=True) == []: continue new_link_found = False for row in table.find_all('tr'): cells = row.find_all('td') if len(cells) > 4: pdf_link = cells[4].find('a', href=True)['href'] if cells[4].find('a', href=True) and cells[4].find('a', href=True)['href'].endswith('.pdf') else None subject = clean_text(cells[2].text.strip()) if pdf_link and pdf_link not in existing_links: if any(keyword.lower() in subject.lower() for keyword in avoidable_subjects): logging.info(f'************** SKIPPING PDF SINCE IT CONTAINS ONE OF THE AVOIDABLE SUBJECTS ***************') existing_links.add(pdf_link) with open(pdf_link_file_path, 'a') as file: file.write(pdf_link +'\n') continue try: session = requests.Session() session.headers.update({'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36' }) response = session.get(pdf_link, timeout=10) with open(temp_pdf, "wb") as f: f.write(response.content) reader = PdfReader(temp_pdf) first_page = reader.pages[0] text = first_page.extract_text() if any(keyword.lower() in text.lower() for keyword in avoidable_keywords): logging.info(f'************** SKIPPING PDF SINCE IT CONTAINS ONE OF THE AVOIDABLE KEYWORDS ***************') existing_links.add(pdf_link) with open(pdf_link_file_path, 'a') as file: file.write(pdf_link +'\n') continue # Skip this PDF as it either contains no text or avoidable keywords except requests.exceptions.RequestException as e: logging.info(f"Request failed: {e}") pass new_link_found = True # Clean the symbol and check if it was edited symbol = cells[0].text.strip() original_symbol = symbol symbol = clean_text(symbol) # Append "(Edited)" if the symbol was changed if symbol != original_symbol: symbol = symbol +" (Edited)" data = {'Symbol': symbol,'Company Name': clean_text(cells[1].text.strip()),'Subject': subject,'Details': clean_text(cells[3].text.strip()),'Broadcast Date/Time': cells[5].text.strip().split("Exchange")[0].strip(),'PDF Link': pdf_link } logging.info(data) should_link_be_saved = send_telegram_message(data) if should_link_be_saved: existing_links.add(pdf_link) with open(pdf_link_file_path, 'a') as file: file.write(pdf_link +'\n') if not new_link_found: logging.info('No new PDF detected OR table data insufficient...') except Exception as e: logging.info(f'Failed... trying again...')
in Ubuntu, I get Failed... trying again... log message only.But in windows, it works very well, scrapes the new announcements and trigger the telegram bot message.
Any suggestions?