Jul 12, 2024
a-last
a-disabled a-last
requests-html, beautifulsoup4
from requests_html import HTMLSession
# Module-level HTML session; get_data() sends every request through it.
s = HTMLSession()
# Starting URL: an Amazon search-results page for the keyword "example".
url = "https://www.amazon.com/s?keywords=example"
get_data
from bs4 import BeautifulSoup
def get_data(url):
    """Fetch *url* and return its HTML parsed into a BeautifulSoup tree.

    The request is sent through the module-level session ``s`` and the
    response text is parsed with the ``'html.parser'`` backend.

    Args:
        url: Address of the page to download.

    Returns:
        The ``BeautifulSoup`` object built from the response text.
    """
    r = s.get(url)  # Use session to get the URL
    soup = BeautifulSoup(r.text, 'html.parser')  # Parse HTML
    return soup
# Runs at module level: downloads `url` and dumps the parsed document.
print(get_data(url)) # Print entire HTML soup for verification
get_next_page
def get_next_page(soup):
    """Return the absolute URL of the next results page, or ``None``.

    Args:
        soup: Parsed page exposing BeautifulSoup's ``find(name, attrs)``.

    Returns:
        The "next" link resolved against ``https://www.amazon.com``, or
        ``None`` when the pagination list is absent, the "next" item is
        disabled (last page), or the item carries no usable link.
    """
    # Function-scope import keeps this block self-contained.
    from urllib.parse import urljoin

    page = soup.find('ul', {'class': 'paginationclass'})  # Replace with actual class
    if page is None:
        # No pagination list on this page, so there is nothing to follow.
        return None

    # A disabled "last" item marks the final page of results.
    if page.find('li', {'class': 'a-disabled a-last'}) is not None:
        return None

    # Walk li -> a -> href defensively; any missing piece means "no next page"
    # instead of an AttributeError/TypeError/KeyError.
    next_item = page.find('li', {'class': 'a-last'})
    link = next_item.find('a') if next_item is not None else None
    href = link.get('href') if link is not None else None
    if not href:
        return None

    # urljoin copes with both site-relative and already-absolute hrefs.
    return urljoin('https://www.amazon.com', href)
# Fetch the starting page and show the link to the page after it.
soup = get_data(url)
print(get_next_page(soup))  # Print next page URL

# Walk the pagination: fetch the current page, advance `url` to its "next"
# link, and stop once get_next_page() returns a falsy value (no next page).
while True:
    soup = get_data(url)
    url = get_next_page(soup)
    if not url:
        break
    print(url)  # Print each next page URL