21 lines
519 B
Python
21 lines
519 B
Python
|
import requests
|
||
|
from bs4 import BeautifulSoup
|
||
|
import queue
|
||
|
import re
|
||
|
import time
|
||
|
import random
|
||
|
|
||
|
# Crawl frontier: a priority queue of (priority, url) tuples. PriorityQueue
# hands back the lowest-valued entry first, so smaller scores are crawled
# earlier.
urls = queue.PriorityQueue()
urls.put((0.5, "https://www.scrapingcourse.com/ecommerce/"))

# URLs that have already been fetched, in crawl order. Passed to crawl_page
# on every iteration.
visited_urls = []

# Drain the frontier one page at a time. get_html and crawl_page are not
# defined in this part of the file and must be provided elsewhere.
while not urls.empty():
    # The priority only orders the queue; it is not needed after dequeuing.
    _, current_url = urls.get()
    soup = BeautifulSoup(get_html(current_url), "html.parser")

    # Record the page as visited before handing it to crawl_page.
    visited_urls.append(current_url)
    # NOTE(review): crawl_page presumably enqueues newly discovered links
    # into `urls`, which is what keeps this loop going - confirm against its
    # definition.
    crawl_page(soup, current_url, visited_urls, urls)

    # Placeholder left by the author for the scraping step (`url` and
    # `products` are not defined in this snippet):
    # if it is a product page:
    # scrape_page(soup, url, products)

    # Pause for a random 1-3 seconds between iterations.
    time.sleep(random.uniform(1, 3))
|