cds_introduction_data_scien.../code/crawler.py

import requests
from bs4 import BeautifulSoup
import queue
import re
import time
import random
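
# get_html() and crawl_page() are called below but not defined in this file, so they
# presumably live elsewhere in the repo. The two helpers that follow are a minimal
# illustrative sketch of what they might do, not the author's originals: get_html()
# is assumed to fetch a page with requests, and crawl_page() is assumed to enqueue
# unvisited on-site links, giving pagination pages a higher priority (lower number).
from urllib.parse import urljoin


def get_html(url):
    """Fetch a URL and return its HTML, or an empty string on failure (assumed helper)."""
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        return response.text
    except requests.RequestException:
        return ""


def crawl_page(soup, current_url, visited_urls, urls):
    """Find links on the page and enqueue unvisited ones with a priority (assumed helper)."""
    for link in soup.find_all("a", href=True):
        absolute_url = urljoin(current_url, link["href"])
        # Stay on the target site and skip pages already visited
        # (URLs already sitting in the queue are not deduplicated in this sketch).
        if not absolute_url.startswith("https://www.scrapingcourse.com/ecommerce"):
            continue
        if absolute_url in visited_urls:
            continue
        # Pagination pages (e.g. .../page/2/) are crawled before individual products.
        priority = 0.5 if re.search(r"/page/\d+", absolute_url) else 1.0
        urls.put((priority, absolute_url))
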
# Priority queue of pages to visit: entries are (priority, url), lowest priority first.
urls = queue.PriorityQueue()
urls.put((0.5, "https://www.scrapingcourse.com/ecommerce/"))
visited_urls = []

while not urls.empty():
    # Take the most promising URL, download it, and parse the HTML.
    _, current_url = urls.get()
    soup = BeautifulSoup(get_html(current_url), "html.parser")
    visited_urls.append(current_url)

    # Discover links on this page and enqueue the ones not crawled yet.
    crawl_page(soup, current_url, visited_urls, urls)

    # if it is a product page:
    # scrape_page(soup, url, products)

    # Random delay between requests to avoid overloading the server.
    time.sleep(random.uniform(1, 3))
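
# The commented-out scrape_page(soup, url, products) call above points to a product
# extraction step that is not defined here. The function below is a hypothetical
# sketch only: the li.product and .price selectors are assumptions about the
# scrapingcourse.com markup, not taken from the original code, and `products` would
# have to be created (e.g. products = []) before the loop for the call to work.
def scrape_page(soup, url, products):
    """Append one dict per product found on the page to products (assumed helper)."""
    for item in soup.select("li.product"):
        name = item.select_one("h2")
        price = item.select_one(".price")
        products.append({
            "url": url,
            "name": name.get_text(strip=True) if name else None,
            "price": price.get_text(strip=True) if price else None,
        })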