"""
|
|
Was ist zu tun?
|
|
Erstellen Sie mittels Scrapy einen Web Crawler für die in Ü11 Website beschriebenen Website.
|
|
Lassen Sie Ihr Programm laufen und geben Sie die Daten in output.json aus.
|
|
Laden Sie das Python-Programm als auch output.json in diese Aufgabe hoch.
|
|
"""

import json

import scrapy


class MySpider(scrapy.Spider):
    name = "books_spider"
    start_urls = ["https://books.toscrape.com/"]
    output_file = "output.json"

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.books_data = []

    def parse(self, response):
        # Extract all books on the current page
        books = response.css("article.product_pod")

        for book in books:
            # Extract book details
            title = book.css("h3 a::attr(title)").get()
            if not title:
                title = book.css("h3 a::text").get()

            price = book.css("p.price_color::text").get()

            # Extract the rating (stars)
            rating_class = book.css("p.star-rating::attr(class)").get()
            rating = None
            if rating_class:
                rating_words = ["Zero", "One", "Two", "Three", "Four", "Five"]
                for i, word in enumerate(rating_words):
                    if word in rating_class:
                        rating = i
                        break
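            # For example, a three-star book is marked up on books.toscrape.com
            # as <p class="star-rating Three">, so rating_class is
            # "star-rating Three" and rating becomes 3.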

            # Availability
            availability = book.css("p.instock.availability::text").getall()
            availability_text = (
                " ".join([text.strip() for text in availability if text.strip()])
                if availability
                else None
            )

            # Book link for further details
            book_link = book.css("h3 a::attr(href)").get()
            if book_link:
                book_url = response.urljoin(book_link)
            else:
                book_url = None

            book_data = {
                "title": title.strip() if title else None,
                "price": price.strip() if price else None,
                "rating": rating,
                "availability": availability_text,
                "url": book_url,
            }

            self.books_data.append(book_data)
            yield book_data

        # Look for the "Next" button for pagination
        next_page = response.css("li.next a::attr(href)").get()
        if next_page:
            next_page_url = response.urljoin(next_page)
            yield scrapy.Request(url=next_page_url, callback=self.parse)
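        # The "Next" link is relative (e.g. "catalogue/page-2.html"), so it is
        # resolved against the current page URL with response.urljoin() before
        # the follow-up request is scheduled.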

    def closed(self, reason):
        # Save all collected data to the JSON file when the spider finishes
        with open(self.output_file, "w", encoding="utf-8") as f:
            json.dump(self.books_data, f, ensure_ascii=False, indent=2)
        self.logger.info(
            f"Saved {len(self.books_data)} books to {self.output_file}"
        )


if __name__ == "__main__":
    from scrapy.crawler import CrawlerProcess

    # Scrapy configuration
    process = CrawlerProcess(
        {
            "USER_AGENT": "books-scraper (+http://www.yourdomain.com)",
            "ROBOTSTXT_OBEY": True,
            "DOWNLOAD_DELAY": 1,  # polite delay between requests
            "RANDOMIZE_DOWNLOAD_DELAY": True,  # vary the delay between 0.5x and 1.5x
            "FEEDS": {
                "output.json": {
                    "format": "json",
                    "encoding": "utf8",
                    "store_empty": False,
                    "indent": 2,
                },
            },
        }
    )
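
    # Note: the FEEDS setting above and the spider's closed() method both
    # write output.json, so either mechanism on its own would be enough.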

    print("Starting web scraping of books.toscrape.com...")
    process.crawl(MySpider)
    process.start()
    print("Scraping finished! The data has been saved to output.json.")
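
# Each entry in output.json has the following shape (values are illustrative,
# not actual scraped data):
#
#     {
#         "title": "Some Book Title",
#         "price": "£12.34",
#         "rating": 3,
#         "availability": "In stock",
#         "url": "https://books.toscrape.com/catalogue/some-book_1/index.html"
#     }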