"""
Was ist zu tun?
Erstellen Sie mittels Scrapy einen Web Crawler für die in Ü11 Website beschriebenen Website.
Lassen Sie Ihr Programm laufen und geben Sie die Daten in output.json aus.
Laden Sie das Python-Programm als auch output.json in diese Aufgabe hoch.
"""

import json

import scrapy


class MySpider(scrapy.Spider):
    name = "books_spider"
    start_urls = ["https://books.toscrape.com/"]
    output_file = "output.json"

    def __init__(self, *args, **kwargs):
        # Run the base Spider initializer first, then add our item buffer.
        super().__init__(*args, **kwargs)
        self.books_data = []
    def parse(self, response):
        # Extract all books on the current page
        books = response.css("article.product_pod")
        for book in books:
            # Extract book details
            title = book.css("h3 a::attr(title)").get()
            if not title:
                title = book.css("h3 a::text").get()
            price = book.css("p.price_color::text").get()
            # Extract the rating (stars) from the CSS class,
            # e.g. class="star-rating Three" -> rating = 3
            rating_class = book.css("p.star-rating::attr(class)").get()
            rating = None
            if rating_class:
                rating_words = ["Zero", "One", "Two", "Three", "Four", "Five"]
                for i, word in enumerate(rating_words):
                    if word in rating_class:
                        rating = i
                        break
            # Availability text, stripped of surrounding whitespace
            availability = book.css("p.instock.availability::text").getall()
            availability_text = (
                " ".join(text.strip() for text in availability if text.strip())
                if availability
                else None
            )
            # Absolute book URL for further details
            book_link = book.css("h3 a::attr(href)").get()
            book_url = response.urljoin(book_link) if book_link else None
            book_data = {
                "title": title.strip() if title else None,
                "price": price.strip() if price else None,
                "rating": rating,
                "availability": availability_text,
                "url": book_url,
            }
            self.books_data.append(book_data)
            yield book_data
        # Follow the "Next" link to paginate through the catalogue
        next_page = response.css("li.next a::attr(href)").get()
        if next_page:
            next_page_url = response.urljoin(next_page)
            yield scrapy.Request(url=next_page_url, callback=self.parse)
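        # Note: Scrapy deduplicates requests by default (dupefilter), so an
        # already-visited page URL would be filtered out, not crawled again.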

    def closed(self, reason):
        # Persist all collected data to the JSON file once the spider finishes
        with open(self.output_file, "w", encoding="utf-8") as f:
            json.dump(self.books_data, f, ensure_ascii=False, indent=2)
        self.logger.info(
            f"Saved {len(self.books_data)} books to {self.output_file}"
        )
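
# For reference, each exported record looks roughly like this (illustrative
# values based on the first book on books.toscrape.com, not guaranteed
# verbatim):
# {
#     "title": "A Light in the Attic",
#     "price": "£51.77",
#     "rating": 3,
#     "availability": "In stock",
#     "url": "https://books.toscrape.com/catalogue/a-light-in-the-attic_1000/index.html",
# }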

if __name__ == "__main__":
    from scrapy.crawler import CrawlerProcess

    # Scrapy configuration. No FEEDS entry is set here: the spider already
    # writes output.json itself in closed(), and configuring a feed export
    # for the same file would produce the data twice.
    process = CrawlerProcess(
        {
            "USER_AGENT": "books-scraper (+http://www.yourdomain.com)",
            "ROBOTSTXT_OBEY": True,
            "DOWNLOAD_DELAY": 1,  # Polite delay between requests
            # Boolean setting; when enabled, Scrapy waits between 0.5x and
            # 1.5x of DOWNLOAD_DELAY instead of a fixed interval.
            "RANDOMIZE_DOWNLOAD_DELAY": True,
        }
    )
    print("Starting web scraping of books.toscrape.com...")
    process.crawl(MySpider)
    process.start()
    print("Scraping finished! Data saved to output.json.")