"""Scrapy crawler for https://books.toscrape.com/ (exercise 11).

Task: use Scrapy to build a web crawler for the website described in
exercise 11, run the program, and write the scraped data to output.json.
Both this program and output.json are to be submitted for the assignment.
"""

import json

import scrapy


class MySpider(scrapy.Spider):
    """Collect title, price, rating, availability and URL of every listed book.

    Each book is yielded as a plain ``dict`` item and additionally kept in
    ``self.books_data`` so that ``closed`` can write the complete result to
    ``output_file`` as a single JSON array.
    """

    name = "books_spider"
    start_urls = ["https://books.toscrape.com/"]
    output_file = "output.json"

    # Words looked for in the ``class`` attribute of the star-rating element;
    # the index of the matching word is the numeric rating.
    _RATING_WORDS = ("Zero", "One", "Two", "Three", "Four", "Five")

    def __init__(self, *args, **kwargs):
        """Initialise the base spider and the in-memory result list.

        Positional and keyword arguments are forwarded to
        ``scrapy.Spider.__init__`` so that spider arguments keep working.
        """
        super().__init__(*args, **kwargs)
        self.books_data = []

    def _extract_book(self, book, response):
        """Build the item dict for one ``article.product_pod`` selector.

        Args:
            book: selector scoped to a single book entry.
            response: the page response, used to resolve the relative link.

        Returns:
            A dict with the keys ``title``, ``price``, ``rating``,
            ``availability`` and ``url``; a value is ``None`` when the
            corresponding element is missing.
        """
        # Prefer the ``title`` attribute; fall back to the link text.
        title = book.css("h3 a::attr(title)").get() or book.css("h3 a::text").get()
        price = book.css("p.price_color::text").get()

        # The rating is encoded as a word inside the element's class attribute.
        rating = None
        rating_class = book.css("p.star-rating::attr(class)").get()
        if rating_class:
            rating = next(
                (
                    value
                    for value, word in enumerate(self._RATING_WORDS)
                    if word in rating_class
                ),
                None,
            )

        # Availability is spread over several text nodes with padding
        # whitespace; join the non-empty fragments.
        fragments = book.css("p.instock.availability::text").getall()
        if fragments:
            stripped = (fragment.strip() for fragment in fragments)
            availability_text = " ".join(part for part in stripped if part)
        else:
            availability_text = None

        # Link to the book's detail page, made absolute.
        book_link = book.css("h3 a::attr(href)").get()
        book_url = response.urljoin(book_link) if book_link else None

        return {
            "title": title.strip() if title else None,
            "price": price.strip() if price else None,
            "rating": rating,
            "availability": availability_text,
            "url": book_url,
        }

    def parse(self, response):
        """Yield one item per book on the page, then follow the "next" link.

        Args:
            response: the downloaded catalogue page.

        Yields:
            A dict per book, followed by a ``scrapy.Request`` for the next
            catalogue page when a "next" link is present.
        """
        for book in response.css("article.product_pod"):
            book_data = self._extract_book(book, response)
            self.books_data.append(book_data)
            yield book_data

        # Pagination: look for the "next" button.
        next_page = response.css("li.next a::attr(href)").get()
        if next_page:
            yield scrapy.Request(
                url=response.urljoin(next_page),
                callback=self.parse,
            )

    def closed(self, reason):
        """Write all collected books to ``output_file`` when the spider ends.

        Args:
            reason: the close reason passed in by the caller; not used here.
        """
        with open(self.output_file, "w", encoding="utf-8") as f:
            json.dump(self.books_data, f, ensure_ascii=False, indent=2)
        self.logger.info(
            f"Gespeichert {len(self.books_data)} Bücher in {self.output_file}"
        )


def main():
    """Run ``MySpider`` in-process with polite crawl settings."""
    from scrapy.crawler import CrawlerProcess

    # NOTE: no "FEEDS" entry here on purpose. ``MySpider.closed`` already
    # writes output.json; a feed exporter targeting the same path would be a
    # second writer on that file (and Scrapy's local-file feed storage
    # appends by default), which can leave invalid JSON behind.
    process = CrawlerProcess(
        {
            "USER_AGENT": "books-scraper (+http://www.yourdomain.com)",
            "ROBOTSTXT_OBEY": True,
            "DOWNLOAD_DELAY": 1,  # polite delay between requests
            # Boolean switch, not a factor: a fractional value such as 0.5 is
            # truncated to 0 by Scrapy's boolean coercion and would turn the
            # randomisation off.
            "RANDOMIZE_DOWNLOAD_DELAY": True,
        }
    )

    print("Starte Web Scraping von books.toscrape.com...")
    process.crawl(MySpider)
    process.start()
    print("Scraping abgeschlossen! Daten wurden in output.json gespeichert.")


if __name__ == "__main__":
    main()