Scraper
This commit is contained in:
parent
b194d24901
commit
74071245a1
7107
code/output.json
Normal file
7107
code/output.json
Normal file
File diff suppressed because it is too large
Load Diff
@ -5,14 +5,104 @@ Lassen Sie Ihr Programm laufen und geben Sie die Daten in output.json aus.
|
||||
Laden Sie sowohl das Python-Programm als auch output.json in diese Aufgabe hoch.
|
||||
"""
|
||||
|
||||
import json
|
||||
|
||||
import scrapy
|
||||
|
||||
|
||||
class MySpider(scrapy.Spider):
    """Spider that scrapes book data from books.toscrape.com.

    For every book on every catalogue page it collects title, price,
    star rating, availability text and the absolute detail-page URL,
    yields the record to Scrapy, and also accumulates it in
    ``self.books_data``.  When the spider closes, the accumulated list
    is written as JSON to ``output_file``.
    """

    url = "https://books.toscrape.com/"
    name = "books_spider"
    start_urls = ["https://books.toscrape.com/"]
    output_file = "output.json"

    def __init__(self, *args, **kwargs):
        # Bug fix: the original omitted the super().__init__() call.
        # scrapy.Spider.__init__ handles name/kwargs setup and is
        # required for the spider to be constructed via from_crawler.
        super().__init__(*args, **kwargs)
        # One dict per scraped book; dumped to JSON in closed().
        self.books_data = []

    def start_requests(self):
        # Explicit start request (equivalent to the default derived from
        # start_urls; kept because callers may rely on `url`).
        yield scrapy.Request(url=self.url, callback=self.parse)

    def parse(self, response):
        """Extract all books on the current page and follow pagination.

        Yields one dict per book.  Removed a stray leading ``pass``
        statement that served no purpose.
        """
        books = response.css("article.product_pod")

        for book in books:
            # Prefer the title attribute: the anchor text may be an
            # ellipsis-truncated version of the full title.
            title = book.css("h3 a::attr(title)").get()
            if not title:
                title = book.css("h3 a::text").get()

            price = book.css("p.price_color::text").get()

            # The star rating is encoded in the element's class name,
            # e.g. "star-rating Three" -> 3.
            rating_class = book.css("p.star-rating::attr(class)").get()
            rating = None
            if rating_class:
                rating_words = ["Zero", "One", "Two", "Three", "Four", "Five"]
                for i, word in enumerate(rating_words):
                    if word in rating_class:
                        rating = i
                        break

            # Availability text, e.g. "In stock (22 available)"; the
            # node contains several whitespace-only text fragments.
            availability = book.css("p.instock.availability::text").getall()
            availability_text = (
                " ".join([text.strip() for text in availability if text.strip()])
                if availability
                else None
            )

            # Absolute URL of the book's detail page.
            book_link = book.css("h3 a::attr(href)").get()
            if book_link:
                book_url = response.urljoin(book_link)
            else:
                book_url = None

            book_data = {
                "title": title.strip() if title else None,
                "price": price.strip() if price else None,
                "rating": rating,
                "availability": availability_text,
                "url": book_url,
            }

            self.books_data.append(book_data)
            yield book_data

        # Follow the "next" button until the last catalogue page.
        next_page = response.css("li.next a::attr(href)").get()
        if next_page:
            next_page_url = response.urljoin(next_page)
            yield scrapy.Request(url=next_page_url, callback=self.parse)

    def closed(self, reason):
        """Dump all collected books to ``output_file`` on spider shutdown."""
        with open(self.output_file, "w", encoding="utf-8") as f:
            json.dump(self.books_data, f, ensure_ascii=False, indent=2)
        self.logger.info(
            f"Gespeichert {len(self.books_data)} Bücher in {self.output_file}"
        )
|
||||
|
||||
|
||||
if __name__ == "__main__":
    from scrapy.crawler import CrawlerProcess

    # Scrapy process configuration.
    # Bug fix: the original also configured a FEEDS export to
    # "output.json".  The spider's closed() hook already writes that
    # same file, so two writers raced at shutdown and one silently
    # clobbered the other.  closed() is kept as the single writer.
    process = CrawlerProcess(
        {
            "USER_AGENT": "books-scraper (+http://www.yourdomain.com)",
            "ROBOTSTXT_OBEY": True,
            "DOWNLOAD_DELAY": 1,  # polite delay between requests
            "RANDOMIZE_DOWNLOAD_DELAY": 0.5,
        }
    )

    print("Starte Web Scraping von books.toscrape.com...")
    process.crawl(MySpider)
    process.start()
    print("Scraping abgeschlossen! Daten wurden in output.json gespeichert.")
|
||||
|
Loading…
x
Reference in New Issue
Block a user