"""
|
|
Was ist zu tun?
|
|
Erstellen Sie mittels Scrapy einen Web Crawler für die in Ü11 Website beschriebenen Website.
|
|
Lassen Sie Ihr Programm laufen und geben Sie die Daten in output.json aus.
|
|
Laden Sie das Python-Programm als auch output.json in diese Aufgabe hoch.
|
|
"""

import json

import scrapy


class MySpider(scrapy.Spider):
    name = "books_spider"
    start_urls = ["https://books.toscrape.com/"]
    output_file = "output.json"

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.books_data = []

    def parse(self, response):
        # Extract all books on the current page
        books = response.css("article.product_pod")

        for book in books:
            # Extract book details
            title = book.css("h3 a::attr(title)").get()
            if not title:
                title = book.css("h3 a::text").get()

            price = book.css("p.price_color::text").get()

            # Extract the rating (stars)
            rating_class = book.css("p.star-rating::attr(class)").get()
            rating = None
            if rating_class:
                rating_words = ["Zero", "One", "Two", "Three", "Four", "Five"]
                for i, word in enumerate(rating_words):
                    if word in rating_class:
                        rating = i
                        break
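            # For example, a three-star book is marked up on books.toscrape.com
            # as <p class="star-rating Three">, so rating_class is
            # "star-rating Three" and rating becomes 3.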

            # Availability
            availability = book.css("p.instock.availability::text").getall()
            availability_text = (
                " ".join([text.strip() for text in availability if text.strip()])
                if availability
                else None
            )

            # Book link for further details
            book_link = book.css("h3 a::attr(href)").get()
            if book_link:
                book_url = response.urljoin(book_link)
            else:
                book_url = None

            book_data = {
                "title": title.strip() if title else None,
                "price": price.strip() if price else None,
                "rating": rating,
                "availability": availability_text,
                "url": book_url,
            }

            self.books_data.append(book_data)
            yield book_data

        # Look for the "Next" button for pagination
        next_page = response.css("li.next a::attr(href)").get()
        if next_page:
            next_page_url = response.urljoin(next_page)
            yield scrapy.Request(url=next_page_url, callback=self.parse)
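        # The "Next" link is relative (e.g. "catalogue/page-2.html"), so it is
        # resolved against the current page URL with response.urljoin() before
        # the follow-up request is scheduled.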

    def closed(self, reason):
        # Save all collected data to the JSON file when the spider finishes
        with open(self.output_file, "w", encoding="utf-8") as f:
            json.dump(self.books_data, f, ensure_ascii=False, indent=2)
        self.logger.info(
            f"Saved {len(self.books_data)} books to {self.output_file}"
        )


if __name__ == "__main__":
    from scrapy.crawler import CrawlerProcess

    # Scrapy configuration
    process = CrawlerProcess(
        {
            "USER_AGENT": "books-scraper (+http://www.yourdomain.com)",
            "ROBOTSTXT_OBEY": True,
            "DOWNLOAD_DELAY": 1,  # polite delay between requests
            "RANDOMIZE_DOWNLOAD_DELAY": True,  # vary the delay between 0.5x and 1.5x
            "FEEDS": {
                "output.json": {
                    "format": "json",
                    "encoding": "utf8",
                    "store_empty": False,
                    "indent": 2,
                },
            },
        }
    )
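
    # Note: the FEEDS setting above and the spider's closed() method both
    # write output.json, so either mechanism on its own would be enough.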

    print("Starting web scraping of books.toscrape.com...")
    process.crawl(MySpider)
    process.start()
    print("Scraping finished! The data has been saved to output.json.")
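
# Each entry in output.json has the following shape (values are illustrative,
# not actual scraped data):
#
#     {
#         "title": "Some Book Title",
#         "price": "£12.34",
#         "rating": 3,
#         "availability": "In stock",
#         "url": "https://books.toscrape.com/catalogue/some-book_1/index.html"
#     }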