This commit is contained in:
DotNaos 2025-06-09 13:41:42 +02:00
parent b194d24901
commit 74071245a1
2 changed files with 7201 additions and 4 deletions

code/output.json (new file, 7107 lines)
Diff suppressed because it is too large.


@@ -5,14 +5,104 @@ Run your program and write the data to output.json.
Upload both the Python program and output.json to this assignment.
"""
import json

import scrapy


class MySpider(scrapy.Spider):
    name = "books_spider"
    start_urls = ["https://books.toscrape.com/"]
    output_file = "output.json"

    def __init__(self, *args, **kwargs):
        # Forward arguments to scrapy.Spider so the base class is initialized.
        super().__init__(*args, **kwargs)
        self.books_data = []

    def parse(self, response):
        # Extract all books on the current page
        books = response.css("article.product_pod")
        for book in books:
            # Extract book details
            title = book.css("h3 a::attr(title)").get()
            if not title:
                title = book.css("h3 a::text").get()
            price = book.css("p.price_color::text").get()

            # Extract the star rating: the class attribute has the form
            # "star-rating Three", so map the rating word to a number.
            rating_class = book.css("p.star-rating::attr(class)").get()
            rating = None
            if rating_class:
                rating_words = ["Zero", "One", "Two", "Three", "Four", "Five"]
                for i, word in enumerate(rating_words):
                    if word in rating_class:
                        rating = i
                        break

            # Availability text, e.g. "In stock"
            availability = book.css("p.instock.availability::text").getall()
            availability_text = (
                " ".join([text.strip() for text in availability if text.strip()])
                if availability
                else None
            )
            # Absolute link to the book's detail page
            book_link = book.css("h3 a::attr(href)").get()
            book_url = response.urljoin(book_link) if book_link else None

            book_data = {
                "title": title.strip() if title else None,
                "price": price.strip() if price else None,
                "rating": rating,
                "availability": availability_text,
                "url": book_url,
            }
            self.books_data.append(book_data)
            yield book_data
# Suche nach "Next" Button für Pagination
next_page = response.css("li.next a::attr(href)").get()
if next_page:
next_page_url = response.urljoin(next_page)
yield scrapy.Request(url=next_page_url, callback=self.parse)
def closed(self, reason):
# Speichere alle Daten in JSON-Datei wenn Spider beendet wird
with open(self.output_file, "w", encoding="utf-8") as f:
json.dump(self.books_data, f, ensure_ascii=False, indent=2)
self.logger.info(
f"Gespeichert {len(self.books_data)} Bücher in {self.output_file}"
)
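
# Note: instead of the CrawlerProcess boilerplate below, the spider could also
# be run via Scrapy's CLI (the file name here is a hypothetical example);
# closed() still takes care of writing output.json:
#   scrapy runspider books_spider.py
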
if __name__ == "__main__":
from scrapy.crawler import CrawlerProcess
# Konfiguration für Scrapy
process = CrawlerProcess(
{
"USER_AGENT": "books-scraper (+http://www.yourdomain.com)",
"ROBOTSTXT_OBEY": True,
"DOWNLOAD_DELAY": 1, # Höfliche Verzögerung zwischen Anfragen
"RANDOMIZE_DOWNLOAD_DELAY": 0.5,
"FEEDS": {
"output.json": {
"format": "json",
"encoding": "utf8",
"store_empty": False,
"indent": 2,
},
},
}
)
print("Starte Web Scraping von books.toscrape.com...")
process.crawl(MySpider)
process.start()
print("Scraping abgeschlossen! Daten wurden in output.json gespeichert.")