Course pdf extractor

2024-10-26 20:33:00 +02:00 · 2024-10-26 20:33:00 +02:00 · 2c69edd489
parent f21d288959
commit 2c69edd489
7 changed files with 166 additions and 2560 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1,3 +1,8 @@
+# Temporary files
+**/*.log
+**/data
+
+
 # IDE
 **/.idea/

@ -41,3 +46,4 @@
 # Latex
 !**/out/*.pdf
 **/auxil/*
+
--- a/out/main.pdf
+++ b/out/main.pdf
--- a/package-lock.json
+++ b/package-lock.json
--- a/package.json
+++ b/package.json
@ -1,6 +0,0 @@
-{
-  "devDependencies": {
-    "@unocss/postcss": "^0.63.4",
-    "unocss": "^0.63.4"
-  }
-}
--- a/src/backend/course_content_extractor.py
+++ b/src/backend/course_content_extractor.py
@ -0,0 +1,79 @@
+import os
+import zipfile
+import shutil
+import tempfile
+import subprocess
+import sys
+
+class CourseContentExtractor:
+    """Collect the PDF material contained in downloaded course ZIP archives.
+
+    Every ``*.zip`` file found in ``download_dir`` is unpacked into a
+    temporary directory. PDF files are copied to
+    ``<output_dir>/<archive base name>`` and PowerPoint files are converted
+    to PDF with LibreOffice; when the conversion is not possible the original
+    presentation is copied instead.
+
+    Note:
+        The folder structure inside an archive is flattened, so two files
+        with the same name in different folders end up at the same
+        destination and the one processed last wins.
+    """
+
+    def __init__(self, download_dir, output_dir=None):
+        """Store the source and destination directories.
+
+        Args:
+            download_dir: Directory that contains the course ZIP archives.
+            output_dir: Root directory for the extracted files. Defaults to
+                a ``data`` folder inside the current working directory.
+        """
+        self.download_dir = download_dir
+        self.output_dir = output_dir or os.path.join(os.getcwd(), 'data')
+
+    def extract_contents(self):
+        """Process every ZIP archive in ``download_dir``.
+
+        An archive is deleted once its contents were processed. A file that
+        is not a valid ZIP archive is reported and left in place so that it
+        does not abort the processing of the remaining archives.
+        """
+        os.makedirs(self.output_dir, exist_ok=True)
+
+        # Sorted for a reproducible processing order; the extension check is
+        # case-insensitive, like the PDF/PowerPoint checks further down.
+        for filename in sorted(os.listdir(self.download_dir)):
+            if not filename.lower().endswith('.zip'):
+                continue
+
+            zip_path = os.path.join(self.download_dir, filename)
+            # The archive base name doubles as the course folder name.
+            course_name = os.path.splitext(filename)[0]
+            course_output_dir = os.path.join(self.output_dir, course_name)
+
+            try:
+                self._extract_archive(zip_path, course_output_dir)
+            except zipfile.BadZipFile as e:
+                print(f"Skipping {filename}: not a valid ZIP archive. Error: {e}")
+                continue
+
+            # Delete the ZIP file only after it was processed successfully.
+            os.remove(zip_path)
+        print(f"All PDF and PowerPoint files have been extracted to {self.output_dir}")
+
+    def _extract_archive(self, zip_path, course_output_dir):
+        """Unpack one archive and move its PDF/PowerPoint content.
+
+        Args:
+            zip_path: Path of the ZIP archive to read.
+            course_output_dir: Directory that receives the PDF files; it is
+                created once the archive has been unpacked.
+
+        Raises:
+            zipfile.BadZipFile: If ``zip_path`` is not a valid ZIP archive.
+        """
+        with tempfile.TemporaryDirectory() as temp_extract_dir:
+            with zipfile.ZipFile(zip_path, 'r') as zip_ref:
+                zip_ref.extractall(temp_extract_dir)
+
+            os.makedirs(course_output_dir, exist_ok=True)
+            for root, _dirs, files in os.walk(temp_extract_dir):
+                for file in files:
+                    file_path = os.path.join(root, file)
+                    lowered = file.lower()
+                    if lowered.endswith('.pdf'):
+                        shutil.copy2(file_path, course_output_dir)
+                    elif lowered.endswith(('.ppt', '.pptx')):
+                        self.convert_ppt_to_pdf(file_path, course_output_dir)
+
+    def convert_ppt_to_pdf(self, ppt_path, output_dir):
+        """Convert a PowerPoint file to PDF using headless LibreOffice.
+
+        If LibreOffice is not available or exits with an error, a message is
+        printed and the original presentation is copied to ``output_dir``
+        instead.
+
+        Args:
+            ppt_path: Path of the ``.ppt``/``.pptx`` file.
+            output_dir: Directory that receives the converted PDF (or the
+                copied original).
+        """
+        # The executable is addressed as 'soffice' on Windows and as
+        # 'libreoffice' everywhere else.
+        if sys.platform.startswith('win'):
+            office_executable = 'soffice'
+        else:
+            office_executable = 'libreoffice'
+
+        command = [
+            office_executable,
+            '--headless',
+            '--convert-to', 'pdf',
+            '--outdir', output_dir,
+            ppt_path,
+        ]
+        try:
+            subprocess.run(command, check=True)
+        except subprocess.CalledProcessError as e:
+            print(f"Failed to convert {os.path.basename(ppt_path)} to PDF. Error: {e}")
+        except FileNotFoundError:
+            print(f"{office_executable} is not installed or not found in the system path.")
+        else:
+            print(f"Converted {os.path.basename(ppt_path)} to PDF.")
+            return
+
+        # Conversion was not possible: keep the original presentation.
+        shutil.copy2(ppt_path, output_dir)
--- a/src/backend/main.py
+++ b/src/backend/main.py
@ -1,5 +1,8 @@
+# main.py
+
 import logging
 from moodle_downloader import MoodleDownloader
+from course_content_extractor import CourseContentExtractor
 import os

 # Configure logging
@ -29,6 +32,10 @@ try:

    # Download all courses
    downloader.download_all_courses()
+
+    # Extract course contents using the updated class
+    extractor = CourseContentExtractor(downloader.download_dir)
+    extractor.extract_contents()
 finally:
    # Close the browser
    downloader.close()
--- a/src/backend/moodle_downloader.py
+++ b/src/backend/moodle_downloader.py
@ -1,7 +1,9 @@
 import os
+import re
 import time
 import logging
 import requests
+import unicodedata
 from selenium import webdriver
 from selenium.webdriver.chrome.options import Options
 from selenium.webdriver.common.by import By
@ -11,12 +13,20 @@ from selenium.webdriver.support import expected_conditions as EC
 from selenium.common.exceptions import TimeoutException
 from selenium.webdriver.chrome.service import Service as ChromeService
 from webdriver_manager.chrome import ChromeDriverManager
+import tempfile

 class MoodleDownloader:
    def __init__(self, username, password, download_dir=None, headless=False):
        self.username = username
        self.password = password
-        self.download_dir = download_dir or os.path.join(os.getcwd(), 'downloads')
+        if download_dir:
+            self.download_dir = download_dir
+            self.cleanup_download_dir = False
+        else:
+            # Create a unique temporary directory
+            self.temp_dir = tempfile.TemporaryDirectory()
+            self.download_dir = self.temp_dir.name
+            self.cleanup_download_dir = True
        self.headless = headless
        self.driver = None
        self.courses = []
@ -127,12 +137,30 @@ class MoodleDownloader:

            logging.info(f"{len(course_elements)} courses found.")

+            existing_urls = set()
            for coursename_element in course_elements:
                try:
-                    course_name = coursename_element.text.strip()
+                    # Get the text content
+                    full_text = coursename_element.text.strip()
+                    lines = [line.strip() for line in full_text.split('\n') if line.strip()]
+                    # Remove duplicates
+                    unique_lines = list(dict.fromkeys(lines))
+                    # Assume the last line is the actual course name
+                    course_name = unique_lines[-1]
+
+                    # Extract course code and term
+                    short_name = self.extract_course_code_and_term(course_name)
+
                    course_url = coursename_element.get_attribute('href')
-                    self.courses.append({'CourseName': course_name, 'URL': course_url})
-                    logging.info(f"Course found: {course_name} - {course_url}")
+
+                    # Check for duplicates
+                    if course_url in existing_urls:
+                        logging.info(f"Duplicate course found: {short_name} - {course_url}")
+                        continue
+                    existing_urls.add(course_url)
+
+                    self.courses.append({'CourseName': short_name, 'URL': course_url})
+                    logging.info(f"Course found: {short_name} - {course_url}")
                except Exception as e:
                    logging.warning(f"Error extracting course: {e}")
                    continue
@ -144,6 +172,30 @@ class MoodleDownloader:
            logging.error("An error occurred while retrieving courses.", exc_info=True)
            raise e

+    def extract_course_code_and_term(self, course_name):
+        """Build a short, file-system friendly identifier for a course.
+
+        Looks for a parenthesised course code followed by a term token, e.g.
+        ``'Mathematik I (cds-401) HS24'`` yields ``'cds-401_HS24'``. When the
+        name does not follow that layout, the whole sanitized course name is
+        returned instead.
+
+        Args:
+            course_name: Course title as displayed on the course overview.
+
+        Returns:
+            ``"<code>_<term>"`` built from the sanitized parts, or the
+            sanitized ``course_name`` if no code/term pair was found.
+        """
+        found = re.search(r'\(([^)]+)\)\s+(\w+\d*)', course_name)
+        if found is None:
+            # Unknown naming scheme: fall back to the full (sanitized) title.
+            return self.sanitize_filename(course_name)
+
+        code, term = found.groups()
+        return "_".join((self.sanitize_filename(code), self.sanitize_filename(term)))
+
+    def sanitize_filename(self, name, max_length=100):
+        """Turn an arbitrary string into a safe, ASCII-only file name.
+
+        Accented characters are reduced to their ASCII base letters,
+        characters that are invalid in file names (``<>:"/\\|?*`` and line
+        breaks) are dropped, and runs of whitespace become a single
+        underscore.
+
+        Args:
+            name: Text to convert, e.g. a course title or ``"<title>.zip"``.
+            max_length: Maximum length of the result (default 100).
+
+        Returns:
+            The sanitized name, at most ``max_length`` characters long. A
+            short alphanumeric extension such as ``.zip`` survives the
+            truncation. ``'unnamed'`` is returned when nothing usable is
+            left, so the result is never an empty file name.
+        """
+        # Normalize unicode characters and drop everything that is not ASCII
+        name = unicodedata.normalize('NFKD', name).encode('ASCII', 'ignore').decode('ASCII')
+        # Remove invalid characters for filenames, including newlines
+        sanitized = re.sub(r'[<>:"/\\|?*\n\r]+', '', name)
+        # Replace runs of whitespace with a single underscore
+        sanitized = re.sub(r'[\s]+', '_', sanitized)
+
+        if len(sanitized) > max_length:
+            stem, ext = os.path.splitext(sanitized)
+            # Truncate the stem rather than the extension; otherwise
+            # "<long name>.zip" loses its ".zip" and is no longer recognised
+            # as an archive. Only short alphanumeric suffixes count as an
+            # extension, since a dot inside a long title is not one.
+            if ext and len(ext) <= 10 and len(ext) < max_length and ext[1:].isalnum():
+                sanitized = stem[:max_length - len(ext)] + ext
+            else:
+                sanitized = sanitized[:max_length]
+
+        return sanitized or 'unnamed'
+
    def download_all_courses(self):
        if not self.courses:
            logging.warning("No courses to download.")
@ -155,8 +207,6 @@ class MoodleDownloader:
        if not os.path.exists(self.download_dir):
            os.makedirs(self.download_dir)

-        course_counter = 1
-
        for course in self.courses:
            course_name = course['CourseName']
            course_url = course['URL']
@ -183,19 +233,15 @@ class MoodleDownloader:
                )

                # Extract 'sesskey' and 'contextid'
-                sesskey_input = driver.find_element(By.NAME, 'sesskey')
-                sesskey = sesskey_input.get_attribute('value')
-                contextid_input = driver.find_element(By.NAME, 'contextid')
-                contextid = contextid_input.get_attribute('value')
+                sesskey = driver.find_element(By.NAME, 'sesskey').get_attribute('value')
+                contextid = driver.find_element(By.NAME, 'contextid').get_attribute('value')

                logging.info(f"sesskey: {sesskey}, contextid: {contextid}")

                # Extract cookies from the Selenium session
                logging.info("Extracting cookies from the Selenium session.")
                selenium_cookies = driver.get_cookies()
-                cookies = {}
-                for cookie in selenium_cookies:
-                    cookies[cookie['name']] = cookie['value']
+                cookies = {cookie['name']: cookie['value'] for cookie in selenium_cookies}

                # Prepare the HTTP POST request
                download_url = 'https://moodle.fhgr.ch/course/downloadcontent.php'
@ -216,9 +262,18 @@ class MoodleDownloader:
                    response = session.post(download_url, data=post_data, headers=headers, stream=True)
                    response.raise_for_status()

-                    # Generate filename as course_N.zip
-                    filename = f"course_{course_counter}.zip"
-                    course_counter += 1
+                    # Attempt to extract filename from Content-Disposition header
+                    content_disposition = response.headers.get('Content-Disposition', '')
+                    filename = None
+                    if content_disposition:
+                        matches = re.findall('filename="(.+)"', content_disposition)
+                        if matches:
+                            filename = matches[0]
+                    if not filename:
+                        # If no filename in headers, use sanitized course name
+                        filename = f"{course_name}.zip"
+                        filename = self.sanitize_filename(filename)
+
                    filepath = os.path.join(self.download_dir, filename)

                    # Overwrite existing files
@ -239,3 +294,6 @@ class MoodleDownloader:
        if self.driver:
            logging.info("Closing the browser.")
            self.driver.quit()
+        if self.cleanup_download_dir:
+            logging.info("Cleaning up temporary download directory.")
+            self.temp_dir.cleanup()