Course pdf extractor

2024-10-26 20:33:00 +02:00 · 2024-10-26 20:33:00 +02:00 · 2c69edd489
parent f21d288959
commit 2c69edd489
7 changed files with 166 additions and 2560 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1,3 +1,8 @@
 # Temporary files
 **/*.log
 **/data
 # IDE
 **/.idea/
@ -41,3 +46,4 @@
 # Latex
 !**/out/*.pdf
 **/auxil/*
--- a/out/main.pdf
+++ b/out/main.pdf
--- a/package-lock.json
+++ b/package-lock.json
--- a/package.json
+++ b/package.json
@ -1,6 +0,0 @@
 {
  "devDependencies": {
    "@unocss/postcss": "^0.63.4",
    "unocss": "^0.63.4"
  }
 }
--- a/src/backend/course_content_extractor.py
+++ b/src/backend/course_content_extractor.py
@ -0,0 +1,79 @@
 import os
 import zipfile
 import shutil
 import tempfile
 import subprocess
 import sys
 class CourseContentExtractor:
    """Unpack downloaded course ZIP archives into per-course folders.

    Every ZIP file found in ``download_dir`` is extracted to a temporary
    directory. PDF files are copied, and PowerPoint files are converted to
    PDF with LibreOffice, into ``<output_dir>/<ZIP base name>``. A ZIP file
    is deleted from ``download_dir`` once it has been processed.
    """

    def __init__(self, download_dir, output_dir=None):
        """Store the source and destination directories.

        Args:
            download_dir: Directory that contains the course ZIP files.
            output_dir: Directory that receives one sub-folder per ZIP file.
                Defaults to ``data`` inside the current working directory.
        """
        self.download_dir = download_dir
        self.output_dir = output_dir or os.path.join(os.getcwd(), 'data')

    def extract_contents(self):
        """Extract PDFs and slide decks from every ZIP in ``download_dir``.

        The folder structure inside an archive is flattened. When two files
        of one archive would end up with the same name, the later one gets a
        numeric suffix (``slides.pdf``, ``slides_1.pdf``) instead of
        overwriting the earlier one. An archive that cannot be read as a ZIP
        file is reported, skipped and left in ``download_dir``.

        Raises:
            FileNotFoundError: If ``download_dir`` does not exist.
        """
        os.makedirs(self.output_dir, exist_ok=True)
        # Match the extension case-insensitively so "COURSE.ZIP" is not missed.
        zip_files = sorted(
            f for f in os.listdir(self.download_dir) if f.lower().endswith('.zip')
        )
        for filename in zip_files:
            zip_path = os.path.join(self.download_dir, filename)
            # Use the base name as the course folder name
            course_name = os.path.splitext(filename)[0]
            course_output_dir = os.path.join(self.output_dir, course_name)
            with tempfile.TemporaryDirectory() as temp_extract_dir:
                try:
                    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                        zip_ref.extractall(temp_extract_dir)
                except zipfile.BadZipFile as e:
                    # One broken download must not abort the remaining courses;
                    # the file is kept so it can be inspected or re-downloaded.
                    print(f"Skipping {filename}: not a valid ZIP file. Error: {e}")
                    continue
                os.makedirs(course_output_dir, exist_ok=True)
                # Names handed out for this archive, tracked per run so that a
                # re-run still overwrites the previous run's output.
                used_names = set()
                for root, dirs, files in os.walk(temp_extract_dir):
                    # Sorted traversal keeps the suffix assignment deterministic.
                    dirs.sort()
                    for file in sorted(files):
                        file_path = os.path.join(root, file)
                        lowered = file.lower()
                        if lowered.endswith('.pdf'):
                            target = self._claim_name(file, used_names)
                            shutil.copy2(
                                file_path, os.path.join(course_output_dir, target)
                            )
                        elif lowered.endswith(('.ppt', '.pptx')):
                            self._convert_into(
                                file_path, course_output_dir, used_names
                            )
            # Delete the ZIP file after processing
            os.remove(zip_path)
        print(f"All PDF and PowerPoint files have been extracted to {self.output_dir}")

    @staticmethod
    def _claim_name(filename, used_names):
        """Return ``filename`` or a suffixed variant not yet in ``used_names``.

        Names are compared case-insensitively so the result is also safe on
        case-insensitive file systems. The returned name is recorded in
        ``used_names``.
        """
        stem, ext = os.path.splitext(filename)
        candidate = filename
        counter = 1
        while candidate.lower() in used_names:
            candidate = f"{stem}_{counter}{ext}"
            counter += 1
        used_names.add(candidate.lower())
        return candidate

    def _convert_into(self, ppt_path, course_output_dir, used_names):
        """Convert one slide deck and file the result under a unique name.

        The conversion writes into a private staging directory first, because
        the converter derives the output name from the input name and would
        otherwise overwrite an existing file of the same name.
        """
        with tempfile.TemporaryDirectory() as staging_dir:
            self.convert_ppt_to_pdf(ppt_path, staging_dir)
            for produced in sorted(os.listdir(staging_dir)):
                produced_path = os.path.join(staging_dir, produced)
                if not os.path.isfile(produced_path):
                    continue
                target = self._claim_name(produced, used_names)
                shutil.copy2(produced_path, os.path.join(course_output_dir, target))

    def convert_ppt_to_pdf(self, ppt_path, output_dir):
        """Convert a PPT/PPTX file to PDF inside ``output_dir``.

        Runs LibreOffice in headless mode. If the conversion fails or no
        LibreOffice executable can be started, the original presentation is
        copied to ``output_dir`` instead.

        Args:
            ppt_path: Path of the PowerPoint file to convert.
            output_dir: Directory that receives the PDF (or the fallback copy).
        """
        # Both launcher names exist in the wild; prefer the platform's usual
        # one but accept the other when only that one is on the PATH.
        if sys.platform.startswith('win'):
            candidates = ('soffice', 'libreoffice')
        else:
            candidates = ('libreoffice', 'soffice')
        office_executable = next(
            (name for name in candidates if shutil.which(name)), candidates[0]
        )
        # Prepare the command to convert PPT/PPTX to PDF using LibreOffice
        command = [
            office_executable,
            '--headless',
            '--convert-to', 'pdf',
            '--outdir', output_dir,
            ppt_path,
        ]
        try:
            subprocess.run(command, check=True)
            print(f"Converted {os.path.basename(ppt_path)} to PDF.")
        except subprocess.CalledProcessError as e:
            print(f"Failed to convert {os.path.basename(ppt_path)} to PDF. Error: {e}")
            # Keep the original presentation so the content is not lost.
            shutil.copy2(ppt_path, output_dir)
        except FileNotFoundError:
            print(f"{office_executable} is not installed or not found in the system path.")
            # Keep the original presentation so the content is not lost.
            shutil.copy2(ppt_path, output_dir)
--- a/src/backend/main.py
+++ b/src/backend/main.py
@ -1,5 +1,8 @@
 # main.py
 import logging
 from moodle_downloader import MoodleDownloader
 from course_content_extractor import CourseContentExtractor
 import os
 # Configure logging
@ -29,6 +32,10 @@ try:
    # Download all courses
    downloader.download_all_courses()
    # Extract course contents using the updated class
    extractor = CourseContentExtractor(downloader.download_dir)
    extractor.extract_contents()
 finally:
    # Close the browser
    downloader.close()
--- a/src/backend/moodle_downloader.py
+++ b/src/backend/moodle_downloader.py
@ -1,7 +1,9 @@
 import os
 import re
 import time
 import logging
 import requests
 import unicodedata
 from selenium import webdriver
 from selenium.webdriver.chrome.options import Options
 from selenium.webdriver.common.by import By
@ -11,12 +13,20 @@ from selenium.webdriver.support import expected_conditions as EC
 from selenium.common.exceptions import TimeoutException
 from selenium.webdriver.chrome.service import Service as ChromeService
 from webdriver_manager.chrome import ChromeDriverManager
 import tempfile
 class MoodleDownloader:
    def __init__(self, username, password, download_dir=None, headless=False):
        self.username = username
        self.password = password
-        self.download_dir = download_dir or os.path.join(os.getcwd(), 'downloads')
+        if download_dir:
            self.download_dir = download_dir
            self.cleanup_download_dir = False
        else:
            # Create a unique temporary directory
            self.temp_dir = tempfile.TemporaryDirectory()
            self.download_dir = self.temp_dir.name
            self.cleanup_download_dir = True
        self.headless = headless
        self.driver = None
        self.courses = []
@ -127,12 +137,30 @@ class MoodleDownloader:
            logging.info(f"{len(course_elements)} courses found.")
            existing_urls = set()
            for coursename_element in course_elements:
                try:
-                    course_name = coursename_element.text.strip()
+                    # Get the text content
                    full_text = coursename_element.text.strip()
                    lines = [line.strip() for line in full_text.split('\n') if line.strip()]
                    # Remove duplicates
                    unique_lines = list(dict.fromkeys(lines))
                    # Assume the last line is the actual course name
                    course_name = unique_lines[-1]
                    # Extract course code and term
                    short_name = self.extract_course_code_and_term(course_name)
                    course_url = coursename_element.get_attribute('href')
-                    self.courses.append({'CourseName': course_name, 'URL': course_url})
+
-                    logging.info(f"Course found: {course_name} - {course_url}")
+                    # Check for duplicates
                    if course_url in existing_urls:
                        logging.info(f"Duplicate course found: {short_name} - {course_url}")
                        continue
                    existing_urls.add(course_url)
                    self.courses.append({'CourseName': short_name, 'URL': course_url})
                    logging.info(f"Course found: {short_name} - {course_url}")
                except Exception as e:
                    logging.warning(f"Error extracting course: {e}")
                    continue
@ -144,6 +172,30 @@ class MoodleDownloader:
            logging.error("An error occurred while retrieving courses.", exc_info=True)
            raise e
    def extract_course_code_and_term(self, course_name):
        """Build a short identifier from a course title.

        Searches the title for a parenthesised group followed by whitespace
        and a word, e.g. ``'Mathematik I (cds-401) HS24'``, and joins the two
        sanitized pieces with an underscore. A title without such a fragment
        is returned sanitized as a whole.
        """
        found = re.search(r'\(([^)]+)\)\s+(\w+\d*)', course_name)
        if found is None:
            # No "(code) term" fragment: fall back to the complete title.
            return self.sanitize_filename(course_name)
        code_part, term_part = found.group(1), found.group(2)
        safe_code = self.sanitize_filename(code_part)
        safe_term = self.sanitize_filename(term_part)
        return f"{safe_code}_{safe_term}"
    def sanitize_filename(self, name):
        """Return an ASCII-only version of ``name`` usable as a file name.

        Accented characters are reduced to their ASCII base letters,
        characters that are not allowed in file names are dropped, runs of
        whitespace become a single underscore, and the result is cut to at
        most 100 characters.
        """
        # Decompose accented characters, then discard whatever is not ASCII.
        decomposed = unicodedata.normalize('NFKD', name)
        ascii_only = decomposed.encode('ASCII', 'ignore').decode('ASCII')
        # Drop reserved file-name characters as well as line breaks.
        without_invalid = re.sub(r'[<>:"/\\|?*\n\r]+', '', ascii_only)
        # Collapse every remaining whitespace run into one underscore.
        underscored = re.sub(r'[\s]+', '_', without_invalid)
        # Keep the name at a manageable length.
        return underscored[:100]
    def download_all_courses(self):
        if not self.courses:
            logging.warning("No courses to download.")
@ -155,8 +207,6 @@ class MoodleDownloader:
        if not os.path.exists(self.download_dir):
            os.makedirs(self.download_dir)
        course_counter = 1
        for course in self.courses:
            course_name = course['CourseName']
            course_url = course['URL']
@ -183,19 +233,15 @@ class MoodleDownloader:
                )
                # Extract 'sesskey' and 'contextid'
-                sesskey_input = driver.find_element(By.NAME, 'sesskey')
+                sesskey = driver.find_element(By.NAME, 'sesskey').get_attribute('value')
-                sesskey = sesskey_input.get_attribute('value')
+                contextid = driver.find_element(By.NAME, 'contextid').get_attribute('value')
                contextid_input = driver.find_element(By.NAME, 'contextid')
                contextid = contextid_input.get_attribute('value')
                logging.info(f"sesskey: {sesskey}, contextid: {contextid}")
                # Extract cookies from the Selenium session
                logging.info("Extracting cookies from the Selenium session.")
                selenium_cookies = driver.get_cookies()
-                cookies = {}
+                cookies = {cookie['name']: cookie['value'] for cookie in selenium_cookies}
                for cookie in selenium_cookies:
                    cookies[cookie['name']] = cookie['value']
                # Prepare the HTTP POST request
                download_url = 'https://moodle.fhgr.ch/course/downloadcontent.php'
@ -216,9 +262,18 @@ class MoodleDownloader:
                    response = session.post(download_url, data=post_data, headers=headers, stream=True)
                    response.raise_for_status()
-                    # Generate filename as course_N.zip
+                    # Attempt to extract filename from Content-Disposition header
-                    filename = f"course_{course_counter}.zip"
+                    content_disposition = response.headers.get('Content-Disposition', '')
-                    course_counter += 1
+                    filename = None
                    if content_disposition:
                        matches = re.findall('filename="(.+)"', content_disposition)
                        if matches:
                            filename = matches[0]
                    if not filename:
                        # If no filename in headers, use sanitized course name
                        filename = f"{course_name}.zip"
                        filename = self.sanitize_filename(filename)
                    filepath = os.path.join(self.download_dir, filename)
                    # Overwrite existing files
@ -239,3 +294,6 @@ class MoodleDownloader:
        if self.driver:
            logging.info("Closing the browser.")
            self.driver.quit()
        if self.cleanup_download_dir:
            logging.info("Cleaning up temporary download directory.")
            self.temp_dir.cleanup()