From ccdfc21f43056de2b67e125216d78175de9e8489 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Oliver=20Sch=C3=BCtz?= <schuetzoliver00@gmail.com>
Date: Thu, 31 Oct 2024 15:58:37 +0100
Subject: [PATCH] Add logging and error handling; derive semester from Moodle category

---
 src/backend/course_content_extractor.py | 119 +++++++++++++++++++-----
 src/backend/folder_structure.yaml       |   5 +-
 src/backend/main.py                     |  33 ++++++-
 src/backend/moodle_downloader.py        | 101 +++++++++++++++-----
 4 files changed, 205 insertions(+), 53 deletions(-)

diff --git a/src/backend/course_content_extractor.py b/src/backend/course_content_extractor.py
index 066edbf..9c2dd03 100644
--- a/src/backend/course_content_extractor.py
+++ b/src/backend/course_content_extractor.py
@@ -8,17 +8,34 @@ import subprocess
 import sys
 import re
 import unicodedata
+import logging
 
 
 class CourseContentExtractor:
     def __init__(self, download_dir, root_dir):
+        """
+        Initialize the CourseContentExtractor.
+
+        :param download_dir: Directory where ZIP files are downloaded
+        :param root_dir: Root directory for organizing study materials
+        """
         self.download_dir = download_dir
         self.root_dir = root_dir  # Read from environment variable
 
     def extract_contents(self, courses):
+        """
+        Extract and organize course contents based on the provided folder structure.
+
+        :param courses: List of course dictionaries containing 'Semester' and 'CourseName'
+        """
         # Ensure root_dir exists
         if not os.path.exists(self.root_dir):
-            os.makedirs(self.root_dir)
+            try:
+                os.makedirs(self.root_dir)
+                logging.info(f"Created root directory: {self.root_dir}")
+            except Exception as e:
+                logging.error(f"Failed to create root directory '{self.root_dir}': {e}")
+                return
 
         # Loop through downloaded ZIP files
         zip_files = [f for f in os.listdir(self.download_dir) if f.endswith('.zip')]
@@ -27,19 +44,22 @@ class CourseContentExtractor:
             base_name = os.path.splitext(filename)[0]
 
             # Find the course info matching the ZIP file
-            course_info = next((course for course in courses if course['CourseName'] == base_name), None)
+            # Sanitize both course names to ensure matching
+            course_info = next(
+                (course for course in courses if self.sanitize_filename(course['CourseName']) == self.sanitize_filename(base_name)),
+                None
+            )
             if not course_info:
                 print(f"No matching course found for {base_name}. Skipping.")
+                logging.warning(f"No matching course found for {base_name}. Skipping.")
                 continue
 
             # Build the folder structure
-            study_program = course_info['StudyProgram']
             semester = course_info['Semester']
             course_name = course_info['CourseName']
 
             course_output_dir = os.path.join(
                 self.root_dir,
-                study_program,
                 semester,
                 course_name
             )
@@ -48,38 +68,62 @@ class CourseContentExtractor:
             subfolders = ['Lectures', 'Notes', 'Summary', 'Tasks']
             for subfolder in subfolders:
                 subfolder_path = os.path.join(course_output_dir, subfolder)
-                os.makedirs(subfolder_path, exist_ok=True)
-
-            # Create 'Code_files' subfolder under 'Tasks/<task_name>'
-            task_name = 'Task1'  # Adjust as needed or make dynamic
-            code_files_path = os.path.join(course_output_dir, 'Tasks', task_name, 'Code_files')
-            os.makedirs(code_files_path, exist_ok=True)
+                try:
+                    os.makedirs(subfolder_path, exist_ok=True)
+                    logging.info(f"Created subfolder: {subfolder_path}")
+                except Exception as e:
+                    logging.error(f"Failed to create subfolder '{subfolder_path}': {e}")
+                    continue
 
             # Extract and organize files
             with tempfile.TemporaryDirectory() as temp_extract_dir:
-                with zipfile.ZipFile(zip_path, 'r') as zip_ref:
-                    zip_ref.extractall(temp_extract_dir)
+                try:
+                    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
+                        zip_ref.extractall(temp_extract_dir)
+                    logging.info(f"Extracted ZIP file to temporary directory: {temp_extract_dir}")
+                except zipfile.BadZipFile as e:
+                    logging.error(f"Bad ZIP file '{zip_path}': {e}")
+                    continue
+                except Exception as e:
+                    logging.error(f"Failed to extract ZIP file '{zip_path}': {e}")
+                    continue
 
                 for root, dirs, files in os.walk(temp_extract_dir):
                     for file in files:
                         file_path = os.path.join(root, file)
                         if file.lower().endswith('.pdf'):
                             dest_folder = os.path.join(course_output_dir, 'Lectures')
-                            shutil.copy2(file_path, dest_folder)
+                            try:
+                                shutil.copy2(file_path, dest_folder)
+                                logging.info(f"Copied PDF file to Lectures: {file}")
+                            except Exception as e:
+                                logging.error(f"Failed to copy PDF file '{file}' to '{dest_folder}': {e}")
                         elif file.lower().endswith(('.ppt', '.pptx')):
-                            self.convert_ppt_to_pdf(file_path, os.path.join(course_output_dir, 'Lectures'))
-                        elif file.lower().endswith(('.py', '.java', '.cpp', '.c', '.js', '.html', '.css')):
-                            # Example: Place code files into 'Tasks/<task_name>/Code_files'
-                            shutil.copy2(file_path, code_files_path)
+                            try:
+                                self.convert_ppt_to_pdf(file_path, os.path.join(course_output_dir, 'Lectures'))
+                            except Exception as e:
+                                logging.error(f"Failed to convert PPT file '{file}': {e}")
                         else:
-                            # Handle other file types or skip
-                            pass
+                            # Skip unwanted file types
+                            logging.info(f"Skipped unsupported file type: {file}")
 
             # Delete the ZIP file after processing
-            os.remove(zip_path)
+            try:
+                os.remove(zip_path)
+                logging.info(f"Deleted ZIP file after extraction: {zip_path}")
+            except Exception as e:
+                logging.error(f"Failed to delete ZIP file '{zip_path}': {e}")
+
         print(f"All files have been extracted to {self.root_dir}")
+        logging.info(f"All files have been extracted to {self.root_dir}")
 
     def convert_ppt_to_pdf(self, ppt_path, output_dir):
+        """
+        Convert PowerPoint files to PDF using LibreOffice.
+
+        :param ppt_path: Path to the PPT/PPTX file
+        :param output_dir: Directory to save the converted PDF
+        """
         try:
             # Determine the command based on the operating system
             if sys.platform.startswith('win'):
@@ -97,21 +141,46 @@ class CourseContentExtractor:
             ]
             # Execute the command
             subprocess.run(command, check=True)
+            logging.info(f"Converted {os.path.basename(ppt_path)} to PDF.")
             print(f"Converted {os.path.basename(ppt_path)} to PDF.")
         except subprocess.CalledProcessError as e:
+            logging.error(f"Failed to convert {os.path.basename(ppt_path)} to PDF. Error: {e}")
             print(f"Failed to convert {os.path.basename(ppt_path)} to PDF. Error: {e}")
             # Optionally, copy the original PPT/PPTX file
-            shutil.copy2(ppt_path, output_dir)
+            try:
+                shutil.copy2(ppt_path, output_dir)
+                logging.info(f"Copied original PPT/PPTX to {output_dir}")
+            except Exception as ex:
+                logging.error(f"Failed to copy PPT/PPTX file '{ppt_path}' to '{output_dir}': {ex}")
         except FileNotFoundError:
+            logging.error(f"{office_executable} is not installed or not found in the system path.")
             print(f"{office_executable} is not installed or not found in the system path.")
             # Optionally, copy the original PPT/PPTX file
-            shutil.copy2(ppt_path, output_dir)
+            try:
+                shutil.copy2(ppt_path, output_dir)
+                logging.info(f"Copied original PPT/PPTX to {output_dir}")
+            except Exception as ex:
+                logging.error(f"Failed to copy PPT/PPTX file '{ppt_path}' to '{output_dir}': {ex}")
 
     def sanitize_filename(self, name):
+        """
+        Sanitize the filename by removing invalid characters, replacing spaces with underscores,
+        and truncating to a maximum length to prevent path issues.
+
+        :param name: Original filename
+        :return: Sanitized filename
+        """
         # Normalize unicode characters
         name = unicodedata.normalize('NFKD', name).encode('ASCII', 'ignore').decode('ASCII')
         # Remove invalid characters for filenames, including newlines
         sanitized = re.sub(r'[<>:"/\\|?*\n\r]+', '', name)
-        # Replace spaces and other problematic characters with underscores
-        sanitized = re.sub(r'[\s]+', '_', sanitized)
-        return sanitized[:100]
+        # Replace spaces with underscores
+        sanitized = re.sub(r'\s+', '_', sanitized)
+        # Remove trailing underscores
+        sanitized = sanitized.rstrip('_')
+        # Truncate to a reasonable length (e.g., 200 characters)
+        MAX_LENGTH = 200
+        if len(sanitized) > MAX_LENGTH:
+            sanitized = sanitized[:MAX_LENGTH]
+            logging.warning(f"Filename truncated to {MAX_LENGTH} characters: '{sanitized}'")
+        return sanitized
diff --git a/src/backend/folder_structure.yaml b/src/backend/folder_structure.yaml
index 545cb0f..2e55bc3 100644
--- a/src/backend/folder_structure.yaml
+++ b/src/backend/folder_structure.yaml
@@ -6,7 +6,4 @@ root_dir: ${STUDY_MATERIAL_ROOT_DIR} # Replace with the actual environment varia
         Lectures: [] # Folder for lecture materials such as PDFs or recordings (relative to the user-specified root path, e.g., <root_path>/Computational_and_Data_Science/HS24/cds-201_Programmierung und Prompt Engineering/Lectures)
         Notes: [] # Folder for lecture or self-study notes (relative to the user-specified root path, e.g., <root_path>/Computational_and_Data_Science/HS24/cds-201_Programmierung und Prompt Engineering/Notes)
         Summary: [] # Folder for summarized notes or cheat sheets (relative to the user-specified root path, e.g., <root_path>/Computational_and_Data_Science/HS24/cds-201_Programmierung und Prompt Engineering/Summary)
-        Tasks:
-          # User-defined task_name
-          <task_name>: # The specific task or assignment name, defined by the user (e.g., Task1)
-            Code_files: [] # Folder for code files related to the specific task (relative to the user-specified root path, e.g., <root_path>/Computational_and_Data_Science/HS24/cds-201_Programmierung und Prompt Engineering/Tasks/Task1/Code_files)
+        Tasks: [] # Folder where the user can make a coding project
\ No newline at end of file
diff --git a/src/backend/main.py b/src/backend/main.py
index 71eb815..40fb2db 100644
--- a/src/backend/main.py
+++ b/src/backend/main.py
@@ -1,22 +1,45 @@
+# main.py
+
 import os
 import shutil
 import tempfile
 from moodle_downloader import MoodleDownloader
 from course_content_extractor import CourseContentExtractor
 from dotenv import load_dotenv
+import logging
 
 
 def main():
+    # Configure logging
+    logging.basicConfig(
+        level=logging.DEBUG,  # Changed from INFO to DEBUG for detailed logs
+        format='%(asctime)s - %(levelname)s - %(message)s',
+        handlers=[
+            logging.StreamHandler()
+        ]
+    )
+
     # Load environment variables
     load_dotenv()
     root_dir = os.getenv('STUDY_MATERIAL_ROOT_DIR')
     if not root_dir:
         print("Please set the STUDY_MATERIAL_ROOT_DIR environment variable.")
+        logging.error("STUDY_MATERIAL_ROOT_DIR environment variable not set.")
         return
 
+    # Check if root_dir exists and is a directory
+    if not os.path.isdir(root_dir):
+        print(f"The specified STUDY_MATERIAL_ROOT_DIR does not exist or is not a directory: {root_dir}")
+        logging.error(f"Invalid STUDY_MATERIAL_ROOT_DIR: {root_dir}")
+        return
+
+    # Treat root_dir as the study_program folder
+    study_program = os.path.basename(os.path.normpath(root_dir))
+    logging.info(f"Using root_dir as the study_program: {study_program}")
+
     # Use system temporary directory for downloads
     with tempfile.TemporaryDirectory() as download_dir:
-        print(f"Using temporary download directory: {download_dir}")
+        logging.info(f"Using temporary download directory: {download_dir}")
 
         # Load credentials from environment variables
         username = os.getenv('MOODLE_USERNAME')
@@ -24,6 +47,7 @@ def main():
 
         if not username or not password:
             print("Please set your Moodle credentials in environment variables.")
+            logging.error("Moodle credentials not set in environment variables.")
             return
 
         # Initialize downloader
@@ -35,12 +59,17 @@ def main():
         finally:
             downloader.close()
 
+        # Assign study_program to each course
+        for course in downloader.courses:
+            course['StudyProgram'] = study_program
+
         # Initialize extractor
         extractor = CourseContentExtractor(download_dir=download_dir, root_dir=root_dir)
         extractor.extract_contents(downloader.courses)
 
     # Temporary directory is automatically cleaned up here
-    print("Temporary download directory has been cleaned up.")
+    logging.info("Temporary download directory has been cleaned up.")
+    print("Study materials have been updated successfully.")
 
 
 if __name__ == "__main__":
diff --git a/src/backend/moodle_downloader.py b/src/backend/moodle_downloader.py
index f36ca66..3f2187f 100644
--- a/src/backend/moodle_downloader.py
+++ b/src/backend/moodle_downloader.py
@@ -17,7 +17,15 @@ from webdriver_manager.chrome import ChromeDriverManager
 
 
 class MoodleDownloader:
-    def __init__(self, username, password, download_dir=None, headless=False):
+    def __init__(self, username, password, download_dir, headless=False):
+        """
+        Initialize the MoodleDownloader.
+
+        :param username: Moodle username
+        :param password: Moodle password
+        :param download_dir: Directory to download ZIP files
+        :param headless: Run browser in headless mode
+        """
         self.username = username
         self.password = password
         self.download_dir = download_dir  # Set externally to use system temp
@@ -28,7 +36,9 @@ class MoodleDownloader:
         self.MY_COURSES_URL = 'https://moodle.fhgr.ch/my/courses.php'
 
     def setup_driver(self):
-        # Set up Chrome options
+        """
+        Set up the Selenium WebDriver with Chrome options.
+        """
         chrome_options = Options()
         if self.headless:
             chrome_options.add_argument('--headless')  # Headless mode
@@ -52,6 +62,9 @@ class MoodleDownloader:
         self.driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=chrome_options)
 
     def login(self):
+        """
+        Log in to Moodle.
+        """
         self.setup_driver()
         driver = self.driver
         try:
@@ -112,6 +125,9 @@ class MoodleDownloader:
             raise e
 
     def get_courses(self):
+        """
+        Retrieve the list of courses from Moodle.
+        """
         driver = self.driver
         try:
             # Navigate to "My Courses" page
@@ -134,10 +150,19 @@ class MoodleDownloader:
             existing_urls = set()
             for coursename_element in course_elements:
                 try:
-                    # Get the text content
-                    full_text = coursename_element.text.strip()
+                    # Extract course name from the nested span
+                    course_name_element = coursename_element.find_element(By.CSS_SELECTOR, 'span.multiline span[aria-hidden="true"]')
+                    course_title = course_name_element.text.strip()
+                    logging.debug(f"Course title extracted: '{course_title}'")
+
+                    # Extract semester from the sibling div
+                    parent_div = coursename_element.find_element(By.XPATH, '..')  # Navigate to parent div
+                    category_span = parent_div.find_element(By.CSS_SELECTOR, 'span.categoryname.text-truncate')
+                    semester = category_span.text.strip()
+                    logging.debug(f"Semester extracted: '{semester}'")
+
                     # Extract course info
-                    course_info = self.extract_course_info(full_text)
+                    course_info = self.extract_course_info(course_title)
 
                     course_url = coursename_element.get_attribute('href')
 
@@ -148,8 +173,7 @@ class MoodleDownloader:
                     existing_urls.add(course_url)
 
                     self.courses.append({
-                        'StudyProgram': course_info['study_program'],
-                        'Semester': course_info['semester'],
+                        'Semester': self.sanitize_semester(semester),
                         'CourseName': course_info['course_name'],
                         'URL': course_url
                     })
@@ -165,31 +189,46 @@ class MoodleDownloader:
             logging.error("An error occurred while retrieving courses.", exc_info=True)
             raise e
 
+    def sanitize_semester(self, semester):
+        """
+        Sanitize the semester name by replacing whitespace runs with underscores and stripping leading/trailing underscores.
+
+        :param semester: Original semester string
+        :return: Sanitized semester string
+        """
+        sanitized = re.sub(r'\s+', '_', semester).strip('_')
+        logging.debug(f"Sanitized semester: '{sanitized}'")
+        return sanitized
+
     def extract_course_info(self, course_title):
-        # Example course title: 'Programmierung und Prompt Engineering (cds-201) HS24'
-        pattern = r'^(.*?)\s*\(([^)]+)\)\s*(\w+\d*)$'
+        """
+        Extract course information from the course title.
+
+        :param course_title: Full course title string (e.g., 'Algorithmen und Datenstrukturen (cds-203) HS24')
+        :return: Dictionary with 'course_name'
+        """
+        # Remove the semester from the course title
+        # Example: 'Algorithmen und Datenstrukturen (cds-203) HS24' -> 'Algorithmen und Datenstrukturen (cds-203)'
+        pattern = r'^(.*?)\s*\(([^)]+)\)\s*\w+\d*$'
         match = re.search(pattern, course_title)
         if match:
-            study_program = 'Computational_and_Data_Science'  # Replace with your actual study program if different
             course_full_name = match.group(1).strip()
             course_code = match.group(2).strip()
-            semester = match.group(3).strip()
-            course_identifier = f"{course_code}_{self.sanitize_filename(course_full_name)}"
+            course_name = f"{course_full_name} ({course_code})"
             return {
-                'study_program': study_program,
-                'semester': semester,
-                'course_name': course_identifier
+                'course_name': course_name
             }
         else:
             # Handle cases where the pattern doesn't match
             sanitized_title = self.sanitize_filename(course_title)
             return {
-                'study_program': 'Unknown_Program',
-                'semester': 'Unknown_Semester',
                 'course_name': sanitized_title
             }
 
     def download_all_courses(self):
+        """
+        Download all courses as ZIP files.
+        """
         if not self.courses:
             logging.warning("No courses to download.")
             return
@@ -199,6 +238,7 @@ class MoodleDownloader:
         # Ensure the download directory exists
         if not os.path.exists(self.download_dir):
             os.makedirs(self.download_dir)
+            logging.info(f"Created download directory: {self.download_dir}")
 
         for course in self.courses:
             course_name = course['CourseName']
@@ -256,12 +296,13 @@ class MoodleDownloader:
                     response.raise_for_status()
 
                     # Determine filename
-                    filename = f"{course_name}.zip"
+                    filename = f"{self.sanitize_filename(course_name)}.zip"
                     filepath = os.path.join(self.download_dir, filename)
 
                     # Overwrite existing files
                     if os.path.exists(filepath):
                         os.remove(filepath)
+                        logging.info(f"Overwriting existing file: {filepath}")
 
                     with open(filepath, 'wb') as f:
                         for chunk in response.iter_content(chunk_size=8192):
@@ -274,16 +315,32 @@ class MoodleDownloader:
                 continue
 
     def sanitize_filename(self, name):
+        """
+        Sanitize the filename by removing invalid characters, replacing spaces with underscores,
+        and truncating to a maximum length to prevent path issues.
+
+        :param name: Original filename
+        :return: Sanitized filename
+        """
         # Normalize unicode characters
         name = unicodedata.normalize('NFKD', name).encode('ASCII', 'ignore').decode('ASCII')
         # Remove invalid characters for filenames, including newlines
         sanitized = re.sub(r'[<>:"/\\|?*\n\r]+', '', name)
-        # Replace spaces and other problematic characters with underscores
-        sanitized = re.sub(r'[\s]+', '_', sanitized)
-        # Truncate to a reasonable length (e.g., 100 characters)
-        return sanitized[:100]
+        # Replace spaces with underscores
+        sanitized = re.sub(r'\s+', '_', sanitized)
+        # Remove trailing underscores
+        sanitized = sanitized.rstrip('_')
+        # Truncate to a reasonable length (e.g., 200 characters)
+        MAX_LENGTH = 200
+        if len(sanitized) > MAX_LENGTH:
+            sanitized = sanitized[:MAX_LENGTH]
+            logging.warning(f"Filename truncated to {MAX_LENGTH} characters: '{sanitized}'")
+        return sanitized
 
     def close(self):
+        """
+        Close the Selenium WebDriver.
+        """
         if self.driver:
             logging.info("Closing the browser.")
             self.driver.quit()