From ccdfc21f43056de2b67e125216d78175de9e8489 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Sch=C3=BCtz?= Date: Thu, 31 Oct 2024 15:58:37 +0100 Subject: [PATCH] Saving --- src/backend/course_content_extractor.py | 119 +++++++++++++++++++----- src/backend/folder_structure.yaml | 5 +- src/backend/main.py | 33 ++++++- src/backend/moodle_downloader.py | 101 +++++++++++++++----- 4 files changed, 205 insertions(+), 53 deletions(-) diff --git a/src/backend/course_content_extractor.py b/src/backend/course_content_extractor.py index 066edbf..9c2dd03 100644 --- a/src/backend/course_content_extractor.py +++ b/src/backend/course_content_extractor.py @@ -8,17 +8,34 @@ import subprocess import sys import re import unicodedata +import logging class CourseContentExtractor: def __init__(self, download_dir, root_dir): + """ + Initialize the CourseContentExtractor. + + :param download_dir: Directory where ZIP files are downloaded + :param root_dir: Root directory for organizing study materials + """ self.download_dir = download_dir self.root_dir = root_dir # Read from environment variable def extract_contents(self, courses): + """ + Extract and organize course contents based on the provided folder structure. + + :param courses: List of course dictionaries containing 'Semester' and 'CourseName' + """ # Ensure root_dir exists if not os.path.exists(self.root_dir): - os.makedirs(self.root_dir) + try: + os.makedirs(self.root_dir) + logging.info(f"Created root directory: {self.root_dir}") + except Exception as e: + logging.error(f"Failed to create root directory '{self.root_dir}': {e}") + return # Loop through downloaded ZIP files zip_files = [f for f in os.listdir(self.download_dir) if f.endswith('.zip')] @@ -27,19 +44,22 @@ class CourseContentExtractor: base_name = os.path.splitext(filename)[0] # Find the course info matching the ZIP file - course_info = next((course for course in courses if course['CourseName'] == base_name), None) + # Sanitize both course names to ensure matching + course_info = next( + (course for course in courses if self.sanitize_filename(course['CourseName']) == self.sanitize_filename(base_name)), + None + ) if not course_info: print(f"No matching course found for {base_name}. Skipping.") + logging.warning(f"No matching course found for {base_name}. Skipping.") continue # Build the folder structure - study_program = course_info['StudyProgram'] semester = course_info['Semester'] course_name = course_info['CourseName'] course_output_dir = os.path.join( self.root_dir, - study_program, semester, course_name ) @@ -48,38 +68,62 @@ class CourseContentExtractor: subfolders = ['Lectures', 'Notes', 'Summary', 'Tasks'] for subfolder in subfolders: subfolder_path = os.path.join(course_output_dir, subfolder) - os.makedirs(subfolder_path, exist_ok=True) - - # Create 'Code_files' subfolder under 'Tasks/' - task_name = 'Task1' # Adjust as needed or make dynamic - code_files_path = os.path.join(course_output_dir, 'Tasks', task_name, 'Code_files') - os.makedirs(code_files_path, exist_ok=True) + try: + os.makedirs(subfolder_path, exist_ok=True) + logging.info(f"Created subfolder: {subfolder_path}") + except Exception as e: + logging.error(f"Failed to create subfolder '{subfolder_path}': {e}") + continue # Extract and organize files with tempfile.TemporaryDirectory() as temp_extract_dir: - with zipfile.ZipFile(zip_path, 'r') as zip_ref: - zip_ref.extractall(temp_extract_dir) + try: + with zipfile.ZipFile(zip_path, 'r') as zip_ref: + zip_ref.extractall(temp_extract_dir) + logging.info(f"Extracted ZIP file to temporary directory: {temp_extract_dir}") + except zipfile.BadZipFile as e: + logging.error(f"Bad ZIP file '{zip_path}': {e}") + continue + except Exception as e: + logging.error(f"Failed to extract ZIP file '{zip_path}': {e}") + continue for root, dirs, files in os.walk(temp_extract_dir): for file in files: file_path = os.path.join(root, file) if file.lower().endswith('.pdf'): dest_folder = os.path.join(course_output_dir, 'Lectures') - shutil.copy2(file_path, dest_folder) + try: + shutil.copy2(file_path, dest_folder) + logging.info(f"Copied PDF file to Lectures: {file}") + except Exception as e: + logging.error(f"Failed to copy PDF file '{file}' to '{dest_folder}': {e}") elif file.lower().endswith(('.ppt', '.pptx')): - self.convert_ppt_to_pdf(file_path, os.path.join(course_output_dir, 'Lectures')) - elif file.lower().endswith(('.py', '.java', '.cpp', '.c', '.js', '.html', '.css')): - # Example: Place code files into 'Tasks//Code_files' - shutil.copy2(file_path, code_files_path) + try: + self.convert_ppt_to_pdf(file_path, os.path.join(course_output_dir, 'Lectures')) + except Exception as e: + logging.error(f"Failed to convert PPT file '{file}': {e}") else: - # Handle other file types or skip - pass + # Skip unwanted file types + logging.info(f"Skipped unsupported file type: {file}") # Delete the ZIP file after processing - os.remove(zip_path) + try: + os.remove(zip_path) + logging.info(f"Deleted ZIP file after extraction: {zip_path}") + except Exception as e: + logging.error(f"Failed to delete ZIP file '{zip_path}': {e}") + print(f"All files have been extracted to {self.root_dir}") + logging.info(f"All files have been extracted to {self.root_dir}") def convert_ppt_to_pdf(self, ppt_path, output_dir): + """ + Convert PowerPoint files to PDF using LibreOffice. + + :param ppt_path: Path to the PPT/PPTX file + :param output_dir: Directory to save the converted PDF + """ try: # Determine the command based on the operating system if sys.platform.startswith('win'): @@ -97,21 +141,46 @@ class CourseContentExtractor: ] # Execute the command subprocess.run(command, check=True) + logging.info(f"Converted {os.path.basename(ppt_path)} to PDF.") print(f"Converted {os.path.basename(ppt_path)} to PDF.") except subprocess.CalledProcessError as e: + logging.error(f"Failed to convert {os.path.basename(ppt_path)} to PDF. Error: {e}") print(f"Failed to convert {os.path.basename(ppt_path)} to PDF. Error: {e}") # Optionally, copy the original PPT/PPTX file - shutil.copy2(ppt_path, output_dir) + try: + shutil.copy2(ppt_path, output_dir) + logging.info(f"Copied original PPT/PPTX to {output_dir}") + except Exception as ex: + logging.error(f"Failed to copy PPT/PPTX file '{ppt_path}' to '{output_dir}': {ex}") except FileNotFoundError: + logging.error(f"{office_executable} is not installed or not found in the system path.") print(f"{office_executable} is not installed or not found in the system path.") # Optionally, copy the original PPT/PPTX file - shutil.copy2(ppt_path, output_dir) + try: + shutil.copy2(ppt_path, output_dir) + logging.info(f"Copied original PPT/PPTX to {output_dir}") + except Exception as ex: + logging.error(f"Failed to copy PPT/PPTX file '{ppt_path}' to '{output_dir}': {ex}") def sanitize_filename(self, name): + """ + Sanitize the filename by removing invalid characters, replacing spaces with underscores, + and truncating to a maximum length to prevent path issues. + + :param name: Original filename + :return: Sanitized filename + """ # Normalize unicode characters name = unicodedata.normalize('NFKD', name).encode('ASCII', 'ignore').decode('ASCII') # Remove invalid characters for filenames, including newlines sanitized = re.sub(r'[<>:"/\\|?*\n\r]+', '', name) - # Replace spaces and other problematic characters with underscores - sanitized = re.sub(r'[\s]+', '_', sanitized) - return sanitized[:100] + # Replace spaces with underscores + sanitized = re.sub(r'\s+', '_', sanitized) + # Remove trailing underscores + sanitized = sanitized.rstrip('_') + # Truncate to a reasonable length (e.g., 200 characters) + MAX_LENGTH = 200 + if len(sanitized) > MAX_LENGTH: + sanitized = sanitized[:MAX_LENGTH] + logging.warning(f"Filename truncated to {MAX_LENGTH} characters: '{sanitized}'") + return sanitized diff --git a/src/backend/folder_structure.yaml b/src/backend/folder_structure.yaml index 545cb0f..2e55bc3 100644 --- a/src/backend/folder_structure.yaml +++ b/src/backend/folder_structure.yaml @@ -6,7 +6,4 @@ root_dir: ${STUDY_MATERIAL_ROOT_DIR} # Replace with the actual environment varia Lectures: [] # Folder for lecture materials such as PDFs or recordings (relative to the user-specified root path, e.g., /Computational_and_Data_Science/HS24/cds-201_Programmierung und Prompt Engineering/Lectures) Notes: [] # Folder for lecture or self-study notes (relative to the user-specified root path, e.g., /Computational_and_Data_Science/HS24/cds-201_Programmierung und Prompt Engineering/Notes) Summary: [] # Folder for summarized notes or cheat sheets (relative to the user-specified root path, e.g., /Computational_and_Data_Science/HS24/cds-201_Programmierung und Prompt Engineering/Summary) - Tasks: - # User-defined task_name - : # The specific task or assignment name, defined by the user (e.g., Task1) - Code_files: [] # Folder for code files related to the specific task (relative to the user-specified root path, e.g., /Computational_and_Data_Science/HS24/cds-201_Programmierung und Prompt Engineering/Tasks/Task1/Code_files) + Tasks: [] # Folder where the user can make a coding project \ No newline at end of file diff --git a/src/backend/main.py b/src/backend/main.py index 71eb815..40fb2db 100644 --- a/src/backend/main.py +++ b/src/backend/main.py @@ -1,22 +1,45 @@ +# update_study_material.py + import os import shutil import tempfile from moodle_downloader import MoodleDownloader from course_content_extractor import CourseContentExtractor from dotenv import load_dotenv +import logging def main(): + # Configure logging + logging.basicConfig( + level=logging.DEBUG, # Changed from INFO to DEBUG for detailed logs + format='%(asctime)s - %(levelname)s - %(message)s', + handlers=[ + logging.StreamHandler() + ] + ) + # Load environment variables load_dotenv() root_dir = os.getenv('STUDY_MATERIAL_ROOT_DIR') if not root_dir: print("Please set the STUDY_MATERIAL_ROOT_DIR environment variable.") + logging.error("STUDY_MATERIAL_ROOT_DIR environment variable not set.") return + # Check if root_dir exists and is a directory + if not os.path.isdir(root_dir): + print(f"The specified STUDY_MATERIAL_ROOT_DIR does not exist or is not a directory: {root_dir}") + logging.error(f"Invalid STUDY_MATERIAL_ROOT_DIR: {root_dir}") + return + + # Treat root_dir as the study_program folder + study_program = os.path.basename(os.path.normpath(root_dir)) + logging.info(f"Using root_dir as the study_program: {study_program}") + # Use system temporary directory for downloads with tempfile.TemporaryDirectory() as download_dir: - print(f"Using temporary download directory: {download_dir}") + logging.info(f"Using temporary download directory: {download_dir}") # Load credentials from environment variables username = os.getenv('MOODLE_USERNAME') @@ -24,6 +47,7 @@ def main(): if not username or not password: print("Please set your Moodle credentials in environment variables.") + logging.error("Moodle credentials not set in environment variables.") return # Initialize downloader @@ -35,12 +59,17 @@ def main(): finally: downloader.close() + # Assign study_program to each course + for course in downloader.courses: + course['StudyProgram'] = study_program + # Initialize extractor extractor = CourseContentExtractor(download_dir=download_dir, root_dir=root_dir) extractor.extract_contents(downloader.courses) # Temporary directory is automatically cleaned up here - print("Temporary download directory has been cleaned up.") + logging.info("Temporary download directory has been cleaned up.") + print("Study materials have been updated successfully.") if __name__ == "__main__": diff --git a/src/backend/moodle_downloader.py b/src/backend/moodle_downloader.py index f36ca66..3f2187f 100644 --- a/src/backend/moodle_downloader.py +++ b/src/backend/moodle_downloader.py @@ -17,7 +17,15 @@ from webdriver_manager.chrome import ChromeDriverManager class MoodleDownloader: - def __init__(self, username, password, download_dir=None, headless=False): + def __init__(self, username, password, download_dir, headless=False): + """ + Initialize the MoodleDownloader. + + :param username: Moodle username + :param password: Moodle password + :param download_dir: Directory to download ZIP files + :param headless: Run browser in headless mode + """ self.username = username self.password = password self.download_dir = download_dir # Set externally to use system temp @@ -28,7 +36,9 @@ class MoodleDownloader: self.MY_COURSES_URL = 'https://moodle.fhgr.ch/my/courses.php' def setup_driver(self): - # Set up Chrome options + """ + Set up the Selenium WebDriver with Chrome options. + """ chrome_options = Options() if self.headless: chrome_options.add_argument('--headless') # Headless mode @@ -52,6 +62,9 @@ class MoodleDownloader: self.driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=chrome_options) def login(self): + """ + Log in to Moodle. + """ self.setup_driver() driver = self.driver try: @@ -112,6 +125,9 @@ class MoodleDownloader: raise e def get_courses(self): + """ + Retrieve the list of courses from Moodle. + """ driver = self.driver try: # Navigate to "My Courses" page @@ -134,10 +150,19 @@ class MoodleDownloader: existing_urls = set() for coursename_element in course_elements: try: - # Get the text content - full_text = coursename_element.text.strip() + # Extract course name from the nested span + course_name_element = coursename_element.find_element(By.CSS_SELECTOR, 'span.multiline span[aria-hidden="true"]') + course_title = course_name_element.text.strip() + logging.debug(f"Course title extracted: '{course_title}'") + + # Extract semester from the sibling div + parent_div = coursename_element.find_element(By.XPATH, '..') # Navigate to parent div + category_span = parent_div.find_element(By.CSS_SELECTOR, 'span.categoryname.text-truncate') + semester = category_span.text.strip() + logging.debug(f"Semester extracted: '{semester}'") + # Extract course info - course_info = self.extract_course_info(full_text) + course_info = self.extract_course_info(course_title) course_url = coursename_element.get_attribute('href') @@ -148,8 +173,7 @@ class MoodleDownloader: existing_urls.add(course_url) self.courses.append({ - 'StudyProgram': course_info['study_program'], - 'Semester': course_info['semester'], + 'Semester': self.sanitize_semester(semester), 'CourseName': course_info['course_name'], 'URL': course_url }) @@ -165,31 +189,46 @@ class MoodleDownloader: logging.error("An error occurred while retrieving courses.", exc_info=True) raise e + def sanitize_semester(self, semester): + """ + Sanitize the semester name by replacing spaces with underscores and removing trailing underscores. + + :param semester: Original semester string + :return: Sanitized semester string + """ + sanitized = re.sub(r'\s+', '_', semester).strip('_') + logging.debug(f"Sanitized semester: '{sanitized}'") + return sanitized + def extract_course_info(self, course_title): - # Example course title: 'Programmierung und Prompt Engineering (cds-201) HS24' - pattern = r'^(.*?)\s*\(([^)]+)\)\s*(\w+\d*)$' + """ + Extract course information from the course title. + + :param course_title: Full course title string (e.g., 'Algorithmen und Datenstrukturen (cds-203) HS24') + :return: Dictionary with 'course_name' + """ + # Remove the semester from the course title + # Example: 'Algorithmen und Datenstrukturen (cds-203) HS24' -> 'Algorithmen und Datenstrukturen (cds-203)' + pattern = r'^(.*?)\s*\(([^)]+)\)\s*\w+\d*$' match = re.search(pattern, course_title) if match: - study_program = 'Computational_and_Data_Science' # Replace with your actual study program if different course_full_name = match.group(1).strip() course_code = match.group(2).strip() - semester = match.group(3).strip() - course_identifier = f"{course_code}_{self.sanitize_filename(course_full_name)}" + course_name = f"{course_full_name} ({course_code})" return { - 'study_program': study_program, - 'semester': semester, - 'course_name': course_identifier + 'course_name': course_name } else: # Handle cases where the pattern doesn't match sanitized_title = self.sanitize_filename(course_title) return { - 'study_program': 'Unknown_Program', - 'semester': 'Unknown_Semester', 'course_name': sanitized_title } def download_all_courses(self): + """ + Download all courses as ZIP files. + """ if not self.courses: logging.warning("No courses to download.") return @@ -199,6 +238,7 @@ class MoodleDownloader: # Ensure the download directory exists if not os.path.exists(self.download_dir): os.makedirs(self.download_dir) + logging.info(f"Created download directory: {self.download_dir}") for course in self.courses: course_name = course['CourseName'] @@ -256,12 +296,13 @@ class MoodleDownloader: response.raise_for_status() # Determine filename - filename = f"{course_name}.zip" + filename = f"{self.sanitize_filename(course_name)}.zip" filepath = os.path.join(self.download_dir, filename) # Overwrite existing files if os.path.exists(filepath): os.remove(filepath) + logging.info(f"Overwriting existing file: {filepath}") with open(filepath, 'wb') as f: for chunk in response.iter_content(chunk_size=8192): @@ -274,16 +315,32 @@ class MoodleDownloader: continue def sanitize_filename(self, name): + """ + Sanitize the filename by removing invalid characters, replacing spaces with underscores, + and truncating to a maximum length to prevent path issues. + + :param name: Original filename + :return: Sanitized filename + """ # Normalize unicode characters name = unicodedata.normalize('NFKD', name).encode('ASCII', 'ignore').decode('ASCII') # Remove invalid characters for filenames, including newlines sanitized = re.sub(r'[<>:"/\\|?*\n\r]+', '', name) - # Replace spaces and other problematic characters with underscores - sanitized = re.sub(r'[\s]+', '_', sanitized) - # Truncate to a reasonable length (e.g., 100 characters) - return sanitized[:100] + # Replace spaces with underscores + sanitized = re.sub(r'\s+', '_', sanitized) + # Remove trailing underscores + sanitized = sanitized.rstrip('_') + # Truncate to a reasonable length (e.g., 200 characters) + MAX_LENGTH = 200 + if len(sanitized) > MAX_LENGTH: + sanitized = sanitized[:MAX_LENGTH] + logging.warning(f"Filename truncated to {MAX_LENGTH} characters: '{sanitized}'") + return sanitized def close(self): + """ + Close the Selenium WebDriver. + """ if self.driver: logging.info("Closing the browser.") self.driver.quit()