Saving
parent
0a4af50d08
commit
ccdfc21f43
|
@ -8,17 +8,34 @@ import subprocess
|
||||||
import sys
|
import sys
|
||||||
import re
|
import re
|
||||||
import unicodedata
|
import unicodedata
|
||||||
|
import logging
|
||||||
|
|
||||||
|
|
||||||
class CourseContentExtractor:
|
class CourseContentExtractor:
|
||||||
def __init__(self, download_dir, root_dir):
|
def __init__(self, download_dir, root_dir):
|
||||||
|
"""
|
||||||
|
Initialize the CourseContentExtractor.
|
||||||
|
|
||||||
|
:param download_dir: Directory where ZIP files are downloaded
|
||||||
|
:param root_dir: Root directory for organizing study materials
|
||||||
|
"""
|
||||||
self.download_dir = download_dir
|
self.download_dir = download_dir
|
||||||
self.root_dir = root_dir # Read from environment variable
|
self.root_dir = root_dir # Read from environment variable
|
||||||
|
|
||||||
def extract_contents(self, courses):
|
def extract_contents(self, courses):
|
||||||
|
"""
|
||||||
|
Extract and organize course contents based on the provided folder structure.
|
||||||
|
|
||||||
|
:param courses: List of course dictionaries containing 'Semester' and 'CourseName'
|
||||||
|
"""
|
||||||
# Ensure root_dir exists
|
# Ensure root_dir exists
|
||||||
if not os.path.exists(self.root_dir):
|
if not os.path.exists(self.root_dir):
|
||||||
|
try:
|
||||||
os.makedirs(self.root_dir)
|
os.makedirs(self.root_dir)
|
||||||
|
logging.info(f"Created root directory: {self.root_dir}")
|
||||||
|
except Exception as e:
|
||||||
|
logging.error(f"Failed to create root directory '{self.root_dir}': {e}")
|
||||||
|
return
|
||||||
|
|
||||||
# Loop through downloaded ZIP files
|
# Loop through downloaded ZIP files
|
||||||
zip_files = [f for f in os.listdir(self.download_dir) if f.endswith('.zip')]
|
zip_files = [f for f in os.listdir(self.download_dir) if f.endswith('.zip')]
|
||||||
|
@ -27,19 +44,22 @@ class CourseContentExtractor:
|
||||||
base_name = os.path.splitext(filename)[0]
|
base_name = os.path.splitext(filename)[0]
|
||||||
|
|
||||||
# Find the course info matching the ZIP file
|
# Find the course info matching the ZIP file
|
||||||
course_info = next((course for course in courses if course['CourseName'] == base_name), None)
|
# Sanitize both course names to ensure matching
|
||||||
|
course_info = next(
|
||||||
|
(course for course in courses if self.sanitize_filename(course['CourseName']) == self.sanitize_filename(base_name)),
|
||||||
|
None
|
||||||
|
)
|
||||||
if not course_info:
|
if not course_info:
|
||||||
print(f"No matching course found for {base_name}. Skipping.")
|
print(f"No matching course found for {base_name}. Skipping.")
|
||||||
|
logging.warning(f"No matching course found for {base_name}. Skipping.")
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Build the folder structure
|
# Build the folder structure
|
||||||
study_program = course_info['StudyProgram']
|
|
||||||
semester = course_info['Semester']
|
semester = course_info['Semester']
|
||||||
course_name = course_info['CourseName']
|
course_name = course_info['CourseName']
|
||||||
|
|
||||||
course_output_dir = os.path.join(
|
course_output_dir = os.path.join(
|
||||||
self.root_dir,
|
self.root_dir,
|
||||||
study_program,
|
|
||||||
semester,
|
semester,
|
||||||
course_name
|
course_name
|
||||||
)
|
)
|
||||||
|
@ -48,38 +68,62 @@ class CourseContentExtractor:
|
||||||
subfolders = ['Lectures', 'Notes', 'Summary', 'Tasks']
|
subfolders = ['Lectures', 'Notes', 'Summary', 'Tasks']
|
||||||
for subfolder in subfolders:
|
for subfolder in subfolders:
|
||||||
subfolder_path = os.path.join(course_output_dir, subfolder)
|
subfolder_path = os.path.join(course_output_dir, subfolder)
|
||||||
|
try:
|
||||||
os.makedirs(subfolder_path, exist_ok=True)
|
os.makedirs(subfolder_path, exist_ok=True)
|
||||||
|
logging.info(f"Created subfolder: {subfolder_path}")
|
||||||
# Create 'Code_files' subfolder under 'Tasks/<task_name>'
|
except Exception as e:
|
||||||
task_name = 'Task1' # Adjust as needed or make dynamic
|
logging.error(f"Failed to create subfolder '{subfolder_path}': {e}")
|
||||||
code_files_path = os.path.join(course_output_dir, 'Tasks', task_name, 'Code_files')
|
continue
|
||||||
os.makedirs(code_files_path, exist_ok=True)
|
|
||||||
|
|
||||||
# Extract and organize files
|
# Extract and organize files
|
||||||
with tempfile.TemporaryDirectory() as temp_extract_dir:
|
with tempfile.TemporaryDirectory() as temp_extract_dir:
|
||||||
|
try:
|
||||||
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
|
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
|
||||||
zip_ref.extractall(temp_extract_dir)
|
zip_ref.extractall(temp_extract_dir)
|
||||||
|
logging.info(f"Extracted ZIP file to temporary directory: {temp_extract_dir}")
|
||||||
|
except zipfile.BadZipFile as e:
|
||||||
|
logging.error(f"Bad ZIP file '{zip_path}': {e}")
|
||||||
|
continue
|
||||||
|
except Exception as e:
|
||||||
|
logging.error(f"Failed to extract ZIP file '{zip_path}': {e}")
|
||||||
|
continue
|
||||||
|
|
||||||
for root, dirs, files in os.walk(temp_extract_dir):
|
for root, dirs, files in os.walk(temp_extract_dir):
|
||||||
for file in files:
|
for file in files:
|
||||||
file_path = os.path.join(root, file)
|
file_path = os.path.join(root, file)
|
||||||
if file.lower().endswith('.pdf'):
|
if file.lower().endswith('.pdf'):
|
||||||
dest_folder = os.path.join(course_output_dir, 'Lectures')
|
dest_folder = os.path.join(course_output_dir, 'Lectures')
|
||||||
|
try:
|
||||||
shutil.copy2(file_path, dest_folder)
|
shutil.copy2(file_path, dest_folder)
|
||||||
|
logging.info(f"Copied PDF file to Lectures: {file}")
|
||||||
|
except Exception as e:
|
||||||
|
logging.error(f"Failed to copy PDF file '{file}' to '{dest_folder}': {e}")
|
||||||
elif file.lower().endswith(('.ppt', '.pptx')):
|
elif file.lower().endswith(('.ppt', '.pptx')):
|
||||||
|
try:
|
||||||
self.convert_ppt_to_pdf(file_path, os.path.join(course_output_dir, 'Lectures'))
|
self.convert_ppt_to_pdf(file_path, os.path.join(course_output_dir, 'Lectures'))
|
||||||
elif file.lower().endswith(('.py', '.java', '.cpp', '.c', '.js', '.html', '.css')):
|
except Exception as e:
|
||||||
# Example: Place code files into 'Tasks/<task_name>/Code_files'
|
logging.error(f"Failed to convert PPT file '{file}': {e}")
|
||||||
shutil.copy2(file_path, code_files_path)
|
|
||||||
else:
|
else:
|
||||||
# Handle other file types or skip
|
# Skip unwanted file types
|
||||||
pass
|
logging.info(f"Skipped unsupported file type: {file}")
|
||||||
|
|
||||||
# Delete the ZIP file after processing
|
# Delete the ZIP file after processing
|
||||||
|
try:
|
||||||
os.remove(zip_path)
|
os.remove(zip_path)
|
||||||
|
logging.info(f"Deleted ZIP file after extraction: {zip_path}")
|
||||||
|
except Exception as e:
|
||||||
|
logging.error(f"Failed to delete ZIP file '{zip_path}': {e}")
|
||||||
|
|
||||||
print(f"All files have been extracted to {self.root_dir}")
|
print(f"All files have been extracted to {self.root_dir}")
|
||||||
|
logging.info(f"All files have been extracted to {self.root_dir}")
|
||||||
|
|
||||||
def convert_ppt_to_pdf(self, ppt_path, output_dir):
|
def convert_ppt_to_pdf(self, ppt_path, output_dir):
|
||||||
|
"""
|
||||||
|
Convert PowerPoint files to PDF using LibreOffice.
|
||||||
|
|
||||||
|
:param ppt_path: Path to the PPT/PPTX file
|
||||||
|
:param output_dir: Directory to save the converted PDF
|
||||||
|
"""
|
||||||
try:
|
try:
|
||||||
# Determine the command based on the operating system
|
# Determine the command based on the operating system
|
||||||
if sys.platform.startswith('win'):
|
if sys.platform.startswith('win'):
|
||||||
|
@ -97,21 +141,46 @@ class CourseContentExtractor:
|
||||||
]
|
]
|
||||||
# Execute the command
|
# Execute the command
|
||||||
subprocess.run(command, check=True)
|
subprocess.run(command, check=True)
|
||||||
|
logging.info(f"Converted {os.path.basename(ppt_path)} to PDF.")
|
||||||
print(f"Converted {os.path.basename(ppt_path)} to PDF.")
|
print(f"Converted {os.path.basename(ppt_path)} to PDF.")
|
||||||
except subprocess.CalledProcessError as e:
|
except subprocess.CalledProcessError as e:
|
||||||
|
logging.error(f"Failed to convert {os.path.basename(ppt_path)} to PDF. Error: {e}")
|
||||||
print(f"Failed to convert {os.path.basename(ppt_path)} to PDF. Error: {e}")
|
print(f"Failed to convert {os.path.basename(ppt_path)} to PDF. Error: {e}")
|
||||||
# Optionally, copy the original PPT/PPTX file
|
# Optionally, copy the original PPT/PPTX file
|
||||||
|
try:
|
||||||
shutil.copy2(ppt_path, output_dir)
|
shutil.copy2(ppt_path, output_dir)
|
||||||
|
logging.info(f"Copied original PPT/PPTX to {output_dir}")
|
||||||
|
except Exception as ex:
|
||||||
|
logging.error(f"Failed to copy PPT/PPTX file '{ppt_path}' to '{output_dir}': {ex}")
|
||||||
except FileNotFoundError:
|
except FileNotFoundError:
|
||||||
|
logging.error(f"{office_executable} is not installed or not found in the system path.")
|
||||||
print(f"{office_executable} is not installed or not found in the system path.")
|
print(f"{office_executable} is not installed or not found in the system path.")
|
||||||
# Optionally, copy the original PPT/PPTX file
|
# Optionally, copy the original PPT/PPTX file
|
||||||
|
try:
|
||||||
shutil.copy2(ppt_path, output_dir)
|
shutil.copy2(ppt_path, output_dir)
|
||||||
|
logging.info(f"Copied original PPT/PPTX to {output_dir}")
|
||||||
|
except Exception as ex:
|
||||||
|
logging.error(f"Failed to copy PPT/PPTX file '{ppt_path}' to '{output_dir}': {ex}")
|
||||||
|
|
||||||
def sanitize_filename(self, name):
|
def sanitize_filename(self, name):
|
||||||
|
"""
|
||||||
|
Sanitize the filename by removing invalid characters, replacing spaces with underscores,
|
||||||
|
and truncating to a maximum length to prevent path issues.
|
||||||
|
|
||||||
|
:param name: Original filename
|
||||||
|
:return: Sanitized filename
|
||||||
|
"""
|
||||||
# Normalize unicode characters
|
# Normalize unicode characters
|
||||||
name = unicodedata.normalize('NFKD', name).encode('ASCII', 'ignore').decode('ASCII')
|
name = unicodedata.normalize('NFKD', name).encode('ASCII', 'ignore').decode('ASCII')
|
||||||
# Remove invalid characters for filenames, including newlines
|
# Remove invalid characters for filenames, including newlines
|
||||||
sanitized = re.sub(r'[<>:"/\\|?*\n\r]+', '', name)
|
sanitized = re.sub(r'[<>:"/\\|?*\n\r]+', '', name)
|
||||||
# Replace spaces and other problematic characters with underscores
|
# Replace spaces with underscores
|
||||||
sanitized = re.sub(r'[\s]+', '_', sanitized)
|
sanitized = re.sub(r'\s+', '_', sanitized)
|
||||||
return sanitized[:100]
|
# Remove trailing underscores
|
||||||
|
sanitized = sanitized.rstrip('_')
|
||||||
|
# Truncate to a reasonable length (e.g., 200 characters)
|
||||||
|
MAX_LENGTH = 200
|
||||||
|
if len(sanitized) > MAX_LENGTH:
|
||||||
|
sanitized = sanitized[:MAX_LENGTH]
|
||||||
|
logging.warning(f"Filename truncated to {MAX_LENGTH} characters: '{sanitized}'")
|
||||||
|
return sanitized
|
||||||
|
|
|
@ -6,7 +6,4 @@ root_dir: ${STUDY_MATERIAL_ROOT_DIR} # Replace with the actual environment varia
|
||||||
Lectures: [] # Folder for lecture materials such as PDFs or recordings (relative to the user-specified root path, e.g., <root_path>/Computational_and_Data_Science/HS24/cds-201_Programmierung und Prompt Engineering/Lectures)
|
Lectures: [] # Folder for lecture materials such as PDFs or recordings (relative to the user-specified root path, e.g., <root_path>/Computational_and_Data_Science/HS24/cds-201_Programmierung und Prompt Engineering/Lectures)
|
||||||
Notes: [] # Folder for lecture or self-study notes (relative to the user-specified root path, e.g., <root_path>/Computational_and_Data_Science/HS24/cds-201_Programmierung und Prompt Engineering/Notes)
|
Notes: [] # Folder for lecture or self-study notes (relative to the user-specified root path, e.g., <root_path>/Computational_and_Data_Science/HS24/cds-201_Programmierung und Prompt Engineering/Notes)
|
||||||
Summary: [] # Folder for summarized notes or cheat sheets (relative to the user-specified root path, e.g., <root_path>/Computational_and_Data_Science/HS24/cds-201_Programmierung und Prompt Engineering/Summary)
|
Summary: [] # Folder for summarized notes or cheat sheets (relative to the user-specified root path, e.g., <root_path>/Computational_and_Data_Science/HS24/cds-201_Programmierung und Prompt Engineering/Summary)
|
||||||
Tasks:
|
Tasks: [] # Folder where the user can make a coding project
|
||||||
# User-defined task_name
|
|
||||||
<task_name>: # The specific task or assignment name, defined by the user (e.g., Task1)
|
|
||||||
Code_files: [] # Folder for code files related to the specific task (relative to the user-specified root path, e.g., <root_path>/Computational_and_Data_Science/HS24/cds-201_Programmierung und Prompt Engineering/Tasks/Task1/Code_files)
|
|
|
@ -1,22 +1,45 @@
|
||||||
|
# update_study_material.py
|
||||||
|
|
||||||
import os
|
import os
|
||||||
import shutil
|
import shutil
|
||||||
import tempfile
|
import tempfile
|
||||||
from moodle_downloader import MoodleDownloader
|
from moodle_downloader import MoodleDownloader
|
||||||
from course_content_extractor import CourseContentExtractor
|
from course_content_extractor import CourseContentExtractor
|
||||||
from dotenv import load_dotenv
|
from dotenv import load_dotenv
|
||||||
|
import logging
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
|
# Configure logging
|
||||||
|
logging.basicConfig(
|
||||||
|
level=logging.DEBUG, # Changed from INFO to DEBUG for detailed logs
|
||||||
|
format='%(asctime)s - %(levelname)s - %(message)s',
|
||||||
|
handlers=[
|
||||||
|
logging.StreamHandler()
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
||||||
# Load environment variables
|
# Load environment variables
|
||||||
load_dotenv()
|
load_dotenv()
|
||||||
root_dir = os.getenv('STUDY_MATERIAL_ROOT_DIR')
|
root_dir = os.getenv('STUDY_MATERIAL_ROOT_DIR')
|
||||||
if not root_dir:
|
if not root_dir:
|
||||||
print("Please set the STUDY_MATERIAL_ROOT_DIR environment variable.")
|
print("Please set the STUDY_MATERIAL_ROOT_DIR environment variable.")
|
||||||
|
logging.error("STUDY_MATERIAL_ROOT_DIR environment variable not set.")
|
||||||
return
|
return
|
||||||
|
|
||||||
|
# Check if root_dir exists and is a directory
|
||||||
|
if not os.path.isdir(root_dir):
|
||||||
|
print(f"The specified STUDY_MATERIAL_ROOT_DIR does not exist or is not a directory: {root_dir}")
|
||||||
|
logging.error(f"Invalid STUDY_MATERIAL_ROOT_DIR: {root_dir}")
|
||||||
|
return
|
||||||
|
|
||||||
|
# Treat root_dir as the study_program folder
|
||||||
|
study_program = os.path.basename(os.path.normpath(root_dir))
|
||||||
|
logging.info(f"Using root_dir as the study_program: {study_program}")
|
||||||
|
|
||||||
# Use system temporary directory for downloads
|
# Use system temporary directory for downloads
|
||||||
with tempfile.TemporaryDirectory() as download_dir:
|
with tempfile.TemporaryDirectory() as download_dir:
|
||||||
print(f"Using temporary download directory: {download_dir}")
|
logging.info(f"Using temporary download directory: {download_dir}")
|
||||||
|
|
||||||
# Load credentials from environment variables
|
# Load credentials from environment variables
|
||||||
username = os.getenv('MOODLE_USERNAME')
|
username = os.getenv('MOODLE_USERNAME')
|
||||||
|
@ -24,6 +47,7 @@ def main():
|
||||||
|
|
||||||
if not username or not password:
|
if not username or not password:
|
||||||
print("Please set your Moodle credentials in environment variables.")
|
print("Please set your Moodle credentials in environment variables.")
|
||||||
|
logging.error("Moodle credentials not set in environment variables.")
|
||||||
return
|
return
|
||||||
|
|
||||||
# Initialize downloader
|
# Initialize downloader
|
||||||
|
@ -35,12 +59,17 @@ def main():
|
||||||
finally:
|
finally:
|
||||||
downloader.close()
|
downloader.close()
|
||||||
|
|
||||||
|
# Assign study_program to each course
|
||||||
|
for course in downloader.courses:
|
||||||
|
course['StudyProgram'] = study_program
|
||||||
|
|
||||||
# Initialize extractor
|
# Initialize extractor
|
||||||
extractor = CourseContentExtractor(download_dir=download_dir, root_dir=root_dir)
|
extractor = CourseContentExtractor(download_dir=download_dir, root_dir=root_dir)
|
||||||
extractor.extract_contents(downloader.courses)
|
extractor.extract_contents(downloader.courses)
|
||||||
|
|
||||||
# Temporary directory is automatically cleaned up here
|
# Temporary directory is automatically cleaned up here
|
||||||
print("Temporary download directory has been cleaned up.")
|
logging.info("Temporary download directory has been cleaned up.")
|
||||||
|
print("Study materials have been updated successfully.")
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|
|
@ -17,7 +17,15 @@ from webdriver_manager.chrome import ChromeDriverManager
|
||||||
|
|
||||||
|
|
||||||
class MoodleDownloader:
|
class MoodleDownloader:
|
||||||
def __init__(self, username, password, download_dir=None, headless=False):
|
def __init__(self, username, password, download_dir, headless=False):
|
||||||
|
"""
|
||||||
|
Initialize the MoodleDownloader.
|
||||||
|
|
||||||
|
:param username: Moodle username
|
||||||
|
:param password: Moodle password
|
||||||
|
:param download_dir: Directory to download ZIP files
|
||||||
|
:param headless: Run browser in headless mode
|
||||||
|
"""
|
||||||
self.username = username
|
self.username = username
|
||||||
self.password = password
|
self.password = password
|
||||||
self.download_dir = download_dir # Set externally to use system temp
|
self.download_dir = download_dir # Set externally to use system temp
|
||||||
|
@ -28,7 +36,9 @@ class MoodleDownloader:
|
||||||
self.MY_COURSES_URL = 'https://moodle.fhgr.ch/my/courses.php'
|
self.MY_COURSES_URL = 'https://moodle.fhgr.ch/my/courses.php'
|
||||||
|
|
||||||
def setup_driver(self):
|
def setup_driver(self):
|
||||||
# Set up Chrome options
|
"""
|
||||||
|
Set up the Selenium WebDriver with Chrome options.
|
||||||
|
"""
|
||||||
chrome_options = Options()
|
chrome_options = Options()
|
||||||
if self.headless:
|
if self.headless:
|
||||||
chrome_options.add_argument('--headless') # Headless mode
|
chrome_options.add_argument('--headless') # Headless mode
|
||||||
|
@ -52,6 +62,9 @@ class MoodleDownloader:
|
||||||
self.driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=chrome_options)
|
self.driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=chrome_options)
|
||||||
|
|
||||||
def login(self):
|
def login(self):
|
||||||
|
"""
|
||||||
|
Log in to Moodle.
|
||||||
|
"""
|
||||||
self.setup_driver()
|
self.setup_driver()
|
||||||
driver = self.driver
|
driver = self.driver
|
||||||
try:
|
try:
|
||||||
|
@ -112,6 +125,9 @@ class MoodleDownloader:
|
||||||
raise e
|
raise e
|
||||||
|
|
||||||
def get_courses(self):
|
def get_courses(self):
|
||||||
|
"""
|
||||||
|
Retrieve the list of courses from Moodle.
|
||||||
|
"""
|
||||||
driver = self.driver
|
driver = self.driver
|
||||||
try:
|
try:
|
||||||
# Navigate to "My Courses" page
|
# Navigate to "My Courses" page
|
||||||
|
@ -134,10 +150,19 @@ class MoodleDownloader:
|
||||||
existing_urls = set()
|
existing_urls = set()
|
||||||
for coursename_element in course_elements:
|
for coursename_element in course_elements:
|
||||||
try:
|
try:
|
||||||
# Get the text content
|
# Extract course name from the nested span
|
||||||
full_text = coursename_element.text.strip()
|
course_name_element = coursename_element.find_element(By.CSS_SELECTOR, 'span.multiline span[aria-hidden="true"]')
|
||||||
|
course_title = course_name_element.text.strip()
|
||||||
|
logging.debug(f"Course title extracted: '{course_title}'")
|
||||||
|
|
||||||
|
# Extract semester from the sibling div
|
||||||
|
parent_div = coursename_element.find_element(By.XPATH, '..') # Navigate to parent div
|
||||||
|
category_span = parent_div.find_element(By.CSS_SELECTOR, 'span.categoryname.text-truncate')
|
||||||
|
semester = category_span.text.strip()
|
||||||
|
logging.debug(f"Semester extracted: '{semester}'")
|
||||||
|
|
||||||
# Extract course info
|
# Extract course info
|
||||||
course_info = self.extract_course_info(full_text)
|
course_info = self.extract_course_info(course_title)
|
||||||
|
|
||||||
course_url = coursename_element.get_attribute('href')
|
course_url = coursename_element.get_attribute('href')
|
||||||
|
|
||||||
|
@ -148,8 +173,7 @@ class MoodleDownloader:
|
||||||
existing_urls.add(course_url)
|
existing_urls.add(course_url)
|
||||||
|
|
||||||
self.courses.append({
|
self.courses.append({
|
||||||
'StudyProgram': course_info['study_program'],
|
'Semester': self.sanitize_semester(semester),
|
||||||
'Semester': course_info['semester'],
|
|
||||||
'CourseName': course_info['course_name'],
|
'CourseName': course_info['course_name'],
|
||||||
'URL': course_url
|
'URL': course_url
|
||||||
})
|
})
|
||||||
|
@ -165,31 +189,46 @@ class MoodleDownloader:
|
||||||
logging.error("An error occurred while retrieving courses.", exc_info=True)
|
logging.error("An error occurred while retrieving courses.", exc_info=True)
|
||||||
raise e
|
raise e
|
||||||
|
|
||||||
|
def sanitize_semester(self, semester):
|
||||||
|
"""
|
||||||
|
Sanitize the semester name by replacing spaces with underscores and removing trailing underscores.
|
||||||
|
|
||||||
|
:param semester: Original semester string
|
||||||
|
:return: Sanitized semester string
|
||||||
|
"""
|
||||||
|
sanitized = re.sub(r'\s+', '_', semester).strip('_')
|
||||||
|
logging.debug(f"Sanitized semester: '{sanitized}'")
|
||||||
|
return sanitized
|
||||||
|
|
||||||
def extract_course_info(self, course_title):
|
def extract_course_info(self, course_title):
|
||||||
# Example course title: 'Programmierung und Prompt Engineering (cds-201) HS24'
|
"""
|
||||||
pattern = r'^(.*?)\s*\(([^)]+)\)\s*(\w+\d*)$'
|
Extract course information from the course title.
|
||||||
|
|
||||||
|
:param course_title: Full course title string (e.g., 'Algorithmen und Datenstrukturen (cds-203) HS24')
|
||||||
|
:return: Dictionary with 'course_name'
|
||||||
|
"""
|
||||||
|
# Remove the semester from the course title
|
||||||
|
# Example: 'Algorithmen und Datenstrukturen (cds-203) HS24' -> 'Algorithmen und Datenstrukturen (cds-203)'
|
||||||
|
pattern = r'^(.*?)\s*\(([^)]+)\)\s*\w+\d*$'
|
||||||
match = re.search(pattern, course_title)
|
match = re.search(pattern, course_title)
|
||||||
if match:
|
if match:
|
||||||
study_program = 'Computational_and_Data_Science' # Replace with your actual study program if different
|
|
||||||
course_full_name = match.group(1).strip()
|
course_full_name = match.group(1).strip()
|
||||||
course_code = match.group(2).strip()
|
course_code = match.group(2).strip()
|
||||||
semester = match.group(3).strip()
|
course_name = f"{course_full_name} ({course_code})"
|
||||||
course_identifier = f"{course_code}_{self.sanitize_filename(course_full_name)}"
|
|
||||||
return {
|
return {
|
||||||
'study_program': study_program,
|
'course_name': course_name
|
||||||
'semester': semester,
|
|
||||||
'course_name': course_identifier
|
|
||||||
}
|
}
|
||||||
else:
|
else:
|
||||||
# Handle cases where the pattern doesn't match
|
# Handle cases where the pattern doesn't match
|
||||||
sanitized_title = self.sanitize_filename(course_title)
|
sanitized_title = self.sanitize_filename(course_title)
|
||||||
return {
|
return {
|
||||||
'study_program': 'Unknown_Program',
|
|
||||||
'semester': 'Unknown_Semester',
|
|
||||||
'course_name': sanitized_title
|
'course_name': sanitized_title
|
||||||
}
|
}
|
||||||
|
|
||||||
def download_all_courses(self):
|
def download_all_courses(self):
|
||||||
|
"""
|
||||||
|
Download all courses as ZIP files.
|
||||||
|
"""
|
||||||
if not self.courses:
|
if not self.courses:
|
||||||
logging.warning("No courses to download.")
|
logging.warning("No courses to download.")
|
||||||
return
|
return
|
||||||
|
@ -199,6 +238,7 @@ class MoodleDownloader:
|
||||||
# Ensure the download directory exists
|
# Ensure the download directory exists
|
||||||
if not os.path.exists(self.download_dir):
|
if not os.path.exists(self.download_dir):
|
||||||
os.makedirs(self.download_dir)
|
os.makedirs(self.download_dir)
|
||||||
|
logging.info(f"Created download directory: {self.download_dir}")
|
||||||
|
|
||||||
for course in self.courses:
|
for course in self.courses:
|
||||||
course_name = course['CourseName']
|
course_name = course['CourseName']
|
||||||
|
@ -256,12 +296,13 @@ class MoodleDownloader:
|
||||||
response.raise_for_status()
|
response.raise_for_status()
|
||||||
|
|
||||||
# Determine filename
|
# Determine filename
|
||||||
filename = f"{course_name}.zip"
|
filename = f"{self.sanitize_filename(course_name)}.zip"
|
||||||
filepath = os.path.join(self.download_dir, filename)
|
filepath = os.path.join(self.download_dir, filename)
|
||||||
|
|
||||||
# Overwrite existing files
|
# Overwrite existing files
|
||||||
if os.path.exists(filepath):
|
if os.path.exists(filepath):
|
||||||
os.remove(filepath)
|
os.remove(filepath)
|
||||||
|
logging.info(f"Overwriting existing file: {filepath}")
|
||||||
|
|
||||||
with open(filepath, 'wb') as f:
|
with open(filepath, 'wb') as f:
|
||||||
for chunk in response.iter_content(chunk_size=8192):
|
for chunk in response.iter_content(chunk_size=8192):
|
||||||
|
@ -274,16 +315,32 @@ class MoodleDownloader:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
def sanitize_filename(self, name):
|
def sanitize_filename(self, name):
|
||||||
|
"""
|
||||||
|
Sanitize the filename by removing invalid characters, replacing spaces with underscores,
|
||||||
|
and truncating to a maximum length to prevent path issues.
|
||||||
|
|
||||||
|
:param name: Original filename
|
||||||
|
:return: Sanitized filename
|
||||||
|
"""
|
||||||
# Normalize unicode characters
|
# Normalize unicode characters
|
||||||
name = unicodedata.normalize('NFKD', name).encode('ASCII', 'ignore').decode('ASCII')
|
name = unicodedata.normalize('NFKD', name).encode('ASCII', 'ignore').decode('ASCII')
|
||||||
# Remove invalid characters for filenames, including newlines
|
# Remove invalid characters for filenames, including newlines
|
||||||
sanitized = re.sub(r'[<>:"/\\|?*\n\r]+', '', name)
|
sanitized = re.sub(r'[<>:"/\\|?*\n\r]+', '', name)
|
||||||
# Replace spaces and other problematic characters with underscores
|
# Replace spaces with underscores
|
||||||
sanitized = re.sub(r'[\s]+', '_', sanitized)
|
sanitized = re.sub(r'\s+', '_', sanitized)
|
||||||
# Truncate to a reasonable length (e.g., 100 characters)
|
# Remove trailing underscores
|
||||||
return sanitized[:100]
|
sanitized = sanitized.rstrip('_')
|
||||||
|
# Truncate to a reasonable length (e.g., 200 characters)
|
||||||
|
MAX_LENGTH = 200
|
||||||
|
if len(sanitized) > MAX_LENGTH:
|
||||||
|
sanitized = sanitized[:MAX_LENGTH]
|
||||||
|
logging.warning(f"Filename truncated to {MAX_LENGTH} characters: '{sanitized}'")
|
||||||
|
return sanitized
|
||||||
|
|
||||||
def close(self):
|
def close(self):
|
||||||
|
"""
|
||||||
|
Close the Selenium WebDriver.
|
||||||
|
"""
|
||||||
if self.driver:
|
if self.driver:
|
||||||
logging.info("Closing the browser.")
|
logging.info("Closing the browser.")
|
||||||
self.driver.quit()
|
self.driver.quit()
|
||||||
|
|
Loading…
Reference in New Issue