Course pdf extractor

main
Oliver Schütz 2024-10-26 20:33:00 +02:00
parent f21d288959
commit 2c69edd489
7 changed files with 166 additions and 2560 deletions

6
.gitignore vendored
View File

@ -1,3 +1,8 @@
# Temporary files
**/*.log
**/data
# IDE
**/.idea/
@ -41,3 +46,4 @@
# Latex
!**/out/*.pdf
**/auxil/*

Binary file not shown.

2538
package-lock.json generated

File diff suppressed because it is too large Load Diff

View File

@ -1,6 +0,0 @@
{
"devDependencies": {
"@unocss/postcss": "^0.63.4",
"unocss": "^0.63.4"
}
}

View File

@ -0,0 +1,79 @@
import os
import zipfile
import shutil
import tempfile
import subprocess
import sys
class CourseContentExtractor:
    """Collect the PDF material contained in downloaded course ZIP archives.

    Every ZIP archive found in ``download_dir`` is unpacked into a temporary
    directory.  PDF files are copied, and PowerPoint files are converted to
    PDF with LibreOffice, into ``<output_dir>/<archive name without .zip>``.
    """

    def __init__(self, download_dir, output_dir=None):
        """Remember where archives are read from and where results go.

        Args:
            download_dir: Directory that is scanned for ZIP archives.
            output_dir: Directory that receives one sub-folder per archive.
                Defaults to ``data`` inside the current working directory.
        """
        self.download_dir = download_dir
        self.output_dir = output_dir or os.path.join(os.getcwd(), 'data')

    def extract_contents(self):
        """Process every ZIP archive in ``download_dir``.

        A successfully processed archive is deleted afterwards.  A file that
        cannot be opened as a ZIP archive is reported, left in place and
        skipped so that the remaining archives are still handled.
        """
        # exist_ok avoids the check-then-create race of a separate exists().
        os.makedirs(self.output_dir, exist_ok=True)

        # Match the extension case-insensitively so "COURSE.ZIP" is not missed.
        zip_files = [
            f for f in os.listdir(self.download_dir)
            if f.lower().endswith('.zip')
        ]

        for filename in zip_files:
            zip_path = os.path.join(self.download_dir, filename)
            # The archive's base name doubles as the course folder name.
            course_name = os.path.splitext(filename)[0]
            course_output_dir = os.path.join(self.output_dir, course_name)

            with tempfile.TemporaryDirectory() as temp_extract_dir:
                try:
                    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                        zip_ref.extractall(temp_extract_dir)
                except zipfile.BadZipFile as e:
                    # One broken download must not abort the whole run; keep
                    # the file so it can be inspected or downloaded again.
                    print(f"Skipping {filename}: not a valid ZIP file. Error: {e}")
                    continue

                os.makedirs(course_output_dir, exist_ok=True)
                self._collect_documents(temp_extract_dir, course_output_dir)

            # Delete the ZIP file only after it has been processed.
            os.remove(zip_path)

        print(f"All PDF and PowerPoint files have been extracted to {self.output_dir}")

    def _collect_documents(self, source_dir, course_output_dir):
        """Copy PDFs and convert PowerPoint files found below ``source_dir``."""
        for root, _dirs, files in os.walk(source_dir):
            for file in files:
                file_path = os.path.join(root, file)
                lowered = file.lower()
                if lowered.endswith('.pdf'):
                    shutil.copy2(file_path, course_output_dir)
                elif lowered.endswith(('.ppt', '.pptx')):
                    self.convert_ppt_to_pdf(file_path, course_output_dir)

    def convert_ppt_to_pdf(self, ppt_path, output_dir):
        """Convert one PowerPoint file to PDF using headless LibreOffice.

        If the conversion fails, or no LibreOffice executable can be started,
        the original PowerPoint file is copied to ``output_dir`` instead.

        Args:
            ppt_path: Path of the ``.ppt``/``.pptx`` file to convert.
            output_dir: Directory that receives the PDF (or the fallback copy).
        """
        # Windows installs expose "soffice"; elsewhere prefer "libreoffice"
        # but accept "soffice" when only that launcher is on the PATH.
        if sys.platform.startswith('win'):
            candidates = ('soffice',)
        else:
            candidates = ('libreoffice', 'soffice')
        # Fall back to the first candidate so a missing installation is still
        # reported through the FileNotFoundError branch below.
        office_executable = next(
            (name for name in candidates if shutil.which(name)),
            candidates[0],
        )

        command = [
            office_executable,
            '--headless',
            '--convert-to', 'pdf',
            '--outdir', output_dir,
            ppt_path,
        ]

        try:
            subprocess.run(command, check=True)
        except subprocess.CalledProcessError as e:
            print(f"Failed to convert {os.path.basename(ppt_path)} to PDF. Error: {e}")
            # Keep the original presentation so no material is lost.
            shutil.copy2(ppt_path, output_dir)
        except FileNotFoundError:
            print(f"{office_executable} is not installed or not found in the system path.")
            # Keep the original presentation so no material is lost.
            shutil.copy2(ppt_path, output_dir)
        else:
            print(f"Converted {os.path.basename(ppt_path)} to PDF.")

View File

@ -1,5 +1,8 @@
# main.py
import logging
from moodle_downloader import MoodleDownloader
from course_content_extractor import CourseContentExtractor
import os
# Configure logging
@ -29,6 +32,10 @@ try:
# Download all courses
downloader.download_all_courses()
# Extract course contents using the updated class
extractor = CourseContentExtractor(downloader.download_dir)
extractor.extract_contents()
finally:
# Close the browser
downloader.close()

View File

@ -1,7 +1,9 @@
import os
import re
import time
import logging
import requests
import unicodedata
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
@ -11,12 +13,20 @@ from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
import tempfile
class MoodleDownloader:
def __init__(self, username, password, download_dir=None, headless=False):
self.username = username
self.password = password
self.download_dir = download_dir or os.path.join(os.getcwd(), 'downloads')
if download_dir:
self.download_dir = download_dir
self.cleanup_download_dir = False
else:
# Create a unique temporary directory
self.temp_dir = tempfile.TemporaryDirectory()
self.download_dir = self.temp_dir.name
self.cleanup_download_dir = True
self.headless = headless
self.driver = None
self.courses = []
@ -127,12 +137,30 @@ class MoodleDownloader:
logging.info(f"{len(course_elements)} courses found.")
existing_urls = set()
for coursename_element in course_elements:
try:
course_name = coursename_element.text.strip()
# Get the text content
full_text = coursename_element.text.strip()
lines = [line.strip() for line in full_text.split('\n') if line.strip()]
# Remove duplicates
unique_lines = list(dict.fromkeys(lines))
# Assume the last line is the actual course name
course_name = unique_lines[-1]
# Extract course code and term
short_name = self.extract_course_code_and_term(course_name)
course_url = coursename_element.get_attribute('href')
self.courses.append({'CourseName': course_name, 'URL': course_url})
logging.info(f"Course found: {course_name} - {course_url}")
# Check for duplicates
if course_url in existing_urls:
logging.info(f"Duplicate course found: {short_name} - {course_url}")
continue
existing_urls.add(course_url)
self.courses.append({'CourseName': short_name, 'URL': course_url})
logging.info(f"Course found: {short_name} - {course_url}")
except Exception as e:
logging.warning(f"Error extracting course: {e}")
continue
@ -144,6 +172,30 @@ class MoodleDownloader:
logging.error("An error occurred while retrieving courses.", exc_info=True)
raise e
def extract_course_code_and_term(self, course_name):
    """Derive a short ``<code>_<term>`` identifier from a course name.

    The name is searched for a parenthesised group followed by whitespace
    and a word, e.g. ``'Mathematik I (cds-401) HS24'`` yields the code
    ``cds-401`` and the term ``HS24``.

    Args:
        course_name: Course title to analyse.

    Returns:
        The sanitized code and term joined by an underscore, or the
        sanitized full course name when the name has no such group.
    """
    found = re.search(r'\(([^)]+)\)\s+(\w+\d*)', course_name)
    if found is None:
        # No "(code) term" part: fall back to the whole name.
        return self.sanitize_filename(course_name)

    code, term = found.groups()
    safe_code = self.sanitize_filename(code)
    safe_term = self.sanitize_filename(term)
    return "_".join((safe_code, safe_term))
def sanitize_filename(self, name, max_length=100):
    """Turn arbitrary text into an ASCII name that is safe as a filename.

    Accented characters are reduced to their ASCII base letters, characters
    that are invalid in filenames (including line breaks) are removed and
    runs of whitespace become a single underscore.

    Args:
        name: Text to sanitize.
        max_length: Maximum length of the returned name (default 100).

    Returns:
        The sanitized name, at most ``max_length`` characters long.  When
        the name has to be shortened and ends in a short alphanumeric
        extension such as ``.zip``, the extension is kept and the stem is
        shortened instead, so the file type is not lost.
    """
    # Decompose accented characters and drop whatever has no ASCII form.
    name = unicodedata.normalize('NFKD', name).encode('ASCII', 'ignore').decode('ASCII')
    # Remove invalid characters for filenames, including newlines.
    sanitized = re.sub(r'[<>:"/\\|?*\n\r]+', '', name)
    # Collapse remaining whitespace into underscores.
    sanitized = re.sub(r'[\s]+', '_', sanitized)

    if len(sanitized) <= max_length:
        return sanitized

    # Plain slicing would cut off a trailing ".zip", and such a file would
    # no longer be recognised by code that selects files by extension.
    extension = re.search(r'\.[A-Za-z0-9]{1,10}$', sanitized)
    if extension and len(extension.group(0)) < max_length:
        suffix = extension.group(0)
        return sanitized[:max_length - len(suffix)] + suffix
    return sanitized[:max_length]
def download_all_courses(self):
if not self.courses:
logging.warning("No courses to download.")
@ -155,8 +207,6 @@ class MoodleDownloader:
if not os.path.exists(self.download_dir):
os.makedirs(self.download_dir)
course_counter = 1
for course in self.courses:
course_name = course['CourseName']
course_url = course['URL']
@ -183,19 +233,15 @@ class MoodleDownloader:
)
# Extract 'sesskey' and 'contextid'
sesskey_input = driver.find_element(By.NAME, 'sesskey')
sesskey = sesskey_input.get_attribute('value')
contextid_input = driver.find_element(By.NAME, 'contextid')
contextid = contextid_input.get_attribute('value')
sesskey = driver.find_element(By.NAME, 'sesskey').get_attribute('value')
contextid = driver.find_element(By.NAME, 'contextid').get_attribute('value')
logging.info(f"sesskey: {sesskey}, contextid: {contextid}")
# Extract cookies from the Selenium session
logging.info("Extracting cookies from the Selenium session.")
selenium_cookies = driver.get_cookies()
cookies = {}
for cookie in selenium_cookies:
cookies[cookie['name']] = cookie['value']
cookies = {cookie['name']: cookie['value'] for cookie in selenium_cookies}
# Prepare the HTTP POST request
download_url = 'https://moodle.fhgr.ch/course/downloadcontent.php'
@ -216,9 +262,18 @@ class MoodleDownloader:
response = session.post(download_url, data=post_data, headers=headers, stream=True)
response.raise_for_status()
# Generate filename as course_N.zip
filename = f"course_{course_counter}.zip"
course_counter += 1
# Attempt to extract filename from Content-Disposition header
content_disposition = response.headers.get('Content-Disposition', '')
filename = None
if content_disposition:
matches = re.findall('filename="(.+)"', content_disposition)
if matches:
filename = matches[0]
if not filename:
# If no filename in headers, use sanitized course name
filename = f"{course_name}.zip"
filename = self.sanitize_filename(filename)
filepath = os.path.join(self.download_dir, filename)
# Overwrite existing files
@ -239,3 +294,6 @@ class MoodleDownloader:
if self.driver:
logging.info("Closing the browser.")
self.driver.quit()
if self.cleanup_download_dir:
logging.info("Cleaning up temporary download directory.")
self.temp_dir.cleanup()