Course pdf extractor

main
Oliver Schütz 2024-10-26 20:33:00 +02:00
parent f21d288959
commit 2c69edd489
7 changed files with 166 additions and 2560 deletions

6
.gitignore vendored
View File

@ -1,3 +1,8 @@
# Temporary files
**/*.log
**/data
# IDE # IDE
**/.idea/ **/.idea/
@ -41,3 +46,4 @@
# Latex # Latex
!**/out/*.pdf !**/out/*.pdf
**/auxil/* **/auxil/*

Binary file not shown.

2538
package-lock.json generated

File diff suppressed because it is too large Load Diff

View File

@ -1,6 +0,0 @@
{
"devDependencies": {
"@unocss/postcss": "^0.63.4",
"unocss": "^0.63.4"
}
}

View File

@ -0,0 +1,79 @@
import os
import zipfile
import shutil
import tempfile
import subprocess
import sys
class CourseContentExtractor:
    """Unpack downloaded course ZIP archives and collect their slides as PDFs.

    Every ZIP file found in ``download_dir`` is extracted into a temporary
    directory. PDF files are copied, and PowerPoint files are converted to
    PDF with LibreOffice, into ``<output_dir>/<zip base name>/``. Archives
    that were processed successfully are deleted from ``download_dir``.
    """

    def __init__(self, download_dir, output_dir=None):
        """Store the source and target directories.

        Args:
            download_dir: Directory that contains the downloaded ZIP archives.
            output_dir: Directory that receives one sub-folder per archive.
                Defaults to ``<current working directory>/data``.
        """
        self.download_dir = download_dir
        self.output_dir = output_dir or os.path.join(os.getcwd(), 'data')

    def extract_contents(self):
        """Process every ZIP archive in ``download_dir``.

        An archive that cannot be read as a ZIP file is reported and left in
        place; the remaining archives are still processed. All other archives
        are removed once their contents have been collected.
        """
        os.makedirs(self.output_dir, exist_ok=True)

        # Match the extension case-insensitively, like the PDF/PPT checks do.
        zip_files = [
            f for f in os.listdir(self.download_dir)
            if f.lower().endswith('.zip')
        ]

        for filename in zip_files:
            zip_path = os.path.join(self.download_dir, filename)
            # The archive's base name doubles as the course folder name.
            course_name = os.path.splitext(filename)[0]
            course_output_dir = os.path.join(self.output_dir, course_name)

            try:
                self._extract_archive(zip_path, course_output_dir)
            except zipfile.BadZipFile as e:
                # One corrupt download must not abort the whole run; keep the
                # file so it can be inspected or downloaded again.
                print(f"Skipping {filename}: not a valid ZIP archive. Error: {e}")
                continue

            # Delete the ZIP file after processing
            os.remove(zip_path)

        print(f"All PDF and PowerPoint files have been extracted to {self.output_dir}")

    def _extract_archive(self, zip_path, course_output_dir):
        """Extract one archive and collect its PDFs/presentations.

        Raises:
            zipfile.BadZipFile: If ``zip_path`` is not a readable ZIP archive.
        """
        with tempfile.TemporaryDirectory() as temp_extract_dir:
            with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                zip_ref.extractall(temp_extract_dir)

            # Created only after a successful extraction so that corrupt
            # archives do not leave empty course folders behind.
            os.makedirs(course_output_dir, exist_ok=True)

            # Names already written for THIS archive. Files left over from an
            # earlier run are still overwritten, exactly as before.
            used_names = set()

            for root, dirs, files in os.walk(temp_extract_dir):
                # Sorted traversal keeps the renaming of duplicates stable
                # from one run to the next.
                dirs.sort()
                for file in sorted(files):
                    file_path = os.path.join(root, file)
                    lowered = file.lower()
                    if lowered.endswith('.pdf'):
                        # The output folder is flat, so equally named PDFs
                        # from different sub-folders need distinct names.
                        target_name = self._claim_name(file, used_names)
                        shutil.copy2(
                            file_path,
                            os.path.join(course_output_dir, target_name),
                        )
                    elif lowered.endswith(('.ppt', '.pptx')):
                        # NOTE: LibreOffice picks the output name itself, so
                        # converted files are not de-duplicated here.
                        self.convert_ppt_to_pdf(file_path, course_output_dir)

    @staticmethod
    def _claim_name(filename, used_names):
        """Return ``filename``, or a ``_N``-suffixed variant, unused so far.

        The chosen name is recorded in ``used_names``. Comparison goes through
        ``os.path.normcase`` so that names differing only in case are treated
        as equal on platforms where the file system would do the same.
        """
        stem, ext = os.path.splitext(filename)
        candidate = filename
        counter = 1
        while os.path.normcase(candidate) in used_names:
            candidate = f"{stem}_{counter}{ext}"
            counter += 1
        used_names.add(os.path.normcase(candidate))
        return candidate

    def convert_ppt_to_pdf(self, ppt_path, output_dir):
        """Convert a PowerPoint file to PDF using headless LibreOffice.

        If LibreOffice is missing or the conversion fails, the original
        presentation is copied to ``output_dir`` instead so no content is lost.

        Args:
            ppt_path: Path of the ``.ppt``/``.pptx`` file to convert.
            output_dir: Directory that receives the PDF (or the fallback copy).
        """
        # The executable is called 'soffice' on Windows and 'libreoffice'
        # on Linux and other systems.
        if sys.platform.startswith('win'):
            office_executable = 'soffice'
        else:
            office_executable = 'libreoffice'

        command = [
            office_executable,
            '--headless',
            '--convert-to', 'pdf',
            '--outdir', output_dir,
            ppt_path,
        ]

        try:
            subprocess.run(command, check=True)
        except subprocess.CalledProcessError as e:
            print(f"Failed to convert {os.path.basename(ppt_path)} to PDF. Error: {e}")
        except FileNotFoundError:
            print(f"{office_executable} is not installed or not found in the system path.")
        else:
            print(f"Converted {os.path.basename(ppt_path)} to PDF.")
            return

        # Conversion did not succeed: fall back to the original PPT/PPTX file.
        shutil.copy2(ppt_path, output_dir)

View File

@ -1,5 +1,8 @@
# main.py
import logging import logging
from moodle_downloader import MoodleDownloader from moodle_downloader import MoodleDownloader
from course_content_extractor import CourseContentExtractor
import os import os
# Configure logging # Configure logging
@ -29,6 +32,10 @@ try:
# Download all courses # Download all courses
downloader.download_all_courses() downloader.download_all_courses()
# Extract course contents using the updated class
extractor = CourseContentExtractor(downloader.download_dir)
extractor.extract_contents()
finally: finally:
# Close the browser # Close the browser
downloader.close() downloader.close()

View File

@ -1,7 +1,9 @@
import os import os
import re
import time import time
import logging import logging
import requests import requests
import unicodedata
from selenium import webdriver from selenium import webdriver
from selenium.webdriver.chrome.options import Options from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By from selenium.webdriver.common.by import By
@ -11,12 +13,20 @@ from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.service import Service as ChromeService from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager from webdriver_manager.chrome import ChromeDriverManager
import tempfile
class MoodleDownloader: class MoodleDownloader:
def __init__(self, username, password, download_dir=None, headless=False): def __init__(self, username, password, download_dir=None, headless=False):
self.username = username self.username = username
self.password = password self.password = password
self.download_dir = download_dir or os.path.join(os.getcwd(), 'downloads') if download_dir:
self.download_dir = download_dir
self.cleanup_download_dir = False
else:
# Create a unique temporary directory
self.temp_dir = tempfile.TemporaryDirectory()
self.download_dir = self.temp_dir.name
self.cleanup_download_dir = True
self.headless = headless self.headless = headless
self.driver = None self.driver = None
self.courses = [] self.courses = []
@ -127,12 +137,30 @@ class MoodleDownloader:
logging.info(f"{len(course_elements)} courses found.") logging.info(f"{len(course_elements)} courses found.")
existing_urls = set()
for coursename_element in course_elements: for coursename_element in course_elements:
try: try:
course_name = coursename_element.text.strip() # Get the text content
full_text = coursename_element.text.strip()
lines = [line.strip() for line in full_text.split('\n') if line.strip()]
# Remove duplicates
unique_lines = list(dict.fromkeys(lines))
# Assume the last line is the actual course name
course_name = unique_lines[-1]
# Extract course code and term
short_name = self.extract_course_code_and_term(course_name)
course_url = coursename_element.get_attribute('href') course_url = coursename_element.get_attribute('href')
self.courses.append({'CourseName': course_name, 'URL': course_url})
logging.info(f"Course found: {course_name} - {course_url}") # Check for duplicates
if course_url in existing_urls:
logging.info(f"Duplicate course found: {short_name} - {course_url}")
continue
existing_urls.add(course_url)
self.courses.append({'CourseName': short_name, 'URL': course_url})
logging.info(f"Course found: {short_name} - {course_url}")
except Exception as e: except Exception as e:
logging.warning(f"Error extracting course: {e}") logging.warning(f"Error extracting course: {e}")
continue continue
@ -144,6 +172,30 @@ class MoodleDownloader:
logging.error("An error occurred while retrieving courses.", exc_info=True) logging.error("An error occurred while retrieving courses.", exc_info=True)
raise e raise e
def extract_course_code_and_term(self, course_name):
# Regular expression to match course code and term
# Example course name: 'Mathematik I (cds-401) HS24'
pattern = r'\(([^)]+)\)\s+(\w+\d*)'
match = re.search(pattern, course_name)
if match:
course_code = match.group(1)
term = match.group(2)
# Sanitize and return
return f"{self.sanitize_filename(course_code)}_{self.sanitize_filename(term)}"
else:
# If pattern doesn't match, return sanitized course name
return self.sanitize_filename(course_name)
def sanitize_filename(self, name, max_length=100):
    """Turn an arbitrary string into a safe, ASCII-only file name.

    Steps: decompose Unicode (NFKD) and drop everything that is not ASCII,
    remove the characters ``< > : " / \\ | ? *`` as well as line breaks,
    collapse whitespace runs into single underscores, and limit the length.

    Args:
        name: The raw name, e.g. a course title or a download file name.
        max_length: Maximum length of the result; expected to be positive.

    Returns:
        The sanitized name. A short file extension (at most 10 characters
        including the dot) survives truncation, so ``'<long name>.zip'``
        still ends in ``.zip``. If nothing is left after sanitizing,
        ``'unnamed'`` is returned so callers never receive an empty name.
    """
    # Normalize unicode characters
    ascii_name = unicodedata.normalize('NFKD', name).encode('ASCII', 'ignore').decode('ASCII')
    # Remove invalid characters for filenames, including newlines
    sanitized = re.sub(r'[<>:"/\\|?*\n\r]+', '', ascii_name)
    # Replace whitespace runs with underscores
    sanitized = re.sub(r'[\s]+', '_', sanitized)

    if not sanitized:
        # An empty name would make os.path.join() point at the directory.
        return 'unnamed'

    if len(sanitized) <= max_length:
        return sanitized

    # Truncate the stem rather than the tail, so that a real extension
    # such as '.zip' is not cut off. Long "extensions" are just dots inside
    # a title and get no special treatment.
    stem, ext = os.path.splitext(sanitized)
    if 0 < len(ext) <= 10 and len(ext) < max_length:
        return stem[:max_length - len(ext)] + ext
    return sanitized[:max_length]
def download_all_courses(self): def download_all_courses(self):
if not self.courses: if not self.courses:
logging.warning("No courses to download.") logging.warning("No courses to download.")
@ -155,8 +207,6 @@ class MoodleDownloader:
if not os.path.exists(self.download_dir): if not os.path.exists(self.download_dir):
os.makedirs(self.download_dir) os.makedirs(self.download_dir)
course_counter = 1
for course in self.courses: for course in self.courses:
course_name = course['CourseName'] course_name = course['CourseName']
course_url = course['URL'] course_url = course['URL']
@ -183,19 +233,15 @@ class MoodleDownloader:
) )
# Extract 'sesskey' and 'contextid' # Extract 'sesskey' and 'contextid'
sesskey_input = driver.find_element(By.NAME, 'sesskey') sesskey = driver.find_element(By.NAME, 'sesskey').get_attribute('value')
sesskey = sesskey_input.get_attribute('value') contextid = driver.find_element(By.NAME, 'contextid').get_attribute('value')
contextid_input = driver.find_element(By.NAME, 'contextid')
contextid = contextid_input.get_attribute('value')
logging.info(f"sesskey: {sesskey}, contextid: {contextid}") logging.info(f"sesskey: {sesskey}, contextid: {contextid}")
# Extract cookies from the Selenium session # Extract cookies from the Selenium session
logging.info("Extracting cookies from the Selenium session.") logging.info("Extracting cookies from the Selenium session.")
selenium_cookies = driver.get_cookies() selenium_cookies = driver.get_cookies()
cookies = {} cookies = {cookie['name']: cookie['value'] for cookie in selenium_cookies}
for cookie in selenium_cookies:
cookies[cookie['name']] = cookie['value']
# Prepare the HTTP POST request # Prepare the HTTP POST request
download_url = 'https://moodle.fhgr.ch/course/downloadcontent.php' download_url = 'https://moodle.fhgr.ch/course/downloadcontent.php'
@ -216,9 +262,18 @@ class MoodleDownloader:
response = session.post(download_url, data=post_data, headers=headers, stream=True) response = session.post(download_url, data=post_data, headers=headers, stream=True)
response.raise_for_status() response.raise_for_status()
# Generate filename as course_N.zip # Attempt to extract filename from Content-Disposition header
filename = f"course_{course_counter}.zip" content_disposition = response.headers.get('Content-Disposition', '')
course_counter += 1 filename = None
if content_disposition:
matches = re.findall('filename="(.+)"', content_disposition)
if matches:
filename = matches[0]
if not filename:
# If no filename in headers, use sanitized course name
filename = f"{course_name}.zip"
filename = self.sanitize_filename(filename)
filepath = os.path.join(self.download_dir, filename) filepath = os.path.join(self.download_dir, filename)
# Overwrite existing files # Overwrite existing files
@ -239,3 +294,6 @@ class MoodleDownloader:
if self.driver: if self.driver:
logging.info("Closing the browser.") logging.info("Closing the browser.")
self.driver.quit() self.driver.quit()
if self.cleanup_download_dir:
logging.info("Cleaning up temporary download directory.")
self.temp_dir.cleanup()