Course pdf extractor
parent
f21d288959
commit
2c69edd489
|
@ -1,3 +1,8 @@
|
|||
# Temporary files
|
||||
**/*.log
|
||||
**/data
|
||||
|
||||
|
||||
# IDE
|
||||
**/.idea/
|
||||
|
||||
|
@ -41,3 +46,4 @@
|
|||
# LaTeX
|
||||
!**/out/*.pdf
|
||||
**/auxil/*
|
||||
|
||||
|
|
BIN
out/main.pdf
BIN
out/main.pdf
Binary file not shown.
File diff suppressed because it is too large
Load Diff
|
@ -1,6 +0,0 @@
|
|||
{
|
||||
"devDependencies": {
|
||||
"@unocss/postcss": "^0.63.4",
|
||||
"unocss": "^0.63.4"
|
||||
}
|
||||
}
|
|
@ -0,0 +1,79 @@
|
|||
import os
|
||||
import zipfile
|
||||
import shutil
|
||||
import tempfile
|
||||
import subprocess
|
||||
import sys
|
||||
|
||||
class CourseContentExtractor:
    """Collect PDF material from downloaded course ZIP archives.

    Every ``*.zip`` file located directly inside ``download_dir`` is unpacked
    into a temporary directory.  PDF files found anywhere in the archive are
    copied into ``<output_dir>/<archive name without extension>/`` and
    PowerPoint files are converted to PDF with LibreOffice.  Archives that
    were processed are deleted afterwards; unreadable archives are left in
    place.
    """

    def __init__(self, download_dir, output_dir=None):
        """Remember the source and destination directories.

        Args:
            download_dir: Directory that contains the course ZIP archives.
            output_dir: Directory that receives one sub-folder per archive.
                Defaults to ``data`` inside the current working directory.
        """
        self.download_dir = download_dir
        self.output_dir = output_dir or os.path.join(os.getcwd(), 'data')

    def extract_contents(self):
        """Process every ZIP archive in ``download_dir``.

        Raises:
            FileNotFoundError: If ``download_dir`` does not exist.
        """
        os.makedirs(self.output_dir, exist_ok=True)

        # Match the extension case-insensitively, like the PDF/PPT checks below.
        zip_files = sorted(
            f for f in os.listdir(self.download_dir) if f.lower().endswith('.zip')
        )
        for filename in zip_files:
            zip_path = os.path.join(self.download_dir, filename)
            # The archive's base name doubles as the course folder name.
            course_name = os.path.splitext(filename)[0]
            course_output_dir = os.path.join(self.output_dir, course_name)

            with tempfile.TemporaryDirectory() as temp_extract_dir, \
                    tempfile.TemporaryDirectory() as convert_dir:
                try:
                    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                        zip_ref.extractall(temp_extract_dir)
                except zipfile.BadZipFile as e:
                    # One broken download must not abort the remaining
                    # archives; keep the file so it can be inspected.
                    print(f"Skipping {filename}: not a valid ZIP archive. Error: {e}")
                    continue

                os.makedirs(course_output_dir, exist_ok=True)

                # Names already written for THIS archive.  Files from earlier
                # runs are still overwritten, but two files with the same name
                # in different sub-folders no longer clobber each other.
                used_names = set()

                for root, dirs, files in os.walk(temp_extract_dir):
                    dirs.sort()  # deterministic traversal => stable suffixes
                    for file in sorted(files):
                        file_path = os.path.join(root, file)
                        lowered = file.lower()
                        if lowered.endswith('.pdf'):
                            target = self._claim_name(used_names, file)
                            shutil.copy2(file_path, os.path.join(course_output_dir, target))
                        elif lowered.endswith(('.ppt', '.pptx')):
                            # Convert into a scratch folder first so the
                            # result can be renamed on a name collision.
                            self.convert_ppt_to_pdf(file_path, convert_dir)
                            for produced in sorted(os.listdir(convert_dir)):
                                produced_path = os.path.join(convert_dir, produced)
                                target = self._claim_name(used_names, produced)
                                shutil.copy2(produced_path, os.path.join(course_output_dir, target))
                                os.remove(produced_path)

            # Delete the ZIP file after processing
            os.remove(zip_path)

        print(f"All PDF and PowerPoint files have been extracted to {self.output_dir}")

    @staticmethod
    def _claim_name(used_names, filename):
        """Return ``filename`` or a ``_N``-suffixed variant not yet in ``used_names``.

        The chosen name is recorded in ``used_names`` (compared
        case-insensitively so the result is also safe on case-insensitive
        file systems).
        """
        stem, ext = os.path.splitext(filename)
        candidate = filename
        counter = 1
        while candidate.casefold() in used_names:
            candidate = f"{stem}_{counter}{ext}"
            counter += 1
        used_names.add(candidate.casefold())
        return candidate

    def convert_ppt_to_pdf(self, ppt_path, output_dir):
        """Convert one PowerPoint file to PDF using LibreOffice in headless mode.

        If the conversion fails or LibreOffice cannot be started, the original
        presentation is copied to ``output_dir`` instead.

        Args:
            ppt_path: Path of the ``.ppt``/``.pptx`` file.
            output_dir: Directory that receives the PDF (or the fallback copy).
        """
        # Prefer the platform's usual launcher name, but accept the other one
        # when only that is on PATH (e.g. installs that ship just "soffice").
        if sys.platform.startswith('win'):
            candidates = ('soffice', 'libreoffice')
        else:
            candidates = ('libreoffice', 'soffice')
        office_executable = next(
            (name for name in candidates if shutil.which(name)),
            candidates[0],
        )

        command = [
            office_executable,
            '--headless',
            '--convert-to', 'pdf',
            '--outdir', output_dir,
            ppt_path,
        ]
        try:
            subprocess.run(command, check=True)
            print(f"Converted {os.path.basename(ppt_path)} to PDF.")
        except subprocess.CalledProcessError as e:
            print(f"Failed to convert {os.path.basename(ppt_path)} to PDF. Error: {e}")
            # Keep the original presentation so no material is lost.
            shutil.copy2(ppt_path, output_dir)
        except FileNotFoundError:
            print(f"{office_executable} is not installed or not found in the system path.")
            # Keep the original presentation so no material is lost.
            shutil.copy2(ppt_path, output_dir)
|
|
@ -1,5 +1,8 @@
|
|||
# main.py
|
||||
|
||||
import logging
|
||||
from moodle_downloader import MoodleDownloader
|
||||
from course_content_extractor import CourseContentExtractor
|
||||
import os
|
||||
|
||||
# Configure logging
|
||||
|
@ -29,6 +32,10 @@ try:
|
|||
|
||||
# Download all courses
|
||||
downloader.download_all_courses()
|
||||
|
||||
# Extract course contents using the updated class
|
||||
extractor = CourseContentExtractor(downloader.download_dir)
|
||||
extractor.extract_contents()
|
||||
finally:
|
||||
# Close the browser
|
||||
downloader.close()
|
||||
|
|
|
@ -1,7 +1,9 @@
|
|||
import os
|
||||
import re
|
||||
import time
|
||||
import logging
|
||||
import requests
|
||||
import unicodedata
|
||||
from selenium import webdriver
|
||||
from selenium.webdriver.chrome.options import Options
|
||||
from selenium.webdriver.common.by import By
|
||||
|
@ -11,12 +13,20 @@ from selenium.webdriver.support import expected_conditions as EC
|
|||
from selenium.common.exceptions import TimeoutException
|
||||
from selenium.webdriver.chrome.service import Service as ChromeService
|
||||
from webdriver_manager.chrome import ChromeDriverManager
|
||||
import tempfile
|
||||
|
||||
class MoodleDownloader:
|
||||
def __init__(self, username, password, download_dir=None, headless=False):
|
||||
self.username = username
|
||||
self.password = password
|
||||
self.download_dir = download_dir or os.path.join(os.getcwd(), 'downloads')
|
||||
if download_dir:
|
||||
self.download_dir = download_dir
|
||||
self.cleanup_download_dir = False
|
||||
else:
|
||||
# Create a unique temporary directory
|
||||
self.temp_dir = tempfile.TemporaryDirectory()
|
||||
self.download_dir = self.temp_dir.name
|
||||
self.cleanup_download_dir = True
|
||||
self.headless = headless
|
||||
self.driver = None
|
||||
self.courses = []
|
||||
|
@ -127,12 +137,30 @@ class MoodleDownloader:
|
|||
|
||||
logging.info(f"{len(course_elements)} courses found.")
|
||||
|
||||
existing_urls = set()
|
||||
for coursename_element in course_elements:
|
||||
try:
|
||||
course_name = coursename_element.text.strip()
|
||||
# Get the text content
|
||||
full_text = coursename_element.text.strip()
|
||||
lines = [line.strip() for line in full_text.split('\n') if line.strip()]
|
||||
# Remove duplicates
|
||||
unique_lines = list(dict.fromkeys(lines))
|
||||
# Assume the last line is the actual course name
|
||||
course_name = unique_lines[-1]
|
||||
|
||||
# Extract course code and term
|
||||
short_name = self.extract_course_code_and_term(course_name)
|
||||
|
||||
course_url = coursename_element.get_attribute('href')
|
||||
self.courses.append({'CourseName': course_name, 'URL': course_url})
|
||||
logging.info(f"Course found: {course_name} - {course_url}")
|
||||
|
||||
# Check for duplicates
|
||||
if course_url in existing_urls:
|
||||
logging.info(f"Duplicate course found: {short_name} - {course_url}")
|
||||
continue
|
||||
existing_urls.add(course_url)
|
||||
|
||||
self.courses.append({'CourseName': short_name, 'URL': course_url})
|
||||
logging.info(f"Course found: {short_name} - {course_url}")
|
||||
except Exception as e:
|
||||
logging.warning(f"Error extracting course: {e}")
|
||||
continue
|
||||
|
@ -144,6 +172,30 @@ class MoodleDownloader:
|
|||
logging.error("An error occurred while retrieving courses.", exc_info=True)
|
||||
raise e
|
||||
|
||||
def extract_course_code_and_term(self, course_name):
|
||||
# Regular expression to match course code and term
|
||||
# Example course name: 'Mathematik I (cds-401) HS24'
|
||||
pattern = r'\(([^)]+)\)\s+(\w+\d*)'
|
||||
match = re.search(pattern, course_name)
|
||||
if match:
|
||||
course_code = match.group(1)
|
||||
term = match.group(2)
|
||||
# Sanitize and return
|
||||
return f"{self.sanitize_filename(course_code)}_{self.sanitize_filename(term)}"
|
||||
else:
|
||||
# If pattern doesn't match, return sanitized course name
|
||||
return self.sanitize_filename(course_name)
|
||||
|
||||
def sanitize_filename(self, name, max_length=100):
    """Turn arbitrary text into a conservative ASCII file name.

    Args:
        name: Text to clean up.
        max_length: Maximum length of the result (default 100).

    Returns:
        The cleaned name, at most ``max_length`` characters long.  When the
        name has to be shortened, a short trailing extension such as
        ``.zip`` is kept and the stem is shortened instead, so the file can
        still be recognised by its extension afterwards.
    """
    # Decompose accented characters and drop everything that is not ASCII.
    name = unicodedata.normalize('NFKD', name).encode('ASCII', 'ignore').decode('ASCII')
    # Remove invalid characters for filenames, including newlines
    sanitized = re.sub(r'[<>:"/\\|?*\n\r]+', '', name)
    # Collapse every remaining run of whitespace into one underscore.
    sanitized = re.sub(r'[\s]+', '_', sanitized)

    if len(sanitized) <= max_length:
        return sanitized

    # A blind cut would chop off the extension; keep it when it is short
    # enough to plausibly be a real extension.
    stem, ext = os.path.splitext(sanitized)
    if ext and len(ext) <= 10 and len(ext) < max_length:
        return stem[:max_length - len(ext)] + ext
    return sanitized[:max_length]
|
||||
|
||||
def download_all_courses(self):
|
||||
if not self.courses:
|
||||
logging.warning("No courses to download.")
|
||||
|
@ -155,8 +207,6 @@ class MoodleDownloader:
|
|||
if not os.path.exists(self.download_dir):
|
||||
os.makedirs(self.download_dir)
|
||||
|
||||
course_counter = 1
|
||||
|
||||
for course in self.courses:
|
||||
course_name = course['CourseName']
|
||||
course_url = course['URL']
|
||||
|
@ -183,19 +233,15 @@ class MoodleDownloader:
|
|||
)
|
||||
|
||||
# Extract 'sesskey' and 'contextid'
|
||||
sesskey_input = driver.find_element(By.NAME, 'sesskey')
|
||||
sesskey = sesskey_input.get_attribute('value')
|
||||
contextid_input = driver.find_element(By.NAME, 'contextid')
|
||||
contextid = contextid_input.get_attribute('value')
|
||||
sesskey = driver.find_element(By.NAME, 'sesskey').get_attribute('value')
|
||||
contextid = driver.find_element(By.NAME, 'contextid').get_attribute('value')
|
||||
|
||||
logging.info(f"sesskey: {sesskey}, contextid: {contextid}")
|
||||
|
||||
# Extract cookies from the Selenium session
|
||||
logging.info("Extracting cookies from the Selenium session.")
|
||||
selenium_cookies = driver.get_cookies()
|
||||
cookies = {}
|
||||
for cookie in selenium_cookies:
|
||||
cookies[cookie['name']] = cookie['value']
|
||||
cookies = {cookie['name']: cookie['value'] for cookie in selenium_cookies}
|
||||
|
||||
# Prepare the HTTP POST request
|
||||
download_url = 'https://moodle.fhgr.ch/course/downloadcontent.php'
|
||||
|
@ -216,9 +262,18 @@ class MoodleDownloader:
|
|||
response = session.post(download_url, data=post_data, headers=headers, stream=True)
|
||||
response.raise_for_status()
|
||||
|
||||
# Generate filename as course_N.zip
|
||||
filename = f"course_{course_counter}.zip"
|
||||
course_counter += 1
|
||||
# Attempt to extract filename from Content-Disposition header
|
||||
content_disposition = response.headers.get('Content-Disposition', '')
|
||||
filename = None
|
||||
if content_disposition:
|
||||
matches = re.findall('filename="(.+)"', content_disposition)
|
||||
if matches:
|
||||
filename = matches[0]
|
||||
if not filename:
|
||||
# If no filename in headers, use sanitized course name
|
||||
filename = f"{course_name}.zip"
|
||||
filename = self.sanitize_filename(filename)
|
||||
|
||||
filepath = os.path.join(self.download_dir, filename)
|
||||
|
||||
# Overwrite existing files
|
||||
|
@ -239,3 +294,6 @@ class MoodleDownloader:
|
|||
if self.driver:
|
||||
logging.info("Closing the browser.")
|
||||
self.driver.quit()
|
||||
if self.cleanup_download_dir:
|
||||
logging.info("Cleaning up temporary download directory.")
|
||||
self.temp_dir.cleanup()
|
Loading…
Reference in New Issue