Course pdf extractor
parent
f21d288959
commit
2c69edd489
|
@ -1,3 +1,8 @@
|
||||||
|
# Temporary files
|
||||||
|
**/*.log
|
||||||
|
**/data
|
||||||
|
|
||||||
|
|
||||||
# IDE
|
# IDE
|
||||||
**/.idea/
|
**/.idea/
|
||||||
|
|
||||||
|
@ -41,3 +46,4 @@
|
||||||
# Latex
|
# Latex
|
||||||
!**/out/*.pdf
|
!**/out/*.pdf
|
||||||
**/auxil/*
|
**/auxil/*
|
||||||
|
|
||||||
|
|
BIN
out/main.pdf
BIN
out/main.pdf
Binary file not shown.
File diff suppressed because it is too large
Load Diff
|
@ -1,6 +0,0 @@
|
||||||
{
|
|
||||||
"devDependencies": {
|
|
||||||
"@unocss/postcss": "^0.63.4",
|
|
||||||
"unocss": "^0.63.4"
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -0,0 +1,79 @@
|
||||||
|
import os
|
||||||
|
import zipfile
|
||||||
|
import shutil
|
||||||
|
import tempfile
|
||||||
|
import subprocess
|
||||||
|
import sys
|
||||||
|
|
||||||
|
class CourseContentExtractor:
    """Unpack downloaded course ZIP archives into per-course folders.

    Every ZIP archive found in ``download_dir`` is extracted to a temporary
    directory. PDF files are copied into ``<output_dir>/<archive name>/`` and
    PowerPoint files (``.ppt``/``.pptx``) are converted to PDF with
    LibreOffice. The folder layout inside the archive is flattened, so files
    that would end up with the same name get a numeric suffix (``_1``,
    ``_2``, ...) instead of silently overwriting each other.
    """

    def __init__(self, download_dir, output_dir=None):
        """Store the source and target directories.

        Args:
            download_dir: Directory that contains the downloaded ZIP files.
            output_dir: Directory that receives one sub-folder per archive.
                Defaults to ``<current working directory>/data``.
        """
        self.download_dir = download_dir
        self.output_dir = output_dir or os.path.join(os.getcwd(), 'data')

    def extract_contents(self):
        """Process every ZIP archive in ``download_dir``.

        A successfully processed archive is deleted afterwards. An archive
        that cannot be read as a ZIP file is reported and left in place so
        that one broken download does not abort the remaining courses.

        Raises:
            FileNotFoundError: If ``download_dir`` does not exist.
        """
        os.makedirs(self.output_dir, exist_ok=True)

        # Match the extension case-insensitively, like the PDF/PPT checks,
        # and ignore directories that merely happen to be named "*.zip".
        zip_files = [
            entry for entry in os.listdir(self.download_dir)
            if entry.lower().endswith('.zip')
            and os.path.isfile(os.path.join(self.download_dir, entry))
        ]

        for filename in zip_files:
            zip_path = os.path.join(self.download_dir, filename)
            # The archive's base name doubles as the course folder name.
            course_name = os.path.splitext(filename)[0]
            course_output_dir = os.path.join(self.output_dir, course_name)

            try:
                self._extract_archive(zip_path, course_output_dir)
            except zipfile.BadZipFile as e:
                print(f"Skipping {filename}: not a valid ZIP archive. Error: {e}")
                continue

            # Delete the ZIP file only after it was processed successfully.
            os.remove(zip_path)

        print(f"All PDF and PowerPoint files have been extracted to {self.output_dir}")

    def _extract_archive(self, zip_path, course_output_dir):
        """Extract one archive and collect its PDFs and presentations."""
        with tempfile.TemporaryDirectory() as temp_extract_dir, \
                tempfile.TemporaryDirectory() as staging_dir:
            with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                zip_ref.extractall(temp_extract_dir)

            os.makedirs(course_output_dir, exist_ok=True)

            # Lower-cased PDF names already produced from THIS archive.
            # Files left over from an earlier run are still overwritten.
            used_names = set()

            for root, dirs, files in os.walk(temp_extract_dir):
                # Sort so that collision suffixes are assigned deterministically.
                dirs.sort()
                for file in sorted(files):
                    file_path = os.path.join(root, file)
                    lowered = file.lower()

                    if lowered.endswith('.pdf'):
                        target_name = self._unique_name(file, used_names)
                        shutil.copy2(
                            file_path,
                            os.path.join(course_output_dir, target_name),
                        )
                    elif lowered.endswith(('.ppt', '.pptx')):
                        stem, ext = os.path.splitext(file)
                        pdf_name = self._unique_name(stem + '.pdf', used_names)
                        unique_stem = os.path.splitext(pdf_name)[0]
                        if unique_stem != stem:
                            # LibreOffice names its output after the input
                            # file, so rename the input (in a scratch folder)
                            # to steer the output away from an existing PDF.
                            staged_path = os.path.join(staging_dir, unique_stem + ext)
                            shutil.move(file_path, staged_path)
                            file_path = staged_path
                        self.convert_ppt_to_pdf(file_path, course_output_dir)

    @staticmethod
    def _unique_name(name, used_names):
        """Return ``name``, suffixed with ``_N`` if needed, and reserve it."""
        stem, ext = os.path.splitext(name)
        candidate = name
        counter = 1
        # Compare lower-cased so the result is also safe on
        # case-insensitive file systems.
        while candidate.lower() in used_names:
            candidate = f"{stem}_{counter}{ext}"
            counter += 1
        used_names.add(candidate.lower())
        return candidate

    def convert_ppt_to_pdf(self, ppt_path, output_dir):
        """Convert a PowerPoint file to PDF using headless LibreOffice.

        If LibreOffice is not installed or the conversion fails, the original
        presentation is copied to ``output_dir`` instead so no material is
        lost.

        Args:
            ppt_path: Path of the ``.ppt``/``.pptx`` file to convert.
            output_dir: Directory that receives the PDF (or the fallback copy).
        """
        # Platform default launcher name; also used in the error message.
        if sys.platform.startswith('win'):
            office_executable = 'soffice'
        else:
            office_executable = 'libreoffice'

        # Some installations only provide the other launcher name, so fall
        # back to whichever one is actually available on the PATH.
        if shutil.which(office_executable) is None:
            for alternative in ('soffice', 'libreoffice'):
                if shutil.which(alternative) is not None:
                    office_executable = alternative
                    break

        command = [
            office_executable,
            '--headless',
            '--convert-to', 'pdf',
            '--outdir', output_dir,
            ppt_path,
        ]

        try:
            subprocess.run(command, check=True)
        except subprocess.CalledProcessError as e:
            print(f"Failed to convert {os.path.basename(ppt_path)} to PDF. Error: {e}")
        except FileNotFoundError:
            print(f"{office_executable} is not installed or not found in the system path.")
        else:
            print(f"Converted {os.path.basename(ppt_path)} to PDF.")
            return

        # Conversion was not possible: keep the original PPT/PPTX file.
        shutil.copy2(ppt_path, output_dir)
|
|
@ -1,5 +1,8 @@
|
||||||
|
# main.py
|
||||||
|
|
||||||
import logging
|
import logging
|
||||||
from moodle_downloader import MoodleDownloader
|
from moodle_downloader import MoodleDownloader
|
||||||
|
from course_content_extractor import CourseContentExtractor
|
||||||
import os
|
import os
|
||||||
|
|
||||||
# Configure logging
|
# Configure logging
|
||||||
|
@ -29,6 +32,10 @@ try:
|
||||||
|
|
||||||
# Download all courses
|
# Download all courses
|
||||||
downloader.download_all_courses()
|
downloader.download_all_courses()
|
||||||
|
|
||||||
|
# Extract course contents using the updated class
|
||||||
|
extractor = CourseContentExtractor(downloader.download_dir)
|
||||||
|
extractor.extract_contents()
|
||||||
finally:
|
finally:
|
||||||
# Close the browser
|
# Close the browser
|
||||||
downloader.close()
|
downloader.close()
|
||||||
|
|
|
@ -1,7 +1,9 @@
|
||||||
import os
|
import os
|
||||||
|
import re
|
||||||
import time
|
import time
|
||||||
import logging
|
import logging
|
||||||
import requests
|
import requests
|
||||||
|
import unicodedata
|
||||||
from selenium import webdriver
|
from selenium import webdriver
|
||||||
from selenium.webdriver.chrome.options import Options
|
from selenium.webdriver.chrome.options import Options
|
||||||
from selenium.webdriver.common.by import By
|
from selenium.webdriver.common.by import By
|
||||||
|
@ -11,12 +13,20 @@ from selenium.webdriver.support import expected_conditions as EC
|
||||||
from selenium.common.exceptions import TimeoutException
|
from selenium.common.exceptions import TimeoutException
|
||||||
from selenium.webdriver.chrome.service import Service as ChromeService
|
from selenium.webdriver.chrome.service import Service as ChromeService
|
||||||
from webdriver_manager.chrome import ChromeDriverManager
|
from webdriver_manager.chrome import ChromeDriverManager
|
||||||
|
import tempfile
|
||||||
|
|
||||||
class MoodleDownloader:
|
class MoodleDownloader:
|
||||||
def __init__(self, username, password, download_dir=None, headless=False):
|
def __init__(self, username, password, download_dir=None, headless=False):
|
||||||
self.username = username
|
self.username = username
|
||||||
self.password = password
|
self.password = password
|
||||||
self.download_dir = download_dir or os.path.join(os.getcwd(), 'downloads')
|
if download_dir:
|
||||||
|
self.download_dir = download_dir
|
||||||
|
self.cleanup_download_dir = False
|
||||||
|
else:
|
||||||
|
# Create a unique temporary directory
|
||||||
|
self.temp_dir = tempfile.TemporaryDirectory()
|
||||||
|
self.download_dir = self.temp_dir.name
|
||||||
|
self.cleanup_download_dir = True
|
||||||
self.headless = headless
|
self.headless = headless
|
||||||
self.driver = None
|
self.driver = None
|
||||||
self.courses = []
|
self.courses = []
|
||||||
|
@ -127,12 +137,30 @@ class MoodleDownloader:
|
||||||
|
|
||||||
logging.info(f"{len(course_elements)} courses found.")
|
logging.info(f"{len(course_elements)} courses found.")
|
||||||
|
|
||||||
|
existing_urls = set()
|
||||||
for coursename_element in course_elements:
|
for coursename_element in course_elements:
|
||||||
try:
|
try:
|
||||||
course_name = coursename_element.text.strip()
|
# Get the text content
|
||||||
|
full_text = coursename_element.text.strip()
|
||||||
|
lines = [line.strip() for line in full_text.split('\n') if line.strip()]
|
||||||
|
# Remove duplicates
|
||||||
|
unique_lines = list(dict.fromkeys(lines))
|
||||||
|
# Assume the last line is the actual course name
|
||||||
|
course_name = unique_lines[-1]
|
||||||
|
|
||||||
|
# Extract course code and term
|
||||||
|
short_name = self.extract_course_code_and_term(course_name)
|
||||||
|
|
||||||
course_url = coursename_element.get_attribute('href')
|
course_url = coursename_element.get_attribute('href')
|
||||||
self.courses.append({'CourseName': course_name, 'URL': course_url})
|
|
||||||
logging.info(f"Course found: {course_name} - {course_url}")
|
# Check for duplicates
|
||||||
|
if course_url in existing_urls:
|
||||||
|
logging.info(f"Duplicate course found: {short_name} - {course_url}")
|
||||||
|
continue
|
||||||
|
existing_urls.add(course_url)
|
||||||
|
|
||||||
|
self.courses.append({'CourseName': short_name, 'URL': course_url})
|
||||||
|
logging.info(f"Course found: {short_name} - {course_url}")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logging.warning(f"Error extracting course: {e}")
|
logging.warning(f"Error extracting course: {e}")
|
||||||
continue
|
continue
|
||||||
|
@ -144,6 +172,30 @@ class MoodleDownloader:
|
||||||
logging.error("An error occurred while retrieving courses.", exc_info=True)
|
logging.error("An error occurred while retrieving courses.", exc_info=True)
|
||||||
raise e
|
raise e
|
||||||
|
|
||||||
|
def extract_course_code_and_term(self, course_name):
    """Build a short ``<code>_<term>`` identifier from a full course name.

    The name is searched for a parenthesised course code followed by
    whitespace and a word, e.g. ``'Mathematik I (cds-401) HS24'``. Both parts
    are passed through ``sanitize_filename`` and joined with an underscore.
    When the name does not contain that pattern, the whole sanitized course
    name is returned instead.
    """
    found = re.search(r'\(([^)]+)\)\s+(\w+\d*)', course_name)

    # Guard clause: no "(code) term" pair present in the name.
    if found is None:
        return self.sanitize_filename(course_name)

    code_part, term_part = found.groups()
    safe_code = self.sanitize_filename(code_part)
    safe_term = self.sanitize_filename(term_part)
    return f"{safe_code}_{safe_term}"
|
||||||
|
|
||||||
|
def sanitize_filename(self, name, max_length=100):
    """Turn arbitrary text into a string that is safe to use as a filename.

    Steps:
      1. Decompose Unicode characters (NFKD) and drop everything that is not
         ASCII, so accented letters keep their base letter.
      2. Remove characters that are invalid in filenames, including newlines.
      3. Collapse each run of whitespace into a single underscore.
      4. Truncate to ``max_length`` characters. A short file extension is
         kept intact so that e.g. a long ``....zip`` name is still
         recognised as a ZIP archive after truncation.

    Args:
        name: The text to sanitize.
        max_length: Maximum length of the result (default 100).

    Returns:
        The sanitized string; it may be empty if nothing usable remains.
    """
    # Extensions longer than this (including the dot) are treated as part of
    # the name, e.g. the tail of a title that happens to contain a dot.
    max_ext_length = 10

    # Normalize unicode characters
    name = unicodedata.normalize('NFKD', name).encode('ASCII', 'ignore').decode('ASCII')
    # Remove invalid characters for filenames, including newlines
    sanitized = re.sub(r'[<>:"/\\|?*\n\r]+', '', name)
    # Replace whitespace runs with underscores
    sanitized = re.sub(r'[\s]+', '_', sanitized)

    if len(sanitized) <= max_length:
        return sanitized

    # Truncate the stem rather than the extension.
    stem, ext = os.path.splitext(sanitized)
    if ext and len(ext) <= max_ext_length and len(ext) < max_length:
        return stem[:max_length - len(ext)] + ext
    return sanitized[:max_length]
|
||||||
|
|
||||||
def download_all_courses(self):
|
def download_all_courses(self):
|
||||||
if not self.courses:
|
if not self.courses:
|
||||||
logging.warning("No courses to download.")
|
logging.warning("No courses to download.")
|
||||||
|
@ -155,8 +207,6 @@ class MoodleDownloader:
|
||||||
if not os.path.exists(self.download_dir):
|
if not os.path.exists(self.download_dir):
|
||||||
os.makedirs(self.download_dir)
|
os.makedirs(self.download_dir)
|
||||||
|
|
||||||
course_counter = 1
|
|
||||||
|
|
||||||
for course in self.courses:
|
for course in self.courses:
|
||||||
course_name = course['CourseName']
|
course_name = course['CourseName']
|
||||||
course_url = course['URL']
|
course_url = course['URL']
|
||||||
|
@ -183,19 +233,15 @@ class MoodleDownloader:
|
||||||
)
|
)
|
||||||
|
|
||||||
# Extract 'sesskey' and 'contextid'
|
# Extract 'sesskey' and 'contextid'
|
||||||
sesskey_input = driver.find_element(By.NAME, 'sesskey')
|
sesskey = driver.find_element(By.NAME, 'sesskey').get_attribute('value')
|
||||||
sesskey = sesskey_input.get_attribute('value')
|
contextid = driver.find_element(By.NAME, 'contextid').get_attribute('value')
|
||||||
contextid_input = driver.find_element(By.NAME, 'contextid')
|
|
||||||
contextid = contextid_input.get_attribute('value')
|
|
||||||
|
|
||||||
logging.info(f"sesskey: {sesskey}, contextid: {contextid}")
|
logging.info(f"sesskey: {sesskey}, contextid: {contextid}")
|
||||||
|
|
||||||
# Extract cookies from the Selenium session
|
# Extract cookies from the Selenium session
|
||||||
logging.info("Extracting cookies from the Selenium session.")
|
logging.info("Extracting cookies from the Selenium session.")
|
||||||
selenium_cookies = driver.get_cookies()
|
selenium_cookies = driver.get_cookies()
|
||||||
cookies = {}
|
cookies = {cookie['name']: cookie['value'] for cookie in selenium_cookies}
|
||||||
for cookie in selenium_cookies:
|
|
||||||
cookies[cookie['name']] = cookie['value']
|
|
||||||
|
|
||||||
# Prepare the HTTP POST request
|
# Prepare the HTTP POST request
|
||||||
download_url = 'https://moodle.fhgr.ch/course/downloadcontent.php'
|
download_url = 'https://moodle.fhgr.ch/course/downloadcontent.php'
|
||||||
|
@ -216,9 +262,18 @@ class MoodleDownloader:
|
||||||
response = session.post(download_url, data=post_data, headers=headers, stream=True)
|
response = session.post(download_url, data=post_data, headers=headers, stream=True)
|
||||||
response.raise_for_status()
|
response.raise_for_status()
|
||||||
|
|
||||||
# Generate filename as course_N.zip
|
# Attempt to extract filename from Content-Disposition header
|
||||||
filename = f"course_{course_counter}.zip"
|
content_disposition = response.headers.get('Content-Disposition', '')
|
||||||
course_counter += 1
|
filename = None
|
||||||
|
if content_disposition:
|
||||||
|
matches = re.findall('filename="(.+)"', content_disposition)
|
||||||
|
if matches:
|
||||||
|
filename = matches[0]
|
||||||
|
if not filename:
|
||||||
|
# If no filename in headers, use sanitized course name
|
||||||
|
filename = f"{course_name}.zip"
|
||||||
|
filename = self.sanitize_filename(filename)
|
||||||
|
|
||||||
filepath = os.path.join(self.download_dir, filename)
|
filepath = os.path.join(self.download_dir, filename)
|
||||||
|
|
||||||
# Overwrite existing files
|
# Overwrite existing files
|
||||||
|
@ -239,3 +294,6 @@ class MoodleDownloader:
|
||||||
if self.driver:
|
if self.driver:
|
||||||
logging.info("Closing the browser.")
|
logging.info("Closing the browser.")
|
||||||
self.driver.quit()
|
self.driver.quit()
|
||||||
|
if self.cleanup_download_dir:
|
||||||
|
logging.info("Cleaning up temporary download directory.")
|
||||||
|
self.temp_dir.cleanup()
|
Loading…
Reference in New Issue