Subtitle normalization.

This commit is contained in:
Giò Diani 2024-11-09 12:08:25 +01:00
parent fe92ba58b3
commit d7588ef2d6
16 changed files with 14681 additions and 4060 deletions

.editorconfig Normal file
View File

@ -0,0 +1,26 @@
# EditorConfig is awesome: https://EditorConfig.org
# top-most EditorConfig file
root = true
# Unix-style newlines with a newline ending every file
end_of_line = lf
insert_final_newline = true
trim_trailing_whitespace = true
charset = utf-8
# 4 space indentation
indent_style = space
indent_size = 4
# 2 space indentation
indent_style = space
indent_size = 2
trim_trailing_whitespace = false
indent_style = space
indent_size = 2

VANA-php/app/Helpers.php Normal file
View File

@ -0,0 +1,30 @@
namespace App;
// Small collection of static formatting helpers.
// NOTE(review): braces and docblock delimiters are missing from this excerpt.
class Helpers
* Create a new class instance.
public function __construct()
// Prefix values below 10 with the string '0'; any other value is returned unchanged.
public static function leadingZero($number)
return $number < 10 ? '0'.$number : $number;
// Format a duration given in seconds as "hours:minutes".
public static function secondsToTimecode(float $seconds){
$hours = floor($seconds / 3600);
// NOTE(review): PHP's % operator converts its operands to int, so the
// fractional part of $seconds is discarded before the modulo is taken.
$minutes = floor(($seconds % 3600) / 60);
// NOTE(review): neither part is zero-padded (leadingZero() is not used here),
// and "${var}" string interpolation is deprecated as of PHP 8.2 - confirm intent.
return "${hours}:${minutes}";

VANA-python/database/.gitattributes vendored Normal file
View File

@ -0,0 +1,2 @@
# GitHub syntax highlighting
pixi.lock linguist-language=YAML linguist-generated=true

VANA-python/database/.gitignore vendored Normal file
View File

@ -0,0 +1,4 @@
# pixi environments

View File

@ -0,0 +1,20 @@
authors = [{name = "Giò Diani", email = ""}]
dependencies = []
description = "Add a short description here"
name = "database"
requires-python = ">= 3.11"
version = "0.1.0"
build-backend = ""
requires = ["hatchling"]
channels = ["conda-forge"]
platforms = ["linux-64"]
database = { path = ".", editable = true }

View File

@ -0,0 +1,2 @@
# GitHub syntax highlighting
pixi.lock linguist-language=YAML linguist-generated=true

View File

@ -0,0 +1,4 @@
# pixi environments

VANA-python/normalize_subtitles/pixi.lock generated Normal file

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,27 @@
authors = [{name = "Giò Diani", email = ""}]
dependencies = ["webvtt-py>=0.5.1,<0.6", "spacy-llm>=0.7.2,<0.8"]
description = "Normalisierung der Untertitel."
name = "normalize_subtitles"
requires-python = ">= 3.11"
version = "0.1.0"
build-backend = ""
requires = ["hatchling"]
channels = ["conda-forge"]
platforms = ["win-64", "linux-64", "osx-64"]
normalize_subtitles = { path = ".", editable = true }
spacy = ">=3.7.6,<4"
setuptools = ">=75.1.0,<76"
wheel = ">=0.44.0,<0.45"
pip = ">=24.2,<25"
polars = ">=1.12.0,<2"

View File

@ -0,0 +1,82 @@
import re
import polars as pl
import spacy
import timecode as tc
import webvtt
# Normalizes subtitle captions: cleans the caption text and regroups it by sentence.
# NOTE(review): this excerpt has lost its indentation, its docstring quotes and
# several statements; the comments below describe only what is visible.
class NormalizeVtt:
def __init__(self):
# Load the spaCy pipeline "de_core_news_sm"; it is used to tokenize caption text.
self.nlp = spacy.load("de_core_news_sm")
# The subtitle timecodes are adjusted so that each block corresponds to one sentence.
def sentencize(self, vtt):
# NOTE(review): the right-hand side of this assignment is missing in this excerpt.
captions =
sentences = []
times_start = []
times_end = []
text = ""
duplicate_count = 0
for index, c in enumerate(captions):
# The next two lines look like the remains of a docstring. English:
# "Creates new timecodes when splitting the captions results in
# new captions with an identical timecode."
Erstellt neue Timecodes, wenn die Aufteilung der Captions
in neuen Captions mit identischem Timecode resultiert.
if duplicate_count >= 2:
# Divide the span of the most recent entry into duplicate_count equal slices.
time_difference = (times_end[-1] - times_start[-1]) / duplicate_count
i = duplicate_count
# Walk the last duplicate_count entries (indices -i .. -1), shifting each
# start by a multiple of the slice and setting its end one slice later.
while i > 0:
times_start[0 - i] = times_start[0 - i] + (
time_difference * (duplicate_count - i)
times_end[0 - i] = times_start[0 - i] + time_difference
i = i - 1
duplicate_count = 0
c.text = re.sub(r"\n", " ", c.text) # Replace line breaks with a space
c.text = re.sub(
r"Mit Live-Untertiteln von SWISS TXT", "", c.text
) # Remove the sentence that always appears at the start of the broadcast.
c.text = re.sub(r"%", "Prozent", c.text) # Replace "%" with "Prozent"
c.text = re.sub(r"[\.]{3}", "", c.text) # Remove ellipses ("...")
c.text = re.sub(r"[\"]", "", c.text) # Remove double quotation marks
c.text = re.sub(r"\s\-\s", "", c.text) # Remove parenthetical dashes (together with the surrounding whitespace)
c.text = re.sub(r"(\-\s)", "", c.text) # Remove hyphens at line breaks
c.text = re.sub(
r"\([A-zäöüÄÖÜ]+\)", "", c.text
) # Remove bracketed remarks, e.g. "(Sandro Brotz)"
# NOTE(review): the character class above contains no whitespace, so a two-word
# remark such as "(Sandro Brotz)" would not match; the range A-z also covers the
# ASCII punctuation between "Z" and "a". Confirm the intended pattern.
c.text = re.sub(
r"\s{2,}", " ", c.text
) # Collapse runs of consecutive whitespace into a single space
# The next three lines look like the remains of a docstring. English:
# "Tokenization of the captions and splitting into sentences. When the token
# is a punctuation mark, all preceding tokens are recorded as a new caption."
Tokenisierung der Captions und Unterteilung in Sätze.
Wenn das Token einem Satzzeichen entspricht, werden
alle vorangegangenen Tokens als neue Caption festgehalten.
for token in self.nlp(c.text):
# Sentence-ending token: reset the text buffer and count one more sentence
# for the current caption.
# NOTE(review): the statements that store the sentence and its times are not
# visible in this excerpt.
if token.text in [".", "?", "!", ":"]:
text = ""
duplicate_count = duplicate_count + 1
elif token.text in [","]:
# The next three lines look like the remains of a docstring. English:
# "Commas are removed this way so that commas inside numbers are kept,
# e.g. 5,3 Mia."
Kommas werden über diesen Weg entfernt,
damit Kommas in Zahlangaben erhalten bleiben.
Z. B. 5,3 Mia.
text = text + " " + token.text
# Collect the three parallel lists into a polars DataFrame with the columns
# "sentences", "start" and "end".
d = {"sentences": sentences, "start": times_start, "end": times_end}
df = pl.DataFrame(data=d)

View File

@ -0,0 +1,28 @@
#!/usr/bin/env python3
import argparse
import re
import normalize_vtt as nv
# Create the subtitle normalizer from the normalize_vtt module.
normalizer = nv.NormalizeVtt()
# Command line: a single optional --action / -a argument selects what to run.
parser = argparse.ArgumentParser(
prog='Subtitles Preprocessor',
description='Some Textanalytics for the Subtitles.')
parser.add_argument('--action', '-a')
args = parser.parse_args()
# Dispatch on the requested action.
# NOTE(review): the body of the 'normalize' case is not visible in this excerpt.
match args.action:
case 'normalize':
case _:
# Fallback for a missing or unrecognized action.
print('No action found.')

View File

@ -0,0 +1,5 @@
def toMilisec(timecode):
    """Convert a WebVTT-style timecode to seconds.

    Despite its name, the function returns *seconds* as a float: the
    timecode is summed up in whole milliseconds and then divided by 1000.
    The name and return unit are kept for existing callers.

    Args:
        timecode: A string of the form ``HH:MM:SS.mmm``. The hours field
            may be omitted (``MM:SS.mmm``), and the fractional part may be
            missing or have fewer or more than three digits; it is
            normalized to millisecond precision (``.5`` means 500 ms,
            extra digits are truncated).

    Returns:
        The position described by ``timecode`` in seconds, as a float.

    Raises:
        ValueError: If the timecode does not have two or three
            colon-separated fields, or a field is not an integer.
    """
    fields = timecode.strip().split(":")
    if len(fields) == 2:
        # Short form without an hours field.
        fields.insert(0, "0")
    elif len(fields) != 3:
        raise ValueError(f"Invalid timecode: {timecode!r}")

    hours, minutes, seconds = fields
    whole_seconds, _, fraction = seconds.partition(".")
    # Pad/truncate the fraction to exactly three digits so that it is always
    # interpreted as milliseconds (".5" -> 500 ms rather than 5 ms).
    millis = int((fraction + "000")[:3])

    total_ms = (
        int(hours) * 3_600_000
        + int(minutes) * 60_000
        + int(whole_seconds) * 1_000
        + millis
    )
    return total_ms / 1000

VANA.sublime-project Normal file
View File

@ -0,0 +1,16 @@
"folders": [
"path": "."
"name": "Python venv",
"cmd": ["/home/gio/Code/VANA/VANA-python/normalize_subtitles/.pixi/envs/default/bin/python", "$file"],
"selector": "source.python",
"file_regex": "^\\s*File \"(...*?)\", line ([0-9]*)"

File diff suppressed because one or more lines are too long