Subtitle normalization.

2024-11-09 12:08:25 +01:00 · 2024-11-09 12:08:25 +01:00 · d7588ef2d6
commit d7588ef2d6
parent fe92ba58b3
16 changed files with 14681 additions and 4060 deletions
--- a/.editorconfig
+++ b/.editorconfig
@ -0,0 +1,26 @@
 # EditorConfig is awesome: http://EditorConfig.org
 # top-most EditorConfig file
 root = true
 # Unix-style newlines with a newline ending every file
 [*]
 end_of_line = lf
 insert_final_newline = true
 trim_trailing_whitespace = true
 charset = utf-8
 # 4 space indentation
 [*.{py,java,r,R}]
 indent_style = space
 indent_size = 4
 # 2 space indentation
 [*.{js,json,y{a,}ml,html,cwl}]
 indent_style = space
 indent_size = 2
 [*.{md,Rmd,rst}]
 trim_trailing_whitespace = false
 indent_style = space
 indent_size = 2
--- a/VANA-php/app/Helpers.php
+++ b/VANA-php/app/Helpers.php
@ -0,0 +1,30 @@
 <?php
 namespace App;
 /**
  * Small formatting helpers for numbers and durations.
  */
 class Helpers
 {
     /**
      * Create a new class instance.
      */
     public function __construct()
     {
         //
     }
     /**
      * Prefix a value below 10 with a "0".
      *
      * @param  mixed  $number  Value compared against 10.
      * @return mixed  '0' concatenated with the value when it is below 10,
      *                otherwise the value itself, unchanged and uncast.
      */
     public static function leadingZero($number)
     {
         return $number < 10 ? '0'.$number : $number;
     }
     /**
      * Format a duration given in seconds as "hours:minutes".
      *
      * Neither part is zero-padded and the seconds remainder is dropped.
      *
      * @param  float  $seconds  Duration in seconds.
      * @return string
      */
     public static function secondsToTimecode(float $seconds)
     {
         // `%` works on integers only. Truncate explicitly, because letting PHP
         // convert a fractional float implicitly is deprecated since PHP 8.1.
         $wholeSeconds = (int) $seconds;
         $hours = floor($seconds / 3600);
         $minutes = floor(($wholeSeconds % 3600) / 60);
         // "{$var}" replaces the "${var}" form that is deprecated since PHP 8.2.
         return "{$hours}:{$minutes}";
     }
 }
--- a/VANA-python/database/.gitattributes
+++ b/VANA-python/database/.gitattributes
@ -0,0 +1,2 @@
 # GitHub syntax highlighting
 pixi.lock linguist-language=YAML linguist-generated=true
--- a/VANA-python/database/.gitignore
+++ b/VANA-python/database/.gitignore
@ -0,0 +1,4 @@
 # pixi environments
 .pixi
 *.egg-info
--- a/VANA-python/database/pyproject.toml
+++ b/VANA-python/database/pyproject.toml
@ -0,0 +1,20 @@
 [project]
 authors = [{name = "Giò Diani", email = "mail@gionathandiani.name"}]
 dependencies = []
 description = "Add a short description here"
 name = "database"
 requires-python = ">= 3.11"
 version = "0.1.0"
 [build-system]
 build-backend = "hatchling.build"
 requires = ["hatchling"]
 [tool.pixi.project]
 channels = ["conda-forge"]
 platforms = ["linux-64"]
 [tool.pixi.pypi-dependencies]
 database = { path = ".", editable = true }
 [tool.pixi.tasks]
--- a/VANA-python/database/src/database/init.py
+++ b/VANA-python/database/src/database/init.py
--- a/VANA-python/normalize_subtitles/.gitattributes
+++ b/VANA-python/normalize_subtitles/.gitattributes
@ -0,0 +1,2 @@
 # GitHub syntax highlighting
 pixi.lock linguist-language=YAML linguist-generated=true
--- a/VANA-python/normalize_subtitles/.gitignore
+++ b/VANA-python/normalize_subtitles/.gitignore
@ -0,0 +1,4 @@
 # pixi environments
 .pixi
 *.egg-info
--- a/VANA-python/normalize_subtitles/pixi.lock
+++ b/VANA-python/normalize_subtitles/pixi.lock
--- a/VANA-python/normalize_subtitles/pyproject.toml
+++ b/VANA-python/normalize_subtitles/pyproject.toml
@ -0,0 +1,27 @@
 [project]
 authors = [{name = "Giò Diani", email = "mail@gionathandiani.name"}]
 dependencies = ["webvtt-py>=0.5.1,<0.6", "spacy-llm>=0.7.2,<0.8"]
 description = "Normalisierung der Untertitel."
 name = "normalize_subtitles"
 requires-python = ">= 3.11"
 version = "0.1.0"
 [build-system]
 build-backend = "hatchling.build"
 requires = ["hatchling"]
 [tool.pixi.project]
 channels = ["conda-forge"]
 platforms = ["win-64", "linux-64", "osx-64"]
 [tool.pixi.pypi-dependencies]
 normalize_subtitles = { path = ".", editable = true }
 [tool.pixi.tasks]
 [tool.pixi.dependencies]
 spacy = ">=3.7.6,<4"
 setuptools = ">=75.1.0,<76"
 wheel = ">=0.44.0,<0.45"
 pip = ">=24.2,<25"
 polars = ">=1.12.0,<2"
--- a/VANA-python/normalize_subtitles/src/normalize_subtitles/init.py
+++ b/VANA-python/normalize_subtitles/src/normalize_subtitles/init.py
--- a/VANA-python/normalize_subtitles/src/normalize_subtitles/normalize_vtt.py
+++ b/VANA-python/normalize_subtitles/src/normalize_subtitles/normalize_vtt.py
@ -0,0 +1,82 @@
 import re
 import polars as pl
 import spacy
 import timecode as tc
 import webvtt
 class NormalizeVtt:
     """Turn WebVTT captions into one-sentence rows and export them as CSV.
     Caption text is cleaned with a fixed list of regex rules, tokenized with
     the spaCy pipeline loaded in ``__init__`` and cut at sentence-ending
     punctuation tokens.
     """
     # Tokens that close a sentence.
     _SENTENCE_END = frozenset({".", "?", "!", ":"})
     # (pattern, replacement) pairs applied to every caption text, in this order.
     # Compiled once here instead of on every caption.
     _CLEANUP_RULES = (
         # Line breaks become spaces.
         (re.compile(r"\n"), " "),
         # Notice that is shown at the beginning of every broadcast.
         (re.compile(r"Mit Live-Untertiteln von SWISS TXT"), ""),
         (re.compile(r"%"), "Prozent"),
         # Ellipses.
         (re.compile(r"[\.]{3}"), ""),
         # Quotation marks.
         (re.compile(r"[\"]"), ""),
         # Parenthetical dashes. One space is kept so the neighbouring words
         # do not get glued together.
         (re.compile(r"\s\-\s"), " "),
         # Hyphens left behind by words that were split across a line break.
         (re.compile(r"(\-\s)"), ""),
         # Bracketed remarks such as "(Sandro Brotz)". The explicit A-Z/a-z
         # ranges avoid the punctuation that sits between "Z" and "a"; space
         # and hyphen are allowed so multi-word names are matched as well.
         (re.compile(r"\([A-Za-zäöüÄÖÜß \-]+\)"), ""),
         # Runs of whitespace collapse into a single space.
         (re.compile(r"\s{2,}"), " "),
     )
     def __init__(self):
         self.nlp = spacy.load("de_core_news_sm")
     @classmethod
     def _clean_text(cls, text):
         """Return *text* with every cleanup rule applied in order."""
         for pattern, replacement in cls._CLEANUP_RULES:
             text = pattern.sub(replacement, text)
         return text
     @staticmethod
     def _spread_duplicates(times_start, times_end, count):
         """Give the last *count* rows consecutive, equally long time slots.
         Rows cut from the same caption all carry that caption's timecode.
         When there are at least two of them, the caption's time span is
         divided evenly among them. Both lists are modified in place.
         """
         if count < 2:
             return
         slot = (times_end[-1] - times_start[-1]) / count
         for offset in range(count, 0, -1):
             times_start[-offset] += slot * (count - offset)
             times_end[-offset] = times_start[-offset] + slot
     # The subtitle timecodes are adjusted so that every block is one sentence.
     def sentencize(self, vtt, out_path="out2.csv"):
         """Split the captions of *vtt* into sentences and write them as CSV.
         A sentence that runs across several captions receives the timecode of
         the caption in which it ends. Text left over after the last sentence
         terminator is written as a final row instead of being discarded, and
         empty sentences (e.g. from "?!") are skipped.
         Args:
             vtt: Passed to ``webvtt.read``.
             out_path: Destination handed to ``DataFrame.write_csv``. Defaults
                 to the previously hard-coded ``"out2.csv"``.
         Returns:
             None. The result is the CSV with the columns ``sentences``,
             ``start`` and ``end``.
         """
         sentences = []
         times_start = []
         times_end = []
         words = []  # tokens of the sentence that is currently being assembled
         pending = 0  # rows produced by the caption that was processed last
         start = end = None
         for caption in webvtt.read(vtt):
             # Rows cut from the previous caption still share its timecode.
             self._spread_duplicates(times_start, times_end, pending)
             pending = 0
             start = tc.toMilisec(caption.start)
             end = tc.toMilisec(caption.end)
             # Tokenize the caption. Whenever a token is sentence-ending
             # punctuation, all tokens collected so far become a new row.
             for token in self.nlp(self._clean_text(caption.text)):
                 if token.text in self._SENTENCE_END:
                     sentence = " ".join(words).strip()
                     words = []
                     if not sentence:
                         continue
                     sentences.append(sentence)
                     times_start.append(start)
                     times_end.append(end)
                     pending += 1
                 elif token.text == ",":
                     # Commas are dropped at token level so that commas inside
                     # numbers are kept, e.g. "5,3 Mia.".
                     continue
                 else:
                     words.append(token.text)
         # Keep text that never reached a sentence terminator.
         leftover = " ".join(words).strip()
         if leftover:
             sentences.append(leftover)
             times_start.append(start)
             times_end.append(end)
             pending += 1
         # The rows of the final caption need the same treatment as the others.
         self._spread_duplicates(times_start, times_end, pending)
         frame = pl.DataFrame(
             data={"sentences": sentences, "start": times_start, "end": times_end}
         )
         frame.write_csv(out_path)
         return
--- a/VANA-python/normalize_subtitles/src/normalize_subtitles/subtitles-processing.py
+++ b/VANA-python/normalize_subtitles/src/normalize_subtitles/subtitles-processing.py
@ -0,0 +1,28 @@
 #!/usr/bin/env python3
 import argparse
 import re
 import normalize_vtt as nv
 normalizer = nv.NormalizeVtt()
 # Commandline
 parser = argparse.ArgumentParser(
 	prog='Subtitles Preprocessor',
 	description='Some Textanalytics for the Subtitles.')
 parser.add_argument('--action', '-a')
 parser.add_argument('--vtt')
 args = parser.parse_args()
 match args.action:
 	case 'normalize':
 		normalizer.sentencize(args.vtt)
 	case _:
 		print('No action found.')
--- a/VANA-python/normalize_subtitles/src/normalize_subtitles/timecode.py
+++ b/VANA-python/normalize_subtitles/src/normalize_subtitles/timecode.py
@ -0,0 +1,5 @@
 def toMilisec(timecode):
     """Convert a timecode such as ``00:00:00.000`` to seconds.
     Despite its name the function returns seconds: the total is computed in
     whole milliseconds and then divided by 1000.
     Accepts ``HH:MM:SS.mmm`` as well as ``MM:SS.mmm``. A missing fraction
     counts as zero, and a fraction with fewer or more than three digits is
     read as a decimal fraction truncated to milliseconds.
     Args:
         timecode: The timecode string.
     Returns:
         The position in seconds as a float.
     Raises:
         ValueError: If the timecode does not have two or three ``:``-separated
             fields or contains non-numeric parts.
     """
     clock, _, fraction = timecode.strip().partition('.')
     fields = [int(part) for part in clock.split(':')]
     if len(fields) == 2:
         fields.insert(0, 0)  # the hours field was omitted
     if len(fields) != 3:
         raise ValueError(f"Unsupported timecode: {timecode!r}")
     hours, minutes, seconds = fields
     # Normalize the fraction to exactly three digits so ".5" means 500 ms.
     millis = int(fraction.ljust(3, '0')[:3]) if fraction else 0
     total_millis = (hours * 3600000) + (minutes * 60000) + (seconds * 1000) + millis
     return total_millis / 1000
--- a/VANA.sublime-project
+++ b/VANA.sublime-project
@ -0,0 +1,16 @@
 {
 	"folders": [
 		{
 			"path": "."
 		}
 	],
 	"build_systems":
    [
        {
            "name": "Python venv",
            "cmd": ["/home/gio/Code/VANA/VANA-python/normalize_subtitles/.pixi/envs/default/bin/python", "$file"],
 		    "selector": "source.python",
 		    "file_regex": "^\\s*File \"(...*?)\", line ([0-9]*)"
 		}
    ]
 }
--- a/VANA.sublime-workspace
+++ b/VANA.sublime-workspace
		`@ -0,0 +1,2 @@`
							`# GitHub syntax highlighting`
							`pixi.lock linguist-language=YAML linguist-generated=true`