Subtitle normalization.

master
Giò Diani 2024-11-09 12:08:25 +01:00
parent fe92ba58b3
commit d7588ef2d6
16 changed files with 14681 additions and 4060 deletions

26
.editorconfig Normal file
View File

@ -0,0 +1,26 @@
# EditorConfig is awesome: https://editorconfig.org
# top-most EditorConfig file
root = true
# Unix-style newlines with a newline ending every file
[*]
end_of_line = lf
insert_final_newline = true
trim_trailing_whitespace = true
charset = utf-8
# 4 space indentation
[*.{py,java,r,R}]
indent_style = space
indent_size = 4
# 2 space indentation
[*.{js,json,y{a,}ml,html,cwl}]
indent_style = space
indent_size = 2
[*.{md,Rmd,rst}]
trim_trailing_whitespace = false
indent_style = space
indent_size = 2

30
VANA-php/app/Helpers.php Normal file
View File

@ -0,0 +1,30 @@
<?php
namespace App;
class Helpers
{
    /**
     * Create a new class instance.
     */
    public function __construct()
    {
        //
    }

    /**
     * Prefix a value below 10 with a literal "0".
     *
     * @param  int|float|string  $number  Value to format.
     * @return int|float|string  The string '0'.$number when $number < 10,
     *                           otherwise $number unchanged (original type).
     */
    public static function leadingZero($number)
    {
        return $number < 10 ? '0'.$number : $number;
    }

    /**
     * Format a duration in seconds as "hours:minutes".
     *
     * Neither component is zero-padded and the seconds remainder is dropped,
     * e.g. 3900.0 becomes "1:5".
     *
     * @param  float  $seconds  Duration in seconds.
     * @return string
     */
    public static function secondsToTimecode(float $seconds)
    {
        // The modulo operator works on integers only. Cast explicitly instead
        // of relying on the implicit float-to-int conversion, which is
        // deprecated for values with a fractional part (PHP 8.1+).
        $wholeSeconds = (int) $seconds;

        $hours = (int) floor($seconds / 3600);
        $minutes = (int) floor(($wholeSeconds % 3600) / 60);

        // "{$var}" replaces the "${var}" interpolation deprecated in PHP 8.2.
        return "{$hours}:{$minutes}";
    }
}

2
VANA-python/database/.gitattributes vendored Normal file
View File

@ -0,0 +1,2 @@
# GitHub syntax highlighting
pixi.lock linguist-language=YAML linguist-generated=true

4
VANA-python/database/.gitignore vendored Normal file
View File

@ -0,0 +1,4 @@
# pixi environments
.pixi
*.egg-info

View File

@ -0,0 +1,20 @@
[project]
authors = [{name = "Giò Diani", email = "mail@gionathandiani.name"}]
dependencies = []
description = "Add a short description here"
name = "database"
requires-python = ">= 3.11"
version = "0.1.0"
[build-system]
build-backend = "hatchling.build"
requires = ["hatchling"]
[tool.pixi.project]
channels = ["conda-forge"]
platforms = ["linux-64"]
[tool.pixi.pypi-dependencies]
database = { path = ".", editable = true }
[tool.pixi.tasks]

View File

@ -0,0 +1,2 @@
# GitHub syntax highlighting
pixi.lock linguist-language=YAML linguist-generated=true

View File

@ -0,0 +1,4 @@
# pixi environments
.pixi
*.egg-info

3741
VANA-python/normalize_subtitles/pixi.lock generated Normal file

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,27 @@
[project]
authors = [{name = "Giò Diani", email = "mail@gionathandiani.name"}]
dependencies = ["webvtt-py>=0.5.1,<0.6", "spacy-llm>=0.7.2,<0.8"]
description = "Normalisierung der Untertitel."
name = "normalize_subtitles"
requires-python = ">= 3.11"
version = "0.1.0"
[build-system]
build-backend = "hatchling.build"
requires = ["hatchling"]
[tool.pixi.project]
channels = ["conda-forge"]
platforms = ["win-64", "linux-64", "osx-64"]
[tool.pixi.pypi-dependencies]
normalize_subtitles = { path = ".", editable = true }
[tool.pixi.tasks]
[tool.pixi.dependencies]
spacy = ">=3.7.6,<4"
setuptools = ">=75.1.0,<76"
wheel = ">=0.44.0,<0.45"
pip = ">=24.2,<25"
polars = ">=1.12.0,<2"

View File

@ -0,0 +1,82 @@
import re
import polars as pl
import spacy
import timecode as tc
import webvtt
class NormalizeVtt:
    """Re-segment WebVTT captions so that every output row is one sentence."""

    # Tokens that close the sentence currently being assembled.
    _SENTENCE_END = frozenset({".", "?", "!", ":"})

    # Clean-up rules applied, in this order, to the text of every caption.
    _CLEANUP_RULES = (
        # Line breaks inside a caption become spaces.
        (re.compile(r"\n"), " "),
        # Credit line that appears at the beginning of every broadcast.
        (re.compile(r"Mit Live-Untertiteln von SWISS TXT"), ""),
        # Spell out the percent sign.
        (re.compile(r"%"), "Prozent"),
        # Ellipses.
        (re.compile(r"[\.]{3}"), ""),
        # Double quotes.
        (re.compile(r"[\"]"), ""),
        # Parenthetical dashes (" - "). Replaced by a space so that the
        # neighbouring words are not glued together.
        (re.compile(r"\s\-\s"), " "),
        # Hyphens left over from words split across a line break ("Unter- titel").
        (re.compile(r"(\-\s)"), ""),
        # Bracketed remarks such as "(Sandro Brotz)". Spaces, dots and hyphens
        # are allowed so that multi-word names are matched as well.
        (re.compile(r"\([A-Za-zäöüÄÖÜß][A-Za-zäöüÄÖÜß .\-]*\)"), ""),
        # Collapse runs of whitespace produced by the rules above.
        (re.compile(r"\s{2,}"), " "),
    )

    def __init__(self):
        """Load the German spaCy pipeline used for tokenization."""
        self.nlp = spacy.load("de_core_news_sm")

    @classmethod
    def _clean_text(cls, text):
        """Return *text* with all clean-up rules applied in order."""
        for pattern, replacement in cls._CLEANUP_RULES:
            text = pattern.sub(replacement, text)
        return text

    @staticmethod
    def _spread_timecodes(times_start, times_end, count):
        """Split one shared time span evenly across the last *count* entries.

        The last *count* entries of both lists are expected to carry the same
        start and end value (they stem from the same caption). They are
        rewritten in place into *count* consecutive, equally long slots.
        """
        slot = (times_end[-1] - times_start[-1]) / count
        for offset, position in enumerate(range(-count, 0)):
            times_start[position] = times_start[position] + slot * offset
            times_end[position] = times_start[position] + slot

    # The caption timecodes are adjusted so that each block equals one sentence.
    def sentencize(self, vtt, output="out2.csv"):
        """Split the captions of a WebVTT file into sentences and write a CSV.

        Each caption is cleaned and tokenized. Tokens are collected until a
        sentence terminator (".", "?", "!" or ":") is seen; the collected text
        is then stored with the timecode of the caption in which the sentence
        ended. A sentence may therefore start in an earlier caption. When one
        caption closes two or more sentences, its time span is divided evenly
        between them. Text that is still pending after the last caption (no
        closing terminator) is not written.

        Args:
            vtt: WebVTT source handed to ``webvtt.read``.
            output: Path of the CSV file to write. Defaults to ``"out2.csv"``.

        Returns:
            The polars DataFrame with the columns ``sentences``, ``start`` and
            ``end`` that was written to *output*.
        """
        captions = webvtt.read(vtt)
        sentences = []
        times_start = []
        times_end = []
        words = []  # tokens of the sentence currently being assembled

        for caption in captions:
            start = tc.toMilisec(caption.start)
            end = tc.toMilisec(caption.end)
            closed_here = 0  # sentences that ended inside this caption

            for token in self.nlp(self._clean_text(caption.text)):
                if token.text in self._SENTENCE_END:
                    sentence = " ".join(words).strip()
                    words = []
                    if not sentence:
                        # A terminator without preceding text would only
                        # produce an empty row.
                        continue
                    sentences.append(sentence)
                    times_start.append(start)
                    times_end.append(end)
                    closed_here += 1
                elif token.text == ",":
                    # Commas are dropped at token level (not by regex) so that
                    # commas inside numbers are preserved, e.g. "5,3 Mia.".
                    continue
                else:
                    words.append(token.text)

            # Sentences closed in the same caption share one timecode; give
            # each its own slot. Done right after the caption so that the
            # final caption is handled as well.
            if closed_here >= 2:
                self._spread_timecodes(times_start, times_end, closed_here)

        df = pl.DataFrame(
            data={"sentences": sentences, "start": times_start, "end": times_end}
        )
        df.write_csv(output)
        return df

View File

@ -0,0 +1,28 @@
#!/usr/bin/env python3
import argparse
import re
import normalize_vtt as nv
normalizer = nv.NormalizeVtt()
# Commandline
parser = argparse.ArgumentParser(
prog='Subtitles Preprocessor',
description='Some Textanalytics for the Subtitles.')
parser.add_argument('--action', '-a')
parser.add_argument('--vtt')
args = parser.parse_args()
match args.action:
case 'normalize':
normalizer.sentencize(args.vtt)
case _:
print('No action found.')

View File

@ -0,0 +1,5 @@
# Converts a timecode in the format 00:00:00.000 to seconds (float).
# NOTE: despite its name the function returns seconds, not milliseconds.
def toMilisec(timecode):
    """Return the number of seconds represented by *timecode*.

    Accepts ``HH:MM:SS.mmm``. The hours field may be omitted (``MM:SS.mmm``)
    and the fractional part may be missing or have fewer/more than three
    digits; it is interpreted as a decimal fraction of a second with
    millisecond precision.

    Args:
        timecode: Timecode string, e.g. ``"01:02:03.004"``.

    Returns:
        The position in seconds as a float, e.g. ``3723.004``.

    Raises:
        ValueError: If a field is not numeric or there are more than three
            colon-separated fields.
    """
    clock, _, fraction = timecode.strip().partition('.')

    fields = [int(field) for field in clock.split(':')]
    if len(fields) > 3:
        raise ValueError(f"invalid timecode: {timecode!r}")
    # Missing leading fields (e.g. no hours) count as zero.
    hours, minutes, seconds = [0] * (3 - len(fields)) + fields

    # Right-pad/truncate to exactly three digits so that ".5" means 500 ms.
    millis = int((fraction + '000')[:3])

    total_millis = hours * 3600000 + minutes * 60000 + seconds * 1000 + millis
    return total_millis / 1000

16
VANA.sublime-project Normal file
View File

@ -0,0 +1,16 @@
{
"folders": [
{
"path": "."
}
],
"build_systems":
[
{
"name": "Python venv",
"cmd": ["/home/gio/Code/VANA/VANA-python/normalize_subtitles/.pixi/envs/default/bin/python", "$file"],
"selector": "source.python",
"file_regex": "^\\s*File \"(...*?)\", line ([0-9]*)"
}
]
}

File diff suppressed because one or more lines are too long