Subtitle normalization.
parent
fe92ba58b3
commit
d7588ef2d6
|
@ -0,0 +1,26 @@
|
|||
# EditorConfig is awesome: http://EditorConfig.org
|
||||
|
||||
# top-most EditorConfig file
|
||||
root = true
|
||||
|
||||
# Unix-style newlines with a newline ending every file
|
||||
[*]
|
||||
end_of_line = lf
|
||||
insert_final_newline = true
|
||||
trim_trailing_whitespace = true
|
||||
charset = utf-8
|
||||
|
||||
# 4 space indentation
|
||||
[*.{py,java,r,R}]
|
||||
indent_style = space
|
||||
indent_size = 4
|
||||
|
||||
# 2 space indentation
|
||||
[*.{js,json,y{a,}ml,html,cwl}]
|
||||
indent_style = space
|
||||
indent_size = 2
|
||||
|
||||
[*.{md,Rmd,rst}]
|
||||
trim_trailing_whitespace = false
|
||||
indent_style = space
|
||||
indent_size = 2
|
|
@ -0,0 +1,30 @@
|
|||
<?php
|
||||
|
||||
namespace App;
|
||||
|
||||
class Helpers
{
    /**
     * Create a new class instance.
     */
    public function __construct()
    {
        //
    }

    /**
     * Prefix a number smaller than 10 with a leading zero.
     *
     * Values of 10 or more are returned unchanged (not cast to string), so
     * the return type depends on the input.
     *
     * @param  int|float|string  $number
     * @return int|float|string  '0' . $number when $number < 10, otherwise $number itself.
     */
    public static function leadingZero($number)
    {
        return $number < 10 ? '0'.$number : $number;
    }

    /**
     * Format a duration in seconds as "hours:minutes".
     *
     * Neither component is zero-padded and the seconds part is not included
     * in the result.
     *
     * @param  float  $seconds  Duration in seconds; the fractional part is ignored.
     * @return string  E.g. "1:2" for 3725.9 seconds.
     */
    public static function secondsToTimecode(float $seconds)
    {
        // The modulo operator works on integers only. Truncate explicitly:
        // relying on the implicit float-to-int conversion is deprecated since
        // PHP 8.1 whenever the float has a fractional part.
        $wholeSeconds = (int) $seconds;

        $hours = (int) floor($seconds / 3600);
        $minutes = (int) floor(($wholeSeconds % 3600) / 60);

        // "{$var}" replaces the "${var}" interpolation deprecated in PHP 8.2.
        return "{$hours}:{$minutes}";
    }
}
|
|
@ -0,0 +1,2 @@
|
|||
# GitHub syntax highlighting
|
||||
pixi.lock linguist-language=YAML linguist-generated=true
|
|
@ -0,0 +1,4 @@
|
|||
|
||||
# pixi environments
|
||||
.pixi
|
||||
*.egg-info
|
|
@ -0,0 +1,20 @@
|
|||
[project]
|
||||
authors = [{name = "Giò Diani", email = "mail@gionathandiani.name"}]
|
||||
dependencies = []
|
||||
description = "Add a short description here"
|
||||
name = "database"
|
||||
requires-python = ">= 3.11"
|
||||
version = "0.1.0"
|
||||
|
||||
[build-system]
|
||||
build-backend = "hatchling.build"
|
||||
requires = ["hatchling"]
|
||||
|
||||
[tool.pixi.project]
|
||||
channels = ["conda-forge"]
|
||||
platforms = ["linux-64"]
|
||||
|
||||
[tool.pixi.pypi-dependencies]
|
||||
database = { path = ".", editable = true }
|
||||
|
||||
[tool.pixi.tasks]
|
|
@ -0,0 +1,2 @@
|
|||
# GitHub syntax highlighting
|
||||
pixi.lock linguist-language=YAML linguist-generated=true
|
|
@ -0,0 +1,4 @@
|
|||
|
||||
# pixi environments
|
||||
.pixi
|
||||
*.egg-info
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,27 @@
|
|||
[project]
|
||||
authors = [{name = "Giò Diani", email = "mail@gionathandiani.name"}]
|
||||
dependencies = ["webvtt-py>=0.5.1,<0.6", "spacy-llm>=0.7.2,<0.8"]
|
||||
description = "Normalisierung der Untertitel."
|
||||
name = "normalize_subtitles"
|
||||
requires-python = ">= 3.11"
|
||||
version = "0.1.0"
|
||||
|
||||
[build-system]
|
||||
build-backend = "hatchling.build"
|
||||
requires = ["hatchling"]
|
||||
|
||||
[tool.pixi.project]
|
||||
channels = ["conda-forge"]
|
||||
platforms = ["win-64", "linux-64", "osx-64"]
|
||||
|
||||
[tool.pixi.pypi-dependencies]
|
||||
normalize_subtitles = { path = ".", editable = true }
|
||||
|
||||
[tool.pixi.tasks]
|
||||
|
||||
[tool.pixi.dependencies]
|
||||
spacy = ">=3.7.6,<4"
|
||||
setuptools = ">=75.1.0,<76"
|
||||
wheel = ">=0.44.0,<0.45"
|
||||
pip = ">=24.2,<25"
|
||||
polars = ">=1.12.0,<2"
|
|
@ -0,0 +1,82 @@
|
|||
import re
|
||||
|
||||
import polars as pl
|
||||
import spacy
|
||||
import timecode as tc
|
||||
import webvtt
|
||||
|
||||
|
||||
class NormalizeVtt:
    """Re-segment WebVTT subtitles so that every caption is one sentence."""

    # Tokens that close the sentence collected so far.
    _SENTENCE_END = frozenset({".", "?", "!", ":"})

    # Clean-up substitutions applied, in this order, to every caption text.
    # Compiled once here instead of on every caption.
    _CLEANUPS = (
        (re.compile(r"\n"), " "),  # line breaks
        # Sentence that always appears at the start of a broadcast.
        (re.compile(r"Mit Live-Untertiteln von SWISS TXT"), ""),
        (re.compile(r"%"), "Prozent"),  # spell out "%"
        (re.compile(r"[\.]{3}"), ""),  # ellipses
        (re.compile(r"[\"]"), ""),  # quotation marks
        # Dashes used for parenthesis; replaced by a blank so that the
        # neighbouring words are not glued together.
        (re.compile(r"\s\-\s"), " "),
        (re.compile(r"(\-\s)"), ""),  # hyphens left over from line breaks
        # Bracketed remarks made of letters, blanks and hyphens, e.g.
        # "(Sandro Brotz)". The blank is required for multi-word names.
        (re.compile(r"\((?:[^\W\d_]|[ \-])+\)"), ""),
        (re.compile(r"\s{2,}"), " "),  # runs of whitespace
    )

    def __init__(self):
        # spaCy pipeline used for tokenization ("de_core_news_sm" model).
        self.nlp = spacy.load("de_core_news_sm")

    @staticmethod
    def _spread_duplicates(times_start, times_end, count):
        """Split the time span shared by the last *count* entries evenly.

        Sentences cut out of the same caption all carry that caption's
        start and end time. The span is divided into *count* equal,
        consecutive slices, modifying both lists in place. Nothing happens
        for fewer than two entries.
        """
        if count < 2:
            return
        slice_length = (times_end[-1] - times_start[-1]) / count
        for offset in range(count):
            position = offset - count  # negative index: oldest entry first
            times_start[position] = times_start[position] + slice_length * offset
            times_end[position] = times_start[position] + slice_length

    def sentencize(self, vtt, output_path="out2.csv"):
        """Rewrite the captions of *vtt* as one row per sentence.

        Each caption text is cleaned, tokenized with spaCy and cut at
        sentence-ending punctuation. A sentence gets the start and end time
        of the caption in which it ends; several sentences ending in the
        same caption share that caption's time span in equal parts. Text
        after the last sentence-ending token of the file is not emitted.

        Args:
            vtt: Path of the WebVTT file, passed on to ``webvtt.read``.
            output_path: Destination of the CSV file with the columns
                ``sentences``, ``start`` and ``end``.

        Returns:
            The polars DataFrame that was written to *output_path*.
        """
        sentences = []
        times_start = []
        times_end = []
        pending_tokens = []  # tokens of the sentence currently being built

        for caption in webvtt.read(vtt):
            cleaned = caption.text
            for pattern, replacement in self._CLEANUPS:
                cleaned = pattern.sub(replacement, cleaned)

            finished_here = 0  # sentences that ended inside this caption
            for token in self.nlp(cleaned):
                if token.text in self._SENTENCE_END:
                    sentences.append(" ".join(pending_tokens).strip())
                    times_end.append(tc.toMilisec(caption.end))
                    times_start.append(tc.toMilisec(caption.start))
                    pending_tokens = []
                    finished_here += 1
                elif token.text == ",":
                    # Commas are dropped at token level so that commas
                    # inside numbers (e.g. "5,3 Mia.") are preserved.
                    continue
                else:
                    pending_tokens.append(token.text)

            # Spread right after the caption, so the last caption of the
            # file is handled too.
            self._spread_duplicates(times_start, times_end, finished_here)

        frame = pl.DataFrame(
            data={"sentences": sentences, "start": times_start, "end": times_end}
        )
        frame.write_csv(output_path)
        return frame
|
|
@ -0,0 +1,28 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
import argparse
|
||||
import re
|
||||
|
||||
import normalize_vtt as nv
|
||||
|
||||
normalizer = nv.NormalizeVtt()
|
||||
|
||||
# Commandline
|
||||
parser = argparse.ArgumentParser(
|
||||
prog='Subtitles Preprocessor',
|
||||
description='Some Textanalytics for the Subtitles.')
|
||||
|
||||
parser.add_argument('--action', '-a')
|
||||
parser.add_argument('--vtt')
|
||||
args = parser.parse_args()
|
||||
|
||||
match args.action:
|
||||
case 'normalize':
|
||||
normalizer.sentencize(args.vtt)
|
||||
case _:
|
||||
print('No action found.')
|
||||
|
||||
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,5 @@
|
|||
def toMilisec(timecode):
    """Convert a WebVTT timestamp to seconds.

    Despite its name, the function returns seconds: the timestamp is summed
    up in whole milliseconds and then divided by 1000.

    Args:
        timecode: Timestamp formatted as ``HH:MM:SS.mmm``. The short form
            ``MM:SS.mmm`` (hours omitted) is accepted as well.

    Returns:
        The timestamp in seconds as a float, e.g. ``1.5`` for
        ``"00:00:01.500"``.

    Raises:
        ValueError: If the timestamp does not have the expected shape or
            contains non-numeric fields.
    """
    *leading, seconds_field = timecode.split(':')
    if len(leading) == 1:
        # Short form without hours.
        leading = ['0', *leading]
    hours, minutes = leading
    seconds, millis = seconds_field.split('.')

    total_millis = (
        int(hours) * 3600000
        + int(minutes) * 60000
        + int(seconds) * 1000
        + int(millis)
    )
    return total_millis / 1000
|
|
@ -0,0 +1,16 @@
|
|||
{
|
||||
"folders": [
|
||||
{
|
||||
"path": "."
|
||||
}
|
||||
],
|
||||
"build_systems":
|
||||
[
|
||||
{
|
||||
"name": "Python venv",
|
||||
"cmd": ["/home/gio/Code/VANA/VANA-python/normalize_subtitles/.pixi/envs/default/bin/python", "$file"],
|
||||
"selector": "source.python",
|
||||
"file_regex": "^\\s*File \"(...*?)\", line ([0-9]*)"
|
||||
}
|
||||
]
|
||||
}
|
14754
VANA.sublime-workspace
14754
VANA.sublime-workspace
File diff suppressed because one or more lines are too long
Loading…
Reference in New Issue