improvement topic modelling

master
Giò Diani 2025-01-16 22:18:12 +01:00
parent 021b828fea
commit f7c0df98b2
35 changed files with 10498 additions and 15354 deletions

View File

@ -47,7 +47,7 @@ class Episode extends Api
$subtitles = new Subtitles;
$subs = $subtitles->getWebVTT('urn:srf:episode:tv:'.$ep['id']);
$test = EpisodeModel::firstOrCreate(
$epModel = EpisodeModel::updateOrCreate(
[
'urn' => $ep['fullLengthUrn']
],

View File

@ -18,13 +18,13 @@ class Subtitles extends Api
public function getUrl(string $urn): string
{
$response = Http::withHeaders($this->headers)->withQueryParameters([
'episode' => $urn,
])->get($this->endpoint.'subtitles');
if($response->ok()){
return $response->json()[0]['url'];
if($response->ok() && !empty($response[0]['url'])){
return $response[0]['url'];
}
return false;
@ -34,9 +34,16 @@ class Subtitles extends Api
public function getWebVTT(string $urn): ?string
{
$url = $this->getUrl($urn);
$response = Http::get($url);
$response = null;
if($response->successful()){
if($url){
$response = Http::get($url);
} else {
$altUrn = explode(":", $urn)[4];
$response = Http::get("https://subtitles.eai-general.aws.srf.ch/srf/{$altUrn}/episode/de/vod/vod.vtt");
}
if($response && $response->successful()){
return $response->body();
}

View File

@ -0,0 +1,26 @@
<?php
use Illuminate\Database\Migrations\Migration;
use Illuminate\Database\Schema\Blueprint;
use Illuminate\Support\Facades\Schema;
return new class extends Migration
{
    /**
     * Run the migrations.
     *
     * Adds a nullable JSON `topics` column to the `episodes` table.
     */
    public function up(): void
    {
        Schema::table('episodes', function (Blueprint $table) {
            $table->json('topics')->nullable();
        });
    }

    /**
     * Reverse the migrations.
     *
     * Drops the `topics` column added by up(), so that rolling back this
     * migration restores the previous schema instead of leaving the column behind.
     */
    public function down(): void
    {
        Schema::table('episodes', function (Blueprint $table) {
            $table->dropColumn('topics');
        });
    }
};

View File

@ -47,6 +47,17 @@ input, button, textarea, select {
font: inherit;
}
input{
margin-top: .2em;
border-radius: .2em;
border: 1px solid #fff;
background: var(--c3);
color: #fff;
padding: .2em .5em;
display: block;
}
h1,
h2,
h3,
@ -294,6 +305,21 @@ main {
background: var(--c3);
}
.track-ctrl h2{
font-size: 1.2em;
}
.track-ctrl p {
margin-top: 1em;
font-size: 1em;
line-height: 1.5;
}
.track-ctrl label {
margin-top: 1em;
display: inline-block;
}
.track-viz{
background: var(--c3);
}

View File

@ -116,7 +116,10 @@
<div class="track">
<div class="track-ctrl">
<h2>Topics</h2>
<label>
Segmente:
<input type="number" min="1" max="10" value="4" id="topic-track-segment-ctrl">
</label>
</div>
<div class="track-viz">
<ul id="topic-segement-list" class="segments">

View File

@ -36,4 +36,9 @@ def save_sentiments_f_sub(id, data):
con.commit()
con.close()
def save_topics(id, data):
    """Write ``data`` into the ``topics`` column of the ``episodes`` row whose ``id`` matches.

    Executes a parameterised UPDATE, commits it, and then closes the connection.

    Args:
        id: Value matched against ``episodes.id``.
        data: Value stored in ``episodes.topics``.
            NOTE(review): presumably a JSON-serialised string - confirm against the caller.
    """
    # NOTE(review): `cur` and `con` are not defined in this function; presumably a
    # module-level cursor/connection - verify where they are created.
    cur.execute("UPDATE episodes SET topics = ? WHERE id = ?", [data, id])
    con.commit()
    # The connection is closed here, so any later use of `con` after this call
    # would presumably fail - TODO confirm this one-call-per-process usage is intended.
    con.close()

View File

@ -0,0 +1,21 @@
# subtitles_processing
Paket zur Aufbereitung der Untertitel.
## subtitles-processing.py
Normalisiert die Untertitel einer Episode. Die Timecodes werden umgespeichert, damit pro Zeile immer ein ganzer Satz vorhanden ist.
```bash
python src/normalize_subtitles/subtitles-processing.py -a <"normalize"> -ep <int>
```
## count_words.py
Zählt die Wortanzahl pro Satz.
```bash
python src/normalize_subtitles/count_words.py -ep <int>
```
## sentence_sentiment.py
Berechnet die Sentimente pro Satz.
```bash
python src/normalize_subtitles/sentence_sentiment.py -ep <int>
```

View File

@ -1,8 +1,8 @@
[project]
authors = [{name = "Giò Diani", email = "mail@gionathandiani.name"}]
dependencies = ["webvtt-py>=0.5.1,<0.6", "spacy-llm>=0.7.2,<0.8", "germansentiment>=1.1.0,<2"]
dependencies = ["webvtt-py>=0.5.1,<0.6", "spacy-llm>=0.7.2,<0.8", "germansentiment>=1.1.0,<2", "bertopic>=0.16.4,<0.17", "gensim>=4.3.3,<5"]
description = "Normalisierung der Untertitel."
name = "normalize_subtitles"
name = "subtitles_processing"
requires-python = ">= 3.11"
version = "0.1.0"
@ -15,7 +15,7 @@ channels = ["conda-forge"]
platforms = ["win-64", "linux-64", "osx-64"]
[tool.pixi.pypi-dependencies]
normalize_subtitles = { path = ".", editable = true }
subtitles_processing = { path = ".", editable = true }
database = { path = "../database", editable = true}
[tool.pixi.tasks]

View File

@ -85,6 +85,7 @@ class NormalizeVtt:
d = {"sentences": sentences, "start": times_start, "end": times_end}
df = pl.DataFrame(data=d)
data_folder = str(Path(__file__).parents[4]) + "/data/" + ep + '/normalized_vtt.csv'
DATA_PATH = Path(__file__).parents[4]
data_folder = DATA_PATH / 'data' / ep / 'normalized_vtt.csv'
return df.write_csv(data_folder)
df.write_csv(data_folder)

View File

@ -10,7 +10,7 @@ from germansentiment import SentimentModel
def get_sent(ep):
data_folder = str(Path(__file__).parents[4]) + "/data/" + ep + '/normalized_vtt.csv'
data_folder = str(Path(__file__).parents[4]) + "/data/" + str(ep) + '/normalized_vtt.csv'
df = pl.read_csv(data_folder)
model = SentimentModel()

File diff suppressed because one or more lines are too long

View File

@ -1,2 +0,0 @@
# GitHub syntax highlighting
pixi.lock linguist-language=YAML linguist-generated=true

View File

@ -1,4 +0,0 @@
# pixi environments
.pixi
*.egg-info

File diff suppressed because it is too large Load Diff

View File

@ -1,23 +0,0 @@
[project]
authors = [{name = "Giò Diani", email = "mail@gionathandiani.name"}]
dependencies = []
description = "Add a short description here"
name = "video_deepface"
requires-python = ">=3.7,<3.11"
version = "0.1.0"
[build-system]
build-backend = "hatchling.build"
requires = ["hatchling"]
[tool.pixi.project]
channels = ["conda-forge"]
platforms = ["linux-64"]
[tool.pixi.pypi-dependencies]
video_deepface = { path = ".", editable = true }
[tool.pixi.tasks]
[tool.pixi.dependencies]
deepface = ">=0.0.86"

View File

@ -1,10 +0,0 @@
from deepface import DeepFace
"""
objs = DeepFace.analyze(
img_path = "/home/gio/Code/VANA/data/24/frame000010.jpg",
actions = ['age', 'gender', 'emotion'],
)
print(objs)
"""

View File

@ -1,2 +0,0 @@
# GitHub syntax highlighting
pixi.lock linguist-language=YAML linguist-generated=true

View File

@ -1,4 +0,0 @@
# pixi environments
.pixi
*.egg-info

File diff suppressed because it is too large Load Diff

View File

@ -1,20 +0,0 @@
[project]
authors = [{name = "Giò Diani", email = "mail@gionathandiani.name"}]
dependencies = ["deepface>=0.0.93,<0.0.94"]
description = "Add a short description here"
name = "video_deepface2"
requires-python = ">= 3.6,<3.11"
version = "0.1.0"
[build-system]
build-backend = "hatchling.build"
requires = ["hatchling"]
[tool.pixi.project]
channels = ["conda-forge"]
platforms = ["linux-64"]
[tool.pixi.pypi-dependencies]
video_deepface2 = { path = ".", editable = true }
[tool.pixi.tasks]

View File

@ -1,9 +0,0 @@
from deepface import DeepFace
objs = DeepFace.analyze(
img_path = "/home/gio/Code/VANA/data/24/frame000305.jpg",
actions = ['age', 'gender', 'emotion'],
)
print(objs)

View File

@ -8,7 +8,7 @@
[
{
"name": "Python venv",
"cmd": ["/home/gio/Code/VANA/VANA-python/video_deepface2/.pixi/envs/default/bin/python", "$file"],
"cmd": ["/home/gio/Code/VANA/VANA-python/subtitles_processing/.pixi/envs/default/bin/python", "$file"],
"selector": "source.python",
"file_regex": "^\\s*File \"(...*?)\", line ([0-9]*)"
}

File diff suppressed because one or more lines are too long

Binary file not shown.