improvement topic modelling

master
Giò Diani 2025-01-16 22:18:12 +01:00
parent 021b828fea
commit f7c0df98b2
35 changed files with 10498 additions and 15354 deletions

View File

@ -47,7 +47,7 @@ class Episode extends Api
$subtitles = new Subtitles; $subtitles = new Subtitles;
$subs = $subtitles->getWebVTT('urn:srf:episode:tv:'.$ep['id']); $subs = $subtitles->getWebVTT('urn:srf:episode:tv:'.$ep['id']);
$test = EpisodeModel::firstOrCreate( $epModel = EpisodeModel::updateOrCreate(
[ [
'urn' => $ep['fullLengthUrn'] 'urn' => $ep['fullLengthUrn']
], ],

View File

@ -18,13 +18,13 @@ class Subtitles extends Api
public function getUrl(string $urn): string public function getUrl(string $urn): string
{ {
$response = Http::withHeaders($this->headers)->withQueryParameters([ $response = Http::withHeaders($this->headers)->withQueryParameters([
'episode' => $urn, 'episode' => $urn,
])->get($this->endpoint.'subtitles'); ])->get($this->endpoint.'subtitles');
if($response->ok() && !empty($response[0]['url'])){
if($response->ok()){ return $response[0]['url'];
return $response->json()[0]['url'];
} }
return false; return false;
@ -34,9 +34,16 @@ class Subtitles extends Api
public function getWebVTT(string $urn): ?string public function getWebVTT(string $urn): ?string
{ {
$url = $this->getUrl($urn); $url = $this->getUrl($urn);
$response = Http::get($url); $response = null;
if($response->successful()){ if($url){
$response = Http::get($url);
} else {
$altUrn = explode(":", $urn)[4];
$response = Http::get("https://subtitles.eai-general.aws.srf.ch/srf/{$altUrn}/episode/de/vod/vod.vtt");
}
if($response && $response->successful()){
return $response->body(); return $response->body();
} }

View File

@ -0,0 +1,26 @@
<?php
use Illuminate\Database\Migrations\Migration;
use Illuminate\Database\Schema\Blueprint;
use Illuminate\Support\Facades\Schema;
return new class extends Migration
{
    /**
     * Run the migrations.
     *
     * Adds a nullable JSON `topics` column to the `episodes` table.
     */
    public function up(): void
    {
        Schema::table('episodes', function (Blueprint $table) {
            $table->json('topics')->nullable();
        });
    }

    /**
     * Reverse the migrations.
     *
     * Drops the `topics` column added by up(), so that a rollback restores
     * the previous schema and the migration can be run again afterwards.
     */
    public function down(): void
    {
        Schema::table('episodes', function (Blueprint $table) {
            $table->dropColumn('topics');
        });
    }
};

View File

@ -47,6 +47,17 @@ input, button, textarea, select {
font: inherit; font: inherit;
} }
input{
margin-top: .2em;
border-radius: .2em;
border: 1px solid #fff;
background: var(--c3);
color: #fff;
padding: .2em .5em;
display: block;
}
h1, h1,
h2, h2,
h3, h3,
@ -294,6 +305,21 @@ main {
background: var(--c3); background: var(--c3);
} }
.track-ctrl h2{
font-size: 1.2em;
}
.track-ctrl p {
margin-top: 1em;
font-size: 1em;
line-height: 1.5;
}
.track-ctrl label {
margin-top: 1em;
display: inline-block;
}
.track-viz{ .track-viz{
background: var(--c3); background: var(--c3);
} }

View File

@ -116,7 +116,10 @@
<div class="track"> <div class="track">
<div class="track-ctrl"> <div class="track-ctrl">
<h2>Topics</h2> <h2>Topics</h2>
<label>
Segmente:
<input type="number" min="1" max="10" value="4" id="topic-track-segment-ctrl"> <input type="number" min="1" max="10" value="4" id="topic-track-segment-ctrl">
</label>
</div> </div>
<div class="track-viz"> <div class="track-viz">
<ul id="topic-segement-list" class="segments"> <ul id="topic-segement-list" class="segments">

View File

@ -36,4 +36,9 @@ def save_sentiments_f_sub(id, data):
con.commit() con.commit()
con.close() con.close()
def save_topics(id, data):
    """Write the topics of one episode to the ``episodes`` table.

    Args:
        id: Value matched against the ``id`` column of ``episodes``.
        data: Value stored in the ``topics`` column (presumably a
            JSON-encoded string -- TODO confirm against the caller).

    NOTE(review): ``cur`` and ``con`` are not defined in this function; they
    are expected to be provided by the enclosing module.
    """
    try:
        cur.execute("UPDATE episodes SET topics = ? WHERE id = ?", [data, id])
        con.commit()
    finally:
        # Release the connection even when the UPDATE or the commit raises.
        con.close()

View File

@ -0,0 +1,21 @@
# subtitles_processing
Paket zur Aufbereitung der Untertitel.
## subtitles-processing.py
Normalisiert die Untertitel einer Episode. Die Timecodes werden neu gespeichert, damit pro Zeile immer ganze Sätze vorhanden sind.
```bash
python src/normalize_subtitles/subtitles-processing.py -a <"normalize"> -ep <int>
```
## count_words.py
Zählt die Wortanzahl pro Satz.
```bash
python src/normalize_subtitles/count_words.py -ep <int>
```
## sentence_sentiment.py
Berechnet das Sentiment pro Satz.
```bash
python src/normalize_subtitles/sentence_sentiment.py -ep <int>
```

View File

@ -1,8 +1,8 @@
[project] [project]
authors = [{name = "Giò Diani", email = "mail@gionathandiani.name"}] authors = [{name = "Giò Diani", email = "mail@gionathandiani.name"}]
dependencies = ["webvtt-py>=0.5.1,<0.6", "spacy-llm>=0.7.2,<0.8", "germansentiment>=1.1.0,<2"] dependencies = ["webvtt-py>=0.5.1,<0.6", "spacy-llm>=0.7.2,<0.8", "germansentiment>=1.1.0,<2", "bertopic>=0.16.4,<0.17", "gensim>=4.3.3,<5"]
description = "Normalisierung der Untertitel." description = "Normalisierung der Untertitel."
name = "normalize_subtitles" name = "subtitles_processing"
requires-python = ">= 3.11" requires-python = ">= 3.11"
version = "0.1.0" version = "0.1.0"
@ -15,7 +15,7 @@ channels = ["conda-forge"]
platforms = ["win-64", "linux-64", "osx-64"] platforms = ["win-64", "linux-64", "osx-64"]
[tool.pixi.pypi-dependencies] [tool.pixi.pypi-dependencies]
normalize_subtitles = { path = ".", editable = true } subtitles_processing = { path = ".", editable = true }
database = { path = "../database", editable = true} database = { path = "../database", editable = true}
[tool.pixi.tasks] [tool.pixi.tasks]

View File

@ -85,6 +85,7 @@ class NormalizeVtt:
d = {"sentences": sentences, "start": times_start, "end": times_end} d = {"sentences": sentences, "start": times_start, "end": times_end}
df = pl.DataFrame(data=d) df = pl.DataFrame(data=d)
data_folder = str(Path(__file__).parents[4]) + "/data/" + ep + '/normalized_vtt.csv' DATA_PATH = Path(__file__).parents[4]
data_folder = DATA_PATH / 'data' / ep / 'normalized_vtt.csv'
return df.write_csv(data_folder) df.write_csv(data_folder)

View File

@ -10,7 +10,7 @@ from germansentiment import SentimentModel
def get_sent(ep): def get_sent(ep):
data_folder = str(Path(__file__).parents[4]) + "/data/" + ep + '/normalized_vtt.csv' data_folder = str(Path(__file__).parents[4]) + "/data/" + str(ep) + '/normalized_vtt.csv'
df = pl.read_csv(data_folder) df = pl.read_csv(data_folder)
model = SentimentModel() model = SentimentModel()

File diff suppressed because one or more lines are too long

View File

@ -1,2 +0,0 @@
# GitHub syntax highlighting
pixi.lock linguist-language=YAML linguist-generated=true

View File

@ -1,4 +0,0 @@
# pixi environments
.pixi
*.egg-info

File diff suppressed because it is too large Load Diff

View File

@ -1,23 +0,0 @@
[project]
authors = [{name = "Giò Diani", email = "mail@gionathandiani.name"}]
dependencies = []
description = "Add a short description here"
name = "video_deepface"
requires-python = ">=3.7,<3.11"
version = "0.1.0"
[build-system]
build-backend = "hatchling.build"
requires = ["hatchling"]
[tool.pixi.project]
channels = ["conda-forge"]
platforms = ["linux-64"]
[tool.pixi.pypi-dependencies]
video_deepface = { path = ".", editable = true }
[tool.pixi.tasks]
[tool.pixi.dependencies]
deepface = ">=0.0.86"

View File

@ -1,10 +0,0 @@
from deepface import DeepFace
"""
objs = DeepFace.analyze(
img_path = "/home/gio/Code/VANA/data/24/frame000010.jpg",
actions = ['age', 'gender', 'emotion'],
)
print(objs)
"""

View File

@ -1,2 +0,0 @@
# GitHub syntax highlighting
pixi.lock linguist-language=YAML linguist-generated=true

View File

@ -1,4 +0,0 @@
# pixi environments
.pixi
*.egg-info

File diff suppressed because it is too large Load Diff

View File

@ -1,20 +0,0 @@
[project]
authors = [{name = "Giò Diani", email = "mail@gionathandiani.name"}]
dependencies = ["deepface>=0.0.93,<0.0.94"]
description = "Add a short description here"
name = "video_deepface2"
requires-python = ">= 3.6,<3.11"
version = "0.1.0"
[build-system]
build-backend = "hatchling.build"
requires = ["hatchling"]
[tool.pixi.project]
channels = ["conda-forge"]
platforms = ["linux-64"]
[tool.pixi.pypi-dependencies]
video_deepface2 = { path = ".", editable = true }
[tool.pixi.tasks]

View File

@ -1,9 +0,0 @@
from deepface import DeepFace
objs = DeepFace.analyze(
img_path = "/home/gio/Code/VANA/data/24/frame000305.jpg",
actions = ['age', 'gender', 'emotion'],
)
print(objs)

View File

@ -8,7 +8,7 @@
[ [
{ {
"name": "Python venv", "name": "Python venv",
"cmd": ["/home/gio/Code/VANA/VANA-python/video_deepface2/.pixi/envs/default/bin/python", "$file"], "cmd": ["/home/gio/Code/VANA/VANA-python/subtitles_processing/.pixi/envs/default/bin/python", "$file"],
"selector": "source.python", "selector": "source.python",
"file_regex": "^\\s*File \"(...*?)\", line ([0-9]*)" "file_regex": "^\\s*File \"(...*?)\", line ([0-9]*)"
} }

File diff suppressed because one or more lines are too long

Binary file not shown.