improvement topic modelling

2025-01-16 22:18:12 +01:00 · 2025-01-16 22:18:12 +01:00 · f7c0df98b2
commit f7c0df98b2
parent 021b828fea
35 changed files with 10498 additions and 15354 deletions
--- a/VANA-php/app/Srgssr/Episode.php
+++ b/VANA-php/app/Srgssr/Episode.php
@ -47,7 +47,7 @@ class Episode extends Api
                $subtitles = new Subtitles;
                $subs = $subtitles->getWebVTT('urn:srf:episode:tv:'.$ep['id']);

-                $test = EpisodeModel::firstOrCreate(
+                $epModel = EpisodeModel::updateOrCreate(
                    [
                        'urn' => $ep['fullLengthUrn']
                    ],
--- a/VANA-php/app/Srgssr/Subtitles.php
+++ b/VANA-php/app/Srgssr/Subtitles.php
@ -18,13 +18,13 @@ class Subtitles extends Api

    public function getUrl(string $urn): string
    {
+
        $response = Http::withHeaders($this->headers)->withQueryParameters([
            'episode' => $urn,
        ])->get($this->endpoint.'subtitles');

-
-        if($response->ok()){
-            return $response->json()[0]['url'];
+        if($response->ok() && !empty($response[0]['url'])){
+            return $response[0]['url'];
        }

        return false;
@ -34,9 +34,16 @@ class Subtitles extends Api
    public function getWebVTT(string $urn): ?string
    {
        $url = $this->getUrl($urn);
-        $response = Http::get($url);
+        $response = null;

-        if($response->successful()){
+        if($url){
+            $response = Http::get($url);
+        } else {
+            $altUrn = explode(":", $urn)[4];
+            $response = Http::get("https://subtitles.eai-general.aws.srf.ch/srf/{$altUrn}/episode/de/vod/vod.vtt");
+        }
+
+        if($response && $response->successful()){
            return $response->body();
        }

--- a/VANA-php/database/migrations/2025_01_16_210341_add_topics.php
+++ b/VANA-php/database/migrations/2025_01_16_210341_add_topics.php
@ -0,0 +1,26 @@
+<?php
+
+use Illuminate\Database\Migrations\Migration;
+use Illuminate\Database\Schema\Blueprint;
+use Illuminate\Support\Facades\Schema;
+
+return new class extends Migration
+{
+    /**
+     * Run the migrations: add a nullable JSON `topics` column to `episodes`.
+     */
+    public function up(): void
+    {
+        Schema::table('episodes', function (Blueprint $table) {
+            $table->json('topics')->nullable();
+        });
+    }
+
+    /**
+     * Reverse the migrations: drop the `topics` column added by up().
+     */
+    public function down(): void
+    {
+        Schema::dropColumns('episodes', ['topics']);
+    }
+};
--- a/VANA-php/resources/css/app.css
+++ b/VANA-php/resources/css/app.css
@ -47,6 +47,17 @@ input, button, textarea, select {
  font: inherit;
 }

+input{
+    margin-top: .2em;
+    border-radius: .2em;
+    border: 1px solid #fff;
+    background: var(--c3);
+    color: #fff;
+    padding: .2em .5em;
+    display: block;
+}
+
+
 h1,
 h2,
 h3,
@ -294,6 +305,21 @@ main {
    background: var(--c3);
 }

+.track-ctrl h2{
+    font-size: 1.2em;
+}
+
+.track-ctrl p {
+    margin-top: 1em;
+    font-size: 1em;
+    line-height: 1.5;
+}
+
+.track-ctrl label {
+    margin-top: 1em;
+    display: inline-block;
+}
+
 .track-viz{
    background: var(--c3);
 }
--- a/VANA-php/resources/views/detail.blade.php
+++ b/VANA-php/resources/views/detail.blade.php
@ -116,7 +116,10 @@
            <div class="track">
                <div class="track-ctrl">
                    <h2>Topics</h2>
+                    <label>
+                        Segmente:
                    <input type="number" min="1" max="10" value="4" id="topic-track-segment-ctrl">
+                    </label>
                </div>
                <div class="track-viz">
                    <ul id="topic-segement-list" class="segments">
--- a/VANA-python/database/src/database/queries.py
+++ b/VANA-python/database/src/database/queries.py
@ -36,4 +36,9 @@ def save_sentiments_f_sub(id, data):
    con.commit()
    con.close()

+def save_topics(id, data):  # store `data` in the `topics` column of the episodes row whose id matches `id`
+    cur.execute("UPDATE episodes SET topics = ? WHERE id = ?", [data, id])  # values are bound as parameters, not interpolated
+    con.commit()
+    con.close()  # NOTE(review): closes `con`, which is defined outside this function - confirm later calls get a fresh connection
+

--- a/VANA-python/subtitles_processing/.gitattributes
+++ b/VANA-python/subtitles_processing/.gitattributes
--- a/VANA-python/subtitles_processing/.gitignore
+++ b/VANA-python/subtitles_processing/.gitignore
--- a/VANA-python/subtitles_processing/README.md
+++ b/VANA-python/subtitles_processing/README.md
@ -0,0 +1,21 @@
+# subtitles_processing
+Paket zur Aufbereitung der Untertitel.
+
+## subtitles-processing.py
+Normalisiert die Untertitel einer Episode. Die Timecodes werden neu zugeordnet, damit jede Zeile immer ganze Sätze enthält.
+  
+```bash
+python src/subtitles_processing/subtitles-processing.py -a <"normalize"> -ep <int>
+```
+
+## count_words.py
+Zählt die Wortanzahl pro Satz.
+```bash
+python src/subtitles_processing/count_words.py -ep <int>
+```
+
+## sentence_sentiment.py
+Berechnet das Sentiment pro Satz.
+```bash
+python src/subtitles_processing/sentence_sentiment.py -ep <int>
+```
--- a/VANA-python/subtitles_processing/pixi.lock
+++ b/VANA-python/subtitles_processing/pixi.lock
--- a/VANA-python/subtitles_processing/pyproject.toml
+++ b/VANA-python/subtitles_processing/pyproject.toml
@ -1,8 +1,8 @@
 [project]
 authors = [{name = "Giò Diani", email = "mail@gionathandiani.name"}]
-dependencies = ["webvtt-py>=0.5.1,<0.6", "spacy-llm>=0.7.2,<0.8", "germansentiment>=1.1.0,<2"]
+dependencies = ["webvtt-py>=0.5.1,<0.6", "spacy-llm>=0.7.2,<0.8", "germansentiment>=1.1.0,<2", "bertopic>=0.16.4,<0.17", "gensim>=4.3.3,<5"]
 description = "Normalisierung der Untertitel."
-name = "normalize_subtitles"
+name = "subtitles_processing"
 requires-python = ">= 3.11"
 version = "0.1.0"

@ -15,7 +15,7 @@ channels = ["conda-forge"]
 platforms = ["win-64", "linux-64", "osx-64"]

 [tool.pixi.pypi-dependencies]
-normalize_subtitles = { path = ".", editable = true }
+subtitles_processing = { path = ".", editable = true }
 database = { path = "../database", editable = true}

 [tool.pixi.tasks]
--- a/VANA-python/subtitles_processing/src/subtitles_processing/init.py
+++ b/VANA-python/subtitles_processing/src/subtitles_processing/init.py
--- a/VANA-python/subtitles_processing/src/subtitles_processing/pycache/normalize_vtt.cpython-311.pyc
+++ b/VANA-python/subtitles_processing/src/subtitles_processing/pycache/normalize_vtt.cpython-311.pyc
--- a/VANA-python/subtitles_processing/src/subtitles_processing/pycache/timecode.cpython-311.pyc
+++ b/VANA-python/subtitles_processing/src/subtitles_processing/pycache/timecode.cpython-311.pyc
--- a/VANA-python/subtitles_processing/src/subtitles_processing/count_words.py
+++ b/VANA-python/subtitles_processing/src/subtitles_processing/count_words.py
--- a/VANA-python/subtitles_processing/src/subtitles_processing/normalize_vtt.py
+++ b/VANA-python/subtitles_processing/src/subtitles_processing/normalize_vtt.py
@ -85,6 +85,7 @@ class NormalizeVtt:
        d = {"sentences": sentences, "start": times_start, "end": times_end}
        df = pl.DataFrame(data=d)

-        data_folder = str(Path(__file__).parents[4]) + "/data/" + ep + '/normalized_vtt.csv'
+        DATA_PATH = Path(__file__).parents[4]
+        data_folder = DATA_PATH / 'data' / ep / 'normalized_vtt.csv'

-        return df.write_csv(data_folder)
+        df.write_csv(data_folder)
--- a/VANA-python/subtitles_processing/src/subtitles_processing/sentence_sentiment.py
+++ b/VANA-python/subtitles_processing/src/subtitles_processing/sentence_sentiment.py
@ -10,7 +10,7 @@ from germansentiment import SentimentModel


 def get_sent(ep):
-    data_folder = str(Path(__file__).parents[4]) + "/data/" + ep + '/normalized_vtt.csv'
+    data_folder = str(Path(__file__).parents[4]) + "/data/" + str(ep) + '/normalized_vtt.csv'
    df = pl.read_csv(data_folder)

    model = SentimentModel()
--- a/VANA-python/subtitles_processing/src/subtitles_processing/subtitles-processing.py
+++ b/VANA-python/subtitles_processing/src/subtitles_processing/subtitles-processing.py
--- a/VANA-python/subtitles_processing/src/subtitles_processing/timecode.py
+++ b/VANA-python/subtitles_processing/src/subtitles_processing/timecode.py
--- a/VANA-python/subtitles_processing/src/subtitles_processing/topics.py
+++ b/VANA-python/subtitles_processing/src/subtitles_processing/topics.py
--- a/VANA-python/video_deepface/.gitattributes
+++ b/VANA-python/video_deepface/.gitattributes
@ -1,2 +0,0 @@
-# GitHub syntax highlighting
-pixi.lock linguist-language=YAML linguist-generated=true
--- a/VANA-python/video_deepface/.gitignore
+++ b/VANA-python/video_deepface/.gitignore
@ -1,4 +0,0 @@
-
-# pixi environments
-.pixi
-*.egg-info
--- a/VANA-python/video_deepface/pixi.lock
+++ b/VANA-python/video_deepface/pixi.lock
--- a/VANA-python/video_deepface/pyproject.toml
+++ b/VANA-python/video_deepface/pyproject.toml
@ -1,23 +0,0 @@
-[project]
-authors = [{name = "Giò Diani", email = "mail@gionathandiani.name"}]
-dependencies = []
-description = "Add a short description here"
-name = "video_deepface"
-requires-python = ">=3.7,<3.11"
-version = "0.1.0"
-
-[build-system]
-build-backend = "hatchling.build"
-requires = ["hatchling"]
-
-[tool.pixi.project]
-channels = ["conda-forge"]
-platforms = ["linux-64"]
-
-[tool.pixi.pypi-dependencies]
-video_deepface = { path = ".", editable = true }
-
-[tool.pixi.tasks]
-
-[tool.pixi.dependencies]
-deepface = ">=0.0.86"
--- a/VANA-python/video_deepface/src/analyze.py
+++ b/VANA-python/video_deepface/src/analyze.py
@ -1,10 +0,0 @@
-from deepface import DeepFace
-
-"""
-objs = DeepFace.analyze(
-    img_path = "/home/gio/Code/VANA/data/24/frame000010.jpg",
-    actions = ['age', 'gender', 'emotion'],
-)
-
-print(objs)
-"""
--- a/VANA-python/video_deepface/src/video_deepface/init.py
+++ b/VANA-python/video_deepface/src/video_deepface/init.py
--- a/VANA-python/video_deepface2/.gitattributes
+++ b/VANA-python/video_deepface2/.gitattributes
@ -1,2 +0,0 @@
-# GitHub syntax highlighting
-pixi.lock linguist-language=YAML linguist-generated=true
--- a/VANA-python/video_deepface2/.gitignore
+++ b/VANA-python/video_deepface2/.gitignore
@ -1,4 +0,0 @@
-
-# pixi environments
-.pixi
-*.egg-info
--- a/VANA-python/video_deepface2/pixi.lock
+++ b/VANA-python/video_deepface2/pixi.lock
--- a/VANA-python/video_deepface2/pyproject.toml
+++ b/VANA-python/video_deepface2/pyproject.toml
@ -1,20 +0,0 @@
-[project]
-authors = [{name = "Giò Diani", email = "mail@gionathandiani.name"}]
-dependencies = ["deepface>=0.0.93,<0.0.94"]
-description = "Add a short description here"
-name = "video_deepface2"
-requires-python = ">= 3.6,<3.11"
-version = "0.1.0"
-
-[build-system]
-build-backend = "hatchling.build"
-requires = ["hatchling"]
-
-[tool.pixi.project]
-channels = ["conda-forge"]
-platforms = ["linux-64"]
-
-[tool.pixi.pypi-dependencies]
-video_deepface2 = { path = ".", editable = true }
-
-[tool.pixi.tasks]
--- a/VANA-python/video_deepface2/src/test.py
+++ b/VANA-python/video_deepface2/src/test.py
@ -1,9 +0,0 @@
-from deepface import DeepFace
-
-objs = DeepFace.analyze(
-    img_path = "/home/gio/Code/VANA/data/24/frame000305.jpg",
-    actions = ['age', 'gender', 'emotion'],
-)
-
-print(objs)
-
--- a/VANA-python/video_deepface2/src/video_deepface2/init.py
+++ b/VANA-python/video_deepface2/src/video_deepface2/init.py
--- a/VANA.sublime-project
+++ b/VANA.sublime-project
@ -8,7 +8,7 @@
    [
        {
            "name": "Python venv",
-            "cmd": ["/home/gio/Code/VANA/VANA-python/video_deepface2/.pixi/envs/default/bin/python", "$file"],
+            "cmd": ["/home/gio/Code/VANA/VANA-python/subtitles_processing/.pixi/envs/default/bin/python", "$file"],
 		    "selector": "source.python",
 		    "file_regex": "^\\s*File \"(...*?)\", line ([0-9]*)"
 		}
--- a/VANA.sublime-workspace
+++ b/VANA.sublime-workspace
--- a/database.sqlite
+++ b/database.sqlite