487 lines
21 KiB
Python
487 lines
21 KiB
Python
"""
|
||
============================================================================
|
||
Example 0: Tokens, Embeddings, and Language Similarity — An Introduction
|
||
============================================================================
|
||
AISE501 – AI in Software Engineering I
|
||
Fachhochschule Graubünden
|
||
|
||
GOAL:
|
||
Before we look at CODE embeddings, we need to understand the
|
||
foundational concepts: tokenization and text embeddings. This script
|
||
walks through the full pipeline step by step, using German words
|
||
and phrases so you can build intuition in your native language.
|
||
|
||
The pipeline is: Text → Tokens → Token IDs → Embedding Vectors
|
||
|
||
WHAT YOU WILL LEARN:
|
||
1. How text is split into TOKENS (sub-word units)
|
||
2. How tokens are mapped to integer IDs (the model's vocabulary)
|
||
3. How token IDs become dense EMBEDDING VECTORS (768 dimensions)
|
||
4. How cosine similarity measures meaning — similar phrases are
|
||
close in vector space, different phrases are far apart
|
||
5. How to VISUALIZE the embedding space in 2D using PCA
|
||
|
||
LANGUAGE:
|
||
All examples use German words and phrases to make the concepts
|
||
tangible. The model (multilingual) handles German natively.
|
||
|
||
HARDWARE:
|
||
Works on CPU, CUDA (NVIDIA), and MPS (Apple Silicon Mac).
|
||
============================================================================
|
||
"""
|
||
|
||
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModel, BertTokenizer
import torch.nn.functional as F
from sklearn.decomposition import PCA
import matplotlib

# Select the non-interactive Agg backend BEFORE pyplot is imported:
# pyplot chooses its backend when first imported, so setting it up front
# guarantees headless rendering (no display server needed) and avoids
# ever initializing a GUI backend.
matplotlib.use("Agg")
import matplotlib.pyplot as plt
|
||
|
||
# ── Device selection ──────────────────────────────────────────────────────
def get_device():
    """Return the best available torch device, preferring CUDA, then MPS,
    then falling back to CPU."""
    probes = (
        (torch.cuda.is_available, "cuda"),
        (torch.backends.mps.is_available, "mps"),
    )
    for is_available, name in probes:
        if is_available():
            return torch.device(name)
    return torch.device("cpu")
|
||
|
||
# Resolve the compute device once; every tensor/model below is moved to it.
DEVICE = get_device()
print(f"Using device: {DEVICE}\n")

# ── Load a MULTILINGUAL EMBEDDING model ───────────────────────────────────
# We use paraphrase-multilingual-mpnet-base-v2: a sentence embedding model
# fine-tuned for semantic similarity across 50+ languages including German.
# It uses an XLM-RoBERTa backbone and produces 768-dimensional embeddings
# where cosine similarity directly reflects semantic similarity.
MODEL_NAME = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"

print(f"Loading model: {MODEL_NAME} ...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME).to(DEVICE)
# eval() disables dropout etc., so repeated embeddings are deterministic.
model.eval()
print("Model loaded.\n")

# ── Load a German-only tokenizer for comparison ──────────────────────────
# gbert-base uses WordPiece trained exclusively on German text (~31k vocab).
# We only load its tokenizer — no model weights needed.
GERMAN_TOKENIZER_NAME = "deepset/gbert-base"
print(f"Loading German tokenizer: {GERMAN_TOKENIZER_NAME} ...")
german_tokenizer = BertTokenizer.from_pretrained(GERMAN_TOKENIZER_NAME)
print("German tokenizer loaded.\n")
|
||
|
||
|
||
# ══════════════════════════════════════════════════════════════════════════
# PART 1: TOKENIZATION — How text becomes numbers
# ══════════════════════════════════════════════════════════════════════════
print("=" * 70)
print("PART 1: TOKENIZATION")
print("=" * 70)
print("""
Neural networks cannot read text — they only understand numbers.
TOKENIZATION is the first step: splitting text into sub-word pieces
called TOKENS, then mapping each token to an integer ID.

We compare two tokenizers:
• gbert (German-only, ~31k vocab) — trained exclusively on German text
• mpnet (multilingual, ~250k vocab) — trained on 100+ languages
""")

# Sample vocabulary: long German compounds that stress sub-word splitting,
# plus one English word for contrast with the German-only tokenizer.
german_words = [
    "Fachhochschule",
    "Softwareentwicklung",
    "Künstliche Intelligenz",
    "Programmiersprache",
    "Datenbank",
    "Maschinelles Lernen",
    "Graubünden",
    "unhappiness",  # English comparison
]
|
||
|
||
# ── 1a: German-only tokenizer (gbert / WordPiece) ────────────────────────
# Tokenize each sample with the German-only vocabulary and print one table
# row per word: the text, its token count, and the token pieces.
print("─── 1a: German-Only Tokenizer (gbert, WordPiece, 31k vocab) ───\n")
print(f"{'Word/Phrase':<28s} {'#':>3s} {'Tokens'}")
print("-" * 90)

for sample in german_words:
    piece_ids = german_tokenizer.encode(sample, add_special_tokens=False)
    pieces = german_tokenizer.convert_ids_to_tokens(piece_ids)
    print(f"{sample:<28s} {len(pieces):3d} {' | '.join(pieces)}")
|
||
|
||
# ── 1b: Multilingual tokenizer (mpnet / SentencePiece) ───────────────────
|
||
print(f"\n─── 1b: Multilingual Tokenizer (mpnet, SentencePiece, 250k vocab) ───\n")
|
||
print(f"{'Word/Phrase':<28s} {'#':>3s} {'Tokens'}")
|
||
print("-" * 90)
|
||
|
||
for word in german_words:
|
||
ids = tokenizer.encode(word, add_special_tokens=False)
|
||
toks = tokenizer.convert_ids_to_tokens(ids)
|
||
print(f"{word:<28s} {len(toks):3d} {' | '.join(toks)}")
|
||
|
||
print("""
|
||
KEY OBSERVATIONS:
|
||
• The GERMAN tokenizer keeps common words intact: "Fachhochschule" is
|
||
a SINGLE token, "Programmiersprache" splits at the natural compound
|
||
boundary "Programmier" + "sprache".
|
||
• The MULTILINGUAL tokenizer fragments German more aggressively:
|
||
"Fachhochschule" → 4 tokens ("Fach", "ho", "ch", "schule"), because
|
||
its 250k vocabulary is shared across 100+ languages — German gets
|
||
a smaller budget per word.
|
||
• Both tokenizers use STATISTICAL sub-word splitting (not morphological
|
||
analysis). The German tokenizer simply has more German-specific
|
||
entries because its entire vocabulary is dedicated to one language.
|
||
• Trade-off: the multilingual tokenizer needs more tokens per German
|
||
word, but it enables CROSS-LINGUAL capabilities (comparing German
|
||
and English in the same embedding space — see Part 3b).
|
||
• The rest of this script uses the multilingual model for embeddings.
|
||
""")
|
||
|
||
|
||
# ══════════════════════════════════════════════════════════════════════════
# PART 2: FROM TOKENS TO EMBEDDING VECTORS
# ══════════════════════════════════════════════════════════════════════════
print("=" * 70)
print("PART 2: FROM TOKENS TO EMBEDDING VECTORS")
print("=" * 70)
print("""
Each token ID is looked up in an EMBEDDING TABLE — a large matrix where
each row is a dense vector (768 dimensions in this model, up to 4096 in
large LLMs). The transformer then refines these vectors through 12 layers
of self-attention, producing contextual embeddings where each token's
vector depends on ALL surrounding tokens.
""")

example_sentence = "Der Student lernt Programmieren an der Fachhochschule"

# Tokenize the sentence (the tokenizer adds the model's special tokens)
# and move the resulting tensors to the selected device.
inputs = tokenizer(example_sentence, return_tensors="pt").to(DEVICE)
token_ids = inputs["input_ids"].squeeze().tolist()
tokens = tokenizer.convert_ids_to_tokens(token_ids)

# Forward pass without gradient tracking — we only need the activations.
with torch.no_grad():
    outputs = model(**inputs)

# outputs.last_hidden_state: shape [1, num_tokens, 768]
hidden_states = outputs.last_hidden_state.squeeze(0)

print(f'Sentence: "{example_sentence}"\n')
print(f"{'Pos':>4s} {'Token':<20s} {'ID':>7s} {'Vector (first 8 of 768 dims)...'}")
print("-" * 80)

# One table row per token: position, token string, vocabulary ID, and a
# preview of the first 8 of its embedding dimensions.
for i, (tok, tid) in enumerate(zip(tokens, token_ids)):
    vec = hidden_states[i].cpu().numpy()
    vec_preview = " ".join(f"{v:+.3f}" for v in vec[:8])
    print(f"{i:4d} {tok:<20s} {tid:7d} [{vec_preview} ...]")

print(f"""
KEY OBSERVATIONS:
• Each token becomes a vector of {hidden_states.shape[1]} numbers.
• These numbers are NOT random — they encode the token's meaning
IN CONTEXT. The vector for "Fachhochschule" here is different from
the vector for "Fachhochschule" in a different sentence.
• The full sentence has {len(tokens)} tokens, producing a matrix of
shape [{len(tokens)} × {hidden_states.shape[1]}].
• To get a single vector for the whole sentence, we average all
token vectors (mean pooling).
""")
|
||
|
||
|
||
# ══════════════════════════════════════════════════════════════════════════
# PART 3: MEASURING SIMILARITY BETWEEN WORDS
# ══════════════════════════════════════════════════════════════════════════
print("=" * 70)
print("PART 3: WORD AND PHRASE SIMILARITY")
print("=" * 70)
print("""
If embeddings capture meaning, then SIMILAR words should have SIMILAR
vectors (high cosine similarity) and DIFFERENT words should have
DIFFERENT vectors (low cosine similarity). Let's test this with German.
""")
|
||
|
||
|
||
def embed_text(text: str) -> torch.Tensor:
    """Embed a word or phrase into a single normalized vector.

    Runs the model once, mean-pools the token vectors (weighted by the
    attention mask so padding never contributes), and L2-normalizes the
    result — so a plain dot product of two outputs is cosine similarity.
    """
    encoded = tokenizer(
        text, return_tensors="pt", truncation=True, max_length=128, padding=True
    ).to(DEVICE)
    with torch.no_grad():
        model_out = model(**encoded)
    attn = encoded["attention_mask"].unsqueeze(-1)
    pooled = (model_out.last_hidden_state * attn).sum(dim=1) / attn.sum(dim=1)
    return F.normalize(pooled, p=2, dim=1).squeeze(0)
|
||
|
||
|
||
# ── 3a: Single word similarities ─────────────────────────────────────────
print("─── 3a: Single Word Similarities ───\n")

word_pairs = [
    # Semantically SIMILAR pairs (synonyms or near-synonyms)
    ("Auto", "Fahrzeug"),            # car / vehicle — near-synonyms
    ("Arzt", "Doktor"),              # physician / doctor — synonyms
    ("Programmierer", "Entwickler"), # programmer / developer
    ("schnell", "rasch"),            # fast / swift — synonyms
    ("Haus", "Gebäude"),             # house / building — closely related

    # SAME CATEGORY but different concepts
    ("Hund", "Katze"),               # dog / cat — both pets, but different!
    ("Montag", "Freitag"),           # Monday / Friday — both weekdays

    # Semantically UNRELATED pairs
    ("Hund", "Mathematik"),          # dog vs math
    ("Auto", "Philosophie"),         # car vs philosophy
    ("schnell", "Datenbank"),        # fast vs database
]

print(f"{'Word A':<20s} {'Word B':<20s} {'Cosine Sim':>10s} {'Relationship'}")
print("-" * 75)

# Embeddings are unit-length, so the dot product IS the cosine similarity.
for left, right in word_pairs:
    sim = torch.dot(embed_text(left).cpu(), embed_text(right).cpu()).item()
    if sim > 0.6:
        label = "synonyms/close"
    elif sim > 0.3:
        label = "related"
    else:
        label = "unrelated"
    gauge = "█" * int(max(0, sim) * 30)
    print(f"{left:<20s} {right:<20s} {sim:10.3f} {gauge} ({label})")
|
||
|
||
print("""
|
||
KEY OBSERVATIONS:
|
||
→ Synonyms (Auto/Fahrzeug, Arzt/Doktor) have HIGHEST similarity.
|
||
→ Same-category but different concepts (Hund/Katze) have MODERATE
|
||
similarity — they share context (both are pets) but a dog is NOT
|
||
a cat. The model captures this nuance!
|
||
→ Completely unrelated words (Hund/Mathematik) have LOW similarity.
|
||
→ Embedding similarity reflects MEANING OVERLAP, not just category.
|
||
""")
|
||
|
||
# ── 3b: Phrase/sentence similarities ─────────────────────────────────────
print("─── 3b: Phrase and Sentence Similarities ───\n")

phrases = {
    "ML_de": "Maschinelles Lernen ist ein Teilgebiet der Informatik",
    "ML_en": "Machine learning is a subfield of computer science",
    "DL_de": "Deep Learning verwendet neuronale Netze mit vielen Schichten",
    "Koch": "Der Koch bereitet das Abendessen in der Küche vor",
    "Wetter": "Morgen wird es regnen und kalt sein",
    "Prog": "Python ist eine beliebte Programmiersprache",
}

# One normalized embedding per phrase, keyed by its short label.
phrase_embeddings = {name: embed_text(text) for name, text in phrases.items()}

# Print an all-pairs cosine-similarity matrix (header row, then one row
# of dot products per phrase).
names = list(phrases.keys())
print(f"{'':>10s}" + "".join(f"{n:>10s}" for n in names))

for row_name in names:
    row_vec = phrase_embeddings[row_name].cpu()
    cells = "".join(
        f"{torch.dot(row_vec, phrase_embeddings[col].cpu()).item():10.3f}"
        for col in names
    )
    print(f"{row_name:>10s}" + cells)
|
||
|
||
print("""
|
||
KEY OBSERVATIONS:
|
||
• "Maschinelles Lernen..." (German) and "Machine learning..." (English)
|
||
should have HIGH similarity — the model understands both languages
|
||
and maps equivalent meanings to nearby vectors.
|
||
• ML and Deep Learning sentences should be moderately similar (related
|
||
topics in computer science).
|
||
• The cooking sentence and weather sentence should be DISSIMILAR to
|
||
the tech sentences — completely different topics.
|
||
• This CROSS-LINGUAL capability is what makes multilingual embeddings
|
||
so powerful.
|
||
""")
|
||
|
||
|
||
# ══════════════════════════════════════════════════════════════════════════
# PART 4: VISUALIZING THE EMBEDDING SPACE
# ══════════════════════════════════════════════════════════════════════════
print("=" * 70)
print("PART 4: VISUALIZING THE EMBEDDING SPACE")
print("=" * 70)
print("""
768 dimensions are impossible to visualize. We use PCA to project the
vectors down to 2D while preserving as much structure as possible.
If the embeddings truly capture meaning, we should see CLUSTERS of
related words in the 2D plot.
""")

# Groups of German words organized by semantic category
word_groups = {
    "Tiere": ["Hund", "Katze", "Pferd", "Vogel", "Fisch", "Kuh"],
    "Technik": ["Computer", "Software", "Programmieren", "Datenbank",
                "Algorithmus", "Internet"],
    "Essen": ["Brot", "Käse", "Apfel", "Suppe", "Kuchen", "Wurst"],
    "Natur": ["Berg", "Fluss", "Wald", "See", "Wiese", "Schnee"],
    "Berufe": ["Arzt", "Lehrer", "Ingenieur", "Koch", "Pilot", "Anwalt"],
}

# Flatten the groups into three parallel lists: word, its category label,
# and its embedding vector (same index across all three).
all_words = []
all_categories = []
all_vectors = []

print("Computing embeddings for word groups...")
for category, words in word_groups.items():
    for word in words:
        vec = embed_text(word).cpu().numpy()
        all_words.append(word)
        all_categories.append(category)
        all_vectors.append(vec)
    print(f"  {category}: {', '.join(words)}")

# Stack into one [num_words, 768] matrix for PCA.
X = np.stack(all_vectors)
print(f"\nEmbedding matrix: {X.shape[0]} words × {X.shape[1]} dimensions")
|
||
|
||
# ── PCA to 2D ────────────────────────────────────────────────────────────
# Fit PCA on the 768-dim word vectors and project onto the top 2 components.
pca = PCA(n_components=2)
X_2d = pca.fit_transform(X)

# ── Plot ──────────────────────────────────────────────────────────────────
# One fixed color per category, sampled evenly from the Set1 colormap.
category_names = list(word_groups.keys())
cmap = plt.cm.Set1
colors = {cat: cmap(i / len(category_names)) for i, cat in enumerate(category_names)}

fig, ax = plt.subplots(figsize=(12, 9))

# Scatter each word at its projected 2D position and label it just above.
for i, (word, cat) in enumerate(zip(all_words, all_categories)):
    x, y = X_2d[i]
    ax.scatter(x, y, c=[colors[cat]], s=120, edgecolors="black",
               linewidth=0.5, zorder=3)
    ax.annotate(word, (x, y), fontsize=9, ha="center", va="bottom",
                xytext=(0, 7), textcoords="offset points",
                fontweight="bold")

# Empty scatters create exactly one legend entry per category.
for cat in category_names:
    ax.scatter([], [], c=[colors[cat]], s=100, label=cat,
               edgecolors="black", linewidth=0.5)

ax.legend(loc="best", fontsize=11, title="Kategorie", title_fontsize=12,
          framealpha=0.9)

# Report how much of the original variance the two components retain.
var = pca.explained_variance_ratio_
ax.set_title(
    "Deutsche Wörter im Embedding-Raum (768D → 2D via PCA)\n"
    f"PC1: {var[0]:.1%} Varianz, PC2: {var[1]:.1%} Varianz",
    fontsize=14, fontweight="bold"
)
ax.set_xlabel("Hauptkomponente 1 (PC1)", fontsize=12)
ax.set_ylabel("Hauptkomponente 2 (PC2)", fontsize=12)
ax.grid(True, alpha=0.3)
fig.tight_layout()
fig.savefig("embedding_space_german.png", dpi=150)
# Fixed: was an f-string with no placeholders.
print("\nSaved: embedding_space_german.png")
|
||
|
||
# ── Second plot: Phrases including cross-lingual ──────────────────────────
print("\nComputing phrase embeddings for visualization...")

# Phrase → category label. The (DE)/(EN) suffix marks the language so the
# plot can show whether translations land near each other in the space.
viz_phrases = {
    # German CS phrases
    "Maschinelles Lernen": "Technik (DE)",
    "Neuronale Netze": "Technik (DE)",
    "Softwareentwicklung": "Technik (DE)",
    "Künstliche Intelligenz": "Technik (DE)",
    # English equivalents
    "Machine Learning": "Technik (EN)",
    "Neural Networks": "Technik (EN)",
    "Software Development": "Technik (EN)",
    "Artificial Intelligence": "Technik (EN)",
    # German everyday phrases
    "Guten Morgen": "Alltag (DE)",
    "Wie geht es Ihnen": "Alltag (DE)",
    "Das Wetter ist schön": "Alltag (DE)",
    "Ich gehe einkaufen": "Alltag (DE)",
    # English everyday phrases
    "Good morning": "Alltag (EN)",
    "How are you": "Alltag (EN)",
    "The weather is nice": "Alltag (EN)",
    "I am going shopping": "Alltag (EN)",
}

phrase_labels = list(viz_phrases.keys())
phrase_cats = list(viz_phrases.values())
phrase_vecs = np.stack([embed_text(p).cpu().numpy() for p in phrase_labels])

# Separate PCA fit for the phrase vectors (independent of the word plot).
pca2 = PCA(n_components=2)
P_2d = pca2.fit_transform(phrase_vecs)

# Color scheme: dark shade = German, light shade = English;
# blue = tech phrases, red = everyday phrases.
cat_colors = {
    "Technik (DE)": "#1f77b4",
    "Technik (EN)": "#aec7e8",
    "Alltag (DE)": "#d62728",
    "Alltag (EN)": "#ff9896",
}
|
||
|
||
fig2, ax2 = plt.subplots(figsize=(12, 9))

# Plot each phrase: marker shape encodes the language, color the category.
for i, (label, cat) in enumerate(zip(phrase_labels, phrase_cats)):
    x, y = P_2d[i]
    marker = "o" if "(DE)" in cat else "s"  # circle=German, square=English
    ax2.scatter(x, y, c=cat_colors[cat], s=140, marker=marker,
                edgecolors="black", linewidth=0.5, zorder=3)
    ax2.annotate(label, (x, y), fontsize=8, ha="center", va="bottom",
                 xytext=(0, 8), textcoords="offset points")

# Empty scatters create one legend entry per category/language combination.
for cat, color in cat_colors.items():
    marker = "o" if "(DE)" in cat else "s"
    ax2.scatter([], [], c=color, s=100, marker=marker, label=cat,
                edgecolors="black", linewidth=0.5)

ax2.legend(loc="best", fontsize=10, title="Kategorie & Sprache",
           title_fontsize=11, framealpha=0.9)

# Report retained variance for this projection as well.
var2 = pca2.explained_variance_ratio_
ax2.set_title(
    "Cross-lingual Embeddings: Deutsche & Englische Phrasen\n"
    f"PC1: {var2[0]:.1%} Varianz, PC2: {var2[1]:.1%} Varianz",
    fontsize=14, fontweight="bold"
)
ax2.set_xlabel("Hauptkomponente 1 (PC1)", fontsize=12)
ax2.set_ylabel("Hauptkomponente 2 (PC2)", fontsize=12)
ax2.grid(True, alpha=0.3)
fig2.tight_layout()
fig2.savefig("embedding_space_crosslingual.png", dpi=150)
# Fixed: was an f-string with no placeholders.
print("Saved: embedding_space_crosslingual.png")
|
||
|
||
print(f"""
|
||
{'=' * 70}
|
||
SUMMARY: THE FULL PIPELINE
|
||
{'=' * 70}
|
||
|
||
Text → Tokens → Token IDs → Embeddings
|
||
"Fachhochschule" [▁Fach, ho, [28356, 497, [0.012, -0.34,
|
||
ch, schule] 206, 72460] 0.88, ...]
|
||
(768 dimensions)
|
||
|
||
1. TOKENIZATION splits text into statistical sub-word pieces.
|
||
→ Splits are based on frequency, not German morphology.
|
||
→ Each token maps to an integer ID from the vocabulary.
|
||
|
||
2. EMBEDDING VECTORS are 768-dimensional representations of meaning.
|
||
→ Computed by the transformer's 12 layers of self-attention.
|
||
→ Similar meanings → nearby vectors (high cosine similarity).
|
||
→ Different meanings → distant vectors (low cosine similarity).
|
||
|
||
3. COSINE SIMILARITY measures how "aligned" two vectors are.
|
||
→ 1.0 = identical meaning, 0.0 = unrelated, -1.0 = opposite.
|
||
|
||
4. CROSS-LINGUAL EMBEDDINGS map equivalent phrases in different
|
||
languages to nearby vectors. "Maschinelles Lernen" ≈ "Machine
|
||
Learning" in embedding space.
|
||
|
||
5. The SAME PRINCIPLES apply to CODE EMBEDDINGS (next examples):
|
||
→ Code is tokenized into sub-word pieces
|
||
→ A transformer produces embedding vectors
|
||
→ Similar code has similar vectors
|
||
→ This enables semantic code search, clone detection, and RAG
|
||
|
||
Check the two PNG files for visual confirmation:
|
||
• embedding_space_german.png — German word clusters
|
||
• embedding_space_crosslingual.png — DE/EN phrase alignment
|
||
""")
|