Lecture 2: Words and Word Representations

Demo Notebook

This notebook accompanies Lecture 2 and demonstrates:

  1. Co-occurrence counts and word-context matrices
  2. Similarity measures (dot product, cosine similarity)
  3. Nearest neighbors (count-based, SVD-based, LSA-based)
  4. Word2Vec embeddings
  5. Analogy evaluation across all methods
In [1]:
import numpy as np
import pandas as pd
from collections import Counter, defaultdict
import matplotlib.pyplot as plt
from tqdm import tqdm

from datasets import load_dataset
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt_tab', quiet=True)

from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel

np.random.seed(42)

Loading Simple Wikipedia

We use the Simple English Wikipedia dataset (rahular/simple-wikipedia on the Hugging Face Hub), loading the first 100,000 documents.

In [2]:
# Load Simple Wikipedia dataset (first 100k documents)
print("Loading Simple Wikipedia dataset...")
simplewiki_dataset = load_dataset("rahular/simple-wikipedia", split="train[:100000]")
print(f"Dataset loaded: {len(simplewiki_dataset)} documents")
Loading Simple Wikipedia dataset...
Dataset loaded: 100000 documents
In [3]:
def read_corpus(data, max_docs=None):
    """Loads simple wikipedia and returns tokenized documents."""
    raw_texts = data["text"][:max_docs] if max_docs else data["text"]
    tokenized_docs = [[w.lower() for w in word_tokenize(doc)] for doc in tqdm(raw_texts)]
    return tokenized_docs

print("Tokenizing corpus...")
tokenized = read_corpus(simplewiki_dataset)
print(f"Tokenized {len(tokenized)} documents")
print(f"Sample: {tokenized[1][:15]}...")
Tokenizing corpus...
100%|███████████████| 100000/100000 [00:07<00:00, 14147.01it/s]
Tokenized 100000 documents
Sample: ['april', 'is', 'the', 'fourth', 'month', 'of', 'the', 'year', ',', 'and', 'comes', 'between', 'march', 'and', 'may']...

In [4]:
# Build vocabulary
all_words = [w for doc in tokenized for w in doc]
word_counts = Counter(all_words)

# Filter to words with count >= 5
min_count = 5
vocab = sorted([w for w, c in word_counts.items() if c >= min_count])
word2idx = {w: i for i, w in enumerate(vocab)}
idx2word = {i: w for w, i in word2idx.items()}

print(f"Total tokens: {len(all_words)}")
print(f"Vocabulary size (min_count={min_count}): {len(vocab)}")
Total tokens: 4737939
Vocabulary size (min_count=5): 30354
In [5]:
# Find an example sentence containing "queen"

for doc in tokenized:
    if "queen" in doc:
        pos = doc.index("queen")
        print(" ".join(doc[pos-10:pos+10]))
        break
years later , turing received a posthumous royal pardon from queen elizabeth ii . today , the “ turing law
In [29]:
# 500 most common words
print(word_counts.most_common(500))
[('the', 286229), ('.', 259446), (',', 214881), ('of', 131439), ('and', 109291), ('in', 108036), ('a', 97203), ('to', 81746), ('is', 80107), ('was', 43778), ("''", 43182), ('``', 42333), ('it', 37715), ('are', 37242), (')', 35057), ('(', 35049), ('that', 31463), ('for', 30545), ('as', 28342), ('on', 24687), ('he', 23955), ('by', 23830), ('with', 22203), ('or', 22177), ('they', 22118), ('from', 20954), ('this', 17926), ('be', 17287), ('not', 16825), ('people', 16650), ("'s", 16458), ('an', 15660), ('his', 15538), ('also', 15078), ('have', 14845), ('can', 14308), ('at', 13747), ('were', 13655), ('which', 13633), ('has', 13561), ('many', 12631), ('one', 11749), ('there', 11672), ('called', 11656), ('other', 11619), ('their', 11050), (':', 10891), ('but', 10860), ('some', 10806), ('when', 9862), ('used', 9535), ('most', 9382), ('had', 9344), ('first', 8588), ('about', 8571), ('who', 8445), ('made', 7902), ('more', 7762), ('after', 7696), ('all', 7208), ('its', 7131), ('because', 6958), ('city', 6801), ('two', 6777), ('very', 6672), ('these', 6624), ('time', 6571), ('like', 6562), ('into', 6064), ('new', 5909), ('world', 5895), ('them', 5794), ('she', 5786), ('different', 5710), ('than', 5694), ('been', 5580), ('may', 5556), ('only', 5467), ('her', 5385), ('such', 5205), ('years', 5122), ('united', 5018), ('became', 4962), ('known', 4880), ('part', 4810), ('states', 4809), ('so', 4807), ('make', 4726), ('often', 4666), ('if', 4658), ('up', 4650), ('do', 4477), ('use', 4458), (';', 4361), ('music', 4239), ('during', 4224), ('where', 4089), ('would', 4089), ('between', 4069), ('will', 4057), ('over', 4051), ('usually', 4030), ('then', 4029), ('war', 3973), ("'", 3949), ('out', 3800), ('name', 3750), ('him', 3631), ('person', 3502), ('same', 3428), ('state', 3425), ('country', 3421), ('year', 3419), ('each', 3401), ('example', 3380), ('did', 3319), ('later', 3238), ('no', 3210), ('countries', 3208), ('three', 3198), ('sometimes', 3062), ('water', 3052), ('important', 3024), ('things', 3013), ('before', 3011), ('century', 2981), ('south', 2965), ('way', 2954), ('however', 2950), ('number', 2949), ('around', 2946), ('small', 2931), ('north', 2916), ('could', 2912), ('means', 2878), ('found', 2862), ('english', 2853), ('work', 2831), ('government', 2790), ('much', 2770), ('american', 2763), ('another', 2741), ('what', 2728), ('area', 2726), ('king', 2696), ('live', 2687), ('born', 2683), ('long', 2677), ('through', 2668), ('large', 2648), ('started', 2631), ('well', 2630), ('language', 2624), ('both', 2619), ('being', 2617), ('system', 2616), ('word', 2613), ('get', 2607), ('famous', 2590), ('now', 2588), ('%', 2556), ('since', 2530), ('until', 2514), ('how', 2509), ('said', 2503), ('even', 2468), ('group', 2459), ('died', 2454), ('common', 2449), ('second', 2415), ('any', 2408), ('while', 2368), ('still', 2325), ('family', 2258), ('main', 2251), ('several', 2249), ('life', 2248), ('president', 2200), ('body', 2190), ('popular', 2175), ('named', 2170), ('place', 2146), ('old', 2128), ('england', 2103), ('day', 2089), ('capital', 2080), ('good', 2074), ('against', 2064), ('back', 2054), ('does', 2053), ('together', 2044), ('parts', 2043), ('including', 2036), ('include', 2036), ('river', 2014), ('great', 2012), ('british', 1940), ('-', 1915), ('movie', 1908), ('band', 1907), ('form', 1903), ('early', 1898), ('power', 1895), ('air', 1885), ('wrote', 1885), ('four', 1883), ('using', 1883), ('largest', 1873), ('become', 1852), ('million', 1841), ('high', 1836), ('national', 1832), ('end', 1827), ('east', 
1825), ('game', 1820), ('go', 1815), ('written', 1814), ('i', 1810), ('island', 1807), ('own', 1806), ('’', 1801), ('show', 1793), ('children', 1792), ('species', 1789), ('1', 1787), ('under', 1781), ('you', 1774), ('began', 1773), ('sea', 1767), ('came', 1744), ('times', 1743), ('we', 1712), ('food', 1710), ('just', 1698), ('europe', 1694), ('french', 1694), ('school', 1693), ('best', 1692), ('see', 1691), ('france', 1686), ('germany', 1684), ('series', 1683), ('s', 1683), ('land', 1682), ('today', 1681), ('released', 1678), ('every', 1674), ('west', 1672), ('death', 1669), ('makes', 1664), ('german', 1657), ('thought', 1634), ('took', 1630), ('light', 1619), ('album', 1615), ('left', 1613), ('earth', 1610), ('population', 1608), ('empire', 1603), ('help', 1601), ('down', 1593), ('few', 1592), ('animals', 1590), ('won', 1590), ('those', 1585), ('lot', 1576), ('money', 1566), ('home', 1566), ('last', 1563), ('big', 1560), ('god', 1537), ('played', 1532), ('2', 1530), ('play', 1529), ('town', 1528), ('built', 1491), ('church', 1483), ('words', 1480), ('america', 1474), ('making', 1473), ('living', 1468), ('john', 1468), ('take', 1465), ('father', 1459), ('region', 1448), ('university', 1443), ('type', 1442), ('created', 1437), ('book', 1434), ('modern', 1434), ('went', 1433), ('less', 1419), ('union', 1414), ('kingdom', 1412), ('without', 1409), ('human', 1407), ('house', 1406), ('games', 1406), ('history', 1404), ('change', 1403), ('white', 1402), ('ii', 1396), ('near', 1393), ('army', 1393), ('others', 1382), ('party', 1378), ('roman', 1372), ('black', 1368), ('types', 1366), ('based', 1362), ('again', 1359), ('man', 1354), ('given', 1343), ('put', 1341), ('energy', 1339), ('“', 1331), ('”', 1327), ('major', 1324), ('groups', 1323), ('age', 1319), ('cities', 1312), ('set', 1308), ('special', 1305), ('places', 1298), ('comes', 1294), ('next', 1290), ('languages', 1285), ('although', 1281), ('come', 1280), ('india', 1280), ('similar', 1278), ('blood', 1268), ('ancient', 1263), ('something', 1259), ('men', 1259), ('lived', 1258), ('should', 1249), ('television', 1245), ('away', 1245), ('china', 1244), ('western', 1240), ('say', 1240), ('team', 1238), ('members', 1234), ('–', 1232), ('women', 1229), ('period', 1229), ('too', 1224), ('january', 1223), ('march', 1218), ('though', 1213), ('company', 1213), ('wanted', 1206), ('rock', 1200), ('right', 1199), ('almost', 1198), ('law', 1195), ('control', 1190), ('son', 1188), ('islands', 1188), ('killed', 1187), ('republic', 1187), ('computer', 1184), ('middle', 1181), ('red', 1181), ('3', 1180), ('changed', 1179), ('must', 1179), ('line', 1177), ('general', 1175), ('international', 1169), ('five', 1169), ('married', 1169), ('plants', 1165), ('instead', 1158), ('greek', 1153), ('mean', 1152), ('term', 1152), ('mostly', 1144), ('december', 1142), ('areas', 1141), ('july', 1138), ('london', 1138), ('little', 1138), ('york', 1136), ('mother', 1122), ('de', 1121), ('days', 1107), ('northern', 1105), ('led', 1104), ('off', 1103), ('september', 1102), ('seen', 1101), ('got', 1100), ('june', 1095), ('november', 1095), ('point', 1093), ('order', 1093), ('meaning', 1092), ('august', 1088), ('central', 1087), ('always', 1085), ('april', 1084), ('october', 1083), ('young', 1082), ('third', 1080), ('side', 1079), ('public', 1078), ('10', 1074), ('european', 1071), ('uses', 1071), ('song', 1068), ('along', 1067), ('study', 1067), ('think', 1059), ('movies', 1057), ('short', 1057), ('done', 1050), ('give', 1048), ('force', 1048), ('numbers', 1044), ('5', 1040), 
('story', 1040), ('once', 1037), ('inside', 1037), ('moved', 1034), ('books', 1024), ('soviet', 1024), ('songs', 1022), ('kind', 1021), ('political', 1020), ('works', 1018), ('hard', 1015), ('us', 1010), ('gave', 1008), ('player', 1008), ('county', 1007), ('metal', 1006), ('japan', 1002), ('cause', 997), ('following', 995), ('need', 995), ('australia', 994), ('certain', 991), ('africa', 990), ('better', 985), ('especially', 981), ('february', 977), ('might', 976), ('find', 973), ('4', 970), ('natural', 970), ('move', 967), ('top', 965), ('problems', 958), ('free', 956), ('head', 955), ('someone', 954), ('smaller', 954), ('latin', 948), ('able', 948), ('single', 943), ('southern', 942), ('military', 942), ('held', 939), ('eat', 934), ('league', 930), ('space', 927), ('having', 925), ('never', 923), ('chemical', 922), ('know', 921), ('enough', 917), ('according', 914), ('list', 911), ('ways', 910), ('want', 906), ('kinds', 903), ('strong', 900), ('humans', 899), ('sound', 897), ('football', 897), ('either', 896), ('writing', 893), ('lost', 889), ('changes', 886), ('caused', 886), ('member', 884), ('cells', 881), ('idea', 880), ('keep', 879), ('chinese', 876), ('considered', 874), ('late', 870), ('spanish', 870), ('sun', 868), ('henry', 868), ('heart', 863), ('among', 862), ('20', 859)]

Part 1: Co-occurrence Counts

Build a word-context matrix where each cell counts how often words appear together within a window.

In [6]:
def build_cooccurrence_matrix(tokenized_corpus, word2idx, window_size=2):
    """Build a word-context co-occurrence matrix."""
    V = len(word2idx)
    cooccur = np.zeros((V, V), dtype=np.float32)
    
    for doc in tqdm(tokenized_corpus):
        for i, word in enumerate(doc):
            if word not in word2idx:
                continue
            word_idx = word2idx[word]
            start = max(0, i - window_size)
            end = min(len(doc), i + window_size + 1)
            
            for j in range(start, end):
                if i == j:
                    continue
                context_word = doc[j]
                if context_word in word2idx:
                    context_idx = word2idx[context_word]
                    cooccur[word_idx, context_idx] += 1
    
    return cooccur

print("Building co-occurrence matrix...")
cooccur_matrix = build_cooccurrence_matrix(tokenized, word2idx, window_size=2)
print(f"Co-occurrence matrix shape: {cooccur_matrix.shape}")
Building co-occurrence matrix...
100%|███████████████| 100000/100000 [00:04<00:00, 23884.31it/s]
Co-occurrence matrix shape: (30354, 30354)
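
As a quick sanity check, the same function can be run on a single toy sentence (the sentence and tiny vocabulary below are made up purely for illustration):

# Illustrative only: window_size=2 co-occurrence counts for one toy sentence.
toy_doc = ["the", "cat", "sat", "on", "the", "mat"]
toy_vocab = {w: i for i, w in enumerate(sorted(set(toy_doc)))}  # cat, mat, on, sat, the
toy_counts = build_cooccurrence_matrix([toy_doc], toy_vocab, window_size=2)

# Row for "sat": it co-occurs once with "cat" and "on", and twice with "the"
# (positions 0 and 4 both fall inside the window of 2 around position 2).
print(toy_counts[toy_vocab["sat"]])  # -> [1. 0. 1. 0. 2.]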

In [7]:
# Compute pairwise dot products for co-occurrence vectors
print("Computing dot product similarity...")
cooccur_dot_product = linear_kernel(cooccur_matrix)
print(f"Similarity matrix shape: {cooccur_dot_product.shape}")
Computing dot product similarity...
Similarity matrix shape: (30354, 30354)
In [8]:
# Compute pairwise cosine similarity for co-occurrence vectors
print("Computing pairwise cosine similarity...")
cooccur_cosine_sim = cosine_similarity(cooccur_matrix)
print(f"Similarity matrix shape: {cooccur_cosine_sim.shape}")
Computing pairwise cosine similarity...
Similarity matrix shape: (30354, 30354)
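
As a spot check, both similarity measures can be computed directly from their definitions for one word pair and compared against the precomputed matrices (king and queen are both in the vocabulary):

# Spot check: dot product and cosine similarity between two co-occurrence rows,
# computed directly; they should match the precomputed matrices above.
v_king = cooccur_matrix[word2idx["king"]]
v_queen = cooccur_matrix[word2idx["queen"]]

dot = float(v_king @ v_queen)
cos = dot / (np.linalg.norm(v_king) * np.linalg.norm(v_queen) + 1e-10)

print(f"dot(king, queen)    = {dot:.1f}")
print(f"cosine(king, queen) = {cos:.3f}")
print(f"matrix entries      = {cooccur_dot_product[word2idx['king'], word2idx['queen']]:.1f}, "
      f"{cooccur_cosine_sim[word2idx['king'], word2idx['queen']]:.3f}")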

Part 2: SVD on Co-occurrence Matrix

Apply Truncated SVD to reduce dimensionality:

$$\mathbf{M} \approx \mathbf{U}_k \mathbf{\Sigma}_k \mathbf{V}_k^\top$$

In [9]:
n_components = 100
print(f"Applying Truncated SVD with {n_components} components...")

svd_cooccur = TruncatedSVD(n_components=n_components, n_iter=10, random_state=42)
svd_cooccur_vectors = svd_cooccur.fit_transform(cooccur_matrix)

print(f"SVD vectors shape: {svd_cooccur_vectors.shape}")
print(f"Explained variance ratio: {svd_cooccur.explained_variance_ratio_.sum():.2%}")

# Compute cosine similarity
svd_cooccur_cosine_sim = cosine_similarity(svd_cooccur_vectors)
Applying Truncated SVD with 100 components...
SVD vectors shape: (30354, 100)
Explained variance ratio: 99.93%
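
As a side note, TruncatedSVD's fit_transform returns $\mathbf{U}_k \mathbf{\Sigma}_k$, which equals the input projected onto the learned components, $\mathbf{M}\mathbf{V}_k$. A purely illustrative check (the exact figure depends on numerical precision):

# Illustrative check: fit_transform(M) == M @ V_k (components_ stores V_k^T),
# up to numerical precision.
projected = cooccur_matrix @ svd_cooccur.components_.T
rel_err = np.linalg.norm(projected - svd_cooccur_vectors) / np.linalg.norm(svd_cooccur_vectors)
print(f"Relative difference between M @ V_k and the SVD vectors: {rel_err:.2e}")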

Part 3: LSA (Latent Semantic Analysis)

Apply SVD to a TF-IDF weighted document-word matrix:

  • Rows: documents
  • Columns: words
  • Entries: TF-IDF weights (term frequency × inverse document frequency)

TF-IDF downweights common words and emphasizes distinctive terms, giving better semantic representations than raw counts.
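
With scikit-learn's default TfidfTransformer settings (smooth_idf=True, followed by L2 normalization of each document row), the weight of term $t$ in document $d$ is, before that normalization:

$$\text{tfidf}(t, d) = \text{tf}(t, d) \cdot \left( \ln \frac{1 + N}{1 + \text{df}(t)} + 1 \right)$$

where $N$ is the number of documents and $\text{df}(t)$ is the number of documents containing $t$.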

In [10]:
from sklearn.feature_extraction.text import TfidfTransformer

# Build document-word matrix with raw counts, then apply TF-IDF
def build_doc_word_matrix(tokenized_corpus, word2idx):
    """Build a document-word count matrix."""
    n_docs = len(tokenized_corpus)
    V = len(word2idx)
    doc_word = np.zeros((n_docs, V), dtype=np.float32)
    
    for doc_idx, doc in enumerate(tqdm(tokenized_corpus)):
        for word in doc:
            if word in word2idx:
                word_idx = word2idx[word]
                doc_word[doc_idx, word_idx] += 1
    
    return doc_word

print("Building document-word matrix...")
doc_word_counts = build_doc_word_matrix(tokenized, word2idx)

# Apply TF-IDF weighting (returns sparse matrix for efficiency)
print("Applying TF-IDF weighting...")
tfidf_transformer = TfidfTransformer()
doc_word_matrix = tfidf_transformer.fit_transform(doc_word_counts)

print(f"Document-word matrix shape: {doc_word_matrix.shape}")
print(f"  (documents x words, TF-IDF weighted, sparse)")
Building document-word matrix...
100%|███████████████| 100000/100000 [00:02<00:00, 35740.72it/s]
Applying TF-IDF weighting...
Document-word matrix shape: (100000, 30354)
  (documents x words, TF-IDF weighted, sparse)
In [11]:
doc_word_matrix.shape
Out[11]:
(100000, 30354)
In [12]:
# Apply SVD to TF-IDF matrix to get LSA word vectors
# We need word vectors, so we use V^T from the SVD decomposition
print(f"Applying SVD with {n_components} components for LSA (on TF-IDF matrix)...")

svd_lsa = TruncatedSVD(n_components=n_components, n_iter=10, random_state=42)
doc_vectors = svd_lsa.fit_transform(doc_word_matrix)  # U * Sigma

# Word vectors are the columns of V^T (or rows of V)
# svd.components_ gives V^T (n_components x n_features)
# So we transpose to get word vectors (n_features x n_components)
lsa_word_vectors = svd_lsa.components_.T  # (vocab_size x n_components)

print(f"LSA word vectors shape: {lsa_word_vectors.shape}")
print(f"Explained variance ratio: {svd_lsa.explained_variance_ratio_.sum():.2%}")

# Compute cosine similarity
lsa_cosine_sim = cosine_similarity(lsa_word_vectors)
Applying SVD with 100 components for LSA (on TF-IDF matrix)...
LSA word vectors shape: (30354, 100)
Explained variance ratio: 14.31%
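
The same decomposition also yields document vectors (doc_vectors above), which can be used for document-level similarity. A small illustrative query; the retrieved document depends on the corpus slice used here:

# Find the document closest (in LSA space) to the "april" article (index 1).
query = doc_vectors[1:2]                      # keep 2-D shape for sklearn
doc_sims = cosine_similarity(query, doc_vectors)[0]
doc_sims[1] = -np.inf                         # exclude the query document itself
closest = int(np.argmax(doc_sims))
print(f"Most similar document (cosine {doc_sims[closest]:.3f}):")
print(" ".join(tokenized[closest][:20]), "...")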

Compare Nearest Neighbors Across Methods

In [34]:
def find_nearest_neighbors(word, word2idx, idx2word, sim_matrix, k=5):
    """Find k nearest neighbors using precomputed similarity matrix."""
    if word not in word2idx:
        return []
    
    word_idx = word2idx[word]
    similarities = sim_matrix[word_idx]
    top_indices = np.argsort(similarities)[::-1][:k+1]
    
    results = []
    for idx in top_indices:
        if idx != word_idx:
            results.append((idx2word[idx], similarities[idx]))
        if len(results) >= k:
            break
    return results

focus_words = ['king', 'queen', 'man', 'woman', 'university', 'chicago', 'biology', 'guitar', 'company']
focus_words = [w for w in focus_words if w in word2idx]
In [35]:
# Compare all three count-based methods
print("Nearest Neighbors Comparison (top 3)")
print("="*90)

for word in focus_words:
    dot_product_nn = find_nearest_neighbors(word, word2idx, idx2word, cooccur_dot_product, k=3)
    cooccur_nn = find_nearest_neighbors(word, word2idx, idx2word, cooccur_cosine_sim, k=3)
    svd_nn = find_nearest_neighbors(word, word2idx, idx2word, svd_cooccur_cosine_sim, k=3)
    lsa_nn = find_nearest_neighbors(word, word2idx, idx2word, lsa_cosine_sim, k=3)
    
    print(f"\n{word}:")
    print(f"  Co-occurrence dot product: {', '.join([w for w, _ in dot_product_nn])}")
    print(f"  Co-occurrence consine similarity: {', '.join([w for w, _ in cooccur_nn])}")
    print(f"  SVD(cooccur):  {', '.join([w for w, _ in svd_nn])}")
    print(f"  LSA(doc-word): {', '.join([w for w, _ in lsa_nn])}")
Nearest Neighbors Comparison (top 3)
==========================================================================================

king:
  Co-occurrence dot product: the, of, ,
  Co-occurrence cosine similarity: governor, queen, ruler
  SVD(cooccur):  queen, chief, chancellor
  LSA(doc-word): vi, henry, edward

queen:
  Co-occurrence dot product: the, ,, of
  Co-occurrence cosine similarity: king, lord, chief
  SVD(cooccur):  king, chief, director
  LSA(doc-word): elizabeth, consort, vi

man:
  Co-occurrence dot product: ,, the, of
  Co-occurrence cosine similarity: woman, girl, student
  SVD(cooccur):  woman, girl, student
  LSA(doc-word): servant, woman, haley

woman:
  Co-occurrence dot product: of, ,, .
  Co-occurrence cosine similarity: man, person, girl
  SVD(cooccur):  man, worker, girl
  LSA(doc-word): pregnant, girl, nurse

university:
  Co-occurrence dot product: the, of, .
  Co-occurrence cosine similarity: height, age, length
  SVD(cooccur):  height, composition, rate
  LSA(doc-word): wesleyan, ateneo, universities

chicago:
  Co-occurrence dot product: the, ,, .
  Co-occurrence cosine similarity: baltimore, general, water
  SVD(cooccur):  baltimore, medieval, pittsburgh
  LSA(doc-word): miami, ua, austin

biology:
  Co-occurrence dot product: the, ,, .
  Co-occurrence cosine similarity: mathematics, japan, poland
  SVD(cooccur):  geometry, mathematics, physics
  LSA(doc-word): ecology, environment, genetics

guitar:
  Co-occurrence dot product: ,, the, of
  Co-occurrence cosine similarity: polish, hill, piano
  SVD(cooccur):  bass, red, polish
  LSA(doc-word): drums, acoustic, vocals

company:
  Co-occurrence dot product: of, ,, .
  Co-occurrence cosine similarity: situation, project, year
  SVD(cooccur):  pilot, bus, system
  LSA(doc-word): inc., corporation, companies

Part 4: Word2Vec

Prediction-based embeddings: learn word vectors by predicting context words from the center word (skip-gram).

$$P(o|c) = \frac{\exp(u_o^\top v_c)}{\sum_{w \in V} \exp(u_w^\top v_c)}$$
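
Gensim does not evaluate this full softmax during training (it defaults to negative sampling for efficiency), but the probability itself is easy to illustrate with toy vectors:

# Toy illustration of the skip-gram softmax P(o | c) above; the vectors here
# are random and only serve to show the computation.
V_toy, d = 6, 4
U = np.random.randn(V_toy, d)          # "outside" vectors u_w, one per vocab word
v_c = np.random.randn(d)               # center-word vector v_c

scores = U @ v_c                       # u_w^T v_c for every w
probs = np.exp(scores - scores.max())  # subtract max for numerical stability
probs /= probs.sum()

print(probs)        # a probability distribution over the toy vocabulary
print(probs.sum())  # -> 1.0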

In [15]:
from gensim.models import Word2Vec

print("Training Word2Vec...")
model = Word2Vec(
    sentences=tokenized,
    vector_size=100,
    window=5,
    min_count=5,
    sg=1,  # Skip-gram
    workers=4,
    seed=42
)
word_vectors = model.wv
print(f"Trained word vectors for {len(word_vectors)} words")
Training Word2Vec...
Trained word vectors for 30354 words
In [18]:
# Word2Vec nearest neighbors
print("Word2Vec Nearest Neighbors (top 5):")
for word in focus_words:
    if word in word_vectors:
        similar = word_vectors.most_similar(word, topn=5)
        print(f"  {word}: {', '.join([w for w, _ in similar])}")
Word2Vec Nearest Neighbors (top 5):
  king: edward, vi, viii, throne, harald
  queen: elizabeth, regnant, empress, consort, margrethe
  man: woman, girl, men, teenager, werther
  woman: man, girl, baby, pregnant, female
  city: town, capital, rotterdam, cities, seaport
  country: nation, mali, lesotho, malawi, azerbaijan
  good: bad, keen, excellent, worthy, happy
  bad: terrible, uncomfortable, harsh, good, trouble

Part 5: Analogy Evaluation

Evaluate all embedding methods on the Google analogy dataset.

For an analogy a : b :: c : ?, compute $\vec{b} - \vec{a} + \vec{c}$ and take its nearest neighbor.
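
Gensim's most_similar applies essentially this arithmetic on unit-normalized vectors, so the classic example can be checked directly on the Word2Vec model trained above (the exact neighbors depend on this training run):

# king - man + woman ~= ?  (top 3 candidates, input words excluded automatically)
print(word_vectors.most_similar(positive=["king", "woman"], negative=["man"], topn=3))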

In [19]:
import urllib.request
import os

# Download Google analogy dataset
analogy_url = "https://raw.githubusercontent.com/nicholas-leonard/word2vec/master/questions-words.txt"
analogy_file = "questions-words.txt"

if not os.path.exists(analogy_file):
    print("Downloading Google analogy dataset...")
    urllib.request.urlretrieve(analogy_url, analogy_file)

def load_analogy_dataset(filepath):
    """Load Google analogy dataset."""
    analogies = {}
    current_category = None
    
    with open(filepath, 'r') as f:
        for line in f:
            line = line.strip().lower()
            if line.startswith(':'):
                current_category = line[2:]
                analogies[current_category] = []
            elif line and current_category:
                parts = line.split()
                if len(parts) == 4:
                    analogies[current_category].append(parts)
    return analogies

analogy_dataset = load_analogy_dataset(analogy_file)
print(f"Loaded {len(analogy_dataset)} analogy categories")

# Show total questions
total = sum(len(q) for q in analogy_dataset.values())
print(f"Total questions: {total}")
Loaded 14 analogy categories
Total questions: 19544
In [20]:
def evaluate_analogies_matrix(vectors, word2idx, idx2word, analogy_dataset):
    """
    Evaluate analogies using a word vector matrix.
    For a:b::c:d, compute b - a + c and find nearest neighbor.
    """
    # Normalize vectors for cosine similarity
    norms = np.linalg.norm(vectors, axis=1, keepdims=True)
    norms[norms == 0] = 1
    vectors_norm = vectors / norms
    
    results = {}
    
    for category, questions in analogy_dataset.items():
        correct = 0
        total = 0
        skipped = 0
        
        for a, b, c, expected in questions:
            # Check if all words are in vocabulary
            if not all(w in word2idx for w in [a, b, c, expected]):
                skipped += 1
                continue
            
            total += 1
            
            # Get indices
            a_idx, b_idx, c_idx = word2idx[a], word2idx[b], word2idx[c]
            expected_idx = word2idx[expected]
            
            # Compute b - a + c
            query = vectors_norm[b_idx] - vectors_norm[a_idx] + vectors_norm[c_idx]
            query_norm = query / (np.linalg.norm(query) + 1e-10)
            
            # Find most similar (excluding a, b, c)
            similarities = vectors_norm @ query_norm
            similarities[[a_idx, b_idx, c_idx]] = -np.inf  # exclude input words
            
            predicted_idx = np.argmax(similarities)
            
            if predicted_idx == expected_idx:
                correct += 1
        
        accuracy = correct / total if total > 0 else 0
        results[category] = {
            'correct': correct,
            'total': total,
            'skipped': skipped,
            'accuracy': accuracy
        }
    
    return results

def evaluate_analogies_gensim(wv, analogy_dataset):
    """Evaluate analogies using gensim KeyedVectors."""
    results = {}
    
    for category, questions in analogy_dataset.items():
        correct = 0
        total = 0
        skipped = 0
        
        for a, b, c, expected in questions:
            if not all(w in wv for w in [a, b, c, expected]):
                skipped += 1
                continue
            
            total += 1
            
            try:
                predicted = wv.most_similar(positive=[b, c], negative=[a], topn=1)[0][0]
                if predicted == expected:
                    correct += 1
            except KeyError:
                pass  # should not happen: vocabulary membership was checked above
        
        accuracy = correct / total if total > 0 else 0
        results[category] = {
            'correct': correct,
            'total': total,
            'skipped': skipped,
            'accuracy': accuracy
        }
    
    return results

def summarize_results(results, name):
    """Summarize analogy results."""
    total_correct = sum(r['correct'] for r in results.values())
    total_questions = sum(r['total'] for r in results.values())
    overall_acc = total_correct / total_questions if total_questions > 0 else 0
    return {
        'name': name,
        'overall_accuracy': overall_acc,
        'total_correct': total_correct,
        'total_questions': total_questions,
        'by_category': results
    }
In [21]:
# Evaluate all methods
print("Evaluating all embedding methods on analogy task...")
print("(This may take a few minutes)\n")

all_results = {}

# 1. Raw co-occurrence
print("1. Evaluating Co-occurrence vectors...")
cooccur_results = evaluate_analogies_matrix(cooccur_matrix, word2idx, idx2word, analogy_dataset)
all_results['Co-occurrence'] = summarize_results(cooccur_results, 'Co-occurrence')

# 2. SVD on co-occurrence
print("2. Evaluating SVD(co-occurrence) vectors...")
svd_results = evaluate_analogies_matrix(svd_cooccur_vectors, word2idx, idx2word, analogy_dataset)
all_results['SVD(cooccur)'] = summarize_results(svd_results, 'SVD(cooccur)')

# 3. LSA (SVD on TF-IDF document-word matrix)
print("3. Evaluating LSA vectors (TF-IDF)...")
lsa_results = evaluate_analogies_matrix(lsa_word_vectors, word2idx, idx2word, analogy_dataset)
all_results['LSA'] = summarize_results(lsa_results, 'LSA')

# 4. Word2Vec
print("4. Evaluating Word2Vec vectors...")
w2v_results = evaluate_analogies_gensim(word_vectors, analogy_dataset)
all_results['Word2Vec'] = summarize_results(w2v_results, 'Word2Vec')

print("\nDone!")
Evaluating all embedding methods on analogy task...
(This may take a few minutes)

1. Evaluating Co-occurrence vectors...
2. Evaluating SVD(co-occurrence) vectors...
3. Evaluating LSA vectors (TF-IDF)...
4. Evaluating Word2Vec vectors...

Done!
In [22]:
# Display overall results
print("\n" + "="*70)
print("OVERALL ANALOGY EVALUATION RESULTS")
print("="*70)
print(f"{'Method':<20} {'Accuracy':>12} {'Correct':>12} {'Total':>12}")
print("-"*70)

for method, result in all_results.items():
    print(f"{method:<20} {result['overall_accuracy']:>12.1%} {result['total_correct']:>12} {result['total_questions']:>12}")
======================================================================
OVERALL ANALOGY EVALUATION RESULTS
======================================================================
Method                   Accuracy      Correct        Total
----------------------------------------------------------------------
Co-occurrence                4.1%          563        13643
SVD(cooccur)                 2.6%          353        13643
LSA                         12.1%         1649        13643
Word2Vec                    19.0%         2597        13643
In [23]:
# Detailed comparison by category
categories = list(analogy_dataset.keys())
methods = list(all_results.keys())

print("\n" + "="*100)
print("ACCURACY BY CATEGORY")
print("="*100)
print(f"{'Category':<30} " + " ".join([f"{m:>15}" for m in methods]))
print("-"*100)

for cat in categories:
    accs = []
    for method in methods:
        acc = all_results[method]['by_category'][cat]['accuracy']
        accs.append(f"{acc:>14.1%}")
    print(f"{cat:<30} " + " ".join(accs))
====================================================================================================
ACCURACY BY CATEGORY
====================================================================================================
Category                         Co-occurrence    SVD(cooccur)             LSA        Word2Vec
----------------------------------------------------------------------------------------------------
capital-common-countries                 3.2%           0.2%          16.8%          15.8%
capital-world                            0.9%           0.1%           9.2%           4.8%
currency                                 0.0%           0.0%           1.3%           1.3%
city-in-state                            2.0%           0.3%          15.4%           8.2%
family                                  27.2%          20.8%          14.3%          46.8%
gram1-adjective-to-adverb                1.3%           0.5%           0.2%           3.8%
gram2-opposite                           1.0%           1.0%           2.3%           3.6%
gram3-comparative                        9.6%           9.5%          20.0%          45.3%
gram4-superlative                        3.7%           1.8%           3.3%          26.0%
gram5-present-participle                 3.4%           1.1%          12.6%          22.9%
gram6-nationality-adjective              2.1%           1.7%          19.5%          29.7%
gram7-past-tense                         4.4%           2.4%           6.7%          16.1%
gram8-plural                             5.3%           2.1%          15.8%          23.6%
gram9-plural-verbs                       7.5%           5.4%           7.3%          23.5%
In [24]:
# Visualize comparison
fig, ax = plt.subplots(figsize=(10, 6))

methods = list(all_results.keys())
accuracies = [all_results[m]['overall_accuracy'] * 100 for m in methods]
colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728']

bars = ax.bar(methods, accuracies, color=colors)
ax.set_ylabel('Accuracy (%)')
ax.set_title('Analogy Task: Overall Accuracy by Method')
ax.set_ylim(0, max(accuracies) * 1.2)

# Add value labels
for bar, acc in zip(bars, accuracies):
    ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.5, 
            f'{acc:.1f}%', ha='center', va='bottom', fontsize=11)

plt.tight_layout()
plt.show()
[Figure: bar chart — Analogy Task: Overall Accuracy by Method]
In [25]:
# Plot by category (semantic vs syntactic)
semantic_cats = ['capital-common-countries', 'capital-world', 'currency', 'city-in-state', 'family']
syntactic_cats = [c for c in categories if c not in semantic_cats]

def get_grouped_accuracy(results, cat_list):
    correct = sum(results['by_category'][c]['correct'] for c in cat_list if c in results['by_category'])
    total = sum(results['by_category'][c]['total'] for c in cat_list if c in results['by_category'])
    return correct / total if total > 0 else 0

fig, ax = plt.subplots(figsize=(10, 6))

x = np.arange(len(methods))
width = 0.35

semantic_accs = [get_grouped_accuracy(all_results[m], semantic_cats) * 100 for m in methods]
syntactic_accs = [get_grouped_accuracy(all_results[m], syntactic_cats) * 100 for m in methods]

bars1 = ax.bar(x - width/2, semantic_accs, width, label='Semantic', color='steelblue')
bars2 = ax.bar(x + width/2, syntactic_accs, width, label='Syntactic', color='coral')

ax.set_ylabel('Accuracy (%)')
ax.set_title('Analogy Task: Semantic vs Syntactic')
ax.set_xticks(x)
ax.set_xticklabels(methods)
ax.legend()

# Add value labels
for bars in [bars1, bars2]:
    for bar in bars:
        height = bar.get_height()
        ax.text(bar.get_x() + bar.get_width()/2, height + 0.3,
                f'{height:.1f}%', ha='center', va='bottom', fontsize=9)

plt.tight_layout()
plt.show()
[Figure: grouped bar chart — Analogy Task: Semantic vs Syntactic accuracy by method]

Summary

Count-based methods:

  • Co-occurrence: raw word-context counts (high-dimensional, sparse)
  • SVD(cooccur): SVD on word-context matrix (dense, lower-dimensional)
  • LSA: SVD on TF-IDF weighted document-word matrix (captures document-level patterns)

Prediction-based methods:

  • Word2Vec: predict context words, learn vectors via SGD

Key insight: SVD and Word2Vec are closely related! (Levy & Goldberg, 2014)
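
Levy and Goldberg showed that skip-gram with negative sampling implicitly factorizes a shifted PMI word-context matrix. A rough sketch of the count-based analogue, positive PMI (PPMI) weighting followed by truncated SVD on the co-occurrence matrix from Part 1, is shown below; it materializes several dense vocab-by-vocab arrays, so it is memory-hungry at this vocabulary size:

# Rough sketch (not part of the lecture code): PPMI reweighting of the
# co-occurrence counts, then truncated SVD, the count-based construction
# that Levy & Goldberg (2014) relate to skip-gram with negative sampling.
row_totals = cooccur_matrix.sum(axis=1, keepdims=True)   # #(w)
col_totals = cooccur_matrix.sum(axis=0, keepdims=True)   # #(c)
grand_total = cooccur_matrix.sum()                       # total co-occurrence mass

# PMI(w, c) = log( #(w,c) * total / (#(w) * #(c)) ); zero counts give log(0)
with np.errstate(divide="ignore", invalid="ignore"):
    pmi = np.log(cooccur_matrix * grand_total / (row_totals * col_totals))
pmi[~np.isfinite(pmi)] = 0.0       # treat undefined cells as 0
ppmi = np.maximum(pmi, 0.0)        # keep only positive PMI values

svd_ppmi = TruncatedSVD(n_components=n_components, n_iter=10, random_state=42)
ppmi_vectors = svd_ppmi.fit_transform(ppmi)
print(ppmi_vectors.shape)          # (vocab_size, n_components)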