Tokenization in NLP¶

Breaking text into meaningful units

What is Tokenization?¶

Tokenization is the process of breaking text into smaller units called tokens.

We will take a quick look at the following approaches. The goal is to get you to appreciate the complexity of language.

  • Word tokenization
  • Character tokenization
  • Byte-based tokenization
  • BPE (Byte Pair Encoding) tokenization
  • Sentence tokenization

Sample Text¶

Let's define a sample text to work with:

In [1]:
text = "Communication & Intelligence is awesome!"

print("Text:", text)
Text: Communication & Intelligence is awesome!

Word Tokenization¶

Breaking text into words or word-like units

But what are words?

Rule-based: Simple .split()¶

The simplest approach: split on whitespace

In [2]:
# Simple whitespace split
simple_tokens = text.split()

print(f"Text: {text}\n")
print(f"Simple .split() tokens ({len(simple_tokens)}):")
print(simple_tokens)
Text: Communication & Intelligence is awesome!

Simple .split() tokens (5):
['Communication', '&', 'Intelligence', 'is', 'awesome!']

Problem: Punctuation stays attached to words!

  • "awesome!" should probably be ["awesome", "!"]

Rule-based: Regular Expressions¶

We can use a regular expression to extract runs of word characters:

In [3]:
import re

# Extract runs of word characters; punctuation and symbols are dropped
regex_tokens = re.findall(r'\w+', text)

print(f"Text: {text}\n")
print(f"Regex r'\\w+' tokens ({len(regex_tokens)}):")
print(regex_tokens)
Text: Communication & Intelligence is awesome!

Regex r'\w+' tokens (4):
['Communication', 'Intelligence', 'is', 'awesome']
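
Note that r'\w+' also drops the '&' and the '!' entirely. Below is a sketch of one possible richer pattern (just an illustrative rule, not a standard tokenizer) that keeps punctuation and symbols as separate tokens:

# Match runs of word characters OR single non-space, non-word characters
richer_tokens = re.findall(r"\w+|[^\w\s]", text)
print(richer_tokens)
# ['Communication', '&', 'Intelligence', 'is', 'awesome', '!']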

NLTK Word Tokenizer¶

NLTK provides a more sophisticated tokenizer that handles punctuation:

In [4]:
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)

nltk_tokens = word_tokenize(text)

print(f"Text: {text}\n")
print(f"NLTK word_tokenize() tokens ({len(nltk_tokens)}):")
print(nltk_tokens)
Text: Communication & Intelligence is awesome!

NLTK word_tokenize() tokens (6):
['Communication', '&', 'Intelligence', 'is', 'awesome', '!']

Notice:

  • Punctuation is now separated: "awesome!" becomes ['awesome', '!']
  • "&" is kept as its own token

Other remaining issues¶

  • Lowercase/uppercase? (Communication vs communication)
  • Contractions? (don't → do n't? don t? dont?)
  • Hyphenated words? (state-of-the-art → 1 token or several?)
  • Possessives? (John's → John 's? Johns?)
  • Numbers/currency? ($29.99, 1,000,000)
  • URLs/emails? (https://uchicago.edu)
  • Multi-word expressions? (New York, ice cream)
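
As a quick probe, here is how NLTK's word_tokenize handles a few of these cases. The sentence below is invented for illustration; run it and inspect the output, and note that other tokenizers will make different choices:

# A made-up sentence touching several of the issues above
tricky = "Don't miss John's state-of-the-art demo for $29.99 at https://uchicago.edu in New York!"

print(word_tokenize(tricky))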

Character Tokenization¶

Breaking text into individual characters

Use cases:

  • Character-level language models
  • Handling rare/unknown words
  • Languages without clear word boundaries (e.g., Chinese)
In [5]:
# Character tokenization - just use list()
char_tokens = list(text)

print(f"Text: {text}")
print(f"\nCharacter tokens ({len(char_tokens)}):")
print(char_tokens)
Text: Communication & Intelligence is awesome!

Character tokens (40):
['C', 'o', 'm', 'm', 'u', 'n', 'i', 'c', 'a', 't', 'i', 'o', 'n', ' ', '&', ' ', 'I', 'n', 't', 'e', 'l', 'l', 'i', 'g', 'e', 'n', 'c', 'e', ' ', 'i', 's', ' ', 'a', 'w', 'e', 's', 'o', 'm', 'e', '!']

Character vs Word Vocabulary Size¶

In [6]:
# Compare vocabulary sizes
all_words = word_tokenize(text.lower())
all_chars = list(text.lower())

unique_words = set(all_words)
unique_chars = set(all_chars)

print(f"Unique words: {len(unique_words)}")
print(f"Unique characters: {len(unique_chars)}")
print(f"\nUnique characters: {sorted(unique_chars)}")
Unique words: 6
Unique characters: 16

Unique characters: [' ', '!', '&', 'a', 'c', 'e', 'g', 'i', 'l', 'm', 'n', 'o', 's', 't', 'u', 'w']

Byte-Based Tokenization¶

Breaking text into bytes (UTF-8 encoding)

Advantages:

  • Universal: works for any language
  • Fixed vocabulary size (256 possible byte values)
  • Foundation for BPE tokenization used in GPT models
In [7]:
# Byte tokenization on our sample text
byte_tokens = list(text.encode('utf-8'))

print(f"Text: {text}")
print(f"\nByte tokens ({len(byte_tokens)}):")
print(byte_tokens)
Text: Communication & Intelligence is awesome!

Byte tokens (40):
[67, 111, 109, 109, 117, 110, 105, 99, 97, 116, 105, 111, 110, 32, 38, 32, 73, 110, 116, 101, 108, 108, 105, 103, 101, 110, 99, 101, 32, 105, 115, 32, 97, 119, 101, 115, 111, 109, 101, 33]
In [8]:
# Each ASCII letter is 1 byte
# Show bytes in hex for readability
print("In hexadecimal:")
print([hex(b) for b in byte_tokens])
In hexadecimal:
['0x43', '0x6f', '0x6d', '0x6d', '0x75', '0x6e', '0x69', '0x63', '0x61', '0x74', '0x69', '0x6f', '0x6e', '0x20', '0x26', '0x20', '0x49', '0x6e', '0x74', '0x65', '0x6c', '0x6c', '0x69', '0x67', '0x65', '0x6e', '0x63', '0x65', '0x20', '0x69', '0x73', '0x20', '0x61', '0x77', '0x65', '0x73', '0x6f', '0x6d', '0x65', '0x21']

Multilingual Text and Emojis¶

UTF-8 uses variable-length encoding:

  • ASCII (English letters): 1 byte each
  • Chinese characters: 3 bytes each
  • Emojis: 4 bytes each
In [9]:
# Multilingual example
sample = "Hello 世界 😀"

byte_tokens = list(sample.encode('utf-8'))
print(f"Text: {sample}")
print(f"Total bytes: {len(byte_tokens)}")
print(f"\nBreakdown:")
print(f"  'Hello ' = {len('Hello '.encode('utf-8'))} bytes (ASCII)")
print(f"  '世界'   = {len('世界'.encode('utf-8'))} bytes (Chinese, 3 bytes each)")
print(f"  ' 😀'   = {len(' 😀'.encode('utf-8'))} bytes (space + emoji, 4 bytes)")
Text: Hello 世界 😀
Total bytes: 17

Breakdown:
  'Hello ' = 6 bytes (ASCII)
  '世界'   = 6 bytes (Chinese, 3 bytes each)
  ' 😀'   = 5 bytes (space 1 byte + emoji 4 bytes)
In [10]:
# Compare tokenization methods on multilingual text
print(f"Text: {sample}\n")

print("Word tokenization:")
words = word_tokenize(sample)
print(f"  {len(words)} tokens: {words}")

print("\nCharacter tokenization:")
characters = list(sample)
print(f"  {len(characters)} tokens: {characters}")

print("\nByte tokenization:")
byte_tokens = list(sample.encode('utf-8'))
print(f"  {len(byte_tokens)} tokens: {byte_tokens}")
Text: Hello 世界 😀

Word tokenization:
  3 tokens: ['Hello', '世界', '😀']

Character tokenization:
  10 tokens: ['H', 'e', 'l', 'l', 'o', ' ', '世', '界', ' ', '😀']

Byte tokenization:
  17 tokens: [72, 101, 108, 108, 111, 32, 228, 184, 150, 231, 149, 140, 32, 240, 159, 152, 128]
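
Because UTF-8 encoding is lossless, the byte tokens can always be reassembled into the original string:

# Reassemble the bytes and decode back to text
decoded = bytes(byte_tokens).decode('utf-8')
print(decoded == sample)  # True: nothing is lost at the byte level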

BPE (Byte Pair Encoding) Tokenization¶

Subword tokenization - a middle ground between word and character level

Key idea: Start with characters, iteratively merge the most frequent pairs

Used by: GPT-2, GPT-3, GPT-4, LLaMA, and many modern LLMs
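
Before turning to a production tokenizer, here is a minimal sketch of the core merge loop on a tiny made-up word list. A real BPE tokenizer operates over bytes, weights pairs by word frequency, and stops at a target vocabulary size; this toy version only illustrates the "merge the most frequent pair" idea:

from collections import Counter

# Toy corpus: each word as a tuple of symbols
toy_corpus = [tuple("lower"), tuple("lowest"), tuple("newer"), tuple("wider")]

def most_frequent_pair(words):
    """Count adjacent symbol pairs across all words and return the most frequent one."""
    pair_counts = Counter()
    for word in words:
        for a, b in zip(word, word[1:]):
            pair_counts[(a, b)] += 1
    return pair_counts.most_common(1)[0][0] if pair_counts else None

def merge_pair(words, pair):
    """Replace every occurrence of the pair with a single merged symbol."""
    merged_words = []
    for word in words:
        out, i = [], 0
        while i < len(word):
            if i < len(word) - 1 and (word[i], word[i + 1]) == pair:
                out.append(word[i] + word[i + 1])
                i += 2
            else:
                out.append(word[i])
                i += 1
        merged_words.append(tuple(out))
    return merged_words

# A few merge steps: the most frequent adjacent pairs become new subword symbols
for step in range(4):
    pair = most_frequent_pair(toy_corpus)
    if pair is None:
        break
    print(f"Merge {step + 1}: {pair}")
    toy_corpus = merge_pair(toy_corpus, pair)

print(toy_corpus)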

In [11]:
import tiktoken

# Use GPT-4's tokenizer (cl100k_base encoding)
enc = tiktoken.get_encoding("cl100k_base")

bpe_tokens = enc.encode(text)

print(f"Text: {text}")
print(f"\nBPE token IDs ({len(bpe_tokens)}):")
print(bpe_tokens)

print("\nDecoded tokens:")
for token_id in bpe_tokens:
    print(f"  {token_id} -> '{enc.decode([token_id])}'")
Text: Communication & Intelligence is awesome!

BPE token IDs (6):
[66511, 612, 22107, 374, 12738, 0]

Decoded tokens:
  66511 -> 'Communication'
  612 -> ' &'
  22107 -> ' Intelligence'
  374 -> ' is'
  12738 -> ' awesome'
  0 -> '!'
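
Notice that the leading space is part of the token (' &', ' is'). As a quick check, decoding the IDs reproduces the original string exactly:

# BPE encode/decode round-trips without loss
print(enc.decode(bpe_tokens) == text)  # True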

Sentence Tokenization¶

Breaking text into sentences

You can probably guess what I am going to ask: what counts as a sentence?

In [12]:
text = "This course will introduce fundamental concepts in natural language processing (NLP). It will cover the basics of enabling computers to understand and generate language, including word embeddings, language modeling, transformers, and an overview of large language models. It will also cover topics on connections with other disciplines such as linguistics and other social sciences."

print("Text:", text)
Text: This course will introduce fundamental concepts in natural language processing (NLP). It will cover the basics of enabling computers to understand and generate language, including word embeddings, language modeling, transformers, and an overview of large language models. It will also cover topics on connections with other disciplines such as linguistics and other social sciences.

Rule-based: spaCy Sentencizer¶

spaCy's Sentencizer uses simple punctuation rules:

In [13]:
import spacy
from spacy.lang.en import English

# Create a blank English model with rule-based sentencizer
nlp = English()
nlp.add_pipe("sentencizer")

doc = nlp(text)
sentences = list(doc.sents)

print(f"Found {len(sentences)} sentences (spaCy rule-based):\n")
for i, sent in enumerate(sentences, 1):
    print(f"{i}. {sent.text.strip()}")
Found 3 sentences (spaCy rule-based):

1. This course will introduce fundamental concepts in natural language processing (NLP).
2. It will cover the basics of enabling computers to understand and generate language, including word embeddings, language modeling, transformers, and an overview of large language models.
3. It will also cover topics on connections with other disciplines such as linguistics and other social sciences.

NLTK Sentence Tokenizer¶

NLTK uses a trained model (Punkt) that handles abbreviations better:

In [14]:
from nltk.tokenize import sent_tokenize

nltk_sentences = sent_tokenize(text)

print(f"Found {len(nltk_sentences)} sentences (NLTK Punkt):\n")
for i, sent in enumerate(nltk_sentences, 1):
    print(f"{i}. {sent.strip()}")
Found 3 sentences (NLTK Punkt):

1. This course will introduce fundamental concepts in natural language processing (NLP).
2. It will cover the basics of enabling computers to understand and generate language, including word embeddings, language modeling, transformers, and an overview of large language models.
3. It will also cover topics on connections with other disciplines such as linguistics and other social sciences.

What about this text?¶

In [15]:
text = "I work at U.of.C. What about you?"

print("Text:", text)
Text: I work at U.of.C. What about you?
In [16]:
doc = nlp(text)
sentences = list(doc.sents)

print(f"Found {len(sentences)} sentences (spaCy rule-based):\n")
for i, sent in enumerate(sentences, 1):
    print(f"{i}. {sent.text.strip()}")
Found 2 sentences (spaCy rule-based):

1. I work at U.of.
2. C. What about you?
In [17]:
nltk_sentences = sent_tokenize(text)

print(f"Found {len(nltk_sentences)} sentences (NLTK Punkt):\n")
for i, sent in enumerate(nltk_sentences, 1):
    print(f"{i}. {sent.strip()}")
Found 2 sentences (NLTK Punkt):

1. I work at U.of.C.
2. What about you?

Takeaways¶

  • Humans read text in ways that may differ from how machines process it.

  • What is important to humans may not be important for machines.