Word vectors with spaCy
spaCy provides a mapping from a vocabulary of common words to vectors. These vectors, sometimes called "word embeddings," are designed (using the GloVe algorithm) so that words with similar meanings are mapped to numerically nearby vectors. Wikipedia: Word embedding
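To make "numeric proximity" concrete: similarity between two embeddings is usually measured as the cosine of the angle between them. Here is a minimal sketch using numpy and made-up 4-dimensional toy values (real spaCy vectors are much longer):
import numpy as np
# Toy 4-dimensional "embeddings" (made-up values for illustration).
happy = np.array([0.04, 0.41, -0.52, -0.07])
glad = np.array([0.07, 0.25, -0.53, -0.03])
# Cosine similarity: dot product of the vectors, normalized by their lengths.
cos_sim = np.dot(happy, glad) / (np.linalg.norm(happy) * np.linalg.norm(glad))
print('Cosine similarity: %.2f' % cos_sim)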
Key statements
# WARNING: spaCy's default English model *does not provide* word vector data.
# Instead, it provides "good guess" vectors that are lower quality.
# Download a larger language model, like this, to receive vector data:
# python -m spacy download en_core_web_lg
# Inputs: document_string (a str)
import numpy as np
import spacy
# Load a language model and parse your document.
nlp = spacy.load('en_core_web_lg')
doc = nlp(document_string)
# Retrieve word vectors for each token.
for token in doc:
    print('Vector for %s:' % token, token.vector)
# Compute word similarities using vector data.
# We'll put these into a numpy matrix.
matrix_rows = []
for token1 in doc:
    row = [token1.similarity(token2) for token2 in doc]
    matrix_rows.append(row)
similarity_matrix = np.array(matrix_rows)
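If a model without vectors is loaded by mistake, similarity() still runs but gives low-quality results (spaCy typically emits a warning about this). As a quick sanity check, you can inspect each token's standard vector attributes:
# Sanity check: confirm the model actually ships with vectors.
for token in doc:
    print(token.text, token.has_vector, token.vector_norm, token.is_oov)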
Working example
import spacy
# Set up functions to help produce human-friendly printing.
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
def skip_and_print(*args):
    """ Act like print(), but skip a line before printing. """
    print('\n' + str(args[0]), *args[1:])
# The next definitions work together to help print tables.
col_widths = []
def row(*values):
    """ Return a str with `values` spaced by `col_widths`. """
    fmt = ' '.join('%%%ds' % width for width in col_widths)
    return fmt % tuple(values)
def hr():
    """ Return a horizontal rule str using `col_widths`. """
    size = sum(map(abs, col_widths)) + len(col_widths) - 1
    return '~' * size
# Load a language model and parse a document.
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# spaCy's default English language model doesn't include word
# vector data, so we need a larger model variant for vectors.
# Here's how to download a large model for English:
# python -m spacy download en_core_web_lg
nlp = spacy.load('en_core_web_lg')
document_string = 'happy glad cheddar munster'
skip_and_print('Working with string: "%s"' % document_string)
doc = nlp(document_string)
# Retrieve word vectors for each token.
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
skip_and_print('Word vectors for each token:\n')
col_widths = [7, 1, -30]
print(row('Word', '|', 'Vector'))
print(hr())
for token in doc:
    v = token.vector
    v_str = ('%5.2f ' * 4) % tuple(v[:4])
    print(row(token, '|', '(' + v_str + '...)'))
# Word vectors for each token:
#
# Word | Vector
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# happy | ( 0.04 0.41 -0.52 -0.07 ...)
# glad | ( 0.07 0.25 -0.53 -0.03 ...)
# cheddar | (-0.63 0.53 0.23 -0.16 ...)
# munster | (-0.15 0.76 0.48 0.23 ...)
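The printed output truncates each vector to its first four components; the full vectors in en_core_web_lg have 300 dimensions. You can confirm this directly:
# Each token's vector is a 300-dimensional numpy array.
print(doc[0].vector.shape)  # Prints: (300,)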
# Compute word similarities using vector data.
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
skip_and_print('Similarity matrix for the above words:\n')
col_widths = [7] * 5 # Our longest word has length 7.
print(row('', *doc))
print(hr())
for token1 in doc:
    values = [token1]
    for token2 in doc:
        values.append('%5.2f' % token1.similarity(token2))
    print(row(*values))
# Similarity matrix for the above words:
#
# happy glad cheddar munster
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# happy 1.00 0.77 0.06 -0.01
# glad 0.77 1.00 0.07 0.00
# cheddar 0.06 0.07 1.00 0.21
# munster -0.01 0.00 0.21 1.00
Notes
spaCy offers support for other word vector sources, including Facebook's fastText model, as well as for custom word-similarity methods.
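For instance, you can register your own vector for a word directly on the vocabulary via Vocab.set_vector. Here's a short sketch; the word 'framboozle' and the random values are placeholders, not real embedding data:
import numpy as np
import spacy
nlp = spacy.load('en_core_web_lg')
# Hypothetical example: assign a custom 300-dim vector to a made-up word.
custom_vector = np.random.uniform(-1, 1, (300,))
nlp.vocab.set_vector('framboozle', custom_vector)
# Tokens for this word now use the custom vector.
doc = nlp('framboozle')
print(doc[0].vector[:4])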