Tokenization with spaCy
Tokenization is the process of breaking a document down into standardized word representations, as well as splitting off the punctuation that separates them. See also Wikipedia: Text segmentation.
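For instance, with the default English model installed (a minimal sketch; the sample sentence and exact output shown are ours, not spaCy's), punctuation marks and contractions become tokens of their own:

import spacy

nlp = spacy.load('en')
doc = nlp("Let's tokenize this sentence, shall we?")
print([token.text for token in doc])
# Expected output (roughly):
# ['Let', "'s", 'tokenize', 'this', 'sentence', ',', 'shall', 'we', '?']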
Key statements
# Inputs: document_string (a str), process_token() (a fn)
import spacy
# Load the language model and parse your document.
nlp = spacy.load('en') # en for English; others available.
doc = nlp(document_string)
# Perform your operations over the tokens of the document.
for token in doc:
    process_token(token)
# Alternatively, index or slice into doc as if it were a list.
process_token(doc[234])
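Here, process_token() stands in for whatever per-token work you need. As a hypothetical example (the filtering logic below is an assumption, not part of the snippet above), it might collect the lowercased text of every non-punctuation token:

# A hypothetical process_token(): gather lowercased words, skipping
# punctuation tokens.
words = []

def process_token(token):
    if not token.is_punct:
        words.append(token.lower_)

for token in doc:
    process_token(token)
print(words)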
Working example
import json  # This is used for pretty-printing.
import spacy
# Set up functions to help produce human-friendly printing.
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
def skip_and_print(*args):
    """ Act like print(), but skip a line before printing. """
    print('\n' + str(args[0]), *args[1:])

def print_table(rows, padding=0):
    """ Print `rows` with content-based column widths. """
    col_widths = [
        max(len(str(value)) for value in col) + padding
        for col in zip(*rows)
    ]
    total_width = sum(col_widths) + len(col_widths) - 1
    fmt = ' '.join('%%-%ds' % width for width in col_widths)
    print(fmt % tuple(rows[0]))
    print('~' * total_width)
    for row in rows[1:]:
        print(fmt % tuple(row))
# Load a language model.
# ~~~~~~~~~~~~~~~~~~~~~~
nlp = spacy.load('en')
# Understand your language model.
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Language models may contain useful metadata, such as how the
# model was created or a contact email address.
skip_and_print('Data about the language model:')
print(json.dumps(nlp.meta, indent=4))
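# (Extra illustration, not in the original example: `nlp.meta` is a
# plain dict, so individual fields can be read directly. Exact keys
# vary by model, but 'lang' and 'version' are typical.)
print('Model language:', nlp.meta.get('lang'))
print('Model version: ', nlp.meta.get('version'))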
# Parse and tokenize a string.
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Parse a string as a document.
orig_str = 'The quick brown fox jumps over the lazy dog.'
skip_and_print('String being parsed: "%s"' % orig_str)
doc = nlp(orig_str)
# Although the Doc type (the type of `doc`) is not a list, you
# can index tokens and slices of tokens using the same syntax.
skip_and_print('Type of `doc`:', type(doc)) # Doc
skip_and_print('Type of doc[0]:', type(doc[0])) # Token
print('doc[0]:', doc[0]) # The
skip_and_print('Type of doc[-3:]:', type(doc[-3:])) # Span
print('doc[-3:]:', doc[-3:]) # lazy dog.
# Iterate over tokens of the document.
skip_and_print('List of tokens in doc:')
for token in doc:
    print(token)
# Enumerate all token attributes.
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Each token has many useful properties.
skip_and_print('List of attributes from a sample Token:')
print([attr for attr in dir(doc[0]) if '__' not in attr])
# Attributes include, e.g., .pos_, .text, and .vector.
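# (Extra illustration, not in the original example: print a few of
# these attributes for one token. Attributes like .pos_ are only
# filled in when the loaded model includes a tagger.)
sample = doc[1]  # 'quick'
skip_and_print('Some attributes of "%s":' % sample.text)
print('.pos_    :', sample.pos_)      # Coarse part-of-speech tag.
print('.lemma_  :', sample.lemma_)    # Base form of the token.
print('.is_alpha:', sample.is_alpha)  # True if all chars are letters.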
# Use .text_with_ws to recreate your original string.
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
skip_and_print('Recreated original string:')
print(''.join(token.text_with_ws for token in doc))
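# (Extra check, not in the original example: the Doc also exposes the
# full text directly, and it matches the reconstruction above.)
print('doc.text == orig_str:', doc.text == orig_str)  # True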
# Mapping tokens to integers.
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~
# The `orth` attribute provides an integer id for every token:
skip_and_print('The .orth value for all tokens:')
print([token.orth for token in doc]) # A list of integers.
# Each token also comes with a normalized form. In English, this
# may simply be the lowercased word itself.
# Let's print a table of token texts, ids, normalized forms, and
# the ids for the normalized forms:
skip_and_print('Compare text/orth vs norm\'d string/ids:')
rows = [['Text', 'Orth Id', 'Norm\'d', 'Norm Id']]
for token in doc:
    rows.append([token.text,   # Token str w/o outer space.
                 token.orth,   # Integer id for .text value.
                 token.norm_,  # Normalized str of .text value.
                 token.norm])  # Integer id for .norm_ value.
print_table(rows)
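# Map ids back to strings.
# ~~~~~~~~~~~~~~~~~~~~~~~~
# (Extra illustration, not in the original example: the shared
# vocabulary can turn these integer ids back into strings.)
skip_and_print('Strings recovered from the .orth ids:')
print([nlp.vocab.strings[token.orth] for token in doc])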
Notes
Download the default English model with the shell command python -m spacy download en.
Other language models are listed on spaCy's official models page. That page includes information about English language models that are more comprehensive than the default.
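For example, a larger model can be downloaded and then loaded by its full name (en_core_web_md is used here only as an illustration; check the models page for the current names, and note that newer spaCy releases expect the full model name rather than the en shortcut):

# In the shell:
#     python -m spacy download en_core_web_md
import spacy

nlp = spacy.load('en_core_web_md')  # This model includes word vectors.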