Tokenization with spaCy

Tokenization is the process of breaking a document down into standardized word representations, as well as splitting off separating punctuation into tokens of its own. See Wikipedia: Text segmentation.
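For example, contractions and trailing punctuation become tokens of their own. The snippet below is a minimal sketch (it assumes the default English model has already been downloaded); the token texts in the comment are the expected result, though they may vary slightly by model version.

import spacy

nlp = spacy.load('en')
doc = nlp("Don't panic!")
print([token.text for token in doc])
# Expected output (roughly): ['Do', "n't", 'panic', '!']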

Key statements
# Inputs: document_string (a str), process_token() (a fn)

import spacy

# Load the language model and parse your document.
nlp = spacy.load('en')  # en for English; others available.
doc = nlp(document_string)

# Perform your operations over the tokens of the document.
for token in doc:
    process_token(token)

# Alternatively, index or slice into doc as if it were a list.

Working example
import json  # this is used for pretty-printing.
import spacy

# Set up functions to help produce human-friendly printing.
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

def skip_and_print(*args):
    """ Act like print(), but skip a line before printing. """
    print('\n' + str(args[0]), *args[1:])

def print_table(rows, padding=0):
    """ Print `rows` with content-based column widths. """
    col_widths = [
        max(len(str(value)) for value in col) + padding
        for col in zip(*rows)
    ]
    total_width = sum(col_widths) + len(col_widths) - 1
    fmt = ' '.join('%%-%ds' % width for width in col_widths)
    print(fmt % tuple(rows[0]))
    print('~' * total_width)
    for row in rows[1:]:
        print(fmt % tuple(row))

# Load a language model.
# ~~~~~~~~~~~~~~~~~~~~~~

nlp = spacy.load('en')

# Understand your language model.
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# Language models may contain useful metadata, such as how the
# model was created or a contact email address.
skip_and_print('Data about the language model:')
print(json.dumps(nlp.meta, indent=4))
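# As a brief sketch, pull out a few individual fields. (Exact keys
# vary by model, so .get() is used to tolerate missing entries.)
skip_and_print('Selected metadata fields:')
print('Language:', nlp.meta.get('lang'))
print('Version: ', nlp.meta.get('version'))
print('Email:   ', nlp.meta.get('email'))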

# Parse and tokenize a string.
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# Parse a string as a document.
orig_str = 'The quick brown fox jumps over the lazy dog.'
skip_and_print('String being parsed: "%s"' % orig_str)
doc = nlp(orig_str)

# Although the Doc type (the type of `doc`) is not a list, you
# can index tokens and slices of tokens using the same syntax.

skip_and_print('Type of `doc`:', type(doc))          # Doc

skip_and_print('Type of doc[0]:', type(doc[0]))      # Token
print('doc[0]:', doc[0])                             # The

skip_and_print('Type of doc[-3:]:', type(doc[-3:]))  # Span
print('doc[-3:]:', doc[-3:])                         # lazy dog.

# Iterate over tokens of the document.
skip_and_print('List of tokens in doc:')
for token in doc:
    print(token)

# Enumerate all token attributes.
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# Each token has many useful properties.
skip_and_print('List of attributes from a sample Token:')
print([attr for attr in dir(doc[0]) if '__' not in attr])
# Attributes include, e.g., .pos_, .text, and .vector.
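# As a sketch, here are a few attribute values for each token.
# (.is_punct is set by the tokenizer; .pos_ requires the model's
# tagger, which the default English model includes.)
skip_and_print('Sample values of .text, .is_punct, .pos_:')
for token in doc:
    print(token.text, token.is_punct, token.pos_)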

# Use .text_with_ws to recreate your original string.
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

skip_and_print('Recreated original string:')
print(''.join(token.text_with_ws for token in doc))

# Mapping tokens to integers.
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~

# The `orth` attribute provides an integer id for every token:
skip_and_print('The .orth value for all tokens:')
print([token.orth for token in doc])  # A list of integers.
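# These ids index into the model's string store, so you can map
# back and forth between strings and ids. (A sketch using
# nlp.vocab.strings; the reverse lookup works because "fox"
# appears in the parsed document.)
fox_id = nlp.vocab.strings['fox']
skip_and_print('Id for "fox":', fox_id)
print('String for that id:', nlp.vocab.strings[fox_id])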

# Each token also comes with a normalized form. In English, this
# may simply be the lowercased word itself.

# Let's print a table of token texts, ids, normalized forms, and
# the ids for the normalized forms:

skip_and_print('Compare text/orth vs norm\'d string/ids:')
rows = [['Text', 'Orth Id', 'Norm\'d', 'Norm Id']]
for token in doc:
    rows.append([token.text,   # Token str w/o outer space.
                 token.orth,   # Integer id for .text value.
                 token.norm_,  # Normalized str of .text value.
                 token.norm])  # Integer id for .norm_ value.

print_table(rows)  # Render the comparison table with the helper above.


Download the default English model with the shell command python -m spacy download en.

Other language models are listed on spaCy's official models page. That page includes information about English language models that are more comprehensive than the default.
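If a model has not been downloaded yet, spacy.load() raises an OSError. The snippet below is a minimal sketch of fetching the default English model on demand; it assumes spacy.cli.download is available in your spaCy version.

import spacy
from spacy.cli import download

try:
    nlp = spacy.load('en')
except OSError:
    # The model isn't installed yet: download it, then load again.
    download('en')
    nlp = spacy.load('en')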