Character and token categories in spaCy

spaCy can identify numeric quantities, email addresses, URLs, and certain Unicode-based character classes such as whitespace, letters, and digits.

Language: Python 3
Library: spacy

Key statements

# Inputs: document_string (a str)

import spacy

# Load the language model and parse your document.
# The pipeline is loaded by its full package name: the bare 'en' shortcut
# is only a link in spaCy 2.x and is rejected by spaCy 3.
nlp = spacy.load('en_core_web_sm')  # Small English model; others available.
doc = nlp(document_string)

# Extract all emails, URLs, and numbers from document_string.
# Each list holds the matching tokens themselves, in document order.
emails  = [token for token in doc if token.like_email]
urls    = [token for token in doc if token.like_url]
numbers = [token for token in doc if token.like_num]

# Find all tokens that are all-digits as well as quote marks.
digit_tokens = [token for token in doc if token.is_digit]
quote_tokens = [token for token in doc if token.is_quote]

# Other .is_<something> attributes include:
#  .is_alpha, .is_lower, .is_upper, .is_punct, .is_stop

Working example

import spacy


# Set up a function to help produce human-friendly printing.
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

def skip_and_print(*args, **kwargs):
    """ Act like print(), but skip a line before printing.

    A newline is prepended to the string form of the first positional
    argument; the remaining positional arguments and any keyword
    arguments (sep, end, file, flush) are handed to print() unchanged.
    With no positional arguments, only the skipped line and print()'s
    usual line ending are written.
    """
    # Guard the empty call so that, like print(), this never raises.
    first, rest = (args[0], args[1:]) if args else ('', ())
    print('\n' + str(first), *rest, **kwargs)


# Load a language model.
# ~~~~~~~~~~~~~~~~~~~~~~

# Use the full package name of the small English pipeline; the bare 'en'
# shortcut is only a link in spaCy 2.x and is rejected by spaCy 3.
nlp = spacy.load('en_core_web_sm')


# Print out the list of stop words from a language model.
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# The term "stop words" applies to very common words in a given
# language. They're so common that they add little new
# information to the text. Examples: "the", "and", "about".

import spacy.lang.en as en

skip_and_print('List of stop words from the en model:')
# STOP_WORDS is a set, so its iteration order is arbitrary; sort it so
# the printed list is alphabetical and the same on every run.
print(' '.join(sorted(en.stop_words.STOP_WORDS)))


# Recognizing emails, URLs, and numbers.
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# The sample text contains a web address, an email address, and a few
# number-like strings for the like_* token attributes to pick out.
sample1 = """I visit google.com about 1,000 times a day.
I often email myself at hi@example.com to remember things.
At 4:00pm I paid $8.75 for an indulgently fancy coffee."""

doc1 = nlp(sample1)

skip_and_print('Extracting from this string:\n%s' % sample1)

# Tokens that look like URLs.
skip_and_print('URLs:')
url_tokens = [token for token in doc1 if token.like_url]
print(url_tokens)
# Prints: [google.com]

# Tokens that look like email addresses.
skip_and_print('emails:')
email_tokens = [token for token in doc1 if token.like_email]
print(email_tokens)
# Prints: [hi@example.com]

# Tokens that look like numbers.
skip_and_print('numbers:')
number_tokens = [token for token in doc1 if token.like_num]
print(number_tokens)
# Prints: [1,000, 8.75]


# Known character classes
# ~~~~~~~~~~~~~~~~~~~~~~~

# spacy can recognize these kinds of tokens:
# Each name below is prefixed with 'is_' and looked up as an attribute
# on every token of the sample document.
categories = [
    'alpha',     # Letters (in any language).
    'digit',     # Digits like 0-9 or ১২৩ (Bengali digits).
    'lower',     # lower case like this.
    'upper',     # UPPER CASE LIKE THIS.
    'title',     # Title Case Like This.
    'punct',     # Punctuation marks.
    'space',     # All-whitespace tokens.
    'bracket',   # Brackets like [ or ].
    'quote',     # Quotation marks.
    'currency',  # Currency symbols, like £ or ฿.
    'stop',      # Stop words.
]

sample2 = """A word in Russian, such as «Привет» is still
understood in terms of character classes. Numbers [like £300]
can be recognized as well."""

doc2 = nlp(sample2)

skip_and_print('Categories from this string:\n%s' % sample2)

# For every category, print a heading and then the text of each token
# whose matching is_<category> attribute is truthy.
for category in categories:
    skip_and_print(category + ':')
    attribute_name = 'is_' + category
    matching_texts = [
        token.text for token in doc2 if getattr(token, attribute_name)
    ]
    print(matching_texts)

Notes

As implied in the spaCy docs for the Token class, the is_alpha, is_digit, is_upper, is_lower, is_title, and is_space attributes hold cached values that mirror the results of Python's built-in str methods with corresponding names; for example, token.is_alpha corresponds to token.text.isalpha().

Notes

See Also