Character and token categories in spaCy
spaCy can identify numeric quantities, email addresses, URLs, and certain Unicode-based character classes such as whitespace, letters, and digits.
Key statements
# Inputs: document_string (a str)
import spacy
# Load the language model and parse your document.
nlp = spacy.load('en_core_web_sm')  # A small English model; others are available.
doc = nlp(document_string)
# Extract all emails, URLs, and numbers from document_string.
emails = [token for token in doc if token.like_email]
urls = [token for token in doc if token.like_url]
numbers = [token for token in doc if token.like_num]
# Find all tokens made entirely of digits, and all quote marks.
digit_tokens = [token for token in doc if token.is_digit]
quote_tokens = [token for token in doc if token.is_quote]
# Other .is_<something> attributes include:
# .is_alpha, .is_lower, .is_upper, .is_punct, .is_stop
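These flags compose naturally. For instance, one common preprocessing step is to keep only a document's content words by dropping stop words and non-letter tokens; a minimal sketch, reusing the nlp object loaded above (the sample sentence is invented for illustration):
doc = nlp('The cat sat on the mat, and then it slept.')
content = [t.text for t in doc if t.is_alpha and not t.is_stop]
print(content)  # Something like: ['cat', 'sat', 'mat', 'slept']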
Working example
import spacy
# Set up a function to help produce human-friendly printing.
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
def skip_and_print(*args):
    """ Act like print(), but skip a line before printing. """
    print('\n' + str(args[0]), *args[1:])
# Load a language model.
# ~~~~~~~~~~~~~~~~~~~~~~
nlp = spacy.load('en_core_web_sm')
# Print out the list of stop words from a language model.
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# The term "stop words" applies to very common words in a given
# language. They're so common that they add little new
# information to the text. Examples: "the", "and", "about".
import spacy.lang.en as en
skip_and_print('List of stop words from the en model:')
for stop_word in en.stop_words.STOP_WORDS:  # This is a set.
    print(stop_word, end=' ')
print()
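# Checking stop words on individual tokens.
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Each token carries an is_stop flag, so you can also check
# stop-word status directly on parsed text. (This sample
# sentence is invented for illustration.)
stop_doc = nlp('This is about the view from the summit.')
skip_and_print('Stop words found in the sentence:')
print([t.text for t in stop_doc if t.is_stop])
# Prints something like: ['is', 'about', 'the', 'from', 'the']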
# Recognizing emails, URLs, and numbers.
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
sample1 = """I visit google.com about 1,000 times a day.
I often email myself at hi@example.com to remember things.
At 4:00pm I paid $8.75 for an indulgently fancy coffee."""
doc1 = nlp(sample1)
skip_and_print('Extracting from this string:\n%s' % sample1)
skip_and_print('URLs:')
print([t for t in doc1 if t.like_url])
# Prints: [google.com]
skip_and_print('emails:')
print([t for t in doc1 if t.like_email])
# Prints: [hi@example.com]
skip_and_print('numbers:')
print([t for t in doc1 if t.like_num])
# Prints: [1,000, 8.75]
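# Note that like_num is broader than is_digit: it also matches
# spelled-out numbers. (This sample is invented to show that.)
word_doc = nlp('We ordered nine pizzas and twelve sodas.')
skip_and_print('spelled-out numbers:')
print([t for t in word_doc if t.like_num])
# Prints something like: [nine, twelve]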
# Known character classes
# ~~~~~~~~~~~~~~~~~~~~~~~
# spaCy can recognize these kinds of tokens:
categories = [
    'alpha',     # Letters (in any language).
    'digit',     # Digits like 0-9 or ১২৩ (Bengali digits).
    'lower',     # lower case like this.
    'upper',     # UPPER CASE LIKE THIS.
    'title',     # Title Case Like This.
    'punct',     # Punctuation marks.
    'space',     # All-whitespace tokens.
    'bracket',   # Brackets like [ or ].
    'quote',     # Quotation marks.
    'currency',  # Currency symbols, like £ or ฿.
    'stop'       # Stop words.
]
sample2 = """A word in Russian, such as «Привет» is still
understood in terms of character classes. Numbers [like £300]
can be recognized as well."""
doc2 = nlp(sample2)
skip_and_print('Categories from this string:\n%s' % sample2)
for category in categories:
    skip_and_print(category + ':')
    print([
        token.text
        for token in doc2
        if getattr(token, 'is_' + category)
    ])
Notes
As implied in the spaCy docs for the Token class, the is_alpha, is_digit, is_upper, is_lower, is_title, and is_space attributes hold cached values that reflect the behavior of Python's built-in str methods with similar names (such as str.isalpha()).
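For example, the token attribute and the corresponding str method agree on simple input; a quick sketch, assuming the same en_core_web_sm model as above:
import spacy

nlp = spacy.load('en_core_web_sm')
token = nlp('hello world')[0]
print(token.is_alpha)        # Prints: True
print(token.text.isalpha())  # Prints: True
The practical difference is that the token attributes are computed once per lexeme and cached, so repeated lookups are cheap.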