Named-entity recognition with spaCy
Named-entity recognition is the problem of finding things that are mentioned by name in text. Examples include places (San Francisco), people (Darth Vader), and organizations (Unbox Research). See also the Wikipedia article on named-entity recognition.
Key statements
# Inputs: document_string (a str)
import spacy
# Load a language model and parse a document.
# Use the model's full package name: the bare 'en' shortcut link is not
# accepted by spaCy v3 and later.
nlp = spacy.load('en_core_web_sm')
doc = nlp(document_string)

# Find named entities.
for ent in doc.ents:
    print(ent)  # Each `ent` is a `Span` object.

# Recover entity substrings in the source document.
for ent in doc.ents:
    substring = document_string[ent.start_char: ent.end_char]
Working example
import spacy
# Set up functions to help produce human-friendly printing.
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
def skip_and_print(*args, **kwargs):
    """ Act like print(), but skip a line before printing.

    Positional arguments are printed as print() would print them, and
    keyword arguments (`sep`, `end`, `file`, `flush`) are forwarded to
    print() unchanged. The skipped line is produced by prefixing a
    newline to the str() of the first argument, so it is written by the
    same print() call as the rest of the output.

    With no positional arguments, only the skipped line and an empty
    line are printed.
    """
    if args:
        first, rest = args[0], args[1:]
    else:
        # print() accepts zero arguments; mirror that here rather than
        # failing with an IndexError on args[0].
        first, rest = '', ()
    print('\n' + str(first), *rest, **kwargs)
def print_table(rows):
    """ Print `rows` with content-based column widths.

    `rows` is an iterable of equal-length rows. The first row is treated
    as the header and is underlined with a line of `~` characters; the
    remaining rows are printed below it. Every cell is converted with
    str() and left-justified to the width of the widest cell in its
    column, with a single space between columns.

    Nothing is printed when `rows` is empty.
    """
    # Materialize once so that any iterable works and rows can be
    # traversed twice (once for widths, once for printing).
    rows = [tuple(row) for row in rows]
    if not rows:
        # There is no header row to print.
        return

    col_widths = [
        max(len(str(value)) for value in col)
        for col in zip(*rows)
    ]
    # Cell widths plus one separating space between adjacent columns.
    total_width = sum(col_widths) + len(col_widths) - 1
    # Builds a format such as '%-14s %-5s': left-justified, fixed width.
    fmt = ' '.join('%%-%ds' % width for width in col_widths)

    header, *body = rows
    print(fmt % header)
    print('~' * total_width)
    for row in body:
        print(fmt % row)
# Load a language model and parse a document.
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# Use the model's full package name: the bare 'en' shortcut link is not
# accepted by spaCy v3 and later.
nlp = spacy.load('en_core_web_sm')

document_string = "I like to visit Park Tea House in Berkeley."
skip_and_print('Working with string: "%s"' % document_string)
doc = nlp(document_string)
# Finding named entities.
# ~~~~~~~~~~~~~~~~~~~~~~~

# The first row is the table header; one row per entity follows it.
rows = [['Name', 'Start', 'End', 'Label']]

# `doc.ents` yields one `Span` object per named entity.
rows.extend(
    [
        ent.text,        # The named entity phrase, as a str.
        ent.start_char,  # Source str index of its first char.
        ent.end_char,    # Source str index just past its last char.
        ent.label_,      # The entity type, as a str label.
    ]
    for ent in doc.ents
)

skip_and_print('Named entities found:')
print_table(rows)
# Named entities found:
#
# Name           Start End Label
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Park Tea House 16    30  ORG
# Berkeley       34    42  GPE
# Example entity types (values found in .label_):
#
# ORG = organization
# GPE = geo-political entity
# PERSON = person (may be fictional!)
# Recovering substrings of named entities in your source.
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# This is an example of recovering the original substring from your
# document. It is useful when, for instance, the rest of your pipeline
# works in terms of character indexes into a master document.
for ent in doc.ents:
    skip_and_print('Recovering "%s":' % ent)
    print(document_string)

    # Underline the entity: pad out to its first character, then draw
    # one caret per character of the entity text.
    print(' ' * ent.start_char, '^' * len(ent.text), sep='')

    # The same character offsets give the entity substring as a slice:
    substr = document_string[ent.start_char: ent.end_char]
# Recovering "Park Tea House":
# I like to visit Park Tea House in Berkeley.
#                 ^^^^^^^^^^^^^^
#
# Recovering "Berkeley":
# I like to visit Park Tea House in Berkeley.
#                                   ^^^^^^^^
Notes
spaCy's built-in entity types are listed in their docs for named-entity recognition.