777 lines
37 KiB
Python
777 lines
37 KiB
Python
|
|
"""Functionality to match phone numbers in a piece of text"""
|
||
|
|
|
||
|
|
# Based on original Java code:
|
||
|
|
# java/src/com/google/i18n/phonenumbers/PhoneNumberMatch.java
|
||
|
|
# java/src/com/google/i18n/phonenumbers/PhoneNumberMatcher.java
|
||
|
|
# Copyright (C) 2011 The Libphonenumber Authors
|
||
|
|
#
|
||
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||
|
|
# you may not use this file except in compliance with the License.
|
||
|
|
# You may obtain a copy of the License at
|
||
|
|
#
|
||
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||
|
|
#
|
||
|
|
# Unless required by applicable law or agreed to in writing, software
|
||
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||
|
|
# See the License for the specific language governing permissions and
|
||
|
|
# limitations under the License.
|
||
|
|
import re
|
||
|
|
|
||
|
|
# Extra regexp function; see README
|
||
|
|
from .re_util import fullmatch
|
||
|
|
from .util import UnicodeMixin, u, unicod, prnt
|
||
|
|
from .util import U_EMPTY_STRING, U_DASH, U_SEMICOLON, U_SLASH, U_X_LOWER, U_X_UPPER, U_PERCENT
|
||
|
|
from .unicode_util import Category, Block, is_letter
|
||
|
|
from .phonenumberutil import _MAX_LENGTH_FOR_NSN, _MAX_LENGTH_COUNTRY_CODE
|
||
|
|
from .phonenumberutil import _VALID_PUNCTUATION, _PLUS_CHARS, NON_DIGITS_PATTERN
|
||
|
|
from .phonenumberutil import _EXTN_PATTERNS_FOR_MATCHING, _REGEX_FLAGS
|
||
|
|
from .phonenumberutil import _SECOND_NUMBER_START_PATTERN, _UNWANTED_END_CHAR_PATTERN
|
||
|
|
from .phonenumberutil import MatchType, NumberParseException, PhoneNumberFormat
|
||
|
|
from .phonenumberutil import is_possible_number, is_valid_number, parse
|
||
|
|
from .phonenumberutil import normalize_digits_only, national_significant_number
|
||
|
|
from .phonenumberutil import _format_nsn_using_pattern, ndd_prefix_for_region
|
||
|
|
from .phonenumberutil import format_number, is_number_match, region_code_for_country_code
|
||
|
|
from .phonenumberutil import _maybe_strip_national_prefix_carrier_code
|
||
|
|
from .phonenumberutil import _choose_formatting_pattern_for_number
|
||
|
|
from .phonenumberutil import _formatting_rule_has_first_group_only
|
||
|
|
from .phonenumber import CountryCodeSource
|
||
|
|
from .phonemetadata import PhoneMetadata
|
||
|
|
|
||
|
|
# Import auto-generated data structures
|
||
|
|
try:
|
||
|
|
from .data import _ALT_NUMBER_FORMATS
|
||
|
|
except ImportError: # pragma no cover
|
||
|
|
# Before the generated code exists, the data/ directory is empty.
|
||
|
|
# The generation process imports this module, creating a circular
|
||
|
|
# dependency. The hack below works around this.
|
||
|
|
import os
|
||
|
|
import sys
|
||
|
|
if os.path.basename(sys.argv[0]) in ("buildmetadatafromxml.py", "buildprefixdata.py"):
|
||
|
|
prnt("Failed to import generated data (but OK as during autogeneration)", file=sys.stderr)
|
||
|
|
_ALT_NUMBER_FORMATS = {}
|
||
|
|
else:
|
||
|
|
raise
|
||
|
|
|
||
|
|
|
||
|
|
def _limit(lower, upper):
|
||
|
|
"""Returns a regular expression quantifier with an upper and lower limit."""
|
||
|
|
if ((lower < 0) or (upper <= 0) or (upper < lower)):
|
||
|
|
raise Exception("Illegal argument to _limit")
|
||
|
|
return unicod("{%d,%d}") % (lower, upper)
|
||
|
|
|
||
|
|
|
||
|
|
# Build the MATCHING_BRACKETS and PATTERN regular expression patterns. The
|
||
|
|
# building blocks below exist to make the patterns more easily understood.
|
||
|
|
_OPENING_PARENS = u("(\\[\uFF08\uFF3B")
|
||
|
|
_CLOSING_PARENS = u(")\\]\uFF09\uFF3D")
|
||
|
|
_NON_PARENS = u("[^") + _OPENING_PARENS + _CLOSING_PARENS + u("]")
|
||
|
|
# Limit on the number of pairs of brackets in a phone number.
|
||
|
|
_BRACKET_PAIR_LIMIT = _limit(0, 3)
|
||
|
|
|
||
|
|
# Pattern to check that brackets match. Opening brackets should be closed
|
||
|
|
# within a phone number. This also checks that there is something inside the
|
||
|
|
# brackets. Having no brackets at all is also fine.
|
||
|
|
#
|
||
|
|
# An opening bracket at the beginning may not be closed, but subsequent ones
|
||
|
|
# should be. It's also possible that the leading bracket was dropped, so we
|
||
|
|
# shouldn't be surprised if we see a closing bracket first. We limit the sets
|
||
|
|
# of brackets in a phone number to four.
|
||
|
|
_MATCHING_BRACKETS = re.compile(u("(?:[") + _OPENING_PARENS + u("])?") + u("(?:") + _NON_PARENS + u("+") +
|
||
|
|
u("[") + _CLOSING_PARENS + u("])?") +
|
||
|
|
_NON_PARENS + u("+") +
|
||
|
|
u("(?:[") + _OPENING_PARENS + u("]") + _NON_PARENS +
|
||
|
|
u("+[") + _CLOSING_PARENS + u("])") + _BRACKET_PAIR_LIMIT +
|
||
|
|
_NON_PARENS + u("*"))
|
||
|
|
|
||
|
|
# Limit on the number of leading (plus) characters.
|
||
|
|
_LEAD_LIMIT = _limit(0, 2)
|
||
|
|
# Limit on the number of consecutive punctuation characters.
|
||
|
|
_PUNCTUATION_LIMIT = _limit(0, 4)
|
||
|
|
# The maximum number of digits allowed in a digit-separated block. As we allow
|
||
|
|
# all digits in a single block, set high enough to accommodate the entire
|
||
|
|
# national number and the international country code.
|
||
|
|
_DIGIT_BLOCK_LIMIT = (_MAX_LENGTH_FOR_NSN + _MAX_LENGTH_COUNTRY_CODE)
|
||
|
|
# Limit on the number of blocks separated by punctuation. Use _DIGIT_BLOCK_LIMIT
|
||
|
|
# since some formats use spaces to separate each digit.
|
||
|
|
_BLOCK_LIMIT = _limit(0, _DIGIT_BLOCK_LIMIT)
|
||
|
|
|
||
|
|
# A punctuation sequence allowing white space.
|
||
|
|
_PUNCTUATION = u("[") + _VALID_PUNCTUATION + u("]") + _PUNCTUATION_LIMIT
|
||
|
|
# A digits block without punctuation.
|
||
|
|
_DIGIT_SEQUENCE = u("\\d") + _limit(1, _DIGIT_BLOCK_LIMIT)
|
||
|
|
# Punctuation that may be at the start of a phone number - brackets and plus signs.
|
||
|
|
_LEAD_CLASS_CHARS = _OPENING_PARENS + _PLUS_CHARS
|
||
|
|
_LEAD_CLASS = u("[") + _LEAD_CLASS_CHARS + u("]")
|
||
|
|
_LEAD_PATTERN = re.compile(_LEAD_CLASS)
|
||
|
|
|
||
|
|
# Phone number pattern allowing optional punctuation.
|
||
|
|
# This is the phone number pattern used by _find(), similar to
|
||
|
|
# phonenumberutil._VALID_PHONE_NUMBER, but with the following differences:
|
||
|
|
# - All captures are limited in order to place an upper bound to the text
|
||
|
|
# matched by the pattern.
|
||
|
|
# - Leading punctuation / plus signs are limited.
|
||
|
|
# - Consecutive occurrences of punctuation are limited.
|
||
|
|
# - Number of digits is limited.
|
||
|
|
# - No whitespace is allowed at the start or end.
|
||
|
|
# - No alpha digits (vanity numbers such as 1-800-SIX-FLAGS) are currently
|
||
|
|
# supported.
|
||
|
|
_PATTERN = re.compile(u("(?:") + _LEAD_CLASS + _PUNCTUATION + u(")") + _LEAD_LIMIT +
|
||
|
|
_DIGIT_SEQUENCE + u("(?:") + _PUNCTUATION + _DIGIT_SEQUENCE + u(")") + _BLOCK_LIMIT +
|
||
|
|
u("(?:") + _EXTN_PATTERNS_FOR_MATCHING + u(")?"),
|
||
|
|
_REGEX_FLAGS)
|
||
|
|
|
||
|
|
# Matches strings that look like publication pages. Example: "Computing
|
||
|
|
# Complete Answers to Queries in the Presence of Limited Access Patterns.
|
||
|
|
# Chen Li. VLDB J. 12(3): 211-227 (2003)."
|
||
|
|
#
|
||
|
|
# The string "211-227 (2003)" is not a telephone number.
|
||
|
|
_PUB_PAGES = re.compile(u("\\d{1,5}-+\\d{1,5}\\s{0,4}\\(\\d{1,4}"))
|
||
|
|
|
||
|
|
# Matches strings that look like dates using "/" as a separator. Examples:
|
||
|
|
# 3/10/2011, 31/10/96 or 08/31/95.
|
||
|
|
_SLASH_SEPARATED_DATES = re.compile(u("(?:(?:[0-3]?\\d/[01]?\\d)|(?:[01]?\\d/[0-3]?\\d))/(?:[12]\\d)?\\d{2}"))
|
||
|
|
|
||
|
|
# Matches timestamps. Examples: "2012-01-02 08:00". Note that the reg-ex does
|
||
|
|
# not include the trailing ":\d\d" -- that is covered by TIME_STAMPS_SUFFIX.
|
||
|
|
_TIME_STAMPS = re.compile(u("[12]\\d{3}[-/]?[01]\\d[-/]?[0-3]\\d +[0-2]\\d$"))
|
||
|
|
_TIME_STAMPS_SUFFIX = re.compile(u(":[0-5]\\d"))
|
||
|
|
|
||
|
|
# Patterns used to extract phone numbers from a larger phone-number-like
|
||
|
|
# pattern. These are ordered according to specificity. For example,
|
||
|
|
# white-space is last since that is frequently used in numbers, not just to
|
||
|
|
# separate two numbers. We have separate patterns since we don't want to break
|
||
|
|
# up the phone-number-like text on more than one different kind of symbol at
|
||
|
|
# one time, although symbols of the same type (e.g. space) can be safely
|
||
|
|
# grouped together.
|
||
|
|
#
|
||
|
|
# Note that if there is a match, we will always check any text found up to the
|
||
|
|
# first match as well.
|
||
|
|
_INNER_MATCHES = (
|
||
|
|
# Breaks on the slash - e.g. "651-234-2345/332-445-1234"
|
||
|
|
re.compile(u("/+(.*)")),
|
||
|
|
# Note that the bracket here is inside the capturing group, since we
|
||
|
|
# consider it part of the phone number. Will match a pattern like "(650)
|
||
|
|
# 223 3345 (754) 223 3321".
|
||
|
|
re.compile(u("(\\([^(]*)")),
|
||
|
|
# Breaks on a hyphen - e.g. "12345 - 332-445-1234 is my number." We
|
||
|
|
# require a space on either side of the hyphen for it to be considered a
|
||
|
|
# separator.
|
||
|
|
re.compile(u("(?u)(?:\\s-|-\\s)\\s*(.+)")),
|
||
|
|
# Various types of wide hyphens. Note we have decided not to enforce a
|
||
|
|
# space here, since it's possible that it's supposed to be used to break
|
||
|
|
# two numbers without spaces, and we haven't seen many instances of it
|
||
|
|
# used within a number.
|
||
|
|
re.compile(u("(?u)[\u2012-\u2015\uFF0D]\\s*(.+)")),
|
||
|
|
# Breaks on a full stop - e.g. "12345. 332-445-1234 is my number."
|
||
|
|
re.compile(u("(?u)\\.+\\s*([^.]+)")),
|
||
|
|
# Breaks on space - e.g. "3324451234 8002341234"
|
||
|
|
re.compile(u("(?u)\\s+(\\S+)")))
|
||
|
|
|
||
|
|
|
||
|
|
class Leniency(object):
|
||
|
|
"""Leniency when finding potential phone numbers in text segments.
|
||
|
|
|
||
|
|
The levels here are ordered in increasing strictness."""
|
||
|
|
# Phone numbers accepted are possible (i.e. is_possible_number(number)) but
|
||
|
|
# not necessarily valid (is_valid_number(number)).
|
||
|
|
POSSIBLE = 0
|
||
|
|
# Phone numbers accepted are both possible (is_possible_number(number))
|
||
|
|
# and valid (is_valid_number(PhoneNumber)). Numbers written in national
|
||
|
|
# format must have their national-prefix present if it is usually written
|
||
|
|
# for a number of this type.
|
||
|
|
VALID = 1
|
||
|
|
# Phone numbers accepted are valid (i.e. is_valid_number(number)) and are
|
||
|
|
# grouped in a possible way for this locale. For example, a US number
|
||
|
|
# written as "65 02 53 00 00" and "650253 0000" are not accepted at this
|
||
|
|
# leniency level, whereas "650 253 0000", "650 2530000" or "6502530000"
|
||
|
|
# are.
|
||
|
|
# Numbers with more than one '/' symbol in the national significant number
|
||
|
|
# are also dropped at this level.
|
||
|
|
#
|
||
|
|
# Warning: This level might result in lower coverage especially for
|
||
|
|
# regions outside of country code "+1". If you are not sure about which
|
||
|
|
# level to use, email the discussion group
|
||
|
|
# libphonenumber-discuss@googlegroups.com.
|
||
|
|
STRICT_GROUPING = 2
|
||
|
|
# Phone numbers accepted are valid (i.e. is_valid_number(number)) and are
|
||
|
|
# grouped in the same way that we would have formatted it, or as a single
|
||
|
|
# block. For example, a US number written as "650 2530000" is not accepted
|
||
|
|
# at this leniency level, whereas "650 253 0000" or "6502530000" are.
|
||
|
|
# Numbers with more than one '/' symbol are also dropped at this level.
|
||
|
|
# Warning: This level might result in lower coverage especially for
|
||
|
|
# regions outside of country code "+1". If you are not sure about which
|
||
|
|
# level to use, email the discussion group
|
||
|
|
# libphonenumber-discuss@googlegroups.com.
|
||
|
|
EXACT_GROUPING = 3
|
||
|
|
|
||
|
|
|
||
|
|
def _verify(leniency, numobj, candidate, matcher):
|
||
|
|
"""Returns True if number is a verified number according to the
|
||
|
|
leniency."""
|
||
|
|
if leniency == Leniency.POSSIBLE:
|
||
|
|
return is_possible_number(numobj)
|
||
|
|
elif leniency == Leniency.VALID:
|
||
|
|
if (not is_valid_number(numobj) or
|
||
|
|
not _contains_only_valid_x_chars(numobj, candidate)):
|
||
|
|
return False
|
||
|
|
return _is_national_prefix_present_if_required(numobj)
|
||
|
|
elif leniency == Leniency.STRICT_GROUPING:
|
||
|
|
return _verify_strict_grouping(numobj, candidate, matcher)
|
||
|
|
elif leniency == Leniency.EXACT_GROUPING:
|
||
|
|
return _verify_exact_grouping(numobj, candidate, matcher)
|
||
|
|
else:
|
||
|
|
raise Exception("Error: unsupported Leniency value %s" % leniency)
|
||
|
|
|
||
|
|
|
||
|
|
def _verify_strict_grouping(numobj, candidate, matcher):
|
||
|
|
if (not is_valid_number(numobj) or
|
||
|
|
not _contains_only_valid_x_chars(numobj, candidate) or
|
||
|
|
_contains_more_than_one_slash_in_national_number(numobj, candidate) or
|
||
|
|
not _is_national_prefix_present_if_required(numobj)):
|
||
|
|
return False
|
||
|
|
return matcher._check_number_grouping_is_valid(numobj, candidate,
|
||
|
|
_all_number_groups_remain_grouped)
|
||
|
|
|
||
|
|
|
||
|
|
def _all_number_groups_remain_grouped(numobj, normalized_candidate, formatted_number_groups):
|
||
|
|
"""Returns True if the groups of digits found in our candidate phone number match our
|
||
|
|
expectations.
|
||
|
|
|
||
|
|
Arguments:
|
||
|
|
numobj -- the original number we found when parsing
|
||
|
|
normalized_candidate -- the candidate number, normalized to only contain ASCII digits,
|
||
|
|
but with non-digits (spaces etc) retained
|
||
|
|
expected_number_groups -- the groups of digits that we would expect to see if we
|
||
|
|
formatted this number
|
||
|
|
Returns True if expectations matched.
|
||
|
|
"""
|
||
|
|
from_index = 0
|
||
|
|
if numobj.country_code_source != CountryCodeSource.FROM_DEFAULT_COUNTRY:
|
||
|
|
# First skip the country code if the normalized candidate contained it.
|
||
|
|
country_code = str(numobj.country_code)
|
||
|
|
from_index = normalized_candidate.find(country_code) + len(country_code)
|
||
|
|
# Check each group of consecutive digits are not broken into separate
|
||
|
|
# groupings in the candidate string.
|
||
|
|
for ii, formatted_number_group in enumerate(formatted_number_groups):
|
||
|
|
# Fails if the substring of normalized_candidate starting from
|
||
|
|
# from_index doesn't contain the consecutive digits in
|
||
|
|
# formatted_number_group.
|
||
|
|
from_index = normalized_candidate.find(formatted_number_group, from_index)
|
||
|
|
if from_index < 0:
|
||
|
|
return False
|
||
|
|
# Moves from_index forward.
|
||
|
|
from_index += len(formatted_number_group)
|
||
|
|
if (ii == 0 and from_index < len(normalized_candidate)):
|
||
|
|
# We are at the position right after the NDC. We get the region
|
||
|
|
# used for formatting information based on the country code in the
|
||
|
|
# phone number, rather than the number itself, as we do not need
|
||
|
|
# to distinguish between different countries with the same country
|
||
|
|
# calling code and this is faster.
|
||
|
|
region = region_code_for_country_code(numobj.country_code)
|
||
|
|
if (ndd_prefix_for_region(region, True) is not None and
|
||
|
|
normalized_candidate[from_index].isdigit()):
|
||
|
|
# This means there is no formatting symbol after the NDC. In
|
||
|
|
# this case, we only accept the number if there is no
|
||
|
|
# formatting symbol at all in the number, except for
|
||
|
|
# extensions. This is only important for countries with
|
||
|
|
# national prefixes.
|
||
|
|
nsn = national_significant_number(numobj)
|
||
|
|
return normalized_candidate[(from_index - len(formatted_number_group)):].startswith(nsn)
|
||
|
|
# The check here makes sure that we haven't mistakenly already used the extension to
|
||
|
|
# match the last group of the subscriber number. Note the extension cannot have
|
||
|
|
# formatting in-between digits.
|
||
|
|
return (normalized_candidate[from_index:].find(numobj.extension or U_EMPTY_STRING) != -1)
|
||
|
|
|
||
|
|
|
||
|
|
def _verify_exact_grouping(numobj, candidate, matcher):
|
||
|
|
if (not is_valid_number(numobj) or
|
||
|
|
not _contains_only_valid_x_chars(numobj, candidate) or
|
||
|
|
_contains_more_than_one_slash_in_national_number(numobj, candidate) or
|
||
|
|
not _is_national_prefix_present_if_required(numobj)):
|
||
|
|
return False
|
||
|
|
return matcher._check_number_grouping_is_valid(numobj, candidate,
|
||
|
|
_all_number_groups_are_exactly_present)
|
||
|
|
|
||
|
|
|
||
|
|
def _all_number_groups_are_exactly_present(numobj, normalized_candidate, formatted_number_groups):
|
||
|
|
"""Returns True if the groups of digits found in our candidate phone number match our
|
||
|
|
expectations.
|
||
|
|
|
||
|
|
Arguments:
|
||
|
|
numobj -- the original number we found when parsing
|
||
|
|
normalized_candidate -- the candidate number, normalized to only contain ASCII digits,
|
||
|
|
but with non-digits (spaces etc) retained
|
||
|
|
expected_number_groups -- the groups of digits that we would expect to see if we
|
||
|
|
formatted this number
|
||
|
|
Returns True if expectations matched.
|
||
|
|
"""
|
||
|
|
candidate_groups = re.split(NON_DIGITS_PATTERN, normalized_candidate)
|
||
|
|
# Set this to the last group, skipping it if the number has an extension.
|
||
|
|
if numobj.extension is not None:
|
||
|
|
candidate_number_group_index = len(candidate_groups) - 2
|
||
|
|
else:
|
||
|
|
candidate_number_group_index = len(candidate_groups) - 1
|
||
|
|
# First we check if the national significant number is formatted as a
|
||
|
|
# block. We use contains and not equals, since the national significant
|
||
|
|
# number may be present with a prefix such as a national number prefix, or
|
||
|
|
# the country code itself.
|
||
|
|
if (len(candidate_groups) == 1 or
|
||
|
|
candidate_groups[candidate_number_group_index].find(national_significant_number(numobj)) != -1):
|
||
|
|
return True
|
||
|
|
# Starting from the end, go through in reverse, excluding the first group,
|
||
|
|
# and check the candidate and number groups are the same.
|
||
|
|
formatted_number_group_index = len(formatted_number_groups) - 1
|
||
|
|
while (formatted_number_group_index > 0 and candidate_number_group_index >= 0):
|
||
|
|
if (candidate_groups[candidate_number_group_index] !=
|
||
|
|
formatted_number_groups[formatted_number_group_index]):
|
||
|
|
return False
|
||
|
|
formatted_number_group_index -= 1
|
||
|
|
candidate_number_group_index -= 1
|
||
|
|
# Now check the first group. There may be a national prefix at the start, so we only check
|
||
|
|
# that the candidate group ends with the formatted number group.
|
||
|
|
return (candidate_number_group_index >= 0 and
|
||
|
|
candidate_groups[candidate_number_group_index].endswith(formatted_number_groups[0]))
|
||
|
|
|
||
|
|
|
||
|
|
def _get_national_number_groups_without_pattern(numobj):
|
||
|
|
"""Helper method to get the national-number part of a number, formatted without any national
|
||
|
|
prefix, and return it as a set of digit blocks that would be formatted together following
|
||
|
|
standard formatting rules."""
|
||
|
|
# This will be in the format +CC-DG1-DG2-DGX;ext=EXT where DG1..DGX represents groups of
|
||
|
|
# digits.
|
||
|
|
rfc3966_format = format_number(numobj, PhoneNumberFormat.RFC3966)
|
||
|
|
# We remove the extension part from the formatted string before splitting
|
||
|
|
# it into different groups.
|
||
|
|
end_index = rfc3966_format.find(U_SEMICOLON)
|
||
|
|
if end_index < 0:
|
||
|
|
end_index = len(rfc3966_format)
|
||
|
|
|
||
|
|
# The country-code will have a '-' following it.
|
||
|
|
start_index = rfc3966_format.find(U_DASH) + 1
|
||
|
|
return rfc3966_format[start_index:end_index].split(U_DASH)
|
||
|
|
|
||
|
|
|
||
|
|
def _get_national_number_groups(numobj, formatting_pattern):
|
||
|
|
"""Helper method to get the national-number part of a number, formatted without any national
|
||
|
|
prefix, and return it as a set of digit blocks that should be formatted together according to
|
||
|
|
the formatting pattern passed in."""
|
||
|
|
# If a format is provided, we format the NSN only, and split that according to the separator.
|
||
|
|
nsn = national_significant_number(numobj)
|
||
|
|
return _format_nsn_using_pattern(nsn, formatting_pattern,
|
||
|
|
PhoneNumberFormat.RFC3966).split(U_DASH)
|
||
|
|
|
||
|
|
|
||
|
|
def _contains_more_than_one_slash_in_national_number(numobj, candidate):
|
||
|
|
first_slash_in_body_index = candidate.find(U_SLASH)
|
||
|
|
if first_slash_in_body_index < 0:
|
||
|
|
# No slashes, this is okay.
|
||
|
|
return False
|
||
|
|
# Now look for a second one.
|
||
|
|
second_slash_in_body_index = candidate.find(U_SLASH, first_slash_in_body_index + 1)
|
||
|
|
if second_slash_in_body_index < 0:
|
||
|
|
# Only one slash, this is okay.,
|
||
|
|
return False
|
||
|
|
|
||
|
|
# If the first slash is after the country calling code, this is permitted.
|
||
|
|
candidate_has_country_code = (numobj.country_code_source == CountryCodeSource.FROM_NUMBER_WITH_PLUS_SIGN or
|
||
|
|
numobj.country_code_source == CountryCodeSource.FROM_NUMBER_WITHOUT_PLUS_SIGN)
|
||
|
|
if (candidate_has_country_code and
|
||
|
|
normalize_digits_only(candidate[:first_slash_in_body_index]) ==
|
||
|
|
unicod(numobj.country_code)):
|
||
|
|
# Any more slashes and this is illegal.
|
||
|
|
return (candidate[(second_slash_in_body_index + 1):].find(U_SLASH) != -1)
|
||
|
|
return True
|
||
|
|
|
||
|
|
|
||
|
|
def _contains_only_valid_x_chars(numobj, candidate):
|
||
|
|
# The characters 'x' and 'X' can be (1) a carrier code, in which case they
|
||
|
|
# always precede the national significant number or (2) an extension sign,
|
||
|
|
# in which case they always precede the extension number. We assume a
|
||
|
|
# carrier code is more than 1 digit, so the first case has to have more
|
||
|
|
# than 1 consecutive 'x' or 'X', whereas the second case can only have
|
||
|
|
# exactly 1 'x' or 'X'. We ignore the character if it appears as the last
|
||
|
|
# character of the string.
|
||
|
|
ii = 0
|
||
|
|
while ii < (len(candidate) - 1):
|
||
|
|
if (candidate[ii] == U_X_LOWER or candidate[ii] == U_X_UPPER):
|
||
|
|
next_char = candidate[ii + 1]
|
||
|
|
if (next_char == U_X_LOWER or next_char == U_X_UPPER):
|
||
|
|
# This is the carrier code case, in which the 'X's always
|
||
|
|
# precede the national significant number.
|
||
|
|
ii += 1
|
||
|
|
if is_number_match(numobj, candidate[ii:]) != MatchType.NSN_MATCH:
|
||
|
|
return False
|
||
|
|
# This is the extension sign case, in which the 'x' or 'X' should
|
||
|
|
# always precede the extension number.
|
||
|
|
elif normalize_digits_only(candidate[ii:]) != numobj.extension:
|
||
|
|
return False
|
||
|
|
ii += 1
|
||
|
|
return True
|
||
|
|
|
||
|
|
|
||
|
|
def _is_national_prefix_present_if_required(numobj):
|
||
|
|
# First, check how we deduced the country code. If it was written in
|
||
|
|
# international format, then the national prefix is not required.
|
||
|
|
if numobj.country_code_source != CountryCodeSource.FROM_DEFAULT_COUNTRY:
|
||
|
|
return True
|
||
|
|
phone_number_region = region_code_for_country_code(numobj.country_code)
|
||
|
|
metadata = PhoneMetadata.metadata_for_region(phone_number_region, None)
|
||
|
|
if metadata is None:
|
||
|
|
return True
|
||
|
|
# Check if a national prefix should be present when formatting this number.
|
||
|
|
national_number = national_significant_number(numobj)
|
||
|
|
format_rule = _choose_formatting_pattern_for_number(metadata.number_format,
|
||
|
|
national_number)
|
||
|
|
# To do this, we check that a national prefix formatting rule was present
|
||
|
|
# and that it wasn't just the first-group symbol ($1) with punctuation.
|
||
|
|
if (format_rule is not None and
|
||
|
|
format_rule.national_prefix_formatting_rule):
|
||
|
|
if format_rule.national_prefix_optional_when_formatting:
|
||
|
|
# The national-prefix is optional in these cases, so we don't need
|
||
|
|
# to check if it was present.
|
||
|
|
return True
|
||
|
|
if _formatting_rule_has_first_group_only(format_rule.national_prefix_formatting_rule):
|
||
|
|
# National Prefix not needed for this number.
|
||
|
|
return True
|
||
|
|
# Normalize the remainder.
|
||
|
|
raw_input = normalize_digits_only(numobj.raw_input)
|
||
|
|
# Check if we found a national prefix and/or carrier code at the start of the raw input,
|
||
|
|
# and return the result.
|
||
|
|
return _maybe_strip_national_prefix_carrier_code(raw_input, metadata)[2]
|
||
|
|
return True
|
||
|
|
|
||
|
|
|
||
|
|
class PhoneNumberMatcher(object):
|
||
|
|
"""A stateful class that finds and extracts telephone numbers from text.
|
||
|
|
|
||
|
|
Vanity numbers (phone numbers using alphabetic digits such as '1-800-SIX-FLAGS' are
|
||
|
|
not found.
|
||
|
|
|
||
|
|
This class is not thread-safe.
|
||
|
|
"""
|
||
|
|
# The potential states of a PhoneNumberMatcher.
|
||
|
|
_NOT_READY = 0
|
||
|
|
_READY = 1
|
||
|
|
_DONE = 2
|
||
|
|
|
||
|
|
def __init__(self, text, region,
|
||
|
|
leniency=Leniency.VALID, max_tries=65535):
|
||
|
|
"""Creates a new instance.
|
||
|
|
|
||
|
|
Arguments:
|
||
|
|
text -- The character sequence that we will search, None for no text.
|
||
|
|
country -- The country to assume for phone numbers not written in
|
||
|
|
international format (with a leading plus, or with the
|
||
|
|
international dialing prefix of the specified region). May be
|
||
|
|
None or "ZZ" if only numbers with a leading plus should be
|
||
|
|
considered.
|
||
|
|
leniency -- The leniency to use when evaluating candidate phone
|
||
|
|
numbers.
|
||
|
|
max_tries -- The maximum number of invalid numbers to try before
|
||
|
|
giving up on the text. This is to cover degenerate cases where
|
||
|
|
the text has a lot of false positives in it. Must be >= 0.
|
||
|
|
"""
|
||
|
|
if leniency is None:
|
||
|
|
raise ValueError("Need a leniency value")
|
||
|
|
if int(max_tries) < 0:
|
||
|
|
raise ValueError("Need max_tries to be positive int")
|
||
|
|
# The text searched for phone numbers.
|
||
|
|
self.text = text
|
||
|
|
if self.text is None:
|
||
|
|
self.text = U_EMPTY_STRING
|
||
|
|
# The region (country) to assume for phone numbers without an
|
||
|
|
# international prefix, possibly None.
|
||
|
|
self.preferred_region = region
|
||
|
|
# The degree of validation requested.
|
||
|
|
self.leniency = leniency
|
||
|
|
# The maximum number of retries after matching an invalid number.
|
||
|
|
self._max_tries = int(max_tries)
|
||
|
|
# The iteration tristate.
|
||
|
|
self._state = PhoneNumberMatcher._NOT_READY
|
||
|
|
# The last successful match, None unless in state _READY
|
||
|
|
self._last_match = None
|
||
|
|
# The next index to start searching at. Undefined in state _DONE
|
||
|
|
self._search_index = 0
|
||
|
|
|
||
|
|
def _find(self, index):
|
||
|
|
"""Attempts to find the next subsequence in the searched sequence on or after index
|
||
|
|
that represents a phone number. Returns the next match, None if none was found.
|
||
|
|
|
||
|
|
Arguments:
|
||
|
|
index -- The search index to start searching at.
|
||
|
|
Returns the phone number match found, None if none can be found.
|
||
|
|
"""
|
||
|
|
match = _PATTERN.search(self.text, index)
|
||
|
|
while self._max_tries > 0 and match is not None:
|
||
|
|
start = match.start()
|
||
|
|
candidate = self.text[start:match.end()]
|
||
|
|
|
||
|
|
# Check for extra numbers at the end.
|
||
|
|
# TODO: This is the place to start when trying to support
|
||
|
|
# extraction of multiple phone number from split notations (+41 79
|
||
|
|
# 123 45 67 / 68).
|
||
|
|
candidate = self._trim_after_first_match(_SECOND_NUMBER_START_PATTERN,
|
||
|
|
candidate)
|
||
|
|
|
||
|
|
match = self._extract_match(candidate, start)
|
||
|
|
if match is not None:
|
||
|
|
return match
|
||
|
|
# Move along
|
||
|
|
index = start + len(candidate)
|
||
|
|
self._max_tries -= 1
|
||
|
|
match = _PATTERN.search(self.text, index)
|
||
|
|
return None
|
||
|
|
|
||
|
|
def _trim_after_first_match(self, pattern, candidate):
|
||
|
|
"""Trims away any characters after the first match of pattern in
|
||
|
|
candidate, returning the trimmed version."""
|
||
|
|
trailing_chars_match = pattern.search(candidate)
|
||
|
|
if trailing_chars_match:
|
||
|
|
candidate = candidate[:trailing_chars_match.start()]
|
||
|
|
return candidate
|
||
|
|
|
||
|
|
@classmethod
|
||
|
|
def _is_latin_letter(cls, letter):
|
||
|
|
"""Helper method to determine if a character is a Latin-script letter
|
||
|
|
or not. For our purposes, combining marks should also return True
|
||
|
|
since we assume they have been added to a preceding Latin character."""
|
||
|
|
# Combining marks are a subset of non-spacing-mark
|
||
|
|
if (not is_letter(letter) and
|
||
|
|
Category.get(letter) != Category.NON_SPACING_MARK):
|
||
|
|
return False
|
||
|
|
block = Block.get(letter)
|
||
|
|
return (block == Block.BASIC_LATIN or
|
||
|
|
block == Block.LATIN_1_SUPPLEMENT or
|
||
|
|
block == Block.LATIN_EXTENDED_A or
|
||
|
|
block == Block.LATIN_EXTENDED_ADDITIONAL or
|
||
|
|
block == Block.LATIN_EXTENDED_B or
|
||
|
|
block == Block.COMBINING_DIACRITICAL_MARKS)
|
||
|
|
|
||
|
|
@classmethod
|
||
|
|
def _is_invalid_punctuation_symbol(cls, character):
|
||
|
|
return (character == U_PERCENT or
|
||
|
|
Category.get(character) == Category.CURRENCY_SYMBOL)
|
||
|
|
|
||
|
|
def _extract_match(self, candidate, offset):
|
||
|
|
"""Attempts to extract a match from a candidate string.
|
||
|
|
|
||
|
|
Arguments:
|
||
|
|
candidate -- The candidate text that might contain a phone number.
|
||
|
|
offset -- The offset of candidate within self.text
|
||
|
|
Returns the match found, None if none can be found
|
||
|
|
"""
|
||
|
|
# Skip a match that is more likely a publication page reference or a
|
||
|
|
# date.
|
||
|
|
if (_SLASH_SEPARATED_DATES.search(candidate)):
|
||
|
|
return None
|
||
|
|
|
||
|
|
# Skip potential time-stamps.
|
||
|
|
if _TIME_STAMPS.search(candidate):
|
||
|
|
following_text = self.text[offset + len(candidate):]
|
||
|
|
if _TIME_STAMPS_SUFFIX.match(following_text):
|
||
|
|
return None
|
||
|
|
|
||
|
|
# Try to come up with a valid match given the entire candidate.
|
||
|
|
match = self._parse_and_verify(candidate, offset)
|
||
|
|
if match is not None:
|
||
|
|
return match
|
||
|
|
|
||
|
|
# If that failed, try to find an "inner match" -- there might be a
|
||
|
|
# phone number within this candidate.
|
||
|
|
return self._extract_inner_match(candidate, offset)
|
||
|
|
|
||
|
|
def _extract_inner_match(self, candidate, offset):
|
||
|
|
"""Attempts to extract a match from candidate if the whole candidate
|
||
|
|
does not qualify as a match.
|
||
|
|
|
||
|
|
Arguments:
|
||
|
|
candidate -- The candidate text that might contain a phone number
|
||
|
|
offset -- The current offset of candidate within text
|
||
|
|
Returns the match found, None if none can be found
|
||
|
|
"""
|
||
|
|
for possible_inner_match in _INNER_MATCHES:
|
||
|
|
group_match = possible_inner_match.search(candidate)
|
||
|
|
is_first_match = True
|
||
|
|
while group_match and self._max_tries > 0:
|
||
|
|
if is_first_match:
|
||
|
|
# We should handle any group before this one too.
|
||
|
|
group = self._trim_after_first_match(_UNWANTED_END_CHAR_PATTERN,
|
||
|
|
candidate[:group_match.start()])
|
||
|
|
match = self._parse_and_verify(group, offset)
|
||
|
|
if match is not None:
|
||
|
|
return match
|
||
|
|
self._max_tries -= 1
|
||
|
|
is_first_match = False
|
||
|
|
group = self._trim_after_first_match(_UNWANTED_END_CHAR_PATTERN,
|
||
|
|
group_match.group(1))
|
||
|
|
match = self._parse_and_verify(group, offset + group_match.start(1))
|
||
|
|
if match is not None:
|
||
|
|
return match
|
||
|
|
self._max_tries -= 1
|
||
|
|
group_match = possible_inner_match.search(candidate, group_match.start() + 1)
|
||
|
|
return None
|
||
|
|
|
||
|
|
def _parse_and_verify(self, candidate, offset):
|
||
|
|
"""Parses a phone number from the candidate using phonenumberutil.parse and
|
||
|
|
verifies it matches the requested leniency. If parsing and verification succeed, a
|
||
|
|
corresponding PhoneNumberMatch is returned, otherwise this method returns None.
|
||
|
|
|
||
|
|
Arguments:
|
||
|
|
candidate -- The candidate match.
|
||
|
|
offset -- The offset of candidate within self.text.
|
||
|
|
Returns the parsed and validated phone number match, or None.
|
||
|
|
"""
|
||
|
|
try:
|
||
|
|
# Check the candidate doesn't contain any formatting which would
|
||
|
|
# indicate that it really isn't a phone number.
|
||
|
|
if (not fullmatch(_MATCHING_BRACKETS, candidate) or _PUB_PAGES.search(candidate)):
|
||
|
|
return None
|
||
|
|
|
||
|
|
# If leniency is set to VALID or stricter, we also want to skip
|
||
|
|
# numbers that are surrounded by Latin alphabetic characters, to
|
||
|
|
# skip cases like abc8005001234 or 8005001234def.
|
||
|
|
if self.leniency >= Leniency.VALID:
|
||
|
|
# If the candidate is not at the start of the text, and does
|
||
|
|
# not start with phone-number punctuation, check the previous
|
||
|
|
# character
|
||
|
|
if (offset > 0 and
|
||
|
|
not _LEAD_PATTERN.match(candidate)):
|
||
|
|
previous_char = self.text[offset - 1]
|
||
|
|
# We return None if it is a latin letter or an invalid
|
||
|
|
# punctuation symbol
|
||
|
|
if (self._is_invalid_punctuation_symbol(previous_char) or
|
||
|
|
self._is_latin_letter(previous_char)):
|
||
|
|
return None
|
||
|
|
last_char_index = offset + len(candidate)
|
||
|
|
if last_char_index < len(self.text):
|
||
|
|
next_char = self.text[last_char_index]
|
||
|
|
if (self._is_invalid_punctuation_symbol(next_char) or
|
||
|
|
self._is_latin_letter(next_char)):
|
||
|
|
return None
|
||
|
|
|
||
|
|
numobj = parse(candidate, self.preferred_region, keep_raw_input=True)
|
||
|
|
if _verify(self.leniency, numobj, candidate, self):
|
||
|
|
# We used parse(keep_raw_input=True) to create this number,
|
||
|
|
# but for now we don't return the extra values parsed.
|
||
|
|
# TODO: stop clearing all values here and switch all users
|
||
|
|
# over to using raw_input rather than the raw_string of
|
||
|
|
# PhoneNumberMatch.
|
||
|
|
numobj.country_code_source = CountryCodeSource.UNSPECIFIED
|
||
|
|
numobj.raw_input = None
|
||
|
|
numobj.preferred_domestic_carrier_code = None
|
||
|
|
return PhoneNumberMatch(offset, candidate, numobj)
|
||
|
|
except NumberParseException:
|
||
|
|
# ignore and continue
|
||
|
|
pass
|
||
|
|
return None
|
||
|
|
|
||
|
|
def _check_number_grouping_is_valid(self, numobj, candidate, checker):
|
||
|
|
normalized_candidate = normalize_digits_only(candidate, True) # keep non-digits
|
||
|
|
formatted_number_groups = _get_national_number_groups_without_pattern(numobj)
|
||
|
|
if checker(numobj, normalized_candidate, formatted_number_groups):
|
||
|
|
return True
|
||
|
|
# If this didn't pass, see if there are any alternate formats that match, and try them instead.
|
||
|
|
alternate_formats = _ALT_NUMBER_FORMATS.get(numobj.country_code, None)
|
||
|
|
nsn = national_significant_number(numobj)
|
||
|
|
if alternate_formats is not None:
|
||
|
|
for alternate_format in alternate_formats:
|
||
|
|
if len(alternate_format.leading_digits_pattern) > 0:
|
||
|
|
# There is only one leading digits pattern for alternate formats.
|
||
|
|
pattern = re.compile(alternate_format.leading_digits_pattern[0])
|
||
|
|
if not pattern.match(nsn):
|
||
|
|
# Leading digits don't match; try another one.
|
||
|
|
continue
|
||
|
|
formatted_number_groups = _get_national_number_groups(numobj, alternate_format)
|
||
|
|
if checker(numobj, normalized_candidate, formatted_number_groups):
|
||
|
|
return True
|
||
|
|
return False
|
||
|
|
|
||
|
|
def has_next(self):
|
||
|
|
"""Indicates whether there is another match available"""
|
||
|
|
if self._state == PhoneNumberMatcher._NOT_READY:
|
||
|
|
self._last_match = self._find(self._search_index)
|
||
|
|
if self._last_match is None:
|
||
|
|
self._state = PhoneNumberMatcher._DONE
|
||
|
|
else:
|
||
|
|
self._search_index = self._last_match.end
|
||
|
|
self._state = PhoneNumberMatcher._READY
|
||
|
|
return (self._state == PhoneNumberMatcher._READY)
|
||
|
|
|
||
|
|
def next(self):
|
||
|
|
"""Return the next match; raises Exception if no next match available"""
|
||
|
|
# Check the state and find the next match as a side-effect if necessary.
|
||
|
|
if not self.has_next():
|
||
|
|
raise StopIteration("No next match")
|
||
|
|
# Don't retain that memory any longer than necessary.
|
||
|
|
result = self._last_match
|
||
|
|
self._last_match = None
|
||
|
|
self._state = PhoneNumberMatcher._NOT_READY
|
||
|
|
return result
|
||
|
|
|
||
|
|
def __iter__(self):
|
||
|
|
while self.has_next():
|
||
|
|
yield self.next()
|
||
|
|
|
||
|
|
|
||
|
|
class PhoneNumberMatch(UnicodeMixin):
|
||
|
|
"""The immutable match of a phone number within a piece of text.
|
||
|
|
|
||
|
|
Matches may be found using the find() method of PhoneNumberMatcher.
|
||
|
|
|
||
|
|
A match consists of the phone number (in .number) as well as the .start
|
||
|
|
and .end offsets of the corresponding subsequence of the searched
|
||
|
|
text. Use .raw_string to obtain a copy of the matched subsequence.
|
||
|
|
|
||
|
|
The following annotated example clarifies the relationship between the
|
||
|
|
searched text, the match offsets, and the parsed number:
|
||
|
|
|
||
|
|
>>> text = "Call me at +1 425 882-8080 for details."
|
||
|
|
>>> country = "US"
|
||
|
|
>>> import phonenumbers
|
||
|
|
>>> matcher = phonenumbers.PhoneNumberMatcher(text, country)
|
||
|
|
>>> matcher.has_next()
|
||
|
|
True
|
||
|
|
>>> m = matcher.next() # Find the first phone number match
|
||
|
|
>>> m.raw_string # contains the phone number as it appears in the text.
|
||
|
|
"+1 425 882-8080"
|
||
|
|
>>> (m.start, m.end) # define the range of the matched subsequence.
|
||
|
|
(11, 26)
|
||
|
|
>>> text[m.start, m.end]
|
||
|
|
"+1 425 882-8080"
|
||
|
|
>>> phonenumberutil.parse("+1 425 882-8080", "US") == m.number
|
||
|
|
True
|
||
|
|
"""
|
||
|
|
def __init__(self, start, raw_string, numobj):
|
||
|
|
if start < 0:
|
||
|
|
raise Exception("Start index not >= 0")
|
||
|
|
if raw_string is None or numobj is None:
|
||
|
|
raise Exception("Invalid argument")
|
||
|
|
# The start index into the text.
|
||
|
|
self.start = start
|
||
|
|
# The raw substring matched.
|
||
|
|
self.raw_string = raw_string
|
||
|
|
self.end = self.start + len(raw_string)
|
||
|
|
# The matched phone number.
|
||
|
|
self.number = numobj
|
||
|
|
|
||
|
|
def __eq__(self, other):
|
||
|
|
if not isinstance(other, PhoneNumberMatch):
|
||
|
|
return False
|
||
|
|
return (self.start == other.start and
|
||
|
|
self.raw_string == other.raw_string and
|
||
|
|
self.end == other.end and
|
||
|
|
self.number == other.number)
|
||
|
|
|
||
|
|
def __ne__(self, other):
|
||
|
|
return not self.__eq__(other)
|
||
|
|
|
||
|
|
def __repr__(self):
|
||
|
|
return (unicod("PhoneNumberMatch(start=%r, raw_string=%r, numobj=%r)") %
|
||
|
|
(self.start,
|
||
|
|
self.raw_string,
|
||
|
|
self.number))
|
||
|
|
|
||
|
|
def __unicode__(self):
|
||
|
|
return unicod("PhoneNumberMatch [%s,%s) %s") % (self.start, self.end, self.raw_string)
|