yeastar-voipms-webhook-proxy/env/Lib/site-packages/phonenumbers/phonenumbermatcher.py

777 lines
37 KiB
Python
Raw Normal View History

2025-07-13 22:10:11 -04:00
"""Functionality to match phone numbers in a piece of text"""
# Based on original Java code:
# java/src/com/google/i18n/phonenumbers/PhoneNumberMatch.java
# java/src/com/google/i18n/phonenumbers/PhoneNumberMatcher.java
# Copyright (C) 2011 The Libphonenumber Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import re
# Extra regexp function; see README
from .re_util import fullmatch
from .util import UnicodeMixin, u, unicod, prnt
from .util import U_EMPTY_STRING, U_DASH, U_SEMICOLON, U_SLASH, U_X_LOWER, U_X_UPPER, U_PERCENT
from .unicode_util import Category, Block, is_letter
from .phonenumberutil import _MAX_LENGTH_FOR_NSN, _MAX_LENGTH_COUNTRY_CODE
from .phonenumberutil import _VALID_PUNCTUATION, _PLUS_CHARS, NON_DIGITS_PATTERN
from .phonenumberutil import _EXTN_PATTERNS_FOR_MATCHING, _REGEX_FLAGS
from .phonenumberutil import _SECOND_NUMBER_START_PATTERN, _UNWANTED_END_CHAR_PATTERN
from .phonenumberutil import MatchType, NumberParseException, PhoneNumberFormat
from .phonenumberutil import is_possible_number, is_valid_number, parse
from .phonenumberutil import normalize_digits_only, national_significant_number
from .phonenumberutil import _format_nsn_using_pattern, ndd_prefix_for_region
from .phonenumberutil import format_number, is_number_match, region_code_for_country_code
from .phonenumberutil import _maybe_strip_national_prefix_carrier_code
from .phonenumberutil import _choose_formatting_pattern_for_number
from .phonenumberutil import _formatting_rule_has_first_group_only
from .phonenumber import CountryCodeSource
from .phonemetadata import PhoneMetadata
# Import auto-generated data structures
try:
    from .data import _ALT_NUMBER_FORMATS
except ImportError:  # pragma no cover
    # The data/ directory stays empty until the generated code has been
    # produced, yet the generator scripts import this module themselves.
    # To break that circular dependency, tolerate the missing data when
    # (and only when) one of the generator scripts is the running program.
    import os
    import sys
    if os.path.basename(sys.argv[0]) not in ("buildmetadatafromxml.py", "buildprefixdata.py"):
        # Any other program genuinely needs the data: propagate the failure.
        raise
    prnt("Failed to import generated data (but OK as during autogeneration)", file=sys.stderr)
    _ALT_NUMBER_FORMATS = {}
def _limit(lower, upper):
"""Returns a regular expression quantifier with an upper and lower limit."""
if ((lower < 0) or (upper <= 0) or (upper < lower)):
raise Exception("Illegal argument to _limit")
return unicod("{%d,%d}") % (lower, upper)
# Build the MATCHING_BRACKETS and PATTERN regular expression patterns. The
# building blocks below exist to make the patterns more easily understood.
#
# The two strings below are character-class *contents* (they are wrapped in
# "[...]" where used): ASCII round and square brackets plus their fullwidth
# forms, U+FF08/U+FF3B for opening and U+FF09/U+FF3D for closing.
_OPENING_PARENS = u("(\\[\uFF08\uFF3B")
_CLOSING_PARENS = u(")\\]\uFF09\uFF3D")
# Character class matching any single character that is not a bracket.
_NON_PARENS = u("[^") + _OPENING_PARENS + _CLOSING_PARENS + u("]")
# Limit on the number of pairs of brackets in a phone number.
_BRACKET_PAIR_LIMIT = _limit(0, 3)
# Pattern to check that brackets match. Opening brackets should be closed
# within a phone number. This also checks that there is something inside the
# brackets. Having no brackets at all is also fine.
#
# An opening bracket at the beginning may not be closed, but subsequent ones
# should be. It's also possible that the leading bracket was dropped, so we
# shouldn't be surprised if we see a closing bracket first. We limit the sets
# of brackets in a phone number to four: the (possibly incomplete) leading
# set, followed by at most three complete pairs (_BRACKET_PAIR_LIMIT).
_MATCHING_BRACKETS = re.compile(u("(?:[") + _OPENING_PARENS + u("])?") + u("(?:") + _NON_PARENS + u("+") +
u("[") + _CLOSING_PARENS + u("])?") +
_NON_PARENS + u("+") +
u("(?:[") + _OPENING_PARENS + u("]") + _NON_PARENS +
u("+[") + _CLOSING_PARENS + u("])") + _BRACKET_PAIR_LIMIT +
_NON_PARENS + u("*"))
# Limit on the number of leading (bracket or plus) characters; in _PATTERN
# each of them may be followed by a run of punctuation.
_LEAD_LIMIT = _limit(0, 2)
# Limit on the number of consecutive punctuation characters.
_PUNCTUATION_LIMIT = _limit(0, 4)
# The maximum number of digits allowed in a digit-separated block. As we allow
# all digits in a single block, set high enough to accommodate the entire
# national number and the international country code.
_DIGIT_BLOCK_LIMIT = (_MAX_LENGTH_FOR_NSN + _MAX_LENGTH_COUNTRY_CODE)
# Limit on the number of blocks separated by punctuation. Use _DIGIT_BLOCK_LIMIT
# since some formats use spaces to separate each digit.
_BLOCK_LIMIT = _limit(0, _DIGIT_BLOCK_LIMIT)
# A punctuation sequence allowing white space.
_PUNCTUATION = u("[") + _VALID_PUNCTUATION + u("]") + _PUNCTUATION_LIMIT
# A digits block without punctuation.
_DIGIT_SEQUENCE = u("\\d") + _limit(1, _DIGIT_BLOCK_LIMIT)
# Punctuation that may be at the start of a phone number - brackets and plus signs.
_LEAD_CLASS_CHARS = _OPENING_PARENS + _PLUS_CHARS
_LEAD_CLASS = u("[") + _LEAD_CLASS_CHARS + u("]")
# Single-character pattern used to test whether a candidate starts with one
# of the leading characters above.
_LEAD_PATTERN = re.compile(_LEAD_CLASS)
# Phone number pattern allowing optional punctuation.
# This is the phone number pattern used by _find(), similar to
# phonenumberutil._VALID_PHONE_NUMBER, but with the following differences:
# - All captures are limited in order to place an upper bound to the text
# matched by the pattern.
# - Leading punctuation / plus signs are limited.
# - Consecutive occurrences of punctuation are limited.
# - Number of digits is limited.
# - No whitespace is allowed at the start or end.
# - No alpha digits (vanity numbers such as 1-800-SIX-FLAGS) are currently
# supported.
#
# Layout of the assembled expression, in order:
#   (lead character + punctuation){_LEAD_LIMIT}
#   a digit sequence
#   (punctuation + digit sequence){_BLOCK_LIMIT}
#   an optional extension (_EXTN_PATTERNS_FOR_MATCHING)
_PATTERN = re.compile(u("(?:") + _LEAD_CLASS + _PUNCTUATION + u(")") + _LEAD_LIMIT +
_DIGIT_SEQUENCE + u("(?:") + _PUNCTUATION + _DIGIT_SEQUENCE + u(")") + _BLOCK_LIMIT +
u("(?:") + _EXTN_PATTERNS_FOR_MATCHING + u(")?"),
_REGEX_FLAGS)
# Matches strings that look like publication pages. Example: "Computing
# Complete Answers to Queries in the Presence of Limited Access Patterns.
# Chen Li. VLDB J. 12(3): 211-227 (2003)."
#
# The string "211-227 (2003)" is not a telephone number.
# Shape: 1-5 digits, one or more hyphens, 1-5 digits, up to four whitespace
# characters, an opening round bracket and 1-4 digits.
_PUB_PAGES = re.compile(u("\\d{1,5}-+\\d{1,5}\\s{0,4}\\(\\d{1,4}"))
# Matches strings that look like dates using "/" as a separator. Examples:
# 3/10/2011, 31/10/96 or 08/31/95.
# Both day/month and month/day orders are accepted; the year has two or four
# digits.
_SLASH_SEPARATED_DATES = re.compile(u("(?:(?:[0-3]?\\d/[01]?\\d)|(?:[01]?\\d/[0-3]?\\d))/(?:[12]\\d)?\\d{2}"))
# Matches timestamps. Examples: "2012-01-02 08:00". Note that the reg-ex does
# not include the trailing ":\d\d" -- that is covered by _TIME_STAMPS_SUFFIX.
# The "$" anchor means the hour digits must be the last thing in the text
# being tested.
_TIME_STAMPS = re.compile(u("[12]\\d{3}[-/]?[01]\\d[-/]?[0-3]\\d +[0-2]\\d$"))
# The ":MM" part of a timestamp: a colon followed by a minute value 00-59.
_TIME_STAMPS_SUFFIX = re.compile(u(":[0-5]\\d"))
# Patterns used to extract phone numbers from a larger phone-number-like
# pattern. These are ordered according to specificity. For example,
# white-space is last since that is frequently used in numbers, not just to
# separate two numbers. We have separate patterns since we don't want to break
# up the phone-number-like text on more than one different kind of symbol at
# one time, although symbols of the same type (e.g. space) can be safely
# grouped together.
#
# Note that if there is a match, we will always check any text found up to the
# first match as well.
#
# In every pattern, capturing group 1 holds the text that is tried as a phone
# number in its own right. The "(?u)" prefix is the inline Unicode flag.
_INNER_MATCHES = (
# Breaks on the slash - e.g. "651-234-2345/332-445-1234"
re.compile(u("/+(.*)")),
# Note that the bracket here is inside the capturing group, since we
# consider it part of the phone number. Will match a pattern like "(650)
# 223 3345 (754) 223 3321".
re.compile(u("(\\([^(]*)")),
# Breaks on a hyphen - e.g. "12345 - 332-445-1234 is my number." We
# require a space on either side of the hyphen for it to be considered a
# separator.
re.compile(u("(?u)(?:\\s-|-\\s)\\s*(.+)")),
# Various types of wide hyphens. Note we have decided not to enforce a
# space here, since it's possible that it's supposed to be used to break
# two numbers without spaces, and we haven't seen many instances of it
# used within a number.
re.compile(u("(?u)[\u2012-\u2015\uFF0D]\\s*(.+)")),
# Breaks on a full stop - e.g. "12345. 332-445-1234 is my number."
re.compile(u("(?u)\\.+\\s*([^.]+)")),
# Breaks on space - e.g. "3324451234 8002341234"
re.compile(u("(?u)\\s+(\\S+)")))
class Leniency(object):
    """Strictness levels used when looking for phone numbers in text.

    The levels are plain integers whose numeric order is the order of
    increasing strictness, so they can be compared with <, >= and so on.

    POSSIBLE (0)
        The number only has to be possible (is_possible_number(number)); it
        need not be valid (is_valid_number(number)).

    VALID (1)
        The number must be both possible and valid.  A number written in
        national format must carry its national prefix if one is usually
        written for numbers of that type.

    STRICT_GROUPING (2)
        The number must be valid and its digits must be grouped in a way
        that is possible for the locale.  For a US number, "65 02 53 00 00"
        and "650253 0000" are rejected, whereas "650 253 0000",
        "650 2530000" and "6502530000" are accepted.  Numbers with more than
        one '/' symbol in the national significant number are rejected too.

    EXACT_GROUPING (3)
        The number must be valid and grouped exactly as we would have
        formatted it, or written as one single block.  For a US number,
        "650 2530000" is rejected, whereas "650 253 0000" and "6502530000"
        are accepted.  Numbers with more than one '/' symbol are rejected
        too.

    Warning: STRICT_GROUPING and EXACT_GROUPING might result in lower
    coverage, especially for regions outside of country code "+1".  If you
    are not sure about which level to use, email the discussion group
    libphonenumber-discuss@googlegroups.com.
    """
    # Listed from most lenient to most strict; the values are 0, 1, 2 and 3.
    POSSIBLE, VALID, STRICT_GROUPING, EXACT_GROUPING = range(4)
def _verify(leniency, numobj, candidate, matcher):
    """Decide whether a parsed number passes the checks of a leniency level.

    Arguments:
    leniency -- one of the Leniency constants.
    numobj -- the number obtained by parsing candidate.
    candidate -- the text that numobj was parsed from.
    matcher -- the PhoneNumberMatcher that performs the grouping checks for
              the two grouping levels.

    Returns True if the number is accepted at the given level.
    Raises Exception for a leniency value that is not one of the four levels.
    """
    if leniency == Leniency.POSSIBLE:
        return is_possible_number(numobj)
    if leniency == Leniency.VALID:
        acceptable = (is_valid_number(numobj) and
                      _contains_only_valid_x_chars(numobj, candidate))
        if not acceptable:
            return False
        return _is_national_prefix_present_if_required(numobj)
    if leniency == Leniency.STRICT_GROUPING:
        return _verify_strict_grouping(numobj, candidate, matcher)
    if leniency == Leniency.EXACT_GROUPING:
        return _verify_exact_grouping(numobj, candidate, matcher)
    raise Exception("Error: unsupported Leniency value %s" % leniency)
def _verify_strict_grouping(numobj, candidate, matcher):
    """Apply the Leniency.STRICT_GROUPING checks to a parsed candidate.

    The number has to be valid, use 'x' characters only in valid positions,
    contain at most one slash in the national number, and carry its national
    prefix when one is required.  If all of that holds, the matcher verifies
    that no expected digit group has been split up in the candidate.

    Returns True if every check passes.
    """
    if not is_valid_number(numobj):
        return False
    if not _contains_only_valid_x_chars(numobj, candidate):
        return False
    if _contains_more_than_one_slash_in_national_number(numobj, candidate):
        return False
    if not _is_national_prefix_present_if_required(numobj):
        return False
    return matcher._check_number_grouping_is_valid(
        numobj, candidate, _all_number_groups_remain_grouped)
def _all_number_groups_remain_grouped(numobj, normalized_candidate, formatted_number_groups):
    """Check that no expected digit group is broken up in the candidate.

    Arguments:
    numobj -- the original number we found when parsing
    normalized_candidate -- the candidate number, normalized to only contain
              ASCII digits, but with non-digits (spaces etc) retained
    formatted_number_groups -- the groups of digits that we would expect to
              see if we formatted this number

    Returns True if every expected group occurs, in order and unbroken, in
    the candidate.
    """
    cursor = 0
    if numobj.country_code_source != CountryCodeSource.FROM_DEFAULT_COUNTRY:
        # The candidate contained the country code, so start after it.
        cc_digits = str(numobj.country_code)
        cursor = normalized_candidate.find(cc_digits) + len(cc_digits)
    candidate_length = len(normalized_candidate)
    for position, expected_group in enumerate(formatted_number_groups):
        # The group must appear as consecutive digits somewhere at or after
        # the cursor.
        group_start = normalized_candidate.find(expected_group, cursor)
        if group_start < 0:
            return False
        cursor = group_start + len(expected_group)
        # Only the first group (the NDC) gets the extra check below, and only
        # when something follows it in the candidate.
        if position != 0 or cursor >= candidate_length:
            continue
        # The region is derived from the country code rather than from the
        # number itself: countries sharing a calling code need not be told
        # apart here, and this is faster.
        region = region_code_for_country_code(numobj.country_code)
        if ndd_prefix_for_region(region, True) is None:
            continue
        if not normalized_candidate[cursor].isdigit():
            continue
        # No formatting symbol follows the NDC.  For countries with a
        # national prefix we then only accept the number if it has no
        # formatting symbols at all (extensions aside), i.e. if the NSN
        # appears as one block starting at the NDC.
        nsn = national_significant_number(numobj)
        return normalized_candidate[group_start:].startswith(nsn)
    # Make sure the extension was not mistakenly consumed as the last group
    # of the subscriber number.  Extensions cannot have formatting between
    # their digits.
    remainder = normalized_candidate[cursor:]
    return remainder.find(numobj.extension or U_EMPTY_STRING) != -1
def _verify_exact_grouping(numobj, candidate, matcher):
    """Apply the Leniency.EXACT_GROUPING checks to a parsed candidate.

    The number has to be valid, use 'x' characters only in valid positions,
    contain at most one slash in the national number, and carry its national
    prefix when one is required.  If all of that holds, the matcher verifies
    that the candidate's digit groups are exactly the expected ones.

    Returns True if every check passes.
    """
    if not is_valid_number(numobj):
        return False
    if not _contains_only_valid_x_chars(numobj, candidate):
        return False
    if _contains_more_than_one_slash_in_national_number(numobj, candidate):
        return False
    if not _is_national_prefix_present_if_required(numobj):
        return False
    return matcher._check_number_grouping_is_valid(
        numobj, candidate, _all_number_groups_are_exactly_present)
def _all_number_groups_are_exactly_present(numobj, normalized_candidate, formatted_number_groups):
    """Check that the candidate's digit groups equal the expected groups.

    Arguments:
    numobj -- the original number we found when parsing
    normalized_candidate -- the candidate number, normalized to only contain
              ASCII digits, but with non-digits (spaces etc) retained
    formatted_number_groups -- the groups of digits that we would expect to
              see if we formatted this number

    Returns True if the national significant number is written as a single
    block, or if the groups match one for one.
    """
    digit_groups = re.split(NON_DIGITS_PATTERN, normalized_candidate)
    # Point at the last group, stepping over the extension if there is one.
    trailing_groups = 2 if numobj.extension is not None else 1
    candidate_pos = len(digit_groups) - trailing_groups
    # Accept a national significant number formatted as one block.  This is
    # a containment test rather than equality because the block may also
    # hold a national prefix or the country code.
    if len(digit_groups) == 1:
        return True
    if national_significant_number(numobj) in digit_groups[candidate_pos]:
        return True
    # Walk both sequences backwards, comparing every expected group except
    # the first one with the corresponding candidate group.
    for expected_group in reversed(formatted_number_groups[1:]):
        if candidate_pos < 0:
            break
        if digit_groups[candidate_pos] != expected_group:
            return False
        candidate_pos -= 1
    # The first group may be preceded by a national prefix, so the candidate
    # group only has to end with it.
    return (candidate_pos >= 0 and
            digit_groups[candidate_pos].endswith(formatted_number_groups[0]))
def _get_national_number_groups_without_pattern(numobj):
    """Split the national part of a number into its standard digit groups.

    The number is formatted as RFC3966 (+CC-DG1-DG2-DGX;ext=EXT, where
    DG1..DGX are groups of digits); the country code and any extension are
    then dropped.

    Returns the list of digit groups DG1..DGX.
    """
    rfc3966 = format_number(numobj, PhoneNumberFormat.RFC3966)
    # The extension, if present, starts at the first semicolon.
    extension_at = rfc3966.find(U_SEMICOLON)
    stop = extension_at if extension_at >= 0 else len(rfc3966)
    # The country code is followed by the first dash.
    begin = rfc3966.find(U_DASH) + 1
    national_part = rfc3966[begin:stop]
    return national_part.split(U_DASH)
def _get_national_number_groups(numobj, formatting_pattern):
    """Split the national significant number into digit groups using a
    specific formatting pattern.

    Arguments:
    numobj -- the number whose national significant number is formatted.
    formatting_pattern -- the number format to apply.

    Returns the list of digit groups, without any national prefix.
    """
    # Only the NSN is formatted; RFC3966 output separates groups with dashes.
    formatted_nsn = _format_nsn_using_pattern(national_significant_number(numobj),
                                              formatting_pattern,
                                              PhoneNumberFormat.RFC3966)
    return formatted_nsn.split(U_DASH)
def _contains_more_than_one_slash_in_national_number(numobj, candidate):
    """Tell whether the national part of candidate has more than one slash.

    A slash directly after the country calling code is not counted against
    the national number.

    Arguments:
    numobj -- the number obtained by parsing candidate.
    candidate -- the text that numobj was parsed from.

    Returns True if there are too many slashes.
    """
    first_slash = candidate.find(U_SLASH)
    if first_slash < 0:
        # No slash at all.
        return False
    second_slash = candidate.find(U_SLASH, first_slash + 1)
    if second_slash < 0:
        # Exactly one slash.
        return False
    # Two slashes are tolerated when the first merely follows the country
    # calling code.
    has_country_code = numobj.country_code_source in (
        CountryCodeSource.FROM_NUMBER_WITH_PLUS_SIGN,
        CountryCodeSource.FROM_NUMBER_WITHOUT_PLUS_SIGN)
    if (has_country_code and
        normalize_digits_only(candidate[:first_slash]) == unicod(numobj.country_code)):
        # A third slash, however, is one too many.
        after_second = candidate[(second_slash + 1):]
        return U_SLASH in after_second
    return True
def _contains_only_valid_x_chars(numobj, candidate):
    """Check that every 'x' or 'X' in candidate has a legitimate role.

    An 'x'/'X' can be (1) part of a carrier code, which always precedes the
    national significant number, or (2) an extension sign, which always
    precedes the extension number.  A carrier code is assumed to be longer
    than one digit, so case (1) shows up as two or more consecutive 'x'/'X'
    and case (2) as exactly one.  An 'x'/'X' in the very last position is
    ignored.

    Returns True if no 'x'/'X' is misplaced.
    """
    x_marks = (U_X_LOWER, U_X_UPPER)
    pos = 0
    last_pos = len(candidate) - 1
    while pos < last_pos:
        if candidate[pos] in x_marks:
            if candidate[pos + 1] in x_marks:
                # Carrier code: what follows must be the national
                # significant number.
                pos += 1
                if is_number_match(numobj, candidate[pos:]) != MatchType.NSN_MATCH:
                    return False
            elif normalize_digits_only(candidate[pos:]) != numobj.extension:
                # Extension sign: what follows must be the extension.
                return False
        pos += 1
    return True
def _is_national_prefix_present_if_required(numobj):
    """Check that a nationally-written number has its national prefix when
    the formatting rules call for one.

    Arguments:
    numobj -- a parsed number that still holds raw_input and
              country_code_source.

    Returns True if no national prefix is required or if one was found at
    the start of the raw input.
    """
    # A number whose country code came from the text itself was written in
    # international format, which needs no national prefix.
    if numobj.country_code_source != CountryCodeSource.FROM_DEFAULT_COUNTRY:
        return True
    region = region_code_for_country_code(numobj.country_code)
    metadata = PhoneMetadata.metadata_for_region(region, None)
    if metadata is None:
        return True
    # Find the rule that would be used to format this number.
    nsn = national_significant_number(numobj)
    rule = _choose_formatting_pattern_for_number(metadata.number_format, nsn)
    if rule is None or not rule.national_prefix_formatting_rule:
        # No national prefix formatting rule applies.
        return True
    if rule.national_prefix_optional_when_formatting:
        # The prefix is optional here, so its presence need not be checked.
        return True
    if _formatting_rule_has_first_group_only(rule.national_prefix_formatting_rule):
        # The rule is just the first-group symbol ($1) with punctuation, so
        # no national prefix is needed for this number.
        return True
    # A prefix is required: report whether a national prefix and/or carrier
    # code was found at the start of the normalized raw input.
    raw_digits = normalize_digits_only(numobj.raw_input)
    return _maybe_strip_national_prefix_carrier_code(raw_digits, metadata)[2]
class PhoneNumberMatcher(object):
    """Scans a piece of text and hands out the telephone numbers found in it.

    Matches are retrieved with has_next()/next() or by iterating over the
    instance.  The search position and the remaining retry budget are kept
    on the instance, so the class is stateful and not thread-safe.

    Vanity numbers (phone numbers using alphabetic digits such as
    '1-800-SIX-FLAGS') are not found.
    """
    # Iteration states of a PhoneNumberMatcher.
    _NOT_READY = 0  # The next match has not been looked for yet.
    _READY = 1      # A match is held in _last_match until next() takes it.
    _DONE = 2       # The last search found nothing; iteration is over.

    def __init__(self, text, region,
                 leniency=Leniency.VALID, max_tries=65535):
        """Creates a new instance.

        Arguments:
        text -- The character sequence that we will search, None for no text.
        region -- The region to assume for phone numbers not written in
                  international format (with a leading plus, or with the
                  international dialing prefix of the specified region). May
                  be None or "ZZ" if only numbers with a leading plus should
                  be considered.
        leniency -- The leniency to use when evaluating candidate phone
                  numbers.
        max_tries -- The maximum number of invalid numbers to try before
                  giving up on the text. This is to cover degenerate cases
                  where the text has a lot of false positives in it. Must be
                  >= 0.

        Raises ValueError if leniency is None or max_tries is negative.
        """
        if leniency is None:
            raise ValueError("Need a leniency value")
        if int(max_tries) < 0:
            raise ValueError("Need max_tries to be positive int")
        # The text searched for phone numbers; never None.
        self.text = U_EMPTY_STRING if text is None else text
        # The region (country) to assume for phone numbers without an
        # international prefix, possibly None.
        self.preferred_region = region
        # The degree of validation requested.
        self.leniency = leniency
        # How many more invalid candidates may be tried.
        self._max_tries = int(max_tries)
        # The iteration tristate.
        self._state = PhoneNumberMatcher._NOT_READY
        # The last successful match, None unless in state _READY.
        self._last_match = None
        # The next index to start searching at. Undefined in state _DONE.
        self._search_index = 0

    def _find(self, index):
        """Looks for the next phone number at or after a position in the text.

        Each rejected candidate uses up one unit of the retry budget; the
        search stops once the budget is exhausted.

        Arguments:
        index -- The search index to start searching at.

        Returns the phone number match found, None if none can be found.
        """
        found = _PATTERN.search(self.text, index)
        while self._max_tries > 0 and found is not None:
            begin = found.start()
            # Cut off any extra number at the end.
            # TODO: This is the place to start when trying to support
            # extraction of multiple phone number from split notations (+41 79
            # 123 45 67 / 68).
            snippet = self._trim_after_first_match(_SECOND_NUMBER_START_PATTERN,
                                                   self.text[begin:found.end()])
            result = self._extract_match(snippet, begin)
            if result is not None:
                return result
            # Rejected: resume the search right after this candidate.
            index = begin + len(snippet)
            self._max_tries -= 1
            found = _PATTERN.search(self.text, index)
        return None

    def _trim_after_first_match(self, pattern, candidate):
        """Returns candidate cut off where pattern first matches, or the
        whole candidate if pattern does not match at all."""
        hit = pattern.search(candidate)
        if not hit:
            return candidate
        return candidate[:hit.start()]

    @classmethod
    def _is_latin_letter(cls, letter):
        """Tells whether a character is a Latin-script letter.

        Combining marks also count, on the assumption that they have been
        added to a preceding Latin character."""
        # Combining marks are a subset of non-spacing-mark.
        if (not is_letter(letter) and
            Category.get(letter) != Category.NON_SPACING_MARK):
            return False
        latin_blocks = (Block.BASIC_LATIN,
                        Block.LATIN_1_SUPPLEMENT,
                        Block.LATIN_EXTENDED_A,
                        Block.LATIN_EXTENDED_ADDITIONAL,
                        Block.LATIN_EXTENDED_B,
                        Block.COMBINING_DIACRITICAL_MARKS)
        return Block.get(letter) in latin_blocks

    @classmethod
    def _is_invalid_punctuation_symbol(cls, character):
        """Tells whether a character next to a candidate disqualifies it:
        a percent sign or any currency symbol."""
        if character == U_PERCENT:
            return True
        return Category.get(character) == Category.CURRENCY_SYMBOL

    def _extract_match(self, candidate, offset):
        """Attempts to extract a match from a candidate string.

        Arguments:
        candidate -- The candidate text that might contain a phone number.
        offset -- The offset of candidate within self.text

        Returns the match found, None if none can be found
        """
        # Something that looks like a slash-separated date is not a number.
        if _SLASH_SEPARATED_DATES.search(candidate):
            return None
        # Neither is a time-stamp, recognisable by the ":MM" right after it.
        if _TIME_STAMPS.search(candidate):
            text_after = self.text[offset + len(candidate):]
            if _TIME_STAMPS_SUFFIX.match(text_after):
                return None
        # First try the candidate as a whole.
        whole = self._parse_and_verify(candidate, offset)
        if whole is None:
            # Failing that, a phone number might be hiding inside it.
            return self._extract_inner_match(candidate, offset)
        return whole

    def _extract_inner_match(self, candidate, offset):
        """Attempts to extract a match from candidate if the whole candidate
        does not qualify as a match.

        Every piece that fails verification uses up one unit of the retry
        budget.

        Arguments:
        candidate -- The candidate text that might contain a phone number
        offset -- The current offset of candidate within text

        Returns the match found, None if none can be found
        """
        for splitter in _INNER_MATCHES:
            hit = splitter.search(candidate)
            leading_text_untested = True
            while hit and self._max_tries > 0:
                if leading_text_untested:
                    # The text in front of the first separator deserves a
                    # try as well.
                    head = self._trim_after_first_match(_UNWANTED_END_CHAR_PATTERN,
                                                        candidate[:hit.start()])
                    result = self._parse_and_verify(head, offset)
                    if result is not None:
                        return result
                    self._max_tries -= 1
                    leading_text_untested = False
                piece = self._trim_after_first_match(_UNWANTED_END_CHAR_PATTERN,
                                                     hit.group(1))
                result = self._parse_and_verify(piece, offset + hit.start(1))
                if result is not None:
                    return result
                self._max_tries -= 1
                hit = splitter.search(candidate, hit.start() + 1)
        return None

    def _parse_and_verify(self, candidate, offset):
        """Parses a phone number from the candidate using phonenumberutil.parse
        and verifies it matches the requested leniency.

        Arguments:
        candidate -- The candidate match.
        offset -- The offset of candidate within self.text.

        Returns the parsed and validated phone number match, or None if the
        candidate is rejected or cannot be parsed.
        """
        try:
            # Unbalanced brackets or a publication-page look mean this is
            # not really a phone number.
            if (not fullmatch(_MATCHING_BRACKETS, candidate) or
                _PUB_PAGES.search(candidate)):
                return None
            # From VALID upwards, also reject numbers glued to Latin letters
            # (abc8005001234, 8005001234def) or to invalid punctuation.
            if self.leniency >= Leniency.VALID:
                # The character in front only matters if the candidate is
                # not at the start of the text and does not itself start
                # with phone-number punctuation.
                if offset > 0 and not _LEAD_PATTERN.match(candidate):
                    char_before = self.text[offset - 1]
                    if (self._is_invalid_punctuation_symbol(char_before) or
                        self._is_latin_letter(char_before)):
                        return None
                end_index = offset + len(candidate)
                if end_index < len(self.text):
                    char_after = self.text[end_index]
                    if (self._is_invalid_punctuation_symbol(char_after) or
                        self._is_latin_letter(char_after)):
                        return None
            numobj = parse(candidate, self.preferred_region, keep_raw_input=True)
            if not _verify(self.leniency, numobj, candidate, self):
                return None
            # We used parse(keep_raw_input=True) to create this number,
            # but for now we don't return the extra values parsed.
            # TODO: stop clearing all values here and switch all users
            # over to using raw_input rather than the raw_string of
            # PhoneNumberMatch.
            numobj.country_code_source = CountryCodeSource.UNSPECIFIED
            numobj.raw_input = None
            numobj.preferred_domestic_carrier_code = None
            return PhoneNumberMatch(offset, candidate, numobj)
        except NumberParseException:
            # An unparseable candidate is simply not a match.
            return None

    def _check_number_grouping_is_valid(self, numobj, candidate, checker):
        """Runs a grouping checker against the standard grouping of a number
        and, failing that, against its applicable alternate formats.

        Arguments:
        numobj -- The number obtained by parsing candidate.
        candidate -- The text that numobj was parsed from.
        checker -- A callable taking (numobj, normalized_candidate,
                  formatted_number_groups) and returning True on success.

        Returns True as soon as the checker accepts one grouping.
        """
        # Digits are normalized; non-digits are kept in place.
        normalized = normalize_digits_only(candidate, True)
        standard_groups = _get_national_number_groups_without_pattern(numobj)
        if checker(numobj, normalized, standard_groups):
            return True
        # Fall back to the alternate formats of this country code, if any.
        alternates = _ALT_NUMBER_FORMATS.get(numobj.country_code, None)
        nsn = national_significant_number(numobj)
        if alternates is None:
            return False
        for alternate in alternates:
            if len(alternate.leading_digits_pattern) > 0:
                # There is only one leading digits pattern for alternate
                # formats; skip the format if it does not fit this number.
                leading_digits = re.compile(alternate.leading_digits_pattern[0])
                if not leading_digits.match(nsn):
                    continue
            alternate_groups = _get_national_number_groups(numobj, alternate)
            if checker(numobj, normalized, alternate_groups):
                return True
        return False

    def has_next(self):
        """Indicates whether there is another match available.

        Searches for the next match if that has not been done yet and keeps
        it until next() is called."""
        if self._state == PhoneNumberMatcher._NOT_READY:
            found = self._find(self._search_index)
            self._last_match = found
            if found is None:
                self._state = PhoneNumberMatcher._DONE
            else:
                self._search_index = found.end
                self._state = PhoneNumberMatcher._READY
        return (self._state == PhoneNumberMatcher._READY)

    def next(self):
        """Return the next match; raises StopIteration if no next match is
        available"""
        # has_next() finds the next match as a side-effect if necessary.
        if not self.has_next():
            raise StopIteration("No next match")
        # Hand the match over and drop our own reference to it at once.
        result, self._last_match = self._last_match, None
        self._state = PhoneNumberMatcher._NOT_READY
        return result

    def __iter__(self):
        """Yields the remaining matches one after another."""
        while self.has_next():
            yield self.next()
class PhoneNumberMatch(UnicodeMixin):
    """The match of a phone number within a piece of text.

    Matches are produced by a PhoneNumberMatcher, either through its
    has_next()/next() methods or by iterating over it.

    A match consists of the phone number (in .number) as well as the .start
    and .end offsets of the corresponding subsequence of the searched
    text. Use .raw_string to obtain a copy of the matched subsequence.

    The following annotated example clarifies the relationship between the
    searched text, the match offsets, and the parsed number:

    >>> text = "Call me at +1 425 882-8080 for details."
    >>> country = "US"
    >>> import phonenumbers
    >>> matcher = phonenumbers.PhoneNumberMatcher(text, country)
    >>> matcher.has_next()
    True
    >>> m = matcher.next()  # Find the first phone number match
    >>> m.raw_string  # contains the phone number as it appears in the text.
    '+1 425 882-8080'
    >>> (m.start, m.end)  # define the range of the matched subsequence.
    (11, 26)
    >>> text[m.start:m.end]
    '+1 425 882-8080'
    >>> phonenumbers.parse("+1 425 882-8080", "US") == m.number
    True
    """

    def __init__(self, start, raw_string, numobj):
        """Creates a match.

        Arguments:
        start -- The index in the searched text where the match begins.
        raw_string -- The matched text exactly as it appears.
        numobj -- The phone number parsed from raw_string.

        Raises Exception if start is negative or if raw_string or numobj is
        None.
        """
        if start < 0:
            raise Exception("Start index not >= 0")
        if raw_string is None or numobj is None:
            raise Exception("Invalid argument")
        # The start index into the text.
        self.start = start
        # The raw substring matched.
        self.raw_string = raw_string
        # The index just past the last matched character.
        self.end = start + len(raw_string)
        # The matched phone number.
        self.number = numobj

    def __eq__(self, other):
        """Two matches are equal if they cover the same span of text, hold
        the same raw string and carry equal numbers."""
        if not isinstance(other, PhoneNumberMatch):
            return False
        same_span = (self.start == other.start and self.end == other.end)
        return (same_span and
                self.raw_string == other.raw_string and
                self.number == other.number)

    def __ne__(self, other):
        """The exact opposite of __eq__."""
        return not self.__eq__(other)

    def __repr__(self):
        """Returns a constructor-like representation of the match."""
        template = unicod("PhoneNumberMatch(start=%r, raw_string=%r, numobj=%r)")
        return template % (self.start, self.raw_string, self.number)

    def __unicode__(self):
        """Returns a short description: the half-open [start,end) range
        followed by the raw string."""
        template = unicod("PhoneNumberMatch [%s,%s) %s")
        return template % (self.start, self.end, self.raw_string)