Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 21 additions & 10 deletions Patterns/English/English-NumbersWithUnit.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -658,34 +658,34 @@ FractionalUnitNameToCodeMap: !dictionary
CompoundUnitConnectorRegex: !simpleRegex
def: (?<spacer>and)
MultiplierRegex: !simpleRegex
def: \s*\b(thousand|million|billion|trillion)s?\b
def: \s*\b(thousand|million|billion|trillion|lakh|crore)s?\b
CurrencyPrefixList: !dictionary
types: [ string, string ]
entries:
# Currency Prefix Symbols/Terms
Dobra: db|std
Dollar: $
Brazilian Real: R$
United States dollar: united states $|us$|us $|u.s. $|u.s $|usd$
United States dollar: united states $|us$|us $|u.s. $|u.s $|usd$|usd
East Caribbean dollar: east caribbean $
Mexican peso: mxn$|mxn $|mex$
Australian dollar: australian $|australia $
Mexican peso: mxn$|mxn $|mex$|mxn
Australian dollar: australian $|australia $|a$|aud
Bahamian dollar: bahamian $|bahamia $
Barbadian dollar: barbadian $|barbadin $
Belize dollar: belize $
Bermudian dollar: bermudian $
British Virgin Islands dollar: british virgin islands $|bvi$|virgin islands $|virgin island $|british virgin island $
Brunei dollar: brunei $|b$
Sen: sen
Singapore dollar: singapore $|s$
Canadian dollar: canadian $|can$|c$|c $|canada $
Singapore dollar: singapore $|s$|sg$|sgd
Canadian dollar: canadian $|can$|c$|c $|canada $|cad$|cad
Cayman Islands dollar: cayman islands $|ci$|cayman island $
New Zealand dollar: new zealand $|nz$|nz $
Cook Islands dollar: cook islands $|cook island $
Fijian dollar: fijian $|fiji $
Guyanese dollar: gy$|gy $|g$|g $
Hong Kong dollar: hong kong $|hk$|hkd|hk $
Indian rupee: ₹
Indian rupee: ₹|inr|rs
Jamaican dollar: jamaican $|j$|jamaica $
Kiribati dollar: kiribati $
Liberian dollar: liberian $|liberia $
Expand All @@ -701,12 +701,23 @@ CurrencyPrefixList: !dictionary
Trinidad and Tobago dollar: trinidad and tobago $|trinidad $|trinidadian $
Tuvaluan dollar: tuvaluan $
Samoan tālā: ws$
Chinese yuan: ¥
Japanese yen: ¥
Euro: €
Chinese yuan: ¥|cny|rmb
Japanese yen: ¥|jpy
Euro: €|eur
Pound: £
Costa Rican colón: ₡
Turkish lira: ₺
# ISO 4217 prefix codes for currencies commonly used as leading prefixes
# in financial documents but absent from symbol-only entries above
British pound: gbp
Vietnamese dong: vnd
Swedish krona: sek
Norwegian krone: nok
Danish krone: dkk
Swiss franc: chf
South Korean won: krw
Brazilian real: brl
South African rand: zar
#CC
Bitcoin: ₿|btc|xbt
AmbiguousCurrencyUnitList: !list
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.

import re
from abc import abstractmethod
from typing import List

Expand All @@ -10,6 +11,26 @@
from recognizers_text.utilities import QueryProcessor
from recognizers_number_with_unit.number_with_unit.parsers import UnitValue, CurrencyUnitValue

# Matches an uppercase currency-style prefix that is immediately followed by a
# digit. Two prefix forms are accepted, both anchored at a word boundary:
#   * one to three uppercase letters followed by '$' — 'A$100', 'SG$40', 'CAD$1'
#   * exactly three uppercase letters without '$'    — 'USD34', 'VND4,927'
# The digit is only looked ahead at, so the match (and capture group 1) covers
# the prefix alone and match.end() is the offset of the first digit.
# Used in CurrencyModel.parse() to insert a separating space before
# QueryProcessor lowercases the query, preventing the internal number extractor
# from misreading patterns like 'usd34.6 million' as '6 million'.
# NOTE(review): the letters are not checked against a list of real currency
# codes, so any three capitals glued to a digit (e.g. 'ABC123') match as well —
# confirm this is acceptable for non-currency input.
_CURRENCY_ISO_CONCAT_RE = re.compile(r'\b([A-Z]{1,3}\$|[A-Z]{3})(?=\d)')


def _to_original_pos(normalised_pos: int, insertions: List[int]) -> int:
"""Convert a position in the space-normalised string to the original string position.

Each inserted space at original position insertions[k] shifts all subsequent
normalised positions by +k+1. To reverse: subtract the count of insertions
whose normalised position falls strictly before normalised_pos.
"""
count = sum(
1 for k, p in enumerate(insertions) if p + k < normalised_pos
)
return normalised_pos - count


class ExtractorParserModel:
def __init__(self, extractor: Extractor, parser: Parser):
Expand Down Expand Up @@ -106,6 +127,45 @@ class CurrencyModel(AbstractNumberWithUnitModel):
def model_type_name(self) -> str:
return 'currency'

def parse(self, query: str) -> List[ModelResult]:
    """Parse *query*, tolerating uppercase currency prefixes glued to digits.

    Wherever _CURRENCY_ISO_CONCAT_RE matches, a space is inserted between the
    prefix and the digit before the query is handed to the base class, whose
    preprocessing lowercases it. Without that space the internal number
    extractor (Unit mode) misreads inputs such as:

        'USD34.6 million'  -> extracts '6 million'   (decimal boundary)
        'VND4,927 billion' -> extracts '927 billion' (comma boundary)

    With the space in place:

        'USD34.6 million'  -> 'USD 34.6 million'  -> '34.6 million'
        'VND4,927 billion' -> 'VND 4,927 billion' -> '4,927 billion'

    Only all-uppercase prefixes are matched, so ordinary lowercase words such
    as 'can', 'try' or 'nor' are left alone.

    The base class reports offsets and text against the padded string, so
    every result is mapped back: ``start`` and ``end`` are converted to
    offsets in the caller's original query and ``text`` is re-sliced from it
    (``end`` is treated as inclusive).
    """
    split_points: List[int] = []

    def _pad_prefix(match):
        # Remember where the space goes (just after the prefix, in original
        # coordinates) and emit the prefix followed by that space.
        split_points.append(match.end())
        return match.group(1) + ' '

    padded_query = _CURRENCY_ISO_CONCAT_RE.sub(_pad_prefix, query)

    if not split_points:
        # Nothing was glued together, so offsets already line up with the input.
        return super().parse(query)

    model_results = super().parse(padded_query)

    for item in model_results:
        first = _to_original_pos(item.start, split_points)
        last = _to_original_pos(item.end, split_points)
        item.start, item.end = first, last
        item.text = query[first:last + 1]

    return model_results


class DimensionModel(AbstractNumberWithUnitModel):
@property
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -546,30 +546,30 @@ class EnglishNumericWithUnit:
("Millibitcoin", "MILLIBITCOIN"),
("Satoshi", "SATOSHI")])
CompoundUnitConnectorRegex = f'(?<spacer>and)'
MultiplierRegex = f'\\s*\\b(thousand|million|billion|trillion)s?\\b'
MultiplierRegex = f'\\s*\\b(thousand|million|billion|trillion|lakh|crore)s?\\b'
CurrencyPrefixList = dict([("Dobra", "db|std"),
("Dollar", "$"),
("Brazilian Real", "R$"),
("United States dollar", "united states $|us$|us $|u.s. $|u.s $|usd$"),
("United States dollar", "united states $|us$|us $|u.s. $|u.s $|usd$|usd"),
("East Caribbean dollar", "east caribbean $"),
("Mexican peso", "mxn$|mxn $|mex$"),
("Australian dollar", "australian $|australia $"),
("Mexican peso", "mxn$|mxn $|mex$|mxn"),
("Australian dollar", "australian $|australia $|a$|aud"),
("Bahamian dollar", "bahamian $|bahamia $"),
("Barbadian dollar", "barbadian $|barbadin $"),
("Belize dollar", "belize $"),
("Bermudian dollar", "bermudian $"),
("British Virgin Islands dollar", "british virgin islands $|bvi$|virgin islands $|virgin island $|british virgin island $"),
("Brunei dollar", "brunei $|b$"),
("Sen", "sen"),
("Singapore dollar", "singapore $|s$"),
("Canadian dollar", "canadian $|can$|c$|c $|canada $"),
("Singapore dollar", "singapore $|s$|sg$|sgd"),
("Canadian dollar", "canadian $|can$|c$|c $|canada $|cad$|cad"),
("Cayman Islands dollar", "cayman islands $|ci$|cayman island $"),
("New Zealand dollar", "new zealand $|nz$|nz $"),
("Cook Islands dollar", "cook islands $|cook island $"),
("Fijian dollar", "fijian $|fiji $"),
("Guyanese dollar", "gy$|gy $|g$|g $"),
("Hong Kong dollar", "hong kong $|hk$|hkd|hk $"),
("Indian rupee", "₹"),
("Indian rupee", "₹|inr|rs"),
("Jamaican dollar", "jamaican $|j$|jamaica $"),
("Kiribati dollar", "kiribati $"),
("Liberian dollar", "liberian $|liberia $"),
Expand All @@ -585,12 +585,21 @@ class EnglishNumericWithUnit:
("Trinidad and Tobago dollar", "trinidad and tobago $|trinidad $|trinidadian $"),
("Tuvaluan dollar", "tuvaluan $"),
("Samoan tālā", "ws$"),
("Chinese yuan", "¥"),
("Japanese yen", "¥"),
("Euro", "€"),
("Chinese yuan", "¥|cny|rmb"),
("Japanese yen", "¥|jpy"),
("Euro", "€|eur"),
("Pound", "£"),
("Costa Rican colón", "₡"),
("Turkish lira", "₺"),
("British pound", "gbp"),
("Vietnamese dong", "vnd"),
("Swedish krona", "sek"),
("Norwegian krone", "nok"),
("Danish krone", "dkk"),
("Swiss franc", "chf"),
("South Korean won", "krw"),
("Brazilian real", "brl"),
("South African rand", "zar"),
("Bitcoin", "₿|btc|xbt")])
AmbiguousCurrencyUnitList = [r'din.', r'kiwi', r'kina', r'kobo', r'lari', r'lipa', r'napa', r'para', r'sfr.', r'taka', r'tala', r'toea', r'vatu', r'yuan', r'all', r'ang', r'ban', r'bob', r'btn', r'byr', r'cad', r'cop', r'cup', r'dop', r'gip', r'jod', r'kgs', r'lak', r'lei', r'mga', r'mop', r'nad', r'omr', r'pul', r'sar', r'sbd', r'scr', r'sdg', r'sek', r'sen', r'sol', r'sos', r'std', r'try', r'yer', r'yen', r'db', r'pen', r'ron', r'mad', r'zar', r'gel', r'satoshi', r'satoshis']
InformationSuffixList = dict([("Bit", "-bit|bit|bits"),
Expand Down
151 changes: 151 additions & 0 deletions Specs/NumberWithUnit/English/CurrencyModel.json
Original file line number Diff line number Diff line change
Expand Up @@ -2980,5 +2980,156 @@
}
}
]
},
{
"Input": "USD34.6 million deal",
"NotSupported": "java, javascript",
"Results": [
{
"Text": "USD34.6 million",
"Start": 0,
"End": 14,
"TypeName": "currency",
"Resolution": {
"isoCurrency": "USD",
"unit": "United States dollar",
"value": "34600000"
}
}
]
},
{
"Input": "USD0.92 Million consideration",
"NotSupported": "java, javascript",
"Results": [
{
"Text": "USD0.92 Million",
"Start": 0,
"End": 14,
"TypeName": "currency",
"Resolution": {
"isoCurrency": "USD",
"unit": "United States dollar",
"value": "920000"
}
}
]
},
{
"Input": "VND4,927 billion acquisition",
"NotSupported": "java, javascript",
"Results": [
{
"Text": "VND4,927 billion",
"Start": 0,
"End": 15,
"TypeName": "currency",
"Resolution": {
"unit": "Vietnamese dong",
"value": "4927000000000"
}
}
]
},
{
"Input": "GBP 27 million",
"NotSupported": "java, javascript",
"Results": [
{
"Text": "gbp 27 million",
"Start": 0,
"End": 13,
"TypeName": "currency",
"Resolution": {
"isoCurrency": "GBP",
"unit": "British pound",
"value": "27000000"
}
}
]
},
{
"Input": "SEK 60,500,000",
"NotSupported": "java, javascript",
"Results": [
{
"Text": "sek 60,500,000",
"Start": 0,
"End": 13,
"TypeName": "currency",
"Resolution": {
"isoCurrency": "SEK",
"unit": "Swedish krona",
"value": "60500000"
}
}
]
},
{
"Input": "CAD$1,700,000",
"NotSupported": "java, javascript",
"Results": [
{
"Text": "CAD$1,700,000",
"Start": 0,
"End": 12,
"TypeName": "currency",
"Resolution": {
"isoCurrency": "CAD",
"unit": "Canadian dollar",
"value": "1700000"
}
}
]
},
{
"Input": "A$100,000 cash consideration",
"NotSupported": "java, javascript",
"Results": [
{
"Text": "A$100,000",
"Start": 0,
"End": 8,
"TypeName": "currency",
"Resolution": {
"isoCurrency": "AUD",
"unit": "Australian dollar",
"value": "100000"
}
}
]
},
{
"Input": "SG$40 million",
"NotSupported": "java, javascript",
"Results": [
{
"Text": "SG$40 million",
"Start": 0,
"End": 12,
"TypeName": "currency",
"Resolution": {
"isoCurrency": "SGD",
"unit": "Singapore dollar",
"value": "40000000"
}
}
]
},
{
"Input": "Rs 660 crore",
"NotSupported": "java, javascript",
"Results": [
{
"Text": "rs 660 crore",
"Start": 0,
"End": 11,
"TypeName": "currency",
"Resolution": {
"unit": "Rupee",
"value": "6600000000"
}
}
]
}
]