From 8ec7c519953d950ee296b6e3a4e52dc161a36420 Mon Sep 17 00:00:00 2001 From: antstorm Date: Wed, 14 Aug 2024 12:51:48 +0100 Subject: [PATCH 1/9] Rename current default parser to OptimisticParser --- lib/monetize.rb | 4 ++-- lib/monetize/{parser.rb => optimistic_parser.rb} | 4 ++-- spec/monetize_spec.rb | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) rename lib/monetize/{parser.rb => optimistic_parser.rb} (97%) diff --git a/lib/monetize.rb b/lib/monetize.rb index dfedaba7d..5946048c8 100644 --- a/lib/monetize.rb +++ b/lib/monetize.rb @@ -4,7 +4,7 @@ require 'monetize/core_extensions' require 'monetize/errors' require 'monetize/version' -require 'monetize/parser' +require 'monetize/optimistic_parser' require 'monetize/collection' module Monetize @@ -36,7 +36,7 @@ def parse!(input, currency = Money.default_currency, options = {}) return input if input.is_a?(Money) return from_numeric(input, currency) if input.is_a?(Numeric) - parser = Monetize::Parser.new(input, currency, options) + parser = Monetize::OptimisticParser.new(input, currency, options) amount, currency = parser.parse Money.from_amount(amount, currency) diff --git a/lib/monetize/parser.rb b/lib/monetize/optimistic_parser.rb similarity index 97% rename from lib/monetize/parser.rb rename to lib/monetize/optimistic_parser.rb index 8d88c09a6..29b07a3b0 100644 --- a/lib/monetize/parser.rb +++ b/lib/monetize/optimistic_parser.rb @@ -1,7 +1,7 @@ # encoding: utf-8 module Monetize - class Parser + class OptimisticParser CURRENCY_SYMBOLS = { '$' => 'USD', '€' => 'EUR', @@ -76,7 +76,7 @@ def to_big_decimal(value) def parse_currency computed_currency = nil computed_currency = input[/[A-Z]{2,3}/] - computed_currency = nil unless Monetize::Parser::CURRENCY_SYMBOLS.value?(computed_currency) + computed_currency = nil unless Monetize::OptimisticParser::CURRENCY_SYMBOLS.value?(computed_currency) computed_currency ||= compute_currency if assume_from_symbol? diff --git a/spec/monetize_spec.rb b/spec/monetize_spec.rb index 00aad8002..f41a07d1e 100644 --- a/spec/monetize_spec.rb +++ b/spec/monetize_spec.rb @@ -56,7 +56,7 @@ Monetize.assume_from_symbol = false end - Monetize::Parser::CURRENCY_SYMBOLS.each_pair do |symbol, iso_code| + Monetize::OptimisticParser::CURRENCY_SYMBOLS.each_pair do |symbol, iso_code| context iso_code do let(:currency) { Money::Currency.find(iso_code) } let(:amount) { 5_95 } From 20c1ce0485ded689308417fb32484bea9141090d Mon Sep 17 00:00:00 2001 From: antstorm Date: Wed, 14 Aug 2024 13:00:10 +0100 Subject: [PATCH 2/9] Extract abstract Monetize::Parser class --- lib/monetize/optimistic_parser.rb | 37 ++++--------------------------- lib/monetize/parser.rb | 37 +++++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+), 33 deletions(-) create mode 100644 lib/monetize/parser.rb diff --git a/lib/monetize/optimistic_parser.rb b/lib/monetize/optimistic_parser.rb index 29b07a3b0..bceb4e99b 100644 --- a/lib/monetize/optimistic_parser.rb +++ b/lib/monetize/optimistic_parser.rb @@ -1,38 +1,9 @@ # encoding: utf-8 +require 'monetize/parser' + module Monetize - class OptimisticParser - CURRENCY_SYMBOLS = { - '$' => 'USD', - '€' => 'EUR', - '£' => 'GBP', - '₤' => 'GBP', - 'R$' => 'BRL', - 'RM' => 'MYR', - 'Rp' => 'IDR', - 'R' => 'ZAR', - '¥' => 'JPY', - 'C$' => 'CAD', - '₼' => 'AZN', - '元' => 'CNY', - 'Kč' => 'CZK', - 'Ft' => 'HUF', - '₹' => 'INR', - '₽' => 'RUB', - '₺' => 'TRY', - '₴' => 'UAH', - 'Fr' => 'CHF', - 'zł' => 'PLN', - '₸' => 'KZT', - "₩" => 'KRW', - 'S$' => 'SGD', - 'HK$'=> 'HKD', - 'NT$'=> 'TWD', - '₱' => 'PHP', - } - - MULTIPLIER_SUFFIXES = { 'K' => 3, 'M' => 6, 'B' => 9, 'T' => 12 } - MULTIPLIER_SUFFIXES.default = 0 + class OptimisticParser < Parser MULTIPLIER_REGEXP = Regexp.new(format('^(.*?\d)(%s)\b([^\d]*)$', MULTIPLIER_SUFFIXES.keys.join('|')), 'i') DEFAULT_DECIMAL_MARK = '.'.freeze @@ -76,7 +47,7 @@ def to_big_decimal(value) def parse_currency computed_currency = nil computed_currency = input[/[A-Z]{2,3}/] - computed_currency = nil unless Monetize::OptimisticParser::CURRENCY_SYMBOLS.value?(computed_currency) + computed_currency = nil unless CURRENCY_SYMBOLS.value?(computed_currency) computed_currency ||= compute_currency if assume_from_symbol? diff --git a/lib/monetize/parser.rb b/lib/monetize/parser.rb new file mode 100644 index 000000000..85b5d7140 --- /dev/null +++ b/lib/monetize/parser.rb @@ -0,0 +1,37 @@ +# encoding: utf-8 + +module Monetize + class Parser + CURRENCY_SYMBOLS = { + '$' => 'USD', + '€' => 'EUR', + '£' => 'GBP', + '₤' => 'GBP', + 'R$' => 'BRL', + 'RM' => 'MYR', + 'Rp' => 'IDR', + 'R' => 'ZAR', + '¥' => 'JPY', + 'C$' => 'CAD', + '₼' => 'AZN', + '元' => 'CNY', + 'Kč' => 'CZK', + 'Ft' => 'HUF', + '₹' => 'INR', + '₽' => 'RUB', + '₺' => 'TRY', + '₴' => 'UAH', + 'Fr' => 'CHF', + 'zł' => 'PLN', + '₸' => 'KZT', + "₩" => 'KRW', + 'S$' => 'SGD', + 'HK$'=> 'HKD', + 'NT$'=> 'TWD', + '₱' => 'PHP', + } + + MULTIPLIER_SUFFIXES = { 'K' => 3, 'M' => 6, 'B' => 9, 'T' => 12 } + MULTIPLIER_SUFFIXES.default = 0 + end +end From 9d9bff61d728e8a73a136c2f770bc1a61ab9eb61 Mon Sep 17 00:00:00 2001 From: antstorm Date: Wed, 14 Aug 2024 15:58:26 +0100 Subject: [PATCH 3/9] Specify Monetize::Parser interface --- lib/monetize/optimistic_parser.rb | 6 ++++-- lib/monetize/parser.rb | 8 ++++++++ 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/lib/monetize/optimistic_parser.rb b/lib/monetize/optimistic_parser.rb index bceb4e99b..f64cb79f5 100644 --- a/lib/monetize/optimistic_parser.rb +++ b/lib/monetize/optimistic_parser.rb @@ -36,14 +36,16 @@ def parse private + private + + attr_reader :input, :fallback_currency, :options + def to_big_decimal(value) BigDecimal(value) rescue ::ArgumentError => err fail ParseError, err.message end - attr_reader :input, :fallback_currency, :options - def parse_currency computed_currency = nil computed_currency = input[/[A-Z]{2,3}/] diff --git a/lib/monetize/parser.rb b/lib/monetize/parser.rb index 85b5d7140..d961fffbc 100644 --- a/lib/monetize/parser.rb +++ b/lib/monetize/parser.rb @@ -33,5 +33,13 @@ class Parser MULTIPLIER_SUFFIXES = { 'K' => 3, 'M' => 6, 'B' => 9, 'T' => 12 } MULTIPLIER_SUFFIXES.default = 0 + + def initialize(input, fallback_currency = Money.default_currency, options = {}) + raise NotImplementedError, 'Monetize::Parser subclasses must implement #initialize' + end + + def parse + raise NotImplementedError, 'Monetize::Parser subclasses must implement #parse' + end end end From 1b9e607612e8f68343f8a96aac790a41ae6cea9b Mon Sep 17 00:00:00 2001 From: antstorm Date: Wed, 14 Aug 2024 16:55:41 +0100 Subject: [PATCH 4/9] Support registering multiple parsers --- lib/monetize.rb | 29 ++++++++++++++- lib/monetize/optimistic_parser.rb | 2 +- lib/monetize/parser.rb | 2 +- spec/monetize_spec.rb | 60 +++++++++++++++++++++++++++++++ 4 files changed, 90 insertions(+), 3 deletions(-) diff --git a/lib/monetize.rb b/lib/monetize.rb index 5946048c8..28dfbfa2f 100644 --- a/lib/monetize.rb +++ b/lib/monetize.rb @@ -26,6 +26,10 @@ class << self # human text that we're dealing with fractions of cents. attr_accessor :expect_whole_subunits + # Specify which of the previously registered parsers should be used when parsing an input + # unless overriden using the :parser keyword option for the .parse and parse! methods. + attr_accessor :default_parser + def parse(input, currency = Money.default_currency, options = {}) parse! input, currency, options rescue Error @@ -36,7 +40,7 @@ def parse!(input, currency = Money.default_currency, options = {}) return input if input.is_a?(Money) return from_numeric(input, currency) if input.is_a?(Numeric) - parser = Monetize::OptimisticParser.new(input, currency, options) + parser = fetch_parser(input, currency, options) amount, currency = parser.parse Money.from_amount(amount, currency) @@ -77,5 +81,28 @@ def extract_cents(input, currency = Money.default_currency) money = parse(input, currency) money.cents if money end + + # Registers a new parser class along with the default options. It can then be used by + # providing a :parser option when parsing an input or by specifying a default parser + # using Monetize.default_parser=. + def register_parser(name, klass, options = {}) + @parsers ||= {} + @parsers[name] = [klass, options] + end + + private + + attr_reader :parsers + + def fetch_parser(input, currency, options) + parser_name = options[:parser] || default_parser + parser_klass, parser_options = parsers.fetch(parser_name) do + raise ArgumentError, "Parser not registered: #{parser_name}" + end + parser_klass.new(input, currency, parser_options.merge(options)) + end end end + +Monetize.register_parser(:optimistic, Monetize::OptimisticParser) +Monetize.default_parser = :optimistic diff --git a/lib/monetize/optimistic_parser.rb b/lib/monetize/optimistic_parser.rb index f64cb79f5..dd1c41e16 100644 --- a/lib/monetize/optimistic_parser.rb +++ b/lib/monetize/optimistic_parser.rb @@ -8,7 +8,7 @@ class OptimisticParser < Parser DEFAULT_DECIMAL_MARK = '.'.freeze - def initialize(input, fallback_currency = Money.default_currency, options = {}) + def initialize(input, fallback_currency, options) @input = input.to_s.strip @fallback_currency = fallback_currency @options = options diff --git a/lib/monetize/parser.rb b/lib/monetize/parser.rb index d961fffbc..c002d0b63 100644 --- a/lib/monetize/parser.rb +++ b/lib/monetize/parser.rb @@ -34,7 +34,7 @@ class Parser MULTIPLIER_SUFFIXES = { 'K' => 3, 'M' => 6, 'B' => 9, 'T' => 12 } MULTIPLIER_SUFFIXES.default = 0 - def initialize(input, fallback_currency = Money.default_currency, options = {}) + def initialize(input, fallback_currency, options) raise NotImplementedError, 'Monetize::Parser subclasses must implement #initialize' end diff --git a/spec/monetize_spec.rb b/spec/monetize_spec.rb index f41a07d1e..14af6b230 100644 --- a/spec/monetize_spec.rb +++ b/spec/monetize_spec.rb @@ -36,6 +36,17 @@ } JSON + # Dummy parser that always returns an amount and currency specified via options + class TestParser < Monetize::Parser + def initialize(input, currency, options) + @options = options + end + + def parse + [@options[:amount], @options[:currency]] + end + end + describe '.parse' do it 'parses european-formatted inputs under 10EUR' do expect(Monetize.parse('EUR 5,95')).to eq Money.new(595, 'EUR') @@ -390,6 +401,12 @@ expect(Monetize.parse('£10.00')).to eq Money.new(10_00, 'GBP') end end + + context 'when specified parser does not exist' do + it 'returns nil' do + expect(Monetize.parse('100 USD', nil, parser: :foo)).to eq(nil) + end + end end describe '.parse!' do @@ -406,6 +423,14 @@ it 'raises ArgumentError with invalid format' do expect { Monetize.parse!('11..0') }.to raise_error Monetize::ParseError end + + context 'when specified parser does not exist' do + it 'raises ArgumentError' do + expect do + Monetize.parse!('100 USD', nil, parser: :foo) + end.to raise_error(Monetize::ArgumentError, 'Parser not registered: foo') + end + end end describe '.parse_collection' do @@ -630,4 +655,39 @@ expect(4.635.to_money).to eq '4.635'.to_money end end + + describe '.register_parser' do + it 'registers a new parser with a provided name' do + Monetize.register_parser(:test, TestParser, amount: 42, currency: 'GBP') + + expect(Monetize.parse!('test', nil, parser: :test)).to eq(Money.new(42_00, 'GBP')) + end + + it 'registers the same parser with a different name' do + Monetize.register_parser(:test_1, TestParser, amount: 1, currency: 'GBP') + Monetize.register_parser(:test_2, TestParser, amount: 2, currency: 'USD') + + expect(Monetize.parse!('test', nil, parser: :test_1)).to eq(Money.new(1_00, 'GBP')) + expect(Monetize.parse!('test', nil, parser: :test_2)).to eq(Money.new(2_00, 'USD')) + end + + it 'overrides existing parser with the same name' do + Monetize.register_parser(:test, TestParser, amount: 42, currency: 'GBP') + Monetize.register_parser(:test, TestParser, amount: 99, currency: 'USD') + + expect(Monetize.parse!('test', nil, parser: :test)).to eq(Money.new(99_00, 'USD')) + end + end + + describe '.default_parser=' do + before { Monetize.register_parser(:test, TestParser, amount: 1, currency: 'USD') } + after { Monetize.default_parser = :optimistic } + + it 'specifies which parser to use by default' do + expect(Monetize.parse!('99 GBP')).to eq(Money.new(99_00, 'GBP')) + + Monetize.default_parser = :test + expect(Monetize.parse!('99 GBP')).to eq(Money.new(1_00, 'USD')) + end + end end From fb6ed965204323ab22bcaa6a72df5104c26e83a6 Mon Sep 17 00:00:00 2001 From: antstorm Date: Fri, 16 Aug 2024 14:37:23 +0100 Subject: [PATCH 5/9] Add StrictParser --- lib/monetize.rb | 2 + lib/monetize/strict_parser.rb | 152 ++++++++++++++++++++++++++++++++++ lib/monetize/tokenizer.rb | 78 +++++++++++++++++ spec/monetize_spec.rb | 2 + 4 files changed, 234 insertions(+) create mode 100644 lib/monetize/strict_parser.rb create mode 100644 lib/monetize/tokenizer.rb diff --git a/lib/monetize.rb b/lib/monetize.rb index 28dfbfa2f..361b1bb81 100644 --- a/lib/monetize.rb +++ b/lib/monetize.rb @@ -5,6 +5,7 @@ require 'monetize/errors' require 'monetize/version' require 'monetize/optimistic_parser' +require 'monetize/strict_parser' require 'monetize/collection' module Monetize @@ -105,4 +106,5 @@ def fetch_parser(input, currency, options) end Monetize.register_parser(:optimistic, Monetize::OptimisticParser) +Monetize.register_parser(:strict, Monetize::StrictParser) Monetize.default_parser = :optimistic diff --git a/lib/monetize/strict_parser.rb b/lib/monetize/strict_parser.rb new file mode 100644 index 000000000..6398adb52 --- /dev/null +++ b/lib/monetize/strict_parser.rb @@ -0,0 +1,152 @@ +require 'monetize/parser' +require 'monetize/tokenizer' + +module Monetize + class StrictParser < Parser + # TODO: perform exhaustive match + # TODO: error subclasses with detailed explanation + # TODO: check if decimal mark is a thousands separator (1,000 USD) + # TODO: switch to using allowed format as strings for added flexibility + + # Some advanced regexp tips to understand the next bit of code: + # "?:" - makes the group non-capturing, excluding it from the resulting match data + # "?" - creates a named capture + # "\k" - backreferences a named capture + # "?!" - negative lookahead (next character(-s) can't be the contents of this group) + AMOUNT_REGEXP = %r{ + ^ + (?: # whole units + (?: # try to capture units separated by thousands + \d{1,3} # must start with 3 or less whole numbers + (?:(?[\.\ ,])\d{3})? # first occurance of separated thousands, captures the separator + (?:\k\d{3})* # other iterations with a backreference to the first separator + ) + |\d+ # fallback to non thousands-separated units + ) + (?: # this group captures subunits + (?!\k) # avoid thousands separator used to separate decimals + (?[\.,]) # captured decimal separator + \d+ # subunits + )? + $ + }ix.freeze + + def initialize(input, fallback_currency = Money.default_currency, options = {}) + @input = input.to_s + @options = options + @fallback_currency = Money::Currency.wrap(fallback_currency) + # This shouldn't be here, however String#to_money defaults currency to nil. Ideally we want + # the default to always be Money.default_currency unless specified. In that case an explicit + # nil would indicate that the currency must be determined from the input. + @fallback_currency ||= Money.default_currency + end + + def parse + result = Tokenizer.new(input, options).process + + unless ALLOWED_FORMATS.include?(result.map(&:first)) + raise ParseError, "invalid input - #{result.map(&:first)}" + end + + amount = result.find { |token| token.type == :amount } + sign = result.find { |token| token.type == :sign } + symbol = result.find { |token| token.type == :symbol } + currency_iso = result.find { |token| token.type == :currency_iso } + + currency = + if currency_iso + parse_currency_iso(currency_iso.match.to_s) + elsif symbol && assume_from_symbol? + parse_symbol(symbol.match.to_s) + else + fallback_currency + end + + amount = parse_amount(currency, amount.match, sign&.match) + + [amount, currency] + end + + private + + ALLOWED_FORMATS = [ + [:amount], # 9.99 + [:sign, :amount], # -9.99 + [:symbol, :amount], # £9.99 + [:sign, :symbol, :amount], # -£9.99 + [:symbol, :sign, :amount], # £-9.99 + [:symbol, :amount, :sign], # £9.99- + [:amount, :symbol], # 9.99£ + [:sign, :amount, :symbol], # -9.99£ + [:currency_iso, :amount], # GBP 9.99 + [:currency_iso, :sign, :amount], # GBP -9.99 + [:amount, :currency_iso], # 9.99 GBP + [:sign, :amount, :currency_iso], # -9.99 GBP + [:symbol, :amount, :currency_iso], # £9.99 GBP + [:sign, :symbol, :amount, :currency_iso], # -£9.99 GBP + ].freeze + + attr_reader :input, :fallback_currency, :options + + def parse_amount(currency, amount_match, sign) + amount = amount_match[:amount] + multiplier = amount_match[:multiplier] + + matches = amount.match(AMOUNT_REGEXP) + + unless matches + raise ParseError, 'the provided input does not contain a valid amount' + end + + thousands_separator = matches[:ts] + decimal_separator = matches[:ds] + + # A single thousands separator without a decimal separator might be considered a decimal + # separator in some cases (e.g. '1.001 TND' is likely 1.001 and not 1001). Here we need to + # check if the currency allows 3+ subunits. + if thousands_separator && + !decimal_separator && + currency.subunit_to_unit > 100 && + amount.count(thousands_separator) == 1 + _, possible_subunits = amount.split(thousands_separator) + + if possible_subunits.length > 2 + decimal_separator = thousands_separator + thousands_separator = nil + end + end + + amount.gsub!(thousands_separator, '') if thousands_separator + amount.gsub!(decimal_separator, '.') if decimal_separator + amount = amount.to_f + + amount = apply_multiplier(amount, multiplier) + amount = apply_sign(amount, sign.to_s) + + amount + end + + def parse_symbol(symbol) + Money::Currency.wrap(CURRENCY_SYMBOLS[symbol]) + end + + def parse_currency_iso(currency_iso) + Money::Currency.wrap(currency_iso) + end + + def assume_from_symbol? + options.fetch(:assume_from_symbol) { Monetize.assume_from_symbol } + end + + def apply_multiplier(num, multiplier) + return num unless multiplier + + exponent = MULTIPLIER_SUFFIXES[multiplier.to_s.upcase] + num * 10**exponent + end + + def apply_sign(num, sign) + sign == '-' ? num * -1 : num + end + end +end diff --git a/lib/monetize/tokenizer.rb b/lib/monetize/tokenizer.rb new file mode 100644 index 000000000..ea2c95a54 --- /dev/null +++ b/lib/monetize/tokenizer.rb @@ -0,0 +1,78 @@ +require 'monetize/parser' + +module Monetize + class Tokenizer + SYMBOLS = Monetize::Parser::CURRENCY_SYMBOLS.keys.map { |symbol| Regexp.escape(symbol) }.freeze + THOUSAND_SEPARATORS = /[\.\ ,]/.freeze + DECIMAL_MARKS = /[\.,]/.freeze + MULTIPLIERS = Monetize::Parser::MULTIPLIER_SUFFIXES.keys.join('|').freeze + + REPLACEMENT_SYMBOL = '§'.freeze + SYMBOL_REGEXP = Regexp.new(SYMBOLS.join('|')).freeze + CURRENCY_ISO_REGEXP = /(? # amount group + \d+ # starts with at least one digit + (?:#{THOUSAND_SEPARATORS}\d{3})* # separated into groups of 3 digits by a thousands separator + (?!\d) # not followed by a digit + (?:#{DECIMAL_MARKS}\d+)? # might have decimal mark followed by decimal part + ) + (?#{MULTIPLIERS})? # optional multiplier + }ix.freeze + + class Token < Struct.new(:type, :match); end + + def initialize(input, options = {}) + @original_input = input + @options = options + end + + def process + # matches are removed from the input string to avoid overlapping matches + input = original_input.dup + result = [] + + result += match(input, :currency_iso, CURRENCY_ISO_REGEXP) + result += match(input, :symbol, SYMBOL_REGEXP) + result += match(input, :sign, SIGN_REGEXP) + result += match(input, :amount, AMOUNT_REGEXP) + + # allow only unmatched empty spaces, nothing else + unless input.gsub(REPLACEMENT_SYMBOL, '').strip.empty? + raise ParseError, 'non-exhaustive match' + end + + result.sort_by { |token| token.match.offset(0).first } + end + + private + + attr_reader :original_input, :options + + def match(input, type, regexp) + tokens = [] + # TODO: replace with gsub to avoid another loop later? + input.scan(regexp) { tokens << Token.new(type, Regexp.last_match) } + + # Replace the matches from the input with § to avoid over-matching + tokens.each do |token| + offset = token.match.offset(0) + input[offset.first..(offset.last - 1)] = REPLACEMENT_SYMBOL * token.match.to_s.length + end + + tokens + end + + def preview(result) + preview_input = original_input.dup + result.reverse.each do |token| + offset = token.match.offset(0) + preview_input.slice!(offset.first, token.match.to_s.length) + preview_input.insert(offset.first, "<#{token.type}>") + end + + puts preview_input + end + end +end diff --git a/spec/monetize_spec.rb b/spec/monetize_spec.rb index 14af6b230..8003ba615 100644 --- a/spec/monetize_spec.rb +++ b/spec/monetize_spec.rb @@ -36,6 +36,8 @@ } JSON + before { Monetize.default_parser = :strict } + # Dummy parser that always returns an amount and currency specified via options class TestParser < Monetize::Parser def initialize(input, currency, options) From d3cf5fff324fc9dccaefa645bab51da86dd4303c Mon Sep 17 00:00:00 2001 From: antstorm Date: Fri, 16 Aug 2024 14:57:41 +0100 Subject: [PATCH 6/9] Temporary ENV variable to run tests with StrictParser --- .github/workflows/ruby.yml | 2 ++ spec/monetize_spec.rb | 2 -- spec/spec_helper.rb | 6 ++++++ 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ruby.yml b/.github/workflows/ruby.yml index 91f3d244d..9eadec655 100644 --- a/.github/workflows/ruby.yml +++ b/.github/workflows/ruby.yml @@ -32,3 +32,5 @@ jobs: bundler-cache: true # runs 'bundle install' and caches installed gems automatically - name: Run tests run: bundle exec rspec spec + - name: Run strict tests + run: MODE=strict bundle exec rspec spec diff --git a/spec/monetize_spec.rb b/spec/monetize_spec.rb index 8003ba615..14af6b230 100644 --- a/spec/monetize_spec.rb +++ b/spec/monetize_spec.rb @@ -36,8 +36,6 @@ } JSON - before { Monetize.default_parser = :strict } - # Dummy parser that always returns an amount and currency specified via options class TestParser < Monetize::Parser def initialize(input, currency, options) diff --git a/spec/spec_helper.rb b/spec/spec_helper.rb index 51f6235e2..ed318e121 100644 --- a/spec/spec_helper.rb +++ b/spec/spec_helper.rb @@ -4,4 +4,10 @@ RSpec.configure do |config| config.order = 'random' + if ENV['MODE'] == 'strict' + config.before(:each) do + Monetize.default_parser = :strict + end + end end + From 8e46dc0031b4c08607cb0f19c766bcf1bb9f98a6 Mon Sep 17 00:00:00 2001 From: antstorm Date: Fri, 16 Aug 2024 14:58:04 +0100 Subject: [PATCH 7/9] Optimize performance of the StrictParser --- lib/monetize/strict_parser.rb | 40 +++++++++++++++++------------------ lib/monetize/tokenizer.rb | 13 ++++++------ 2 files changed, 26 insertions(+), 27 deletions(-) diff --git a/lib/monetize/strict_parser.rb b/lib/monetize/strict_parser.rb index 6398adb52..97695823e 100644 --- a/lib/monetize/strict_parser.rb +++ b/lib/monetize/strict_parser.rb @@ -3,11 +3,11 @@ module Monetize class StrictParser < Parser - # TODO: perform exhaustive match - # TODO: error subclasses with detailed explanation - # TODO: check if decimal mark is a thousands separator (1,000 USD) # TODO: switch to using allowed format as strings for added flexibility + THOUSAND_SEPARATORS = /[\.\ ,]/.freeze + DECIMAL_MARKS = /[\.,]/.freeze + # Some advanced regexp tips to understand the next bit of code: # "?:" - makes the group non-capturing, excluding it from the resulting match data # "?" - creates a named capture @@ -15,18 +15,18 @@ class StrictParser < Parser # "?!" - negative lookahead (next character(-s) can't be the contents of this group) AMOUNT_REGEXP = %r{ ^ - (?: # whole units - (?: # try to capture units separated by thousands - \d{1,3} # must start with 3 or less whole numbers - (?:(?[\.\ ,])\d{3})? # first occurance of separated thousands, captures the separator - (?:\k\d{3})* # other iterations with a backreference to the first separator + (?: # whole units + (?: # try to capture units separated by thousands + \d{1,3} # must start with 3 or less whole numbers + (?:(?#{THOUSAND_SEPARATORS})\d{3})? # first occurance of separated thousands, captures the separator + (?:\k\d{3})* # other iterations with a the same exact separator ) - |\d+ # fallback to non thousands-separated units + |\d+ # fallback to non thousands-separated units ) - (?: # this group captures subunits - (?!\k) # avoid thousands separator used to separate decimals - (?[\.,]) # captured decimal separator - \d+ # subunits + (?: # this group captures subunits + (?!\k) # disallow captured thousands separator as decimals separator + (?#{DECIMAL_MARKS}) # captured decimal separator + \d+ # subunits )? $ }ix.freeze @@ -42,16 +42,16 @@ def initialize(input, fallback_currency = Money.default_currency, options = {}) end def parse - result = Tokenizer.new(input, options).process + tokens = Tokenizer.new(input, options).process - unless ALLOWED_FORMATS.include?(result.map(&:first)) - raise ParseError, "invalid input - #{result.map(&:first)}" + unless ALLOWED_FORMATS.include?(tokens.map(&:first)) + raise ParseError, "invalid input - #{tokens.map(&:first)}" end - amount = result.find { |token| token.type == :amount } - sign = result.find { |token| token.type == :sign } - symbol = result.find { |token| token.type == :symbol } - currency_iso = result.find { |token| token.type == :currency_iso } + amount = tokens.find { |token| token.type == :amount } + sign = tokens.find { |token| token.type == :sign } + symbol = tokens.find { |token| token.type == :symbol } + currency_iso = tokens.find { |token| token.type == :currency_iso } currency = if currency_iso diff --git a/lib/monetize/tokenizer.rb b/lib/monetize/tokenizer.rb index ea2c95a54..645f65d0d 100644 --- a/lib/monetize/tokenizer.rb +++ b/lib/monetize/tokenizer.rb @@ -52,13 +52,12 @@ def process def match(input, type, regexp) tokens = [] - # TODO: replace with gsub to avoid another loop later? - input.scan(regexp) { tokens << Token.new(type, Regexp.last_match) } - - # Replace the matches from the input with § to avoid over-matching - tokens.each do |token| - offset = token.match.offset(0) - input[offset.first..(offset.last - 1)] = REPLACEMENT_SYMBOL * token.match.to_s.length + input.gsub!(regexp) do + tokens << Token.new(type, Regexp.last_match) + # Replace the matches from the input with § to avoid overlapping matches. Stripping + # out the matches is dangerous because it can bring things irrelevant things together: + # '12USD34' will become '1234' after removing currency, which is NOT expected. + REPLACEMENT_SYMBOL * Regexp.last_match.to_s.length end tokens From aaaa0b086e333714acd8446eade8d45837e61a48 Mon Sep 17 00:00:00 2001 From: antstorm Date: Fri, 16 Aug 2024 15:05:24 +0100 Subject: [PATCH 8/9] Tidy up StrictParser --- lib/monetize/strict_parser.rb | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/lib/monetize/strict_parser.rb b/lib/monetize/strict_parser.rb index 97695823e..ef090c3c6 100644 --- a/lib/monetize/strict_parser.rb +++ b/lib/monetize/strict_parser.rb @@ -48,21 +48,19 @@ def parse raise ParseError, "invalid input - #{tokens.map(&:first)}" end - amount = tokens.find { |token| token.type == :amount } - sign = tokens.find { |token| token.type == :sign } - symbol = tokens.find { |token| token.type == :symbol } - currency_iso = tokens.find { |token| token.type == :currency_iso } + parts = Struct.new(:amount, :sign, :symbol, :currency_iso).new + tokens.each { |token| parts[token.type] = token } currency = - if currency_iso - parse_currency_iso(currency_iso.match.to_s) - elsif symbol && assume_from_symbol? - parse_symbol(symbol.match.to_s) + if parts.currency_iso + parse_currency_iso(parts.currency_iso.match.to_s) + elsif parts.symbol && assume_from_symbol? + parse_symbol(parts.symbol.match.to_s) else fallback_currency end - amount = parse_amount(currency, amount.match, sign&.match) + amount = parse_amount(currency, parts.amount.match, parts.sign&.match) [amount, currency] end From dc619e17a6ef688f9256a729b00c4d3f1622a53f Mon Sep 17 00:00:00 2001 From: antstorm Date: Fri, 16 Aug 2024 15:11:15 +0100 Subject: [PATCH 9/9] Add class comment to StrictParser --- lib/monetize/strict_parser.rb | 3 +++ 1 file changed, 3 insertions(+) diff --git a/lib/monetize/strict_parser.rb b/lib/monetize/strict_parser.rb index ef090c3c6..9fb54ba22 100644 --- a/lib/monetize/strict_parser.rb +++ b/lib/monetize/strict_parser.rb @@ -2,6 +2,9 @@ require 'monetize/tokenizer' module Monetize + # Unlike the OptimisticParser which aims for the best effort match, the StrictParser is + # designed to only parse well formatted inputs (see ALLOWED_FORMATS) and ignore anything + # that looks off. class StrictParser < Parser # TODO: switch to using allowed format as strings for added flexibility