diff options
author | Olivier Chafik <ochafik@users.noreply.github.com> | 2024-06-06 10:07:06 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-06-06 10:07:06 +0100 |
commit | 55b2d0849d3ec9e45e4a4d9e480f5fa7977872a6 (patch) | |
tree | aef3a9e9a051670751912daf0d06a76783b44b45 /examples | |
parent | f5d7b268ec4bf8628aa6ccc9f6631d0230dde76f (diff) |
grammars: x{min,max} repetition operator (#6640)
* grammars: x{min,max} repetition operator + tweak +/*/? to avoid duplication of original over alternates
* grammars: handle `x{n}` and fix `x{n,n}`
* grammars: document new repetition operators
* grammars: uniform use of int for min & max
* grammars: refactor parser test
* grammar: parsing tests w/ natural pretty print of updated expectations
* grammars: much prettier print of expectations (+ TEST_GRAMMAR_PARSER_PRINT_ALL=1 to force all)
* grammars: improve test pretty print again
* grammars: pretty print rules and chars
* grammars: fix copy rule skipping
* grammars: disallow `a{,}` (not allowed in regexps)
* Update common/grammar-parser.cpp
Co-authored-by: Clint Herron <hanclinto@gmail.com>
* grammars: fix copy rule skipping (again) & display of expectations
* grammars: more test cases
* grammars: update reps parsing to bring ? / * / + closer to before
* json: use new GBNF repetitions{m,n} syntax
* grammars: update performance gotchas w/ repetition advice
* Update examples/json_schema_to_grammar.py
Co-authored-by: Clint Herron <hanclinto@gmail.com>
* Update examples/server/public/json-schema-to-grammar.mjs
Co-authored-by: Clint Herron <hanclinto@gmail.com>
* grammars: comment on rule repetitions
* grammars: ensure unambiguous number alternatives
* grammar: nit typo switched error msgs
* grammar: nit numbering in comment
* json: update numeric rule to be unambiguous
* Apply suggestions from code review
Co-authored-by: Clint Herron <hanclinto@gmail.com>
* Update examples/server/public/json-schema-to-grammar.mjs
Co-authored-by: Clint Herron <hanclinto@gmail.com>
* json: fix integral-part
* grammar: add repetition tests
---------
Co-authored-by: Clint Herron <hanclinto@gmail.com>
Diffstat (limited to 'examples')
-rwxr-xr-x | examples/json_schema_to_grammar.py | 68 | ||||
-rw-r--r-- | examples/pydantic_models_to_grammar.py | 2 | ||||
-rw-r--r-- | examples/server/public/json-schema-to-grammar.mjs | 67 |
3 files changed, 36 insertions, 101 deletions
diff --git a/examples/json_schema_to_grammar.py b/examples/json_schema_to_grammar.py index 826cd3f7..7d889c3f 100755 --- a/examples/json_schema_to_grammar.py +++ b/examples/json_schema_to_grammar.py @@ -6,52 +6,22 @@ import re import sys from typing import Any, Dict, List, Set, Tuple, Union -def _build_repetition(item_rule, min_items, max_items, separator_rule=None, item_rule_is_literal=False): - if not separator_rule: - if min_items == 0 and max_items == 1: - return f'{item_rule}?' - elif min_items == 1 and max_items is None: - return f'{item_rule}+' - - result = '' - - if min_items > 0: - if item_rule_is_literal and separator_rule is None: - result = '"' + (item_rule[1:-1] * min_items) + '"' - else: - result = (f' {separator_rule} ' if separator_rule else ' ').join([item_rule] * min_items) - - def opt_repetitions(up_to_n, prefix_with_sep=False): - ''' - - n=4, no sep: '(a (a (a (a)?)?)?)?' - - n=4, sep=',', prefix: '("," a ("," a ("," a ("," a)?)?)?)?' - - n=4, sep=',', no prefix: '(a ("," a ("," a ("," a)?)?)?)?' - ''' - - content = f'{separator_rule} {item_rule}' if prefix_with_sep and separator_rule else item_rule - if up_to_n == 0: - return '' - elif up_to_n == 1: - return f'({content})?' - elif separator_rule and not prefix_with_sep: - return f'({content} {opt_repetitions(up_to_n - 1, prefix_with_sep=True)})?' - else: - return (f'({content} ' * up_to_n).rstrip() + (')?' * up_to_n) - if min_items > 0 and max_items != min_items: - result += ' ' +def _build_repetition(item_rule, min_items, max_items, separator_rule=None): - if max_items is not None: - result += opt_repetitions(max_items - min_items, prefix_with_sep=min_items > 0) - else: - item_operator = f'({separator_rule + " " if separator_rule else ""}{item_rule})' + if min_items == 0 and max_items == 1: + return f'{item_rule}?' - if min_items == 0 and separator_rule: - result = f'({item_rule} {item_operator}*)?' + if not separator_rule: + if min_items == 1 and max_items is None: + return f'{item_rule}+' + elif min_items == 0 and max_items is None: + return f'{item_rule}*' else: - result += f'{item_operator}*' + return f'{item_rule}{{{min_items},{max_items if max_items is not None else ""}}}' - return result + result = item_rule + ' ' + _build_repetition(f'({separator_rule} {item_rule})', min_items - 1 if min_items > 0 else 0, max_items - 1 if max_items is not None else None) + return f'({result})?' if min_items == 0 else result class BuiltinRule: @@ -59,31 +29,29 @@ class BuiltinRule: self.content = content self.deps = deps or [] -_up_to_15_digits = _build_repetition('[0-9]', 0, 15) - # whitespace is constrained to a single space char to prevent model "running away" in # whitespace. Also maybe improves generation quality? SPACE_RULE = '" "?' PRIMITIVE_RULES = { 'boolean' : BuiltinRule('("true" | "false") space', []), - 'decimal-part' : BuiltinRule('[0-9] ' + _up_to_15_digits, []), - 'integral-part': BuiltinRule('[0-9] | [1-9] ' + _up_to_15_digits, []), + 'decimal-part' : BuiltinRule('[0-9]{1,16}', []), + 'integral-part': BuiltinRule('[0] | [1-9] [0-9]{0,15}', []), 'number' : BuiltinRule('("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space', ['integral-part', 'decimal-part']), 'integer' : BuiltinRule('("-"? integral-part) space', ['integral-part']), 'value' : BuiltinRule('object | array | string | number | boolean | null', ['object', 'array', 'string', 'number', 'boolean', 'null']), 'object' : BuiltinRule('"{" space ( string ":" space value ("," space string ":" space value)* )? "}" space', ['string', 'value']), 'array' : BuiltinRule('"[" space ( value ("," space value)* )? "]" space', ['value']), - 'uuid' : BuiltinRule(r'"\"" ' + ' "-" '.join('[0-9a-fA-F]' * n for n in [8, 4, 4, 4, 12]) + r' "\"" space', []), - 'char' : BuiltinRule(r'[^"\\] | "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])', []), + 'uuid' : BuiltinRule(r'"\"" [0-9a-fA-F]{8} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{12} "\"" space', []), + 'char' : BuiltinRule(r'[^"\\] | "\\" (["\\/bfnrt] | "u" [0-9a-fA-F]{4})', []), 'string' : BuiltinRule(r'"\"" char* "\"" space', ['char']), 'null' : BuiltinRule('"null" space', []), } # TODO: support "uri", "email" string formats STRING_FORMAT_RULES = { - 'date' : BuiltinRule('[0-9] [0-9] [0-9] [0-9] "-" ( "0" [1-9] | "1" [0-2] ) "-" ( \"0\" [1-9] | [1-2] [0-9] | "3" [0-1] )', []), - 'time' : BuiltinRule('([01] [0-9] | "2" [0-3]) ":" [0-5] [0-9] ":" [0-5] [0-9] ( "." [0-9] [0-9] [0-9] )? ( "Z" | ( "+" | "-" ) ( [01] [0-9] | "2" [0-3] ) ":" [0-5] [0-9] )', []), + 'date' : BuiltinRule('[0-9]{4} "-" ( "0" [1-9] | "1" [0-2] ) "-" ( \"0\" [1-9] | [1-2] [0-9] | "3" [0-1] )', []), + 'time' : BuiltinRule('([01] [0-9] | "2" [0-3]) ":" [0-5] [0-9] ":" [0-5] [0-9] ( "." [0-9]{3} )? ( "Z" | ( "+" | "-" ) ( [01] [0-9] | "2" [0-3] ) ":" [0-5] [0-9] )', []), 'date-time' : BuiltinRule('date "T" time', ['date', 'time']), 'date-string' : BuiltinRule('"\\"" date "\\"" space', ['date']), 'time-string' : BuiltinRule('"\\"" time "\\"" space', ['time']), @@ -333,7 +301,7 @@ class SchemaConverter: sub_rule_ids[sub] = id sub = id - seq[-1] = (_build_repetition(f'"{sub}"' if sub_is_literal else sub, min_times, max_times, item_rule_is_literal=sub_is_literal), False) + seq[-1] = (_build_repetition(f'"{sub}"' if sub_is_literal else sub, min_times, max_times), False) else: literal = '' while i < length: diff --git a/examples/pydantic_models_to_grammar.py b/examples/pydantic_models_to_grammar.py index 9acc7cc6..f029c73a 100644 --- a/examples/pydantic_models_to_grammar.py +++ b/examples/pydantic_models_to_grammar.py @@ -624,7 +624,7 @@ string ::= "\"" ( "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]) )* "\"" ws ws ::= ([ \t\n] ws)? -float ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? ws +float ::= ("-"? ([0] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? ws integer ::= [0-9]+""" diff --git a/examples/server/public/json-schema-to-grammar.mjs b/examples/server/public/json-schema-to-grammar.mjs index 8e0be1b4..cef11eab 100644 --- a/examples/server/public/json-schema-to-grammar.mjs +++ b/examples/server/public/json-schema-to-grammar.mjs @@ -2,57 +2,26 @@ const SPACE_RULE = '" "?'; function _buildRepetition(itemRule, minItems, maxItems, opts={}) { + if (minItems === 0 && maxItems === 1) { + return `${itemRule}?`; + } + + const separatorRule = opts.separatorRule ?? ''; const itemRuleIsLiteral = opts.itemRuleIsLiteral ?? false if (separatorRule === '') { - if (minItems === 0 && maxItems === 1) { - return `${itemRule}?`; - } else if (minItems === 1 && maxItems === undefined) { + if (minItems === 1 && maxItems === undefined) { return `${itemRule}+`; - } - } - - let result = ''; - if (minItems > 0) { - if (itemRuleIsLiteral && separatorRule === '') { - result = `"${itemRule.slice(1, -1).repeat(minItems)}"`; + } else if (minItems === 0 && maxItems === undefined) { + return `${itemRule}*`; } else { - result = Array.from({ length: minItems }, () => itemRule) - .join(separatorRule !== '' ? ` ${separatorRule} ` : ' '); + return `${itemRule}{${minItems},${maxItems !== undefined ? maxItems : ''}}`; } } - const optRepetitions = (upToN, prefixWithSep=false) => { - const content = separatorRule !== '' && prefixWithSep ? `${separatorRule} ${itemRule}` : itemRule; - if (upToN === 0) { - return ''; - } else if (upToN === 1) { - return `(${content})?`; - } else if (separatorRule !== '' && !prefixWithSep) { - return `(${content} ${optRepetitions(upToN - 1, true)})?`; - } else { - return Array.from({ length: upToN }, () => `(${content}`).join(' ').trim() + Array.from({ length: upToN }, () => ')?').join(''); - } - }; - - if (minItems > 0 && maxItems !== minItems) { - result += ' '; - } - - if (maxItems !== undefined) { - result += optRepetitions(maxItems - minItems, minItems > 0); - } else { - const itemOperator = `(${separatorRule !== '' ? separatorRule + ' ' : ''}${itemRule})`; - - if (minItems === 0 && separatorRule !== '') { - result = `(${itemRule} ${itemOperator}*)?`; - } else { - result += `${itemOperator}*`; - } - } - - return result; + const result = itemRule + ' ' + _buildRepetition(`(${separatorRule} ${itemRule})`, minItems > 0 ? minItems - 1 : 0, maxItems !== undefined ? maxItems - 1 : undefined); + return minItems === 0 ? `(${result})?` : result; } class BuiltinRule { @@ -62,27 +31,25 @@ class BuiltinRule { } } -const UP_TO_15_DIGITS = _buildRepetition('[0-9]', 0, 15); - const PRIMITIVE_RULES = { boolean : new BuiltinRule('("true" | "false") space', []), - 'decimal-part' : new BuiltinRule('[0-9] ' + UP_TO_15_DIGITS, []), - 'integral-part': new BuiltinRule('[0-9] | [1-9] ' + UP_TO_15_DIGITS, []), + 'decimal-part' : new BuiltinRule('[0-9]{1,16}', []), + 'integral-part': new BuiltinRule('[0] | [1-9] [0-9]{0,15}', []), number : new BuiltinRule('("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space', ['integral-part', 'decimal-part']), integer : new BuiltinRule('("-"? integral-part) space', ['integral-part']), value : new BuiltinRule('object | array | string | number | boolean | null', ['object', 'array', 'string', 'number', 'boolean', 'null']), object : new BuiltinRule('"{" space ( string ":" space value ("," space string ":" space value)* )? "}" space', ['string', 'value']), array : new BuiltinRule('"[" space ( value ("," space value)* )? "]" space', ['value']), - uuid : new BuiltinRule('"\\"" ' + [8, 4, 4, 4, 12].map(n => [...new Array(n)].map(_ => '[0-9a-fA-F]').join('')).join(' "-" ') + ' "\\"" space', []), - char : new BuiltinRule(`[^"\\\\] | "\\\\" (["\\\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])`, []), + uuid : new BuiltinRule('"\\"" [0-9a-fA-F]{8} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{12} "\\"" space', []), + char : new BuiltinRule(`[^"\\\\] | "\\\\" (["\\\\/bfnrt] | "u" [0-9a-fA-F]{4})`, []), string : new BuiltinRule(`"\\"" char* "\\"" space`, ['char']), null : new BuiltinRule('"null" space', []), }; // TODO: support "uri", "email" string formats const STRING_FORMAT_RULES = { - 'date' : new BuiltinRule('[0-9] [0-9] [0-9] [0-9] "-" ( "0" [1-9] | "1" [0-2] ) "-" ( \"0\" [1-9] | [1-2] [0-9] | "3" [0-1] )', []), - 'time' : new BuiltinRule('([01] [0-9] | "2" [0-3]) ":" [0-5] [0-9] ":" [0-5] [0-9] ( "." [0-9] [0-9] [0-9] )? ( "Z" | ( "+" | "-" ) ( [01] [0-9] | "2" [0-3] ) ":" [0-5] [0-9] )', []), + 'date' : new BuiltinRule('[0-9]{4} "-" ( "0" [1-9] | "1" [0-2] ) "-" ( \"0\" [1-9] | [1-2] [0-9] | "3" [0-1] )', []), + 'time' : new BuiltinRule('([01] [0-9] | "2" [0-3]) ":" [0-5] [0-9] ":" [0-5] [0-9] ( "." [0-9]{3} )? ( "Z" | ( "+" | "-" ) ( [01] [0-9] | "2" [0-3] ) ":" [0-5] [0-9] )', []), 'date-time' : new BuiltinRule('date "T" time', ['date', 'time']), 'date-string' : new BuiltinRule('"\\"" date "\\"" space', ['date']), 'time-string' : new BuiltinRule('"\\"" time "\\"" space', ['time']), |