summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorOlivier Chafik <ochafik@users.noreply.github.com>2024-06-11 01:00:30 +0100
committerGitHub <noreply@github.com>2024-06-11 01:00:30 +0100
commit396b18dfec2c56846e80362db70af09b9e1d70ba (patch)
tree2081089863e80e93a906424c8fa15ac4e3fa091f
parent864a99e7a01d9422d2f55618dbe62c8099a2175c (diff)
`json`: document schema conversion in GBNF readme, align manual grammar examples & converters (#7841)
* json: fix char pattern in grammar converters * json: prevent number precision & whitespace runaways in example grammars * json: add doc to grammar readme
-rw-r--r--common/json-schema-to-grammar.cpp2
-rwxr-xr-xexamples/json_schema_to_grammar.py2
-rw-r--r--examples/server/public/json-schema-to-grammar.mjs2
-rw-r--r--grammars/README.md39
-rw-r--r--grammars/json.gbnf6
-rw-r--r--grammars/json_arr.gbnf6
-rwxr-xr-xtests/test-json-schema-to-grammar.cpp38
7 files changed, 67 insertions, 28 deletions
diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp
index 737bae27..11221a32 100644
--- a/common/json-schema-to-grammar.cpp
+++ b/common/json-schema-to-grammar.cpp
@@ -57,7 +57,7 @@ std::unordered_map<std::string, BuiltinRule> PRIMITIVE_RULES = {
{"object", {"\"{\" space ( string \":\" space value (\",\" space string \":\" space value)* )? \"}\" space", {"string", "value"}}},
{"array", {"\"[\" space ( value (\",\" space value)* )? \"]\" space", {"value"}}},
{"uuid", {"\"\\\"\" [0-9a-fA-F]{8} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{12} \"\\\"\" space", {}}},
- {"char", {"[^\"\\\\] | \"\\\\\" ([\"\\\\/bfnrt] | \"u\" [0-9a-fA-F]{4})", {}}},
+ {"char", {"[^\"\\\\\\x7F\\x00-\\x1F] | [\\\\] ([\"\\\\bfnrt] | \"u\" [0-9a-fA-F]{4})", {}}},
{"string", {"\"\\\"\" char* \"\\\"\" space", {"char"}}},
{"null", {"\"null\" space", {}}},
};
diff --git a/examples/json_schema_to_grammar.py b/examples/json_schema_to_grammar.py
index 7d889c3f..cd444d01 100755
--- a/examples/json_schema_to_grammar.py
+++ b/examples/json_schema_to_grammar.py
@@ -43,7 +43,7 @@ PRIMITIVE_RULES = {
'object' : BuiltinRule('"{" space ( string ":" space value ("," space string ":" space value)* )? "}" space', ['string', 'value']),
'array' : BuiltinRule('"[" space ( value ("," space value)* )? "]" space', ['value']),
'uuid' : BuiltinRule(r'"\"" [0-9a-fA-F]{8} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{12} "\"" space', []),
- 'char' : BuiltinRule(r'[^"\\] | "\\" (["\\/bfnrt] | "u" [0-9a-fA-F]{4})', []),
+ 'char' : BuiltinRule(r'[^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})', []),
'string' : BuiltinRule(r'"\"" char* "\"" space', ['char']),
'null' : BuiltinRule('"null" space', []),
}
diff --git a/examples/server/public/json-schema-to-grammar.mjs b/examples/server/public/json-schema-to-grammar.mjs
index cef11eab..dc246839 100644
--- a/examples/server/public/json-schema-to-grammar.mjs
+++ b/examples/server/public/json-schema-to-grammar.mjs
@@ -41,7 +41,7 @@ const PRIMITIVE_RULES = {
object : new BuiltinRule('"{" space ( string ":" space value ("," space string ":" space value)* )? "}" space', ['string', 'value']),
array : new BuiltinRule('"[" space ( value ("," space value)* )? "]" space', ['value']),
uuid : new BuiltinRule('"\\"" [0-9a-fA-F]{8} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{12} "\\"" space', []),
- char : new BuiltinRule(`[^"\\\\] | "\\\\" (["\\\\/bfnrt] | "u" [0-9a-fA-F]{4})`, []),
+ char : new BuiltinRule(`[^"\\\\\\x7F\\x00-\\x1F] | [\\\\] (["\\\\bfnrt] | "u" [0-9a-fA-F]{4})`, []),
string : new BuiltinRule(`"\\"" char* "\\"" space`, ['char']),
null : new BuiltinRule('"null" space', []),
};
diff --git a/grammars/README.md b/grammars/README.md
index 3ffc7cec..2ec21a4c 100644
--- a/grammars/README.md
+++ b/grammars/README.md
@@ -94,6 +94,8 @@ This guide provides a brief overview. Check out the GBNF files in this directory
./main -m <model> --grammar-file grammars/some-grammar.gbnf -p 'Some prompt'
```
+`llama.cpp` can also convert JSON schemas to grammars either ahead of time or at each request, see below.
+
## Troubleshooting
Grammars currently have performance gotchas (see https://github.com/ggerganov/llama.cpp/issues/4218).
@@ -103,3 +105,40 @@ Grammars currently have performance gotchas (see https://github.com/ggerganov/ll
A common pattern is to allow repetitions of a pattern `x` up to N times.
While semantically correct, the syntax `x? x? x?.... x?` (with N repetitions) may result in extremely slow sampling. Instead, you can write `x{0,N}` (or `(x (x (x ... (x)?...)?)?)?` w/ N-deep nesting in earlier llama.cpp versions).
+
+## Using GBNF grammars
+
+You can use GBNF grammars:
+
+- In the [server](../examples/server)'s completion endpoints, passed as the `grammar` body field
+- In the [main](../examples/main) CLI, passed as the `--grammar` & `--grammar-file` flags
+- With the [gbnf-validator](../examples/gbnf-validator) tool, to test them against strings.
+
+## JSON Schemas → GBNF
+
+`llama.cpp` supports converting a subset of https://json-schema.org/ to GBNF grammars:
+
+- In the [server](../examples/server):
+ - For any completion endpoints, passed as the `json_schema` body field
+ - For the `/chat/completions` endpoint, passed inside the `result_format` body field (e.g. `{"type", "json_object", "schema": {"items": {}}}`)
+- In the [main](../examples/main) CLI, passed as the `--json` / `-j` flag
+- To convert to a grammar ahead of time:
+ - in CLI, with [json_schema_to_grammar.py](../examples/json_schema_to_grammar.py)
+ - in JavaScript with [json-schema-to-grammar.mjs](../examples/server/public/json-schema-to-grammar.mjs) (this is used by the [server](../examples/server)'s Web UI)
+
+Take a look at [tests](../../tests/test-json-schema-to-grammar.cpp) to see which features are likely supported (you'll also find usage examples in https://github.com/ggerganov/llama.cpp/pull/5978, https://github.com/ggerganov/llama.cpp/pull/6659 & https://github.com/ggerganov/llama.cpp/pull/6555).
+
+Here is also a non-exhaustive list of **unsupported** features:
+
+- `additionalProperties`: to be fixed in https://github.com/ggerganov/llama.cpp/pull/7840
+- `minimum`, `exclusiveMinimum`, `maximum`, `exclusiveMaximum`
+ - `integer` constraints to be implemented in https://github.com/ggerganov/llama.cpp/pull/7797
+- Remote `$ref`s in the C++ version (Python & JavaScript versions fetch https refs)
+- Mixing `properties` w/ `anyOf` / `oneOf` in the same type (https://github.com/ggerganov/llama.cpp/issues/7703)
+- `string` formats `uri`, `email`
+- [`contains`](https://json-schema.org/draft/2020-12/json-schema-core#name-contains) / `minContains`
+- `uniqueItems`
+- `$anchor` (cf. [dereferencing](https://json-schema.org/draft/2020-12/json-schema-core#name-dereferencing))
+- [`not`](https://json-schema.org/draft/2020-12/json-schema-core#name-not)
+- [Conditionals](https://json-schema.org/draft/2020-12/json-schema-core#name-keywords-for-applying-subsche) `if` / `then` / `else` / `dependentSchemas`
+- [`patternProperties`](https://json-schema.org/draft/2020-12/json-schema-core#name-patternproperties)
diff --git a/grammars/json.gbnf b/grammars/json.gbnf
index a8a80752..064a53f8 100644
--- a/grammars/json.gbnf
+++ b/grammars/json.gbnf
@@ -16,10 +16,10 @@ array ::=
string ::=
"\"" (
[^"\\\x7F\x00-\x1F] |
- "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]) # escapes
+ "\\" (["\\bfnrt] | "u" [0-9a-fA-F]{4}) # escapes
)* "\"" ws
-number ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? ws
+number ::= ("-"? ([0-9] | [1-9] [0-9]{0,15})) ("." [0-9]+)? ([eE] [-+]? [0-9] [1-9]{0,15})? ws
# Optional space: by convention, applied in this grammar after literal chars when allowed
-ws ::= ([ \t\n] ws)?
+ws ::= [ \t\n]{0,20}
diff --git a/grammars/json_arr.gbnf b/grammars/json_arr.gbnf
index 31a3202f..bd1312d9 100644
--- a/grammars/json_arr.gbnf
+++ b/grammars/json_arr.gbnf
@@ -25,10 +25,10 @@ array ::=
string ::=
"\"" (
[^"\\\x7F\x00-\x1F] |
- "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]) # escapes
+ "\\" (["\\bfnrt] | "u" [0-9a-fA-F]{4}) # escapes
)* "\"" ws
-number ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? ws
+number ::= ("-"? ([0-9] | [1-9] [0-9]{0,15})) ("." [0-9]+)? ([eE] [-+]? [1-9] [0-9]{0,15})? ws
# Optional space: by convention, applied in this grammar after literal chars when allowed
-ws ::= ([ \t\n] ws)?
+ws ::= [ \t\n]{0,20}
diff --git a/tests/test-json-schema-to-grammar.cpp b/tests/test-json-schema-to-grammar.cpp
index 052c0807..bea876bd 100755
--- a/tests/test-json-schema-to-grammar.cpp
+++ b/tests/test-json-schema-to-grammar.cpp
@@ -105,7 +105,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
R"""(
array ::= "[" space ( value ("," space value)* )? "]" space
boolean ::= ("true" | "false") space
- char ::= [^"\\] | "\\" (["\\/bfnrt] | "u" [0-9a-fA-F]{4})
+ char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
decimal-part ::= [0-9]{1,16}
integral-part ::= [0] | [1-9] [0-9]{0,15}
null ::= "null" space
@@ -152,7 +152,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
"type": "string"
})""",
R"""(
- char ::= [^"\\] | "\\" (["\\/bfnrt] | "u" [0-9a-fA-F]{4})
+ char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
root ::= "\"" char* "\"" space
space ::= " "?
)"""
@@ -166,7 +166,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
"minLength": 1
})""",
R"""(
- char ::= [^"\\] | "\\" (["\\/bfnrt] | "u" [0-9a-fA-F]{4})
+ char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
root ::= "\"" char+ "\"" space
space ::= " "?
)"""
@@ -180,7 +180,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
"minLength": 3
})""",
R"""(
- char ::= [^"\\] | "\\" (["\\/bfnrt] | "u" [0-9a-fA-F]{4})
+ char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
root ::= "\"" char{3,} "\"" space
space ::= " "?
)"""
@@ -194,7 +194,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
"maxLength": 3
})""",
R"""(
- char ::= [^"\\] | "\\" (["\\/bfnrt] | "u" [0-9a-fA-F]{4})
+ char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
root ::= "\"" char{0,3} "\"" space
space ::= " "?
)"""
@@ -209,7 +209,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
"maxLength": 4
})""",
R"""(
- char ::= [^"\\] | "\\" (["\\/bfnrt] | "u" [0-9a-fA-F]{4})
+ char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
root ::= "\"" char{1,4} "\"" space
space ::= " "?
)"""
@@ -283,7 +283,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
"prefixItems": [{ "type": "string" }]
})""",
R"""(
- char ::= [^"\\] | "\\" (["\\/bfnrt] | "u" [0-9a-fA-F]{4})
+ char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
root ::= "[" space string "]" space
space ::= " "?
string ::= "\"" char* "\"" space
@@ -297,7 +297,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
"prefixItems": [{ "type": "string" }, { "type": "number" }]
})""",
R"""(
- char ::= [^"\\] | "\\" (["\\/bfnrt] | "u" [0-9a-fA-F]{4})
+ char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
decimal-part ::= [0-9]{1,16}
integral-part ::= [0] | [1-9] [0-9]{0,15}
number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space
@@ -466,7 +466,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
a-kv ::= "\"a\"" space ":" space string
b-kv ::= "\"b\"" space ":" space string
c-kv ::= "\"c\"" space ":" space string
- char ::= [^"\\] | "\\" (["\\/bfnrt] | "u" [0-9a-fA-F]{4})
+ char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
root ::= "{" space b-kv "," space c-kv "," space a-kv "}" space
space ::= " "?
string ::= "\"" char* "\"" space
@@ -486,7 +486,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
})""",
R"""(
a-kv ::= "\"a\"" space ":" space string
- char ::= [^"\\] | "\\" (["\\/bfnrt] | "u" [0-9a-fA-F]{4})
+ char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
root ::= "{" space (a-kv )? "}" space
space ::= " "?
string ::= "\"" char* "\"" space
@@ -510,7 +510,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
b-kv ::= "\"b\"" space ":" space string
b-rest ::= ( "," space c-kv )?
c-kv ::= "\"c\"" space ":" space string
- char ::= [^"\\] | "\\" (["\\/bfnrt] | "u" [0-9a-fA-F]{4})
+ char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
root ::= "{" space (a-kv a-rest | b-kv b-rest | c-kv )? "}" space
space ::= " "?
string ::= "\"" char* "\"" space
@@ -534,7 +534,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
a-kv ::= "\"a\"" space ":" space string
b-kv ::= "\"b\"" space ":" space string
c-kv ::= "\"c\"" space ":" space string
- char ::= [^"\\] | "\\" (["\\/bfnrt] | "u" [0-9a-fA-F]{4})
+ char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
d-kv ::= "\"d\"" space ":" space string
d-rest ::= ( "," space c-kv )?
root ::= "{" space b-kv "," space a-kv ( "," space ( d-kv d-rest | c-kv ) )? "}" space
@@ -554,7 +554,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
additional-kv ::= string ":" space additional-value
additional-kvs ::= additional-kv ( "," space additional-kv )*
additional-value ::= "[" space (number ("," space number)*)? "]" space
- char ::= [^"\\] | "\\" (["\\/bfnrt] | "u" [0-9a-fA-F]{4})
+ char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
decimal-part ::= [0-9]{1,16}
integral-part ::= [0] | [1-9] [0-9]{0,15}
number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space
@@ -574,7 +574,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
R"""(
array ::= "[" space ( value ("," space value)* )? "]" space
boolean ::= ("true" | "false") space
- char ::= [^"\\] | "\\" (["\\/bfnrt] | "u" [0-9a-fA-F]{4})
+ char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
decimal-part ::= [0-9]{1,16}
integral-part ::= [0] | [1-9] [0-9]{0,15}
null ::= "null" space
@@ -596,7 +596,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
R"""(
array ::= "[" space ( value ("," space value)* )? "]" space
boolean ::= ("true" | "false") space
- char ::= [^"\\] | "\\" (["\\/bfnrt] | "u" [0-9a-fA-F]{4})
+ char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
decimal-part ::= [0-9]{1,16}
integral-part ::= [0] | [1-9] [0-9]{0,15}
null ::= "null" space
@@ -637,7 +637,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
a-kv ::= "\"a\"" space ":" space number
additional-kv ::= string ":" space string
additional-kvs ::= additional-kv ( "," space additional-kv )*
- char ::= [^"\\] | "\\" (["\\/bfnrt] | "u" [0-9a-fA-F]{4})
+ char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
decimal-part ::= [0-9]{1,16}
integral-part ::= [0] | [1-9] [0-9]{0,15}
number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space
@@ -662,7 +662,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
a-rest ::= additional-kvs
additional-kv ::= string ":" space number
additional-kvs ::= additional-kv ( "," space additional-kv )*
- char ::= [^"\\] | "\\" (["\\/bfnrt] | "u" [0-9a-fA-F]{4})
+ char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
decimal-part ::= [0-9]{1,16}
integral-part ::= [0] | [1-9] [0-9]{0,15}
number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space
@@ -690,7 +690,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
additional-kvs ::= additional-kv ( "," space additional-kv )*
b-kv ::= "\"b\"" space ":" space number
b-rest ::= additional-kvs
- char ::= [^"\\] | "\\" (["\\/bfnrt] | "u" [0-9a-fA-F]{4})
+ char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
decimal-part ::= [0-9]{1,16}
integral-part ::= [0] | [1-9] [0-9]{0,15}
number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space
@@ -721,7 +721,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
}
})""",
R"""(
- char ::= [^"\\] | "\\" (["\\/bfnrt] | "u" [0-9a-fA-F]{4})
+ char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
foo ::= "{" space foo-a-kv "}" space
foo-a-kv ::= "\"a\"" space ":" space string
root ::= foo