summaryrefslogtreecommitdiff
path: root/common/json-schema-to-grammar.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'common/json-schema-to-grammar.cpp')
-rw-r--r--common/json-schema-to-grammar.cpp233
1 files changed, 138 insertions, 95 deletions
diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp
index 0e468034..0f8f1b1d 100644
--- a/common/json-schema-to-grammar.cpp
+++ b/common/json-schema-to-grammar.cpp
@@ -11,35 +11,101 @@
using json = nlohmann::ordered_json;
+template <typename Iterator>
+static std::string join(Iterator begin, Iterator end, const std::string & separator);
+
+static std::string repeat(const std::string & str, size_t n);
+
+static std::string build_repetition(const std::string & item_rule, int min_items, int max_items, const std::string & separator_rule = "", bool item_rule_is_literal = false) {
+ if (separator_rule.empty()) {
+ if (min_items == 0 && max_items == 1) {
+ return item_rule + "?";
+ } else if (min_items == 1 && max_items == std::numeric_limits<int>::max()) {
+ return item_rule + "+";
+ }
+ }
+
+ std::string result;
+ if (min_items > 0) {
+ if (item_rule_is_literal && separator_rule.empty()) {
+ result = "\"" + repeat(std::string(item_rule.begin() + 1, item_rule.end() - 1), min_items) + "\"";
+ } else {
+ std::vector<std::string> items(min_items, item_rule);
+ result = join(items.begin(), items.end(), separator_rule.empty() ? " " : " " + separator_rule + " ");
+ }
+ }
+
+ std::function<std::string(int, bool)> opt_repetitions = [&](int up_to_n, bool prefix_with_sep) -> std::string {
+ auto content = prefix_with_sep && !separator_rule.empty() ? separator_rule + " " + item_rule : item_rule;
+
+ if (up_to_n == 0) {
+ return "";
+ } else if (up_to_n == 1) {
+ return "(" + content + ")?";
+ } else if (!separator_rule.empty() && !prefix_with_sep) {
+ return "(" + content + " " + opt_repetitions(up_to_n - 1, true) + ")?";
+ } else {
+ std::string res = repeat("(" + content + " ", up_to_n);
+ // strip trailing space
+ res = res.substr(0, res.length() - 1);
+ res += repeat(")?", up_to_n);
+ return res;
+ }
+ };
+
+ if (min_items > 0 && max_items != min_items) {
+ result += " ";
+ }
+
+ if (max_items != std::numeric_limits<int>::max()) {
+ result += opt_repetitions(max_items - min_items, min_items > 0);
+ } else {
+ std::string item_operator = "(" + (separator_rule.empty() ? "" : separator_rule + " ") + item_rule + ")";
+ if (min_items == 0 && !separator_rule.empty()) {
+ result = "(" + item_rule + " " + item_operator + "*)?";
+ } else {
+ result += item_operator + "*";
+ }
+ }
+
+ return result;
+}
+
const std::string SPACE_RULE = "\" \"?";
-std::unordered_map<std::string, std::string> PRIMITIVE_RULES = {
- {"boolean", "(\"true\" | \"false\") space"},
- {"number", "(\"-\"? ([0-9] | [1-9] [0-9]*)) (\".\" [0-9]+)? ([eE] [-+]? [0-9]+)? space"},
- {"integer", "(\"-\"? ([0-9] | [1-9] [0-9]*)) space"},
- {"value", "object | array | string | number | boolean"},
- {"object", "\"{\" space ( string \":\" space value (\",\" space string \":\" space value)* )? \"}\" space"},
- {"array", "\"[\" space ( value (\",\" space value)* )? \"]\" space"},
- {"uuid", "\"\\\"\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
+struct BuiltinRule {
+ std::string content;
+ std::vector<std::string> deps;
+};
+
+const std::string _up_to_15_digits = build_repetition("[0-9]", 0, 15);
+
+std::unordered_map<std::string, BuiltinRule> PRIMITIVE_RULES = {
+ {"boolean", {"(\"true\" | \"false\") space", {}}},
+ {"decimal-part", {"[0-9] " + _up_to_15_digits, {}}},
+ {"integral-part", {"[0-9] | [1-9] " + _up_to_15_digits, {}}},
+ {"number", {"(\"-\"? integral-part) (\".\" decimal-part)? ([eE] [-+]? integral-part)? space", {"integral-part", "decimal-part"}}},
+ {"integer", {"(\"-\"? integral-part) space", {"integral-part"}}},
+ {"value", {"object | array | string | number | boolean | null", {"object", "array", "string", "number", "boolean", "null"}}},
+ {"object", {"\"{\" space ( string \":\" space value (\",\" space string \":\" space value)* )? \"}\" space", {"string", "value"}}},
+ {"array", {"\"[\" space ( value (\",\" space value)* )? \"]\" space", {"value"}}},
+ {"uuid", {"\"\\\"\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
"\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
"\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
"\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
- "\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] \"\\\"\" space"},
- {"string", " \"\\\"\" (\n"
- " [^\"\\\\] |\n"
- " \"\\\\\" ([\"\\\\/bfnrt] | \"u\" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])\n"
- " )* \"\\\"\" space"},
- {"null", "\"null\" space"}
+ "\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] \"\\\"\" space", {}}},
+ {"char", {"[^\"\\\\] | \"\\\\\" ([\"\\\\/bfnrt] | \"u\" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])", {}}},
+ {"string", {"\"\\\"\" char* \"\\\"\" space", {"char"}}},
+ {"null", {"\"null\" space", {}}},
};
-std::vector<std::string> OBJECT_RULE_NAMES = {"object", "array", "string", "number", "boolean", "null", "value"};
-
-std::unordered_map<std::string, std::string> DATE_RULES = {
- {"date", "[0-9] [0-9] [0-9] [0-9] \"-\" ( \"0\" [1-9] | \"1\" [0-2] ) \"-\" ( \"0\" [1-9] | [1-2] [0-9] | \"3\" [0-1] )"},
- {"time", "([01] [0-9] | \"2\" [0-3]) \":\" [0-5] [0-9] \":\" [0-5] [0-9] ( \".\" [0-9] [0-9] [0-9] )? ( \"Z\" | ( \"+\" | \"-\" ) ( [01] [0-9] | \"2\" [0-3] ) \":\" [0-5] [0-9] )"},
- {"date-time", "date \"T\" time"},
- {"date-string", "\"\\\"\" date \"\\\"\" space"},
- {"time-string", "\"\\\"\" time \"\\\"\" space"},
- {"date-time-string", "\"\\\"\" date-time \"\\\"\" space"}
+
+std::unordered_map<std::string, BuiltinRule> STRING_FORMAT_RULES = {
+ {"date", {"[0-9] [0-9] [0-9] [0-9] \"-\" ( \"0\" [1-9] | \"1\" [0-2] ) \"-\" ( \"0\" [1-9] | [1-2] [0-9] | \"3\" [0-1] )", {}}},
+ {"time", {"([01] [0-9] | \"2\" [0-3]) \":\" [0-5] [0-9] \":\" [0-5] [0-9] ( \".\" [0-9] [0-9] [0-9] )? ( \"Z\" | ( \"+\" | \"-\" ) ( [01] [0-9] | \"2\" [0-3] ) \":\" [0-5] [0-9] )", {}}},
+ {"date-time", {"date \"T\" time", {"date", "time"}}},
+ {"date-string", {"\"\\\"\" date \"\\\"\" space", {"date"}}},
+ {"time-string", {"\"\\\"\" time \"\\\"\" space", {"time"}}},
+ {"date-time-string", {"\"\\\"\" date-time \"\\\"\" space", {"date-time"}}}
};
static bool is_reserved_name(const std::string & name) {
@@ -47,7 +113,7 @@ static bool is_reserved_name(const std::string & name) {
if (RESERVED_NAMES.empty()) {
RESERVED_NAMES.insert("root");
for (const auto &p : PRIMITIVE_RULES) RESERVED_NAMES.insert(p.first);
- for (const auto &p : DATE_RULES) RESERVED_NAMES.insert(p.first);
+ for (const auto &p : STRING_FORMAT_RULES) RESERVED_NAMES.insert(p.first);
}
return RESERVED_NAMES.find(name) != RESERVED_NAMES.end();
}
@@ -192,7 +258,7 @@ private:
if (_dotall) {
rule = "[\\U00000000-\\U0010FFFF]";
} else {
- rule = "[\\U00000000-\\x09\\x0B\\x0C\\x0E-\\U0010FFFF]";
+ rule = "[^\\x0A\\x0D]";
}
return _add_rule("dot", rule);
};
@@ -308,47 +374,21 @@ private:
auto &sub = last.first;
auto sub_is_literal = last.second;
- if (min_times == 0 && max_times == std::numeric_limits<int>::max()) {
- sub += "*";
- } else if (min_times == 0 && max_times == 1) {
- sub += "?";
- } else if (min_times == 1 && max_times == std::numeric_limits<int>::max()) {
- sub += "+";
- } else {
- if (!sub_is_literal) {
- std::string & sub_id = sub_rule_ids[sub];
- if (sub_id.empty()) {
- sub_id = _add_rule(name + "-" + std::to_string(sub_rule_ids.size()), sub);
- }
- sub = sub_id;
- }
- std::string result;
- if (sub_is_literal && min_times > 0) {
- result = "\"" + repeat(sub.substr(1, sub.length() - 2), min_times) + "\"";
- } else {
- for (int j = 0; j < min_times; j++) {
- if (j > 0) {
- result += " ";
- }
- result += sub;
- }
- }
- if (min_times > 0 && min_times < max_times) {
- result += " ";
+ if (!sub_is_literal) {
+ std::string & sub_id = sub_rule_ids[sub];
+ if (sub_id.empty()) {
+ sub_id = _add_rule(name + "-" + std::to_string(sub_rule_ids.size()), sub);
}
- if (max_times == std::numeric_limits<int>::max()) {
- result += sub + "*";
- } else {
- for (int j = min_times; j < max_times; j++) {
- if (j > min_times) {
- result += " ";
- }
- result += sub + "?";
- }
- }
- seq.back().first = result;
- seq.back().second = false;
+ sub = sub_id;
}
+ seq.back().first = build_repetition(
+ sub_is_literal ? "\"" + sub + "\"" : sub,
+ min_times,
+ max_times,
+ "",
+ sub_is_literal
+ );
+ seq.back().second = false;
} else {
std::string literal;
auto is_non_literal = [&](char c) {
@@ -424,7 +464,7 @@ private:
if (additional_properties.is_object() || (additional_properties.is_boolean() && additional_properties.get<bool>())) {
std::string sub_name = name + (name.empty() ? "" : "-") + "additional";
std::string value_rule = visit(additional_properties.is_object() ? additional_properties : json::object(), sub_name + "-value");
- std::string kv_rule = _add_rule(sub_name + "-kv", _add_rule("string", PRIMITIVE_RULES.at("string")) + " \":\" space " + value_rule);
+ std::string kv_rule = _add_rule(sub_name + "-kv", _add_primitive("string", PRIMITIVE_RULES.at("string")) + " \":\" space " + value_rule);
prop_kv_rule_names["*"] = kv_rule;
optional_props.push_back("*");
}
@@ -486,6 +526,25 @@ private:
return rule;
}
+ std::string _add_primitive(const std::string & name, const BuiltinRule & rule) {
+ auto n = _add_rule(name, rule.content);
+ for (const auto & dep : rule.deps) {
+ BuiltinRule dep_rule;
+ auto it = PRIMITIVE_RULES.find(dep);
+ if (it == PRIMITIVE_RULES.end()) {
+ it = STRING_FORMAT_RULES.find(dep);
+ if (it == STRING_FORMAT_RULES.end()) {
+ _errors.push_back("Rule " + dep + " not known");
+ continue;
+ }
+ }
+ if (_rules.find(dep) == _rules.end()) {
+ _add_primitive(dep, it->second);
+ }
+ }
+ return n;
+ }
+
public:
SchemaConverter(
const std::function<json(const std::string &)> & fetch_json,
@@ -647,49 +706,33 @@ public:
return _add_rule(rule_name, rule);
} else {
std::string item_rule_name = visit(items, name + (name.empty() ? "" : "-") + "item");
- std::string list_item_operator = "( \",\" space " + item_rule_name + " )";
- std::string successive_items;
int min_items = schema.contains("minItems") ? schema["minItems"].get<int>() : 0;
json max_items_json = schema.contains("maxItems") ? schema["maxItems"] : json();
- int max_items = max_items_json.is_number_integer() ? max_items_json.get<int>() : -1;
- if (min_items > 0) {
- successive_items += repeat(list_item_operator, min_items - 1);
- min_items--;
- }
- if (max_items >= 0 && max_items > min_items) {
- successive_items += repeat(list_item_operator + "?", max_items - min_items - 1);
- } else {
- successive_items += list_item_operator + "*";
- }
- std::string rule;
- if (min_items == 0) {
- rule = "\"[\" space ( " + item_rule_name + " " + successive_items + " )? \"]\" space";
- } else {
- rule = "\"[\" space " + item_rule_name + " " + successive_items + " \"]\" space";
- }
- return _add_rule(rule_name, rule);
+ int max_items = max_items_json.is_number_integer() ? max_items_json.get<int>() : std::numeric_limits<int>::max();
+
+ return _add_rule(rule_name, "\"[\" space " + build_repetition(item_rule_name, min_items, max_items, "\",\" space") + " \"]\" space");
}
} else if ((schema_type.is_null() || schema_type == "string") && schema.contains("pattern")) {
return _visit_pattern(schema["pattern"], rule_name);
} else if ((schema_type.is_null() || schema_type == "string") && std::regex_match(schema_format, std::regex("^uuid[1-5]?$"))) {
- return _add_rule(rule_name == "root" ? "root" : schema_format, PRIMITIVE_RULES.at("uuid"));
- } else if ((schema_type.is_null() || schema_type == "string") && DATE_RULES.find(schema_format) != DATE_RULES.end()) {
- for (const auto & kv : DATE_RULES) {
- _add_rule(kv.first, kv.second);
- }
- return schema_format + "-string";
+ return _add_primitive(rule_name == "root" ? "root" : schema_format, PRIMITIVE_RULES.at("uuid"));
+ } else if ((schema_type.is_null() || schema_type == "string") && STRING_FORMAT_RULES.find(schema_format + "-string") != STRING_FORMAT_RULES.end()) {
+ auto prim_name = schema_format + "-string";
+ return _add_rule(rule_name, _add_primitive(prim_name, STRING_FORMAT_RULES.at(prim_name)));
+ } else if (schema_type == "string" && (schema.contains("minLength") || schema.contains("maxLength"))) {
+ std::string char_rule = _add_primitive("char", PRIMITIVE_RULES.at("char"));
+ int min_len = schema.contains("minLength") ? schema["minLength"].get<int>() : 0;
+ int max_len = schema.contains("maxLength") ? schema["maxLength"].get<int>() : std::numeric_limits<int>::max();
+ return _add_rule(rule_name, "\"\\\"\" " + build_repetition(char_rule, min_len, max_len) + " \"\\\"\" space");
} else if (schema.empty() || schema_type == "object") {
- for (const auto & n : OBJECT_RULE_NAMES) {
- _add_rule(n, PRIMITIVE_RULES.at(n));
- }
- return _add_rule(rule_name, "object");
+ return _add_rule(rule_name, _add_primitive("object", PRIMITIVE_RULES.at("object")));
} else {
if (!schema_type.is_string() || PRIMITIVE_RULES.find(schema_type.get<std::string>()) == PRIMITIVE_RULES.end()) {
_errors.push_back("Unrecognized schema: " + schema.dump());
return "";
}
// TODO: support minimum, maximum, exclusiveMinimum, exclusiveMaximum at least for zero
- return _add_rule(rule_name == "root" ? "root" : schema_type.get<std::string>(), PRIMITIVE_RULES.at(schema_type.get<std::string>()));
+ return _add_primitive(rule_name == "root" ? "root" : schema_type.get<std::string>(), PRIMITIVE_RULES.at(schema_type.get<std::string>()));
}
}