Skip to content

Commit 2dc840f

Browse files
Schokuroff, jochenchrist, Copilot, jschoedl, claude
authored
fix: ODCS to .proto, problem with array of objects (#1012)
* fix * fix * fix * fix * fix * fix * fix * fix * Update datacontract/export/protobuf_exporter.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> * Update datacontract/export/protobuf_exporter.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> * fix * fix * fix * fix: correct nested message indentation and add changelog for protobuf exporter Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> * fix: ruff import block formatting Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> --------- Co-authored-by: Jochen Christ <jochen.christ@entropy-data.com> Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> Co-authored-by: Jakob Schödl <jakob.schoedl@entropy-data.com> Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 592e6d1 commit 2dc840f

3 files changed

Lines changed: 239 additions & 63 deletions

File tree

CHANGELOG.md

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -7,6 +7,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
77

88
## Unreleased
99

10+
### Fixed
11+
- Fix Protobuf export for arrays of objects and improve message/enum naming to UpperCamelCase (#1012 @Schokuroff)
12+
1013
## [0.11.8] - 2026-04-10
1114

1215
### Added

datacontract/export/protobuf_exporter.py

Lines changed: 214 additions & 61 deletions
Original file line number | Diff line number | Diff line change
@@ -1,20 +1,21 @@
1-
import sys
21
from typing import List, Optional
32

43
from open_data_contract_standard.model import OpenDataContractStandard, SchemaProperty
54

65
from datacontract.export.exporter import Exporter
76

7+
# Lower-cased logical types that are rendered as Protobuf messages rather than scalar fields.
OBJECT_TYPES: set = {"object", "record", "struct"}
8+
89

910
class ProtoBufExporter(Exporter):
    """Exporter that renders a data contract as a Protobuf definition."""

    def export(self, data_contract, schema_name, server, sql_server_type, export_args) -> dict:
        """Export the data contract to Protobuf format.

        Only ``data_contract`` is used; the remaining arguments are accepted
        but not read by this method.
        """
        # NOTE(review): to_protobuf is annotated as returning str, while this
        # method is annotated as returning dict - confirm which is intended.
        return to_protobuf(data_contract)
1415

1516

1617
def _get_config_value(prop: SchemaProperty, key: str) -> Optional[str]:
17-
"""Get a custom property value."""
18+
"""Get a custom property value from customProperties."""
1819
if prop.customProperties is None:
1920
return None
2021
for cp in prop.customProperties:
@@ -57,10 +58,10 @@ def to_protobuf(data_contract: OpenDataContractStandard) -> str:
5758

5859
# Build header with syntax and package declarations.
5960
header = 'syntax = "proto3";\n\n'
60-
package = "example" # Default package
61+
package = "example" # Default package, can be customized
6162
header += f"package {package};\n\n"
6263

63-
# Append enum definitions.
64+
# Append enum definitions before messages.
6465
for enum_name, enum_values in enum_definitions.items():
6566
header += f"// Enum for {enum_name}\n"
6667
header += f"enum {enum_name} {{\n"
@@ -72,6 +73,7 @@ def to_protobuf(data_contract: OpenDataContractStandard) -> str:
7273
else:
7374
header += f" // Warning: Enum values for {enum_name} are not a dictionary\n"
7475
header += "}\n\n"
76+
7577
return header + messages
7678

7779

@@ -86,11 +88,12 @@ def _is_enum_field(prop: SchemaProperty) -> bool:
8688
def _get_enum_name(prop: SchemaProperty) -> str:
    """
    Resolve the Protobuf enum name for a field.

    A non-empty "enum_name" custom property takes precedence; otherwise the
    field's own name is used. Either value is converted with
    _snake_to_upper_camel before being returned.
    """
    configured = _get_config_value(prop, "enum_name")
    source_name = configured if configured else prop.name
    return _snake_to_upper_camel(source_name)
9497

9598

9699
def _get_enum_values(prop: SchemaProperty) -> dict:
@@ -103,69 +106,126 @@ def _get_enum_values(prop: SchemaProperty) -> dict:
103106
return {}
104107

105108

106-
def _to_protobuf_message_name(name: str) -> str:
109+
def _snake_to_upper_camel(name: str) -> str:
107110
"""
108-
Returns a valid Protobuf message/enum name by capitalizing the first letter.
111+
Convert snake_case to UpperCamelCase.
112+
Preserves existing capitalization in parts.
113+
114+
Examples:
115+
"fsa_room" -> "FsaRoom"
116+
"FsaRegister" -> "FsaRegister" (already in UpperCamelCase)
117+
"simple_obj" -> "SimpleObj"
109118
"""
110-
return name[0].upper() + name[1:] if name else name
119+
if not name:
120+
return name
111121

122+
# If already UpperCamelCase (first letter uppercase, no underscores after first word)
123+
if name and name[0].isupper() and "_" not in name:
124+
return name
112125

113-
def to_protobuf_message(
114-
model_name: str, properties: List[SchemaProperty], description: str, indent_level: int = 0
115-
) -> str:
126+
parts = name.split("_")
127+
# Capitalize each part while preserving internal capitalization
128+
return "".join(part[0].upper() + part[1:] if part else "" for part in parts)
129+
130+
131+
def _get_type_name(prop: SchemaProperty) -> str:
    """
    Return the UpperCamelCase message/enum type name for a property.

    The same name is used when a nested message is declared and when a field
    refers to it. Resolution order:

    1. Enum fields use the name from _get_enum_name.
    2. Arrays whose items are objects use ``items.name`` when it is set.
    3. Everything else is named after the property itself.
    """
    if _is_enum_field(prop):
        return _get_enum_name(prop)

    kind = (prop.logicalType or "").lower()
    if kind == "array" and prop.items:
        element_kind = (prop.items.logicalType or "").lower()
        if element_kind in OBJECT_TYPES:
            # An explicit item name wins over the name of the array field.
            explicit_name = getattr(prop.items, "name", None)
            if explicit_name:
                return _snake_to_upper_camel(explicit_name)

    return _snake_to_upper_camel(prop.name)
141161

142162

143-
def to_protobuf_field(prop: SchemaProperty, description: str, number: int, indent_level: int = 0) -> str:
163+
def _should_create_nested_message(prop: SchemaProperty) -> bool:
    """
    Tell whether a nested message has to be declared for this property.

    True for object-like logical types (see OBJECT_TYPES) and for arrays
    whose items have an object-like logical type; False otherwise, including
    when the property has no logical type.
    """
    kind = (prop.logicalType or "").lower()

    if kind in OBJECT_TYPES:
        return True

    if kind == "array" and prop.items:
        element_kind = (prop.items.logicalType or "").lower()
        return element_kind in OBJECT_TYPES

    return False
156183

157184

158-
def _convert_type(prop: SchemaProperty) -> str:
185+
def _get_nested_properties(prop: SchemaProperty) -> Optional[List[SchemaProperty]]:
    """
    Return the child properties that make up the nested message of ``prop``.

    For an object-like property these are its own ``properties``; for an
    array of objects they are the ``properties`` of ``items``. A missing
    child list is returned as an empty list. None means the property does
    not describe a nested message at all.
    """
    kind = (prop.logicalType or "").lower()

    if kind in OBJECT_TYPES:
        return prop.properties or []

    is_object_array = (
        kind == "array"
        and prop.items
        and (prop.items.logicalType or "").lower() in OBJECT_TYPES
    )
    if is_object_array:
        return prop.items.properties or []

    return None
203+
168204

205+
def _get_nested_description(prop: SchemaProperty) -> str:
    """
    Return the description to attach to the nested message of ``prop``.

    Object-like properties use their own description, arrays of objects use
    the description of ``items``. An empty string is returned when there is
    no description or the property is neither of the two.
    """
    kind = (prop.logicalType or "").lower()

    if kind in OBJECT_TYPES:
        return prop.description or ""

    is_object_array = (
        kind == "array"
        and prop.items
        and (prop.items.logicalType or "").lower() in OBJECT_TYPES
    )
    if is_object_array:
        return prop.items.description or ""

    return ""
222+
223+
224+
def _get_primitive_type(prop: SchemaProperty) -> str:
225+
"""
226+
Get Protobuf type for primitive fields.
227+
Handles recursive type resolution for arrays of primitives.
228+
"""
169229
field_type = prop.logicalType or ""
170230
lower_type = field_type.lower()
171231

@@ -185,19 +245,112 @@ def _convert_type(prop: SchemaProperty) -> str:
185245
return "bool"
186246
if lower_type in ["bytes"]:
187247
return "bytes"
188-
if lower_type in ["object", "record", "struct"]:
189-
return _to_protobuf_message_name(prop.name)
248+
249+
# Recursive handling for arrays of primitives
250+
if lower_type == "array" and prop.items:
251+
return _get_primitive_type(prop.items)
252+
253+
return "string" # Fallback for unrecognized types
254+
255+
256+
def _get_field_type(prop: SchemaProperty) -> str:
    """
    Return the Protobuf type expression for a field.

    Arrays become ``repeated <element type>``: the message type name for
    arrays of objects, the primitive type of ``items`` otherwise, and
    ``repeated string`` when no ``items`` are given. Object-like properties
    resolve to their message type name, enum fields to their enum name, and
    everything else to a primitive type.
    """
    kind = (prop.logicalType or "").lower()

    if kind == "array":
        if not prop.items:
            return "repeated string"  # Default array type

        element_kind = (prop.items.logicalType or "").lower()
        if element_kind in OBJECT_TYPES:
            element_type = _get_type_name(prop)  # e.g., FsaRoom
        else:
            element_type = _get_primitive_type(prop.items)
        return f"repeated {element_type}"

    if kind in OBJECT_TYPES:
        return _get_type_name(prop)  # e.g., SimpleObj

    if _is_enum_field(prop):
        return _get_enum_name(prop)

    return _get_primitive_type(prop)
292+
293+
294+
def _comment_lines(text: str, prefix: str) -> str:
    """Render ``text`` as ``//`` comment lines, one per line of text, each starting with ``prefix``."""
    return "".join(f"{prefix}// {line}\n" for line in text.splitlines())


def to_protobuf_message(
    model_name: str, properties: List[SchemaProperty], description: str, indent_level: int = 0
) -> str:
    """
    Generate a Protobuf message definition from the model's fields.

    Nested messages for objects and arrays of objects are declared first
    (recursively, one indentation level deeper), followed by the fields,
    which are numbered from 1 in the order of ``properties``.

    Descriptions are emitted as ``//`` comments. Every line of a multi-line
    description gets its own ``//`` prefix, so that no part of the description
    ends up as bare text in the generated file.

    Args:
        model_name: Name of the message; converted to UpperCamelCase.
        properties: Fields of the message.
        description: Optional comment placed above the message.
        indent_level: Nesting depth used for indentation.

    Returns:
        The message definition, terminated by a newline.
    """
    outer = indent(indent_level)
    inner = indent(indent_level + 1)

    parts = []
    if description:
        parts.append(_comment_lines(description, outer))

    # Message name always in UpperCamelCase
    parts.append(f"{outer}message {_snake_to_upper_camel(model_name)} {{\n")

    # Phase 1: declare all nested messages before the fields that refer to them.
    for prop in properties:
        if not _should_create_nested_message(prop):
            continue
        nested_props = _get_nested_properties(prop)
        if nested_props is None:
            continue
        nested_message = to_protobuf_message(
            _get_type_name(prop), nested_props, _get_nested_description(prop), indent_level + 1
        )
        parts.append(nested_message + "\n")

    # Phase 2: declare all fields; the field name is kept as given in the contract.
    for number, prop in enumerate(properties, start=1):
        if prop.description:
            parts.append(_comment_lines(prop.description, inner))
        parts.append(f"{inner}{_get_field_declaration(prop)} {prop.name} = {number};\n")

    parts.append(f"{outer}}}\n")
    return "".join(parts)
336+
337+
338+
def indent(indent_level: int) -> str:
    """Return the leading whitespace for the given nesting depth."""
    # NOTE(review): the indentation unit is reproduced exactly as it appears in this
    # view; confirm it against the repository, since the rendering may collapse whitespace.
    unit = " "
    return unit * indent_level
341+
342+
343+
def _get_field_declaration(prop: SchemaProperty) -> str:
    """
    Return the field's type expression, prefixed with ``optional`` where applicable.

    ``optional`` is added only when ``required`` is explicitly False and the
    field is neither an array nor an object-like (message) type. In every
    other case the result of _get_field_type is returned unchanged.
    """
    base_type = _get_field_type(prop)  # includes "repeated" if needed
    kind = (prop.logicalType or "").lower()

    # Only an explicit False counts; a missing or None value adds nothing.
    explicitly_optional = getattr(prop, "required", None) is False
    if explicitly_optional and kind != "array" and kind not in OBJECT_TYPES:
        return f"optional {base_type}"
    return base_type

0 commit comments

Comments
 (0)