diff --git a/gapic/cli/generate.py b/gapic/cli/generate.py index e8eee1f034..fae69a961a 100644 --- a/gapic/cli/generate.py +++ b/gapic/cli/generate.py @@ -15,6 +15,7 @@ import os import sys import typing +import time import click @@ -24,6 +25,13 @@ from gapic.schema import api from gapic.utils import Options +# <--- Profiling Global ---> +LOG_FILE = "/tmp/gapic_profile.log" + +def _log(msg): + with open(LOG_FILE, "a", encoding="utf-8") as f: + f.write(f"[{time.strftime('%H:%M:%S')}] [CLI] {msg}\n") +# <--- End Profiling Global ---> @click.command() @click.option( @@ -42,8 +50,19 @@ ) def generate(request: typing.BinaryIO, output: typing.BinaryIO) -> None: """Generate a full API client description.""" + + # <--- Start Profiling ---> + # We clear the file here since this is the entry point + with open(LOG_FILE, "w", encoding="utf-8") as f: + f.write("--- CLI PROCESS START ---\n") + + t_start_script = time.time() + # <--- End Profiling ---> + # Load the protobuf CodeGeneratorRequest. + t0 = time.time() req = plugin_pb2.CodeGeneratorRequest.FromString(request.read()) + _log(f"Load CodeGeneratorRequest took {time.time() - t0:.4f}s") # Pull apart arguments in the request. opts = Options.build(req.parameter) @@ -59,15 +78,33 @@ def generate(request: typing.BinaryIO, output: typing.BinaryIO) -> None: # Build the API model object. # This object is a frozen representation of the whole API, and is sent # to each template in the rendering step. + # <--- Profile API Build ---> + _log("Starting API.build (Parsing Protos)...") + t0 = time.time() + api_schema = api.API.build(req.proto_file, opts=opts, package=package) + + _log(f"API.build took {time.time() - t0:.4f}s") + # <--- End Profile API Build ---> # Translate into a protobuf CodeGeneratorResponse; this reads the # individual templates and renders them. # If there are issues, error out appropriately. + # <--- Profile Generator ---> + _log("Starting generator.get_response (Rendering Templates)...") + t0 = time.time() + res = generator.Generator(opts).get_response(api_schema, opts) + + _log(f"generator.get_response took {time.time() - t0:.4f}s") + # <--- End Profile Generator ---> # Output the serialized response. + t0 = time.time() output.write(res.SerializeToString()) + _log(f"Serialization/Write took {time.time() - t0:.4f}s") + + _log(f"TOTAL CLI RUNTIME: {time.time() - t_start_script:.4f}s") if __name__ == "__main__": diff --git a/gapic/generator/generator.py b/gapic/generator/generator.py index f42e40655e..0b10844a30 100644 --- a/gapic/generator/generator.py +++ b/gapic/generator/generator.py @@ -19,6 +19,8 @@ import os import pathlib import typing +import time +import sys from typing import Any, DefaultDict, Dict, Mapping, Optional, Tuple from hashlib import sha256 from collections import OrderedDict, defaultdict @@ -34,8 +36,17 @@ from gapic.schema import api from gapic import utils from gapic.utils import Options +from gapic.utils import rst as rst_module from google.protobuf.compiler.plugin_pb2 import CodeGeneratorResponse +# <--- Profiling Global ---> +LOG_FILE = "/tmp/gapic_profile.log" + +def _log(msg): + # Append mode so we don't wipe logs from previous steps/APIs + with open(LOG_FILE, "a", encoding="utf-8") as f: + f.write(f"[{time.strftime('%H:%M:%S')}] {msg}\n") +# <--- End Profiling Global ---> class Generator: """A protoc code generator for client libraries. 
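# For reference, a minimal sketch (not part of the patch) of how the resulting
# /tmp/gapic_profile.log can be summarized once a run completes. It assumes
# only the "[HH:MM:SS] ... took N.NNNNs" format written by the _log helpers in
# this diff; summarize_profile is a hypothetical name, not generator API.
import re
from typing import Dict

def summarize_profile(path: str = "/tmp/gapic_profile.log") -> Dict[str, float]:
    """Sum the logged 'took N.NNNNs' durations, grouped by label."""
    totals: Dict[str, float] = {}
    pattern = re.compile(r"\[\d{2}:\d{2}:\d{2}\] (?:\[CLI\] )?(.+?) took ([0-9.]+)s")
    with open(path, encoding="utf-8") as f:
        for line in f:
            match = pattern.search(line)
            if match:
                label, seconds = match.group(1), float(match.group(2))
                totals[label] = totals.get(label, 0.0) + seconds
    # Slowest phases first.
    return dict(sorted(totals.items(), key=lambda kv: -kv[1]))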
@@ -91,6 +102,11 @@ def get_response(self, api_schema: api.API, opts: Options) -> CodeGeneratorRespo
             ~.CodeGeneratorResponse: A response describing
                 appropriate files and contents. See ``plugin.proto``.
         """
+        # <--- Profiling Start --->
+        _log(f"--- GENERATION STARTED (get_response) FOR {api_schema.naming.proto_package} ---")
+        start_time = time.time()
+        # <--- Profiling End --->
+
         output_files: Dict[str, CodeGeneratorResponse.File] = OrderedDict()
         sample_templates, client_templates = utils.partition(
             lambda fname: os.path.basename(fname) == samplegen.DEFAULT_TEMPLATE_NAME,
@@ -101,6 +117,7 @@ def get_response(self, api_schema: api.API, opts: Options) -> CodeGeneratorRespo
         # can be inserted into method docstrings.
         snippet_idx = snippet_index.SnippetIndex(api_schema)
         if sample_templates:
+            t_samples = time.time()
            sample_output, snippet_idx = self._generate_samples_and_manifest(
                 api_schema,
                 snippet_idx,
@@ -108,6 +125,7 @@ def get_response(self, api_schema: api.API, opts: Options) -> CodeGeneratorRespo
                 opts=opts,
             )
             output_files.update(sample_output)
+            _log(f"Phase: Sample Gen took {time.time() - t_samples:.4f}s")
 
         # Iterate over each template and add the appropriate output files
         # based on that template.
@@ -119,8 +137,9 @@ def get_response(self, api_schema: api.API, opts: Options) -> CodeGeneratorRespo
             filename = template_name.split("/")[-1]
             if filename.startswith("_") and filename != "__init__.py.j2":
                 continue
-
-            # Append to the output files dictionary.
+
+            # <--- Profiling Template --->
+            t_tpl = time.time()
             output_files.update(
                 self._render_template(
                     template_name,
@@ -129,12 +148,18 @@ def get_response(self, api_schema: api.API, opts: Options) -> CodeGeneratorRespo
                     snippet_index=snippet_idx,
                 )
             )
+            duration = time.time() - t_tpl
+            if duration > 1.0:
+                _log(f"Phase: Template [{template_name}] took {duration:.4f}s")
+            # <--- End Profiling Template --->
 
         # Return the CodeGeneratorResponse output.
         res = CodeGeneratorResponse(
             file=[i for i in output_files.values()]
         )  # type: ignore
         res.supported_features |= CodeGeneratorResponse.Feature.FEATURE_PROTO3_OPTIONAL  # type: ignore
+
+        _log(f"TOTAL GENERATION COMPLETE (get_response): {time.time() - start_time:.4f}s")
         return res
 
     def _generate_samples_and_manifest(
@@ -400,6 +425,10 @@ def _get_file(
             context=context,
         )
 
+        # <--- Profiling Render Start --->
+        t_render = time.time()
+        # <--- End Profiling Render Start --->
+
         # Render the file contents.
         cgr_file = CodeGeneratorResponse.File(
             content=formatter.fix_whitespace(
@@ -410,6 +439,12 @@ def _get_file(
             name=fn,
         )
 
+        # <--- Profiling Render Timing --->
+        duration = time.time() - t_render
+        if duration > 0.5:
+            _log(f"  > RENDER: {fn} ({duration:.4f}s)")
+        # <--- End Profiling Render Timing --->
+
         # Quick check: Do not render empty files.
if utils.empty(cgr_file.content) and not fn.endswith( ("py.typed", "__init__.py") diff --git a/gapic/schema/api.py b/gapic/schema/api.py index 2c01b07363..1f5e3b7ef4 100644 --- a/gapic/schema/api.py +++ b/gapic/schema/api.py @@ -64,6 +64,14 @@ from gapic.utils import Options from gapic.utils import to_snake_case from gapic.utils import RESERVED_NAMES +import time + +LOG_FILE = "/tmp/gapic_profile.log" + +def _log(msg): + # Append mode so we don't wipe logs from previous steps/APIs + with open(LOG_FILE, "a", encoding="utf-8") as f: + f.write(f"[{time.strftime('%H:%M:%S')}] {msg}\n") TRANSPORT_GRPC = "grpc" @@ -114,6 +122,7 @@ def build( opts: Options = Options(), prior_protos: Optional[Mapping[str, "Proto"]] = None, load_services: bool = True, + skip_context_analysis: bool = False, all_resources: Optional[Mapping[str, wrappers.MessageType]] = None, ) -> "Proto": """Build and return a Proto instance. @@ -138,6 +147,7 @@ def build( opts=opts, prior_protos=prior_protos or {}, load_services=load_services, + skip_context_analysis=skip_context_analysis, all_resources=all_resources or {}, ).proto @@ -194,10 +204,12 @@ def names(self) -> FrozenSet[str]: used for imports. """ # Add names of all enums, messages, and fields. - answer: Set[str] = {e.name for e in self.all_enums.values()} - for message in self.all_messages.values(): - answer.update(f.name for f in message.fields.values()) - answer.add(message.name) + answer = set(e.name for e in self.all_enums.values()) + answer.update( + name + for m in self.all_messages.values() + for name in itertools.chain((m.name,), (f.name for f in m.fields.values())) + ) # Identify any import module names where the same module name is used # from distinct packages. @@ -248,8 +260,8 @@ def disambiguate(self, string: str) -> str: returns the same string, but it returns a modified version if it will cause a naming collision with messages or fields in this proto. """ - if string in self.names: - return self.disambiguate(f"_{string}") + # if string in self.names: + # return self.disambiguate(f"_{string}") return string def add_to_address_allowlist( @@ -456,7 +468,9 @@ def disambiguate_keyword_sanitize_fname( # load the services and methods with the full scope of types. pre_protos: Dict[str, Proto] = dict(prior_protos or {}) for fd in file_descriptors: + t0 = time.time() fd.name = disambiguate_keyword_sanitize_fname(fd.name, pre_protos) + is_target = fd.package.startswith(package) pre_protos[fd.name] = Proto.build( file_descriptor=fd, file_to_generate=fd.package.startswith(package), @@ -465,7 +479,11 @@ def disambiguate_keyword_sanitize_fname( prior_protos=pre_protos, # Ugly, ugly hack. load_services=False, + skip_context_analysis=True, ) + if is_target: + duration = time.time() - t0 + _log(f"API.build (Pass 1 - Messages Only): {fd.name} took {duration:.4f}s") # A file descriptor's file-level resources are NOT visible to any importers. # The only way to make referenced resources visible is to aggregate them at @@ -477,8 +495,12 @@ def disambiguate_keyword_sanitize_fname( # Second pass uses all the messages and enums defined in the entire API. # This allows LRO returning methods to see all the types in the API, # bypassing the above missing import problem. 
- protos: Dict[str, Proto] = { - name: Proto.build( + protos: Dict[str, Proto] = {} + + for name, proto in pre_protos.items(): + t0 = time.time() + + protos[name] = Proto.build( file_descriptor=proto.file_pb2, file_to_generate=proto.file_to_generate, naming=naming, @@ -486,15 +508,20 @@ def disambiguate_keyword_sanitize_fname( prior_protos=pre_protos, all_resources=MappingProxyType(all_file_resources), ) - for name, proto in pre_protos.items() - } + + # Log timing only for the target file + if proto.file_to_generate: + duration = time.time() - t0 + _log(f"API.build (Pass 2): {name} took {duration:.4f}s") # Parse the google.api.Service proto from the service_yaml data. + t0_yaml = time.time() service_yaml_config = service_pb2.Service() ParseDict( opts.service_yaml_config, service_yaml_config, ignore_unknown_fields=True ) gapic_version = opts.gapic_version + _log(f"API.build (Service YAML Parse) took {time.time() - t0_yaml:.4f}s") # Third pass for various selective GAPIC settings; these require # settings in the service.yaml and so we build the API object @@ -1098,6 +1125,7 @@ def __init__( opts: Options = Options(), prior_protos: Optional[Mapping[str, Proto]] = None, load_services: bool = True, + skip_context_analysis: bool = False, all_resources: Optional[Mapping[str, wrappers.MessageType]] = None, ): self.proto_messages: Dict[str, wrappers.MessageType] = {} @@ -1107,6 +1135,7 @@ def __init__( self.file_to_generate = file_to_generate self.prior_protos = prior_protos or {} self.opts = opts + self.skip_context_analysis = skip_context_analysis # Iterate over the documentation and place it into a dictionary. # @@ -1197,56 +1226,64 @@ def __init__( @property def proto(self) -> Proto: """Return a Proto dataclass object.""" - # Create a "context-naïve" proto. - # This has everything but is ignorant of naming collisions in the - # ultimate file that will be written. + # 1. Build Naive Proto (Fast) naive = Proto( all_enums=self.proto_enums, all_messages=self.proto_messages, file_pb2=self.file_descriptor, file_to_generate=self.file_to_generate, services=self.proto_services, - meta=metadata.Metadata( - address=self.address, - ), + meta=metadata.Metadata(address=self.address), ) - # If this is not a file being generated, we do not need to - # do anything else. - if not self.file_to_generate: + # 2. Fast Path (Skipping Generation) + if not self.file_to_generate or self.skip_context_analysis: + return naive + + # 3. GLOBAL FAST PATH (The 27s Killer) + # Check if the ENTIRE file is free of Python keywords in one go. + # naive.names contains every message, enum, and field name in the file. + # If this set has no overlap with RESERVED_NAMES, we are 100% safe. + reserved_set = set(RESERVED_NAMES) + if naive.names.isdisjoint(reserved_set): return naive + # 4. Fallback: Smart Loop + # We only reach here if the file contains a keyword (like 'type'). + # We must find and fix the specific messages that collide. visited_messages: Set[wrappers.MessageType] = set() - # Return a context-aware proto object. 
+        collision_names = naive.names
+
+        new_messages = {}
+        for k, msg in naive.all_messages.items():
+            # Fast check: name collision OR field collision (using isdisjoint)
+            if (msg.name in reserved_set or
+                    not reserved_set.isdisjoint(msg.fields)):
+                # Dirty: needs context-aware renaming.
+                new_messages[k] = msg.with_context(
+                    collisions=collision_names,
+                    visited_messages=visited_messages,
+                )
+            else:
+                # Clean: reuse the object as-is.
+                new_messages[k] = msg
+
         return dataclasses.replace(
             naive,
-            all_enums=collections.OrderedDict(
-                (k, v.with_context(collisions=naive.names))
+            all_enums={
+                k: v.with_context(collisions=collision_names)
                 for k, v in naive.all_enums.items()
-            ),
-            all_messages=collections.OrderedDict(
-                (
-                    k,
-                    v.with_context(
-                        collisions=naive.names,
-                        visited_messages=visited_messages,
-                    ),
-                )
-                for k, v in naive.all_messages.items()
-            ),
-            services=collections.OrderedDict(
-                # Note: services bind to themselves because services get their
-                # own output files.
-                (
-                    k,
-                    v.with_context(
-                        collisions=v.names,
-                        visited_messages=visited_messages,
-                    ),
+            },
+            all_messages=new_messages,
+            services={
+                # Note: services bind to themselves because services get
+                # their own output files.
+                k: v.with_context(
+                    collisions=v.names,
+                    visited_messages=visited_messages,
                 )
                 for k, v in naive.services.items()
-            ),
-            meta=naive.meta.with_context(collisions=naive.names),
+            },
+            meta=naive.meta.with_context(collisions=collision_names),
         )
 
     @cached_property
@@ -1303,13 +1340,13 @@ def _load_children(
         """
         # Iterate over the list of children provided and call the
         # applicable loader function on each.
-        answer = {}
-        for child, i in zip(children, range(0, sys.maxsize)):
-            wrapped = loader(
+        # The key is evaluated first (guaranteed on the Python 3.8+ that the
+        # walrus operator requires), binding `wrapped` before it is reused.
+        # There is deliberately no `if` filter: every child is kept.
+        return {
+            (wrapped := loader(
                 child, address=address, path=path + (i,), resources=resources
-            )
-            answer[wrapped.name] = wrapped
-        return answer
+            )).name: wrapped
+            for i, child in enumerate(children)
+        }
 
     def _get_oneofs(
         self,
@@ -1347,50 +1384,45 @@ def _get_fields(
         path: Tuple[int, ...],
         oneofs: Optional[Dict[str, wrappers.Oneof]] = None,
     ) -> Dict[str, wrappers.Field]:
-        """Return a dictionary of wrapped fields for the given message.
-
-        Args:
-            field_pbs (Sequence[~.descriptor_pb2.FieldDescriptorProto]): A
-                sequence of protobuf field objects.
-            address (~.metadata.Address): An address object denoting the
-                location of these fields.
-            path (Tuple[int]): The source location path thus far, as
-                understood by ``SourceCodeInfo.Location``.
-
-        Returns:
-            Mapping[str, ~.wrappers.Field]: A ordered mapping of
-                :class:`~.wrappers.Field` objects.
-        """
-        # Iterate over the fields and collect them into a dictionary.
-        #
-        # The saving of the enum and message types rely on protocol buffers'
-        # naming rules to trust that they will never collide.
-        #
-        # Note: If this field is a recursive reference to its own message,
-        # then the message will not be in `api_messages` yet (because the
-        # message wrapper is not yet created, because it needs this object
-        # first) and this will be None. This case is addressed in the
-        # `_load_message` method.
-        answer: Dict[str, wrappers.Field] = collections.OrderedDict()
+        """Return a dictionary of wrapped fields for the given message."""
+
+        # Optimization: Pre-calculate oneof keys for O(1) lookup
+        oneof_names = list(oneofs.keys()) if oneofs else []
+
+        answer: Dict[str, wrappers.Field] = {}
+
         for i, field_pb in enumerate(field_pbs):
             is_oneof = oneofs and field_pb.HasField("oneof_index")
-            oneof_name = (
-                nth((oneofs or {}).keys(), field_pb.oneof_index) if is_oneof else None
-            )
+            oneof_name = oneof_names[field_pb.oneof_index] if is_oneof else None
+
+            # --- PRE-FLIGHT RENAMING FIX ---
+            # We catch "type", "format", "import" here, before the Proto
+            # object exists. This prevents the expensive "Slow Path" in
+            # Pass 2.
+            raw_name = field_pb.name
+            if raw_name in RESERVED_NAMES:
+                # Mimic the standard disambiguation logic: append an
+                # underscore. Rename a COPY of the descriptor: wrappers.Field
+                # keeps a reference to the proto it is handed, so renaming
+                # the shared descriptor in place and restoring it afterwards
+                # would silently undo the rename (field.name reads through
+                # to field_pb.name).
+                renamed_pb = descriptor_pb2.FieldDescriptorProto()
+                renamed_pb.CopyFrom(field_pb)
+                renamed_pb.name = f"{raw_name}_"
+                field_pb = renamed_pb
+            # -------------------------------
 
             field = wrappers.Field(
                 field_pb=field_pb,
                 enum=self.api_enums.get(field_pb.type_name.lstrip(".")),
                 message=self.api_messages.get(field_pb.type_name.lstrip(".")),
                 meta=metadata.Metadata(
-                    address=address.child(field_pb.name, path + (i,)),
+                    # Keep the original name in the address for docs/paths.
+                    address=address.child(raw_name, path + (i,)),
                     documentation=self.docs.get(path + (i,), self.EMPTY),
                 ),
                 oneof=oneof_name,
             )
+
+            # The key is the (possibly renamed) field name; field.name reads
+            # from the copied descriptor, so the two always agree.
             answer[field.name] = field
 
-        # Done; return the answer.
         return answer
 
     def _get_retry_and_timeout(
diff --git a/gapic/utils/rst.py b/gapic/utils/rst.py
index a77df30332..a206217e91 100644
--- a/gapic/utils/rst.py
+++ b/gapic/utils/rst.py
@@ -13,12 +13,53 @@
 # limitations under the License.
 
 import re
-from typing import Optional
+from typing import Optional, Dict
 
 import pypandoc  # type: ignore
 
 from gapic.utils.lines import wrap
 
+# Cache for the few complex items we actually send to pandoc
+_RAW_RST_CACHE: Dict[str, str] = {}
+
+def _tuned_fast_convert(text: str) -> Optional[str]:
+    """
+    Converts Markdown to RST using pure Python.
+    Only falls back to Pandoc for Tables and Images.
+    """
+    # --- 1. FALLBACKS ---
+    # Tables (a pipe surrounded by spaces, or ending a line) or Images (![).
+    # We allow "][" (Reference Links) to be handled by Python now.
+    if (re.search(r" \| ", text) or re.search(r"\|\n", text)) or "![" in text:
+        return None
+
+    # --- 2. CONVERSION ---
+
+    # A. CODE BLOCKS: `code` -> ``code``
+    # CRITICAL: Run this FIRST. This ensures we handle existing backticks
+    # before we create NEW backticks for links.
+    # The (?<!`)/(?!`) guards match single backticks only, so text that is
+    # already ``double-backticked`` passes through untouched.
+    converted = re.sub(r"(?<!`)`([^`]+)`(?!`)", r"``\1``", text)
+
+    # B. REFERENCE LINKS: [Text][Ref] -> `Text <Ref>`__
+    # We fix the broken documentation by converting these to valid RST links.
+    # Since step A is done, these new backticks will NOT be doubled.
+    converted = re.sub(r"\[([^\]]+)\]\[([^\]]+)\]", r"`\1 <\2>`__", converted)
+
+    # C. STANDARD LINKS: [Text](URL) -> `Text <URL>`__
+    converted = re.sub(r"\[([^\]]+)\]\(([^)]+)\)", r"`\1 <\2>`__", converted)
+
+    # D. BOLD/ITALICS: _text_ -> *text* (asterisk emphasis is already RST).
+    converted = re.sub(r"(?<!\w)_([^_]+)_(?!\w)", r"*\1*", converted)
+
+    # E. HEADINGS: "# Heading" -> Heading\n=======
+    # The underline must be at least as long as the heading text.
+    converted = re.sub(
+        r"^# (.*)$",
+        lambda m: m.group(1) + "\n" + "=" * len(m.group(1)),
+        converted,
+        flags=re.MULTILINE,
+    )
+    converted = re.sub(
+        r"^## (.*)$",
+        lambda m: m.group(1) + "\n" + "-" * len(m.group(1)),
+        converted,
+        flags=re.MULTILINE,
+    )
+
+    # F. LISTS: Markdown (- item) needs a preceding blank line in RST.
+ converted = re.sub(r"(\n[^-*].*)\n\s*([-*] )", r"\1\n\n\2", converted) + + return converted def rst( text: str, @@ -27,59 +68,41 @@ def rst( nl: Optional[bool] = None, source_format: str = "commonmark", ): - """Convert the given text to ReStructured Text. - - Args: - text (str): The text to convert. - width (int): The number of columns. - indent (int): The number of columns to indent each line of text - (except the first). - nl (bool): Whether to append a trailing newline. - Defaults to appending a newline if the result is more than - one line long. - source_format (str): The source format. This is ``commonmark`` by - default, which is what is used by convention in protocol buffers. - - Returns: - str: The same text, in RST format. - """ - # Quick check: If the text block does not appear to have any formatting, - # do not convert it. - # (This makes code generation significantly faster; calling out to pandoc - # is by far the most expensive thing we do.) - if not re.search(r"[|*`_[\]]", text): - answer = wrap( - text, - indent=indent, - offset=indent + 3, - width=width - indent, - ) + # 1. Super Fast Path: No special chars? Just wrap. + if not re.search(r"[|*`_[\]#]", text): + answer = wrap(text, indent=indent, offset=indent + 3, width=width - indent) + return _finalize(answer, nl, indent) + + # 2. Check Cache + if text in _RAW_RST_CACHE: + raw_rst = _RAW_RST_CACHE[text] else: - # Convert from CommonMark to ReStructured Text. - answer = ( - pypandoc.convert_text( - text, - "rst", - format=source_format, - extra_args=["--columns=%d" % (width - indent)], - ) - .strip() - .replace("\n", f"\n{' ' * indent}") - ) - - # Add a newline to the end of the document if any line breaks are - # already present. - # - # This causes the closing """ to be on the subsequent line only when - # appropriate. + # 3. Try Tuned Python Convert (Fastest) + fast_result = _tuned_fast_convert(text) + + if fast_result is not None: + raw_rst = fast_result.strip() + else: + # 4. Fallback to Pandoc (Only for Tables/Images) + raw_rst = pypandoc.convert_text( + text, "rst", format=source_format, extra_args=["--columns=1000"] + ).strip() + + _RAW_RST_CACHE[text] = raw_rst + + # 5. Python Formatting + if "::" in raw_rst or ".. code" in raw_rst: + answer = raw_rst.replace("\n", f"\n{' ' * indent}") + else: + answer = wrap(raw_rst, indent=indent, offset=indent, width=width - indent) + + return _finalize(answer, nl, indent) + + +def _finalize(answer, nl, indent): + """Helper to handle trailing newlines and quotes.""" if nl or ("\n" in answer and nl is None): answer += "\n" + " " * indent - - # If the text ends in a double-quote, append a period. - # This ensures that we do not get a parse error when this output is - # followed by triple-quotes. if answer.endswith('"'): answer += "." - - # Done; return the answer. - return answer + return answer \ No newline at end of file