From 140ad94735b4875168ba25c19651ec7f85336781 Mon Sep 17 00:00:00 2001 From: Fuat Date: Mon, 2 Mar 2026 03:10:44 +0300 Subject: [PATCH 1/6] Add C++ header parser with access specifier tracking Regex-based parser for .h/.hpp/.hxx/.h++ files that captures public API: classes, structs, unions, methods, constructors, destructors, operator overloads, free functions, fields, enums, typedefs, and using aliases. Key features: - Access specifier state machine (public/protected/private sections) - Doxygen doc comment extraction (@brief, @param, @return) - Template declaration handling - Export macro stripping (SFML_API, etc.) - Namespace and class nesting with brace-depth tracking - FQN uses :: separator (C++ convention) Also fixes db.py to support :: in namespace splitting and FTS escaping. --- src/codesurface/db.py | 7 +- src/codesurface/parsers/__init__.py | 2 + src/codesurface/parsers/cpp.py | 1403 +++++++++++++++++++++++++++ 3 files changed, 1409 insertions(+), 3 deletions(-) create mode 100644 src/codesurface/parsers/cpp.py diff --git a/src/codesurface/db.py b/src/codesurface/db.py index 0b7fad1..16695f2 100644 --- a/src/codesurface/db.py +++ b/src/codesurface/db.py @@ -38,10 +38,11 @@ def _build_search_text(record: dict) -> str: val = record.get(field, "") if val: tokens.append(split_identifier(val)) - # Last namespace segment (e.g. "Services" from "CampGame.Services") + # Last namespace segment (e.g. 
"Services" from "CampGame.Services", + # or "Utils" from "MyLib::Utils") ns = record.get("namespace", "") if ns: - last_part = ns.rsplit(".", 1)[-1] + last_part = re.split(r"[.:]", ns)[-1] tokens.append(split_identifier(last_part)) return " ".join(tokens) @@ -249,7 +250,7 @@ def _escape_fts(query: str) -> str: "ICommand" → (ICommand*) OR (I Command*) """ q = query - for ch in '."-*()': + for ch in '."-*():': q = q.replace(ch, " ") terms = [t for t in q.split() if t] if not terms: diff --git a/src/codesurface/parsers/__init__.py b/src/codesurface/parsers/__init__.py index 8ee125e..73cea5e 100644 --- a/src/codesurface/parsers/__init__.py +++ b/src/codesurface/parsers/__init__.py @@ -53,12 +53,14 @@ def all_extensions() -> list[str]: # --- Auto-register built-in parsers --- +from .cpp import CppParser # noqa: E402 from .csharp import CSharpParser # noqa: E402 from .go import GoParser # noqa: E402 from .java import JavaParser # noqa: E402 from .python_parser import PythonParser # noqa: E402 from .typescript import TypeScriptParser # noqa: E402 +register("cpp", CppParser) register("csharp", CSharpParser) register("go", GoParser) register("java", JavaParser) diff --git a/src/codesurface/parsers/cpp.py b/src/codesurface/parsers/cpp.py new file mode 100644 index 0000000..e56e2ba --- /dev/null +++ b/src/codesurface/parsers/cpp.py @@ -0,0 +1,1403 @@ +"""C++ header parser that captures public API declarations. + +Scans C++ header files (.h, .hpp, .hxx, .h++) tracking brace-depth scope, +namespace nesting, class/struct/union hierarchy with access specifier state, +and emits only public members. + +Captures: classes, structs, unions, methods, constructors, destructors, +operator overloads, free functions, public fields, enums, enum class, +typedef, and using aliases. + +Doc comments: Doxygen /** */ blocks and /// lines with @brief/@param/@return +extraction. 
+""" + +import re +from pathlib import Path + +from .base import BaseParser + +# --------------------------------------------------------------------------- +# Skip patterns +# --------------------------------------------------------------------------- + +_SKIP_DIRS = frozenset({ + "build", ".git", "third_party", "vendor", "test", "tests", + "examples", "node_modules", ".cache", "obj", "out", + "Debug", "Release", "x64", "x86", ".vs", +}) + +_SKIP_DIR_PREFIXES = ("cmake-build-",) + +# C++ keywords that can't be member names +_CPP_KEYWORDS = frozenset({ + "if", "else", "for", "while", "do", "switch", "case", "default", + "break", "continue", "return", "goto", "try", "catch", "throw", + "new", "delete", "this", "sizeof", "alignof", "decltype", "typeid", + "static_cast", "dynamic_cast", "const_cast", "reinterpret_cast", + "true", "false", "nullptr", "void", "auto", "register", "volatile", + "extern", "mutable", "inline", "constexpr", "consteval", "constinit", + "public", "protected", "private", "friend", "using", "namespace", + "class", "struct", "union", "enum", "typedef", "template", + "virtual", "override", "final", "static", "const", "noexcept", + "explicit", "operator", "typename", "concept", "requires", +}) + +# --------------------------------------------------------------------------- +# Regex patterns +# --------------------------------------------------------------------------- + +# Export/API macros: SFML_API, IMGUI_API, MY_EXPORT, etc. +_EXPORT_MACRO_RE = re.compile(r"\b\w+_(?:API|EXPORT|DLL|SHARED)\b") + +# Namespace: namespace foo { or namespace foo::bar { +_NAMESPACE_RE = re.compile( + r"^\s*namespace\s+" + r"(\w+(?:::\w+)*)" # namespace name (possibly nested) + r"\s*\{?" 
+) + +# Anonymous namespace +_ANON_NAMESPACE_RE = re.compile(r"^\s*namespace\s*\{") + +# Access specifier: public: / protected: / private: +_ACCESS_RE = re.compile(r"^\s*(public|protected|private)\s*:") + +# Class/struct/union declaration +# Handles: template<...> class EXPORT_API ClassName : public Base { +_CLASS_RE = re.compile( + r"^\s*(?:template\s*<[^>]*>\s*)?" # optional template<...> + r"(class|struct|union)\s+" + r"(?:\w+_(?:API|EXPORT|DLL|SHARED)\s+)?" # optional export macro + r"(\w+)" # class name + r"(?:\s+final)?" # optional final + r"(.*)" # rest: inheritance, {, ; +) + +# Forward declaration: class Foo; or struct Foo; +_FORWARD_DECL_RE = re.compile( + r"^\s*(?:class|struct|union)\s+" + r"(?:\w+_(?:API|EXPORT|DLL|SHARED)\s+)?" + r"\w+\s*;" +) + +# Friend declaration +_FRIEND_RE = re.compile(r"^\s*friend\s+") + +# Enum: enum Foo { or enum class Foo : int { +_ENUM_RE = re.compile( + r"^\s*(enum\s+class|enum\s+struct|enum)\s+" + r"(\w+)" # enum name + r"(?:\s*:\s*(\w+))?" # optional underlying type + r"(.*)" # rest +) + +# Enum value: NAME = value, or NAME, +_ENUM_VALUE_RE = re.compile( + r"^\s*(\w+)" # enumerator name + r"(?:\s*=\s*([^,}]+))?" # optional = value + r"\s*[,}]?" +) + +# Typedef: typedef old_type new_name; +_TYPEDEF_RE = re.compile( + r"^\s*typedef\s+" + r"(.+?)\s+" # original type + r"(\w+)\s*;" # new name +) + +# Using alias: using Name = type; +_USING_ALIAS_RE = re.compile( + r"^\s*(?:template\s*<[^>]*>\s*)?" 
+ r"using\s+(\w+)\s*=\s*(.+?)\s*;" +) + +# Method/function declaration (very broad, refined in code) +# Captures: optional qualifiers, return type, name, params +_FUNC_RE = re.compile( + r"^\s*" + r"((?:(?:static|virtual|inline|explicit|constexpr|consteval|" + r"friend|extern|nodiscard|\[\[nodiscard\]\]|" + r"\w+_(?:API|EXPORT|DLL|SHARED))\s+)*)" # leading qualifiers + r"([\w:*&<>,\s]+?)\s+" # return type + r"(\w+)" # function/method name + r"\s*\(" # open paren +) + +# Constructor: ClassName(params) +_CTOR_RE = re.compile( + r"^\s*" + r"((?:(?:explicit|inline|constexpr|consteval|" + r"\w+_(?:API|EXPORT|DLL|SHARED))\s+)*)" # optional qualifiers + r"(\w+)" # class name (must match current) + r"\s*\(" # open paren +) + +# Destructor: ~ClassName() or virtual ~ClassName() +_DTOR_RE = re.compile( + r"^\s*" + r"(?:virtual\s+)?" + r"~(\w+)" # class name + r"\s*\(" +) + +# Operator overload: ReturnType operator+(...) or operator Type() +_OPERATOR_RE = re.compile( + r"^\s*" + r"((?:(?:static|virtual|inline|explicit|constexpr|friend|" + r"\w+_(?:API|EXPORT|DLL|SHARED))\s+)*)" # leading qualifiers + r"([\w:*&<>,\s]*?)\s*" # return type (may be empty for conversion) + r"(operator\s*(?:\(\)|" # operator() — call operator + r"\[\]|" # operator[] — subscript + r"->|" # operator-> — member access + r"<<|>>|" # shift operators + r"[+\-*/%^&|~!=<>]=?|" # arithmetic/comparison ops + r"&&|\|\||" # logical ops + r",|" # comma operator + r"\w[\w:*&<> ]*?" # conversion operator + r"))" + r"\s*\(" # open paren +) + +# Field declaration: type name; or type name = value; +# Only matched inside class/struct bodies when access is public +_FIELD_RE = re.compile( + r"^\s*" + r"((?:(?:static|const|constexpr|inline|mutable|volatile)\s+)*)" # qualifiers + r"([\w:*&<>,\s]+?)\s+" # type + r"(\w+)" # field name + r"(?:\s*(?:=\s*[^;]+|{[^}]*}|\[[^\]]*\]))?" 
class CppParser(BaseParser):
    """Parser for C++ header files (``.h``, ``.hpp``, ``.hxx``, ``.h++``)."""

    @property
    def file_extensions(self) -> list[str]:
        """Return the header suffixes handled by this parser."""
        return [".h", ".hpp", ".hxx", ".h++"]

    def parse_directory(self, directory: Path) -> list[dict]:
        """Parse every header below *directory*, skipping build/vendor/test trees.

        A file is skipped when any component of its path relative to
        *directory* is listed in ``_SKIP_DIRS`` or starts with one of
        ``_SKIP_DIR_PREFIXES``.  A failure while parsing one file is reported
        on stderr and does not abort the scan.
        """
        collected: list[dict] = []
        for suffix in self.file_extensions:
            for header in sorted(directory.rglob(f"*{suffix}")):
                segments = header.relative_to(directory).parts
                skipped = any(
                    seg in _SKIP_DIRS or seg.startswith(_SKIP_DIR_PREFIXES)
                    for seg in segments
                )
                if skipped:
                    continue
                try:
                    collected.extend(self.parse_file(header, directory))
                except Exception as exc:
                    # Best effort: one unparsable header must not lose the rest.
                    import sys
                    print(f"codesurface: failed to parse {header}: {exc}", file=sys.stderr)
        return collected

    def parse_file(self, path: Path, base_dir: Path) -> list[dict]:
        """Parse a single header; delegates to ``_parse_cpp_file``."""
        return _parse_cpp_file(path, base_dir)
in_multiline_comment = False + pending_template = "" + in_enum: str = "" # enum name if inside enum body + enum_class_name: str = "" # owning class for the enum, if any + enum_brace_depth = -1 + + i = 0 + while i < len(lines): + line = lines[i] + + # --- Multi-line comment continuation --- + if in_multiline_comment: + if "*/" in line: + in_multiline_comment = False + after = line[line.index("*/") + 2:] + brace_depth += _count_braces(after) + i += 1 + continue + + stripped = line.strip() + + # Empty line + if not stripped: + i += 1 + continue + + # Start of multi-line comment (not doc comment — those are read on demand) + if "/*" in stripped and "*/" not in stripped: + # Check if it's NOT a doc comment (we handle those via lookback) + if not stripped.startswith("/**") and not stripped.startswith("/*!"): + if not stripped.startswith("//"): + in_multiline_comment = True + pre = line[:line.find("/*")] + brace_depth += _count_braces(pre) + i += 1 + continue + else: + # Doc comment block — skip lines until */ + in_multiline_comment = True + i += 1 + continue + + # Single-line comment + if stripped.startswith("//"): + i += 1 + continue + + # Preprocessor + if _PREPROCESSOR_RE.match(line): + # Skip continuation lines + while _MACRO_CONT_RE.search(line) and i + 1 < len(lines): + i += 1 + line = lines[i] + i += 1 + continue + + # Count braces + brace_delta = _count_braces(line) + new_depth = brace_depth + brace_delta + + # --- Template accumulation --- + if _TEMPLATE_RE.match(line) and not _has_declaration_after_template(stripped): + pending_template = stripped + # Multi-line template: balance angle brackets + angle_depth = _count_angles(stripped) + while angle_depth > 0 and i + 1 < len(lines): + i += 1 + next_stripped = lines[i].strip() + pending_template += " " + next_stripped + angle_depth += _count_angles(next_stripped) + brace_delta = _count_braces(lines[i]) + new_depth = brace_depth + brace_delta + brace_depth = new_depth + i += 1 + continue + + # --- Inside enum body 
--- + if in_enum: + if new_depth <= enum_brace_depth: + # Enum body closed + in_enum = "" + enum_class_name = "" + enum_brace_depth = -1 + brace_depth = new_depth + i += 1 + continue + + # Parse enum values + val_m = _ENUM_VALUE_RE.match(stripped) + if val_m and stripped not in ("{", "}"): + val_name = val_m.group(1) + if val_name not in _CPP_KEYWORDS and not val_name.startswith("//"): + val_value = val_m.group(2) + ns = _build_ns(namespace_stack) + sig = val_name + if val_value: + sig += f" = {val_value.strip()}" + + parent_class = enum_class_name or in_enum + fqn_parts = [p for p in [ns, enum_class_name, in_enum, val_name] if p] + fqn = "::".join(fqn_parts) + + records.append(_build_record( + fqn=fqn, + namespace=ns, + class_name=in_enum, + member_name=val_name, + member_type="field", + signature=sig, + file_path=rel_path, + line_start=i + 1, + line_end=i + 1, + )) + + brace_depth = new_depth + i += 1 + continue + + # --- Close class/struct/union scope --- + while class_stack and new_depth <= class_stack[-1][2]: + class_stack.pop() + + # --- Close namespace scope --- + while namespace_stack and new_depth <= namespace_stack[-1][1]: + namespace_stack.pop() + + # --- Anonymous namespace (skip tracking) --- + if _ANON_NAMESPACE_RE.match(line): + brace_depth = new_depth + i += 1 + continue + + # --- Namespace declaration --- + ns_m = _NAMESPACE_RE.match(line) + if ns_m and "=" not in line: + ns_name = ns_m.group(1) + # Handle inline namespaces and nested namespace::name + if "{" in line: + # For nested ns like namespace a::b::c { + parts = ns_name.split("::") + base_depth = brace_depth + for j, part in enumerate(parts): + # Each nested namespace segment gets its own depth level + # but they all open at the same brace + if j == len(parts) - 1: + namespace_stack.append((part, base_depth)) + else: + namespace_stack.append((part, base_depth)) + else: + # namespace without brace on same line — will come on next line + namespace_stack.append((ns_name, brace_depth)) + 
pending_template = "" + brace_depth = new_depth + i += 1 + continue + + # --- Forward declaration (skip) --- + if _FORWARD_DECL_RE.match(line): + pending_template = "" + brace_depth = new_depth + i += 1 + continue + + # --- Friend declaration (skip) --- + if _FRIEND_RE.match(stripped): + pending_template = "" + brace_depth = new_depth + i += 1 + continue + + # --- Access specifier --- + access_m = _ACCESS_RE.match(line) + if access_m and class_stack: + class_stack[-1][3] = access_m.group(1) + brace_depth = new_depth + i += 1 + continue + + # Determine if we're in a public context + is_public = _is_public(class_stack) + + # --- Enum declaration --- + enum_m = _ENUM_RE.match(line) + if enum_m: + enum_keyword = enum_m.group(1) + enum_name = enum_m.group(2) + enum_underlying = enum_m.group(3) or "" + enum_rest = enum_m.group(4) + + if is_public and enum_name not in _CPP_KEYWORDS: + ns = _build_ns(namespace_stack) + owning_class = class_stack[-1][0] if class_stack else "" + doc = _look_back_for_doc(lines, i) + + sig = f"{enum_keyword} {enum_name}" + if enum_underlying: + sig += f" : {enum_underlying}" + if pending_template: + sig = pending_template + " " + sig + + fqn_parts = [p for p in [ns, owning_class, enum_name] if p] + fqn = "::".join(fqn_parts) + + records.append(_build_record( + fqn=fqn, + namespace=ns, + class_name=owning_class or enum_name, + member_name="" if not owning_class else enum_name, + member_type="type", + signature=sig, + summary=doc.get("brief", ""), + file_path=rel_path, + line_start=i + 1, + line_end=i + 1, + )) + + # Track enum body for value extraction + if "{" in line: + in_enum = enum_name + enum_class_name = owning_class + enum_brace_depth = brace_depth + + pending_template = "" + brace_depth = new_depth + i += 1 + continue + + # --- Typedef --- + td_m = _TYPEDEF_RE.match(line) + if td_m: + if is_public: + orig_type = td_m.group(1).strip() + new_name = td_m.group(2) + if new_name not in _CPP_KEYWORDS: + ns = _build_ns(namespace_stack) + 
owning_class = class_stack[-1][0] if class_stack else "" + doc = _look_back_for_doc(lines, i) + + sig = f"typedef {orig_type} {new_name}" + fqn_parts = [p for p in [ns, owning_class, new_name] if p] + fqn = "::".join(fqn_parts) + + records.append(_build_record( + fqn=fqn, + namespace=ns, + class_name=owning_class or new_name, + member_name="" if not owning_class else new_name, + member_type="type", + signature=sig, + summary=doc.get("brief", ""), + file_path=rel_path, + line_start=i + 1, + line_end=i + 1, + )) + + pending_template = "" + brace_depth = new_depth + i += 1 + continue + + # --- Using alias --- + using_m = _USING_ALIAS_RE.match(line) + if using_m: + if is_public: + alias_name = using_m.group(1) + alias_type = using_m.group(2).strip() + if alias_name not in _CPP_KEYWORDS: + ns = _build_ns(namespace_stack) + owning_class = class_stack[-1][0] if class_stack else "" + doc = _look_back_for_doc(lines, i) + + sig = f"using {alias_name} = {alias_type}" + if pending_template: + sig = pending_template + " " + sig + fqn_parts = [p for p in [ns, owning_class, alias_name] if p] + fqn = "::".join(fqn_parts) + + records.append(_build_record( + fqn=fqn, + namespace=ns, + class_name=owning_class or alias_name, + member_name="" if not owning_class else alias_name, + member_type="type", + signature=sig, + summary=doc.get("brief", ""), + file_path=rel_path, + line_start=i + 1, + line_end=i + 1, + )) + + pending_template = "" + brace_depth = new_depth + i += 1 + continue + + # --- Class / struct / union declaration --- + class_m = _CLASS_RE.match(line) + if class_m: + kind = class_m.group(1) # class, struct, or union + name = class_m.group(2) + rest = class_m.group(3).strip() + + # Skip forward declarations (ending with ;) + if rest.endswith(";") and "{" not in rest: + pending_template = "" + brace_depth = new_depth + i += 1 + continue + + if name not in _CPP_KEYWORDS and is_public: + ns = _build_ns(namespace_stack) + owning_class = class_stack[-1][0] if class_stack else "" 
+ doc = _look_back_for_doc(lines, i) + + # Build signature with inheritance + sig = f"{kind} {name}" + # Extract inheritance + inheritance = _extract_inheritance(rest) + if inheritance: + sig += f" : {inheritance}" + if pending_template: + sig = pending_template + " " + sig + + fqn_parts = [p for p in [ns, owning_class, name] if p] + fqn = "::".join(fqn_parts) + + records.append(_build_record( + fqn=fqn, + namespace=ns, + class_name=owning_class or name, + member_name="" if not owning_class else name, + member_type="type", + signature=sig, + summary=doc.get("brief", ""), + file_path=rel_path, + line_start=i + 1, + line_end=i + 1, + )) + + # Push onto class stack with default access + if "{" in line: + default_access = "public" if kind in ("struct", "union") else "private" + class_stack.append([name, kind, brace_depth, default_access]) + + pending_template = "" + brace_depth = new_depth + i += 1 + continue + + # --- Destructor --- + dtor_m = _DTOR_RE.match(line) + if dtor_m and class_stack: + class_name_match = dtor_m.group(1) + if class_name_match == class_stack[-1][0] and is_public: + ns = _build_ns(namespace_stack) + owning_class = class_stack[-1][0] + doc = _look_back_for_doc(lines, i) + + full_sig, end_i = _collect_signature(lines, i) + params_str = _extract_params_str(full_sig, f"~{class_name_match}") + sig = _clean_sig(f"~{class_name_match}({params_str})") + + # Add qualifiers + quals = _extract_trailing_qualifiers(full_sig) + if quals: + sig += " " + quals + + if "virtual" in stripped: + sig = "virtual " + sig + + fqn = "::".join([p for p in [ns, owning_class, f"~{owning_class}"] if p]) + records.append(_build_record( + fqn=fqn, + namespace=ns, + class_name=owning_class, + member_name=f"~{owning_class}", + member_type="method", + signature=sig, + summary=doc.get("brief", ""), + params_json=doc.get("params", []), + file_path=rel_path, + line_start=i + 1, + line_end=end_i + 1, + )) + + pending_template = "" + brace_depth = new_depth + i += 1 + continue + + # 
--- Operator overload --- + op_m = _OPERATOR_RE.match(line) + if op_m and is_public: + qualifiers = op_m.group(1).strip() + ret_type = op_m.group(2).strip() + op_name = op_m.group(3).strip() + + # Skip friend declarations + if "friend" in qualifiers: + pending_template = "" + brace_depth = new_depth + i += 1 + continue + + ns = _build_ns(namespace_stack) + owning_class = class_stack[-1][0] if class_stack else "" + doc = _look_back_for_doc(lines, i) + + full_sig, end_i = _collect_signature(lines, i) + params_str = _extract_params_str(full_sig, op_name) + + sig_parts = [] + if qualifiers: + sig_parts.append(_strip_export_macros(qualifiers)) + if ret_type: + sig_parts.append(_strip_export_macros(ret_type)) + sig_parts.append(f"{op_name}({params_str})") + sig = _clean_sig(" ".join(sig_parts)) + + quals = _extract_trailing_qualifiers(full_sig) + if quals: + sig += " " + quals + + if pending_template: + sig = pending_template + " " + sig + + fqn_parts = [p for p in [ns, owning_class, op_name] if p] + fqn = "::".join(fqn_parts) + + # Handle overloads: add param types to FQN + param_types = _extract_param_types(params_str) + if param_types: + fqn += f"({param_types})" + + records.append(_build_record( + fqn=fqn, + namespace=ns, + class_name=owning_class, + member_name=op_name, + member_type="method", + signature=sig, + summary=doc.get("brief", ""), + params_json=doc.get("params", []), + returns_text=doc.get("returns", ""), + file_path=rel_path, + line_start=i + 1, + line_end=end_i + 1, + )) + + pending_template = "" + brace_depth = new_depth + i += 1 + continue + + # --- Constructor --- + if class_stack: + ctor_m = _CTOR_RE.match(line) + if ctor_m: + ctor_name = ctor_m.group(2) + if ctor_name == class_stack[-1][0] and is_public: + qualifiers = ctor_m.group(1).strip() + ns = _build_ns(namespace_stack) + owning_class = class_stack[-1][0] + doc = _look_back_for_doc(lines, i) + + full_sig, end_i = _collect_signature(lines, i) + params_str = _extract_params_str(full_sig, 
ctor_name) + + sig_parts = [] + if qualifiers: + sig_parts.append(_strip_export_macros(qualifiers)) + sig_parts.append(f"{ctor_name}({params_str})") + sig = _clean_sig(" ".join(sig_parts)) + + if pending_template: + sig = pending_template + " " + sig + + fqn_parts = [p for p in [ns, owning_class, ctor_name] if p] + fqn = "::".join(fqn_parts) + + # Handle overloads + param_types = _extract_param_types(params_str) + if param_types: + fqn += f"({param_types})" + + records.append(_build_record( + fqn=fqn, + namespace=ns, + class_name=owning_class, + member_name=ctor_name, + member_type="method", + signature=sig, + summary=doc.get("brief", ""), + params_json=doc.get("params", []), + file_path=rel_path, + line_start=i + 1, + line_end=end_i + 1, + )) + + pending_template = "" + brace_depth = new_depth + i += 1 + continue + + # --- Method / Free function --- + func_m = _FUNC_RE.match(line) + if func_m and is_public: + qualifiers = func_m.group(1).strip() + ret_type = func_m.group(2).strip() + func_name = func_m.group(3) + + # Skip if func_name is a keyword or matches current class (that's a ctor) + if func_name in _CPP_KEYWORDS: + pending_template = "" + brace_depth = new_depth + i += 1 + continue + + # Skip friend functions + if "friend" in qualifiers: + pending_template = "" + brace_depth = new_depth + i += 1 + continue + + # Skip if this is actually a constructor (name matches class) + if class_stack and func_name == class_stack[-1][0]: + pending_template = "" + brace_depth = new_depth + i += 1 + continue + + ns = _build_ns(namespace_stack) + owning_class = class_stack[-1][0] if class_stack else "" + doc = _look_back_for_doc(lines, i) + + full_sig, end_i = _collect_signature(lines, i) + params_str = _extract_params_str(full_sig, func_name) + + sig_parts = [] + clean_quals = _strip_export_macros(qualifiers) + if clean_quals: + sig_parts.append(clean_quals) + clean_ret = _strip_export_macros(ret_type) + if clean_ret: + sig_parts.append(clean_ret) + 
sig_parts.append(f"{func_name}({params_str})") + sig = _clean_sig(" ".join(sig_parts)) + + # Add trailing qualifiers (const, noexcept, override, = 0, etc.) + quals = _extract_trailing_qualifiers(full_sig) + if quals: + sig += " " + quals + + if pending_template: + sig = pending_template + " " + sig + + fqn_parts = [p for p in [ns, owning_class, func_name] if p] + fqn = "::".join(fqn_parts) + + records.append(_build_record( + fqn=fqn, + namespace=ns, + class_name=owning_class, + member_name=func_name, + member_type="method", + signature=sig, + summary=doc.get("brief", ""), + params_json=doc.get("params", []), + returns_text=doc.get("returns", ""), + file_path=rel_path, + line_start=i + 1, + line_end=end_i + 1, + )) + + pending_template = "" + brace_depth = new_depth + i += 1 + continue + + # --- Field (inside class/struct body, public only) --- + if class_stack and is_public: + field_m = _FIELD_RE.match(line) + if field_m: + field_quals = field_m.group(1).strip() + field_type = field_m.group(2).strip() + field_name = field_m.group(3) + + if field_name not in _CPP_KEYWORDS: + # Skip if field_type looks like a keyword-only thing + if field_type and field_type not in _CPP_KEYWORDS: + ns = _build_ns(namespace_stack) + owning_class = class_stack[-1][0] + doc = _look_back_for_doc(lines, i) + + sig_parts = [] + if field_quals: + sig_parts.append(field_quals) + sig_parts.append(field_type) + sig_parts.append(field_name) + sig = _clean_sig(" ".join(sig_parts)) + + fqn_parts = [p for p in [ns, owning_class, field_name] if p] + fqn = "::".join(fqn_parts) + + records.append(_build_record( + fqn=fqn, + namespace=ns, + class_name=owning_class, + member_name=field_name, + member_type="field", + signature=sig, + summary=doc.get("brief", ""), + file_path=rel_path, + line_start=i + 1, + line_end=i + 1, + )) + + pending_template = "" + brace_depth = new_depth + i += 1 + + # Deduplicate — keep first occurrence + unique: list[dict] = [] + seen: set[str] = set() + for rec in records: + 
# ---------------------------------------------------------------------------
# Helper: public access check
# ---------------------------------------------------------------------------

def _is_public(class_stack: list[list]) -> bool:
    """Report whether the current position is publicly visible.

    Each stack entry is ``[name, kind, depth, access]``; only the access slot
    (index 3) is read.  Namespace scope (an empty stack) counts as public, and
    a nested declaration is public only when every enclosing class is
    currently in a ``public`` section.
    """
    return all(entry[3] == "public" for entry in class_stack)


# ---------------------------------------------------------------------------
# Namespace builder
# ---------------------------------------------------------------------------

def _build_ns(namespace_stack: list[tuple[str, int]]) -> str:
    """Join the names on the namespace stack with ``::`` ("" when empty)."""
    return "::".join(name for name, _ in namespace_stack)


# ---------------------------------------------------------------------------
# Inheritance extraction
# ---------------------------------------------------------------------------

def _extract_inheritance(rest: str) -> str:
    """Extract the base-class list from the text following a class name.

    *rest* is whatever trails the class name on the declaration line, e.g.
    ``": public Base, private Other {"`` or just ``"{"``.  Returns the text
    between the inheritance colon and the opening brace, stripped, or ``""``
    when the declaration has no base classes.

    The class body is cut off *before* searching for the colon, and only a
    lone ``:`` (not half of a ``::`` scope operator) is accepted.  Otherwise a
    one-line definition such as ``struct P { std::string s; };`` or a
    bit-field ``{ int x : 3; };`` would be reported as inheritance.
    """
    head = rest.partition("{")[0]
    colon = re.search(r"(?<!:):(?!:)", head)
    if colon is None:
        return ""
    return head[colon.end():].strip()
def _look_back_for_doc(lines: list[str], decl_idx: int) -> dict:
    """Look backwards from a declaration for a Doxygen doc comment.

    Args:
        lines: All lines of the file.
        decl_idx: Zero-based index of the declaration line.

    Returns:
        A dict with ``brief`` (str), ``params`` (list of
        ``{"name", "description"}`` dicts) and ``returns`` (str); all empty
        when no doc comment precedes the declaration.

    ``///`` and ``//!`` line comments are tried first, then a ``/** ... */`` or
    ``/*! ... */`` block.  A block is accepted only when the comment that ends
    just above the declaration really opens with a doc marker, so a plain
    ``/* ... */`` or a trailing ``int x; /* note */`` never pulls in an
    unrelated doc block from further up the file.
    """
    empty: dict = {"brief": "", "params": [], "returns": ""}
    line_markers = ("///", "//!")

    # --- Pass 1: line comments directly above the declaration ---
    doc_lines: list[str] = []
    i = decl_idx - 1
    while i >= 0:
        stripped = lines[i].strip()
        if stripped.startswith(line_markers):
            text = stripped[3:].strip()
            if text and not text.strip("/"):
                # Decorative banner made only of slashes: not doc text.
                i -= 1
                continue
            # A leading "<" marks member docs in some styles; drop it.
            if text.startswith("<"):
                text = text[1:].strip()
            doc_lines.append(text)
            i -= 1
        elif not stripped and i > 0 and lines[i - 1].strip().startswith(line_markers):
            # Tolerate a single blank line inside / below the comment run.
            i -= 1
        else:
            break

    if doc_lines:
        doc_lines.reverse()
        return _parse_doxygen_lines(doc_lines)

    # --- Pass 2: block comment ending on the nearest non-blank line ---
    i = decl_idx - 1
    while i >= 0 and not lines[i].strip():
        i -= 1
    if i < 0 or not lines[i].strip().endswith("*/"):
        return empty

    closing_idx = i
    block_lines: list[str] = []
    is_doc = False
    while i >= 0:
        stripped = lines[i].strip()
        if i != closing_idx and stripped.endswith("*/"):
            # Walked into an earlier comment without finding our opener.
            return empty
        block_lines.append(stripped)
        if stripped.startswith("/*"):
            is_doc = stripped.startswith(("/**", "/*!"))
            break
        if "/*" in stripped and not stripped.startswith("*"):
            # Comment opens after code on this line: a trailing remark.
            break
        i -= 1
    if not is_doc:
        return empty

    # Strip the comment markers, keeping only non-empty text lines.
    cleaned: list[str] = []
    for bline in reversed(block_lines):
        text = bline
        if text.startswith(("/**", "/*!")):
            text = text[3:].strip()
        if text.endswith("*/"):
            text = text[:-2].strip()
        if text.startswith("*"):
            text = text[1:].strip()
        if text:
            cleaned.append(text)

    return _parse_doxygen_lines(cleaned)


def _parse_doxygen_lines(doc_lines: list[str]) -> dict:
    """Parse Doxygen tags out of already de-commented doc lines.

    Recognises ``@brief``/``\\brief``, ``@param``/``\\param`` (with an optional
    ``[in]``/``[out]`` direction) and ``@return``/``@returns`` (either prefix).
    Any other tag line is ignored.  Without an explicit brief, the first
    sentence of the untagged text is used.

    Returns:
        ``{"brief": str, "params": [{"name", "description"}, ...],
        "returns": str}``.
    """
    brief = ""
    params: list[dict] = []
    returns = ""
    brief_lines: list[str] = []

    for line in doc_lines:
        if line.startswith(("@brief ", "\\brief ")):
            brief = line[7:].strip()
            continue

        if line.startswith(("@param", "\\param")):
            param_text = line[6:].strip()
            # Drop a direction attribute such as [in], [out], [in,out].
            if param_text.startswith("["):
                bracket_end = param_text.find("]")
                if bracket_end != -1:
                    param_text = param_text[bracket_end + 1:].strip()
            parts = param_text.split(None, 1)
            if parts:
                params.append({
                    "name": parts[0],
                    "description": parts[1] if len(parts) > 1 else "",
                })
            continue

        # The optional "s" belongs to the tag itself; matching it here keeps a
        # description that merely starts with "s" ("status code") intact.
        return_m = re.match(r"[@\\]returns?(?!\w)(.*)", line)
        if return_m:
            returns = return_m.group(1).strip()
            continue

        # @see, @note, @warning, @deprecated, @throws, ... are not captured.
        if line.startswith(("@", "\\")):
            continue

        # Untagged text feeds the implicit brief until an explicit one appears.
        if not brief and line:
            brief_lines.append(line)

    if not brief and brief_lines:
        full = " ".join(brief_lines)
        # First sentence: up to a "." followed by a space or the end of text.
        sentence_end = re.search(r"\.(?= |\Z)", full)
        brief = full[:sentence_end.end()] if sentence_end else full

    return {"brief": brief, "params": params, "returns": returns}


# ---------------------------------------------------------------------------
# Signature collection
# ---------------------------------------------------------------------------

def _collect_signature(lines: list[str], start: int) -> tuple[str, int]:
    """Collect a function/method signature that may span several lines.

    Starting at ``lines[start]``, following lines are appended (stripped and
    separated by one space) until the parentheses balance, looking at most 50
    lines ahead.

    Returns:
        ``(signature_text, index_of_last_line_used)``.
    """
    sig = lines[start]
    end = start
    paren_depth = _count_parens(sig)

    limit = min(start + 50, len(lines))
    while paren_depth > 0 and end + 1 < limit:
        end += 1
        continuation = lines[end].strip()
        sig += " " + continuation
        paren_depth += _count_parens(continuation)

    return sig, end
full_sig[paren_start + 1:paren_end].strip() + params = re.sub(r"\s+", " ", params) + return _strip_export_macros(params) + + +def _extract_trailing_qualifiers(full_sig: str) -> str: + """Extract trailing qualifiers after the closing paren (const, noexcept, etc.).""" + # Find the last closing paren at depth 0 + depth = 0 + last_close = -1 + for j, ch in enumerate(full_sig): + if ch == "(": + depth += 1 + elif ch == ")": + depth -= 1 + if depth == 0: + last_close = j + + if last_close == -1: + return "" + + after = full_sig[last_close + 1:].strip() + + # Strip body (everything from { onward) + brace_idx = after.find("{") + if brace_idx != -1: + after = after[:brace_idx].strip() + + # Strip semicolons + after = after.rstrip(";").strip() + + # Also strip initializer lists (starting with :) + # but keep const, noexcept, override, final, = 0, = default, = delete + colon_idx = after.find(":") + if colon_idx != -1: + # Check if it's an initializer list (not part of =) + before_colon = after[:colon_idx].strip() + if not before_colon.endswith("="): + after = before_colon + + return after.strip() + + +def _extract_param_types(params_str: str) -> str: + """Extract just the type names from a parameter string for FQN overload disambiguation.""" + if not params_str.strip(): + return "" + + types: list[str] = [] + # Split on commas, respecting angle brackets and parens + parts = _split_params(params_str) + for part in parts: + part = part.strip() + if not part: + continue + # Remove default values + eq_idx = _find_default_eq(part) + if eq_idx != -1: + part = part[:eq_idx].strip() + # Last word (before any & or *) is the param name, rest is type + tokens = part.rsplit(None, 1) + if len(tokens) >= 2: + type_part = tokens[0].strip() + types.append(type_part) + elif tokens: + types.append(tokens[0].strip()) + + return ", ".join(types) + + +def _split_params(params: str) -> list[str]: + """Split parameter string on commas, respecting nested <>, (), [].""" + parts: list[str] = [] + 
depth = 0 + current: list[str] = [] + + for ch in params: + if ch in "<([": + depth += 1 + current.append(ch) + elif ch in ">)]": + depth -= 1 + current.append(ch) + elif ch == "," and depth == 0: + parts.append("".join(current)) + current = [] + else: + current.append(ch) + + if current: + parts.append("".join(current)) + return parts + + +def _find_default_eq(param: str) -> int: + """Find the = sign for a default value, respecting nested brackets.""" + depth = 0 + for j, ch in enumerate(param): + if ch in "<([": + depth += 1 + elif ch in ">)]": + depth -= 1 + elif ch == "=" and depth == 0: + return j + return -1 + + +# --------------------------------------------------------------------------- +# Brace / paren / angle counting +# --------------------------------------------------------------------------- + +def _count_braces(line: str) -> int: + """Count net brace depth change, skipping strings and comments.""" + depth = 0 + in_double = False + in_single = False + escape = False + i = 0 + + while i < len(line): + ch = line[i] + + if escape: + escape = False + i += 1 + continue + if ch == "\\": + escape = True + i += 1 + continue + + if in_single: + if ch == "'": + in_single = False + i += 1 + continue + if in_double: + if ch == '"': + in_double = False + i += 1 + continue + + # Check for line comment + if ch == "/" and i + 1 < len(line) and line[i + 1] == "/": + break + + if ch == "'": + in_single = True + elif ch == '"': + in_double = True + elif ch == "{": + depth += 1 + elif ch == "}": + depth -= 1 + + i += 1 + + return depth + + +def _count_parens(line: str) -> int: + """Count net parenthesis depth change.""" + depth = 0 + in_double = False + in_single = False + escape = False + + for ch in line: + if escape: + escape = False + continue + if ch == "\\": + escape = True + continue + if in_single: + if ch == "'": + in_single = False + continue + if in_double: + if ch == '"': + in_double = False + continue + if ch == "'": + in_single = True + elif ch == '"': + 
in_double = True + elif ch == "(": + depth += 1 + elif ch == ")": + depth -= 1 + + return depth + + +def _count_angles(line: str) -> int: + """Count net angle bracket depth for template<...> matching.""" + depth = 0 + in_double = False + in_single = False + + for ch in line: + if in_single: + if ch == "'": + in_single = False + continue + if in_double: + if ch == '"': + in_double = False + continue + if ch == "'": + in_single = True + elif ch == '"': + in_double = True + elif ch == "<": + depth += 1 + elif ch == ">": + depth -= 1 + + return depth + + +def _find_matching_paren(text: str, start: int) -> int: + """Find the index of the matching closing paren.""" + depth = 0 + for j in range(start, len(text)): + if text[j] == "(": + depth += 1 + elif text[j] == ")": + depth -= 1 + if depth == 0: + return j + return -1 + + +def _has_declaration_after_template(stripped: str) -> bool: + """Check if a template<...> line also contains the declaration on the same line.""" + # Balance angle brackets + depth = 0 + for j, ch in enumerate(stripped): + if ch == "<": + depth += 1 + elif ch == ">": + depth -= 1 + if depth == 0: + # Check if there's a declaration keyword after + rest = stripped[j + 1:].strip() + if rest and any(rest.startswith(kw) for kw in + ("class ", "struct ", "union ", "enum ", + "typename ", "using ", "void ", "int ", + "auto ", "const ", "static ", "virtual ", + "inline ", "explicit ", "constexpr ")): + return True + # Also check for return type + function pattern + if rest and re.match(r"[\w:*&<>]+\s+\w+\s*\(", rest): + return True + return bool(rest and not rest.startswith("//")) + return False + + +# --------------------------------------------------------------------------- +# Cleaning helpers +# --------------------------------------------------------------------------- + +def _strip_export_macros(text: str) -> str: + """Remove export/API macros from text.""" + return _EXPORT_MACRO_RE.sub("", text).strip() + + +def _clean_sig(sig: str) -> str: + """Clean 
up whitespace in a signature.""" + return re.sub(r"\s+", " ", sig).strip() + + +# --------------------------------------------------------------------------- +# Record builder +# --------------------------------------------------------------------------- + +def _build_record(**kwargs) -> dict: + """Build a standard API record dict.""" + record = { + "fqn": kwargs.get("fqn", ""), + "namespace": kwargs.get("namespace", ""), + "class_name": kwargs.get("class_name", ""), + "member_name": kwargs.get("member_name", ""), + "member_type": kwargs.get("member_type", ""), + "signature": kwargs.get("signature", ""), + "summary": kwargs.get("summary", ""), + "params_json": kwargs.get("params_json", []), + "returns_text": kwargs.get("returns_text", ""), + "file_path": kwargs.get("file_path", ""), + "line_start": kwargs.get("line_start", 0), + "line_end": kwargs.get("line_end", 0), + } + return record From cfcf2fdb67c7f5f276f0e4acb09c94d44793c1b0 Mon Sep 17 00:00:00 2001 From: Fuat Date: Mon, 2 Mar 2026 03:55:07 +0300 Subject: [PATCH 2/6] Fix C++ parser bugs found during large-codebase testing - Fix enum forward declarations (e.g. `enum Foo;`) incorrectly setting pending_enum, which consumed subsequent lines containing `{` (root cause of ImVec2 constructor capture failure) - Cancel pending_enum if next line isn't a standalone `{` brace - Fix inline method body content polluting operator signatures by truncating at body `{` before searching for trailing qualifiers - Add _find_body_brace helper for paren-aware `{` detection - Remove all debug prints Tested on SFML (4684 records), fmt (1729), imgui (4766), bullet3 (10890), opencv (34435). 
--- src/codesurface/parsers/cpp.py | 191 +++++++++++++++++++++++++++++---- 1 file changed, 171 insertions(+), 20 deletions(-) diff --git a/src/codesurface/parsers/cpp.py b/src/codesurface/parsers/cpp.py index e56e2ba..b5091d7 100644 --- a/src/codesurface/parsers/cpp.py +++ b/src/codesurface/parsers/cpp.py @@ -47,8 +47,8 @@ # Regex patterns # --------------------------------------------------------------------------- -# Export/API macros: SFML_API, IMGUI_API, MY_EXPORT, etc. -_EXPORT_MACRO_RE = re.compile(r"\b\w+_(?:API|EXPORT|DLL|SHARED)\b") +# Export/API macros: SFML_API, IMGUI_API, MY_EXPORT, CV_EXPORTS, etc. +_EXPORT_MACRO_RE = re.compile(r"\b\w+_(?:API|EXPORTS?|DLL|SHARED)\b") # Namespace: namespace foo { or namespace foo::bar { _NAMESPACE_RE = re.compile( @@ -68,7 +68,7 @@ _CLASS_RE = re.compile( r"^\s*(?:template\s*<[^>]*>\s*)?" # optional template<...> r"(class|struct|union)\s+" - r"(?:\w+_(?:API|EXPORT|DLL|SHARED)\s+)?" # optional export macro + r"(?:\w+_(?:API|EXPORTS?|DLL|SHARED)\s+)?" # optional export macro r"(\w+)" # class name r"(?:\s+final)?" # optional final r"(.*)" # rest: inheritance, {, ; @@ -77,7 +77,7 @@ # Forward declaration: class Foo; or struct Foo; _FORWARD_DECL_RE = re.compile( r"^\s*(?:class|struct|union)\s+" - r"(?:\w+_(?:API|EXPORT|DLL|SHARED)\s+)?" + r"(?:\w+_(?:API|EXPORTS?|DLL|SHARED)\s+)?" 
r"\w+\s*;" ) @@ -118,7 +118,7 @@ r"^\s*" r"((?:(?:static|virtual|inline|explicit|constexpr|consteval|" r"friend|extern|nodiscard|\[\[nodiscard\]\]|" - r"\w+_(?:API|EXPORT|DLL|SHARED))\s+)*)" # leading qualifiers + r"\w+_(?:API|EXPORTS?|DLL|SHARED))\s+)*)" # leading qualifiers r"([\w:*&<>,\s]+?)\s+" # return type r"(\w+)" # function/method name r"\s*\(" # open paren @@ -128,7 +128,7 @@ _CTOR_RE = re.compile( r"^\s*" r"((?:(?:explicit|inline|constexpr|consteval|" - r"\w+_(?:API|EXPORT|DLL|SHARED))\s+)*)" # optional qualifiers + r"\w+_(?:API|EXPORTS?|DLL|SHARED))\s+)*)" # optional qualifiers r"(\w+)" # class name (must match current) r"\s*\(" # open paren ) @@ -145,7 +145,7 @@ _OPERATOR_RE = re.compile( r"^\s*" r"((?:(?:static|virtual|inline|explicit|constexpr|friend|" - r"\w+_(?:API|EXPORT|DLL|SHARED))\s+)*)" # leading qualifiers + r"\w+_(?:API|EXPORTS?|DLL|SHARED))\s+)*)" # leading qualifiers r"([\w:*&<>,\s]*?)\s*" # return type (may be empty for conversion) r"(operator\s*(?:\(\)|" # operator() — call operator r"\[\]|" # operator[] — subscript @@ -170,6 +170,15 @@ r"\s*;" ) +# Macro-wrapped class: ATTRIBUTE_ALIGNED16(class) or MY_MACRO(struct) +# The class/struct keyword is inside a macro call, name comes on next line +_MACRO_CLASS_RE = re.compile( + r"^\s*\w+\s*\(\s*(class|struct|union)\s*\)\s*$" +) + +# Bare class name on its own line (follows a MACRO(class) line) +_BARE_NAME_RE = re.compile(r"^\s*(\w+)\s*$") + # Template prefix: template<...> (possibly multi-line) _TEMPLATE_RE = re.compile(r"^\s*template\s*<") @@ -241,6 +250,13 @@ def _parse_cpp_file(path: Path, base_dir: Path) -> list[dict]: in_enum: str = "" # enum name if inside enum body enum_class_name: str = "" # owning class for the enum, if any enum_brace_depth = -1 + pending_enum: str = "" # enum name when { is on next line + pending_enum_class: str = "" + # Deferred class push: when class decl has no { on its line + # Stored as [name, kind, inheritance, decl_line_idx, already_emitted] + pending_class: 
list | None = None + # MACRO(class) on previous line — kind stored, waiting for name on next line + pending_macro_class_kind: str = "" i = 0 while i < len(lines): @@ -296,6 +312,81 @@ def _parse_cpp_file(path: Path, base_dir: Path) -> list[dict]: brace_delta = _count_braces(line) new_depth = brace_depth + brace_delta + # --- MACRO(class) pattern (e.g. ATTRIBUTE_ALIGNED16(class)) --- + if pending_macro_class_kind: + # Expecting the class name on this line + bare_m = _BARE_NAME_RE.match(stripped) + if bare_m: + macro_name = bare_m.group(1) + if macro_name not in _CPP_KEYWORDS: + # Treat as class declaration — defer push until { + pending_class = [macro_name, pending_macro_class_kind, "", i, False] + pending_macro_class_kind = "" + brace_depth = new_depth + i += 1 + continue + pending_macro_class_kind = "" + # Fall through to normal parsing + + macro_class_m = _MACRO_CLASS_RE.match(stripped) + if macro_class_m: + pending_macro_class_kind = macro_class_m.group(1) + brace_depth = new_depth + i += 1 + continue + + # --- Deferred class push: previous class decl had no { --- + if pending_class and "{" in line: + pc_name, pc_kind, pc_inherit, pc_decl_line, pc_emitted = pending_class + default_access = "public" if pc_kind in ("struct", "union") else "private" + class_stack.append([pc_name, pc_kind, brace_depth, default_access]) + # Emit type record for the deferred class (only if not already emitted) + is_pub = _is_public(class_stack[:-1]) # check enclosing context + if not pc_emitted and is_pub and pc_name not in _CPP_KEYWORDS: + ns = _build_ns(namespace_stack) + owning_class = class_stack[-2][0] if len(class_stack) > 1 else "" + doc = _look_back_for_doc(lines, pc_decl_line) + sig = f"{pc_kind} {pc_name}" + if pc_inherit: + sig += f" : {pc_inherit}" + if pending_template: + sig = pending_template + " " + sig + fqn_parts = [p for p in [ns, owning_class, pc_name] if p] + fqn = "::".join(fqn_parts) + records.append(_build_record( + fqn=fqn, + namespace=ns, + 
class_name=owning_class or pc_name, + member_name="" if not owning_class else pc_name, + member_type="type", + signature=sig, + summary=doc.get("brief", ""), + file_path=rel_path, + line_start=pc_decl_line + 1, + line_end=pc_decl_line + 1, + )) + pending_class = None + pending_template = "" + brace_depth = new_depth + i += 1 + continue + + # --- Deferred enum push: previous enum decl had no { --- + if pending_enum: + if "{" in stripped and stripped.startswith("{"): + in_enum = pending_enum + enum_class_name = pending_enum_class + enum_brace_depth = brace_depth + pending_enum = "" + pending_enum_class = "" + brace_depth = new_depth + i += 1 + continue + else: + # Next line wasn't a standalone { — cancel deferred enum + pending_enum = "" + pending_enum_class = "" + # --- Template accumulation --- if _TEMPLATE_RE.match(line) and not _has_declaration_after_template(stripped): pending_template = stripped @@ -417,6 +508,17 @@ def _parse_cpp_file(path: Path, base_dir: Path) -> list[dict]: # Determine if we're in a public context is_public = _is_public(class_stack) + # Determine if we're at declaration level (not inside a function body) + # Class members: depth == class_depth + 1 + # Namespace-level: depth == namespace_depth + 1 (or 0 if no namespace) + # Deeper means we're inside a function body — skip declarations + if class_stack: + at_decl_level = brace_depth == class_stack[-1][2] + 1 + elif namespace_stack: + at_decl_level = brace_depth == namespace_stack[-1][1] + 1 + else: + at_decl_level = brace_depth == 0 + # --- Enum declaration --- enum_m = _ENUM_RE.match(line) if enum_m: @@ -457,6 +559,10 @@ def _parse_cpp_file(path: Path, base_dir: Path) -> list[dict]: in_enum = enum_name enum_class_name = owning_class enum_brace_depth = brace_depth + elif not enum_rest.rstrip().endswith(";"): + # Brace on next line — defer (skip forward declarations) + pending_enum = enum_name + pending_enum_class = owning_class pending_template = "" brace_depth = new_depth @@ -579,6 +685,11 
@@ def _parse_cpp_file(path: Path, base_dir: Path) -> list[dict]: if "{" in line: default_access = "public" if kind in ("struct", "union") else "private" class_stack.append([name, kind, brace_depth, default_access]) + else: + # Brace on next line — defer the push + inheritance = _extract_inheritance(rest) + # Type record was already emitted above + pending_class = [name, kind, inheritance, i, True] pending_template = "" brace_depth = new_depth @@ -587,7 +698,7 @@ def _parse_cpp_file(path: Path, base_dir: Path) -> list[dict]: # --- Destructor --- dtor_m = _DTOR_RE.match(line) - if dtor_m and class_stack: + if dtor_m and class_stack and at_decl_level: class_name_match = dtor_m.group(1) if class_name_match == class_stack[-1][0] and is_public: ns = _build_ns(namespace_stack) @@ -628,7 +739,7 @@ def _parse_cpp_file(path: Path, base_dir: Path) -> list[dict]: # --- Operator overload --- op_m = _OPERATOR_RE.match(line) - if op_m and is_public: + if op_m and is_public and at_decl_level: qualifiers = op_m.group(1).strip() ret_type = op_m.group(2).strip() op_name = op_m.group(3).strip() @@ -691,7 +802,7 @@ def _parse_cpp_file(path: Path, base_dir: Path) -> list[dict]: continue # --- Constructor --- - if class_stack: + if class_stack and at_decl_level: ctor_m = _CTOR_RE.match(line) if ctor_m: ctor_name = ctor_m.group(2) @@ -742,7 +853,7 @@ def _parse_cpp_file(path: Path, base_dir: Path) -> list[dict]: # --- Method / Free function --- func_m = _FUNC_RE.match(line) - if func_m and is_public: + if func_m and is_public and at_decl_level: qualifiers = func_m.group(1).strip() ret_type = func_m.group(2).strip() func_name = func_m.group(3) @@ -816,8 +927,10 @@ def _parse_cpp_file(path: Path, base_dir: Path) -> list[dict]: i += 1 continue - # --- Field (inside class/struct body, public only) --- - if class_stack and is_public: + # --- Field (inside class/struct body, public only, at class body level) --- + # Only match fields at class body depth (depth == class_depth + 1), + # not 
inside method bodies (depth >= class_depth + 2) + if class_stack and is_public and brace_depth == class_stack[-1][2] + 1: field_m = _FIELD_RE.match(line) if field_m: field_quals = field_m.group(1).strip() @@ -1114,10 +1227,17 @@ def _extract_params_str(full_sig: str, func_name: str) -> str: def _extract_trailing_qualifiers(full_sig: str) -> str: """Extract trailing qualifiers after the closing paren (const, noexcept, etc.).""" + # Truncate at method body start to avoid inline body content + # polluting the paren depth search (e.g., { IM_ASSERT(...) }) + sig = full_sig + body_start = _find_body_brace(sig) + if body_start != -1: + sig = sig[:body_start] + # Find the last closing paren at depth 0 depth = 0 last_close = -1 - for j, ch in enumerate(full_sig): + for j, ch in enumerate(sig): if ch == "(": depth += 1 elif ch == ")": @@ -1128,12 +1248,7 @@ def _extract_trailing_qualifiers(full_sig: str) -> str: if last_close == -1: return "" - after = full_sig[last_close + 1:].strip() - - # Strip body (everything from { onward) - brace_idx = after.find("{") - if brace_idx != -1: - after = after[:brace_idx].strip() + after = sig[last_close + 1:].strip() # Strip semicolons after = after.rstrip(";").strip() @@ -1150,6 +1265,42 @@ def _extract_trailing_qualifiers(full_sig: str) -> str: return after.strip() +def _find_body_brace(sig: str) -> int: + """Find the first '{' that's not inside parens, strings, or comments.""" + depth = 0 + in_double = False + in_single = False + escape = False + for j, ch in enumerate(sig): + if escape: + escape = False + continue + if ch == "\\": + escape = True + continue + if in_single: + if ch == "'": + in_single = False + continue + if in_double: + if ch == '"': + in_double = False + continue + if ch == "/" and j + 1 < len(sig) and sig[j + 1] == "/": + break + if ch == "'": + in_single = True + elif ch == '"': + in_double = True + elif ch == "(": + depth += 1 + elif ch == ")": + depth -= 1 + elif ch == "{" and depth == 0: + return j + return -1 + 
+ def _extract_param_types(params_str: str) -> str: """Extract just the type names from a parameter string for FQN overload disambiguation.""" if not params_str.strip(): From 058471b41f5e45c508deac3a6adb3a8f984a750e Mon Sep 17 00:00:00 2001 From: Fuat Date: Mon, 2 Mar 2026 04:09:14 +0300 Subject: [PATCH 3/6] Skip constructor initializer lists and macro variable assignments - Skip lines starting with ':' (initializer list) and ',' with '(' (continuation) to prevent false method captures - Skip function matches where ')' is followed by '=' assignment (macro-declared variables like B3_ATTRIBUTE_ALIGNED16) - Preserves = 0, = default, = delete correctly Verified: 0 false positives across 183,379 records from 9 codebases. --- src/codesurface/parsers/cpp.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/src/codesurface/parsers/cpp.py b/src/codesurface/parsers/cpp.py index b5091d7..58fbd20 100644 --- a/src/codesurface/parsers/cpp.py +++ b/src/codesurface/parsers/cpp.py @@ -299,6 +299,16 @@ def _parse_cpp_file(path: Path, base_dir: Path) -> list[dict]: i += 1 continue + # Constructor initializer list lines (: member(val) or , member(val)) + if stripped.startswith(":") and not stripped.startswith("::"): + brace_depth += _count_braces(line) + i += 1 + continue + if stripped.startswith(",") and "(" in stripped: + brace_depth += _count_braces(line) + i += 1 + continue + # Preprocessor if _PREPROCESSOR_RE.match(line): # Skip continuation lines @@ -879,6 +889,15 @@ def _parse_cpp_file(path: Path, base_dir: Path) -> list[dict]: i += 1 continue + # Skip macro-style variable declarations: TYPE MACRO(name) = value + body_pos = _find_body_brace(stripped) + after_parens = stripped[stripped.find(")") + 1:body_pos if body_pos != -1 else len(stripped)].strip() if ")" in stripped else "" + if after_parens.startswith("=") and not any(after_parens.startswith(p) for p in ("= 0", "= default", "= delete")): + pending_template = "" + brace_depth = new_depth + i += 1 + 
continue + ns = _build_ns(namespace_stack) owning_class = class_stack[-1][0] if class_stack else "" doc = _look_back_for_doc(lines, i) From 3749e451182420d361573726c56869699a09c510 Mon Sep 17 00:00:00 2001 From: Fuat Date: Mon, 2 Mar 2026 04:47:49 +0300 Subject: [PATCH 4/6] Fix PR review bugs: multi-line signatures, overload FQN, @returns stripping - Fix multi-line signature handling: move _collect_signature before conditional blocks so end_i is always set, accumulate brace depth across consumed lines, advance i to end_i+1 - Add param types to method/free function FQN for overload disambiguation - Fix @returns tag parsing to match longest tag first, preventing accidental stripping of 's' from descriptions like "success" - Fix _look_back_for_doc to require /** marker, not scan to file top - Fix _count_braces to skip /* */ inline comments - Fix template brace accumulation to use += instead of recalculating - Optimize parse_directory to walk tree once instead of per-extension - Normalize _extract_param_types separator to comma without space - Fix server.py get_class to split on both . 
and :: for C++ FQNs - Remove unused parent_class variable --- src/codesurface/parsers/cpp.py | 81 +++++++++++++++++++++------------- src/codesurface/server.py | 3 +- 2 files changed, 53 insertions(+), 31 deletions(-) diff --git a/src/codesurface/parsers/cpp.py b/src/codesurface/parsers/cpp.py index 58fbd20..5bfb527 100644 --- a/src/codesurface/parsers/cpp.py +++ b/src/codesurface/parsers/cpp.py @@ -199,21 +199,23 @@ def file_extensions(self) -> list[str]: def parse_directory(self, directory: Path) -> list[dict]: """Override to skip build/vendor/test directories.""" records: list[dict] = [] - for ext in self.file_extensions: - for f in sorted(directory.rglob(f"*{ext}")): - parts = f.relative_to(directory).parts - if any( - p in _SKIP_DIRS - or any(p.startswith(pfx) for pfx in _SKIP_DIR_PREFIXES) - for p in parts - ): - continue - try: - records.extend(self.parse_file(f, directory)) - except Exception as e: - import sys - print(f"codesurface: failed to parse {f}: {e}", file=sys.stderr) - continue + ext_set = set(self.file_extensions) + for f in sorted(directory.rglob("*")): + if f.suffix not in ext_set: + continue + parts = f.relative_to(directory).parts + if any( + p in _SKIP_DIRS + or any(p.startswith(pfx) for pfx in _SKIP_DIR_PREFIXES) + for p in parts + ): + continue + try: + records.extend(self.parse_file(f, directory)) + except Exception as e: + import sys + print(f"codesurface: failed to parse {f}: {e}", file=sys.stderr) + continue return records def parse_file(self, path: Path, base_dir: Path) -> list[dict]: @@ -407,8 +409,7 @@ def _parse_cpp_file(path: Path, base_dir: Path) -> list[dict]: next_stripped = lines[i].strip() pending_template += " " + next_stripped angle_depth += _count_angles(next_stripped) - brace_delta = _count_braces(lines[i]) - new_depth = brace_depth + brace_delta + new_depth += _count_braces(lines[i]) brace_depth = new_depth i += 1 continue @@ -435,7 +436,6 @@ def _parse_cpp_file(path: Path, base_dir: Path) -> list[dict]: if val_value: sig 
+= f" = {val_value.strip()}" - parent_class = enum_class_name or in_enum fqn_parts = [p for p in [ns, enum_class_name, in_enum, val_name] if p] fqn = "::".join(fqn_parts) @@ -710,12 +710,12 @@ def _parse_cpp_file(path: Path, base_dir: Path) -> list[dict]: dtor_m = _DTOR_RE.match(line) if dtor_m and class_stack and at_decl_level: class_name_match = dtor_m.group(1) + full_sig, end_i = _collect_signature(lines, i) if class_name_match == class_stack[-1][0] and is_public: ns = _build_ns(namespace_stack) owning_class = class_stack[-1][0] doc = _look_back_for_doc(lines, i) - full_sig, end_i = _collect_signature(lines, i) params_str = _extract_params_str(full_sig, f"~{class_name_match}") sig = _clean_sig(f"~{class_name_match}({params_str})") @@ -743,8 +743,9 @@ def _parse_cpp_file(path: Path, base_dir: Path) -> list[dict]: )) pending_template = "" + new_depth += sum(_count_braces(lines[j]) for j in range(i + 1, end_i + 1)) brace_depth = new_depth - i += 1 + i = end_i + 1 continue # --- Operator overload --- @@ -807,8 +808,9 @@ def _parse_cpp_file(path: Path, base_dir: Path) -> list[dict]: )) pending_template = "" + new_depth += sum(_count_braces(lines[j]) for j in range(i + 1, end_i + 1)) brace_depth = new_depth - i += 1 + i = end_i + 1 continue # --- Constructor --- @@ -816,13 +818,13 @@ def _parse_cpp_file(path: Path, base_dir: Path) -> list[dict]: ctor_m = _CTOR_RE.match(line) if ctor_m: ctor_name = ctor_m.group(2) + full_sig, end_i = _collect_signature(lines, i) if ctor_name == class_stack[-1][0] and is_public: qualifiers = ctor_m.group(1).strip() ns = _build_ns(namespace_stack) owning_class = class_stack[-1][0] doc = _look_back_for_doc(lines, i) - full_sig, end_i = _collect_signature(lines, i) params_str = _extract_params_str(full_sig, ctor_name) sig_parts = [] @@ -857,8 +859,9 @@ def _parse_cpp_file(path: Path, base_dir: Path) -> list[dict]: )) pending_template = "" + new_depth += sum(_count_braces(lines[j]) for j in range(i + 1, end_i + 1)) brace_depth = new_depth - 
i += 1 + i = end_i + 1 continue # --- Method / Free function --- @@ -926,6 +929,11 @@ def _parse_cpp_file(path: Path, base_dir: Path) -> list[dict]: fqn_parts = [p for p in [ns, owning_class, func_name] if p] fqn = "::".join(fqn_parts) + # Handle overloads + param_types = _extract_param_types(params_str) + if param_types: + fqn += f"({param_types})" + records.append(_build_record( fqn=fqn, namespace=ns, @@ -942,8 +950,9 @@ def _parse_cpp_file(path: Path, base_dir: Path) -> list[dict]: )) pending_template = "" + new_depth += sum(_count_braces(lines[j]) for j in range(i + 1, end_i + 1)) brace_depth = new_depth - i += 1 + i = end_i + 1 continue # --- Field (inside class/struct body, public only, at class body level) --- @@ -1105,14 +1114,16 @@ def _look_back_for_doc(lines: list[str], decl_idx: int) -> dict: # Collect block comment lines block_lines: list[str] = [] + found_marker = False while i >= 0: stripped = lines[i].strip() block_lines.append(stripped) if stripped.startswith("/**") or stripped.startswith("/*!"): + found_marker = True break i -= 1 - if not block_lines: + if not found_marker or not block_lines: return result block_lines.reverse() @@ -1170,10 +1181,11 @@ def _parse_doxygen_lines(doc_lines: list[str]) -> dict: # @return or \return or @returns or \returns if (line.startswith("@return") or line.startswith("\\return")): - tag = "@return" if line.startswith("@return") else "\\return" - rest = line[len(tag):].strip() - if rest.startswith("s"): - rest = rest[1:].strip() # handle @returns + # Match longest tag first to avoid stripping 's' from description + for tag in ("@returns", "\\returns", "@return", "\\return"): + if line.startswith(tag): + rest = line[len(tag):].strip() + break returns = rest i += 1 continue @@ -1344,7 +1356,7 @@ def _extract_param_types(params_str: str) -> str: elif tokens: types.append(tokens[0].strip()) - return ", ".join(types) + return ",".join(types) def _split_params(params: str) -> list[str]: @@ -1423,6 +1435,15 @@ def 
_count_braces(line: str) -> int: if ch == "/" and i + 1 < len(line) and line[i + 1] == "/": break + # Check for block comment + if ch == "/" and i + 1 < len(line) and line[i + 1] == "*": + end = line.find("*/", i + 2) + if end != -1: + i = end + 2 + continue + else: + break # unclosed block comment, skip rest of line + if ch == "'": in_single = True elif ch == '"': diff --git a/src/codesurface/server.py b/src/codesurface/server.py index 51a55e1..ac5ec6e 100644 --- a/src/codesurface/server.py +++ b/src/codesurface/server.py @@ -2,6 +2,7 @@ import argparse import json +import re import sys import time from pathlib import Path @@ -337,7 +338,7 @@ def get_class(class_name: str) -> str: if _conn is None: return "No codebase indexed. Start the server with --project ." - short_name = class_name.rsplit(".", 1)[-1] + short_name = re.split(r"[.:]", class_name)[-1] members = db.get_class_members(_conn, short_name) if not members: From abef90337acbdf6e257cfdf6169c4957f8501500 Mon Sep 17 00:00:00 2001 From: Fuat Date: Tue, 3 Mar 2026 20:43:02 +0300 Subject: [PATCH 5/6] Fix C++ parser bugs found during extensive cross-codebase validation Parser fixes (cpp.py): - Fix _FUNC_RE to handle pointer/ref return types with Type *name( style (Godot, bullet3, OpenCV use this convention extensively) - Add ALL_CAPS macro qualifiers to _FUNC_RE leading quals (_FORCE_INLINE_, etc.) - Fix _FIELD_RE to handle Type*Name fields with no space before name - Fix _DTOR_RE to allow export macros before ~ (IMGUI_API ~ImDrawList()) - Fix trailing qualifier check: const no longer matches constexpr/consteval, and const followed by a type name is not swallowed as a trailing qualifier - Add [[nodiscard]] attribute support in _CLASS_RE and _FORWARD_DECL_RE - Add export macro suffix support (CV_EXPORTS_W, etc.) 
across all regexes - Add MACRO(class) with inheritance pattern (_BARE_NAME_INHERIT_RE) - Add const overload FQN disambiguation (method const suffix) - Add constructor trailing qualifiers (= default, = delete, noexcept) - Add operator++/-- support in _OPERATOR_RE - Strip inline comments from trailing qualifiers to prevent leak - Reject copyright/license block comments as doc comments (max 40 lines) DB fixes (db.py): - Add namespace-aware get_class_members() with optional namespace filter - Add get_class_namespaces() for discovering class name collisions - Expand FTS5 special character escaping (commas, brackets, etc.) Server fixes (server.py): - Support namespace-qualified get_class queries (e.g., "cv::Mat") - Auto-disambiguate when multiple namespaces share a class name - Add _pick_primary_namespace() heuristic preferring non-thirdparty paths - Show disambiguation note with alternative qualified names Tested against 5 real codebases: bullet3, godot, opencv, imgui, nlohmann/json. Total records recovered: +16,097 across all codebases. --- src/codesurface/db.py | 29 +++- src/codesurface/parsers/cpp.py | 234 +++++++++++++++++++++++++++++---- src/codesurface/server.py | 58 +++++++- 3 files changed, 287 insertions(+), 34 deletions(-) diff --git a/src/codesurface/db.py b/src/codesurface/db.py index 16695f2..fbfec3a 100644 --- a/src/codesurface/db.py +++ b/src/codesurface/db.py @@ -189,13 +189,32 @@ def get_by_fqn(conn: sqlite3.Connection, fqn: str) -> dict | None: return dict(row) if row else None -def get_class_members(conn: sqlite3.Connection, class_name: str) -> list[dict]: - """Get all members of a class by class name.""" +def get_class_members(conn: sqlite3.Connection, class_name: str, + namespace: str | None = None) -> list[dict]: + """Get all members of a class by class name, optionally filtered by namespace.""" + if namespace: + rows = conn.execute( + "SELECT * FROM api_records WHERE class_name = ? AND namespace = ? 
" + "ORDER BY member_type, member_name", + (class_name, namespace), + ).fetchall() + else: + rows = conn.execute( + "SELECT * FROM api_records WHERE class_name = ? ORDER BY member_type, member_name", + (class_name,), + ).fetchall() + return [dict(row) for row in rows] + + +def get_class_namespaces(conn: sqlite3.Connection, class_name: str) -> list[str]: + """Get all distinct namespaces that contain a class with this name.""" rows = conn.execute( - "SELECT * FROM api_records WHERE class_name = ? ORDER BY member_type, member_name", + "SELECT DISTINCT namespace FROM api_records " + "WHERE class_name = ? AND member_type = 'type' " + "ORDER BY namespace", (class_name,), ).fetchall() - return [dict(row) for row in rows] + return [row["namespace"] for row in rows] def resolve_namespace(conn: sqlite3.Connection, name: str) -> list[dict]: @@ -250,7 +269,7 @@ def _escape_fts(query: str) -> str: "ICommand" → (ICommand*) OR (I Command*) """ q = query - for ch in '."-*():': + for ch in '."-*():,;{}[]!@#$%^&+|\\~`': q = q.replace(ch, " ") terms = [t for t in q.split() if t] if not terms: diff --git a/src/codesurface/parsers/cpp.py b/src/codesurface/parsers/cpp.py index 5bfb527..05d74e4 100644 --- a/src/codesurface/parsers/cpp.py +++ b/src/codesurface/parsers/cpp.py @@ -47,28 +47,48 @@ # Regex patterns # --------------------------------------------------------------------------- -# Export/API macros: SFML_API, IMGUI_API, MY_EXPORT, CV_EXPORTS, etc. -_EXPORT_MACRO_RE = re.compile(r"\b\w+_(?:API|EXPORTS?|DLL|SHARED)\b") +# Trailing qualifier keywords that can follow the closing paren of a signature +_TRAILING_QUAL_STARTS = ( + "noexcept", "override", "final", "volatile", + "->", "= 0", "= default", "= delete", "[[", "requires", "throw(", +) + +# Regex for matching "const" as a trailing qualifier of a method signature. +# Must be followed by another known qualifier, semicolon, brace, =, or end-of-line. 
+# This prevents "const Type& foo()" from being swallowed as a trailing qualifier. +_TRAILING_CONST_RE = re.compile( + r"^const\s*(?:noexcept|override|final|volatile|;|\{|=|\[|\->|&|\s*$)" +) + +# extern "C" { block — transparent scope (declarations inside are file-scope) +_EXTERN_C_RE = re.compile(r'^\s*extern\s+"C"\s*\{') +# extern "C" without brace (may be on next line, or single-decl form) +_EXTERN_C_NOBRACE_RE = re.compile(r'^\s*extern\s+"C"\s*$') + +# Export/API macros: SFML_API, IMGUI_API, MY_EXPORT, CV_EXPORTS, CV_EXPORTS_W, etc. +_EXPORT_MACRO_RE = re.compile(r"\b\w+_(?:API|EXPORTS?|DLL|SHARED)(?:_\w+)*\b") # Namespace: namespace foo { or namespace foo::bar { _NAMESPACE_RE = re.compile( - r"^\s*namespace\s+" + r"^\s*(?:inline\s+)?namespace\s+" r"(\w+(?:::\w+)*)" # namespace name (possibly nested) r"\s*\{?" ) # Anonymous namespace -_ANON_NAMESPACE_RE = re.compile(r"^\s*namespace\s*\{") +_ANON_NAMESPACE_RE = re.compile(r"^\s*(?:inline\s+)?namespace\s*\{") # Access specifier: public: / protected: / private: _ACCESS_RE = re.compile(r"^\s*(public|protected|private)\s*:") # Class/struct/union declaration # Handles: template<...> class EXPORT_API ClassName : public Base { +# Handles: struct [[nodiscard]] ClassName : public Base { _CLASS_RE = re.compile( r"^\s*(?:template\s*<[^>]*>\s*)?" # optional template<...> r"(class|struct|union)\s+" - r"(?:\w+_(?:API|EXPORTS?|DLL|SHARED)\s+)?" # optional export macro + r"(?:\[\[[^\]]*\]\]\s*)?" # optional [[attribute]] + r"(?:\w+_(?:API|EXPORTS?|DLL|SHARED)(?:_\w+)*\s+)?" # optional export macro r"(\w+)" # class name r"(?:\s+final)?" # optional final r"(.*)" # rest: inheritance, {, ; @@ -77,7 +97,8 @@ # Forward declaration: class Foo; or struct Foo; _FORWARD_DECL_RE = re.compile( r"^\s*(?:class|struct|union)\s+" - r"(?:\w+_(?:API|EXPORTS?|DLL|SHARED)\s+)?" + r"(?:\[\[[^\]]*\]\]\s*)?" # optional [[attribute]] + r"(?:\w+_(?:API|EXPORTS?|DLL|SHARED)(?:_\w+)*\s+)?" 
r"\w+\s*;" ) @@ -88,7 +109,7 @@ _ENUM_RE = re.compile( r"^\s*(enum\s+class|enum\s+struct|enum)\s+" r"(\w+)" # enum name - r"(?:\s*:\s*(\w+))?" # optional underlying type + r"(?:\s*:\s*([\w:]+(?:\s+\w+)*))?" # optional underlying type r"(.*)" # rest ) @@ -114,29 +135,41 @@ # Method/function declaration (very broad, refined in code) # Captures: optional qualifiers, return type, name, params +# Return type is greedy and must end at a ptr/ref char or whitespace boundary, +# which correctly handles all C++ pointer styles: +# Type name( — ends at space +# Type *name( — ends at * (Godot/Linux style) +# Type* name( — ends at space after * +# Type * name( — ends at space after * _FUNC_RE = re.compile( r"^\s*" r"((?:(?:static|virtual|inline|explicit|constexpr|consteval|" r"friend|extern|nodiscard|\[\[nodiscard\]\]|" - r"\w+_(?:API|EXPORTS?|DLL|SHARED))\s+)*)" # leading qualifiers - r"([\w:*&<>,\s]+?)\s+" # return type + r"\w+_(?:API|EXPORTS?|DLL|SHARED)(?:_\w+)*|" + r"[A-Z_][A-Z_0-9]+)\s+)*)" # leading qualifiers (incl. ALL_CAPS macros) + r"([\w:*&<>,\s]+(?:[*&]\s*|\s))" # return type (greedy, ends at ptr/ref or space) r"(\w+)" # function/method name r"\s*\(" # open paren ) # Constructor: ClassName(params) +# Accepts C++ keywords + ALL_CAPS macro qualifiers (SIMD_FORCE_INLINE, _FORCE_INLINE_, etc.) +# Safe to be permissive: name must match current class (checked in code at line ~896) _CTOR_RE = re.compile( r"^\s*" r"((?:(?:explicit|inline|constexpr|consteval|" - r"\w+_(?:API|EXPORTS?|DLL|SHARED))\s+)*)" # optional qualifiers + r"\w+_(?:API|EXPORTS?|DLL|SHARED)(?:_\w+)*|" # export macros (CV_EXPORTS_W, etc.) + r"[A-Z_][A-Z_0-9]+)\s+)*)" # ALL_CAPS macros (SIMD_FORCE_INLINE, etc.) r"(\w+)" # class name (must match current) r"\s*\(" # open paren ) -# Destructor: ~ClassName() or virtual ~ClassName() +# Destructor: ~ClassName(), virtual ~ClassName(), EXPORT_API ~ClassName() _DTOR_RE = re.compile( r"^\s*" - r"(?:virtual\s+)?" 
+ r"(?:(?:virtual|inline|" + r"\w+_(?:API|EXPORTS?|DLL|SHARED)(?:_\w+)*|" + r"[A-Z_][A-Z_0-9]+)\s+)*" # optional qualifiers r"~(\w+)" # class name r"\s*\(" ) @@ -145,12 +178,13 @@ _OPERATOR_RE = re.compile( r"^\s*" r"((?:(?:static|virtual|inline|explicit|constexpr|friend|" - r"\w+_(?:API|EXPORTS?|DLL|SHARED))\s+)*)" # leading qualifiers + r"\w+_(?:API|EXPORTS?|DLL|SHARED)(?:_\w+)*)\s+)*)" # leading qualifiers r"([\w:*&<>,\s]*?)\s*" # return type (may be empty for conversion) r"(operator\s*(?:\(\)|" # operator() — call operator r"\[\]|" # operator[] — subscript r"->|" # operator-> — member access r"<<|>>|" # shift operators + r"\+\+|--|" # increment/decrement operators r"[+\-*/%^&|~!=<>]=?|" # arithmetic/comparison ops r"&&|\|\||" # logical ops r",|" # comma operator @@ -161,10 +195,11 @@ # Field declaration: type name; or type name = value; # Only matched inside class/struct bodies when access is public +# Type uses greedy match ending at ptr/ref or space (handles Type*Name style) _FIELD_RE = re.compile( r"^\s*" r"((?:(?:static|const|constexpr|inline|mutable|volatile)\s+)*)" # qualifiers - r"([\w:*&<>,\s]+?)\s+" # type + r"([\w:*&<>,\s]+(?:[*&]\s*|\s))" # type (greedy, ends at ptr/ref or space) r"(\w+)" # field name r"(?:\s*(?:=\s*[^;]+|{[^}]*}|\[[^\]]*\]))?" # optional init r"\s*;" @@ -179,6 +214,14 @@ # Bare class name on its own line (follows a MACRO(class) line) _BARE_NAME_RE = re.compile(r"^\s*(\w+)\s*$") +# Class name with inheritance on same line (follows a MACRO(class) line) +# e.g., "btTypedConstraint : public btTypedObject" +_BARE_NAME_INHERIT_RE = re.compile( + r"^\s*(\w+)" # class name + r"(?:\s+final)?" # optional final + r"\s*:\s*(.+)" # : inheritance... 
+) + # Template prefix: template<...> (possibly multi-line) _TEMPLATE_RE = re.compile(r"^\s*template\s*<") @@ -259,6 +302,10 @@ def _parse_cpp_file(path: Path, base_dir: Path) -> list[dict]: pending_class: list | None = None # MACRO(class) on previous line — kind stored, waiting for name on next line pending_macro_class_kind: str = "" + # extern "C" { transparent scopes — track brace depths so declarations + # inside are treated as file-scope rather than nested + extern_c_depths: list[int] = [] + pending_extern_c: bool = False i = 0 while i < len(lines): @@ -302,14 +349,16 @@ def _parse_cpp_file(path: Path, base_dir: Path) -> list[dict]: continue # Constructor initializer list lines (: member(val) or , member(val)) - if stripped.startswith(":") and not stripped.startswith("::"): - brace_depth += _count_braces(line) - i += 1 - continue - if stripped.startswith(",") and "(" in stripped: - brace_depth += _count_braces(line) - i += 1 - continue + # Only skip when inside a class body (not at namespace/file scope) + if class_stack and brace_depth > class_stack[-1][2]: + if stripped.startswith(":") and not stripped.startswith("::"): + brace_depth += _count_braces(line) + i += 1 + continue + if stripped.startswith(",") and "(" in stripped: + brace_depth += _count_braces(line) + i += 1 + continue # Preprocessor if _PREPROCESSOR_RE.match(line): @@ -320,23 +369,63 @@ def _parse_cpp_file(path: Path, base_dir: Path) -> list[dict]: i += 1 continue + # --- extern "C" { transparent scope --- + if _EXTERN_C_RE.match(stripped): + # extern "C" { on this line — record brace depth, consume the { + extern_c_depths.append(brace_depth) + brace_depth += _count_braces(line) + i += 1 + continue + if _EXTERN_C_NOBRACE_RE.match(stripped): + # extern "C" without { — brace may be on next line + pending_extern_c = True + i += 1 + continue + if pending_extern_c: + pending_extern_c = False + if stripped.startswith("{"): + extern_c_depths.append(brace_depth) + brace_depth += _count_braces(line) + i 
+= 1 + continue + # No brace — single-decl form (extern "C" void foo();) + # Fall through to normal parsing + # Count braces brace_delta = _count_braces(line) new_depth = brace_depth + brace_delta + # Close extern "C" scopes when brace depth drops + while extern_c_depths and new_depth <= extern_c_depths[-1]: + extern_c_depths.pop() + # --- MACRO(class) pattern (e.g. ATTRIBUTE_ALIGNED16(class)) --- if pending_macro_class_kind: - # Expecting the class name on this line + # Expecting the class name on this line (bare or with inheritance) bare_m = _BARE_NAME_RE.match(stripped) if bare_m: macro_name = bare_m.group(1) if macro_name not in _CPP_KEYWORDS: - # Treat as class declaration — defer push until { pending_class = [macro_name, pending_macro_class_kind, "", i, False] pending_macro_class_kind = "" brace_depth = new_depth i += 1 continue + # Also try name with inheritance: "Name : public Base" + inherit_m = _BARE_NAME_INHERIT_RE.match(stripped) + if inherit_m: + macro_name = inherit_m.group(1) + macro_inherit = inherit_m.group(2).strip() + # Strip opening brace from inheritance if present + brace_pos = macro_inherit.find("{") + if brace_pos != -1: + macro_inherit = macro_inherit[:brace_pos].strip() + if macro_name not in _CPP_KEYWORDS: + pending_class = [macro_name, pending_macro_class_kind, macro_inherit, i, False] + pending_macro_class_kind = "" + brace_depth = new_depth + i += 1 + continue pending_macro_class_kind = "" # Fall through to normal parsing @@ -527,7 +616,7 @@ def _parse_cpp_file(path: Path, base_dir: Path) -> list[dict]: elif namespace_stack: at_decl_level = brace_depth == namespace_stack[-1][1] + 1 else: - at_decl_level = brace_depth == 0 + at_decl_level = brace_depth == len(extern_c_depths) # --- Enum declaration --- enum_m = _ENUM_RE.match(line) @@ -565,7 +654,38 @@ def _parse_cpp_file(path: Path, base_dir: Path) -> list[dict]: )) # Track enum body for value extraction - if "{" in line: + if "{" in line and "}" in line: + # Single-line enum: enum Foo 
{ A, B, C }; + brace_open = line.index("{") + brace_close = line.index("}") + body = line[brace_open + 1:brace_close].strip() + if body: + for val_part in body.split(","): + val_part = val_part.strip() + if not val_part: + continue + val_m2 = _ENUM_VALUE_RE.match(val_part) + if val_m2: + val_name = val_m2.group(1) + if val_name not in _CPP_KEYWORDS: + val_value = val_m2.group(2) + sig2 = val_name + if val_value: + sig2 += f" = {val_value.strip()}" + fqn_parts2 = [p for p in [ns, owning_class, enum_name, val_name] if p] + fqn2 = "::".join(fqn_parts2) + records.append(_build_record( + fqn=fqn2, + namespace=ns, + class_name=enum_name, + member_name=val_name, + member_type="field", + signature=sig2, + file_path=rel_path, + line_start=i + 1, + line_end=i + 1, + )) + elif "{" in line: in_enum = enum_name enum_class_name = owning_class enum_brace_depth = brace_depth @@ -791,6 +911,9 @@ def _parse_cpp_file(path: Path, base_dir: Path) -> list[dict]: param_types = _extract_param_types(params_str) if param_types: fqn += f"({param_types})" + # Distinguish const vs non-const overloads + if _quals_have_const(quals): + fqn += " const" records.append(_build_record( fqn=fqn, @@ -833,6 +956,11 @@ def _parse_cpp_file(path: Path, base_dir: Path) -> list[dict]: sig_parts.append(f"{ctor_name}({params_str})") sig = _clean_sig(" ".join(sig_parts)) + # Add trailing qualifiers (noexcept, = default, = delete, etc.) 
+ quals = _extract_trailing_qualifiers(full_sig) + if quals: + sig += " " + quals + if pending_template: sig = pending_template + " " + sig @@ -933,6 +1061,9 @@ def _parse_cpp_file(path: Path, base_dir: Path) -> list[dict]: param_types = _extract_param_types(params_str) if param_types: fqn += f"({param_types})" + # Distinguish const vs non-const overloads + if _quals_have_const(quals): + fqn += " const" records.append(_build_record( fqn=fqn, @@ -1128,6 +1259,20 @@ def _look_back_for_doc(lines: list[str], decl_idx: int) -> dict: block_lines.reverse() + # Reject copyright/license block comments (not doc comments) + raw_text = " ".join(block_lines) + if any(kw in raw_text for kw in ( + "Copyright", "copyright", "LICENSE", "License", "license", + "SPDX-License", "Permission is hereby granted", + "All rights reserved", "WARRANTY", + "#pragma", "#include", "#ifndef", + )): + return result + + # Reject very large block comments (likely file-level headers, not doc comments) + if len(block_lines) > 40: + return result + # Clean up block comment markers cleaned: list[str] = [] for bline in block_lines: @@ -1232,6 +1377,24 @@ def _collect_signature(lines: list[str], start: int) -> tuple[str, int]: sig += " " + next_line paren_depth += _count_parens(next_line) + # After parens balance, collect trailing qualifiers on subsequent lines + # (const, noexcept, override, final, ->, = 0, = default, = delete, etc.) 
+ while i + 1 < limit: + next_line = lines[i + 1].strip() + if not next_line or next_line.startswith("//"): + break + if next_line.startswith("{") or next_line.startswith(";"): + break + if any(next_line.startswith(q) for q in _TRAILING_QUAL_STARTS) or \ + _TRAILING_CONST_RE.match(next_line): + i += 1 + sig += " " + next_line + # Stop if this line ends the declaration + if ";" in next_line or "{" in next_line: + break + else: + break + return sig, i @@ -1256,6 +1419,14 @@ def _extract_params_str(full_sig: str, func_name: str) -> str: return _strip_export_macros(params) +def _quals_have_const(quals: str) -> bool: + """Check if trailing qualifiers contain a top-level 'const' (method constness).""" + if not quals: + return False + # Match 'const' as a whole word, not inside noexcept(...) or other tokens + return bool(re.search(r"\bconst\b", quals)) + + def _extract_trailing_qualifiers(full_sig: str) -> str: """Extract trailing qualifiers after the closing paren (const, noexcept, etc.).""" # Truncate at method body start to avoid inline body content @@ -1281,6 +1452,11 @@ def _extract_trailing_qualifiers(full_sig: str) -> str: after = sig[last_close + 1:].strip() + # Strip inline comments (// ...) + comment_idx = after.find("//") + if comment_idx != -1: + after = after[:comment_idx].strip() + # Strip semicolons after = after.rstrip(";").strip() @@ -1293,6 +1469,12 @@ def _extract_trailing_qualifiers(full_sig: str) -> str: if not before_colon.endswith("="): after = before_colon + # Validate: if remaining text doesn't start with a known trailing qualifier, + # it's likely leaked comment text (e.g., ". 
Dead-zones should be handled...") + if after and not any(after.startswith(q) for q in _TRAILING_QUAL_STARTS) \ + and not _TRAILING_CONST_RE.match(after): + after = "" + return after.strip() diff --git a/src/codesurface/server.py b/src/codesurface/server.py index ac5ec6e..89dd355 100644 --- a/src/codesurface/server.py +++ b/src/codesurface/server.py @@ -173,6 +173,32 @@ def _auto_reindex() -> bool: return changed +def _pick_primary_namespace(namespaces: list[str], members: list[dict]) -> str | None: + """Pick the most likely primary namespace when a class name is ambiguous. + + Heuristic: prefer the namespace whose members have file_paths NOT in thirdparty/ + directories. If still ambiguous, pick the namespace with the most members. + Returns None if no clear winner (caller should use all members). + """ + if not namespaces: + return None + + # Count members per namespace, preferring non-thirdparty paths + ns_scores: dict[str, tuple[int, int]] = {} # ns -> (non_thirdparty_count, total_count) + for m in members: + ns = m.get("namespace", "") + fp = m.get("file_path", "") + is_thirdparty = any(seg in fp.lower() for seg in ("thirdparty/", "third_party/", "3rdparty/", "vendor/", "extern/")) + prev = ns_scores.get(ns, (0, 0)) + ns_scores[ns] = (prev[0] + (0 if is_thirdparty else 1), prev[1] + 1) + + # Sort: prefer most non-thirdparty members, then most total members + ranked = sorted(ns_scores.items(), key=lambda x: (x[1][0], x[1][1]), reverse=True) + if ranked: + return ranked[0][0] + return None + + def _format_file_location(r: dict) -> str: """Format file path with optional line range from a record.""" fp = r.get("file_path", "") @@ -338,12 +364,20 @@ def get_class(class_name: str) -> str: if _conn is None: return "No codebase indexed. Start the server with --project ." 
- short_name = re.split(r"[.:]", class_name)[-1] - members = db.get_class_members(_conn, short_name) + # Support namespace-qualified queries: "cv::Mat", "embree::Object" + ns_filter = None + if "::" in class_name: + parts_split = class_name.rsplit("::", 1) + ns_filter = parts_split[0] + short_name = parts_split[1] + else: + short_name = re.split(r"[.:]", class_name)[-1] + + members = db.get_class_members(_conn, short_name, namespace=ns_filter) if not members: if _auto_reindex(): - members = db.get_class_members(_conn, short_name) + members = db.get_class_members(_conn, short_name, namespace=ns_filter) if not members: results = db.search(_conn, class_name, n=5, member_type="type") if results: @@ -353,6 +387,16 @@ def get_class(class_name: str) -> str: return "\n".join(parts) return f"No class '{class_name}' found." + # If no namespace filter was given, check for ambiguity + if not ns_filter: + namespaces = db.get_class_namespaces(_conn, short_name) + if len(namespaces) > 1: + # Pick the most likely namespace: prefer non-empty, non-thirdparty + # Show disambiguation notice + ns_filter = _pick_primary_namespace(namespaces, members) + if ns_filter is not None: + members = db.get_class_members(_conn, short_name, namespace=ns_filter) + _index_fresh = False type_record = next((m for m in members if m["member_type"] == "type"), None) ns = type_record["namespace"] if type_record else members[0].get("namespace", "") @@ -370,6 +414,14 @@ def get_class(class_name: str) -> str: fp = type_record.get("file_path", "") if fp: parts.append(f"File: {_format_file_location(type_record)}") + + # Show disambiguation note if same class name exists in other namespaces + if not ns_filter: + all_ns = db.get_class_namespaces(_conn, short_name) + other_ns = [n for n in all_ns if n != ns] + if other_ns: + also = ", ".join(f'"{n}::{short_name}"' if n else f'"::{short_name}"' for n in other_ns[:5]) + parts.append(f"Note: also found in other namespaces. 
Use qualified name to disambiguate: {also}") parts.append("") groups: dict[str, list[dict]] = {} From 3dece06ee6b88d874ac18ca4169cb5fd3e34ed45 Mon Sep 17 00:00:00 2001 From: Fuat Date: Tue, 3 Mar 2026 21:52:51 +0300 Subject: [PATCH 6/6] Fix global namespace filter and param type pointer extraction - db.py: Change `if namespace:` to `if namespace is not None:` so filtering by global namespace (empty string) works correctly - cpp.py: Move leading *& from param name token back to type in _extract_param_types, so `int *p` produces type `int*` not `int` --- src/codesurface/db.py | 2 +- src/codesurface/parsers/cpp.py | 8 +++++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/src/codesurface/db.py b/src/codesurface/db.py index fbfec3a..fd46c01 100644 --- a/src/codesurface/db.py +++ b/src/codesurface/db.py @@ -192,7 +192,7 @@ def get_by_fqn(conn: sqlite3.Connection, fqn: str) -> dict | None: def get_class_members(conn: sqlite3.Connection, class_name: str, namespace: str | None = None) -> list[dict]: """Get all members of a class by class name, optionally filtered by namespace.""" - if namespace: + if namespace is not None: rows = conn.execute( "SELECT * FROM api_records WHERE class_name = ? AND namespace = ? " "ORDER BY member_type, member_name", diff --git a/src/codesurface/parsers/cpp.py b/src/codesurface/parsers/cpp.py index 05d74e4..534cc83 100644 --- a/src/codesurface/parsers/cpp.py +++ b/src/codesurface/parsers/cpp.py @@ -1530,10 +1530,16 @@ def _extract_param_types(params_str: str) -> str: eq_idx = _find_default_eq(part) if eq_idx != -1: part = part[:eq_idx].strip() - # Last word (before any & or *) is the param name, rest is type + # Last word is the param name, rest is type. 
+ # Handle pointer/ref attached to name: "int *p" → type "int*" tokens = part.rsplit(None, 1) if len(tokens) >= 2: type_part = tokens[0].strip() + name_part = tokens[1] + # Move leading *& from name back to type + while name_part and name_part[0] in "*&": + type_part += name_part[0] + name_part = name_part[1:] types.append(type_part) elif tokens: types.append(tokens[0].strip())