def build_position_map(source_path):
    """Map ``(name, line_number)`` to the 1-indexed columns of that name.

    HeaderGen's server doesn't emit col_offset, but for any (name, line) it
    gives us, the column is determined by the source. Every occurrence is
    kept so the enrichment step can detect and skip ambiguous cases.

    Indexed occurrences are ``ast.Name`` nodes, function arguments
    (``ast.arg``) and the names introduced by ``def`` / ``async def`` /
    ``class`` statements.

    Args:
        source_path: Path of the Python source file to index.

    Returns:
        A ``defaultdict(list)`` keyed by ``(name, line_number)`` whose values
        are 1-indexed *character* columns. The map is empty when the file
        cannot be read or parsed; this is deliberate best-effort behaviour so
        one bad file only loses its enrichment.
    """
    # Function-scope imports keep this block independent of the module's
    # import list.
    import re
    import tokenize

    positions = defaultdict(list)
    try:
        # tokenize.open honours the BOM / PEP 263 coding cookie instead of
        # falling back to the platform's default encoding.
        with tokenize.open(source_path) as source_file:
            source = source_file.read()
        tree = ast.parse(source)
    except (OSError, SyntaxError, ValueError, RecursionError):
        return positions

    # tokenize.open uses universal newlines, so "\n" is the only separator.
    lines = source.split("\n")
    definition_keyword = re.compile(r"(?:async\s+)?(?:def|class)\s+")
    definition_nodes = (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef)

    for node in ast.walk(tree):
        if isinstance(node, ast.Name):
            name, is_definition = node.id, False
        elif isinstance(node, ast.arg):
            name, is_definition = node.arg, False
        elif isinstance(node, definition_nodes):
            name, is_definition = node.name, True
        else:
            continue

        text = lines[node.lineno - 1] if 0 < node.lineno <= len(lines) else ""

        # ast reports col_offset as a UTF-8 *byte* offset; convert it to a
        # character offset so non-ASCII text earlier on the line does not
        # shift the reported column.
        column = node.col_offset
        if not text.isascii():
            prefix = text.encode("utf-8")[:column]
            column = len(prefix.decode("utf-8", "ignore"))

        if is_definition:
            # The node starts at the keyword; the name starts after it.
            # Measure the real keyword + whitespace run rather than assuming
            # exactly one space.
            match = definition_keyword.match(text, column)
            if match:
                column = match.end()
            elif isinstance(node, ast.AsyncFunctionDef):
                column += len("async def ")
            elif isinstance(node, ast.FunctionDef):
                column += len("def ")
            else:
                column += len("class ")

        positions[(name, node.lineno)].append(column + 1)
    return positions


def _lookup_name(entry):
    """Return the source-level name to look up for this entry's position.

    Args:
        entry: A result dict carrying one of the keys ``"variable"``,
            ``"parameter"`` or ``"function"`` (checked in that order).

    Returns:
        The identifier whose column should be looked up, or ``None`` when the
        entry has none of the recognised keys.
    """
    if "variable" in entry:
        # Subscript/attribute accesses like 'h[0]' or 'self.child' are
        # reported as the full expression; the col_offset GT expects is
        # where the base name begins. Splitting on *both* separators cuts at
        # whichever comes first, so 'a.b[0]' resolves to 'a', not 'a.b'.
        name = entry["variable"]
        for sep in ("[", "."):
            name = name.split(sep, 1)[0]
        return name
    if "parameter" in entry:
        return entry["parameter"]
    if "function" in entry:
        # Nested functions are reported as 'outer.inner'; the position
        # we want is the inner name's own column.
        return entry["function"].rsplit(".", 1)[-1]
    return None
def get_function_name(self, jedi_obj):
    """Return the qualified name of jedi_obj relative to its module.

    Walks up the parent scopes so nested definitions become 'outer.inner'.
    The walk stops at the first scope whose ``type`` is ``"module"``, when
    ``parent()`` returns ``None``, or when ``parent()`` raises (the partial
    path collected so far is kept in that case).

    Args:
        jedi_obj: Object exposing ``name``, ``type`` and a callable
            ``parent()``.

    Returns:
        ``"lambda"`` when ``jedi_obj.name`` is the empty string; otherwise
        the dot-joined chain of scope names from outermost to innermost, or
        ``jedi_obj.name`` when no scope was collected or the walk failed.
    """
    try:
        # NOTE(review): jedi usually names lambdas '<lambda>'; confirm that
        # comparing against the empty string is what is intended here.
        if jedi_obj.name == "":
            return "lambda"
        parts = []
        current = jedi_obj
        while current is not None and current.type != "module":
            parts.append("lambda" if current.name == "" else current.name)
            try:
                current = current.parent()
            except Exception:
                # Parent lookup is best effort; keep what was collected.
                break
        return ".".join(reversed(parts)) if parts else jedi_obj.name
    except Exception as exc:
        # The old message blamed `full_name`, which is no longer read here;
        # report the actual failure instead.
        print(f"could not build qualified name for jedi_obj: {exc!r}")
        return jedi_obj.name
pos["line"], - } - if inferred.name != "": - _info["function"] = self.get_function_name(inferred) - _info["type"] = _type if _type else {"any"} - - variable_name = var.split(":")[0].strip() - if variable_name != self.get_function_name(inferred): - _info["variable"] = variable_name - if _type: + # Distinguish between the function's own definition + # site (return-type is what's wanted, e.g. for `def + # func1():`) and a reference to it (callable is + # what's wanted, e.g. for `a = func1`). + at_def_site = ( + pos["line"] == inferred.line + and pos["column"] == inferred.column + ) + + if at_def_site: + _type = self.find_types_by_execute(inferred) + + _info = { + "file": node.name, + "line_number": pos["line"], + "col_offset": pos["column"] + 1, + } + if inferred.name != "": + _info["function"] = self.get_function_name(inferred) + _info["type"] = _type if _type else {"any"} + + variable_name = var.split(":")[0].strip() + if variable_name != self.get_function_name(inferred): + _info["variable"] = variable_name + if _type: + output_inferred.append(_info) + else: + variable_name = var.split(":")[0].strip() + _info = { + "file": node.name, + "line_number": pos["line"], + "col_offset": pos["column"] + 1, + "variable": variable_name, + "type": {"callable"}, + } + parent = pos["jedi_obj"].parent() + if parent and parent.type != "module": + parent_func = self.get_function_name(parent) + if parent_func: + _info["function"] = parent_func output_inferred.append(_info) elif inferred.type == "instance": @@ -187,17 +216,15 @@ def infer_types(self): _info = { "file": node.name, "line_number": pos["line"], + "col_offset": pos["column"] + 1, "variable": var.split(":")[0], "type": {_type}, } - if ( - not pos["jedi_obj"].parent().name - == pos["jedi_obj"].parent().module_name - ): - if self.get_function_name(pos["jedi_obj"].parent()): - _info["function"] = self.get_function_name( - pos["jedi_obj"].parent() - ) + parent = pos["jedi_obj"].parent() + if parent and parent.type != "module": 
def build_position_map(source_path):
    """Map ``(name, line_number)`` to the 1-indexed columns of that name.

    Scalpel's runner doesn't emit col_offset, but for any (name, line) it
    gives us, the column is determined by the source. Every occurrence is
    kept so the enrichment step can detect and skip ambiguous cases.

    Indexed occurrences are ``ast.Name`` nodes, function arguments
    (``ast.arg``) and the names introduced by ``def`` / ``async def`` /
    ``class`` statements.

    Args:
        source_path: Path of the Python source file to index.

    Returns:
        A ``defaultdict(list)`` keyed by ``(name, line_number)`` whose values
        are 1-indexed *character* columns. The map is empty when the file
        cannot be read or parsed; this is deliberate best-effort behaviour so
        one bad file only loses its enrichment.
    """
    # Function-scope imports keep this block independent of the module's
    # import list.
    import re
    import tokenize

    positions = defaultdict(list)
    try:
        # tokenize.open honours the BOM / PEP 263 coding cookie instead of
        # falling back to the platform's default encoding.
        with tokenize.open(source_path) as source_file:
            source = source_file.read()
        tree = ast.parse(source)
    except (OSError, SyntaxError, ValueError, RecursionError):
        return positions

    # tokenize.open uses universal newlines, so "\n" is the only separator.
    lines = source.split("\n")
    definition_keyword = re.compile(r"(?:async\s+)?(?:def|class)\s+")
    definition_nodes = (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef)

    for node in ast.walk(tree):
        if isinstance(node, ast.Name):
            name, is_definition = node.id, False
        elif isinstance(node, ast.arg):
            name, is_definition = node.arg, False
        elif isinstance(node, definition_nodes):
            name, is_definition = node.name, True
        else:
            continue

        text = lines[node.lineno - 1] if 0 < node.lineno <= len(lines) else ""

        # ast reports col_offset as a UTF-8 *byte* offset; convert it to a
        # character offset so non-ASCII text earlier on the line does not
        # shift the reported column.
        column = node.col_offset
        if not text.isascii():
            prefix = text.encode("utf-8")[:column]
            column = len(prefix.decode("utf-8", "ignore"))

        if is_definition:
            # The node starts at the keyword; the name starts after it.
            # Measure the real keyword + whitespace run rather than assuming
            # exactly one space.
            match = definition_keyword.match(text, column)
            if match:
                column = match.end()
            elif isinstance(node, ast.AsyncFunctionDef):
                column += len("async def ")
            elif isinstance(node, ast.FunctionDef):
                column += len("def ")
            else:
                column += len("class ")

        positions[(name, node.lineno)].append(column + 1)
    return positions


def _lookup_name(entry):
    """Return the source-level name to look up for this entry's position.

    Args:
        entry: A result dict carrying one of the keys ``"variable"``,
            ``"parameter"`` or ``"function"`` (checked in that order).

    Returns:
        The identifier whose column should be looked up, or ``None`` when the
        entry has none of the recognised keys.
    """
    if "variable" in entry:
        # Expressions such as 'h[0]' or 'self.child' are located by their
        # base name. Splitting on *both* separators cuts at whichever comes
        # first, so 'a.b[0]' resolves to 'a', not 'a.b'.
        name = entry["variable"]
        for sep in ("[", "."):
            name = name.split(sep, 1)[0]
        return name
    if "parameter" in entry:
        return entry["parameter"]
    if "function" in entry:
        # Dotted function names are located by their last component.
        return entry["function"].rsplit(".", 1)[-1]
    return None


def enrich_with_col_offsets(source_path, entries):
    """Augment entries with col_offset looked up from the source file.

    For each entry that lacks ``"col_offset"``, the entry's identifying name
    is looked up on the entry's ``"line_number"``. The column is written only
    when exactly one distinct candidate exists, so a position is never
    guessed.

    Args:
        source_path: Path of the Python source file the entries refer to.
        entries: List of result dicts; modified in place. ``None`` or an
            empty list is returned unchanged without reading the source.

    Returns:
        The same ``entries`` object that was passed in.
    """
    if not entries:
        return entries

    positions = build_position_map(source_path)
    for entry in entries:
        if "col_offset" in entry:
            continue
        name = _lookup_name(entry)
        line_number = entry.get("line_number")
        # Entries without a usable name or line cannot be located; leave
        # them untouched instead of raising KeyError.
        if name is None or line_number is None:
            continue
        candidates = set(positions.get((name, line_number), ()))
        if len(candidates) == 1:
            entry["col_offset"] = candidates.pop()
    return entries