Source Code Tracing: utils.py

  1from collections.abc import ByteString, Callable
  2import configparser
  3import logging
  4from pathlib import Path
  5from typing import TypedDict
  6from urllib.request import pathname2url
  7
  8from giturlparse import parse  # type: ignore[import-untyped]
  9from tree_sitter import Language, Parser, Point, Query, QueryCursor
 10from tree_sitter import Node as TreeSitterNode
 11
 12from sphinx_codelinks.config import UNIX_NEWLINE, CommentCategory
 13from sphinx_codelinks.source_discover.config import CommentType
 14
 15# Language-specific node types for scope detection
 16SCOPE_NODE_TYPES = {
[docs] 17    # @Python Scope Node Types, IMPL_PY_2, impl, [FE_PY]
 18    CommentType.python: {"function_definition", "class_definition"},
[docs] 19    # @C and C++ Scope Node Types, IMPL_C_2, impl, [FE_C_SUPPORT, FE_CPP]
 20    CommentType.cpp: {"function_definition", "class_definition"},
 21    CommentType.cs: {"method_declaration", "class_declaration", "property_declaration"},
 22    CommentType.yaml: {"block_mapping_pair", "block_sequence_item", "document"},
 23    # @Rust Scope Node Types, IMPL_RUST_2, impl, [FE_RUST];
 24    CommentType.rust: {
 25        "function_item",
 26        "struct_item",
 27        "enum_item",
 28        "impl_item",
 29        "trait_item",
 30        "mod_item",
 31    },
 32}
 33
 34# initialize logger
 35logger = logging.getLogger(__name__)
 36logger.setLevel(logging.INFO)
 37# log to the console
 38console = logging.StreamHandler()
 39console.setLevel(logging.INFO)
 40logger.addHandler(console)
 41
 42GIT_HOST_URL_TEMPLATE = {
 43    "github": "https://github.com/{owner}/{repo}/blob/{rev}/{path}#L{lineno}",
 44    "gitlab": "https://gitlab.com/{owner}/{repo}/-/blob/{rev}/{path}#L{lineno}",
 45}
 46
 47PYTHON_QUERY = """
 48                ; Match comments
 49                (comment) @comment
 50
 51                ; Match docstrings inside modules, functions, or classes
 52                (module (expression_statement (string)) @comment)
 53                (function_definition (block (expression_statement (string)) @comment))
 54                (class_definition (block (expression_statement (string)) @comment))
 55            """
 56CPP_QUERY = """(comment) @comment"""
 57C_SHARP_QUERY = """(comment) @comment"""
 58YAML_QUERY = """(comment) @comment"""
 59RUST_QUERY = """
 60    (line_comment) @comment
 61    (block_comment) @comment
 62"""
 63
 64
 65def is_text_file(filepath: Path, sample_size: int = 2048) -> bool:
 66    """Return True if file is likely text, False if binary."""
 67    try:
 68        with filepath.open("rb") as f:
 69            chunk = f.read(sample_size)
 70        # Quick binary heuristic: null byte present
 71        if b"\x00" in chunk:
 72            return False
 73        # Try UTF-8 decode on the sample
 74        chunk.decode("utf-8")
 75        return True
 76    except UnicodeDecodeError:
 77        return False
 78
 79
# @Tree-sitter parser initialization for multiple languages, IMPL_LANG_1, impl, [FE_C_SUPPORT, FE_CPP, FE_PY, FE_YAML, FE_RUST]
def init_tree_sitter(comment_type: CommentType) -> tuple[Parser, Query]:
    """Build the tree-sitter parser and comment query for a comment type.

    :param comment_type: Comment style selecting the grammar to load.
    :return: The parser for the selected grammar and the query compiled
        against the same language.
    :raises ValueError: If ``comment_type`` is not one of the handled styles.
    """
    # Each grammar package is imported inside its own branch, so only the
    # requested one is loaded.
    if comment_type == CommentType.cpp:
        import tree_sitter_cpp  # noqa: PLC0415

        grammar, query_source = tree_sitter_cpp.language(), CPP_QUERY
    elif comment_type == CommentType.python:
        import tree_sitter_python  # noqa: PLC0415

        grammar, query_source = tree_sitter_python.language(), PYTHON_QUERY
    elif comment_type == CommentType.cs:
        import tree_sitter_c_sharp  # noqa: PLC0415

        grammar, query_source = tree_sitter_c_sharp.language(), C_SHARP_QUERY
    elif comment_type == CommentType.yaml:
        import tree_sitter_yaml  # noqa: PLC0415

        grammar, query_source = tree_sitter_yaml.language(), YAML_QUERY
    elif comment_type == CommentType.rust:
        import tree_sitter_rust  # noqa: PLC0415

        grammar, query_source = tree_sitter_rust.language(), RUST_QUERY
    else:
        raise ValueError(f"Unsupported comment style: {comment_type}")

    parsed_language = Language(grammar)
    query = Query(parsed_language, query_source)
    return Parser(parsed_language), query
111
112
def wrap_read_callable_point(
    src_string: ByteString,
) -> Callable[[int, Point], ByteString]:
    """Wrap a byte buffer in a read callback taking ``(byte_offset, point)``.

    :param src_string: Source buffer the callback reads from.
    :return: A callable returning the single byte at ``byte_offset``, or an
        empty slice once the offset is past the end of the buffer. The point
        argument is ignored.
    """

    def read_single_byte(offset: int, _point: Point) -> ByteString:
        stop = offset + 1
        return src_string[offset:stop]

    return read_single_byte
120
121
# @Comment extraction from source code using tree-sitter, IMPL_EXTR_1, impl, [FE_DEF]
def extract_comments(
    src_string: ByteString, parser: Parser, query: Query
) -> list[TreeSitterNode] | None:
    """Parse the source and return the nodes captured as ``comment``.

    :param src_string: Source code as bytes.
    :param parser: Parser used to build the syntax tree.
    :param query: Query whose ``comment`` capture is collected.
    :return: The captured nodes, or ``None`` when the query produced no
        ``comment`` capture.
    """
    tree = parser.parse(wrap_read_callable_point(src_string))
    captures: dict[str, list[TreeSitterNode]] = QueryCursor(query).captures(
        tree.root_node
    )
    return captures.get("comment")
133
134
def find_enclosing_scope(
    node: TreeSitterNode, comment_type: CommentType = CommentType.cpp
) -> TreeSitterNode | None:
    """Return the nearest scope node at or above ``node``.

    The node itself is checked first, then each parent in turn. Comment types
    without an entry in ``SCOPE_NODE_TYPES`` fall back to the C/C++ scope types.

    :param node: Node to start the upward walk from.
    :param comment_type: Comment style selecting the scope node types.
    :return: The first node whose type is a scope type, or ``None``.
    """
    scope_types = SCOPE_NODE_TYPES.get(comment_type, SCOPE_NODE_TYPES[CommentType.cpp])
    candidate: TreeSitterNode | None = node
    while candidate and candidate.type not in scope_types:
        candidate = candidate.parent
    return candidate if candidate else None
146
147
def find_next_scope(
    node: TreeSitterNode, comment_type: CommentType = CommentType.cpp
) -> TreeSitterNode | None:
    """Return the first scope node at or after ``node`` among its siblings.

    The walk follows ``next_named_sibling``. When a sibling of type ``block``
    is reached, its direct named children are searched for a scope as well.
    Comment types without an entry in ``SCOPE_NODE_TYPES`` fall back to the
    C/C++ scope types.

    :param node: Node to start from.
    :param comment_type: Comment style selecting the scope node types.
    :return: The scope node found, or ``None`` when the siblings run out.
    """
    scope_types = SCOPE_NODE_TYPES.get(comment_type, SCOPE_NODE_TYPES[CommentType.cpp])
    cursor: TreeSitterNode | None = node
    while cursor:
        if cursor.type in scope_types:
            return cursor
        cursor = cursor.next_named_sibling
        if cursor and cursor.type == "block":
            # Look one level into the block before moving on
            nested = next(
                (child for child in cursor.named_children if child.type in scope_types),
                None,
            )
            if nested is not None:
                return nested
    return None
163
164
165def _find_yaml_structure_in_block_node(
166    block_node: TreeSitterNode,
167) -> TreeSitterNode | None:
168    """Find YAML structure elements within a block_node."""
169    for grandchild in block_node.named_children:
170        if grandchild.type == "block_mapping":
171            for ggchild in grandchild.named_children:
172                if ggchild.type == "block_mapping_pair":
173                    return ggchild
174        elif grandchild.type == "block_sequence":
175            for ggchild in grandchild.named_children:
176                if ggchild.type == "block_sequence_item":
177                    return ggchild
178    return None
179
180
def find_yaml_next_structure(node: TreeSitterNode) -> TreeSitterNode | None:
    """Return the first YAML structure found among the following siblings.

    A sibling that is itself a mapping pair, sequence item, flow mapping or
    flow sequence is returned directly. A ``block_node`` sibling, or the
    ``block_node`` children of a ``document`` sibling, are searched with
    ``_find_yaml_structure_in_block_node``.

    :param node: Comment node whose following named siblings are walked.
    :return: The structure node found, or ``None``.
    """
    direct_structures = {
        "block_mapping_pair",
        "block_sequence_item",
        "flow_mapping",
        "flow_sequence",
    }
    sibling = node.next_named_sibling
    while sibling:
        kind = sibling.type
        if kind in direct_structures:
            return sibling

        # Collect the block nodes this sibling offers for a nested search
        if kind == "document":
            block_nodes = [
                child for child in sibling.named_children if child.type == "block_node"
            ]
        elif kind == "block_node":
            block_nodes = [sibling]
        else:
            block_nodes = []

        for block_node in block_nodes:
            found = _find_yaml_structure_in_block_node(block_node)
            if found:
                return found

        sibling = sibling.next_named_sibling
    return None
204
205
def find_yaml_prev_sibling_on_same_row(node: TreeSitterNode) -> TreeSitterNode | None:
    """Return the previous named sibling ending on the row the comment starts on.

    :param node: Comment node.
    :return: The closest previous named sibling whose end row equals the
        comment's start row (the comment is inline with it), or ``None``.
    """
    target_row = node.start_point.row
    sibling = node.prev_named_sibling
    while sibling:
        end_row = sibling.end_point.row
        if end_row == target_row:
            return sibling
        if end_row < target_row:
            # This sibling already ends above the comment; stop searching
            return None
        sibling = sibling.prev_named_sibling
    return None
223
224
def find_yaml_associated_structure(node: TreeSitterNode) -> TreeSitterNode | None:
    """Return the YAML structure a comment belongs to.

    Candidates are tried in this order: a previous sibling on the same row
    (inline comment), the next structure after the comment, and finally the
    closest ``block_mapping_pair`` or ``block_sequence_item`` ancestor.

    :param node: Comment node.
    :return: The associated structure node, or ``None`` if none is found.
    """
    nearby = find_yaml_prev_sibling_on_same_row(node) or find_yaml_next_structure(node)
    if nearby:
        return nearby

    # Nothing beside or after the comment: fall back to the enclosing structure
    ancestor = node.parent
    while ancestor:
        if ancestor.type in {"block_mapping_pair", "block_sequence_item"}:
            return ancestor
        ancestor = ancestor.parent
    return None
245
246
def find_associated_scope(
    node: TreeSitterNode, comment_type: CommentType = CommentType.cpp
) -> TreeSitterNode | None:
    """Return the scope node a comment is associated with.

    YAML comments use the YAML structure lookup. A node whose type equals
    ``CommentCategory.docstring`` is associated with its enclosing scope. Any
    other comment is associated with the next scope, falling back to the
    enclosing scope when there is none.

    :param node: Comment node.
    :param comment_type: Comment style of the source file.
    :return: The associated node, or ``None`` if none is found.
    """
    if comment_type == CommentType.yaml:
        return find_yaml_associated_structure(node)

    if node.type == CommentCategory.docstring:
        return find_enclosing_scope(node, comment_type)

    return find_next_scope(node, comment_type) or find_enclosing_scope(
        node, comment_type
    )
263
264
265def locate_git_root(src_dir: Path) -> Path | None:
266    """Traverse upwards to find git root."""
267    current = src_dir.resolve()
268    parents = list(current.parents)
269    parents.append(current)
270    for parent in parents:
271        if (parent / ".git").exists() and (parent / ".git").is_dir():
272            return parent
273    logger.warning(f"git root is not found in the parent of {src_dir}")
274    return None
275
276
277def get_remote_url(git_root: Path, remote_name: str = "origin") -> str | None:
278    """Get remote url from .git/config."""
279    config_path = git_root / ".git" / "config"
280    if not config_path.exists():
281        logging.warning(f"{config_path} does not exist")
282        return None
283
284    config = configparser.ConfigParser(allow_no_value=True, strict=False)
285    config.read(config_path)
286    section = f'remote "{remote_name}"'
287    if section in config and "url" in config[section]:
288        url: str = config[section]["url"]
289        return url
290    logger.warning(f"remote-url is not found in {config_path}")
291    return None
292
293
294def get_current_rev(git_root: Path) -> str | None:
295    """Get current commit rev from .git/HEAD."""
296    head_path = git_root / ".git" / "HEAD"
297    if not head_path.exists():
298        logging.warning(f"{head_path} does not exist")
299        return None
300    head_content = head_path.read_text().strip()
301    if not head_content.startswith("ref: "):
302        logging.warning(f"Expect starting with 'ref: ' in {head_path}")
303        return None
304
305    ref_path = git_root / ".git" / head_content.split(":", 1)[1].strip()
306    if not ref_path.exists():
307        logging.warning(f"{ref_path} does not exist")
308        return None
309    return ref_path.read_text().strip()
310
311
def form_https_url(
    git_url: str, rev: str, project_path: Path, filepath: Path, lineno: int
) -> str | None:
    """Build a web link to a line of a file from the repository's remote URL.

    :param git_url: Remote URL of the repository.
    :param rev: Revision put into the link.
    :param project_path: Directory the link path is made relative to.
    :param filepath: File to link to; it must be located under ``project_path``.
    :param lineno: Line number used as the link anchor.
    :return: The formatted link, or ``git_url`` unchanged (after logging a
        warning) when no URL template exists for the parsed host platform.
    """
    parsed_url = parse(git_url)
    template = GIT_HOST_URL_TEMPLATE.get(parsed_url.platform)
    if not template:
        logger.warning(f"Unsupported Git host: {parsed_url.platform}")
        return git_url

    # Both sides are made absolute so that a relative project_path can still
    # be compared against the absolute file path.
    relative_path = filepath.absolute().relative_to(Path(project_path).absolute())
    https_url = template.format(
        owner=parsed_url.owner,
        repo=parsed_url.repo,
        rev=rev,
        path=pathname2url(str(relative_path)),
        lineno=str(lineno),
    )
    return https_url
328
329
def remove_leading_sequences(text: str, leading_sequences: list[str]) -> str:
    """Drop the comment marker from every line of ``text``.

    For each line, the sequences are tried in list order; the first one that
    occurs in the line wins, and everything up to and including its first
    occurrence is removed. Lines containing none of the sequences are kept
    unchanged. Line endings are preserved.

    :param text: Text to process, possibly spanning several lines.
    :param leading_sequences: Marker sequences in order of priority.
    :return: The text with the markers removed.
    """

    def strip_marker(line: str) -> str:
        for marker in leading_sequences:
            position = line.find(marker)
            if position != -1:
                return line[position + len(marker) :]
        return line

    return "".join(strip_marker(line) for line in text.splitlines(keepends=True))
349
350
class ExtractedRstType(TypedDict):
    """Result of extracting an rst block from a comment text."""

    # The rst text found between the start and end markers
    rst_text: str
    # Number of lines of the comment text preceding the start marker
    row_offset: int
    # Index in the comment text where ``rst_text`` begins
    start_idx: int
    # Index in the comment text where the end marker begins
    end_idx: int
356
357
# @Extract reStructuredText blocks embedded in comments, IMPL_RST_1, impl, [FE_RST_EXTRACTION]
def extract_rst(
    text: str, start_marker: str, end_marker: str
) -> ExtractedRstType | None:
    """Extract rst from a comment.

    The text between the first start marker and the last end marker is taken.

    Two use cases:
    1. Start marker and end marker on the same line.

    The rst text is wrapped by the start and the end markers on the same line,
    so there is no need to remove the leading chars.
    E.g.
    @rst  .. admonition:: title here @endrst

    2. Start marker and end marker on different lines.

    The rst text is expected to start on the line after the start marker
    and to end on the line before the end marker.
    E.g.
    @rst
    .. admonition:: title here
      :collapsible: open

      This example is collapsible, and initially open.
    @endrst

    :param text: Comment text to search.
    :param start_marker: Marker opening the rst block.
    :param end_marker: Marker closing the rst block.
    :return: The extracted text with its position data, or ``None`` when a
        marker is missing or only whitespace lies between the markers.
    """
    marker_idx = text.find(start_marker)
    end_idx = text.rfind(end_marker)
    if -1 in (marker_idx, end_idx):
        return None

    content_idx = marker_idx + len(start_marker)
    rst_text = text[content_idx:end_idx]
    if not rst_text.strip():
        # empty string is out of the interest
        return None

    newline_idx = rst_text.find(UNIX_NEWLINE)
    if newline_idx != -1:
        # Multiline rst text: drop the remainder of the start marker's line
        skipped = newline_idx + len(UNIX_NEWLINE)
        rst_text = rst_text[skipped:]
        content_idx += skipped

    extracted: ExtractedRstType = {
        "rst_text": rst_text,
        "row_offset": len(text[:marker_idx].splitlines()),
        "start_idx": content_idx,
        "end_idx": end_idx,
    }
    return extracted