1from collections.abc import ByteString, Callable
  2import configparser
  3from pathlib import Path
  4from typing import TypedDict
  5from urllib.request import pathname2url
  6
  7from giturlparse import parse  # type: ignore[import-untyped]
  8from tree_sitter import Language, Parser, Point, Query, QueryCursor
  9from tree_sitter import Node as TreeSitterNode
 10
 11from sphinx_codelinks.config import UNIX_NEWLINE, CommentCategory
 12from sphinx_codelinks.logger import get_logger
 13from sphinx_codelinks.source_discover.config import CommentType
 14
 15# Language-specific node types for scope detection.
 16#
 17# YAML and JSONC are intentionally absent. They are data formats, not code, so a
 18# comment associates with the surrounding data structure (key/value pair, list
 19# item, or scalar) rather than with an enclosing or following declaration. That
 20# needs a different algorithm (inline same-row association first, scalar targets,
 21# grammar-specific traversal), implemented in find_yaml_associated_structure and
 22# find_jsonc_associated_structure and dispatched from find_associated_scope.
 23# Those bespoke finders never read this table (only find_next_scope and
 24# find_enclosing_scope do), so an entry here would be dead.
#
# Maps each supported CommentType to the set of tree-sitter node type names
# that count as a scope for that language. find_enclosing_scope and
# find_next_scope look a comment type up here and fall back to the
# CommentType.cpp entry when the comment type has no entry of its own.
SCOPE_NODE_TYPES = {
    # @Python Scope Node Types, IMPL_PY_2, impl, [FE_PY]
    CommentType.python: {"function_definition", "class_definition"},
    # NOTE(review): the tree-sitter C/C++ grammars appear to name class and
    # struct nodes "class_specifier" / "struct_specifier"; confirm that
    # "class_definition" below can actually match a node in those grammars.
    # @C and C++ Scope Node Types, IMPL_C_2, impl, [FE_C_SUPPORT, FE_CPP]
    CommentType.cpp: {"function_definition", "class_definition"},
    # C#: methods, classes and properties are treated as scopes.
    CommentType.cs: {"method_declaration", "class_declaration", "property_declaration"},
    # @Rust Scope Node Types, IMPL_RUST_2, impl, [FE_RUST];
    CommentType.rust: {
        "function_item",
        "struct_item",
        "enum_item",
        "impl_item",
        "trait_item",
        "mod_item",
    },
    # @Go Scope Node Types, IMPL_GO_4, impl, [FE_GO]
    CommentType.go: {
        "function_declaration",
        "method_declaration",
        "type_declaration",
        "type_spec",
    },
}
 48
# Module-level logger shared by every helper in this file.
logger = get_logger(__name__)

# Browser URL templates keyed by git hosting platform name. form_https_url
# selects a template by the ``platform`` of the parsed remote URL and fills the
# placeholders (owner, repo, rev, path, lineno) with ``str.format``.
GIT_HOST_URL_TEMPLATE = {
    "github": "https://github.com/{owner}/{repo}/blob/{rev}/{path}#L{lineno}",
    "gitlab": "https://gitlab.com/{owner}/{repo}/-/blob/{rev}/{path}#L{lineno}",
}
 55
# Tree-sitter query sources, one per supported grammar. init_tree_sitter
# compiles the matching query, and every pattern captures its nodes under the
# name ``comment``, which is the capture that extract_comments returns.
#
# The Python query additionally captures the leading string expression of a
# module, function body or class body (i.e. docstrings) under the same name.
PYTHON_QUERY = """
                ; Match comments
                (comment) @comment

                ; Match docstrings inside modules, functions, or classes
                (module (expression_statement (string)) @comment)
                (function_definition (block (expression_statement (string)) @comment))
                (class_definition (block (expression_statement (string)) @comment))
            """
CPP_QUERY = """(comment) @comment"""
C_SHARP_QUERY = """(comment) @comment"""
YAML_QUERY = """(comment) @comment"""
# The Rust query captures line comments and block comments as separate node types.
RUST_QUERY = """
    (line_comment) @comment
    (block_comment) @comment
"""
# @Go comment query for tree-sitter, IMPL_GO_3, impl, [FE_GO]
GO_QUERY = """
    (comment) @comment
"""
JSONC_QUERY = """(comment) @comment"""
 77
# JSON value node types that can be associated with a comment.
# find_jsonc_associated_structure walks the named siblings that follow a
# comment and returns the first one whose type is listed here.
JSON_STRUCTURE_TYPES = {
    "pair",
    "object",
    "array",
    "string",
    "number",
    "true",
    "false",
    "null",
}
 89
 90
 91def is_text_file(filepath: Path, sample_size: int = 2048) -> bool:
 92    """Return True if file is likely text, False if binary."""
 93    try:
 94        with filepath.open("rb") as f:
 95            chunk = f.read(sample_size)
 96        # Quick binary heuristic: null byte present
 97        if b"\x00" in chunk:
 98            return False
 99        # Try UTF-8 decode on the sample
100        chunk.decode("utf-8")
101        return True
102    except UnicodeDecodeError:
103        return False
104
105
[docs]106# @Tree-sitter parser initialization for multiple languages, IMPL_LANG_1, impl, [FE_C_SUPPORT, FE_CPP, FE_PY, FE_YAML, FE_RUST, FE_GO, FE_JSONC]
107def init_tree_sitter(comment_type: CommentType) -> tuple[Parser, Query]:
108    if comment_type == CommentType.cpp:
109        import tree_sitter_cpp  # noqa: PLC0415
110
111        parsed_language = Language(tree_sitter_cpp.language())
112        query = Query(parsed_language, CPP_QUERY)
113    elif comment_type == CommentType.python:
114        import tree_sitter_python  # noqa: PLC0415
115
116        parsed_language = Language(tree_sitter_python.language())
117        query = Query(parsed_language, PYTHON_QUERY)
118    elif comment_type == CommentType.cs:
119        import tree_sitter_c_sharp  # noqa: PLC0415
120
121        parsed_language = Language(tree_sitter_c_sharp.language())
122        query = Query(parsed_language, C_SHARP_QUERY)
123    elif comment_type == CommentType.yaml:
124        import tree_sitter_yaml  # noqa: PLC0415
125
126        parsed_language = Language(tree_sitter_yaml.language())
127        query = Query(parsed_language, YAML_QUERY)
128    elif comment_type == CommentType.rust:
129        import tree_sitter_rust  # noqa: PLC0415
130
131        parsed_language = Language(tree_sitter_rust.language())
132        query = Query(parsed_language, RUST_QUERY)
133    elif comment_type == CommentType.go:
134        import tree_sitter_go  # noqa: PLC0415
135
136        parsed_language = Language(tree_sitter_go.language())
137        query = Query(parsed_language, GO_QUERY)
138    elif comment_type == CommentType.jsonc:
139        import tree_sitter_json  # noqa: PLC0415
140
141        parsed_language = Language(tree_sitter_json.language())
142        query = Query(parsed_language, JSONC_QUERY)
143    else:
144        raise ValueError(f"Unsupported comment style: {comment_type}")
145    parser = Parser(parsed_language)
146    return parser, query
147
148
def wrap_read_callable_point(
    src_string: ByteString,
) -> Callable[[int, Point], ByteString]:
    """Build a read callback that serves ``src_string`` one byte per call.

    :param src_string: Source buffer the callback reads from.
    :return: A callable taking ``(byte_offset, point)`` that returns the single
        byte at ``byte_offset``; the ``point`` argument is ignored.
    """

    def read_single_byte(offset: int, _point: Point) -> ByteString:
        # Slicing (not indexing) keeps the result bytes-like and yields an
        # empty value instead of raising once ``offset`` is past the end.
        stop = offset + 1
        return src_string[offset:stop]

    return read_single_byte
156
157
[docs]158# @Comment extraction from source code using tree-sitter, IMPL_EXTR_1, impl, [FE_DEF]
def extract_comments(
    src_string: ByteString, parser: Parser, query: Query
) -> list[TreeSitterNode] | None:
    """Parse ``src_string`` and return the nodes captured as ``comment``.

    :param src_string: Source code buffer to parse.
    :param parser: Parser for the language of ``src_string``.
    :param query: Query whose ``comment`` capture selects the wanted nodes.
    :return: The captured nodes, or ``None`` when the query captured nothing
        under the name ``comment``.
    """
    tree = parser.parse(wrap_read_callable_point(src_string))
    captured: dict[str, list[TreeSitterNode]] = QueryCursor(query).captures(
        tree.root_node
    )
    return captured.get("comment")
169
170
def find_enclosing_scope(
    node: TreeSitterNode, comment_type: CommentType = CommentType.cpp
) -> TreeSitterNode | None:
    """Return the closest scope node at or above ``node``.

    The walk starts with ``node`` itself and follows ``parent`` links until a
    node whose type is a scope type for ``comment_type`` is reached.

    :param node: Node to start from.
    :param comment_type: Selects the scope node types; comment types without
        their own entry use the C/C++ scope types.
    :return: The matching node, or ``None`` if the root is passed without a match.
    """
    fallback_types = SCOPE_NODE_TYPES[CommentType.cpp]
    scope_types = SCOPE_NODE_TYPES.get(comment_type, fallback_types)

    ancestor: TreeSitterNode | None = node
    while ancestor and ancestor.type not in scope_types:
        ancestor = ancestor.parent
    return ancestor if ancestor else None
182
183
def find_next_scope(
    node: TreeSitterNode, comment_type: CommentType = CommentType.cpp
) -> TreeSitterNode | None:
    """Return the first scope node found among ``node`` and its following siblings.

    Each step checks the current node, then moves to its next named sibling.
    When that sibling is a ``block`` node, its direct named children are
    searched for a scope node as well.

    :param node: Node to start from.
    :param comment_type: Selects the scope node types; comment types without
        their own entry use the C/C++ scope types.
    :return: The matching node, or ``None`` when the siblings are exhausted.
    """
    fallback_types = SCOPE_NODE_TYPES[CommentType.cpp]
    scope_types = SCOPE_NODE_TYPES.get(comment_type, fallback_types)

    cursor: TreeSitterNode | None = node
    while cursor:
        if cursor.type in scope_types:
            return cursor
        cursor = cursor.next_named_sibling
        if not cursor or cursor.type != "block":
            continue
        # Only the direct children of a block sibling are inspected.
        nested_scope = next(
            (child for child in cursor.named_children if child.type in scope_types),
            None,
        )
        if nested_scope is not None:
            return nested_scope
    return None
199
200
def _find_yaml_structure_in_block_node(
    block_node: TreeSitterNode,
) -> TreeSitterNode | None:
    """Return the first mapping pair or sequence item inside ``block_node``.

    The named children of ``block_node`` are visited in order. For a
    ``block_mapping`` child the first ``block_mapping_pair`` is returned, for a
    ``block_sequence`` child the first ``block_sequence_item``.

    :param block_node: Node whose named children are searched.
    :return: The first matching element, or ``None`` if there is none.
    """
    # Container node type -> type of the element wanted from that container.
    element_type_by_container = {
        "block_mapping": "block_mapping_pair",
        "block_sequence": "block_sequence_item",
    }
    for container in block_node.named_children:
        wanted_type = element_type_by_container.get(container.type)
        if wanted_type is None:
            continue
        for element in container.named_children:
            if element.type == wanted_type:
                return element
    return None
215
216
def find_yaml_next_structure(node: TreeSitterNode) -> TreeSitterNode | None:
    """Return the first YAML structure element that follows ``node``.

    The named siblings after ``node`` are visited in order. A mapping pair,
    sequence item, flow mapping or flow sequence is returned directly. A
    ``block_node`` sibling, or each ``block_node`` child of a ``document``
    sibling, is searched for its first pair or item instead.

    :param node: The comment node.
    :return: The structure element, or ``None`` if no sibling provides one.
    """
    direct_targets = {
        "block_mapping_pair",
        "block_sequence_item",
        "flow_mapping",
        "flow_sequence",
    }
    sibling = node.next_named_sibling
    while sibling:
        sibling_type = sibling.type
        if sibling_type in direct_targets:
            return sibling

        # Collect the block nodes whose contents have to be searched.
        if sibling_type == "document":
            block_nodes = [
                child for child in sibling.named_children if child.type == "block_node"
            ]
        elif sibling_type == "block_node":
            block_nodes = [sibling]
        else:
            block_nodes = []

        for block_node in block_nodes:
            found = _find_yaml_structure_in_block_node(block_node)
            if found:
                return found
        sibling = sibling.next_named_sibling
    return None
240
241
def find_prev_sibling_on_same_row(node: TreeSitterNode) -> TreeSitterNode | None:
    """Return the preceding named sibling that ends on the row where ``node`` starts.

    Grammar-agnostic: used to detect inline comments in both YAML and JSONC.

    :param node: The comment node.
    :return: The sibling sharing the comment's row, or ``None`` if there is none.
    """
    target_row = node.start_point.row
    sibling = node.prev_named_sibling
    while sibling:
        end_row = sibling.end_point.row
        if end_row == target_row:
            # The sibling ends where the comment starts: an inline comment.
            return sibling
        if end_row < target_row:
            # Siblings further back end even earlier, so the search is over.
            return None
        sibling = sibling.prev_named_sibling
    return None
262
263
def find_yaml_associated_structure(node: TreeSitterNode) -> TreeSitterNode | None:
    """Return the YAML structure (key-value pair, list item, etc.) a comment belongs to.

    Candidates are tried in this order:

    1. a previous sibling on the comment's row (inline comment),
    2. the next structure following the comment,
    3. the closest enclosing mapping pair or sequence item.

    :param node: The comment node.
    :return: The associated structure node, or ``None`` if nothing matches.
    """
    # Inline first, then the following structure; the second lookup only runs
    # when the first one found nothing.
    nearby = find_prev_sibling_on_same_row(node) or find_yaml_next_structure(node)
    if nearby:
        return nearby

    # Fall back to the enclosing pair or item.
    enclosing_types = {"block_mapping_pair", "block_sequence_item"}
    ancestor = node.parent
    while ancestor and ancestor.type not in enclosing_types:
        ancestor = ancestor.parent
    return ancestor if ancestor else None
284
285
[docs]286# @JSONC comment-to-structure association, IMPL_JSONC_2, impl, [FE_JSONC]
def find_jsonc_associated_structure(node: TreeSitterNode) -> TreeSitterNode | None:
    """Return the JSON structure (key/value pair, value, list item) a comment belongs to.

    JSON is data rather than code, so the association mirrors the YAML rules:

    1. inline comment: the sibling that ends on the comment's row,
    2. leading comment: the first following sibling that is a JSON structure,
    3. otherwise: the closest enclosing pair, object or array.

    :param node: The comment node.
    :return: The associated structure node, or ``None`` if nothing matches.
    """
    inline_target = find_prev_sibling_on_same_row(node)
    if inline_target:
        return inline_target

    # Leading comment: skip siblings that are not JSON structures.
    follower = node.next_named_sibling
    while follower and follower.type not in JSON_STRUCTURE_TYPES:
        follower = follower.next_named_sibling
    if follower:
        return follower

    # Neither inline nor leading: climb to the enclosing container.
    container_types = {"pair", "object", "array"}
    container = node.parent
    while container and container.type not in container_types:
        container = container.parent
    return container if container else None
314
315
def find_associated_scope(
    node: TreeSitterNode, comment_type: CommentType = CommentType.cpp
) -> TreeSitterNode | None:
    """Return the scope or structure that a comment node is associated with.

    YAML and JSONC comments are handed to their data-aware finders. A node
    whose type equals ``CommentCategory.docstring`` is associated with its
    enclosing scope. Any other comment is associated with the next scope, or
    with the enclosing scope when no next scope exists.

    :param node: The comment node.
    :param comment_type: Language of the source the comment comes from.
    :return: The associated node, or ``None`` if nothing matches.
    """
    if comment_type == CommentType.yaml:
        return find_yaml_associated_structure(node)
    if comment_type == CommentType.jsonc:
        return find_jsonc_associated_structure(node)

    if node.type == CommentCategory.docstring:
        # Only Python docstrings take this branch.
        return find_enclosing_scope(node, comment_type)

    # General comments: prefer the following scope, else the surrounding one.
    return find_next_scope(node, comment_type) or find_enclosing_scope(
        node, comment_type
    )
336
337
338def locate_git_root(src_dir: Path) -> Path | None:
339    """Traverse upwards to find git root."""
340    current = src_dir.resolve()
341    parents = list(current.parents)
342    parents.append(current)
343    for parent in parents:
344        if (parent / ".git").exists() and (parent / ".git").is_dir():
345            return parent
346    logger.warning(
347        f"git root is not found in the parent of {src_dir}",
348        subtype="git_root",
349        location=str(src_dir),
350    )
351    return None
352
353
354def get_remote_url(git_root: Path, remote_name: str = "origin") -> str | None:
355    """Get remote url from .git/config."""
356    config_path = git_root / ".git" / "config"
357    if not config_path.exists():
358        logger.warning(
359            f"{config_path} does not exist",
360            subtype="git_config",
361            location=str(config_path),
362        )
363        return None
364
365    config = configparser.ConfigParser(allow_no_value=True, strict=False)
366    config.read(config_path)
367    section = f'remote "{remote_name}"'
368    if section in config and "url" in config[section]:
369        url: str = config[section]["url"]
370        return url
371    logger.warning(
372        f"remote-url is not found in {config_path}",
373        subtype="git_remote",
374        location=str(config_path),
375    )
376    return None
377
378
379def get_current_rev(git_root: Path) -> str | None:
380    """Get current commit rev from .git/HEAD."""
381    head_path = git_root / ".git" / "HEAD"
382    if not head_path.exists():
383        logger.warning(
384            f"{head_path} does not exist",
385            subtype="git_head",
386            location=str(head_path),
387        )
388        return None
389    head_content = head_path.read_text().strip()
390    if not head_content.startswith("ref: "):
391        # Detached HEAD (e.g. CI checkouts): .git/HEAD holds the commit SHA
392        # directly, which is exactly the rev we want.
393        return head_content
394
395    ref_path = git_root / ".git" / head_content.split(":", 1)[1].strip()
396    if not ref_path.exists():
397        logger.warning(
398            f"{ref_path} does not exist",
399            subtype="git_ref",
400            location=str(ref_path),
401        )
402        return None
403    return ref_path.read_text().strip()
404
405
def form_https_url(
    git_url: str, rev: str, project_path: Path, filepath: Path, lineno: int
) -> str | None:
    """Build a browser URL pointing at a line of a file on the git host.

    :param git_url: Remote URL of the repository.
    :param rev: Revision placed into the URL.
    :param project_path: Directory that the path in the URL is relative to.
    :param filepath: File to link to; made absolute before being relativized.
    :param lineno: Line number used as the URL fragment.
    :return: The formatted URL, or ``git_url`` unchanged (after logging a
        warning) when the host platform has no URL template.
    """
    remote = parse(git_url)
    url_template = GIT_HOST_URL_TEMPLATE.get(remote.platform)
    if url_template:
        fields = {
            "owner": remote.owner,
            "repo": remote.repo,
            "rev": rev,
            "path": pathname2url(str(filepath.absolute().relative_to(project_path))),
            "lineno": str(lineno),
        }
        return url_template.format(**fields)

    logger.warning(
        f"Unsupported Git host: {remote.platform}",
        subtype="git_host",
    )
    return git_url
425
426
def remove_leading_sequences(text: str, leading_sequences: list[str]) -> str:
    """Strip the comment prefix from every line of ``text``.

    For each line, the first entry of ``leading_sequences`` (in list order)
    that occurs anywhere in the line is located, and everything up to and
    including its first occurrence is dropped. Lines containing none of the
    sequences are kept unchanged. Line endings are preserved.

    :param text: Possibly multi-line comment text.
    :param leading_sequences: Prefixes to look for, in priority order.
    :return: The text with the prefixes removed.
    """

    def strip_prefix(line: str) -> str:
        for sequence in leading_sequences:
            if (position := line.find(sequence)) != -1:
                return line[position + len(sequence) :]
        return line

    return "".join(strip_prefix(line) for line in text.splitlines(keepends=True))
446
447
class ExtractedRstType(TypedDict):
    """Result returned by ``extract_rst`` for a comment containing rst text."""

    # The text found between the start and end markers. For multi-line rst the
    # remainder of the start-marker line (up to its newline) is not included.
    rst_text: str
    # Number of lines in the text preceding the start marker, computed as
    # len(text[:start_idx].splitlines()).
    row_offset: int
    # Index into the original text at which rst_text begins.
    start_idx: int
    # Index into the original text of the last end marker, i.e. where rst_text ends.
    end_idx: int
453
454
[docs]455# @Extract reStructuredText blocks embedded in comments, IMPL_RST_1, impl, [FE_RST_EXTRACTION]
def extract_rst(
    text: str, start_marker: str, end_marker: str
) -> ExtractedRstType | None:
    """Extract rst from a comment.

    The rst text lies between the first ``start_marker`` and the last
    ``end_marker`` found in ``text``. Two use cases are supported:

    1. Start marker and end marker on the same line.

    The rst text is wrapped by the start and end markers on the same line,
    so there is no need to remove leading characters.
    E.g.
    @rst  .. admonition:: title here @endrst

    2. Start marker and end marker on different lines.

    The rst text is expected to start on the line after the start marker
    and to end on the line before the end marker.
    E.g.
    @rst
    .. admonition:: title here
      :collapsible: open

      This example is collapsible, and initially open.
    @endrst

    :param text: Comment text to search.
    :param start_marker: Marker that opens the rst text.
    :param end_marker: Marker that closes the rst text.
    :return: The extracted text with its position information, or ``None``
        when a marker is missing or the text between the markers is blank.
    """
    start_idx = text.find(start_marker)
    end_idx = text.rfind(end_marker)
    if -1 in (start_idx, end_idx):
        return None

    content_start = start_idx + len(start_marker)
    content = text[content_start:end_idx]
    if not content.strip():
        # Blank content is of no interest.
        return None

    newline_pos = content.find(UNIX_NEWLINE)
    if newline_pos != -1:
        # Multi-line rst: drop the rest of the start-marker line so the text
        # begins on the following line, and shift the start index to match.
        skipped = newline_pos + len(UNIX_NEWLINE)
        content = content[skipped:]
        content_start += skipped

    extracted: ExtractedRstType = {
        "rst_text": content,
        "row_offset": len(text[:start_idx].splitlines()),
        "start_idx": content_start,
        "end_idx": end_idx,
    }
    return extracted