1from collections.abc import ByteString, Callable
2import configparser
3from pathlib import Path
4from typing import TypedDict
5from urllib.request import pathname2url
6
7from giturlparse import parse # type: ignore[import-untyped]
8from tree_sitter import Language, Parser, Point, Query, QueryCursor
9from tree_sitter import Node as TreeSitterNode
10
11from sphinx_codelinks.config import UNIX_NEWLINE, CommentCategory
12from sphinx_codelinks.logger import get_logger
13from sphinx_codelinks.source_discover.config import CommentType
14
15# Language-specific node types for scope detection.
16#
17# YAML and JSONC are intentionally absent. They are data formats, not code, so a
18# comment associates with the surrounding data structure (key/value pair, list
19# item, or scalar) rather than with an enclosing or following declaration. That
20# needs a different algorithm (inline same-row association first, scalar targets,
21# grammar-specific traversal), implemented in find_yaml_associated_structure and
22# find_jsonc_associated_structure and dispatched from find_associated_scope.
23# Those bespoke finders never read this table (only find_next_scope and
24# find_enclosing_scope do), so an entry here would be dead.
# Maps a CommentType to the set of tree-sitter node type names treated as a
# scope. find_enclosing_scope and find_next_scope fall back to the
# CommentType.cpp entry when a comment type has no entry of its own.
SCOPE_NODE_TYPES = {
    # @Python Scope Node Types, IMPL_PY_2, impl, [FE_PY]
    CommentType.python: {"function_definition", "class_definition"},
    # @C and C++ Scope Node Types, IMPL_C_2, impl, [FE_C_SUPPORT, FE_CPP]
    CommentType.cpp: {"function_definition", "class_definition"},
    CommentType.cs: {"method_declaration", "class_declaration", "property_declaration"},
    # @Rust Scope Node Types, IMPL_RUST_2, impl, [FE_RUST];
    CommentType.rust: {
        "function_item",
        "struct_item",
        "enum_item",
        "impl_item",
        "trait_item",
        "mod_item",
    },
    # @Go Scope Node Types, IMPL_GO_4, impl, [FE_GO]
    CommentType.go: {
        "function_declaration",
        "method_declaration",
        "type_declaration",
        "type_spec",
    },
}
48
logger = get_logger(__name__)

# URL templates keyed by the platform name that form_https_url reads from the
# parsed remote URL. form_https_url fills in the {owner}, {repo}, {rev},
# {path} and {lineno} placeholders; a platform without an entry here is
# reported there as an unsupported Git host.
GIT_HOST_URL_TEMPLATE = {
    "github": "https://github.com/{owner}/{repo}/blob/{rev}/{path}#L{lineno}",
    "gitlab": "https://gitlab.com/{owner}/{repo}/-/blob/{rev}/{path}#L{lineno}",
}
55
56PYTHON_QUERY = """
57 ; Match comments
58 (comment) @comment
59
60 ; Match docstrings inside modules, functions, or classes
61 (module (expression_statement (string)) @comment)
62 (function_definition (block (expression_statement (string)) @comment))
63 (class_definition (block (expression_statement (string)) @comment))
64 """
# The C/C++, C# and YAML queries each capture every `comment` node as @comment.
CPP_QUERY = """(comment) @comment"""
C_SHARP_QUERY = """(comment) @comment"""
YAML_QUERY = """(comment) @comment"""
68RUST_QUERY = """
69 (line_comment) @comment
70 (block_comment) @comment
71"""
[docs] 72# @Go comment query for tree-sitter, IMPL_GO_3, impl, [FE_GO]
73GO_QUERY = """
74 (comment) @comment
75"""
# The JSONC query captures every `comment` node as @comment.
JSONC_QUERY = """(comment) @comment"""

# JSON value node types that can be associated with a comment.
# find_jsonc_associated_structure tests the siblings that follow a comment
# against this set to pick the structure a leading comment belongs to.
JSON_STRUCTURE_TYPES = {
    "pair",
    "object",
    "array",
    "string",
    "number",
    "true",
    "false",
    "null",
}
89
90
def is_text_file(filepath: Path, sample_size: int = 2048) -> bool:
    """Return True if the file is likely text, False if it looks binary.

    Only the first ``sample_size`` bytes of ``filepath`` are inspected. The
    sample counts as text when it contains no NUL byte and is valid UTF-8.
    A multi-byte character that is merely cut off by the end of the sample is
    not treated as a decoding error, because the rest of the character may
    simply lie beyond the sampled bytes.

    Errors raised while opening or reading the file (``OSError``) propagate
    to the caller.
    """
    import codecs  # noqa: PLC0415

    with filepath.open("rb") as f:
        chunk = f.read(sample_size)

    # Quick binary heuristic: null byte present
    if b"\x00" in chunk:
        return False

    # A short read means the sample is the entire file, so an incomplete
    # trailing sequence is a genuine error. A full-size sample (or an
    # unbounded read requested with a negative size is the whole file too)
    # may instead have been cut in the middle of a character.
    sample_is_whole_file = sample_size < 0 or len(chunk) < sample_size
    decoder = codecs.getincrementaldecoder("utf-8")()
    try:
        # With final=False the decoder buffers an incomplete trailing sequence
        # instead of raising, while still rejecting invalid bytes.
        decoder.decode(chunk, final=sample_is_whole_file)
    except UnicodeDecodeError:
        return False
    return True
104
105
[docs]106# @Tree-sitter parser initialization for multiple languages, IMPL_LANG_1, impl, [FE_C_SUPPORT, FE_CPP, FE_PY, FE_YAML, FE_RUST, FE_GO, FE_JSONC]
107def init_tree_sitter(comment_type: CommentType) -> tuple[Parser, Query]:
108 if comment_type == CommentType.cpp:
109 import tree_sitter_cpp # noqa: PLC0415
110
111 parsed_language = Language(tree_sitter_cpp.language())
112 query = Query(parsed_language, CPP_QUERY)
113 elif comment_type == CommentType.python:
114 import tree_sitter_python # noqa: PLC0415
115
116 parsed_language = Language(tree_sitter_python.language())
117 query = Query(parsed_language, PYTHON_QUERY)
118 elif comment_type == CommentType.cs:
119 import tree_sitter_c_sharp # noqa: PLC0415
120
121 parsed_language = Language(tree_sitter_c_sharp.language())
122 query = Query(parsed_language, C_SHARP_QUERY)
123 elif comment_type == CommentType.yaml:
124 import tree_sitter_yaml # noqa: PLC0415
125
126 parsed_language = Language(tree_sitter_yaml.language())
127 query = Query(parsed_language, YAML_QUERY)
128 elif comment_type == CommentType.rust:
129 import tree_sitter_rust # noqa: PLC0415
130
131 parsed_language = Language(tree_sitter_rust.language())
132 query = Query(parsed_language, RUST_QUERY)
133 elif comment_type == CommentType.go:
134 import tree_sitter_go # noqa: PLC0415
135
136 parsed_language = Language(tree_sitter_go.language())
137 query = Query(parsed_language, GO_QUERY)
138 elif comment_type == CommentType.jsonc:
139 import tree_sitter_json # noqa: PLC0415
140
141 parsed_language = Language(tree_sitter_json.language())
142 query = Query(parsed_language, JSONC_QUERY)
143 else:
144 raise ValueError(f"Unsupported comment style: {comment_type}")
145 parser = Parser(parsed_language)
146 return parser, query
147
148
def wrap_read_callable_point(
    src_string: ByteString,
) -> Callable[[int, Point], ByteString]:
    """Build a read callback over ``src_string``.

    The returned callable takes a byte offset and a point (which it ignores)
    and returns the single byte at that offset, or an empty slice when the
    offset is at or past the end of ``src_string``.
    """

    def _read_one_byte(byte_offset: int, _point: Point) -> ByteString:
        next_offset = byte_offset + 1
        return src_string[byte_offset:next_offset]

    return _read_one_byte
156
157
[docs]158# @Comment extraction from source code using tree-sitter, IMPL_EXTR_1, impl, [FE_DEF]
def extract_comments(
    src_string: ByteString, parser: Parser, query: Query
) -> list[TreeSitterNode] | None:
    """Parse ``src_string`` and return the nodes the query captured as ``comment``.

    The source is fed to ``parser`` through the read callback produced by
    wrap_read_callable_point. Returns ``None`` when the captures mapping has
    no ``"comment"`` entry.
    """
    tree = parser.parse(wrap_read_callable_point(src_string))
    captured: dict[str, list[TreeSitterNode]] = QueryCursor(query).captures(
        tree.root_node
    )
    return captured.get("comment")
169
170
def find_enclosing_scope(
    node: TreeSitterNode, comment_type: CommentType = CommentType.cpp
) -> TreeSitterNode | None:
    """Return the nearest scope node at or above ``node``.

    Starting with ``node`` itself, parents are followed until a node whose
    type is one of the scope types for ``comment_type`` is found. Comment
    types without their own entry in SCOPE_NODE_TYPES use the C/C++ entry.
    Returns ``None`` when the chain of parents ends without a match.
    """
    cpp_scope_types = SCOPE_NODE_TYPES[CommentType.cpp]
    scope_types = SCOPE_NODE_TYPES.get(comment_type, cpp_scope_types)

    ancestor: TreeSitterNode | None = node
    while ancestor and ancestor.type not in scope_types:
        ancestor = ancestor.parent
    return ancestor if ancestor else None
182
183
def find_next_scope(
    node: TreeSitterNode, comment_type: CommentType = CommentType.cpp
) -> TreeSitterNode | None:
    """Return the first scope node at or after ``node`` among its siblings.

    ``node`` and then each following named sibling is tested against the
    scope types for ``comment_type`` (falling back to the C/C++ entry of
    SCOPE_NODE_TYPES). When a sibling is a ``block`` node, its direct named
    children are searched as well. Returns ``None`` when nothing matches.
    """
    cpp_scope_types = SCOPE_NODE_TYPES[CommentType.cpp]
    scope_types = SCOPE_NODE_TYPES.get(comment_type, cpp_scope_types)

    candidate: TreeSitterNode | None = node
    while candidate:
        if candidate.type in scope_types:
            return candidate

        candidate = candidate.next_named_sibling
        if not candidate or candidate.type != "block":
            continue

        # A block sibling is searched one level deep before the walk goes on.
        nested = next(
            (
                child
                for child in candidate.named_children
                if child.type in scope_types
            ),
            None,
        )
        if nested is not None:
            return nested
    return None
199
200
201def _find_yaml_structure_in_block_node(
202 block_node: TreeSitterNode,
203) -> TreeSitterNode | None:
204 """Find YAML structure elements within a block_node."""
205 for grandchild in block_node.named_children:
206 if grandchild.type == "block_mapping":
207 for ggchild in grandchild.named_children:
208 if ggchild.type == "block_mapping_pair":
209 return ggchild
210 elif grandchild.type == "block_sequence":
211 for ggchild in grandchild.named_children:
212 if ggchild.type == "block_sequence_item":
213 return ggchild
214 return None
215
216
def find_yaml_next_structure(node: TreeSitterNode) -> TreeSitterNode | None:
    """Return the first YAML structure among the named siblings after ``node``.

    A sibling that is itself a mapping pair, sequence item, flow mapping or
    flow sequence is returned directly. A ``block_node`` sibling, or the
    ``block_node`` children of a ``document`` sibling, are searched with
    _find_yaml_structure_in_block_node. Returns ``None`` when no following
    sibling yields a structure.
    """
    direct_structure_types = {
        "block_mapping_pair",
        "block_sequence_item",
        "flow_mapping",
        "flow_sequence",
    }

    sibling = node.next_named_sibling
    while sibling:
        sibling_type = sibling.type
        if sibling_type in direct_structure_types:
            return sibling

        # Collect the block nodes worth descending into for this sibling.
        if sibling_type == "document":
            block_nodes = [
                child
                for child in sibling.named_children
                if child.type == "block_node"
            ]
        elif sibling_type == "block_node":
            block_nodes = [sibling]
        else:
            block_nodes = []

        for block_node in block_nodes:
            found = _find_yaml_structure_in_block_node(block_node)
            if found:
                return found

        sibling = sibling.next_named_sibling
    return None
240
241
def find_prev_sibling_on_same_row(node: TreeSitterNode) -> TreeSitterNode | None:
    """Return the previous named sibling that ends on the row where ``node`` starts.

    Such a sibling marks ``node`` as an inline comment. Grammar-agnostic:
    used for both YAML and JSONC. Returns ``None`` when the nearest candidate
    ends on an earlier row or when there is no previous sibling.
    """
    comment_row = node.start_point.row

    # Skip any sibling that ends below the comment's row.
    sibling = node.prev_named_sibling
    while sibling and sibling.end_point.row > comment_row:
        sibling = sibling.prev_named_sibling

    # The walk stopped at a sibling ending on the comment's row (inline),
    # at one ending on an earlier row (not inline), or ran out of siblings.
    if sibling and sibling.end_point.row == comment_row:
        return sibling
    return None
262
263
def find_yaml_associated_structure(node: TreeSitterNode) -> TreeSitterNode | None:
    """Return the YAML structure (key-value pair, list item, etc.) a comment belongs to.

    Three candidates are tried in order: a previous sibling on the comment's
    own row (inline comment), the next structure after the comment, and
    finally the nearest enclosing mapping pair or sequence item. Returns
    ``None`` when none of them exists.
    """
    # Inline association wins; otherwise use the structure that follows.
    target = find_prev_sibling_on_same_row(node) or find_yaml_next_structure(node)
    if target:
        return target

    # Nothing beside or after the comment: climb to the enclosing structure.
    enclosing_types = {"block_mapping_pair", "block_sequence_item"}
    ancestor = node.parent
    while ancestor and ancestor.type not in enclosing_types:
        ancestor = ancestor.parent
    return ancestor if ancestor else None
284
285
[docs]286# @JSONC comment-to-structure association, IMPL_JSONC_2, impl, [FE_JSONC]
def find_jsonc_associated_structure(node: TreeSitterNode) -> TreeSitterNode | None:
    """Return the JSON structure (key/value pair, value, list item) a comment belongs to.

    JSON is data rather than code, so the association mirrors the YAML one:
    an inline comment belongs to the sibling ending on its row, a leading
    comment to the first following sibling whose type is in
    JSON_STRUCTURE_TYPES, and any other comment to the nearest enclosing
    pair, object or array. Returns ``None`` when none of these exists.
    """
    # Inline comment: a value/pair on the same row, before the comment.
    inline_target = find_prev_sibling_on_same_row(node)
    if inline_target:
        return inline_target

    # Leading comment: skip ahead to the first following structure.
    follower = node.next_named_sibling
    while follower and follower.type not in JSON_STRUCTURE_TYPES:
        follower = follower.next_named_sibling
    if follower:
        return follower

    # Otherwise: climb to the enclosing container.
    container_types = {"pair", "object", "array"}
    ancestor = node.parent
    while ancestor and ancestor.type not in container_types:
        ancestor = ancestor.parent
    return ancestor if ancestor else None
314
315
316def find_associated_scope(
317 node: TreeSitterNode, comment_type: CommentType = CommentType.cpp
318) -> TreeSitterNode | None:
319 """Find the associated scope of a comment."""
320 if comment_type == CommentType.yaml:
321 # YAML uses different structure association logic
322 return find_yaml_associated_structure(node)
323
324 if comment_type == CommentType.jsonc:
325 # JSONC uses data-aware structure association logic
326 return find_jsonc_associated_structure(node)
327
328 if node.type == CommentCategory.docstring:
329 # Only for python's docstring
330 return find_enclosing_scope(node, comment_type)
331 # General comments regardless of comment types
332 associated_scope = find_next_scope(node, comment_type)
333 if not associated_scope:
334 associated_scope = find_enclosing_scope(node, comment_type)
335 return associated_scope
336
337
338def locate_git_root(src_dir: Path) -> Path | None:
339 """Traverse upwards to find git root."""
340 current = src_dir.resolve()
341 parents = list(current.parents)
342 parents.append(current)
343 for parent in parents:
344 if (parent / ".git").exists() and (parent / ".git").is_dir():
345 return parent
346 logger.warning(
347 f"git root is not found in the parent of {src_dir}",
348 subtype="git_root",
349 location=str(src_dir),
350 )
351 return None
352
353
354def get_remote_url(git_root: Path, remote_name: str = "origin") -> str | None:
355 """Get remote url from .git/config."""
356 config_path = git_root / ".git" / "config"
357 if not config_path.exists():
358 logger.warning(
359 f"{config_path} does not exist",
360 subtype="git_config",
361 location=str(config_path),
362 )
363 return None
364
365 config = configparser.ConfigParser(allow_no_value=True, strict=False)
366 config.read(config_path)
367 section = f'remote "{remote_name}"'
368 if section in config and "url" in config[section]:
369 url: str = config[section]["url"]
370 return url
371 logger.warning(
372 f"remote-url is not found in {config_path}",
373 subtype="git_remote",
374 location=str(config_path),
375 )
376 return None
377
378
379def get_current_rev(git_root: Path) -> str | None:
380 """Get current commit rev from .git/HEAD."""
381 head_path = git_root / ".git" / "HEAD"
382 if not head_path.exists():
383 logger.warning(
384 f"{head_path} does not exist",
385 subtype="git_head",
386 location=str(head_path),
387 )
388 return None
389 head_content = head_path.read_text().strip()
390 if not head_content.startswith("ref: "):
391 # Detached HEAD (e.g. CI checkouts): .git/HEAD holds the commit SHA
392 # directly, which is exactly the rev we want.
393 return head_content
394
395 ref_path = git_root / ".git" / head_content.split(":", 1)[1].strip()
396 if not ref_path.exists():
397 logger.warning(
398 f"{ref_path} does not exist",
399 subtype="git_ref",
400 location=str(ref_path),
401 )
402 return None
403 return ref_path.read_text().strip()
404
405
def form_https_url(
    git_url: str, rev: str, project_path: Path, filepath: Path, lineno: int
) -> str | None:
    """Build a browsable URL pointing at a line of a file at a given revision.

    ``git_url`` is parsed to determine the hosting platform, owner and
    repository. When the platform has an entry in GIT_HOST_URL_TEMPLATE the
    template is filled in with ``rev``, the URL-quoted path of ``filepath``
    relative to ``project_path`` and ``lineno``. For any other platform a
    warning is logged and ``git_url`` is returned unchanged.
    """
    remote = parse(git_url)
    platform = remote.platform
    template = GIT_HOST_URL_TEMPLATE.get(platform)
    if not template:
        logger.warning(
            f"Unsupported Git host: {platform}",
            subtype="git_host",
        )
        return git_url

    placeholders = {
        "owner": remote.owner,
        "repo": remote.repo,
        "rev": rev,
        "path": pathname2url(str(filepath.absolute().relative_to(project_path))),
        "lineno": str(lineno),
    }
    return template.format(**placeholders)
425
426
def remove_leading_sequences(text: str, leading_sequences: list[str]) -> str:
    """Strip a comment marker, and everything before it, from each line of ``text``.

    For every line the entries of ``leading_sequences`` are tried in order.
    The first entry that occurs anywhere in the line wins: the line is cut
    just after the first occurrence of that entry. Lines that contain none
    of the entries are kept unchanged. Line endings are preserved.
    """
    stripped_lines = []
    for raw_line in text.splitlines(keepends=True):
        for marker in leading_sequences:
            marker_pos = raw_line.find(marker)
            if marker_pos != -1:
                stripped_lines.append(raw_line[marker_pos + len(marker) :])
                break
        else:
            # No marker matched this line: keep it as is.
            stripped_lines.append(raw_line)

    return "".join(stripped_lines)
446
447
class ExtractedRstType(TypedDict):
    """Result returned by extract_rst: the extracted text and its position."""

    # The RST text found between the start and end markers.
    rst_text: str
    # Number of lines (as counted by str.splitlines) in the text that
    # precedes the start marker.
    row_offset: int
    # Index in the searched text at which rst_text begins.
    start_idx: int
    # Index in the searched text of the last end marker, where rst_text stops.
    end_idx: int
453
454
[docs]455# @Extract reStructuredText blocks embedded in comments, IMPL_RST_1, impl, [FE_RST_EXTRACTION]
def extract_rst(
    text: str, start_marker: str, end_marker: str
) -> ExtractedRstType | None:
    """Extract the RST text wrapped by markers inside a comment.

    The text between the first ``start_marker`` and the last ``end_marker``
    is extracted. Returns ``None`` when either marker is missing or when only
    whitespace lies between them.

    Two use cases:
    1. Start marker and end marker on the same line.

       The RST text is wrapped by the start and end markers on the same line,
       so there are no leading characters to remove.
       E.g.
       @rst .. admonition:: title here @endrst

    2. Start marker and end marker on different lines.

       The RST text is expected to start on the line after the start marker
       and to end on the line before the end marker; whatever follows the
       start marker on its own line is dropped.
       E.g.
       @rst
       .. admonition:: title here
          :collapsible: open

          This example is collapsible, and initially open.
       @endrst
    """
    marker_idx = text.find(start_marker)
    end_idx = text.rfind(end_marker)
    if -1 in (marker_idx, end_idx):
        return None

    content_start = marker_idx + len(start_marker)
    content = text[content_start:end_idx]
    if not content.strip():
        # empty string is out of the interest
        return None

    first_newline_idx = content.find(UNIX_NEWLINE)
    if first_newline_idx != -1:
        # Multiline: drop the remainder of the start marker's own line so the
        # content begins on the following line.
        skipped = first_newline_idx + len(UNIX_NEWLINE)
        content = content[skipped:]
        content_start += skipped

    extracted: ExtractedRstType = {
        "rst_text": content,
        "row_offset": len(text[:marker_idx].splitlines()),
        "start_idx": content_start,
        "end_idx": end_idx,
    }
    return extracted