1from collections.abc import ByteString, Callable
2import configparser
3import logging
4from pathlib import Path
5from typing import TypedDict
6from urllib.request import pathname2url
7
8from giturlparse import parse # type: ignore[import-untyped]
9from tree_sitter import Language, Parser, Point, Query, QueryCursor
10from tree_sitter import Node as TreeSitterNode
11
12from sphinx_codelinks.config import UNIX_NEWLINE, CommentCategory
13from sphinx_codelinks.source_discover.config import CommentType
14
# Language-specific node types for scope detection.
# Maps each comment type to the tree-sitter node type names that
# find_enclosing_scope() and find_next_scope() accept as a scope.
# The "@..." comments inside the mapping are one-line traceability markers
# and are kept verbatim.
SCOPE_NODE_TYPES: dict[CommentType, set[str]] = {
    # @Python Scope Node Types, IMPL_PY_2, impl, [FE_PY]
    CommentType.python: {"function_definition", "class_definition"},
    # @C and C++ Scope Node Types, IMPL_C_2, impl, [FE_C_SUPPORT, FE_CPP]
    CommentType.cpp: {"function_definition", "class_definition"},
    CommentType.cs: {"method_declaration", "class_declaration", "property_declaration"},
    CommentType.yaml: {"block_mapping_pair", "block_sequence_item", "document"},
    # @Rust Scope Node Types, IMPL_RUST_2, impl, [FE_RUST];
    CommentType.rust: {
        "function_item",
        "struct_item",
        "enum_item",
        "impl_item",
        "trait_item",
        "mod_item",
    },
}
33
# initialize logger
# Module-level logger named after this module; its level is pinned to INFO.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
# log to the console
# StreamHandler() without an argument writes to sys.stderr.
# NOTE(review): the handler is attached at import time; if the host
# application also configures logging, records may be emitted twice via
# propagation - confirm this is intended.
console = logging.StreamHandler()
console.setLevel(logging.INFO)
logger.addHandler(console)
41
# Web URL templates keyed by the platform name that form_https_url() reads
# from the parsed Git URL. Each template is filled with str.format using the
# fields owner, repo, rev, path and lineno.
GIT_HOST_URL_TEMPLATE: dict[str, str] = {
    "github": "https://github.com/{owner}/{repo}/blob/{rev}/{path}#L{lineno}",
    "gitlab": "https://gitlab.com/{owner}/{repo}/-/blob/{rev}/{path}#L{lineno}",
}
46
# Tree-sitter query sources, one per supported language. init_tree_sitter()
# compiles the matching one into a Query. Every pattern captures under the
# name "comment", which is the capture extract_comments() reads.

# Python: plain comments plus the string expression statements matched as
# docstrings of a module, function or class.
PYTHON_QUERY = """
    ; Match comments
    (comment) @comment

    ; Match docstrings inside modules, functions, or classes
    (module (expression_statement (string)) @comment)
    (function_definition (block (expression_statement (string)) @comment))
    (class_definition (block (expression_statement (string)) @comment))
    """
# C/C++, C# and YAML: a single `comment` node type is queried.
CPP_QUERY = """(comment) @comment"""
C_SHARP_QUERY = """(comment) @comment"""
YAML_QUERY = """(comment) @comment"""
# Rust: line comments and block comments are queried as separate node types.
RUST_QUERY = """
    (line_comment) @comment
    (block_comment) @comment
"""
63
64
def is_text_file(filepath: Path, sample_size: int = 2048) -> bool:
    """Return True if the file is likely text, False if it looks binary.

    Only the first ``sample_size`` bytes are inspected. The sample counts as
    text when it contains no NUL byte and is valid UTF-8.

    Args:
        filepath: File to inspect; it is opened in binary mode.
        sample_size: Maximum number of bytes read from the start of the file.

    Returns:
        ``True`` for a (possibly empty) UTF-8 sample without NUL bytes,
        ``False`` otherwise.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    import codecs  # noqa: PLC0415

    with filepath.open("rb") as f:
        chunk = f.read(sample_size)

    # Quick binary heuristic: null byte present
    if b"\x00" in chunk:
        return False

    # A sample that fills the whole budget may stop in the middle of a
    # multi-byte character. That is not evidence of binary content, so an
    # incomplete trailing sequence is only an error when the sample is known
    # to reach the end of the file.
    sample_reaches_eof = sample_size < 0 or len(chunk) < sample_size
    decoder = codecs.getincrementaldecoder("utf-8")()
    try:
        decoder.decode(chunk, final=sample_reaches_eof)
    except UnicodeDecodeError:
        return False
    return True
78
79
[docs] 80# @Tree-sitter parser initialization for multiple languages, IMPL_LANG_1, impl, [FE_C_SUPPORT, FE_CPP, FE_PY, FE_YAML, FE_RUST]
81def init_tree_sitter(comment_type: CommentType) -> tuple[Parser, Query]:
82 if comment_type == CommentType.cpp:
83 import tree_sitter_cpp # noqa: PLC0415
84
85 parsed_language = Language(tree_sitter_cpp.language())
86 query = Query(parsed_language, CPP_QUERY)
87 elif comment_type == CommentType.python:
88 import tree_sitter_python # noqa: PLC0415
89
90 parsed_language = Language(tree_sitter_python.language())
91 query = Query(parsed_language, PYTHON_QUERY)
92 elif comment_type == CommentType.cs:
93 import tree_sitter_c_sharp # noqa: PLC0415
94
95 parsed_language = Language(tree_sitter_c_sharp.language())
96 query = Query(parsed_language, C_SHARP_QUERY)
97 elif comment_type == CommentType.yaml:
98 import tree_sitter_yaml # noqa: PLC0415
99
100 parsed_language = Language(tree_sitter_yaml.language())
101 query = Query(parsed_language, YAML_QUERY)
102 elif comment_type == CommentType.rust:
103 import tree_sitter_rust # noqa: PLC0415
104
105 parsed_language = Language(tree_sitter_rust.language())
106 query = Query(parsed_language, RUST_QUERY)
107 else:
108 raise ValueError(f"Unsupported comment style: {comment_type}")
109 parser = Parser(parsed_language)
110 return parser, query
111
112
def wrap_read_callable_point(
    src_string: ByteString,
) -> Callable[[int, Point], ByteString]:
    """Build a read callback over ``src_string``.

    Args:
        src_string: The source code as bytes.

    Returns:
        A callable taking ``(byte_offset, point)`` that returns the one-byte
        slice of ``src_string`` starting at ``byte_offset``. The point
        argument is ignored, and an offset at or past the end yields an
        empty slice.
    """

    def read_single_byte(offset: int, _point: Point) -> ByteString:
        stop = offset + 1
        return src_string[offset:stop]

    return read_single_byte
120
121
[docs]122# @Comment extraction from source code using tree-sitter, IMPL_EXTR_1, impl, [FE_DEF]
def extract_comments(
    src_string: ByteString, parser: Parser, query: Query
) -> list[TreeSitterNode] | None:
    """Collect every node captured as ``comment`` in the given source.

    Args:
        src_string: The source code as bytes.
        parser: Parser used to build the syntax tree.
        query: Query run against the root node of that tree.

    Returns:
        The nodes captured under the name ``"comment"``, or ``None`` when the
        query produced no capture with that name.
    """
    # The source is handed to the parser through a read callback.
    tree = parser.parse(wrap_read_callable_point(src_string))
    captures: dict[str, list[TreeSitterNode]] = QueryCursor(query).captures(
        tree.root_node
    )
    return captures.get("comment")
133
134
def find_enclosing_scope(
    node: TreeSitterNode, comment_type: CommentType = CommentType.cpp
) -> TreeSitterNode | None:
    """Return the closest scope node at or above ``node``.

    Args:
        node: Node to start from; it is itself the first candidate.
        comment_type: Selects the scope node types; types without an entry in
            ``SCOPE_NODE_TYPES`` fall back to the C/C++ set.

    Returns:
        ``node`` or the first ancestor whose type is a scope type, or ``None``
        when the parent chain ends without a match.
    """
    fallback_types = SCOPE_NODE_TYPES[CommentType.cpp]
    scope_types = SCOPE_NODE_TYPES.get(comment_type, fallback_types)

    candidate: TreeSitterNode | None = node
    # Climb until a scope node is hit or the parent chain runs out.
    while candidate and candidate.type not in scope_types:
        candidate = candidate.parent
    return candidate if candidate else None
146
147
def find_next_scope(
    node: TreeSitterNode, comment_type: CommentType = CommentType.cpp
) -> TreeSitterNode | None:
    """Return the first scope node at or after ``node`` among its siblings.

    Args:
        node: Node to start from; it is itself the first candidate.
        comment_type: Selects the scope node types; types without an entry in
            ``SCOPE_NODE_TYPES`` fall back to the C/C++ set.

    Returns:
        The first matching node found while walking the following named
        siblings, or ``None`` when the siblings are exhausted. When a sibling
        is a ``block`` node, its direct named children are searched before
        the walk continues.
    """
    fallback_types = SCOPE_NODE_TYPES[CommentType.cpp]
    scope_types = SCOPE_NODE_TYPES.get(comment_type, fallback_types)

    candidate: TreeSitterNode | None = node
    while candidate:
        if candidate.type in scope_types:
            return candidate
        candidate = candidate.next_named_sibling
        if not candidate or candidate.type != "block":
            continue
        # A block sibling is not a scope itself; look one level into it.
        nested = next(
            (child for child in candidate.named_children if child.type in scope_types),
            None,
        )
        if nested is not None:
            return nested
    return None
163
164
def _find_yaml_structure_in_block_node(
    block_node: TreeSitterNode,
) -> TreeSitterNode | None:
    """Return the first mapping pair or sequence item inside ``block_node``.

    Args:
        block_node: Node whose named children are inspected.

    Returns:
        The first ``block_mapping_pair`` found in a ``block_mapping`` child,
        or the first ``block_sequence_item`` found in a ``block_sequence``
        child, whichever container comes first; ``None`` if there is none.
    """
    # Container node type -> type of the entry looked for inside it.
    entry_type_by_container = {
        "block_mapping": "block_mapping_pair",
        "block_sequence": "block_sequence_item",
    }
    for container in block_node.named_children:
        wanted_type = entry_type_by_container.get(container.type)
        if wanted_type is None:
            continue
        for entry in container.named_children:
            if entry.type == wanted_type:
                return entry
    return None
179
180
def find_yaml_next_structure(node: TreeSitterNode) -> TreeSitterNode | None:
    """Return the first YAML structure found among the following siblings.

    Args:
        node: The comment node; the search starts at its next named sibling.

    Returns:
        The first sibling that is a mapping pair, sequence item, flow mapping
        or flow sequence; or the first structure found inside a ``block_node``
        sibling or inside the ``block_node`` children of a ``document``
        sibling. ``None`` when no sibling yields a structure.
    """
    direct_types = {
        "block_mapping_pair",
        "block_sequence_item",
        "flow_mapping",
        "flow_sequence",
    }
    sibling = node.next_named_sibling
    while sibling:
        kind = sibling.type
        if kind in direct_types:
            return sibling

        # Collect the block nodes that may hold a structure for this sibling.
        if kind == "document":
            block_nodes = [
                child for child in sibling.named_children if child.type == "block_node"
            ]
        elif kind == "block_node":
            block_nodes = [sibling]
        else:
            block_nodes = []

        for block_node in block_nodes:
            found = _find_yaml_structure_in_block_node(block_node)
            if found:
                return found
        sibling = sibling.next_named_sibling
    return None
204
205
def find_yaml_prev_sibling_on_same_row(node: TreeSitterNode) -> TreeSitterNode | None:
    """Return the preceding sibling that ends on the row where ``node`` starts.

    Args:
        node: The comment node.

    Returns:
        The nearest previous named sibling whose end row equals the start row
        of ``node`` (the comment is inline with it), or ``None`` if there is
        no such sibling.
    """
    target_row = node.start_point.row
    sibling = node.prev_named_sibling
    while sibling:
        end_row = sibling.end_point.row
        if end_row == target_row:
            return sibling
        if end_row < target_row:
            # This sibling ends above the comment row; the search stops here.
            return None
        sibling = sibling.prev_named_sibling
    return None
223
224
def find_yaml_associated_structure(node: TreeSitterNode) -> TreeSitterNode | None:
    """Find the YAML structure (key-value pair, list item, etc.) a comment belongs to.

    The candidates are tried in this order:

    1. a previous sibling ending on the comment's row (inline comment),
    2. the next structure after the comment,
    3. the closest ancestor that is a mapping pair or sequence item.

    Args:
        node: The comment node.

    Returns:
        The associated node, or ``None`` when none of the three lookups
        produces one.
    """
    # Inline target first; otherwise the structure that follows the comment.
    nearby = find_yaml_prev_sibling_on_same_row(node) or find_yaml_next_structure(
        node
    )
    if nearby:
        return nearby

    # Nothing beside or after the comment: climb to the enclosing structure.
    ancestor = node.parent
    while ancestor and ancestor.type not in {
        "block_mapping_pair",
        "block_sequence_item",
    }:
        ancestor = ancestor.parent
    return ancestor if ancestor else None
245
246
def find_associated_scope(
    node: TreeSitterNode, comment_type: CommentType = CommentType.cpp
) -> TreeSitterNode | None:
    """Find the scope a comment node is associated with.

    Args:
        node: The captured comment node.
        comment_type: Comment style of the file the node comes from.

    Returns:
        For YAML, the result of the YAML-specific association. For a node
        whose type equals ``CommentCategory.docstring``, the enclosing scope.
        Otherwise the next scope, falling back to the enclosing scope when
        no next scope exists. ``None`` when nothing is found.
    """
    if comment_type == CommentType.yaml:
        # YAML uses different structure association logic
        return find_yaml_associated_structure(node)

    # NOTE(review): presumably only Python docstring captures have this node
    # type - confirm against the value of CommentCategory.docstring.
    if node.type == CommentCategory.docstring:
        return find_enclosing_scope(node, comment_type)

    # General comments regardless of comment types: prefer what follows the
    # comment, then what surrounds it.
    return find_next_scope(node, comment_type) or find_enclosing_scope(
        node, comment_type
    )
263
264
265def locate_git_root(src_dir: Path) -> Path | None:
266 """Traverse upwards to find git root."""
267 current = src_dir.resolve()
268 parents = list(current.parents)
269 parents.append(current)
270 for parent in parents:
271 if (parent / ".git").exists() and (parent / ".git").is_dir():
272 return parent
273 logger.warning(f"git root is not found in the parent of {src_dir}")
274 return None
275
276
277def get_remote_url(git_root: Path, remote_name: str = "origin") -> str | None:
278 """Get remote url from .git/config."""
279 config_path = git_root / ".git" / "config"
280 if not config_path.exists():
281 logging.warning(f"{config_path} does not exist")
282 return None
283
284 config = configparser.ConfigParser(allow_no_value=True, strict=False)
285 config.read(config_path)
286 section = f'remote "{remote_name}"'
287 if section in config and "url" in config[section]:
288 url: str = config[section]["url"]
289 return url
290 logger.warning(f"remote-url is not found in {config_path}")
291 return None
292
293
294def get_current_rev(git_root: Path) -> str | None:
295 """Get current commit rev from .git/HEAD."""
296 head_path = git_root / ".git" / "HEAD"
297 if not head_path.exists():
298 logging.warning(f"{head_path} does not exist")
299 return None
300 head_content = head_path.read_text().strip()
301 if not head_content.startswith("ref: "):
302 logging.warning(f"Expect starting with 'ref: ' in {head_path}")
303 return None
304
305 ref_path = git_root / ".git" / head_content.split(":", 1)[1].strip()
306 if not ref_path.exists():
307 logging.warning(f"{ref_path} does not exist")
308 return None
309 return ref_path.read_text().strip()
310
311
def form_https_url(
    git_url: str, rev: str, project_path: Path, filepath: Path, lineno: int
) -> str | None:
    """Build a web URL that points to a line of a file on the Git host.

    Args:
        git_url: Remote URL of the repository; it is parsed to obtain the
            platform, owner and repository name.
        rev: Revision inserted into the URL.
        project_path: Repository root the file path is made relative to.
        filepath: File to link to; it must be located under ``project_path``.
        lineno: Line number used as the URL anchor.

    Returns:
        The formatted URL for a platform listed in ``GIT_HOST_URL_TEMPLATE``.
        For any other platform a warning is logged and ``git_url`` is
        returned unchanged.

    Raises:
        ValueError: If ``filepath`` is not located under ``project_path``.
    """
    parsed_url = parse(git_url)
    template = GIT_HOST_URL_TEMPLATE.get(parsed_url.platform)
    if not template:
        logger.warning(f"Unsupported Git host: {parsed_url.platform}")
        return git_url

    # Both paths are made absolute so that a relative project_path does not
    # make relative_to() fail against the absolute file path.
    relative_path = filepath.absolute().relative_to(project_path.absolute())
    return template.format(
        owner=parsed_url.owner,
        repo=parsed_url.repo,
        rev=rev,
        path=pathname2url(str(relative_path)),
        lineno=str(lineno),
    )
328
329
def remove_leading_sequences(text: str, leading_sequences: list[str]) -> str:
    """Cut each line of ``text`` after its comment leading sequence.

    For every line, the sequences are tried in the given order. The first
    one that occurs anywhere in the line wins, and everything up to and
    including its first occurrence is dropped. Lines containing none of the
    sequences are kept unchanged. Line endings are preserved.

    Args:
        text: Text to process, possibly spanning several lines.
        leading_sequences: Candidate sequences, in priority order.

    Returns:
        The processed lines joined back into one string.
    """

    def cut_line(line: str) -> str:
        for sequence in leading_sequences:
            position = line.find(sequence)
            if position != -1:
                return line[position + len(sequence) :]
        return line

    return "".join(cut_line(line) for line in text.splitlines(keepends=True))
349
350
class ExtractedRstType(TypedDict):
    """Result of ``extract_rst``: the extracted text and where it was found."""

    # The rst text found between the start and end markers.
    rst_text: str
    # Number of lines in the comment text that precede the start marker.
    row_offset: int
    # Index in the comment text at which ``rst_text`` begins.
    start_idx: int
    # Index in the comment text of the last occurrence of the end marker.
    end_idx: int
356
357
[docs]358# @Extract reStructuredText blocks embedded in comments, IMPL_RST_1, impl, [FE_RST_EXTRACTION]
def extract_rst(
    text: str, start_marker: str, end_marker: str
) -> ExtractedRstType | None:
    """Extract the rst text wrapped by two markers in a comment.

    The text between the first ``start_marker`` and the last ``end_marker``
    is taken. Two use cases are supported:

    1. Start marker and end marker on the same line.

       The rst text is everything between the two markers, so no leading
       characters have to be removed.
       E.g.
       @rst .. admonition:: title here @endrst

    2. Start marker and end marker on different lines.

       The rst text starts on the line after the start marker; the rest of
       the start marker's line is dropped. It ends right before the end
       marker.
       E.g.
       @rst
       .. admonition:: title here
          :collapsible: open

          This example is collapsible, and initially open.
       @endrst

    Args:
        text: The comment text to search.
        start_marker: Marker that opens the rst text.
        end_marker: Marker that closes the rst text.

    Returns:
        ``None`` when a marker is missing or the wrapped text is blank.
        Otherwise a dict with the rst text, the number of lines before the
        start marker (``row_offset``), the index where the rst text begins
        (``start_idx``) and the index of the end marker (``end_idx``).
    """
    start_idx = text.find(start_marker)
    end_idx = text.rfind(end_marker)
    if -1 in (start_idx, end_idx):
        return None

    content_start = start_idx + len(start_marker)
    rst_text = text[content_start:end_idx]
    if not rst_text.strip():
        # empty string is out of the interest
        return None

    newline_pos = rst_text.find(UNIX_NEWLINE)
    if newline_pos != -1:
        # Multiline: skip the remainder of the start marker's line.
        skipped = newline_pos + len(UNIX_NEWLINE)
        rst_text = rst_text[skipped:]
        content_start += skipped

    extracted: ExtractedRstType = {
        "rst_text": rst_text,
        "row_offset": len(text[:start_idx].splitlines()),
        "start_idx": content_start,
        "end_idx": end_idx,
    }
    return extracted