From 05d28147f4434e902a167124dcc8b82e14b171d7 Mon Sep 17 00:00:00 2001 From: Zhe Yu Date: Sun, 4 May 2025 20:53:03 +0800 Subject: [PATCH] fix(cli): preserve spaces and newlines in treesitter chunking. --- src/vectorcode/chunking.py | 13 +++++++++++-- tests/test_chunking.py | 13 ++++++++----- 2 files changed, 19 insertions(+), 7 deletions(-) diff --git a/src/vectorcode/chunking.py b/src/vectorcode/chunking.py index 55bb6e10..ef13e8c0 100644 --- a/src/vectorcode/chunking.py +++ b/src/vectorcode/chunking.py @@ -150,7 +150,7 @@ def __chunk_node( f"Traversing at node {node.text.decode()} at position {node.byte_range}" ) current_chunk: str = "" - + prev_node = None current_start = None for child in node.children: @@ -184,10 +184,19 @@ def __chunk_node( current_start = Point( row=child.start_point.row + 1, column=child.start_point.column ) + prev_node = child - elif len(current_chunk) + child_length <= self.config.chunk_size: + elif len(current_chunk) + child_length + 1 <= self.config.chunk_size: # Add to current chunk + if prev_node: + if prev_node.end_point.row != child.start_point.row: + current_chunk += "\n" + else: + current_chunk += " " * ( + child.start_point.column - prev_node.end_point.column + ) current_chunk += child_bytes.decode() + prev_node = child else: # Yield current chunk and start new one diff --git a/tests/test_chunking.py b/tests/test_chunking.py index c13fb46b..094b7b02 100644 --- a/tests/test_chunking.py +++ b/tests/test_chunking.py @@ -250,7 +250,7 @@ def bar(): def test_treesitter_chunker_filter_wildcard(): - chunker = TreeSitterChunker(Config(chunk_size=30, chunk_filters={"*": [".*foo.*"]})) + chunker = TreeSitterChunker(Config(chunk_size=35, chunk_filters={"*": [".*foo.*"]})) test_content = r""" def foo(): @@ -283,12 +283,12 @@ def bar(): test_file = tmp_file.name chunks = list(str(i) for i in chunker.chunk(test_file)) - assert chunks == ['functionbar()return "bar"end'] + assert chunks == ['function bar()\n return "bar"\nend'] os.remove(test_file) def test_treesitter_chunker_lua(): - chunker = TreeSitterChunker(Config(chunk_size=30)) + chunker = TreeSitterChunker(Config(chunk_size=35)) test_content = r""" function foo() return "foo" @@ -304,7 +304,10 @@ def test_treesitter_chunker_lua(): test_file = tmp_file.name chunks = list(str(i) for i in chunker.chunk(test_file)) - assert chunks == ['functionfoo()return "foo"end', 'functionbar()return "bar"end'] + assert chunks == [ + 'function foo()\n return "foo"\nend', + 'function bar()\n return "bar"\nend', + ] os.remove(test_file) @@ -403,7 +406,7 @@ def bar(): assert len(chunks) >= 2 # Should have at least 2 chunks # First chunk should contain the function definition start - assert "deffoo():" in chunks[0].text + assert "def foo():" in chunks[0].text assert chunks[0].start == Point(1, 0) # Last chunk should contain the final return statement