2025-05-24 12:15:48 +02:00

48 lines
1.8 KiB
Python

import os
def chunk_file(input_file, output_dir=None, start_num=1, padding=2,
               delimiter="---", encoding="utf-8"):
    """
    Split a file into chunks and save each chunk as a separate file.

    The input is read fully into memory, split on every occurrence of
    ``delimiter``, and each non-empty piece (after stripping surrounding
    whitespace) is written to ``chunk_<number>.md``. Existing files with the
    same name are overwritten.

    Args:
        input_file (str): Path to the input file.
        output_dir (str, optional): Directory to save chunk files. It is
            created (including parents) if missing. Defaults to the current
            directory.
        start_num (int, optional): Starting number for the chunk files.
            Defaults to 1.
        padding (int, optional): Minimum number of digits for the zero-padded
            incremental numbers. Defaults to 2.
        delimiter (str, optional): Separator marking chunk boundaries.
            Defaults to "---".
        encoding (str, optional): Text encoding used for both reading the
            input and writing the chunks. Defaults to "utf-8".

    Returns:
        int: The number of chunk files written.

    Raises:
        ValueError: If ``delimiter`` is empty.
        OSError: If the input cannot be read or an output cannot be written.
    """
    if not delimiter:
        raise ValueError("delimiter must be a non-empty string")

    if output_dir:
        # exist_ok avoids the check-then-create race of a separate exists() test.
        os.makedirs(output_dir, exist_ok=True)

    # Explicit encoding keeps results independent of the platform locale.
    with open(input_file, encoding=encoding) as f:
        content = f.read()

    chunk_count = start_num
    for chunk in content.split(delimiter):
        chunk = chunk.strip()
        if not chunk:  # Skip empty chunks so numbering stays contiguous
            continue

        # Define output path with padded incremental number
        file_name = f'chunk_{chunk_count:0{padding}d}.md'
        if output_dir:
            outfile_path = os.path.join(output_dir, file_name)
        else:
            outfile_path = file_name

        with open(outfile_path, 'w', encoding=encoding) as outfile:
            outfile.write(chunk)
        chunk_count += 1

    return chunk_count - start_num  # Number of chunks written
# Example usage: split one knowledge-chunk markdown file into numbered files.
if __name__ == "__main__":
    # Alternative input that was used previously:
    # "/home/gra/PycharmProjects/librarian_vspace/examples/chunks/knowledge_chunks_detailed.md"
    input_file = "/home/gra/PycharmProjects/librarian_vspace/examples/chunks/knowledge_chunks_1500.md"

    # The output directory is optional; when omitted, chunks are written to
    # the current working directory.
    # NOTE(review): this path is absolute (rooted at "/") - confirm it is not
    # meant to be relative to the project directory.
    output_dir = "/examples/chunks/chunk_md_x"

    chunk_file(input_file, output_dir)