48 lines
1.8 KiB
Python
48 lines
1.8 KiB
Python
import os
|
|
|
|
def chunk_file(input_file, output_dir=None, start_num=1, padding=2):
    """
    Split a file into chunks and save each chunk as a separate file.

    The input is read as UTF-8 text and split on every occurrence of the
    literal separator "---". Each piece is stripped of leading and trailing
    whitespace; pieces that are empty after stripping are skipped and do not
    consume a number. Every remaining chunk is written to its own file named
    ``chunk_<n>.md``, where ``<n>`` counts up from ``start_num`` and is
    zero-padded to ``padding`` digits. Existing files with the same name are
    overwritten.

    Args:
        input_file (str): Path to the input file
        output_dir (str, optional): Directory to save chunk files. It is
            created (including parents) if it does not exist. Defaults to
            the current directory.
        start_num (int, optional): Starting number for the chunk files. Defaults to 1.
        padding (int, optional): Number of digits to pad the incremental numbers. Defaults to 2.

    Returns:
        int: The number of chunk files written.

    Raises:
        OSError: If the input file cannot be read, or the output directory or
            a chunk file cannot be created.
        UnicodeDecodeError: If the input file is not valid UTF-8.
    """
    if output_dir:
        # exist_ok=True avoids the check-then-create race of testing
        # os.path.exists() first and is a no-op for an existing directory.
        os.makedirs(output_dir, exist_ok=True)

    # Explicit encoding so the result does not depend on the platform locale.
    with open(input_file, encoding="utf-8") as f:
        content = f.read()

    chunk_count = start_num
    for chunk in content.split("---"):
        # split() has already removed every separator, so trimming suffices.
        chunk = chunk.strip()
        if not chunk:  # Skip empty chunks
            continue

        # Define output path with padded incremental number
        file_name = f"chunk_{chunk_count:0{padding}d}.md"
        if output_dir:
            outfile_path = os.path.join(output_dir, file_name)
        else:
            outfile_path = file_name

        with open(outfile_path, "w", encoding="utf-8") as outfile:
            outfile.write(chunk)

        chunk_count += 1

    return chunk_count - start_num  # Return the number of chunks written
|
|
|
|
# Example usage: split one sample markdown file when run as a script.
if __name__ == "__main__":
    # Alternative sample input that can be swapped in:
    #   /home/gra/PycharmProjects/librarian_vspace/examples/chunks/knowledge_chunks_detailed.md
    source_markdown = "/home/gra/PycharmProjects/librarian_vspace/examples/chunks/knowledge_chunks_1500.md"

    # The output directory is optional; leaving it out writes the chunk files
    # into the current directory.
    # NOTE(review): this path is absolute (rooted at "/") - confirm that is
    # intended rather than a path relative to the project.
    target_dir = "/examples/chunks/chunk_md_x"

    chunk_file(input_file=source_markdown, output_dir=target_dir)
|
|
|