Perform compression in batches for texts exceeding the 8192-token limit of Llama 3
dillfrescott opened this issue · 6 comments
dillfrescott commented
This would be a nice feature!
dillfrescott commented
I've noticed that a chunk size of around 20k characters usually lands just below the token limit, though I might be slightly off.
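For what it's worth, a more direct check than counting characters would be to tokenize each chunk and compare the token count against the context size. A minimal sketch, assuming the llama-cpp-python bindings (the model path, the 8192 limit, and the vocab_only shortcut are just placeholders/assumptions here):

from llama_cpp import Llama

# Assumption: loading with vocab_only=True gives a working tokenizer
# without holding the full weights in memory.
llm = Llama(model_path="Meta-Llama-3-8B-Instruct.Q8_0.gguf", vocab_only=True)

def fits_in_context(chunk: str, limit: int = 8192) -> bool:
    # tokenize() expects bytes; the chunk fits if its token count stays within the limit.
    return len(llm.tokenize(chunk.encode("utf-8"))) <= limit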
dillfrescott commented
Here is my attempt at a batching script (with the help of ChatGPT):
import sys
import os
import subprocess

def split_file(input_file, chunk_size=20000, output_dir="pieces"):
    # Create the output directory if it doesn't exist
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # Read the content of the file
    with open(input_file, 'r', encoding='utf-8') as file:
        content = file.read()

    # Determine the number of chunks
    num_chunks = len(content) // chunk_size + (1 if len(content) % chunk_size > 0 else 0)

    # Split the content into chunks and write to separate files in the output directory
    base_filename = os.path.splitext(os.path.basename(input_file))[0]
    chunk_files = []
    for i in range(num_chunks):
        chunk_content = content[i * chunk_size:(i + 1) * chunk_size]
        chunk_filename = os.path.join(output_dir, f"{base_filename}_part{i + 1}.txt")
        with open(chunk_filename, 'w', encoding='utf-8') as chunk_file:
            chunk_file.write(chunk_content)
        chunk_files.append(chunk_filename)
        print(f"Created {chunk_filename}")

    return chunk_files

def compress_files(chunk_files, compressed_dir="compressed"):
    # Create the compressed directory if it doesn't exist
    if not os.path.exists(compressed_dir):
        os.makedirs(compressed_dir)

    # Iterate over chunk files and compress each
    for chunk_file in chunk_files:
        base_filename = os.path.splitext(os.path.basename(chunk_file))[0]
        compressed_filename = os.path.join(compressed_dir, f"{base_filename}.compressed.txt")
        command = f"llama-zip Meta-Llama-3-8B-Instruct.Q8_0.gguf -c < {chunk_file} > {compressed_filename}"
        try:
            subprocess.run(command, shell=True, check=True)
            print(f"Compressed {chunk_file} to {compressed_filename}")
        except subprocess.CalledProcessError as e:
            print(f"Error compressing {chunk_file}: {e}")

if __name__ == "__main__":
    if len(sys.argv) != 2:
        print("Usage: python script.py <file.txt>")
        sys.exit(1)

    input_file = sys.argv[1]
    if not os.path.isfile(input_file):
        print(f"File {input_file} does not exist.")
        sys.exit(1)

    chunk_files = split_file(input_file)
    compress_files(chunk_files)
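For the reverse direction, a rough sketch of a matching decompression step could look like this. It assumes llama-zip accepts a -d flag that mirrors -c (reading compressed input on stdin and writing decompressed text to stdout) and that the pieces were produced by the script above:

import os
import subprocess

def decompress_files(compressed_dir="compressed", output_file="reassembled.txt"):
    # Sort the pieces by their numeric suffix so the original order is preserved.
    pieces = sorted(
        os.listdir(compressed_dir),
        key=lambda name: int(name.split("_part")[1].split(".")[0]),
    )
    with open(output_file, "wb") as out:
        for piece in pieces:
            path = os.path.join(compressed_dir, piece)
            # Assumption: `llama-zip <model> -d` mirrors -c, reading compressed input
            # on stdin and writing the decompressed text to stdout.
            command = f"llama-zip Meta-Llama-3-8B-Instruct.Q8_0.gguf -d < {path}"
            result = subprocess.run(command, shell=True, check=True, capture_output=True)
            out.write(result.stdout)
    print(f"Reassembled {output_file}")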
AlexBuz commented
I think a batching feature is a great idea, perhaps paired with some partial context overlap with the previous batch to improve inference quality. I will look into this. Of course, the input will have to be split post-tokenization, though, as 20k characters is not guaranteed to fit within the context window.
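For illustration, token-level splitting with partial overlap could look something like the following sketch (assuming llama-cpp-python's tokenizer; the window and overlap sizes are arbitrary, not what the feature will necessarily use):

from llama_cpp import Llama

# Assumption: vocab_only=True is enough to get a working tokenizer.
llm = Llama(model_path="Meta-Llama-3-8B-Instruct.Q8_0.gguf", vocab_only=True)

def split_tokens(text: str, window: int = 8192, overlap: int = 512):
    # Tokenize first, then slice, so every batch is guaranteed to fit the context window.
    tokens = llm.tokenize(text.encode("utf-8"))
    step = window - overlap
    batches = []
    for start in range(0, len(tokens), step):
        batches.append(tokens[start:start + window])
        if start + window >= len(tokens):
            break
    return batches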
dillfrescott commented
Yes, agreed! I'm also glad you think it's a good idea!
AlexBuz commented
Arbitrarily long inputs are now supported. Enjoy!
dillfrescott commented
Awesome!! Thanks!