import tiktoken


def read_text_file(file_path: str) -> str:
    """Read the contents of a UTF-8 text file and return it as a string."""
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.read()


def split_string_by_tokens(text: str, max_tokens: int, model: str = "gpt-3.5-turbo") -> list:
    """Split *text* into chunks of at most *max_tokens* tokens each.

    Tokenization uses the tiktoken encoding for *model*. Each chunk is cut
    back to the last space inside the decoded token window so words are not
    split mid-word; if a window contains no space, a hard token-boundary
    split is made. Returns a list of text chunks (empty list for empty text).

    NOTE: the original implementation used ``max_tokens`` (a token count)
    directly as a *character* index into ``text`` and sliced the token list
    by that same character offset, so tokens and text drifted out of sync.
    This version works consistently in token space.
    """
    encoding = tiktoken.encoding_for_model(model)
    tokens = encoding.encode(text)
    chunks = []

    while tokens:
        # Final piece fits entirely — emit it and stop.
        if len(tokens) <= max_tokens:
            chunks.append(encoding.decode(tokens))
            break

        # Decode exactly max_tokens worth of tokens, then back off to the
        # last space so the cut lands on a word boundary.
        window = encoding.decode(tokens[:max_tokens])
        cut = window.rfind(' ')
        if cut <= 0:
            cut = len(window)  # no usable space: hard split at the token limit

        chunks.append(window[:cut])

        # Carry the trimmed-off tail back into the remaining text and
        # re-encode so the token count stays accurate for the next pass.
        remainder = window[cut:] + encoding.decode(tokens[max_tokens:])
        tokens = encoding.encode(remainder)

    return chunks


if __name__ == "__main__":
    # Read the source text and split it into ~1536-token chunks.
    input_text = read_text_file('/home/ok/Desktop/OneDrive/PDF/combined.txt')
    max_tokens_per_chunk = 1536
    result_chunks = split_string_by_tokens(input_text, max_tokens_per_chunk)
    print(result_chunks, len(result_chunks))