import tiktoken


def read_text_file(file_path: str) -> str:
    """Read the contents of a UTF-8 text file and return it as a string."""
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.read()


def split_string_by_tokens(text: str, max_tokens: int, model: str = "gpt-3.5-turbo") -> list:
    """Split *text* into chunks of at most *max_tokens* tokens each.

    Tokenization uses the tiktoken encoding for *model*. Each chunk is cut
    back to the last space inside the decoded token window so words are not
    split mid-word; if a window contains no space, a hard token-boundary
    split is made. Returns a list of text chunks (empty list for empty text).

    NOTE: the original implementation used ``max_tokens`` (a token count)
    directly as a *character* index into ``text`` and sliced the token list
    by that same character offset, so tokens and text drifted out of sync.
    This version works consistently in token space.
    """
    encoding = tiktoken.encoding_for_model(model)
    tokens = encoding.encode(text)
    chunks = []

    while tokens:
        # Final piece fits entirely — emit it and stop.
        if len(tokens) <= max_tokens:
            chunks.append(encoding.decode(tokens))
            break

        # Decode exactly max_tokens worth of tokens, then back off to the
        # last space so the cut lands on a word boundary.
        window = encoding.decode(tokens[:max_tokens])
        cut = window.rfind(' ')
        if cut <= 0:
            cut = len(window)  # no usable space: hard split at the token limit

        chunks.append(window[:cut])

        # Carry the trimmed-off tail back into the remaining text and
        # re-encode so the token count stays accurate for the next pass.
        remainder = window[cut:] + encoding.decode(tokens[max_tokens:])
        tokens = encoding.encode(remainder)

    return chunks


if __name__ == "__main__":
    # Read the source text and split it into ~1536-token chunks.
    input_text = read_text_file('/home/ok/Desktop/OneDrive/PDF/combined.txt')
    max_tokens_per_chunk = 1536
    result_chunks = split_string_by_tokens(input_text, max_tokens_per_chunk)
    print(result_chunks, len(result_chunks))