# Source code for edu_convokit.preprocessors.token_preprocessor

import pandas as pd
from typing import Any, List, Union, Tuple
import tiktoken


# [docs]
class TokenPreprocessor:
    """
    Count tokens with tiktoken and format transcripts to fit a token budget.

    The instance is bound to a single model name (``self.model``), which
    selects the tiktoken encoding and the per-message overhead used when
    counting chat messages.
    """

    # Encoding used when tiktoken does not recognise the model name.
    _FALLBACK_ENCODING = "cl100k_base"

    # model snapshot -> (tokens_per_message, tokens_per_name)
    _SNAPSHOT_OVERHEAD = {
        "gpt-3.5-turbo-0613": (3, 1),
        "gpt-3.5-turbo-16k-0613": (3, 1),
        "gpt-4-0314": (3, 1),
        "gpt-4-32k-0314": (3, 1),
        "gpt-4-0613": (3, 1),
        "gpt-4-32k-0613": (3, 1),
        # every message follows <|start|>{role/name}\n{content}<|end|>\n
        # and if there's a name, the role is omitted (hence -1)
        "gpt-3.5-turbo-0301": (4, -1),
    }

    def __init__(self, model: str):
        """
        Arguments:
            model (str): model name passed to tiktoken to select an encoding
        """
        self.model = model

    def _get_encoding(self):
        """
        Return the tiktoken encoding for ``self.model``.

        Falls back to cl100k_base (with a printed warning) when tiktoken does
        not know the model. The result is cached per model name so repeated
        calls (e.g. once per transcript row) neither re-resolve the encoding
        nor repeat the warning.
        """
        # getattr: the cache attribute is created lazily, so it may not exist yet.
        cached = getattr(self, "_encoding_cache", None)
        if cached is not None and cached[0] == self.model:
            return cached[1]

        try:
            encoding = tiktoken.encoding_for_model(self.model)
        except KeyError:
            print("Warning: model not found. Using cl100k_base encoding.")
            encoding = tiktoken.get_encoding(self._FALLBACK_ENCODING)

        self._encoding_cache = (self.model, encoding)
        return encoding

    def _get_message_overhead(self) -> Tuple[int, int]:
        """
        Return ``(tokens_per_message, tokens_per_name)`` for ``self.model``.

        Generic "gpt-3.5-turbo*" / "gpt-4*" names use the overhead of the
        -0613 snapshot. ``self.model`` is never modified.

        Raises:
            NotImplementedError: if the model is not a supported chat model
        """
        if self.model in self._SNAPSHOT_OVERHEAD:
            return self._SNAPSHOT_OVERHEAD[self.model]

        if "gpt-3.5-turbo" in self.model:
            print("Warning: gpt-3.5-turbo may update over time. Returning num tokens assuming gpt-3.5-turbo-0613.")
            return self._SNAPSHOT_OVERHEAD["gpt-3.5-turbo-0613"]

        if "gpt-4" in self.model:
            print("Warning: gpt-4 may update over time. Returning num tokens assuming gpt-4-0613.")
            return self._SNAPSHOT_OVERHEAD["gpt-4-0613"]

        raise NotImplementedError(
            f"""get_num_tokens_from_messages() is not implemented for model {self.model}. See https://github.com/openai/openai-python/blob/main/chatml.md for information on how messages are converted to tokens."""
        )

    def _format_text_for_token_counting(
            self,
            text: Union[str, List[str], List[dict]],
    ) -> List[dict]:
        """
        Format text for token counting. Final output is a list of dictionaries with keys "role" and "content".

        A single string, or each string of a list, becomes its own
        ``{"role": "user", "content": <string>}`` message. A list of
        dictionaries is validated and returned unchanged. An empty list
        yields an empty list.

        Raises:
            AssertionError: if a dictionary lacks "role" or "content"
            TypeError: if text is not a string, a list of strings, or a list of dictionaries
        """
        if isinstance(text, str):
            text = [text]

        if not text:
            return []

        if all(isinstance(item, str) for item in text):
            # One message per string, so "content" is always a str the encoder accepts.
            return [{"role": "user", "content": item} for item in text]

        if all(isinstance(item, dict) for item in text):
            contains_keys = all(key in message for message in text for key in ("role", "content"))
            assert contains_keys, "Each dictionary in text must contain keys 'role' and 'content'."
            return text

        raise TypeError("text must be a string, a list of strings, or a list of dictionaries.")

    def get_num_tokens_from_string(
            self,
            string: str,
    ) -> int:
        """
        Returns the number of tokens in a text string. From https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb.

        Arguments:
            string (str): text to tokenize

        Returns:
            int: number of tokens in string under the encoding for self.model
                (cl100k_base if the model is unknown to tiktoken)
        """
        return len(self._get_encoding().encode(string))

    def get_num_tokens_from_messages(
            self,
            messages: Union[str, List[str], List[dict]],
    ) -> int:
        """
        Return the number of tokens used by a list of chat messages. Code adapted from https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb

        Arguments:
            messages (Union[str, List[str], List[dict]]): string, list of strings
                (each treated as one "user" message), or list of dictionaries
                with keys "role" and "content"

        Returns:
            int: total number of tokens, including per-message overhead and
                the tokens that prime the reply

        Raises:
            NotImplementedError: if self.model is not a supported chat model
        """
        messages = self._format_text_for_token_counting(messages)
        encoding = self._get_encoding()
        tokens_per_message, tokens_per_name = self._get_message_overhead()

        num_tokens = 3  # every reply is primed with <|start|>assistant<|message|>
        for message in messages:
            num_tokens += tokens_per_message
            for key, value in message.items():
                num_tokens += len(encoding.encode(value))
                if key == "name":
                    num_tokens += tokens_per_name
        return num_tokens

    def format_transcript_within_budget(
            self,
            df: pd.DataFrame,
            text_column: str,
            speaker_column: str,
            max_token_budget: int,
            format_template: str = "{speaker}: {text}",
            add_line_numbers: bool = False, # {line} must be in format_template
            print_num_tokens: bool = False,
    ) -> str:
        """
        Format a transcript within a token budget.

        Rows are formatted in dataframe order, one per line, and appended
        until the next row would exceed max_token_budget; that row and all
        later rows are dropped.

        Arguments:
            df (pd.DataFrame): pandas dataframe
            text_column (str): name of column containing text
            speaker_column (str): name of column containing speaker names
            max_token_budget (int): maximum number of tokens
            format_template (str): format string; must contain {speaker} and
                {text}, and may contain {line} (0-based line number)
            add_line_numbers (bool): whether to add line numbers; requires
                {line} in format_template
            print_num_tokens (bool): whether to print the number of tokens

        Returns:
            str: formatted string (without a trailing newline)
        """
        assert text_column in df.columns, f"Text column {text_column} not found in dataframe."
        assert speaker_column in df.columns, f"Speaker column {speaker_column} not found in dataframe."
        assert "{speaker}" in format_template, "format_template must contain {speaker}."
        assert "{text}" in format_template, "format_template must contain {text}."
        if add_line_numbers:
            assert "{line}" in format_template, "format_template must contain {line}."

        kept_lines = []
        num_tokens = 0
        # Naive approach: add as many rows as possible until max_token_budget is reached
        for _, row in df.iterrows():
            next_text = format_template.format(
                speaker=row[speaker_column],
                text=row[text_column],
                line=len(kept_lines)) + "\n"
            next_text_num_tokens = self.get_num_tokens_from_string(next_text)

            if num_tokens + next_text_num_tokens > max_token_budget:
                break

            kept_lines.append(next_text)
            num_tokens += next_text_num_tokens

        # Remove last newline
        text = "".join(kept_lines)[:-1]
        if print_num_tokens:
            print(f"Number of tokens: {num_tokens}")
        return text