# Source code for edu_convokit.analyzers.qualitative_analyzer



from edu_convokit.analyzers import analyzer
from typing import List, Dict, Tuple, Union
import pandas as pd


# [docs]
class QualitativeAnalyzer(analyzer.Analyzer):


    # [docs]
    def print_examples(
            self,
            speaker_column: str,
            text_column: str,
            feature_column: str,
            df: pd.DataFrame = None,
            feature_value: Union[str, List[str]] = None, # If None, then use all values
            max_num_values: int = 2,
            max_num_examples: int = 3,
            show_k_previous_lines: int = 0,
            show_k_next_lines: int = 0,
            dropna: bool = False
        ) -> None:
        """
        Get text examples for a feature value.

        Output = 
            [(
            [(speaker, text), ...)], # previous text
            (speaker, current_text), # current text
            [(speaker, text), ...], # next text
            feature_value)
            ), ...]

        Arguments:
            speaker_column (str): name of column containing speaker names
            text_column (str): name of column containing text to get predictions for
            feature_column (str): name of column containing feature to get examples for
            df (pd.DataFrame): pandas dataframe. If None, then use self.dfs from constructor
            feature_value (Union[str, List[str]]): if not None, only get examples for this feature value
            show_k_previous_lines (int): show k previous lines
            show_k_next_lines (int): show k next lines
            dropna (bool): drop rows with NaN values in feature_column

        Returns:
            None
        """        
        examples = self._get_examples(
            df=df,
            speaker_column=speaker_column,
            text_column=text_column,
            feature_column=feature_column,
            feature_value=feature_value,
            max_num_values=max_num_values,
            max_num_examples=max_num_examples,
            show_k_previous_lines=show_k_previous_lines,
            show_k_next_lines=show_k_next_lines,
            dropna=dropna
        )
        print(self._format_examples(examples, feature_column))



    # [docs]
    def report_examples(
            self,
            speaker_column: str,
            text_column: str,
            feature_column: str,
            df: pd.DataFrame = None,
            feature_value: Union[float, List[float]] = None, # If None, then use all values
            max_num_values: int = 2,
            max_num_examples: int = 3,
            show_k_previous_lines: int = 0,
            show_k_next_lines: int = 0,
            dropna: bool = False
        ) -> str:
        """
        Get text examples for a feature value.

        Output = 
            [(
            [(speaker, text), ...)], # previous text
            (speaker, current_text), # current text
            [(speaker, text), ...], # next text
            feature_value)
            ), ...]

        Arguments:
            speaker_column (str): name of column containing speaker names
            text_column (str): name of column containing text to get predictions for
            feature_column (str): name of column containing feature to get examples for
            df (pd.DataFrame): pandas dataframe. If None, then use self.dfs from constructor
            feature_value (Union[float, List[float]]): if not None, only get examples for this feature value
            show_k_previous_lines (int): show k previous lines
            show_k_next_lines (int): show k next lines
            dropna (bool): drop rows with NaN values in feature_column

        Returns:
            str: formatted examples
        """        
        examples = self._get_examples(
            df=df,
            speaker_column=speaker_column,
            text_column=text_column,
            feature_column=feature_column,
            feature_value=feature_value,
            max_num_values=max_num_values,
            max_num_examples=max_num_examples,
            show_k_previous_lines=show_k_previous_lines,
            show_k_next_lines=show_k_next_lines,
            dropna=dropna
        )
        return self._format_examples(examples, feature_column)

    
    def _get_examples(
            self,
            speaker_column: str,
            text_column: str,
            feature_column: str,
            df: pd.DataFrame = None,
            feature_value: Union[float, List[float]] = None, # If None, then use all values
            max_num_values: int = 2,
            max_num_examples: int = 3,
            show_k_previous_lines: int = 0,
            show_k_next_lines: int = 0,
            dropna: bool = False
            ) -> List[Tuple[List[Tuple[str, str]], Tuple[str, str], List[Tuple[str, str]], float]]:
        """
        Get text examples for a feature value.

        Output = [(
            [(speaker, text), ...)], # previous text
            (speaker, current_text), # current text
            [(speaker, text), ...], # next text
            feature_value)
        ), ...]

        Arguments:
            speaker_column (str): name of column containing speaker names
            text_column (str): name of column containing text to get predictions for
            feature_column (str): name of column containing feature to get examples for
            df (pd.DataFrame): pandas dataframe. If None, then use self.dfs from constructor
            feature_value (Union[float, List[float]]): if not None, only get examples for this feature value
            show_k_previous_lines (int): show k previous lines
            show_k_next_lines (int): show k next lines
            dropna (bool): drop rows with NaN values in feature_column

        Returns:
            List[Tuple[List[Tuple[str, str]], Tuple[str, str], List[Tuple[str, str]], float]]: list of examples
        """

        if df is None:
            # Merge self.dfs into one df
            df = pd.concat(self.dfs)

        assert text_column in df.columns, f"Text column {text_column} not found in dataframe."
        assert feature_column in df.columns, f"Feature column {feature_column} not found in dataframe."
        
        if dropna:
            df = df.dropna(subset=[feature_column])

        if feature_value is None: 
            feature_value = df[feature_column].unique()

        elif isinstance(feature_value, float): 
            feature_value = [feature_value]

        # Casting types for df and feature_value
        df[text_column] = df[text_column].astype(str)

        examples = []

        num_values = 0
        for value in feature_value:
            # Get all rows with feature_value
            rows = df[df[feature_column] == value]

            
            num_examples = 0
            # Get examples
            for i, row in rows.iterrows():
                # Get previous lines
                prev_lines = []
                for j in range(max(i - show_k_previous_lines, 0), i):
                    if j < 0:
                        continue
                    prev_lines.append((df.iloc[j][speaker_column], df.iloc[j][text_column]))

                # Get next lines
                next_lines = []
                for j in range(i + 1, min(i + show_k_next_lines + 1, len(df))):
                    if j >= len(df):
                        continue
                    next_lines.append((df.iloc[j][speaker_column], df.iloc[j][text_column]))

                num_examples += 1
                examples.append((prev_lines, (row[speaker_column], row[text_column]), next_lines, row[feature_column]))

                if num_examples >= max_num_examples:
                    break
            
            if num_values >= max_num_values:
                break
            num_values += 1
        
        return examples

    def _format_examples(
            self,
            examples: List[Tuple[List[Tuple[str, str]], Tuple[str, str], List[Tuple[str, str]], str]],
            feature_column: str,
            ) -> str:
        """
        Format examples returned by get_examples.

        Output: 

        Feature value: <feature_value>
        <speaker>: <text> # Previous text
        >> <speaker>: <text> # Current text 
        <speaker>: <text> # Next text

        Feature value: <feature_value>
        ...
        """

        formatted_examples = []
        for prev_lines, current_line, next_lines, feature_value in examples:
            formatted_examples.append(f"{feature_column}: {feature_value}")
            for speaker, text in prev_lines:
                formatted_examples.append(f"{speaker}: {text}")
            formatted_examples.append(f">> {current_line[0]}: {current_line[1]}")
            for speaker, text in next_lines:
                formatted_examples.append(f"{speaker}: {text}")
            formatted_examples.append("")

        return "\n".join(formatted_examples)