Source code for edu_convokit.annotation.annotator

import pandas as pd
from typing import List, Union, Tuple
import spacy
import logging
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from edu_convokit import uptake_utils
from scipy.special import softmax
import logging
import re

from edu_convokit.constants import (
    STUDENT_REASONING_HF_MODEL_NAME,
    STUDENT_REASONING_MIN_NUM_WORDS,
    STUDENT_REASONING_MAX_INPUT_LENGTH,
    FOCUSING_HF_MODEL_NAME,
    FOCUSING_MIN_NUM_WORDS,
    FOCUSING_MAX_INPUT_LENGTH,
    UPTAKE_HF_MODEL_NAME,
    UPTAKE_MIN_NUM_WORDS_SPEAKER_A,
    HIGH_UPTAKE_THRESHOLD,
    UPTAKE_MAX_INPUT_LENGTH,
    MATH_PREFIXES,
    MATH_WORDS,
    TEACHER_TALK_MOVES_HF_MODEL_NAME,
    STUDENT_TALK_MOVES_HF_MODEL_NAME,
    TEACHER_LAUNCH_FEATURES_2_NL
)


[docs]
class Annotator:
    """
        Annotator class for edu-convokit. Contains methods for annotating data.
    """

[docs]
    def __init__(self):
        pass


    def _populate_analysis_unit(
            self,
            df: pd.DataFrame,
            analysis_unit: str,
            text_column: str,
            time_start_column: str,
            time_end_column: str,
            output_column: str,
            ) -> pd.DataFrame:
        """
        Populate output_column with number of words, sentences, or timestamps.
        """

        if analysis_unit == "words":
            df[output_column] = df[text_column].str.split().str.len()
        elif analysis_unit == "sentences":
            # Use nlp to split text into sentences
            nlp = spacy.load("en_core_web_sm")
            df[output_column] = df[text_column].apply(lambda x: len(list(nlp(x).sents)))
        elif analysis_unit == "timestamps":
            # Check type of time_start_column and time_end_column
            if df[time_start_column].dtype != "float64":
                df[time_start_column] = df[time_start_column].astype("float64")
            if df[time_end_column].dtype != "float64":
                df[time_end_column] = df[time_end_column].astype("float64")
            df[output_column] = df[time_end_column] - df[time_start_column]
        else:
            raise ValueError(f"Analysis unit {analysis_unit} not supported.")
        return df


[docs]
    def get_talktime(
            self,
            df: pd.DataFrame,
            text_column: str = None,
            analysis_unit: str = "words", # words, sentences, timestamps
            representation: str = "frequency", # frequency
            time_start_column: str = None,
            time_end_column: str = None,
            output_column: str = "talktime_analysis",
            ) -> pd.DataFrame:
        """
        Analyze talk time of speakers in a dataframe. Return original df and new dataframe with talk time analysis.

        Arguments:
            df (pd.DataFrame): dataframe to analyze
            text_column (str): name of column containing text to analyze. Only required if analysis_unit is words or sentences.
            analysis_unit (str): unit to analyze. Choose from "words", "sentences", "timestamps".
            representation (str): representation of talk time. Choose from "frequency", "proportion".
            time_start_column (str): name of column containing start time. Only required if analysis_unit is timestamps.
            time_end_column (str): name of column containing end time. Only required if analysis_unit is timestamps.
            output_column (str): name of column to store result.

        Returns:
            df (pd.DataFrame): dataframe with talk time analysis
        """ 
        assert analysis_unit in ["words", "sentences", "timestamps"], f"Analysis unit {analysis_unit} not supported."
        assert representation in ["frequency", "proportion"], f"Representation {representation} not supported."

        if text_column is not None and analysis_unit in ["words", "sentences"]:
            assert text_column in df.columns, f"Text column {text_column} not found in dataframe."

        if time_start_column is not None and analysis_unit == "timestamps":
            assert time_start_column in df.columns, f"Time start column {time_start_column} not found in dataframe."
            assert time_end_column in df.columns, f"Time end column {time_end_column} not found in dataframe."

        # First populate output_column with number of words, sentences, or timestamps
        df = self._populate_analysis_unit(df, analysis_unit, text_column, time_start_column, time_end_column, output_column)

        # Return dataframe with talk time analysis
        if representation == 'proportion':
            total = df[output_column].sum()
            df[output_column] = df[output_column] / total

        return df

    
    # HF models
    def _initialize(self, model_shortname):
        """
        Load the tokenizer and sequence-classification model for model_shortname.

        Arguments:
            model_shortname (str): identifier passed to from_pretrained for both objects.

        Returns:
            (tokenizer, model): the loaded tokenizer and the model, switched to eval mode.
        """
        hf_tokenizer = AutoTokenizer.from_pretrained(model_shortname)
        classifier = AutoModelForSequenceClassification.from_pretrained(model_shortname)
        # Inference only: put the model in evaluation mode before handing it back.
        classifier.eval()
        return hf_tokenizer, classifier

    def _get_classification_predictions(
            self,
            df: pd.DataFrame,
            text_column: str,
            output_column: str,
            model_name: str,
            min_num_words: int = 0,
            max_num_words: int = None,
            speaker_column: str = None,
            speaker_value: Union[str, List[str]] = None,
    ) -> pd.DataFrame:
        """
        Get classification predictions for a dataframe.

        Arguments:
            df: pandas dataframe
            text_column: name of column containing text to get predictions for
            output_column: name of column to store predictions
            speaker_column: name of column that contains speaker names.
            speaker_value: if speaker_column is not None, only get predictions for this speaker.
            model_name: name of model to use.
        """
        assert text_column in df.columns, f"Text column {text_column} not found in dataframe."

        if output_column in df.columns:
            logging.warning(f"Target column {output_column} already exists in dataframe. Skipping.")
            return df

        if speaker_column is not None:
            assert speaker_column in df.columns, f"Speaker column {speaker_column} not found in dataframe."

            if isinstance(speaker_value, str):
                speaker_value = [speaker_value]

        tokenizer, model = self._initialize(model_name)

        # Get predictions
        predictions = []
        for i, row in df.iterrows():
            # Skip if speaker doesn't match
            if speaker_column is not None:
                if row[speaker_column] not in speaker_value:
                    predictions.append(None)
                    continue

            text = row[text_column]

            # Skip if text is too short
            if len(text.split()) < min_num_words:
                predictions.append(None)
                continue

            with torch.no_grad():
                inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=max_num_words)
                outputs = model(**inputs)
                logits = outputs.logits
                predictions.append(logits.argmax().item())

        df[output_column] = predictions
        return df


[docs]
    def get_student_reasoning(
            self,
            df: pd.DataFrame,
            text_column: str,
            output_column: str,
            speaker_column: str = None,
            speaker_value: Union[str, List[str]] = None,
    ) -> pd.DataFrame:
        """
        Annotate utterances with the student reasoning classifier.

        Delegates to _get_classification_predictions with the student reasoning
        model name, minimum word count and maximum input length taken from the
        package constants.

        Arguments:
            df (pd.DataFrame): dataframe to analyze
            text_column (str): name of column containing text to analyze
            output_column (str): name of column to store result
            speaker_column (str): name of column containing speaker names. Only required if speaker_value is not None.
            speaker_value (str or list): if speaker_column is not None, only get predictions for this speaker.

        Returns:
            df (pd.DataFrame): dataframe with student reasoning predictions
        """
        # Remind callers which kind of utterance the classifier was trained on.
        notice = (
            "Note: This model was trained on student reasoning, so it should be used on student utterances.\n"
            "    For more details on the model, see https://arxiv.org/pdf/2211.11772.pdf"
        )
        logging.warning(notice)

        classifier_settings = dict(
            model_name=STUDENT_REASONING_HF_MODEL_NAME,
            min_num_words=STUDENT_REASONING_MIN_NUM_WORDS,
            max_num_words=STUDENT_REASONING_MAX_INPUT_LENGTH,
        )
        return self._get_classification_predictions(
            df=df,
            text_column=text_column,
            output_column=output_column,
            speaker_column=speaker_column,
            speaker_value=speaker_value,
            **classifier_settings,
        )

    

[docs]
    def get_teacher_talk_moves(
            self,
            df: pd.DataFrame,
            text_column: str,
            output_column: str,
            speaker_column: str = None,
            speaker_value: Union[str, List[str]] = None,
    ) -> pd.DataFrame:
        """
        Annotate utterances with the teacher talk moves classifier.

        Delegates to _get_classification_predictions with the teacher talk moves
        model, no minimum word count and no explicit maximum input length.

        Arguments:
            df (pd.DataFrame): dataframe to analyze
            text_column (str): name of column containing text to analyze
            output_column (str): name of column to store result
            speaker_column (str): name of column containing speaker names. Only required if speaker_value is not None.
            speaker_value (str or list): if speaker_column is not None, only get predictions for this speaker.

        Returns:
            df (pd.DataFrame): dataframe with teacher talk move predictions
        """
        # Remind callers which kind of utterance the classifier was trained on.
        notice = (
            "Note: This model was trained on teacher talk moves, so it should be used on teacher utterances.\n"
            "    For more details on the model, see https://github.com/SumnerLab/TalkMoves/tree/main"
        )
        logging.warning(notice)

        classifier_settings = dict(
            model_name=TEACHER_TALK_MOVES_HF_MODEL_NAME,
            min_num_words=0,
            max_num_words=None,
        )
        return self._get_classification_predictions(
            df=df,
            text_column=text_column,
            output_column=output_column,
            speaker_column=speaker_column,
            speaker_value=speaker_value,
            **classifier_settings,
        )



[docs]
    def get_student_talk_moves(
            self,
            df: pd.DataFrame,
            text_column: str,
            output_column: str,
            speaker_column: str = None,
            speaker_value: Union[str, List[str]] = None,
    ) -> pd.DataFrame:
        """
        Annotate utterances with the student talk moves classifier.

        Delegates to _get_classification_predictions with the student talk moves
        model, no minimum word count and no explicit maximum input length.

        Arguments:
            df (pd.DataFrame): dataframe to analyze
            text_column (str): name of column containing text to analyze
            output_column (str): name of column to store result
            speaker_column (str): name of column containing speaker names. Only required if speaker_value is not None.
            speaker_value (str or list): if speaker_column is not None, only get predictions for this speaker.

        Returns:
            df (pd.DataFrame): dataframe with student talk move predictions
        """
        # Remind callers which kind of utterance the classifier was trained on.
        notice = (
            "Note: This model was trained on student talk moves, so it should be used on student utterances.\n"
            "    For more details on the model, see https://github.com/SumnerLab/TalkMoves/tree/main"
        )
        logging.warning(notice)

        classifier_settings = dict(
            model_name=STUDENT_TALK_MOVES_HF_MODEL_NAME,
            min_num_words=0,
            max_num_words=None,
        )
        return self._get_classification_predictions(
            df=df,
            text_column=text_column,
            output_column=output_column,
            speaker_column=speaker_column,
            speaker_value=speaker_value,
            **classifier_settings,
        )



[docs]
    def get_focusing_questions(
            self,
            df: pd.DataFrame,
            text_column: str,
            output_column: str,
            speaker_column: str = None,
            speaker_value: Union[str, List[str]] = None,
    ) -> pd.DataFrame:
        """
        Annotate utterances with the focusing questions classifier.

        Delegates to _get_classification_predictions with the focusing model
        name, minimum word count and maximum input length taken from the
        package constants.

        Arguments:
            df (pd.DataFrame): dataframe to analyze
            text_column (str): name of column containing text to analyze
            output_column (str): name of column to store result
            speaker_column (str): name of column containing speaker names. Only required if speaker_value is not None.
            speaker_value (str or list): if speaker_column is not None, only get predictions for this speaker.

        Returns:
            df (pd.DataFrame): dataframe with focusing question predictions
        """
        # Remind callers which kind of utterance the classifier was trained on.
        notice = (
            "Note: This model was trained on teacher focusing questions, so it should be used on teacher utterances.\n"
            "    For more details on the model, see https://aclanthology.org/2022.bea-1.27.pdf"
        )
        logging.warning(notice)

        classifier_settings = dict(
            model_name=FOCUSING_HF_MODEL_NAME,
            min_num_words=FOCUSING_MIN_NUM_WORDS,
            max_num_words=FOCUSING_MAX_INPUT_LENGTH,
        )
        return self._get_classification_predictions(
            df=df,
            text_column=text_column,
            output_column=output_column,
            speaker_column=speaker_column,
            speaker_value=speaker_value,
            **classifier_settings,
        )


    def _get_uptake_prediction(self, model, device, instance):
        instance["attention_mask"] = [[1] * len(instance["input_ids"])]
        for key in ["input_ids", "token_type_ids", "attention_mask"]:
            instance[key] = torch.tensor(instance[key]).unsqueeze(0)  # Batch size = 1
            instance[key] = instance[key].to(device)

        output = model(input_ids=instance["input_ids"],
                        attention_mask=instance["attention_mask"],
                        token_type_ids=instance["token_type_ids"],
                        return_pooler_output=False)
        return output


[docs]
    def get_uptake(
        self,
        df: pd.DataFrame,
        text_column: str,
        output_column: str,
        speaker_column: str, # Mandatory because we are interested in measuring speaker2's uptake of speaker1's words
        speaker1: Union[str, List[str]], # speaker1 is the student
        speaker2: Union[str, List[str]], # speaker2 is the teacher
        result_type: str = "binary", # raw: uptake score, binary: 1 if uptake score > threshold, 0 otherwise
    ) -> pd.DataFrame:
        """
        Get uptake predictions for a dataframe.
        Following the implementation here:
        https://huggingface.co/ddemszky/uptake-model/blob/main/handler.py

        Each row is paired with the row immediately before it (by position, not by
        index label). A score is produced only when the previous row's speaker is
        in speaker1, the current row's speaker is in speaker2, and the previous
        text has at least UPTAKE_MIN_NUM_WORDS_SPEAKER_A words; every other row,
        including the first one, receives None. The dataframe is modified in place
        and also returned.

        Arguments:
            df (pd.DataFrame): dataframe to analyze
            text_column (str): name of column containing text to analyze
            output_column (str): name of column to store result. If it already
                exists, a warning is logged and df is returned unchanged.
            speaker_column (str): name of column containing speaker names.
            speaker1 (str or list): speaker1 is the student
            speaker2 (str or list): speaker2 is the teacher
            result_type (str): "binary" stores 1 when the score exceeds
                HIGH_UPTAKE_THRESHOLD and 0 otherwise; any other value stores the raw score.

        Returns:
            df (pd.DataFrame): dataframe with uptake predictions
        """

        logging.warning("""Note: This model was trained on teacher's uptake of student's utterances. So, speaker1 should be the student and speaker2 should be the teacher.
    For more details on the model, see https://arxiv.org/pdf/2106.03873.pdf""")

        logging.warning("""Note: It's recommended that you merge utterances from the same speaker before running this model. You can do that with edu_convokit.text_preprocessing.merge_utterances_from_same_speaker.""")

        assert text_column in df.columns, f"Text column {text_column} not found in dataframe."
        assert speaker_column in df.columns, f"Speaker column {speaker_column} not found in dataframe."

        if output_column in df.columns:
            logging.warning(f"Target column {output_column} already exists in dataframe. Skipping.")
            return df

        if isinstance(speaker1, str):
            speaker1 = [speaker1]

        if isinstance(speaker2, str):
            speaker2 = [speaker2]

        # Uptake model is run slightly differently. So this is a separate function.
        input_builder, device, model = uptake_utils._initialize(UPTAKE_HF_MODEL_NAME)

        # Work on plain lists so adjacent rows are paired by position. Using the
        # index labels from iterrows() as positions breaks on any dataframe whose
        # index is not 0..n-1 (e.g. after filtering or sorting).
        speakers = df[speaker_column].tolist()
        texts = df[text_column].tolist()

        # Default to None; the first row never has a preceding utterance.
        predictions = [None] * len(texts)

        with torch.no_grad():
            adjacent_turns = zip(speakers, texts, speakers[1:], texts[1:])
            for pos, (prev_speaker, textA, speaker, textB) in enumerate(adjacent_turns, start=1):
                # Skip missing/non-text cells (e.g. NaN) instead of crashing on .split()
                if not (isinstance(textA, str) and isinstance(textB, str)):
                    continue

                # Skip if text is too short
                if len(textA.split()) < UPTAKE_MIN_NUM_WORDS_SPEAKER_A:
                    continue

                if prev_speaker not in speaker1 or speaker not in speaker2:
                    continue

                textA = uptake_utils._get_clean_text(textA, remove_punct=False)
                textB = uptake_utils._get_clean_text(textB, remove_punct=False)

                instance = input_builder.build_inputs([textA], textB,
                                                        max_length=UPTAKE_MAX_INPUT_LENGTH,
                                                        input_str=True)
                output = self._get_uptake_prediction(model, device, instance)
                # Softmax over the output's "nsp_logits"; index 1 is used as the uptake score.
                uptake_score = softmax(output["nsp_logits"][0].tolist())[1]
                if result_type == "binary":
                    uptake_score = 1 if uptake_score > HIGH_UPTAKE_THRESHOLD else 0

                predictions[pos] = uptake_score

        df[output_column] = predictions

        return df


    # Math density
    def _load_math_terms(self):
        """
        Build the list of regex patterns used to look for math terms.

        Every term in MATH_WORDS is returned in order. A term that also appears in
        MATH_PREFIXES is wrapped in a pattern that requires a non-letter (or the
        start/end of the string) on both sides and allows an optional "s"/"es"
        suffix; all other terms are returned unchanged.

        This follows the R snippet below:

        modify_stem <- function(s) {
                return(paste0('(^|[^a-zA-Z])', s,'(s|es)?([^a-zA-Z]|$)'))
                }

        modify_list <- c('sum', 'arc', 'mass', 'digit', 'graph',
                        'liter', 'gram', 'add', 'angle', 'scale',
                        'data', 'array', 'ruler', 'meter', 'total',
                        'unit', 'prism', 'median', 'ratio', 'area')

        # Modify those entries in the glossary
        gloss[gloss %in% modify_list] <- modify_stem(gloss[gloss %in% modify_list])
        """
        def as_pattern(word):
            if word in MATH_PREFIXES:
                return f"(^|[^a-zA-Z]){word}(s|es)?([^a-zA-Z]|$)"
            return word

        return [as_pattern(word) for word in MATH_WORDS]


[docs]
    def get_math_density(
            self,
            df: pd.DataFrame,
            text_column: str,
            output_column: str,
            count_type: str = "total", # total
            result_type: str = "total", # total, proportion
    ) -> pd.DataFrame:
        """
        Get math density for a dataframe. Following the implementation here: https://edworkingpapers.com/sites/default/files/ai23-855.pdf

        Arguments:
            df (pd.DataFrame): dataframe to analyze
            text_column (str): name of column containing text to analyze
            output_column (str): name of column to store result
            count_type (str): total or unique
            result_type (str): total or proportion

        Returns:
            df (pd.DataFrame): dataframe with math density analysis
        """
        assert text_column in df.columns, f"Text column {text_column} not found in dataframe."
        # assert count_type in ["total", "unique"], f"Count type {count_type} not supported. Choose from 'total' or 'unique'."
        assert result_type in ["total", "proportion"], f"Result type {result_type} not supported. Choose from 'total' or 'proportion'."

        if output_column in df.columns:
            logging.warning(f"Result column {output_column} already exists in dataframe. Skipping.")
            return df

        math_terms = sorted(self._load_math_terms(), key=len, reverse=True)

        df = df.copy()
        df[output_column] = 0

        # Speaker 2 unique terms found
        for i, utt in df.iterrows():
            text = utt[text_column]

            # Count number of math terms in text
            total = 0

            # Check if term is already matched
            matched_positions = set()

            for term in math_terms:
                matches = list(re.finditer(term, text, re.IGNORECASE))
                matches = [match for match in matches if not any(match.start() in range(existing[0], existing[1]) for existing in matched_positions)]
                count = len(matches)
                total += count

                matched_positions.update((match.start(), match.end()) for match in matches)
                
            # Store result
            df.loc[i, output_column] = total

        return df