Source code for edu_convokit.annotation.annotator

import pandas as pd
from typing import List, Union, Tuple
import spacy
import logging
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from edu_convokit import uptake_utils
from scipy.special import softmax
import logging
import re

from edu_convokit.constants import (
    STUDENT_REASONING_HF_MODEL_NAME,
    STUDENT_REASONING_MIN_NUM_WORDS,
    STUDENT_REASONING_MAX_INPUT_LENGTH,
    FOCUSING_HF_MODEL_NAME,
    FOCUSING_MIN_NUM_WORDS,
    FOCUSING_MAX_INPUT_LENGTH,
    UPTAKE_HF_MODEL_NAME,
    UPTAKE_MIN_NUM_WORDS_SPEAKER_A,
    HIGH_UPTAKE_THRESHOLD,
    UPTAKE_MAX_INPUT_LENGTH,
    MATH_PREFIXES,
    MATH_WORDS,
    TEACHER_TALK_MOVES_HF_MODEL_NAME,
    STUDENT_TALK_MOVES_HF_MODEL_NAME,
    TEACHER_LAUNCH_FEATURES_2_NL
)

class Annotator:
    """
    Annotator class for edu-convokit.

    Contains methods for annotating transcript dataframes: talk time,
    Hugging Face classifier predictions (student reasoning, talk moves,
    focusing questions), conversational uptake, and math-term counts.
    """

    def __init__(self):
        # spaCy pipeline used for sentence splitting; loaded on first use so
        # that constructing an Annotator stays cheap.
        self._nlp = None

    def _populate_analysis_unit(
            self,
            df: pd.DataFrame,
            analysis_unit: str,
            text_column: str,
            time_start_column: str,
            time_end_column: str,
            output_column: str,
            ) -> pd.DataFrame:
        """
        Populate output_column with number of words, sentences, or timestamps.

        Arguments:
            df (pd.DataFrame): dataframe to annotate. It is modified in place.
            analysis_unit (str): "words", "sentences" or "timestamps".
            text_column (str): column read for "words" and "sentences".
            time_start_column (str): start-time column read for "timestamps".
            time_end_column (str): end-time column read for "timestamps".
            output_column (str): name of column to store result.

        Returns:
            df (pd.DataFrame): the same dataframe with output_column filled in.

        Raises:
            ValueError: if analysis_unit is not one of the supported values.
        """
        if analysis_unit == "words":
            # Whitespace-separated token count per utterance.
            df[output_column] = df[text_column].str.split().str.len()
        elif analysis_unit == "sentences":
            # Use nlp to split text into sentences. The pipeline is cached on
            # the instance so it is not reloaded on every call.
            nlp = getattr(self, "_nlp", None)
            if nlp is None:
                nlp = spacy.load("en_core_web_sm")
                self._nlp = nlp
            df[output_column] = df[text_column].apply(lambda x: len(list(nlp(x).sents)))
        elif analysis_unit == "timestamps":
            # Cast both time columns to float64 (in place) before subtracting.
            for column in (time_start_column, time_end_column):
                if df[column].dtype != "float64":
                    df[column] = df[column].astype("float64")
            df[output_column] = df[time_end_column] - df[time_start_column]
        else:
            raise ValueError(f"Analysis unit {analysis_unit} not supported.")
        return df

    def get_talktime(
            self,
            df: pd.DataFrame,
            text_column: str = None,
            analysis_unit: str = "words",  # words, sentences, timestamps
            representation: str = "frequency",  # frequency, proportion
            time_start_column: str = None,
            time_end_column: str = None,
            output_column: str = "talktime_analysis",
            ) -> pd.DataFrame:
        """
        Analyze talk time of speakers in a dataframe.

        The result is written to output_column of df (df is modified in
        place) and df is returned.

        Arguments:
            df (pd.DataFrame): dataframe to analyze
            text_column (str): name of column containing text to analyze. Only required if analysis_unit is words or sentences.
            analysis_unit (str): unit to analyze. Choose from "words", "sentences", "timestamps".
            representation (str): representation of talk time. Choose from "frequency", "proportion".
                "proportion" divides every row by the sum over all rows of df.
            time_start_column (str): name of column containing start time. Only required if analysis_unit is timestamps.
            time_end_column (str): name of column containing end time. Only required if analysis_unit is timestamps.
            output_column (str): name of column to store result.

        Returns:
            df (pd.DataFrame): dataframe with talk time analysis

        Raises:
            AssertionError: if an argument is unsupported, or a column that the
                chosen analysis_unit needs is missing or not provided.
        """
        assert analysis_unit in ["words", "sentences", "timestamps"], f"Analysis unit {analysis_unit} not supported."
        assert representation in ["frequency", "proportion"], f"Representation {representation} not supported."

        if analysis_unit in ["words", "sentences"]:
            # Fail early with a clear message instead of a KeyError on df[None].
            assert text_column is not None, f"text_column is required when analysis_unit is {analysis_unit}."
            assert text_column in df.columns, f"Text column {text_column} not found in dataframe."
        else:
            assert time_start_column is not None and time_end_column is not None, \
                "time_start_column and time_end_column are required when analysis_unit is timestamps."
            assert time_start_column in df.columns, f"Time start column {time_start_column} not found in dataframe."
            assert time_end_column in df.columns, f"Time end column {time_end_column} not found in dataframe."

        # First populate output_column with number of words, sentences, or timestamps
        df = self._populate_analysis_unit(
            df, analysis_unit, text_column, time_start_column, time_end_column, output_column)

        # Return dataframe with talk time analysis
        if representation == 'proportion':
            total = df[output_column].sum()
            df[output_column] = df[output_column] / total
        return df

    # HF models
    def _initialize(self, model_shortname):
        """
        Load the tokenizer and sequence-classification model for model_shortname.

        Returns:
            (tokenizer, model): the model is put in eval mode.
        """
        # Load model directly
        tokenizer = AutoTokenizer.from_pretrained(model_shortname)
        model = AutoModelForSequenceClassification.from_pretrained(model_shortname)
        model.eval()
        return tokenizer, model

    def _get_classification_predictions(
            self,
            df: pd.DataFrame,
            text_column: str,
            output_column: str,
            model_name: str,
            min_num_words: int = 0,
            max_num_words: int = None,
            speaker_column: str = None,
            speaker_value: Union[str, List[str]] = None,
            ) -> pd.DataFrame:
        """
        Get classification predictions for a dataframe.

        Each selected utterance is tokenized and classified on its own; the
        argmax class index is stored. Rows that are skipped get None.

        Arguments:
            df: pandas dataframe. It is modified in place.
            text_column: name of column containing text to get predictions for
            output_column: name of column to store predictions. If it already
                exists, a warning is logged and df is returned untouched.
            model_name: name of model to use.
            min_num_words: utterances with fewer whitespace-separated words are skipped.
            max_num_words: passed to the tokenizer as max_length (with truncation on).
            speaker_column: name of column that contains speaker names.
            speaker_value: if speaker_column is not None, only get predictions for this speaker.
                When left as None, no speaker filtering is applied.

        Returns:
            df: dataframe with output_column added.
        """
        assert text_column in df.columns, f"Text column {text_column} not found in dataframe."

        if output_column in df.columns:
            logging.warning(f"Target column {output_column} already exists in dataframe. Skipping.")
            return df

        if speaker_column is not None:
            assert speaker_column in df.columns, f"Speaker column {speaker_column} not found in dataframe."

        if isinstance(speaker_value, str):
            speaker_value = [speaker_value]

        # Only filter when there is something to filter on; otherwise
        # `speaker not in None` would raise a TypeError.
        filter_speakers = speaker_column is not None and speaker_value is not None

        tokenizer, model = self._initialize(model_name)

        texts = df[text_column].tolist()
        speakers = df[speaker_column].tolist() if filter_speakers else [None] * len(texts)

        # Get predictions
        predictions = []
        with torch.no_grad():
            for text, speaker in zip(texts, speakers):
                # Skip if speaker doesn't match
                if filter_speakers and speaker not in speaker_value:
                    predictions.append(None)
                    continue

                # Skip missing / non-text cells and text that is too short
                if not isinstance(text, str) or len(text.split()) < min_num_words:
                    predictions.append(None)
                    continue

                inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True,
                                   max_length=max_num_words)
                logits = model(**inputs).logits
                predictions.append(logits.argmax().item())

        df[output_column] = predictions
        return df

    def get_student_reasoning(
            self,
            df: pd.DataFrame,
            text_column: str,
            output_column: str,
            speaker_column: str = None,
            speaker_value: Union[str, List[str]] = None,
            ) -> pd.DataFrame:
        """
        Get student reasoning predictions for a dataframe.

        Arguments:
            df (pd.DataFrame): dataframe to analyze
            text_column (str): name of column containing text to analyze
            output_column (str): name of column to store result
            speaker_column (str): name of column containing speaker names. Only required if speaker_value is not None.
            speaker_value (str or list): if speaker_column is not None, only get predictions for this speaker.

        Returns:
            df (pd.DataFrame): dataframe with student reasoning predictions
        """
        # Print out note that the predictions should only be run on student reasoning as that's what the model was trained on.
        logging.warning(
            "Note: This model was trained on student reasoning, so it should be used on student utterances. "
            "For more details on the model, see https://arxiv.org/pdf/2211.11772.pdf")

        return self._get_classification_predictions(
            df=df,
            text_column=text_column,
            output_column=output_column,
            model_name=STUDENT_REASONING_HF_MODEL_NAME,
            min_num_words=STUDENT_REASONING_MIN_NUM_WORDS,
            max_num_words=STUDENT_REASONING_MAX_INPUT_LENGTH,
            speaker_column=speaker_column,
            speaker_value=speaker_value
        )

    def get_teacher_talk_moves(
            self,
            df: pd.DataFrame,
            text_column: str,
            output_column: str,
            speaker_column: str = None,
            speaker_value: Union[str, List[str]] = None,
            ) -> pd.DataFrame:
        """
        Get teacher talk move predictions for a dataframe.

        Arguments:
            df (pd.DataFrame): dataframe to analyze
            text_column (str): name of column containing text to analyze
            output_column (str): name of column to store result
            speaker_column (str): name of column containing speaker names. Only required if speaker_value is not None.
            speaker_value (str or list): if speaker_column is not None, only get predictions for this speaker.

        Returns:
            df (pd.DataFrame): dataframe with teacher talk move predictions
        """
        logging.warning(
            "Note: This model was trained on teacher talk moves, so it should be used on teacher utterances. "
            "For more details on the model, see https://github.com/SumnerLab/TalkMoves/tree/main")

        return self._get_classification_predictions(
            df=df,
            text_column=text_column,
            output_column=output_column,
            model_name=TEACHER_TALK_MOVES_HF_MODEL_NAME,
            min_num_words=0,
            max_num_words=None,
            speaker_column=speaker_column,
            speaker_value=speaker_value
        )

    def get_student_talk_moves(
            self,
            df: pd.DataFrame,
            text_column: str,
            output_column: str,
            speaker_column: str = None,
            speaker_value: Union[str, List[str]] = None,
            ) -> pd.DataFrame:
        """
        Get student talk move predictions for a dataframe.

        Arguments:
            df (pd.DataFrame): dataframe to analyze
            text_column (str): name of column containing text to analyze
            output_column (str): name of column to store result
            speaker_column (str): name of column containing speaker names. Only required if speaker_value is not None.
            speaker_value (str or list): if speaker_column is not None, only get predictions for this speaker.

        Returns:
            df (pd.DataFrame): dataframe with student talk move predictions
        """
        logging.warning(
            "Note: This model was trained on student talk moves, so it should be used on student utterances. "
            "For more details on the model, see https://github.com/SumnerLab/TalkMoves/tree/main")

        return self._get_classification_predictions(
            df=df,
            text_column=text_column,
            output_column=output_column,
            model_name=STUDENT_TALK_MOVES_HF_MODEL_NAME,
            min_num_words=0,
            max_num_words=None,
            speaker_column=speaker_column,
            speaker_value=speaker_value
        )

    def get_focusing_questions(
            self,
            df: pd.DataFrame,
            text_column: str,
            output_column: str,
            speaker_column: str = None,
            speaker_value: Union[str, List[str]] = None,
            ) -> pd.DataFrame:
        """
        Get focusing question predictions for a dataframe.

        Arguments:
            df (pd.DataFrame): dataframe to analyze
            text_column (str): name of column containing text to analyze
            output_column (str): name of column to store result
            speaker_column (str): name of column containing speaker names. Only required if speaker_value is not None.
            speaker_value (str or list): if speaker_column is not None, only get predictions for this speaker.

        Returns:
            df (pd.DataFrame): dataframe with focusing question predictions
        """
        logging.warning(
            "Note: This model was trained on teacher focusing questions, so it should be used on teacher utterances. "
            "For more details on the model, see https://aclanthology.org/2022.bea-1.27.pdf")

        return self._get_classification_predictions(
            df=df,
            text_column=text_column,
            output_column=output_column,
            model_name=FOCUSING_HF_MODEL_NAME,
            min_num_words=FOCUSING_MIN_NUM_WORDS,
            max_num_words=FOCUSING_MAX_INPUT_LENGTH,
            speaker_column=speaker_column,
            speaker_value=speaker_value
        )

    def _get_uptake_prediction(self, model, device, instance):
        """
        Run the uptake model on one prepared instance and return its output.

        instance is updated in place: an all-ones attention mask as long as
        instance["input_ids"] is added, and "input_ids", "token_type_ids" and
        "attention_mask" are converted to tensors with a leading dimension
        added and moved to device.
        """
        instance["attention_mask"] = [[1] * len(instance["input_ids"])]
        for key in ["input_ids", "token_type_ids", "attention_mask"]:
            instance[key] = torch.tensor(instance[key]).unsqueeze(0)  # Batch size = 1
            instance[key] = instance[key].to(device)

        output = model(input_ids=instance["input_ids"],
                       attention_mask=instance["attention_mask"],
                       token_type_ids=instance["token_type_ids"],
                       return_pooler_output=False)
        return output

    def get_uptake(
            self,
            df: pd.DataFrame,
            text_column: str,
            output_column: str,
            speaker_column: str,  # Mandatory because we are interested in measuring speaker2's uptake of speaker1's words
            speaker1: Union[str, List[str]],  # speaker1 is the student
            speaker2: Union[str, List[str]],  # speaker2 is the teacher
            result_type: str = "binary",  # raw: uptake score, binary: 1 if uptake score > threshold, 0 otherwise
            ) -> pd.DataFrame:
        """
        Get uptake predictions for a dataframe.
        Following the implementation here: https://huggingface.co/ddemszky/uptake-model/blob/main/handler.py

        Every row is paired with the row directly before it (by position, not
        by index label). A score is produced only when the previous row is
        spoken by speaker1, the current row by speaker2, and the previous
        utterance has at least UPTAKE_MIN_NUM_WORDS_SPEAKER_A words; all
        other rows, including the first, get None.

        Arguments:
            df (pd.DataFrame): dataframe to analyze. It is modified in place.
            text_column (str): name of column containing text to analyze
            output_column (str): name of column to store result. If it already
                exists, a warning is logged and df is returned untouched.
            speaker_column (str): name of column containing speaker names.
            speaker1 (str or list): speaker1 is the student
            speaker2 (str or list): speaker2 is the teacher
            result_type (str): raw or binary. "binary" stores 1 when the score is
                above HIGH_UPTAKE_THRESHOLD and 0 otherwise; any other value
                stores the raw score.

        Returns:
            df (pd.DataFrame): dataframe with uptake predictions
        """
        logging.warning(
            "Note: This model was trained on teacher's uptake of student's utterances. "
            "So, speaker1 should be the student and speaker2 should be the teacher. "
            "For more details on the model, see https://arxiv.org/pdf/2106.03873.pdf")

        logging.warning(
            "Note: It's recommended that you merge utterances from the same speaker before running this model. "
            "You can do that with edu_convokit.text_preprocessing.merge_utterances_from_same_speaker.")

        assert text_column in df.columns, f"Text column {text_column} not found in dataframe."
        assert speaker_column in df.columns, f"Speaker column {speaker_column} not found in dataframe."

        if output_column in df.columns:
            logging.warning(f"Target column {output_column} already exists in dataframe. Skipping.")
            return df

        if isinstance(speaker1, str):
            speaker1 = [speaker1]

        if isinstance(speaker2, str):
            speaker2 = [speaker2]

        # Uptake model is run slightly differently. So this is a separate function.
        input_builder, device, model = uptake_utils._initialize(UPTAKE_HF_MODEL_NAME)

        # Work on plain lists so that adjacency is positional. Using the
        # index labels from iterrows() as positions is wrong for any
        # dataframe whose index is not exactly 0..n-1.
        speakers = df[speaker_column].tolist()
        texts = df[text_column].tolist()

        predictions = []
        with torch.no_grad():
            for position, (s2, textB) in enumerate(zip(speakers, texts)):
                # The first utterance has nothing before it to take up.
                if position == 0:
                    predictions.append(None)
                    continue

                s1 = speakers[position - 1]
                textA = texts[position - 1]

                # Skip missing / non-text cells
                if not isinstance(textA, str) or not isinstance(textB, str):
                    predictions.append(None)
                    continue

                # Skip if text is too short
                if len(textA.split()) < UPTAKE_MIN_NUM_WORDS_SPEAKER_A:
                    predictions.append(None)
                    continue

                if s1 in speaker1 and s2 in speaker2:
                    textA = uptake_utils._get_clean_text(textA, remove_punct=False)
                    textB = uptake_utils._get_clean_text(textB, remove_punct=False)

                    instance = input_builder.build_inputs([textA], textB,
                                                          max_length=UPTAKE_MAX_INPUT_LENGTH,
                                                          input_str=True)
                    output = self._get_uptake_prediction(model, device, instance)
                    uptake_score = softmax(output["nsp_logits"][0].tolist())[1]
                    if result_type == "binary":
                        uptake_score = 1 if uptake_score > HIGH_UPTAKE_THRESHOLD else 0
                    predictions.append(uptake_score)
                else:
                    predictions.append(None)

        df[output_column] = predictions
        return df

    # Math density
    def _load_math_terms(self):
        """
        Build the list of regex patterns used to count math terms.

        Reference R implementation this mirrors:

            modify_stem <- function(s) {
                return(paste0('(^|[^a-zA-Z])', s,'(s|es)?([^a-zA-Z]|$)'))
            }

            modify_list <- c('sum', 'arc', 'mass', 'digit', 'graph', 'liter', 'gram', 'add', 'angle',
                             'scale', 'data', 'array', 'ruler', 'meter', 'total', 'unit', 'prism',
                             'median', 'ratio', 'area')

            # Modify those entries in the glossary
            gloss[gloss %in% modify_list] <- modify_stem(gloss[gloss %in% modify_list])

        For every term in MATH_WORDS, we modify the term to include the regex pattern that
        matches the term and its plural form if it is in MATH_PREFIXES. Other terms are
        returned unchanged.
        """
        math_terms = []
        for term in MATH_WORDS:
            if term in MATH_PREFIXES:
                math_terms.append(f"(^|[^a-zA-Z]){term}(s|es)?([^a-zA-Z]|$)")
            else:
                math_terms.append(term)
        return math_terms

    def get_math_density(
            self,
            df: pd.DataFrame,
            text_column: str,
            output_column: str,
            count_type: str = "total",  # total
            result_type: str = "total",  # total, proportion
            ) -> pd.DataFrame:
        """
        Get math density for a dataframe.
        Following the implementation here: https://edworkingpapers.com/sites/default/files/ai23-855.pdf

        For each utterance, counts case-insensitive matches of the math terms.
        Longer terms are tried first, and a match that starts inside an
        already-counted match is ignored. Cells that do not hold a string
        count as 0. The input dataframe is not modified; a copy with
        output_column added is returned.

        Arguments:
            df (pd.DataFrame): dataframe to analyze
            text_column (str): name of column containing text to analyze
            output_column (str): name of column to store result. If it already
                exists, a warning is logged and df itself is returned.
            count_type (str): total or unique. NOTE(review): currently not
                applied; total counts are always returned.
            result_type (str): total or proportion. NOTE(review): "proportion"
                is accepted but currently not applied; total counts are returned.

        Returns:
            df (pd.DataFrame): dataframe with math density analysis
        """
        assert text_column in df.columns, f"Text column {text_column} not found in dataframe."
        assert result_type in ["total", "proportion"], \
            f"Result type {result_type} not supported. Choose from 'total' or 'proportion'."

        if output_column in df.columns:
            logging.warning(f"Result column {output_column} already exists in dataframe. Skipping.")
            return df

        if count_type != "total" or result_type != "total":
            # Make the unimplemented options visible instead of silently ignoring them.
            logging.warning(
                f"count_type={count_type} and result_type={result_type} are not applied; "
                "returning total math term counts.")

        # Longest patterns first so multi-word terms win over their sub-terms.
        # Compiled once here rather than once per row and term.
        math_terms = sorted(self._load_math_terms(), key=len, reverse=True)
        patterns = [re.compile(term, re.IGNORECASE) for term in math_terms]

        totals = []
        for text in df[text_column].tolist():
            if not isinstance(text, str):
                totals.append(0)
                continue

            total = 0
            # (start, end) spans already counted for this utterance
            matched_spans = []
            for pattern in patterns:
                fresh = [
                    match for match in pattern.finditer(text)
                    if not any(start <= match.start() < end for start, end in matched_spans)
                ]
                total += len(fresh)
                matched_spans.extend((match.start(), match.end()) for match in fresh)
            totals.append(total)

        df = df.copy()
        # Assign by position: label-based `df.loc[i, ...]` would overwrite every
        # row sharing an index label when the index has duplicates.
        df[output_column] = pd.Series(totals, dtype="int64").to_numpy()
        return df