# Source code for edu_convokit.analyzers.quantitative_analyzer

import seaborn as sns
import matplotlib.pyplot as plt
from typing import List, Dict, Tuple, Union
import pandas as pd
from edu_convokit.analyzers import analyzer
from edu_convokit import utils
import tqdm
import os

class QuantitativeAnalyzer(analyzer.Analyzer):
    """
    Compute, report and plot per-speaker statistics of a numeric feature column.

    For every dataframe (either the one passed in, or each file listed in
    ``self.filenames``) and every speaker in it, three statistics are computed:

    * ``raw``:  sum of the feature over the speaker's rows
    * ``avg``:  mean of the feature over the speaker's rows
    * ``prop``: the speaker's sum divided by the sum over the whole dataframe
    """

    # Supported statistic kinds, in reporting order, mapped to the heading used
    # in the text report. ``value_as="all"`` expands to every key of this dict.
    _STAT_HEADINGS = {
        "raw": "Raw statistics",
        "avg": "Average statistics",
        "prop": "Proportion statistics",
    }

    @classmethod
    def _resolve_stat_kinds(cls, value_as: str) -> List[str]:
        """
        Validate ``value_as`` and expand it into the list of statistic kinds.

        Arguments:
            value_as (str): raw, avg, prop, all

        Returns:
            List[str]: ``[value_as]`` for a single kind, or every supported
            kind (raw, avg, prop) when ``value_as`` is "all".

        Raises:
            AssertionError: if ``value_as`` is not a supported value.
        """
        kinds = list(cls._STAT_HEADINGS)
        valid = kinds + ["all"]
        assert value_as in valid, f"Invalid value_as {value_as}. Must be one of {valid}."
        return kinds if value_as == "all" else [value_as]

    def _compute_statistics(
        self,
        speaker_column: str,
        feature_column: str,
        df: pd.DataFrame = None,
        dropna: bool = False
    ) -> pd.DataFrame:
        """
        Compute statistics for a feature across all speakers.

        Arguments:
            speaker_column: name of column containing speaker names
            feature_column: name of column containing feature to compute statistics for
            df: pandas dataframe. If None, every file in self.filenames is
                loaded with utils.load_data and processed in turn
            dropna: drop rows with NaN values in feature_column before computing

        Returns:
            pd.DataFrame: one row per (dataframe, speaker) pair with columns
            ``speaker``, ``raw_speaker_<feature>``, ``avg_speaker_<feature>``
            and ``prop_speaker_<feature>``. The columns are present even when
            there are no rows.

        Raises:
            AssertionError: if no dataframe is given and self.filenames is None,
                or if either column is missing from a dataframe.
        """
        if df is None:
            assert self.filenames is not None, "No filenames passed to analyzer. Either initialize the analyzer with data_dir, filenames, or df, or pass a dataframe to this function."
            # Load lazily so only one file's dataframe is held at a time.
            frames = (utils.load_data(fname) for fname in self.filenames)
        else:
            frames = [df]

        raw_col = f"raw_speaker_{feature_column}"
        avg_col = f"avg_speaker_{feature_column}"
        prop_col = f"prop_speaker_{feature_column}"

        rows = []
        for frame in frames:
            assert speaker_column in frame.columns, f"Speaker column {speaker_column} not found in dataframe."
            assert feature_column in frame.columns, f"Feature column {feature_column} not found in dataframe."

            if dropna:
                frame = frame.dropna(subset=[feature_column])

            # Denominator for the proportion: total over this dataframe only.
            feature_sum = frame[feature_column].sum()

            for speaker in frame[speaker_column].unique():
                values = frame.loc[frame[speaker_column] == speaker, feature_column]
                speaker_sum = values.sum()
                rows.append({
                    "speaker": speaker,
                    raw_col: speaker_sum,
                    avg_col: values.mean(),
                    prop_col: speaker_sum / feature_sum,
                })

        # Explicit columns keep the schema stable when ``rows`` is empty.
        return pd.DataFrame(rows, columns=["speaker", raw_col, avg_col, prop_col])

    def report_statistics(
        self,
        feature_column: str,
        df: pd.DataFrame = None,
        speaker_column: str = None,
        value_as: str = "raw",  # raw, avg, prop, all
        dropna: bool = False,
    ) -> str:
        """
        Report statistics for a feature across all speakers.

        Arguments:
            feature_column (str): name of column containing feature to compute statistics for
            df (pd.DataFrame): pandas dataframe. If None, the files in self.filenames are loaded
            speaker_column (str): name of column containing speaker names
            value_as (str): raw, avg, prop, all. "all" reports the three
                statistics one after another
            dropna (bool): drop rows with NaN values in feature_column

        Returns:
            str: the feature name followed by, for each requested statistic, a
            heading and the ``describe()`` table of that statistic grouped by speaker
        """
        kinds = self._resolve_stat_kinds(value_as)

        results_df = self._compute_statistics(
            speaker_column=speaker_column,
            feature_column=feature_column,
            df=df,
            dropna=dropna
        )

        sections = []
        for kind in kinds:
            table = (
                results_df.groupby("speaker")[f"{kind}_speaker_{feature_column}"]
                .describe()
                .to_string()
            )
            # The proportion section has historically had no trailing blank
            # line; keep that so single-statistic output is unchanged.
            trailer = "" if kind == "prop" else "\n\n"
            sections.append(f"{self._STAT_HEADINGS[kind]}\n{table}{trailer}")

        return f"{feature_column}\n\n" + "".join(sections)

    def print_statistics(
        self,
        feature_column: str,
        df: pd.DataFrame = None,
        speaker_column: str = None,
        value_as: str = "raw",  # raw, avg, prop, all
        dropna: bool = False,
    ):
        """
        Print statistics for a feature across all speakers.

        Arguments:
            feature_column (str): name of column containing feature to compute statistics for
            df (pd.DataFrame): pandas dataframe. If None, the files in self.filenames are loaded
            speaker_column (str): name of column containing speaker names
            value_as (str): raw, avg, prop, all
            dropna (bool): drop rows with NaN values in feature_column

        Returns:
            None
        """
        text = self.report_statistics(
            df=df,
            feature_column=feature_column,
            speaker_column=speaker_column,
            value_as=value_as,
            dropna=dropna
        )
        print(text)

    def plot_statistics(
        self,
        feature_column: str,
        df: pd.DataFrame = None,
        speaker_column: str = None,
        value_as: str = "raw",  # raw, avg, prop, all
        dropna: bool = False,
        title: str = None,
        xlabel: str = None,
        ylabel: str = None,
        save_path: str = None,
        xrange: Tuple[float, float] = None,
        yrange: Tuple[float, float] = None,
        label_mapping: Dict[str, str] = None
    ):
        """
        Plot statistics for a feature across all speakers.

        Draws one box plot per requested statistic with speakers on the x-axis.
        With a single statistic the plot is drawn on the current axes; with
        value_as="all" the current figure is split into one panel per statistic.

        Arguments:
            feature_column (str): name of column containing feature to compute statistics for
            df (pd.DataFrame): pandas dataframe. If None, the files in self.filenames are loaded
            speaker_column (str): name of column containing speaker names
            value_as (str): raw, avg, prop, all
            dropna (bool): drop rows with NaN values in feature_column
            title (str): title of plot (figure-level title when value_as="all")
            xlabel (str): x-axis label (applied to every panel)
            ylabel (str): y-axis label (applied to every panel)
            save_path (str): path to save plot. If None, the plot is shown instead
            xrange (Tuple[float, float]): x-axis range (applied to every panel)
            yrange (Tuple[float, float]): y-axis range (applied to every panel)
            label_mapping (Dict[str, str]): mapping from speaker names to labels.
                Applied with ``Series.map``, so speakers missing from the
                mapping become NaN

        Returns:
            None
        """
        kinds = self._resolve_stat_kinds(value_as)

        sns.set_theme(style="whitegrid")
        sns.set_context("paper", font_scale=1.5, rc={"lines.linewidth": 2.5})
        plt.rcParams["font.family"] = "serif"

        results_df = self._compute_statistics(
            speaker_column=speaker_column,
            feature_column=feature_column,
            df=df,
            dropna=dropna
        )

        if label_mapping is not None:
            results_df["speaker"] = results_df["speaker"].map(label_mapping)

        multi_panel = len(kinds) > 1
        if multi_panel:
            fig = plt.gcf()
            axes = list(fig.subplots(1, len(kinds)))
        else:
            # Same axes seaborn would pick on its own when no ``ax`` is given.
            axes = [plt.gca()]

        for kind, ax in zip(kinds, axes):
            sns.boxplot(
                x="speaker",
                y=f"{kind}_speaker_{feature_column}",
                data=results_df,
                ax=ax
            )

            if multi_panel:
                # Label each panel with its statistic; ``title`` goes on the figure.
                ax.set_title(self._STAT_HEADINGS[kind])
            elif title is not None:
                ax.set_title(title)

            if ylabel is not None:
                ax.set_ylabel(ylabel)

            if xlabel is not None:
                ax.set_xlabel(xlabel)

            if xrange is not None:
                ax.set_xlim(xrange)

            if yrange is not None:
                ax.set_ylim(yrange)

        if multi_panel:
            if title is not None:
                fig.suptitle(title)
            fig.tight_layout()

        if save_path is not None:
            plt.savefig(save_path, bbox_inches="tight")
        else:
            plt.show()

        # Clear the figure so the next call starts from an empty canvas.
        plt.clf()