# Source code for edu_convokit.analyzers.lexical_analyzer

import pandas as pd
from typing import List, Union, Tuple
from collections import defaultdict
import math
from gensim.models import Phrases
from gensim.models.phrases import Phraser 
from edu_convokit import utils
import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import nltk

import itertools
import re
import logging
from edu_convokit.analyzers import analyzer



# [docs]
class LexicalAnalyzer(analyzer.Analyzer):


# [docs]
    def print_word_frequency(
            self,
            text_column: str,
            topk: int = 5,
            df: pd.DataFrame = None,
            speaker_column: str = None,
            run_text_formatting: bool = False,
            run_ngrams: bool = False,
            n: int = 0
    ) -> None:
        """
        Print word frequency for a dataframe.

        Builds the report with `report_word_frequency` and writes it to stdout.

        Arguments:
            df (pd.DataFrame): pandas dataframe
            text_column (str): name of column containing text to analyze
            topk (int): number of top words to return
            speaker_column (str): name of column containing speaker names. If specified, it will report word frequency for each speaker.
            run_text_formatting (bool): whether to run standard text formatting
            run_ngrams (bool): whether to run ngrams
            n (int): n for ngrams

        Returns:
            None: the report is printed, not returned. Use `report_word_frequency` to get it as a string.
        """

        text = self.report_word_frequency(
            df=df,
            text_column=text_column,
            topk=topk,
            speaker_column=speaker_column,
            run_text_formatting=run_text_formatting,
            run_ngrams=run_ngrams,
            n=n,
        )
        print(text)



# [docs]
    def report_word_frequency(
            self,
            text_column: str,
            topk: int = 5,
            df: pd.DataFrame = None,
            speaker_column: str = None,
            run_text_formatting: bool = False,
            run_ngrams: bool = False,
            n: int = 0
    ) -> str: 
        """
        Reports word frequency for a dataframe as a string.

        The dataframe passed in is never modified: all casting and tokenization
        happens on a private copy.

        Arguments:
            df (pd.DataFrame): pandas dataframe. If None, the dataframe returned by `self.get_df()` is used.
            text_column (str): name of column containing text to analyze
            topk (int): number of top words to return
            speaker_column (str): name of column containing speaker names. If specified, it will report word frequency for each speaker.
            run_text_formatting (bool): whether to run standard text formatting
            run_ngrams (bool): whether to run ngrams
            n (int): n for ngrams

        Returns:
            str: word frequency, one "word: count" line per word. With `speaker_column`,
            a "Top Words By Speaker" header followed by one section per speaker.

        Raises:
            AssertionError: if `text_column` is not a column of the dataframe.
        """
        if df is None:
            df = self.get_df()

        # Validate before touching the column so a missing column gives the intended message.
        assert text_column in df.columns, f"Text column {text_column} not found in dataframe."

        # Copy first, then cast: casting before the copy would overwrite the caller's column.
        df = df.copy()

        # Make sure text columns are interpreted as strings
        df[text_column] = df[text_column].astype(str)

        if run_text_formatting: # Run standard text formatting.
            df[text_column] = df[text_column].apply(utils._clean_text_to_words)

        if run_ngrams: # Run ngrams.
            df = self._compute_ngrams(df, text_column, n=n)

        df = self._format_text_column(df, text_column)

        if speaker_column is None:
            top_words = self._get_top_words(df, text_column, topk)
            return self._format_word_frequency(top_words)

        sections = ["Top Words By Speaker\n"]
        for speaker in df[speaker_column].unique():
            # Skip if speaker is nan
            if isinstance(speaker, float) and math.isnan(speaker):
                continue
            speaker_df = df[df[speaker_column] == speaker]
            top_words = self._get_top_words(speaker_df, text_column, topk)
            sections.append(f"{speaker}\n")
            sections.append(self._format_word_frequency(top_words))
            sections.append("\n\n")
        return "".join(sections)


    def _get_top_words(
            self, 
            df: pd.DataFrame,
            text_column: str,
            topk: int = 5,
        ) -> List[Tuple[str, int]]:
        """
        Return the `topk` most frequent tokens in `df[text_column]` as (token, count) pairs.

        Each cell of the column is expected to be an iterable of tokens.
        """
        # Stream the tokens instead of concatenating every row's list with Series.sum():
        # that is quadratic in the number of rows and returns 0 (not a list) when the
        # selection is empty, which FreqDist cannot consume.
        tokens = itertools.chain.from_iterable(df[text_column])
        word_counts = nltk.FreqDist(tokens)
        return word_counts.most_common(topk)

    def _format_word_frequency(self, word_counts):
        text = ""
        for word, count in word_counts:
            text += f"{word}: {count}\n"
        return text

    def _format_text_column(
            self,
            df: pd.DataFrame, 
            text_column: str
        ) -> pd.DataFrame:
        """
        Format text column for lexical analysis. Check that text column is a list of strings, otherwise split on spaces.
        """
        if isinstance(df[text_column].iloc[0], str):
            df[text_column] = df[text_column].str.split()
        return df

    def _get_counts(self, texts, vocab):
        counts = {w: 0 for w in vocab}
        for split in texts:
            count = 0
            prev = ''
            for w in split:
                if w == '':
                    continue
                if w in vocab:
                    counts[w] += 1
                if count > 0: # Enable bigram counts if the vocab allows that.
                    bigram = prev + ' ' + w
                    if bigram in vocab:
                        counts[bigram] += 1
                count += 1
                prev = w
        return counts

    def _logodds(self, counts1, counts2, prior, zscore = True):
        # code from Dan Jurafsky
        # Note: counts1 will be positive and counts2 will be negative

        sigmasquared = defaultdict(float)
        sigma = defaultdict(float)
        delta = defaultdict(float)

        n1 = sum(counts1.values())
        n2 = sum(counts2.values())

        # Since we use the sum of counts from the two groups as a prior, this is equivalent to a simple log odds ratio.
        nprior = sum(prior.values())
        for word in prior.keys():
            if prior[word] == 0:
                delta[word] = 0
                continue
            l1 = float(counts1[word] + prior[word]) / (( n1 + nprior ) - (counts1[word] + prior[word]))
            l2 = float(counts2[word] + prior[word]) / (( n2 + nprior ) - (counts2[word] + prior[word]))
            sigmasquared[word] = 1/(float(counts1[word]) + float(prior[word])) + 1/(float(counts2[word]) + float(prior[word]))
            sigma[word] = math.sqrt(sigmasquared[word])
            delta[word] = (math.log(l1) - math.log(l2))
            if zscore:
                delta[word] /= sigma[word]
        return delta

    def _compute_logodds(
            self,
            df1: pd.DataFrame,
            df2: pd.DataFrame,
            text_column1: str,
            text_column2: str, 
            words2idx: dict,
        ) -> Tuple[dict, dict, dict, dict]:

        counts1 = self._get_counts(df1[text_column1], words2idx)
        counts2 = self._get_counts(df2[text_column2], words2idx)
        prior = {}
        for k, v in counts1.items():
            prior[k] = v + counts2[k]

        # Note: You might not want to z-score if there are significantly larger events in one group than the other.
        delta = self._logodds(counts1, counts2, prior, True)
        return prior, counts1, counts2, delta

    def _get_ngrams(self, text, n):
        ngrams = []
        for i in range(len(text) - n + 1):
            ngrams.append(' '.join(text[i:i+n]))
        return ngrams

    def _compute_ngrams(
            self, 
            df: pd.DataFrame,
            text_column: str, # Values in this column should be lists of strings. If they are not, they will be split on spaces.
            n: int = 0, # If n is 0, will return all ngrams.
            target_text_column: str = None, # Put ngrams in a new column.
            min_count: int = 1, # Minimum number of times an ngram must appear to be included.
    ) -> Union[List[str], pd.DataFrame]:
        df = df.copy()

        if target_text_column is None:
            target_text_column = text_column

        df[target_text_column] = df[text_column]
        df = self._format_text_column(df, target_text_column)

        if n == 0:
            ngram_model = Phrases(df[target_text_column].tolist(), min_count=min_count)
            ngram_phraser = Phraser(ngram_model)
            df[target_text_column] = df[target_text_column].apply(lambda x: ngram_phraser[x])
        else:
            df[target_text_column] = df[target_text_column].apply(lambda x: self._get_ngrams(x, n))
        return df

    def _get_logodds(
            self,
            df1: pd.DataFrame,
            df2: pd.DataFrame,
            text_column1: str,
            text_column2: str,
            topk: int = 5,
            zscore: bool = True,
            logodds_factor: float = 1.0,
            run_text_formatting: bool = False,
            run_ngrams: bool = False,
            n: int = 0,
    ) -> Tuple[List[Tuple[str, float]], List[Tuple[str, float]]]:
        """
        Return topk log-odds for each df: ([(word, log-odds), ...], [(word, log-odds), ...])

        For more information on log-odds, see: https://en.wikipedia.org/wiki/Odds_ratio

        Arguments:
            df1 (pd.DataFrame): pandas dataframe
            df2 (pd.DataFrame): pandas dataframe
            text_column1 (str): name of column containing text to analyze in df1
            text_column2 (str): name of column containing text to analyze in df2
            topk (int): number of top words to return
            zscore (bool): whether to z-score the log-odds
            logodds_factor (float): factor to multiply standard deviation by to determine top words
            run_text_formatting (bool): whether to run standard text formatting
            run_ngrams (bool): whether to run ngrams
            n (int): n for ngrams

        Returns:
            Tuple[List[Tuple[str, float]], List[Tuple[str, float]]]: topk log-odds for each df: ([(word, log-odds), ...], [(word, log-odds), ...])
        """
        assert text_column1 in df1.columns, f"Text column {text_column1} not found in dataframe."
        assert text_column2 in df2.columns, f"Text column {text_column2} not found in dataframe."

        df1 = df1.copy()
        df2 = df2.copy()

        # Make sure text columns are interpreted as strings
        df1[text_column1] = df1[text_column1].astype(str)
        df2[text_column2] = df2[text_column2].astype(str)

        if run_text_formatting: # Run standard text formatting.
            df1[text_column1] = df1[text_column1].apply(utils._clean_text_to_words)
            df2[text_column2] = df2[text_column2].apply(utils._clean_text_to_words)

        if run_ngrams: # Run ngrams.
            df1 = self._compute_ngrams(df1, text_column1, n=n)
            df2 = self._compute_ngrams(df2, text_column2, n=n)

        df1_ = self._format_text_column(df1, text_column1)
        df2_ = self._format_text_column(df2, text_column2)

        # Get all words and build word index dictionary.
        words = df1_[text_column1].sum() + df2_[text_column2].sum()
        words = list(set(words))
        words2idx = {w: i for i, w in enumerate(words)}

        # Compute log-odds
        prior, counts1, counts2, logodds = self._compute_logodds(df1_, df2_, text_column1, text_column2, words2idx)

        LOGODDS_COLUMN = 'logodds'
        logodds_df = pd.DataFrame.from_dict(logodds, orient='index', columns=[LOGODDS_COLUMN])
        mean = 0
        std = logodds_df[LOGODDS_COLUMN].std()

        # Get top words and sort
        top_words = logodds_df[logodds_df[LOGODDS_COLUMN] >= mean + logodds_factor * std].sort_values(by=LOGODDS_COLUMN, ascending=False).head(topk)
        bottom_words = logodds_df[logodds_df[LOGODDS_COLUMN] <= mean - logodds_factor * std].sort_values(by=LOGODDS_COLUMN, ascending=True).head(topk)

        return list(zip(top_words.index, top_words[LOGODDS_COLUMN])), list(zip(bottom_words.index, bottom_words[LOGODDS_COLUMN]))

    def _format_log_odds(self, log_odds1, log_odds2):
        text = ""
        text += "Top words for Group 1\n"
        for word, score in log_odds1:
            text += f"{word}: {score}\n"
        text += "\n\n"
        text += "Top words for Group 2\n"
        for word, score in log_odds2:
            text += f"{word}: {score}\n"
        return text
    

# [docs]
    def report_log_odds(
            self,
            df1: pd.DataFrame,
            df2: pd.DataFrame,
            text_column1: str,
            text_column2: str,
            topk: int = 5,
            zscore: bool = True,
            logodds_factor: float = 1.0,
            run_text_formatting: bool = False,
            run_ngrams: bool = False,
            n: int = 0,
    ) -> str:
        """
        Return formatted topk log-odds for each df: ([(word, log-odds), ...], [(word, log-odds), ...])

        Arguments:
            df1 (pd.DataFrame): pandas dataframe
            df2 (pd.DataFrame): pandas dataframe
            text_column1 (str): name of column containing text to analyze in df1
            text_column2 (str): name of column containing text to analyze in df2
            topk (int): number of top words to return
            zscore (bool): whether to z-score the log-odds
            logodds_factor (float): factor to multiply standard deviation by to determine top words
            run_text_formatting (bool): whether to run standard text formatting
            run_ngrams (bool): whether to run ngrams
            n (int): n for ngrams

        Returns:
            str: formatted topk log-odds for each df: ([(word, log-odds), ...], [(word, log-odds), ...])
        """

        log_odds1, log_odds2 = self._get_logodds(
            df1=df1,
            df2=df2,
            text_column1=text_column1,
            text_column2=text_column2,
            topk=topk,
            zscore=zscore,
            logodds_factor=logodds_factor,
            run_text_formatting=run_text_formatting,
            run_ngrams=run_ngrams,
            n=n,
        )

        text = self._format_log_odds(log_odds1, log_odds2)
        return text

    

# [docs]
    def print_log_odds(
            self,
            df1: pd.DataFrame,
            df2: pd.DataFrame,
            text_column1: str, 
            text_column2: str,
            topk: int = 5,
            zscore: bool = True,
            logodds_factor: float = 1.0,
            run_text_formatting: bool = False,
            run_ngrams: bool = False,
            n: int = 0,
    ) -> None:
        """
        Print the topk log-odds report for each dataframe to stdout.

        This is `report_log_odds` followed by `print`; nothing is returned.

        Arguments:
            df1 (pd.DataFrame): pandas dataframe
            df2 (pd.DataFrame): pandas dataframe
            text_column1 (str): name of column containing text to analyze in df1
            text_column2 (str): name of column containing text to analyze in df2
            topk (int): number of top words to return
            zscore (bool): whether to z-score the log-odds
            logodds_factor (float): factor to multiply standard deviation by to determine top words
            run_text_formatting (bool): whether to run standard text formatting
            run_ngrams (bool): whether to run ngrams
            n (int): n for ngrams
        """
        report_options = dict(
            df1=df1,
            df2=df2,
            text_column1=text_column1,
            text_column2=text_column2,
            topk=topk,
            zscore=zscore,
            logodds_factor=logodds_factor,
            run_text_formatting=run_text_formatting,
            run_ngrams=run_ngrams,
            n=n,
        )
        print(self.report_log_odds(**report_options))



# [docs]
    def plot_log_odds(
            self,
            df1: pd.DataFrame,
            df2: pd.DataFrame,
            text_column1: str,
            text_column2: str,
            group1_name: str = "Group 1",
            group2_name: str = "Group 2",
            topk: int = 5,
            save_path: str = None,
            zscore: bool = True,
            logodds_factor: float = 1.0,
            run_text_formatting: bool = False,
            run_ngrams: bool = False,
            n: int = 0,
    ) -> None:
        """
        Plot topk log-odds for each df as a horizontal bar chart (x: log-odds, y: words).

        The figure is saved to `save_path` when given, otherwise shown with `plt.show()`.
        It is closed afterwards in either case. Note that this method also sets the
        global seaborn theme/context and the matplotlib "font.family" rcParam.

        Arguments:
            df1 (pd.DataFrame): pandas dataframe
            df2 (pd.DataFrame): pandas dataframe
            text_column1 (str): name of column containing text to analyze in df1
            text_column2 (str): name of column containing text to analyze in df2
            group1_name (str): name of group 1
            group2_name (str): name of group 2
            topk (int): number of top words to return
            save_path (str): path to save plot
            zscore (bool): whether to z-score the log-odds
            logodds_factor (float): factor to multiply standard deviation by to determine top words
            run_text_formatting (bool): whether to run standard text formatting
            run_ngrams (bool): whether to run ngrams
            n (int): n for ngrams
        """

        sns.set_theme(style="whitegrid")
        sns.set_context("paper", font_scale=1.5, rc={"lines.linewidth": 2.5})
        plt.rcParams["font.family"] = "serif"

        log_odds1, log_odds2 = self._get_logodds(
            df1=df1,
            df2=df2,
            text_column1=text_column1,
            text_column2=text_column2,
            topk=topk,
            zscore=zscore,
            logodds_factor=logodds_factor,
            run_text_formatting=run_text_formatting,
            run_ngrams=run_ngrams,
            n=n,
        )

        # Create dataframe
        log_odds_df = pd.DataFrame(log_odds1 + log_odds2, columns=['word', 'log_odds'])

        # Half an inch of height per bar. Keep the height positive when no word passed
        # the threshold, since a zero-height figure is rejected by matplotlib.
        fig = plt.figure(figsize=(6, max(len(log_odds_df), 1) / 2))
        try:
            # Plot  x-axis: log-odds, y-axis: words
            sns.barplot(x='log_odds', y='word', data=log_odds_df)
            plt.xlabel('Log odds')
            plt.ylabel('Words')

            x_min, x_max = plt.xlim()
            y_min, y_max = plt.ylim()
            plt.text(x_min, y_min, group2_name, ha='left', va='center') # second group because it's negative
            plt.text(x_max, y_min, group1_name, ha='right', va='center')
            plt.title(f"Log odds: {group1_name} vs. {group2_name}")
            if save_path is not None:
                plt.savefig(save_path, bbox_inches='tight')
            else:
                plt.show()
        finally:
            # Close (not just clear) the figure so repeated calls do not pile up open figures.
            plt.close(fig)