import pandas as pd
from typing import List, Union, Tuple
from collections import defaultdict
import math
from gensim.models import Phrases
from gensim.models.phrases import Phraser
from edu_convokit import utils
import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import itertools
import re
import logging
from edu_convokit.analyzers import analyzer
[docs]
class LexicalAnalyzer(analyzer.Analyzer):
[docs]
def print_word_frequency(
    self,
    text_column: str,
    topk: int = 5,
    df: pd.DataFrame = None,
    speaker_column: str = None,
    run_text_formatting: bool = False,
    run_ngrams: bool = False,
    n: int = 0
) -> None:
    """
    Print word frequency for a dataframe.

    Builds the report with `report_word_frequency` (all arguments are
    forwarded unchanged) and writes it to stdout.

    Arguments:
        text_column (str): name of column containing text to analyze
        topk (int): number of top words to return
        df (pd.DataFrame): pandas dataframe
        speaker_column (str): name of column containing speaker names. If specified, it will report word frequency for each speaker.
        run_text_formatting (bool): whether to run standard text formatting
        run_ngrams (bool): whether to run ngrams
        n (int): n for ngrams

    Returns:
        None: the report is printed, not returned. Use
        `report_word_frequency` to obtain it as a string.
    """
    report = self.report_word_frequency(
        df=df,
        text_column=text_column,
        topk=topk,
        speaker_column=speaker_column,
        run_text_formatting=run_text_formatting,
        run_ngrams=run_ngrams,
        n=n,
    )
    print(report)
[docs]
def report_word_frequency(
    self,
    text_column: str,
    topk: int = 5,
    df: pd.DataFrame = None,
    speaker_column: str = None,
    run_text_formatting: bool = False,
    run_ngrams: bool = False,
    n: int = 0
) -> str:
    """
    Reports word frequency for a dataframe as a string.

    The input dataframe is never modified: all processing happens on a copy.

    Arguments:
        text_column (str): name of column containing text to analyze
        topk (int): number of top words to return
        df (pd.DataFrame): pandas dataframe. If None, `self.get_df()` is used.
        speaker_column (str): name of column containing speaker names. If specified, it will report word frequency for each speaker.
        run_text_formatting (bool): whether to run standard text formatting
        run_ngrams (bool): whether to run ngrams
        n (int): n for ngrams

    Returns:
        str: word frequency, one "word: count" line per word. With
        `speaker_column`, one section per speaker in order of first appearance.

    Raises:
        KeyError: if `text_column` is not a column of the dataframe.
    """
    # Copy BEFORE touching the column so the caller's dataframe is left intact.
    df = self.get_df().copy() if df is None else df.copy()

    # Validate before the first column access; a missing column has always
    # surfaced as KeyError here, now with a descriptive message.
    if text_column not in df.columns:
        raise KeyError(f"Text column {text_column} not found in dataframe.")

    # Make sure text columns are interpreted as strings
    df[text_column] = df[text_column].astype(str)

    if run_text_formatting:  # Run standard text formatting.
        df[text_column] = df[text_column].apply(utils._clean_text_to_words)

    if run_ngrams:  # Run ngrams.
        df = self._compute_ngrams(df, text_column, n=n)

    df = self._format_text_column(df, text_column)

    if speaker_column is None:
        top_words = self._get_top_words(df, text_column, topk)
        return self._format_word_frequency(top_words)

    sections = ["Top Words By Speaker\n"]
    # dropna() skips missing speakers (NaN as well as None); comparing a column
    # against a missing value matches no rows, so there is nothing to report.
    for speaker in df[speaker_column].dropna().unique():
        speaker_df = df[df[speaker_column] == speaker]
        top_words = self._get_top_words(speaker_df, text_column, topk)
        sections.append(f"{speaker}\n")
        sections.append(self._format_word_frequency(top_words))
        sections.append("\n\n")
    return "".join(sections)
def _get_top_words(
self,
df: pd.DataFrame,
text_column: str,
topk: int = 5,
) -> List[Tuple[str, int]]:
words = df[text_column].sum()
word_counts = nltk.FreqDist(words)
top_words = word_counts.most_common(topk)
return top_words
def _format_word_frequency(self, word_counts):
text = ""
for word, count in word_counts:
text += f"{word}: {count}\n"
return text
def _format_text_column(
self,
df: pd.DataFrame,
text_column: str
) -> pd.DataFrame:
"""
Format text column for lexical analysis. Check that text column is a list of strings, otherwise split on spaces.
"""
if isinstance(df[text_column].iloc[0], str):
df[text_column] = df[text_column].str.split()
return df
def _get_counts(self, texts, vocab):
counts = {w: 0 for w in vocab}
for split in texts:
count = 0
prev = ''
for w in split:
if w == '':
continue
if w in vocab:
counts[w] += 1
if count > 0: # Enable bigram counts if the vocab allows that.
bigram = prev + ' ' + w
if bigram in vocab:
counts[bigram] += 1
count += 1
prev = w
return counts
def _logodds(self, counts1, counts2, prior, zscore = True):
# code from Dan Jurafsky
# Note: counts1 will be positive and counts2 will be negative
sigmasquared = defaultdict(float)
sigma = defaultdict(float)
delta = defaultdict(float)
n1 = sum(counts1.values())
n2 = sum(counts2.values())
# Since we use the sum of counts from the two groups as a prior, this is equivalent to a simple log odds ratio.
nprior = sum(prior.values())
for word in prior.keys():
if prior[word] == 0:
delta[word] = 0
continue
l1 = float(counts1[word] + prior[word]) / (( n1 + nprior ) - (counts1[word] + prior[word]))
l2 = float(counts2[word] + prior[word]) / (( n2 + nprior ) - (counts2[word] + prior[word]))
sigmasquared[word] = 1/(float(counts1[word]) + float(prior[word])) + 1/(float(counts2[word]) + float(prior[word]))
sigma[word] = math.sqrt(sigmasquared[word])
delta[word] = (math.log(l1) - math.log(l2))
if zscore:
delta[word] /= sigma[word]
return delta
def _compute_logodds(
self,
df1: pd.DataFrame,
df2: pd.DataFrame,
text_column1: str,
text_column2: str,
words2idx: dict,
) -> Tuple[dict, dict, dict, dict]:
counts1 = self._get_counts(df1[text_column1], words2idx)
counts2 = self._get_counts(df2[text_column2], words2idx)
prior = {}
for k, v in counts1.items():
prior[k] = v + counts2[k]
# Note: You might not want to z-score if there are significantly larger events in one group than the other.
delta = self._logodds(counts1, counts2, prior, True)
return prior, counts1, counts2, delta
def _get_ngrams(self, text, n):
ngrams = []
for i in range(len(text) - n + 1):
ngrams.append(' '.join(text[i:i+n]))
return ngrams
def _compute_ngrams(
self,
df: pd.DataFrame,
text_column: str, # Values in this column should be lists of strings. If they are not, they will be split on spaces.
n: int = 0, # If n is 0, will return all ngrams.
target_text_column: str = None, # Put ngrams in a new column.
min_count: int = 1, # Minimum number of times an ngram must appear to be included.
) -> Union[List[str], pd.DataFrame]:
df = df.copy()
if target_text_column is None:
target_text_column = text_column
df[target_text_column] = df[text_column]
df = self._format_text_column(df, target_text_column)
if n == 0:
ngram_model = Phrases(df[target_text_column].tolist(), min_count=min_count)
ngram_phraser = Phraser(ngram_model)
df[target_text_column] = df[target_text_column].apply(lambda x: ngram_phraser[x])
else:
df[target_text_column] = df[target_text_column].apply(lambda x: self._get_ngrams(x, n))
return df
def _get_logodds(
self,
df1: pd.DataFrame,
df2: pd.DataFrame,
text_column1: str,
text_column2: str,
topk: int = 5,
zscore: bool = True,
logodds_factor: float = 1.0,
run_text_formatting: bool = False,
run_ngrams: bool = False,
n: int = 0,
) -> Tuple[List[Tuple[str, float]], List[Tuple[str, float]]]:
"""
Return topk log-odds for each df: ([(word, log-odds), ...], [(word, log-odds), ...])
For more information on log-odds, see: https://en.wikipedia.org/wiki/Odds_ratio
Arguments:
df1 (pd.DataFrame): pandas dataframe
df2 (pd.DataFrame): pandas dataframe
text_column1 (str): name of column containing text to analyze in df1
text_column2 (str): name of column containing text to analyze in df2
topk (int): number of top words to return
zscore (bool): whether to z-score the log-odds
logodds_factor (float): factor to multiply standard deviation by to determine top words
run_text_formatting (bool): whether to run standard text formatting
run_ngrams (bool): whether to run ngrams
n (int): n for ngrams
Returns:
Tuple[List[Tuple[str, float]], List[Tuple[str, float]]]: topk log-odds for each df: ([(word, log-odds), ...], [(word, log-odds), ...])
"""
assert text_column1 in df1.columns, f"Text column {text_column1} not found in dataframe."
assert text_column2 in df2.columns, f"Text column {text_column2} not found in dataframe."
df1 = df1.copy()
df2 = df2.copy()
# Make sure text columns are interpreted as strings
df1[text_column1] = df1[text_column1].astype(str)
df2[text_column2] = df2[text_column2].astype(str)
if run_text_formatting: # Run standard text formatting.
df1[text_column1] = df1[text_column1].apply(utils._clean_text_to_words)
df2[text_column2] = df2[text_column2].apply(utils._clean_text_to_words)
if run_ngrams: # Run ngrams.
df1 = self._compute_ngrams(df1, text_column1, n=n)
df2 = self._compute_ngrams(df2, text_column2, n=n)
df1_ = self._format_text_column(df1, text_column1)
df2_ = self._format_text_column(df2, text_column2)
# Get all words and build word index dictionary.
words = df1_[text_column1].sum() + df2_[text_column2].sum()
words = list(set(words))
words2idx = {w: i for i, w in enumerate(words)}
# Compute log-odds
prior, counts1, counts2, logodds = self._compute_logodds(df1_, df2_, text_column1, text_column2, words2idx)
LOGODDS_COLUMN = 'logodds'
logodds_df = pd.DataFrame.from_dict(logodds, orient='index', columns=[LOGODDS_COLUMN])
mean = 0
std = logodds_df[LOGODDS_COLUMN].std()
# Get top words and sort
top_words = logodds_df[logodds_df[LOGODDS_COLUMN] >= mean + logodds_factor * std].sort_values(by=LOGODDS_COLUMN, ascending=False).head(topk)
bottom_words = logodds_df[logodds_df[LOGODDS_COLUMN] <= mean - logodds_factor * std].sort_values(by=LOGODDS_COLUMN, ascending=True).head(topk)
return list(zip(top_words.index, top_words[LOGODDS_COLUMN])), list(zip(bottom_words.index, bottom_words[LOGODDS_COLUMN]))
def _format_log_odds(self, log_odds1, log_odds2):
text = ""
text += "Top words for Group 1\n"
for word, score in log_odds1:
text += f"{word}: {score}\n"
text += "\n\n"
text += "Top words for Group 2\n"
for word, score in log_odds2:
text += f"{word}: {score}\n"
return text
[docs]
def report_log_odds(
    self,
    df1: pd.DataFrame,
    df2: pd.DataFrame,
    text_column1: str,
    text_column2: str,
    topk: int = 5,
    zscore: bool = True,
    logodds_factor: float = 1.0,
    run_text_formatting: bool = False,
    run_ngrams: bool = False,
    n: int = 0,
) -> str:
    """
    Return formatted topk log-odds for each df: ([(word, log-odds), ...], [(word, log-odds), ...])

    Every argument is forwarded to `_get_logodds`; the two resulting lists
    are rendered with `_format_log_odds`.

    Arguments:
        df1 (pd.DataFrame): pandas dataframe
        df2 (pd.DataFrame): pandas dataframe
        text_column1 (str): name of column containing text to analyze in df1
        text_column2 (str): name of column containing text to analyze in df2
        topk (int): number of top words to return
        zscore (bool): whether to z-score the log-odds
        logodds_factor (float): factor to multiply standard deviation by to determine top words
        run_text_formatting (bool): whether to run standard text formatting
        run_ngrams (bool): whether to run ngrams
        n (int): n for ngrams

    Returns:
        str: formatted topk log-odds for each df: ([(word, log-odds), ...], [(word, log-odds), ...])
    """
    options = dict(
        df1=df1,
        df2=df2,
        text_column1=text_column1,
        text_column2=text_column2,
        topk=topk,
        zscore=zscore,
        logodds_factor=logodds_factor,
        run_text_formatting=run_text_formatting,
        run_ngrams=run_ngrams,
        n=n,
    )
    group1_words, group2_words = self._get_logodds(**options)
    return self._format_log_odds(group1_words, group2_words)
[docs]
def print_log_odds(
    self,
    df1: pd.DataFrame,
    df2: pd.DataFrame,
    text_column1: str,
    text_column2: str,
    topk: int = 5,
    zscore: bool = True,
    logodds_factor: float = 1.0,
    run_text_formatting: bool = False,
    run_ngrams: bool = False,
    n: int = 0,
) -> None:
    """
    Print topk log-odds for each df: ([(word, log-odds), ...], [(word, log-odds), ...])

    Every argument is forwarded to `report_log_odds`; the returned report is
    written to stdout.

    Arguments:
        df1 (pd.DataFrame): pandas dataframe
        df2 (pd.DataFrame): pandas dataframe
        text_column1 (str): name of column containing text to analyze in df1
        text_column2 (str): name of column containing text to analyze in df2
        topk (int): number of top words to return
        zscore (bool): whether to z-score the log-odds
        logodds_factor (float): factor to multiply standard deviation by to determine top words
        run_text_formatting (bool): whether to run standard text formatting
        run_ngrams (bool): whether to run ngrams
        n (int): n for ngrams
    """
    options = dict(
        df1=df1,
        df2=df2,
        text_column1=text_column1,
        text_column2=text_column2,
        topk=topk,
        zscore=zscore,
        logodds_factor=logodds_factor,
        run_text_formatting=run_text_formatting,
        run_ngrams=run_ngrams,
        n=n,
    )
    print(self.report_log_odds(**options))
[docs]
def plot_log_odds(
    self,
    df1: pd.DataFrame,
    df2: pd.DataFrame,
    text_column1: str,
    text_column2: str,
    group1_name: str = "Group 1",
    group2_name: str = "Group 2",
    topk: int = 5,
    save_path: str = None,
    zscore: bool = True,
    logodds_factor: float = 1.0,
    run_text_formatting: bool = False,
    run_ngrams: bool = False,
    n: int = 0,
) -> None:
    """
    Plot topk log-odds for each df: ([(word, log-odds), ...], [(word, log-odds), ...])

    Draws one horizontal bar per selected word. The figure is saved to
    `save_path` when given, otherwise shown, and is closed afterwards so
    repeated calls do not accumulate open figures.

    Note: this sets the seaborn theme/context and the matplotlib font family
    globally; those settings stay in effect after the call.

    Arguments:
        df1 (pd.DataFrame): pandas dataframe
        df2 (pd.DataFrame): pandas dataframe
        text_column1 (str): name of column containing text to analyze in df1
        text_column2 (str): name of column containing text to analyze in df2
        group1_name (str): name of group 1
        group2_name (str): name of group 2
        topk (int): number of top words to return
        save_path (str): path to save plot
        zscore (bool): whether to z-score the log-odds
        logodds_factor (float): factor to multiply standard deviation by to determine top words
        run_text_formatting (bool): whether to run standard text formatting
        run_ngrams (bool): whether to run ngrams
        n (int): n for ngrams
    """
    sns.set_theme(style="whitegrid")
    sns.set_context("paper", font_scale=1.5, rc={"lines.linewidth": 2.5})
    plt.rcParams["font.family"] = "serif"

    log_odds1, log_odds2 = self._get_logodds(
        df1=df1,
        df2=df2,
        text_column1=text_column1,
        text_column2=text_column2,
        topk=topk,
        zscore=zscore,
        logodds_factor=logodds_factor,
        run_text_formatting=run_text_formatting,
        run_ngrams=run_ngrams,
        n=n,
    )

    # Create dataframe
    log_odds_df = pd.DataFrame(log_odds1 + log_odds2, columns=['word', 'log_odds'])

    # Half an inch of height per word. The floor only matters when no word
    # qualified, where it avoids requesting a zero-height figure.
    height = max(len(log_odds_df) / 2, 0.5)

    # Plot x-axis: log-odds, y-axis: words
    fig = plt.figure(figsize=(6, height))
    try:
        sns.barplot(x='log_odds', y='word', data=log_odds_df)
        plt.xlabel('Log odds')
        plt.ylabel('Words')

        # Label the two sides of the axis with the group they favour.
        x_min, x_max = plt.xlim()
        y_min, y_max = plt.ylim()
        plt.text(x_min, y_min, group2_name, ha='left', va='center')  # second group because it's negative
        plt.text(x_max, y_min, group1_name, ha='right', va='center')
        plt.title(f"Log odds: {group1_name} vs. {group2_name}")

        if save_path is not None:
            plt.savefig(save_path, bbox_inches='tight')
        else:
            plt.show()
    finally:
        # plt.clf() only clears the figure and leaves it registered with
        # pyplot; closing it releases the figure even if plotting failed.
        plt.close(fig)