-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
cd927bd
commit 92ec97a
Showing
1 changed file
with
51 additions
and
59 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,72 +1,64 @@ | ||
\documentclass{article} | ||
\usepackage{amsmath} | ||
\usepackage{algorithm} | ||
\usepackage{algorithmic} | ||
|
||
\begin{document} | ||
|
||
\section*{Outlier/Cluster Detection Algorithm for Bank 10-K Items (Updated)} | ||
\section*{Algorithm: Outlier/Cluster Detection with Topic Distributions} | ||
|
||
\textbf{Algorithm 1: Outlier/Cluster Detection Algorithm for Bank 10-K Items (Full Scope)} | ||
\begin{algorithm} | ||
\caption{Outlier/Cluster Detection Algorithm for Bank 10-K Items} | ||
\begin{algorithmic}[1] | ||
\REQUIRE Text data for Item $i$ of bank $j$ over years $t = 1, \dots, T$. | ||
\ENSURE Continuous outlier and cluster scores normalized by holding company asset size. | ||
|
||
\FORALL{years $t \in \{1, \dots, T\}$} | ||
\FORALL{banks $j$} | ||
\STATE Extract text $I_{i,j,t}$ for Item $i$ of bank $j$ at year $t$. | ||
\STATE Split $I_{i,j,t}$ into semantic chunks or paragraphs. | ||
\STATE Clean $I_{i,j,t}$ by removing stopwords and performing preprocessing. | ||
\STATE Build topic model using BERTopic: | ||
\begin{enumerate} | ||
\item Generate topics $k$ and embeddings $v_{k}$ for each chunk. | ||
\item Apply hierarchical clustering to group topics into parent topics. | ||
\item Select parent topics based on hierarchical levels or topic coverage. | ||
\end{enumerate} | ||
\STATE Compute topic distributions $D_{i,j,t}$ as percentages: | ||
\begin{enumerate} | ||
\item Normalize counts of topics by total topic occurrences in year $t$. | ||
\item Optionally compute weighted topic distributions based on order of topics discussed. | ||
\end{enumerate} | ||
\ENDFOR | ||
\ENDFOR | ||
|
||
\STATE Aggregate topic distributions $D_{i,j,t}$ over all years $t$ for each bank $j$. | ||
|
||
\STATE Compute embedding matrix $V_{i,j}$ using aggregated topic distributions. | ||
|
||
\STATE Apply clustering and outlier detection: | ||
\begin{enumerate} | ||
\item \textbf{Input:} | ||
\begin{itemize} | ||
\item Item $i$ (e.g., ``Item 3: Legal Proceedings'') for bank $j$ over years $t = 1, \dots, T$. | ||
\end{itemize} | ||
|
||
\item \textbf{Output:} | ||
\item Use KMeans clustering to determine the optimal number of clusters $k$ using the silhouette score. | ||
\item Test for multivariate normality of $V_{i,j}$: | ||
\begin{itemize} | ||
\item Continuous outlier and cluster scores normalized by holding company asset size. | ||
\item \textbf{If multivariate normality is not rejected:} | ||
\begin{enumerate} | ||
\item Apply robust Mahalanobis distance using Minimum Covariance Determinant (MCD). | ||
\item Apply Local Outlier Factor (LOF). | ||
\item Flag points as outliers if identified by both Mahalanobis distance and LOF. | ||
\end{enumerate} | ||
\item \textbf{Else:} | ||
\begin{enumerate} | ||
\item Apply only LOF for outlier detection. | ||
\end{enumerate} | ||
\end{itemize} | ||
|
||
\item \textbf{For each} year $t \in \{1, \dots, T\}$: | ||
\begin{enumerate} | ||
\item Extract text $I_{i,j,t}$ for Item $i$ of bank $j$ at time $t$. | ||
\item Split $I_{i,j,t}$ into semantic chunks of size $n$ or paragraphs. | ||
\item Clean $I_{i,j,t}$ by removing stopwords and preprocessing text. | ||
\item Build a topic model using BERTopic: | ||
\begin{itemize} | ||
\item Generate topics $k$ and topic embeddings $v_{k}$ for each chunk. | ||
\item Apply hierarchical clustering on topic embeddings to reduce dimensionality: | ||
\begin{itemize} | ||
\item Group similar topics into higher-level parent topics. | ||
\item Select parent topics based on hierarchical levels or topic coverage. | ||
\end{itemize} | ||
\end{itemize} | ||
\item Compute topic distributions $D_{i,j,t}$ as percentages rather than counts: | ||
\begin{itemize} | ||
\item Normalize counts of topics for each bank $j$ by total topic occurrences in year $t$. | ||
\item Optionally compute weighted topic distributions based on order of topic discussion. | ||
\end{itemize} | ||
\end{enumerate} | ||
\item \textbf{End for.} | ||
|
||
\item Aggregate topic distributions over years and compute embeddings $v_{i,j}$ for each bank: | ||
\begin{itemize} | ||
\item Use embeddings of topic distributions as input data. | ||
\end{itemize} | ||
|
||
\item Apply outlier and clustering detection on $v_{i,j}$: | ||
\begin{enumerate} | ||
\item Apply KMeans clustering to determine the optimal number of clusters $k$ using the silhouette score. | ||
\item Test for multivariate normality of $v_{i,j}$: | ||
\begin{itemize} | ||
\item \textbf{If multivariate normality is not rejected:} | ||
\begin{itemize} | ||
\item Apply robust Mahalanobis distance using Minimum Covariance Determinant (MCD). | ||
\item Apply Local Outlier Factor (LOF) on $v_{i,j}$. | ||
\item Flag points as outliers if identified by both Mahalanobis distance and LOF. | ||
\end{itemize} | ||
\item \textbf{Else:} | ||
\begin{itemize} | ||
\item Apply LOF only, as Mahalanobis distance assumes normality. | ||
\end{itemize} | ||
\end{itemize} | ||
\end{enumerate} | ||
|
||
\item Convert discrete outlier and cluster outputs into continuous scores. | ||
\item Normalize outlier and cluster scores by holding company asset size. | ||
\end{enumerate} | ||
|
||
\textbf{End Algorithm.} | ||
\STATE Convert discrete outlier and cluster outputs into continuous scores. | ||
\STATE Normalize outlier and cluster scores by holding company asset size. | ||
|
||
\RETURN Continuous outlier and cluster scores. | ||
|
||
\end{algorithmic} | ||
\end{algorithm} | ||
|
||
\end{document} |