Skip to content

Commit

Permalink
Update Lit
Browse files Browse the repository at this point in the history
  • Loading branch information
brianferrell787 authored Dec 13, 2024
1 parent cd927bd commit 92ec97a
Showing 1 changed file with 51 additions and 59 deletions.
110 changes: 51 additions & 59 deletions Lit
Original file line number Diff line number Diff line change
@@ -1,72 +1,64 @@
\documentclass{article}
\usepackage{amsmath}
\usepackage{algorithm}
\usepackage{algorithmic}

\begin{document}

\section*{Outlier/Cluster Detection Algorithm for Bank 10-K Items (Updated)}
\section*{Algorithm: Outlier/Cluster Detection with Topic Distributions}

\textbf{Algorithm 1: Outlier/Cluster Detection Algorithm for Bank 10-K Items (Full Scope)}
\begin{algorithm}
\caption{Outlier/Cluster Detection Algorithm for Bank 10-K Items}
\begin{algorithmic}[1]
\REQUIRE Text data for Item $i$ of bank $j$ over years $t = 1, \dots, T$.
\ENSURE Continuous outlier and cluster scores normalized by holding company asset size.

\FORALL{years $t \in \{1, \dots, T\}$}
\FORALL{banks $j$}
\STATE Extract text $I_{i,j,t}$ for Item $i$ of bank $j$ at year $t$.
\STATE Split $I_{i,j,t}$ into semantic chunks or paragraphs.
\STATE Clean $I_{i,j,t}$ by removing stopwords and performing preprocessing.
\STATE Build topic model using BERTopic:
\begin{enumerate}
\item Generate topics $k$ and embeddings $v_{k}$ for each chunk.
\item Apply hierarchical clustering to group topics into parent topics.
\item Select parent topics based on hierarchical levels or topic coverage.
\end{enumerate}
\STATE Compute topic distributions $D_{i,j,t}$ as percentages:
\begin{enumerate}
\item Normalize counts of topics by total topic occurrences in year $t$.
\item Optionally compute weighted topic distributions based on order of topics discussed.
\end{enumerate}
\ENDFOR
\ENDFOR

\STATE Aggregate topic distributions $D_{i,j,t}$ over all years $t$ for each bank $j$.

\STATE Compute embedding matrix $V_{i,j}$ using aggregated topic distributions.

\STATE Apply clustering and outlier detection:
\begin{enumerate}
\item \textbf{Input:}
\begin{itemize}
\item Item $i$ (e.g., ``Item 3: Legal Proceedings'') for bank $j$ over years $t = 1, \dots, T$.
\end{itemize}

\item \textbf{Output:}
\item Apply KMeans clustering, choosing the number of clusters $K$ that maximizes the silhouette score.
\item Test for multivariate normality of $V_{i,j}$:
\begin{itemize}
\item Continuous outlier and cluster scores normalized by holding company asset size.
\item \textbf{If multivariate normality is not rejected:}
\begin{enumerate}
\item Apply robust Mahalanobis distance using Minimum Covariance Determinant (MCD).
\item Apply Local Outlier Factor (LOF).
\item Flag points as outliers if identified by both Mahalanobis distance and LOF.
\end{enumerate}
\item \textbf{Else:}
\begin{enumerate}
\item Apply only LOF for outlier detection.
\end{enumerate}
\end{itemize}

\item \textbf{For each} year $t = 1, \dots, T$:
\begin{enumerate}
\item Extract text $I_{i,j,t}$ for Item $i$ of bank $j$ at time $t$.
\item Split $I_{i,j,t}$ into semantic chunks of size $n$ or paragraphs.
\item Clean $I_{i,j,t}$ by removing stopwords and preprocessing text.
\item Build a topic model using BERTopic:
\begin{itemize}
\item Generate topics $k$ and topic embeddings $v_{k}$ for each chunk.
\item Apply hierarchical clustering on topic embeddings to reduce dimensionality:
\begin{itemize}
\item Group similar topics into higher-level parent topics.
\item Select parent topics based on hierarchical levels or topic coverage.
\end{itemize}
\end{itemize}
\item Compute topic distributions $D_{i,j,t}$ as percentages rather than counts:
\begin{itemize}
\item Normalize counts of topics for each bank $j$ by total topic occurrences in year $t$.
\item Optionally compute weighted topic distributions based on order of topic discussion.
\end{itemize}
\end{enumerate}
\item \textbf{End for.}

\item Aggregate topic distributions over years and compute embeddings $v_{i,j}$ for each bank:
\begin{itemize}
\item Use embeddings of topic distributions as input data.
\end{itemize}

\item Apply clustering and outlier detection on $v_{i,j}$:
\begin{enumerate}
\item Apply KMeans clustering, choosing the number of clusters $K$ that maximizes the silhouette score.
\item Test for multivariate normality of $v_{i,j}$:
\begin{itemize}
\item \textbf{If multivariate normality is not rejected:}
\begin{itemize}
\item Apply robust Mahalanobis distance using Minimum Covariance Determinant (MCD).
\item Apply Local Outlier Factor (LOF) on $v_{i,j}$.
\item Flag points as outliers if identified by both Mahalanobis distance and LOF.
\end{itemize}
\item \textbf{Else:}
\begin{itemize}
\item Apply LOF only, as Mahalanobis distance assumes normality.
\end{itemize}
\end{itemize}
\end{enumerate}

\item Convert discrete outlier and cluster outputs into continuous scores.
\item Normalize outlier and cluster scores by holding company asset size.
\end{enumerate}

\textbf{End Algorithm.}
\STATE Convert discrete outlier and cluster outputs into continuous scores.
\STATE Normalize outlier and cluster scores by holding company asset size.

\RETURN Continuous outlier and cluster scores, normalized by holding company asset size.

\end{algorithmic}
\end{algorithm}

\end{document}

0 comments on commit 92ec97a

Please sign in to comment.