Commit: Update Lit
brianferrell787 authored Dec 13, 2024
1 parent fcdb1c7 commit 3532576
Showing 1 changed file with 36 additions and 110 deletions: Lit
\documentclass[10pt]{article}
\usepackage[margin=1in]{geometry}
\usepackage{algorithm}
\usepackage{algpseudocode}
\usepackage{amsmath}

\begin{document}

\section*{Algorithm: Outlier/Cluster Detection with Topic Distributions}
\floatname{algorithm}{Algorithm}
\renewcommand{\algorithmicrequire}{\textbf{Input:}}
\renewcommand{\algorithmicensure}{\textbf{Output:}}

\begin{algorithm}
\caption{Outlier and Clustering Detection for Bank Topic Distributions}
\begin{algorithmic}[1]
\Require
\begin{itemize}
    \item $I$: set of SEC filing items (e.g., ``Risk Factors'', ``Item 7'')
    \item $J$: set of banks
    \item $t \in T$: reporting years
\end{itemize}
\Ensure Continuous outlier and cluster scores for topic distributions across banks in each year, normalized by holding company asset size.
\For {each item $i \in I$}
\State Construct a topic model $M_i$ for $i$ using BERTopic.
\State Perform hierarchical clustering on $M_i$ to reduce dimensionality by grouping topics into parent topics.
\State For each year $t$, compute topic distributions for all banks $j \in J$:
\State \hspace{\algorithmicindent} Clean each text (stopword removal, basic preprocessing) and split it into chunks using semantic or paragraph chunking.
\State \hspace{\algorithmicindent} Assign each chunk to a topic from the reduced topic hierarchy.
\State \hspace{\algorithmicindent} Calculate the percentage distribution of topics for each bank.
\EndFor
\For {each year $t \in T$}
\State Extract the topic distribution vectors $v_{j,t}$ for all banks $j$.
\State Normalize $v_{j,t}$ using a robust scaler to handle extreme values.
\State Apply dimensionality reduction (e.g., PCA) to $v_{j,t}$ while retaining at least 95\% of variance.
\State Perform $k$-means clustering on the reduced vectors $v_{j,t}^{PCA}$, choosing the number of clusters $k$ by the best silhouette score.
\State Test $v_{j,t}^{PCA}$ for multivariate normality.
\If {normality is \textbf{True}}
\State Apply robust Mahalanobis distance to identify potential outliers.
\State Apply Local Outlier Factor (LOF) to enhance detection accuracy.
\State Flag a point as an outlier if it is identified by both Mahalanobis distance and LOF.
\Else
\State Apply LOF only, as Mahalanobis distance assumes normality.
\EndIf
\EndFor
\State Convert the discrete outlier and cluster outputs into continuous scores.
\State Normalize the continuous scores by holding company asset size.
\end{algorithmic}
\end{algorithm}

\end{document}
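The BERTopic steps in the item loop above map directly onto the library's public API. Below is a minimal Python sketch, assuming the chunk-level texts and their bank labels (docs, bank_ids) have already been extracted; the flat reduction to 60 parent topics via reduce_topics is an illustrative stand-in for the hierarchical grouping, not the committed implementation.

from bertopic import BERTopic
import pandas as pd

# One topic model per filing item; docs holds that item's text chunks.
topic_model = BERTopic(min_topic_size=15)      # assumes the default embedder
topics, _ = topic_model.fit_transform(docs)    # one topic id per chunk

# Stand-in for grouping fine-grained topics into parent topics:
# merge down to an assumed 60 parents.
topic_model.reduce_topics(docs, nr_topics=60)
topics = topic_model.topics_

# Percentage topic distribution per bank: the share of each bank's
# chunks assigned to each reduced topic.
df = pd.DataFrame({"bank": bank_ids, "topic": topics})
dist = (df.groupby("bank")["topic"]
          .value_counts(normalize=True)
          .unstack(fill_value=0.0))

BERTopic also exposes hierarchical_topics(docs) if the parent-topic tree itself is needed rather than a flat reduction.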


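For the per-year stage, a sketch of the robust scaling, 95%-variance PCA, and silhouette-based choice of k, assuming dist is the banks-by-topics matrix for one year (e.g., from the previous sketch) and that there are enough banks for clustering to be meaningful:

import numpy as np
from sklearn.preprocessing import RobustScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

X = RobustScaler().fit_transform(dist.values)    # robust to extreme values
X_pca = PCA(n_components=0.95).fit_transform(X)  # keep >= 95% of variance

# Choose k by the best silhouette score over a small candidate range.
best_k, best_score = 2, -1.0
for k in range(2, min(10, len(X_pca))):
    labels = KMeans(n_clusters=k, n_init=10, random_state=0).fit_predict(X_pca)
    score = silhouette_score(X_pca, labels)
    if score > best_score:
        best_k, best_score = k, score

clusters = KMeans(n_clusters=best_k, n_init=10, random_state=0).fit_predict(X_pca)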


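The normality branch could look like the following; the Henze-Zirkler test from pingouin and the 97.5% chi-square cutoff on the squared robust distances are assumptions, since neither algorithm names a specific test or threshold.

import numpy as np
from scipy.stats import chi2
from sklearn.covariance import MinCovDet
from sklearn.neighbors import LocalOutlierFactor
from pingouin import multivariate_normality

hz = multivariate_normality(X_pca, alpha=0.05)   # assumed normality test
lof_flags = LocalOutlierFactor(n_neighbors=20).fit_predict(X_pca) == -1

if hz.normal:
    # Robust Mahalanobis distances via Minimum Covariance Determinant.
    mcd = MinCovDet(random_state=0).fit(X_pca)
    d2 = mcd.mahalanobis(X_pca)                  # squared robust distances
    cutoff = chi2.ppf(0.975, df=X_pca.shape[1])  # assumed threshold
    outliers = lof_flags & (d2 > cutoff)         # flagged by both methods
else:
    outliers = lof_flags                         # Mahalanobis assumes normality

The algorithm2e document below restates the same pipeline and adds the order-weighting and size-normalization steps, which are sketched after it.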


\documentclass[11pt]{article}
\usepackage{algorithm2e}
\usepackage{amsmath}

\begin{document}

\section*{Algorithm: Topic Distribution and Outlier Detection}

\begin{algorithm}[H]
\SetAlgoLined
\KwIn{
SEC Filing Item \(i\) (e.g., ``Item 1A: Risk Factors''),
Financial Institution \(j\),
Reporting Years \(t = 1, 2, \ldots, T\)
}
\KwOut{Clustered topics and detected outliers for Item \(i\) for each institution \(j\)}

\For{each year \(t\)}{
Extract text \(I_{i,j,t}\) for Item \(i\) of institution \(j\) at time \(t\)\;
Split \(I_{i,j,t}\) into semantic chunks using a semantic chunker\;
Build a topic model \(M\) using BERTopic\;
Run hierarchical clustering on the topics of \(M\) to reduce their dimensionality\;
Create topic distributions \(D_{i,j,t}\) (as percentages, not counts) from the chunk-level topic assignments\;
}

\For{each institution \(j\)}{
Aggregate \(D_{i,j,t}\) across years to create temporal vectors\;
Standardize distributions using a robust scaler\;
Apply PCA to reduce dimensionality while retaining 95\% variance\;
Apply KMeans clustering to determine \(k\), the optimal number of clusters, using silhouette scores\;
Test for multivariate normality of the PCA-reduced vectors\;

\eIf{normality is \textbf{True}}{
Apply robust Mahalanobis distance using Minimum Covariance Determinant (MCD)\;
Apply Local Outlier Factor (LOF)\;
Flag a point as an outlier only if it is identified by both methods\;
}{
Apply only LOF for outlier detection\;
}
}

\If{the topic order is relevant (e.g., Risk Factors)}{
Create weighted topic distributions \(W_{i,j,t}\) based on the order of discussion\;
}

Normalize final scores to account for holding company size (e.g., total assets)\;

Convert outlier detection and clustering results into continuous scores for risk assessment\;

\caption{Topic Distribution and Outlier Detection Algorithm}
\end{algorithm}

\end{document}
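A sketch of the order-weighted distributions \(W_{i,j,t}\): the linear decay of weights from 1.0 to 0.5 is an illustrative assumption about how earlier-discussed topics could be emphasized, and chunk_topics is the ordered topic sequence for one bank-year.

import numpy as np

def weighted_topic_distribution(chunk_topics, n_topics):
    # Earlier chunks get larger weights (assumed linear decay).
    n = len(chunk_topics)
    weights = np.linspace(1.0, 0.5, num=n)
    dist = np.zeros(n_topics)
    for w, t in zip(weights, chunk_topics):
        if t >= 0:                    # skip BERTopic's -1 outlier topic
            dist[t] += w
    total = dist.sum()
    return dist / total if total > 0 else dist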

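Finally, one possible reading of the last two steps, turning the discrete flags into continuous, size-adjusted scores: using the negated LOF factor as the raw severity and a log-asset weight is an interpretation of "normalize by holding company asset size", not the committed method. assets is assumed to be an array of total assets aligned with the rows of X_pca.

import numpy as np
from sklearn.neighbors import LocalOutlierFactor

# Continuous severity: the LOF factor itself rather than a -1/1 flag.
lof = LocalOutlierFactor(n_neighbors=20).fit(X_pca)
raw = -lof.negative_outlier_factor_              # larger = more outlying

# Min-max to [0, 1], then weight by log total assets so a flag on a
# larger holding company carries proportionally more weight.
score01 = (raw - raw.min()) / (raw.max() - raw.min() + 1e-12)
size_w = np.log1p(assets) / np.log1p(assets).max()
final_score = score01 * size_w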