-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
fe50c04
commit e559ae5
Showing
1 changed file
with
0 additions
and
97 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,98 +1 @@ | ||
\documentclass[10pt]{article} | ||
\usepackage[margin=1in]{geometry} | ||
\usepackage{algorithm} | ||
\usepackage{algpseudocode} | ||
\usepackage{amsmath} | ||
|
||
\begin{document} | ||
|
||
\floatname{algorithm}{Algorithm} | ||
\renewcommand{\algorithmicrequire}{\textbf{Input:}} | ||
\renewcommand{\algorithmicensure}{\textbf{Output:}} | ||
|
||
\begin{algorithm} | ||
\caption{Outlier and Clustering Detection for Bank Topic Distributions} | ||
\begin{algorithmic}[1] | ||
\Require | ||
\begin{itemize} | ||
\item $I$: Set of SEC filing items (e.g., ``Risk Factors'', ``Item 7'') | ||
\item $J$: Set of banks | ||
\item $T$: Set of years | ||
\end{itemize} | ||
\Ensure | ||
Outlier and cluster detection scores for topic distributions across banks in each year. | ||
\For {each item $i \in I$} | ||
\State Construct a topic model $M_i$ for $i$ using BERTopic. | ||
\State Perform hierarchical clustering on $M_i$ to reduce dimensionality by grouping topics into parent topics. | ||
\State For each year $t$, compute topic distributions for all banks $j \in J$: | ||
\State \hspace{\algorithmicindent} Split texts into chunks using semantic or paragraph chunking. | ||
\State \hspace{\algorithmicindent} Assign each chunk to a topic from the reduced topic hierarchy. | ||
\State \hspace{\algorithmicindent} Calculate the percentage distribution of topics for each bank. | ||
\EndFor | ||
\For {each year $t \in T$} | ||
\State Extract the topic distribution vectors $v_{j,t}$ for all banks $j$. | ||
\State Normalize $v_{j,t}$ using a robust scaler to handle extreme values. | ||
\State Apply dimensionality reduction (e.g., PCA) to $v_{j,t}$ while retaining at least 95\% of variance. | ||
\State Perform $k$-means clustering on reduced vectors $v_{j,t}^{\mathrm{PCA}}$ to determine clusters with the optimal silhouette score. | ||
\State Test $v_{j,t}^{\mathrm{PCA}}$ for multivariate normality. | ||
\If {normality is \textbf{True}} | ||
\State Apply robust Mahalanobis distance to identify potential outliers. | ||
\State Apply Local Outlier Factor (LOF) to enhance detection accuracy. | ||
\State Flag a point as an outlier if it is identified by both Mahalanobis distance and LOF. | ||
\Else | ||
\State Apply LOF only, as Mahalanobis distance assumes normality. | ||
\EndIf | ||
\EndFor | ||
\end{algorithmic} | ||
\end{algorithm} | ||
|
||
\end{document} | ||
|
||
|
||
|
||
|
||
\begin{algorithm}[H] | ||
\caption{Outlier and Clustering Detection for Bank Topic Distributions} | ||
\KwIn{ | ||
$I$: Set of SEC filing items (e.g., ``Risk Factors,'' ``MD\&A,'' ``Business Overview'') \\ | ||
$J$: Set of banks \\ | ||
$T$: Set of years | ||
} | ||
\KwOut{Outlier and cluster detection scores for topic distributions across banks in each year.} | ||
|
||
\ForEach{filing item $i \in I$}{ | ||
\ForEach{document $d \in i$}{ | ||
Split document $d$ into smaller chunks $c$ (e.g., paragraphs, semantic units)\; | ||
Perform any necessary preprocessing of chunks (e.g., data cleaning)\; | ||
} | ||
Train topic model $M_i$ using BERTopic on all chunks $c$ within all documents $d$ for all years $t$\; | ||
Perform hierarchical clustering on $M_i$ to reduce dimensionality by grouping topics into parent topics\; | ||
\ForEach{year $t \in T$}{ | ||
Compute topic distributions for all banks $j \in J$\; | ||
Assign each chunk to a topic from the reduced topic hierarchy\; | ||
Calculate the percentage distribution of topics for each bank\; | ||
} | ||
} | ||
|
||
\ForEach{year $t \in T$}{ | ||
Extract the topic distribution vectors $v_{j,t}$ for all banks $j$\; | ||
Apply dimensionality reduction (e.g., PCA) to $v_{j,t}$ while retaining at least 95\% of variance\; | ||
Perform clustering (e.g., K-means) on reduced vectors $v_{j,t}^{\mathrm{PCA}}$ to determine clusters with the optimal silhouette score\; | ||
Test $v_{j,t}^{\mathrm{PCA}}$ for multivariate normality\; | ||
\eIf{normality is \textbf{True}}{ | ||
Apply robust Mahalanobis distance to identify potential outliers\; | ||
Apply Local Outlier Factor (LOF) to enhance detection accuracy\; | ||
Flag a point as an outlier if it is identified by both Mahalanobis distance and LOF\; | ||
}{ | ||
Apply LOF only, as Mahalanobis distance assumes normality\; | ||
} | ||
} | ||
|
||
\textbf{Optional Features:} | ||
\begin{itemize} | ||
\item Normalize $v_{j,t}$ before dimensionality reduction. | ||
\item Convert outlier and clustering outputs into continuous scores and normalize final scores to account for holding company size (e.g., total assets). | ||
\item If the topic order is relevant (e.g., ``Risk Factors''), create weighted topic distributions $W_{j,t}$ based on the order of topics discussed. | ||
\end{itemize} | ||
|
||
\end{algorithm} |