Update Lit
brianferrell787 authored Dec 13, 2024
1 parent fe50c04 commit e559ae5
Showing 1 changed file with 0 additions and 97 deletions.
97 changes: 0 additions & 97 deletions Lit
@@ -1,98 +1 @@
\documentclass[10pt]{article}
\usepackage[margin=1in]{geometry}
\usepackage{algorithm}
\usepackage{algpseudocode}
\usepackage{amsmath}

\begin{document}

\floatname{algorithm}{Algorithm}
\renewcommand{\algorithmicrequire}{\textbf{Input:}}
\renewcommand{\algorithmicensure}{\textbf{Output:}}

\begin{algorithm}
\caption{Outlier and Clustering Detection for Bank Topic Distributions}
\begin{algorithmic}[1]
\Require
\begin{itemize}
\item $I$: Set of SEC filing items (e.g., ``Risk Factors,'' ``Item 7'')
\item $J$: Set of banks
\item $T$: Set of years
\end{itemize}
\Ensure
Outlier and cluster detection scores for topic distributions across banks in each year.
\For {each item $i \in I$}
\State Construct a topic model $M_i$ for $i$ using BERTopic.
\State Perform hierarchical clustering on $M_i$ to reduce dimensionality by grouping topics into parent topics.
\For {each year $t \in T$}
\State Compute topic distributions for all banks $j \in J$:
\State \hspace{\algorithmicindent} Split texts into chunks using semantic or paragraph chunking.
\State \hspace{\algorithmicindent} Assign each chunk to a topic from the reduced topic hierarchy.
\State \hspace{\algorithmicindent} Calculate the percentage distribution of topics for each bank.
\EndFor
\EndFor
\For {each year $t \in T$}
\State Extract the topic distribution vectors $v_{j,t}$ for all banks $j$.
\State Normalize $v_{j,t}$ using a robust scaler to handle extreme values.
\State Apply dimensionality reduction (e.g., PCA) to $v_{j,t}$ while retaining at least 95\% of variance.
\State Perform $k$-means clustering on the reduced vectors $v_{j,t}^{PCA}$, choosing the number of clusters by the best silhouette score.
\State Test $v_{j,t}^{PCA}$ for multivariate normality.
\If {normality is \textbf{True}}
\State Apply robust Mahalanobis distance to identify potential outliers.
\State Apply Local Outlier Factor (LOF) to enhance detection accuracy.
\State Flag a point as an outlier if it is identified by both Mahalanobis distance and LOF.
\Else
\State Apply LOF only, as Mahalanobis distance assumes normality.
\EndIf
\EndFor
\end{algorithmic}
\end{algorithm}
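As an illustration of the topic-modelling loop above, here is a minimal Python sketch, assuming BERTopic's public API. The `chunks_by_bank` mapping is hypothetical, chunking itself (semantic or paragraph) is left outside the sketch, and BERTopic's built-in `reduce_topics` stands in for the hierarchical grouping into parent topics.

from collections import Counter
from bertopic import BERTopic

def topic_distributions(chunks_by_bank, nr_parent_topics=20):
    """chunks_by_bank: hypothetical {(bank, year): [chunk, ...]} mapping
    for a single filing item; returns percentage topic distributions."""
    # Train one model per filing item on all chunks, all banks, all years.
    all_chunks = [c for chunks in chunks_by_bank.values() for c in chunks]
    model = BERTopic()
    model.fit(all_chunks)

    # Collapse fine-grained topics into a smaller set of parent topics.
    model.reduce_topics(all_chunks, nr_topics=nr_parent_topics)

    # Percentage distribution of parent topics per (bank, year).
    distributions = {}
    for (bank, year), chunks in chunks_by_bank.items():
        topics, _ = model.transform(chunks)
        counts = Counter(t for t in topics if t != -1)  # drop BERTopic's outlier topic -1
        total = sum(counts.values()) or 1
        distributions[(bank, year)] = {t: n / total for t, n in counts.items()}
    return distributions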

\end{document}
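The per-year detection loop could look like the sketch below, assuming scikit-learn (RobustScaler, PCA, KMeans, MinCovDet, LocalOutlierFactor) and the pingouin package for a Henze-Zirkler multivariate normality test; the 97.5% chi-square cutoff and the function name are illustrative choices, not prescribed by the algorithm.

from scipy.stats import chi2
from pingouin import multivariate_normality
from sklearn.cluster import KMeans
from sklearn.covariance import MinCovDet
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import RobustScaler

def detect_outliers_and_clusters(vectors, max_k=10):
    """vectors: (n_banks, n_topics) topic distributions for one year;
    assumes enough banks for clustering (roughly n_banks >= 4)."""
    # Robust scaling to tame extreme values, then PCA keeping >= 95% of variance.
    X = RobustScaler().fit_transform(vectors)
    X = PCA(n_components=0.95).fit_transform(X)

    # k-means, with k chosen by the best silhouette score.
    candidate_ks = range(2, min(max_k, len(X) - 1) + 1)
    best_k = max(candidate_ks, key=lambda k: silhouette_score(
        X, KMeans(n_clusters=k, n_init=10, random_state=0).fit_predict(X)))
    clusters = KMeans(n_clusters=best_k, n_init=10, random_state=0).fit_predict(X)

    # LOF flags (-1 = outlier), used in both branches below.
    lof_flags = LocalOutlierFactor(n_neighbors=min(20, len(X) - 1)).fit_predict(X) == -1

    if multivariate_normality(X, alpha=0.05).normal:
        # Robust (MCD) Mahalanobis; flag points beyond a 97.5% chi-square cutoff,
        # then require agreement of both detectors before flagging an outlier.
        d2 = MinCovDet(random_state=0).fit(X).mahalanobis(X)  # squared distances
        maha_flags = d2 > chi2.ppf(0.975, df=X.shape[1])
        outliers = maha_flags & lof_flags
    else:
        outliers = lof_flags  # Mahalanobis assumes normality; rely on LOF alone
    return clusters, outliers

Requiring Mahalanobis and LOF to agree, as the algorithm specifies, trades some recall for fewer false positives.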




% Alternative version using the algorithm2e package. Note that this block
% requires \usepackage[ruled,vlined]{algorithm2e} in the preamble and sits
% after \end{document}, so it is not compiled as part of the file above.
\begin{algorithm}[H]
\caption{Outlier and Clustering Detection for Bank Topic Distributions}
\KwIn{
\textbf{I}: Set of SEC filing items (e.g., ``Risk Factors,'' ``MD\&A,'' ``Business Overview'') \\
\textbf{J}: Set of banks \\
\textbf{T}: Set of years
}
\KwOut{Outlier and cluster detection scores for topic distributions across banks in each year.}

\ForEach{filing item $i \in I$}{
\ForEach{document $d \in i$}{
Split document $d$ into smaller chunks $c$ (e.g., paragraphs, semantic units)\;
Perform any necessary preprocessing of chunks (e.g., data cleaning)\;
}
Train topic model $M_i$ using BERTopic on all chunks $c$ within all documents $d$ for all years $t$\;
Perform hierarchical clustering on $M_i$ to reduce dimensionality by grouping topics into parent topics\;
\ForEach{year $t \in T$}{
Compute topic distributions for all banks $j \in J$\;
Assign each chunk to a topic from the reduced topic hierarchy\;
Calculate the percentage distribution of topics for each bank\;
}
}

\ForEach{year $t \in T$}{
Extract the topic distribution vectors $v_{j,t}$ for all banks $j$\;
Apply dimensionality reduction (e.g., PCA) to $v_{j,t}$ while retaining at least 95\% of variance\;
Perform clustering (e.g., $k$-means) on the reduced vectors $v_{j,t}^{PCA}$, choosing the number of clusters by the best silhouette score\;
Test $v_{j,t}^{PCA}$ for multivariate normality\;
\eIf{normality is \textbf{True}}{
Apply robust Mahalanobis distance to identify potential outliers\;
Apply Local Outlier Factor (LOF) to enhance detection accuracy\;
Flag a point as an outlier if it is identified by both Mahalanobis distance and LOF\;
}{
Apply LOF only, as Mahalanobis distance assumes normality\;
}
}

\textbf{Optional Features:}
\begin{itemize}
\item Normalize $v_{j,t}$ before dimensionality reduction.
\item Convert outlier and clustering outputs into continuous scores and normalize final scores to account for holding company size (e.g., total assets).
\item If the topic order is relevant (e.g., ``Risk Factors''), create weighted topic distributions $W_{j,t}$ based on the order of topics discussed (see the sketch after this algorithm).
\end{itemize}

\end{algorithm}
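For the last optional feature, here is a sketch of an order-weighted distribution $W_{j,t}$; the source does not fix a weighting scheme, so the linear position weight below (earlier chunks count more) is purely an assumption.

from collections import defaultdict

def weighted_distribution(chunk_topics):
    """chunk_topics: topic id per chunk, in the order discussed in the filing."""
    n = len(chunk_topics)
    weights = defaultdict(float)
    for i, topic in enumerate(chunk_topics):
        weights[topic] += (n - i) / n  # assumed linear decay with position
    total = sum(weights.values()) or 1.0
    return {topic: w / total for topic, w in weights.items()}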
