Skip to content

Commit

Permalink
Update Lit
Browse files Browse the repository at this point in the history
  • Loading branch information
brianferrell787 authored Dec 13, 2024
1 parent 3532576 commit fe50c04
Showing 1 changed file with 49 additions and 0 deletions.
49 changes: 49 additions & 0 deletions Lit
Original file line number Diff line number Diff line change
Expand Up @@ -47,3 +47,52 @@
\end{algorithm}

\end{document}




% Algorithm: per-filing-item topic modelling, followed by per-year clustering
% and outlier detection on the banks' topic distributions.
% Written with algorithm2e macros (\KwIn, \KwOut, \ForEach, \eIf, \;).
% NOTE(review): in this diff the block is placed after \end{document}; LaTeX
% stops reading at \end{document}, so the block must be moved above it to be
% typeset.
\begin{algorithm}[H]
\caption{Outlier and Cluster Detection for Bank Topic Distributions}
% Sets are typeset in math mode here so they match their uses ($i \in I$, ...).
\KwIn{
  $I$: Set of SEC filing items (e.g., ``Risk Factors,'' ``MD\&A,'' ``Business Overview'') \\
  $J$: Set of banks \\
  $T$: Set of years
}
\KwOut{Outlier and cluster detection scores for topic distributions across banks in each year.}

% Stage 1: one topic model per filing item, then yearly topic distributions.
\ForEach{filing item $i \in I$}{
  \ForEach{document $d \in i$}{
    Split document $d$ into smaller chunks $c$ (e.g., paragraphs, semantic units)\;
    Perform any necessary preprocessing of chunks (e.g., data cleaning)\;
  }
  Train topic model $M_i$ using BERTopic on all chunks $c$ within all documents $d$ for all years $t \in T$\;
  Perform hierarchical clustering on $M_i$ to reduce dimensionality by grouping topics into parent topics\;
  \ForEach{year $t \in T$}{
    % Chunks must be assigned to topics before a distribution can be computed.
    Assign each chunk to a topic from the reduced topic hierarchy\;
    Compute the topic distribution for each bank $j \in J$ as the percentage distribution of topics over its chunks\;
  }
}

% Stage 2: per-year clustering and outlier detection across banks.
% NOTE(review): $v_{j,t}$ carries no filing-item index although the topic
% models are trained per item -- confirm whether the vectors are built per
% item or concatenated across items.
\ForEach{year $t \in T$}{
  Extract the topic distribution vectors $v_{j,t}$ for all banks $j \in J$\;
  Apply dimensionality reduction (e.g., PCA) to $v_{j,t}$ while retaining at least 95\% of variance\;
  Perform clustering (e.g., K-means) on the reduced vectors $v_{j,t}^{\mathrm{PCA}}$, selecting the number of clusters with the optimal silhouette score\;
  Test $v_{j,t}^{\mathrm{PCA}}$ for multivariate normality\;
  \eIf{normality is \textbf{True}}{
    Apply robust Mahalanobis distance to identify potential outliers\;
    Apply Local Outlier Factor (LOF) to enhance detection accuracy\;
    Flag a point as an outlier if it is identified by both Mahalanobis distance and LOF\;
  }{
    Apply LOF only, as Mahalanobis distance assumes normality\;
  }
}

\textbf{Optional Features:}
\begin{itemize}
  \item Normalize $v_{j,t}$ before dimensionality reduction.
  \item Convert outlier and clustering outputs into continuous scores and normalize final scores to account for holding company size (e.g., total assets).
  \item If the topic order is relevant (e.g., in ``Risk Factors''), create weighted topic distributions $W_{j,t}$ based on the order in which topics are discussed.
\end{itemize}

\end{algorithm}

0 comments on commit fe50c04

Please sign in to comment.