Update Lit

brianferrell787 · Dec 13, 2024 · fe50c04 · fe50c04
1 parent 3532576
commit fe50c04
Showing 1 changed file with 49 additions and 0 deletions.
diff --git a/Lit b/Lit
@@ -47,3 +47,52 @@
 \end{algorithm}
 
 \end{document}
+
+
+
+
+\begin{algorithm}[H]
+\caption{Outlier and Clustering Detection for Bank Topic Distributions}
+\KwIn{
+    \textbf{I}: Set of SEC filing items (e.g., ``Risk Factors,'' ``MD\&A,'' ``Business Overview'') \\
+    \textbf{J}: Set of banks \\
+    \textbf{T}: Set of years
+}
+\KwOut{Outlier and cluster detection scores for topic distributions across banks in each year.}
+
+\ForEach{filing item $i \in I$}{
+    \ForEach{document $d \in i$}{
+        Split document $d$ into smaller chunks $c$ (e.g., paragraphs, semantic units)\;
+        Perform any necessary preprocessing of chunks (e.g., data cleaning)\;
+    }
+    Train topic model $M_i$ using BERTopic on all chunks $c$ from all documents $d \in i$ across all years $t \in T$\;
+    Perform hierarchical clustering on $M_i$ to reduce dimensionality by grouping topics into parent topics\;
+    \ForEach{year $t \in T$}{
+        Compute topic distributions for all banks $j \in J$\;
+        Assign each chunk to a topic from the reduced topic hierarchy\;
+        Calculate the percentage distribution of topics for each bank $j$, yielding the topic distribution vector $v_{j,t}$\;
+    }
+}
+
+\ForEach{year $t \in T$}{
+    Extract the topic distribution vectors $v_{j,t}$ for all banks $j \in J$\;
+    Apply dimensionality reduction (e.g., PCA) to $v_{j,t}$, retaining at least 95\% of the variance, to obtain $v_{j,t}^{\mathrm{PCA}}$\;
+    Perform clustering (e.g., K-means) on the reduced vectors $v_{j,t}^{\mathrm{PCA}}$, choosing the number of clusters that maximizes the silhouette score\;
+    Test $v_{j,t}^{\mathrm{PCA}}$ for multivariate normality\;
+    \eIf{normality is \textbf{True}}{
+        Apply robust Mahalanobis distance to identify potential outliers\;
+        Apply Local Outlier Factor (LOF) to enhance detection accuracy\;
+        Flag a point as an outlier if it is identified by both Mahalanobis distance and LOF\;
+    }{
+        Apply LOF only, as Mahalanobis distance assumes normality, and flag the points it identifies as outliers\;
+    }
+}
+
+\textbf{Optional Features:}
+\begin{itemize}
+    \item Normalize $v_{j,t}$ before dimensionality reduction.
+    \item Convert outlier and clustering outputs into continuous scores and normalize final scores to account for holding company size (e.g., total assets).
+    \item If the order in which topics are discussed is relevant (e.g., in ``Risk Factors''), create weighted topic distributions $W_{j,t}$ based on that order.
+\end{itemize}
+
+\end{algorithm}