Update Lit

brianferrell787 · Dec 13, 2024 · e559ae5 · e559ae5
1 parent fe50c04
commit e559ae5
Showing 1 changed file with 0 additions and 97 deletions.
diff --git a/Lit b/Lit
@@ -1,98 +1 @@
-\documentclass[10pt]{article}
-\usepackage[margin=1in]{geometry}
-\usepackage{algorithm}
-\usepackage{algpseudocode}
-\usepackage{amsmath}
 
-\begin{document}
-
-\floatname{algorithm}{Algorithm}
-\renewcommand{\algorithmicrequire}{\textbf{Input:}}
-\renewcommand{\algorithmicensure}{\textbf{Output:}}
-
-\begin{algorithm}
-\caption{Outlier and Clustering Detection for Bank Topic Distributions}
-\begin{algorithmic}[1]
-\Require 
-    \begin{itemize}
-        \item $I$: Set of SEC filing items (e.g., ``Risk Factors'', ``Item 7'')
-        \item $J$: Set of banks
-        \item $T$: Set of years
-    \end{itemize}
-\Ensure 
-    Outlier and cluster detection scores for topic distributions across banks in each year.
-\For {each item $i \in I$}
-    \State Construct a topic model $M_i$ for $i$ using BERTopic.
-    \State Perform hierarchical clustering on $M_i$ to reduce dimensionality by grouping topics into parent topics.
-    \State For each year $t$, compute topic distributions for all banks $j \in J$:
-    \State \hspace{\algorithmicindent} Split texts into chunks using semantic or paragraph chunking.
-    \State \hspace{\algorithmicindent} Assign each chunk to a topic from the reduced topic hierarchy.
-    \State \hspace{\algorithmicindent} Calculate the percentage distribution of topics for each bank.
-\EndFor
-\For {each year $t \in T$}
-    \State Extract the topic distribution vectors $v_{j,t}$ for all banks $j$.
-    \State Normalize $v_{j,t}$ using a robust scaler to handle extreme values.
-    \State Apply dimensionality reduction (e.g., PCA) to $v_{j,t}$ while retaining at least 95\% of the variance.
-    \State Perform $k$-means clustering on the reduced vectors $v_{j,t}^{\mathrm{PCA}}$ and select the clustering with the optimal silhouette score.
-    \State Test $v_{j,t}^{\mathrm{PCA}}$ for multivariate normality.
-    \If {normality is \textbf{True}}
-        \State Apply robust Mahalanobis distance to identify potential outliers.
-        \State Apply Local Outlier Factor (LOF) to enhance detection accuracy.
-        \State Flag a point as an outlier if it is identified by both Mahalanobis distance and LOF.
-    \Else
-        \State Apply LOF only, as Mahalanobis distance assumes normality.
-    \EndIf
-\EndFor
-\end{algorithmic}
-\end{algorithm}
-
-\end{document}
-
-
-
-
-\begin{algorithm}[H]
-\caption{Outlier and Clustering Detection for Bank Topic Distributions}
-\KwIn{
-    $I$: Set of SEC filing items (e.g., ``Risk Factors,'' ``MD\&A,'' ``Business Overview'') \\
-    $J$: Set of banks \\
-    $T$: Set of years
-}
-\KwOut{Outlier and cluster detection scores for topic distributions across banks in each year.}
-
-\ForEach{filing item $i \in I$}{
-    \ForEach{document $d \in i$}{
-        Split document $d$ into smaller chunks $c$ (e.g., paragraphs, semantic units)\;
-        Perform any necessary preprocessing of chunks (e.g., data cleaning)\;
-    }
-    Train topic model $M_i$ using BERTopic on all chunks $c$ within all documents $d$ for all years $t$\;
-    Perform hierarchical clustering on $M_i$ to reduce dimensionality by grouping topics into parent topics\;
-    \ForEach{year $t \in T$}{
-        Compute the topic distribution of each bank $j \in J$ for year $t$ using the following two steps\;
-        Assign each chunk to a topic from the reduced topic hierarchy\;
-        Calculate the percentage distribution of topics for each bank\;
-    }
-}
-
-\ForEach{year $t \in T$}{
-    Extract the topic distribution vectors $v_{j,t}$ for all banks $j$\;
-    Apply dimensionality reduction (e.g., PCA) to $v_{j,t}$ while retaining at least 95\% of the variance\;
-    Perform clustering (e.g., $k$-means) on the reduced vectors $v_{j,t}^{\mathrm{PCA}}$ and select the clustering with the optimal silhouette score\;
-    Test $v_{j,t}^{\mathrm{PCA}}$ for multivariate normality\;
-    \eIf{normality is \textbf{True}}{
-        Apply robust Mahalanobis distance to identify potential outliers\;
-        Apply Local Outlier Factor (LOF) to enhance detection accuracy\;
-        Flag a point as an outlier if it is identified by both Mahalanobis distance and LOF\;
-    }{
-        Apply LOF only, as Mahalanobis distance assumes normality\;
-    }
-}
-
-\textbf{Optional Features:}
-\begin{itemize}
-    \item Normalize $v_{j,t}$ before dimensionality reduction.
-    \item Convert outlier and clustering outputs into continuous scores and normalize final scores to account for holding company size (e.g., total assets).
-    \item If the topic order is relevant (e.g., ``Risk Factors''), create weighted topic distributions $W_{j,t}$ based on the order of topics discussed.
-\end{itemize}
-
-\end{algorithm}