Added a lightweight, NumPy-based implementation of the `Linear Support Vector Machines Classifier` (#13)

Signed-off-by: Ayush Joshi <[email protected]>
1 parent 54368bb · commit fbecf95
Showing 2 changed files with 249 additions and 0 deletions.
@@ -0,0 +1,62 @@
# Copyright 2023 The AI Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""**Support vector machines (SVMs)** are a set of supervised learning methods | ||
used for classification, regression and outliers detection. | ||
The advantages of support vector machines are: | ||
* Effective in high dimensional spaces. | ||
* Still effective in cases where number of dimensions is greater than the number | ||
of samples. | ||
* Uses a subset of training points in the decision function (called support | ||
vectors), so it is also memory efficient. | ||
* Versatile: different Kernel functions can be specified for the decision | ||
function. Common kernels are provided, but it is also possible to specify | ||
custom kernels. | ||
The disadvantages of support vector machines include: | ||
* If the number of features is much greater than the number of samples, avoid | ||
over-fitting in choosing Kernel functions and regularization term is crucial. | ||
* SVMs do not directly provide probability estimates, these are calculated using | ||
an expensive five-fold cross-validation (see Scores and probabilities, below). | ||
Types of **SVMs**: | ||
* **Linear SVM** | ||
* Linear SVMs use a linear decision boundary to separate the data points of | ||
different classes. When the data can be precisely linearly separated, | ||
linear SVMs are very suitable. This means that a single straight line | ||
(in 2D) or a hyperplane (in higher dimensions) can entirely divide the | ||
data points into their respective classes. A hyperplane that maximizes the | ||
margin between the classes is the decision boundary. | ||
* **Non-Linear SVM** | ||
* Non-Linear SVM can be used to classify data when it cannot be separated | ||
into two classes by a straight line (in the case of 2D). By using kernel | ||
functions, nonlinear SVMs can handle nonlinearly separable data. The | ||
original input data is transformed by these kernel functions into a | ||
higher-dimensional feature space, where the data points can be linearly | ||
separated. A linear SVM is used to locate a nonlinear decision boundary in | ||
this modified space. | ||
The followings are the **SVMs** implementations that `ai` includes: | ||
* `ai.svm.classes.LinearSVC` | ||
""" | ||
|
||
from .classes import LinearSVC |
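
A minimal usage sketch of the class exported above. The import path assumes
this file is `ai/svm/__init__.py`, as `ai.svm.classes.LinearSVC` suggests; the
toy data and parameter values are illustrative assumptions, not part of the
commit:

  import numpy as np
  from ai.svm import LinearSVC

  # Two linearly separable clusters; `fit` maps labels <= 0 to -1 and the
  # rest to +1 internally.
  X = np.array([[2.0, 3.0], [1.0, 2.0], [6.0, 8.0], [7.0, 9.0]])
  y = np.array([0, 0, 1, 1])

  clf = LinearSVC(alpha=0.01, lambda_p=0.01, n_iters=1000).fit(X, y)
  print(clf.predict(X))  # signs of the decision function, i.e. -1.0 / +1.0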
@@ -0,0 +1,187 @@
# Copyright 2023 The AI Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Optional

import numpy as np


class LinearSVC:
  """**Linear SVC classifier** implementation.

  Linear SVMs use a linear decision boundary to separate the data points of
  different classes. When the data can be precisely linearly separated, linear
  SVMs are very suitable. This means that a single straight line (in 2D) or a
  hyperplane (in higher dimensions) can entirely divide the data points into
  their respective classes. A hyperplane that maximizes the margin between the
  classes is the decision boundary.

  The objective of training a `LinearSVC` is to minimize the norm of the
  weight vector :math:`||w||`, which is the slope of the decision function.

  .. math::

    \\min_{{w, b}} \\dfrac{1}{2} w^{T}w = \\dfrac{1}{2} ||w||^{2}

  .. math::

    \\mathrm{subject\\ to}\\ t^{i} (w^{T}x^{i} + b) \\ge 1

  .. note::

    We are minimizing :math:`\\dfrac{1}{2}w^{T}w`, which is equal to
    :math:`\\dfrac{1}{2}||w||^{2}`, rather than minimizing :math:`||w||`.
    Indeed, :math:`\\dfrac{1}{2}||w||^{2}` has a nice and simple derivative
    (just :math:`w`) while :math:`||w||` is not differentiable at
    :math:`w = 0`. Optimization algorithms work much better on differentiable
    functions.

  The cost function used by the **Linear SVM classifier** is the following:

  .. math::

    J(w, b) = \\dfrac{1}{2}w^{T}w + C\\sum_{i=1}^{m}\\max(
      0,
      1 - t^{i}(w^{T}x^{i} + b)
    )

  The loss function used is the **Hinge Loss** function that clips the value
  at :math:`0`:

  .. math::

    \\max(0, 1 - t^{i}(w^{T}x^{i} + b))
  """

  def __init__(
      self,
      *,
      alpha: Optional[np.float16] = .01,
      lambda_p: Optional[np.float32] = .01,
      n_iters: Optional[np.int64] = 1_000
  ):
    """Initializes the model's `alpha`, `lambda` parameter and number of
    iterations.

    Args:
      alpha: Model's learning rate. A high value might overshoot the minimum
        loss, while a low value might make the model take forever to learn.
      lambda_p: Lambda parameter for updating weights.
      n_iters: Maximum number of updates to make over the weights and bias in
        order to reach an efficient prediction that minimizes the loss.
    """
    self._alpha = alpha
    self._lambda_p = lambda_p
    self._n_iters = n_iters

    self._bias = None
    self._weights = None

  def fit(self, X: np.ndarray, y: np.ndarray) -> 'LinearSVC':
    """Fit `LinearSVC` according to X, y.

    The objective of training a `LinearSVC` is to minimize the norm of the
    weight vector :math:`||w||`, which is the slope of the decision function.

    .. math::

      \\min_{{w, b}} \\dfrac{1}{2} w^{T}w = \\dfrac{1}{2} ||w||^{2}

    .. math::

      \\mathrm{subject\\ to}\\ t^{i} (w^{T}x^{i} + b) \\ge 1

    .. note::

      We are minimizing :math:`\\dfrac{1}{2}w^{T}w`, which is equal to
      :math:`\\dfrac{1}{2}||w||^{2}`, rather than minimizing :math:`||w||`.
      Indeed, :math:`\\dfrac{1}{2}||w||^{2}` has a nice and simple derivative
      (just :math:`w`) while :math:`||w||` is not differentiable at
      :math:`w = 0`. Optimization algorithms work much better on
      differentiable functions.

    The cost function used by the **Linear SVM classifier** is the following:

    .. math::

      J(w, b) = \\dfrac{1}{2}w^{T}w + C\\sum_{i=1}^{m}\\max(
        0,
        1 - t^{i}(w^{T}x^{i} + b)
      )

    The loss function used is the **Hinge Loss** function that clips the value
    at :math:`0`:

    .. math::

      \\max(0, 1 - t^{i}(w^{T}x^{i} + b))

    Args:
      X: Training vectors, where `n_samples` is the number of samples and
        `n_features` is the number of features.
      y: Target values.

    Returns:
      Returns the instance itself.
    """
    self._bias = 0
    self._weights = np.random.rand(X.shape[1])

    # We need the decision function to be greater than 1 for all positive
    # training instances, and lower than -1 for all negative training
    # instances. If we define ``t_i = -1`` for negative instances, and
    # ``t_i = 1`` for positive instances, then we can achieve fewer margin
    # violations.
    t = np.where(y <= 0, -1, 1)
    for _ in range(self._n_iters):
      for idx, x_i in enumerate(X):
        # The cost function (and hence its gradient) differs depending on
        # whether the instance satisfies the margin constraint, so the
        # weights and bias are updated differently in each case. Note that
        # the decision function is implemented as ``w.x - b``, i.e. the sign
        # convention for the bias is flipped relative to ``w.x + b`` above.
        if t[idx] * (np.dot(x_i, self._weights) - self._bias) >= 1:
          self._weights -= self._alpha * (2 * self._lambda_p * self._weights)
        else:
          self._weights -= self._alpha * (
              2 * self._lambda_p * self._weights - np.dot(t[idx], x_i)
          )
          self._bias -= self._alpha * t[idx]
    return self

  def predict(self, X: np.ndarray) -> np.ndarray:
    """Predict for `X` using the previously calculated `weights` and `bias`.

    Args:
      X: Feature vector.

    Returns:
      Target vector.

    Raises:
      RuntimeError: If `predict` is called before `fit`.
      ValueError: If the shape of the given `X` differs from the shape of the
        `X` given to the `fit` function.
    """
    if self._weights is None or self._bias is None:
      raise RuntimeError(
          f'{self.__class__.__name__}: predict called before fitting data'
      )

    if X.shape[1] != self._weights.shape[0]:
      raise ValueError(
          f'Number of features {X.shape[1]} does not match previous data '
          f'{self._weights.shape[0]}.'
      )

    return np.sign(np.dot(X, self._weights) - self._bias)
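
To make the cost function from the docstring concrete, here is a small
standalone sketch that evaluates
:math:`J(w, b) = \\dfrac{1}{2}w^{T}w + C\\sum_{i=1}^{m}\\max(0, 1 - t^{i}(w^{T}x^{i} + b))`
on a toy batch. The helper name `svm_cost` and all values are hypothetical,
chosen only for illustration:

  import numpy as np

  def svm_cost(w, b, X, t, C=1.0):
    # Hinge loss, clipped at zero as in the docstring, plus the L2 term.
    margins = 1 - t * (X @ w + b)
    return 0.5 * w @ w + C * np.sum(np.maximum(0.0, margins))

  w = np.array([0.5, -0.25])
  b = 0.1
  X = np.array([[2.0, 3.0], [6.0, 8.0]])
  t = np.array([-1.0, 1.0])  # labels already mapped to {-1, +1}
  print(svm_cost(w, b, X, t))  # 1.50625: only the first instance violates the margin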