Merge pull request #114 from koaning/optimize-performance
Optimize performance in various modules
koaning authored Jan 18, 2025
2 parents dd8072b + 8e3a374 commit ec51024
Showing 5 changed files with 36 additions and 23 deletions.
13 changes: 8 additions & 5 deletions embetter/external/_cohere.py
@@ -1,13 +1,14 @@
 import os
 import numpy as np
+from itertools import islice
 
 from embetter.base import EmbetterBase
 
 
 def _batch(iterable, n=1):
-    len_iter = len(iterable)
-    for ndx in range(0, len_iter, n):
-        yield iterable[ndx : min(ndx + n, len_iter)]
+    it = iter(iterable)
+    while batch := list(islice(it, n)):
+        yield batch
 
 
 class CohereEncoder(EmbetterBase):
@@ -27,6 +28,7 @@ class CohereEncoder(EmbetterBase):
     Arguments:
         model: name of model, can be "small" or "large"
+        batch_size: Batch size to send to Cohere.
     **Usage**:
@@ -67,16 +69,17 @@ class CohereEncoder(EmbetterBase):
     ```
     """
 
-    def __init__(self, model="large"):
+    def __init__(self, model="large", batch_size=10):
         from cohere import Client
 
         self.client = Client(os.getenv("COHERE_KEY"))
         self.model = model
+        self.batch_size = batch_size
 
     def transform(self, X, y=None):
         """Transforms the text into a numeric representation."""
         result = []
-        for b in _batch(X, 10):
+        for b in _batch(X, self.batch_size):
             response = self.client.embed(b)
             result.extend(response.embeddings)
         return np.array(result)
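Taken together, the `_cohere.py` changes make the Cohere batch size configurable rather than hard-coded at 10. A minimal usage sketch, assuming the `cohere` package is installed, `COHERE_KEY` is set in the environment, and `CohereEncoder` is importable from `embetter.external` (the input texts are illustrative):

```python
from embetter.external import CohereEncoder

texts = ["hello there", "embetter wraps embedding APIs", "batching cuts round-trips"]

# batch_size now controls how many texts go to Cohere per request;
# the default of 10 matches the previously hard-coded value.
enc = CohereEncoder(model="large", batch_size=2)
embeddings = enc.transform(texts)  # shape: (3, embedding_dim)
```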
7 changes: 4 additions & 3 deletions embetter/external/_openai.py
@@ -1,13 +1,14 @@
 import numpy as np
 from openai import OpenAI
+from itertools import islice
 
 from embetter.base import EmbetterBase
 
 
 def _batch(iterable, n=1):
-    len_iter = len(iterable)
-    for ndx in range(0, len_iter, n):
-        yield iterable[ndx : min(ndx + n, len_iter)]
+    it = iter(iterable)
+    while batch := list(islice(it, n)):
+        yield batch
 
 
 class OpenAIEncoder(EmbetterBase):
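`_openai.py` receives the identical `_batch` rewrite. The old version sliced the input, which requires a sequence supporting `len()`; the `islice` version (Python 3.8+ for the walrus operator) accepts any iterable and never materializes more than one batch at a time. A standalone sketch, with the helper copied from the diff and an illustrative generator input:

```python
from itertools import islice

def _batch(iterable, n=1):
    # Pull n items at a time until the iterator is exhausted.
    it = iter(iterable)
    while batch := list(islice(it, n)):
        yield batch

# Generators have no len(), so the old slicing-based version would raise
# a TypeError here; the new version streams them fine.
lazy = (f"doc-{i}" for i in range(7))
print(list(_batch(lazy, 3)))
# [['doc-0', 'doc-1', 'doc-2'], ['doc-3', 'doc-4', 'doc-5'], ['doc-6']]
```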
20 changes: 12 additions & 8 deletions embetter/finetune/_constrastive_learn.py
@@ -104,15 +104,19 @@ def fit(self, X1, X2, y):
         X2_torch = torch.from_numpy(X2).detach().float()
         y_torch = torch.from_numpy(np.array(y)).detach().float()
 
+        dataset = torch.utils.data.TensorDataset(X1_torch, X2_torch, y_torch)
+        dataloader = torch.utils.data.DataLoader(dataset, batch_size=self.batch_size, shuffle=True)
+
         for _ in range(self.epochs):  # loop over the dataset multiple times
-            # zero the parameter gradients
-            optimizer.zero_grad()
-
-            # forward + backward + optimize
-            cos_sim = self.network_(X1_torch, X2_torch)
-            loss = criterion(cos_sim, y_torch)
-            loss.backward()
-            optimizer.step()
+            for batch_X1, batch_X2, batch_y in dataloader:
+                # zero the parameter gradients
+                optimizer.zero_grad()
+
+                # forward + backward + optimize
+                cos_sim = self.network_(batch_X1, batch_X2)
+                loss = criterion(cos_sim, batch_y)
+                loss.backward()
+                optimizer.step()
         return self
 
     def transform(self, X, y=None):
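The substantive change in `_constrastive_learn.py` is a move from full-batch gradient descent (one parameter update per epoch over the entire dataset) to mini-batch SGD via `TensorDataset` and `DataLoader`. A self-contained sketch of the pattern with dummy tensors (the sizes are illustrative):

```python
import torch

# Stand-ins for the paired embeddings X1, X2 and similarity labels y.
X1 = torch.randn(100, 8)
X2 = torch.randn(100, 8)
y = torch.rand(100)

dataset = torch.utils.data.TensorDataset(X1, X2, y)
dataloader = torch.utils.data.DataLoader(dataset, batch_size=16, shuffle=True)

# One pass over the loader = one epoch = ceil(100 / 16) = 7 updates,
# with rows reshuffled every epoch because shuffle=True.
for batch_X1, batch_X2, batch_y in dataloader:
    print(batch_X1.shape, batch_X2.shape, batch_y.shape)
```

Shuffled mini-batches trade one exact gradient per epoch for several noisier ones, which tends to converge faster at these dataset sizes.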
2 changes: 1 addition & 1 deletion embetter/finetune/_contrastive_tuner.py
@@ -1,7 +1,7 @@
 from sklearn.base import BaseEstimator, TransformerMixin
 import random
 from collections import defaultdict
-from itertools import chain, groupby
+from itertools import chain, groupby, islice
 
 import numpy as np
 import torch
17 changes: 11 additions & 6 deletions embetter/finetune/_forward.py
@@ -35,10 +35,11 @@ class FeedForwardTuner(BaseEstimator, TransformerMixin):
         learning_rate: The learning rate of the feed forward model
     """
 
-    def __init__(self, hidden_dim=50, n_epochs=500, learning_rate=0.01) -> None:
+    def __init__(self, hidden_dim=50, n_epochs=500, learning_rate=0.01, batch_size=32) -> None:
         self.hidden_dim = hidden_dim
         self.n_epochs = n_epochs
         self.learning_rate = learning_rate
+        self.batch_size = batch_size
         self.label_enc = LabelEncoder()
 
     def fit(self, X, y):
@@ -66,12 +67,16 @@ def partial_fit(self, X, y, classes=None):
         torch_X = torch.from_numpy(X).detach().float()
         torch_y = torch.from_numpy(self.label_enc.transform(y)).detach()
 
+        dataset = torch.utils.data.TensorDataset(torch_X, torch_y)
+        dataloader = torch.utils.data.DataLoader(dataset, batch_size=self.batch_size, shuffle=True)
+
         for _ in range(self.n_epochs):
-            self._optimizer.zero_grad()
-            out = self._model(torch_X)
-            loss = self._criterion(out, torch_y)
-            loss.backward()
-            self._optimizer.step()
+            for batch_X, batch_y in dataloader:
+                self._optimizer.zero_grad()
+                out = self._model(batch_X)
+                loss = self._criterion(out, batch_y)
+                loss.backward()
+                self._optimizer.step()
 
         return self
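`_forward.py` applies the same mini-batch conversion to `FeedForwardTuner` and adds a `batch_size=32` constructor argument. Since `n_epochs` keeps its default of 500, the number of gradient updates grows from 500 to 500 × ceil(n / 32). A hedged sketch of the effect (the `embetter.finetune` import path and the random data are assumptions for illustration):

```python
import math

import numpy as np
from embetter.finetune import FeedForwardTuner  # assumed public import path

n_samples, batch_size, n_epochs = 1000, 32, 500
print(n_epochs * math.ceil(n_samples / batch_size))  # 16000 updates, was 500

X = np.random.randn(n_samples, 64).astype(np.float32)
y = np.random.randint(0, 2, size=n_samples)

tuner = FeedForwardTuner(hidden_dim=50, n_epochs=n_epochs, batch_size=batch_size)
tuner.fit(X, y)  # each epoch now iterates shuffled mini-batches of 32
```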
