Skip to content

Commit

Permalink
add scale_factor and bias to description zscore normalization
Browse files Browse the repository at this point in the history
  • Loading branch information
hawkeye217 committed Oct 7, 2024
1 parent 5cda95f commit e3a81db
Show file tree
Hide file tree
Showing 4 changed files with 17 additions and 29 deletions.
2 changes: 1 addition & 1 deletion frigate/embeddings/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ class EmbeddingsContext:
def __init__(self, db: SqliteVecQueueDatabase):
self.embeddings = Embeddings(db)
self.thumb_stats = ZScoreNormalization()
self.desc_stats = ZScoreNormalization()
self.desc_stats = ZScoreNormalization(scale_factor=2.5, bias=0.5)

# load stats from disk
try:
Expand Down
4 changes: 2 additions & 2 deletions frigate/embeddings/functions/minilm_l6_v2.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ def _download_model(self, path: str):
elif os.path.basename(path) == self.TOKENIZER_FILE:
logger.info("Downloading MiniLM tokenizer")
tokenizer = AutoTokenizer.from_pretrained(
self.MODEL_NAME, clean_up_tokenization_spaces=False
self.MODEL_NAME, clean_up_tokenization_spaces=True
)
tokenizer.save_pretrained(path)

Expand Down Expand Up @@ -78,7 +78,7 @@ def _load_model_and_tokenizer(self):
def _load_tokenizer(self):
tokenizer_path = os.path.join(self.DOWNLOAD_PATH, self.TOKENIZER_FILE)
return AutoTokenizer.from_pretrained(
tokenizer_path, clean_up_tokenization_spaces=False
tokenizer_path, clean_up_tokenization_spaces=True
)

def _load_model(self, path: str, providers: List[str]):
Expand Down
14 changes: 10 additions & 4 deletions frigate/embeddings/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,15 @@


class ZScoreNormalization:
"""Running Z-score normalization for search distance."""

def __init__(self):
def __init__(self, scale_factor: float = 1.0, bias: float = 0.0):
    """Initialize empty running statistics with optional output adjustments.

    Args:
        scale_factor: Multiplier applied to each z-score; adjusts the
            magnitude of every normalized score.
        bias: Constant added to each scaled z-score; a positive value
            shifts the entire normalized distribution upwards.
    """
    # Running statistics start empty.
    self.n = 0
    self.mean = 0
    self.m2 = 0
    # Adjustments applied to every score produced by normalize().
    self.scale_factor = scale_factor
    self.bias = bias

@property
def variance(self):
Expand All @@ -23,7 +26,10 @@ def normalize(self, distances: list[float]):
self._update(distances)
if self.stddev == 0:
return distances
return [(x - self.mean) / self.stddev for x in distances]
return [
(x - self.mean) / self.stddev * self.scale_factor + self.bias
for x in distances
]

def _update(self, distances: list[float]):
for x in distances:
Expand Down
26 changes: 4 additions & 22 deletions web/src/views/search/SearchView.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -189,19 +189,9 @@ export default function SearchView({

// confidence score: maps the normalized search distance to a percentage (lower distance = higher confidence) - probably needs tweaking

const zScoreToConfidence = (score: number, source: string) => {
let midpoint, scale;

if (source === "thumbnail") {
midpoint = 2;
scale = 0.5;
} else {
midpoint = 0.5;
scale = 1.5;
}

const zScoreToConfidence = (score: number) => {
// Sigmoid function: 1 / (1 + e^x)
const confidence = 1 / (1 + Math.exp((score - midpoint) * scale));
const confidence = 1 / (1 + Math.exp(score));

return Math.round(confidence * 100);
};
Expand Down Expand Up @@ -412,21 +402,13 @@ export default function SearchView({
) : (
<LuText className="mr-1 size-3" />
)}
{zScoreToConfidence(
value.search_distance,
value.search_source,
)}
%
{zScoreToConfidence(value.search_distance)}%
</Chip>
</TooltipTrigger>
<TooltipPortal>
<TooltipContent>
Matched {value.search_source} at{" "}
{zScoreToConfidence(
value.search_distance,
value.search_source,
)}
%
{zScoreToConfidence(value.search_distance)}%
</TooltipContent>
</TooltipPortal>
</Tooltip>
Expand Down

0 comments on commit e3a81db

Please sign in to comment.