From 6ad0dff18be38ddb52add79901c097c2a250386b Mon Sep 17 00:00:00 2001
From: Cheng
Date: Mon, 16 Sep 2024 17:18:13 +0900
Subject: [PATCH] Allow providing a batchSize to hint memory cache limit

---
 src/index.ts | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/src/index.ts b/src/index.ts
index 6a7bb66..beebd2e 100644
--- a/src/index.ts
+++ b/src/index.ts
@@ -1,4 +1,4 @@
-import {readFileSync} from 'node:fs'
+import {statSync, readFileSync} from 'node:fs'
 import {TokenizerLoader} from '@lenml/tokenizers';
 import {core as mx, nn} from '@frost-beta/mlx';
 
@@ -35,7 +35,7 @@ export class Clip {
   #imageProcessor?: ClipImageProcessor;
   #model?: ClipModel;
 
-  constructor(public modelDir: string) {}
+  constructor(public modelDir: string, public batchSize?: number) {}
 
   get tokenizer() {
     if (!this.#tokenizer)
@@ -50,8 +50,17 @@
   }
 
   get model() {
-    if (!this.#model)
+    if (!this.#model) {
+      if (this.batchSize) {
+        // When batchSize is hinted, set a cache limit. This is needed because
+        // the model can briefly use a lot of RAM, and MLX's memory cache would
+        // keep the app's RAM usage at that peak. We should eventually fix the
+        // model, but for now setting a cache limit is enough.
+        const {size} = statSync(`${this.modelDir}/model.safetensors`);
+        mx.metal.setCacheLimit(size * (1 + this.batchSize));
+      }
       this.#model = loadModel(this.modelDir);
+    }
     return this.#model;
   }
 
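
From the caller's side, usage would look roughly like the sketch below. The import path and model directory are placeholder assumptions; only the `Clip(modelDir, batchSize?)` constructor and the lazy `model` getter are taken from the patch:

```ts
// Minimal usage sketch (not part of the patch). The import path and the model
// directory are placeholders for illustration.
import {Clip} from './src/index.js';

// Hint how many items will be processed per batch so the `model` getter can
// cap MLX's cache memory before the weights are loaded.
const clip = new Clip('/path/to/clip-model', 8);

// First access loads the model; with batchSize set, the cache limit is first
// computed from the size of model.safetensors times (1 + batchSize).
const model = clip.model;
```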