
Commit

Merge pull request #2 from askorama/docs/adds-comments
general refactor
micheleriva authored Mar 1, 2024
2 parents 5d89f31 + 1397df7 commit 9379abd
Showing 6 changed files with 62 additions and 28 deletions.
3 changes: 3 additions & 0 deletions .github/workflows/deno.yml
@@ -31,3 +31,6 @@ jobs:

       - name: Run tests
         run: deno test -A
+
+      - name: Build Node.js package
+        run: deno run -A dnt.ts
2 changes: 1 addition & 1 deletion README.md
@@ -24,7 +24,7 @@ import { FixedChunker, NLPChunker } from 'jsr:@orama/chunker'

 Orama Chunker is really easy to use. First of all, you have to decide which chunking strategy you want to adopt.

-![Chunking Strategies](/misc/chunking-strategies.png)
+![Chunking Strategies](https://raw.githubusercontent.com/askorama/chunker/main/misc/chunking-strategies.png)

 The **Fixed Chunker** will divide your input text into several pieces of a specified size. It does not consider the semantics of your input text, as its sole purpose is to divide the text and ensure that each piece contains a maximum number of tokens. It is slightly faster and lighter as it requires fewer computations to determine the chunking strategy.
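A minimal sketch of what the fixed strategy does (illustrative only: a naive whitespace "tokenizer" stands in for the library's real tokenizer, and `fixedStyleChunk` is a hypothetical helper, not the package's API):

```typescript
// Illustrative only: whitespace words stand in for real tokens,
// and fixedStyleChunk is a hypothetical helper, not @orama/chunker's API.
function fixedStyleChunk(input: string, maxTokensPerChunk: number): string[] {
  const words = input.split(/\s+/).filter((w) => w.length > 0)
  const chunks: string[] = []
  // Cut purely by token budget; sentence boundaries are ignored.
  for (let i = 0; i < words.length; i += maxTokensPerChunk) {
    chunks.push(words.slice(i, i + maxTokensPerChunk).join(' '))
  }
  return chunks
}
```

Note how a cut can land mid-sentence: that is the trade-off the faster fixed strategy makes.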

18 changes: 18 additions & 0 deletions deno.lock

(Generated lockfile; diff not rendered.)

32 changes: 14 additions & 18 deletions src/common.ts
@@ -1,27 +1,23 @@
-import { AutoTokenizer, env } from 'https://cdn.jsdelivr.net/npm/@xenova/[email protected]'
-
-env.useBrowserCache = false
-env.allowLocalModels = false
+import type { Tiktoken } from 'npm:js-tiktoken'
+import { getEncoding } from 'npm:js-tiktoken'

+/**
+ * Represents a Chunker object that can be used to tokenize input strings and count the number of tokens.
+ */
 export class Chunker {
   protected verbose = false
   protected ready: Promise<boolean>
-  // deno-lint-ignore no-explicit-any
-  private tokenizer: any
+  private tokenizer: Tiktoken

   constructor() {
     this.ready = this.init()
       .then(() => true)
       .catch(() => false)
   }

   private async init() {
-    this.tokenizer = await AutoTokenizer.from_pretrained('Xenova/bert-base-uncased')
+    this.tokenizer = getEncoding('gpt2')
   }

-  public async getNumberOfTokens(input: string): Promise<number> {
-    await this.ready
-    const result = await this.tokenizer(input)
-    return result.input_ids.size
+  /**
+   * Gets the number of tokens in the input string.
+   * @param input - The input string to tokenize.
+   * @returns A promise that resolves with the number of tokens in the input string.
+   */
+  public getNumberOfTokens(input: string): number {
+    return this.tokenizer.encode(input).length
   }
 }
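The refactor above swaps the async transformers.js tokenizer for js-tiktoken's synchronous encoder, so token counting no longer needs `await`. A self-contained sketch of the resulting shape (the whitespace encoder below is a stand-in assumption, not js-tiktoken's `getEncoding('gpt2')`):

```typescript
// Sketch of the refactored Chunker shape. The stand-in encoder
// (whitespace split) replaces js-tiktoken so the example runs
// without external dependencies.
interface TokenizerLike {
  encode(input: string): number[]
}

class ChunkerSketch {
  private tokenizer: TokenizerLike = {
    encode: (s: string) =>
      s.trim() === '' ? [] : s.trim().split(/\s+/).map((_, i) => i),
  }

  // Synchronous: no model download is needed, unlike the old
  // AutoTokenizer.from_pretrained(...) path.
  public getNumberOfTokens(input: string): number {
    return this.tokenizer.encode(input).length
  }
}
```

Dropping the async boundary here is what lets `FixedChunker.chunk` and `NLPChunker.chunk` become synchronous in the files below.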
13 changes: 11 additions & 2 deletions src/fixed.ts
@@ -1,7 +1,16 @@
 import { Chunker } from './common.ts'

+/**
+ * Represents a fixed chunker that splits a string into chunks based on a maximum number of tokens per chunk.
+ */
 export class FixedChunker extends Chunker {
-  public async chunk(input: string, maxTokensPerChunk: number): Promise<string[]> {
+  /**
+   * Splits the input string into chunks based on the maximum number of tokens per chunk.
+   * @param {String} input - The input string to be chunked.
+   * @param {Number} maxTokensPerChunk - The maximum number of tokens allowed per chunk.
+   * @returns An array of strings representing the chunks.
+   */
+  public chunk(input: string, maxTokensPerChunk: number): string[] {
     const words = input.split(/\s+/)
     const chunks: string[] = []

@@ -17,7 +26,7 @@ export class FixedChunker extends Chunker {
     while (low < high) {
       const mid = low + Math.floor((high - low) / 2)
       const testChunk = words.slice(start, mid + 1).join(' ')
-      const tokenCount = await this.getNumberOfTokens(testChunk)
+      const tokenCount = this.getNumberOfTokens(testChunk)

       if (tokenCount <= maxTokensPerChunk) {
         validChunk = testChunk
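The hunk above sits inside FixedChunker's search loop: for each chunk it binary-searches the longest word prefix whose token count fits the budget. A runnable sketch of that step under stated assumptions (a naive whitespace token counter replaces `getNumberOfTokens`, and the bounds/advance logic is reconstructed from the visible hunk, not the full file):

```typescript
// Naive stand-in for Chunker.getNumberOfTokens.
const countTokens = (s: string): number =>
  s.trim() === '' ? 0 : s.trim().split(/\s+/).length

// Find the longest word prefix from `start` that fits the budget.
function largestFittingChunk(
  words: string[],
  start: number,
  maxTokensPerChunk: number,
): { chunk: string; nextStart: number } {
  let low = start
  let high = words.length
  let validChunk = ''
  let nextStart = start + 1 // always advance, even if nothing fits

  while (low < high) {
    const mid = low + Math.floor((high - low) / 2)
    const testChunk = words.slice(start, mid + 1).join(' ')
    if (countTokens(testChunk) <= maxTokensPerChunk) {
      validChunk = testChunk // fits: remember it and try more words
      nextStart = mid + 1
      low = mid + 1
    } else {
      high = mid // too many tokens: shrink the window
    }
  }
  return { chunk: validChunk, nextStart }
}
```

Because each probe re-tokenizes a candidate chunk, making `getNumberOfTokens` synchronous (previous file) removes an `await` from every iteration of this search.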
22 changes: 15 additions & 7 deletions src/nlp.ts
@@ -1,20 +1,28 @@
-import nlp from 'https://esm.sh/[email protected]/one'
+/**
+ * Represents a chunker that uses natural language processing (NLP) to split text into chunks.
+ * This chunker extends the base `Chunker` class.
+ */
+import nlp from 'npm:[email protected]/one'
 import { Chunker } from './common.ts'

 export class NLPChunker extends Chunker {
-  public async chunk(input: string, maxTokensPerChunk: number): Promise<string[]> {
+  /**
+   * Splits the input text into chunks based on the maximum number of tokens per chunk.
+   * @param {String} input - The input text to be chunked.
+   * @param {Number} maxTokensPerChunk - The maximum number of tokens allowed per chunk.
+   * @returns A promise that resolves to an array of chunks.
+   */
+  public chunk(input: string, maxTokensPerChunk: number): string[] {
     const sentences = nlp.tokenize(input).fullSentences().out('array')
     const chunks: string[] = []

     let currentChunk = ''
     for (const sentence of sentences) {
-      const [sentenceTokenCount, currentChunkTokenCount] = await Promise.all([
-        this.getNumberOfTokens(sentence),
-        this.getNumberOfTokens(currentChunk),
-      ])
+      const sentenceTokenCount = this.getNumberOfTokens(sentence)
+      const currentChunkTokenCount = this.getNumberOfTokens(currentChunk)

       if (sentenceTokenCount + currentChunkTokenCount <= maxTokensPerChunk) {
-        currentChunk += (currentChunk ? ' ' : '') + sentence // Ensure space between sentences
+        currentChunk += (currentChunk ? ' ' : '') + sentence
       } else {
         if (currentChunk) {
           chunks.push(currentChunk)
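The NLPChunker loop above greedily packs whole sentences until the next one would overflow the budget. A self-contained sketch of that accumulation (a regex sentence splitter and whitespace token counter stand in for compromise's `tokenize(...).fullSentences()` and the real tokenizer; `packSentences` is a hypothetical name):

```typescript
// Naive stand-ins for compromise's sentence splitter and the tokenizer.
const countTokens = (s: string): number =>
  s.trim() === '' ? 0 : s.trim().split(/\s+/).length

// Greedy sentence packing, mirroring NLPChunker.chunk.
function packSentences(input: string, maxTokensPerChunk: number): string[] {
  const sentences = input.split(/(?<=[.!?])\s+/).filter((s) => s.length > 0)
  const chunks: string[] = []
  let currentChunk = ''
  for (const sentence of sentences) {
    if (countTokens(sentence) + countTokens(currentChunk) <= maxTokensPerChunk) {
      // Still fits: append, with a space between sentences.
      currentChunk += (currentChunk ? ' ' : '') + sentence
    } else {
      // Would overflow: flush the current chunk and start a new one.
      if (currentChunk) chunks.push(currentChunk)
      currentChunk = sentence
    }
  }
  if (currentChunk) chunks.push(currentChunk)
  return chunks
}
```

Unlike the fixed strategy, a sentence is never split across chunks, which is why this path costs extra sentence-segmentation work.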
