Add a 5k token buffer before the end of the context window #1289

Merged 1 commit on Mar 1, 2025
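In outline, this change exports a TOKEN_BUFFER constant of 5,000 tokens from src/core/sliding-window/index.ts and has truncateConversationIfNeeded truncate once the estimated token count comes within that buffer of the allowed window (the context window minus the tokens reserved for the model's output), instead of only after the limit is exceeded. A minimal sketch of the new decision follows, using the numbers from the tests below; shouldTruncate is a hypothetical helper for illustration only and is not part of the diff.

export const TOKEN_BUFFER = 5000

// Hypothetical helper mirroring the condition added to truncateConversationIfNeeded.
// allowedTokens is contextWindow minus the tokens reserved for the model's output.
function shouldTruncate(effectiveTokens: number, allowedTokens: number): boolean {
	// Truncate once we are within TOKEN_BUFFER tokens of the limit,
	// not only after the limit has already been crossed.
	return effectiveTokens > allowedTokens - TOKEN_BUFFER
}

// With a 100,000-token context window and 30,000 tokens reserved for output,
// allowedTokens is 70,000, so truncation now starts above 65,000 tokens.
shouldTruncate(64_999, 70_000) // false - still below the buffered threshold
shouldTruncate(66_000, 70_000) // true  - within 5,000 tokens of the limit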
54 changes: 38 additions & 16 deletions src/core/sliding-window/__tests__/sliding-window.test.ts
@@ -3,7 +3,7 @@
import { Anthropic } from "@anthropic-ai/sdk"

import { ModelInfo } from "../../../shared/api"
import { estimateTokenCount, truncateConversation, truncateConversationIfNeeded } from "../index"
import { TOKEN_BUFFER, estimateTokenCount, truncateConversation, truncateConversationIfNeeded } from "../index"

/**
* Tests for the truncateConversation function
@@ -121,10 +121,10 @@ describe("getMaxTokens", () => {
// Create messages with very small content in the last one to avoid token overflow
const messagesWithSmallContent = [...messages.slice(0, -1), { ...messages[messages.length - 1], content: "" }]

// Below max tokens - no truncation
// Below max tokens and buffer - no truncation
const result1 = truncateConversationIfNeeded({
messages: messagesWithSmallContent,
totalTokens: 49999,
totalTokens: 44999, // Well below threshold + buffer
contextWindow: modelInfo.contextWindow,
maxTokens: modelInfo.maxTokens,
})
@@ -133,7 +133,7 @@
// Above max tokens - truncate
const result2 = truncateConversationIfNeeded({
messages: messagesWithSmallContent,
totalTokens: 50001,
totalTokens: 50001, // Above threshold
contextWindow: modelInfo.contextWindow,
maxTokens: modelInfo.maxTokens,
})
@@ -148,10 +148,10 @@
// Create messages with very small content in the last one to avoid token overflow
const messagesWithSmallContent = [...messages.slice(0, -1), { ...messages[messages.length - 1], content: "" }]

// Below max tokens - no truncation
// Below max tokens and buffer - no truncation
const result1 = truncateConversationIfNeeded({
messages: messagesWithSmallContent,
totalTokens: 79999,
totalTokens: 74999, // Well below threshold + buffer
contextWindow: modelInfo.contextWindow,
maxTokens: modelInfo.maxTokens,
})
@@ -160,7 +160,7 @@
// Above max tokens - truncate
const result2 = truncateConversationIfNeeded({
messages: messagesWithSmallContent,
totalTokens: 80001,
totalTokens: 80001, // Above threshold
contextWindow: modelInfo.contextWindow,
maxTokens: modelInfo.maxTokens,
})
@@ -175,10 +175,10 @@
// Create messages with very small content in the last one to avoid token overflow
const messagesWithSmallContent = [...messages.slice(0, -1), { ...messages[messages.length - 1], content: "" }]

// Below max tokens - no truncation
// Below max tokens and buffer - no truncation
const result1 = truncateConversationIfNeeded({
messages: messagesWithSmallContent,
totalTokens: 39999,
totalTokens: 34999, // Well below threshold + buffer
contextWindow: modelInfo.contextWindow,
maxTokens: modelInfo.maxTokens,
})
@@ -187,7 +187,7 @@
// Above max tokens - truncate
const result2 = truncateConversationIfNeeded({
messages: messagesWithSmallContent,
totalTokens: 40001,
totalTokens: 40001, // Above threshold
contextWindow: modelInfo.contextWindow,
maxTokens: modelInfo.maxTokens,
})
@@ -202,10 +202,10 @@
// Create messages with very small content in the last one to avoid token overflow
const messagesWithSmallContent = [...messages.slice(0, -1), { ...messages[messages.length - 1], content: "" }]

// Below max tokens - no truncation
// Below max tokens and buffer - no truncation
const result1 = truncateConversationIfNeeded({
messages: messagesWithSmallContent,
totalTokens: 169999,
totalTokens: 164999, // Well below threshold + buffer
contextWindow: modelInfo.contextWindow,
maxTokens: modelInfo.maxTokens,
})
@@ -214,7 +214,7 @@
// Above max tokens - truncate
const result2 = truncateConversationIfNeeded({
messages: messagesWithSmallContent,
totalTokens: 170001,
totalTokens: 170001, // Above threshold
contextWindow: modelInfo.contextWindow,
maxTokens: modelInfo.maxTokens,
})
@@ -244,7 +244,7 @@ describe("truncateConversationIfNeeded", () => {
it("should not truncate if tokens are below max tokens threshold", () => {
const modelInfo = createModelInfo(100000, true, 30000)
const maxTokens = 100000 - 30000 // 70000
const totalTokens = 69999 // Below threshold
const totalTokens = 64999 // Well below threshold + buffer

// Create messages with very small content in the last one to avoid token overflow
const messagesWithSmallContent = [...messages.slice(0, -1), { ...messages[messages.length - 1], content: "" }]
@@ -337,8 +337,8 @@
{ role: messages[messages.length - 1].role, content: smallContent },
]

// Set base tokens so total is below threshold even with small content added
const baseTokensForSmall = availableTokens - smallContentTokens - 10
// Set base tokens so total is well below threshold + buffer even with small content added
const baseTokensForSmall = availableTokens - smallContentTokens - TOKEN_BUFFER - 10
const resultWithSmall = truncateConversationIfNeeded({
messages: messagesWithSmallContent,
totalTokens: baseTokensForSmall,
@@ -388,7 +388,29 @@
})
expect(resultWithVeryLarge).not.toEqual(messagesWithVeryLargeContent) // Should truncate
})

it("should truncate if tokens are within TOKEN_BUFFER of the threshold", () => {
const modelInfo = createModelInfo(100000, true, 30000)
const maxTokens = 100000 - 30000 // 70000
const totalTokens = 66000 // Within 5000 of threshold (70000)

// Create messages with very small content in the last one to avoid token overflow
const messagesWithSmallContent = [...messages.slice(0, -1), { ...messages[messages.length - 1], content: "" }]

// When truncating, always uses 0.5 fraction
// With 4 messages after the first, 0.5 fraction means remove 2 messages
const expectedResult = [messagesWithSmallContent[0], messagesWithSmallContent[3], messagesWithSmallContent[4]]

const result = truncateConversationIfNeeded({
messages: messagesWithSmallContent,
totalTokens,
contextWindow: modelInfo.contextWindow,
maxTokens: modelInfo.maxTokens,
})
expect(result).toEqual(expectedResult)
})
})

/**
* Tests for the estimateTokenCount function
*/
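For reference, the expectedResult in the new "within TOKEN_BUFFER" test above follows from the behaviour the test comments describe: the first message is always kept and, at a 0.5 fraction, half of the remaining messages are removed from the front. A rough sketch of that arithmetic, assuming (as the expected result implies, since truncateConversation itself is not changed by this diff) that the removal count is rounded down to an even number:

// Illustrative only; the real truncateConversation lives in
// src/core/sliding-window/index.ts and is unchanged by this PR.
function sketchTruncate<T>(messages: T[], fracToRemove: number): T[] {
	const rest = messages.length - 1
	// Assumed rounding: drop an even number of messages so role pairs stay intact.
	const toRemove = Math.floor((rest * fracToRemove) / 2) * 2
	return [messages[0], ...messages.slice(1 + toRemove)]
}

// Five messages at a 0.5 fraction: 4 * 0.5 = 2 removed, leaving
// messages[0], messages[3] and messages[4] - the expectedResult in the test above.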
6 changes: 4 additions & 2 deletions src/core/sliding-window/index.ts
@@ -3,7 +3,8 @@ import { Anthropic } from "@anthropic-ai/sdk"
import { Tiktoken } from "js-tiktoken/lite"
import o200kBase from "js-tiktoken/ranks/o200k_base"

const TOKEN_FUDGE_FACTOR = 1.5
export const TOKEN_FUDGE_FACTOR = 1.5
export const TOKEN_BUFFER = 5000

/**
* Counts tokens for user content using tiktoken for text
@@ -110,5 +111,6 @@ export function truncateConversationIfNeeded({
const allowedTokens = contextWindow - reservedTokens

// Determine if truncation is needed and apply if necessary
return effectiveTokens < allowedTokens ? messages : truncateConversation(messages, 0.5)
// Truncate if we're within TOKEN_BUFFER of the limit
return effectiveTokens > allowedTokens - TOKEN_BUFFER ? truncateConversation(messages, 0.5) : messages
}
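Taken together, callers see the buffer like this. A hedged usage sketch: the import path and the message literals are illustrative, while the argument shape follows the test calls above.

import { Anthropic } from "@anthropic-ai/sdk"
import { truncateConversationIfNeeded } from "./core/sliding-window"

const messages: Anthropic.Messages.MessageParam[] = [
	{ role: "user", content: "First message" },
	{ role: "assistant", content: "Second message" },
	{ role: "user", content: "Third message" },
]

// A 100,000-token window with 30,000 reserved for output allows 70,000 tokens.
// An estimate of 66,000 is inside the 5,000-token buffer, so a truncated copy
// comes back; at 64,999 the messages would be returned unchanged (assuming the
// final message adds little to the running estimate).
const next = truncateConversationIfNeeded({
	messages,
	totalTokens: 66_000,
	contextWindow: 100_000,
	maxTokens: 30_000,
})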