Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add loader, parser, embedding, indexer, retriever examples #72

Merged
merged 4 commits into from
Jan 26, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
/*
* Copyright 2025 CloudWeGo Authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package main

import (
"context"
"time"

"github.com/cloudwego/eino/callbacks"
"github.com/cloudwego/eino/components/document"
"github.com/cloudwego/eino/schema"
)

type customLoaderOptions struct {
Timeout time.Duration
RetryCount int
}

func WithTimeout(timeout time.Duration) document.LoaderOption {
return document.WrapLoaderImplSpecificOptFn(func(o *customLoaderOptions) {
o.Timeout = timeout
})
}

func WithRetryCount(count int) document.LoaderOption {
return document.WrapLoaderImplSpecificOptFn(func(o *customLoaderOptions) {
o.RetryCount = count
})
}

func NewCustomLoader(config *Config) (*CustomLoader, error) {
return &CustomLoader{
timeout: config.DefaultTimeout,
retryCount: config.DefaultRetryCount,
}, nil
}

type CustomLoader struct {
timeout time.Duration
retryCount int
}

type Config struct {
DefaultTimeout time.Duration
DefaultRetryCount int
}

func (l *CustomLoader) Load(ctx context.Context, src document.Source, opts ...document.LoaderOption) ([]*schema.Document, error) {
// 1. 处理 option
options := &customLoaderOptions{
Timeout: l.timeout,
RetryCount: l.retryCount,
}
options = document.GetLoaderImplSpecificOptions(options, opts...)
var err error

// 2. 处理错误,并进行错误回调方法
defer func() {
if err != nil {
callbacks.OnError(ctx, err)
}
}()

// 3. 开始加载前的回调
ctx = callbacks.OnStart(ctx, &document.LoaderCallbackInput{
Source: src,
})

// 4. 执行加载逻辑
docs, err := l.doLoad(ctx, src, options)

if err != nil {
return nil, err
}

ctx = callbacks.OnEnd(ctx, &document.LoaderCallbackOutput{
Source: src,
Docs: docs,
})

return docs, nil
}

func (l *CustomLoader) doLoad(ctx context.Context, src document.Source, opts *customLoaderOptions) ([]*schema.Document, error) {
// 实现文档加载逻辑
// 1. 加载文档内容
// 2. 构造 Document 对象,注意可在 MetaData 中保存文档来源等重要信息
return []*schema.Document{{
Content: "Hello World",
}}, nil
}
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright 2024 CloudWeGo Authors
* Copyright 2025 CloudWeGo Authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -18,35 +18,33 @@ package main

import (
"context"
"fmt"
"os"
"log"
"time"

"github.com/cloudwego/eino-ext/components/embedding/openai"
"github.com/cloudwego/eino/components/document"
)

func main() {
accessKey := os.Getenv("OPENAI_API_KEY")

ctx := context.Background()

var (
defaultDim = 1024
)

embedding, err := openai.NewEmbedder(ctx, &openai.EmbeddingConfig{
APIKey: accessKey,
Model: "text-embedding-3-large",
Dimensions: &defaultDim,
Timeout: 0,
log.Printf("===== call Custom Loader directly =====")
// 初始化 loader
loader, err := NewCustomLoader(&Config{
DefaultTimeout: 10 * time.Second,
DefaultRetryCount: 10,
})
if err != nil {
panic(fmt.Errorf("new embedder error: %v\n", err))
log.Fatalf("NewCustomLoader failed, err=%v", err)
}

resp, err := embedding.EmbedStrings(ctx, []string{"hello", "how are you"})
// 加载文档
filePath := "../../testdata/test.md"
docs, err := loader.Load(ctx, document.Source{
URI: filePath,
})
if err != nil {
panic(fmt.Errorf("generate failed, err=%v", err))
log.Fatalf("loader.Load failed, err=%v", err)
}

fmt.Printf("output=%v", resp)
log.Printf("doc content: %v", docs[0].Content)
}
93 changes: 93 additions & 0 deletions components/document/loader/file/examples/fileloader/main.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
/*
* Copyright 2024 CloudWeGo Authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package main

import (
"context"
"log"

"github.com/cloudwego/eino/callbacks"
"github.com/cloudwego/eino/components/document"
"github.com/cloudwego/eino/compose"
"github.com/cloudwego/eino/schema"
callbacksHelper "github.com/cloudwego/eino/utils/callbacks"

"github.com/cloudwego/eino-ext/components/document/loader/file"
)

func main() {
ctx := context.Background()

log.Printf("===== call File Loader directly =====")
// 初始化 loader (以file loader为例)
loader, err := file.NewFileLoader(ctx, &file.FileLoaderConfig{
// 配置参数
UseNameAsID: true,
})
if err != nil {
log.Fatalf("file.NewFileLoader failed, err=%v", err)
}

// 加载文档
filePath := "../../testdata/test.md"
docs, err := loader.Load(ctx, document.Source{
URI: filePath,
})
if err != nil {
log.Fatalf("loader.Load failed, err=%v", err)
}

log.Printf("doc content: %v", docs[0].Content)
log.Printf("Extension: %s\n", docs[0].MetaData[file.MetaKeyExtension]) // 输出: Extension: .txt
log.Printf("Source: %s\n", docs[0].MetaData[file.MetaKeySource]) // 输出: Source: ./document.txt

log.Printf("===== call File Loader in Chain =====")
// 创建 callback handler
handlerHelper := &callbacksHelper.LoaderCallbackHandler{
OnStart: func(ctx context.Context, info *callbacks.RunInfo, input *document.LoaderCallbackInput) context.Context {
log.Printf("start loading docs...: %s\n", input.Source.URI)
return ctx
},
OnEnd: func(ctx context.Context, info *callbacks.RunInfo, output *document.LoaderCallbackOutput) context.Context {
log.Printf("complete loading docs,total loaded docs: %d\n", len(output.Docs))
return ctx
},
// OnError
}

// 使用 callback handler
handler := callbacksHelper.NewHandlerHelper().
Loader(handlerHelper).
Handler()

chain := compose.NewChain[document.Source, []*schema.Document]()
chain.AppendLoader(loader)
// 在运行时使用
run, err := chain.Compile(ctx)
if err != nil {
log.Fatalf("chain.Compile failed, err=%v", err)
}

outDocs, err := run.Invoke(ctx, document.Source{
URI: filePath,
}, compose.WithCallbacks(handler))
if err != nil {
log.Fatalf("run.Invoke failed, err=%v", err)
}

log.Printf("doc content: %v", outDocs[0].Content)
}
1 change: 1 addition & 0 deletions components/document/loader/file/go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,7 @@ github.com/ugorji/go/codec v1.2.7/go.mod h1:WGN1fab3R1fzQlVQTkfxVtIBhWDRqOviHU95
github.com/x-cray/logrus-prefixed-formatter v0.5.2 h1:00txxvfBM9muc0jiLIEAkAcIMJzfthRT6usrui8uGmg=
github.com/yargevad/filepathx v1.0.0 h1:SYcT+N3tYGi+NvazubCNlvgIPbzAk7i7y2dwg3I5FYc=
github.com/yargevad/filepathx v1.0.0/go.mod h1:BprfX/gpYNJHJfc35GjRRpVcwWXS89gGulUIU5tK3tA=
go.uber.org/mock v0.4.0 h1:VcM4ZOtdbR4f6VXfiOpwpVJDL6lCReaZ6mw31wqh7KU=
golang.org/x/arch v0.0.0-20210923205945-b76863e36670 h1:18EFjUmQOcUvxNYSkA6jO9VAiXCnxFY6NyDX0bHDmkU=
golang.org/x/arch v0.0.0-20210923205945-b76863e36670/go.mod h1:5om86z9Hs0C8fWVUuoMHwpExlXzs5Tkyp9hOrfG7pp8=
golang.org/x/crypto v0.0.0-20180904163835-0709b304e793/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4=
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
/*
* Copyright 2025 CloudWeGo Authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package main

import (
"context"
"log"

"github.com/cloudwego/eino/callbacks"
"github.com/cloudwego/eino/components/document"
"github.com/cloudwego/eino/compose"
"github.com/cloudwego/eino/schema"
callbacksHelper "github.com/cloudwego/eino/utils/callbacks"

"github.com/cloudwego/eino-ext/components/document/transformer/splitter/markdown"
)

func main() {
ctx := context.Background()

// 初始化 transformer (以 markdown 为例)
transformer, err := markdown.NewHeaderSplitter(ctx, &markdown.HeaderConfig{
// 配置参数
Headers: map[string]string{
"##": "headerNameOfLevel2",
},
})
if err != nil {
log.Fatalf("markdown.NewHeaderSplitter failed, err=%v", err)
}

markdownDoc := &schema.Document{
Content: "## Title 1\nHello Word\n## Title 2\nWord Hello",
}

log.Printf("===== call Header Splitter directly =====")

// 转换文档
transformedDocs, err := transformer.Transform(ctx, []*schema.Document{markdownDoc})
if err != nil {
log.Fatalf("transformer.Transform failed, err=%v", err)
}

for idx, doc := range transformedDocs {
log.Printf("doc segment %v: %v", idx, doc.Content)
}

log.Printf("===== call Header Splitter in chain =====")

// 创建 callback handler
handlerHelper := &callbacksHelper.TransformerCallbackHandler{
OnStart: func(ctx context.Context, info *callbacks.RunInfo, input *document.TransformerCallbackInput) context.Context {
log.Printf("input access, len: %v, content: %s\n", len(input.Input), input.Input[0].Content)
return ctx
},
OnEnd: func(ctx context.Context, info *callbacks.RunInfo, output *document.TransformerCallbackOutput) context.Context {
log.Printf("output finished, len: %v\n", len(output.Output))
return ctx
},
// OnError
}

// 使用 callback handler
handler := callbacksHelper.NewHandlerHelper().
Transformer(handlerHelper).
Handler()

chain := compose.NewChain[[]*schema.Document, []*schema.Document]()
chain.AppendDocumentTransformer(transformer)

// 在运行时使用
run, err := chain.Compile(ctx)
if err != nil {
log.Fatalf("chain.Compile failed, err=%v", err)
}

outDocs, err := run.Invoke(ctx, []*schema.Document{markdownDoc}, compose.WithCallbacks(handler))
if err != nil {
log.Fatalf("run.Invoke failed, err=%v", err)
}

for idx, doc := range outDocs {
log.Printf("doc segment %v: %v", idx, doc.Content)
}
}
23 changes: 22 additions & 1 deletion components/document/transformer/splitter/markdown/header.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,28 @@ import (
)

type HeaderConfig struct {
// Headers specify the headers to be identified and their names in document metadata. Headers can only consist of '#'.
// Headers specify the headers to be identified and their names in document metadata.
// Headers can only consist of '#'.
// e.g.
// // the Header Config:
// config := &HeaderConfig{
// Headers: map[string]string{ "##": "headerNameOfLevel2" },
// TrimHeaders: false,
// }
//
// // the original document:
// originDoc := &schema.Document{
// Content: "hell\n##Title 2\n hello world",
// }
//
// // one of the split documents:
// splitDoc := &schema.Document{
// Content: "##Title 2\n hello world",
// Metadata: map[string]any{
// // other fields
// "headerNameOfLevel2": "Title 2",
// },
// }
Headers map[string]string
// TrimHeaders specify if results contain header lines.
TrimHeaders bool
Expand Down
Loading
Loading