Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

【RAG】文档上传支持图表解析、表格深度解析 && 切片接口增加知识库ID参数 #703

Merged
merged 1 commit into from
Jan 2, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
59 changes: 34 additions & 25 deletions docs/BasisModule/Platform/KnowledgeBase/knowledgebase.md
Original file line number Diff line number Diff line change
Expand Up @@ -288,7 +288,7 @@ class DocumentProcessOption(BaseModel):
description="模板类型,ppt:模版配置—ppt幻灯片, resume:模版配置—简历文档, paper:模版配置—论文文档, custom:自定义配置—自定义切片, default:自定义配置—默认切分",
enum=["ppt", "paper", "qaPair", "resume", " custom", "default"],
)
parser: Optional[DocumentChoices] = Field(None, description="解析方法(文字提取默认启动,参数不体现,layoutAnalysis版面分析,ocr按需增加)")
parser: Optional[DocumentChoices] = Field(None, description="解析方法(文字提取默认启动,参数不体现,layoutAnalysis版面分析,ocr光学字符识别,pageImageAnalysis文档图片解析,chartAnalysis图表解析,tableAnalysis表格深度解析,按需增加)")
knowledgeAugmentation: Optional[DocumentChoices] = Field(
None, description="知识增强,faq、spokenQuery、spo、shortSummary按需增加。问题生成:faq、spokenQuery,段落摘要:shortSummary,三元组知识抽取:spo"
)
Expand Down Expand Up @@ -662,10 +662,11 @@ for message in doc_list:

#### 方法参数

| 参数名称 | 参数类型 | 是否必传 | 描述 | 示例值 |
| ---------- | -------- | -------- | -------- | -------------- |
| documentId | string | 是 | 文档ID | "正确的文档ID" |
| content | string | 是 | 切片内容 | "内容" |
| 参数名称 | 参数类型 | 是否必传 | 描述 | 示例值 |
| --------------- | -------- | -------- | -------- | -------------- |
| knowledgeBaseId | string | 是 | 知识库ID | |
| documentId | string | 是 | 文档ID | "正确的文档ID" |
| content | string | 是 | 切片内容 | "内容" |

#### 方法返回值

Expand All @@ -686,7 +687,7 @@ os.environ["APPBUILDER_TOKEN"] = "your_appbuilder_token"
my_knowledge_base_id = "your_knowledge_base_id"
my_knowledge = appbuilder.KnowledgeBase(my_knowledge_base_id)
print("知识库ID: ", my_knowledge.knowledge_id)
resp = my_knowledge.create_chunk("your_document_id", "content")
resp = my_knowledge.create_chunk("your_document_id", "content", knowledgebase_id=my_knowledge_base_id)
print("切片ID: ", resp.id)
chunk_id = resp.id
```
Expand All @@ -697,6 +698,7 @@ chunk_id = resp.id

| 参数名称 | 参数类型 | 是否必传 | 描述 | 示例值 |
| ---------- | -------- | ------------ | -------------- | -------------- |
| knowledgeBaseId | string | 是 | 知识库ID | |
| chunkId | string | 是 | 切片ID | "正确的切片ID" |
| content | string | 是 | 切片内容 | "内容" |
| enable | bool | 是 | 是否用该切片 | True |
Expand All @@ -711,16 +713,17 @@ os.environ["APPBUILDER_TOKEN"] = "your_appbuilder_token"
my_knowledge_base_id = "your_knowledge_base_id"
my_knowledge = appbuilder.KnowledgeBase(my_knowledge_base_id)
print("知识库ID: ", my_knowledge.knowledge_id)
my_knowledge.modify_chunk("your_chunk_id", "content", True)
my_knowledge.modify_chunk("your_chunk_id", "content", True, knowledgebase_id=my_knowledge_base_id)
```

### 16. 删除切片`delete_chunk(chunkId: str)`

#### 方法参数

| 参数名称 | 参数类型 | 是否必传 | 描述 | 示例值 |
| -------- | -------- | -------- | ------ | -------------- |
| chunkId | string | 是 | 文档ID | "正确的切片ID" |
| 参数名称 | 参数类型 | 是否必传 | 描述 | 示例值 |
| --------------- | -------- | -------- | -------- | -------------- |
| knowledgeBaseId | string | 是 | 知识库ID | |
| chunkId | string | 是 | 切片ID | "正确的切片ID" |

#### 方法示例

Expand All @@ -732,16 +735,17 @@ os.environ["APPBUILDER_TOKEN"] = "your_appbuilder_token"
my_knowledge_base_id = "your_knowledge_base_id"
my_knowledge = appbuilder.KnowledgeBase(my_knowledge_base_id)
print("知识库ID: ", my_knowledge.knowledge_id)
my_knowledge.delete_chunk("your_chunk_id")
my_knowledge.delete_chunk("your_chunk_id", knowledgebase_id=my_knowledge_base_id)
```

### 17. 获取切片信息`describe_chunk(chunkId: str)`

#### 方法参数

| 参数名称 | 参数类型 | 是否必传 | 描述 | 示例值 |
| -------- | -------- | -------- | ------ | -------------- |
| chunkId | string | 是 | 文档ID | "正确的切片ID" |
| 参数名称 | 参数类型 | 是否必传 | 描述 | 示例值 |
| --------------- | -------- | -------- | -------- | -------------- |
| knowledgeBaseId | string | 是 | 知识库ID | |
| chunkId | string | 是 | 切片ID | "正确的切片ID" |

#### 方法返回值

Expand Down Expand Up @@ -774,7 +778,7 @@ os.environ["APPBUILDER_TOKEN"] = "your_appbuilder_token"
my_knowledge_base_id = "your_knowledge_base_id"
my_knowledge = appbuilder.KnowledgeBase(my_knowledge_base_id)
print("知识库ID: ", my_knowledge.knowledge_id)
resp = my_knowledge.describe_chunk("your_chunk_id")
resp = my_knowledge.describe_chunk("your_chunk_id", knowledgebase_id=my_knowledge_base_id)
print("切片详情:")
print(resp)
```
Expand All @@ -783,12 +787,13 @@ print(resp)

#### 方法参数

| 参数名称 | 参数类型 | 是否必传 | 描述 | 示例值 |
| ---------- | -------- | -------- | ------------------------------------------------------------ | -------------- |
| documentId | string | 是 | 文档ID | "正确的文档ID" |
| marker | string | 否 | 起始位置,切片ID | "正确的切片ID" |
| maxKeys | string | 否 | 返回文档数量大小,默认10,最大值100 | 10 |
| type | string | 否 | 根据类型获取切片列表(RAW、NEW、COPY),RAW:原文切片,NEW:新增切片,COPY:复制切片 | "RAW" |
| 参数名称 | 参数类型 | 是否必传 | 描述 | 示例值 |
| --------------- | -------- | -------- | ------------------------------------------------------------ | -------------- |
| knowledgeBaseId | string | 是 | 知识库ID | |
| documentId | string | 是 | 文档ID | "正确的文档ID" |
| marker | string | 否 | 起始位置,切片ID | "正确的切片ID" |
| maxKeys | int | 否 | 返回文档数量大小,默认10,最大值100 | 10 |
| type | string | 否 | 根据类型获取切片列表(RAW、NEW、COPY),RAW:原文切片,NEW:新增切片,COPY:复制切片 | "RAW" |

#### 方法返回值

Expand Down Expand Up @@ -834,7 +839,7 @@ os.environ["APPBUILDER_TOKEN"] = "your_appbuilder_token"
my_knowledge_base_id = "your_knowledge_base_id"
my_knowledge = appbuilder.KnowledgeBase(my_knowledge_base_id)
print("知识库ID: ", my_knowledge.knowledge_id)
resp = my_knowledge.describe_chunks("your_document_id")
resp = my_knowledge.describe_chunks("your_document_id", knowledgebase_id=my_knowledge_base_id)
print("切片列表:")
print(resp)
```
Expand Down Expand Up @@ -964,7 +969,11 @@ public class KnowledgebaseTest {
@Test
public void testCreateChunk() throws IOException, AppBuilderServerException {
String documentId = "";
Knowledgebase knowledgebase = new Knowledgebase();
// 知识库ID
String knowledgeBaseId = "";
// Appbuilder Token
String secretKey = "";
Knowledgebase knowledgebase = new Knowledgebase(knowledgeBaseId, secretKey);
// 创建切片
String chunkId = knowledgebase.createChunk(documentId, "test");
// 修改切片
Expand Down Expand Up @@ -1189,13 +1198,13 @@ func TestChunk(t *testing.T) {
os.Setenv("APPBUILDER_LOGLEVEL", "DEBUG")
os.Setenv("APPBUILDER_TOKEN", "")
documentID := ""

knowledgeBaseID := ""
config, err := NewSDKConfig("", "")
if err != nil {
t.Fatalf("new http client config failed: %v", err)
}

client, err := NewKnowledgeBase(config)
client, err := NewKnowledgeBaseWithKnowledgeBaseID(knowledgeBaseID, config)
if err != nil {
t.Fatalf("new Knowledge base instance failed")
}
Expand Down
36 changes: 34 additions & 2 deletions go/appbuilder/knowledge_base.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,9 +42,21 @@ func NewKnowledgeBase(config *SDKConfig) (*KnowledgeBase, error) {
return &KnowledgeBase{sdkConfig: config, client: client}, nil
}

// NewKnowledgeBaseWithKnowledgeBaseID builds a KnowledgeBase that remembers
// knowledgeBaseID alongside the SDK configuration.
//
// It returns an "invalid config" error when config is nil. The HTTP client is
// taken from config.HTTPClient; when that is nil, a default http.Client with a
// 60-second timeout is used instead.
func NewKnowledgeBaseWithKnowledgeBaseID(knowledgeBaseID string, config *SDKConfig) (*KnowledgeBase, error) {
	if config == nil {
		return nil, errors.New("invalid config")
	}

	// Keep the nil check on a local of the config field's own type so the
	// fallback decision is made exactly as the field itself would compare.
	httpClient := config.HTTPClient
	if httpClient == nil {
		httpClient = &http.Client{Timeout: 60 * time.Second}
	}

	kb := &KnowledgeBase{
		knowledgeBaseID: knowledgeBaseID,
		sdkConfig:       config,
		client:          httpClient,
	}
	return kb, nil
}

type KnowledgeBase struct {
sdkConfig *SDKConfig
client HTTPClient
knowledgeBaseID string
sdkConfig *SDKConfig
client HTTPClient
}

func (t *KnowledgeBase) CreateDocument(req CreateDocumentRequest) (CreateDocumentResponse, error) {
Expand Down Expand Up @@ -618,6 +630,10 @@ func (t *KnowledgeBase) CreateChunk(req CreateChunkRequest) (string, error) {
if req.ClientToken == "" {
req.ClientToken = uuid.New().String()
}

if req.KnowledgeBaseID == "" && t.knowledgeBaseID != "" {
req.KnowledgeBaseID = t.knowledgeBaseID
}
serviceURL, err := t.sdkConfig.ServiceURLV2("/knowledgeBase?Action=CreateChunk&clientToken=" + req.ClientToken)
if err != nil {
return "", err
Expand Down Expand Up @@ -657,6 +673,10 @@ func (t *KnowledgeBase) ModifyChunk(req ModifyChunkRequest) error {
if req.ClientToken == "" {
req.ClientToken = uuid.New().String()
}

if req.KnowledgeBaseID == "" && t.knowledgeBaseID != "" {
req.KnowledgeBaseID = t.knowledgeBaseID
}
serviceURL, err := t.sdkConfig.ServiceURLV2("/knowledgeBase?Action=ModifyChunk&clientToken=" + req.ClientToken)
if err != nil {
return err
Expand Down Expand Up @@ -715,6 +735,10 @@ func (t *KnowledgeBase) deleteChunk(chunkID string, clientToken string) error {
req := DeleteChunkRequest{
ChunkID: chunkID,
}

if t.knowledgeBaseID != "" {
req.KnowledgeBaseID = t.knowledgeBaseID
}
data, _ := json.Marshal(req)
request.Body = NopCloser(bytes.NewReader(data))
t.sdkConfig.BuildCurlCommand(&request)
Expand Down Expand Up @@ -754,6 +778,10 @@ func (t *KnowledgeBase) DescribeChunk(chunkID string) (DescribeChunkResponse, er
req := DescribeChunkRequest{
ChunkID: chunkID,
}

if t.knowledgeBaseID != "" {
req.KnowledgeBaseID = t.knowledgeBaseID
}
data, _ := json.Marshal(req)
request.Body = NopCloser(bytes.NewReader(data))
t.sdkConfig.BuildCurlCommand(&request)
Expand Down Expand Up @@ -786,6 +814,10 @@ func (t *KnowledgeBase) DescribeChunks(req DescribeChunksRequest) (DescribeChunk
if err != nil {
return DescribeChunksResponse{}, err
}

if req.KnowledgeBaseID == "" && t.knowledgeBaseID != "" {
req.KnowledgeBaseID = t.knowledgeBaseID
}
request.URL = serviceURL
request.Method = "POST"
header.Set("Content-Type", "application/json")
Expand Down
33 changes: 19 additions & 14 deletions go/appbuilder/knowledge_base_data.go
Original file line number Diff line number Diff line change
Expand Up @@ -190,29 +190,33 @@ type UploadDocumentsResponse struct {
}

type CreateChunkRequest struct {
DocumentID string `json:"documentId"`
Content string `json:"content"`
ClientToken string `json:"client_token,omitempty"`
KnowledgeBaseID string `json:"knowledgeBaseId"`
DocumentID string `json:"documentId"`
Content string `json:"content"`
ClientToken string `json:"client_token,omitempty"`
}

type CreateChunkResponse struct {
ID string `json:"id"`
}

type ModifyChunkRequest struct {
ChunkID string `json:"chunkId"`
Content string `json:"content"`
Enable bool `json:"enable"`
ClientToken string `json:"client_token,omitempty"`
KnowledgeBaseID string `json:"knowledgeBaseId"`
ChunkID string `json:"chunkId"`
Content string `json:"content"`
Enable bool `json:"enable"`
ClientToken string `json:"client_token,omitempty"`
}

type DeleteChunkRequest struct {
ChunkID string `json:"chunkId"`
ClientToken string `json:"client_token,omitempty"`
KnowledgeBaseID string `json:"knowledgeBaseId"`
ChunkID string `json:"chunkId"`
ClientToken string `json:"client_token,omitempty"`
}

type DescribeChunkRequest struct {
ChunkID string `json:"chunkId"`
KnowledgeBaseID string `json:"knowledgeBaseId"`
ChunkID string `json:"chunkId"`
}

type DescribeChunkResponse struct {
Expand All @@ -232,10 +236,11 @@ type DescribeChunkResponse struct {
}

type DescribeChunksRequest struct {
DocumnetID string `json:"documentId"`
Marker string `json:"marker,omitempty"`
MaxKeys int `json:"maxKeys,omitempty"`
Type string `json:"type,omitempty"`
KnowledgeBaseID string `json:"knowledgeBaseId"`
DocumnetID string `json:"documentId"`
Marker string `json:"marker,omitempty"`
MaxKeys int `json:"maxKeys,omitempty"`
Type string `json:"type,omitempty"`
}

type DescribeChunksResponse struct {
Expand Down
51 changes: 37 additions & 14 deletions go/appbuilder/knowledge_base_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -822,18 +822,28 @@ func TestChunkError(t *testing.T) {
t.Parallel() // 并发运行
os.Setenv("APPBUILDER_LOGLEVEL", "DEBUG")

documentID := os.Getenv(DocumentIDV3)
config, err := NewSDKConfig("", os.Getenv(SecretKeyV3))
knowledgeBaseID := os.Getenv(DatasetID)
config, err := NewSDKConfig("", os.Getenv(SecretKey))
if err != nil {
t.Logf("%s========== FAIL: %s ==========%s", "\033[31m", t.Name(), "\033[0m")
t.Fatalf("new http client config failed: %v", err)
}

client, err := NewKnowledgeBase(config)
client, err := NewKnowledgeBaseWithKnowledgeBaseID(knowledgeBaseID, config)
if err != nil {
t.Logf("%s========== FAIL: %s ==========%s", "\033[31m", t.Name(), "\033[0m")
t.Fatalf("new Knowledge base instance failed")
}

documentsRes, err := client.GetDocumentList(GetDocumentListRequest{
KnowledgeBaseID: knowledgeBaseID,
})
if err != nil {
t.Logf("%s========== FAIL: %s ==========%s", "\033[31m", t.Name(), "\033[0m")
t.Fatalf("get document list failed: %v", err)
}
documentID := documentsRes.Data[0].ID

var clientT = client.client
var GatewayURL = client.sdkConfig.GatewayURLV2

Expand Down Expand Up @@ -1405,22 +1415,33 @@ func TestChunk(t *testing.T) {
fmt.Fprintf(&logBuffer, format+"\n", args...)
}

documentID := os.Getenv(DocumentIDV3)
config, err := NewSDKConfig("", os.Getenv(SecretKeyV3))
knowledgeBaseID := os.Getenv(DatasetID)
config, err := NewSDKConfig("", os.Getenv(SecretKey))
if err != nil {
t.Logf("%s========== FAIL: %s ==========%s", "\033[31m", t.Name(), "\033[0m")
t.Fatalf("new http client config failed: %v", err)
}

client, err := NewKnowledgeBase(config)
client, err := NewKnowledgeBaseWithKnowledgeBaseID(knowledgeBaseID, config)
if err != nil {
t.Logf("%s========== FAIL: %s ==========%s", "\033[31m", t.Name(), "\033[0m")
t.Fatalf("new Knowledge base instance failed")
}

documentsRes, err := client.GetDocumentList(GetDocumentListRequest{
KnowledgeBaseID: knowledgeBaseID,
})
if err != nil {
t.Logf("%s========== FAIL: %s ==========%s", "\033[31m", t.Name(), "\033[0m")
t.Fatalf("get document list failed: %v", err)
}
log("Documents retrieved: %+v", documentsRes)
documentID := documentsRes.Data[0].ID
// 创建切片
chunkID, err := client.CreateChunk(CreateChunkRequest{
DocumentID: documentID,
Content: "test",
KnowledgeBaseID: knowledgeBaseID,
DocumentID: documentID,
Content: "test",
})
if err != nil {
t.Logf("%s========== FAIL: %s ==========%s", "\033[31m", t.Name(), "\033[0m")
Expand All @@ -1430,9 +1451,10 @@ func TestChunk(t *testing.T) {

// 修改切片
err = client.ModifyChunk(ModifyChunkRequest{
ChunkID: chunkID,
Content: "new test",
Enable: true,
KnowledgeBaseID: knowledgeBaseID,
ChunkID: chunkID,
Content: "new test",
Enable: true,
})
if err != nil {
t.Logf("%s========== FAIL: %s ==========%s", "\033[31m", t.Name(), "\033[0m")
Expand All @@ -1450,9 +1472,10 @@ func TestChunk(t *testing.T) {

// 获取切片列表
describeChunksRes, err := client.DescribeChunks(DescribeChunksRequest{
DocumnetID: documentID,
Marker: chunkID,
MaxKeys: 10,
KnowledgeBaseID: knowledgeBaseID,
DocumnetID: documentID,
Marker: chunkID,
MaxKeys: 10,
})
if err != nil {
t.Logf("%s========== FAIL: %s ==========%s", "\033[31m", t.Name(), "\033[0m")
Expand Down
Loading
Loading