-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdocument.go
More file actions
156 lines (131 loc) · 4.33 KB
/
document.go
File metadata and controls
156 lines (131 loc) · 4.33 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
// document.go
package knowledgesdk
import (
"context"
"fmt"
"gorm.io/gorm"
)
// AddDocument adds a document to the knowledge base identified by kbID.
//
// It is a convenience wrapper around AddDocumentWithMetadata that supplies an
// empty content type, nil metadata and an empty creator ID. The result and
// error are returned unchanged from that call.
func (k *KnowledgeSDK) AddDocument(ctx context.Context, kbID, name, content string, chunkConfig ChunkConfig) (*Document, error) {
	const (
		noContentType = ""
		noCreatorID   = ""
	)
	var noMetadata MetadataMap // zero value, i.e. nil

	return k.AddDocumentWithMetadata(
		ctx,
		kbID, name, content,
		noContentType,
		noMetadata,
		noCreatorID,
		chunkConfig,
	)
}
// AddDocumentWithCreator adds a document to the knowledge base identified by
// kbID and records creatorID as its creator.
//
// It is a convenience wrapper around AddDocumentWithMetadata that supplies an
// empty content type and nil metadata. The result and error are returned
// unchanged from that call.
func (k *KnowledgeSDK) AddDocumentWithCreator(ctx context.Context, kbID, name, content string, creatorID string, chunkConfig ChunkConfig) (*Document, error) {
	const noContentType = ""
	var noMetadata MetadataMap // zero value, i.e. nil

	return k.AddDocumentWithMetadata(
		ctx,
		kbID, name, content,
		noContentType,
		noMetadata,
		creatorID,
		chunkConfig,
	)
}
// splitDocumentIntoChunks splits content into chunks and stores one Chunk row
// per piece for document docID through the supplied transaction tx.
//
// Chunks already stored for docID are deleted first, so the function can be
// re-run for a document whose content changed. Content is split on runes, not
// bytes, so multi-byte UTF-8 characters are never cut in half. Each chunk is
// config.ChunkSize runes long (the last one may be shorter) and consecutive
// chunks start config.ChunkSize-config.Overlap runes apart.
//
// When content is non-empty, config must satisfy ChunkSize > 0 and
// 0 <= Overlap < ChunkSize; otherwise an error is returned before the database
// is touched. Without this check a non-positive step would keep the loop from
// ever advancing (inserting chunks forever) or produce an invalid slice range.
// Empty content is accepted with any config and simply clears existing chunks.
//
// ctx is not used directly here; database calls go through tx.
func (k *KnowledgeSDK) splitDocumentIntoChunks(ctx context.Context, tx *gorm.DB, docID, content string, config ChunkConfig) error {
	contentRunes := []rune(content)
	contentLength := len(contentRunes)

	// Distance between the starts of two consecutive chunks.
	step := config.ChunkSize - config.Overlap
	if contentLength > 0 && (config.ChunkSize <= 0 || config.Overlap < 0 || step <= 0) {
		return fmt.Errorf("无效的分块配置: ChunkSize=%d, Overlap=%d", config.ChunkSize, config.Overlap)
	}

	// Remove chunks left over from a previous version of the document.
	if err := tx.Where("document_id = ?", docID).Delete(&Chunk{}).Error; err != nil {
		return fmt.Errorf("删除已有分块失败: %w", err)
	}

	chunkIndex := 0
	for pos := 0; pos < contentLength; {
		// Clamp the window to the end of the content. Comparing against the
		// remaining length avoids overflowing pos+ChunkSize for huge sizes.
		endPos := contentLength
		if config.ChunkSize < contentLength-pos {
			endPos = pos + config.ChunkSize
		}

		chunk := NewChunk(docID, chunkIndex, string(contentRunes[pos:endPos]))
		if err := tx.Create(chunk).Error; err != nil {
			return fmt.Errorf("创建分块失败: %w", err)
		}
		chunkIndex++

		// Stop once the next window would start at or beyond the end.
		if contentLength-pos <= step {
			break
		}
		pos += step
	}
	return nil
}
// GetDocument loads the document whose document_id column equals docID.
//
// Any error reported by the query is wrapped and returned together with a nil
// document.
func (k *KnowledgeSDK) GetDocument(ctx context.Context, docID string) (*Document, error) {
	found := new(Document)

	query := k.db.WithContext(ctx).Where("document_id = ?", docID)
	if err := query.First(found).Error; err != nil {
		return nil, fmt.Errorf("获取文档失败: %w", err)
	}

	return found, nil
}
// GetDocumentWithChunks loads the document whose document_id column equals
// docID and fills its Chunks field with every chunk row carrying the same
// document_id.
//
// The two lookups are separate queries. An error from either one is wrapped
// and returned together with a nil document. No explicit ordering is applied
// to the chunk query.
func (k *KnowledgeSDK) GetDocumentWithChunks(ctx context.Context, docID string) (*Document, error) {
	result := new(Document)
	if err := k.db.WithContext(ctx).Where("document_id = ?", docID).First(result).Error; err != nil {
		return nil, fmt.Errorf("获取文档失败: %w", err)
	}

	// Read the chunks straight into the document's Chunks field.
	if err := k.db.WithContext(ctx).Where("document_id = ?", docID).Find(&result.Chunks).Error; err != nil {
		return nil, fmt.Errorf("获取文档分块失败: %w", err)
	}

	return result, nil
}
// DeleteDocument deletes the document whose document_id column equals docID.
// The delete runs inside a database transaction; a failure is wrapped and
// returned.
//
// NOTE(review): the original comment states that deleting the document also
// removes its chunks by cascade. That depends on schema/constraints defined
// outside this file — confirm.
func (k *KnowledgeSDK) DeleteDocument(ctx context.Context, docID string) error {
	removeDocument := func(tx *gorm.DB) error {
		err := tx.Where("document_id = ?", docID).Delete(&Document{}).Error
		if err != nil {
			return fmt.Errorf("删除文档失败: %w", err)
		}
		return nil
	}

	return k.db.WithContext(ctx).Transaction(removeDocument)
}
// AddDocumentWithMetadata creates a document in the knowledge base identified
// by kbID and splits it into chunks right away.
//
// Parameters:
//   - kbID, name, content: target knowledge base, document name and raw text.
//   - contentType, metadata, creatorID: stored on the document as given.
//   - chunkConfig: chunk size and overlap passed to splitDocumentIntoChunks.
//
// The knowledge base must exist; otherwise an error is returned before
// anything is written. Creating the document row, writing its chunks and the
// final status update all happen in a single transaction. If any step fails
// the transaction is rolled back, nothing is persisted, and a nil document is
// returned with the wrapped error. On success the returned document has
// status DocStatusSplitSuccess.
func (k *KnowledgeSDK) AddDocumentWithMetadata(
	ctx context.Context,
	kbID, name, content string,
	contentType string,
	metadata MetadataMap,
	creatorID string,
	chunkConfig ChunkConfig,
) (*Document, error) {
	// Make sure the target knowledge base exists before writing anything.
	var kb KnowledgeBase
	if err := k.db.WithContext(ctx).Where("kb_id = ?", kbID).First(&kb).Error; err != nil {
		return nil, fmt.Errorf("找不到知识库: %w", err)
	}

	doc := NewDocument(kbID, name, content, creatorID)
	doc.ContentType = String(contentType)
	doc.Metadata = metadata
	// The text is supplied directly, so the document starts in the
	// "extraction succeeded" state, ready to be split.
	doc.Status = String(DocStatusExtractSuccess)

	err := k.db.WithContext(ctx).Transaction(func(tx *gorm.DB) error {
		if err := tx.Create(doc).Error; err != nil {
			return fmt.Errorf("创建文档失败: %w", err)
		}

		if err := k.splitDocumentIntoChunks(ctx, tx, StringValue(doc.ID), StringValue(doc.OriginalContent), chunkConfig); err != nil {
			// Returning an error rolls back the whole transaction, including
			// the document row created above. Writing a "split failed" status
			// through tx here would be discarded by that rollback, so no
			// status update is attempted.
			return fmt.Errorf("文档分块失败: %w", err)
		}

		// Splitting succeeded; the document now waits for indexing.
		doc.Status = String(DocStatusSplitSuccess)
		if err := tx.Save(doc).Error; err != nil {
			return fmt.Errorf("更新文档状态失败: %w", err)
		}
		return nil
	})
	if err != nil {
		return nil, err
	}

	return doc, nil
}