Skip to content

Commit 2b458e1

Browse files
committed
Support S3 metadata
Signed-off-by: Denis Jannot <denis.jannot@solo.io>
1 parent c8d21ee commit 2b458e1

3 files changed

Lines changed: 28 additions & 3 deletions

File tree

README.md

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -229,6 +229,8 @@ Configuration is managed through two files:
229229
* `encoding`: (Optional) Text file encoding (defaults to `'utf8'`). Does not apply to binary files (PDF, DOC, DOCX).
230230
* `url_rewrite_prefix`: (Optional) URL prefix to rewrite `s3://` URLs (e.g., `'https://docs.example.com'`).
231231

232+
**S3 user metadata resolution:** The `product_name` and `version` fields support a `metadata(...)` syntax that dynamically resolves their values from S3 object user metadata. For example, `product_name: 'metadata(x-amz-meta-product-name)'` sets `product_name` to the value of the `x-amz-meta-product-name` user metadata on each S3 object. The `x-amz-meta-` prefix is optional inside the parentheses, so `metadata(product-name)` resolves the same key. If the metadata key doesn't exist on an object, an empty string is used. Literal values (without the `metadata(...)` wrapper) work as before.
233+
232234
Authentication uses the AWS SDK default credential chain: environment variables (`AWS_ACCESS_KEY_ID`, `AWS_SECRET_ACCESS_KEY`), `~/.aws/credentials`, IAM roles, etc.
233235
234236
Incremental sync tracks object `LastModified` timestamps so only new or updated objects are processed on subsequent runs. Deleted objects are automatically cleaned up.
@@ -365,7 +367,7 @@ Configuration is managed through two files:
365367
366368
# S3 bucket source example
367369
- type: 's3'
368-
product_name: 'my-docs'
370+
product_name: 'metadata(x-amz-meta-product-name)'
369371
version: 'latest'
370372
bucket: 'my-documentation-bucket'
371373
prefix: 'docs/v2/'

doc2vec.ts

Lines changed: 24 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -851,7 +851,15 @@ export class Doc2Vec {
851851
fileUrl = `s3://${config.bucket}/${obj.key}`;
852852
}
853853

854-
const chunks = await this.contentProcessor.chunkMarkdown(content, config, fileUrl);
854+
// Resolve metadata(...) references in product_name and version
855+
const s3Meta = getResponse.Metadata || {};
856+
const resolvedConfig = {
857+
...config,
858+
product_name: this.resolveS3MetadataValue(config.product_name, s3Meta),
859+
version: this.resolveS3MetadataValue(config.version, s3Meta),
860+
};
861+
862+
const chunks = await this.contentProcessor.chunkMarkdown(content, resolvedConfig, fileUrl);
855863
logger.info(`Created ${chunks.length} chunks for ${obj.key}`);
856864

857865
await this.processChunksForUrl(chunks, fileUrl, dbConnection, logger);
@@ -901,6 +909,21 @@ export class Doc2Vec {
901909
logger.info(`Finished processing S3 bucket: ${config.bucket}`);
902910
}
903911

912+
/**
913+
* Resolves a config value that may use the metadata(...) syntax.
914+
* e.g. "metadata(x-amz-meta-product-name)" looks up "product-name" in the S3 object's user metadata.
915+
* Returns the original value if no metadata(...) pattern is found.
916+
* Returns empty string if the referenced metadata key doesn't exist on the object.
917+
*/
918+
private resolveS3MetadataValue(configValue: string, s3Metadata: Record<string, string>): string {
919+
const match = configValue.match(/^metadata\((.+)\)$/);
920+
if (!match) return configValue;
921+
const metaKey = match[1];
922+
// AWS SDK returns user metadata keys without the x-amz-meta- prefix
923+
const lookupKey = metaKey.replace(/^x-amz-meta-/, '');
924+
return s3Metadata[lookupKey] ?? '';
925+
}
926+
904927
private async processCodeSource(config: CodeSourceConfig, parentLogger: Logger): Promise<void> {
905928
const logger = parentLogger.child('process');
906929
logger.info(`Starting processing for code source (${config.source})`);

package.json

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
11
{
22
"name": "doc2vec",
3-
"version": "2.8.0",
3+
"version": "2.9.0",
44
"type": "commonjs",
55
"description": "",
66
"main": "dist/doc2vec.js",

0 commit comments

Comments
 (0)