Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
111 changes: 111 additions & 0 deletions ui/__tests__/scientific-sources.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
import {
buildSemanticScholarEntries,
buildUploadedDocumentEntry,
detectScientificSection,
formatRetrievedScientificSources,
parseBoundedInteger,
parseSemanticScholarReferences,
} from '@/utils/server/scientific-sources';

import { describe, expect, it } from 'vitest';

/**
 * Vitest suite for the helpers exported by `@/utils/server/scientific-sources`.
 *
 * Each case feeds a small hand-written fixture to one or two helpers and
 * checks the fields the rest of the app relies on (citation keys, metadata
 * fields, formatted prompt context, bounded integer parsing).
 */
describe('scientific source helpers', () => {
  it('parses Semantic Scholar references from direct arrays and wrapped fields', () => {
    // Wrapped form: a JSON string whose `references` field carries the array.
    const wrappedPayload = JSON.stringify({
      references: [{ paperId: 'abc', title: 'Reference title' }],
    });
    const fromWrapped = parseSemanticScholarReferences(wrappedPayload);
    expect(fromWrapped).toEqual([{ paperId: 'abc', title: 'Reference title' }]);

    // Direct form: an already-parsed array is passed straight in.
    const fromArray = parseSemanticScholarReferences([
      { title: 'A' },
      { title: 'B' },
    ]);
    expect(fromArray).toHaveLength(2);
  });

  it('builds citation-ready Semantic Scholar entries', () => {
    const paper = {
      paperId: '649def34f8be52c8b66281af98ae884c09aef38b',
      title: 'Attention Is All You Need',
      abstract: 'We propose a new simple network architecture.',
      authors: [{ name: 'Ashish Vaswani' }, { name: 'Noam Shazeer' }],
      year: 2017,
      venue: 'NeurIPS',
      externalIds: { DOI: '10.5555/3295222.3295349' },
    };
    // A fixed id factory is injected in place of a random one.
    const fixedId = () => 'id-1';

    const entries = buildSemanticScholarEntries([paper], fixedId);

    expect(entries).toHaveLength(1);
    const firstEntry = entries[0];
    expect(firstEntry.metadata).toMatchObject({
      sourceType: 'semantic_scholar_reference',
      citationKey: 'scholar:649def34f8be52c8b66281af98ae884c09aef38b:2017',
      authors: 'Ashish Vaswani, Noam Shazeer',
      doi: '10.5555/3295222.3295349',
      year: 2017,
    });
    expect(firstEntry.document).toContain('Abstract: We propose');
  });

  it('builds uploaded document metadata with stable page and chunk citations', () => {
    const chunk = {
      pageContent: 'Methods\nWe measured retrieval accuracy.',
      metadata: {
        source: '/tmp/papers/demo.pdf',
        pdf: { info: { Title: 'Demo Study' } },
        loc: { pageNumber: 3 },
      },
    };
    // The chunk index passed in is 2; the expected citation key ends in `c3`.
    const chunkIndex = 2;

    const entry = buildUploadedDocumentEntry(chunk, chunkIndex, 'chunk-id');

    expect(entry.metadata).toMatchObject({
      citationKey: 'doc:demo-study:p3:c3',
      page: 3,
      section: 'methods',
      sourceType: 'uploaded_document',
      title: 'Demo Study',
    });
  });

  it('formats mixed Chroma results into prompt context with exact citation keys', () => {
    // One query row holding an uploaded-document hit and a reference hit.
    const uploadedHit = {
      sourceType: 'uploaded_document',
      citationKey: 'doc:demo:p1:c1',
      title: 'Demo',
      page: 1,
    };
    const referenceHit = {
      sourceType: 'semantic_scholar_reference',
      citationKey: 'scholar:paper:2024',
      title: 'Paper',
      year: 2024,
    };
    const queryResults = {
      documents: [['Uploaded excerpt', 'Reference abstract']],
      metadatas: [[uploadedHit, referenceHit]],
      distances: [[0.12, 0.34]],
    };

    const formatted = formatRetrievedScientificSources(queryResults);

    expect(formatted.sources).toHaveLength(2);
    const expectedSnippets = [
      'Citation: [doc:demo:p1:c1]',
      'Semantic Scholar reference',
      'Retrieval distance: 0.3400',
    ];
    for (const snippet of expectedSnippets) {
      expect(formatted.context).toContain(snippet);
    }
  });

  it('detects scientific sections and clamps retrieval counts', () => {
    const section = detectScientificSection(
      'Results\nThe measured recall improved.',
    );
    expect(section).toBe('results');

    // Above-range input is expected to come back as the upper bound (20).
    const aboveRange = parseBoundedInteger('50', 8, 1, 20);
    expect(aboveRange).toBe(20);

    // Non-numeric input is expected to come back as the fallback (8).
    const notANumber = parseBoundedInteger('nope', 8, 1, 20);
    expect(notANumber).toBe(8);
  });
});
52 changes: 39 additions & 13 deletions ui/pages/api/fetch-documents.ts
Original file line number Diff line number Diff line change
@@ -1,25 +1,51 @@
import type { NextApiRequest, NextApiResponse } from "next";
import { ChromaClient, TransformersEmbeddingFunction } from "chromadb";
import type { NextApiRequest, NextApiResponse } from 'next';

export default async function handler(req: NextApiRequest, res: NextApiResponse) {
import {
formatRetrievedScientificSources,
parseBoundedInteger,
} from '@/utils/server/scientific-sources';

import { ChromaClient, TransformersEmbeddingFunction } from 'chromadb';

export default async function handler(
req: NextApiRequest,
res: NextApiResponse,
) {
try {
if (req.method !== 'POST') {
return res.status(405).end();
}

const client = new ChromaClient({
path: "http://chroma-server:8000",
path: process.env.CHROMA_PATH || 'http://chroma-server:8000',
});

const query = req.body.input;
const query =
typeof req.body.input === 'string' ? req.body.input.trim() : '';

if (!query) {
return res.status(400).json({ error: 'Missing query input' });
}

const nResults = parseBoundedInteger(req.body.nResults, 8, 1, 20);

const embedder = new TransformersEmbeddingFunction();

const collection = await client.getOrCreateCollection({ name: "default-collection", embeddingFunction: embedder });
const collection = await client.getOrCreateCollection({
name: 'default-collection',
embeddingFunction: embedder,
});

// query the collection
const results = await collection.query({
nResults: 4,
queryTexts: [query]
})
const results = await collection.query({
nResults,
queryTexts: [query],
});
const formatted = formatRetrievedScientificSources(results);

res.status(200).json(results);
res.status(200).json({
...results,
...formatted,
});
} catch (error) {
if (error instanceof Error) {
console.error('Error message:', error.message);
Expand All @@ -29,4 +55,4 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse)
}
res.status(500).json({ error: 'An unexpected error occurred :(' });
}
}
}
105 changes: 54 additions & 51 deletions ui/pages/api/inject-documents.ts
Original file line number Diff line number Diff line change
@@ -1,11 +1,16 @@
import type { NextApiRequest, NextApiResponse } from 'next';

import {
SCIENTIFIC_TEXT_SEPARATORS,
buildSemanticScholarEntries,
buildUploadedDocumentEntry,
parseSemanticScholarReferences,
} from '@/utils/server/scientific-sources';

import { ChromaClient, TransformersEmbeddingFunction } from 'chromadb';
import { IncomingForm } from 'formidable';
import { PDFLoader } from 'langchain/document_loaders/fs/pdf';
import { RecursiveCharacterTextSplitter } from "langchain/text_splitter";

import path from 'path';
import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter';
import { v4 as uuidv4 } from 'uuid';

export const config = {
Expand Down Expand Up @@ -33,22 +38,48 @@ export default async function handler(
path: process.env.CHROMA_PATH || 'http://chroma-server:8000',
});

const loader = new PDFLoader(files.pdf[0].filepath);

const originalDocs = await loader.load();

console.log(JSON.stringify(originalDocs));


const splitter = new RecursiveCharacterTextSplitter({
chunkSize: 500,
chunkOverlap: 100,
});
chunkSize: 900,
chunkOverlap: 140,
separators: SCIENTIFIC_TEXT_SEPARATORS,
});

const docs = await splitter.splitDocuments(originalDocs);

// Process the documents and perform other logic
const { ids, metadatas, documentContents } = processDocuments(docs);
const pdfFiles = Array.isArray(files.pdf)
? files.pdf
: files.pdf
? [files.pdf]
: [];
const references = parseSemanticScholarReferences(
fields.references ??
fields.semanticScholarReferences ??
fields.savedReferences,
);
const uploadedDocumentEntries = [];

for (const file of pdfFiles) {
const loader = new PDFLoader(file.filepath);
const originalDocs = await loader.load();
const docs = await splitter.splitDocuments(originalDocs);

uploadedDocumentEntries.push(
...docs.map((document, index) =>
buildUploadedDocumentEntry(document, index, uuidv4()),
),
);
}

const semanticScholarEntries = buildSemanticScholarEntries(
references,
uuidv4,
);
const entries = [...uploadedDocumentEntries, ...semanticScholarEntries];

if (entries.length === 0) {
return res.status(400).json({
error:
'Upload at least one PDF or provide Semantic Scholar references.',
});
}

const embedder = new TransformersEmbeddingFunction();
const collection = await client.getOrCreateCollection({
Expand All @@ -57,14 +88,16 @@ export default async function handler(
});

await collection.add({
ids,
metadatas,
documents: documentContents,
ids: entries.map((entry) => entry.id),
metadatas: entries.map((entry) => entry.metadata),
documents: entries.map((entry) => entry.document),
});

res.status(200).json({
message: 'Documents processed successfully',
documentCount: ids.length,
documentCount: uploadedDocumentEntries.length,
referenceCount: semanticScholarEntries.length,
sourceCount: entries.length,
});
});
} catch (error) {
Expand All @@ -74,33 +107,3 @@ export default async function handler(
.json({ message: 'An error occurred while processing the documents' });
}
}

/**
 * The subset of a loaded/split document that `processDocuments` reads.
 * Every metadata field is optional because only `pageContent` is assumed
 * to be present on every chunk.
 */
type LoadedChunk = {
  pageContent: string;
  metadata: {
    source?: unknown;
    pdf?: { info?: { Title?: unknown } | null } | null;
    loc?: { pageNumber?: unknown } | null;
  };
};

/** Flat metadata record produced for each chunk. */
type ChunkMetadata = Record<string, string | number>;

/**
 * Converts split documents into the three parallel arrays (ids, metadatas,
 * document texts) that the caller hands to the vector collection.
 *
 * For every chunk:
 * - a fresh id is generated with `uuidv4()`;
 * - `title` is the PDF's embedded `Title` when it is a non-blank string,
 *   otherwise the base name of the `source` path;
 * - `page` is copied from `metadata.loc.pageNumber` when it is a number and
 *   omitted otherwise;
 * - `source` is the `metadata.source` string (empty string if absent).
 *
 * Missing `pdf`, `pdf.info` or `loc` objects no longer throw; the affected
 * field simply falls back as described above.
 *
 * @param docs Chunks to process; an empty list yields three empty arrays.
 * @returns `ids`, `metadatas` and `documentContents`, index-aligned with `docs`.
 */
function processDocuments(docs: readonly LoadedChunk[]): {
  ids: string[];
  metadatas: ChunkMetadata[];
  documentContents: string[];
} {
  const ids: string[] = [];
  const metadatas: ChunkMetadata[] = [];
  const documentContents: string[] = [];

  for (const document of docs) {
    ids.push(uuidv4());

    const { source: rawSource, pdf, loc } = document.metadata;
    // path.basename throws on non-string input, so normalise first.
    const source = typeof rawSource === 'string' ? rawSource : '';

    // Prefer the embedded PDF title; a missing or blank one falls back to
    // the file name.
    const embeddedTitle = pdf?.info?.Title;
    const title =
      typeof embeddedTitle === 'string' && embeddedTitle.trim().length > 0
        ? embeddedTitle
        : path.basename(source);

    // Only emit `page` when a real number is available, so the record never
    // carries an `undefined` value.
    const pageNumber = loc?.pageNumber;
    metadatas.push({
      title,
      ...(typeof pageNumber === 'number' ? { page: pageNumber } : {}),
      source,
    });

    documentContents.push(document.pageContent);
  }

  return { ids, metadatas, documentContents };
}
Loading