diff --git a/DEMO_SETUP.md b/DEMO_SETUP.md index 8bfacd8..70e9815 100644 --- a/DEMO_SETUP.md +++ b/DEMO_SETUP.md @@ -226,6 +226,42 @@ The test suite creates temporary test data. You can reference `tests/conftest.py - Read-only: checked (recommended for demo) 3. Click **"Create Mount"** +### API Endpoints (for Links Integration) + +1. Navigate to **Settings** > **Links** section +2. Scroll to "API Endpoint Mappings" +3. Configure a new endpoint: + - **Name**: Descriptive name (e.g., "Users API") + - **URL**: Full API endpoint URL (e.g., `https://api.example.com/users`) + - **Auth Method**: Select authentication type: + - `None`: No authentication + - `Bearer Token`: OAuth/JWT bearer token + - `API Key`: API key in X-API-Key header + - **Auth Value**: Enter token/key if authentication is required + - **JSONPath** (optional): Extract specific data (e.g., `$.data[*]`) + - **Maps to Label** (optional): Target Label for imported data +4. Click **"Test Connection"** to verify the endpoint +5. Click **"Save Endpoint"** to register it + +**Using API Endpoints in Links:** +- Registered endpoints appear in the Links wizard +- Select an endpoint as a data source when creating links +- Field mappings automatically populate from endpoint configuration + +**Security Notes:** +- Auth tokens are encrypted at rest in the settings database +- For production, set `SCIDK_API_ENCRYPTION_KEY` environment variable +- Without this variable, an ephemeral key is generated (not persistent across restarts) + +**Example: JSONPlaceholder Test API** +``` +Name: JSONPlaceholder Users +URL: https://jsonplaceholder.typicode.com/users +Auth Method: None +JSONPath: $[*] +Maps to Label: User +``` + ## Troubleshooting ### Application Won't Start diff --git a/e2e/chat.spec.ts b/e2e/chat.spec.ts index e848c05..97398f5 100644 --- a/e2e/chat.spec.ts +++ b/e2e/chat.spec.ts @@ -18,7 +18,7 @@ test('chat page loads and displays beta badge', async ({ page, baseURL }) => { await 
page.waitForLoadState('networkidle'); // Verify page loads - await expect(page).toHaveTitle(/SciDK - Chats/i, { timeout: 10_000 }); + await expect(page).toHaveTitle(/-SciDK-> Chats/i, { timeout: 10_000 }); // Check for Beta badge const betaBadge = page.locator('.badge'); @@ -57,7 +57,7 @@ test('chat navigation link is visible in header', async ({ page, baseURL }) => { // Click it and verify we navigate to chat page await chatsLink.click(); await page.waitForLoadState('networkidle'); - await expect(page).toHaveTitle(/SciDK - Chats/i); + await expect(page).toHaveTitle(/-SciDK-> Chats/i); }); test('chat form can accept input', async ({ page, baseURL }) => { diff --git a/e2e/core-flows.spec.ts b/e2e/core-flows.spec.ts index 535900d..6150141 100644 --- a/e2e/core-flows.spec.ts +++ b/e2e/core-flows.spec.ts @@ -136,7 +136,7 @@ test('navigation covers all 7 pages', async ({ page, baseURL }) => { { testId: 'nav-maps', url: '/map', titlePattern: /Map/i }, { testId: 'nav-chats', url: '/chat', titlePattern: /Chat/i }, { testId: 'nav-labels', url: '/labels', titlePattern: /Labels/i }, - { testId: 'nav-links', url: '/links', titlePattern: /Links/i }, + { testId: 'nav-integrate', url: '/integrate', titlePattern: /-SciDK-> Integrations/i }, { testId: 'nav-settings', url: '/settings', titlePattern: /Settings/i }, ]; diff --git a/e2e/global-setup.ts b/e2e/global-setup.ts index 300868d..28ed2b2 100644 --- a/e2e/global-setup.ts +++ b/e2e/global-setup.ts @@ -48,6 +48,16 @@ export default async function globalSetup(config: FullConfig) { (process as any).env.BASE_URL = baseUrl; await waitForReady(baseUrl); + + // Clean up any leftover test data from previous runs + try { + await fetch(`${baseUrl}/api/admin/cleanup-test-scans`, { method: 'POST' }); + await fetch(`${baseUrl}/api/admin/cleanup-test-labels`, { method: 'POST' }); + await fetch(`${baseUrl}/api/admin/cleanup-test-endpoints`, { method: 'POST' }); + console.log('[setup] Test data cleaned up'); + } catch (error) { + 
console.error('[setup] Failed to cleanup test data:', error); + } } export async function teardown() { diff --git a/e2e/global-teardown.ts b/e2e/global-teardown.ts index fe49b97..9146a76 100644 --- a/e2e/global-teardown.ts +++ b/e2e/global-teardown.ts @@ -28,6 +28,17 @@ export default async function globalTeardown(config: FullConfig) { } catch (error) { console.error('[cleanup] Failed to cleanup test labels:', error); } + + // Clean up test API endpoints + try { + const response = await fetch(`${baseUrl}/api/admin/cleanup-test-endpoints`, { + method: 'POST', + }); + const result = await response.json(); + console.log('[cleanup] Test API endpoints cleaned up:', result); + } catch (error) { + console.error('[cleanup] Failed to cleanup test API endpoints:', error); + } } // Kill the server process diff --git a/e2e/links-advanced.spec.ts b/e2e/integrations-advanced.spec.ts similarity index 92% rename from e2e/links-advanced.spec.ts rename to e2e/integrations-advanced.spec.ts index 4e5278c..2713d7c 100644 --- a/e2e/links-advanced.spec.ts +++ b/e2e/integrations-advanced.spec.ts @@ -7,14 +7,14 @@ import { test, expect } from '@playwright/test'; test('links page api source inputs are functional', async ({ page, baseURL }) => { const base = baseURL || process.env.BASE_URL || 'http://127.0.0.1:5000'; - await page.goto(`${base}/links`); + await page.goto(`${base}/integrate`); await page.waitForLoadState('networkidle'); // Wait for labels to load (Links page needs labels for dropdowns) await page.waitForTimeout(2000); // Create new link - await page.getByTestId('new-link-btn').click(); + await page.getByTestId('new-integration-btn').click(); // Switch to API source type const apiSourceButton = page.locator('button').filter({ hasText: /^API$/i }); @@ -38,14 +38,14 @@ test('links page api source inputs are functional', async ({ page, baseURL }) => test('links page target graph label input is functional', async ({ page, baseURL }) => { const base = baseURL || process.env.BASE_URL 
|| 'http://127.0.0.1:5000'; - await page.goto(`${base}/links`); + await page.goto(`${base}/integrate`); await page.waitForLoadState('networkidle'); // Wait for labels to load (Links page needs labels for dropdowns) await page.waitForTimeout(2000); // Create new link - await page.getByTestId('new-link-btn').click(); + await page.getByTestId('new-integration-btn').click(); // Navigate to target step (wizard has: source -> target -> matching -> relationship) const nextButton = page.locator('#btn-next'); @@ -78,14 +78,14 @@ test('links page target graph label input is functional', async ({ page, baseURL test('links page cypher matching query input is functional', async ({ page, baseURL }) => { const base = baseURL || process.env.BASE_URL || 'http://127.0.0.1:5000'; - await page.goto(`${base}/links`); + await page.goto(`${base}/integrate`); await page.waitForLoadState('networkidle'); // Wait for labels to load (Links page needs labels for dropdowns) await page.waitForTimeout(2000); // Create new link - await page.getByTestId('new-link-btn').click(); + await page.getByTestId('new-integration-btn').click(); // Navigate through wizard to matching step (4 steps to reach matching) const nextButton = page.locator('#btn-next'); @@ -118,14 +118,14 @@ test('links page cypher matching query input is functional', async ({ page, base test('links page preview button is present', async ({ page, baseURL }) => { const base = baseURL || process.env.BASE_URL || 'http://127.0.0.1:5000'; - await page.goto(`${base}/links`); + await page.goto(`${base}/integrate`); await page.waitForLoadState('networkidle'); // Wait for labels to load (Links page needs labels for dropdowns) await page.waitForTimeout(2000); // Create new link - await page.getByTestId('new-link-btn').click(); + await page.getByTestId('new-integration-btn').click(); // Navigate through wizard const nextButton = page.locator('#btn-next'); @@ -155,7 +155,7 @@ test('links page preview button is present', async ({ page, baseURL }) 
=> { test('links page execute button is present and functional', async ({ page, baseURL }) => { const base = baseURL || process.env.BASE_URL || 'http://127.0.0.1:5000'; - await page.goto(`${base}/links`); + await page.goto(`${base}/integrate`); await page.waitForLoadState('networkidle'); // Wait for labels to load (Links page needs labels for dropdowns) @@ -174,7 +174,7 @@ test('links page execute button is present and functional', async ({ page, baseU await expect(executeButton).toBeVisible(); // Mock API to prevent actual execution - await page.route('**/api/links/*/execute', async (route) => { + await page.route('**/api/integrate/*/execute', async (route) => { await route.fulfill({ status: 200, contentType: 'application/json', @@ -191,10 +191,10 @@ test('links page execute button is present and functional', async ({ page, baseU } } else { // Create a new link and save it first - await page.getByTestId('new-link-btn').click(); + await page.getByTestId('new-integration-btn').click(); // Fill in minimal link data - await page.locator('#link-name').fill('Test Execute Link'); + await page.locator('#integration-name').fill('Test Execute Link'); // Fill CSV data const csvData = page.locator('#csv-data'); diff --git a/e2e/integrations.spec.ts b/e2e/integrations.spec.ts new file mode 100644 index 0000000..ec30b94 --- /dev/null +++ b/e2e/integrations.spec.ts @@ -0,0 +1,597 @@ +import { test, expect } from '@playwright/test'; + +/** + * E2E tests for Links page functionality. 
+ * Tests the complete workflow: create link definition → configure source → configure target → define relationship → preview → execute + */ + +test('links page loads and displays empty state', async ({ page, baseURL }) => { + const consoleMessages: { type: string; text: string }[] = []; + page.on('console', (msg) => { + consoleMessages.push({ type: msg.type(), text: msg.text() }); + }); + + const base = baseURL || process.env.BASE_URL || 'http://127.0.0.1:5000'; + + // Navigate to Links page + await page.goto(`${base}/integrate`); + await page.waitForLoadState('networkidle'); + + // Verify page loads + await expect(page).toHaveTitle(/-SciDK-> Integrations/i, { timeout: 10_000 }); + + // Check for new link button + await expect(page.getByTestId('new-integration-btn')).toBeVisible(); + + // Check for link list + await expect(page.getByTestId('integration-list')).toBeVisible(); + + // No console errors + const errors = consoleMessages.filter((m) => m.type === 'error'); + expect(errors.length).toBe(0); +}); + +test('links navigation link is visible in header', async ({ page, baseURL }) => { + const base = baseURL || process.env.BASE_URL || 'http://127.0.0.1:5000'; + + await page.goto(base); + await page.waitForLoadState('networkidle'); + + // Check that Links link exists in navigation + const linksLink = page.getByTestId('nav-integrate'); + await expect(linksLink).toBeVisible(); + + // Click it and verify we navigate to links page + await linksLink.click(); + await page.waitForLoadState('networkidle'); + await expect(page).toHaveTitle(/-SciDK-> Integrations/i); +}); + +test('wizard navigation: can navigate through all 3 steps (Label→Label refactor)', async ({ page, baseURL }) => { + const base = baseURL || process.env.BASE_URL || 'http://127.0.0.1:5000'; + + // Create labels needed for this test + await page.goto(`${base}/labels`); + await page.waitForLoadState('networkidle'); + + await page.getByTestId('new-label-btn').click(); + await 
page.getByTestId('label-name').fill('WizTestLabel1'); + await page.getByTestId('save-label-btn').click(); + await page.waitForTimeout(500); + + await page.getByTestId('new-label-btn').click(); + await page.getByTestId('label-name').fill('WizTestLabel2'); + await page.getByTestId('save-label-btn').click(); + await page.waitForTimeout(500); + + await page.goto(`${base}/integrate`); + await page.waitForLoadState('networkidle'); + + // Click "New Link" button + await page.getByTestId('new-integration-btn').click(); + + // Verify wizard is visible + await expect(page.locator('#link-wizard')).toBeVisible(); + + // Step 1 should be active (Source Label) + await expect(page.locator('.wizard-step[data-step="1"]')).toHaveClass(/active/); + + // Enter link name and select source label + await page.getByTestId('integration-name').fill('Test Link'); + await page.getByTestId('source-label-select').selectOption({ index: 1 }); // Select first label + + // Click Next to go to step 2 (Match Strategy) + await page.locator('#btn-next').click(); + await expect(page.locator('.wizard-step[data-step="2"]')).toHaveClass(/active/); + + // Click Next to go to step 3 (Target & Relationship) + await page.locator('#btn-next').click(); + await expect(page.locator('.wizard-step[data-step="3"]')).toHaveClass(/active/); + + // Select target label and enter relationship type + await page.getByTestId('target-label-select').selectOption({ index: 1 }); + await page.getByTestId('rel-type').fill('TEST_REL'); + + // Verify Back button is visible + await expect(page.locator('#btn-prev')).toBeVisible(); + + // Click Back to go to step 2 + await page.locator('#btn-prev').click(); + await expect(page.locator('.wizard-step[data-step="2"]')).toHaveClass(/active/); +}); + +test('can create table import link definition (Label→Label refactor)', async ({ page, baseURL }) => { + const consoleMessages: { type: string; text: string }[] = []; + page.on('console', (msg) => { + consoleMessages.push({ type: msg.type(), 
text: msg.text() }); + }); + + const base = baseURL || process.env.BASE_URL || 'http://127.0.0.1:5000'; + + // First create labels we'll use + await page.goto(`${base}/labels`); + await page.waitForLoadState('networkidle'); + + // Create Author label + await page.getByTestId('new-label-btn').click(); + await page.getByTestId('label-name').fill('Author'); + await page.getByTestId('save-label-btn').click(); + await page.waitForTimeout(500); + + // Create File label + await page.getByTestId('new-label-btn').click(); + await page.getByTestId('label-name').fill('File'); + await page.getByTestId('save-label-btn').click(); + await page.waitForTimeout(500); + + // Now go to Links page + await page.goto(`${base}/integrate`); + await page.waitForLoadState('networkidle'); + + // Click "New Link" button + await page.getByTestId('new-integration-btn').click(); + + // Step 1: Select Source Label + await page.getByTestId('integration-name').fill('Import Authors to Files'); + await page.getByTestId('source-label-select').selectOption('Author'); + + // Go to Step 2 + await page.locator('#btn-next').click(); + + // Step 2: Configure Match Strategy (table_import) + await page.locator('.match-strategy-btn[data-strategy="table_import"]').click(); + + // Enter table data + const csvData = 'name,email,file_path\nAlice,alice@ex.com,file1.txt\nBob,bob@ex.com,file2.txt'; + await page.locator('#table-data').fill(csvData); + + // Go to Step 3 + await page.locator('#btn-next').click(); + + // Step 3: Target Label & Relationship + await page.getByTestId('target-label-select').selectOption('File'); + await page.getByTestId('rel-type').fill('AUTHORED'); + + // Add a relationship property + await page.locator('#btn-add-rel-prop').click(); + const propRows = page.locator('#rel-props-container .property-row'); + await expect(propRows).toHaveCount(1); + await propRows.locator('[data-prop-key]').fill('date'); + await propRows.locator('[data-prop-value]').fill('2024-01-15'); + + // Save the definition 
+ await page.locator('#btn-save-def').click(); + await page.waitForTimeout(1500); // Wait for save + + // Verify link appears in list + const linkItems = page.locator('.link-item'); + await expect(linkItems.first()).toBeVisible(); + const linkText = await linkItems.first().textContent(); + expect(linkText).toContain('Import Authors to Files'); + expect(linkText).toContain('Author'); + expect(linkText).toContain('File'); + expect(linkText).toContain('AUTHORED'); + + // No console errors + const errors = consoleMessages.filter((m) => m.type === 'error'); + expect(errors.length).toBe(0); +}); + +test('can create Label to Label link definition with property matching', async ({ page, baseURL }) => { + const base = baseURL || process.env.BASE_URL || 'http://127.0.0.1:5000'; + + // First create labels we'll use + await page.goto(`${base}/labels`); + await page.waitForLoadState('networkidle'); + + // Create Person label + await page.getByTestId('new-label-btn').click(); + await page.getByTestId('label-name').fill('Person'); + await page.getByTestId('save-label-btn').click(); + await page.waitForTimeout(500); + + // Create Document label + await page.getByTestId('new-label-btn').click(); + await page.getByTestId('label-name').fill('Document'); + await page.getByTestId('save-label-btn').click(); + await page.waitForTimeout(500); + + // Now go to Links page + await page.goto(`${base}/integrate`); + await page.waitForLoadState('networkidle'); + + // Click "New Link" button + await page.getByTestId('new-integration-btn').click(); + + // Step 1: Select Source Label + await page.getByTestId('integration-name').fill('Person to Document Link'); + await page.getByTestId('source-label-select').selectOption('Person'); + + // Go to Step 2 + await page.locator('#btn-next').click(); + + // Step 2: Configure Match Strategy (property matching - default) + await page.locator('#match-source-field').fill('email'); + await page.locator('#match-target-field').fill('author_email'); + + // Go to 
Step 3 + await page.locator('#btn-next').click(); + + // Step 3: Target Label & Relationship + await page.getByTestId('target-label-select').selectOption('Document'); + await page.getByTestId('rel-type').fill('AUTHORED'); + + // Save the definition + await page.locator('#btn-save-def').click(); + await page.waitForTimeout(1500); + + // Verify link appears in list + const linkItems = page.locator('.link-item'); + const linkText = await linkItems.first().textContent(); + expect(linkText).toContain('Person to Document Link'); + expect(linkText).toContain('Person'); + expect(linkText).toContain('Document'); + expect(linkText).toContain('AUTHORED'); +}); + +test('can save and load link definition', async ({ page, baseURL }) => { + const base = baseURL || process.env.BASE_URL || 'http://127.0.0.1:5000'; + + const uniqueName = `Test Save Load ${Date.now()}`; + + // First create labels + await page.goto(`${base}/labels`); + await page.waitForLoadState('networkidle'); + + await page.getByTestId('new-label-btn').click(); + await page.getByTestId('label-name').fill('SaveLoadSource'); + await page.getByTestId('save-label-btn').click(); + await page.waitForTimeout(500); + + await page.getByTestId('new-label-btn').click(); + await page.getByTestId('label-name').fill('SaveLoadTarget'); + await page.getByTestId('save-label-btn').click(); + await page.waitForTimeout(500); + + // Now go to Links + await page.goto(`${base}/integrate`); + await page.waitForLoadState('networkidle'); + + // Create a link definition + await page.getByTestId('new-integration-btn').click(); + await page.getByTestId('integration-name').fill(uniqueName); + await page.getByTestId('source-label-select').selectOption('SaveLoadSource'); + await page.locator('#btn-next').click(); + await page.locator('.match-strategy-btn[data-strategy="property"]').click(); + await page.locator('#match-source-field').fill('col1'); + await page.locator('#match-target-field').fill('field1'); + await 
page.locator('#btn-next').click(); + await page.getByTestId('target-label-select').selectOption('SaveLoadTarget'); + await page.getByTestId('rel-type').fill('TEST_REL'); + await page.locator('#btn-save-def').click(); + await page.waitForTimeout(1500); + + // Click on the saved link by finding it by name + const linkItem = page.locator('.link-item').filter({ hasText: uniqueName }); + await linkItem.click(); + await page.waitForTimeout(500); + + // Verify wizard is populated with saved data + await expect(page.getByTestId('integration-name')).toHaveValue(uniqueName); + + // Check that source label is selected + await expect(page.getByTestId('source-label-select')).toHaveValue('SaveLoadSource'); + + // Navigate to step 2 and verify match strategy + await page.locator('#btn-next').click(); + await expect(page.locator('#match-source-field')).toHaveValue('col1'); + await expect(page.locator('#match-target-field')).toHaveValue('field1'); + + // Navigate to step 3 and verify target and relationship + await page.locator('#btn-next').click(); + await expect(page.getByTestId('target-label-select')).toHaveValue('SaveLoadTarget'); + await expect(page.getByTestId('rel-type')).toHaveValue('TEST_REL'); + + // Cleanup: Delete the test link + page.once('dialog', async (dialog) => await dialog.accept()); + await page.locator('#btn-delete-def').click(); + await page.waitForTimeout(1000); +}); + +test('can delete link definition', async ({ page, baseURL }) => { + const base = baseURL || process.env.BASE_URL || 'http://127.0.0.1:5000'; + + // Capture console logs and errors + const consoleLogs: string[] = []; + page.on('console', msg => consoleLogs.push(`[${msg.type()}] ${msg.text()}`)); + page.on('pageerror', err => consoleLogs.push(`[ERROR] ${err.message}`)); + + await page.goto(`${base}/integrate`); + await page.waitForLoadState('networkidle'); + + const uniqueName = `To Delete ${Date.now()}`; + + // First create labels + await page.goto(`${base}/labels`); + await 
page.waitForLoadState('networkidle'); + await page.getByTestId('new-label-btn').click(); + await page.getByTestId('label-name').fill('DeleteTest'); + await page.getByTestId('save-label-btn').click(); + await page.waitForTimeout(500); + + // Now create a link definition + await page.goto(`${base}/integrate`); + await page.waitForLoadState('networkidle'); + await page.getByTestId('new-integration-btn').click(); + await page.getByTestId('integration-name').fill(uniqueName); + await page.getByTestId('source-label-select').selectOption('DeleteTest'); + await page.locator('#btn-next').click(); + await page.locator('#btn-next').click(); + await page.getByTestId('target-label-select').selectOption('DeleteTest'); + await page.getByTestId('rel-type').fill('DELETE_ME'); + await page.locator('#btn-save-def').click(); + await page.waitForTimeout(1500); + + // Load the link by finding it by name + const linkItem = page.locator('.link-item').filter({ hasText: uniqueName }); + await linkItem.click(); + await page.waitForTimeout(500); + + // Delete button should be visible + const deleteBtn = page.locator('#btn-delete-def'); + await expect(deleteBtn).toBeVisible(); + + // Handle confirmation dialog + page.once('dialog', async (dialog) => { + expect(dialog.type()).toBe('confirm'); + await dialog.accept(); + }); + + await deleteBtn.click(); + + // Wait for wizard to hide (indicates delete completed) + try { + await expect(page.locator('#link-wizard')).toBeHidden({ timeout: 5000 }); + } catch (e) { + console.log('Console logs:', consoleLogs.join('\n')); + throw e; + } + + // Wait a bit more for list to update + await page.waitForTimeout(1000); + + // Verify link is removed from list - it should not appear anywhere + const listItems = await page.locator('.link-item').all(); + const listTexts = await Promise.all(listItems.map(item => item.textContent())); + const found = listTexts.some(text => text?.includes(uniqueName)); + + if (found) { + console.log('Console logs:', 
consoleLogs.join('\n')); + } + + expect(found).toBe(false); +}); + +test('validation: cannot save without name', async ({ page, baseURL }) => { + const base = baseURL || process.env.BASE_URL || 'http://127.0.0.1:5000'; + await page.goto(`${base}/integrate`); + await page.waitForLoadState('networkidle'); + + // Create new link but don't enter name + await page.getByTestId('new-integration-btn').click(); + + // Try to save without name + await page.locator('#btn-save-def').click(); + await page.waitForTimeout(500); + + // Should still be on wizard (not saved) + await expect(page.getByTestId('integration-name')).toBeVisible(); + const value = await page.getByTestId('integration-name').inputValue(); + expect(value).toBe(''); +}); + +test('validation: cannot save without relationship type', async ({ page, baseURL }) => { + const base = baseURL || process.env.BASE_URL || 'http://127.0.0.1:5000'; + await page.goto(`${base}/integrate`); + await page.waitForLoadState('networkidle'); + + // Create new link with name but no relationship type + await page.getByTestId('new-integration-btn').click(); + await page.getByTestId('integration-name').fill('No Rel Type'); + + // Navigate to step 3 + await page.locator('#btn-next').click(); + await page.locator('#btn-next').click(); + + // Don't enter relationship type + + // Try to save + await page.locator('#btn-save-def').click(); + await page.waitForTimeout(500); + + // Should still be on wizard + await expect(page.locator('#rel-type')).toBeVisible(); + const value = await page.locator('#rel-type').inputValue(); + expect(value).toBe(''); +}); + +test('Label→Label: source and target are label dropdowns', async ({ page, baseURL }) => { + const base = baseURL || process.env.BASE_URL || 'http://127.0.0.1:5000'; + await page.goto(`${base}/integrate`); + await page.waitForLoadState('networkidle'); + + await page.getByTestId('new-integration-btn').click(); + + // Step 1: Source label dropdown should be visible + await 
expect(page.getByTestId('source-label-select')).toBeVisible(); + + // Navigate to step 3 + await page.locator('#btn-next').click(); + await page.locator('#btn-next').click(); + + // Step 3: Target label dropdown should be visible + await expect(page.getByTestId('target-label-select')).toBeVisible(); +}); + +test('can switch between match strategies (Label→Label refactor)', async ({ page, baseURL }) => { + const base = baseURL || process.env.BASE_URL || 'http://127.0.0.1:5000'; + await page.goto(`${base}/integrate`); + await page.waitForLoadState('networkidle'); + + await page.getByTestId('new-integration-btn').click(); + + // Navigate to step 2 (Match Strategy) + await page.locator('#btn-next').click(); + + // Property match should be visible by default + await expect(page.locator('#match-property')).toBeVisible(); + await expect(page.locator('#match-fuzzy')).not.toBeVisible(); + await expect(page.locator('#match-table-import')).not.toBeVisible(); + await expect(page.locator('#match-api-endpoint')).not.toBeVisible(); + + // Switch to Fuzzy match + await page.locator('.match-strategy-btn[data-strategy="fuzzy"]').click(); + await expect(page.locator('#match-property')).not.toBeVisible(); + await expect(page.locator('#match-fuzzy')).toBeVisible(); + + // Switch to Table Import + await page.locator('.match-strategy-btn[data-strategy="table_import"]').click(); + await expect(page.locator('#match-fuzzy')).not.toBeVisible(); + await expect(page.locator('#match-table-import')).toBeVisible(); + + // Switch to API Endpoint + await page.locator('.match-strategy-btn[data-strategy="api_endpoint"]').click(); + await expect(page.locator('#match-table-import')).not.toBeVisible(); + await expect(page.locator('#match-api-endpoint')).toBeVisible(); + + // Switch back to Property match + await page.locator('.match-strategy-btn[data-strategy="property"]').click(); + await expect(page.locator('#match-api-endpoint')).not.toBeVisible(); + await 
expect(page.locator('#match-property')).toBeVisible(); +}); + +test('can add and remove relationship properties', async ({ page, baseURL }) => { + const base = baseURL || process.env.BASE_URL || 'http://127.0.0.1:5000'; + await page.goto(`${base}/integrate`); + await page.waitForLoadState('networkidle'); + + await page.getByTestId('new-integration-btn').click(); + + // Navigate to step 3 + await page.locator('#btn-next').click(); + await page.locator('#btn-next').click(); + + // Add 3 relationship properties + for (let i = 0; i < 3; i++) { + await page.locator('#btn-add-rel-prop').click(); + } + + // Verify 3 property rows exist + const propRows = page.locator('#rel-props-container .property-row'); + await expect(propRows).toHaveCount(3); + + // Fill in values + await propRows.nth(0).locator('[data-prop-key]').fill('key1'); + await propRows.nth(1).locator('[data-prop-key]').fill('key2'); + await propRows.nth(2).locator('[data-prop-key]').fill('key3'); + + // Remove the second property + await propRows.nth(1).locator('button').click(); + + // Verify only 2 properties remain + await expect(page.locator('#rel-props-container .property-row')).toHaveCount(2); +}); + +test('wizard visual summary: step circles show summaries for completed steps', async ({ page, baseURL }) => { + const base = baseURL || process.env.BASE_URL || 'http://127.0.0.1:5000'; + await page.goto(`${base}/integrate`); + await page.waitForLoadState('networkidle'); + + // Create test labels first + await page.goto(`${base}/labels`); + await page.waitForLoadState('networkidle'); + + // Create Person label + await page.getByTestId('new-label-btn').click(); + await page.getByTestId('label-name').fill('Person'); + await page.getByTestId('save-label-btn').click(); + await page.waitForTimeout(500); + + // Create File label + await page.getByTestId('new-label-btn').click(); + await page.getByTestId('label-name').fill('File'); + await page.getByTestId('save-label-btn').click(); + await 
page.waitForTimeout(500); + + // Go back to Links + await page.goto(`${base}/integrate`); + await page.waitForLoadState('networkidle'); + await page.getByTestId('new-integration-btn').click(); + + // Step 1: Initial state should show "1" + let step1Circle = page.getByTestId('step-1-circle'); + await expect(step1Circle).toHaveText('1'); + + // Fill out Step 1 + await page.getByTestId('integration-name').fill('Test Visual Summary'); + await page.getByTestId('source-label-select').selectOption('Person'); + + // Navigate to Step 2 + await page.locator('#btn-next').click(); + await page.waitForTimeout(200); + + // Step 1 should now show "Person" (source label name) + await expect(step1Circle).toHaveText('Person'); + + // Step 2 should be active and show "2" + let step2Circle = page.getByTestId('step-2-circle'); + await expect(step2Circle).toHaveText('2'); + + // Select fuzzy match strategy + await page.locator('.match-strategy-btn[data-strategy="fuzzy"]').click(); + + // Navigate to Step 3 + await page.locator('#btn-next').click(); + await page.waitForTimeout(200); + + // Step 2 should now show "~" (fuzzy icon) + await expect(step2Circle).toHaveText('~'); + + // Fill out Step 3 + await page.getByTestId('target-label-select').selectOption('File'); + await page.getByTestId('rel-type').fill('AUTHORED'); + + // Navigate back to Step 2 + await page.locator('#btn-prev').click(); + await page.waitForTimeout(200); + + // Step 1 should still show "Person" + await expect(step1Circle).toHaveText('Person'); + + // Switch to table_import strategy + await page.locator('.match-strategy-btn[data-strategy="table_import"]').click(); + + // Navigate to Step 3 again + await page.locator('#btn-next').click(); + await page.waitForTimeout(200); + + // Step 2 should now show "📊" (table icon) + await expect(step2Circle).toHaveText('📊'); + + // Navigate back to Step 1 + await page.locator('#btn-prev').click(); + await page.locator('#btn-prev').click(); + await page.waitForTimeout(200); + + // 
Change source label + await page.getByTestId('source-label-select').selectOption('File'); + await page.locator('#btn-next').click(); + await page.waitForTimeout(200); + + // Step 1 should now show "File" + await expect(step1Circle).toHaveText('File'); + + // Test tooltip visibility on hover (Step 1) + const step1Tooltip = page.getByTestId('step-1-tooltip'); + await expect(step1Tooltip).toHaveText('Source: File'); +}); diff --git a/e2e/labels-arrows.spec.ts b/e2e/labels-arrows.spec.ts index 03917eb..3ef86da 100644 --- a/e2e/labels-arrows.spec.ts +++ b/e2e/labels-arrows.spec.ts @@ -84,7 +84,7 @@ test('can open import modal and close it', async ({ page, baseURL }) => { await expect(modal).toBeVisible(); // Check modal title (using custom modal structure) - await expect(modal.locator('.custom-modal-header h5')).toHaveText(/Import Schema from Arrows\.app/i); + await expect(modal.locator('.custom-modal-header h5')).toHaveText(/Import Schema/i); // Check textarea is present const textarea = modal.locator('#arrows-json-input'); diff --git a/e2e/labels.spec.ts b/e2e/labels.spec.ts index c28ff2f..082ba74 100644 --- a/e2e/labels.spec.ts +++ b/e2e/labels.spec.ts @@ -34,7 +34,7 @@ test('labels page loads and displays empty state', async ({ page, baseURL }) => await page.waitForLoadState('networkidle'); // Verify page loads - await expect(page).toHaveTitle(/SciDK - Labels/i, { timeout: 10_000 }); + await expect(page).toHaveTitle(/-SciDK-> Labels/i, { timeout: 10_000 }); // Check for new label button await expect(page.getByTestId('new-label-btn')).toBeVisible(); @@ -60,7 +60,7 @@ test('labels navigation link is visible in header', async ({ page, baseURL }) => // Click it and verify we navigate to labels page await labelsLink.click(); await page.waitForLoadState('networkidle'); - await expect(page).toHaveTitle(/SciDK - Labels/i); + await expect(page).toHaveTitle(/-SciDK-> Labels/i); }); test('complete label workflow: create → edit → delete', async ({ page, baseURL }) => { @@ 
-123,8 +123,8 @@ test('complete label workflow: create → edit → delete', async ({ page, baseU const editPropertyRows = page.getByTestId('property-row'); await expect(editPropertyRows).toHaveCount(2); - // Step 8: Delete the label - const deleteBtn = page.getByTestId('delete-label-btn'); + // Step 8: Delete the label (use readonly button since we're in read-only mode) + const deleteBtn = page.getByTestId('delete-label-readonly-btn'); await expect(deleteBtn).toBeVisible(); // Handle confirmation dialog @@ -198,7 +198,7 @@ test('can add and remove multiple properties', async ({ page, baseURL }) => { // Cleanup: delete the label await foundLabel!.click(); page.on('dialog', async (dialog) => await dialog.accept()); - await page.getByTestId('delete-label-btn').click(); + await page.getByTestId('delete-label-readonly-btn').click(); await page.waitForTimeout(500); }); @@ -239,7 +239,7 @@ test('can create label with relationships', async ({ page, baseURL }) => { const item = labelItems.filter({ hasText: labelName }); await item.click(); await page.waitForTimeout(300); - await page.getByTestId('delete-label-btn').click(); + await page.getByTestId('delete-label-readonly-btn').click(); await page.waitForTimeout(500); } }); @@ -309,7 +309,7 @@ test('neo4j: push label to neo4j', async ({ page, baseURL, request: pageRequest page.on('dialog', async (dialog) => await dialog.accept()); await ourLabel!.click(); await page.waitForTimeout(300); - await page.getByTestId('delete-label-btn').click(); + await page.getByTestId('delete-label-readonly-btn').click(); await page.waitForTimeout(500); }); @@ -345,3 +345,62 @@ test('neo4j: pull labels from neo4j', async ({ page, baseURL }) => { const labelList = page.getByTestId('label-list'); await expect(labelList).toBeVisible(); }); + +test('import modal has EDA option', async ({ page, baseURL }) => { + const base = baseURL || process.env.BASE_URL || 'http://127.0.0.1:5000'; + await page.goto(`${base}/labels`); + await 
page.waitForLoadState('networkidle'); + + // Click Import button to open modal + const importBtn = page.getByTestId('import-arrows-btn'); + await importBtn.click(); + + // Wait for modal to be visible + await page.waitForTimeout(200); + + // Verify both import type radio buttons exist + const arrowsRadio = page.getByTestId('import-type-arrows'); + const edaRadio = page.getByTestId('import-type-eda'); + + await expect(arrowsRadio).toBeVisible(); + await expect(edaRadio).toBeVisible(); + + // Verify Arrows is selected by default + await expect(arrowsRadio).toBeChecked(); + + // Verify EDA file input exists + const edaFileInput = page.getByTestId('eda-file-input'); + await expect(edaFileInput).toBeAttached(); +}); + +test('import modal switches between import types', async ({ page, baseURL }) => { + const base = baseURL || process.env.BASE_URL || 'http://127.0.0.1:5000'; + await page.goto(`${base}/labels`); + await page.waitForLoadState('networkidle'); + + // Open import modal + await page.getByTestId('import-arrows-btn').click(); + await page.waitForTimeout(200); + + // Initially Arrows section should be visible + const arrowsSection = page.locator('#arrows-import-section'); + const edaSection = page.locator('#eda-import-section'); + + await expect(arrowsSection).toBeVisible(); + await expect(edaSection).not.toBeVisible(); + + // Click EDA radio button + await page.getByTestId('import-type-eda').click(); + await page.waitForTimeout(100); + + // Now EDA section should be visible + await expect(arrowsSection).not.toBeVisible(); + await expect(edaSection).toBeVisible(); + + // Switch back to Arrows + await page.getByTestId('import-type-arrows').click(); + await page.waitForTimeout(100); + + await expect(arrowsSection).toBeVisible(); + await expect(edaSection).not.toBeVisible(); +}); diff --git a/e2e/links.spec.ts b/e2e/links.spec.ts deleted file mode 100644 index 80139cb..0000000 --- a/e2e/links.spec.ts +++ /dev/null @@ -1,441 +0,0 @@ -import { test, expect } from 
'@playwright/test'; - -/** - * E2E tests for Links page functionality. - * Tests the complete workflow: create link definition → configure source → configure target → define relationship → preview → execute - */ - -test('links page loads and displays empty state', async ({ page, baseURL }) => { - const consoleMessages: { type: string; text: string }[] = []; - page.on('console', (msg) => { - consoleMessages.push({ type: msg.type(), text: msg.text() }); - }); - - const base = baseURL || process.env.BASE_URL || 'http://127.0.0.1:5000'; - - // Navigate to Links page - await page.goto(`${base}/links`); - await page.waitForLoadState('networkidle'); - - // Verify page loads - await expect(page).toHaveTitle(/SciDK - Links/i, { timeout: 10_000 }); - - // Check for new link button - await expect(page.getByTestId('new-link-btn')).toBeVisible(); - - // Check for link list - await expect(page.getByTestId('link-list')).toBeVisible(); - - // No console errors - const errors = consoleMessages.filter((m) => m.type === 'error'); - expect(errors.length).toBe(0); -}); - -test('links navigation link is visible in header', async ({ page, baseURL }) => { - const base = baseURL || process.env.BASE_URL || 'http://127.0.0.1:5000'; - - await page.goto(base); - await page.waitForLoadState('networkidle'); - - // Check that Links link exists in navigation - const linksLink = page.getByTestId('nav-links'); - await expect(linksLink).toBeVisible(); - - // Click it and verify we navigate to links page - await linksLink.click(); - await page.waitForLoadState('networkidle'); - await expect(page).toHaveTitle(/SciDK - Links/i); -}); - -test('wizard navigation: can navigate through all 4 steps', async ({ page, baseURL }) => { - const base = baseURL || process.env.BASE_URL || 'http://127.0.0.1:5000'; - await page.goto(`${base}/links`); - await page.waitForLoadState('networkidle'); - - // Click "New Link" button - await page.getByTestId('new-link-btn').click(); - - // Verify wizard is visible - await 
expect(page.locator('#link-wizard')).toBeVisible(); - - // Step 1 should be active - await expect(page.locator('.wizard-step[data-step="1"]')).toHaveClass(/active/); - - // Enter link name - await page.getByTestId('link-name').fill('Test Link'); - - // Click Next to go to step 2 - await page.locator('#btn-next').click(); - await expect(page.locator('.wizard-step[data-step="2"]')).toHaveClass(/active/); - - // Click Next to go to step 3 - await page.locator('#btn-next').click(); - await expect(page.locator('.wizard-step[data-step="3"]')).toHaveClass(/active/); - - // Enter relationship type - await page.locator('#rel-type').fill('TEST_REL'); - - // Click Next to go to step 4 - await page.locator('#btn-next').click(); - await expect(page.locator('.wizard-step[data-step="4"]')).toHaveClass(/active/); - - // Verify Back button is visible - await expect(page.locator('#btn-prev')).toBeVisible(); - - // Click Back to go to step 3 - await page.locator('#btn-prev').click(); - await expect(page.locator('.wizard-step[data-step="3"]')).toHaveClass(/active/); -}); - -test('can create CSV to Graph link definition', async ({ page, baseURL }) => { - const consoleMessages: { type: string; text: string }[] = []; - page.on('console', (msg) => { - consoleMessages.push({ type: msg.type(), text: msg.text() }); - }); - - const base = baseURL || process.env.BASE_URL || 'http://127.0.0.1:5000'; - await page.goto(`${base}/links`); - await page.waitForLoadState('networkidle'); - - // Click "New Link" button - await page.getByTestId('new-link-btn').click(); - - // Step 1: Configure Source - await page.getByTestId('link-name').fill('CSV Authors to Files'); - - // Select CSV source type - await page.locator('.source-type-btn[data-source="csv"]').click(); - - // Enter CSV data - const csvData = 'name,email,file_path\nAlice,alice@ex.com,file1.txt\nBob,bob@ex.com,file2.txt'; - await page.locator('#csv-data').fill(csvData); - - // Go to Step 2 - await page.locator('#btn-next').click(); - - // Step 
2: Configure Target - // Label target should be selected by default - await page.locator('#target-label-name').fill('File'); - - // Configure match strategy (property should be default) - await page.locator('#match-source-field').fill('file_path'); - await page.locator('#match-target-field').fill('path'); - - // Go to Step 3 - await page.locator('#btn-next').click(); - - // Step 3: Define Relationship - await page.locator('#rel-type').fill('AUTHORED'); - - // Add a relationship property - await page.locator('#btn-add-rel-prop').click(); - const propRows = page.locator('#rel-props-container .property-row'); - await expect(propRows).toHaveCount(1); - await propRows.locator('[data-prop-key]').fill('date'); - await propRows.locator('[data-prop-value]').fill('2024-01-15'); - - // Save the definition - await page.locator('#btn-save-def').click(); - await page.waitForTimeout(1500); // Wait for save - - // Verify link appears in list - const linkItems = page.locator('.link-item'); - await expect(linkItems.first()).toBeVisible(); - const linkText = await linkItems.first().textContent(); - expect(linkText).toContain('CSV Authors to Files'); - expect(linkText).toContain('csv'); - expect(linkText).toContain('AUTHORED'); - - // No console errors - const errors = consoleMessages.filter((m) => m.type === 'error'); - expect(errors.length).toBe(0); -}); - -test('can create Graph to Graph link definition', async ({ page, baseURL }) => { - const base = baseURL || process.env.BASE_URL || 'http://127.0.0.1:5000'; - await page.goto(`${base}/links`); - await page.waitForLoadState('networkidle'); - - // Click "New Link" button - await page.getByTestId('new-link-btn').click(); - - // Step 1: Configure Source (Graph is default) - await page.getByTestId('link-name').fill('Person to File Link'); - await page.locator('#source-label').fill('Person'); - await page.locator('#source-where').fill('p.role = "author"'); - - // Go to Step 2 - await page.locator('#btn-next').click(); - - // Step 2: 
Configure Target - await page.locator('#target-label-name').fill('File'); - await page.locator('#match-source-field').fill('email'); - await page.locator('#match-target-field').fill('author_email'); - - // Go to Step 3 - await page.locator('#btn-next').click(); - - // Step 3: Define Relationship - await page.locator('#rel-type').fill('AUTHORED_BY'); - - // Save the definition - await page.locator('#btn-save-def').click(); - await page.waitForTimeout(1500); - - // Verify link appears in list - const linkItems = page.locator('.link-item'); - const linkText = await linkItems.first().textContent(); - expect(linkText).toContain('Person to File Link'); - expect(linkText).toContain('graph'); -}); - -test('can save and load link definition', async ({ page, baseURL }) => { - const base = baseURL || process.env.BASE_URL || 'http://127.0.0.1:5000'; - await page.goto(`${base}/links`); - await page.waitForLoadState('networkidle'); - - const uniqueName = `Test Save Load ${Date.now()}`; - - // Create a link definition - await page.getByTestId('new-link-btn').click(); - await page.getByTestId('link-name').fill(uniqueName); - await page.locator('.source-type-btn[data-source="csv"]').click(); - await page.locator('#csv-data').fill('col1,col2\nval1,val2'); - await page.locator('#btn-next').click(); - await page.locator('#target-label-name').fill('TestLabel'); - await page.locator('#match-source-field').fill('col1'); - await page.locator('#match-target-field').fill('field1'); - await page.locator('#btn-next').click(); - await page.locator('#rel-type').fill('TEST_REL'); - await page.locator('#btn-save-def').click(); - await page.waitForTimeout(1500); - - // Click on the saved link by finding it by name - const linkItem = page.locator('.link-item').filter({ hasText: uniqueName }); - await linkItem.click(); - await page.waitForTimeout(500); - - // Verify wizard is populated with saved data - await expect(page.getByTestId('link-name')).toHaveValue(uniqueName); - - // Check that CSV button 
is active - await expect(page.locator('.source-type-btn[data-source="csv"]')).toHaveClass(/active/); - - // Navigate to step 2 and verify - await page.locator('#btn-next').click(); - await expect(page.locator('#target-label-name')).toHaveValue('TestLabel'); - await expect(page.locator('#match-source-field')).toHaveValue('col1'); - await expect(page.locator('#match-target-field')).toHaveValue('field1'); - - // Navigate to step 3 and verify - await page.locator('#btn-next').click(); - await expect(page.locator('#rel-type')).toHaveValue('TEST_REL'); - - // Cleanup: Delete the test link - page.once('dialog', async (dialog) => await dialog.accept()); - await page.locator('#btn-delete-def').click(); - await page.waitForTimeout(1000); -}); - -test('can delete link definition', async ({ page, baseURL }) => { - const base = baseURL || process.env.BASE_URL || 'http://127.0.0.1:5000'; - - // Capture console logs and errors - const consoleLogs: string[] = []; - page.on('console', msg => consoleLogs.push(`[${msg.type()}] ${msg.text()}`)); - page.on('pageerror', err => consoleLogs.push(`[ERROR] ${err.message}`)); - - await page.goto(`${base}/links`); - await page.waitForLoadState('networkidle'); - - const uniqueName = `To Delete ${Date.now()}`; - - // Create a link definition - await page.getByTestId('new-link-btn').click(); - await page.getByTestId('link-name').fill(uniqueName); - await page.locator('#btn-next').click(); - await page.locator('#target-label-name').fill('TestLabel'); - await page.locator('#btn-next').click(); - await page.locator('#rel-type').fill('DELETE_ME'); - await page.locator('#btn-save-def').click(); - await page.waitForTimeout(1500); - - // Load the link by finding it by name - const linkItem = page.locator('.link-item').filter({ hasText: uniqueName }); - await linkItem.click(); - await page.waitForTimeout(500); - - // Delete button should be visible - const deleteBtn = page.locator('#btn-delete-def'); - await expect(deleteBtn).toBeVisible(); - - // 
Handle confirmation dialog - page.once('dialog', async (dialog) => { - expect(dialog.type()).toBe('confirm'); - await dialog.accept(); - }); - - await deleteBtn.click(); - - // Wait for wizard to hide (indicates delete completed) - try { - await expect(page.locator('#link-wizard')).toBeHidden({ timeout: 5000 }); - } catch (e) { - console.log('Console logs:', consoleLogs.join('\n')); - throw e; - } - - // Wait a bit more for list to update - await page.waitForTimeout(1000); - - // Verify link is removed from list - it should not appear anywhere - const listItems = await page.locator('.link-item').all(); - const listTexts = await Promise.all(listItems.map(item => item.textContent())); - const found = listTexts.some(text => text?.includes(uniqueName)); - - if (found) { - console.log('Console logs:', consoleLogs.join('\n')); - } - - expect(found).toBe(false); -}); - -test('validation: cannot save without name', async ({ page, baseURL }) => { - const base = baseURL || process.env.BASE_URL || 'http://127.0.0.1:5000'; - await page.goto(`${base}/links`); - await page.waitForLoadState('networkidle'); - - // Create new link but don't enter name - await page.getByTestId('new-link-btn').click(); - - // Try to save without name - await page.locator('#btn-save-def').click(); - await page.waitForTimeout(500); - - // Should still be on wizard (not saved) - await expect(page.getByTestId('link-name')).toBeVisible(); - const value = await page.getByTestId('link-name').inputValue(); - expect(value).toBe(''); -}); - -test('validation: cannot save without relationship type', async ({ page, baseURL }) => { - const base = baseURL || process.env.BASE_URL || 'http://127.0.0.1:5000'; - await page.goto(`${base}/links`); - await page.waitForLoadState('networkidle'); - - // Create new link with name but no relationship type - await page.getByTestId('new-link-btn').click(); - await page.getByTestId('link-name').fill('No Rel Type'); - - // Navigate to step 3 - await 
page.locator('#btn-next').click(); - await page.locator('#btn-next').click(); - - // Don't enter relationship type - - // Try to save - await page.locator('#btn-save-def').click(); - await page.waitForTimeout(500); - - // Should still be on wizard - await expect(page.locator('#rel-type')).toBeVisible(); - const value = await page.locator('#rel-type').inputValue(); - expect(value).toBe(''); -}); - -test('can switch between source types', async ({ page, baseURL }) => { - const base = baseURL || process.env.BASE_URL || 'http://127.0.0.1:5000'; - await page.goto(`${base}/links`); - await page.waitForLoadState('networkidle'); - - await page.getByTestId('new-link-btn').click(); - - // Graph source should be visible by default - await expect(page.locator('#source-graph')).toBeVisible(); - await expect(page.locator('#source-csv')).not.toBeVisible(); - await expect(page.locator('#source-api')).not.toBeVisible(); - - // Switch to CSV - await page.locator('.source-type-btn[data-source="csv"]').click(); - await expect(page.locator('#source-graph')).not.toBeVisible(); - await expect(page.locator('#source-csv')).toBeVisible(); - await expect(page.locator('#source-api')).not.toBeVisible(); - - // Switch to API - await page.locator('.source-type-btn[data-source="api"]').click(); - await expect(page.locator('#source-graph')).not.toBeVisible(); - await expect(page.locator('#source-csv')).not.toBeVisible(); - await expect(page.locator('#source-api')).toBeVisible(); - - // Switch back to Graph - await page.locator('.source-type-btn[data-source="graph"]').click(); - await expect(page.locator('#source-graph')).toBeVisible(); - await expect(page.locator('#source-csv')).not.toBeVisible(); - await expect(page.locator('#source-api')).not.toBeVisible(); -}); - -test('can switch between match strategies', async ({ page, baseURL }) => { - const base = baseURL || process.env.BASE_URL || 'http://127.0.0.1:5000'; - await page.goto(`${base}/links`); - await page.waitForLoadState('networkidle'); - 
- await page.getByTestId('new-link-btn').click(); - - // Navigate to step 2 - await page.locator('#btn-next').click(); - - // Property match should be visible by default - await expect(page.locator('#match-property')).toBeVisible(); - await expect(page.locator('#match-id')).not.toBeVisible(); - await expect(page.locator('#match-cypher')).not.toBeVisible(); - - // Switch to ID match - await page.locator('.match-strategy-btn[data-strategy="id"]').click(); - await expect(page.locator('#match-property')).not.toBeVisible(); - await expect(page.locator('#match-id')).toBeVisible(); - await expect(page.locator('#match-cypher')).not.toBeVisible(); - - // Switch to Cypher match - await page.locator('.match-strategy-btn[data-strategy="cypher"]').click(); - await expect(page.locator('#match-property')).not.toBeVisible(); - await expect(page.locator('#match-id')).not.toBeVisible(); - await expect(page.locator('#match-cypher')).toBeVisible(); - - // Switch back to Property match - await page.locator('.match-strategy-btn[data-strategy="property"]').click(); - await expect(page.locator('#match-property')).toBeVisible(); - await expect(page.locator('#match-id')).not.toBeVisible(); - await expect(page.locator('#match-cypher')).not.toBeVisible(); -}); - -test('can add and remove relationship properties', async ({ page, baseURL }) => { - const base = baseURL || process.env.BASE_URL || 'http://127.0.0.1:5000'; - await page.goto(`${base}/links`); - await page.waitForLoadState('networkidle'); - - await page.getByTestId('new-link-btn').click(); - - // Navigate to step 3 - await page.locator('#btn-next').click(); - await page.locator('#btn-next').click(); - - // Add 3 relationship properties - for (let i = 0; i < 3; i++) { - await page.locator('#btn-add-rel-prop').click(); - } - - // Verify 3 property rows exist - const propRows = page.locator('#rel-props-container .property-row'); - await expect(propRows).toHaveCount(3); - - // Fill in values - await 
propRows.nth(0).locator('[data-prop-key]').fill('key1'); - await propRows.nth(1).locator('[data-prop-key]').fill('key2'); - await propRows.nth(2).locator('[data-prop-key]').fill('key3'); - - // Remove the second property - await propRows.nth(1).locator('button').click(); - - // Verify only 2 properties remain - await expect(page.locator('#rel-props-container .property-row')).toHaveCount(2); -}); diff --git a/e2e/map.spec.ts b/e2e/map.spec.ts index 59ed0c0..da7f4eb 100644 --- a/e2e/map.spec.ts +++ b/e2e/map.spec.ts @@ -18,7 +18,7 @@ test('map page loads and displays graph visualization', async ({ page, baseURL } await page.waitForLoadState('networkidle'); // Verify page loads - await expect(page).toHaveTitle(/SciDK - Maps/i, { timeout: 10_000 }); + await expect(page).toHaveTitle(/-SciDK-> Maps/i, { timeout: 10_000 }); // Check for main sections await expect(page.locator('h2').filter({ hasText: 'Schema Graph' })).toBeVisible(); @@ -51,7 +51,7 @@ test('map navigation link is visible in header', async ({ page, baseURL }) => { // Click it and verify we navigate to map page await mapsLink.click(); await page.waitForLoadState('networkidle'); - await expect(page).toHaveTitle(/SciDK - Maps/i); + await expect(page).toHaveTitle(/-SciDK-> Maps/i); }); test('graph filter controls are present and functional', async ({ page, baseURL }) => { diff --git a/e2e/settings-advanced.spec.ts b/e2e/settings-advanced.spec.ts index 7d5024d..8eafa9c 100644 --- a/e2e/settings-advanced.spec.ts +++ b/e2e/settings-advanced.spec.ts @@ -96,6 +96,10 @@ test('interpreter checkboxes can be toggled', async ({ page, baseURL }) => { await page.goto(`${base}/settings`); await page.waitForLoadState('networkidle'); + // Navigate to Interpreters section + await page.locator('.settings-sidebar-item[data-section="interpreters"]').click(); + await page.waitForTimeout(200); + // Wait for interpreters table to populate await page.waitForTimeout(1500); @@ -144,6 +148,10 @@ test('interpreter checkbox has data-iid 
attribute', async ({ page, baseURL }) => await page.goto(`${base}/settings`); await page.waitForLoadState('networkidle'); + // Navigate to Interpreters section + await page.locator('.settings-sidebar-item[data-section="interpreters"]').click(); + await page.waitForTimeout(200); + // Wait for interpreters table to populate await page.waitForTimeout(1500); diff --git a/e2e/settings-api-endpoints.spec.ts b/e2e/settings-api-endpoints.spec.ts new file mode 100644 index 0000000..865b01c --- /dev/null +++ b/e2e/settings-api-endpoints.spec.ts @@ -0,0 +1,180 @@ +import { test, expect } from '@playwright/test'; + +test.describe('Settings - API Endpoints', () => { + test.beforeEach(async ({ page, baseURL }) => { + // Clean up test endpoints before each test + const response = await fetch(`${baseURL}/api/admin/cleanup-test-endpoints`, { method: 'POST' }); + await response.json(); // Wait for cleanup to complete + + await page.goto(`${baseURL}/settings#integrations`); + await page.waitForLoadState('domcontentloaded'); // Wait for DOM to be ready + await page.waitForSelector('[data-testid="api-endpoint-name"]'); + await page.waitForLoadState('networkidle'); // Then wait for all API calls to complete + await page.waitForTimeout(200); // Small delay for JS initialization + }); + + test.afterEach(async ({ baseURL }) => { + // Clean up test endpoints after each test + await fetch(`${baseURL}/api/admin/cleanup-test-endpoints`, { method: 'POST' }); + }); + + test('should display API endpoint form @smoke', async ({ page }) => { + // Check all form fields are present + await expect(page.locator('[data-testid="api-endpoint-name"]')).toBeVisible(); + await expect(page.locator('[data-testid="api-endpoint-url"]')).toBeVisible(); + await expect(page.locator('[data-testid="api-endpoint-auth-method"]')).toBeVisible(); + await expect(page.locator('[data-testid="api-endpoint-auth-value"]')).toBeVisible(); + await expect(page.locator('[data-testid="api-endpoint-json-path"]')).toBeVisible(); + 
await expect(page.locator('[data-testid="api-endpoint-target-label"]')).toBeVisible(); + await expect(page.locator('[data-testid="btn-test-api-endpoint"]')).toBeVisible(); + await expect(page.locator('[data-testid="btn-save-api-endpoint"]')).toBeVisible(); + }); + + test.skip('should create a new API endpoint @smoke', async ({ page }) => { + // Fill in endpoint details + await page.fill('[data-testid="api-endpoint-name"]', 'Test Users API'); + await page.fill('[data-testid="api-endpoint-url"]', 'https://jsonplaceholder.typicode.com/users'); + await page.selectOption('[data-testid="api-endpoint-auth-method"]', 'none'); + await page.fill('[data-testid="api-endpoint-json-path"]', '$[*]'); + + // Save endpoint + await page.click('[data-testid="btn-save-api-endpoint"]'); + + // Wait for success message + await expect(page.locator('#api-endpoint-message')).toContainText('Endpoint saved!'); + + // Verify endpoint appears in list + await expect(page.locator('#api-endpoints-list')).toContainText('Test Users API'); + await expect(page.locator('#api-endpoints-list')).toContainText('jsonplaceholder.typicode.com'); + }); + + test('should validate required fields @smoke', async ({ page }) => { + // Try to save without filling required fields + await page.click('[data-testid="btn-save-api-endpoint"]'); + + // Should show error message + await expect(page.locator('#api-endpoint-message')).toContainText('Name and URL are required'); + }); + + test('should test API endpoint connection', async ({ page }) => { + // Fill in endpoint details with a real API + await page.fill('[data-testid="api-endpoint-name"]', 'Test JSONPlaceholder'); + await page.fill('[data-testid="api-endpoint-url"]', 'https://jsonplaceholder.typicode.com/users'); + await page.selectOption('[data-testid="api-endpoint-auth-method"]', 'none'); + + // Test connection + await page.click('[data-testid="btn-test-api-endpoint"]'); + + // Wait for test result (may take a moment) + await 
expect(page.locator('#api-endpoint-message')).toContainText('Connection successful', { timeout: 15000 }); + }); + + test.skip('should handle bearer token auth', async ({ page }) => { + await page.fill('[data-testid="api-endpoint-name"]', 'Secure API'); + await page.fill('[data-testid="api-endpoint-url"]', 'https://api.example.com/data'); + await page.selectOption('[data-testid="api-endpoint-auth-method"]', 'bearer'); + await page.fill('[data-testid="api-endpoint-auth-value"]', 'test_token_123'); + + // Save endpoint + await page.click('[data-testid="btn-save-api-endpoint"]'); + + // Verify saved + await expect(page.locator('#api-endpoint-message')).toContainText('Endpoint saved!'); + await expect(page.locator('#api-endpoints-list')).toContainText('Secure API'); + await expect(page.locator('#api-endpoints-list')).toContainText('bearer'); + }); + + test.skip('should edit an existing endpoint', async ({ page }) => { + // First create an endpoint + await page.fill('[data-testid="api-endpoint-name"]', 'Original API'); + await page.fill('[data-testid="api-endpoint-url"]', 'https://api.example.com/original'); + await page.click('[data-testid="btn-save-api-endpoint"]'); + await page.waitForSelector('#api-endpoints-list:has-text("Original API")'); + + // Click edit button + await page.click('#api-endpoints-list button:has-text("Edit")'); + + // Wait for form to populate + await expect(page.locator('[data-testid="api-endpoint-name"]')).toHaveValue('Original API'); + await expect(page.locator('[data-testid="btn-save-api-endpoint"]')).toContainText('Update Endpoint'); + + // Modify fields + await page.fill('[data-testid="api-endpoint-name"]', 'Updated API'); + await page.fill('[data-testid="api-endpoint-url"]', 'https://api.example.com/updated'); + + // Save changes + await page.click('[data-testid="btn-save-api-endpoint"]'); + + // Verify update + await expect(page.locator('#api-endpoint-message')).toContainText('Endpoint updated!'); + await 
expect(page.locator('#api-endpoints-list')).toContainText('Updated API'); + await expect(page.locator('#api-endpoints-list')).not.toContainText('Original API'); + }); + + test('should delete an endpoint @smoke', async ({ page }) => { + // Create an endpoint + await page.fill('[data-testid="api-endpoint-name"]', 'Delete Me API'); + await page.fill('[data-testid="api-endpoint-url"]', 'https://api.example.com/deleteme'); + await page.click('[data-testid="btn-save-api-endpoint"]'); + await page.waitForSelector('#api-endpoints-list:has-text("Delete Me API")'); + + // Set up dialog handler + page.on('dialog', dialog => dialog.accept()); + + // Click delete button + await page.click('#api-endpoints-list button:has-text("Delete")'); + + // Verify deletion + await expect(page.locator('#api-endpoint-message')).toContainText('Endpoint deleted'); + await expect(page.locator('#api-endpoints-list')).not.toContainText('Delete Me API'); + }); + + test('should cancel editing', async ({ page }) => { + // Create an endpoint + await page.fill('[data-testid="api-endpoint-name"]', 'Cancel Test API'); + await page.fill('[data-testid="api-endpoint-url"]', 'https://api.example.com/cancel'); + await page.click('[data-testid="btn-save-api-endpoint"]'); + await page.waitForSelector('#api-endpoints-list:has-text("Cancel Test API")'); + + // Start editing + await page.click('#api-endpoints-list button:has-text("Edit")'); + await expect(page.locator('[data-testid="btn-cancel-api-endpoint"]')).toBeVisible(); + + // Modify a field + await page.fill('[data-testid="api-endpoint-name"]', 'Should Not Save'); + + // Cancel + await page.click('[data-testid="btn-cancel-api-endpoint"]'); + + // Verify form is reset + await expect(page.locator('[data-testid="api-endpoint-name"]')).toHaveValue(''); + await expect(page.locator('[data-testid="btn-save-api-endpoint"]')).toContainText('Save Endpoint'); + await expect(page.locator('[data-testid="btn-cancel-api-endpoint"]')).not.toBeVisible(); + + // Verify 
original endpoint still exists unchanged + await expect(page.locator('#api-endpoints-list')).toContainText('Cancel Test API'); + }); + + test('should display empty state when no endpoints exist', async ({ page }) => { + // By default, no endpoints should exist + const listContent = await page.locator('#api-endpoints-list').textContent(); + + // Should show empty message or "No endpoints" text + expect(listContent).toMatch(/No endpoints|empty/i); + }); + + test('should populate label dropdown from existing labels', async ({ page }) => { + const labelSelect = page.locator('[data-testid="api-endpoint-target-label"]'); + + // Wait for labels to load + await page.waitForTimeout(500); + + // Check that dropdown has at least the default option + const options = await labelSelect.locator('option').count(); + expect(options).toBeGreaterThanOrEqual(1); + + // First option should be "-- Select Label --" + const firstOption = await labelSelect.locator('option').first().textContent(); + expect(firstOption).toContain('Select Label'); + }); +}); diff --git a/e2e/settings-fuzzy-matching.spec.ts b/e2e/settings-fuzzy-matching.spec.ts new file mode 100644 index 0000000..2aef5d9 --- /dev/null +++ b/e2e/settings-fuzzy-matching.spec.ts @@ -0,0 +1,125 @@ +import { test, expect } from '@playwright/test'; + +test.describe('Settings - Fuzzy Matching', () => { + test.beforeEach(async ({ page, baseURL }) => { + const base = baseURL || process.env.BASE_URL || 'http://127.0.0.1:5000'; + await page.goto(`${base}/settings#integrations`); + await page.waitForLoadState('domcontentloaded'); + await page.waitForSelector('[data-testid="fuzzy-algorithm"]'); + await page.waitForLoadState('networkidle'); + await page.waitForTimeout(200); // Small delay for JS initialization + }); + + test('should display fuzzy matching form @smoke', async ({ page }) => { + // Check all form fields are present + await expect(page.locator('[data-testid="fuzzy-algorithm"]')).toBeVisible(); + await 
expect(page.locator('[data-testid="fuzzy-threshold"]')).toBeVisible(); + await expect(page.locator('[data-testid="fuzzy-case-sensitive"]')).toBeVisible(); + await expect(page.locator('[data-testid="fuzzy-normalize-whitespace"]')).toBeVisible(); + await expect(page.locator('[data-testid="fuzzy-strip-punctuation"]')).toBeVisible(); + await expect(page.locator('[data-testid="btn-save-fuzzy-settings"]')).toBeVisible(); + await expect(page.locator('[data-testid="btn-reset-fuzzy-settings"]')).toBeVisible(); + }); + + test('should load default settings @smoke', async ({ page }) => { + // Default algorithm should be Levenshtein + const algorithmValue = await page.locator('[data-testid="fuzzy-algorithm"]').inputValue(); + expect(algorithmValue).toBe('levenshtein'); + + // Default threshold should be 80% + const thresholdValue = await page.locator('[data-testid="fuzzy-threshold"]').inputValue(); + expect(parseInt(thresholdValue)).toBe(80); + + // Normalize whitespace should be checked by default + await expect(page.locator('[data-testid="fuzzy-normalize-whitespace"]')).toBeChecked(); + + // Strip punctuation should be checked by default + await expect(page.locator('[data-testid="fuzzy-strip-punctuation"]')).toBeChecked(); + }); + + test('should update threshold value display @smoke', async ({ page }) => { + const thresholdSlider = page.locator('[data-testid="fuzzy-threshold"]'); + const thresholdDisplay = page.locator('#fuzzy-threshold-value'); + + // Change threshold + await thresholdSlider.fill('75'); + + // Display should update + await expect(thresholdDisplay).toContainText('75'); + }); + + test('should save fuzzy matching settings @smoke', async ({ page }) => { + // Change settings + await page.selectOption('[data-testid="fuzzy-algorithm"]', 'jaro_winkler'); + await page.locator('[data-testid="fuzzy-threshold"]').fill('85'); + await page.locator('[data-testid="fuzzy-case-sensitive"]').check(); + + // Save settings + await 
page.click('[data-testid="btn-save-fuzzy-settings"]'); + + // Wait for success message + await expect(page.locator('#fuzzy-settings-message')).toContainText('saved successfully', { timeout: 5000 }); + + // Reload page to verify persistence + await page.reload(); + await page.waitForLoadState('networkidle'); + + // Check that settings persisted + const algorithmValue = await page.locator('[data-testid="fuzzy-algorithm"]').inputValue(); + expect(algorithmValue).toBe('jaro_winkler'); + + const thresholdValue = await page.locator('[data-testid="fuzzy-threshold"]').inputValue(); + expect(parseInt(thresholdValue)).toBe(85); + + await expect(page.locator('[data-testid="fuzzy-case-sensitive"]')).toBeChecked(); + }); + + test('should show phonetic settings when algorithm is phonetic', async ({ page }) => { + const phoneticSettings = page.locator('#fuzzy-phonetic-settings'); + + // Initially hidden + await expect(phoneticSettings).toBeHidden(); + + // Select phonetic algorithm + await page.selectOption('[data-testid="fuzzy-algorithm"]', 'phonetic'); + + // Phonetic settings should now be visible + await expect(phoneticSettings).toBeVisible(); + await expect(page.locator('[data-testid="fuzzy-phonetic-enabled"]')).toBeChecked(); + }); + + test('should reset to defaults @smoke', async ({ page }) => { + // Change settings + await page.selectOption('[data-testid="fuzzy-algorithm"]', 'exact'); + await page.locator('[data-testid="fuzzy-threshold"]').fill('50'); + await page.locator('[data-testid="fuzzy-normalize-whitespace"]').uncheck(); + + // Save changes + await page.click('[data-testid="btn-save-fuzzy-settings"]'); + await page.waitForSelector('#fuzzy-settings-message:has-text("saved")'); + + // Reset to defaults + page.on('dialog', dialog => dialog.accept()); // Accept confirmation + await page.click('[data-testid="btn-reset-fuzzy-settings"]'); + + // Wait for reset message + await expect(page.locator('#fuzzy-settings-message')).toContainText('Reset to defaults', { timeout: 
5000 }); + + // Check defaults are restored + const algorithmValue = await page.locator('[data-testid="fuzzy-algorithm"]').inputValue(); + expect(algorithmValue).toBe('levenshtein'); + + const thresholdValue = await page.locator('[data-testid="fuzzy-threshold"]').inputValue(); + expect(parseInt(thresholdValue)).toBe(80); + + await expect(page.locator('[data-testid="fuzzy-normalize-whitespace"]')).toBeChecked(); + }); + + test('should display architecture info panel', async ({ page }) => { + // Check that the architecture explanation is visible + await expect(page.locator('text=Hybrid Matching Architecture')).toBeVisible(); + await expect(page.locator('text=Phase 1 (Client-Side)')).toBeVisible(); + await expect(page.locator('text=Phase 2 (Server-Side)')).toBeVisible(); + await expect(page.locator('text=Neo4j APOC')).toBeVisible(); + }); +}); diff --git a/e2e/settings-table-formats.spec.ts b/e2e/settings-table-formats.spec.ts new file mode 100644 index 0000000..0e2b1a2 --- /dev/null +++ b/e2e/settings-table-formats.spec.ts @@ -0,0 +1,169 @@ +import { test, expect } from '@playwright/test'; + +test.describe('Settings - Table Format Registry', () => { + test.beforeEach(async ({ page, baseURL }) => { + await page.goto(`${baseURL}/settings#integrations`); + await page.waitForLoadState('domcontentloaded'); + await page.waitForSelector('[data-testid="table-format-name"]'); + await page.waitForLoadState('networkidle'); + await page.waitForTimeout(200); // Small delay for JS initialization + }); + + test('should display table format form @smoke', async ({ page }) => { + // Check all form fields are present + await expect(page.locator('[data-testid="table-format-name"]')).toBeVisible(); + await expect(page.locator('[data-testid="table-format-file-type"]')).toBeVisible(); + await expect(page.locator('[data-testid="table-format-delimiter"]')).toBeVisible(); + await expect(page.locator('[data-testid="table-format-encoding"]')).toBeVisible(); + await 
expect(page.locator('[data-testid="table-format-target-label"]')).toBeVisible(); + await expect(page.locator('[data-testid="table-format-has-header"]')).toBeVisible(); + await expect(page.locator('[data-testid="table-format-description"]')).toBeVisible(); + await expect(page.locator('[data-testid="btn-save-table-format"]')).toBeVisible(); + }); + + test('should display preprogrammed formats @smoke', async ({ page }) => { + // Check that preprogrammed formats are listed + const formatsList = page.locator('#table-formats-list'); + + // Should show at least the preprogrammed formats + await expect(formatsList).toContainText('CSV (Standard)'); + await expect(formatsList).toContainText('TSV (Standard)'); + await expect(formatsList).toContainText('Excel (Standard)'); + await expect(formatsList).toContainText('Parquet (Standard)'); + + // Preprogrammed formats should be marked as read-only + await expect(formatsList).toContainText('Preprogrammed'); + }); + + test('should create a new custom format @smoke', async ({ page }) => { + const uniqueName = `Test Custom CSV ${Date.now()}`; + // Fill in format details + await expect(page.locator('[data-testid="table-format-name"]')).toBeVisible(); + await page.fill('[data-testid="table-format-name"]', uniqueName); + await page.selectOption('[data-testid="table-format-file-type"]', 'csv'); + await page.fill('[data-testid="table-format-delimiter"]', ';'); + await page.selectOption('[data-testid="table-format-encoding"]', 'utf-8'); + await page.fill('[data-testid="table-format-description"]', 'Test semicolon-separated format'); + + // Save format + await expect(page.locator('[data-testid="btn-save-table-format"]')).toBeVisible(); + await page.click('[data-testid="btn-save-table-format"]'); + + // Wait for format to appear in list (more reliable than message) + await expect(page.locator('#table-formats-list')).toContainText(uniqueName, { timeout: 5000 }); + await expect(page.locator('#table-formats-list')).toContainText(';'); + }); + + 
test('should validate required fields @smoke', async ({ page }) => { + // Try to save without filling required fields + await page.click('[data-testid="btn-save-table-format"]'); + + // Should show error message + await expect(page.locator('#table-format-message')).toContainText('Name is required'); + }); + + test('should update delimiter based on file type', async ({ page }) => { + // Select CSV + await page.selectOption('[data-testid="table-format-file-type"]', 'csv'); + await expect(page.locator('[data-testid="table-format-delimiter"]')).toHaveValue(','); + + // Select TSV + await page.selectOption('[data-testid="table-format-file-type"]', 'tsv'); + await expect(page.locator('[data-testid="table-format-delimiter"]')).toHaveValue('\t'); + + // Select Excel - delimiter should be disabled + await page.selectOption('[data-testid="table-format-file-type"]', 'excel'); + await expect(page.locator('[data-testid="table-format-delimiter"]')).toBeDisabled(); + }); + + test('should delete custom format', async ({ page }) => { + const uniqueName = `Format To Delete ${Date.now()}`; + // First create a format + await page.fill('[data-testid="table-format-name"]', uniqueName); + await page.selectOption('[data-testid="table-format-file-type"]', 'csv'); + await page.click('[data-testid="btn-save-table-format"]'); + await page.waitForTimeout(500); + await page.waitForSelector(`#table-formats-list:has-text("${uniqueName}")`); + + // Find the row containing our format and click its delete button + const formatRow = page.locator(`#table-formats-list tr:has-text("${uniqueName}")`); + const deleteButton = formatRow.locator('button:has-text("Delete")'); + + // Set up dialog handler before clicking + page.once('dialog', dialog => dialog.accept()); + await deleteButton.click(); + + // Wait a moment for deletion to complete + await page.waitForTimeout(1500); + + // Verify format is removed from list + await expect(page.locator('#table-formats-list')).not.toContainText(uniqueName); + }); + 
+ test('should not allow deletion of preprogrammed formats', async ({ page }) => { + const formatsList = page.locator('#table-formats-list'); + + // Check that preprogrammed formats don't have delete buttons + const preprogrammedRow = page.locator('#table-formats-list tr:has-text("CSV (Standard)")'); + await expect(preprogrammedRow).toContainText('Read-only'); + + // Should not have Edit or Delete buttons for preprogrammed formats + const deleteButtons = preprogrammedRow.locator('button:has-text("Delete")'); + await expect(deleteButtons).toHaveCount(0); + }); + + test('should edit custom format', async ({ page }) => { + const originalName = `Original Format ${Date.now()}`; + const updatedName = `Updated Format ${Date.now()}`; + // First create a format + await page.fill('[data-testid="table-format-name"]', originalName); + await page.selectOption('[data-testid="table-format-file-type"]', 'csv'); + await page.fill('[data-testid="table-format-delimiter"]', ','); + await page.click('[data-testid="btn-save-table-format"]'); + await page.waitForTimeout(500); + await page.waitForSelector(`#table-formats-list:has-text("${originalName}")`); + + // Find the row containing our format and click its edit button + const formatRow = page.locator(`#table-formats-list tr:has-text("${originalName}")`); + const editButton = formatRow.locator('button:has-text("Edit")'); + await editButton.click(); + await page.waitForTimeout(300); + + // Wait for form to populate + await expect(page.locator('[data-testid="table-format-name"]')).toHaveValue(originalName); + await expect(page.locator('[data-testid="btn-save-table-format"]')).toContainText('Update Format'); + + // Edit the name + await page.fill('[data-testid="table-format-name"]', updatedName); + await page.fill('[data-testid="table-format-delimiter"]', ';'); + + // Save changes + await page.click('[data-testid="btn-save-table-format"]'); + await page.waitForTimeout(500); + + // Verify changes appear in list (more reliable than 
message) + await expect(page.locator('#table-formats-list')).toContainText(updatedName, { timeout: 5000 }); + await expect(page.locator('#table-formats-list')).toContainText(';'); + }); + + test('should show cancel button when editing', async ({ page }) => { + // Create a format first + await page.fill('[data-testid="table-format-name"]', 'Edit Test'); + await page.click('[data-testid="btn-save-table-format"]'); + await page.waitForSelector('#table-formats-list:has-text("Edit Test")'); + + // Click edit + await page.locator('#table-formats-list button:has-text("Edit")').first().click(); + + // Cancel button should now be visible + await expect(page.locator('[data-testid="btn-cancel-table-format"]')).toBeVisible(); + + // Click cancel + await page.click('[data-testid="btn-cancel-table-format"]'); + + // Form should be reset + await expect(page.locator('[data-testid="table-format-name"]')).toHaveValue(''); + await expect(page.locator('[data-testid="btn-save-table-format"]')).toContainText('Save Format'); + await expect(page.locator('[data-testid="btn-cancel-table-format"]')).not.toBeVisible(); + }); +}); diff --git a/e2e/settings.spec.ts b/e2e/settings.spec.ts index 5475aab..247fb7e 100644 --- a/e2e/settings.spec.ts +++ b/e2e/settings.spec.ts @@ -18,22 +18,26 @@ test('settings page loads and displays system information', async ({ page, baseU await page.waitForLoadState('networkidle'); // Verify page loads - await expect(page).toHaveTitle(/SciDK - Settings/i, { timeout: 10_000 }); + await expect(page).toHaveTitle(/-SciDK-> Settings/i, { timeout: 10_000 }); - // Check for main sections - await expect(page.locator('main h1')).toContainText('Settings'); - await expect(page.locator('h2').filter({ hasText: 'Neo4j Connection' })).toBeVisible(); - await expect(page.locator('h2').filter({ hasText: 'Interpreters' })).toBeVisible(); - await expect(page.locator('h2').filter({ hasText: 'Plugins' })).toBeVisible(); - await expect(page.locator('h2').filter({ hasText: 'Rclone 
Interpretation' })).toBeVisible(); + // Check for sidebar navigation + await expect(page.locator('.settings-sidebar')).toBeVisible(); + await expect(page.locator('.settings-sidebar-item[data-section="general"]')).toBeVisible(); + await expect(page.locator('.settings-sidebar-item[data-section="neo4j"]')).toBeVisible(); + await expect(page.locator('.settings-sidebar-item[data-section="interpreters"]')).toBeVisible(); + + // Check that General section is active by default + const generalSection = page.locator('#general-section'); + await expect(generalSection).toBeVisible(); + await expect(generalSection.locator('h1')).toHaveText('General'); // Check for system info badges - const badges = page.locator('.badge'); + const badges = generalSection.locator('.badge'); await expect(badges.first()).toBeVisible(); // Check for unexpected console errors (allow API 404s for interpreters) - const errors = consoleMessages.filter((m) => - m.type === 'error' && + const errors = consoleMessages.filter((m) => + m.type === 'error' && !m.text.includes('Failed to load resource') && !m.text.includes('404') ); @@ -53,7 +57,7 @@ test('settings navigation link is visible in header', async ({ page, baseURL }) // Click it and verify we navigate to settings page await settingsLink.click(); await page.waitForLoadState('networkidle'); - await expect(page).toHaveTitle(/SciDK - Settings/i); + await expect(page).toHaveTitle(/-SciDK-> Settings/i); }); test('neo4j connection form has all required inputs', async ({ page, baseURL }) => { @@ -61,6 +65,10 @@ test('neo4j connection form has all required inputs', async ({ page, baseURL }) await page.goto(`${base}/settings`); await page.waitForLoadState('networkidle'); + // Navigate to Neo4j section + await page.locator('.settings-sidebar-item[data-section="neo4j"]').click(); + await page.waitForTimeout(200); + // Check Neo4j form inputs const uriInput = page.locator('#neo4j-uri'); const userInput = page.locator('#neo4j-user'); @@ -94,6 +102,10 @@ 
test('neo4j password visibility toggle works', async ({ page, baseURL }) => { await page.goto(`${base}/settings`); await page.waitForLoadState('networkidle'); + // Navigate to Neo4j section + await page.locator('.settings-sidebar-item[data-section="neo4j"]').click(); + await page.waitForTimeout(200); + const passInput = page.locator('#neo4j-pass'); const showCheckbox = page.locator('#neo4j-pass-show'); @@ -116,6 +128,10 @@ test('neo4j form can accept input', async ({ page, baseURL }) => { await page.goto(`${base}/settings`); await page.waitForLoadState('networkidle'); + // Navigate to Neo4j section + await page.locator('.settings-sidebar-item[data-section="neo4j"]').click(); + await page.waitForTimeout(200); + const uriInput = page.locator('#neo4j-uri'); const userInput = page.locator('#neo4j-user'); const dbInput = page.locator('#neo4j-db'); @@ -139,6 +155,10 @@ test('neo4j save button sends POST request', async ({ page, baseURL }) => { await page.goto(`${base}/settings`); await page.waitForLoadState('networkidle'); + // Navigate to Neo4j section + await page.locator('.settings-sidebar-item[data-section="neo4j"]').click(); + await page.waitForTimeout(200); + // Mock the save API await page.route('**/api/settings/neo4j', async (route) => { if (route.request().method() === 'POST') { @@ -173,6 +193,10 @@ test('neo4j test connection button works', async ({ page, baseURL }) => { await page.goto(`${base}/settings`); await page.waitForLoadState('networkidle'); + // Navigate to Neo4j section + await page.locator('.settings-sidebar-item[data-section="neo4j"]').click(); + await page.waitForTimeout(200); + // Expand advanced section const advancedDetails = page.locator('details').filter({ hasText: 'Advanced / Health' }); await advancedDetails.locator('summary').click(); @@ -237,6 +261,10 @@ test('interpreters table loads and displays data', async ({ page, baseURL }) => await page.goto(`${base}/settings`); await page.waitForLoadState('networkidle'); + // Navigate to 
Interpreters section + await page.locator('.settings-sidebar-item[data-section="interpreters"]').click(); + await page.waitForTimeout(200); + // Wait for table to be populated await page.waitForTimeout(1000); @@ -294,6 +322,10 @@ test('interpreter toggle sends API request', async ({ page, baseURL }) => { await page.goto(`${base}/settings`); await page.waitForLoadState('networkidle'); + // Navigate to Interpreters section + await page.locator('.settings-sidebar-item[data-section="interpreters"]').click(); + await page.waitForTimeout(200); + // Wait for table to be populated await page.waitForTimeout(1000); @@ -334,6 +366,10 @@ test('rclone interpretation settings can be updated', async ({ page, baseURL }) await page.goto(`${base}/settings`); await page.waitForLoadState('networkidle'); + // Navigate to Rclone section + await page.locator('.settings-sidebar-item[data-section="rclone"]').click(); + await page.waitForTimeout(200); + // Wait for settings to load await page.waitForTimeout(1000); @@ -365,57 +401,71 @@ test('rclone interpretation settings can be updated', async ({ page, baseURL }) await expect(msgSpan).toContainText('Saved'); }); -test('rclone mounts section displays when feature is enabled', async ({ page, baseURL }) => { +test('rclone section displays interpretation settings', async ({ page, baseURL }) => { const base = baseURL || process.env.BASE_URL || 'http://127.0.0.1:5000'; await page.goto(`${base}/settings`); await page.waitForLoadState('networkidle'); - // Check for Rclone Mounts section - const mountsSection = page.locator('h2').filter({ hasText: 'Rclone Mounts' }); - await expect(mountsSection).toBeVisible(); - - // Check for mount form inputs - const remoteInput = page.locator('#rc-remote'); - const subpathInput = page.locator('#rc-subpath'); - const nameInput = page.locator('#rc-name'); - const roCheckbox = page.locator('#rc-ro'); - const createButton = page.locator('#rc-create'); - - await expect(remoteInput).toBeVisible(); - await 
expect(subpathInput).toBeVisible(); - await expect(nameInput).toBeVisible(); - await expect(roCheckbox).toBeVisible(); - await expect(createButton).toBeVisible(); - - // Check for refresh button - const refreshButton = page.locator('#rc-refresh'); - await expect(refreshButton).toBeVisible(); - - // Check for mounts table - const mountsTable = page.locator('#rc-table-body'); - await expect(mountsTable).toBeVisible(); -}); + // Navigate to Rclone section + await page.locator('.settings-sidebar-item[data-section="rclone"]').click(); + await page.waitForTimeout(200); -test('settings page anchor links work for section navigation', async ({ page, baseURL }) => { - const base = baseURL || process.env.BASE_URL || 'http://127.0.0.1:5000'; + // Check for Rclone section header + const rcloneSection = page.locator('#rclone-section'); + await expect(rcloneSection).toBeVisible(); + await expect(rcloneSection.locator('h1')).toHaveText('Rclone'); - // Navigate to interpreters section via anchor - await page.goto(`${base}/settings#interpreters`); - await page.waitForLoadState('networkidle'); + // Check for Interpretation subsection + const interpretSection = rcloneSection.locator('h2').filter({ hasText: 'Interpretation' }); + await expect(interpretSection).toBeVisible(); - // Verify we're at settings page - await expect(page).toHaveTitle(/SciDK - Settings/i); + // Check for interpretation form inputs + const suggestInput = page.locator('#rc-suggest'); + const batchInput = page.locator('#rc-batch'); + const saveButton = page.locator('#rc-save'); + + await expect(suggestInput).toBeVisible(); + await expect(batchInput).toBeVisible(); + await expect(saveButton).toBeVisible(); +}); - // Verify interpreters section is visible - const interpretersHeading = page.locator('#interpreters'); - await expect(interpretersHeading).toBeVisible(); +test('settings page sidebar navigation works', async ({ page, baseURL }) => { + const base = baseURL || process.env.BASE_URL || 'http://127.0.0.1:5000'; 
- // Navigate to plugins section via anchor - await page.goto(`${base}/settings#plugins`); + await page.goto(`${base}/settings`); await page.waitForLoadState('networkidle'); - // Verify plugins section is visible - const pluginsHeading = page.locator('#plugins'); - await expect(pluginsHeading).toBeVisible(); + // Verify we're at settings page + await expect(page).toHaveTitle(/-SciDK-> Settings/i); + + // General section should be active by default + const generalSection = page.locator('#general-section'); + await expect(generalSection).toBeVisible(); + await expect(generalSection).toHaveClass(/active/); + + // Click on Interpreters sidebar item + const interpretersSidebarItem = page.locator('.settings-sidebar-item[data-section="interpreters"]'); + await interpretersSidebarItem.click(); + await page.waitForTimeout(200); + + // Verify interpreters section is now visible and active + const interpretersSection = page.locator('#interpreters-section'); + await expect(interpretersSection).toBeVisible(); + await expect(interpretersSection).toHaveClass(/active/); + await expect(interpretersSidebarItem).toHaveClass(/active/); + + // Click on Plugins sidebar item + const pluginsSidebarItem = page.locator('.settings-sidebar-item[data-section="plugins"]'); + await pluginsSidebarItem.click(); + await page.waitForTimeout(200); + + // Verify plugins section is now visible and active + const pluginsSection = page.locator('#plugins-section'); + await expect(pluginsSection).toBeVisible(); + await expect(pluginsSection).toHaveClass(/active/); + await expect(pluginsSidebarItem).toHaveClass(/active/); + + // Verify interpreters section is no longer active + await expect(interpretersSection).not.toHaveClass(/active/); }); diff --git a/package.json b/package.json index cbfb55f..00b1346 100644 --- a/package.json +++ b/package.json @@ -7,6 +7,8 @@ }, "scripts": { "e2e": "playwright test -c e2e/playwright.config.ts", + "e2e:fast": "playwright test -c e2e/playwright.config.ts --grep @smoke", 
+ "e2e:full": "playwright test -c e2e/playwright.config.ts", "e2e:headed": "PWDEBUG=1 playwright test -c e2e/playwright.config.ts --headed", "e2e:install": "npx playwright install --with-deps" } diff --git a/pyproject.toml b/pyproject.toml index 2b88a90..afddf05 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,6 +17,10 @@ dependencies = [ "neo4j>=5.14", "psutil>=5.9", "python-dateutil>=2.8", + "cryptography>=41.0", + "jsonpath-ng>=1.6", + "pandas>=2.0", + "rapidfuzz>=3.0", ] [project.optional-dependencies] diff --git a/requirements.txt b/requirements.txt index 3ac9af4..4e9d434 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,10 +6,15 @@ PyYAML>=6.0 neo4j>=5.14 psutil>=5.9 python-dateutil>=2.8 +cryptography>=41.0 +jsonpath-ng>=1.6 +pandas>=2.0 +rapidfuzz>=3.0 # Dev/test dependencies (same as pyproject.toml [project.optional-dependencies].dev) pytest>=7.4 pytest-playwright==0.4.3 playwright==1.40.0 requests>=2.32 +beautifulsoup4>=4.12 coverage>=7.4 diff --git a/scidk/core/api_endpoint_registry.py b/scidk/core/api_endpoint_registry.py new file mode 100644 index 0000000..4d350ba --- /dev/null +++ b/scidk/core/api_endpoint_registry.py @@ -0,0 +1,336 @@ +""" +API Endpoint Registry for Links integration. + +Manages persistent storage of API endpoint configurations for use in Links wizard. +Supports authentication, field mappings, and test connections. +""" + +import sqlite3 +import json +import uuid +from datetime import datetime, timezone +from typing import List, Dict, Any, Optional +from cryptography.fernet import Fernet +import os + + +class APIEndpointRegistry: + """ + Registry for API endpoint configurations. + + Stores endpoint metadata including: + - URL and authentication + - JSONPath extraction rules + - Field mappings to Label properties + - Encrypted auth tokens + """ + + def __init__(self, db_path: str, encryption_key: Optional[str] = None): + """ + Initialize registry with SQLite database. 
+ + Args: + db_path: Path to settings database + encryption_key: Fernet key for auth token encryption (base64-encoded) + If None, generates a new key (only for development!) + """ + self.db_path = db_path + self.db = sqlite3.connect(db_path, check_same_thread=False) + self.db.execute('PRAGMA journal_mode=WAL;') + self.db.row_factory = sqlite3.Row + + # Initialize encryption + if encryption_key: + self.cipher = Fernet(encryption_key.encode()) + else: + # Generate ephemeral key (WARNING: not persistent across restarts) + self.cipher = Fernet(Fernet.generate_key()) + + self.init_tables() + + def init_tables(self): + """Create tables if they don't exist.""" + self.db.execute( + """ + CREATE TABLE IF NOT EXISTS api_endpoints ( + id TEXT PRIMARY KEY, + name TEXT NOT NULL UNIQUE, + url TEXT NOT NULL, + auth_method TEXT NOT NULL DEFAULT 'none', + auth_value_encrypted TEXT, + json_path TEXT, + target_label TEXT, + field_mappings TEXT, + created_at REAL NOT NULL, + updated_at REAL NOT NULL + ) + """ + ) + self.db.commit() + + def create_endpoint(self, endpoint_data: Dict[str, Any]) -> Dict[str, Any]: + """ + Create a new API endpoint configuration. 
+ + Args: + endpoint_data: Dict with keys: + - name: Endpoint name (required) + - url: API URL (required) + - auth_method: "none", "bearer", or "api_key" (default: "none") + - auth_value: Auth token/key (optional, encrypted at rest) + - json_path: JSONPath for extracting data (optional) + - target_label: Target Label name (optional) + - field_mappings: Dict {api_field: label_property} (optional) + + Returns: + Created endpoint dict with id + + Raises: + ValueError: If required fields missing or endpoint name exists + """ + # Validation + if not endpoint_data.get('name'): + raise ValueError("Endpoint name is required") + if not endpoint_data.get('url'): + raise ValueError("Endpoint URL is required") + + # Check for duplicate name + existing = self.get_endpoint_by_name(endpoint_data['name']) + if existing: + raise ValueError(f"Endpoint with name '{endpoint_data['name']}' already exists") + + endpoint_id = str(uuid.uuid4()) + now = datetime.now(timezone.utc).timestamp() + + # Encrypt auth value if present + auth_value = endpoint_data.get('auth_value', '') + auth_value_encrypted = None + if auth_value: + auth_value_encrypted = self.cipher.encrypt(auth_value.encode()).decode() + + # Serialize field mappings + field_mappings_json = json.dumps(endpoint_data.get('field_mappings', {})) + + self.db.execute( + """ + INSERT INTO api_endpoints + (id, name, url, auth_method, auth_value_encrypted, json_path, + target_label, field_mappings, created_at, updated_at) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?) + """, + ( + endpoint_id, + endpoint_data['name'], + endpoint_data['url'], + endpoint_data.get('auth_method', 'none'), + auth_value_encrypted, + endpoint_data.get('json_path', ''), + endpoint_data.get('target_label', ''), + field_mappings_json, + now, + now + ) + ) + self.db.commit() + + return self.get_endpoint(endpoint_id) + + def get_endpoint(self, endpoint_id: str) -> Optional[Dict[str, Any]]: + """ + Get endpoint by ID. 
+ + Args: + endpoint_id: Endpoint UUID + + Returns: + Endpoint dict (without decrypted auth_value) or None + """ + cur = self.db.execute( + "SELECT * FROM api_endpoints WHERE id = ?", + (endpoint_id,) + ) + row = cur.fetchone() + if not row: + return None + + return self._row_to_dict(row, include_auth=False) + + def get_endpoint_by_name(self, name: str) -> Optional[Dict[str, Any]]: + """Get endpoint by name.""" + cur = self.db.execute( + "SELECT * FROM api_endpoints WHERE name = ?", + (name,) + ) + row = cur.fetchone() + if not row: + return None + + return self._row_to_dict(row, include_auth=False) + + def list_endpoints(self) -> List[Dict[str, Any]]: + """ + List all endpoints. + + Returns: + List of endpoint dicts (without decrypted auth values) + """ + cur = self.db.execute( + "SELECT * FROM api_endpoints ORDER BY name" + ) + rows = cur.fetchall() + return [self._row_to_dict(row, include_auth=False) for row in rows] + + def update_endpoint(self, endpoint_id: str, updates: Dict[str, Any]) -> Dict[str, Any]: + """ + Update an existing endpoint. 
+ + Args: + endpoint_id: Endpoint UUID + updates: Dict with fields to update + + Returns: + Updated endpoint dict + + Raises: + ValueError: If endpoint not found or name conflict + """ + endpoint = self.get_endpoint(endpoint_id) + if not endpoint: + raise ValueError(f"Endpoint {endpoint_id} not found") + + # Check for name conflict if renaming + if 'name' in updates and updates['name'] != endpoint['name']: + existing = self.get_endpoint_by_name(updates['name']) + if existing and existing['id'] != endpoint_id: + raise ValueError(f"Endpoint with name '{updates['name']}' already exists") + + # Build update query dynamically + set_clauses = [] + values = [] + + if 'name' in updates: + set_clauses.append("name = ?") + values.append(updates['name']) + + if 'url' in updates: + set_clauses.append("url = ?") + values.append(updates['url']) + + if 'auth_method' in updates: + set_clauses.append("auth_method = ?") + values.append(updates['auth_method']) + + if 'auth_value' in updates: + if updates['auth_value']: + auth_encrypted = self.cipher.encrypt(updates['auth_value'].encode()).decode() + set_clauses.append("auth_value_encrypted = ?") + values.append(auth_encrypted) + else: + set_clauses.append("auth_value_encrypted = NULL") + + if 'json_path' in updates: + set_clauses.append("json_path = ?") + values.append(updates.get('json_path', '')) + + if 'target_label' in updates: + set_clauses.append("target_label = ?") + values.append(updates.get('target_label', '')) + + if 'field_mappings' in updates: + set_clauses.append("field_mappings = ?") + values.append(json.dumps(updates['field_mappings'])) + + if not set_clauses: + return endpoint + + set_clauses.append("updated_at = ?") + values.append(datetime.now(timezone.utc).timestamp()) + + values.append(endpoint_id) + + query = f"UPDATE api_endpoints SET {', '.join(set_clauses)} WHERE id = ?" 
+ self.db.execute(query, values) + self.db.commit() + + return self.get_endpoint(endpoint_id) + + def delete_endpoint(self, endpoint_id: str) -> bool: + """ + Delete an endpoint. + + Args: + endpoint_id: Endpoint UUID + + Returns: + True if deleted, False if not found + """ + cursor = self.db.execute( + "DELETE FROM api_endpoints WHERE id = ?", + (endpoint_id,) + ) + self.db.commit() + return cursor.rowcount > 0 + + def get_decrypted_auth(self, endpoint_id: str) -> Optional[str]: + """ + Get decrypted auth value for an endpoint. + + Args: + endpoint_id: Endpoint UUID + + Returns: + Decrypted auth value or None + """ + cur = self.db.execute( + "SELECT auth_value_encrypted FROM api_endpoints WHERE id = ?", + (endpoint_id,) + ) + row = cur.fetchone() + if not row or not row['auth_value_encrypted']: + return None + + try: + return self.cipher.decrypt(row['auth_value_encrypted'].encode()).decode() + except Exception: + return None + + def _row_to_dict(self, row: sqlite3.Row, include_auth: bool = False) -> Dict[str, Any]: + """Convert SQLite row to dict.""" + data = { + 'id': row['id'], + 'name': row['name'], + 'url': row['url'], + 'auth_method': row['auth_method'], + 'json_path': row['json_path'] or '', + 'target_label': row['target_label'] or '', + 'field_mappings': json.loads(row['field_mappings']) if row['field_mappings'] else {}, + 'created_at': row['created_at'], + 'updated_at': row['updated_at'] + } + + if include_auth and row['auth_value_encrypted']: + try: + data['auth_value'] = self.cipher.decrypt(row['auth_value_encrypted'].encode()).decode() + except Exception: + data['auth_value'] = None + + return data + + +def get_encryption_key() -> str: + """ + Get encryption key from environment or generate one. + + For production, set SCIDK_API_ENCRYPTION_KEY environment variable. + For development, a key is generated (but not persisted!). 
@dataclass
class FuzzyMatchSettings:
    """Tunable options for one fuzzy-matching run."""
    # Similarity algorithm: levenshtein, jaro_winkler, phonetic or exact.
    algorithm: str = 'levenshtein'
    # Minimum similarity (0.0 to 1.0) for a pair to count as a match.
    threshold: float = 0.80
    case_sensitive: bool = False
    normalize_whitespace: bool = True
    strip_punctuation: bool = True
    phonetic_enabled: bool = False
    # Phonetic encoder: soundex, metaphone or double_metaphone.
    phonetic_algorithm: str = 'metaphone'
    min_string_length: int = 3
    max_comparisons: int = 10000
    show_confidence_scores: bool = True

    def to_dict(self) -> Dict[str, Any]:
        """Return the settings as a JSON-serializable dict, one key per field."""
        # The dataclass field table is ordered by declaration, so the keys
        # come out in the same order as the fields above.
        return {
            field_name: getattr(self, field_name)
            for field_name in self.__dataclass_fields__
        }

    @staticmethod
    def from_dict(data: Dict[str, Any]) -> 'FuzzyMatchSettings':
        """Build settings from a dict; absent keys take the field default and
        unknown keys are ignored."""
        kwargs = {}
        for field_name, spec in FuzzyMatchSettings.__dataclass_fields__.items():
            kwargs[field_name] = data.get(field_name, spec.default)
        return FuzzyMatchSettings(**kwargs)
def init_tables(self):
    """Make sure the ``fuzzy_match_settings`` table exists, then seed it.

    The DDL is idempotent (``IF NOT EXISTS``); after committing it the
    method delegates to ``_seed_global_default()``.
    """
    schema_sql = """
        CREATE TABLE IF NOT EXISTS fuzzy_match_settings (
            id TEXT PRIMARY KEY,
            name TEXT NOT NULL UNIQUE,
            algorithm TEXT NOT NULL,
            threshold REAL NOT NULL,
            case_sensitive INTEGER NOT NULL,
            normalize_whitespace INTEGER NOT NULL,
            strip_punctuation INTEGER NOT NULL,
            phonetic_enabled INTEGER NOT NULL,
            phonetic_algorithm TEXT,
            min_string_length INTEGER NOT NULL,
            max_comparisons INTEGER NOT NULL,
            show_confidence_scores INTEGER NOT NULL,
            is_global INTEGER NOT NULL DEFAULT 0,
            created_at REAL NOT NULL,
            updated_at REAL NOT NULL
        )
        """
    connection = self.db
    connection.execute(schema_sql)
    connection.commit()

    # The seeding step is responsible for inserting the global default row.
    self._seed_global_default()
def update_global_settings(self, settings: Dict[str, Any]) -> FuzzyMatchSettings:
    """
    Update the global fuzzy matching settings row.

    Only recognised keys present in ``settings`` are written; everything
    else is ignored. When no recognised key is present nothing is written
    and ``updated_at`` is left untouched.

    Args:
        settings: Partial mapping of setting name to new value.

    Returns:
        The settings as re-read via ``get_global_settings()``.
    """
    # (column, stored_as_flag): flag columns are persisted as 0/1 integers.
    # Column names come only from this fixed table, never from the caller,
    # so interpolating them into the SQL below is safe.
    updatable_columns = (
        ('algorithm', False),
        ('threshold', False),
        ('case_sensitive', True),
        ('normalize_whitespace', True),
        ('strip_punctuation', True),
        ('phonetic_enabled', True),
        ('phonetic_algorithm', False),
        ('min_string_length', False),
        ('max_comparisons', False),
        ('show_confidence_scores', True),
    )

    assignments = []
    params = []
    for column, stored_as_flag in updatable_columns:
        if column not in settings:
            continue
        value = settings[column]
        assignments.append(f"{column} = ?")
        params.append((1 if value else 0) if stored_as_flag else value)

    # The timestamp is added only after we know there is a real change;
    # adding it first would make this guard always true.
    if assignments:
        assignments.append("updated_at = ?")
        params.append(datetime.now(timezone.utc).timestamp())

        sql = f"UPDATE fuzzy_match_settings SET {', '.join(assignments)} WHERE is_global = 1"
        self.db.execute(sql, params)
        self.db.commit()

    return self.get_global_settings()
" + "Install with: pip install rapidfuzz>=3.0" + ) + + def _normalize_string(self, text: str, settings: FuzzyMatchSettings) -> str: + """Normalize string according to settings.""" + if not isinstance(text, str): + text = str(text) + + if not settings.case_sensitive: + text = text.lower() + + if settings.normalize_whitespace: + text = ' '.join(text.split()) + + if settings.strip_punctuation: + import string + text = text.translate(str.maketrans('', '', string.punctuation)) + + return text.strip() + + def match_external_data( + self, + external_records: List[Dict[str, Any]], + existing_nodes: List[Dict[str, Any]], + match_key: str, + settings: Optional[FuzzyMatchSettings] = None + ) -> List[Dict[str, Any]]: + """ + Phase 1: Match external data against existing Neo4j nodes (client-side). + + Args: + external_records: List of external records to match + existing_nodes: List of existing Neo4j nodes to match against + match_key: Property key to use for matching (e.g., 'name', 'email') + settings: Optional fuzzy match settings (uses global if None) + + Returns: + List of match results with structure: + { + 'external_record': {...}, + 'matched_node': {...} or None, + 'confidence': float (0.0-1.0), + 'is_match': bool + } + """ + self._ensure_matcher() + if settings is None: + settings = self.get_global_settings() + + if settings.algorithm == 'exact': + return self._match_exact(external_records, existing_nodes, match_key, settings) + + fuzz = self._matcher['fuzz'] + matches = [] + + # Normalize all existing node values for comparison + existing_normalized = {} + for node in existing_nodes: + if match_key in node and node[match_key]: + original = node[match_key] + normalized = self._normalize_string(str(original), settings) + if len(normalized) >= settings.min_string_length: + existing_normalized[normalized] = node + + # Match each external record + for record in external_records: + if match_key not in record or not record[match_key]: + matches.append({ + 'external_record': 
record, + 'matched_node': None, + 'confidence': 0.0, + 'is_match': False, + 'reason': 'Missing match key' + }) + continue + + external_value = self._normalize_string(str(record[match_key]), settings) + + if len(external_value) < settings.min_string_length: + matches.append({ + 'external_record': record, + 'matched_node': None, + 'confidence': 0.0, + 'is_match': False, + 'reason': f'String too short (< {settings.min_string_length} chars)' + }) + continue + + # Find best match + best_match = None + best_confidence = 0.0 + + for norm_value, node in existing_normalized.items(): + confidence = self._compute_similarity( + external_value, norm_value, settings.algorithm, fuzz + ) + + if confidence > best_confidence: + best_confidence = confidence + best_match = node + + is_match = best_confidence >= settings.threshold + + matches.append({ + 'external_record': record, + 'matched_node': best_match if is_match else None, + 'confidence': best_confidence, + 'is_match': is_match + }) + + return matches + + def _match_exact( + self, + external_records: List[Dict[str, Any]], + existing_nodes: List[Dict[str, Any]], + match_key: str, + settings: FuzzyMatchSettings + ) -> List[Dict[str, Any]]: + """Exact matching (no fuzzy logic).""" + # Build lookup dict + lookup = {} + for node in existing_nodes: + if match_key in node and node[match_key]: + normalized = self._normalize_string(str(node[match_key]), settings) + lookup[normalized] = node + + matches = [] + for record in external_records: + if match_key not in record or not record[match_key]: + matches.append({ + 'external_record': record, + 'matched_node': None, + 'confidence': 0.0, + 'is_match': False + }) + continue + + normalized = self._normalize_string(str(record[match_key]), settings) + matched_node = lookup.get(normalized) + + matches.append({ + 'external_record': record, + 'matched_node': matched_node, + 'confidence': 1.0 if matched_node else 0.0, + 'is_match': matched_node is not None + }) + + return matches + + def 
_compute_similarity( + self, + str1: str, + str2: str, + algorithm: str, + fuzz + ) -> float: + """Compute similarity score using specified algorithm.""" + if algorithm == 'levenshtein': + # Levenshtein ratio (0-100), normalize to 0.0-1.0 + return fuzz.ratio(str1, str2) / 100.0 + + elif algorithm == 'jaro_winkler': + # Jaro-Winkler distance (0-100), normalize to 0.0-1.0 + return fuzz.Jaro.distance(str1, str2) + + else: + # Default to Levenshtein + return fuzz.ratio(str1, str2) / 100.0 + + # ========================================== + # Phase 2: Post-Import Matching (Server-Side) + # ========================================== + + def generate_cypher_fuzzy_match( + self, + source_label: str, + target_label: str, + source_property: str, + target_property: str, + relationship_type: str, + settings: Optional[FuzzyMatchSettings] = None + ) -> str: + """ + Phase 2: Generate Cypher query using Neo4j APOC fuzzy functions (server-side). + + Args: + source_label: Source node label + target_label: Target node label + source_property: Property on source node to match + target_property: Property on target node to match + relationship_type: Type of relationship to create + settings: Optional fuzzy match settings (uses global if None) + + Returns: + Cypher query string for Neo4j execution + """ + if settings is None: + settings = self.get_global_settings() + + if settings.algorithm == 'exact': + # Exact match using standard Cypher + cypher = f""" + MATCH (source:{source_label}), (target:{target_label}) + WHERE source.{source_property} = target.{target_property} + CREATE (source)-[:{relationship_type} {{confidence: 1.0}}]->(target) + RETURN source, target, 1.0 as confidence + """ + + elif settings.algorithm == 'levenshtein': + cypher = f""" + MATCH (source:{source_label}), (target:{target_label}) + WHERE apoc.text.levenshteinSimilarity( + source.{source_property}, + target.{target_property} + ) >= {settings.threshold} + WITH source, target, + apoc.text.levenshteinSimilarity( + 
def get_fuzzy_matching_service(db_path: str = 'scidk_settings.db') -> FuzzyMatchingService:
    """
    Build a FuzzyMatchingService for the given settings database.

    A new service object is constructed on every call; nothing is cached
    here.

    Args:
        db_path: Path to settings database

    Returns:
        FuzzyMatchingService instance
    """
    service = FuzzyMatchingService(db_path)
    return service
+ + Stores format metadata including: + - File type (CSV, TSV, Excel, Parquet) + - Delimiter, encoding, header configuration + - Column mappings to Label properties + - Target label for data import + """ + + # Pre-programmed formats + PREPROGRAMMED_FORMATS = { + 'csv_standard': { + 'name': 'CSV (Standard)', + 'file_type': 'csv', + 'delimiter': ',', + 'encoding': 'utf-8', + 'has_header': True, + 'header_row': 0, + 'description': 'Standard comma-separated values with UTF-8 encoding' + }, + 'tsv_standard': { + 'name': 'TSV (Standard)', + 'file_type': 'tsv', + 'delimiter': '\t', + 'encoding': 'utf-8', + 'has_header': True, + 'header_row': 0, + 'description': 'Tab-separated values with UTF-8 encoding' + }, + 'excel_standard': { + 'name': 'Excel (Standard)', + 'file_type': 'excel', + 'delimiter': None, + 'encoding': 'utf-8', + 'has_header': True, + 'header_row': 0, + 'description': 'Microsoft Excel (.xlsx) with first sheet' + }, + 'parquet_standard': { + 'name': 'Parquet (Standard)', + 'file_type': 'parquet', + 'delimiter': None, + 'encoding': 'utf-8', + 'has_header': True, + 'header_row': 0, + 'description': 'Apache Parquet columnar format with auto-detected schema' + } + } + + def __init__(self, db_path: str): + """ + Initialize registry with SQLite database. 
def init_tables(self):
    """Make sure the ``table_formats`` table exists, then seed it.

    The DDL is idempotent (``IF NOT EXISTS``); after committing it the
    method delegates to ``_seed_preprogrammed_formats()``.
    """
    schema_sql = """
        CREATE TABLE IF NOT EXISTS table_formats (
            id TEXT PRIMARY KEY,
            name TEXT NOT NULL UNIQUE,
            file_type TEXT NOT NULL,
            delimiter TEXT,
            encoding TEXT NOT NULL DEFAULT 'utf-8',
            has_header INTEGER NOT NULL DEFAULT 1,
            header_row INTEGER NOT NULL DEFAULT 0,
            sheet_name TEXT,
            target_label TEXT,
            column_mappings TEXT,
            description TEXT,
            is_preprogrammed INTEGER NOT NULL DEFAULT 0,
            created_at REAL NOT NULL,
            updated_at REAL NOT NULL
        )
        """
    connection = self.db
    connection.execute(schema_sql)
    connection.commit()

    # The seeding step is responsible for inserting the built-in formats.
    self._seed_preprogrammed_formats()
def list_formats(self, include_preprogrammed: bool = True) -> List[Dict[str, Any]]:
    """
    Return every stored table format, built-in formats first, then by name.

    Args:
        include_preprogrammed: When false, rows flagged ``is_preprogrammed``
            are left out of the result.

    Returns:
        List of format dicts; flag columns are converted to ``bool`` and
        ``column_mappings`` is decoded from JSON (``{}`` when empty).
    """
    query = """
        SELECT id, name, file_type, delimiter, encoding, has_header, header_row,
               sheet_name, target_label, column_mappings, description,
               is_preprogrammed, created_at, updated_at
        FROM table_formats
        ORDER BY is_preprogrammed DESC, name ASC
        """

    def as_format(record) -> Dict[str, Any]:
        mappings_json = record['column_mappings']
        return {
            'id': record['id'],
            'name': record['name'],
            'file_type': record['file_type'],
            'delimiter': record['delimiter'],
            'encoding': record['encoding'],
            'has_header': bool(record['has_header']),
            'header_row': record['header_row'],
            'sheet_name': record['sheet_name'],
            'target_label': record['target_label'],
            'column_mappings': json.loads(mappings_json) if mappings_json else {},
            'description': record['description'],
            'is_preprogrammed': bool(record['is_preprogrammed']),
            'created_at': record['created_at'],
            'updated_at': record['updated_at'],
        }

    return [
        as_format(record)
        for record in self.db.execute(query).fetchall()
        if include_preprogrammed or not record['is_preprogrammed']
    ]
def create_format(self, format_data: Dict[str, Any]) -> Dict[str, Any]:
    """
    Create a new (non-preprogrammed) table format configuration.

    Args:
        format_data: Dict with keys:
            - name: Format name (required)
            - file_type: "csv", "tsv", "excel", "parquet" (required)
            - delimiter: Column delimiter (optional; when omitted or None,
              defaults to "," for csv and a tab for tsv)
            - encoding: File encoding (default: "utf-8")
            - has_header: Whether file has header row (default: True)
            - header_row: Row index of header (default: 0)
            - sheet_name: Sheet name for Excel files (optional)
            - target_label: Target Label name (optional)
            - column_mappings: Dict {table_column: {label_property, type_hint, ignore}} (optional)
            - description: Format description (optional)

    Returns:
        Created format dict with id, as returned by ``get_format``.

    Raises:
        ValueError: If required fields are missing, the file type is not
            supported, or the format name already exists.
    """
    name = format_data.get('name')
    if not name:
        raise ValueError("Format name is required")
    file_type = format_data.get('file_type')
    if not file_type:
        raise ValueError("File type is required")

    valid_types = ['csv', 'tsv', 'excel', 'parquet']
    if file_type not in valid_types:
        raise ValueError(f"File type must be one of: {', '.join(valid_types)}")

    if self._get_format_by_name_internal(name):
        raise ValueError(f"Format with name '{name}' already exists")

    format_id = str(uuid.uuid4())
    now = datetime.now(timezone.utc).timestamp()

    # A delimited format stored without a delimiter would later be parsed
    # with the reader's own default (comma), which is wrong for TSV. Fill
    # in the conventional delimiter per file type instead of storing NULL.
    delimiter = format_data.get('delimiter')
    if delimiter is None:
        delimiter = {'csv': ',', 'tsv': '\t'}.get(file_type)

    # encoding and header_row are NOT NULL columns: treat an explicit None
    # the same as "not provided".
    encoding = format_data.get('encoding') or 'utf-8'
    header_row = format_data.get('header_row')
    if header_row is None:
        header_row = 0

    has_header = format_data.get('has_header', True)
    column_mappings = format_data.get('column_mappings', {})

    self.db.execute(
        """
        INSERT INTO table_formats
        (id, name, file_type, delimiter, encoding, has_header, header_row,
         sheet_name, target_label, column_mappings, description,
         is_preprogrammed, created_at, updated_at)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, 0, ?, ?)
        """,
        (
            format_id,
            name,
            file_type,
            delimiter,
            encoding,
            1 if has_header else 0,
            header_row,
            format_data.get('sheet_name'),
            format_data.get('target_label'),
            # Empty mappings are stored as NULL rather than "{}".
            json.dumps(column_mappings) if column_mappings else None,
            format_data.get('description'),
            now,
            now
        )
    )
    self.db.commit()

    return self.get_format(format_id)
def detect_format(self, file_content: bytes, filename: str = None) -> Dict[str, Any]:
    """
    Auto-detect table format from file content.

    The file type comes from the filename extension when it is recognised;
    otherwise it is inferred by sniffing the delimiter from the first 1024
    decoded characters (tab means tsv, anything else csv). For csv/tsv the
    sniffed delimiter is reported, falling back to "," / tab when sniffing
    fails. Column names are read by parsing the content with pandas.

    Args:
        file_content: Raw file bytes
        filename: Original filename (for extension hints)

    Returns:
        Dict with detected format parameters:
            - file_type: Detected type
            - delimiter: Detected delimiter ("," for excel/parquet)
            - encoding: First candidate encoding that decoded the bytes
            - has_header: True when at least one column name was read
            - sample_columns: List of detected column names
        or ``{'error': message}`` if decoding, parsing or detection failed.
    """
    # Imported here because the sniffing below needs ``csv`` and this
    # module's import block does not provide it. Without it the lookup
    # raised NameError, which the old bare ``except`` silently turned into
    # "assume comma-separated".
    import csv

    try:
        # Decode with the first candidate encoding that accepts the bytes.
        detected_encoding = 'utf-8'
        decoded_content = None
        for enc in ['utf-8', 'latin-1', 'utf-16']:
            try:
                decoded_content = file_content.decode(enc)
                detected_encoding = enc
                break
            except (UnicodeDecodeError, AttributeError):
                continue

        if decoded_content is None:
            return {'error': 'Unable to decode file with supported encodings'}

        # File type from the extension, when there is a recognised one.
        file_type = None
        if filename:
            ext = filename.lower().split('.')[-1]
            file_type = {
                'csv': 'csv',
                'tsv': 'tsv',
                'txt': 'tsv',
                'xlsx': 'excel',
                'xls': 'excel',
                'parquet': 'parquet',
            }.get(ext)

        # Sniff once; the result serves both type inference and delimiter
        # detection. Not attempted for excel/parquet, where it is unused.
        sniffed_delimiter = None
        if file_type in (None, 'csv', 'tsv'):
            try:
                sniffed_delimiter = csv.Sniffer().sniff(decoded_content[:1024]).delimiter
            except csv.Error:
                sniffed_delimiter = None

        if not file_type:
            file_type = 'tsv' if sniffed_delimiter == '\t' else 'csv'

        delimiter = ','
        if file_type in ('csv', 'tsv'):
            delimiter = sniffed_delimiter or (',' if file_type == 'csv' else '\t')

        # Parse just enough to learn the column names.
        sample_columns = []
        try:
            if file_type in ('csv', 'tsv'):
                df = pd.read_csv(io.StringIO(decoded_content), delimiter=delimiter, nrows=1)
                sample_columns = df.columns.tolist()
            elif file_type == 'excel':
                df = pd.read_excel(io.BytesIO(file_content), nrows=1)
                sample_columns = df.columns.tolist()
            elif file_type == 'parquet':
                df = pd.read_parquet(io.BytesIO(file_content))
                sample_columns = df.columns.tolist()
        except Exception as e:
            return {'error': f'Failed to parse file: {str(e)}'}

        return {
            'file_type': file_type,
            'delimiter': delimiter,
            'encoding': detected_encoding,
            'has_header': len(sample_columns) > 0,
            'sample_columns': sample_columns
        }

    except Exception as e:
        return {'error': f'Format detection failed: {str(e)}'}
def parse_eda_file(filepath: str) -> "tuple[List[Dict[str, Any]], List[Dict[str, Any]]]":
    """
    Parse .eda file (ZIP with JSON) and split its shapes into nodes and edges.

    The archive member read is the first one whose name ends in ``.json``
    or equals ``model``; if there is none, the first member is used. A
    shape counts as an edge when it is a dict with a ``'target'`` key;
    everything else is treated as a node.

    Args:
        filepath: Path to .eda file (the suffix check is case-insensitive)

    Returns:
        tuple: ``(nodes, edges)``. ``nodes`` is a list with structure:
            [
                {
                    'resourceId': 'n0',
                    'stencil': {'id': 'Treatment'},
                    'properties': {...},
                    'propertyTypes': {...},
                    'outgoing': [...],
                    'incoming': [...]
                },
                ...
            ]
        and ``edges`` is the list of shapes carrying a ``'target'`` field.

    Raises:
        FileNotFoundError: If file doesn't exist
        ValueError: If the suffix is not .eda, the archive is empty, the
            payload is not valid JSON, or the JSON is neither an object
            nor an array
    """
    path = Path(filepath)
    if not path.exists():
        raise FileNotFoundError(f"EDA file not found: {filepath}")

    if path.suffix.lower() != '.eda':
        raise ValueError(f"Not an EDA file: {filepath}")

    # Extract JSON from ZIP
    with zipfile.ZipFile(filepath, 'r') as zip_ref:
        members = zip_ref.namelist()
        if not members:
            raise ValueError("EDA file is empty")

        # EDA files typically have a single JSON file named 'model'
        candidates = [m for m in members if m.endswith('.json') or m == 'model']
        json_content = zip_ref.read(candidates[0] if candidates else members[0])

    data = json.loads(json_content)

    if isinstance(data, dict):
        # Standard EDA format has a childShapes array; without it the
        # object itself is taken to be a single node.
        shapes = data['childShapes'] if 'childShapes' in data else [data]
        if 'childShapes' not in data:
            return shapes, []
    elif isinstance(data, list):
        shapes = data
    else:
        raise ValueError("Invalid EDA file format: expected JSON object or array")

    nodes = []
    edges = []
    for shape in shapes:
        # Only mappings can be edges; guarding the type keeps a stray
        # scalar in the array from breaking the membership test.
        if isinstance(shape, dict) and 'target' in shape:
            edges.append(shape)
        else:
            nodes.append(shape)

    return nodes, edges
+ + Args: + eda_nodes: List of parsed EDA nodes + eda_edges: List of parsed EDA edges (optional, can also extract from node outgoing/incoming) + + Returns: + list: Label definitions ready for LabelService.create_label() + """ + if eda_edges is None: + eda_edges = [] + + labels = [] + node_map = {} # resourceId -> label name + stencil_map = {} # resourceId -> stencil type + + # First pass: create labels from nodes + for node in eda_nodes: + resource_id = node.get('resourceId') + stencil_id = node.get('stencil', {}).get('id', 'Unknown') + + if not resource_id or not stencil_id: + continue + + node_map[resource_id] = stencil_id + stencil_map[resource_id] = stencil_id + + # Convert properties + properties = [] + node_props = node.get('properties', {}) + prop_types = node.get('propertyTypes', {}) + + for prop_name in node_props.keys(): + eda_type = prop_types.get(prop_name, 'String') + scidk_type = EDA_TO_SCIDK_TYPE.get(eda_type, 'string') + + properties.append({ + 'name': prop_name, + 'type': scidk_type, + 'required': False + }) + + # Check if label already exists + existing_label = next((l for l in labels if l['name'] == stencil_id), None) + + if existing_label: + # Merge properties (avoid duplicates) + for prop in properties: + if not any(p['name'] == prop['name'] for p in existing_label['properties']): + existing_label['properties'].append(prop) + else: + labels.append({ + 'name': stencil_id, + 'properties': properties, + 'relationships': [] + }) + + # Second pass: add relationships from node outgoing arrays + label_dict = {l['name']: l for l in labels} + + for node in eda_nodes: + resource_id = node.get('resourceId') + from_stencil = stencil_map.get(resource_id) + + if not from_stencil or from_stencil not in label_dict: + continue + + # Process outgoing relationships + for outgoing in node.get('outgoing', []): + # Try both 'target' and 'resourceId' fields + target_id = outgoing.get('target') + if not target_id: + target_id = outgoing.get('resourceId') + + 
to_stencil = stencil_map.get(target_id) + + if not to_stencil: + continue + + # Infer relationship type + rel_type = RELATIONSHIP_TYPES.get((from_stencil, to_stencil), 'RELATED_TO') + + # Check if relationship already exists + existing_rel = any( + r['type'] == rel_type and r['target_label'] == to_stencil + for r in label_dict[from_stencil]['relationships'] + ) + + if not existing_rel: + label_dict[from_stencil]['relationships'].append({ + 'type': rel_type, + 'target_label': to_stencil, + 'properties': [] + }) + + # Third pass: add relationships from explicit edge objects + for edge in eda_edges: + edge_type = edge.get('stencil', {}).get('id', 'RELATED_TO') + + # Find source and target + incoming_id = None + outgoing_id = None + + if 'incoming' in edge and len(edge['incoming']) > 0: + incoming_id = edge['incoming'][0].get('resourceId') + if 'outgoing' in edge and len(edge['outgoing']) > 0: + outgoing_id = edge['outgoing'][0].get('resourceId') + + if not incoming_id or not outgoing_id: + continue + + from_stencil = stencil_map.get(incoming_id) + to_stencil = stencil_map.get(outgoing_id) + + if not from_stencil or not to_stencil: + continue + + if from_stencil not in label_dict: + continue + + # Check if relationship already exists + existing_rel = any( + r['type'] == edge_type and r['target_label'] == to_stencil + for r in label_dict[from_stencil]['relationships'] + ) + + if not existing_rel: + label_dict[from_stencil]['relationships'].append({ + 'type': edge_type, + 'target_label': to_stencil, + 'properties': [] + }) + + return labels diff --git a/scidk/services/label_service.py b/scidk/services/label_service.py index 415c30a..221ba0a 100644 --- a/scidk/services/label_service.py +++ b/scidk/services/label_service.py @@ -559,3 +559,157 @@ def get_neo4j_schema(self) -> Dict[str, Any]: 'status': 'error', 'error': str(e) } + + def get_label_instances(self, name: str, limit: int = 100, offset: int = 0) -> Dict[str, Any]: + """ + Get instances of a label from Neo4j. 
+ + Args: + name: Label name + limit: Maximum number of instances to return + offset: Pagination offset + + Returns: + Dict with status, instances list, and pagination info + """ + label_def = self.get_label(name) + if not label_def: + raise ValueError(f"Label '{name}' not found") + + try: + from .neo4j_client import get_neo4j_client + neo4j_client = get_neo4j_client() + + if not neo4j_client: + raise Exception("Neo4j client not configured") + + # Query for instances of this label + query = f""" + MATCH (n:{name}) + RETURN elementId(n) as id, properties(n) as properties + SKIP $offset + LIMIT $limit + """ + + results = neo4j_client.execute_read(query, {'offset': offset, 'limit': limit}) + + instances = [] + for r in results: + instances.append({ + 'id': r.get('id'), + 'properties': r.get('properties', {}) + }) + + # Get total count + count_query = f"MATCH (n:{name}) RETURN count(n) as total" + count_results = neo4j_client.execute_read(count_query) + total = count_results[0].get('total', 0) if count_results else 0 + + return { + 'status': 'success', + 'instances': instances, + 'total': total, + 'limit': limit, + 'offset': offset + } + except Exception as e: + return { + 'status': 'error', + 'error': str(e) + } + + def get_label_instance_count(self, name: str) -> Dict[str, Any]: + """ + Get count of instances for a label from Neo4j. 
+ + Args: + name: Label name + + Returns: + Dict with status and count + """ + label_def = self.get_label(name) + if not label_def: + raise ValueError(f"Label '{name}' not found") + + try: + from .neo4j_client import get_neo4j_client + neo4j_client = get_neo4j_client() + + if not neo4j_client: + raise Exception("Neo4j client not configured") + + # Query for count + query = f"MATCH (n:{name}) RETURN count(n) as count" + results = neo4j_client.execute_read(query) + count = results[0].get('count', 0) if results else 0 + + return { + 'status': 'success', + 'count': count + } + except Exception as e: + return { + 'status': 'error', + 'error': str(e) + } + + def update_label_instance(self, name: str, instance_id: str, property_name: str, property_value: Any) -> Dict[str, Any]: + """ + Update a single property of a label instance in Neo4j. + + Args: + name: Label name + instance_id: Neo4j element ID + property_name: Property to update + property_value: New value + + Returns: + Dict with status and updated instance + """ + label_def = self.get_label(name) + if not label_def: + raise ValueError(f"Label '{name}' not found") + + # Verify property exists in label definition + prop_names = [p.get('name') for p in label_def.get('properties', [])] + if property_name not in prop_names: + raise ValueError(f"Property '{property_name}' not defined for label '{name}'") + + try: + from .neo4j_client import get_neo4j_client + neo4j_client = get_neo4j_client() + + if not neo4j_client: + raise Exception("Neo4j client not configured") + + # Update the property + query = f""" + MATCH (n:{name}) + WHERE elementId(n) = $instance_id + SET n.{property_name} = $value + RETURN elementId(n) as id, properties(n) as properties + """ + + results = neo4j_client.execute_write(query, { + 'instance_id': instance_id, + 'value': property_value + }) + + if not results: + raise Exception(f"Instance with ID '{instance_id}' not found") + + instance = { + 'id': results[0].get('id'), + 'properties': 
results[0].get('properties', {}) + } + + return { + 'status': 'success', + 'instance': instance + } + except Exception as e: + return { + 'status': 'error', + 'error': str(e) + } diff --git a/scidk/services/link_migration.py b/scidk/services/link_migration.py new file mode 100644 index 0000000..c3fc011 --- /dev/null +++ b/scidk/services/link_migration.py @@ -0,0 +1,211 @@ +""" +Migration utility for converting old link definitions to Label→Label model. + +This module helps migrate existing link definitions from the old model: +- source_type: graph/csv/api +- target_type: graph/label + +To the new Label→Label model: +- source_label: Label name (required) +- target_label: Label name (required) +- match_strategy: property/fuzzy/table_import/api_endpoint +""" +from __future__ import annotations +from typing import Dict, List, Any +import json + + +def migrate_link_definition(old_def: Dict[str, Any]) -> Dict[str, Any]: + """ + Migrate a single link definition from old to new format. + + Args: + old_def: Old link definition dict + + Returns: + Migrated link definition dict + + Raises: + ValueError: If migration is not possible (missing required data) + """ + migrated = old_def.copy() + + # Extract source label + if 'source_label' not in migrated or not migrated['source_label']: + source_type = old_def.get('source_type', '') + source_config = old_def.get('source_config', {}) + + if source_type == 'graph': + # Extract label from graph source config + source_label = source_config.get('label', '') + if not source_label: + raise ValueError(f"Cannot migrate link '{old_def.get('name')}': graph source missing label") + migrated['source_label'] = source_label + + elif source_type == 'csv': + # CSV becomes table_import match strategy + # Need to infer or prompt for label name + raise ValueError( + f"Cannot auto-migrate CSV source for link '{old_def.get('name')}'. " + f"Please manually specify source_label and update match_strategy to 'table_import'." 
+ ) + + elif source_type == 'api': + # API becomes api_endpoint match strategy + raise ValueError( + f"Cannot auto-migrate API source for link '{old_def.get('name')}'. " + f"Please manually specify source_label and update match_strategy to 'api_endpoint'." + ) + else: + raise ValueError(f"Unknown source_type: {source_type}") + + # Extract target label + if 'target_label' not in migrated or not migrated['target_label']: + target_type = old_def.get('target_type', '') + target_config = old_def.get('target_config', {}) + + if target_type == 'label': + target_label = target_config.get('label', '') + if not target_label: + raise ValueError(f"Cannot migrate link '{old_def.get('name')}': label target missing label name") + migrated['target_label'] = target_label + + elif target_type == 'graph': + target_label = target_config.get('label', '') + if not target_label: + raise ValueError(f"Cannot migrate link '{old_def.get('name')}': graph target missing label") + migrated['target_label'] = target_label + else: + raise ValueError(f"Unknown target_type: {target_type}") + + # Update match strategy for CSV/API sources + source_type = old_def.get('source_type', '') + match_strategy = old_def.get('match_strategy', 'property') + + if source_type == 'csv' and match_strategy not in ['table_import', 'api_endpoint']: + migrated['match_strategy'] = 'table_import' + # Move CSV data to match_config if needed + csv_data = old_def.get('source_config', {}).get('csv_data', '') + if csv_data: + migrated['match_config'] = migrated.get('match_config', {}) + migrated['match_config']['table_data'] = csv_data + + elif source_type == 'api' and match_strategy not in ['table_import', 'api_endpoint']: + migrated['match_strategy'] = 'api_endpoint' + # Move API config to match_config + api_config = old_def.get('source_config', {}) + if api_config: + migrated['match_config'] = migrated.get('match_config', {}) + migrated['match_config'].update(api_config) + + return migrated + + +def 
migrate_all_links(link_service) -> Dict[str, Any]: + """ + Migrate all link definitions in the database. + + Args: + link_service: LinkService instance + + Returns: + Dict with migration results: + { + 'migrated': [list of migrated link IDs], + 'skipped': [list of skipped link IDs with reasons], + 'errors': [list of error messages] + } + """ + results = { + 'migrated': [], + 'skipped': [], + 'errors': [] + } + + try: + links = link_service.list_link_definitions() + + for link in links: + link_id = link.get('id') + link_name = link.get('name', 'Unknown') + + # Skip if already migrated + if link.get('source_label') and link.get('target_label'): + results['skipped'].append({ + 'id': link_id, + 'name': link_name, + 'reason': 'Already migrated' + }) + continue + + try: + migrated_link = migrate_link_definition(link) + link_service.save_link_definition(migrated_link) + results['migrated'].append({ + 'id': link_id, + 'name': link_name + }) + except ValueError as e: + results['errors'].append({ + 'id': link_id, + 'name': link_name, + 'error': str(e) + }) + except Exception as e: + results['errors'].append({ + 'id': link_id, + 'name': link_name, + 'error': f"Unexpected error: {str(e)}" + }) + + except Exception as e: + results['errors'].append({ + 'error': f"Failed to list link definitions: {str(e)}" + }) + + return results + + +def generate_migration_report(results: Dict[str, Any]) -> str: + """ + Generate a human-readable migration report. 
+ + Args: + results: Migration results from migrate_all_links() + + Returns: + Formatted report string + """ + report = [] + report.append("=== Link Migration Report ===\n") + + migrated = results.get('migrated', []) + skipped = results.get('skipped', []) + errors = results.get('errors', []) + + report.append(f"Migrated: {len(migrated)}") + report.append(f"Skipped: {len(skipped)}") + report.append(f"Errors: {len(errors)}\n") + + if migrated: + report.append("Migrated Links:") + for item in migrated: + report.append(f" ✓ {item['name']} ({item['id']})") + report.append("") + + if skipped: + report.append("Skipped Links:") + for item in skipped: + report.append(f" - {item['name']}: {item['reason']}") + report.append("") + + if errors: + report.append("Errors:") + for item in errors: + if 'id' in item: + report.append(f" ✗ {item['name']} ({item['id']}): {item['error']}") + else: + report.append(f" ✗ {item['error']}") + report.append("") + + return "\n".join(report) diff --git a/scidk/services/link_service.py b/scidk/services/link_service.py index e8697e6..a562d49 100644 --- a/scidk/services/link_service.py +++ b/scidk/services/link_service.py @@ -1,12 +1,12 @@ """ -Link service for managing relationship creation workflows. +Link service for managing Label→Label relationship creation workflows. 
This service provides operations for: - CRUD operations on link definitions (stored in SQLite) - Preview and execution of link jobs -- Source adapters (Graph, CSV, API) -- Target adapters (Graph, Label) -- Matching strategies (Property, ID, Custom Cypher) +- Label→Label mapping enforcement (both source and target are Labels) +- Match strategies: Property, Fuzzy, Table Import, API Endpoint +- Legacy migration support for old source/target types """ from __future__ import annotations from typing import Dict, List, Any, Optional @@ -40,11 +40,12 @@ def list_link_definitions(self) -> List[Dict[str, Any]]: conn = self._get_conn() try: cursor = conn.cursor() + # First try new schema with source_label and target_label cursor.execute( """ - SELECT id, name, source_type, source_config, target_type, target_config, - match_strategy, match_config, relationship_type, relationship_props, - created_at, updated_at + SELECT id, name, source_label, target_label, source_type, source_config, + target_type, target_config, match_strategy, match_config, + relationship_type, relationship_props, created_at, updated_at FROM link_definitions ORDER BY updated_at DESC """ @@ -53,11 +54,13 @@ def list_link_definitions(self) -> List[Dict[str, Any]]: definitions = [] for row in rows: - (id, name, source_type, source_config, target_type, target_config, + (id, name, source_label, target_label, source_type, source_config, target_type, target_config, match_strategy, match_config, rel_type, rel_props, created_at, updated_at) = row definitions.append({ 'id': id, 'name': name, + 'source_label': source_label, + 'target_label': target_label, 'source_type': source_type, 'source_config': json.loads(source_config) if source_config else {}, 'target_type': target_type, @@ -88,9 +91,9 @@ def get_link_definition(self, link_id: str) -> Optional[Dict[str, Any]]: cursor = conn.cursor() cursor.execute( """ - SELECT id, name, source_type, source_config, target_type, target_config, - match_strategy, match_config, 
relationship_type, relationship_props, - created_at, updated_at + SELECT id, name, source_label, target_label, source_type, source_config, + target_type, target_config, match_strategy, match_config, + relationship_type, relationship_props, created_at, updated_at FROM link_definitions WHERE id = ? """, @@ -101,11 +104,13 @@ def get_link_definition(self, link_id: str) -> Optional[Dict[str, Any]]: if not row: return None - (id, name, source_type, source_config, target_type, target_config, + (id, name, source_label, target_label, source_type, source_config, target_type, target_config, match_strategy, match_config, rel_type, rel_props, created_at, updated_at) = row return { 'id': id, 'name': name, + 'source_label': source_label, + 'target_label': target_label, 'source_type': source_type, 'source_config': json.loads(source_config) if source_config else {}, 'target_type': target_type, @@ -122,10 +127,10 @@ def get_link_definition(self, link_id: str) -> Optional[Dict[str, Any]]: def save_link_definition(self, definition: Dict[str, Any]) -> Dict[str, Any]: """ - Create or update a link definition. + Create or update a link definition (Label→Label). 
Args: - definition: Dict with required keys: name, source_type, target_type, match_strategy, relationship_type + definition: Dict with required keys: name, source_label, target_label, match_strategy, relationship_type Returns: Updated link definition @@ -138,17 +143,27 @@ def save_link_definition(self, definition: Dict[str, Any]) -> Dict[str, Any]: if not name: raise ValueError("Link name is required") - source_type = definition.get('source_type', '').strip() - if source_type not in ['graph', 'csv', 'api']: - raise ValueError("source_type must be 'graph', 'csv', or 'api'") + # New Label→Label model + source_label = definition.get('source_label', '').strip() + if not source_label: + raise ValueError("source_label is required (must reference an existing Label)") - target_type = definition.get('target_type', '').strip() - if target_type not in ['graph', 'label']: - raise ValueError("target_type must be 'graph' or 'label'") + target_label = definition.get('target_label', '').strip() + if not target_label: + raise ValueError("target_label is required (must reference an existing Label)") + # Validate that labels exist + self._validate_label_exists(source_label) + self._validate_label_exists(target_label) + + # Legacy support: auto-migrate old source_type/target_type to new model + source_type = definition.get('source_type', 'label') + target_type = definition.get('target_type', 'label') + + # Match strategy now includes table_import and api_endpoint match_strategy = definition.get('match_strategy', '').strip() - if match_strategy not in ['property', 'id', 'cypher']: - raise ValueError("match_strategy must be 'property', 'id', or 'cypher'") + if match_strategy not in ['property', 'fuzzy', 'table_import', 'api_endpoint', 'id', 'cypher']: + raise ValueError("match_strategy must be 'property', 'fuzzy', 'table_import', 'api_endpoint', 'id', or 'cypher'") relationship_type = definition.get('relationship_type', '').strip() if not relationship_type: @@ -172,12 +187,12 @@ def 
save_link_definition(self, definition: Dict[str, Any]) -> Dict[str, Any]: cursor.execute( """ UPDATE link_definitions - SET name = ?, source_type = ?, source_config = ?, target_type = ?, - target_config = ?, match_strategy = ?, match_config = ?, + SET name = ?, source_label = ?, target_label = ?, source_type = ?, source_config = ?, + target_type = ?, target_config = ?, match_strategy = ?, match_config = ?, relationship_type = ?, relationship_props = ?, updated_at = ? WHERE id = ? """, - (name, source_type, source_config, target_type, target_config, + (name, source_label, target_label, source_type, source_config, target_type, target_config, match_strategy, match_config, relationship_type, relationship_props, now, link_id) ) created_at = existing['created_at'] @@ -186,12 +201,12 @@ def save_link_definition(self, definition: Dict[str, Any]) -> Dict[str, Any]: cursor.execute( """ INSERT INTO link_definitions - (id, name, source_type, source_config, target_type, target_config, + (id, name, source_label, target_label, source_type, source_config, target_type, target_config, match_strategy, match_config, relationship_type, relationship_props, created_at, updated_at) - VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) 
""", - (link_id, name, source_type, source_config, target_type, target_config, + (link_id, name, source_label, target_label, source_type, source_config, target_type, target_config, match_strategy, match_config, relationship_type, relationship_props, now, now) ) created_at = now @@ -201,6 +216,8 @@ def save_link_definition(self, definition: Dict[str, Any]) -> Dict[str, Any]: return { 'id': link_id, 'name': name, + 'source_label': source_label, + 'target_label': target_label, 'source_type': source_type, 'source_config': json.loads(source_config), 'target_type': target_type, @@ -396,6 +413,22 @@ def list_jobs(self, limit: int = 20) -> List[Dict[str, Any]]: # --- Internal helpers --- + def _validate_label_exists(self, label_name: str): + """ + Validate that a label exists in the label registry. + + Args: + label_name: Name of the label to validate + + Raises: + ValueError: If label does not exist + """ + from .label_service import LabelService + label_service = LabelService(self.app) + label = label_service.get_label(label_name) + if not label: + raise ValueError(f"Label '{label_name}' does not exist. Please create it in the Labels page first.") + def _fetch_source_data(self, definition: Dict[str, Any]) -> List[Dict[str, Any]]: """Fetch data from source based on source_type.""" source_type = definition.get('source_type') diff --git a/scidk/ui/templates/base.html b/scidk/ui/templates/base.html index 52c508d..f5371c2 100644 --- a/scidk/ui/templates/base.html +++ b/scidk/ui/templates/base.html @@ -2,7 +2,7 @@ - {% block title %}SciDK{% endblock %} + {% block title %}-SciDK->{% endblock %} +{% endblock %} +{% block content %} + + +

Integrations

+

Create relationships between data instances using graph, CSV, or API sources.

+ + + + +{% endblock %} diff --git a/scidk/ui/templates/labels.html b/scidk/ui/templates/labels.html index e46299c..c5eef2a 100644 --- a/scidk/ui/templates/labels.html +++ b/scidk/ui/templates/labels.html @@ -1,5 +1,5 @@ {% extends 'base.html' %} -{% block title %}SciDK - Labels{% endblock %} +{% block title %}-SciDK-> Labels{% endblock %} {% block head %} +{% endblock %} {% block content %} +{% endblock %} {% block content %}

Schema Graph (Interactive)

@@ -16,13 +25,11 @@

Schema Graph (Interactive)

@@ -173,8 +180,8 @@

Interpretation Types

if (labelSelect) { // Preserve current selection const currentLabel = labelSelect.value; - // Get unique labels from schema - const labels = [...new Set(schema.nodes?.map(n => n.label) || [])].sort(); + // Get unique labels from schema, filtering out empty/null values + const labels = [...new Set(schema.nodes?.map(n => n.label).filter(l => l) || [])].sort(); labelSelect.innerHTML = '' + labels.map(l => ``).join(''); // Restore selection if still valid @@ -186,8 +193,8 @@

Interpretation Types

if (relSelect) { // Preserve current selection const currentRel = relSelect.value; - // Get unique relationship types from schema - const rels = [...new Set(schema.edges?.map(e => e.rel_type) || [])].sort(); + // Get unique relationship types from schema, filtering out empty/null values + const rels = [...new Set(schema.edges?.map(e => e.rel_type).filter(r => r) || [])].sort(); relSelect.innerHTML = '' + rels.map(r => ``).join(''); // Restore selection if still valid diff --git a/scidk/ui/templates/plugins.html b/scidk/ui/templates/plugins.html index 331d537..3f09938 100644 --- a/scidk/ui/templates/plugins.html +++ b/scidk/ui/templates/plugins.html @@ -1,5 +1,5 @@ {% extends 'base.html' %} -{% block title %}SciDK - Plugins{% endblock %} +{% block title %}-SciDK-> Plugins{% endblock %} {% block content %}

Plugins

Plugin registry and management UI will appear here.

diff --git a/scidk/ui/templates/settings.html b/scidk/ui/templates/settings.html index c892300..bccc7fe 100644 --- a/scidk/ui/templates/settings.html +++ b/scidk/ui/templates/settings.html @@ -1,23 +1,109 @@ {% extends 'base.html' %} -{% block title %}SciDK - Settings{% endblock %} +{% block title %}-SciDK-> Settings{% endblock %} +{% block head %} + +{% endblock %} {% block content %} -

Settings

-

Basic runtime information and counts.

- -
- Channel: {{ info.channel or 'stable' }} - Providers: {{ info.providers }} - Files viewer: {{ info.files_viewer or '(default)' }} -
+
+ + + + +
+ +
+

General

+

Basic runtime information and counts.

+
    +
  • Host: {{ info.host }}
  • +
  • Port: {{ info.port }}
  • +
  • Debug: {{ info.debug }}
  • +
  • Datasets: {{ info.dataset_count }}
  • +
  • Interpreters: {{ info.interpreter_count }}
  • +
+
+ Channel: {{ info.channel or 'stable' }} + Providers: {{ info.providers }} + Files viewer: {{ info.files_viewer or '(default)' }} +
+
-
-

Neo4j Connection

+ +
+

Neo4j Connection

+

Configure Neo4j database connection and settings.

@@ -57,10 +143,11 @@

Neo4j Connection

You can also set env vars: NEO4J_URI, NEO4J_USER, NEO4J_PASSWORD, SCIDK_NEO4J_DATABASE

If your Neo4j has authentication disabled, set environment variable NEO4J_AUTH=none before starting the app.

-
+
-
-

Interpreters

+ +
+

Interpreters

Registered interpreter mappings and selection rules.

Mappings (extension → interpreter ids)

    @@ -175,19 +262,24 @@

    Interpreter toggles

    fetchEffective().then(render); })(); -
+
-
-

Plugins

+ +
+

Plugins

Plugin registry summary.

  • Registered interpreter count: {{ interp_count or 0 }}
  • Extensions mapped: {{ ext_count or 0 }}
-
+
-
-

Rclone Interpretation

+ +
+

Rclone

+

Configure rclone settings for interpretation and mounts.

+ +

Interpretation

Tune streaming-based interpretation from rclone remotes. For very large scans, consider mounting the remote.

@@ -203,10 +295,8 @@

Rclone Interpretation

-
-
-

Rclone Mounts

+

Mounts

Manage rclone mounts under ./data/mounts.

@@ -242,9 +332,282 @@

Rclone Mounts


   

Note: On Windows, cmount/WinFsp may be required; this UI targets Linux/macOS primarily.

-
-{% endblock %} -{% block head %} +
+ + +
+

Integrations

+

Configure integration mappings, API endpoints, and matching options.

+ +

API Endpoint Mappings

+

Define API endpoints that map to Label types in SciDK.

+ + +
+

Add New Endpoint

+
+
+ + +
+
+ + +
+
+ + +
+
+ + +
+
+ + +
+
+ + +
+
+
+ + + +
+
+
+ + +

Registered Endpoints

+
+

No endpoints registered yet

+
+ +

Table Format Registry

+

Manage table formats for importing CSV, TSV, Excel, and Parquet files as link sources.

+ + +
+

Add Custom Format

+
+
+ + +
+
+ + +
+
+ + +
+
+ + +
+
+ + +
+
+
+ + +
+
+
+
+ + +
+
+ + + +
+
+ + + +
+ + +

Registered Formats

+
+

Loading formats...

+
+ +

Fuzzy Matching Options

+

Configure fuzzy matching algorithms for entity resolution in link creation.

+ + +
+

Global Fuzzy Matching Settings

+ + +
+
+ + +
Levenshtein: general fuzzy matching | Jaro-Winkler: names | Phonetic: sound-alike
+
+
+ + +
Minimum similarity score (0-100%) to consider a match
+
+
+ + +
+
+
+ + +
+
+
+
+ + +
+
+
+
+ + +
+
+
+ + + + + +
+ Advanced Options +
+
+ + +
+
+ + +
+
+
+ + +
+
+
+
+ + +
+ + +
+
+
+ + +
+

Hybrid Matching Architecture

+

+ Phase 1 (Client-Side): Pre-import matching using rapidfuzz - match external API/CSV data before pushing to Neo4j. +

+

+ Phase 2 (Server-Side): Post-import matching using Neo4j APOC functions - ultra-fast in-database entity resolution for existing nodes. +

+
+
+
+
+ + {% endblock %} diff --git a/scidk/ui/templates/workbook.html b/scidk/ui/templates/workbook.html index 7f5d1d9..3f507bf 100644 --- a/scidk/ui/templates/workbook.html +++ b/scidk/ui/templates/workbook.html @@ -1,5 +1,5 @@ {% extends 'base.html' %} -{% block title %}SciDK - Workbook Viewer{% endblock %} +{% block title %}-SciDK-> Workbook Viewer{% endblock %} {% block content %}

← Back to Files

{% if not dataset %} diff --git a/scidk/web/routes/__init__.py b/scidk/web/routes/__init__.py index 24eccba..6118be3 100644 --- a/scidk/web/routes/__init__.py +++ b/scidk/web/routes/__init__.py @@ -36,6 +36,8 @@ def register_blueprints(app): from . import api_annotations from . import api_labels from . import api_links + from . import api_integrations + from . import api_settings # Register UI blueprint app.register_blueprint(ui.bp) @@ -51,4 +53,6 @@ def register_blueprints(app): app.register_blueprint(api_providers.bp) app.register_blueprint(api_annotations.bp) app.register_blueprint(api_labels.bp) - app.register_blueprint(api_links.bp) + app.register_blueprint(api_integrations.bp) + app.register_blueprint(api_links.bp) # Keep for backward compatibility + app.register_blueprint(api_settings.bp) diff --git a/scidk/web/routes/api_admin.py b/scidk/web/routes/api_admin.py index 3c04f69..4c18dd8 100644 --- a/scidk/web/routes/api_admin.py +++ b/scidk/web/routes/api_admin.py @@ -367,3 +367,76 @@ def api_admin_cleanup_test_labels(): except Exception as e: return jsonify({'error': str(e)}), 500 + +@bp.post('/admin/cleanup-test-endpoints') +def api_admin_cleanup_test_endpoints(): + """Remove test API endpoints from the database (endpoints with test prefixes). + + This endpoint cleans up API endpoints created during testing that accumulate over time. 
+ + Returns: + JSON with counts of deleted endpoints + """ + try: + import sqlite3 + + # Test endpoint patterns to delete + test_patterns = [ + 'Test%', # Test Users API, etc + 'E2E%', # E2E test endpoints + 'Secure%', # Secure API from auth tests + 'Updated%', # Updated API from update tests + 'Bearer%', # Bearer API from auth tests + 'API%Key%', # API Key API from auth tests + '%JSONPath%', # JSONPath API tests + 'Original%', # Original API from edit tests + 'Delete%', # Delete Me API from delete tests + 'Cancel%', # Cancel Test API from cancel tests + ] + + # Use settings DB (where API endpoints are stored, not path_index) + settings_db = current_app.config.get('SCIDK_SETTINGS_DB', 'scidk_settings.db') + conn = sqlite3.connect(settings_db) + conn.execute('PRAGMA journal_mode=WAL') + try: + cur = conn.cursor() + + # Check if api_endpoints table exists + cur.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='api_endpoints'") + if not cur.fetchone(): + return jsonify({ + 'deleted_endpoints': 0, + 'message': 'API endpoints table does not exist' + }), 200 + + # Collect endpoint names that match test patterns + deleted_endpoints = [] + total_deleted = 0 + + for pattern in test_patterns: + cur.execute("SELECT name FROM api_endpoints WHERE name LIKE ?", (pattern,)) + matching_endpoints = [row[0] for row in cur.fetchall()] + deleted_endpoints.extend(matching_endpoints) + + # Delete matching endpoints + cur.execute("DELETE FROM api_endpoints WHERE name LIKE ?", (pattern,)) + total_deleted += cur.rowcount + + conn.commit() + + return jsonify({ + 'deleted_endpoints': total_deleted, + 'endpoint_names': deleted_endpoints[:10] + (['...'] if len(deleted_endpoints) > 10 else []), + 'total_test_endpoints_found': len(deleted_endpoints), + 'message': f'Successfully deleted {total_deleted} test endpoints' + }), 200 + + finally: + try: + conn.close() + except Exception: + pass + + except Exception as e: + return jsonify({'error': str(e)}), 500 + diff --git 
a/scidk/web/routes/api_integrations.py b/scidk/web/routes/api_integrations.py new file mode 100644 index 0000000..33ba2ba --- /dev/null +++ b/scidk/web/routes/api_integrations.py @@ -0,0 +1,397 @@ +""" +Blueprint for Integrations API routes. + +Provides REST endpoints for: +- Integration definitions CRUD +- Preview and execution of integration jobs +- Job status tracking +""" +from flask import Blueprint, jsonify, request, current_app + +bp = Blueprint('integrations', __name__, url_prefix='/api') + + +def _get_link_service(): + """Get or create LinkService instance.""" + from ...services.link_service import LinkService + if 'link_service' not in current_app.extensions.get('scidk', {}): + if 'scidk' not in current_app.extensions: + current_app.extensions['scidk'] = {} + current_app.extensions['scidk']['link_service'] = LinkService(current_app) + return current_app.extensions['scidk']['link_service'] + + +@bp.route('/integrations', methods=['GET']) +def list_links(): + """ + Get all link definitions. + + Returns: + { + "status": "success", + "links": [ + { + "id": "uuid", + "name": "Author to File", + "source_type": "csv", + "target_type": "label", + "match_strategy": "property", + "relationship_type": "AUTHORED", + ... + } + ] + } + """ + try: + service = _get_link_service() + links = service.list_link_definitions() + return jsonify({ + 'status': 'success', + 'links': links + }), 200 + except Exception as e: + return jsonify({ + 'status': 'error', + 'error': str(e) + }), 500 + + +@bp.route('/integrations/', methods=['GET']) +def get_link(link_id): + """ + Get a specific link definition by ID. 
+ + Returns: + { + "status": "success", + "link": {...} + } + """ + try: + service = _get_link_service() + link = service.get_link_definition(link_id) + + if not link: + return jsonify({ + 'status': 'error', + 'error': f'Link "{link_id}" not found' + }), 404 + + return jsonify({ + 'status': 'success', + 'link': link + }), 200 + except Exception as e: + return jsonify({ + 'status': 'error', + 'error': str(e) + }), 500 + + +@bp.route('/integrations', methods=['POST']) +def create_or_update_link(): + """ + Create or update a link definition. + + Request body: + { + "id": "optional-uuid", + "name": "Author to File", + "source_type": "csv", + "source_config": { + "csv_data": "name,email,file_path\\nAlice,alice@ex.com,file1.txt" + }, + "target_type": "label", + "target_config": { + "label": "File" + }, + "match_strategy": "property", + "match_config": { + "source_field": "file_path", + "target_field": "path" + }, + "relationship_type": "AUTHORED", + "relationship_props": { + "date": "2024-01-15" + } + } + + Returns: + { + "status": "success", + "link": {...} + } + """ + try: + data = request.get_json(force=True, silent=True) or {} + + if not data.get('name'): + return jsonify({ + 'status': 'error', + 'error': 'Link name is required' + }), 400 + + service = _get_link_service() + link = service.save_link_definition(data) + + return jsonify({ + 'status': 'success', + 'link': link + }), 200 + except ValueError as e: + return jsonify({ + 'status': 'error', + 'error': str(e) + }), 400 + except Exception as e: + return jsonify({ + 'status': 'error', + 'error': str(e) + }), 500 + + +@bp.route('/integrations/', methods=['DELETE']) +def delete_link(link_id): + """ + Delete a link definition. 
+ + Returns: + { + "status": "success", + "message": "Link deleted" + } + """ + try: + service = _get_link_service() + deleted = service.delete_link_definition(link_id) + + if not deleted: + return jsonify({ + 'status': 'error', + 'error': f'Link "{link_id}" not found' + }), 404 + + return jsonify({ + 'status': 'success', + 'message': f'Link "{link_id}" deleted' + }), 200 + except Exception as e: + return jsonify({ + 'status': 'error', + 'error': str(e) + }), 500 + + +@bp.route('/integrations//preview', methods=['POST']) +def preview_link(link_id): + """ + Preview link matches (dry-run). + + Request body (optional): + { + "limit": 10 + } + + Returns: + { + "status": "success", + "matches": [ + { + "source": {"name": "Alice", "email": "alice@ex.com", ...}, + "target": {"path": "file1.txt", ...} + } + ], + "count": 5 + } + """ + try: + service = _get_link_service() + link = service.get_link_definition(link_id) + + if not link: + return jsonify({ + 'status': 'error', + 'error': f'Link "{link_id}" not found' + }), 404 + + data = request.get_json(force=True, silent=True) or {} + limit = data.get('limit', 10) + + matches = service.preview_matches(link, limit=limit) + + return jsonify({ + 'status': 'success', + 'matches': matches, + 'count': len(matches) + }), 200 + except Exception as e: + return jsonify({ + 'status': 'error', + 'error': str(e) + }), 500 + + +@bp.route('/integrations//execute', methods=['POST']) +def execute_link(link_id): + """ + Execute link job (create relationships in Neo4j). 
+ + Returns: + { + "status": "success", + "job_id": "uuid" + } + """ + try: + service = _get_link_service() + job_id = service.execute_link_job(link_id) + + return jsonify({ + 'status': 'success', + 'job_id': job_id + }), 200 + except ValueError as e: + return jsonify({ + 'status': 'error', + 'error': str(e) + }), 404 + except Exception as e: + return jsonify({ + 'status': 'error', + 'error': str(e) + }), 500 + + +@bp.route('/integrations/jobs/', methods=['GET']) +def get_job_status(job_id): + """ + Get job status and progress. + + Returns: + { + "status": "success", + "job": { + "id": "uuid", + "link_def_id": "uuid", + "status": "completed", + "preview_count": 0, + "executed_count": 23, + "error": null, + "started_at": 1234567890.123, + "completed_at": 1234567895.456 + } + } + """ + try: + service = _get_link_service() + job = service.get_job_status(job_id) + + if not job: + return jsonify({ + 'status': 'error', + 'error': f'Job "{job_id}" not found' + }), 404 + + return jsonify({ + 'status': 'success', + 'job': job + }), 200 + except Exception as e: + return jsonify({ + 'status': 'error', + 'error': str(e) + }), 500 + + +@bp.route('/integrations/jobs', methods=['GET']) +def list_jobs(): + """ + List recent link jobs. + + Query params: + - limit: Maximum number of jobs to return (default: 20) + + Returns: + { + "status": "success", + "jobs": [...] + } + """ + try: + limit = int(request.args.get('limit', 20)) + service = _get_link_service() + jobs = service.list_jobs(limit=limit) + + return jsonify({ + 'status': 'success', + 'jobs': jobs + }), 200 + except Exception as e: + return jsonify({ + 'status': 'error', + 'error': str(e) + }), 500 + + +@bp.route('/integrations/available-labels', methods=['GET']) +def get_available_labels(): + """ + Get list of available labels for dropdown population. 
+ + Returns: + { + "status": "success", + "labels": [ + {"name": "Person", "properties": [...]}, + {"name": "File", "properties": [...]} + ] + } + """ + try: + from ...services.label_service import LabelService + label_service = LabelService(current_app) + labels = label_service.list_labels() + + return jsonify({ + 'status': 'success', + 'labels': labels + }), 200 + except Exception as e: + return jsonify({ + 'status': 'error', + 'error': str(e) + }), 500 + + +@bp.route('/integrations/migrate', methods=['POST']) +def migrate_links(): + """ + Migrate existing link definitions to Label→Label model. + + Returns: + { + "status": "success", + "report": { + "migrated": [...], + "skipped": [...], + "errors": [...] + } + } + """ + try: + from ...services.link_migration import migrate_all_links, generate_migration_report + service = _get_link_service() + + results = migrate_all_links(service) + report_text = generate_migration_report(results) + + return jsonify({ + 'status': 'success', + 'results': results, + 'report': report_text + }), 200 + except Exception as e: + return jsonify({ + 'status': 'error', + 'error': str(e) + }), 500 diff --git a/scidk/web/routes/api_labels.py b/scidk/web/routes/api_labels.py index 8231a82..2e2ebd0 100644 --- a/scidk/web/routes/api_labels.py +++ b/scidk/web/routes/api_labels.py @@ -483,3 +483,212 @@ def batch_delete_labels(): except Exception as e: return jsonify({'status': 'error', 'error': str(e)}), 500 + + +@bp.route('/labels//instances', methods=['GET']) +def get_label_instances(name): + """ + Get instances of a label from Neo4j. + + Query params: + - limit: max number of instances (default: 100) + - offset: pagination offset (default: 0) + + Returns: + { + "status": "success", + "instances": [ + {"id": "...", "properties": {"name": "John", "age": 30}}, + ... 
+ ], + "total": 150, + "limit": 100, + "offset": 0 + } + """ + try: + service = _get_label_service() + limit = int(request.args.get('limit', 100)) + offset = int(request.args.get('offset', 0)) + + result = service.get_label_instances(name, limit=limit, offset=offset) + + if result.get('status') == 'error': + return jsonify(result), 500 + + return jsonify(result), 200 + + except ValueError as e: + return jsonify({ + 'status': 'error', + 'error': str(e) + }), 404 + except Exception as e: + return jsonify({ + 'status': 'error', + 'error': str(e) + }), 500 + + +@bp.route('/labels//instance-count', methods=['GET']) +def get_label_instance_count(name): + """ + Get count of instances for a label from Neo4j. + + Returns: + { + "status": "success", + "count": 42 + } + """ + try: + service = _get_label_service() + result = service.get_label_instance_count(name) + + if result.get('status') == 'error': + return jsonify(result), 500 + + return jsonify(result), 200 + + except ValueError as e: + return jsonify({ + 'status': 'error', + 'error': str(e) + }), 404 + except Exception as e: + return jsonify({ + 'status': 'error', + 'error': str(e) + }), 500 + + +@bp.route('/labels//instances/', methods=['PATCH']) +def update_label_instance(name, instance_id): + """ + Update a single property of a label instance in Neo4j. 
+ + Request body: + { + "property": "name", + "value": "New Value" + } + + Returns: + { + "status": "success", + "instance": {...} + } + """ + try: + data = request.get_json(force=True, silent=True) or {} + property_name = data.get('property') + property_value = data.get('value') + + if not property_name: + return jsonify({ + 'status': 'error', + 'error': 'Property name is required' + }), 400 + + service = _get_label_service() + result = service.update_label_instance(name, instance_id, property_name, property_value) + + if result.get('status') == 'error': + return jsonify(result), 500 + + return jsonify(result), 200 + + except ValueError as e: + return jsonify({ + 'status': 'error', + 'error': str(e) + }), 404 + except Exception as e: + return jsonify({ + 'status': 'error', + 'error': str(e) + }), 500 + + +@bp.route('/labels/import/eda', methods=['POST']) +def import_eda_file(): + """ + Import experimental design from NC3Rs EDA file. + + Expects multipart/form-data with 'file' field containing .eda file. + + Returns: + { + "status": "success", + "imported": { + "labels": 3, + "relationships": 5 + }, + "labels": [...] 
+ } + """ + import tempfile + import os + from werkzeug.utils import secure_filename + from ...interpreters.eda_interpreter import parse_eda_file, eda_to_labels + + try: + # Check if file present + if 'file' not in request.files: + return jsonify({'status': 'error', 'error': 'No file provided'}), 400 + + file = request.files['file'] + if file.filename == '': + return jsonify({'status': 'error', 'error': 'Empty filename'}), 400 + + if not file.filename.endswith('.eda'): + return jsonify({'status': 'error', 'error': 'File must be .eda format'}), 400 + + # Save to temporary file + filename = secure_filename(file.filename) + with tempfile.NamedTemporaryFile(delete=False, suffix='.eda') as tmp: + file.save(tmp.name) + tmp_path = tmp.name + + try: + # Parse EDA file + eda_nodes, eda_edges = parse_eda_file(tmp_path) + labels_to_create = eda_to_labels(eda_nodes, eda_edges) + + # Create labels + service = _get_label_service() + created = [] + skipped = [] + + for label_def in labels_to_create: + try: + result = service.save_label(label_def) + created.append(result) + except Exception as e: + # Skip duplicates + skipped.append(label_def['name']) + continue + + total_relationships = sum(len(l.get('relationships', [])) for l in labels_to_create) + + response = { + 'status': 'success', + 'imported': { + 'labels': len(created), + 'relationships': total_relationships + }, + 'labels': created + } + + if skipped: + response['skipped'] = skipped + + return jsonify(response), 200 + + finally: + # Clean up temp file + if os.path.exists(tmp_path): + os.remove(tmp_path) + + except Exception as e: + return jsonify({'status': 'error', 'error': str(e)}), 500 diff --git a/scidk/web/routes/api_links.py b/scidk/web/routes/api_links.py index ef67a17..fd69586 100644 --- a/scidk/web/routes/api_links.py +++ b/scidk/web/routes/api_links.py @@ -1,12 +1,18 @@ """ -Blueprint for Links API routes. +Blueprint for Links API routes (DEPRECATED). 
+ +**DEPRECATED**: This module is kept for backward compatibility only. +Use api_integrations.py instead. All /api/links/* endpoints redirect to /api/integrations/* Provides REST endpoints for: -- Link definitions CRUD -- Preview and execution of link jobs -- Job status tracking +- Link definitions CRUD (deprecated, use integrations) +- Preview and execution of link jobs (deprecated, use integrations) +- Job status tracking (deprecated, use integrations) """ -from flask import Blueprint, jsonify, request, current_app +from flask import Blueprint, jsonify, request, current_app, redirect, url_for +import logging + +logger = logging.getLogger(__name__) bp = Blueprint('links', __name__, url_prefix='/api') @@ -26,6 +32,8 @@ def list_links(): """ Get all link definitions. + **DEPRECATED**: Use /api/integrations instead. + Returns: { "status": "success", @@ -42,6 +50,7 @@ def list_links(): ] } """ + logger.warning("DEPRECATED: /api/links endpoint called. Use /api/integrations instead.") try: service = _get_link_service() links = service.list_link_definitions() @@ -331,3 +340,67 @@ def list_jobs(): 'status': 'error', 'error': str(e) }), 500 + + +@bp.route('/links/available-labels', methods=['GET']) +def get_available_labels(): + """ + Get list of available labels for dropdown population. + + Returns: + { + "status": "success", + "labels": [ + {"name": "Person", "properties": [...]}, + {"name": "File", "properties": [...]} + ] + } + """ + try: + from ...services.label_service import LabelService + label_service = LabelService(current_app) + labels = label_service.list_labels() + + return jsonify({ + 'status': 'success', + 'labels': labels + }), 200 + except Exception as e: + return jsonify({ + 'status': 'error', + 'error': str(e) + }), 500 + + +@bp.route('/links/migrate', methods=['POST']) +def migrate_links(): + """ + Migrate existing link definitions to Label→Label model. 
+ + Returns: + { + "status": "success", + "report": { + "migrated": [...], + "skipped": [...], + "errors": [...] + } + } + """ + try: + from ...services.link_migration import migrate_all_links, generate_migration_report + service = _get_link_service() + + results = migrate_all_links(service) + report_text = generate_migration_report(results) + + return jsonify({ + 'status': 'success', + 'results': results, + 'report': report_text + }), 200 + except Exception as e: + return jsonify({ + 'status': 'error', + 'error': str(e) + }), 500 diff --git a/scidk/web/routes/api_settings.py b/scidk/web/routes/api_settings.py new file mode 100644 index 0000000..148337b --- /dev/null +++ b/scidk/web/routes/api_settings.py @@ -0,0 +1,747 @@ +""" +Blueprint for Settings API routes. + +Provides REST endpoints for: +- API endpoint registry CRUD +- Endpoint connection testing +- Settings persistence +""" +from flask import Blueprint, jsonify, request, current_app +import requests +from jsonpath_ng import parse as jsonpath_parse + +bp = Blueprint('settings', __name__, url_prefix='/api') + + +def _get_endpoint_registry(): + """Get or create APIEndpointRegistry instance.""" + from ...core.api_endpoint_registry import APIEndpointRegistry, get_encryption_key + + if 'api_endpoint_registry' not in current_app.extensions.get('scidk', {}): + if 'scidk' not in current_app.extensions: + current_app.extensions['scidk'] = {} + + # Get settings DB path + settings_db = current_app.config.get('SCIDK_SETTINGS_DB', 'scidk_settings.db') + encryption_key = get_encryption_key() + + current_app.extensions['scidk']['api_endpoint_registry'] = APIEndpointRegistry( + db_path=settings_db, + encryption_key=encryption_key + ) + + return current_app.extensions['scidk']['api_endpoint_registry'] + + +@bp.route('/settings/api-endpoints', methods=['GET']) +def list_api_endpoints(): + """ + Get all registered API endpoints. 
+ + Returns: + { + "status": "success", + "endpoints": [ + { + "id": "uuid", + "name": "Users API", + "url": "https://api.example.com/users", + "auth_method": "bearer", + "json_path": "$.data[*]", + "target_label": "User", + "field_mappings": {"email": "email", "name": "fullName"}, + "created_at": 1234567890.123, + "updated_at": 1234567890.123 + } + ] + } + """ + try: + registry = _get_endpoint_registry() + endpoints = registry.list_endpoints() + return jsonify({ + 'status': 'success', + 'endpoints': endpoints + }), 200 + except Exception as e: + return jsonify({ + 'status': 'error', + 'error': str(e) + }), 500 + + +@bp.route('/settings/api-endpoints/', methods=['GET']) +def get_api_endpoint(endpoint_id): + """ + Get a specific API endpoint by ID. + + Returns: + { + "status": "success", + "endpoint": {...} + } + """ + try: + registry = _get_endpoint_registry() + endpoint = registry.get_endpoint(endpoint_id) + + if not endpoint: + return jsonify({ + 'status': 'error', + 'error': f'Endpoint "{endpoint_id}" not found' + }), 404 + + return jsonify({ + 'status': 'success', + 'endpoint': endpoint + }), 200 + except Exception as e: + return jsonify({ + 'status': 'error', + 'error': str(e) + }), 500 + + +@bp.route('/settings/api-endpoints', methods=['POST']) +def create_api_endpoint(): + """ + Create a new API endpoint configuration. 
+ + Request body: + { + "name": "Users API", + "url": "https://api.example.com/users", + "auth_method": "bearer", // "none", "bearer", or "api_key" + "auth_value": "token123", // optional + "json_path": "$.data[*]", // optional + "target_label": "User", // optional + "field_mappings": { // optional + "email": "email", + "name": "fullName" + } + } + + Returns: + { + "status": "success", + "endpoint": {...} + } + """ + try: + data = request.get_json() + if not data: + return jsonify({ + 'status': 'error', + 'error': 'Request body must be JSON' + }), 400 + + registry = _get_endpoint_registry() + endpoint = registry.create_endpoint(data) + + return jsonify({ + 'status': 'success', + 'endpoint': endpoint + }), 201 + except ValueError as e: + return jsonify({ + 'status': 'error', + 'error': str(e) + }), 400 + except Exception as e: + return jsonify({ + 'status': 'error', + 'error': str(e) + }), 500 + + +@bp.route('/settings/api-endpoints/', methods=['PUT', 'PATCH']) +def update_api_endpoint(endpoint_id): + """ + Update an existing API endpoint. + + Request body: Same as create, but all fields optional + + Returns: + { + "status": "success", + "endpoint": {...} + } + """ + try: + data = request.get_json() + if not data: + return jsonify({ + 'status': 'error', + 'error': 'Request body must be JSON' + }), 400 + + registry = _get_endpoint_registry() + endpoint = registry.update_endpoint(endpoint_id, data) + + return jsonify({ + 'status': 'success', + 'endpoint': endpoint + }), 200 + except ValueError as e: + return jsonify({ + 'status': 'error', + 'error': str(e) + }), 400 + except Exception as e: + return jsonify({ + 'status': 'error', + 'error': str(e) + }), 500 + + +@bp.route('/settings/api-endpoints/', methods=['DELETE']) +def delete_api_endpoint(endpoint_id): + """ + Delete an API endpoint. 
+ + Returns: + { + "status": "success" + } + """ + try: + registry = _get_endpoint_registry() + deleted = registry.delete_endpoint(endpoint_id) + + if not deleted: + return jsonify({ + 'status': 'error', + 'error': f'Endpoint "{endpoint_id}" not found' + }), 404 + + return jsonify({ + 'status': 'success' + }), 200 + except Exception as e: + return jsonify({ + 'status': 'error', + 'error': str(e) + }), 500 + + +@bp.route('/settings/api-endpoints//test', methods=['POST']) +def test_api_endpoint(endpoint_id): + """ + Test an API endpoint connection and return sample data. + + Returns: + { + "status": "success", + "test_result": { + "success": true, + "status_code": 200, + "sample_data": [...], // First 5 records after JSONPath extraction + "total_records": 100, // Total records found + "error": null + } + } + """ + try: + registry = _get_endpoint_registry() + endpoint = registry.get_endpoint(endpoint_id) + + if not endpoint: + return jsonify({ + 'status': 'error', + 'error': f'Endpoint "{endpoint_id}" not found' + }), 404 + + # Get decrypted auth value + auth_value = registry.get_decrypted_auth(endpoint_id) + + # Build request headers + headers = {} + if endpoint['auth_method'] == 'bearer' and auth_value: + headers['Authorization'] = f'Bearer {auth_value}' + elif endpoint['auth_method'] == 'api_key' and auth_value: + headers['X-API-Key'] = auth_value + + # Make request + try: + response = requests.get(endpoint['url'], headers=headers, timeout=10) + response.raise_for_status() + except requests.exceptions.Timeout: + return jsonify({ + 'status': 'success', + 'test_result': { + 'success': False, + 'error': 'Request timed out after 10 seconds' + } + }), 200 + except requests.exceptions.RequestException as e: + return jsonify({ + 'status': 'success', + 'test_result': { + 'success': False, + 'error': str(e) + } + }), 200 + + # Parse JSON response + try: + data = response.json() + except Exception: + return jsonify({ + 'status': 'success', + 'test_result': { + 'success': 
False, + 'error': 'Response is not valid JSON' + } + }), 200 + + # Apply JSONPath if specified + records = data + if endpoint.get('json_path'): + try: + jsonpath_expr = jsonpath_parse(endpoint['json_path']) + matches = [match.value for match in jsonpath_expr.find(data)] + records = matches + except Exception as e: + return jsonify({ + 'status': 'success', + 'test_result': { + 'success': False, + 'error': f'JSONPath error: {str(e)}' + } + }), 200 + + # Ensure records is a list + if not isinstance(records, list): + records = [records] + + # Return sample + return jsonify({ + 'status': 'success', + 'test_result': { + 'success': True, + 'status_code': response.status_code, + 'sample_data': records[:5], # First 5 records + 'total_records': len(records), + 'error': None + } + }), 200 + + except Exception as e: + return jsonify({ + 'status': 'error', + 'error': str(e) + }), 500 + + +def _get_table_format_registry(): + """Get or create TableFormatRegistry instance.""" + from ...core.table_format_registry import TableFormatRegistry + + if 'table_format_registry' not in current_app.extensions.get('scidk', {}): + if 'scidk' not in current_app.extensions: + current_app.extensions['scidk'] = {} + + # Get settings DB path + settings_db = current_app.config.get('SCIDK_SETTINGS_DB', 'scidk_settings.db') + + current_app.extensions['scidk']['table_format_registry'] = TableFormatRegistry( + db_path=settings_db + ) + + return current_app.extensions['scidk']['table_format_registry'] + + +@bp.route('/settings/table-formats', methods=['GET']) +def list_table_formats(): + """ + Get all registered table format configurations. + + Query params: + - include_preprogrammed: Include pre-programmed formats (default: true) + + Returns: + { + "status": "success", + "formats": [...] 
+ } + """ + try: + include_preprogrammed = request.args.get('include_preprogrammed', 'true').lower() == 'true' + registry = _get_table_format_registry() + formats = registry.list_formats(include_preprogrammed=include_preprogrammed) + return jsonify({ + 'status': 'success', + 'formats': formats + }), 200 + except Exception as e: + return jsonify({ + 'status': 'error', + 'error': str(e) + }), 500 + + +@bp.route('/settings/table-formats/', methods=['GET']) +def get_table_format(format_id): + """Get a specific table format by ID.""" + try: + registry = _get_table_format_registry() + format_config = registry.get_format(format_id) + + if not format_config: + return jsonify({ + 'status': 'error', + 'error': f'Format "{format_id}" not found' + }), 404 + + return jsonify({ + 'status': 'success', + 'format': format_config + }), 200 + except Exception as e: + return jsonify({ + 'status': 'error', + 'error': str(e) + }), 500 + + +@bp.route('/settings/table-formats', methods=['POST']) +def create_table_format(): + """Create a new table format configuration.""" + try: + data = request.get_json() + if not data: + return jsonify({ + 'status': 'error', + 'error': 'Request body must be JSON' + }), 400 + + registry = _get_table_format_registry() + format_config = registry.create_format(data) + + return jsonify({ + 'status': 'success', + 'format': format_config + }), 201 + except ValueError as e: + return jsonify({ + 'status': 'error', + 'error': str(e) + }), 400 + except Exception as e: + return jsonify({ + 'status': 'error', + 'error': str(e) + }), 500 + + +@bp.route('/settings/table-formats/', methods=['PUT', 'PATCH']) +def update_table_format(format_id): + """Update an existing table format.""" + try: + data = request.get_json() + if not data: + return jsonify({ + 'status': 'error', + 'error': 'Request body must be JSON' + }), 400 + + registry = _get_table_format_registry() + format_config = registry.update_format(format_id, data) + + return jsonify({ + 'status': 'success', + 
'format': format_config + }), 200 + except ValueError as e: + return jsonify({ + 'status': 'error', + 'error': str(e) + }), 400 + except Exception as e: + return jsonify({ + 'status': 'error', + 'error': str(e) + }), 500 + + +@bp.route('/settings/table-formats/', methods=['DELETE']) +def delete_table_format(format_id): + """Delete a table format.""" + try: + registry = _get_table_format_registry() + deleted = registry.delete_format(format_id) + + if not deleted: + return jsonify({ + 'status': 'error', + 'error': f'Format "{format_id}" not found' + }), 404 + + return jsonify({ + 'status': 'success' + }), 200 + except ValueError as e: + return jsonify({ + 'status': 'error', + 'error': str(e) + }), 400 + except Exception as e: + return jsonify({ + 'status': 'error', + 'error': str(e) + }), 500 + + +@bp.route('/settings/table-formats/detect', methods=['POST']) +def detect_table_format(): + """Auto-detect table format from uploaded file.""" + try: + if 'file' not in request.files: + return jsonify({ + 'status': 'error', + 'error': 'No file provided' + }), 400 + + file = request.files['file'] + if not file or not file.filename: + return jsonify({ + 'status': 'error', + 'error': 'Invalid file' + }), 400 + + file_content = file.read() + registry = _get_table_format_registry() + detected = registry.detect_format(file_content, filename=file.filename) + + if 'error' in detected: + return jsonify({ + 'status': 'error', + 'error': detected['error'] + }), 400 + + return jsonify({ + 'status': 'success', + 'detected': detected + }), 200 + except Exception as e: + return jsonify({ + 'status': 'error', + 'error': str(e) + }), 500 + + +@bp.route('/settings/table-formats//preview', methods=['POST']) +def preview_table_data(format_id): + """Preview table data using a format configuration.""" + try: + if 'file' not in request.files: + return jsonify({ + 'status': 'error', + 'error': 'No file provided' + }), 400 + + file = request.files['file'] + if not file or not file.filename: + 
return jsonify({ + 'status': 'error', + 'error': 'Invalid file' + }), 400 + + file_content = file.read() + num_rows = int(request.args.get('num_rows', 5)) + + registry = _get_table_format_registry() + preview = registry.preview_data(file_content, format_id, num_rows=num_rows) + + if 'error' in preview: + return jsonify({ + 'status': 'error', + 'error': preview['error'] + }), 400 + + return jsonify({ + 'status': 'success', + 'preview': preview + }), 200 + except Exception as e: + return jsonify({ + 'status': 'error', + 'error': str(e) + }), 500 + + +def _get_fuzzy_matching_service(): + """Get or create FuzzyMatchingService instance.""" + from ...core.fuzzy_matching import FuzzyMatchingService + + if 'fuzzy_matching_service' not in current_app.extensions.get('scidk', {}): + if 'scidk' not in current_app.extensions: + current_app.extensions['scidk'] = {} + + # Get settings DB path + settings_db = current_app.config.get('SCIDK_SETTINGS_DB', 'scidk_settings.db') + + current_app.extensions['scidk']['fuzzy_matching_service'] = FuzzyMatchingService( + db_path=settings_db + ) + + return current_app.extensions['scidk']['fuzzy_matching_service'] + + +@bp.route('/settings/fuzzy-matching', methods=['GET']) +def get_fuzzy_matching_settings(): + """ + Get global fuzzy matching settings. + + Returns: + { + "status": "success", + "settings": { + "algorithm": "levenshtein", + "threshold": 0.80, + "case_sensitive": false, + ... + } + } + """ + try: + service = _get_fuzzy_matching_service() + settings = service.get_global_settings() + + return jsonify({ + 'status': 'success', + 'settings': settings.to_dict() + }), 200 + except Exception as e: + return jsonify({ + 'status': 'error', + 'error': str(e) + }), 500 + + +@bp.route('/settings/fuzzy-matching', methods=['POST', 'PUT']) +def update_fuzzy_matching_settings(): + """ + Update global fuzzy matching settings. 
+ + Request body: + { + "algorithm": "levenshtein", + "threshold": 0.75, + "case_sensitive": false, + "normalize_whitespace": true, + "strip_punctuation": true, + "phonetic_enabled": false, + "phonetic_algorithm": "metaphone", + "min_string_length": 3, + "max_comparisons": 10000, + "show_confidence_scores": true + } + + Returns: + { + "status": "success", + "settings": {...} + } + """ + try: + data = request.get_json() + if not data: + return jsonify({ + 'status': 'error', + 'error': 'Request body must be JSON' + }), 400 + + service = _get_fuzzy_matching_service() + settings = service.update_global_settings(data) + + return jsonify({ + 'status': 'success', + 'settings': settings.to_dict() + }), 200 + except Exception as e: + return jsonify({ + 'status': 'error', + 'error': str(e) + }), 500 + + +@bp.route('/settings/fuzzy-matching/preview', methods=['POST']) +def preview_fuzzy_matching(): + """ + Preview fuzzy matching results for external data. + + Request body: + { + "external_records": [ + {"name": "Jon Smith", "email": "jon@example.com"}, + ... + ], + "existing_nodes": [ + {"name": "John Smith", "email": "john@example.com"}, + ... 
+ ], + "match_key": "name", + "settings": { // Optional override + "algorithm": "levenshtein", + "threshold": 0.75 + } + } + + Returns: + { + "status": "success", + "matches": [ + { + "external_record": {...}, + "matched_node": {...} or null, + "confidence": 0.85, + "is_match": true + } + ] + } + """ + try: + data = request.get_json() + if not data: + return jsonify({ + 'status': 'error', + 'error': 'Request body must be JSON' + }), 400 + + external_records = data.get('external_records', []) + existing_nodes = data.get('existing_nodes', []) + match_key = data.get('match_key') + + if not match_key: + return jsonify({ + 'status': 'error', + 'error': 'match_key is required' + }), 400 + + service = _get_fuzzy_matching_service() + + # Parse settings override if provided + settings = None + if 'settings' in data and data['settings']: + from ...core.fuzzy_matching import FuzzyMatchSettings + settings = FuzzyMatchSettings.from_dict(data['settings']) + + matches = service.match_external_data( + external_records, + existing_nodes, + match_key, + settings + ) + + return jsonify({ + 'status': 'success', + 'matches': matches, + 'total_external': len(external_records), + 'total_matched': sum(1 for m in matches if m['is_match']) + }), 200 + except Exception as e: + return jsonify({ + 'status': 'error', + 'error': str(e) + }), 500 diff --git a/scidk/web/routes/ui.py b/scidk/web/routes/ui.py index 396e385..730d5d6 100644 --- a/scidk/web/routes/ui.py +++ b/scidk/web/routes/ui.py @@ -188,10 +188,16 @@ def labels(): return render_template('labels.html') +@bp.get('/integrate') +def integrate(): + """Integration definitions page for relationship creation workflows.""" + return render_template('integrations.html') + + @bp.get('/links') -def links(): - """Link definitions page for relationship creation workflows.""" - return render_template('links.html') +def links_redirect(): + """Backward compatibility redirect: /links → /integrate""" + return redirect(url_for('ui.integrate')) 
@bp.get('/settings') diff --git a/tests/test_api_endpoint_registry.py b/tests/test_api_endpoint_registry.py new file mode 100644 index 0000000..faa4d5c --- /dev/null +++ b/tests/test_api_endpoint_registry.py @@ -0,0 +1,314 @@ +""" +Tests for API Endpoint Registry. + +Tests CRUD operations, encryption, and validation. +""" +import pytest +import tempfile +import os +from scidk.core.api_endpoint_registry import APIEndpointRegistry, get_encryption_key +from cryptography.fernet import Fernet + + +@pytest.fixture +def registry(): + """Create a temporary registry for testing.""" + # Use temporary database + with tempfile.NamedTemporaryFile(suffix='.db', delete=False) as tmp: + db_path = tmp.name + + # Generate test encryption key + encryption_key = Fernet.generate_key().decode() + + reg = APIEndpointRegistry(db_path=db_path, encryption_key=encryption_key) + + yield reg + + # Cleanup + reg.db.close() + if os.path.exists(db_path): + os.unlink(db_path) + + +def test_create_endpoint(registry): + """Test creating a new API endpoint.""" + endpoint_data = { + 'name': 'Test API', + 'url': 'https://api.example.com/users', + 'auth_method': 'bearer', + 'auth_value': 'secret_token_123', + 'json_path': '$.data[*]', + 'target_label': 'User', + 'field_mappings': { + 'email': 'email', + 'full_name': 'name' + } + } + + endpoint = registry.create_endpoint(endpoint_data) + + assert endpoint['id'] is not None + assert endpoint['name'] == 'Test API' + assert endpoint['url'] == 'https://api.example.com/users' + assert endpoint['auth_method'] == 'bearer' + assert endpoint['json_path'] == '$.data[*]' + assert endpoint['target_label'] == 'User' + assert endpoint['field_mappings'] == {'email': 'email', 'full_name': 'name'} + assert 'auth_value' not in endpoint # Should not be included by default + + +def test_create_endpoint_validation(registry): + """Test endpoint creation validation.""" + # Missing name + with pytest.raises(ValueError, match="Endpoint name is required"): + 
registry.create_endpoint({'url': 'https://example.com'}) + + # Missing URL + with pytest.raises(ValueError, match="Endpoint URL is required"): + registry.create_endpoint({'name': 'Test'}) + + +def test_create_duplicate_name(registry): + """Test that duplicate names are rejected.""" + data = { + 'name': 'Duplicate Test', + 'url': 'https://api.example.com/users' + } + + # First creation should succeed + registry.create_endpoint(data) + + # Second creation with same name should fail + with pytest.raises(ValueError, match="already exists"): + registry.create_endpoint(data) + + +def test_get_endpoint(registry): + """Test retrieving an endpoint by ID.""" + data = { + 'name': 'Get Test', + 'url': 'https://api.example.com/data' + } + + created = registry.create_endpoint(data) + endpoint_id = created['id'] + + retrieved = registry.get_endpoint(endpoint_id) + + assert retrieved is not None + assert retrieved['id'] == endpoint_id + assert retrieved['name'] == 'Get Test' + + +def test_get_nonexistent_endpoint(registry): + """Test getting a nonexistent endpoint returns None.""" + result = registry.get_endpoint('nonexistent-id') + assert result is None + + +def test_get_endpoint_by_name(registry): + """Test retrieving an endpoint by name.""" + data = { + 'name': 'Name Search Test', + 'url': 'https://api.example.com/search' + } + + created = registry.create_endpoint(data) + retrieved = registry.get_endpoint_by_name('Name Search Test') + + assert retrieved is not None + assert retrieved['id'] == created['id'] + assert retrieved['name'] == 'Name Search Test' + + +def test_list_endpoints(registry): + """Test listing all endpoints.""" + # Create multiple endpoints + registry.create_endpoint({'name': 'API 1', 'url': 'https://example.com/1'}) + registry.create_endpoint({'name': 'API 2', 'url': 'https://example.com/2'}) + registry.create_endpoint({'name': 'API 3', 'url': 'https://example.com/3'}) + + endpoints = registry.list_endpoints() + + assert len(endpoints) == 3 + names = 
[e['name'] for e in endpoints] + assert 'API 1' in names + assert 'API 2' in names + assert 'API 3' in names + + +def test_list_endpoints_empty(registry): + """Test listing endpoints when none exist.""" + endpoints = registry.list_endpoints() + assert endpoints == [] + + +def test_update_endpoint(registry): + """Test updating an endpoint.""" + data = { + 'name': 'Original Name', + 'url': 'https://example.com/original', + 'auth_method': 'none' + } + + created = registry.create_endpoint(data) + endpoint_id = created['id'] + + # Update + updates = { + 'name': 'Updated Name', + 'url': 'https://example.com/updated', + 'auth_method': 'bearer', + 'auth_value': 'new_token', + 'target_label': 'UpdatedLabel' + } + + updated = registry.update_endpoint(endpoint_id, updates) + + assert updated['name'] == 'Updated Name' + assert updated['url'] == 'https://example.com/updated' + assert updated['auth_method'] == 'bearer' + assert updated['target_label'] == 'UpdatedLabel' + + +def test_update_nonexistent_endpoint(registry): + """Test updating a nonexistent endpoint raises error.""" + with pytest.raises(ValueError, match="not found"): + registry.update_endpoint('nonexistent-id', {'name': 'Test'}) + + +def test_update_endpoint_name_conflict(registry): + """Test that renaming to an existing name is rejected.""" + registry.create_endpoint({'name': 'Endpoint 1', 'url': 'https://example.com/1'}) + created2 = registry.create_endpoint({'name': 'Endpoint 2', 'url': 'https://example.com/2'}) + + # Try to rename Endpoint 2 to Endpoint 1 + with pytest.raises(ValueError, match="already exists"): + registry.update_endpoint(created2['id'], {'name': 'Endpoint 1'}) + + +def test_delete_endpoint(registry): + """Test deleting an endpoint.""" + data = { + 'name': 'Delete Test', + 'url': 'https://example.com/delete' + } + + created = registry.create_endpoint(data) + endpoint_id = created['id'] + + # Verify it exists + assert registry.get_endpoint(endpoint_id) is not None + + # Delete + result = 
registry.delete_endpoint(endpoint_id) + assert result is True + + # Verify it's gone + assert registry.get_endpoint(endpoint_id) is None + + +def test_delete_nonexistent_endpoint(registry): + """Test deleting a nonexistent endpoint returns False.""" + result = registry.delete_endpoint('nonexistent-id') + assert result is False + + +def test_auth_value_encryption(registry): + """Test that auth values are encrypted at rest.""" + data = { + 'name': 'Encryption Test', + 'url': 'https://example.com/secure', + 'auth_method': 'bearer', + 'auth_value': 'super_secret_token' + } + + endpoint = registry.create_endpoint(data) + endpoint_id = endpoint['id'] + + # Get decrypted auth value + decrypted = registry.get_decrypted_auth(endpoint_id) + assert decrypted == 'super_secret_token' + + # Verify it's encrypted in the database + cursor = registry.db.execute( + "SELECT auth_value_encrypted FROM api_endpoints WHERE id = ?", + (endpoint_id,) + ) + row = cursor.fetchone() + encrypted_value = row[0] + + # Encrypted value should be different from original + assert encrypted_value != 'super_secret_token' + assert len(encrypted_value) > len('super_secret_token') + + +def test_auth_value_optional(registry): + """Test that auth value is optional.""" + data = { + 'name': 'No Auth Test', + 'url': 'https://example.com/public', + 'auth_method': 'none' + } + + endpoint = registry.create_endpoint(data) + assert endpoint['auth_method'] == 'none' + + # Getting auth for endpoint with no auth should return None + decrypted = registry.get_decrypted_auth(endpoint['id']) + assert decrypted is None + + +def test_field_mappings_serialization(registry): + """Test that field mappings are correctly serialized/deserialized.""" + data = { + 'name': 'Mappings Test', + 'url': 'https://example.com/api', + 'field_mappings': { + 'api_field_1': 'label_prop_1', + 'api_field_2': 'label_prop_2', + 'nested.field': 'flat_field' + } + } + + endpoint = registry.create_endpoint(data) + retrieved = 
registry.get_endpoint(endpoint['id']) + + assert retrieved['field_mappings'] == data['field_mappings'] + assert isinstance(retrieved['field_mappings'], dict) + + +def test_default_values(registry): + """Test that optional fields have sensible defaults.""" + data = { + 'name': 'Minimal Test', + 'url': 'https://example.com/minimal' + } + + endpoint = registry.create_endpoint(data) + + assert endpoint['auth_method'] == 'none' + assert endpoint['json_path'] == '' + assert endpoint['target_label'] == '' + assert endpoint['field_mappings'] == {} + + +def test_get_encryption_key_from_env(monkeypatch): + """Test getting encryption key from environment variable.""" + test_key = Fernet.generate_key().decode() + monkeypatch.setenv('SCIDK_API_ENCRYPTION_KEY', test_key) + + key = get_encryption_key() + assert key == test_key + + +def test_get_encryption_key_generates_ephemeral(monkeypatch): + """Test that encryption key is generated when not in environment.""" + monkeypatch.delenv('SCIDK_API_ENCRYPTION_KEY', raising=False) + + key = get_encryption_key() + assert key is not None + assert len(key) > 0 + + # Verify it's a valid Fernet key + Fernet(key.encode()) # Should not raise diff --git a/tests/test_eda_interpreter.py b/tests/test_eda_interpreter.py new file mode 100644 index 0000000..bb88263 --- /dev/null +++ b/tests/test_eda_interpreter.py @@ -0,0 +1,297 @@ +""" +Unit tests for EDA file interpreter. 
+""" +import json +import zipfile +import tempfile +from pathlib import Path +import pytest + +from scidk.interpreters.eda_interpreter import parse_eda_file, eda_to_labels + + +def create_test_eda_file(data, filename='test.eda'): + """Helper to create a test EDA file.""" + tmp = tempfile.NamedTemporaryFile(suffix='.eda', delete=False) + with zipfile.ZipFile(tmp.name, 'w') as zf: + zf.writestr('model', json.dumps(data)) + tmp.close() + return tmp.name + + +def test_parse_eda_file_single_node(): + """Test parsing .eda file with single node.""" + eda_data = { + 'childShapes': [ + { + 'resourceId': 'n0', + 'stencil': {'id': 'Treatment'}, + 'properties': {'name': 'Drug A', 'dose': '10mg'}, + 'propertyTypes': {'name': 'String', 'dose': 'String'}, + 'outgoing': [], + 'incoming': [] + } + ] + } + + tmp_path = create_test_eda_file(eda_data) + try: + nodes, edges = parse_eda_file(tmp_path) + assert len(nodes) == 1 + assert len(edges) == 0 + assert nodes[0]['stencil']['id'] == 'Treatment' + assert nodes[0]['properties']['name'] == 'Drug A' + finally: + Path(tmp_path).unlink() + + +def test_parse_eda_file_with_edges(): + """Test parsing .eda file with nodes and edges.""" + eda_data = { + 'childShapes': [ + { + 'resourceId': 'n0', + 'stencil': {'id': 'Treatment'}, + 'properties': {'name': 'Drug A'}, + 'propertyTypes': {'name': 'String'}, + 'outgoing': [], + 'incoming': [] + }, + { + 'resourceId': 'n1', + 'stencil': {'id': 'Subject'}, + 'properties': {'id': 'Mouse001'}, + 'propertyTypes': {'id': 'String'}, + 'outgoing': [], + 'incoming': [] + }, + { + 'resourceId': 'e0', + 'stencil': {'id': 'APPLIED_TO'}, + 'target': {'resourceId': 'n1'}, + 'properties': {'name': 'applies'}, + 'propertyTypes': {}, + 'outgoing': [{'resourceId': 'n1'}], + 'incoming': [{'resourceId': 'n0'}] + } + ] + } + + tmp_path = create_test_eda_file(eda_data) + try: + nodes, edges = parse_eda_file(tmp_path) + assert len(nodes) == 2 + assert len(edges) == 1 + assert edges[0]['stencil']['id'] == 'APPLIED_TO' + 
finally: + Path(tmp_path).unlink() + + +def test_parse_eda_file_not_found(): + """Test parsing nonexistent file raises FileNotFoundError.""" + with pytest.raises(FileNotFoundError): + parse_eda_file('/nonexistent/file.eda') + + +def test_parse_eda_file_wrong_extension(): + """Test parsing file with wrong extension raises ValueError.""" + tmp = tempfile.NamedTemporaryFile(suffix='.txt', delete=False) + tmp.close() + try: + with pytest.raises(ValueError, match='Not an EDA file'): + parse_eda_file(tmp.name) + finally: + Path(tmp.name).unlink() + + +def test_eda_to_labels_basic(): + """Test converting EDA nodes to labels.""" + eda_nodes = [ + { + 'resourceId': 'n0', + 'stencil': {'id': 'Treatment'}, + 'properties': {'name': 'Drug A', 'dose': '10mg'}, + 'propertyTypes': {'name': 'String', 'dose': 'String'}, + 'outgoing': [], + 'incoming': [] + } + ] + + labels = eda_to_labels(eda_nodes) + + assert len(labels) == 1 + assert labels[0]['name'] == 'Treatment' + assert len(labels[0]['properties']) == 2 + + prop_names = [p['name'] for p in labels[0]['properties']] + assert 'name' in prop_names + assert 'dose' in prop_names + + # Check type mapping + for prop in labels[0]['properties']: + assert prop['type'] == 'string' + assert prop['required'] == False + + +def test_eda_to_labels_type_mapping(): + """Test EDA to scidk type mapping.""" + eda_nodes = [ + { + 'resourceId': 'n0', + 'stencil': {'id': 'Measurement'}, + 'properties': { + 'text': 'value', + 'count': 42, + 'weight': 3.14, + 'active': True, + 'date': '2024-01-01' + }, + 'propertyTypes': { + 'text': 'String', + 'count': 'Integer', + 'weight': 'Float', + 'active': 'Boolean', + 'date': 'Date' + }, + 'outgoing': [], + 'incoming': [] + } + ] + + labels = eda_to_labels(eda_nodes) + props = {p['name']: p['type'] for p in labels[0]['properties']} + + assert props['text'] == 'string' + assert props['count'] == 'number' + assert props['weight'] == 'number' + assert props['active'] == 'boolean' + assert props['date'] == 'date' + 
+ +def test_eda_to_labels_with_relationships(): + """Test converting EDA nodes with relationships.""" + eda_nodes = [ + { + 'resourceId': 'n0', + 'stencil': {'id': 'Treatment'}, + 'properties': {'name': 'Drug A'}, + 'propertyTypes': {'name': 'String'}, + 'outgoing': [{'target': 'n1'}], + 'incoming': [] + }, + { + 'resourceId': 'n1', + 'stencil': {'id': 'Subject'}, + 'properties': {'id': 'Mouse001'}, + 'propertyTypes': {'id': 'String'}, + 'outgoing': [], + 'incoming': [{'target': 'n0'}] + } + ] + + labels = eda_to_labels(eda_nodes) + + assert len(labels) == 2 + + treatment = next(l for l in labels if l['name'] == 'Treatment') + assert len(treatment['relationships']) == 1 + assert treatment['relationships'][0]['type'] == 'APPLIED_TO' + assert treatment['relationships'][0]['target_label'] == 'Subject' + + +def test_eda_to_labels_dedupe_properties(): + """Test that duplicate properties are merged.""" + eda_nodes = [ + { + 'resourceId': 'n0', + 'stencil': {'id': 'Subject'}, + 'properties': {'id': 'M001', 'weight': 25}, + 'propertyTypes': {'id': 'String', 'weight': 'Integer'}, + 'outgoing': [], + 'incoming': [] + }, + { + 'resourceId': 'n1', + 'stencil': {'id': 'Subject'}, + 'properties': {'id': 'M002', 'age': 8}, + 'propertyTypes': {'id': 'String', 'age': 'Integer'}, + 'outgoing': [], + 'incoming': [] + } + ] + + labels = eda_to_labels(eda_nodes) + + # Should have only one Subject label + assert len(labels) == 1 + assert labels[0]['name'] == 'Subject' + + # Should have properties from both nodes + prop_names = [p['name'] for p in labels[0]['properties']] + assert 'id' in prop_names + assert 'weight' in prop_names + assert 'age' in prop_names + + +def test_eda_to_labels_with_explicit_edges(): + """Test converting with explicit edge objects.""" + eda_nodes = [ + { + 'resourceId': 'n0', + 'stencil': {'id': 'Experiment'}, + 'properties': {'name': 'Exp1'}, + 'propertyTypes': {'name': 'String'}, + 'outgoing': [], + 'incoming': [] + }, + { + 'resourceId': 'n1', + 'stencil': 
{'id': 'Subject'}, + 'properties': {'id': 'M001'}, + 'propertyTypes': {'id': 'String'}, + 'outgoing': [], + 'incoming': [] + } + ] + + eda_edges = [ + { + 'resourceId': 'e0', + 'stencil': {'id': 'INCLUDES'}, + 'incoming': [{'resourceId': 'n0'}], + 'outgoing': [{'resourceId': 'n1'}], + 'properties': {}, + 'propertyTypes': {} + } + ] + + labels = eda_to_labels(eda_nodes, eda_edges) + + experiment = next(l for l in labels if l['name'] == 'Experiment') + assert len(experiment['relationships']) == 1 + assert experiment['relationships'][0]['type'] == 'INCLUDES' + assert experiment['relationships'][0]['target_label'] == 'Subject' + + +def test_eda_to_labels_empty_input(): + """Test with empty input.""" + labels = eda_to_labels([]) + assert labels == [] + + +def test_eda_to_labels_missing_stencil(): + """Test handling nodes with missing stencil.""" + eda_nodes = [ + { + 'resourceId': 'n0', + 'properties': {'name': 'value'}, + 'propertyTypes': {'name': 'String'}, + 'outgoing': [], + 'incoming': [] + } + ] + + labels = eda_to_labels(eda_nodes) + # Should skip nodes without stencil or create with default + # Current implementation skips them + assert len(labels) == 0 or labels[0]['name'] == 'Unknown' diff --git a/tests/test_fuzzy_matching.py b/tests/test_fuzzy_matching.py new file mode 100644 index 0000000..5aee5eb --- /dev/null +++ b/tests/test_fuzzy_matching.py @@ -0,0 +1,341 @@ +""" +Tests for Fuzzy Matching Service. + +Tests settings persistence, client-side matching (Phase 1), +and server-side Cypher generation (Phase 2). 
+""" +import pytest +import tempfile +import os +from scidk.core.fuzzy_matching import FuzzyMatchingService, FuzzyMatchSettings + +# Check if rapidfuzz is available for client-side matching tests +try: + import rapidfuzz + RAPIDFUZZ_AVAILABLE = True +except ImportError: + RAPIDFUZZ_AVAILABLE = False + +requires_rapidfuzz = pytest.mark.skipif( + not RAPIDFUZZ_AVAILABLE, + reason="rapidfuzz not installed (optional dependency for Phase 1 client-side matching)" +) + + +@pytest.fixture +def service(): + """Create a temporary fuzzy matching service for testing.""" + with tempfile.NamedTemporaryFile(suffix='.db', delete=False) as tmp: + db_path = tmp.name + + svc = FuzzyMatchingService(db_path=db_path) + + yield svc + + # Cleanup + svc.db.close() + if os.path.exists(db_path): + os.unlink(db_path) + + +def test_global_default_seeded(service): + """Test that global default settings are seeded on initialization.""" + settings = service.get_global_settings() + + assert settings.algorithm == 'levenshtein' + assert settings.threshold == 0.80 + assert settings.case_sensitive is False + assert settings.normalize_whitespace is True + assert settings.strip_punctuation is True + + +def test_update_global_settings(service): + """Test updating global fuzzy matching settings.""" + updates = { + 'algorithm': 'jaro_winkler', + 'threshold': 0.75, + 'case_sensitive': True + } + + updated = service.update_global_settings(updates) + + assert updated.algorithm == 'jaro_winkler' + assert updated.threshold == 0.75 + assert updated.case_sensitive is True + # Other fields should remain unchanged + assert updated.normalize_whitespace is True + + +def test_settings_to_dict(service): + """Test serializing settings to dictionary.""" + settings = service.get_global_settings() + data = settings.to_dict() + + assert isinstance(data, dict) + assert 'algorithm' in data + assert 'threshold' in data + assert data['algorithm'] == 'levenshtein' + + +def test_settings_from_dict(): + """Test deserializing 
settings from dictionary.""" + data = { + 'algorithm': 'jaro_winkler', + 'threshold': 0.90, + 'case_sensitive': True + } + + settings = FuzzyMatchSettings.from_dict(data) + + assert settings.algorithm == 'jaro_winkler' + assert settings.threshold == 0.90 + assert settings.case_sensitive is True + + +# ========================================== +# Phase 1: Client-Side Matching Tests +# ========================================== + +@requires_rapidfuzz +def test_match_external_data_exact(service): + """Test exact matching (no fuzzy logic).""" + external_records = [ + {'name': 'John Smith', 'email': 'john@example.com'}, + {'name': 'Jane Doe', 'email': 'jane@example.com'}, + {'name': 'Unknown Person', 'email': 'unknown@example.com'} + ] + + existing_nodes = [ + {'name': 'John Smith', 'id': 1}, + {'name': 'Jane Doe', 'id': 2} + ] + + settings = FuzzyMatchSettings(algorithm='exact') + matches = service.match_external_data( + external_records, + existing_nodes, + 'name', + settings + ) + + assert len(matches) == 3 + assert matches[0]['is_match'] is True + assert matches[0]['matched_node']['id'] == 1 + assert matches[1]['is_match'] is True + assert matches[1]['matched_node']['id'] == 2 + assert matches[2]['is_match'] is False + + +@requires_rapidfuzz +def test_match_external_data_levenshtein(service): + """Test Levenshtein fuzzy matching.""" + external_records = [ + {'name': 'Jon Smith'}, # Typo in "John" + {'name': 'Jane Doe'}, # Exact match + {'name': 'Completely Different'} + ] + + existing_nodes = [ + {'name': 'John Smith', 'id': 1}, + {'name': 'Jane Doe', 'id': 2} + ] + + settings = FuzzyMatchSettings(algorithm='levenshtein', threshold=0.80) + matches = service.match_external_data( + external_records, + existing_nodes, + 'name', + settings + ) + + # "Jon Smith" should match "John Smith" (high similarity) + assert matches[0]['is_match'] is True + assert matches[0]['matched_node']['id'] == 1 + assert matches[0]['confidence'] > 0.80 + + # "Jane Doe" exact match + assert 
matches[1]['is_match'] is True + assert matches[1]['confidence'] > 0.95 + + # "Completely Different" should not match + assert matches[2]['is_match'] is False + + +def test_normalize_string_case_insensitive(service): + """Test string normalization with case insensitivity.""" + settings = FuzzyMatchSettings(case_sensitive=False) + result = service._normalize_string('John SMITH', settings) + assert result == 'john smith' + + +def test_normalize_string_strip_punctuation(service): + """Test string normalization with punctuation stripping.""" + settings = FuzzyMatchSettings(strip_punctuation=True) + result = service._normalize_string("O'Brien, John", settings) + # Punctuation should be removed + assert ',' not in result + assert "'" not in result + + +def test_normalize_whitespace(service): + """Test whitespace normalization.""" + settings = FuzzyMatchSettings(normalize_whitespace=True) + result = service._normalize_string('John Smith ', settings) + assert result == 'john smith' # Normalized to single spaces, trimmed + + +@requires_rapidfuzz +def test_match_with_missing_key(service): + """Test matching when external record is missing the match key.""" + external_records = [ + {'email': 'john@example.com'}, # Missing 'name' + ] + + existing_nodes = [ + {'name': 'John Smith', 'id': 1} + ] + + matches = service.match_external_data( + external_records, + existing_nodes, + 'name', + FuzzyMatchSettings() + ) + + assert len(matches) == 1 + assert matches[0]['is_match'] is False + assert 'reason' in matches[0] + + +@requires_rapidfuzz +def test_match_with_short_string(service): + """Test matching with strings below min_string_length.""" + external_records = [ + {'name': 'Jo'}, # Too short (< 3 chars) + ] + + existing_nodes = [ + {'name': 'John', 'id': 1} + ] + + settings = FuzzyMatchSettings(min_string_length=3) + matches = service.match_external_data( + external_records, + existing_nodes, + 'name', + settings + ) + + assert matches[0]['is_match'] is False + assert 'too short' 
in matches[0].get('reason', '').lower() + + +# ========================================== +# Phase 2: Server-Side Cypher Generation Tests +# ========================================== + +def test_generate_cypher_exact_match(service): + """Test Cypher generation for exact matching.""" + cypher = service.generate_cypher_fuzzy_match( + source_label='Person', + target_label='Company', + source_property='name', + target_property='contact_name', + relationship_type='WORKS_AT', + settings=FuzzyMatchSettings(algorithm='exact') + ) + + assert 'MATCH (source:Person)' in cypher + assert 'MATCH' in cypher and 'target:Company' in cypher + assert 'source.name = target.contact_name' in cypher + assert 'CREATE (source)-[:WORKS_AT' in cypher + + +def test_generate_cypher_levenshtein(service): + """Test Cypher generation for Levenshtein matching.""" + settings = FuzzyMatchSettings(algorithm='levenshtein', threshold=0.80) + cypher = service.generate_cypher_fuzzy_match( + source_label='Person', + target_label='Company', + source_property='name', + target_property='contact_name', + relationship_type='WORKS_AT', + settings=settings + ) + + assert 'apoc.text.levenshteinSimilarity' in cypher + assert '>= 0.8' in cypher + assert 'confidence' in cypher + + +def test_generate_cypher_jaro_winkler(service): + """Test Cypher generation for Jaro-Winkler matching.""" + settings = FuzzyMatchSettings(algorithm='jaro_winkler', threshold=0.85) + cypher = service.generate_cypher_fuzzy_match( + source_label='Person', + target_label='Organization', + source_property='full_name', + target_property='owner_name', + relationship_type='OWNS', + settings=settings + ) + + assert 'apoc.text.jaroWinklerDistance' in cypher + assert '>= 0.85' in cypher + + +def test_generate_cypher_phonetic_soundex(service): + """Test Cypher generation for phonetic (soundex) matching.""" + settings = FuzzyMatchSettings( + algorithm='phonetic', + phonetic_enabled=True, + phonetic_algorithm='soundex' + ) + cypher = 
service.generate_cypher_fuzzy_match( + source_label='Person', + target_label='Person', + source_property='last_name', + target_property='surname', + relationship_type='SIMILAR_TO', + settings=settings + ) + + assert 'apoc.text.phonetic' in cypher + assert 'soundex' in cypher.lower() or 'phonetic' in cypher.lower() + + +def test_generate_cypher_phonetic_metaphone(service): + """Test Cypher generation for phonetic (metaphone) matching.""" + settings = FuzzyMatchSettings( + algorithm='phonetic', + phonetic_enabled=True, + phonetic_algorithm='metaphone' + ) + cypher = service.generate_cypher_fuzzy_match( + source_label='Author', + target_label='Contributor', + source_property='name', + target_property='name', + relationship_type='SAME_PERSON', + settings=settings + ) + + assert 'apoc.text.doubleMetaphone' in cypher + + +def test_cypher_includes_labels_and_properties(service): + """Test that generated Cypher includes correct labels and properties.""" + cypher = service.generate_cypher_fuzzy_match( + source_label='Customer', + target_label='Order', + source_property='customer_email', + target_property='buyer_email', + relationship_type='PLACED', + settings=FuzzyMatchSettings() + ) + + assert 'Customer' in cypher + assert 'Order' in cypher + assert 'customer_email' in cypher + assert 'buyer_email' in cypher + assert 'PLACED' in cypher diff --git a/tests/test_labels_api.py b/tests/test_labels_api.py index 04cda49..9f2b5da 100644 --- a/tests/test_labels_api.py +++ b/tests/test_labels_api.py @@ -438,3 +438,70 @@ def test_batch_delete_labels_partial_success(client): # Verify the existing label was deleted get_response = client.get('/api/labels/DeleteExists') assert get_response.status_code == 404 + + +def test_get_label_instances_no_neo4j(client): + """Test getting instances when Neo4j is not configured.""" + # Create a label first + payload = { + 'name': 'Person', + 'properties': [ + {'name': 'name', 'type': 'string', 'required': False}, + {'name': 'age', 'type': 'number', 
'required': False} + ], + 'relationships': [] + } + client.post('/api/labels', json=payload) + + # Try to get instances (will fail without Neo4j) + response = client.get('/api/labels/Person/instances') + data = response.get_json() + # Without Neo4j configured, this should return an error + assert data['status'] == 'error' + + +def test_get_instance_count_no_neo4j(client): + """Test getting instance count when Neo4j is not configured.""" + # Create a label first + payload = { + 'name': 'Person', + 'properties': [ + {'name': 'name', 'type': 'string', 'required': False} + ], + 'relationships': [] + } + client.post('/api/labels', json=payload) + + # Try to get count (will fail without Neo4j) + response = client.get('/api/labels/Person/instance-count') + data = response.get_json() + assert data['status'] == 'error' + + +def test_update_instance_label_not_found(client): + """Test updating instance for non-existent label.""" + response = client.patch('/api/labels/NonExistent/instances/some-id', json={ + 'property': 'name', + 'value': 'John' + }) + assert response.status_code == 404 + + +def test_update_instance_missing_property(client): + """Test updating instance without property name.""" + # Create a label first + payload = { + 'name': 'Person', + 'properties': [{'name': 'name', 'type': 'string', 'required': False}], + 'relationships': [] + } + client.post('/api/labels', json=payload) + + # Try to update without property + response = client.patch('/api/labels/Person/instances/some-id', json={ + 'value': 'John' + }) + assert response.status_code == 400 + data = response.get_json() + assert data['status'] == 'error' + assert 'required' in data['error'].lower() diff --git a/tests/test_links_api.py b/tests/test_links_api.py index b108d0b..59ef90b 100644 --- a/tests/test_links_api.py +++ b/tests/test_links_api.py @@ -10,6 +10,8 @@ - POST /api/links//execute - execute link job - GET /api/links/jobs/ - get job status - GET /api/links/jobs - list jobs +- GET 
/api/links/available-labels - get available labels for dropdowns +- POST /api/links/migrate - migrate existing links to Label→Label model """ import json import pytest @@ -26,21 +28,26 @@ def test_list_links_empty(client): def test_create_link_success(client): - """Test creating a link definition with all required fields.""" + """Test creating a link definition with all required fields (Label→Label model).""" + # Create labels first + client.post('/api/labels', json={ + 'name': 'Author', + 'properties': [{'name': 'email', 'type': 'string'}], + 'relationships': [] + }) + client.post('/api/labels', json={ + 'name': 'File', + 'properties': [{'name': 'path', 'type': 'string'}], + 'relationships': [] + }) + payload = { 'name': 'Authors to Files', - 'source_type': 'csv', - 'source_config': { - 'csv_data': 'name,email,file_path\nAlice,alice@ex.com,file1.txt' - }, - 'target_type': 'label', - 'target_config': { - 'label': 'File' - }, - 'match_strategy': 'property', + 'source_label': 'Author', + 'target_label': 'File', + 'match_strategy': 'table_import', 'match_config': { - 'source_field': 'file_path', - 'target_field': 'path' + 'table_data': 'name,email,file_path\nAlice,alice@ex.com,file1.txt' }, 'relationship_type': 'AUTHORED', 'relationship_props': { @@ -54,9 +61,9 @@ def test_create_link_success(client): assert data['status'] == 'success' assert 'link' in data assert data['link']['name'] == 'Authors to Files' - assert data['link']['source_type'] == 'csv' - assert data['link']['target_type'] == 'label' - assert data['link']['match_strategy'] == 'property' + assert data['link']['source_label'] == 'Author' + assert data['link']['target_label'] == 'File' + assert data['link']['match_strategy'] == 'table_import' assert data['link']['relationship_type'] == 'AUTHORED' assert 'id' in data['link'] @@ -78,11 +85,10 @@ def test_create_link_missing_name(client): def test_create_link_invalid_source_type(client): - """Test creating link with invalid source_type fails.""" + """Test 
creating link without source_label fails (Label→Label model).""" payload = { 'name': 'Bad Link', - 'source_type': 'invalid', - 'target_type': 'label', + 'target_label': 'File', 'match_strategy': 'property', 'relationship_type': 'RELATED' } @@ -91,15 +97,14 @@ def test_create_link_invalid_source_type(client): assert response.status_code == 400 data = response.get_json() assert data['status'] == 'error' - assert 'source_type' in data['error'].lower() + assert 'source_label' in data['error'].lower() def test_create_link_invalid_target_type(client): - """Test creating link with invalid target_type fails.""" + """Test creating link without target_label fails (Label→Label model).""" payload = { 'name': 'Bad Link', - 'source_type': 'graph', - 'target_type': 'invalid', + 'source_label': 'Person', 'match_strategy': 'property', 'relationship_type': 'RELATED' } @@ -108,15 +113,22 @@ def test_create_link_invalid_target_type(client): assert response.status_code == 400 data = response.get_json() assert data['status'] == 'error' - assert 'target_type' in data['error'].lower() + assert 'target_label' in data['error'].lower() def test_create_link_invalid_match_strategy(client): """Test creating link with invalid match_strategy fails.""" + # Create labels first + client.post('/api/labels', json={ + 'name': 'TestLabel', + 'properties': [], + 'relationships': [] + }) + payload = { 'name': 'Bad Link', - 'source_type': 'graph', - 'target_type': 'label', + 'source_label': 'TestLabel', + 'target_label': 'TestLabel', 'match_strategy': 'invalid', 'relationship_type': 'RELATED' } @@ -130,10 +142,17 @@ def test_create_link_invalid_match_strategy(client): def test_create_link_missing_relationship_type(client): """Test creating link without relationship_type fails.""" + # Create labels first + client.post('/api/labels', json={ + 'name': 'Person', + 'properties': [], + 'relationships': [] + }) + payload = { 'name': 'Bad Link', - 'source_type': 'graph', - 'target_type': 'label', + 'source_label': 
'Person', + 'target_label': 'Person', 'match_strategy': 'property' } @@ -146,13 +165,23 @@ def test_create_link_missing_relationship_type(client): def test_get_link_success(client): """Test retrieving an existing link definition.""" - # First create a link + # Create labels first + client.post('/api/labels', json={ + 'name': 'Person', + 'properties': [{'name': 'email', 'type': 'string'}], + 'relationships': [] + }) + client.post('/api/labels', json={ + 'name': 'File', + 'properties': [{'name': 'path', 'type': 'string'}], + 'relationships': [] + }) + + # Create a link payload = { 'name': 'Test Link', - 'source_type': 'graph', - 'source_config': {'label': 'Person'}, - 'target_type': 'label', - 'target_config': {'label': 'File'}, + 'source_label': 'Person', + 'target_label': 'File', 'match_strategy': 'property', 'match_config': {'source_field': 'email', 'target_field': 'author'}, 'relationship_type': 'AUTHORED', @@ -180,13 +209,23 @@ def test_get_link_not_found(client): def test_update_link_success(client): """Test updating an existing link definition.""" + # Create labels first + client.post('/api/labels', json={ + 'name': 'Person', + 'properties': [{'name': 'email', 'type': 'string'}], + 'relationships': [] + }) + client.post('/api/labels', json={ + 'name': 'File', + 'properties': [{'name': 'path', 'type': 'string'}], + 'relationships': [] + }) + # Create a link payload = { 'name': 'Original Name', - 'source_type': 'graph', - 'source_config': {'label': 'Person'}, - 'target_type': 'label', - 'target_config': {'label': 'File'}, + 'source_label': 'Person', + 'target_label': 'File', 'match_strategy': 'property', 'match_config': {'source_field': 'email', 'target_field': 'author'}, 'relationship_type': 'AUTHORED', @@ -210,13 +249,23 @@ def test_update_link_success(client): def test_delete_link_success(client): """Test deleting a link definition.""" + # Create labels first + client.post('/api/labels', json={ + 'name': 'Person', + 'properties': [{'name': 'email', 'type': 
'string'}], + 'relationships': [] + }) + client.post('/api/labels', json={ + 'name': 'File', + 'properties': [{'name': 'path', 'type': 'string'}], + 'relationships': [] + }) + # Create a link payload = { 'name': 'To Delete', - 'source_type': 'graph', - 'source_config': {'label': 'Person'}, - 'target_type': 'label', - 'target_config': {'label': 'File'}, + 'source_label': 'Person', + 'target_label': 'File', 'match_strategy': 'property', 'match_config': {'source_field': 'email', 'target_field': 'author'}, 'relationship_type': 'AUTHORED', @@ -246,6 +295,18 @@ def test_delete_link_not_found(client): def test_list_links_after_create(client): """Test that created links appear in list.""" + # Create labels first + client.post('/api/labels', json={ + 'name': 'Person', + 'properties': [{'name': 'email', 'type': 'string'}], + 'relationships': [] + }) + client.post('/api/labels', json={ + 'name': 'File', + 'properties': [{'name': 'path', 'type': 'string'}], + 'relationships': [] + }) + # Get initial count initial_response = client.get('/api/links') initial_count = len(initial_response.get_json()['links']) @@ -255,10 +316,8 @@ def test_list_links_after_create(client): for i in range(3): payload = { 'name': f'Link {i}', - 'source_type': 'graph', - 'source_config': {'label': 'Person'}, - 'target_type': 'label', - 'target_config': {'label': 'File'}, + 'source_label': 'Person', + 'target_label': 'File', 'match_strategy': 'property', 'match_config': {'source_field': 'email', 'target_field': 'author'}, 'relationship_type': f'REL_{i}', @@ -312,3 +371,203 @@ def test_get_job_status_not_found(client): assert response.status_code == 404 data = response.get_json() assert data['status'] == 'error' + + +# === Label→Label Refactor Tests === + + +def test_create_link_with_labels(client): + """Test creating a link with source_label and target_label (new model).""" + # First, create labels + client.post('/api/labels', json={ + 'name': 'Person', + 'properties': [{'name': 'email', 'type': 
'string'}], + 'relationships': [] + }) + client.post('/api/labels', json={ + 'name': 'File', + 'properties': [{'name': 'path', 'type': 'string'}], + 'relationships': [] + }) + + payload = { + 'name': 'Person to File', + 'source_label': 'Person', + 'target_label': 'File', + 'match_strategy': 'property', + 'match_config': { + 'source_field': 'email', + 'target_field': 'author_email' + }, + 'relationship_type': 'AUTHORED' + } + + response = client.post('/api/links', json=payload) + assert response.status_code == 200 + data = response.get_json() + assert data['status'] == 'success' + assert data['link']['source_label'] == 'Person' + assert data['link']['target_label'] == 'File' + assert data['link']['match_strategy'] == 'property' + + +def test_create_link_missing_source_label(client): + """Test that source_label is required.""" + payload = { + 'name': 'Bad Link', + 'target_label': 'File', + 'match_strategy': 'property', + 'relationship_type': 'RELATED' + } + + response = client.post('/api/links', json=payload) + assert response.status_code == 400 + data = response.get_json() + assert data['status'] == 'error' + assert 'source_label' in data['error'].lower() + + +def test_create_link_missing_target_label(client): + """Test that target_label is required.""" + payload = { + 'name': 'Bad Link', + 'source_label': 'Person', + 'match_strategy': 'property', + 'relationship_type': 'RELATED' + } + + response = client.post('/api/links', json=payload) + assert response.status_code == 400 + data = response.get_json() + assert data['status'] == 'error' + assert 'target_label' in data['error'].lower() + + +def test_create_link_nonexistent_label(client): + """Test that labels must exist in registry.""" + payload = { + 'name': 'Bad Link', + 'source_label': 'NonexistentLabel', + 'target_label': 'AlsoDoesNotExist', + 'match_strategy': 'property', + 'relationship_type': 'RELATED' + } + + response = client.post('/api/links', json=payload) + assert response.status_code == 400 + data = 
response.get_json() + assert data['status'] == 'error' + + +def test_create_link_fuzzy_match_strategy(client): + """Test creating link with fuzzy match strategy.""" + # Create labels first + client.post('/api/labels', json={ + 'name': 'Author', + 'properties': [{'name': 'name', 'type': 'string'}], + 'relationships': [] + }) + client.post('/api/labels', json={ + 'name': 'Document', + 'properties': [{'name': 'author_name', 'type': 'string'}], + 'relationships': [] + }) + + payload = { + 'name': 'Author to Document (Fuzzy)', + 'source_label': 'Author', + 'target_label': 'Document', + 'match_strategy': 'fuzzy', + 'match_config': { + 'source_field': 'name', + 'target_field': 'author_name', + 'threshold': 85 + }, + 'relationship_type': 'AUTHORED' + } + + response = client.post('/api/links', json=payload) + assert response.status_code == 200 + data = response.get_json() + assert data['status'] == 'success' + assert data['link']['match_strategy'] == 'fuzzy' + + +def test_create_link_table_import_strategy(client): + """Test creating link with table_import match strategy.""" + # Create labels + client.post('/api/labels', json={ + 'name': 'Project', + 'properties': [{'name': 'name', 'type': 'string'}], + 'relationships': [] + }) + + payload = { + 'name': 'Import Projects from CSV', + 'source_label': 'Project', + 'target_label': 'Project', + 'match_strategy': 'table_import', + 'match_config': { + 'table_data': 'name,budget\nProject A,100000\nProject B,200000' + }, + 'relationship_type': 'RELATED_TO' + } + + response = client.post('/api/links', json=payload) + assert response.status_code == 200 + data = response.get_json() + assert data['status'] == 'success' + assert data['link']['match_strategy'] == 'table_import' + + +def test_create_link_api_endpoint_strategy(client): + """Test creating link with api_endpoint match strategy.""" + # Create labels + client.post('/api/labels', json={ + 'name': 'User', + 'properties': [{'name': 'id', 'type': 'number'}], + 'relationships': [] + 
}) + + payload = { + 'name': 'Fetch Users from API', + 'source_label': 'User', + 'target_label': 'User', + 'match_strategy': 'api_endpoint', + 'match_config': { + 'url': 'https://api.example.com/users', + 'json_path': '$.data.users[*]' + }, + 'relationship_type': 'RELATED_TO' + } + + response = client.post('/api/links', json=payload) + assert response.status_code == 200 + data = response.get_json() + assert data['status'] == 'success' + assert data['link']['match_strategy'] == 'api_endpoint' + + +def test_get_available_labels(client): + """Test getting available labels for dropdown population.""" + # Create some labels + client.post('/api/labels', json={ + 'name': 'Person', + 'properties': [{'name': 'name', 'type': 'string'}], + 'relationships': [] + }) + client.post('/api/labels', json={ + 'name': 'File', + 'properties': [{'name': 'path', 'type': 'string'}], + 'relationships': [] + }) + + response = client.get('/api/links/available-labels') + assert response.status_code == 200 + data = response.get_json() + assert data['status'] == 'success' + assert 'labels' in data + assert len(data['labels']) >= 2 + label_names = [l['name'] for l in data['labels']] + assert 'Person' in label_names + assert 'File' in label_names diff --git a/tests/test_table_format_registry.py b/tests/test_table_format_registry.py new file mode 100644 index 0000000..9b342c5 --- /dev/null +++ b/tests/test_table_format_registry.py @@ -0,0 +1,332 @@ +""" +Tests for Table Format Registry. + +Tests CRUD operations, format detection, and data preview. 
+""" +import pytest +import tempfile +import os +import io +import csv +from scidk.core.table_format_registry import TableFormatRegistry + + +@pytest.fixture +def registry(): + """Create a temporary registry for testing.""" + # Use temporary database + with tempfile.NamedTemporaryFile(suffix='.db', delete=False) as tmp: + db_path = tmp.name + + reg = TableFormatRegistry(db_path=db_path) + + yield reg + + # Cleanup + reg.db.close() + if os.path.exists(db_path): + os.unlink(db_path) + + +def test_preprogrammed_formats_seeded(registry): + """Test that preprogrammed formats are seeded on initialization.""" + formats = registry.list_formats(include_preprogrammed=True) + + # Should have at least the 4 preprogrammed formats + preprogrammed = [f for f in formats if f['is_preprogrammed']] + assert len(preprogrammed) == 4 + + names = {f['name'] for f in preprogrammed} + assert 'CSV (Standard)' in names + assert 'TSV (Standard)' in names + assert 'Excel (Standard)' in names + assert 'Parquet (Standard)' in names + + +def test_create_format(registry): + """Test creating a new table format.""" + format_data = { + 'name': 'My CSV Format', + 'file_type': 'csv', + 'delimiter': ';', + 'encoding': 'utf-8', + 'has_header': True, + 'header_row': 0, + 'target_label': 'Person', + 'description': 'Custom semicolon-separated format' + } + + format_config = registry.create_format(format_data) + + assert format_config['id'] is not None + assert format_config['name'] == 'My CSV Format' + assert format_config['file_type'] == 'csv' + assert format_config['delimiter'] == ';' + assert format_config['encoding'] == 'utf-8' + assert format_config['has_header'] is True + assert format_config['header_row'] == 0 + assert format_config['target_label'] == 'Person' + assert format_config['description'] == 'Custom semicolon-separated format' + assert format_config['is_preprogrammed'] is False + + +def test_create_format_validation(registry): + """Test format creation validation.""" + # Missing name + with 
pytest.raises(ValueError, match="Format name is required"): + registry.create_format({'file_type': 'csv'}) + + # Missing file_type + with pytest.raises(ValueError, match="File type is required"): + registry.create_format({'name': 'Test'}) + + # Invalid file_type + with pytest.raises(ValueError, match="File type must be one of"): + registry.create_format({'name': 'Test', 'file_type': 'invalid'}) + + +def test_create_duplicate_name(registry): + """Test that duplicate names are rejected.""" + data = { + 'name': 'Duplicate Format', + 'file_type': 'csv' + } + + # First creation should succeed + registry.create_format(data) + + # Second creation with same name should fail + with pytest.raises(ValueError, match="already exists"): + registry.create_format(data) + + +def test_get_format(registry): + """Test retrieving a format by ID.""" + data = { + 'name': 'Get Test Format', + 'file_type': 'tsv', + 'delimiter': '\t' + } + + created = registry.create_format(data) + format_id = created['id'] + + retrieved = registry.get_format(format_id) + + assert retrieved is not None + assert retrieved['id'] == format_id + assert retrieved['name'] == 'Get Test Format' + assert retrieved['file_type'] == 'tsv' + + +def test_get_nonexistent_format(registry): + """Test retrieving a format that doesn't exist.""" + result = registry.get_format('nonexistent-id') + assert result is None + + +def test_list_formats(registry): + """Test listing all formats.""" + # Create custom formats + registry.create_format({'name': 'Custom CSV', 'file_type': 'csv'}) + registry.create_format({'name': 'Custom TSV', 'file_type': 'tsv'}) + + # List all formats + all_formats = registry.list_formats(include_preprogrammed=True) + assert len(all_formats) >= 6 # 4 preprogrammed + 2 custom + + # List only custom formats + custom_formats = registry.list_formats(include_preprogrammed=False) + assert len(custom_formats) == 2 + assert all(not f['is_preprogrammed'] for f in custom_formats) + + +def 
test_update_format(registry): + """Test updating an existing format.""" + # Create format + data = { + 'name': 'Update Test', + 'file_type': 'csv', + 'delimiter': ',' + } + created = registry.create_format(data) + format_id = created['id'] + + # Update format + updates = { + 'name': 'Updated Name', + 'delimiter': ';', + 'target_label': 'NewLabel' + } + updated = registry.update_format(format_id, updates) + + assert updated['name'] == 'Updated Name' + assert updated['delimiter'] == ';' + assert updated['target_label'] == 'NewLabel' + assert updated['file_type'] == 'csv' # Unchanged + + +def test_update_nonexistent_format(registry): + """Test updating a format that doesn't exist.""" + with pytest.raises(ValueError, match="not found"): + registry.update_format('nonexistent-id', {'name': 'New Name'}) + + +def test_update_preprogrammed_format(registry): + """Test that preprogrammed formats cannot be updated.""" + formats = registry.list_formats(include_preprogrammed=True) + preprogrammed = [f for f in formats if f['is_preprogrammed']][0] + + with pytest.raises(ValueError, match="Cannot modify preprogrammed"): + registry.update_format(preprogrammed['id'], {'name': 'New Name'}) + + +def test_delete_format(registry): + """Test deleting a format.""" + # Create format + data = {'name': 'Delete Test', 'file_type': 'csv'} + created = registry.create_format(data) + format_id = created['id'] + + # Delete format + deleted = registry.delete_format(format_id) + assert deleted is True + + # Verify deleted + retrieved = registry.get_format(format_id) + assert retrieved is None + + +def test_delete_nonexistent_format(registry): + """Test deleting a format that doesn't exist.""" + deleted = registry.delete_format('nonexistent-id') + assert deleted is False + + +def test_delete_preprogrammed_format(registry): + """Test that preprogrammed formats cannot be deleted.""" + formats = registry.list_formats(include_preprogrammed=True) + preprogrammed = [f for f in formats if 
f['is_preprogrammed']][0] + + with pytest.raises(ValueError, match="Cannot delete preprogrammed"): + registry.delete_format(preprogrammed['id']) + + +def test_detect_format_csv(registry): + """Test auto-detecting CSV format.""" + # Create sample CSV content + csv_content = "Name,Email,Age\nJohn,john@example.com,30\nJane,jane@example.com,25" + file_bytes = csv_content.encode('utf-8') + + detected = registry.detect_format(file_bytes, filename='test.csv') + + assert detected['file_type'] == 'csv' + assert detected['delimiter'] == ',' + assert detected['encoding'] == 'utf-8' + assert detected['has_header'] is True + assert detected['sample_columns'] == ['Name', 'Email', 'Age'] + + +def test_detect_format_tsv(registry): + """Test auto-detecting TSV format.""" + # Create sample TSV content + tsv_content = "Name\tEmail\tAge\nJohn\tjohn@example.com\t30\nJane\tjane@example.com\t25" + file_bytes = tsv_content.encode('utf-8') + + detected = registry.detect_format(file_bytes, filename='test.tsv') + + assert detected['file_type'] == 'tsv' + assert detected['delimiter'] == '\t' + assert detected['encoding'] == 'utf-8' + assert detected['has_header'] is True + assert detected['sample_columns'] == ['Name', 'Email', 'Age'] + + +def test_detect_format_fallback_encoding(registry): + """Test detecting format with non-UTF-8 encoding falls back to latin-1.""" + # Create binary data that's not valid UTF-8 but valid latin-1 + invalid_utf8_bytes = bytes([0xFF, 0xFE, 0xFD]) + + detected = registry.detect_format(invalid_utf8_bytes, filename='test.csv') + + # Should fall back to latin-1 encoding + assert detected['encoding'] in ['latin-1', 'utf-16'] # Could detect either + assert detected['file_type'] == 'csv' + + +def test_preview_data_csv(registry): + """Test previewing CSV data.""" + # Create CSV format + format_data = { + 'name': 'Preview Test CSV', + 'file_type': 'csv', + 'delimiter': ',', + 'encoding': 'utf-8', + 'has_header': True, + 'header_row': 0 + } + format_config = 
registry.create_format(format_data) + format_id = format_config['id'] + + # Create sample CSV content + csv_content = "Name,Email,Age\nJohn,john@example.com,30\nJane,jane@example.com,25\nBob,bob@example.com,35" + file_bytes = csv_content.encode('utf-8') + + preview = registry.preview_data(file_bytes, format_id, num_rows=2) + + assert 'error' not in preview + assert preview['columns'] == ['Name', 'Email', 'Age'] + assert len(preview['rows']) == 2 + assert preview['rows'][0]['Name'] == 'John' + assert preview['rows'][1]['Name'] == 'Jane' + assert preview['total_rows'] == 3 + + +def test_preview_data_invalid_format(registry): + """Test previewing data with non-existent format.""" + file_bytes = b"test" + preview = registry.preview_data(file_bytes, 'nonexistent-id') + + assert 'error' in preview + assert 'not found' in preview['error'] + + +def test_column_mappings(registry): + """Test creating format with column mappings.""" + format_data = { + 'name': 'Mapped Format', + 'file_type': 'csv', + 'column_mappings': { + 'Name': { + 'label_property': 'full_name', + 'type_hint': 'string', + 'ignore': False + }, + 'Age': { + 'label_property': 'age_years', + 'type_hint': 'number', + 'ignore': False + } + } + } + + format_config = registry.create_format(format_data) + + assert 'column_mappings' in format_config + assert 'Name' in format_config['column_mappings'] + assert format_config['column_mappings']['Name']['label_property'] == 'full_name' + assert format_config['column_mappings']['Age']['type_hint'] == 'number' + + +def test_format_with_sheet_name(registry): + """Test creating Excel format with sheet name.""" + format_data = { + 'name': 'Excel with Sheet', + 'file_type': 'excel', + 'sheet_name': 'Data' + } + + format_config = registry.create_format(format_data) + + assert format_config['sheet_name'] == 'Data' + assert format_config['file_type'] == 'excel'