diff --git a/marklogic-client-api/src/main/java/com/marklogic/client/datamovement/filter/ContentExclusionUtil.java b/marklogic-client-api/src/main/java/com/marklogic/client/datamovement/filter/ContentExclusionUtil.java
index cd2ba9618..39cd5aee2 100644
--- a/marklogic-client-api/src/main/java/com/marklogic/client/datamovement/filter/ContentExclusionUtil.java
+++ b/marklogic-client-api/src/main/java/com/marklogic/client/datamovement/filter/ContentExclusionUtil.java
@@ -15,6 +15,7 @@ import org.w3c.dom.Node;
 import org.w3c.dom.NodeList;
 
+import javax.xml.namespace.NamespaceContext;
 import javax.xml.namespace.QName;
 import javax.xml.parsers.DocumentBuilder;
 import javax.xml.transform.OutputKeys;
@@ -29,6 +30,8 @@ import java.io.ByteArrayInputStream;
 import java.io.StringWriter;
 import java.nio.charset.StandardCharsets;
 
+import java.util.Iterator;
+import java.util.Map;
 
 /**
  * Utility class for applying content exclusions to documents before hash calculation.
@@ -99,23 +102,28 @@ private static void removeNodeAtPointer(String uri, JsonNode rootNode, String js
      *
      * @param uri the document URI (used for logging purposes)
      * @param xmlContent the XML content as a string
+     * @param namespaces a map of namespace prefixes to URIs for use in XPath expressions, or null
      * @param xpathExpressions array of XPath expressions identifying elements to exclude
     * @return the modified XML content with specified elements removed
     * @throws Exception if the XML content cannot be parsed or serialized
     */
-    static String applyXmlExclusions(String uri, String xmlContent, String... xpathExpressions) throws Exception {
+    static String applyXmlExclusions(String uri, String xmlContent, Map<String, String> namespaces, String... xpathExpressions) throws Exception {
        if (xpathExpressions == null || xpathExpressions.length == 0) {
            return xmlContent;
        }
        DocumentBuilder builder = XmlFactories.getDocumentBuilderFactory().newDocumentBuilder();
        Document document = builder.parse(new ByteArrayInputStream(xmlContent.getBytes(StandardCharsets.UTF_8)));
-       applyXmlExclusions(uri, document, xpathExpressions);
+       applyXmlExclusions(uri, document, namespaces, xpathExpressions);
        return serializeDocument(document);
    }
 
-    private static void applyXmlExclusions(String uri, Document document, String[] xpathExpressions) {
+    private static void applyXmlExclusions(String uri, Document document, Map<String, String> namespaces, String[] xpathExpressions) {
        final XPath xpath = XmlFactories.getXPathFactory().newXPath();
+       if (namespaces != null && !namespaces.isEmpty()) {
+           xpath.setNamespaceContext(new SimpleNamespaceContext(namespaces));
+       }
+
        for (String xpathExpression : xpathExpressions) {
            try {
                XPathExpression expr = xpath.compile(xpathExpression);
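
Note on the change above: all it does is hand a prefix-to-URI map to the XPath object (via the SimpleNamespaceContext class added later in this changeset) before the exclusion expressions are compiled. The standalone sketch below is not part of the changeset; it shows the plain JAXP pattern the utility relies on, with an inline NamespaceContext standing in for SimpleNamespaceContext. The document, element names, and namespace URI are made up for illustration.

    import org.w3c.dom.Document;
    import org.w3c.dom.Node;
    import org.w3c.dom.NodeList;

    import javax.xml.XMLConstants;
    import javax.xml.namespace.NamespaceContext;
    import javax.xml.parsers.DocumentBuilderFactory;
    import javax.xml.xpath.XPath;
    import javax.xml.xpath.XPathConstants;
    import javax.xml.xpath.XPathFactory;
    import java.io.ByteArrayInputStream;
    import java.nio.charset.StandardCharsets;
    import java.util.Collections;
    import java.util.Iterator;
    import java.util.Map;

    public class NamespaceExclusionSketch {

        public static void main(String[] args) throws Exception {
            String xml = "<ns:doc xmlns:ns='http://example.com/ns'>" +
                "<ns:timestamp>2025-01-01T10:00:00Z</ns:timestamp><ns:title>Doc 1</ns:title></ns:doc>";

            DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
            // Namespace awareness is required, or the prefixed XPath matches nothing.
            factory.setNamespaceAware(true);
            Document document = factory.newDocumentBuilder()
                .parse(new ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)));

            Map<String, String> namespaces = Map.of("ns", "http://example.com/ns");
            XPath xpath = XPathFactory.newInstance().newXPath();
            // Plays the same role as SimpleNamespaceContext(namespaces) in the change above.
            xpath.setNamespaceContext(new NamespaceContext() {
                public String getNamespaceURI(String prefix) {
                    return namespaces.getOrDefault(prefix, XMLConstants.NULL_NS_URI);
                }
                public String getPrefix(String namespaceURI) { return null; }
                public Iterator<String> getPrefixes(String namespaceURI) { return Collections.emptyIterator(); }
            });

            // Detach every node matched by the namespace-qualified expression before hashing.
            NodeList nodes = (NodeList) xpath.compile("//ns:timestamp").evaluate(document, XPathConstants.NODESET);
            for (int i = 0; i < nodes.getLength(); i++) {
                Node node = nodes.item(i);
                node.getParentNode().removeChild(node);
            }
        }
    }
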
diff --git a/marklogic-client-api/src/main/java/com/marklogic/client/datamovement/filter/IncrementalWriteEvalFilter.java b/marklogic-client-api/src/main/java/com/marklogic/client/datamovement/filter/IncrementalWriteEvalFilter.java
index c48d95273..fc0546798 100644
--- a/marklogic-client-api/src/main/java/com/marklogic/client/datamovement/filter/IncrementalWriteEvalFilter.java
+++ b/marklogic-client-api/src/main/java/com/marklogic/client/datamovement/filter/IncrementalWriteEvalFilter.java
@@ -12,6 +12,7 @@ import com.marklogic.client.document.DocumentWriteSet;
 import com.marklogic.client.io.JacksonHandle;
 
+import java.util.Map;
 import java.util.function.Consumer;
 
 /**
@@ -31,8 +32,8 @@ class IncrementalWriteEvalFilter extends IncrementalWriteFilter {
        """;
 
    IncrementalWriteEvalFilter(String hashKeyName, String timestampKeyName, boolean canonicalizeJson,
-                              Consumer skippedDocumentsConsumer, String[] jsonExclusions, String[] xmlExclusions) {
-       super(hashKeyName, timestampKeyName, canonicalizeJson, skippedDocumentsConsumer, jsonExclusions, xmlExclusions);
+                              Consumer skippedDocumentsConsumer, String[] jsonExclusions, String[] xmlExclusions, Map<String, String> xmlNamespaces) {
+       super(hashKeyName, timestampKeyName, canonicalizeJson, skippedDocumentsConsumer, jsonExclusions, xmlExclusions, xmlNamespaces);
    }
 
    @Override
diff --git a/marklogic-client-api/src/main/java/com/marklogic/client/datamovement/filter/IncrementalWriteFilter.java b/marklogic-client-api/src/main/java/com/marklogic/client/datamovement/filter/IncrementalWriteFilter.java
index f7de86a12..730910c0b 100644
--- a/marklogic-client-api/src/main/java/com/marklogic/client/datamovement/filter/IncrementalWriteFilter.java
+++ b/marklogic-client-api/src/main/java/com/marklogic/client/datamovement/filter/IncrementalWriteFilter.java
@@ -25,6 +25,7 @@ import java.time.Instant;
 import java.util.ArrayList;
 import java.util.List;
+import java.util.Map;
 import java.util.function.Consumer;
 import java.util.function.Function;
 
@@ -51,6 +52,7 @@ public static class Builder {
        private Consumer skippedDocumentsConsumer;
        private String[] jsonExclusions;
        private String[] xmlExclusions;
+       private Map<String, String> xmlNamespaces;
 
        /**
         * @param keyName the name of the MarkLogic metadata key that will hold the hash value; defaults to "incrementalWriteHash".
@@ -117,13 +119,22 @@ public Builder xmlExclusions(String... xpathExpressions) {
            return this;
        }
 
+       /**
+        * @param namespaces a map of namespace prefixes to URIs for use in XPath exclusion expressions.
+        *                   For example, Map.of("ns", "http://example.com/ns") allows XPath like "//ns:timestamp".
+        */
+       public Builder xmlNamespaces(Map<String, String> namespaces) {
+           this.xmlNamespaces = namespaces;
+           return this;
+       }
+
        public IncrementalWriteFilter build() {
            validateJsonExclusions();
            validateXmlExclusions();
            if (useEvalQuery) {
-               return new IncrementalWriteEvalFilter(hashKeyName, timestampKeyName, canonicalizeJson, skippedDocumentsConsumer, jsonExclusions, xmlExclusions);
+               return new IncrementalWriteEvalFilter(hashKeyName, timestampKeyName, canonicalizeJson, skippedDocumentsConsumer, jsonExclusions, xmlExclusions, xmlNamespaces);
            }
-           return new IncrementalWriteOpticFilter(hashKeyName, timestampKeyName, canonicalizeJson, skippedDocumentsConsumer, jsonExclusions, xmlExclusions);
+           return new IncrementalWriteOpticFilter(hashKeyName, timestampKeyName, canonicalizeJson, skippedDocumentsConsumer, jsonExclusions, xmlExclusions, xmlNamespaces);
        }
 
        private void validateJsonExclusions() {
@@ -151,6 +162,9 @@ private void validateXmlExclusions() {
                return;
            }
            XPath xpath = XmlFactories.getXPathFactory().newXPath();
+           if (xmlNamespaces != null && !xmlNamespaces.isEmpty()) {
+               xpath.setNamespaceContext(new SimpleNamespaceContext(xmlNamespaces));
+           }
            for (String xpathExpression : xmlExclusions) {
                if (xpathExpression == null || xpathExpression.trim().isEmpty()) {
                    throw new IllegalArgumentException(
@@ -173,18 +187,20 @@ private void validateXmlExclusions() {
    private final Consumer skippedDocumentsConsumer;
    private final String[] jsonExclusions;
    private final String[] xmlExclusions;
+   private final Map<String, String> xmlNamespaces;
 
    // Hardcoding this for now, with a good general purpose hashing function.
    // See https://xxhash.com for benchmarks.
    private final LongHashFunction hashFunction = LongHashFunction.xx3();
 
-   public IncrementalWriteFilter(String hashKeyName, String timestampKeyName, boolean canonicalizeJson, Consumer skippedDocumentsConsumer, String[] jsonExclusions, String[] xmlExclusions) {
+   public IncrementalWriteFilter(String hashKeyName, String timestampKeyName, boolean canonicalizeJson, Consumer skippedDocumentsConsumer, String[] jsonExclusions, String[] xmlExclusions, Map<String, String> xmlNamespaces) {
        this.hashKeyName = hashKeyName;
        this.timestampKeyName = timestampKeyName;
        this.canonicalizeJson = canonicalizeJson;
        this.skippedDocumentsConsumer = skippedDocumentsConsumer;
        this.jsonExclusions = jsonExclusions;
        this.xmlExclusions = xmlExclusions;
+       this.xmlNamespaces = xmlNamespaces;
    }
 
    protected final DocumentWriteSet filterDocuments(Context context, Function hashRetriever) {
@@ -260,7 +276,7 @@ private String serializeContent(DocumentWriteOperation doc) {
            }
        } else if (xmlExclusions != null && xmlExclusions.length > 0) {
            try {
-               content = ContentExclusionUtil.applyXmlExclusions(doc.getUri(), content, xmlExclusions);
+               content = ContentExclusionUtil.applyXmlExclusions(doc.getUri(), content, xmlNamespaces, xmlExclusions);
            } catch (Exception e) {
                logger.warn("Unable to apply XML exclusions for URI {}, using original content for hashing; cause: {}",
                    doc.getUri(), e.getMessage());
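
From a caller's perspective, the new xmlNamespaces(Map) builder option is the only visible change in this file. A usage sketch mirroring the new test further below (the prefix "ns", the URI, and the XPath expressions are illustrative values, not defaults):

    // Fragment only: assumes the usual imports of IncrementalWriteFilter and java.util.Map.
    IncrementalWriteFilter filter = IncrementalWriteFilter.newBuilder()
        .xmlExclusions("//ns:timestamp", "//ns:metadata/ns:lastModified")
        .xmlNamespaces(Map.of("ns", "http://example.com/ns"))
        .onDocumentsSkipped(skipped -> System.out.println("Skipped " + skipped.length + " unchanged documents"))
        .build();

Because validateXmlExclusions() now sets the same namespace context before compiling each expression, prefixed expressions are validated when build() is called rather than failing later during hashing.
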
diff --git a/marklogic-client-api/src/main/java/com/marklogic/client/datamovement/filter/IncrementalWriteOpticFilter.java b/marklogic-client-api/src/main/java/com/marklogic/client/datamovement/filter/IncrementalWriteOpticFilter.java
index d760f3ab4..a52d21ad3 100644
--- a/marklogic-client-api/src/main/java/com/marklogic/client/datamovement/filter/IncrementalWriteOpticFilter.java
+++ b/marklogic-client-api/src/main/java/com/marklogic/client/datamovement/filter/IncrementalWriteOpticFilter.java
@@ -20,8 +20,8 @@ class IncrementalWriteOpticFilter extends IncrementalWriteFilter {
 
    IncrementalWriteOpticFilter(String hashKeyName, String timestampKeyName, boolean canonicalizeJson,
-                               Consumer skippedDocumentsConsumer, String[] jsonExclusions, String[] xmlExclusions) {
-       super(hashKeyName, timestampKeyName, canonicalizeJson, skippedDocumentsConsumer, jsonExclusions, xmlExclusions);
+                               Consumer skippedDocumentsConsumer, String[] jsonExclusions, String[] xmlExclusions, Map<String, String> xmlNamespaces) {
+       super(hashKeyName, timestampKeyName, canonicalizeJson, skippedDocumentsConsumer, jsonExclusions, xmlExclusions, xmlNamespaces);
    }
 
    @Override
diff --git a/marklogic-client-api/src/main/java/com/marklogic/client/datamovement/filter/SimpleNamespaceContext.java b/marklogic-client-api/src/main/java/com/marklogic/client/datamovement/filter/SimpleNamespaceContext.java
new file mode 100644
index 000000000..196b36d71
--- /dev/null
+++ b/marklogic-client-api/src/main/java/com/marklogic/client/datamovement/filter/SimpleNamespaceContext.java
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2010-2026 Progress Software Corporation and/or its subsidiaries or affiliates. All Rights Reserved.
+ */
+package com.marklogic.client.datamovement.filter;
+
+import javax.xml.namespace.NamespaceContext;
+import java.util.Iterator;
+import java.util.Map;
+
+/**
+ * A simple implementation of {@link NamespaceContext} backed by a Map of prefix to namespace URI mappings.
+ * Used for XPath evaluation with namespace-qualified expressions.
+ *
+ * @since 8.1.0
+ */
+class SimpleNamespaceContext implements NamespaceContext {
+
+    private final Map<String, String> prefixToNamespaceUri;
+
+    SimpleNamespaceContext(Map<String, String> prefixToNamespaceUri) {
+        this.prefixToNamespaceUri = prefixToNamespaceUri;
+    }
+
+    @Override
+    public String getNamespaceURI(String prefix) {
+        return prefixToNamespaceUri.get(prefix);
+    }
+
+    @Override
+    public String getPrefix(String namespaceURI) {
+        for (Map.Entry<String, String> entry : prefixToNamespaceUri.entrySet()) {
+            if (entry.getValue().equals(namespaceURI)) {
+                return entry.getKey();
+            }
+        }
+        return null;
+    }
+
+    @Override
+    public Iterator<String> getPrefixes(String namespaceURI) {
+        return prefixToNamespaceUri.entrySet().stream()
+            .filter(entry -> entry.getValue().equals(namespaceURI))
+            .map(Map.Entry::getKey)
+            .iterator();
+    }
+}
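
For reference, the lookup behavior of the new class on hypothetical values (the class is package-private, so only the filter code in this package calls it directly):

    // Assumed prefix/URI pair for illustration.
    SimpleNamespaceContext context = new SimpleNamespaceContext(Map.of("ex", "http://example.com/ns"));
    context.getNamespaceURI("ex");                          // "http://example.com/ns"
    context.getNamespaceURI("unknown");                     // null (no XMLConstants.NULL_NS_URI fallback)
    context.getPrefix("http://example.com/ns");             // "ex"
    context.getPrefixes("http://other.example").hasNext();  // false

getPrefix does a linear scan over the map entries, which should be fine for the handful of prefixes a filter typically configures.
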
DocumentWriteOperationImpl("/incremental/test/mixed-doc.json", METADATA, new JacksonHandle(jsonDoc))); - String xmlDoc = "12025-01-01T10:00:00Z"; + String xmlDoc = """ + + 1 + 2025-01-01T10:00:00Z + + """; docs.add(new DocumentWriteOperationImpl("/incremental/test/mixed-doc.xml", METADATA, new StringHandle(xmlDoc).withFormat(Format.XML))); writeDocs(docs); @@ -172,7 +184,12 @@ void jsonExclusionsIgnoredForXmlDocuments() { jsonDoc.put("timestamp", "2026-01-02T15:30:00Z"); // Changed docs.add(new DocumentWriteOperationImpl("/incremental/test/mixed-doc.json", METADATA, new JacksonHandle(jsonDoc))); - xmlDoc = "12026-01-02T15:30:00Z"; // Changed + xmlDoc = """ + + 1 + 2026-01-02T15:30:00Z + + """; docs.add(new DocumentWriteOperationImpl("/incremental/test/mixed-doc.xml", METADATA, new StringHandle(xmlDoc).withFormat(Format.XML))); writeDocs(docs); @@ -242,4 +259,76 @@ void jsonCanonicalizedProducesSameHashForReorderedKeys() { assertEquals(1, writtenCount.get(), "Document should be skipped because canonicalized JSON produces the same hash"); assertEquals(1, skippedCount.get(), "One document should be skipped"); } + + @Test + void xmlExclusionsWithNamespaces() { + filter = IncrementalWriteFilter.newBuilder() + .xmlExclusions("//ns:timestamp", "//ns:metadata/ns:lastModified") + .xmlNamespaces(Map.of("ns", "http://example.com/ns")) + .onDocumentsSkipped(docs -> skippedCount.addAndGet(docs.length)) + .build(); + + // Write initial documents with namespaced elements + docs = new ArrayList<>(); + for (int i = 1; i <= 3; i++) { + String xml = """ + + %d + Document %d + 2025-01-01T10:00:00Z + + Test User + 2025-01-01T10:00:00Z + + + """.formatted(i, i); + docs.add(new DocumentWriteOperationImpl("/incremental/test/ns-xml-doc-" + i + ".xml", METADATA, new StringHandle(xml).withFormat(Format.XML))); + } + + writeDocs(docs); + assertEquals(3, writtenCount.get()); + assertEquals(0, skippedCount.get()); + + // Write again with different values for excluded fields - should be skipped + docs = new ArrayList<>(); + for (int i = 1; i <= 3; i++) { + String xml = """ + + %d + Document %d + 2026-01-02T15:30:00Z + + Test User + 2026-01-02T15:30:00Z + + + """.formatted(i, i); + docs.add(new DocumentWriteOperationImpl("/incremental/test/ns-xml-doc-" + i + ".xml", METADATA, new StringHandle(xml).withFormat(Format.XML))); + } + + writeDocs(docs); + assertEquals(3, writtenCount.get(), "Documents should be skipped since only excluded fields changed"); + assertEquals(3, skippedCount.get()); + + // Write again with actual content change - should NOT be skipped + docs = new ArrayList<>(); + for (int i = 1; i <= 3; i++) { + String xml = """ + + %d + Modified Document %d + 2026-01-02T16:00:00Z + + Test User + 2026-01-02T16:00:00Z + + + """.formatted(i, i); + docs.add(new DocumentWriteOperationImpl("/incremental/test/ns-xml-doc-" + i + ".xml", METADATA, new StringHandle(xml).withFormat(Format.XML))); + } + + writeDocs(docs); + assertEquals(6, writtenCount.get(), "Documents should be written since non-excluded content changed"); + assertEquals(3, skippedCount.get(), "Skip count should remain at 3"); + } }