From ec9fb818e3a2cafc30250bc043db4cd44a1898de Mon Sep 17 00:00:00 2001
From: "Jonathan D.A. Jewell" <6759885+hyperpolymath@users.noreply.github.com>
Date: Fri, 13 Mar 2026 16:51:41 +0000
Subject: [PATCH] feat: wire NER entity extraction into Lithoglyph importer
 pipeline
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

After evidence is imported from Lithoglyph, extract named entities from
content_text using regex-based NER (titled names, org suffixes, capitalised
sequences), resolve them against existing entities via Entities.resolve_ner_output
(exact → fuzzy → auto-create), and create :mentions relationship edges in
ArangoDB. Entity linking is best-effort — failures are logged but don't block
the import.

- Add NERExtractor module with 3 extraction strategies
- Wire NER into Importer.import_single_record post-create step
- Extend Relationship schema with :entity type and :mentions edges
- Update graph traversal helpers to handle entity nodes
- Add 13 unit tests for NER extraction (all passing)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 lib/evidence_graph/lithoglyph/importer.ex     |  67 +++++++++
 .../lithoglyph/ner_extractor.ex               | 135 ++++++++++++++++++
 lib/evidence_graph/relationships.ex           |  20 ++-
 .../relationships/relationship.ex             |   8 +-
 mix.lock                                      |   4 +
 .../lithoglyph/ner_extractor_test.exs         | 130 +++++++++++++++++
 6 files changed, 358 insertions(+), 6 deletions(-)
 create mode 100644 lib/evidence_graph/lithoglyph/ner_extractor.ex
 create mode 100644 test/evidence_graph/lithoglyph/ner_extractor_test.exs

diff --git a/lib/evidence_graph/lithoglyph/importer.ex b/lib/evidence_graph/lithoglyph/importer.ex
index 7f5387a..2075974 100644
--- a/lib/evidence_graph/lithoglyph/importer.ex
+++ b/lib/evidence_graph/lithoglyph/importer.ex
@@ -30,8 +30,11 @@ defmodule EvidenceGraph.Lithoglyph.Importer do
   require Logger
 
   alias EvidenceGraph.Lithoglyph.Client, as: LithClient
+  alias EvidenceGraph.Lithoglyph.NERExtractor
   alias EvidenceGraph.ArangoDB
   alias EvidenceGraph.Evidence
+  alias EvidenceGraph.Entities
+  alias EvidenceGraph.Relationships
 
   @batch_size 100
   @progress_interval 50
@@ -204,6 +207,8 @@ defmodule EvidenceGraph.Lithoglyph.Importer do
         # Step 3: Create evidence in ArangoDB
         case Evidence.create_evidence(attrs) do
           {:ok, evidence} ->
+            # Step 4: Extract entities and create evidence→entity edges
+            link_entities_to_evidence(evidence, attrs, investigation_id)
             {:imported, evidence.id}
 
           {:error, reason} ->
@@ -215,6 +220,68 @@ defmodule EvidenceGraph.Lithoglyph.Importer do
     end
   end
 
+  # Extract named entities from evidence content and link them via graph edges.
+  # Entity linking is best-effort enrichment: any raise, throw or exit below is
+  # logged and swallowed so it can never fail the import that already succeeded.
+  defp link_entities_to_evidence(evidence, attrs, investigation_id) do
+    ner_strings = NERExtractor.extract_from_evidence(attrs)
+
+    if ner_strings == [] do
+      Logger.debug("No entities extracted for evidence=#{evidence.id}")
+    else
+      Logger.info("Extracted #{length(ner_strings)} entity candidates for evidence=#{evidence.id}")
+
+      resolved = Entities.resolve_ner_output(ner_strings, investigation_id)
+      Enum.each(resolved, fn {ner_string, result} ->
+        case result do
+          {:existing, entity} ->
+            create_mentions_edge(evidence.id, entity.id, ner_string, 1.0)
+
+          {:created, entity} ->
+            create_mentions_edge(evidence.id, entity.id, ner_string, 0.8)
+
+          {:suggest_merge, entity, similarity} ->
+            # Fuzzy match: link with reduced confidence; * 1.0 keeps Float.round/2 safe for an integer
+            create_mentions_edge(evidence.id, entity.id, ner_string, similarity * 0.9)
+            rounded = Float.round(similarity * 1.0, 3)
+            Logger.info("Fuzzy match for '#{ner_string}' → '#{entity.primary_name}' (similarity=#{rounded})")
+
+          {:error, reason} ->
+            Logger.warning("Failed to resolve entity '#{ner_string}': #{inspect(reason)}")
+        end
+      end)
+    end
+  rescue
+    error ->
+      Logger.warning("Entity linking failed for evidence=#{evidence.id}: #{inspect(error)}")
+  catch
+    kind, reason ->
+      Logger.warning("Entity linking failed for evidence=#{evidence.id}: #{inspect({kind, reason})}")
+  end
+
+  # Write one evidence → entity :mentions edge; a failed write is only logged.
+  defp create_mentions_edge(evidence_id, entity_id, ner_string, confidence) do
+    edge_attrs = %{
+      from_id: evidence_id,
+      from_type: :evidence,
+      to_id: entity_id,
+      to_type: :entity,
+      relationship_type: :mentions,
+      weight: 1.0,
+      confidence: confidence,
+      reasoning: "NER extraction matched '#{ner_string}'",
+      created_by: "lithoglyph_importer",
+      metadata: %{"ner_source" => ner_string, "extraction_method" => "regex"}
+    }
+
+    case Relationships.create_relationship(edge_attrs) do
+      {:ok, _rel} -> :ok
+
+      {:error, reason} ->
+        Logger.warning("Failed to create mentions edge evidence=#{evidence_id} → entity=#{entity_id}: #{inspect(reason)}")
+    end
+  end
+
   defp check_duplicate(nil), do: {:ok, :new}
 
   defp check_duplicate(sha256) do
diff --git a/lib/evidence_graph/lithoglyph/ner_extractor.ex b/lib/evidence_graph/lithoglyph/ner_extractor.ex
new file mode 100644
index 0000000..4e992e3
--- /dev/null
+++ b/lib/evidence_graph/lithoglyph/ner_extractor.ex
@@ -0,0 +1,135 @@
+# SPDX-License-Identifier: PMPL-1.0-or-later
+# Copyright (c) 2026 Jonathan D.A. Jewell (hyperpolymath) <j.d.a.jewell@open.ac.uk>
+defmodule EvidenceGraph.Lithoglyph.NERExtractor do
+  @moduledoc """
+  Regex-based Named Entity Recognition for evidence content.
+
+  Extracts candidate entity names from `content_text` using pattern
+  matching heuristics. This is a lightweight extraction layer — the
+  heavy lifting of deduplication and fuzzy matching is handled by
+  `EvidenceGraph.Entities.resolve_ner_output/2`.
+
+  ## Extraction Strategies
+
+  1. **Title-prefixed names** — "Dr. Jane Smith", "Prof. Alan Turing"
+  2. **Organisation suffixes** — "Acme Corp", "Global Fund Ltd"
+  3. **Capitalised multi-word sequences** — "Jeffrey Epstein", "New York City"
+     (no separate location pass; all-stopword names like "United Kingdom" are dropped)
+
+  ## Usage
+
+      iex> NERExtractor.extract("The ONS reported that HSBC Holdings received funding.")
+      ["The ONS", "HSBC Holdings"]
+  """
+
+  require Logger
+
+  # Titles that precede person names
+  @title_prefixes ~w(Mr Mrs Ms Dr Prof Sir Dame Lord Lady Judge Justice Sen Rep Gov)
+
+  # Organisation suffixes (matched case-sensitively, so both "plc" and "PLC" are listed)
+  @org_suffixes ~w(
+    Ltd LLC Inc Corp plc PLC Group Foundation Trust Fund
+    Association Institute University College Council Commission
+    Authority Agency Bureau Department Ministry
+  )
+
+  # Common words that look like proper nouns but aren't entities
+  @stopwords MapSet.new(~w(
+    The This That These Those Which Where When What Who How
+    January February March April May June July August September October November December
+    Monday Tuesday Wednesday Thursday Friday Saturday Sunday
+    Section Article Chapter Page Table Figure
+    However Moreover Furthermore Additionally Nevertheless
+    According Also Although Because Before Between During
+    Evidence Investigation Report Document Statement
+    Said Says Told Asked Added Noted Claimed Stated
+    New Old First Last Next Previous Current Former
+    United States Kingdom Nations
+  ))
+
+  @doc """
+  Extract candidate entity names from content text.
+
+  Returns a deduplicated list of entity name strings, grouped by strategy
+  (titled names, organisations, capitalised sequences) in match order.
+  """
+  @spec extract(String.t() | nil) :: [String.t()]
+  def extract(nil), do: []
+  def extract(""), do: []
+
+  def extract(content) when is_binary(content) do
+    # Collapse every whitespace run (newlines, tabs) to one space so a name
+    # wrapped across lines comes back as "First Last", not "First\nLast"
+    normalized = String.replace(content, ~r/\s+/, " ")
+
+    # Strategy order is the output order; Enum.uniq/1 keeps the first occurrence.
+    [
+      extract_titled_names(normalized),
+      extract_org_names(normalized),
+      extract_capitalised_sequences(normalized)
+    ]
+    |> Enum.concat()
+    |> Enum.map(&String.trim/1)
+    |> Enum.reject(&(String.length(&1) < 2))
+    |> Enum.reject(&stopword?/1)
+    |> Enum.uniq()
+  end
+
+  @doc """
+  Extract entities from an evidence record's metadata.
+
+  Scans the `title` field and the metadata `content_text` separately and
+  returns the combined, deduplicated candidates (title matches first).
+  """
+  @spec extract_from_evidence(map()) :: [String.t()]
+  def extract_from_evidence(%{} = evidence) do
+    content = get_in(evidence, [:metadata, :content_text]) ||
+              get_in(evidence, ["metadata", "content_text"]) || ""
+
+    title = Map.get(evidence, :title) || Map.get(evidence, "title") || ""
+
+    # Scan separately: joined text lets one capitalised run straddle title and body.
+    Enum.uniq(extract(to_string(title)) ++ extract(to_string(content)))
+  end
+
+  # -- Private extraction strategies --
+
+  # Strategy 1: Title-prefixed names ("Dr. Jane Smith", "Prof. Alan Turing")
+  defp extract_titled_names(content) do
+    titles = Enum.join(@title_prefixes, "|")
+
+    # Leading \b stops a title matching a word's tail ("ATMs In London" → "Ms In London")
+    ~r/\b(?:#{titles})\.?\s+([A-Z][a-z]+(?:\s+[A-Z][a-z]+){0,3})/
+    |> Regex.scan(content)
+    |> Enum.map(fn [whole | _] -> String.trim(whole) end)
+  end
+
+  # Strategy 2: Organisation names with known suffixes
+  defp extract_org_names(content) do
+    suffixes = Enum.join(@org_suffixes, "|")
+
+    # Leading \b keeps a match from starting mid-word ("eBay Inc" → "Bay Inc")
+    ~r/\b([A-Z][A-Za-z&-]+(?:\s+[A-Z][A-Za-z&-]+){0,5})\s+(?:#{suffixes})\b/
+    |> Regex.scan(content)
+    |> Enum.map(fn [whole | _] -> String.trim(whole) end)
+  end
+
+  # Strategy 3: runs of two or more capital-initial words
+  defp extract_capitalised_sequences(content) do
+    pattern = ~r/\b([A-Z][A-Za-z]+(?:\s+[A-Z][A-Za-z]+)+)\b/
+
+    for [run | _] <- Regex.scan(pattern, content), do: String.trim(run)
+  end
+
+  defp stopword?(name) do
+    # A candidate is noise only when every word in it is on the stoplist;
+    # a one-word name is just the single-element case of that rule, and a
+    # name with no words at all is vacuously treated as a stopword too.
+    name
+    |> String.split()
+    |> Enum.all?(fn word ->
+      MapSet.member?(@stopwords, word)
+    end)
+  end
+end
diff --git a/lib/evidence_graph/relationships.ex b/lib/evidence_graph/relationships.ex
index ce72576..e7c49c5 100644
--- a/lib/evidence_graph/relationships.ex
+++ b/lib/evidence_graph/relationships.ex
@@ -88,7 +88,12 @@ defmodule EvidenceGraph.Relationships do
   Get all relationships for a node (claim or evidence).
   """
   def get_node_relationships(node_id, node_type) do
-    collection = if node_type == :claim, do: "claims", else: "evidence"
+    collection =
+      case node_type do
+        :claim -> "claims"
+        :evidence -> "evidence"
+        :entity -> "entities"
+      end
 
     aql = """
     FOR node IN #{collection}
@@ -156,6 +161,9 @@ defmodule EvidenceGraph.Relationships do
         String.starts_with?(node["_id"], "evidence/") ->
           {:evidence, EvidenceGraph.Evidence.Evidence.from_arango_doc(node)}
 
+        String.starts_with?(node["_id"], "entities/") ->
+          {:entity, EvidenceGraph.Entities.Entity.from_arango_doc(node)}
+
         true ->
           {:unknown, node}
       end
@@ -168,8 +176,14 @@ defmodule EvidenceGraph.Relationships do
   Uses ArangoDB's shortest path algorithm.
   """
   def find_path(from_id, from_type, to_id, to_type, _max_depth \\ 5) do
-    from_collection = if from_type == :claim, do: "claims", else: "evidence"
-    to_collection = if to_type == :claim, do: "claims", else: "evidence"
+    collection_name = fn
+      :claim -> "claims"
+      :evidence -> "evidence"
+      :entity -> "entities"
+    end
+
+    from_collection = collection_name.(from_type)
+    to_collection = collection_name.(to_type)
 
     aql = """
     FOR path IN ANY SHORTEST_PATH
diff --git a/lib/evidence_graph/relationships/relationship.ex b/lib/evidence_graph/relationships/relationship.ex
index 5ff4b0f..39b5299 100644
--- a/lib/evidence_graph/relationships/relationship.ex
+++ b/lib/evidence_graph/relationships/relationship.ex
@@ -25,14 +25,14 @@ defmodule EvidenceGraph.Relationships.Relationship do
           inserted_at: DateTime.t() | nil
         }
 
-  @relationship_types [:supports, :contradicts, :contextualizes]
+  @relationship_types [:supports, :contradicts, :contextualizes, :mentions]
 
   @primary_key {:id, :string, autogenerate: false}
   schema "relationships" do
     field :from_id, :string
-    field :from_type, Ecto.Enum, values: [:claim, :evidence]
+    field :from_type, Ecto.Enum, values: [:claim, :evidence, :entity]
     field :to_id, :string
-    field :to_type, Ecto.Enum, values: [:claim, :evidence]
+    field :to_type, Ecto.Enum, values: [:claim, :evidence, :entity]
     field :relationship_type, Ecto.Enum, values: @relationship_types
     field :weight, :float, default: 0.5
     field :confidence, :float, default: 0.5
@@ -91,6 +91,7 @@ defmodule EvidenceGraph.Relationships.Relationship do
 
   defp collection_for(:claim), do: "claims"
   defp collection_for(:evidence), do: "evidence"
+  defp collection_for(:entity), do: "entities"
 
   @doc """
   Convert from ArangoDB edge document to Relationship struct.
@@ -122,6 +123,7 @@ defmodule EvidenceGraph.Relationships.Relationship do
       case collection do
         "claims" -> :claim
         "evidence" -> :evidence
+        "entities" -> :entity
         _ -> :unknown
       end
 
diff --git a/mix.lock b/mix.lock
index ab3e385..c64ff44 100644
--- a/mix.lock
+++ b/mix.lock
@@ -24,6 +24,7 @@
   "esbuild": {:hex, :esbuild, "0.10.0", "b0aa3388a1c23e727c5a3e7427c932d89ee791746b0081bbe56103e9ef3d291f", [:mix], [{:jason, "~> 1.4", [hex: :jason, repo: "hexpm", optional: false]}], "hexpm", "468489cda427b974a7cc9f03ace55368a83e1a7be12fba7e30969af78e5f8c70"},
   "expo": {:hex, :expo, "1.1.1", "4202e1d2ca6e2b3b63e02f69cfe0a404f77702b041d02b58597c00992b601db5", [:mix], [], "hexpm", "5fb308b9cb359ae200b7e23d37c76978673aa1b06e2b3075d814ce12c5811640"},
   "file_system": {:hex, :file_system, "1.1.1", "31864f4685b0148f25bd3fbef2b1228457c0c89024ad67f7a81a3ffbc0bbad3a", [:mix], [], "hexpm", "7a15ff97dfe526aeefb090a7a9d3d03aa907e100e262a0f8f7746b78f8f87a5d"},
+  "finch": {:hex, :finch, "0.21.0", "b1c3b2d48af02d0c66d2a9ebfb5622be5c5ecd62937cf79a88a7f98d48a8290c", [:mix], [{:mime, "~> 1.0 or ~> 2.0", [hex: :mime, repo: "hexpm", optional: false]}, {:mint, "~> 1.6.2 or ~> 1.7", [hex: :mint, repo: "hexpm", optional: false]}, {:nimble_options, "~> 0.4 or ~> 1.0", [hex: :nimble_options, repo: "hexpm", optional: false]}, {:nimble_pool, "~> 1.1", [hex: :nimble_pool, repo: "hexpm", optional: false]}, {:telemetry, "~> 0.4 or ~> 1.0", [hex: :telemetry, repo: "hexpm", optional: false]}], "hexpm", "87dc6e169794cb2570f75841a19da99cfde834249568f2a5b121b809588a4377"},
   "fine": {:hex, :fine, "0.1.4", "b19a89c1476c7c57afb5f9314aed5960b5bc95d5277de4cb5ee8e1d1616ce379", [:mix], [], "hexpm", "be3324cc454a42d80951cf6023b9954e9ff27c6daa255483b3e8d608670303f5"},
   "floki": {:hex, :floki, "0.38.0", "62b642386fa3f2f90713f6e231da0fa3256e41ef1089f83b6ceac7a3fd3abf33", [:mix], [], "hexpm", "a5943ee91e93fb2d635b612caf5508e36d37548e84928463ef9dd986f0d1abd9"},
   "gettext": {:hex, :gettext, "1.0.2", "5457e1fd3f4abe47b0e13ff85086aabae760497a3497909b8473e0acee57673b", [:mix], [{:expo, "~> 0.5.1 or ~> 1.0", [hex: :expo, repo: "hexpm", optional: false]}], "hexpm", "eab805501886802071ad290714515c8c4a17196ea76e5afc9d06ca85fb1bfeb3"},
@@ -33,7 +34,9 @@
   "lazy_html": {:hex, :lazy_html, "0.1.10", "ffe42a0b4e70859cf21a33e12a251e0c76c1dff76391609bd56702a0ef5bc429", [:make, :mix], [{:cc_precompiler, "~> 0.1", [hex: :cc_precompiler, repo: "hexpm", optional: false]}, {:elixir_make, "~> 0.9.0", [hex: :elixir_make, repo: "hexpm", optional: false]}, {:fine, "~> 0.1.0", [hex: :fine, repo: "hexpm", optional: false]}], "hexpm", "50f67e5faa09d45a99c1ddf3fac004f051997877dc8974c5797bb5ccd8e27058"},
   "mime": {:hex, :mime, "2.0.7", "b8d739037be7cd402aee1ba0306edfdef982687ee7e9859bee6198c1e7e2f128", [:mix], [], "hexpm", "6171188e399ee16023ffc5b76ce445eb6d9672e2e241d2df6050f3c771e80ccd"},
   "mint": {:hex, :mint, "1.7.1", "113fdb2b2f3b59e47c7955971854641c61f378549d73e829e1768de90fc1abf1", [:mix], [{:castore, "~> 0.1.0 or ~> 1.0", [hex: :castore, repo: "hexpm", optional: true]}, {:hpax, "~> 0.1.1 or ~> 0.2.0 or ~> 1.0", [hex: :hpax, repo: "hexpm", optional: false]}], "hexpm", "fceba0a4d0f24301ddee3024ae116df1c3f4bb7a563a731f45fdfeb9d39a231b"},
+  "nimble_options": {:hex, :nimble_options, "1.1.1", "e3a492d54d85fc3fd7c5baf411d9d2852922f66e69476317787a7b2bb000a61b", [:mix], [], "hexpm", "821b2470ca9442c4b6984882fe9bb0389371b8ddec4d45a9504f00a66f650b44"},
   "nimble_parsec": {:hex, :nimble_parsec, "1.4.2", "8efba0122db06df95bfaa78f791344a89352ba04baedd3849593bfce4d0dc1c6", [:mix], [], "hexpm", "4b21398942dda052b403bbe1da991ccd03a053668d147d53fb8c4e0efe09c973"},
+  "nimble_pool": {:hex, :nimble_pool, "1.1.0", "bf9c29fbdcba3564a8b800d1eeb5a3c58f36e1e11d7b7fb2e084a643f645f06b", [:mix], [], "hexpm", "af2e4e6b34197db81f7aad230c1118eac993acc0dae6bc83bac0126d4ae0813a"},
   "oban": {:hex, :oban, "2.20.3", "e4d27336941955886cc7113420c32c63b70b64f10b27e08e3cf2b001153953cd", [:mix], [{:ecto_sql, "~> 3.10", [hex: :ecto_sql, repo: "hexpm", optional: false]}, {:ecto_sqlite3, "~> 0.9", [hex: :ecto_sqlite3, repo: "hexpm", optional: true]}, {:igniter, "~> 0.5", [hex: :igniter, repo: "hexpm", optional: true]}, {:jason, "~> 1.1", [hex: :jason, repo: "hexpm", optional: true]}, {:myxql, "~> 0.7", [hex: :myxql, repo: "hexpm", optional: true]}, {:postgrex, "~> 0.20", [hex: :postgrex, repo: "hexpm", optional: true]}, {:telemetry, "~> 1.3", [hex: :telemetry, repo: "hexpm", optional: false]}], "hexpm", "075ffbf1279a96bec495bc63d647b08929837d70bcc0427249ffe4d1dddaec33"},
   "phoenix": {:hex, :phoenix, "1.8.5", "919db335247e6d4891764dc3063415b0d2457641c5f9b3751b5df03d8e20bbcf", [:mix], [{:bandit, "~> 1.0", [hex: :bandit, repo: "hexpm", optional: true]}, {:jason, "~> 1.0", [hex: :jason, repo: "hexpm", optional: true]}, {:phoenix_pubsub, "~> 2.1", [hex: :phoenix_pubsub, repo: "hexpm", optional: false]}, {:phoenix_template, "~> 1.0", [hex: :phoenix_template, repo: "hexpm", optional: false]}, {:phoenix_view, "~> 2.0", [hex: :phoenix_view, repo: "hexpm", optional: true]}, {:plug, "~> 1.14", [hex: :plug, repo: "hexpm", optional: false]}, {:plug_cowboy, "~> 2.7", [hex: :plug_cowboy, repo: "hexpm", optional: true]}, {:plug_crypto, "~> 1.2 or ~> 2.0", [hex: :plug_crypto, repo: "hexpm", optional: false]}, {:telemetry, "~> 0.4 or ~> 1.0", [hex: :telemetry, repo: "hexpm", optional: false]}, {:websock_adapter, "~> 0.5.3", [hex: :websock_adapter, repo: "hexpm", optional: false]}], "hexpm", "83b2bb125127e02e9f475c8e3e92736325b5b01b0b9b05407bcb4083b7a32485"},
   "phoenix_copy": {:hex, :phoenix_copy, "0.1.4", "a0b798288eed3a7223464b56abcef72f11c832a168dc79e88279d4f8c08fa842", [:mix], [{:file_system, "~> 0.2 or ~> 1.0", [hex: :file_system, repo: "hexpm", optional: false]}], "hexpm", "9752026f7b258b1dee5fca7937a2b30c3a15e61ed07a4366c78cf077850a4325"},
@@ -49,6 +52,7 @@
   "plug_crypto": {:hex, :plug_crypto, "2.1.1", "19bda8184399cb24afa10be734f84a16ea0a2bc65054e23a62bb10f06bc89491", [:mix], [], "hexpm", "6470bce6ffe41c8bd497612ffde1a7e4af67f36a15eea5f921af71cf3e11247c"},
   "postgrex": {:hex, :postgrex, "0.22.0", "fb027b58b6eab1f6de5396a2abcdaaeb168f9ed4eccbb594e6ac393b02078cbd", [:mix], [{:db_connection, "~> 2.9", [hex: :db_connection, repo: "hexpm", optional: false]}, {:decimal, "~> 1.5 or ~> 2.0", [hex: :decimal, repo: "hexpm", optional: false]}, {:jason, "~> 1.0", [hex: :jason, repo: "hexpm", optional: true]}, {:table, "~> 0.1.0", [hex: :table, repo: "hexpm", optional: true]}], "hexpm", "a68c4261e299597909e03e6f8ff5a13876f5caadaddd0d23af0d0a61afcc5d84"},
   "ranch": {:hex, :ranch, "2.2.0", "25528f82bc8d7c6152c57666ca99ec716510fe0925cb188172f41ce93117b1b0", [:make, :rebar3], [], "hexpm", "fa0b99a1780c80218a4197a59ea8d3bdae32fbff7e88527d7d8a4787eff4f8e7"},
+  "req": {:hex, :req, "0.5.17", "0096ddd5b0ed6f576a03dde4b158a0c727215b15d2795e59e0916c6971066ede", [:mix], [{:brotli, "~> 0.3.1", [hex: :brotli, repo: "hexpm", optional: true]}, {:ezstd, "~> 1.0", [hex: :ezstd, repo: "hexpm", optional: true]}, {:finch, "~> 0.17", [hex: :finch, repo: "hexpm", optional: false]}, {:jason, "~> 1.0", [hex: :jason, repo: "hexpm", optional: false]}, {:mime, "~> 2.0.6 or ~> 2.1", [hex: :mime, repo: "hexpm", optional: false]}, {:nimble_csv, "~> 1.0", [hex: :nimble_csv, repo: "hexpm", optional: true]}, {:plug, "~> 1.0", [hex: :plug, repo: "hexpm", optional: true]}], "hexpm", "0b8bc6ffdfebbc07968e59d3ff96d52f2202d0536f10fef4dc11dc02a2a43e39"},
   "swoosh": {:hex, :swoosh, "1.23.0", "a1b7f41705357ffb06457d177e734bf378022901ce53889a68bcc59d10a23c27", [:mix], [{:bandit, ">= 1.0.0", [hex: :bandit, repo: "hexpm", optional: true]}, {:cowboy, "~> 1.1 or ~> 2.4", [hex: :cowboy, repo: "hexpm", optional: true]}, {:ex_aws, "~> 2.1", [hex: :ex_aws, repo: "hexpm", optional: true]}, {:finch, "~> 0.6", [hex: :finch, repo: "hexpm", optional: true]}, {:gen_smtp, "~> 0.13 or ~> 1.0", [hex: :gen_smtp, repo: "hexpm", optional: true]}, {:hackney, "~> 1.9", [hex: :hackney, repo: "hexpm", optional: true]}, {:idna, "~> 6.0", [hex: :idna, repo: "hexpm", optional: false]}, {:jason, "~> 1.0", [hex: :jason, repo: "hexpm", optional: false]}, {:mail, "~> 0.2", [hex: :mail, repo: "hexpm", optional: true]}, {:mime, "~> 1.1 or ~> 2.0", [hex: :mime, repo: "hexpm", optional: false]}, {:mua, "~> 0.2.3", [hex: :mua, repo: "hexpm", optional: true]}, {:multipart, "~> 0.4", [hex: :multipart, repo: "hexpm", optional: true]}, {:plug, "~> 1.9", [hex: :plug, repo: "hexpm", optional: true]}, {:plug_cowboy, ">= 1.0.0", [hex: :plug_cowboy, repo: "hexpm", optional: true]}, {:req, "~> 0.5.10 or ~> 0.6 or ~> 1.0", [hex: :req, repo: "hexpm", optional: true]}, {:telemetry, "~> 0.4.2 or ~> 1.0", [hex: :telemetry, repo: "hexpm", optional: false]}], "hexpm", "97aaf04481ce8a351e2d15a3907778bdf3b1ea071cfff3eb8728b65943c77f6d"},
   "tailwind": {:hex, :tailwind, "0.4.1", "e7bcc222fe96a1e55f948e76d13dd84a1a7653fb051d2a167135db3b4b08d3e9", [:mix], [], "hexpm", "6249d4f9819052911120dbdbe9e532e6bd64ea23476056adb7f730aa25c220d1"},
   "telemetry": {:hex, :telemetry, "1.3.0", "fedebbae410d715cf8e7062c96a1ef32ec22e764197f70cda73d82778d61e7a2", [:rebar3], [], "hexpm", "7015fc8919dbe63764f4b4b87a95b7c0996bd539e0d499be6ec9d7f3875b79e6"},
diff --git a/test/evidence_graph/lithoglyph/ner_extractor_test.exs b/test/evidence_graph/lithoglyph/ner_extractor_test.exs
new file mode 100644
index 0000000..ce8178a
--- /dev/null
+++ b/test/evidence_graph/lithoglyph/ner_extractor_test.exs
@@ -0,0 +1,130 @@
+# SPDX-License-Identifier: PMPL-1.0-or-later
+# Copyright (c) 2026 Jonathan D.A. Jewell (hyperpolymath) <j.d.a.jewell@open.ac.uk>
+defmodule EvidenceGraph.Lithoglyph.NERExtractorTest do
+  use ExUnit.Case, async: true
+
+  alias EvidenceGraph.Lithoglyph.NERExtractor
+
+  describe "extract/1" do
+    test "returns empty list for nil" do
+      assert NERExtractor.extract(nil) == []
+    end
+
+    test "returns empty list for empty string" do
+      assert NERExtractor.extract("") == []
+    end
+
+    test "extracts capitalised multi-word sequences" do
+      content = "The report by Jeffrey Epstein was filed in New York."
+      entities = NERExtractor.extract(content)
+
+      assert "Jeffrey Epstein" in entities
+      assert "New York" in entities
+    end
+
+    test "extracts titled names" do
+      content = "Dr. Jane Smith and Prof. Alan Turing discussed the results."
+      entities = NERExtractor.extract(content)
+
+      assert Enum.any?(entities, &String.contains?(&1, "Jane Smith"))
+      assert Enum.any?(entities, &String.contains?(&1, "Alan Turing"))
+    end
+
+    test "extracts organisation names with suffixes" do
+      content = "Documents from HSBC Holdings plc showed transfers to Global Fund Ltd."
+      entities = NERExtractor.extract(content)
+
+      assert Enum.any?(entities, &String.contains?(&1, "HSBC"))
+      assert Enum.any?(entities, &String.contains?(&1, "Global Fund"))
+    end
+
+    test "filters out stopwords" do
+      content = "The Evidence from the Investigation was clear."
+      entities = NERExtractor.extract(content)
+
+      refute "The Evidence" in entities
+      refute "Evidence" in entities
+      refute "Investigation" in entities
+    end
+
+    test "deduplicates entity names" do
+      content = "Jeffrey Epstein met with Jeffrey Epstein's lawyer."
+      entities = NERExtractor.extract(content)
+
+      epstein_count = Enum.count(entities, &(&1 == "Jeffrey Epstein"))
+
+      # Exactly one: `<= 1` would also pass if the name were never extracted.
+      assert epstein_count == 1
+    end
+
+    test "handles complex investigative journalism content" do
+      content = """
+      Court documents filed in the Southern District of New York reveal that
+      Ghislaine Maxwell facilitated meetings between Jeffrey Epstein and
+      Prince Andrew at Buckingham Palace. Deutsche Bank processed multiple
+      wire transfers totalling $4.6 million. The Federal Bureau of Investigation
+      obtained flight logs from Palm Beach International Airport showing
+      frequent travel to Little Saint James.
+      """
+
+      entities = NERExtractor.extract(content)
+
+      assert Enum.any?(entities, &String.contains?(&1, "Ghislaine Maxwell"))
+      assert Enum.any?(entities, &String.contains?(&1, "Jeffrey Epstein"))
+      assert Enum.any?(entities, &String.contains?(&1, "Prince Andrew"))
+      assert Enum.any?(entities, &String.contains?(&1, "Deutsche Bank"))
+    end
+
+    test "rejects very short names" do
+      content = "A B met C D at the conference."
+      entities = NERExtractor.extract(content)
+
+      refute Enum.any?(entities, &(String.length(&1) < 2))
+    end
+  end
+
+  describe "extract_from_evidence/1" do
+    test "extracts from metadata.content_text" do
+      evidence = %{
+        title: "Epstein Flight Logs",
+        metadata: %{
+          content_text: "Jeffrey Epstein flew to Palm Beach on his private jet."
+        }
+      }
+
+      entities = NERExtractor.extract_from_evidence(evidence)
+
+      assert Enum.any?(entities, &String.contains?(&1, "Jeffrey Epstein"))
+      assert Enum.any?(entities, &String.contains?(&1, "Palm Beach"))
+    end
+
+    test "includes title in extraction" do
+      evidence = %{
+        title: "HSBC Holdings Investigation Report",
+        metadata: %{content_text: "The bank was fined for compliance failures."}
+      }
+
+      entities = NERExtractor.extract_from_evidence(evidence)
+
+      assert Enum.any?(entities, &String.contains?(&1, "HSBC Holdings"))
+    end
+
+    test "handles string keys in metadata" do
+      evidence = %{
+        "title" => "Maxwell Documents",
+        "metadata" => %{"content_text" => "Ghislaine Maxwell attended the event."}
+      }
+
+      entities = NERExtractor.extract_from_evidence(evidence)
+
+      assert Enum.any?(entities, &String.contains?(&1, "Ghislaine Maxwell"))
+    end
+
+    test "handles missing content gracefully" do
+      evidence = %{title: "Empty", metadata: %{}}
+      entities = NERExtractor.extract_from_evidence(evidence)
+
+      assert is_list(entities)
+    end
+  end
+end