From ec9fb818e3a2cafc30250bc043db4cd44a1898de Mon Sep 17 00:00:00 2001 From: "Jonathan D.A. Jewell" <6759885+hyperpolymath@users.noreply.github.com> Date: Fri, 13 Mar 2026 16:51:41 +0000 Subject: [PATCH] feat: wire NER entity extraction into Lithoglyph importer pipeline MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After evidence is imported from Lithoglyph, extract named entities from content_text using regex-based NER (titled names, org suffixes, capitalised sequences), resolve them against existing entities via Entities.resolve_ner_output (exact → fuzzy → auto-create), and create :mentions relationship edges in ArangoDB. Entity linking is best-effort — failures are logged but don't block the import. - Add NERExtractor module with 3 extraction strategies - Wire NER into Importer.import_single_record post-create step - Extend Relationship schema with :entity type and :mentions edges - Update graph traversal helpers to handle entity nodes - Add 13 unit tests for NER extraction (all passing) Co-Authored-By: Claude Opus 4.6 --- lib/evidence_graph/lithoglyph/importer.ex | 67 +++++++++ .../lithoglyph/ner_extractor.ex | 135 ++++++++++++++++++ lib/evidence_graph/relationships.ex | 20 ++- .../relationships/relationship.ex | 8 +- mix.lock | 4 + .../lithoglyph/ner_extractor_test.exs | 130 +++++++++++++++++ 6 files changed, 358 insertions(+), 6 deletions(-) create mode 100644 lib/evidence_graph/lithoglyph/ner_extractor.ex create mode 100644 test/evidence_graph/lithoglyph/ner_extractor_test.exs diff --git a/lib/evidence_graph/lithoglyph/importer.ex b/lib/evidence_graph/lithoglyph/importer.ex index 7f5387a..2075974 100644 --- a/lib/evidence_graph/lithoglyph/importer.ex +++ b/lib/evidence_graph/lithoglyph/importer.ex @@ -30,8 +30,11 @@ defmodule EvidenceGraph.Lithoglyph.Importer do require Logger alias EvidenceGraph.Lithoglyph.Client, as: LithClient + alias EvidenceGraph.Lithoglyph.NERExtractor alias EvidenceGraph.ArangoDB alias 
EvidenceGraph.Evidence + alias EvidenceGraph.Entities + alias EvidenceGraph.Relationships @batch_size 100 @progress_interval 50 @@ -204,6 +207,8 @@ defmodule EvidenceGraph.Lithoglyph.Importer do # Step 3: Create evidence in ArangoDB case Evidence.create_evidence(attrs) do {:ok, evidence} -> + # Step 4: Extract entities and create evidence→entity edges + link_entities_to_evidence(evidence, attrs, investigation_id) {:imported, evidence.id} {:error, reason} -> @@ -215,6 +220,68 @@ defmodule EvidenceGraph.Lithoglyph.Importer do end end + # Extract named entities from evidence content and link them via graph edges. + # Failures here are logged but do NOT fail the overall import — entity linking + # is best-effort enrichment, not a hard requirement. + defp link_entities_to_evidence(evidence, attrs, investigation_id) do + ner_strings = NERExtractor.extract_from_evidence(attrs) + + if ner_strings == [] do + Logger.debug("No entities extracted for evidence=#{evidence.id}") + else + Logger.info("Extracted #{length(ner_strings)} entity candidates for evidence=#{evidence.id}") + + resolved = Entities.resolve_ner_output(ner_strings, investigation_id) + + Enum.each(resolved, fn {ner_string, result} -> + case result do + {:existing, entity} -> + create_mentions_edge(evidence.id, entity.id, ner_string, 1.0) + + {:created, entity} -> + create_mentions_edge(evidence.id, entity.id, ner_string, 0.8) + + {:suggest_merge, entity, similarity} -> + # Link to the existing entity but with lower confidence (fuzzy match) + create_mentions_edge(evidence.id, entity.id, ner_string, similarity * 0.9) + Logger.info( + "Fuzzy match for '#{ner_string}' → '#{entity.primary_name}' " <> + "(similarity=#{Float.round(similarity, 3)})" + ) + + {:error, reason} -> + Logger.warning("Failed to resolve entity '#{ner_string}': #{inspect(reason)}") + end + end) + end + rescue + error -> + Logger.warning("Entity linking failed for evidence=#{evidence.id}: #{inspect(error)}") + end + + defp 
create_mentions_edge(evidence_id, entity_id, ner_string, confidence) do + case Relationships.create_relationship(%{ + from_id: evidence_id, + from_type: :evidence, + to_id: entity_id, + to_type: :entity, + relationship_type: :mentions, + weight: 1.0, + confidence: confidence, + reasoning: "NER extraction matched '#{ner_string}'", + created_by: "lithoglyph_importer", + metadata: %{"ner_source" => ner_string, "extraction_method" => "regex"} + }) do + {:ok, _rel} -> + :ok + + {:error, reason} -> + Logger.warning( + "Failed to create mentions edge evidence=#{evidence_id} → entity=#{entity_id}: #{inspect(reason)}" + ) + end + end + defp check_duplicate(nil), do: {:ok, :new} defp check_duplicate(sha256) do diff --git a/lib/evidence_graph/lithoglyph/ner_extractor.ex b/lib/evidence_graph/lithoglyph/ner_extractor.ex new file mode 100644 index 0000000..4e992e3 --- /dev/null +++ b/lib/evidence_graph/lithoglyph/ner_extractor.ex @@ -0,0 +1,135 @@ +# SPDX-License-Identifier: PMPL-1.0-or-later +# Copyright (c) 2026 Jonathan D.A. Jewell (hyperpolymath) +defmodule EvidenceGraph.Lithoglyph.NERExtractor do + @moduledoc """ + Regex-based Named Entity Recognition for evidence content. + + Extracts candidate entity names from `content_text` using pattern + matching heuristics. This is a lightweight extraction layer — the + heavy lifting of deduplication and fuzzy matching is handled by + `EvidenceGraph.Entities.resolve_ner_output/2`. + + ## Extraction Strategies + + 1. **Capitalised multi-word sequences** — "Jeffrey Epstein", "HSBC Holdings" + 2. **Title-prefixed names** — "Dr. Jane Smith", "Prof. Alan Turing" + 3. **Organisation suffixes** — "Acme Corp", "Global Fund Ltd" + 4. 
**Location indicators** — "New York City" (no dedicated pattern; caught only by the capitalised-sequence strategy, and names made up entirely of stopwords such as "United Kingdom" are filtered out) + + ## Usage + + iex> NERExtractor.extract("The ONS reported that HSBC Holdings received funding.") + ["The ONS", "HSBC Holdings"] + """ + + require Logger + + # Titles that precede person names + @title_prefixes ~w(Mr Mrs Ms Dr Prof Sir Dame Lord Lady Judge Justice Sen Rep Gov) + + # Organisation suffixes (matched case-sensitively, hence both plc and PLC) + @org_suffixes ~w( + Ltd LLC Inc Corp plc PLC Group Foundation Trust Fund + Association Institute University College Council Commission + Authority Agency Bureau Department Ministry + ) + + # Common words that look like proper nouns but aren't entities + @stopwords MapSet.new(~w( + The This That These Those Which Where When What Who How + January February March April May June July August September October November December + Monday Tuesday Wednesday Thursday Friday Saturday Sunday + Section Article Chapter Page Table Figure + However Moreover Furthermore Additionally Nevertheless + According Also Although Because Before Between During + Evidence Investigation Report Document Statement + Said Says Told Asked Added Noted Claimed Stated + New Old First Last Next Previous Current Former + United States Kingdom Nations + )) + + @doc """ + Extract candidate entity names from content text. + + Returns a deduplicated list of entity name strings, grouped by extraction strategy + (titled names, organisation names, capitalised sequences) and ordered by appearance within each strategy.
+ """ + @spec extract(String.t()) :: [String.t()] + def extract(nil), do: [] + def extract(""), do: [] + + def extract(content) when is_binary(content) do + # Normalize whitespace: collapse newlines/tabs into single spaces + # so regexes don't span across line breaks + normalized = String.replace(content, ~r/\s+/, " ") + + candidates = + [] + |> Kernel.++(extract_titled_names(normalized)) + |> Kernel.++(extract_org_names(normalized)) + |> Kernel.++(extract_capitalised_sequences(normalized)) + + candidates + |> Enum.map(&String.trim/1) + |> Enum.reject(&(String.length(&1) < 2)) + |> Enum.reject(&stopword?/1) + |> Enum.uniq() + end + + @doc """ + Extract entities from an evidence record's metadata. + + Pulls `content_text` from the record's metadata map and extracts + entity candidates. Also considers the `title` field. + """ + @spec extract_from_evidence(map()) :: [String.t()] + def extract_from_evidence(%{} = evidence) do + content = get_in(evidence, [:metadata, :content_text]) || + get_in(evidence, ["metadata", "content_text"]) || "" + + title = Map.get(evidence, :title) || Map.get(evidence, "title") || "" + + combined = "#{title}\n\n#{content}" + extract(combined) + end + + # -- Private extraction strategies -- + + # Strategy 1: Title-prefixed names ("Dr. Jane Smith", "Prof. 
Alan Turing") + defp extract_titled_names(content) do + title_pattern = @title_prefixes |> Enum.join("|") + {:ok, regex} = Regex.compile("(?:#{title_pattern})\\.?\\s+([A-Z][a-z]+(?:\\s+[A-Z][a-z]+){0,3})") + + regex + |> Regex.scan(content) + |> Enum.map(fn [full_match | _] -> String.trim(full_match) end) + end + + # Strategy 2: Organisation names with known suffixes + defp extract_org_names(content) do + suffix_pattern = @org_suffixes |> Enum.join("|") + {:ok, regex} = Regex.compile("([A-Z][A-Za-z&-]+(?:\\s+[A-Z][A-Za-z&-]+){0,5})\\s+(?:#{suffix_pattern})\\b") + + regex + |> Regex.scan(content) + |> Enum.map(fn [full_match | _] -> String.trim(full_match) end) + end + + # Strategy 3: Capitalised multi-word sequences (2+ words starting with capitals) + defp extract_capitalised_sequences(content) do + ~r/\b([A-Z][A-Za-z]+(?:\s+[A-Z][A-Za-z]+)+)\b/ + |> Regex.scan(content) + |> Enum.map(fn [match | _] -> String.trim(match) end) + end + + defp stopword?(name) do + # Check if the entire name is a stopword, or if it's a single-word + # name that appears in our stoplist + words = String.split(name) + + case words do + [single] -> MapSet.member?(@stopwords, single) + _ -> Enum.all?(words, &MapSet.member?(@stopwords, &1)) + end + end +end diff --git a/lib/evidence_graph/relationships.ex b/lib/evidence_graph/relationships.ex index ce72576..e7c49c5 100644 --- a/lib/evidence_graph/relationships.ex +++ b/lib/evidence_graph/relationships.ex @@ -88,7 +88,12 @@ defmodule EvidenceGraph.Relationships do Get all relationships for a node (claim or evidence). 
""" def get_node_relationships(node_id, node_type) do - collection = if node_type == :claim, do: "claims", else: "evidence" + collection = + case node_type do + :claim -> "claims" + :evidence -> "evidence" + :entity -> "entities" + end aql = """ FOR node IN #{collection} @@ -156,6 +161,9 @@ defmodule EvidenceGraph.Relationships do String.starts_with?(node["_id"], "evidence/") -> {:evidence, EvidenceGraph.Evidence.Evidence.from_arango_doc(node)} + String.starts_with?(node["_id"], "entities/") -> + {:entity, EvidenceGraph.Entities.Entity.from_arango_doc(node)} + true -> {:unknown, node} end @@ -168,8 +176,14 @@ defmodule EvidenceGraph.Relationships do Uses ArangoDB's shortest path algorithm. """ def find_path(from_id, from_type, to_id, to_type, _max_depth \\ 5) do - from_collection = if from_type == :claim, do: "claims", else: "evidence" - to_collection = if to_type == :claim, do: "claims", else: "evidence" + collection_name = fn + :claim -> "claims" + :evidence -> "evidence" + :entity -> "entities" + end + + from_collection = collection_name.(from_type) + to_collection = collection_name.(to_type) aql = """ FOR path IN ANY SHORTEST_PATH diff --git a/lib/evidence_graph/relationships/relationship.ex b/lib/evidence_graph/relationships/relationship.ex index 5ff4b0f..39b5299 100644 --- a/lib/evidence_graph/relationships/relationship.ex +++ b/lib/evidence_graph/relationships/relationship.ex @@ -25,14 +25,14 @@ defmodule EvidenceGraph.Relationships.Relationship do inserted_at: DateTime.t() | nil } - @relationship_types [:supports, :contradicts, :contextualizes] + @relationship_types [:supports, :contradicts, :contextualizes, :mentions] @primary_key {:id, :string, autogenerate: false} schema "relationships" do field :from_id, :string - field :from_type, Ecto.Enum, values: [:claim, :evidence] + field :from_type, Ecto.Enum, values: [:claim, :evidence, :entity] field :to_id, :string - field :to_type, Ecto.Enum, values: [:claim, :evidence] + field :to_type, Ecto.Enum, values: 
[:claim, :evidence, :entity] field :relationship_type, Ecto.Enum, values: @relationship_types field :weight, :float, default: 0.5 field :confidence, :float, default: 0.5 @@ -91,6 +91,7 @@ defmodule EvidenceGraph.Relationships.Relationship do defp collection_for(:claim), do: "claims" defp collection_for(:evidence), do: "evidence" + defp collection_for(:entity), do: "entities" @doc """ Convert from ArangoDB edge document to Relationship struct. @@ -122,6 +123,7 @@ defmodule EvidenceGraph.Relationships.Relationship do case collection do "claims" -> :claim "evidence" -> :evidence + "entities" -> :entity _ -> :unknown end diff --git a/mix.lock b/mix.lock index ab3e385..c64ff44 100644 --- a/mix.lock +++ b/mix.lock @@ -24,6 +24,7 @@ "esbuild": {:hex, :esbuild, "0.10.0", "b0aa3388a1c23e727c5a3e7427c932d89ee791746b0081bbe56103e9ef3d291f", [:mix], [{:jason, "~> 1.4", [hex: :jason, repo: "hexpm", optional: false]}], "hexpm", "468489cda427b974a7cc9f03ace55368a83e1a7be12fba7e30969af78e5f8c70"}, "expo": {:hex, :expo, "1.1.1", "4202e1d2ca6e2b3b63e02f69cfe0a404f77702b041d02b58597c00992b601db5", [:mix], [], "hexpm", "5fb308b9cb359ae200b7e23d37c76978673aa1b06e2b3075d814ce12c5811640"}, "file_system": {:hex, :file_system, "1.1.1", "31864f4685b0148f25bd3fbef2b1228457c0c89024ad67f7a81a3ffbc0bbad3a", [:mix], [], "hexpm", "7a15ff97dfe526aeefb090a7a9d3d03aa907e100e262a0f8f7746b78f8f87a5d"}, + "finch": {:hex, :finch, "0.21.0", "b1c3b2d48af02d0c66d2a9ebfb5622be5c5ecd62937cf79a88a7f98d48a8290c", [:mix], [{:mime, "~> 1.0 or ~> 2.0", [hex: :mime, repo: "hexpm", optional: false]}, {:mint, "~> 1.6.2 or ~> 1.7", [hex: :mint, repo: "hexpm", optional: false]}, {:nimble_options, "~> 0.4 or ~> 1.0", [hex: :nimble_options, repo: "hexpm", optional: false]}, {:nimble_pool, "~> 1.1", [hex: :nimble_pool, repo: "hexpm", optional: false]}, {:telemetry, "~> 0.4 or ~> 1.0", [hex: :telemetry, repo: "hexpm", optional: false]}], "hexpm", "87dc6e169794cb2570f75841a19da99cfde834249568f2a5b121b809588a4377"}, "fine": 
{:hex, :fine, "0.1.4", "b19a89c1476c7c57afb5f9314aed5960b5bc95d5277de4cb5ee8e1d1616ce379", [:mix], [], "hexpm", "be3324cc454a42d80951cf6023b9954e9ff27c6daa255483b3e8d608670303f5"}, "floki": {:hex, :floki, "0.38.0", "62b642386fa3f2f90713f6e231da0fa3256e41ef1089f83b6ceac7a3fd3abf33", [:mix], [], "hexpm", "a5943ee91e93fb2d635b612caf5508e36d37548e84928463ef9dd986f0d1abd9"}, "gettext": {:hex, :gettext, "1.0.2", "5457e1fd3f4abe47b0e13ff85086aabae760497a3497909b8473e0acee57673b", [:mix], [{:expo, "~> 0.5.1 or ~> 1.0", [hex: :expo, repo: "hexpm", optional: false]}], "hexpm", "eab805501886802071ad290714515c8c4a17196ea76e5afc9d06ca85fb1bfeb3"}, @@ -33,7 +34,9 @@ "lazy_html": {:hex, :lazy_html, "0.1.10", "ffe42a0b4e70859cf21a33e12a251e0c76c1dff76391609bd56702a0ef5bc429", [:make, :mix], [{:cc_precompiler, "~> 0.1", [hex: :cc_precompiler, repo: "hexpm", optional: false]}, {:elixir_make, "~> 0.9.0", [hex: :elixir_make, repo: "hexpm", optional: false]}, {:fine, "~> 0.1.0", [hex: :fine, repo: "hexpm", optional: false]}], "hexpm", "50f67e5faa09d45a99c1ddf3fac004f051997877dc8974c5797bb5ccd8e27058"}, "mime": {:hex, :mime, "2.0.7", "b8d739037be7cd402aee1ba0306edfdef982687ee7e9859bee6198c1e7e2f128", [:mix], [], "hexpm", "6171188e399ee16023ffc5b76ce445eb6d9672e2e241d2df6050f3c771e80ccd"}, "mint": {:hex, :mint, "1.7.1", "113fdb2b2f3b59e47c7955971854641c61f378549d73e829e1768de90fc1abf1", [:mix], [{:castore, "~> 0.1.0 or ~> 1.0", [hex: :castore, repo: "hexpm", optional: true]}, {:hpax, "~> 0.1.1 or ~> 0.2.0 or ~> 1.0", [hex: :hpax, repo: "hexpm", optional: false]}], "hexpm", "fceba0a4d0f24301ddee3024ae116df1c3f4bb7a563a731f45fdfeb9d39a231b"}, + "nimble_options": {:hex, :nimble_options, "1.1.1", "e3a492d54d85fc3fd7c5baf411d9d2852922f66e69476317787a7b2bb000a61b", [:mix], [], "hexpm", "821b2470ca9442c4b6984882fe9bb0389371b8ddec4d45a9504f00a66f650b44"}, "nimble_parsec": {:hex, :nimble_parsec, "1.4.2", "8efba0122db06df95bfaa78f791344a89352ba04baedd3849593bfce4d0dc1c6", [:mix], [], "hexpm", 
"4b21398942dda052b403bbe1da991ccd03a053668d147d53fb8c4e0efe09c973"}, + "nimble_pool": {:hex, :nimble_pool, "1.1.0", "bf9c29fbdcba3564a8b800d1eeb5a3c58f36e1e11d7b7fb2e084a643f645f06b", [:mix], [], "hexpm", "af2e4e6b34197db81f7aad230c1118eac993acc0dae6bc83bac0126d4ae0813a"}, "oban": {:hex, :oban, "2.20.3", "e4d27336941955886cc7113420c32c63b70b64f10b27e08e3cf2b001153953cd", [:mix], [{:ecto_sql, "~> 3.10", [hex: :ecto_sql, repo: "hexpm", optional: false]}, {:ecto_sqlite3, "~> 0.9", [hex: :ecto_sqlite3, repo: "hexpm", optional: true]}, {:igniter, "~> 0.5", [hex: :igniter, repo: "hexpm", optional: true]}, {:jason, "~> 1.1", [hex: :jason, repo: "hexpm", optional: true]}, {:myxql, "~> 0.7", [hex: :myxql, repo: "hexpm", optional: true]}, {:postgrex, "~> 0.20", [hex: :postgrex, repo: "hexpm", optional: true]}, {:telemetry, "~> 1.3", [hex: :telemetry, repo: "hexpm", optional: false]}], "hexpm", "075ffbf1279a96bec495bc63d647b08929837d70bcc0427249ffe4d1dddaec33"}, "phoenix": {:hex, :phoenix, "1.8.5", "919db335247e6d4891764dc3063415b0d2457641c5f9b3751b5df03d8e20bbcf", [:mix], [{:bandit, "~> 1.0", [hex: :bandit, repo: "hexpm", optional: true]}, {:jason, "~> 1.0", [hex: :jason, repo: "hexpm", optional: true]}, {:phoenix_pubsub, "~> 2.1", [hex: :phoenix_pubsub, repo: "hexpm", optional: false]}, {:phoenix_template, "~> 1.0", [hex: :phoenix_template, repo: "hexpm", optional: false]}, {:phoenix_view, "~> 2.0", [hex: :phoenix_view, repo: "hexpm", optional: true]}, {:plug, "~> 1.14", [hex: :plug, repo: "hexpm", optional: false]}, {:plug_cowboy, "~> 2.7", [hex: :plug_cowboy, repo: "hexpm", optional: true]}, {:plug_crypto, "~> 1.2 or ~> 2.0", [hex: :plug_crypto, repo: "hexpm", optional: false]}, {:telemetry, "~> 0.4 or ~> 1.0", [hex: :telemetry, repo: "hexpm", optional: false]}, {:websock_adapter, "~> 0.5.3", [hex: :websock_adapter, repo: "hexpm", optional: false]}], "hexpm", "83b2bb125127e02e9f475c8e3e92736325b5b01b0b9b05407bcb4083b7a32485"}, "phoenix_copy": {:hex, :phoenix_copy, 
"0.1.4", "a0b798288eed3a7223464b56abcef72f11c832a168dc79e88279d4f8c08fa842", [:mix], [{:file_system, "~> 0.2 or ~> 1.0", [hex: :file_system, repo: "hexpm", optional: false]}], "hexpm", "9752026f7b258b1dee5fca7937a2b30c3a15e61ed07a4366c78cf077850a4325"}, @@ -49,6 +52,7 @@ "plug_crypto": {:hex, :plug_crypto, "2.1.1", "19bda8184399cb24afa10be734f84a16ea0a2bc65054e23a62bb10f06bc89491", [:mix], [], "hexpm", "6470bce6ffe41c8bd497612ffde1a7e4af67f36a15eea5f921af71cf3e11247c"}, "postgrex": {:hex, :postgrex, "0.22.0", "fb027b58b6eab1f6de5396a2abcdaaeb168f9ed4eccbb594e6ac393b02078cbd", [:mix], [{:db_connection, "~> 2.9", [hex: :db_connection, repo: "hexpm", optional: false]}, {:decimal, "~> 1.5 or ~> 2.0", [hex: :decimal, repo: "hexpm", optional: false]}, {:jason, "~> 1.0", [hex: :jason, repo: "hexpm", optional: true]}, {:table, "~> 0.1.0", [hex: :table, repo: "hexpm", optional: true]}], "hexpm", "a68c4261e299597909e03e6f8ff5a13876f5caadaddd0d23af0d0a61afcc5d84"}, "ranch": {:hex, :ranch, "2.2.0", "25528f82bc8d7c6152c57666ca99ec716510fe0925cb188172f41ce93117b1b0", [:make, :rebar3], [], "hexpm", "fa0b99a1780c80218a4197a59ea8d3bdae32fbff7e88527d7d8a4787eff4f8e7"}, + "req": {:hex, :req, "0.5.17", "0096ddd5b0ed6f576a03dde4b158a0c727215b15d2795e59e0916c6971066ede", [:mix], [{:brotli, "~> 0.3.1", [hex: :brotli, repo: "hexpm", optional: true]}, {:ezstd, "~> 1.0", [hex: :ezstd, repo: "hexpm", optional: true]}, {:finch, "~> 0.17", [hex: :finch, repo: "hexpm", optional: false]}, {:jason, "~> 1.0", [hex: :jason, repo: "hexpm", optional: false]}, {:mime, "~> 2.0.6 or ~> 2.1", [hex: :mime, repo: "hexpm", optional: false]}, {:nimble_csv, "~> 1.0", [hex: :nimble_csv, repo: "hexpm", optional: true]}, {:plug, "~> 1.0", [hex: :plug, repo: "hexpm", optional: true]}], "hexpm", "0b8bc6ffdfebbc07968e59d3ff96d52f2202d0536f10fef4dc11dc02a2a43e39"}, "swoosh": {:hex, :swoosh, "1.23.0", "a1b7f41705357ffb06457d177e734bf378022901ce53889a68bcc59d10a23c27", [:mix], [{:bandit, ">= 1.0.0", [hex: :bandit, 
repo: "hexpm", optional: true]}, {:cowboy, "~> 1.1 or ~> 2.4", [hex: :cowboy, repo: "hexpm", optional: true]}, {:ex_aws, "~> 2.1", [hex: :ex_aws, repo: "hexpm", optional: true]}, {:finch, "~> 0.6", [hex: :finch, repo: "hexpm", optional: true]}, {:gen_smtp, "~> 0.13 or ~> 1.0", [hex: :gen_smtp, repo: "hexpm", optional: true]}, {:hackney, "~> 1.9", [hex: :hackney, repo: "hexpm", optional: true]}, {:idna, "~> 6.0", [hex: :idna, repo: "hexpm", optional: false]}, {:jason, "~> 1.0", [hex: :jason, repo: "hexpm", optional: false]}, {:mail, "~> 0.2", [hex: :mail, repo: "hexpm", optional: true]}, {:mime, "~> 1.1 or ~> 2.0", [hex: :mime, repo: "hexpm", optional: false]}, {:mua, "~> 0.2.3", [hex: :mua, repo: "hexpm", optional: true]}, {:multipart, "~> 0.4", [hex: :multipart, repo: "hexpm", optional: true]}, {:plug, "~> 1.9", [hex: :plug, repo: "hexpm", optional: true]}, {:plug_cowboy, ">= 1.0.0", [hex: :plug_cowboy, repo: "hexpm", optional: true]}, {:req, "~> 0.5.10 or ~> 0.6 or ~> 1.0", [hex: :req, repo: "hexpm", optional: true]}, {:telemetry, "~> 0.4.2 or ~> 1.0", [hex: :telemetry, repo: "hexpm", optional: false]}], "hexpm", "97aaf04481ce8a351e2d15a3907778bdf3b1ea071cfff3eb8728b65943c77f6d"}, "tailwind": {:hex, :tailwind, "0.4.1", "e7bcc222fe96a1e55f948e76d13dd84a1a7653fb051d2a167135db3b4b08d3e9", [:mix], [], "hexpm", "6249d4f9819052911120dbdbe9e532e6bd64ea23476056adb7f730aa25c220d1"}, "telemetry": {:hex, :telemetry, "1.3.0", "fedebbae410d715cf8e7062c96a1ef32ec22e764197f70cda73d82778d61e7a2", [:rebar3], [], "hexpm", "7015fc8919dbe63764f4b4b87a95b7c0996bd539e0d499be6ec9d7f3875b79e6"}, diff --git a/test/evidence_graph/lithoglyph/ner_extractor_test.exs b/test/evidence_graph/lithoglyph/ner_extractor_test.exs new file mode 100644 index 0000000..ce8178a --- /dev/null +++ b/test/evidence_graph/lithoglyph/ner_extractor_test.exs @@ -0,0 +1,130 @@ +# SPDX-License-Identifier: PMPL-1.0-or-later +# Copyright (c) 2026 Jonathan D.A. 
Jewell (hyperpolymath) +defmodule EvidenceGraph.Lithoglyph.NERExtractorTest do + use ExUnit.Case, async: true + + alias EvidenceGraph.Lithoglyph.NERExtractor + + describe "extract/1" do + test "returns empty list for nil" do + assert NERExtractor.extract(nil) == [] + end + + test "returns empty list for empty string" do + assert NERExtractor.extract("") == [] + end + + test "extracts capitalised multi-word sequences" do + content = "The report by Jeffrey Epstein was filed in New York." + entities = NERExtractor.extract(content) + + assert "Jeffrey Epstein" in entities + assert "New York" in entities + end + + test "extracts titled names" do + content = "Dr. Jane Smith and Prof. Alan Turing discussed the results." + entities = NERExtractor.extract(content) + + assert Enum.any?(entities, &String.contains?(&1, "Jane Smith")) + assert Enum.any?(entities, &String.contains?(&1, "Alan Turing")) + end + + test "extracts organisation names with suffixes" do + content = "Documents from HSBC Holdings plc showed transfers to Global Fund Ltd." + entities = NERExtractor.extract(content) + + assert Enum.any?(entities, &String.contains?(&1, "HSBC")) + assert Enum.any?(entities, &String.contains?(&1, "Global Fund")) + end + + test "filters out stopwords" do + content = "The Evidence from the Investigation was clear." + entities = NERExtractor.extract(content) + + refute "The Evidence" in entities + refute "Evidence" in entities + refute "Investigation" in entities + end + + test "deduplicates entity names" do + content = "Jeffrey Epstein met with Jeffrey Epstein's lawyer." + entities = NERExtractor.extract(content) + + epstein_count = + Enum.count(entities, &(&1 == "Jeffrey Epstein")) + + assert epstein_count <= 1 + end + + test "handles complex investigative journalism content" do + content = """ + Court documents filed in the Southern District of New York reveal that + Ghislaine Maxwell facilitated meetings between Jeffrey Epstein and + Prince Andrew at Buckingham Palace. 
Deutsche Bank processed multiple + wire transfers totalling $4.6 million. The Federal Bureau of Investigation + obtained flight logs from Palm Beach International Airport showing + frequent travel to Little Saint James. + """ + + entities = NERExtractor.extract(content) + + assert Enum.any?(entities, &String.contains?(&1, "Ghislaine Maxwell")) + assert Enum.any?(entities, &String.contains?(&1, "Jeffrey Epstein")) + assert Enum.any?(entities, &String.contains?(&1, "Prince Andrew")) + assert Enum.any?(entities, &String.contains?(&1, "Deutsche Bank")) + end + + test "rejects very short names" do + content = "A B met C D at the conference." + entities = NERExtractor.extract(content) + + refute Enum.any?(entities, &(String.length(&1) < 2)) + end + end + + describe "extract_from_evidence/1" do + test "extracts from metadata.content_text" do + evidence = %{ + title: "Epstein Flight Logs", + metadata: %{ + content_text: "Jeffrey Epstein flew to Palm Beach on his private jet." + } + } + + entities = NERExtractor.extract_from_evidence(evidence) + + assert Enum.any?(entities, &String.contains?(&1, "Jeffrey Epstein")) + assert Enum.any?(entities, &String.contains?(&1, "Palm Beach")) + end + + test "includes title in extraction" do + evidence = %{ + title: "HSBC Holdings Investigation Report", + metadata: %{content_text: "The bank was fined for compliance failures."} + } + + entities = NERExtractor.extract_from_evidence(evidence) + + assert Enum.any?(entities, &String.contains?(&1, "HSBC Holdings")) + end + + test "handles string keys in metadata" do + evidence = %{ + "title" => "Maxwell Documents", + "metadata" => %{"content_text" => "Ghislaine Maxwell attended the event."} + } + + entities = NERExtractor.extract_from_evidence(evidence) + + assert Enum.any?(entities, &String.contains?(&1, "Ghislaine Maxwell")) + end + + test "handles missing content gracefully" do + evidence = %{title: "Empty", metadata: %{}} + entities = NERExtractor.extract_from_evidence(evidence) + + assert 
is_list(entities) + end + end +end