-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathRegexHeadingClassifier.cs
More file actions
126 lines (113 loc) · 5.05 KB
/
RegexHeadingClassifier.cs
File metadata and controls
126 lines (113 loc) · 5.05 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
// Copyright (c) Jong Hyun Kim. All rights reserved.
// Licensed under the Apache License, Version 2.0.
using System.Text.RegularExpressions;
using PdfStruct.Models;
namespace PdfStruct.Analysis;
/// <summary>
/// Pairs a regex with the heading level, and optionally the structural label,
/// that <see cref="RegexHeadingClassifier"/> assigns to a text block whose
/// first line matches the regex.
/// </summary>
/// <param name="Match">
/// Regex tested with <see cref="Regex.IsMatch(string)"/> against the trimmed
/// first line of a block's text. <c>IsMatch</c> succeeds on a match anywhere
/// in that line, so anchor the pattern (for example with <c>^</c>) when the
/// marker must appear at the start of the line.
/// </param>
/// <param name="HeadingLevel">
/// Heading level copied to <see cref="HeadingElement.HeadingLevel"/> on a
/// match (1 = h1, 2 = h2, ...). This type does not range-check the value.
/// </param>
/// <param name="Label">
/// Optional string copied to <see cref="HeadingElement.Level"/> on a match.
/// When <c>null</c>, the classifier derives a default label from
/// <paramref name="HeadingLevel"/>: <c>Doctitle</c> for level 1 and
/// <c>Subtitle</c> for every other level.
/// </param>
public readonly record struct HeadingPattern(
Regex Match,
int HeadingLevel,
string? Label = null);
/// <summary>
/// Classifies text blocks as headings when the block's first non-blank line
/// matches any caller-supplied regex pattern. Language- and domain-agnostic:
/// all pattern knowledge is injected at construction time, so callers can wire
/// in patterns appropriate to their corpus (legal, scientific, contracts,
/// etc.) without the library hard-coding any of them.
/// </summary>
/// <remarks>
/// Designed to be composed in front of <see cref="FontBasedElementClassifier"/>
/// via <see cref="CompositeElementClassifier"/> when font-size signals alone
/// are insufficient — for example, in documents whose chapter and section
/// headings are typeset at the same size as body text and rely on textual
/// markers ("Chapter 1.", "제1장 총강", "Article I.") for hierarchy.
/// Blocks that match no pattern are emitted as <see cref="ParagraphElement"/>.
/// </remarks>
public sealed class RegexHeadingClassifier : IElementClassifier
{
    private readonly HeadingPattern[] _patterns;

    /// <summary>
    /// Initializes a new <see cref="RegexHeadingClassifier"/> with the supplied
    /// patterns. Patterns are evaluated in order; the first match wins.
    /// </summary>
    /// <param name="patterns">Regex patterns paired with their target heading levels.</param>
    /// <exception cref="ArgumentNullException">Thrown when <paramref name="patterns"/> is <c>null</c>.</exception>
    /// <exception cref="ArgumentException">
    /// Thrown when any pattern has a <c>null</c> <see cref="HeadingPattern.Match"/>.
    /// </exception>
    public RegexHeadingClassifier(IEnumerable<HeadingPattern> patterns)
    {
        ArgumentNullException.ThrowIfNull(patterns);

        // Snapshot the sequence so later changes to the caller's collection
        // cannot affect classification and the source is enumerated only once.
        _patterns = patterns.ToArray();

        // HeadingPattern is a struct, so default(HeadingPattern) carries a null
        // regex. Reject it here rather than failing with a
        // NullReferenceException in the middle of a classification pass.
        for (var index = 0; index < _patterns.Length; index++)
        {
            if (_patterns[index].Match is null)
            {
                throw new ArgumentException(
                    $"Pattern at index {index} has a null {nameof(HeadingPattern.Match)} regex.",
                    nameof(patterns));
            }
        }
    }

    /// <inheritdoc />
    /// <exception cref="ArgumentNullException">Thrown when <paramref name="documentBlocks"/> is <c>null</c>.</exception>
    public IReadOnlyList<ContentElement> Classify(
        IReadOnlyList<DocumentTextBlock> documentBlocks, ref int startId)
    {
        ArgumentNullException.ThrowIfNull(documentBlocks);

        var results = new List<ContentElement>(documentBlocks.Count);
        foreach (var entry in documentBlocks)
        {
            // Stats-only entries produce no output element and consume no id.
            if (entry.IsStatsOnly)
            {
                continue;
            }

            // Exactly one of the two factories runs per block, so startId
            // advances by one for every emitted element.
            ContentElement element = TryClassifyHeading(entry.Block, entry.PageNumber, ref startId)
                ?? (ContentElement)CreateParagraph(entry.Block, entry.PageNumber, ref startId);
            results.Add(element);
        }

        return results;
    }

    /// <summary>
    /// Returns a <see cref="HeadingElement"/> when the block's first non-blank
    /// line matches a configured pattern, otherwise <c>null</c>.
    /// </summary>
    /// <param name="block">Text block to test.</param>
    /// <param name="pageNumber">Page number recorded on the created element.</param>
    /// <param name="id">Next element id; incremented only when a heading is created.</param>
    private HeadingElement? TryClassifyHeading(TextBlock block, int pageNumber, ref int id)
    {
        var firstLine = FirstLine(block.Text);
        if (firstLine.Length == 0)
        {
            return null;
        }

        foreach (var pattern in _patterns)
        {
            if (!pattern.Match.IsMatch(firstLine))
            {
                continue;
            }

            // First matching pattern wins; later patterns are not consulted.
            return new HeadingElement
            {
                Id = id++,
                PageNumber = pageNumber,
                BoundingBox = block.BoundingBox,
                HeadingLevel = pattern.HeadingLevel,
                Level = pattern.Label ?? DefaultLevelLabel(pattern.HeadingLevel),
                Text = CreateTextProperties(block)
            };
        }

        return null;
    }

    /// <summary>
    /// Creates a fallback <see cref="ParagraphElement"/> for blocks that did
    /// not match any heading pattern.
    /// </summary>
    /// <param name="block">Text block to wrap.</param>
    /// <param name="pageNumber">Page number recorded on the created element.</param>
    /// <param name="id">Next element id; always incremented.</param>
    private static ParagraphElement CreateParagraph(TextBlock block, int pageNumber, ref int id) => new()
    {
        Id = id++,
        PageNumber = pageNumber,
        BoundingBox = block.BoundingBox,
        Text = CreateTextProperties(block)
    };

    /// <summary>
    /// Builds the <see cref="TextProperties"/> shared by headings and
    /// paragraphs: the block's font name, font size and trimmed full text.
    /// </summary>
    private static TextProperties CreateTextProperties(TextBlock block) => new()
    {
        Font = block.FontName,
        FontSize = block.FontSize,
        Content = block.Text.Trim()
    };

    /// <summary>
    /// Extracts the first non-blank line of a block's text, trimmed. Lines are
    /// delimited by <c>'\n'</c>. Returns an empty string when the text holds
    /// nothing but white space.
    /// </summary>
    private static string FirstLine(string text)
    {
        // Dropping all leading white space (including line breaks) puts the
        // first non-blank line at the start, so leading blank lines can no
        // longer hide a heading marker.
        var trimmed = text.TrimStart();
        var newline = trimmed.IndexOf('\n');

        // TrimEnd also removes the '\r' left behind by "\r\n" line endings.
        return (newline >= 0 ? trimmed[..newline] : trimmed).TrimEnd();
    }

    /// <summary>
    /// Returns the default <see cref="HeadingElement.Level"/> string for a
    /// heading level when the pattern does not supply its own label:
    /// <c>Doctitle</c> for level 1, <c>Subtitle</c> for every other level.
    /// </summary>
    private static string DefaultLevelLabel(int level) => level == 1 ? "Doctitle" : "Subtitle";
}