ForgeLM/tests/test_integration.py at main · HodeTech/ForgeLM · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
"""Integration smoke test — CPU only, no GPU/torch required.

Tests the full pipeline features that don't require model training:
- Config validation with all Phase 8 fields
- CLI dry-run with compliance/risk/governance config
- Compliance export standalone
- Audit logger
- Model integrity verification
- Deployer instructions generation
- Evidence bundle export
- Data format detection
- Wizard config generation (mocked input)
"""

import json
import os
from unittest.mock import patch

import pytest
import yaml

from forgelm.cli import (
    EXIT_SUCCESS,
    _run_compliance_export,
    _run_dry_run,
    main,
)
from forgelm.compliance import (
    AuditLogger,
    export_compliance_artifacts,
    export_evidence_bundle,
    generate_deployer_instructions,
    generate_model_integrity,
    generate_training_manifest,
)
from forgelm.config import ForgeConfig, load_config

try:
    from forgelm.data import _detect_dataset_format
except ImportError:
    _detect_dataset_format = None


def _full_config():
    """A config using ALL Phase 8 fields."""
    return {
        "model": {
            "name_or_path": "HuggingFaceTB/SmolLM2-135M-Instruct",
            "max_length": 512,
            "load_in_4bit": False,
            "trust_remote_code": False,
            "offline": False,
        },
        "lora": {
            "r": 16,
            "alpha": 32,
            "method": "dora",
            "target_modules": ["q_proj", "v_proj"],
        },
        "training": {
            "trainer_type": "sft",
            "output_dir": "./test_checkpoints",
            "num_train_epochs": 1,
            "per_device_train_batch_size": 2,
            "learning_rate": 2e-5,
            "report_to": "none",
        },
        "data": {
            "dataset_name_or_path": "test_data.jsonl",
            "governance": {
                "collection_method": "Manual curation by domain experts",
                "annotation_process": "Two annotators, adjudication by senior",
                "known_biases": "English-skewed, EU region only",
                "personal_data_included": True,
                "dpia_completed": True,
            },
        },
        "evaluation": {
            "auto_revert": True,
            "max_acceptable_loss": 2.0,
            "require_human_approval": False,
            "benchmark": {
                "enabled": True,
                "tasks": ["arc_easy"],
                "min_score": 0.3,
            },
            "safety": {
                "enabled": True,
                "classifier": "meta-llama/Llama-Guard-3-8B",
                "test_prompts": "safety_prompts.jsonl",
                "max_safety_regression": 0.05,
            },
            "llm_judge": {
                "enabled": True,
                "judge_model": "gpt-4o",
                "judge_api_key_env": "OPENAI_API_KEY",
                "eval_dataset": "eval_prompts.jsonl",
                "min_score": 7.0,
            },
        },
        "compliance": {
            "provider_name": "Test Corp",
            "provider_contact": "ai@testcorp.com",
            "system_name": "Customer Support Bot",
            "intended_purpose": "Automated customer support for insurance claims",
            "known_limitations": "Not suitable for medical or legal advice",
            "system_version": "2.1.0",
            "risk_classification": "high-risk",
        },
        "risk_assessment": {
            "intended_use": "Customer support chatbot for insurance",
            "foreseeable_misuse": [
                "Users may ask for medical advice",
                "Model may generate incorrect policy details",
            ],
            "risk_category": "high-risk",
            "mitigation_measures": [
                "Safety classifier blocks harmful outputs",
                "Human review required for policy responses",
            ],
            "vulnerable_groups_considered": True,
        },
        "monitoring": {
            "enabled": True,
            "endpoint_env": "MONITORING_URL",
            "metrics_export": "prometheus",
            "alert_on_drift": True,
            "check_interval_hours": 12,
        },
        "webhook": {
            "url_env": "FORGELM_WEBHOOK_URL",
            "notify_on_start": True,
            "notify_on_success": True,
            "notify_on_failure": True,
            "timeout": 10,
        },
    }


class TestFullConfigValidation:
    """Validation checks for a config dict that fills in every section."""

    def test_full_config_parses(self):
        """``ForgeConfig`` built from the full dict exposes the expected values."""
        parsed = ForgeConfig(**_full_config())
        assert parsed.model.name_or_path == "HuggingFaceTB/SmolLM2-135M-Instruct"
        assert parsed.compliance.risk_classification == "high-risk"
        assert parsed.risk_assessment.risk_category == "high-risk"
        assert parsed.data.governance.dpia_completed is True
        assert parsed.monitoring.metrics_export == "prometheus"
        assert parsed.evaluation.require_human_approval is False
        assert parsed.webhook.timeout == 10

    def test_full_config_yaml_round_trip(self, tmp_path):
        """Dump the full dict to YAML, reload it via ``load_config``, spot-check fields."""
        yaml_file = tmp_path / "full_config.yaml"
        with yaml_file.open("w") as handle:
            yaml.dump(_full_config(), handle)

        loaded = load_config(str(yaml_file))
        assert loaded.compliance.provider_name == "Test Corp"
        assert loaded.risk_assessment.foreseeable_misuse[0] == "Users may ask for medical advice"
        assert loaded.data.governance.collection_method == "Manual curation by domain experts"

    @pytest.mark.parametrize("tier", ["high-risk", "unacceptable"])
    def test_strict_tier_without_safety_raises_config_error(self, tier):
        """Both strict risk tiers must reject a config with no safety block.

        Wave 3 / Faz 28 F-compliance-110 + F-W3FU-T-05: for the Article 9
        ``high-risk`` tier and the Article 5 ``unacceptable`` tier, a config
        whose ``evaluation.safety`` section has been removed must raise
        ``ConfigError`` mentioning ``evaluation.safety.enabled``.  The
        validator-layer contract is pinned by the unit test in
        test_eu_ai_act.py; this test exercises the same rule starting from
        the full integration config.  The config is constructed directly
        from the dict here (no YAML file is involved in this test).
        """
        # Imported locally, as in the original test, rather than at module level.
        from forgelm.config import ConfigError

        payload = _full_config()
        payload["risk_assessment"]["risk_category"] = tier
        payload["evaluation"].pop("safety")
        with pytest.raises(ConfigError, match="evaluation.safety.enabled"):
            ForgeConfig(**payload)


class TestDryRunWithCompliance:
    """Exercise the dry-run path with the full Phase 8 config."""

    def test_dry_run_json_includes_compliance(self, capsys):
        """``_run_dry_run`` in JSON mode prints a payload whose status is ``valid``."""
        _run_dry_run(ForgeConfig(**_full_config()), "json")
        captured = capsys.readouterr()
        payload = json.loads(captured.out)
        assert payload["status"] == "valid"

    def test_dry_run_via_main(self, tmp_path):
        """Running ``main`` with ``--dry-run`` exits with ``EXIT_SUCCESS``."""
        config_file = tmp_path / "config.yaml"
        with config_file.open("w") as handle:
            yaml.dump(_full_config(), handle)

        argv = ["forgelm", "--config", str(config_file), "--dry-run", "--output-format", "json"]
        # main() is expected to terminate through SystemExit; capture it so
        # the exit code can be inspected.
        with patch("sys.argv", argv), pytest.raises(SystemExit) as exc_info:
            main()
        assert exc_info.value.code == EXIT_SUCCESS


class TestComplianceExportIntegration:
    """End-to-end checks for the standalone compliance export."""

    def test_export_creates_all_artifacts(self, tmp_path):
        """Text-mode export writes each of the five expected artifact files."""
        export_dir = str(tmp_path / "audit")
        _run_compliance_export(ForgeConfig(**_full_config()), export_dir, "text")

        expected_files = (
            "compliance_report.json",
            "training_manifest.yaml",
            "data_provenance.json",
            "risk_assessment.json",
            "annex_iv_metadata.json",
        )
        for filename in expected_files:
            assert os.path.isfile(os.path.join(export_dir, filename))

    def test_compliance_report_has_annex_iv(self, tmp_path):
        """The training manifest carries the provider block under ``annex_iv``."""
        report = generate_training_manifest(ForgeConfig(**_full_config()), {"eval_loss": 0.5})

        assert "annex_iv" in report
        annex = report["annex_iv"]
        assert annex["provider_name"] == "Test Corp"
        assert annex["risk_classification"] == "high-risk"
        assert annex["intended_purpose"] == "Automated customer support for insurance claims"

    def test_compliance_report_has_risk_assessment(self, tmp_path):
        """The training manifest includes the risk assessment section."""
        report = generate_training_manifest(ForgeConfig(**_full_config()), {})

        assert "risk_assessment" in report
        assessment = report["risk_assessment"]
        assert assessment["risk_category"] == "high-risk"
        assert len(assessment["foreseeable_misuse"]) == 2

    def test_compliance_report_has_monitoring(self, tmp_path):
        """The training manifest includes the monitoring section."""
        report = generate_training_manifest(ForgeConfig(**_full_config()), {})

        assert "monitoring" in report
        assert report["monitoring"]["metrics_export"] == "prometheus"

    def test_export_json_output(self, tmp_path, capsys):
        """JSON-mode export prints a success payload listing at least five files."""
        export_dir = str(tmp_path / "audit")
        _run_compliance_export(ForgeConfig(**_full_config()), export_dir, "json")

        payload = json.loads(capsys.readouterr().out)
        assert payload["success"] is True
        assert len(payload["files"]) >= 5

    def test_annex_iv_file_content(self, tmp_path):
        """The written ``annex_iv_metadata.json`` has both provider layouts and a hash."""
        export_dir = str(tmp_path / "audit")
        report = generate_training_manifest(ForgeConfig(**_full_config()), {"eval_loss": 0.5})
        export_compliance_artifacts(report, export_dir)

        with open(os.path.join(export_dir, "annex_iv_metadata.json")) as handle:
            annex = json.load(handle)

        # Wave 2b Round-4 (F-W2B-01): the writer emits the §1-9 canonical
        # layout accepted by ``verify-annex-iv``, and keeps the
        # operator-friendly 7-key provider block under ``provider_metadata``.
        # Both locations must report the same provider and system names.
        for section in ("system_identification", "provider_metadata"):
            assert annex[section]["provider_name"] == "Test Corp"
            assert annex[section]["system_name"] == "Customer Support Bot"

        # The tampering-detection hash must be present.
        assert "metadata" in annex
        assert "manifest_hash" in annex["metadata"]

    def test_risk_assessment_file_content(self, tmp_path):
        """The written ``risk_assessment.json`` reflects the configured values."""
        export_dir = str(tmp_path / "audit")
        report = generate_training_manifest(ForgeConfig(**_full_config()), {})
        export_compliance_artifacts(report, export_dir)

        with open(os.path.join(export_dir, "risk_assessment.json")) as handle:
            assessment = json.load(handle)
        assert assessment["intended_use"] == "Customer support chatbot for insurance"
        assert assessment["vulnerable_groups_considered"] is True


class TestAuditLoggerIntegration:
    """End-to-end check of the audit logger's JSONL output."""

    def test_full_event_chain(self, tmp_path):
        """Log six events, then read ``audit_log.jsonl`` back and verify them."""
        log_dir = str(tmp_path)
        recorder = AuditLogger(log_dir, run_id="test-run-001")

        # Sonar python:S5443 hotspot avoidance: the path-shaped value for
        # ``human_approval.required`` is derived from the per-test
        # ``tmp_path`` fixture so it cannot be misread as a reference to a
        # publicly-writable directory.  It is only a structured-log payload;
        # no file is created at that path.
        emitted = [
            ("pipeline.initialized", {"model": "test-model"}),
            ("training.started", {}),
            ("evaluation.loss_check", {"eval_loss": 0.5, "passed": True}),
            ("evaluation.safety", {"safe_ratio": 0.95, "passed": True}),
            ("human_approval.required", {"model_path": str(tmp_path / "model")}),
            ("pipeline.completed", {"success": True}),
        ]
        for event_name, fields in emitted:
            recorder.log_event(event_name, **fields)

        with open(os.path.join(log_dir, "audit_log.jsonl")) as handle:
            records = [json.loads(raw) for raw in handle]

        assert len(records) == 6
        assert all(rec["run_id"] == "test-run-001" for rec in records)
        assert records[0]["event"] == "pipeline.initialized"
        assert records[3]["event"] == "evaluation.safety"
        assert records[3]["safe_ratio"] == pytest.approx(0.95)
        assert records[4]["event"] == "human_approval.required"
        assert records[5]["event"] == "pipeline.completed"

        # Timestamps must already be in non-decreasing (chronological) order.
        stamps = [rec["timestamp"] for rec in records]
        assert stamps == sorted(stamps)


class TestModelIntegrityIntegration:
    """Test model integrity verification end-to-end."""

    def test_checksums_on_real_files(self, tmp_path):
        """Checksum four real files and confirm a second pass gives the same result.

        A small fake model directory is written under ``tmp_path``, then
        ``generate_model_integrity`` is run twice on it.  The two artifact
        lists must have the same length and matching ``sha256`` /
        ``size_bytes`` values pairwise.
        """
        model_dir = tmp_path / "final_model"
        model_dir.mkdir()
        (model_dir / "adapter_model.safetensors").write_bytes(b"fake adapter weights " * 100)
        (model_dir / "adapter_config.json").write_text('{"r": 16, "alpha": 32}')
        (model_dir / "tokenizer.json").write_text('{"model": "test"}')
        (model_dir / "tokenizer_config.json").write_text('{"pad_token": "<pad>"}')

        integrity = generate_model_integrity(str(model_dir))
        assert len(integrity["artifacts"]) == 4
        assert integrity["verified_at"]

        # Verify checksums are deterministic
        integrity2 = generate_model_integrity(str(model_dir))
        # zip() stops at the shorter input, so without this length check a
        # second run that dropped artifacts would still pass the loop below.
        assert len(integrity2["artifacts"]) == len(integrity["artifacts"])
        for a1, a2 in zip(integrity["artifacts"], integrity2["artifacts"]):
            assert a1["sha256"] == a2["sha256"]
            assert a1["size_bytes"] == a2["size_bytes"]


class TestDeployerInstructionsIntegration:
    """Test deployer instructions generation end-to-end."""

    def test_full_instructions(self, tmp_path):
        """Generate the deployer document and check it mentions the key config values.

        ``generate_deployer_instructions`` returns the path of the document it
        wrote; the test reads that file back and looks for provider, system,
        purpose, misuse and metric strings plus two fixed section headings.
        """
        cfg = ForgeConfig(**_full_config())
        final_path = str(tmp_path / "model")
        doc_path = generate_deployer_instructions(cfg, {"eval_loss": 0.5, "safety/safe_ratio": 0.97}, final_path)

        # Read inside a context manager so the file handle is closed
        # deterministically instead of being left to garbage collection.
        with open(doc_path) as f:
            content = f.read()
        # Metric names + bullet bodies + table values are CommonMark-escaped
        # by ``_sanitize_md`` so config-derived strings cannot inject pipes,
        # headings, or links. Stripping backslashes recovers the
        # human-readable form for these substring assertions.
        plain = content.replace("\\", "")
        assert "Test Corp" in plain
        assert "Customer Support Bot" in plain
        assert "insurance claims" in plain
        assert "medical" in plain.lower()  # foreseeable misuse
        assert "eval_loss" in plain
        assert "Human Oversight" in content
        assert "Incident Reporting" in content


class TestEvidenceBundleIntegration:
    """End-to-end check of evidence bundle ZIP creation."""

    def test_bundle_contains_all_files(self, tmp_path):
        """Export the compliance artifacts, bundle them, and inspect the ZIP members."""
        import zipfile

        # Step 1: write the compliance artifacts to a directory.
        artifacts_dir = str(tmp_path / "compliance")
        report = generate_training_manifest(ForgeConfig(**_full_config()), {"eval_loss": 0.5})
        written = export_compliance_artifacts(report, artifacts_dir)
        assert len(written) >= 5

        # Step 2: pack that directory into a ZIP bundle.
        archive_path = str(tmp_path / "evidence_bundle.zip")
        returned_path = export_evidence_bundle(artifacts_dir, archive_path)
        assert os.path.isfile(returned_path)

        # Step 3: the archive must list the key artifacts among its members.
        with zipfile.ZipFile(archive_path) as archive:
            members = archive.namelist()
        assert len(members) >= 5
        for required in ("compliance_report.json", "risk_assessment.json", "annex_iv_metadata.json"):
            assert any(required in member for member in members)


@pytest.mark.skipif(_detect_dataset_format is None, reason="datasets library not installed")
class TestDataFormatDetection:
    """Column-name based dataset format auto-detection.

    The whole class is skipped when ``_detect_dataset_format`` could not be
    imported (the module-level import fallback sets it to ``None``).
    """

    def test_sft_format(self):
        """User/Assistant/System columns suggest the SFT trainer."""
        detected = _detect_dataset_format(["User", "Assistant", "System"])
        assert detected["suggested_trainer"] == "sft"
        description = detected["description"]
        assert "instruction" in description.lower() or "User" in description

    def test_dpo_format(self):
        """prompt/chosen/rejected columns suggest the DPO trainer."""
        columns = ["prompt", "chosen", "rejected"]
        assert _detect_dataset_format(columns)["suggested_trainer"] == "dpo"

    def test_kto_format(self):
        """prompt/completion/label columns suggest the KTO trainer."""
        columns = ["prompt", "completion", "label"]
        assert _detect_dataset_format(columns)["suggested_trainer"] == "kto"

    def test_grpo_format(self):
        """A lone prompt column suggests the GRPO trainer."""
        columns = ["prompt"]
        assert _detect_dataset_format(columns)["suggested_trainer"] == "grpo"

    def test_messages_format(self):
        """A lone messages column suggests the SFT trainer."""
        columns = ["messages"]
        assert _detect_dataset_format(columns)["suggested_trainer"] == "sft"

    def test_unknown_format(self):
        """Unrecognised columns fall back to SFT with an 'unknown' description."""
        detected = _detect_dataset_format(["col_a", "col_b", "col_c"])
        assert detected["suggested_trainer"] == "sft"
        assert "unknown" in detected["description"].lower()


class TestConfigTemplateWithPhase8:
    """Verify config_template.yaml includes Phase 8 sections.

    The template is looked up one directory above this test file.  The path
    construction and file read are shared through private helpers so all
    four tests target the same file in the same way.
    """

    @staticmethod
    def _template_path():
        """Return the path of ``config_template.yaml`` in the parent of this file's directory."""
        return os.path.join(os.path.dirname(__file__), "..", "config_template.yaml")

    @classmethod
    def _template_text(cls):
        """Return the raw text of the template.

        The file is decoded explicitly as UTF-8 so the result does not depend
        on the platform's locale-default encoding.
        """
        with open(cls._template_path(), encoding="utf-8") as f:
            return f.read()

    def test_template_has_compliance_section(self):
        """The template text mentions the compliance section and two of its keys."""
        content = self._template_text()
        assert "compliance:" in content
        assert "provider_name:" in content
        assert "risk_classification:" in content

    def test_template_has_risk_assessment_section(self):
        """The template text mentions the risk assessment section."""
        content = self._template_text()
        assert "risk_assessment:" in content
        assert "foreseeable_misuse:" in content

    def test_template_has_monitoring_section(self):
        """The template text mentions the monitoring section."""
        content = self._template_text()
        assert "monitoring:" in content
        assert "metrics_export:" in content

    def test_template_still_parses(self):
        """``load_config`` accepts the template and yields an SFT config with a model name."""
        cfg = load_config(self._template_path())
        assert cfg.model.name_or_path
        assert cfg.training.trainer_type == "sft"