-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcli.py
More file actions
137 lines (104 loc) · 4.92 KB
/
Copy pathcli.py
File metadata and controls
137 lines (104 loc) · 4.92 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
"""CLI entry point for the BridgingWorlds pipeline (Python stages)."""
from __future__ import annotations
import json
from pathlib import Path
import click
from rich.console import Console
from .config import load_config
# Shared Rich console; every command in this module prints its status
# output through this single instance.
console = Console()
@click.group()
@click.option("--config", "config_path", default="config/default.yaml", help="Config file path")
@click.pass_context
def main(ctx, config_path):
    """BridgingWorlds — Social media data portability pipeline."""
    # Root command group. Makes sure ``ctx.obj`` is a dict, then stores
    # whatever ``load_config`` returns for ``config_path`` under the
    # "config" key of that dict.
    ctx.ensure_object(dict)
    settings = load_config(config_path)
    ctx.obj["config"] = settings
@main.command()
@click.argument("source", type=click.Path(exists=True))
@click.option("--output-dir", default="output/normalized", help="Output directory")
def ingest(source, output_dir):
    """Parse an Instagram GDPR/DMA export (ZIP or directory)."""
    # Stage modules are imported lazily, inside the command body.
    from .ingest.extractor import extract_export
    from .ingest.normalizer import normalize_all

    console.print(f"[bold]Ingesting from:[/bold] {source}")

    # Resolve SOURCE to an export root and report it.
    root = extract_export(source)
    console.print(f"Export root: {root}")

    # Normalize the export into ``output_dir``; the result maps a name to
    # an item count, which is echoed back line by line.
    item_counts = normalize_all(root, Path(output_dir))

    console.print("\n[bold green]Ingest complete:[/bold green]")
    for label, n_items in item_counts.items():
        console.print(f" {label}: {n_items} items")
@main.command()
@click.option("--input-dir", default="output/normalized", help="Normalized JSON directory")
@click.option("--output-dir", default="output/rdf", help="RDF output directory")
@click.option("--username", required=True, help="Instagram username")
def convert(input_dir, output_dir, username):
    """Convert normalized data to RDF Turtle files."""
    # Parameters:
    #   input_dir  -- directory holding the normalized JSON files.
    #   output_dir -- directory that receives the RDF output.
    #   username   -- Instagram username passed through to ``convert_all``.
    #
    # After conversion, a media manifest is written on a best-effort basis
    # if an "instagram-*" export directory can be located two levels above
    # ``input_dir``.
    from .convert.graph_builder import convert_all
    from .convert.media_handler import write_manifest

    console.print(f"[bold]Converting to RDF for user:[/bold] {username}")
    input_path = Path(input_dir)
    output_path = Path(output_dir)
    counts = convert_all(input_path, output_path, username)

    console.print("\n[bold green]Conversion complete (public data only):[/bold green]")
    for name, count in counts.items():
        console.print(f" {name}: {count} triples")
    console.print(f" [bold]Total: {sum(counts.values())} triples[/bold]")
    console.print(
        " [dim]Not converted (private or aggregated activity log): "
        "likes, comments, messages, saved, searches[/dim]"
    )

    # Also build media manifest if export root available.
    # Try to find it relative to input_dir (two levels up).
    export_root = input_path.parent.parent
    if not export_root.is_dir():
        # The manifest is optional: a missing directory must not turn an
        # already-successful conversion into a crash.
        return

    # ``Path.iterdir`` yields entries in arbitrary order, so sort by name to
    # make the chosen export directory deterministic when several match.
    ig_dirs = sorted(
        (d for d in export_root.iterdir() if d.is_dir() and d.name.startswith("instagram-")),
        key=lambda d: d.name,
    )
    if ig_dirs:
        media_count = write_manifest(ig_dirs[0], input_path, output_path)
        console.print(f" Media manifest: {media_count} files")
@main.command()
@click.argument("source", type=click.Path(exists=True))
@click.option("--username", required=True, help="Instagram username")
@click.option("--output-dir", default="output", help="Base output directory")
def run(source, username, output_dir):
    """Run full pipeline: ingest + convert."""
    from .ingest.extractor import extract_export
    from .ingest.normalizer import normalize_all
    from .convert.graph_builder import convert_all
    from .convert.media_handler import write_manifest

    # Both stages write beneath the same base directory.
    base = Path(output_dir)
    json_dir = base / "normalized"
    turtle_dir = base / "rdf"

    # Stage 1: ingest the export and report per-name item counts.
    console.print("[bold cyan]Stage 1: Ingest[/bold cyan]")
    root = extract_export(source)
    item_counts = normalize_all(root, json_dir)
    for label, n_items in item_counts.items():
        console.print(f" {label}: {n_items} items")

    # Stage 2: convert the normalized output to RDF and summarize the
    # triple counts returned per graph.
    console.print("\n[bold cyan]Stage 2: Convert to RDF[/bold cyan]")
    per_graph = convert_all(json_dir, turtle_dir, username)
    grand_total = 0
    for n_triples in per_graph.values():
        grand_total += n_triples
    console.print(f" Generated {grand_total} triples across {len(per_graph)} graphs")

    # The export root is already known here, so the manifest is written
    # directly from it.
    n_media = write_manifest(root, json_dir, turtle_dir)
    console.print(f" Media manifest: {n_media} files")

    console.print("\n[bold green]Pipeline complete.[/bold green]")
    console.print(f"RDF output: {turtle_dir}")
    console.print("Next: run TypeScript stages for Pod storage + export")
@main.command()
@click.argument("source", type=click.Path(exists=True))
@click.option("--normalized-dir", default="output/normalized")
@click.option("--rdf-dir", default="output/rdf")
@click.option("--output", default=None, help="Write metrics JSON to file")
def metrics(source, normalized_dir, rdf_dir, output):
    """Compute evaluation metrics for the paper."""
    from .metrics.evaluator import compute_metrics, print_metrics

    # All three locations are handed to the evaluator as Path objects.
    source_path = Path(source)
    normalized_path = Path(normalized_dir)
    rdf_path = Path(rdf_dir)

    results = compute_metrics(source_path, normalized_path, rdf_path)
    print_metrics(results)

    # Writing the JSON file is optional; without --output we are done.
    if not output:
        return

    # ``default=str`` stringifies any value json cannot serialize natively.
    with open(output, "w") as handle:
        json.dump(results, handle, indent=2, default=str)
    console.print(f"\nMetrics written to {output}")
# Invoke the click command group when this module is executed as a script.
if __name__ == "__main__":
    main()