# Source: SetForge/config.example.yaml (codermillat/SetForge, main branch, GitHub)
# SetForge Configuration File
# ==========================
# This file centralizes all configuration for the SetForge data pipelines.

# API Provider Configuration
# --------------------------
# List all API providers you intend to use.
# The manager will rotate through them, prioritizing paid tiers.
#
# - name: A unique identifier for this client.
# - provider: 'vertex_ai' or 'google_ai_studio'.
# - model: The specific model name (e.g., 'gemini-2.5-flash').
# - api_key: Your API key (required for google_ai_studio).
# - project_id: Your Google Cloud Project ID (required for vertex_ai).
# - tier: 'paid' or 'free'. Paid tiers are always prioritized.
# - rpm: Requests Per Minute limit for this key.
# - tpm: Tokens Per Minute limit for this key.

api_providers:
  # Paid-tier client. 'API_KEY' is a placeholder: substitute a real key in
  # your local copy and keep real keys out of version control.
  - name: 'studio_tier_1'
    provider: 'google_ai_studio'
    model: 'gemini-2.5-flash'
    api_key: 'API_KEY'
    tier: 'paid'
    rpm: 4000
    tpm: 4000000

  # Free-tier clients. Each entry carries its own key placeholder and its own
  # rpm/tpm limits.
  # NOTE(review): studio_free_1 uses tpm 250000 while the other free entries
  # use 1000000 - confirm this difference is intentional.
  - name: 'studio_free_1'
    provider: 'google_ai_studio'
    model: 'gemini-2.0-flash'
    api_key: 'API_KEY'
    tier: 'free'
    rpm: 15
    tpm: 250000

  - name: 'studio_free_2'
    provider: 'google_ai_studio'
    model: 'gemini-2.0-flash'
    api_key: 'API_KEY'
    tier: 'free'
    rpm: 15
    tpm: 1000000

  - name: 'studio_free_3'
    provider: 'google_ai_studio'
    model: 'gemini-2.0-flash'
    api_key: 'API_KEY'
    tier: 'free'
    rpm: 15
    tpm: 1000000

  - name: 'studio_free_4'
    provider: 'google_ai_studio'
    model: 'gemini-2.0-flash'
    api_key: 'API_KEY'
    tier: 'free'
    rpm: 15
    tpm: 1000000

  - name: 'studio_free_5'
    provider: 'google_ai_studio'
    model: 'gemini-2.0-flash'
    api_key: 'API_KEY'
    tier: 'free'
    rpm: 15
    tpm: 1000000

  - name: 'studio_free_6'
    provider: 'google_ai_studio'
    model: 'gemini-2.0-flash'
    api_key: 'API_KEY'
    tier: 'free'
    rpm: 15
    tpm: 1000000

  - name: 'studio_free_7'
    provider: 'google_ai_studio'
    model: 'gemini-2.0-flash'
    api_key: 'API_KEY'
    tier: 'free'
    rpm: 15
    tpm: 1000000

  - name: 'studio_free_8'
    provider: 'google_ai_studio'
    model: 'gemini-2.0-flash'
    api_key: 'API_KEY'
    tier: 'free'
    rpm: 15
    tpm: 1000000

# Data Pipeline Configuration
# ---------------------------
# Defines the directory structure for the data processing pipelines.
data_config:
  # Directory names used by the pipelines. The role of each directory is
  # implied only by its key name here - confirm against the pipeline code.
  raw_dir: 'data_raw'
  cleaned_dir: 'data_cleaned'
  structured_dir: 'data_structured'
  qa_dir: 'data_qa'
  log_dir: 'logs'
  checkpoint_dir: 'checkpoints'

# Web Scraper Configuration
# -------------------------
scraper_config:
  # Browser-style user-agent string. Written as a folded scalar (>-) so the
  # source lines stay short; the parsed value is one line joined by single
  # spaces, with no trailing newline.
  user_agent: >-
    Mozilla/5.0 (Windows NT 10.0; Win64; x64)
    AppleWebKit/537.36 (KHTML, like Gecko)
    Chrome/115.0.0.0 Safari/537.36
  # Timeout in seconds.
  timeout: 30
  # Maximum recursion depth for crawling.
  max_depth: 5
  # Safety limit for number of files per domain.
  max_files_per_domain: 100

# Q&A Generation Pipeline Configuration
# -------------------------------------
qa_pipeline_config:
  # Max number of files to process at the same time.
  concurrency_limit: 5
  # Retries for a file before it's considered failed.
  max_retries: 3
  # Folder for failed files.
  # NOTE(review): this path is rooted at 'data/' while the other paths in
  # this file use 'data_*' directories - confirm that is intended.
  dead_letter_queue_dir: 'data/dead_letter_queue/qa'
  # Main dataset output.
  qa_output_file: 'data_qa/qna_dataset.jsonl'
  # Output for Q&A pairs that fail validation.
  qa_review_file: 'data_qa/needs_review.jsonl'