---
# config.example.yaml
# SetForge Configuration File
# ==========================
# This file centralizes all configuration for the SetForge data pipelines.
# API Provider Configuration
# --------------------------
# List all API providers you intend to use.
# The manager will rotate through them, prioritizing paid tiers.
#
# Each entry under `api_providers` is a mapping with these keys:
#   - name: A unique identifier for this client.
#   - provider: 'vertex_ai' or 'google_ai_studio'.
#   - model: The specific model name (e.g., 'gemini-1.5-pro-preview-0409').
#   - api_key: Your API key (required for google_ai_studio).
#   - project_id: Your Google Cloud Project ID (required for vertex_ai).
#   - tier: 'paid' or 'free'. Paid tiers are always prioritized.
#   - rpm: Requests Per Minute limit for this key.
#   - tpm: Tokens Per Minute limit for this key.
#
# "API_KEY" below is a placeholder. Put real keys only in your local copy
# of this file and never commit them to version control.
api_providers:
  - name: "studio_tier_1"
    provider: "google_ai_studio"
    model: "gemini-2.5-flash"
    api_key: "API_KEY"
    tier: "paid"
    rpm: 4000
    tpm: 4000000
  - name: "studio_free_1"
    provider: "google_ai_studio"
    model: "gemini-2.0-flash"
    api_key: "API_KEY"
    tier: "free"
    rpm: 15
    # NOTE(review): the other free entries use tpm 1000000 - confirm that
    # 250000 is intended for this key.
    tpm: 250000
  - name: "studio_free_2"
    provider: "google_ai_studio"
    model: "gemini-2.0-flash"
    api_key: "API_KEY"
    tier: "free"
    rpm: 15
    tpm: 1000000
  - name: "studio_free_3"
    provider: "google_ai_studio"
    model: "gemini-2.0-flash"
    api_key: "API_KEY"
    tier: "free"
    rpm: 15
    tpm: 1000000
  - name: "studio_free_4"
    provider: "google_ai_studio"
    model: "gemini-2.0-flash"
    api_key: "API_KEY"
    tier: "free"
    rpm: 15
    tpm: 1000000
  - name: "studio_free_5"
    provider: "google_ai_studio"
    model: "gemini-2.0-flash"
    api_key: "API_KEY"
    tier: "free"
    rpm: 15
    tpm: 1000000
  - name: "studio_free_6"
    provider: "google_ai_studio"
    model: "gemini-2.0-flash"
    api_key: "API_KEY"
    tier: "free"
    rpm: 15
    tpm: 1000000
  - name: "studio_free_7"
    provider: "google_ai_studio"
    model: "gemini-2.0-flash"
    api_key: "API_KEY"
    tier: "free"
    rpm: 15
    tpm: 1000000
  - name: "studio_free_8"
    provider: "google_ai_studio"
    model: "gemini-2.0-flash"
    api_key: "API_KEY"
    tier: "free"
    rpm: 15
    tpm: 1000000
# Data Pipeline Configuration
# ---------------------------
# Defines the directory structure for the data processing pipelines.
# All values are relative paths.
# NOTE(review): presumably resolved against the directory the pipeline is
# launched from - confirm against the loader.
data_config:
  raw_dir: "data_raw"
  cleaned_dir: "data_cleaned"
  structured_dir: "data_structured"
  qa_dir: "data_qa"
  log_dir: "logs"
  checkpoint_dir: "checkpoints"
# Web Scraper Configuration
# -------------------------
# Settings for the web scraper: the User-Agent string it uses, its timeout,
# and the crawl safety limits.
scraper_config:
  # Desktop Chrome-style User-Agent string.
  user_agent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36"
  timeout: 30  # seconds
  max_depth: 5  # Maximum recursion depth for crawling
  max_files_per_domain: 100  # Safety limit for number of files per domain
# Q&A Generation Pipeline Configuration
# -------------------------------------
# Concurrency, retry, and output-location settings for Q&A generation.
qa_pipeline_config:
  concurrency_limit: 5  # Max number of files to process at the same time
  max_retries: 3  # Retries for a file before it's considered failed
  # NOTE(review): this path is rooted at "data/", while the output files
  # below are rooted at "data_qa/" - confirm the different root is intended.
  dead_letter_queue_dir: "data/dead_letter_queue/qa"  # Folder for failed files
  qa_output_file: "data_qa/qna_dataset.jsonl"  # Main dataset output
  qa_review_file: "data_qa/needs_review.jsonl"  # Output for Q&A pairs that fail validation