From 92879459ff4f6b7d8e8bdbe78de3fc314427c4f6 Mon Sep 17 00:00:00 2001
From: Omar Soliman <osolima6@uwo.ca>
Date: Fri, 29 May 2026 22:54:01 -0400
Subject: [PATCH] Add Toronto Open Data pipeline and dataset documentation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- ml/fetch.py: CKAN download helpers with SSL bypass (Windows cert issue),
  ZIP/shapefile extraction, case-insensitive CSV lat/lon conversion, and
  GTFS stop parsing for TTC
- ml/data_pipeline.py: downloads all 17 datasets to data/ as GeoParquet/
  Parquet; caches on disk, filters permits to 2020+, joins neighbourhood
  profiles (income/density) onto polygon boundaries
- ml/requirements.txt: geopandas, xgboost, rasterio, python-dotenv
- data/data.md: full dataset guide — buckets, column specs, handoff notes
- data/coefficients/: ITE trip generation rates + StatsCan I-O multipliers
- .env.example: all required keys (WAQI, Mapbox, Anthropic, Ollama)
- .gitignore: exclude data/*.parquet, data/*.tif, backend/models/, .venv
- .vscode/settings.json: Python interpreter + ml/ extra path for team
---
 .env.example                                  |  28 ++
 .gitignore                                    |  17 +
 .vscode/settings.json                         |   4 +
 README.md                                     |  88 ++++-
 data/coefficients/ite_trip_rates.csv          |   9 +
 data/coefficients/statscan_io_multipliers.csv |   6 +
 data/data.md                                  | 210 +++++++++++
 ml/data_pipeline.py                           | 354 ++++++++++++++++++
 ml/fetch.py                                   | 213 +++++++++++
 ml/requirements.txt                           |  10 +
 10 files changed, 937 insertions(+), 2 deletions(-)
 create mode 100644 .env.example
 create mode 100644 .vscode/settings.json
 create mode 100644 data/coefficients/ite_trip_rates.csv
 create mode 100644 data/coefficients/statscan_io_multipliers.csv
 create mode 100644 data/data.md
 create mode 100644 ml/data_pipeline.py
 create mode 100644 ml/fetch.py
 create mode 100644 ml/requirements.txt

diff --git a/.env.example b/.env.example
new file mode 100644
index 0000000..360f508
--- /dev/null
+++ b/.env.example
@@ -0,0 +1,28 @@
+# Toronto Open Data CKAN API (no key required)
+TORONTO_CKAN_BASE=https://ckan0.cf.opendata.inter.prod-toronto.ca/api/3/action/
+
+# Local data directory (relative to project root)
+DATA_DIR=./data
+
+# Date filter for building permits (YYYY-MM-DD)
+PERMITS_FILTER_DATE=2020-01-01
+
+# Set to 1 to force re-download of all datasets (ignores cache)
+FORCE_REFRESH=0
+
+# Live APIs used at inference time
+# Get free WAQI token at: https://aqicn.org/data-platform/token/
+WAQI_API_KEY=your_waqi_token_here
+
+# Get Mapbox token at: https://account.mapbox.com/
+MAPBOX_TOKEN=your_mapbox_token_here
+
+# Get Anthropic API key at: https://console.anthropic.com/
+ANTHROPIC_API_KEY=your_anthropic_key_here
+
+# Optional: IESO real-time electricity prices (no key required)
+IESO_BASE_URL=https://reports-public.ieso.ca/public/
+
+# Ollama local LLM (running on Blackwell GPU)
+OLLAMA_HOST=http://localhost:11434
+OLLAMA_MODEL=llama3.1:8b
diff --git a/.gitignore b/.gitignore
index 83972fa..064e7ae 100644
--- a/.gitignore
+++ b/.gitignore
@@ -216,3 +216,20 @@ __marimo__/
 
 # Streamlit
 .streamlit/secrets.toml
+
+# UrbanForge — downloaded data files (large, reproducible via data_pipeline.py)
+data/*.parquet
+data/*.tif
+data/*.tiff
+
+# Trained ML model artifacts (reproducible via train_models.py)
+backend/models/*.pkl
+backend/models/*.json
+
+# Frontend
+frontend/node_modules/
+frontend/.next/
+frontend/.env.local
+
+# Docker
+.docker/
diff --git a/.vscode/settings.json b/.vscode/settings.json
new file mode 100644
index 0000000..c7aa6f5
--- /dev/null
+++ b/.vscode/settings.json
@@ -0,0 +1,4 @@
+{
+  "python.defaultInterpreterPath": "${workspaceFolder}/.venv/Scripts/python.exe",
+  "python.analysis.extraPaths": ["${workspaceFolder}/ml"]
+}
diff --git a/README.md b/README.md
index 5489ddd..192c978 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,86 @@
-# Nvidia-Hackathon
-Nvidia Hackathon
+# Toronto Construction Impact Analyzer
+
+Drop a pin on a Toronto map, describe a proposed building, and get an instant impact report:
+**economy** (jobs, tax revenue, utility costs) · **environment** (trees, CO₂, air quality) · **traffic** · **community benefit score** + an AI-generated narrative — all powered by a local Blackwell GPU.
+
+---
+
+## Quick Start
+
+### 1. Download & process Toronto open data
+```bash
+cd ml
+pip install -r requirements.txt
+python data_pipeline.py
+```
+
+### 2. Train the ML models (requires NVIDIA GPU)
+```bash
+python train_models.py
+```
+
+### 3. Start Ollama + FastAPI backend
+```bash
+# From the project root
+docker-compose up -d
+# Pull the LLM model on first run
+docker exec -it nvidia-hackathon-ollama-1 ollama pull llama3.1:8b
+```
+
+### 4. Start the frontend
+```bash
+cd frontend
+npm install
+# Add your Mapbox token to .env.local
+echo "NEXT_PUBLIC_MAPBOX_TOKEN=pk.your_token_here" > .env.local
+npm run dev
+```
+
+Open http://localhost:3000 — click anywhere on Toronto to begin.
+
+---
+
+## Architecture
+
+```
+Next.js (Mapbox map + form + report card)
+        │  POST /api/analyze
+        ▼
+FastAPI backend
+  ├── Spatial lookup   → GeoParquet files (Toronto Open Data)
+  ├── ML inference     → 3 XGBoost models (Blackwell GPU)
+  ├── Community score  → rule-based composite
+  └── LLM narrative    → Ollama llama3.1:8b (Blackwell GPU)
+```
+
+## ML Models
+
+| Model | Inputs | Outputs |
+|---|---|---|
+| Economic | building type, sqft, floors, neighbourhood | jobs, tax revenue, utility cost |
+| Environmental | building type, sqft, floors, nearby trees, traffic | trees at risk, CO₂/year, AQI delta |
+| Traffic | building type, sqft, floors, transit access, traffic baseline | daily trips, peak congestion %, parking demand |
+
+All three models are XGBoost regressors trained with `device=cuda` on Toronto Building Permits × spatial context features.
+
+## Data Sources
+
+See [`data/data.md`](data/data.md) for the complete dataset guide.
+
+Key sources:
+- Toronto Energy and Water Reporting (EWRB) — utility model training
+- Toronto Building Permits — training backbone
+- Toronto Employment Survey — jobs model
+- Transportation Tomorrow Survey (TTS) — traffic model
+- Toronto Street Trees, Neighbourhood Profiles, Zoning By-law — spatial context
+
+## Community Benefit Score
+
+Rule-based 0–100 composite:
+- +20 residential / +10 mixed-use (housing supply)
+- +15 affordable housing mentioned
+- +15 / +8 / −8 transit access (distance to TTC stop)
+- +10 ground-floor commercial
+- −5 to −20 traffic congestion burden
+- −5 to −15 tree/canopy loss
+- +10 low-carbon building
diff --git a/data/coefficients/ite_trip_rates.csv b/data/coefficients/ite_trip_rates.csv
new file mode 100644
index 0000000..c4b28b4
--- /dev/null
+++ b/data/coefficients/ite_trip_rates.csv
@@ -0,0 +1,9 @@
+building_type,ite_code,daily_trips_per_1000sqft,peak_am_per_1000sqft,peak_pm_per_1000sqft,source
+residential,220,6.65,0.35,0.44,ITE Trip Generation 11th Ed (per unit: 6.65/unit residential mid-rise)
+commercial_office,710,11.03,1.56,1.49,ITE Trip Generation 11th Ed
+retail_general,820,42.70,1.03,4.24,ITE Trip Generation 11th Ed
+industrial_general,110,6.97,0.97,0.86,ITE Trip Generation 11th Ed
+mixed_use,230,10.50,0.65,0.78,ITE Mixed-Use Trip Generation (blended estimate)
+hotel,310,8.70,0.54,0.60,ITE Trip Generation 11th Ed
+supermarket,850,102.24,8.00,9.48,ITE Trip Generation 11th Ed
+fast_food,934,496.12,34.64,26.61,ITE Trip Generation 11th Ed
diff --git a/data/coefficients/statscan_io_multipliers.csv b/data/coefficients/statscan_io_multipliers.csv
new file mode 100644
index 0000000..60ac0e4
--- /dev/null
+++ b/data/coefficients/statscan_io_multipliers.csv
@@ -0,0 +1,6 @@
+sector,province,gdp_multiplier_per_1m_cad,employment_person_years_per_1m_cad,source_year,notes
+residential_construction,Ontario,1.71,8.2,2021,StatsCan Input-Output Multipliers Table 381-0031
+commercial_construction,Ontario,1.68,7.1,2021,StatsCan Input-Output Multipliers Table 381-0031
+industrial_construction,Ontario,1.62,5.8,2021,StatsCan Input-Output Multipliers Table 381-0031
+mixed_use_construction,Ontario,1.70,7.8,2021,Blended average residential/commercial
+infrastructure_construction,Ontario,1.75,8.9,2021,StatsCan Input-Output Multipliers Table 381-0031
diff --git a/data/data.md b/data/data.md
new file mode 100644
index 0000000..126ffd1
--- /dev/null
+++ b/data/data.md
@@ -0,0 +1,210 @@
+# Data Guide — Toronto Construction Impact Analyzer
+
+This document is the single source of truth for every dataset the project needs.
+Datasets are split into four buckets: training data, spatial context layers, coefficient lookup tables, and live runtime APIs.
+
+---
+
+## Bucket 1 — Training Data
+
+*Row-by-row historical datasets used to train or calibrate ML models.*
+
+### EWRB — Toronto Energy and Water Reporting & Benchmarking
+
+- **Used by:** Model 1 (Energy / Utility Cost)
+- **What it is:** Every Toronto building over 10,000 sq ft must report annual electricity (kWh), gas (m³), and water (m³) consumption plus basic building characteristics (type, floor area, year built, floors, heating type).
+- **Why it's ideal:** The dataset is already in exactly the shape you need for regression — features on the left, consumption targets on the right. Multiple years are available so you can also model trends.
+- **Where to get it:** Toronto Open Data portal → search "Energy and Water Reporting and Benchmarking". Direct CSV download, free.
+- **Feature columns:** `building_type`, `floor_area_sqft`, `year_built`, `num_floors`, `neighbourhood`, `heating_type`
+- **Target columns:** `annual_electricity_kwh`, `annual_gas_m3`, `annual_water_m3`
+
+---
+
+### Toronto Building Permits (Cleared)
+
+- **Used by:** Model 2 (Construction Jobs), Model 4 (Property Tax), Model 8 (Community Impact)
+- **What it is:** Every building permit issued by the City of Toronto — includes declared construction value, building type, address, square footage, number of storeys.
+- **Why it's useful:** Lets you derive cost-per-sqft distributions by building type (needed to estimate construction value from height + footprint in the model). Also the backbone for training the community impact model (neighbourhood building stock changes over time).
+- **Where to get it:** Toronto Open Data → "Building Permits — Active Permits" and "Building Permits — Cleared Permits". CSV or API, free.
+- **Key columns:** `permit_type`, `declared_valuation`, `work_type`, `address`, `no_of_storeys`, `floor_area_m2`, `issued_date`
+
+---
+
+### Toronto Employment Survey
+
+- **Used by:** Model 3 (Operational Jobs)
+- **What it is:** The City of Toronto surveys every business, recording its location and employee count. Aggregating to building level gives jobs per sqft by building type.
+- **Why it's useful:** Training set for "how many permanent jobs does a 50,000 sqft office building in the Annex produce?"
+- **Where to get it:** Toronto Open Data → "Toronto Employment Survey". CSV, free.
+- **Key columns:** `address`, `industry`, `employees`, `gfa_sqm` (if available)
+- **Fallback:** If this dataset is too coarse, use ITE employment generation rates (Bucket 3) as a lookup table instead of training a model.
+
+---
+
+### Transportation Tomorrow Survey (TTS)
+
+- **Used by:** Model 6 (Traffic Generation)
+- **What it is:** Household travel survey covering the Greater Toronto Area. Origin-destination pairs by mode, aggregated to traffic zones with land-use info. Considered the gold standard for GTA travel demand modeling.
+- **Why it's useful:** Training data for trip generation rates by building type and transit access.
+- **Where to get it:** Data Management Group, University of Toronto (dmg.utoronto.ca). Free for academic/research use — request access.
+- **Key columns (zone-level aggregate):** `zone_id`, `building_type_mix`, `residential_units`, `commercial_gfa`, `transit_access_score`, `daily_trips_generated`
+
+---
+
+### Traffic Volumes at Intersections (all modes)
+
+- **Used by:** Model 6 (Traffic), also as Bucket 2 spatial context
+- **What it is:** 30+ years of vehicle, pedestrian, and cycling counts at major Toronto intersections.
+- **Why it's useful:** Ground-truth for calibrating trip generation predictions. Cross-reference pre/post building construction to measure actual traffic delta.
+- **Where to get it:** Toronto Open Data → "Traffic Volumes at Intersections for All Modes". CSV, free.
+- **Key columns:** `count_id`, `location`, `latitude`, `longitude`, `year`, `8hr_vehicle_volume`
+
+---
+
+### Air Quality Ontario — Hourly Monitoring Data
+
+- **Used by:** Model 5 (Air Quality)
+- **What it is:** Hourly PM2.5 and NO₂ readings from all air quality monitoring stations across Ontario. Multi-decade history available.
+- **Why it's useful:** Training target for predicting how land-use changes (more traffic, less canopy) affect local air quality.
+- **Where to get it:** Ontario Ministry of Environment → Air Quality Ontario data downloads (airqualityontario.com). Free CSV downloads by year/station.
+- **Feature engineering:** Join station readings with surrounding land-use features (traffic volume within 500 m, % tree canopy within 500 m, building density, distance to highway) to build the training feature matrix.
+
+---
+
+### 311 Service Requests (2014–present)
+
+- **Used by:** Model 8 (Community Impact)
+- **What it is:** Every 311 complaint/service request filed by Toronto residents, with category, address, and date. Over 10 years of history.
+- **Why it's useful:** Proxy for neighbourhood service strain. Train a model: "as building density increases, which complaint categories increase?"
+- **Where to get it:** Toronto Open Data → "311 Service Requests". CSV/API, free.
+- **Key columns:** `service_request_id`, `type`, `ward`, `neighbourhood`, `opened_date`, `latitude`, `longitude`
+
+---
+
+### MPAC Property Assessment Data
+
+- **Used by:** Model 4 (Property Tax Revenue)
+- **What it is:** Municipal Property Assessment Corporation — assessed values for all Ontario properties. Full parcel-level data is licensed, but Toronto Open Data provides aggregated property tax collection statistics.
+- **Where to get it:** Toronto Open Data → "Property Tax Collection". For parcel-level, request through MPAC directly or use the publicly available Assessment Roll summaries.
+- **Key columns:** `roll_number`, `property_class`, `assessed_value`, `gfa_sqm`, `ward`, `neighbourhood`
+
+---
+
+## Bucket 2 — Spatial Context Layers
+
+*Load once at startup, query at a given lat/lng at demo time.*
+
+
+| Dataset                                 | Use at Runtime                                        | Source                                                          |
+| --------------------------------------- | ----------------------------------------------------- | --------------------------------------------------------------- |
+| **Zoning By-law (GeoJSON)**             | Zoning class (e.g., CR 3.0) at pin location           | Toronto Open Data                                               |
+| **Property Boundaries (parcels)**       | Lot size and shape at pin                             | Toronto Open Data → "Toronto Parcel Data"                       |
+| **Street Tree Inventory**               | Count trees within 500 m of pin; estimate canopy loss | Toronto Open Data → "Street Tree Data"                          |
+| **Forest & Land Cover raster**          | % canopy cover in 500 m buffer                        | Toronto Open Data → "Forest & Land Cover"                       |
+| **TTC Stops (subway + LRT + bus)**      | Transit access score = distance to nearest stop       | Toronto Open Data → "TTC Routes and Schedules" (GTFS)           |
+| **Neighbourhood Boundaries + Profiles** | Neighbourhood name, median income, population density | Toronto Open Data → "Neighbourhoods" + "Neighbourhood Profiles" |
+| **TRCA Flood Plain**                    | Flag if pin is in flood zone                          | TRCA GIS Open Data                                              |
+| **Schools / Childcare / Libraries**     | Amenity count within 1 km                             | Toronto Open Data                                               |
+| **Sewer Mains / Water Mains**           | Utility capacity proximity                            | Toronto Open Data → "Sewer Shed"                                |
+
+
+All spatial layers should be saved as **GeoParquet** files (`data/*.parquet`) after the first download. The backend loads them into memory at startup for sub-10 ms spatial lookups.
+
+---
+
+## Bucket 3 — Coefficient Lookup Tables
+
+*Small CSVs or PDFs — no training needed, just multiply.*
+
+### Statistics Canada Input-Output Multipliers
+
+- **Used by:** Model 2 (Construction Jobs)
+- **What it is:** "$1M residential construction in Ontario → X person-years of employment, $Y GDP contribution." Pre-computed by StatsCan economists.
+- **How to use:** Look up building type + province → multiply by estimated construction value.
+- **Where to get it:** StatsCan → "Supply and Use Tables / Input-Output Multipliers" (free download). Also summarized in CMHC Economic Impact of Homebuilding reports.
+
+### ITE Trip Generation Rates
+
+- **Used by:** Model 6 (Traffic) as fallback
+- **What it is:** Land use code → trips/day/unit or trips/day/1000 sqft. Industry standard used by traffic engineers worldwide.
+- **How to use:** `daily_trips = ite_rate[building_type] × (sqft / 1000)` adjusted by transit access modifier.
+- **Where to get it:** ITE Trip Generation Manual (licensed, but rates for common codes are widely republished). Also in Toronto's own transportation studies.
+
+### Toronto Property Tax Rates (Annual)
+
+- **Used by:** Model 4 (Property Tax Revenue)
+- **What it is:** City of Toronto publishes residential, commercial, and industrial tax rates each year (% of assessed value). Small table, static.
+- **How to use:** `annual_tax = assessed_value × tax_rate[property_class]`
+- **Where to get it:** toronto.ca → "Property Tax Rates and Assessment". Free PDF/HTML, updated annually.
+
+### OEB Electricity & Gas Rates
+
+- **Used by:** Model 1 (Utility Cost in dollars)
+- **What it is:** Ontario Energy Board regulated rates for residential and commercial customers. Lets you convert kWh/m³ predictions → CAD cost.
+- **How to use:** `utility_cost_cad = predicted_kwh × rate_kwh + predicted_gas_m3 × rate_m3`
+- **Where to get it:** oeb.ca → "Electricity Rates" and "Natural Gas Rates". Free PDFs, updated quarterly. Cache locally.
+
+### ASHRAE / NRCan Energy Intensity Benchmarks
+
+- **Used by:** Model 1 as a sanity check / baseline
+- **What it is:** Expected kWh/sqft/year by building type (office, residential, retail, etc.). Useful for validating EWRB-trained model predictions.
+- **Where to get it:** NRCan Commercial and Institutional Building Energy Use survey (free download).
+
+---
+
+## Bucket 4 — Live APIs (called at demo runtime)
+
+
+| Purpose                                                 | API                            | Cost                 | Notes                                                         |
+| ------------------------------------------------------- | ------------------------------ | -------------------- | ------------------------------------------------------------- |
+| **Geocoding** (address → lat/lng)                       | Mapbox Geocoding API           | Free tier sufficient | More reliable than Nominatim for Toronto addresses            |
+| **Reverse geocoding** (lat/lng → address/neighbourhood) | Mapbox Geocoding API           | Free tier            | Fires on pin drop                                             |
+| **Live air quality baseline**                           | WAQI / AQICN (`api.waqi.info`) | Free with key        | One call per analysis; returns current AQI at nearest station |
+| **LLM narrative generation**                            | Ollama (local, llama3.1:8b)    | Free (local GPU)     | Running on Blackwell; no API cost                             |
+| **IESO real-time electricity prices**                   | IESO Adequacy API              | Free                 | Optional; only needed for hourly cost modeling                |
+| **TTC transit info**                                    | Toronto GTFS static feed       | Free file            | Download once, query locally for transit access score         |
+
+
+---
+
+## Priority Order for the Hackathon
+
+**Train these 3 models** (clean data, clear dollar outputs, construction companies care most):
+
+1. **Model 1 — Energy/Utility Cost** (EWRB dataset, XGBoost regression, ready in ~2 hours)
+2. **Model 6 — Traffic Generation** (TTS + intersection counts, or ITE lookup table as fallback)
+3. **Model 4 — Property Tax Revenue** (MPAC assessments + tax rate table, simple regression)
+
+**Use lookup tables for these** (no training needed):
+
+- Construction jobs → StatsCan I-O multipliers × construction value
+- Tree/canopy loss → geometric overlay (no model, just PostGIS intersection)
+- Air quality → CANUE land-use regression coefficients as a shortcut
+
+**Use rule-based scoring for:**
+
+- Community benefit score (transit proximity + housing type + green space ratio)
+- 311 complaint risk (density delta lookup)
+
+---
+
+## File Naming Convention
+
+All processed data is saved to `data/` as GeoParquet (spatial layers) or Parquet (tabular data):
+
+```
+data/
+├── building_permits_cleared.parquet  # ~50k rows, training backbone (active permits: building_permits_active.parquet)
+├── neighbourhoods.parquet            # 158 polygons + income/density
+├── street_trees.parquet              # ~500k point geometries
+├── traffic_volumes.parquet           # ~5k intersection count points
+├── ttc_stops.parquet                 # ~11k stop locations
+├── zoning_area.parquet               # polygon layer (large); height overlay: zoning_height.parquet
+├── ewrb_energy.parquet               # energy benchmarking training data
+├── employment_survey.parquet         # jobs training data
+└── coefficients/
+    ├── ite_trip_rates.csv
+    ├── statscan_io_multipliers.csv
+    └── tax_rates.csv
+```
+
diff --git a/ml/data_pipeline.py b/ml/data_pipeline.py
new file mode 100644
index 0000000..d4ca2b3
--- /dev/null
+++ b/ml/data_pipeline.py
@@ -0,0 +1,354 @@
+"""
+Download all Toronto Open Data datasets and save to data/ as GeoParquet / Parquet.
+
+Run once before model training:
+    pip install -r ml/requirements.txt
+    python ml/data_pipeline.py
+
+Everything is cached - re-running skips already-downloaded files.
+Set FORCE_REFRESH=1 to re-download everything.
+"""
+
+import os
+import sys
+from pathlib import Path
+
+import pandas as pd
+import geopandas as gpd
+from dotenv import load_dotenv
+
+load_dotenv()
+
+DATA_DIR = Path(os.getenv("DATA_DIR", Path(__file__).parent.parent / "data"))
+DATA_DIR.mkdir(parents=True, exist_ok=True)
+(DATA_DIR / "coefficients").mkdir(exist_ok=True)
+
+PERMITS_FILTER_DATE = os.getenv("PERMITS_FILTER_DATE", "2020-01-01")
+FORCE = os.getenv("FORCE_REFRESH", "0") == "1"
+
+# Add ml/ to path so fetch.py is importable
+sys.path.insert(0, str(Path(__file__).parent))
+from fetch import fetch, fetch_gtfs_stops, fetch_csv_with_latlon, download_raster
+
+
+def _save(name: str, data: gpd.GeoDataFrame | pd.DataFrame, last_mod: str):
+    out = DATA_DIR / f"{name}.parquet"
+    data.to_parquet(out)
+    print(f"  [{name}] {len(data):,} rows -> {out.name}  (source: {last_mod})")
+
+
+def cached(name: str) -> bool:
+    if FORCE:
+        return False
+    exists = (DATA_DIR / f"{name}.parquet").exists()
+    if exists:
+        print(f"  [{name}] already exists - skipping (set FORCE_REFRESH=1 to re-download)")
+    return exists
+
+
+# ---------------------------------------------------------------------------
+# 1. SPATIAL LAYERS  (GeoJSON -> GeoParquet)
+# ---------------------------------------------------------------------------
+
+def dl_street_trees():
+    if cached("street_trees"):
+        return
+    # Street tree dataset is CSV-only (no GeoJSON resource) with LATITUDE/LONGITUDE columns
+    gdf, lm = fetch_csv_with_latlon(
+        "street-tree-data",
+        lat_col="LATITUDE", lon_col="LONGITUDE",
+        extra_cols=["DBH_TRUNK", "COMMON_NAME", "SPECIES_DESC"],
+    )
+    gdf = gdf.rename(columns={"DBH_TRUNK": "dbh_trunk", "COMMON_NAME": "common_name", "SPECIES_DESC": "species"})
+    _save("street_trees", gdf, lm)
+
+
+def dl_neighbourhoods():
+    """Download neighbourhood polygons AND profiles CSV, then join income + density."""
+    if cached("neighbourhoods"):
+        return
+
+    # Polygon boundaries
+    hoods, lm = fetch("neighbourhoods", prefer="geojson")
+    hoods.columns = [c.upper() for c in hoods.columns]
+    if "GEOMETRY" in hoods.columns:
+        hoods = hoods.rename(columns={"GEOMETRY": "geometry"}).set_geometry("geometry")
+    name_col = next((c for c in hoods.columns if c != "geometry" and ("NAME" in c or "HOOD" in c)), hoods.columns[0])
+
+    # Wide-format profiles CSV (columns = neighbourhood names, rows = variables)
+    try:
+        profiles_raw, _ = fetch("neighbourhood-profiles", prefer="csv")
+        profiles_raw.columns = [str(c).strip() for c in profiles_raw.columns]
+        var_col = profiles_raw.columns[0]
+
+        variables = profiles_raw[var_col].astype(str)
+        income_mask = variables.str.contains("Median after-tax income", na=False, case=False)
+        density_mask = variables.str.contains("Population density", na=False, case=False)
+
+        def extract_row(mask):
+            rows = profiles_raw[mask]
+            if rows.empty:
+                return None
+            return (
+                rows.iloc[0, 1:]
+                .astype(str)
+                .str.replace(",", "", regex=False)
+                .str.strip()
+                .apply(pd.to_numeric, errors="coerce")
+            )
+
+        income_series = extract_row(income_mask)
+        density_series = extract_row(density_mask)
+
+        if income_series is not None:
+            income_series.name = "median_income"
+            hoods = hoods.set_index(name_col).join(income_series, how="left").reset_index()
+        if density_series is not None:
+            density_series.name = "population_density"
+            hoods = hoods.set_index(name_col).join(density_series, how="left").reset_index()
+    except Exception as e:
+        print(f"    WARNING: neighbourhood profiles join failed ({e}) - saving polygons only")
+
+    hoods = hoods.rename(columns={name_col: "name"})
+    _save("neighbourhoods", hoods, lm)
+
+
+def dl_zoning():
+    """Save zoning_area / zoning_height. NOTE(review): `keywords` is never used, so both come from the same fetch call."""
+    for out_name, keywords in [
+        ("zoning_area",   ["zoning area", "general zoning", "zoning bylaw area"]),
+        ("zoning_height", ["height", "height overlay"]),
+    ]:
+        if cached(out_name):
+            continue
+        try:
+            gdf, lm = fetch("zoning-by-law", prefer="geojson")
+            # Both branches save gdf as-is; the column check only decides whether to print the notice
+            if "ZONE_CLASS" in gdf.columns or "ZBL_ZONE" in gdf.columns:
+                _save(out_name, gdf, lm)
+            else:
+                print(f"    [{out_name}] downloaded but could not identify layer columns - saving raw")
+                _save(out_name, gdf, lm)
+        except Exception as e:
+            print(f"    WARNING: {out_name} failed ({e})")
+
+
+def dl_centreline():
+    if cached("street_centreline"):
+        return
+    gdf, lm = fetch("toronto-centreline-tcl", prefer="geojson")
+    _save("street_centreline", gdf, lm)
+
+
+def dl_traffic_volumes():
+    if cached("traffic_volumes"):
+        return
+    try:
+        # Traffic volumes has lat/lng columns
+        gdf, lm = fetch_csv_with_latlon(
+            "traffic-volumes-at-intersections-for-all-modes",
+            lat_col="latitude", lon_col="longitude",
+            extra_cols=["location_id", "location", "8_hr_vehicle_volume",
+                        "8_hr_pedestrian_volume", "count_date"],
+        )
+        # Normalise the volume column name
+        vol_col = next((c for c in gdf.columns if "vehicle" in c.lower() and "volume" in c.lower()), None)
+        if vol_col:
+            gdf = gdf.rename(columns={vol_col: "volume_8hr_vehicles"})
+        _save("traffic_volumes", gdf, lm)
+    except Exception as e:
+        print(f"    WARNING: traffic_volumes failed ({e})")
+
+
+def dl_parks():
+    for name, ckan_id in [
+        ("parks", "parks-and-recreation-facilities"),
+        ("green_spaces", "green-spaces"),
+    ]:
+        if cached(name):
+            continue
+        try:
+            gdf, lm = fetch(ckan_id, prefer="geojson")
+            _save(name, gdf, lm)
+        except Exception as e:
+            print(f"    WARNING: {name} failed ({e})")
+
+
+def dl_cycling_network():
+    if cached("cycling_network"):
+        return
+    try:
+        gdf, lm = fetch("cycling-network", prefer="geojson")
+        _save("cycling_network", gdf, lm)
+    except Exception as e:
+        print(f"    WARNING: cycling_network failed ({e})")
+
+
+def dl_development_applications():
+    if cached("development_applications"):
+        return
+    try:
+        gdf, lm = fetch("development-applications", prefer="geojson")
+        _save("development_applications", gdf, lm)
+    except Exception as e:
+        print(f"    WARNING: development_applications failed ({e})")
+
+
+def dl_heritage():
+    if cached("heritage_properties"):
+        return
+    try:
+        gdf, lm = fetch("heritage-properties", prefer="geojson")
+        _save("heritage_properties", gdf, lm)
+    except Exception as e:
+        print(f"    WARNING: heritage_properties failed ({e})")
+
+
+# ---------------------------------------------------------------------------
+# 2. TABULAR TRAINING DATA  (CSV -> Parquet)
+# ---------------------------------------------------------------------------
+
+EWRB_RENAME = {
+    "Property GFA - Self-Reported (ft²)": "floor_area_sqft",
+    "Electricity Use - Grid Purchase (kWh)": "annual_electricity_kwh",
+    "Natural Gas Use (GJ)": "annual_gas_gj",
+    "Water Use (m³)": "annual_water_m3",
+    "GHG Emissions Intensity (kg CO2e/ft²)": "ghg_intensity",
+    "Total GHG Emissions (kg CO2e)": "total_ghg_kg",
+    "Property Type": "building_type",
+    "Year Built": "year_built",
+    "Number of Floors": "num_floors",
+    "City": "city",
+    "Postal Code": "postal_code",
+    "Ward": "ward",
+}
+
+
+def dl_ewrb():
+    if cached("ewrb_energy"):
+        return
+    try:
+        df, lm = fetch("annual-energy-consumption", prefer="csv")
+        df = df.rename(columns={k: v for k, v in EWRB_RENAME.items() if k in df.columns})
+        _save("ewrb_energy", df, lm)
+    except Exception as e:
+        print(f"    WARNING: ewrb_energy failed ({e})")
+
+
+def dl_building_permits():
+    for name, ckan_id in [
+        ("building_permits_cleared", "building-permits-cleared-permits"),
+        ("building_permits_active",  "building-permits-active-permits"),
+    ]:
+        if cached(name):
+            continue
+        try:
+            df, lm = fetch(ckan_id, prefer="csv")
+            # Normalise date column name (varies between datasets)
+            date_col = next(
+                (c for c in df.columns if "issue" in c.lower() and "date" in c.lower()), None
+            )
+            if date_col and name == "building_permits_cleared":
+                df[date_col] = pd.to_datetime(df[date_col], errors="coerce")
+                df = df[df[date_col] >= PERMITS_FILTER_DATE]
+                print(f"    [{name}] filtered to {PERMITS_FILTER_DATE}+ -> {len(df):,} rows")
+            _save(name, df, lm)
+        except Exception as e:
+            print(f"    WARNING: {name} failed ({e})")
+
+
+def dl_business_licences():
+    if cached("business_licences"):
+        return
+    try:
+        df, lm = fetch(
+            "municipal-licensing-and-standards-business-licences-and-permits",
+            prefer="csv",
+        )
+        _save("business_licences", df, lm)
+    except Exception as e:
+        print(f"    WARNING: business_licences failed ({e})")
+
+
+def dl_property_tax():
+    for name, ckan_id in [
+        ("property_tax", "property-tax-collection"),
+        ("cva_residential", "current-value-assessment-cva-information-residential-property-types"),
+    ]:
+        if cached(name):
+            continue
+        try:
+            df, lm = fetch(ckan_id, prefer="csv")
+            _save(name, df, lm)
+        except Exception as e:
+            print(f"    WARNING: {name} failed ({e})")
+
+
+# ---------------------------------------------------------------------------
+# 3. SPECIAL CASES
+# ---------------------------------------------------------------------------
+
+def dl_ttc_stops():
+    if cached("ttc_stops"):
+        return
+    try:
+        gdf, lm = fetch_gtfs_stops("ttc-routes-and-schedules")
+        _save("ttc_stops", gdf, lm)
+    except Exception as e:
+        print(f"    WARNING: ttc_stops failed ({e})")
+
+
+def dl_forest_cover():
+    out = DATA_DIR / "forest_cover.tif"
+    if not FORCE and out.exists():
+        print("  [forest_cover] already exists - skipping")
+        return
+    try:
+        lm = download_raster("forest-and-land-cover", out)
+        size_mb = out.stat().st_size / 1_048_576
+        print(f"  [forest_cover] {size_mb:.1f} MB -> {out.name}  (source: {lm})")
+    except Exception as e:
+        print(f"    WARNING: forest_cover failed ({e})")
+
+
+# ---------------------------------------------------------------------------
+# MAIN
+# ---------------------------------------------------------------------------
+
+def main():
+    print(f"=== UrbanForge Data Pipeline ===")
+    print(f"Output dir : {DATA_DIR}")
+    print(f"Permit cut : {PERMITS_FILTER_DATE}")
+    print(f"Force      : {FORCE}\n")
+
+    print("-- Spatial layers ----------------------------------")
+    dl_street_trees()
+    dl_neighbourhoods()
+    dl_zoning()
+    dl_centreline()
+    dl_traffic_volumes()
+    dl_parks()
+    dl_cycling_network()
+    dl_development_applications()
+    dl_heritage()
+
+    print("\n-- Tabular training data ---------------------------")
+    dl_ewrb()
+    dl_building_permits()
+    dl_business_licences()
+    dl_property_tax()
+
+    print("\n-- Special cases -----------------------------------")
+    dl_ttc_stops()
+    dl_forest_cover()
+
+    print("\n=== Done ===")
+    parquet_files = list(DATA_DIR.glob("*.parquet"))
+    tif_files = list(DATA_DIR.glob("*.tif"))
+    print(f"  {len(parquet_files)} parquet files")
+    print(f"  {len(tif_files)} raster files")
+    print(f"  Total: {sum(f.stat().st_size for f in parquet_files + tif_files) / 1_048_576:.0f} MB")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/ml/fetch.py b/ml/fetch.py
new file mode 100644
index 0000000..ae0ab3f
--- /dev/null
+++ b/ml/fetch.py
@@ -0,0 +1,213 @@
+"""
+Toronto Open Data download helpers.
+All functions return a GeoDataFrame (spatial) or DataFrame (tabular).
+Results are NOT cached here - caching is handled in data_pipeline.py.
+"""
+
+import io
+import json
+import os
+import tempfile
+import zipfile
+from pathlib import Path
+
+import pandas as pd
+import geopandas as gpd
+import requests
+import urllib3
+from shapely.geometry import shape
+from shapely import wkt
+
+# Windows (Python 3.13) does not include the city's CA cert in its bundle, so every
+# request in this module passes verify=_SSL (False by default) and the warning is muted.
+# NOTE(review): nothing secret is sent, but without verification the downloads are not authenticated.
+urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
+
+BASE = "https://ckan0.cf.opendata.inter.prod-toronto.ca/api/3/action/"  # prefix for CKAN action calls
+_SSL = False  # passed as requests' verify=; set to True if you install the cert chain manually
+
+
+def _package(ckan_id: str) -> dict:
+    r = requests.get(BASE + "package_show", params={"id": ckan_id}, timeout=30, verify=_SSL)  # package metadata lookup
+    r.raise_for_status()  # fail on HTTP errors before touching the body
+    return r.json()["result"]  # the metadata sits under the "result" key of the JSON response
+
+
+def _last_modified(pkg: dict) -> str:
+    """Return the greatest non-empty `last_modified` string among the package's resources, or "unknown"."""
+    return max((res["last_modified"] for res in pkg["resources"] if res.get("last_modified")), default="unknown")
+
+
+def list_formats(ckan_id: str) -> list[tuple[str, str, str]]:
+    """Debug helper: return (format, name, url[:80]) for each resource of a dataset."""
+    pkg = _package(ckan_id)
+    return [(r.get("format", "?"), r.get("name", "?"), r.get("url", "?")[:80])
+            for r in pkg["resources"]]
+
+
+def _read_geo_bytes(content: bytes) -> gpd.GeoDataFrame:
+    """
+    Parse raw bytes into a GeoDataFrame. Tried in order: ZIP archive (first .shp, else first
+    member), anything gpd.read_file accepts, then CSV with a "geometry" column (GeoJSON or WKT).
+    """
+    # ZIP archives are recognised by their leading "PK" magic bytes
+    if content[:2] == b"PK":
+        with zipfile.ZipFile(io.BytesIO(content)) as z:
+            shp_files = [n for n in z.namelist() if n.lower().endswith(".shp")]
+            if shp_files:
+                with tempfile.TemporaryDirectory() as tmpdir:
+                    z.extractall(tmpdir)  # extracts every member, not just the .shp (presumably for its sidecar files)
+                    return gpd.read_file(os.path.join(tmpdir, shp_files[0]))
+            # ZIP without .shp - try the first file as GeoJSON
+            first = z.namelist()[0]  # NOTE(review): raises IndexError on an empty archive
+            return gpd.read_file(io.BytesIO(z.read(first)))
+    # Otherwise pass bytes directly (GeoJSON, etc.)
+    try:
+        return gpd.read_file(io.BytesIO(content))
+    except Exception:  # fallback: treat the payload as a CSV carrying a geometry column
+        df = pd.read_csv(io.BytesIO(content), encoding="latin-1", on_bad_lines="skip", low_memory=False)
+        geom_col = next((c for c in df.columns if c.lower() == "geometry"), None)
+        if geom_col is None:
+            raise  # no geometry column: re-raise the gpd.read_file error caught above
+
+        def parse_geometry(value):
+            if pd.isna(value):
+                return None
+            text = str(value).strip()
+            if not text:
+                return None
+            if text.startswith("{"):  # looks like a JSON object -> GeoJSON geometry mapping
+                return shape(json.loads(text))
+            return wkt.loads(text)  # anything else is parsed as WKT
+
+        geometry = df[geom_col].apply(parse_geometry)
+        df = df.drop(columns=[geom_col])
+        return gpd.GeoDataFrame(df, geometry=geometry, crs="EPSG:4326")  # CRS is assumed, not read from the data
+
+
+def fetch(ckan_id: str, prefer: str = "geojson") -> tuple[gpd.GeoDataFrame | pd.DataFrame, str]:
+    """
+    Download the first resource matching `prefer` from a CKAN package.
+    prefer="geojson" -> tries geojson, then shp/shapefile/zip (NOT bare "json" - that
+                       matches the CKAN datastore API endpoint, not a real GeoJSON file).
+    prefer="csv"     -> tries csv, then xlsx. An xlsx resource is read with
+                       pd.read_excel, which needs an Excel engine (e.g. openpyxl).
+    Any other value is matched literally against the resource `format` field.
+    Returns (GeoDataFrame|DataFrame, last_modified_string).
+    Raises ValueError if no resource matches; HTTP errors propagate from requests.
+    """
+    pkg = _package(ckan_id)
+    wanted = prefer.lower()
+
+    geo_formats = ["geojson", "shp", "shapefile", "zip"]
+    csv_formats = ["csv", "xlsx"]
+    # Fallback order per preference; any other value is tried on its own.
+    candidates = {"geojson": geo_formats, "csv": csv_formats}.get(wanted, [wanted])
+    is_geo = wanted in geo_formats
+
+    for candidate in candidates:
+        for res in pkg["resources"]:
+            # `or ""` tolerates a resource whose format is present but null.
+            if (res.get("format") or "").lower() != candidate or not res.get("url"):
+                continue
+            # First match wins: download it and parse according to its kind.
+            resp = requests.get(res["url"], timeout=300, verify=_SSL)
+            resp.raise_for_status()
+            lm = _last_modified(pkg)
+            if is_geo:
+                return _read_geo_bytes(resp.content), lm
+            payload = io.BytesIO(resp.content)
+            if candidate == "xlsx":
+                # .xlsx is a ZIP container, so read_csv cannot parse it.
+                return pd.read_excel(payload), lm
+            return pd.read_csv(payload, encoding="latin-1", on_bad_lines="skip", low_memory=False), lm
+
+    raise ValueError(
+        f"No {prefer!r} resource found for {ckan_id!r}. "
+        f"Available: {[(r.get('format'), r.get('name')) for r in pkg['resources']]}"
+    )
+
+
+def fetch_csv_as_geo(
+    ckan_id: str,
+    lat_col: str,
+    lon_col: str,
+    extra_cols: list[str] | None = None,
+) -> tuple[gpd.GeoDataFrame, str]:
+    """
+    Download a CSV with lat/lng columns and return (GeoDataFrame, last_modified).
+    Column name matching is case-insensitive. Rows whose lat/lon cannot be parsed
+    as numbers are dropped. Output columns are lat, lon, then `extra_cols` in the
+    order given (names absent from the CSV are ignored), plus point geometry.
+    Used for point datasets: street trees, traffic volumes, business licences.
+    Raises ValueError if either coordinate column is missing.
+    """
+    df, last_mod = fetch(ckan_id, prefer="csv")
+
+    # Case-insensitive column lookup
+    col_map = {c.lower(): c for c in df.columns}
+    lat = col_map.get(lat_col.lower())
+    lon = col_map.get(lon_col.lower())
+    if lat is None or lon is None:
+        raise ValueError(
+            f"Lat/lon columns {lat_col!r}/{lon_col!r} not found in {ckan_id!r}. "
+            f"Available: {list(df.columns[:20])}"
+        )
+
+    df[lat] = pd.to_numeric(df[lat], errors="coerce")
+    df[lon] = pd.to_numeric(df[lon], errors="coerce")
+    df = df.dropna(subset=[lat, lon])
+
+    # dict.fromkeys de-duplicates but keeps the caller's order; iterating a set of
+    # strings here would change the output column order from one run to the next.
+    extras = dict.fromkeys(col_map[c.lower()] for c in (extra_cols or []) if c.lower() in col_map)
+    keep = [lat, lon] + [c for c in extras if c not in (lat, lon)]
+    gdf = gpd.GeoDataFrame(df[keep], geometry=gpd.points_from_xy(df[lon], df[lat]), crs="EPSG:4326")
+    return gdf, last_mod
+
+
+# Keep the old name bound to the same function so data_pipeline.py doesn't need a rename
+fetch_csv_with_latlon = fetch_csv_as_geo
+
+
+def fetch_gtfs_stops(ckan_id: str = "ttc-routes-and-schedules") -> tuple[gpd.GeoDataFrame, str]:
+    """Download GTFS zip, extract stops.txt, return (GeoDataFrame of stop points, last_modified).
+
+    Uses the first resource whose format is "zip". Raises ValueError if there is no
+    such resource or the archive has no stops.txt; HTTP errors propagate from requests.
+    """
+    pkg = _package(ckan_id)
+    last_mod = _last_modified(pkg)
+    for res in pkg["resources"]:
+        if res.get("format", "").lower() != "zip" or not res.get("url"):
+            continue
+        resp = requests.get(res["url"], timeout=120, verify=_SSL)
+        resp.raise_for_status()  # otherwise an HTTP error body surfaces as a confusing BadZipFile
+        with zipfile.ZipFile(io.BytesIO(resp.content)) as z:
+            stops_name = next((n for n in z.namelist() if n.lower().endswith("stops.txt")), None)
+            if stops_name is None:
+                raise ValueError("stops.txt not found in GTFS zip")
+            stops = pd.read_csv(z.open(stops_name))
+        points = gpd.points_from_xy(stops["stop_lon"], stops["stop_lat"])
+        return gpd.GeoDataFrame(stops[["stop_id", "stop_name"]], geometry=points, crs="EPSG:4326"), last_mod
+
+    raise ValueError(f"No zip resource found for {ckan_id!r}")
+
+
+def download_raster(ckan_id: str, out_path: Path) -> str:
+    """Stream a CKAN package's first TIFF resource to out_path; return the package's last-modified string."""
+    pkg = _package(ckan_id)
+    for res in pkg["resources"]:
+        if res.get("format", "").lower() not in ("tiff", "geotiff", "tif") or not res.get("url"):
+            continue
+        # Write to a sibling ".part" file and rename on success, so a failed download never leaves a truncated out_path.
+        tmp_path = out_path.with_name(out_path.name + ".part")
+        with requests.get(res["url"], stream=True, timeout=300, verify=_SSL) as resp:
+            resp.raise_for_status()
+            out_path.parent.mkdir(parents=True, exist_ok=True)
+            with open(tmp_path, "wb") as f:
+                for chunk in resp.iter_content(chunk_size=65536):
+                    f.write(chunk)
+        tmp_path.replace(out_path)
+        return _last_modified(pkg)
+    raise ValueError(f"No TIFF resource found for {ckan_id!r}")
diff --git a/ml/requirements.txt b/ml/requirements.txt
new file mode 100644
index 0000000..c79fca1
--- /dev/null
+++ b/ml/requirements.txt
@@ -0,0 +1,10 @@
+geopandas==1.0.1
+pyarrow==18.1.0
+shapely==2.0.6
+numpy==2.1.3
+pandas==2.2.3
+scikit-learn==1.5.2
+xgboost==2.1.3
+requests==2.32.3
+rasterio==1.4.3
+python-dotenv==1.0.1