From 92879459ff4f6b7d8e8bdbe78de3fc314427c4f6 Mon Sep 17 00:00:00 2001 From: Omar Soliman Date: Fri, 29 May 2026 22:54:01 -0400 Subject: [PATCH] Add Toronto Open Data pipeline and dataset documentation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - ml/fetch.py: CKAN download helpers with SSL bypass (Windows cert issue), ZIP/shapefile extraction, case-insensitive CSV lat/lon conversion, and GTFS stop parsing for TTC - ml/data_pipeline.py: downloads all 17 datasets to data/ as GeoParquet/ Parquet; caches on disk, filters permits to 2020+, joins neighbourhood profiles (income/density) onto polygon boundaries - ml/requirements.txt: geopandas, xgboost, rasterio, python-dotenv - data/data.md: full dataset guide — buckets, column specs, handoff notes - data/coefficients/: ITE trip generation rates + StatsCan I-O multipliers - .env.example: all required keys (WAQI, Mapbox, Anthropic, Ollama) - .gitignore: exclude data/*.parquet, data/*.tif, backend/models/, .venv - .vscode/settings.json: Python interpreter + ml/ extra path for team --- .env.example | 28 ++ .gitignore | 17 + .vscode/settings.json | 4 + README.md | 88 ++++- data/coefficients/ite_trip_rates.csv | 9 + data/coefficients/statscan_io_multipliers.csv | 6 + data/data.md | 210 +++++++++++ ml/data_pipeline.py | 354 ++++++++++++++++++ ml/fetch.py | 213 +++++++++++ ml/requirements.txt | 10 + 10 files changed, 937 insertions(+), 2 deletions(-) create mode 100644 .env.example create mode 100644 .vscode/settings.json create mode 100644 data/coefficients/ite_trip_rates.csv create mode 100644 data/coefficients/statscan_io_multipliers.csv create mode 100644 data/data.md create mode 100644 ml/data_pipeline.py create mode 100644 ml/fetch.py create mode 100644 ml/requirements.txt diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..360f508 --- /dev/null +++ b/.env.example @@ -0,0 +1,28 @@ +# Toronto Open Data CKAN API (no key required) 
+TORONTO_CKAN_BASE=https://ckan0.cf.opendata.inter.prod-toronto.ca/api/3/action/ + +# Local data directory (relative to project root) +DATA_DIR=./data + +# Date filter for building permits (YYYY-MM-DD) +PERMITS_FILTER_DATE=2020-01-01 + +# Set to 1 to force re-download of all datasets (ignores cache) +FORCE_REFRESH=0 + +# Live APIs used at inference time +# Get free WAQI token at: https://aqicn.org/data-platform/token/ +WAQI_API_KEY=your_waqi_token_here + +# Get Mapbox token at: https://account.mapbox.com/ +MAPBOX_TOKEN=your_mapbox_token_here + +# Get Anthropic API key at: https://console.anthropic.com/ +ANTHROPIC_API_KEY=your_anthropic_key_here + +# Optional: IESO real-time electricity prices (no key required) +IESO_BASE_URL=https://reports-public.ieso.ca/public/ + +# Ollama local LLM (running on Blackwell GPU) +OLLAMA_HOST=http://localhost:11434 +OLLAMA_MODEL=llama3.1:8b diff --git a/.gitignore b/.gitignore index 83972fa..064e7ae 100644 --- a/.gitignore +++ b/.gitignore @@ -216,3 +216,20 @@ __marimo__/ # Streamlit .streamlit/secrets.toml + +# UrbanForge — downloaded data files (large, reproducible via data_pipeline.py) +data/*.parquet +data/*.tif +data/*.tiff + +# Trained ML model artifacts (reproducible via train_models.py) +backend/models/*.pkl +backend/models/*.json + +# Frontend +frontend/node_modules/ +frontend/.next/ +frontend/.env.local + +# Docker +.docker/ diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..c7aa6f5 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,4 @@ +{ + "python.defaultInterpreterPath": "${workspaceFolder}/.venv/Scripts/python.exe", + "python.analysis.extraPaths": ["${workspaceFolder}/ml"] +} diff --git a/README.md b/README.md index 5489ddd..192c978 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,86 @@ -# Nvidia-Hackathon -Nvidia Hackathon +# Toronto Construction Impact Analyzer + +Drop a pin on a Toronto map, describe a proposed building, and get an instant impact report: +**economy** 
(jobs, tax revenue, utility costs) · **environment** (trees, CO₂, air quality) · **traffic** · **community benefit score** + an AI-generated narrative — all powered by a local Blackwell GPU. + +--- + +## Quick Start + +### 1. Download & process Toronto open data +```bash +cd ml +pip install -r requirements.txt +python data_pipeline.py +``` + +### 2. Train the ML models (requires NVIDIA GPU) +```bash +python train_models.py +``` + +### 3. Start Ollama + FastAPI backend +```bash +# From the project root +docker-compose up -d +# Pull the LLM model on first run +docker exec -it nvidia-hackathon-ollama-1 ollama pull llama3.1:8b +``` + +### 4. Start the frontend +```bash +cd frontend +npm install +# Add your Mapbox token to .env.local +echo "NEXT_PUBLIC_MAPBOX_TOKEN=pk.your_token_here" > .env.local +npm run dev +``` + +Open http://localhost:3000 — click anywhere on Toronto to begin. + +--- + +## Architecture + +``` +Next.js (Mapbox map + form + report card) + │ POST /api/analyze + ▼ +FastAPI backend + ├── Spatial lookup → GeoParquet files (Toronto Open Data) + ├── ML inference → 3 XGBoost models (Blackwell GPU) + ├── Community score → rule-based composite + └── LLM narrative → Ollama llama3.1:8b (Blackwell GPU) +``` + +## ML Models + +| Model | Inputs | Outputs | +|---|---|---| +| Economic | building type, sqft, floors, neighbourhood | jobs, tax revenue, utility cost | +| Environmental | building type, sqft, floors, nearby trees, traffic | trees at risk, CO₂/year, AQI delta | +| Traffic | building type, sqft, floors, transit access, traffic baseline | daily trips, peak congestion %, parking demand | + +All three models are XGBoost regressors trained with `device=cuda` on Toronto Building Permits × spatial context features. + +## Data Sources + +See [`data/data.md`](data/data.md) for the complete dataset guide. 
+ +Key sources: +- Toronto Energy and Water Reporting (EWRB) — utility model training +- Toronto Building Permits — training backbone +- Toronto Employment Survey — jobs model +- Transportation Tomorrow Survey (TTS) — traffic model +- Toronto Street Trees, Neighbourhood Profiles, Zoning By-law — spatial context + +## Community Benefit Score + +Rule-based 0–100 composite: +- +20 residential / +10 mixed-use (housing supply) +- +15 affordable housing mentioned +- +15 / +8 / −8 transit access (distance to TTC stop) +- +10 ground-floor commercial +- −5 to −20 traffic congestion burden +- −5 to −15 tree/canopy loss +- +10 low-carbon building diff --git a/data/coefficients/ite_trip_rates.csv b/data/coefficients/ite_trip_rates.csv new file mode 100644 index 0000000..c4b28b4 --- /dev/null +++ b/data/coefficients/ite_trip_rates.csv @@ -0,0 +1,9 @@ +building_type,ite_code,daily_trips_per_1000sqft,peak_am_per_1000sqft,peak_pm_per_1000sqft,source +residential,220,6.65,0.35,0.44,ITE Trip Generation 11th Ed (per unit: 6.65/unit residential mid-rise) +commercial_office,710,11.03,1.56,1.49,ITE Trip Generation 11th Ed +retail_general,820,42.70,1.03,4.24,ITE Trip Generation 11th Ed +industrial_general,110,6.97,0.97,0.86,ITE Trip Generation 11th Ed +mixed_use,230,10.50,0.65,0.78,ITE Mixed-Use Trip Generation (blended estimate) +hotel,310,8.70,0.54,0.60,ITE Trip Generation 11th Ed +supermarket,850,102.24,8.00,9.48,ITE Trip Generation 11th Ed +fast_food,934,496.12,34.64,26.61,ITE Trip Generation 11th Ed diff --git a/data/coefficients/statscan_io_multipliers.csv b/data/coefficients/statscan_io_multipliers.csv new file mode 100644 index 0000000..60ac0e4 --- /dev/null +++ b/data/coefficients/statscan_io_multipliers.csv @@ -0,0 +1,6 @@ +sector,province,gdp_multiplier_per_1m_cad,employment_person_years_per_1m_cad,source_year,notes +residential_construction,Ontario,1.71,8.2,2021,StatsCan Input-Output Multipliers Table 381-0031 +commercial_construction,Ontario,1.68,7.1,2021,StatsCan 
Input-Output Multipliers Table 381-0031 +industrial_construction,Ontario,1.62,5.8,2021,StatsCan Input-Output Multipliers Table 381-0031 +mixed_use_construction,Ontario,1.70,7.8,2021,Blended average residential/commercial +infrastructure_construction,Ontario,1.75,8.9,2021,StatsCan Input-Output Multipliers Table 381-0031 diff --git a/data/data.md b/data/data.md new file mode 100644 index 0000000..126ffd1 --- /dev/null +++ b/data/data.md @@ -0,0 +1,210 @@ +# Data Guide — Toronto Construction Impact Analyzer + +This document is the single source of truth for every dataset the project needs. +Datasets are split into four buckets: training data, spatial context layers, coefficient lookup tables, and live runtime APIs. + +--- + +## Bucket 1 — Training Data + +*Row-by-row historical datasets used to train or calibrate ML models.* + +### EWRB — Toronto Energy and Water Reporting & Benchmarking + +- **Used by:** Model 1 (Energy / Utility Cost) +- **What it is:** Every Toronto building over 10,000 sq ft must report annual electricity (kWh), gas (m³), and water (m³) consumption plus basic building characteristics (type, floor area, year built, floors, heating type). +- **Why it's ideal:** The dataset is already in exactly the shape you need for regression — features on the left, consumption targets on the right. Multiple years are available so you can also model trends. +- **Where to get it:** Toronto Open Data portal → search "Energy and Water Reporting and Benchmarking". Direct CSV download, free. 
+- **Feature columns:** `building_type`, `floor_area_sqft`, `year_built`, `num_floors`, `neighbourhood`, `heating_type` +- **Target columns:** `annual_electricity_kwh`, `annual_gas_m3`, `annual_water_m3` + +--- + +### Toronto Building Permits (Cleared) + +- **Used by:** Model 2 (Construction Jobs), Model 4 (Property Tax), Model 8 (Community Impact) +- **What it is:** Every building permit issued by the City of Toronto — includes declared construction value, building type, address, square footage, number of storeys. +- **Why it's useful:** Lets you derive cost-per-sqft distributions by building type (needed to estimate construction value from height + footprint in the model). Also the backbone for training the community impact model (neighborhood building stock changes over time). +- **Where to get it:** Toronto Open Data → "Building Permits — Active Permits" and "Building Permits — Cleared Permits". CSV or API, free. +- **Key columns:** `permit_type`, `declared_valuation`, `work_type`, `address`, `no_of_storeys`, `floor_area_m2`, `issued_date` + +--- + +### Toronto Employment Survey + +- **Used by:** Model 3 (Operational Jobs) +- **What it is:** City of Toronto surveys every business with location and employee count. You can aggregate to building-level and get jobs-per-sqft by building type. +- **Why it's useful:** Training set for "how many permanent jobs does a 50,000 sqft office building in the Annex produce?" +- **Where to get it:** Toronto Open Data → "Toronto Employment Survey". CSV, free. +- **Key columns:** `address`, `industry`, `employees`, `gfa_sqm` (if available) +- **Fallback:** If this dataset is too coarse, use ITE employment generation rates (Bucket 3) as a lookup table instead of training a model. + +--- + +### Transportation Tomorrow Survey (TTS) + +- **Used by:** Model 6 (Traffic Generation) +- **What it is:** Household travel survey covering the Greater Toronto Area. 
Origin-destination pairs by mode, aggregated to traffic zones with land-use info. Considered the gold standard for GTA travel demand modeling. +- **Why it's useful:** Training data for trip generation rates by building type and transit access. +- **Where to get it:** Data Management Group, University of Toronto (dmg.utoronto.ca). Free for academic/research use — request access. +- **Key columns (zone-level aggregate):** `zone_id`, `building_type_mix`, `residential_units`, `commercial_gfa`, `transit_access_score`, `daily_trips_generated` + +--- + +### Traffic Volumes at Intersections (all modes) + +- **Used by:** Model 6 (Traffic), also as Bucket 2 spatial context +- **What it is:** 30+ years of vehicle, pedestrian, and cycling counts at major Toronto intersections. +- **Why it's useful:** Ground-truth for calibrating trip generation predictions. Cross-reference pre/post building construction to measure actual traffic delta. +- **Where to get it:** Toronto Open Data → "Traffic Volumes at Intersections for All Modes". CSV, free. +- **Key columns:** `count_id`, `location`, `latitude`, `longitude`, `year`, `8hr_vehicle_volume` + +--- + +### Air Quality Ontario — Hourly Monitoring Data + +- **Used by:** Model 5 (Air Quality) +- **What it is:** Hourly PM2.5 and NO₂ readings from all air quality monitoring stations across Ontario. Multi-decade history available. +- **Why it's useful:** Training target for predicting how land-use changes (more traffic, less canopy) affect local air quality. +- **Where to get it:** Ontario Ministry of Environment → Air Quality Ontario data downloads (airqualityontario.com). Free CSV downloads by year/station. +- **Feature engineering:** Join station readings with surrounding land-use features (traffic volume within 500 m, % tree canopy within 500 m, building density, distance to highway) to build the training feature matrix. 
+ +--- + +### 311 Service Requests (2014–present) + +- **Used by:** Model 8 (Community Impact) +- **What it is:** Every 311 complaint/service request filed by Toronto residents, with category, address, and date. Over 10 years of history. +- **Why it's useful:** Proxy for neighbourhood service strain. Train a model: "as building density increases, which complaint categories increase?" +- **Where to get it:** Toronto Open Data → "311 Service Requests". CSV/API, free. +- **Key columns:** `service_request_id`, `type`, `ward`, `neighbourhood`, `opened_date`, `latitude`, `longitude` + +--- + +### MPAC Property Assessment Data + +- **Used by:** Model 4 (Property Tax Revenue) +- **What it is:** Municipal Property Assessment Corporation — assessed values for all Ontario properties. Full parcel-level data is licensed, but Toronto Open Data provides aggregated property tax collection statistics. +- **Where to get it:** Toronto Open Data → "Property Tax Collection". For parcel-level, request through MPAC directly or use the publicly available Assessment Roll summaries. 
+- **Key columns:** `roll_number`, `property_class`, `assessed_value`, `gfa_sqm`, `ward`, `neighbourhood` + +--- + +## Bucket 2 — Spatial Context Layers + +*Load once at startup, query at a given lat/lng at demo time.* + + +| Dataset | Use at Runtime | Source | +| --------------------------------------- | ----------------------------------------------------- | --------------------------------------------------------------- | +| **Zoning By-law (GeoJSON)** | Zoning class (e.g., CR 3.0) at pin location | Toronto Open Data | +| **Property Boundaries (parcels)** | Lot size and shape at pin | Toronto Open Data → "Toronto Parcel Data" | +| **Street Tree Inventory** | Count trees within 500 m of pin; estimate canopy loss | Toronto Open Data → "Street Tree Data" | +| **Forest & Land Cover raster** | % canopy cover in 500 m buffer | Toronto Open Data → "Forest & Land Cover" | +| **TTC Stops (subway + LRT + bus)** | Transit access score = distance to nearest stop | Toronto Open Data → "TTC Ridership" / GTFS | +| **Neighbourhood Boundaries + Profiles** | Neighbourhood name, median income, population density | Toronto Open Data → "Neighbourhoods" + "Neighbourhood Profiles" | +| **TRCA Flood Plain** | Flag if pin is in flood zone | TRCA GIS Open Data | +| **Schools / Childcare / Libraries** | Amenity count within 1 km | Toronto Open Data | +| **Sewer Mains / Water Mains** | Utility capacity proximity | Toronto Open Data → "Sewer Shed" | + + +All spatial layers should be saved as **GeoParquet** files (`data/*.parquet`) after the first download. The backend loads them into memory at startup for sub-10 ms spatial lookups. + +--- + +## Bucket 3 — Coefficient Lookup Tables + +*Small CSVs or PDFs — no training needed, just multiply.* + +### Statistics Canada Input-Output Multipliers + +- **Used by:** Model 2 (Construction Jobs) +- **What it is:** "$1M residential construction in Ontario → X person-years of employment, $Y GDP contribution." Pre-computed by StatsCan economists. 
+- **How to use:** Look up building type + province → multiply by estimated construction value. +- **Where to get it:** StatsCan → "Supply and Use Tables / Input-Output Multipliers" (free download). Also summarized in CMHC Economic Impact of Homebuilding reports. + +### ITE Trip Generation Rates + +- **Used by:** Model 6 (Traffic) as fallback +- **What it is:** Land use code → trips/day/unit or trips/day/1000 sqft. Industry standard used by traffic engineers worldwide. +- **How to use:** `daily_trips = ite_rate[building_type] × (sqft / 1000)` adjusted by transit access modifier. +- **Where to get it:** ITE Trip Generation Manual (licensed, but rates for common codes are widely republished). Also in Toronto's own transportation studies. + +### Toronto Property Tax Rates (Annual) + +- **Used by:** Model 4 (Property Tax Revenue) +- **What it is:** City of Toronto publishes residential, commercial, and industrial tax rates each year (% of assessed value). Small table, static. +- **How to use:** `annual_tax = assessed_value × tax_rate[property_class]` +- **Where to get it:** toronto.ca → "Property Tax Rates and Assessment". Free PDF/HTML, updated annually. + +### OEB Electricity & Gas Rates + +- **Used by:** Model 1 (Utility Cost in dollars) +- **What it is:** Ontario Energy Board regulated rates for residential and commercial customers. Lets you convert kWh/m³ predictions → CAD cost. +- **How to use:** `utility_cost_cad = predicted_kwh × rate_kwh + predicted_gas_m3 × rate_m3` +- **Where to get it:** oeb.ca → "Electricity Rates" and "Natural Gas Rates". Free PDFs, updated quarterly. Cache locally. + +### ASHRAE / NRCan Energy Intensity Benchmarks + +- **Used by:** Model 1 as a sanity check / baseline +- **What it is:** Expected kWh/sqft/year by building type (office, residential, retail, etc.). Useful for validating EWRB-trained model predictions. +- **Where to get it:** NRCan Commercial and Institutional Building Energy Use survey (free download). 
+ +--- + +## Bucket 4 — Live APIs (called at demo runtime) + + +| Purpose | API | Cost | Notes | +| ------------------------------------------------------- | ------------------------------ | -------------------- | ------------------------------------------------------------- | +| **Geocoding** (address → lat/lng) | Mapbox Geocoding API | Free tier sufficient | More reliable than Nominatim for Toronto addresses | +| **Reverse geocoding** (lat/lng → address/neighbourhood) | Mapbox Geocoding API | Free tier | Fires on pin drop | +| **Live air quality baseline** | WAQI / AQICN (`api.waqi.info`) | Free with key | One call per analysis; returns current AQI at nearest station | +| **LLM narrative generation** | Ollama (local, llama3.1:8b) | Free (local GPU) | Running on Blackwell; no API cost | +| **IESO real-time electricity prices** | IESO Adequacy API | Free | Optional; only needed for hourly cost modeling | +| **TTC transit info** | Toronto GTFS static feed | Free file | Download once, query locally for transit access score | + + +--- + +## Priority Order for the Hackathon + +**Train these 3 models** (clean data, clear dollar outputs, construction companies care most): + +1. **Model 1 — Energy/Utility Cost** (EWRB dataset, XGBoost regression, ready in ~2 hours) +2. **Model 6 — Traffic Generation** (TTS + intersection counts, or ITE lookup table as fallback) +3. 
**Model 4 — Property Tax Revenue** (MPAC assessments + tax rate table, simple regression) + +**Use lookup tables for these** (no training needed): + +- Construction jobs → StatsCan I-O multipliers × construction value +- Tree/canopy loss → geometric overlay (no model, just PostGIS intersection) +- Air quality → CANUE land-use regression coefficients as a shortcut + +**Use rule-based scoring for:** + +- Community benefit score (transit proximity + housing type + green space ratio) +- 311 complaint risk (density delta lookup) + +--- + +## File Naming Convention + +All processed data is saved to `data/` as GeoParquet (spatial layers) or plain Parquet (tabular tables): + +``` +data/ +├── building_permits.parquet # ~50k rows, training backbone +├── neighbourhoods.parquet # 158 polygons + income/density +├── street_trees.parquet # ~500k point geometries +├── traffic_volumes.parquet # ~5k intersection count points +├── ttc_stops.parquet # ~11k stop locations +├── zoning.parquet # polygon layer (large) +├── ewrb_energy.parquet # energy benchmarking training data +├── employment_survey.parquet # jobs training data +└── coefficients/ + ├── ite_trip_rates.csv + ├── statscan_io_multipliers.csv + └── tax_rates.csv +``` + diff --git a/ml/data_pipeline.py b/ml/data_pipeline.py new file mode 100644 index 0000000..d4ca2b3 --- /dev/null +++ b/ml/data_pipeline.py @@ -0,0 +1,354 @@ +""" +Download all Toronto Open Data datasets and save to data/ as GeoParquet / Parquet. + +Run once before model training: + pip install -r ml/requirements.txt + python ml/data_pipeline.py + +Everything is cached - re-running skips already-downloaded files. +Set FORCE_REFRESH=1 to re-download everything. 
+""" + +import os +import sys +from pathlib import Path + +import pandas as pd +import geopandas as gpd +from dotenv import load_dotenv + +load_dotenv() + +DATA_DIR = Path(os.getenv("DATA_DIR", Path(__file__).parent.parent / "data")) +DATA_DIR.mkdir(parents=True, exist_ok=True) +(DATA_DIR / "coefficients").mkdir(exist_ok=True) + +PERMITS_FILTER_DATE = os.getenv("PERMITS_FILTER_DATE", "2020-01-01") +FORCE = os.getenv("FORCE_REFRESH", "0") == "1" + +# Add ml/ to path so fetch.py is importable +sys.path.insert(0, str(Path(__file__).parent)) +from fetch import fetch, fetch_gtfs_stops, fetch_csv_with_latlon, download_raster + + +def _save(name: str, data: gpd.GeoDataFrame | pd.DataFrame, last_mod: str): + out = DATA_DIR / f"{name}.parquet" + data.to_parquet(out) + print(f" [{name}] {len(data):,} rows -> {out.name} (source: {last_mod})") + + +def cached(name: str) -> bool: + if FORCE: + return False + exists = (DATA_DIR / f"{name}.parquet").exists() + if exists: + print(f" [{name}] already exists - skipping (set FORCE_REFRESH=1 to re-download)") + return exists + + +# --------------------------------------------------------------------------- +# 1. 
# SPATIAL LAYERS (GeoJSON -> GeoParquet)
# ---------------------------------------------------------------------------

def _download_layer(name: str, ckan_id: str, *, prefer: str = "geojson") -> None:
    """Fetch one CKAN package and save it as ``<name>.parquet`` (best effort).

    Does nothing when ``cached(name)`` reports the output already exists.
    Any exception raised while fetching or saving is printed as a WARNING
    line and swallowed, so the caller keeps going.

    Args:
        name: Output stem passed to ``cached`` and ``_save``.
        ckan_id: CKAN package id passed to ``fetch``.
        prefer: Resource format preference forwarded to ``fetch``.
    """
    if cached(name):
        return
    try:
        gdf, lm = fetch(ckan_id, prefer=prefer)
        _save(name, gdf, lm)
    except Exception as e:
        print(f" WARNING: {name} failed ({e})")


def dl_street_trees() -> None:
    """Download the street tree inventory and save it as ``street_trees``.

    Errors are not caught here: a failed download propagates to the caller.
    """
    if cached("street_trees"):
        return
    # Street tree dataset is CSV-only (no GeoJSON resource) with LATITUDE/LONGITUDE columns
    gdf, lm = fetch_csv_with_latlon(
        "street-tree-data",
        lat_col="LATITUDE",
        lon_col="LONGITUDE",
        extra_cols=["DBH_TRUNK", "COMMON_NAME", "SPECIES_DESC"],
    )
    gdf = gdf.rename(
        columns={
            "DBH_TRUNK": "dbh_trunk",
            "COMMON_NAME": "common_name",
            "SPECIES_DESC": "species",
        }
    )
    _save("street_trees", gdf, lm)


def dl_neighbourhoods() -> None:
    """Download neighbourhood polygons AND profiles CSV, then join income + density.

    The polygon download is mandatory (its errors propagate). The profiles
    join is best effort: if anything in it raises, a WARNING is printed and
    the polygons are saved with whatever columns were joined so far.
    The neighbourhood name column is renamed to ``name`` before saving.
    """
    if cached("neighbourhoods"):
        return

    # Polygon boundaries
    hoods, lm = fetch("neighbourhoods", prefer="geojson")
    hoods.columns = [c.upper() for c in hoods.columns]
    if "GEOMETRY" in hoods.columns:
        # Upper-casing also renamed the geometry column; restore it.
        hoods = hoods.rename(columns={"GEOMETRY": "geometry"}).set_geometry("geometry")
    # First column that looks like a neighbourhood name; falls back to column 0.
    name_col = next(
        (c for c in hoods.columns if c != "geometry" and ("NAME" in c or "HOOD" in c)),
        hoods.columns[0],
    )

    # Wide-format profiles CSV (columns = neighbourhood names, rows = variables)
    try:
        profiles_raw, _ = fetch("neighbourhood-profiles", prefer="csv")
        profiles_raw.columns = [str(c).strip() for c in profiles_raw.columns]
        variables = profiles_raw[profiles_raw.columns[0]].astype(str)

        for pattern, out_col in (
            ("Median after-tax income", "median_income"),
            ("Population density", "population_density"),
        ):
            rows = profiles_raw[variables.str.contains(pattern, na=False, case=False)]
            if rows.empty:
                continue
            # First matching row; values arrive as strings like "52,000".
            values = pd.to_numeric(
                rows.iloc[0, 1:].astype(str).str.replace(",", "", regex=False).str.strip(),
                errors="coerce",
            ).rename(out_col)
            hoods = hoods.set_index(name_col).join(values, how="left").reset_index()
            if not hoods[out_col].notna().any():
                # The join is an exact match on neighbourhood name; surface a
                # total mismatch instead of silently writing an all-NaN column.
                print(f" WARNING: no neighbourhood names matched for {out_col}")
    except Exception as e:
        print(f" WARNING: neighbourhood profiles join failed ({e}) - saving polygons only")

    hoods = hoods.rename(columns={name_col: "name"})
    _save("neighbourhoods", hoods, lm)


def dl_zoning() -> None:
    """Save the zoning by-law layer as ``zoning_area`` and ``zoning_height``.

    NOTE(review): no layer selection is performed - both outputs receive the
    same "zoning-by-law" GeoJSON resource. Earlier code carried per-output
    keyword lists ("zoning area", "height overlay", ...) but never consulted
    them. TODO: pick the matching CKAN resource per output.

    The resource is downloaded at most once per run and reused for every
    output that is not cached. Failures are reported per output as WARNING
    lines and swallowed; a failed download is retried for the next output.
    """
    gdf = None
    lm = None
    for out_name in ("zoning_area", "zoning_height"):
        if cached(out_name):
            continue
        try:
            if gdf is None:
                gdf, lm = fetch("zoning-by-law", prefer="geojson")
            # The GeoJSON may already be the right layer, or may have a 'layer' property
            if "ZONE_CLASS" not in gdf.columns and "ZBL_ZONE" not in gdf.columns:
                print(f" [{out_name}] downloaded but could not identify layer columns - saving raw")
            _save(out_name, gdf, lm)
        except Exception as e:
            print(f" WARNING: {out_name} failed ({e})")


def dl_centreline() -> None:
    """Download the Toronto centreline layer and save it as ``street_centreline``.

    Errors are not caught here: a failed download propagates to the caller.
    """
    if cached("street_centreline"):
        return
    gdf, lm = fetch("toronto-centreline-tcl", prefer="geojson")
    _save("street_centreline", gdf, lm)


def dl_traffic_volumes() -> None:
    """Download intersection traffic counts and save them as ``traffic_volumes``.

    The first column whose lower-cased name contains both "vehicle" and
    "volume" is renamed to ``volume_8hr_vehicles``. Best effort: any failure
    is printed as a WARNING and swallowed.
    """
    if cached("traffic_volumes"):
        return
    try:
        # Traffic volumes has lat/lng columns
        gdf, lm = fetch_csv_with_latlon(
            "traffic-volumes-at-intersections-for-all-modes",
            lat_col="latitude",
            lon_col="longitude",
            extra_cols=[
                "location_id",
                "location",
                "8_hr_vehicle_volume",
                "8_hr_pedestrian_volume",
                "count_date",
            ],
        )
        # Normalise the volume column name
        vol_col = next(
            (c for c in gdf.columns if "vehicle" in c.lower() and "volume" in c.lower()),
            None,
        )
        if vol_col:
            gdf = gdf.rename(columns={vol_col: "volume_8hr_vehicles"})
        _save("traffic_volumes", gdf, lm)
    except Exception as e:
        print(f" WARNING: traffic_volumes failed ({e})")


def dl_parks() -> None:
    """Download the ``parks`` and ``green_spaces`` layers (each best effort)."""
    for name, ckan_id in (
        ("parks", "parks-and-recreation-facilities"),
        ("green_spaces", "green-spaces"),
    ):
        _download_layer(name, ckan_id)


def dl_cycling_network() -> None:
    """Download the cycling network layer as ``cycling_network`` (best effort)."""
    _download_layer("cycling_network", "cycling-network")


def dl_development_applications() -> None:
    """Download development applications as ``development_applications`` (best effort)."""
    _download_layer("development_applications", "development-applications")


def dl_heritage() -> None:
    """Download heritage properties as ``heritage_properties`` (best effort)."""
    _download_layer("heritage_properties", "heritage-properties")


# ---------------------------------------------------------------------------
# 2.
TABULAR TRAINING DATA (CSV -> Parquet) +# --------------------------------------------------------------------------- + +EWRB_RENAME = { + "Property GFA - Self-Reported (ft²)": "floor_area_sqft", + "Electricity Use - Grid Purchase (kWh)": "annual_electricity_kwh", + "Natural Gas Use (GJ)": "annual_gas_gj", + "Water Use (m³)": "annual_water_m3", + "GHG Emissions Intensity (kg CO2e/ft²)": "ghg_intensity", + "Total GHG Emissions (kg CO2e)": "total_ghg_kg", + "Property Type": "building_type", + "Year Built": "year_built", + "Number of Floors": "num_floors", + "City": "city", + "Postal Code": "postal_code", + "Ward": "ward", +} + + +def dl_ewrb(): + if cached("ewrb_energy"): + return + try: + df, lm = fetch("annual-energy-consumption", prefer="csv") + df = df.rename(columns={k: v for k, v in EWRB_RENAME.items() if k in df.columns}) + _save("ewrb_energy", df, lm) + except Exception as e: + print(f" WARNING: ewrb_energy failed ({e})") + + +def dl_building_permits(): + for name, ckan_id in [ + ("building_permits_cleared", "building-permits-cleared-permits"), + ("building_permits_active", "building-permits-active-permits"), + ]: + if cached(name): + continue + try: + df, lm = fetch(ckan_id, prefer="csv") + # Normalise date column name (varies between datasets) + date_col = next( + (c for c in df.columns if "issue" in c.lower() and "date" in c.lower()), None + ) + if date_col and name == "building_permits_cleared": + df[date_col] = pd.to_datetime(df[date_col], errors="coerce") + df = df[df[date_col] >= PERMITS_FILTER_DATE] + print(f" [{name}] filtered to {PERMITS_FILTER_DATE}+ -> {len(df):,} rows") + _save(name, df, lm) + except Exception as e: + print(f" WARNING: {name} failed ({e})") + + +def dl_business_licences(): + if cached("business_licences"): + return + try: + df, lm = fetch( + "municipal-licensing-and-standards-business-licences-and-permits", + prefer="csv", + ) + _save("business_licences", df, lm) + except Exception as e: + print(f" WARNING: business_licences 
failed ({e})") + + +def dl_property_tax(): + for name, ckan_id in [ + ("property_tax", "property-tax-collection"), + ("cva_residential", "current-value-assessment-cva-information-residential-property-types"), + ]: + if cached(name): + continue + try: + df, lm = fetch(ckan_id, prefer="csv") + _save(name, df, lm) + except Exception as e: + print(f" WARNING: {name} failed ({e})") + + +# --------------------------------------------------------------------------- +# 3. SPECIAL CASES +# --------------------------------------------------------------------------- + +def dl_ttc_stops(): + if cached("ttc_stops"): + return + try: + gdf, lm = fetch_gtfs_stops("ttc-routes-and-schedules") + _save("ttc_stops", gdf, lm) + except Exception as e: + print(f" WARNING: ttc_stops failed ({e})") + + +def dl_forest_cover(): + out = DATA_DIR / "forest_cover.tif" + if not FORCE and out.exists(): + print(" [forest_cover] already exists - skipping") + return + try: + lm = download_raster("forest-and-land-cover", out) + size_mb = out.stat().st_size / 1_048_576 + print(f" [forest_cover] {size_mb:.1f} MB -> {out.name} (source: {lm})") + except Exception as e: + print(f" WARNING: forest_cover failed ({e})") + + +# --------------------------------------------------------------------------- +# MAIN +# --------------------------------------------------------------------------- + +def main(): + print(f"=== UrbanForge Data Pipeline ===") + print(f"Output dir : {DATA_DIR}") + print(f"Permit cut : {PERMITS_FILTER_DATE}") + print(f"Force : {FORCE}\n") + + print("-- Spatial layers ----------------------------------") + dl_street_trees() + dl_neighbourhoods() + dl_zoning() + dl_centreline() + dl_traffic_volumes() + dl_parks() + dl_cycling_network() + dl_development_applications() + dl_heritage() + + print("\n-- Tabular training data ---------------------------") + dl_ewrb() + dl_building_permits() + dl_business_licences() + dl_property_tax() + + print("\n-- Special cases 
-----------------------------------") + dl_ttc_stops() + dl_forest_cover() + + print("\n=== Done ===") + parquet_files = list(DATA_DIR.glob("*.parquet")) + tif_files = list(DATA_DIR.glob("*.tif")) + print(f" {len(parquet_files)} parquet files") + print(f" {len(tif_files)} raster files") + print(f" Total: {sum(f.stat().st_size for f in parquet_files + tif_files) / 1_048_576:.0f} MB") + + +if __name__ == "__main__": + main() diff --git a/ml/fetch.py b/ml/fetch.py new file mode 100644 index 0000000..ae0ab3f --- /dev/null +++ b/ml/fetch.py @@ -0,0 +1,213 @@ +""" +Toronto Open Data download helpers. +All functions return a GeoDataFrame (spatial) or DataFrame (tabular). +Results are NOT cached here - caching is handled in data_pipeline.py. +""" + +import io +import json +import os +import tempfile +import zipfile +from pathlib import Path + +import pandas as pd +import geopandas as gpd +import requests +import urllib3 +from shapely.geometry import shape +from shapely import wkt + +# Windows (Python 3.13) does not include the city's CA cert in its bundle, +# so all requests to the Toronto Open Data API use verify=False. +# NOTE: this skips TLS certificate checks. No secrets are sent, but downloads are unauthenticated and could be altered in transit; install the cert chain and set _SSL = True where possible. 
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

BASE = "https://ckan0.cf.opendata.inter.prod-toronto.ca/api/3/action/"
# NOTE(review): with verification off, responses are not authenticated, so a
# network attacker could alter the downloaded data. Flip to True once the
# certificate chain is installed.
_SSL = False  # set to True if you install the cert chain manually


def _resource_format(res: dict) -> str:
    """Return a resource's ``format`` field lower-cased, or "" if missing/null."""
    return (res.get("format") or "").lower()


def _package(ckan_id: str) -> dict:
    """Call CKAN ``package_show`` for *ckan_id* and return its ``result`` dict.

    Raises ``requests.HTTPError`` when the API answers with an error status.
    """
    resp = requests.get(
        BASE + "package_show",
        params={"id": ckan_id},
        timeout=30,
        verify=_SSL,
    )
    resp.raise_for_status()
    return resp.json()["result"]


def _last_modified(pkg: dict) -> str:
    """Return the greatest ``last_modified`` string among the package's resources.

    Resources without a (non-empty) ``last_modified`` are ignored; when none
    has one the literal ``"unknown"`` is returned.
    """
    stamps = [r["last_modified"] for r in pkg["resources"] if r.get("last_modified")]
    return max(stamps, default="unknown")


def list_formats(ckan_id: str) -> list[tuple[str, str, str]]:
    """Debug helper: return ``(format, name, url[:80])`` for each resource.

    Missing or null fields are reported as ``"?"`` so the helper never fails
    on sparsely filled metadata.
    """
    pkg = _package(ckan_id)
    return [
        (r.get("format") or "?", r.get("name") or "?", (r.get("url") or "?")[:80])
        for r in pkg["resources"]
    ]


def _read_geo_bytes(content: bytes) -> gpd.GeoDataFrame:
    """
    Try to parse raw bytes as a geodataframe.

    Handles, in this order:
      * a ZIP archive containing a shapefile (extracted to a temp directory),
      * a ZIP archive without a shapefile (its first file is read directly),
      * anything ``gpd.read_file`` understands (GeoJSON, etc.),
      * a CSV with a ``geometry`` column holding GeoJSON or WKT text.

    Raises ``ValueError`` for a ZIP archive that contains no files. If the
    bytes are neither readable by geopandas nor a CSV with a geometry column,
    the original geopandas error is re-raised.
    """
    # ZIP local-file header magic -> treat as an archive.
    if content[:2] == b"PK":
        with zipfile.ZipFile(io.BytesIO(content)) as z:
            # Ignore directory entries and macOS resource-fork stubs
            # ("__MACOSX/._name.shp"), which are not real data files.
            members = [
                n for n in z.namelist()
                if not n.endswith("/") and not n.startswith("__MACOSX/")
            ]
            if not members:
                raise ValueError("ZIP archive contains no files")
            shp_files = [n for n in members if n.lower().endswith(".shp")]
            if shp_files:
                # A shapefile needs its sidecar files (.dbf, .shx, ...) on disk.
                with tempfile.TemporaryDirectory() as tmpdir:
                    z.extractall(tmpdir)
                    return gpd.read_file(os.path.join(tmpdir, shp_files[0]))
            # ZIP without .shp - try the first file as GeoJSON
            return gpd.read_file(io.BytesIO(z.read(members[0])))

    # Otherwise pass bytes directly (GeoJSON, etc.)
    try:
        return gpd.read_file(io.BytesIO(content))
    except Exception:
        # Fallback: a CSV export that carries geometry as text.
        df = pd.read_csv(io.BytesIO(content), encoding="latin-1", on_bad_lines="skip", low_memory=False)
        geom_col = next((c for c in df.columns if c.lower() == "geometry"), None)
        if geom_col is None:
            # Not a geometry CSV either: surface the original geopandas error.
            raise

        def parse_geometry(value):
            """Turn one cell (GeoJSON text, WKT text or empty) into a geometry."""
            if pd.isna(value):
                return None
            text = str(value).strip()
            if not text:
                return None
            if text.startswith("{"):
                return shape(json.loads(text))
            return wkt.loads(text)

        geometry = df[geom_col].apply(parse_geometry)
        df = df.drop(columns=[geom_col])
        return gpd.GeoDataFrame(df, geometry=geometry, crs="EPSG:4326")


def _read_table_bytes(content: bytes, fmt: str) -> pd.DataFrame:
    """Parse a downloaded tabular resource according to its CKAN format.

    Excel workbooks are binary and must go through ``pd.read_excel`` (which
    needs an Excel engine such as openpyxl installed); everything else is
    read as CSV with the lenient options used throughout this module.
    """
    if fmt in ("xlsx", "xls"):
        return pd.read_excel(io.BytesIO(content))
    return pd.read_csv(io.BytesIO(content), encoding="latin-1", on_bad_lines="skip", low_memory=False)


def fetch(ckan_id: str, prefer: str = "geojson") -> tuple[gpd.GeoDataFrame | pd.DataFrame, str]:
    """
    Download the first resource matching `prefer` from a CKAN package.
    prefer="geojson" -> tries geojson, then shp/zip (NOT bare "json" - that matches
                        the CKAN datastore API endpoint, not a real GeoJSON file).
    prefer="csv"     -> tries csv, then xlsx.
    Any other value  -> tries exactly that format.
    Returns (GeoDataFrame|DataFrame, last_modified_string).

    Raises ValueError when the package has no resource in an accepted format,
    and requests.HTTPError when the download itself fails.
    """
    pkg = _package(ckan_id)
    wanted = prefer.lower()

    geo_formats = ["geojson", "shp", "shapefile", "zip"]
    csv_formats = ["csv", "xlsx"]

    if wanted == "geojson":
        candidates, is_geo = geo_formats, True
    elif wanted == "csv":
        candidates, is_geo = csv_formats, False
    else:
        candidates, is_geo = [wanted], wanted in geo_formats

    # Outer loop over formats so that format priority beats resource order.
    for candidate in candidates:
        for res in pkg["resources"]:
            if _resource_format(res) != candidate or not res.get("url"):
                continue
            resp = requests.get(res["url"], timeout=300, verify=_SSL)
            resp.raise_for_status()
            last_mod = _last_modified(pkg)
            if is_geo:
                return _read_geo_bytes(resp.content), last_mod
            return _read_table_bytes(resp.content, candidate), last_mod

    raise ValueError(
        f"No {prefer!r} resource found for {ckan_id!r}. "
        f"Available: {[(r.get('format'), r.get('name')) for r in pkg['resources']]}"
    )


def fetch_csv_as_geo(
    ckan_id: str,
    lat_col: str,
    lon_col: str,
    extra_cols: list[str] | None = None,
) -> tuple[gpd.GeoDataFrame, str]:
    """
    Download a CSV with lat/lon columns and return it as a GeoDataFrame.
    Column name matching is case-insensitive.
    Used for point datasets: street trees, traffic volumes, business licences.

    Rows whose latitude or longitude is missing or non-numeric are dropped.
    The result holds the lat and lon columns followed by the requested
    `extra_cols` that exist in the data, in the order they were requested.

    Raises ValueError when either coordinate column cannot be found.
    """
    df, last_mod = fetch(ckan_id, prefer="csv")

    # Case-insensitive column lookup
    col_map = {c.lower(): c for c in df.columns}
    lat = col_map.get(lat_col.lower())
    lon = col_map.get(lon_col.lower())
    if lat is None or lon is None:
        raise ValueError(
            f"Lat/lon columns {lat_col!r}/{lon_col!r} not found in {ckan_id!r}. "
            f"Available: {list(df.columns[:20])}"
        )

    df[lat] = pd.to_numeric(df[lat], errors="coerce")
    df[lon] = pd.to_numeric(df[lon], errors="coerce")
    df = df.dropna(subset=[lat, lon])

    # Walk extra_cols in order (not via a set) so the output column order is
    # stable from run to run; skip unknown names and duplicates.
    keep = [lat, lon]
    for requested in extra_cols or []:
        actual = col_map.get(requested.lower())
        if actual is not None and actual not in keep:
            keep.append(actual)

    gdf = gpd.GeoDataFrame(
        df[keep],
        geometry=gpd.points_from_xy(df[lon], df[lat]),
        crs="EPSG:4326",
    )
    return gdf, last_mod


# Keep old name as alias so data_pipeline.py doesn't need a rename
fetch_csv_with_latlon = fetch_csv_as_geo


def fetch_gtfs_stops(ckan_id: str = "ttc-routes-and-schedules") -> tuple[gpd.GeoDataFrame, str]:
    """Download GTFS zip, extract stops.txt, return GeoDataFrame of stop points.

    Returns (GeoDataFrame with stop_id/stop_name and point geometry,
    last_modified_string).

    Raises ValueError when the package has no zip resource or the archive has
    no stops.txt, and requests.HTTPError when the download fails.
    """
    pkg = _package(ckan_id)
    last_mod = _last_modified(pkg)

    for res in pkg["resources"]:
        if _resource_format(res) != "zip" or not res.get("url"):
            continue
        resp = requests.get(res["url"], timeout=120, verify=_SSL)
        # Fail on HTTP errors here; otherwise an error page would only show up
        # later as a confusing BadZipFile.
        resp.raise_for_status()
        with zipfile.ZipFile(io.BytesIO(resp.content)) as z:
            stops_name = next((n for n in z.namelist() if n.lower().endswith("stops.txt")), None)
            if stops_name is None:
                raise ValueError("stops.txt not found in GTFS zip")
            with z.open(stops_name) as fh:
                stops = pd.read_csv(fh)

        gdf = gpd.GeoDataFrame(
            stops[["stop_id", "stop_name"]],
            geometry=gpd.points_from_xy(stops["stop_lon"], stops["stop_lat"]),
            crs="EPSG:4326",
        )
        return gdf, last_mod

    raise ValueError(f"No zip resource found for {ckan_id!r}")


def download_raster(ckan_id: str, out_path: Path) -> str:
    """Download a GeoTIFF raster from a CKAN package to out_path.

    The data is streamed to ``<out_path>.part`` and renamed onto out_path
    only once the transfer completed, so a failed or interrupted download
    never leaves a truncated file at out_path.

    Returns the package's last_modified string.
    Raises ValueError when the package has no TIFF resource, and
    requests.HTTPError when the download fails.
    """
    pkg = _package(ckan_id)
    last_mod = _last_modified(pkg)

    for res in pkg["resources"]:
        if _resource_format(res) not in ("tiff", "geotiff", "tif") or not res.get("url"):
            continue
        part_path = out_path.with_name(out_path.name + ".part")
        try:
            # Context manager releases the streamed connection in every case.
            with requests.get(res["url"], stream=True, timeout=300, verify=_SSL) as resp:
                resp.raise_for_status()
                out_path.parent.mkdir(parents=True, exist_ok=True)
                with open(part_path, "wb") as f:
                    for chunk in resp.iter_content(chunk_size=65536):
                        f.write(chunk)
            part_path.replace(out_path)
        finally:
            # No-op after a successful rename; removes the partial otherwise.
            part_path.unlink(missing_ok=True)
        return last_mod

    raise ValueError(f"No TIFF resource found for {ckan_id!r}")