diff --git a/.gitignore b/.gitignore index cb4ccda..4315c38 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,9 @@ data node_modules package-lock.json +.npm-cache +.geonames-build +.DS_Store +tmp +*.sqlite +*.db diff --git a/README.md b/README.md index 554bbcc..ae657c1 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,8 @@ # Offline Geocoder -Node library for reverse geocoding. Designed to be used offline (for example -embedded in a desktop or mobile application) - no web requests are made to -perform a lookup. +Node and React Native library for offline geocoding. Designed to be used +offline (for example embedded in a desktop or mobile application) — no web +requests are made to perform a lookup. ## Data @@ -32,20 +32,44 @@ lookups per second with a single process. npm install --save offline-geocoder ``` +For Node you also need `sqlite3`: + +``` +npm install --save sqlite3 +``` + +For Expo / React Native, install `expo-sqlite` instead: + +``` +npx expo install expo-sqlite +``` + You also need to obtain a database which isn't included in the package, to -generate your own take a look in `scripts`. +generate your own take a look at the [Generating the database](#generating-the-database) +section below. 
## Usage When you initialize the library you need to pass the location of the database: ```javascript -const geocoder = require('offline-geocoder')({ database: 'data/geodata.db' }) +const geocoder = require('offline-geocoder')({ database: 'data/geocoder.sqlite' }) +``` + +To enable boundary-aware reverse geocoding, pass `reverseMode: 'boundary'` +(default is `centroid` for backward compatibility): + +```javascript +const geocoder = require('offline-geocoder')({ + database: 'data/geocoder.sqlite', + reverseMode: 'boundary', + boundary: { basePrecision: 4, maxPrecision: 7 } +}) ``` ### Reverse Geocoding -To perform a revese geocode lookup just pass the coordinates: +To perform a reverse geocode lookup just pass the coordinates: ```javascript geocoder.reverse(41.89, 12.49) @@ -76,6 +100,222 @@ geocoder.reverse(41.89, 12.49, function(error, result) { }) ``` +Boundary mode keeps the same return payload shape and supports two boundary +storage modes: +- compact lookup (`compact_places` + `compact_geohash_lookup`) +- full polygon mode (`places` + `place_geohash_cover` + `place_geometry`) + +### Forward Geocoding + +Forward geocoding matches a city name to its canonical entry. Requires a +database generated with the updated schema (see below). + +```javascript +geocoder.forward('rome') + .then(function(result) { + console.log(result) + }) +``` + +Returns `undefined` when no match is found, or when using an older database +without the required columns. + +### Location Lookup + +Look up a city by its GeoNames id: + +```javascript +geocoder.location().find(3169070) +geocoder.location.find('geonames:3169070') +``` + +Returns `undefined` when the id doesn't exist. Both numeric ids and +`geonames:` strings are accepted — use the prefixed form as a stable +grouping key across datasets. 
+ +## Expo / React Native + +The React Native entrypoint avoids Node-only modules: + +```javascript +const createGeocoder = require('offline-geocoder/expo') + +const db = await SQLite.openDatabaseAsync('geocoder.sqlite') +const geocoder = createGeocoder({ db: db }) + +geocoder.reverse(41.89, 12.49) + .then(function(result) { + console.log(result) + }) +``` + +You'll need to bundle the SQLite database file with your app assets and copy +it to a location accessible by `expo-sqlite` on first launch. + +## Generating the database + +The repo includes a script to generate a SQLite database from GeoNames dumps: + +```bash +./scripts/generate_geonames.sh data/geocoder.sqlite +``` + +Environment variables for customization: + +| Variable | Default | Description | +|---|---|---| +| `GEONAMES_DATASET` | `cities1000` | GeoNames dump file to use | +| `GEONAMES_WORKDIR` | current directory | Working directory for temp files | +| `GEONAMES_DOWNLOAD` | `1` | Set to `0` to skip downloads | +| `GEONAMES_FEATURE_CODES` | `PPLA,PPLA2,PPLA3,PPLA4,PPLA5,PPLC` | Feature codes to keep | +| `GEONAMES_MIN_POPULATION` | `0` | Minimum population filter | +| `GEONAMES_INCLUDE_ADMIN1` | `1` | Set to `0` to skip admin1 data | + +The default feature codes exclude `PPL` which can include neighbourhood-like +populated places. The schema is defined in [`scripts/schema.sql`](scripts/schema.sql). + +### Generating a Boundary Index + +Build boundary-aware reverse lookup tables from a polygon source (GeoJSON +FeatureCollection/Feature or newline-delimited GeoJSON): + +```bash +node scripts/generate_boundary_index.js \ + --database data/geocoder.sqlite \ + --input data/localities.geojson \ + --index-mode compact \ + --include-region true \ + --min-population 10000 \ + --base-precision 4 \ + --max-precision 7 +``` + +You can also run `npm run build:boundary -- --database ... --input ...`. 
+ +You can point the builder directly at directories of WOF GeoJSON files: + +```bash +node scripts/generate_boundary_index.js \ + --database data/geocoder.sqlite \ + --input-dir tmp/wof-build/extracted/fr/.../data \ + --index-mode compact \ + --include-region true \ + --min-population 10000 \ + --base-precision 4 \ + --max-precision 7 \ + --drop-contained-localities true +``` + +`--drop-contained-localities true` removes `locality` polygons that are fully +contained in larger localities within the same country/admin1 group. This is +intended to suppress duplicate neighbourhood-like localities while keeping +small isolated places (for example islands) that are not contained. + +#### Place selection pipeline + +The builder uses a multi-stage pipeline to decide which localities make it into the index: + +1. **Primary filter** (`--min-population`): localities at or above this threshold are always included. Country capitals are always included regardless of population. +2. **Isolation pass** (`--isolation-min-population`): localities between the isolation floor and the primary threshold are evaluated as candidates. A candidate is promoted if at least one of its geohash cover cells (at base precision) is not already claimed by a primary locality. This ensures small but geographically isolated places like islands, remote towns, and oases get their own label without adding noise in dense urban areas. +3. **Country guarantee** (`--ensure-country-locality`): after the isolation pass, any country that still has zero localities gets its highest-population candidate promoted unconditionally. +4. **Contained-locality pruning** (`--drop-contained-localities`): removes localities whose polygon is fully contained inside a larger locality in the same country/admin1 group. +5. 
**Dominant-city rollup**: in the geohash index, when a major city (population >= `--dominant-locality-population`) dominates its neighbours by a ratio of `--dominant-locality-ratio`, smaller nearby localities are absorbed into the major city label. +6. **Locality-over-region promotion**: when a locality and a region compete for the same parent geohash cell, the locality wins if it covers >= `--parent-locality-min-share` of child cells. + +Builder notes: + +- Keeps current records only (drops deprecated/superseded where source metadata is present) +- Includes `locality` placetypes by default (`localadmin` optional via `--include-localadmin true`) +- Optional `region` fallback polygons via `--include-region true` +- `--min-population` applies to `locality` only, so low-pop localities can roll up to broader admin areas when `region` is included +- Point-only capital localities are retained (single-cell locality fallback) so country/admin capitals are not dropped by polygon-only filtering +- Per-placetype precision caps are supported: + - `--locality-max-precision` + - `--localadmin-max-precision` + - `--region-max-precision` + - `--region-sparse-max-precision` + `--region-sparse-min-area-km2` for very large sparse regions (for example geohash-3 in Amazon-like interiors) +- `--promote-locality-over-region` (default `true`) prefers locality labels in shared parent cells when there is no competing locality (keeps city labels sticky against region-only outskirts) +- Dominant-city rollup keeps broad city labels sticky in mixed city/suburb cells unless there is competing major-city pressure: + - `--dominant-locality-population` (default `100000`) + - `--dominant-locality-ratio` (default `3`) +- Parent-cell takeover guard: + - `--parent-locality-min-share` (default `0.5`) requires locality ownership of at least that child-cell share before replacing a parent cell label +- Excludes neighbourhood-like placetypes from default reverse output +- `--index-mode compact` (default) 
stores only geohash-to-place mappings (`compact_geohash_lookup`) and no runtime geometry payloads. + Compact schema uses `compact_places(id,name,country_id,admin1_id,placetype_code,latitude,longitude)`. +- `--index-mode full` stores geohash cover + geometry for runtime point-in-polygon + +### Building From Who's On First (WOF) + +Use the WOF helper script to download country admin repos and build in one step: + +```bash +WOF_COUNTRIES=FR,IT \ +WOF_BASE_PRECISION=4 \ +WOF_MAX_PRECISION=5 \ +WOF_INCLUDE_REGION=1 \ +WOF_MIN_POPULATION=10000 \ +./scripts/generate_wof_boundary.sh data/geocoder.sqlite +``` + +Equivalent npm script: + +```bash +npm run build:wof -- data/geocoder.sqlite +``` + +Useful WOF build env vars: + +- `WOF_COUNTRIES` comma-separated country codes (default `FR,IT`) +- `WOF_WORKDIR` working directory for downloads/extracted files (default `tmp/wof-build`) +- `WOF_DOWNLOAD=0` reuse existing archives only +- `WOF_REF` branch/ref to download (default `master`) +- `WOF_REF_LOCK_FILE` optional per-country pinned refs (`COUNTRY REF` per line); when set, this overrides `WOF_REF` per country +- `WOF_LOCALITY_MAX_PRECISION` locality precision cap +- `WOF_REGION_MAX_PRECISION` region precision cap (default `4`) +- `WOF_REGION_SPARSE_MAX_PRECISION` sparse very-large-region precision (default `3`) +- `WOF_REGION_SPARSE_MIN_AREA_KM2` area threshold for sparse region precision (default `80000`) +- `WOF_PROMOTE_LOCALITY_OVER_REGION=1|0` prefer locality labels over region in shared parent cells (default `1`) +- `WOF_DOMINANT_LOCALITY_POPULATION` major-locality threshold for dominant-city rollup (default `100000`) +- `WOF_DOMINANT_LOCALITY_RATIO` dominant-vs-next locality population ratio (default `3`) +- `WOF_PARENT_LOCALITY_MIN_SHARE` minimum child-cell share for locality parent takeover (default `0.5`) +- `WOF_GEOMETRY_DECIMALS` round coordinates before storage/indexing (for example `4`) +- `WOF_MIN_POPULATION` filter out places below threshold (for example `10000`) +- 
`WOF_ISOLATION_MIN_POPULATION` lower population floor for isolated localities (default `500`). Places between this and `WOF_MIN_POPULATION` are included only if they occupy otherwise-empty geohash cells +- `WOF_ENSURE_COUNTRY_LOCALITY=1|0` guarantee at least one locality per country (default `1`) +- `WOF_INCLUDE_REGION=1|0` include/exclude region fallback boundaries +- `WOF_MAX_PLACES` cap places for experiment runs +- `WOF_DROP_CONTAINED_LOCALITIES=1|0` enable/disable contained-locality pruning +- `WOF_SKIP_INVALID_REPOS=1|0` skip malformed/unexpected WOF admin repos during bulk runs (default `1`) +- `WOF_APPEND=1|0` append to an existing compact DB instead of replacing schema (default `0`) + +Boundary runtime modes: + +- `reverseMode: 'centroid'` (default): legacy nearest-centroid reverse lookup +- `reverseMode: 'boundary'`: boundary tables lookup. + - Uses compact `compact_geohash_lookup` when present (fast geohash-to-place). + - Falls back to full polygon-aware tables when compact rows are absent. + +### External Reverse Validation (LocationIQ) + +Use this script to compare local reverse results against LocationIQ at sampled +coordinates, with persistent SQLite caching so requests are not repeated: + +```bash +LOCATIONIQ_API_KEY=... node scripts/validate_with_locationiq.js \ + --database tmp/wof-fr-it-compact-p5-d3-pop10k-region.sqlite \ + --samples 300 \ + --export-csv tmp/locationiq-validation-fr-it.csv +``` + +It creates/updates: + +- `sample_points` (coordinates sampled from your geohash table) +- `locationiq_cache` (raw LocationIQ responses keyed by coordinate) +- `validation_results` (local vs LocationIQ comparison verdicts) + +Cache DB path is automatic (default behavior): `tmp/locationiq-validation-.sqlite`. + ## License This library is licensed under [the MIT license](https://github.com/lucaspiller/offline-geocoder/blob/master/LICENSE). 
diff --git a/bin/geocoder b/bin/geocoder index 272750b..2813e66 100755 --- a/bin/geocoder +++ b/bin/geocoder @@ -3,7 +3,26 @@ "use strict"; -const geocoder = require('../src/index.js')() +function parseOptionalNumber(value) { + if (value === undefined) return undefined + var parsed = Number(value) + return Number.isFinite(parsed) ? parsed : undefined +} + +var options = {} +if (process.env.GEOCODER_REVERSE_MODE) { + options.reverseMode = process.env.GEOCODER_REVERSE_MODE +} + +var boundaryBase = parseOptionalNumber(process.env.GEOCODER_BOUNDARY_BASE_PRECISION) +var boundaryMax = parseOptionalNumber(process.env.GEOCODER_BOUNDARY_MAX_PRECISION) +if (boundaryBase !== undefined || boundaryMax !== undefined) { + options.boundary = {} + if (boundaryBase !== undefined) options.boundary.basePrecision = boundaryBase + if (boundaryMax !== undefined) options.boundary.maxPrecision = boundaryMax +} + +const geocoder = require('../src/index.js')(options) const args = process.argv.slice(2) if (args.length != 2) { diff --git a/bin/geocoder-bench b/bin/geocoder-bench index 939dcaa..0366ea3 100755 --- a/bin/geocoder-bench +++ b/bin/geocoder-bench @@ -3,7 +3,26 @@ "use strict"; -const geocoder = require('../src/index.js')() +function parseOptionalNumber(value) { + if (value === undefined) return undefined + var parsed = Number(value) + return Number.isFinite(parsed) ? 
parsed : undefined +} + +var options = {} +if (process.env.GEOCODER_REVERSE_MODE) { + options.reverseMode = process.env.GEOCODER_REVERSE_MODE +} + +var boundaryBase = parseOptionalNumber(process.env.GEOCODER_BOUNDARY_BASE_PRECISION) +var boundaryMax = parseOptionalNumber(process.env.GEOCODER_BOUNDARY_MAX_PRECISION) +if (boundaryBase !== undefined || boundaryMax !== undefined) { + options.boundary = {} + if (boundaryBase !== undefined) options.boundary.basePrecision = boundaryBase + if (boundaryMax !== undefined) options.boundary.maxPrecision = boundaryMax +} + +const geocoder = require('../src/index.js')(options) const args = process.argv.slice(2) if (args.length != 2) { diff --git a/bin/geocoder-build-boundary b/bin/geocoder-build-boundary new file mode 100755 index 0000000..86dc909 --- /dev/null +++ b/bin/geocoder-build-boundary @@ -0,0 +1,4 @@ +#!/usr/bin/env node +"use strict"; + +require('../scripts/generate_boundary_index') diff --git a/bin/geocoder-build-wof b/bin/geocoder-build-wof new file mode 100755 index 0000000..c5b947b --- /dev/null +++ b/bin/geocoder-build-wof @@ -0,0 +1,14 @@ +#!/usr/bin/env node +"use strict"; + +const { spawnSync } = require('child_process') +const path = require('path') + +const script = path.join(__dirname, '..', 'scripts', 'generate_wof_boundary.sh') +const args = process.argv.slice(2) + +const result = spawnSync(script, args, { stdio: 'inherit' }) +if (result.error) { + throw result.error +} +process.exit(result.status === null ? 
1 : result.status) diff --git a/package.json b/package.json index 767a09c..26910f0 100644 --- a/package.json +++ b/package.json @@ -1,17 +1,42 @@ { "name": "offline-geocoder", "version": "1.0.0", - "description": "Node library for offline geocoding", + "description": "Offline reverse and forward geocoding for Node and React Native", "repository": "https://github.com/lucaspiller/offline-geocoder", "main": "src/index.js", - "dependencies": { - "sqlite3": "^4.0.0" + "bin": { + "geocoder": "bin/geocoder", + "geocoder-bench": "bin/geocoder-bench", + "geocoder-build-boundary": "bin/geocoder-build-boundary", + "geocoder-build-wof": "bin/geocoder-build-wof" }, + "react-native": "src/expo.js", + "exports": { + ".": { + "react-native": "./src/expo.js", + "require": "./src/index.js", + "default": "./src/index.js" + }, + "./expo": "./src/expo.js" + }, + "peerDependencies": { + "sqlite3": "^5.1.7" + }, + "peerDependenciesMeta": { + "sqlite3": { + "optional": true + } + }, + "dependencies": {}, "devDependencies": { - "jasmine": "^3.1.0" + "jasmine": "^5.12.0", + "sqlite3": "^5.1.7" }, "scripts": { - "test": "jasmine" + "test": "jasmine", + "build:boundary": "node scripts/generate_boundary_index.js", + "build:wof": "bash scripts/generate_wof_boundary.sh", + "validate:locationiq": "node scripts/validate_with_locationiq.js" }, "author": "Luca Spiller", "license": "MIT" diff --git a/scripts/analyze_compact_index.py b/scripts/analyze_compact_index.py new file mode 100644 index 0000000..9e5092a --- /dev/null +++ b/scripts/analyze_compact_index.py @@ -0,0 +1,372 @@ +#!/usr/bin/env python3 +"""Analyze compact geohash lookup DB size drivers. 
+ +Usage: + python scripts/analyze_compact_index.py \ + --db tmp/wof-fr-it-compact-p5-d3-pop10k-region.sqlite \ + --top 20 \ + --export-place-id 85683531 \ + --export-geojson tmp/region_cells.geojson +""" + +from __future__ import annotations + +import argparse +import json +import math +import sqlite3 +from pathlib import Path +from typing import Dict, Iterable, List, Sequence, Tuple + +BASE32 = "0123456789bcdefghjkmnpqrstuvwxyz" +BASE32_MAP = {ch: i for i, ch in enumerate(BASE32)} +PLACETYPE_BY_CODE = {0: "locality", 1: "localadmin", 2: "region"} + + +def decode_geohash_bbox(geohash: str) -> Tuple[float, float, float, float]: + lat_min, lat_max = -90.0, 90.0 + lon_min, lon_max = -180.0, 180.0 + even = True + + for ch in geohash.lower(): + value = BASE32_MAP[ch] + for mask in (16, 8, 4, 2, 1): + if even: + lon_mid = (lon_min + lon_max) / 2.0 + if value & mask: + lon_min = lon_mid + else: + lon_max = lon_mid + else: + lat_mid = (lat_min + lat_max) / 2.0 + if value & mask: + lat_min = lat_mid + else: + lat_max = lat_mid + even = not even + + return (lat_min, lon_min, lat_max, lon_max) + + +def cell_area_km2(geohash: str) -> float: + min_lat, min_lon, max_lat, max_lon = decode_geohash_bbox(geohash) + center_lat = (min_lat + max_lat) / 2.0 + d_lat = abs(max_lat - min_lat) + d_lon = abs(max_lon - min_lon) + + lat_km = d_lat * 111.32 + lon_km = d_lon * 111.32 * math.cos(math.radians(center_lat)) + return max(0.0, lat_km * lon_km) + + +def query_all(conn: sqlite3.Connection, sql: str, params: Sequence[object] = ()) -> List[sqlite3.Row]: + cur = conn.execute(sql, params) + rows = cur.fetchall() + cur.close() + return rows + + +def compact_has_placetype_code(conn: sqlite3.Connection) -> bool: + cols = query_all(conn, "PRAGMA table_info(compact_places)") + names = {row["name"] for row in cols} + return "placetype_code" in names + + +def print_summary(conn: sqlite3.Connection, top_n: int) -> None: + has_code = compact_has_placetype_code(conn) + placetype_expr = ( + "CASE 
p.placetype_code WHEN 0 THEN 'locality' WHEN 1 THEN 'localadmin' WHEN 2 THEN 'region' ELSE 'unknown' END" + if has_code + else "p.placetype" + ) + + total_rows = query_all(conn, "SELECT COUNT(*) AS c FROM compact_geohash_lookup")[0]["c"] + total_places = query_all(conn, "SELECT COUNT(*) AS c FROM compact_places")[0]["c"] + lengths = query_all( + conn, + "SELECT LENGTH(geohash) AS precision, COUNT(*) AS c " + "FROM compact_geohash_lookup GROUP BY precision ORDER BY precision", + ) + + print("=== Compact Index Summary ===") + print(f"Places: {total_places}") + print(f"Lookup rows: {total_rows}") + print("Geohash precision distribution:") + for row in lengths: + print(f" p{row['precision']}: {row['c']}") + + by_type = query_all( + conn, + """ + SELECT + {placetype_expr} AS placetype, + COUNT(DISTINCT p.id) AS place_count, + COUNT(*) AS lookup_rows + FROM compact_geohash_lookup l + JOIN compact_places p ON p.id = l.place_id + GROUP BY placetype + ORDER BY lookup_rows DESC + """.format(placetype_expr=placetype_expr), + ) + print("Rows by placetype:") + for row in by_type: + pct = (row["lookup_rows"] / total_rows * 100.0) if total_rows else 0.0 + print( + f" {row['placetype']}: places={row['place_count']}, " + f"rows={row['lookup_rows']} ({pct:.1f}%)" + ) + + top_places = query_all( + conn, + """ + SELECT + p.id, + p.name, + p.country_id, + {placetype_expr} AS placetype, + COUNT(*) AS lookup_rows + FROM compact_geohash_lookup l + JOIN compact_places p ON p.id = l.place_id + GROUP BY p.id + ORDER BY lookup_rows DESC, p.id ASC + LIMIT ? 
+ """.format(placetype_expr=placetype_expr), + (top_n,), + ) + + print(f"Top {top_n} places by lookup rows:") + for row in top_places: + print( + f" {row['id']} | {row['placetype']} | {row['country_id']} | " + f"{row['name']} | rows={row['lookup_rows']}" + ) + + region_area = query_all( + conn, + """ + SELECT l.geohash + FROM compact_geohash_lookup l + JOIN compact_places p ON p.id = l.place_id + WHERE {placetype_expr} = 'region' + """.format(placetype_expr=placetype_expr), + (), + ) + total_region_area = sum(cell_area_km2(row["geohash"]) for row in region_area) + print(f"Approx area represented by region rows (km^2): {total_region_area:,.0f}") + + +def export_place_geojson( + conn: sqlite3.Connection, + place_id: int, + output_path: Path, + limit: int | None = None, +) -> None: + has_code = compact_has_placetype_code(conn) + if has_code: + place_rows = query_all( + conn, + """ + SELECT + id, + name, + country_id, + admin1_id, + CASE placetype_code + WHEN 0 THEN 'locality' + WHEN 1 THEN 'localadmin' + WHEN 2 THEN 'region' + ELSE 'unknown' + END AS placetype + FROM compact_places + WHERE id = ? + """, + (place_id,), + ) + else: + place_rows = query_all( + conn, + "SELECT id, name, country_id, admin1_id, placetype FROM compact_places WHERE id = ?", + (place_id,), + ) + if not place_rows: + raise SystemExit(f"place_id={place_id} not found in compact_places") + place = place_rows[0] + + sql = "SELECT geohash FROM compact_geohash_lookup WHERE place_id = ? ORDER BY geohash" + params: List[object] = [place_id] + if limit is not None and limit > 0: + sql += " LIMIT ?" 
+ params.append(limit) + + geohash_rows = query_all(conn, sql, tuple(params)) + + features: List[Dict[str, object]] = [] + for row in geohash_rows: + geoh = row["geohash"] + min_lat, min_lon, max_lat, max_lon = decode_geohash_bbox(geoh) + polygon = [ + [min_lon, min_lat], + [max_lon, min_lat], + [max_lon, max_lat], + [min_lon, max_lat], + [min_lon, min_lat], + ] + features.append( + { + "type": "Feature", + "properties": { + "place_id": place["id"], + "name": place["name"], + "placetype": place["placetype"], + "country_id": place["country_id"], + "admin1_id": place["admin1_id"], + "geohash": geoh, + "precision": len(geoh), + }, + "geometry": {"type": "Polygon", "coordinates": [polygon]}, + } + ) + + payload = {"type": "FeatureCollection", "features": features} + output_path.parent.mkdir(parents=True, exist_ok=True) + output_path.write_text(json.dumps(payload), encoding="utf-8") + + print( + f"Wrote {len(features)} cell polygons for place_id={place_id} " + f"({place['name']}) to {output_path}" + ) + + +def export_all_geojson(conn: sqlite3.Connection, output_path: Path, limit: int | None = None) -> None: + has_code = compact_has_placetype_code(conn) + if has_code: + sql = """ + SELECT + l.geohash AS geohash, + p.id AS place_id, + p.name AS name, + p.country_id AS country_id, + p.admin1_id AS admin1_id, + CASE p.placetype_code + WHEN 0 THEN 'locality' + WHEN 1 THEN 'localadmin' + WHEN 2 THEN 'region' + ELSE 'unknown' + END AS placetype + FROM compact_geohash_lookup l + JOIN compact_places p ON p.id = l.place_id + ORDER BY l.geohash + """ + else: + sql = """ + SELECT + l.geohash AS geohash, + p.id AS place_id, + p.name AS name, + p.country_id AS country_id, + p.admin1_id AS admin1_id, + p.placetype AS placetype + FROM compact_geohash_lookup l + JOIN compact_places p ON p.id = l.place_id + ORDER BY l.geohash + """ + + params: List[object] = [] + if limit is not None and limit > 0: + sql += " LIMIT ?" 
+ params.append(limit) + + rows = query_all(conn, sql, tuple(params)) + + features: List[Dict[str, object]] = [] + for row in rows: + geoh = row["geohash"] + min_lat, min_lon, max_lat, max_lon = decode_geohash_bbox(geoh) + polygon = [ + [min_lon, min_lat], + [max_lon, min_lat], + [max_lon, max_lat], + [min_lon, max_lat], + [min_lon, min_lat], + ] + features.append( + { + "type": "Feature", + "properties": { + "place_id": row["place_id"], + "name": row["name"], + "placetype": row["placetype"], + "country_id": row["country_id"], + "admin1_id": row["admin1_id"], + "geohash": geoh, + "precision": len(geoh), + }, + "geometry": {"type": "Polygon", "coordinates": [polygon]}, + } + ) + + payload = {"type": "FeatureCollection", "features": features} + output_path.parent.mkdir(parents=True, exist_ok=True) + output_path.write_text(json.dumps(payload), encoding="utf-8") + print(f"Wrote {len(features)} cell polygons to {output_path}") + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Analyze compact geohash lookup DB") + parser.add_argument("--db", required=True, help="Path to SQLite DB with compact_* tables") + parser.add_argument("--top", type=int, default=20, help="Show top N places by lookup rows") + parser.add_argument("--export-place-id", type=int, default=None, help="Place id to export as cell polygons") + parser.add_argument( + "--export-geojson", + default="tmp/compact_place_cells.geojson", + help="GeoJSON output path (used with --export-place-id)", + ) + parser.add_argument( + "--export-limit", + type=int, + default=None, + help="Optional max number of geohash cells to export", + ) + parser.add_argument( + "--export-all-geojson", + default=None, + help="Write all geohash cells with place metadata to this GeoJSON path", + ) + return parser.parse_args() + + +def main() -> None: + args = parse_args() + db_path = Path(args.db) + if not db_path.exists(): + raise SystemExit(f"DB not found: {db_path}") + + conn = 
sqlite3.connect(str(db_path)) + conn.row_factory = sqlite3.Row + try: + tables = {row["name"] for row in query_all(conn, "SELECT name FROM sqlite_master WHERE type='table'")} + if "compact_places" not in tables or "compact_geohash_lookup" not in tables: + raise SystemExit("DB does not contain compact_places + compact_geohash_lookup") + + print_summary(conn, args.top) + + if args.export_place_id is not None: + export_place_geojson( + conn, + place_id=args.export_place_id, + output_path=Path(args.export_geojson), + limit=args.export_limit, + ) + + if args.export_all_geojson: + export_all_geojson( + conn, + output_path=Path(args.export_all_geojson), + limit=args.export_limit, + ) + finally: + conn.close() + + +if __name__ == "__main__": + main() diff --git a/scripts/generate_boundary_index.js b/scripts/generate_boundary_index.js new file mode 100755 index 0000000..9942bcc --- /dev/null +++ b/scripts/generate_boundary_index.js @@ -0,0 +1,1795 @@ +#!/usr/bin/env node +"use strict"; + +const fs = require('fs') +const path = require('path') +const sqlite3 = require('sqlite3') +const geometry = require('../src/geometry') +const boundaryCover = require('../src/boundary_cover') +const geohash = require('../src/geohash') + +const PLACETYPE_CODES = { + locality: 0, + localadmin: 1, + region: 2, + county: 3 +} + +function parseBool(value, defaultValue) { + if (value === undefined || value === null || value === '') { + return defaultValue + } + + var normalized = String(value).toLowerCase().trim() + if (normalized === '1' || normalized === 'true' || normalized === 'yes' || normalized === 'y') { + return true + } + if (normalized === '0' || normalized === 'false' || normalized === 'no' || normalized === 'n') { + return false + } + + return defaultValue +} + +function parseArgs(argv) { + var opts = { + database: null, + input: [], + inputDir: [], + basePrecision: 4, + maxPrecision: 7, + includeLocaladmin: false, + includeCounty: false, + includeRegion: false, + replace: true, + 
includeAlt: false, + dropContainedLocalities: true, + maxPlaces: null, + geometryDecimals: null, + minPopulation: 0, + isolationMinPopulation: null, + ensureCountryLocality: true, + indexMode: 'compact', + localityMaxPrecision: null, + localadminMaxPrecision: null, + countyMaxPrecision: null, + regionMaxPrecision: null, + regionSparseMaxPrecision: null, + regionSparseMinAreaKm2: null, + promoteLocalityOverRegion: true, + dominantLocalityPopulation: 100000, + dominantLocalityRatio: 3, + parentLocalityMinShare: 0.5 + } + + for (var i = 0; i < argv.length; i++) { + var arg = argv[i] + + if (arg === '--database' || arg === '-d') { + opts.database = argv[++i] + } else if (arg === '--input' || arg === '-i') { + opts.input.push(argv[++i]) + } else if (arg === '--input-dir') { + opts.inputDir.push(argv[++i]) + } else if (arg === '--base-precision') { + opts.basePrecision = Number(argv[++i]) + } else if (arg === '--max-precision') { + opts.maxPrecision = Number(argv[++i]) + } else if (arg === '--include-localadmin') { + opts.includeLocaladmin = parseBool(argv[++i], false) + } else if (arg === '--include-county') { + opts.includeCounty = parseBool(argv[++i], false) + } else if (arg === '--include-region') { + opts.includeRegion = parseBool(argv[++i], false) + } else if (arg === '--include-alt') { + opts.includeAlt = parseBool(argv[++i], false) + } else if (arg === '--drop-contained-localities') { + opts.dropContainedLocalities = parseBool(argv[++i], true) + } else if (arg === '--max-places') { + var maxPlaces = Number(argv[++i]) + opts.maxPlaces = Number.isFinite(maxPlaces) && maxPlaces > 0 ? Math.trunc(maxPlaces) : null + } else if (arg === '--geometry-decimals') { + var decimals = Number(argv[++i]) + opts.geometryDecimals = Number.isFinite(decimals) && decimals >= 0 ? Math.trunc(decimals) : null + } else if (arg === '--min-population') { + var minPopulation = Number(argv[++i]) + opts.minPopulation = Number.isFinite(minPopulation) && minPopulation > 0 ? 
Math.trunc(minPopulation) : 0 + } else if (arg === '--isolation-min-population') { + var isolationMin = Number(argv[++i]) + opts.isolationMinPopulation = Number.isFinite(isolationMin) && isolationMin > 0 ? Math.trunc(isolationMin) : null + } else if (arg === '--ensure-country-locality') { + opts.ensureCountryLocality = parseBool(argv[++i], true) + } else if (arg === '--index-mode') { + opts.indexMode = String(argv[++i] || '').toLowerCase().trim() + } else if (arg === '--locality-max-precision') { + var localityMax = Number(argv[++i]) + opts.localityMaxPrecision = Number.isFinite(localityMax) ? Math.trunc(localityMax) : null + } else if (arg === '--localadmin-max-precision') { + var localadminMax = Number(argv[++i]) + opts.localadminMaxPrecision = Number.isFinite(localadminMax) ? Math.trunc(localadminMax) : null + } else if (arg === '--county-max-precision') { + var countyMax = Number(argv[++i]) + opts.countyMaxPrecision = Number.isFinite(countyMax) ? Math.trunc(countyMax) : null + } else if (arg === '--region-max-precision') { + var regionMax = Number(argv[++i]) + opts.regionMaxPrecision = Number.isFinite(regionMax) ? Math.trunc(regionMax) : null + } else if (arg === '--region-sparse-max-precision') { + var regionSparseMax = Number(argv[++i]) + opts.regionSparseMaxPrecision = Number.isFinite(regionSparseMax) ? Math.trunc(regionSparseMax) : null + } else if (arg === '--region-sparse-min-area-km2') { + var sparseAreaKm2 = Number(argv[++i]) + opts.regionSparseMinAreaKm2 = Number.isFinite(sparseAreaKm2) && sparseAreaKm2 > 0 ? sparseAreaKm2 : null + } else if (arg === '--promote-locality-over-region') { + opts.promoteLocalityOverRegion = parseBool(argv[++i], true) + } else if (arg === '--dominant-locality-population') { + var dominantPopulation = Number(argv[++i]) + opts.dominantLocalityPopulation = Number.isFinite(dominantPopulation) ? 
dominantPopulation : opts.dominantLocalityPopulation + } else if (arg === '--dominant-locality-ratio') { + var dominantRatio = Number(argv[++i]) + opts.dominantLocalityRatio = Number.isFinite(dominantRatio) ? dominantRatio : opts.dominantLocalityRatio + } else if (arg === '--parent-locality-min-share') { + var minShare = Number(argv[++i]) + opts.parentLocalityMinShare = Number.isFinite(minShare) ? minShare : opts.parentLocalityMinShare + } else if (arg === '--append') { + opts.replace = false + } else if (arg === '--replace') { + opts.replace = true + } else if (arg === '--help' || arg === '-h') { + opts.help = true + } else { + throw new Error('Unknown argument: ' + arg) + } + } + + return opts +} + +function usage() { + return [ + 'Usage: node scripts/generate_boundary_index.js --database [--input ] [--input-dir ]', + '', + 'Options:', + ' --database, -d SQLite output path (required)', + ' --input, -i GeoJSON FeatureCollection/Feature or NDJSON file (repeatable)', + ' --input-dir Directory to recursively scan for GeoJSON feature files (repeatable)', + ' --base-precision Geohash base precision (default: 4)', + ' --max-precision Geohash max precision for partial subdivision (default: 7)', + ' --include-localadmin Include localadmin placetypes (default: false)', + ' --include-county Include county placetypes (default: false)', + ' --include-region Include region placetypes (default: false)', + ' --include-alt Include WOF alt geometries (default: false)', + ' --drop-contained-localities Drop locality polygons fully contained by larger localities (default: true)', + ' --max-places Stop after this many normalized places (useful for experiments)', + ' --geometry-decimals Round geometry coordinates to N decimals before indexing/storage', + ' --min-population Drop localities below this threshold (default: 0, country capitals kept)', + ' --isolation-min-population Lower population floor for localities in otherwise-empty geohash cells (default: off)', + ' 
/**
 * Walks a directory tree and appends every GeoJSON-like file to `files`.
 *
 * A file qualifies when its lower-cased name ends with `.geojson`, `.json`
 * or `.ndjson`. Names containing `-alt-` are skipped unless `includeAlt`
 * is truthy.
 *
 * @param {string} dirPath Directory to scan.
 * @param {boolean} includeAlt Whether `-alt-` files are kept.
 * @param {string[]} files Accumulator, mutated in place.
 * @returns {void}
 */
function collectGeojsonFiles(dirPath, includeAlt, files) {
  const featureExtensions = ['.geojson', '.json', '.ndjson']

  for (const entry of fs.readdirSync(dirPath, { withFileTypes: true })) {
    const entryPath = path.join(dirPath, entry.name)

    if (entry.isDirectory()) {
      collectGeojsonFiles(entryPath, includeAlt, files)
      continue
    }

    // Entries that are neither directories nor regular files are ignored.
    if (!entry.isFile()) continue

    const lowerName = entry.name.toLowerCase()
    const hasFeatureExtension = featureExtensions.some((ext) => lowerName.endsWith(ext))
    const isAltFile = lowerName.includes('-alt-')

    if (hasFeatureExtension && (includeAlt || !isAltFile)) {
      files.push(entryPath)
    }
  }
}
/**
 * Resolves the full, de-duplicated, sorted list of input files.
 *
 * Explicit `opts.input` paths are made absolute; every `opts.inputDir`
 * is scanned recursively with `collectGeojsonFiles`.
 *
 * @param {{input: string[], inputDir: string[], includeAlt: boolean}} opts
 * @returns {string[]} Absolute file paths, unique and sorted.
 * @throws {Error} When an input directory is missing or not a directory.
 */
function collectInputFiles(opts) {
  const collected = opts.input.map(function(inputPath) {
    return path.resolve(inputPath)
  })

  for (const dir of opts.inputDir) {
    const inputDir = path.resolve(dir)
    if (!fs.existsSync(inputDir) || !fs.statSync(inputDir).isDirectory()) {
      throw new Error('Input directory does not exist: ' + inputDir)
    }

    collectGeojsonFiles(inputDir, opts.includeAlt, collected)
  }

  return Array.from(new Set(collected)).sort()
}

/**
 * Reads one input file and returns its features.
 *
 * Accepted layouts: a FeatureCollection, a single Feature, a JSON array,
 * or NDJSON (one JSON value per line; blank lines and lines starting with
 * `#` are ignored).
 *
 * @param {string} filePath File to read (UTF-8).
 * @returns {Array} Parsed features; empty for an empty file.
 * @throws {SyntaxError} When the content is not valid JSON / NDJSON.
 * @throws {Error} When a JSON document has an unsupported root.
 */
function readFeatures(filePath) {
  const trimmed = fs.readFileSync(filePath, 'utf8').trim()
  if (!trimmed) {
    return []
  }

  const firstChar = trimmed.charAt(0)
  let documentError = null

  if (firstChar === '{' || firstChar === '[') {
    let parsed
    try {
      parsed = JSON.parse(trimmed)
    } catch (err) {
      // NDJSON records start with '{' too, so a multi-line file that fails
      // as one document may still be valid NDJSON. A single-line file has
      // no such second reading and is simply malformed.
      if (!/\r?\n/.test(trimmed)) {
        throw err
      }
      documentError = err
    }

    if (documentError === null) {
      if (Array.isArray(parsed)) {
        return parsed
      }
      if (parsed.type === 'FeatureCollection' && Array.isArray(parsed.features)) {
        return parsed.features
      }
      if (parsed.type === 'Feature') {
        return [parsed]
      }
      throw new Error('Unsupported JSON root in ' + filePath + '. Expected FeatureCollection, Feature, or array.')
    }
  }

  const records = trimmed
    .split(/\r?\n/)
    .map(function(line) { return line.trim() })
    .filter(function(line) { return line && line.charAt(0) !== '#' })

  return records.map(function(line, index) {
    try {
      return JSON.parse(line)
    } catch (err) {
      // If even the first record is not standalone JSON, the file was a
      // malformed (pretty-printed) document rather than NDJSON: surface
      // the whole-document error, which points at the real problem.
      if (documentError !== null && index === 0) {
        throw documentError
      }
      throw err
    }
  })
}
/**
 * Depth-first search for the first non-blank string inside a value.
 *
 * Strings are trimmed; arrays and plain objects are searched in key order.
 *
 * @param {*} value String, array, object or anything else.
 * @returns {string|null} Trimmed string, or null when none is found.
 */
function pickFirstString(value) {
  if (typeof value === 'string') {
    const text = value.trim()
    return text ? text : null
  }

  if (value && typeof value === 'object') {
    // Object.keys lists array indices in ascending order, so arrays and
    // objects share this single traversal.
    for (const key of Object.keys(value)) {
      const found = pickFirstString(value[key])
      if (found) return found
    }
  }

  return null
}

/**
 * Parses an optional integer, truncating any fractional part.
 *
 * @param {*} value Raw value.
 * @returns {number|null} Integer, or null when missing / not finite.
 */
function parseOptionalInt(value) {
  const parsed = parseOptionalFloat(value)
  return parsed === null ? null : Math.trunc(parsed)
}

/**
 * Parses an optional finite number.
 *
 * null, undefined and blank strings count as missing. `Number('  ')` is 0,
 * so whitespace-only strings are rejected explicitly instead of being read
 * as a real zero.
 *
 * @param {*} value Raw value.
 * @returns {number|null} Finite number, or null.
 */
function parseOptionalFloat(value) {
  if (value === null || value === undefined) return null
  if (typeof value === 'string' && value.trim() === '') return null

  const parsed = Number(value)
  return Number.isFinite(parsed) ? parsed : null
}

/**
 * Truncates a precision to an integer, falling back when it is not finite
 * or lies below the base precision.
 *
 * @param {*} value Requested precision.
 * @param {number} basePrecision Lowest acceptable precision.
 * @param {*} fallback Returned when `value` is unusable.
 * @returns {*} Truncated precision or `fallback`.
 */
function clampPrecision(value, basePrecision, fallback) {
  const usable = Number.isFinite(value) && !(value < basePrecision)
  return usable ? Math.trunc(value) : fallback
}

/**
 * Coerces a value into a list.
 *
 * Arrays pass through. Strings starting with `[` are parsed as JSON (an
 * unparsable one becomes a single-item list); other strings are split on
 * commas with blank items removed. Anything else yields `[]`.
 *
 * @param {*} value Raw value.
 * @returns {Array} List form of the value.
 */
function parseList(value) {
  if (Array.isArray(value)) return value
  if (typeof value !== 'string') return []

  const text = value.trim()
  if (!text) return []

  if (text.charAt(0) === '[') {
    try {
      const parsed = JSON.parse(text)
      return Array.isArray(parsed) ? parsed : []
    } catch (err) {
      // Deliberate best effort: keep the raw text as the only item.
      return [text]
    }
  }

  return text
    .split(',')
    .map(function(item) { return item.trim() })
    .filter(Boolean)
}
/**
 * Decides whether a record is still current.
 *
 * A record is rejected when any of these holds:
 *  - `is_current` (or `mz:is_current`) is present and numerically <= 0
 *  - `deprecated` (or `edtf:deprecated`) is truthy and not `uuuu`
 *  - `superseded_by` (or `wof:superseded_by`) lists at least one entry
 *
 * NOTE(review): -1 is rejected here as well; if the data source uses -1
 * for "unknown", those records are dropped too — confirm this is intended.
 *
 * @param {Object} [properties] Feature properties.
 * @returns {boolean} True when the record should be kept.
 */
function isCurrentRecord(properties) {
  const props = properties || {}
  // The prefixed key is consulted only when the plain key is undefined.
  const read = function(plainKey, prefixedKey) {
    return props[plainKey] === undefined ? props[prefixedKey] : props[plainKey]
  }

  const currentFlag = read('is_current', 'mz:is_current')
  const hasFlag = currentFlag !== undefined && currentFlag !== null
  if (hasFlag && Number(currentFlag) <= 0) {
    return false
  }

  const deprecated = read('deprecated', 'edtf:deprecated')
  if (deprecated && String(deprecated).toLowerCase() !== 'uuuu') {
    return false
  }

  return parseList(read('superseded_by', 'wof:superseded_by')).length === 0
}

/**
 * Picks a display name from the usual name properties, falling back to a
 * string feature id.
 *
 * @param {Object} [properties] Feature properties.
 * @param {Object} [feature] Owning feature (only `id` is read).
 * @returns {string|null} Name, or null when nothing usable exists.
 */
function extractName(properties, feature) {
  const props = properties || {}
  const sources = [
    props.name,
    props['wof:name'],
    props['name:preferred'],
    props.name_preferred,
    feature && feature.id
  ]

  for (const source of sources) {
    const name = pickFirstString(source)
    if (name) return name
  }
  return null
}

/**
 * Reads the placetype from `placetype`, `wof:placetype` or `place_type`.
 *
 * @param {Object} [properties] Feature properties.
 * @returns {string|null} Placetype as stored, or null.
 */
function extractPlacetype(properties) {
  const props = properties || {}
  for (const key of ['placetype', 'wof:placetype', 'place_type']) {
    const placetype = pickFirstString(props[key])
    if (placetype) return placetype
  }
  return null
}

/**
 * Reads the country identifier from the first populated country property.
 *
 * @param {Object} [properties] Feature properties.
 * @returns {string} Country id, or an empty string when absent.
 */
function extractCountryId(properties) {
  const props = properties || {}
  const keys = ['country_id', 'iso:country', 'country_code', 'country', 'wof:country']
  for (const key of keys) {
    const countryId = pickFirstString(props[key])
    if (countryId) return countryId
  }
  return ''
}

/**
 * Returns the first parsable `region_id` found in `wof:hierarchy`.
 *
 * @param {Object} [properties] Feature properties.
 * @returns {number|null} Region id, or null.
 */
function extractHierarchyRegionId(properties) {
  const hierarchy = properties && properties['wof:hierarchy']
  if (!Array.isArray(hierarchy)) {
    return null
  }

  for (const branch of hierarchy) {
    if (!branch || typeof branch !== 'object') continue

    const regionId = parseOptionalInt(branch.region_id)
    if (regionId !== null) return regionId
  }

  return null
}
/**
 * Resolves the admin1 (region) id.
 *
 * Tries `admin1_id`, `gn:admin1_id`, `region_id`, then the hierarchy.
 * Falsy ids (including 0) are treated as missing.
 *
 * @param {Object} [properties] Feature properties.
 * @returns {number|null} Admin1 id, or null.
 */
function extractAdmin1Id(properties) {
  const props = properties || {}

  for (const key of ['admin1_id', 'gn:admin1_id', 'region_id']) {
    const id = parseOptionalInt(props[key])
    if (id) return id
  }

  return extractHierarchyRegionId(props) || null
}

/**
 * Resolves a representative point for a place.
 *
 * Latitude and longitude are each taken from the first parsable of the
 * `centroid_*`, `lbl:*` and `geom:*` properties. If either one is missing,
 * both come from the centre of the geometry's bounding box.
 *
 * @param {Object} [properties] Feature properties.
 * @param {Object} normalizedGeometry Geometry for `geometry.geometryBbox`.
 * @returns {{latitude: number, longitude: number}}
 */
function extractCentroid(properties, normalizedGeometry) {
  const props = properties || {}
  const firstFloat = function(keys) {
    for (const key of keys) {
      const parsed = parseOptionalFloat(props[key])
      if (parsed !== null) return parsed
    }
    return null
  }

  const latitude = firstFloat(['centroid_lat', 'lbl:latitude', 'geom:latitude'])
  const longitude = firstFloat(['centroid_lon', 'lbl:longitude', 'geom:longitude'])
  if (latitude !== null && longitude !== null) {
    return { latitude: latitude, longitude: longitude }
  }

  const bbox = geometry.geometryBbox(normalizedGeometry)
  return {
    latitude: (bbox.minLat + bbox.maxLat) / 2,
    longitude: (bbox.minLon + bbox.maxLon) / 2
  }
}

/**
 * Reads the population from the first parsable population property.
 *
 * @param {Object} [properties] Feature properties.
 * @returns {number} Population, or 0 when missing or negative.
 */
function extractPopulation(properties) {
  const props = properties || {}
  const keys = ['population', 'gn:population', 'wof:population', 'mz:population']

  let population = null
  for (const key of keys) {
    population = parseOptionalInt(props[key])
    if (population !== null) break
  }

  return population === null || population < 0 ? 0 : population
}

/**
 * Approximates the area of a lat/lon bounding box in square kilometres.
 *
 * Uses 111.32 km per degree and scales longitude by the cosine of the
 * box's central latitude. Never returns a negative value.
 *
 * @param {{minLat: *, maxLat: *, minLon: *, maxLon: *}} bbox
 * @returns {number} Approximate area in km².
 */
function bboxAreaKm2(bbox) {
  const south = Number(bbox.minLat)
  const north = Number(bbox.maxLat)
  const west = Number(bbox.minLon)
  const east = Number(bbox.maxLon)

  const centerLat = (south + north) / 2
  const heightKm = Math.abs(north - south) * 111.32
  const widthKm = Math.abs(east - west) * 111.32 * Math.cos(centerLat * Math.PI / 180)

  return Math.max(0, heightKm * Math.max(0, widthKm))
}

/**
 * Rounds a coordinate to a fixed number of decimals.
 *
 * @param {*} value Coordinate (coerced with `Number`).
 * @param {number} decimals Number of decimal places.
 * @returns {number} Rounded coordinate.
 */
function roundCoordinate(value, decimals) {
  const scale = Math.pow(10, decimals)
  return Math.round(Number(value) * scale) / scale
}
/**
 * Rounds every vertex of a ring and tidies the result.
 *
 * Consecutive vertices that become identical after rounding are merged,
 * and the ring is closed again if rounding left it open.
 *
 * @param {Array<Array<number>>} ring `[lon, lat]` pairs.
 * @param {number} decimals Decimal places to keep.
 * @returns {Array<Array<number>>} New ring (possibly empty or degenerate).
 */
function roundRing(ring, decimals) {
  const vertices = []

  for (let index = 0; index < ring.length; index++) {
    const lon = roundCoordinate(ring[index][0], decimals)
    const lat = roundCoordinate(ring[index][1], decimals)
    const previous = vertices[vertices.length - 1]

    if (previous === undefined || previous[0] !== lon || previous[1] !== lat) {
      vertices.push([lon, lat])
    }
  }

  if (vertices.length === 0) return vertices

  const start = vertices[0]
  const end = vertices[vertices.length - 1]
  const isClosed = start[0] === end[0] && start[1] === end[1]
  if (!isClosed) {
    vertices.push([start[0], start[1]])
  }

  return vertices
}

/**
 * Normalizes a geometry and, when `decimals` is finite, rounds its
 * coordinates.
 *
 * Rings left with fewer than four points and polygons left with no rings
 * are dropped. If nothing survives, the unrounded normalized geometry is
 * returned instead.
 *
 * @param {Object} inputGeometry Geometry for `geometry.normalizeGeometry`.
 * @param {*} decimals Decimal places; non-finite disables rounding.
 * @returns {Object} Normalized (and possibly rounded) geometry.
 */
function quantizeGeometry(inputGeometry, decimals) {
  const normalized = geometry.normalizeGeometry(inputGeometry)
  if (!Number.isFinite(decimals)) {
    return normalized
  }

  const roundedPolygons = []
  for (const polygon of normalized.coordinates) {
    const rings = polygon
      .map(function(ring) { return roundRing(ring, decimals) })
      .filter(function(ring) { return ring.length >= 4 })

    if (rings.length > 0) {
      roundedPolygons.push(rings)
    }
  }

  if (roundedPolygons.length === 0) {
    return normalized
  }

  return geometry.normalizeGeometry({
    type: 'MultiPolygon',
    coordinates: roundedPolygons
  })
}

/**
 * Tells whether a locality is marked as a country capital.
 *
 * True when the feature code (`gn:feature_code`, `gn:fcode` or
 * `ne:FEATURE_CO`) equals `PPLC` case-insensitively, or when
 * `wof:capital_of` is a non-empty array.
 *
 * @param {Object} [properties] Feature properties.
 * @returns {boolean}
 */
function isCapitalLocality(properties) {
  const props = properties || {}

  let featureCode = ''
  for (const key of ['gn:feature_code', 'gn:fcode', 'ne:FEATURE_CO']) {
    const code = pickFirstString(props[key])
    if (code) {
      featureCode = code
      break
    }
  }

  if (String(featureCode).toUpperCase() === 'PPLC') {
    return true
  }

  const capitalOf = props['wof:capital_of']
  return Array.isArray(capitalOf) && capitalOf.length > 0
}

/**
 * Extracts latitude/longitude from a GeoJSON Point.
 *
 * @param {Object} pointGeometry Candidate geometry.
 * @returns {{latitude: number, longitude: number}|null} Null unless the
 *   geometry is a Point with two parsable coordinates.
 */
function extractPointCoordinates(pointGeometry) {
  const isPoint = Boolean(pointGeometry) &&
    pointGeometry.type === 'Point' &&
    Array.isArray(pointGeometry.coordinates)
  if (!isPoint) {
    return null
  }

  // GeoJSON order is [lon, lat].
  const longitude = parseOptionalFloat(pointGeometry.coordinates[0])
  const latitude = parseOptionalFloat(pointGeometry.coordinates[1])
  if (latitude === null || longitude === null) {
    return null
  }

  return { latitude: latitude, longitude: longitude }
}
/**
 * Builds a closed rectangular GeoJSON Polygon from a bounding box.
 *
 * @param {{minLon: number, minLat: number, maxLon: number, maxLat: number}} bbox
 * @returns {{type: string, coordinates: Array}} Polygon with one ring.
 */
function bboxPolygon(bbox) {
  const west = bbox.minLon
  const south = bbox.minLat
  const east = bbox.maxLon
  const north = bbox.maxLat

  return {
    type: 'Polygon',
    coordinates: [[
      [west, south],
      [east, south],
      [east, north],
      [west, north],
      [west, south]
    ]]
  }
}

/**
 * Converts a raw GeoJSON feature into the place record used by the index
 * builder, or returns null when the feature must be skipped.
 *
 * A feature is skipped when it is not a Feature with a typed geometry, is
 * not current, has a placetype that is not enabled, falls below the
 * population thresholds, has no integer id, has no usable geometry
 * (polygons, or a Point for a capital locality), or has no name.
 *
 * Localities and counties below `opts.minPopulation` survive only as
 * isolation candidates, when `opts.isolationMinPopulation` is a positive
 * number they reach. Capital localities are exempt from the filter.
 *
 * @param {Object} feature Raw GeoJSON feature.
 * @param {Object} opts Parsed CLI options.
 * @returns {Object|null} Normalized place, or null.
 */
function normalizeFeature(feature, opts) {
  if (!feature || feature.type !== 'Feature') return null
  if (!feature.geometry || !feature.geometry.type) return null

  const properties = feature.properties || {}
  const placetype = (extractPlacetype(properties) || '').toLowerCase()
  if (!isCurrentRecord(properties)) return null

  const placetypeEnabled = placetype === 'locality' ||
    (opts.includeLocaladmin && placetype === 'localadmin') ||
    (opts.includeCounty && placetype === 'county') ||
    (opts.includeRegion && placetype === 'region')
  if (!placetypeEnabled) return null

  const population = extractPopulation(properties)
  const isCapital = placetype === 'locality' && isCapitalLocality(properties)
  const isPopulationFiltered = placetype === 'locality' || placetype === 'county'

  let isolationCandidate = false
  if (isPopulationFiltered && population < opts.minPopulation && !isCapital) {
    const floor = opts.isolationMinPopulation
    const reachesFloor = Number.isFinite(floor) && floor > 0 && population >= floor
    if (!reachesFloor) return null
    isolationCandidate = true
  }

  // Id sources in priority order: feature.id, properties.id, wof:id.
  const isBlank = function(value) {
    return value === undefined || value === null || value === ''
  }
  let rawId = feature.id
  if (isBlank(rawId)) rawId = properties.id
  if (isBlank(rawId)) rawId = properties['wof:id']
  const id = parseOptionalInt(rawId)
  if (id === null) return null

  const geometryType = feature.geometry.type
  const isPolygonal = geometryType === 'Polygon' || geometryType === 'MultiPolygon'
  const isPointCapital = geometryType === 'Point' && isCapital
  if (!isPolygonal && !isPointCapital) return null

  let normalizedGeometry
  let pointCapitalHash = null
  if (isPointCapital) {
    const point = extractPointCoordinates(feature.geometry)
    if (!point) return null

    // A capital given only as a point is represented by the geohash cell
    // that contains it, at locality precision.
    pointCapitalHash = geohash.encode(point.latitude, point.longitude, opts.localityMaxPrecision)
    normalizedGeometry = geometry.normalizeGeometry(bboxPolygon(geohash.decodeBbox(pointCapitalHash)))
  } else {
    normalizedGeometry = quantizeGeometry(feature.geometry, opts.geometryDecimals)
  }

  const bbox = geometry.geometryBbox(normalizedGeometry)
  const centroid = extractCentroid(properties, normalizedGeometry)
  const countryId = extractCountryId(properties)

  const name = extractName(properties, feature)
  if (!name) return null

  const parsedRank = parseOptionalInt(properties.priority_rank)
  const priorityRank = parsedRank === null ? 0 : parsedRank
  const maxPrecisionForPlace = resolveMaxPrecisionForPlacetype(opts, placetype, bbox)

  let cover
  if (pointCapitalHash) {
    cover = [{
      geohash: pointCapitalHash,
      precision: pointCapitalHash.length,
      coverageType: 'full'
    }]
  } else {
    cover = boundaryCover.buildGeohashCoverForGeometry(normalizedGeometry, {
      basePrecision: Math.min(opts.basePrecision, maxPrecisionForPlace),
      maxPrecision: maxPrecisionForPlace
    })
  }

  const area = geometry.geometryArea(normalizedGeometry)

  // Geometry is retained only outside compact index mode.
  const keepGeometry = opts.indexMode !== 'compact'

  return {
    id: id,
    name: name,
    countryId: countryId,
    admin1Id: extractAdmin1Id(properties),
    placetype: placetype,
    placetypeCode: placetypeCode(placetype),
    centroidLat: centroid.latitude,
    centroidLon: centroid.longitude,
    population: population,
    bboxMinLat: bbox.minLat,
    bboxMinLon: bbox.minLon,
    bboxMaxLat: bbox.maxLat,
    bboxMaxLon: bbox.maxLon,
    priorityRank: priorityRank,
    area: area,
    countryName: pickFirstString(properties.country_name) || countryId || null,
    admin1Name: pickFirstString(properties.admin1_name) || null,
    geometry: keepGeometry ? normalizedGeometry : null,
    cover: cover,
    isolationCandidate: isolationCandidate
  }
}
/**
 * Builds the `country|admin1` key used to group places.
 *
 * @param {{countryId: *, admin1Id: *}} place
 * @returns {string} Key; a null admin1 id becomes an empty segment.
 */
function localityGroupKey(place) {
  const country = String(place.countryId || '')
  const admin1 = String(place.admin1Id === null ? '' : place.admin1Id)
  return country + '|' + admin1
}

/**
 * Drops city-like places that are fully contained by a larger place of the
 * same placetype within the same country/admin1 group.
 *
 * Within each group places are ordered by ascending area (then id) and each
 * is tested against the larger ones after it. The bounding box must be
 * contained first; full geometry containment is checked only when both
 * places carry geometry, otherwise the bbox test decides.
 *
 * @param {Object[]} places Normalized places (not mutated).
 * @param {boolean} enabled When falsy, nothing is pruned.
 * @returns {{places: Object[], dropped: Object[]}} Survivors plus one
 *   `{placeId, containedBy, group}` record per dropped place.
 */
function pruneContainedLocalities(places, enabled) {
  if (!enabled) {
    return { places: places, dropped: [] }
  }

  // Keyed by placetype code + country/admin1 so a place is only compared
  // with places of its own type.
  const groups = Object.create(null)
  for (const place of places) {
    if (!isCityPlacetypeCode(place.placetypeCode)) continue

    const groupKey = place.placetypeCode + '|' + localityGroupKey(place)
    if (!groups[groupKey]) {
      groups[groupKey] = []
    }
    groups[groupKey].push(place)
  }

  const bboxOf = function(place) {
    return {
      minLat: place.bboxMinLat,
      minLon: place.bboxMinLon,
      maxLat: place.bboxMaxLat,
      maxLon: place.bboxMaxLon
    }
  }

  const dropById = Object.create(null)

  for (const groupKey of Object.keys(groups)) {
    const ordered = groups[groupKey]
    ordered.sort(function(a, b) {
      return a.area !== b.area ? a.area - b.area : a.id - b.id
    })

    ordered.forEach(function(candidate, position) {
      if (dropById[candidate.id]) return

      for (let next = position + 1; next < ordered.length; next++) {
        const container = ordered[next]
        if (dropById[container.id]) continue
        if (container.area <= candidate.area) continue
        if (!geometry.bboxContainsBbox(bboxOf(container), bboxOf(candidate))) continue

        const contained = container.geometry && candidate.geometry
          ? geometry.geometryContainsGeometry(container.geometry, candidate.geometry)
          : true
        if (!contained) continue

        dropById[candidate.id] = {
          placeId: candidate.id,
          containedBy: container.id,
          group: groupKey
        }
        break
      }
    })
  }

  return {
    places: places.filter(function(place) { return !dropById[place.id] }),
    dropped: Object.keys(dropById).map(function(id) { return dropById[id] })
  }
}
/**
 * Drops regions that duplicate a same-named locality of the same country.
 *
 * A region is redundant only when the locality's bounding box lies inside
 * the region's bounding box AND covers at least half of its approximate
 * area. The area share keeps regions that merely contain a same-named city
 * (e.g. New York state and New York city).
 *
 * @param {Object[]} places Normalized places (not mutated).
 * @returns {{places: Object[], dropped: Object[]}} Survivors plus one
 *   `{placeId, replacedBy}` record per dropped region.
 */
function pruneRedundantRegions(places) {
  const REDUNDANT_AREA_SHARE = 0.5

  const nameKey = function(place) {
    return place.name + '|' + place.countryId
  }
  const bboxOf = function(place) {
    return {
      minLat: place.bboxMinLat,
      minLon: place.bboxMinLon,
      maxLat: place.bboxMaxLat,
      maxLon: place.bboxMaxLon
    }
  }

  const localitiesByKey = Object.create(null)
  for (const place of places) {
    if (place.placetypeCode !== PLACETYPE_CODES.locality) continue

    const key = nameKey(place)
    if (!localitiesByKey[key]) {
      localitiesByKey[key] = []
    }
    localitiesByKey[key].push(place)
  }

  const dropById = Object.create(null)
  for (const region of places) {
    if (region.placetypeCode !== PLACETYPE_CODES.region) continue

    const namesakes = localitiesByKey[nameKey(region)]
    if (!namesakes) continue

    const regionBbox = bboxOf(region)
    const regionAreaKm2 = bboxAreaKm2(regionBbox)
    // Also skips NaN areas, which a plain `<= 0` test would let through.
    if (!(regionAreaKm2 > 0)) continue

    const replacement = namesakes.find(function(locality) {
      const localityBbox = bboxOf(locality)
      if (!geometry.bboxContainsBbox(regionBbox, localityBbox)) return false
      return !(bboxAreaKm2(localityBbox) / regionAreaKm2 < REDUNDANT_AREA_SHARE)
    })

    if (replacement !== undefined) {
      dropById[region.id] = {
        placeId: region.id,
        replacedBy: replacement.id
      }
    }
  }

  return {
    places: places.filter(function(place) { return !dropById[place.id] }),
    dropped: Object.keys(dropById).map(function(id) { return dropById[id] })
  }
}
/**
 * Ranks a placetype for conflict resolution; lower ranks win.
 *
 * @param {string} placetype Lower-cased placetype name.
 * @returns {number} 0 locality, 1 localadmin/county, 2 region, 3 otherwise.
 */
function placetypeRank(placetype) {
  switch (placetype) {
    case 'locality':
      return 0
    case 'localadmin':
    case 'county':
      return 1
    case 'region':
      return 2
    default:
      return 3
  }
}

/**
 * Maps a placetype name to its numeric code in `PLACETYPE_CODES`.
 *
 * @param {string} placetype Lower-cased placetype name.
 * @returns {number} The code, or 9 when the placetype has no finite code.
 */
function placetypeCode(placetype) {
  const known = PLACETYPE_CODES[placetype]
  if (Number.isFinite(known)) {
    return known
  }
  return 9
}
code : 9 +} + +function resolveMaxPrecisionForPlacetype(opts, placetype, bbox) { + if (placetype === 'locality') return opts.localityMaxPrecision + if (placetype === 'localadmin') return opts.localadminMaxPrecision + if (placetype === 'county') return opts.countyMaxPrecision + if (placetype === 'region') { + var regionPrecision = opts.regionMaxPrecision + if (Number.isFinite(opts.regionSparseMaxPrecision) && Number.isFinite(opts.regionSparseMinAreaKm2)) { + var areaKm2 = bboxAreaKm2(bbox) + if (areaKm2 >= opts.regionSparseMinAreaKm2) { + regionPrecision = Math.min(regionPrecision, opts.regionSparseMaxPrecision) + } + } + return regionPrecision + } + return opts.maxPrecision +} + +function pointDistanceScore(latitude, longitude, targetLatitude, targetLongitude) { + var lat = Number(latitude) + var lon = Number(longitude) + var targetLat = Number(targetLatitude) + var targetLon = Number(targetLongitude) + var scale = Math.pow(Math.cos(lat * Math.PI / 180), 2) + + return ((lat - targetLat) * (lat - targetLat)) + + ((lon - targetLon) * (lon - targetLon) * scale) +} + +function comparePlacesForHash(a, b, hash, hashCenterCache) { + var typeRankA = placetypeRank(a.placetype) + var typeRankB = placetypeRank(b.placetype) + if (typeRankA !== typeRankB) { + return typeRankA - typeRankB + } + + if (a.population !== b.population) { + return b.population - a.population + } + + var center = hashCenterCache[hash] + if (!center) { + var bbox = geohash.decodeBbox(hash) + center = { + latitude: (bbox.minLat + bbox.maxLat) / 2, + longitude: (bbox.minLon + bbox.maxLon) / 2 + } + hashCenterCache[hash] = center + } + + var distanceA = pointDistanceScore(center.latitude, center.longitude, a.centroidLat, a.centroidLon) + var distanceB = pointDistanceScore(center.latitude, center.longitude, b.centroidLat, b.centroidLon) + if (distanceA !== distanceB) { + return distanceA - distanceB + } + + if (a.area !== b.area) { + return a.area - b.area + } + + return a.id - b.id +} + +function 
/**
 * Tells whether a placetype code denotes a city-like place
 * (locality, localadmin or county).
 *
 * @param {number} code Placetype code.
 * @returns {boolean}
 */
function isCityPlacetypeCode(code) {
  const cityCodes = [
    PLACETYPE_CODES.locality,
    PLACETYPE_CODES.localadmin,
    PLACETYPE_CODES.county
  ]
  return cityCodes.some(function(cityCode) { return code === cityCode })
}

/**
 * Reads a place's population defensively.
 *
 * @param {Object} [place] Place record.
 * @returns {number} Population, or 0 when missing, non-finite or negative.
 */
function placePopulation(place) {
  if (!place) return 0

  const population = Number(place.population)
  return Number.isFinite(population) && population >= 0 ? population : 0
}

/**
 * Tells whether a city-like place reaches the "major" population threshold
 * (`opts.dominantLocalityPopulation`).
 *
 * @param {Object} [place] Place record.
 * @param {Object} opts Parsed CLI options.
 * @returns {boolean} False when the threshold is not a positive number.
 */
function isMajorLocality(place, opts) {
  if (!place || !isCityPlacetypeCode(place.placetypeCode)) {
    return false
  }

  const threshold = Number(opts.dominantLocalityPopulation)
  const thresholdUsable = Number.isFinite(threshold) && threshold > 0
  return thresholdUsable && placePopulation(place) >= threshold
}

/**
 * Picks the single dominant locality among several competitors, if any.
 *
 * The most populous locality wins only when it reaches
 * `opts.dominantLocalityPopulation`, no other competitor reaches it, and it
 * is at least `opts.dominantLocalityRatio` times the runner-up (ratios that
 * are not finite or below 1 count as 1).
 *
 * @param {Array} localityIds Competing place ids (at least two).
 * @param {Object} placeById Places keyed by stringified id.
 * @param {Object} opts Parsed CLI options.
 * @returns {number|null} Winning id as a number, or null.
 */
function selectDominantLocalityId(localityIds, placeById, opts) {
  if (!Array.isArray(localityIds) || localityIds.length < 2) {
    return null
  }

  const threshold = Number(opts.dominantLocalityPopulation)
  if (!Number.isFinite(threshold) || threshold <= 0) {
    return null
  }

  const configuredRatio = Number(opts.dominantLocalityRatio)
  const ratio = Number.isFinite(configuredRatio) && configuredRatio >= 1 ? configuredRatio : 1

  const ranked = localityIds.map(function(id) {
    return { id: Number(id), population: placePopulation(placeById[String(id)]) }
  })
  ranked.sort(function(a, b) {
    return a.population !== b.population ? b.population - a.population : a.id - b.id
  })

  const leader = ranked[0]
  const runnerUp = ranked[1]
  if (leader.population < threshold) {
    return null
  }

  // The list is sorted by descending population, so if the runner-up is
  // below the threshold every later entry is too.
  if (runnerUp.population >= threshold) {
    return null
  }

  if (runnerUp.population > 0 && leader.population < runnerUp.population * ratio) {
    return null
  }

  return leader.id
}
/**
 * Fraction of a parent cell's 32 children owned by a locality.
 *
 * @param {number|string} localityId Locality id.
 * @param {Object} [group] Parent group carrying `localityCellCountById`.
 * @returns {number} Share in [0, 1]; 0 when unknown.
 */
function localityShareInParent(localityId, group) {
  if (!group) return 0

  const counts = group.localityCellCountById || Object.create(null)
  const ownedCells = Number(counts[String(localityId)] || 0)
  // A geohash cell splits into 32 children at the next precision.
  return ownedCells ? ownedCells / 32 : 0
}

/**
 * Checks a locality's share of a parent cell against
 * `opts.parentLocalityMinShare`.
 *
 * @param {number|string} localityId Locality id.
 * @param {Object} group Parent group.
 * @param {Object} opts Parsed CLI options.
 * @returns {boolean} Always true when the threshold is not a positive number.
 */
function localityShareMeetsThreshold(localityId, group, opts) {
  const minShare = Number(opts.parentLocalityMinShare)
  const thresholdActive = Number.isFinite(minShare) && minShare > 0
  return !thresholdActive || localityShareInParent(localityId, group) >= minShare
}

/**
 * Rolls localities up into parent geohash cells. Mutates `bestByHash`.
 *
 * Working from `maxPrecision - 1` down to `basePrecision`, the children of
 * each parent cell are grouped:
 *  - one city-like child place: it takes the parent when it meets the share
 *    threshold, the parent is not held by a different city-like place, and
 *    a region competes (as a sibling child or as the parent's owner);
 *  - several: the dominant locality takes the parent when it meets the
 *    share threshold and the parent is not held by another major locality.
 *
 * Below each promoted parent, region cells are removed; after a dominant
 * promotion, non-major city-like cells are removed as well.
 *
 * @param {Object} bestByHash Map geohash -> place id, edited in place.
 * @param {Object} placeById Places keyed by stringified id.
 * @param {Object} opts Parsed CLI options.
 * @returns {void}
 */
function promoteLocalityParentsByRegionCompetition(bestByHash, placeById, opts) {
  if (!opts.promoteLocalityOverRegion) return

  const minPrecision = Number(opts.basePrecision || 1)
  const maxPrecision = Number(opts.maxPrecision || minPrecision)
  if (maxPrecision <= minPrecision) return

  for (let precision = maxPrecision - 1; precision >= minPrecision; precision--) {
    // Pass 1: summarize the children of every parent cell.
    const groupByParent = Object.create(null)
    for (const childHash of Object.keys(bestByHash)) {
      if (childHash.length !== precision + 1) continue

      const childPlace = placeById[String(bestByHash[childHash])]
      if (!childPlace) continue

      const parentKey = childHash.slice(0, precision)
      if (!groupByParent[parentKey]) {
        groupByParent[parentKey] = {
          localityById: Object.create(null),
          localityCellCountById: Object.create(null),
          hasRegion: false
        }
      }
      const childGroup = groupByParent[parentKey]

      if (isCityPlacetypeCode(childPlace.placetypeCode)) {
        const placeKey = String(childPlace.id)
        childGroup.localityById[placeKey] = true
        childGroup.localityCellCountById[placeKey] =
          Number(childGroup.localityCellCountById[placeKey] || 0) + 1
      } else if (childPlace.placetypeCode === PLACETYPE_CODES.region) {
        childGroup.hasRegion = true
      }
    }

    // Pass 2: decide which parents are taken over.
    const promotedParents = Object.create(null)
    for (const parentHash of Object.keys(groupByParent)) {
      const group = groupByParent[parentHash]
      const localityIds = Object.keys(group.localityById)
      if (!localityIds.length) continue

      const existingId = bestByHash[parentHash]
      const existingPlace = existingId !== undefined ? placeById[String(existingId)] : null
      const existingIsCity = Boolean(existingPlace) && isCityPlacetypeCode(existingPlace.placetypeCode)

      let promotion
      if (localityIds.length === 1) {
        const localityId = Number(localityIds[0])
        if (!localityShareMeetsThreshold(localityId, group, opts)) continue
        if (existingIsCity && Number(existingId) !== localityId) continue

        const existingIsRegion = Boolean(existingPlace) &&
          existingPlace.placetypeCode === PLACETYPE_CODES.region
        if (!group.hasRegion && !existingIsRegion) continue

        promotion = { localityId: localityId, suppressMinorLocalities: false }
      } else {
        const dominantId = selectDominantLocalityId(localityIds, placeById, opts)
        if (dominantId === null) continue
        if (!localityShareMeetsThreshold(dominantId, group, opts)) continue
        if (existingIsCity &&
            Number(existingId) !== dominantId &&
            isMajorLocality(existingPlace, opts)) {
          continue
        }

        promotion = { localityId: dominantId, suppressMinorLocalities: true }
      }

      bestByHash[parentHash] = promotion.localityId
      promotedParents[parentHash] = promotion
    }

    if (!Object.keys(promotedParents).length) continue

    // Pass 3: remove descendant cells the promotion makes redundant.
    for (const descendantHash of Object.keys(bestByHash)) {
      if (descendantHash.length <= precision) continue

      const promoted = promotedParents[descendantHash.slice(0, precision)]
      if (!promoted) continue

      const descendantId = Number(bestByHash[descendantHash])
      if (descendantId === promoted.localityId) continue

      const descendantPlace = placeById[String(descendantId)]
      if (!descendantPlace) continue

      const isRegion = descendantPlace.placetypeCode === PLACETYPE_CODES.region
      const isSuppressedMinor = promoted.suppressMinorLocalities &&
        isCityPlacetypeCode(descendantPlace.placetypeCode) &&
        !isMajorLocality(descendantPlace, opts)

      if (isRegion || isSuppressedMinor) {
        delete bestByHash[descendantHash]
      }
    }
  }
}
/**
 * Builds the compact geohash -> place lookup rows.
 *
 * Steps:
 *  1. each covered cell is assigned to its best place (`comparePlacesForHash`);
 *  2. localities are rolled up into parent cells
 *     (`promoteLocalityParentsByRegionCompetition`);
 *  3. rows are sorted by geohash length, then lexicographically;
 *  4. rows that add nothing over their nearest kept ancestor are dropped.
 *
 * Step 4 compares a row with its NEAREST kept ancestor only. The previous
 * check against any ancestor also dropped a row such as `u012 -> A` when
 * `u0 -> A` existed but `u01 -> B` sat in between, which would hand that
 * cell to B.
 * NOTE(review): this assumes the runtime lookup resolves a point to its
 * longest stored prefix — confirm against the reverse lookup code.
 *
 * @param {Object[]} places Normalized places carrying `id` and `cover`.
 * @param {Object} opts Parsed CLI options (forwarded to the promotion step).
 * @returns {Array<{geohash: string, placeId: number}>} Sorted, compacted rows.
 */
function buildCompactLookupRows(places, opts) {
  const placeById = Object.create(null)
  for (const place of places) {
    placeById[String(place.id)] = place
  }

  // Step 1: best place per covered cell.
  const bestByHash = Object.create(null)
  const hashCenterCache = Object.create(null)
  for (const place of places) {
    for (const cell of place.cover) {
      const incumbent = bestByHash[cell.geohash]
      if (!incumbent || comparePlacesForHash(place, incumbent, cell.geohash, hashCenterCache) < 0) {
        bestByHash[cell.geohash] = place
      }
    }
  }

  // Step 2: the promotion step works on ids and edits the map in place.
  const bestIdByHash = Object.create(null)
  for (const hash of Object.keys(bestByHash)) {
    bestIdByHash[hash] = bestByHash[hash].id
  }
  promoteLocalityParentsByRegionCompetition(bestIdByHash, placeById, opts)

  // Step 3: coarse cells first, so ancestors are settled before descendants.
  const rows = Object.keys(bestIdByHash).map(function(hash) {
    return { geohash: hash, placeId: bestIdByHash[hash] }
  })
  rows.sort(function(a, b) {
    if (a.geohash.length !== b.geohash.length) {
      return a.geohash.length - b.geohash.length
    }
    if (a.geohash < b.geohash) return -1
    if (a.geohash > b.geohash) return 1
    return 0
  })

  // Step 4: drop rows already implied by their nearest kept ancestor.
  const keptByHash = Object.create(null)
  const compact = []
  for (const row of rows) {
    let redundant = false

    for (let length = row.geohash.length - 1; length >= 1; length--) {
      const ancestorPlaceId = keptByHash[row.geohash.slice(0, length)]
      if (ancestorPlaceId !== undefined) {
        redundant = ancestorPlaceId === row.placeId
        break
      }
    }

    if (redundant) continue

    keptByHash[row.geohash] = row.placeId
    compact.push(row)
  }

  return compact
}
/**
 * Promise wrapper around `db.exec`.
 *
 * @param {Object} db Database handle exposing `exec(sql, callback)`.
 * @param {string} sql One or more SQL statements.
 * @returns {Promise<void>}
 */
function dbExec(db, sql) {
  return new Promise((resolve, reject) => {
    db.exec(sql, (err) => {
      if (err) {
        reject(err)
        return
      }
      resolve()
    })
  })
}

/**
 * Promise wrapper around `db.run`.
 *
 * @param {Object} db Database handle exposing `run(sql, params, callback)`.
 * @param {string} sql SQL statement.
 * @param {Array} [params] Bound parameters (defaults to `[]`).
 * @returns {Promise<Object>} Resolves with the callback's `this` value.
 */
function dbRun(db, sql, params) {
  const boundParams = params || []
  return new Promise((resolve, reject) => {
    // Must stay a classic function: the resolved value is the `this` the
    // driver binds for the callback.
    db.run(sql, boundParams, function onRun(err) {
      if (err) {
        reject(err)
        return
      }
      resolve(this)
    })
  })
}

/**
 * Promise wrapper around `db.close`.
 *
 * @param {Object} db Database handle exposing `close(callback)`.
 * @returns {Promise<void>}
 */
function dbClose(db) {
  return new Promise((resolve, reject) => {
    db.close((err) => {
      if (err) {
        reject(err)
        return
      }
      resolve()
    })
  })
}

/**
 * Promise wrapper around `stmt.run`.
 *
 * @param {Object} stmt Prepared statement exposing `run(params, callback)`.
 * @param {Array} params Bound parameters.
 * @returns {Promise<void>}
 */
function stmtRun(stmt, params) {
  return new Promise((resolve, reject) => {
    stmt.run(params, (err) => {
      if (err) {
        reject(err)
        return
      }
      resolve()
    })
  })
}

/**
 * Promise wrapper around `stmt.finalize`.
 *
 * @param {Object} stmt Prepared statement exposing `finalize(callback)`.
 * @returns {Promise<void>}
 */
function stmtFinalize(stmt) {
  return new Promise((resolve, reject) => {
    stmt.finalize((err) => {
      if (err) {
        reject(err)
        return
      }
      resolve()
    })
  })
}

/**
 * Creates the boundary tables and indexes if they do not exist yet.
 *
 * Compact mode (`opts.indexMode === 'compact'`) first drops all boundary
 * tables, compact and full, when `opts.replace` is set, then creates the
 * two compact tables. Any other mode creates the full schema and drops
 * nothing.
 *
 * @param {Object} db Database handle passed to `dbExec`.
 * @param {{indexMode: string, replace: boolean}} opts Parsed CLI options.
 * @returns {Promise<void>}
 */
async function ensureBoundarySchema(db, opts) {
  if (opts.indexMode !== 'compact') {
    await dbExec(db, `
      CREATE TABLE IF NOT EXISTS countries(
        id TEXT PRIMARY KEY,
        name TEXT NOT NULL
      );

      CREATE TABLE IF NOT EXISTS admin1(
        country_id TEXT NOT NULL,
        id INTEGER NOT NULL,
        name TEXT NOT NULL,
        PRIMARY KEY (country_id, id)
      );

      CREATE TABLE IF NOT EXISTS places(
        id INTEGER PRIMARY KEY,
        name TEXT NOT NULL,
        country_id TEXT NOT NULL,
        admin1_id INTEGER,
        placetype TEXT NOT NULL,
        centroid_lat REAL NOT NULL,
        centroid_lon REAL NOT NULL,
        bbox_min_lat REAL NOT NULL,
        bbox_min_lon REAL NOT NULL,
        bbox_max_lat REAL NOT NULL,
        bbox_max_lon REAL NOT NULL,
        priority_rank INTEGER NOT NULL DEFAULT 0,
        area REAL NOT NULL DEFAULT 0,
        country_name TEXT,
        admin1_name TEXT
      );

      CREATE TABLE IF NOT EXISTS place_geohash_cover(
        geohash TEXT NOT NULL,
        precision INTEGER NOT NULL,
        place_id INTEGER NOT NULL,
        coverage_type TEXT NOT NULL CHECK (coverage_type IN ('full', 'partial')),
        PRIMARY KEY (geohash, precision, place_id),
        FOREIGN KEY (place_id) REFERENCES places(id)
      );

      CREATE TABLE IF NOT EXISTS place_geometry(
        place_id INTEGER PRIMARY KEY,
        encoding TEXT NOT NULL DEFAULT 'json',
        geometry BLOB NOT NULL,
        FOREIGN KEY (place_id) REFERENCES places(id)
      );

      CREATE TABLE IF NOT EXISTS place_geohash_lookup(
        geohash TEXT PRIMARY KEY,
        place_id INTEGER NOT NULL,
        FOREIGN KEY (place_id) REFERENCES places(id)
      );

      CREATE INDEX IF NOT EXISTS place_geohash_cover_hash_precision ON place_geohash_cover (geohash, precision);
      CREATE INDEX IF NOT EXISTS place_geohash_cover_place_id ON place_geohash_cover (place_id);
      CREATE INDEX IF NOT EXISTS places_placetype ON places (placetype);
      CREATE INDEX IF NOT EXISTS place_geometry_place_id ON place_geometry (place_id);
      CREATE INDEX IF NOT EXISTS place_geohash_lookup_place_id ON place_geohash_lookup (place_id);
    `)
    return
  }

  if (opts.replace) {
    await dbExec(db, `
      DROP TABLE IF EXISTS compact_geohash_lookup;
      DROP TABLE IF EXISTS compact_places;
      DROP TABLE IF EXISTS place_geohash_cover;
      DROP TABLE IF EXISTS place_geometry;
      DROP TABLE IF EXISTS place_geohash_lookup;
      DROP TABLE IF EXISTS places;
      DROP TABLE IF EXISTS countries;
      DROP TABLE IF EXISTS admin1;
    `)
  }

  await dbExec(db, `
    CREATE TABLE IF NOT EXISTS compact_places(
      id INTEGER PRIMARY KEY,
      name TEXT NOT NULL,
      country_id TEXT NOT NULL,
      admin1_id INTEGER,
      placetype_code INTEGER NOT NULL,
      latitude REAL NOT NULL,
      longitude REAL NOT NULL
    );

    CREATE TABLE IF NOT EXISTS compact_geohash_lookup(
      geohash TEXT PRIMARY KEY,
      place_id INTEGER NOT NULL,
      FOREIGN KEY (place_id) REFERENCES compact_places(id)
    );

    CREATE INDEX IF NOT EXISTS compact_places_placetype_code ON compact_places (placetype_code);
    CREATE INDEX IF NOT EXISTS compact_geohash_lookup_place_id ON compact_geohash_lookup (place_id);
  `)
}
CREATE INDEX IF NOT EXISTS places_placetype ON places (placetype); + CREATE INDEX IF NOT EXISTS place_geometry_place_id ON place_geometry (place_id); + CREATE INDEX IF NOT EXISTS place_geohash_lookup_place_id ON place_geohash_lookup (place_id); + `) +} + +function normalizePlaces(files, opts) { + var byId = Object.create(null) + var candidateById = Object.create(null) + var normalizedCount = 0 + + for (var i = 0; i < files.length; i++) { + var features = readFeatures(files[i]) + + for (var j = 0; j < features.length; j++) { + var place = normalizeFeature(features[j], opts) + if (!place) continue + + normalizedCount += 1 + + if (place.isolationCandidate) { + candidateById[String(place.id)] = place + } else { + byId[String(place.id)] = place + } + + if (opts.maxPlaces && Object.keys(byId).length >= opts.maxPlaces) { + break + } + } + + if (opts.maxPlaces && Object.keys(byId).length >= opts.maxPlaces) { + break + } + } + + var places = Object.keys(byId) + .map(function(id) { return byId[id] }) + .sort(function(a, b) { return a.id - b.id }) + + var candidates = Object.keys(candidateById) + .map(function(id) { return candidateById[id] }) + .sort(function(a, b) { return a.id - b.id }) + + return { + places: places, + candidates: candidates, + normalizedCount: normalizedCount + } +} + +function promoteIsolatedLocalities(places, candidates, opts) { + if (!candidates.length) { + return { places: places, promoted: 0, countryFills: 0 } + } + + // Build set of geohash cells already claimed by primary places at base precision + var claimedCells = Object.create(null) + for (var i = 0; i < places.length; i++) { + var place = places[i] + if (place.placetype !== 'locality' && place.placetype !== 'localadmin') continue + for (var j = 0; j < place.cover.length; j++) { + var hash = place.cover[j].geohash + // Claim at base precision: truncate to basePrecision length + var baseHash = hash.length > opts.basePrecision ? 
hash.slice(0, opts.basePrecision) : hash + claimedCells[baseHash] = true + } + } + + // Track which countries already have at least one city-like place + var countriesWithLocality = Object.create(null) + for (var i = 0; i < places.length; i++) { + if (isCityPlacetypeCode(places[i].placetypeCode)) { + countriesWithLocality[places[i].countryId] = true + } + } + + // Sort candidates by population descending so higher-pop isolated places win first + var sortedCandidates = candidates.slice().sort(function(a, b) { + return b.population - a.population + }) + + var promoted = 0 + var countryFills = 0 + var result = places.slice() + + for (var c = 0; c < sortedCandidates.length; c++) { + var candidate = sortedCandidates[c] + var isIsolated = false + + for (var k = 0; k < candidate.cover.length; k++) { + var hash = candidate.cover[k].geohash + var baseHash = hash.length > opts.basePrecision ? hash.slice(0, opts.basePrecision) : hash + if (!claimedCells[baseHash]) { + isIsolated = true + break + } + } + + if (!isIsolated) continue + + candidate.isolationCandidate = false + result.push(candidate) + promoted += 1 + + // Mark its cells as claimed + for (var k = 0; k < candidate.cover.length; k++) { + var hash = candidate.cover[k].geohash + var baseHash = hash.length > opts.basePrecision ? 
hash.slice(0, opts.basePrecision) : hash + claimedCells[baseHash] = true + } + + if (!countriesWithLocality[candidate.countryId]) { + countriesWithLocality[candidate.countryId] = true + countryFills += 1 + } + } + + // Ensure every country has at least one locality + if (opts.ensureCountryLocality) { + var candidatesByCountry = Object.create(null) + for (var c = 0; c < sortedCandidates.length; c++) { + var candidate = sortedCandidates[c] + if (candidate.isolationCandidate === false) continue // already promoted + if (!isCityPlacetypeCode(candidate.placetypeCode)) continue + var cc = candidate.countryId + if (!candidatesByCountry[cc]) { + candidatesByCountry[cc] = candidate // first = highest pop (already sorted) + } + } + + var countryKeys = Object.keys(candidatesByCountry) + for (var i = 0; i < countryKeys.length; i++) { + var cc = countryKeys[i] + if (countriesWithLocality[cc]) continue + + var best = candidatesByCountry[cc] + best.isolationCandidate = false + result.push(best) + countryFills += 1 + promoted += 1 + countriesWithLocality[cc] = true + } + } + + return { places: result, promoted: promoted, countryFills: countryFills } +} + +async function writePlaces(db, places, opts, compactLookupRows) { + await dbExec(db, 'BEGIN') + + try { + if (opts.replace && opts.indexMode !== 'compact') { + await dbRun(db, 'DELETE FROM place_geohash_lookup') + await dbRun(db, 'DELETE FROM place_geohash_cover') + await dbRun(db, 'DELETE FROM place_geometry') + await dbRun(db, 'DELETE FROM places') + } + + var placeStmt = null + + var geometryStmt = null + var coverStmt = null + var compactStmt = null + + if (opts.indexMode === 'full') { + placeStmt = db.prepare(` + INSERT OR REPLACE INTO places( + id, name, country_id, admin1_id, placetype, + centroid_lat, centroid_lon, + bbox_min_lat, bbox_min_lon, bbox_max_lat, bbox_max_lon, + priority_rank, area, country_name, admin1_name + ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) 
+ `) + + geometryStmt = db.prepare(` + INSERT OR REPLACE INTO place_geometry(place_id, encoding, geometry) + VALUES (?, ?, ?) + `) + + coverStmt = db.prepare(` + INSERT OR REPLACE INTO place_geohash_cover(geohash, precision, place_id, coverage_type) + VALUES (?, ?, ?, ?) + `) + } else { + placeStmt = db.prepare(` + INSERT OR REPLACE INTO compact_places( + id, name, country_id, admin1_id, placetype_code, + latitude, longitude + ) VALUES (?, ?, ?, ?, ?, ?, ?) + `) + + compactStmt = db.prepare(` + INSERT OR REPLACE INTO compact_geohash_lookup(geohash, place_id) + VALUES (?, ?) + `) + } + + try { + for (var i = 0; i < places.length; i++) { + var place = places[i] + + if (!opts.replace) { + if (opts.indexMode === 'compact') { + await dbRun(db, 'DELETE FROM compact_geohash_lookup WHERE place_id = ?', [place.id]) + await dbRun(db, 'DELETE FROM compact_places WHERE id = ?', [place.id]) + } else { + await dbRun(db, 'DELETE FROM place_geohash_lookup WHERE place_id = ?', [place.id]) + await dbRun(db, 'DELETE FROM place_geohash_cover WHERE place_id = ?', [place.id]) + await dbRun(db, 'DELETE FROM place_geometry WHERE place_id = ?', [place.id]) + } + } + + if (opts.indexMode === 'compact') { + await stmtRun(placeStmt, [ + place.id, + place.name, + place.countryId, + place.admin1Id, + place.placetypeCode, + place.centroidLat, + place.centroidLon + ]) + } else { + await stmtRun(placeStmt, [ + place.id, + place.name, + place.countryId, + place.admin1Id, + place.placetype, + place.centroidLat, + place.centroidLon, + place.bboxMinLat, + place.bboxMinLon, + place.bboxMaxLat, + place.bboxMaxLon, + place.priorityRank, + place.area, + place.countryName, + place.admin1Name + ]) + } + + if (opts.indexMode === 'full') { + await stmtRun(geometryStmt, [ + place.id, + 'json', + JSON.stringify(place.geometry) + ]) + + for (var j = 0; j < place.cover.length; j++) { + var cell = place.cover[j] + await stmtRun(coverStmt, [ + cell.geohash, + cell.precision, + place.id, + cell.coverageType + ]) + } 
+ } + } + + if (opts.indexMode === 'compact') { + for (var rowIndex = 0; rowIndex < compactLookupRows.length; rowIndex++) { + var row = compactLookupRows[rowIndex] + await stmtRun(compactStmt, [row.geohash, row.placeId]) + } + } + } finally { + await stmtFinalize(placeStmt) + if (geometryStmt) await stmtFinalize(geometryStmt) + if (coverStmt) await stmtFinalize(coverStmt) + if (compactStmt) await stmtFinalize(compactStmt) + } + + await dbExec(db, 'COMMIT') + } catch (err) { + // Roll back on a best-effort basis: if ROLLBACK itself rejects (for example because + // SQLite already ended the transaction), report it but still rethrow the original error. + try { + await dbExec(db, 'ROLLBACK') + } catch (rollbackErr) { + console.error('ROLLBACK failed: ' + (rollbackErr.message || rollbackErr)) + } + throw err + } +} + +async function main() { + var options = parseArgs(process.argv.slice(2)) + + if (options.help) { + console.log(usage()) + process.exit(0) + } + + if (!options.database) { + throw new Error('Missing required --database argument') + } + + if (!options.input.length && !options.inputDir.length) { + throw new Error('Provide at least one --input file or --input-dir') + } + + if (!Number.isFinite(options.basePrecision) || options.basePrecision < 1) { + throw new Error('--base-precision must be a positive number') + } + + if (!Number.isFinite(options.maxPrecision) || options.maxPrecision < options.basePrecision) { + throw new Error('--max-precision must be >= --base-precision') + } + + if (options.indexMode !== 'compact' && options.indexMode !== 'full') { + throw new Error('--index-mode must be either compact or full') + } + + options.localityMaxPrecision = clampPrecision(options.localityMaxPrecision, options.basePrecision, options.maxPrecision) + options.localadminMaxPrecision = clampPrecision(options.localadminMaxPrecision, options.basePrecision, options.maxPrecision) + options.countyMaxPrecision = clampPrecision(options.countyMaxPrecision, options.basePrecision, options.maxPrecision) + options.regionMaxPrecision = clampPrecision(options.regionMaxPrecision, options.basePrecision, options.maxPrecision) + if (!Number.isFinite(options.dominantLocalityPopulation) || options.dominantLocalityPopulation <= 0) { + options.dominantLocalityPopulation = 0 + }
else { + options.dominantLocalityPopulation = Math.trunc(options.dominantLocalityPopulation) + } + if (!Number.isFinite(options.dominantLocalityRatio) || options.dominantLocalityRatio < 1) { + options.dominantLocalityRatio = 1 + } + if (!Number.isFinite(options.parentLocalityMinShare)) { + options.parentLocalityMinShare = 0.5 + } + if (options.parentLocalityMinShare < 0) { + options.parentLocalityMinShare = 0 + } else if (options.parentLocalityMinShare > 1) { + options.parentLocalityMinShare = 1 + } + if (options.regionSparseMaxPrecision !== null) { + if (!Number.isFinite(options.regionSparseMaxPrecision) || options.regionSparseMaxPrecision < 1) { + options.regionSparseMaxPrecision = null + } else { + options.regionSparseMaxPrecision = Math.trunc(options.regionSparseMaxPrecision) + if (options.regionSparseMaxPrecision > options.regionMaxPrecision) { + options.regionSparseMaxPrecision = options.regionMaxPrecision + } + } + } + + var files = collectInputFiles(options) + if (!files.length) { + throw new Error('No input files were found after filtering') + } + + var normalized = normalizePlaces(files, options) + var primaryPlaces = normalized.places + + if (!primaryPlaces.length && !normalized.candidates.length) { + throw new Error('No valid locality/localadmin/region records were found in the provided input files') + } + + var isolation = promoteIsolatedLocalities(primaryPlaces, normalized.candidates, options) + var dedupedPlaces = isolation.places + + var pruned = pruneContainedLocalities(dedupedPlaces, options.dropContainedLocalities) + var regionPrune = pruneRedundantRegions(pruned.places) + var finalPlaces = regionPrune.places + var compactLookupRows = options.indexMode === 'compact' ? 
buildCompactLookupRows(finalPlaces, options) : [] + + var databasePath = path.resolve(options.database) + var db = new sqlite3.Database(databasePath) + + try { + await ensureBoundarySchema(db, options) + await writePlaces(db, finalPlaces, options, compactLookupRows) + + var coverCount = finalPlaces.reduce(function(total, place) { + return total + place.cover.length + }, 0) + + console.log('Boundary index build complete') + console.log('Database: ' + databasePath) + console.log('Input files scanned: ' + files.length) + console.log('Features normalized: ' + normalized.normalizedCount) + console.log('Primary places (>= min-population): ' + primaryPlaces.length) + if (normalized.candidates.length) { + console.log('Isolation candidates evaluated: ' + normalized.candidates.length) + console.log('Isolated localities promoted: ' + isolation.promoted + ' (country fills: ' + isolation.countryFills + ')') + } + console.log('Places after isolation pass: ' + dedupedPlaces.length) + console.log('Places dropped (contained locality prune): ' + pruned.dropped.length) + console.log('Regions dropped (redundant with same-name locality): ' + regionPrune.dropped.length) + console.log('Places written: ' + finalPlaces.length) + if (options.indexMode === 'compact') { + console.log('Geohash lookup rows: ' + compactLookupRows.length) + } else { + console.log('Geohash cover rows: ' + coverCount) + } + var modeLabel = 'locality' + if (options.includeLocaladmin) modeLabel += ' + localadmin' + if (options.includeCounty) modeLabel += ' + county' + if (options.includeRegion) modeLabel += ' + region' + console.log('Mode: ' + modeLabel) + console.log('Precision: ' + options.basePrecision + ' -> ' + options.maxPrecision) + console.log('Placetype precision caps: locality=' + options.localityMaxPrecision + ', localadmin=' + options.localadminMaxPrecision + ', county=' + options.countyMaxPrecision + ', region=' + options.regionMaxPrecision) + if (Number.isFinite(options.regionSparseMaxPrecision) && 
Number.isFinite(options.regionSparseMinAreaKm2)) { + console.log('Sparse region rule: area_km2>=' + options.regionSparseMinAreaKm2 + ' => max_precision=' + options.regionSparseMaxPrecision) + } + if (options.dominantLocalityPopulation > 0) { + console.log('Dominant locality rollup: major_population>=' + options.dominantLocalityPopulation + ', ratio>=' + options.dominantLocalityRatio) + } else { + console.log('Dominant locality rollup: disabled') + } + console.log('Parent locality takeover min share: ' + options.parentLocalityMinShare) + console.log('Index mode: ' + options.indexMode) + console.log('Promote locality over region: ' + (options.promoteLocalityOverRegion ? 'true' : 'false')) + console.log('Min population: ' + options.minPopulation) + if (Number.isFinite(options.isolationMinPopulation) && options.isolationMinPopulation > 0) { + console.log('Isolation min population: ' + options.isolationMinPopulation) + } else { + console.log('Isolation pass: disabled') + } + console.log('Ensure country locality: ' + (options.ensureCountryLocality ? 'true' : 'false')) + } finally { + await dbClose(db) + } +} + +main().catch(function(err) { + console.error(err.message || err) + process.exit(1) +}) diff --git a/scripts/generate_geonames.sh b/scripts/generate_geonames.sh index 20f4413..ebb1666 100755 --- a/scripts/generate_geonames.sh +++ b/scripts/generate_geonames.sh @@ -1,104 +1,159 @@ #!/bin/bash +set -euo pipefail + +# Generates a geocoder SQLite database from GeoNames dump files. +# Usage: +# ./scripts/generate_geonames.sh [output_db_path] +# +# Environment variables: +# GEONAMES_DATASET cities dump name without extension (default: cities1000) +# GEONAMES_WORKDIR working dir for output and temp files (default: current dir) +# GEONAMES_DOWNLOAD set to 0 to skip downloads and use existing local files +# GEONAMES_FEATURE_CODES comma-separated GeoNames feature codes to keep +# (default: PPLA,PPLA2,PPLA3,PPLA4,PPLA5,PPLC) +# Note: PPL can include neighborhood-like entries. 
+# GEONAMES_MIN_POPULATION minimum population to keep (default: 0) +# GEONAMES_INCLUDE_ADMIN1 set to 0 to skip admin1 import entirely (default: 1) + +GEONAMES_DATASET="${GEONAMES_DATASET:-cities1000}" +GEONAMES_WORKDIR="${GEONAMES_WORKDIR:-$(pwd)}" +GEONAMES_DOWNLOAD="${GEONAMES_DOWNLOAD:-1}" +GEONAMES_FEATURE_CODES="${GEONAMES_FEATURE_CODES:-PPLA,PPLA2,PPLA3,PPLA4,PPLA5,PPLC}" +GEONAMES_MIN_POPULATION="${GEONAMES_MIN_POPULATION:-0}" +GEONAMES_INCLUDE_ADMIN1="${GEONAMES_INCLUDE_ADMIN1:-1}" +OUTPUT="${1:-db.sqlite}" + +# Resolve to absolute so the later cd into GEONAMES_WORKDIR doesn't break it +case "${OUTPUT}" in + /*) ;; + *) OUTPUT="$(pwd)/${OUTPUT}" ;; +esac + +DATA_FILE="${GEONAMES_DATASET}.txt" +ADMIN1_FILE="admin1CodesASCII.txt" +COUNTRY_FILE="countryInfo.txt" + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +SCHEMA_FILE="${SCRIPT_DIR}/schema.sql" +TMP_DIR="${GEONAMES_WORKDIR}/.geonames-build" +SOURCE_DIR="${TMP_DIR}/source" + +mkdir -p "${GEONAMES_WORKDIR}" "${TMP_DIR}" "${SOURCE_DIR}" + +download_if_missing() { + local file="$1" + local url="$2" + + if [[ -f "${SOURCE_DIR}/${file}" ]]; then + echo "Using existing ${file}" + return + fi -DATA="cities1000.txt" -ADMIN1="admin1CodesASCII.txt" -COUNTRIES="countryInfo.txt" -OUTPUT="db.sqlite" - -if [ ! -f "$DATA" ]; then - echo "Downloading cities from Geonames..." - wget "http://download.geonames.org/export/dump/cities1000.zip" - unzip "cities1000.zip" -else - echo "Using existing $DATA" -fi + if [[ "${GEONAMES_DOWNLOAD}" != "1" ]]; then + echo "Missing ${file} and GEONAMES_DOWNLOAD=${GEONAMES_DOWNLOAD}." >&2 + echo "Provide local files in ${SOURCE_DIR} or enable downloads." >&2 + exit 1 + fi -if [ ! -f "$ADMIN1" ]; then - echo "Downloading admin1 from Geonames..." - wget "http://download.geonames.org/export/dump/admin1CodesASCII.txt" -else - echo "Using existing $ADMIN1" -fi + echo "Downloading ${file}..." + curl -fsSL "${url}" -o "${SOURCE_DIR}/${file}" +} -if [ ! 
-f "$COUNTRIES" ]; then - echo "Downloading countries from Geonames..." - wget "http://download.geonames.org/export/dump/countryInfo.txt" -else - echo "Using existing $COUNTRIES" -fi +download_and_extract_dataset_if_missing() { + if [[ -f "${SOURCE_DIR}/${DATA_FILE}" ]]; then + echo "Using existing ${DATA_FILE}" + return + fi -if [ -f "$OUTPUT" ]; then - echo - echo "The file $OUTPUT already exists." - read -p "Do you want to override it? (y/N) " -n 1 -r - echo - if [[ ! $REPLY =~ ^[Yy]$ ]]; then + if [[ "${GEONAMES_DOWNLOAD}" != "1" ]]; then + echo "Missing ${DATA_FILE} and GEONAMES_DOWNLOAD=${GEONAMES_DOWNLOAD}." >&2 + echo "Provide local files in ${SOURCE_DIR} or enable downloads." >&2 exit 1 fi - rm "$OUTPUT" + local zip_file="${GEONAMES_DATASET}.zip" + echo "Downloading ${zip_file}..." + curl -fsSL "https://download.geonames.org/export/dump/${zip_file}" -o "${SOURCE_DIR}/${zip_file}" + unzip -o -q "${SOURCE_DIR}/${zip_file}" -d "${SOURCE_DIR}" +} + +download_and_extract_dataset_if_missing +download_if_missing "${COUNTRY_FILE}" "https://download.geonames.org/export/dump/${COUNTRY_FILE}" +if [[ "${GEONAMES_INCLUDE_ADMIN1}" == "1" ]]; then + download_if_missing "${ADMIN1_FILE}" "https://download.geonames.org/export/dump/${ADMIN1_FILE}" fi -echo -echo "Generating..." 
- -awk 'BEGIN { FS="\t"; OFS=";" } { gsub("\"", "", $2); gsub(";", "", $2); print $1,$2,$9,$11 }' $DATA > features.tsv -awk 'BEGIN { FS="\t"; OFS=";" } { print $1,$5,$6 }' $DATA > coordinates.tsv -awk 'BEGIN { FS="\t"; OFS=";" } { split($1, id, "."); gsub("\"", "", $2); gsub(";", "", $2); print id[1],id[2],$2 }' $ADMIN1 > admin1.tsv -grep -vE '^#' $COUNTRIES | awk 'BEGIN { FS="\t"; OFS=";" } { print $1,$5 }' > countries.tsv - -echo ' -CREATE TABLE coordinates( - feature_id INTEGER, - latitude REAL, - longitude REAL, - PRIMARY KEY (feature_id) -); - -CREATE TABLE features( - id INTEGER, - name TEXT, - country_id TEXT, - admin1_id INTEGER, - PRIMARY KEY (id) -); - -CREATE TABLE admin1( - country_id TEXT, - id INTEGER, - name TEXT, - PRIMARY KEY (country_id, id) -); - -CREATE TABLE countries( - id TEXT, - name TEXT, - PRIMARY KEY (id) -); - -CREATE VIEW everything AS - SELECT - features.id, - features.name, - admin1.id AS admin1_id, - admin1.name AS admin1_name, - countries.id AS country_id, - countries.name AS country_name, - coordinates.latitude AS latitude, - coordinates.longitude AS longitude - FROM features - LEFT JOIN countries ON features.country_id = countries.id - LEFT JOIN admin1 ON features.country_id = admin1.country_id AND features.admin1_id = admin1.id - JOIN coordinates ON features.id = coordinates.feature_id; +echo "Preparing TSV files in ${TMP_DIR}..." 
+echo "Feature codes: ${GEONAMES_FEATURE_CODES}" +echo "Minimum population: ${GEONAMES_MIN_POPULATION}" +echo "Include admin1: ${GEONAMES_INCLUDE_ADMIN1}" +rm -f "${TMP_DIR}/features.tsv" "${TMP_DIR}/coordinates.tsv" +awk -v feature_codes="${GEONAMES_FEATURE_CODES}" -v min_population="${GEONAMES_MIN_POPULATION}" -v include_admin1="${GEONAMES_INCLUDE_ADMIN1}" -v features_out="${TMP_DIR}/features.tsv" -v coordinates_out="${TMP_DIR}/coordinates.tsv" 'BEGIN { + FS="\t"; + OFS=";"; + split(feature_codes, raw_codes, ","); + for (i in raw_codes) { + code = raw_codes[i]; + gsub(/^[[:space:]]+|[[:space:]]+$/, "", code); + if (code != "") { + allowed_codes[code] = 1; + } + } +} +{ + if (!($8 in allowed_codes)) { + next; + } + + population = ($15 == "" ? 0 : $15); + if (population < min_population) { + next; + } + + gsub("\"", "", $2); + gsub(";", "", $2); + gsub("\"", "", $3); + gsub(";", "", $3); + admin1_id = (include_admin1 == "1" ? $11 : ""); + print $1,$2,$3,$9,admin1_id,population >> features_out; + print $1,$5,$6 >> coordinates_out; +}' "${SOURCE_DIR}/${DATA_FILE}" + +if [[ "${GEONAMES_INCLUDE_ADMIN1}" == "1" ]]; then + awk 'BEGIN { FS="\t"; OFS=";" } + { + split($1, id, "."); + gsub("\"", "", $2); + gsub(";", "", $2); + print id[1],id[2],$2 + }' "${SOURCE_DIR}/${ADMIN1_FILE}" > "${TMP_DIR}/admin1.tsv" +else + : > "${TMP_DIR}/admin1.tsv" +fi -.separator ";" -.import coordinates.tsv coordinates -.import features.tsv features -.import admin1.tsv admin1 -.import countries.tsv countries +grep -vE '^#' "${SOURCE_DIR}/${COUNTRY_FILE}" | awk 'BEGIN { FS="\t"; OFS=";" } +{ + gsub("\"", "", $5); + gsub(";", "", $5); + print $1,$5 +}' > "${TMP_DIR}/countries.tsv" -CREATE INDEX coordinates_lat_lng ON coordinates (latitude, longitude); -' | sqlite3 "$OUTPUT" +rm -f "${OUTPUT}" +echo "Building ${OUTPUT}..." -COUNT=`sqlite3 "$OUTPUT" "SELECT COUNT(*) FROM features;"` -echo "Created $OUTPUT with $COUNT features." 
+{ + cat "${SCHEMA_FILE}" + cat <<'SQL' +.separator ";" +.import .geonames-build/coordinates.tsv coordinates +.import .geonames-build/features.tsv features +.import .geonames-build/admin1.tsv admin1 +.import .geonames-build/countries.tsv countries +SQL +} | ( + cd "${GEONAMES_WORKDIR}" && + sqlite3 "${OUTPUT}" +) + +COUNT="$(sqlite3 "${OUTPUT}" "SELECT COUNT(*) FROM features;")" +echo "Created ${OUTPUT} with ${COUNT} features." diff --git a/scripts/generate_wof_boundary.sh b/scripts/generate_wof_boundary.sh new file mode 100755 index 0000000..8ef4a4f --- /dev/null +++ b/scripts/generate_wof_boundary.sh @@ -0,0 +1,293 @@ +#!/bin/bash +set -euo pipefail + +# Build boundary-aware lookup tables from Who's On First admin repositories. +# +# Usage: +# ./scripts/generate_wof_boundary.sh [output_db_path] +# +# Environment variables: +# WOF_COUNTRIES Comma-separated ISO2 country codes (default: FR,IT) +# WOF_WORKDIR Working directory for archives/extraction (default: ./tmp/wof-build) +# WOF_DOWNLOAD Set to 0 to skip downloads and reuse existing archives (default: 1) +# WOF_REF Git ref to download from codeload (default: master) +# WOF_REF_LOCK_FILE Optional file with per-country pinned refs: "<country> <ref>" per line (comma-, tab- or space-separated; lines starting with # are ignored) +# WOF_BASE_PRECISION Geohash base precision (default: 4) +# WOF_MAX_PRECISION Geohash max precision (default: 5) +# WOF_LOCALITY_MAX_PRECISION Locality max precision override (default: WOF_MAX_PRECISION) +# WOF_LOCALADMIN_MAX_PRECISION Localadmin max precision override (default: WOF_MAX_PRECISION) +# WOF_COUNTY_MAX_PRECISION County max precision override (default: WOF_MAX_PRECISION) +# WOF_REGION_MAX_PRECISION Region max precision override (default: 4) +# WOF_REGION_SPARSE_MAX_PRECISION Sparse large-region precision (default: 3) +# WOF_REGION_SPARSE_MIN_AREA_KM2 Area threshold for sparse region precision (default: 80000) +# WOF_PROMOTE_LOCALITY_OVER_REGION Prefer locality labels over region in shared parent cells (default: 1) +# WOF_DOMINANT_LOCALITY_POPULATION
Major-locality threshold for dominant-city rollup (default: 100000) +# WOF_DOMINANT_LOCALITY_RATIO Dominant-vs-next locality population ratio (default: 3) +# WOF_PARENT_LOCALITY_MIN_SHARE Minimum child-cell share (0..1) required for locality parent takeover (default: 0.5) +# WOF_INCLUDE_LOCALADMIN Include localadmin placetypes (default: 0) +# WOF_INCLUDE_COUNTY Include county placetypes (default: 1) +# WOF_INCLUDE_REGION Include region placetypes (default: 1) +# WOF_DROP_CONTAINED_LOCALITIES Drop localities contained in larger localities (default: 1) +# WOF_INCLUDE_ALT Include -alt- geometries (default: 0) +# WOF_GEOMETRY_DECIMALS Optional coordinate rounding precision (e.g. 4) +# WOF_MIN_POPULATION Optional minimum population filter (default: 0) +# WOF_ISOLATION_MIN_POPULATION Lower population floor for isolated localities (default: 500) +# WOF_ENSURE_COUNTRY_LOCALITY Guarantee at least one locality per country (default: 1) +# WOF_MAX_PLACES Optional cap for experiment runs +# WOF_SKIP_INVALID_REPOS Skip repos missing expected extracted data dir (default: 1) +# WOF_BATCH_SIZE Countries per node invocation to limit memory (default: 10) +# WOF_APPEND Append to existing DB instead of replacing schema (default: 0) +# +# Notes: +# - This helper always builds `--index-mode compact` (geohash -> place only). 
+ +WOF_COUNTRIES="${WOF_COUNTRIES:-FR,IT}" +WOF_WORKDIR="${WOF_WORKDIR:-$(pwd)/tmp/wof-build}" +WOF_DOWNLOAD="${WOF_DOWNLOAD:-1}" +WOF_REF="${WOF_REF:-master}" +WOF_REF_LOCK_FILE="${WOF_REF_LOCK_FILE:-}" +WOF_BASE_PRECISION="${WOF_BASE_PRECISION:-4}" +WOF_MAX_PRECISION="${WOF_MAX_PRECISION:-5}" +WOF_LOCALITY_MAX_PRECISION="${WOF_LOCALITY_MAX_PRECISION:-${WOF_MAX_PRECISION}}" +WOF_LOCALADMIN_MAX_PRECISION="${WOF_LOCALADMIN_MAX_PRECISION:-${WOF_MAX_PRECISION}}" +WOF_COUNTY_MAX_PRECISION="${WOF_COUNTY_MAX_PRECISION:-${WOF_MAX_PRECISION}}" +WOF_REGION_MAX_PRECISION="${WOF_REGION_MAX_PRECISION:-4}" +WOF_REGION_SPARSE_MAX_PRECISION="${WOF_REGION_SPARSE_MAX_PRECISION:-3}" +WOF_REGION_SPARSE_MIN_AREA_KM2="${WOF_REGION_SPARSE_MIN_AREA_KM2:-80000}" +WOF_PROMOTE_LOCALITY_OVER_REGION="${WOF_PROMOTE_LOCALITY_OVER_REGION:-1}" +WOF_DOMINANT_LOCALITY_POPULATION="${WOF_DOMINANT_LOCALITY_POPULATION:-100000}" +WOF_DOMINANT_LOCALITY_RATIO="${WOF_DOMINANT_LOCALITY_RATIO:-3}" +WOF_PARENT_LOCALITY_MIN_SHARE="${WOF_PARENT_LOCALITY_MIN_SHARE:-0.5}" +WOF_INCLUDE_LOCALADMIN="${WOF_INCLUDE_LOCALADMIN:-0}" +WOF_INCLUDE_COUNTY="${WOF_INCLUDE_COUNTY:-1}" +WOF_INCLUDE_REGION="${WOF_INCLUDE_REGION:-1}" +WOF_DROP_CONTAINED_LOCALITIES="${WOF_DROP_CONTAINED_LOCALITIES:-1}" +WOF_INCLUDE_ALT="${WOF_INCLUDE_ALT:-0}" +WOF_GEOMETRY_DECIMALS="${WOF_GEOMETRY_DECIMALS:-}" +WOF_MIN_POPULATION="${WOF_MIN_POPULATION:-0}" +WOF_ISOLATION_MIN_POPULATION="${WOF_ISOLATION_MIN_POPULATION:-500}" +WOF_ENSURE_COUNTRY_LOCALITY="${WOF_ENSURE_COUNTRY_LOCALITY:-1}" +WOF_MAX_PLACES="${WOF_MAX_PLACES:-}" +WOF_SKIP_INVALID_REPOS="${WOF_SKIP_INVALID_REPOS:-1}" +WOF_APPEND="${WOF_APPEND:-0}" +OUTPUT="${1:-db.sqlite}" + +case "${OUTPUT}" in + /*) ;; + *) OUTPUT="$(pwd)/${OUTPUT}" ;; +esac + +if [[ -n "${WOF_REF_LOCK_FILE}" ]]; then + case "${WOF_REF_LOCK_FILE}" in + /*) ;; + *) WOF_REF_LOCK_FILE="$(pwd)/${WOF_REF_LOCK_FILE}" ;; + esac + + if [[ ! 
-f "${WOF_REF_LOCK_FILE}" ]]; then + echo "WOF_REF_LOCK_FILE does not exist: ${WOF_REF_LOCK_FILE}" >&2 + exit 1 + fi +fi + +resolve_country_ref() { + local country="$1" + local fallback_ref="$2" + + if [[ -z "${WOF_REF_LOCK_FILE}" ]]; then + echo "${fallback_ref}" + return 0 + fi + + local resolved_ref + resolved_ref="$(awk -F'[,\t ]+' -v cc="${country}" ' + BEGIN { lower = tolower(cc) } + /^[[:space:]]*#/ { next } + NF < 2 { next } + { + if (tolower($1) == lower) { + print $2 + exit + } + } + ' "${WOF_REF_LOCK_FILE}")" + + if [[ -z "${resolved_ref}" ]]; then + echo "Missing pinned ref for country ${country} in ${WOF_REF_LOCK_FILE}" >&2 + exit 1 + fi + + echo "${resolved_ref}" +} + +ARCHIVE_DIR="${WOF_WORKDIR}/archives" +EXTRACT_DIR="${WOF_WORKDIR}/extracted" +mkdir -p "${ARCHIVE_DIR}" "${EXTRACT_DIR}" + +# Build the common flags array shared by every invocation. +COMMON_FLAGS=( + --index-mode "compact" + --base-precision "${WOF_BASE_PRECISION}" + --max-precision "${WOF_MAX_PRECISION}" + --locality-max-precision "${WOF_LOCALITY_MAX_PRECISION}" + --localadmin-max-precision "${WOF_LOCALADMIN_MAX_PRECISION}" + --county-max-precision "${WOF_COUNTY_MAX_PRECISION}" + --region-max-precision "${WOF_REGION_MAX_PRECISION}" + --region-sparse-max-precision "${WOF_REGION_SPARSE_MAX_PRECISION}" + --region-sparse-min-area-km2 "${WOF_REGION_SPARSE_MIN_AREA_KM2}" + --promote-locality-over-region "${WOF_PROMOTE_LOCALITY_OVER_REGION}" + --dominant-locality-population "${WOF_DOMINANT_LOCALITY_POPULATION}" + --dominant-locality-ratio "${WOF_DOMINANT_LOCALITY_RATIO}" + --parent-locality-min-share "${WOF_PARENT_LOCALITY_MIN_SHARE}" + --include-localadmin "${WOF_INCLUDE_LOCALADMIN}" + --include-county "${WOF_INCLUDE_COUNTY}" + --include-region "${WOF_INCLUDE_REGION}" + --drop-contained-localities "${WOF_DROP_CONTAINED_LOCALITIES}" + --include-alt "${WOF_INCLUDE_ALT}" + --min-population "${WOF_MIN_POPULATION}" + --isolation-min-population "${WOF_ISOLATION_MIN_POPULATION}" + 
--ensure-country-locality "${WOF_ENSURE_COUNTRY_LOCALITY}" +) + +if [[ -n "${WOF_MAX_PLACES}" ]]; then + COMMON_FLAGS+=(--max-places "${WOF_MAX_PLACES}") +fi + +if [[ -n "${WOF_GEOMETRY_DECIMALS}" ]]; then + COMMON_FLAGS+=(--geometry-decimals "${WOF_GEOMETRY_DECIMALS}") +fi + +WOF_BATCH_SIZE="${WOF_BATCH_SIZE:-10}" + +# Phase 1: Download all archives (small on disk, skip extraction). +# Collect country codes and their archive paths for batched processing. +COUNTRY_CODES=() +COUNTRY_ARCHIVES=() + +IFS=',' read -r -a COUNTRY_ITEMS <<< "${WOF_COUNTRIES}" +for item in "${COUNTRY_ITEMS[@]}"; do + country="$(echo "${item}" | tr '[:upper:]' '[:lower:]' | xargs)" + if [[ -z "${country}" ]]; then + continue + fi + + repo="whosonfirst-data-admin-${country}" + country_ref="$(resolve_country_ref "${country}" "${WOF_REF}")" + archive="${ARCHIVE_DIR}/${repo}-${country_ref}.tar.gz" + + if [[ ! -f "${archive}" ]]; then + if [[ "${WOF_DOWNLOAD}" != "1" ]]; then + echo "Missing ${archive} and WOF_DOWNLOAD=${WOF_DOWNLOAD}." >&2 + echo "Provide the archive locally or set WOF_DOWNLOAD=1." >&2 + exit 1 + fi + + url="https://codeload.github.com/whosonfirst-data/${repo}/tar.gz/${country_ref}" + echo "Downloading ${repo}@${country_ref}..." + curl --fail --silent --show-error --location \ + --retry 5 --retry-delay 2 --retry-connrefused \ + "${url}" -o "${archive}" + else + echo "Using existing archive ${archive}" + fi + + COUNTRY_CODES+=("${country}") + COUNTRY_ARCHIVES+=("${archive}") +done + +if [[ ${#COUNTRY_CODES[@]} -eq 0 ]]; then + echo "No countries resolved from WOF_COUNTRIES=${WOF_COUNTRIES}" >&2 + exit 1 +fi + +# Phase 2: Extract, process, and clean up in batches to limit disk usage. +# Each batch extracts its countries, runs the node script, then removes +# the extracted data before the next batch starts. + +# Helper: extract a single country archive, print its data dir path. +# Returns 1 if the country should be skipped. 
+extract_country() { + local country="$1" + local archive="$2" + + local country_extract="${EXTRACT_DIR}/${country}" + rm -rf "${country_extract}" + mkdir -p "${country_extract}" + if ! tar -xzf "${archive}" -C "${country_extract}"; then + if [[ "${WOF_SKIP_INVALID_REPOS}" == "1" ]]; then + echo "Warning: failed to extract ${archive}; skipping" >&2 + return 1 + fi + echo "Failed to extract ${archive}" >&2 + exit 1 + fi + + local root_dir + root_dir="$(find "${country_extract}" -mindepth 1 -maxdepth 1 -type d | head -n 1)" + if [[ -z "${root_dir}" ]]; then + if [[ "${WOF_SKIP_INVALID_REPOS}" == "1" ]]; then + echo "Warning: no extracted root directory for ${country}; skipping" >&2 + return 1 + fi + echo "Failed to find extracted root directory for ${country}" >&2 + exit 1 + fi + + local data_dir="${root_dir}/data" + if [[ ! -d "${data_dir}" ]]; then + if [[ "${WOF_SKIP_INVALID_REPOS}" == "1" ]]; then + echo "Warning: expected data directory not found for ${country}; skipping (${data_dir})" >&2 + return 1 + fi + echo "Expected data directory not found: ${data_dir}" >&2 + exit 1 + fi + + echo "${data_dir}" +} + +TOTAL="${#COUNTRY_CODES[@]}" +IS_FIRST=1 +BATCH_IDX=0 + +while [[ "${BATCH_IDX}" -lt "${TOTAL}" ]]; do + BATCH_END=$(( BATCH_IDX + WOF_BATCH_SIZE )) + if [[ "${BATCH_END}" -gt "${TOTAL}" ]]; then + BATCH_END="${TOTAL}" + fi + + BATCH_COUNTRIES=("${COUNTRY_CODES[@]:${BATCH_IDX}:${WOF_BATCH_SIZE}}") + BATCH_ARCHIVES=("${COUNTRY_ARCHIVES[@]:${BATCH_IDX}:${WOF_BATCH_SIZE}}") + + echo "--- Batch $(( BATCH_IDX / WOF_BATCH_SIZE + 1 )): ${#BATCH_COUNTRIES[@]} countries (${BATCH_COUNTRIES[*]}) ---" + + # Extract this batch's countries. 
+ INPUT_ARGS=() + EXTRACTED_DIRS=() + for (( i=0; i < ${#BATCH_COUNTRIES[@]}; i++ )); do + data_dir="$(extract_country "${BATCH_COUNTRIES[$i]}" "${BATCH_ARCHIVES[$i]}")" || continue + INPUT_ARGS+=(--input-dir "${data_dir}") + EXTRACTED_DIRS+=("${EXTRACT_DIR}/${BATCH_COUNTRIES[$i]}") + done + + if [[ ${#INPUT_ARGS[@]} -gt 0 ]]; then + CMD=( + node "$(pwd)/scripts/generate_boundary_index.js" + --database "${OUTPUT}" + "${COMMON_FLAGS[@]}" + ) + if [[ "${IS_FIRST}" == "1" ]] && [[ "${WOF_APPEND}" != "1" ]]; then + IS_FIRST=0 + else + CMD+=(--append) + fi + CMD+=("${INPUT_ARGS[@]}") + "${CMD[@]}" + fi + + # Clean up extracted data for this batch to free disk space. + for dir in "${EXTRACTED_DIRS[@]}"; do + rm -rf "${dir}" + done + + BATCH_IDX="${BATCH_END}" +done diff --git a/scripts/schema.sql b/scripts/schema.sql new file mode 100644 index 0000000..da30fed --- /dev/null +++ b/scripts/schema.sql @@ -0,0 +1,94 @@ +CREATE TABLE coordinates( + feature_id INTEGER PRIMARY KEY, + latitude REAL NOT NULL, + longitude REAL NOT NULL +); + +CREATE TABLE features( + id INTEGER PRIMARY KEY, + name TEXT NOT NULL, + asciiname TEXT, + country_id TEXT NOT NULL, + admin1_id INTEGER, + population INTEGER NOT NULL DEFAULT 0 +); + +CREATE TABLE admin1( + country_id TEXT NOT NULL, + id INTEGER NOT NULL, + name TEXT NOT NULL, + PRIMARY KEY (country_id, id) +); + +CREATE TABLE countries( + id TEXT PRIMARY KEY, + name TEXT NOT NULL +); + +CREATE VIEW everything AS + SELECT + features.id AS id, + features.name AS name, + features.asciiname AS asciiname, + features.population AS population, + admin1.id AS admin1_id, + admin1.name AS admin1_name, + countries.id AS country_id, + countries.name AS country_name, + coordinates.latitude AS latitude, + coordinates.longitude AS longitude + FROM features + LEFT JOIN countries ON features.country_id = countries.id + LEFT JOIN admin1 ON features.country_id = admin1.country_id AND features.admin1_id = admin1.id + JOIN coordinates ON features.id = 
coordinates.feature_id; + +CREATE INDEX coordinates_lat_lng ON coordinates (latitude, longitude); +CREATE INDEX features_name_nocase ON features (name COLLATE NOCASE); +CREATE INDEX features_asciiname_nocase ON features (asciiname COLLATE NOCASE); +CREATE INDEX features_population_desc ON features (population DESC); + +CREATE TABLE places( + id INTEGER PRIMARY KEY, + name TEXT NOT NULL, + country_id TEXT NOT NULL, + admin1_id INTEGER, + placetype TEXT NOT NULL, + centroid_lat REAL NOT NULL, + centroid_lon REAL NOT NULL, + bbox_min_lat REAL NOT NULL, + bbox_min_lon REAL NOT NULL, + bbox_max_lat REAL NOT NULL, + bbox_max_lon REAL NOT NULL, + priority_rank INTEGER NOT NULL DEFAULT 0, + area REAL NOT NULL DEFAULT 0, + country_name TEXT, + admin1_name TEXT +); + +CREATE TABLE place_geohash_cover( + geohash TEXT NOT NULL, + precision INTEGER NOT NULL, + place_id INTEGER NOT NULL, + coverage_type TEXT NOT NULL CHECK (coverage_type IN ('full', 'partial')), + PRIMARY KEY (geohash, precision, place_id), + FOREIGN KEY (place_id) REFERENCES places(id) +); + +CREATE TABLE place_geometry( + place_id INTEGER PRIMARY KEY, + encoding TEXT NOT NULL DEFAULT 'json', + geometry BLOB NOT NULL, + FOREIGN KEY (place_id) REFERENCES places(id) +); + +CREATE TABLE place_geohash_lookup( + geohash TEXT PRIMARY KEY, + place_id INTEGER NOT NULL, + FOREIGN KEY (place_id) REFERENCES places(id) +); + +CREATE INDEX place_geohash_cover_hash_precision ON place_geohash_cover (geohash, precision); +CREATE INDEX place_geohash_cover_place_id ON place_geohash_cover (place_id); +CREATE INDEX places_placetype ON places (placetype); +CREATE INDEX place_geometry_place_id ON place_geometry (place_id); +CREATE INDEX place_geohash_lookup_place_id ON place_geohash_lookup (place_id); diff --git a/scripts/validate_with_locationiq.js b/scripts/validate_with_locationiq.js new file mode 100644 index 0000000..414fbdf --- /dev/null +++ b/scripts/validate_with_locationiq.js @@ -0,0 +1,897 @@ +#!/usr/bin/env node +"use 
strict"; + +const fs = require('fs') +const path = require('path') +const https = require('https') +const sqlite3 = require('sqlite3') + +const createGeocoder = require('../src/index') +const geohash = require('../src/geohash') + +function usage() { + return [ + 'Usage: node scripts/validate_with_locationiq.js --database [options]', + '', + 'Options:', + ' --database Geocoder SQLite database to validate (required)', + ' --api-key LocationIQ API key (or env LOCATIONIQ_API_KEY)', + ' --samples Number of sample points to evaluate (default: 200)', + ' --seed RNG seed for repeatable sample generation (default: 1337)', + ' --rps Max LocationIQ requests per second when uncached (default: 1)', + ' --force-refresh Ignore cached LocationIQ responses (default: false)', + ' --reverse-mode centroid|boundary (default: boundary)', + ' --base-precision Boundary lookup base precision (default: 4)', + ' --max-precision Boundary lookup max precision (default: 7)', + ' --endpoint LocationIQ reverse endpoint (default: https://us1.locationiq.com/v1/reverse)', + ' --export-csv Optional CSV export of the evaluated sample rows', + ' --help, -h Show this help message', + '', + 'Example:', + ' LOCATIONIQ_API_KEY=... 
node scripts/validate_with_locationiq.js \\', + ' --database tmp/wof-fr-it-compact-p5-d3-pop10k-region.sqlite \\', + ' --samples 300 \\', + ' --export-csv tmp/locationiq-validation-fr-it.csv' + ].join('\n') +} + +function parseBool(value, defaultValue) { + if (value === undefined || value === null || value === '') { + return defaultValue + } + + var normalized = String(value).toLowerCase().trim() + if (normalized === '1' || normalized === 'true' || normalized === 'yes' || normalized === 'y') { + return true + } + if (normalized === '0' || normalized === 'false' || normalized === 'no' || normalized === 'n') { + return false + } + return defaultValue +} + +function parseArgs(argv) { + var opts = { + database: null, + apiKey: process.env.LOCATIONIQ_API_KEY || '', + cacheDb: null, + samples: 200, + seed: 1337, + rps: 1, + forceRefresh: false, + reverseMode: 'boundary', + basePrecision: 4, + maxPrecision: 7, + endpoint: 'https://us1.locationiq.com/v1/reverse', + exportCsv: null + } + + for (var i = 0; i < argv.length; i++) { + var arg = argv[i] + + if (arg === '--database' || arg === '-d') { + opts.database = path.resolve(argv[++i]) + } else if (arg === '--api-key') { + opts.apiKey = String(argv[++i] || '') + } else if (arg === '--samples') { + opts.samples = Math.max(1, Math.trunc(Number(argv[++i]))) + } else if (arg === '--seed') { + opts.seed = Math.trunc(Number(argv[++i])) + } else if (arg === '--rps') { + opts.rps = Math.max(0.2, Number(argv[++i])) + } else if (arg === '--force-refresh') { + opts.forceRefresh = parseBool(argv[++i], false) + } else if (arg === '--reverse-mode') { + opts.reverseMode = String(argv[++i] || 'boundary').toLowerCase() + } else if (arg === '--base-precision') { + opts.basePrecision = Math.max(1, Math.trunc(Number(argv[++i]))) + } else if (arg === '--max-precision') { + opts.maxPrecision = Math.max(opts.basePrecision, Math.trunc(Number(argv[++i]))) + } else if (arg === '--endpoint') { + opts.endpoint = String(argv[++i] || opts.endpoint) + } 
else if (arg === '--export-csv') { + opts.exportCsv = path.resolve(argv[++i]) + } else if (arg === '--help' || arg === '-h') { + opts.help = true + } else { + throw new Error('Unknown argument: ' + arg) + } + } + + return opts +} + +function defaultCachePath(databasePath) { + var base = path.basename(databasePath) + if (base.toLowerCase().endsWith('.sqlite')) { + base = base.slice(0, -7) + } + base = base.replace(/[^a-z0-9._-]+/ig, '-').replace(/-+/g, '-').replace(/^-|-$/g, '') + if (!base) base = 'geocoder' + return path.resolve('tmp/locationiq-validation-' + base + '.sqlite') +} + +function sleep(ms) { + return new Promise(function(resolve) { + setTimeout(resolve, ms) + }) +} + +function mulberry32(seed) { + var state = seed >>> 0 + return function() { + state |= 0 + state = (state + 0x6D2B79F5) | 0 + var t = Math.imul(state ^ (state >>> 15), 1 | state) + t = (t + Math.imul(t ^ (t >>> 7), 61 | t)) ^ t + return ((t ^ (t >>> 14)) >>> 0) / 4294967296 + } +} + +function dbOpen(dbPath) { + return new sqlite3.Database(dbPath) +} + +function dbExec(db, sql) { + return new Promise(function(resolve, reject) { + db.exec(sql, function(err) { + if (err) reject(err) + else resolve() + }) + }) +} + +function dbRun(db, sql, params) { + return new Promise(function(resolve, reject) { + db.run(sql, params || [], function(err) { + if (err) reject(err) + else resolve(this) + }) + }) +} + +function dbGet(db, sql, params) { + return new Promise(function(resolve, reject) { + db.get(sql, params || [], function(err, row) { + if (err) reject(err) + else resolve(row) + }) + }) +} + +function dbAll(db, sql, params) { + return new Promise(function(resolve, reject) { + db.all(sql, params || [], function(err, rows) { + if (err) reject(err) + else resolve(rows || []) + }) + }) +} + +function dbClose(db) { + return new Promise(function(resolve, reject) { + db.close(function(err) { + if (err) reject(err) + else resolve() + }) + }) +} + +async function ensureCacheSchema(cacheDb) { + await 
dbExec(cacheDb, ` + CREATE TABLE IF NOT EXISTS sample_points( + coord_key TEXT PRIMARY KEY, + latitude REAL NOT NULL, + longitude REAL NOT NULL, + source_geohash TEXT NOT NULL, + created_at TEXT NOT NULL DEFAULT (datetime('now')) + ); + + CREATE TABLE IF NOT EXISTS locationiq_cache( + coord_key TEXT PRIMARY KEY, + latitude REAL NOT NULL, + longitude REAL NOT NULL, + http_status INTEGER, + response_json TEXT, + error_text TEXT, + fetched_at TEXT NOT NULL DEFAULT (datetime('now')) + ); + + CREATE TABLE IF NOT EXISTS validation_results( + coord_key TEXT PRIMARY KEY, + latitude REAL NOT NULL, + longitude REAL NOT NULL, + source_geohash TEXT NOT NULL, + local_name TEXT, + local_placetype TEXT, + local_country_id TEXT, + local_admin1_id TEXT, + local_json TEXT, + liq_locality TEXT, + liq_country_code TEXT, + liq_display_name TEXT, + liq_json TEXT, + locality_match INTEGER NOT NULL DEFAULT 0, + country_match INTEGER NOT NULL DEFAULT 0, + policy_match INTEGER NOT NULL DEFAULT 0, + policy_reason TEXT, + policy_verdict TEXT NOT NULL DEFAULT 'policy_unset', + verdict TEXT NOT NULL, + updated_at TEXT NOT NULL DEFAULT (datetime('now')) + ); + `) +} + +async function ensureValidationColumns(cacheDb) { + var columns = await dbAll(cacheDb, "PRAGMA table_info(validation_results)") + var byName = Object.create(null) + for (var i = 0; i < columns.length; i++) { + byName[String(columns[i].name)] = true + } + + var additions = [ + ['local_placetype', 'ALTER TABLE validation_results ADD COLUMN local_placetype TEXT'], + ['policy_match', 'ALTER TABLE validation_results ADD COLUMN policy_match INTEGER NOT NULL DEFAULT 0'], + ['policy_reason', 'ALTER TABLE validation_results ADD COLUMN policy_reason TEXT'], + ['policy_verdict', "ALTER TABLE validation_results ADD COLUMN policy_verdict TEXT NOT NULL DEFAULT 'policy_unset'"] + ] + + for (var j = 0; j < additions.length; j++) { + var name = additions[j][0] + var sql = additions[j][1] + if (!byName[name]) { + await dbExec(cacheDb, sql) + } + } 
+} + +function hashString32(value) { + var hash = 2166136261 + var text = String(value || '') + for (var i = 0; i < text.length; i++) { + hash ^= text.charCodeAt(i) + hash = Math.imul(hash, 16777619) + } + return hash >>> 0 +} + +function deterministicPointInHash(hash, seed, index) { + var bbox = geohash.decodeBbox(hash) + var localSeed = (hashString32(hash) ^ hashString32(seed) ^ (index >>> 0)) >>> 0 + var rng = mulberry32(localSeed) + return { + latitude: bbox.minLat + (bbox.maxLat - bbox.minLat) * rng(), + longitude: bbox.minLon + (bbox.maxLon - bbox.minLon) * rng() + } +} + +function deterministicShuffle(items, seed) { + var rng = mulberry32(seed) + var list = items.slice() + for (var i = list.length - 1; i > 0; i--) { + var j = Math.floor(rng() * (i + 1)) + var tmp = list[i] + list[i] = list[j] + list[j] = tmp + } + return list +} + +async function detectLookupTable(sourceDb) { + var rows = await dbAll( + sourceDb, + "SELECT name FROM sqlite_master WHERE type='table' AND name IN ('compact_geohash_lookup','place_geohash_lookup')" + ) + var names = Object.create(null) + for (var i = 0; i < rows.length; i++) { + names[rows[i].name] = true + } + if (names.compact_geohash_lookup) return 'compact_geohash_lookup' + if (names.place_geohash_lookup) return 'place_geohash_lookup' + throw new Error('No geohash lookup table found (expected compact_geohash_lookup or place_geohash_lookup)') +} + +async function detectPlacetypeSource(sourceDb) { + var rows = await dbAll( + sourceDb, + "SELECT name FROM sqlite_master WHERE type='table' AND name IN ('compact_places','places')" + ) + var names = Object.create(null) + for (var i = 0; i < rows.length; i++) { + names[rows[i].name] = true + } + if (names.compact_places) return 'compact_places' + if (names.places) return 'places' + return null +} + +async function resolveLocalPlacetype(sourceDb, source, placeId, cache) { + if (placeId === undefined || placeId === null || placeId === '') { + return '' + } + if (!source) { + return '' 
+ } + + var key = String(placeId) + if (cache[key]) { + return cache[key] + } + + var row + if (source === 'compact_places') { + row = await dbGet( + sourceDb, + `SELECT CASE placetype_code + WHEN 0 THEN 'locality' + WHEN 1 THEN 'localadmin' + WHEN 2 THEN 'region' + ELSE '' + END AS placetype + FROM compact_places + WHERE id = ? + LIMIT 1`, + [placeId] + ) + } else { + row = await dbGet( + sourceDb, + 'SELECT placetype FROM places WHERE id = ? LIMIT 1', + [placeId] + ) + } + + var value = row && row.placetype ? String(row.placetype) : '' + cache[key] = value + return value +} + +function coordKey(latitude, longitude) { + return Number(latitude).toFixed(6) + ',' + Number(longitude).toFixed(6) +} + +async function ensureSamplePoints(sourceDb, cacheDb, lookupTable, targetCount, seed) { + var countRow = await dbGet(cacheDb, 'SELECT COUNT(*) AS count FROM sample_points') + var current = countRow ? Number(countRow.count || 0) : 0 + if (current >= targetCount) { + return + } + + var geohashRows = await dbAll( + sourceDb, + 'SELECT geohash FROM ' + lookupTable + ' WHERE geohash IS NOT NULL ORDER BY geohash ASC' + ) + if (!geohashRows.length) { + throw new Error('Unable to sample geohashes from ' + lookupTable) + } + + var geohashes = [] + for (var idx = 0; idx < geohashRows.length; idx++) { + if (geohashRows[idx].geohash) { + geohashes.push(geohashRows[idx].geohash) + } + } + geohashes = deterministicShuffle(geohashes, seed) + + var needed = targetCount - current + var insertedTotal = 0 + for (var i = 0; i < geohashes.length && insertedTotal < needed; i++) { + var hash = geohashes[i] + var point = deterministicPointInHash(hash, seed, i) + var key = coordKey(point.latitude, point.longitude) + var result = await dbRun( + cacheDb, + 'INSERT OR IGNORE INTO sample_points(coord_key, latitude, longitude, source_geohash) VALUES (?, ?, ?, ?)', + [key, point.latitude, point.longitude, hash] + ) + if (result && result.changes > 0) { + insertedTotal += 1 + } + } + + if (current + 
insertedTotal < targetCount) { + throw new Error( + 'Could not create enough unique sample points for requested --samples=' + targetCount + + ' (available=' + (current + insertedTotal) + ')' + ) + } +} + +function normalizeName(value) { + if (!value) return '' + return String(value) + .normalize('NFKD') + .replace(/[\u0300-\u036f]/g, '') + .toLowerCase() + .replace(/[^a-z0-9]+/g, ' ') + .trim() + .replace(/\s+/g, ' ') +} + +function namesMatch(left, right) { + if (!left || !right) return false + if (left === right) return true + if (left.indexOf(right) !== -1 || right.indexOf(left) !== -1) return true + return false +} + +function extractLocationIqLocality(address) { + if (!address || typeof address !== 'object') return '' + var keys = [ + 'city', + 'town', + 'village', + 'municipality', + 'borough', + 'suburb', + 'county', + 'state_district', + 'state' + ] + for (var i = 0; i < keys.length; i++) { + var value = address[keys[i]] + if (value) return String(value) + } + return '' +} + +function matchAddressValue(normalizedLocalName, address, keys) { + if (!normalizedLocalName || !address || typeof address !== 'object') { + return null + } + + for (var i = 0; i < keys.length; i++) { + var key = keys[i] + var value = address[key] + if (!value) continue + if (namesMatch(normalizedLocalName, normalizeName(value))) { + return { key: key, value: String(value) } + } + } + + return null +} + +function displayNameMatch(normalizedLocalName, displayName) { + if (!normalizedLocalName || !displayName) { + return false + } + + var segments = String(displayName).split(',') + for (var i = 0; i < segments.length; i++) { + if (namesMatch(normalizedLocalName, normalizeName(segments[i]))) { + return true + } + } + + return false +} + +function buildPolicyVerdict(params) { + var countryMatch = Boolean(params.countryMatch) + var strictLocalityMatch = Boolean(params.strictLocalityMatch) + var localPlacetype = String(params.localPlacetype || '') + var localName = String(params.localName || 
'') + var normalizedLocalName = normalizeName(localName) + var liqAddress = params.liqAddress || {} + var liqDisplayName = String(params.liqDisplayName || '') + + if (!countryMatch) { + return { + match: false, + reason: 'country_mismatch', + verdict: 'policy_country_mismatch' + } + } + + if (!normalizedLocalName) { + return { + match: false, + reason: 'missing_local_name', + verdict: 'policy_missing_local_name' + } + } + + if (strictLocalityMatch) { + return { + match: true, + reason: 'strict_locality', + verdict: 'policy_match_strict' + } + } + + var majorKeys = ['city', 'town', 'municipality', 'county', 'state_district', 'state', 'region', 'province'] + var majorMatch = matchAddressValue(normalizedLocalName, liqAddress, majorKeys) + if (majorMatch) { + return { + match: true, + reason: 'major_' + majorMatch.key, + verdict: localPlacetype === 'region' ? 'policy_match_region_rollup' : 'policy_match_major_admin' + } + } + + var minorKeys = ['village', 'borough', 'suburb', 'hamlet', 'quarter', 'neighbourhood', 'city_district', 'district'] + var minorMatch = matchAddressValue(normalizedLocalName, liqAddress, minorKeys) + if (minorMatch) { + return { + match: true, + reason: 'minor_' + minorMatch.key, + verdict: 'policy_match_minor_admin' + } + } + + if (displayNameMatch(normalizedLocalName, liqDisplayName)) { + return { + match: true, + reason: 'display_name_segment', + verdict: localPlacetype === 'region' ? 
'policy_match_region_rollup' : 'policy_match_display_name' + } + } + + if (localPlacetype === 'region') { + return { + match: false, + reason: 'region_name_not_present', + verdict: 'policy_region_mismatch' + } + } + + return { + match: false, + reason: 'no_policy_match', + verdict: 'policy_mismatch' + } +} + +function buildVerdict(localityMatch, countryMatch, localName, liqLocality) { + if (localityMatch && countryMatch) return 'match_city_country' + if (localityMatch) return 'match_city_only' + if (countryMatch) return 'match_country_only' + if (!localName && !liqLocality) return 'missing_both_locality' + if (!localName) return 'missing_local_locality' + if (!liqLocality) return 'missing_locationiq_locality' + return 'mismatch' +} + +function fetchJson(endpointUrl, timeoutMs) { + return new Promise(function(resolve, reject) { + var req = https.get(endpointUrl, function(response) { + var chunks = [] + response.on('data', function(chunk) { chunks.push(chunk) }) + response.on('end', function() { + var body = Buffer.concat(chunks).toString('utf8') + try { + var parsed = JSON.parse(body) + resolve({ status: response.statusCode || 0, json: parsed, raw: body }) + } catch (err) { + reject(new Error('Invalid JSON response (' + (response.statusCode || 0) + '): ' + body.slice(0, 200))) + } + }) + }) + + req.on('error', reject) + req.setTimeout(timeoutMs, function() { + req.destroy(new Error('Request timed out after ' + timeoutMs + 'ms')) + }) + }) +} + +function buildLocationIqUrl(endpoint, apiKey, latitude, longitude) { + var url = new URL(endpoint) + url.searchParams.set('key', apiKey) + url.searchParams.set('lat', String(latitude)) + url.searchParams.set('lon', String(longitude)) + url.searchParams.set('format', 'json') + url.searchParams.set('normalizecity', '1') + url.searchParams.set('addressdetails', '1') + return url.toString() +} + +async function getLocationIqResponse(cacheDb, opts, latitude, longitude) { + var key = coordKey(latitude, longitude) + if 
(!opts.forceRefresh) { + var cached = await dbGet(cacheDb, 'SELECT * FROM locationiq_cache WHERE coord_key = ?', [key]) + if (cached && cached.response_json) { + return { + status: Number(cached.http_status || 0), + json: JSON.parse(cached.response_json), + fromCache: true + } + } + } + + var url = buildLocationIqUrl(opts.endpoint, opts.apiKey, latitude, longitude) + var fetchedAt = new Date().toISOString() + try { + var response = await fetchJson(url, 20000) + await dbRun( + cacheDb, + `INSERT INTO locationiq_cache(coord_key, latitude, longitude, http_status, response_json, error_text, fetched_at) + VALUES (?, ?, ?, ?, ?, NULL, ?) + ON CONFLICT(coord_key) DO UPDATE SET + latitude=excluded.latitude, + longitude=excluded.longitude, + http_status=excluded.http_status, + response_json=excluded.response_json, + error_text=NULL, + fetched_at=excluded.fetched_at`, + [key, latitude, longitude, response.status, JSON.stringify(response.json), fetchedAt] + ) + return { + status: response.status, + json: response.json, + fromCache: false + } + } catch (err) { + await dbRun( + cacheDb, + `INSERT INTO locationiq_cache(coord_key, latitude, longitude, http_status, response_json, error_text, fetched_at) + VALUES (?, ?, ?, NULL, NULL, ?, ?) + ON CONFLICT(coord_key) DO UPDATE SET + latitude=excluded.latitude, + longitude=excluded.longitude, + http_status=NULL, + response_json=NULL, + error_text=excluded.error_text, + fetched_at=excluded.fetched_at`, + [key, latitude, longitude, String(err && err.message ? 
err.message : err), fetchedAt] + ) + throw err + } +} + +function csvEscape(value) { + if (value === null || value === undefined) return '' + var text = String(value) + if (text.indexOf('"') !== -1 || text.indexOf(',') !== -1 || text.indexOf('\n') !== -1) { + return '"' + text.replace(/"/g, '""') + '"' + } + return text +} + +async function writeCsv(cacheDb, csvPath, limit) { + var rows = await dbAll( + cacheDb, + `SELECT coord_key, latitude, longitude, source_geohash, local_name, local_placetype, local_country_id, liq_locality, liq_country_code, verdict, policy_verdict, policy_reason + FROM validation_results + ORDER BY updated_at DESC + LIMIT ?`, + [limit] + ) + + var headers = [ + 'coord_key', + 'latitude', + 'longitude', + 'source_geohash', + 'local_name', + 'local_placetype', + 'local_country_id', + 'liq_locality', + 'liq_country_code', + 'verdict', + 'policy_verdict', + 'policy_reason' + ] + + var lines = [headers.join(',')] + for (var i = 0; i < rows.length; i++) { + var row = rows[i] + lines.push(headers.map(function(key) { return csvEscape(row[key]) }).join(',')) + } + + fs.mkdirSync(path.dirname(csvPath), { recursive: true }) + fs.writeFileSync(csvPath, lines.join('\n') + '\n', 'utf8') +} + +async function main() { + var opts = parseArgs(process.argv.slice(2)) + if (opts.help) { + console.log(usage()) + process.exit(0) + } + + if (!opts.database) { + throw new Error('Missing required --database') + } + if (!fs.existsSync(opts.database)) { + throw new Error('Database not found: ' + opts.database) + } + if (!opts.apiKey) { + throw new Error('Missing LocationIQ API key (--api-key or LOCATIONIQ_API_KEY)') + } + if (!Number.isFinite(opts.samples) || opts.samples <= 0) { + throw new Error('--samples must be > 0') + } + opts.cacheDb = defaultCachePath(opts.database) + + fs.mkdirSync(path.dirname(opts.cacheDb), { recursive: true }) + + var sourceDb = dbOpen(opts.database) + var cacheDb = dbOpen(opts.cacheDb) + var geocoder = createGeocoder({ + database: 
opts.database, + reverseMode: opts.reverseMode === 'centroid' ? 'centroid' : 'boundary', + boundary: { + basePrecision: opts.basePrecision, + maxPrecision: opts.maxPrecision + } + }) + + try { + await ensureCacheSchema(cacheDb) + await ensureValidationColumns(cacheDb) + var lookupTable = await detectLookupTable(sourceDb) + var placetypeSource = await detectPlacetypeSource(sourceDb) + var placetypeCache = Object.create(null) + await ensureSamplePoints(sourceDb, cacheDb, lookupTable, opts.samples, Number.isFinite(opts.seed) ? opts.seed : 1337) + + var points = await dbAll( + cacheDb, + 'SELECT coord_key, latitude, longitude, source_geohash FROM sample_points ORDER BY created_at ASC, coord_key ASC LIMIT ?', + [opts.samples] + ) + + var uncachedCalls = 0 + var delayMs = Math.ceil(1000 / opts.rps) + + for (var i = 0; i < points.length; i++) { + var point = points[i] + + var localResult = await geocoder.reverse(point.latitude, point.longitude) + if (!localResult) localResult = {} + + var liqResult = await getLocationIqResponse(cacheDb, opts, point.latitude, point.longitude) + if (!liqResult.fromCache) { + uncachedCalls += 1 + } + + var liqAddress = (liqResult.json && liqResult.json.address) || {} + var liqLocality = extractLocationIqLocality(liqAddress) + var liqCountryCode = liqAddress.country_code ? String(liqAddress.country_code).toUpperCase() : '' + var liqDisplayName = liqResult.json && liqResult.json.display_name ? String(liqResult.json.display_name) : '' + + var localName = localResult.name || '' + var localCountryId = (localResult.country && localResult.country.id) ? 
String(localResult.country.id).toUpperCase() : '' + var localPlacetype = await resolveLocalPlacetype(sourceDb, placetypeSource, localResult.id, placetypeCache) + var localityMatch = namesMatch(normalizeName(localName), normalizeName(liqLocality)) + var countryMatch = Boolean(localCountryId && liqCountryCode && localCountryId === liqCountryCode) + var verdict = buildVerdict(localityMatch, countryMatch, localName, liqLocality) + var policyVerdict = buildPolicyVerdict({ + countryMatch: countryMatch, + strictLocalityMatch: localityMatch, + localPlacetype: localPlacetype, + localName: localName, + liqAddress: liqAddress, + liqDisplayName: liqDisplayName + }) + + await dbRun( + cacheDb, + `INSERT INTO validation_results( + coord_key, latitude, longitude, source_geohash, + local_name, local_placetype, local_country_id, local_admin1_id, local_json, + liq_locality, liq_country_code, liq_display_name, liq_json, + locality_match, country_match, policy_match, policy_reason, policy_verdict, verdict, updated_at + ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, datetime('now')) + ON CONFLICT(coord_key) DO UPDATE SET + latitude=excluded.latitude, + longitude=excluded.longitude, + source_geohash=excluded.source_geohash, + local_name=excluded.local_name, + local_placetype=excluded.local_placetype, + local_country_id=excluded.local_country_id, + local_admin1_id=excluded.local_admin1_id, + local_json=excluded.local_json, + liq_locality=excluded.liq_locality, + liq_country_code=excluded.liq_country_code, + liq_display_name=excluded.liq_display_name, + liq_json=excluded.liq_json, + locality_match=excluded.locality_match, + country_match=excluded.country_match, + policy_match=excluded.policy_match, + policy_reason=excluded.policy_reason, + policy_verdict=excluded.policy_verdict, + verdict=excluded.verdict, + updated_at=excluded.updated_at`, + [ + point.coord_key, + point.latitude, + point.longitude, + point.source_geohash, + localName || null, + localPlacetype || null, + 
localCountryId || null, + (localResult.admin1 && localResult.admin1.id) ? String(localResult.admin1.id) : null, + JSON.stringify(localResult), + liqLocality || null, + liqCountryCode || null, + liqDisplayName || null, + JSON.stringify(liqResult.json), + localityMatch ? 1 : 0, + countryMatch ? 1 : 0, + policyVerdict.match ? 1 : 0, + policyVerdict.reason || null, + policyVerdict.verdict || 'policy_unset', + verdict + ] + ) + + if (!liqResult.fromCache && i < points.length - 1 && delayMs > 0) { + await sleep(delayMs) + } + } + + var verdictRows = await dbAll( + cacheDb, + `SELECT verdict, COUNT(*) AS count + FROM validation_results + WHERE coord_key IN (SELECT coord_key FROM sample_points ORDER BY created_at ASC, coord_key ASC LIMIT ?) + GROUP BY verdict + ORDER BY count DESC, verdict ASC`, + [opts.samples] + ) + + var policyVerdictRows = await dbAll( + cacheDb, + `SELECT policy_verdict, COUNT(*) AS count + FROM validation_results + WHERE coord_key IN (SELECT coord_key FROM sample_points ORDER BY created_at ASC, coord_key ASC LIMIT ?) + GROUP BY policy_verdict + ORDER BY count DESC, policy_verdict ASC`, + [opts.samples] + ) + + var totalRow = await dbGet( + cacheDb, + `SELECT COUNT(*) AS count + FROM validation_results + WHERE coord_key IN (SELECT coord_key FROM sample_points ORDER BY created_at ASC, coord_key ASC LIMIT ?)`, + [opts.samples] + ) + var policyMatchRow = await dbGet( + cacheDb, + `SELECT COUNT(*) AS count + FROM validation_results + WHERE policy_match = 1 + AND coord_key IN (SELECT coord_key FROM sample_points ORDER BY created_at ASC, coord_key ASC LIMIT ?)`, + [opts.samples] + ) + var totalCount = Number(totalRow && totalRow.count ? totalRow.count : 0) + var policyMatchCount = Number(policyMatchRow && policyMatchRow.count ? policyMatchRow.count : 0) + var policyRatePct = totalCount > 0 ? 
((policyMatchCount * 100) / totalCount) : 0 + + console.log('Validation complete') + console.log('Geocoder DB: ' + opts.database) + console.log('Cache DB: ' + opts.cacheDb) + console.log('Samples evaluated: ' + totalCount) + console.log('LocationIQ uncached calls this run: ' + uncachedCalls) + console.log('Policy match rate: ' + policyMatchCount + '/' + totalCount + ' (' + policyRatePct.toFixed(1) + '%)') + console.log('Policy verdict distribution:') + for (var k = 0; k < policyVerdictRows.length; k++) { + console.log(' ' + policyVerdictRows[k].policy_verdict + ': ' + policyVerdictRows[k].count) + } + console.log('Verdict distribution:') + for (var j = 0; j < verdictRows.length; j++) { + console.log(' ' + verdictRows[j].verdict + ': ' + verdictRows[j].count) + } + + if (opts.exportCsv) { + await writeCsv(cacheDb, opts.exportCsv, opts.samples) + console.log('CSV export: ' + opts.exportCsv) + } + } finally { + if (geocoder && geocoder.db && typeof geocoder.db.close === 'function') { + await new Promise(function(resolve) { + geocoder.db.close(function() { resolve() }) + }) + } + await dbClose(sourceDb) + await dbClose(cacheDb) + } +} + +main().catch(function(err) { + console.error(err.message || err) + process.exit(1) +}) diff --git a/spec/boundary_builder_spec.js b/spec/boundary_builder_spec.js new file mode 100644 index 0000000..d788e76 --- /dev/null +++ b/spec/boundary_builder_spec.js @@ -0,0 +1,648 @@ +const fs = require('fs'); +const os = require('os'); +const path = require('path'); +const { spawnSync } = require('child_process'); +const sqlite3 = require('sqlite3'); +const geohash = require('../src/geohash'); + +function all(db, sql) { + return new Promise((resolve, reject) => { + db.all(sql, [], (err, rows) => (err ? reject(err) : resolve(rows || []))); + }); +} + +function close(db) { + return new Promise((resolve, reject) => { + db.close((err) => (err ? 
reject(err) : resolve())); + }); +} + +describe('boundary builder', () => { + it('drops contained localities when pruning is enabled', async () => { + const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'offline-geocoder-builder-')); + try { + const inputPath = path.join(dir, 'localities.geojson'); + const dbPath = path.join(dir, 'boundary.sqlite'); + + fs.writeFileSync(inputPath, JSON.stringify({ + type: 'FeatureCollection', + features: [ + { + type: 'Feature', + id: 2001, + properties: { + name: 'Outer City', + placetype: 'locality', + country_id: 'US', + admin1_id: 5, + is_current: 1 + }, + geometry: { + type: 'Polygon', + coordinates: [[[-5, -5], [5, -5], [5, 5], [-5, 5], [-5, -5]]] + } + }, + { + type: 'Feature', + id: 2002, + properties: { + name: 'Inner Duplicate', + placetype: 'locality', + country_id: 'US', + admin1_id: 5, + is_current: 1 + }, + geometry: { + type: 'Polygon', + coordinates: [[[-1, -1], [1, -1], [1, 1], [-1, 1], [-1, -1]]] + } + } + ] + })); + + const result = spawnSync('node', [ + path.join(__dirname, '..', 'scripts', 'generate_boundary_index.js'), + '--database', dbPath, + '--input', inputPath, + '--base-precision', '4', + '--max-precision', '5', + '--index-mode', 'compact', + '--drop-contained-localities', 'true' + ], { encoding: 'utf8' }); + + expect(result.status).toEqual(0); + + const db = new sqlite3.Database(dbPath); + try { + const rows = await all(db, 'SELECT id, name FROM compact_places ORDER BY id ASC'); + expect(rows).toEqual([{ id: 2001, name: 'Outer City' }]); + + const lookupRows = await all(db, 'SELECT geohash, place_id FROM compact_geohash_lookup'); + expect(lookupRows.length).toBeGreaterThan(0); + + const legacyRows = await all(db, "SELECT count(*) AS count FROM sqlite_master WHERE type='table' AND name='place_geometry'"); + expect(legacyRows[0].count).toEqual(0); + } finally { + await close(db); + } + } finally { + fs.rmSync(dir, { recursive: true, force: true }); + } + }); + + it('rolls small localities up to region 
boundaries when configured', async () => { + const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'offline-geocoder-builder-')); + try { + const inputPath = path.join(dir, 'rollup.geojson'); + const dbPath = path.join(dir, 'rollup.sqlite'); + + fs.writeFileSync(inputPath, JSON.stringify({ + type: 'FeatureCollection', + features: [ + { + type: 'Feature', + id: 3001, + properties: { + name: 'Wide Region', + placetype: 'region', + country_id: 'US', + admin1_id: 5, + is_current: 1 + }, + geometry: { + type: 'Polygon', + coordinates: [[[-5, -5], [5, -5], [5, 5], [-5, 5], [-5, -5]]] + } + }, + { + type: 'Feature', + id: 3002, + properties: { + name: 'Small Village', + placetype: 'locality', + country_id: 'US', + admin1_id: 5, + population: 1200, + is_current: 1 + }, + geometry: { + type: 'Polygon', + coordinates: [[[-0.5, -0.5], [0.5, -0.5], [0.5, 0.5], [-0.5, 0.5], [-0.5, -0.5]]] + } + } + ] + })); + + const result = spawnSync('node', [ + path.join(__dirname, '..', 'scripts', 'generate_boundary_index.js'), + '--database', dbPath, + '--input', inputPath, + '--index-mode', 'compact', + '--include-region', 'true', + '--min-population', '5000', + '--base-precision', '4', + '--max-precision', '5' + ], { encoding: 'utf8' }); + + expect(result.status).toEqual(0); + + const db = new sqlite3.Database(dbPath); + try { + const rows = await all(db, 'SELECT id, name, placetype_code FROM compact_places ORDER BY id ASC'); + expect(rows).toEqual([{ id: 3001, name: 'Wide Region', placetype_code: 2 }]); + + const lookupRows = await all(db, 'SELECT count(*) AS count FROM compact_geohash_lookup'); + expect(lookupRows[0].count).toBeGreaterThan(0); + } finally { + await close(db); + } + } finally { + fs.rmSync(dir, { recursive: true, force: true }); + } + }); + + it('keeps point capitals as locality cells even below min population', async () => { + const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'offline-geocoder-builder-')); + try { + const inputPath = path.join(dir, 
'point-capital.geojson'); + const dbPath = path.join(dir, 'point-capital.sqlite'); + + fs.writeFileSync(inputPath, JSON.stringify({ + type: 'FeatureCollection', + features: [ + { + type: 'Feature', + id: 4001, + properties: { + name: 'Fallback Region', + placetype: 'region', + country_id: 'GF', + admin1_id: 85671195, + is_current: 1 + }, + geometry: { + type: 'Polygon', + coordinates: [[[-53.2, 4.8], [-52.2, 4.8], [-52.2, 5.2], [-53.2, 5.2], [-53.2, 4.8]]] + } + }, + { + type: 'Feature', + id: 4002, + properties: { + name: 'Cayenne', + placetype: 'locality', + country_id: 'GF', + admin1_id: 85671195, + population: 600, + 'gn:feature_code': 'PPLC', + is_current: 1 + }, + geometry: { + type: 'Point', + coordinates: [-52.33333, 4.93333] + } + } + ] + })); + + const result = spawnSync('node', [ + path.join(__dirname, '..', 'scripts', 'generate_boundary_index.js'), + '--database', dbPath, + '--input', inputPath, + '--index-mode', 'compact', + '--include-region', 'true', + '--min-population', '5000', + '--base-precision', '4', + '--max-precision', '5' + ], { encoding: 'utf8' }); + + expect(result.status).toEqual(0); + + const db = new sqlite3.Database(dbPath); + try { + const places = await all(db, 'SELECT id, name, placetype_code FROM compact_places ORDER BY id ASC'); + expect(places).toEqual([ + { id: 4001, name: 'Fallback Region', placetype_code: 2 }, + { id: 4002, name: 'Cayenne', placetype_code: 0 } + ]); + + const capitalRows = await all(db, 'SELECT geohash FROM compact_geohash_lookup WHERE place_id = 4002'); + expect(capitalRows.length).toEqual(1); + expect(capitalRows[0].geohash.length).toBeGreaterThanOrEqual(4); + expect(capitalRows[0].geohash.length).toBeLessThanOrEqual(5); + } finally { + await close(db); + } + } finally { + fs.rmSync(dir, { recursive: true, force: true }); + } + }); + + it('promotes locality over region when there is no competing locality in the same parent cell', async () => { + const dir = fs.mkdtempSync(path.join(os.tmpdir(), 
'offline-geocoder-builder-')); + try { + const inputPath = path.join(dir, 'locality-region-promotion.geojson'); + const dbPath = path.join(dir, 'locality-region-promotion.sqlite'); + const parentHash = 's000'; + const parentBbox = geohash.decodeBbox(parentHash); + const midLon = (parentBbox.minLon + parentBbox.maxLon) / 2; + + fs.writeFileSync(inputPath, JSON.stringify({ + type: 'FeatureCollection', + features: [ + { + type: 'Feature', + id: 5001, + properties: { + name: 'Wide Region', + placetype: 'region', + country_id: 'AR', + admin1_id: 1, + is_current: 1 + }, + geometry: { + type: 'Polygon', + coordinates: [[ + [parentBbox.minLon, parentBbox.minLat], + [parentBbox.maxLon, parentBbox.minLat], + [parentBbox.maxLon, parentBbox.maxLat], + [parentBbox.minLon, parentBbox.maxLat], + [parentBbox.minLon, parentBbox.minLat] + ]] + } + }, + { + type: 'Feature', + id: 5002, + properties: { + name: 'Metro City', + placetype: 'locality', + country_id: 'AR', + admin1_id: 1, + population: 1000000, + is_current: 1 + }, + geometry: { + type: 'Polygon', + coordinates: [[ + [parentBbox.minLon, parentBbox.minLat], + [midLon, parentBbox.minLat], + [midLon, parentBbox.maxLat], + [parentBbox.minLon, parentBbox.maxLat], + [parentBbox.minLon, parentBbox.minLat] + ]] + } + } + ] + })); + + const result = spawnSync('node', [ + path.join(__dirname, '..', 'scripts', 'generate_boundary_index.js'), + '--database', dbPath, + '--input', inputPath, + '--index-mode', 'compact', + '--include-region', 'true', + '--base-precision', '4', + '--max-precision', '5', + '--promote-locality-over-region', 'true' + ], { encoding: 'utf8' }); + + expect(result.status).toEqual(0); + + const db = new sqlite3.Database(dbPath); + try { + const parentRow = await all(db, `SELECT geohash, place_id FROM compact_geohash_lookup WHERE geohash='${parentHash}'`); + expect(parentRow).toEqual([{ geohash: parentHash, place_id: 5002 }]); + + const regionDescendants = await all( + db, + `SELECT COUNT(*) AS count FROM 
compact_geohash_lookup WHERE geohash LIKE '${parentHash}%' AND geohash <> '${parentHash}' AND place_id = 5001` + ); + expect(regionDescendants[0].count).toEqual(0); + } finally { + await close(db); + } + } finally { + fs.rmSync(dir, { recursive: true, force: true }); + } + }); + + it('does not promote locality to parent cell when locality child-share is below threshold', async () => { + const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'offline-geocoder-builder-')); + try { + const inputPath = path.join(dir, 'locality-region-share-threshold.geojson'); + const dbPath = path.join(dir, 'locality-region-share-threshold.sqlite'); + const parentHash = 's000'; + const parentBbox = geohash.decodeBbox(parentHash); + const splitLon = parentBbox.minLon + ((parentBbox.maxLon - parentBbox.minLon) * 0.25); + + fs.writeFileSync(inputPath, JSON.stringify({ + type: 'FeatureCollection', + features: [ + { + type: 'Feature', + id: 5101, + properties: { + name: 'Wide Region', + placetype: 'region', + country_id: 'AR', + admin1_id: 1, + is_current: 1 + }, + geometry: { + type: 'Polygon', + coordinates: [[ + [parentBbox.minLon, parentBbox.minLat], + [parentBbox.maxLon, parentBbox.minLat], + [parentBbox.maxLon, parentBbox.maxLat], + [parentBbox.minLon, parentBbox.maxLat], + [parentBbox.minLon, parentBbox.minLat] + ]] + } + }, + { + type: 'Feature', + id: 5102, + properties: { + name: 'Small Town', + placetype: 'locality', + country_id: 'AR', + admin1_id: 1, + population: 40000, + is_current: 1 + }, + geometry: { + type: 'Polygon', + coordinates: [[ + [parentBbox.minLon, parentBbox.minLat], + [splitLon, parentBbox.minLat], + [splitLon, parentBbox.maxLat], + [parentBbox.minLon, parentBbox.maxLat], + [parentBbox.minLon, parentBbox.minLat] + ]] + } + } + ] + })); + + const result = spawnSync('node', [ + path.join(__dirname, '..', 'scripts', 'generate_boundary_index.js'), + '--database', dbPath, + '--input', inputPath, + '--index-mode', 'compact', + '--include-region', 'true', + 
'--base-precision', '4', + '--max-precision', '5', + '--parent-locality-min-share', '0.5' + ], { encoding: 'utf8' }); + + expect(result.status).toEqual(0); + + const db = new sqlite3.Database(dbPath); + try { + const parentRow = await all(db, `SELECT geohash, place_id FROM compact_geohash_lookup WHERE geohash='${parentHash}'`); + expect(parentRow).toEqual([{ geohash: parentHash, place_id: 5101 }]); + + const localityDescendants = await all( + db, + `SELECT COUNT(*) AS count FROM compact_geohash_lookup WHERE geohash LIKE '${parentHash}%' AND geohash <> '${parentHash}' AND place_id = 5102` + ); + expect(localityDescendants[0].count).toBeGreaterThan(0); + } finally { + await close(db); + } + } finally { + fs.rmSync(dir, { recursive: true, force: true }); + } + }); + + it('rolls parent cells to a dominant major locality and suppresses minor locality descendants', async () => { + const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'offline-geocoder-builder-')); + try { + const inputPath = path.join(dir, 'dominant-locality.geojson'); + const dbPath = path.join(dir, 'dominant-locality.sqlite'); + const parentHash = 's000'; + const parentBbox = geohash.decodeBbox(parentHash); + const splitLon = parentBbox.minLon + ((parentBbox.maxLon - parentBbox.minLon) * 0.75); + + fs.writeFileSync(inputPath, JSON.stringify({ + type: 'FeatureCollection', + features: [ + { + type: 'Feature', + id: 6001, + properties: { + name: 'Fallback Region', + placetype: 'region', + country_id: 'AR', + admin1_id: 1, + is_current: 1 + }, + geometry: { + type: 'Polygon', + coordinates: [[ + [parentBbox.minLon, parentBbox.minLat], + [parentBbox.maxLon, parentBbox.minLat], + [parentBbox.maxLon, parentBbox.maxLat], + [parentBbox.minLon, parentBbox.maxLat], + [parentBbox.minLon, parentBbox.minLat] + ]] + } + }, + { + type: 'Feature', + id: 6002, + properties: { + name: 'Metro Core', + placetype: 'locality', + country_id: 'AR', + admin1_id: 1, + population: 1200000, + is_current: 1 + }, + geometry: { + type: 
'Polygon', + coordinates: [[ + [parentBbox.minLon, parentBbox.minLat], + [splitLon, parentBbox.minLat], + [splitLon, parentBbox.maxLat], + [parentBbox.minLon, parentBbox.maxLat], + [parentBbox.minLon, parentBbox.minLat] + ]] + } + }, + { + type: 'Feature', + id: 6003, + properties: { + name: 'Outer Hamlet', + placetype: 'locality', + country_id: 'AR', + admin1_id: 1, + population: 18000, + is_current: 1 + }, + geometry: { + type: 'Polygon', + coordinates: [[ + [splitLon, parentBbox.minLat], + [parentBbox.maxLon, parentBbox.minLat], + [parentBbox.maxLon, parentBbox.maxLat], + [splitLon, parentBbox.maxLat], + [splitLon, parentBbox.minLat] + ]] + } + } + ] + })); + + const result = spawnSync('node', [ + path.join(__dirname, '..', 'scripts', 'generate_boundary_index.js'), + '--database', dbPath, + '--input', inputPath, + '--index-mode', 'compact', + '--include-region', 'true', + '--base-precision', '4', + '--max-precision', '5', + '--dominant-locality-population', '100000', + '--dominant-locality-ratio', '3' + ], { encoding: 'utf8' }); + + expect(result.status).toEqual(0); + + const db = new sqlite3.Database(dbPath); + try { + const parentRow = await all(db, `SELECT geohash, place_id FROM compact_geohash_lookup WHERE geohash='${parentHash}'`); + expect(parentRow).toEqual([{ geohash: parentHash, place_id: 6002 }]); + + const minorDescendants = await all( + db, + `SELECT COUNT(*) AS count FROM compact_geohash_lookup WHERE geohash LIKE '${parentHash}%' AND geohash <> '${parentHash}' AND place_id = 6003` + ); + expect(minorDescendants[0].count).toEqual(0); + } finally { + await close(db); + } + } finally { + fs.rmSync(dir, { recursive: true, force: true }); + } + }); + + it('keeps fine locality borders when multiple major localities compete', async () => { + const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'offline-geocoder-builder-')); + try { + const inputPath = path.join(dir, 'major-competition.geojson'); + const dbPath = path.join(dir, 'major-competition.sqlite'); + 
const parentHash = 's000'; + const parentBbox = geohash.decodeBbox(parentHash); + const midLon = (parentBbox.minLon + parentBbox.maxLon) / 2; + + fs.writeFileSync(inputPath, JSON.stringify({ + type: 'FeatureCollection', + features: [ + { + type: 'Feature', + id: 7001, + properties: { + name: 'Fallback Region', + placetype: 'region', + country_id: 'AR', + admin1_id: 1, + is_current: 1 + }, + geometry: { + type: 'Polygon', + coordinates: [[ + [parentBbox.minLon, parentBbox.minLat], + [parentBbox.maxLon, parentBbox.minLat], + [parentBbox.maxLon, parentBbox.maxLat], + [parentBbox.minLon, parentBbox.maxLat], + [parentBbox.minLon, parentBbox.minLat] + ]] + } + }, + { + type: 'Feature', + id: 7002, + properties: { + name: 'West Major City', + placetype: 'locality', + country_id: 'AR', + admin1_id: 1, + population: 1000000, + is_current: 1 + }, + geometry: { + type: 'Polygon', + coordinates: [[ + [parentBbox.minLon, parentBbox.minLat], + [midLon, parentBbox.minLat], + [midLon, parentBbox.maxLat], + [parentBbox.minLon, parentBbox.maxLat], + [parentBbox.minLon, parentBbox.minLat] + ]] + } + }, + { + type: 'Feature', + id: 7003, + properties: { + name: 'East Major City', + placetype: 'locality', + country_id: 'AR', + admin1_id: 1, + population: 850000, + is_current: 1 + }, + geometry: { + type: 'Polygon', + coordinates: [[ + [midLon, parentBbox.minLat], + [parentBbox.maxLon, parentBbox.minLat], + [parentBbox.maxLon, parentBbox.maxLat], + [midLon, parentBbox.maxLat], + [midLon, parentBbox.minLat] + ]] + } + } + ] + })); + + const result = spawnSync('node', [ + path.join(__dirname, '..', 'scripts', 'generate_boundary_index.js'), + '--database', dbPath, + '--input', inputPath, + '--index-mode', 'compact', + '--include-region', 'true', + '--base-precision', '4', + '--max-precision', '5', + '--dominant-locality-population', '100000', + '--dominant-locality-ratio', '3' + ], { encoding: 'utf8' }); + + expect(result.status).toEqual(0); + + const db = new sqlite3.Database(dbPath); + 
try { + const majorRows = await all( + db, + `SELECT place_id, COUNT(*) AS count + FROM compact_geohash_lookup + WHERE geohash LIKE '${parentHash}%' AND place_id IN (7002, 7003) + GROUP BY place_id + ORDER BY place_id` + ); + expect(majorRows).toEqual([ + { place_id: 7002, count: jasmine.any(Number) }, + { place_id: 7003, count: jasmine.any(Number) } + ]); + expect(majorRows[0].count).toBeGreaterThan(0); + expect(majorRows[1].count).toBeGreaterThan(0); + } finally { + await close(db); + } + } finally { + fs.rmSync(dir, { recursive: true, force: true }); + } + }); +}); diff --git a/spec/boundary_cover_spec.js b/spec/boundary_cover_spec.js new file mode 100644 index 0000000..a75e97d --- /dev/null +++ b/spec/boundary_cover_spec.js @@ -0,0 +1,52 @@ +const geohash = require('../src/geohash'); +const boundaryCover = require('../src/boundary_cover'); + +describe('boundary geohash cover', () => { + it('marks fully contained geohash cells as full', () => { + const cell = geohash.decodeBbox('s000'); + const exactCellPolygon = { + type: 'Polygon', + coordinates: [[ + [cell.minLon, cell.minLat], + [cell.maxLon, cell.minLat], + [cell.maxLon, cell.maxLat], + [cell.minLon, cell.maxLat], + [cell.minLon, cell.minLat] + ]] + }; + + const cover = boundaryCover.buildGeohashCoverForGeometry(exactCellPolygon, { + basePrecision: 4, + maxPrecision: 6 + }); + + expect(cover).toContain(jasmine.objectContaining({ + geohash: 's000', + precision: 4, + coverageType: 'full' + })); + }); + + it('subdivides partial cells until max precision', () => { + const cell = geohash.decodeBbox('s000'); + const diagonalPolygon = { + type: 'Polygon', + coordinates: [[ + [cell.minLon, cell.minLat], + [cell.maxLon, cell.minLat], + [cell.minLon, cell.maxLat], + [cell.minLon, cell.minLat] + ]] + }; + + const cover = boundaryCover.buildGeohashCoverForGeometry(diagonalPolygon, { + basePrecision: 4, + maxPrecision: 5 + }); + + expect(cover.some((entry) => entry.precision === 5)).toBeTrue(); + + const uniqueKeys = 
new Set(cover.map((entry) => `${entry.geohash}|${entry.precision}`)); + expect(uniqueKeys.size).toEqual(cover.length); + }); +}); diff --git a/spec/expo_adapter_spec.js b/spec/expo_adapter_spec.js new file mode 100644 index 0000000..0500da4 --- /dev/null +++ b/spec/expo_adapter_spec.js @@ -0,0 +1,28 @@ +const createExpoGeocoder = require('../src/expo.js'); +const fixtureDb = require('./helpers/fixture_db'); + +describe('expo adapter', () => { + var fixture; + + beforeAll((done) => { + fixtureDb.createFixtureDatabase().then(function(f) { + fixture = f; + done(); + }); + }); + + afterAll(() => { + fixture.cleanup(); + }); + + it('runs queries through the getAllAsync shim', (done) => { + var db = fixtureDb.createExpoDb(fixture.databasePath); + var geocoder = createExpoGeocoder({ db: db }); + + geocoder.reverse(41.89, 12.49) + .then(function(result) { + expect(result.id).toEqual(3169070); + done(); + }); + }); +}); diff --git a/spec/forward_spec.js b/spec/forward_spec.js new file mode 100644 index 0000000..8746659 --- /dev/null +++ b/spec/forward_spec.js @@ -0,0 +1,49 @@ +const createGeocoder = require('../src/index.js'); +const fixtureDb = require('./helpers/fixture_db'); + +describe('geocoder.forward', () => { + var fixture, geocoder; + + beforeAll((done) => { + fixtureDb.createFixtureDatabase().then(function(f) { + fixture = f; + geocoder = createGeocoder({ database: fixture.databasePath }); + done(); + }); + }); + + afterAll(() => { + fixture.cleanup(); + }); + + it('returns the best match for an exact query', (done) => { + geocoder.forward('Rome') + .then(function(result) { + expect(result).toEqual({ + id: 3169070, + name: 'Rome', + formatted: 'Rome, Latium, Italy', + country: { id: 'IT', name: 'Italy' }, + admin1: { id: 7, name: 'Latium' }, + coordinates: { latitude: 41.89193, longitude: 12.51133 } + }); + done(); + }); + }); + + it('falls back to fuzzy matching', (done) => { + geocoder.forward('angeles') + .then(function(result) { + 
expect(result.id).toEqual(5368361); + done(); + }); + }); + + it('returns undefined when nothing matches', (done) => { + geocoder.forward('xyzzy-not-a-city') + .then(function(result) { + expect(result).toBeUndefined(); + done(); + }); + }); +}); diff --git a/spec/geometry_spec.js b/spec/geometry_spec.js new file mode 100644 index 0000000..6fd54e6 --- /dev/null +++ b/spec/geometry_spec.js @@ -0,0 +1,47 @@ +const geometry = require('../src/geometry'); + +describe('geometry utilities', () => { + it('handles polygon holes in point-in-polygon checks', () => { + const polygonWithHole = { + type: 'Polygon', + coordinates: [ + [[-5, -5], [5, -5], [5, 5], [-5, 5], [-5, -5]], + [[-1, -1], [1, -1], [1, 1], [-1, 1], [-1, -1]] + ] + }; + + expect(geometry.pointInGeometry(polygonWithHole, 2, 2)).toBeTrue(); + expect(geometry.pointInGeometry(polygonWithHole, 0, 0)).toBeFalse(); + }); + + it('supports multipolygon containment', () => { + const multipolygon = { + type: 'MultiPolygon', + coordinates: [ + [[[-11, -11], [-9, -11], [-9, -9], [-11, -9], [-11, -11]]], + [[[9, 9], [11, 9], [11, 11], [9, 11], [9, 9]]] + ] + }; + + expect(geometry.pointInGeometry(multipolygon, 10, 10)).toBeTrue(); + expect(geometry.pointInGeometry(multipolygon, 0, 0)).toBeFalse(); + }); + + it('detects when one geometry is contained by another', () => { + const outer = { + type: 'Polygon', + coordinates: [[[-5, -5], [5, -5], [5, 5], [-5, 5], [-5, -5]]] + }; + const inner = { + type: 'Polygon', + coordinates: [[[-1, -1], [1, -1], [1, 1], [-1, 1], [-1, -1]]] + }; + const farAway = { + type: 'Polygon', + coordinates: [[[10, 10], [11, 10], [11, 11], [10, 11], [10, 10]]] + }; + + expect(geometry.geometryContainsGeometry(outer, inner)).toBeTrue(); + expect(geometry.geometryContainsGeometry(outer, farAway)).toBeFalse(); + }); +}); diff --git a/spec/helpers/fixture_db.js b/spec/helpers/fixture_db.js new file mode 100644 index 0000000..1a8deee --- /dev/null +++ b/spec/helpers/fixture_db.js @@ -0,0 +1,261 @@ +"use 
strict"; + +const fs = require('fs') +const os = require('os') +const path = require('path') +const sqlite3 = require('sqlite3') + +const boundaryCover = require('../../src/boundary_cover') +const geometry = require('../../src/geometry') + +const schemaSql = fs.readFileSync(path.join(__dirname, '../../scripts/schema.sql'), 'utf8') + +const fixtureSql = ` +INSERT INTO countries(id, name) VALUES ('IT', 'Italy'), ('FR', 'France'), ('US', 'United States'); +INSERT INTO admin1(country_id, id, name) VALUES + ('IT', 7, 'Latium'), + ('FR', 11, 'Ile-de-France'), + ('US', 36, 'New York'), + ('US', 5, 'California'); +INSERT INTO features(id, name, asciiname, country_id, admin1_id, population) VALUES + (3169070, 'Rome', 'Rome', 'IT', 7, 2873000), + (2988507, 'Paris', 'Paris', 'FR', 11, 2138551), + (5128581, 'New York City', 'New York City', 'US', 36, 8175133), + (5368361, 'Los Angeles', 'Los Angeles', 'US', 5, 3792621), + (9100001, 'Westville', 'Westville', 'US', 5, 50000), + (9100002, 'Eastville', 'Eastville', 'US', 5, 60000), + (9100003, 'Centerville', 'Centerville', 'US', 5, 20000), + (9100004, 'Midtown', 'Midtown', 'US', 5, 1000); +INSERT INTO coordinates(feature_id, latitude, longitude) VALUES + (3169070, 41.89193, 12.51133), + (2988507, 48.85341, 2.3488), + (5128581, 40.71427, -74.00597), + (5368361, 34.05223, -118.24368), + (9100001, 0, -2), + (9100002, 0, 0.2), + (9100003, 0, 0.1), + (9100004, 0, 0.05); +` + +const boundaryFixtures = [ + { + id: 9100001, + name: 'Westville', + countryId: 'US', + admin1Id: 5, + placetype: 'locality', + priorityRank: 20, + geometry: { + type: 'Polygon', + coordinates: [[ + [-1, -1], + [0, -1], + [0, 1], + [-1, 1], + [-1, -1] + ]] + } + }, + { + id: 9100002, + name: 'Eastville', + countryId: 'US', + admin1Id: 5, + placetype: 'locality', + priorityRank: 30, + geometry: { + type: 'Polygon', + coordinates: [[ + [0, -1], + [1, -1], + [1, 1], + [0, 1], + [0, -1] + ]] + } + }, + { + id: 9100003, + name: 'Centerville', + countryId: 'US', + 
admin1Id: 5, + placetype: 'locality', + priorityRank: 10, + geometry: { + type: 'Polygon', + coordinates: [[ + [-0.2, -0.2], + [0.2, -0.2], + [0.2, 0.2], + [-0.2, 0.2], + [-0.2, -0.2] + ]] + } + }, + { + id: 9100004, + name: 'Midtown', + countryId: 'US', + admin1Id: 5, + placetype: 'neighbourhood', + priorityRank: 1, + geometry: { + type: 'Polygon', + coordinates: [[ + [-0.15, -0.15], + [0.15, -0.15], + [0.15, 0.15], + [-0.15, 0.15], + [-0.15, -0.15] + ]] + } + } +] + +function exec(db, sql) { + return new Promise(function(resolve, reject) { + db.exec(sql, function(err) { err ? reject(err) : resolve() }) + }) +} + +function run(db, sql, params) { + return new Promise(function(resolve, reject) { + db.run(sql, params || [], function(err) { err ? reject(err) : resolve() }) + }) +} + +function close(db) { + return new Promise(function(resolve, reject) { + db.close(function(err) { err ? reject(err) : resolve() }) + }) +} + +async function seedBoundaryData(db) { + await exec(db, 'BEGIN') + + try { + var compactByHash = Object.create(null) + + for (var i = 0; i < boundaryFixtures.length; i++) { + var place = boundaryFixtures[i] + var normalizedGeometry = geometry.normalizeGeometry(place.geometry) + var bbox = geometry.geometryBbox(normalizedGeometry) + var area = geometry.geometryArea(normalizedGeometry) + var cover = boundaryCover.buildGeohashCoverForGeometry(normalizedGeometry, { + basePrecision: 4, + maxPrecision: 7 + }) + + await run(db, ` + INSERT INTO places( + id, name, country_id, admin1_id, placetype, + centroid_lat, centroid_lon, + bbox_min_lat, bbox_min_lon, bbox_max_lat, bbox_max_lon, + priority_rank, area, country_name, admin1_name + ) + SELECT + f.id, + f.name, + ?, + ?, + ?, + c.latitude, + c.longitude, + ?, ?, ?, ?, + ?, ?, + 'United States', + 'California' + FROM features f + JOIN coordinates c ON c.feature_id = f.id + WHERE f.id = ? 
+ `, [ + place.countryId, + place.admin1Id, + place.placetype, + bbox.minLat, + bbox.minLon, + bbox.maxLat, + bbox.maxLon, + place.priorityRank, + area, + place.id + ]) + + await run(db, ` + INSERT INTO place_geometry(place_id, encoding, geometry) + VALUES (?, 'json', ?) + `, [place.id, JSON.stringify(normalizedGeometry)]) + + for (var j = 0; j < cover.length; j++) { + await run(db, ` + INSERT INTO place_geohash_cover(geohash, precision, place_id, coverage_type) + VALUES (?, ?, ?, ?) + `, [cover[j].geohash, cover[j].precision, place.id, cover[j].coverageType]) + + if (place.placetype === 'locality' || place.placetype === 'localadmin') { + var existing = compactByHash[cover[j].geohash] + if (!existing || place.priorityRank < existing.priorityRank || (place.priorityRank === existing.priorityRank && place.id < existing.placeId)) { + compactByHash[cover[j].geohash] = { + placeId: place.id, + priorityRank: place.priorityRank + } + } + } + } + } + + var hashes = Object.keys(compactByHash) + for (var h = 0; h < hashes.length; h++) { + var hash = hashes[h] + await run(db, ` + INSERT INTO place_geohash_lookup(geohash, place_id) + VALUES (?, ?) + `, [hash, compactByHash[hash].placeId]) + } + + await exec(db, 'COMMIT') + } catch (err) { + await exec(db, 'ROLLBACK') + throw err + } +} + +function createFixtureDatabase() { + var dir = fs.mkdtempSync(path.join(os.tmpdir(), 'offline-geocoder-')) + var dbPath = path.join(dir, 'fixture.sqlite') + var db = new sqlite3.Database(dbPath) + + return exec(db, schemaSql) + .then(function() { return exec(db, fixtureSql) }) + .then(function() { return seedBoundaryData(db) }) + .then(function() { return close(db) }) + .then(function() { + return { + databasePath: dbPath, + cleanup: function() { + fs.rmSync(dir, { recursive: true, force: true }) + } + } + }) +} + +// Minimal shim that looks like an expo-sqlite database so we can test the +// Expo adapter without pulling in the real package. 
+function createExpoDb(dbPath) { + var db = new sqlite3.Database(dbPath) + return { + getAllAsync: function(sql, params) { + return new Promise(function(resolve, reject) { + db.all(sql, params || [], function(err, rows) { + err ? reject(err) : resolve(rows || []) + }) + }) + }, + closeAsync: function() { return close(db) } + } +} + +module.exports = { + createFixtureDatabase: createFixtureDatabase, + createExpoDb: createExpoDb +} diff --git a/spec/location_spec.js b/spec/location_spec.js index 72c9de1..b53a637 100644 --- a/spec/location_spec.js +++ b/spec/location_spec.js @@ -1,6 +1,21 @@ -const geocoder = require('../src/index.js')(); +const createGeocoder = require('../src/index.js'); +const fixtureDb = require('./helpers/fixture_db'); describe('geocoder.location', () => { + var fixture, geocoder; + + beforeAll((done) => { + fixtureDb.createFixtureDatabase().then(function(f) { + fixture = f; + geocoder = createGeocoder({ database: fixture.databasePath }); + done(); + }); + }); + + afterAll(() => { + fixture.cleanup(); + }); + describe('.find', () => { it('performs a lookup by id', (done) => { geocoder.location().find(3169070) @@ -17,6 +32,14 @@ describe('geocoder.location', () => { }); }); + it('accepts geonames: prefixed ids', (done) => { + geocoder.location.find('geonames:3169070') + .then(function(result) { + expect(result.id).toEqual(3169070); + done(); + }); + }); + it("resolves undefined when a location can't be found", (done) => { geocoder.location().find(-1) .then(function(result) { diff --git a/spec/reverse_boundary_compact_spec.js b/spec/reverse_boundary_compact_spec.js new file mode 100644 index 0000000..c8bd750 --- /dev/null +++ b/spec/reverse_boundary_compact_spec.js @@ -0,0 +1,67 @@ +const sqlite3 = require('sqlite3'); +const createGeocoder = require('../src/index.js'); +const fixtureDb = require('./helpers/fixture_db'); + +function exec(db, sql) { + return new Promise((resolve, reject) => { + db.exec(sql, (err) => (err ? 
reject(err) : resolve())); + }); +} + +function close(db) { + return new Promise((resolve, reject) => { + db.close((err) => (err ? reject(err) : resolve())); + }); +} + +describe('geocoder.reverse boundary mode (compact geohash lookup)', () => { + var fixture; + var geocoder; + + beforeAll((done) => { + fixtureDb.createFixtureDatabase().then(async function(f) { + fixture = f; + + const db = new sqlite3.Database(fixture.databasePath); + try { + // Force compact-only runtime path in this fixture. + await exec(db, 'DELETE FROM place_geohash_cover; DELETE FROM place_geometry;'); + } finally { + await close(db); + } + + geocoder = createGeocoder({ + database: fixture.databasePath, + reverseMode: 'boundary', + boundary: { + basePrecision: 4, + maxPrecision: 7 + } + }); + + done(); + }); + }); + + afterAll(() => { + fixture.cleanup(); + }); + + it('uses compact geohash lookup for containing areas', (done) => { + geocoder.reverse(0, -0.5) + .then(function(result) { + expect(result.id).toEqual(9100001); + expect(result.name).toEqual('Westville'); + done(); + }); + }); + + it('falls back to nearest boundary centroid when no compact hash matches', (done) => { + geocoder.reverse(0, 1.5) + .then(function(result) { + expect(result.id).toEqual(9100002); + expect(result.name).toEqual('Eastville'); + done(); + }); + }); +}); diff --git a/spec/reverse_boundary_rollup_spec.js b/spec/reverse_boundary_rollup_spec.js new file mode 100644 index 0000000..b13f9cf --- /dev/null +++ b/spec/reverse_boundary_rollup_spec.js @@ -0,0 +1,114 @@ +const fs = require('fs'); +const os = require('os'); +const path = require('path'); +const { spawnSync } = require('child_process'); +const createGeocoder = require('../src/index.js'); + +describe('geocoder.reverse boundary mode (locality roll-up)', () => { + var dir; + var dbPath; + var geocoder; + + beforeAll(() => { + dir = fs.mkdtempSync(path.join(os.tmpdir(), 'offline-geocoder-rollup-')); + const inputPath = path.join(dir, 'rollup.geojson'); + dbPath 
= path.join(dir, 'rollup.sqlite'); + + fs.writeFileSync(inputPath, JSON.stringify({ + type: 'FeatureCollection', + features: [ + { + type: 'Feature', + id: 4001, + properties: { + name: 'Macro Region', + placetype: 'region', + country_id: 'US', + admin1_id: 5, + is_current: 1 + }, + geometry: { + type: 'Polygon', + coordinates: [[[-6, -6], [6, -6], [6, 6], [-6, 6], [-6, -6]]] + } + }, + { + type: 'Feature', + id: 4002, + properties: { + name: 'Tiny Hamlet', + placetype: 'locality', + country_id: 'US', + admin1_id: 5, + population: 800, + is_current: 1 + }, + geometry: { + type: 'Polygon', + coordinates: [[[-0.8, -0.8], [0.8, -0.8], [0.8, 0.8], [-0.8, 0.8], [-0.8, -0.8]]] + } + }, + { + type: 'Feature', + id: 4003, + properties: { + name: 'Big City', + placetype: 'locality', + country_id: 'US', + admin1_id: 5, + population: 90000, + is_current: 1 + }, + geometry: { + type: 'Polygon', + coordinates: [[[2, 2], [3.5, 2], [3.5, 3.5], [2, 3.5], [2, 2]]] + } + } + ] + })); + + const result = spawnSync('node', [ + path.join(__dirname, '..', 'scripts', 'generate_boundary_index.js'), + '--database', dbPath, + '--input', inputPath, + '--index-mode', 'compact', + '--include-region', 'true', + '--min-population', '5000', + '--base-precision', '4', + '--max-precision', '5' + ], { encoding: 'utf8' }); + + expect(result.status).toEqual(0); + + geocoder = createGeocoder({ + database: dbPath, + reverseMode: 'boundary', + boundary: { + basePrecision: 4, + maxPrecision: 5 + } + }); + }); + + afterAll(() => { + fs.rmSync(dir, { recursive: true, force: true }); + }); + + it('returns region for low-pop locality area when locality is rolled up', (done) => { + geocoder.reverse(0, 0) + .then(function(result) { + expect(result.id).toEqual(4001); + expect(result.name).toEqual('Macro Region'); + done(); + }); + }); + + it('keeps higher-pop locality labels where available', (done) => { + geocoder.reverse(2.7, 2.7) + .then(function(result) { + expect(result.id).toEqual(4003); + 
expect(result.name).toEqual('Big City'); + done(); + }); + }); +}); diff --git a/spec/reverse_boundary_spec.js b/spec/reverse_boundary_spec.js new file mode 100644 index 0000000..786fe14 --- /dev/null +++ b/spec/reverse_boundary_spec.js @@ -0,0 +1,53 @@ +const createGeocoder = require('../src/index.js'); +const fixtureDb = require('./helpers/fixture_db'); + +describe('geocoder.reverse boundary mode', () => { + var fixture; + var geocoder; + + beforeAll((done) => { + fixtureDb.createFixtureDatabase().then(function(f) { + fixture = f; + geocoder = createGeocoder({ + database: fixture.databasePath, + reverseMode: 'boundary', + boundary: { + basePrecision: 4, + maxPrecision: 7 + } + }); + done(); + }); + }); + + afterAll(() => { + fixture.cleanup(); + }); + + it('chooses containing locality polygons instead of nearest centroids', (done) => { + geocoder.reverse(0, -0.5) + .then(function(result) { + expect(result.id).toEqual(9100001); + expect(result.name).toEqual('Westville'); + done(); + }); + }); + + it('uses deterministic tie-breakers and ignores neighbourhood placetypes', (done) => { + geocoder.reverse(0.1, 0.1) + .then(function(result) { + expect(result.id).toEqual(9100003); + expect(result.name).toEqual('Centerville'); + done(); + }); + }); + + it('falls back to nearest boundary centroid when no polygon contains the point', (done) => { + geocoder.reverse(0, 1.5) + .then(function(result) { + expect(result.id).toEqual(9100002); + expect(result.name).toEqual('Eastville'); + done(); + }); + }); +}); diff --git a/spec/reverse_spec.js b/spec/reverse_spec.js index a5c40e0..1708f91 100644 --- a/spec/reverse_spec.js +++ b/spec/reverse_spec.js @@ -1,6 +1,21 @@ -const geocoder = require('../src/index.js')(); +const createGeocoder = require('../src/index.js'); +const fixtureDb = require('./helpers/fixture_db'); describe('geocoder.reverse', () => { + var fixture, geocoder; + + beforeAll((done) => { + fixtureDb.createFixtureDatabase().then(function(f) { + fixture = f; + 
geocoder = createGeocoder({ database: fixture.databasePath }); + done(); + }); + }); + + afterAll(() => { + fixture.cleanup(); + }); + it('performs reverse geocoding on a latitude and longitude', (done) => { geocoder.reverse(41.89, 12.49) .then(function(result) { @@ -17,7 +32,7 @@ describe('geocoder.reverse', () => { }); it("resolves an empty object when a location can't be found", (done) => { - geocoder.reverse(0, 0) + geocoder.reverse(80, 80) .then(function(result) { expect(result).toEqual({}); done(); diff --git a/spec/schema_spec.js b/spec/schema_spec.js new file mode 100644 index 0000000..5aba1da --- /dev/null +++ b/spec/schema_spec.js @@ -0,0 +1,65 @@ +const sqlite3 = require('sqlite3'); +const fixtureDb = require('./helpers/fixture_db'); + +describe('generated schema', () => { + var fixture, db; + + beforeAll((done) => { + fixtureDb.createFixtureDatabase().then(function(f) { + fixture = f; + db = new sqlite3.Database(fixture.databasePath); + done(); + }); + }); + + afterAll((done) => { + db.close(function() { + fixture.cleanup(); + done(); + }); + }); + + it('has asciiname and population in the everything view', (done) => { + db.all('PRAGMA table_info(everything)', [], function(err, cols) { + var names = cols.map(function(c) { return c.name }); + expect(names).toContain('asciiname'); + expect(names).toContain('population'); + done(); + }); + }); + + it('creates indexes for reverse and forward lookups', (done) => { + db.all("PRAGMA index_list('coordinates')", [], function(err, coordIndexes) { + db.all("PRAGMA index_list('features')", [], function(err, featIndexes) { + db.all("PRAGMA index_list('place_geohash_cover')", [], function(err, boundaryIndexes) { + db.all("PRAGMA index_list('place_geohash_lookup')", [], function(err, compactIndexes) { + var coordNames = coordIndexes.map(function(i) { return i.name }); + var featNames = featIndexes.map(function(i) { return i.name }); + var boundaryNames = boundaryIndexes.map(function(i) { return i.name }); + var 
compactNames = compactIndexes.map(function(i) { return i.name }); + + expect(coordNames).toContain('coordinates_lat_lng'); + expect(featNames).toContain('features_name_nocase'); + expect(featNames).toContain('features_asciiname_nocase'); + expect(featNames).toContain('features_population_desc'); + expect(boundaryNames).toContain('place_geohash_cover_hash_precision'); + expect(boundaryNames).toContain('place_geohash_cover_place_id'); + expect(compactNames).toContain('place_geohash_lookup_place_id'); + done(); + }); + }); + }); + }); + }); + + it('includes boundary lookup tables', (done) => { + db.all("SELECT name FROM sqlite_master WHERE type='table' AND name IN ('places', 'place_geohash_cover', 'place_geometry', 'place_geohash_lookup')", [], function(err, rows) { + var names = rows.map(function(row) { return row.name }); + expect(names).toContain('places'); + expect(names).toContain('place_geohash_cover'); + expect(names).toContain('place_geometry'); + expect(names).toContain('place_geohash_lookup'); + done(); + }); + }); +}); diff --git a/src/boundary_cover.js b/src/boundary_cover.js new file mode 100644 index 0000000..dc8ee86 --- /dev/null +++ b/src/boundary_cover.js @@ -0,0 +1,99 @@ +"use strict"; + +const geohash = require('./geohash') +const geometry = require('./geometry') + +const EPSILON = 1e-12 + +function clampLatitude(value) { + var lat = Number(value) + if (lat > 90) lat = 90 + if (lat < -90) lat = -90 + return lat +} + +function clampLongitude(value) { + var lon = Number(value) + if (lon > 180) lon = 180 + if (lon < -180) lon = -180 + return lon +} + +function seedGeohashesForBbox(bbox, precision) { + var size = geohash.cellSize(precision) + var minLat = clampLatitude(bbox.minLat) + var maxLat = clampLatitude(bbox.maxLat - EPSILON) + var minLon = clampLongitude(bbox.minLon) + var maxLon = clampLongitude(bbox.maxLon - EPSILON) + + var latStart = Math.floor((minLat + 90) / size.lat) + var latEnd = Math.floor((maxLat + 90) / size.lat) + var lonStart = 
Math.floor((minLon + 180) / size.lon) + var lonEnd = Math.floor((maxLon + 180) / size.lon) + + var hashes = Object.create(null) + + for (var latIndex = latStart; latIndex <= latEnd; latIndex++) { + var centerLat = -90 + (latIndex + 0.5) * size.lat + for (var lonIndex = lonStart; lonIndex <= lonEnd; lonIndex++) { + var centerLon = -180 + (lonIndex + 0.5) * size.lon + hashes[geohash.encode(centerLat, centerLon, precision)] = true + } + } + + return Object.keys(hashes) +} + +function buildGeohashCoverForGeometry(inputGeometry, options) { + var opts = options || {} + var basePrecision = Number(opts.basePrecision || 4) + var maxPrecision = Number(opts.maxPrecision || 7) + + if (basePrecision < 1) basePrecision = 1 + if (maxPrecision < basePrecision) maxPrecision = basePrecision + + var normalized = geometry.normalizeGeometry(inputGeometry) + var bounds = geometry.geometryBbox(normalized) + var seeds = seedGeohashesForBbox(bounds, basePrecision) + var terminal = Object.create(null) + + function walk(hash, precision) { + var cellBbox = geohash.decodeBbox(hash) + var status = geometry.classifyCell(normalized, cellBbox) + + if (status === 'outside') { + return + } + + if (status === 'partial' && precision < maxPrecision) { + geohash.children(hash).forEach(function(child) { + walk(child, precision + 1) + }) + return + } + + terminal[hash + '|' + precision] = { + geohash: hash, + precision: precision, + coverageType: status + } + } + + seeds.forEach(function(hash) { + walk(hash, basePrecision) + }) + + return Object.keys(terminal) + .map(function(key) { return terminal[key] }) + .sort(function(a, b) { + if (a.precision !== b.precision) return a.precision - b.precision + if (a.geohash < b.geohash) return -1 + if (a.geohash > b.geohash) return 1 + return 0 + }) +} + +module.exports = { + buildGeohashCoverForGeometry: buildGeohashCoverForGeometry, + seedGeohashesForBbox: seedGeohashesForBbox +} diff --git a/src/expo.js b/src/expo.js new file mode 100644 index 0000000..183d86f 
--- /dev/null +++ b/src/expo.js @@ -0,0 +1,108 @@ +"use strict"; + +const reverse = require('./reverse') +const forward = require('./forward') +const findLocation = require('./location').find + +function normalizeReverseMode(options) { + var mode = options.reverseMode + if (mode === undefined && options.reverse && options.reverse.mode) { + mode = options.reverse.mode + } + + var normalized = String(mode || 'centroid').toLowerCase() + return normalized === 'boundary' ? 'boundary' : 'centroid' +} + +function resolveBoundaryOptions(options) { + var boundary = options.boundary || {} + + var basePrecision = Number(boundary.basePrecision) + if (!Number.isFinite(basePrecision) || basePrecision < 1) { + basePrecision = 4 + } + + var maxPrecision = Number(boundary.maxPrecision) + if (!Number.isFinite(maxPrecision) || maxPrecision < basePrecision) { + maxPrecision = 7 + } + + return { + basePrecision: basePrecision, + maxPrecision: maxPrecision + } +} + +// Wraps an expo-sqlite database to match the node-sqlite3 callback +// interface that reverse.js, forward.js and location.js expect. 
+function wrapExpoDb(expoDb) { + return { + all: function(sql, params, callback) { + expoDb.getAllAsync(sql, params || []) + .then(function(rows) { callback(null, rows) }) + .catch(function(err) { callback(err) }) + }, + close: function(callback) { + if (typeof expoDb.closeAsync === 'function') { + expoDb.closeAsync() + .then(function() { if (callback) callback(null) }) + .catch(function(err) { if (callback) callback(err) }) + } else if (callback) { + callback(null) + } + } + } +} + +function ExpoGeocoder(options) { + var opts = options || {} + var expoDb = opts.db || opts.database + + if (!expoDb || typeof expoDb.getAllAsync !== 'function') { + throw new Error('Pass an opened expo-sqlite db via { db }.') + } + + this.db = wrapExpoDb(expoDb) + this.options = opts + this.reverseMode = normalizeReverseMode(opts) + this.reverseDebug = Boolean(opts.reverseDebug) + this.boundaryOptions = resolveBoundaryOptions(opts) +} + +ExpoGeocoder.prototype.reverse = function(latitude, longitude, callback) { + return reverse(this, latitude, longitude, callback) +} + +ExpoGeocoder.prototype.forward = function(query, callback) { + return forward(this, query, callback) +} + +ExpoGeocoder.prototype.location = function() { + const _this = this + + return { + find: function(locationId) { + return findLocation(_this, locationId) + } + } +} + +function createExpoGeocoder(options) { + var instance = new ExpoGeocoder(options) + + var locationFn = function() { + return { + find: function(locationId) { + return findLocation(instance, locationId) + } + } + } + locationFn.find = function(locationId) { + return findLocation(instance, locationId) + } + instance.location = locationFn + + return instance +} + +module.exports = createExpoGeocoder; diff --git a/src/forward.js b/src/forward.js new file mode 100644 index 0000000..8203d97 --- /dev/null +++ b/src/forward.js @@ -0,0 +1,131 @@ +"use strict"; + +const formatLocation = require('./location').format + +// Forward geocoding: tries an exact match 
on name/asciiname first, then +// falls back to prefix and substring matching. +// +// Requires the updated schema with asciiname and population columns. +// Returns undefined on databases without those columns. +function findByName(geocoder, query, callback) { + return new Promise(function(resolve, reject) { + var q = typeof query === 'string' ? query.trim() : '' + if (!q) { + if (typeof(callback) == 'function') { + callback(undefined, undefined) + } else { + resolve(undefined) + } + return + } + + // Check if the database supports forward search (cached per geocoder) + if (geocoder._forwardSupported === false) { + if (typeof(callback) == 'function') { + callback(undefined, undefined) + } else { + resolve(undefined) + } + return + } + + function doSearch() { + var exactQuery = `SELECT * FROM everything + WHERE name = ? COLLATE NOCASE OR asciiname = ? COLLATE NOCASE + ORDER BY + CASE WHEN name = ? COLLATE NOCASE THEN 0 + WHEN asciiname = ? COLLATE NOCASE THEN 1 + ELSE 2 END, + population DESC, id ASC + LIMIT 1` + + geocoder.db.all(exactQuery, [q, q, q, q], function(err, rows) { + if (err) { + if (typeof(callback) == 'function') { + callback(err, undefined) + } else if (typeof(reject) == 'function') { + reject(err) + } + return + } + + if (rows && rows[0]) { + const result = formatLocation(rows[0]) + if (typeof(callback) == 'function') { + callback(undefined, result) + } else { + resolve(result) + } + return + } + + // Fall back to prefix / substring match + var prefix = q + '%' + var contains = '%' + q + '%' + var fuzzyQuery = `SELECT * FROM everything + WHERE name LIKE ? COLLATE NOCASE + OR name LIKE ? COLLATE NOCASE + OR asciiname LIKE ? COLLATE NOCASE + OR asciiname LIKE ? COLLATE NOCASE + ORDER BY + CASE WHEN name LIKE ? COLLATE NOCASE THEN 0 + WHEN asciiname LIKE ? 
COLLATE NOCASE THEN 1 + ELSE 2 END, + population DESC, LENGTH(name) ASC, id ASC + LIMIT 1` + + geocoder.db.all(fuzzyQuery, [prefix, contains, prefix, contains, prefix, prefix], function(err, rows) { + if (err) { + if (typeof(callback) == 'function') { + callback(err, undefined) + } else if (typeof(reject) == 'function') { + reject(err) + } + } else { + const result = formatResult(rows) + if (typeof(callback) == 'function') { + callback(undefined, result) + } else { + resolve(result) + } + } + }) + }) + } + + if (geocoder._forwardSupported === true) { + doSearch() + return + } + + // Probe for the asciiname column (first call only) + geocoder.db.all('SELECT asciiname FROM everything LIMIT 0', [], function(err) { + geocoder._forwardSupported = !err + if (err) { + if (typeof(callback) == 'function') { + callback(undefined, undefined) + } else { + resolve(undefined) + } + } else { + doSearch() + } + }) + }) +} + +function formatResult(rows) { + const row = rows[0] + + if (row === undefined) { + return undefined + } else { + return formatLocation(row) + } +} + +function Forward(geocoder, query, callback) { + return findByName(geocoder, query, callback) +} + +module.exports = Forward; diff --git a/src/geohash.js b/src/geohash.js new file mode 100644 index 0000000..efae700 --- /dev/null +++ b/src/geohash.js @@ -0,0 +1,158 @@ +"use strict"; + +const BASE32 = '0123456789bcdefghjkmnpqrstuvwxyz' +const BASE32_MAP = Object.create(null) +for (var i = 0; i < BASE32.length; i++) { + BASE32_MAP[BASE32.charAt(i)] = i +} + +function normalizeLatitude(value) { + var latitude = Number(value) + if (Number.isNaN(latitude)) { + latitude = 0 + } + if (latitude > 90) latitude = 90 + if (latitude < -90) latitude = -90 + return latitude +} + +function normalizeLongitude(value) { + var longitude = Number(value) + if (Number.isNaN(longitude)) { + longitude = 0 + } + + while (longitude < -180) longitude += 360 + while (longitude > 180) longitude -= 360 + + if (longitude === 180) { + longitude = 
179.99999999999997 + } + + return longitude +} + +function encode(latitude, longitude, precision) { + var targetPrecision = Number(precision) + if (!targetPrecision || targetPrecision < 1) { + targetPrecision = 1 + } + + var lat = normalizeLatitude(latitude) + var lon = normalizeLongitude(longitude) + + var latMin = -90 + var latMax = 90 + var lonMin = -180 + var lonMax = 180 + var hash = '' + var bit = 0 + var ch = 0 + var evenBit = true + + while (hash.length < targetPrecision) { + if (evenBit) { + var lonMid = (lonMin + lonMax) / 2 + if (lon >= lonMid) { + ch = (ch << 1) + 1 + lonMin = lonMid + } else { + ch = (ch << 1) + lonMax = lonMid + } + } else { + var latMid = (latMin + latMax) / 2 + if (lat >= latMid) { + ch = (ch << 1) + 1 + latMin = latMid + } else { + ch = (ch << 1) + latMax = latMid + } + } + + evenBit = !evenBit + bit += 1 + + if (bit === 5) { + hash += BASE32.charAt(ch) + bit = 0 + ch = 0 + } + } + + return hash +} + +function decodeBbox(hash) { + var value = String(hash || '').toLowerCase() + var latMin = -90 + var latMax = 90 + var lonMin = -180 + var lonMax = 180 + var evenBit = true + + for (var i = 0; i < value.length; i++) { + var ch = value.charAt(i) + if (BASE32_MAP[ch] === undefined) { + throw new Error('Invalid geohash character: ' + ch) + } + + var current = BASE32_MAP[ch] + for (var mask = 16; mask > 0; mask >>= 1) { + if (evenBit) { + var lonMid = (lonMin + lonMax) / 2 + if (current & mask) { + lonMin = lonMid + } else { + lonMax = lonMid + } + } else { + var latMid = (latMin + latMax) / 2 + if (current & mask) { + latMin = latMid + } else { + latMax = latMid + } + } + + evenBit = !evenBit + } + } + + return { + minLat: latMin, + minLon: lonMin, + maxLat: latMax, + maxLon: lonMax + } +} + +function cellSize(precision) { + var p = Math.max(1, Number(precision) || 1) + var totalBits = p * 5 + var lonBits = Math.ceil(totalBits / 2) + var latBits = Math.floor(totalBits / 2) + + return { + lat: 180 / Math.pow(2, latBits), + lon: 360 / 
Math.pow(2, lonBits) + } +} + +function children(hash) { + var prefix = String(hash || '') + var values = [] + for (var i = 0; i < BASE32.length; i++) { + values.push(prefix + BASE32.charAt(i)) + } + return values +} + +module.exports = { + encode: encode, + decodeBbox: decodeBbox, + cellSize: cellSize, + children: children, + base32: BASE32 +} diff --git a/src/geometry.js b/src/geometry.js new file mode 100644 index 0000000..9136a17 --- /dev/null +++ b/src/geometry.js @@ -0,0 +1,411 @@ +"use strict"; + +const EPSILON = 1e-12 + +function markNormalized(value) { + Object.defineProperty(value, '__normalized', { + value: true, + enumerable: false, + configurable: true, + writable: false + }) + return value +} + +function setCachedBbox(value, bbox) { + Object.defineProperty(value, '__bbox', { + value: bbox, + enumerable: false, + configurable: true, + writable: true + }) +} + +function closeRing(ring) { + if (!Array.isArray(ring) || ring.length === 0) return [] + + var normalized = ring.map(function(point) { + return [Number(point[0]), Number(point[1])] + }) + + var first = normalized[0] + var last = normalized[normalized.length - 1] + if (first[0] !== last[0] || first[1] !== last[1]) { + normalized.push([first[0], first[1]]) + } + + return normalized +} + +function normalizeGeometry(geometry) { + if (!geometry || !geometry.type || !geometry.coordinates) { + throw new Error('Invalid geometry payload') + } + + if (geometry.type === 'MultiPolygon' && geometry.__normalized) { + return geometry + } + + if (geometry.type === 'Polygon') { + return markNormalized({ + type: 'MultiPolygon', + coordinates: [geometry.coordinates.map(closeRing)] + }) + } + + if (geometry.type === 'MultiPolygon') { + return markNormalized({ + type: 'MultiPolygon', + coordinates: geometry.coordinates.map(function(polygon) { + return polygon.map(closeRing) + }) + }) + } + + throw new Error('Unsupported geometry type: ' + geometry.type) +} + +function geometryBbox(geometry) { + var normalized = 
normalizeGeometry(geometry) + if (normalized.__bbox) { + return normalized.__bbox + } + + var minLat = Infinity + var minLon = Infinity + var maxLat = -Infinity + var maxLon = -Infinity + + normalized.coordinates.forEach(function(polygon) { + polygon.forEach(function(ring) { + ring.forEach(function(point) { + var lon = Number(point[0]) + var lat = Number(point[1]) + + if (lat < minLat) minLat = lat + if (lat > maxLat) maxLat = lat + if (lon < minLon) minLon = lon + if (lon > maxLon) maxLon = lon + }) + }) + }) + + var bbox = { + minLat: minLat, + minLon: minLon, + maxLat: maxLat, + maxLon: maxLon + } + + setCachedBbox(normalized, bbox) + return bbox +} + +function signedRingArea(ring) { + var area = 0 + for (var i = 0; i < ring.length - 1; i++) { + var current = ring[i] + var next = ring[i + 1] + area += (current[0] * next[1]) - (next[0] * current[1]) + } + return area / 2 +} + +function geometryArea(geometry) { + var normalized = normalizeGeometry(geometry) + var total = 0 + + normalized.coordinates.forEach(function(polygon) { + if (!polygon[0] || polygon[0].length < 4) return + + var polygonArea = Math.abs(signedRingArea(polygon[0])) + for (var i = 1; i < polygon.length; i++) { + polygonArea -= Math.abs(signedRingArea(polygon[i])) + } + + total += Math.max(0, polygonArea) + }) + + return total +} + +function almostEqual(a, b) { + return Math.abs(a - b) <= EPSILON +} + +function pointOnSegment(point, a, b) { + var sqLen = (b[0] - a[0]) * (b[0] - a[0]) + (b[1] - a[1]) * (b[1] - a[1]) + if (sqLen <= EPSILON) { + return almostEqual(point[0], a[0]) && almostEqual(point[1], a[1]) + } + + var cross = (point[1] - a[1]) * (b[0] - a[0]) - (point[0] - a[0]) * (b[1] - a[1]) + if (Math.abs(cross) > EPSILON) { + return false + } + + var dot = (point[0] - a[0]) * (b[0] - a[0]) + (point[1] - a[1]) * (b[1] - a[1]) + if (dot < -EPSILON) { + return false + } + + if (dot - sqLen > EPSILON) { + return false + } + + return true +} + +function pointInRing(point, ring) { + if (!ring || 
ring.length < 4) return false + + var inside = false + var last = ring.length - 1 + + for (var i = 0, j = last - 1; i < last; j = i++) { + var a = ring[i] + var b = ring[j] + + if (pointOnSegment(point, a, b)) { + return true + } + + var yi = a[1] + var yj = b[1] + var xi = a[0] + var xj = b[0] + + var intersects = ((yi > point[1]) !== (yj > point[1])) && + (point[0] < (xj - xi) * (point[1] - yi) / ((yj - yi) || EPSILON) + xi) + + if (intersects) inside = !inside + } + + return inside +} + +function pointInPolygon(point, polygon) { + if (!polygon[0] || !pointInRing(point, polygon[0])) { + return false + } + + for (var i = 1; i < polygon.length; i++) { + if (pointInRing(point, polygon[i])) { + return false + } + } + + return true +} + +function pointInGeometry(geometry, latitude, longitude) { + var normalized = normalizeGeometry(geometry) + var point = [Number(longitude), Number(latitude)] + + for (var i = 0; i < normalized.coordinates.length; i++) { + if (pointInPolygon(point, normalized.coordinates[i])) { + return true + } + } + + return false +} + +function bboxContainsPoint(bbox, latitude, longitude) { + return Number(latitude) >= bbox.minLat && Number(latitude) <= bbox.maxLat && + Number(longitude) >= bbox.minLon && Number(longitude) <= bbox.maxLon +} + +function bboxIntersects(a, b) { + return !(a.maxLon < b.minLon || a.minLon > b.maxLon || a.maxLat < b.minLat || a.minLat > b.maxLat) +} + +function bboxContainsBbox(outer, inner) { + return outer.minLat <= inner.minLat && + outer.minLon <= inner.minLon && + outer.maxLat >= inner.maxLat && + outer.maxLon >= inner.maxLon +} + +function orientation(a, b, c) { + var value = (b[1] - a[1]) * (c[0] - b[0]) - (b[0] - a[0]) * (c[1] - b[1]) + if (almostEqual(value, 0)) return 0 + return value > 0 ? 
1 : 2 +} + +function segmentsIntersect(a, b, c, d) { + var o1 = orientation(a, b, c) + var o2 = orientation(a, b, d) + var o3 = orientation(c, d, a) + var o4 = orientation(c, d, b) + + if (o1 !== o2 && o3 !== o4) { + return true + } + + if (o1 === 0 && pointOnSegment(c, a, b)) return true + if (o2 === 0 && pointOnSegment(d, a, b)) return true + if (o3 === 0 && pointOnSegment(a, c, d)) return true + if (o4 === 0 && pointOnSegment(b, c, d)) return true + return false +} + +function segmentIntersectsRect(a, b, rect) { + var rectPoints = [ + [rect.minLon, rect.minLat], + [rect.maxLon, rect.minLat], + [rect.maxLon, rect.maxLat], + [rect.minLon, rect.maxLat] + ] + + if (bboxContainsPoint(rect, a[1], a[0]) || bboxContainsPoint(rect, b[1], b[0])) { + return true + } + + for (var i = 0; i < rectPoints.length; i++) { + var p1 = rectPoints[i] + var p2 = rectPoints[(i + 1) % rectPoints.length] + if (segmentsIntersect(a, b, p1, p2)) { + return true + } + } + + return false +} + +function pointOnRectBoundary(rect, point) { + var lon = point[0] + var lat = point[1] + + var onVertical = (almostEqual(lon, rect.minLon) || almostEqual(lon, rect.maxLon)) && + lat >= rect.minLat - EPSILON && lat <= rect.maxLat + EPSILON + var onHorizontal = (almostEqual(lat, rect.minLat) || almostEqual(lat, rect.maxLat)) && + lon >= rect.minLon - EPSILON && lon <= rect.maxLon + EPSILON + + return onVertical || onHorizontal +} + +function anyVertexInsideRect(geometry, rect, includeBoundary) { + var normalized = normalizeGeometry(geometry) + + for (var i = 0; i < normalized.coordinates.length; i++) { + var polygon = normalized.coordinates[i] + for (var j = 0; j < polygon.length; j++) { + var ring = polygon[j] + for (var k = 0; k < ring.length; k++) { + var point = ring[k] + if (bboxContainsPoint(rect, point[1], point[0])) { + if (!includeBoundary && pointOnRectBoundary(rect, point)) { + continue + } + return true + } + } + } + } + + return false +} + +function anyEdgeIntersectsRect(geometry, rect) { + 
var normalized = normalizeGeometry(geometry) + + for (var i = 0; i < normalized.coordinates.length; i++) { + var polygon = normalized.coordinates[i] + for (var j = 0; j < polygon.length; j++) { + var ring = polygon[j] + for (var k = 0; k < ring.length - 1; k++) { + if (segmentIntersectsRect(ring[k], ring[k + 1], rect)) { + return true + } + } + } + } + + return false +} + +function classifyCell(geometry, cellBbox) { + var bounds = geometryBbox(geometry) + if (!bboxIntersects(bounds, cellBbox)) { + return 'outside' + } + + var corners = [ + [cellBbox.minLon, cellBbox.minLat], + [cellBbox.maxLon, cellBbox.minLat], + [cellBbox.maxLon, cellBbox.maxLat], + [cellBbox.minLon, cellBbox.maxLat] + ] + + var cornersInside = 0 + for (var i = 0; i < corners.length; i++) { + if (pointInGeometry(geometry, corners[i][1], corners[i][0])) { + cornersInside += 1 + } + } + + var centerLat = (cellBbox.minLat + cellBbox.maxLat) / 2 + var centerLon = (cellBbox.minLon + cellBbox.maxLon) / 2 + var centerInside = pointInGeometry(geometry, centerLat, centerLon) + var hasInnerVertex = anyVertexInsideRect(geometry, cellBbox, false) + + if (cornersInside === 4 && centerInside && !hasInnerVertex) { + return 'full' + } + + if (anyEdgeIntersectsRect(geometry, cellBbox)) { + return 'partial' + } + + if (anyVertexInsideRect(geometry, cellBbox, true)) { + return 'partial' + } + + if (cornersInside > 0) { + return 'partial' + } + + if (centerInside) { + return 'partial' + } + + return 'outside' +} + +function geometryContainsGeometry(containerGeometry, candidateGeometry) { + var container = normalizeGeometry(containerGeometry) + var candidate = normalizeGeometry(candidateGeometry) + + if (!bboxContainsBbox(geometryBbox(container), geometryBbox(candidate))) { + return false + } + + for (var i = 0; i < candidate.coordinates.length; i++) { + var polygon = candidate.coordinates[i] + for (var j = 0; j < polygon.length; j++) { + var ring = polygon[j] + var limit = ring.length > 1 ? 
ring.length - 1 : ring.length + for (var k = 0; k < limit; k++) { + var point = ring[k] + if (!pointInGeometry(container, point[1], point[0])) { + return false + } + } + } + } + + return true +} + +module.exports = { + normalizeGeometry: normalizeGeometry, + geometryBbox: geometryBbox, + geometryArea: geometryArea, + pointInGeometry: pointInGeometry, + bboxContainsPoint: bboxContainsPoint, + bboxIntersects: bboxIntersects, + bboxContainsBbox: bboxContainsBbox, + classifyCell: classifyCell, + geometryContainsGeometry: geometryContainsGeometry +} diff --git a/src/index.js b/src/index.js index 9ba44e7..4d78832 100644 --- a/src/index.js +++ b/src/index.js @@ -1,25 +1,74 @@ "use strict"; const path = require('path') -const sqlite3 = require('sqlite3').verbose() const reverse = require('./reverse') +const forward = require('./forward') const findLocation = require('./location').find +function normalizeReverseMode(options) { + var mode = options.reverseMode + if (mode === undefined && options.reverse && options.reverse.mode) { + mode = options.reverse.mode + } + + var normalized = String(mode || 'centroid').toLowerCase() + return normalized === 'boundary' ? 
'boundary' : 'centroid' +} + +function resolveBoundaryOptions(options) { + var boundary = options.boundary || {} + + var basePrecision = Number(boundary.basePrecision) + if (!Number.isFinite(basePrecision) || basePrecision < 1) { + basePrecision = 4 + } + + var maxPrecision = Number(boundary.maxPrecision) + if (!Number.isFinite(maxPrecision) || maxPrecision < basePrecision) { + maxPrecision = 7 + } + + return { + basePrecision: basePrecision, + maxPrecision: maxPrecision + } +} + function Geocoder(options) { var geocoder = function(options) { this.options = options || {} + this.reverseMode = normalizeReverseMode(this.options) + this.reverseDebug = Boolean(this.options.reverseDebug) + this.boundaryOptions = resolveBoundaryOptions(this.options) - if (this.options.database === undefined) { - this.options.database = path.join(__filename, '../../data/db.sqlite') - } + if (this.options.db) { + // Accept a pre-opened database object (must have .all(sql, params, cb)) + this.db = this.options.db + } else { + var sqlite3 + try { + sqlite3 = (this.options.sqlite3 || require('sqlite3')).verbose() + } catch (err) { + err.message = 'sqlite3 is required for Node usage. Install it with `npm install sqlite3`.' 
+ throw err + } - this.db = new sqlite3.Database(this.options.database) + if (this.options.database === undefined) { + this.options.database = path.join(__dirname, '../data/db.sqlite') + } + + this.db = new sqlite3.Database(this.options.database) + } } geocoder.prototype.reverse = function(latitude, longitude, callback) { return reverse(this, latitude, longitude, callback) } + geocoder.prototype.forward = function(query, callback) { + return forward(this, query, callback) + } + geocoder.prototype.location = function() { const _this = this @@ -30,7 +79,22 @@ function Geocoder(options) { } } - return new geocoder(options) + var instance = new geocoder(options) + + // Also support geocoder.location.find(id) without calling location() + var locationFn = function() { + return { + find: function(locationId) { + return findLocation(instance, locationId) + } + } + } + locationFn.find = function(locationId) { + return findLocation(instance, locationId) + } + instance.location = locationFn + + return instance } module.exports = Geocoder; diff --git a/src/location.js b/src/location.js index e24affe..1d4d097 100644 --- a/src/location.js +++ b/src/location.js @@ -1,25 +1,23 @@ "use strict"; +function normalizeId(value) { + if (typeof value === 'string') { + var match = /^geonames:(\d+)$/i.exec(value.trim()) + if (match) return Number(match[1]) + return Number(value) + } + return value +} + function find(geocoder, locationId) { return new Promise(function(resolve, reject) { - const query = `SELECT * FROM everything WHERE id = $id LIMIT 1` + const query = `SELECT * FROM everything WHERE id = ? 
LIMIT 1` - geocoder.db.all(query, { - $id: locationId - }, function(err, rows) { + geocoder.db.all(query, [normalizeId(locationId)], function(err, rows) { if (err) { - if (typeof(callback) == 'function') { - callback(err, undefined) - } else if (typeof(reject) == 'function') { - reject(err) - } + reject(err) } else { - const result = formatResult(rows) - if (typeof(callback) == 'function') { - callback(undefined, result) - } else if (typeof(resolve) == 'function') { - resolve(result) - } + resolve(formatResult(rows)) } }) }) diff --git a/src/reverse.js b/src/reverse.js index db92cd9..6d89272 100644 --- a/src/reverse.js +++ b/src/reverse.js @@ -1,68 +1,664 @@ "use strict"; const formatLocation = require('./location').format +const geohash = require('./geohash') +const geometry = require('./geometry') -// This finds the closest feature based upon Pythagoras's theorem. It is an -// approximation, and won't provide results as accurate as the haversine -// formula, but trades that for performance. For our use case this is good -// enough as the data is just an approximation of the centre point of a -// feature. -// -// The scale parameter accounts for the fact that 1 degree in longitude is -// different at the poles vs the equator. 
-// -// Based upon http://stackoverflow.com/a/7261601/155715 -function findFeature(geocoder, latitude, longitude, callback) { +const SUPPORTED_PLACETYPES = ['locality', 'localadmin', 'region', 'county'] +const SUPPORTED_PLACETYPE_CODES = [0, 1, 2, 3] +const PLACEHOLDER_EMPTY = {} + +function dbAll(geocoder, query, params) { return new Promise(function(resolve, reject) { - const query = `SELECT * FROM everything WHERE id IN ( - SELECT feature_id - FROM coordinates - WHERE latitude BETWEEN $lat - 1.5 AND $lat + 1.5 - AND longitude BETWEEN $lon - 1.5 AND $lon + 1.5 - ORDER BY ( - ($lat - latitude) * ($lat - latitude) + - ($lon - longitude) * ($lon - longitude) * $scale - ) ASC - LIMIT 1 - )` - - const scale = Math.pow(Math.cos(latitude * Math.PI / 180), 2) - - geocoder.db.all(query, { - $lat: latitude, - $lon: longitude, - $scale: scale - }, function(err, rows) { - if (err) { - if (typeof(callback) == 'function') { - callback(err, undefined) - } else if (typeof(reject) == 'function') { - reject(err) + geocoder.db.all(query, params || [], function(err, rows) { + if (err) reject(err) + else resolve(rows || []) + }) + }) +} + +function pointDistanceScore(latitude, longitude, row) { + var lat = Number(latitude) + var lon = Number(longitude) + var targetLat = Number(row.latitude) + var targetLon = Number(row.longitude) + var scale = Math.pow(Math.cos(lat * Math.PI / 180), 2) + + return ((lat - targetLat) * (lat - targetLat)) + + ((lon - targetLon) * (lon - targetLon) * scale) +} + +function placetypeRank(value) { + if (value === 'locality') return 0 + if (value === 'localadmin') return 1 + if (value === 'region') return 2 + return 3 +} + +function formatRow(row) { + if (!row) return PLACEHOLDER_EMPTY + return formatLocation(row) +} + +function executeWithCallback(promise, callback) { + if (typeof callback !== 'function') { + return promise + } + + promise.then(function(result) { + callback(undefined, result) + }).catch(function(err) { + callback(err, undefined) + }) + + 
return promise +} + +function findLegacyCentroidRow(geocoder, latitude, longitude) { + var query = `SELECT * FROM everything WHERE id IN ( + SELECT feature_id + FROM coordinates + WHERE latitude BETWEEN ? - 1.5 AND ? + 1.5 + AND longitude BETWEEN ? - 1.5 AND ? + 1.5 + ORDER BY ( + (? - latitude) * (? - latitude) + + (? - longitude) * (? - longitude) * ? + ) ASC + LIMIT 1 + )` + + var scale = Math.pow(Math.cos(Number(latitude) * Math.PI / 180), 2) + + return dbAll(geocoder, query, [ + latitude, latitude, + longitude, longitude, + latitude, latitude, + longitude, longitude, + scale + ]).then(function(rows) { + return rows[0] + }) +} + +function getBoundarySchemaStatus(geocoder) { + if (geocoder._boundarySchemaStatus) { + return Promise.resolve(geocoder._boundarySchemaStatus) + } + + var query = ` + SELECT name + FROM sqlite_master + WHERE type='table' + AND name IN ('compact_places', 'compact_geohash_lookup', 'places', 'place_geohash_lookup', 'place_geohash_cover', 'place_geometry') + ` + + return dbAll(geocoder, query, []).then(function(rows) { + var names = Object.create(null) + rows.forEach(function(row) { + names[row.name] = true + }) + + var status = { + hasCompactV2: Boolean(names.compact_places && names.compact_geohash_lookup), + hasCompactLegacy: Boolean(names.places && names.place_geohash_lookup), + hasFull: Boolean(names.places && names.place_geohash_cover && names.place_geometry) + } + + geocoder._boundarySchemaStatus = status + return status + }).catch(function() { + var status = { hasCompactV2: false, hasCompactLegacy: false, hasFull: false } + geocoder._boundarySchemaStatus = status + return status + }) +} + +function reverseHashes(latitude, longitude, basePrecision, maxPrecision) { + var hashes = [] + for (var precision = maxPrecision; precision >= basePrecision; precision--) { + hashes.push({ + precision: precision, + geohash: geohash.encode(latitude, longitude, precision) + }) + } + return hashes +} + +function fetchCompactBoundaryMatchV2(geocoder, 
hashes) { + if (!hashes.length) { + return Promise.resolve(undefined) + } + + var placeholders = hashes.map(function() { return '?' }).join(',') + var params = hashes.map(function(hash) { return hash.geohash }) + var placetypePlaceholders = SUPPORTED_PLACETYPE_CODES.map(function() { return '?' }).join(', ') + params = params.concat(SUPPORTED_PLACETYPE_CODES) + + var query = ` + SELECT + l.geohash AS geohash, + p.id AS id, + p.name AS name, + p.country_id AS country_id, + p.country_id AS country_name, + p.admin1_id AS admin1_id, + COALESCE(a.name, '') AS admin1_name, + p.latitude AS latitude, + p.longitude AS longitude, + CASE p.placetype_code + WHEN 0 THEN 'locality' + WHEN 1 THEN 'localadmin' + WHEN 2 THEN 'region' + WHEN 3 THEN 'county' + ELSE 'region' + END AS placetype, + 0 AS priority_rank, + 0 AS area + FROM compact_geohash_lookup l + JOIN compact_places p ON p.id = l.place_id + LEFT JOIN compact_places a ON a.id = p.admin1_id AND a.placetype_code = 2 + WHERE l.geohash IN (${placeholders}) + AND p.placetype_code IN (${placetypePlaceholders}) + ORDER BY + LENGTH(l.geohash) DESC, + p.placetype_code ASC, + p.id ASC + LIMIT 1 + ` + + return dbAll(geocoder, query, params).then(function(rows) { + return rows[0] + }) +} + +function fetchCompactBoundaryMatchLegacy(geocoder, hashes) { + if (!hashes.length) { + return Promise.resolve(undefined) + } + + var placeholders = hashes.map(function() { return '?' }).join(',') + var params = hashes.map(function(hash) { return hash.geohash }) + var placetypePlaceholders = SUPPORTED_PLACETYPES.map(function() { return '?' 
}).join(', ') + params = params.concat(SUPPORTED_PLACETYPES) + + var query = ` + SELECT + l.geohash AS geohash, + p.id AS id, + p.name AS name, + p.country_id AS country_id, + COALESCE(c.name, p.country_name, p.country_id, '') AS country_name, + p.admin1_id AS admin1_id, + COALESCE(a.name, p.admin1_name, '') AS admin1_name, + p.centroid_lat AS latitude, + p.centroid_lon AS longitude, + p.placetype AS placetype, + p.priority_rank AS priority_rank, + p.area AS area + FROM place_geohash_lookup l + JOIN places p ON p.id = l.place_id + LEFT JOIN countries c ON c.id = p.country_id + LEFT JOIN admin1 a ON a.country_id = p.country_id AND a.id = p.admin1_id + WHERE l.geohash IN (${placeholders}) + AND p.placetype IN (${placetypePlaceholders}) + ORDER BY + LENGTH(l.geohash) DESC, + CASE p.placetype WHEN 'locality' THEN 0 WHEN 'localadmin' THEN 1 ELSE 2 END, + p.priority_rank ASC, + p.id ASC + LIMIT 1 + ` + + return dbAll(geocoder, query, params).then(function(rows) { + return rows[0] + }) +} + +function fetchBoundaryCandidates(geocoder, hashes) { + if (!hashes.length) { + return Promise.resolve([]) + } + + var clauses = [] + var params = [] + for (var i = 0; i < hashes.length; i++) { + clauses.push('(g.geohash = ? AND g.precision = ?)') + params.push(hashes[i].geohash) + params.push(hashes[i].precision) + } + var placetypePlaceholders = SUPPORTED_PLACETYPES.map(function() { return '?' 
}).join(', ') + + var query = ` + SELECT DISTINCT + p.id AS id, + p.name AS name, + p.country_id AS country_id, + COALESCE(c.name, p.country_name, p.country_id, '') AS country_name, + p.admin1_id AS admin1_id, + COALESCE(a.name, p.admin1_name, '') AS admin1_name, + p.centroid_lat AS latitude, + p.centroid_lon AS longitude, + p.placetype AS placetype, + p.priority_rank AS priority_rank, + p.area AS area, + p.bbox_min_lat AS bbox_min_lat, + p.bbox_min_lon AS bbox_min_lon, + p.bbox_max_lat AS bbox_max_lat, + p.bbox_max_lon AS bbox_max_lon + FROM place_geohash_cover g + JOIN places p ON p.id = g.place_id + LEFT JOIN countries c ON c.id = p.country_id + LEFT JOIN admin1 a ON a.country_id = p.country_id AND a.id = p.admin1_id + WHERE (${clauses.join(' OR ')}) + AND p.placetype IN (${placetypePlaceholders}) + ` + + params = params.concat(SUPPORTED_PLACETYPES) + return dbAll(geocoder, query, params) +} + +function hasPointInPlaceBbox(place, latitude, longitude) { + return geometry.bboxContainsPoint({ + minLat: Number(place.bbox_min_lat), + minLon: Number(place.bbox_min_lon), + maxLat: Number(place.bbox_max_lat), + maxLon: Number(place.bbox_max_lon) + }, latitude, longitude) +} + +function loadPlaceGeometries(geocoder, placeIds) { + if (!placeIds.length) { + return Promise.resolve(Object.create(null)) + } + + var cache = geocoder._boundaryGeometryCache + if (!cache) { + cache = Object.create(null) + geocoder._boundaryGeometryCache = cache + } + + var missing = [] + for (var i = 0; i < placeIds.length; i++) { + var key = String(placeIds[i]) + if (!cache[key]) { + missing.push(placeIds[i]) + } + } + + if (!missing.length) { + return Promise.resolve(cache) + } + + var placeholders = missing.map(function() { return '?' 
}).join(',') + var query = `SELECT place_id, encoding, geometry FROM place_geometry WHERE place_id IN (${placeholders})` + + return dbAll(geocoder, query, missing).then(function(rows) { + rows.forEach(function(row) { + var key = String(row.place_id) + if (cache[key]) { + return + } + + var raw = row.geometry + if (typeof Buffer !== 'undefined' && Buffer.isBuffer(raw)) { + raw = raw.toString('utf8') + } + + if (typeof raw !== 'string') { + raw = String(raw) + } + + try { + cache[key] = geometry.normalizeGeometry(JSON.parse(raw)) + } catch (err) { + cache[key] = null + } + }) + + return cache + }) +} + +function sortContainedPlaces(matches) { + return matches.sort(function(a, b) { + var typeRankA = placetypeRank(a.placetype) + var typeRankB = placetypeRank(b.placetype) + if (typeRankA !== typeRankB) { + return typeRankA - typeRankB + } + + var areaA = Number(a.area) + var areaB = Number(b.area) + if (!Number.isFinite(areaA)) areaA = Infinity + if (!Number.isFinite(areaB)) areaB = Infinity + if (areaA !== areaB) { + return areaA - areaB + } + + var priorityA = Number(a.priority_rank) + var priorityB = Number(b.priority_rank) + if (!Number.isFinite(priorityA)) priorityA = Number.MAX_SAFE_INTEGER + if (!Number.isFinite(priorityB)) priorityB = Number.MAX_SAFE_INTEGER + if (priorityA !== priorityB) { + return priorityA - priorityB + } + + var idA = String(a.id) + var idB = String(b.id) + if (idA < idB) return -1 + if (idA > idB) return 1 + return 0 + }) +} + +function pickNearest(rows, latitude, longitude) { + if (!rows.length) return undefined + + var sorted = rows.slice().sort(function(a, b) { + var scoreA = pointDistanceScore(latitude, longitude, a) + var scoreB = pointDistanceScore(latitude, longitude, b) + + if (scoreA !== scoreB) { + return scoreA - scoreB + } + + var typeRankA = placetypeRank(a.placetype) + var typeRankB = placetypeRank(b.placetype) + if (typeRankA !== typeRankB) { + return typeRankA - typeRankB + } + + var priorityA = Number(a.priority_rank) + var 
priorityB = Number(b.priority_rank) + if (!Number.isFinite(priorityA)) priorityA = Number.MAX_SAFE_INTEGER + if (!Number.isFinite(priorityB)) priorityB = Number.MAX_SAFE_INTEGER + if (priorityA !== priorityB) { + return priorityA - priorityB + } + + var idA = String(a.id) + var idB = String(b.id) + if (idA < idB) return -1 + if (idA > idB) return 1 + return 0 + }) + + return sorted[0] +} + +function fetchNearestBoundaryByRegion(geocoder, latitude, longitude, region) { + var where = [ + 'p.placetype IN (' + SUPPORTED_PLACETYPES.map(function() { return '?' }).join(', ') + ')' + ] + var params = SUPPORTED_PLACETYPES.slice() + + if (region && region.countryId) { + where.push('p.country_id = ?') + params.push(region.countryId) + } + + if (region && region.admin1Id !== undefined && region.admin1Id !== null) { + where.push('p.admin1_id = ?') + params.push(region.admin1Id) + } + + var scale = Math.pow(Math.cos(Number(latitude) * Math.PI / 180), 2) + + var query = ` + SELECT + p.id AS id, + p.name AS name, + p.country_id AS country_id, + COALESCE(c.name, p.country_name, p.country_id, '') AS country_name, + p.admin1_id AS admin1_id, + COALESCE(a.name, p.admin1_name, '') AS admin1_name, + p.centroid_lat AS latitude, + p.centroid_lon AS longitude, + p.placetype AS placetype, + p.priority_rank AS priority_rank, + p.area AS area + FROM places p + LEFT JOIN countries c ON c.id = p.country_id + LEFT JOIN admin1 a ON a.country_id = p.country_id AND a.id = p.admin1_id + WHERE ${where.join(' AND ')} + ORDER BY + ((? - p.centroid_lat) * (? - p.centroid_lat) + + (? - p.centroid_lon) * (? - p.centroid_lon) * ?) 
ASC, + CASE p.placetype WHEN 'locality' THEN 0 WHEN 'localadmin' THEN 1 ELSE 2 END, + p.priority_rank ASC, + p.id ASC + LIMIT 1 + ` + + params.push(latitude, latitude, longitude, longitude, scale) + + return dbAll(geocoder, query, params).then(function(rows) { + return rows[0] + }) +} + +function fetchNearestCompactByRegionV2(geocoder, latitude, longitude, region) { + var where = [ + 'p.placetype_code IN (' + SUPPORTED_PLACETYPE_CODES.map(function() { return '?' }).join(', ') + ')' + ] + var params = SUPPORTED_PLACETYPE_CODES.slice() + + if (region && region.countryId) { + where.push('p.country_id = ?') + params.push(region.countryId) + } + + if (region && region.admin1Id !== undefined && region.admin1Id !== null) { + where.push('p.admin1_id = ?') + params.push(region.admin1Id) + } + + var scale = Math.pow(Math.cos(Number(latitude) * Math.PI / 180), 2) + + var query = ` + SELECT + p.id AS id, + p.name AS name, + p.country_id AS country_id, + p.country_id AS country_name, + p.admin1_id AS admin1_id, + COALESCE(a.name, '') AS admin1_name, + p.latitude AS latitude, + p.longitude AS longitude, + CASE p.placetype_code + WHEN 0 THEN 'locality' + WHEN 1 THEN 'localadmin' + WHEN 2 THEN 'region' + WHEN 3 THEN 'county' + ELSE 'region' + END AS placetype, + 0 AS priority_rank, + 0 AS area + FROM compact_places p + LEFT JOIN compact_places a ON a.id = p.admin1_id AND a.placetype_code = 2 + WHERE ${where.join(' AND ')} + ORDER BY + ((? - p.latitude) * (? - p.latitude) + + (? - p.longitude) * (? - p.longitude) * ?) 
ASC, + p.placetype_code ASC, + p.id ASC + LIMIT 1 + ` + + params.push(latitude, latitude, longitude, longitude, scale) + + return dbAll(geocoder, query, params).then(function(rows) { + return rows[0] + }) +} + +function attachDebug(geocoder, payload, reason) { + if (!geocoder.reverseDebug || !payload || !Object.keys(payload).length) { + return payload + } + + var result = Object.assign({}, payload) + result._debug = { + mode: 'boundary', + reason: reason + } + return result +} + +function fallbackNearestBoundary(geocoder, latitude, longitude, mode) { + var fetchNearest = mode === 'compact_v2' ? fetchNearestCompactByRegionV2 : fetchNearestBoundaryByRegion + + return fetchNearest(geocoder, latitude, longitude, null) + .then(function(globalNearest) { + if (!globalNearest) { + return { + row: undefined, + reason: 'no_boundary_places' } - } else { - const result = formatResult(rows) - if (typeof(callback) == 'function') { - callback(undefined, result) - } else if (typeof(resolve) == 'function') { - resolve(result) + } + + return fetchNearest(geocoder, latitude, longitude, { + countryId: globalNearest.country_id, + admin1Id: globalNearest.admin1_id + }).then(function(regionalNearest) { + return { + row: regionalNearest || globalNearest, + reason: 'regional_centroid_fallback' + } + }) + }) +} + +function tryCompactBoundaryLookup(geocoder, latitude, longitude, hashes, mode) { + var fetchCompact = mode === 'compact_v2' ? 
fetchCompactBoundaryMatchV2 : fetchCompactBoundaryMatchLegacy + + return fetchCompact(geocoder, hashes).then(function(row) { + if (row) { + return { + row: row, + reason: 'geohash_lookup' + } + } + return { + row: undefined, + reason: 'no_compact_match' + } + }) +} + +function tryFullBoundaryLookup(geocoder, latitude, longitude, hashes) { + return fetchBoundaryCandidates(geocoder, hashes) + .then(function(candidates) { + var bboxCandidates = candidates.filter(function(candidate) { + return hasPointInPlaceBbox(candidate, latitude, longitude) + }) + + if (!bboxCandidates.length) { + return { + row: pickNearest(candidates, latitude, longitude), + reason: 'boundary_centroid_fallback' } } + + var candidateIds = bboxCandidates.map(function(candidate) { return candidate.id }) + return loadPlaceGeometries(geocoder, candidateIds) + .then(function(geometryById) { + var contained = bboxCandidates.filter(function(candidate) { + var polygon = geometryById[String(candidate.id)] + if (!polygon) return false + return geometry.pointInGeometry(polygon, latitude, longitude) + }) + + if (contained.length) { + var selected = sortContainedPlaces(contained)[0] + return { + row: selected, + reason: 'polygon_contains' + } + } + + var nearestInCandidates = pickNearest(bboxCandidates, latitude, longitude) + if (nearestInCandidates) { + return { + row: nearestInCandidates, + reason: 'bbox_candidate_centroid_fallback' + } + } + + return { + row: undefined, + reason: 'no_boundary_candidate' + } + }) + }) + .then(function(result) { + if (result && result.row) { + return result + } + + return fallbackNearestBoundary(geocoder, latitude, longitude, 'full') }) +} + +function tryBoundaryLookup(geocoder, latitude, longitude) { + return getBoundarySchemaStatus(geocoder).then(function(status) { + if (!status.hasCompactV2 && !status.hasCompactLegacy && !status.hasFull) { + return undefined + } + + var options = geocoder.boundaryOptions || {} + var basePrecision = Number(options.basePrecision || 4) + var 
maxPrecision = Number(options.maxPrecision || 7) + if (basePrecision < 1) basePrecision = 1 + if (maxPrecision < basePrecision) maxPrecision = basePrecision + + var hashes = reverseHashes(latitude, longitude, basePrecision, maxPrecision) + + var compactMode = status.hasCompactV2 ? 'compact_v2' : (status.hasCompactLegacy ? 'compact_legacy' : null) + + if (compactMode) { + return tryCompactBoundaryLookup(geocoder, latitude, longitude, hashes, compactMode) + .then(function(result) { + if (result && result.row) { + return result + } + + if (status.hasFull) { + return tryFullBoundaryLookup(geocoder, latitude, longitude, hashes) + } + + return fallbackNearestBoundary(geocoder, latitude, longitude, compactMode) + }) + } + + return tryFullBoundaryLookup(geocoder, latitude, longitude, hashes) }) } -function formatResult(rows) { - const row = rows[0] +function findFeature(geocoder, latitude, longitude) { + var mode = geocoder.reverseMode || 'centroid' - if (!row || row === undefined) { - return {} - } else { - return formatLocation(row) + if (mode !== 'boundary') { + return findLegacyCentroidRow(geocoder, latitude, longitude).then(function(row) { + return formatRow(row) + }) } + + return tryBoundaryLookup(geocoder, latitude, longitude) + .then(function(boundaryResult) { + if (boundaryResult && boundaryResult.row) { + return attachDebug(geocoder, formatRow(boundaryResult.row), boundaryResult.reason) + } + + return findLegacyCentroidRow(geocoder, latitude, longitude) + .then(function(row) { + return attachDebug(geocoder, formatRow(row), 'legacy_centroid_fallback') + }) + }) } function Reverse(geocoder, latitude, longitude, callback) { - return findFeature(geocoder, latitude, longitude, callback) + return executeWithCallback(findFeature(geocoder, latitude, longitude), callback) } module.exports = Reverse;