From b9d4b85349c68c8360adca2ded50f1e1e1f14f89 Mon Sep 17 00:00:00 2001 From: Sebastian Schloesser Date: Thu, 5 Mar 2026 12:03:33 -0500 Subject: [PATCH 01/10] Add forward geocoding, Expo/React Native support, and updated db generation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add forward geocoding (geocoder.forward(query)) that does exact match with prefix/substring fallback, ranked by population. Requires a database generated with the updated schema — gracefully returns undefined on older databases missing the asciiname/population columns. Add Expo/React Native entrypoint (src/expo.js) that wraps expo-sqlite into the node-sqlite3 callback interface the existing modules expect. Decouple sqlite3 from index.js so it's lazily required only for Node usage. Add geonames: support to location.find() for stable identifiers, and fix a copy-paste bug in location.js where callback was referenced but never defined. Switch named SQL params ($lat, $id) to positional (?) for cross-runtime compatibility with both sqlite3 and expo-sqlite. Update the database generation script with configurable feature codes, population filters, and admin1 toggle. Extract schema to scripts/schema.sql. Tests now use a fixture database instead of requiring the full data/ dir. 
--- .gitignore | 3 + README.md | 94 +++++++++++++- package.json | 24 +++- scripts/generate_geonames.sh | 237 +++++++++++++++++++++-------------- scripts/schema.sql | 48 +++++++ spec/expo_adapter_spec.js | 28 +++++ spec/forward_spec.js | 49 ++++++++ spec/helpers/fixture_db.js | 78 ++++++++++++ spec/location_spec.js | 25 +++- spec/reverse_spec.js | 17 ++- spec/schema_spec.js | 45 +++++++ src/expo.js | 75 +++++++++++ src/forward.js | 131 +++++++++++++++++++ src/index.js | 44 ++++++- src/location.js | 28 ++--- src/reverse.js | 17 ++- 16 files changed, 810 insertions(+), 133 deletions(-) create mode 100644 scripts/schema.sql create mode 100644 spec/expo_adapter_spec.js create mode 100644 spec/forward_spec.js create mode 100644 spec/helpers/fixture_db.js create mode 100644 spec/schema_spec.js create mode 100644 src/expo.js create mode 100644 src/forward.js diff --git a/.gitignore b/.gitignore index cb4ccda..54e5a81 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,6 @@ data node_modules package-lock.json +.npm-cache +.geonames-build +.DS_Store diff --git a/README.md b/README.md index 554bbcc..3ddcbd7 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,8 @@ # Offline Geocoder -Node library for reverse geocoding. Designed to be used offline (for example -embedded in a desktop or mobile application) - no web requests are made to -perform a lookup. +Node and React Native library for offline geocoding. Designed to be used +offline (for example embedded in a desktop or mobile application) — no web +requests are made to perform a lookup. ## Data @@ -32,20 +32,33 @@ lookups per second with a single process. npm install --save offline-geocoder ``` +For Node you also need `sqlite3`: + +``` +npm install --save sqlite3 +``` + +For Expo / React Native, install `expo-sqlite` instead: + +``` +npx expo install expo-sqlite +``` + You also need to obtain a database which isn't included in the package, to -generate your own take a look in `scripts`. 
+generate your own take a look at the [Generating the database](#generating-the-database) +section below. ## Usage When you initialize the library you need to pass the location of the database: ```javascript -const geocoder = require('offline-geocoder')({ database: 'data/geodata.db' }) +const geocoder = require('offline-geocoder')({ database: 'data/geocoder.sqlite' }) ``` ### Reverse Geocoding -To perform a revese geocode lookup just pass the coordinates: +To perform a reverse geocode lookup just pass the coordinates: ```javascript geocoder.reverse(41.89, 12.49) @@ -76,6 +89,75 @@ geocoder.reverse(41.89, 12.49, function(error, result) { }) ``` +### Forward Geocoding + +Forward geocoding matches a city name to its canonical entry. Requires a +database generated with the updated schema (see below). + +```javascript +geocoder.forward('rome') + .then(function(result) { + console.log(result) + }) +``` + +Returns `undefined` when no match is found, or when using an older database +without the required columns. + +### Location Lookup + +Look up a city by its GeoNames id: + +```javascript +geocoder.location().find(3169070) +geocoder.location.find('geonames:3169070') +``` + +Returns `undefined` when the id doesn't exist. Both numeric ids and +`geonames:` strings are accepted — use the prefixed form as a stable +grouping key across datasets. + +## Expo / React Native + +The React Native entrypoint avoids Node-only modules: + +```javascript +const createGeocoder = require('offline-geocoder/expo') + +const db = await SQLite.openDatabaseAsync('geocoder.sqlite') +const geocoder = createGeocoder({ db: db }) + +geocoder.reverse(41.89, 12.49) + .then(function(result) { + console.log(result) + }) +``` + +You'll need to bundle the SQLite database file with your app assets and copy +it to a location accessible by `expo-sqlite` on first launch. 
+ +## Generating the database + +The repo includes a script to generate a SQLite database from GeoNames dumps: + +```bash +./scripts/generate_geonames.sh data/geocoder.sqlite +``` + +Environment variables for customization: + +| Variable | Default | Description | +|---|---|---| +| `GEONAMES_DATASET` | `cities1000` | GeoNames dump file to use | +| `GEONAMES_WORKDIR` | current directory | Working directory for temp files | +| `GEONAMES_DOWNLOAD` | `1` | Set to `0` to skip downloads | +| `GEONAMES_FEATURE_CODES` | `PPLA,PPLA2,PPLA3,PPLA4,PPLA5,PPLC` | Feature codes to keep | +| `GEONAMES_MIN_POPULATION` | `0` | Minimum population filter | +| `GEONAMES_INCLUDE_ADMIN1` | `1` | Set to `0` to skip admin1 data | + +The default feature codes exclude `PPL` which can include neighbourhood-like +populated places. The schema is defined in [`scripts/schema.sql`](scripts/schema.sql). + ## License This library is licensed under [the MIT license](https://github.com/lucaspiller/offline-geocoder/blob/master/LICENSE). 
diff --git a/package.json b/package.json index 767a09c..e9af3f0 100644 --- a/package.json +++ b/package.json @@ -1,14 +1,30 @@ { "name": "offline-geocoder", "version": "1.0.0", - "description": "Node library for offline geocoding", + "description": "Offline reverse and forward geocoding for Node and React Native", "repository": "https://github.com/lucaspiller/offline-geocoder", "main": "src/index.js", - "dependencies": { - "sqlite3": "^4.0.0" + "react-native": "src/expo.js", + "exports": { + ".": { + "react-native": "./src/expo.js", + "require": "./src/index.js", + "default": "./src/index.js" + }, + "./expo": "./src/expo.js" }, + "peerDependencies": { + "sqlite3": "^5.1.7" + }, + "peerDependenciesMeta": { + "sqlite3": { + "optional": true + } + }, + "dependencies": {}, "devDependencies": { - "jasmine": "^3.1.0" + "jasmine": "^5.12.0", + "sqlite3": "^5.1.7" }, "scripts": { "test": "jasmine" diff --git a/scripts/generate_geonames.sh b/scripts/generate_geonames.sh index 20f4413..ebb1666 100755 --- a/scripts/generate_geonames.sh +++ b/scripts/generate_geonames.sh @@ -1,104 +1,159 @@ #!/bin/bash +set -euo pipefail + +# Generates a geocoder SQLite database from GeoNames dump files. +# Usage: +# ./scripts/generate_geonames.sh [output_db_path] +# +# Environment variables: +# GEONAMES_DATASET cities dump name without extension (default: cities1000) +# GEONAMES_WORKDIR working dir for downloads and temp files (default: current dir) +# GEONAMES_DOWNLOAD set to 0 to skip downloads and use existing local files +# GEONAMES_FEATURE_CODES comma-separated GeoNames feature codes to keep +# (default: PPLA,PPLA2,PPLA3,PPLA4,PPLA5,PPLC) +# Note: PPL can include neighborhood-like entries. 
+# GEONAMES_MIN_POPULATION minimum population to keep (default: 0) +# GEONAMES_INCLUDE_ADMIN1 set to 0 to skip admin1 import entirely (default: 1) + +GEONAMES_DATASET="${GEONAMES_DATASET:-cities1000}" +GEONAMES_WORKDIR="${GEONAMES_WORKDIR:-$(pwd)}" +GEONAMES_DOWNLOAD="${GEONAMES_DOWNLOAD:-1}" +GEONAMES_FEATURE_CODES="${GEONAMES_FEATURE_CODES:-PPLA,PPLA2,PPLA3,PPLA4,PPLA5,PPLC}" +GEONAMES_MIN_POPULATION="${GEONAMES_MIN_POPULATION:-0}" +GEONAMES_INCLUDE_ADMIN1="${GEONAMES_INCLUDE_ADMIN1:-1}" +OUTPUT="${1:-db.sqlite}" + +# Resolve to absolute so the later cd into GEONAMES_WORKDIR doesn't break it +case "${OUTPUT}" in + /*) ;; + *) OUTPUT="$(pwd)/${OUTPUT}" ;; +esac + +DATA_FILE="${GEONAMES_DATASET}.txt" +ADMIN1_FILE="admin1CodesASCII.txt" +COUNTRY_FILE="countryInfo.txt" + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +SCHEMA_FILE="${SCRIPT_DIR}/schema.sql" +TMP_DIR="${GEONAMES_WORKDIR}/.geonames-build" +SOURCE_DIR="${TMP_DIR}/source" + +mkdir -p "${GEONAMES_WORKDIR}" "${TMP_DIR}" "${SOURCE_DIR}" + +download_if_missing() { + local file="$1" + local url="$2" + + if [[ -f "${SOURCE_DIR}/${file}" ]]; then + echo "Using existing ${file}" + return + fi -DATA="cities1000.txt" -ADMIN1="admin1CodesASCII.txt" -COUNTRIES="countryInfo.txt" -OUTPUT="db.sqlite" - -if [ ! -f "$DATA" ]; then - echo "Downloading cities from Geonames..." - wget "http://download.geonames.org/export/dump/cities1000.zip" - unzip "cities1000.zip" -else - echo "Using existing $DATA" -fi + if [[ "${GEONAMES_DOWNLOAD}" != "1" ]]; then + echo "Missing ${file} and GEONAMES_DOWNLOAD=${GEONAMES_DOWNLOAD}." >&2 + echo "Provide local files in ${SOURCE_DIR} or enable downloads." >&2 + exit 1 + fi -if [ ! -f "$ADMIN1" ]; then - echo "Downloading admin1 from Geonames..." - wget "http://download.geonames.org/export/dump/admin1CodesASCII.txt" -else - echo "Using existing $ADMIN1" -fi + echo "Downloading ${file}..." + curl -fsSL "${url}" -o "${SOURCE_DIR}/${file}" +} -if [ ! 
-f "$COUNTRIES" ]; then - echo "Downloading countries from Geonames..." - wget "http://download.geonames.org/export/dump/countryInfo.txt" -else - echo "Using existing $COUNTRIES" -fi +download_and_extract_dataset_if_missing() { + if [[ -f "${SOURCE_DIR}/${DATA_FILE}" ]]; then + echo "Using existing ${DATA_FILE}" + return + fi -if [ -f "$OUTPUT" ]; then - echo - echo "The file $OUTPUT already exists." - read -p "Do you want to override it? (y/N) " -n 1 -r - echo - if [[ ! $REPLY =~ ^[Yy]$ ]]; then + if [[ "${GEONAMES_DOWNLOAD}" != "1" ]]; then + echo "Missing ${DATA_FILE} and GEONAMES_DOWNLOAD=${GEONAMES_DOWNLOAD}." >&2 + echo "Provide local files in ${SOURCE_DIR} or enable downloads." >&2 exit 1 fi - rm "$OUTPUT" + local zip_file="${GEONAMES_DATASET}.zip" + echo "Downloading ${zip_file}..." + curl -fsSL "https://download.geonames.org/export/dump/${zip_file}" -o "${SOURCE_DIR}/${zip_file}" + unzip -o -q "${SOURCE_DIR}/${zip_file}" -d "${SOURCE_DIR}" +} + +download_and_extract_dataset_if_missing +download_if_missing "${COUNTRY_FILE}" "https://download.geonames.org/export/dump/${COUNTRY_FILE}" +if [[ "${GEONAMES_INCLUDE_ADMIN1}" == "1" ]]; then + download_if_missing "${ADMIN1_FILE}" "https://download.geonames.org/export/dump/${ADMIN1_FILE}" fi -echo -echo "Generating..." 
- -awk 'BEGIN { FS="\t"; OFS=";" } { gsub("\"", "", $2); gsub(";", "", $2); print $1,$2,$9,$11 }' $DATA > features.tsv -awk 'BEGIN { FS="\t"; OFS=";" } { print $1,$5,$6 }' $DATA > coordinates.tsv -awk 'BEGIN { FS="\t"; OFS=";" } { split($1, id, "."); gsub("\"", "", $2); gsub(";", "", $2); print id[1],id[2],$2 }' $ADMIN1 > admin1.tsv -grep -vE '^#' $COUNTRIES | awk 'BEGIN { FS="\t"; OFS=";" } { print $1,$5 }' > countries.tsv - -echo ' -CREATE TABLE coordinates( - feature_id INTEGER, - latitude REAL, - longitude REAL, - PRIMARY KEY (feature_id) -); - -CREATE TABLE features( - id INTEGER, - name TEXT, - country_id TEXT, - admin1_id INTEGER, - PRIMARY KEY (id) -); - -CREATE TABLE admin1( - country_id TEXT, - id INTEGER, - name TEXT, - PRIMARY KEY (country_id, id) -); - -CREATE TABLE countries( - id TEXT, - name TEXT, - PRIMARY KEY (id) -); - -CREATE VIEW everything AS - SELECT - features.id, - features.name, - admin1.id AS admin1_id, - admin1.name AS admin1_name, - countries.id AS country_id, - countries.name AS country_name, - coordinates.latitude AS latitude, - coordinates.longitude AS longitude - FROM features - LEFT JOIN countries ON features.country_id = countries.id - LEFT JOIN admin1 ON features.country_id = admin1.country_id AND features.admin1_id = admin1.id - JOIN coordinates ON features.id = coordinates.feature_id; +echo "Preparing TSV files in ${TMP_DIR}..." 
+echo "Feature codes: ${GEONAMES_FEATURE_CODES}" +echo "Minimum population: ${GEONAMES_MIN_POPULATION}" +echo "Include admin1: ${GEONAMES_INCLUDE_ADMIN1}" +rm -f "${TMP_DIR}/features.tsv" "${TMP_DIR}/coordinates.tsv" +awk -v feature_codes="${GEONAMES_FEATURE_CODES}" -v min_population="${GEONAMES_MIN_POPULATION}" -v include_admin1="${GEONAMES_INCLUDE_ADMIN1}" -v features_out="${TMP_DIR}/features.tsv" -v coordinates_out="${TMP_DIR}/coordinates.tsv" 'BEGIN { + FS="\t"; + OFS=";"; + split(feature_codes, raw_codes, ","); + for (i in raw_codes) { + code = raw_codes[i]; + gsub(/^[[:space:]]+|[[:space:]]+$/, "", code); + if (code != "") { + allowed_codes[code] = 1; + } + } +} +{ + if (!($8 in allowed_codes)) { + next; + } + + population = ($15 == "" ? 0 : $15); + if (population < min_population) { + next; + } + + gsub("\"", "", $2); + gsub(";", "", $2); + gsub("\"", "", $3); + gsub(";", "", $3); + admin1_id = (include_admin1 == "1" ? $11 : ""); + print $1,$2,$3,$9,admin1_id,population >> features_out; + print $1,$5,$6 >> coordinates_out; +}' "${SOURCE_DIR}/${DATA_FILE}" + +if [[ "${GEONAMES_INCLUDE_ADMIN1}" == "1" ]]; then + awk 'BEGIN { FS="\t"; OFS=";" } + { + split($1, id, "."); + gsub("\"", "", $2); + gsub(";", "", $2); + print id[1],id[2],$2 + }' "${SOURCE_DIR}/${ADMIN1_FILE}" > "${TMP_DIR}/admin1.tsv" +else + : > "${TMP_DIR}/admin1.tsv" +fi -.separator ";" -.import coordinates.tsv coordinates -.import features.tsv features -.import admin1.tsv admin1 -.import countries.tsv countries +grep -vE '^#' "${SOURCE_DIR}/${COUNTRY_FILE}" | awk 'BEGIN { FS="\t"; OFS=";" } +{ + gsub("\"", "", $5); + gsub(";", "", $5); + print $1,$5 +}' > "${TMP_DIR}/countries.tsv" -CREATE INDEX coordinates_lat_lng ON coordinates (latitude, longitude); -' | sqlite3 "$OUTPUT" +rm -f "${OUTPUT}" +echo "Building ${OUTPUT}..." -COUNT=`sqlite3 "$OUTPUT" "SELECT COUNT(*) FROM features;"` -echo "Created $OUTPUT with $COUNT features." 
+{ + cat "${SCHEMA_FILE}" + cat <<'SQL' +.separator ";" +.import .geonames-build/coordinates.tsv coordinates +.import .geonames-build/features.tsv features +.import .geonames-build/admin1.tsv admin1 +.import .geonames-build/countries.tsv countries +SQL +} | ( + cd "${GEONAMES_WORKDIR}" && + sqlite3 "${OUTPUT}" +) + +COUNT="$(sqlite3 "${OUTPUT}" "SELECT COUNT(*) FROM features;")" +echo "Created ${OUTPUT} with ${COUNT} features." diff --git a/scripts/schema.sql b/scripts/schema.sql new file mode 100644 index 0000000..4457d55 --- /dev/null +++ b/scripts/schema.sql @@ -0,0 +1,48 @@ +CREATE TABLE coordinates( + feature_id INTEGER PRIMARY KEY, + latitude REAL NOT NULL, + longitude REAL NOT NULL +); + +CREATE TABLE features( + id INTEGER PRIMARY KEY, + name TEXT NOT NULL, + asciiname TEXT, + country_id TEXT NOT NULL, + admin1_id INTEGER, + population INTEGER NOT NULL DEFAULT 0 +); + +CREATE TABLE admin1( + country_id TEXT NOT NULL, + id INTEGER NOT NULL, + name TEXT NOT NULL, + PRIMARY KEY (country_id, id) +); + +CREATE TABLE countries( + id TEXT PRIMARY KEY, + name TEXT NOT NULL +); + +CREATE VIEW everything AS + SELECT + features.id AS id, + features.name AS name, + features.asciiname AS asciiname, + features.population AS population, + admin1.id AS admin1_id, + admin1.name AS admin1_name, + countries.id AS country_id, + countries.name AS country_name, + coordinates.latitude AS latitude, + coordinates.longitude AS longitude + FROM features + LEFT JOIN countries ON features.country_id = countries.id + LEFT JOIN admin1 ON features.country_id = admin1.country_id AND features.admin1_id = admin1.id + JOIN coordinates ON features.id = coordinates.feature_id; + +CREATE INDEX coordinates_lat_lng ON coordinates (latitude, longitude); +CREATE INDEX features_name_nocase ON features (name COLLATE NOCASE); +CREATE INDEX features_asciiname_nocase ON features (asciiname COLLATE NOCASE); +CREATE INDEX features_population_desc ON features (population DESC); diff --git 
a/spec/expo_adapter_spec.js b/spec/expo_adapter_spec.js new file mode 100644 index 0000000..0500da4 --- /dev/null +++ b/spec/expo_adapter_spec.js @@ -0,0 +1,28 @@ +const createExpoGeocoder = require('../src/expo.js'); +const fixtureDb = require('./helpers/fixture_db'); + +describe('expo adapter', () => { + var fixture; + + beforeAll((done) => { + fixtureDb.createFixtureDatabase().then(function(f) { + fixture = f; + done(); + }); + }); + + afterAll(() => { + fixture.cleanup(); + }); + + it('runs queries through the getAllAsync shim', (done) => { + var db = fixtureDb.createExpoDb(fixture.databasePath); + var geocoder = createExpoGeocoder({ db: db }); + + geocoder.reverse(41.89, 12.49) + .then(function(result) { + expect(result.id).toEqual(3169070); + done(); + }); + }); +}); diff --git a/spec/forward_spec.js b/spec/forward_spec.js new file mode 100644 index 0000000..8746659 --- /dev/null +++ b/spec/forward_spec.js @@ -0,0 +1,49 @@ +const createGeocoder = require('../src/index.js'); +const fixtureDb = require('./helpers/fixture_db'); + +describe('geocoder.forward', () => { + var fixture, geocoder; + + beforeAll((done) => { + fixtureDb.createFixtureDatabase().then(function(f) { + fixture = f; + geocoder = createGeocoder({ database: fixture.databasePath }); + done(); + }); + }); + + afterAll(() => { + fixture.cleanup(); + }); + + it('returns the best match for an exact query', (done) => { + geocoder.forward('Rome') + .then(function(result) { + expect(result).toEqual({ + id: 3169070, + name: 'Rome', + formatted: 'Rome, Latium, Italy', + country: { id: 'IT', name: 'Italy' }, + admin1: { id: 7, name: 'Latium' }, + coordinates: { latitude: 41.89193, longitude: 12.51133 } + }); + done(); + }); + }); + + it('falls back to fuzzy matching', (done) => { + geocoder.forward('angeles') + .then(function(result) { + expect(result.id).toEqual(5368361); + done(); + }); + }); + + it('returns undefined when nothing matches', (done) => { + geocoder.forward('xyzzy-not-a-city') + 
.then(function(result) { + expect(result).toBeUndefined(); + done(); + }); + }); +}); diff --git a/spec/helpers/fixture_db.js b/spec/helpers/fixture_db.js new file mode 100644 index 0000000..e5c8ba1 --- /dev/null +++ b/spec/helpers/fixture_db.js @@ -0,0 +1,78 @@ +"use strict"; + +const fs = require('fs') +const os = require('os') +const path = require('path') +const sqlite3 = require('sqlite3') + +const schemaSql = fs.readFileSync(path.join(__dirname, '../../scripts/schema.sql'), 'utf8') + +const fixtureSql = ` +INSERT INTO countries(id, name) VALUES ('IT', 'Italy'), ('FR', 'France'), ('US', 'United States'); +INSERT INTO admin1(country_id, id, name) VALUES + ('IT', 7, 'Latium'), + ('FR', 11, 'Ile-de-France'), + ('US', 36, 'New York'), + ('US', 5, 'California'); +INSERT INTO features(id, name, asciiname, country_id, admin1_id, population) VALUES + (3169070, 'Rome', 'Rome', 'IT', 7, 2873000), + (2988507, 'Paris', 'Paris', 'FR', 11, 2138551), + (5128581, 'New York City', 'New York City', 'US', 36, 8175133), + (5368361, 'Los Angeles', 'Los Angeles', 'US', 5, 3792621); +INSERT INTO coordinates(feature_id, latitude, longitude) VALUES + (3169070, 41.89193, 12.51133), + (2988507, 48.85341, 2.3488), + (5128581, 40.71427, -74.00597), + (5368361, 34.05223, -118.24368); +` + +function exec(db, sql) { + return new Promise(function(resolve, reject) { + db.exec(sql, function(err) { err ? reject(err) : resolve() }) + }) +} + +function close(db) { + return new Promise(function(resolve, reject) { + db.close(function(err) { err ? 
reject(err) : resolve() }) + }) +} + +function createFixtureDatabase() { + var dir = fs.mkdtempSync(path.join(os.tmpdir(), 'offline-geocoder-')) + var dbPath = path.join(dir, 'fixture.sqlite') + var db = new sqlite3.Database(dbPath) + + return exec(db, schemaSql) + .then(function() { return exec(db, fixtureSql) }) + .then(function() { return close(db) }) + .then(function() { + return { + databasePath: dbPath, + cleanup: function() { + fs.rmSync(dir, { recursive: true, force: true }) + } + } + }) +} + +// Minimal shim that looks like an expo-sqlite database so we can test the +// Expo adapter without pulling in the real package. +function createExpoDb(dbPath) { + var db = new sqlite3.Database(dbPath) + return { + getAllAsync: function(sql, params) { + return new Promise(function(resolve, reject) { + db.all(sql, params || [], function(err, rows) { + err ? reject(err) : resolve(rows || []) + }) + }) + }, + closeAsync: function() { return close(db) } + } +} + +module.exports = { + createFixtureDatabase: createFixtureDatabase, + createExpoDb: createExpoDb +} diff --git a/spec/location_spec.js b/spec/location_spec.js index 72c9de1..b53a637 100644 --- a/spec/location_spec.js +++ b/spec/location_spec.js @@ -1,6 +1,21 @@ -const geocoder = require('../src/index.js')(); +const createGeocoder = require('../src/index.js'); +const fixtureDb = require('./helpers/fixture_db'); describe('geocoder.location', () => { + var fixture, geocoder; + + beforeAll((done) => { + fixtureDb.createFixtureDatabase().then(function(f) { + fixture = f; + geocoder = createGeocoder({ database: fixture.databasePath }); + done(); + }); + }); + + afterAll(() => { + fixture.cleanup(); + }); + describe('.find', () => { it('performs a lookup by id', (done) => { geocoder.location().find(3169070) @@ -17,6 +32,14 @@ describe('geocoder.location', () => { }); }); + it('accepts geonames: prefixed ids', (done) => { + geocoder.location.find('geonames:3169070') + .then(function(result) { + 
expect(result.id).toEqual(3169070); + done(); + }); + }); + it("resolves undefined when a location can't be found", (done) => { geocoder.location().find(-1) .then(function(result) { diff --git a/spec/reverse_spec.js b/spec/reverse_spec.js index a5c40e0..673c45d 100644 --- a/spec/reverse_spec.js +++ b/spec/reverse_spec.js @@ -1,6 +1,21 @@ -const geocoder = require('../src/index.js')(); +const createGeocoder = require('../src/index.js'); +const fixtureDb = require('./helpers/fixture_db'); describe('geocoder.reverse', () => { + var fixture, geocoder; + + beforeAll((done) => { + fixtureDb.createFixtureDatabase().then(function(f) { + fixture = f; + geocoder = createGeocoder({ database: fixture.databasePath }); + done(); + }); + }); + + afterAll(() => { + fixture.cleanup(); + }); + it('performs reverse geocoding on a latitude and longitude', (done) => { geocoder.reverse(41.89, 12.49) .then(function(result) { diff --git a/spec/schema_spec.js b/spec/schema_spec.js new file mode 100644 index 0000000..667f07e --- /dev/null +++ b/spec/schema_spec.js @@ -0,0 +1,45 @@ +const sqlite3 = require('sqlite3'); +const fixtureDb = require('./helpers/fixture_db'); + +describe('generated schema', () => { + var fixture, db; + + beforeAll((done) => { + fixtureDb.createFixtureDatabase().then(function(f) { + fixture = f; + db = new sqlite3.Database(fixture.databasePath); + done(); + }); + }); + + afterAll((done) => { + db.close(function() { + fixture.cleanup(); + done(); + }); + }); + + it('has asciiname and population in the everything view', (done) => { + db.all('PRAGMA table_info(everything)', [], function(err, cols) { + var names = cols.map(function(c) { return c.name }); + expect(names).toContain('asciiname'); + expect(names).toContain('population'); + done(); + }); + }); + + it('creates indexes for reverse and forward lookups', (done) => { + db.all("PRAGMA index_list('coordinates')", [], function(err, coordIndexes) { + db.all("PRAGMA index_list('features')", [], function(err, 
featIndexes) { + var coordNames = coordIndexes.map(function(i) { return i.name }); + var featNames = featIndexes.map(function(i) { return i.name }); + + expect(coordNames).toContain('coordinates_lat_lng'); + expect(featNames).toContain('features_name_nocase'); + expect(featNames).toContain('features_asciiname_nocase'); + expect(featNames).toContain('features_population_desc'); + done(); + }); + }); + }); +}); diff --git a/src/expo.js b/src/expo.js new file mode 100644 index 0000000..0d66f67 --- /dev/null +++ b/src/expo.js @@ -0,0 +1,75 @@ +"use strict"; + +const reverse = require('./reverse') +const forward = require('./forward') +const findLocation = require('./location').find + +// Wraps an expo-sqlite database to match the node-sqlite3 callback +// interface that reverse.js, forward.js and location.js expect. +function wrapExpoDb(expoDb) { + return { + all: function(sql, params, callback) { + expoDb.getAllAsync(sql, params || []) + .then(function(rows) { callback(null, rows) }) + .catch(function(err) { callback(err) }) + }, + close: function(callback) { + if (typeof expoDb.closeAsync === 'function') { + expoDb.closeAsync() + .then(function() { if (callback) callback(null) }) + .catch(function(err) { if (callback) callback(err) }) + } else if (callback) { + callback(null) + } + } + } +} + +function ExpoGeocoder(options) { + var opts = options || {} + var expoDb = opts.db || opts.database + + if (!expoDb || typeof expoDb.getAllAsync !== 'function') { + throw new Error('Pass an opened expo-sqlite db via { db }.') + } + + this.db = wrapExpoDb(expoDb) +} + +ExpoGeocoder.prototype.reverse = function(latitude, longitude, callback) { + return reverse(this, latitude, longitude, callback) +} + +ExpoGeocoder.prototype.forward = function(query, callback) { + return forward(this, query, callback) +} + +ExpoGeocoder.prototype.location = function() { + const _this = this + + return { + find: function(locationId) { + return findLocation(_this, locationId) + } + } +} + +function 
createExpoGeocoder(options) { + var instance = new ExpoGeocoder(options) + + var locationFn = function() { + return { + find: function(locationId) { + return findLocation(instance, locationId) + } + } + } + locationFn.find = function(locationId) { + return findLocation(instance, locationId) + } + instance.location = locationFn + + return instance +} + +module.exports = createExpoGeocoder; diff --git a/src/forward.js b/src/forward.js new file mode 100644 index 0000000..8203d97 --- /dev/null +++ b/src/forward.js @@ -0,0 +1,131 @@ +"use strict"; + +const formatLocation = require('./location').format + +// Forward geocoding: tries an exact match on name/asciiname first, then +// falls back to prefix and substring matching. +// +// Requires the updated schema with asciiname and population columns. +// Returns undefined on databases without those columns. +function findByName(geocoder, query, callback) { + return new Promise(function(resolve, reject) { + var q = typeof query === 'string' ? query.trim() : '' + if (!q) { + if (typeof(callback) == 'function') { + callback(undefined, undefined) + } else { + resolve(undefined) + } + return + } + + // Check if the database supports forward search (cached per geocoder) + if (geocoder._forwardSupported === false) { + if (typeof(callback) == 'function') { + callback(undefined, undefined) + } else { + resolve(undefined) + } + return + } + + function doSearch() { + var exactQuery = `SELECT * FROM everything + WHERE name = ? COLLATE NOCASE OR asciiname = ? COLLATE NOCASE + ORDER BY + CASE WHEN name = ? COLLATE NOCASE THEN 0 + WHEN asciiname = ? 
COLLATE NOCASE THEN 1 + ELSE 2 END, + population DESC, id ASC + LIMIT 1` + + geocoder.db.all(exactQuery, [q, q, q, q], function(err, rows) { + if (err) { + if (typeof(callback) == 'function') { + callback(err, undefined) + } else if (typeof(reject) == 'function') { + reject(err) + } + return + } + + if (rows && rows[0]) { + const result = formatLocation(rows[0]) + if (typeof(callback) == 'function') { + callback(undefined, result) + } else { + resolve(result) + } + return + } + + // Fall back to prefix / substring match + var prefix = q + '%' + var contains = '%' + q + '%' + var fuzzyQuery = `SELECT * FROM everything + WHERE name LIKE ? COLLATE NOCASE + OR name LIKE ? COLLATE NOCASE + OR asciiname LIKE ? COLLATE NOCASE + OR asciiname LIKE ? COLLATE NOCASE + ORDER BY + CASE WHEN name LIKE ? COLLATE NOCASE THEN 0 + WHEN asciiname LIKE ? COLLATE NOCASE THEN 1 + ELSE 2 END, + population DESC, LENGTH(name) ASC, id ASC + LIMIT 1` + + geocoder.db.all(fuzzyQuery, [prefix, contains, prefix, contains, prefix, prefix], function(err, rows) { + if (err) { + if (typeof(callback) == 'function') { + callback(err, undefined) + } else if (typeof(reject) == 'function') { + reject(err) + } + } else { + const result = formatResult(rows) + if (typeof(callback) == 'function') { + callback(undefined, result) + } else { + resolve(result) + } + } + }) + }) + } + + if (geocoder._forwardSupported === true) { + doSearch() + return + } + + // Probe for the asciiname column (first call only) + geocoder.db.all('SELECT asciiname FROM everything LIMIT 0', [], function(err) { + geocoder._forwardSupported = !err + if (err) { + if (typeof(callback) == 'function') { + callback(undefined, undefined) + } else { + resolve(undefined) + } + } else { + doSearch() + } + }) + }) +} + +function formatResult(rows) { + const row = rows[0] + + if (row === undefined) { + return undefined + } else { + return formatLocation(row) + } +} + +function Forward(geocoder, query, callback) { + return findByName(geocoder, 
query, callback) +} + +module.exports = Forward; diff --git a/src/index.js b/src/index.js index 9ba44e7..1d4d555 100644 --- a/src/index.js +++ b/src/index.js @@ -1,25 +1,42 @@ "use strict"; const path = require('path') -const sqlite3 = require('sqlite3').verbose() const reverse = require('./reverse') +const forward = require('./forward') const findLocation = require('./location').find function Geocoder(options) { var geocoder = function(options) { this.options = options || {} - if (this.options.database === undefined) { - this.options.database = path.join(__filename, '../../data/db.sqlite') - } + if (this.options.db) { + // Accept a pre-opened database object (must have .all(sql, params, cb)) + this.db = this.options.db + } else { + var sqlite3 + try { + sqlite3 = (this.options.sqlite3 || require('sqlite3')).verbose() + } catch (err) { + err.message = 'sqlite3 is required for Node usage. Install it with `npm install sqlite3`.' + throw err + } + + if (this.options.database === undefined) { + this.options.database = path.join(__dirname, '../data/db.sqlite') + } - this.db = new sqlite3.Database(this.options.database) + this.db = new sqlite3.Database(this.options.database) + } } geocoder.prototype.reverse = function(latitude, longitude, callback) { return reverse(this, latitude, longitude, callback) } + geocoder.prototype.forward = function(query, callback) { + return forward(this, query, callback) + } + geocoder.prototype.location = function() { const _this = this @@ -30,7 +47,22 @@ function Geocoder(options) { } } - return new geocoder(options) + var instance = new geocoder(options) + + // Also support geocoder.location.find(id) without calling location() + var locationFn = function() { + return { + find: function(locationId) { + return findLocation(instance, locationId) + } + } + } + locationFn.find = function(locationId) { + return findLocation(instance, locationId) + } + instance.location = locationFn + + return instance } module.exports = Geocoder; diff --git 
a/src/location.js b/src/location.js index e24affe..1d4d097 100644 --- a/src/location.js +++ b/src/location.js @@ -1,25 +1,23 @@ "use strict"; +function normalizeId(value) { + if (typeof value === 'string') { + var match = /^geonames:(\d+)$/i.exec(value.trim()) + if (match) return Number(match[1]) + return Number(value) + } + return value +} + function find(geocoder, locationId) { return new Promise(function(resolve, reject) { - const query = `SELECT * FROM everything WHERE id = $id LIMIT 1` + const query = `SELECT * FROM everything WHERE id = ? LIMIT 1` - geocoder.db.all(query, { - $id: locationId - }, function(err, rows) { + geocoder.db.all(query, [normalizeId(locationId)], function(err, rows) { if (err) { - if (typeof(callback) == 'function') { - callback(err, undefined) - } else if (typeof(reject) == 'function') { - reject(err) - } + reject(err) } else { - const result = formatResult(rows) - if (typeof(callback) == 'function') { - callback(undefined, result) - } else if (typeof(resolve) == 'function') { - resolve(result) - } + resolve(formatResult(rows)) } }) }) diff --git a/src/reverse.js b/src/reverse.js index db92cd9..c7d030a 100644 --- a/src/reverse.js +++ b/src/reverse.js @@ -17,22 +17,21 @@ function findFeature(geocoder, latitude, longitude, callback) { const query = `SELECT * FROM everything WHERE id IN ( SELECT feature_id FROM coordinates - WHERE latitude BETWEEN $lat - 1.5 AND $lat + 1.5 - AND longitude BETWEEN $lon - 1.5 AND $lon + 1.5 + WHERE latitude BETWEEN ? - 1.5 AND ? + 1.5 + AND longitude BETWEEN ? - 1.5 AND ? + 1.5 ORDER BY ( - ($lat - latitude) * ($lat - latitude) + - ($lon - longitude) * ($lon - longitude) * $scale + (? - latitude) * (? - latitude) + + (? - longitude) * (? - longitude) * ? 
) ASC LIMIT 1 )` const scale = Math.pow(Math.cos(latitude * Math.PI / 180), 2) - geocoder.db.all(query, { - $lat: latitude, - $lon: longitude, - $scale: scale - }, function(err, rows) { + geocoder.db.all(query, [ + latitude, latitude, longitude, longitude, + latitude, latitude, longitude, longitude, scale + ], function(err, rows) { if (err) { if (typeof(callback) == 'function') { callback(err, undefined) From c2b1e98c0ebcc16df205d8e3b06f1c76cb9caf91 Mon Sep 17 00:00:00 2001 From: Sebastian Schloesser Date: Tue, 24 Mar 2026 14:08:22 -0400 Subject: [PATCH 02/10] IMPLEMENT ADAPTIVE RESOLUTION GEOHASH GEOCODING ALGORITHM --- .gitignore | 1 + README.md | 114 ++ bin/geocoder | 21 +- bin/geocoder-bench | 21 +- bin/geocoder-build-boundary | 4 + bin/geocoder-build-wof | 14 + package.json | 10 +- scripts/analyze_compact_index.py | 372 +++++++ scripts/generate_boundary_index.js | 1373 +++++++++++++++++++++++++ scripts/generate_wof_boundary.sh | 143 +++ scripts/schema.sql | 46 + spec/boundary_builder_spec.js | 329 ++++++ spec/boundary_cover_spec.js | 52 + spec/geometry_spec.js | 47 + spec/helpers/fixture_db.js | 193 +++- spec/reverse_boundary_compact_spec.js | 67 ++ spec/reverse_boundary_rollup_spec.js | 114 ++ spec/reverse_boundary_spec.js | 53 + spec/reverse_spec.js | 2 +- spec/schema_spec.js | 34 +- src/boundary_cover.js | 99 ++ src/expo.js | 33 + src/geohash.js | 158 +++ src/geometry.js | 411 ++++++++ src/index.js | 32 + src/reverse.js | 687 ++++++++++++- 26 files changed, 4367 insertions(+), 63 deletions(-) create mode 100755 bin/geocoder-build-boundary create mode 100755 bin/geocoder-build-wof create mode 100644 scripts/analyze_compact_index.py create mode 100755 scripts/generate_boundary_index.js create mode 100755 scripts/generate_wof_boundary.sh create mode 100644 spec/boundary_builder_spec.js create mode 100644 spec/boundary_cover_spec.js create mode 100644 spec/geometry_spec.js create mode 100644 spec/reverse_boundary_compact_spec.js create mode 100644 
spec/reverse_boundary_rollup_spec.js create mode 100644 spec/reverse_boundary_spec.js create mode 100644 src/boundary_cover.js create mode 100644 src/geohash.js create mode 100644 src/geometry.js diff --git a/.gitignore b/.gitignore index 54e5a81..c198c54 100644 --- a/.gitignore +++ b/.gitignore @@ -4,3 +4,4 @@ package-lock.json .npm-cache .geonames-build .DS_Store +tmp diff --git a/README.md b/README.md index 3ddcbd7..8dae03d 100644 --- a/README.md +++ b/README.md @@ -56,6 +56,17 @@ When you initialize the library you need to pass the location of the database: const geocoder = require('offline-geocoder')({ database: 'data/geocoder.sqlite' }) ``` +To enable boundary-aware reverse geocoding, pass `reverseMode: 'boundary'` +(default is `centroid` for backward compatibility): + +```javascript +const geocoder = require('offline-geocoder')({ + database: 'data/geocoder.sqlite', + reverseMode: 'boundary', + boundary: { basePrecision: 4, maxPrecision: 7 } +}) +``` + ### Reverse Geocoding To perform a reverse geocode lookup just pass the coordinates: @@ -89,6 +100,11 @@ geocoder.reverse(41.89, 12.49, function(error, result) { }) ``` +Boundary mode keeps the same return payload shape and supports two boundary +storage modes: +- compact lookup (`compact_places` + `compact_geohash_lookup`) +- full polygon mode (`places` + `place_geohash_cover` + `place_geometry`) + ### Forward Geocoding Forward geocoding matches a city name to its canonical entry. Requires a @@ -158,6 +174,104 @@ Environment variables for customization: The default feature codes exclude `PPL` which can include neighbourhood-like populated places. The schema is defined in [`scripts/schema.sql`](scripts/schema.sql). 
+### Generating a Boundary Index + +Build boundary-aware reverse lookup tables from a polygon source (GeoJSON +FeatureCollection/Feature or newline-delimited GeoJSON): + +```bash +node scripts/generate_boundary_index.js \ + --database data/geocoder.sqlite \ + --input data/localities.geojson \ + --index-mode compact \ + --include-region true \ + --min-population 10000 \ + --base-precision 4 \ + --max-precision 7 +``` + +You can also run `npm run build:boundary -- --database ... --input ...`. + +You can point the builder directly at directories of Who's On First (WOF) GeoJSON files: + +```bash +node scripts/generate_boundary_index.js \ + --database data/geocoder.sqlite \ + --input-dir tmp/wof-build/extracted/fr/.../data \ + --index-mode compact \ + --include-region true \ + --min-population 10000 \ + --base-precision 4 \ + --max-precision 7 \ + --drop-contained-localities true +``` + +`--drop-contained-localities true` removes `locality` polygons that are fully +contained in larger localities within the same country/admin1 group. This is +intended to suppress duplicate neighbourhood-like localities while keeping +small isolated places (for example islands) that are not contained.
+ +Builder notes: + +- Keeps current records only (drops deprecated/superseded where source metadata is present) +- Includes `locality` placetypes by default (`localadmin` optional via `--include-localadmin true`) +- Optional `region` fallback polygons via `--include-region true` +- `--min-population` applies to `locality` only, so low-pop localities can roll up to broader admin areas when `region` is included +- Point-only capital localities are retained (single-cell locality fallback) so country/admin capitals are not dropped by polygon-only filtering +- Per-placetype precision caps are supported: + - `--locality-max-precision` + - `--localadmin-max-precision` + - `--region-max-precision` + - `--region-sparse-max-precision` + `--region-sparse-min-area-km2` for very large sparse regions (for example geohash-3 in Amazon-like interiors) +- `--promote-locality-over-region` (default `true`) prefers locality labels in shared parent cells when there is no competing locality (keeps city labels sticky against region-only outskirts) +- Excludes neighbourhood-like placetypes from default reverse output +- `--index-mode compact` (default) stores only geohash-to-place mappings (`compact_geohash_lookup`) and no runtime geometry payloads. + Compact schema uses `compact_places(id,name,country_id,admin1_id,placetype_code,latitude,longitude)`. 
+- `--index-mode full` stores geohash cover + geometry for runtime point-in-polygon + +### Building From Who's On First (WOF) + +Use the WOF helper script to download country admin repos and build in one step: + +```bash +WOF_COUNTRIES=FR,IT \ +WOF_BASE_PRECISION=4 \ +WOF_MAX_PRECISION=5 \ +WOF_INCLUDE_REGION=1 \ +WOF_MIN_POPULATION=10000 \ +./scripts/generate_wof_boundary.sh data/geocoder.sqlite +``` + +Equivalent npm script: + +```bash +npm run build:wof -- data/geocoder.sqlite +``` + +Useful WOF build env vars: + +- `WOF_COUNTRIES` comma-separated country codes (default `FR,IT`) +- `WOF_WORKDIR` working directory for downloads/extracted files (default `tmp/wof-build`) +- `WOF_DOWNLOAD=0` reuse existing archives only +- `WOF_REF` branch/ref to download (default `master`) +- `WOF_LOCALITY_MAX_PRECISION` locality precision cap +- `WOF_REGION_MAX_PRECISION` region precision cap (default `4`) +- `WOF_REGION_SPARSE_MAX_PRECISION` sparse very-large-region precision (default `3`) +- `WOF_REGION_SPARSE_MIN_AREA_KM2` area threshold for sparse region precision (default `80000`) +- `WOF_PROMOTE_LOCALITY_OVER_REGION=1|0` prefer locality labels over region in shared parent cells (default `1`) +- `WOF_GEOMETRY_DECIMALS` round coordinates before storage/indexing (for example `4`) +- `WOF_MIN_POPULATION` filter out places below threshold (for example `10000`) +- `WOF_INCLUDE_REGION=1|0` include/exclude region fallback boundaries +- `WOF_MAX_PLACES` cap places for experiment runs +- `WOF_DROP_CONTAINED_LOCALITIES=1|0` enable/disable contained-locality pruning + +Boundary runtime modes: + +- `reverseMode: 'centroid'` (default): legacy nearest-centroid reverse lookup +- `reverseMode: 'boundary'`: boundary tables lookup. + - Uses compact `compact_geohash_lookup` when present (fast geohash-to-place). + - Falls back to full polygon-aware tables when compact rows are absent. 
+ ## License This library is licensed under [the MIT license](https://github.com/lucaspiller/offline-geocoder/blob/master/LICENSE). diff --git a/bin/geocoder b/bin/geocoder index 272750b..2813e66 100755 --- a/bin/geocoder +++ b/bin/geocoder @@ -3,7 +3,26 @@ "use strict"; -const geocoder = require('../src/index.js')() +function parseOptionalNumber(value) { + if (value === undefined) return undefined + var parsed = Number(value) + return Number.isFinite(parsed) ? parsed : undefined +} + +var options = {} +if (process.env.GEOCODER_REVERSE_MODE) { + options.reverseMode = process.env.GEOCODER_REVERSE_MODE +} + +var boundaryBase = parseOptionalNumber(process.env.GEOCODER_BOUNDARY_BASE_PRECISION) +var boundaryMax = parseOptionalNumber(process.env.GEOCODER_BOUNDARY_MAX_PRECISION) +if (boundaryBase !== undefined || boundaryMax !== undefined) { + options.boundary = {} + if (boundaryBase !== undefined) options.boundary.basePrecision = boundaryBase + if (boundaryMax !== undefined) options.boundary.maxPrecision = boundaryMax +} + +const geocoder = require('../src/index.js')(options) const args = process.argv.slice(2) if (args.length != 2) { diff --git a/bin/geocoder-bench b/bin/geocoder-bench index 939dcaa..0366ea3 100755 --- a/bin/geocoder-bench +++ b/bin/geocoder-bench @@ -3,7 +3,26 @@ "use strict"; -const geocoder = require('../src/index.js')() +function parseOptionalNumber(value) { + if (value === undefined) return undefined + var parsed = Number(value) + return Number.isFinite(parsed) ? 
parsed : undefined +} + +var options = {} +if (process.env.GEOCODER_REVERSE_MODE) { + options.reverseMode = process.env.GEOCODER_REVERSE_MODE +} + +var boundaryBase = parseOptionalNumber(process.env.GEOCODER_BOUNDARY_BASE_PRECISION) +var boundaryMax = parseOptionalNumber(process.env.GEOCODER_BOUNDARY_MAX_PRECISION) +if (boundaryBase !== undefined || boundaryMax !== undefined) { + options.boundary = {} + if (boundaryBase !== undefined) options.boundary.basePrecision = boundaryBase + if (boundaryMax !== undefined) options.boundary.maxPrecision = boundaryMax +} + +const geocoder = require('../src/index.js')(options) const args = process.argv.slice(2) if (args.length != 2) { diff --git a/bin/geocoder-build-boundary b/bin/geocoder-build-boundary new file mode 100755 index 0000000..86dc909 --- /dev/null +++ b/bin/geocoder-build-boundary @@ -0,0 +1,4 @@ +#!/usr/bin/env node +"use strict"; + +require('../scripts/generate_boundary_index') diff --git a/bin/geocoder-build-wof b/bin/geocoder-build-wof new file mode 100755 index 0000000..c5b947b --- /dev/null +++ b/bin/geocoder-build-wof @@ -0,0 +1,14 @@ +#!/usr/bin/env node +"use strict"; + +const { spawnSync } = require('child_process') +const path = require('path') + +const script = path.join(__dirname, '..', 'scripts', 'generate_wof_boundary.sh') +const args = process.argv.slice(2) + +const result = spawnSync(script, args, { stdio: 'inherit' }) +if (result.error) { + throw result.error +} +process.exit(result.status === null ? 
1 : result.status) diff --git a/package.json b/package.json index e9af3f0..5cc8b31 100644 --- a/package.json +++ b/package.json @@ -4,6 +4,12 @@ "description": "Offline reverse and forward geocoding for Node and React Native", "repository": "https://github.com/lucaspiller/offline-geocoder", "main": "src/index.js", + "bin": { + "geocoder": "bin/geocoder", + "geocoder-bench": "bin/geocoder-bench", + "geocoder-build-boundary": "bin/geocoder-build-boundary", + "geocoder-build-wof": "bin/geocoder-build-wof" + }, "react-native": "src/expo.js", "exports": { ".": { @@ -27,7 +33,9 @@ "sqlite3": "^5.1.7" }, "scripts": { - "test": "jasmine" + "test": "jasmine", + "build:boundary": "node scripts/generate_boundary_index.js", + "build:wof": "bash scripts/generate_wof_boundary.sh" }, "author": "Luca Spiller", "license": "MIT" diff --git a/scripts/analyze_compact_index.py b/scripts/analyze_compact_index.py new file mode 100644 index 0000000..9e5092a --- /dev/null +++ b/scripts/analyze_compact_index.py @@ -0,0 +1,372 @@ +#!/usr/bin/env python3 +"""Analyze compact geohash lookup DB size drivers. 
+ +Usage: + python scripts/analyze_compact_index.py \ + --db tmp/wof-fr-it-compact-p5-d3-pop10k-region.sqlite \ + --top 20 \ + --export-place-id 85683531 \ + --export-geojson tmp/region_cells.geojson +""" + +from __future__ import annotations + +import argparse +import json +import math +import sqlite3 +from pathlib import Path +from typing import Dict, Iterable, List, Sequence, Tuple + +BASE32 = "0123456789bcdefghjkmnpqrstuvwxyz" +BASE32_MAP = {ch: i for i, ch in enumerate(BASE32)} +PLACETYPE_BY_CODE = {0: "locality", 1: "localadmin", 2: "region"} + + +def decode_geohash_bbox(geohash: str) -> Tuple[float, float, float, float]: + lat_min, lat_max = -90.0, 90.0 + lon_min, lon_max = -180.0, 180.0 + even = True + + for ch in geohash.lower(): + value = BASE32_MAP[ch] + for mask in (16, 8, 4, 2, 1): + if even: + lon_mid = (lon_min + lon_max) / 2.0 + if value & mask: + lon_min = lon_mid + else: + lon_max = lon_mid + else: + lat_mid = (lat_min + lat_max) / 2.0 + if value & mask: + lat_min = lat_mid + else: + lat_max = lat_mid + even = not even + + return (lat_min, lon_min, lat_max, lon_max) + + +def cell_area_km2(geohash: str) -> float: + min_lat, min_lon, max_lat, max_lon = decode_geohash_bbox(geohash) + center_lat = (min_lat + max_lat) / 2.0 + d_lat = abs(max_lat - min_lat) + d_lon = abs(max_lon - min_lon) + + lat_km = d_lat * 111.32 + lon_km = d_lon * 111.32 * math.cos(math.radians(center_lat)) + return max(0.0, lat_km * lon_km) + + +def query_all(conn: sqlite3.Connection, sql: str, params: Sequence[object] = ()) -> List[sqlite3.Row]: + cur = conn.execute(sql, params) + rows = cur.fetchall() + cur.close() + return rows + + +def compact_has_placetype_code(conn: sqlite3.Connection) -> bool: + cols = query_all(conn, "PRAGMA table_info(compact_places)") + names = {row["name"] for row in cols} + return "placetype_code" in names + + +def print_summary(conn: sqlite3.Connection, top_n: int) -> None: + has_code = compact_has_placetype_code(conn) + placetype_expr = ( + "CASE 
p.placetype_code WHEN 0 THEN 'locality' WHEN 1 THEN 'localadmin' WHEN 2 THEN 'region' ELSE 'unknown' END" + if has_code + else "p.placetype" + ) + + total_rows = query_all(conn, "SELECT COUNT(*) AS c FROM compact_geohash_lookup")[0]["c"] + total_places = query_all(conn, "SELECT COUNT(*) AS c FROM compact_places")[0]["c"] + lengths = query_all( + conn, + "SELECT LENGTH(geohash) AS precision, COUNT(*) AS c " + "FROM compact_geohash_lookup GROUP BY precision ORDER BY precision", + ) + + print("=== Compact Index Summary ===") + print(f"Places: {total_places}") + print(f"Lookup rows: {total_rows}") + print("Geohash precision distribution:") + for row in lengths: + print(f" p{row['precision']}: {row['c']}") + + by_type = query_all( + conn, + """ + SELECT + {placetype_expr} AS placetype, + COUNT(DISTINCT p.id) AS place_count, + COUNT(*) AS lookup_rows + FROM compact_geohash_lookup l + JOIN compact_places p ON p.id = l.place_id + GROUP BY placetype + ORDER BY lookup_rows DESC + """.format(placetype_expr=placetype_expr), + ) + print("Rows by placetype:") + for row in by_type: + pct = (row["lookup_rows"] / total_rows * 100.0) if total_rows else 0.0 + print( + f" {row['placetype']}: places={row['place_count']}, " + f"rows={row['lookup_rows']} ({pct:.1f}%)" + ) + + top_places = query_all( + conn, + """ + SELECT + p.id, + p.name, + p.country_id, + {placetype_expr} AS placetype, + COUNT(*) AS lookup_rows + FROM compact_geohash_lookup l + JOIN compact_places p ON p.id = l.place_id + GROUP BY p.id + ORDER BY lookup_rows DESC, p.id ASC + LIMIT ? 
+ """.format(placetype_expr=placetype_expr), + (top_n,), + ) + + print(f"Top {top_n} places by lookup rows:") + for row in top_places: + print( + f" {row['id']} | {row['placetype']} | {row['country_id']} | " + f"{row['name']} | rows={row['lookup_rows']}" + ) + + region_area = query_all( + conn, + """ + SELECT l.geohash + FROM compact_geohash_lookup l + JOIN compact_places p ON p.id = l.place_id + WHERE {placetype_expr} = 'region' + """.format(placetype_expr=placetype_expr), + (), + ) + total_region_area = sum(cell_area_km2(row["geohash"]) for row in region_area) + print(f"Approx area represented by region rows (km^2): {total_region_area:,.0f}") + + +def export_place_geojson( + conn: sqlite3.Connection, + place_id: int, + output_path: Path, + limit: int | None = None, +) -> None: + has_code = compact_has_placetype_code(conn) + if has_code: + place_rows = query_all( + conn, + """ + SELECT + id, + name, + country_id, + admin1_id, + CASE placetype_code + WHEN 0 THEN 'locality' + WHEN 1 THEN 'localadmin' + WHEN 2 THEN 'region' + ELSE 'unknown' + END AS placetype + FROM compact_places + WHERE id = ? + """, + (place_id,), + ) + else: + place_rows = query_all( + conn, + "SELECT id, name, country_id, admin1_id, placetype FROM compact_places WHERE id = ?", + (place_id,), + ) + if not place_rows: + raise SystemExit(f"place_id={place_id} not found in compact_places") + place = place_rows[0] + + sql = "SELECT geohash FROM compact_geohash_lookup WHERE place_id = ? ORDER BY geohash" + params: List[object] = [place_id] + if limit is not None and limit > 0: + sql += " LIMIT ?" 
+ params.append(limit) + + geohash_rows = query_all(conn, sql, tuple(params)) + + features: List[Dict[str, object]] = [] + for row in geohash_rows: + geoh = row["geohash"] + min_lat, min_lon, max_lat, max_lon = decode_geohash_bbox(geoh) + polygon = [ + [min_lon, min_lat], + [max_lon, min_lat], + [max_lon, max_lat], + [min_lon, max_lat], + [min_lon, min_lat], + ] + features.append( + { + "type": "Feature", + "properties": { + "place_id": place["id"], + "name": place["name"], + "placetype": place["placetype"], + "country_id": place["country_id"], + "admin1_id": place["admin1_id"], + "geohash": geoh, + "precision": len(geoh), + }, + "geometry": {"type": "Polygon", "coordinates": [polygon]}, + } + ) + + payload = {"type": "FeatureCollection", "features": features} + output_path.parent.mkdir(parents=True, exist_ok=True) + output_path.write_text(json.dumps(payload), encoding="utf-8") + + print( + f"Wrote {len(features)} cell polygons for place_id={place_id} " + f"({place['name']}) to {output_path}" + ) + + +def export_all_geojson(conn: sqlite3.Connection, output_path: Path, limit: int | None = None) -> None: + has_code = compact_has_placetype_code(conn) + if has_code: + sql = """ + SELECT + l.geohash AS geohash, + p.id AS place_id, + p.name AS name, + p.country_id AS country_id, + p.admin1_id AS admin1_id, + CASE p.placetype_code + WHEN 0 THEN 'locality' + WHEN 1 THEN 'localadmin' + WHEN 2 THEN 'region' + ELSE 'unknown' + END AS placetype + FROM compact_geohash_lookup l + JOIN compact_places p ON p.id = l.place_id + ORDER BY l.geohash + """ + else: + sql = """ + SELECT + l.geohash AS geohash, + p.id AS place_id, + p.name AS name, + p.country_id AS country_id, + p.admin1_id AS admin1_id, + p.placetype AS placetype + FROM compact_geohash_lookup l + JOIN compact_places p ON p.id = l.place_id + ORDER BY l.geohash + """ + + params: List[object] = [] + if limit is not None and limit > 0: + sql += " LIMIT ?" 
+ params.append(limit) + + rows = query_all(conn, sql, tuple(params)) + + features: List[Dict[str, object]] = [] + for row in rows: + geoh = row["geohash"] + min_lat, min_lon, max_lat, max_lon = decode_geohash_bbox(geoh) + polygon = [ + [min_lon, min_lat], + [max_lon, min_lat], + [max_lon, max_lat], + [min_lon, max_lat], + [min_lon, min_lat], + ] + features.append( + { + "type": "Feature", + "properties": { + "place_id": row["place_id"], + "name": row["name"], + "placetype": row["placetype"], + "country_id": row["country_id"], + "admin1_id": row["admin1_id"], + "geohash": geoh, + "precision": len(geoh), + }, + "geometry": {"type": "Polygon", "coordinates": [polygon]}, + } + ) + + payload = {"type": "FeatureCollection", "features": features} + output_path.parent.mkdir(parents=True, exist_ok=True) + output_path.write_text(json.dumps(payload), encoding="utf-8") + print(f"Wrote {len(features)} cell polygons to {output_path}") + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Analyze compact geohash lookup DB") + parser.add_argument("--db", required=True, help="Path to SQLite DB with compact_* tables") + parser.add_argument("--top", type=int, default=20, help="Show top N places by lookup rows") + parser.add_argument("--export-place-id", type=int, default=None, help="Place id to export as cell polygons") + parser.add_argument( + "--export-geojson", + default="tmp/compact_place_cells.geojson", + help="GeoJSON output path (used with --export-place-id)", + ) + parser.add_argument( + "--export-limit", + type=int, + default=None, + help="Optional max number of geohash cells to export", + ) + parser.add_argument( + "--export-all-geojson", + default=None, + help="Write all geohash cells with place metadata to this GeoJSON path", + ) + return parser.parse_args() + + +def main() -> None: + args = parse_args() + db_path = Path(args.db) + if not db_path.exists(): + raise SystemExit(f"DB not found: {db_path}") + + conn = 
sqlite3.connect(str(db_path)) + conn.row_factory = sqlite3.Row + try: + tables = {row["name"] for row in query_all(conn, "SELECT name FROM sqlite_master WHERE type='table'")} + if "compact_places" not in tables or "compact_geohash_lookup" not in tables: + raise SystemExit("DB does not contain compact_places + compact_geohash_lookup") + + print_summary(conn, args.top) + + if args.export_place_id is not None: + export_place_geojson( + conn, + place_id=args.export_place_id, + output_path=Path(args.export_geojson), + limit=args.export_limit, + ) + + if args.export_all_geojson: + export_all_geojson( + conn, + output_path=Path(args.export_all_geojson), + limit=args.export_limit, + ) + finally: + conn.close() + + +if __name__ == "__main__": + main() diff --git a/scripts/generate_boundary_index.js b/scripts/generate_boundary_index.js new file mode 100755 index 0000000..ea9d448 --- /dev/null +++ b/scripts/generate_boundary_index.js @@ -0,0 +1,1373 @@ +#!/usr/bin/env node +"use strict"; + +const fs = require('fs') +const path = require('path') +const sqlite3 = require('sqlite3') +const geometry = require('../src/geometry') +const boundaryCover = require('../src/boundary_cover') +const geohash = require('../src/geohash') + +const PLACETYPE_CODES = { + locality: 0, + localadmin: 1, + region: 2 +} + +function parseBool(value, defaultValue) { + if (value === undefined || value === null || value === '') { + return defaultValue + } + + var normalized = String(value).toLowerCase().trim() + if (normalized === '1' || normalized === 'true' || normalized === 'yes' || normalized === 'y') { + return true + } + if (normalized === '0' || normalized === 'false' || normalized === 'no' || normalized === 'n') { + return false + } + + return defaultValue +} + +function parseArgs(argv) { + var opts = { + database: null, + input: [], + inputDir: [], + basePrecision: 4, + maxPrecision: 7, + includeLocaladmin: false, + includeRegion: false, + replace: true, + includeAlt: false, + 
dropContainedLocalities: true, + maxPlaces: null, + geometryDecimals: null, + minPopulation: 0, + indexMode: 'compact', + localityMaxPrecision: null, + localadminMaxPrecision: null, + regionMaxPrecision: null, + regionSparseMaxPrecision: null, + regionSparseMinAreaKm2: null, + promoteLocalityOverRegion: true + } + + for (var i = 0; i < argv.length; i++) { + var arg = argv[i] + + if (arg === '--database' || arg === '-d') { + opts.database = argv[++i] + } else if (arg === '--input' || arg === '-i') { + opts.input.push(argv[++i]) + } else if (arg === '--input-dir') { + opts.inputDir.push(argv[++i]) + } else if (arg === '--base-precision') { + opts.basePrecision = Number(argv[++i]) + } else if (arg === '--max-precision') { + opts.maxPrecision = Number(argv[++i]) + } else if (arg === '--include-localadmin') { + opts.includeLocaladmin = parseBool(argv[++i], false) + } else if (arg === '--include-region') { + opts.includeRegion = parseBool(argv[++i], false) + } else if (arg === '--include-alt') { + opts.includeAlt = parseBool(argv[++i], false) + } else if (arg === '--drop-contained-localities') { + opts.dropContainedLocalities = parseBool(argv[++i], true) + } else if (arg === '--max-places') { + var maxPlaces = Number(argv[++i]) + opts.maxPlaces = Number.isFinite(maxPlaces) && maxPlaces > 0 ? Math.trunc(maxPlaces) : null + } else if (arg === '--geometry-decimals') { + var decimals = Number(argv[++i]) + opts.geometryDecimals = Number.isFinite(decimals) && decimals >= 0 ? Math.trunc(decimals) : null + } else if (arg === '--min-population') { + var minPopulation = Number(argv[++i]) + opts.minPopulation = Number.isFinite(minPopulation) && minPopulation > 0 ? Math.trunc(minPopulation) : 0 + } else if (arg === '--index-mode') { + opts.indexMode = String(argv[++i] || '').toLowerCase().trim() + } else if (arg === '--locality-max-precision') { + var localityMax = Number(argv[++i]) + opts.localityMaxPrecision = Number.isFinite(localityMax) ? 
Math.trunc(localityMax) : null + } else if (arg === '--localadmin-max-precision') { + var localadminMax = Number(argv[++i]) + opts.localadminMaxPrecision = Number.isFinite(localadminMax) ? Math.trunc(localadminMax) : null + } else if (arg === '--region-max-precision') { + var regionMax = Number(argv[++i]) + opts.regionMaxPrecision = Number.isFinite(regionMax) ? Math.trunc(regionMax) : null + } else if (arg === '--region-sparse-max-precision') { + var regionSparseMax = Number(argv[++i]) + opts.regionSparseMaxPrecision = Number.isFinite(regionSparseMax) ? Math.trunc(regionSparseMax) : null + } else if (arg === '--region-sparse-min-area-km2') { + var sparseAreaKm2 = Number(argv[++i]) + opts.regionSparseMinAreaKm2 = Number.isFinite(sparseAreaKm2) && sparseAreaKm2 > 0 ? sparseAreaKm2 : null + } else if (arg === '--promote-locality-over-region') { + opts.promoteLocalityOverRegion = parseBool(argv[++i], true) + } else if (arg === '--append') { + opts.replace = false + } else if (arg === '--replace') { + opts.replace = true + } else if (arg === '--help' || arg === '-h') { + opts.help = true + } else { + throw new Error('Unknown argument: ' + arg) + } + } + + return opts +} + +function usage() { + return [ + 'Usage: node scripts/generate_boundary_index.js --database [--input ] [--input-dir ]', + '', + 'Options:', + ' --database, -d SQLite output path (required)', + ' --input, -i GeoJSON FeatureCollection/Feature or NDJSON file (repeatable)', + ' --input-dir Directory to recursively scan for GeoJSON feature files (repeatable)', + ' --base-precision Geohash base precision (default: 4)', + ' --max-precision Geohash max precision for partial subdivision (default: 7)', + ' --include-localadmin Include localadmin placetypes (default: false)', + ' --include-region Include region placetypes (default: false)', + ' --include-alt Include WOF alt geometries (default: false)', + ' --drop-contained-localities Drop locality polygons fully contained by larger localities (default: true)', + 
' --max-places Stop after this many normalized places (useful for experiments)', + ' --geometry-decimals Round geometry coordinates to N decimals before indexing/storage', + ' --min-population Drop localities below this threshold (default: 0, country capitals kept)', + ' --index-mode compact|full (default: compact)', + ' --locality-max-precision Max precision override for locality placetype', + ' --localadmin-max-precision Max precision override for localadmin placetype', + ' --region-max-precision Max precision override for region placetype', + ' --region-sparse-max-precision Optional precision for very large region polygons (for example 3)', + ' --region-sparse-min-area-km2 Area threshold to apply sparse region precision', + ' --promote-locality-over-region Prefer locality over region in shared parent cells when no competing locality exists (default: true)', + ' --append Keep existing boundary rows and append/replace by place id', + ' --replace Clear boundary rows first (default)', + ' --help, -h Show this help message' + ].join('\n') +} + +function collectGeojsonFiles(dirPath, includeAlt, files) { + var entries = fs.readdirSync(dirPath, { withFileTypes: true }) + + for (var i = 0; i < entries.length; i++) { + var entry = entries[i] + var absolutePath = path.join(dirPath, entry.name) + + if (entry.isDirectory()) { + collectGeojsonFiles(absolutePath, includeAlt, files) + continue + } + + if (!entry.isFile()) { + continue + } + + var lower = entry.name.toLowerCase() + var isGeojson = lower.endsWith('.geojson') || lower.endsWith('.json') || lower.endsWith('.ndjson') + if (!isGeojson) { + continue + } + + if (!includeAlt && lower.indexOf('-alt-') !== -1) { + continue + } + + files.push(absolutePath) + } +} + +function collectInputFiles(opts) { + var all = [] + + for (var i = 0; i < opts.input.length; i++) { + all.push(path.resolve(opts.input[i])) + } + + for (var j = 0; j < opts.inputDir.length; j++) { + var inputDir = path.resolve(opts.inputDir[j]) + if 
(!fs.existsSync(inputDir) || !fs.statSync(inputDir).isDirectory()) { + throw new Error('Input directory does not exist: ' + inputDir) + } + + collectGeojsonFiles(inputDir, opts.includeAlt, all) + } + + var dedup = Object.create(null) + all.forEach(function(filePath) { + dedup[filePath] = true + }) + + return Object.keys(dedup).sort() +} + +function readFeatures(filePath) { + var content = fs.readFileSync(filePath, 'utf8') + var trimmed = content.trim() + + if (!trimmed) { + return [] + } + + if (trimmed.charAt(0) === '{' || trimmed.charAt(0) === '[') { + var parsed = JSON.parse(trimmed) + + if (Array.isArray(parsed)) { + return parsed + } + + if (parsed.type === 'FeatureCollection' && Array.isArray(parsed.features)) { + return parsed.features + } + + if (parsed.type === 'Feature') { + return [parsed] + } + + throw new Error('Unsupported JSON root in ' + filePath + '. Expected FeatureCollection, Feature, or array.') + } + + return trimmed + .split(/\r?\n/) + .map(function(line) { return line.trim() }) + .filter(function(line) { return line && line.charAt(0) !== '#' }) + .map(function(line) { return JSON.parse(line) }) +} + +function pickFirstString(value) { + if (typeof value === 'string' && value.trim()) { + return value.trim() + } + + if (Array.isArray(value)) { + for (var i = 0; i < value.length; i++) { + var candidate = pickFirstString(value[i]) + if (candidate) return candidate + } + } + + if (value && typeof value === 'object') { + var keys = Object.keys(value) + for (var j = 0; j < keys.length; j++) { + var nested = pickFirstString(value[keys[j]]) + if (nested) return nested + } + } + + return null +} + +function parseOptionalInt(value) { + if (value === null || value === undefined || value === '') return null + var parsed = Number(value) + if (Number.isFinite(parsed)) return Math.trunc(parsed) + return null +} + +function parseOptionalFloat(value) { + if (value === null || value === undefined || value === '') return null + var parsed = Number(value) + return 
Number.isFinite(parsed) ? parsed : null +} + +function clampPrecision(value, basePrecision, fallback) { + if (!Number.isFinite(value) || value < basePrecision) { + return fallback + } + return Math.trunc(value) +} + +function parseList(value) { + if (Array.isArray(value)) return value + if (typeof value === 'string') { + var trimmed = value.trim() + if (!trimmed) return [] + if (trimmed.charAt(0) === '[') { + try { + var parsed = JSON.parse(trimmed) + return Array.isArray(parsed) ? parsed : [] + } catch (err) { + return [trimmed] + } + } + return trimmed.split(',').map(function(item) { return item.trim() }).filter(Boolean) + } + return [] +} + +function isCurrentRecord(properties) { + var props = properties || {} + + var isCurrent = props.is_current + if (isCurrent === undefined) { + isCurrent = props['mz:is_current'] + } + if (isCurrent !== undefined && isCurrent !== null && Number(isCurrent) <= 0) { + return false + } + + var deprecated = props.deprecated + if (deprecated === undefined) { + deprecated = props['edtf:deprecated'] + } + if (deprecated && String(deprecated).toLowerCase() !== 'uuuu') { + return false + } + + var supersededBy = props.superseded_by + if (supersededBy === undefined) { + supersededBy = props['wof:superseded_by'] + } + if (parseList(supersededBy).length > 0) { + return false + } + + return true +} + +function extractName(properties, feature) { + var props = properties || {} + return pickFirstString(props.name) || + pickFirstString(props['wof:name']) || + pickFirstString(props['name:preferred']) || + pickFirstString(props.name_preferred) || + pickFirstString(feature && feature.id) +} + +function extractPlacetype(properties) { + var props = properties || {} + return pickFirstString(props.placetype) || + pickFirstString(props['wof:placetype']) || + pickFirstString(props.place_type) +} + +function extractCountryId(properties) { + var props = properties || {} + return pickFirstString(props.country_id) || + pickFirstString(props['iso:country']) 
/**
 * Find the first usable `region_id` among the branches of `wof:hierarchy`.
 *
 * @param {Object} [properties]
 * @returns {number|null} The first branch `region_id` that `parseOptionalInt`
 *   accepts, or `null` when the hierarchy is missing, empty or has none.
 */
function extractHierarchyRegionId(properties) {
  const branches = properties && properties['wof:hierarchy']
  if (!Array.isArray(branches)) return null

  for (const branch of branches) {
    if (!branch || typeof branch !== 'object') continue

    const regionId = parseOptionalInt(branch.region_id)
    if (regionId !== null) return regionId
  }

  return null
}

/**
 * Resolve the admin1 (region) id of a feature.
 *
 * Tries `admin1_id`, `gn:admin1_id` and `region_id`, then the hierarchy.
 * Candidates are compared by truthiness, so a parsed id of `0` is skipped
 * like a missing one.
 *
 * @param {Object} [properties]
 * @returns {number|null}
 */
function extractAdmin1Id(properties) {
  const props = properties || {}

  for (const key of ['admin1_id', 'gn:admin1_id', 'region_id']) {
    const directId = parseOptionalInt(props[key])
    if (directId) return directId
  }

  return extractHierarchyRegionId(props) || null
}

/**
 * Resolve a representative point for a feature.
 *
 * Latitude and longitude are each taken from the first parseable property of
 * `centroid_*`, `lbl:*`, `geom:*`. When either is unavailable the midpoint of
 * the geometry's bounding box is used for both.
 *
 * @param {Object} [properties]
 * @param {Object} normalizedGeometry - Passed to `geometry.geometryBbox` for
 *   the fallback.
 * @returns {{latitude: number, longitude: number}}
 */
function extractCentroid(properties, normalizedGeometry) {
  const props = properties || {}
  const firstFloat = (keys) => {
    let parsed = null
    for (const key of keys) {
      parsed = parseOptionalFloat(props[key])
      if (parsed !== null) break
    }
    return parsed
  }

  const latitude = firstFloat(['centroid_lat', 'lbl:latitude', 'geom:latitude'])
  const longitude = firstFloat(['centroid_lon', 'lbl:longitude', 'geom:longitude'])
  if (latitude !== null && longitude !== null) {
    return { latitude: latitude, longitude: longitude }
  }

  const box = geometry.geometryBbox(normalizedGeometry)
  return {
    latitude: (box.minLat + box.maxLat) / 2,
    longitude: (box.minLon + box.maxLon) / 2
  }
}
/**
 * Approximate the area of a lat/lon bounding box in square kilometres.
 *
 * Uses 111.32 km per degree and scales the east-west extent by the cosine of
 * the box's middle latitude.
 *
 * @param {{minLat: *, maxLat: *, minLon: *, maxLon: *}} bbox - Values are
 *   coerced with `Number`.
 * @returns {number} Never negative.
 */
function bboxAreaKm2(bbox) {
  const south = Number(bbox.minLat)
  const north = Number(bbox.maxLat)
  const west = Number(bbox.minLon)
  const east = Number(bbox.maxLon)

  const middleLat = (south + north) / 2
  const heightKm = Math.abs(north - south) * 111.32
  const widthKm = Math.abs(east - west) * 111.32 * Math.cos(middleLat * Math.PI / 180)

  return Math.max(0, heightKm * Math.max(0, widthKm))
}

/**
 * Round a value to a fixed number of decimal places.
 *
 * @param {*} value - Coerced with `Number`.
 * @param {number} decimals
 * @returns {number}
 */
function roundCoordinate(value, decimals) {
  const scale = Math.pow(10, decimals)
  const scaled = Number(value) * scale
  return Math.round(scaled) / scale
}

/**
 * Round every vertex of a ring, collapse consecutive duplicates produced by
 * the rounding, and make sure the ring is closed.
 *
 * @param {Array<Array<number>>} ring - `[lon, lat]` pairs.
 * @param {number} decimals
 * @returns {Array<Array<number>>} A new ring; empty when `ring` is empty.
 */
function roundRing(ring, decimals) {
  const rounded = []

  for (const vertex of ring) {
    const lon = roundCoordinate(vertex[0], decimals)
    const lat = roundCoordinate(vertex[1], decimals)
    const previous = rounded[rounded.length - 1]
    const repeatsPrevious = previous !== undefined && previous[0] === lon && previous[1] === lat

    if (!repeatsPrevious) rounded.push([lon, lat])
  }

  if (rounded.length === 0) return rounded

  const head = rounded[0]
  const tail = rounded[rounded.length - 1]
  const closed = head[0] === tail[0] && head[1] === tail[1]
  if (!closed) rounded.push([head[0], head[1]])

  return rounded
}

/**
 * Normalize a geometry and optionally snap its coordinates to a grid.
 *
 * Rings that end up with fewer than four vertices are discarded, as are
 * polygons left with no rings. If nothing survives, the un-rounded normalized
 * geometry is returned instead.
 *
 * @param {Object} inputGeometry - Passed to `geometry.normalizeGeometry`.
 * @param {*} decimals - Rounding is skipped unless this is a finite number.
 * @returns {Object} Result of `geometry.normalizeGeometry`.
 */
function quantizeGeometry(inputGeometry, decimals) {
  const normalized = geometry.normalizeGeometry(inputGeometry)
  if (!Number.isFinite(decimals)) return normalized

  const polygons = []
  for (const polygon of normalized.coordinates) {
    const rings = polygon
      .map((ring) => roundRing(ring, decimals))
      .filter((ring) => ring.length >= 4)

    if (rings.length > 0) polygons.push(rings)
  }

  if (polygons.length === 0) return normalized

  return geometry.normalizeGeometry({
    type: 'MultiPolygon',
    coordinates: polygons
  })
}
/**
 * Read latitude/longitude out of a GeoJSON Point geometry.
 *
 * @param {Object} [pointGeometry] - Expected to have `type: 'Point'` and a
 *   `coordinates` array ordered `[lon, lat]`.
 * @returns {{latitude: number, longitude: number}|null} `null` when the input
 *   is not a Point with a coordinates array, or when either coordinate is
 *   rejected by `parseOptionalFloat`.
 */
function extractPointCoordinates(pointGeometry) {
  const isPoint = Boolean(pointGeometry) &&
    pointGeometry.type === 'Point' &&
    Array.isArray(pointGeometry.coordinates)
  if (!isPoint) return null

  const rawLon = pointGeometry.coordinates[0]
  const rawLat = pointGeometry.coordinates[1]
  const longitude = parseOptionalFloat(rawLon)
  const latitude = parseOptionalFloat(rawLat)

  const complete = latitude !== null && longitude !== null
  return complete ? { latitude: latitude, longitude: longitude } : null
}

/**
 * Build a closed rectangular GeoJSON Polygon from a bounding box.
 *
 * The ring starts at the south-west corner and visits south-east, north-east
 * and north-west before returning to the start.
 *
 * @param {{minLat: number, minLon: number, maxLat: number, maxLon: number}} bbox
 * @returns {{type: string, coordinates: Array}}
 */
function bboxPolygon(bbox) {
  const west = bbox.minLon
  const east = bbox.maxLon
  const south = bbox.minLat
  const north = bbox.maxLat

  const ring = [
    [west, south],
    [east, south],
    [east, north],
    [west, north],
    [west, south]
  ]

  return { type: 'Polygon', coordinates: [ring] }
}
'MultiPolygon' + var isPointCapital = geometryType === 'Point' && isCapital + if (!isPolygonGeometry && !isPointCapital) { + return null + } + + var normalizedGeometry + var pointCapitalHash = null + + if (isPointCapital) { + var point = extractPointCoordinates(feature.geometry) + if (!point) { + return null + } + + pointCapitalHash = geohash.encode(point.latitude, point.longitude, opts.localityMaxPrecision) + normalizedGeometry = geometry.normalizeGeometry(bboxPolygon(geohash.decodeBbox(pointCapitalHash))) + } else { + normalizedGeometry = quantizeGeometry(feature.geometry, opts.geometryDecimals) + } + + var bbox = geometry.geometryBbox(normalizedGeometry) + var centroid = extractCentroid(properties, normalizedGeometry) + var countryId = extractCountryId(properties) + + var name = extractName(properties, feature) + if (!name) { + return null + } + + var priorityRank = parseOptionalInt(properties.priority_rank) + if (priorityRank === null) priorityRank = 0 + var maxPrecisionForPlace = resolveMaxPrecisionForPlacetype(opts, placetype, bbox) + var cover = pointCapitalHash + ? 
[{ + geohash: pointCapitalHash, + precision: pointCapitalHash.length, + coverageType: 'full' + }] + : boundaryCover.buildGeohashCoverForGeometry(normalizedGeometry, { + basePrecision: Math.min(opts.basePrecision, maxPrecisionForPlace), + maxPrecision: maxPrecisionForPlace + }) + + return { + id: id, + name: name, + countryId: countryId, + admin1Id: extractAdmin1Id(properties), + placetype: placetype, + placetypeCode: placetypeCode(placetype), + centroidLat: centroid.latitude, + centroidLon: centroid.longitude, + population: population, + bboxMinLat: bbox.minLat, + bboxMinLon: bbox.minLon, + bboxMaxLat: bbox.maxLat, + bboxMaxLon: bbox.maxLon, + priorityRank: priorityRank, + area: geometry.geometryArea(normalizedGeometry), + countryName: pickFirstString(properties.country_name) || countryId || null, + admin1Name: pickFirstString(properties.admin1_name) || null, + geometry: normalizedGeometry, + cover: cover + } +} + +function localityGroupKey(place) { + return String(place.countryId || '') + '|' + String(place.admin1Id === null ? 
/**
 * Remove localities whose geometry lies entirely inside a larger locality of
 * the same country/admin1 group.
 *
 * Within each group localities are ordered by ascending area (ties by id).
 * Each one is tested against the strictly larger, not-yet-dropped localities
 * after it: a cheap bounding-box containment check first, then the full
 * geometry containment check. The first container found wins.
 *
 * @param {Array<Object>} places - Normalized places; only entries with
 *   `placetype === 'locality'` are candidates.
 * @param {*} enabled - When falsy the input array is returned untouched.
 * @returns {{places: Array<Object>, dropped: Array<{placeId: *, containedBy: *, group: string}>}}
 */
function pruneContainedLocalities(places, enabled) {
  if (!enabled) {
    return { places: places, dropped: [] }
  }

  // Bucket localities so containment is only tested inside one admin area.
  const groups = Object.create(null)
  places.forEach((place) => {
    if (place.placetype !== 'locality') return

    const groupKey = localityGroupKey(place)
    if (!groups[groupKey]) groups[groupKey] = []
    groups[groupKey].push(place)
  })

  const bboxOf = (place) => ({
    minLat: place.bboxMinLat,
    minLon: place.bboxMinLon,
    maxLat: place.bboxMaxLat,
    maxLon: place.bboxMaxLon
  })

  const dropById = Object.create(null)

  for (const groupKey of Object.keys(groups)) {
    const ordered = groups[groupKey]
    ordered.sort((a, b) => (a.area !== b.area ? a.area - b.area : a.id - b.id))

    ordered.forEach((candidate, position) => {
      if (dropById[candidate.id]) return

      for (let next = position + 1; next < ordered.length; next++) {
        const container = ordered[next]
        const eligible = !dropById[container.id] && !(container.area <= candidate.area)
        if (!eligible) continue

        if (!geometry.bboxContainsBbox(bboxOf(container), bboxOf(candidate))) continue
        if (!geometry.geometryContainsGeometry(container.geometry, candidate.geometry)) continue

        dropById[candidate.id] = {
          placeId: candidate.id,
          containedBy: container.id,
          group: groupKey
        }
        return
      }
    })
  }

  return {
    places: places.filter((place) => !dropById[place.id]),
    dropped: Object.keys(dropById).map((id) => dropById[id])
  }
}
/**
 * Translate a placetype name into its numeric code.
 *
 * @param {string} placetype
 * @returns {number} The entry from `PLACETYPE_CODES`, or `9` when the
 *   placetype has no finite code there.
 */
function placetypeCode(placetype) {
  const known = PLACETYPE_CODES[placetype]
  if (!Number.isFinite(known)) return 9
  return known
}

/**
 * Pick the deepest geohash precision to use when covering a place.
 *
 * Localities and localadmins use their dedicated caps and unknown placetypes
 * use `opts.maxPrecision`. Regions use `opts.regionMaxPrecision`, lowered to
 * `opts.regionSparseMaxPrecision` when both sparse options are finite and the
 * bounding-box area reaches `opts.regionSparseMinAreaKm2`.
 *
 * @param {Object} opts - Build options.
 * @param {string} placetype
 * @param {Object} bbox - Only consulted for regions with the sparse rule on.
 * @returns {number}
 */
function resolveMaxPrecisionForPlacetype(opts, placetype, bbox) {
  switch (placetype) {
    case 'locality':
      return opts.localityMaxPrecision
    case 'localadmin':
      return opts.localadminMaxPrecision
    case 'region':
      break
    default:
      return opts.maxPrecision
  }

  const sparseRuleActive = Number.isFinite(opts.regionSparseMaxPrecision) &&
    Number.isFinite(opts.regionSparseMinAreaKm2)

  if (sparseRuleActive && bboxAreaKm2(bbox) >= opts.regionSparseMinAreaKm2) {
    return Math.min(opts.regionMaxPrecision, opts.regionSparseMaxPrecision)
  }
  return opts.regionMaxPrecision
}

/**
 * Squared distance, in degrees, between a point and a target, with the
 * longitude term weighted by cos²(latitude of the first point).
 *
 * Intended for ordering only; the value is not a physical distance.
 *
 * @param {*} latitude - Coerced with `Number`.
 * @param {*} longitude - Coerced with `Number`.
 * @param {*} targetLatitude - Coerced with `Number`.
 * @param {*} targetLongitude - Coerced with `Number`.
 * @returns {number}
 */
function pointDistanceScore(latitude, longitude, targetLatitude, targetLongitude) {
  const fromLat = Number(latitude)
  const fromLon = Number(longitude)
  const latGap = fromLat - Number(targetLatitude)
  const lonGap = fromLon - Number(targetLongitude)
  const lonWeight = Math.pow(Math.cos(fromLat * Math.PI / 180), 2)

  return (latGap * latGap) + (lonGap * lonGap * lonWeight)
}
/**
 * Tell whether a placetype code denotes a city-level place, i.e. a locality
 * or a localadmin as listed in `PLACETYPE_CODES`.
 *
 * @param {*} code
 * @returns {boolean}
 */
function isCityPlacetypeCode(code) {
  if (code === PLACETYPE_CODES.locality) return true
  return code === PLACETYPE_CODES.localadmin
}
/**
 * Drop lookup rows that add nothing over their nearest retained ancestor.
 *
 * A row is redundant only when the closest shorter prefix that was itself
 * kept maps to the same place. Comparing against *any* ancestor is not
 * enough: with `a -> 1`, `ab -> 2`, `abc -> 1` the row `abc` must survive,
 * otherwise `abc` would resolve through `ab` to place 2.
 *
 * NOTE(review): this assumes readers resolve a point through its longest
 * stored geohash prefix — confirm against the lookup code.
 *
 * @param {Array<{geohash: string, placeId: *}>} sortedRows - Rows ordered so
 *   that every prefix precedes the hashes it prefixes (shorter first).
 * @returns {Array<{geohash: string, placeId: *}>} The retained rows, in input
 *   order.
 */
function compactSortedLookupRows(sortedRows) {
  var kept = []
  var keptPlaceByHash = Object.create(null)

  for (var index = 0; index < sortedRows.length; index++) {
    var row = sortedRows[index]
    var hasAncestor = false
    var ancestorPlaceId

    // Walk from the longest prefix down; the first retained one decides.
    for (var length = row.geohash.length - 1; length >= 1; length--) {
      var prefix = row.geohash.slice(0, length)
      if (prefix in keptPlaceByHash) {
        hasAncestor = true
        ancestorPlaceId = keptPlaceByHash[prefix]
        break
      }
    }

    if (hasAncestor && ancestorPlaceId === row.placeId) {
      continue
    }

    keptPlaceByHash[row.geohash] = row.placeId
    kept.push(row)
  }

  return kept
}

/**
 * Build the `geohash -> place id` rows for the compact index.
 *
 * Steps: pick one winning place per covered geohash with
 * `comparePlacesForHash`, let `promoteLocalityParentsByRegionCompetition`
 * adjust the winners, order the rows by hash length then lexicographically,
 * and finally remove rows that are redundant with a retained ancestor cell.
 *
 * @param {Array<Object>} places - Normalized places, each with `id` and a
 *   `cover` array of `{geohash}` cells.
 * @param {Object} opts - Build options, forwarded to the promotion step.
 * @returns {Array<{geohash: string, placeId: *}>}
 */
function buildCompactLookupRows(places, opts) {
  var bestByHash = Object.create(null)
  var hashCenterCache = Object.create(null)
  var placeById = Object.create(null)

  for (var index = 0; index < places.length; index++) {
    placeById[String(places[index].id)] = places[index]
  }

  for (var i = 0; i < places.length; i++) {
    var place = places[i]
    for (var j = 0; j < place.cover.length; j++) {
      var hash = place.cover[j].geohash
      var current = bestByHash[hash]
      if (!current || comparePlacesForHash(place, current, hash, hashCenterCache) < 0) {
        bestByHash[hash] = place
      }
    }
  }

  // The promotion step works on ids, not place objects.
  var bestByHashId = Object.create(null)
  Object.keys(bestByHash).forEach(function(currentHash) {
    bestByHashId[currentHash] = bestByHash[currentHash].id
  })

  promoteLocalityParentsByRegionCompetition(bestByHashId, placeById, opts)

  var rows = Object.keys(bestByHashId).map(function(hash) {
    return {
      geohash: hash,
      placeId: bestByHashId[hash]
    }
  })

  // Shorter hashes first so ancestors are decided before their descendants.
  rows.sort(function(a, b) {
    if (a.geohash.length !== b.geohash.length) {
      return a.geohash.length - b.geohash.length
    }
    if (a.geohash < b.geohash) return -1
    if (a.geohash > b.geohash) return 1
    return 0
  })

  return compactSortedLookupRows(rows)
}

/**
 * Promise wrapper around `db.exec(sql, callback)`.
 *
 * @param {Object} db - Object exposing a callback-style `exec`.
 * @param {string} sql
 * @returns {Promise<void>} Rejects with the callback's error.
 */
function dbExec(db, sql) {
  return new Promise(function(resolve, reject) {
    db.exec(sql, function(err) {
      if (err) reject(err)
      else resolve()
    })
  })
}

/**
 * Promise wrapper around `db.run(sql, params, callback)`.
 *
 * @param {Object} db - Object exposing a callback-style `run`.
 * @param {string} sql
 * @param {Array} [params] - Defaults to an empty array.
 * @returns {Promise<Object>} Resolves with the callback's `this` value, so the
 *   callback must stay a regular function rather than an arrow function.
 */
function dbRun(db, sql, params) {
  return new Promise(function(resolve, reject) {
    db.run(sql, params || [], function(err) {
      if (err) reject(err)
      else resolve(this)
    })
  })
}

/**
 * Promise wrapper around `db.close(callback)`.
 *
 * @param {Object} db - Object exposing a callback-style `close`.
 * @returns {Promise<void>} Rejects with the callback's error.
 */
function dbClose(db) {
  return new Promise(function(resolve, reject) {
    db.close(function(err) {
      if (err) reject(err)
      else resolve()
    })
  })
}

/**
 * Promise wrapper around `stmt.run(params, callback)`.
 *
 * @param {Object} stmt - Prepared statement exposing a callback-style `run`.
 * @param {Array} params
 * @returns {Promise<void>} Rejects with the callback's error.
 */
function stmtRun(stmt, params) {
  return new Promise(function(resolve, reject) {
    stmt.run(params, function(err) {
      if (err) reject(err)
      else resolve()
    })
  })
}

/**
 * Promise wrapper around `stmt.finalize(callback)`.
 *
 * @param {Object} stmt - Prepared statement exposing a callback-style
 *   `finalize`.
 * @returns {Promise<void>} Rejects with the callback's error.
 */
function stmtFinalize(stmt) {
  return new Promise(function(resolve, reject) {
    stmt.finalize(function(err) {
      if (err) reject(err)
      else resolve()
    })
  })
}
dbExec(db, ` + CREATE TABLE IF NOT EXISTS compact_places( + id INTEGER PRIMARY KEY, + name TEXT NOT NULL, + country_id TEXT NOT NULL, + admin1_id INTEGER, + placetype_code INTEGER NOT NULL, + latitude REAL NOT NULL, + longitude REAL NOT NULL + ); + + CREATE TABLE IF NOT EXISTS compact_geohash_lookup( + geohash TEXT PRIMARY KEY, + place_id INTEGER NOT NULL, + FOREIGN KEY (place_id) REFERENCES compact_places(id) + ); + + CREATE INDEX IF NOT EXISTS compact_places_placetype_code ON compact_places (placetype_code); + CREATE INDEX IF NOT EXISTS compact_geohash_lookup_place_id ON compact_geohash_lookup (place_id); + `) + return + } + + await dbExec(db, ` + CREATE TABLE IF NOT EXISTS countries( + id TEXT PRIMARY KEY, + name TEXT NOT NULL + ); + + CREATE TABLE IF NOT EXISTS admin1( + country_id TEXT NOT NULL, + id INTEGER NOT NULL, + name TEXT NOT NULL, + PRIMARY KEY (country_id, id) + ); + + CREATE TABLE IF NOT EXISTS places( + id INTEGER PRIMARY KEY, + name TEXT NOT NULL, + country_id TEXT NOT NULL, + admin1_id INTEGER, + placetype TEXT NOT NULL, + centroid_lat REAL NOT NULL, + centroid_lon REAL NOT NULL, + bbox_min_lat REAL NOT NULL, + bbox_min_lon REAL NOT NULL, + bbox_max_lat REAL NOT NULL, + bbox_max_lon REAL NOT NULL, + priority_rank INTEGER NOT NULL DEFAULT 0, + area REAL NOT NULL DEFAULT 0, + country_name TEXT, + admin1_name TEXT + ); + + CREATE TABLE IF NOT EXISTS place_geohash_cover( + geohash TEXT NOT NULL, + precision INTEGER NOT NULL, + place_id INTEGER NOT NULL, + coverage_type TEXT NOT NULL CHECK (coverage_type IN ('full', 'partial')), + PRIMARY KEY (geohash, precision, place_id), + FOREIGN KEY (place_id) REFERENCES places(id) + ); + + CREATE TABLE IF NOT EXISTS place_geometry( + place_id INTEGER PRIMARY KEY, + encoding TEXT NOT NULL DEFAULT 'json', + geometry BLOB NOT NULL, + FOREIGN KEY (place_id) REFERENCES places(id) + ); + + CREATE TABLE IF NOT EXISTS place_geohash_lookup( + geohash TEXT PRIMARY KEY, + place_id INTEGER NOT NULL, + FOREIGN KEY (place_id) 
/**
 * Read every input file, normalize its features and de-duplicate the result
 * by place id (a later feature with the same id replaces an earlier one).
 *
 * Reading stops as soon as the number of distinct ids reaches
 * `opts.maxPlaces`, when that option is set.
 *
 * @param {Array<string>} files - Paths handed to `readFeatures`.
 * @param {Object} opts - Build options, forwarded to `normalizeFeature`.
 * @returns {{places: Array<Object>, normalizedCount: number}} `places` sorted
 *   by ascending id; `normalizedCount` counts every accepted feature,
 *   duplicates included.
 */
function normalizePlaces(files, opts) {
  var byId = Object.create(null)
  var normalizedCount = 0
  // Running count of distinct ids, so the cap check does not have to
  // re-enumerate every key after each accepted feature.
  var uniqueCount = 0

  for (var i = 0; i < files.length; i++) {
    var features = readFeatures(files[i])

    for (var j = 0; j < features.length; j++) {
      var place = normalizeFeature(features[j], opts)
      if (!place) continue

      var key = String(place.id)
      if (!(key in byId)) uniqueCount += 1
      byId[key] = place
      normalizedCount += 1

      if (opts.maxPlaces && uniqueCount >= opts.maxPlaces) {
        break
      }
    }

    if (opts.maxPlaces && uniqueCount >= opts.maxPlaces) {
      break
    }
  }

  var places = Object.keys(byId)
    .map(function(id) { return byId[id] })
    .sort(function(a, b) { return a.id - b.id })

  return {
    places: places,
    normalizedCount: normalizedCount
  }
}
?, ?, ?, ?, ?, ?, ?, ?) + `) + + geometryStmt = db.prepare(` + INSERT OR REPLACE INTO place_geometry(place_id, encoding, geometry) + VALUES (?, ?, ?) + `) + + coverStmt = db.prepare(` + INSERT OR REPLACE INTO place_geohash_cover(geohash, precision, place_id, coverage_type) + VALUES (?, ?, ?, ?) + `) + } else { + placeStmt = db.prepare(` + INSERT OR REPLACE INTO compact_places( + id, name, country_id, admin1_id, placetype_code, + latitude, longitude + ) VALUES (?, ?, ?, ?, ?, ?, ?) + `) + + compactStmt = db.prepare(` + INSERT OR REPLACE INTO compact_geohash_lookup(geohash, place_id) + VALUES (?, ?) + `) + } + + try { + for (var i = 0; i < places.length; i++) { + var place = places[i] + + if (!opts.replace) { + if (opts.indexMode === 'compact') { + await dbRun(db, 'DELETE FROM compact_geohash_lookup WHERE place_id = ?', [place.id]) + await dbRun(db, 'DELETE FROM compact_places WHERE id = ?', [place.id]) + } else { + await dbRun(db, 'DELETE FROM place_geohash_lookup WHERE place_id = ?', [place.id]) + await dbRun(db, 'DELETE FROM place_geohash_cover WHERE place_id = ?', [place.id]) + await dbRun(db, 'DELETE FROM place_geometry WHERE place_id = ?', [place.id]) + } + } + + if (opts.indexMode === 'compact') { + await stmtRun(placeStmt, [ + place.id, + place.name, + place.countryId, + place.admin1Id, + place.placetypeCode, + place.centroidLat, + place.centroidLon + ]) + } else { + await stmtRun(placeStmt, [ + place.id, + place.name, + place.countryId, + place.admin1Id, + place.placetype, + place.centroidLat, + place.centroidLon, + place.bboxMinLat, + place.bboxMinLon, + place.bboxMaxLat, + place.bboxMaxLon, + place.priorityRank, + place.area, + place.countryName, + place.admin1Name + ]) + } + + if (opts.indexMode === 'full') { + await stmtRun(geometryStmt, [ + place.id, + 'json', + JSON.stringify(place.geometry) + ]) + + for (var j = 0; j < place.cover.length; j++) { + var cell = place.cover[j] + await stmtRun(coverStmt, [ + cell.geohash, + cell.precision, + place.id, + 
cell.coverageType + ]) + } + } + } + + if (opts.indexMode === 'compact') { + for (var rowIndex = 0; rowIndex < compactLookupRows.length; rowIndex++) { + var row = compactLookupRows[rowIndex] + await stmtRun(compactStmt, [row.geohash, row.placeId]) + } + } + } finally { + await stmtFinalize(placeStmt) + if (geometryStmt) await stmtFinalize(geometryStmt) + if (coverStmt) await stmtFinalize(coverStmt) + if (compactStmt) await stmtFinalize(compactStmt) + } + + await dbExec(db, 'COMMIT') + } catch (err) { + await dbExec(db, 'ROLLBACK') + throw err + } +} + +async function main() { + var options = parseArgs(process.argv.slice(2)) + + if (options.help) { + console.log(usage()) + process.exit(0) + } + + if (!options.database) { + throw new Error('Missing required --database argument') + } + + if (!options.input.length && !options.inputDir.length) { + throw new Error('Provide at least one --input file or --input-dir') + } + + if (!Number.isFinite(options.basePrecision) || options.basePrecision < 1) { + throw new Error('--base-precision must be a positive number') + } + + if (!Number.isFinite(options.maxPrecision) || options.maxPrecision < options.basePrecision) { + throw new Error('--max-precision must be >= --base-precision') + } + + if (options.indexMode !== 'compact' && options.indexMode !== 'full') { + throw new Error('--index-mode must be either compact or full') + } + + options.localityMaxPrecision = clampPrecision(options.localityMaxPrecision, options.basePrecision, options.maxPrecision) + options.localadminMaxPrecision = clampPrecision(options.localadminMaxPrecision, options.basePrecision, options.maxPrecision) + options.regionMaxPrecision = clampPrecision(options.regionMaxPrecision, options.basePrecision, options.maxPrecision) + if (options.regionSparseMaxPrecision !== null) { + if (!Number.isFinite(options.regionSparseMaxPrecision) || options.regionSparseMaxPrecision < 1) { + options.regionSparseMaxPrecision = null + } else { + options.regionSparseMaxPrecision = 
Math.trunc(options.regionSparseMaxPrecision) + if (options.regionSparseMaxPrecision > options.regionMaxPrecision) { + options.regionSparseMaxPrecision = options.regionMaxPrecision + } + } + } + + var files = collectInputFiles(options) + if (!files.length) { + throw new Error('No input files were found after filtering') + } + + var normalized = normalizePlaces(files, options) + var dedupedPlaces = normalized.places + + if (!dedupedPlaces.length) { + throw new Error('No valid locality/localadmin/region records were found in the provided input files') + } + + var pruned = pruneContainedLocalities(dedupedPlaces, options.dropContainedLocalities) + var finalPlaces = pruned.places + var compactLookupRows = options.indexMode === 'compact' ? buildCompactLookupRows(finalPlaces, options) : [] + + var databasePath = path.resolve(options.database) + var db = new sqlite3.Database(databasePath) + + try { + await ensureBoundarySchema(db, options) + await writePlaces(db, finalPlaces, options, compactLookupRows) + + var coverCount = finalPlaces.reduce(function(total, place) { + return total + place.cover.length + }, 0) + + console.log('Boundary index build complete') + console.log('Database: ' + databasePath) + console.log('Input files scanned: ' + files.length) + console.log('Features normalized: ' + normalized.normalizedCount) + console.log('Places (deduped by id): ' + dedupedPlaces.length) + console.log('Places dropped (contained locality prune): ' + pruned.dropped.length) + console.log('Places written: ' + finalPlaces.length) + if (options.indexMode === 'compact') { + console.log('Geohash lookup rows: ' + compactLookupRows.length) + } else { + console.log('Geohash cover rows: ' + coverCount) + } + var modeLabel = 'locality' + if (options.includeLocaladmin) modeLabel += ' + localadmin' + if (options.includeRegion) modeLabel += ' + region' + console.log('Mode: ' + modeLabel) + console.log('Precision: ' + options.basePrecision + ' -> ' + options.maxPrecision) + 
#!/bin/bash
set -euo pipefail

# Build boundary-aware lookup tables from Who's On First admin repositories.
#
# Usage:
#   ./scripts/generate_wof_boundary.sh [output_db_path]
#
# Environment variables:
#   WOF_COUNTRIES                     Comma-separated ISO2 country codes (default: FR,IT)
#   WOF_WORKDIR                       Working directory for archives/extraction (default: ./tmp/wof-build)
#   WOF_DOWNLOAD                      Set to 0 to skip downloads and reuse existing archives (default: 1)
#   WOF_REF                           Git ref to download from codeload (default: master)
#   WOF_BASE_PRECISION                Geohash base precision (default: 4)
#   WOF_MAX_PRECISION                 Geohash max precision (default: 5)
#   WOF_LOCALITY_MAX_PRECISION        Locality max precision override (default: WOF_MAX_PRECISION)
#   WOF_LOCALADMIN_MAX_PRECISION      Localadmin max precision override (default: WOF_MAX_PRECISION)
#   WOF_REGION_MAX_PRECISION          Region max precision override (default: 4)
#   WOF_REGION_SPARSE_MAX_PRECISION   Sparse large-region precision (default: 3)
#   WOF_REGION_SPARSE_MIN_AREA_KM2    Area threshold for sparse region precision (default: 80000)
#   WOF_PROMOTE_LOCALITY_OVER_REGION  Prefer locality labels over region in shared parent cells (default: 1)
#   WOF_INCLUDE_LOCALADMIN            Include localadmin placetypes (default: 0)
#   WOF_INCLUDE_REGION                Include region placetypes (default: 1)
#   WOF_DROP_CONTAINED_LOCALITIES     Drop localities contained in larger localities (default: 1)
#   WOF_INCLUDE_ALT                   Include -alt- geometries (default: 0)
#   WOF_GEOMETRY_DECIMALS             Optional coordinate rounding precision (e.g. 4)
#   WOF_MIN_POPULATION                Optional minimum population filter (default: 0)
#   WOF_MAX_PLACES                    Optional cap for experiment runs
#
# Notes:
#   - This helper always builds `--index-mode compact` (geohash -> place only).
#   - Archives are cached in WOF_WORKDIR/archives and reused on later runs.

WOF_COUNTRIES="${WOF_COUNTRIES:-FR,IT}"
WOF_WORKDIR="${WOF_WORKDIR:-$(pwd)/tmp/wof-build}"
WOF_DOWNLOAD="${WOF_DOWNLOAD:-1}"
WOF_REF="${WOF_REF:-master}"
WOF_BASE_PRECISION="${WOF_BASE_PRECISION:-4}"
WOF_MAX_PRECISION="${WOF_MAX_PRECISION:-5}"
WOF_LOCALITY_MAX_PRECISION="${WOF_LOCALITY_MAX_PRECISION:-${WOF_MAX_PRECISION}}"
WOF_LOCALADMIN_MAX_PRECISION="${WOF_LOCALADMIN_MAX_PRECISION:-${WOF_MAX_PRECISION}}"
WOF_REGION_MAX_PRECISION="${WOF_REGION_MAX_PRECISION:-4}"
WOF_REGION_SPARSE_MAX_PRECISION="${WOF_REGION_SPARSE_MAX_PRECISION:-3}"
WOF_REGION_SPARSE_MIN_AREA_KM2="${WOF_REGION_SPARSE_MIN_AREA_KM2:-80000}"
WOF_PROMOTE_LOCALITY_OVER_REGION="${WOF_PROMOTE_LOCALITY_OVER_REGION:-1}"
WOF_INCLUDE_LOCALADMIN="${WOF_INCLUDE_LOCALADMIN:-0}"
WOF_INCLUDE_REGION="${WOF_INCLUDE_REGION:-1}"
WOF_DROP_CONTAINED_LOCALITIES="${WOF_DROP_CONTAINED_LOCALITIES:-1}"
WOF_INCLUDE_ALT="${WOF_INCLUDE_ALT:-0}"
WOF_GEOMETRY_DECIMALS="${WOF_GEOMETRY_DECIMALS:-}"
WOF_MIN_POPULATION="${WOF_MIN_POPULATION:-0}"
WOF_MAX_PLACES="${WOF_MAX_PLACES:-}"
OUTPUT="${1:-db.sqlite}"

# Resolve the builder next to this script so the helper also works when it is
# invoked from a directory other than the repository root.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# A relative output path is interpreted against the caller's working directory.
case "${OUTPUT}" in
  /*) ;;
  *) OUTPUT="$(pwd)/${OUTPUT}" ;;
esac

ARCHIVE_DIR="${WOF_WORKDIR}/archives"
EXTRACT_DIR="${WOF_WORKDIR}/extracted"
mkdir -p "${ARCHIVE_DIR}" "${EXTRACT_DIR}"

INPUT_ARGS=()

IFS=',' read -r -a COUNTRY_ITEMS <<< "${WOF_COUNTRIES}"
for item in "${COUNTRY_ITEMS[@]}"; do
  # Lower-case and trim each code; skip empty items such as a trailing comma.
  country="$(echo "${item}" | tr '[:upper:]' '[:lower:]' | xargs)"
  if [[ -z "${country}" ]]; then
    continue
  fi

  repo="whosonfirst-data-admin-${country}"
  archive="${ARCHIVE_DIR}/${repo}-${WOF_REF}.tar.gz"

  if [[ ! -f "${archive}" ]]; then
    if [[ "${WOF_DOWNLOAD}" != "1" ]]; then
      echo "Missing ${archive} and WOF_DOWNLOAD=${WOF_DOWNLOAD}." >&2
      echo "Provide the archive locally or set WOF_DOWNLOAD=1." >&2
      exit 1
    fi

    url="https://codeload.github.com/whosonfirst-data/${repo}/tar.gz/${WOF_REF}"
    partial="${archive}.partial"
    echo "Downloading ${repo}@${WOF_REF}..."
    # Download to a sidecar file and rename only on success, so an interrupted
    # transfer is never mistaken for a usable cached archive on the next run.
    if ! curl -fsSL "${url}" -o "${partial}"; then
      rm -f "${partial}"
      echo "Failed to download ${url}" >&2
      exit 1
    fi
    mv "${partial}" "${archive}"
  else
    echo "Using existing archive ${archive}"
  fi

  country_extract="${EXTRACT_DIR}/${country}"
  rm -rf "${country_extract}"
  mkdir -p "${country_extract}"
  tar -xzf "${archive}" -C "${country_extract}"

  # Take the first top-level directory of the extracted archive. A glob is used
  # instead of `find | head` so `pipefail` cannot trip on a closed pipe.
  root_dir=""
  for candidate in "${country_extract}"/*/; do
    if [[ -d "${candidate}" ]]; then
      root_dir="${candidate%/}"
      break
    fi
  done
  if [[ -z "${root_dir}" ]]; then
    echo "Failed to find extracted root directory for ${repo}" >&2
    exit 1
  fi

  data_dir="${root_dir}/data"
  if [[ ! -d "${data_dir}" ]]; then
    echo "Expected data directory not found: ${data_dir}" >&2
    exit 1
  fi

  INPUT_ARGS+=(--input-dir "${data_dir}")
done

if [[ ${#INPUT_ARGS[@]} -eq 0 ]]; then
  echo "No input directories resolved from WOF_COUNTRIES=${WOF_COUNTRIES}" >&2
  exit 1
fi

CMD=(
  node "${SCRIPT_DIR}/generate_boundary_index.js"
  --database "${OUTPUT}"
  --index-mode "compact"
  --base-precision "${WOF_BASE_PRECISION}"
  --max-precision "${WOF_MAX_PRECISION}"
  --locality-max-precision "${WOF_LOCALITY_MAX_PRECISION}"
  --localadmin-max-precision "${WOF_LOCALADMIN_MAX_PRECISION}"
  --region-max-precision "${WOF_REGION_MAX_PRECISION}"
  --region-sparse-max-precision "${WOF_REGION_SPARSE_MAX_PRECISION}"
  --region-sparse-min-area-km2 "${WOF_REGION_SPARSE_MIN_AREA_KM2}"
  --promote-locality-over-region "${WOF_PROMOTE_LOCALITY_OVER_REGION}"
  --include-localadmin "${WOF_INCLUDE_LOCALADMIN}"
  --include-region "${WOF_INCLUDE_REGION}"
  --drop-contained-localities "${WOF_DROP_CONTAINED_LOCALITIES}"
  --include-alt "${WOF_INCLUDE_ALT}"
  --min-population "${WOF_MIN_POPULATION}"
)

# Optional flags are only passed when the corresponding variable is set.
if [[ -n "${WOF_MAX_PLACES}" ]]; then
  CMD+=(--max-places "${WOF_MAX_PLACES}")
fi

if [[ -n "${WOF_GEOMETRY_DECIMALS}" ]]; then
  CMD+=(--geometry-decimals "${WOF_GEOMETRY_DECIMALS}")
fi

CMD+=("${INPUT_ARGS[@]}")

"${CMD[@]}"
INDEX coordinates_lat_lng ON coordinates (latitude, longitude); CREATE INDEX features_name_nocase ON features (name COLLATE NOCASE); CREATE INDEX features_asciiname_nocase ON features (asciiname COLLATE NOCASE); CREATE INDEX features_population_desc ON features (population DESC); + +CREATE TABLE places( + id INTEGER PRIMARY KEY, + name TEXT NOT NULL, + country_id TEXT NOT NULL, + admin1_id INTEGER, + placetype TEXT NOT NULL, + centroid_lat REAL NOT NULL, + centroid_lon REAL NOT NULL, + bbox_min_lat REAL NOT NULL, + bbox_min_lon REAL NOT NULL, + bbox_max_lat REAL NOT NULL, + bbox_max_lon REAL NOT NULL, + priority_rank INTEGER NOT NULL DEFAULT 0, + area REAL NOT NULL DEFAULT 0, + country_name TEXT, + admin1_name TEXT +); + +CREATE TABLE place_geohash_cover( + geohash TEXT NOT NULL, + precision INTEGER NOT NULL, + place_id INTEGER NOT NULL, + coverage_type TEXT NOT NULL CHECK (coverage_type IN ('full', 'partial')), + PRIMARY KEY (geohash, precision, place_id), + FOREIGN KEY (place_id) REFERENCES places(id) +); + +CREATE TABLE place_geometry( + place_id INTEGER PRIMARY KEY, + encoding TEXT NOT NULL DEFAULT 'json', + geometry BLOB NOT NULL, + FOREIGN KEY (place_id) REFERENCES places(id) +); + +CREATE TABLE place_geohash_lookup( + geohash TEXT PRIMARY KEY, + place_id INTEGER NOT NULL, + FOREIGN KEY (place_id) REFERENCES places(id) +); + +CREATE INDEX place_geohash_cover_hash_precision ON place_geohash_cover (geohash, precision); +CREATE INDEX place_geohash_cover_place_id ON place_geohash_cover (place_id); +CREATE INDEX places_placetype ON places (placetype); +CREATE INDEX place_geometry_place_id ON place_geometry (place_id); +CREATE INDEX place_geohash_lookup_place_id ON place_geohash_lookup (place_id); diff --git a/spec/boundary_builder_spec.js b/spec/boundary_builder_spec.js new file mode 100644 index 0000000..637c63b --- /dev/null +++ b/spec/boundary_builder_spec.js @@ -0,0 +1,329 @@ +const fs = require('fs'); +const os = require('os'); +const path = require('path'); 
+const { spawnSync } = require('child_process'); +const sqlite3 = require('sqlite3'); +const geohash = require('../src/geohash'); + +function all(db, sql) { + return new Promise((resolve, reject) => { + db.all(sql, [], (err, rows) => (err ? reject(err) : resolve(rows || []))); + }); +} + +function close(db) { + return new Promise((resolve, reject) => { + db.close((err) => (err ? reject(err) : resolve())); + }); +} + +describe('boundary builder', () => { + it('drops contained localities when pruning is enabled', async () => { + const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'offline-geocoder-builder-')); + try { + const inputPath = path.join(dir, 'localities.geojson'); + const dbPath = path.join(dir, 'boundary.sqlite'); + + fs.writeFileSync(inputPath, JSON.stringify({ + type: 'FeatureCollection', + features: [ + { + type: 'Feature', + id: 2001, + properties: { + name: 'Outer City', + placetype: 'locality', + country_id: 'US', + admin1_id: 5, + is_current: 1 + }, + geometry: { + type: 'Polygon', + coordinates: [[[-5, -5], [5, -5], [5, 5], [-5, 5], [-5, -5]]] + } + }, + { + type: 'Feature', + id: 2002, + properties: { + name: 'Inner Duplicate', + placetype: 'locality', + country_id: 'US', + admin1_id: 5, + is_current: 1 + }, + geometry: { + type: 'Polygon', + coordinates: [[[-1, -1], [1, -1], [1, 1], [-1, 1], [-1, -1]]] + } + } + ] + })); + + const result = spawnSync('node', [ + path.join(__dirname, '..', 'scripts', 'generate_boundary_index.js'), + '--database', dbPath, + '--input', inputPath, + '--base-precision', '4', + '--max-precision', '5', + '--index-mode', 'compact', + '--drop-contained-localities', 'true' + ], { encoding: 'utf8' }); + + expect(result.status).toEqual(0); + + const db = new sqlite3.Database(dbPath); + try { + const rows = await all(db, 'SELECT id, name FROM compact_places ORDER BY id ASC'); + expect(rows).toEqual([{ id: 2001, name: 'Outer City' }]); + + const lookupRows = await all(db, 'SELECT geohash, place_id FROM compact_geohash_lookup'); 
+ expect(lookupRows.length).toBeGreaterThan(0); + + const legacyRows = await all(db, "SELECT count(*) AS count FROM sqlite_master WHERE type='table' AND name='place_geometry'"); + expect(legacyRows[0].count).toEqual(0); + } finally { + await close(db); + } + } finally { + fs.rmSync(dir, { recursive: true, force: true }); + } + }); + + it('rolls small localities up to region boundaries when configured', async () => { + const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'offline-geocoder-builder-')); + try { + const inputPath = path.join(dir, 'rollup.geojson'); + const dbPath = path.join(dir, 'rollup.sqlite'); + + fs.writeFileSync(inputPath, JSON.stringify({ + type: 'FeatureCollection', + features: [ + { + type: 'Feature', + id: 3001, + properties: { + name: 'Wide Region', + placetype: 'region', + country_id: 'US', + admin1_id: 5, + is_current: 1 + }, + geometry: { + type: 'Polygon', + coordinates: [[[-5, -5], [5, -5], [5, 5], [-5, 5], [-5, -5]]] + } + }, + { + type: 'Feature', + id: 3002, + properties: { + name: 'Small Village', + placetype: 'locality', + country_id: 'US', + admin1_id: 5, + population: 1200, + is_current: 1 + }, + geometry: { + type: 'Polygon', + coordinates: [[[-0.5, -0.5], [0.5, -0.5], [0.5, 0.5], [-0.5, 0.5], [-0.5, -0.5]]] + } + } + ] + })); + + const result = spawnSync('node', [ + path.join(__dirname, '..', 'scripts', 'generate_boundary_index.js'), + '--database', dbPath, + '--input', inputPath, + '--index-mode', 'compact', + '--include-region', 'true', + '--min-population', '5000', + '--base-precision', '4', + '--max-precision', '5' + ], { encoding: 'utf8' }); + + expect(result.status).toEqual(0); + + const db = new sqlite3.Database(dbPath); + try { + const rows = await all(db, 'SELECT id, name, placetype_code FROM compact_places ORDER BY id ASC'); + expect(rows).toEqual([{ id: 3001, name: 'Wide Region', placetype_code: 2 }]); + + const lookupRows = await all(db, 'SELECT count(*) AS count FROM compact_geohash_lookup'); + 
expect(lookupRows[0].count).toBeGreaterThan(0); + } finally { + await close(db); + } + } finally { + fs.rmSync(dir, { recursive: true, force: true }); + } + }); + + it('keeps point capitals as locality cells even below min population', async () => { + const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'offline-geocoder-builder-')); + try { + const inputPath = path.join(dir, 'point-capital.geojson'); + const dbPath = path.join(dir, 'point-capital.sqlite'); + + fs.writeFileSync(inputPath, JSON.stringify({ + type: 'FeatureCollection', + features: [ + { + type: 'Feature', + id: 4001, + properties: { + name: 'Fallback Region', + placetype: 'region', + country_id: 'GF', + admin1_id: 85671195, + is_current: 1 + }, + geometry: { + type: 'Polygon', + coordinates: [[[-53.2, 4.8], [-52.2, 4.8], [-52.2, 5.2], [-53.2, 5.2], [-53.2, 4.8]]] + } + }, + { + type: 'Feature', + id: 4002, + properties: { + name: 'Cayenne', + placetype: 'locality', + country_id: 'GF', + admin1_id: 85671195, + population: 600, + 'gn:feature_code': 'PPLC', + is_current: 1 + }, + geometry: { + type: 'Point', + coordinates: [-52.33333, 4.93333] + } + } + ] + })); + + const result = spawnSync('node', [ + path.join(__dirname, '..', 'scripts', 'generate_boundary_index.js'), + '--database', dbPath, + '--input', inputPath, + '--index-mode', 'compact', + '--include-region', 'true', + '--min-population', '5000', + '--base-precision', '4', + '--max-precision', '5' + ], { encoding: 'utf8' }); + + expect(result.status).toEqual(0); + + const db = new sqlite3.Database(dbPath); + try { + const places = await all(db, 'SELECT id, name, placetype_code FROM compact_places ORDER BY id ASC'); + expect(places).toEqual([ + { id: 4001, name: 'Fallback Region', placetype_code: 2 }, + { id: 4002, name: 'Cayenne', placetype_code: 0 } + ]); + + const capitalRows = await all(db, 'SELECT geohash FROM compact_geohash_lookup WHERE place_id = 4002'); + expect(capitalRows.length).toEqual(1); + 
expect(capitalRows[0].geohash.length).toBeGreaterThanOrEqual(4); + expect(capitalRows[0].geohash.length).toBeLessThanOrEqual(5); + } finally { + await close(db); + } + } finally { + fs.rmSync(dir, { recursive: true, force: true }); + } + }); + + it('promotes locality over region when there is no competing locality in the same parent cell', async () => { + const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'offline-geocoder-builder-')); + try { + const inputPath = path.join(dir, 'locality-region-promotion.geojson'); + const dbPath = path.join(dir, 'locality-region-promotion.sqlite'); + const parentHash = 's000'; + const parentBbox = geohash.decodeBbox(parentHash); + const midLon = (parentBbox.minLon + parentBbox.maxLon) / 2; + + fs.writeFileSync(inputPath, JSON.stringify({ + type: 'FeatureCollection', + features: [ + { + type: 'Feature', + id: 5001, + properties: { + name: 'Wide Region', + placetype: 'region', + country_id: 'AR', + admin1_id: 1, + is_current: 1 + }, + geometry: { + type: 'Polygon', + coordinates: [[ + [parentBbox.minLon, parentBbox.minLat], + [parentBbox.maxLon, parentBbox.minLat], + [parentBbox.maxLon, parentBbox.maxLat], + [parentBbox.minLon, parentBbox.maxLat], + [parentBbox.minLon, parentBbox.minLat] + ]] + } + }, + { + type: 'Feature', + id: 5002, + properties: { + name: 'Metro City', + placetype: 'locality', + country_id: 'AR', + admin1_id: 1, + population: 1000000, + is_current: 1 + }, + geometry: { + type: 'Polygon', + coordinates: [[ + [parentBbox.minLon, parentBbox.minLat], + [midLon, parentBbox.minLat], + [midLon, parentBbox.maxLat], + [parentBbox.minLon, parentBbox.maxLat], + [parentBbox.minLon, parentBbox.minLat] + ]] + } + } + ] + })); + + const result = spawnSync('node', [ + path.join(__dirname, '..', 'scripts', 'generate_boundary_index.js'), + '--database', dbPath, + '--input', inputPath, + '--index-mode', 'compact', + '--include-region', 'true', + '--base-precision', '4', + '--max-precision', '5', + '--promote-locality-over-region', 
'true' + ], { encoding: 'utf8' }); + + expect(result.status).toEqual(0); + + const db = new sqlite3.Database(dbPath); + try { + const parentRow = await all(db, `SELECT geohash, place_id FROM compact_geohash_lookup WHERE geohash='${parentHash}'`); + expect(parentRow).toEqual([{ geohash: parentHash, place_id: 5002 }]); + + const regionDescendants = await all( + db, + `SELECT COUNT(*) AS count FROM compact_geohash_lookup WHERE geohash LIKE '${parentHash}%' AND geohash <> '${parentHash}' AND place_id = 5001` + ); + expect(regionDescendants[0].count).toEqual(0); + } finally { + await close(db); + } + } finally { + fs.rmSync(dir, { recursive: true, force: true }); + } + }); +}); diff --git a/spec/boundary_cover_spec.js b/spec/boundary_cover_spec.js new file mode 100644 index 0000000..a75e97d --- /dev/null +++ b/spec/boundary_cover_spec.js @@ -0,0 +1,52 @@ +const geohash = require('../src/geohash'); +const boundaryCover = require('../src/boundary_cover'); + +describe('boundary geohash cover', () => { + it('marks fully contained geohash cells as full', () => { + const cell = geohash.decodeBbox('s000'); + const exactCellPolygon = { + type: 'Polygon', + coordinates: [[ + [cell.minLon, cell.minLat], + [cell.maxLon, cell.minLat], + [cell.maxLon, cell.maxLat], + [cell.minLon, cell.maxLat], + [cell.minLon, cell.minLat] + ]] + }; + + const cover = boundaryCover.buildGeohashCoverForGeometry(exactCellPolygon, { + basePrecision: 4, + maxPrecision: 6 + }); + + expect(cover).toContain(jasmine.objectContaining({ + geohash: 's000', + precision: 4, + coverageType: 'full' + })); + }); + + it('subdivides partial cells until max precision', () => { + const cell = geohash.decodeBbox('s000'); + const diagonalPolygon = { + type: 'Polygon', + coordinates: [[ + [cell.minLon, cell.minLat], + [cell.maxLon, cell.minLat], + [cell.minLon, cell.maxLat], + [cell.minLon, cell.minLat] + ]] + }; + + const cover = boundaryCover.buildGeohashCoverForGeometry(diagonalPolygon, { + basePrecision: 4, + 
maxPrecision: 5 + }); + + expect(cover.some((entry) => entry.precision === 5)).toBeTrue(); + + const uniqueKeys = new Set(cover.map((entry) => `${entry.geohash}|${entry.precision}`)); + expect(uniqueKeys.size).toEqual(cover.length); + }); +}); diff --git a/spec/geometry_spec.js b/spec/geometry_spec.js new file mode 100644 index 0000000..6fd54e6 --- /dev/null +++ b/spec/geometry_spec.js @@ -0,0 +1,47 @@ +const geometry = require('../src/geometry'); + +describe('geometry utilities', () => { + it('handles polygon holes in point-in-polygon checks', () => { + const polygonWithHole = { + type: 'Polygon', + coordinates: [ + [[-5, -5], [5, -5], [5, 5], [-5, 5], [-5, -5]], + [[-1, -1], [1, -1], [1, 1], [-1, 1], [-1, -1]] + ] + }; + + expect(geometry.pointInGeometry(polygonWithHole, 2, 2)).toBeTrue(); + expect(geometry.pointInGeometry(polygonWithHole, 0, 0)).toBeFalse(); + }); + + it('supports multipolygon containment', () => { + const multipolygon = { + type: 'MultiPolygon', + coordinates: [ + [[[-11, -11], [-9, -11], [-9, -9], [-11, -9], [-11, -11]]], + [[[9, 9], [11, 9], [11, 11], [9, 11], [9, 9]]] + ] + }; + + expect(geometry.pointInGeometry(multipolygon, 10, 10)).toBeTrue(); + expect(geometry.pointInGeometry(multipolygon, 0, 0)).toBeFalse(); + }); + + it('detects when one geometry is contained by another', () => { + const outer = { + type: 'Polygon', + coordinates: [[[-5, -5], [5, -5], [5, 5], [-5, 5], [-5, -5]]] + }; + const inner = { + type: 'Polygon', + coordinates: [[[-1, -1], [1, -1], [1, 1], [-1, 1], [-1, -1]]] + }; + const farAway = { + type: 'Polygon', + coordinates: [[[10, 10], [11, 10], [11, 11], [10, 11], [10, 10]]] + }; + + expect(geometry.geometryContainsGeometry(outer, inner)).toBeTrue(); + expect(geometry.geometryContainsGeometry(outer, farAway)).toBeFalse(); + }); +}); diff --git a/spec/helpers/fixture_db.js b/spec/helpers/fixture_db.js index e5c8ba1..1a8deee 100644 --- a/spec/helpers/fixture_db.js +++ b/spec/helpers/fixture_db.js @@ -1,10 +1,13 @@ "use 
strict"; -const fs = require('fs') -const os = require('os') -const path = require('path') +const fs = require('fs') +const os = require('os') +const path = require('path') const sqlite3 = require('sqlite3') +const boundaryCover = require('../../src/boundary_cover') +const geometry = require('../../src/geometry') + const schemaSql = fs.readFileSync(path.join(__dirname, '../../scripts/schema.sql'), 'utf8') const fixtureSql = ` @@ -18,26 +21,205 @@ INSERT INTO features(id, name, asciiname, country_id, admin1_id, population) VAL (3169070, 'Rome', 'Rome', 'IT', 7, 2873000), (2988507, 'Paris', 'Paris', 'FR', 11, 2138551), (5128581, 'New York City', 'New York City', 'US', 36, 8175133), - (5368361, 'Los Angeles', 'Los Angeles', 'US', 5, 3792621); + (5368361, 'Los Angeles', 'Los Angeles', 'US', 5, 3792621), + (9100001, 'Westville', 'Westville', 'US', 5, 50000), + (9100002, 'Eastville', 'Eastville', 'US', 5, 60000), + (9100003, 'Centerville', 'Centerville', 'US', 5, 20000), + (9100004, 'Midtown', 'Midtown', 'US', 5, 1000); INSERT INTO coordinates(feature_id, latitude, longitude) VALUES (3169070, 41.89193, 12.51133), (2988507, 48.85341, 2.3488), (5128581, 40.71427, -74.00597), - (5368361, 34.05223, -118.24368); + (5368361, 34.05223, -118.24368), + (9100001, 0, -2), + (9100002, 0, 0.2), + (9100003, 0, 0.1), + (9100004, 0, 0.05); ` +const boundaryFixtures = [ + { + id: 9100001, + name: 'Westville', + countryId: 'US', + admin1Id: 5, + placetype: 'locality', + priorityRank: 20, + geometry: { + type: 'Polygon', + coordinates: [[ + [-1, -1], + [0, -1], + [0, 1], + [-1, 1], + [-1, -1] + ]] + } + }, + { + id: 9100002, + name: 'Eastville', + countryId: 'US', + admin1Id: 5, + placetype: 'locality', + priorityRank: 30, + geometry: { + type: 'Polygon', + coordinates: [[ + [0, -1], + [1, -1], + [1, 1], + [0, 1], + [0, -1] + ]] + } + }, + { + id: 9100003, + name: 'Centerville', + countryId: 'US', + admin1Id: 5, + placetype: 'locality', + priorityRank: 10, + geometry: { + type: 'Polygon', 
+ coordinates: [[ + [-0.2, -0.2], + [0.2, -0.2], + [0.2, 0.2], + [-0.2, 0.2], + [-0.2, -0.2] + ]] + } + }, + { + id: 9100004, + name: 'Midtown', + countryId: 'US', + admin1Id: 5, + placetype: 'neighbourhood', + priorityRank: 1, + geometry: { + type: 'Polygon', + coordinates: [[ + [-0.15, -0.15], + [0.15, -0.15], + [0.15, 0.15], + [-0.15, 0.15], + [-0.15, -0.15] + ]] + } + } +] + function exec(db, sql) { return new Promise(function(resolve, reject) { db.exec(sql, function(err) { err ? reject(err) : resolve() }) }) } +function run(db, sql, params) { + return new Promise(function(resolve, reject) { + db.run(sql, params || [], function(err) { err ? reject(err) : resolve() }) + }) +} + function close(db) { return new Promise(function(resolve, reject) { db.close(function(err) { err ? reject(err) : resolve() }) }) } +async function seedBoundaryData(db) { + await exec(db, 'BEGIN') + + try { + var compactByHash = Object.create(null) + + for (var i = 0; i < boundaryFixtures.length; i++) { + var place = boundaryFixtures[i] + var normalizedGeometry = geometry.normalizeGeometry(place.geometry) + var bbox = geometry.geometryBbox(normalizedGeometry) + var area = geometry.geometryArea(normalizedGeometry) + var cover = boundaryCover.buildGeohashCoverForGeometry(normalizedGeometry, { + basePrecision: 4, + maxPrecision: 7 + }) + + await run(db, ` + INSERT INTO places( + id, name, country_id, admin1_id, placetype, + centroid_lat, centroid_lon, + bbox_min_lat, bbox_min_lon, bbox_max_lat, bbox_max_lon, + priority_rank, area, country_name, admin1_name + ) + SELECT + f.id, + f.name, + ?, + ?, + ?, + c.latitude, + c.longitude, + ?, ?, ?, ?, + ?, ?, + 'United States', + 'California' + FROM features f + JOIN coordinates c ON c.feature_id = f.id + WHERE f.id = ? 
+ `, [ + place.countryId, + place.admin1Id, + place.placetype, + bbox.minLat, + bbox.minLon, + bbox.maxLat, + bbox.maxLon, + place.priorityRank, + area, + place.id + ]) + + await run(db, ` + INSERT INTO place_geometry(place_id, encoding, geometry) + VALUES (?, 'json', ?) + `, [place.id, JSON.stringify(normalizedGeometry)]) + + for (var j = 0; j < cover.length; j++) { + await run(db, ` + INSERT INTO place_geohash_cover(geohash, precision, place_id, coverage_type) + VALUES (?, ?, ?, ?) + `, [cover[j].geohash, cover[j].precision, place.id, cover[j].coverageType]) + + if (place.placetype === 'locality' || place.placetype === 'localadmin') { + var existing = compactByHash[cover[j].geohash] + if (!existing || place.priorityRank < existing.priorityRank || (place.priorityRank === existing.priorityRank && place.id < existing.placeId)) { + compactByHash[cover[j].geohash] = { + placeId: place.id, + priorityRank: place.priorityRank + } + } + } + } + } + + var hashes = Object.keys(compactByHash) + for (var h = 0; h < hashes.length; h++) { + var hash = hashes[h] + await run(db, ` + INSERT INTO place_geohash_lookup(geohash, place_id) + VALUES (?, ?) 
// Boundary-mode reverse lookups against a fixture database after the rows of
// place_geohash_cover and place_geometry have been deleted.
describe('geocoder.reverse boundary mode (compact geohash lookup)', () => {
  var fixture;
  var geocoder;

  // Async setup: a rejected promise now fails the suite immediately instead of
  // leaving `done` uncalled until the Jasmine timeout fires.
  beforeAll(async () => {
    fixture = await fixtureDb.createFixtureDatabase();

    const db = new sqlite3.Database(fixture.databasePath);
    try {
      // Force compact-only runtime path in this fixture.
      await exec(db, 'DELETE FROM place_geohash_cover; DELETE FROM place_geometry;');
    } finally {
      await close(db);
    }

    geocoder = createGeocoder({
      database: fixture.databasePath,
      reverseMode: 'boundary',
      boundary: {
        basePrecision: 4,
        maxPrecision: 7
      }
    });
  });

  afterAll(() => {
    // Setup may have failed before the fixture was created; do not mask that
    // failure with a TypeError here.
    if (fixture) {
      fixture.cleanup();
    }
  });

  it('uses compact geohash lookup for containing areas', async () => {
    const result = await geocoder.reverse(0, -0.5);
    expect(result.id).toEqual(9100001);
    expect(result.name).toEqual('Westville');
  });

  it('falls back to nearest boundary centroid when no compact hash matches', async () => {
    const result = await geocoder.reverse(0, 1.5);
    expect(result.id).toEqual(9100002);
    expect(result.name).toEqual('Eastville');
  });
});
// Boundary-mode reverse lookups against the full fixture database.
describe('geocoder.reverse boundary mode', () => {
  var fixture;
  var geocoder;

  // Async setup: a rejected fixture promise fails the suite immediately
  // instead of leaving `done` uncalled until the Jasmine timeout fires.
  beforeAll(async () => {
    fixture = await fixtureDb.createFixtureDatabase();
    geocoder = createGeocoder({
      database: fixture.databasePath,
      reverseMode: 'boundary',
      boundary: {
        basePrecision: 4,
        maxPrecision: 7
      }
    });
  });

  afterAll(() => {
    // Setup may have failed before the fixture was created.
    if (fixture) {
      fixture.cleanup();
    }
  });

  // Each spec awaits the lookup so a rejection or a thrown expectation is
  // reported as a failure rather than as a timeout.
  it('chooses containing locality polygons instead of nearest centroids', async () => {
    const result = await geocoder.reverse(0, -0.5);
    expect(result.id).toEqual(9100001);
    expect(result.name).toEqual('Westville');
  });

  it('uses deterministic tie-breakers and ignores neighbourhood placetypes', async () => {
    const result = await geocoder.reverse(0.1, 0.1);
    expect(result.id).toEqual(9100003);
    expect(result.name).toEqual('Centerville');
  });

  it('falls back to nearest boundary centroid when no polygon contains the point', async () => {
    const result = await geocoder.reverse(0, 1.5);
    expect(result.id).toEqual(9100002);
    expect(result.name).toEqual('Eastville');
  });
});
index_list('place_geohash_cover')", [], function(err, boundaryIndexes) { + db.all("PRAGMA index_list('place_geohash_lookup')", [], function(err, compactIndexes) { + var coordNames = coordIndexes.map(function(i) { return i.name }); + var featNames = featIndexes.map(function(i) { return i.name }); + var boundaryNames = boundaryIndexes.map(function(i) { return i.name }); + var compactNames = compactIndexes.map(function(i) { return i.name }); - expect(coordNames).toContain('coordinates_lat_lng'); - expect(featNames).toContain('features_name_nocase'); - expect(featNames).toContain('features_asciiname_nocase'); - expect(featNames).toContain('features_population_desc'); - done(); + expect(coordNames).toContain('coordinates_lat_lng'); + expect(featNames).toContain('features_name_nocase'); + expect(featNames).toContain('features_asciiname_nocase'); + expect(featNames).toContain('features_population_desc'); + expect(boundaryNames).toContain('place_geohash_cover_hash_precision'); + expect(boundaryNames).toContain('place_geohash_cover_place_id'); + expect(compactNames).toContain('place_geohash_lookup_place_id'); + done(); + }); + }); }); }); }); + + it('includes boundary lookup tables', (done) => { + db.all("SELECT name FROM sqlite_master WHERE type='table' AND name IN ('places', 'place_geohash_cover', 'place_geometry', 'place_geohash_lookup')", [], function(err, rows) { + var names = rows.map(function(row) { return row.name }); + expect(names).toContain('places'); + expect(names).toContain('place_geohash_cover'); + expect(names).toContain('place_geometry'); + expect(names).toContain('place_geohash_lookup'); + done(); + }); + }); }); diff --git a/src/boundary_cover.js b/src/boundary_cover.js new file mode 100644 index 0000000..dc8ee86 --- /dev/null +++ b/src/boundary_cover.js @@ -0,0 +1,99 @@ +"use strict"; + +const geohash = require('./geohash') +const geometry = require('./geometry') + +const EPSILON = 1e-12 + +function clampLatitude(value) { + var lat = Number(value) + if 
/**
 * List the geohash cells of the given precision that overlap a bounding box.
 *
 * The box is mapped onto the regular grid described by geohash.cellSize();
 * every grid cell between the start and end indexes is encoded from its
 * centre point.
 *
 * @param {{minLat: number, minLon: number, maxLat: number, maxLon: number}} bbox
 *   Bounding box in degrees.
 * @param {number} precision Geohash precision passed to cellSize()/encode().
 * @returns {string[]} Unique geohashes covering the box.
 */
function seedGeohashesForBbox(bbox, precision) {
  var size = geohash.cellSize(precision)
  var minLat = clampLatitude(bbox.minLat)
  var minLon = clampLongitude(bbox.minLon)

  // The upper edges are pulled in by EPSILON so a box that ends exactly on a
  // cell boundary does not seed the next row/column. They must never drop
  // below the lower edges, otherwise a zero-area box sitting on a boundary
  // (for example a point at 0,0) would produce no seeds at all.
  var maxLat = Math.max(minLat, clampLatitude(bbox.maxLat - EPSILON))
  var maxLon = Math.max(minLon, clampLongitude(bbox.maxLon - EPSILON))

  // Last valid row/column. An edge at exactly +90 / +180 would otherwise index
  // one past the grid, and the longitude would wrap around when encoded.
  var lastLatIndex = Math.round(180 / size.lat) - 1
  var lastLonIndex = Math.round(360 / size.lon) - 1

  var latStart = Math.min(Math.floor((minLat + 90) / size.lat), lastLatIndex)
  var latEnd = Math.min(Math.floor((maxLat + 90) / size.lat), lastLatIndex)
  var lonStart = Math.min(Math.floor((minLon + 180) / size.lon), lastLonIndex)
  var lonEnd = Math.min(Math.floor((maxLon + 180) / size.lon), lastLonIndex)

  // Prototype-less object used as a set of hashes.
  var hashes = Object.create(null)

  for (var latIndex = latStart; latIndex <= latEnd; latIndex++) {
    var centerLat = -90 + (latIndex + 0.5) * size.lat
    for (var lonIndex = lonStart; lonIndex <= lonEnd; lonIndex++) {
      var centerLon = -180 + (lonIndex + 0.5) * size.lon
      hashes[geohash.encode(centerLat, centerLon, precision)] = true
    }
  }

  return Object.keys(hashes)
}
/**
 * Resolve the geohash precision range used for boundary-mode reverse lookups.
 *
 * Reads `options.boundary.basePrecision` and `options.boundary.maxPrecision`,
 * coercing each with Number(). Invalid or missing values fall back to 4 and 7.
 *
 * @param {Object} [options] Geocoder options; `boundary` is optional.
 * @returns {{basePrecision: number, maxPrecision: number}} Precision range in
 *   which maxPrecision is never below basePrecision.
 */
function resolveBoundaryOptions(options) {
  var boundary = (options && options.boundary) || {}

  var basePrecision = Number(boundary.basePrecision)
  if (!Number.isFinite(basePrecision) || basePrecision < 1) {
    basePrecision = 4
  }

  var maxPrecision = Number(boundary.maxPrecision)
  if (!Number.isFinite(maxPrecision) || maxPrecision < basePrecision) {
    // Default to 7, but never below the base: with a base above 7 a plain
    // default of 7 would hand callers an inverted range.
    maxPrecision = Math.max(basePrecision, 7)
  }

  return {
    basePrecision: basePrecision,
    maxPrecision: maxPrecision
  }
}
// Coerces a value to a longitude usable by encode().
//
// value - anything Number() accepts.
//
// Returns a number in [-180, 180). Values outside the range are wrapped by
// whole turns of 360 degrees; exactly 180 is nudged just below 180. Input that
// is not a finite number (NaN, Infinity, -Infinity) yields 0.
function normalizeLongitude(value) {
  var longitude = Number(value)

  // Non-finite input cannot be wrapped: repeatedly adding or subtracting 360
  // never brings Infinity into range, so it is treated like NaN.
  if (!Number.isFinite(longitude)) {
    return 0
  }

  if (longitude > 180 || longitude < -180) {
    // The remainder is exact for every finite double and takes constant time,
    // unlike stepping by 360 (which also stalls once the value is so large
    // that subtracting 360 no longer changes it). `+ 0` turns -0 into 0.
    longitude = (longitude % 360) + 0
    if (longitude > 180) {
      longitude -= 360
    } else if (longitude < -180) {
      longitude += 360
    }
  }

  if (longitude === 180) {
    longitude = 179.99999999999997
  }

  return longitude
}
// Decodes a geohash into the bounding box of its cell.
//
// hash - geohash string (case-insensitive); a falsy value decodes to the
//        whole world.
//
// Returns { minLat, minLon, maxLat, maxLon }.
// Throws an Error when a character is not in BASE32_MAP.
function decodeBbox(hash) {
  const text = String(hash || '').toLowerCase()
  const latRange = [-90, 90]
  const lonRange = [-180, 180]

  // Bits alternate between the two axes, starting with longitude, and the
  // alternation carries over from one character to the next.
  let refineLongitude = true

  for (let index = 0; index < text.length; index++) {
    const symbol = text.charAt(index)
    const bits = BASE32_MAP[symbol]
    if (bits === undefined) {
      throw new Error('Invalid geohash character: ' + symbol)
    }

    // Five bits per character, most significant first.
    for (let shift = 4; shift >= 0; shift--) {
      const range = refineLongitude ? lonRange : latRange
      const middle = (range[0] + range[1]) / 2

      // A set bit keeps the upper half of the interval, a clear bit the lower.
      if ((bits >> shift) & 1) {
        range[0] = middle
      } else {
        range[1] = middle
      }

      refineLongitude = !refineLongitude
    }
  }

  return {
    minLat: latRange[0],
    minLon: lonRange[0],
    maxLat: latRange[1],
    maxLon: lonRange[1]
  }
}
// Returns a numeric copy of a ring whose last vertex equals its first.
//
// ring - array of [x, y] vertices; anything that is not a non-empty array
//        yields [].
//
// The input is never modified. Coordinates are coerced with Number(), and a
// copy of the first vertex is appended when the ring is not already closed.
function closeRing(ring) {
  const usable = Array.isArray(ring) && ring.length > 0
  if (!usable) {
    return []
  }

  const vertices = ring.map(function(vertex) {
    return [vertex[0], vertex[1]].map(Number)
  })

  const start = vertices[0]
  const end = vertices[vertices.length - 1]
  const alreadyClosed = start[0] === end[0] && start[1] === end[1]

  if (!alreadyClosed) {
    vertices.push(start.slice())
  }

  return vertices
}
// Tests whether a point lies inside, or on the edge of, a closed ring.
//
// point - [x, y]
// ring  - array of [x, y] vertices whose last entry repeats the first
//
// Returns false for rings with fewer than four entries, true as soon as the
// point sits on any edge (per pointOnSegment), and otherwise the parity of
// the edges crossed by a ray cast from the point towards larger x.
function pointInRing(point, ring) {
  if (!ring || ring.length < 4) {
    return false
  }

  // The final entry duplicates the first, so it is not a separate vertex.
  const vertexCount = ring.length - 1
  let crossings = 0
  let previous = vertexCount - 1

  for (let current = 0; current < vertexCount; current++) {
    const a = ring[current]
    const b = ring[previous]
    previous = current

    if (pointOnSegment(point, a, b)) {
      return true
    }

    const py = point[1]
    const straddles = (a[1] > py) !== (b[1] > py)
    if (!straddles) {
      continue
    }

    // x coordinate at which the edge meets the horizontal line through the
    // point.
    const crossingX = (b[0] - a[0]) * (py - a[1]) / ((b[1] - a[1]) || EPSILON) + a[0]
    if (point[0] < crossingX) {
      crossings += 1
    }
  }

  return crossings % 2 === 1
}
// Tests whether the segment a-b touches a rectangle.
//
// a, b - segment end points as [lon, lat]
// rect - { minLat, minLon, maxLat, maxLon }
//
// Returns true when either end point is inside the rectangle (per
// bboxContainsPoint) or the segment meets one of its four sides (per
// segmentsIntersect); false otherwise.
function segmentIntersectsRect(a, b, rect) {
  if (bboxContainsPoint(rect, a[1], a[0])) {
    return true
  }

  if (bboxContainsPoint(rect, b[1], b[0])) {
    return true
  }

  // Corners in order around the rectangle; each side joins a corner to the
  // next one, wrapping back to the first.
  const corners = [
    [rect.minLon, rect.minLat],
    [rect.maxLon, rect.minLat],
    [rect.maxLon, rect.maxLat],
    [rect.minLon, rect.maxLat]
  ]

  return corners.some(function(corner, index) {
    const next = corners[(index + 1) % corners.length]
    return segmentsIntersect(a, b, corner, next)
  })
}
// Tests whether any vertex of a geometry falls inside a rectangle.
//
// geometry        - Polygon or MultiPolygon (passed through normalizeGeometry)
// rect            - { minLat, minLon, maxLat, maxLon }
// includeBoundary - when falsy, vertices lying on the rectangle's outline
//                   (per pointOnRectBoundary) are ignored
//
// Returns true at the first qualifying vertex, false when there is none.
function anyVertexInsideRect(geometry, rect, includeBoundary) {
  const multiPolygon = normalizeGeometry(geometry)

  function qualifies(vertex) {
    if (!bboxContainsPoint(rect, vertex[1], vertex[0])) {
      return false
    }

    if (includeBoundary) {
      return true
    }

    return !pointOnRectBoundary(rect, vertex)
  }

  return multiPolygon.coordinates.some(function(polygon) {
    return polygon.some(function(ring) {
      return ring.some(qualifies)
    })
  })
}
// Tests whether every vertex of one geometry lies within another.
//
// containerGeometry - Polygon or MultiPolygon that should enclose the other
// candidateGeometry - Polygon or MultiPolygon being tested
//
// Returns false when the candidate's bounding box is not inside the
// container's, or when any candidate vertex fails pointInGeometry against the
// container; true otherwise. Only vertices are tested, not the edges between
// them.
function geometryContainsGeometry(containerGeometry, candidateGeometry) {
  const outer = normalizeGeometry(containerGeometry)
  const inner = normalizeGeometry(candidateGeometry)

  // Cheap rejection before any per-vertex work.
  if (!bboxContainsBbox(geometryBbox(outer), geometryBbox(inner))) {
    return false
  }

  return inner.coordinates.every(function(polygon) {
    return polygon.every(function(ring) {
      // The last entry of a ring repeats the first, so it is skipped (unless
      // the ring has just one entry).
      const vertices = ring.length > 1 ? ring.slice(0, -1) : ring

      return vertices.every(function(vertex) {
        return pointInGeometry(outer, vertex[1], vertex[0])
      })
    })
  })
}
// Resolves the geohash precision range used for boundary lookups from
// `options.boundary`.
//
// options - geocoder options; `options.boundary` may carry `basePrecision`
//           and `maxPrecision`.
//
// Returns { basePrecision, maxPrecision }, both whole numbers with
// 1 <= basePrecision <= maxPrecision. Defaults are 4 and 7.
function resolveBoundaryOptions(options) {
  var boundary = (options && options.boundary) || {}

  // A geohash precision is a character count, so fractions are dropped.
  var basePrecision = Math.floor(Number(boundary.basePrecision))
  if (!Number.isFinite(basePrecision) || basePrecision < 1) {
    basePrecision = 4
  }

  var maxPrecision = Math.floor(Number(boundary.maxPrecision))
  if (!Number.isFinite(maxPrecision) || maxPrecision < basePrecision) {
    // Fall back to the default of 7, but never below the base precision
    // (a configured base of e.g. 9 would otherwise give an inverted range).
    maxPrecision = Math.max(7, basePrecision)
  }

  return {
    basePrecision: basePrecision,
    maxPrecision: maxPrecision
  }
}
// Bridges a promise to an optional node-style callback.
//
// promise  - the pending lookup.
// callback - optional function(err, result).
//
// Returns the same promise it was given. When `callback` is a function it is
// called exactly once: with (undefined, result) on fulfilment or with
// (err, undefined) on rejection.
function executeWithCallback(promise, callback) {
  if (typeof callback !== 'function') {
    return promise
  }

  // Both handlers are passed to a single then(). Chaining .catch() after
  // .then() would also catch an exception thrown by the callback itself and
  // call the callback a second time, reporting the caller's own error as a
  // lookup failure. An exception thrown by the callback now surfaces as a
  // rejection of the derived promise instead.
  promise.then(
    function(result) {
      callback(undefined, result)
    },
    function(err) {
      callback(err, undefined)
    }
  )

  return promise
}
hashes) { + if (!hashes.length) { + return Promise.resolve(undefined) + } + + var placeholders = hashes.map(function() { return '?' }).join(',') + var params = hashes.map(function(hash) { return hash.geohash }) + var placetypePlaceholders = SUPPORTED_PLACETYPE_CODES.map(function() { return '?' }).join(', ') + params = params.concat(SUPPORTED_PLACETYPE_CODES) + + var query = ` + SELECT + l.geohash AS geohash, + p.id AS id, + p.name AS name, + p.country_id AS country_id, + p.country_id AS country_name, + p.admin1_id AS admin1_id, + '' AS admin1_name, + p.latitude AS latitude, + p.longitude AS longitude, + CASE p.placetype_code + WHEN 0 THEN 'locality' + WHEN 1 THEN 'localadmin' + WHEN 2 THEN 'region' + ELSE 'region' + END AS placetype, + 0 AS priority_rank, + 0 AS area + FROM compact_geohash_lookup l + JOIN compact_places p ON p.id = l.place_id + WHERE l.geohash IN (${placeholders}) + AND p.placetype_code IN (${placetypePlaceholders}) + ORDER BY + LENGTH(l.geohash) DESC, + p.placetype_code ASC, + p.id ASC + LIMIT 1 + ` + + return dbAll(geocoder, query, params).then(function(rows) { + return rows[0] + }) +} + +function fetchCompactBoundaryMatchLegacy(geocoder, hashes) { + if (!hashes.length) { + return Promise.resolve(undefined) + } + + var placeholders = hashes.map(function() { return '?' }).join(',') + var params = hashes.map(function(hash) { return hash.geohash }) + var placetypePlaceholders = SUPPORTED_PLACETYPES.map(function() { return '?' 
}).join(', ') + params = params.concat(SUPPORTED_PLACETYPES) + + var query = ` + SELECT + l.geohash AS geohash, + p.id AS id, + p.name AS name, + p.country_id AS country_id, + COALESCE(c.name, p.country_name, p.country_id, '') AS country_name, + p.admin1_id AS admin1_id, + COALESCE(a.name, p.admin1_name, '') AS admin1_name, + p.centroid_lat AS latitude, + p.centroid_lon AS longitude, + p.placetype AS placetype, + p.priority_rank AS priority_rank, + p.area AS area + FROM place_geohash_lookup l + JOIN places p ON p.id = l.place_id + LEFT JOIN countries c ON c.id = p.country_id + LEFT JOIN admin1 a ON a.country_id = p.country_id AND a.id = p.admin1_id + WHERE l.geohash IN (${placeholders}) + AND p.placetype IN (${placetypePlaceholders}) + ORDER BY + LENGTH(l.geohash) DESC, + CASE p.placetype WHEN 'locality' THEN 0 WHEN 'localadmin' THEN 1 ELSE 2 END, + p.priority_rank ASC, + p.id ASC + LIMIT 1 + ` + + return dbAll(geocoder, query, params).then(function(rows) { + return rows[0] + }) +} + +function fetchBoundaryCandidates(geocoder, hashes) { + if (!hashes.length) { + return Promise.resolve([]) + } + + var clauses = [] + var params = [] + for (var i = 0; i < hashes.length; i++) { + clauses.push('(g.geohash = ? AND g.precision = ?)') + params.push(hashes[i].geohash) + params.push(hashes[i].precision) + } + var placetypePlaceholders = SUPPORTED_PLACETYPES.map(function() { return '?' 
// Loads the polygons for the given places through a per-geocoder cache.
//
// geocoder - geocoder instance; the cache lives on
//            geocoder._boundaryGeometryCache.
// placeIds - ids to look up in place_geometry.
//
// Returns a promise for the cache object, keyed by String(place id). A value
// is either a normalized geometry or null when the place has no row or its
// geometry could not be parsed. An empty `placeIds` resolves to a fresh
// empty object.
function loadPlaceGeometries(geocoder, placeIds) {
  if (!placeIds.length) {
    return Promise.resolve(Object.create(null))
  }

  var cache = geocoder._boundaryGeometryCache
  if (!cache) {
    cache = Object.create(null)
    geocoder._boundaryGeometryCache = cache
  }

  // `null` is a cached "no usable geometry" answer, so only `undefined`
  // means the place has not been looked up yet. Testing for truthiness here
  // would re-query and re-parse the same bad rows on every lookup.
  var missing = placeIds.filter(function(id) {
    return cache[String(id)] === undefined
  })

  if (!missing.length) {
    return Promise.resolve(cache)
  }

  var placeholders = missing.map(function() { return '?' }).join(',')
  var query = `SELECT place_id, encoding, geometry FROM place_geometry WHERE place_id IN (${placeholders})`

  return dbAll(geocoder, query, missing).then(function(rows) {
    rows.forEach(function(row) {
      var key = String(row.place_id)
      if (cache[key] !== undefined) {
        return
      }

      var raw = row.geometry
      if (typeof Buffer !== 'undefined' && Buffer.isBuffer(raw)) {
        raw = raw.toString('utf8')
      }

      if (typeof raw !== 'string') {
        raw = String(raw)
      }

      try {
        cache[key] = geometry.normalizeGeometry(JSON.parse(raw))
      } catch (err) {
        // Unparsable or unsupported geometry: remember that it is unusable.
        cache[key] = null
      }
    })

    // Ids that returned no row are remembered too, so they are not queried
    // again on the next lookup.
    missing.forEach(function(id) {
      var key = String(id)
      if (cache[key] === undefined) {
        cache[key] = null
      }
    })

    return cache
  })
}
priorityB = Number(b.priority_rank) + if (!Number.isFinite(priorityA)) priorityA = Number.MAX_SAFE_INTEGER + if (!Number.isFinite(priorityB)) priorityB = Number.MAX_SAFE_INTEGER + if (priorityA !== priorityB) { + return priorityA - priorityB + } + + var idA = String(a.id) + var idB = String(b.id) + if (idA < idB) return -1 + if (idA > idB) return 1 + return 0 + }) + + return sorted[0] +} + +function fetchNearestBoundaryByRegion(geocoder, latitude, longitude, region) { + var where = [ + 'p.placetype IN (' + SUPPORTED_PLACETYPES.map(function() { return '?' }).join(', ') + ')' + ] + var params = SUPPORTED_PLACETYPES.slice() + + if (region && region.countryId) { + where.push('p.country_id = ?') + params.push(region.countryId) + } + + if (region && region.admin1Id !== undefined && region.admin1Id !== null) { + where.push('p.admin1_id = ?') + params.push(region.admin1Id) + } + + var scale = Math.pow(Math.cos(Number(latitude) * Math.PI / 180), 2) + + var query = ` + SELECT + p.id AS id, + p.name AS name, + p.country_id AS country_id, + COALESCE(c.name, p.country_name, p.country_id, '') AS country_name, + p.admin1_id AS admin1_id, + COALESCE(a.name, p.admin1_name, '') AS admin1_name, + p.centroid_lat AS latitude, + p.centroid_lon AS longitude, + p.placetype AS placetype, + p.priority_rank AS priority_rank, + p.area AS area + FROM places p + LEFT JOIN countries c ON c.id = p.country_id + LEFT JOIN admin1 a ON a.country_id = p.country_id AND a.id = p.admin1_id + WHERE ${where.join(' AND ')} + ORDER BY + ((? - p.centroid_lat) * (? - p.centroid_lat) + + (? - p.centroid_lon) * (? - p.centroid_lon) * ?) 
ASC, + CASE p.placetype WHEN 'locality' THEN 0 WHEN 'localadmin' THEN 1 ELSE 2 END, + p.priority_rank ASC, + p.id ASC + LIMIT 1 + ` + + params.push(latitude, latitude, longitude, longitude, scale) + + return dbAll(geocoder, query, params).then(function(rows) { + return rows[0] + }) +} + +function fetchNearestCompactByRegionV2(geocoder, latitude, longitude, region) { + var where = [ + 'p.placetype_code IN (' + SUPPORTED_PLACETYPE_CODES.map(function() { return '?' }).join(', ') + ')' + ] + var params = SUPPORTED_PLACETYPE_CODES.slice() + + if (region && region.countryId) { + where.push('p.country_id = ?') + params.push(region.countryId) + } + + if (region && region.admin1Id !== undefined && region.admin1Id !== null) { + where.push('p.admin1_id = ?') + params.push(region.admin1Id) + } + + var scale = Math.pow(Math.cos(Number(latitude) * Math.PI / 180), 2) + + var query = ` + SELECT + p.id AS id, + p.name AS name, + p.country_id AS country_id, + p.country_id AS country_name, + p.admin1_id AS admin1_id, + '' AS admin1_name, + p.latitude AS latitude, + p.longitude AS longitude, + CASE p.placetype_code + WHEN 0 THEN 'locality' + WHEN 1 THEN 'localadmin' + WHEN 2 THEN 'region' + ELSE 'region' + END AS placetype, + 0 AS priority_rank, + 0 AS area + FROM compact_places p + WHERE ${where.join(' AND ')} + ORDER BY + ((? - p.latitude) * (? - p.latitude) + + (? - p.longitude) * (? - p.longitude) * ?) 
// Adds a `_debug` entry to a reverse-geocoding result when debugging is on.
//
// geocoder - geocoder instance; only `reverseDebug` is read.
// payload  - formatted result object.
// reason   - short string naming how the result was found.
//
// Returns `payload` itself when debugging is off or the payload is missing or
// has no keys; otherwise a shallow copy carrying
// `_debug: { mode: 'boundary', reason }`. The payload is never modified.
function attachDebug(geocoder, payload, reason) {
  if (!geocoder.reverseDebug) {
    return payload
  }

  if (!payload) {
    return payload
  }

  if (Object.keys(payload).length === 0) {
    return payload
  }

  return Object.assign({}, payload, {
    _debug: {
      mode: 'boundary',
      reason: reason
    }
  })
}
fetchCompactBoundaryMatchV2 : fetchCompactBoundaryMatchLegacy + + return fetchCompact(geocoder, hashes).then(function(row) { + if (row) { + return { + row: row, + reason: 'geohash_lookup' + } + } + return { + row: undefined, + reason: 'no_compact_match' + } + }) +} + +function tryFullBoundaryLookup(geocoder, latitude, longitude, hashes) { + return fetchBoundaryCandidates(geocoder, hashes) + .then(function(candidates) { + var bboxCandidates = candidates.filter(function(candidate) { + return hasPointInPlaceBbox(candidate, latitude, longitude) + }) + + if (!bboxCandidates.length) { + return { + row: pickNearest(candidates, latitude, longitude), + reason: 'boundary_centroid_fallback' } } + + var candidateIds = bboxCandidates.map(function(candidate) { return candidate.id }) + return loadPlaceGeometries(geocoder, candidateIds) + .then(function(geometryById) { + var contained = bboxCandidates.filter(function(candidate) { + var polygon = geometryById[String(candidate.id)] + if (!polygon) return false + return geometry.pointInGeometry(polygon, latitude, longitude) + }) + + if (contained.length) { + var selected = sortContainedPlaces(contained)[0] + return { + row: selected, + reason: 'polygon_contains' + } + } + + var nearestInCandidates = pickNearest(bboxCandidates, latitude, longitude) + if (nearestInCandidates) { + return { + row: nearestInCandidates, + reason: 'bbox_candidate_centroid_fallback' + } + } + + return { + row: undefined, + reason: 'no_boundary_candidate' + } + }) + }) + .then(function(result) { + if (result && result.row) { + return result + } + + return fallbackNearestBoundary(geocoder, latitude, longitude, 'full') }) +} + +function tryBoundaryLookup(geocoder, latitude, longitude) { + return getBoundarySchemaStatus(geocoder).then(function(status) { + if (!status.hasCompactV2 && !status.hasCompactLegacy && !status.hasFull) { + return undefined + } + + var options = geocoder.boundaryOptions || {} + var basePrecision = Number(options.basePrecision || 4) + var 
maxPrecision = Number(options.maxPrecision || 7) + if (basePrecision < 1) basePrecision = 1 + if (maxPrecision < basePrecision) maxPrecision = basePrecision + + var hashes = reverseHashes(latitude, longitude, basePrecision, maxPrecision) + + var compactMode = status.hasCompactV2 ? 'compact_v2' : (status.hasCompactLegacy ? 'compact_legacy' : null) + + if (compactMode) { + return tryCompactBoundaryLookup(geocoder, latitude, longitude, hashes, compactMode) + .then(function(result) { + if (result && result.row) { + return result + } + + if (status.hasFull) { + return tryFullBoundaryLookup(geocoder, latitude, longitude, hashes) + } + + return fallbackNearestBoundary(geocoder, latitude, longitude, compactMode) + }) + } + + return tryFullBoundaryLookup(geocoder, latitude, longitude, hashes) }) } -function formatResult(rows) { - const row = rows[0] +function findFeature(geocoder, latitude, longitude) { + var mode = geocoder.reverseMode || 'centroid' - if (!row || row === undefined) { - return {} - } else { - return formatLocation(row) + if (mode !== 'boundary') { + return findLegacyCentroidRow(geocoder, latitude, longitude).then(function(row) { + return formatRow(row) + }) } + + return tryBoundaryLookup(geocoder, latitude, longitude) + .then(function(boundaryResult) { + if (boundaryResult && boundaryResult.row) { + return attachDebug(geocoder, formatRow(boundaryResult.row), boundaryResult.reason) + } + + return findLegacyCentroidRow(geocoder, latitude, longitude) + .then(function(row) { + return attachDebug(geocoder, formatRow(row), 'legacy_centroid_fallback') + }) + }) } function Reverse(geocoder, latitude, longitude, callback) { - return findFeature(geocoder, latitude, longitude, callback) + return executeWithCallback(findFeature(geocoder, latitude, longitude), callback) } module.exports = Reverse; From 97b347e4929146237fa3363576b3e6613314e45d Mon Sep 17 00:00:00 2001 From: Sebastian Schloesser Date: Wed, 25 Mar 2026 22:56:51 -0400 Subject: [PATCH 03/10] Add validation 
script based on LocationIQ geocoder service as source of truth --- README.md | 20 + package.json | 3 +- scripts/validate_with_locationiq.js | 657 ++++++++++++++++++++++++++++ 3 files changed, 679 insertions(+), 1 deletion(-) create mode 100644 scripts/validate_with_locationiq.js diff --git a/README.md b/README.md index 8dae03d..5cf7e1d 100644 --- a/README.md +++ b/README.md @@ -272,6 +272,26 @@ Boundary runtime modes: - Uses compact `compact_geohash_lookup` when present (fast geohash-to-place). - Falls back to full polygon-aware tables when compact rows are absent. +### External Reverse Validation (LocationIQ) + +Use this script to compare local reverse results against LocationIQ at sampled +coordinates, with persistent SQLite caching so requests are not repeated: + +```bash +LOCATIONIQ_API_KEY=... node scripts/validate_with_locationiq.js \ + --database tmp/wof-fr-it-compact-p5-d3-pop10k-region.sqlite \ + --samples 300 \ + --export-csv tmp/locationiq-validation-fr-it.csv +``` + +It creates/updates: + +- `sample_points` (coordinates sampled from your geohash table) +- `locationiq_cache` (raw LocationIQ responses keyed by coordinate) +- `validation_results` (local vs LocationIQ comparison verdicts) + +Cache DB path is automatic (default behavior): `tmp/locationiq-validation-<database-basename>.sqlite`. + ## License This library is licensed under [the MIT license](https://github.com/lucaspiller/offline-geocoder/blob/master/LICENSE). 
diff --git a/package.json b/package.json index 5cc8b31..26910f0 100644 --- a/package.json +++ b/package.json @@ -35,7 +35,8 @@ "scripts": { "test": "jasmine", "build:boundary": "node scripts/generate_boundary_index.js", - "build:wof": "bash scripts/generate_wof_boundary.sh" + "build:wof": "bash scripts/generate_wof_boundary.sh", + "validate:locationiq": "node scripts/validate_with_locationiq.js" }, "author": "Luca Spiller", "license": "MIT" diff --git a/scripts/validate_with_locationiq.js b/scripts/validate_with_locationiq.js new file mode 100644 index 0000000..0f8d541 --- /dev/null +++ b/scripts/validate_with_locationiq.js @@ -0,0 +1,657 @@ +#!/usr/bin/env node +"use strict"; + +const fs = require('fs') +const path = require('path') +const https = require('https') +const sqlite3 = require('sqlite3') + +const createGeocoder = require('../src/index') +const geohash = require('../src/geohash') + +function usage() { + return [ + 'Usage: node scripts/validate_with_locationiq.js --database [options]', + '', + 'Options:', + ' --database Geocoder SQLite database to validate (required)', + ' --api-key LocationIQ API key (or env LOCATIONIQ_API_KEY)', + ' --samples Number of sample points to evaluate (default: 200)', + ' --seed RNG seed for repeatable sample generation (default: 1337)', + ' --rps Max LocationIQ requests per second when uncached (default: 1)', + ' --force-refresh Ignore cached LocationIQ responses (default: false)', + ' --reverse-mode centroid|boundary (default: boundary)', + ' --base-precision Boundary lookup base precision (default: 4)', + ' --max-precision Boundary lookup max precision (default: 7)', + ' --endpoint LocationIQ reverse endpoint (default: https://us1.locationiq.com/v1/reverse)', + ' --export-csv Optional CSV export of the evaluated sample rows', + ' --help, -h Show this help message', + '', + 'Example:', + ' LOCATIONIQ_API_KEY=... 
node scripts/validate_with_locationiq.js \\', + ' --database tmp/wof-fr-it-compact-p5-d3-pop10k-region.sqlite \\', + ' --samples 300 \\', + ' --export-csv tmp/locationiq-validation-fr-it.csv' + ].join('\n') +} + +function parseBool(value, defaultValue) { + if (value === undefined || value === null || value === '') { + return defaultValue + } + + var normalized = String(value).toLowerCase().trim() + if (normalized === '1' || normalized === 'true' || normalized === 'yes' || normalized === 'y') { + return true + } + if (normalized === '0' || normalized === 'false' || normalized === 'no' || normalized === 'n') { + return false + } + return defaultValue +} + +function parseArgs(argv) { + var opts = { + database: null, + apiKey: process.env.LOCATIONIQ_API_KEY || '', + cacheDb: null, + samples: 200, + seed: 1337, + rps: 1, + forceRefresh: false, + reverseMode: 'boundary', + basePrecision: 4, + maxPrecision: 7, + endpoint: 'https://us1.locationiq.com/v1/reverse', + exportCsv: null + } + + for (var i = 0; i < argv.length; i++) { + var arg = argv[i] + + if (arg === '--database' || arg === '-d') { + opts.database = path.resolve(argv[++i]) + } else if (arg === '--api-key') { + opts.apiKey = String(argv[++i] || '') + } else if (arg === '--samples') { + opts.samples = Math.max(1, Math.trunc(Number(argv[++i]))) + } else if (arg === '--seed') { + opts.seed = Math.trunc(Number(argv[++i])) + } else if (arg === '--rps') { + opts.rps = Math.max(0.2, Number(argv[++i])) + } else if (arg === '--force-refresh') { + opts.forceRefresh = parseBool(argv[++i], false) + } else if (arg === '--reverse-mode') { + opts.reverseMode = String(argv[++i] || 'boundary').toLowerCase() + } else if (arg === '--base-precision') { + opts.basePrecision = Math.max(1, Math.trunc(Number(argv[++i]))) + } else if (arg === '--max-precision') { + opts.maxPrecision = Math.max(opts.basePrecision, Math.trunc(Number(argv[++i]))) + } else if (arg === '--endpoint') { + opts.endpoint = String(argv[++i] || opts.endpoint) + } 
else if (arg === '--export-csv') { + opts.exportCsv = path.resolve(argv[++i]) + } else if (arg === '--help' || arg === '-h') { + opts.help = true + } else { + throw new Error('Unknown argument: ' + arg) + } + } + + return opts +} + +function defaultCachePath(databasePath) { + var base = path.basename(databasePath) + if (base.toLowerCase().endsWith('.sqlite')) { + base = base.slice(0, -7) + } + base = base.replace(/[^a-z0-9._-]+/ig, '-').replace(/-+/g, '-').replace(/^-|-$/g, '') + if (!base) base = 'geocoder' + return path.resolve('tmp/locationiq-validation-' + base + '.sqlite') +} + +function sleep(ms) { + return new Promise(function(resolve) { + setTimeout(resolve, ms) + }) +} + +function mulberry32(seed) { + var state = seed >>> 0 + return function() { + state |= 0 + state = (state + 0x6D2B79F5) | 0 + var t = Math.imul(state ^ (state >>> 15), 1 | state) + t = (t + Math.imul(t ^ (t >>> 7), 61 | t)) ^ t + return ((t ^ (t >>> 14)) >>> 0) / 4294967296 + } +} + +function dbOpen(dbPath) { + return new sqlite3.Database(dbPath) +} + +function dbExec(db, sql) { + return new Promise(function(resolve, reject) { + db.exec(sql, function(err) { + if (err) reject(err) + else resolve() + }) + }) +} + +function dbRun(db, sql, params) { + return new Promise(function(resolve, reject) { + db.run(sql, params || [], function(err) { + if (err) reject(err) + else resolve(this) + }) + }) +} + +function dbGet(db, sql, params) { + return new Promise(function(resolve, reject) { + db.get(sql, params || [], function(err, row) { + if (err) reject(err) + else resolve(row) + }) + }) +} + +function dbAll(db, sql, params) { + return new Promise(function(resolve, reject) { + db.all(sql, params || [], function(err, rows) { + if (err) reject(err) + else resolve(rows || []) + }) + }) +} + +function dbClose(db) { + return new Promise(function(resolve, reject) { + db.close(function(err) { + if (err) reject(err) + else resolve() + }) + }) +} + +async function ensureCacheSchema(cacheDb) { + await 
dbExec(cacheDb, ` + CREATE TABLE IF NOT EXISTS sample_points( + coord_key TEXT PRIMARY KEY, + latitude REAL NOT NULL, + longitude REAL NOT NULL, + source_geohash TEXT NOT NULL, + created_at TEXT NOT NULL DEFAULT (datetime('now')) + ); + + CREATE TABLE IF NOT EXISTS locationiq_cache( + coord_key TEXT PRIMARY KEY, + latitude REAL NOT NULL, + longitude REAL NOT NULL, + http_status INTEGER, + response_json TEXT, + error_text TEXT, + fetched_at TEXT NOT NULL DEFAULT (datetime('now')) + ); + + CREATE TABLE IF NOT EXISTS validation_results( + coord_key TEXT PRIMARY KEY, + latitude REAL NOT NULL, + longitude REAL NOT NULL, + source_geohash TEXT NOT NULL, + local_name TEXT, + local_country_id TEXT, + local_admin1_id TEXT, + local_json TEXT, + liq_locality TEXT, + liq_country_code TEXT, + liq_display_name TEXT, + liq_json TEXT, + locality_match INTEGER NOT NULL DEFAULT 0, + country_match INTEGER NOT NULL DEFAULT 0, + verdict TEXT NOT NULL, + updated_at TEXT NOT NULL DEFAULT (datetime('now')) + ); + `) +} + +function hashString32(value) { + var hash = 2166136261 + var text = String(value || '') + for (var i = 0; i < text.length; i++) { + hash ^= text.charCodeAt(i) + hash = Math.imul(hash, 16777619) + } + return hash >>> 0 +} + +function deterministicPointInHash(hash, seed, index) { + var bbox = geohash.decodeBbox(hash) + var localSeed = (hashString32(hash) ^ hashString32(seed) ^ (index >>> 0)) >>> 0 + var rng = mulberry32(localSeed) + return { + latitude: bbox.minLat + (bbox.maxLat - bbox.minLat) * rng(), + longitude: bbox.minLon + (bbox.maxLon - bbox.minLon) * rng() + } +} + +function deterministicShuffle(items, seed) { + var rng = mulberry32(seed) + var list = items.slice() + for (var i = list.length - 1; i > 0; i--) { + var j = Math.floor(rng() * (i + 1)) + var tmp = list[i] + list[i] = list[j] + list[j] = tmp + } + return list +} + +async function detectLookupTable(sourceDb) { + var rows = await dbAll( + sourceDb, + "SELECT name FROM sqlite_master WHERE type='table' AND 
name IN ('compact_geohash_lookup','place_geohash_lookup')" + ) + var names = Object.create(null) + for (var i = 0; i < rows.length; i++) { + names[rows[i].name] = true + } + if (names.compact_geohash_lookup) return 'compact_geohash_lookup' + if (names.place_geohash_lookup) return 'place_geohash_lookup' + throw new Error('No geohash lookup table found (expected compact_geohash_lookup or place_geohash_lookup)') +} + +function coordKey(latitude, longitude) { + return Number(latitude).toFixed(6) + ',' + Number(longitude).toFixed(6) +} + +async function ensureSamplePoints(sourceDb, cacheDb, lookupTable, targetCount, seed) { + var countRow = await dbGet(cacheDb, 'SELECT COUNT(*) AS count FROM sample_points') + var current = countRow ? Number(countRow.count || 0) : 0 + if (current >= targetCount) { + return + } + + var geohashRows = await dbAll( + sourceDb, + 'SELECT geohash FROM ' + lookupTable + ' WHERE geohash IS NOT NULL ORDER BY geohash ASC' + ) + if (!geohashRows.length) { + throw new Error('Unable to sample geohashes from ' + lookupTable) + } + + var geohashes = [] + for (var idx = 0; idx < geohashRows.length; idx++) { + if (geohashRows[idx].geohash) { + geohashes.push(geohashRows[idx].geohash) + } + } + geohashes = deterministicShuffle(geohashes, seed) + + var needed = targetCount - current + var insertedTotal = 0 + for (var i = 0; i < geohashes.length && insertedTotal < needed; i++) { + var hash = geohashes[i] + var point = deterministicPointInHash(hash, seed, i) + var key = coordKey(point.latitude, point.longitude) + var result = await dbRun( + cacheDb, + 'INSERT OR IGNORE INTO sample_points(coord_key, latitude, longitude, source_geohash) VALUES (?, ?, ?, ?)', + [key, point.latitude, point.longitude, hash] + ) + if (result && result.changes > 0) { + insertedTotal += 1 + } + } + + if (current + insertedTotal < targetCount) { + throw new Error( + 'Could not create enough unique sample points for requested --samples=' + targetCount + + ' (available=' + (current + 
insertedTotal) + ')' + ) + } +} + +function normalizeName(value) { + if (!value) return '' + return String(value) + .normalize('NFKD') + .replace(/[\u0300-\u036f]/g, '') + .toLowerCase() + .replace(/[^a-z0-9]+/g, ' ') + .trim() + .replace(/\s+/g, ' ') +} + +function namesMatch(left, right) { + if (!left || !right) return false + if (left === right) return true + if (left.indexOf(right) !== -1 || right.indexOf(left) !== -1) return true + return false +} + +function extractLocationIqLocality(address) { + if (!address || typeof address !== 'object') return '' + var keys = [ + 'city', + 'town', + 'village', + 'municipality', + 'borough', + 'suburb', + 'county', + 'state_district', + 'state' + ] + for (var i = 0; i < keys.length; i++) { + var value = address[keys[i]] + if (value) return String(value) + } + return '' +} + +function buildVerdict(localityMatch, countryMatch, localName, liqLocality) { + if (localityMatch && countryMatch) return 'match_city_country' + if (localityMatch) return 'match_city_only' + if (countryMatch) return 'match_country_only' + if (!localName && !liqLocality) return 'missing_both_locality' + if (!localName) return 'missing_local_locality' + if (!liqLocality) return 'missing_locationiq_locality' + return 'mismatch' +} + +function fetchJson(endpointUrl, timeoutMs) { + return new Promise(function(resolve, reject) { + var req = https.get(endpointUrl, function(response) { + var chunks = [] + response.on('data', function(chunk) { chunks.push(chunk) }) + response.on('end', function() { + var body = Buffer.concat(chunks).toString('utf8') + try { + var parsed = JSON.parse(body) + resolve({ status: response.statusCode || 0, json: parsed, raw: body }) + } catch (err) { + reject(new Error('Invalid JSON response (' + (response.statusCode || 0) + '): ' + body.slice(0, 200))) + } + }) + }) + + req.on('error', reject) + req.setTimeout(timeoutMs, function() { + req.destroy(new Error('Request timed out after ' + timeoutMs + 'ms')) + }) + }) +} + +function 
buildLocationIqUrl(endpoint, apiKey, latitude, longitude) { + var url = new URL(endpoint) + url.searchParams.set('key', apiKey) + url.searchParams.set('lat', String(latitude)) + url.searchParams.set('lon', String(longitude)) + url.searchParams.set('format', 'json') + url.searchParams.set('normalizecity', '1') + url.searchParams.set('addressdetails', '1') + return url.toString() +} + +async function getLocationIqResponse(cacheDb, opts, latitude, longitude) { + var key = coordKey(latitude, longitude) + if (!opts.forceRefresh) { + var cached = await dbGet(cacheDb, 'SELECT * FROM locationiq_cache WHERE coord_key = ?', [key]) + if (cached && cached.response_json) { + return { + status: Number(cached.http_status || 0), + json: JSON.parse(cached.response_json), + fromCache: true + } + } + } + + var url = buildLocationIqUrl(opts.endpoint, opts.apiKey, latitude, longitude) + var fetchedAt = new Date().toISOString() + try { + var response = await fetchJson(url, 20000) + await dbRun( + cacheDb, + `INSERT INTO locationiq_cache(coord_key, latitude, longitude, http_status, response_json, error_text, fetched_at) + VALUES (?, ?, ?, ?, ?, NULL, ?) + ON CONFLICT(coord_key) DO UPDATE SET + latitude=excluded.latitude, + longitude=excluded.longitude, + http_status=excluded.http_status, + response_json=excluded.response_json, + error_text=NULL, + fetched_at=excluded.fetched_at`, + [key, latitude, longitude, response.status, JSON.stringify(response.json), fetchedAt] + ) + return { + status: response.status, + json: response.json, + fromCache: false + } + } catch (err) { + await dbRun( + cacheDb, + `INSERT INTO locationiq_cache(coord_key, latitude, longitude, http_status, response_json, error_text, fetched_at) + VALUES (?, ?, ?, NULL, NULL, ?, ?) 
+ ON CONFLICT(coord_key) DO UPDATE SET + latitude=excluded.latitude, + longitude=excluded.longitude, + http_status=NULL, + response_json=NULL, + error_text=excluded.error_text, + fetched_at=excluded.fetched_at`, + [key, latitude, longitude, String(err && err.message ? err.message : err), fetchedAt] + ) + throw err + } +} + +function csvEscape(value) { + if (value === null || value === undefined) return '' + var text = String(value) + if (text.indexOf('"') !== -1 || text.indexOf(',') !== -1 || text.indexOf('\n') !== -1) { + return '"' + text.replace(/"/g, '""') + '"' + } + return text +} + +async function writeCsv(cacheDb, csvPath, limit) { + var rows = await dbAll( + cacheDb, + `SELECT coord_key, latitude, longitude, source_geohash, local_name, local_country_id, liq_locality, liq_country_code, verdict + FROM validation_results + ORDER BY updated_at DESC + LIMIT ?`, + [limit] + ) + + var headers = [ + 'coord_key', + 'latitude', + 'longitude', + 'source_geohash', + 'local_name', + 'local_country_id', + 'liq_locality', + 'liq_country_code', + 'verdict' + ] + + var lines = [headers.join(',')] + for (var i = 0; i < rows.length; i++) { + var row = rows[i] + lines.push(headers.map(function(key) { return csvEscape(row[key]) }).join(',')) + } + + fs.mkdirSync(path.dirname(csvPath), { recursive: true }) + fs.writeFileSync(csvPath, lines.join('\n') + '\n', 'utf8') +} + +async function main() { + var opts = parseArgs(process.argv.slice(2)) + if (opts.help) { + console.log(usage()) + process.exit(0) + } + + if (!opts.database) { + throw new Error('Missing required --database') + } + if (!fs.existsSync(opts.database)) { + throw new Error('Database not found: ' + opts.database) + } + if (!opts.apiKey) { + throw new Error('Missing LocationIQ API key (--api-key or LOCATIONIQ_API_KEY)') + } + if (!Number.isFinite(opts.samples) || opts.samples <= 0) { + throw new Error('--samples must be > 0') + } + opts.cacheDb = defaultCachePath(opts.database) + + 
fs.mkdirSync(path.dirname(opts.cacheDb), { recursive: true }) + + var sourceDb = dbOpen(opts.database) + var cacheDb = dbOpen(opts.cacheDb) + var geocoder = createGeocoder({ + database: opts.database, + reverseMode: opts.reverseMode === 'centroid' ? 'centroid' : 'boundary', + boundary: { + basePrecision: opts.basePrecision, + maxPrecision: opts.maxPrecision + } + }) + + try { + await ensureCacheSchema(cacheDb) + var lookupTable = await detectLookupTable(sourceDb) + await ensureSamplePoints(sourceDb, cacheDb, lookupTable, opts.samples, Number.isFinite(opts.seed) ? opts.seed : 1337) + + var points = await dbAll( + cacheDb, + 'SELECT coord_key, latitude, longitude, source_geohash FROM sample_points ORDER BY created_at ASC, coord_key ASC LIMIT ?', + [opts.samples] + ) + + var uncachedCalls = 0 + var delayMs = Math.ceil(1000 / opts.rps) + + for (var i = 0; i < points.length; i++) { + var point = points[i] + + var localResult = await geocoder.reverse(point.latitude, point.longitude) + if (!localResult) localResult = {} + + var liqResult = await getLocationIqResponse(cacheDb, opts, point.latitude, point.longitude) + if (!liqResult.fromCache) { + uncachedCalls += 1 + } + + var liqAddress = (liqResult.json && liqResult.json.address) || {} + var liqLocality = extractLocationIqLocality(liqAddress) + var liqCountryCode = liqAddress.country_code ? String(liqAddress.country_code).toUpperCase() : '' + + var localName = localResult.name || '' + var localCountryId = (localResult.country && localResult.country.id) ? 
String(localResult.country.id).toUpperCase() : '' + var localityMatch = namesMatch(normalizeName(localName), normalizeName(liqLocality)) + var countryMatch = localCountryId && liqCountryCode && localCountryId === liqCountryCode + var verdict = buildVerdict(localityMatch, countryMatch, localName, liqLocality) + + await dbRun( + cacheDb, + `INSERT INTO validation_results( + coord_key, latitude, longitude, source_geohash, + local_name, local_country_id, local_admin1_id, local_json, + liq_locality, liq_country_code, liq_display_name, liq_json, + locality_match, country_match, verdict, updated_at + ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, datetime('now')) + ON CONFLICT(coord_key) DO UPDATE SET + latitude=excluded.latitude, + longitude=excluded.longitude, + source_geohash=excluded.source_geohash, + local_name=excluded.local_name, + local_country_id=excluded.local_country_id, + local_admin1_id=excluded.local_admin1_id, + local_json=excluded.local_json, + liq_locality=excluded.liq_locality, + liq_country_code=excluded.liq_country_code, + liq_display_name=excluded.liq_display_name, + liq_json=excluded.liq_json, + locality_match=excluded.locality_match, + country_match=excluded.country_match, + verdict=excluded.verdict, + updated_at=excluded.updated_at`, + [ + point.coord_key, + point.latitude, + point.longitude, + point.source_geohash, + localName || null, + localCountryId || null, + (localResult.admin1 && localResult.admin1.id) ? String(localResult.admin1.id) : null, + JSON.stringify(localResult), + liqLocality || null, + liqCountryCode || null, + liqResult.json && liqResult.json.display_name ? String(liqResult.json.display_name) : null, + JSON.stringify(liqResult.json), + localityMatch ? 1 : 0, + countryMatch ? 
1 : 0, + verdict + ] + ) + + if (!liqResult.fromCache && i < points.length - 1 && delayMs > 0) { + await sleep(delayMs) + } + } + + var verdictRows = await dbAll( + cacheDb, + `SELECT verdict, COUNT(*) AS count + FROM validation_results + WHERE coord_key IN (SELECT coord_key FROM sample_points ORDER BY created_at ASC, coord_key ASC LIMIT ?) + GROUP BY verdict + ORDER BY count DESC, verdict ASC`, + [opts.samples] + ) + + var totalRow = await dbGet( + cacheDb, + `SELECT COUNT(*) AS count + FROM validation_results + WHERE coord_key IN (SELECT coord_key FROM sample_points ORDER BY created_at ASC, coord_key ASC LIMIT ?)`, + [opts.samples] + ) + + console.log('Validation complete') + console.log('Geocoder DB: ' + opts.database) + console.log('Cache DB: ' + opts.cacheDb) + console.log('Samples evaluated: ' + Number(totalRow && totalRow.count ? totalRow.count : 0)) + console.log('LocationIQ uncached calls this run: ' + uncachedCalls) + console.log('Verdict distribution:') + for (var j = 0; j < verdictRows.length; j++) { + console.log(' ' + verdictRows[j].verdict + ': ' + verdictRows[j].count) + } + + if (opts.exportCsv) { + await writeCsv(cacheDb, opts.exportCsv, opts.samples) + console.log('CSV export: ' + opts.exportCsv) + } + } finally { + if (geocoder && geocoder.db && typeof geocoder.db.close === 'function') { + await new Promise(function(resolve) { + geocoder.db.close(function() { resolve() }) + }) + } + await dbClose(sourceDb) + await dbClose(cacheDb) + } +} + +main().catch(function(err) { + console.error(err.message || err) + process.exit(1) +}) From ad20bfb45d65876e94b299f905c8fbe5cb2566b7 Mon Sep 17 00:00:00 2001 From: Sebastian Schloesser Date: Thu, 26 Mar 2026 02:58:44 -0400 Subject: [PATCH 04/10] Add dominant-city rollup controls and validation metadata --- README.md | 5 + scripts/generate_boundary_index.js | 171 +++++++++++++++++++-- scripts/generate_wof_boundary.sh | 6 + scripts/validate_with_locationiq.js | 197 +++++++++++++++++++++++- 
spec/boundary_builder_spec.js | 230 ++++++++++++++++++++++++++++ 5 files changed, 592 insertions(+), 17 deletions(-) diff --git a/README.md b/README.md index 5cf7e1d..466d46b 100644 --- a/README.md +++ b/README.md @@ -224,6 +224,9 @@ Builder notes: - `--region-max-precision` - `--region-sparse-max-precision` + `--region-sparse-min-area-km2` for very large sparse regions (for example geohash-3 in Amazon-like interiors) - `--promote-locality-over-region` (default `true`) prefers locality labels in shared parent cells when there is no competing locality (keeps city labels sticky against region-only outskirts) +- Dominant-city rollup keeps broad city labels sticky in mixed city/suburb cells unless there is competing major-city pressure: + - `--dominant-locality-population` (default `100000`) + - `--dominant-locality-ratio` (default `3`) - Excludes neighbourhood-like placetypes from default reverse output - `--index-mode compact` (default) stores only geohash-to-place mappings (`compact_geohash_lookup`) and no runtime geometry payloads. Compact schema uses `compact_places(id,name,country_id,admin1_id,placetype_code,latitude,longitude)`. 
@@ -259,6 +262,8 @@ Useful WOF build env vars: - `WOF_REGION_SPARSE_MAX_PRECISION` sparse very-large-region precision (default `3`) - `WOF_REGION_SPARSE_MIN_AREA_KM2` area threshold for sparse region precision (default `80000`) - `WOF_PROMOTE_LOCALITY_OVER_REGION=1|0` prefer locality labels over region in shared parent cells (default `1`) +- `WOF_DOMINANT_LOCALITY_POPULATION` major-locality threshold for dominant-city rollup (default `100000`) +- `WOF_DOMINANT_LOCALITY_RATIO` dominant-vs-next locality population ratio (default `3`) - `WOF_GEOMETRY_DECIMALS` round coordinates before storage/indexing (for example `4`) - `WOF_MIN_POPULATION` filter out places below threshold (for example `10000`) - `WOF_INCLUDE_REGION=1|0` include/exclude region fallback boundaries diff --git a/scripts/generate_boundary_index.js b/scripts/generate_boundary_index.js index ea9d448..19ef38c 100755 --- a/scripts/generate_boundary_index.js +++ b/scripts/generate_boundary_index.js @@ -51,7 +51,9 @@ function parseArgs(argv) { regionMaxPrecision: null, regionSparseMaxPrecision: null, regionSparseMinAreaKm2: null, - promoteLocalityOverRegion: true + promoteLocalityOverRegion: true, + dominantLocalityPopulation: 100000, + dominantLocalityRatio: 3 } for (var i = 0; i < argv.length; i++) { @@ -103,6 +105,12 @@ function parseArgs(argv) { opts.regionSparseMinAreaKm2 = Number.isFinite(sparseAreaKm2) && sparseAreaKm2 > 0 ? sparseAreaKm2 : null } else if (arg === '--promote-locality-over-region') { opts.promoteLocalityOverRegion = parseBool(argv[++i], true) + } else if (arg === '--dominant-locality-population') { + var dominantPopulation = Number(argv[++i]) + opts.dominantLocalityPopulation = Number.isFinite(dominantPopulation) ? dominantPopulation : opts.dominantLocalityPopulation + } else if (arg === '--dominant-locality-ratio') { + var dominantRatio = Number(argv[++i]) + opts.dominantLocalityRatio = Number.isFinite(dominantRatio) ? 
dominantRatio : opts.dominantLocalityRatio } else if (arg === '--append') { opts.replace = false } else if (arg === '--replace') { @@ -141,6 +149,8 @@ function usage() { ' --region-sparse-max-precision Optional precision for very large region polygons (for example 3)', ' --region-sparse-min-area-km2 Area threshold to apply sparse region precision', ' --promote-locality-over-region Prefer locality over region in shared parent cells when no competing locality exists (default: true)', + ' --dominant-locality-population Population threshold that marks locality as major for dominant-city rollups (default: 100000)', + ' --dominant-locality-ratio Required dominant-vs-next population ratio for locality rollups (default: 3)', ' --append Keep existing boundary rows and append/replace by place id', ' --replace Clear boundary rows first (default)', ' --help, -h Show this help message' @@ -792,6 +802,83 @@ function isCityPlacetypeCode(code) { return code === PLACETYPE_CODES.locality || code === PLACETYPE_CODES.localadmin } +function placePopulation(place) { + if (!place) return 0 + + var pop = Number(place.population) + if (!Number.isFinite(pop) || pop < 0) { + return 0 + } + + return pop +} + +function isMajorLocality(place, opts) { + if (!place || !isCityPlacetypeCode(place.placetypeCode)) { + return false + } + + var threshold = Number(opts.dominantLocalityPopulation) + if (!Number.isFinite(threshold) || threshold <= 0) { + return false + } + + return placePopulation(place) >= threshold +} + +function selectDominantLocalityId(localityIds, placeById, opts) { + if (!Array.isArray(localityIds) || localityIds.length < 2) { + return null + } + + var threshold = Number(opts.dominantLocalityPopulation) + if (!Number.isFinite(threshold) || threshold <= 0) { + return null + } + + var ratio = Number(opts.dominantLocalityRatio) + if (!Number.isFinite(ratio) || ratio < 1) { + ratio = 1 + } + + var ranked = localityIds + .map(function(id) { + var place = placeById[String(id)] + return { 
+ id: Number(id), + population: placePopulation(place) + } + }) + .sort(function(a, b) { + if (a.population !== b.population) { + return b.population - a.population + } + return a.id - b.id + }) + + if (!ranked.length) { + return null + } + + var top = ranked[0] + if (top.population < threshold) { + return null + } + + for (var i = 1; i < ranked.length; i++) { + if (ranked[i].population >= threshold) { + return null + } + } + + var secondPopulation = ranked.length > 1 ? ranked[1].population : 0 + if (secondPopulation > 0 && top.population < secondPopulation * ratio) { + return null + } + + return top.id +} + function promoteLocalityParentsByRegionCompetition(bestByHash, placeById, opts) { if (!opts.promoteLocalityOverRegion) { return @@ -838,29 +925,53 @@ function promoteLocalityParentsByRegionCompetition(bestByHash, placeById, opts) var parentHash = parentHashes[parentIndex] var group = groupByParent[parentHash] var localityIds = Object.keys(group.localityById) - if (localityIds.length !== 1) { + if (!localityIds.length) { continue } - var localityId = localityIds[0] + var promotion = null var existingId = bestByHash[parentHash] - var hasRegionCompetition = group.hasRegion - if (existingId !== undefined) { - var existingPlace = placeById[String(existingId)] - if (existingPlace && isCityPlacetypeCode(existingPlace.placetypeCode) && String(existingId) !== localityId) { + var existingPlace = existingId !== undefined ? 
placeById[String(existingId)] : null + + if (localityIds.length === 1) { + var localityId = Number(localityIds[0]) + var hasRegionCompetition = group.hasRegion + if (existingPlace && isCityPlacetypeCode(existingPlace.placetypeCode) && Number(existingId) !== localityId) { continue } if (existingPlace && existingPlace.placetypeCode === PLACETYPE_CODES.region) { hasRegionCompetition = true } - } - if (!hasRegionCompetition) { - continue + if (!hasRegionCompetition) { + continue + } + + promotion = { + localityId: localityId, + suppressMinorLocalities: false + } + } else { + var dominantLocalityId = selectDominantLocalityId(localityIds, placeById, opts) + if (dominantLocalityId === null) { + continue + } + + if (existingPlace && + isCityPlacetypeCode(existingPlace.placetypeCode) && + Number(existingId) !== dominantLocalityId && + isMajorLocality(existingPlace, opts)) { + continue + } + + promotion = { + localityId: dominantLocalityId, + suppressMinorLocalities: true + } } - bestByHash[parentHash] = Number(localityId) - promotedParents[parentHash] = true + bestByHash[parentHash] = promotion.localityId + promotedParents[parentHash] = promotion } if (!Object.keys(promotedParents).length) { @@ -873,12 +984,29 @@ function promoteLocalityParentsByRegionCompetition(bestByHash, placeById, opts) if (descendantHash.length <= precision) continue var ancestor = descendantHash.slice(0, precision) - if (!promotedParents[ancestor] || descendantHash === ancestor) { + var promoted = promotedParents[ancestor] + if (!promoted || descendantHash === ancestor) { + continue + } + + var descendantPlaceId = Number(bestByHash[descendantHash]) + if (descendantPlaceId === promoted.localityId) { + continue + } + + var descendantPlace = placeById[String(descendantPlaceId)] + if (!descendantPlace) { continue } - var descendantPlace = placeById[String(bestByHash[descendantHash])] - if (descendantPlace && descendantPlace.placetypeCode === PLACETYPE_CODES.region) { + if (descendantPlace.placetypeCode 
=== PLACETYPE_CODES.region) { + delete bestByHash[descendantHash] + continue + } + + if (promoted.suppressMinorLocalities && + isCityPlacetypeCode(descendantPlace.placetypeCode) && + !isMajorLocality(descendantPlace, opts)) { delete bestByHash[descendantHash] } } @@ -1300,6 +1428,14 @@ async function main() { options.localityMaxPrecision = clampPrecision(options.localityMaxPrecision, options.basePrecision, options.maxPrecision) options.localadminMaxPrecision = clampPrecision(options.localadminMaxPrecision, options.basePrecision, options.maxPrecision) options.regionMaxPrecision = clampPrecision(options.regionMaxPrecision, options.basePrecision, options.maxPrecision) + if (!Number.isFinite(options.dominantLocalityPopulation) || options.dominantLocalityPopulation <= 0) { + options.dominantLocalityPopulation = 0 + } else { + options.dominantLocalityPopulation = Math.trunc(options.dominantLocalityPopulation) + } + if (!Number.isFinite(options.dominantLocalityRatio) || options.dominantLocalityRatio < 1) { + options.dominantLocalityRatio = 1 + } if (options.regionSparseMaxPrecision !== null) { if (!Number.isFinite(options.regionSparseMaxPrecision) || options.regionSparseMaxPrecision < 1) { options.regionSparseMaxPrecision = null @@ -1359,6 +1495,11 @@ async function main() { if (Number.isFinite(options.regionSparseMaxPrecision) && Number.isFinite(options.regionSparseMinAreaKm2)) { console.log('Sparse region rule: area_km2>=' + options.regionSparseMinAreaKm2 + ' => max_precision=' + options.regionSparseMaxPrecision) } + if (options.dominantLocalityPopulation > 0) { + console.log('Dominant locality rollup: major_population>=' + options.dominantLocalityPopulation + ', ratio>=' + options.dominantLocalityRatio) + } else { + console.log('Dominant locality rollup: disabled') + } console.log('Index mode: ' + options.indexMode) console.log('Promote locality over region: ' + (options.promoteLocalityOverRegion ? 
'true' : 'false')) console.log('Min population: ' + options.minPopulation) diff --git a/scripts/generate_wof_boundary.sh b/scripts/generate_wof_boundary.sh index eecb668..c7b03b9 100755 --- a/scripts/generate_wof_boundary.sh +++ b/scripts/generate_wof_boundary.sh @@ -19,6 +19,8 @@ set -euo pipefail # WOF_REGION_SPARSE_MAX_PRECISION Sparse large-region precision (default: 3) # WOF_REGION_SPARSE_MIN_AREA_KM2 Area threshold for sparse region precision (default: 80000) # WOF_PROMOTE_LOCALITY_OVER_REGION Prefer locality labels over region in shared parent cells (default: 1) +# WOF_DOMINANT_LOCALITY_POPULATION Major-locality threshold for dominant-city rollup (default: 100000) +# WOF_DOMINANT_LOCALITY_RATIO Dominant-vs-next locality population ratio (default: 3) # WOF_INCLUDE_LOCALADMIN Include localadmin placetypes (default: 0) # WOF_INCLUDE_REGION Include region placetypes (default: 1) # WOF_DROP_CONTAINED_LOCALITIES Drop localities contained in larger localities (default: 1) @@ -42,6 +44,8 @@ WOF_REGION_MAX_PRECISION="${WOF_REGION_MAX_PRECISION:-4}" WOF_REGION_SPARSE_MAX_PRECISION="${WOF_REGION_SPARSE_MAX_PRECISION:-3}" WOF_REGION_SPARSE_MIN_AREA_KM2="${WOF_REGION_SPARSE_MIN_AREA_KM2:-80000}" WOF_PROMOTE_LOCALITY_OVER_REGION="${WOF_PROMOTE_LOCALITY_OVER_REGION:-1}" +WOF_DOMINANT_LOCALITY_POPULATION="${WOF_DOMINANT_LOCALITY_POPULATION:-100000}" +WOF_DOMINANT_LOCALITY_RATIO="${WOF_DOMINANT_LOCALITY_RATIO:-3}" WOF_INCLUDE_LOCALADMIN="${WOF_INCLUDE_LOCALADMIN:-0}" WOF_INCLUDE_REGION="${WOF_INCLUDE_REGION:-1}" WOF_DROP_CONTAINED_LOCALITIES="${WOF_DROP_CONTAINED_LOCALITIES:-1}" @@ -123,6 +127,8 @@ CMD=( --region-sparse-max-precision "${WOF_REGION_SPARSE_MAX_PRECISION}" --region-sparse-min-area-km2 "${WOF_REGION_SPARSE_MIN_AREA_KM2}" --promote-locality-over-region "${WOF_PROMOTE_LOCALITY_OVER_REGION}" + --dominant-locality-population "${WOF_DOMINANT_LOCALITY_POPULATION}" + --dominant-locality-ratio "${WOF_DOMINANT_LOCALITY_RATIO}" --include-localadmin 
"${WOF_INCLUDE_LOCALADMIN}" --include-region "${WOF_INCLUDE_REGION}" --drop-contained-localities "${WOF_DROP_CONTAINED_LOCALITIES}" diff --git a/scripts/validate_with_locationiq.js b/scripts/validate_with_locationiq.js index 0f8d541..6509d95 100644 --- a/scripts/validate_with_locationiq.js +++ b/scripts/validate_with_locationiq.js @@ -203,6 +203,7 @@ async function ensureCacheSchema(cacheDb) { longitude REAL NOT NULL, source_geohash TEXT NOT NULL, local_name TEXT, + local_placetype TEXT, local_country_id TEXT, local_admin1_id TEXT, local_json TEXT, @@ -212,12 +213,38 @@ async function ensureCacheSchema(cacheDb) { liq_json TEXT, locality_match INTEGER NOT NULL DEFAULT 0, country_match INTEGER NOT NULL DEFAULT 0, + policy_match INTEGER NOT NULL DEFAULT 0, + policy_reason TEXT, + policy_verdict TEXT NOT NULL DEFAULT 'policy_unset', verdict TEXT NOT NULL, updated_at TEXT NOT NULL DEFAULT (datetime('now')) ); `) } +async function ensureValidationColumns(cacheDb) { + var columns = await dbAll(cacheDb, "PRAGMA table_info(validation_results)") + var byName = Object.create(null) + for (var i = 0; i < columns.length; i++) { + byName[String(columns[i].name)] = true + } + + var additions = [ + ['local_placetype', 'ALTER TABLE validation_results ADD COLUMN local_placetype TEXT'], + ['policy_match', 'ALTER TABLE validation_results ADD COLUMN policy_match INTEGER NOT NULL DEFAULT 0'], + ['policy_reason', 'ALTER TABLE validation_results ADD COLUMN policy_reason TEXT'], + ['policy_verdict', "ALTER TABLE validation_results ADD COLUMN policy_verdict TEXT NOT NULL DEFAULT 'policy_unset'"] + ] + + for (var j = 0; j < additions.length; j++) { + var name = additions[j][0] + var sql = additions[j][1] + if (!byName[name]) { + await dbExec(cacheDb, sql) + } + } +} + function hashString32(value) { var hash = 2166136261 var text = String(value || '') @@ -264,6 +291,61 @@ async function detectLookupTable(sourceDb) { throw new Error('No geohash lookup table found (expected 
compact_geohash_lookup or place_geohash_lookup)') } +async function detectPlacetypeSource(sourceDb) { + var rows = await dbAll( + sourceDb, + "SELECT name FROM sqlite_master WHERE type='table' AND name IN ('compact_places','places')" + ) + var names = Object.create(null) + for (var i = 0; i < rows.length; i++) { + names[rows[i].name] = true + } + if (names.compact_places) return 'compact_places' + if (names.places) return 'places' + return null +} + +async function resolveLocalPlacetype(sourceDb, source, placeId, cache) { + if (placeId === undefined || placeId === null || placeId === '') { + return '' + } + if (!source) { + return '' + } + + var key = String(placeId) + if (cache[key]) { + return cache[key] + } + + var row + if (source === 'compact_places') { + row = await dbGet( + sourceDb, + `SELECT CASE placetype_code + WHEN 0 THEN 'locality' + WHEN 1 THEN 'localadmin' + WHEN 2 THEN 'region' + ELSE '' + END AS placetype + FROM compact_places + WHERE id = ? + LIMIT 1`, + [placeId] + ) + } else { + row = await dbGet( + sourceDb, + 'SELECT placetype FROM places WHERE id = ? LIMIT 1', + [placeId] + ) + } + + var value = row && row.placetype ? 
String(row.placetype) : '' + cache[key] = value + return value +} + function coordKey(latitude, longitude) { return Number(latitude).toFixed(6) + ',' + Number(longitude).toFixed(6) } @@ -353,6 +435,114 @@ function extractLocationIqLocality(address) { return '' } +function matchAddressValue(normalizedLocalName, address, keys) { + if (!normalizedLocalName || !address || typeof address !== 'object') { + return null + } + + for (var i = 0; i < keys.length; i++) { + var key = keys[i] + var value = address[key] + if (!value) continue + if (namesMatch(normalizedLocalName, normalizeName(value))) { + return { key: key, value: String(value) } + } + } + + return null +} + +function displayNameMatch(normalizedLocalName, displayName) { + if (!normalizedLocalName || !displayName) { + return false + } + + var segments = String(displayName).split(',') + for (var i = 0; i < segments.length; i++) { + if (namesMatch(normalizedLocalName, normalizeName(segments[i]))) { + return true + } + } + + return false +} + +function buildPolicyVerdict(params) { + var countryMatch = Boolean(params.countryMatch) + var strictLocalityMatch = Boolean(params.strictLocalityMatch) + var localPlacetype = String(params.localPlacetype || '') + var localName = String(params.localName || '') + var normalizedLocalName = normalizeName(localName) + var liqAddress = params.liqAddress || {} + var liqDisplayName = String(params.liqDisplayName || '') + + if (!countryMatch) { + return { + match: false, + reason: 'country_mismatch', + verdict: 'policy_country_mismatch' + } + } + + if (!normalizedLocalName) { + return { + match: false, + reason: 'missing_local_name', + verdict: 'policy_missing_local_name' + } + } + + if (strictLocalityMatch) { + return { + match: true, + reason: 'strict_locality', + verdict: 'policy_match_strict' + } + } + + var majorKeys = ['city', 'town', 'municipality', 'county', 'state_district', 'state', 'region', 'province'] + var majorMatch = matchAddressValue(normalizedLocalName, liqAddress, 
majorKeys) + if (majorMatch) { + return { + match: true, + reason: 'major_' + majorMatch.key, + verdict: localPlacetype === 'region' ? 'policy_match_region_rollup' : 'policy_match_major_admin' + } + } + + var minorKeys = ['village', 'borough', 'suburb', 'hamlet', 'quarter', 'neighbourhood', 'city_district', 'district'] + var minorMatch = matchAddressValue(normalizedLocalName, liqAddress, minorKeys) + if (minorMatch) { + return { + match: true, + reason: 'minor_' + minorMatch.key, + verdict: 'policy_match_minor_admin' + } + } + + if (displayNameMatch(normalizedLocalName, liqDisplayName)) { + return { + match: true, + reason: 'display_name_segment', + verdict: localPlacetype === 'region' ? 'policy_match_region_rollup' : 'policy_match_display_name' + } + } + + if (localPlacetype === 'region') { + return { + match: false, + reason: 'region_name_not_present', + verdict: 'policy_region_mismatch' + } + } + + return { + match: false, + reason: 'no_policy_match', + verdict: 'policy_mismatch' + } +} + function buildVerdict(localityMatch, countryMatch, localName, liqLocality) { if (localityMatch && countryMatch) return 'match_city_country' if (localityMatch) return 'match_city_only' @@ -462,7 +652,7 @@ function csvEscape(value) { async function writeCsv(cacheDb, csvPath, limit) { var rows = await dbAll( cacheDb, - `SELECT coord_key, latitude, longitude, source_geohash, local_name, local_country_id, liq_locality, liq_country_code, verdict + `SELECT coord_key, latitude, longitude, source_geohash, local_name, local_placetype, local_country_id, liq_locality, liq_country_code, verdict, policy_verdict, policy_reason FROM validation_results ORDER BY updated_at DESC LIMIT ?`, @@ -475,10 +665,13 @@ async function writeCsv(cacheDb, csvPath, limit) { 'longitude', 'source_geohash', 'local_name', + 'local_placetype', 'local_country_id', 'liq_locality', 'liq_country_code', - 'verdict' + 'verdict', + 'policy_verdict', + 'policy_reason' ] var lines = [headers.join(',')] diff --git 
a/spec/boundary_builder_spec.js b/spec/boundary_builder_spec.js index 637c63b..94c3c18 100644 --- a/spec/boundary_builder_spec.js +++ b/spec/boundary_builder_spec.js @@ -326,4 +326,234 @@ describe('boundary builder', () => { fs.rmSync(dir, { recursive: true, force: true }); } }); + + it('rolls parent cells to a dominant major locality and suppresses minor locality descendants', async () => { + const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'offline-geocoder-builder-')); + try { + const inputPath = path.join(dir, 'dominant-locality.geojson'); + const dbPath = path.join(dir, 'dominant-locality.sqlite'); + const parentHash = 's000'; + const parentBbox = geohash.decodeBbox(parentHash); + const splitLon = parentBbox.minLon + ((parentBbox.maxLon - parentBbox.minLon) * 0.75); + + fs.writeFileSync(inputPath, JSON.stringify({ + type: 'FeatureCollection', + features: [ + { + type: 'Feature', + id: 6001, + properties: { + name: 'Fallback Region', + placetype: 'region', + country_id: 'AR', + admin1_id: 1, + is_current: 1 + }, + geometry: { + type: 'Polygon', + coordinates: [[ + [parentBbox.minLon, parentBbox.minLat], + [parentBbox.maxLon, parentBbox.minLat], + [parentBbox.maxLon, parentBbox.maxLat], + [parentBbox.minLon, parentBbox.maxLat], + [parentBbox.minLon, parentBbox.minLat] + ]] + } + }, + { + type: 'Feature', + id: 6002, + properties: { + name: 'Metro Core', + placetype: 'locality', + country_id: 'AR', + admin1_id: 1, + population: 1200000, + is_current: 1 + }, + geometry: { + type: 'Polygon', + coordinates: [[ + [parentBbox.minLon, parentBbox.minLat], + [splitLon, parentBbox.minLat], + [splitLon, parentBbox.maxLat], + [parentBbox.minLon, parentBbox.maxLat], + [parentBbox.minLon, parentBbox.minLat] + ]] + } + }, + { + type: 'Feature', + id: 6003, + properties: { + name: 'Outer Hamlet', + placetype: 'locality', + country_id: 'AR', + admin1_id: 1, + population: 18000, + is_current: 1 + }, + geometry: { + type: 'Polygon', + coordinates: [[ + [splitLon, 
parentBbox.minLat], + [parentBbox.maxLon, parentBbox.minLat], + [parentBbox.maxLon, parentBbox.maxLat], + [splitLon, parentBbox.maxLat], + [splitLon, parentBbox.minLat] + ]] + } + } + ] + })); + + const result = spawnSync('node', [ + path.join(__dirname, '..', 'scripts', 'generate_boundary_index.js'), + '--database', dbPath, + '--input', inputPath, + '--index-mode', 'compact', + '--include-region', 'true', + '--base-precision', '4', + '--max-precision', '5', + '--dominant-locality-population', '100000', + '--dominant-locality-ratio', '3' + ], { encoding: 'utf8' }); + + expect(result.status).toEqual(0); + + const db = new sqlite3.Database(dbPath); + try { + const parentRow = await all(db, `SELECT geohash, place_id FROM compact_geohash_lookup WHERE geohash='${parentHash}'`); + expect(parentRow).toEqual([{ geohash: parentHash, place_id: 6002 }]); + + const minorDescendants = await all( + db, + `SELECT COUNT(*) AS count FROM compact_geohash_lookup WHERE geohash LIKE '${parentHash}%' AND geohash <> '${parentHash}' AND place_id = 6003` + ); + expect(minorDescendants[0].count).toEqual(0); + } finally { + await close(db); + } + } finally { + fs.rmSync(dir, { recursive: true, force: true }); + } + }); + + it('keeps fine locality borders when multiple major localities compete', async () => { + const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'offline-geocoder-builder-')); + try { + const inputPath = path.join(dir, 'major-competition.geojson'); + const dbPath = path.join(dir, 'major-competition.sqlite'); + const parentHash = 's000'; + const parentBbox = geohash.decodeBbox(parentHash); + const midLon = (parentBbox.minLon + parentBbox.maxLon) / 2; + + fs.writeFileSync(inputPath, JSON.stringify({ + type: 'FeatureCollection', + features: [ + { + type: 'Feature', + id: 7001, + properties: { + name: 'Fallback Region', + placetype: 'region', + country_id: 'AR', + admin1_id: 1, + is_current: 1 + }, + geometry: { + type: 'Polygon', + coordinates: [[ + [parentBbox.minLon, 
parentBbox.minLat], + [parentBbox.maxLon, parentBbox.minLat], + [parentBbox.maxLon, parentBbox.maxLat], + [parentBbox.minLon, parentBbox.maxLat], + [parentBbox.minLon, parentBbox.minLat] + ]] + } + }, + { + type: 'Feature', + id: 7002, + properties: { + name: 'West Major City', + placetype: 'locality', + country_id: 'AR', + admin1_id: 1, + population: 1000000, + is_current: 1 + }, + geometry: { + type: 'Polygon', + coordinates: [[ + [parentBbox.minLon, parentBbox.minLat], + [midLon, parentBbox.minLat], + [midLon, parentBbox.maxLat], + [parentBbox.minLon, parentBbox.maxLat], + [parentBbox.minLon, parentBbox.minLat] + ]] + } + }, + { + type: 'Feature', + id: 7003, + properties: { + name: 'East Major City', + placetype: 'locality', + country_id: 'AR', + admin1_id: 1, + population: 850000, + is_current: 1 + }, + geometry: { + type: 'Polygon', + coordinates: [[ + [midLon, parentBbox.minLat], + [parentBbox.maxLon, parentBbox.minLat], + [parentBbox.maxLon, parentBbox.maxLat], + [midLon, parentBbox.maxLat], + [midLon, parentBbox.minLat] + ]] + } + } + ] + })); + + const result = spawnSync('node', [ + path.join(__dirname, '..', 'scripts', 'generate_boundary_index.js'), + '--database', dbPath, + '--input', inputPath, + '--index-mode', 'compact', + '--include-region', 'true', + '--base-precision', '4', + '--max-precision', '5', + '--dominant-locality-population', '100000', + '--dominant-locality-ratio', '3' + ], { encoding: 'utf8' }); + + expect(result.status).toEqual(0); + + const db = new sqlite3.Database(dbPath); + try { + const majorRows = await all( + db, + `SELECT place_id, COUNT(*) AS count + FROM compact_geohash_lookup + WHERE geohash LIKE '${parentHash}%' AND place_id IN (7002, 7003) + GROUP BY place_id + ORDER BY place_id` + ); + expect(majorRows).toEqual([ + { place_id: 7002, count: jasmine.any(Number) }, + { place_id: 7003, count: jasmine.any(Number) } + ]); + expect(majorRows[0].count).toBeGreaterThan(0); + expect(majorRows[1].count).toBeGreaterThan(0); + } 
finally { + await close(db); + } + } finally { + fs.rmSync(dir, { recursive: true, force: true }); + } + }); }); From 238eb58f7c2655595d5ef7a74ff91fba1288cfea Mon Sep 17 00:00:00 2001 From: Sebastian Schloesser Date: Thu, 26 Mar 2026 03:05:17 -0400 Subject: [PATCH 05/10] Require 50% locality share for parent-cell takeover --- README.md | 3 + scripts/generate_boundary_index.js | 46 ++++++++++++++- scripts/generate_wof_boundary.sh | 3 + spec/boundary_builder_spec.js | 89 ++++++++++++++++++++++++++++++ 4 files changed, 140 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 466d46b..15e4fa4 100644 --- a/README.md +++ b/README.md @@ -227,6 +227,8 @@ Builder notes: - Dominant-city rollup keeps broad city labels sticky in mixed city/suburb cells unless there is competing major-city pressure: - `--dominant-locality-population` (default `100000`) - `--dominant-locality-ratio` (default `3`) +- Parent-cell takeover guard: + - `--parent-locality-min-share` (default `0.5`) requires locality ownership of at least that child-cell share before replacing a parent cell label - Excludes neighbourhood-like placetypes from default reverse output - `--index-mode compact` (default) stores only geohash-to-place mappings (`compact_geohash_lookup`) and no runtime geometry payloads. Compact schema uses `compact_places(id,name,country_id,admin1_id,placetype_code,latitude,longitude)`. 
@@ -264,6 +266,7 @@ Useful WOF build env vars: - `WOF_PROMOTE_LOCALITY_OVER_REGION=1|0` prefer locality labels over region in shared parent cells (default `1`) - `WOF_DOMINANT_LOCALITY_POPULATION` major-locality threshold for dominant-city rollup (default `100000`) - `WOF_DOMINANT_LOCALITY_RATIO` dominant-vs-next locality population ratio (default `3`) +- `WOF_PARENT_LOCALITY_MIN_SHARE` minimum child-cell share for locality parent takeover (default `0.5`) - `WOF_GEOMETRY_DECIMALS` round coordinates before storage/indexing (for example `4`) - `WOF_MIN_POPULATION` filter out places below threshold (for example `10000`) - `WOF_INCLUDE_REGION=1|0` include/exclude region fallback boundaries diff --git a/scripts/generate_boundary_index.js b/scripts/generate_boundary_index.js index 19ef38c..30d7283 100755 --- a/scripts/generate_boundary_index.js +++ b/scripts/generate_boundary_index.js @@ -53,7 +53,8 @@ function parseArgs(argv) { regionSparseMinAreaKm2: null, promoteLocalityOverRegion: true, dominantLocalityPopulation: 100000, - dominantLocalityRatio: 3 + dominantLocalityRatio: 3, + parentLocalityMinShare: 0.5 } for (var i = 0; i < argv.length; i++) { @@ -111,6 +112,9 @@ function parseArgs(argv) { } else if (arg === '--dominant-locality-ratio') { var dominantRatio = Number(argv[++i]) opts.dominantLocalityRatio = Number.isFinite(dominantRatio) ? dominantRatio : opts.dominantLocalityRatio + } else if (arg === '--parent-locality-min-share') { + var minShare = Number(argv[++i]) + opts.parentLocalityMinShare = Number.isFinite(minShare) ? 
minShare : opts.parentLocalityMinShare } else if (arg === '--append') { opts.replace = false } else if (arg === '--replace') { @@ -151,6 +155,7 @@ function usage() { ' --promote-locality-over-region Prefer locality over region in shared parent cells when no competing locality exists (default: true)', ' --dominant-locality-population Population threshold that marks locality as major for dominant-city rollups (default: 100000)', ' --dominant-locality-ratio Required dominant-vs-next population ratio for locality rollups (default: 3)', + ' --parent-locality-min-share Minimum child-cell share (0..1) required to let a locality take over a parent cell (default: 0.5)', ' --append Keep existing boundary rows and append/replace by place id', ' --replace Clear boundary rows first (default)', ' --help, -h Show this help message' @@ -879,6 +884,27 @@ function selectDominantLocalityId(localityIds, placeById, opts) { return top.id } +function localityShareInParent(localityId, group) { + if (!group) return 0 + + var counts = group.localityCellCountById || Object.create(null) + var localityCount = Number(counts[String(localityId)] || 0) + if (!localityCount) { + return 0 + } + + return localityCount / 32 +} + +function localityShareMeetsThreshold(localityId, group, opts) { + var threshold = Number(opts.parentLocalityMinShare) + if (!Number.isFinite(threshold) || threshold <= 0) { + return true + } + + return localityShareInParent(localityId, group) >= threshold +} + function promoteLocalityParentsByRegionCompetition(bestByHash, placeById, opts) { if (!opts.promoteLocalityOverRegion) { return @@ -907,6 +933,7 @@ function promoteLocalityParentsByRegionCompetition(bestByHash, placeById, opts) if (!group) { group = { localityById: Object.create(null), + localityCellCountById: Object.create(null), hasRegion: false } groupByParent[parent] = group @@ -914,6 +941,7 @@ function promoteLocalityParentsByRegionCompetition(bestByHash, placeById, opts) if 
(isCityPlacetypeCode(place.placetypeCode)) { group.localityById[String(place.id)] = true + group.localityCellCountById[String(place.id)] = Number(group.localityCellCountById[String(place.id)] || 0) + 1 } else if (place.placetypeCode === PLACETYPE_CODES.region) { group.hasRegion = true } @@ -935,6 +963,10 @@ function promoteLocalityParentsByRegionCompetition(bestByHash, placeById, opts) if (localityIds.length === 1) { var localityId = Number(localityIds[0]) + if (!localityShareMeetsThreshold(localityId, group, opts)) { + continue + } + var hasRegionCompetition = group.hasRegion if (existingPlace && isCityPlacetypeCode(existingPlace.placetypeCode) && Number(existingId) !== localityId) { continue @@ -956,6 +988,9 @@ function promoteLocalityParentsByRegionCompetition(bestByHash, placeById, opts) if (dominantLocalityId === null) { continue } + if (!localityShareMeetsThreshold(dominantLocalityId, group, opts)) { + continue + } if (existingPlace && isCityPlacetypeCode(existingPlace.placetypeCode) && @@ -1436,6 +1471,14 @@ async function main() { if (!Number.isFinite(options.dominantLocalityRatio) || options.dominantLocalityRatio < 1) { options.dominantLocalityRatio = 1 } + if (!Number.isFinite(options.parentLocalityMinShare)) { + options.parentLocalityMinShare = 0.5 + } + if (options.parentLocalityMinShare < 0) { + options.parentLocalityMinShare = 0 + } else if (options.parentLocalityMinShare > 1) { + options.parentLocalityMinShare = 1 + } if (options.regionSparseMaxPrecision !== null) { if (!Number.isFinite(options.regionSparseMaxPrecision) || options.regionSparseMaxPrecision < 1) { options.regionSparseMaxPrecision = null @@ -1500,6 +1543,7 @@ async function main() { } else { console.log('Dominant locality rollup: disabled') } + console.log('Parent locality takeover min share: ' + options.parentLocalityMinShare) console.log('Index mode: ' + options.indexMode) console.log('Promote locality over region: ' + (options.promoteLocalityOverRegion ? 
'true' : 'false')) console.log('Min population: ' + options.minPopulation) diff --git a/scripts/generate_wof_boundary.sh b/scripts/generate_wof_boundary.sh index c7b03b9..6d9cee9 100755 --- a/scripts/generate_wof_boundary.sh +++ b/scripts/generate_wof_boundary.sh @@ -21,6 +21,7 @@ set -euo pipefail # WOF_PROMOTE_LOCALITY_OVER_REGION Prefer locality labels over region in shared parent cells (default: 1) # WOF_DOMINANT_LOCALITY_POPULATION Major-locality threshold for dominant-city rollup (default: 100000) # WOF_DOMINANT_LOCALITY_RATIO Dominant-vs-next locality population ratio (default: 3) +# WOF_PARENT_LOCALITY_MIN_SHARE Minimum child-cell share (0..1) required for locality parent takeover (default: 0.5) # WOF_INCLUDE_LOCALADMIN Include localadmin placetypes (default: 0) # WOF_INCLUDE_REGION Include region placetypes (default: 1) # WOF_DROP_CONTAINED_LOCALITIES Drop localities contained in larger localities (default: 1) @@ -46,6 +47,7 @@ WOF_REGION_SPARSE_MIN_AREA_KM2="${WOF_REGION_SPARSE_MIN_AREA_KM2:-80000}" WOF_PROMOTE_LOCALITY_OVER_REGION="${WOF_PROMOTE_LOCALITY_OVER_REGION:-1}" WOF_DOMINANT_LOCALITY_POPULATION="${WOF_DOMINANT_LOCALITY_POPULATION:-100000}" WOF_DOMINANT_LOCALITY_RATIO="${WOF_DOMINANT_LOCALITY_RATIO:-3}" +WOF_PARENT_LOCALITY_MIN_SHARE="${WOF_PARENT_LOCALITY_MIN_SHARE:-0.5}" WOF_INCLUDE_LOCALADMIN="${WOF_INCLUDE_LOCALADMIN:-0}" WOF_INCLUDE_REGION="${WOF_INCLUDE_REGION:-1}" WOF_DROP_CONTAINED_LOCALITIES="${WOF_DROP_CONTAINED_LOCALITIES:-1}" @@ -129,6 +131,7 @@ CMD=( --promote-locality-over-region "${WOF_PROMOTE_LOCALITY_OVER_REGION}" --dominant-locality-population "${WOF_DOMINANT_LOCALITY_POPULATION}" --dominant-locality-ratio "${WOF_DOMINANT_LOCALITY_RATIO}" + --parent-locality-min-share "${WOF_PARENT_LOCALITY_MIN_SHARE}" --include-localadmin "${WOF_INCLUDE_LOCALADMIN}" --include-region "${WOF_INCLUDE_REGION}" --drop-contained-localities "${WOF_DROP_CONTAINED_LOCALITIES}" diff --git a/spec/boundary_builder_spec.js b/spec/boundary_builder_spec.js 
index 94c3c18..d788e76 100644 --- a/spec/boundary_builder_spec.js +++ b/spec/boundary_builder_spec.js @@ -327,6 +327,95 @@ describe('boundary builder', () => { } }); + it('does not promote locality to parent cell when locality child-share is below threshold', async () => { + const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'offline-geocoder-builder-')); + try { + const inputPath = path.join(dir, 'locality-region-share-threshold.geojson'); + const dbPath = path.join(dir, 'locality-region-share-threshold.sqlite'); + const parentHash = 's000'; + const parentBbox = geohash.decodeBbox(parentHash); + const splitLon = parentBbox.minLon + ((parentBbox.maxLon - parentBbox.minLon) * 0.25); + + fs.writeFileSync(inputPath, JSON.stringify({ + type: 'FeatureCollection', + features: [ + { + type: 'Feature', + id: 5101, + properties: { + name: 'Wide Region', + placetype: 'region', + country_id: 'AR', + admin1_id: 1, + is_current: 1 + }, + geometry: { + type: 'Polygon', + coordinates: [[ + [parentBbox.minLon, parentBbox.minLat], + [parentBbox.maxLon, parentBbox.minLat], + [parentBbox.maxLon, parentBbox.maxLat], + [parentBbox.minLon, parentBbox.maxLat], + [parentBbox.minLon, parentBbox.minLat] + ]] + } + }, + { + type: 'Feature', + id: 5102, + properties: { + name: 'Small Town', + placetype: 'locality', + country_id: 'AR', + admin1_id: 1, + population: 40000, + is_current: 1 + }, + geometry: { + type: 'Polygon', + coordinates: [[ + [parentBbox.minLon, parentBbox.minLat], + [splitLon, parentBbox.minLat], + [splitLon, parentBbox.maxLat], + [parentBbox.minLon, parentBbox.maxLat], + [parentBbox.minLon, parentBbox.minLat] + ]] + } + } + ] + })); + + const result = spawnSync('node', [ + path.join(__dirname, '..', 'scripts', 'generate_boundary_index.js'), + '--database', dbPath, + '--input', inputPath, + '--index-mode', 'compact', + '--include-region', 'true', + '--base-precision', '4', + '--max-precision', '5', + '--parent-locality-min-share', '0.5' + ], { encoding: 'utf8' }); + + 
expect(result.status).toEqual(0); + + const db = new sqlite3.Database(dbPath); + try { + const parentRow = await all(db, `SELECT geohash, place_id FROM compact_geohash_lookup WHERE geohash='${parentHash}'`); + expect(parentRow).toEqual([{ geohash: parentHash, place_id: 5101 }]); + + const localityDescendants = await all( + db, + `SELECT COUNT(*) AS count FROM compact_geohash_lookup WHERE geohash LIKE '${parentHash}%' AND geohash <> '${parentHash}' AND place_id = 5102` + ); + expect(localityDescendants[0].count).toBeGreaterThan(0); + } finally { + await close(db); + } + } finally { + fs.rmSync(dir, { recursive: true, force: true }); + } + }); + it('rolls parent cells to a dominant major locality and suppresses minor locality descendants', async () => { const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'offline-geocoder-builder-')); try { From 5ab2ae414d181d995c80bed70aeefa23cfd61707 Mon Sep 17 00:00:00 2001 From: Sebastian Schloesser Date: Tue, 7 Apr 2026 11:42:51 +0700 Subject: [PATCH 06/10] Implement lockfile to pin WOF data sets --- README.md | 1 + scripts/generate_wof_boundary.sh | 51 +++++++++++++++++++++++-- scripts/validate_with_locationiq.js | 59 ++++++++++++++++++++++++++--- 3 files changed, 102 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 15e4fa4..7a0dd68 100644 --- a/README.md +++ b/README.md @@ -259,6 +259,7 @@ Useful WOF build env vars: - `WOF_WORKDIR` working directory for downloads/extracted files (default `tmp/wof-build`) - `WOF_DOWNLOAD=0` reuse existing archives only - `WOF_REF` branch/ref to download (default `master`) +- `WOF_REF_LOCK_FILE` optional per-country pinned refs (`COUNTRY REF` per line); when set, this overrides `WOF_REF` per country - `WOF_LOCALITY_MAX_PRECISION` locality precision cap - `WOF_REGION_MAX_PRECISION` region precision cap (default `4`) - `WOF_REGION_SPARSE_MAX_PRECISION` sparse very-large-region precision (default `3`) diff --git a/scripts/generate_wof_boundary.sh b/scripts/generate_wof_boundary.sh index
6d9cee9..3985580 100755 --- a/scripts/generate_wof_boundary.sh +++ b/scripts/generate_wof_boundary.sh @@ -11,6 +11,7 @@ set -euo pipefail # WOF_WORKDIR Working directory for archives/extraction (default: ./tmp/wof-build) # WOF_DOWNLOAD Set to 0 to skip downloads and reuse existing archives (default: 1) # WOF_REF Git ref to download from codeload (default: master) +# WOF_REF_LOCK_FILE Optional file with per-country pinned refs: "COUNTRY REF" per line # WOF_BASE_PRECISION Geohash base precision (default: 4) # WOF_MAX_PRECISION Geohash max precision (default: 5) # WOF_LOCALITY_MAX_PRECISION Locality max precision override (default: WOF_MAX_PRECISION) @@ -37,6 +38,7 @@ WOF_COUNTRIES="${WOF_COUNTRIES:-FR,IT}" WOF_WORKDIR="${WOF_WORKDIR:-$(pwd)/tmp/wof-build}" WOF_DOWNLOAD="${WOF_DOWNLOAD:-1}" WOF_REF="${WOF_REF:-master}" +WOF_REF_LOCK_FILE="${WOF_REF_LOCK_FILE:-}" WOF_BASE_PRECISION="${WOF_BASE_PRECISION:-4}" WOF_MAX_PRECISION="${WOF_MAX_PRECISION:-5}" WOF_LOCALITY_MAX_PRECISION="${WOF_LOCALITY_MAX_PRECISION:-${WOF_MAX_PRECISION}}" @@ -62,6 +64,48 @@ case "${OUTPUT}" in *) OUTPUT="$(pwd)/${OUTPUT}" ;; esac +if [[ -n "${WOF_REF_LOCK_FILE}" ]]; then + case "${WOF_REF_LOCK_FILE}" in + /*) ;; + *) WOF_REF_LOCK_FILE="$(pwd)/${WOF_REF_LOCK_FILE}" ;; + esac + + if [[ !
-f "${WOF_REF_LOCK_FILE}" ]]; then + echo "WOF_REF_LOCK_FILE does not exist: ${WOF_REF_LOCK_FILE}" >&2 + exit 1 + fi +fi + +resolve_country_ref() { + local country="$1" + local fallback_ref="$2" + + if [[ -z "${WOF_REF_LOCK_FILE}" ]]; then + echo "${fallback_ref}" + return 0 + fi + + local resolved_ref + resolved_ref="$(awk -F'[,\t ]+' -v cc="${country}" ' + BEGIN { lower = tolower(cc) } + /^[[:space:]]*#/ { next } + NF < 2 { next } + { + if (tolower($1) == lower) { + print $2 + exit + } + } + ' "${WOF_REF_LOCK_FILE}")" + + if [[ -z "${resolved_ref}" ]]; then + echo "Missing pinned ref for country ${country} in ${WOF_REF_LOCK_FILE}" >&2 + exit 1 + fi + + echo "${resolved_ref}" +} + ARCHIVE_DIR="${WOF_WORKDIR}/archives" EXTRACT_DIR="${WOF_WORKDIR}/extracted" mkdir -p "${ARCHIVE_DIR}" "${EXTRACT_DIR}" @@ -76,7 +120,8 @@ for item in "${COUNTRY_ITEMS[@]}"; do fi repo="whosonfirst-data-admin-${country}" - archive="${ARCHIVE_DIR}/${repo}-${WOF_REF}.tar.gz" + country_ref="$(resolve_country_ref "${country}" "${WOF_REF}")" + archive="${ARCHIVE_DIR}/${repo}-${country_ref}.tar.gz" if [[ ! -f "${archive}" ]]; then if [[ "${WOF_DOWNLOAD}" != "1" ]]; then @@ -85,8 +130,8 @@ for item in "${COUNTRY_ITEMS[@]}"; do exit 1 fi - url="https://codeload.github.com/whosonfirst-data/${repo}/tar.gz/${WOF_REF}" - echo "Downloading ${repo}@${WOF_REF}..." + url="https://codeload.github.com/whosonfirst-data/${repo}/tar.gz/${country_ref}" + echo "Downloading ${repo}@${country_ref}..." 
curl -fsSL "${url}" -o "${archive}" else echo "Using existing archive ${archive}" diff --git a/scripts/validate_with_locationiq.js b/scripts/validate_with_locationiq.js index 6509d95..414fbdf 100644 --- a/scripts/validate_with_locationiq.js +++ b/scripts/validate_with_locationiq.js @@ -720,7 +720,10 @@ async function main() { try { await ensureCacheSchema(cacheDb) + await ensureValidationColumns(cacheDb) var lookupTable = await detectLookupTable(sourceDb) + var placetypeSource = await detectPlacetypeSource(sourceDb) + var placetypeCache = Object.create(null) await ensureSamplePoints(sourceDb, cacheDb, lookupTable, opts.samples, Number.isFinite(opts.seed) ? opts.seed : 1337) var points = await dbAll( @@ -746,26 +749,37 @@ async function main() { var liqAddress = (liqResult.json && liqResult.json.address) || {} var liqLocality = extractLocationIqLocality(liqAddress) var liqCountryCode = liqAddress.country_code ? String(liqAddress.country_code).toUpperCase() : '' + var liqDisplayName = liqResult.json && liqResult.json.display_name ? String(liqResult.json.display_name) : '' var localName = localResult.name || '' var localCountryId = (localResult.country && localResult.country.id) ? 
String(localResult.country.id).toUpperCase() : '' + var localPlacetype = await resolveLocalPlacetype(sourceDb, placetypeSource, localResult.id, placetypeCache) var localityMatch = namesMatch(normalizeName(localName), normalizeName(liqLocality)) - var countryMatch = localCountryId && liqCountryCode && localCountryId === liqCountryCode + var countryMatch = Boolean(localCountryId && liqCountryCode && localCountryId === liqCountryCode) var verdict = buildVerdict(localityMatch, countryMatch, localName, liqLocality) + var policyVerdict = buildPolicyVerdict({ + countryMatch: countryMatch, + strictLocalityMatch: localityMatch, + localPlacetype: localPlacetype, + localName: localName, + liqAddress: liqAddress, + liqDisplayName: liqDisplayName + }) await dbRun( cacheDb, `INSERT INTO validation_results( coord_key, latitude, longitude, source_geohash, - local_name, local_country_id, local_admin1_id, local_json, + local_name, local_placetype, local_country_id, local_admin1_id, local_json, liq_locality, liq_country_code, liq_display_name, liq_json, - locality_match, country_match, verdict, updated_at - ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, datetime('now')) + locality_match, country_match, policy_match, policy_reason, policy_verdict, verdict, updated_at + ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, datetime('now')) ON CONFLICT(coord_key) DO UPDATE SET latitude=excluded.latitude, longitude=excluded.longitude, source_geohash=excluded.source_geohash, local_name=excluded.local_name, + local_placetype=excluded.local_placetype, local_country_id=excluded.local_country_id, local_admin1_id=excluded.local_admin1_id, local_json=excluded.local_json, @@ -775,6 +789,9 @@ async function main() { liq_json=excluded.liq_json, locality_match=excluded.locality_match, country_match=excluded.country_match, + policy_match=excluded.policy_match, + policy_reason=excluded.policy_reason, + policy_verdict=excluded.policy_verdict, verdict=excluded.verdict, 
updated_at=excluded.updated_at`, [ @@ -783,15 +800,19 @@ async function main() { point.longitude, point.source_geohash, localName || null, + localPlacetype || null, localCountryId || null, (localResult.admin1 && localResult.admin1.id) ? String(localResult.admin1.id) : null, JSON.stringify(localResult), liqLocality || null, liqCountryCode || null, - liqResult.json && liqResult.json.display_name ? String(liqResult.json.display_name) : null, + liqDisplayName || null, JSON.stringify(liqResult.json), localityMatch ? 1 : 0, countryMatch ? 1 : 0, + policyVerdict.match ? 1 : 0, + policyVerdict.reason || null, + policyVerdict.verdict || 'policy_unset', verdict ] ) @@ -811,6 +832,16 @@ async function main() { [opts.samples] ) + var policyVerdictRows = await dbAll( + cacheDb, + `SELECT policy_verdict, COUNT(*) AS count + FROM validation_results + WHERE coord_key IN (SELECT coord_key FROM sample_points ORDER BY created_at ASC, coord_key ASC LIMIT ?) + GROUP BY policy_verdict + ORDER BY count DESC, policy_verdict ASC`, + [opts.samples] + ) + var totalRow = await dbGet( cacheDb, `SELECT COUNT(*) AS count @@ -818,12 +849,28 @@ async function main() { WHERE coord_key IN (SELECT coord_key FROM sample_points ORDER BY created_at ASC, coord_key ASC LIMIT ?)`, [opts.samples] ) + var policyMatchRow = await dbGet( + cacheDb, + `SELECT COUNT(*) AS count + FROM validation_results + WHERE policy_match = 1 + AND coord_key IN (SELECT coord_key FROM sample_points ORDER BY created_at ASC, coord_key ASC LIMIT ?)`, + [opts.samples] + ) + var totalCount = Number(totalRow && totalRow.count ? totalRow.count : 0) + var policyMatchCount = Number(policyMatchRow && policyMatchRow.count ? policyMatchRow.count : 0) + var policyRatePct = totalCount > 0 ? ((policyMatchCount * 100) / totalCount) : 0 console.log('Validation complete') console.log('Geocoder DB: ' + opts.database) console.log('Cache DB: ' + opts.cacheDb) - console.log('Samples evaluated: ' + Number(totalRow && totalRow.count ? 
totalRow.count : 0)) + console.log('Samples evaluated: ' + totalCount) console.log('LocationIQ uncached calls this run: ' + uncachedCalls) + console.log('Policy match rate: ' + policyMatchCount + '/' + totalCount + ' (' + policyRatePct.toFixed(1) + '%)') + console.log('Policy verdict distribution:') + for (var k = 0; k < policyVerdictRows.length; k++) { + console.log(' ' + policyVerdictRows[k].policy_verdict + ': ' + policyVerdictRows[k].count) + } console.log('Verdict distribution:') for (var j = 0; j < verdictRows.length; j++) { console.log(' ' + verdictRows[j].verdict + ': ' + verdictRows[j].count) From d1e1f99d1564474d2aecdc621458801492734552 Mon Sep 17 00:00:00 2001 From: Sebastian Schloesser Date: Tue, 7 Apr 2026 18:00:01 +0700 Subject: [PATCH 07/10] Optimize script to run with limited memory --- README.md | 2 ++ scripts/generate_wof_boundary.sh | 29 +++++++++++++++++++++++++++-- 2 files changed, 29 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 7a0dd68..5f9d09c 100644 --- a/README.md +++ b/README.md @@ -273,6 +273,8 @@ Useful WOF build env vars: - `WOF_INCLUDE_REGION=1|0` include/exclude region fallback boundaries - `WOF_MAX_PLACES` cap places for experiment runs - `WOF_DROP_CONTAINED_LOCALITIES=1|0` enable/disable contained-locality pruning +- `WOF_SKIP_INVALID_REPOS=1|0` skip malformed/unexpected WOF admin repos during bulk runs (default `1`) +- `WOF_APPEND=1|0` append to an existing compact DB instead of replacing schema (default `0`) Boundary runtime modes: diff --git a/scripts/generate_wof_boundary.sh b/scripts/generate_wof_boundary.sh index 3985580..067452f 100755 --- a/scripts/generate_wof_boundary.sh +++ b/scripts/generate_wof_boundary.sh @@ -30,6 +30,8 @@ set -euo pipefail # WOF_GEOMETRY_DECIMALS Optional coordinate rounding precision (e.g. 
4) # WOF_MIN_POPULATION Optional minimum population filter (default: 0) # WOF_MAX_PLACES Optional cap for experiment runs +# WOF_SKIP_INVALID_REPOS Skip repos missing expected extracted data dir (default: 1) +# WOF_APPEND Append to existing DB instead of replacing schema (default: 0) # # Notes: # - This helper always builds `--index-mode compact` (geohash -> place only). @@ -57,6 +59,8 @@ WOF_INCLUDE_ALT="${WOF_INCLUDE_ALT:-0}" WOF_GEOMETRY_DECIMALS="${WOF_GEOMETRY_DECIMALS:-}" WOF_MIN_POPULATION="${WOF_MIN_POPULATION:-0}" WOF_MAX_PLACES="${WOF_MAX_PLACES:-}" +WOF_SKIP_INVALID_REPOS="${WOF_SKIP_INVALID_REPOS:-1}" +WOF_APPEND="${WOF_APPEND:-0}" OUTPUT="${1:-db.sqlite}" case "${OUTPUT}" in @@ -132,7 +136,9 @@ for item in "${COUNTRY_ITEMS[@]}"; do url="https://codeload.github.com/whosonfirst-data/${repo}/tar.gz/${country_ref}" echo "Downloading ${repo}@${country_ref}..." - curl -fsSL "${url}" -o "${archive}" + curl --fail --silent --show-error --location \ + --retry 5 --retry-delay 2 --retry-connrefused \ + "${url}" -o "${archive}" else echo "Using existing archive ${archive}" fi @@ -140,16 +146,31 @@ for item in "${COUNTRY_ITEMS[@]}"; do country_extract="${EXTRACT_DIR}/${country}" rm -rf "${country_extract}" mkdir -p "${country_extract}" - tar -xzf "${archive}" -C "${country_extract}" + if ! tar -xzf "${archive}" -C "${country_extract}"; then + if [[ "${WOF_SKIP_INVALID_REPOS}" == "1" ]]; then + echo "Warning: failed to extract ${archive}; skipping ${repo}" >&2 + continue + fi + echo "Failed to extract ${archive}" >&2 + exit 1 + fi root_dir="$(find "${country_extract}" -mindepth 1 -maxdepth 1 -type d | head -n 1)" if [[ -z "${root_dir}" ]]; then + if [[ "${WOF_SKIP_INVALID_REPOS}" == "1" ]]; then + echo "Warning: failed to find extracted root directory for ${repo}; skipping" >&2 + continue + fi echo "Failed to find extracted root directory for ${repo}" >&2 exit 1 fi data_dir="${root_dir}/data" if [[ ! 
-d "${data_dir}" ]]; then + if [[ "${WOF_SKIP_INVALID_REPOS}" == "1" ]]; then + echo "Warning: expected data directory not found for ${repo}; skipping (${data_dir})" >&2 + continue + fi echo "Expected data directory not found: ${data_dir}" >&2 exit 1 fi @@ -192,6 +213,10 @@ if [[ -n "${WOF_GEOMETRY_DECIMALS}" ]]; then CMD+=(--geometry-decimals "${WOF_GEOMETRY_DECIMALS}") fi +if [[ "${WOF_APPEND}" == "1" ]]; then + CMD+=(--append) +fi + CMD+=("${INPUT_ARGS[@]}") "${CMD[@]}" From bd87be1953850d0c9c338c1446439d6d9aefee09 Mon Sep 17 00:00:00 2001 From: Sebastian Schloesser Date: Fri, 10 Apr 2026 09:09:25 +0700 Subject: [PATCH 08/10] Include county boundaries - (capped at geohash precision 4) so places like Ko Pha-ngan that lack locality records still appear Restructure the WOF build script to extract and clean up per-batch instead of extracting all 264 countries upfront, keeping disk usage bounded during world builds. --- .gitignore | 2 + README.md | 13 ++ scripts/generate_boundary_index.js | 198 ++++++++++++++++++++++++++--- scripts/generate_wof_boundary.sh | 167 +++++++++++++++++------- src/reverse.js | 6 +- 5 files changed, 320 insertions(+), 66 deletions(-) diff --git a/.gitignore b/.gitignore index c198c54..4315c38 100644 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,5 @@ package-lock.json .geonames-build .DS_Store tmp +*.sqlite +*.db diff --git a/README.md b/README.md index 5f9d09c..ae657c1 100644 --- a/README.md +++ b/README.md @@ -211,6 +211,17 @@ contained in larger localities within the same country/admin1 group. This is intended to suppress duplicate neighbourhood-like localities while keeping small isolated places (for example islands) that are not contained. +#### Place selection pipeline + +The builder uses a multi-stage pipeline to decide which localities make it into the index: + +1. **Primary filter** (`--min-population`): localities at or above this threshold are always included. Country capitals are always included regardless of population. +2. 
**Isolation pass** (`--isolation-min-population`): localities between the isolation floor and the primary threshold are evaluated as candidates. A candidate is promoted if at least one of its geohash cover cells (at base precision) is not already claimed by a primary locality. This ensures small but geographically isolated places like islands, remote towns, and oases get their own label without adding noise in dense urban areas. +3. **Country guarantee** (`--ensure-country-locality`): after the isolation pass, any country that still has zero localities gets its highest-population candidate promoted unconditionally. +4. **Contained-locality pruning** (`--drop-contained-localities`): removes localities whose polygon is fully contained inside a larger locality in the same country/admin1 group. +5. **Dominant-city rollup**: in the geohash index, when a major city (population >= `--dominant-locality-population`) dominates its neighbours by a ratio of `--dominant-locality-ratio`, smaller nearby localities are absorbed into the major city label. +6. **Locality-over-region promotion**: when a locality and a region compete for the same parent geohash cell, the locality wins if it covers >= `--parent-locality-min-share` of child cells. + Builder notes: - Keeps current records only (drops deprecated/superseded where source metadata is present) @@ -270,6 +281,8 @@ Useful WOF build env vars: - `WOF_PARENT_LOCALITY_MIN_SHARE` minimum child-cell share for locality parent takeover (default `0.5`) - `WOF_GEOMETRY_DECIMALS` round coordinates before storage/indexing (for example `4`) - `WOF_MIN_POPULATION` filter out places below threshold (for example `10000`) +- `WOF_ISOLATION_MIN_POPULATION` lower population floor for isolated localities (default `500`). 
Places between this and `WOF_MIN_POPULATION` are included only if they occupy otherwise-empty geohash cells +- `WOF_ENSURE_COUNTRY_LOCALITY=1|0` guarantee at least one locality per country (default `1`) - `WOF_INCLUDE_REGION=1|0` include/exclude region fallback boundaries - `WOF_MAX_PLACES` cap places for experiment runs - `WOF_DROP_CONTAINED_LOCALITIES=1|0` enable/disable contained-locality pruning diff --git a/scripts/generate_boundary_index.js b/scripts/generate_boundary_index.js index 30d7283..59d092f 100755 --- a/scripts/generate_boundary_index.js +++ b/scripts/generate_boundary_index.js @@ -11,7 +11,8 @@ const geohash = require('../src/geohash') const PLACETYPE_CODES = { locality: 0, localadmin: 1, - region: 2 + region: 2, + county: 3 } function parseBool(value, defaultValue) { @@ -38,6 +39,7 @@ function parseArgs(argv) { basePrecision: 4, maxPrecision: 7, includeLocaladmin: false, + includeCounty: false, includeRegion: false, replace: true, includeAlt: false, @@ -45,9 +47,12 @@ function parseArgs(argv) { maxPlaces: null, geometryDecimals: null, minPopulation: 0, + isolationMinPopulation: null, + ensureCountryLocality: true, indexMode: 'compact', localityMaxPrecision: null, localadminMaxPrecision: null, + countyMaxPrecision: null, regionMaxPrecision: null, regionSparseMaxPrecision: null, regionSparseMinAreaKm2: null, @@ -72,6 +77,8 @@ function parseArgs(argv) { opts.maxPrecision = Number(argv[++i]) } else if (arg === '--include-localadmin') { opts.includeLocaladmin = parseBool(argv[++i], false) + } else if (arg === '--include-county') { + opts.includeCounty = parseBool(argv[++i], false) } else if (arg === '--include-region') { opts.includeRegion = parseBool(argv[++i], false) } else if (arg === '--include-alt') { @@ -87,6 +94,11 @@ function parseArgs(argv) { } else if (arg === '--min-population') { var minPopulation = Number(argv[++i]) opts.minPopulation = Number.isFinite(minPopulation) && minPopulation > 0 ? 
Math.trunc(minPopulation) : 0 + } else if (arg === '--isolation-min-population') { + var isolationMin = Number(argv[++i]) + opts.isolationMinPopulation = Number.isFinite(isolationMin) && isolationMin > 0 ? Math.trunc(isolationMin) : null + } else if (arg === '--ensure-country-locality') { + opts.ensureCountryLocality = parseBool(argv[++i], true) } else if (arg === '--index-mode') { opts.indexMode = String(argv[++i] || '').toLowerCase().trim() } else if (arg === '--locality-max-precision') { @@ -95,6 +107,9 @@ function parseArgs(argv) { } else if (arg === '--localadmin-max-precision') { var localadminMax = Number(argv[++i]) opts.localadminMaxPrecision = Number.isFinite(localadminMax) ? Math.trunc(localadminMax) : null + } else if (arg === '--county-max-precision') { + var countyMax = Number(argv[++i]) + opts.countyMaxPrecision = Number.isFinite(countyMax) ? Math.trunc(countyMax) : null } else if (arg === '--region-max-precision') { var regionMax = Number(argv[++i]) opts.regionMaxPrecision = Number.isFinite(regionMax) ? 
Math.trunc(regionMax) : null @@ -140,15 +155,19 @@ function usage() { ' --base-precision Geohash base precision (default: 4)', ' --max-precision Geohash max precision for partial subdivision (default: 7)', ' --include-localadmin Include localadmin placetypes (default: false)', + ' --include-county Include county placetypes (default: false)', ' --include-region Include region placetypes (default: false)', ' --include-alt Include WOF alt geometries (default: false)', ' --drop-contained-localities Drop locality polygons fully contained by larger localities (default: true)', ' --max-places Stop after this many normalized places (useful for experiments)', ' --geometry-decimals Round geometry coordinates to N decimals before indexing/storage', ' --min-population Drop localities below this threshold (default: 0, country capitals kept)', + ' --isolation-min-population Lower population floor for localities in otherwise-empty geohash cells (default: off)', + ' --ensure-country-locality Guarantee at least one locality per country (default: true)', ' --index-mode compact|full (default: compact)', ' --locality-max-precision Max precision override for locality placetype', ' --localadmin-max-precision Max precision override for localadmin placetype', + ' --county-max-precision Max precision override for county placetype', ' --region-max-precision Max precision override for region placetype', ' --region-sparse-max-precision Optional precision for very large region polygons (for example 3)', ' --region-sparse-min-area-km2 Area threshold to apply sparse region precision', @@ -551,13 +570,14 @@ function normalizeFeature(feature, opts) { } var properties = feature.properties || {} + var placetype = (extractPlacetype(properties) || '').toLowerCase() if (!isCurrentRecord(properties)) { return null } - var placetype = (extractPlacetype(properties) || '').toLowerCase() var include = placetype === 'locality' || (opts.includeLocaladmin && placetype === 'localadmin') || + (opts.includeCounty 
&& placetype === 'county') || (opts.includeRegion && placetype === 'region') if (!include) { return null @@ -565,8 +585,15 @@ function normalizeFeature(feature, opts) { var population = extractPopulation(properties) var isCapital = placetype === 'locality' && isCapitalLocality(properties) - if (placetype === 'locality' && population < opts.minPopulation && !isCapital) { - return null + var isCityLikePlacetype = placetype === 'locality' || placetype === 'county' + var isolationCandidate = false + if (isCityLikePlacetype && population < opts.minPopulation && !isCapital) { + var isolationFloor = opts.isolationMinPopulation + if (Number.isFinite(isolationFloor) && isolationFloor > 0 && population >= isolationFloor) { + isolationCandidate = true + } else { + return null + } } var rawId = feature.id @@ -622,6 +649,12 @@ function normalizeFeature(feature, opts) { maxPrecision: maxPrecisionForPlace }) + var area = geometry.geometryArea(normalizedGeometry) + + // In compact mode, geometry is not written to the DB. Retain it only + // for full index mode which stores polygons. + var retainGeometry = opts.indexMode !== 'compact' + return { id: id, name: name, @@ -637,11 +670,12 @@ function normalizeFeature(feature, opts) { bboxMaxLat: bbox.maxLat, bboxMaxLon: bbox.maxLon, priorityRank: priorityRank, - area: geometry.geometryArea(normalizedGeometry), + area: area, countryName: pickFirstString(properties.country_name) || countryId || null, admin1Name: pickFirstString(properties.admin1_name) || null, - geometry: normalizedGeometry, - cover: cover + geometry: retainGeometry ? 
normalizedGeometry : null, + cover: cover, + isolationCandidate: isolationCandidate } } @@ -660,9 +694,11 @@ function pruneContainedLocalities(places, enabled) { var localitiesByGroup = Object.create(null) for (var i = 0; i < places.length; i++) { var place = places[i] - if (place.placetype !== 'locality') continue + if (!isCityPlacetypeCode(place.placetypeCode)) continue - var key = localityGroupKey(place) + // Group by placetype + country/admin1 so localities are only pruned by + // other localities, not by counties that happen to contain them. + var key = place.placetypeCode + '|' + localityGroupKey(place) if (!localitiesByGroup[key]) { localitiesByGroup[key] = [] } @@ -707,7 +743,11 @@ function pruneContainedLocalities(places, enabled) { continue } - if (geometry.geometryContainsGeometry(container.geometry, candidate.geometry)) { + // Use full geometry containment when available, otherwise bbox is sufficient + var geometryContains = container.geometry && candidate.geometry + ? geometry.geometryContainsGeometry(container.geometry, candidate.geometry) + : true + if (geometryContains) { dropById[candidate.id] = { placeId: candidate.id, containedBy: container.id, @@ -733,6 +773,7 @@ function pruneContainedLocalities(places, enabled) { function placetypeRank(placetype) { if (placetype === 'locality') return 0 if (placetype === 'localadmin') return 1 + if (placetype === 'county') return 1 if (placetype === 'region') return 2 return 3 } @@ -745,6 +786,7 @@ function placetypeCode(placetype) { function resolveMaxPrecisionForPlacetype(opts, placetype, bbox) { if (placetype === 'locality') return opts.localityMaxPrecision if (placetype === 'localadmin') return opts.localadminMaxPrecision + if (placetype === 'county') return opts.countyMaxPrecision if (placetype === 'region') { var regionPrecision = opts.regionMaxPrecision if (Number.isFinite(opts.regionSparseMaxPrecision) && Number.isFinite(opts.regionSparseMinAreaKm2)) { @@ -804,7 +846,7 @@ function 
comparePlacesForHash(a, b, hash, hashCenterCache) { } function isCityPlacetypeCode(code) { - return code === PLACETYPE_CODES.locality || code === PLACETYPE_CODES.localadmin + return code === PLACETYPE_CODES.locality || code === PLACETYPE_CODES.localadmin || code === PLACETYPE_CODES.county } function placePopulation(place) { @@ -1266,6 +1308,7 @@ async function ensureBoundarySchema(db, opts) { function normalizePlaces(files, opts) { var byId = Object.create(null) + var candidateById = Object.create(null) var normalizedCount = 0 for (var i = 0; i < files.length; i++) { @@ -1275,9 +1318,14 @@ function normalizePlaces(files, opts) { var place = normalizeFeature(features[j], opts) if (!place) continue - byId[String(place.id)] = place normalizedCount += 1 + if (place.isolationCandidate) { + candidateById[String(place.id)] = place + } else { + byId[String(place.id)] = place + } + if (opts.maxPlaces && Object.keys(byId).length >= opts.maxPlaces) { break } @@ -1292,12 +1340,114 @@ function normalizePlaces(files, opts) { .map(function(id) { return byId[id] }) .sort(function(a, b) { return a.id - b.id }) + var candidates = Object.keys(candidateById) + .map(function(id) { return candidateById[id] }) + .sort(function(a, b) { return a.id - b.id }) + return { places: places, + candidates: candidates, normalizedCount: normalizedCount } } +function promoteIsolatedLocalities(places, candidates, opts) { + if (!candidates.length) { + return { places: places, promoted: 0, countryFills: 0 } + } + + // Build set of geohash cells already claimed by primary places at base precision + var claimedCells = Object.create(null) + for (var i = 0; i < places.length; i++) { + var place = places[i] + if (place.placetype !== 'locality' && place.placetype !== 'localadmin') continue + for (var j = 0; j < place.cover.length; j++) { + var hash = place.cover[j].geohash + // Claim at base precision: truncate to basePrecision length + var baseHash = hash.length > opts.basePrecision ? 
hash.slice(0, opts.basePrecision) : hash + claimedCells[baseHash] = true + } + } + + // Track which countries already have at least one city-like place + var countriesWithLocality = Object.create(null) + for (var i = 0; i < places.length; i++) { + if (isCityPlacetypeCode(places[i].placetypeCode)) { + countriesWithLocality[places[i].countryId] = true + } + } + + // Sort candidates by population descending so higher-pop isolated places win first + var sortedCandidates = candidates.slice().sort(function(a, b) { + return b.population - a.population + }) + + var promoted = 0 + var countryFills = 0 + var result = places.slice() + + for (var c = 0; c < sortedCandidates.length; c++) { + var candidate = sortedCandidates[c] + var isIsolated = false + + for (var k = 0; k < candidate.cover.length; k++) { + var hash = candidate.cover[k].geohash + var baseHash = hash.length > opts.basePrecision ? hash.slice(0, opts.basePrecision) : hash + if (!claimedCells[baseHash]) { + isIsolated = true + break + } + } + + if (!isIsolated) continue + + candidate.isolationCandidate = false + result.push(candidate) + promoted += 1 + + // Mark its cells as claimed + for (var k = 0; k < candidate.cover.length; k++) { + var hash = candidate.cover[k].geohash + var baseHash = hash.length > opts.basePrecision ? 
hash.slice(0, opts.basePrecision) : hash + claimedCells[baseHash] = true + } + + if (!countriesWithLocality[candidate.countryId]) { + countriesWithLocality[candidate.countryId] = true + countryFills += 1 + } + } + + // Ensure every country has at least one locality + if (opts.ensureCountryLocality) { + var candidatesByCountry = Object.create(null) + for (var c = 0; c < sortedCandidates.length; c++) { + var candidate = sortedCandidates[c] + if (candidate.isolationCandidate === false) continue // already promoted + if (!isCityPlacetypeCode(candidate.placetypeCode)) continue + var cc = candidate.countryId + if (!candidatesByCountry[cc]) { + candidatesByCountry[cc] = candidate // first = highest pop (already sorted) + } + } + + var countryKeys = Object.keys(candidatesByCountry) + for (var i = 0; i < countryKeys.length; i++) { + var cc = countryKeys[i] + if (countriesWithLocality[cc]) continue + + var best = candidatesByCountry[cc] + best.isolationCandidate = false + result.push(best) + countryFills += 1 + promoted += 1 + countriesWithLocality[cc] = true + } + } + + return { places: result, promoted: promoted, countryFills: countryFills } +} + async function writePlaces(db, places, opts, compactLookupRows) { await dbExec(db, 'BEGIN') @@ -1462,6 +1612,7 @@ async function main() { options.localityMaxPrecision = clampPrecision(options.localityMaxPrecision, options.basePrecision, options.maxPrecision) options.localadminMaxPrecision = clampPrecision(options.localadminMaxPrecision, options.basePrecision, options.maxPrecision) + options.countyMaxPrecision = clampPrecision(options.countyMaxPrecision, options.basePrecision, options.maxPrecision) options.regionMaxPrecision = clampPrecision(options.regionMaxPrecision, options.basePrecision, options.maxPrecision) if (!Number.isFinite(options.dominantLocalityPopulation) || options.dominantLocalityPopulation <= 0) { options.dominantLocalityPopulation = 0 @@ -1496,12 +1647,15 @@ async function main() { } var normalized = 
normalizePlaces(files, options) - var dedupedPlaces = normalized.places + var primaryPlaces = normalized.places - if (!dedupedPlaces.length) { + if (!primaryPlaces.length && !normalized.candidates.length) { throw new Error('No valid locality/localadmin/region records were found in the provided input files') } + var isolation = promoteIsolatedLocalities(primaryPlaces, normalized.candidates, options) + var dedupedPlaces = isolation.places + var pruned = pruneContainedLocalities(dedupedPlaces, options.dropContainedLocalities) var finalPlaces = pruned.places var compactLookupRows = options.indexMode === 'compact' ? buildCompactLookupRows(finalPlaces, options) : [] @@ -1521,7 +1675,12 @@ async function main() { console.log('Database: ' + databasePath) console.log('Input files scanned: ' + files.length) console.log('Features normalized: ' + normalized.normalizedCount) - console.log('Places (deduped by id): ' + dedupedPlaces.length) + console.log('Primary places (>= min-population): ' + primaryPlaces.length) + if (normalized.candidates.length) { + console.log('Isolation candidates evaluated: ' + normalized.candidates.length) + console.log('Isolated localities promoted: ' + isolation.promoted + ' (country fills: ' + isolation.countryFills + ')') + } + console.log('Places after isolation pass: ' + dedupedPlaces.length) console.log('Places dropped (contained locality prune): ' + pruned.dropped.length) console.log('Places written: ' + finalPlaces.length) if (options.indexMode === 'compact') { @@ -1531,10 +1690,11 @@ async function main() { } var modeLabel = 'locality' if (options.includeLocaladmin) modeLabel += ' + localadmin' + if (options.includeCounty) modeLabel += ' + county' if (options.includeRegion) modeLabel += ' + region' console.log('Mode: ' + modeLabel) console.log('Precision: ' + options.basePrecision + ' -> ' + options.maxPrecision) - console.log('Placetype precision caps: locality=' + options.localityMaxPrecision + ', localadmin=' + 
options.localadminMaxPrecision + ', region=' + options.regionMaxPrecision) + console.log('Placetype precision caps: locality=' + options.localityMaxPrecision + ', localadmin=' + options.localadminMaxPrecision + ', county=' + options.countyMaxPrecision + ', region=' + options.regionMaxPrecision) if (Number.isFinite(options.regionSparseMaxPrecision) && Number.isFinite(options.regionSparseMinAreaKm2)) { console.log('Sparse region rule: area_km2>=' + options.regionSparseMinAreaKm2 + ' => max_precision=' + options.regionSparseMaxPrecision) } @@ -1547,6 +1707,12 @@ async function main() { console.log('Index mode: ' + options.indexMode) console.log('Promote locality over region: ' + (options.promoteLocalityOverRegion ? 'true' : 'false')) console.log('Min population: ' + options.minPopulation) + if (Number.isFinite(options.isolationMinPopulation) && options.isolationMinPopulation > 0) { + console.log('Isolation min population: ' + options.isolationMinPopulation) + } else { + console.log('Isolation pass: disabled') + } + console.log('Ensure country locality: ' + (options.ensureCountryLocality ? 
'true' : 'false')) } finally { await dbClose(db) } diff --git a/scripts/generate_wof_boundary.sh b/scripts/generate_wof_boundary.sh index 067452f..8ef4a4f 100755 --- a/scripts/generate_wof_boundary.sh +++ b/scripts/generate_wof_boundary.sh @@ -16,6 +16,7 @@ set -euo pipefail # WOF_MAX_PRECISION Geohash max precision (default: 5) # WOF_LOCALITY_MAX_PRECISION Locality max precision override (default: WOF_MAX_PRECISION) # WOF_LOCALADMIN_MAX_PRECISION Localadmin max precision override (default: WOF_MAX_PRECISION) +# WOF_COUNTY_MAX_PRECISION County max precision override (default: WOF_MAX_PRECISION) # WOF_REGION_MAX_PRECISION Region max precision override (default: 4) # WOF_REGION_SPARSE_MAX_PRECISION Sparse large-region precision (default: 3) # WOF_REGION_SPARSE_MIN_AREA_KM2 Area threshold for sparse region precision (default: 80000) @@ -24,13 +25,17 @@ set -euo pipefail # WOF_DOMINANT_LOCALITY_RATIO Dominant-vs-next locality population ratio (default: 3) # WOF_PARENT_LOCALITY_MIN_SHARE Minimum child-cell share (0..1) required for locality parent takeover (default: 0.5) # WOF_INCLUDE_LOCALADMIN Include localadmin placetypes (default: 0) +# WOF_INCLUDE_COUNTY Include county placetypes (default: 1) # WOF_INCLUDE_REGION Include region placetypes (default: 1) # WOF_DROP_CONTAINED_LOCALITIES Drop localities contained in larger localities (default: 1) # WOF_INCLUDE_ALT Include -alt- geometries (default: 0) # WOF_GEOMETRY_DECIMALS Optional coordinate rounding precision (e.g. 
4) # WOF_MIN_POPULATION Optional minimum population filter (default: 0) +# WOF_ISOLATION_MIN_POPULATION Lower population floor for isolated localities (default: 500) +# WOF_ENSURE_COUNTRY_LOCALITY Guarantee at least one locality per country (default: 1) # WOF_MAX_PLACES Optional cap for experiment runs # WOF_SKIP_INVALID_REPOS Skip repos missing expected extracted data dir (default: 1) +# WOF_BATCH_SIZE Countries per node invocation to limit memory (default: 10) # WOF_APPEND Append to existing DB instead of replacing schema (default: 0) # # Notes: @@ -45,6 +50,7 @@ WOF_BASE_PRECISION="${WOF_BASE_PRECISION:-4}" WOF_MAX_PRECISION="${WOF_MAX_PRECISION:-5}" WOF_LOCALITY_MAX_PRECISION="${WOF_LOCALITY_MAX_PRECISION:-${WOF_MAX_PRECISION}}" WOF_LOCALADMIN_MAX_PRECISION="${WOF_LOCALADMIN_MAX_PRECISION:-${WOF_MAX_PRECISION}}" +WOF_COUNTY_MAX_PRECISION="${WOF_COUNTY_MAX_PRECISION:-${WOF_MAX_PRECISION}}" WOF_REGION_MAX_PRECISION="${WOF_REGION_MAX_PRECISION:-4}" WOF_REGION_SPARSE_MAX_PRECISION="${WOF_REGION_SPARSE_MAX_PRECISION:-3}" WOF_REGION_SPARSE_MIN_AREA_KM2="${WOF_REGION_SPARSE_MIN_AREA_KM2:-80000}" @@ -53,11 +59,14 @@ WOF_DOMINANT_LOCALITY_POPULATION="${WOF_DOMINANT_LOCALITY_POPULATION:-100000}" WOF_DOMINANT_LOCALITY_RATIO="${WOF_DOMINANT_LOCALITY_RATIO:-3}" WOF_PARENT_LOCALITY_MIN_SHARE="${WOF_PARENT_LOCALITY_MIN_SHARE:-0.5}" WOF_INCLUDE_LOCALADMIN="${WOF_INCLUDE_LOCALADMIN:-0}" +WOF_INCLUDE_COUNTY="${WOF_INCLUDE_COUNTY:-1}" WOF_INCLUDE_REGION="${WOF_INCLUDE_REGION:-1}" WOF_DROP_CONTAINED_LOCALITIES="${WOF_DROP_CONTAINED_LOCALITIES:-1}" WOF_INCLUDE_ALT="${WOF_INCLUDE_ALT:-0}" WOF_GEOMETRY_DECIMALS="${WOF_GEOMETRY_DECIMALS:-}" WOF_MIN_POPULATION="${WOF_MIN_POPULATION:-0}" +WOF_ISOLATION_MIN_POPULATION="${WOF_ISOLATION_MIN_POPULATION:-500}" +WOF_ENSURE_COUNTRY_LOCALITY="${WOF_ENSURE_COUNTRY_LOCALITY:-1}" WOF_MAX_PLACES="${WOF_MAX_PLACES:-}" WOF_SKIP_INVALID_REPOS="${WOF_SKIP_INVALID_REPOS:-1}" WOF_APPEND="${WOF_APPEND:-0}" @@ -114,7 +123,45 @@ 
ARCHIVE_DIR="${WOF_WORKDIR}/archives" EXTRACT_DIR="${WOF_WORKDIR}/extracted" mkdir -p "${ARCHIVE_DIR}" "${EXTRACT_DIR}" -INPUT_ARGS=() +# Build the common flags array shared by every invocation. +COMMON_FLAGS=( + --index-mode "compact" + --base-precision "${WOF_BASE_PRECISION}" + --max-precision "${WOF_MAX_PRECISION}" + --locality-max-precision "${WOF_LOCALITY_MAX_PRECISION}" + --localadmin-max-precision "${WOF_LOCALADMIN_MAX_PRECISION}" + --county-max-precision "${WOF_COUNTY_MAX_PRECISION}" + --region-max-precision "${WOF_REGION_MAX_PRECISION}" + --region-sparse-max-precision "${WOF_REGION_SPARSE_MAX_PRECISION}" + --region-sparse-min-area-km2 "${WOF_REGION_SPARSE_MIN_AREA_KM2}" + --promote-locality-over-region "${WOF_PROMOTE_LOCALITY_OVER_REGION}" + --dominant-locality-population "${WOF_DOMINANT_LOCALITY_POPULATION}" + --dominant-locality-ratio "${WOF_DOMINANT_LOCALITY_RATIO}" + --parent-locality-min-share "${WOF_PARENT_LOCALITY_MIN_SHARE}" + --include-localadmin "${WOF_INCLUDE_LOCALADMIN}" + --include-county "${WOF_INCLUDE_COUNTY}" + --include-region "${WOF_INCLUDE_REGION}" + --drop-contained-localities "${WOF_DROP_CONTAINED_LOCALITIES}" + --include-alt "${WOF_INCLUDE_ALT}" + --min-population "${WOF_MIN_POPULATION}" + --isolation-min-population "${WOF_ISOLATION_MIN_POPULATION}" + --ensure-country-locality "${WOF_ENSURE_COUNTRY_LOCALITY}" +) + +if [[ -n "${WOF_MAX_PLACES}" ]]; then + COMMON_FLAGS+=(--max-places "${WOF_MAX_PLACES}") +fi + +if [[ -n "${WOF_GEOMETRY_DECIMALS}" ]]; then + COMMON_FLAGS+=(--geometry-decimals "${WOF_GEOMETRY_DECIMALS}") +fi + +WOF_BATCH_SIZE="${WOF_BATCH_SIZE:-10}" + +# Phase 1: Download all archives (small on disk, skip extraction). +# Collect country codes and their archive paths for batched processing. 
+COUNTRY_CODES=() +COUNTRY_ARCHIVES=() IFS=',' read -r -a COUNTRY_ITEMS <<< "${WOF_COUNTRIES}" for item in "${COUNTRY_ITEMS[@]}"; do @@ -143,80 +190,104 @@ for item in "${COUNTRY_ITEMS[@]}"; do echo "Using existing archive ${archive}" fi - country_extract="${EXTRACT_DIR}/${country}" + COUNTRY_CODES+=("${country}") + COUNTRY_ARCHIVES+=("${archive}") +done + +if [[ ${#COUNTRY_CODES[@]} -eq 0 ]]; then + echo "No countries resolved from WOF_COUNTRIES=${WOF_COUNTRIES}" >&2 + exit 1 +fi + +# Phase 2: Extract, process, and clean up in batches to limit disk usage. +# Each batch extracts its countries, runs the node script, then removes +# the extracted data before the next batch starts. + +# Helper: extract a single country archive, print its data dir path. +# Returns 1 if the country should be skipped. +extract_country() { + local country="$1" + local archive="$2" + + local country_extract="${EXTRACT_DIR}/${country}" rm -rf "${country_extract}" mkdir -p "${country_extract}" if ! tar -xzf "${archive}" -C "${country_extract}"; then if [[ "${WOF_SKIP_INVALID_REPOS}" == "1" ]]; then - echo "Warning: failed to extract ${archive}; skipping ${repo}" >&2 - continue + echo "Warning: failed to extract ${archive}; skipping" >&2 + return 1 fi echo "Failed to extract ${archive}" >&2 exit 1 fi + local root_dir root_dir="$(find "${country_extract}" -mindepth 1 -maxdepth 1 -type d | head -n 1)" if [[ -z "${root_dir}" ]]; then if [[ "${WOF_SKIP_INVALID_REPOS}" == "1" ]]; then - echo "Warning: failed to find extracted root directory for ${repo}; skipping" >&2 - continue + echo "Warning: no extracted root directory for ${country}; skipping" >&2 + return 1 fi - echo "Failed to find extracted root directory for ${repo}" >&2 + echo "Failed to find extracted root directory for ${country}" >&2 exit 1 fi - data_dir="${root_dir}/data" + local data_dir="${root_dir}/data" if [[ ! 
-d "${data_dir}" ]]; then if [[ "${WOF_SKIP_INVALID_REPOS}" == "1" ]]; then - echo "Warning: expected data directory not found for ${repo}; skipping (${data_dir})" >&2 - continue + echo "Warning: expected data directory not found for ${country}; skipping (${data_dir})" >&2 + return 1 fi echo "Expected data directory not found: ${data_dir}" >&2 exit 1 fi - INPUT_ARGS+=(--input-dir "${data_dir}") -done + echo "${data_dir}" +} -if [[ ${#INPUT_ARGS[@]} -eq 0 ]]; then - echo "No input directories resolved from WOF_COUNTRIES=${WOF_COUNTRIES}" >&2 - exit 1 -fi +TOTAL="${#COUNTRY_CODES[@]}" +IS_FIRST=1 +BATCH_IDX=0 -CMD=( - node "$(pwd)/scripts/generate_boundary_index.js" - --database "${OUTPUT}" - --index-mode "compact" - --base-precision "${WOF_BASE_PRECISION}" - --max-precision "${WOF_MAX_PRECISION}" - --locality-max-precision "${WOF_LOCALITY_MAX_PRECISION}" - --localadmin-max-precision "${WOF_LOCALADMIN_MAX_PRECISION}" - --region-max-precision "${WOF_REGION_MAX_PRECISION}" - --region-sparse-max-precision "${WOF_REGION_SPARSE_MAX_PRECISION}" - --region-sparse-min-area-km2 "${WOF_REGION_SPARSE_MIN_AREA_KM2}" - --promote-locality-over-region "${WOF_PROMOTE_LOCALITY_OVER_REGION}" - --dominant-locality-population "${WOF_DOMINANT_LOCALITY_POPULATION}" - --dominant-locality-ratio "${WOF_DOMINANT_LOCALITY_RATIO}" - --parent-locality-min-share "${WOF_PARENT_LOCALITY_MIN_SHARE}" - --include-localadmin "${WOF_INCLUDE_LOCALADMIN}" - --include-region "${WOF_INCLUDE_REGION}" - --drop-contained-localities "${WOF_DROP_CONTAINED_LOCALITIES}" - --include-alt "${WOF_INCLUDE_ALT}" - --min-population "${WOF_MIN_POPULATION}" -) +while [[ "${BATCH_IDX}" -lt "${TOTAL}" ]]; do + BATCH_END=$(( BATCH_IDX + WOF_BATCH_SIZE )) + if [[ "${BATCH_END}" -gt "${TOTAL}" ]]; then + BATCH_END="${TOTAL}" + fi -if [[ -n "${WOF_MAX_PLACES}" ]]; then - CMD+=(--max-places "${WOF_MAX_PLACES}") -fi + BATCH_COUNTRIES=("${COUNTRY_CODES[@]:${BATCH_IDX}:${WOF_BATCH_SIZE}}") + 
BATCH_ARCHIVES=("${COUNTRY_ARCHIVES[@]:${BATCH_IDX}:${WOF_BATCH_SIZE}}") -if [[ -n "${WOF_GEOMETRY_DECIMALS}" ]]; then - CMD+=(--geometry-decimals "${WOF_GEOMETRY_DECIMALS}") -fi + echo "--- Batch $(( BATCH_IDX / WOF_BATCH_SIZE + 1 )): ${#BATCH_COUNTRIES[@]} countries (${BATCH_COUNTRIES[*]}) ---" -if [[ "${WOF_APPEND}" == "1" ]]; then - CMD+=(--append) -fi + # Extract this batch's countries. + INPUT_ARGS=() + EXTRACTED_DIRS=() + for (( i=0; i < ${#BATCH_COUNTRIES[@]}; i++ )); do + data_dir="$(extract_country "${BATCH_COUNTRIES[$i]}" "${BATCH_ARCHIVES[$i]}")" || continue + INPUT_ARGS+=(--input-dir "${data_dir}") + EXTRACTED_DIRS+=("${EXTRACT_DIR}/${BATCH_COUNTRIES[$i]}") + done -CMD+=("${INPUT_ARGS[@]}") + if [[ ${#INPUT_ARGS[@]} -gt 0 ]]; then + CMD=( + node "$(pwd)/scripts/generate_boundary_index.js" + --database "${OUTPUT}" + "${COMMON_FLAGS[@]}" + ) + if [[ "${IS_FIRST}" == "1" ]] && [[ "${WOF_APPEND}" != "1" ]]; then + IS_FIRST=0 + else + CMD+=(--append) + fi + CMD+=("${INPUT_ARGS[@]}") + "${CMD[@]}" + fi -"${CMD[@]}" + # Clean up extracted data for this batch to free disk space. 
+ for dir in "${EXTRACTED_DIRS[@]}"; do + rm -rf "${dir}" + done + + BATCH_IDX="${BATCH_END}" +done diff --git a/src/reverse.js b/src/reverse.js index 68bc17e..941e32d 100644 --- a/src/reverse.js +++ b/src/reverse.js @@ -4,8 +4,8 @@ const formatLocation = require('./location').format const geohash = require('./geohash') const geometry = require('./geometry') -const SUPPORTED_PLACETYPES = ['locality', 'localadmin', 'region'] -const SUPPORTED_PLACETYPE_CODES = [0, 1, 2] +const SUPPORTED_PLACETYPES = ['locality', 'localadmin', 'region', 'county'] +const SUPPORTED_PLACETYPE_CODES = [0, 1, 2, 3] const PLACEHOLDER_EMPTY = {} function dbAll(geocoder, query, params) { @@ -149,6 +149,7 @@ function fetchCompactBoundaryMatchV2(geocoder, hashes) { WHEN 0 THEN 'locality' WHEN 1 THEN 'localadmin' WHEN 2 THEN 'region' + WHEN 3 THEN 'county' ELSE 'region' END AS placetype, 0 AS priority_rank, @@ -467,6 +468,7 @@ function fetchNearestCompactByRegionV2(geocoder, latitude, longitude, region) { WHEN 0 THEN 'locality' WHEN 1 THEN 'localadmin' WHEN 2 THEN 'region' + WHEN 3 THEN 'county' ELSE 'region' END AS placetype, 0 AS priority_rank, From fed51a04c1c5e964aa53ed8f9b43211756ee0cbd Mon Sep 17 00:00:00 2001 From: Sebastian Schloesser Date: Wed, 15 Apr 2026 11:20:17 +0700 Subject: [PATCH 09/10] Fix duplicate places caused by regions sharing names with localities Add pruneRedundantRegions() to drop region records when a locality with the same name and country already exists and is contained within the region bbox. Patch the live global DB to remove 590 redundant regions and reassign 3,679 geohash lookup rows to their matching localities. 
--- scripts/generate_boundary_index.js | 62 +++++++++++++++++++++++++++++- 1 file changed, 61 insertions(+), 1 deletion(-) diff --git a/scripts/generate_boundary_index.js b/scripts/generate_boundary_index.js index 59d092f..4374a3e 100755 --- a/scripts/generate_boundary_index.js +++ b/scripts/generate_boundary_index.js @@ -770,6 +770,64 @@ function pruneContainedLocalities(places, enabled) { } } +function pruneRedundantRegions(places) { + var localitiesByKey = Object.create(null) + for (var i = 0; i < places.length; i++) { + var place = places[i] + if (place.placetypeCode !== PLACETYPE_CODES.locality) continue + var key = place.name + '|' + place.countryId + if (!localitiesByKey[key]) { + localitiesByKey[key] = [] + } + localitiesByKey[key].push(place) + } + + var dropById = Object.create(null) + for (var i = 0; i < places.length; i++) { + var region = places[i] + if (region.placetypeCode !== PLACETYPE_CODES.region) continue + + var key = region.name + '|' + region.countryId + var matchingLocalities = localitiesByKey[key] + if (!matchingLocalities) continue + + var regionBbox = { + minLat: region.bboxMinLat, + minLon: region.bboxMinLon, + maxLat: region.bboxMaxLat, + maxLon: region.bboxMaxLon + } + + for (var j = 0; j < matchingLocalities.length; j++) { + var locality = matchingLocalities[j] + var localityBbox = { + minLat: locality.bboxMinLat, + minLon: locality.bboxMinLon, + maxLat: locality.bboxMaxLat, + maxLon: locality.bboxMaxLon + } + + if (geometry.bboxContainsBbox(regionBbox, localityBbox)) { + dropById[region.id] = { + placeId: region.id, + replacedBy: locality.id + } + break + } + } + } + + var dropped = Object.keys(dropById).map(function(id) { return dropById[id] }) + var filtered = places.filter(function(place) { + return !dropById[place.id] + }) + + return { + places: filtered, + dropped: dropped + } +} + function placetypeRank(placetype) { if (placetype === 'locality') return 0 if (placetype === 'localadmin') return 1 @@ -1657,7 +1715,8 @@ async 
function main() { var dedupedPlaces = isolation.places var pruned = pruneContainedLocalities(dedupedPlaces, options.dropContainedLocalities) - var finalPlaces = pruned.places + var regionPrune = pruneRedundantRegions(pruned.places) + var finalPlaces = regionPrune.places var compactLookupRows = options.indexMode === 'compact' ? buildCompactLookupRows(finalPlaces, options) : [] var databasePath = path.resolve(options.database) @@ -1682,6 +1741,7 @@ async function main() { } console.log('Places after isolation pass: ' + dedupedPlaces.length) console.log('Places dropped (contained locality prune): ' + pruned.dropped.length) + console.log('Regions dropped (redundant with same-name locality): ' + regionPrune.dropped.length) console.log('Places written: ' + finalPlaces.length) if (options.indexMode === 'compact') { console.log('Geohash lookup rows: ' + compactLookupRows.length) From f07830a8a35bff3341c17a4d4192c0e02a72abc2 Mon Sep 17 00:00:00 2001 From: Sebastian Schloesser Date: Fri, 24 Apr 2026 19:38:20 -0400 Subject: [PATCH 10/10] Preserve regions that contain a same-named locality Tighten pruneRedundantRegions() to require the locality bbox area to be at least 50% of the region bbox area before treating the region as redundant. The bare bbox-contains check dropped legitimate region records like New York, California, Washington, and most UK counties just because they happen to enclose a same-named locality. Also extend the compact-v2 reverse queries with a self-join on compact_places so admin1_name is returned alongside admin1_id, removing the empty-string placeholder and saving callers a second lookup. Rebuilt global DB: 4,132 regions (up from 3,621), all 50 US states + DC present, 79 region/locality duplicates still pruned (capitals like Luanda/Beirut and UK city authorities like Birmingham/Manchester). 
--- scripts/generate_boundary_index.js | 23 +++++++++++++++++------ src/reverse.js | 6 ++++-- 2 files changed, 21 insertions(+), 8 deletions(-) diff --git a/scripts/generate_boundary_index.js b/scripts/generate_boundary_index.js index 4374a3e..9942bcc 100755 --- a/scripts/generate_boundary_index.js +++ b/scripts/generate_boundary_index.js @@ -782,6 +782,12 @@ function pruneRedundantRegions(places) { localitiesByKey[key].push(place) } + // Only treat a region as redundant when a same-named locality covers most + // of the region's bbox area. A bare bbox-contains check would drop legit + // regions that simply happen to contain a same-named city (e.g. New York + // state contains New York city). + var REDUNDANT_AREA_SHARE = 0.5 + var dropById = Object.create(null) for (var i = 0; i < places.length; i++) { var region = places[i] @@ -797,6 +803,8 @@ function pruneRedundantRegions(places) { maxLat: region.bboxMaxLat, maxLon: region.bboxMaxLon } + var regionAreaKm2 = bboxAreaKm2(regionBbox) + if (!(regionAreaKm2 > 0)) continue for (var j = 0; j < matchingLocalities.length; j++) { var locality = matchingLocalities[j] @@ -807,13 +815,16 @@ function pruneRedundantRegions(places) { maxLon: locality.bboxMaxLon } - if (geometry.bboxContainsBbox(regionBbox, localityBbox)) { - dropById[region.id] = { - placeId: region.id, - replacedBy: locality.id - } - break + if (!geometry.bboxContainsBbox(regionBbox, localityBbox)) continue + + var localityAreaKm2 = bboxAreaKm2(localityBbox) + if (localityAreaKm2 / regionAreaKm2 < REDUNDANT_AREA_SHARE) continue + + dropById[region.id] = { + placeId: region.id, + replacedBy: locality.id } + break } } diff --git a/src/reverse.js b/src/reverse.js index 941e32d..6d89272 100644 --- a/src/reverse.js +++ b/src/reverse.js @@ -142,7 +142,7 @@ function fetchCompactBoundaryMatchV2(geocoder, hashes) { p.country_id AS country_id, p.country_id AS country_name, p.admin1_id AS admin1_id, - '' AS admin1_name, + COALESCE(a.name, '') AS admin1_name, p.latitude 
AS latitude, p.longitude AS longitude, CASE p.placetype_code @@ -156,6 +156,7 @@ function fetchCompactBoundaryMatchV2(geocoder, hashes) { 0 AS area FROM compact_geohash_lookup l JOIN compact_places p ON p.id = l.place_id + LEFT JOIN compact_places a ON a.id = p.admin1_id AND a.placetype_code = 2 WHERE l.geohash IN (${placeholders}) AND p.placetype_code IN (${placetypePlaceholders}) ORDER BY @@ -461,7 +462,7 @@ function fetchNearestCompactByRegionV2(geocoder, latitude, longitude, region) { p.country_id AS country_id, p.country_id AS country_name, p.admin1_id AS admin1_id, - '' AS admin1_name, + COALESCE(a.name, '') AS admin1_name, p.latitude AS latitude, p.longitude AS longitude, CASE p.placetype_code @@ -474,6 +475,7 @@ function fetchNearestCompactByRegionV2(geocoder, latitude, longitude, region) { 0 AS priority_rank, 0 AS area FROM compact_places p + LEFT JOIN compact_places a ON a.id = p.admin1_id AND a.placetype_code = 2 WHERE ${where.join(' AND ')} ORDER BY ((? - p.latitude) * (? - p.latitude) +