diff --git a/lib/solidpod.dart b/lib/solidpod.dart
index f9bfd4d5..ed084112 100644
--- a/lib/solidpod.dart
+++ b/lib/solidpod.dart
@@ -38,7 +38,15 @@ export 'src/solid/constants/solid_constants.dart';
 
 // Legacy exports for backward compatibility (deprecated, use SolidConstants instead)
 export 'src/solid/constants/common.dart'
-    show foaf, terms, ResourceStatus, permStr, agentStr, whatIsWebID, demoWebID;
+    show
+        foaf,
+        terms,
+        ResourceStatus,
+        WebIdStatus,
+        permStr,
+        agentStr,
+        whatIsWebID,
+        demoWebID;
 export 'src/solid/constants/schema.dart' show appsTerms;
 export 'src/solid/constants/path_type.dart' show PathType;
 
@@ -217,6 +225,7 @@ export 'src/solid/shared_resources.dart';
 export 'src/solid/api/rest_api.dart'
     show
         checkResourceStatus,
+        checkWebIdProfile,
         createResource,
         deleteResource,
         getResource,
@@ -224,6 +233,16 @@ export 'src/solid/api/rest_api.dart'
         initialStructureTest,
         updateFileByQuery;
 
+/// WebID validation pipeline.
+
+export 'src/solid/utils/webid_validator.dart'
+    show
+        WebIdCheckResult,
+        WebIdCheckStatus,
+        isValidIpv4,
+        looksLikeIpv4Attempt,
+        validateWebId;
+
 /// Function to get the latest log enties
 
 export 'src/solid/api/common_permission.dart'
diff --git a/lib/src/solid/api/rest_api.dart b/lib/src/solid/api/rest_api.dart
index 8fe7e1cf..11a2ae8b 100644
--- a/lib/src/solid/api/rest_api.dart
+++ b/lib/src/solid/api/rest_api.dart
@@ -374,6 +374,95 @@ Future checkWebIdExists(String webIdUrl) async {
   }
 }
 
+/// MIME types accepted as evidence that a 200 response is an RDF document and
+/// therefore plausibly a Solid WebID profile. Anything else (most notably
+/// `text/html`) is rejected so that arbitrary websites cannot masquerade as
+/// WebIDs simply by responding 200 for an unknown path.
+const Set<String> _rdfProfileMimeTypes = {
+  'text/turtle',
+  'application/ld+json',
+  'application/n-triples',
+  'application/n-quads',
+  'application/rdf+xml',
+  'application/trig',
+  'text/n3',
+};
+
+/// Whether [contentType] (a raw `Content-Type` header value) names one of the
+/// [_rdfProfileMimeTypes].
+///
+/// Only the media type is compared: parameters such as `charset=utf-8` are
+/// dropped, together with any optional whitespace around the `;` separator,
+/// and the comparison is case-insensitive.
+bool _isRdfProfileContentType(String contentType) {
+  final mediaType = contentType.split(';').first.trim().toLowerCase();
+  return _rdfProfileMimeTypes.contains(mediaType);
+}
+
+/// Validate that [webIdUrl] points to a real Solid WebID *profile document*,
+/// not just any HTTP resource that happens to return 200.
+///
+/// A plain existence check (status code only) is unsafe because many ordinary
+/// websites return a 200 HTML page for arbitrary paths — for example a
+/// WordPress site's catch-all "soft 404" — which would otherwise be
+/// indistinguishable from a real WebID. To guard against this we
+/// content-negotiate for the usual RDF serialisations and accept the URL only
+/// when the response carries an RDF `Content-Type`.
+///
+/// Returns [WebIdStatus.notExist] for a 404, [WebIdStatus.unknown] for any
+/// other status except 200/204, [WebIdStatus.valid] when a 200/204 response
+/// has an RDF `Content-Type`, and [WebIdStatus.notProfile] otherwise.
+///
+/// This function does not catch network exceptions; callers should wrap it in
+/// a try/catch when a host-resolution or connectivity error needs to be
+/// surfaced separately from a "not a profile" outcome.
+Future<WebIdStatus> checkWebIdProfile(String webIdUrl) async {
+  // HTTP requests never carry the URL fragment, but stripping it explicitly
+  // keeps the request URL and any debug logs honest.
+  final uri = Uri.parse(webIdUrl).removeFragment();
+
+  final response = await http.get(
+    uri,
+    headers: const {
+      // Solid servers content-negotiate on `Accept`. List the common RDF
+      // serialisations in preference order; the trailing `*/*;q=0.1` allows
+      // us to still inspect the response if the server ignores `Accept`.
+      'Accept': 'text/turtle, '
+          'application/ld+json;q=0.95, '
+          'application/n-triples;q=0.9, '
+          'application/rdf+xml;q=0.85, '
+          'application/n-quads;q=0.8, '
+          'application/trig;q=0.75, '
+          'text/n3;q=0.7, '
+          '*/*;q=0.1',
+    },
+  );
+
+  if (response.statusCode == 404) {
+    return WebIdStatus.notExist;
+  }
+
+  if (response.statusCode != 200 && response.statusCode != 204) {
+    debugPrint(
+      'checkWebIdProfile: unexpected status\n'
+      'URL: $uri\n'
+      'Status: ${response.statusCode}\n'
+      'Body: ${response.body}',
+    );
+    return WebIdStatus.unknown;
+  }
+
+  final contentType = response.headers['content-type'] ?? '';
+  if (_isRdfProfileContentType(contentType)) {
+    return WebIdStatus.valid;
+  }
+
+  debugPrint(
+    'checkWebIdProfile: 200 but non-RDF content type "$contentType" for $uri',
+  );
+  return WebIdStatus.notProfile;
+}
+
 /// Given a WebID check if their POD is initialised using the Solidpod
 /// directory structure
 Future checkPodInitialised(String webIdUrl) async {
diff --git a/lib/src/solid/constants/common.dart b/lib/src/solid/constants/common.dart
index b7a277ff..bdb21e51 100644
--- a/lib/src/solid/constants/common.dart
+++ b/lib/src/solid/constants/common.dart
@@ -192,6 +192,28 @@ enum ResourceStatus {
   forbidden,
 }
 
+/// Outcome of validating that a URL points to a real Solid WebID profile
+/// document, as opposed to merely returning a 200 response. Plain existence
+/// is insufficient because many ordinary websites happily return 200 HTML
+/// for any unmatched path (SPA catch-alls, soft 404s, etc.), which would
+/// otherwise be mistaken for a valid WebID.
+enum WebIdStatus {
+  /// The URL responds with 200/204 and an RDF content type, so it is very
+  /// likely a genuine WebID profile document.
+  valid,
+
+  /// The URL responds with 200/204 but the body is not RDF (typically a
+  /// `text/html` page from a regular website). The URL is reachable but is
+  /// *not* a WebID profile.
+  notProfile,
+
+  /// The URL returned 404.
+  notExist,
+
+  /// Some other status code (e.g. 403, 5xx) — could not determine.
+  unknown,
+}
+
 /// Types of the content of resources
 enum ResourceContentType {
   /// Detect the MIME type automatically at runtime
diff --git a/lib/src/solid/utils/webid_validator.dart b/lib/src/solid/utils/webid_validator.dart
new file mode 100644
index 00000000..595111cf
--- /dev/null
+++ b/lib/src/solid/utils/webid_validator.dart
@@ -0,0 +1,189 @@
+/// Pure-Dart validation pipeline for candidate Solid WebID URLs.
+///
+/// Copyright (C) 2026, Software Innovation Institute, ANU.
+///
+/// Licensed under the MIT License (the "License").
+///
+/// License: https://choosealicense.com/licenses/mit/.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+///
+/// Authors: Tony Chen
+
+library;
+
+import 'package:solidpod/src/solid/api/rest_api.dart' show checkWebIdProfile;
+import 'package:solidpod/src/solid/constants/common.dart' show WebIdStatus;
+
+/// Regular expression matching strings made up solely of decimal digits and
+/// dots. Such a host is almost certainly an attempt at an IPv4 literal rather
+/// than a domain name and should therefore be validated as IPv4 before any
+/// network call is dispatched.
+
+final RegExp _digitsAndDotsOnly = RegExp(r'^[0-9.]+$');
+
+/// Returns true when [host] looks like an attempt to type an IPv4 literal,
+/// i.e. it only contains decimal digits and dots. This is intentionally
+/// permissive: malformed inputs such as `192`, `192.168`, `192.168.1`,
+/// `1.2.3.4.5` and `256.0.0.1` all return true so callers can then reject
+/// them via [isValidIpv4].
+
+bool looksLikeIpv4Attempt(String host) {
+  if (host.isEmpty) return false;
+  return _digitsAndDotsOnly.hasMatch(host);
+}
+
+/// Regular expression matching a single IPv4 octet candidate: one to three
+/// ASCII decimal digits and nothing else.
+
+final RegExp _ipv4Octet = RegExp(r'^[0-9]{1,3}$');
+
+/// Returns true when [host] is a syntactically valid IPv4 address: four
+/// dot-separated octets, each a decimal number in the range 0..255 with no
+/// leading sign and at most three digits.
+///
+/// Each octet must consist of ASCII digits only. `int.tryParse` alone is not
+/// a sufficient check because it also accepts a leading `+`/`-` sign and a
+/// `0x` hexadecimal prefix, so inputs such as `+1.2.3.4` or `0x1.2.3.4`
+/// would otherwise be reported as valid.
+
+bool isValidIpv4(String host) {
+  final parts = host.split('.');
+  if (parts.length != 4) return false;
+  return parts.every(
+    (part) => _ipv4Octet.hasMatch(part) && int.parse(part) <= 255,
+  );
+}
+
+/// Outcome of validating a candidate WebID URL.
+
+enum WebIdCheckStatus {
+  /// The URL points to a valid Solid WebID profile document.
+
+  valid,
+
+  /// The URL is not a syntactically absolute URL.
+
+  notAbsoluteUrl,
+
+  /// The host looks like an IPv4 literal but is malformed (e.g. `192`,
+  /// `192.168.1`, `1.2.3.4.5`, `256.0.0.1`).
+
+  invalidIpv4,
+
+  /// The URL could not be reached (DNS lookup or network failure).
+
+  unreachable,
+
+  /// The URL is reachable and responded 200/204 but the body is not an RDF
+  /// profile document (typically a `text/html` page from a regular website).
+
+  notProfile,
+
+  /// The URL returned 404.
+
+  notExist,
+
+  /// Some other HTTP error (e.g. 403, 5xx) — could not determine.
+
+  unknown,
+}
+
+/// Structured outcome of [validateWebId].
+
+class WebIdCheckResult {
+  const WebIdCheckResult(
+    this.status, {
+    this.host = '',
+    this.error,
+  });
+
+  /// The categorised failure mode (or [WebIdCheckStatus.valid] for success).
+
+  final WebIdCheckStatus status;
+
+  /// The host part of the WebID URL (may be empty if the URL was unparseable).
+
+  final String host;
+
+  /// The exception that caused the network failure, if any. Only populated
+  /// for [WebIdCheckStatus.unreachable].
+
+  final Object? error;
+
+  /// Convenience: true when [status] is [WebIdCheckStatus.valid].
+
+  bool get isValid => status == WebIdCheckStatus.valid;
+}
+
+/// Validate a candidate [webId] URL.
+///
+/// The validation pipeline is:
+/// 1. Reject URLs that cannot be parsed or are not syntactically absolute.
+///    Any fragment (`#me`, `#i`, `#this`, ...) is ignored for this check.
+/// 2. Reject hosts that look like an IPv4 attempt but are malformed.
+/// 3. Query [checkWebIdProfile] to confirm the URL points to a real Solid
+///    WebID profile document, distinguishing "not a profile", "not found",
+///    and "unknown" responses from genuine success.
+/// 4. Map network exceptions to [WebIdCheckStatus.unreachable] so the
+///    caller can surface a clear, actionable message instead of letting
+///    the UI hang.
+
+Future<WebIdCheckResult> validateWebId(String webId) async {
+  // `Uri.tryParse` is used instead of `Uri.parse` so that malformed input is
+  // reported as a result rather than thrown as a `FormatException`.
+
+  final uri = Uri.tryParse(webId);
+
+  // `Uri.isAbsolute` is false for every URI that carries a fragment (such as
+  // `#me`), so the fragment is removed before the absoluteness check rather
+  // than special-casing one particular fragment string.
+
+  if (uri == null || !uri.removeFragment().isAbsolute) {
+    return const WebIdCheckResult(WebIdCheckStatus.notAbsoluteUrl);
+  }
+
+  final host = uri.host;
+
+  if (looksLikeIpv4Attempt(host) && !isValidIpv4(host)) {
+    return WebIdCheckResult(WebIdCheckStatus.invalidIpv4, host: host);
+  }
+
+  WebIdStatus status;
+  try {
+    status = await checkWebIdProfile(webId);
+  } on Exception catch (e) {
+    return WebIdCheckResult(
+      WebIdCheckStatus.unreachable,
+      host: host,
+      error: e,
+    );
+  }
+
+  switch (status) {
+    case WebIdStatus.valid:
+      return WebIdCheckResult(WebIdCheckStatus.valid, host: host);
+    case WebIdStatus.notProfile:
+      return WebIdCheckResult(WebIdCheckStatus.notProfile, host: host);
+    case WebIdStatus.notExist:
+      return WebIdCheckResult(WebIdCheckStatus.notExist, host: host);
+    case WebIdStatus.unknown:
+      return WebIdCheckResult(WebIdCheckStatus.unknown, host: host);
+  }
+}