diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index e4d9a274..fe888472 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -79,6 +79,10 @@ jobs:
         git clone https://github.com/OpenGov-OpenData/ckanext-scheming
         pip install -e ckanext-scheming
         git clone https://github.com/ckan/ckanext-fluent
+        # Checkout a specific commit that works with ckan 2.9
+        cd ckanext-fluent
+        git checkout 4e9340a934050e937bed49e6009d0971d880410a
+        cd ..
         pip install -e ckanext-fluent
     - name: Setup extension
       run: |
diff --git a/ckanext/dcat/converters.py b/ckanext/dcat/converters.py
index dad13aed..b5b4b750 100644
--- a/ckanext/dcat/converters.py
+++ b/ckanext/dcat/converters.py
@@ -101,6 +101,21 @@ def dcat_to_ckan(dcat_dict):
             )
             continue
 
+        # Normalize distribution URL values
+        if 'downloadURL' in distribution:
+            normalized_downloadURL = _normalize_url_value(distribution['downloadURL'], 'downloadURL')
+            distribution['downloadURL'] = normalized_downloadURL
+
+        if 'accessURL' in distribution:
+            normalized_accessURL = _normalize_url_value(distribution['accessURL'], 'accessURL')
+            distribution['accessURL'] = normalized_accessURL
+
+        if not distribution.get('downloadURL') and not distribution.get('accessURL'):
+            log.debug('Skip resource %s, no valid URL in downloadURL or accessURL' % (
+                distribution.get('title', dcat_dict.get('title'))
+            ))
+            continue
+
         # skip data dictionaries
         if asbool(distribution.get('isDataDictionary', False)):
             log.debug('Skip data dictionary for %s: %s' % (
@@ -271,3 +286,43 @@ def get_bbox_geojson(spatial):
     coordinates = '[{},{},{},{},{}]'.format(point_a, point_b, point_c, point_d, point_a)
     bbox_str = '{\"type\": \"Polygon\", \"coordinates\": [' + coordinates + ']}'
     return bbox_str
+
+
+def _normalize_url_value(value, field_name='URL'):
+    """Return a single URL string taken from a DCAT URL field value.
+
+    ``downloadURL``/``accessURL`` may arrive as a plain string or as a list
+    that mixes real URLs with placeholder text, so the value is reduced to
+    one string before use.
+
+    :param value: raw field value; a string, a list/tuple, or anything else
+    :param field_name: name of the field, used only in log messages
+    :returns: ``value`` with surrounding whitespace removed when it is a
+        string (no scheme check is applied); the first item starting with
+        a supported scheme when it is a list or tuple; ``''`` for empty or
+        unsupported input
+    """
+    if not value:
+        return ''
+
+    if isinstance(value, str):
+        # Stripping makes whitespace-only strings count as "no URL"
+        return value.strip()
+
+    if isinstance(value, (list, tuple)):
+        # Lowercase so the tuple can be compared against lowercased input
+        supported_schemes = ('http://', 'https://', 'ftp://', 'ftps://', 's3://')
+        log.debug('%s provided as list with %d items', field_name, len(value))
+        for item in value:
+            if not isinstance(item, str):
+                continue
+            candidate = item.strip()
+            # URI schemes are case-insensitive (RFC 3986, section 3.1)
+            if candidate.lower().startswith(supported_schemes):
+                log.debug('%s: using first valid URL from list: %s', field_name, candidate)
+                return candidate
+        log.debug('%s provided as list but no valid URLs found', field_name)
+        return ''
+
+    log.warning('%s has unexpected type: %s', field_name, type(value).__name__)
+    return ''
diff --git a/ckanext/dcat/harvesters/_json.py b/ckanext/dcat/harvesters/_json.py
index 120f078f..e91e22a4 100644
--- a/ckanext/dcat/harvesters/_json.py
+++ b/ckanext/dcat/harvesters/_json.py
@@ -482,7 +482,10 @@ def push_data_dictionary(context, resource, distribution):
     # Check for resource's data dictionary in the distribution
     fields = []
     for dist in distribution:
-        if ((dist.get('downloadURL') == resource.get('url') or dist.get('accessURL') == resource.get('url'))
+        # Normalize URLs for comparison
+        download_url = converters._normalize_url_value(dist.get('downloadURL'), 'downloadURL')
+        access_url = converters._normalize_url_value(dist.get('accessURL'), 'accessURL')
+        if ((download_url == resource.get('url') or access_url == resource.get('url'))
                 and dist.get('title') == resource.get('name')
                 and 'action/datastore_search' in dist.get('describedBy', '')):
             try:
diff --git a/ckanext/dcat/tests/test_converters.py b/ckanext/dcat/tests/test_converters.py
index 368db7b5..65e48615 100644
--- a/ckanext/dcat/tests/test_converters.py
+++ b/ckanext/dcat/tests/test_converters.py
@@ -82,3 +82,31 @@ def test_get_bbox_geojson():
         '[-115.5028, 41.3149], [-115.5028, 32.5718], '
         '[-124.161, 32.5718]]]}'
     )
+
+
+def test_normalize_url_value_with_list():
+    value = [
+        "withheld",
+        "withheld",
+        "https://www.wildlife.ca.gov/Data/BIOS",
+        "https://services2.arcgis.com/Uq9r85Potqm3MfRV/arcgis/rest/services/biosds69_fmu/FeatureServer",
+        "http://dx.doi.org/doi:10.5066/F7X06527"
+    ]
+
+    result = converters._normalize_url_value(value, 'accessURL')
+    assert result == "https://www.wildlife.ca.gov/Data/BIOS"
+
+
+def test_normalize_url_value_with_string():
+    assert converters._normalize_url_value(' https://example.com/data.csv ') == 'https://example.com/data.csv'
+    assert converters._normalize_url_value('   ') == ''
+
+
+def test_normalize_url_value_scheme_is_case_insensitive():
+    assert converters._normalize_url_value(['withheld', 'HTTPS://example.com/a']) == 'HTTPS://example.com/a'
+
+
+def test_normalize_url_value_without_usable_url():
+    assert converters._normalize_url_value(None) == ''
+    assert converters._normalize_url_value(['withheld']) == ''
+    assert converters._normalize_url_value({'url': 'https://example.com'}) == ''