diff --git a/ckanext/dcat/configuration_processors.py b/ckanext/dcat/configuration_processors.py index 10597031..f60b4926 100644 --- a/ckanext/dcat/configuration_processors.py +++ b/ckanext/dcat/configuration_processors.py @@ -488,11 +488,7 @@ class FormatFilter(BaseConfigProcessor): @staticmethod def check_config(config_obj): - if 'format_filter_include' in config_obj \ - and 'format_filter_exclude' in config_obj: - raise ValueError('Harvest configuration cannot contain both ' - 'format_filter_include and format_filter_exclude') - for key in ['format_filter_include', 'format_filter_exclude']: + for key in ['format_filter_exclude', 'format_filter_include']: if key in config_obj: formats_list = config_obj[key] if not isinstance(formats_list, list): @@ -510,11 +506,7 @@ class TagFilter(BaseConfigProcessor): @staticmethod def check_config(config_obj): - if 'tag_filter_include' in config_obj \ - and 'tag_filter_exclude' in config_obj: - raise ValueError('Harvest configuration cannot contain both ' - 'tag_filter_include and tag_filter_exclude') - for key in ['tag_filter_include', 'tag_filter_exclude']: + for key in ['tag_filter_exclude', 'tag_filter_include']: if key in config_obj: tags_list = config_obj[key] if not isinstance(tags_list, list): diff --git a/ckanext/dcat/harvesters/_json.py b/ckanext/dcat/harvesters/_json.py index 01151dd1..120f078f 100644 --- a/ckanext/dcat/harvesters/_json.py +++ b/ckanext/dcat/harvesters/_json.py @@ -48,17 +48,17 @@ def _get_guids_and_datasets(self, content): else: raise ValueError('Wrong JSON object') - # Filter in/out datasets from particular organizations + # Filter datasets from particular organizations org_filter_include = self.config.get('organizations_filter_include', []) org_filter_exclude = self.config.get('organizations_filter_exclude', []) - # Filter in/out datasets with particular formats - format_filter_include = self.config.get('format_filter_include', []) + # Filter datasets with particular formats format_filter_exclude = self.config.get('format_filter_exclude', []) + format_filter_include = self.config.get('format_filter_include', []) - # Filter in/out datasets with particular tags - tag_filter_include = self.config.get('tag_filter_include', []) + # Filter datasets with particular tags tag_filter_exclude = self.config.get('tag_filter_exclude', []) + tag_filter_include = self.config.get('tag_filter_include', []) for dataset in datasets: # Get the organization name for the dataset @@ -80,27 +80,27 @@ def _get_guids_and_datasets(self, content): if dcat_publisher_name in org_filter_exclude: continue - # Include/exclude dataset based on particular formats - if format_filter_include or format_filter_exclude: + # Exclude/include dataset based on particular formats + if format_filter_exclude or format_filter_include: resource_formats = [ dist.get('format', '').lower() for dist in dataset.get('distribution', []) if dist.get('format') ] + if format_filter_exclude: + if any(fmt in resource_formats for fmt in format_filter_exclude): + continue if format_filter_include: if not any(fmt in resource_formats for fmt in format_filter_include): continue - elif format_filter_exclude: - if any(fmt in resource_formats for fmt in format_filter_exclude): - continue - # Include/exclude dataset based on particular tags + # Exclude/include dataset based on particular tags + if tag_filter_exclude: + if any(tag in dataset.get('keyword', []) for tag in tag_filter_exclude): + continue if tag_filter_include: if not any(tag in dataset.get('keyword', []) for tag in tag_filter_include): continue - elif tag_filter_exclude: - if any(tag in dataset.get('keyword', []) for tag in tag_filter_exclude): - continue as_string = json.dumps(dataset) diff --git a/ckanext/dcat/tests/test_configuration_processors.py b/ckanext/dcat/tests/test_configuration_processors.py index 1c751621..c7e9fa16 100644 --- a/ckanext/dcat/tests/test_configuration_processors.py +++ b/ckanext/dcat/tests/test_configuration_processors.py @@ -935,31 +935,31 @@ class TestFormatFilter: def test_validation_correct_format(self): config = { - "format_filter_include": [ - "CSV", - "GeoJSON" + "format_filter_exclude": [ + "PDF" ] } try: self.processor.check_config(config) - assert config["format_filter_include"] == ["csv", "geojson"] + assert config["format_filter_exclude"] == ["pdf"] except ValueError: assert False config = { - "format_filter_exclude": [ - "PDF" + "format_filter_include": [ + "CSV", + "GeoJSON" ] } try: self.processor.check_config(config) - assert config["format_filter_exclude"] == ["pdf"] + assert config["format_filter_include"] == ["csv", "geojson"] except ValueError: assert False def test_validation_wrong_format(self): config = { - "format_filter_include": "CSV, GeoJSON" + "format_filter_exclude": "PDF" } try: self.processor.check_config(config) @@ -968,7 +968,7 @@ def test_validation_wrong_format(self): assert True config = { - "format_filter_exclude": "PDF" + "format_filter_include": "CSV, GeoJSON" } try: self.processor.check_config(config) @@ -983,9 +983,9 @@ class TestTagFilter: def test_validation_correct_format(self): config = { - "tag_filter_include": [ - "Climate", - "Water" + "tag_filter_exclude": [ + "Application", + "Software" ] } try: @@ -994,9 +994,9 @@ def test_validation_correct_format(self): assert False config = { - "tag_filter_exclude": [ - "Application", - "Software" + "tag_filter_include": [ + "Climate", + "Water" ] } try: @@ -1006,16 +1006,16 @@ def test_validation_correct_format(self): def test_validation_wrong_format(self): config = { - "tag_filter_include": "Climate, Water" + "tag_filter_exclude": "Application, Software" } try: self.processor.check_config(config) assert False except ValueError: assert True - + config = { - "tag_filter_exclude": "Application, Software" + "tag_filter_include": "Climate, Water" } try: self.processor.check_config(config)