From 9fe8e7bb3ceccd00743ad4b1f9530f26ee054abe Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 6 Feb 2026 14:31:01 +0000 Subject: [PATCH 1/4] Initial plan From fddb3ff743a0346397af2ce2ced5189c3521a9c0 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 6 Feb 2026 14:35:02 +0000 Subject: [PATCH 2/4] Add fallback domain support for loadlanguages static catalog Co-authored-by: robertatakenaka <505143+robertatakenaka@users.noreply.github.com> --- processing/load_languages.py | 35 ++++++++++++++++++++++++++++++----- tests/test_load_languages.py | 2 +- 2 files changed, 31 insertions(+), 6 deletions(-) diff --git a/processing/load_languages.py b/processing/load_languages.py index 5ee4308..8e3210a 100644 --- a/processing/load_languages.py +++ b/processing/load_languages.py @@ -26,6 +26,7 @@ logger = logging.getLogger(__name__) SENTRY_DSN = os.environ.get('SENTRY_DSN', None) LOGGING_LEVEL = os.environ.get('LOGGING_LEVEL', 'DEBUG') +STATIC_CATALOG_FALLBACK_DOMAIN = os.environ.get('STATIC_CATALOG_FALLBACK_DOMAIN', None) LOGGING = { 'version': 1, @@ -181,8 +182,9 @@ def load_documents(collection, articlemeta_db, all_records=False): class StaticCatalog(object): - def __init__(self, collection): + def __init__(self, collection, fallback_domain=None): self.catalog = {} + self.fallback_domain = fallback_domain self._load_static_catalog(collection, 'pdf') self._load_static_catalog(collection, 'html') self._load_static_catalog(collection, 'xml') @@ -225,7 +227,19 @@ def _load_static_catalog(self, source, tipe): url = '/'.join(['http:/', source, filename]) - content = do_request(url, json=False).iter_lines(decode_unicode='utf-8') + response = do_request(url, json=False) + + # If primary domain fails and fallback domain is configured, try fallback + if response is None and self.fallback_domain: + logger.warning(u'Failed to load from %s, trying fallback domain %s', source, self.fallback_domain) + fallback_url = '/'.join(['http:/', self.fallback_domain, filename]) + response = do_request(fallback_url, json=False) + + if response is None: + logger.error(u'Failed to load static catalog from %s (and fallback if configured)', source) + return + + content = response.iter_lines(decode_unicode='utf-8') for line in sorted([i for i in content]): splitedline = line.lower().split('/')[1:] @@ -382,7 +396,7 @@ def fulltexts(self, document): return ldata -def run(collections, articlemeta_db, all_records=False, forced_url=None): +def run(collections, articlemeta_db, all_records=False, forced_url=None, fallback_domain=None): if not isinstance(collections, list): logger.error('Collections must be a list o collection acronym') @@ -395,8 +409,13 @@ def run(collections, articlemeta_db, all_records=False, forced_url=None): collection_domain = forced_url if forced_url else coll_info['domain'] logger.info(u'Loading languages for %s', collection_domain) logger.info(u'Using mode all_records %s', str(all_records)) + + # Use environment variable fallback if not provided as parameter + effective_fallback = fallback_domain if fallback_domain else STATIC_CATALOG_FALLBACK_DOMAIN + if effective_fallback: + logger.info(u'Using fallback domain: %s', effective_fallback) - static_catalogs = StaticCatalog(collection_domain) + static_catalogs = StaticCatalog(collection_domain, fallback_domain=effective_fallback) for document in load_documents(collection, articlemeta_db, all_records=all_records): @@ -485,6 +504,12 @@ def main(): help='Collection domain to get Static catalog' ) + parser.add_argument( + '--fallback_domain', + '-f', + help='Fallback domain to try if primary domain fails (e.g., antigo.scielo.br)' + ) + args = parser.parse_args() LOGGING['handlers']['console']['level'] = args.logging_level for lg, content in LOGGING['loggers'].items(): @@ -494,7 +519,7 @@ def main(): collections = [args.collection] if args.collection else _collections_acronyms - run(collections, articlemeta_db, args.all_records, args.domain) + run(collections, articlemeta_db, args.all_records, args.domain, args.fallback_domain) if __name__ == '__main__': diff --git a/tests/test_load_languages.py b/tests/test_load_languages.py index c64a8de..274a8ae 100644 --- a/tests/test_load_languages.py +++ b/tests/test_load_languages.py @@ -10,7 +10,7 @@ from articlemeta import controller -def mock_static_catalog_init_method(self, collection): +def mock_static_catalog_init_method(self, collection, fallback_domain=None): self.catalog = { "rsp": { "v52": { From 7882f801d9f34c29f0e3a6560bc6e89ba5476f6d Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 6 Feb 2026 14:35:51 +0000 Subject: [PATCH 3/4] Add unit tests for fallback domain functionality Co-authored-by: robertatakenaka <505143+robertatakenaka@users.noreply.github.com> --- tests/test_load_languages.py | 52 ++++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/tests/test_load_languages.py b/tests/test_load_languages.py index 274a8ae..7324e31 100644 --- a/tests/test_load_languages.py +++ b/tests/test_load_languages.py @@ -114,6 +114,58 @@ def test_run(self): document['fulltexts']['html']) self.assertIsNotNone(document['fulltexts'].get('pdf')) + def test_static_catalog_fallback(self): + """Test that StaticCatalog tries fallback domain when primary fails""" + + # Mock do_request to simulate primary domain failure + def mock_do_request_primary_fails(url, json=True): + if 'www.scielo.br' in url: + return None # Primary fails + elif 'antigo.scielo.br' in url: + # Fallback succeeds with empty response + class MockResponse: + def iter_lines(self, decode_unicode=None): + return [] + return MockResponse() + return None + + with patch.object(load_languages, 'do_request', side_effect=mock_do_request_primary_fails): + # Test with fallback domain - should not raise error + catalog = load_languages.StaticCatalog('www.scielo.br', fallback_domain='antigo.scielo.br') + self.assertIsInstance(catalog.catalog, dict) + + # Test without fallback domain - catalog should still be created but empty + catalog_no_fallback = load_languages.StaticCatalog('www.scielo.br', fallback_domain=None) + self.assertIsInstance(catalog_no_fallback.catalog, dict) + + @patch.object( + load_languages.StaticCatalog, "__init__", mock_static_catalog_init_method + ) + def test_run_with_fallback_domain(self): + """Test run function with fallback_domain parameter""" + mocked_articlemeta_db = mongomock.MongoClient().db + mocked_articlemeta_db['collections'].insert_many([ + { + "acron": "scl", + "code": "scl", + "domain": "www.scielo.br" + }, + ]) + mocked_articlemeta_db['articles'].insert_one(self._raw_json) + + # Test with fallback_domain parameter + load_languages.run(['scl'], + mocked_articlemeta_db, + all_records=True, + forced_url='www.scielo.br', + fallback_domain='antigo.scielo.br') + + document = mocked_articlemeta_db['articles'].find_one( + {'code': self._raw_json['code']}, + {'_id': 0, 'citations': 0} + ) + self.assertIsNotNone(document) + if __name__ == '__main__': main() From 4599c6e8ada7da31a14ac49ef279a8a1d43789c5 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 6 Feb 2026 14:36:40 +0000 Subject: [PATCH 4/4] Address code review feedback: improve tests and variable naming Co-authored-by: robertatakenaka <505143+robertatakenaka@users.noreply.github.com> --- processing/load_languages.py | 8 ++++---- tests/test_load_languages.py | 30 +++++++++++++++++++++++------- 2 files changed, 27 insertions(+), 11 deletions(-) diff --git a/processing/load_languages.py b/processing/load_languages.py index 8e3210a..93d1fc1 100644 --- a/processing/load_languages.py +++ b/processing/load_languages.py @@ -411,11 +411,11 @@ def run(collections, articlemeta_db, all_records=False, forced_url=None, fallbac logger.info(u'Using mode all_records %s', str(all_records)) # Use environment variable fallback if not provided as parameter - effective_fallback = fallback_domain if fallback_domain else STATIC_CATALOG_FALLBACK_DOMAIN - if effective_fallback: - logger.info(u'Using fallback domain: %s', effective_fallback) + effective_fallback_domain = fallback_domain if fallback_domain else STATIC_CATALOG_FALLBACK_DOMAIN + if effective_fallback_domain: + logger.info(u'Using fallback domain: %s', effective_fallback_domain) - static_catalogs = StaticCatalog(collection_domain, fallback_domain=effective_fallback) + static_catalogs = StaticCatalog(collection_domain, fallback_domain=effective_fallback_domain) for document in load_documents(collection, articlemeta_db, all_records=all_records): diff --git a/tests/test_load_languages.py b/tests/test_load_languages.py index 7324e31..7470a2a 100644 --- a/tests/test_load_languages.py +++ b/tests/test_load_languages.py @@ -117,26 +117,42 @@ def test_run(self): def test_static_catalog_fallback(self): """Test that StaticCatalog tries fallback domain when primary fails""" - # Mock do_request to simulate primary domain failure + # Mock do_request to simulate primary domain failure with actual catalog data def mock_do_request_primary_fails(url, json=True): if 'www.scielo.br' in url: return None # Primary fails elif 'antigo.scielo.br' in url: - # Fallback succeeds with empty response + # Fallback succeeds with catalog data class MockResponse: def iter_lines(self, decode_unicode=None): - return [] + # Return sample catalog entries + return [ + 'serial/rsp/v52/0034-8910-rsp-s1518-87872018052000131.pdf', + 'serial/rsp/v52/0034-8910-rsp-s1518-87872018052000131.xml', + ] return MockResponse() return None with patch.object(load_languages, 'do_request', side_effect=mock_do_request_primary_fails): - # Test with fallback domain - should not raise error + # Test with fallback domain - should populate catalog catalog = load_languages.StaticCatalog('www.scielo.br', fallback_domain='antigo.scielo.br') self.assertIsInstance(catalog.catalog, dict) + # Verify catalog was populated from fallback + self.assertIn('rsp', catalog.catalog) + self.assertIn('v52', catalog.catalog['rsp']) - # Test without fallback domain - catalog should still be created but empty - catalog_no_fallback = load_languages.StaticCatalog('www.scielo.br', fallback_domain=None) - self.assertIsInstance(catalog_no_fallback.catalog, dict) + # Test without fallback domain - should log error and have empty catalog + def mock_do_request_always_fails(url, json=True): + return None # Both primary and fallback fail + + with patch.object(load_languages, 'do_request', side_effect=mock_do_request_always_fails): + with patch.object(load_languages, 'logger') as mock_logger: + catalog_no_fallback = load_languages.StaticCatalog('www.scielo.br', fallback_domain=None) + self.assertIsInstance(catalog_no_fallback.catalog, dict) + # Verify error was logged for each file type (pdf, html, xml) + error_calls = [call for call in mock_logger.error.call_args_list + if 'Failed to load static catalog' in str(call)] + self.assertGreaterEqual(len(error_calls), 3) # At least one for each file type @patch.object( load_languages.StaticCatalog, "__init__", mock_static_catalog_init_method