From 0b1de1a1aae7d5001322c1749d8dffe50c884baa Mon Sep 17 00:00:00 2001 From: Vasundhara Volam Date: Thu, 28 May 2026 19:06:50 +0000 Subject: [PATCH] [cli]: Add DPU recovery CLI commands for SmartSwitch Extend 'show chassis modules status' with a Ready-Status column on SmartSwitch platforms. Add new 'show chassis modules recovery' command to expose detailed DPU recovery state (recovery_status, reset_count, last_down_time, last_ready_time) from CHASSIS_STATE_DB. Add unit tests for the new CLI commands. Signed-off-by: Vasundhara Volam Signed-off-by: Vasundhara Volam --- show/chassis_modules.py | 89 +++++++++++++++++++++- tests/chassis_modules_test.py | 138 ++++++++++++++++++++++++++++++++++ 2 files changed, 223 insertions(+), 4 deletions(-) diff --git a/show/chassis_modules.py b/show/chassis_modules.py index 48110ff24d..24753aae91 100644 --- a/show/chassis_modules.py +++ b/show/chassis_modules.py @@ -21,6 +21,16 @@ CHASSIS_MIDPLANE_INFO_IP_FIELD = 'ip_address' CHASSIS_MIDPLANE_INFO_ACCESS_FIELD = 'access' +DPU_STATE_TABLE = 'DPU_STATE' +DPU_STATE_READY_STATUS_FIELD = 'ready_status' +DPU_STATE_RECOVERY_STATUS_FIELD = 'recovery_status' +DPU_STATE_RESET_COUNT_FIELD = 'reset_count' +DPU_STATE_LAST_DOWN_TIME_FIELD = 'last_down_time' +DPU_STATE_LAST_READY_TIME_FIELD = 'last_ready_time' + +CHASSIS_SERVER = 'redis_chassis.server' +CHASSIS_SERVER_PORT = 6380 + @click.group(cls=clicommon.AliasedGroup) def chassis(): """Chassis commands group""" @@ -37,7 +47,11 @@ def modules(): def status(db, chassis_module_name): """Show chassis-modules status""" + smartswitch = is_smartswitch() header = ['Name', 'Description', 'Physical-Slot', 'Oper-Status', 'Admin-Status', 'Serial'] + if smartswitch: + header.append('Ready-Status') + chassis_cfg_table = db.cfgdb.get_table('CHASSIS_MODULE') state_db = SonicV2Connector(host="127.0.0.1") @@ -65,6 +79,19 @@ def status(db, chassis_module_name): except Exception: pass + # For SmartSwitch, connect to CHASSIS_STATE_DB to read DPU_STATE + dpu_state_data = {} + if smartswitch: + chassis_state_db = SonicV2Connector(host=CHASSIS_SERVER, port=CHASSIS_SERVER_PORT) + chassis_state_db.connect(chassis_state_db.CHASSIS_STATE_DB) + dpu_key_pattern = DPU_STATE_TABLE + '|*' + dpu_keys = chassis_state_db.keys(chassis_state_db.CHASSIS_STATE_DB, dpu_key_pattern) + if dpu_keys: + for dpu_key in dpu_keys: + dpu_name = dpu_key.split('|')[1] + dpu_state_data[dpu_name] = chassis_state_db.get_all( + chassis_state_db.CHASSIS_STATE_DB, dpu_key) + table = [] for key in natsorted(keys): key_list = key.split('|') @@ -87,7 +114,7 @@ def status(db, chassis_module_name): oper_status = platform_oper_status # Determine admin_status - if is_smartswitch(): + if smartswitch: admin_status = 'down' else: admin_status = 'up' @@ -95,13 +122,67 @@ def status(db, chassis_module_name): if config_data is not None: admin_status = config_data.get(CHASSIS_MODULE_INFO_ADMINSTATUS_FIELD, admin_status) - table.append((key_list[1], desc, slot, oper_status, admin_status, serial)) + row = [key_list[1], desc, slot, oper_status, admin_status, serial] + + if smartswitch: + dpu_info = dpu_state_data.get(key_list[1], {}) + ready_status = dpu_info.get(DPU_STATE_READY_STATUS_FIELD, 'N/A') + row.append(ready_status) + + table.append(tuple(row)) if table: click.echo(tabulate(table, header, tablefmt='simple', stralign='right')) else: click.echo('No data available in CHASSIS_MODULE_TABLE\n') + +@modules.command() +@click.argument('chassis_module_name', metavar='', required=False) +def recovery(chassis_module_name): + """Show chassis-modules recovery information""" + + if not is_smartswitch(): + click.echo('This command is only supported on SmartSwitch platforms') + return + + header = ['Name', 'Ready-Status', 'Recovery-Status', 'Reset-Count', + 'Last-Down-Time', 'Last-Ready-Time'] + + chassis_state_db = SonicV2Connector(host=CHASSIS_SERVER, port=CHASSIS_SERVER_PORT) + chassis_state_db.connect(chassis_state_db.CHASSIS_STATE_DB) + + key_pattern = DPU_STATE_TABLE + '|*' + if chassis_module_name: + key_pattern = DPU_STATE_TABLE + '|' + chassis_module_name + + keys = chassis_state_db.keys(chassis_state_db.CHASSIS_STATE_DB, key_pattern) + if not keys: + click.echo('No DPU recovery data available') + return + + table = [] + for key in natsorted(keys): + key_list = key.split('|') + if len(key_list) != 2: + continue + + data_dict = chassis_state_db.get_all(chassis_state_db.CHASSIS_STATE_DB, key) + + ready_status = data_dict.get(DPU_STATE_READY_STATUS_FIELD, 'N/A') + recovery_status = data_dict.get(DPU_STATE_RECOVERY_STATUS_FIELD, 'N/A') + reset_count = data_dict.get(DPU_STATE_RESET_COUNT_FIELD, '0') + last_down_time = data_dict.get(DPU_STATE_LAST_DOWN_TIME_FIELD, '-') + last_ready_time = data_dict.get(DPU_STATE_LAST_READY_TIME_FIELD, '-') + + table.append((key_list[1], ready_status, recovery_status, reset_count, + last_down_time, last_ready_time)) + + if table: + click.echo(tabulate(table, header, tablefmt='simple', stralign='right')) + else: + click.echo('No DPU recovery data available') + @modules.command() @click.argument('chassis_module_name', metavar='', required=False) def midplane_status(chassis_module_name): @@ -143,7 +224,7 @@ def midplane_status(chassis_module_name): @chassis.command() @click.argument('systemportname', required=False) -@click.option('--namespace', '-n', 'namespace', required=True if multi_asic.is_multi_asic() else False, +@click.option('--namespace', '-n', 'namespace', required=True if multi_asic.is_multi_asic() else False, default=None, type=str, show_default=False, help='Namespace name or all') @click.option('--verbose', is_flag=True, help="Enable verbose output") def system_ports(systemportname, namespace, verbose): @@ -153,7 +234,7 @@ def system_ports(systemportname, namespace, verbose): if systemportname is not None: cmd += ['-i', str(systemportname)] - + if namespace is not None: cmd += ['-n', str(namespace)] diff --git a/tests/chassis_modules_test.py b/tests/chassis_modules_test.py index d255ef4917..95fba11118 100755 --- a/tests/chassis_modules_test.py +++ b/tests/chassis_modules_test.py @@ -879,3 +879,141 @@ def test_show_status_bmc_module_helper_init_failure_falls_back_to_db(self): def teardown_class(cls): print("TEARDOWN") os.environ["UTILITIES_UNIT_TESTING"] = "0" + + +class TestChassisModulesRecovery(object): + """Tests for 'show chassis modules recovery' and Ready-Status in status command""" + + @classmethod + def setup_class(cls): + print("SETUP") + os.environ["UTILITIES_UNIT_TESTING"] = "1" + + def _setup_dpu_state_db(self): + """Create a mock SonicV2Connector with DPU_STATE data in CHASSIS_STATE_DB.""" + from swsssdk import SonicV2Connector as MockSonicV2Connector + conn = MockSonicV2Connector() + conn.connect(conn.CHASSIS_STATE_DB) + # DPU0 - healthy + conn.set(conn.CHASSIS_STATE_DB, 'DPU_STATE|DPU0', "ready_status", "true") + conn.set(conn.CHASSIS_STATE_DB, 'DPU_STATE|DPU0', "recovery_status", "recoverable") + conn.set(conn.CHASSIS_STATE_DB, 'DPU_STATE|DPU0', "reset_count", "2") + conn.set(conn.CHASSIS_STATE_DB, 'DPU_STATE|DPU0', "last_down_time", "2026-05-28 10:15:30 UTC") + conn.set(conn.CHASSIS_STATE_DB, 'DPU_STATE|DPU0', "last_ready_time", "2026-05-28 10:18:45 UTC") + # DPU1 - healthy with no failures + conn.set(conn.CHASSIS_STATE_DB, 'DPU_STATE|DPU1', "ready_status", "true") + conn.set(conn.CHASSIS_STATE_DB, 'DPU_STATE|DPU1', "recovery_status", "recoverable") + conn.set(conn.CHASSIS_STATE_DB, 'DPU_STATE|DPU1', "reset_count", "0") + conn.set(conn.CHASSIS_STATE_DB, 'DPU_STATE|DPU1', "last_ready_time", "2026-05-28 09:00:12 UTC") + # DPU2 - unrecoverable + conn.set(conn.CHASSIS_STATE_DB, 'DPU_STATE|DPU2', "ready_status", "false") + conn.set(conn.CHASSIS_STATE_DB, 'DPU_STATE|DPU2', "recovery_status", "unrecoverable") + conn.set(conn.CHASSIS_STATE_DB, 'DPU_STATE|DPU2', "reset_count", "2") + conn.set(conn.CHASSIS_STATE_DB, 'DPU_STATE|DPU2', "last_down_time", "2026-05-28 11:02:00 UTC") + return conn + + def test_show_recovery_all(self): + """Test show chassis modules recovery shows all DPUs.""" + conn = self._setup_dpu_state_db() + runner = CliRunner() + with mock.patch('show.chassis_modules.is_smartswitch', return_value=True), \ + mock.patch('show.chassis_modules.SonicV2Connector', return_value=conn): + result = runner.invoke( + show.cli.commands["chassis"].commands["modules"].commands["recovery"], []) + print(result.output) + assert result.exit_code == 0 + assert "DPU0" in result.output + assert "DPU1" in result.output + assert "DPU2" in result.output + assert "recoverable" in result.output + assert "unrecoverable" in result.output + + def test_show_recovery_single_module(self): + """Test show chassis modules recovery for a specific DPU.""" + conn = self._setup_dpu_state_db() + runner = CliRunner() + with mock.patch('show.chassis_modules.is_smartswitch', return_value=True), \ + mock.patch('show.chassis_modules.SonicV2Connector', return_value=conn): + result = runner.invoke( + show.cli.commands["chassis"].commands["modules"].commands["recovery"], ["DPU0"]) + print(result.output) + assert result.exit_code == 0 + assert "DPU0" in result.output + assert "true" in result.output + assert "recoverable" in result.output + assert "2026-05-28 10:15:30 UTC" in result.output + assert "2026-05-28 10:18:45 UTC" in result.output + + def test_show_recovery_non_smartswitch(self): + """Test recovery command on non-SmartSwitch platform.""" + runner = CliRunner() + with mock.patch('show.chassis_modules.is_smartswitch', return_value=False): + result = runner.invoke( + show.cli.commands["chassis"].commands["modules"].commands["recovery"], []) + print(result.output) + assert result.exit_code == 0 + assert "only supported on SmartSwitch" in result.output + + def test_show_recovery_no_data(self): + """Test recovery command when no DPU_STATE data is available.""" + from swsssdk import SonicV2Connector as MockSonicV2Connector + conn = MockSonicV2Connector() + conn.connect(conn.CHASSIS_STATE_DB) + runner = CliRunner() + with mock.patch('show.chassis_modules.is_smartswitch', return_value=True), \ + mock.patch('show.chassis_modules.SonicV2Connector', return_value=conn): + result = runner.invoke( + show.cli.commands["chassis"].commands["modules"].commands["recovery"], []) + print(result.output) + assert result.exit_code == 0 + assert "No DPU recovery data available" in result.output + + def test_show_status_with_ready_status_smartswitch(self): + """Test show chassis modules status includes Ready-Status on SmartSwitch.""" + conn = self._setup_dpu_state_db() + runner = CliRunner() + with mock.patch('show.chassis_modules.is_smartswitch', return_value=True), \ + mock.patch('show.chassis_modules.is_bmc', return_value=False), \ + mock.patch('show.chassis_modules.SonicV2Connector', return_value=conn): + result = runner.invoke( + show.cli.commands["chassis"].commands["modules"].commands["status"], []) + print(result.output) + assert result.exit_code == 0 + assert "Ready-Status" in result.output + + def test_show_recovery_unrecoverable_dpu(self): + """Test that unrecoverable DPU shows correct status.""" + conn = self._setup_dpu_state_db() + runner = CliRunner() + with mock.patch('show.chassis_modules.is_smartswitch', return_value=True), \ + mock.patch('show.chassis_modules.SonicV2Connector', return_value=conn): + result = runner.invoke( + show.cli.commands["chassis"].commands["modules"].commands["recovery"], ["DPU2"]) + print(result.output) + assert result.exit_code == 0 + assert "DPU2" in result.output + assert "false" in result.output + assert "unrecoverable" in result.output + assert "2026-05-28 11:02:00 UTC" in result.output + + def test_show_recovery_missing_fields(self): + """Test recovery command gracefully handles missing fields.""" + from swsssdk import SonicV2Connector as MockSonicV2Connector + conn = MockSonicV2Connector() + conn.connect(conn.CHASSIS_STATE_DB) + # DPU with only ready_status set + conn.set(conn.CHASSIS_STATE_DB, 'DPU_STATE|DPU0', "ready_status", "false") + runner = CliRunner() + with mock.patch('show.chassis_modules.is_smartswitch', return_value=True), \ + mock.patch('show.chassis_modules.SonicV2Connector', return_value=conn): + result = runner.invoke( + show.cli.commands["chassis"].commands["modules"].commands["recovery"], []) + print(result.output) + assert result.exit_code == 0 + assert "DPU0" in result.output + assert "false" in result.output + + @classmethod + def teardown_class(cls): + print("TEARDOWN") + os.environ["UTILITIES_UNIT_TESTING"] = "0"