Skip to content

Commit 603bd00

Browse files
authored
Add Azure Service Bus health patterns to K8sLog (#535)
This update introduces new health check patterns for Azure Service Bus in k8s_log.py, enhancing log analysis capabilities. The added patterns focus on connection recovery, idle timeout recovery, link lifecycle management, and reactor disposal, all categorized under a new "HealthyRecovery" category. Additionally, the log processing logic has been updated to track severity 5 items separately, ensuring clearer reporting of normal operational events that do not require action. This enhancement aims to improve monitoring and troubleshooting of Azure Service Bus interactions within Kubernetes workloads.
1 parent 1aeab03 commit 603bd00

1 file changed

Lines changed: 117 additions & 22 deletions

File tree

libraries/RW/K8sLog/k8s_log.py

Lines changed: 117 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -196,6 +196,30 @@ def _load_error_patterns(self) -> Dict[str, Any]:
196196
"pattern": r"(?i)(?:timeout|deadline).*[:=]\s*(?:none|0|disabled|false)",
197197
"description": "Disabled or zero timeout configurations",
198198
"exclude": True
199+
},
200+
{
201+
"name": "azure_servicebus_connection_recovery",
202+
"pattern": r"(?i).*(?:onLinkRemoteOpen|onConnectionBound|Emitting new response channel).*(?:connectionId|linkName|entityPath)",
203+
"description": "Azure Service Bus normal connection establishment and recovery",
204+
"exclude": True
205+
},
206+
{
207+
"name": "azure_servicebus_idle_timeout_recovery",
208+
"pattern": r"(?i).*(?:IdleTimerExpired|Idle timeout|Transient error occurred).*(?:retryAfter|attempt)",
209+
"description": "Azure Service Bus normal idle timeout and automatic retry",
210+
"exclude": True
211+
},
212+
{
213+
"name": "azure_servicebus_link_lifecycle",
214+
"pattern": r"(?i).*(?:Freeing resources due to error|link.*is force detached).*(?:IdleTimerExpired|Idle timeout)",
215+
"description": "Azure Service Bus normal link lifecycle and cleanup",
216+
"exclude": True
217+
},
218+
{
219+
"name": "azure_servicebus_reactor_disposal",
220+
"pattern": r"(?i).*Reactor selectable is being disposed.*connectionId",
221+
"description": "Azure Service Bus normal reactor cleanup",
222+
"exclude": True
199223
}
200224
],
201225
"patterns": {
@@ -268,6 +292,32 @@ def _load_error_patterns(self) -> Dict[str, Any]:
268292
"severity": 2,
269293
"next_steps": ["Add null checks in the failing code path", "Verify object initialization order", "Check for race conditions in concurrent code", "Review request data validation"]
270294
}
295+
],
296+
"HealthyRecovery": [
297+
{
298+
"name": "azure_servicebus_connection_recovery",
299+
"pattern": r"(?i).*(?:onLinkRemoteOpen|onConnectionBound|Emitting new response channel).*(?:connectionId|linkName|entityPath)",
300+
"severity": 5,
301+
"next_steps": ["This is normal Azure Service Bus connection recovery behavior", "No action required - connections are re-establishing automatically", "Monitor for excessive connection churn if this occurs very frequently"]
302+
},
303+
{
304+
"name": "azure_servicebus_idle_timeout_recovery",
305+
"pattern": r"(?i).*(?:IdleTimerExpired|Idle timeout|Transient error occurred).*(?:retryAfter|attempt)",
306+
"severity": 5,
307+
"next_steps": ["This is normal Azure Service Bus idle timeout recovery", "Connections idle for 10+ minutes are automatically cleaned up and recreated", "No action required - automatic retry is functioning correctly"]
308+
},
309+
{
310+
"name": "azure_servicebus_link_lifecycle",
311+
"pattern": r"(?i).*(?:Freeing resources due to error|link.*is force detached).*(?:IdleTimerExpired|Idle timeout)",
312+
"severity": 5,
313+
"next_steps": ["This is normal Azure Service Bus link lifecycle management", "Links are cleaned up after idle timeout and recreated as needed", "No action required - this indicates healthy connection management"]
314+
},
315+
{
316+
"name": "azure_servicebus_reactor_disposal",
317+
"pattern": r"(?i).*Reactor selectable is being disposed.*connectionId",
318+
"severity": 5,
319+
"next_steps": ["This is normal Azure Service Bus reactor cleanup", "Old connections are being properly disposed", "No action required - this indicates proper resource cleanup"]
320+
}
271321
]
272322
}
273323
}
@@ -788,7 +838,7 @@ def _scan_logs_for_issues_impl(self, log_dir: str, workload_type: str, workload_
788838
if categories is None:
789839
categories = [
790840
"GenericError", "AppFailure", "StackTrace", "Connection",
791-
"Timeout", "Auth", "Exceptions", "Anomaly", "AppRestart", "Resource"
841+
"Timeout", "Auth", "Exceptions", "Anomaly", "AppRestart", "Resource", "HealthyRecovery"
792842
]
793843

794844
# Use custom patterns if provided, otherwise use embedded patterns
@@ -837,6 +887,7 @@ def _scan_logs_for_issues_impl(self, log_dir: str, workload_type: str, workload_
837887

838888
# Pattern aggregators
839889
category_issues = defaultdict(list)
890+
category_notes = defaultdict(list) # Track severity 5 items separately
840891
max_severity = 5
841892

842893
# Map of numeric severity to text label
@@ -845,6 +896,7 @@ def _scan_logs_for_issues_impl(self, log_dir: str, workload_type: str, workload_
845896
2: "Major",
846897
3: "Minor",
847898
4: "Informational",
899+
5: "Note",
848900
}
849901

850902
pods = [pod["metadata"]["name"] for pod in pods_data]
@@ -935,26 +987,40 @@ def _scan_logs_for_issues_impl(self, log_dir: str, workload_type: str, workload_
935987
break
936988

937989
if matches:
938-
max_severity = min(max_severity, severity)
939-
940-
# Generate context-aware next steps
941-
sample_lines = [m["line"] for m in matches[:3]]
942-
context_aware_steps = self._generate_context_aware_next_steps(
943-
pattern_config["name"], next_steps, sample_lines, workload_name, namespace
944-
)
945-
946-
# Create issue for this pattern
947-
issue = {
948-
"category": category,
949-
"pattern_name": pattern_config["name"],
950-
"severity": severity,
951-
"next_steps": context_aware_steps,
952-
"matches": matches,
953-
"total_occurrences": len(matches),
954-
"sample_lines": sample_lines
955-
}
956-
957-
category_issues[category].append(issue)
990+
# Only add to category_issues if severity is less than 5
991+
if severity < 5:
992+
max_severity = min(max_severity, severity)
993+
994+
# Generate context-aware next steps
995+
sample_lines = [m["line"] for m in matches[:3]]
996+
context_aware_steps = self._generate_context_aware_next_steps(
997+
pattern_config["name"], next_steps, sample_lines, workload_name, namespace
998+
)
999+
1000+
# Create issue for this pattern
1001+
issue = {
1002+
"category": category,
1003+
"pattern_name": pattern_config["name"],
1004+
"severity": severity,
1005+
"next_steps": context_aware_steps,
1006+
"matches": matches,
1007+
"total_occurrences": len(matches),
1008+
"sample_lines": sample_lines
1009+
}
1010+
1011+
category_issues[category].append(issue)
1012+
else:
1013+
# If severity is 5, add to category_notes
1014+
note = {
1015+
"category": category,
1016+
"pattern_name": pattern_config["name"],
1017+
"severity": severity,
1018+
"next_steps": next_steps, # Severity 5 items don't have specific next steps
1019+
"matches": matches,
1020+
"total_occurrences": len(matches),
1021+
"sample_lines": [m["line"] for m in matches[:3]] # Sample lines for notes
1022+
}
1023+
category_notes[category].append(note)
9581024

9591025
# Check if we've reached the total line limit
9601026
if total_lines_processed >= max_total_lines:
@@ -993,7 +1059,7 @@ def _scan_logs_for_issues_impl(self, log_dir: str, workload_type: str, workload_
9931059
details_part = f"**Container:** {', '.join([f'{container} ({count}x)' for container, count in container_info.items()])}"
9941060
consolidated_data["details_parts"].append(details_part)
9951061

996-
# Create final issues from consolidated data
1062+
# Create final issues from consolidated data (severity < 5 only)
9971063
for content_key, issue_data in consolidated_issues.items():
9981064
# Deduplicate and clean up the merged data
9991065
unique_details = []
@@ -1031,9 +1097,38 @@ def _scan_logs_for_issues_impl(self, log_dir: str, workload_type: str, workload_
10311097
"category": issue_data["category"]
10321098
})
10331099

1100+
# Consolidate notes by category and create final results
1101+
consolidated_notes_by_category = {}
1102+
for category, notes in category_notes.items():
1103+
consolidated_notes_by_category[category] = {
1104+
"total_occurrences": sum(n["total_occurrences"] for n in notes),
1105+
"sample_lines": list(dict.fromkeys([n["sample_lines"][0] for n in notes])), # Take one sample line per note
1106+
"unique_next_steps": set(),
1107+
"details_parts": []
1108+
}
1109+
for note in notes:
1110+
consolidated_notes_by_category[category]["unique_next_steps"].update(note["next_steps"])
1111+
# Add details about where the note was found
1112+
container_info = defaultdict(int)
1113+
for match in note["matches"]:
1114+
container_info[match['container']] += 1
1115+
details_part = f"**Container:** {', '.join([f'{container} ({count}x)' for container, count in container_info.items()])}"
1116+
consolidated_notes_by_category[category]["details_parts"].append(details_part)
1117+
1118+
# Add notes to summary
1119+
for category, note_data in consolidated_notes_by_category.items():
1120+
if note_data["total_occurrences"] > 0:
1121+
severity_label = severity_label_map.get(5, "Note") # Severity 5 is always "Note"
1122+
issues_json["summary"].append(
1123+
f"📝 Informational Notes ({category} events): {note_data['total_occurrences']}x. "
1124+
f"These are normal operational events that do not require action."
1125+
)
1126+
10341127
# Generate summary
10351128
total_issues = len(consolidated_issues)
1129+
total_notes = sum(len(notes) for notes in consolidated_notes_by_category.values()) # Count total notes across all categories
10361130
categories_found = set(issue_data["category"] for issue_data in consolidated_issues.values())
1131+
note_categories_found = set(category for category in consolidated_notes_by_category.keys())
10371132

10381133
severity_label = severity_label_map.get(max_severity, f"Unknown({max_severity})")
10391134

0 commit comments

Comments (0)