diff --git a/codebundles/k8s-daemonset-healthcheck/runbook.robot b/codebundles/k8s-daemonset-healthcheck/runbook.robot index 501235e66..d2398a114 100644 --- a/codebundles/k8s-daemonset-healthcheck/runbook.robot +++ b/codebundles/k8s-daemonset-healthcheck/runbook.robot @@ -39,6 +39,7 @@ Analyze Application Log Patterns for DaemonSet `${DAEMONSET_NAME}` in Namespace ... context=${CONTEXT} ... kubeconfig=${kubeconfig} ... log_age=${LOG_AGE} + ... excluded_containers=${EXCLUDED_CONTAINERS} ${scan_results}= RW.K8sLog.Scan Logs For Issues ... log_dir=${log_dir} @@ -46,6 +47,7 @@ Analyze Application Log Patterns for DaemonSet `${DAEMONSET_NAME}` in Namespace ... workload_name=${DAEMONSET_NAME} ... namespace=${NAMESPACE} ... categories=@{LOG_PATTERN_CATEGORIES} + ... excluded_containers=${EXCLUDED_CONTAINERS} ${log_health_score}= RW.K8sLog.Calculate Log Health Score scan_results=${scan_results} @@ -98,12 +100,14 @@ Detect Log Anomalies for DaemonSet `${DAEMONSET_NAME}` in Namespace `${NAMESPACE ... context=${CONTEXT} ... kubeconfig=${kubeconfig} ... log_age=${LOG_AGE} + ... excluded_containers=${EXCLUDED_CONTAINERS} ${anomaly_results}= RW.K8sLog.Analyze Log Anomalies ... log_dir=${log_dir} ... workload_type=daemonset ... workload_name=${DAEMONSET_NAME} ... namespace=${NAMESPACE} + ... excluded_containers=${EXCLUDED_CONTAINERS} # Process anomaly issues ${anomaly_issues}= Evaluate $anomaly_results.get('issues', []) @@ -945,9 +949,21 @@ Suite Initialization ... pattern=\d+ ... example=1 ... default=1 + ${EXCLUDED_CONTAINER_NAMES}= RW.Core.Import User Variable EXCLUDED_CONTAINER_NAMES + ... type=string + ... description=Comma-separated list of container names to exclude from log analysis (e.g., linkerd-proxy, istio-proxy, vault-agent). + ... pattern=.* + ... example=linkerd-proxy,istio-proxy,vault-agent + ... 
default=linkerd-proxy,istio-proxy,vault-agent - # Convert comma-separated string to list + # Convert comma-separated strings to lists @{LOG_PATTERN_CATEGORIES}= Split String ${LOG_PATTERN_CATEGORIES_STR} , + @{EXCLUDED_CONTAINERS_RAW}= Run Keyword If "${EXCLUDED_CONTAINER_NAMES}" != "" Split String ${EXCLUDED_CONTAINER_NAMES} , ELSE Create List + @{EXCLUDED_CONTAINERS}= Create List + FOR ${container} IN @{EXCLUDED_CONTAINERS_RAW} + ${trimmed_container}= Strip String ${container} + Append To List ${EXCLUDED_CONTAINERS} ${trimmed_container} + END Set Suite Variable ${kubeconfig} Set Suite Variable ${KUBERNETES_DISTRIBUTION_BINARY} @@ -961,6 +977,8 @@ Suite Initialization Set Suite Variable ${ANOMALY_THRESHOLD} Set Suite Variable ${CONTAINER_RESTART_AGE} Set Suite Variable ${CONTAINER_RESTART_THRESHOLD} + Set Suite Variable ${EXCLUDED_CONTAINER_NAMES} + Set Suite Variable @{EXCLUDED_CONTAINERS} ${env}= Evaluate {"KUBECONFIG":"${kubeconfig.key}","KUBERNETES_DISTRIBUTION_BINARY":"${KUBERNETES_DISTRIBUTION_BINARY}","CONTEXT":"${CONTEXT}","NAMESPACE":"${NAMESPACE}","DAEMONSET_NAME":"${DAEMONSET_NAME}","CONTAINER_RESTART_AGE":"${CONTAINER_RESTART_AGE}","CONTAINER_RESTART_THRESHOLD":"${CONTAINER_RESTART_THRESHOLD}"} Set Suite Variable ${env} diff --git a/codebundles/k8s-deployment-healthcheck/runbook.robot b/codebundles/k8s-deployment-healthcheck/runbook.robot index 6bfc95274..c2f8fe83a 100755 --- a/codebundles/k8s-deployment-healthcheck/runbook.robot +++ b/codebundles/k8s-deployment-healthcheck/runbook.robot @@ -112,6 +112,12 @@ Suite Initialization ... pattern=.* ... example=linkerd-proxy,istio-proxy,vault-agent ... default=linkerd-proxy,istio-proxy,vault-agent + ${CONTAINER_NAME}= RW.Core.Import User Variable CONTAINER_NAME + ... type=string + ... description=Optional: the specific container name to fetch logs from. If not set, the primary application container is auto-detected by excluding known sidecars. + ... pattern=.* + ... example=controller + ... 
default= ${CONTAINER_RESTART_AGE}= RW.Core.Import User Variable CONTAINER_RESTART_AGE ... type=string @@ -152,6 +158,7 @@ Suite Initialization Set Suite Variable ${LOG_SCAN_TIMEOUT} Set Suite Variable ${EXCLUDED_CONTAINER_NAMES} Set Suite Variable @{EXCLUDED_CONTAINERS} + Set Suite Variable ${CONTAINER_NAME} Set Suite Variable ${CONTAINER_RESTART_AGE} Set Suite Variable ${CONTAINER_RESTART_THRESHOLD} @@ -401,9 +408,41 @@ Fetch Deployment Logs for `${DEPLOYMENT_NAME}` in Namespace `${NAMESPACE}` ... data:logs-bulk # Skip pod-related checks if deployment is scaled to 0 IF not ${SKIP_POD_CHECKS} - # Fetch raw logs + # Determine which container to fetch logs from + IF "${CONTAINER_NAME}" != "" + ${target_container}= Set Variable ${CONTAINER_NAME} + ELSE + # Auto-detect primary container by listing containers and excluding known sidecars + ${container_json}= RW.CLI.Run Cli + ... cmd=${KUBERNETES_DISTRIBUTION_BINARY} get deployment/${DEPLOYMENT_NAME} --context ${CONTEXT} -n ${NAMESPACE} -o jsonpath='{.spec.template.spec.containers[*].name}' + ... env=${env} + ... secret_file__kubeconfig=${kubeconfig} + ... 
include_in_history=false + @{all_containers}= Split String ${container_json.stdout} + ${container_count}= Get Length ${all_containers} + ${target_container}= Set Variable ${EMPTY} + IF ${container_count} > 0 + FOR ${cname} IN @{all_containers} + ${is_excluded}= Evaluate "${cname}" in $EXCLUDED_CONTAINERS + IF not ${is_excluded} + ${target_container}= Set Variable ${cname} + BREAK + END + END + IF "${target_container}" == "" + ${target_container}= Set Variable ${all_containers}[0] + END + END + END + + # Build the kubectl logs command with or without -c flag + IF "${target_container}" != "" + ${logs_cmd}= Set Variable ${KUBERNETES_DISTRIBUTION_BINARY} logs deployment/${DEPLOYMENT_NAME} -c ${target_container} --context ${CONTEXT} -n ${NAMESPACE} --tail=${LOG_LINES} --since=${LOG_AGE} + ELSE + ${logs_cmd}= Set Variable ${KUBERNETES_DISTRIBUTION_BINARY} logs deployment/${DEPLOYMENT_NAME} --context ${CONTEXT} -n ${NAMESPACE} --tail=${LOG_LINES} --since=${LOG_AGE} + END ${deployment_logs}= RW.CLI.Run Cli - ... cmd=${KUBERNETES_DISTRIBUTION_BINARY} logs deployment/${DEPLOYMENT_NAME} --context ${CONTEXT} -n ${NAMESPACE} --tail=${LOG_LINES} --since=${LOG_AGE} + ... cmd=${logs_cmd} ... env=${env} ... secret_file__kubeconfig=${kubeconfig} ... show_in_rwl_cheatsheet=true @@ -412,13 +451,13 @@ Fetch Deployment Logs for `${DEPLOYMENT_NAME}` in Namespace `${NAMESPACE}` IF ${deployment_logs.returncode} == 0 # Filter logs to remove repetitive health check messages and focus on meaningful content ${filtered_logs}= RW.CLI.Run Cli - ... cmd=echo "${deployment_logs.stdout}" | grep -v -E "(Checking.*Health|Health.*Check|healthcheck|/health|GET /|POST /health|probe|liveness|readiness)" | grep -E "(error|ERROR|warn|WARN|exception|Exception|fail|FAIL|fatal|FATAL|panic|stack|trace|timeout|connection.*refused|unable.*connect|authentication.*failed|denied|forbidden|unauthorized|500|502|503|504)" | tail -50 || echo "No significant errors or warnings found in recent logs" + ... 
cmd=echo "${deployment_logs.stdout}" | grep -v -E "(Checking.*Health|Health.*Check|healthcheck|/health|GET /health|POST /health|probe|liveness|readiness)" | grep -E "(error|ERROR|warn|WARN|exception|Exception|fail|FAIL|fatal|FATAL|panic|stack|trace|timeout|connection.*refused|unable.*connect|authentication.*failed|denied|forbidden|unauthorized|500|502|503|504)" | tail -50 || echo "No significant errors or warnings found in recent logs" ... env=${env} ... include_in_history=false # Also get a sample of non-health-check logs for context ${context_logs}= RW.CLI.Run Cli - ... cmd=echo "${deployment_logs.stdout}" | grep -v -E "(Checking.*Health|Health.*Check|healthcheck|/health|GET /|POST /health|probe|liveness|readiness)" | head -20 | tail -10 + ... cmd=echo "${deployment_logs.stdout}" | grep -v -E "(Checking.*Health|Health.*Check|healthcheck|/health|GET /health|POST /health|probe|liveness|readiness)" | head -20 | tail -10 ... env=${env} ... include_in_history=false diff --git a/codebundles/k8s-stacktrace-health/runbook.robot b/codebundles/k8s-stacktrace-health/runbook.robot index 568a8026c..92cbc147d 100755 --- a/codebundles/k8s-stacktrace-health/runbook.robot +++ b/codebundles/k8s-stacktrace-health/runbook.robot @@ -162,7 +162,12 @@ Analyze Workload Stacktraces for ${WORKLOAD_TYPE} `${WORKLOAD_NAME}` in Namespac # Skip pod-related checks if workload is scaled to 0 IF not ${SKIP_STACKTRACE_CHECKS} # Convert comma-separated string to list for excluded containers - @{EXCLUDED_CONTAINERS}= Run Keyword If "${EXCLUDED_CONTAINER_NAMES}" != "" Split String ${EXCLUDED_CONTAINER_NAMES} , ELSE Create List + @{EXCLUDED_CONTAINERS_RAW}= Run Keyword If "${EXCLUDED_CONTAINER_NAMES}" != "" Split String ${EXCLUDED_CONTAINER_NAMES} , ELSE Create List + @{EXCLUDED_CONTAINERS}= Create List + FOR ${container} IN @{EXCLUDED_CONTAINERS_RAW} + ${trimmed_container}= Strip String ${container} + Append To List ${EXCLUDED_CONTAINERS} ${trimmed_container} + END # Fetch logs using RW.K8sLog 
library (same pattern as deployment healthcheck) ${log_dir}= RW.K8sLog.Fetch Workload Logs diff --git a/codebundles/k8s-stacktrace-health/sli.robot b/codebundles/k8s-stacktrace-health/sli.robot index 87270c420..258296f67 100755 --- a/codebundles/k8s-stacktrace-health/sli.robot +++ b/codebundles/k8s-stacktrace-health/sli.robot @@ -83,7 +83,12 @@ Suite Initialization # Convert comma-separated string to list - @{EXCLUDED_CONTAINERS}= Run Keyword If "${EXCLUDED_CONTAINER_NAMES}" != "" Split String ${EXCLUDED_CONTAINER_NAMES} , ELSE Create List + @{EXCLUDED_CONTAINERS_RAW}= Run Keyword If "${EXCLUDED_CONTAINER_NAMES}" != "" Split String ${EXCLUDED_CONTAINER_NAMES} , ELSE Create List + @{EXCLUDED_CONTAINERS}= Create List + FOR ${container} IN @{EXCLUDED_CONTAINERS_RAW} + ${trimmed_container}= Strip String ${container} + Append To List ${EXCLUDED_CONTAINERS} ${trimmed_container} + END Set Suite Variable @{EXCLUDED_CONTAINERS} # Initialize score variables diff --git a/codebundles/k8s-statefulset-healthcheck/runbook.robot b/codebundles/k8s-statefulset-healthcheck/runbook.robot index 5cd67f063..d8dbbdee4 100644 --- a/codebundles/k8s-statefulset-healthcheck/runbook.robot +++ b/codebundles/k8s-statefulset-healthcheck/runbook.robot @@ -39,6 +39,7 @@ Analyze Application Log Patterns for StatefulSet `${STATEFULSET_NAME}` in Namesp ... context=${CONTEXT} ... kubeconfig=${kubeconfig} ... log_age=${LOG_AGE} + ... excluded_containers=${EXCLUDED_CONTAINERS} ${scan_results}= RW.K8sLog.Scan Logs For Issues ... log_dir=${log_dir} @@ -46,6 +47,7 @@ Analyze Application Log Patterns for StatefulSet `${STATEFULSET_NAME}` in Namesp ... workload_name=${STATEFULSET_NAME} ... namespace=${NAMESPACE} ... categories=@{LOG_PATTERN_CATEGORIES} + ... excluded_containers=${EXCLUDED_CONTAINERS} ${log_health_score}= RW.K8sLog.Calculate Log Health Score scan_results=${scan_results} @@ -99,12 +101,14 @@ Detect Log Anomalies for StatefulSet `${STATEFULSET_NAME}` in Namespace `${NAMES ... 
context=${CONTEXT} ... kubeconfig=${kubeconfig} ... log_age=${LOG_AGE} + ... excluded_containers=${EXCLUDED_CONTAINERS} ${anomaly_results}= RW.K8sLog.Analyze Log Anomalies ... log_dir=${log_dir} ... workload_type=statefulset ... workload_name=${STATEFULSET_NAME} ... namespace=${NAMESPACE} + ... excluded_containers=${EXCLUDED_CONTAINERS} # Process anomaly issues ${anomaly_issues}= Evaluate $anomaly_results.get('issues', []) @@ -970,9 +974,21 @@ Suite Initialization ... pattern=\d+ ... example=1 ... default=1 + ${EXCLUDED_CONTAINER_NAMES}= RW.Core.Import User Variable EXCLUDED_CONTAINER_NAMES + ... type=string + ... description=Comma-separated list of container names to exclude from log analysis (e.g., linkerd-proxy, istio-proxy, vault-agent). + ... pattern=.* + ... example=linkerd-proxy,istio-proxy,vault-agent + ... default=linkerd-proxy,istio-proxy,vault-agent - # Convert comma-separated string to list + # Convert comma-separated strings to lists @{LOG_PATTERN_CATEGORIES}= Split String ${LOG_PATTERN_CATEGORIES_STR} , + @{EXCLUDED_CONTAINERS_RAW}= Run Keyword If "${EXCLUDED_CONTAINER_NAMES}" != "" Split String ${EXCLUDED_CONTAINER_NAMES} , ELSE Create List + @{EXCLUDED_CONTAINERS}= Create List + FOR ${container} IN @{EXCLUDED_CONTAINERS_RAW} + ${trimmed_container}= Strip String ${container} + Append To List ${EXCLUDED_CONTAINERS} ${trimmed_container} + END Set Suite Variable ${kubeconfig} Set Suite Variable ${KUBERNETES_DISTRIBUTION_BINARY} @@ -986,6 +1002,8 @@ Suite Initialization Set Suite Variable ${ANOMALY_THRESHOLD} Set Suite Variable ${CONTAINER_RESTART_AGE} Set Suite Variable ${CONTAINER_RESTART_THRESHOLD} + Set Suite Variable ${EXCLUDED_CONTAINER_NAMES} + Set Suite Variable @{EXCLUDED_CONTAINERS} ${env}= Evaluate 
{"KUBECONFIG":"${kubeconfig.key}","KUBERNETES_DISTRIBUTION_BINARY":"${KUBERNETES_DISTRIBUTION_BINARY}","CONTEXT":"${CONTEXT}","NAMESPACE":"${NAMESPACE}","STATEFULSET_NAME":"${STATEFULSET_NAME}","CONTAINER_RESTART_AGE":"${CONTAINER_RESTART_AGE}","CONTAINER_RESTART_THRESHOLD":"${CONTAINER_RESTART_THRESHOLD}"} Set Suite Variable ${env} diff --git a/libraries/RW/K8sLog/k8s_log.py b/libraries/RW/K8sLog/k8s_log.py index 6b6581479..0259fa8d0 100644 --- a/libraries/RW/K8sLog/k8s_log.py +++ b/libraries/RW/K8sLog/k8s_log.py @@ -51,7 +51,7 @@ def _get_timestamp_handler(self): from RW.LogAnalysis.java.timestamp_handler import TimestampHandler self._timestamp_handler = TimestampHandler() except ImportError as e: - logger.warning(f"Could not import TimestampHandler: {e}") + logger.warn(f"Could not import TimestampHandler: {e}") self._timestamp_handler = None return self._timestamp_handler @@ -388,16 +388,26 @@ def _load_error_patterns(self) -> Dict[str, Any]: def _cleanup_log_line_for_grouping(self, line: str) -> str: """Remove variable parts of a log line for better grouping.""" - # Remove timestamps (ISO format or custom 'dd-mm-yyyy hh:mm:ss.ms') - line = re.sub(r'\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d+Z?', '', line) + # Remove timestamps - ISO format + line = re.sub(r'\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(?:\.\d+)?Z?', '', line) + # Remove timestamps - custom 'dd-mm-yyyy hh:mm:ss.ms' line = re.sub(r'\d{2}-\d{2}-\d{4} \d{2}:\d{2}:\d{2}\.\d{3}', '', line) - # Remove thread names in brackets + # Remove timestamps - nginx/CLF format: [25/Mar/2026:14:11:41 +0000] + line = re.sub(r'\[\d{2}/\w{3}/\d{4}:\d{2}:\d{2}:\d{2} [+-]\d{4}\]', '', line) + # Remove timestamps - syslog/common format: Mar 25 14:11:41 or 2026-03-25 14:11:41 + line = re.sub(r'\b\w{3}\s+\d{1,2}\s+\d{2}:\d{2}:\d{2}\b', '', line) + line = re.sub(r'\b\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}:\d{2}(?:\.\d+)?\b', '', line) + # Remove IP addresses (v4) + line = re.sub(r'\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}(?::\d+)?\b', '', line) + 
# Remove thread names and bracketed content line = re.sub(r'\[[^\][]*\]', '', line) # Remove UUIDs and similar trace/transaction IDs (hex or alphanumeric) line = re.sub(r'[a-f0-9]{8}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{12}', '', line, flags=re.IGNORECASE) - line = re.sub(r'\b[a-f0-9]{10,}\b', '', line, flags=re.IGNORECASE) # long hex strings (trace ids) - line = re.sub(r'\b[a-zA-Z0-9]*unknown[a-zA-Z0-9]*\b', '', line, flags=re.IGNORECASE) # IDs like '11989unknown...' - # Remove any remaining numbers that look like IDs or counters + line = re.sub(r'\b[a-f0-9]{10,}\b', '', line, flags=re.IGNORECASE) + line = re.sub(r'\b[a-zA-Z0-9]*unknown[a-zA-Z0-9]*\b', '', line, flags=re.IGNORECASE) + # Remove standalone floating-point numbers (response times like 0.093, not version strings like v1.8.1) + line = re.sub(r'(? 50 * 1024 * 1024: # 50MB limit - logger.warning(f" Skipping large log file {log_file} ({file_size / 1024 / 1024:.1f}MB)") + logger.warn(f" Skipping large log file {log_file} ({file_size / 1024 / 1024:.1f}MB)") continue except Exception as e: - logger.warning(f" Could not check file size for {log_file}: {e}") + logger.warn(f" Could not check file size for {log_file}: {e}") continue with open(log_file, "r", encoding="utf-8") as lf: @@ -1081,14 +1093,14 @@ def _scan_logs_for_issues_impl(self, log_dir: str, workload_type: str, workload_ for line_num, line in enumerate(log_lines, 1): if pattern.search(line): # Apply exclude patterns from JSON before collecting matches - excluded = False + line_excluded = False for exclude_pattern in exclude_patterns: if exclude_pattern.search(line): - excluded = True + line_excluded = True logger.debug(f"Line excluded by pattern {exclude_pattern.pattern}: {line.strip()[:100]}...") break - if not excluded: + if not line_excluded: # Extract timestamp from the matching log line log_timestamp = self._extract_timestamp_from_log_line(line.strip()) @@ -1302,7 +1314,7 @@ def analyze_log_anomalies(self, log_dir: str, workload_type: 
str, workload_name: with open(pods_json_path, "r", encoding="utf-8") as f: pods_data = json.load(f) except Exception as e: - logger.warning(f"Error reading pods JSON: {e}") + logger.warning(f"Error reading pods JSON: {e}") return {"issues": [], "summary": ["No pods data found for anomaly analysis."]} issues_json = {"issues": [], "summary": []} @@ -1310,20 +1322,22 @@ def analyze_log_anomalies(self, log_dir: str, workload_type: str, workload_name: logger.info(f"Scanning logs for frequent log anomalies in {workload_type}/{workload_name} in namespace {namespace}...") + excluded = excluded_containers or [] for pod in pods: logger.info(f"Processing Pod: {pod}") pod_obj = next((p for p in pods_data if p["metadata"]["name"] == pod), None) if not pod_obj: continue - containers = [c["name"] for c in pod_obj["spec"]["containers"]] + all_containers = [c["name"] for c in pod_obj["spec"]["containers"]] + containers = [c for c in all_containers if c not in excluded] for container in containers: logger.info(f" Processing Container: {container}") log_file = log_path / f"{workload_type}_{workload_name}_logs" / f"{pod}_{container}_logs.txt" if not log_file.is_file(): - logger.warning(f" Warning: No log file found at {log_file}") + logger.warning(f" Warning: No log file found at {log_file}") continue with open(log_file, "r", encoding="utf-8") as lf: @@ -1331,7 +1345,6 @@ def analyze_log_anomalies(self, log_dir: str, workload_type: str, workload_name: if not log_content.strip(): continue - # logger.error(f"Hrithvika: {log_content}") # Count occurrences of repeating log messages log_lines = log_content.split('\n') @@ -1777,7 +1790,7 @@ def cleanup_temp_files(self): self.temp_dir = None logger.info("Cleaned up temporary log analysis files") except Exception as e: - logger.warning(f"Failed to cleanup temporary files: {str(e)}") + logger.warning(f"Failed to cleanup temporary files: {str(e)}") @keyword def extract_timestamp_from_line(self, log_data: str) -> str: