From e5bf595d3324f4b54601b75a815f148b1970329b Mon Sep 17 00:00:00 2001 From: liuxiaoyu Date: Thu, 26 Mar 2026 22:27:31 +0800 Subject: [PATCH 1/5] fix ci --- .github/workflows/pxf-ci.yml | 8 ++-- .../pxf-cbdb-dev/common/script/entrypoint.sh | 11 +++++ ci/singlecluster/Dockerfile | 44 ++++++++++++------- server/gradlew-install.sh | 22 +++++++--- 4 files changed, 59 insertions(+), 26 deletions(-) diff --git a/.github/workflows/pxf-ci.yml b/.github/workflows/pxf-ci.yml index 1195d060..28c6c316 100644 --- a/.github/workflows/pxf-ci.yml +++ b/.github/workflows/pxf-ci.yml @@ -364,8 +364,8 @@ jobs: FAILED_COUNT="${{ steps.collect_artifacts.outputs.failed_count || 0 }}" SKIPPED_COUNT="${{ steps.collect_artifacts.outputs.skipped_count || 0 }}" - if [ "${{ steps.run_test.outcome }}" == "failure" ] || [ "$FAILED_COUNT" -gt 0 ]; then - echo "Test group ${{ matrix.test_group }} failed (Failures: $FAILED_COUNT, Skipped: $SKIPPED_COUNT)" + if [ "${{ steps.run_test.outcome }}" == "failure" ] || [ "${{ steps.run_test.outcome }}" == "skipped" ] || [ "$FAILED_COUNT" -gt 0 ]; then + echo "Test group ${{ matrix.test_group }} failed (outcome: ${{ steps.run_test.outcome }}, Failures: $FAILED_COUNT, Skipped: $SKIPPED_COUNT)" exit 1 fi @@ -536,8 +536,8 @@ jobs: FAILED_COUNT="${{ steps.collect_artifacts.outputs.failed_count || 0 }}" SKIPPED_COUNT="${{ steps.collect_artifacts.outputs.skipped_count || 0 }}" - if [ "${{ steps.run_test.outcome }}" == "failure" ] || [ "$FAILED_COUNT" -gt 0 ]; then - echo "Test group ${{ matrix.test_group }} (Rocky 9) failed (Failures: $FAILED_COUNT, Skipped: $SKIPPED_COUNT)" + if [ "${{ steps.run_test.outcome }}" == "failure" ] || [ "${{ steps.run_test.outcome }}" == "skipped" ] || [ "$FAILED_COUNT" -gt 0 ]; then + echo "Test group ${{ matrix.test_group }} (Rocky 9) failed (outcome: ${{ steps.run_test.outcome }}, Failures: $FAILED_COUNT, Skipped: $SKIPPED_COUNT)" exit 1 fi diff --git a/ci/docker/pxf-cbdb-dev/common/script/entrypoint.sh b/ci/docker/pxf-cbdb-dev/common/script/entrypoint.sh index 832e5067..02419354 100755 --- a/ci/docker/pxf-cbdb-dev/common/script/entrypoint.sh +++ b/ci/docker/pxf-cbdb-dev/common/script/entrypoint.sh @@ -429,6 +429,17 @@ wait_for_datanode() { log "Attempting to restart DataNode..." # Stop any zombie DataNode processes pkill -f "proc_datanode" 2>/dev/null || true + pkill -f "datanode" 2>/dev/null || true + sleep 2 + # Kill any process still holding DataNode ports (50010/50020/50075) + for port in 50010 50020 50075; do + local pid + pid=$(ss -tlnp "sport = :${port}" 2>/dev/null | grep -oP 'pid=\K[0-9]+' | head -1) + if [ -n "${pid}" ]; then + log "Killing process ${pid} holding port ${port}" + kill -9 "${pid}" 2>/dev/null || true + fi + done sleep 2 # Restart DataNode via the singlecluster script "${GPHD_ROOT}/bin/hadoop-datanode.sh" start 0 2>&1 || true diff --git a/ci/singlecluster/Dockerfile b/ci/singlecluster/Dockerfile index 4d6bb655..08041491 100644 --- a/ci/singlecluster/Dockerfile +++ b/ci/singlecluster/Dockerfile @@ -50,16 +50,8 @@ ENV ZOOKEEPER_SHA512="0e5a64713abc6f36d961dd61a06f681868171a9d9228366e512a013248 ENV HBASE_SHA512="1032521025660daa70260cdc931f52a26c87596be444451fe1fa88b526ede55e9d6b4220e91ff6f7422bec11f30d64fa6745e95a9c36971fdb1a264a2c745693" ENV TEZ_SHA512="a2d94bd9fa778d42a8bac9d9da8e263e469ddfef93968b06434716554995f490231de5607541ac236e770aa0158b64250c38bc1cd57dbfa629fea705f2ffa2f5" -# faster mirror: -ENV APACHE_MIRROR="repo.huaweicloud.com/apache" -#ENV APACHE_MIRROR="archive.apache.org/dist/" -#ENV APACHE_MIRROR="mirror.yandex.ru/mirrors/apache/" - -ENV HADOOP_URL="https://$APACHE_MIRROR/hadoop/common/hadoop-$HADOOP_VERSION/hadoop-$HADOOP_VERSION.tar.gz" -ENV HIVE_URL="https://$APACHE_MIRROR/hive/hive-$HIVE_VERSION/apache-hive-$HIVE_VERSION-bin.tar.gz" -ENV ZOOKEEPER_URL="https://$APACHE_MIRROR/zookeeper/zookeeper-$ZOOKEEPER_VERSION/apache-zookeeper-$ZOOKEEPER_VERSION-bin.tar.gz" -ENV HBASE_URL="https://$APACHE_MIRROR/hbase/$HBASE_VERSION/hbase-$HBASE_VERSION-bin.tar.gz" -ENV TEZ_URL="https://$APACHE_MIRROR/tez/$TEZ_VERSION/apache-tez-$TEZ_VERSION-bin.tar.gz" +# Mirror list: try fast mirrors first, fall back to official archive +ENV APACHE_MIRRORS="repo.huaweicloud.com/apache archive.apache.org/dist" ENV GPHD_ROOT=/home/gpadmin/workspace/singlecluster ENV HADOOP_ROOT=$GPHD_ROOT/hadoop @@ -68,34 +60,54 @@ ENV HIVE_ROOT=$GPHD_ROOT/hive ENV ZOOKEEPER_ROOT=$GPHD_ROOT/zookeeper ENV TEZ_ROOT=$GPHD_ROOT/tez +# Helper: download from first working mirror with retry +# Usage: apache_download +RUN cat > /usr/local/bin/apache_download.sh <<'DLEOF' && chmod +x /usr/local/bin/apache_download.sh +#!/bin/bash +set -e +rel_path="$1"; output="$2" +for mirror in $APACHE_MIRRORS; do + url="https://${mirror}/${rel_path}" + echo "Trying: $url" + if curl -fSL --retry 2 --retry-delay 3 --connect-timeout 15 "$url" -o "$output" 2>&1; then + echo "Downloaded from $mirror" + exit 0 + fi + echo "Failed from $mirror, trying next..." + rm -f "$output" +done +echo "ERROR: all mirrors failed for $rel_path" +exit 1 +DLEOF + RUN mkdir -p $HADOOP_ROOT && \ - curl -fSL "$HADOOP_URL" -o /tmp/hadoop.tar.gz && \ + apache_download.sh "hadoop/common/hadoop-$HADOOP_VERSION/hadoop-$HADOOP_VERSION.tar.gz" /tmp/hadoop.tar.gz && \ echo "$HADOOP_SHA512 /tmp/hadoop.tar.gz" | sha512sum -c && \ tar xvf /tmp/hadoop.tar.gz -C $HADOOP_ROOT --strip-components 1 --exclude="share/doc/*" --exclude="*-sources.jar" && \ rm /tmp/hadoop.tar.gz && \ - curl -fSL "https://repo1.maven.org/maven2/javax/activation/javax.activation-api/1.2.0/javax.activation-api-1.2.0.jar" \ + curl -fSL --retry 2 "https://repo1.maven.org/maven2/javax/activation/javax.activation-api/1.2.0/javax.activation-api-1.2.0.jar" \ -o $HADOOP_ROOT/share/hadoop/common/lib/javax.activation-api-1.2.0.jar RUN mkdir -p $HIVE_ROOT && \ - curl -fSL $HIVE_URL -o /tmp/hive.tar.gz && \ + apache_download.sh "hive/hive-$HIVE_VERSION/apache-hive-$HIVE_VERSION-bin.tar.gz" /tmp/hive.tar.gz && \ echo "$HIVE_SHA256 /tmp/hive.tar.gz" | sha256sum -c && \ tar xvf /tmp/hive.tar.gz -C $HIVE_ROOT --strip-components 1 && \ rm /tmp/hive.tar.gz RUN mkdir -p $ZOOKEEPER_ROOT && \ - curl -fSL $ZOOKEEPER_URL -o /tmp/zookeeper.tar.gz && \ + apache_download.sh "zookeeper/zookeeper-$ZOOKEEPER_VERSION/apache-zookeeper-$ZOOKEEPER_VERSION-bin.tar.gz" /tmp/zookeeper.tar.gz && \ echo "$ZOOKEEPER_SHA512 /tmp/zookeeper.tar.gz" | sha512sum -c && \ tar xvf /tmp/zookeeper.tar.gz -C $ZOOKEEPER_ROOT --strip-components 1 --exclude="docs/*" && \ rm /tmp/zookeeper.tar.gz RUN mkdir -p $HBASE_ROOT && \ - curl -fSL "$HBASE_URL" -o /tmp/hbase.tar.gz && \ + apache_download.sh "hbase/$HBASE_VERSION/hbase-$HBASE_VERSION-bin.tar.gz" /tmp/hbase.tar.gz && \ echo "$HBASE_SHA512 /tmp/hbase.tar.gz" | sha512sum -c && \ tar xvf /tmp/hbase.tar.gz -C $HBASE_ROOT --strip-components 1 --exclude="docs/*" --exclude="lib/*-tests.jar" --exclude="lib/shaded-clients" && \ rm /tmp/hbase.tar.gz RUN mkdir -p $TEZ_ROOT && \ - curl -fSL "$TEZ_URL" -o /tmp/tez.tar.gz && \ + apache_download.sh "tez/$TEZ_VERSION/apache-tez-$TEZ_VERSION-bin.tar.gz" /tmp/tez.tar.gz && \ echo "$TEZ_SHA512 /tmp/tez.tar.gz" | sha512sum -c && \ tar xvf /tmp/tez.tar.gz -C $TEZ_ROOT --strip-components 1 && \ rm /tmp/tez.tar.gz diff --git a/server/gradlew-install.sh b/server/gradlew-install.sh index 510fa2ad..71dc0c70 100755 --- a/server/gradlew-install.sh +++ b/server/gradlew-install.sh @@ -58,13 +58,23 @@ if [ ! -e "${GRADLE_WRAPPER_JAR}" ]; then # The Gradle version extracted from the `distributionUrl` property does not contain ".0" patch # versions. Need to append a ".0" in that case to download the wrapper jar. GRADLE_VERSION="$(echo "$GRADLE_DIST_VERSION" | sed 's/^\([0-9]*[.][0-9]*\)$/\1.0/')" - curl --location --output "${GRADLE_WRAPPER_JAR}" https://raw.githubusercontent.com/gradle/gradle/v${GRADLE_VERSION}/gradle/wrapper/gradle-wrapper.jar || exit 1 - JAR_CHECKSUM="$(${SHASUM} "${GRADLE_WRAPPER_JAR}" | cut -d\ -f1)" EXPECTED="$(cat "${GRADLE_WRAPPER_SHA256}")" - if [ "${JAR_CHECKSUM}" != "${EXPECTED}" ]; then - # If the (just downloaded) checksum and the downloaded wrapper jar do not match, something - # really bad is going on. + MAX_RETRIES=3 + for _retry in $(seq 1 ${MAX_RETRIES}); do + curl --location --fail --output "${GRADLE_WRAPPER_JAR}" https://raw.githubusercontent.com/gradle/gradle/v${GRADLE_VERSION}/gradle/wrapper/gradle-wrapper.jar || { + echo "Download attempt ${_retry}/${MAX_RETRIES} failed (curl error)" > /dev/stderr + rm -f "${GRADLE_WRAPPER_JAR}" + if [ "${_retry}" -lt "${MAX_RETRIES}" ]; then sleep 5; continue; fi + exit 1 + } + JAR_CHECKSUM="$(${SHASUM} "${GRADLE_WRAPPER_JAR}" | cut -d\ -f1)" + if [ "${JAR_CHECKSUM}" = "${EXPECTED}" ]; then + break + fi + echo "SHA256 mismatch on attempt ${_retry}/${MAX_RETRIES} (got ${JAR_CHECKSUM}, expected ${EXPECTED})" > /dev/stderr + rm -f "${GRADLE_WRAPPER_JAR}" + if [ "${_retry}" -lt "${MAX_RETRIES}" ]; then sleep 5; continue; fi echo "Expected sha256 of the downloaded gradle-wrapper.jar does not match the downloaded sha256!" > /dev/stderr exit 1 - fi + done fi From 8acabecafcc550f531caa81719004a58a77875f8 Mon Sep 17 00:00:00 2001 From: liuxiaoyu Date: Fri, 27 Mar 2026 09:47:58 +0800 Subject: [PATCH 2/5] fix --- ci/singlecluster/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/singlecluster/Dockerfile b/ci/singlecluster/Dockerfile index 08041491..b8b682e2 100644 --- a/ci/singlecluster/Dockerfile +++ b/ci/singlecluster/Dockerfile @@ -62,7 +62,7 @@ ENV TEZ_ROOT=$GPHD_ROOT/tez # Helper: download from first working mirror with retry # Usage: apache_download -RUN cat > /usr/local/bin/apache_download.sh <<'DLEOF' && chmod +x /usr/local/bin/apache_download.sh +RUN sudo tee /usr/local/bin/apache_download.sh > /dev/null <<'DLEOF' && sudo chmod +x /usr/local/bin/apache_download.sh #!/bin/bash set -e rel_path="$1"; output="$2" From 01d5b268820683400d765d2e15d2c8da64c2b8ea Mon Sep 17 00:00:00 2001 From: liuxiaoyu Date: Fri, 27 Mar 2026 14:24:48 +0800 Subject: [PATCH 3/5] fix --- .../pxf-cbdb-dev/common/script/entrypoint.sh | 26 +++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/ci/docker/pxf-cbdb-dev/common/script/entrypoint.sh b/ci/docker/pxf-cbdb-dev/common/script/entrypoint.sh index 02419354..0a83fc4e 100755 --- a/ci/docker/pxf-cbdb-dev/common/script/entrypoint.sh +++ b/ci/docker/pxf-cbdb-dev/common/script/entrypoint.sh @@ -451,6 +451,31 @@ wait_for_datanode() { die "HDFS DataNode failed to start after ${max_attempts} attempts. Tez upload will fail without a running DataNode." } +wait_for_hbase() { + log "waiting for HBase RegionServer to become available..." + local max_wait=60 + for i in $(seq 1 ${max_wait}); do + if pgrep -f HRegionServer >/dev/null 2>&1; then + log "HBase RegionServer is running (after ${i}s)" + return 0 + fi + sleep 1 + done + # RegionServer didn't come up; try restarting HBase once + log "HBase RegionServer not found after ${max_wait}s, attempting restart..." + ${GPHD_ROOT}/bin/stop-hbase.sh 2>/dev/null || true + sleep 2 + ${GPHD_ROOT}/bin/start-hbase.sh 2>/dev/null || true + for i in $(seq 1 60); do + if pgrep -f HRegionServer >/dev/null 2>&1; then + log "HBase RegionServer is running after restart (after ${i}s)" + return 0 + fi + sleep 1 + done + die "HBase RegionServer failed to start after restart" +} + prepare_hadoop_stack() { log "prepare Hadoop/Hive/HBase stack" export JAVA_HOME="${JAVA_HADOOP}" @@ -493,6 +518,7 @@ prepare_hadoop_stack() { if ! ${GPHD_ROOT}/bin/start-hbase.sh; then log "start-hbase.sh returned non-zero (services may already be running), continue" fi + wait_for_hbase start_hive_services } From 52e04d2857cf2299ad2db7df6616f6a72d63122d Mon Sep 17 00:00:00 2001 From: liuxiaoyu Date: Fri, 27 Mar 2026 15:28:39 +0800 Subject: [PATCH 4/5] fix --- ci/docker/pxf-cbdb-dev/common/script/entrypoint.sh | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/ci/docker/pxf-cbdb-dev/common/script/entrypoint.sh b/ci/docker/pxf-cbdb-dev/common/script/entrypoint.sh index 0a83fc4e..cd2c5954 100755 --- a/ci/docker/pxf-cbdb-dev/common/script/entrypoint.sh +++ b/ci/docker/pxf-cbdb-dev/common/script/entrypoint.sh @@ -65,6 +65,10 @@ setup_locale_and_packages() { sudo locale-gen en_US.UTF-8 ru_RU.CP1251 ru_RU.UTF-8 sudo update-locale LANG=en_US.UTF-8 else + # Disable broken repos that may exist in the base image (e.g. hpc-common) + for repo in hpc-common; do + sudo dnf config-manager --set-disabled "$repo" 2>/dev/null || true + done sudo dnf install -y wget maven unzip openssh-server iproute sudo \ java-11-openjdk-headless java-1.8.0-openjdk-headless \ glibc-langpack-en glibc-locale-source @@ -440,7 +444,15 @@ wait_for_datanode() { kill -9 "${pid}" 2>/dev/null || true fi done - sleep 2 + sleep 5 + # Verify ports are actually released before restarting + for port in 50010 50020 50075; do + if ss -tlnp "sport = :${port}" 2>/dev/null | grep -q "LISTEN"; then + log "Port ${port} still in use, waiting..." + sleep 5 + break + fi + done # Restart DataNode via the singlecluster script "${GPHD_ROOT}/bin/hadoop-datanode.sh" start 0 2>&1 || true "${HADOOP_ROOT}/sbin/hadoop-daemon.sh" --config "${GPHD_ROOT}/storage/hadoop/datanode0/etc/hadoop" start datanode 2>&1 || true From d53ca88a8d57376184232b4a581703976e2045be Mon Sep 17 00:00:00 2001 From: liuxiaoyu Date: Fri, 27 Mar 2026 19:00:42 +0800 Subject: [PATCH 5/5] fix --- .../pxf-cbdb-dev/common/script/entrypoint.sh | 6 ++-- .../pxf-cbdb-dev/common/script/run_tests.sh | 31 +++++++++++++++++-- 2 files changed, 31 insertions(+), 6 deletions(-) diff --git a/ci/docker/pxf-cbdb-dev/common/script/entrypoint.sh b/ci/docker/pxf-cbdb-dev/common/script/entrypoint.sh index cd2c5954..42e98c2b 100755 --- a/ci/docker/pxf-cbdb-dev/common/script/entrypoint.sh +++ b/ci/docker/pxf-cbdb-dev/common/script/entrypoint.sh @@ -435,8 +435,8 @@ wait_for_datanode() { pkill -f "proc_datanode" 2>/dev/null || true pkill -f "datanode" 2>/dev/null || true sleep 2 - # Kill any process still holding DataNode ports (50010/50020/50075) - for port in 50010 50020 50075; do + # Kill any process still holding DataNode ports (50010/50020/50075/50080) + for port in 50010 50020 50075 50080; do local pid pid=$(ss -tlnp "sport = :${port}" 2>/dev/null | grep -oP 'pid=\K[0-9]+' | head -1) if [ -n "${pid}" ]; then @@ -446,7 +446,7 @@ wait_for_datanode() { done sleep 5 # Verify ports are actually released before restarting - for port in 50010 50020 50075; do + for port in 50010 50020 50075 50080; do if ss -tlnp "sport = :${port}" 2>/dev/null | grep -q "LISTEN"; then log "Port ${port} still in use, waiting..." sleep 5 diff --git a/ci/docker/pxf-cbdb-dev/common/script/run_tests.sh b/ci/docker/pxf-cbdb-dev/common/script/run_tests.sh index 63b99352..0be51fea 100755 --- a/ci/docker/pxf-cbdb-dev/common/script/run_tests.sh +++ b/ci/docker/pxf-cbdb-dev/common/script/run_tests.sh @@ -90,6 +90,28 @@ health_check_with_retry() { fi } +mvn_with_retry() { + local max_attempts=3 + for attempt in $(seq 1 ${max_attempts}); do + if mvn "$@"; then + return 0 + fi + if [ "${attempt}" -lt "${max_attempts}" ]; then + echo "[run_tests] Maven failed (attempt ${attempt}/${max_attempts}), retrying in 10s..." + sleep 10 + fi + done + echo "[run_tests] Maven failed after ${max_attempts} attempts" + return 1 +} + +resolve_maven_dependencies() { + echo "[run_tests] Pre-resolving Maven dependencies..." + pushd "${REPO_ROOT}/automation" >/dev/null + mvn_with_retry -B -q dependency:resolve -DskipTests 2>&1 || echo "[warn] Maven dependency resolution failed, tests may fail" + popd >/dev/null +} + cleanup_hdfs_test_data() { hdfs dfs -rm -r -f /gpdb-ud-scratch/tmp/pxf_automation_data >/dev/null 2>&1 || true } @@ -526,7 +548,7 @@ ensure_testplugin_jar() { export PXF_HOME=${PXF_HOME:-/usr/local/pxf} if [ ! -f "${PXF_BASE}/lib/pxf-automation-test.jar" ]; then pushd "${REPO_ROOT}/automation" >/dev/null - mvn -q -DskipTests test-compile + mvn_with_retry -q -DskipTests test-compile jar cf "${PXF_BASE}/lib/pxf-automation-test.jar" -C target/classes org/apache/cloudberry/pxf/automation/testplugin popd >/dev/null JAVA_HOME="${JAVA_BUILD}" "${PXF_HOME}/bin/pxf" restart >/dev/null || true @@ -853,10 +875,13 @@ generate_test_summary() { run_single_group() { local group="$1" echo "[run_tests] Running single test group: $group" - + + # Pre-resolve Maven dependencies with retry for transient network failures + resolve_maven_dependencies + # Run health check first health_check_with_retry - + ensure_testuser_pg_hba export PGHOST=127.0.0.1 export PATH="${GPHOME}/bin:${PATH}"