Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions .github/workflows/pxf-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -364,8 +364,8 @@ jobs:
FAILED_COUNT="${{ steps.collect_artifacts.outputs.failed_count || 0 }}"
SKIPPED_COUNT="${{ steps.collect_artifacts.outputs.skipped_count || 0 }}"

if [ "${{ steps.run_test.outcome }}" == "failure" ] || [ "$FAILED_COUNT" -gt 0 ]; then
echo "Test group ${{ matrix.test_group }} failed (Failures: $FAILED_COUNT, Skipped: $SKIPPED_COUNT)"
if [ "${{ steps.run_test.outcome }}" == "failure" ] || [ "${{ steps.run_test.outcome }}" == "skipped" ] || [ "$FAILED_COUNT" -gt 0 ]; then
echo "Test group ${{ matrix.test_group }} failed (outcome: ${{ steps.run_test.outcome }}, Failures: $FAILED_COUNT, Skipped: $SKIPPED_COUNT)"
exit 1
fi

Expand Down Expand Up @@ -536,8 +536,8 @@ jobs:
FAILED_COUNT="${{ steps.collect_artifacts.outputs.failed_count || 0 }}"
SKIPPED_COUNT="${{ steps.collect_artifacts.outputs.skipped_count || 0 }}"

if [ "${{ steps.run_test.outcome }}" == "failure" ] || [ "$FAILED_COUNT" -gt 0 ]; then
echo "Test group ${{ matrix.test_group }} (Rocky 9) failed (Failures: $FAILED_COUNT, Skipped: $SKIPPED_COUNT)"
if [ "${{ steps.run_test.outcome }}" == "failure" ] || [ "${{ steps.run_test.outcome }}" == "skipped" ] || [ "$FAILED_COUNT" -gt 0 ]; then
echo "Test group ${{ matrix.test_group }} (Rocky 9) failed (outcome: ${{ steps.run_test.outcome }}, Failures: $FAILED_COUNT, Skipped: $SKIPPED_COUNT)"
exit 1
fi

Expand Down
49 changes: 49 additions & 0 deletions ci/docker/pxf-cbdb-dev/common/script/entrypoint.sh
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,10 @@ setup_locale_and_packages() {
sudo locale-gen en_US.UTF-8 ru_RU.CP1251 ru_RU.UTF-8
sudo update-locale LANG=en_US.UTF-8
else
# Disable broken repos that may exist in the base image (e.g. hpc-common)
for repo in hpc-common; do
sudo dnf config-manager --set-disabled "$repo" 2>/dev/null || true
done
sudo dnf install -y wget maven unzip openssh-server iproute sudo \
java-11-openjdk-headless java-1.8.0-openjdk-headless \
glibc-langpack-en glibc-locale-source
Expand Down Expand Up @@ -429,7 +433,26 @@ wait_for_datanode() {
log "Attempting to restart DataNode..."
# Stop any zombie DataNode processes
pkill -f "proc_datanode" 2>/dev/null || true
pkill -f "datanode" 2>/dev/null || true
sleep 2
# Kill any process still holding DataNode ports (50010/50020/50075/50080)
for port in 50010 50020 50075 50080; do
local pid
pid=$(ss -tlnp "sport = :${port}" 2>/dev/null | grep -oP 'pid=\K[0-9]+' | head -1)
if [ -n "${pid}" ]; then
log "Killing process ${pid} holding port ${port}"
kill -9 "${pid}" 2>/dev/null || true
fi
done
sleep 5
# Verify ports are actually released before restarting
for port in 50010 50020 50075 50080; do
if ss -tlnp "sport = :${port}" 2>/dev/null | grep -q "LISTEN"; then
log "Port ${port} still in use, waiting..."
sleep 5
break
fi
done
# Restart DataNode via the singlecluster script
"${GPHD_ROOT}/bin/hadoop-datanode.sh" start 0 2>&1 || true
"${HADOOP_ROOT}/sbin/hadoop-daemon.sh" --config "${GPHD_ROOT}/storage/hadoop/datanode0/etc/hadoop" start datanode 2>&1 || true
Expand All @@ -440,6 +463,31 @@ wait_for_datanode() {
die "HDFS DataNode failed to start after ${max_attempts} attempts. Tez upload will fail without a running DataNode."
}

# Wait until an HBase HRegionServer process is visible; if it never appears
# within the timeout, restart HBase once via the singlecluster scripts and
# wait again. Calls die (fatal) if the RegionServer is still absent.
# Globals: GPHD_ROOT (read) - root of the singlecluster installation.
# Returns: 0 once a RegionServer process is found; does not return on failure.
wait_for_hbase() {
  log "waiting for HBase RegionServer to become available..."
  local max_wait=60
  if _poll_hregionserver "${max_wait}" "is running"; then
    return 0
  fi
  # RegionServer didn't come up; try restarting HBase once
  log "HBase RegionServer not found after ${max_wait}s, attempting restart..."
  # Paths quoted in case GPHD_ROOT ever contains spaces; failures tolerated
  # because stop/start may legitimately complain if services are half-up.
  "${GPHD_ROOT}/bin/stop-hbase.sh" 2>/dev/null || true
  sleep 2
  "${GPHD_ROOT}/bin/start-hbase.sh" 2>/dev/null || true
  if _poll_hregionserver "${max_wait}" "is running after restart"; then
    return 0
  fi
  die "HBase RegionServer failed to start after restart"
}

# Poll once per second (up to $1 seconds) for an HRegionServer process.
# $2 - phrase spliced into the success log message (keeps messages identical
#      to the pre-refactor ones).
# Returns: 0 when found, 1 on timeout.
_poll_hregionserver() {
  local timeout=$1 phrase=$2 i
  for i in $(seq 1 "${timeout}"); do
    if pgrep -f HRegionServer >/dev/null 2>&1; then
      log "HBase RegionServer ${phrase} (after ${i}s)"
      return 0
    fi
    sleep 1
  done
  return 1
}

prepare_hadoop_stack() {
log "prepare Hadoop/Hive/HBase stack"
export JAVA_HOME="${JAVA_HADOOP}"
Expand Down Expand Up @@ -482,6 +530,7 @@ prepare_hadoop_stack() {
if ! ${GPHD_ROOT}/bin/start-hbase.sh; then
log "start-hbase.sh returned non-zero (services may already be running), continue"
fi
wait_for_hbase
start_hive_services
}

Expand Down
31 changes: 28 additions & 3 deletions ci/docker/pxf-cbdb-dev/common/script/run_tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,28 @@ health_check_with_retry() {
fi
}

# Run mvn, retrying up to 3 times to ride out transient network/repo failures.
# Arguments: forwarded verbatim to mvn.
# Outputs:   mvn output plus a retry/failure note on stdout.
# Returns:   0 as soon as one attempt succeeds, 1 if all attempts fail.
mvn_with_retry() {
  local max_attempts=3
  local attempt   # was leaking into the caller's scope before
  for attempt in $(seq 1 "${max_attempts}"); do
    if mvn "$@"; then
      return 0
    fi
    if [ "${attempt}" -lt "${max_attempts}" ]; then
      echo "[run_tests] Maven failed (attempt ${attempt}/${max_attempts}), retrying in 10s..."
      sleep 10
    fi
  done
  echo "[run_tests] Maven failed after ${max_attempts} attempts"
  return 1
}

# Warm the local Maven repository for the automation module up front, so
# transient download failures surface here (where mvn_with_retry retries)
# instead of mid-test. Resolution failure is logged but never fatal.
resolve_maven_dependencies() {
  echo "[run_tests] Pre-resolving Maven dependencies..."
  pushd "${REPO_ROOT}/automation" >/dev/null
  if ! mvn_with_retry -B -q dependency:resolve -DskipTests 2>&1; then
    echo "[warn] Maven dependency resolution failed, tests may fail"
  fi
  popd >/dev/null
}

# Best-effort removal of leftover PXF automation scratch data from HDFS.
# Never fails: missing paths and an unreachable HDFS are both tolerated.
cleanup_hdfs_test_data() {
  local scratch_dir="/gpdb-ud-scratch/tmp/pxf_automation_data"
  hdfs dfs -rm -r -f "${scratch_dir}" >/dev/null 2>&1 || true
}
Expand Down Expand Up @@ -526,7 +548,7 @@ ensure_testplugin_jar() {
export PXF_HOME=${PXF_HOME:-/usr/local/pxf}
if [ ! -f "${PXF_BASE}/lib/pxf-automation-test.jar" ]; then
pushd "${REPO_ROOT}/automation" >/dev/null
mvn -q -DskipTests test-compile
mvn_with_retry -q -DskipTests test-compile
jar cf "${PXF_BASE}/lib/pxf-automation-test.jar" -C target/classes org/apache/cloudberry/pxf/automation/testplugin
popd >/dev/null
JAVA_HOME="${JAVA_BUILD}" "${PXF_HOME}/bin/pxf" restart >/dev/null || true
Expand Down Expand Up @@ -853,10 +875,13 @@ generate_test_summary() {
run_single_group() {
local group="$1"
echo "[run_tests] Running single test group: $group"


# Pre-resolve Maven dependencies with retry for transient network failures
resolve_maven_dependencies

# Run health check first
health_check_with_retry

ensure_testuser_pg_hba
export PGHOST=127.0.0.1
export PATH="${GPHOME}/bin:${PATH}"
Expand Down
44 changes: 28 additions & 16 deletions ci/singlecluster/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -50,16 +50,8 @@ ENV ZOOKEEPER_SHA512="0e5a64713abc6f36d961dd61a06f681868171a9d9228366e512a013248
ENV HBASE_SHA512="1032521025660daa70260cdc931f52a26c87596be444451fe1fa88b526ede55e9d6b4220e91ff6f7422bec11f30d64fa6745e95a9c36971fdb1a264a2c745693"
ENV TEZ_SHA512="a2d94bd9fa778d42a8bac9d9da8e263e469ddfef93968b06434716554995f490231de5607541ac236e770aa0158b64250c38bc1cd57dbfa629fea705f2ffa2f5"

# faster mirror:
ENV APACHE_MIRROR="repo.huaweicloud.com/apache"
#ENV APACHE_MIRROR="archive.apache.org/dist/"
#ENV APACHE_MIRROR="mirror.yandex.ru/mirrors/apache/"

ENV HADOOP_URL="https://$APACHE_MIRROR/hadoop/common/hadoop-$HADOOP_VERSION/hadoop-$HADOOP_VERSION.tar.gz"
ENV HIVE_URL="https://$APACHE_MIRROR/hive/hive-$HIVE_VERSION/apache-hive-$HIVE_VERSION-bin.tar.gz"
ENV ZOOKEEPER_URL="https://$APACHE_MIRROR/zookeeper/zookeeper-$ZOOKEEPER_VERSION/apache-zookeeper-$ZOOKEEPER_VERSION-bin.tar.gz"
ENV HBASE_URL="https://$APACHE_MIRROR/hbase/$HBASE_VERSION/hbase-$HBASE_VERSION-bin.tar.gz"
ENV TEZ_URL="https://$APACHE_MIRROR/tez/$TEZ_VERSION/apache-tez-$TEZ_VERSION-bin.tar.gz"
# Mirror list: try fast mirrors first, fall back to official archive
ENV APACHE_MIRRORS="repo.huaweicloud.com/apache archive.apache.org/dist"
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we can cache the singlecluster image in the GitHub Actions cache for 7 days, so we would not see this issue often.


ENV GPHD_ROOT=/home/gpadmin/workspace/singlecluster
ENV HADOOP_ROOT=$GPHD_ROOT/hadoop
Expand All @@ -68,34 +60,54 @@ ENV HIVE_ROOT=$GPHD_ROOT/hive
ENV ZOOKEEPER_ROOT=$GPHD_ROOT/zookeeper
ENV TEZ_ROOT=$GPHD_ROOT/tez

# Helper: download from first working mirror with retry
# Usage: apache_download <relative_path> <output_file>
# Installed via a quoted heredoc ('DLEOF'), so nothing is expanded at build
# time; $APACHE_MIRRORS is read from the container environment (ENV above)
# when the script actually runs.
RUN sudo tee /usr/local/bin/apache_download.sh > /dev/null <<'DLEOF' && sudo chmod +x /usr/local/bin/apache_download.sh
#!/bin/bash
set -e
rel_path="$1"; output="$2"
# Unquoted on purpose: APACHE_MIRRORS is a space-separated mirror list.
for mirror in $APACHE_MIRRORS; do
url="https://${mirror}/${rel_path}"
echo "Trying: $url"
# -f: fail on HTTP errors; 2 extra curl-level retries per mirror before
# falling through to the next mirror in the list.
if curl -fSL --retry 2 --retry-delay 3 --connect-timeout 15 "$url" -o "$output" 2>&1; then
echo "Downloaded from $mirror"
exit 0
fi
echo "Failed from $mirror, trying next..."
# Drop any partial download so a later checksum step can't see stale bytes.
rm -f "$output"
done
echo "ERROR: all mirrors failed for $rel_path"
exit 1
DLEOF

RUN mkdir -p $HADOOP_ROOT && \
curl -fSL "$HADOOP_URL" -o /tmp/hadoop.tar.gz && \
apache_download.sh "hadoop/common/hadoop-$HADOOP_VERSION/hadoop-$HADOOP_VERSION.tar.gz" /tmp/hadoop.tar.gz && \
echo "$HADOOP_SHA512 /tmp/hadoop.tar.gz" | sha512sum -c && \
tar xvf /tmp/hadoop.tar.gz -C $HADOOP_ROOT --strip-components 1 --exclude="share/doc/*" --exclude="*-sources.jar" && \
rm /tmp/hadoop.tar.gz && \
curl -fSL "https://repo1.maven.org/maven2/javax/activation/javax.activation-api/1.2.0/javax.activation-api-1.2.0.jar" \
curl -fSL --retry 2 "https://repo1.maven.org/maven2/javax/activation/javax.activation-api/1.2.0/javax.activation-api-1.2.0.jar" \
-o $HADOOP_ROOT/share/hadoop/common/lib/javax.activation-api-1.2.0.jar

RUN mkdir -p $HIVE_ROOT && \
curl -fSL $HIVE_URL -o /tmp/hive.tar.gz && \
apache_download.sh "hive/hive-$HIVE_VERSION/apache-hive-$HIVE_VERSION-bin.tar.gz" /tmp/hive.tar.gz && \
echo "$HIVE_SHA256 /tmp/hive.tar.gz" | sha256sum -c && \
tar xvf /tmp/hive.tar.gz -C $HIVE_ROOT --strip-components 1 && \
rm /tmp/hive.tar.gz

RUN mkdir -p $ZOOKEEPER_ROOT && \
curl -fSL $ZOOKEEPER_URL -o /tmp/zookeeper.tar.gz && \
apache_download.sh "zookeeper/zookeeper-$ZOOKEEPER_VERSION/apache-zookeeper-$ZOOKEEPER_VERSION-bin.tar.gz" /tmp/zookeeper.tar.gz && \
echo "$ZOOKEEPER_SHA512 /tmp/zookeeper.tar.gz" | sha512sum -c && \
tar xvf /tmp/zookeeper.tar.gz -C $ZOOKEEPER_ROOT --strip-components 1 --exclude="docs/*" && \
rm /tmp/zookeeper.tar.gz

RUN mkdir -p $HBASE_ROOT && \
curl -fSL "$HBASE_URL" -o /tmp/hbase.tar.gz && \
apache_download.sh "hbase/$HBASE_VERSION/hbase-$HBASE_VERSION-bin.tar.gz" /tmp/hbase.tar.gz && \
echo "$HBASE_SHA512 /tmp/hbase.tar.gz" | sha512sum -c && \
tar xvf /tmp/hbase.tar.gz -C $HBASE_ROOT --strip-components 1 --exclude="docs/*" --exclude="lib/*-tests.jar" --exclude="lib/shaded-clients" && \
rm /tmp/hbase.tar.gz

RUN mkdir -p $TEZ_ROOT && \
curl -fSL "$TEZ_URL" -o /tmp/tez.tar.gz && \
apache_download.sh "tez/$TEZ_VERSION/apache-tez-$TEZ_VERSION-bin.tar.gz" /tmp/tez.tar.gz && \
echo "$TEZ_SHA512 /tmp/tez.tar.gz" | sha512sum -c && \
tar xvf /tmp/tez.tar.gz -C $TEZ_ROOT --strip-components 1 && \
rm /tmp/tez.tar.gz
Expand Down
22 changes: 16 additions & 6 deletions server/gradlew-install.sh
Original file line number Diff line number Diff line change
Expand Up @@ -58,13 +58,23 @@ if [ ! -e "${GRADLE_WRAPPER_JAR}" ]; then
# The Gradle version extracted from the `distributionUrl` property does not contain ".0" patch
# versions. Need to append a ".0" in that case to download the wrapper jar.
GRADLE_VERSION="$(echo "$GRADLE_DIST_VERSION" | sed 's/^\([0-9]*[.][0-9]*\)$/\1.0/')"
curl --location --output "${GRADLE_WRAPPER_JAR}" https://raw.githubusercontent.com/gradle/gradle/v${GRADLE_VERSION}/gradle/wrapper/gradle-wrapper.jar || exit 1
JAR_CHECKSUM="$(${SHASUM} "${GRADLE_WRAPPER_JAR}" | cut -d\ -f1)"
EXPECTED="$(cat "${GRADLE_WRAPPER_SHA256}")"
if [ "${JAR_CHECKSUM}" != "${EXPECTED}" ]; then
# If the (just downloaded) checksum and the downloaded wrapper jar do not match, something
# really bad is going on.
MAX_RETRIES=3
for _retry in $(seq 1 ${MAX_RETRIES}); do
curl --location --fail --output "${GRADLE_WRAPPER_JAR}" https://raw.githubusercontent.com/gradle/gradle/v${GRADLE_VERSION}/gradle/wrapper/gradle-wrapper.jar || {
echo "Download attempt ${_retry}/${MAX_RETRIES} failed (curl error)" > /dev/stderr
rm -f "${GRADLE_WRAPPER_JAR}"
if [ "${_retry}" -lt "${MAX_RETRIES}" ]; then sleep 5; continue; fi
exit 1
}
JAR_CHECKSUM="$(${SHASUM} "${GRADLE_WRAPPER_JAR}" | cut -d\ -f1)"
if [ "${JAR_CHECKSUM}" = "${EXPECTED}" ]; then
break
fi
echo "SHA256 mismatch on attempt ${_retry}/${MAX_RETRIES} (got ${JAR_CHECKSUM}, expected ${EXPECTED})" > /dev/stderr
rm -f "${GRADLE_WRAPPER_JAR}"
if [ "${_retry}" -lt "${MAX_RETRIES}" ]; then sleep 5; continue; fi
echo "Expected sha256 of the downloaded gradle-wrapper.jar does not match the downloaded sha256!" > /dev/stderr
exit 1
fi
done
fi
Loading