diff --git a/.github/workflows/buildkit.yml b/.github/workflows/buildkit.yml index 7d646bb1d..b1f8b9c1e 100644 --- a/.github/workflows/buildkit.yml +++ b/.github/workflows/buildkit.yml @@ -10,7 +10,7 @@ on: workflow_dispatch: push: branches: - - 'master' + - 'main' - 'v[0-9]+.[0-9]+' tags: - 'v*' @@ -23,8 +23,8 @@ on: env: GO_VERSION: "1.21" SETUP_BUILDX_VERSION: "v0.14.1" # TODO(jhorsts): replace with upstream - SETUP_BUILDKIT_IMAGE: "moby/buildkit:latest" - IMAGE_NAME: "moby/buildkit" + SETUP_BUILDKIT_IMAGE: "earthbuild/buildkit:latest" + IMAGE_NAME: "earthbuild/buildkit" PLATFORMS: "linux/amd64,linux/arm/v7,linux/arm64,linux/s390x,linux/ppc64le,linux/riscv64" DESTDIR: "./bin" @@ -182,6 +182,48 @@ jobs: # CACHE_FROM: type=gha,scope=image${{ matrix.target-stage }} # CACHE_TO: type=gha,scope=image${{ matrix.target-stage }} + earthbuild-image: + runs-on: ubuntu-22.04 + needs: + - prepare + - image + if: needs.prepare.outputs.push == 'push' + permissions: + packages: write + steps: + - + name: Checkout + uses: actions/checkout@v4 + - + name: Set up QEMU + uses: docker/setup-qemu-action@v3 + - + name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + with: + version: ${{ env.SETUP_BUILDX_VERSION }} + driver-opts: image=${{ env.SETUP_BUILDKIT_IMAGE }} + buildkitd-flags: --debug + - + name: Login to GHCR + uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + - + name: Build and push earthbuild buildkitd + uses: docker/build-push-action@v6 + with: + context: . 
+          file: Dockerfile.earthbuild
+          platforms: linux/amd64,linux/arm64
+          push: true
+          build-args: |
+            BASE_TAG=${{ needs.prepare.outputs.tag }}
+          tags: |
+            ghcr.io/earthbuild/buildkit:${{ needs.prepare.outputs.tag }}
+
   release:
     runs-on: ubuntu-22.04
     needs:
diff --git a/Dockerfile.earthbuild b/Dockerfile.earthbuild
new file mode 100644
index 000000000..f3e7bb8d9
--- /dev/null
+++ b/Dockerfile.earthbuild
@@ -0,0 +1,73 @@
+# Builds the full earthbuild buildkitd image on top of the base buildkit image.
+# This adds earthbuild-specific tooling, config templates, debugger, and entrypoint.
+ARG BASE_TAG=latest
+
+# Build the earth_debugger from earthbuild2 source.
+FROM golang:1.25-alpine AS debugger
+RUN apk add --no-cache git
+# Branch, tag, or full commit SHA of EarthBuild/earthbuild to build the debugger from.
+ARG EARTHBUILD_SHA=main
+WORKDIR /src
+# `git clone --branch` accepts only branch and tag names, so fetch the requested
+# ref explicitly; this also works when EARTHBUILD_SHA is a full commit SHA.
+RUN git init -q . && \
+    git fetch --depth 1 https://github.com/EarthBuild/earthbuild.git "${EARTHBUILD_SHA}" && \
+    git checkout -q FETCH_HEAD
+RUN CGO_ENABLED=0 go build \
+    -tags netgo -installsuffix netgo \
+    -o /earth_debugger \
+    cmd/debugger/*.go
+
+FROM earthbuild/buildkit:${BASE_TAG}
+
+RUN echo "@edge-community https://dl-cdn.alpinelinux.org/alpine/edge/community" >> /etc/apk/repositories
+RUN apk add --update --no-cache \
+    cni-plugins@edge-community \
+    gettext \
+    git-lfs \
+    iptables \
+    jq \
+    openssh-client \
+    pigz \
+    util-linux \
+    xz
+
+# Add github, gitlab, and bitbucket to known hosts.
+# NOTE(review): the Bitbucket entry below is keyed on "bitbucket.com", while Bitbucket's
+# SSH remotes normally use "bitbucket.org" - confirm the host name and key are current.
+RUN mkdir -p ~/.ssh && \ + echo "github.com ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIOMqqnkVzrm0SdG6UOoqKLsabgH5C9okWi0dh2l9GKJl" >> ~/.ssh/known_hosts && \ + echo "github.com ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABgQCj7ndNxQowgcQnjshcLrqPEiiphnt+VTTvDP6mHBL9j1aNUkY4Ue1gvwnGLVlOhGeYrnZaMgRK6+PKCUXaDbC7qtbW8gIkhL7aGCsOr/C56SJMy/BCZfxd1nWzAOxSDPgVsmerOBYfNqltV9/hWCqBywINIR+5dIg6JTJ72pcEpEjcYgXkE2YEFXV1JHnsKgbLWNlhScqb2UmyRkQyytRLtL+38TGxkxCflmO+5Z8CSSNY7GidjMIZ7Q4zMjA2n1nGrlTDkzwDCsw+wqFPGQA179cnfGWOWRVruj16z6XyvxvjJwbz0wQZ75XK5tKSb7FNyeIEs4TT4jk+S4dhPeAUC5y+bDYirYgM4GC7uEnztnZyaVWQ7B381AK4Qdrwt51ZqExKbQpTUNn+EjqoTwvqNj4kqx5QUCI0ThS/YkOxJCXmPUWZbhjpCg56i+2aB6CmK2JGhn57K5mj0MNdBXA4/WnwH6XoPWJzK5Nyu2zB3nAZp+S5hpQs+p1vN1/wsjk=" >> ~/.ssh/known_hosts && \ + echo "github.com ecdsa-sha2-nistp256 AAAAE2VjZHNhLXNoYTItbmlzdHAyNTYAAAAIbmlzdHAyNTYAAABBBEmKSENjQEezOmxkZMy7opKgwFB9nkt5YRrYMjNuG5N87uRgg6CLrbo5wAdT/y6v0mKV0U2w0WZ2YB/++Tpockg=" >> ~/.ssh/known_hosts && \ + echo "gitlab.com ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQCsj2bNKTBSpIYDEGk9KxsGh3mySTRgMtXL583qmBpzeQ+jqCMRgBqB98u3z++J1sKlXHWfM9dyhSevkMwSbhoR8XIq/U0tCNyokEi/ueaBMCvbcTHhO7FcwzY92WK4Yt0aGROY5qX2UKSeOvuP4D6TPqKF1onrSzH9bx9XUf2lEdWT/ia1NEKjunUqu1xOB/StKDHMoX4/OKyIzuS0q/T1zOATthvasJFoPrAjkohTyaDUz2LN5JoH839hViyEG82yB+MjcFV5MU3N1l1QL3cVUCh93xSaua1N85qivl+siMkPGbO5xR/En4iEY6K2XPASUEMaieWVNTRCtJ4S8H+9" >> ~/.ssh/known_hosts && \ + echo "gitlab.com ecdsa-sha2-nistp256 AAAAE2VjZHNhLXNoYTItbmlzdHAyNTYAAAAIbmlzdHAyNTYAAABBBFSMqzJeV9rUzU4kWitGjeR4PWSa29SPqJ1fVkhtj3Hw9xjLVXVYrU9QlYWrOLXBpQ6KWjbjTDTdDkoohFzgbEY=" >> ~/.ssh/known_hosts && \ + echo "gitlab.com ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIAfuCHKVTjquxvt6CM6tdG4SLp1Btn/nOeHHE5UOzRdf" >> ~/.ssh/known_hosts && \ + echo "bitbucket.com ssh-rsa 
AAAAB3NzaC1yc2EAAAABIwAAAQEAubiN81eDcafrgMeLzaFPsw2kNvEcqTKl/VqLat/MaB33pZy0y3rJZtnqwR2qOOvbwKZYKiEO1O6VqNEBxKvJJelCq0dTXWT5pbO2gDXC6h6QDXCaHo6pOHGPUy+YBaGQRGuSusMEASYiWunYN0vCAI8QaXnWMXNMdFP3jHAJH0eDsoiGnLPBlBp4TNm6rYI74nMzgz3B9IikW4WVK+dc8KZJZWYjAuORU3jc1c/NPskD2ASinf8v3xnfXeukU0sJ5N6m5E8VLjObPEO+mN2t/FZTMZLiFqPWc/ALSqnMnnhwrNi2rbfg/rd/IpL8Le3pSBne8+seeFVBoGqzHM9yXw==" >> ~/.ssh/known_hosts + +# Config templates and scripts. +COPY earthbuild/entrypoint.sh /usr/bin/entrypoint.sh +COPY earthbuild/buildkitd.toml.template /etc/buildkitd.toml.template +COPY earthbuild/buildkitd.cache.template /etc/buildkitd.cache.template +COPY earthbuild/buildkitd.cacheduration.template /etc/buildkitd.cacheduration.template +COPY earthbuild/buildkitd.tcp.template /etc/buildkitd.tcp.template +COPY earthbuild/buildkitd.pprof.template /etc/buildkitd.pprof.template +COPY earthbuild/buildkitd.tls.template /etc/buildkitd.tls.template +COPY earthbuild/cni-conf.json.template /etc/cni/cni-conf.json.template +COPY --from=debugger /earth_debugger /usr/bin/earth_debugger +COPY earthbuild/dockerd-wrapper.sh /var/earthbuild/dockerd-wrapper.sh +COPY earthbuild/docker-auto-install.sh /var/earthbuild/docker-auto-install.sh +COPY earthbuild/oom-adjust.sh.template /bin/oom-adjust.sh.template +COPY earthbuild/runc-ps /bin/runc-ps + +ENV EARTHLY_RESET_TMP_DIR=false +ENV EARTHLY_TMP_DIR=/tmp/earthbuild +ENV BUILDKIT_DEBUG=false +ENV BUILDKIT_MAX_PARALLELISM=20 +ENV BUILDKIT_LOCAL_REGISTRY_LISTEN_PORT=8371 +ENV BUILDKIT_STEP_LOG_MAX_SIZE=8388608 +ENV CACHE_SIZE_MB=0 +ENV CACHE_SIZE_PCT=0 +ENV NETWORK_MODE=cni +ENV EARTHLY_CACHE_VERSION="2" +VOLUME /tmp/earthbuild +ENTRYPOINT ["/usr/bin/entrypoint.sh", "buildkitd", "--config=/etc/buildkitd.toml"] diff --git a/earthbuild/buildkitd.cache.template b/earthbuild/buildkitd.cache.template new file mode 100644 index 000000000..e92a0434e --- /dev/null +++ b/earthbuild/buildkitd.cache.template @@ -0,0 +1,11 @@ + # Please note the required indentation to fit in 
buildkitd.toml.template accordingly. + + # gckeepstorage sets storage limit for default gc profile, in MB. + gckeepstorage = ${CACHE_SIZE_MB} + + [[worker.oci.gcpolicy]] + keepBytes = ${SOURCE_FILE_KEEP_BYTES} + filters = [ "type==source.local", "type==source.git.checkout"] + [[worker.oci.gcpolicy]] + all = true + keepBytes = ${CATCH_ALL_KEEP_BYTES} diff --git a/earthbuild/buildkitd.cacheduration.template b/earthbuild/buildkitd.cacheduration.template new file mode 100644 index 000000000..af3378a53 --- /dev/null +++ b/earthbuild/buildkitd.cacheduration.template @@ -0,0 +1,5 @@ + # Please note the required indentation to fit in buildkitd.toml.template accordingly. + + [[worker.oci.gcpolicy]] + all = true + keepDuration = ${CACHE_KEEP_DURATION} diff --git a/earthbuild/buildkitd.pprof.template b/earthbuild/buildkitd.pprof.template new file mode 100644 index 000000000..572e18866 --- /dev/null +++ b/earthbuild/buildkitd.pprof.template @@ -0,0 +1 @@ + debugAddress = "0.0.0.0:6060" diff --git a/earthbuild/buildkitd.tcp.template b/earthbuild/buildkitd.tcp.template new file mode 100644 index 000000000..97ef3df9e --- /dev/null +++ b/earthbuild/buildkitd.tcp.template @@ -0,0 +1,2 @@ +[grpc] + address = [ "tcp://0.0.0.0:8372" ] diff --git a/earthbuild/buildkitd.tls.template b/earthbuild/buildkitd.tls.template new file mode 100644 index 000000000..74a40f8a0 --- /dev/null +++ b/earthbuild/buildkitd.tls.template @@ -0,0 +1,6 @@ +# Please note the required indentation to fit in buildkitd.toml.template accordingly. 
+
+  [grpc.tls]
+    cert = "/etc/cert.pem"
+    key = "/etc/key.pem"
+    ca = "/etc/ca.pem"
diff --git a/earthbuild/buildkitd.toml.template b/earthbuild/buildkitd.toml.template
new file mode 100644
index 000000000..8cb8d4798
--- /dev/null
+++ b/earthbuild/buildkitd.toml.template
@@ -0,0 +1,20 @@
+debug = ${BUILDKIT_DEBUG}
+root = "${BUILDKIT_ROOT_DIR}"
+insecure-entitlements = [ "security.insecure" ]
+
+${TCP_TRANSPORT}
+${PPROF_SETTINGS}
+${TLS_ENABLED}
+
+[worker.oci]
+  enabled = true
+  snapshotter = "auto"
+  max-parallelism = ${BUILDKIT_MAX_PARALLELISM}
+  gc = true
+  networkMode = "${NETWORK_MODE}"
+  cniBinaryPath = "/usr/libexec/cni"
+  cniConfigPath = "/etc/cni/cni-conf.json"
+  ${CACHE_DURATION_SETTINGS}
+  ${CACHE_SETTINGS}
+
+${EARTHLY_ADDITIONAL_BUILDKIT_CONFIG}
diff --git a/earthbuild/cni-conf.json.template b/earthbuild/cni-conf.json.template
new file mode 100644
index 000000000..cab225236
--- /dev/null
+++ b/earthbuild/cni-conf.json.template
@@ -0,0 +1,16 @@
+{
+  "cniVersion": "0.3.0",
+  "name": "buildkitbuild",
+  "type": "bridge",
+  "bridge": "cni0",
+  "isGateway": true,
+  "ipMasq": true,
+  "mtu": ${CNI_MTU},
+  "ipam": {
+    "type": "host-local",
+    "subnet": "172.30.0.0/16",
+    "routes": [
+      { "dst": "0.0.0.0/0" }
+    ]
+  }
+}
diff --git a/earthbuild/docker-auto-install.sh b/earthbuild/docker-auto-install.sh
new file mode 100755
index 000000000..1f818dc89
--- /dev/null
+++ b/earthbuild/docker-auto-install.sh
@@ -0,0 +1,258 @@
+#!/bin/sh
+
+set -eu
+
+distro=$(. /etc/os-release && echo "$ID")
+DOCKER_VERSION="${DOCKER_VERSION:-}"
+
+# Succeeds when a dockerd binary is found on PATH.
+detect_dockerd() {
+    set +e
+    command -v dockerd >/dev/null
+    has_d="$?"
+    set -e
+    return "$has_d"
+}
+
+# Succeeds when a standalone docker-compose binary is found on PATH.
+detect_docker_compose() {
+    set +e
+    command -v docker-compose >/dev/null
+    has_dc="$?"
+    set -e
+    return "$has_dc"
+}
+
+# Prints the compose invocation to use: "docker-compose" when that binary exists,
+# otherwise "docker compose" when `docker help` lists a compose subcommand.
+# Prints an error to stderr and returns 1 when neither is available.
+detect_docker_compose_cmd() {
+    if command -v docker-compose >/dev/null; then
+        echo "docker-compose"
+        return 0
+    fi
+    if docker help | grep -w compose >/dev/null; then
+        echo "docker compose"
+        return 0
+    fi
+    echo >&2 "failed to detect docker compose / docker-compose command"
+    return 1
+}
+
+# Succeeds when a jq binary is found on PATH.
+detect_jq() {
+    set +e
+    command -v jq >/dev/null
+    has_jq="$?"
+    set -e
+    return "$has_jq"
+}
+
+# Echoes its arguments only when EARTHLY_DEBUG is "true" (the variable may be unset).
+print_debug() {
+    set +u
+    if [ "$EARTHLY_DEBUG" = "true" ] ; then
+        echo "$@"
+    fi
+    set -u
+}
+
+# Succeeds when VERSION_ID in /etc/os-release denotes Alpine 3.18 or newer.
+# Exits the whole script with status 1 when VERSION_ID is empty.
+detect_alpine_3_18_or_newer() {
+    VERSION="$(. /etc/os-release && echo "$VERSION_ID")"
+    if [ -z "$VERSION" ]; then
+        echo >&2 "Error: unable to detect alpine version"
+        exit 1
+    fi
+    MAJOR="$(echo "$VERSION" | awk -F. '{print $1}')"
+    # Keep only the leading digits so a suffixed minor (e.g. "22_alpha2025") still compares numerically.
+    MINOR="$(echo "$VERSION" | awk -F. '{print $2}' | sed 's/[^0-9].*$//')"
+    # Any major above 3 is newer than 3.18 regardless of its minor.
+    if [ "$MAJOR" -gt 3 ]; then
+        return 0
+    fi
+    if [ "$MAJOR" -lt 3 ] || [ "${MINOR:-0}" -lt 18 ]; then
+        return 1
+    fi
+    return 0
+}
+
+# Installs Docker Compose: via apk on Alpine, otherwise by downloading a release
+# binary for the detected architecture to /usr/local/bin/docker-compose.
+install_docker_compose() {
+    case "$distro" in
+        alpine)
+            if detect_alpine_3_18_or_newer; then
+                apk add --update --no-cache docker-cli-compose
+            else
+                apk add --update --no-cache docker-compose
+            fi
+            ;;
+        *)
+            echo "Detected architecture is $(uname -m)"
+            # curl -f: fail on HTTP errors rather than saving an error page as the binary.
+            case "$(uname -m)" in
+                armv7l|armhf)
+                    # renovate: datasource=github-releases packageName=linuxserver/docker-docker-compose
+                    curl -fL "https://github.com/linuxserver/docker-docker-compose/releases/download/1.27.4-ls27/docker-compose-armhf" -o /usr/local/bin/docker-compose
+                    ;;
+                arm64|aarch64)
+                    # renovate: datasource=github-releases packageName=linuxserver/docker-docker-compose
+                    curl -fL "https://github.com/linuxserver/docker-docker-compose/releases/download/1.27.4-ls27/docker-compose-arm64" -o /usr/local/bin/docker-compose
+                    ;;
+                *)
+                    # renovate: datasource=github-releases packageName=docker/compose
+                    curl -fL "https://github.com/docker/compose/releases/download/1.27.4/docker-compose-$(uname -s)-$(uname -m)" -o /usr/local/bin/docker-compose
+                    ;;
+            esac
+            chmod +x /usr/local/bin/docker-compose
+            ;;
+    
esac +} + +install_dockerd() { + case "$distro" in + alpine) + if [ -n "$DOCKER_VERSION" ]; then + apk add --update --no-cache docker="$DOCKER_VERSION" + else + apk add --update --no-cache docker + fi + ;; + + amzn) + install_dockerd_amazon + ;; + + ubuntu) + install_dockerd_debian_like + ;; + + debian) + install_dockerd_debian_like + ;; + + *) + echo "Warning: Distribution $distro not yet supported for Docker-in-EarthBuild." + echo "Will attempt to treat like Debian." + echo "If you would like this distribution to be supported, please open a GitHub issue: https://github.com/EarthBuild/earthbuild/issues" + install_dockerd_debian_like + ;; + esac +} + +apt_update_done="false" +apt_get_update() { + if [ "$apt_update_done" != "true" ]; then + apt-get update + apt_update_done=true + fi +} + +install_docker_apt_repo() { + apt-get install --no-install-recommends -y \ + apt-transport-https \ + ca-certificates \ + curl \ + gpg + install -m 0755 -d /etc/apt/keyrings + curl -fsSL "https://download.docker.com/linux/$distro/gpg" | gpg --no-tty --dearmor -o /etc/apt/keyrings/docker.gpg + chmod a+r /etc/apt/keyrings/docker.gpg + echo \ + "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.gpg] https://download.docker.com/linux/$distro \ + $(. /etc/os-release && echo "$VERSION_CODENAME") stable" | \ + tee /etc/apt/sources.list.d/docker.list > /dev/null +} + +install_dockerd_debian_like() { + export DEBIAN_FRONTEND=noninteractive + apt-get remove -y docker docker-engine docker.io containerd runc || true + apt_get_update + install_docker_apt_repo + apt-get update # dont use apt_get_update since we must update the newly added apt repo + if [ -n "$DOCKER_VERSION" ]; then + apt-get install -y docker-ce="$DOCKER_VERSION" docker-ce-cli="$DOCKER_VERSION" + else + apt-get install -y docker-ce docker-ce-cli + fi + apt-get install -y containerd.io +} + +install_dockerd_amazon() { + version=$(. 
/etc/os-release && echo "$VERSION") + case "$version" in + 2023) + dnf update -y + dnf install -y docker libxcrypt-compat + ;; + 2) + yes | amazon-linux-extras install docker + ;; + *) + echo "Warning: Amazon Linux $version not yet supported for Docker-in-EarthBuild." + echo "Will attempt to treat like Fedora." + dnf update -y + dnf install -y docker + ;; + esac +} + +install_jq() { + case "$distro" in + alpine) + apk add --update --no-cache jq + ;; + + amzn) + yum -y install jq + ;; + + *) + export DEBIAN_FRONTEND=noninteractive + apt_get_update + apt-get install -y jq + ;; + esac +} + +if [ "$(id -u)" != 0 ]; then + echo "Warning: Docker-in-Earthly needs to be run as root user" +fi + +if ! detect_jq; then + echo "jq is missing. Attempting to install automatically." + install_jq +fi + +if ! detect_dockerd; then + echo "Docker Engine is missing. Attempting to install automatically." + install_dockerd + echo "Docker Engine was missing. It has been installed automatically by Earthly." + dockerd --version + echo "For better use of cache, try using the official earthbuild/dind image for WITH DOCKER." +else + print_debug "dockerd already installed" +fi + +set +u +if [ "$EARTHLY_START_COMPOSE" = "true" ] || [ "$EARTHLY_START_COMPOSE" = "" ]; then + set -u + set +e; + docker_compose="$(detect_docker_compose_cmd)" + set -e + if [ -z "$docker_compose" ]; then + echo "Docker Compose is missing. Attempting to install automatically." + install_docker_compose + + docker_compose="$(detect_docker_compose_cmd)" + echo "Docker Compose was missing. It has been installed automatically by Earthly." + + $docker_compose --version + echo "For better use of cache, try using the official earthbuild/dind image for WITH DOCKER." 
+ else + print_debug "docker-compose already installed" + fi +else + print_debug "docker-compose not needed" +fi diff --git a/earthbuild/dockerd-wrapper.sh b/earthbuild/dockerd-wrapper.sh new file mode 100755 index 000000000..eb179f3ba --- /dev/null +++ b/earthbuild/dockerd-wrapper.sh @@ -0,0 +1,414 @@ +#!/bin/sh +set -eu + +EARTHLY_DOCKERD_CACHE_DATA=${EARTHLY_DOCKERD_CACHE_DATA:-"false"} + +EARTHLY_DOCKER_WRAPPER_DEBUG=${EARTHLY_DOCKER_WRAPPER_DEBUG:-''} +if [ "$EARTHLY_DOCKER_WRAPPER_DEBUG" = "1" ]; then + echo "enabling docker wrapper debug mode" + set -x +fi + +# This host is used to pull images from the embedded BuildKit Docker registry. +buildkit_docker_registry='172.30.0.1:8371' + +# used to prefix images that are persisted to the WITH DOCKER cache +earthly_cached_docker_image_prefix="earthly_cached_" + +detect_docker_compose_cmd() { + if command -v docker-compose >/dev/null; then + echo "docker-compose" + return 0 + fi + if docker help | grep -w compose >/dev/null; then + echo "docker compose" + return 0 + fi + echo >&2 "failed to detect docker compose / docker-compose command" + return 1 +} + +# Runs docker-compose with the right -f flags. 
+docker_compose_cmd() { + compose_file_flags="" + for f in $EARTHLY_COMPOSE_FILES; do + compose_file_flags="$compose_file_flags -f $f" + done + export COMPOSE_HTTP_TIMEOUT=600 + docker_compose="$(detect_docker_compose_cmd)" + export COMPOSE_PROJECT_NAME="default" # newer versions of docker fail if this is not set; older versions used "default" when it was not set + # shellcheck disable=SC2086 + $docker_compose $compose_file_flags "$@" +} + +write_compose_config() { + mkdir -p /tmp/earthbuild + docker_compose_cmd config >/tmp/earthbuild/compose-config.yml +} + +execute() { + if [ -z "$EARTHLY_DOCKERD_DATA_ROOT" ]; then + echo "EARTHLY_DOCKERD_DATA_ROOT not set" + exit 1 + fi + mkdir -p "$EARTHLY_DOCKERD_DATA_ROOT" + + EARTHLY_FLOCK_AQUIRED=${EARTHLY_FLOCK_AQUIRED:-''} + + if [ -f "/sys/fs/cgroup/cgroup.controllers" ] && [ -z "$EARTHLY_FLOCK_AQUIRED" ]; then + if [ "$EARTHLY_DOCKER_WRAPPER_DEBUG" = "1" ]; then + echo >&2 "detected cgroups v2" + fi + + # move script to separate cgroup, to prevent the root cgroup from becoming threaded (which will prevent systemd images (e.g. kind) from running) + mkdir /sys/fs/cgroup/dockerd-wrapper + echo $$ > /sys/fs/cgroup/dockerd-wrapper/cgroup.procs + + # earthly wraps dockerd-wrapper.sh with a call via /bin/sh -c '....' 
+ # so we also need to move the parent pid into this new group, which is weird + # TODO: we should unwrap this so $$ is all we need to move + echo 1 > /sys/fs/cgroup/dockerd-wrapper/cgroup.procs + + if [ "$(wc -l < /sys/fs/cgroup/cgroup.procs)" != "0" ]; then + echo >&2 "warning: processes exist in the root cgroup; this may cause errors during cgroup initialization" + fi + + root_cgroup_type="$(cat /sys/fs/cgroup/cgroup.type)" + if [ "$root_cgroup_type" != "domain" ]; then + echo >&2 "WARNING: expected cgroup type of \"domain\", but got \"$root_cgroup_type\" instead" + fi + fi + + if [ "$EARTHLY_DOCKERD_CACHE_DATA" = "true" ] && [ -z "$EARTHLY_FLOCK_AQUIRED" ]; then + FLOCK_PATH="$EARTHLY_DOCKERD_DATA_ROOT/.earthly-docker-lock" + echo "aquiring flock for $FLOCK_PATH" + export EARTHLY_FLOCK_AQUIRED="true" + # dockerd-wrapper.sh will be recursively called once the lock is aquired + flock "$FLOCK_PATH" "$0" "$@" + exit 0 + fi + + # Sometimes, when dockerd starts containerd, it doesn't come up in time. This timeout is not configurable from + # dockerd, therefore we retry... since most instances of this timeout seem to be related to networking or scheduling + # when many WITH DOCKER commands are also running. Logs are printed for each failure. + for i in 1 2 3 4 5; do + if start_dockerd; then + break + else + if [ "$i" = 5 ]; then + # Exiting here on the last retry maintains prior behavior of exiting when this cant start. + exit 1 + fi + + if grep -q "^failed to start containerd: timeout waiting for containerd to start$" /var/log/docker.log; then + # This error is the sentinel string for retrying to start dockerd. + echo "Attempting to restart dockerd (attempt $i), since the error may be transient..." + sleep 5 + else + # If the logs do not contain this, then fail fast to maintain prior behavior. 
+ exit 1 + fi + fi + done + + if [ "$EARTHLY_DOCKERD_CACHE_DATA" = "true" ]; then + clean_leftover_docker_objects + + # rename existing tags, so we can track which ones get re-tagged + for img in $(docker images -q); do + docker tag "$img" "${earthly_cached_docker_image_prefix}${img}" + done + docker images -a --format '{{.Repository}}:{{.Tag}}' | grep -v "^$earthly_cached_docker_image_prefix" | xargs --no-run-if-empty docker rmi --force + fi + + load_file_images + load_registry_images + + # delete cached images (which weren't re-tagged via the pull) + if [ "$EARTHLY_DOCKERD_CACHE_DATA" = "true" ]; then + docker images -f reference=$earthly_cached_docker_image_prefix'*' --format '{{.Repository}}:{{.Tag}}' | xargs --no-run-if-empty docker rmi --force + docker images -f "dangling=true" -q | xargs --no-run-if-empty docker rmi --force + fi + + if [ "$EARTHLY_START_COMPOSE" = "true" ]; then + # shellcheck disable=SC2086 + docker_compose_cmd up -d $EARTHLY_COMPOSE_SERVICES + fi + + shift + export EARTHLY_WITH_DOCKER=1 + set +e + "$@" + exit_code="$?" + set -e + + if [ "$EARTHLY_START_COMPOSE" = "true" ]; then + docker_compose_cmd down --remove-orphans + fi + stop_dockerd + return "$exit_code" +} + +start_dockerd() { + if [ "$EARTHLY_DOCKERD_CACHE_DATA" = "true" ]; then + data_root="$EARTHLY_DOCKERD_DATA_ROOT" + else + data_root=$(TMPDIR="$EARTHLY_DOCKERD_DATA_ROOT/" mktemp -d) + fi + echo "Starting dockerd with data root $data_root" + + if uname -a | grep microsoft-standard-WSL >/dev/null; then + if iptables --version | grep nf_tables >/dev/null; then + echo "WARNING: WSL and iptables-nft may not work; attempting to switch to iptables-legacy" + ln -sf "/sbin/iptables-legacy" /sbin/iptables + fi + fi + + # Use a specific IP range to avoid collision with host dockerd (we need to also connect to host + # docker containers for the debugger). + if ! 
[ -f /etc/docker/daemon.json ]; then
+        mkdir -p /etc/docker
+        echo >/etc/docker/daemon.json '{}'
+    fi
+
+    # compliments of https://stackoverflow.com/a/53666584
+    # this will concatenate arrays found in both the LHS and RHS; default jq will overwrite the LHS with the RHS
+    cat <<'EOF' > /tmp/meld.jq
+def meld(a; b):
+  a as $a | b as $b
+  | if ($a|type) == "object" and ($b|type) == "object"
+    then reduce ([$a,$b]|add|keys_unsorted[]) as $k ({};
+      .[$k] = meld( $a[$k]; $b[$k]) )
+    elif ($a|type) == "array" and ($b|type) == "array"
+    then $a+$b
+    elif $b == null then $a
+    else $b
+    end;
+meld($user; .)
+EOF
+
+    # TODO(jhorsts): enable containerd snapshotter once we have proper support for it.
+    # More https://docs.docker.com/engine/storage/drivers/select-storage-driver/
+    #
+    # Disabling containerd snapshotter is a temporary workaround for ensuring Docker-in-Docker works in EarthBuild
+    # (https://github.com/EarthBuild/earthbuild/issues/195).
+    #
+    # The settings below are piped through meld.jq so they are merged with the pre-existing daemon.json ($user).
+    daemon_data="$(cat /etc/docker/daemon.json)"
+    cat <<EOF | jq -f /tmp/meld.jq --argjson user "$daemon_data" > /etc/docker/daemon.json
+{
+    "default-address-pools" : [
+        {
+            "base" : "172.21.0.0/16",
+            "size" : 24
+        },
+        {
+            "base" : "172.22.0.0/16",
+            "size" : 24
+        }
+    ],
+    "bip": "172.20.0.1/16",
+    "data-root": "$data_root",
+    "insecure-registries" : ["$buildkit_docker_registry"],
+    "registry-mirrors" : ["https://mirror.gcr.io", "https://public.ecr.aws"],
+    "features": {
+        "containerd-snapshotter": false
+    }
+}
+EOF
+
+    # Start with wiping the dir to make sure a previous interrupted build did not leave its state around.
+    wipe_data_root "$data_root"
+    mkdir -p "$data_root"
+    rm -f /var/run/docker.pid
+    dockerd >/var/log/docker.log 2>&1 &
+    dockerd_pid="$!"
+    i=1
+    timeout=300
+    while ! docker ps >/dev/null 2>&1; do
+        sleep 1
+        fail=false
+        if [ "$i" -gt "$timeout" ]; then
+            echo "ERROR: dockerd start timeout (${timeout}s)"
+            fail=true
+        fi
+        if ! 
kill -0 "$dockerd_pid" >/dev/null 2>&1; then + echo "ERROR: dockerd crashed on startup" + fail=true + fi + if [ "$fail" = "true" ]; then + # Print dockerd logs on start failure. + print_dockerd_logs + echo "If you are having trouble running docker, try using the official earthbuild/dind image instead" + return 1 + fi + i=$((i+1)) + done +} + +print_dockerd_logs() { + echo "Architecture: $(uname -m)" + echo "==== Begin dockerd logs ====" + cat /var/log/docker.log || true + echo "==== End dockerd logs ====" +} + +stop_dockerd() { + dockerd_pid="$(cat /var/run/docker.pid)" + timeout=30 + + if [ -n "$dockerd_pid" ]; then + kill "$dockerd_pid" >/dev/null 2>&1 + i=1 + while kill -0 "$dockerd_pid" >/dev/null 2>&1; do + sleep 1 + if [ "$i" -gt "$timeout" ]; then + echo "dockerd did not exit after $timeout seconds, force-exiting" + kill -9 "$dockerd_pid" >/dev/null 2>&1 || true + fi + i=$((i+1)) + done + + # Wait for the PID to exit. This ensures that dockerd cannot keep any files in data root open. + wait "$dockerd_pid" || true + fi + + # Wipe dockerd data when done. + wipe_data_root "$data_root" +} + +wipe_data_root() { + if [ "$EARTHLY_DOCKERD_CACHE_DATA" = "true" ]; then + return 0 + fi + if ! rm -rf "$1" 2>/dev/null >&2 && [ -n "$(ls -A "$1")" ]; then + # We have some issues about failing to delete files. + # If we fail, list the processes keeping it open for results. + rm -rf "$1" || true # Do it again, but now print the error. + echo "==== Begin file lsof info ====" + if ! lsof +D "$1" ; then + echo "Failed to run lsof +D $1. Trying lsof $1" + if ! lsof "$1"; then + echo "Failed to run lsof $1" + fi + fi + echo "==== End file lsof info ====" + echo "==== Begin file ls info ====" + if ! 
ls -Ral "$1"; then + echo "Failed to run ls -Ral $1" + fi + echo "==== End file ls info ====" + echo "" # Add space between above and docker logs + print_dockerd_logs + fi +} + +load_file_images() { + if [ -n "$EARTHLY_DOCKER_LOAD_FILES" ]; then + echo "Loading images from BuildKit via tar files..." + for img in $EARTHLY_DOCKER_LOAD_FILES; do + docker load -i "$img" || (stop_dockerd; exit 1) + done + echo "...done" + fi +} + +get_current_time_ns() { + # Note: busybox does not support date +%s%N; instead we use stat to fetch nanosecond + f="$(mktemp)" + current_time="$(stat -t "$f" | awk '{print $13}')" + current_time_ns="$(stat "$f" | grep Modify | awk '{print $3}' | awk -F . '{print $2}' | grep -o '[1-9].*')" + rm "$f" + + # Note that the current_time_ns must not start with a 0 (which is why there is a grep [1-9]); however + # there's an edge case where current_time_ns="00000000", which would turn into "", so we need to set it back to "0" + if [ "$current_time_ns" = "" ]; then current_time_ns=0; fi + + test -n "$current_time" || (echo "current_time is empty" && exit 1) + test -n "$current_time_ns" || (echo "current_time_ns is empty" && exit 1) + current_time_combined="$((current_time*1000000000+current_time_ns))" + echo "$current_time_combined" +} + +clean_leftover_docker_objects() { + # Kill any existing containers, and prune any resources that may have + # been left behind from a previous execution. + docker container ls --quiet | xargs --no-run-if-empty docker container kill + docker container prune --force + docker volume prune --force + docker network prune --force +} + +load_registry_images() { + EARTHLY_DOCKER_LOAD_REGISTRY=${EARTHLY_DOCKER_LOAD_REGISTRY:-''} + if [ -n "$EARTHLY_DOCKER_LOAD_REGISTRY" ]; then + echo "Loading images from BuildKit via embedded registry..." 
+ + start_time="$(get_current_time_ns)" + bg_processes="" # Initialize the background processes variable + + for img in $EARTHLY_DOCKER_LOAD_REGISTRY; do + case "$img" in + *'|'*) + with_reg="$buildkit_docker_registry/$(printf '%s' "$img" | cut -d'|' -f1)" + user_tag="$(printf '%s' "$img" | cut -d'|' -f2-)" + ;; + *) + # Old format before v0.6.21. + with_reg="$buildkit_docker_registry/$img" + user_tag="$(printf '%s' "$img" | cut -d'/' -f2-)" + echo "Detected old format" + ;; + esac + echo "Pulling $with_reg and retagging as $user_tag" + # Download and tag images in parallel + (docker pull -q "$with_reg" && docker tag "$with_reg" "$user_tag" && docker rmi --force "$with_reg") & + + bg_processes="$bg_processes $!" + + done + + # Wait for all background processes to finish + for pid in $bg_processes; do + wait "$pid" || { + echo "Downloading of images failed" + stop_dockerd + exit 1 + } + done + end_time="$(get_current_time_ns)" + elapsed_ns="$((end_time - start_time))" + elapsed_ms="$((elapsed_ns/1000000))" + echo "Loading images done in ${elapsed_ms} ms" + fi +} + +EARTHLY_DOCKER_WRAPPER_DEBUG_CMD=${EARTHLY_DOCKER_WRAPPER_DEBUG_CMD:-''} +if [ -n "$EARTHLY_DOCKER_WRAPPER_DEBUG_CMD" ]; then + echo "Running debug command: $EARTHLY_DOCKER_WRAPPER_DEBUG_CMD" + eval "$EARTHLY_DOCKER_WRAPPER_DEBUG_CMD" + echo "debug command exited with $?; forcing exit 1 to prevent saving RUN snapshot" + exit 1 +fi + +EARTHLY_DOCKER_WRAPPER_PRE_SCRIPT=${EARTHLY_DOCKER_WRAPPER_PRE_SCRIPT:-"/usr/share/earthly/dockerd-wrapper-pre-script"} +if [ -f "$EARTHLY_DOCKER_WRAPPER_PRE_SCRIPT" ]; then + "$EARTHLY_DOCKER_WRAPPER_PRE_SCRIPT" +fi + +case "$1" in + get-compose-config) + write_compose_config + exit 0 + ;; + + execute) + execute "$@" + exit "$?" 
+        ;;
+
+    *)
+        echo "Invalid command $1"
+        exit 1
+        ;;
+esac
diff --git a/earthbuild/entrypoint.sh b/earthbuild/entrypoint.sh
new file mode 100755
index 000000000..177b25196
--- /dev/null
+++ b/earthbuild/entrypoint.sh
@@ -0,0 +1,360 @@
+#!/bin/sh
+set -e
+
+# Docker 29+ (containerd v2) lowered the default open file limit from 1048576
+# to 1024, which starves buildkitd. Ensure we always have enough.
+ulimit -n 1048576 2>/dev/null || true
+
+echo "starting earthly-buildkit with EARTHLY_GIT_HASH=$EARTHLY_GIT_HASH BUILDKIT_BASE_IMAGE=$BUILDKIT_BASE_IMAGE"
+
+if [ "$BUILDKIT_DEBUG" = "true" ]; then
+    set -x
+fi
+
+# Mandatory settings: abort with a message as soon as one of them is empty.
+if [ -z "$CACHE_SIZE_MB" ]; then
+    echo "CACHE_SIZE_MB not set"
+    exit 1
+fi
+
+if [ -z "$CACHE_SIZE_PCT" ]; then
+    echo "CACHE_SIZE_PCT not set"
+    exit 1
+fi
+
+if [ -z "$BUILDKIT_DEBUG" ]; then
+    echo "BUILDKIT_DEBUG not set"
+    exit 1
+fi
+
+if [ -z "$BUILDKIT_MAX_PARALLELISM" ]; then
+    echo "BUILDKIT_MAX_PARALLELISM not set"
+    exit 1
+fi
+
+if [ -z "$EARTHLY_TMP_DIR" ]; then
+    echo "EARTHLY_TMP_DIR not set"
+    exit 1
+fi
+
+if [ -z "$NETWORK_MODE" ]; then
+    echo "NETWORK_MODE not set"
+    exit 1
+fi
+
+if [ -z "$EARTHLY_CACHE_VERSION" ]; then
+    echo "EARTHLY_CACHE_VERSION not set"
+    exit 1
+fi
+
+# On cgroups v2, log the controllers and warn if the root cgroup type is not "domain".
+if [ -f "/sys/fs/cgroup/cgroup.controllers" ]; then
+    echo "detected cgroups v2; buildkit/entrypoint.sh running under pid=$$ with controllers \"$(cat /sys/fs/cgroup/cgroup.controllers)\" in group $(cat /proc/self/cgroup)"
+    test "$(cat /sys/fs/cgroup/cgroup.type)" = "domain" || (echo >&2 "WARNING: invalid root cgroup type: $(cat /sys/fs/cgroup/cgroup.type)")
+fi
+
+# Compare the cache version recorded in the tmp dir ("0" when no record exists)
+# with EARTHLY_CACHE_VERSION; a mismatch forces the tmp dir to be reset.
+earthly_cache_version_path="${EARTHLY_TMP_DIR}/internal.earthly.version"
+if [ -f "$earthly_cache_version_path" ]; then
+    current_cache_version="$(cat "$earthly_cache_version_path")"
+else
+    current_cache_version="0"
+fi
+if [ "$EARTHLY_CACHE_VERSION" != "$current_cache_version" ]; then
+    EARTHLY_RESET_TMP_DIR="true"
+fi
+
+if [ "$EARTHLY_RESET_TMP_DIR" = "true" ]; then
+    echo "Resetting dir $EARTHLY_TMP_DIR"
+    rm -rf "${EARTHLY_TMP_DIR:?}"/* || true
+    mkdir -p "$EARTHLY_TMP_DIR" # required for eine tests
+    echo "$EARTHLY_CACHE_VERSION" > "$earthly_cache_version_path"
+fi
+
+# Select the iptables backend: use IP_TABLES when provided, otherwise autodetect
+# it from the loaded kernel modules and, failing that, from the tools themselves.
+if [ -z "$IP_TABLES" ]; then
+    echo "Autodetecting iptables"
+
+    if lsmod | grep -wq "^ip_tables"; then
+        echo "Detected iptables-legacy module"
+        IP_TABLES="iptables-legacy"
+
+    elif lsmod | grep -wq "^nf_tables"; then
+        echo "Detected iptables-nft module"
+        IP_TABLES="iptables-nft"
+    else
+        echo "Could not find an ip_tables module; falling back to heuristics."
+
+        # A pipeline reports the status of its last command, so piping straight
+        # into wc would always yield 0. Capture the rules first so that the exit
+        # code really is the one returned by iptables.
+        legacycode=0
+        legacyrules=$(iptables-legacy -t nat -S --wait) || legacycode=$?
+        legacylines=0
+        [ -z "$legacyrules" ] || legacylines=$(printf '%s\n' "$legacyrules" | wc -l)
+
+        nfcode=0
+        nfrules=$(iptables-nft -t nat -S --wait) || nfcode=$?
+        nflines=0
+        [ -z "$nfrules" ] || nflines=$(printf '%s\n' "$nfrules" | wc -l)
+
+        if [ $legacycode -eq 0 ] && [ $nfcode -ne 0 ]; then
+            echo "Detected iptables-legacy by exit code ($legacycode, $nfcode)"
+            IP_TABLES="iptables-legacy"
+
+        elif [ $legacycode -ne 0 ] && [ $nfcode -eq 0 ]; then
+            echo "Detected iptables-nft by exit code ($legacycode, $nfcode)"
+            IP_TABLES="iptables-nft"
+
+        elif [ $legacycode -ne 0 ] && [ $nfcode -ne 0 ]; then
+            echo "iptables-legacy and iptables-nft both exited abnormally ($legacycode, $nfcode). Check your settings and then set the IP_TABLES variable correctly to skip autodetection."
+            exit 1
+
+        elif [ "$legacylines" -ge "$nflines" ]; then
+            # Tie-break goes to legacy, after testing on WSL/Windows
+            echo "Detected iptables-legacy by output length ($legacylines >= $nflines)"
+            IP_TABLES="iptables-legacy"
+
+        else
+            echo "Detected iptables-nft by output length ($legacylines < $nflines)"
+            IP_TABLES="iptables-nft"
+        fi
+    fi
+else
+    echo "Manual iptables specified ($IP_TABLES), skipping autodetection."
+fi
+if [ ! -e "/sbin/$IP_TABLES" ]; then
+    echo "IP_TABLES is set to $IP_TABLES, but /sbin/$IP_TABLES does not exist"
+    exit 1
+fi
+ln -sf "/sbin/$IP_TABLES" /sbin/iptables
+
+# clear any leftovers (that aren't explicitly cached) in the dind dir
+# NOTE(review): this path is hard-coded, whereas the directory created right below
+# is derived from EARTHLY_TMP_DIR - confirm the two are meant to coincide.
+find /tmp/earthbuild/dind/ -maxdepth 1 -mindepth 1 | grep -v cache_ | xargs -r rm -rf
+
+mkdir -p "$EARTHLY_TMP_DIR/dind"
+
+# setup git credentials and config
+# For every consecutive GIT_CREDENTIALS_<n> variable (starting at 0) write an executable
+# /usr/bin/git_credentials_<n> that prints the base64-decoded value of that variable;
+# the first empty variable ends the loop.
+i=0
+while true
+do
+    varname=GIT_CREDENTIALS_"$i"
+    eval data=\$$varname
+    # shellcheck disable=SC2154
+    if [ -n "$data" ]
+    then
+        echo 'echo $'$varname' | base64 -d' >/usr/bin/git_credentials_"$i"
+        chmod +x /usr/bin/git_credentials_"$i"
+    else
+        break
+    fi
+    i=$((i+1))
+done
+echo "$EARTHLY_GIT_CONFIG" | base64 -d >/root/.gitconfig
+
+#Set up CNI
+if [ -z "$CNI_MTU" ]; then
+    # No MTU given: read it from the device of the first default route.
+    device=$(ip route show | grep ^default | head -n 1 | sed 's|.* dev \(\w*\)\s.*|\1|')
+    CNI_MTU=$(cat /sys/class/net/"$device"/mtu)
+    export CNI_MTU
+fi
+# NOTE(review): the redirections of this envsubst call appear to be missing in this
+# copy of the patch - confirm against the original file.
+envsubst /etc/cni/cni-conf.json
+
+# Set up buildkit cache.
+export BUILDKIT_ROOT_DIR="$EARTHLY_TMP_DIR"/buildkit
+mkdir -p "$BUILDKIT_ROOT_DIR"
+CACHE_SETTINGS=
+
+# Length of time (in seconds) to keep cache. Zero is the same as unset to buildkit.
+CACHE_DURATION_SETTINGS=
+if [ -n "$CACHE_KEEP_DURATION" ] && [ "$CACHE_KEEP_DURATION" -gt 0 ]; then # NOTE(review): the rest of this section looks truncated in this copy of the patch - confirm against the original file
+    CACHE_DURATION_SETTINGS="$(envsubst "Total data blocks"
+    # %S -> "Fundamental block size"
+    # -f $EARTHLY_TMP_DIR -> filesystem where directory resides, usually a volume in docker's root directory
+    CALCULATED_CACHE_MB="$(stat -c "%b * %S * ${CACHE_SIZE_PCT} / 100 / 1024 / 1024" -f "$EARTHLY_TMP_DIR" | bc)"
+    if [ -z "$EFFECTIVE_CACHE_SIZE_MB" ]; then
+        EFFECTIVE_CACHE_SIZE_MB="$CALCULATED_CACHE_MB"
+    elif [ "$CALCULATED_CACHE_MB" -lt "$EFFECTIVE_CACHE_SIZE_MB" ]; then
+        echo "clamping cache size to $CALCULATED_CACHE_MB MB (${CACHE_SIZE_PCT}% of filesystem)"
+        EFFECTIVE_CACHE_SIZE_MB="$CALCULATED_CACHE_MB"
+    else
+        # On the off chance the two values are actually equal, there is little value in calling that out specifically.
+        # Even if they are both "30GB", the user likely set "30000", whereas the percentage would likely come out to
+        # be something like "30314" (since we're converting from bytes, a run of consecutive zeroes is unlikely).
+        echo "clamping cache size to fixed size of $EFFECTIVE_CACHE_SIZE_MB MB"
+    fi
+fi
+
+# EFFECTIVE_CACHE_SIZE_MB remains unset if neither percent nor size was specified. It would be simpler to just check whether it was
+# set (or not), but we keep falling back to "0" in case anyone has become dependent on that behavior.
+CACHE_SIZE_MB="${EFFECTIVE_CACHE_SIZE_MB:-0}"
+
+if [ "$CACHE_SIZE_MB" -eq "0" ]; then
+    # no config value was set by the user; buildkit would set this to 10% by default:
+    # https://github.com/moby/buildkit/blob/54b8ff2fc8648c86b1b8c35e5cd07517b56ac2d5/cmd/buildkitd/config/gcpolicy_unix.go#L16
+    # however, we will be aggressive and set it to min(55%, max(10%, 20GB)) of the filesystem holding EARTHLY_TMP_DIR
+    CACHE_MB_10PCT="$(stat -c "10 * %b * %S / 100 / 1024 / 1024" -f "$EARTHLY_TMP_DIR" | bc)"
+    CACHE_MB_55PCT="$(stat -c "55 * %b * %S / 100 / 1024 / 1024" -f "$EARTHLY_TMP_DIR" | bc)"
+    CACHE_SIZE_MB="20480" # first start with 20GB
+    if [ "$CACHE_MB_10PCT" -gt "$CACHE_SIZE_MB" ]; then
+        CACHE_SIZE_MB="$CACHE_MB_10PCT" # increase it to 10% of the disk if bigger
+    elif [ "$CACHE_MB_55PCT" -lt "$CACHE_SIZE_MB" ]; then
+        CACHE_SIZE_MB="$CACHE_MB_55PCT" # otherwise, prevent it from being bigger than 55% of the disk
+    fi
+    echo "cache size set automatically to $CACHE_SIZE_MB MB; this can be changed via the cache_size_mb or cache_size_pct config options"
+fi
+
+# Calculate the cache for source files to be 50% of the overall cache (in bytes)
+SOURCE_FILE_KEEP_BYTES="$(echo "($CACHE_SIZE_MB * 1024 * 1024 * 0.5) / 1" | bc)" # Note /1 division truncates to int
+export SOURCE_FILE_KEEP_BYTES
+
+# convert the cache size into bytes
+CATCH_ALL_KEEP_BYTES="$(echo "$CACHE_SIZE_MB * 1024 * 1024" | bc)"
+export CATCH_ALL_KEEP_BYTES
+
+# finally populate the cache section of the buildkit toml config
+CACHE_SETTINGS="$(envsubst /etc/buildkitd.toml
+
+# Session history is 1h by default unless otherwise specified
+if [ -z "$BUILDKIT_SESSION_HISTORY_DURATION" ]; then
+    BUILDKIT_SESSION_HISTORY_DURATION="1h"
+fi
+export BUILDKIT_SESSION_HISTORY_DURATION
+
+# Session timeout will automatically cancel builds that run for too long
+# Configured to 1 day by default unless otherwise specified
+if [ -z "$BUILDKIT_SESSION_TIMEOUT" ]; then
+    BUILDKIT_SESSION_TIMEOUT="24h"
+fi
+export BUILDKIT_SESSION_TIMEOUT
+
+# Set up OOM
+OOM_SCORE_ADJ="${BUILDKIT_OOM_SCORE_ADJ:-0}"
+export OOM_SCORE_ADJ
+if [ -n "$OOM_EXCLUDED_PIDS" ]; then
+    echo "The following PIDs will be ignored by the OOM reaper: $OOM_EXCLUDED_PIDS"
+fi
+
+ignored_by_oom() { # prints "true" when PID $1 is listed in the comma-separated OOM_EXCLUDED_PIDS, "false" otherwise
+    if echo ",$OOM_EXCLUDED_PIDS," | grep -q ",$1,"; then
+        echo "true"
+    else
+        echo "false"
+    fi
+}
+
+envsubst "\${OOM_SCORE_ADJ} \${BUILDKIT_DEBUG}" /bin/oom-adjust.sh # NOTE(review): redirections look stripped in this copy of the patch - confirm against the original file
+chmod +x /bin/oom-adjust.sh
+
+echo "BUILDKIT_ROOT_DIR=$BUILDKIT_ROOT_DIR"
+echo "CACHE_SIZE_MB=$CACHE_SIZE_MB"
+echo "BUILDKIT_MAX_PARALLELISM=$BUILDKIT_MAX_PARALLELISM"
+echo "BUILDKIT_LOCAL_REGISTRY_LISTEN_PORT=$BUILDKIT_LOCAL_REGISTRY_LISTEN_PORT"
+echo "EARTHLY_ADDITIONAL_BUILDKIT_CONFIG=$EARTHLY_ADDITIONAL_BUILDKIT_CONFIG"
+echo "CNI_MTU=$CNI_MTU"
+echo "OOM_SCORE_ADJ=$OOM_SCORE_ADJ"
+echo ""
+echo "======== CNI config =========="
+cat /etc/cni/cni-conf.json
+echo "======== End CNI config =========="
+echo ""
+echo "======== Buildkitd config =========="
+cat /etc/buildkitd.toml
+echo "======== End buildkitd config =========="
+echo ""
+echo "======== OOM Adjust script =========="
+cat /bin/oom-adjust.sh
+echo "======== OOM Adjust script =========="
+echo ""
+echo "Detected container architecture is $(uname -m)"
+
+"$@" &
+execpid=$!
+
+stop_buildkit() { # signal handler: sends TERM to every OOM_EXCLUDED_PIDS entry, then to buildkit itself
+    echo "Shutdown signal received. Stopping buildkit..."
+    for i in $(echo "$OOM_EXCLUDED_PIDS" | sed "s/,/ /g"); do
+        echo "killing externally provided pid: $i"
+        kill -TERM "$i" || echo "could not signal pid $i" # a stale pid must not stop us (set -e) from reaching buildkit
+    done
+    echo "killing buildkit pid: $execpid"
+    kill -TERM "$execpid" || true # buildkit may already be gone; the loop below reports its exit status
+}
+
+trap stop_buildkit TERM QUIT INT
+
+# Supervise buildkit: exit with its status once it dies, reaping reparented children in the meantime.
+set +x
+while true
+do
+    if ! kill -0 "$execpid" >/dev/null 2>&1; then
+        code=0 # collect the status via "||" so that "set -e" does not abort before it is reported
+        wait "$execpid" || code="$?"
+        if [ "$code" != "0" ]; then
+            echo "Error: buildkit process has exited with code $code"
+        fi
+        exit "$code"
+    fi
+
+    for PID in $(pgrep -P 1)
+    do
+        # Sometimes, child processes can be reparented to the init (this script). One
+        # common cause is something having been OOM killed. This enumerates
+        # all those PIDs, and kills them to prevent accidental "ghost" loads.
+        if [ "$PID" != "$execpid" ] && [ "$(ignored_by_oom "$PID")" = "false" ]; then
+            if [ "$OOM_SCORE_ADJ" -ne "0" ]; then
+                ! "$BUILDKIT_DEBUG" || echo "$(date) | $PID($(cat /proc/"$PID"/cmdline)) killed with OOM_SCORE_ADJ=$OOM_SCORE_ADJ" >> /var/log/oom_adj
+                kill -9 "$PID" || true # the process may have exited since pgrep ran; that must not take this script down
+            else
+                ! "$BUILDKIT_DEBUG" || echo "$(date) | $PID($(cat /proc/"$PID"/cmdline)) was not killed because OOM_SCORE_ADJ was default or not set" >> /var/log/oom_adj
+            fi
+        fi
+    done
+
+    sleep 1
+done
diff --git a/earthbuild/oom-adjust.sh.template b/earthbuild/oom-adjust.sh.template
new file mode 100644
index 000000000..860bd3deb
--- /dev/null
+++ b/earthbuild/oom-adjust.sh.template
@@ -0,0 +1,61 @@
+#! /bin/sh
+
+# Does nothing when the configured adjustment is 0. Otherwise, for every buildkit-runc
+# process, resets a non-zero oom_score_adj of its children to 0 and then writes the
+# configured adjustment to the buildkit-runc process itself.
+
+set -e
+
+OOM_ADJ="${OOM_SCORE_ADJ}"
+DEBUG="${BUILDKIT_DEBUG}"
+# NOTE(review): the next line looks truncated in this copy of the patch (the definition of the
+# log helper used below is not visible) - confirm against the original file.
+INVOCATION=$(tr -dc A-Za-z0-9 > /var/log/oom_adj
+}
+
+adjust_oom() { # adjust_oom SCORE PID: best-effort write of SCORE to /proc/PID/oom_score_adj
+    echo "$1" > /proc/"$2"/oom_score_adj || true # It is ok if the OOM score fails - the PID may have exited, so it no longer matters anyways.
+}
+
+if [ "$OOM_ADJ" -eq "0" ]; then
+    exit 0
+fi
+
+for PID in $(pidof buildkit-runc)
+do
+    PID_NAME=$(cat /proc/"$PID"/cmdline || echo "unknown")
+
+    case "$PID_NAME" in
+        # This is the POSIX way to do a string-starts-with, and accommodates the one prefix we do not want. Order here is important.
+        "buildkit-runcinit"*) log "$PID was buildkit-runcinit, ignoring"; continue ;;
+        "buildkit-runc"*) log "$PID is runc parent($PID_NAME), proceeding" ;;
+        *) log "$PID was $PID_NAME, ignoring"; continue ;;
+    esac
+
+    for CHILD_PID in $(pgrep -P "$PID")
+    do
+        CHILD_PID_NAME=$(cat /proc/"$CHILD_PID"/cmdline || echo "unknown")
+        CHILD_OOM_ADJ=$(cat /proc/"$CHILD_PID"/oom_score_adj || echo "unknown")
+
+        case "$CHILD_OOM_ADJ" in
+            "unknown"*) log "$PID has child: $CHILD_PID($CHILD_PID_NAME), which was missing, ignoring"; continue ;;
+            "0") log "$PID has child: $CHILD_PID($CHILD_PID_NAME) with oom_score_adj: $CHILD_OOM_ADJ"; continue ;;
+            *) log "$PID has child: $CHILD_PID($CHILD_PID_NAME), oom_score_adj was set to 0: $CHILD_OOM_ADJ" ;;
+        esac
+        # The child may have started _after_ this script ran on the parent (or other invocation of) buildkit-runc.
+        # This undoes the inherited adjustment to make sure we behave properly at OOM time.
+        adjust_oom "0" "$CHILD_PID"
+    done
+
+    PID_NAME=$(cat /proc/"$PID"/cmdline || echo "unknown")
+    case "$PID_NAME" in
+        # Just in case the process exec-ed another program between the initial filter and now.
+        "buildkit-runcinit"*) log "$PID was buildkit-runcinit, no longer a candidate for OOM adjustment, ignoring"; continue ;;
+        "buildkit-runc"*) log "$PID oom_score_adj was set to $OOM_ADJ" ;;
+        *) log "$PID was $PID_NAME, no longer a candidate for OOM adjustment, ignoring"; continue ;;
+    esac
+    adjust_oom "$OOM_ADJ" "$PID"
+done
diff --git a/earthbuild/runc-ps b/earthbuild/runc-ps
new file mode 100755
index 000000000..ff87bd1dd
--- /dev/null
+++ b/earthbuild/runc-ps
@@ -0,0 +1,23 @@
+#!/bin/sh
+set -e
+
+# For every process whose ps line matches 'runc.*bundle', treats the 11th ps field as the bundle
+# path and prints the container's process args plus the cpu and memory usage reported by runc.
+
+total_procs=0
+for bundle in $(ps auxw | grep 'runc.*bundle' | grep -v grep | awk '{print $11}'); do
+    id="$(basename "$bundle")"
+    data="$(cat "$bundle/config.json")" || continue # the container may have exited since ps ran; skip it instead of aborting (set -e)
+    if [ -n "$data" ]; then
+        echo "=== $bundle ==="
+        echo "$data" | jq .process.args
+        statsdata="$(/usr/bin/buildkit-runc events --stats "$id")" || continue # same race: keep listing the remaining containers
+        memory="$(echo "$statsdata" | jq .data.memory.usage.usage)"
+        cpu="$(echo "$statsdata" | jq .data.cpu.usage.total)"
+        echo "total cpu usage: $cpu"
+        echo "total memory usage: $memory"
+        total_procs=$((total_procs + 1))
+    fi
+done
+echo "=== summary ==="
+echo "runc-ps found $total_procs container(s)"