From 728cd3c7ac21bdbc47f3b347a0be2cd9a3c869d7 Mon Sep 17 00:00:00 2001
From: Andres Freund <andres@anarazel.de>
Date: Fri, 5 Jun 2026 11:58:39 -0400
Subject: [PATCH v3 1/2] ci: Improve ccache handling

There previously were a number of issues:

- We'd upload the cache even if we already had a high hit rate. That means we
  churn through the available cache space very quickly.

  For this we now check if the cache hit ratio is already high, and skip
  uploading a new cache in that case.

- We'd generate per-branch caches, even if master's already would suffice,
  because the branch doesn't change much

  This is solved indirectly by the above.

- The cache key allowed prefix matches based on the branch,
  e.g. master-pending would always use master's cache

  Replace the cache key element separator of - with :, which is not a valid
  part of a branch name.

- When rebasing a feature branch, we'd start with just that branch's cache,
  rather than also having the newer cache of master available

  This is solved by downloading both master's and the feature branch's cache,
  simply overlaying both. That's possible because ccache is content addressed.

- The size of a cache would increase to the max, even though there likely will
  be no benefit from old cache entries.

  Address this by explicitly evicting old data and also recompressing the
  cache before uploading it.

In my testing this utilizes the available cache space (10GB for personal
accounts) much more effectively than before.

The not entirely trivial determination of whether it's worth uploading a cache
entry is moved to a python script.  I first had it as shell, but that gets
awkward.  This way it'd also be more viable to use ccache for msvc at some
point.

The per-job redundancies are a bit annoying. There's a way around that, by
using composite actions, but I think that might be harder to understand,
without all that much of an improvement.
---
 .github/workflows/pg-ci.yml       |  94 ++++++++++++++++++++++------
 src/tools/ci/gha_ccache_decide.py | 100 ++++++++++++++++++++++++++++++
 2 files changed, 176 insertions(+), 18 deletions(-)
 create mode 100644 src/tools/ci/gha_ccache_decide.py

diff --git a/.github/workflows/pg-ci.yml b/.github/workflows/pg-ci.yml
index 8560e9389f6..86dc47de8db 100644
--- a/.github/workflows/pg-ci.yml
+++ b/.github/workflows/pg-ci.yml
@@ -130,6 +130,22 @@ env:
   # commit-message directive parsed in the `setup` job below.
   CI_OS_ONLY_JOBS: "linux macos windows mingw compilerwarnings sanitycheck"
 
+  ###
+  # A few variables to make expressions later on shorter
+  ###
+
+  ON_DEFAULT_BRANCH: ${{github.event.repository.default_branch == github.ref_name }}
+
+  # Note that we need to be careful to use a separator that can't be in branch
+  # names, otherwise e.g. caches for 'master' might be restored on the
+  # 'master-pending' branch.
+  CACHE_PREFIX_DEFAULT: >-
+    :${{ github.job }}:${{ github.event.repository.default_branch }}:
+  CACHE_PREFIX_BRANCH: >-
+    :${{ github.job }}:${{ github.ref_name }}:
+  CACHE_SUFFIX: >-
+    ${{ github.run_id }}:${{ github.run_attempt }}
+
 
 jobs:
 
@@ -277,16 +293,30 @@ jobs:
         with:
           fetch-depth: ${{ env.CLONE_DEPTH }}
 
-      - &ccache_restore_step
-        name: Restore ccache
-        id: ccache_restore
+      # We restore both the ccache from the default branch (typically master),
+      # and from the current branch. This will often allow feature branches to
+      # start out with a high cache hit ratio.
+      #
+      # With ccache it turns out to work to just restore two caches into the
+      # same directory, as it's basically a content addressed store. Stats
+      # could be corrupted, but we zero them out anyway.
+      - &ccache_restore_default_step
+        name: "ccache: Restore for default branch ${{github.event.repository.default_branch}}"
+        if: ${{ env.ON_DEFAULT_BRANCH == 'false' }}
         uses: actions/cache/restore@v5
         with:
           path: ${{ env.CCACHE_DIR }}
-          key: ccache-${{ github.job }}-${{ github.ref_name }}-${{ github.run_id }}-${{ github.run_attempt }}
-          restore-keys: |
-            ccache-${{ github.job }}-${{ github.ref_name }}-
-            ccache-${{ github.job }}-
+          key: ccache${{env.CACHE_PREFIX_DEFAULT}}${{env.CACHE_SUFFIX}}
+          restore-keys: ccache${{env.CACHE_PREFIX_DEFAULT}}
+
+      - &ccache_restore_branch_step
+        name: "ccache: Restore for branch ${{ github.ref_name }}"
+        id: ccache-restore-branch
+        uses: actions/cache/restore@v5
+        with:
+          path: ${{ env.CCACHE_DIR }}
+          key: ccache${{env.CACHE_PREFIX_BRANCH}}${{env.CACHE_SUFFIX}}
+          restore-keys: ccache${{env.CACHE_PREFIX_BRANCH}}
 
       - &linux_prepare_workspace_step
         name: Prepare workspace
@@ -325,15 +355,30 @@ jobs:
           ninja -C build -j${{env.BUILD_JOBS}} ${{env.MBUILD_TARGET}}
           ninja -C build -t missingdeps
 
-      # TODO: As long as we use per-run ccache caches, we should probably add
-      # a step that checks if there is sufficient new content to warrant
-      # saving the new cache.
+      # Decide if it's worth uploading a new version of the ccache cache. If
+      # we always do so unconditionally, we'd very quickly go through the
+      # allowed cache space. Instead we check if the hit rate is high enough
+      # already for that not to be worth it.
+      - &ccache_decide_save_step
+        name: "ccache: Decide if cache should be uploaded"
+        id: ccache-pre-save
+        # [Decide to] store the cache whenever the cache was set up, so that
+        # incrementally addressing compiler errors/warnings doesn't have to
+        # start from scratch.
+        if: |
+          always() &&
+          steps.ccache-restore-branch.conclusion == 'success'
+        run: python3 src/tools/ci/gha_ccache_decide.py
+
       - &ccache_save_step
-        name: Save ccache
+        name: "ccache: Upload cache"
         uses: actions/cache/save@v5
+        if: |
+          always() &&
+          steps.ccache-pre-save.outputs.should_save == 'true'
         with:
           path: ${{ env.CCACHE_DIR }}
-          key: ${{ steps.ccache_restore.outputs.cache-primary-key }}
+          key: ${{ steps.ccache-restore-branch.outputs.cache-primary-key }}
 
       # Run a minimal set of tests. The main regression tests take too long
       # for this purpose. For now this is a random quick pg_regress style
@@ -448,7 +493,8 @@ jobs:
 
       - *nix_sysinfo_step
       - *checkout_step
-      - *ccache_restore_step
+      - *ccache_restore_default_step
+      - *ccache_restore_branch_step
       - *linux_prepare_workspace_step
 
       - name: Configure
@@ -467,6 +513,7 @@ jobs:
         run: |
           make -s -j${BUILD_JOBS} world-bin
 
+      - *ccache_decide_save_step
       - *ccache_save_step
 
       - name: Test world
@@ -508,7 +555,8 @@ jobs:
 
       - *nix_sysinfo_step
       - *checkout_step
-      - *ccache_restore_step
+      - *ccache_restore_default_step
+      - *ccache_restore_branch_step
       - *linux_prepare_workspace_step
 
       - name: Configure
@@ -527,6 +575,7 @@ jobs:
         shell: *su_postgres_shell
         run: *ninja_build_cmd
 
+      - *ccache_decide_save_step
       - *ccache_save_step
 
       - name: Test world
@@ -596,7 +645,8 @@ jobs:
 
       - *nix_sysinfo_step
       - *checkout_step
-      - *ccache_restore_step
+      - *ccache_restore_default_step
+      - *ccache_restore_branch_step
       - *linux_prepare_workspace_step
 
       - name: Configure
@@ -613,6 +663,7 @@ jobs:
         shell: *su_postgres_shell
         run: *ninja_build_cmd
 
+      - *ccache_decide_save_step
       - *ccache_save_step
 
       - name: Test world
@@ -682,7 +733,6 @@ jobs:
     steps:
       - *nix_sysinfo_step
       - *checkout_step
-      - *ccache_restore_step
 
       - name: Setup core files
         run: |
@@ -745,6 +795,9 @@ jobs:
           path: ${{ env.MACPORTS_CACHE }}
           key: ${{ steps.mp-key.outputs.key }}
 
+      - *ccache_restore_default_step
+      - *ccache_restore_branch_step
+
       - name: Configure
         env:
           PKG_CONFIG_PATH: /opt/local/lib/pkgconfig/
@@ -762,6 +815,7 @@ jobs:
       - name: Build
         run: *ninja_build_cmd
 
+      - *ccache_decide_save_step
       - *ccache_save_step
 
       - name: Test world
@@ -1062,7 +1116,8 @@ jobs:
         shell: cmd
         run: mkdir ${{env.PG_REGRESS_SOCK_DIR}}
 
-      - *ccache_restore_step
+      - *ccache_restore_default_step
+      - *ccache_restore_branch_step
 
       - name: Configure
         run: |
@@ -1077,6 +1132,7 @@ jobs:
       - name: Build
         run: *ninja_build_cmd
 
+      - *ccache_decide_save_step
       - *ccache_save_step
 
       - name: Test world
@@ -1118,7 +1174,8 @@ jobs:
     steps:
       - *nix_sysinfo_step
       - *checkout_step
-      - *ccache_restore_step
+      - *ccache_restore_default_step
+      - *ccache_restore_branch_step
 
       - name: Setup workspace
         run: |
@@ -1213,5 +1270,6 @@ jobs:
             headerscheck cpluspluscheck \
             EXTRAFLAGS='-fmax-errors=10'
 
+      - *ccache_decide_save_step
       - *ccache_save_step
       - *upload_logs_step
diff --git a/src/tools/ci/gha_ccache_decide.py b/src/tools/ci/gha_ccache_decide.py
new file mode 100644
index 00000000000..920f7bf9685
--- /dev/null
+++ b/src/tools/ci/gha_ccache_decide.py
@@ -0,0 +1,100 @@
+#!/usr/bin/env python3
+
+import os
+import re
+import shutil
+import subprocess
+
+def run(cmd, check=True):
+    return subprocess.run(
+        cmd,
+        check=check,
+        text=True,
+        stdout=subprocess.PIPE,
+        stderr=subprocess.STDOUT,
+    ).stdout
+
+def parse_ccache_stats():
+    out = run(["ccache", "--print-stats"])
+    hits = 0
+    misses = 0
+
+    for line in out.splitlines():
+        line = line.strip()
+        m = re.match(r"^local_storage_hit\s+(\d+)$", line)
+        if m:
+            hits = int(m.group(1))
+            continue
+        m = re.match(r"^local_storage_miss\s+(\d+)$", line)
+        if m:
+            misses = int(m.group(1))
+            continue
+
+    return hits, misses
+
+def append_github_output(key, value):
+    output_path = os.environ["GITHUB_OUTPUT"]
+    with open(output_path, "a", encoding="utf-8") as f:
+        f.write(f"{key}={value}\n")
+
+def main():
+    """Set the should_save step output and, if saving, trim the cache for upload."""
+    on_default_branch = os.environ["ON_DEFAULT_BRANCH"] == "true"
+    ccache_dir = os.environ["CCACHE_DIR"]  # not used below; raises KeyError if unset
+
+    # Decide the target hit percentage below which we decide to upload a new
+    # cache. On non-default branches a few misses aren't that bad. But, as the
+    # caches of the default branch are shared with all branches, it's worth
+    # aiming for a higher ratio there.
+    target_rate = 95 if on_default_branch else 80
+
+    # Log ccache stats, useful for more in-depth understanding. To avoid it
+    # swamping the output, collapse it in a group.
+    print("::group::ccache_stats")
+    print(run(["ccache", "-s", "-vv"]))
+    print("::endgroup::")
+
+    # compute cache hit ratio
+    hits, misses = parse_ccache_stats()
+    total = hits + misses
+    hit_pct = int((hits / total) * 100) if total > 0 else 100
+    print(f"hits: {hits}, misses: {misses}, hit_pct: {hit_pct}, target rate: {target_rate}")
+
+    # If there were either barely any misses, or the cache hit ratio was high,
+    # there's no point in generating a new cache entry. We have limited cache
+    # space.
+    should_save = misses > 10 and hit_pct < target_rate
+
+    append_github_output("should_save", str(should_save).lower())
+
+    if not should_save:
+        print(f"hit rate {hit_pct} reached target of {target_rate} or too few misses ({misses}), skip creating new cache entry")
+        return 0
+
+    print(f"hit rate {hit_pct} is below target of {target_rate}, create new cache entry")
+
+    # It's not worth persisting old cache entries (e.g. from before a
+    # change to a central header, or from the default branch if this
+    # branch differs a lot). Therefore evict ccache entries that are a
+    # bit older. The cutoff here is fairly arbitrary, it could
+    # probably be improved.
+    print("::group::ccache_shrink")
+    print(run(["ccache", "--evict-older-than", f"{45*60}s"]))
+    print(run(["ccache", "-X", "10"]))
+
+    # Don't store ccache stats, otherwise we'd need to reset the cache access
+    # data after restoring the cache in the next run, to be able to get the
+    # hit ratio of the CI run.
+    print(run(["ccache", "-z"]))
+    print("::endgroup::")
+
+    # Before continuing, try to kill all ccache instances, otherwise
+    # it's possible that on cancellations there are still running
+    # ccache processes that cause the upload to fail.
+    if shutil.which("killall"):
+        print(run(["killall", "ccache"], check=False))
+
+    return 0
+
+if __name__ == "__main__":
+    raise SystemExit(main())
-- 
2.54.0.380.gc69baaf57b

