Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 24 additions & 1 deletion packages/orchestrator/orchestrator.proto
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,7 @@ message SandboxCreateRequest {

message SandboxCreateResponse {
string client_id = 1;
SchedulingMetadata scheduling_metadata = 2;
}

message SandboxUpdateRequest {
Expand All @@ -126,12 +127,29 @@ message SandboxPauseRequest {
string build_id = 3;
}

message SchedulingMetadata {
string base_build_id = 1;
uint64 generation = 2;
uint64 memfile_size = 3;
uint64 rootfs_size = 4;
repeated string chain_build_ids = 5;
repeated uint64 memfile_logical_bytes = 6;
repeated uint32 memfile_mapping_counts = 7;
repeated uint64 rootfs_logical_bytes = 8;
repeated uint32 rootfs_mapping_counts = 9;
}

message SandboxPauseResponse {
SchedulingMetadata scheduling_metadata = 1;
}

message SandboxCheckpointRequest {
string sandbox_id = 1;
string build_id = 3;
}

message SandboxCheckpointResponse {
SchedulingMetadata scheduling_metadata = 1;
}

message RunningSandbox {
Expand All @@ -149,6 +167,11 @@ message SandboxListResponse {
message CachedBuildInfo {
string build_id = 1;
google.protobuf.Timestamp expiration_time = 2;
SchedulingMetadata scheduling_metadata = 3;
uint64 memfile_cached_bytes = 4;
uint64 memfile_total_bytes = 5;
uint64 rootfs_cached_bytes = 6;
uint64 rootfs_total_bytes = 7;
}

message SandboxListCachedBuildsResponse {
Expand All @@ -160,7 +183,7 @@ service SandboxService {
rpc Update(SandboxUpdateRequest) returns (google.protobuf.Empty);
rpc List(google.protobuf.Empty) returns (SandboxListResponse);
rpc Delete(SandboxDeleteRequest) returns (google.protobuf.Empty);
rpc Pause(SandboxPauseRequest) returns (google.protobuf.Empty);
rpc Pause(SandboxPauseRequest) returns (SandboxPauseResponse);
rpc Checkpoint(SandboxCheckpointRequest) returns (SandboxCheckpointResponse);

rpc ListCachedBuilds(google.protobuf.Empty) returns (SandboxListCachedBuildsResponse);
Expand Down
4 changes: 2 additions & 2 deletions packages/orchestrator/pkg/dummyserver/sandbox.go
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ func (s *SandboxServer) Delete(_ context.Context, req *orchestrator.SandboxDelet
return &emptypb.Empty{}, nil
}

func (s *SandboxServer) Pause(_ context.Context, req *orchestrator.SandboxPauseRequest) (*emptypb.Empty, error) {
func (s *SandboxServer) Pause(_ context.Context, req *orchestrator.SandboxPauseRequest) (*orchestrator.SandboxPauseResponse, error) {
if req.GetSandboxId() == "" {
return nil, status.Error(codes.InvalidArgument, "sandbox_id is required")
}
Expand All @@ -139,7 +139,7 @@ func (s *SandboxServer) Pause(_ context.Context, req *orchestrator.SandboxPauseR
delete(s.sandboxes, req.GetSandboxId())
s.mu.Unlock()

return &emptypb.Empty{}, nil
return &orchestrator.SandboxPauseResponse{}, nil
}

func (s *SandboxServer) Checkpoint(_ context.Context, _ *orchestrator.SandboxCheckpointRequest) (*orchestrator.SandboxCheckpointResponse, error) {
Expand Down
44 changes: 44 additions & 0 deletions packages/orchestrator/pkg/sandbox/build/cache.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import (
"context"
"fmt"
"os"
"strings"
"sync"
"time"

Expand Down Expand Up @@ -101,6 +102,14 @@ func NewDiffStore(

type DiffStoreKey string

type CachedBuildStats struct {
BuildID string
MemfileCachedBytes uint64
MemfileTotalBytes uint64
RootfsCachedBytes uint64
RootfsTotalBytes uint64
}

func GetDiffStoreKey(buildID string, diffType DiffType) DiffStoreKey {
return DiffStoreKey(fmt.Sprintf("%s/%s", buildID, diffType))
}
Expand Down Expand Up @@ -171,6 +180,41 @@ func (s *DiffStore) Lookup(key DiffStoreKey) (Diff, bool) {
return item.Value(), true
}

func (s *DiffStore) CachedBuildStats(ctx context.Context) []CachedBuildStats {
byBuildID := make(map[string]*CachedBuildStats)
for key, item := range s.cache.Items() {
buildID, diffType, ok := strings.Cut(string(key), "/")
if !ok || item == nil || item.Value() == nil {
continue
}
stats := byBuildID[buildID]
if stats == nil {
stats = &CachedBuildStats{BuildID: buildID}
byBuildID[buildID] = stats
}

size, err := item.Value().FileSize(ctx)
if err != nil || size < 0 {
continue
}
switch DiffType(diffType) {
case Memfile:
stats.MemfileCachedBytes = uint64(size)
stats.MemfileTotalBytes = uint64(size)
case Rootfs:
stats.RootfsCachedBytes = uint64(size)
stats.RootfsTotalBytes = uint64(size)
Comment on lines +203 to +206
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Report real total bytes for partially cached diffs

When a template was loaded from storage and only some chunks have been read, item.Value().FileSize(ctx) is the local cache file size, not the uncompressed diff size, so copying it into *TotalBytes makes ListCachedBuilds report the diff as 100% cached. In that partial-cache scenario, schedulers using these new fields will overestimate locality and route work to a node that still has to fetch most of the data remotely.

Useful? React with 👍 / 👎.

}
}

result := make([]CachedBuildStats, 0, len(byBuildID))
for _, stats := range byBuildID {
result = append(result, *stats)
}

return result
}

func (s *DiffStore) startDiskSpaceEviction(
ctx context.Context,
config cfg.Config,
Expand Down
19 changes: 11 additions & 8 deletions packages/orchestrator/pkg/sandbox/sandbox.go
Original file line number Diff line number Diff line change
Expand Up @@ -1185,6 +1185,8 @@ func (s *Sandbox) Pause(
}
cleanup.AddNoContext(ctx, rootfsDiff.Close)

schedulingMetadata := NewSnapshotSchedulingMetadata(originalMemfile.Header(), originalRootfs.Header())
Comment thread
cursor[bot] marked this conversation as resolved.
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Compute snapshot metadata from the new headers

When a pause/checkpoint creates dirty memory or rootfs data, this computes the response from the original template headers, so the newly created buildID and its diff bytes/chunks are absent from chain_build_ids and the per-build counts. The snapshot added to the cache uses memfileDiffHeader/rootfsHeader, but callers of the new Pause/Checkpoint response get metadata that describes only the parent template, causing downstream scheduling decisions based on that response to ignore the snapshot's own data.

Useful? React with 👍 / 👎.


metadataFileLink := template.NewLocalFileLink(cachePaths.CacheMetadata())
cleanup.AddNoContext(ctx, metadataFileLink.Close)

Expand All @@ -1194,14 +1196,15 @@ func (s *Sandbox) Pause(
}

return &Snapshot{
Snapfile: snapfile,
Metafile: metadataFileLink,
MemfileDiff: memfileDiff,
MemfileDiffHeader: memfileDiffHeader,
RootfsDiff: rootfsDiff,
RootfsDiffHeader: NewResolvedDiffHeader(rootfsHeader),
MemfileBlockSize: originalMemfile.Header().Metadata.BlockSize,
RootfsBlockSize: originalRootfs.Header().Metadata.BlockSize,
Snapfile: snapfile,
Metafile: metadataFileLink,
MemfileDiff: memfileDiff,
MemfileDiffHeader: memfileDiffHeader,
RootfsDiff: rootfsDiff,
RootfsDiffHeader: NewResolvedDiffHeader(rootfsHeader),
SchedulingMetadata: schedulingMetadata,
MemfileBlockSize: originalMemfile.Header().Metadata.BlockSize,
RootfsBlockSize: originalRootfs.Header().Metadata.BlockSize,

BuildID: buildID,

Expand Down
21 changes: 14 additions & 7 deletions packages/orchestrator/pkg/sandbox/snapshot.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@ import (

"github.com/e2b-dev/infra/packages/orchestrator/pkg/sandbox/build"
"github.com/e2b-dev/infra/packages/orchestrator/pkg/sandbox/template"
"github.com/e2b-dev/infra/packages/orchestrator/pkg/scheduling"
"github.com/e2b-dev/infra/packages/shared/pkg/grpc/orchestrator"
"github.com/e2b-dev/infra/packages/shared/pkg/storage/header"
"github.com/e2b-dev/infra/packages/shared/pkg/utils"
)
Expand All @@ -26,13 +28,14 @@ func NewResolvedDiffHeader(h *header.Header) *DiffHeader {
}

type Snapshot struct {
MemfileDiff build.Diff
MemfileDiffHeader *DiffHeader
RootfsDiff build.Diff
RootfsDiffHeader *DiffHeader
Snapfile template.File
Metafile template.File
BuildID uuid.UUID
MemfileDiff build.Diff
MemfileDiffHeader *DiffHeader
RootfsDiff build.Diff
RootfsDiffHeader *DiffHeader
Snapfile template.File
Metafile template.File
BuildID uuid.UUID
SchedulingMetadata *orchestrator.SchedulingMetadata

// Template block sizes captured sync at Pause time. They equal
// MemfileDiffHeader.Metadata.BlockSize once that header resolves, but
Expand All @@ -53,3 +56,7 @@ func (s *Snapshot) Close(ctx context.Context) error {

return nil
}

func NewSnapshotSchedulingMetadata(memfileHeader, rootfsHeader *header.Header) *orchestrator.SchedulingMetadata {
return scheduling.FromHeaders(memfileHeader, rootfsHeader)
}
4 changes: 4 additions & 0 deletions packages/orchestrator/pkg/sandbox/template/cache.go
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,10 @@ func (c *Cache) Items() map[string]*ttlcache.Item[string, Template] {
return c.cache.Items()
}

func (c *Cache) CachedBuildStats(ctx context.Context) []build.CachedBuildStats {
return c.buildStore.CachedBuildStats(ctx)
}

// LookupDiff returns the locally-cached diff for the given build and file name.
// Returns (nil, false) if the diff is not cached locally.
func (c *Cache) LookupDiff(buildID string, diffType build.DiffType) (build.Diff, bool) {
Expand Down
12 changes: 12 additions & 0 deletions packages/orchestrator/pkg/sandbox/template/storage_template.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,9 @@ import (
"github.com/e2b-dev/infra/packages/orchestrator/pkg/sandbox/block"
blockmetrics "github.com/e2b-dev/infra/packages/orchestrator/pkg/sandbox/block/metrics"
"github.com/e2b-dev/infra/packages/orchestrator/pkg/sandbox/build"
"github.com/e2b-dev/infra/packages/orchestrator/pkg/scheduling"
"github.com/e2b-dev/infra/packages/orchestrator/pkg/template/metadata"
"github.com/e2b-dev/infra/packages/shared/pkg/grpc/orchestrator"
"github.com/e2b-dev/infra/packages/shared/pkg/logger"
"github.com/e2b-dev/infra/packages/shared/pkg/storage"
"github.com/e2b-dev/infra/packages/shared/pkg/storage/header"
Expand Down Expand Up @@ -270,6 +272,16 @@ func (t *storageTemplate) Files() storage.CachePaths {
return t.paths
}

func (t *storageTemplate) SchedulingMetadata() *orchestrator.SchedulingMetadata {
memfileHeader, memfileErr := t.memfileHeader.Result()
rootfsHeader, rootfsErr := t.rootfsHeader.Result()
Comment on lines +276 to +277
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Return metadata for storage-loaded templates

When a build enters the cache via GetTemplate from object storage rather than AddSnapshot, both header holders are initialized as resolvedHeader(nil) in cache.go, and NewStorage loads the actual headers internally without writing them back to those holders. In that cache-miss/cold-start path these lines still read the original nil header values, so scheduling.FromHeaders returns nil and ListCachedBuilds or template build completion omits scheduling metadata even though the headers are available.

Useful? React with 👍 / 👎.

if memfileErr != nil || rootfsErr != nil {
return nil
}

return scheduling.FromHeaders(memfileHeader, rootfsHeader)
Comment thread
cursor[bot] marked this conversation as resolved.
Comment thread
cursor[bot] marked this conversation as resolved.
}

func (t *storageTemplate) Memfile(ctx context.Context) (block.ReadonlyDevice, error) {
_, span := tracer.Start(ctx, "storage-template-memfile")
defer span.End()
Expand Down
96 changes: 96 additions & 0 deletions packages/orchestrator/pkg/scheduling/metadata.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
package scheduling

import (
"cmp"
"slices"

"github.com/google/uuid"

"github.com/e2b-dev/infra/packages/shared/pkg/grpc/orchestrator"
"github.com/e2b-dev/infra/packages/shared/pkg/storage/header"
)

const chainLimit = 32

type buildContribution struct {
buildID uuid.UUID
memfileBytes uint64
memfileMappingCount uint32
rootfsBytes uint64
rootfsMappingCount uint32
}

func FromHeaders(memfileHeader, rootfsHeader *header.Header) *orchestrator.SchedulingMetadata {
if memfileHeader == nil || memfileHeader.Metadata == nil || rootfsHeader == nil || rootfsHeader.Metadata == nil {
return nil
}

contributions := make(map[uuid.UUID]*buildContribution)
add := func(buildID uuid.UUID, length uint64, isMemfile bool) {
if buildID == uuid.Nil || length == 0 {
return
}
c, ok := contributions[buildID]
if !ok {
c = &buildContribution{buildID: buildID}
contributions[buildID] = c
}
if isMemfile {
c.memfileBytes += length
c.memfileMappingCount++
} else {
c.rootfsBytes += length
c.rootfsMappingCount++
}
}

for _, m := range memfileHeader.Mapping.All() {
add(m.BuildId, m.Length, true)
}
for _, m := range rootfsHeader.Mapping.All() {
add(m.BuildId, m.Length, false)
}

baseBuildID := memfileHeader.Metadata.BaseBuildId
if baseBuildID == uuid.Nil {
baseBuildID = rootfsHeader.Metadata.BaseBuildId
}

chain := make([]buildContribution, 0, len(contributions))
for _, c := range contributions {
chain = append(chain, *c)
}
slices.SortFunc(chain, func(a, b buildContribution) int {
aBytes := a.memfileBytes + a.rootfsBytes
bBytes := b.memfileBytes + b.rootfsBytes
if aBytes == bBytes {
return cmp.Compare(a.buildID.String(), b.buildID.String())
}

return cmp.Compare(bBytes, aBytes)
})
if len(chain) > chainLimit {
chain = chain[:chainLimit]
}

res := &orchestrator.SchedulingMetadata{
BaseBuildId: baseBuildID.String(),
Generation: max(memfileHeader.Metadata.Generation, rootfsHeader.Metadata.Generation) + 1,
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Avoid incrementing generation for existing cached templates

This helper is also called by storageTemplate.SchedulingMetadata() for ListCachedBuilds and by the template build result after the snapshot has already been added to cache; in that path the headers already belong to the cached build, so adding one exposes generation N+1 instead of the build's actual generation N. Consumers comparing this metadata with header generation will see every cached/template-manager build as one generation ahead.

Useful? React with 👍 / 👎.

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Generation incremented twice

High Severity

FromHeaders always sets Generation to max(memfile, rootfs) + 1, but storageTemplate.SchedulingMetadata, ListCachedBuilds, and template build success read headers that already went through NextGeneration when the layer was written. Those paths report generation one higher than the on-disk header metadata.

Additional Locations (2)
Fix in Cursor Fix in Web

Reviewed by Cursor Bugbot for commit d216050. Configure here.

MemfileSize: memfileHeader.Metadata.Size,
RootfsSize: rootfsHeader.Metadata.Size,
ChainBuildIds: make([]string, 0, len(chain)),
MemfileLogicalBytes: make([]uint64, 0, len(chain)),
MemfileMappingCounts: make([]uint32, 0, len(chain)),
RootfsLogicalBytes: make([]uint64, 0, len(chain)),
RootfsMappingCounts: make([]uint32, 0, len(chain)),
}
for _, c := range chain {
res.ChainBuildIds = append(res.ChainBuildIds, c.buildID.String())
res.MemfileLogicalBytes = append(res.MemfileLogicalBytes, c.memfileBytes)
res.MemfileMappingCounts = append(res.MemfileMappingCounts, c.memfileMappingCount)
res.RootfsLogicalBytes = append(res.RootfsLogicalBytes, c.rootfsBytes)
res.RootfsMappingCounts = append(res.RootfsMappingCounts, c.rootfsMappingCount)
}

return res
}
Loading
Loading