From 34ca3d718158c9fd6208feb8c7bcd60e59a4b117 Mon Sep 17 00:00:00 2001 From: Sebastiaan van Stijn Date: Mon, 15 Jul 2024 13:39:09 +0200 Subject: [PATCH] use lazyregexp for various regular expressions Using regexp.MustCompile consumes a significant amount of memory when importing the package, even if those regular expressions are not used. This changes compiling the regular expressions to use a lazyregexp package so that they're only compiled the first time they're used. There are various regular expressions remaining that are still compiled on import, but these are exported, so changing them to a sync.OnceValue would be a breaking change; we can still decide to do so, but leaving that for a follow-up. To verify, compile a basic binary importing the package: package main import _ "github.com/distribution/reference" func main() {} Before: for i in $(seq 1 5); do GODEBUG=inittrace=1 ./before 2>&1 | grep distribution/reference; done init github.com/distribution/reference @0.94 ms, 0.22 ms clock, 415712 bytes, 3599 allocs init github.com/distribution/reference @0.39 ms, 0.22 ms clock, 415712 bytes, 3599 allocs init github.com/distribution/reference @0.39 ms, 0.23 ms clock, 415712 bytes, 3599 allocs init github.com/distribution/reference @0.45 ms, 0.27 ms clock, 415712 bytes, 3599 allocs init github.com/distribution/reference @0.38 ms, 0.24 ms clock, 415712 bytes, 3599 allocs After: for i in $(seq 1 5); do GODEBUG=inittrace=1 ./after 2>&1 | grep distribution/reference; done init github.com/distribution/reference/internal/lazyregexp @0.85 ms, 0 ms clock, 0 bytes, 0 allocs init github.com/distribution/reference @1.0 ms, 0.16 ms clock, 238680 bytes, 1383 allocs init github.com/distribution/reference/internal/lazyregexp @0.33 ms, 0 ms clock, 0 bytes, 0 allocs init github.com/distribution/reference @0.42 ms, 0.16 ms clock, 238680 bytes, 1383 allocs init github.com/distribution/reference/internal/lazyregexp @0.39 ms, 0 ms clock, 0 bytes, 0 allocs init 
github.com/distribution/reference @0.47 ms, 0.19 ms clock, 238680 bytes, 1383 allocs init github.com/distribution/reference/internal/lazyregexp @0.36 ms, 0 ms clock, 0 bytes, 0 allocs init github.com/distribution/reference @0.47 ms, 0.14 ms clock, 238680 bytes, 1383 allocs init github.com/distribution/reference/internal/lazyregexp @0.29 ms, 0 ms clock, 0 bytes, 0 allocs init github.com/distribution/reference @0.38 ms, 0.15 ms clock, 238680 bytes, 1383 allocs Signed-off-by: Sebastiaan van Stijn --- NOTICE | 11 +++++ internal/lazyregexp/LICENSE | 27 +++++++++++ internal/lazyregexp/PATENTS | 22 +++++++++ internal/lazyregexp/lazyregexp.go | 67 ++++++++++++++++++++++++++ internal/lazyregexp/lazyregexp_test.go | 23 +++++++++ regexp.go | 14 +++--- regexp_test.go | 7 ++- 7 files changed, 164 insertions(+), 7 deletions(-) create mode 100644 NOTICE create mode 100644 internal/lazyregexp/LICENSE create mode 100644 internal/lazyregexp/PATENTS create mode 100644 internal/lazyregexp/lazyregexp.go create mode 100644 internal/lazyregexp/lazyregexp_test.go diff --git a/NOTICE b/NOTICE new file mode 100644 index 0000000..1658c25 --- /dev/null +++ b/NOTICE @@ -0,0 +1,11 @@ +lazyregexp implementation (internal/lazyregexp) + +The internal/lazyregexp directory contains code derived from the Go project. + +Copyright 2009-2018 The Go Authors. +Licensed under the BSD 3-Clause License. + +Modifications Copyright 2026 The CNCF distribution authors. + +The BSD license text and Go patent grant are included in +internal/lazyregexp/LICENSE and internal/lazyregexp/PATENTS. diff --git a/internal/lazyregexp/LICENSE b/internal/lazyregexp/LICENSE new file mode 100644 index 0000000..2a7cf70 --- /dev/null +++ b/internal/lazyregexp/LICENSE @@ -0,0 +1,27 @@ +Copyright 2009 The Go Authors. 
+ +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + * Neither the name of Google LLC nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/internal/lazyregexp/PATENTS b/internal/lazyregexp/PATENTS new file mode 100644 index 0000000..7330990 --- /dev/null +++ b/internal/lazyregexp/PATENTS @@ -0,0 +1,22 @@ +Additional IP Rights Grant (Patents) + +"This implementation" means the copyrightable works distributed by +Google as part of the Go project. 
+ +Google hereby grants to You a perpetual, worldwide, non-exclusive, +no-charge, royalty-free, irrevocable (except as stated in this section) +patent license to make, have made, use, offer to sell, sell, import, +transfer and otherwise run, modify and propagate the contents of this +implementation of Go, where such license applies only to those patent +claims, both currently owned or controlled by Google and acquired in +the future, licensable by Google that are necessarily infringed by this +implementation of Go. This grant does not include claims that would be +infringed only as a consequence of further modification of this +implementation. If you or your agent or exclusive licensee institute or +order or agree to the institution of patent litigation against any +entity (including a cross-claim or counterclaim in a lawsuit) alleging +that this implementation of Go or any code incorporated within this +implementation of Go constitutes direct or contributory patent +infringement, or inducement of patent infringement, then any patent +rights granted to you under this License for this implementation of Go +shall terminate as of the date such litigation is filed. diff --git a/internal/lazyregexp/lazyregexp.go b/internal/lazyregexp/lazyregexp.go new file mode 100644 index 0000000..7f2cc9e --- /dev/null +++ b/internal/lazyregexp/lazyregexp.go @@ -0,0 +1,67 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Code below was copied from; +// https://github.com/golang/go/blob/go1.13/src/internal/lazyregexp/lazyre.go + +// Package lazyregexp is a thin wrapper over regexp, allowing the use of global +// regexp variables without forcing them to be compiled at init. 
+package lazyregexp + +import ( + "os" + "regexp" + "strings" + "sync" +) + +// Regexp is a wrapper around regexp.Regexp, where the underlying regexp will be +// compiled the first time it is needed. +type Regexp struct { + str string + once sync.Once + rx *regexp.Regexp +} + +func (r *Regexp) re() *regexp.Regexp { + r.once.Do(r.build) + return r.rx +} + +func (r *Regexp) build() { + r.rx = regexp.MustCompile(r.str) + r.str = "" +} + +func (r *Regexp) FindStringSubmatch(s string) []string { + return r.re().FindStringSubmatch(s) +} + +func (r *Regexp) MatchString(s string) bool { + return r.re().MatchString(s) +} + +func (r *Regexp) SubexpNames() []string { + return r.re().SubexpNames() +} + +func (r *Regexp) NumSubexp() int { + return r.re().NumSubexp() +} + +func (r *Regexp) String() string { return r.re().String() } + +var inTest = len(os.Args) > 0 && strings.HasSuffix(strings.TrimSuffix(os.Args[0], ".exe"), ".test") + +// New creates a new lazy regexp, delaying the compiling work until it is first +// needed. If the code is being run as part of tests, the regexp compiling will +// happen immediately. +func New(str string) *Regexp { + lr := &Regexp{str: str} + if inTest { + // In tests, always compile the regexps early. 
+ lr.re() + } + return lr +} diff --git a/internal/lazyregexp/lazyregexp_test.go b/internal/lazyregexp/lazyregexp_test.go new file mode 100644 index 0000000..898ea0b --- /dev/null +++ b/internal/lazyregexp/lazyregexp_test.go @@ -0,0 +1,23 @@ +package lazyregexp + +import ( + "testing" +) + +func TestCompileOnce(t *testing.T) { + t.Run("invalid regexp", func(t *testing.T) { + defer func() { + if r := recover(); r == nil { + t.Errorf("expected a panic") + } + }() + _ = New("[") + }) + t.Run("valid regexp", func(t *testing.T) { + re := New("[a-z]") + ok := re.MatchString("hello") + if !ok { + t.Errorf("expected a match") + } + }) +} diff --git a/regexp.go b/regexp.go index e7531df..b2fc4c9 100644 --- a/regexp.go +++ b/regexp.go @@ -3,6 +3,8 @@ package reference import ( "regexp" "strings" + + "github.com/distribution/reference/internal/lazyregexp" ) // DigestRegexp matches well-formed digests, including algorithm (e.g. "sha256:"). @@ -31,7 +33,7 @@ var NameRegexp = regexp.MustCompile(namePat) // ReferenceRegexp is the full supported format of a reference. The regexp // is anchored and has capturing groups for name, tag, and digest // components. -var ReferenceRegexp = referenceRegexp +var ReferenceRegexp = regexp.MustCompile(referencePat) // TagRegexp matches valid tag names. From [docker/docker:graph/tags.go]. // @@ -112,15 +114,15 @@ var ( // referenceRegexp is the full supported format of a reference. The regexp // is anchored and has capturing groups for name, tag, and digest // components. - referenceRegexp = regexp.MustCompile(referencePat) + referenceRegexp = lazyregexp.New(referencePat) // anchoredTagRegexp matches valid tag names, anchored at the start and // end of the matched string. - anchoredTagRegexp = regexp.MustCompile(anchored(tag)) + anchoredTagRegexp = lazyregexp.New(anchored(tag)) // anchoredDigestRegexp matches valid digests, anchored at the start and // end of the matched string. 
- anchoredDigestRegexp = regexp.MustCompile(anchored(digestPat)) + anchoredDigestRegexp = lazyregexp.New(anchored(digestPat)) // pathComponent restricts path-components to start with an alphanumeric // character, with following parts able to be separated by a separator @@ -136,14 +138,14 @@ var ( // anchoredNameRegexp is used to parse a name value, capturing the // domain and trailing components. - anchoredNameRegexp = regexp.MustCompile(anchoredNamePat) + anchoredNameRegexp = lazyregexp.New(anchoredNamePat) anchoredNamePat = anchored(optional(capture(domainAndPort), `/`), capture(remoteName)) referencePat = anchored(capture(namePat), optional(`:`, capture(tag)), optional(`@`, capture(digestPat))) // anchoredIdentifierRegexp is used to check or match an // identifier value, anchored at start and end of string. - anchoredIdentifierRegexp = regexp.MustCompile(anchored(identifier)) + anchoredIdentifierRegexp = lazyregexp.New(anchored(identifier)) ) // optional wraps the expression in a non-capturing group and makes the diff --git a/regexp_test.go b/regexp_test.go index 4f69965..db6deb6 100644 --- a/regexp_test.go +++ b/regexp_test.go @@ -6,13 +6,18 @@ import ( "testing" ) +type regExper interface { + FindStringSubmatch(s string) []string + NumSubexp() int +} + type regexpMatch struct { input string match bool subs []string } -func checkRegexp(t *testing.T, r *regexp.Regexp, m regexpMatch) { +func checkRegexp(t *testing.T, r regExper, m regexpMatch) { t.Helper() matches := r.FindStringSubmatch(m.input) if m.match && matches != nil {