diff --git a/extractor/extractor.go b/extractor/extractor.go
index 4329daa5..30978bb7 100644
--- a/extractor/extractor.go
+++ b/extractor/extractor.go
@@ -23,10 +23,12 @@ import (
 	"os"
 	"path/filepath"
 	"regexp"
+	"runtime"
 	"slices"
 	"time"
 
 	"github.com/google/osv-scalibr/extractor/internal"
+	"github.com/google/osv-scalibr/extractor/internal/units"
 	"github.com/google/osv-scalibr/log"
 	"github.com/google/osv-scalibr/plugin"
 	"github.com/google/osv-scalibr/purl"
@@ -77,6 +79,8 @@ type Config struct {
 	ReadSymlinks bool
 	// Optional: Limit for visited inodes. If 0, no limit is applied.
 	MaxInodes int
+	// Optional: Logs extractor name and path, which trigger a high memory increase.
+	LogMemoryUsage bool
 }
 
 // LINT.IfChange
@@ -120,16 +124,17 @@ func RunFS(ctx context.Context, config *Config) ([]*Inventory, []*plugin.Status,
 	}
 	start := time.Now()
 	wc := walkContext{
-		ctx:           ctx,
-		stats:         config.Stats,
-		extractors:    config.Extractors,
-		fs:            config.FS,
-		scanRoot:      config.ScanRoot,
-		dirsToSkip:    stringListToMap(config.DirsToSkip),
-		skipDirRegex:  config.SkipDirRegex,
-		readSymlinks:  config.ReadSymlinks,
-		maxInodes:     config.MaxInodes,
-		inodesVisited: 0,
+		ctx:            ctx,
+		stats:          config.Stats,
+		extractors:     config.Extractors,
+		fs:             config.FS,
+		scanRoot:       config.ScanRoot,
+		dirsToSkip:     stringListToMap(config.DirsToSkip),
+		skipDirRegex:   config.SkipDirRegex,
+		readSymlinks:   config.ReadSymlinks,
+		maxInodes:      config.MaxInodes,
+		logMemoryUsage: config.LogMemoryUsage,
+		inodesVisited:  0,
 
 		lastStatus: time.Now(),
 
@@ -151,15 +156,16 @@ func RunFS(ctx context.Context, config *Config) ([]*Inventory, []*plugin.Status,
 }
 
 type walkContext struct {
-	ctx           context.Context
-	stats         stats.Collector
-	extractors    []InventoryExtractor
-	fs            fs.FS
-	scanRoot      string
-	dirsToSkip    map[string]bool // Anything under these paths should be skipped.
-	skipDirRegex  *regexp.Regexp
-	maxInodes     int
-	inodesVisited int
+	ctx            context.Context
+	stats          stats.Collector
+	extractors     []InventoryExtractor
+	fs             fs.FS
+	scanRoot       string
+	dirsToSkip     map[string]bool // Anything under these paths should be skipped.
+	skipDirRegex   *regexp.Regexp
+	maxInodes      int
+	inodesVisited  int
+	logMemoryUsage bool
 
 	// Inventories found.
 	inventory []*Inventory
@@ -228,8 +234,24 @@ func (wc *walkContext) handleFile(path string, d fs.DirEntry, fserr error) error
 
 	wc.mapInodes[internal.ParentDir(filepath.Dir(path), 3)]++
 
+	var before int64
+	if wc.logMemoryUsage {
+		before = internal.MaxResident() * units.KiB
+	}
 	for _, ex := range wc.extractors {
-		wc.runExtractor(ex, path, s.Mode())
+		extractRun := wc.runExtractor(ex, path, s.Mode())
+		if wc.logMemoryUsage && extractRun {
+			// Assuming the Extract function is the memory intense function. If no extract run, we don't
+			// need to query MaxResident again.
+			after := internal.MaxResident() * units.KiB
+			if after > before+5*units.MiB {
+				runtime.GC()
+				afterGC := internal.MaxResident() * units.KiB
+				log.Infof("Memory increase: before: %d, after: %d, after GC: %d extractor: %s path: %s\n",
+					before, after, afterGC, ex.Name(), path)
+			}
+			before = after
+		}
 	}
 	return nil
 }
@@ -244,21 +266,21 @@ func (wc *walkContext) shouldSkipDir(path string) bool {
 	return false
 }
 
-func (wc *walkContext) runExtractor(ex InventoryExtractor, path string, mode fs.FileMode) {
+func (wc *walkContext) runExtractor(ex InventoryExtractor, path string, mode fs.FileMode) bool {
 	if !ex.FileRequired(path, mode) {
-		return
+		return false
 	}
 	rc, err := wc.fs.Open(path)
 	if err != nil {
 		addErrToMap(wc.errors, ex.Name(), fmt.Errorf("Open(%s): %v", path, err))
-		return
+		return false
 	}
 	defer rc.Close()
 
 	info, err := rc.Stat()
 	if err != nil {
 		addErrToMap(wc.errors, ex.Name(), fmt.Errorf("stat(%s): %v", path, err))
-		return
+		return false
 	}
 
 	wc.mapExtracts[internal.ParentDir(filepath.Dir(path), 3)]++
@@ -277,6 +299,7 @@ func (wc *walkContext) runExtractor(ex InventoryExtractor, path string, mode fs.
 			wc.inventory = append(wc.inventory, r)
 		}
 	}
+	return true
 }
 
 func stringListToMap(paths []string) map[string]bool {
diff --git a/extractor/internal/memory_unix.go b/extractor/internal/memory_unix.go
new file mode 100644
index 00000000..7d652883
--- /dev/null
+++ b/extractor/internal/memory_unix.go
@@ -0,0 +1,42 @@
+// Copyright 2024 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package internal
+
+import (
+	"runtime"
+	"syscall"
+
+	"github.com/google/osv-scalibr/log"
+)
+
+// MaxResident returns the peak resident set size of the current process in KiB, or 0 if the
+// value cannot be read. The value is a high-water mark: it never decreases, not even after a GC.
+//
+// getrusage(2) reports ru_maxrss in bytes on Darwin but in kilobytes on Linux and the BSDs, so
+// the raw value is normalized here and callers can always multiply the result by 1024 for bytes.
+func MaxResident() int64 {
+	var u syscall.Rusage
+	if err := syscall.Getrusage(syscall.RUSAGE_SELF, &u); err != nil {
+		log.Warnf("Failed to get rusage: %v", err)
+		return 0
+	}
+
+	// Rusage.Maxrss is int32 on some 32-bit targets, so convert explicitly.
+	maxRSS := int64(u.Maxrss)
+	if runtime.GOOS == "darwin" || runtime.GOOS == "ios" {
+		return maxRSS / 1024
+	}
+	return maxRSS
+}
diff --git a/extractor/language/javascript/packagejson/extractor.go b/extractor/language/javascript/packagejson/extractor.go
index 73d0305d..9ca26958 100644
--- a/extractor/language/javascript/packagejson/extractor.go
+++ b/extractor/language/javascript/packagejson/extractor.go
@@ -97,7 +97,9 @@ func (e Extractor) FileRequired(path string, _ fs.FileMode) bool {
 // Extract extracts packages from package.json files passed through the scan input.
 func (e Extractor) Extract(ctx context.Context, input *extractor.ScanInput) ([]*extractor.Inventory, error) {
 	if input.Info != nil && input.Info.Size() > e.maxJSONSize {
-		return nil, fmt.Errorf("package.json file %s is too large: %d", input.Path, input.Info.Size())
+		err := fmt.Errorf("package.json file %s is too large: %d", input.Path, input.Info.Size())
+		log.Error(err)
+		return nil, err
 	}
 	i, err := parse(input.Path, input.Reader)
 	if err != nil {
diff --git a/scalibr.go b/scalibr.go
index b11d3a96..80e6ba21 100644
--- a/scalibr.go
+++ b/scalibr.go
@@ -54,6 +54,8 @@ type ScanConfig struct {
 	ReadSymlinks bool
 	// Optional: Limit for visited inodes. If 0, no limit is applied.
 	MaxInodes int
+	// Optional: Logs extractor name and path, which trigger a high memory increase.
+	LogMemoryUsage bool
 }
 
 // LINT.IfChange
@@ -94,6 +96,7 @@ func (Scanner) Scan(ctx context.Context, config *ScanConfig) (sr *ScanResult) {
 		SkipDirRegex:   config.SkipDirRegex,
 		ScanRoot:       config.ScanRoot,
 		MaxInodes:      config.MaxInodes,
+		LogMemoryUsage: config.LogMemoryUsage,
 	}
 	inventories, extractorStatus, err := extractor.Run(ctx, extractorConfig)
 	sro.Inventories = inventories