Mirror of https://github.com/tw93/Mole.git, synced 2026-02-15 09:20:08 +00:00
feat: Parallelize metric collection, enhance app protection with receipt file scanning, and update cleanup tasks
@@ -169,11 +169,16 @@ var skipSystemDirs = map[string]bool{
 	"bin": true,
 	"etc": true,
 	"var": true,
+	"opt":     false, // User might want to specifically check opt
+	"usr":     false, // User might check usr
+	"Volumes": true,  // Skip external drives by default when scanning root
+	"Network": true,  // Skip network mounts
 	".vol":                    true,
 	".Spotlight-V100":         true,
 	".fseventsd":              true,
 	".DocumentRevisions-V100": true,
 	".TemporaryItems":         true,
+	".MobileBackups":          true, // Time Machine local snapshots
 }

 var skipExtensions = map[string]bool{
cmd/analyze/heap.go (new file, 44 lines)
@@ -0,0 +1,44 @@
+package main
+
+// entryHeap implements heap.Interface for a min-heap of dirEntry (sorted by Size).
+// Since we want the Top N largest, we use a min-heap of size N.
+// When adding a new item:
+// 1. If heap size < N: push
+// 2. If heap size == N and item > min (root): pop min, push item
+// The heap will thus maintain the largest N items.
+type entryHeap []dirEntry
+
+func (h entryHeap) Len() int           { return len(h) }
+func (h entryHeap) Less(i, j int) bool { return h[i].Size < h[j].Size } // Min-heap based on Size
+func (h entryHeap) Swap(i, j int)      { h[i], h[j] = h[j], h[i] }
+
+func (h *entryHeap) Push(x interface{}) {
+	*h = append(*h, x.(dirEntry))
+}
+
+func (h *entryHeap) Pop() interface{} {
+	old := *h
+	n := len(old)
+	x := old[n-1]
+	*h = old[0 : n-1]
+	return x
+}
+
+// largeFileHeap implements heap.Interface for fileEntry
+type largeFileHeap []fileEntry
+
+func (h largeFileHeap) Len() int           { return len(h) }
+func (h largeFileHeap) Less(i, j int) bool { return h[i].Size < h[j].Size }
+func (h largeFileHeap) Swap(i, j int)      { h[i], h[j] = h[j], h[i] }
+
+func (h *largeFileHeap) Push(x interface{}) {
+	*h = append(*h, x.(fileEntry))
+}
+
+func (h *largeFileHeap) Pop() interface{} {
+	old := *h
+	n := len(old)
+	x := old[n-1]
+	*h = old[0 : n-1]
+	return x
+}
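The comments in heap.go describe a standard Top-N policy: keep a min-heap capped at N, and replace the root whenever a larger item arrives. The following is a minimal, runnable sketch of that policy, not the repository's code; it uses a hypothetical item type in place of dirEntry/fileEntry.

package main

import (
	"container/heap"
	"fmt"
)

// item stands in for dirEntry/fileEntry; only Size matters for the heap.
type item struct {
	Name string
	Size int64
}

// minHeap mirrors the entryHeap pattern: a min-heap ordered by Size.
type minHeap []item

func (h minHeap) Len() int            { return len(h) }
func (h minHeap) Less(i, j int) bool  { return h[i].Size < h[j].Size }
func (h minHeap) Swap(i, j int)       { h[i], h[j] = h[j], h[i] }
func (h *minHeap) Push(x interface{}) { *h = append(*h, x.(item)) }
func (h *minHeap) Pop() interface{} {
	old := *h
	n := len(old)
	x := old[n-1]
	*h = old[:n-1]
	return x
}

// topN keeps at most n items: push while below capacity, otherwise replace
// the current minimum (the root) whenever a larger item arrives.
func topN(h *minHeap, n int, it item) {
	if h.Len() < n {
		heap.Push(h, it)
	} else if it.Size > (*h)[0].Size {
		heap.Pop(h)
		heap.Push(h, it)
	}
}

func main() {
	h := &minHeap{}
	heap.Init(h)
	for i, size := range []int64{5, 42, 7, 99, 13, 64, 3} {
		topN(h, 3, item{Name: fmt.Sprintf("entry-%d", i), Size: size})
	}
	// Popping a min-heap yields ascending sizes; filling the slice in reverse
	// gives a descending Top 3, which is what the commit does after collection.
	out := make([]item, h.Len())
	for i := len(out) - 1; i >= 0; i-- {
		out[i] = heap.Pop(h).(item)
	}
	fmt.Println(out) // [{entry-3 99} {entry-5 64} {entry-1 42}]
}

The memory win is the point: instead of buffering every entry and sorting at the end, at most N items are ever held per heap.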
@@ -2,6 +2,7 @@ package main

 import (
 	"bytes"
+	"container/heap"
 	"context"
 	"fmt"
 	"io/fs"
@@ -29,8 +30,14 @@ func scanPathConcurrent(root string, filesScanned, dirsScanned, bytesScanned *in
 	}

 	var total int64
-	entries := make([]dirEntry, 0, len(children))
-	largeFiles := make([]fileEntry, 0, maxLargeFiles*2)
+	// Use heaps to track Top N items, drastically reducing memory usage
+	// for directories with millions of files
+	entriesHeap := &entryHeap{}
+	heap.Init(entriesHeap)
+
+	largeFilesHeap := &largeFileHeap{}
+	heap.Init(largeFilesHeap)

 	// Use worker pool for concurrent directory scanning
 	// For I/O-bound operations, use more workers than CPU count
@@ -54,19 +61,31 @@ func scanPathConcurrent(root string, filesScanned, dirsScanned, bytesScanned *in
 	entryChan := make(chan dirEntry, len(children))
 	largeFileChan := make(chan fileEntry, maxLargeFiles*2)

-	// Start goroutines to collect from channels
+	// Start goroutines to collect from channels into heaps
 	var collectorWg sync.WaitGroup
 	collectorWg.Add(2)
 	go func() {
 		defer collectorWg.Done()
 		for entry := range entryChan {
-			entries = append(entries, entry)
+			// Maintain Top N heap for entries
+			if entriesHeap.Len() < maxEntries {
+				heap.Push(entriesHeap, entry)
+			} else if entry.Size > (*entriesHeap)[0].Size {
+				heap.Pop(entriesHeap)
+				heap.Push(entriesHeap, entry)
+			}
 		}
 	}()
 	go func() {
 		defer collectorWg.Done()
 		for file := range largeFileChan {
-			largeFiles = append(largeFiles, file)
+			// Maintain Top N heap for large files
+			if largeFilesHeap.Len() < maxLargeFiles {
+				heap.Push(largeFilesHeap, file)
+			} else if file.Size > (*largeFilesHeap)[0].Size {
+				heap.Pop(largeFilesHeap)
+				heap.Push(largeFilesHeap, file)
+			}
 		}
 	}()

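This hunk relies on a collect-then-close discipline: worker goroutines send results on a channel, a single collector goroutine folds them into the heap (so no lock is needed), and closing the channel plus waiting on the collector guarantees the fold is complete before results are read. A stripped-down sketch of that pattern, with illustrative names rather than the repository's, could look like this:

package main

import (
	"fmt"
	"sync"
)

func main() {
	results := make(chan int64, 16)

	// Single collector goroutine owns `total`, so no mutex is required.
	var collectorWg sync.WaitGroup
	collectorWg.Add(1)
	var total int64
	go func() {
		defer collectorWg.Done()
		for size := range results {
			total += size
		}
	}()

	// Producers send concurrently.
	var workerWg sync.WaitGroup
	for _, size := range []int64{10, 20, 30, 40} {
		workerWg.Add(1)
		go func(s int64) {
			defer workerWg.Done()
			results <- s
		}(size)
	}

	workerWg.Wait()    // all producers finished
	close(results)     // tell the collector no more values are coming
	collectorWg.Wait() // the fold is complete; reading total is now safe
	fmt.Println(total) // 100
}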
@@ -113,7 +132,7 @@ func scanPathConcurrent(root string, filesScanned, dirsScanned, bytesScanned *in
 			// Try du command first for folded dirs (much faster)
 			size, err := getDirectorySizeFromDu(path)
 			if err != nil || size <= 0 {
-				// Fallback to walk if du fails
+				// Fallback to concurrent walk if du fails
 				size = calculateDirSizeFast(path, filesScanned, dirsScanned, bytesScanned, currentPath)
 			}
 			atomic.AddInt64(&total, size)
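getDirectorySizeFromDu is not part of this diff, so the following is only an assumption of how a du-based lookup might work: du -sk reports a directory total in 1 KiB blocks and is usually much faster than a userspace walk because the traversal happens inside du itself.

package main

import (
	"fmt"
	"os/exec"
	"strconv"
	"strings"
)

// dirSizeFromDu is a hypothetical helper, not the repository's implementation.
// It shells out to `du -sk <path>` and converts the first field (KiB) to bytes.
func dirSizeFromDu(path string) (int64, error) {
	out, err := exec.Command("du", "-sk", path).Output()
	if err != nil {
		return 0, err
	}
	fields := strings.Fields(string(out))
	if len(fields) == 0 {
		return 0, fmt.Errorf("unexpected du output: %q", out)
	}
	kb, err := strconv.ParseInt(fields[0], 10, 64)
	if err != nil {
		return 0, err
	}
	return kb * 1024, nil
}

func main() {
	size, err := dirSizeFromDu("/tmp")
	if err != nil || size <= 0 {
		// Mirrors the diff's behavior: on failure, fall back to the walk.
		fmt.Println("du failed, caller would fall back to a concurrent walk")
		return
	}
	fmt.Println("size:", size, "bytes")
}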
@@ -182,11 +201,15 @@ func scanPathConcurrent(root string, filesScanned, dirsScanned, bytesScanned *in
 	close(largeFileChan)
 	collectorWg.Wait()

-	sort.Slice(entries, func(i, j int) bool {
-		return entries[i].Size > entries[j].Size
-	})
-	if len(entries) > maxEntries {
-		entries = entries[:maxEntries]
-	}
+	// Convert heaps to sorted slices (descending order)
+	entries := make([]dirEntry, entriesHeap.Len())
+	for i := len(entries) - 1; i >= 0; i-- {
+		entries[i] = heap.Pop(entriesHeap).(dirEntry)
+	}
+
+	largeFiles := make([]fileEntry, largeFilesHeap.Len())
+	for i := len(largeFiles) - 1; i >= 0; i-- {
+		largeFiles[i] = heap.Pop(largeFilesHeap).(fileEntry)
+	}

 	// Try to use Spotlight (mdfind) for faster large file discovery
@@ -194,18 +217,15 @@ func scanPathConcurrent(root string, filesScanned, dirsScanned, bytesScanned *in
 	// if Spotlight is unavailable or fails. The fallback is intentionally silent
 	// because users only care about correct results, not the method used.
 	if spotlightFiles := findLargeFilesWithSpotlight(root, minLargeFileSize); len(spotlightFiles) > 0 {
+		// Spotlight results are already a sorted top N;
+		// use them in place of the scanned large files.
 		largeFiles = spotlightFiles
-	} else {
-		// Use files collected during scanning (fallback path)
-		// Sort and trim large files collected from scanning
-		sort.Slice(largeFiles, func(i, j int) bool {
-			return largeFiles[i].Size > largeFiles[j].Size
-		})
-		if len(largeFiles) > maxLargeFiles {
-			largeFiles = largeFiles[:maxLargeFiles]
-		}
 	}

+	// Sorting consistency: Spotlight returns results in descending order, and the
+	// heap results were popped in ascending order and filled in reverse, so
+	// largeFiles is already descending on either path; no extra sort is needed.
+
 	return scanResult{
 		Entries:    entries,
 		LargeFiles: largeFiles,
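findLargeFilesWithSpotlight is also outside this diff. Purely as an illustration, a Spotlight query for large files can be issued through mdfind with the -onlyin flag and the kMDItemFSSize attribute; the repository's implementation may differ (for instance, it apparently returns a size-sorted top N, which this sketch does not do).

package main

import (
	"bufio"
	"fmt"
	"os/exec"
	"strconv"
	"strings"
)

// spotlightLargeFiles is a hypothetical sketch: ask Spotlight for paths under
// root whose on-disk size exceeds minSize. mdfind prints one path per line.
func spotlightLargeFiles(root string, minSize int64) ([]string, error) {
	query := fmt.Sprintf("kMDItemFSSize > %s", strconv.FormatInt(minSize, 10))
	out, err := exec.Command("mdfind", "-onlyin", root, query).Output()
	if err != nil {
		return nil, err // callers fall back to scan results, silently
	}
	var paths []string
	scanner := bufio.NewScanner(strings.NewReader(string(out)))
	for scanner.Scan() {
		if line := strings.TrimSpace(scanner.Text()); line != "" {
			paths = append(paths, line)
		}
	}
	return paths, nil
}

func main() {
	paths, err := spotlightLargeFiles("/Users", 1<<30) // files over 1 GiB
	if err != nil {
		fmt.Println("mdfind unavailable, falling back:", err)
		return
	}
	fmt.Println(len(paths), "large files found")
}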
@@ -242,70 +262,77 @@ func shouldSkipFileForLargeTracking(path string) bool {
 	return skipExtensions[ext]
 }

-// calculateDirSizeFast performs fast directory size calculation without detailed tracking or large file detection.
-// Updates progress counters in batches to reduce atomic operation overhead.
+// calculateDirSizeFast performs concurrent directory size calculation using os.ReadDir.
+// This is a faster fallback than filepath.WalkDir when du fails.
 func calculateDirSizeFast(root string, filesScanned, dirsScanned, bytesScanned *int64, currentPath *string) int64 {
 	var total int64
-	var localFiles, localDirs int64
-	var batchBytes int64
+	var wg sync.WaitGroup

 	// Create context with timeout
 	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
 	defer cancel()

-	walkFunc := func(path string, d fs.DirEntry, err error) error {
-		// Check for timeout
+	// Limit total concurrency for this walk
+	concurrency := runtime.NumCPU() * 4
+	if concurrency > 64 {
+		concurrency = 64
+	}
+	sem := make(chan struct{}, concurrency)
+
+	var walk func(string)
+	walk = func(dirPath string) {
 		select {
 		case <-ctx.Done():
-			return ctx.Err()
+			return
 		default:
 		}
-		if err != nil {
-			return nil
-		}
-		if d.IsDir() {
-			localDirs++
-			// Batch update every N dirs to reduce atomic operations
-			if localDirs%batchUpdateSize == 0 {
-				atomic.AddInt64(dirsScanned, batchUpdateSize)
-				localDirs = 0
-			}
-			return nil
-		}
-		info, err := d.Info()
-		if err != nil {
-			return nil
-		}
-		// Get actual disk usage for sparse files and cloud files
-		size := getActualFileSize(path, info)
-		total += size
-		batchBytes += size
-		localFiles++
 		if currentPath != nil {
-			*currentPath = path
+			*currentPath = dirPath
 		}
-		// Batch update every N files to reduce atomic operations
-		if localFiles%batchUpdateSize == 0 {
-			atomic.AddInt64(filesScanned, batchUpdateSize)
-			atomic.AddInt64(bytesScanned, batchBytes)
-			localFiles = 0
-			batchBytes = 0
+
+		entries, err := os.ReadDir(dirPath)
+		if err != nil {
+			return
+		}
+
+		var localBytes, localFiles int64
+
+		for _, entry := range entries {
+			if entry.IsDir() {
+				// Directories: recurse concurrently
+				wg.Add(1)
+				// Capture loop variable
+				subDir := filepath.Join(dirPath, entry.Name())
+				go func(p string) {
+					defer wg.Done()
+					sem <- struct{}{}        // Acquire token
+					defer func() { <-sem }() // Release token
+					walk(p)
+				}(subDir)
+				atomic.AddInt64(dirsScanned, 1)
+			} else {
+				// Files: process immediately
+				info, err := entry.Info()
+				if err == nil {
+					size := getActualFileSize(filepath.Join(dirPath, entry.Name()), info)
+					localBytes += size
+					localFiles++
+				}
+			}
+		}
+
+		if localBytes > 0 {
+			atomic.AddInt64(&total, localBytes)
+			atomic.AddInt64(bytesScanned, localBytes)
+		}
+		if localFiles > 0 {
+			atomic.AddInt64(filesScanned, localFiles)
 		}
-		return nil
 	}

-	_ = filepath.WalkDir(root, walkFunc)
-	// Final update for remaining counts
-	if localFiles > 0 {
-		atomic.AddInt64(filesScanned, localFiles)
-	}
-	if localDirs > 0 {
-		atomic.AddInt64(dirsScanned, localDirs)
-	}
-	if batchBytes > 0 {
-		atomic.AddInt64(bytesScanned, batchBytes)
-	}
+	walk(root)
+	wg.Wait()

 	return total
 }
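For reference, the new walk strategy can be distilled into a self-contained sketch: read each directory with os.ReadDir, sum files locally, recurse into subdirectories on goroutines, and cap the fan-out with a buffered-channel semaphore. The progress counters, timeout handling, and getActualFileSize from the diff are omitted; info.Size() stands in for actual disk usage.

package main

import (
	"fmt"
	"os"
	"path/filepath"
	"runtime"
	"sync"
	"sync/atomic"
)

// concurrentDirSize is a simplified sketch of the pattern above, not the
// repository's function. Unreadable directories are silently skipped.
func concurrentDirSize(root string) int64 {
	var total int64
	var wg sync.WaitGroup

	// Bound the number of directories being read at once.
	concurrency := runtime.NumCPU() * 4
	if concurrency > 64 {
		concurrency = 64
	}
	sem := make(chan struct{}, concurrency)

	var walk func(string)
	walk = func(dir string) {
		entries, err := os.ReadDir(dir)
		if err != nil {
			return
		}
		var localBytes int64
		for _, entry := range entries {
			if entry.IsDir() {
				// wg.Add happens before the goroutine starts, so Wait is safe.
				wg.Add(1)
				sub := filepath.Join(dir, entry.Name())
				go func(p string) {
					defer wg.Done()
					sem <- struct{}{}        // acquire a slot
					defer func() { <-sem }() // release it
					walk(p)
				}(sub)
			} else if info, err := entry.Info(); err == nil {
				localBytes += info.Size() // logical size; the diff uses getActualFileSize
			}
		}
		if localBytes > 0 {
			atomic.AddInt64(&total, localBytes)
		}
	}

	walk(root)
	wg.Wait()
	return total
}

func main() {
	fmt.Println(concurrentDirSize(os.TempDir()), "bytes")
}

Note the design choice the diff makes here: a goroutine holding a semaphore slot never waits on its children, it only spawns them and returns, so the bounded semaphore cannot deadlock, and per-directory byte counts are folded into the shared totals with a single atomic add rather than one per file.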