optimize disk analysis with caching and concurrency

2026-02-15 08:45:09 +00:00 · 2025-11-12 23:58:26 +08:00
parent 6e11c52039
commit f3d76f32de
1 changed file with 225 additions and 26 deletions
--- a/cmd/analyze/main.go
+++ b/cmd/analyze/main.go
@@ -1,6 +1,8 @@
 package main
 import (
 	"crypto/md5"
 	"encoding/gob"
 	"fmt"
 	"io/fs"
 	"os"
@@ -131,6 +133,14 @@ type scanResult struct {
 	totalSize  int64
 }
 // cacheEntry is the gob-serialized snapshot of one directory scan that is
 // written to and read back from the persistent on-disk cache.
 type cacheEntry struct {
 	// Scan payload, copied field-for-field from a scanResult.
 	Entries    []dirEntry
 	LargeFiles []fileEntry
 	TotalSize  int64
 	// ModTime is the scanned directory's modification time captured when the
 	// snapshot was saved; ScanTime is the moment the snapshot was saved. Both
 	// are consulted when deciding whether the snapshot is still usable.
 	ModTime, ScanTime time.Time
 }
 type historyEntry struct {
 	path          string
 	entries       []dirEntry
@@ -258,7 +268,24 @@ func (m model) Init() tea.Cmd {
 // scanCmd builds the command that produces the scan result for path. It
 // serves the persisted snapshot when loadCacheFromDisk accepts it and
 // otherwise runs scanPathConcurrent, handing a successful result to
 // saveCacheToDisk.
 func (m model) scanCmd(path string) tea.Cmd {
 	return func() tea.Msg {
 		cached, cacheErr := loadCacheFromDisk(path)
 		if cacheErr != nil {
 			// Any load failure is treated as a cache miss: scan for real.
 			scanned, scanErr := scanPathConcurrent(path, m.filesScanned, m.dirsScanned, m.bytesScanned, m.currentPath)
 			if scanErr == nil {
 				// Persist in a separate goroutine so the message is returned
 				// without waiting for the write; the error returned by
 				// saveCacheToDisk is discarded.
 				go saveCacheToDisk(path, scanned)
 			}
 			return scanResultMsg{result: scanned, err: scanErr}
 		}
 		// Cache hit: rebuild the scan result from the stored snapshot.
 		fromCache := scanResult{
 			entries:    cached.Entries,
 			largeFiles: cached.LargeFiles,
 			totalSize:  cached.TotalSize,
 		}
 		return scanResultMsg{result: fromCache}
 	}
 }
@@ -727,9 +754,14 @@ func scanPathConcurrent(root string, filesScanned, dirsScanned, bytesScanned *in
 	var entriesMu sync.Mutex
 	// Use worker pool for concurrent directory scanning
-	maxWorkers := runtime.NumCPU() * 2
+	// For I/O-bound operations, use more workers than CPU count
-	if maxWorkers < 4 {
+	maxWorkers := runtime.NumCPU() * 4
-		maxWorkers = 4
+	if maxWorkers < 16 {
 		maxWorkers = 16 // Minimum 16 workers for better I/O throughput
 	}
 	// Cap at 128 to avoid excessive goroutines
 	if maxWorkers > 128 {
 		maxWorkers = 128
 	}
 	if maxWorkers > len(children) {
 		maxWorkers = len(children)
@@ -763,13 +795,14 @@ func scanPathConcurrent(root string, filesScanned, dirsScanned, bytesScanned *in
 					atomic.AddInt64(&total, size)
 					atomic.AddInt64(dirsScanned, 1)
-					entriesMu.Lock()
+					entry := dirEntry{
 					entries = append(entries, dirEntry{
 						name:  name,
 						path:  path,
 						size:  size,
 						isDir: true,
-					})
+					}
 					entriesMu.Lock()
 					entries = append(entries, entry)
 					entriesMu.Unlock()
 				}(child.Name(), fullPath)
 				continue
@@ -786,13 +819,14 @@ func scanPathConcurrent(root string, filesScanned, dirsScanned, bytesScanned *in
 				atomic.AddInt64(&total, size)
 				atomic.AddInt64(dirsScanned, 1)
-				entriesMu.Lock()
+				entry := dirEntry{
 				entries = append(entries, dirEntry{
 					name:  name,
 					path:  path,
 					size:  size,
 					isDir: true,
-				})
+				}
 				entriesMu.Lock()
 				entries = append(entries, entry)
 				entriesMu.Unlock()
 			}(child.Name(), fullPath)
 			continue
@@ -856,13 +890,20 @@ func shouldSkipFileForLargeTracking(path string) bool {
 // Fast directory size calculation (no detailed tracking, no large files)
 func calculateDirSizeFast(root string, filesScanned, dirsScanned, bytesScanned *int64) int64 {
 	var total int64
 	var localFiles, localDirs int64
 	var batchBytes int64
 	_ = filepath.WalkDir(root, func(path string, d fs.DirEntry, err error) error {
 		if err != nil {
 			return nil
 		}
 		if d.IsDir() {
-			atomic.AddInt64(dirsScanned, 1)
+			localDirs++
 			// Batch update every 100 dirs to reduce atomic operations
 			if localDirs%100 == 0 {
 				atomic.AddInt64(dirsScanned, 100)
 				localDirs = 0
 			}
 			return nil
 		}
 		info, err := d.Info()
@@ -871,11 +912,29 @@ func calculateDirSizeFast(root string, filesScanned, dirsScanned, bytesScanned *
 		}
 		size := info.Size()
 		total += size
-		atomic.AddInt64(filesScanned, 1)
+		batchBytes += size
-		atomic.AddInt64(bytesScanned, size)
+		localFiles++
 		// Batch update every 100 files to reduce atomic operations
 		if localFiles%100 == 0 {
 			atomic.AddInt64(filesScanned, 100)
 			atomic.AddInt64(bytesScanned, batchBytes)
 			localFiles = 0
 			batchBytes = 0
 		}
 		return nil
 	})
 	// Final update for remaining counts
 	if localFiles > 0 {
 		atomic.AddInt64(filesScanned, localFiles)
 	}
 	if localDirs > 0 {
 		atomic.AddInt64(dirsScanned, localDirs)
 	}
 	if batchBytes > 0 {
 		atomic.AddInt64(bytesScanned, batchBytes)
 	}
 	return total
 }
@@ -946,6 +1005,8 @@ func findLargeFilesWithSpotlight(root string, minSize int64) []fileEntry {
 func calculateDirSizeConcurrent(root string, tracker *largeFileTracker, filesScanned, dirsScanned, bytesScanned *int64, currentPath *string) int64 {
 	var total int64
 	var updateCounter int64
 	var localFiles, localDirs int64
 	var batchBytes int64
 	_ = filepath.WalkDir(root, func(path string, d fs.DirEntry, err error) error {
 		if err != nil {
@@ -956,7 +1017,12 @@ func calculateDirSizeConcurrent(root string, tracker *largeFileTracker, filesSca
 			if shouldFoldDir(d.Name()) {
 				return filepath.SkipDir
 			}
-			atomic.AddInt64(dirsScanned, 1)
+			localDirs++
 			// Batch update every 50 dirs to reduce atomic operations
 			if localDirs%50 == 0 {
 				atomic.AddInt64(dirsScanned, 50)
 				localDirs = 0
 			}
 			return nil
 		}
 		info, err := d.Info()
@@ -965,57 +1031,100 @@ func calculateDirSizeConcurrent(root string, tracker *largeFileTracker, filesSca
 		}
 		size := info.Size()
 		total += size
-		atomic.AddInt64(filesScanned, 1)
+		batchBytes += size
-		atomic.AddInt64(bytesScanned, size)
+		localFiles++
 		// Batch update every 50 files to reduce atomic operations
 		if localFiles%50 == 0 {
 			atomic.AddInt64(filesScanned, 50)
 			atomic.AddInt64(bytesScanned, batchBytes)
 			localFiles = 0
 			batchBytes = 0
 		}
 		// Only track large files that are not code/text files
 		if !shouldSkipFileForLargeTracking(path) {
 			tracker.add(fileEntry{name: filepath.Base(path), path: path, size: size})
 		}
-		// Update current path every 100 files to reduce contention
+		// Update current path every 500 files to reduce contention
 		updateCounter++
-		if updateCounter%100 == 0 {
+		if updateCounter%500 == 0 {
 			*currentPath = path
 		}
 		return nil
 	})
 	// Final update for remaining counts
 	if localFiles > 0 {
 		atomic.AddInt64(filesScanned, localFiles)
 	}
 	if localDirs > 0 {
 		atomic.AddInt64(dirsScanned, localDirs)
 	}
 	if batchBytes > 0 {
 		atomic.AddInt64(bytesScanned, batchBytes)
 	}
 	return total
 }
 // largeFileTracker collects the large files seen during a scan.
 type largeFileTracker struct {
-	mu      sync.Mutex
+	mu         sync.Mutex // locked by add and list around their access to entries and needsSort
-	entries []fileEntry
+	entries    []fileEntry // collected files; not necessarily ordered while needsSort is true
 	minSize    int64 // add rejects files below this size; add raises it when it trims entries
 	needsSort  bool // set by add after appending; cleared once entries has been sorted
 }
 // newLargeFileTracker returns an empty tracker whose size threshold starts at
 // minLargeFileSize.
 func newLargeFileTracker() *largeFileTracker {
 	return &largeFileTracker{
-		entries: make([]fileEntry, 0, maxLargeFiles),
+		entries: make([]fileEntry, 0, maxLargeFiles*2), // capacity for twice maxLargeFiles up front
 		minSize: minLargeFileSize,
 	}
 }
 // add records f unless it is smaller than the tracker's current threshold.
 // New entries are appended unsorted; once the slice exceeds 3x maxLargeFiles
 // it is sorted by size (largest first), cut back to maxLargeFiles, and the
 // threshold is raised to the smallest size that was kept.
 //
 // NOTE(review): t.minSize is read below before t.mu is taken, but it is
 // written further down while the lock is held. If add is called from several
 // goroutines that read is unsynchronized - confirm and move it under the lock.
 func (t *largeFileTracker) add(f fileEntry) {
-	if f.size < minLargeFileSize {
+	if f.size < t.minSize {
 		return
 	}
 	t.mu.Lock()
 	defer t.mu.Unlock()
 	// Append without sorting; ordering is deferred (needsSort) instead of re-sorting on every add
 	t.entries = append(t.entries, f)
-	sort.Slice(t.entries, func(i, j int) bool {
+	t.needsSort = true
-		return t.entries[i].size > t.entries[j].size
+
-	})
+	// Past 3x maxLargeFiles: trim back to maxLargeFiles and raise the size threshold
-	if len(t.entries) > maxLargeFiles {
+	if len(t.entries) > maxLargeFiles*3 {
-		t.entries = t.entries[:maxLargeFiles]
+		// Sort largest-first so the trim below keeps the biggest files and entries stays bounded
 		sort.Slice(t.entries, func(i, j int) bool {
 			return t.entries[i].size > t.entries[j].size
 		})
 		if len(t.entries) > maxLargeFiles {
 			t.minSize = t.entries[maxLargeFiles-1].size
 			t.entries = t.entries[:maxLargeFiles]
 		}
 		t.needsSort = false
 	}
 }
 // list returns a copy of the tracked files, largest first and at most
 // maxLargeFiles long. Pending unsorted additions are sorted and trimmed in
 // place before the copy is taken.
 func (t *largeFileTracker) list() []fileEntry {
 	t.mu.Lock()
 	defer t.mu.Unlock()
 	if t.needsSort {
 		bySizeDesc := func(a, b int) bool {
 			return t.entries[a].size > t.entries[b].size
 		}
 		sort.Slice(t.entries, bySizeDesc)
 		if excess := len(t.entries) - maxLargeFiles; excess > 0 {
 			t.entries = t.entries[:maxLargeFiles]
 		}
 		t.needsSort = false
 	}
 	// Hand out a detached copy so callers cannot alias the tracker's slice.
 	var snapshot []fileEntry
 	snapshot = append(snapshot, t.entries...)
 	return snapshot
 }
@@ -1270,3 +1379,93 @@ func cacheSnapshot(m model) historyEntry {
 	entry.dirty = false
 	return entry
 }
 // Persistent cache functions
 // getCacheDir returns the directory that holds the persistent cache files,
 // <home>/.cache/mole, creating it (and any missing parents) with mode 0755.
 // It fails when the home directory cannot be determined or the directory
 // cannot be created.
 func getCacheDir() (string, error) {
 	var cacheDir string
 	home, err := os.UserHomeDir()
 	if err == nil {
 		cacheDir = filepath.Join(home, ".cache", "mole")
 		err = os.MkdirAll(cacheDir, 0755)
 	}
 	if err != nil {
 		return "", err
 	}
 	return cacheDir, nil
 }
 // getCachePath maps a scanned path to the file that stores its cache entry:
 // <cache dir>/<hex MD5 of path>.cache. The path string is hashed exactly as
 // given. MD5 only serves to derive a fixed-length file name here.
 func getCachePath(path string) (string, error) {
 	dir, err := getCacheDir()
 	if err != nil {
 		return "", err
 	}
 	name := fmt.Sprintf("%x.cache", md5.Sum([]byte(path)))
 	return filepath.Join(dir, name), nil
 }
 // loadCacheFromDisk returns the persisted scan snapshot for path.
 //
 // It returns an error when the cache file cannot be located, opened or
 // decoded, when path cannot be stat'ed, when the directory's modification
 // time differs from the one recorded in the snapshot, or when the snapshot is
 // more than 7 days old.
 func loadCacheFromDisk(path string) (*cacheEntry, error) {
 	cachePath, err := getCachePath(path)
 	if err != nil {
 		return nil, err
 	}
 	file, err := os.Open(cachePath)
 	if err != nil {
 		return nil, err
 	}
 	defer file.Close()
 	var entry cacheEntry
 	if err := gob.NewDecoder(file).Decode(&entry); err != nil {
 		return nil, fmt.Errorf("decoding cache %q: %w", cachePath, err)
 	}
 	info, err := os.Stat(path)
 	if err != nil {
 		return nil, err
 	}
 	// Any mismatch invalidates the snapshot, not only a newer mtime: a
 	// directory restored from backup or re-stamped can carry an older mtime
 	// than the one recorded and still have different contents.
 	if !info.ModTime().Equal(entry.ModTime) {
 		return nil, fmt.Errorf("cache expired: directory modified")
 	}
 	// Snapshots older than 7 days are discarded regardless of mtime.
 	if time.Since(entry.ScanTime) > 7*24*time.Hour {
 		return nil, fmt.Errorf("cache expired: too old")
 	}
 	return &entry, nil
 }
 // saveCacheToDisk persists result as the cache snapshot for path, stamped
 // with the directory's current modification time and the current time.
 //
 // The snapshot is encoded into a temporary file next to the target and then
 // renamed into place, so a failed or interrupted write never leaves a
 // truncated cache file behind and never replaces an existing good one. The
 // temporary file is created by os.CreateTemp and therefore has mode 0600.
 //
 // NOTE(review): gob only serializes exported struct fields, and the dirEntry
 // and fileEntry values built elsewhere in this file are populated through
 // unexported fields - confirm that Encode actually succeeds for them.
 func saveCacheToDisk(path string, result scanResult) error {
 	cachePath, err := getCachePath(path)
 	if err != nil {
 		return err
 	}
 	info, err := os.Stat(path)
 	if err != nil {
 		return err
 	}
 	entry := cacheEntry{
 		Entries:    result.entries,
 		LargeFiles: result.largeFiles,
 		TotalSize:  result.totalSize,
 		ModTime:    info.ModTime(),
 		ScanTime:   time.Now(),
 	}
 	tmp, err := os.CreateTemp(filepath.Dir(cachePath), filepath.Base(cachePath)+".*.tmp")
 	if err != nil {
 		return err
 	}
 	tmpName := tmp.Name()
 	err = gob.NewEncoder(tmp).Encode(entry)
 	// Always close, but keep the encode error if there was one.
 	if closeErr := tmp.Close(); err == nil {
 		err = closeErr
 	}
 	if err == nil {
 		err = os.Rename(tmpName, cachePath)
 	}
 	if err != nil {
 		// Best-effort cleanup; the original failure is what the caller needs.
 		_ = os.Remove(tmpName)
 		return fmt.Errorf("writing cache %q: %w", cachePath, err)
 	}
 	return nil
 }