mirror of https://github.com/tw93/Mole.git synced 2026-02-04 20:19:45 +00:00

feat: Parallelize metric collection, enhance app protection with receipt file scanning, and update cleanup tasks

Tw93
2025-12-12 14:10:01 +08:00
parent 558ee86d1f
commit 0123f71842
3 changed files with 144 additions and 68 deletions


@@ -169,11 +169,16 @@ var skipSystemDirs = map[string]bool{
"bin": true,
"etc": true,
"var": true,
"opt": false, // User might want to specific check opt
"usr": false, // User might check usr
"Volumes": true, // Skip external drives by default when scanning root
"Network": true, // Skip network mounts
".vol": true,
".Spotlight-V100": true,
".fseventsd": true,
".DocumentRevisions-V100": true,
".TemporaryItems": true,
".MobileBackups": true, // Time Machine local snapshots
}
var skipExtensions = map[string]bool{
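The `false` entries make the lookup semantics worth a note: a name is skipped only when it is explicitly marked true, so `opt` and `usr` are still scanned despite appearing in the map. A minimal, hypothetical sketch of the check (`shouldSkip` is not part of this commit):

package main

import "fmt"

// Trimmed copy of the map above, for illustration only.
var skipSystemDirs = map[string]bool{
	"bin":     true,
	"opt":     false, // listed but not skipped
	"Volumes": true,
}

// shouldSkip is hypothetical: a missing key and an explicit false
// both mean "scan it", so the plain map lookup is enough.
func shouldSkip(name string) bool {
	return skipSystemDirs[name]
}

func main() {
	fmt.Println(shouldSkip("Volumes"), shouldSkip("opt"), shouldSkip("home")) // true false false
}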

cmd/analyze/heap.go Normal file

@@ -0,0 +1,44 @@
package main
// entryHeap implements heap.Interface for a min-heap of dirEntry (sorted by Size)
// Since we want the top N largest items, we use a min-heap of size N.
// When adding a new item:
// 1. If heap size < N: push
// 2. If heap size == N and item > min (root): pop min, push item
// The heap will thus maintain the largest N items.
type entryHeap []dirEntry
func (h entryHeap) Len() int { return len(h) }
func (h entryHeap) Less(i, j int) bool { return h[i].Size < h[j].Size } // Min-heap based on Size
func (h entryHeap) Swap(i, j int) { h[i], h[j] = h[j], h[i] }
func (h *entryHeap) Push(x interface{}) {
*h = append(*h, x.(dirEntry))
}
func (h *entryHeap) Pop() interface{} {
old := *h
n := len(old)
x := old[n-1]
*h = old[0 : n-1]
return x
}
// largeFileHeap implements heap.Interface for fileEntry
type largeFileHeap []fileEntry
func (h largeFileHeap) Len() int { return len(h) }
func (h largeFileHeap) Less(i, j int) bool { return h[i].Size < h[j].Size }
func (h largeFileHeap) Swap(i, j int) { h[i], h[j] = h[j], h[i] }
func (h *largeFileHeap) Push(x interface{}) {
*h = append(*h, x.(fileEntry))
}
func (h *largeFileHeap) Pop() interface{} {
old := *h
n := len(old)
x := old[n-1]
*h = old[0 : n-1]
return x
}
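To make the Top-N behavior concrete, here is a self-contained sketch of how these types are meant to be driven with container/heap; the dirEntry shape is reduced to the two fields needed for illustration:

package main

import (
	"container/heap"
	"fmt"
)

type dirEntry struct {
	Name string
	Size int64
}

// entryHeap as defined above: a min-heap keyed on Size.
type entryHeap []dirEntry

func (h entryHeap) Len() int            { return len(h) }
func (h entryHeap) Less(i, j int) bool  { return h[i].Size < h[j].Size }
func (h entryHeap) Swap(i, j int)       { h[i], h[j] = h[j], h[i] }
func (h *entryHeap) Push(x interface{}) { *h = append(*h, x.(dirEntry)) }
func (h *entryHeap) Pop() interface{} {
	old := *h
	x := old[len(old)-1]
	*h = old[:len(old)-1]
	return x
}

func main() {
	const n = 3
	h := &entryHeap{}
	heap.Init(h)
	for _, e := range []dirEntry{{"a", 5}, {"b", 9}, {"c", 1}, {"d", 7}, {"e", 3}} {
		if h.Len() < n {
			heap.Push(h, e)
		} else if e.Size > (*h)[0].Size { // beats the current minimum at the root
			heap.Pop(h)
			heap.Push(h, e)
		}
	}
	// Drain pops ascending; filling from the back yields descending order.
	out := make([]dirEntry, h.Len())
	for i := len(out) - 1; i >= 0; i-- {
		out[i] = heap.Pop(h).(dirEntry)
	}
	fmt.Println(out) // [{b 9} {d 7} {a 5}]
}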


@@ -2,6 +2,7 @@ package main
import (
"bytes"
"container/heap"
"context"
"fmt"
"io/fs"
@@ -29,8 +30,14 @@ func scanPathConcurrent(root string, filesScanned, dirsScanned, bytesScanned *in
}
var total int64
entries := make([]dirEntry, 0, len(children))
largeFiles := make([]fileEntry, 0, maxLargeFiles*2)
// Use heaps to track Top N items, drastically reducing memory usage
// for directories with millions of files
entriesHeap := &entryHeap{}
heap.Init(entriesHeap)
largeFilesHeap := &largeFileHeap{}
heap.Init(largeFilesHeap)
// Use worker pool for concurrent directory scanning
// For I/O-bound operations, use more workers than CPU count
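A runnable sketch of the worker-pool shape this comment describes, with illustrative channel and job names rather than the actual ones:

package main

import (
	"fmt"
	"runtime"
	"sync"
)

func main() {
	// I/O-bound work benefits from more workers than cores; the diff
	// uses a similar multiplier for concurrent directory scanning.
	workers := runtime.NumCPU() * 4
	jobs := make(chan string, 8)
	var wg sync.WaitGroup
	for i := 0; i < workers; i++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			for path := range jobs {
				fmt.Println("scan:", path) // stand-in for per-directory I/O
			}
		}()
	}
	for _, p := range []string{"/tmp", "/var", "/usr"} {
		jobs <- p
	}
	close(jobs)
	wg.Wait()
}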
@@ -54,19 +61,31 @@ func scanPathConcurrent(root string, filesScanned, dirsScanned, bytesScanned *in
entryChan := make(chan dirEntry, len(children))
largeFileChan := make(chan fileEntry, maxLargeFiles*2)
// Start goroutines to collect from channels
// Start goroutines to collect from channels into heaps
var collectorWg sync.WaitGroup
collectorWg.Add(2)
go func() {
defer collectorWg.Done()
for entry := range entryChan {
entries = append(entries, entry)
// Maintain Top N Heap for entries
if entriesHeap.Len() < maxEntries {
heap.Push(entriesHeap, entry)
} else if entry.Size > (*entriesHeap)[0].Size {
heap.Pop(entriesHeap)
heap.Push(entriesHeap, entry)
}
}
}()
go func() {
defer collectorWg.Done()
for file := range largeFileChan {
largeFiles = append(largeFiles, file)
// Maintain Top N Heap for large files
if largeFilesHeap.Len() < maxLargeFiles {
heap.Push(largeFilesHeap, file)
} else if file.Size > (*largeFilesHeap)[0].Size {
heap.Pop(largeFilesHeap)
heap.Push(largeFilesHeap, file)
}
}
}()
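One subtlety in this design is shutdown order: each channel has a single collector goroutine, so the heaps need no mutex, but a channel may only be closed once every producer is done, and the heaps may only be read after collectorWg.Wait() returns. A minimal sketch of that sequencing, with made-up names:

package main

import (
	"fmt"
	"sync"
)

func main() {
	results := make(chan int, 16)
	var producers, collector sync.WaitGroup

	collector.Add(1)
	go func() { // single collector: no lock needed around sum
		defer collector.Done()
		sum := 0
		for v := range results {
			sum += v
		}
		fmt.Println("total:", sum)
	}()

	for i := 1; i <= 4; i++ {
		producers.Add(1)
		go func(v int) {
			defer producers.Done()
			results <- v
		}(i)
	}

	producers.Wait() // all sends are done...
	close(results)   // ...so closing is safe
	collector.Wait() // collector has drained everything
}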
@@ -113,7 +132,7 @@ func scanPathConcurrent(root string, filesScanned, dirsScanned, bytesScanned *in
// Try du command first for folded dirs (much faster)
size, err := getDirectorySizeFromDu(path)
if err != nil || size <= 0 {
// Fallback to walk if du fails
// Fallback to concurrent walk if du fails
size = calculateDirSizeFast(path, filesScanned, dirsScanned, bytesScanned, currentPath)
}
atomic.AddInt64(&total, size)
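For reference, a hedged sketch of what a du-based probe like getDirectorySizeFromDu can look like; the actual implementation may differ, but `du -sk` itself reports sizes in 1 KiB blocks:

package main

import (
	"fmt"
	"os/exec"
	"strconv"
	"strings"
)

// duSize is an illustrative stand-in for getDirectorySizeFromDu:
// shell out to `du -sk` and convert the KiB count to bytes.
func duSize(path string) (int64, error) {
	out, err := exec.Command("du", "-sk", path).Output()
	if err != nil {
		return 0, err
	}
	fields := strings.Fields(string(out))
	if len(fields) < 1 {
		return 0, fmt.Errorf("unexpected du output: %q", out)
	}
	kb, err := strconv.ParseInt(fields[0], 10, 64)
	if err != nil {
		return 0, err
	}
	return kb * 1024, nil
}

func main() {
	if size, err := duSize("/tmp"); err == nil {
		fmt.Println(size, "bytes")
	}
}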
@@ -182,11 +201,15 @@ func scanPathConcurrent(root string, filesScanned, dirsScanned, bytesScanned *in
close(largeFileChan)
collectorWg.Wait()
sort.Slice(entries, func(i, j int) bool {
return entries[i].Size > entries[j].Size
})
if len(entries) > maxEntries {
entries = entries[:maxEntries]
// Convert heaps to sorted slices (descending order)
entries := make([]dirEntry, entriesHeap.Len())
for i := len(entries) - 1; i >= 0; i-- {
entries[i] = heap.Pop(entriesHeap).(dirEntry)
}
largeFiles := make([]fileEntry, largeFilesHeap.Len())
for i := len(largeFiles) - 1; i >= 0; i-- {
largeFiles[i] = heap.Pop(largeFilesHeap).(fileEntry)
}
// Try to use Spotlight (mdfind) for faster large file discovery
@@ -194,18 +217,15 @@ func scanPathConcurrent(root string, filesScanned, dirsScanned, bytesScanned *in
// if Spotlight is unavailable or fails. The fallback is intentionally silent
// because users only care about correct results, not the method used.
if spotlightFiles := findLargeFilesWithSpotlight(root, minLargeFileSize); len(spotlightFiles) > 0 {
// Spotlight results are already sorted and limited to the top N
// Use them in place of scanned large files
largeFiles = spotlightFiles
} else {
// Use files collected during scanning (fallback path)
// Sort and trim large files collected from scanning
sort.Slice(largeFiles, func(i, j int) bool {
return largeFiles[i].Size > largeFiles[j].Size
})
if len(largeFiles) > maxLargeFiles {
largeFiles = largeFiles[:maxLargeFiles]
}
}
// Both paths end up in descending order: the heap drain above pops ascending
// while filling the slice from the back, and Spotlight already returns its
// results sorted descending, so no extra sort is needed for either.
return scanResult{
Entries: entries,
LargeFiles: largeFiles,
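A hedged sketch of the Spotlight path: mdfind does accept kMDItemFSSize predicates, though the real findLargeFilesWithSpotlight presumably also stats the results and sorts them by size descending, which this sketch omits:

package main

import (
	"bufio"
	"bytes"
	"fmt"
	"os/exec"
)

// Illustrative only: list paths whose size exceeds minSize bytes.
func spotlightLargeFiles(root string, minSize int64) ([]string, error) {
	query := fmt.Sprintf("kMDItemFSSize > %d", minSize)
	out, err := exec.Command("mdfind", "-onlyin", root, query).Output()
	if err != nil {
		return nil, err // e.g. Spotlight disabled: caller falls back to scan results
	}
	var paths []string
	sc := bufio.NewScanner(bytes.NewReader(out))
	for sc.Scan() {
		if line := sc.Text(); line != "" {
			paths = append(paths, line)
		}
	}
	return paths, sc.Err()
}

func main() {
	paths, err := spotlightLargeFiles("/Users", 1<<30) // files over 1 GiB
	if err == nil {
		fmt.Println(len(paths), "large files")
	}
}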
@@ -242,70 +262,77 @@ func shouldSkipFileForLargeTracking(path string) bool {
return skipExtensions[ext]
}
// calculateDirSizeFast performs fast directory size calculation without detailed tracking or large file detection.
// Updates progress counters in batches to reduce atomic operation overhead.
// calculateDirSizeFast performs concurrent directory size calculation using os.ReadDir
// This is a faster fallback than filepath.WalkDir when du fails
func calculateDirSizeFast(root string, filesScanned, dirsScanned, bytesScanned *int64, currentPath *string) int64 {
var total int64
var localFiles, localDirs int64
var batchBytes int64
var wg sync.WaitGroup
// Create context with timeout
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
defer cancel()
walkFunc := func(path string, d fs.DirEntry, err error) error {
// Check for timeout
// Limit total concurrency for this walk
concurrency := runtime.NumCPU() * 4
if concurrency > 64 {
concurrency = 64
}
sem := make(chan struct{}, concurrency)
var walk func(string)
walk = func(dirPath string) {
select {
case <-ctx.Done():
return ctx.Err()
return
default:
}
if err != nil {
return nil
}
if d.IsDir() {
localDirs++
// Batch update every N dirs to reduce atomic operations
if localDirs%batchUpdateSize == 0 {
atomic.AddInt64(dirsScanned, batchUpdateSize)
localDirs = 0
}
return nil
}
info, err := d.Info()
if err != nil {
return nil
}
// Get actual disk usage for sparse files and cloud files
size := getActualFileSize(path, info)
total += size
batchBytes += size
localFiles++
if currentPath != nil {
*currentPath = path
*currentPath = dirPath
}
// Batch update every N files to reduce atomic operations
if localFiles%batchUpdateSize == 0 {
atomic.AddInt64(filesScanned, batchUpdateSize)
atomic.AddInt64(bytesScanned, batchBytes)
localFiles = 0
batchBytes = 0
entries, err := os.ReadDir(dirPath)
if err != nil {
return
}
var localBytes, localFiles int64
for _, entry := range entries {
if entry.IsDir() {
// Directories: recurse concurrently
wg.Add(1)
// Capture loop variable
subDir := filepath.Join(dirPath, entry.Name())
go func(p string) {
defer wg.Done()
sem <- struct{}{} // Acquire token
defer func() { <-sem }() // Release token
walk(p)
}(subDir)
atomic.AddInt64(dirsScanned, 1)
} else {
// Files: process immediately
info, err := entry.Info()
if err == nil {
size := getActualFileSize(filepath.Join(dirPath, entry.Name()), info)
localBytes += size
localFiles++
}
}
}
if localBytes > 0 {
atomic.AddInt64(&total, localBytes)
atomic.AddInt64(bytesScanned, localBytes)
}
if localFiles > 0 {
atomic.AddInt64(filesScanned, localFiles)
}
return nil
}
_ = filepath.WalkDir(root, walkFunc)
// Final update for remaining counts
if localFiles > 0 {
atomic.AddInt64(filesScanned, localFiles)
}
if localDirs > 0 {
atomic.AddInt64(dirsScanned, localDirs)
}
if batchBytes > 0 {
atomic.AddInt64(bytesScanned, batchBytes)
}
walk(root)
wg.Wait()
return total
}
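Because the rendered diff interleaves the deleted filepath.WalkDir body with the new code, the new function is hard to read in one pass. Below is a consolidated, hedged reconstruction of the concurrent os.ReadDir walk this hunk introduces; the repo's getActualFileSize is replaced by info.Size() to keep the sketch self-contained:

package main

import (
	"context"
	"fmt"
	"os"
	"path/filepath"
	"runtime"
	"sync"
	"sync/atomic"
	"time"
)

// Reconstructed sketch of the new calculateDirSizeFast: a bounded-fanout
// recursive walk over os.ReadDir. Progress counters are flushed once per
// directory rather than per file, keeping atomic traffic low.
func dirSizeConcurrent(root string, filesScanned, dirsScanned, bytesScanned *int64) int64 {
	var total int64
	var wg sync.WaitGroup

	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
	defer cancel()

	// Cap in-flight goroutines: I/O-bound, so go beyond NumCPU, but not unbounded.
	concurrency := runtime.NumCPU() * 4
	if concurrency > 64 {
		concurrency = 64
	}
	sem := make(chan struct{}, concurrency)

	var walk func(string)
	walk = func(dirPath string) {
		select {
		case <-ctx.Done():
			return
		default:
		}
		entries, err := os.ReadDir(dirPath)
		if err != nil {
			return // unreadable directories are simply skipped
		}
		var localBytes, localFiles int64
		for _, entry := range entries {
			if entry.IsDir() {
				wg.Add(1)
				atomic.AddInt64(dirsScanned, 1)
				go func(p string) {
					defer wg.Done()
					sem <- struct{}{}        // acquire token
					defer func() { <-sem }() // release token
					walk(p)
				}(filepath.Join(dirPath, entry.Name()))
			} else if info, err := entry.Info(); err == nil {
				localBytes += info.Size() // the repo uses getActualFileSize here
				localFiles++
			}
		}
		if localBytes > 0 {
			atomic.AddInt64(&total, localBytes)
			atomic.AddInt64(bytesScanned, localBytes)
		}
		if localFiles > 0 {
			atomic.AddInt64(filesScanned, localFiles)
		}
	}

	walk(root)
	wg.Wait()
	return total
}

func main() {
	var f, d, b int64
	fmt.Println(dirSizeConcurrent(os.TempDir(), &f, &d, &b), "bytes")
}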