Mirror of https://github.com/tw93/Mole.git, synced 2026-02-15 09:20:08 +00:00
feat: Parallelize metric collection, enhance app protection with receipt file scanning, and update cleanup tasks
@@ -169,11 +169,16 @@ var skipSystemDirs = map[string]bool{
 	"bin": true,
 	"etc": true,
 	"var": true,
+	"opt":     false, // User might want to specifically check opt
+	"usr":     false, // User might check usr
+	"Volumes": true,  // Skip external drives by default when scanning root
+	"Network": true,  // Skip network mounts
 	".vol":                    true,
 	".Spotlight-V100":         true,
 	".fseventsd":              true,
 	".DocumentRevisions-V100": true,
 	".TemporaryItems":         true,
+	".MobileBackups":          true, // Time Machine local snapshots
 }

 var skipExtensions = map[string]bool{
cmd/analyze/heap.go (new file, 44 lines)
@@ -0,0 +1,44 @@
+package main
+
+// entryHeap implements heap.Interface for a min-heap of dirEntry (sorted by Size).
+// Since we want the Top N largest, we use a min-heap of size N.
+// When adding a new item:
+// 1. If heap size < N: push
+// 2. If heap size == N and item > min (root): pop min, push item
+// The heap will thus maintain the largest N items.
+type entryHeap []dirEntry
+
+func (h entryHeap) Len() int           { return len(h) }
+func (h entryHeap) Less(i, j int) bool { return h[i].Size < h[j].Size } // Min-heap based on Size
+func (h entryHeap) Swap(i, j int)      { h[i], h[j] = h[j], h[i] }
+
+func (h *entryHeap) Push(x interface{}) {
+	*h = append(*h, x.(dirEntry))
+}
+
+func (h *entryHeap) Pop() interface{} {
+	old := *h
+	n := len(old)
+	x := old[n-1]
+	*h = old[0 : n-1]
+	return x
+}
+
+// largeFileHeap implements heap.Interface for fileEntry
+type largeFileHeap []fileEntry
+
+func (h largeFileHeap) Len() int           { return len(h) }
+func (h largeFileHeap) Less(i, j int) bool { return h[i].Size < h[j].Size }
+func (h largeFileHeap) Swap(i, j int)      { h[i], h[j] = h[j], h[i] }
+
+func (h *largeFileHeap) Push(x interface{}) {
+	*h = append(*h, x.(fileEntry))
+}
+
+func (h *largeFileHeap) Pop() interface{} {
+	old := *h
+	n := len(old)
+	x := old[n-1]
+	*h = old[0 : n-1]
+	return x
+}
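The comments in heap.go describe a standard Top-N policy: keep a min-heap capped at N, and replace the root whenever a larger item arrives. The following is a minimal, runnable sketch of that policy, not the repository's code; it uses a hypothetical item type in place of dirEntry/fileEntry.

package main

import (
	"container/heap"
	"fmt"
)

// item stands in for dirEntry/fileEntry; only Size matters for the heap.
type item struct {
	Name string
	Size int64
}

// minHeap mirrors the entryHeap pattern: a min-heap ordered by Size.
type minHeap []item

func (h minHeap) Len() int            { return len(h) }
func (h minHeap) Less(i, j int) bool  { return h[i].Size < h[j].Size }
func (h minHeap) Swap(i, j int)       { h[i], h[j] = h[j], h[i] }
func (h *minHeap) Push(x interface{}) { *h = append(*h, x.(item)) }
func (h *minHeap) Pop() interface{} {
	old := *h
	n := len(old)
	x := old[n-1]
	*h = old[:n-1]
	return x
}

// topN keeps at most n items: push while below capacity, otherwise replace
// the current minimum (the root) whenever a larger item arrives.
func topN(h *minHeap, n int, it item) {
	if h.Len() < n {
		heap.Push(h, it)
	} else if it.Size > (*h)[0].Size {
		heap.Pop(h)
		heap.Push(h, it)
	}
}

func main() {
	h := &minHeap{}
	heap.Init(h)
	for i, size := range []int64{5, 42, 7, 99, 13, 64, 3} {
		topN(h, 3, item{Name: fmt.Sprintf("entry-%d", i), Size: size})
	}
	// Popping a min-heap yields ascending sizes; filling the slice in reverse
	// gives a descending Top 3, which is what the commit does after collection.
	out := make([]item, h.Len())
	for i := len(out) - 1; i >= 0; i-- {
		out[i] = heap.Pop(h).(item)
	}
	fmt.Println(out) // [{entry-3 99} {entry-5 64} {entry-1 42}]
}

The memory win is the point: instead of buffering every entry and sorting at the end, at most N items are ever held per heap.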
@@ -2,6 +2,7 @@ package main

 import (
 	"bytes"
+	"container/heap"
 	"context"
 	"fmt"
 	"io/fs"
@@ -29,8 +30,14 @@ func scanPathConcurrent(root string, filesScanned, dirsScanned, bytesScanned *in
 	}

 	var total int64
-	entries := make([]dirEntry, 0, len(children))
-	largeFiles := make([]fileEntry, 0, maxLargeFiles*2)
+	// Use heaps to track Top N items, drastically reducing memory usage
+	// for directories with millions of files
+	entriesHeap := &entryHeap{}
+	heap.Init(entriesHeap)
+
+	largeFilesHeap := &largeFileHeap{}
+	heap.Init(largeFilesHeap)

 	// Use worker pool for concurrent directory scanning
 	// For I/O-bound operations, use more workers than CPU count
@@ -54,19 +61,31 @@ func scanPathConcurrent(root string, filesScanned, dirsScanned, bytesScanned *in
 	entryChan := make(chan dirEntry, len(children))
 	largeFileChan := make(chan fileEntry, maxLargeFiles*2)

-	// Start goroutines to collect from channels
+	// Start goroutines to collect from channels into heaps
 	var collectorWg sync.WaitGroup
 	collectorWg.Add(2)
 	go func() {
 		defer collectorWg.Done()
 		for entry := range entryChan {
-			entries = append(entries, entry)
+			// Maintain Top N heap for entries
+			if entriesHeap.Len() < maxEntries {
+				heap.Push(entriesHeap, entry)
+			} else if entry.Size > (*entriesHeap)[0].Size {
+				heap.Pop(entriesHeap)
+				heap.Push(entriesHeap, entry)
+			}
 		}
 	}()
 	go func() {
 		defer collectorWg.Done()
 		for file := range largeFileChan {
-			largeFiles = append(largeFiles, file)
+			// Maintain Top N heap for large files
+			if largeFilesHeap.Len() < maxLargeFiles {
+				heap.Push(largeFilesHeap, file)
+			} else if file.Size > (*largeFilesHeap)[0].Size {
+				heap.Pop(largeFilesHeap)
+				heap.Push(largeFilesHeap, file)
+			}
 		}
 	}()

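This hunk relies on a collect-then-close discipline: worker goroutines send results on a channel, a single collector goroutine folds them into the heap (so no lock is needed), and closing the channel plus waiting on the collector guarantees the fold is complete before results are read. A stripped-down sketch of that pattern, with illustrative names rather than the repository's, could look like this:

package main

import (
	"fmt"
	"sync"
)

func main() {
	results := make(chan int64, 16)

	// Single collector goroutine owns `total`, so no mutex is required.
	var collectorWg sync.WaitGroup
	collectorWg.Add(1)
	var total int64
	go func() {
		defer collectorWg.Done()
		for size := range results {
			total += size
		}
	}()

	// Producers send concurrently.
	var workerWg sync.WaitGroup
	for _, size := range []int64{10, 20, 30, 40} {
		workerWg.Add(1)
		go func(s int64) {
			defer workerWg.Done()
			results <- s
		}(size)
	}

	workerWg.Wait()    // all producers finished
	close(results)     // tell the collector no more values are coming
	collectorWg.Wait() // the fold is complete; reading total is now safe
	fmt.Println(total) // 100
}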
@@ -113,7 +132,7 @@ func scanPathConcurrent(root string, filesScanned, dirsScanned, bytesScanned *in
 			// Try du command first for folded dirs (much faster)
 			size, err := getDirectorySizeFromDu(path)
 			if err != nil || size <= 0 {
-				// Fallback to walk if du fails
+				// Fallback to concurrent walk if du fails
 				size = calculateDirSizeFast(path, filesScanned, dirsScanned, bytesScanned, currentPath)
 			}
 			atomic.AddInt64(&total, size)
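getDirectorySizeFromDu is not part of this diff, so the following is only an assumption of how a du-based lookup might work: du -sk reports a directory total in 1 KiB blocks and is usually much faster than a userspace walk because the traversal happens inside du itself.

package main

import (
	"fmt"
	"os/exec"
	"strconv"
	"strings"
)

// dirSizeFromDu is a hypothetical helper, not the repository's implementation.
// It shells out to `du -sk <path>` and converts the first field (KiB) to bytes.
func dirSizeFromDu(path string) (int64, error) {
	out, err := exec.Command("du", "-sk", path).Output()
	if err != nil {
		return 0, err
	}
	fields := strings.Fields(string(out))
	if len(fields) == 0 {
		return 0, fmt.Errorf("unexpected du output: %q", out)
	}
	kb, err := strconv.ParseInt(fields[0], 10, 64)
	if err != nil {
		return 0, err
	}
	return kb * 1024, nil
}

func main() {
	size, err := dirSizeFromDu("/tmp")
	if err != nil || size <= 0 {
		// Mirrors the diff's behavior: on failure, fall back to the walk.
		fmt.Println("du failed, caller would fall back to a concurrent walk")
		return
	}
	fmt.Println("size:", size, "bytes")
}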
@@ -182,11 +201,15 @@ func scanPathConcurrent(root string, filesScanned, dirsScanned, bytesScanned *in
 	close(largeFileChan)
 	collectorWg.Wait()

-	sort.Slice(entries, func(i, j int) bool {
-		return entries[i].Size > entries[j].Size
-	})
-	if len(entries) > maxEntries {
-		entries = entries[:maxEntries]
-	}
+	// Convert heaps to sorted slices (descending order)
+	entries := make([]dirEntry, entriesHeap.Len())
+	for i := len(entries) - 1; i >= 0; i-- {
+		entries[i] = heap.Pop(entriesHeap).(dirEntry)
+	}
+
+	largeFiles := make([]fileEntry, largeFilesHeap.Len())
+	for i := len(largeFiles) - 1; i >= 0; i-- {
+		largeFiles[i] = heap.Pop(largeFilesHeap).(fileEntry)
+	}

 	// Try to use Spotlight (mdfind) for faster large file discovery
@@ -194,18 +217,15 @@ func scanPathConcurrent(root string, filesScanned, dirsScanned, bytesScanned *in
 	// if Spotlight is unavailable or fails. The fallback is intentionally silent
 	// because users only care about correct results, not the method used.
 	if spotlightFiles := findLargeFilesWithSpotlight(root, minLargeFileSize); len(spotlightFiles) > 0 {
+		// Spotlight results are already a sorted top N;
+		// use them in place of the scanned large files.
 		largeFiles = spotlightFiles
-	} else {
-		// Use files collected during scanning (fallback path)
-		// Sort and trim large files collected from scanning
-		sort.Slice(largeFiles, func(i, j int) bool {
-			return largeFiles[i].Size > largeFiles[j].Size
-		})
-		if len(largeFiles) > maxLargeFiles {
-			largeFiles = largeFiles[:maxLargeFiles]
-		}
 	}

+	// Sorting consistency: Spotlight returns results in descending order, and the
+	// heap results were popped in ascending order and filled in reverse, so
+	// largeFiles is already descending on either path; no extra sort is needed.
+
 	return scanResult{
 		Entries:    entries,
 		LargeFiles: largeFiles,
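findLargeFilesWithSpotlight is also outside this diff. Purely as an illustration, a Spotlight query for large files can be issued through mdfind with the -onlyin flag and the kMDItemFSSize attribute; the repository's implementation may differ (for instance, it apparently returns a size-sorted top N, which this sketch does not do).

package main

import (
	"bufio"
	"fmt"
	"os/exec"
	"strconv"
	"strings"
)

// spotlightLargeFiles is a hypothetical sketch: ask Spotlight for paths under
// root whose on-disk size exceeds minSize. mdfind prints one path per line.
func spotlightLargeFiles(root string, minSize int64) ([]string, error) {
	query := fmt.Sprintf("kMDItemFSSize > %s", strconv.FormatInt(minSize, 10))
	out, err := exec.Command("mdfind", "-onlyin", root, query).Output()
	if err != nil {
		return nil, err // callers fall back to scan results, silently
	}
	var paths []string
	scanner := bufio.NewScanner(strings.NewReader(string(out)))
	for scanner.Scan() {
		if line := strings.TrimSpace(scanner.Text()); line != "" {
			paths = append(paths, line)
		}
	}
	return paths, nil
}

func main() {
	paths, err := spotlightLargeFiles("/Users", 1<<30) // files over 1 GiB
	if err != nil {
		fmt.Println("mdfind unavailable, falling back:", err)
		return
	}
	fmt.Println(len(paths), "large files found")
}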
@@ -242,70 +262,77 @@ func shouldSkipFileForLargeTracking(path string) bool {
 	return skipExtensions[ext]
 }

-// calculateDirSizeFast performs fast directory size calculation without detailed tracking or large file detection.
-// Updates progress counters in batches to reduce atomic operation overhead.
+// calculateDirSizeFast performs concurrent directory size calculation using os.ReadDir.
+// This is a faster fallback than filepath.WalkDir when du fails.
 func calculateDirSizeFast(root string, filesScanned, dirsScanned, bytesScanned *int64, currentPath *string) int64 {
 	var total int64
-	var localFiles, localDirs int64
-	var batchBytes int64
+	var wg sync.WaitGroup

 	// Create context with timeout
 	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
 	defer cancel()

-	walkFunc := func(path string, d fs.DirEntry, err error) error {
-		// Check for timeout
+	// Limit total concurrency for this walk
+	concurrency := runtime.NumCPU() * 4
+	if concurrency > 64 {
+		concurrency = 64
+	}
+	sem := make(chan struct{}, concurrency)
+
+	var walk func(string)
+	walk = func(dirPath string) {
 		select {
 		case <-ctx.Done():
-			return ctx.Err()
+			return
 		default:
 		}
-		if err != nil {
-			return nil
-		}
-		if d.IsDir() {
-			localDirs++
-			// Batch update every N dirs to reduce atomic operations
-			if localDirs%batchUpdateSize == 0 {
-				atomic.AddInt64(dirsScanned, batchUpdateSize)
-				localDirs = 0
-			}
-			return nil
-		}
-		info, err := d.Info()
-		if err != nil {
-			return nil
-		}
-		// Get actual disk usage for sparse files and cloud files
-		size := getActualFileSize(path, info)
-		total += size
-		batchBytes += size
-		localFiles++
 		if currentPath != nil {
-			*currentPath = path
+			*currentPath = dirPath
 		}
-		// Batch update every N files to reduce atomic operations
-		if localFiles%batchUpdateSize == 0 {
-			atomic.AddInt64(filesScanned, batchUpdateSize)
-			atomic.AddInt64(bytesScanned, batchBytes)
-			localFiles = 0
-			batchBytes = 0
+
+		entries, err := os.ReadDir(dirPath)
+		if err != nil {
+			return
+		}
+
+		var localBytes, localFiles int64
+
+		for _, entry := range entries {
+			if entry.IsDir() {
+				// Directories: recurse concurrently
+				wg.Add(1)
+				// Capture loop variable
+				subDir := filepath.Join(dirPath, entry.Name())
+				go func(p string) {
+					defer wg.Done()
+					sem <- struct{}{}        // Acquire token
+					defer func() { <-sem }() // Release token
+					walk(p)
+				}(subDir)
+				atomic.AddInt64(dirsScanned, 1)
+			} else {
+				// Files: process immediately
+				info, err := entry.Info()
+				if err == nil {
+					size := getActualFileSize(filepath.Join(dirPath, entry.Name()), info)
+					localBytes += size
+					localFiles++
+				}
+			}
+		}
+
+		if localBytes > 0 {
+			atomic.AddInt64(&total, localBytes)
+			atomic.AddInt64(bytesScanned, localBytes)
+		}
+		if localFiles > 0 {
+			atomic.AddInt64(filesScanned, localFiles)
 		}
-		return nil
 	}

-	_ = filepath.WalkDir(root, walkFunc)
-	// Final update for remaining counts
-	if localFiles > 0 {
-		atomic.AddInt64(filesScanned, localFiles)
-	}
-	if localDirs > 0 {
-		atomic.AddInt64(dirsScanned, localDirs)
-	}
-	if batchBytes > 0 {
-		atomic.AddInt64(bytesScanned, batchBytes)
-	}
+	walk(root)
+	wg.Wait()

 	return total
 }
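For reference, the new walk strategy can be distilled into a self-contained sketch: read each directory with os.ReadDir, sum files locally, recurse into subdirectories on goroutines, and cap the fan-out with a buffered-channel semaphore. The progress counters, timeout handling, and getActualFileSize from the diff are omitted; info.Size() stands in for actual disk usage.

package main

import (
	"fmt"
	"os"
	"path/filepath"
	"runtime"
	"sync"
	"sync/atomic"
)

// concurrentDirSize is a simplified sketch of the pattern above, not the
// repository's function. Unreadable directories are silently skipped.
func concurrentDirSize(root string) int64 {
	var total int64
	var wg sync.WaitGroup

	// Bound the number of directories being read at once.
	concurrency := runtime.NumCPU() * 4
	if concurrency > 64 {
		concurrency = 64
	}
	sem := make(chan struct{}, concurrency)

	var walk func(string)
	walk = func(dir string) {
		entries, err := os.ReadDir(dir)
		if err != nil {
			return
		}
		var localBytes int64
		for _, entry := range entries {
			if entry.IsDir() {
				// wg.Add happens before the goroutine starts, so Wait is safe.
				wg.Add(1)
				sub := filepath.Join(dir, entry.Name())
				go func(p string) {
					defer wg.Done()
					sem <- struct{}{}        // acquire a slot
					defer func() { <-sem }() // release it
					walk(p)
				}(sub)
			} else if info, err := entry.Info(); err == nil {
				localBytes += info.Size() // logical size; the diff uses getActualFileSize
			}
		}
		if localBytes > 0 {
			atomic.AddInt64(&total, localBytes)
		}
	}

	walk(root)
	wg.Wait()
	return total
}

func main() {
	fmt.Println(concurrentDirSize(os.TempDir()), "bytes")
}

Note the design choice the diff makes here: a goroutine holding a semaphore slot never waits on its children, it only spawns them and returns, so the bounded semaphore cannot deadlock, and per-directory byte counts are folded into the shared totals with a single atomic add rather than one per file.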