Cap buffer at 4096. All entries still process normally—producers just briefly block if the buffer fills, which is negligible since the collector drains it quickly.
package main

import (
	"bytes"
	"container/heap"
	"context"
	"fmt"
	"io/fs"
	"os"
	"os/exec"
	"path/filepath"
	"runtime"
	"sort"
	"strconv"
	"strings"
	"sync"
	"sync/atomic"
	"syscall"
	"time"

	"golang.org/x/sync/singleflight"
)

var scanGroup singleflight.Group

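// scanPathConcurrent lists the immediate children of root, sizes each child
// concurrently with a bounded worker pool, and returns the top entries by size
// along with the largest individual files found during the scan.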
func scanPathConcurrent(root string, filesScanned, dirsScanned, bytesScanned *int64, currentPath *string) (scanResult, error) {
	children, err := os.ReadDir(root)
	if err != nil {
		return scanResult{}, err
	}

	var total int64

	// Keep Top N heaps.
	entriesHeap := &entryHeap{}
	heap.Init(entriesHeap)

	largeFilesHeap := &largeFileHeap{}
	heap.Init(largeFilesHeap)

	// Worker pool sized for I/O-bound scanning.
	numWorkers := max(runtime.NumCPU()*cpuMultiplier, minWorkers)
	if numWorkers > maxWorkers {
		numWorkers = maxWorkers
	}
	if numWorkers > len(children) {
		numWorkers = len(children)
	}
	if numWorkers < 1 {
		numWorkers = 1
	}
	sem := make(chan struct{}, numWorkers)
	var wg sync.WaitGroup

	// Collect results via channels.
	// Cap buffer size to prevent memory spikes with huge directories.
	entryBufSize := len(children)
	if entryBufSize > 4096 {
		entryBufSize = 4096
	}
	if entryBufSize < 1 {
		entryBufSize = 1
	}
	entryChan := make(chan dirEntry, entryBufSize)
	largeFileChan := make(chan fileEntry, maxLargeFiles*2)

	var collectorWg sync.WaitGroup
	collectorWg.Add(2)
	go func() {
		defer collectorWg.Done()
		for entry := range entryChan {
			if entriesHeap.Len() < maxEntries {
				heap.Push(entriesHeap, entry)
			} else if entry.Size > (*entriesHeap)[0].Size {
				heap.Pop(entriesHeap)
				heap.Push(entriesHeap, entry)
			}
		}
	}()
	go func() {
		defer collectorWg.Done()
		for file := range largeFileChan {
			if largeFilesHeap.Len() < maxLargeFiles {
				heap.Push(largeFilesHeap, file)
			} else if file.Size > (*largeFilesHeap)[0].Size {
				heap.Pop(largeFilesHeap)
				heap.Push(largeFilesHeap, file)
			}
		}
	}()

	isRootDir := root == "/"
	home := os.Getenv("HOME")
	isHomeDir := home != "" && root == home

	for _, child := range children {
		fullPath := filepath.Join(root, child.Name())

		// Skip symlinks to avoid following unexpected targets.
		if child.Type()&fs.ModeSymlink != 0 {
			targetInfo, err := os.Stat(fullPath)
			isDir := false
			if err == nil && targetInfo.IsDir() {
				isDir = true
			}

			// Count link size only to avoid double-counting targets.
			info, err := child.Info()
			if err != nil {
				continue
			}
			size := getActualFileSize(fullPath, info)
			atomic.AddInt64(&total, size)

			entryChan <- dirEntry{
				Name:       child.Name() + " →",
				Path:       fullPath,
				Size:       size,
				IsDir:      isDir,
				LastAccess: getLastAccessTimeFromInfo(info),
			}
			continue
		}

		if child.IsDir() {
			if defaultSkipDirs[child.Name()] {
				continue
			}

			// Skip system dirs at root.
			if isRootDir && skipSystemDirs[child.Name()] {
				continue
			}

			// ~/Library is scanned separately; reuse cache when possible.
			if isHomeDir && child.Name() == "Library" {
				wg.Add(1)
				go func(name, path string) {
					defer wg.Done()
					sem <- struct{}{}
					defer func() { <-sem }()

					var size int64
					if cached, err := loadStoredOverviewSize(path); err == nil && cached > 0 {
						size = cached
					} else if cached, err := loadCacheFromDisk(path); err == nil {
						size = cached.TotalSize
					} else {
						size = calculateDirSizeConcurrent(path, largeFileChan, filesScanned, dirsScanned, bytesScanned, currentPath)
					}
					atomic.AddInt64(&total, size)
					atomic.AddInt64(dirsScanned, 1)

					entryChan <- dirEntry{
						Name:       name,
						Path:       path,
						Size:       size,
						IsDir:      true,
						LastAccess: time.Time{},
					}
				}(child.Name(), fullPath)
				continue
			}

			// Folded dirs: fast size without expanding.
			if shouldFoldDirWithPath(child.Name(), fullPath) {
				wg.Add(1)
				go func(name, path string) {
					defer wg.Done()
					sem <- struct{}{}
					defer func() { <-sem }()

					size, err := getDirectorySizeFromDu(path)
					if err != nil || size <= 0 {
						size = calculateDirSizeFast(path, filesScanned, dirsScanned, bytesScanned, currentPath)
					}
					atomic.AddInt64(&total, size)
					atomic.AddInt64(dirsScanned, 1)

					entryChan <- dirEntry{
						Name:       name,
						Path:       path,
						Size:       size,
						IsDir:      true,
						LastAccess: time.Time{},
					}
				}(child.Name(), fullPath)
				continue
			}

			wg.Add(1)
			go func(name, path string) {
				defer wg.Done()
				sem <- struct{}{}
				defer func() { <-sem }()

				size := calculateDirSizeConcurrent(path, largeFileChan, filesScanned, dirsScanned, bytesScanned, currentPath)
				atomic.AddInt64(&total, size)
				atomic.AddInt64(dirsScanned, 1)

				entryChan <- dirEntry{
					Name:       name,
					Path:       path,
					Size:       size,
					IsDir:      true,
					LastAccess: time.Time{},
				}
			}(child.Name(), fullPath)
			continue
		}

		info, err := child.Info()
		if err != nil {
			continue
		}
		// Actual disk usage for sparse/cloud files.
		size := getActualFileSize(fullPath, info)
		atomic.AddInt64(&total, size)
		atomic.AddInt64(filesScanned, 1)
		atomic.AddInt64(bytesScanned, size)

		entryChan <- dirEntry{
			Name:       child.Name(),
			Path:       fullPath,
			Size:       size,
			IsDir:      false,
			LastAccess: getLastAccessTimeFromInfo(info),
		}
		// Track large files only.
		if !shouldSkipFileForLargeTracking(fullPath) && size >= minLargeFileSize {
			largeFileChan <- fileEntry{Name: child.Name(), Path: fullPath, Size: size}
		}
	}

	wg.Wait()

	// Close channels and wait for collectors.
	close(entryChan)
	close(largeFileChan)
	collectorWg.Wait()

	// Convert heaps to sorted slices (descending).
	entries := make([]dirEntry, entriesHeap.Len())
	for i := len(entries) - 1; i >= 0; i-- {
		entries[i] = heap.Pop(entriesHeap).(dirEntry)
	}

	largeFiles := make([]fileEntry, largeFilesHeap.Len())
	for i := len(largeFiles) - 1; i >= 0; i-- {
		largeFiles[i] = heap.Pop(largeFilesHeap).(fileEntry)
	}

	// Use Spotlight for large files when available.
	if spotlightFiles := findLargeFilesWithSpotlight(root, minLargeFileSize); len(spotlightFiles) > 0 {
		largeFiles = spotlightFiles
	}

	return scanResult{
		Entries:    entries,
		LargeFiles: largeFiles,
		TotalSize:  total,
		TotalFiles: atomic.LoadInt64(filesScanned),
	}, nil
}

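// shouldFoldDirWithPath reports whether a directory should be folded, i.e.
// sized as a single entry instead of being expanded, based on its name and path.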
func shouldFoldDirWithPath(name, path string) bool {
	if foldDirs[name] {
		return true
	}

	// Handle npm cache structure.
	if strings.Contains(path, "/.npm/") || strings.Contains(path, "/.tnpm/") {
		parent := filepath.Base(filepath.Dir(path))
		if parent == ".npm" || parent == ".tnpm" || strings.HasPrefix(parent, "_") {
			return true
		}
		if len(name) == 1 {
			return true
		}
	}

	return false
}

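// shouldSkipFileForLargeTracking reports whether a file's extension excludes it
// from large-file tracking.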
func shouldSkipFileForLargeTracking(path string) bool {
	ext := strings.ToLower(filepath.Ext(path))
	return skipExtensions[ext]
}

// calculateDirSizeFast performs concurrent dir sizing using os.ReadDir.
func calculateDirSizeFast(root string, filesScanned, dirsScanned, bytesScanned *int64, currentPath *string) int64 {
	var total int64
	var wg sync.WaitGroup

	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
	defer cancel()

	concurrency := min(runtime.NumCPU()*4, 64)
	sem := make(chan struct{}, concurrency)

	var walk func(string)
	walk = func(dirPath string) {
		select {
		case <-ctx.Done():
			return
		default:
		}

		if currentPath != nil && atomic.LoadInt64(filesScanned)%int64(batchUpdateSize) == 0 {
			*currentPath = dirPath
		}

		entries, err := os.ReadDir(dirPath)
		if err != nil {
			return
		}

		var localBytes, localFiles int64

		for _, entry := range entries {
			if entry.IsDir() {
				wg.Add(1)
				subDir := filepath.Join(dirPath, entry.Name())
				go func(p string) {
					defer wg.Done()
					sem <- struct{}{}
					defer func() { <-sem }()
					walk(p)
				}(subDir)
				atomic.AddInt64(dirsScanned, 1)
			} else {
				info, err := entry.Info()
				if err == nil {
					size := getActualFileSize(filepath.Join(dirPath, entry.Name()), info)
					localBytes += size
					localFiles++
				}
			}
		}

		if localBytes > 0 {
			atomic.AddInt64(&total, localBytes)
			atomic.AddInt64(bytesScanned, localBytes)
		}
		if localFiles > 0 {
			atomic.AddInt64(filesScanned, localFiles)
		}
	}

	walk(root)
	wg.Wait()

	return total
}

// findLargeFilesWithSpotlight uses Spotlight (mdfind) to quickly find large files.
func findLargeFilesWithSpotlight(root string, minSize int64) []fileEntry {
	query := fmt.Sprintf("kMDItemFSSize >= %d", minSize)

	ctx, cancel := context.WithTimeout(context.Background(), mdlsTimeout)
	defer cancel()

	cmd := exec.CommandContext(ctx, "mdfind", "-onlyin", root, query)
	output, err := cmd.Output()
	if err != nil {
		return nil
	}

	var files []fileEntry

	for line := range strings.Lines(strings.TrimSpace(string(output))) {
		// strings.Lines keeps the trailing newline; strip it before using the path.
		line = strings.TrimSpace(line)
		if line == "" {
			continue
		}

		// Filter code files first (cheap).
		if shouldSkipFileForLargeTracking(line) {
			continue
		}

		// Filter folded directories (cheap string check).
		if isInFoldedDir(line) {
			continue
		}

		info, err := os.Lstat(line)
		if err != nil {
			continue
		}

		if info.IsDir() || info.Mode()&os.ModeSymlink != 0 {
			continue
		}

		// Actual disk usage for sparse/cloud files.
		actualSize := getActualFileSize(line, info)
		files = append(files, fileEntry{
			Name: filepath.Base(line),
			Path: line,
			Size: actualSize,
		})
	}

	// Sort by size (descending).
	sort.Slice(files, func(i, j int) bool {
		return files[i].Size > files[j].Size
	})

	if len(files) > maxLargeFiles {
		files = files[:maxLargeFiles]
	}

	return files
}

// isInFoldedDir checks if a path is inside a folded directory.
func isInFoldedDir(path string) bool {
	parts := strings.SplitSeq(path, string(os.PathSeparator))
	for part := range parts {
		if foldDirs[part] {
			return true
		}
	}
	return false
}

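// calculateDirSizeConcurrent recursively sizes a directory tree, scanning
// subdirectories in parallel and reporting qualifying large files on largeFileChan.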
func calculateDirSizeConcurrent(root string, largeFileChan chan<- fileEntry, filesScanned, dirsScanned, bytesScanned *int64, currentPath *string) int64 {
	children, err := os.ReadDir(root)
	if err != nil {
		return 0
	}

	var total int64
	var wg sync.WaitGroup

	// Limit concurrent subdirectory scans.
	maxConcurrent := min(runtime.NumCPU()*2, maxDirWorkers)
	sem := make(chan struct{}, maxConcurrent)

	for _, child := range children {
		fullPath := filepath.Join(root, child.Name())

		if child.Type()&fs.ModeSymlink != 0 {
			info, err := child.Info()
			if err != nil {
				continue
			}
			size := getActualFileSize(fullPath, info)
			// Use atomic adds: subdirectory goroutines update total concurrently.
			atomic.AddInt64(&total, size)
			atomic.AddInt64(filesScanned, 1)
			atomic.AddInt64(bytesScanned, size)
			continue
		}

		if child.IsDir() {
			if shouldFoldDirWithPath(child.Name(), fullPath) {
				wg.Add(1)
				go func(path string) {
					defer wg.Done()
					size, err := getDirectorySizeFromDu(path)
					if err == nil && size > 0 {
						atomic.AddInt64(&total, size)
						atomic.AddInt64(bytesScanned, size)
						atomic.AddInt64(dirsScanned, 1)
					}
				}(fullPath)
				continue
			}

			wg.Add(1)
			go func(path string) {
				defer wg.Done()
				sem <- struct{}{}
				defer func() { <-sem }()

				size := calculateDirSizeConcurrent(path, largeFileChan, filesScanned, dirsScanned, bytesScanned, currentPath)
				atomic.AddInt64(&total, size)
				atomic.AddInt64(dirsScanned, 1)
			}(fullPath)
			continue
		}

		info, err := child.Info()
		if err != nil {
			continue
		}

		size := getActualFileSize(fullPath, info)
		atomic.AddInt64(&total, size)
		atomic.AddInt64(filesScanned, 1)
		atomic.AddInt64(bytesScanned, size)

		if !shouldSkipFileForLargeTracking(fullPath) && size >= minLargeFileSize {
			largeFileChan <- fileEntry{Name: child.Name(), Path: fullPath, Size: size}
		}

		// Update current path occasionally to prevent UI jitter.
		if currentPath != nil && atomic.LoadInt64(filesScanned)%int64(batchUpdateSize) == 0 {
			*currentPath = fullPath
		}
	}

	wg.Wait()
	return total
}

// measureOverviewSize calculates the size of a directory using multiple strategies.
// When scanning Home, it excludes ~/Library to avoid duplicate counting.
func measureOverviewSize(path string) (int64, error) {
	if path == "" {
		return 0, fmt.Errorf("empty path")
	}

	path = filepath.Clean(path)
	if !filepath.IsAbs(path) {
		return 0, fmt.Errorf("path must be absolute: %s", path)
	}

	if _, err := os.Stat(path); err != nil {
		return 0, fmt.Errorf("cannot access path: %v", err)
	}

	// Determine if we should exclude ~/Library (when scanning Home).
	home := os.Getenv("HOME")
	excludePath := ""
	if home != "" && path == home {
		excludePath = filepath.Join(home, "Library")
	}

	if cached, err := loadStoredOverviewSize(path); err == nil && cached > 0 {
		return cached, nil
	}

	if duSize, err := getDirectorySizeFromDuWithExclude(path, excludePath); err == nil && duSize > 0 {
		_ = storeOverviewSize(path, duSize)
		return duSize, nil
	}

	if logicalSize, err := getDirectoryLogicalSizeWithExclude(path, excludePath); err == nil && logicalSize > 0 {
		_ = storeOverviewSize(path, logicalSize)
		return logicalSize, nil
	}

	if cached, err := loadCacheFromDisk(path); err == nil {
		_ = storeOverviewSize(path, cached.TotalSize)
		return cached.TotalSize, nil
	}

	return 0, fmt.Errorf("unable to measure directory size with fast methods")
}

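// getDirectorySizeFromDu returns a directory's disk usage as reported by du.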
func getDirectorySizeFromDu(path string) (int64, error) {
	return getDirectorySizeFromDuWithExclude(path, "")
}

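// getDirectorySizeFromDuWithExclude measures disk usage with du, optionally
// subtracting the usage of a single excluded subdirectory.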
func getDirectorySizeFromDuWithExclude(path string, excludePath string) (int64, error) {
	runDuSize := func(target string) (int64, error) {
		if _, err := os.Stat(target); err != nil {
			return 0, err
		}

		ctx, cancel := context.WithTimeout(context.Background(), duTimeout)
		defer cancel()

		cmd := exec.CommandContext(ctx, "du", "-sk", target)
		var stdout, stderr bytes.Buffer
		cmd.Stdout = &stdout
		cmd.Stderr = &stderr

		if err := cmd.Run(); err != nil {
			if ctx.Err() == context.DeadlineExceeded {
				return 0, fmt.Errorf("du timeout after %v", duTimeout)
			}
			if stderr.Len() > 0 {
				return 0, fmt.Errorf("du failed: %v (%s)", err, stderr.String())
			}
			return 0, fmt.Errorf("du failed: %v", err)
		}
		fields := strings.Fields(stdout.String())
		if len(fields) == 0 {
			return 0, fmt.Errorf("du output empty")
		}
		kb, err := strconv.ParseInt(fields[0], 10, 64)
		if err != nil {
			return 0, fmt.Errorf("failed to parse du output: %v", err)
		}
		if kb <= 0 {
			return 0, fmt.Errorf("du size invalid: %d", kb)
		}
		return kb * 1024, nil
	}

	// When excluding a path (e.g., ~/Library), subtract only that exact directory
	// instead of ignoring every "Library".
	if excludePath != "" {
		totalSize, err := runDuSize(path)
		if err != nil {
			return 0, err
		}
		excludeSize, err := runDuSize(excludePath)
		if err != nil {
			if !os.IsNotExist(err) {
				return 0, err
			}
			excludeSize = 0
		}
		if excludeSize > totalSize {
			excludeSize = 0
		}
		return totalSize - excludeSize, nil
	}

	return runDuSize(path)
}

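// getDirectoryLogicalSizeWithExclude walks the tree and sums file sizes,
// skipping the excluded path; it serves as the fallback when du fails.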
func getDirectoryLogicalSizeWithExclude(path string, excludePath string) (int64, error) {
	var total int64
	err := filepath.WalkDir(path, func(p string, d fs.DirEntry, err error) error {
		if err != nil {
			if os.IsPermission(err) {
				return filepath.SkipDir
			}
			return nil
		}
		// Skip excluded path.
		if excludePath != "" && p == excludePath {
			return filepath.SkipDir
		}
		if d.IsDir() {
			return nil
		}
		info, err := d.Info()
		if err != nil {
			return nil
		}
		total += getActualFileSize(p, info)
		return nil
	})
	if err != nil && err != filepath.SkipDir {
		return 0, err
	}
	return total, nil
}

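// getActualFileSize returns a file's on-disk usage (allocated blocks) when that
// is smaller than its logical size, which matters for sparse and cloud-backed files.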
func getActualFileSize(_ string, info fs.FileInfo) int64 {
	stat, ok := info.Sys().(*syscall.Stat_t)
	if !ok {
		return info.Size()
	}

	actualSize := stat.Blocks * 512
	if actualSize < info.Size() {
		return actualSize
	}
	return info.Size()
}

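// getLastAccessTime returns the last access time of the file at path, or the
// zero time if it cannot be determined.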
func getLastAccessTime(path string) time.Time {
	info, err := os.Stat(path)
	if err != nil {
		return time.Time{}
	}
	return getLastAccessTimeFromInfo(info)
}

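// getLastAccessTimeFromInfo extracts the last access time from an fs.FileInfo
// backed by a syscall.Stat_t (macOS), or returns the zero time.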
func getLastAccessTimeFromInfo(info fs.FileInfo) time.Time {
	stat, ok := info.Sys().(*syscall.Stat_t)
	if !ok {
		return time.Time{}
	}
	return time.Unix(stat.Atimespec.Sec, stat.Atimespec.Nsec)
}