Mirror of https://github.com/tw93/Mole.git, synced 2026-02-04 15:04:42 +00:00
Refactor Go analysis capabilities
634  cmd/analyze/scanner.go  Normal file
@@ -0,0 +1,634 @@
package main

import (
    "bytes"
    "context"
    "fmt"
    "io/fs"
    "os"
    "os/exec"
    "path/filepath"
    "runtime"
    "sort"
    "strconv"
    "strings"
    "sync"
    "sync/atomic"
    "syscall"
    "time"

    "golang.org/x/sync/singleflight"
)

var scanGroup singleflight.Group
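
// scanPathConcurrent scans the immediate children of root with a bounded worker
// pool, returning size-sorted entries, the largest files found, and the total
// size in bytes. Progress counters are updated atomically so a caller can poll
// them while the scan runs.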
func scanPathConcurrent(root string, filesScanned, dirsScanned, bytesScanned *int64, currentPath *string) (scanResult, error) {
    children, err := os.ReadDir(root)
    if err != nil {
        return scanResult{}, err
    }

    var total int64
    entries := make([]dirEntry, 0, len(children))
    largeFiles := make([]fileEntry, 0, maxLargeFiles*2)

    // Use worker pool for concurrent directory scanning
    // For I/O-bound operations, use more workers than CPU count
    maxWorkers := runtime.NumCPU() * 4
    if maxWorkers < 16 {
        maxWorkers = 16 // Minimum 16 workers for better I/O throughput
    }
    // Cap at 128 to avoid excessive goroutines
    if maxWorkers > 128 {
        maxWorkers = 128
    }
    if maxWorkers > len(children) {
        maxWorkers = len(children)
    }
    if maxWorkers < 1 {
        maxWorkers = 1
    }
    sem := make(chan struct{}, maxWorkers)
    var wg sync.WaitGroup

    // Use channels to collect results without lock contention
    entryChan := make(chan dirEntry, len(children))
    largeFileChan := make(chan fileEntry, maxLargeFiles*2)

    // Start goroutines to collect from channels
    var collectorWg sync.WaitGroup
    collectorWg.Add(2)
    go func() {
        defer collectorWg.Done()
        for entry := range entryChan {
            entries = append(entries, entry)
        }
    }()
    go func() {
        defer collectorWg.Done()
        for file := range largeFileChan {
            largeFiles = append(largeFiles, file)
        }
    }()

    isRootDir := root == "/"

    for _, child := range children {
        fullPath := filepath.Join(root, child.Name())

        if child.IsDir() {
            // In root directory, skip system directories completely
            if isRootDir && skipSystemDirs[child.Name()] {
                continue
            }

            // For folded directories, calculate size quickly without expanding
            if shouldFoldDirWithPath(child.Name(), fullPath) {
                wg.Add(1)
                go func(name, path string) {
                    defer wg.Done()
                    sem <- struct{}{}
                    defer func() { <-sem }()

                    // Try du command first for folded dirs (much faster)
                    size := calculateDirSizeWithDu(path)
                    if size <= 0 {
                        // Fallback to walk if du fails
                        size = calculateDirSizeFast(path, filesScanned, dirsScanned, bytesScanned, currentPath)
                    }
                    atomic.AddInt64(&total, size)
                    atomic.AddInt64(dirsScanned, 1)

                    entryChan <- dirEntry{
                        name:       name,
                        path:       path,
                        size:       size,
                        isDir:      true,
                        lastAccess: time.Time{}, // Lazy load when displayed
                    }
                }(child.Name(), fullPath)
                continue
            }

            // Normal directory: full scan with detail
            wg.Add(1)
            go func(name, path string) {
                defer wg.Done()
                sem <- struct{}{}
                defer func() { <-sem }()

                size := calculateDirSizeConcurrent(path, largeFileChan, filesScanned, dirsScanned, bytesScanned, currentPath)
                atomic.AddInt64(&total, size)
                atomic.AddInt64(dirsScanned, 1)

                entryChan <- dirEntry{
                    name:       name,
                    path:       path,
                    size:       size,
                    isDir:      true,
                    lastAccess: time.Time{}, // Lazy load when displayed
                }
            }(child.Name(), fullPath)
            continue
        }

        info, err := child.Info()
        if err != nil {
            continue
        }
        // Get actual disk usage for sparse files and cloud files
        size := getActualFileSize(fullPath, info)
        atomic.AddInt64(&total, size)
        atomic.AddInt64(filesScanned, 1)
        atomic.AddInt64(bytesScanned, size)

        entryChan <- dirEntry{
            name:       child.Name(),
            path:       fullPath,
            size:       size,
            isDir:      false,
            lastAccess: getLastAccessTimeFromInfo(info),
        }
        // Only track large files that are not code/text files
        if !shouldSkipFileForLargeTracking(fullPath) && size >= minLargeFileSize {
            largeFileChan <- fileEntry{name: child.Name(), path: fullPath, size: size}
        }
    }

    wg.Wait()

    // Close channels and wait for collectors to finish
    close(entryChan)
    close(largeFileChan)
    collectorWg.Wait()

    sort.Slice(entries, func(i, j int) bool {
        return entries[i].size > entries[j].size
    })
    if len(entries) > maxEntries {
        entries = entries[:maxEntries]
    }

    // Try to use Spotlight for faster large file discovery
    if spotlightFiles := findLargeFilesWithSpotlight(root, minLargeFileSize); len(spotlightFiles) > 0 {
        largeFiles = spotlightFiles
    } else {
        // Sort and trim large files collected from scanning
        sort.Slice(largeFiles, func(i, j int) bool {
            return largeFiles[i].size > largeFiles[j].size
        })
        if len(largeFiles) > maxLargeFiles {
            largeFiles = largeFiles[:maxLargeFiles]
        }
    }

    return scanResult{
        entries:    entries,
        largeFiles: largeFiles,
        totalSize:  total,
    }, nil
}
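
// shouldFoldDir reports whether a directory name is in the static fold list.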
func shouldFoldDir(name string) bool {
    return foldDirs[name]
}

// shouldFoldDirWithPath checks if a directory should be folded based on path context
func shouldFoldDirWithPath(name, path string) bool {
    // Check basic fold list first
    if foldDirs[name] {
        return true
    }

    // Special case: npm cache directories - fold all subdirectories
    // This includes: .npm/_quick/*, .npm/_cacache/*, .npm/a-z/*, .tnpm/*
    if strings.Contains(path, "/.npm/") || strings.Contains(path, "/.tnpm/") {
        // Get the parent directory name
        parent := filepath.Base(filepath.Dir(path))
        // If parent is a cache folder (_quick, _cacache, etc) or npm dir itself, fold it
        if parent == ".npm" || parent == ".tnpm" || strings.HasPrefix(parent, "_") {
            return true
        }
        // Also fold single-letter subdirectories (npm cache structure like .npm/a/, .npm/b/)
        if len(name) == 1 {
            return true
        }
    }

    return false
}

// calculateDirSizeWithDu uses du command for fast directory size calculation
// Returns size in bytes, or 0 if command fails
func calculateDirSizeWithDu(path string) int64 {
    ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second)
    defer cancel()

    // Use -sk for 1K-block output, then convert to bytes
    // macOS du doesn't support -b flag
    cmd := exec.CommandContext(ctx, "du", "-sk", path)
    output, err := cmd.Output()
    if err != nil {
        return 0
    }

    fields := strings.Fields(string(output))
    if len(fields) < 1 {
        return 0
    }

    kb, err := strconv.ParseInt(fields[0], 10, 64)
    if err != nil {
        return 0
    }

    return kb * 1024
}
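
// shouldSkipFileForLargeTracking reports whether a file should be excluded from
// large-file tracking based on its extension (code/text extensions are skipped).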
func shouldSkipFileForLargeTracking(path string) bool {
    ext := strings.ToLower(filepath.Ext(path))
    return skipExtensions[ext]
}

// calculateDirSizeFast performs fast directory size calculation without detailed tracking or large file detection.
// Updates progress counters in batches to reduce atomic operation overhead.
func calculateDirSizeFast(root string, filesScanned, dirsScanned, bytesScanned *int64, currentPath *string) int64 {
    var total int64
    var localFiles, localDirs int64
    var batchBytes int64

    // Create context with timeout
    ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
    defer cancel()

    walkFunc := func(path string, d fs.DirEntry, err error) error {
        // Check for timeout
        select {
        case <-ctx.Done():
            return ctx.Err()
        default:
        }
        if err != nil {
            return nil
        }
        if d.IsDir() {
            localDirs++
            // Batch update every N dirs to reduce atomic operations
            if localDirs%batchUpdateSize == 0 {
                atomic.AddInt64(dirsScanned, batchUpdateSize)
                localDirs = 0
            }
            return nil
        }
        info, err := d.Info()
        if err != nil {
            return nil
        }
        // Get actual disk usage for sparse files and cloud files
        size := getActualFileSize(path, info)
        total += size
        batchBytes += size
        localFiles++
        if currentPath != nil {
            *currentPath = path
        }
        // Batch update every N files to reduce atomic operations
        if localFiles%batchUpdateSize == 0 {
            atomic.AddInt64(filesScanned, batchUpdateSize)
            atomic.AddInt64(bytesScanned, batchBytes)
            localFiles = 0
            batchBytes = 0
        }
        return nil
    }

    _ = filepath.WalkDir(root, walkFunc)

    // Final update for remaining counts
    if localFiles > 0 {
        atomic.AddInt64(filesScanned, localFiles)
    }
    if localDirs > 0 {
        atomic.AddInt64(dirsScanned, localDirs)
    }
    if batchBytes > 0 {
        atomic.AddInt64(bytesScanned, batchBytes)
    }

    return total
}

// Use Spotlight (mdfind) to quickly find large files in a directory
func findLargeFilesWithSpotlight(root string, minSize int64) []fileEntry {
    // mdfind query: files >= minSize in the specified directory
    query := fmt.Sprintf("kMDItemFSSize >= %d", minSize)

    cmd := exec.Command("mdfind", "-onlyin", root, query)
    output, err := cmd.Output()
    if err != nil {
        // Fallback: mdfind not available or failed
        return nil
    }

    lines := strings.Split(strings.TrimSpace(string(output)), "\n")
    var files []fileEntry

    for _, line := range lines {
        if line == "" {
            continue
        }

        // Filter out code files first (cheapest check, no I/O)
        if shouldSkipFileForLargeTracking(line) {
            continue
        }

        // Filter out files in folded directories (cheap string check)
        if isInFoldedDir(line) {
            continue
        }

        // Use Lstat instead of Stat (faster, doesn't follow symlinks)
        info, err := os.Lstat(line)
        if err != nil {
            continue
        }

        // Skip if it's a directory or symlink
        if info.IsDir() || info.Mode()&os.ModeSymlink != 0 {
            continue
        }

        // Get actual disk usage for sparse files and cloud files
        actualSize := getActualFileSize(line, info)
        files = append(files, fileEntry{
            name: filepath.Base(line),
            path: line,
            size: actualSize,
        })
    }

    // Sort by size (descending)
    sort.Slice(files, func(i, j int) bool {
        return files[i].size > files[j].size
    })

    // Return top N
    if len(files) > maxLargeFiles {
        files = files[:maxLargeFiles]
    }

    return files
}

// isInFoldedDir checks if a path is inside a folded directory (optimized)
func isInFoldedDir(path string) bool {
    // Split path into components for faster checking
    parts := strings.Split(path, string(os.PathSeparator))
    for _, part := range parts {
        if foldDirs[part] {
            return true
        }
    }
    return false
}
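
// calculateDirSizeConcurrent recursively computes the total size of root,
// scanning subdirectories in parallel, sizing folded directories via du, and
// streaming any large files it encounters to largeFileChan.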
func calculateDirSizeConcurrent(root string, largeFileChan chan<- fileEntry, filesScanned, dirsScanned, bytesScanned *int64, currentPath *string) int64 {
    // Read immediate children
    children, err := os.ReadDir(root)
    if err != nil {
        return 0
    }

    var total int64
    var wg sync.WaitGroup

    // Limit concurrent subdirectory scans to avoid too many goroutines
    maxConcurrent := runtime.NumCPU() * 2
    if maxConcurrent > 32 {
        maxConcurrent = 32
    }
    sem := make(chan struct{}, maxConcurrent)

    for _, child := range children {
        fullPath := filepath.Join(root, child.Name())

        if child.IsDir() {
            // Check if this is a folded directory
            if shouldFoldDirWithPath(child.Name(), fullPath) {
                // Use du for folded directories (much faster)
                wg.Add(1)
                go func(path string) {
                    defer wg.Done()
                    size := calculateDirSizeWithDu(path)
                    if size > 0 {
                        atomic.AddInt64(&total, size)
                        atomic.AddInt64(bytesScanned, size)
                        atomic.AddInt64(dirsScanned, 1)
                    }
                }(fullPath)
                continue
            }

            // Recursively scan subdirectory in parallel
            wg.Add(1)
            go func(path string) {
                defer wg.Done()
                sem <- struct{}{}
                defer func() { <-sem }()

                size := calculateDirSizeConcurrent(path, largeFileChan, filesScanned, dirsScanned, bytesScanned, currentPath)
                atomic.AddInt64(&total, size)
                atomic.AddInt64(dirsScanned, 1)
            }(fullPath)
            continue
        }

        // Handle files; add atomically because subdirectory goroutines also update total
        info, err := child.Info()
        if err != nil {
            continue
        }

        size := getActualFileSize(fullPath, info)
        atomic.AddInt64(&total, size)
        atomic.AddInt64(filesScanned, 1)
        atomic.AddInt64(bytesScanned, size)

        // Track large files
        if !shouldSkipFileForLargeTracking(fullPath) && size >= minLargeFileSize {
            largeFileChan <- fileEntry{name: child.Name(), path: fullPath, size: size}
        }

        // Update current path
        if currentPath != nil {
            *currentPath = fullPath
        }
    }

    wg.Wait()
    return total
}

// measureOverviewSize calculates the size of a directory using multiple strategies.
func measureOverviewSize(path string) (int64, error) {
    if path == "" {
        return 0, fmt.Errorf("empty path")
    }

    path = filepath.Clean(path)
    if !filepath.IsAbs(path) {
        return 0, fmt.Errorf("path must be absolute: %s", path)
    }

    if _, err := os.Stat(path); err != nil {
        return 0, fmt.Errorf("cannot access path: %v", err)
    }

    if cached, err := loadStoredOverviewSize(path); err == nil && cached > 0 {
        return cached, nil
    }

    if duSize, err := getDirectorySizeFromDu(path); err == nil && duSize > 0 {
        _ = storeOverviewSize(path, duSize)
        return duSize, nil
    }

    if logicalSize, err := getDirectoryLogicalSize(path); err == nil && logicalSize > 0 {
        _ = storeOverviewSize(path, logicalSize)
        return logicalSize, nil
    }

    if cached, err := loadCacheFromDisk(path); err == nil {
        _ = storeOverviewSize(path, cached.TotalSize)
        return cached.TotalSize, nil
    }

    return 0, fmt.Errorf("unable to measure directory size with fast methods")
}
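
// getDirectorySizeFromMetadata queries Spotlight metadata (mdls kMDItemFSSize)
// for a directory's size, with a timeout.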
func getDirectorySizeFromMetadata(path string) (int64, error) {
    info, err := os.Stat(path)
    if err != nil {
        return 0, fmt.Errorf("cannot stat path: %v", err)
    }
    if !info.IsDir() {
        return 0, fmt.Errorf("not a directory")
    }

    ctx, cancel := context.WithTimeout(context.Background(), mdlsTimeout)
    defer cancel()

    cmd := exec.CommandContext(ctx, "mdls", "-raw", "-name", "kMDItemFSSize", path)
    var stdout, stderr bytes.Buffer
    cmd.Stdout = &stdout
    cmd.Stderr = &stderr

    if err := cmd.Run(); err != nil {
        if ctx.Err() == context.DeadlineExceeded {
            return 0, fmt.Errorf("mdls timeout after %v", mdlsTimeout)
        }
        if stderr.Len() > 0 {
            return 0, fmt.Errorf("mdls failed: %v (%s)", err, stderr.String())
        }
        return 0, fmt.Errorf("mdls failed: %v", err)
    }
    value := strings.TrimSpace(stdout.String())
    if value == "" || value == "(null)" {
        return 0, fmt.Errorf("metadata size unavailable")
    }
    size, err := strconv.ParseInt(value, 10, 64)
    if err != nil {
        return 0, fmt.Errorf("failed to parse mdls output: %v", err)
    }
    if size <= 0 {
        return 0, fmt.Errorf("mdls size invalid: %d", size)
    }
    return size, nil
}
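
// getDirectorySizeFromDu shells out to du -sk with a timeout and converts the
// 1K-block result to bytes.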
func getDirectorySizeFromDu(path string) (int64, error) {
    ctx, cancel := context.WithTimeout(context.Background(), duTimeout)
    defer cancel()

    cmd := exec.CommandContext(ctx, "du", "-sk", path)
    var stdout, stderr bytes.Buffer
    cmd.Stdout = &stdout
    cmd.Stderr = &stderr

    if err := cmd.Run(); err != nil {
        if ctx.Err() == context.DeadlineExceeded {
            return 0, fmt.Errorf("du timeout after %v", duTimeout)
        }
        if stderr.Len() > 0 {
            return 0, fmt.Errorf("du failed: %v (%s)", err, stderr.String())
        }
        return 0, fmt.Errorf("du failed: %v", err)
    }
    fields := strings.Fields(stdout.String())
    if len(fields) == 0 {
        return 0, fmt.Errorf("du output empty")
    }
    kb, err := strconv.ParseInt(fields[0], 10, 64)
    if err != nil {
        return 0, fmt.Errorf("failed to parse du output: %v", err)
    }
    if kb <= 0 {
        return 0, fmt.Errorf("du size invalid: %d", kb)
    }
    return kb * 1024, nil
}
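
// getDirectoryLogicalSize walks the tree and sums per-file disk usage,
// skipping entries that cannot be read.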
func getDirectoryLogicalSize(path string) (int64, error) {
    var total int64
    err := filepath.WalkDir(path, func(p string, d fs.DirEntry, err error) error {
        if err != nil {
            if os.IsPermission(err) {
                return filepath.SkipDir
            }
            return nil
        }
        if d.IsDir() {
            return nil
        }
        info, err := d.Info()
        if err != nil {
            return nil
        }
        total += getActualFileSize(p, info)
        return nil
    })
    if err != nil && err != filepath.SkipDir {
        return 0, err
    }
    return total, nil
}
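
// getActualFileSize returns the on-disk usage (allocated 512-byte blocks) when
// it is smaller than the logical size, so sparse files and cloud placeholders
// are not overcounted; otherwise it falls back to the logical size.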
func getActualFileSize(_ string, info fs.FileInfo) int64 {
    stat, ok := info.Sys().(*syscall.Stat_t)
    if !ok {
        return info.Size()
    }

    actualSize := stat.Blocks * 512
    if actualSize < info.Size() {
        return actualSize
    }
    return info.Size()
}
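
// getLastAccessTime stats a path and returns its last access time, or the zero
// time if the path cannot be stat'ed.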
func getLastAccessTime(path string) time.Time {
    info, err := os.Stat(path)
    if err != nil {
        return time.Time{}
    }
    return getLastAccessTimeFromInfo(info)
}
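
// getLastAccessTimeFromInfo extracts the last access time from an fs.FileInfo
// backed by syscall.Stat_t (Darwin exposes Atimespec); returns the zero time otherwise.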
func getLastAccessTimeFromInfo(info fs.FileInfo) time.Time {
    stat, ok := info.Sys().(*syscall.Stat_t)
    if !ok {
        return time.Time{}
    }
    return time.Unix(stat.Atimespec.Sec, stat.Atimespec.Nsec)
}