Ensure errors are handled or logged
if err != nil {
1// SPDX-License-Identifier: MIT23package processor45import (6 "context"7 "errors"8 "fmt"9 "io"10 "strings"11 "time"1213 "github.com/go-git/go-git/v5"14 "github.com/go-git/go-git/v5/plumbing"15 "github.com/go-git/go-git/v5/plumbing/filemode"16 fdiff "github.com/go-git/go-git/v5/plumbing/format/diff"17 "github.com/go-git/go-git/v5/plumbing/object"18 "github.com/go-git/go-git/v5/utils/merkletrie"19)2021// HistoryDepth is the maximum number of commits the history engine walks. 022// means "entire history". Wired to --depth in main.go.23var HistoryDepth = 10002425// LineRange is a half-open line span [Start, Start+Count) in 1-based line26// numbers. A FileChange carries one entry per contiguous run of added (or27// removed) lines emitted by go-git's diff.28type LineRange struct {29 Start int30 Count int31}3233// CommitInfo is the per-commit metadata handed to observers.34type CommitInfo struct {35 Hash plumbing.Hash36 Author string37 Email string38 When time.Time39}4041// FileChange is one changed file inside a commit. AddedRanges/RemovedRanges42// describe the diff against the first parent; LineTypes and Complexity are43// scc's classifier output for the new blob (one LineType per line, one entry44// in Complexity per line that fired a complexity tick).45type FileChange struct {46 Path string47 FromPath string // != Path on a detected rename; "" on a pure add48 Language string49 AddedRanges []LineRange50 RemovedRanges []LineRange51 LineTypes []LineType52 RemovedLineTypes []LineType // old-blob line types, for code-filtered removals53 Complexity []int54 NewBlob []byte55}5657// CommitObserver is implemented by each report's accumulator. The engine58// invokes Observe once per commit oldest-first, then Finalise once with the59// window metadata and a snapshot of the HEAD tree (latest language /60// complexity per surviving file).61type CommitObserver interface {62 Observe(c CommitInfo, changes []FileChange)63 Finalise(window HistoryWindow, head HeadSnapshot)64}6566// HistoryWindow describes the commit window the engine walked.67type HistoryWindow struct {68 Depth int69 Commits int70 From time.Time71 To time.Time72 Head plumbing.Hash73}7475// HeadFile is one file in the HEAD tree, classified by scc's engine.76type HeadFile struct {77 Path string78 Language string79 Complexity int6480}8182// HeadSnapshot is the set of files in HEAD, keyed by path.83type HeadSnapshot struct {84 Files map[string]HeadFile85}8687// BaselineFile is one file from the window's start-commit tree, classified by88// scc's engine. Carries per-line type and complexity placement so observers89// can attribute lines that survive untouched from before the window.90type BaselineFile struct {91 Path string92 Language string93 LineTypes []LineType94 Complexity []int // 1-based line numbers that fired a complexity tick95}9697// BaselineSnapshot is the optional pre-walk state handed to observers that98// implement BaselineObserver. Files holds the classified contents of the99// window's start-commit tree (empty when the window covers all history);100// Mailmap is the parsed .mailmap from the HEAD tree, if present.101type BaselineSnapshot struct {102 Files map[string]BaselineFile103 Mailmap *mailmap104}105106// BaselineObserver is an optional extension to CommitObserver. When an107// observer implements it, the engine builds the baseline snapshot before the108// walk and calls Seed once. Observers that don't need the baseline (e.g.109// Hotspots) skip the expense by not implementing the interface.110type BaselineObserver interface {111 Seed(BaselineSnapshot)112}113114// MailmapObserver is an optional extension to CommitObserver. The engine115// always parses the repo's .mailmap from HEAD — one small blob — and hands116// it to observers that implement this, before the walk. Unlike117// BaselineObserver it does NOT trigger the expensive start-tree118// classification, so observers that only need author folding (e.g. Hotspots,119// the author timeline) can implement it cheaply.120type MailmapObserver interface {121 SetMailmap(*mailmap)122}123124// errStopIter is a local sentinel used to terminate iter.ForEach once we've125// collected --depth commits. iter.ForEach surfaces whatever the callback126// returns, so we can compare it back at the call site directly.127var errStopIter = errors.New("history: stop iteration")128129// Bucketing divides [From, To] into N equal time slices. Used by the timeline130// reports (plans 04 and 05) to map per-commit timestamps to a fixed-resolution131// per-bucket series independent of terminal width.132type Bucketing struct {133 From time.Time134 To time.Time135 N int136 Width time.Duration137}138139// NewBucketing constructs a Bucketing covering [from, to] divided into n140// equal-width slices. n must be > 0; n <= 0 is normalised to 1 so callers can141// pass user input unchecked. A degenerate window (from == to or to before142// from) yields Width=0; all commits land in bucket 0 / N-1.143func NewBucketing(from, to time.Time, n int) Bucketing {144 if n <= 0 {145 n = 1146 }147 b := Bucketing{From: from, To: to, N: n}148 if to.After(from) {149 b.Width = to.Sub(from) / time.Duration(n)150 }151 return b152}153154// Index returns the 0..N-1 bucket slot for commit time t. Times before From155// clamp to 0 (defensive — should not happen given the walk window). Times at156// or after To clamp to N-1.157func (b Bucketing) Index(t time.Time) int {158 if b.N <= 0 {159 return 0160 }161 if b.Width <= 0 {162 return 0163 }164 if !t.After(b.From) {165 return 0166 }167 if !t.Before(b.To) {168 return b.N - 1169 }170 idx := int(t.Sub(b.From) / b.Width)171 if idx < 0 {172 return 0173 }174 if idx >= b.N {175 return b.N - 1176 }177 return idx178}179180// Start returns the wall-clock start time of bucket i. Indexes outside181// [0, N) are clamped.182func (b Bucketing) Start(i int) time.Time {183 if b.N <= 0 {184 return b.From185 }186 if i <= 0 {187 return b.From188 }189 if i >= b.N {190 i = b.N - 1191 }192 return b.From.Add(time.Duration(i) * b.Width)193}194195// emptySnapshot is what observers see when HEAD is missing or empty.196func emptySnapshot() HeadSnapshot {197 return HeadSnapshot{Files: map[string]HeadFile{}}198}199200// runHistory opens the repo at repoPath, walks up to HistoryDepth commits201// (newest first → oldest first), and feeds every commit's first-parent diff202// to the observer.203func runHistory(repoPath string, observer CommitObserver) (HistoryWindow, error) {204 repo, err := git.PlainOpenWithOptions(repoPath, &git.PlainOpenOptions{DetectDotGit: true})205 if err != nil {206 return HistoryWindow{}, fmt.Errorf("open git repository: %w", err)207 }208209 head, err := repo.Head()210 if err != nil {211 if errors.Is(err, plumbing.ErrReferenceNotFound) {212 observer.Finalise(HistoryWindow{}, emptySnapshot())213 return HistoryWindow{}, nil214 }215 return HistoryWindow{}, fmt.Errorf("read HEAD: %w", err)216 }217218 iter, err := repo.Log(&git.LogOptions{219 From: head.Hash(),220 Order: git.LogOrderCommitterTime,221 })222 if err != nil {223 return HistoryWindow{}, fmt.Errorf("walk log: %w", err)224 }225226 collected := make([]*object.Commit, 0)227 walkErr := iter.ForEach(func(c *object.Commit) error {228 collected = append(collected, c)229 if HistoryDepth > 0 && len(collected) >= HistoryDepth {230 return errStopIter231 }232 return nil233 })234 if walkErr != nil && !errors.Is(walkErr, errStopIter) {235 return HistoryWindow{}, fmt.Errorf("collect commits: %w", walkErr)236 }237238 if len(collected) == 0 {239 observer.Finalise(HistoryWindow{Head: head.Hash()}, emptySnapshot())240 return HistoryWindow{Head: head.Hash()}, nil241 }242243 window := HistoryWindow{244 Depth: HistoryDepth,245 Commits: len(collected),246 From: collected[len(collected)-1].Author.When,247 To: collected[0].Author.When,248 Head: head.Hash(),249 }250251 ignore, err := buildHistoryIgnore(repo, head.Hash())252 if err != nil {253 printWarnF("history: ignore matcher: %s", err)254 }255256 cache := newBlobClassifyCache()257258 if mo, ok := observer.(MailmapObserver); ok {259 mo.SetMailmap(loadMailmapForHead(collected[0]))260 }261262 if bo, ok := observer.(BaselineObserver); ok {263 baseline := buildBaselineForObserver(collected, ignore, cache)264 bo.Seed(baseline)265 }266267 ctx := context.Background()268 for i := len(collected) - 1; i >= 0; i-- {269 commit := collected[i]270 changes, err := commitChanges(ctx, commit, ignore, cache)271 if err != nil {272 printWarnF("history: diff %s: %s", commit.Hash, err)273 continue274 }275 observer.Observe(CommitInfo{276 Hash: commit.Hash,277 Author: commit.Author.Name,278 Email: commit.Author.Email,279 When: commit.Author.When,280 }, changes)281 }282283 snapshot, err := buildHeadSnapshot(collected[0], ignore, cache)284 if err != nil {285 printWarnF("history: head snapshot: %s", err)286 snapshot = emptySnapshot()287 }288289 observer.Finalise(window, snapshot)290 return window, nil291}292293// loadMailmapForHead parses .mailmap from the HEAD commit's tree. Returns294// nil when there is no .mailmap or the HEAD tree cannot be read. Cheap295// compared to building the full baseline, so observers that only need296// author folding (Hotspots, author timeline) can satisfy MailmapObserver297// without paying for the start-tree classification.298func loadMailmapForHead(headCommit *object.Commit) *mailmap {299 if headCommit == nil {300 return nil301 }302 tree, err := headCommit.Tree()303 if err != nil {304 return nil305 }306 return loadMailmapFromTree(tree)307}308309// buildBaselineForObserver loads the mailmap from HEAD and classifies the310// tree at the window's start commit. The start commit is the first-parent of311// the oldest commit in the window; if that commit has no parents (the window312// covers all history) the baseline files map is empty.313func buildBaselineForObserver(collected []*object.Commit, ignore *historyIgnore, cache *blobClassifyCache) (baseline BaselineSnapshot) {314 baseline = BaselineSnapshot{Files: map[string]BaselineFile{}}315 // Backstop for panics outside the per-file recover below — go-git's316 // tree.Files() iterator can itself panic on a corrupt object, and the317 // per-file handler is not in scope for that. Return whatever was318 // accumulated so far rather than crashing the report.319 defer func() {320 if r := recover(); r != nil {321 printWarnF("history: baseline walk panicked, using partial result: %v", r)322 }323 }()324 if len(collected) == 0 {325 return baseline326 }327328 baseline.Mailmap = loadMailmapForHead(collected[0])329330 oldest := collected[len(collected)-1]331 if oldest.NumParents() == 0 {332 return baseline333 }334 parent, err := oldest.Parent(0)335 if err != nil {336 printWarnF("history: baseline parent: %s", err)337 return baseline338 }339 tree, err := parent.Tree()340 if err != nil {341 printWarnF("history: baseline tree: %s", err)342 return baseline343 }344345 _ = tree.Files().ForEach(func(f *object.File) error {346 defer func() {347 if r := recover(); r != nil {348 name := "<unknown>"349 if f != nil {350 name = f.Name351 }352 printWarnF("history: skipping %s in baseline — panicked: %v", name, r)353 }354 }()355 if f.Mode == filemode.Dir || f.Mode == filemode.Submodule || f.Mode == filemode.Symlink {356 return nil357 }358 if ignore != nil && ignore.Match(f.Name, false) {359 return nil360 }361 reader, err := f.Reader()362 if err != nil {363 return nil364 }365 defer reader.Close()366 blob, err := io.ReadAll(reader)367 if err != nil {368 return nil369 }370 res := cache.classify(f.Hash, f.Name, blob)371 if !res.ok {372 return nil373 }374 baseline.Files[f.Name] = BaselineFile{375 Path: f.Name,376 Language: res.language,377 LineTypes: res.lineTypes,378 Complexity: res.complexLine,379 }380 return nil381 })382383 return baseline384}385386// buildHeadSnapshot walks the HEAD commit's tree and runs scc's classifier387// on each file. Used by hotspots (and future reports) to know each surviving388// file's current language and complexity.389func buildHeadSnapshot(headCommit *object.Commit, ignore *historyIgnore, cache *blobClassifyCache) (snap HeadSnapshot, err error) {390 snap = emptySnapshot()391 // Backstop for panics outside the per-file recover below — go-git's392 // tree.Files() iterator can itself panic on a corrupt object. Return the393 // partial snapshot with no error so the caller keeps what we collected.394 defer func() {395 if r := recover(); r != nil {396 printWarnF("history: HEAD snapshot walk panicked, using partial result: %v", r)397 err = nil398 }399 }()400401 tree, err := headCommit.Tree()402 if err != nil {403 return emptySnapshot(), err404 }405406 snap = HeadSnapshot{Files: map[string]HeadFile{}}407 err = tree.Files().ForEach(func(f *object.File) error {408 defer func() {409 if r := recover(); r != nil {410 name := "<unknown>"411 if f != nil {412 name = f.Name413 }414 printWarnF("history: skipping %s in HEAD snapshot — panicked: %v", name, r)415 }416 }()417 if f.Mode == filemode.Dir || f.Mode == filemode.Submodule || f.Mode == filemode.Symlink {418 return nil419 }420 if ignore != nil && ignore.Match(f.Name, false) {421 return nil422 }423 reader, err := f.Reader()424 if err != nil {425 return nil426 }427 defer reader.Close()428 blob, err := io.ReadAll(reader)429 if err != nil {430 return nil431 }432433 res := cache.classify(f.Hash, f.Name, blob)434 if !res.ok {435 return nil436 }437438 snap.Files[f.Name] = HeadFile{439 Path: f.Name,440 Language: res.language,441 Complexity: res.complexity,442 }443 return nil444 })445 return snap, err446}447448// historyDiffOptions forces rename detection on. The rename-aware reports449// (author rollup, hotspots) depend on renames arriving as a single change450// rather than a delete + add pair, so pin it explicitly — a future go-git451// bump can't silently disable it.452var historyDiffOptions = &object.DiffTreeOptions{DetectRenames: true}453454// commitChanges computes the first-parent diff for commit and projects every455// change into a FileChange. Skips paths that the engine can't count456// (binary blobs, no language detected, submodules, symlinks, ignored paths).457// Deletes are dropped because hotspots-style reports can't render files that458// no longer exist.459//460// The outer recover catches anything the per-call wrappers don't (corrupt461// packfiles via go-git object resolution, future regressions in the diff462// pipeline). One bad commit becomes a warning, not a crash.463func commitChanges(ctx context.Context, commit *object.Commit, ignore *historyIgnore, cache *blobClassifyCache) (out []FileChange, err error) {464 defer func() {465 if r := recover(); r != nil {466 printWarnF("history: skipping commit %s — diff pipeline panicked: %v", commit.Hash, r)467 out = nil468 err = nil469 }470 }()471472 toTree, err := commit.Tree()473 if err != nil {474 return nil, err475 }476477 var fromTree *object.Tree478 if commit.NumParents() > 0 {479 parent, err := commit.Parent(0)480 if err != nil {481 return nil, err482 }483 fromTree, err = parent.Tree()484 if err != nil {485 return nil, err486 }487 }488489 changes, err := object.DiffTreeWithOptions(ctx, fromTree, toTree, historyDiffOptions)490 if err != nil {491 return nil, err492 }493494 out = make([]FileChange, 0, len(changes))495 for _, change := range changes {496 fc, ok := buildFileChange(change, ignore, cache)497 if !ok {498 continue499 }500 out = append(out, fc)501 }502 return out, nil503}504505// buildFileChange converts a single object.Change into a FileChange.506func buildFileChange(change *object.Change, ignore *historyIgnore, cache *blobClassifyCache) (FileChange, bool) {507 action, err := change.Action()508 if err != nil {509 return FileChange{}, false510 }511 if action == merkletrie.Delete {512 return FileChange{}, false513 }514515 path := change.To.Name516 fromPath := change.From.Name517 toEntry := change.To.TreeEntry518 if toEntry.Mode == filemode.Dir || toEntry.Mode == filemode.Submodule || toEntry.Mode == filemode.Symlink {519 return FileChange{}, false520 }521522 if ignore != nil && ignore.Match(path, false) {523 return FileChange{}, false524 }525526 languages, _ := DetectLanguage(path)527 if len(languages) == 0 {528 return FileChange{}, false529 }530531 patch, ok := safePatch(change)532 if !ok {533 return FileChange{}, false534 }535536 var added, removed []LineRange537 for _, fp := range patch.FilePatches() {538 if fp.IsBinary() {539 return FileChange{}, false540 }541 toLine, fromLine := 1, 1542 for _, chunk := range fp.Chunks() {543 lines := lineCount(chunk.Content())544 switch chunk.Type() {545 case fdiff.Equal:546 toLine += lines547 fromLine += lines548 case fdiff.Add:549 if lines > 0 {550 added = append(added, LineRange{Start: toLine, Count: lines})551 }552 toLine += lines553 case fdiff.Delete:554 if lines > 0 {555 removed = append(removed, LineRange{Start: fromLine, Count: lines})556 }557 fromLine += lines558 }559 }560 }561562 blob, err := readBlob(change.To.Tree, &toEntry)563 if err != nil {564 return FileChange{}, false565 }566567 res := cache.classify(toEntry.Hash, path, blob)568 if !res.ok {569 return FileChange{}, false570 }571572 // Classify the parent blob so removed lines can be filtered to code —573 // the timeline reports need a symmetric code-only delta. The blob cache574 // makes this near-free: the old blob is normally an earlier commit's575 // new blob. Skip entirely when there are no removals (pure adds, or576 // when the diff produced no removed ranges).577 var removedLineTypes []LineType578 if len(removed) > 0 && change.From.Name != "" {579 fromEntry := change.From.TreeEntry580 if fromEntry.Mode != filemode.Dir &&581 fromEntry.Mode != filemode.Submodule &&582 fromEntry.Mode != filemode.Symlink {583 if oldBlob, rerr := readBlob(change.From.Tree, &fromEntry); rerr == nil {584 if oldRes := cache.classify(fromEntry.Hash, change.From.Name, oldBlob); oldRes.ok {585 removedLineTypes = oldRes.lineTypes586 }587 }588 }589 }590591 return FileChange{592 Path: path,593 FromPath: fromPath,594 Language: res.language,595 AddedRanges: added,596 RemovedRanges: removed,597 LineTypes: res.lineTypes,598 RemovedLineTypes: removedLineTypes,599 Complexity: res.complexLine,600 NewBlob: blob,601 }, true602}603604// safePatch wraps change.Patch() with panic recovery. The underlying605// sergi/go-diff line-to-rune encoding panics when a file has more distinct606// lines than fit in the Unicode code-point space (generated SQL, huge607// minified bundles, vendored data files). Treat any panic or error as608// "skip this file" so one bad file does not abort the whole report.609func safePatch(change *object.Change) (patch *object.Patch, ok bool) {610 defer func() {611 if r := recover(); r != nil {612 path := ""613 if change != nil {614 path = change.To.Name615 if path == "" {616 path = change.From.Name617 }618 }619 printWarnF("history: skipping %s — diff library panicked: %v", path, r)620 patch = nil621 ok = false622 }623 }()624 p, err := change.Patch()625 if err != nil {626 return nil, false627 }628 return p, true629}630631// classifyFn is the indirect reference to classifyHistoryBlob used by632// safeClassify. Tests substitute a panicking stub to exercise the recover633// path; production behaviour is unchanged.634var classifyFn = classifyHistoryBlob635636// safeClassify wraps the classifier with panic recovery. The history walk637// feeds the classifier many more blob shapes than the working-tree counter638// ever sees (legacy encodings, partial UTF-8, oversized blobs, vendored639// data). A panic in any one path-blob pair must not abort the report —640// skip the file with a warning instead.641func safeClassify(path string, blob []byte) (job *FileJob, lineTypes []LineType, ok bool) {642 defer func() {643 if r := recover(); r != nil {644 printWarnF("history: skipping %s — classifier panicked: %v", path, r)645 job = nil646 lineTypes = nil647 ok = false648 }649 }()650 return classifyFn(path, blob)651}652653// readBlob fetches the raw bytes for a tree entry.654func readBlob(tree *object.Tree, entry *object.TreeEntry) ([]byte, error) {655 file, err := tree.TreeEntryFile(entry)656 if err != nil {657 return nil, err658 }659 reader, err := file.Reader()660 if err != nil {661 return nil, err662 }663 defer reader.Close()664 return io.ReadAll(reader)665}666667// blobClassifyResult is the cached output of classifyHistoryBlob for a single668// blob hash. ok=false means the classifier rejected the blob (binary, no669// language); the vectors are nil in that case.670type blobClassifyResult struct {671 language string672 complexity int64673 lineTypes []LineType674 complexLine []int675 ok bool676}677678// blobClassifyCache memoises classifyHistoryBlob output keyed by blob hash so679// the same blob seen in baseline, commit changes, and HEAD is classified once680// per runHistory. The walk is sequential, so no mutex is required.681type blobClassifyCache struct {682 entries map[plumbing.Hash]blobClassifyResult683}684685func newBlobClassifyCache() *blobClassifyCache {686 return &blobClassifyCache{entries: make(map[plumbing.Hash]blobClassifyResult)}687}688689// classify returns the classifier output for blob, computing and caching it690// on first sight. Slices in the returned result are shared between callers —691// they must be treated as read-only. Negative results (ok=false) are cached692// too so binary/unknown blobs aren't re-attempted.693func (c *blobClassifyCache) classify(hash plumbing.Hash, path string, blob []byte) blobClassifyResult {694 if c != nil {695 if hit, found := c.entries[hash]; found {696 return hit697 }698 }699 job, lineTypes, ok := safeClassify(path, blob)700 res := blobClassifyResult{ok: ok}701 if ok {702 res.language = job.Language703 res.complexity = job.Complexity704 res.lineTypes = lineTypes705 res.complexLine = complexityLineNumbers(job)706 }707 if c != nil {708 c.entries[hash] = res709 }710 return res711}712713// classifyHistoryBlob runs scc's existing classifier on a git blob's bytes714// and returns the resulting FileJob (Language / Complexity / Code / Comment715// / Blank populated) plus the per-line type vector. ok=false means the file716// is binary or the language could not be resolved.717func classifyHistoryBlob(path string, blob []byte) (*FileJob, []LineType, bool) {718 languages, extension := DetectLanguage(path)719 if len(languages) == 0 {720 return nil, nil, false721 }722 for _, l := range languages {723 LoadLanguageFeature(l)724 }725726 job := &FileJob{727 Location: path,728 Filename: basename(path),729 Extension: extension,730 PossibleLanguages: languages,731 Bytes: int64(len(blob)),732 Content: blob,733 TrackComplexityLines: true,734 }735736 job.Language = DetermineLanguage(job.Filename, job.Language, job.PossibleLanguages, job.Content)737 if job.Language == SheBang {738 cutoff := len(blob)739 if cutoff > 200 {740 cutoff = 200741 }742 lang, err := DetectSheBang(string(blob[:cutoff]))743 if err != nil {744 return nil, nil, false745 }746 job.Language = lang747 LoadLanguageFeature(lang)748 }749750 classifier := &historyLineCallback{}751 job.Callback = classifier752753 CountStats(job)754755 if job.Binary {756 return nil, nil, false757 }758759 return job, classifier.lineTypes, true760}761762// complexityLineNumbers returns the 1-based line numbers in job that fired a763// complexity tick. Convenience wrapper for observers that want per-line764// complexity placement (the per-line attribution reports in plans 03–04).765func complexityLineNumbers(job *FileJob) []int {766 out := make([]int, 0)767 for i, count := range job.ComplexityLine {768 if count > 0 {769 out = append(out, i+1)770 }771 }772 return out773}774775type historyLineCallback struct {776 lineTypes []LineType777}778779func (h *historyLineCallback) ProcessLine(job *FileJob, currentLine int64, lineType LineType) bool {780 h.lineTypes = append(h.lineTypes, lineType)781 return true782}783784func basename(path string) string {785 if i := strings.LastIndex(path, "/"); i >= 0 {786 return path[i+1:]787 }788 return path789}790791func lineCount(s string) int {792 if s == "" {793 return 0794 }795 n := strings.Count(s, "\n")796 if !strings.HasSuffix(s, "\n") {797 n++798 }799 return n800}
Same data, no extra tab — call code_get_file + code_get_findings over MCP from Claude/Cursor/Copilot.